Ejemplo n.º 1
0
def run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_new_pred):
    # type: (str, str, str) -> None
    """Run the external ``gmhmmp2`` binary on a sequence file with a given model.

    The executable and the ``mgm_11.mod`` model passed via ``-M`` are both
    looked up under ``<pd-bin-external>/gms2``, where ``pd-bin-external`` is
    read from ``sbsp_general.ENV``.

    :param pf_sequence: path passed to ``-s`` (input sequence file)
    :param pf_new_mod: path passed to ``-m`` (model file)
    :param pf_new_pred: path passed to ``-o``; output is requested in GFF format
    """
    # Imported lazily, as in the original code.
    from sbsp_general import ENV

    pd_gms2 = "{}/gms2".format(ENV["pd-bin-external"])
    pe_gmhmmp2 = "{}/gmhmmp2".format(pd_gms2)
    pf_mgm_mod = "{}/mgm_11.mod".format(pd_gms2)

    run_shell_cmd("{} -m {} -M {} -s {} -o {} --format gff".format(
        pe_gmhmmp2, pf_new_mod, pf_mgm_mod, pf_sequence, pf_new_pred))
Ejemplo n.º 2
0
def get_identital_labels(pf_gms2, pf_sbsp, pf_toolp, **kwargs):
    """Write the output of a ``compp`` comparison of two label files to a file.

    When no ``pf_lst`` keyword is supplied, ``compp -a pf_gms2 -b pf_sbsp
    -I -q -n`` is run, lines containing ``#`` are filtered out with ``grep -v``
    and the result is redirected to ``pf_toolp``.

    :param pf_gms2: path passed to ``compp -a``
    :param pf_sbsp: path passed to ``compp -b``
    :param pf_toolp: path of the file the filtered output is redirected to
    :param kwargs: optional ``pf_lst`` (see the review note in the body)
    """
    pf_lst = get_value(kwargs, "pf_lst", None)

    if pf_lst is None:
        cmd = "compp -a {} -b {} -I -q -n | grep -v \"#\" > {}".format(
            pf_gms2, pf_sbsp, pf_toolp)
        run_shell_cmd(cmd)
        return

    # NOTE(review): the pf_lst branch calls run_shell_cmd with no command at
    # all, which looks unfinished. The intended command is not visible here,
    # so the call is kept unchanged -- confirm and complete.
    run_shell_cmd()
Ejemplo n.º 3
0
def train_gms2_model(env, pf_new_seq, pf_new_labels, **kwargs):
    """Train a GMS2 model with ``biogem gms2-training`` and load the result.

    The training command is executed from ``env["pd-work"]`` and writes the
    model to ``a.mod`` inside that directory. The model file is left on disk
    after it has been parsed.

    :param env: mapping providing ``pd-work`` and ``pd-bin-external``
    :param pf_new_seq: path passed to ``-s`` (sequence file)
    :param pf_new_labels: path passed to ``-l`` (labels file)
    :param kwargs: optional ``group`` passed to ``--genome-group``
        (default ``"A"``)
    :return: the object returned by ``GMS2Mod.init_from_file`` for ``a.mod``
    """
    group = get_value(kwargs, "group", "A", default_if_none=True)

    pd_work = env["pd-work"]
    pf_mod = os_join(pd_work, "a.mod")
    pe_biogem = "{}/gms2/biogem".format(env["pd-bin-external"])

    # Fixed training settings; only the genome group varies between calls.
    training_options = (
        "--order-coding 5 --order-noncoding 2 --only-train-on-native 1 "
        "--genetic-code 11 --order-start-context 2 --fgio-dist-thr 25 "
        "--genome-group {} --ga-upstr-len-rbs 20 --align right --ga-width-rbs 6"
    ).format(group)

    run_shell_cmd("cd {}; {} gms2-training -s {} -l {} -m {} {}".format(
        pd_work, pe_biogem, pf_new_seq, pf_new_labels, pf_mod,
        training_options))

    return GMS2Mod.init_from_file(pf_mod)
Ejemplo n.º 4
0
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Load a PBS job package, run its function, and save the result.

    The package at ``args.pf_job_input`` provides the callable (``"func"``)
    and its keyword arguments (``"func_kwargs"``). The return value of the
    call is stored under the ``"data"`` key and saved to
    ``args.pf_job_output``.

    :param env: environment of the calling script (not used by this function)
    :param args: parsed arguments; reads ``pf_job_input``, ``pf_job_output``
        and ``pd_work``
    """
    pbs_package = PBSJobPackage.load(args.pf_job_input)
    func = pbs_package["func"]
    func_args = pbs_package["func_kwargs"]

    # Seed the RNG: use the job's "random-seed" option when present,
    # otherwise fall back to the fixed default of 100.
    rs = None
    if "sbsp_options" in func_args:
        rs = func_args["sbsp_options"].safe_get("random-seed")

    if rs is None:
        random.seed(100)
    else:
        random.seed(int(rs))
        logger.critical("Random-seed: {}".format(rs))

    # Everything touching the job environment stays under this guard, so that
    # packages without an "env" argument no longer fail with a KeyError.
    if "env" in func_args:
        if args.pd_work is not None:
            func_args["env"] = func_args["env"].duplicate(
                {"pd-work": args.pd_work})
            logger.critical("{}".format(func_args["env"]["pd-work"]))

        # Point pd-work at a fresh temporary directory created inside it.
        pd_work = func_args["env"]["pd-work"]
        mkdir_p(pd_work)
        func_args["env"]["pd-work"] = run_shell_cmd(
            "mktemp --tmpdir={} -d".format(pd_work)).strip()

    output = {"data": func(**func_args)}

    PBSJobPackage.save(output, args.pf_job_output)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Download the NCBI ``new_taxdump`` archive and unzip it.

    The archive is saved as ``taxdump.zip`` inside ``args.pd_output`` (created
    if needed) and extracted in that same directory.

    :param env: environment of the calling script (not used by this function)
    :param args: parsed arguments; reads ``pd_output``
    """
    # link to taxonomy dump
    lp_taxonomy = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.zip"

    pd_output = args.pd_output
    fn_output = "taxdump.zip"

    mkdir_p(pd_output)
    pf_output = os_join(pd_output, fn_output)

    logger.info(f"Downloading file: {lp_taxonomy}")
    urllib.request.urlretrieve(lp_taxonomy, pf_output)

    logger.info("Download complete. Unzipping")
    # The shell has already changed into pd_output, so the archive is named by
    # its file name only; passing pf_output here would break whenever
    # pd_output is a relative path.
    run_shell_cmd(f"cd {pd_output}; unzip {fn_output}")
Ejemplo n.º 6
0
def files_are_different(pf_1, pf_2):
    # type: (str, str) -> bool
    """Report whether ``diff`` finds any difference between two files.

    :param pf_1: path to the first file
    :param pf_2: path to the second file
    :return: True when ``diff pf_1 pf_2`` prints anything other than
        whitespace, or when running or reading the command raises any
        exception; False otherwise
    """
    try:
        diff_text = run_shell_cmd("diff {} {}".format(pf_1, pf_2))
        has_difference = bool(diff_text.strip())
    except Exception:
        # Any failure is deliberately reported as "different".
        return True

    return has_difference
def run_prodigal(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> None
    """Run Prodigal on a genome, either directly or through a PBS job.

    The input is ``<pd-data>/<gi.name>/sequence.fasta``. The command writes
    ``prodigal.gff`` and ``prodigal.parameters`` using relative file names.

    :param env: mapping providing ``pd-data``, ``pd-work`` and
        ``pd-bin-external``
    :param gi: genome info; only ``gi.name`` is read here
    :param kwargs: optional ``use_pbs`` (default False); all keyword arguments
        are also forwarded to ``create_pbs_file`` when PBS is used
    """
    pd_data = env["pd-data"]
    pd_work = env["pd-work"]
    pe_tool = os_join(env["pd-bin-external"], "prodigal", "prodigal")
    pf_sequence = os_join(pd_data, gi.name, "sequence.fasta")

    use_pbs = get_value(kwargs, "use_pbs", False)

    # FIXME: put in genetic code
    cmd_template = "{}  -i {}  -g 11  -o prodigal.gff  -f gff  -t prodigal.parameters  -q \n"
    cmd_run = cmd_template.format(pe_tool, pf_sequence)

    if not use_pbs:
        # Local run: execute from the working directory.
        run_shell_cmd("cd {}; {}".format(pd_work, cmd_run))
        return

    # PBS run: write the job script, then submit it in the background.
    pf_pbs = os_join(pd_work, "run.pbs")
    create_pbs_file(env, cmd_run, pf_pbs, job_name=gi.name, **kwargs)
    run_shell_cmd("qsub {} &".format(pf_pbs))
Ejemplo n.º 8
0
def add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_toolp, pf_gms2_mod,
                                pf_new_mod, **kwargs):
    # type: (Environment, str, str, str, str) -> None
    """Create a copy of a GMS2 model whose RBS sections come from new training.

    A model is trained from ``pf_toolp`` and ``pf_sequence`` with
    ``train_and_create_models``. ``pf_gms2_mod`` is then copied to
    ``pf_new_mod`` without its ``$RBS_MAT`` and ``$RBS_POS_DISTR`` sections,
    and the two sections from the trained model are appended to ``pf_new_mod``.

    :param env: environment forwarded to ``train_and_create_models``
    :param pf_sequence: sequence file used for training
    :param pf_toolp: labels file used for training
    :param pf_gms2_mod: existing model file to copy from
    :param pf_new_mod: path of the model file to write
    :param kwargs: optional ``group`` forwarded to the training call
    """
    group = get_value(kwargs, "group", None)

    # Train on the toolp labels and pull out the two sections to transplant.
    trained = train_and_create_models(env,
                                      pf_labels=pf_toolp,
                                      pf_sequences=pf_sequence,
                                      group=group)
    rbs_toolp = trained.items["RBS_MAT"]  # type: Dict[str, List[float]]
    spacer = trained.items["RBS_POS_DISTR"]

    # awk filter: after a $RBS_MAT or $RBS_POS_DISTR header, skip lines until
    # the next line whose first field starts with "$"; print everything else.
    awk_program = (
        "awk 'BEGIN{sut=0} {if (sut == 1) {l=substr($1,1,1);  if (l != \"$\") next ; else {sut=0; print}} "
        "else if ($1 == \"$RBS_MAT\" || $1 == \"$RBS_POS_DISTR\") sut = 1; else print }' "
    )
    run_shell_cmd(awk_program + "{} > {}".format(pf_gms2_mod, pf_new_mod))

    # Serialise both sections, each followed by two blank lines.
    pieces = ["\n\n$RBS_MAT\n"]
    pieces.extend(
        "{} {}\n".format(key, " ".join(str(x) for x in rbs_toolp[key]))
        for key in sorted(rbs_toolp.keys()))
    pieces.append("\n\n")

    pieces.append("$RBS_POS_DISTR\n")
    pieces.extend(
        "{} {}\n".format(key, spacer[key]) for key in sorted(spacer.keys()))
    pieces.append("\n\n")

    append_to_file("".join(pieces), pf_new_mod)
def run_gms2(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> None
    """Run ``gms2.pl`` on a genome, either directly or through a PBS job.

    The input is ``<pd-data>/<gi.name>/sequence.fasta``; predictions are
    written to ``gms2.gff`` (a relative file name) in GFF format.

    :param env: mapping providing ``pd-data``, ``pd-work`` and
        ``pd-bin-external``
    :param gi: genome info; only ``gi.name`` is read here
    :param kwargs: optional ``genome_type`` (default ``"auto"``) and
        ``use_pbs`` (default False); all keyword arguments are also forwarded
        to ``create_pbs_file`` when PBS is used
    """
    genome_type = get_value(kwargs, "genome_type", "auto")

    pd_data = env["pd-data"]
    pd_work = env["pd-work"]
    pe_tool = os_join(env["pd-bin-external"], "gms2", "gms2.pl")
    pf_sequence = os_join(pd_data, gi.name, "sequence.fasta")

    use_pbs = get_value(kwargs, "use_pbs", False)

    # FIXME: put in genetic code
    cmd_template = "{} --gcode 11 --format gff --out gms2.gff --seq {}  --v --genome-type {} --fgio-dist-thresh 25"
    cmd_run = cmd_template.format(pe_tool, pf_sequence, genome_type)

    if not use_pbs:
        # Local run: execute from the working directory.
        run_shell_cmd("cd {}; {}".format(pd_work, cmd_run))
        return

    # PBS run: write the job script, then submit it in the background.
    pf_pbs = os_join(pd_work, "run.pbs")
    create_pbs_file(env, cmd_run, pf_pbs, job_name=gi.name, **kwargs)
    run_shell_cmd("qsub {} &".format(pf_pbs))
Ejemplo n.º 10
0
 def _qsub(pf_pbs):
     # type: (str) -> str
     """Submit a PBS script with ``qsub -V`` and return its stripped output.

     :param pf_pbs: path to the PBS script to submit
     :return: what ``run_shell_cmd`` returns for the submission, with
         surrounding whitespace removed
     """
     submit_cmd = "qsub  -V " + pf_pbs
     qsub_output = run_shell_cmd(submit_cmd, do_not_log=True)
     return qsub_output.strip()
Ejemplo n.º 11
0
def download_assembly_summary_entry(entry, pd_output, **kwargs):
    # type: (Dict[str, Any], str, Dict[str, Any]) -> Dict[str, Any]
    """Download the sequence and annotation files for one assembly entry.

    Files are stored in ``<pd_output>/<accession>_<asm_name>/`` as
    ``sequence.fasta`` and ``ncbi.gff``; a ``runs`` sub-directory is created
    as well. For a GenBank (``GCA``) entry with a paired RefSeq assembly, the
    paired accession is used instead.

    :param entry: assembly summary row; reads ``assembly_accession``,
        ``asm_name``, ``name``, ``genetic_code``, ``ftp_path``,
        ``gbrs_paired_asm`` and, if present, ``parent_id``
    :param pd_output: directory under which the genome directory is created
    :param kwargs: optional ``force_download``:

        * ``None`` (default): return without downloading if both local files
          already exist
        * ``"annotation_changed"``: fetch the annotation again and, if it
          differs from the local copy, replace it and re-download the sequence
        * ``"all"``: always download both files again (the legacy spelling
          ``"any"`` is treated the same way)
    :return: dict with ``assembly_accession``, ``asm_name``, ``name``,
        ``parent_id`` and ``genetic_code``
    :raises ValueError: if the download fails or a path contains a disallowed
        character; the genome directory is removed before raising
    """
    force_download = get_value(kwargs,
                               "force_download",
                               None,
                               valid={"all", "annotation_changed"})

    # build name
    gcf = entry["assembly_accession"]
    acc = entry["asm_name"].replace(" ", "_")

    output = {
        "assembly_accession": gcf,
        "asm_name": acc,
        "name": entry["name"],
        "parent_id": entry["parent_id"] if "parent_id" in entry else "",
        "genetic_code": entry["genetic_code"]
    }

    ftplink = entry["ftp_path"]

    # if genbank and has refseq, prefer refseq
    if "GCA" in gcf and entry["gbrs_paired_asm"] != "na" and len(
            entry["gbrs_paired_asm"]) > 0:
        gcf = entry["gbrs_paired_asm"]
        output["assembly_accession"] = gcf
        ftplink = create_ftplink_from_gcf_acc(gcf, acc)

    gcfid = "{}_{}".format(gcf, acc)
    pd_gcfid = os.path.join(pd_output, gcfid)
    pd_runs = os.path.join(pd_gcfid, "runs")

    # The declared valid value is "all", but the check below used to compare
    # against "any". As a result "all" with existing files ended in the
    # catch-all ValueError, whose handler deletes the genome directory.
    force_all = force_download in ("all", "any")

    try:

        mkdir_p(pd_gcfid)
        mkdir_p(pd_runs)

        fn_sequence = "{}_genomic.fna".format(gcfid)
        fn_labels = "{}_genomic.gff".format(gcfid)

        pf_ftp_sequence = os.path.join(ftplink, "{}.gz".format(fn_sequence))
        pf_ftp_labels = os.path.join(ftplink, "{}.gz".format(fn_labels))

        # Reject characters that would be unsafe in the shell commands below.
        for not_allowed in {"#", "(", ")", ","}:
            if not_allowed in pf_ftp_sequence or not_allowed in pf_ftp_labels:
                raise ValueError("Invalid character in path")

        for not_allowed in {"#", "(", ")", "/", ":", ","}:
            if not_allowed in fn_sequence or not_allowed in fn_labels:
                raise ValueError("Invalid character in path")

        pf_local_sequence = os.path.join(pd_gcfid, "sequence.fasta")
        pf_local_labels = os.path.join(pd_gcfid, "ncbi.gff")

        # don't re-download existing files unless a full download is forced
        if not force_all and os.path.isfile(
                pf_local_sequence) and os.path.isfile(pf_local_labels):
            if force_download is None:
                return output

            if force_download == "annotation_changed":
                # Fetch the current annotation into a scratch directory and
                # compare it with the local copy.
                run_shell_cmd(
                    "cd {}; mkdir temporary; cd temporary; wget --quiet {}; gunzip -f {};"
                    .format(pd_gcfid, pf_ftp_labels,
                            "{}.gz".format(fn_labels)))

                update = files_are_different(
                    pf_1=os.path.join(pd_gcfid, "temporary", fn_labels),
                    pf_2=os.path.join(pd_gcfid, "ncbi.gff"))

                if update:
                    run_shell_cmd("cd {}; mv {} ../ncbi.gff".format(
                        os.path.join(pd_gcfid, "temporary"), fn_labels))

                    # download sequence file again
                    run_shell_cmd(
                        "pwd; cd {}; wget --quiet {}; gunzip -f {};".format(
                            pd_gcfid,
                            pf_ftp_sequence,
                            "{}.gz".format(fn_sequence),
                        ), )

                    run_shell_cmd("cd {}; mv {} {};".format(
                        pd_gcfid,
                        fn_sequence,
                        "sequence.fasta",
                    ))

                # cleanup
                run_shell_cmd("cd {}; rm -r temporary".format(pd_gcfid))
            elif force_download == "no_download":
                return output
            else:  # FIXME: it's getting out of control. Create different lists: updated, all valid, etc...
                raise ValueError("nope")
        else:
            run_shell_cmd(
                "pwd; cd {}; wget --quiet {}; wget --quiet {}; gunzip -f {}; gunzip -f {}"
                .format(pd_gcfid, pf_ftp_sequence, pf_ftp_labels,
                        "{}.gz".format(fn_sequence),
                        "{}.gz".format(fn_labels)), )

            run_shell_cmd("cd {}; mv {} {}; mv {} {}".format(
                pd_gcfid, fn_sequence, "sequence.fasta", fn_labels,
                "ncbi.gff"))
    except (IOError, OSError, ValueError, subprocess.CalledProcessError):
        # cleanup failed attempt
        if os.path.exists(pd_gcfid) and os.path.isdir(pd_gcfid):
            shutil.rmtree(pd_gcfid)
        raise ValueError(
            "Could not download data for genome: {}".format(gcfid)) from None

    return output
Ejemplo n.º 12
0
def get_annotation_date(pf_labels):
    """Return the second field of the first ``annotation-date`` line in a file.

    The first line of ``pf_labels`` matching ``annotation-date`` is selected
    with ``grep -m 1`` and its second whitespace-separated field is printed
    with ``awk``.

    :param pf_labels: path to the labels file to search
    :return: the command output with surrounding whitespace removed
    """
    grep_part = 'grep -m 1 "annotation-date" {}'.format(pf_labels)
    awk_part = " | awk '{print $2}'"

    cmd_output = run_shell_cmd(grep_part + awk_part, do_not_log=True)
    return cmd_output.strip()
Ejemplo n.º 13
0
def count_cds(pf_labels):
    # type: (str) -> int
    """Count the lines of a labels file that contain ``CDS``.

    Uses ``grep -c``, so the result is a number of matching lines, not of
    individual occurrences.

    :param pf_labels: path to the labels file
    :return: the count printed by ``grep -c``, converted to int
    """
    cmd = "grep -c CDS {}".format(pf_labels)
    grep_output = run_shell_cmd(cmd, do_not_log=True)
    return int(grep_output)
Ejemplo n.º 14
0
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Build a nucleotide BLAST database from ``args.pf_sequences``.

    The command string comes from ``gen_cmd_create_blast_database`` and is
    executed through ``run_shell_cmd``.

    :param env: environment of the calling script (not used by this function)
    :param args: parsed arguments; reads ``pf_sequences`` and ``pf_db``
    """
    # NOTE(review): the meaning of the final positional True is defined by
    # gen_cmd_create_blast_database, which is not visible here.
    cmd = gen_cmd_create_blast_database(args.pf_sequences, args.pf_db, "nucl",
                                        True)
    run_shell_cmd(cmd)