def relative_entropy_analysis_for_gi(env, gi, prl_options):
    # type: (Environment, GenomeInfo, ParallelizationOptions) -> pd.DataFrame
    # sweep training-set percentages (10..100, step 5) with 10 random trials each

    list_entries = list()

    # set up labels (as lst) and sequence for genome
    setup_info = set_up_labels_and_sequence_for_genome(env, gi)

    if prl_options["use-pbs"]:
        pd_figures = os_join(prl_options["pbs-pd-head"], gi.name)
    else:
        pd_figures = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_figures)

    for percent in range(10, 101, 5):
        for trial in range(10):  # trial index is recorded but not passed on; runs differ via randomness
            info = relative_entropy_analysis_for_gi_for_percent(
                env,
                pf_sequence=setup_info["pf_sequence"],
                pf_labels=setup_info["pf_labels"],
                group=setup_info["group"],
                pf_mod=setup_info["pf_mod"],
                pf_verified=setup_info["pf_verified"],
                percent=percent,
                pd_figures=pd_figures)

            list_entries.append({
                "Genome": gi.name,
                "Percent": percent,
                "Trial": trial,
                **info
            })

    return pd.DataFrame(list_entries)
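
# Usage sketch for the sweep above (columns beyond Genome/Percent/Trial come
# from the per-percent `info` dict; "Error" is assumed here based on the
# summary plots further down in this file):
#
#   df = relative_entropy_analysis_for_gi(env, gi, prl_options)
#   df.groupby("Percent")["Error"].mean()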
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None

    pbs_package = PBSJobPackage.load(args.pf_job_input)
    func = pbs_package["func"]
    func_args = pbs_package["func_kwargs"]

    if "sbsp_options" in func_args:

        rs = func_args["sbsp_options"].safe_get("random-seed")
        if rs is None:
            random.seed(100)
        else:
            random.seed(int(rs))
            logger.critical("Random-seed: {}".format(rs))

    else:
        random.seed(100)

    if "env" in func_args:
        if args.pd_work is not None:
            func_args["env"] = func_args["env"].duplicate(
                {"pd-work": args.pd_work})
            logger.critical("{}".format(func_args["env"]["pd-work"]))

    # point pd-work at a fresh temporary subdirectory for this job
    mkdir_p(func_args["env"]["pd-work"])
    func_args["env"]["pd-work"] = run_shell_cmd("mktemp --tmpdir={} -d".format(
        func_args["env"]["pd-work"])).strip()

    # logger.critical("{}\n{}".format(func, func_args))
    output = {"data": func(**func_args)}

    PBSJobPackage.save(output, args.pf_job_output)
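
# Sketch of the job-package contract the worker above assumes: a scheduler
# serializes a callable plus its kwargs under the "func"/"func_kwargs" keys,
# and the worker saves {"data": result} back. Variable names are illustrative.
#
#   PBSJobPackage.save({"func": relative_entropy_analysis_for_gi,
#                       "func_kwargs": {"env": env, "gi": gi,
#                                       "prl_options": prl_options}},
#                      pf_job_input)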
def setup_gi_and_run(env, gi, sbsp_options, prl_options, clade_to_pf_db,
                     **kwargs):
    # type: (Environment, GenomeInfo, SBSPOptions, ParallelizationOptions, Dict[str, str], Dict[str, Any]) -> None

    dn_run = get_value(kwargs, "dn_run", "sbsp")

    # Check if clade is known
    try:
        pf_t_db = clade_to_pf_db[gi.attributes["ancestor"]]
    except KeyError:
        raise ValueError("Unknown clade {}".format(gi.attributes["ancestor"]))

    logger.info("Scheduling: {}".format(gi.name))

    pd_work = os_join(env["pd-work"], gi.name,
                      dn_run)  # genome working environment
    curr_env = env.duplicate({"pd-work":
                              pd_work})  # create environment for genome
    pf_output = os_join(pd_work, "output.csv")  # output file

    mkdir_p(pd_work)  # create working directory

    # write genome name to file list (for running)
    pf_list = os_join(pd_work, "query.list")
    GenomeInfoList([gi]).to_file(pf_list)

    # create options for pipeline for current genome
    po = PipelineSBSPOptions(curr_env,
                             pf_list,
                             pf_t_db=pf_t_db,
                             pf_output=pf_output,
                             sbsp_options=sbsp_options,
                             prl_options=prl_options,
                             **kwargs)
    sbsp_on_gi(gi, po)
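
# `clade_to_pf_db` maps the clade name read from gi.attributes["ancestor"] to a
# target-database path. A hypothetical example (paths illustrative):
#
#   clade_to_pf_db = {
#       "Enterobacterales": "/data/db/enterobacterales",
#       "Actinobacteria": "/data/db/actinobacteria",
#   }
#   setup_gi_and_run(env, gi, sbsp_options, prl_options, clade_to_pf_db)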
def get_orthologs_from_files_deprecated(env, pf_q_list, pf_t_list, pf_output, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> str

    clean = get_value(kwargs, "clean", False)

    # pf_q_list = data["pf-q-list"]
    # pf_t_list = data["pf-t-list"]

    pd_work = env["pd-work"]

    mkdir_p(pd_work)

    # run blast
    fn_blast_out = "blast.xml"
    pf_blast_out = os.path.join(pd_work, fn_blast_out)

    run_blast(env, pf_q_list, pf_t_list, pf_blast_out, **kwargs)

    # convert blast output to csv
    convert_blast_output_to_csv(pf_blast_out, pf_output, select_best_alignment_per_qt_pair=True)

    if clean:
        try:
            os.remove(pf_blast_out)
        except OSError:
            pass

    return pf_output
Example #5
def compare_gms2_sbsp_ncbi_for_genome_list(env, gil, gcfid_to_pd_sbsp, pf_output_summary, **kwargs):
    # type: (Environment, GenomeInfoList, Dict[str, str], str, Dict[str, Any]) -> None

    prodigal = get_value(kwargs, "prodigal", None)
    list_summary = list()
    list_pf_gms2_sbsp_not_ncbi = list()
    list_pf_gms2_sbsp_ncbi = list()

    for gi in gil:
        logger.info("{}".format(gi.name))
        pd_genome = os.path.join(env["pd-data"], gi.name)
        pf_gms2 = os.path.join(pd_genome, "runs", "gms2", "gms2.gff")
        pf_ncbi = os.path.join(pd_genome, "ncbi.gff")
        pf_sbsp_details = os.path.join(gcfid_to_pd_sbsp[gi.name], "output.csv")

        labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2")
        labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

        key_3prime_to_label_gms2 = map_key_to_labels(labels_gms2)
        key_3prime_to_label_ncbi = map_key_to_labels(labels_ncbi)

        df_sbsp = pd.read_csv(pf_sbsp_details, header=0)

        for index, row in df_sbsp.groupby("q-key", as_index=False).agg("first").iterrows():

            q_key_3prime = create_3prime_key_from_fields(
                accession=row["q-accession"], left=row["q-left-sbsp"], right=row["q-right-sbsp"],
                strand=row["q-strand-sbsp"]
            )

            # make sure the 3' key is present in both GMS2 and NCBI maps
            if q_key_3prime in key_3prime_to_label_gms2 and q_key_3prime in key_3prime_to_label_ncbi:

                # make sure SBSP 5' matches GMS2
                label_sbsp = Label(
                    Coordinates(row["q-left-sbsp"]-1, row["q-right-sbsp"]-1, row["q-strand-sbsp"]),
                    seqname=row["q-accession"]
                )

                label_gms2 = key_3prime_to_label_gms2[q_key_3prime]

                if labels_match_5prime_3prime(label_sbsp, label_gms2):

                    label_ncbi = key_3prime_to_label_ncbi[q_key_3prime]
                    if labels_match_5prime_3prime(label_sbsp, label_ncbi):
                        list_pf_gms2_sbsp_ncbi.append(row["pf-msa-output"])
                    else:
                        list_pf_gms2_sbsp_not_ncbi.append(row["pf-msa-output"])

    pd_gms2_sbsp_ncbi = os.path.join(env["pd-work"], "sbsp_gms2_ncbi")
    pd_gms2_sbsp_not_ncbi = os.path.join(env["pd-work"], "sbsp_gms2_not_ncbi")

    mkdir_p(pd_gms2_sbsp_ncbi)
    mkdir_p(pd_gms2_sbsp_not_ncbi)

    # copy files
    copy_files_with_new_indexing(list_pf_gms2_sbsp_ncbi, pd_gms2_sbsp_ncbi)
    copy_files_with_new_indexing(list_pf_gms2_sbsp_not_ncbi, pd_gms2_sbsp_not_ncbi)
Example #6
def analyze_gms2_components_on_verified_set_for_gi(env, gi):
    # type: (Environment, GenomeInfo) -> pd.DataFrame

    list_entries = list()

    start_components = {
        "Start Codons", "Start Context", "RBS", "Promoter",
    }

    pd_gi = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_gi)

    # for each component to keep on
    for component_on in sorted(start_components) + ["MGM2*", "MGM", "GMS2"]:
        components_off = start_components.difference({component_on})

        if component_on == "MGM2*" or component_on == "GMS2":
            components_off = set()
        elif component_on == "MGM":
            pass
        elif not component_in_model_file(env, gi, component_on):
            # skip start components absent from this genome's model file
            continue

        native_coding_off = (component_on != "GMS2")

        pd_gi_component = os_join(pd_gi, component_on).replace(" ", "")
        mkdir_p(pd_gi_component)

        env_dup = env.duplicate({"pd-work": pd_gi_component})

        if component_on == "Start Context":
            component_on = {component_on}  # "rbs", "promoter"}
            components_off.remove("RBS")
            components_off.remove("Promoter")
        else:
            component_on = {component_on}


        results = run_gms2_with_component_toggles_and_get_accuracy(env_dup, gi, components_off,
                                                                   native_coding_off=native_coding_off)

        list_entries.append({
            "Genome": gi.name,
            "Component": next(iter(component_on)).replace("_", "-"),
            # **{con: True for con in component_on},                             # current component is on
            # **{coff: False for coff in components_off},     # all others are off
            **results
        })

    return pd.DataFrame(list_entries)
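
# Net effect of the sweep above: one GMS2 run per start component with the
# others turned off (for "Start Context", RBS and Promoter are not forced off),
# plus three reference runs: "MGM2*" and "GMS2" with nothing off (only "GMS2"
# keeps native coding on), and "MGM" with all four start components off.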
    def run(self):
        # type: () -> None
        pd_work = self.env["pd-work"]

        # make sure working directory is up and running
        mkdir_p(pd_work)

        # Copy genome file to local directory, and write sbsp options
        copyfile(self.pipeline_options["pf-q-list"],
                 os_join(pd_work, "run.list"))
        self.pipeline_options["sbsp-options"].to_file(
            os_join(pd_work, "sbsp-options.conf"))

        state = self._run_helper()  # run compute steps
        self._compare(state)  # run comparison
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None

    # link to taxonomy dump
    lp_taxonomy = f"https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.zip"

    pd_output = args.pd_output

    mkdir_p(pd_output)
    pf_output = os_join(pd_output, "taxdump.zip")

    logger.info(f"Downloading file: {lp_taxonomy}")
    urllib.request.urlretrieve(lp_taxonomy, pf_output)

    logger.info("Download complete. Unzipping")
    run_shell_cmd(f"cd {pd_output}; unzip {pf_output}")
def run_tool_on_gil(env, gil, tool, **kwargs):
    # type: (Environment, GenomeInfoList, str, Dict[str, Any]) -> None

    logger.info("Running tool {} on {} genomes".format(tool, len(gil)))
    dn_run = get_value(kwargs, "dn_run", tool, default_if_none=True)
    func = {
        "gms2": run_gms2,
        "prodigal": run_prodigal,
    }[tool]

    for gi in gil:
        pd_work = os_join(env["pd-work"], gi.name, dn_run)
        mkdir_p(pd_work)
        curr_env = env.duplicate({"pd-work": pd_work})

        func(curr_env, gi, **kwargs)
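
# Usage sketch: dispatch is keyed on the `tool` string, so only "gms2" and
# "prodigal" are supported (anything else raises KeyError). Each genome runs in
# its own directory under pd-work/<genome>/<dn_run>. `dn_run` is illustrative.
#
#   run_tool_on_gil(env, gil, "prodigal", dn_run="prodigal-default")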
Example #10
0
def compute_features(env, pf_data, pf_output, **kwargs):
    # type: (Environment, str, str, Dict[str, Any]) -> str

    pd_work = env["pd-work"]

    mkdir_p(pd_work)

    df = compute_feature_helper(env, pf_data)

    # clean up: drop bulky sequence columns before writing features to disk
    df.drop(columns=["q-nucl-gene-sequence", "q-prot-gene-sequence",
                     "t-nucl-gene-sequence", "t-prot-gene-sequence"],
            inplace=True)

    df.to_csv(pf_output, index=False)
    return pf_output
Example #11
    def _run_helper_for_attribute(self, value_to_comparison, pd_output):
        # type: (Dict[Any, Dict[str, Any]], str) -> None

        mkdir_p(pd_output)

        list_df = list()
        for value, comparison in sorted(value_to_comparison.items(),
                                        key=lambda x: x[0]):
            df = self._stats_summary_to_df(comparison["stats"])
            list_df.append((value, df))

        df_numbers = self._merge_multiple_stats_summary(
            list_df, ["Common 3'", "Common 5'"])
        df_percentages = self._merge_multiple_stats_summary(
            list_df, ["% Common 3'", "% Common 5'"])

        self._histogram_multiple_stats_summary_by_attribute(list_df, pd_output)

        df_numbers.to_csv(os.path.join(pd_output, "numbers.csv"), index=False)
        df_percentages.to_csv(os.path.join(pd_output, "percentages.csv"),
                              index=False)
Example #12
def _run_codeml(seq_a, seq_b, **kwargs):
    # type: (str, str, Dict[str, Any]) -> Dict[str, Any]

    pd_work = get_value(kwargs, "pd_work", ".", default_if_none=True)
    pf_ctl = get_value(kwargs, "pf_ctl", None)

    if pf_ctl is None:
        raise ValueError("Cannot compute distance without CTL file for CodeML")

    if not os.path.isfile(pf_ctl):
        raise ValueError("File doesn't exist: {}".format(pf_ctl))

    random_name = generate_random_non_existing_filename(pd_work)

    pd_codeml_run = os.path.join(pd_work, random_name)
    mkdir_p(pd_codeml_run)

    shutil.copyfile(pf_ctl, os.path.join(pd_codeml_run, "codeml.ctl"))

    pf_sequences = os.path.join(pd_codeml_run, "in.phy")
    write_to_temporary_alignment_file(pf_sequences, [seq_a, seq_b])

    write_string_to_file("(1)\n", os.path.join(pd_codeml_run, "in.tre"))

    # run code ml
    scorer = codeml.Codeml(tree=os.path.join(pd_codeml_run, "in.tre"),
                           alignment=pf_sequences,
                           out_file=os.path.join(pd_codeml_run, "out.txt"),
                           working_dir=pd_codeml_run)

    try:
        results = scorer.run(ctl_file="codeml.ctl", verbose=False)
    except Exception:
        results = {}

    shutil.rmtree(pd_codeml_run)

    return results
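
# Usage sketch (paths hypothetical): the caller supplies a ready-made codeml
# control file; each call runs in a throwaway subdirectory that is removed
# afterwards, and failures return an empty dict rather than raising.
#
#   results = _run_codeml(seq_a, seq_b, pd_work="/tmp/paml", pf_ctl="codeml.ctl")
#   # `results` is Biopython's parsed codeml output; its keys depend on the
#   # options set in codeml.ctl.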
def collect_alignments_for_genome(env, gi):
    # type: (Environment, GenomeInfo) -> None
    pd_genome = os_join(env["pd-work"], gi.name)

    mkdir_p(pd_genome)

    pd_run = os_join(env["pd-runs"], gi.name)

    # load labels and data files
    pf_sbsp = os_join(pd_run, "sbsp", "accuracy", f"{gi.name}.gff")
    pf_gms2 = os_join(pd_run, "gms2", "gms2.gff")
    pf_ncbi = os_join(pd_run, "ncbi", "ncbi.gff")
    pf_sbsp_details = os_join(pd_run, "sbsp", "output.csv")

    common_options = {
        "ignore_frameshifted": True,
        "ignore_partial": True,
        "shift": 0
    }

    try:
        labels_sbsp = read_labels_from_file(pf_sbsp,
                                            name="SBSP",
                                            **common_options)
        labels_gms2 = read_labels_from_file(pf_gms2,
                                            name="GMS2",
                                            **common_options)
        labels_ncbi = read_labels_from_file(pf_ncbi,
                                            name="NCBI",
                                            **common_options)
        df_details = pd.read_csv(pf_sbsp_details)
        add_q_key_3p_to_df(df_details, "q-3prime")
    except FileNotFoundError:
        return

    # get genes where GMS2=SBSP
    lcd_full = LabelsComparisonDetailed(labels_gms2,
                                        labels_sbsp,
                                        name_a="gms2",
                                        name_b="sbsp")

    labels_gms2_eq_sbsp = lcd_full.match_3p_5p("a")

    # get labels where gms2_eq_sbsp doesn't match NCBI
    lcd2 = LabelsComparisonDetailed(labels_gms2_eq_sbsp,
                                    labels_ncbi,
                                    name_a="gms2_eq_sbsp",
                                    name_b="ncbi")
    labels_gms2_eq_sbsp_not_ncbi = lcd2.match_3p_not_5p("a")

    # get msa files for all these labels
    set_3prime_keys = {
        create_q_key_3p(l.seqname(), l.left(), l.right(), l.strand())
        for l in labels_gms2_eq_sbsp_not_ncbi
    }

    df_gms2_eq_sbsp_not_ncbi = df_details[df_details["q-3prime"].isin(
        set_3prime_keys)]

    set_pf_msa_out = set(df_gms2_eq_sbsp_not_ncbi["pf-msa-output"])

    for pf_msa_out in set_pf_msa_out:
        shutil.copy(pf_msa_out, pd_genome)
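
# Net effect of the label algebra above: keep genes where GMS2 and SBSP agree
# on both gene ends but NCBI disagrees on the 5' end, then copy the MSA files
# behind those predictions into the per-genome directory.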
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if not prl_options["use-pbs"]:
        df = relative_entropy_analysis(env, gil, prl_options)
    else:
        pbs = PBS(env,
                  prl_options,
                  splitter=split_genome_info_list,
                  merger=merge_identity)
        list_df = pbs.run(data={"gil": gil},
                          func=relative_entropy_analysis,
                          func_kwargs={
                              "env": env,
                              "prl_options": prl_options
                          })
        df = pd.concat(list_df, ignore_index=True, sort=False)

    df.to_csv(os_join(env["pd-work"], "summary.csv"), index=False)

    pd_figures = os_join(env["pd-work"], "summary_figures")
    mkdir_p(pd_figures)

    sns.scatterplot(df,
                    "Percent",
                    "Error",
                    figure_options=FigureOptions(
                        ylim=[0, 20], save_fig=next_name(pd_figures)))
    sns.lineplot(df,
                 "RE",
                 "Error",
                 hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    sns.lineplot(df,
                 "RE Motif",
                 "Error",
                 hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    sns.lineplot(df,
                 "RE Spacer",
                 "Error",
                 hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20],
                                              save_fig=next_name(pd_figures)))
    sns.scatterplot(
        df,
        "RE Motif",
        "RE Spacer",
        hue="Genome",
        identity=True,
        figure_options=FigureOptions(save_fig=next_name(pd_figures)))

    sns.lmplot(df,
               "Percent",
               "Error",
               hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df,
               "RE",
               "Error",
               hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df,
               "RE Motif",
               "Error",
               hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df,
               "RE Spacer",
               "Error",
               hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20],
                                            save_fig=next_name(pd_figures)))
    sns.lmplot(df,
               "Percent",
               "RE",
               hue="Genome",
               figure_options=FigureOptions(save_fig=next_name(pd_figures)))
def download_assembly_summary_entry(entry, pd_output, **kwargs):
    # type: (Dict[str, Any], str, Dict[str, Any]) -> Dict[str, Any]

    force_download = get_value(kwargs,
                               "force_download",
                               None,
                               valid={"all", "annotation_changed"})

    # build name
    gcf = entry["assembly_accession"]
    acc = entry["asm_name"].replace(" ", "_")

    output = {
        "assembly_accession": gcf,
        "asm_name": acc,
        "name": entry["name"],
        "parent_id": entry["parent_id"] if "parent_id" in entry else "",
        "genetic_code": entry["genetic_code"]
    }

    ftplink = entry["ftp_path"]

    # if GenBank assembly has a paired RefSeq assembly, prefer RefSeq
    if gcf.startswith("GCA") and entry["gbrs_paired_asm"] != "na" and len(
            entry["gbrs_paired_asm"]) > 0:
        gcf = entry["gbrs_paired_asm"]
        output["assembly_accession"] = gcf
        ftplink = create_ftplink_from_gcf_acc(gcf, acc)

    gcfid = "{}_{}".format(gcf, acc)
    pd_gcfid = os.path.join(pd_output, gcfid)
    pd_runs = os.path.join(pd_gcfid, "runs")

    try:
        mkdir_p(pd_gcfid)
        mkdir_p(pd_runs)

        fn_sequence = "{}_genomic.fna".format(gcfid)
        fn_labels = "{}_genomic.gff".format(gcfid)

        pf_ftp_sequence = os.path.join(ftplink, "{}.gz".format(fn_sequence))
        pf_ftp_labels = os.path.join(ftplink, "{}.gz".format(fn_labels))

        for not_allowed in {"#", "(", ")", ","}:
            if not_allowed in pf_ftp_sequence or not_allowed in pf_ftp_labels:
                raise ValueError("Invalid character in path")

        for not_allowed in {"#", "(", ")", "/", ":", ","}:
            if not_allowed in fn_sequence or not_allowed in fn_labels:
                raise ValueError("Invalid character in path")

        pf_local_sequence = os.path.join(pd_gcfid, "sequence.fasta")
        pf_local_labels = os.path.join(pd_gcfid, "ncbi.gff")

        # skip re-download if both files already exist, unless force_download overrides
        if force_download != "all" and os.path.isfile(
                pf_local_sequence) and os.path.isfile(pf_local_labels):
            if force_download is None:
                return output

            if force_download == "annotation_changed":
                run_shell_cmd(
                    "cd {}; mkdir temporary; cd temporary; wget --quiet {}; gunzip -f {};"
                    .format(pd_gcfid, pf_ftp_labels,
                            "{}.gz".format(fn_labels)))

                update = files_are_different(
                    pf_1=os.path.join(pd_gcfid, "temporary", fn_labels),
                    pf_2=os.path.join(pd_gcfid, "ncbi.gff"))

                if update:
                    run_shell_cmd("cd {}; mv {} ../ncbi.gff".format(
                        os.path.join(pd_gcfid, "temporary"), fn_labels))

                    # download sequence file again
                    run_shell_cmd(
                        "pwd; cd {}; wget --quiet {}; gunzip -f {};".format(
                            pd_gcfid,
                            pf_ftp_sequence,
                            "{}.gz".format(fn_sequence),
                        ), )

                    run_shell_cmd("cd {}; mv {} {};".format(
                        pd_gcfid,
                        fn_sequence,
                        "sequence.fasta",
                    ))

                # cleanup
                run_shell_cmd("cd {}; rm -r temporary".format(pd_gcfid))
            elif force_download == "no_download":
                return output
            else:  # FIXME: it's getting out of control. Create different lists: updated, all valid, etc...
                raise ValueError("Unsupported force_download mode: {}".format(
                    force_download))
        else:
            run_shell_cmd(
                "pwd; cd {}; wget --quiet {}; wget --quiet {}; gunzip -f {}; gunzip -f {}"
                .format(pd_gcfid, pf_ftp_sequence, pf_ftp_labels,
                        "{}.gz".format(fn_sequence),
                        "{}.gz".format(fn_labels)), )

            run_shell_cmd("cd {}; mv {} {}; mv {} {}".format(
                pd_gcfid, fn_sequence, "sequence.fasta", fn_labels,
                "ncbi.gff"))
    except (IOError, OSError, ValueError, subprocess.CalledProcessError):
        # cleanup failed attempt
        if os.path.exists(pd_gcfid) and os.path.isdir(pd_gcfid):
            shutil.rmtree(pd_gcfid)
        raise ValueError(
            "Could not download data for genome: {}".format(gcfid)) from None

    return output
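
# The `entry` dict mirrors a row of NCBI's assembly_summary.txt plus metadata
# added upstream. A hypothetical minimal example with the fields this function
# reads (values illustrative):
#
#   entry = {
#       "assembly_accession": "GCA_000005845.2",
#       "asm_name": "ASM584v2",
#       "name": "Escherichia coli",
#       "parent_id": "562",
#       "genetic_code": 11,
#       "ftp_path": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/...",
#       "gbrs_paired_asm": "GCF_000005845.2",
#   }
#   output = download_assembly_summary_entry(entry, "/data/genomes")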
Example #16
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None

    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_figures = os_join(env["pd-work"], "figures")
    mkdir_p(pd_figures)

    list_run_info = list()

    for gi in tqdm(gil, total=len(gil)):
        # get gms2 and toolp models
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        df_gms2 = mm_gms2.pwm_to_df()
        df_toolp = mm_toolp.pwm_to_df()

        fig, axes = plt.subplots(1, 2, sharex="all", sharey="all", figsize=(8, 4))

        # GMS2 motif logo (information content)
        info_gms2 = lm.transform_matrix(df_gms2, from_type="probability", to_type="information")
        lm.Logo(info_gms2, color_scheme="classic", ax=axes[0])
        axes[0].set_ylim(0, 2)
        axes[0].set_title("GeneMarkS-2")

        # StartLink+ motif logo (information content)
        info_toolp = lm.transform_matrix(df_toolp, from_type="probability", to_type="information")
        lm.Logo(info_toolp, color_scheme="classic", ax=axes[1])
        axes[1].set_ylim(0, 2)
        axes[1].set_title("StartLink+")
        plt.tight_layout()
        plt.savefig(next_name(pd_figures))
        plt.show()

        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc = 100 * compute_gc_from_file(os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if not args.verified:
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })
        else:
            # verified
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, group=group)
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc
                })
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc
                })

            print(list_run_info[-2:])

    import sbsp_viz.sns as sns
    if args.verified:
        df = pd.DataFrame(list_run_info)
        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.lineplot(df, "Genome", "Error", hue="Tool", figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"]),
            xlabel="Genome",
            ylabel="Error"))

        sns.lineplot(df, "Genome", "RE", hue="Tool",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="Genome",
                            ylabel="Relative entropy",
                        ))

    else:
        df = pd.DataFrame(list_run_info)
        sns.scatterplot(df, "GC", "Accuracy",
                    figure_options=FigureOptions(
                        save_fig=next_name(env["pd-work"]),
                        xlabel="GC",
                        ylabel="Percentage of different 5' ends",
                        ylim=[0,10],
                    ))

        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"])
        ))

        print("Average Error: {}".format(df["Accuracy"].mean()))

        df = pd.DataFrame(list_run_info)
        df = df[df["Accuracy"] < 2].copy()
        sns.scatterplot(df, "GC", "Accuracy",
                    figure_options=FigureOptions(
                        save_fig=next_name(env["pd-work"]),
                        xlabel="GC",
                        ylabel="Percentage of different 5' ends",
                        ylim=[0,10],
                    ))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"])
        ))

        print("Average Error: {}".format(df["Accuracy"].mean()))

        df.to_csv(next_name(env["pd-work"], ext="csv"))
Example #17
    def _run_helper(self, comparison, pd_output):
        # type: (Dict[str, Any], str) -> None

        mkdir_p(pd_output)
        df = self._stats_summary_to_df(comparison["stats"])
        df.to_csv(os.path.join(pd_output, "summary_stats.csv"), index=False)
Example #18
    def _setup_pbs_run(self):
        # type: () -> None
        mkdir_p(self._prl_options["pbs-pd-head"])
Example #19
    def _generate_pbs_header_array(num_jobs, job_name, prl_options,
                                   pd_compute):
        """

        :param num_jobs:
        :param job_name:
        :param prl_options:
        :type prl_options: ParallelizationOptions
        :return:
        """

        num_nodes = prl_options["pbs-nodes"]
        ppn = prl_options["pbs-ppn"]
        walltime = prl_options["pbs-walltime"]

        pd_compute = os.path.abspath(
            os.path.join(prl_options["pbs-pd-root-compute"],
                         prl_options["pbs-dn-compute"]))

        pd_job_template = os.path.join(pd_compute, "job_${PBS_ARRAYID}")

        pd_pbs_logs = os.path.join(prl_options["pbs-pd-head"], "pbs_logs")
        mkdir_p(pd_pbs_logs)

        node_property = prl_options.safe_get("pbs-node-property")
        if node_property is not None:
            node_property = ":" + node_property
        else:
            node_property = ""

        pbs_text = ""

        pbs_text += "#PBS -N " + str(job_name) + "\n"
        pbs_text += "#PBS -o " + "{}/{}".format(pd_pbs_logs,
                                                "error_${PBS_ARRAYID}") + "\n"
        pbs_text += "#PBS -j oe" + "\n"
        pbs_text += "#PBS -l nodes=" + str(num_nodes) + ":ppn=" + str(
            ppn) + "{}\n".format(node_property)
        pbs_text += "#PBS -l walltime=" + str(walltime) + "\n"

        if prl_options:
            array_param = "1-{}".format(num_jobs)
            if prl_options["pbs-concurrent-nodes"]:
                total_concurrent_jobs = prl_options[
                    "pbs-concurrent-nodes"] * int(8 / ppn)
                array_param = "{}%{}".format(array_param,
                                             total_concurrent_jobs)

            pbs_text += "#PBS -t {}".format(array_param) + "\n"

        pbs_text += "#PBS -W umask=002" + "\n"

        pbs_text += "export PATH=\"/home/karl/anaconda/envs/biogem_sbsp/bin:$PATH\"\n"

        pbs_text += "mkdir -p {}".format(pd_job_template) + "\n"

        pbs_text += "PBS_O_WORKDIR=" + pd_job_template + "\n"
        pbs_text += "cd $PBS_O_WORKDIR \n"
        pbs_text += "sleep 60\n"

        pbs_text += "echo The working directory is `echo $PBS_O_WORKDIR`" + "\n"
        pbs_text += "echo This job runs on the following nodes:" + "\n"
        pbs_text += "echo `cat $PBS_NODEFILE`" + "\n"

        return pbs_text
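
# For, say, num_jobs=100, nodes=1, ppn=8, walltime="24:00:00" and no node
# property (values illustrative), the generated header begins roughly:
#
#   #PBS -N <job_name>
#   #PBS -o <pbs-pd-head>/pbs_logs/error_${PBS_ARRAYID}
#   #PBS -j oe
#   #PBS -l nodes=1:ppn=8
#   #PBS -l walltime=24:00:00
#   #PBS -t 1-100%<concurrent-jobs>
#   #PBS -W umask=002
#
# followed by the PATH export and the per-job mkdir/cd/sleep preamble.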
def analyze_upstream_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    pd_work = os_join(env["pd-work"], "upstream_distances")
    mkdir_p(pd_work)

    # remove empty lists
    df = df[df["Upstream-distance"] != "[]"].copy()
    df["Upstream-distance"] = df["Upstream-distance"].apply(ast.literal_eval)
    df["Most frequent upstream"] = df["Upstream-distance"].apply(most_frequent)

    # compute consistencies with different flexibilities
    for flexibility in {0, 3}:
        df["PC(x,{})".format(flexibility)] = df[[
            "Most frequent upstream", "Upstream-distance"
        ]].apply(lambda r: compute_consistency(r["Upstream-distance"], r[
            "Most frequent upstream"], flexibility),
                 axis=1)

    df = df[df["Support"] > 10].copy()

    # for mf in range(-20, 50):
    #     df_mf = df[df["Most frequent upstream"] == mf]
    #     if len(df_mf) < 50:
    #         continue
    #
    #     sns.distplot(df_mf, "PC(x,0)", figure_options=FigureOptions(
    #         title="PC({},{})".format(mf, 0),
    #         save_fig=next_name(pd_work),
    #         xlim=(0,1)
    #     ))
    #     sns.distplot(df_mf, "PC(x,3)", figure_options=FigureOptions(
    #         title="PC({},{})".format(mf, 3),
    #         save_fig=next_name(pd_work),
    #         xlim=(0, 1)
    #     ))

    # plot distribution of Average PC
    import seaborn
    import matplotlib.pyplot as plt

    df_tmp = df[(df["Support"] > 10) & (df["Most frequent upstream"] < 100) &
                (df["Most frequent upstream"] > -50)]
    # NCBI consistency as a function of the most frequent upstream distance
    df = df[(df["Support"] > 10) & (df["GMS2=SBSP"]) &
            (df["Most frequent upstream"] < 100) &
            (df["Most frequent upstream"] > -50)]

    df_tmp = stack_columns_as_rows(
        df_tmp[["Most frequent upstream", "PC(x,0)", "PC(x,3)",
                "Ancestor"]], ["PC(x,0)", "PC(x,3)"],
        "PC(x,f)",
        None,
        label_col="Flexibility")
    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #                scatter=False, hue="Flexibility", lowess=True)
    # plt.show()
    #
    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #             hue="Flexibility", lowess=True)
    # plt.show()
    #
    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #                scatter=False, hue="Flexibility")
    # plt.show()

    sns.lmplot(df_tmp,
               "Most frequent upstream",
               "PC(x,f)",
               hue="Flexibility",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    sns.distplot(df,
                 "Most frequent upstream",
                 figure_options=FigureOptions(save_fig=next_name(pd_work)),
                 sns_kwargs={"kde": True})

    # seaborn.countplot("Most frequent upstream", data=df[(df["Most frequent upstream"] < 10) & (df["Most frequent upstream"] > -10)], hue="Ancestor")
    (df[(df["Most frequent upstream"] < 10)
        & (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts(normalize=True).mul(100).rename(
         'Percentage (by clade)').reset_index().pipe(
             (seaborn.catplot, 'data'),
             x="Most frequent upstream",
             y='Percentage (by clade)',
             hue="Ancestor",
             kind='point',
             scale=0.5,
             legend=False,
             palette=CM.get_map("ancestor"),
             aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Percent of components (by clade)")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    (df[(df["Most frequent upstream"] < 10)
        & (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts().rename(
         'number').reset_index().pipe((seaborn.catplot, 'data'),
                                      x="Most frequent upstream",
                                      y='number',
                                      hue="Ancestor",
                                      kind='point',
                                      scale=0.5,
                                      legend=False,
                                      palette=CM.get_map("ancestor"),
                                      aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Number of components")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    f, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    for ancestor, df_group in df.groupby("Ancestor"):
        seaborn.distplot(df_group["Most frequent upstream"], kde=False, ax=ax1)

        # ax2.set_ylim(0, 3)
        ax2.yaxis.set_ticks([])
        seaborn.kdeplot(df_group["Most frequent upstream"], ax=ax2)
        ax1.set_xlabel('x var')
        ax1.set_ylabel('Counts')
    # g = seaborn.FacetGrid(df, hue="Ancestor")
    # g = g.map(seaborn.distplot, "Most frequent upstream", hist=True)
    plt.show()

    print(df["Most frequent upstream"].value_counts(normalize=True))

    sns.lmplot(
        df,
        "Most frequent upstream",
        "PC(x,0)",
        hue="Ancestor",
        sns_kwargs={
            "scatter": False,
            "lowess": True,
            "palette": CM.get_map("ancestor")
        },
        figure_options=FigureOptions(save_fig=next_name(pd_work),
                                     xlim=[-7, None],
                                     ylim=[0, 1]),
    )

    sns.lmplot(df,
               "Most frequent upstream",
               "PC(x,3)",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True,
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    # NCBI sensitivity
    # collect:
    # average 5' per ancestor, r,

    ranges = [(-5, 0), (0, 10), (10, 30), (30, 50), (50, 70)]
    list_collect = list()
    for r in ranges:

        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])

        df_summary_per_gcfid = get_summary_per_gcfid(df[r_filter])
        # viz_summary_per_gcfid(env, df_summary_per_gcfid, title=str(r))

        df_summary_per_gcfid = df_summary_per_gcfid.groupby(
            "Ancestor", as_index=False).mean()
        df_summary_per_gcfid["Range"] = str(r)
        list_collect.append(df_summary_per_gcfid)

    df_tmp = pd.concat(list_collect, sort=False)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # do not average per gcfid - average per ancestor
    list_collect = list()

    range_avgs = list()
    range_label = list()

    for r in ranges:
        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])
        df_r = df[r_filter]

        for ancestor, df_group in df_r.groupby(
                "Ancestor", as_index=False):  # type: str, pd.DataFrame

            f_gms2_eq_sbsp_with_ncbi_pred = (df_group["GMS2=SBSP"]) & (
                df_group["NCBI"])
            f_gms2_eq_sbsp_not_eq_ncbi = (f_gms2_eq_sbsp_with_ncbi_pred) & (
                df_group["(GMS2=SBSP)!=NCBI"])

            sensitivity = 100 * f_gms2_eq_sbsp_not_eq_ncbi.sum() / float(
                f_gms2_eq_sbsp_with_ncbi_pred.sum())
            list_collect.append({
                "Ancestor": ancestor,
                "Range": str(r),
                "range_avg": (r[1] + r[0]) / 2.0,
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP": sensitivity,
                "GMS2=SBSP": f_gms2_eq_sbsp_with_ncbi_pred.sum()
            })

        range_label.append(r)
        range_avgs.append((r[1] + r[0]) / 2.0)

    df_tmp = pd.DataFrame(list_collect)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    ancestors = list(set(df_tmp["Ancestor"]))
    fig, axes = plt.subplots(
        len(ancestors),
        1,
        sharex="all",
    )
    for ancestor, ax in zip(ancestors, axes.ravel()):  # type: str, plt.Axes
        ax2 = ax.twinx()
        curr_df = df_tmp[df_tmp["Ancestor"] == ancestor]
        seaborn.lineplot("range_avg",
                         "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                         data=curr_df,
                         ax=ax)
        seaborn.lineplot("range_avg",
                         "GMS2=SBSP",
                         data=curr_df,
                         color='r',
                         legend=False,
                         ax=ax2)
        ax.set_ylabel(None)
        ax2.set_ylabel(None)
        ax.set_xlabel("Range Average")

    plt.xticks(range_avgs, range_label)
    plt.show()

    fig, ax = plt.subplots()
    ax2 = ax.twinx()
    seaborn.lineplot("range_avg",
                     "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                     data=df_tmp,
                     ax=ax,
                     color="b",
                     ci=None,
                     hue="Ancestor")
    seaborn.lineplot("range_avg",
                     "GMS2=SBSP",
                     data=df_tmp,
                     ci=None,
                     color='r',
                     legend=False,
                     ax=ax2,
                     hue="Ancestor")
    # plt.xticks(range_avgs, range_label)
    ax.set_ylim([0, None])
    ax2.set_ylim([0, None])

    ax.set_ylabel("NCBI 5' error rate vs GMS2=SBSP")
    ax2.set_ylabel("Number of GMS2=SBSP genes")
    ax.set_xlabel("Range Average")

    ax.yaxis.label.set_color('b')
    ax2.yaxis.label.set_color('r')
    ax.set_xlabel("Distance to upstream gene (nt)")
    plt.show()

    # sbsp_geom_density(df, "Most frequent upstream", "GMS2=SBSP=NCBI", pd_work)
    #
    # for ancestor, df_group in df.groupby("Ancestor", as_index=False):
    #     sbsp_geom_density(df_group, "Most frequent upstream", "GMS2=SBSP=NCBI", pd_work, ancestor)
    #     sbsp_geom_density(df_group, "Support", "GMS2=SBSP=NCBI", pd_work, ancestor)
