def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Visualize motif matrices for the combined bacterial and archaeal data.

    Loads the two inputs named by ``args.pf_input_bac`` and
    ``args.pf_input_arc``, tags each with a ``Type`` column, concatenates
    them, normalizes ``GENOME_TYPE`` to a single upper-case group label and
    then calls ``visualize_matrix_column`` twice:

    * ``RBS_MAT`` over every row;
    * ``PROMOTER_MAT`` over bacterial rows whose group is ``"C"``.
    """

    def _group_label(raw):
        # type: (str) -> str
        # Keep the token after the first "-" (upper-cased); the "D2" label
        # is merged into "D".
        label = raw.strip().split("-")[1].upper()
        return "D" if label == "D2" else label

    bacteria = load_obj(args.pf_input_bac)  # type: pd.DataFrame
    archaea = load_obj(args.pf_input_arc)  # type: pd.DataFrame
    bacteria["Type"] = "Bacteria"
    archaea["Type"] = "Archaea"

    combined = pd.concat([bacteria, archaea], sort=False)
    combined["GENOME_TYPE"] = combined["GENOME_TYPE"].apply(_group_label)
    combined = combined.reset_index()

    import matplotlib
    plot_settings = {
        'font.family': 'serif',
        'text.usetex': False,
        'pgf.rcfonts': False,
    }
    matplotlib.rcParams.update(plot_settings)

    visualize_matrix_column(env, combined, "RBS_MAT")

    is_bacteria = combined["Type"] == "Bacteria"
    is_group_c = combined["GENOME_TYPE"] == "C"
    visualize_matrix_column(env, combined[is_bacteria & is_group_c],
                            "PROMOTER_MAT")
Esempio n. 2
0
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Plot RBS-motif relative entropy against GC for ``group-a`` genomes.

    Loads the data frame stored at ``args.pf_data``, keeps the rows whose
    ``GENOME_TYPE`` is ``"group-a"``, computes a per-row ``RE`` column via
    ``relative_entropy`` (RBS motif model vs. non-coding model) and draws a
    joint plot and a KDE plot of ``GC`` against ``RE``.
    """
    df = load_obj(args.pf_data)  # type: pd.DataFrame
    df.reset_index(inplace=True)
    df = df[df["GENOME_TYPE"] == "group-a"].copy()

    # One value per row, in row order. A plain comprehension (instead of
    # DataFrame.apply(axis=1)) yields an ordinary empty column when no
    # "group-a" rows exist.
    df["RE"] = [
        relative_entropy(MotifModel(rbs_mat, None), GMS2Noncoding(non_mat))
        for rbs_mat, non_mat in zip(df["RBS_MAT"], df["NON_MAT"])
    ]

    # seaborn expects the column names as x=/y= keywords; passing them
    # positionally after the frame binds them to the wrong parameters.
    sns.jointplot(data=df, x="GC", y="RE")
    sns.kdeplot(data=df, x="GC", y="RE")
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the MGM motif models over a test set and save the table as CSV.

    The models are loaded from ``args.pf_mgm_models`` and the test set is
    read from the CSV at ``args.pf_test``. After
    ``run_mgm_models_on_test_data`` has been called on the test frame, the
    frame is written to ``args.pf_output`` without its index.
    """
    models_by_key = load_obj(
        args.pf_mgm_models
    )  # type: Dict[str, Dict[str, Dict[str, MGMMotifModelAllGC]]]
    test_frame = pd.read_csv(args.pf_test)  # type: pd.DataFrame

    # NOTE(review): the helper also receives the output path; presumably it
    # updates test_frame in place before the write below - confirm.
    run_mgm_models_on_test_data(
        env,
        models_by_key,
        test_frame,
        args.species_type,
        args.pf_output,
    )

    test_frame.to_csv(args.pf_output, index=False)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Write a text rendering of the taxonomy tree with target counts.

    When ``args.pf_load_state`` is given, a previously saved tree is loaded
    from it. Otherwise the tree is built from scratch: targets are counted
    per GCFID from ``args.pf_sbsp_output``, mapped to taxonomy ids through
    the assembly summary, and pushed through the tree loaded from
    ``args.pf_taxonomy_tree``; the result is optionally saved to
    ``args.pf_save_state``. The rendered string goes to ``args.pf_output``.
    """
    if args.pf_load_state is not None:
        tree = load_obj(args.pf_load_state)
    else:
        targets_by_gcfid = count_targets_per_gcfid(args.pf_sbsp_output)

        assembly_summary = read_assembly_summary_into_dataframe(
            args.pf_assembly_summary)
        assembly_info_by_gcfid = get_assembly_info_per_gcfid(assembly_summary)

        # Re-key the counts by integer taxid, skipping GCFIDs that have no
        # assembly information. A later GCFID with the same taxid overwrites
        # an earlier one.
        taxid_to_number_of_targets = {}
        for gcfid in targets_by_gcfid:
            if gcfid not in assembly_info_by_gcfid:
                continue
            taxid = int(assembly_info_by_gcfid[gcfid]["taxid"])
            taxid_to_number_of_targets[taxid] = targets_by_gcfid[gcfid]

        tree = TaxonomyTree.load(args.pf_taxonomy_tree)

        update_kwargs = {
            "taxid_to_number_of_targets": taxid_to_number_of_targets
        }
        tree.update_tree_attributes(set_number_of_targets_per_taxid,
                                    update_kwargs,
                                    direction="bottom-up")

        if args.pf_save_state is not None:
            save_obj(tree, args.pf_save_state)

    rendered = tree.to_string(
        check_if_should_print=should_print,
        attribute_name="number_of_targets",
        attribute_format="{:,}",
        tag_name=args.tag,
        max_depth=args.max_depth,
    )
    write_string_to_file(rendered, args.pf_output)
Esempio n. 5
0
 def load(pf_load):
     # type: (str) -> TaxonomyTree
     """Return the object that ``load_obj`` reads from ``pf_load``."""
     loaded_tree = load_obj(pf_load)
     return loaded_tree
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Draw a grid of information-content sequence logos, one per GC bin.

    Loads the data frame at ``args.pf_data``, keeps rows whose
    ``GENOME_TYPE`` is in ``args.group`` (and, for the ``"PROMOTER"`` motif
    type, only rows with ``GC >= 40``), builds one model per GC bin with
    ``get_models_by_gc`` and plots each non-``None`` model as a logo. The
    figure is saved to the next free name under ``env["pd-work"]`` and then
    shown.

    Raises:
        ValueError: if ``get_models_by_gc`` returns no models at all.
    """
    df_bac = load_obj(args.pf_data).reset_index()  # type: pd.DataFrame
    df_bac = df_bac[df_bac["GENOME_TYPE"].isin(args.group)]
    min_gc = 20
    max_gc = 70

    if args.motif_type == "PROMOTER":
        df_bac = df_bac[df_bac["GC"] >= 40].copy()

    # GC bin values: min_gc, min_gc + 2, ... (max_gc itself is excluded).
    gc_values = np.arange(min_gc, max_gc, 2)
    # NOTE(review): models[i] is assumed to correspond to gc_values[i] -
    # confirm against get_models_by_gc.
    models = get_models_by_gc(df_bac, gc_values, motif_type=args.motif_type)

    num_plots = len(models)
    if num_plots == 0:
        # Without this guard the grid computation divides by zero.
        raise ValueError("No models available to plot")

    # Near-square grid with at least num_plots cells.
    num_rows = int(math.sqrt(num_plots))
    num_cols = int(math.ceil(num_plots / float(num_rows)))

    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so every cell can be addressed the same way.
    _, axes = plt.subplots(num_rows,
                           num_cols,
                           sharex="all",
                           sharey="all",
                           figsize=(12, 10),
                           squeeze=False)

    # axes.flat walks the grid row by row, so model i lands in cell i; a
    # missing (None) model simply leaves its cell empty.
    for model_index, model in enumerate(models):
        if model is None:
            continue

        ax = axes.flat[model_index]

        # model[0] is passed as the probability matrix and model[1] as the
        # background used for the information transform.
        info_matrix = lm.transform_matrix(model[0],
                                          to_type="information",
                                          from_type="probability",
                                          background=model[1])
        lm.Logo(info_matrix, ax=ax)

        ax.set_ylim(0, 2)
        ax.set_title(int(gc_values[model_index]))

    plt.tight_layout()
    plt.savefig(next_name(env["pd-work"]))
    plt.show()