コード例 #1
0
def get_fold_change_distribution(nr: NameResolver):
    """Plot density curves of significant log2 fold changes per method.

    For each quantification method the DESeq2 result file is resolved
    through ``nr`` (requested with ``lfc=True``), rows whose adjusted
    p-value is below the cutoff are kept, and a Gaussian kernel density
    estimate of their log2 fold changes is drawn on a shared axis. The
    figure is written to ``density.png`` and then displayed.

    Args:
        nr: Resolver used to locate the DESeq2 result files.
    """
    lab = "winata"
    methods = ["star", "salmon", "kallisto"]
    condition = [24, 72]
    colors = palette.get_color_list
    p_value = 0.05
    x_limit = 20

    # The evaluation grid is the same for every method, so build it once
    # instead of on every loop iteration.
    xs = np.linspace(-x_limit, x_limit, 1000)

    # zip() stops at the shorter input, so at most one curve per colour.
    for m, c in zip(methods, colors):
        fn = nr.deseq2_results(method=m, lab=lab, condition=condition,
                               lfc=True)
        df = pd.read_csv(fn)
        significant = df.loc[df[DESEQ2_PADJ] < p_value, DESEQ2_LOG2_CHANGE]
        # Evaluate the KDE once and reuse the values for line and fill.
        ys = gaussian_kde(significant.values)(xs)
        plt.plot(xs, ys, color=c, label=m, lw=2)
        plt.fill_between(xs, ys, color=c, alpha=0.1)

    plt.axvline(0, ls="--", color=palette.black())
    plt.ylabel(f"Probability Density (p-value<{p_value})")
    plt.xlabel("Log$_2$ Fold Change")
    plt.legend(loc=0)
    plt.title(
        f"{condition[0]} v {condition[1]} ({lab} lab)")
    plt.xlim(-x_limit, x_limit)
    plt.grid(axis="both", zorder=0, ls=":")
    plt.ylim(0)
    plt.tight_layout()
    plt.savefig("density.png", dpi=300)
    plt.show()
コード例 #2
0
def run():
    """Entry point: search for constant genes in the WT-vs-Tbx5a results.

    Builds a ``NameResolver`` from ``config.json`` and hands it, together
    with the path of the STAR DESeq2 result file, to
    ``search_constant_genes``.
    """
    resolver = NameResolver("config.json")
    # Only needed by the optional checks that are currently disabled below.
    genes = INTERESTED_GENES
    # geometric_mean(resolver)
    # check_gene_expression(resolver, genes)
    mutant_dir = "/mnt/windows/Enigma/Zebrafish/data/deseq2/mutant"
    wild_type_results = f"{mutant_dir}/AnalysiWTvsTbx5a/star.result_lfc.csv"
    search_constant_genes(resolver, wild_type_results)
コード例 #3
0
def count_pca(nr: NameResolver):
    """Run a PCA over the DESeq2 VST tables and plot PC1 against PC2.

    For every pair of time points the VST table is resolved through
    ``nr``, its columns are suffixed with the method name, and all tables
    are joined column-wise. The PCA is fitted on that matrix and the first
    two rows of ``components_`` (one value per column) are scattered,
    coloured by time point. The figure is written to ``pca.png`` and then
    displayed.

    Args:
        nr: Resolver used to locate the VST files.
    """
    all_conditions = sorted([30, 48, 72])
    lab = "hills"
    methods = ["stringtie"]

    dfs = []
    # Maps "<time>_<method>" to the sample columns of that time point.
    # A key filled by an earlier pair is overwritten by a later pair.
    groups = {}
    for pair in itertools.combinations(all_conditions, 2):
        for method in methods:
            f = nr.deseq2_vst(method, lab=lab, condition=list(pair))
            df = pd.read_csv(f).set_index("gene_id")
            df = df.rename(columns={c: f"{c}_{method}" for c in df.columns})

            cols = df.columns
            half = len(cols) // 2
            # NOTE(review): assumes the first half of the columns belongs
            # to pair[0] and the second half to pair[1] - confirm against
            # the layout of the VST files.
            groups[f"{pair[0]}_{method}"] = list(cols[:half])
            groups[f"{pair[1]}_{method}"] = list(cols[half:])
            dfs.append(df)

    time_colors = {
        str(t): palette.get_color_list[i]
        for i, t in enumerate(all_conditions)
    }

    # Walk the groups directly. Zipping them against a slice of the palette
    # would silently drop sample columns whenever the palette holds fewer
    # colours than there are groups.
    colors = []
    columns = []
    for key, samples in groups.items():
        columns.extend(samples)
        colors.extend([time_colors[key.split("_")[0]]] * len(samples))

    df = pd.concat(dfs, axis=1, join="outer", sort=False)
    # The same sample appears in several pairwise tables; keep the first.
    df = df.loc[:, ~df.columns.duplicated()]
    df = df[columns].fillna(0)

    pca = PCA()
    a1 = pca.fit(df.values)
    var = a1.components_
    percentages = [round(x, 2) for x in a1.explained_variance_ratio_ * 100]
    plt.scatter(var[0, :], var[1, :], color=colors)
    plt.xlabel(f"PC1: {percentages[0]} %")
    plt.ylabel(f"PC2: {percentages[1]} %")

    hdl = [
        Patch(color=color, label=f"{key} hpf")
        for key, color in time_colors.items()
    ]

    plt.legend(handles=hdl, loc=0)
    plt.title(f"Counts with {methods} ({lab} lab)")
    plt.tight_layout()
    plt.savefig("pca.png", dpi=300)
    plt.show()
コード例 #4
0
def mean_sd_plot(nr: NameResolver, *, vst: bool):
    """Scatter per-gene mean against standard deviation, coloured by total.

    Reads either the DESeq2 counts table or the VST table (both resolved
    through ``nr``), computes the row-wise mean, standard deviation and
    sum, and plots mean vs. standard deviation with the row sum as the
    colour. The figure is written to ``mean_sd.png`` and then displayed.

    Args:
        nr: Resolver used to locate the input table.
        vst: When true, read the VST table instead of the counts table.
    """
    lab = "winata"
    condition = [24, 48]
    method = "kallisto"

    # Resolve only the file that is actually going to be read.
    resolve = nr.deseq2_vst if vst else nr.deseq2_counts
    fn = resolve(method=method, lab=lab, condition=condition)

    data = pd.read_csv(fn).set_index("gene_id")
    # Keep the row totals out of the frame: storing them as an extra column
    # would feed them into the mean and standard deviation below.
    totals = data.sum(axis=1)
    plt.scatter(data.mean(axis=1), data.std(axis=1), marker=".",
                c=totals)
    plt.xlabel("Mean")
    plt.ylabel("Standard Deviation")
    clb = plt.colorbar()
    clb.ax.set_title("Total Counts")
    plt.title(f"{condition[0]} vs {condition[1]} ({method}, {lab} lab)")
    plt.tight_layout()
    plt.savefig("mean_sd.png", dpi=300)
    plt.show()
コード例 #5
0
def plot_volcano(nr: NameResolver):
    """Draw one volcano plot for a fixed lab, method and condition pair.

    The DESeq2 result file is resolved through ``nr`` (requested with
    ``lfc=True``) and passed to ``volcano_plot``; the axes are then
    clipped, a vertical line is drawn at zero, and the figure is written
    to ``volcano.png`` and displayed.
    """
    settings = dict(method="salmon", lab="winata", condition=[24, 48])
    results_file = nr.deseq2_results(lfc=True, **settings)
    volcano_plot(results_file)

    plt.ylim(0, 100)
    plt.xlim(-15, 15)
    plt.axvline(0, ls="--", color=palette.black())

    first, second = settings["condition"]
    method, lab = settings["method"], settings["lab"]
    plt.title(f"{first} vs {second} ({method}, {lab} lab)")
    plt.tight_layout()
    plt.savefig("volcano.png", dpi=300)
    plt.show()
コード例 #6
0
def plot_all_volcano(nr: NameResolver):
    """Draw a grid of volcano plots, one per method/condition combination.

    Each panel gets the DESeq2 result file resolved through ``nr`` and is
    rendered by ``volcano_plot`` without labels. A frameless subplot over
    the whole figure carries the shared axis labels. The figure is written
    to ``all_volcano.png`` and then displayed.
    """
    methods = ["salmon", "kallisto", "stringtie"]
    conditions = [[30, 48], [48, 72], [30, 72]]
    max_cols = 3
    lab = "hills"

    # One (title, result file) entry per panel, methods outermost.
    panels = [
        (f"{pair[0]}v{pair[1]} {method}",
         nr.deseq2_results(method, lab=lab, condition=pair, lfc=True))
        for method in methods
        for pair in conditions
    ]

    # Use at most max_cols columns and as many rows as needed for all panels.
    n_panels = len(panels)
    n_cols = min(max_cols, n_panels)
    n_rows = (n_panels + n_cols - 1) // n_cols

    grid = gridspec.GridSpec(n_rows, n_cols)
    fig = plt.figure()
    for slot, (panel_title, results_file) in enumerate(panels):
        ax = fig.add_subplot(grid[slot])  # type:plt.Axes
        volcano_plot(results_file, ax=ax, add_labels=False)
        ax.set_title(panel_title)
        ax.set_xlim(-10, 10)
        ax.set_ylim(0, 50)

    # Frameless subplot spanning the figure, used only for shared labels.
    fig.add_subplot(111, frameon=False)
    plt.tick_params(labelcolor='none', top=False, bottom=False,
                    left=False, right=False)
    plt.ylabel("-Log$_{10}$ Adj P value")
    plt.xlabel("Log$_2$ Fold change")
    plt.tight_layout()
    plt.savefig("all_volcano.png", dpi=300)
    plt.show()
コード例 #7
0
def plot_fold_change(nr: NameResolver, genes: list):
    """Histogram the log2 fold changes of selected, significant genes.

    Reads the DESeq2 result file resolved through ``nr``, passes it
    through ``convert_id_to_gene``, keeps the rows whose ``gene_id`` is in
    ``genes``, and then keeps those with an adjusted p-value of at most
    0.05 and a log2 fold change strictly between -1 and 1. The histogram
    is written to ``genes.png`` and then displayed.

    Args:
        nr: Resolver used to locate the DESeq2 result file.
        genes: Values of the ``gene_id`` column to keep.
    """
    p = Palette()
    lab = "winata"
    method = "star"
    con = [24, 72]
    fn = nr.deseq2_results(method=method, lab=lab, condition=con, lfc=True)
    d = pd.read_csv(fn)
    d = convert_id_to_gene(nr, d)
    d = d[d["gene_id"].isin(genes)]

    # Comparisons against NaN are False, so rows with a missing p-value or
    # fold change drop out here without needing a sentinel fill value.
    lfc = d[DESEQ2_LOG2_CHANGE]
    keep = (d[DESEQ2_PADJ] <= 0.05) & (lfc > -1) & (lfc < 1)

    # Plot the same column that was filtered on.
    plt.hist(lfc[keep].values, bins=50, color=p.ultramarine())
    plt.ylabel("Frequency")
    plt.xlabel("Log$_2$Fold Change")
    plt.title(f"Genes with Log$_2$Fold change (p<0.05)\n{lab} {method} {con}")
    plt.savefig("genes.png", dpi=300)
    plt.show()
コード例 #8
0
def count_genes(nr: NameResolver):
    """Draw a horizontal bar chart of log2 fold changes for selected genes.

    Reads the DESeq2 result file resolved through ``nr``, passes it
    through ``convert_id_to_gene``, keeps the rows whose ``gene_id`` is in
    ``HOUSE_KEEPING`` and plots one bar per gene, sorted by ``gene_id``,
    with the ``DESEQ2_LFCSE`` column as the error bar. The figure is
    written to ``genes.png`` and then displayed.

    Bar colours:
        gray - adjusted p-value above 0.05, or missing
        blue - significant and log2 fold change strictly between -1 and 1
        red  - significant with any other log2 fold change

    Args:
        nr: Resolver used to locate the DESeq2 result file.
    """
    lab = "winata"
    method = "star"
    condition = [24, 72]

    def _bar_color(padj, lfc):
        # "not <=" rather than ">" so that a NaN adjusted p-value counts as
        # not significant instead of falling through to blue/red.
        if not padj <= 0.05:
            return palette.gray()
        if -1 < lfc < 1:
            return palette.blue()
        return palette.red()

    genes = HOUSE_KEEPING
    fn = nr.deseq2_results(method=method,
                           lab=lab,
                           condition=condition,
                           lfc=True)
    data = convert_id_to_gene(nr, pd.read_csv(fn))
    data = data[data["gene_id"].isin(genes)].sort_values(by="gene_id")

    # Build the colours as a plain list. Assigning a new column on the
    # filtered slice would trigger pandas' SettingWithCopy warning.
    colors = [
        _bar_color(padj, lfc)
        for padj, lfc in zip(data[DESEQ2_PADJ], data[DESEQ2_LOG2_CHANGE])
    ]

    ind = range(len(data))
    plt.barh(ind,
             data[DESEQ2_LOG2_CHANGE],
             color=colors,
             xerr=data[DESEQ2_LFCSE],
             error_kw=dict(ecolor=palette.gray(shade=70), capsize=5),
             zorder=100)
    plt.yticks(ind, data["gene_id"])
    # Shade the band between -1 and 1 and mark its borders.
    plt.axvspan(-1, 1, color=palette.gray(shade=20), zorder=0)
    plt.axvline(1, ls="--", color=palette.black(), zorder=0)
    plt.axvline(-1, ls="--", color=palette.black(), zorder=0)
    plt.axvline(0, color=palette.black(), zorder=100)
    plt.grid(axis="both", ls=":", color=palette.gray(), zorder=0)
    plt.title(f"{condition[0]} vs {condition[1]} ({method}, {lab} lab)")
    plt.xlabel("Log$_2$ Fold Change")
    plt.ylabel("Gene")
    plt.tight_layout()
    plt.savefig("genes.png", dpi=300)
    plt.show()
コード例 #9
0
def get_average_dataframe(nr: NameResolver, lab: str, time: int,
                          genotype: str) -> pd.DataFrame:
    """Return the first run's table with TPM replaced by the run average.

    Every run returned by ``_extract_runs(lab, time, genotype)`` is read
    from its tab-separated stringtie output, reduced to one row per
    ``Gene ID`` (the row with the highest TPM wins) and indexed by
    ``Gene ID``. The TPM columns are averaged over the genes present in
    all runs; the remaining columns are taken from the first run.

    Returns:
        A frame with ``Gene ID`` as a regular column and ``TPM`` holding
        the per-gene mean across runs.
    """
    method = "stringtie"

    def _load(run):
        # Highest TPM first, so drop_duplicates keeps the top row per gene.
        table = pd.read_csv(nr.run_output_file(run, method), sep="\t")
        table = table.sort_values(by="TPM", ascending=False)
        table = table.drop_duplicates(subset="Gene ID")
        return table.reset_index(drop=True).set_index("Gene ID")

    per_run = [_load(run) for run in _extract_runs(lab, time, genotype)]

    # Inner join: only genes reported by every run take part in the mean.
    stacked = pd.concat([table[["TPM"]] for table in per_run],
                        join="inner", axis=1)
    avg_name = "TPM_AVG"
    averaged = stacked.mean(axis=1).to_frame(avg_name)

    # All non-TPM columns come from the first run.
    annotations = per_run[0].drop(columns="TPM")
    merged = pd.concat([annotations, averaged], join="inner", axis=1)
    return merged.rename(columns={avg_name: "TPM"}).reset_index()
0
def single_ma(nr: NameResolver):
    """Render a single plot via ``ma_plot`` for a fixed configuration.

    The DESeq2 result file is resolved through ``nr`` (requested with
    ``lfc=True``) and drawn onto a fresh axes object, whose limits and
    title are then set. The figure is written to ``ma.png`` and displayed.
    """
    method, lab = "star", "hills"
    condition = [30, 48]

    _, axes = plt.subplots()
    results_file = nr.deseq2_results(
        method, lab=lab, condition=condition, lfc=True)

    plot_style = {
        "color": palette.cerulean(shade=60),
        "accent_color": palette.green_light(),
        "marker": ".",
    }
    ma_plot(results_file, ax=axes, **plot_style)

    axes.set_xlim(0, 1000)
    axes.set_ylim(-15, 15)
    first, second = condition
    axes.set_title(f"{first} v {second} ({method}, {lab} lab)")
    plt.tight_layout()
    plt.savefig("ma.png", dpi=300)
    plt.show()
コード例 #11
0
def search_constant_genes_in_multiple(nr: NameResolver, lab, method,
                                      conditions):
    """Rank genes that change little in every given comparison.

    For each entry of ``conditions`` the DESeq2 result file is resolved
    through ``nr`` and reduced to rows with an adjusted p-value of at most
    0.05 and a log2 fold change strictly between -1 and 1. Only genes that
    survive in all comparisons are kept. They are ordered by the sum of
    their squared log2 fold changes, smallest first.

    Example arguments: ``lab="winata"``, ``method="star"``,
    ``conditions=[[24, 72]]``.

    Returns:
        Array of the index values (``gene_id`` after
        ``convert_id_to_gene``) in ascending order of that sum.
    """
    per_condition = []
    for con in conditions:
        results = pd.read_csv(
            nr.deseq2_results(method=method, lab=lab, condition=con,
                              lfc=True))
        change = results[DESEQ2_LOG2_CHANGE]
        mask = (results[DESEQ2_PADJ] <= 0.05) & (change > -1) & (change < 1)
        kept = results.loc[mask, ["gene_id", DESEQ2_LOG2_CHANGE]]
        kept = kept.set_index("gene_id")
        # One column per comparison, named after the condition.
        per_condition.append(
            kept.rename(columns={DESEQ2_LOG2_CHANGE: f"{con}"}))

    # Inner join keeps only genes present in every comparison.
    combined = pd.concat(per_condition, axis=1, sort=False, join="inner")
    combined = convert_id_to_gene(nr, combined.reset_index())
    combined = combined.set_index("gene_id")

    squared = combined.apply(lambda column: column ** 2)
    squared["sum"] = squared.sum(axis=1)
    return squared.sort_values(by="sum").index.values
コード例 #12
0
def run():
    """Entry point: build the resolver from ``config.json`` and plot."""
    plot_volcano(NameResolver("config.json"))