예제 #1
0
def _adar_context_labels(frame, mutation, preference_column):
    """Relabel the rows of one mutation type by their ADAR preference level.

    :param frame: DataFrame with a "Mutation" column and ``preference_column``.
    :param mutation: mutation value to relabel, e.g. "A>G".
    :param preference_column: column holding "High" / "Intermediate" / "Low".
    :return: array of labels; rows whose "Mutation" equals ``mutation`` and
             whose preference is one of the three levels become
             "<level>\\nADAR-like\\n<mutation>", every other row keeps its
             current "Mutation" value.
    """
    labels = frame["Mutation"]
    is_target = labels == mutation
    preference = frame[preference_column]
    # The three conditions are mutually exclusive, so the order of application is irrelevant.
    for level in ("High", "Intermediate", "Low"):
        labels = np.where(is_target & (preference == level),
                          "{0}\nADAR-like\n{1}".format(level, mutation), labels)
    return labels


def _annotated_boxplot(data, hue, hue_order, passage_order, pairs, palette, stats_path, figure_path,
                       legend_anchor):
    """Draw a log-scale Frequency-by-passage boxplot, annotate it with Welch t-tests and save it.

    Whatever the Annotator prints while annotating is redirected into ``stats_path``;
    the figure is written to ``figure_path`` and then closed.
    """
    axes = sns.boxplot(x="passage", y="Frequency", data=data, hue=hue, order=passage_order,
                       palette=palette, dodge=True, hue_order=hue_order)
    axes.set_yscale('log')
    axes.set_ylim(10 ** -6, 10 ** -1)
    axes.set(xlabel="Passage", ylabel="Variant Frequency")

    annot = Annotator(axes, pairs, x="passage", y="Frequency", hue=hue, data=data,
                      order=passage_order, hue_order=hue_order)
    annot.configure(test='t-test_welch', text_format='star', loc='outside', verbose=2,
                    comparisons_correction="Bonferroni")
    annot.apply_test()
    # verbose=2 makes the annotator print to stdout; capture that output in a file instead.
    with open(stats_path, "w") as stats_file:
        with contextlib.redirect_stdout(stats_file):
            annot.annotate()
    plt.legend(bbox_to_anchor=legend_anchor, loc=2, borderaxespad=0.)
    plt.tight_layout()
    plt.savefig(figure_path, dpi=300)
    plt.close()


def plots(input_dir, date, data_filter, virus, passage_order, transition_order, pairs, label_order, pairs_adar, filter_reads=None):
    """Create the transition-mutation and ADAR-context plots for one virus.

    All output goes to the directory ``input_dir + date + "_plots"``:
    a point plot and an annotated boxplot of the transition mutations
    (statistics text in ``sts.csv``), followed by a point plot and an annotated
    boxplot of synonymous mutations split by ADAR preference (statistics text
    in ``sts_adar.csv``).

    :param input_dir: path prefix of the output directory (concatenated as is,
                      no separator is inserted).
    :param date: string inserted into the output directory name.
    :param data_filter: DataFrame with the columns "passage", "frac_and_weight",
                        "Frequency", "Mutation", "Type", "5`_ADAR_Preference"
                        and "3`_ADAR_Preference" (plus "Prob", "no_variants"
                        and "Read_count" when ``filter_reads`` is True).
                        NOTE: the frame is modified in place. Without
                        ``filter_reads`` its "passage" column is converted to
                        "p"-prefixed strings; with ``filter_reads`` its
                        "no_variants" column is updated and the remaining work
                        happens on a filtered copy.
    :param virus: name appended to the figure file names.
    :param passage_order: order of the passages on the x axis.
    :param transition_order: hue order of the transition mutations.
    :param pairs: pairs handed to the Annotator for the transition boxplot.
    :param label_order: unused; kept so existing callers keep working.
    :param pairs_adar: pairs handed to the Annotator for the ADAR boxplot.
    :param filter_reads: when exactly True, zero "no_variants" where
                         "Prob" < 0.95 and keep only rows with
                         "Read_count" > 10000.
    :return: None
    """
    output_dir = input_dir + date + "_plots"
    plus_minus = "\u00B1"
    try:
        os.mkdir(output_dir)
    except OSError:
        print("Creation of the directory %s failed" % output_dir)
    else:
        print("Successfully created the directory %s " % output_dir)

    if filter_reads is True:
        data_filter["no_variants"] = np.where(data_filter["Prob"] < 0.95, 0, data_filter["no_variants"])
        # Keep only well-covered rows. The previous code assigned the filtered frame to the
        # "Read_count" column, which is not a row filter. Copy so later column assignments
        # act on an independent frame.
        data_filter = data_filter[data_filter["Read_count"] > 10000].copy()

    # Passages are plotted as categorical labels ("p1", "p2", ...); the RNA control keeps its name.
    data_filter["passage"] = data_filter["passage"].astype(str)
    data_filter["passage"] = np.where(data_filter["passage"] != "RNA\nControl", "p" + data_filter["passage"],
                                      data_filter["passage"])

    # Transition mutations: point plot.
    g2 = sns.catplot(x="passage", y="frac_and_weight", data=data_filter, hue="Mutation", order=passage_order,
                     palette=mutation_palette(4), kind="point", dodge=0.5, hue_order=transition_order,
                     join=False, estimator=weighted_varaint, orient="v")
    g2.set_axis_labels("Passage", "Variant Frequency {} CI=95%".format(plus_minus))
    g2.set(yscale='log')
    g2.set(ylim=(10 ** -6, 10 ** -2))
    g2.savefig(output_dir + "/Transition_Mutations_point_plot_{0}".format(virus), dpi=300)
    plt.close()

    # Transition mutations: boxplot with statistical annotations.
    _annotated_boxplot(data_filter, "Mutation", transition_order, passage_order, pairs, mutation_palette(4),
                       output_dir + "/sts.csv",
                       output_dir + "/Transition_Mutations_box_stat_plot_{0}".format(virus),
                       (1.05, 0.5))

    # Synonymous mutations split by ADAR context. Work on a copy so the new columns are not
    # written through a slice of data_filter.
    data_filter_synonymous = data_filter.loc[data_filter.Type == "Synonymous"].copy()
    data_filter_synonymous["Mutation"] = _adar_context_labels(data_filter_synonymous, "A>G",
                                                              "5`_ADAR_Preference")
    # Built on top of the relabelled "Mutation" column, so it carries both the A>G and U>C labels.
    data_filter_synonymous["Mutation_adar"] = _adar_context_labels(data_filter_synonymous, "U>C",
                                                                   "3`_ADAR_Preference")
    # Only the High and Low groups are plotted.
    mutation_adar_order = ["High\nADAR-like\nA>G", "Low\nADAR-like\nA>G",
                           "High\nADAR-like\nU>C", "Low\nADAR-like\nU>C"]

    catplot_adar = sns.catplot(x="passage", y="frac_and_weight", data=data_filter_synonymous, hue="Mutation_adar",
                               order=passage_order, palette=mutation_palette(4, adar=True), kind="point", dodge=0.5,
                               hue_order=mutation_adar_order, join=False, estimator=weighted_varaint, orient="v",
                               legend=True)
    catplot_adar.set_axis_labels("Passage", "Variant Frequency {0} CI=95%".format(plus_minus))
    catplot_adar.set(yscale='log')
    catplot_adar.set(ylim=(10 ** -6, 10 ** -2))
    plt.savefig(output_dir + "/adar_pref_mutation_point_plot_{0}.png".format(virus), dpi=300)
    plt.close()

    _annotated_boxplot(data_filter_synonymous, "Mutation_adar", mutation_adar_order, passage_order, pairs_adar,
                       mutation_palette(4, adar=True),
                       output_dir + "/sts_adar.csv",
                       output_dir + "/adar_pref_mutation_box_plot_{0}.png".format(virus),
                       (1.05, 1))
예제 #2
0
def main():
    replica_lst = (1, 2, 3)
    input_dir = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/passages/"
    prefix = "inosine_predict_context"
    output_dir = input_dir + "20201112_10000coverage_%s" % prefix
    try:
        os.mkdir(output_dir)
    except OSError:
        print("Creation of the directory %s failed" % output_dir)
    else:
        print("Successfully created the directory %s " % output_dir)
    for replica in replica_lst:
        data_filter = pd.read_pickle(input_dir + prefix + "/data_filter.pkl")
        data_filter_ag = pd.read_pickle(input_dir + prefix +
                                        "/data_filter_ag.pkl")
        data_filter_uc = pd.read_pickle(input_dir + prefix +
                                        "/data_filter_uc.pkl")
        data_filter["passage"] = data_filter["passage"].astype(int)
        data_filter_ag["passage"] = data_filter_ag["passage"].astype(int)
        data_filter_uc["passage"] = data_filter_uc["passage"].astype(int)
        data_filter = data_filter[(data_filter['Read_count'] > 10000)]
        data_filter_ag = data_filter_ag[(data_filter_ag['Read_count'] > 10000)]
        data_filter_uc = data_filter_uc[(data_filter_uc['Read_count'] > 10000)]

        data_filter = data_filter[data_filter["label"] != "p10-1"]
        data_filter_ag = data_filter_ag[data_filter_ag["label"] != "p10-1"]
        data_filter_uc = data_filter_uc[data_filter_uc["label"] != "p10-1"]

        data_filter["replica"] = np.where(
            data_filter["label"] == "RNA Control\nPrimer ID", replica,
            data_filter["replica"])
        data_filter_ag["replica"] = np.where(
            data_filter_ag["label"] == "RNA Control\nPrimer ID", replica,
            data_filter_ag["replica"])
        data_filter_uc["replica"] = np.where(
            data_filter_uc["label"] == "RNA Control\nPrimer ID", replica,
            data_filter_uc["replica"])

        data_filter = data_filter[data_filter["replica"] == replica]
        data_filter_ag = data_filter_ag[data_filter_ag["replica"] == replica]
        data_filter_uc = data_filter_uc[data_filter_uc["replica"] == replica]

        #Plots
        transition_order = ["A>G", "U>C", "G>A", "C>U"]
        type_order = ["Synonymous", "Non-Synonymous", "Premature Stop Codon"]
        type_order_ag = ["Synonymous", "Non-Synonymous"]
        adar_preference = ["High", "Intermediate", "Low"]

        data_filter_grouped = data_filter.groupby(
            ["label", "passage", "replica", "Type",
             "Mutation"])["frac_and_weight"].agg(lambda x: weighted_varaint(x))
        data_filter_grouped = data_filter_grouped.reset_index()

        data_filter_grouped = data_filter_grouped.rename(
            columns={"frac_and_weight": "Frequency"})
        data_filter_grouped["Frequency"] = data_filter_grouped[
            "Frequency"].astype(float)
        data_filter_grouped = data_filter_grouped[
            data_filter_grouped["label"] != "RNA Control\nPrimer ID"]
        data_filter_grouped = data_filter_grouped[
            data_filter_grouped["label"] != "RNA Control\nRND"]
        data_filter_grouped["replica"] = data_filter_grouped["replica"].astype(
            int)
        data_filter_grouped = data_filter_grouped[
            data_filter_grouped["replica"] == replica]
        print(data_filter_grouped.to_string())

        data_reg_ag = data_filter_grouped[data_filter_grouped["Mutation"] ==
                                          "A>G"]
        data_reg_uc = data_filter_grouped[data_filter_grouped["Mutation"] ==
                                          "U>C"]
        data_reg_ga = data_filter_grouped[data_filter_grouped["Mutation"] ==
                                          "G>A"]
        data_reg_cu = data_filter_grouped[data_filter_grouped["Mutation"] ==
                                          "C>U"]

        data_reg_ag_syn = data_reg_ag[data_reg_ag["Type"] == "Synonymous"]
        data_reg_uc_syn = data_reg_uc[data_reg_uc["Type"] == "Synonymous"]
        data_reg_ga_syn = data_reg_ga[data_reg_ga["Type"] == "Synonymous"]
        data_reg_cu_syn = data_reg_cu[data_reg_cu["Type"] == "Synonymous"]

        data_reg_ag_non_syn = data_reg_ag[data_reg_ag["Type"] ==
                                          "Non-Synonymous"]
        data_reg_uc_non_syn = data_reg_uc[data_reg_uc["Type"] ==
                                          "Non-Synonymous"]
        data_reg_ga_non_syn = data_reg_ga[data_reg_ga["Type"] ==
                                          "Non-Synonymous"]
        data_reg_cu_non_syn = data_reg_cu[data_reg_cu["Type"] ==
                                          "Non-Synonymous"]

        data_reg_ga_pmsc = data_reg_ga[data_reg_ga["Type"] ==
                                       "Premature Stop Codon"]
        data_reg_cu_pmsc = data_reg_cu[data_reg_cu["Type"] ==
                                       "Premature Stop Codon"]

        stat_slope1, stat_intercept1, r_value1, p_value1, std_err1 = stats.linregress(
            data_reg_ag_syn['passage'], data_reg_ag_syn['Frequency'])
        stat_slope2, stat_intercept2, r_value2, p_value2, std_err2 = stats.linregress(
            data_reg_uc_syn['passage'], data_reg_uc_syn['Frequency'])
        stat_slope3, stat_intercept3, r_value3, p_value3, std_err3 = stats.linregress(
            data_reg_ga_syn['passage'], data_reg_ga_syn['Frequency'])
        stat_slope4, stat_intercept4, r_value4, p_value4, std_err4 = stats.linregress(
            data_reg_cu_syn['passage'], data_reg_cu_syn['Frequency'])
        # data_reg_adar_nonsyn
        stat_slope5, stat_intercept5, r_value5, p_value5, std_err5 = stats.linregress(
            data_reg_ag_non_syn['passage'], data_reg_ag_non_syn['Frequency'])
        stat_slope6, stat_intercept6, r_value6, p_value6, std_err6 = stats.linregress(
            data_reg_uc_non_syn['passage'], data_reg_uc_non_syn['Frequency'])
        stat_slope7, stat_intercept7, r_value7, p_value7, std_err7 = stats.linregress(
            data_reg_ga_non_syn['passage'], data_reg_ga_non_syn['Frequency'])
        stat_slope8, stat_intercept8, r_value8, p_value8, std_err8 = stats.linregress(
            data_reg_cu_non_syn['passage'], data_reg_cu_non_syn['Frequency'])
        #pmsc
        stat_slope11, stat_intercept11, r_value11, p_value11, std_err11 = stats.linregress(
            data_reg_ga_pmsc['passage'], data_reg_ga_pmsc['Frequency'])
        stat_slope12, stat_intercept12, r_value12, p_value12, std_err12 = stats.linregress(
            data_reg_cu_pmsc['passage'], data_reg_cu_pmsc['Frequency'])
        data_filter_grouped = data_filter_grouped.rename(
            columns={"passage": "Passage"})
        reg_plot = sns.lmplot(x="Passage",
                              y="Frequency",
                              data=data_filter_grouped,
                              hue="Mutation",
                              hue_order=transition_order,
                              fit_reg=True,
                              col="Mutation",
                              col_order=transition_order,
                              row="Type",
                              row_order=type_order,
                              palette=mutation_palette(4),
                              line_kws={'label': "Linear Reg"},
                              legend=True,
                              height=6,
                              x_estimator=np.mean,
                              x_jitter=.05)  # markers=["o", "v", "x"]
        reg_plot.fig.subplots_adjust(wspace=.02)
        ax = reg_plot.axes[0, 0]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        label_line_1 = "y={0:.3g}x+{1:.3g}".format(stat_slope1,
                                                   stat_intercept1)
        label_line_2 = "y={0:.3g}x+{1:.3g}".format(stat_slope2,
                                                   stat_intercept2)
        label_line_3 = "y={0:.3g}x+{1:.3g}".format(stat_slope3,
                                                   stat_intercept3)
        label_line_4 = "y={0:.3g}x+{1:.3g}".format(stat_slope4,
                                                   stat_intercept4)
        label_line_5 = "y={0:.3g}x+{1:.3g}".format(stat_slope5,
                                                   stat_intercept5)
        label_line_6 = "y={0:.3g}x+{1:.3g}".format(stat_slope6,
                                                   stat_intercept6)
        label_line_7 = "y={0:.3g}x+{1:.3g}".format(stat_slope7,
                                                   stat_intercept7)
        label_line_8 = "y={0:.3g}x+{1:.3g}".format(stat_slope8,
                                                   stat_intercept8)
        label_line_11 = "y={0:.3g}x+{1:.3g}".format(stat_slope11,
                                                    stat_intercept11)
        label_line_12 = "y={0:.3g}x+{1:.3g}".format(stat_slope12,
                                                    stat_intercept12)
        L_labels[0].set_text(label_line_1)
        ax = reg_plot.axes[0, 1]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_2)
        ax = reg_plot.axes[0, 2]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_3)
        ax = reg_plot.axes[0, 3]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_4)
        ax = reg_plot.axes[1, 0]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_5)
        ax = reg_plot.axes[1, 1]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_6)
        ax = reg_plot.axes[1, 2]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_7)
        ax = reg_plot.axes[1, 3]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_8)
        ax = reg_plot.axes[2, 2]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_11)
        ax = reg_plot.axes[2, 3]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_12)

        reg_plot.set(xlim=(0, 13))
        reg_plot.set(ylim=(0.000, 0.001))
        # reg_plot.fig.suptitle("RV #%s" % str(replica), y=0.99)
        # plt.tight_layout()
        reg_plot.savefig(output_dir +
                         "/transition_lmplot_replica%s.png" % str(replica),
                         dpi=300)
        plt.close()
        data_filter["Transition"] = data_filter.Mutation.str.contains("A>G") | data_filter.Mutation.str.contains("U>C") \
                                    | data_filter.Mutation.str.contains("C>U") | data_filter.Mutation.str.contains("G>A")
        data_transition = data_filter[data_filter["Transition"] == True]
        position_mutation = sns.relplot(x="Pos",
                                        y="Frequency",
                                        data=data_transition,
                                        hue="Mutation",
                                        col="passage",
                                        hue_order=transition_order,
                                        col_wrap=3,
                                        palette=mutation_palette(4),
                                        height=4)
        #, estimator=weighted_varaint)#,

        position_mutation.set_axis_labels("", "Variant Frequency")
        position_mutation.axes.flat[0].set_yscale('symlog', linthreshy=10**-5)
        position_mutation.axes.flat[0].set_ylim(10**-6, 10**-1)
        plt.savefig(output_dir + "/position_mutation_replica%s.png" % replica,
                    dpi=300)
        plt.close()

        columns = ["Mutation", "Replica", "Type", "Slope", "Intercept"]
        mutation_rate_df = pd.DataFrame(columns=columns)
        mutation_rate_df.loc[0] = [
            "A>G", replica, "Synonymous", stat_slope1, stat_intercept1
        ]
        mutation_rate_df.loc[1] = [
            "U>C", replica, "Synonymous", stat_slope2, stat_intercept2
        ]
        mutation_rate_df.loc[2] = [
            "G>A", replica, "Synonymous", stat_slope3, stat_intercept3
        ]
        mutation_rate_df.loc[3] = [
            "C>U", replica, "Synonymous", stat_slope4, stat_intercept4
        ]
        mutation_rate_df.loc[4] = [
            "A>G", replica, "Non-Synonymous", stat_slope5, stat_intercept5
        ]
        mutation_rate_df.loc[5] = [
            "U>C", replica, "Non-Synonymous", stat_slope6, stat_intercept6
        ]
        mutation_rate_df.loc[6] = [
            "G>A", replica, "Non-Synonymous", stat_slope7, stat_intercept7
        ]
        mutation_rate_df.loc[7] = [
            "C>U", replica, "Non-Synonymous", stat_slope8, stat_intercept8
        ]
        mutation_rate_df.loc[8] = [
            "G>A", replica, "Pre Mature Stop Codon", stat_slope11,
            stat_intercept11
        ]
        mutation_rate_df.loc[9] = [
            "C>U", replica, "Pre Mature Stop Codon", stat_slope12,
            stat_intercept12
        ]

        mutation_rate_df.to_csv(output_dir + "/mutation_rate%s.csv" % replica,
                                sep=',',
                                encoding='utf-8')
        mutation_rate_df.to_pickle(output_dir +
                                   "/mutation_rate%s.pkl" % replica)

        # # A>G Prev Context
        mutation_ag = sns.catplot("passage",
                                  "frac_and_weight",
                                  data=data_filter_ag,
                                  hue="5`_ADAR_Preference",
                                  palette=mutation_palette(3,
                                                           adar=True,
                                                           ag=True),
                                  kind="point",
                                  dodge=True,
                                  estimator=weighted_varaint,
                                  orient="v",
                                  col="Type",
                                  join=False,
                                  col_order=type_order_ag,
                                  hue_order=adar_preference)
        mutation_ag.set(yscale="log")
        mutation_ag.set(ylim=(1 * 10**-5, 1 * 10**-2))
        mutation_ag.fig.suptitle("A>G ADAR_like Mutation in RV #%s" % replica,
                                 y=0.99)
        plt.subplots_adjust(top=0.85)
        mutation_ag.set_axis_labels("Passage", "Variant Frequency")
        mutation_ag.savefig(
            output_dir + "/ag_ADAR_like_Mutation_col_replica%s.png" % replica,
            dpi=300)
        plt.close()

        data_filter_ag_grouped = data_filter_ag.groupby([
            "label", "Type", "passage", "replica", "ADAR_grade_five",
            "5`_ADAR_Preference"
        ])["frac_and_weight"].agg(lambda x: weighted_varaint(x))
        data_filter_ag_grouped = data_filter_ag_grouped.reset_index()

        # print(data_filter_ag_grouped.to_string())

        data_filter_ag_grouped = data_filter_ag_grouped.rename(
            columns={"frac_and_weight": "Frequency"})
        data_filter_ag_grouped["Frequency"] = data_filter_ag_grouped[
            "Frequency"].astype(float)
        data_filter_ag_grouped = data_filter_ag_grouped[
            data_filter_ag_grouped["label"] != "RNA Control\nPrimer ID"]
        data_filter_ag_grouped = data_filter_ag_grouped[
            data_filter_ag_grouped["label"] != "RNA Control\nRND"]
        data_filter_ag_grouped = data_filter_ag_grouped[
            data_filter_ag_grouped["replica"] == replica]

        data_reg_full_adar = data_filter_ag_grouped[
            data_filter_ag_grouped["ADAR_grade_five"] == 1]
        data_reg_semi_adar = data_filter_ag_grouped[
            data_filter_ag_grouped["ADAR_grade_five"] == 0.5]
        data_reg_nonadar = data_filter_ag_grouped[
            data_filter_ag_grouped["ADAR_grade_five"] == 0]

        data_reg_full_adar_syn = data_reg_full_adar[data_reg_full_adar["Type"]
                                                    == "Synonymous"]
        data_reg_semi_adar_syn = data_reg_semi_adar[data_reg_semi_adar["Type"]
                                                    == "Synonymous"]
        data_reg_nonadar_syn = data_reg_nonadar[data_reg_nonadar["Type"] ==
                                                "Synonymous"]

        data_reg_full_adar_non_syn = data_reg_full_adar[
            data_reg_full_adar["Type"] == "Non-Synonymous"]
        data_reg_semi_adar_non_syn = data_reg_semi_adar[
            data_reg_semi_adar["Type"] == "Non-Synonymous"]
        data_reg_nonadar_non_syn = data_reg_nonadar[data_reg_nonadar["Type"] ==
                                                    "Non-Synonymous"]

        stat_slope1, stat_intercept1, r_value1, p_value1, std_err1 = stats.linregress(
            data_reg_full_adar_syn['passage'],
            data_reg_full_adar_syn['Frequency'])
        stat_slope2, stat_intercept2, r_value2, p_value2, std_err2 = stats.linregress(
            data_reg_semi_adar_syn['passage'],
            data_reg_semi_adar_syn['Frequency'])
        stat_slope3, stat_intercept3, r_value3, p_value3, std_err3 = stats.linregress(
            data_reg_nonadar_syn['passage'], data_reg_nonadar_syn['Frequency'])
        # data_reg_adar_nonsyn
        stat_slope4, stat_intercept4, r_value4, p_value4, std_err4 = stats.linregress(
            data_reg_full_adar_non_syn['passage'],
            data_reg_full_adar_non_syn['Frequency'])
        stat_slope5, stat_intercept5, r_value5, p_value5, std_err5 = stats.linregress(
            data_reg_semi_adar_non_syn['passage'],
            data_reg_semi_adar_non_syn['Frequency'])
        stat_slope6, stat_intercept6, r_value6, p_value6, std_err6 = stats.linregress(
            data_reg_nonadar_non_syn['passage'],
            data_reg_nonadar_non_syn['Frequency'])
        data_filter_ag_grouped = data_filter_ag_grouped.rename(
            columns={"passage": "Passage"})
        ag_reg_plot = sns.lmplot(x="Passage",
                                 y="Frequency",
                                 data=data_filter_ag_grouped,
                                 hue="5`_ADAR_Preference",
                                 hue_order=adar_preference,
                                 markers=["o", "v", "x"],
                                 fit_reg=True,
                                 col="5`_ADAR_Preference",
                                 col_order=adar_preference,
                                 row="Type",
                                 row_order=type_order_ag,
                                 palette=mutation_palette(3,
                                                          adar=True,
                                                          ag=True),
                                 line_kws={'label': "Linear Reg"},
                                 legend=True,
                                 height=6,
                                 x_estimator=np.mean,
                                 x_jitter=.05)
        ag_reg_plot.fig.subplots_adjust(wspace=.02)
        ax = ag_reg_plot.axes[0, 0]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        label_line_1 = "y={0:.3g}x+{1:.3g}".format(stat_slope1,
                                                   stat_intercept1)
        label_line_2 = "y={0:.3g}x+{1:.3g}".format(stat_slope2,
                                                   stat_intercept2)
        label_line_3 = "y={0:.3g}x+{1:.3g}".format(stat_slope3,
                                                   stat_intercept3)
        label_line_4 = "y={0:.3g}x+{1:.3g}".format(stat_slope4,
                                                   stat_intercept4)
        label_line_5 = "y={0:.3g}x+{1:.3g}".format(stat_slope5,
                                                   stat_intercept5)
        label_line_6 = "y={0:.3g}x+{1:.3g}".format(stat_slope6,
                                                   stat_intercept6)
        L_labels[0].set_text(label_line_1)
        ax = ag_reg_plot.axes[0, 1]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_2)
        ax = ag_reg_plot.axes[0, 2]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_3)
        ax = ag_reg_plot.axes[1, 0]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_4)
        ax = ag_reg_plot.axes[1, 1]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_5)
        ax = ag_reg_plot.axes[1, 2]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_6)

        ag_reg_plot.set(xlim=(0, 13))
        ag_reg_plot.set(ylim=(0.000, 0.003))
        # ag_reg_plot.fig.suptitle("RV #%s" % str(replica), y=0.99)
        # plt.tight_layout()
        ag_reg_plot.savefig(output_dir +
                            "/ag_lmplot_ADAR_Context_replica%s" % replica,
                            dpi=300)
        plt.close()

        data_filter_ag = data_filter_ag[data_filter_ag["Protein"] != "2A"]
        data_filter_ag = data_filter_ag[data_filter_ag["Protein"] != "3'UTR"]
        data_filter_ag = data_filter_ag[data_filter_ag["Type"] == "Synonymous"]

        position_mutation_ag = sns.relplot(
            x="Pos",
            y="Frequency",
            data=data_filter_ag,
            hue="5`_ADAR_Preference",
            col="passage",
            col_wrap=3,
            palette=mutation_palette(3, adar=True, ag=True),
            hue_order=adar_preference,
            height=4,
            style="5`_ADAR_Preference",
            style_order=["High", "Low", "Intermediate"])

        position_mutation_ag.set_axis_labels("", "Variant Frequency")
        position_mutation_ag.axes.flat[0].set_yscale('symlog',
                                                     linthreshy=10**-4)
        position_mutation_ag.axes.flat[0].set_ylim(10**-5, 10**-2)
        plt.savefig(output_dir +
                    "/ag_position_mutation_replica%s.png" % replica,
                    dpi=300)
        plt.close()

        columns = ["Mutation", "Replica", "Type", "Slope", "Intercept"]
        mutation_rate_ag_df = pd.DataFrame(columns=columns)
        mutation_rate_ag_df.loc[0] = [
            "High ADAR-like A>G", replica, "Synonymous", stat_slope1,
            stat_intercept1
        ]
        mutation_rate_ag_df.loc[1] = [
            "Intermediate ADAR-like A>G", replica, "Synonymous", stat_slope2,
            stat_intercept2
        ]
        mutation_rate_ag_df.loc[2] = [
            "Low ADAR-like A>G", replica, "Synonymous", stat_slope3,
            stat_intercept3
        ]
        mutation_rate_ag_df.loc[3] = [
            "High ADAR-like A>G", replica, "Non-Synonymous", stat_slope4,
            stat_intercept4
        ]
        mutation_rate_ag_df.loc[4] = [
            "Intermediate ADAR-like A>G", replica, "Non-Synonymous",
            stat_slope5, stat_intercept5
        ]
        mutation_rate_ag_df.loc[5] = [
            "Low ADAR-like A>G", replica, "Non-Synonymous", stat_slope6,
            stat_intercept6
        ]

        mutation_rate_ag_df.to_csv(output_dir +
                                   "/mutation_rate_ag%s.csv" % replica,
                                   sep=',',
                                   encoding='utf-8')
        mutation_rate_ag_df.to_pickle(output_dir +
                                      "/mutation_rate_ag%s.pkl" % replica)
        """U>C Context"""
        mutation_uc = sns.catplot("passage",
                                  "frac_and_weight",
                                  data=data_filter_uc,
                                  hue="3`_ADAR_Preference",
                                  palette=mutation_palette(3,
                                                           adar=True,
                                                           uc=True),
                                  kind="point",
                                  dodge=True,
                                  estimator=weighted_varaint,
                                  orient="v",
                                  col="Type",
                                  join=False,
                                  hue_order=adar_preference,
                                  col_order=type_order_ag)
        mutation_uc.set(yscale="log")
        mutation_uc.set(ylim=(1 * 10**-5, 1 * 10**-2))
        # mutation_uc.set(xticks=["0", "2", "5", "8", "10", "12"])
        mutation_uc.set_axis_labels("Passage", "Variant Frequency")
        mutation_uc.savefig(
            output_dir + "/uc_ADAR_like_Mutation_col_replica%s.png" % replica,
            dpi=300)
        plt.close()

        data_filter_uc_grouped = data_filter_uc.groupby([
            "label", "Type", "passage", "replica", "ADAR_grade_three",
            "3`_ADAR_Preference"
        ])["frac_and_weight"].agg(lambda x: weighted_varaint(x))
        data_filter_uc_grouped = data_filter_uc_grouped.reset_index()

        # print(data_filter_uc_grouped.to_string())

        data_filter_uc_grouped = data_filter_uc_grouped.rename(
            columns={"frac_and_weight": "Frequency"})
        data_filter_uc_grouped["Frequency"] = data_filter_uc_grouped[
            "Frequency"].astype(float)
        data_filter_uc_grouped = data_filter_uc_grouped[
            data_filter_uc_grouped["label"] != "RNA Control\nPrimer ID"]
        data_filter_uc_grouped = data_filter_uc_grouped[
            data_filter_uc_grouped["label"] != "RNA Control\nRND"]
        data_filter_uc_grouped = data_filter_uc_grouped[
            data_filter_uc_grouped["replica"] == replica]

        data_reg_full_adar_uc = data_filter_uc_grouped[
            data_filter_uc_grouped["ADAR_grade_three"] == 1]
        data_reg_semi_adar_uc = data_filter_uc_grouped[
            data_filter_uc_grouped["ADAR_grade_three"] == 0.5]
        data_reg_nonadar_uc = data_filter_uc_grouped[
            data_filter_uc_grouped["ADAR_grade_three"] == 0]

        data_reg_full_adar_syn_uc = data_reg_full_adar_uc[
            data_reg_full_adar_uc["Type"] == "Synonymous"]
        data_reg_semi_adar_syn_uc = data_reg_semi_adar_uc[
            data_reg_semi_adar_uc["Type"] == "Synonymous"]
        data_reg_nonadar_syn_uc = data_reg_nonadar_uc[
            data_reg_nonadar_uc["Type"] == "Synonymous"]

        data_reg_full_adar_non_syn_uc = data_reg_full_adar_uc[
            data_reg_full_adar_uc["Type"] == "Non-Synonymous"]
        data_reg_semi_adar_non_syn_uc = data_reg_semi_adar_uc[
            data_reg_semi_adar_uc["Type"] == "Non-Synonymous"]
        data_reg_nonadar_non_syn_uc = data_reg_nonadar_uc[
            data_reg_nonadar_uc["Type"] == "Non-Synonymous"]

        stat_slope7, stat_intercept7, r_value7, p_value7, std_err7 = stats.linregress(
            data_reg_full_adar_syn_uc['passage'],
            data_reg_full_adar_syn_uc['Frequency'])
        stat_slope8, stat_intercept8, r_value8, p_value8, std_err8 = stats.linregress(
            data_reg_semi_adar_syn_uc['passage'],
            data_reg_semi_adar_syn_uc['Frequency'])
        stat_slope9, stat_intercept9, r_value9, p_value9, std_err9 = stats.linregress(
            data_reg_nonadar_syn_uc['passage'],
            data_reg_nonadar_syn_uc['Frequency'])
        # data_reg_adar_nonsyn
        stat_slope10, stat_intercept10, r_value10, p_value10, std_err10 = stats.linregress(
            data_reg_full_adar_non_syn_uc['passage'],
            data_reg_full_adar_non_syn_uc['Frequency'])
        stat_slope11, stat_intercept11, r_value11, p_value11, std_err11 = stats.linregress(
            data_reg_semi_adar_non_syn_uc['passage'],
            data_reg_semi_adar_non_syn_uc['Frequency'])
        stat_slope12, stat_intercept12, r_value12, p_value12, std_err12 = stats.linregress(
            data_reg_nonadar_non_syn_uc['passage'],
            data_reg_nonadar_non_syn_uc['Frequency'])
        data_filter_uc_grouped = data_filter_uc_grouped.rename(
            columns={"passage": "Passage"})
        uc_lmplot = sns.lmplot(x="Passage",
                               y="Frequency",
                               data=data_filter_uc_grouped,
                               hue="3`_ADAR_Preference",
                               markers=["o", "v", "x"],
                               hue_order=adar_preference,
                               fit_reg=True,
                               col="3`_ADAR_Preference",
                               col_order=adar_preference,
                               row="Type",
                               row_order=type_order_ag,
                               palette=mutation_palette(3, adar=True, uc=True),
                               line_kws={'label': "Linear Reg"},
                               legend=True,
                               height=6)
        uc_lmplot.fig.subplots_adjust(wspace=.02)
        ax = uc_lmplot.axes[0, 0]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        label_line_7 = "y={0:.3g}x+{1:.3g}".format(stat_slope7,
                                                   stat_intercept7)
        label_line_8 = "y={0:.3g}x+{1:.3g}".format(stat_slope8,
                                                   stat_intercept8)
        label_line_9 = "y={0:.3g}x+{1:.3g}".format(stat_slope9,
                                                   stat_intercept9)
        label_line_10 = "y={0:.3g}x+{1:.3g}".format(stat_slope10,
                                                    stat_intercept10)
        label_line_11 = "y={0:.3g}x+{1:.3g}".format(stat_slope11,
                                                    stat_intercept11)
        label_line_12 = "y={0:.3g}x+{1:.3g}".format(stat_slope12,
                                                    stat_intercept12)
        L_labels[0].set_text(label_line_7)
        ax = uc_lmplot.axes[0, 1]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_8)
        ax = uc_lmplot.axes[0, 2]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_9)
        ax = uc_lmplot.axes[1, 0]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_10)
        ax = uc_lmplot.axes[1, 1]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_11)
        ax = uc_lmplot.axes[1, 2]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        L_labels[0].set_text(label_line_12)

        uc_lmplot.set(xlim=(0, 13))
        uc_lmplot.set(ylim=(0.000, 0.003))
        # uc_lmplot.fig.suptitle("RV #%s" % str(replica), y=0.99)
        # plt.tight_layout()
        uc_lmplot.savefig(output_dir +
                          "/uc_lmplot_ADAR_Context_replica%s" % replica,
                          dpi=300)
        plt.close()

        data_filter_uc = data_filter_uc[data_filter_uc["Protein"] != "2A"]
        data_filter_uc = data_filter_uc[data_filter_uc["Protein"] != "3'UTR"]
        data_filter_uc = data_filter_uc[data_filter_uc["Type"] == "Synonymous"]

        position_mutation_uc = sns.relplot(
            x="Pos",
            y="Frequency",
            data=data_filter_uc,
            hue="3`_ADAR_Preference",
            col="passage",
            col_wrap=3,
            palette=mutation_palette(3, adar=True, uc=True),
            hue_order=adar_preference,
            height=4,
            style="3`_ADAR_Preference",
            style_order=["High", "Low", "Intermediate"])

        position_mutation_uc.set_axis_labels("", "Variant Frequency")
        position_mutation_uc.axes.flat[0].set_yscale('symlog',
                                                     linthreshy=10**-4)
        position_mutation_uc.axes.flat[0].set_ylim(10**-5, 10**-2)
        plt.savefig(output_dir +
                    "/uc_position_mutation_replica%s.png" % replica,
                    dpi=300)
        plt.close()

    mutation_rate_df1 = pd.read_pickle(output_dir + "/mutation_rate1.pkl")
    mutation_rate_df2 = pd.read_pickle(output_dir + "/mutation_rate2.pkl")
    mutation_rate_df3 = pd.read_pickle(output_dir + "/mutation_rate3.pkl")
    mutation_rate_df_all = pd.concat(
        [mutation_rate_df1, mutation_rate_df2, mutation_rate_df3], sort=False)
    mutation_rate_df_all.to_csv(output_dir + "/mutation_rate_all.csv",
                                sep=',',
                                encoding='utf-8')
    mutation_rate_df_all_grouped = mutation_rate_df_all.groupby(
        ["Mutation", "Type"])["Slope", "Intercept"].agg(np.median)
    mutation_rate_df_all_grouped = mutation_rate_df_all_grouped.reset_index()
    mutation_rate_df_all_grouped.to_csv(output_dir +
                                        "/mutation_rate_median.csv",
                                        sep=',',
                                        encoding='utf-8')

    mutation_rate_ag_df1 = pd.read_pickle(output_dir +
                                          "/mutation_rate_ag1.pkl")
    mutation_rate_ag_df2 = pd.read_pickle(output_dir +
                                          "/mutation_rate_ag2.pkl")
    mutation_rate_ag_df3 = pd.read_pickle(output_dir +
                                          "/mutation_rate_ag3.pkl")
    mutation_rate_ag_df_all = pd.concat(
        [mutation_rate_ag_df1, mutation_rate_ag_df2, mutation_rate_ag_df3],
        sort=False)
    mutation_rate_ag_df_all.to_csv(output_dir + "/mutation_rate_ag_all.csv",
                                   sep=',',
                                   encoding='utf-8')
    mutation_rate_ag_df_all.to_pickle(output_dir + "/mutation_rate_ag_all.pkl")

    # mutation_rate_ag_df = pd.read_csv(output_dir + "/mutation_rate_ag_all.csv", sep=',', encoding='utf-8')
    mutation_rate_ag_df_all_grouped = mutation_rate_ag_df_all.groupby(
        ["Mutation", "Type"])["Slope", "Intercept"].agg(np.median)
    mutation_rate_ag_df_all_grouped = mutation_rate_ag_df_all_grouped.reset_index(
    )
    mutation_rate_ag_df_all_grouped.to_csv(output_dir +
                                           "/mutation_rate_ag_median.csv",
                                           sep=',',
                                           encoding='utf-8')
예제 #3
0
def main():
    """Plot variant frequencies for the patient samples and save the figures.

    Loads the pickled tables ``data_filter.pkl``, ``data_filter_ag.pkl`` and
    ``data_filter_uc.pkl`` from ``<input_dir>/<prefix>/``, creates an output
    directory named ``<date>_<prefix>`` under ``input_dir`` and writes into it:

    * ``Transition_Mutations_point_plot`` - point plot of the four transitions
      per sample label,
    * ``Transition_Mutations_box_stat_plot_patients`` - box plot of the same
      data annotated with Welch t-tests (Bonferroni corrected); the verbose
      test output is captured in ``sts.csv``,
    * ``Context_point_plot``, ``ag_ADAR_like_Mutation_col_patients.png`` and
      ``Context_point_all_mutations_type_plot`` - A>G frequencies split by
      the ``ADAR_like`` / ``5`_ADAR_Preference`` columns,
    * ``UC_Context_point_plot`` - U>C frequencies split by the ``Next`` column.

    Finally the A>G table is aggregated per ``ADAR_like``/``label``/``Type``
    with ``weighted_varaint`` and printed.

    :return: None
    """
    input_dir = "/Users/odedkushnir/PhD_Projects/After_review/AccuNGS/RV/patients/"
    prefix = "inosine_predict_context_freq0.01"
    date = datetime.today().strftime("%Y%m%d")
    output_dir = input_dir + "{0}_{1}".format(date, prefix)
    try:
        os.mkdir(output_dir)
    except OSError:
        # Also reached when the directory already exists; the run continues.
        print("Creation of the directory %s failed" % output_dir)
    else:
        print("Successfully created the directory %s " % output_dir)

    data_filter = pd.read_pickle(input_dir + prefix + "/data_filter.pkl")
    data_filter_ag = pd.read_pickle(input_dir + prefix + "/data_filter_ag.pkl")
    data_filter_uc = pd.read_pickle(input_dir + prefix + "/data_filter_uc.pkl")
    # Shorten the RNA control label so it matches the entry in label_order.
    data_filter["label"] = np.where(
        data_filter["label"] == "RNA Control\nPrimer ID", "RNA\nControl",
        data_filter["label"])

    # Plot orderings
    label_order = [
        "RNA\nControl", "p3 Cell Culture\nControl", "Patient-1", "Patient-4",
        "Patient-5", "Patient-9", "Patient-16", "Patient-17", "Patient-20"
    ]
    transition_order = ["A>G", "U>C", "G>A", "C>U"]
    type_order2 = ["Synonymous", "Non-Synonymous"]
    context_order_uc = ["UpA", "UpU", "UpG", "UpC"]
    type_order_ag = ["Synonymous", "Non-Synonymous", "NonCodingRegion"]
    adar_preference = ["High", "Intermediate", "Low"]
    plus_minus = u"\u00B1"
    ci_axis_label = "Variant Frequency {} CI=95%".format(plus_minus)

    # Within every sample label, compare A>G against each other transition.
    # Ordered by compared mutation first, then by label.
    pairs = [((label, "A>G"), (label, other))
             for other in ("G>A", "U>C", "C>U")
             for label in label_order]

    # Transition frequencies per sample (point plot)
    g2 = sns.catplot(x="label",
                     y="frac_and_weight",
                     data=data_filter,
                     hue="Mutation",
                     order=label_order,
                     palette=mutation_palette(4),
                     kind="point",
                     dodge=0.5,
                     hue_order=transition_order,
                     join=False,
                     estimator=weighted_varaint,
                     orient="v",
                     legend=True)
    g2.set_axis_labels("", ci_axis_label)
    g2.set(yscale='log')
    g2.set(ylim=(10**-5, 10**-3))
    g2.set_xticklabels(fontsize=10, rotation=90)
    g2.savefig(output_dir + "/Transition_Mutations_point_plot", dpi=300)
    plt.close()

    # Transition frequencies per sample (box plot with statistical annotation)
    data_filter["label"] = data_filter["label"].astype(str)
    data_filter["Frequency"] = data_filter["Frequency"].astype(float)
    passage_g = sns.boxplot(x="label",
                            y="Frequency",
                            data=data_filter,
                            hue="Mutation",
                            order=label_order,
                            palette=mutation_palette(4),
                            dodge=True,
                            hue_order=transition_order)
    passage_g.set_yscale('log')
    passage_g.set_ylim(10**-6, 10**-1)
    passage_g.set(xlabel="", ylabel="Variant Frequency")
    passage_g.set_xticklabels(labels=label_order, fontsize=10, rotation=90)

    annot = Annotator(passage_g,
                      pairs,
                      x="label",
                      y="Frequency",
                      hue="Mutation",
                      data=data_filter,
                      order=label_order,
                      hue_order=transition_order)
    annot.configure(test='t-test_welch',
                    text_format='star',
                    loc='outside',
                    verbose=2,
                    comparisons_correction="Bonferroni")
    annot.apply_test()
    # annotate() prints its report to stdout; capture that text in a file.
    file_path = output_dir + "/sts.csv"
    with open(file_path, "w") as o, contextlib.redirect_stdout(o):
        passage_g, test_results = annot.annotate()
    plt.legend(bbox_to_anchor=(1.05, 0.5), loc=2, borderaxespad=0.)
    plt.tight_layout()
    plt.savefig(output_dir + "/Transition_Mutations_box_stat_plot_patients",
                dpi=300)
    plt.close()

    # A>G Prev Context
    # x/y are passed by keyword (as for g2 above); newer seaborn releases no
    # longer accept them positionally.
    g5 = sns.catplot(x="label",
                     y="frac_and_weight",
                     data=data_filter_ag,
                     hue="ADAR_like",
                     order=label_order,
                     palette=mutation_palette(2),
                     kind="point",
                     dodge=True,
                     hue_order=[True, False],
                     estimator=weighted_varaint,
                     orient="v",
                     col="Type",
                     join=False,
                     col_order=type_order2)
    g5.set_axis_labels("", ci_axis_label)
    g5.set(yscale='log')
    g5.set(ylim=(7 * 10**-7, 4 * 10**-3))
    g5.set_xticklabels(rotation=90)
    g5.savefig(output_dir + "/Context_point_plot", dpi=300)
    plt.close()

    mutation_ag = sns.catplot(x="label",
                              y="frac_and_weight",
                              data=data_filter_ag,
                              hue="5`_ADAR_Preference",
                              palette=mutation_palette(3, adar=True, ag=True),
                              kind="point",
                              dodge=True,
                              estimator=weighted_varaint,
                              order=label_order,
                              orient="v",
                              col="Type",
                              join=False,
                              col_order=type_order_ag,
                              hue_order=adar_preference)
    mutation_ag.set(yscale="log")
    mutation_ag.set(ylim=(1 * 10**-5, 1 * 10**-2))
    mutation_ag.set_xticklabels(rotation=90)
    mutation_ag.fig.suptitle("A>G ADAR_like Mutation in RV patients", y=0.99)
    plt.subplots_adjust(top=0.85)
    mutation_ag.set_axis_labels("", ci_axis_label)
    mutation_ag.savefig(output_dir + "/ag_ADAR_like_Mutation_col_patients.png",
                        dpi=300)
    plt.close()

    # Same as g5 but without splitting the facets by mutation type.
    g6 = sns.catplot(x="label",
                     y="frac_and_weight",
                     data=data_filter_ag,
                     hue="ADAR_like",
                     order=label_order,
                     palette=mutation_palette(2),
                     kind="point",
                     dodge=True,
                     hue_order=[True, False],
                     estimator=weighted_varaint,
                     orient="v",
                     join=False)
    g6.set_axis_labels("", ci_axis_label)
    g6.set(yscale='log')
    g6.set(ylim=(7 * 10**-7, 4 * 10**-3))
    g6.set_xticklabels(rotation=90)
    g6.savefig(output_dir + "/Context_point_all_mutations_type_plot", dpi=300)
    plt.close()

    # U>C Next Context
    g9 = sns.catplot(x="label",
                     y="frac_and_weight",
                     data=data_filter_uc,
                     hue="Next",
                     order=label_order,
                     palette="tab20",
                     hue_order=context_order_uc,
                     estimator=weighted_varaint,
                     orient="v",
                     dodge=True,
                     kind="point",
                     col="Type",
                     join=False,
                     col_order=type_order2)
    g9.set_axis_labels("", ci_axis_label)
    g9.set(yscale='log')
    g9.set(ylim=(10**-5, 10**-2))
    g9.set_xticklabels(rotation=90)
    g9.savefig(output_dir + "/UC_Context_point_plot", dpi=300)
    plt.close()

    # Aggregate the A>G table to one frequency per (ADAR_like, label, Type).
    data_filter_ag_grouped = data_filter_ag.groupby(
        ["ADAR_like", "label",
         "Type"])["frac_and_weight"].agg(lambda x: weighted_varaint(x))
    data_filter_ag_grouped = data_filter_ag_grouped.reset_index()
    data_filter_ag_grouped = data_filter_ag_grouped.rename(
        columns={"frac_and_weight": "Frequency"})
    data_filter_ag_grouped["Frequency"] = data_filter_ag_grouped[
        "Frequency"].astype(float)
    print(data_filter_ag_grouped.to_string())

    data_filter_ag_grouped_silent = data_filter_ag_grouped[
        data_filter_ag_grouped["Type"] == "Synonymous"]
    # NOTE(review): the label literal below is not one of the entries of
    # label_order (no "p3 " prefix) and appears to contain a stray non-ASCII
    # mark after "Culture"; the selection may therefore be empty - confirm the
    # intended label against the data. The result is not used further here.
    data_filter_ag_grouped_silent = data_filter_ag_grouped_silent[
        data_filter_ag_grouped_silent["label"] == "Cell Cultureֿ\nControl"]
예제 #4
0
# One entry per regression: (mutation, "Type" value selected from the input
# data, "Type" label written to the output table), in the row order of the
# saved mutation-rate table.
_LINEAR_REG_CELLS = (
    ("A>G", "Synonymous", "Synonymous"),
    ("U>C", "Synonymous", "Synonymous"),
    ("G>A", "Synonymous", "Synonymous"),
    ("C>U", "Synonymous", "Synonymous"),
    ("A>G", "Non-Synonymous", "Non-Synonymous"),
    ("U>C", "Non-Synonymous", "Non-Synonymous"),
    ("G>A", "Non-Synonymous", "Non-Synonymous"),
    ("C>U", "Non-Synonymous", "Non-Synonymous"),
    # The output label is spelled differently from the input value; it is
    # kept unchanged so tables written by earlier runs stay comparable.
    ("G>A", "Premature Stop Codon", "Pre Mature Stop Codon"),
    ("C>U", "Premature Stop Codon", "Pre Mature Stop Codon"),
)


def linear_reg(data_filter, output_dir, transition_order, type_order, virus, replica, cu=True, ag=True, uc=True,
               ga=True, output_file="/mutation_rate"):
    """
    Fit frequency-vs-passage linear regressions per transition and mutation type for one replica.

    Rows labelled "RNA Control\\nPrimer ID" or "RNA Control\\nRND" and rows of other replicas are
    dropped. For every enabled transition a ``scipy.stats.linregress`` fit of ``Frequency`` on
    ``passage`` is computed per mutation type; disabled transitions get slope, intercept and
    p-value 0. A seaborn ``lmplot`` grid (rows = ``type_order``, columns = ``transition_order``) is
    saved as ``<output_dir><output_file>_lmplot_<replica>.png`` with the fitted equation as legend
    text of each enabled facet, and the fit table is written to ``<output_dir><output_file>.csv``
    and ``<output_dir><output_file><replica>.pkl``.

    :param data_filter: DataFrame with the columns label, replica, Mutation, Type, passage, Frequency
    :param output_dir: directory prefix for all written files
    :param transition_order: column order of the plot grid (values of the Mutation column)
    :param type_order: row order of the plot grid (values of the Type column)
    :param virus: stored in the "Virus" column; "RVB14" additionally fixes the axis limits
    :param replica: replica to analyse; also used in the output file names
    :param cu: fit and annotate C>U when truthy
    :param ag: fit and annotate A>G when truthy
    :param uc: fit and annotate U>C when truthy
    :param ga: fit and annotate G>A when truthy
    :param output_file: file-name stem appended to output_dir
    :return: the filtered DataFrame with the column "passage" renamed to "Passage"
    """
    data_filter = data_filter[data_filter["label"] != "RNA Control\nPrimer ID"]
    data_filter = data_filter[data_filter["label"] != "RNA Control\nRND"]
    data_filter = data_filter[data_filter["replica"] == replica]

    # Any falsy flag (None or False) disables a transition; previously only
    # None was handled and False ended in a NameError.
    enabled = {"A>G": ag, "U>C": uc, "G>A": ga, "C>U": cu}

    # (mutation, data type) -> (slope, intercept, p-value)
    fits = {}
    for mutation, data_type, _ in _LINEAR_REG_CELLS:
        if enabled[mutation]:
            subset = data_filter[(data_filter["Mutation"] == mutation) & (data_filter["Type"] == data_type)]
            slope, intercept, _r_value, p_value, _std_err = stats.linregress(subset['passage'],
                                                                             subset['Frequency'])
        else:
            slope, intercept, p_value = 0, 0, 0
        fits[(mutation, data_type)] = (slope, intercept, p_value)

    data_filter = data_filter.rename(columns={"passage": "Passage"})
    reg_plot = sns.lmplot(x="Passage", y="Frequency", data=data_filter, hue="Mutation",
                          hue_order=transition_order, fit_reg=True, col="Mutation",
                          col_order=transition_order, row="Type", row_order=type_order, palette=mutation_palette(4),
                          line_kws={'label': "Linear Reg"}, legend=True, height=6)  # markers=["o", "v", "x"]
    reg_plot.fig.subplots_adjust(wspace=.02)

    # The grid is laid out by row_order/col_order, so the facet of a fit is
    # found through those lists rather than through fixed indices.
    rows = list(type_order)
    cols = list(transition_order)
    for mutation, data_type, _ in _LINEAR_REG_CELLS:
        if not enabled[mutation] or mutation not in cols or data_type not in rows:
            continue
        slope, intercept, p_value = fits[(mutation, data_type)]
        ax = reg_plot.axes[rows.index(data_type), cols.index(mutation)]
        legend = ax.legend(loc="upper left")
        # Replace the generic "Linear Reg" entry with the fitted equation.
        legend.get_texts()[0].set_text(
            "y={0:.3g}x+{1:.3g} pval={2:.3g}".format(slope, intercept, p_value))

    if virus == "RVB14":
        reg_plot.set(xlim=(0, 13))
        reg_plot.set(ylim=(0.000, 0.01))
    plt.tight_layout()
    reg_plot.savefig(output_dir + output_file + "_lmplot_%s.png" % replica, dpi=300)
    plt.close()

    columns = ["Mutation", "Type", "Slope", "Intercept", "p-val"]
    mutation_rate_df = pd.DataFrame(columns=columns)
    for row_number, (mutation, data_type, output_type) in enumerate(_LINEAR_REG_CELLS):
        # Every row carries the p-value of its own fit (the C>U rows used to
        # store the p-values of the G>A fits).
        slope, intercept, p_value = fits[(mutation, data_type)]
        mutation_rate_df.loc[row_number] = [mutation, output_type, slope, intercept, p_value]
    mutation_rate_df["Virus"] = virus
    mutation_rate_df["Replica"] = replica
    # NOTE: the CSV name carries no replica, so successive replicas overwrite
    # it; only the pickle below is kept per replica.
    mutation_rate_df.to_csv(output_dir + output_file + ".csv", sep=',', encoding='utf-8')
    mutation_rate_df.to_pickle(output_dir + output_file + str(replica) + ".pkl")

    return data_filter
예제 #5
0
def _adar_col_wrap(frame, virus):
    """Return the ``col_wrap`` value for the per-organism ADAR facet plots.

    For ``virus == "EnteroA"`` the grid gets half as many columns as there are
    distinct organisms in *frame*; for any other virus it gets one column per
    organism. The result is never below 1.
    """
    organism_count = frame["Organism"].nunique()
    if virus == "EnteroA":
        return max(1, organism_count // 2)
    return max(1, organism_count)


def _annotate_adar_facets(grid, frame, title_template):
    """Retitle every organism facet of *grid* and add a Mann-Whitney annotation.

    :param grid: seaborn grid returned by ``sns.catplot(..., col="Organism")``
    :param frame: the data frame that was plotted; needs the ``Organism``,
                  ``ADAR_like`` and ``Frequency`` columns
    :param title_template: ``%``-template receiving the organism name
    """
    organisms = set(frame["Organism"].unique())
    for ax in grid.axes.flat:
        facet_title = ax.get_title()
        print(facet_title)
        # Facet titles are split on " = " and the last part is compared
        # against the organism names.
        organism = facet_title.split(" = ")[-1]
        if organism not in organisms:
            continue
        ax.set_title(title_template % organism, pad=20)
        old_statannot.add_stat_annotation(
            ax, data=frame[frame["Organism"] == organism], x="ADAR_like", y="Frequency",
            boxPairList=[("ADAR-like", "Non\nADAR-like")], test='Mann-Whitney', textFormat='star',
            loc='inside', verbose=0, lineOffsetToBox=2, lineHeight=0, stack=False,
            useFixedOffset=True)


def plots_for_srr(input_dir, output_dir, virus_path, virus):
    """Draw the variant-frequency and ADAR-context plots for one virus data set.

    Reads ``data_mutation.csv``, ``data_XpA_by_mutation.csv`` and
    ``data_UpX_by_mutation.csv`` from ``input_dir + virus_path`` and writes
    six PNG figures into *output_dir*:

    * box plots of all mutations and of the transitions only,
    * a boxen plot and a point plot of silent variants split by 5' context,
    * a box plot of silent variants split by 3' context.

    :param input_dir: directory prefix of the input files (concatenated as-is)
    :param output_dir: directory prefix of the output figures (concatenated as-is)
    :param virus_path: sub-path appended to *input_dir* (concatenated as-is)
    :param virus: virus name used in titles and file names; ``"EnteroA"``
                  additionally drops "Coxsackievirus A16" and narrows the grids
    :return: None
    """
    mutation_order = ["A>G", "U>C", "C>U", "G>A", "A>U", "A>C", "U>A", "U>G", "C>A", "C>G", "G>U", "G>C"]
    transition_order = ["A>G", "U>C", "C>U", "G>A"]
    type_order = ["Synonymous", "Non-Synonymous", "Premature Stop Codon"]
    adar_order = ("ADAR-like", "Non\nADAR-like")
    prev_labels = {"AA": "ApA", "UA": "UpA", "CA": "CpA", "GA": "GpA"}
    next_labels = {"UA": "UpA", "UU": "UpU", "UC": "UpC", "UG": "UpG"}
    type_labels = {"Synonymous": "Silent", "Non-Synonymous": "Missense"}

    data_mutations = pd.read_csv(input_dir + virus_path + "data_mutation.csv")
    data_mutations = data_mutations[data_mutations["Prob"] > 0]

    sns.set_style("ticks")

    # All mutations, one column per mutation type
    with sns.plotting_context(rc={"legend.fontsize": 6}):
        all_plot = sns.catplot(x="Mutation", y="Frequency", hue="label", data=data_mutations, palette="tab20",
                               order=mutation_order, flierprops={"marker": "."},
                               kind="box", col="Type", col_order=type_order)
    all_plot.set(yscale='log')
    all_plot._legend.set_title('')
    all_plot.set(xlabel="")
    all_plot.fig.suptitle("All Mutations variant frequencies in %s" % virus, y=0.99)
    plt.savefig(output_dir + "All_Mutations_variant_frequencies_in_%s.png" % virus, dpi=300)
    plt.close()

    # Transition mutations only
    with sns.plotting_context(rc={"legend.fontsize": 6}):
        transition_plot = sns.catplot(x="Mutation", y="Frequency", hue="label", data=data_mutations,
                                      palette="tab20", order=transition_order, flierprops={"marker": "."},
                                      kind="box")
    transition_plot.set(yscale='log')
    transition_plot._legend.set_title('')
    transition_plot.set(xlabel="")
    transition_plot.fig.suptitle("Transitions variant frequency in %s" % virus, y=0.99)
    plt.savefig(output_dir + "Transitions_variant_frequency_in_%s.png" % virus, dpi=300)
    plt.close()

    # ---- Context: 5' neighbour ----
    data_context = pd.read_csv(input_dir + virus_path + "data_XpA_by_mutation.csv")
    data_context["no_variants"] = data_context["Freq"] * data_context["Read_count"]
    # (variant count, read count) pairs are what the weighted estimator consumes.
    data_context["freq_and_weight"] = list(zip(data_context.no_variants, data_context.Read_count))
    if virus == "EnteroA":
        data_context = data_context.loc[data_context.Organism != "Coxsackievirus A16"]
    data_context = data_context.rename(columns={"Freq": "Frequency"})
    # Assign the result instead of calling replace(inplace=True) on df[col]:
    # the chained in-place form does not update the frame under pandas
    # Copy-on-Write.
    data_context["Prev"] = data_context["Prev"].replace(prev_labels)
    data_context["Type"] = data_context["Type"].replace(type_labels)
    data_context = data_context[data_context["Prob"] > 0]

    # .copy() so the new columns are written to an independent frame.
    data_adar = data_context.loc[data_context.Type == "Silent"].copy()
    is_adar_like = data_adar["Prev"].str.contains("UpA|ApA", na=False)
    data_adar["ADAR_like"] = np.where(is_adar_like, "ADAR-like", "Non\nADAR-like")
    data_adar["log10_Frequency"] = np.log10(data_adar["Frequency"].astype(float))

    # ---- Context: 3' neighbour ----
    data_context_3 = pd.read_csv(input_dir + virus_path + "data_UpX_by_mutation.csv")
    if virus == "EnteroA":
        # The exclusion has to hit the 3' frame; the original filtered the
        # 5' frame a second time and left this one untouched.
        data_context_3 = data_context_3.loc[data_context_3.Organism != "Coxsackievirus A16"]
    data_context_3 = data_context_3.rename(columns={"Freq": "Frequency"})
    data_context_3["Next"] = data_context_3["Next"].replace(next_labels)
    data_context_3["Type"] = data_context_3["Type"].replace(type_labels)
    data_context_3 = data_context_3[data_context_3["Prob"] > 0]

    data_adar_3 = data_context_3.loc[data_context_3.Type == "Silent"].copy()
    is_adar_like_3 = data_adar_3["Next"].str.contains("UpA|UpU", na=False)
    data_adar_3["ADAR_like"] = np.where(is_adar_like_3, "ADAR-like", "Non\nADAR-like")

    # ---- 5' neighbour preferences: boxen plot with statistics ----
    col_wrap_5 = _adar_col_wrap(data_adar, virus)
    context_stat_plot = sns.catplot(x="ADAR_like", y="Frequency", data=data_adar, palette=mutation_palette(2),
                                    col="Organism", kind="boxen", col_wrap=col_wrap_5, order=adar_order)
    context_stat_plot.set(yscale='log')
    context_stat_plot.set(ylim=(10 ** -4, 10 ** -2))
    context_stat_plot.set_axis_labels("", "Variant Frequency")
    _annotate_adar_facets(context_stat_plot, data_adar, "%s")
    plt.tight_layout()
    plt.savefig(output_dir + "Boxenplot_Stat_synonymous_variant_frequency_in_%s.png" % virus, dpi=300)
    plt.close()

    # ---- 5' neighbour preferences: weighted point plot ----
    context_stat_plot = sns.catplot(x="ADAR_like", y="freq_and_weight", data=data_adar,
                                    palette=mutation_palette(2), col="Organism", kind="point",
                                    col_wrap=col_wrap_5, order=adar_order, join=False,
                                    estimator=weighted_varaint, orient="v", legend=True)
    context_stat_plot.set(yscale='log')
    context_stat_plot.set(ylim=(10 ** -4, 10 ** -2))
    context_stat_plot.set_axis_labels("", "Variant Frequency")
    plt.tight_layout()
    plt.savefig(output_dir + "Pointplot_Stat_synonymous_variant_frequency_in_%s.png" % virus, dpi=300)
    plt.close()

    # ---- 3' neighbour preferences: box plot with statistics ----
    context_stat_plot = sns.catplot(x="ADAR_like", y="Frequency", data=data_adar_3, palette=mutation_palette(2),
                                    col="Organism", flierprops={"marker": "."}, kind="box",
                                    col_wrap=_adar_col_wrap(data_adar_3, virus), order=adar_order)
    context_stat_plot.set(yscale='log')
    # Only the lower bound is pinned. The original passed 0 as the upper
    # bound, which is invalid on a log axis and is ignored by matplotlib
    # with a warning; None leaves the upper bound unchanged without it.
    context_stat_plot.set(ylim=(10 ** -5, None))
    context_stat_plot.set_axis_labels("", "Variant Frequency")
    _annotate_adar_facets(context_stat_plot, data_adar_3, "3' ADAR context in %s")
    plt.tight_layout()
    plt.savefig(output_dir + "Stat_synonymous_variant_frequency_in_%s_3.png" % virus, dpi=300)
    plt.close()
def _select_replica1(frame):
    """Return the replica-1 rows of *frame* with plot-ready ``RNA`` labels.

    Keeps the rows whose ``label`` is one of the four replica-1 samples and
    rewrites three ``RNA`` values to the names used on the plot axes. The
    result is an independent copy, so *frame* itself is left untouched.
    """
    replica1_labels = [
        "Capsid-31-Amicon", "Free-31-Amicon", "RNA Control\nPrimer ID",
        "p8 Mixed Population"
    ]
    rna_relabel = {
        "Capsid": "p9 Capsid #1",
        "Free": "p9 Free #1",
        "p8 Mixed Population": "p8 Mixed\nPopulation",
    }
    # .copy() so the column assignment below does not write into a slice
    # of the caller's frame (SettingWithCopy).
    subset = frame[frame["label"].isin(replica1_labels)].copy()
    for old_name, new_name in rna_relabel.items():
        subset["RNA"] = np.where(subset["RNA"] == old_name, new_name,
                                 subset["RNA"])
    return subset


def main():
    """Plot the capsid / free RNA variant frequencies of the RV capsid run.

    Loads ``data_filter.pkl`` and ``data_filter_ag.pkl`` from
    ``input_dir + prefix``, writes the replica-1 subsets back as CSV files
    next to them and saves the figures into a dated output directory
    (``<input_dir><YYYYMMDD>_<prefix>``).
    """
    # input_dir = "/Users/odedkushnir/Projects/fitness/AccuNGS/190627_RV_CV/RVB14/"
    # input_dir = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/capsid/"
    input_dir = "/Users/odedkushnir/PhD_Projects/After_review/AccuNGS/RV/capsid/"
    prefix = "inosine_predict_context"
    date = datetime.today().strftime("%Y%m%d")
    output_dir = input_dir + "{0}_{1}".format(date, prefix)
    try:
        os.mkdir(output_dir)
    except OSError:
        print("Creation of the directory %s failed" % output_dir)
    else:
        print("Successfully created the directory %s " % output_dir)

    data_filter = pd.read_pickle(input_dir + prefix + "/data_filter.pkl")
    data_filter_ag = pd.read_pickle(input_dir + prefix + "/data_filter_ag.pkl")

    data_filter_replica1 = _select_replica1(data_filter)
    data_filter_replica1.to_csv(input_dir + prefix + "/data_filter_rep1.csv",
                                sep=",",
                                encoding='utf-8')

    data_filter_ag_replica1 = _select_replica1(data_filter_ag)
    data_filter_ag_replica1.to_csv(input_dir + prefix +
                                   "/data_filter_ag_rep1.csv",
                                   sep=",",
                                   encoding='utf-8')

    # Plot orders
    label_order = [
        "RNA Control\nPrimer ID", "p8 Mixed Population", "Capsid-31-Amicon",
        "Capsid-32-Ultra", "Capsid-33-Ultra", "Free-31-Amicon",
        "Free-32-Ultra", "Free-33-Ultra"
    ]
    rna_order_replica1 = [
        "RNA Control\nPrimer ID", "p8 Mixed\nPopulation", "p9 Capsid #1",
        "p9 Free #1"
    ]
    transition_order = ["A>G", "U>C", "G>A", "C>U"]
    type_order_ag = ["Synonymous", "Non-Synonymous"]
    adar_preference = ["High", "Intermediate", "Low"]
    plus_minus = u"\u00B1"
    frequency_label = "Variant Frequency {} CI=95%".format(plus_minus)

    # Transitions per sample label
    g2 = sns.catplot(x="label",
                     y="frac_and_weight",
                     data=data_filter,
                     hue="Mutation",
                     order=label_order,
                     palette=mutation_palette(4),
                     kind="point",
                     dodge=True,
                     hue_order=transition_order,
                     join=False,
                     estimator=weighted_varaint,
                     orient="v",
                     legend=True)
    g2.set_axis_labels("", frequency_label)
    g2.set(yscale='log')
    g2.set(ylim=(10**-6, 10**-2))
    g2.set_xticklabels(fontsize=10, rotation=90)
    g2.savefig(output_dir + "/Transition_Mutations_label_point_plot", dpi=300)
    plt.close()

    # Transitions per RNA source, replica 1 only
    g_rna = sns.catplot(x="RNA",
                        y="frac_and_weight",
                        data=data_filter_replica1,
                        hue="Mutation",
                        order=rna_order_replica1,
                        palette=mutation_palette(4),
                        kind="point",
                        dodge=True,
                        hue_order=transition_order,
                        join=False,
                        estimator=weighted_varaint,
                        orient="v",
                        legend=True)
    g_rna.set_axis_labels("", frequency_label)
    g_rna.set(yscale='log')
    g_rna.set(ylim=(10**-6, 10**-2))
    g_rna.savefig(output_dir + "/Transition_Mutations_RNA_point_plot", dpi=300)
    plt.close()

    # A>G by 5' ADAR preference, per sample label.
    # x / y are passed by keyword like the calls above; newer seaborn
    # releases no longer accept them positionally.
    g4 = sns.catplot(x="label",
                     y="frac_and_weight",
                     data=data_filter_ag,
                     hue="5`_ADAR_Preference",
                     order=label_order,
                     palette=mutation_palette(3, adar=True, ag=True),
                     kind="point",
                     dodge=True,
                     hue_order=adar_preference,
                     estimator=weighted_varaint,
                     orient="v",
                     col="Type",
                     join=False,
                     col_order=type_order_ag)
    g4.set_axis_labels("", frequency_label)
    g4.set(yscale='log')
    g4.set(ylim=(7 * 10**-7, 4 * 10**-3))
    g4.set_xticklabels(rotation=90)
    g4.savefig(output_dir + "/Context_label_point_plot", dpi=300)
    plt.close()

    # A>G by 5' ADAR preference, per RNA source (replica 1): point plot
    mutation_g8 = sns.catplot(x="RNA",
                              y="frac_and_weight",
                              data=data_filter_ag_replica1,
                              hue="5`_ADAR_Preference",
                              palette=mutation_palette(3, adar=True, ag=True),
                              kind="point",
                              dodge=True,
                              estimator=weighted_varaint,
                              order=rna_order_replica1,
                              orient="v",
                              col="Type",
                              join=False,
                              col_order=type_order_ag,
                              hue_order=adar_preference)
    mutation_g8.set(yscale="log")
    mutation_g8.set_axis_labels("", frequency_label)
    mutation_g8.set(ylim=(1 * 10**-5, 1 * 10**-2))
    mutation_g8.savefig(output_dir + "/ag_ADAR_like_Mutation_col.png", dpi=300)
    plt.close()

    # Same split as a box plot of the raw frequencies
    mutation_g8_box = sns.catplot(x="RNA",
                                  y="Frequency",
                                  data=data_filter_ag_replica1,
                                  hue="5`_ADAR_Preference",
                                  palette=mutation_palette(3,
                                                           adar=True,
                                                           ag=True),
                                  order=rna_order_replica1,
                                  col="Type",
                                  col_order=type_order_ag,
                                  hue_order=adar_preference,
                                  kind="box")
    mutation_g8_box.set(yscale="log")
    mutation_g8_box.savefig(output_dir + "/ag_ADAR_like_Mutation_col_box.png",
                            dpi=300)
    # Close the figure like every other plot here; the original left it open.
    plt.close()

    # Collapse to one weighted frequency per (preference, label, type, RNA, position)
    data_filter_ag_grouped = data_filter_ag.groupby(
        ["5`_ADAR_Preference", "label", "Type", "RNA",
         "Pos"])["frac_and_weight"].agg(lambda x: weighted_varaint(x))
    data_filter_ag_grouped = data_filter_ag_grouped.reset_index()
    data_filter_ag_grouped = data_filter_ag_grouped.rename(
        columns={"frac_and_weight": "Frequency"})
    data_filter_ag_grouped["Frequency"] = data_filter_ag_grouped[
        "Frequency"].astype(float)
    print(data_filter_ag_grouped.to_string())

    data_filter_ag_grouped_silent = data_filter_ag_grouped[
        data_filter_ag_grouped["Type"] == "Synonymous"]

    # Synonymous A>G frequency along the genome, one panel per RNA source
    position_mutation = sns.relplot(
        x="Pos",
        y="Frequency",
        data=data_filter_ag_grouped_silent,
        hue="5`_ADAR_Preference",
        col="RNA",
        col_wrap=2,
        style="5`_ADAR_Preference",
        palette=mutation_palette(3, adar=True, ag=True),
        hue_order=adar_preference,
        style_order=["High", "Low", "Intermediate"],
        height=4)

    position_mutation.set_axis_labels("", "Variant Frequency")
    # NOTE(review): `linthreshy` was renamed to `linthresh` in newer
    # matplotlib releases; switch the keyword once the installed version
    # is confirmed.
    position_mutation.axes.flat[0].set_yscale('symlog', linthreshy=10**-4)
    position_mutation.axes.flat[0].set_ylim(10**-4, 10**-1)
    plt.savefig(output_dir + "/position_mutation.png", dpi=300)
    plt.close()
예제 #7
0
def main():
    # input_dir = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/passages/"
    """Local"""
    input_dir = "/Users/odedkushnir/PhD_Projects/After_review/AccuNGS/RV/passages/"
    prefix = "inosine_predict_context"
    date = datetime.today().strftime("%Y%m%d")
    output_dir = input_dir + "{0}_{1}".format(date, prefix)
    try:
        os.mkdir(output_dir)
    except OSError:
        print("Creation of the directory %s failed" % output_dir)
    else:
        print("Successfully created the directory %s " % output_dir)

    data_filter = pd.read_pickle(input_dir + prefix + "/data_filter.pkl")
    data_filter_ag = pd.read_pickle(input_dir + prefix + "/data_filter_ag.pkl")
    data_filter_uc = pd.read_pickle(input_dir + prefix + "/data_filter_uc.pkl")
    data_filter["passage"] = data_filter["passage"].astype(int)
    data_filter["no_variants"] = np.where(data_filter["Prob"] < 0.95, 0,
                                          data_filter["no_variants"])
    data_filter["Read_count"] = data_filter[data_filter["Read_count"] > 10000]

    #Plots
    label_order = [
        "RNA Control\nRND", "RNA Control\nPrimer ID", "p2-1", "p2-2", "p2-3",
        "p5-1", "p5-2", "p5-3", "p8-1", "p8-2", "p8-3", "p10-2", "p10-3",
        "p12-1", "p12-2", "p12-3"
    ]
    mutation_order = [
        "A>G", "U>C", "G>A", "C>U", "A>C", "U>G", "A>U", "U>A", "G>C", "C>G",
        "C>A", "G>U"
    ]
    transition_order = ["A>G", "U>C", "G>A", "C>U"]
    type_order = ["Synonymous", "Non-Synonymous", "Premature Stop Codon"]
    type_order_ag = ["Synonymous", "Non-Synonymous"]
    context_order = ["UpA", "ApA", "CpA", "GpA"]
    context_order_uc = ["UpU", "UpA", "UpC", "UpG"]
    adar_preference = ["High", "Intermediate", "Low"]
    plus_minus = u"\u00B1"

    # g1 = sns.catplot(x="label", y="frac_and_weight", data=data_filter, hue="Mutation", order=label_order,
    #                  palette="Set2",
    #                  kind="point", dodge=False, hue_order=mutation_order, join=True, estimator=weighted_varaint,
    #                  orient="v")
    # g1.set_axis_labels("", "Variant Frequency")
    # g1.set_xticklabels(fontsize=9, rotation=45)
    # g1.set(yscale='log')
    # g1.set(ylim=(10 ** -7, 10 ** -3))
    #
    # # plt.show()
    # g1.savefig(output_dir + "/All_Mutations_point_plot", dpi=300)
    # plt.close()
    #
    # g2 = sns.catplot(x="label", y="frac_and_weight", data=data_filter, hue="Mutation", order=label_order,
    #                  palette=mutation_palette(4), kind="point", dodge=True, hue_order=transition_order, join=False,
    #                  estimator=weighted_varaint,
    #                  orient="v", legend=True)
    # g2.set_axis_labels("", "Variant Frequency")
    # g2.set(yscale='log', ylim=(10 ** -6, 10 ** -2), xlim=(0, 12, 2))
    # # g2.set_yticklabels(fontsize=12)
    # g2.set_xticklabels(fontsize=9, rotation=90)
    # plt.show()
    # g2.savefig("/Users/odedkushnir/Google Drive/Studies/PhD/MyPosters/20190924 GGE/plots/Transition_Mutations_point_plot_RV", dpi=300)
    # g2.savefig(output_dir + "/Transition_Mutations_point_plot", dpi=300)
    # plt.close()
    replica_lst = [1, 2, 3]
    for replica in replica_lst:
        data_filter_replica = data_filter[data_filter["replica"] == replica]
        data_filter_replica["passage"] = data_filter_replica["passage"].astype(
            str)
        data_filter_replica["passage"] = "p" + data_filter_replica["passage"]
        if replica == 2:
            data_filter_replica = pd.read_pickle(input_dir + prefix +
                                                 "/data_filter.pkl")
            data_filter_replica["passage"] = data_filter_replica[
                "passage"].astype(int)
            data_filter_replica["no_variants"] = np.where(
                data_filter_replica["Prob"] < 0.95, 0,
                data_filter_replica["no_variants"])
            data_filter_replica["Read_count"] = data_filter_replica[
                data_filter_replica["Read_count"] > 10000]
            data_filter_replica["passage"] = data_filter_replica[
                "passage"].astype(str)
            data_filter_replica[
                "passage"] = "p" + data_filter_replica["passage"]
            data_filter_replica["replica"] = np.where(
                data_filter_replica["passage"] == "p0", 2,
                data_filter_replica["replica"])
            data_filter_replica = data_filter_replica[
                data_filter_replica["replica"] == replica]
        data_filter_replica["passage"] = np.where(
            data_filter_replica["passage"] == "p0", "RNA\nControl",
            data_filter_replica["passage"])

        if replica == 1:
            passage_order = ["RNA\nControl", "p2", "p5", "p8", "p12"]
            pairs = [(("RNA\nControl", "A>G"), ("RNA\nControl", "G>A")),
                     (("p2", "A>G"), ("p2", "G>A")),
                     (("p5", "A>G"), ("p5", "G>A")),
                     (("p8", "A>G"), ("p8", "G>A")),
                     (("p12", "A>G"), ("p12", "G>A")),
                     (("RNA\nControl", "A>G"), ("RNA\nControl", "U>C")),
                     (("p2", "A>G"), ("p2", "U>C")),
                     (("p5", "A>G"), ("p5", "U>C")),
                     (("p8", "A>G"), ("p8", "U>C")),
                     (("p12", "A>G"), ("p12", "U>C")),
                     (("RNA\nControl", "A>G"), ("RNA\nControl", "C>U")),
                     (("p2", "A>G"), ("p2", "C>U")),
                     (("p5", "A>G"), ("p5", "C>U")),
                     (("p8", "A>G"), ("p8", "C>U")),
                     (("p12", "A>G"), ("p12", "C>U"))]
            pairs_adar = [(("RNA\nControl", "High\nADAR-like\nA>G"),
                           ("RNA\nControl", "Low\nADAR-like\nA>G")),
                          (("p2", "High\nADAR-like\nA>G"),
                           ("p2", "Low\nADAR-like\nA>G")),
                          (("p5", "High\nADAR-like\nA>G"),
                           ("p5", "Low\nADAR-like\nA>G")),
                          (("p8", "High\nADAR-like\nA>G"),
                           ("p8", "Low\nADAR-like\nA>G")),
                          (("p12", "High\nADAR-like\nA>G"),
                           ("p12", "Low\nADAR-like\nA>G")),
                          (("p2", "High\nADAR-like\nU>C"),
                           ("p2", "Low\nADAR-like\nU>C")),
                          (("p5", "High\nADAR-like\nU>C"),
                           ("p5", "Low\nADAR-like\nU>C")),
                          (("p8", "High\nADAR-like\nU>C"),
                           ("p8", "Low\nADAR-like\nU>C")),
                          (("p12", "High\nADAR-like\nU>C"),
                           ("p12", "Low\nADAR-like\nU>C"))]
        else:
            passage_order = ["RNA\nControl", "p2", "p5", "p8", "p10", "p12"]
            pairs = [(("RNA\nControl", "A>G"), ("RNA\nControl", "G>A")),
                     (("p2", "A>G"), ("p2", "G>A")),
                     (("p5", "A>G"), ("p5", "G>A")),
                     (("p8", "A>G"), ("p8", "G>A")),
                     (("p10", "A>G"), ("p10", "G>A")),
                     (("p12", "A>G"), ("p12", "G>A")),
                     (("RNA\nControl", "A>G"), ("RNA\nControl", "U>C")),
                     (("p2", "A>G"), ("p2", "U>C")),
                     (("p5", "A>G"), ("p5", "U>C")),
                     (("p8", "A>G"), ("p8", "U>C")),
                     (("p10", "A>G"), ("p10", "U>C")),
                     (("p12", "A>G"), ("p12", "U>C")),
                     (("RNA\nControl", "A>G"), ("RNA\nControl", "C>U")),
                     (("p2", "A>G"), ("p2", "C>U")),
                     (("p5", "A>G"), ("p5", "C>U")),
                     (("p8", "A>G"), ("p8", "C>U")),
                     (("p10", "A>G"), ("p10", "C>U")),
                     (("p12", "A>G"), ("p12", "C>U"))]
            pairs_adar = [(("RNA\nControl", "High\nADAR-like\nA>G"),
                           ("RNA\nControl", "Low\nADAR-like\nA>G")),
                          (("p2", "High\nADAR-like\nA>G"),
                           ("p2", "Low\nADAR-like\nA>G")),
                          (("p5", "High\nADAR-like\nA>G"),
                           ("p5", "Low\nADAR-like\nA>G")),
                          (("p8", "High\nADAR-like\nA>G"),
                           ("p8", "Low\nADAR-like\nA>G")),
                          (("p10", "High\nADAR-like\nA>G"),
                           ("p10", "Low\nADAR-like\nA>G")),
                          (("p12", "High\nADAR-like\nA>G"),
                           ("p12", "Low\nADAR-like\nA>G")),
                          (("RNA\nControl", "High\nADAR-like\nU>C"),
                           ("RNA\nControl", "Low\nADAR-like\nU>C")),
                          (("p2", "High\nADAR-like\nU>C"),
                           ("p2", "Low\nADAR-like\nU>C")),
                          (("p5", "High\nADAR-like\nU>C"),
                           ("p5", "Low\nADAR-like\nU>C")),
                          (("p8", "High\nADAR-like\nU>C"),
                           ("p8", "Low\nADAR-like\nU>C")),
                          (("p10", "High\nADAR-like\nU>C"),
                           ("p10", "Low\nADAR-like\nU>C")),
                          (("p12", "High\nADAR-like\nU>C"),
                           ("p12", "Low\nADAR-like\nU>C"))]

        passage_g = sns.catplot(x="passage",
                                y="frac_and_weight",
                                data=data_filter_replica,
                                hue="Mutation",
                                order=passage_order,
                                palette=mutation_palette(4),
                                kind="point",
                                dodge=0.5,
                                hue_order=transition_order,
                                join=False,
                                estimator=weighted_varaint,
                                orient="v",
                                legend=True)
        passage_g.set_axis_labels(
            "Passage", "Variant Frequency {} CI=95%".format(plus_minus))
        passage_g.set(yscale='log', ylim=(10**-6, 10**-2))
        plt.savefig(
            output_dir +
            "/Transition_Mutations_point_plot_RVB14_replica%s" % str(replica),
            dpi=300)
        plt.close()

        passage_g1 = sns.boxplot(x="passage",
                                 y="Frequency",
                                 data=data_filter_replica,
                                 hue="Mutation",
                                 order=passage_order,
                                 palette=mutation_palette(4),
                                 dodge=True,
                                 hue_order=transition_order)
        passage_g1.set_yscale('log')
        passage_g1.set_ylim(10**-6, 10**-2)
        passage_g1.set(xlabel="Passage", ylabel="Variant Frequency")
        annot = Annotator(passage_g1,
                          pairs,
                          x="passage",
                          y="Frequency",
                          hue="Mutation",
                          data=data_filter_replica,
                          order=passage_order,
                          hue_order=transition_order)
        annot.configure(test='t-test_welch',
                        text_format='star',
                        loc='outside',
                        verbose=2,
                        comparisons_correction="Bonferroni")
        annot.apply_test()
        file_path = output_dir + "/sts{0}.csv".format(replica)
        with open(file_path, "w") as o:
            with contextlib.redirect_stdout(o):
                passage_g1, test_results = annot.annotate()
        plt.legend(bbox_to_anchor=(1.05, 0.5), loc=2, borderaxespad=0.)
        plt.tight_layout()
        plt.savefig(
            output_dir +
            "/Transition_Mutations_box_stat_plot_RVB14_replica{0}".format(
                replica),
            dpi=300)
        plt.close()
        # data_filter["passage"] = data_filter["passage"].astype(int)
        #
        #
        # g4 = sns.relplot("passage", "frac_and_weight", data=data_filter, hue="Mutation", palette=mutation_palette(4),
        #                  hue_order=transition_order, estimator=weighted_varaint, col="Type", kind="line",
        #                  col_order=type_order)
        #
        # g4.axes.flat[0].set_yscale('symlog', linthreshy=10 ** -5)
        # g4.set_axis_labels("Passage", "Variant Frequency")
        # # plt.show()
        # g4.savefig(output_dir + "/Time_Transition_Mutations_line_plot", dpi=300)
        # plt.close()
        """ADAR preferences"""
        data_filter_replica_synonymous = data_filter_replica.loc[
            data_filter_replica.Type == "Synonymous"]
        # data_filter_synonymous["ADAR_like"] = (data_filter_synonymous.Prev.str.contains('UpA') | data_filter_synonymous.Prev.str.contains('ApA'))
        data_filter_replica_synonymous["Mutation"] = np.where(
            ((data_filter_replica_synonymous["Mutation"] == "A>G") &
             (data_filter_replica_synonymous["5`_ADAR_Preference"] == "High")),
            "High\nADAR-like\nA>G",
            np.where(
                ((data_filter_replica_synonymous["Mutation"] == "A>G")
                 & (data_filter_replica_synonymous["5`_ADAR_Preference"]
                    == "Intermediate")), "Intermediate\nADAR-like\nA>G",
                np.where(
                    ((data_filter_replica_synonymous["Mutation"] == "A>G") &
                     (data_filter_replica_synonymous["5`_ADAR_Preference"]
                      == "Low")), "Low\nADAR-like\nA>G",
                    data_filter_replica_synonymous["Mutation"])))
        data_filter_replica_synonymous["Mutation_adar"] = np.where(
            ((data_filter_replica_synonymous["Mutation"] == "U>C") &
             (data_filter_replica_synonymous["3`_ADAR_Preference"] == "High")),
            "High\nADAR-like\nU>C",
            np.where(
                ((data_filter_replica_synonymous["Mutation"] == "U>C")
                 & (data_filter_replica_synonymous["3`_ADAR_Preference"]
                    == "Intermediate")), "Intermediate\nADAR-like\nU>C",
                np.where(
                    ((data_filter_replica_synonymous["Mutation"] == "U>C") &
                     (data_filter_replica_synonymous["3`_ADAR_Preference"]
                      == "Low")), "Low\nADAR-like\nU>C",
                    data_filter_replica_synonymous["Mutation"])))
        mutation_adar_order = [
            "High\nADAR-like\nA>G", "Low\nADAR-like\nA>G",
            "High\nADAR-like\nU>C", "Low\nADAR-like\nU>C"
        ]
        # data_filter_replica_synonymous["passage"] = data_filter_replica_synonymous["passage"].astype(str)
        # data_filter_replica_synonymous["passage"] = "p" + data_filter_replica_synonymous["passage"]
        catplot_adar = sns.catplot(x="passage",
                                   y="frac_and_weight",
                                   data=data_filter_replica_synonymous,
                                   hue="Mutation_adar",
                                   order=passage_order,
                                   palette=mutation_palette(4, adar=True),
                                   kind="point",
                                   dodge=0.5,
                                   hue_order=mutation_adar_order,
                                   join=False,
                                   estimator=weighted_varaint,
                                   orient="v",
                                   legend=True)
        catplot_adar.set_axis_labels(
            "Passage", "Variant Frequency {} CI=95%".format(plus_minus))
        catplot_adar.set(yscale='log')
        catplot_adar.set(ylim=(10**-6, 10**-2))
        # catplot_adar.set_xticklabels(fontsize=8)
        # plt.tight_layout()
        plt.savefig(
            output_dir +
            "/adar_pref_mutation_point_plot_RVB14_replica{0}.png".format(
                replica),
            dpi=300)
        plt.close()

        adar_g = sns.boxplot(x="passage",
                             y="Frequency",
                             data=data_filter_replica_synonymous,
                             hue="Mutation_adar",
                             order=passage_order,
                             palette=mutation_palette(4, adar=True),
                             dodge=True,
                             hue_order=mutation_adar_order)
        adar_g.set_yscale('log')
        adar_g.set_ylim(10**-6, 10**-1)
        adar_g.set(xlabel="Passage", ylabel="Variant Frequency")

        annot = Annotator(adar_g,
                          pairs_adar,
                          x="passage",
                          y="Frequency",
                          hue="Mutation_adar",
                          data=data_filter_replica_synonymous,
                          hue_order=mutation_adar_order)
        annot.configure(test='t-test_welch',
                        text_format='star',
                        loc='outside',
                        verbose=2,
                        comparisons_correction="Bonferroni")
        annot.apply_test()
        file_path = output_dir + "/sts_adar_{0}.csv".format(replica)
        with open(file_path, "w") as o:
            with contextlib.redirect_stdout(o):
                adar_g, test_results = annot.annotate()
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.tight_layout()
        plt.savefig(
            output_dir +
            "/adar_pref_mutation_box_stat_plot_RVB14_replica{0}".format(
                replica),
            dpi=300)
        plt.close()
def main():
    """RV"""
    # input_dir = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/passages"
    # Local input path; the commented-out input_dir above is the /Volumes/STERNADILABHOME$ alternative.
    input_dir = "/Users/odedkushnir/PhD_Projects/After_review/AccuNGS/RV/passages"
    experiment = "passages"
    today = datetime.date.today().strftime("%Y%m%d")
    output_dir = input_dir + "/{0}_Co_occur_all_{1}".format(today, experiment)
    prefix = "/p*"
    date = "20201012"
    min_coverage = 5000
    virus = "RVB14"

    q = "q38"
    """1"""
    region_lst = [
        629, 835, 1621, 2329, 3196, 3634, 3925, 4915, 5170, 5239, 5785, 7165
    ]
    # data_all_passages = creating_co_occur_passages_df(input_dir, experiment, prefix, date, q, region_lst, output_dir)
    data_all_passages = creating_co_occur_patients_df(input_dir, experiment,
                                                      prefix, date, q,
                                                      region_lst, output_dir)

    data_all_passages_grouped = data_all_passages.groupby(
        ["label", "New_Stretch"])["Frequency"].agg(np.mean)
    data_all_passages_grouped = data_all_passages_grouped.reset_index()
    data_all_passages_grouped = data_all_passages_grouped.rename(
        columns={"Frequency": "meanfreq"})
    data_all_passages = data_all_passages.merge(data_all_passages_grouped,
                                                how="left",
                                                on=["label", "New_Stretch"])
    # label_order = ["p2-1", "p2-2", "p2-3", "p5-1", "p5-2", "p5-3", "p8-1", "p8-2", "p8-3", "p10-2", "p10-3", "p12-1",
    #                "p12-2", "p12-3"]
    label_order = ["p2-3", "p5-3", "p8-3", "p10-3", "p12-3"]
    style_order = [
        "No editing context", "ADAR (antisense)", "ADAR (sense)", "APOBEC3F"
    ]
    style_adar = ["ADAR (antisense)", "ADAR (sense)"]
    markers = {
        "No editing context": "o",
        "ADAR (antisense)": "<",
        "ADAR (sense)": ">",
        "APOBEC3F": "*"
    }
    markers_adar = {"ADAR (antisense)": "<", "ADAR (sense)": ">"}
    hue_order = ["A>G", "U>C", "G>A", "C>U"]
    hue_order_adar = ["A>G", "U>C"]
    file_name = "Passages_Editing_context"
    plot_editing_context_plot(data_all_passages,
                              output_dir,
                              file_name,
                              label_order,
                              style_order=style_order,
                              markers=markers,
                              hue_order=hue_order,
                              mutation_palette=mutation_palette(4),
                              experiment=experiment,
                              col_wrap=3)
    data_all_passages = data_all_passages[data_all_passages["ADAR"] != "No"]
    file_name = "Passages_ADAR_context"
    plot_editing_context_plot(data_all_passages,
                              output_dir,
                              file_name,
                              label_order,
                              style_order=style_adar,
                              markers=markers_adar,
                              hue_order=hue_order_adar,
                              mutation_palette=mutation_palette(2,
                                                                adar=True,
                                                                ag=True,
                                                                uc=True),
                              experiment=experiment,
                              col_wrap=3)
    # """2"""
    # # data_all_passages = pd.read_pickle(output_dir + "/all_co_occur_protein.pkl")
    # df_co_occur_new = grouped_co_occur(data_all_passages, input_dir, experiment, output_dir, q)
    #
    # g1 = sns.boxenplot(x="Stretch_Type_Non-Synonymous", y="Stretch_Freq", data=df_co_occur_new,
    #                  order=[False, True], color="0.8")
    # g1 = sns.stripplot(x="Stretch_Type_Non-Synonymous", y="Stretch_Freq", data=df_co_occur_new,
    #                  order=[False, True], palette=[sns.color_palette("muted")[3], sns.color_palette("muted")[4]])
    # old_statannot.add_stat_annotation(g1, data=df_co_occur_new, x="Stretch_Type_Non-Synonymous", y="Stretch_Freq",
    #                     boxPairList=[(False, True)], test='Mann-Whitney', textFormat='star',
    #                                   loc='outside', verbose=2,
    #                                   order=[False, True])
    # g1.set(xticklabels=("Synonymous\nStretches", "Stretches\nWith Non-Synonymous"))
    # g1.set(xlabel="")
    # g1.set(ylabel = "Stretch Frequency")
    # g1.set(ylim= (0, 0.007))
    # plt.tight_layout()
    # plt.savefig(output_dir + "/Frequencies_of_Stertch_Type_Non-Synonymous", dpi=300)
    # plt.close()
    # df_adar_path = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/passages/" \
    #                "inosine_predict_context/data_filter.csv"
    # """3"""
    # # df_co_occur_new = pd.read_pickle(output_dir + "/all_co_occur_grouped.pkl")
    # df_regression = regression_stretches(df_co_occur_new, output_dir, df_adar_path, experiment, min_coverage)
    # plot_regression(df_regression, output_dir)
    # """4"""
    # # df_regression = pd.read_pickle(output_dir + "/all_co_occur_grouped_adar_preferences.pkl")
    # plot_regression_stretches_adar_preferences(df_regression, output_dir)
    """RV-Capsid_Free"""
    # input_dir = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/capsid"
    # experiment = "capsid"
    # output_dir = input_dir + "/20201129Co_occur_all_%s" % experiment
    # prefix = "/*_3*"
    # min_coverage = 5000
    # virus = "RVB14"
    # date = "20201012"
    # q = "q38"
    # """1"""
    # region_lst = [629, 835, 1621, 2329, 3196, 3634, 3925, 4915, 5170, 5239, 5785, 7165]
    # # data_all_capsid = creating_co_occur_passages_df(input_dir, experiment, prefix, date, q, region_lst, output_dir)
    # data_all_capsid = creating_co_occur_patients_df(input_dir, experiment, prefix, date, q, region_lst, output_dir)
    # data_all_capsid["label"] = np.where(data_all_capsid["label"] == "Free-31-Amicon", "Free #1",
    #                                     data_all_capsid["label"])
    # data_all_capsid["label"] = np.where(data_all_capsid["label"] == "Free-32-Ultra", "Free #2",
    #                                     data_all_capsid["label"])
    # data_all_capsid["label"] = np.where(data_all_capsid["label"] == "Free-33-Ultra", "Free #3",
    #                                     data_all_capsid["label"])
    # data_all_capsid["label"] = np.where(data_all_capsid["label"] == "Capsid-31-Amicon", "Capsid #1",
    #                                     data_all_capsid["label"])
    # data_all_capsid["label"] = np.where(data_all_capsid["label"] == "Capsid-33-Ultra", "Capsid #3",
    #                                     data_all_capsid["label"])
    # label_order = ["Free #1", "Free #2", "Free #3", "Capsid #1", "Capsid #3"]
    # style_order = ["No editing context", "ADAR (antisense)", "ADAR (sense)", "APOBEC3F"]
    # style_adar = ["ADAR (antisense)", "ADAR (sense)"]
    # markers = {"No editing context": "o", "ADAR (antisense)": "<", "ADAR (sense)": ">", "APOBEC3F": "*"}
    # markers_adar = {"ADAR (antisense)": "<", "ADAR (sense)": ">"}
    # hue_order = ["A>G", "U>C", "G>A", "C>U"]
    # hue_order_adar = ["A>G", "U>C"]
    # file_name = "Capsid_Editing_context"
    # plot_editing_context_plot(data_all_capsid, output_dir, file_name, label_order, col_wrap=3, style_order=style_order,
    #                           markers=markers, hue_order=hue_order, mutation_palette=mutation_palette(4))
    # data_all_capsid = data_all_capsid[data_all_capsid["ADAR"] != "No"]
    # file_name = "Capsid_ADAR_context"
    # plot_editing_context_plot(data_all_capsid, output_dir, file_name, label_order, col_wrap=3, style_order=style_adar,
    #                           markers=markers_adar, hue_order=hue_order_adar,
    #                           mutation_palette=mutation_palette(2, adar=True, ag=True, uc=True))

    # """2"""
    # # data_all_capsid = pd.read_pickle(output_dir + "/all_co_occur_protein.pkl")
    # # df_co_occur_new = grouped_co_occur(data_all_capsid, input_dir, experiment, output_dir, q)
    # #
    # # Plots
    # df_co_occur_new = pd.read_pickle(output_dir + "/all_co_occur_grouped.pkl")
    # g1 = sns.catplot(x="RNA", y="Stretch_Freq", data=df_co_occur_new, order=["Capsid", "Free"], kind="strip",
    #                  col="replica", palette=[sns.color_palette("muted")[3], sns.color_palette("muted")[4]],
    #                  legend=True)
    # g1.set(yscale='log')
    # g1.set(ylim=(10**-5, 10**-1))
    # plt.tight_layout()
    # plt.savefig(output_dir + "/Frequencies_of_Stertch", dpi=300)
    # plt.close()
    # stretch_g1 = sns.catplot(x="Stretch_Type_Non-Synonymous", y="Stretch_Freq", data=df_co_occur_new,
    #                          order=[False, True], hue="RNA", hue_order=["Capsid", "Free"], kind="strip", col="replica",
    #                                         palette=[sns.color_palette()[2], sns.color_palette()[6]],
    #                          sharey=True)
    # stretch_g1.set(xticklabels=("Synonymous\nStretches", "Stretches\nWith Non-Synonymous"))
    # stretch_g1.set(xlabel="")
    # stretch_g1.set(ylabel = "Stretch Frequency")
    # stretch_g1.set(ylim= (0, 0.007))
    #
    # plt.tight_layout()
    #
    # # g1 = sns.boxenplot(x="Stretch_Type_Non-Synonymous", y="Stretch_Freq", data=df_co_occur_new,
    # #                  order=[False, True], color="0.8", hue="RNA", hue_order=["Capsid", "Free"])
    # # g1 = sns.stripplot(x="Stretch_Type_Non-Synonymous", y="Stretch_Freq", data=df_co_occur_new,
    # #                  order=[False, True], hue="RNA", hue_order=["Capsid", "Free"], palette=[sns.color_palette("muted")[3], sns.color_palette("muted")[4]])
    # # # old_statannot.add_stat_annotation(g1, data=df_co_occur_new, x="Stretch_Type_Non-Synonymous", y="Stretch_Freq",
    # # #                     boxPairList=[(False, True)], test='Mann-Whitney', textFormat='star',
    # # #                                   loc='outside', verbose=2,
    # # #                                   order=[False, True])
    # # g1.set(xticklabels=("Synonymous\nStretches", "Stretches\nWith Non-Synonymous"))
    # # g1.set(xlabel="")
    # # g1.set(ylabel = "Stretch Frequency")
    # plt.savefig(output_dir + "/Frequencies_of_Stertch_Type_Non-Synonymous_replicas", dpi=300)
    #
    # """3"""
    # df_adar_path = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/capsid/" \
    #                "inosine_predict_context/data_filter.csv"
    #
    # # df_co_occur_new = pd.read_pickle(output_dir + "/all_co_occur_grouped.pkl")
    # df_regression = regression_stretches(df_co_occur_new, output_dir, df_adar_path, experiment, min_coverage)
    # """4"""
    # # df_regression = pd.read_pickle(output_dir + "/all_co_occur_grouped_adar_preferences.pkl")
    # plot_capsid_free_plot(df_regression, output_dir)
    """RV-Patients"""
    # input_dir = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/patients"
    # experiment = "patients"
    # output_dir = input_dir + "/20201202Co_occur_%s" % experiment
    # prefix = "/Patient_*"
    # min_coverage = 5000
    # virus = "RVA"
    # date = "20201124"
    # q = "q30_consensusX7"
    #
    #
    # region_lst = [503, 709, 1495, 2197, 3094, 3523, 3808, 4771, 5005, 5068, 5617, 6728]
    # data_all_patients = creating_co_occur_patients_df(input_dir, experiment, prefix, date, q, region_lst, output_dir)
    # label_order = ["Patient-1", "Patient-4", "Patient-5", "Patient-9", "Patient-16", "Patient-17", "Patient-20"]
    # style_order = ["No editing context", "ADAR (antisense)", "ADAR (sense)", "APOBEC3F"]
    # style_adar = ["ADAR (antisense)", "ADAR (sense)"]
    # markers = {"No editing context": "o", "ADAR (antisense)": "<", "ADAR (sense)": ">", "APOBEC3F": "*"}
    # markers_adar = {"ADAR (antisense)": "<", "ADAR (sense)": ">"}
    # hue_order = ["A>G", "U>C", "G>A", "C>U"]
    # hue_order_adar = ["A>G", "U>C"]
    # file_name = "Patient_Editing_context"
    # plot_editing_context_plot(data_all_patients, output_dir, file_name, label_order, col_wrap=3, style_order=style_order
    #                           , markers=markers, hue_order=hue_order, mutation_palette=mutation_palette(4))
    # data_all_patients = data_all_patients[data_all_patients["ADAR"] != "No"]
    # file_name = "Patient_ADAR_context"
    # plot_editing_context_plot(data_all_patients, output_dir, file_name, label_order, col_wrap=3, style_order=style_adar,
    #                           markers=markers_adar, hue_order=hue_order_adar,
    #                           mutation_palette=mutation_palette(2, adar=True, ag=True, uc=True))
    #
    # # data_all_patients = pd.read_pickle(output_dir + "/all_co_occur_protein.pkl")
    #
    # df_co_occur_new = grouped_co_occur(data_all_patients, input_dir,experiment, output_dir, q=q.split("_")[0])

    # plot_patient_plot(df_co_occur_new, output_dir, "Frequencies_of_label_Stertch_Type_Non-Synonymous")

    # df_adar_path = "/Volumes/STERNADILABHOME$/volume3/okushnir/AccuNGS/20201008RV-202329127/merged/patients/" \
    #                "inosine_predict_context/data_filter.csv"

    # df_co_occur_new = pd.read_pickle(output_dir + "/all_co_occur_grouped.pkl")
    # df_regression = regression_stretches(df_co_occur_new, output_dir, df_adar_path, experiment, min_coverage)
    # df_regression = pd.read_pickle(output_dir + "/all_co_occur_grouped_adar_preferences.pkl")
    # plot_patient_plot(df_regression, output_dir)
    """CV"""