コード例 #1
0
def survival_calculator(input_data, session_key):

    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"

    template_plot_path = "images/" + session_key + "/"
    test_name = "bcr_analysis"

    if not os.path.exists(plot_path):
        os.mkdir(plot_path)
    else:
        files = glob.glob(plot_path + "*")
        for f in files:
            os.remove(f)

    filecount = 0
    file_list = []

    pt = MY_PLOT()

    surv_dataset = ['GSE40272', 'GSE70769']

    def table_split(cph):
        cph_d = cph.summary
        cph_d_index = cph_d.index.tolist()

        if len(cph_d_index) > 1:
            t_arr = []
            for x in cph_d_index:
                testing = x
                coef = format(cph_d['coef'].loc[x], ".2f")
                haz_c = format(cph_d['exp(coef)'].loc[x], ".2f")
                haz_c_int = str(
                    format(np.exp(
                        cph_d['lower 0.95'].loc[x]), ".2f")) + "-" + str(
                            format(np.exp(cph_d['upper 0.95'].loc[x]), ".2f"))
                pval = format(cph_d['p'].loc[x], ".3f")
                concor = format(
                    concordance_index(
                        cph.durations,
                        -cph.predict_partial_hazard(cph.data).values.ravel(),
                        cph.event_observed), ".2f")

                t_arr.append([
                    testing + "(Multivariate)", coef,
                    haz_c + "(" + haz_c_int + ")", pval, concor
                ])

            return t_arr
        else:
            for x in cph_d_index:
                testing = x
                coef = format(cph_d['coef'].loc[x], ".2f")
                haz_c = format(cph_d['exp(coef)'].loc[x], ".2f")
                haz_c_int = str(
                    format(np.exp(
                        cph_d['lower 0.95'].loc[x]), ".2f")) + "-" + str(
                            format(np.exp(cph_d['upper 0.95'].loc[x]), ".2f"))
                pval = format(cph_d['p'].loc[x], ".3f")
                concor = format(
                    concordance_index(
                        cph.durations,
                        -cph.predict_partial_hazard(cph.data).values.ravel(),
                        cph.event_observed), ".2f")

            return [[
                testing, coef, haz_c + "(" + haz_c_int + ")", pval, concor
            ]]

    expr_set_arr = []
    clinical_set_arr = []

    for surv in surv_dataset:
        expr_set = pd.read_csv('user_data/clinical_data/' + surv + '.csv',
                               index_col=0)
        expr_set.index = expr_set.index.astype(int)
        expr_set.index = expr_set.index.astype(str)
        clinical_set = pd.read_excel('user_data/clinical_data/' + surv +
                                     '_clinical.xlsx',
                                     index_col=0)

        expr_set_arr.append(expr_set)
        clinical_set_arr.append(clinical_set)

    test_set_arr = []
    for expr in expr_set_arr:
        if len(input_data) > 1:
            df_data = gene_set_zscore(expr,
                                      input_data,
                                      sample_status="multiple")
            test_set = pd.Series(data=df_data,
                                 index=expr.columns.tolist(),
                                 name='Input_Zscore')
            test_set_arr.append(test_set)
            surv_input = 'Input_Zscore'
        elif len(input_data) == 1:
            test_set = expr.loc[input_data[0]]
            test_set_arr.append(test_set)
            surv_input = input_data[0]

    dv_arr = []
    for i, test in enumerate(test_set_arr):
        test_set_up = test[test > test.median()]
        test_set_down = test[test < test.median()]
        clinical_up = clinical_set_arr[i].loc[test_set_up.index.tolist()]
        clinical_down = clinical_set_arr[i].loc[test_set_down.index.tolist()]
        clinical_arr = [clinical_up, clinical_down]

        dv_arr.append(clinical_arr)

    for dv in dv_arr:
        pt.survival_plot_and_cox(
            dv,
            label=[surv_input + '_up', surv_input + '_down'],
            filename=plot_path + test_name + str(filecount))
        file_list.append(template_plot_path + test_name + str(filecount) +
                         ".png")
        filecount += 1

    table_arr = []
    cph = CoxPHFitter()
    for i, test in enumerate(test_set_arr):
        cph_data = test
        cph_data.loc[cph_data > test_set.median()] = 1
        cph_data.loc[cph_data < test_set.median()] = 0
        cph_data = pd.concat([cph_data, clinical_set_arr[i]], axis=1)

        table_data = []
        cph.fit(cph_data[[surv_input, 'bcrstatus', 'bcrmonth']].dropna(),
                'bcrmonth',
                event_col='bcrstatus')
        table_data = table_split(cph)

        cph.fit(cph_data[['gleason', 'bcrstatus', 'bcrmonth']].dropna(),
                'bcrmonth',
                event_col='bcrstatus')
        table_data = table_data + table_split(cph)

        cph.fit(cph_data[[surv_input, 'gleason', 'bcrstatus',
                          'bcrmonth']].dropna(),
                'bcrmonth',
                event_col='bcrstatus')
        table_data = table_data + table_split(cph)

        #table_data = [['Variable','Coefficient','Hazard Ratio','P-value','C-index']]+table_data

        table_arr.append(table_data)
        rw = open(plot_path + surv_dataset[i] + '.cox.tsv', "w")
        rw.write("Variable\tCoefficient\tHR(95%Conf)\tP-value\tC-index\n")

        for x in table_data:
            rw.write('\t'.join(x) + '\n')
        rw.close()

    return file_list, table_arr
コード例 #2
0
def set_calculator(input_data,
                   group_info,
                   group_samples,
                   session_key,
                   set_name='USER_GENE_SET'):
    def fet_f(a1, b1, total_gene_assum):
        a1_inter_b1 = list(set(a1).intersection(b1))
        a1_unique_fromb1 = list(set(a1) - set(a1_inter_b1))
        b1_unique_froma1 = list(set(b1) - set(a1_inter_b1))

        oddsratio, pvalue = stats.fisher_exact(
            [[len(a1_inter_b1), len(b1_unique_froma1)],
             [
                 len(a1_unique_fromb1),
                 total_gene_assum - (len(a1_inter_b1) + len(b1_unique_froma1) +
                                     len(a1_unique_fromb1))
             ]])
        return len(a1), len(a1_inter_b1), pvalue

    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"
    nginx_plot_path = "/home/ubuntu/django_proj/pcta_updated/main/staticimages/" + session_key + "/"

    template_plot_path = "images/" + session_key + "/"
    test_name = "USER_SET"

    if not os.path.exists(plot_path):
        os.mkdir(plot_path)
        os.mkdir(nginx_plot_path)
    else:
        files = glob.glob(plot_path + "*")
        for f in files:
            os.remove(f)
        files = glob.glob(nginx_plot_path + "*")
        for f in files:
            os.remove(f)

    filecount = 0
    file_list = []

    pt = MY_PLOT()
    df_all = all_expr_df

    gsea_mapping_rate = len(
        list(set(input_data).intersection(df_all.index.tolist())))
    list_numb = len(input_data)
    gsea_mapping_rate = float(gsea_mapping_rate) / float(list_numb) * 100
    gsea_mapping_rate = "%.2f" % gsea_mapping_rate

    #############GSEA#############
    gmt_temp = set_name + '\tNA\t' + '\t'.join(input_data)
    #fixed_path_gmt = 'user_data/'+userID+'/user.gmt'
    fixed_path_gmt = plot_path + 'user.gmt'
    rw = open(fixed_path_gmt, 'w')
    rw.write(gmt_temp)
    rw.close()

    sample_list = group_samples
    class_vector = [[group_info[i]] * len(item)
                    for i, item in enumerate(sample_list)]
    class_vector = [y for x in class_vector for y in x]
    class_vector = map(str, class_vector)

    df_user_s = [df_all[s] for s in sample_list]
    df_user_s = pd.concat(df_user_s, axis=1)

    df_user_s.columns = range(len(df_user_s.columns.tolist()))
    df_user_s = df_user_s.reset_index()

    gsea_result = gseapy.gsea(data=df_user_s,
                              gene_sets=fixed_path_gmt,
                              cls=class_vector,
                              outdir=plot_path,
                              min_size=2,
                              max_size=1000,
                              weighted_score_type=1,
                              permutation_type='phenotype',
                              method='signal_to_noise',
                              ascending=False,
                              figsize=(6.5, 6),
                              format='png')
    #ledge_genes = gsea_result.results[gsea_result.results.keys()[0]]['ledge_genes'].split(";")## leading edge subset
    pval_es = gsea_result.results[gsea_result.results.keys()[0]][
        'pval']  ###GSEA p-value

    #with Image(filename=plot_path+set_name+".gsea.pdf", resolution=300) as img:
    #	with Image(width=img.width, height=img.height, background=Color("white")) as bg:
    #		bg.composite(img,0,0)
    #		bg.save(filename=plot_path+set_name+".gsea.png")

    if pval_es <= 0.05:

        file_list.append(template_plot_path + set_name + ".gsea.png")
        filecount += 1
    else:
        file_list.append(" ")
        gsea_mapping_rate = 'Not Applicable'
        filecount += 1
    #############GSEA#############

    fold_change = all_expr_df[sample_list[0]].median(
        axis=1) - all_expr_df[sample_list[1]].median(axis=1)

    if pval_es <= 0.05:
        #############MRA#############
        mra_set_t = mra_set.T
        mra_list = list(set(mra_set.index.tolist()))
        mra_targets = [mra_set_t[x].values.tolist() for x in mra_list]
        mra_targets = [
            map(str, x[0]) if type(x[0]) == list else [str(x[0])]
            for x in mra_targets
        ]

        total_genes = len(list(set(mra_set.values.flatten())))

        pvals = [
            fet_f(mra_targets[a], input_data, total_genes)
            for a in range(len(mra_list))
        ]
        #pvals = [fet_f(mra_targets[a], ledge_genes, total_genes) for a in range(len(mra_list))] ## leading edge subset
        pvals_list = [[
            mra_list[i],
            int(item[0]),
            int(item[1]),
            float("{0:.4f}".format(item[2])),
            float("{0:.4f}".format(fold_change.loc[mra_list[i]]))
        ] for i, item in enumerate(pvals) if item[2] < 0.01
                      and fold_change.loc[mra_list[i]] >= 0.1 and item[0] > 10]
        #table_arr = pvals_list #####Table data

        index_change = lambda x, y: [y] + x[1:]
        #table_arr = [index_change(x,pcta_id.loc[x[0]]['Symbol']) for x in table_arr]

        network_data = pd.DataFrame(
            data=pvals_list, columns=['TF', 'targets', 'mapped', 'pval', 'fc'])
        network_data = network_data.set_index('TF')
        network_data['Symbol'] = pcta_id.loc[map(
            str, network_data.index.tolist())]['Symbol']

        network_data[
            'prob_mapped'] = network_data['mapped'] / network_data['targets']
        network_data = network_data.sort_values('prob_mapped', ascending=False)
        network_data = network_data.loc[network_data.index.tolist()[:10]]
        network_data = network_data.round(4)
        network_data['targets'].astype(int)
        network_data['mapped'].astype(int)

        selected_network_expr = df_all[group_samples[0]].loc[
            network_data.index.tolist()[:10]]
        selected_network_expr['Symbol'] = pcta_id.loc[map(
            str, selected_network_expr.index.tolist())]['Symbol']

        network_data = network_data.set_index('Symbol')
        selected_network_expr = selected_network_expr.set_index('Symbol')

        pt.network_plot(network_data,
                        selected_network_expr,
                        tit=group_info[0],
                        filename=plot_path + test_name + str(filecount))
        file_list.append(template_plot_path + test_name + str(filecount) +
                         ".png")

        network_data = network_data[[
            'targets', 'mapped', 'prob_mapped', 'fc', 'pval'
        ]]

        int_ = lambda x: [int(x[0]), int(x[1])] + x[2:]
        table_arr = [[i] + int_(network_data.loc[i].tolist())
                     for i in network_data.index.tolist()]

        network_data.to_csv(plot_path + 'mra_candidates.csv')

        os.system("cp -rf %s/* %s" % (plot_path, nginx_plot_path))

        #############MRA#############

    else:
        table_arr = []
        file_list.append(" ")

    return file_list, table_arr, gsea_mapping_rate
コード例 #3
0
def association_calculator(input_data, group_info, group_samples, session_key):

    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"
    template_plot_path = "images/" + session_key + "/"
    test_name = "association_test"

    if not os.path.exists(plot_path):
        os.mkdir(plot_path)
    else:
        files = glob.glob(plot_path + "*")
        print files
        for f in files:
            os.remove(f)

    filecount = 0
    file_list = []

    #####Lollipop
    pt = MY_PLOT()
    df = all_expr_df
    df = df.drop('Symbol', axis=1)

    if len(input_data) > 1:
        df_data = [
            gene_set_zscore(df[samples], input_data, sample_status="multiple")
            for samples in group_samples
        ]
        ylab = 'Z score'
    elif len(input_data) == 1:
        df_data = [
            df[samples].loc[input_data[0]].values.tolist()
            for samples in group_samples
        ]
        ylab = 'Expression'

    pt.lollipop([sorted(d) for d in df_data],
                ylab=ylab,
                label_data=group_info,
                filename=plot_path + test_name + str(filecount))
    file_list.append(template_plot_path + test_name + str(filecount) + ".png")
    filecount += 1
    #####Lollipop

    #####Violin
    df_data_arr = [pd.DataFrame(data=d) for d in df_data]
    df_data_arr = pd.concat(df_data_arr, axis=1)
    df_data_arr.columns = group_info

    pt.violin_plt(df_data_arr,
                  ylab=ylab,
                  tit='',
                  filename=plot_path + test_name + str(filecount))
    file_list.append(template_plot_path + test_name + str(filecount) + ".png")
    filecount += 1
    #####Violin

    #####Histogram
    pt.histogram_group(df_data_arr,
                       legend=True,
                       filename=plot_path + test_name + str(filecount))
    file_list.append(template_plot_path + test_name + str(filecount) + ".png")
    filecount += 1
    #####Histogram

    return file_list
コード例 #4
0
def correlation_calculator(input_data1, input_data2, group_info, group_samples,
                           session_key):

    ####ALL SAMPLES INSERTION####
    group_info = ['ALL'] + group_info
    all_samples = [sample for samples in group_samples for sample in samples]
    group_samples.insert(0, all_samples)

    ####ALL SAMPLES INSERTION####

    def input_data_process(df, input_d):
        if len(input_d) > 1:
            df_data = [
                gene_set_zscore(df[samples], input_d, sample_status="multiple")
                for samples in group_samples
            ]
        elif len(input_d) == 1:
            df_data = [
                df[samples].loc[input_d[0]].values.tolist()
                for samples in group_samples
            ]
        return df_data

    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"
    nginx_plot_path = "/home/ubuntu/django_proj/pcta_updated/main/staticimages/" + session_key + "/"

    template_plot_path = "images/" + session_key + "/"
    test_name = "correlation_test"

    if not os.path.exists(plot_path):
        os.mkdir(plot_path)
        os.mkdir(nginx_plot_path)
    else:
        files = glob.glob(plot_path + "*")
        for f in files:
            os.remove(f)

    filecount = 0
    file_list = []

    #####Scatter
    pt = MY_PLOT()
    df = all_expr_df
    df = df.drop('Symbol', axis=1)

    df_data1 = input_data_process(df, input_data1)
    df_data2 = input_data_process(df, input_data2)
    group_df = []

    for a in range(len(group_info)):
        d = {'Input1': df_data1[a], 'Input2': df_data2[a]}
        group_df.append(pd.DataFrame(data=d))

    pt.shared_scatter(group_df,
                      x_dat='Input1',
                      y_dat='Input2',
                      title=group_info,
                      filename=plot_path + test_name + str(filecount))
    file_list.append(template_plot_path + test_name + str(filecount) + ".png")
    filecount += 1

    os.system("cp %s/* %s" % (plot_path, nginx_plot_path))

    return file_list
コード例 #5
0
def set_c(input_data, group_info, group_samples, session_key):
    def fet_f(a1, b1, total_gene_assum):
        a1_inter_b1 = list(set(a1).intersection(b1))
        a1_unique_fromb1 = list(set(a1) - set(a1_inter_b1))
        b1_unique_froma1 = list(set(b1) - set(a1_inter_b1))

        oddsratio, pvalue = stats.fisher_exact(
            [[len(a1_inter_b1), len(b1_unique_froma1)],
             [
                 len(a1_unique_fromb1),
                 total_gene_assum - (len(a1_inter_b1) + len(b1_unique_froma1) +
                                     len(a1_unique_fromb1))
             ]])
        return len(a1), len(a1_inter_b1), pvalue

    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"
    template_plot_path = "images/" + session_key + "/"
    test_name = "USER_SET"

    if not os.path.exists(plot_path):
        os.mkdir(plot_path)
    else:
        files = glob.glob(plot_path + "*")
        for f in files:
            os.remove(f)

    filecount = 0
    file_list = []

    pt = MY_PLOT()
    df_all = all_expr_df

    #############GSEA#############
    gmt_temp = 'USER_SET\tNA\t' + '\t'.join(input_data)
    #fixed_path_gmt = 'user_data/'+userID+'/user.gmt'
    fixed_path_gmt = plot_path + 'user.gmt'
    rw = open(fixed_path_gmt, 'w')
    rw.write(gmt_temp)
    rw.close()

    sample_list = group_samples
    class_vector = [[group_info[i]] * len(item)
                    for i, item in enumerate(sample_list)]
    class_vector = [y for x in class_vector for y in x]
    class_vector = map(str, class_vector)

    current_task.update_state(state='PROGRESS', meta={'process_percent3': 20})

    df_user_s = [df_all[s] for s in sample_list]
    df_user_s = pd.concat(df_user_s, axis=1)

    df_user_s.columns = range(len(df_user_s.columns.tolist()))
    df_user_s = df_user_s.reset_index()

    gseapy.gsea(data=df_user_s,
                gene_sets=fixed_path_gmt,
                cls=class_vector,
                outdir=plot_path,
                min_size=2,
                max_size=1000,
                weighted_score_type=1,
                permutation_type='gene_set',
                method='signal_to_noise',
                ascending=False,
                figsize=(6.5, 6),
                format='png')

    file_list.append(template_plot_path + "USER_SET.gsea.png")
    filecount += 1
    #############GSEA#############
    current_task.update_state(state='PROGRESS', meta={'process_percent3': 40})
    #############MRA#############
    mra_set_t = mra_set.T
    mra_list = list(set(mra_set.index.tolist()))
    mra_targets = [mra_set_t[x].values.tolist() for x in mra_list]
    mra_targets = [
        map(str, x[0]) if type(x[0]) == list else [str(x[0])]
        for x in mra_targets
    ]

    total_genes = len(list(set(mra_set.values.flatten())))

    pvals = [
        fet_f(mra_targets[a], input_data, total_genes)
        for a in range(len(mra_list))
    ]
    pvals_list = [[mra_list[i], item[0], item[1], item[2]]
                  for i, item in enumerate(pvals)
                  if item[2] < 0.01 and item[0] > 10]
    table_arr = pvals_list  #####Table data

    current_task.update_state(state='PROGRESS', meta={'process_percent3': 60})

    rw = open(plot_path + 'mra_candidates.tsv', "w")
    rw.write("Gene(EntrezID)\tTF_targets\tMapped_genes\tP-value\n")
    for x in table_arr:
        x = [str(y) for y in x]
        rw.write('\t'.join(x) + '\n')
    rw.close()

    network_data = pd.DataFrame(data=pvals_list,
                                columns=['TF', 'targets', 'mapped', 'pval'])
    network_data = network_data.set_index('TF')
    network_data[
        'prob_mapped'] = network_data['mapped'] / network_data['targets']
    network_data = network_data.sort_values('prob_mapped', ascending=False)
    network_data = network_data.loc[network_data.index.tolist()[:10]]

    current_task.update_state(state='PROGRESS', meta={'process_percent': 80})

    selected_network_expr = df_all[group_samples[0]].loc[
        network_data.index.tolist()[:10]]

    pt.network_plot(network_data,
                    selected_network_expr,
                    tit=group_info[0],
                    filename=plot_path + test_name + str(filecount))
    file_list.append(template_plot_path + test_name + str(filecount) + ".png")

    current_task.update_state(state='PROGRESS', meta={'process_percent3': 100})

    #############MRA#############
    return random.random()
コード例 #6
0
def correlation_c(input_data1,
                  input_data2,
                  dataset_input_data,
                  group_info,
                  group_samples,
                  plot_color,
                  session_key,
                  name1='Input 1',
                  name2='Input 2'):

    current_task.update_state(state='PROGRESS', meta={'process_percent2': 0})
    ####ALL SAMPLES INSERTION####
    group_info = ['ALL'] + group_info
    all_samples = [sample for samples in group_samples for sample in samples]
    group_samples.insert(0, all_samples)

    ####ALL SAMPLES INSERTION####

    def input_data_process(df, input_d):
        if len(input_d) > 1:
            df_data = [
                gene_set_zscore_single_thr(df[samples],
                                           input_d,
                                           sample_status="multiple")
                for samples in group_samples
            ]

        elif len(input_d) == 1:
            df_data = [
                df[samples].loc[input_d[0]].values.tolist()
                for samples in group_samples
            ]
        return df_data

    current_task.update_state(state='PROGRESS', meta={'process_percent2': 20})
    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"
    nginx_plot_path = "/home/ubuntu/django_proj/pcta_updated/main/staticimages/" + session_key + "/"

    template_plot_path = "images/" + session_key + "/"
    test_name = "correlation_test"

    if not os.path.exists(plot_path):
        os.mkdir(plot_path)
        os.mkdir(nginx_plot_path)
    else:
        files = glob.glob(plot_path + "*")
        for f in files:
            os.remove(f)
        files = glob.glob(nginx_plot_path + "*")
        for f in files:
            os.remove(f)

    filecount = 0
    file_list = []

    #####Scatter
    pt = MY_PLOT()

    if dataset_input_data == 'PCTA':
        df = all_expr_df
        df = df.drop('Symbol', axis=1)
    else:
        df = tcga_expr_df

    df_data1 = input_data_process(df, input_data1)
    current_task.update_state(state='PROGRESS', meta={'process_percent2': 40})
    df_data2 = input_data_process(df, input_data2)
    current_task.update_state(state='PROGRESS', meta={'process_percent2': 60})
    group_df = []

    for a in range(len(group_info)):
        d = {name1: df_data1[a], name2: df_data2[a]}
        group_df.append(pd.DataFrame(data=d))
        current_task.update_state(state='PROGRESS',
                                  meta={'process_percent2': 80})

    pt.shared_scatter(group_df,
                      x_dat=name1,
                      y_dat=name2,
                      title=group_info,
                      color_arr=plot_color,
                      filename=plot_path + test_name + str(filecount))
    file_list.append(template_plot_path + test_name + str(filecount) + ".png")
    filecount += 1

    os.system("cp -rf %s/* %s" % (plot_path, nginx_plot_path))
    current_task.update_state(state='PROGRESS', meta={'process_percent2': 100})

    return random.random()
コード例 #7
0
def survival_c(input_data, session_key):
    current_task.update_state(state='PROGRESS', meta={'process_percent1': 0})
    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"
    nginx_plot_path = "/home/ubuntu/django_proj/pcta_updated/main/staticimages/" + session_key + "/"

    template_plot_path = "images/" + session_key + "/"
    test_name = "bcr_analysis"

    if not os.path.exists(plot_path):
        os.mkdir(plot_path)
        os.mkdir(nginx_plot_path)
    else:
        files = glob.glob(plot_path + "*")
        for f in files:
            os.remove(f)
        files = glob.glob(nginx_plot_path + "*")
        for f in files:
            os.remove(f)

    filecount = 0
    file_list = []

    pt = MY_PLOT()

    surv_dataset = ['TCGA_PRAD', 'GSE40272', 'GSE70769']
    expr_set_arr = []
    clinical_set_arr = []

    for surv in surv_dataset:
        if surv != 'TCGA_PRAD':
            expr_set = pd.read_csv('user_data/clinical_data/' + surv + '.csv',
                                   index_col=0)
            expr_set.index = expr_set.index.astype(int)
            expr_set.index = expr_set.index.astype(str)
            clinical_set = pd.read_excel('user_data/clinical_data/' + surv +
                                         '_clinical.xlsx',
                                         index_col=0)

        else:
            expr_set = tcga_expr_df
            clinical_set = pd.read_excel('user_data/clinical_data/' + surv +
                                         '_clinical.xlsx',
                                         index_col=0)

        expr_set_arr.append(expr_set)
        clinical_set_arr.append(clinical_set)

    current_task.update_state(state='PROGRESS', meta={'process_percent1': 20})

    test_set_arr = []
    for expr in expr_set_arr:
        if len(input_data) > 1:
            df_data = gene_set_zscore_single_thr(expr,
                                                 input_data,
                                                 sample_status="multiple")
            test_set = pd.Series(data=df_data,
                                 index=expr.columns.tolist(),
                                 name='Input_Zscore')
            test_set_arr.append(test_set)
            surv_input = 'Input_Zscore'
        elif len(input_data) == 1:
            test_set = expr.loc[input_data[0]]

            symbol = pcta_id.loc[input_data[0]]['Symbol']
            test_set = test_set.rename(symbol)

            test_set_arr.append(test_set)
            #surv_input = input_data[0]
            surv_input = symbol

    current_task.update_state(state='PROGRESS', meta={'process_percent1': 40})

    dv_arr = []
    for i, test in enumerate(test_set_arr):
        test_set_up = test[test > test.median()]
        test_set_down = test[test < test.median()]
        clinical_up = clinical_set_arr[i].loc[test_set_up.index.tolist()]
        clinical_down = clinical_set_arr[i].loc[test_set_down.index.tolist()]
        clinical_arr = [clinical_up, clinical_down]

        dv_arr.append(clinical_arr)

    current_task.update_state(state='PROGRESS', meta={'process_percent1': 60})
    for dv in dv_arr:
        pt.survival_plot_and_cox(
            dv,
            label=[surv_input + '_up', surv_input + '_down'],
            filename=plot_path + test_name + str(filecount))
        file_list.append(template_plot_path + test_name + str(filecount) +
                         ".png")
        filecount += 1

    current_task.update_state(state='PROGRESS', meta={'process_percent1': 80})

    table_arr = []
    cph = CoxPHFitter()
    for i, test in enumerate(test_set_arr):
        cph_data = test
        cph_data.loc[cph_data > test_set.median()] = 1
        cph_data.loc[cph_data < test_set.median()] = 0
        cph_data = pd.concat([cph_data, clinical_set_arr[i]], axis=1)

        table_data = []
        cph.fit(cph_data[[surv_input, 'bcrstatus', 'bcrmonth']].dropna(),
                'bcrmonth',
                event_col='bcrstatus')
        table_data = table_split(cph)

        cph.fit(cph_data[['gleason', 'bcrstatus', 'bcrmonth']].dropna(),
                'bcrmonth',
                event_col='bcrstatus')
        table_data = table_data + table_split(cph)

        cph.fit(cph_data[[surv_input, 'gleason', 'bcrstatus',
                          'bcrmonth']].dropna(),
                'bcrmonth',
                event_col='bcrstatus')
        table_data = table_data + table_split(cph)

        #table_data = [['Variable','Coefficient','Hazard Ratio','P-value','C-index']]+table_data

        table_arr.append(table_data)
        rw = open(plot_path + surv_dataset[i] + '.cox.tsv', "w")
        rw.write("Variable\tCoefficient\tHR(95%Conf)\tP-value\tC-index\n")

        for x in table_data:
            rw.write('\t'.join(x) + '\n')
        rw.close()

    os.system("cp -rf %s/* %s" % (plot_path, nginx_plot_path))
    current_task.update_state(state='PROGRESS', meta={'process_percent1': 100})

    return random.random()
コード例 #8
0
def association_c(input_data, dataset_input_data, group_info, group_samples,
                  plot_color, session_key):
    current_task.update_state(state='PROGRESS', meta={'process_percent': 0})

    plot_path = "/home/ubuntu/django_proj/pcta_updated/main/static/images/" + session_key + "/"
    nginx_plot_path = "/home/ubuntu/django_proj/pcta_updated/main/staticimages/" + session_key + "/"

    template_plot_path = "images/" + session_key + "/"
    test_name = "association_test"

    if not os.path.exists(plot_path):
        os.mkdir(plot_path)
        os.mkdir(nginx_plot_path)
    else:
        files = glob.glob(plot_path + "*")
        for f in files:
            os.remove(f)
        files = glob.glob(nginx_plot_path + "*")
        for f in files:
            os.remove(f)

    filecount = 0
    file_list = []

    #####Lollipop
    pt = MY_PLOT()

    if dataset_input_data == 'PCTA':
        df = all_expr_df
        df = df.drop('Symbol', axis=1)
    else:
        df = tcga_expr_df

    current_task.update_state(state='PROGRESS', meta={'process_percent': 10})

    if len(input_data) > 1:
        df_data = [
            gene_set_zscore_single_thr(df[samples],
                                       input_data,
                                       sample_status="multiple")
            for samples in group_samples
        ]
        ylab = 'Z score'
    elif len(input_data) == 1:
        df_data = [
            df[samples].loc[input_data[0]].values.tolist()
            for samples in group_samples
        ]
        ylab = 'Expression'

    current_task.update_state(state='PROGRESS', meta={'process_percent': 20})

    pt.lollipop([sorted(d) for d in df_data],
                label_data=group_info,
                ylab=ylab,
                color_arr=plot_color,
                filename=plot_path + test_name + str(filecount))

    ###########Oneway ANNOVA with group data
    if len(df_data) == 3:
        fv, pv = stats.f_oneway(df_data[0], df_data[1], df_data[2])
    else:
        fv, pv = stats.f_oneway(df_data[0], df_data[1], df_data[2], df_data[3])

    rw = open(plot_path + 'onewayANOVA_result.tsv', "w")
    rw.write("F-value\tP-value\n")
    rw.write(str('{:.3f}'.format(fv)) + '\t' + float_format_change(pv) + '\n')
    rw.close()
    ###########Oneway ANNOVA with group data

    file_list.append(template_plot_path + test_name + str(filecount) + ".png")
    filecount += 1
    #####Lollipop
    current_task.update_state(state='PROGRESS', meta={'process_percent': 30})

    #####Violin or boxplot
    df_data_arr = [pd.DataFrame(data=d) for d in df_data]
    df_data_arr = pd.concat(df_data_arr, axis=1)
    df_data_arr.columns = group_info

    pt.violin_plt(df_data_arr,
                  ylab=ylab,
                  tit='',
                  color_arr=plot_color,
                  filename=plot_path + test_name + str(filecount))

    ###########Ranksum with group data
    rw = open(plot_path + 'ranksum_result.tsv', "w")
    rw.write("Test group\tFoldChange\tP-value\n")

    if 'GS=7' in group_info:
        #others_index = [ i for i,item in enumerate(group_info) if item!='mCRPC' ]
        #others_values = [ df_data[i] for i in others_index ]
        #others_values = list(sum(others_values, []))
        #test_values = df_data[group_info.index('mCRPC')]
        if dataset_input_data == 'PCTA':
            test_values, others_values = test_and_others_selection(
                'mCRPC',
                group_info,
                df_data,
                others_name=['GS=7', 'GS<7', 'GS>7'])
            rv, pv = stats.ranksums(test_values, others_values)
            fc = float(np.mean(test_values)) - float(np.mean(others_values))
            rw.write("mCRPC VS Primary" + '\t' + str('{:.3f}'.format(fc)) +
                     '\t' + float_format_change(pv) + '\n')

            test_values, others_values = test_and_others_selection(
                'Benign',
                group_info,
                df_data,
                others_name=['GS=7', 'GS<7', 'GS>7'])
            rv, pv = stats.ranksums(test_values, others_values)
            fc = float(np.mean(others_values)) - float(np.mean(test_values))
            rw.write("Primary VS Benign" + '\t' + str('{:.3f}'.format(fc)) +
                     '\t' + float_format_change(pv) + '\n')
            rw.close()

        else:
            test_values, others_values = test_and_others_selection(
                'GS>7', group_info, df_data, others_name=['GS=7', 'GS<7'])
            rv, pv = stats.ranksums(test_values, others_values)
            fc = float(np.mean(test_values)) - float(np.mean(others_values))
            rw.write("GS>7 VS Others" + '\t' + str('{:.3f}'.format(fc)) +
                     '\t' + float_format_change(pv) + '\n')

            test_values, others_values = test_and_others_selection(
                'GS<7', group_info, df_data, others_name=['GS=7', 'GS>7'])
            rv, pv = stats.ranksums(test_values, others_values)
            fc = float(np.mean(others_values)) - float(np.mean(test_values))
            rw.write("GS<7  VS Others" + '\t' + str('{:.3f}'.format(fc)) +
                     '\t' + float_format_change(pv) + '\n')
            rw.close()

        #ranksum_test_case = [df_data[group_info.index('mCRPC')], ]
    elif 'PCS1' in group_info:
        test_values, others_values = test_and_others_selection(
            'PCS1', group_info, df_data)
        rv, pv = stats.ranksums(test_values, others_values)
        fc = float(np.mean(test_values)) - float(np.mean(others_values))
        rw.write("PCS1 VS Others" + '\t' + str('{:.3f}'.format(fc)) + '\t' +
                 float_format_change(pv) + '\n')
        rw.close()

    elif 'LumB' in group_info:
        test_values, others_values = test_and_others_selection(
            'LumB', group_info, df_data)
        rv, pv = stats.ranksums(test_values, others_values)
        fc = float(np.mean(test_values)) - float(np.mean(others_values))
        rw.write("LumB VS Others" + '\t' + str('{:.3f}'.format(fc)) + '\t' +
                 float_format_change(pv) + '\n')
        rw.close()

    current_task.update_state(state='PROGRESS', meta={'process_percent': 40})
    ###########Ranksum with group data

    file_list.append(template_plot_path + test_name + str(filecount) + ".png")
    filecount += 1
    #####Violin or boxplot

    current_task.update_state(state='PROGRESS', meta={'process_percent': 50})

    #####Histogram
    #pt.histogram_group(df_data_arr,legend=True,colo=plot_color,filename=plot_path+test_name+str(filecount))
    pt.line_trend(df_data_arr,
                  ylab='Mean of ' + ylab,
                  legend=True,
                  colo=plot_color,
                  filename=plot_path + test_name + str(filecount))
    file_list.append(template_plot_path + test_name + str(filecount) + ".png")
    filecount += 1

    os.system("cp -rf %s/* %s" % (plot_path, nginx_plot_path))
    current_task.update_state(state='PROGRESS', meta={'process_percent': 100})
    #####Histogram

    #return file_list
    return random.random()