def TCGA_heatmap_pancancer_exist_sigGenes_private(
        target_cancer,
        sigGene_addr,
        tcga_cancer_diff_sig_gene_addr,
        tcga_heatmap_pancancer_addr,
        tcga_pancancer_cluster_addr):
    """Export the TCGA difference matrix restricted to a list of genes.

    Reads one gene symbol per line from ``sigGene_addr``, keeps the matching
    rows of the pickled TCGA difference matrix and writes them, indexed by
    gene symbol, as a tab-separated file to ``tcga_cancer_diff_sig_gene_addr``.
    Intermediate tables and the cancer-type colour mapping are printed for
    inspection only.

    ``target_cancer``, ``tcga_heatmap_pancancer_addr`` and
    ``tcga_pancancer_cluster_addr`` are accepted but not used by this
    function.
    """
    # One gene symbol per line; surrounding whitespace is stripped.
    with open(sigGene_addr) as gene_file:
        significant_genes = [line.strip() for line in gene_file]
    print(significant_genes)

    diff_matrix = dh.load_obj("/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_diff_matrix_original")
    sample_info = pd.read_csv("/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/cancer_id_info.tsv", sep='\t')

    # Discard rows without a gene symbol and the Ensembl identifier columns.
    has_symbol = diff_matrix['gene_symbol'] != 'None'
    diff_matrix = diff_matrix.loc[has_symbol].drop(columns=['ensembl_id', 'ensembl_gene'])

    is_significant = diff_matrix['gene_symbol'].isin(significant_genes)
    sig_gene_matrix = diff_matrix.loc[is_significant]

    print(sig_gene_matrix.head())
    print(sample_info.head())

    sample_info = sample_info.set_index('fullcode')
    sig_gene_matrix = sig_gene_matrix.set_index('gene_symbol')

    # Samples become rows for the join with the sample annotation, then the
    # result is flipped back so that 'Abbreviation' can be read as a row.
    annotated = pd.concat([sig_gene_matrix.T, sample_info], axis=1, join='inner').T
    print(annotated)

    cancer_type = annotated.loc['Abbreviation']
    print(cancer_type)

    # One "hls" colour per distinct cancer type (printed, not used further).
    distinct_types = cancer_type.unique()
    palette = sns.color_palette("hls", len(distinct_types))
    cancer_type_color = dict(zip(distinct_types, palette))
    print (cancer_type_color)

    print(sig_gene_matrix.info())

    sig_gene_matrix.to_csv(tcga_cancer_diff_sig_gene_addr, sep='\t')
def TCGA_heatmap_pancancer_exist_sigGenes(target_cancer, sigGene_addr):
    """Cluster TCGA samples on a gene list; save matrix, heat map and order.

    Parameters
    ----------
    target_cancer :
        Label used only to build the output directory
        ``/home/wch23/Project/LifeArc/TCGA/Result/heatmap/<target_cancer>``.
    sigGene_addr :
        Path to a text file with one gene symbol per line.

    Outputs (all inside the directory above, which is created if missing)
    ---------------------------------------------------------------------
    * ``TCGA_diff_heatmap_matrix.tsv`` - the difference matrix restricted to
      the listed genes (rows: gene symbols, columns: samples).
    * ``TCGA_diff_heatmap_single_robust_colColor_norm.png`` - seaborn
      clustermap of that matrix, columns coloured by cancer type.
    * ``TCGA_diff_heatmap_tcga_clustred_result.csv`` - the samples in
      clustered column order with their cancer-type abbreviation.
    """
    from pathlib import Path

    with open(sigGene_addr) as sigGene_f:
        significant_genes = [x.strip() for x in sigGene_f]

    print(significant_genes)

    #####
    # draw heatmap with sig genes
    #####
    tcga_cancer_diff_df = dh.load_obj("/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_diff_matrix_original")

    cancer_id_addr = "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/cancer_id_info.tsv"
    cancer_id_df = pd.read_csv(cancer_id_addr, sep='\t')

    # Discard rows without a gene symbol and the Ensembl identifier columns.
    tcga_cancer_diff_df = tcga_cancer_diff_df[tcga_cancer_diff_df['gene_symbol'] != 'None']
    tcga_cancer_diff_df = tcga_cancer_diff_df.drop(columns=['ensembl_id', 'ensembl_gene'])

    tcga_cancer_diff_sig_gene_df = tcga_cancer_diff_df[tcga_cancer_diff_df['gene_symbol'].isin(significant_genes)]

    print(tcga_cancer_diff_sig_gene_df.head())
    print(cancer_id_df.head())
    cancer_id_df = cancer_id_df.set_index('fullcode')
    tcga_cancer_diff_sig_gene_df = tcga_cancer_diff_sig_gene_df.set_index('gene_symbol')

    # Join on sample id to obtain the cancer type of every annotated sample.
    tcga_cancer_diff_sig_gene_cancer_type_df = pd.concat([tcga_cancer_diff_sig_gene_df.T, cancer_id_df], axis=1, join='inner')
    tcga_cancer_diff_sig_gene_cancer_type_df = tcga_cancer_diff_sig_gene_cancer_type_df.T
    print(tcga_cancer_diff_sig_gene_cancer_type_df)
    cancer_type = tcga_cancer_diff_sig_gene_cancer_type_df.loc['Abbreviation']
    print(cancer_type)

    # One "hls" colour per distinct cancer type, used as column colour bar.
    unique_cancer_types = cancer_type.unique()
    rgb_colors = sns.color_palette("hls", len(unique_cancer_types))
    cancer_type_color = dict(zip(unique_cancer_types, rgb_colors))

    print (cancer_type_color)
    col_colors = cancer_type.map(cancer_type_color)

    print(tcga_cancer_diff_sig_gene_df.info())

    # The per-cancer result directory may not exist yet for a new cancer type;
    # create it up front so none of the three writes below fails on it.
    Path('/home/wch23/Project/LifeArc/TCGA/Result/heatmap/{}'.format(target_cancer)).mkdir(
        parents=True, exist_ok=True)

    tcga_cancer_diff_sig_gene_df.to_csv('/home/wch23/Project/LifeArc/TCGA/Result/heatmap/{}/TCGA_diff_heatmap_matrix.tsv'.format(target_cancer),sep='\t')

    g = sns.clustermap(tcga_cancer_diff_sig_gene_df, metric="correlation", cmap="RdBu_r", robust=True, method="average",z_score=0,
                       col_colors=col_colors,xticklabels=False)  # Average is best

    g.savefig('/home/wch23/Project/LifeArc/TCGA/Result/heatmap/{}/TCGA_diff_heatmap_single_robust_colColor_norm.png'.format(target_cancer))

    print(g.dendrogram_col.reordered_ind)
    print(g.dendrogram_row.reordered_ind)

    clustered_col = g.dendrogram_col.reordered_ind

    # reordered_ind holds positions in the columns of the matrix that was
    # clustered, so the sample ids must be taken from that same matrix.
    # Looking them up in the inner-joined table instead would mislabel samples
    # whenever a sample has no entry in the annotation file.
    clustered_samples = tcga_cancer_diff_sig_gene_df.columns[clustered_col]
    clustered_cancer_type_df = pd.DataFrame(
        {
            'fullcode': clustered_samples.values,
            # Samples without annotation get NaN rather than a wrong label.
            'Abbreviation': cancer_type.reindex(clustered_samples).values,
        },
        index=clustered_col,
        columns=['fullcode', 'Abbreviation'],
    )
    print(clustered_cancer_type_df)

    clustered_cancer_type_df.to_csv('/home/wch23/Project/LifeArc/TCGA/Result/heatmap/{}/TCGA_diff_heatmap_tcga_clustred_result.csv'.format(target_cancer))
def main_for_nonType():
    """Draw one pan-cancer box plot per gene of a hard-coded gene list.

    For every gene symbol in the list, the gene's row of the TCGA difference
    matrix is joined with the sample annotation, a box plot of the values
    grouped by cancer-type abbreviation is drawn, saved as a PDF below
    ``/home/wch23/Project/LifeArc/TCGA/Result/boxplot/All`` (created if
    missing) and shown.
    """
    from pathlib import Path

    diff_matrix = dh.load_obj(
        "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_diff_matrix_original"
    )

    # The list of cancer types is read here but not used further below.
    with open(
            '/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_types_has_normal.tsv'
    ) as cancer_type_f:
        tcga_cancer_types = [line.strip() for line in cancer_type_f]

    sample_info = pd.read_csv(
        "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/cancer_id_info.tsv", sep='\t')

    # Discard rows without a gene symbol and the Ensembl identifier columns.
    has_symbol = diff_matrix['gene_symbol'] != 'None'
    diff_matrix = diff_matrix.loc[has_symbol].drop(
        columns=['ensembl_id', 'ensembl_gene'])

    gene_symbol_list = ["ERBB2"]
    cancer_type = 'All'
    sample_info = sample_info.set_index('fullcode')

    for gene_symbol in gene_symbol_list:
        # Rows of this gene, flipped so that samples are the rows.
        gene_rows = diff_matrix.loc[diff_matrix['gene_symbol'] == gene_symbol]
        per_sample = gene_rows.set_index('gene_symbol').T

        print("Gene:", gene_symbol)
        print(per_sample.head())

        # Keep only samples present in both the matrix and the annotation.
        plot_df = pd.concat([per_sample, sample_info], join='inner', axis=1)
        print(plot_df)

        plot_df['Abbreviation'] = plot_df['Abbreviation'].astype('category')
        # Alphabetical order of the distinct cancer types for the x axis.
        cancer_types = sorted(set(plot_df['Abbreviation'].tolist()))

        plt.figure(figsize=(10, 5))
        ax = sns.boxplot(x='Abbreviation',
                         y=gene_symbol,
                         data=plot_df,
                         order=cancer_types,
                         color='w',
                         linewidth=1.5,
                         showfliers=False)
        ax.set_title(gene_symbol)
        ax.set_xlabel('TCGA')
        ax.set_ylabel('Tumour vs Normal')
        ax.xaxis.grid(True)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

        # Black outlines on white boxes.
        plt.setp(ax.artists, edgecolor='k', facecolor='w')
        plt.setp(ax.lines, color='k')

        file_dir = '/home/wch23/Project/LifeArc/TCGA/Result/boxplot/{}'.format(
            cancer_type)
        Path(file_dir).mkdir(parents=True, exist_ok=True)

        figure = ax.get_figure()
        figure.savefig(
            '/home/wch23/Project/LifeArc/TCGA/Result/boxplot/{}/TCGA_boxplot_{}_original_new_v2.pdf'
            .format(cancer_type, gene_symbol))

        plt.show()
def get_sig_genes_by_anova(target_cancers, p_value_th):
    """Run the target-vs-other ANOVA workflow and save the significant genes.

    Samples whose cancer-type abbreviation is listed in ``target_cancers``
    are labelled 'Case', all others 'Ctrl'. The labelled sample table and the
    TCGA difference matrix are passed to
    ``get_selected_genes_using_oneway_ANOVA`` (defined elsewhere), and the
    results are written below ``/home/wch23/Project/LifeArc/TCGA/Result``.

    Parameters:
        target_cancers: collection of cancer-type abbreviations, e.g.
            ``["LUSC"]``. It is joined with '_' to build file names and used
            for membership tests, so a bare string would be joined character
            by character and matched by substring; pass a list.
        p_value_th: p-value threshold. It is forwarded to the ANOVA helper,
            used as an inclusive (``<=``) cut-off on the 'pvalue' column and
            embedded in the output file names.

    Returns:
        None. All results are written to files.
    """
    tcga_cancer_diff_df = dh.load_obj(
        "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_diff_matrix_original"
    )
    # NOTE(review): tcga_cancer_types is read here but not used afterwards in
    # this function.
    with open(
            '/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_types_has_normal.tsv'
    ) as cancer_type_f:
        tcga_cancer_types = [x.strip() for x in cancer_type_f.readlines()]
    cancer_id_addr = "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/cancer_id_info.tsv"
    cancer_id_df = pd.read_csv(cancer_id_addr, sep='\t')
    # print(cancer_id_df)
    # Example: target_cancer_list = ["PAAD"]  # pancreas, ITGB5
    target_cancer_list = target_cancers
    # Example: target_cancer_list = ["LUSC"]  # Lung squamous cell carcinoma, Sox2
    # Joined label used in every output file name below.
    target_cancer = '_'.join(target_cancer_list)
    # Label each sample row by whether its cancer type is one of the targets.
    cancer_id_df['Case_Ctrl'] = [
        'Case' if x in target_cancer_list else 'Ctrl'
        for x in cancer_id_df['Abbreviation']
    ]
    # print(cancer_id_df)
    #
    # Discard rows without a gene symbol and the Ensembl identifier columns.
    tcga_cancer_diff_df = tcga_cancer_diff_df[
        tcga_cancer_diff_df['gene_symbol'] != 'None']
    tcga_cancer_diff_df = tcga_cancer_diff_df.drop(
        columns=['ensembl_id', 'ensembl_gene'])

    # # # print(tcga_cancer_diff_df.head())
    # #
    gene_value_mode = 'Diff'
    # p_value_threshold = 1.0e-40  # -350 = significant gene(0) , -320 = significant(8300)
    p_value_threshold = p_value_th

    # NOTE(review): base_mean_threshold is assigned but not used in this
    # function.
    base_mean_threshold = 0
    anova_result_addr = "/home/wch23/Project/LifeArc/TCGA/Result/TCGA_GTEx_original_{}_vs_other_{}_anova_result.csv".format(
        target_cancer, gene_value_mode)
    # Helper defined elsewhere. Presumably it writes the full ANOVA table to
    # anova_result_addr and returns (gene, pvalue) pairs - TODO confirm; the
    # code below relies on both.
    selected_genes_l = get_selected_genes_using_oneway_ANOVA(
        tcga_cancer_diff_df, p_value_threshold, cancer_id_df, target_cancer,
        gene_value_mode, anova_result_addr)
    selected_genes_addr = "/home/wch23/Project/LifeArc/TCGA/Result/significant_genes_original_{}_from_{}_anova_{}.csv".format(
        gene_value_mode, target_cancer, str(p_value_threshold))

    # Save the helper's selection as a two-column CSV (gene, pvalue).
    sig_genes_df = pd.DataFrame(selected_genes_l, columns=["gene", 'pvalue'])
    sig_genes_df.to_csv(selected_genes_addr, index=False)

    tcga_id_mapping_addr = "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_gtex_id_mapping.csv"
    anova_result_with_ensembl_addr = "/home/wch23/Project/LifeArc/TCGA/Result/TCGA_GTEx_original_{}_vs_other_{}_anova_result_with_ensembl.csv".format(
        target_cancer, gene_value_mode)
    # Helper defined elsewhere. Judging by its name and arguments it adds
    # symbol/Ensembl identifiers to the ANOVA table and writes the result to
    # anova_result_with_ensembl_addr - TODO confirm.
    make_anova_result_with_symbol_ensembl(anova_result_addr,
                                          tcga_id_mapping_addr,
                                          anova_result_with_ensembl_addr)

    ################
    # save significant genes
    ################
    significant_genes_addr = "/home/wch23/Project/LifeArc/TCGA/Result/significant_genes_only_original_{}_from_{}_anova_{}.csv".format(
        gene_value_mode, target_cancer, str(p_value_threshold))
    # Re-read the ANOVA table from disk and keep rows at or below the
    # threshold. Note that sig_genes_df is rebound here to this filtered table.
    anova_result_df = pd.read_csv(anova_result_addr)
    sig_genes_df = anova_result_df[
        anova_result_df['pvalue'] <= p_value_threshold]
    # sig_genes = sig_genes_df['gene'].tolist()
    #
    # with open(significant_genes_addr, 'w') as sig_genes_f:
    #     sig_genes_f.write('\n'.join(sig_genes))
    # The file is tab-separated even though its name ends in ".csv".
    sig_genes_df.to_csv(significant_genes_addr, sep='\t')
def main_for_combine_genes(gene_symbol_list=("SOX2", "STAT1"),
                           cancer_type='LUSC'):
    """Box-plot the per-sample mean of several genes across cancer types.

    The rows of the listed genes are taken from the TCGA difference matrix,
    averaged per sample into one combined value, joined with the sample
    annotation and drawn as a box plot grouped by cancer-type abbreviation.
    The figure is saved as a PDF and then shown.

    Parameters:
        gene_symbol_list: gene symbols to combine; a single string is treated
            as a one-gene list. The symbols joined with '_' name the combined
            column, the plot title and the output file.
        cancer_type: label used only for the output directory
            ``/home/wch23/Project/LifeArc/TCGA/Result/boxplot/<cancer_type>``,
            which is created if missing.

    The defaults reproduce the values that used to be hard-coded, so calling
    the function without arguments behaves as before.

    Raises:
        ValueError: if ``gene_symbol_list`` is empty.
        KeyError: if a requested gene is not present in the matrix.
    """
    from pathlib import Path

    # Validate before any expensive I/O.
    if isinstance(gene_symbol_list, str):
        gene_symbol_list = [gene_symbol_list]
    gene_symbol_list = list(gene_symbol_list)
    if not gene_symbol_list:
        raise ValueError("gene_symbol_list must contain at least one gene symbol")
    gene_symbol = '_'.join(gene_symbol_list)

    tcga_cancer_diff_df = dh.load_obj(
        "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_diff_matrix_original"
    )
    cancer_id_addr = "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/cancer_id_info.tsv"
    cancer_id_df = pd.read_csv(cancer_id_addr, sep='\t')
    cancer_id_df = cancer_id_df.set_index('fullcode')

    # Discard rows without a gene symbol and the Ensembl identifier columns.
    tcga_cancer_diff_df = tcga_cancer_diff_df[
        tcga_cancer_diff_df['gene_symbol'] != 'None']
    tcga_cancer_diff_df = tcga_cancer_diff_df.drop(
        columns=['ensembl_id', 'ensembl_gene'])

    # Rows of the requested genes, flipped so that samples are the rows.
    selected_genes_df = tcga_cancer_diff_df.loc[
        tcga_cancer_diff_df['gene_symbol'].isin(gene_symbol_list)]
    selected_genes_df = selected_genes_df.set_index('gene_symbol').T

    # Fail loudly instead of silently averaging over fewer genes.
    missing_genes = [
        gene for gene in gene_symbol_list
        if gene not in selected_genes_df.columns
    ]
    if missing_genes:
        raise KeyError(
            "gene(s) not found in TCGA matrix: {}".format(
                ', '.join(missing_genes)))

    # Build the combined column in a fresh frame. Adding it to the gene frame
    # and then dropping the gene columns would also drop the combined column
    # when the list holds a single gene (same column name).
    aGene_TCGA_df = selected_genes_df.mean(axis=1).to_frame(name=gene_symbol)

    print("Gene:", gene_symbol)
    print(aGene_TCGA_df.head())

    # Keep only samples present in both the matrix and the annotation.
    aGene_TCGA_cancer_id_df = pd.concat([aGene_TCGA_df, cancer_id_df],
                                        join='inner',
                                        axis=1)
    print(aGene_TCGA_cancer_id_df)

    aGene_TCGA_cancer_id_df['Abbreviation'] = aGene_TCGA_cancer_id_df[
        'Abbreviation'].astype('category')
    # Alphabetical order of the distinct cancer types for the x axis.
    cancer_types = sorted(set(aGene_TCGA_cancer_id_df['Abbreviation'].tolist()))

    plt.figure(figsize=(10, 5))
    ax = sns.boxplot(
        x='Abbreviation',
        y=gene_symbol,
        data=aGene_TCGA_cancer_id_df,
        order=cancer_types,
        color='w',
        linewidth=1.5,
        showfliers=False)
    ax.set_title(gene_symbol)
    ax.set_xlabel('TCGA')
    ax.set_ylabel('Tumour vs Normal')
    ax.xaxis.grid(True)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

    # Black outlines on white boxes.
    plt.setp(ax.artists, edgecolor='k', facecolor='w')
    plt.setp(ax.lines, color='k')

    # The per-cancer result directory may not exist yet; create it so that
    # savefig does not fail for a new cancer type.
    file_dir = '/home/wch23/Project/LifeArc/TCGA/Result/boxplot/{}'.format(
        cancer_type)
    Path(file_dir).mkdir(parents=True, exist_ok=True)

    ax.get_figure().savefig(
        '/home/wch23/Project/LifeArc/TCGA/Result/boxplot/{}/TCGA_boxplot_{}_original_new_v2.pdf'
        .format(cancer_type, gene_symbol))

    plt.show()