def TCGA_heatmap_pancancer_exist_sigGenes_private(target_cancer, sigGene_addr,
                                                  tcga_cancer_diff_sig_gene_addr,
                                                  tcga_heatmap_pancancer_addr,
                                                  tcga_pancancer_cluster_addr):
    """Write the TCGA matrix restricted to a list of significant genes.

    The gene list is read from ``sigGene_addr`` (one gene symbol per line,
    blank lines ignored). The pickled TCGA matrix is loaded, rows whose
    ``gene_symbol`` is the string ``'None'`` are dropped, and the rows whose
    symbol is in the list are written, tab-separated and indexed by gene
    symbol, to ``tcga_cancer_diff_sig_gene_addr``.

    Args:
        target_cancer: Not used by this function.
        sigGene_addr: Path of the text file holding the gene symbols.
        tcga_cancer_diff_sig_gene_addr: Output path of the filtered matrix.
        tcga_heatmap_pancancer_addr: Not used by this function.
        tcga_pancancer_cluster_addr: Not used by this function.

    NOTE(review): ``target_cancer``, ``tcga_heatmap_pancancer_addr`` and
    ``tcga_pancancer_cluster_addr`` are accepted but never read; presumably
    the heatmap / clustering step was meant to follow -- confirm with callers.
    """
    from pathlib import Path

    with open(sigGene_addr) as sigGene_f:
        # Skip blank lines so that '' is never treated as a gene symbol.
        significant_genes = [line.strip() for line in sigGene_f if line.strip()]
    print(significant_genes)

    tcga_cancer_diff_df = dh.load_obj(
        "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_diff_matrix_original")
    cancer_id_addr = "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/cancer_id_info.tsv"
    cancer_id_df = pd.read_csv(cancer_id_addr, sep='\t')

    # Keep only rows with a real gene symbol, then drop the Ensembl columns.
    tcga_cancer_diff_df = tcga_cancer_diff_df[
        tcga_cancer_diff_df['gene_symbol'] != 'None']
    tcga_cancer_diff_df = tcga_cancer_diff_df.drop(
        columns=['ensembl_id', 'ensembl_gene'])

    tcga_cancer_diff_sig_gene_df = tcga_cancer_diff_df[
        tcga_cancer_diff_df['gene_symbol'].isin(significant_genes)]
    print(tcga_cancer_diff_sig_gene_df.head())
    print(cancer_id_df.head())

    cancer_id_df = cancer_id_df.set_index('fullcode')
    tcga_cancer_diff_sig_gene_df = tcga_cancer_diff_sig_gene_df.set_index(
        'gene_symbol')

    # Diagnostic output only: matrix columns joined with the cancer
    # annotation rows that share the same 'fullcode'.
    tcga_cancer_diff_sig_gene_cancer_type_df = pd.concat(
        [tcga_cancer_diff_sig_gene_df.T, cancer_id_df], axis=1, join='inner').T
    print(tcga_cancer_diff_sig_gene_cancer_type_df)
    print(tcga_cancer_diff_sig_gene_cancer_type_df.loc['Abbreviation'])

    # Make sure the destination directory exists before writing.
    Path(tcga_cancer_diff_sig_gene_addr).parent.mkdir(parents=True,
                                                      exist_ok=True)
    tcga_cancer_diff_sig_gene_df.to_csv(tcga_cancer_diff_sig_gene_addr,
                                        sep='\t')
def TCGA_heatmap_pancancer_exist_sigGenes(target_cancer, sigGene_addr):
    """Cluster the TCGA matrix of a gene list and save heatmap plus column order.

    The gene list is read from ``sigGene_addr`` (one gene symbol per line,
    blank lines ignored). The rows of the pickled TCGA matrix whose
    ``gene_symbol`` is in the list are clustered with ``sns.clustermap``
    (correlation metric, average linkage, row-wise z-score). Three files are
    written under
    ``/home/wch23/Project/LifeArc/TCGA/Result/heatmap/<target_cancer>/``:

    * ``TCGA_diff_heatmap_matrix.tsv`` -- the filtered matrix,
    * ``TCGA_diff_heatmap_single_robust_colColor_norm.png`` -- the heatmap,
    * ``TCGA_diff_heatmap_tcga_clustred_result.csv`` -- columns in clustered
      order together with their cancer-type abbreviation.

    Args:
        target_cancer: Name used for the output sub-directory.
        sigGene_addr: Path of the text file holding the gene symbols.
    """
    from pathlib import Path

    with open(sigGene_addr) as sigGene_f:
        # Skip blank lines so that '' is never treated as a gene symbol.
        significant_genes = [line.strip() for line in sigGene_f if line.strip()]
    print(significant_genes)

    #####
    # draw heatmap with sig genes
    #####
    tcga_cancer_diff_df = dh.load_obj(
        "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_diff_matrix_original")
    cancer_id_addr = "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/cancer_id_info.tsv"
    cancer_id_df = pd.read_csv(cancer_id_addr, sep='\t')

    # Keep only rows with a real gene symbol, then drop the Ensembl columns.
    tcga_cancer_diff_df = tcga_cancer_diff_df[
        tcga_cancer_diff_df['gene_symbol'] != 'None']
    tcga_cancer_diff_df = tcga_cancer_diff_df.drop(
        columns=['ensembl_id', 'ensembl_gene'])

    tcga_cancer_diff_sig_gene_df = tcga_cancer_diff_df[
        tcga_cancer_diff_df['gene_symbol'].isin(significant_genes)]
    print(tcga_cancer_diff_sig_gene_df.head())
    print(cancer_id_df.head())

    cancer_id_df = cancer_id_df.set_index('fullcode')
    tcga_cancer_diff_sig_gene_df = tcga_cancer_diff_sig_gene_df.set_index(
        'gene_symbol')

    # Attach the annotation to every matrix column that has a matching
    # 'fullcode'; the 'Abbreviation' row is the cancer type of each column.
    tcga_cancer_diff_sig_gene_cancer_type_df = pd.concat(
        [tcga_cancer_diff_sig_gene_df.T, cancer_id_df], axis=1, join='inner').T
    print(tcga_cancer_diff_sig_gene_cancer_type_df)

    cancer_type = tcga_cancer_diff_sig_gene_cancer_type_df.loc['Abbreviation']
    print(cancer_type)

    # One colour per cancer type for the column colour bar.
    unique_cancer_types = cancer_type.unique()
    rgb_colors = sns.color_palette("hls", len(unique_cancer_types))
    cancer_type_color = dict(zip(unique_cancer_types, rgb_colors))
    print(cancer_type_color)
    col_colors = cancer_type.map(cancer_type_color)

    # Create the output directory up front so that none of the writes below
    # fails on a cancer type that has not been processed before.
    Path('/home/wch23/Project/LifeArc/TCGA/Result/heatmap/{}'.format(
        target_cancer)).mkdir(parents=True, exist_ok=True)

    tcga_cancer_diff_sig_gene_df.to_csv(
        '/home/wch23/Project/LifeArc/TCGA/Result/heatmap/{}/TCGA_diff_heatmap_matrix.tsv'.format(target_cancer),
        sep='\t')

    g = sns.clustermap(tcga_cancer_diff_sig_gene_df, metric="correlation",
                       cmap="RdBu_r", robust=True, method="average",
                       z_score=0, col_colors=col_colors,
                       xticklabels=False)  # Average is best
    g.savefig(
        '/home/wch23/Project/LifeArc/TCGA/Result/heatmap/{}/TCGA_diff_heatmap_single_robust_colColor_norm.png'.format(target_cancer))

    print(g.dendrogram_col.reordered_ind)
    print(g.dendrogram_row.reordered_ind)
    clustred_col = g.dendrogram_col.reordered_ind

    # reordered_ind holds positions within the columns of the clustered
    # matrix, so the lookup table must have exactly one row per matrix
    # column, in matrix order. Building it from the inner join instead would
    # shift every row after a column that has no annotation.
    cancer_type_df = (cancer_type
                      .reindex(tcga_cancer_diff_sig_gene_df.columns)
                      .rename_axis('fullcode')
                      .reset_index())
    print(cancer_type_df)

    clusterd_cancer_type_df = cancer_type_df.reindex(clustred_col)
    print(clusterd_cancer_type_df)
    clusterd_cancer_type_df.to_csv(
        '/home/wch23/Project/LifeArc/TCGA/Result/heatmap/{}/TCGA_diff_heatmap_tcga_clustred_result.csv'.format(target_cancer))
def main_for_nonType():
    """Draw one box plot per gene of the TCGA matrix values grouped by cancer type.

    For every symbol in the hard-coded ``gene_symbol_list`` the matching row
    of the pickled TCGA matrix is joined with the cancer annotation table on
    ``fullcode`` and plotted as a box plot with one box per ``Abbreviation``.
    Each figure is saved as a PDF under
    ``/home/wch23/Project/LifeArc/TCGA/Result/boxplot/All/`` and then shown.
    Genes that are not present in the matrix are reported and skipped.
    """
    from pathlib import Path

    tcga_cancer_diff_df = dh.load_obj(
        "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_diff_matrix_original"
    )
    cancer_id_addr = "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/cancer_id_info.tsv"
    cancer_id_df = pd.read_csv(cancer_id_addr, sep='\t')

    # Keep only rows with a real gene symbol, then drop the Ensembl columns.
    tcga_cancer_diff_df = tcga_cancer_diff_df[
        tcga_cancer_diff_df['gene_symbol'] != 'None']
    tcga_cancer_diff_df = tcga_cancer_diff_df.drop(
        columns=['ensembl_id', 'ensembl_gene'])

    gene_symbol_list = ["ERBB2"]
    cancer_type = 'All'
    cancer_id_df = cancer_id_df.set_index('fullcode')

    # The output directory does not depend on the gene: create it once.
    file_dir = '/home/wch23/Project/LifeArc/TCGA/Result/boxplot/{}'.format(
        cancer_type)
    Path(file_dir).mkdir(parents=True, exist_ok=True)

    for gene_symbol in gene_symbol_list:
        aGene_TCGA_df = tcga_cancer_diff_df.loc[
            tcga_cancer_diff_df['gene_symbol'] == gene_symbol]
        if aGene_TCGA_df.empty:
            # Without a row there is no column named after the gene to plot.
            print("Gene not found in TCGA matrix, skipped:", gene_symbol)
            continue

        # One row per matrix column, one column named after the gene.
        aGene_TCGA_df = aGene_TCGA_df.set_index('gene_symbol').T
        print("Gene:", gene_symbol)
        print(aGene_TCGA_df.head())

        aGene_TCGA_cancer_id_df = pd.concat([aGene_TCGA_df, cancer_id_df],
                                            join='inner',
                                            axis=1)
        print(aGene_TCGA_cancer_id_df)

        aGene_TCGA_cancer_id_df['Abbreviation'] = aGene_TCGA_cancer_id_df[
            'Abbreviation'].astype('category')
        # Alphabetical order of the cancer types along the x axis.
        cancer_types = sorted(set(aGene_TCGA_cancer_id_df['Abbreviation']))

        ## for other type boxplot
        fig = plt.figure(figsize=(10, 5))
        ax = sns.boxplot(
            x='Abbreviation',
            y=gene_symbol,
            data=aGene_TCGA_cancer_id_df,
            order=cancer_types,
            color='w',
            linewidth=1.5,
            showfliers=False)
        ax.set_title(gene_symbol)
        ax.set_xlabel('TCGA')
        ax.set_ylabel('Tumour vs Normal')
        ax.xaxis.grid(True)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

        # Black outlines on white boxes.
        plt.setp(ax.artists, edgecolor='k', facecolor='w')
        plt.setp(ax.lines, color='k')

        ax.get_figure().savefig(
            '/home/wch23/Project/LifeArc/TCGA/Result/boxplot/{}/TCGA_boxplot_{}_original_new_v2.pdf'
            .format(cancer_type, gene_symbol))
        plt.show()
        # Release the figure so that long gene lists do not pile up figures.
        plt.close(fig)
def get_sig_genes_by_anova(target_cancers, p_value_th):
    """Run the one-way ANOVA gene selection for the target cancers and save the results.

    Rows of the cancer annotation table are labelled ``'Case'`` when their
    ``Abbreviation`` is one of ``target_cancers`` and ``'Ctrl'`` otherwise.
    The labelled table and the TCGA matrix are handed to
    ``get_selected_genes_using_oneway_ANOVA``. The following files are
    written under ``/home/wch23/Project/LifeArc/TCGA/Result/``:

    * ``significant_genes_original_Diff_from_<cancers>_anova_<th>.csv`` --
      the ``gene``/``pvalue`` pairs returned by the selection function,
    * ``TCGA_GTEx_original_<cancers>_vs_other_Diff_anova_result_with_ensembl.csv``
      -- via ``make_anova_result_with_symbol_ensembl``,
    * ``significant_genes_only_original_Diff_from_<cancers>_anova_<th>.csv``
      -- rows of the ANOVA result with ``pvalue <= p_value_th``. This file is
      tab-separated despite its ``.csv`` extension.

    Args:
        target_cancers: Cancer-type abbreviation, or an iterable of them.
            A single string is treated as one cancer type.
        p_value_th: P-value threshold used for the selection and the filter.

    NOTE(review): the ANOVA result file at ``anova_result_addr`` is read back
    below, so it is presumably written by
    ``get_selected_genes_using_oneway_ANOVA`` -- confirm.
    """
    # A bare string would otherwise be joined character by character
    # ("LUSC" -> "L_U_S_C") and matched by substring in the Case/Ctrl test.
    if isinstance(target_cancers, str):
        target_cancer_list = [target_cancers]
    else:
        target_cancer_list = list(target_cancers)
    target_cancer = '_'.join(target_cancer_list)

    tcga_cancer_diff_df = dh.load_obj(
        "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_diff_matrix_original"
    )
    cancer_id_addr = "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/cancer_id_info.tsv"
    cancer_id_df = pd.read_csv(cancer_id_addr, sep='\t')

    cancer_id_df['Case_Ctrl'] = cancer_id_df['Abbreviation'].isin(
        target_cancer_list).map({True: 'Case', False: 'Ctrl'})

    # Keep only rows with a real gene symbol, then drop the Ensembl columns.
    tcga_cancer_diff_df = tcga_cancer_diff_df[
        tcga_cancer_diff_df['gene_symbol'] != 'None']
    tcga_cancer_diff_df = tcga_cancer_diff_df.drop(
        columns=['ensembl_id', 'ensembl_gene'])

    gene_value_mode = 'Diff'
    p_value_threshold = p_value_th

    anova_result_addr = "/home/wch23/Project/LifeArc/TCGA/Result/TCGA_GTEx_original_{}_vs_other_{}_anova_result.csv".format(
        target_cancer, gene_value_mode)

    selected_genes_l = get_selected_genes_using_oneway_ANOVA(
        tcga_cancer_diff_df, p_value_threshold, cancer_id_df, target_cancer,
        gene_value_mode, anova_result_addr)

    selected_genes_addr = "/home/wch23/Project/LifeArc/TCGA/Result/significant_genes_original_{}_from_{}_anova_{}.csv".format(
        gene_value_mode, target_cancer, str(p_value_threshold))
    selected_genes_df = pd.DataFrame(selected_genes_l,
                                     columns=["gene", 'pvalue'])
    selected_genes_df.to_csv(selected_genes_addr, index=False)

    tcga_id_mapping_addr = "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_gtex_id_mapping.csv"
    anova_result_with_ensembl_addr = "/home/wch23/Project/LifeArc/TCGA/Result/TCGA_GTEx_original_{}_vs_other_{}_anova_result_with_ensembl.csv".format(
        target_cancer, gene_value_mode)
    make_anova_result_with_symbol_ensembl(anova_result_addr,
                                          tcga_id_mapping_addr,
                                          anova_result_with_ensembl_addr)

    ################
    # save significant genes
    ################
    significant_genes_addr = "/home/wch23/Project/LifeArc/TCGA/Result/significant_genes_only_original_{}_from_{}_anova_{}.csv".format(
        gene_value_mode, target_cancer, str(p_value_threshold))

    anova_result_df = pd.read_csv(anova_result_addr)
    sig_genes_df = anova_result_df[
        anova_result_df['pvalue'] <= p_value_threshold]
    sig_genes_df.to_csv(significant_genes_addr, sep='\t')
def main_for_combine_genes():
    """Box-plot the per-column mean of several genes of the TCGA matrix by cancer type.

    The rows of the pickled TCGA matrix whose ``gene_symbol`` is in the
    hard-coded ``gene_symbol_list`` are averaged column by column. The
    averages are joined with the cancer annotation table on ``fullcode`` and
    drawn as a box plot with one box per ``Abbreviation``. The figure is
    saved as a PDF under
    ``/home/wch23/Project/LifeArc/TCGA/Result/boxplot/LUSC/`` and then shown.

    Raises:
        ValueError: If none of the listed genes is present in the matrix.
    """
    from pathlib import Path

    tcga_cancer_diff_df = dh.load_obj(
        "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/tcga_cancer_diff_matrix_original"
    )
    cancer_id_addr = "/mnt/raid0_data/MTI_DATA/TCGA/GepiaTCGA/cancer_id_info.tsv"
    cancer_id_df = pd.read_csv(cancer_id_addr, sep='\t')

    # Keep only rows with a real gene symbol, then drop the Ensembl columns.
    tcga_cancer_diff_df = tcga_cancer_diff_df[
        tcga_cancer_diff_df['gene_symbol'] != 'None']
    tcga_cancer_diff_df = tcga_cancer_diff_df.drop(
        columns=['ensembl_id', 'ensembl_gene'])

    gene_symbol_list = ["SOX2", "STAT1"]
    # The joined symbols name the combined column, the plot and the file.
    gene_symbol = '_'.join(gene_symbol_list)
    cancer_type = 'LUSC'
    cancer_id_df = cancer_id_df.set_index('fullcode')

    gene_rows_df = tcga_cancer_diff_df.loc[
        tcga_cancer_diff_df['gene_symbol'].isin(gene_symbol_list)]
    gene_rows_df = gene_rows_df.set_index('gene_symbol')

    missing_genes = [g for g in gene_symbol_list if g not in gene_rows_df.index]
    if missing_genes:
        print("Genes not found in TCGA matrix:", missing_genes)
    if gene_rows_df.empty:
        raise ValueError(
            "None of the requested genes is present in the TCGA matrix: {}".
            format(gene_symbol_list))

    # Build the mean as its own single-column frame. Assigning it into the
    # gene frame and dropping the inputs afterwards deletes the result when
    # the list holds one gene (the combined name equals the gene name) and
    # raises KeyError when a listed gene is missing.
    aGene_TCGA_df = gene_rows_df.T.mean(axis=1).to_frame(gene_symbol)
    print("Gene:", gene_symbol)
    print(aGene_TCGA_df.head())

    aGene_TCGA_cancer_id_df = pd.concat([aGene_TCGA_df, cancer_id_df],
                                        join='inner',
                                        axis=1)
    print(aGene_TCGA_cancer_id_df)

    aGene_TCGA_cancer_id_df['Abbreviation'] = aGene_TCGA_cancer_id_df[
        'Abbreviation'].astype('category')
    # Alphabetical order of the cancer types along the x axis.
    cancer_types = sorted(set(aGene_TCGA_cancer_id_df['Abbreviation']))

    ## for other type boxplot
    plt.figure(figsize=(10, 5))
    ax = sns.boxplot(
        x='Abbreviation',
        y=gene_symbol,
        data=aGene_TCGA_cancer_id_df,
        order=cancer_types,
        color='w',
        linewidth=1.5,
        showfliers=False)
    ax.set_title(gene_symbol)
    ax.set_xlabel('TCGA')
    ax.set_ylabel('Tumour vs Normal')
    ax.xaxis.grid(True)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

    # Black outlines on white boxes.
    plt.setp(ax.artists, edgecolor='k', facecolor='w')
    plt.setp(ax.lines, color='k')

    # Create the output directory so that saving does not fail on a cancer
    # type that has not been plotted before.
    Path('/home/wch23/Project/LifeArc/TCGA/Result/boxplot/{}'.format(
        cancer_type)).mkdir(parents=True, exist_ok=True)
    ax.get_figure().savefig(
        '/home/wch23/Project/LifeArc/TCGA/Result/boxplot/{}/TCGA_boxplot_{}_original_new_v2.pdf'
        .format(cancer_type, gene_symbol))
    plt.show()