def generate_subsampled_datasets():
  '''
  This will generate subsampled tsvs from the MNIST dataset
  '''
  from clustergrammer import Network
  net = Network()

  # load full MNIST data with row labels
  net.load_file('processed_MNIST/large_files/MNIST_row_labels.txt')

  tmp_df = net.dat_to_df()
  df = tmp_df['mat']

  all_sample_nums = [20, 100, 200, 300, 400, 500, 1000]
  sample_repeats = 5

  for sample_num in all_sample_nums:
    df_subs = take_multiple_subsamples(df, sample_num, sample_repeats)

    for inst_subsample in df_subs:
      inst_df = df_subs[inst_subsample]
      inst_df = add_MNIST_cats(inst_df, row_cats=False)

      inst_filename = 'processed_MNIST/random_subsampling/MNIST_' + \
          str(sample_num) + 'x_random_subsample_' + str(inst_subsample) + '.txt'

      print(inst_df.shape)
      inst_df.to_csv(inst_filename, sep='\t')
def make_plex_matrix():
  '''
  Make a cell line matrix with plex rows and cell line columns.
  This will be used as a negative control that should show worsening
  correlation as data is normalized/filtered.
  '''
  import numpy as np
  import pandas as pd
  from clustergrammer import Network

  # load cl_info
  net = Network()
  cl_info = net.load_json_to_dict('../cell_line_info/cell_line_info_dict.json')

  # load cell line expression
  net.load_file('../CCLE_gene_expression/CCLE_NSCLC_all_genes.txt')
  tmp_df = net.dat_to_df()
  df = tmp_df['mat']

  cols = df.columns.tolist()
  rows = range(9)
  rows = [i + 1 for i in rows]
  print(rows)

  mat = np.zeros((len(rows), len(cols)))

  for inst_col in cols:
    for inst_cl in cl_info:
      if inst_col in inst_cl:
        inst_plex = int(cl_info[inst_cl]['Plex'])

        if inst_plex != -1:
          # print(inst_col + ' in ' + inst_cl + ': ' + str(inst_plex))
          row_index = rows.index(inst_plex)
          col_index = cols.index(inst_col)
          mat[row_index, col_index] = 1

  df_plex = pd.DataFrame(data=mat, columns=cols, index=rows)

  filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
      'exp-plex.txt'
  df_plex.to_csv(filename, sep='\t')
def main():
  import numpy as np
  import pandas as pd
  from clustergrammer import Network

  rtk_list = load_rtks()

  net = Network()
  net.load_file('txt/tmp_cst_drug_treat_cl.txt')
  df_dict = net.dat_to_df()

  inst_df = df_dict['mat']

  # select the RTK rows by label (.loc replaces the deprecated .ix indexer)
  inst_df = inst_df.loc[rtk_list]

  inst_df.to_csv('txt/RTK_exp_in_drug_treat_cl.txt', sep='\t')
def equal_digit_sampling_MNIST():
  '''
  Sample N instances of each digit from the MNIST dataset
  '''
  from clustergrammer import Network
  net = Network()
  net.load_file('processed_MNIST/large_files/MNIST_row_labels.txt')

  tmp_df = net.dat_to_df()
  df = tmp_df['mat']
  print(df.shape)

  label_dict = get_label_dict()

  num_sample = 30

  # only keep num_sample instances of each digit
  ###############################################
  keep_cols = []
  for inst_digit in label_dict:
    tmp_name = label_dict[inst_digit]

    # select num_sample instances of each digit
    for i in range(num_sample):
      inst_name = tmp_name + '-' + str(i)
      keep_cols.append(inst_name)

  # grab subset of digits
  df = df[keep_cols]

  # add digit categories, passing the dataframe explicitly
  df = add_MNIST_cats(df)

  print('shape after processing')
  print(df.shape)

  df.to_csv('processed_MNIST/MNIST_' + str(num_sample) + 'x_original.txt',
            sep='\t')
def make_json_from_tsv(name):
  '''
  make a clustergrammer json from a tsv file
  '''
  from clustergrammer import Network

  print('\n' + name)

  net = Network()

  filename = 'txt/' + name + '.txt'

  net.load_file(filename)
  net.swap_nan_for_zero()

  # z-score first to get the column distributions to be similar
  net.normalize(axis='col', norm_type='zscore', keep_orig=True)

  # filter the rows to keep the perts with the largest normalized values
  net.filter_N_top('row', 1000)

  num_rows = net.dat['mat'].shape[0]
  num_cols = net.dat['mat'].shape[1]

  print('num_rows ' + str(num_rows))
  print('num_cols ' + str(num_cols))

  if num_cols < 50 or num_rows < 1000:
    views = ['N_row_sum']
    net.make_clust(dist_type='cos', views=views)

    export_filename = 'json/' + name + '.json'
    net.write_json_to_file('viz', export_filename)
  else:
    print('did not cluster, too many columns')
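# Sketch (not in the original source): a minimal driver for make_json_from_tsv.
# The dataset name 'rc_two_cats' is borrowed from the script at the end of this
# section and assumes that txt/rc_two_cats.txt and a json/ output directory exist.
if __name__ == '__main__':
  make_json_from_tsv('rc_two_cats')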
def reproduce_Mark_correlation_matrix():
  import pandas as pd
  from scipy.spatial.distance import squareform
  from clustergrammer import Network
  from copy import deepcopy

  dist_vect = calc_custom_dist(data_type='ptm_none',
                               dist_metric='correlation', pairwise='True')
  dist_mat = squareform(dist_vect)

  # make similarity matrix
  dist_mat = 1 - dist_mat

  net = Network()
  data_type = 'ptm_none'
  filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
      data_type + '.txt'

  # load file and export dataframe
  net = deepcopy(Network())
  net.load_file(filename)
  net.swap_nan_for_zero()
  tmp_df = net.dat_to_df()
  df = tmp_df['mat']

  cols = df.columns.tolist()
  rows = cols

  mark_df = pd.DataFrame(data=dist_mat, columns=cols, index=rows)

  save_filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
      'Mark_corr_sim_mat' + '.txt'
  mark_df.to_csv(save_filename, sep='\t', na_rep='nan')
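# Sketch (not in the original source): one way the saved similarity matrix could
# be clustered and exported for visualization. It reuses only clustergrammer
# calls that appear elsewhere in these scripts (load_file, swap_nan_for_zero,
# make_clust, write_json_to_file); the output path 'json/Mark_corr_sim_mat.json'
# is a placeholder.
def visualize_Mark_correlation_matrix():
  from clustergrammer import Network

  net = Network()
  net.load_file('../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' +
                'Mark_corr_sim_mat.txt')
  net.swap_nan_for_zero()

  # cluster the similarity matrix and export the visualization JSON
  net.make_clust(dist_type='cos', views=['N_row_sum'])
  net.write_json_to_file('viz', 'json/Mark_corr_sim_mat.json')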
def clust_from_response(response_list):
  from clustergrammer import Network
  import scipy
  import json
  import pandas as pd
  import math
  from copy import deepcopy

  ini_enr = transfer_to_enr_dict(response_list)

  enr = []
  scores = {}
  score_types = ['combined_score', 'pval', 'zscore']

  for score_type in score_types:
    scores[score_type] = pd.Series()

  for inst_enr in ini_enr:
    if inst_enr['combined_score'] > 0:

      # make series of enriched terms with scores
      for score_type in score_types:
        # collect the scores of the enriched terms
        if score_type == 'combined_score':
          scores[score_type][inst_enr['name']] = inst_enr[score_type]
        if score_type == 'pval':
          scores[score_type][inst_enr['name']] = -math.log(inst_enr[score_type])
        if score_type == 'zscore':
          scores[score_type][inst_enr['name']] = -inst_enr[score_type]

      # keep enrichment values
      enr.append(inst_enr)

  # sort and normalize the scores
  for score_type in score_types:
    scores[score_type] = scores[score_type] / scores[score_type].max()
    # sort_values does not sort in place, so keep the sorted result
    scores[score_type] = scores[score_type].sort_values(ascending=False)

  number_of_enriched_terms = len(scores['combined_score'])

  enr_score_types = ['combined_score', 'pval', 'zscore']

  if number_of_enriched_terms < 10:
    num_dict = {'ten': 10}
  elif number_of_enriched_terms < 20:
    num_dict = {'ten': 10, 'twenty': 20}
  else:
    num_dict = {'ten': 10, 'twenty': 20, 'thirty': 30}

  # gather lists of top scores
  top_terms = {}
  for enr_type in enr_score_types:
    top_terms[enr_type] = {}
    for num_terms in list(num_dict.keys()):
      inst_num = num_dict[num_terms]
      top_terms[enr_type][num_terms] = scores[enr_type].index.tolist()[:inst_num]

  # gather the terms that should be kept - they are at the top of the score list
  keep_terms = []
  for inst_enr_score in top_terms:
    for tmp_num in list(num_dict.keys()):
      keep_terms.extend(top_terms[inst_enr_score][tmp_num])

  keep_terms = list(set(keep_terms))

  # keep enriched terms that are in the top 10 based on at least one score
  keep_enr = []
  for inst_enr in enr:
    if inst_enr['name'] in keep_terms:
      keep_enr.append(inst_enr)

  # fill in full matrix
  #######################

  # genes
  row_node_names = []
  # enriched terms
  col_node_names = []

  # gather information from the list of enriched terms
  for inst_enr in keep_enr:
    col_node_names.append(inst_enr['name'])
    row_node_names.extend(inst_enr['int_genes'])

  row_node_names = sorted(list(set(row_node_names)))

  net = Network()
  net.dat['nodes']['row'] = row_node_names
  net.dat['nodes']['col'] = col_node_names
  net.dat['mat'] = scipy.zeros([len(row_node_names), len(col_node_names)])

  for inst_enr in keep_enr:
    inst_term = inst_enr['name']
    col_index = col_node_names.index(inst_term)

    # use combined score for full matrix - will not be seen in viz
    tmp_score = scores['combined_score'][inst_term]
    net.dat['node_info']['col']['value'].append(tmp_score)

    for inst_gene in inst_enr['int_genes']:
      row_index = row_node_names.index(inst_gene)

      # save association
      net.dat['mat'][row_index, col_index] = 1

  # cluster full matrix
  #############################
  # do not make multiple views
  views = ['']

  if len(net.dat['nodes']['row']) > 1:
    net.make_clust(dist_type='jaccard', views=views, dendro=False)
  else:
    net.make_clust(dist_type='jaccard', views=views, dendro=False,
                   run_clustering=False)

  # get dataframe from full matrix
  df = net.dat_to_df()

  for score_type in score_types:
    for num_terms in num_dict:

      inst_df = deepcopy(df)
      inst_net = deepcopy(Network())

      inst_df['mat'] = inst_df['mat'][top_terms[score_type][num_terms]]

      # load back into net
      inst_net.df_to_dat(inst_df)

      # make views
      if len(net.dat['nodes']['row']) > 1:
        inst_net.make_clust(dist_type='jaccard', views=['N_row_sum'],
                            dendro=False)
      else:
        inst_net.make_clust(dist_type='jaccard', views=['N_row_sum'],
                            dendro=False, run_clustering=False)

      inst_views = inst_net.viz['views']

      # add score_type to views
      for inst_view in inst_views:
        inst_view['N_col_sum'] = num_dict[num_terms]
        inst_view['enr_score_type'] = score_type

        # add values to col_nodes and order according to rank
        for inst_col in inst_view['nodes']['col_nodes']:
          inst_col['rank'] = len(top_terms[score_type][num_terms]) - \
              top_terms[score_type][num_terms].index(inst_col['name'])

          inst_name = inst_col['name']
          inst_col['value'] = scores[score_type][inst_name]

      # add views to main network
      net.viz['views'].extend(inst_views)

  return net
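# Sketch (not in the original source): how clust_from_response might be driven.
# It assumes response_list is the list of enriched terms parsed from an Enrichr
# response, with each entry holding 'name', 'combined_score', 'pval', 'zscore',
# and 'int_genes' as used above; the output path 'json/enrichr_clust.json' is a
# placeholder.
def example_clust_from_response(response_list):
  net = clust_from_response(response_list)

  # export the visualization JSON built by make_clust
  net.write_json_to_file('viz', 'json/enrichr_clust.json')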
from clustergrammer import Network
net = Network()

net.load_file('txt/rc_two_cats.txt')

# print(net.dat['nodes']['row'])

cat_list = []
for inst_gene in net.dat['nodes']['row']:
  inst_tuple = [inst_gene]
  cat_list.append(inst_tuple)

df = net.dat_to_df()

all_rows = df['mat'].index.tolist()
print(all_rows)

new_rows = []
for inst_row in all_rows:
  new_rows.append(list(inst_row))

print('\n\n\n')
print(new_rows)

for inst_row in new_rows:
  inst_row.append('something')

print('\n\n\n')
print(new_rows)