def generate_subsampled_datasets():
    '''
    Generate subsampled tsvs from the MNIST dataset.

    For every sample size, take_multiple_subsamples is asked for several
    repeats; each returned dataframe is passed through add_MNIST_cats
    (row_cats=False), its shape is printed and it is written as a
    tab-separated file under processed_MNIST/random_subsampling/.
    '''
    from clustergrammer import Network

    sample_sizes = [20, 100, 200, 300, 400, 500, 1000]
    num_repeats = 5
    out_prefix = 'processed_MNIST/random_subsampling/MNIST_'

    # load full MNIST data with row labels
    net = Network()
    net.load_file('processed_MNIST/large_files/MNIST_row_labels.txt')
    full_df = net.dat_to_df()['mat']

    for size in sample_sizes:
        subsamples = take_multiple_subsamples(full_df, size, num_repeats)

        for sub_key in subsamples:
            sub_df = add_MNIST_cats(subsamples[sub_key], row_cats=False)

            out_path = out_prefix \
                + str(size) + 'x_random_subsample_' + str(sub_key) + '.txt'

            print(sub_df.shape)
            sub_df.to_csv(out_path, sep='\t')
def make_phos_homepage_viz():
    '''
    Build the homepage phosphorylation visualization.

    Loads the combined TMT phospho ratios, normalizes and filters the
    matrix, clusters it and writes the result to json/homepage_phos.json.
    '''
    from clustergrammer import Network

    phos_file = 'lung_cellline_3_1_16/lung_cellline_phospho/' + \
        'lung_cellline_TMT_phospho_combined_ratios.tsv'

    clust_opts = dict(
        dist_type='cos',
        views=['N_row_sum', 'N_row_var'],
        dendro=True,
        sim_mat=True,
        calc_cat_pval=True,
    )

    net = Network()
    net.load_file(phos_file)

    # quantile normalize to normalize cell lines
    net.normalize(axis='col', norm_type='qn')

    # only keep most differentially regulated PTMs
    net.filter_N_top('row', 250, 'sum')

    # take zscore of rows, then replace missing values with zero
    net.normalize(axis='row', norm_type='zscore', keep_orig=True)
    net.swap_nan_for_zero()

    # threshold filter PTMs
    net.filter_threshold('row', threshold=1.75, num_occur=3)

    net.make_clust(**clust_opts)
    net.write_json_to_file('viz', 'json/homepage_phos.json', 'indent')
def main():
    '''
    Load txt/rc_two_cats.txt, build a distance matrix of size 50 with
    make_distance_matrix and pass it to randomly_sample_rows.
    '''
    from clustergrammer import Network

    sample_size = 50

    net = Network()
    net.load_file('txt/rc_two_cats.txt')

    dist_mat = make_distance_matrix(net, sample_size)
    randomly_sample_rows(net, dist_mat, sample_size)
def make_plex_matrix():
    '''
    Make a cell line matrix with plex rows and cell line columns. This will
    be used as a negative control that should show worsening correlation as
    data is normalized/filtered.

    Rows are the plex numbers 1-9; a cell is set to 1 when the column name
    occurs in a cell-line key of cl_info whose 'Plex' value is not -1.
    The matrix is saved as exp-plex.txt.
    '''
    import numpy as np
    import pandas as pd
    from clustergrammer import Network

    net = Network()

    # load cl_info
    cl_info = net.load_json_to_dict('../cell_line_info/cell_line_info_dict.json')

    # load cell line expression (only the column names are needed)
    net.load_file('../CCLE_gene_expression/CCLE_NSCLC_all_genes.txt')
    cols = net.dat_to_df()['mat'].columns.tolist()

    rows = list(range(1, 10))
    print(rows)

    mat = np.zeros((len(rows), len(cols)))

    for inst_col in cols:
        for inst_cl in cl_info:
            if inst_col not in inst_cl:
                continue

            inst_plex = int(cl_info[inst_cl]['Plex'])
            if inst_plex == -1:
                # -1 entries are not assigned to any plex row
                continue

            mat[rows.index(inst_plex), cols.index(inst_col)] = 1

    df_plex = pd.DataFrame(data=mat, columns=cols, index=rows)

    filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
        'exp-plex.txt'
    df_plex.to_csv(filename, sep='\t')
def make_plex_matrix():
    '''
    Make a cell line matrix with plex rows and cell line columns. This will
    be used as a negative control that should show worsening correlation as
    data is normalized/filtered.

    The output (exp-plex.txt) has one row per plex number 1-9 and one
    column per column of the CCLE expression file; entries are 0 or 1.
    '''
    import numpy as np
    import pandas as pd
    from clustergrammer import Network

    cl_info_path = '../cell_line_info/cell_line_info_dict.json'
    expression_path = '../CCLE_gene_expression/CCLE_NSCLC_all_genes.txt'
    save_path = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
        'exp-plex.txt'

    net = Network()
    cl_info = net.load_json_to_dict(cl_info_path)

    net.load_file(expression_path)
    expression_df = net.dat_to_df()['mat']
    cols = expression_df.columns.tolist()

    rows = [plex for plex in range(1, 10)]
    print(rows)

    mat = np.zeros((len(rows), len(cols)))

    for col_name in cols:
        # lazily walk the cell-line keys that contain this column name
        for cl_name in (key for key in cl_info if col_name in key):
            plex = int(cl_info[cl_name]['Plex'])
            if plex != -1:
                row_index = rows.index(plex)
                col_index = cols.index(col_name)
                mat[row_index, col_index] = 1

    df_plex = pd.DataFrame(data=mat, columns=cols, index=rows)
    df_plex.to_csv(save_path, sep='\t')
def main():
    '''
    Save the rows of txt/tmp_cst_drug_treat_cl.txt that are listed by
    load_rtks() to txt/RTK_exp_in_drug_treat_cl.txt (tab-separated).

    Raises:
        KeyError: with current pandas, if a label returned by load_rtks()
            is not present in the matrix index.
    '''
    from clustergrammer import Network

    rtk_list = load_rtks()

    net = Network()
    net.load_file('txt/tmp_cst_drug_treat_cl.txt')
    inst_df = net.dat_to_df()['mat']

    # DataFrame.ix was deprecated and then removed in pandas 1.0; .loc is
    # the label-based replacement for selecting rows by name.
    inst_df = inst_df.loc[rtk_list]

    inst_df.to_csv('txt/RTK_exp_in_drug_treat_cl.txt', sep='\t')
def make_exp_homepage_viz():
    '''
    Build the homepage expression visualization.

    Loads the CCLE NSCLC expression matrix, threshold-filters the rows,
    clusters the result and writes it to json/homepage_exp.json.
    '''
    from clustergrammer import Network

    clust_opts = dict(
        dist_type='cos',
        views=['N_row_sum', 'N_row_var'],
        dendro=True,
        sim_mat=True,
        calc_cat_pval=False,
    )

    net = Network()
    net.load_file('CCLE_gene_expression/CCLE_NSCLC_all_genes.txt')

    # threshold filter expression
    net.filter_threshold('row', threshold=3.0, num_occur=4)

    net.make_clust(**clust_opts)
    net.write_json_to_file('viz', 'json/homepage_exp.json', 'indent')
def prepare_heatmap(matrix_input, html_file, html_dir, tools_dir, categories, distance, linkage):
    '''
    Render the clustergrammer html page and write the clustering JSON.

    matrix_input: path of the matrix file handed to Network.load_file
    html_file:    path the rendered template is written to
    html_dir:     directory created for the output; receives mult_view.json
    tools_dir:    directory that contains the "templates" folder
    categories:   mapping with 'row' and 'col' entries; a truthy entry is
                  passed to Network.add_cats for that axis
    distance:     forwarded to Network.cluster as dist_type
    linkage:      forwarded to Network.cluster as linkage_type

    os.mkdir raises if html_dir already exists.
    '''
    # prepare directory and html
    os.mkdir(html_dir)

    template_env = Environment(loader=FileSystemLoader(tools_dir + "/templates"))
    page = template_env.get_template("clustergrammer.template").render()

    with open(html_file, "w") as html_out:
        html_out.write(page)

    net = Network()
    net.load_file(matrix_input)

    # attach categories, rows first, only where some were supplied
    for axis in ('row', 'col'):
        axis_cats = categories[axis]
        if axis_cats:
            net.add_cats(axis, axis_cats)

    net.cluster(dist_type=distance, linkage_type=linkage)
    net.write_json_to_file('viz', html_dir + "/mult_view.json")
def main():
    '''
    For every harmonizome file in hzome_names, load its matrix and call
    calc_gene_sim_mat once per gene class found in gene_info.
    '''
    net = Network()

    # load genes of interest
    gene_info = net.load_json_to_dict('../grant_pois/gene_info_with_dark.json')

    # ENCODE, GTEx, etc
    # hzome_names = ['my_CCLE_exp.txt', 'ENCODE_TF_targets.txt', 'ChEA_TF_targets.txt']
    hzome_names = ['ENCODE_TF_targets.txt']

    # define separate sim_cutoffs for different files
    cutoffs = {
        'my_CCLE_exp.txt': 0.15,
        'ENCODE_TF_targets.txt': 0.35,  ## 0.6
        'ChEA_TF_targets.txt': 0.2,
        'my_gtex_Moshe_2017_exp.txt': 0.2,
    }

    # not used below; the lookup is kept because it fails early when the
    # 'KIN'/'all' entry is missing from gene_info
    genes_of_class = gene_info['KIN']['all']

    for hzome_name in hzome_names:

        hzome_filename = '../hzome_data/' + hzome_name

        print('loading data ')

        # load hzome data
        ####################
        if 'my_' not in hzome_name:
            # load data in hzome format
            hzome_data = deepcopy(hzome_to_df.load_matrix(hzome_filename))
        else:
            # if I am providing the data, then load in normal way
            net.load_file(hzome_filename)
            hzome_data = net.export_df()

        print('data loaded\n')

        for gene_class in gene_info:
            calc_gene_sim_mat(hzome_data, net, gene_info, gene_class, hzome_name, cutoffs)
def make_json_from_tsv(name):
    '''
    Make a clustergrammer json from a tsv file.

    Reads txt/<name>.txt, normalizes and filters it, and writes
    json/<name>.json when the matrix has fewer than 50 columns or fewer
    than 1000 rows; otherwise only a message is printed.
    '''
    from clustergrammer import Network

    print('\n' + name)

    net = Network()
    net.load_file('txt/' + name + '.txt')

    # the returned dataframes are not used; the call is kept as-is
    net.dat_to_df()

    net.swap_nan_for_zero()

    # zscore first to get the column distributions to be similar
    net.normalize(axis='col', norm_type='zscore', keep_orig=True)

    # filter the rows to keep the perts with the largest normalized values
    net.filter_N_top('row', 1000)

    mat_shape = net.dat['mat'].shape
    num_rows = mat_shape[0]
    num_cols = mat_shape[1]

    print('num_rows ' + str(num_rows))
    print('num_cols ' + str(num_cols))

    if num_cols >= 50 and num_rows >= 1000:
        print('did not cluster, too many columns ')
        return

    net.make_clust(dist_type='cos', views=['N_row_sum'])
    net.write_json_to_file('viz', 'json/' + name + '.json')
def equal_digit_sampling_MNIST():
    '''
    Sample N instances of each digit from the MNIST dataset.

    N is num_sample (30). The selected columns are named
    '<digit label>-<i>' for i in range(num_sample), where the digit labels
    come from get_label_dict(). The subset is passed through
    add_MNIST_cats and saved as processed_MNIST/MNIST_<N>x_original.txt.
    '''
    from clustergrammer import Network

    net = Network()
    net.load_file('processed_MNIST/large_files/MNIST_row_labels.txt')
    df = net.dat_to_df()['mat']

    print(df.shape)

    label_dict = get_label_dict()

    num_sample = 30

    # only keep num_sample instances of each digit
    ###########################################
    keep_cols = [
        label_dict[inst_digit] + '-' + str(i)
        for inst_digit in label_dict
        for i in range(num_sample)
    ]

    # grab subset of numbers
    df = df[keep_cols]

    # The subset has to be handed to add_MNIST_cats; the previous call
    # passed no dataframe at all.
    # NOTE(review): row_cats is left at the helper's default here - confirm
    # that is the intended setting for this export.
    df = add_MNIST_cats(df)

    print('shape after processing')
    print(df.shape)

    df.to_csv('processed_MNIST/MNIST_' + str(num_sample) + 'x_original.txt', sep='\t')
def reproduce_Mark_correlation_matrix():
    '''
    Save a column-by-column similarity matrix for the 'ptm_none' data.

    The condensed distance vector from calc_custom_dist is expanded with
    squareform and converted to similarity (1 - distance). Row and column
    labels are the column names of ptm_none.txt. The result is written to
    Mark_corr_sim_mat.txt, with missing values written as 'nan'.
    '''
    import pandas as pd
    from scipy.spatial.distance import squareform
    from clustergrammer import Network

    data_type = 'ptm_none'
    processed_dir = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/'

    # NOTE(review): pairwise is passed as the string 'True', not a boolean -
    # confirm this is what calc_custom_dist expects.
    dist_vect = calc_custom_dist(data_type=data_type,
                                 dist_metric='correlation', pairwise='True')

    # make similarity matrix
    sim_mat = 1 - squareform(dist_vect)

    # load file and export dataframe; a single fresh Network is enough
    # (the earlier throw-away instance and the deepcopy of a new object
    # had no effect)
    net = Network()
    net.load_file(processed_dir + data_type + '.txt')
    net.swap_nan_for_zero()
    cols = net.dat_to_df()['mat'].columns.tolist()

    mark_df = pd.DataFrame(data=sim_mat, columns=cols, index=cols)

    save_filename = processed_dir + 'Mark_corr_sim_mat' + '.txt'
    mark_df.to_csv(save_filename, sep='\t', na_rep='nan')
def reproduce_Mark_correlation_matrix():
    '''
    Save a column-by-column similarity matrix for the 'ptm_none' data.

    The condensed distance vector from calc_custom_dist is expanded with
    squareform and converted to similarity (1 - distance). Row and column
    labels are the column names of ptm_none.txt. The result is written to
    Mark_corr_sim_mat.txt as a tab-separated file.
    '''
    import pandas as pd
    from scipy.spatial.distance import squareform
    from clustergrammer import Network

    data_type = 'ptm_none'
    processed_dir = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/'

    # NOTE(review): pairwise is passed as the string 'True', not a boolean -
    # confirm this is what calc_custom_dist expects.
    dist_vect = calc_custom_dist(data_type=data_type,
                                 dist_metric='correlation', pairwise='True')

    # make similarity matrix
    sim_mat = 1 - squareform(dist_vect)

    # load file and export dataframe; a single fresh Network is enough
    # (the earlier throw-away instance and the deepcopy of a new object
    # had no effect)
    net = Network()
    net.load_file(processed_dir + data_type + '.txt')
    net.swap_nan_for_zero()
    cols = net.dat_to_df()['mat'].columns.tolist()

    mark_df = pd.DataFrame(data=sim_mat, columns=cols, index=cols)

    save_filename = processed_dir + 'Mark_corr_sim_mat' + '.txt'
    mark_df.to_csv(save_filename, sep='\t')
import time
start_time = time.time()

from clustergrammer import Network

# choose tsv file
####################
inst_name = 'Tyrosine'

net = Network()
# alternative input: 'txt/phos_ratios_all_treat_no_geld_ST.txt'
net.load_file('txt/phos_ratios_all_treat_no_geld_Tyrosine.txt')

net.swap_nan_for_zero()

# row z-scoring is currently disabled:
# net.normalize(axis='row', norm_type='zscore', keep_orig=True)

print(net.dat.keys())

views = ['N_row_sum', 'N_row_var']

# optional enrichment libraries (disabled): run_enrichr=['KEA_2015'],
# ['ENCODE_TF_ChIP-seq_2014'] or ['GO_Biological_Process_2015']
net.make_clust(
    dist_type='cos',
    views=views,
    dendro=True,
    sim_mat=True,
    filter_sim=0.1,
    calc_cat_pval=False,
)

# write the main visualization and both similarity matrices
for net_type, suffix in (('viz', '.json'),
                         ('sim_row', '_sim_row.json'),
                         ('sim_col', '_sim_col.json')):
    net.write_json_to_file(net_type, 'json/' + inst_name + suffix, 'no-indent')
# Format index/headers for clustergrammer gene_attribute_matrix.index = gene_attribute_matrix.index.map(lambda s: '%s: %s' % (gene_attribute_matrix.index.name, s)) gene_attribute_matrix.columns = gene_attribute_matrix.columns.map(lambda s: '%s: %s' % (gene_attribute_matrix.columns.name, s)) # Remove names for clustergrammer gene_attribute_matrix.index.name = "" gene_attribute_matrix.columns.name = "" # Write to file # fp = StringIO() # gene_attribute_matrix.to_csv(fp, sep='\t') gene_attribute_matrix.to_csv('tmp.txt', sep='\t') # Custergrammer from clustergrammer import Network net = Network() # net.load_tsv_to_net(fp, name) # StringIO net.load_file('tmp.txt') net.swap_nan_for_zero() # Generate net.make_clust(dist_type='cos',views=['N_row_sum', 'N_row_var'], dendro=True, sim_mat=True, filter_sim=0.1, calc_cat_pval=False) # Insert into database cur.execute('insert into `datasets` (`Name`, `prot_att`, `att_att`, `prot_prot`) values (?, ?, ?, ?)', (name, net.export_net_json('viz', indent='no-indent'), net.export_net_json('sim_col', indent='no-indent'), net.export_net_json('sim_row', indent='no-indent'))) con.commit() except Exception as e: print "Couldn't process %s (%s)" % (name, e) continue
'''
Python 2.7
The clustergrammer python module can be installed using pip:
pip install clustergrammer

or by getting the code from the repo:
https://github.com/MaayanLab/clustergrammer-py
'''

from clustergrammer import Network

# Network object that receives the matrix loaded below
net = Network()

# load matrix tsv file
net.load_file('in.tsv')

# alternative example inputs (disabled)
# net.load_file('txt/ccle_example.txt')
# net.load_file('txt/rc_val_cats.txt')
# net.load_file('txt/number_labels.txt')
# net.load_file('txt/mnist.txt')
# net.load_file('txt/tuple_cats.txt')
# net.load_file('txt/example_tsv.txt')

# net.enrichrgram('KEA_2015')

# optional filtering and normalization (all disabled)
##########################################
# net.filter_sum('row', threshold=20)
# net.normalize(axis='col', norm_type='zscore', keep_orig=True)
# net.filter_N_top('row', 250, rank_type='sum')
# net.filter_threshold('row', threshold=3.0, num_occur=4)
# net.swap_nan_for_zero()
# net.set_cat_color('col', 1, 'Category: one', 'blue')
# Cluster mult_view.tsv and save the visualization JSON for the front end.
# needs pandas and sklearn as well
# pip install --user --upgrade clustergrammer pandas sklearn
from clustergrammer import Network

matrix_file = 'mult_view.tsv'
viz_json_file = 'mult_view.json'

net = Network()
net.load_file(matrix_file)

# row z-score normalization is disabled:
# net.normalize(axis='row', norm_type='zscore', keep_orig=True)

# calculate clustering using default parameters
net.cluster()

net.write_json_to_file('viz', viz_json_file)
net = Network() filename = sys.argv[1] outname = sys.argv[2] wd = sys.argv[3] jobid = sys.argv[4] use_user_label = sys.argv[5] user_label = jobid + '_user_label_name.txt' df = pd.read_csv(user_label, sep='\t', header=0) unique_array = df.iloc[:, 0].unique() #df['num_unique'] = df.nunique(axis=1) #print(unique_array) #print(df.iloc[0,:].unique()) net.load_file(filename) color_array = [ '#92896B', '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ff0000', '#000000' ] color_array2 = [ "#000000", "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059", "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87", "#5A0007", "#809693", "#FEFFE6", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80", "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100", "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F", "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",
# Cluster cluster.txt with default parameters and save the visualization
# JSON for use by the front end.
from clustergrammer import Network

b = "cluster.txt"   # input matrix
d = "cluster.json"  # output visualization JSON

net = Network()
net.load_file(b)

# calculate clustering using default parameters
net.cluster()

net.write_json_to_file('viz', d)
import time
start_time = time.time()

from clustergrammer import Network

# choose tsv file
####################
inst_name = 'Tyrosine'
viz_json_path = 'json/' + inst_name + '.json'

net = Network()
# alternative input: 'txt/phos_ratios_all_treat_no_geld_ST.txt'
net.load_file('txt/phos_ratios_all_treat_no_geld_Tyrosine.txt')

net.swap_nan_for_zero()

# row z-scoring is currently disabled:
# net.normalize(axis='row', norm_type='zscore', keep_orig=True)

print(net.dat.keys())

views = ['N_row_sum', 'N_row_var']

# optional enrichment libraries (disabled): run_enrichr=['KEA_2015'],
# ['ENCODE_TF_ChIP-seq_2014'] or ['GO_Biological_Process_2015']
net.make_clust(
    dist_type='cos',
    views=views,
    dendro=True,
    sim_mat=True,
    filter_sim=0.1,
    calc_cat_pval=False,
)

net.write_json_to_file('viz', viz_json_path, 'no-indent')
'''
Python 2.7
The clustergrammer python module can be installed using pip:
pip install clustergrammer

or by getting the code from the repo:
https://github.com/MaayanLab/clustergrammer-py
'''

from clustergrammer import Network

# Network object that receives the matrix loaded below
net = Network()

# load matrix tsv file
# NOTE(review): absolute, user-specific path - the script only runs on a
# machine that has this file.
net.load_file('C:/Users/omkar/Desktop/clustergram.txt')
print("File Loaded!")

# alternative example inputs (disabled)
# net.load_file('txt/ccle_example.txt')
# net.load_file('txt/rc_val_cats.txt')
# net.load_file('txt/number_labels.txt')
# net.load_file('txt/mnist.txt')
# net.load_file('txt/tuple_cats.txt')
# net.load_file('txt/example_tsv.txt')

# net.enrichrgram('KEA_2015')

# optional filtering and normalization (all disabled)
##########################################
# net.filter_sum('row', threshold=20)
# net.normalize(axis='col', norm_type='zscore', keep_orig=True)
# net.filter_N_top('row', 250, rank_type='sum')
# net.filter_threshold('row', threshold=3.0, num_occur=4)
# net.swap_nan_for_zero()
'''
Python 2.7
The clustergrammer python module can be installed using pip:
pip install clustergrammer

or by getting the code from the repo:
https://github.com/MaayanLab/clustergrammer-py
'''

from clustergrammer import Network
net = Network()

# load matrix tsv file
net.load_file('../data_mats/df_predict_merge.txt')

# fix the colors of the two virus categories before the first clustering
net.set_cat_color('row', 1, 'virus: chik', 'blue')
net.set_cat_color('row', 1, 'virus: zika', 'red')

# first clustering pass; the loop below reads net.viz['cat_colors'] afterwards
net.cluster(enrichrgram=False)

# transfer colors from original to predicted categories
########################################################
# make category colors the same for Chik groups
for inst_cat in net.viz['cat_colors']['row']['cat-1']:
  # the matching predicted category name has 'original' replaced by 'predict'
  new_cat = inst_cat.replace('original', 'predict')
  inst_color = net.viz['cat_colors']['row']['cat-1'][inst_cat]
  net.set_cat_color('row', 3, new_cat, inst_color)

# cluster again after the colors have been set
net.cluster(enrichrgram=False)

# write jsons for front-end visualizations
#from sys import argv
from clustergrammer import Network


def _cluster_matrix_to_json(matrix_path, json_path):
    '''Load matrix_path, cluster it with default parameters and save the
    visualization JSON to json_path; returns the Network object.'''
    network = Network()
    network.load_file(matrix_path)  # argv[1]
    # calculate clustering using default parameters
    network.cluster()
    # save visualization JSON to file for use by front end
    network.write_json_to_file('viz', json_path)
    return network


# full view first, then the summary view
net = _cluster_matrix_to_json('mat.txt', 'kbio_mhci_view.json')
net2 = _cluster_matrix_to_json('mat2.txt', 'kbio_mhci_view_summary.json')
import os, sys, re
from collections import defaultdict

# make network object and load file
from clustergrammer import Network

if __name__ == "__main__":
  # command line: argv[1] is the matrix file to cluster, argv[2] the html
  # output file name (not used in the portion of the script shown here)
  matrix_filename = sys.argv[1]
  html_output_filename = sys.argv[2]

  print('loading file...')
  net = Network()
  # load matrix file
  net.load_file(matrix_filename)
  print('done')

  # cluster with jaccard distance and complete linkage
  print('clustering the matrix...')
  net.cluster(dist_type='jaccard', linkage_type='complete')
  # net.cluster(run_clustering=False)
  print('done')

  # save visualization JSON to file for use by front end;
  # the JSON is written next to the input as <matrix_filename>.json
  print('saving results in json file...')
  json_filename = matrix_filename + '.json'
  net.write_json_to_file('viz', json_filename)
  print('done')

  # creating the html page
# Write the heatmap matrix as a tab-separated file (a header line with the
# tfs labels, one "Cell Line" line, one "Tissue" line, then one line per
# gene), then load it into clustergrammer, cluster it and save the
# visualization JSON.
ids = delta_f.columns.map(lambda x: x.split('|')[0])
matrix_path = "%s_heatmap_matrix.txt" % args.d

# Annotation labels per id; ids missing from ann_dict are labelled 'NA'.
cls = ["Cell Line: %s" % (ann_dict.get(i, ['NA'])[0]) for i in ids]
ts = ["Tissue: %s" % (ann_dict.get(i, ['NA', 'NA'])[1]) for i in ids]

# The context manager guarantees the file is flushed and closed - also when
# a write fails - before net.load_file reads it back.
with open(matrix_path, 'w') as fout:
    fout.write("\t\t%s\n" % ('\t'.join(tfs)))
    fout.write("\t\t%s\n" % ('\t'.join(cls)))
    fout.write("\t\t%s\n" % ('\t'.join(ts)))
    for i in range(status.shape[0]):
        fout.write('%s\t%s\t%s\n' % (
            "Gene: %s" % genes[i],
            "Input Gene: %s" % status[i],
            '\t'.join(delta_f.iloc[i, :].map(str))))

net.load_file(matrix_path)
net.cluster()
net.write_json_to_file('viz', '%s_mult_view.json' % args.d)
Python 2.7 The clustergrammer python module can be installed using pip: pip install clustergrammer or by getting the code from the repo: https://github.com/MaayanLab/clustergrammer-py ''' from clustergrammer import Network net = Network() # load matrix tsv file #net.load_file('txt/rc_two_cats.txt') net.load_file('txt/papseek_data.txt') #net.write_json_to_file('viz', 'json/pooja.json', 'indent') # net.load_file('txt/ccle_example.txt') # net.load_file('txt/rc_val_cats.txt') # net.load_file('txt/number_labels.txt') # net.load_file('txt/mnist.txt') # net.load_file('txt/tuple_cats.txt') # net.load_file('txt/example_tsv.txt') # net.enrichrgram('KEA_2015') # optional filtering and normalization ########################################## # net.filter_sum('row', threshold=20) # net.normalize(axis='col', norm_type='zscore', keep_orig=True)
# Cluster one heatmap matrix and save its visualization JSON next to it.
# Usage: the matrix file name (relative to HEATMAP_DIR) is the last
# command line argument.
from clustergrammer import Network
import os
import sys

# Directory that holds both the input matrices and the generated JSON files.
HEATMAP_DIR = '/Users/snehalpatil/Documents/GithubProjects/gsesuite-data/heatmap/'

filename = sys.argv[-1]
net = Network()
print("Python is fun.")
print(filename)
filepath = HEATMAP_DIR + filename
print(filepath)
net.load_file(filepath)
net.cluster()

# Swap only the extension. Replacing the substring ".txt" left the name
# unchanged for inputs with another extension, so the JSON was written
# over the input matrix.
jsonname = os.path.splitext(filename)[0] + ".json"
jsonfilepath = HEATMAP_DIR + jsonname
net.write_json_to_file('viz', jsonfilepath)
lambda s: '%s: %s' % (gene_attribute_matrix.index.name, s)) gene_attribute_matrix.columns = gene_attribute_matrix.columns.map( lambda s: '%s: %s' % (gene_attribute_matrix.columns.name, s)) # Remove names for clustergrammer gene_attribute_matrix.index.name = "" gene_attribute_matrix.columns.name = "" # Write to file # fp = StringIO() # gene_attribute_matrix.to_csv(fp, sep='\t') gene_attribute_matrix.to_csv('tmp.txt', sep='\t') # Custergrammer from clustergrammer import Network net = Network() # net.load_tsv_to_net(fp, name) # StringIO net.load_file('tmp.txt') net.swap_nan_for_zero() # Generate net.make_clust(dist_type='cos', views=['N_row_sum', 'N_row_var'], dendro=True, sim_mat=True, filter_sim=0.1, calc_cat_pval=False) # Insert into database cur.execute( 'insert into `datasets` (`Name`, `prot_att`, `att_att`, `prot_prot`) values (?, ?, ?, ?)', (name, net.export_net_json('viz', indent='no-indent'), net.export_net_json('sim_col', indent='no-indent'), net.export_net_json('sim_row', indent='no-indent')))
def main():
    '''
    Save the CCLE lung expression columns for the cell lines listed in
    ccle_cl_names.txt.

    Column names of CCLE_lung.txt are simplified first (meta-data dropped,
    'NCI' removed, a few prefixes rewritten) so that they can be compared
    with the names in the list. The subset is written to CCLE_CST_lung.txt.

    Raises:
        KeyError: from the column selection if a listed cell line is not
            among the simplified column names.
    '''
    from clustergrammer import Network

    processed_dir = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/'

    # load CCLE cell lines; the context manager closes the file even if
    # reading fails
    with open(processed_dir + 'ccle_cl_names.txt', 'r') as f:
        cl_names = [inst_line.strip() for inst_line in f]

    net = Network()
    net.load_file(processed_dir + 'CCLE_lung.txt')
    ccle_lung = net.export_df()

    # simplify cols, discard meta-data
    ######################################
    # order matters: 'CALU' has to be rewritten before the generic 'CAL' rule
    replacements = (('CALU', 'Calu-'), ('LOU', 'Lou-'), ('CAL', 'CAL-'))

    simple_cols = []
    for inst_col in ccle_lung.columns.tolist():
        # the first element of the column label has the form '<title>: <name>'
        proc_col = inst_col[0].split(': ')[1].replace('NCI', '')
        for old, new in replacements:
            proc_col = proc_col.replace(old, new)
        simple_cols.append(proc_col)

    ccle_lung.columns = simple_cols

    # report how many of the simplified columns appear in the name list
    known_names = set(cl_names)
    found_cols = [inst_col for inst_col in simple_cols if inst_col in known_names]
    print('found ' + str(len(found_cols)))

    # save subset of cell lines that are also found in the CST PTM data
    ccle_cst_lung = ccle_lung[cl_names]

    ccle_cst_lung.to_csv(processed_dir + 'CCLE_CST_lung.txt', sep='\t')
import time # import StringIO start_time = time.time() # import network class from Network.py from clustergrammer import Network net = Network() net.load_file('txt/rc_two_cats.txt') # net.load_file('txt/example_tsv.txt') # net.load_file('txt/col_categories.txt') # net.load_file('txt/mat_cats.tsv') # net.load_file('txt/mat_1mb.Txt') # net.load_file('txt/mnist.txt') # net.load_file('txt/sim_mat_4_cats.txt') views = ['N_row_sum','N_row_var'] # # filtering rows and cols by sum # net.filter_sum('row', threshold=20) # net.filter_sum('col', threshold=30) # # keep top rows based on sum # net.filter_N_top('row', 10, 'sum') net.make_clust(dist_type='cos',views=views , dendro=True, sim_mat=True, filter_sim=0.1) # net.produce_view({'N_row_sum':10,'dist':'euclidean'})
# make network object and load file from clustergrammer import Network net = Network() net.load_file('txt/new_matrix.txt') # net.add_cats("col",[ # { # "title": "year", # "cats": { # "1995": [ # "p2", # "p3" # ], # "1998":[ # "p1", # "p4" # ] # } # }, # { # "title": "s_author", # "cats": { # "aa": [ # "p1", # "p3" # ], # "bb":[ # "p1", # "p2", # "p3", # "p4" # ],
'''
Python 2.7
The clustergrammer python module can be installed using pip:
pip install clustergrammer

or by getting the code from the repo:
https://github.com/MaayanLab/clustergrammer-py
'''

from clustergrammer import Network

# colors for the first row category, applied in this order
feature_type_colors = (
    ('Feature Type: Interactivity', 'yellow'),
    ('Feature Type: Sharing', 'blue'),
    ('Feature Type: Usability', 'orange'),
    ('Feature Type: Biology-Specific', 'red'),
)

net = Network()

# load matrix tsv file
net.load_file('txt/heatmap_features.txt')

for cat_name, cat_color in feature_type_colors:
    net.set_cat_color('row', 1, cat_name, cat_color)

net.cluster(
    dist_type='cos',
    views=[],
    dendro=True,
    filter_sim=0.1,
    calc_cat_pval=False,
    enrichrgram=False,
)

# write jsons for front-end visualizations
net.write_json_to_file('viz', 'json/mult_view.json', 'indent')
def make_ccle_matrix_subset():
    '''
    Save a subset of the downsampled CCLE matrix restricted to the proteins
    of interest.

    Two files are written: one with the plain gene index
    (..._poi_no_cats.txt) and one whose index entries are
    ('gene: <name>', 'type: <protein type>') tuples (..._poi.txt).
    '''
    from clustergrammer import Network
    import json_scripts

    print('-- load CCLE downsampled data')

    # load downsampled CCLE data
    net = Network()
    net.load_file('CCLE/CCLE_kmeans_ds_col_100.txt')
    df = net.export_df()

    # load proteins of interest (mapping: protein type -> gene names)
    poi = json_scripts.load_to_dict('proteins_of_interest/proteins_of_interest.json')

    all_poi = []
    for type_name in poi:
        all_poi.extend(poi[type_name])

    # only keep pois that are found in the CCLE
    found_poi = list(set(df.index.tolist()) & set(all_poi))

    print(
        str(len(found_poi)) + ' proteins of interest were found in the CCLE data')

    # filter dataframe using row list (transpose and transpose-back)
    ##################################################################
    df = df.transpose()[found_poi].transpose()

    # save version without protein categories (e.g. kinase)
    df.to_csv('CCLE/CCLE_kmeans_ds_col_100_poi_no_cats.txt', sep='\t')

    # add protein type to gene names
    row_cats = []
    for gene in found_poi:
        gene_type = ''
        for type_name in poi:
            # no break: the last type that lists the gene is used
            if gene in poi[type_name]:
                gene_type = type_name
        row_cats.append(('gene: ' + gene, 'type: ' + gene_type))

    # redefine index
    df.index = row_cats

    print('-- save matrix with proteins_of_interest subset')
    df.to_csv('CCLE/CCLE_kmeans_ds_col_100_poi.txt', sep='\t')
import time
start_time = time.time()

from clustergrammer import Network

net = Network()
net.load_file('txt/rc_two_cats.txt')
# alternative input: 'txt/tmp.txt'

views = ['N_row_sum', 'N_row_var']
net.make_clust(dist_type='cos', views=views, dendro=True, sim_mat=True)

# main visualization first, then the row and column similarity matrices
json_outputs = (
    ('viz', 'json/mult_view.json'),
    ('sim_row', 'json/mult_view_sim_row.json'),
    ('sim_col', 'json/mult_view_sim_col.json'),
)
for net_type, json_path in json_outputs:
    net.write_json_to_file(net_type, json_path)

# report the wall-clock time of the whole run
elapsed_time = time.time() - start_time
print('\n\nelapsed time')
print(elapsed_time)
'''
The clustergrammer python module can be installed using pip:
pip install clustergrammer

or by getting the code from the repo:
https://github.com/MaayanLab/clustergrammer-py
'''

import os
from clustergrammer import Network

# Cluster every .tsv matrix in the tsv/ directory and write one
# visualization JSON per matrix into output/.
for filename in os.listdir("tsv"):

    # Split off only the extension so that dotted names such as
    # 'a.b.tsv' keep their full stem, and skip entries that are not
    # tsv files instead of trying to load a file that does not exist.
    name, extension = os.path.splitext(filename)
    if extension != '.tsv':
        continue

    net = Network()

    # load matrix tsv file
    # print() with one argument behaves the same on Python 2 and 3
    print(name)
    net.load_file('tsv/' + filename)

    # optional filtering and normalization
    ##########################################
    net.swap_nan_for_zero()

    net.make_clust(dist_type='cos', views=['N_row_sum', 'N_row_var'],
                   dendro=True, sim_mat=True, filter_sim=0.1,
                   calc_cat_pval=False)

    # write jsons for front-end visualizations
    net.write_json_to_file('viz', 'output/' + name + '.json', 'indent')