def df_filter_row(df, threshold, take_abs=True):
    '''
    Filter matrix rows at a threshold: drop rows whose (absolute) sum does
    not exceed the threshold, and apply the same row subset to the optional
    'mat_up'/'mat_dn' matrices.
    '''
    from copy import deepcopy
    from clustergrammer import Network

    net = Network()

    # optionally work on absolute values so negative entries count toward the sum
    if take_abs is True:
        df_copy = deepcopy(df['mat'].abs())
    else:
        df_copy = deepcopy(df['mat'])

    ini_rows = df_copy.index.values.tolist()

    # sum each original row (rows become columns after the transpose)
    df_copy = df_copy.transpose()
    tmp_sum = df_copy.sum(axis=0)
    tmp_sum = tmp_sum.abs()
    tmp_sum.sort_values(inplace=True, ascending=False)

    # keep only rows whose sum exceeds the threshold
    tmp_sum = tmp_sum[tmp_sum > threshold]
    keep_rows = sorted(tmp_sum.index.values.tolist())

    # only re-subset when some rows were actually dropped
    if len(keep_rows) < len(ini_rows):
        df['mat'] = net.grab_df_subset(df['mat'], keep_rows=keep_rows)
        if 'mat_up' in df:
            # apply the same filtering to up/dn split matrices
            df['mat_up'] = net.grab_df_subset(df['mat_up'], keep_rows=keep_rows)
            df['mat_dn'] = net.grab_df_subset(df['mat_dn'], keep_rows=keep_rows)

    return df
def add_mutations(cl_info):
    '''
    Annotate every cell line in cl_info with 'mut-<gene>' flags ('true'/'false')
    based on the mutation lists stored in cell_line_muts.json.
    '''
    print('add mutations\n')
    from clustergrammer import Network

    net = Network()
    old_cl_info = net.load_json_to_dict('cell_line_muts.json')
    cl_muts = old_cl_info['muts']

    for inst_cl in cl_info:
        # strip the plex suffix when present so names match the mutation table
        if '_plex_' in inst_cl:
            simple_cl = inst_cl.split('_')[0]
        else:
            simple_cl = inst_cl

        for inst_mut in cl_muts:
            mutated_cls = cl_muts[inst_mut]
            has_mut = 'true' if simple_cl in mutated_cls else 'false'

            # use the original long cell line name (with possible plex) as key
            mutation_title = 'mut-' + inst_mut
            cl_info[inst_cl][mutation_title] = has_mut

    return cl_info
def df_filter_col(df, threshold, take_abs=True):
    '''
    Filter matrix columns at a threshold, then drop rows whose remaining
    values are all zero.
    '''
    from copy import deepcopy
    from clustergrammer import Network

    net = Network()

    # optionally filter on absolute values
    if take_abs is True:
        df_copy = deepcopy(df['mat'].abs())
    else:
        df_copy = deepcopy(df['mat'])

    # keep columns whose sum exceeds the threshold
    df_copy = df_copy.transpose()
    df_copy = df_copy[df_copy.sum(axis=1) > threshold]
    df_copy = df_copy.transpose()

    # drop rows that are now all zero
    df_copy = df_copy[df_copy.sum(axis=1) > 0]

    if take_abs is True:
        # re-grab the original (signed) values for the surviving labels
        inst_rows = df_copy.index.tolist()
        inst_cols = df_copy.columns.tolist()
        df['mat'] = net.grab_df_subset(df['mat'], inst_rows, inst_cols)
    else:
        df['mat'] = df_copy

    return df
def calc_treatment_ratios():
    '''Load the treated cell-line phospho matrix into a Network object.'''
    from clustergrammer import Network

    net = Network()
    net.load_tsv_to_net('treated_cell_12_1_2015/treated_cl_phospho.tsv')
def make_enr_vect_clust():
    '''Cluster an enrichment-vector post and write the example viz JSON.'''
    import enrichr_functions as enr_fun
    from clustergrammer import Network

    net = Network()
    g2e_post = net.load_json_to_dict('json/g2e_enr_vect.json')

    # enrichr_functions returns a fully clustered Network
    net = enr_fun.make_enr_vect_clust(g2e_post, 0.001, 1)
    net.write_json_to_file('viz', 'json/enr_vect_example.json')
def main():
    '''Load an example matrix, build a distance matrix, and randomly sample rows.'''
    from clustergrammer import Network

    net = Network()
    net.load_file('txt/rc_two_cats.txt')

    tmp_size = 50
    inst_dm = make_distance_matrix(net, tmp_size)

    randomly_sample_rows(net, inst_dm, tmp_size)
def main():
    '''Post a gene list to Enrichr and print the top ChEA 2015 terms.'''
    from clustergrammer import Network

    net = Network()

    gene_list = ['EGFR', 'TP53', 'SMARCA4', 'CLASP1']

    list_id = net.enrichr('post', gene_list)
    print(list_id)

    enr, response_list = net.enrichr('get', lib='ChEA_2015',
                                     list_id=list_id, max_terms=10)
    print(response_list)
def make_viz_json(inst_df, name):
    '''Cluster a dataframe and write its visualization JSON under json/.'''
    from clustergrammer import Network

    net = Network()
    filename = 'json/' + name

    # wrap the dataframe in the dict structure df_to_dat expects
    load_df = {}
    load_df['mat'] = inst_df
    net.df_to_dat(load_df)

    net.swap_nan_for_zero()
    net.make_clust(views=[])

    net.write_json_to_file('viz', filename, 'no-indent')
def cluster():
    '''Cluster a fake vector post and save the resulting viz JSON.'''
    from clustergrammer import Network

    net = Network()

    vect_post = net.load_json_to_dict('fake_vect_post.json')
    net.load_vect_post_to_net(vect_post)
    net.swap_nan_for_zero()

    # net.N_top_views()
    net.make_clust(dist_type='cos', views=['N_row_sum', 'N_row_var'],
                   dendro=True)

    net.write_json_to_file('viz', 'json/large_vect_post_example.json', 'indent')
def make_plex_matrix():
    '''
    Make a cell line matrix with plex rows and cell line columns. This will
    be used as a negative control that should show worsening correlation as
    data is normalized/filtered.
    '''
    import numpy as np
    import pandas as pd
    from clustergrammer import Network

    # load cell line info
    net = Network()
    cl_info = net.load_json_to_dict('../cell_line_info/cell_line_info_dict.json')

    # load cell line expression
    net.load_file('../CCLE_gene_expression/CCLE_NSCLC_all_genes.txt')
    tmp_df = net.dat_to_df()
    df = tmp_df['mat']

    cols = df.columns.tolist()

    # plex identifiers 1..9
    rows = [i + 1 for i in range(9)]
    print(rows)

    mat = np.zeros((len(rows), len(cols)))

    # mark each (plex, cell line) membership with a 1
    for inst_col in cols:
        for inst_cl in cl_info:
            if inst_col in inst_cl:
                inst_plex = int(cl_info[inst_cl]['Plex'])
                if inst_plex != -1:
                    row_index = rows.index(inst_plex)
                    col_index = cols.index(inst_col)
                    mat[row_index, col_index] = 1

    df_plex = pd.DataFrame(data=mat, columns=cols, index=rows)

    filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
               'exp-plex.txt'
    df_plex.to_csv(filename, sep='\t')
def main():
    '''Select the RTK rows from the drug-treatment matrix and export as tsv.'''
    import numpy as np
    import pandas as pd
    from clustergrammer import Network

    rtk_list = load_rtks()

    net = Network()
    net.load_file('txt/tmp_cst_drug_treat_cl.txt')
    df_dict = net.dat_to_df()
    inst_df = df_dict['mat']

    # BUGFIX: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
    # .loc is the label-based equivalent for selecting rows by name
    inst_df = inst_df.loc[rtk_list]

    inst_df.to_csv('txt/RTK_exp_in_drug_treat_cl.txt', sep='\t')
def post_to_clustergrammer():
    '''POST a vector-upload JSON to a clustergrammer endpoint and print the link.'''
    from clustergrammer import Network
    import requests
    import json

    upload_url = 'http://localhost:9000/clustergrammer/vector_upload/'
    # upload_url = 'http://amp.pharm.mssm.edu/clustergrammer/vector_upload/'

    net = Network()
    vect_post = net.load_json_to_dict('test_vector_upload.json')
    # vect_post = net.load_json_to_dict('fake_vect_post.json')

    r = requests.post(upload_url, data=json.dumps(vect_post))

    link = r.text
    print(link)
def main(buff, inst_filename, mongo_address, viz_id):
    '''
    Cluster an uploaded tsv buffer and store the results in mongo: the raw
    network data goes into network_data, the viz into the placeholder
    networks document. On any clustering failure both fields are set to
    'error' so the front end can report it.
    '''
    import numpy as np
    import flask
    from bson.objectid import ObjectId
    from pymongo import MongoClient
    from flask import request
    from clustergrammer import Network
    import StringIO

    client = MongoClient(mongo_address)
    db = client.clustergrammer

    viz_id = ObjectId(viz_id)
    found_viz = db.networks.find_one({'_id': viz_id})

    try:
        net = Network()
        net.load_tsv_to_net(buff)
        net.swap_nan_for_zero()

        views = ['N_row_sum', 'N_row_var']
        net.make_clust(dist_type='cosine', dendro=True, views=views, \
                       linkage_type='average')

        # store the raw network data separately from the viz document
        export_dat = {}
        export_dat['name'] = inst_filename
        export_dat['dat'] = net.export_net_json('dat')
        export_dat['source'] = 'user_upload'

        dat_id = db.network_data.insert(export_dat)

        update_viz = net.viz
        update_dat = dat_id

    except:
        # deliberate catch-all: any failure marks the placeholder as 'error'
        print('\n-----------------------')
        print('error in clustering')
        print('-----------------------\n')
        update_viz = 'error'
        update_dat = 'error'

    found_viz['viz'] = update_viz
    found_viz['dat'] = update_dat

    db.networks.update_one({'_id': viz_id}, {'$set': found_viz})

    client.close()
def mock_g2e_json(gl):
    '''
    Build a mock g2e signature post (for enrichment vectoring) and save it.

    A json of signatures from g2e should look like this:
    {
      "signature_ids": [
        {"col_title": "title 1", "enr_id_up": ###, "enr_id_dn": ###},
        {"col_title": "title 2", "enr_id_up": ###, "enr_id_dn": ###}
      ],
      "background_type": "ChEA_2015"
    }
    '''
    import enrichr_functions as enr_fun
    from clustergrammer import Network

    net = Network()

    g2e_post = {}
    sig_ids = []

    # submit each up/dn gene list to Enrichr to obtain user_list_ids
    tmp = 1
    for inst_gl in gl:
        inst_sig = {}
        inst_sig['col_title'] = 'Sig-' + str(tmp)
        tmp = tmp + 1

        for inst_updn in inst_gl:
            inst_list = inst_gl[inst_updn]
            inst_id = enr_fun.enrichr_post_request(inst_list)
            inst_sig['enr_id_' + inst_updn] = inst_id

        sig_ids.append(inst_sig)

    g2e_post['signature_ids'] = sig_ids
    g2e_post['background_type'] = 'ChEA_2015'

    net.save_dict_to_json(g2e_post, 'json/g2e_enr_vect.json', 'indent')
def main():
    '''
    Add cell line category information (including plexes and gene-expression
    groups) to the gene expression data from CCLE.
    '''
    from clustergrammer import Network

    net = Network()

    # load original CCLE gene expression data for CST lung cancer cell lines
    filename = 'CCLE_gene_expression/CCLE_NSCLC_all_genes.txt'
    f = open(filename, 'r')
    lines = f.readlines()
    f.close()

    # load cell line info
    cl_info = net.load_json_to_dict('cell_line_info/cell_line_muts.json')

    # write to new file
    # NOTE(review): the output file is created/truncated but nothing is
    # written yet - this looks unfinished; confirm intended behavior
    new_file = 'CCLE_gene_expression/CCLE_NSCLC_cats_all_genes.txt'
    fw = open(new_file, 'w')
    fw.close()
def clustergrammer_load():
    '''Load a categorized matrix, cluster it, and write the viz JSON.'''
    # import network class from Network.py
    from clustergrammer import Network

    net = Network()
    net.pandas_load_file('mat_cats.tsv')

    net.make_clust(dist_type='cos', views=['N_row_sum', 'N_row_var'])
    net.write_json_to_file('viz', 'json/mult_cats.json', 'indent')

    print('\n**********************')
    print(net.dat['node_info']['row'].keys())
    print('\n\n')
def main():
    '''Time the clustering of an example tsv loaded through a StringIO buffer.'''
    import time
    start_time = time.time()

    import pandas as pd
    import StringIO

    # import network class from Network.py
    from clustergrammer import Network

    net = Network()

    # choose file
    ################
    # file_buffer = open('txt/col_categories.txt')
    file_buffer = open('txt/example_tsv_network.txt')
    buff = StringIO.StringIO(file_buffer.read())

    net.pandas_load_tsv_to_net(buff)

    # filter rows
    views = ['filter_row_sum', 'N_row_sum']

    # distance metric and linkage type
    dist_type = 'cosine'
    linkage_type = 'average'

    net.make_clust(dist_type=dist_type, views=views, calc_col_cats=True, \
                   linkage_type=linkage_type)

    net.write_json_to_file('viz', 'json/mult_view.json', 'no-indent')

    elapsed_time = time.time() - start_time
    print('\n\n\nelapsed time: ' + str(elapsed_time))
def proc_locally():
    '''Load a large vector post, cluster it locally, and print the viz keys.'''
    from clustergrammer import Network
    # import run_g2e_background

    net = Network()

    vect_post = net.load_json_to_dict('large_vect_post.json')
    print(vect_post.keys())

    # mongo_address = '10.125.161.139'

    net.load_vect_post_to_net(vect_post)
    net.swap_nan_for_zero()
    net.N_top_views()

    print(net.viz.keys())
def reproduce_Mark_correlation_matrix():
    '''
    Rebuild Mark's correlation-based similarity matrix for the ptm_none
    data type and save it as a tsv.
    '''
    import pandas as pd
    from scipy.spatial.distance import squareform
    from clustergrammer import Network

    dist_vect = calc_custom_dist(data_type='ptm_none',
                                 dist_metric='correlation', pairwise='True')
    dist_mat = squareform(dist_vect)

    # convert pairwise distances to similarities
    dist_mat = 1 - dist_mat

    data_type = 'ptm_none'
    filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
               data_type + '.txt'

    # load file and export dataframe
    # CLEANUP: the original built one Network, discarded it, then used
    # deepcopy(Network()) - deep-copying a brand-new object is a no-op,
    # so a single plain construction is equivalent
    net = Network()
    net.load_file(filename)
    net.swap_nan_for_zero()
    tmp_df = net.dat_to_df()
    df = tmp_df['mat']

    # similarity matrix is square: rows and columns share labels
    cols = df.columns.tolist()
    rows = cols

    mark_df = pd.DataFrame(data=dist_mat, columns=cols, index=rows)

    save_filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' \
                    + 'Mark_corr_sim_mat' + '.txt'
    mark_df.to_csv(save_filename, sep='\t')
def main(mongo_address, viz_id, vect_post):
    '''
    Cluster a g2e enrichment-vector post and store the results in mongo:
    raw data in network_data, viz in the placeholder networks document.
    Failures mark both fields as 'error'.
    '''
    from bson.objectid import ObjectId
    from pymongo import MongoClient
    from clustergrammer import Network

    # set up database connection
    client = MongoClient(mongo_address)
    db = client.clustergrammer

    viz_id = ObjectId(viz_id)

    # get placeholder viz data
    found_viz = db.networks.find_one({'_id': viz_id})

    # initialize export_dat
    export_dat = {}
    export_viz = {}

    # try to make clustergram using vect_post
    try:
        net = Network()

        # vector endpoint
        net.load_vect_post_to_net(vect_post)

        # swap nans for zeros
        net.swap_nan_for_zero()

        # deprecated clustering modules
        ####################################
        # net.fast_mult_views()
        # net.N_top_views()
        ####################################

        net.make_filtered_views(dist_type='cosine', dendro=True, \
                                views=['N_row_sum'], linkage_type='average')

        # export dat; fall back to a placeholder when too large for mongo
        try:
            # convert data to list
            net.dat['mat'] = net.dat['mat'].tolist()
            net.dat['mat_up'] = net.dat['mat_up'].tolist()
            net.dat['mat_dn'] = net.dat['mat_dn'].tolist()

            export_dat['dat'] = net.export_net_json('dat')
            export_dat['source'] = 'g2e_enr_vect'
            dat_id = db.network_data.insert(export_dat)
            print('G2E: network data successfully uploaded')
        except:
            export_dat['dat'] = 'data-too-large'
            export_dat['source'] = 'g2e_enr_vect'
            dat_id = db.network_data.insert(export_dat)
            print('G2E: network data too large to be uploaded')

        update_viz = net.viz
        update_dat = dat_id

    # if there is an error update json with error
    except:
        print('\n--------------------------------')
        print('G2E clustering error')
        print('----------------------------------\n')
        update_viz = 'error'
        update_dat = 'error'

    # export viz to database
    found_viz['viz'] = update_viz
    found_viz['dat'] = update_dat

    # update the viz data
    try:
        db.networks.update_one({"_id": viz_id}, {"$set": found_viz})

        print('\n\n---------------------------------------------------')
        print('G2E Successfully made and uploaded clustergram')
        print('---------------------------------------------------\n\n')
    except:
        print('\n--------------------------------')
        print('G2E error in loading viz into database')
        print('----------------------------------\n')

    # close database connection
    client.close()
import time
start_time = time.time()

from clustergrammer import Network
net = Network()

# choose tsv file
####################
inst_name = 'Tyrosine'
# net.load_file('txt/phos_ratios_all_treat_no_geld_ST.txt')
net.load_file('txt/phos_ratios_all_treat_no_geld_Tyrosine.txt')

net.swap_nan_for_zero()
# net.normalize(axis='row', norm_type='zscore', keep_orig=True)

print(net.dat.keys())

views = ['N_row_sum', 'N_row_var']

net.make_clust(dist_type='cos', views=views, dendro=True,
               sim_mat=True, filter_sim=0.1, calc_cat_pval=False)
               # run_enrichr=['KEA_2015'])
               # run_enrichr=['ENCODE_TF_ChIP-seq_2014'])
               # run_enrichr=['GO_Biological_Process_2015'])

# write the main viz json plus row/col similarity-matrix jsons
net.write_json_to_file('viz', 'json/' + inst_name + '.json', 'no-indent')
net.write_json_to_file('sim_row', 'json/' + inst_name + '_sim_row.json', 'no-indent')
net.write_json_to_file('sim_col', 'json/' + inst_name + '_sim_col.json', 'no-indent')
def clust_from_response(response_list):
    '''
    Build a clustered gene-by-term network from an Enrichr response list:
    score the enriched terms, keep the top terms per score type, fill a
    binary association matrix, cluster it, and attach per-score-type views.
    '''
    from clustergrammer import Network
    import scipy
    import json
    import pandas as pd
    import math
    from copy import deepcopy

    print('----------------------')
    print('enrichr_clust_from_response')
    print('----------------------')

    ini_enr = transfer_to_enr_dict(response_list)

    enr = []
    scores = {}
    score_types = ['combined_score', 'pval', 'zscore']

    for score_type in score_types:
        scores[score_type] = pd.Series()

    for inst_enr in ini_enr:
        if inst_enr['combined_score'] > 0:

            # make series of enriched terms with scores
            for score_type in score_types:
                # collect the scores of the enriched terms
                if score_type == 'combined_score':
                    scores[score_type][inst_enr['name']] = inst_enr[score_type]
                if score_type == 'pval':
                    scores[score_type][inst_enr['name']] = -math.log(inst_enr[score_type])
                if score_type == 'zscore':
                    scores[score_type][inst_enr['name']] = -inst_enr[score_type]

            # keep enrichment values
            enr.append(inst_enr)

    # sort and normalize the scores
    for score_type in score_types:
        scores[score_type] = scores[score_type] / scores[score_type].max()
        scores[score_type].sort(ascending=False)

    number_of_enriched_terms = len(scores['combined_score'])

    enr_score_types = ['combined_score', 'pval', 'zscore']

    if number_of_enriched_terms < 10:
        num_dict = {'ten': 10}
    elif number_of_enriched_terms < 20:
        num_dict = {'ten': 10, 'twenty': 20}
    else:
        num_dict = {'ten': 10, 'twenty': 20, 'thirty': 30}

    # gather lists of top scores
    top_terms = {}
    for enr_type in enr_score_types:
        top_terms[enr_type] = {}
        for num_terms in num_dict.keys():
            inst_num = num_dict[num_terms]
            top_terms[enr_type][num_terms] = scores[enr_type].index.tolist()[:inst_num]

    # gather the terms that should be kept - top of each score list
    keep_terms = []
    for inst_enr_score in top_terms:
        for tmp_num in num_dict.keys():
            keep_terms.extend(top_terms[inst_enr_score][tmp_num])

    keep_terms = list(set(keep_terms))

    # keep enriched terms that are at the top based on at least one score
    keep_enr = []
    for inst_enr in enr:
        if inst_enr['name'] in keep_terms:
            keep_enr.append(inst_enr)

    # fill in full matrix
    #######################
    row_node_names = []  # genes
    col_node_names = []  # enriched terms

    # gather information from the list of enriched terms
    for inst_enr in keep_enr:
        col_node_names.append(inst_enr['name'])
        row_node_names.extend(inst_enr['int_genes'])

    row_node_names = sorted(list(set(row_node_names)))

    net = Network()
    net.dat['nodes']['row'] = row_node_names
    net.dat['nodes']['col'] = col_node_names
    net.dat['mat'] = scipy.zeros([len(row_node_names), len(col_node_names)])

    for inst_enr in keep_enr:
        inst_term = inst_enr['name']
        col_index = col_node_names.index(inst_term)

        # use combined score for full matrix - will not be seen in viz
        tmp_score = scores['combined_score'][inst_term]
        net.dat['node_info']['col']['value'].append(tmp_score)

        for inst_gene in inst_enr['int_genes']:
            row_index = row_node_names.index(inst_gene)
            # save association
            net.dat['mat'][row_index, col_index] = 1

    # cluster full matrix
    #############################
    # do not make multiple views
    views = ['']

    if len(net.dat['nodes']['row']) > 1:
        net.make_clust(dist_type='jaccard', views=views, dendro=False)
    else:
        net.make_clust(dist_type='jaccard', views=views, dendro=False,
                       run_clustering=False)

    # get dataframe from full matrix
    df = net.dat_to_df()

    for score_type in score_types:
        for num_terms in num_dict:
            inst_df = deepcopy(df)
            inst_net = deepcopy(Network())

            inst_df['mat'] = inst_df['mat'][top_terms[score_type][num_terms]]

            # load back into net
            inst_net.df_to_dat(inst_df)

            # make views
            if len(net.dat['nodes']['row']) > 1:
                inst_net.make_clust(dist_type='jaccard', views=['N_row_sum'],
                                    dendro=False)
            else:
                inst_net.make_clust(dist_type='jaccard', views=['N_row_sum'],
                                    dendro=False, run_clustering=False)

            inst_views = inst_net.viz['views']

            # add score_type to views
            for inst_view in inst_views:
                inst_view['N_col_sum'] = num_dict[num_terms]
                inst_view['enr_score_type'] = score_type

                # add values to col_nodes and order according to rank
                for inst_col in inst_view['nodes']['col_nodes']:
                    inst_col['rank'] = len(top_terms[score_type][num_terms]) - \
                        top_terms[score_type][num_terms].index(inst_col['name'])
                    inst_name = inst_col['name']
                    inst_col['value'] = scores[score_type][inst_name]

            # add views to main network
            net.viz['views'].extend(inst_views)

    return net
'''
The clustergrammer python module can be installed using pip:
    pip install clustergrammer

or by getting the code from the repo:
    https://github.com/MaayanLab/clustergrammer-py
'''

# from clustergrammer import Network
from clustergrammer import Network
net = Network()

# load matrix tsv file from stdin
net.load_stdin()

# optional filtering and normalization
##########################################
# net.filter_sum('row', threshold=20)
# net.normalize(axis='col', norm_type='zscore', keep_orig=True)
# net.filter_N_top('row', 250, rank_type='sum')
# net.filter_threshold('row', threshold=3.0, num_occur=4)
# net.swap_nan_for_zero()

net.make_clust(dist_type='cos', views=['N_row_sum', 'N_row_var'],
               dendro=True, sim_mat=True, filter_sim=0.1,
               calc_cat_pval=False)

# output jsons for front-end visualizations
def make_phos_homepage_viz():
    '''Build the homepage phospho clustergram JSON from combined TMT ratios.'''
    from clustergrammer import Network

    net = Network()
    filename = 'lung_cellline_3_1_16/lung_cellline_phospho/' + \
               'lung_cellline_TMT_phospho_combined_ratios.tsv'
    net.load_file(filename)

    # quantile normalize to normalize cell lines
    net.normalize(axis='col', norm_type='qn')

    # only keep most differentially regulated PTMs
    net.filter_N_top('row', 250, 'sum')

    # take zscore of rows
    net.normalize(axis='row', norm_type='zscore', keep_orig=True)

    net.swap_nan_for_zero()

    # threshold filter PTMs
    net.filter_threshold('row', threshold=1.75, num_occur=3)

    views = ['N_row_sum', 'N_row_var']
    net.make_clust(dist_type='cos', views=views, dendro=True,
                   sim_mat=True, calc_cat_pval=True)

    net.write_json_to_file('viz', 'json/homepage_phos.json', 'indent')
def process_GCT_and_export_tsv():
    '''Load a GCT file, normalize/filter it, and export the matrix as tsv.'''
    from clustergrammer import Network

    filename = 'gcts/LDS-1003.gct'
    print('exporting processed GCT as tsv file')

    df = load_file(filename)

    net = Network()
    net.df_to_dat(df)
    net.swap_nan_for_zero()

    # zscore first to get the columns distributions to be similar
    net.normalize(axis='col', norm_type='zscore', keep_orig=True)

    # filter the rows to keep the perts with the largest normalizes values
    net.filter_N_top('row', 200)

    net.write_matrix_to_tsv('txt/example_gct_export.txt')
def clust_from_response(response_list):
    '''
    Convert an Enrichr response list into a clustered gene-by-term network,
    adding extra views for each score type and top-N cutoff.
    '''
    from clustergrammer import Network
    import scipy
    import json
    import pandas as pd
    import math
    from copy import deepcopy

    print('----------------------')
    print('enrichr_clust_from_response')
    print('----------------------')

    ini_enr = transfer_to_enr_dict(response_list)

    enr = []
    scores = {}
    score_types = ['combined_score', 'pval', 'zscore']

    for score_type in score_types:
        scores[score_type] = pd.Series()

    for inst_enr in ini_enr:
        if inst_enr['combined_score'] > 0:

            # build a series of enriched terms for every score type
            for score_type in score_types:
                # collect the scores of the enriched terms
                if score_type == 'combined_score':
                    scores[score_type][inst_enr['name']] = inst_enr[score_type]
                if score_type == 'pval':
                    scores[score_type][inst_enr['name']] = -math.log(inst_enr[score_type])
                if score_type == 'zscore':
                    scores[score_type][inst_enr['name']] = -inst_enr[score_type]

            # keep the enrichment record itself
            enr.append(inst_enr)

    # normalize each score series to its max, then sort descending
    for score_type in score_types:
        scores[score_type] = scores[score_type] / scores[score_type].max()
        scores[score_type].sort(ascending=False)

    number_of_enriched_terms = len(scores['combined_score'])

    enr_score_types = ['combined_score', 'pval', 'zscore']

    if number_of_enriched_terms < 10:
        num_dict = {'ten': 10}
    elif number_of_enriched_terms < 20:
        num_dict = {'ten': 10, 'twenty': 20}
    else:
        num_dict = {'ten': 10, 'twenty': 20, 'thirty': 30}

    # top-N term lists for every score type
    top_terms = {}
    for enr_type in enr_score_types:
        top_terms[enr_type] = {}
        for num_terms in list(num_dict.keys()):
            inst_num = num_dict[num_terms]
            top_terms[enr_type][num_terms] = \
                scores[enr_type].index.tolist()[:inst_num]

    # union of all top terms across score types
    keep_terms = []
    for inst_enr_score in top_terms:
        for tmp_num in list(num_dict.keys()):
            keep_terms.extend(top_terms[inst_enr_score][tmp_num])

    keep_terms = list(set(keep_terms))

    # retain enriched terms that rank highly under at least one score
    keep_enr = [inst_enr for inst_enr in enr if inst_enr['name'] in keep_terms]

    # fill in full matrix
    #######################
    row_node_names = []  # genes
    col_node_names = []  # enriched terms

    # gather information from the list of enriched terms
    for inst_enr in keep_enr:
        col_node_names.append(inst_enr['name'])
        row_node_names.extend(inst_enr['int_genes'])

    row_node_names = sorted(list(set(row_node_names)))

    net = Network()
    net.dat['nodes']['row'] = row_node_names
    net.dat['nodes']['col'] = col_node_names
    net.dat['mat'] = scipy.zeros([len(row_node_names), len(col_node_names)])

    for inst_enr in keep_enr:
        inst_term = inst_enr['name']
        col_index = col_node_names.index(inst_term)

        # use combined score for full matrix - will not be seen in viz
        tmp_score = scores['combined_score'][inst_term]
        net.dat['node_info']['col']['value'].append(tmp_score)

        for inst_gene in inst_enr['int_genes']:
            row_index = row_node_names.index(inst_gene)
            # save association
            net.dat['mat'][row_index, col_index] = 1

    # cluster full matrix
    #############################
    # do not make multiple views
    views = ['']

    if len(net.dat['nodes']['row']) > 1:
        net.make_clust(dist_type='jaccard', views=views, dendro=False)
    else:
        net.make_clust(dist_type='jaccard', views=views, dendro=False,
                       run_clustering=False)

    # get dataframe from full matrix
    df = net.dat_to_df()

    for score_type in score_types:
        for num_terms in num_dict:
            inst_df = deepcopy(df)
            inst_net = deepcopy(Network())

            inst_df['mat'] = inst_df['mat'][top_terms[score_type][num_terms]]

            # load back into net
            inst_net.df_to_dat(inst_df)

            # make views
            if len(net.dat['nodes']['row']) > 1:
                inst_net.make_clust(dist_type='jaccard', views=['N_row_sum'],
                                    dendro=False)
            else:
                inst_net.make_clust(dist_type='jaccard', views=['N_row_sum'],
                                    dendro=False, run_clustering=False)

            inst_views = inst_net.viz['views']

            # add score_type to views
            for inst_view in inst_views:
                inst_view['N_col_sum'] = num_dict[num_terms]
                inst_view['enr_score_type'] = score_type

                # add values to col_nodes and order according to rank
                for inst_col in inst_view['nodes']['col_nodes']:
                    inst_col['rank'] = len(top_terms[score_type][num_terms]) - \
                        top_terms[score_type][num_terms].index(inst_col['name'])
                    inst_name = inst_col['name']
                    inst_col['value'] = scores[score_type][inst_name]

            # add views to main network
            net.viz['views'].extend(inst_views)

    return net
# from sys import argv

from clustergrammer import Network

# first matrix
net = Network()
net.load_file('mat.txt')  # argv[1]

# calculate clustering using default parameters
net.cluster()

# save visualization JSON to file for use by front end
net.write_json_to_file('viz', 'kbio_mhcii_view.json')

# summary matrix
net2 = Network()
net2.load_file('mat2.txt')  # argv[1]

# calculate clustering using default parameters
net2.cluster()

# save visualization JSON to file for use by front end
net2.write_json_to_file('viz', 'kbio_mhcii_view_summary.json')
def make_json():
    '''Generate a fake vector post with random up/down values and save it.'''
    from clustergrammer import Network

    net = Network()

    row_num = 200
    num_columns = 20

    # make up all names for all data
    row_names = make_up_names(row_num)

    # initialize vect_post
    vect_post = {}
    vect_post['title'] = 'Some-Clustergram'
    vect_post['link'] = 'some-link'
    vect_post['filter'] = 'N_row_sum'
    vect_post['is_up_down'] = False
    vect_post['columns'] = []

    split = True

    # fraction of rows in each column - 1 means all columns have all rows
    inst_prob = 1

    # make column data
    for col_num in range(num_columns):
        inst_col = {}

        col_name = 'Col-' + str(col_num + 1) + ' make name longer'
        inst_col['col_name'] = col_name
        inst_col['link'] = 'col-link'
        inst_col['cat'] = 'brain' if col_num < 5 else 'lung'

        # save to columns
        inst_col['data'] = []  # vector

        # get random subset of row_names
        vect_rows = get_subset_rows(row_names, inst_prob)

        # generate vectors
        for inst_row in vect_rows:

            # generate positive/negative values
            ##################
            value_up = 10 * random.random() if random.random() > 0.5 else 0
            value_dn = -10 * random.random() if random.random() > 0.5 else 0
            value = value_up + value_dn

            # # generate vector component
            # #############################
            # vector.append([ inst_row, value ])
            # vector_up.append([ inst_row, value_up ])
            # vector_dn.append([ inst_row, value_dn ])

            # define row object - within column
            row_obj = {}
            row_obj['row_name'] = inst_row
            row_obj['val'] = value
            row_obj['val_up'] = value_up
            row_obj['val_dn'] = value_dn

            inst_col['data'].append(row_obj)

        # if split:
        #     inst_col['vector_up'] = vector_up
        #     inst_col['vector_dn'] = vector_dn

        # save columns to vect_post
        vect_post['columns'].append(inst_col)

    net.save_dict_to_json(vect_post, 'fake_vect_post.json', indent='indent')
import time
start_time = time.time()

from clustergrammer import Network
net = Network()

# choose tsv file
#######################
net.load_file('txt/rc_two_cats.txt')
# net.load_file('txt/tuple_cats.txt')
# net.load_file('txt/tuple_names.txt')
# net.load_file('txt/missing_values.txt')
# net.load_file('txt/example_tsv.txt')
# net.load_file('txt/col_categories.txt')
# net.load_file('txt/mat_cats.tsv')
# net.load_file('txt/mat_1mb.txt')
# net.load_file('txt/mnist.txt')
# net.load_file('txt/sim_mat_4_cats.txt')
# net.load_file('txt/number_names.txt')

# link = net.Iframe_web_app('txt/rc_two_cats.txt', width=1000, height=800)
# link = net.Iframe_web_app( width=1000, height=800)
# print(link)

# possible filtering and normalization
##########################################
# net.filter_sum('row', threshold=20)
# net.filter_sum('col', threshold=30)
# net.normalize(axis='row', norm_type='qn')
''' The clustergrammer python module can be installed using pip: pip install clustergrammer or by getting the code from the repo: https://github.com/MaayanLab/clustergrammer-py ''' # from clustergrammer import Network from clustergrammer import Network net = Network() # load matrix tsv file net.load_file('txt/rc_two_cats.txt') # net.load_file('txt/rc_val_cats.txt') # optional filtering and normalization ########################################## # net.filter_sum('row', threshold=20) # net.normalize(axis='col', norm_type='zscore', keep_orig=True) # net.filter_N_top('row', 250, rank_type='sum') # net.filter_threshold('row', threshold=3.0, num_occur=4) # net.swap_nan_for_zero() # net.downsample(ds_type='kmeans', axis='col', num_samples=10) # net.random_sample(random_state=100, num_samples=10, axis='col') # net.clip(-6,6) # net.filter_cat('row', 1, 'Gene Type: Interesting') # net.set_cat_color('col', 1, 'Category: one', 'blue') net.cluster(dist_type='cos', views=['N_row_sum', 'N_row_var'],
import time
# import StringIO

start_time = time.time()

# import network class from Network.py
from clustergrammer import Network
net = Network()

net.load_file('txt/rc_two_cats.txt')
# net.load_file('txt/example_tsv.txt')
# net.load_file('txt/col_categories.txt')
# net.load_file('txt/mat_cats.tsv')
# net.load_file('txt/mat_1mb.Txt')
# net.load_file('txt/mnist.txt')
# net.load_file('txt/sim_mat_4_cats.txt')

views = ['N_row_sum', 'N_row_var']

# # filtering rows and cols by sum
# net.filter_sum('row', threshold=20)
# net.filter_sum('col', threshold=30)

# # keep top rows based on sum
# net.filter_N_top('row', 10, 'sum')

net.make_clust(dist_type='cos', views=views, dendro=True,
               sim_mat=True, filter_sim=0.1)

# net.produce_view({'N_row_sum':10,'dist':'euclidean'})
def prepare_heatmap(matrix_input, html_file, html_dir, tools_dir, categories,
                    distance, linkage):
    '''Render the clustergrammer html template and write the mult_view json.'''
    # prepare directory and html
    os.mkdir(html_dir)

    env = Environment(loader=FileSystemLoader(tools_dir + "/templates"))
    template = env.get_template("clustergrammer.template")
    overview = template.render()
    with open(html_file, "w") as outf:
        outf.write(overview)

    json_output = html_dir + "/mult_view.json"

    net = Network()
    net.load_file(matrix_input)

    # attach optional row/column categories when provided
    if (categories['row']):
        net.add_cats('row', categories['row'])
    if (categories['col']):
        net.add_cats('col', categories['col'])

    net.cluster(dist_type=distance, linkage_type=linkage)
    net.write_json_to_file('viz', json_output)
def make_viz_from_df(df, filename):
    '''Normalize/filter a dataframe, cluster it, and write the viz JSON under json/.'''
    from clustergrammer import Network

    net = Network()
    net.df_to_dat(df)
    net.swap_nan_for_zero()

    # zscore first to get the columns distributions to be similar
    net.normalize(axis='col', norm_type='zscore', keep_orig=True)

    # filter the rows to keep the perts with the largest normalized values
    net.filter_N_top('row', 2000)

    num_columns = net.dat['mat'].shape[1]

    # BUGFIX: 'views' was only assigned when the matrix had fewer than 50
    # columns, so wider matrices raised NameError at make_clust; default to
    # no extra views for wide matrices
    views = []
    if num_columns < 50:
        # views = ['N_row_sum', 'N_row_var']
        views = ['N_row_sum']

    net.make_clust(dist_type='cos', views=views)

    filename = 'json/' + filename.split('/')[1].replace('.gct', '') + '.json'
    net.write_json_to_file('viz', filename)
'''
Python 2.7
The clustergrammer python module can be installed using pip:
    pip install clustergrammer

or by getting the code from the repo:
    https://github.com/MaayanLab/clustergrammer-py
'''
from clustergrammer import Network
net = Network()

# load matrix tsv file
net.load_file('txt/rc_two_cats.txt')
# net.load_file('txt/ccle_example.txt')
# net.load_file('txt/rc_val_cats.txt')
# net.load_file('txt/number_labels.txt')
# net.load_file('txt/mnist.txt')
# net.load_file('txt/tuple_cats.txt')
# net.load_file('txt/example_tsv.txt')

# net.enrichrgram('KEA_2015')

# optional filtering and normalization
##########################################
# net.filter_sum('row', threshold=20)
# net.normalize(axis='col', norm_type='zscore', keep_orig=True)
# net.filter_N_top('row', 250, rank_type='sum')
# net.filter_threshold('row', threshold=3.0, num_occur=4)
# net.swap_nan_for_zero()
# net.set_cat_color('col', 1, 'Category: one', 'blue')
# make network object and load file
from clustergrammer import Network
net = Network()
net.load_file('mult_view.tsv')

# Z-score normalize the rows
# net.normalize(axis='row', norm_type='zscore', keep_orig=True)

# calculate clustering using default parameters
net.cluster()

# save visualization JSON to file for use by front end
net.write_json_to_file('viz', 'mult_view.json')

# needs pandas and sklearn as well
# pip install --user --upgrade clustergrammer pandas sklearn
'''
The clustergrammer python module can be installed using pip:
    pip install clustergrammer

or by getting the code from the repo:
    https://github.com/MaayanLab/clustergrammer-py
'''
import os
from clustergrammer import Network

# cluster every tsv file in the tsv/ directory
for filename in os.listdir("tsv"):
    name = filename.split(".")[0]

    net = Network()

    # load matrix tsv file
    # BUGFIX: the bare 'print name' statement is a SyntaxError under
    # Python 3; the parenthesized single-argument form works on both 2 and 3
    print(name)
    net.load_file('tsv/' + name + '.tsv')

    # optional filtering and normalization
    ##########################################
    net.swap_nan_for_zero()

    net.make_clust(dist_type='cos', views=['N_row_sum', 'N_row_var'],
                   dendro=True, sim_mat=True, filter_sim=0.1,
                   calc_cat_pval=False)

    # write jsons for front-end visualizations
    net.write_json_to_file('viz', 'output/' + name + '.json', 'indent')