def make_viz_from_df(df, filename):
    from clustergrammer import Network
    net = Network()

    net.df_to_dat(df)
    net.swap_nan_for_zero()

    # z-score first to make the column distributions similar
    net.normalize(axis='col', norm_type='zscore', keep_orig=True)

    # filter the rows to keep the perts with the largest normalized values
    net.filter_N_top('row', 2000)

    num_columns = net.dat['mat'].shape[1]

    if num_columns < 50:
        # views = ['N_row_sum', 'N_row_var']
        views = ['N_row_sum']
        net.make_clust(dist_type='cos', views=views)

        filename = 'json/' + filename.split('/')[1].replace('.gct', '') + '.json'
        net.write_json_to_file('viz', filename)

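# Hypothetical usage sketch for make_viz_from_df. The df argument is handed
# straight to df_to_dat, so it should be whatever form that clustergrammer
# version expects -- assumed here to be a dict holding the matrix DataFrame
# under 'mat', matching the load_df pattern used in make_viz_json below.
# Paths are hypothetical; a 'json/' output directory is assumed to exist.
import numpy as np
import pandas as pd

mat = pd.DataFrame(np.random.randn(100, 10),
                   index=['row_' + str(i) for i in range(100)],
                   columns=['col_' + str(i) for i in range(10)])
make_viz_from_df({'mat': mat}, 'gcts/example.gct')  # writes json/example.json
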
def make_phos_homepage_viz():
    from clustergrammer import Network
    net = Network()

    filename = 'lung_cellline_3_1_16/lung_cellline_phospho/' + \
        'lung_cellline_TMT_phospho_combined_ratios.tsv'
    net.load_file(filename)

    # quantile normalize to normalize cell lines
    net.normalize(axis='col', norm_type='qn')

    # only keep the most differentially regulated PTMs
    net.filter_N_top('row', 250, 'sum')

    # take z-score of rows
    net.normalize(axis='row', norm_type='zscore', keep_orig=True)
    net.swap_nan_for_zero()

    # threshold-filter PTMs
    net.filter_threshold('row', threshold=1.75, num_occur=3)

    views = ['N_row_sum', 'N_row_var']
    net.make_clust(dist_type='cos', views=views, dendro=True,
                   sim_mat=True, calc_cat_pval=True)

    net.write_json_to_file('viz', 'json/homepage_phos.json', 'indent')

def clust_vect(db, viz_doc, vect_post):
    from clustergrammer import Network
    try:
        net = Network()
        net.load_vect_post_to_net(vect_post)
        net.swap_nan_for_zero()

        views = ['N_row_sum', 'N_row_var']
        net.make_clust(dist_type='cosine', dendro=True, views=views,
                       linkage_type='average')

        # upload_dat is an external helper, assumed to be defined elsewhere,
        # that stores the network's dat in the database and returns its id
        dat_id = upload_dat(db, net)

        update_viz = net.viz
        update_dat = dat_id
    except Exception:
        print('error clustering')
        update_viz = 'error'
        update_dat = 'error'

    viz_doc['viz'] = update_viz
    viz_doc['dat'] = update_dat

    return viz_doc

def make_viz_json(inst_df, name):
    from clustergrammer import Network
    net = Network()

    filename = 'json/' + name

    # wrap the DataFrame in a dict under 'mat' for df_to_dat
    load_df = {}
    load_df['mat'] = inst_df
    net.df_to_dat(load_df)

    net.swap_nan_for_zero()
    net.make_clust(views=[])
    net.write_json_to_file('viz', filename, 'no-indent')

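# A minimal usage sketch for make_viz_json (hypothetical names; assumes a
# 'json/' output directory exists).
import numpy as np
import pandas as pd

inst_df = pd.DataFrame(np.random.randn(20, 5),
                       index=['gene_' + str(i) for i in range(20)],
                       columns=['sample_' + str(i) for i in range(5)])
make_viz_json(inst_df, 'example.json')  # writes json/example.json
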
def main(buff, inst_filename, mongo_address, viz_id):
    from bson.objectid import ObjectId
    from pymongo import MongoClient
    from clustergrammer import Network

    # set up database connection
    client = MongoClient(mongo_address)
    db = client.clustergrammer

    # get placeholder viz data
    viz_id = ObjectId(viz_id)
    found_viz = db.networks.find_one({'_id': viz_id})

    try:
        net = Network()
        net.load_tsv_to_net(buff)
        net.swap_nan_for_zero()

        views = ['N_row_sum', 'N_row_var']
        net.make_clust(dist_type='cosine', dendro=True, views=views,
                       linkage_type='average')

        export_dat = {}
        export_dat['name'] = inst_filename
        export_dat['dat'] = net.export_net_json('dat')
        export_dat['source'] = 'user_upload'

        # save dat to a separate document
        # (note: Collection.insert is deprecated in PyMongo 3+;
        # insert_one is the modern call)
        dat_id = db.network_data.insert(export_dat)

        update_viz = net.viz
        update_dat = dat_id
    except Exception:
        print('\n-----------------------')
        print('error in clustering')
        print('-----------------------\n')
        update_viz = 'error'
        update_dat = 'error'

    found_viz['viz'] = update_viz
    found_viz['dat'] = update_dat

    db.networks.update_one({'_id': viz_id}, {'$set': found_viz})

    client.close()

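# Hypothetical invocation sketch: buff is a file-like buffer of TSV text, as
# produced e.g. by io.StringIO (names, address, and id are placeholders):
#
#   from io import StringIO
#   buff = StringIO('\tcol_A\tcol_B\nrow_1\t1.0\t2.0\nrow_2\t-1.0\t0.5\n')
#   main(buff, 'my_upload.txt', 'mongodb://localhost:27017',
#        '507f1f77bcf86cd799439011')
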
def cluster():
    from clustergrammer import Network
    net = Network()

    vect_post = net.load_json_to_dict('fake_vect_post.json')
    net.load_vect_post_to_net(vect_post)
    net.swap_nan_for_zero()

    # net.N_top_views()
    net.make_clust(dist_type='cos', views=['N_row_sum', 'N_row_var'],
                   dendro=True)

    net.write_json_to_file('viz', 'json/large_vect_post_example.json', 'indent')

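# For reference, a sketch of the vect_post structure that
# load_vect_post_to_net consumes -- a list of column objects, each with a
# col_name and row_name/val pairs. This is an assumption based on the
# clustergrammer vector-upload format; check the docs of your version for
# the exact schema.
import json

example_vect_post = {
    'columns': [
        {'col_name': 'experiment_1',
         'data': [{'row_name': 'gene_A', 'val': 1.5},
                  {'row_name': 'gene_B', 'val': -0.4}]},
        {'col_name': 'experiment_2',
         'data': [{'row_name': 'gene_A', 'val': 0.2},
                  {'row_name': 'gene_C', 'val': 2.1}]},
    ]
}

# write it out so cluster() above could pick it up as a fake vect_post file
with open('fake_vect_post.json', 'w') as f:
    json.dump(example_vect_post, f, indent=2)
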
def proc_locally():
    from clustergrammer import Network
    # import run_g2e_background
    net = Network()

    vect_post = net.load_json_to_dict('large_vect_post.json')
    print(vect_post.keys())

    # mongo_address = '10.125.161.139'

    net.load_vect_post_to_net(vect_post)
    net.swap_nan_for_zero()
    net.N_top_views()

    print(net.viz.keys())

def process_GCT_and_export_tsv():
    from clustergrammer import Network

    filename = 'gcts/LDS-1003.gct'
    print('exporting processed GCT as tsv file')

    # load_file is an external helper, assumed to be defined elsewhere,
    # that parses the GCT into the form df_to_dat expects
    df = load_file(filename)

    net = Network()
    net.df_to_dat(df)
    net.swap_nan_for_zero()

    # z-score first to make the column distributions similar
    net.normalize(axis='col', norm_type='zscore', keep_orig=True)

    # filter the rows to keep the perts with the largest normalized values
    net.filter_N_top('row', 200)

    net.write_matrix_to_tsv('txt/example_gct_export.txt')

def make_json_from_tsv(name):
    ''' make a clustergrammer json from a tsv file '''
    from clustergrammer import Network

    print('\n' + name)

    net = Network()
    filename = 'txt/' + name + '.txt'
    net.load_file(filename)

    net.swap_nan_for_zero()

    # z-score first to make the column distributions similar
    net.normalize(axis='col', norm_type='zscore', keep_orig=True)

    # filter the rows to keep the perts with the largest normalized values
    net.filter_N_top('row', 1000)

    num_rows = net.dat['mat'].shape[0]
    num_cols = net.dat['mat'].shape[1]
    print('num_rows ' + str(num_rows))
    print('num_cols ' + str(num_cols))

    if num_cols < 50 or num_rows < 1000:
        views = ['N_row_sum']
        net.make_clust(dist_type='cos', views=views)

        export_filename = 'json/' + name + '.json'
        net.write_json_to_file('viz', export_filename)
    else:
        print('did not cluster, too many rows and columns')

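# Hypothetical usage: expects txt/my_matrix.txt to exist and, if the matrix
# is small enough, writes json/my_matrix.json (directories assumed to exist):
#
#   make_json_from_tsv('my_matrix')
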
def reproduce_Mark_correlation_matrix():
    import pandas as pd
    from scipy.spatial.distance import squareform
    from clustergrammer import Network

    dist_vect = calc_custom_dist(data_type='ptm_none',
                                 dist_metric='correlation', pairwise='True')
    dist_mat = squareform(dist_vect)

    # convert the distance matrix to a similarity matrix
    dist_mat = 1 - dist_mat

    data_type = 'ptm_none'
    filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
        data_type + '.txt'

    # load file and export dataframe
    net = Network()
    net.load_file(filename)
    net.swap_nan_for_zero()
    tmp_df = net.dat_to_df()
    df = tmp_df['mat']

    cols = df.columns.tolist()
    rows = cols

    mark_df = pd.DataFrame(data=dist_mat, columns=cols, index=rows)

    save_filename = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/' + \
        'Mark_corr_sim_mat' + '.txt'

    mark_df.to_csv(save_filename, sep='\t', na_rep='nan')

# possible filtering and normalization
##########################################
# net.filter_sum('row', threshold=20)
# net.filter_sum('col', threshold=30)
# net.normalize(axis='row', norm_type='qn')
# net.normalize(axis='col', norm_type='zscore', keep_orig=True)
# net.filter_N_top('row', 100, rank_type='var')
# net.filter_N_top('col', 3, rank_type='var')
# net.filter_threshold('col', threshold=2, num_occur=3)
# net.filter_threshold('row', threshold=3.0, num_occur=4)

net.swap_nan_for_zero()

# df = net.dat_to_df()

views = ['N_row_sum', 'N_row_var']

net.make_clust(dist_type='cos', views=views, dendro=True,
               sim_mat=True, filter_sim=0.1, calc_cat_pval=False)
               # optional Enrichr analyses, passed as an extra argument:
               # run_enrichr=['ChEA_2015'])
               # run_enrichr=['ENCODE_TF_ChIP-seq_2014'])
               # run_enrichr=['KEA_2015'])
               # run_enrichr=['GO_Biological_Process_2015'])

net.write_json_to_file('viz', 'json/mult_view.json', 'no-indent')
net.write_json_to_file('sim_row', 'json/mult_view_sim_row.json', 'no-indent')

# prefix each column label with the columns' axis name
gene_attribute_matrix.columns = gene_attribute_matrix.columns.map(
    lambda s: '%s: %s' % (gene_attribute_matrix.columns.name, s))

# Remove names for clustergrammer
gene_attribute_matrix.index.name = ""
gene_attribute_matrix.columns.name = ""

# Write to file
# fp = StringIO()
# gene_attribute_matrix.to_csv(fp, sep='\t')
gene_attribute_matrix.to_csv('tmp.txt', sep='\t')

# Clustergrammer
from clustergrammer import Network
net = Network()
# net.load_tsv_to_net(fp, name)  # StringIO
net.load_file('tmp.txt')
net.swap_nan_for_zero()

# Generate
net.make_clust(dist_type='cos', views=['N_row_sum', 'N_row_var'],
               dendro=True, sim_mat=True, filter_sim=0.1, calc_cat_pval=False)

# Insert into database
cur.execute(
    'insert into `datasets` (`Name`, `prot_att`, `att_att`, `prot_prot`) values (?, ?, ?, ?)',
    (name,
     net.export_net_json('viz', indent='no-indent'),
     net.export_net_json('sim_col', indent='no-indent'),
     net.export_net_json('sim_row', indent='no-indent')))
con.commit()

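# The snippet above assumes an open sqlite3 connection (con), a cursor (cur),
# and a dataset name already in scope. A minimal sketch of that setup, with a
# hypothetical table definition inferred from the insert statement:
import sqlite3

con = sqlite3.connect('datasets.db')  # hypothetical database file
cur = con.cursor()
cur.execute('create table if not exists `datasets` '
            '(`Name` text, `prot_att` text, `att_att` text, `prot_prot` text)')
name = 'example_dataset'  # hypothetical dataset name
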
def main(buff, inst_filename, mongo_address, viz_id):
    from bson.objectid import ObjectId
    from pymongo import MongoClient
    from clustergrammer import Network

    ##############################
    # set up database connection
    ##############################
    client = MongoClient(mongo_address)
    db = client.clustergrammer

    # get placeholder viz data
    viz_id = ObjectId(viz_id)
    found_viz = db.networks.find_one({'_id': viz_id})

    try:
        ########################
        # load and cluster
        ########################
        net = Network()

        # net.load_lines_from_tsv_to_net(file_lines)
        net.pandas_load_tsv_to_net(buff)

        net.swap_nan_for_zero()

        # deprecated clustering module
        ####################################
        # # fast mult views takes care of pre-filtering
        # net.fast_mult_views()
        ####################################

        net.make_filtered_views(dist_type='cosine', dendro=True,
                                views=['filter_row_sum'],
                                linkage_type='average')

        ###############################
        # save to database
        ###############################
        export_dat = {}
        export_dat['name'] = inst_filename
        export_dat['dat'] = net.export_net_json('dat')
        export_dat['source'] = 'user_upload'

        # save dat to a separate document
        dat_id = db.network_data.insert(export_dat)

        update_viz = net.viz
        update_dat = dat_id

    except Exception:
        print('\n-----------------------')
        print('error in clustering')
        print('-----------------------\n')
        update_viz = 'error'
        update_dat = 'error'

    # update found_viz
    found_viz['viz'] = update_viz
    found_viz['dat'] = update_dat

    # update found_viz in the database
    db.networks.update_one({'_id': viz_id}, {'$set': found_viz})

    ############################
    # end database connection
    ############################
    client.close()

def main(mongo_address, viz_id, vect_post):
    from bson.objectid import ObjectId
    from pymongo import MongoClient
    from clustergrammer import Network

    # set up database connection
    client = MongoClient(mongo_address)
    db = client.clustergrammer
    viz_id = ObjectId(viz_id)

    # get placeholder viz data
    found_viz = db.networks.find_one({'_id': viz_id})

    # initialize export_dat
    export_dat = {}

    # try to make a clustergram from vect_post
    try:
        # initialize network object
        net = Network()

        # vector endpoint
        net.load_vect_post_to_net(vect_post)

        # swap nans for zeros
        net.swap_nan_for_zero()

        # deprecated clustering modules
        ####################################
        # cluster g2e using pandas
        # net.fast_mult_views()
        #
        # calculate top views rather than percentage views
        # net.N_top_views()
        ####################################

        net.make_filtered_views(dist_type='cosine', dendro=True,
                                views=['N_row_sum'], linkage_type='average')

        # export dat
        try:
            # convert matrices to lists so they can be stored in MongoDB
            net.dat['mat'] = net.dat['mat'].tolist()
            net.dat['mat_up'] = net.dat['mat_up'].tolist()
            net.dat['mat_dn'] = net.dat['mat_dn'].tolist()

            export_dat['dat'] = net.export_net_json('dat')
            export_dat['source'] = 'g2e_enr_vect'
            dat_id = db.network_data.insert(export_dat)
            print('G2E: network data successfully uploaded')
        except Exception:
            export_dat['dat'] = 'data-too-large'
            export_dat['source'] = 'g2e_enr_vect'
            dat_id = db.network_data.insert(export_dat)
            print('G2E: network data too large to be uploaded')

        update_viz = net.viz
        update_dat = dat_id

    # if there is an error, update the json with an error flag
    except Exception:
        print('\n--------------------------------')
        print('G2E clustering error')
        print('----------------------------------\n')
        update_viz = 'error'
        update_dat = 'error'

    # export viz to database
    found_viz['viz'] = update_viz
    found_viz['dat'] = update_dat

    # update the viz data
    try:
        db.networks.update_one({'_id': viz_id}, {'$set': found_viz})
        print('\n\n---------------------------------------------------')
        print('G2E: successfully made and uploaded clustergram')
        print('---------------------------------------------------\n\n')
    except Exception:
        print('\n--------------------------------')
        print('G2E error in loading viz into database')
        print('----------------------------------\n')

    # close database connection
    client.close()

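# Hypothetical invocation sketch (assumes a reachable MongoDB instance, an
# existing placeholder document in db.networks whose _id string is viz_id,
# and a vect_post dict like the example sketched after cluster() above):
#
#   main('mongodb://localhost:27017', '507f1f77bcf86cd799439011',
#        example_vect_post)
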