def main():
    '''
    For every selected hzome data file, load its matrix as a dataframe and
    call calc_gene_sim_mat once for each gene class found in the
    genes-of-interest JSON.
    '''
    net = Network()

    # genes of interest, keyed by gene class
    gene_info = net.load_json_to_dict('../grant_pois/gene_info_with_dark.json')

    # files to process (ENCODE, GTEx, etc); other candidates that have been
    # listed here: 'my_CCLE_exp.txt', 'ChEA_TF_targets.txt'
    hzome_names = ['ENCODE_TF_targets.txt']

    # each file gets its own sim_cutoff
    cutoffs = {
        'my_CCLE_exp.txt': 0.15,
        'ENCODE_TF_targets.txt': 0.35,  # 0.6 is noted next to this value
        'ChEA_TF_targets.txt': 0.2,
        'my_gtex_Moshe_2017_exp.txt': 0.2,
    }

    # not used below; the lookup itself is kept as in the original
    genes_of_class = gene_info['KIN']['all']

    for hzome_name in hzome_names:
        data_path = '../hzome_data/' + hzome_name

        print('loading data ')

        if 'my_' not in hzome_name:
            # file is in hzome format
            hzome_data = deepcopy(hzome_to_df.load_matrix(data_path))
        else:
            # files prefixed with 'my_' are loaded through the Network object
            net.load_file(data_path)
            hzome_data = net.export_df()

        print('data loaded\n')

        for gene_class in gene_info:
            calc_gene_sim_mat(
                hzome_data, net, gene_info, gene_class, hzome_name, cutoffs)
def make_ccle_matrix_subset():
    '''
    Save the subset of the downsampled CCLE matrix whose rows are proteins of
    interest.

    Two tab-separated files are written:
      - CCLE/CCLE_kmeans_ds_col_100_poi_no_cats.txt, rows labelled by gene name
      - CCLE/CCLE_kmeans_ds_col_100_poi.txt, rows labelled by
        ('gene: <name>', 'type: <protein type>') tuples

    Rows keep the order they have in the CCLE matrix, so repeated runs produce
    identical files (a set intersection would give an arbitrary order).
    '''
    from clustergrammer import Network
    import json_scripts

    print('-- load CCLE downsampled data')

    # load downsampled CCLE data
    net = Network()
    net.load_file('CCLE/CCLE_kmeans_ds_col_100.txt')
    df = net.export_df()

    # load proteins of interest, grouped by protein type
    filename = 'proteins_of_interest/proteins_of_interest.json'
    poi = json_scripts.load_to_dict(filename)

    # map every protein of interest to its type, built once up front; if a
    # gene is listed under several types the last one iterated wins
    gene_to_type = {}
    for inst_type in poi:
        for inst_gene in poi[inst_type]:
            gene_to_type[inst_gene] = inst_type

    # only keep pois that are found in the CCLE, each listed once, in the
    # order of the matrix rows
    found_poi = []
    seen = set()
    for inst_gene in df.index.tolist():
        if inst_gene in gene_to_type and inst_gene not in seen:
            seen.add(inst_gene)
            found_poi.append(inst_gene)

    print(str(len(found_poi)) +
          ' proteins of interest were found in the CCLE data')

    # filter dataframe rows by label
    df = df.loc[found_poi]

    # save version without protein categories (e.g. kinase)
    df.to_csv('CCLE/CCLE_kmeans_ds_col_100_poi_no_cats.txt', sep='\t')

    # add protein type to gene names
    row_cats = [
        ('gene: ' + inst_gene, 'type: ' + gene_to_type[inst_gene])
        for inst_gene in found_poi
    ]

    # redefine index
    df.index = row_cats

    print('-- save matrix with proteins_of_interest subset')
    df.to_csv('CCLE/CCLE_kmeans_ds_col_100_poi.txt', sep='\t')
def main():
    '''
    Save the columns of the CCLE lung matrix that correspond to the cell
    lines listed in ccle_cl_names.txt.

    The column labels of CCLE_lung.txt are first reduced to plain cell-line
    names (meta-data dropped, 'NCI' removed, a few prefixes re-spelled) and
    the number of columns that match a listed name is printed. The columns
    named in ccle_cl_names.txt are then written, in that file's order, to
    CCLE_CST_lung.txt as a tab-separated file.

    Every listed name is used for the column selection, so a listed name that
    is missing from the matrix is expected to make the selection fail rather
    than be skipped silently.
    '''
    from clustergrammer import Network

    data_dir = '../lung_cellline_3_1_16/lung_cl_all_ptm/precalc_processed/'

    # load CCLE cell lines; the context manager closes the file even if
    # reading fails, and blank lines are skipped because an empty name can
    # never match a column
    with open(data_dir + 'ccle_cl_names.txt', 'r') as f:
        stripped_lines = [inst_line.strip() for inst_line in f]
    cl_names = [inst_name for inst_name in stripped_lines if inst_name]

    net = Network()
    net.load_file(data_dir + 'CCLE_lung.txt')
    ccle_lung = net.export_df()

    # simplify cols, discard meta-data; replacements are applied in this
    # order, so 'CALU' is rewritten before the shorter 'CAL' is considered
    respellings = (('CALU', 'Calu-'), ('LOU', 'Lou-'), ('CAL', 'CAL-'))

    simple_cols = []
    for inst_col in ccle_lung.columns.tolist():
        # first element of the column label looks like '<title>: <name>'
        proc_col = inst_col[0].split(': ')[1].replace('NCI', '')
        for old_text, new_text in respellings:
            proc_col = proc_col.replace(old_text, new_text)
        simple_cols.append(proc_col)

    ccle_lung.columns = simple_cols

    # report how many matrix columns are listed cell lines
    wanted_names = set(cl_names)
    num_found = sum(1 for inst_col in simple_cols if inst_col in wanted_names)
    print('found ' + str(num_found))

    # save subset of cell lines that are also found in the CST PTM data
    ccle_cst_lung = ccle_lung[cl_names]
    ccle_cst_lung.to_csv(data_dir + 'CCLE_CST_lung.txt', sep='\t')