def load_hgram_matrix():
  import json_scripts
  import numpy as np
  import pandas as pd

  hgram = json_scripts.load_to_dict('hgram_data_latest/hgram_latest.json')
  hgram['mat'] = np.asarray(hgram['mat'])

  mat = hgram['mat']
  row_names = hgram['nodes']['row']
  # add resource category information to the column names
  tmp_col_names = hgram['nodes']['col']
  tmp_col_cats = hgram['node_info']['col']['info']

  col_names = []
  for i in range(len(tmp_col_names)):
    inst_name = 'Resource: ' + tmp_col_names[i]
    inst_cat = 'Resource Type: ' + tmp_col_cats[i]

    inst_tuple = (inst_name, inst_cat)

    col_names.append(inst_tuple)

  # print(hgram.keys())
  # print( len(hgram['node_info']['col']['info']) )
  # print(len(hgram['nodes']['col']))

  # col_names = tmp_col_names

  # nodes, and mat
  ini_df = pd.DataFrame(data=mat, columns=col_names, index=row_names)

  return ini_df
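
# The json_scripts helper module used throughout these examples is not shown;
# a minimal sketch consistent with how it is called here (load_to_dict reads a
# JSON file into a dict, save_to_json writes one, and the third argument is a
# string flag where anything other than 'no-indent'/'no_indent' pretty-prints):
import json

def load_to_dict(filename):
  # read a JSON file and return the parsed object
  with open(filename, 'r') as f:
    return json.load(f)

def save_to_json(data, filename, indent='no-indent'):
  # write data as JSON; compact unless an indent flag is passed
  with open(filename, 'w') as f:
    if indent in ('no-indent', 'no_indent'):
      json.dump(data, f)
    else:
      json.dump(data, f, indent=2)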
def json_2_gmt(filename):
	import json_scripts

	# load json 
	inst_json = json_scripts.load_to_dict(filename)

	# get sorted dict keys 
	all_keys = sorted( inst_json.keys() )

	# write gmt 
	###############
	# convert the filename extension to .gmt
	# (note: split('.')[0] assumes the path contains no other dots)
	filename = filename.split('.')[0] + '.gmt'
	fw = open(filename, 'w')

	# loop through keys 
	for inst_key in all_keys:

		# get gene list 
		inst_list = inst_json[inst_key]

		# print( inst_key + '\t' + str(len(inst_list)) + '\n' )

		# write line of gmt 
		fw.write(inst_key + '\tna\t' )

		# write genes 
		for inst_elem in inst_list:
			fw.write(inst_elem + '\t')
		
		# write new line 
		fw.write('\n')

	fw.close()
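
# Hypothetical usage of json_2_gmt, assuming my_gene_sets.json maps set names
# to gene lists, e.g. {"set_a": ["TP53", "EGFR"]}:
#
#   json_2_gmt('my_gene_sets.json')
#
# writes my_gene_sets.gmt with one tab-separated line per set:
#   set_a	na	TP53	EGFR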
def main():
    '''
    I'm just going to add the perturbation signatures as up/dn values and
    generate comma-separated files in the files_2-17-2017/ directory.
    '''
    import glob
    import json_scripts

    file_names = glob.glob('Pert_sigs/*.json')

    for inst_filename in file_names:

        inst_pert = json_scripts.load_to_dict(inst_filename)

        pert_name = inst_filename.split('/')[1].split('.json')[0]

        up_genes = inst_pert['upGenes']
        dn_genes = inst_pert['dnGenes']

        bin_sig = []

        for inst_gene in up_genes:
            bin_sig.append(inst_gene + ',1')

        for inst_gene in dn_genes:
            bin_sig.append(inst_gene + ',-1')

        # save bin_sig to file
        fw = open('files_2-17-2017/' + pert_name + '.txt', 'w')

        for inst_val in bin_sig:
            fw.write(inst_val + '\n')

        fw.close()
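
# The Pert_sigs/*.json files read above are assumed to have the shape
# {"upGenes": [...], "dnGenes": [...]}, so each line of the output file
# files_2-17-2017/<pert_name>.txt is "GENE,1" (up) or "GENE,-1" (down).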
def main():
    '''
    I'm working on making similarity matrices for KIN, IC, and GPCR genes based
    on data in the Hzome. Here I'm gathering my old (Hgram) gene lists with the
    latest list of the 'dark' genes from the KMC 2017 grant. I'm saving these
    to a new JSON for later use. The next step is to calculate the similarity
    matrices and visualize them in a notebook or webpage.
    '''
    import json_scripts

    hgram_info = json_scripts.load_to_dict(
        '../harmonogram_classes/gene_classes_harmonogram.json')

    grant_poi = json_scripts.load_to_dict(
        '../grant_pois/proteins_of_interest.json')

    gene_types = ['KIN', 'IC', 'GPCR']

    # make a new json with merged all genes and dark gene info
    gene_info = {}

    for inst_type in gene_types:

        # add any dark genes to all_genes
        dark_genes = grant_poi[inst_type]
        all_genes = hgram_info[inst_type] + dark_genes

        dark_genes = sorted(list(set(dark_genes)))
        all_genes = sorted(list(set(all_genes)))

        print(inst_type)
        print('all: ' + str(len(all_genes)))
        print('dark: ' + str(len(dark_genes)))

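        # sanity check: should print 0, since all_genes already includes dark_genes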
        print(len(list(set(dark_genes) - set(all_genes))))

        gene_info[inst_type] = {}
        gene_info[inst_type]['all'] = all_genes
        gene_info[inst_type]['dark'] = dark_genes

        print('\n\n')

    json_scripts.save_to_json(gene_info,
                              '../grant_pois/gene_info_with_dark.json',
                              indent='indent')
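
# The gene_info JSON saved above has the shape
# {'KIN': {'all': [...], 'dark': [...]}, 'IC': {...}, 'GPCR': {...}}.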
def load_gene_classes():
  import json_scripts

  gene_classes = json_scripts.load_to_dict('gene_classes_harmonogram.json')

  keep_types = ['TF', 'GPCR', 'IC', 'KIN']

  keep_genes = []

  for inst_class in gene_classes:
    if inst_class in keep_types:

      inst_genes = gene_classes[inst_class]
      keep_genes.extend(inst_genes)

  return keep_genes
def extract_nodes():
	import json_scripts

	print('extracting nodes: as, cl, pt, ct')

	# load the LDR data in json format
	ldr = json_scripts.load_to_dict('LDR/LDR_api.json')

	# first generate lists of assays (as), cell lines (cl),
	# perturbagens (pt), and centers (ct)
	nodes = {}
	nodes['as'] = []
	nodes['cl'] = []
	nodes['pt'] = []
	nodes['ct'] = []

	# loop the ldr list 
	for inst_ldr in ldr:

		# add assay (datasetName)
		nodes['as'].append( inst_ldr['datasetName'].strip() )

		# get center name 
		nodes['ct'].append(inst_ldr['group']['name'])

		# # get release 
		# print( 'released: ' + str(inst_ldr['released']) )

		# add cell line(s)
		for inst_cl in inst_ldr['metadata']['cellLines']:
			if 'name' in inst_cl:
				nodes['cl'].append( inst_cl['name'].strip() )

		# add perturbation(s)
		for inst_pt in inst_ldr['metadata']['perturbagens']:
			nodes['pt'].append( inst_pt['name'].strip() )

	# get unique and sort 
	for inst_key in nodes:
		nodes[inst_key] = list(set(nodes[inst_key]))
		nodes[inst_key] = sorted(nodes[inst_key])

		print( 'there are ' + str(len(nodes[inst_key])) + ' ' + inst_key )

	return nodes
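
# extract_nodes returns a dict of sorted, de-duplicated name lists keyed by
# node type: {'as': assays, 'cl': cell lines, 'pt': perturbagens, 'ct': centers}.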
def construct_array():
	import json_scripts
	import numpy as np

	print('\nconstructing array\n')

	# load the LDR data in json format
	ldr = json_scripts.load_to_dict('LDR/LDR_api.json')

	# load cl and as dictionary 
	as_cl_dict = json_scripts.load_to_dict('as_cl_dict.json')

	# get nodes from 'short name' dictionary values 
	nodes = {}
	nodes['as'] = sorted(list(set(as_cl_dict['as'].values())))
	nodes['cl'] = list(set(as_cl_dict['cl'].values()))
	# add cell-free to list of cell lines 
	nodes['cl'].append('cell-free')
	nodes['cl'] = sorted(nodes['cl'])

	# # run once - add back removed as and cl to Avi dictionary 
	# # find assays and cell lines that were removed from original list 
	# #####################################################################
	# all_nodes = extract_nodes()
	# for inst_data in as_cl_dict:
	# 	# get all nodes
	# 	tmp_dict = set( as_cl_dict[inst_data].keys() )
	# 	tmp_all = set( all_nodes[inst_data] )
	# 	not_found = list( tmp_all - tmp_dict )
	# 	print('\n')
	# 	print(inst_data)
	# 	for tmp in not_found:
	# 		print(tmp)
	# 	print('\n')

	# make 2d matrix for now
	mat = np.zeros([ len(nodes['as']), len(nodes['cl']) ])

	# generate two released matrices
	rl = {}
	rl['t'] = np.zeros([ len(nodes['as']), len(nodes['cl']) ])
	rl['f'] = np.zeros([ len(nodes['as']), len(nodes['cl']) ])

	# generate perturbation dictionary that will save perturbation 
	# information for assays and cell lines 
	perts = {}

	total = 0

	# loop through the ldr datasets
	for inst_ldr in ldr:

		# get the inst_assay: put name through dictionary 
		# print( inst_ldr['datasetName'].strip() )
		inst_as = as_cl_dict['as'][ inst_ldr['datasetName'].strip() ]
		# print('inst_as: '+ inst_as)

		# get the cell line(s)
		inst_cls = [] 


		for inst_cl in inst_ldr['metadata']['cellLines']:
			if 'name' in inst_cl:
				#!! remove cell line 'TBD among cell ...'
				if 'TBD among' not in inst_cl['name'].strip():
					inst_cls.append( as_cl_dict['cl'][ inst_cl['name'].strip() ] )

		# get the perturbations 
		inst_pts = []
		for inst_pt in inst_ldr['metadata']['perturbagens']:
			inst_pts.append( inst_pt['name'].strip() )


		# if the assay is KINOMEscan then set the cell line to 'cell-free'
		if inst_as == 'KINOMEscan':
			# print('kinomescan')
			inst_cls.append( 'cell-free' )
			# print(inst_cls)
			# print('\n\n\n')


		# add information to mat
		# get index of assay 
		index_as = nodes['as'].index(inst_as)


		# loop through cell lines
		for inst_cl in inst_cls:

			# get the index of the cell line
			index_cl = nodes['cl'].index(inst_cl)

			for inst_pt in inst_pts:

				# check if the perturbation represents multiple perturbations 
				if 'compounds' in inst_pt and 'among' not in inst_pt:
					mult_pts = int(inst_pt.split(' ')[0])
				else:
					mult_pts = 0

				# track the number of perturbations and the released status 
				##############################################################
				if mult_pts == 0:
					mat[ index_as, index_cl ] = mat[ index_as, index_cl ] + 1

					# track number of released 
					if inst_ldr['released'] == True:
						rl['t'][index_as, index_cl] = rl['t'][index_as, index_cl] + 1
					else:
						rl['f'][index_as, index_cl] = rl['f'][index_as, index_cl] + 1

				else:
					mat[ index_as, index_cl ] = mat[ index_as, index_cl ] + mult_pts

					# track number of released 
					if inst_ldr['released'] == True:
						rl['t'][index_as, index_cl] = rl['t'][index_as, index_cl] + mult_pts
					else:
						rl['f'][index_as, index_cl] = rl['f'][index_as, index_cl] + mult_pts

				# keep track of perturbation information in the dictionary 
				##############################################################
				# generate the (as, cl) tuple key
				inst_tuple = str((inst_as, inst_cl))
				# initialize list if necessary
				if inst_tuple not in perts:
					perts[inst_tuple] = []
				# generate pert_dict
				pert_dict = {}
				pert_dict['name'] = inst_pt 
				pert_dict['release'] = inst_ldr['released']
				pert_dict['_id'] = inst_ldr['_id']
				# add dictionary to list 
				perts[inst_tuple].append(pert_dict)

				# add to total 
				total = total + 1

	# check perts dictionary 
	print('perts dictionary - the number of found as/cl combinations')
	print(len(perts.keys()))
	# print(perts)

	# print('\n\n'+str(total))
	# save the matrix 
	mat = mat.tolist()
	rl['t'] = rl['t'].tolist() 
	rl['f'] = rl['f'].tolist() 

	# save the list 
	ldr_mat = {}
	ldr_mat['nodes'] = nodes
	ldr_mat['mat'] = mat
	ldr_mat['rl'] = rl
	ldr_mat['perts'] = perts

	json_scripts.save_to_json( ldr_mat, 'ldr_mat.json', 'no-indent' )
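
# Note on the perts keys: they are string-encoded (assay, cell line) tuples,
# e.g. str(('KINOMEscan', 'cell-free')) == "('KINOMEscan', 'cell-free')";
# make_ldr_clust below recovers the tuple with ast.literal_eval.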
def make_ldr_clust():

	import json_scripts
	import numpy as np
	import d3_clustergram 
	from d3_clustergram_class import Network 
	from ast import literal_eval

	# load LDR data - stored as:
	# released status (rl)
	# nodes, and mat 
	ldr = json_scripts.load_to_dict('ldr_mat.json')
	print('\nload ldr_mat.json with perts')
	print(ldr.keys())

	ldr['mat'] = np.asarray(ldr['mat'])
	ldr['rl']['t'] = np.asarray(ldr['rl']['t'])
	ldr['rl']['f'] = np.asarray(ldr['rl']['f'])

	print(len(ldr['nodes']['as']))
	print(len(ldr['nodes']['cl']))
	print(ldr['mat'].shape)
	print('\n')

	print( 'size all \t' + str(ldr['mat'].shape) )
	print( 'size yes \t' + str(ldr['rl']['t'].shape) )
	print( 'size no  \t' + str(ldr['rl']['f'].shape) )	
	print('\n')

	print( 'sum all \t' + str(np.sum(ldr['mat'])) )
	print( 'sum yes \t' + str(np.sum(ldr['rl']['t'])) )
	print( 'sum no  \t' + str(np.sum(ldr['rl']['f'])) )	
	print( 'total yes/no:\t' + str( np.sum(ldr['rl']['t']) + np.sum(ldr['rl']['f']) ) )

	# define nodes: unfiltered
	nodes_uf = {}
	nodes_uf['row'] = ldr['nodes']['as']
	nodes_uf['col'] = ldr['nodes']['cl']

	# initialize a new network class 
	##################################
	net = Network()

	net.dat['nodes']['row'] = nodes_uf['row']
	net.dat['nodes']['col'] = nodes_uf['col']

	# net.dat['mat'] = ldr['mat']
	# net.dat['mat_up'] = ldr['rl']['t']
	# net.dat['mat_dn'] = -ldr['rl']['f']

	# only include released data in visualization 
	net.dat['mat'] = ldr['rl']['t']

	# add perts as mat_info
	############################
	print('\nperts')

	net.dat['mat_info'] = {}

	# initialize mat_info 
	for i in range(len(net.dat['nodes']['row'])):
		for j in range(len(net.dat['nodes']['col'])):
			tmp_tuple = str((i,j))

			# initialize info 
			net.dat['mat_info'][tmp_tuple] = {}

	for inst_pert in ldr['perts']:

		pert_data = ldr['perts'][inst_pert]

		inst_pert = literal_eval(inst_pert)
		# assay
		inst_row = inst_pert[0]
		# cell line 
		inst_col = inst_pert[1]

		# assay
		index_row = net.dat['nodes']['row'].index(inst_row)
		# cell line 
		index_col = net.dat['nodes']['col'].index(inst_col)

		# save to mat_info (tmp_tuple is already a string key)
		tmp_tuple = str((index_row, index_col))
		net.dat['mat_info'][tmp_tuple] = pert_data

	# filter the matrix using cutoff and min_num_meet
	###################################################
	# filtering matrix 
	cutoff_meet = 1
	min_num_meet = 1
	net.filter_network_thresh( cutoff_meet, min_num_meet )

	# cluster 
	#############
	cutoff_comp = 3
	min_num_comp = 4
	net.cluster_row_and_col('cos', cutoff_comp, min_num_comp, dendro=False)

	# export data visualization to file 
	######################################
	net.write_json_to_file('viz', 'static/networks/LDR_as_cl_released_only.json','indent')
def merge_sigs_to_mat():
    import json_scripts
    import numpy as np
    import pandas as pd

    tmp_exp_sigs = json_scripts.load_to_dict('proc_data/exp-pert_sigs.json')

    exp_sigs = {}
    for inst_sig in tmp_exp_sigs:

        if 'CD34' not in inst_sig:
            exp_sigs[inst_sig] = tmp_exp_sigs[inst_sig]

    all_sigs = sorted(exp_sigs.keys())

    num_sigs = len(all_sigs)

    print('num_sigs: ' + str(num_sigs))

    # collect all genes across all experimental signatures
    all_genes = []

    for sig_name in exp_sigs:

        inst_sig = exp_sigs[sig_name]

        for inst_gene in inst_sig:

            # fix Excel-mangled SEPT gene names (e.g. '9-SEP' -> 'SEPT9')
            if '-SEP' in inst_gene:
                inst_num = inst_gene.split('-')[0]
                inst_gene = 'SEPT' + inst_num

            if inst_gene != '-':
                all_genes.append(inst_gene)

    print(len(all_genes))
    all_genes = sorted(list(set(all_genes)))
    print(len(all_genes))

    num_genes = len(all_genes)

    print('there are ' + str(num_genes) + ' unique genes')

    mat = np.zeros([num_genes, num_sigs])

    # fill in the matrix
    for sig_name in exp_sigs:

        inst_sig = exp_sigs[sig_name]

        col_index = all_sigs.index(sig_name)

        for inst_gene in inst_sig:

            inst_value = inst_sig[inst_gene]

            # apply the same SEPT gene-name fix used above so lookups match
            if '-SEP' in inst_gene:
                inst_gene = 'SEPT' + inst_gene.split('-')[0]

            # fill in the matrix; genes not in all_genes (e.g. '-') are skipped
            if inst_gene in all_genes and inst_value != False:
                row_index = all_genes.index(inst_gene)
                mat[row_index, col_index] = inst_value

    # save as dataframe
    df = pd.DataFrame(data=mat, columns=all_sigs, index=all_genes)

    df.to_csv('proc_data/exp-pert_sigs.txt', sep='\t')
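
# The exp-pert_sigs.json input above is assumed to map signature names to
# {gene: value} dicts, e.g. {"sig_1": {"TP53": 1, "EGFR": -1}}; the output is
# a genes x signatures tab-separated matrix.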
def make_ccle_matrix_subset():
    '''
    This will save a subset of the downsampled matrix using the proteins of
    interest.
    '''
    from clustergrammer import Network
    import json_scripts

    print('-- load CCLE downsampled data')

    # load downsampled CCLE data
    net = Network()
    net.load_file('CCLE/CCLE_kmeans_ds_col_100.txt')

    df = net.export_df()

    # load proteins of interest
    filename = 'proteins_of_interest/proteins_of_interest.json'
    poi = json_scripts.load_to_dict(filename)

    all_poi = []
    for inst_type in poi:
        all_poi.extend(poi[inst_type])

    # only keep pois that are found in the CCLE
    all_genes = df.index.tolist()

    found_poi = list(set(all_genes) & set(all_poi))

    num_found_poi = len(found_poi)

    print(
        str(num_found_poi) +
        ' proteins of interest were found in the CCLE data')

    # filter dataframe using row list (transpose and transpose-back)
    ##################################################################
    df = df.transpose()
    df = df[found_poi]
    df = df.transpose()

    # save version without protein categories (e.g. kinase)
    df.to_csv('CCLE/CCLE_kmeans_ds_col_100_poi_no_cats.txt', sep='\t')

    row_cats = []

    for inst_gene in found_poi:

        # add protein type to gene names
        found_type = ''
        for inst_type in poi:

            if inst_gene in poi[inst_type]:
                found_type = inst_type

        gene_name = 'gene: ' + inst_gene
        cat_name = 'type: ' + found_type
        inst_tuple = (gene_name, cat_name)

        row_cats.append(inst_tuple)

    # redefine index
    df.index = row_cats

    print('-- save matrix with proteins_of_interest subset')
    df.to_csv('CCLE/CCLE_kmeans_ds_col_100_poi.txt', sep='\t')
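
# Note: the (name, category) tuple index, e.g. ('gene: EGFR', 'type: KIN'),
# is written by pandas as a multi-level row header, which clustergrammer can
# read as row names plus a row category.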
def make_ldr_clust():
    import json_scripts
    import numpy as np
    import d3_clustergram

    # load LDR data
    ldr = json_scripts.load_to_dict('ldr_mat.json')

    print(ldr.keys())

    ldr['mat'] = np.asarray(ldr['mat'])
    ldr['rl']['t'] = np.asarray(ldr['rl']['t'])
    ldr['rl']['f'] = np.asarray(ldr['rl']['f'])

    print('sum all \t' + str(np.sum(ldr['mat'])))
    print('sum yes \t' + str(np.sum(ldr['rl']['t'])))
    print('sum no  \t' + str(np.sum(ldr['rl']['f'])))

    print(len(ldr['nodes']['as']))
    print(len(ldr['nodes']['cl']))
    print(ldr['mat'].shape)

    # define nodes: unfiltered
    nodes_uf = {}
    nodes_uf['row'] = ldr['nodes']['as']
    nodes_uf['col'] = ldr['nodes']['cl']

    # define parameters
    compare_cutoff = 0.05
    min_num_compare = 2

    # filter to remove nodes with no values
    ldr['mat'], nodes = d3_clustergram.filter_sim_mat(ldr['mat'], nodes_uf, 1,
                                                      1)
    # cherry-pick using the nodes
    ldr['rl']['t'] = d3_clustergram.cherrypick_mat_from_nodes(
        nodes_uf, nodes, ldr['rl']['t'])
    ldr['rl']['f'] = d3_clustergram.cherrypick_mat_from_nodes(
        nodes_uf, nodes, ldr['rl']['f'])

    print('size all \t' + str(ldr['mat'].shape))
    print('size yes \t' + str(ldr['rl']['t'].shape))
    print('size no  \t' + str(ldr['rl']['f'].shape))
    print('\n')

    print('sum all \t' + str(np.sum(ldr['mat'])))
    print('sum yes \t' + str(np.sum(ldr['rl']['t'])))
    print('sum no  \t' + str(np.sum(ldr['rl']['f'])))
    print('total yes/no:\t' +
          str(np.sum(ldr['rl']['t']) + np.sum(ldr['rl']['f'])))

    print('\n\n\n')
    # print out nodes
    for inst_row in nodes['row']:
        print(inst_row)

    print('\n\n\n')

    # cluster rows and columns
    print('calculating clustering')
    clust_order = d3_clustergram.cluster_row_and_column(
        nodes, ldr['mat'], 'cosine', compare_cutoff, min_num_compare)

    print('finished calculating clustering')

    # write the d3_clustergram
    base_path = 'static/networks/'
    full_path = base_path + 'LDR_as_cl.json'

    # add class information
    row_class = {}
    col_class = {}

    print(len(nodes['row']))
    print(len(nodes['col']))

    # # last minute cleaning up of row/col names
    # for i in range(len(nodes['col'])):
    # 	nodes['col'][i] = nodes['col'][i].replace('/ single drugs','')
    # for i in range(len(nodes['row'])):
    # 	nodes['row'][i] = nodes['row'][i].replace('cell lines','')

    # write the clustergram
    d3_clustergram.write_json_single_value(nodes, clust_order, ldr, full_path,
                                           row_class, col_class)
def add_grant_num_to_clust():
	import json_scripts
	import numpy as np

	print('\n-----------------\nadding grant numbers\n-----------------\n')

	# load json of Andrew data
	data_json = json_scripts.load_to_dict('andrew_data/cumul_probs.json')

	print( '\nthere are ' + str(len(data_json['nodes']['row'])) + ' genes in total' )
	print( 'there are ' + str(len(data_json['nodes']['col'])) + ' resources in total\n' )

	data_mat = np.asarray(data_json['data_mat'])

	print('data_mat shape')
	print(data_mat.shape)

	print('\ngoing to add grants per gene as a column into the harmonogram\n')

	# make an array of zeros that will be added to the matrix as a new column 
	num_rows = len(data_json['nodes']['row'])
	extra_col = np.zeros([ num_rows, 1 ])
	# #!! temporarily switch to ones instead of zeros
	# extra_col = np.ones([ num_rows, 1 ])

	print('extra col shape')
	print(extra_col.shape)
	print(extra_col)

	# add the column using hstack
	data_mat = np.hstack((data_mat, extra_col))

	print('data_mat shape after adding in extra column')
	print(data_mat.shape)

	# does not need to be done here 
	######################
	# # add extra resource name 
	# data_json['nodes']['col'].append('Grants_Per_Gene')

	print( 'there are ' + str(len(data_json['nodes']['col'])) + ' resources in total after adding grants per gene\n' )
	# print(data_json['nodes']['col'])

	# add grants data to data_mat
	###############################
	# load grants_per_gene data
	grants_gene = json_scripts.load_to_dict('andrew_data/grants_per_gene.json')

	# make list of genes that were not found
	genes_not_found = []
	genes_found = []

	# loop through genes and add grant information into data_mat 
	for inst_gene in grants_gene:

		# get the index if the gene is in rows 
		if inst_gene in data_json['nodes']['row']:

			# get the index of inst_gene
			inst_index = data_json['nodes']['row'].index(inst_gene)

			# print(inst_index)

			# keep track of found genes 
			genes_found.append(inst_gene)

			# save CumulProbWeightSum to the matrix 
			inst_grants = grants_gene[inst_gene]['CumulProbWeightSum']

			# save the number of grants to the last column  
			# data_mat[inst_index,-1] = inst_grants
			## put in fake data
			data_mat[inst_index,82] = 1 #inst_grants

		else:
			# keep track of not found genes 
			genes_not_found.append(inst_gene)

	# print(len(genes_found))
	# print(len(genes_not_found))
	# print(data_mat.shape)


	print('\n-------------\nchecking data_mat\n----------------\n')
	print(len(data_mat[:,-1]))
	print(data_mat[inst_index,82])
	print(data_mat[2,82])

	# convert data_mat to list
	data_mat = data_mat.tolist()

	# add back to json 
	data_json['data_mat'] = data_mat

	# save to json 
	json_scripts.save_to_json(data_json, 'andrew_data/cumul_probs.json', 'no_indent')
def load_andrew_data():
	import json_scripts
	import numpy as np

	# load resource classes
	load_resource_classes()

	# load resource mapping names 
	load_resource_real_names()

	# load Andrew's data 
	matrix = json_scripts.load_to_dict('andrew_data/gene_dataset_cumulprobs_20150609.json')

	# add grants data to data_mat
	###############################
	# load grants_per_gene data
	grants_gene = json_scripts.load_to_dict('andrew_data/grants_per_gene.json')

	# only keep the resources with real names 
	rn = json_scripts.load_to_dict('resource_real_names.json')

	# Andrew data format 
	######################
	# matrix is a list of dictionaries 
	# each element of the list has a dictionary with two keys: label and entries
	# the first element of the list describes the columns of the matrix - label: n.a., entries: resources 
	# the rest of the rows have gene names and the value of the gene in each resource  
	# I will convert Andrew's data into 
	# nodes and data_mat 
	print('\nstarting to process data')

	# save row and column data to nodes 
	nodes = {}
	# initialize a list of genes 
	nodes['row'] = []
	# get the good resources - get the real names
	# (list() is needed in Python 3, where dict.values() has no .index())
	nodes['col'] = list(rn.values())

	# save the column index of grants per gene 
	col_index_grant = nodes['col'].index('Grants_Per_Gene')

	# print('\nlength of nodes col')
	# print(len(nodes['col']))
	# print('\n')

	# get the number of rows in the matrix
	# make the matrix smaller by one row 
	# num_rows = len(matrix) 
	num_rows = len(matrix) - 1

	# print('\nmatrix:')
	# print(matrix[0]['label'])
	# print(matrix[1]['label'])
	# print(matrix[2]['label'])
	# print('...')
	# print(matrix[-2]['label'])
	# print(matrix[-1]['label'])
	# print('\n')

	# print('there are '+str(num_rows)+' genes in the original data from Andrew')

	# initialize data matrix
	# rows - genes
	# cols - good resources 
	data_mat = np.zeros([ num_rows, len(rn.keys()) ])

	print('\n---------------\nadding original data to matrix\n----------------')

	# loop through the list 
	# add one to account for the full length of the matrix
	for i in range(num_rows+1):

		# get the inst row of the matrix 
		inst_row = matrix[i]

		# grab the gene name 
		inst_name = inst_row['label']

		# grab the list of entries - the actual numerical data 
		inst_entries = inst_row['entries'] 

		# gather the resource names 
		if i == 0:

			# gather all resource (columns) 
			all_res = inst_row['entries']

		# skip the first line - it has column information
		if i > 0:

			# save to nodes['row']
			nodes['row'].append(inst_name)

			# only add data from good resources
			######################################

			# save values to matrix 
			for j in range(len(inst_entries)):

				# only add data from good resources 
				if all_res[j] in rn:

					# get the inst 
					inst_data_point = inst_entries[j]

					# get the resource index in the list of good resources - nodes['col']
					# translate the long name (with underscores) to the real name 
					inst_index = nodes['col'].index( rn[all_res[j]] )

					# fill in the matrix with the entries from row i,
					# shifting the row index back one to account for the
					# first row of column labels
					matrix_index = i-1
					data_mat[matrix_index,inst_index] = inst_data_point

	print('\n---------------\nadding grants to matrix\n----------------')

	# add grants per gene to matrix
	##################################
	for inst_gene in grants_gene:

		# get the index of the gene if it is in the original rows 
		if inst_gene in nodes['row']:

			# get the index of inst_gene
			inst_index = nodes['row'].index(inst_gene)

			# get the number of grants
			inst_grants = grants_gene[inst_gene]['CumulProbWeightSum']

			# save the number of grants to the appropriate column
			data_mat[inst_index,col_index_grant] = inst_grants

	# print('i '+str(i))
	# print('\n')			
	# print('shape of data_mat after filling in ')
	# print(data_mat.shape)
	# print('\n')			
	# print('length of nodes row')
	# print(len(nodes['row']))
	# print('nodes')
	# print(nodes['row'][0])
	# print(nodes['row'][-1])
	# print('\n')

	# save json of the numpy-ready data 
	#
	# convert numpy array to list 
	data_mat = data_mat.tolist()

	# make one dictionary 
	inst_dict = {}
	inst_dict['nodes'] = nodes
	inst_dict['data_mat'] = data_mat 

	# save to json 
	json_scripts.save_to_json(inst_dict,'andrew_data/cumul_probs.json','no_indent')
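
# Based on the format notes above, the input JSON is assumed to be a list of
# {label, entries} rows, with the first row holding the column labels:
# [
#   {'label': 'n.a.',  'entries': ['Resource_A', 'Resource_B', ...]},
#   {'label': 'GENE1', 'entries': [0.13, 0.0, ...]},
#   ...
# ]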
def generate_d3_json():
	import json_scripts
	import d3_clustergram
	import numpy as np

	print('loading json in generate_d3_json')
	# load saved json of andrew data 
	data_json = json_scripts.load_to_dict('andrew_data/cumul_probs.json')

	# get nodes and data_mat 
	nodes = data_json['nodes']
	data_mat = np.asarray(data_json['data_mat'])

	print(nodes['col'])
	print(data_mat.shape)

	print('calculating clustering orders')

	# gene and resource classes 
	################################# 
	# gene class 
	gc = json_scripts.load_to_dict('gene_classes_harmonogram.json')
	# resource class 
	rc = json_scripts.load_to_dict('resource_classes_harminogram.json')

	# loop through classes
	for inst_class in gc:

		print(inst_class + '\n')

		# initialize class matrix 
		# class_mat is the subset of data_mat that only has genes of one class, e.g. kinases
		class_mat = np.array([])

		# initialize class_nodes for export 
		class_nodes = {}
		class_nodes['col'] = nodes['col']
		class_nodes['row'] = []

		# loop through the rows and check if they are in the class
		for i in range(len(nodes['row'])):

			# get the index 
			inst_gs = nodes['row'][i]

			# check if in class list 
			if inst_gs in gc[inst_class]:

				# append gene symbol name to row 
				class_nodes['row'].append(inst_gs)

				# initialize class_mat if necessary
				if len(class_mat) == 0:
					class_mat = data_mat[i,:]
				else:

					# fill in class_mat
					class_mat = np.vstack( (class_mat, data_mat[i,:] ))  


		# actual clustering 
		########################
		# cluster the matrix, return clust_order
		clust_order = d3_clustergram.cluster_row_and_column( class_nodes, class_mat, 'cosine' )

		# # mock clustering
		# ############################
		# print('mock clustering')
		# clust_order = {}
		# # mock cluster 
		# clust_order['clust'] = {}
		# clust_order['clust']['row'] = range(len(class_nodes['row']))
		# clust_order['clust']['col'] = range(len(class_nodes['col']))
		# # mock rank 
		# clust_order['rank'] = {}
		# clust_order['rank']['row'] = range(len(class_nodes['row']))
		# clust_order['rank']['col'] = range(len(class_nodes['col']))

		print('generating d3 json')

		# generate d3_clust json: return json 
		d3_json = d3_clustergram.d3_clust_single_value(class_nodes, clust_order, class_mat )

		# add extra information (data_group) to d3_json - add resource class to d3_json['col_nodes']
		###############################################################################################
		# loop through col_nodes
		for inst_col in d3_json['col_nodes']:

			# get the inst_res
			inst_res = inst_col['name']

			# add the resource-class - data_group
			inst_col['data_group'] = rc[ inst_res ]['data_group'].replace(' ','_')

		# add extra link information about grant: this will be used to color the grant links externally 
		# from the d3_clustergram code 
		for inst_link in d3_json['links']:

			inst_link['info'] = 0

			if d3_json['col_nodes'][inst_link['target']]['name'] == 'Grants_Per_Gene':

				inst_link['info'] = 1

		print('saving to disk')

		# save visualization json 
		json_scripts.save_to_json(d3_json,'static/networks/'+inst_class+'_cumul_probs.json','no_indent')
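
# From the usage above, the d3_json object returned by d3_clust_single_value
# is assumed to contain 'col_nodes' (dicts with a 'name') and 'links' (dicts
# whose 'target' indexes into col_nodes), which is why the Grants_Per_Gene
# column can be flagged through its node name.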