def load_resource_real_names():
    '''
    Build the resource -> real-name lookup and save it as JSON.

    Reads the tab-separated file 'andrew_data/resource_mapping_names.txt'.
    Only lines that split into exactly two tab-separated fields are kept:
    the first field becomes the key and the second field, with spaces
    replaced by underscores, becomes the value. The resulting dictionary is
    handed to json_scripts.save_to_json with the target
    'resource_real_names.json'.
    '''
    import json_scripts
    print('loading resource real names')

    filename = 'andrew_data/resource_mapping_names.txt'

    # dictionary of real resource names
    rn = {}

    # the context manager closes the file even if reading fails part-way
    with open(filename, 'r') as f:
        for inst_line in f:

            # clean the line and split it into its tab-separated fields
            fields = inst_line.strip().split('\t')

            # if there is a real name, keep the resource
            if len(fields) == 2:
                # real names are stored without spaces
                rn[fields[0]] = fields[1].replace(' ', '_')

    # save dictionary to json
    json_scripts.save_to_json(rn, 'resource_real_names.json', 'indent')
def main():
    '''
    Download the Enrichr library metadata and save the active library names.

    Fetches the JSON served by the Enrichr 'geneSetLibrary?mode=meta'
    endpoint, collects the 'libraryName' of every entry of 'libraries'
    whose 'isActive' value equals True, and saves {'names': [...]} through
    json_scripts.save_to_json with the target 'gmt_names.json'.
    '''
    import json
    import json_scripts
    from contextlib import closing

    # urllib2 only exists on Python 2; fall back to its Python 3 successor
    try:
        from urllib2 import urlopen
    except ImportError:
        from urllib.request import urlopen

    # make a get request to get the gmt names and meta data from Enrichr;
    # closing() releases the connection once the body has been read
    url = 'http://amp.pharm.mssm.edu/Enrichr/geneSetLibrary?mode=meta'
    with closing(urlopen(url)) as x:
        response = x.read()
    gmt_data = json.loads(response)

    # local version
    # gmt_data = json_scripts.load_to_dict('enrichr_gmts.json')

    # only include active gmts
    gmt_names = [
        inst_gmt['libraryName']
        for inst_gmt in gmt_data['libraries']
        if inst_gmt['isActive'] == True
    ]

    inst_dict = {}
    inst_dict['names'] = gmt_names

    # save json with list of gmt names
    json_scripts.save_to_json(inst_dict, 'gmt_names.json', 'noindent')
def load_resource_classes():
    '''
    Parse the resource class table and save it as JSON.

    Reads the tab-separated file 'andrew_data/resource_classes.txt'. The
    first row is treated as a header and skipped. For every other non-blank
    row, column 1 (0-based) with spaces replaced by underscores is used as
    the key, and columns 2-7 are stored under 'description', 'data_type',
    'data_group', 'association', 'attribute_type' and 'attribute_group'.
    The result is handed to json_scripts.save_to_json with the target
    'resource_classes_harminogram.json'.

    Raises IndexError if a data row has fewer than eight columns.
    '''
    import json_scripts
    print('loading resource classes')

    filename = 'andrew_data/resource_classes.txt'

    # output field name -> column index within a row
    columns = (
        ('description', 2),
        ('data_type', 3),
        ('data_group', 4),
        ('association', 5),
        ('attribute_type', 6),
        ('attribute_group', 7),
    )

    # add the information into a dictionary
    rc = {}

    # the context manager closes the file even if parsing fails
    with open(filename, 'r') as f:
        for i, raw_line in enumerate(f):

            # the first row only holds the column titles
            if i == 0:
                continue

            # drop only the line terminator: a plain strip() would also eat
            # empty trailing columns, while no stripping at all would leave
            # the newline inside the last column
            raw_line = raw_line.rstrip('\r\n')

            # blank lines (e.g. at the end of the file) carry no record
            if not raw_line.strip():
                continue

            # get a list of line components
            inst_line = raw_line.split('\t')

            # get resource name - no spaces
            inst_name = inst_line[1].replace(' ', '_')

            # initialize dictionary and copy the class columns
            rc[inst_name] = {}
            for key, col in columns:
                rc[inst_name][key] = inst_line[col]

    # save resource classes
    json_scripts.save_to_json(rc, 'resource_classes_harminogram.json', 'indent')
Example #4
0
def load_sigs_to_json():
    '''
    Collect the signature text files into one dictionary and save it as JSON.

    Globs 'files_2-17-2017/hdf_day*.txt' and 'files_2-17-2017/Pert*.txt'.
    The signature name is the file name up to the first '.txt', cut at the
    first '_chdir'. Each non-blank line is split on ',' and stored as
    first field -> second field (the value is kept as a string). The result
    is handed to json_scripts.save_to_json with the target
    'proc_data/exp-pert_sigs.json'.

    Raises IndexError if a non-blank line has no ',' separated second field.
    '''
    import glob
    import os
    # imported locally, as the other loaders in this file do
    import json_scripts

    print('load')

    # normal files
    file_names = glob.glob('files_2-17-2017/hdf_day*.txt')

    pert_files = glob.glob('files_2-17-2017/Pert*.txt')

    file_names = file_names + pert_files

    print('\n\n')

    print(file_names)

    print('\n\n')

    # store all signatures in a dictionary
    exp_sigs = {}

    for inst_filename in file_names:

        # basename() instead of split('/') so the platform separator is used
        base_name = os.path.basename(inst_filename)
        inst_sig = base_name.split('.txt')[0].split('_chdir')[0]

        # initialize dictionary for signature
        exp_sigs[inst_sig] = {}

        # the context manager closes the file even if a line fails to parse
        with open(inst_filename, 'r') as f:
            for inst_line in f:
                inst_line = inst_line.strip()

                # blank lines carry no gene/value pair
                if not inst_line:
                    continue

                fields = inst_line.split(',')

                inst_gene = fields[0]
                inst_value = fields[1]

                exp_sigs[inst_sig][inst_gene] = inst_value

    json_scripts.save_to_json(exp_sigs,
                              'proc_data/exp-pert_sigs.json',
                              indent='indent')
def main():
    '''
    Merge the older Hgram gene lists with the 'dark' gene lists and save them.

    Part of the work on similarity matrices for KIN, IC and GPCR genes based
    on Hzome data: the old (Hgram) gene lists are combined with the latest
    'dark' genes from the KMC 2017 grant and written to a new JSON for later
    use (similarity matrices and their visualization come afterwards).

    For each of 'KIN', 'IC' and 'GPCR' the saved entry holds
      'all'  - sorted unique union of the Hgram list and the dark list
      'dark' - sorted unique dark genes
    '''
    import json_scripts

    hgram_info = json_scripts.load_to_dict(
        '../harmonogram_classes/gene_classes_harmonogram.json')

    grant_poi = json_scripts.load_to_dict(
        '../grant_pois/proteins_of_interest.json')

    # merged gene info, keyed by gene type
    gene_info = {}

    for gene_type in ('KIN', 'IC', 'GPCR'):

        # dark genes are added on top of the Hgram list
        dark_raw = grant_poi[gene_type]
        merged_raw = hgram_info[gene_type] + dark_raw

        dark_sorted = sorted(set(dark_raw))
        merged_sorted = sorted(set(merged_raw))

        print(gene_type)
        print('all: ' + str(len(merged_sorted)))
        print('dark: ' + str(len(dark_sorted)))

        # number of dark genes missing from the merged list
        print(len(set(dark_sorted).difference(merged_sorted)))

        gene_info[gene_type] = {}
        gene_info[gene_type]['all'] = merged_sorted
        gene_info[gene_type]['dark'] = dark_sorted

        print('\n\n')

    json_scripts.save_to_json(gene_info,
                              '../grant_pois/gene_info_with_dark.json',
                              indent='indent')
def load_grants_per_gene():
    '''
    Convert the grants-per-gene table into a nested dictionary and save it.

    Reads the tab-separated file
    'andrew_data/grantspergene_weighted_standardized.txt'. The first line
    supplies the column names. Every other non-blank line becomes
    grant_gene[<first field>] = {<column name>: float(<field>)} for all
    columns but the first. The result is handed to json_scripts.save_to_json
    with the target 'andrew_data/grants_per_gene.json'.

    Raises IndexError if the file is empty or a row has fewer fields than
    the header, and ValueError if a field is not a number.
    '''
    import json_scripts

    filename = 'andrew_data/grantspergene_weighted_standardized.txt'

    # the context manager closes the file even if reading fails
    with open(filename, 'r') as f:
        lines = f.readlines()

    # grab the column names
    col_names = lines[0].strip().split('\t')

    # initialize dictionary
    grant_gene = {}

    # every line after the header holds the data for one gene
    for raw_line in lines[1:]:

        inst_line = raw_line.strip()

        # blank lines (e.g. a trailing newline at end of file) hold no gene
        if not inst_line:
            continue

        inst_line = inst_line.split('\t')

        # get gene name
        inst_name = inst_line[0]

        # initialize dictionary for gene
        grant_gene[inst_name] = {}

        # save column name as dictionary key, skipping the gene-name column
        for j, col_name in enumerate(col_names):
            if j > 0:
                grant_gene[inst_name][col_name] = float(inst_line[j])

    # save to json
    json_scripts.save_to_json(grant_gene, 'andrew_data/grants_per_gene.json', 'indent')
def make_protein_dictionary():
    '''
    Collect the proteins-of-interest name lists and store them as JSON.

    Calls load_names once for each of 'kinase', 'gpcr' and 'ion_channel',
    keys the results by that type and hands the dictionary to
    json_scripts.save_to_json with the target
    'proteins_of_interest/proteins_of_interest.json'.
    '''
    print('-- generate dictionary with protein names')

    import json_scripts

    protein_types = ['kinase', 'gpcr', 'ion_channel']

    # one entry per protein type, loaded in the order listed above
    poi = {inst_type: load_names(inst_type) for inst_type in protein_types}

    json_scripts.save_to_json(
        poi,
        'proteins_of_interest/proteins_of_interest.json',
        indent='indent')
def main():
    '''
    Turn the example gene list text file into a JSON gene list.

    Reads 'example_gene_50.txt' (one gene per line), strips and upper-cases
    every name, drops blank entries and duplicates, and saves
    {'genes': [...]} through json_scripts.save_to_json with the target
    'example_gene_50.json'. The names are sorted so that repeated runs
    produce the same file.
    '''
    import json_scripts

    # load gene list text file; the context manager closes it on any exit
    filename = 'example_gene_50.txt'
    with open(filename, 'r') as f:
        genes_text = f.readlines()

    # clean gene names
    cleaned = [d.strip().upper() for d in genes_text]

    # remove blanks (empty lines are not genes) and duplicates; sorting
    # replaces the arbitrary set order with a reproducible one
    genes_text = sorted(set(gene for gene in cleaned if gene))

    print(len(genes_text))

    # generate dictionary
    example_list = {}
    example_list['genes'] = genes_text

    # save to json
    json_scripts.save_to_json(example_list, 'example_gene_50.json', 'no_indent')
def construct_array():
    '''
    Build the assay x cell-line perturbation count matrices and save them.

    Loads the LDR dataset list ('LDR/LDR_api.json') and the name lookup
    ('as_cl_dict.json'), maps every dataset to an assay and to its cell
    lines, and counts perturbations per (assay, cell line) pair.

    Counting rule: a perturbagen name that contains 'compounds' but not
    'among' is read as '<N> ...' and counts N; any other name counts 1
    (a parsed N of 0 also counts 1).

    Saved through json_scripts.save_to_json to 'ldr_mat.json':
      'nodes' - {'as': sorted assay names, 'cl': sorted cell-line names
                 plus 'cell-free'}
      'mat'   - counts per (assay, cell line), as nested lists
      'rl'    - the same counts split by dataset['released'] == True ('t')
                or not ('f')
      'perts' - str((assay, cell line)) -> list of
                {'name', 'release', '_id'} dictionaries
    '''
    import json_scripts
    # numpy.zeros replaces scipy.zeros, a deprecated alias of the same function
    import numpy as np

    print('\nconstructing array\n')

    # load the LDR data in json format
    ldr = json_scripts.load_to_dict('LDR/LDR_api.json')

    # load cl and as dictionary
    as_cl_dict = json_scripts.load_to_dict('as_cl_dict.json')

    # get nodes from 'short name' dictionary values
    nodes = {}
    nodes['as'] = sorted(set(as_cl_dict['as'].values()))
    # add cell-free to list of cell lines
    nodes['cl'] = sorted(list(set(as_cl_dict['cl'].values())) + ['cell-free'])

    num_as = len(nodes['as'])
    num_cl = len(nodes['cl'])

    # make 2d matrix for now
    mat = np.zeros([num_as, num_cl])

    # generate two released matrices
    rl = {}
    rl['t'] = np.zeros([num_as, num_cl])
    rl['f'] = np.zeros([num_as, num_cl])

    # perturbation information per assay / cell line pair
    perts = {}

    # loop through the ldr datasets
    for inst_ldr in ldr:

        # get the inst_assay: put name through dictionary
        inst_as = as_cl_dict['as'][inst_ldr['datasetName'].strip()]

        # get the cell line(s)
        inst_cls = []
        for inst_cl in inst_ldr['metadata']['cellLines']:
            if 'name' in inst_cl:
                cl_name = inst_cl['name'].strip()
                #!! remove cell line 'TBD among cell ...'
                if 'TBD among' not in cl_name:
                    inst_cls.append(as_cl_dict['cl'][cl_name])

        # get the perturbations
        inst_pts = [
            inst_pt['name'].strip()
            for inst_pt in inst_ldr['metadata']['perturbagens']
        ]

        # if the assay is kinomescan then set cell line to 'cell-free'
        if inst_as == 'KINOMEscan':
            inst_cls.append('cell-free')

        # get index of assay
        index_as = nodes['as'].index(inst_as)

        # loop through cell lines
        for inst_cl in inst_cls:

            # get the index of the cell line
            index_cl = nodes['cl'].index(inst_cl)

            for inst_pt in inst_pts:

                # check if the perturbation represents multiple perturbations
                if 'compounds' in inst_pt and 'among' not in inst_pt:
                    mult_pts = int(inst_pt.split(' ')[0])
                else:
                    mult_pts = 0

                # a single perturbation counts once, a grouped one N times
                increment = mult_pts if mult_pts != 0 else 1

                # track the number of perturbations and the released status
                mat[index_as, index_cl] += increment
                if inst_ldr['released'] == True:
                    rl['t'][index_as, index_cl] += increment
                else:
                    rl['f'][index_as, index_cl] += increment

                # keep track of perturbation information in the dictionary;
                # the key is the string form of the tuple because the result
                # is saved as JSON
                inst_tuple = str((inst_as, inst_cl))
                if inst_tuple not in perts:
                    perts[inst_tuple] = []

                pert_dict = {}
                pert_dict['name'] = inst_pt
                pert_dict['release'] = inst_ldr['released']
                pert_dict['_id'] = inst_ldr['_id']
                perts[inst_tuple].append(pert_dict)

    # check perts dictionary
    print('perts dictionary - the number of found as/cl combinations')
    print(len(perts))

    # convert the matrices to nested lists so they can be saved as JSON
    rl['t'] = rl['t'].tolist()
    rl['f'] = rl['f'].tolist()

    ldr_mat = {}
    ldr_mat['nodes'] = nodes
    ldr_mat['mat'] = mat.tolist()
    ldr_mat['rl'] = rl
    ldr_mat['perts'] = perts

    json_scripts.save_to_json( ldr_mat, 'ldr_mat.json', 'no-indent' )
def assay_cl_dict():
    '''
    Build the assay / cell-line name lookup and save it as JSON.

    Reads 'LDR/assays_and_cl_lists_for_Avi-AM.txt'. A line containing
    'assays:' switches to the assay section ('as'), a line containing
    'cell lines:' switches to the cell-line section ('cl'). Within a
    section, a line with a tab is split into <short name>\\t<long name> and
    stored as long name -> short name; any other non-empty line is stored
    as line -> line. The result {'as': {...}, 'cl': {...}} is handed to
    json_scripts.save_to_json with the target 'as_cl_dict.json'.
    '''
    import json_scripts

    # the context manager closes the file even if reading fails
    with open('LDR/assays_and_cl_lists_for_Avi-AM.txt', 'r') as f:
        lines = f.readlines()

    # make names dictionary
    names = {}
    names['as'] = {}
    names['cl'] = {}

    # section currently being read: '' (none yet), 'as' or 'cl'
    inst_data = ''

    # loop through the lines
    for inst_line in lines:

        # strip the line
        inst_line = inst_line.strip()

        if 'assays:' in inst_line:
            inst_data = 'as'

        if 'cell lines:' in inst_line:
            inst_data = 'cl'

        # nothing to store before the first section header or for empty lines
        if not inst_data or not inst_line:
            continue

        # NOTE(review): the section header line itself reaches this point and
        # is stored like a regular entry - confirm whether that is intended.

        # assays and cell lines share the same line format
        if '\t' in inst_line:
            # there is a short name: map the long name to it
            fields = inst_line.split('\t')
            inst_sn = fields[0]
            inst_ln = fields[1]
            names[inst_data][inst_ln] = inst_sn
        else:
            # if there is no short name add long name as key and value
            names[inst_data][inst_line] = inst_line

    json_scripts.save_to_json(names, 'as_cl_dict.json', 'indent')
Example #11
0
def write_json_single_value(nodes, clust_order, mat, full_path, row_class={}, col_class={}, link_hl={} ):
    '''
    Assemble the clustergram JSON (row nodes, col nodes, links) and save it.

    nodes       - {'row': [...], 'col': [...]} node names
    clust_order - holds ['clust'][axis] (searched with .index(position)),
                  ['rank'][axis][position] and
                  ['group'][axis][cutoff][position] for the cutoffs
                  0.0, 0.1, ..., 1.0
    mat         - matrix indexed as mat[i, j]; entries with abs() > 0
                  become links
    full_path   - target passed to json_scripts.save_to_json
    row_class / col_class - node name -> class; looked up for every node,
                  so a missing name raises KeyError
    link_hl     - optional col name -> collection of row names; matching
                  links get 'highlight' = 1

    Each node also gets a placeholder 'value' from random.random(); the
    global random generator is re-seeded with a fixed seed first.
    '''
    import json
    import json_scripts
    import d3_clustergram
    import random

    # initialize dict
    d3_json = d3_clustergram.ini_d3_json()

    # generate distance cutoffs
    all_dist = [float(step)/10 for step in range(11)]

    #!! generate tmp classes
    random.seed(122341)

    def make_node(axis, position, class_lookup):
        # one node dictionary for the row or col at the given position
        label = nodes[axis][position]
        node = {}
        node['name'] = label
        node['clust'] = clust_order['clust'][axis].index(position)
        # do not need to get index
        node['rank'] = clust_order['rank'][axis][position]
        # group data for the different cutoffs
        node['group'] = [
            float(clust_order['group'][axis][cutoff][position])
            for cutoff in all_dist
        ]
        # value for bar
        node['value'] = random.random()
        # class information
        node['class'] = class_lookup[label]
        return node

    # rows first, then columns, so the random values are drawn in that order
    for row_pos in range(len(nodes['row'])):
        d3_json['row_nodes'].append(make_node('row', row_pos, row_class))

    for col_pos in range(len(nodes['col'])):
        d3_json['col_nodes'].append(make_node('col', col_pos, col_class))

    # links - generate edge list
    for src in range(len(nodes['row'])):
        for tgt in range(len(nodes['col'])):

            # only non-zero entries become links
            if not (abs(mat[src, tgt]) > 0):
                continue

            link = {}
            link['source'] = src
            link['target'] = tgt
            link['value'] = mat[src, tgt]

            # initialize with no highlight
            link['highlight'] = 0

            # highlight when the row name is listed under the col name
            if len(link_hl) > 0:
                col_name = nodes['col'][tgt]
                if col_name in link_hl and nodes['row'][src] in link_hl[col_name]:
                    link['highlight'] = 1

            d3_json['links'].append(link)

    # write json
    json_scripts.save_to_json(d3_json, full_path, 'noindent')
Example #12
0
def add_grant_num_to_clust():
    '''
    Append a grants column to the saved data matrix and mark matched genes.

    Loads 'andrew_data/cumul_probs.json', appends one column of zeros to
    'data_mat', then for every gene of 'andrew_data/grants_per_gene.json'
    that is also listed in nodes['row'] writes the placeholder value 1 into
    column 82 of that gene's row. The updated matrix is written back to
    'andrew_data/cumul_probs.json' through json_scripts.save_to_json.

    NOTE(review): the column index 82 and the placeholder value 1 are
    hard-coded (the source marks this as fake data); the real
    'CumulProbWeightSum' value is read but not stored. Presumably 82 is the
    index of the appended column - confirm against the input file.
    '''
    import json_scripts
    import numpy as np

    print('\n-----------------\nadding grant numbers\n-----------------\n')

    # load json of Andrew data
    data_json = json_scripts.load_to_dict('andrew_data/cumul_probs.json')
    row_names = data_json['nodes']['row']

    print( '\nthere are ' + str(len(row_names)) + ' genes in total' )
    print( 'there are ' + str(len(data_json['nodes']['col'])) + ' resources in total\n' )

    data_mat = np.asarray(data_json['data_mat'])

    print('data_mat shape')
    print(data_mat.shape)

    print('\ngoing to add grants per gene as a column into the harmonogram\n')

    # column of zeros that will be added to the matrix as a new column;
    # numpy.zeros replaces scipy.zeros, a deprecated alias of the same function
    num_rows = len(row_names)
    extra_col = np.zeros([ num_rows, 1 ])

    print('extra col shape')
    print(extra_col.shape)
    print(extra_col)

    # add the column using hstack
    data_mat = np.hstack((data_mat, extra_col))

    print('data_mat shape after adding in extra column')
    print(data_mat.shape)

    print( 'there are ' + str(len(data_json['nodes']['col'])) + ' resources in total after adding grants per gene\n' )

    # load grants_per_gene data
    grants_gene = json_scripts.load_to_dict('andrew_data/grants_per_gene.json')

    # hard-coded target column, see the review note in the docstring
    grant_col = 82

    # index of the last matched gene; stays None when nothing matched
    inst_index = None

    # loop through genes and add grant information into data_mat
    for inst_gene in grants_gene:

        # only genes that are rows of the matrix can be filled in
        if inst_gene in row_names:

            # get the index of inst_gene
            inst_index = row_names.index(inst_gene)

            # the real value is looked up (a missing key still raises
            # KeyError) but the placeholder below is what gets stored
            inst_grants = grants_gene[inst_gene]['CumulProbWeightSum']

            ## put in fake data
            data_mat[inst_index, grant_col] = 1 #inst_grants

    print('\n-------------\nchecking data_mat\n----------------\n')
    print(len(data_mat[:,-1]))
    # without a matched gene there is no row to show (this used to be a
    # NameError on the unset index)
    if inst_index is not None:
        print(data_mat[inst_index, grant_col])
    print(data_mat[2, grant_col])

    # convert data_mat to list and add it back to the json
    data_json['data_mat'] = data_mat.tolist()

    # save to json
    json_scripts.save_to_json(data_json, 'andrew_data/cumul_probs.json', 'no_indent')
Example #13
0
def load_andrew_data():
    '''
    Convert Andrew's gene x resource data into nodes + data_mat and save it.

    Runs load_resource_classes() and load_resource_real_names() first, then
    loads the source matrix, the grants-per-gene data and the real-name
    lookup written by load_resource_real_names().

    Source format: a list of dictionaries with the keys 'label' and
    'entries'. The first element describes the columns (its 'entries' are
    the resource names); every other element is one gene ('label') with its
    value in each resource ('entries').

    Only resources present in the real-name lookup are kept; the columns
    are the lookup's values. The 'Grants_Per_Gene' column is filled from
    grants_gene[gene]['CumulProbWeightSum'] for genes present in the rows.
    The result {'nodes': {'row', 'col'}, 'data_mat': nested lists} is
    handed to json_scripts.save_to_json with the target
    'andrew_data/cumul_probs.json'.

    Raises ValueError if 'Grants_Per_Gene' is not among the real names.
    '''
    import json_scripts
    import numpy as np

    # load resource classes
    load_resource_classes()

    # load resource mapping names
    load_resource_real_names()

    # load Andrew's data
    matrix = json_scripts.load_to_dict('andrew_data/gene_dataset_cumulprobs_20150609.json')

    # load grants_per_gene data
    grants_gene = json_scripts.load_to_dict('andrew_data/grants_per_gene.json')

    # only keep the resources with real names
    rn = json_scripts.load_to_dict('resource_real_names.json')

    print('\nstarting to process data')

    # save row and column data to nodes
    nodes = {}
    # initialize a list of genes
    nodes['row'] = []
    # get the good resources - get the real names; list() because a Python 3
    # dict view has no .index() and cannot be serialised to JSON
    nodes['col'] = list(rn.values())

    # save the column index of grants per gene
    col_index_grant = nodes['col'].index('Grants_Per_Gene')

    # the first element of matrix is the column description, not a gene
    num_rows = len(matrix) - 1

    # initialize data matrix: rows - genes, cols - good resources
    # (numpy.zeros replaces scipy.zeros, a deprecated alias of the same function)
    data_mat = np.zeros([ num_rows, len(rn.keys()) ])

    print('\n---------------\nadding original data to matrix\n----------------')

    # gather all resources (columns) from the first element
    all_res = matrix[0]['entries']

    # the source column -> destination column mapping is the same for every
    # gene, so it is resolved once instead of once per matrix cell
    col_map = [
        (j, nodes['col'].index(rn[res_name]))
        for j, res_name in enumerate(all_res)
        if res_name in rn
    ]

    # every element after the first is one gene; enumerate() yields the row
    # index already shifted back by one for the column-description element
    for matrix_index, inst_row in enumerate(matrix[1:]):

        # save the gene name to nodes['row']
        nodes['row'].append(inst_row['label'])

        # the actual numerical data
        inst_entries = inst_row['entries']

        # only add data from good resources
        for j, inst_index in col_map:
            # entries shorter than the column description are left at zero
            if j < len(inst_entries):
                data_mat[matrix_index, inst_index] = inst_entries[j]

    print('\n---------------\nadding grants to matrix\n----------------')

    # first row index of every gene, for constant-time lookups below
    row_index = {}
    for idx, gene in enumerate(nodes['row']):
        row_index.setdefault(gene, idx)

    # add grants per gene to matrix
    for inst_gene in grants_gene:

        # only genes that are in the original rows can be filled in
        if inst_gene in row_index:

            # get the number of grants
            inst_grants = grants_gene[inst_gene]['CumulProbWeightSum']

            # save the number of grants to the appropriate column
            data_mat[row_index[inst_gene], col_index_grant] = inst_grants

    # make one dictionary, with the numpy array converted to nested lists
    inst_dict = {}
    inst_dict['nodes'] = nodes
    inst_dict['data_mat'] = data_mat.tolist()

    # save to json
    json_scripts.save_to_json(inst_dict, 'andrew_data/cumul_probs.json', 'no_indent')
Example #14
0
def generate_d3_json():
    '''
    Cluster the data matrix per gene class and save one visualization JSON each.

    Loads 'andrew_data/cumul_probs.json' (nodes + data_mat), the gene
    classes ('gene_classes_harmonogram.json') and the resource classes
    ('resource_classes_harminogram.json'). For every gene class the rows
    whose gene belongs to the class are selected, clustered with
    d3_clustergram.cluster_row_and_column(..., 'cosine') and converted with
    d3_clustergram.d3_clust_single_value. Each col node then gets a
    'data_group' (resource class with spaces replaced by underscores) and
    each link gets 'info' = 1 if its target column is 'Grants_Per_Gene',
    else 0. The result is saved to
    'static/networks/<class>_cumul_probs.json'.

    Raises KeyError if a column name is missing from the resource classes.
    '''
    import json_scripts
    import d3_clustergram
    import numpy as np

    print('loading json in generate_d3_json')
    # load saved json of andrew data
    data_json = json_scripts.load_to_dict('andrew_data/cumul_probs.json')

    # get nodes and data_mat
    nodes = data_json['nodes']
    data_mat = np.asarray(data_json['data_mat'])

    print(nodes['col'])
    print(data_mat.shape)

    print('calculating clustering orders')

    # gene class
    gc = json_scripts.load_to_dict('gene_classes_harmonogram.json')
    # resource class
    rc = json_scripts.load_to_dict('resource_classes_harminogram.json')

    # loop through classes
    for inst_class in gc:

        print(inst_class + '\n')

        # set membership instead of scanning the class list for every row
        class_genes = set(gc[inst_class])

        # positions of the rows whose gene is in the class, in row order
        row_idx = [
            i for i, inst_gs in enumerate(nodes['row'])
            if inst_gs in class_genes
        ]

        # initialize class_nodes for export
        class_nodes = {}
        class_nodes['col'] = nodes['col']
        class_nodes['row'] = [nodes['row'][i] for i in row_idx]

        # class_mat is the subset of data_mat that only has genes of one
        # class, e.g. kinases; one integer-array selection keeps it 2-D even
        # for a single-gene class, where stacking rows one by one gave a
        # 1-D array
        class_mat = data_mat[np.asarray(row_idx, dtype=int), :]

        # cluster the matrix, return clust_order
        clust_order = d3_clustergram.cluster_row_and_column( class_nodes, class_mat, 'cosine' )

        print('generating d3 json')

        # generate d3_clust json: return json
        d3_json = d3_clustergram.d3_clust_single_value(class_nodes, clust_order, class_mat )

        # add the resource class (data_group) to every col node
        for inst_col in d3_json['col_nodes']:
            inst_res = inst_col['name']
            inst_col['data_group'] = rc[ inst_res ]['data_group'].replace(' ','_')

        # add extra link information about grant: this will be used to color
        # the grant links externally from the d3_clustergram code
        for inst_link in d3_json['links']:
            target_name = d3_json['col_nodes'][inst_link['target']]['name']
            inst_link['info'] = 1 if target_name == 'Grants_Per_Gene' else 0

        print('saving to disk')

        # save visualization json
        json_scripts.save_to_json(d3_json,'static/networks/'+inst_class+'_cumul_probs.json','no_indent')
Example #15
0
def write_json_single_value(nodes, clust_order, LDR, full_path, perts, row_class={}, col_class={}, link_hl={} ):
    '''
    Assemble the LDR clustergram JSON with release information and save it.

    nodes       - {'row': [...], 'col': [...]} node names
    clust_order - holds ['clust'][axis] (searched with .index(position))
                  and ['rank'][axis][position]
    LDR         - holds 'mat' and 'rl' ({'t': ..., 'f': ...}), all indexed
                  as [i, j]
    full_path   - target passed to json_scripts.save_to_json
    perts       - looked up with (row name, col name) tuples for every link
    row_class, col_class, link_hl - accepted but not used here

    Every entry of mat with abs() > 0 becomes a link carrying 'value',
    'value_up' (rl['t']), 'value_dn' (negated rl['f']) and 'perts'.
    '''
    import json
    import json_scripts
    import d3_clustergram
    import random

    print(perts.keys())

    #!! special case, encode extra released information for LDR
    mat = LDR['mat']
    # get release data
    rl = LDR['rl']
    print('\n\nchecking rl\n\n')

    # initialize dict
    d3_json = d3_clustergram.ini_d3_json()

    # the global random generator is still seeded, as before
    random.seed(122341)

    # row dicts first, then col dicts
    for axis, bucket in (('row', 'row_nodes'), ('col', 'col_nodes')):
        for position in range(len(nodes[axis])):
            d3_json[bucket].append({
                'name': nodes[axis][position],
                'clust': clust_order['clust'][axis].index(position),
                # do not need to get index
                'rank': clust_order['rank'][axis][position],
            })

    # links - generate edge list
    for src in range(len(nodes['row'])):
        for tgt in range(len(nodes['col'])):

            # only non-zero entries become links
            if not (abs(mat[src, tgt]) > 0):
                continue

            link = {}
            link['source'] = src
            link['target'] = tgt
            link['value'] = mat[src, tgt]
            # !! custom change for LDRgram
            link['value_up'] = rl['t'][src, tgt]
            link['value_dn'] = -rl['f'][src, tgt]

            # add perturbation information
            pair = ( nodes['row'][src], nodes['col'][tgt] )
            link['perts'] = perts[pair]

            d3_json['links'].append(link)

    # write json
    json_scripts.save_to_json(d3_json, full_path, 'indent')
def write_json_single_value(nodes,
                            clust_order,
                            LDR,
                            full_path,
                            perts,
                            row_class={},
                            col_class={},
                            link_hl={}):
    '''
    Write the LDR clustergram JSON, including released / unreleased counts.

    nodes       - {'row': [...], 'col': [...]} node names
    clust_order - provides ['clust'][axis] (searched with .index(position))
                  and ['rank'][axis][position]
    LDR         - provides 'mat' and 'rl' ({'t': ..., 'f': ...}), each
                  indexed as [i, j]
    full_path   - target passed to json_scripts.save_to_json
    perts       - indexed with (row name, col name) tuples, once per link
    row_class, col_class, link_hl - part of the signature, not read here

    A link is emitted for every mat entry with abs() > 0; it holds 'value',
    'value_up' (from rl['t']), 'value_dn' (negated rl['f']) and 'perts'.
    '''
    import json
    import json_scripts
    import d3_clustergram
    import random

    print(perts.keys())

    #!! special case, encode extra released information for LDR
    mat = LDR['mat']
    released = LDR['rl']
    print('\n\nchecking rl\n\n')

    # initialize dict
    d3_json = d3_clustergram.ini_d3_json()

    # fixed seed for the global random generator, kept from the original
    random.seed(122341)

    def axis_nodes(axis):
        # node dictionaries (name, clust position, rank) for one axis
        built = []
        for position in range(len(nodes[axis])):
            node = {}
            node['name'] = nodes[axis][position]
            node['clust'] = clust_order['clust'][axis].index(position)
            # do not need to get index
            node['rank'] = clust_order['rank'][axis][position]
            built.append(node)
        return built

    # append row dicts, then col dicts
    d3_json['row_nodes'].extend(axis_nodes('row'))
    d3_json['col_nodes'].extend(axis_nodes('col'))

    # links - generate edge list
    for i in range(len(nodes['row'])):
        for j in range(len(nodes['col'])):
            if abs(mat[i, j]) > 0:
                # perturbation information is keyed by the name pair
                name_pair = (nodes['row'][i], nodes['col'][j])

                edge = {}
                edge['source'] = i
                edge['target'] = j
                edge['value'] = mat[i, j]
                # !! custom change for LDRgram
                edge['value_up'] = released['t'][i, j]
                edge['value_dn'] = -released['f'][i, j]
                edge['perts'] = perts[name_pair]

                d3_json['links'].append(edge)

    # write json
    json_scripts.save_to_json(d3_json, full_path, 'indent')