import glob
import logging
import os
import time

import numpy
import theano
from scipy import stats
from statsmodels.stats.multitest import multipletests

# Project-local modules (module names assumed from the surrounding repo):
# PCLfile parses PCL expression files; SdA is the Theano-based stacked
# denoising autoencoder class.
from pcl import PCLfile
from SdA import SdA


def logit(x):
    # Despite its name, this helper is the logistic sigmoid; SdA_test
    # below uses it to turn raw activities into node activities.
    # (Assumed definition -- the original repo defines it elsewhere.)
    return 1.0 / (1.0 + numpy.exp(-x))


def zero_one_normal(tar=None, out=None, ref=None):
    '''
    tar: the target file for zero-one normalization
    out: the output file after zero-one normalization
    ref: the reference file. If no reference file is given, zero-one
         normalization is done based on the target file itself.
    '''
    # Command-line callers may pass the literal string 'None', so accept
    # both that and a true None.
    if ref is None or ref == 'None':
        tar_data = PCLfile(tar, skip_col=0)
        tar_data.zero_one_normalization()
        tar_data.write_pcl(out)
    else:
        ref_data = PCLfile(ref, skip_col=0)
        tar_data = PCLfile(tar, skip_col=0)
        for i in xrange(ref_data.data_matrix.shape[0]):
            # Scale each target row by the min and range of the matching
            # reference row.
            row_minimum = ref_data.data_matrix[i, :].min()
            row_maximum = ref_data.data_matrix[i, :].max()
            row_range = row_maximum - row_minimum
            tar_data.data_matrix[i, :] = \
                (tar_data.data_matrix[i, :] - row_minimum) / row_range
            # bound the values to be between 0 and 1
            tar_data.data_matrix[i, :] = numpy.clip(
                tar_data.data_matrix[i, :], 0, 1)
        tar_data.write_pcl(out)
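# A minimal standalone sketch of the reference-based scaling above, in
# plain numpy and independent of PCLfile (values are made up):
#
#   ref_row = numpy.array([2.0, 4.0, 6.0])  # min 2.0, range 4.0
#   tar_row = numpy.array([1.0, 4.0, 8.0])
#   scaled = (tar_row - ref_row.min()) / (ref_row.max() - ref_row.min())
#   numpy.clip(scaled, 0, 1)                # -> [0.0, 0.5, 1.0]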
def find_enriched_node(geneList_folder=None, data_file=None,
                       gold_std=None, out_file=None):
    '''
    geneList_folder: the folder that stores the high-weight gene files
                     for each node
    data_file: the microarray file containing all genes
    gold_std: gold standard file containing a list of genes
    out_file: the output file that stores each node and its
              corresponding q value
    '''
    datasets = PCLfile(data_file, skip_col=0)
    gene_id = datasets.id_list
    gold_fh = open(gold_std, 'r')
    gold_set = []
    for line in gold_fh:
        gene = line.strip().split('\t')[0]
        gold_set.append(gene)

    # Get all the high-weight gene files under geneList_folder
    geneList_files = glob.glob(geneList_folder + '/Node*.txt')
    # Genes in both the gold standard and the dataset; this does not
    # depend on the node, so compute it once.
    all_overlap_genes = set(gold_set).intersection(set(gene_id))
    p_all_node = []
    for i in xrange(len(geneList_files)):
        gene_fh = open(geneList_folder + '/Node' + str(i + 1) + '.txt', 'r')
        gene_fh.next()  # skip the header line
        geneset = []  # geneset stores the high-weight genes for a node
        for line in gene_fh:
            gene = line.strip().split('\t')[0]
            geneset.append(gene)
        # Build the contingency table
        selected_overlap_genes = set(gold_set).intersection(set(geneset))
        a = len(selected_overlap_genes)
        b = len(all_overlap_genes) - len(selected_overlap_genes)
        c = len(geneset) - len(selected_overlap_genes)
        d = len(gene_id) - a - b - c
        table = [[a, b], [c, d]]
        # Calculate the p-value using Fisher's exact test
        oddsratio, pvalue = stats.fisher_exact(table)
        p_all_node.append(pvalue)

    # Multiple-hypothesis correction (Benjamini-Hochberg FDR)
    result_adj_pvalue = multipletests(p_all_node, alpha=0.05,
                                      method='fdr_bh')[1]
    # One node per high-weight gene file (the original hard-coded 50 here)
    all_node = ['Node' + str(x + 1) for x in xrange(len(geneList_files))]
    # Find the node with the lowest q value and write the output
    qvalue_small = 1
    node_small = None
    out_fh = open(out_file, 'w')
    for node, qvalue in zip(all_node, result_adj_pvalue):
        out_fh.write(node + '\t' + str(qvalue) + '\n')
        if qvalue < qvalue_small:
            qvalue_small = qvalue
            node_small = node
    out_fh.close()
    print node_small + ' is most significantly associated with this ' \
        'gene set with a q value of ' + str(qvalue_small)
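# Example usage (hypothetical paths; the Node*.txt files come from the
# high-weight gene extraction step of this pipeline):
# find_enriched_node(geneList_folder='HW_genes',
#                    data_file='expression.pcl',
#                    gold_std='pathway_genes.txt',
#                    out_file='node_qvalues.txt')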
def read_weight_matrix(data_file, network_file):
    '''
    Read the first-layer weight matrix from a network structure file and
    the corresponding gene ids from the data file.
    '''
    datasets = PCLfile(data_file, skip_col=0)
    gene_id = datasets.id_list
    network_fh = open(network_file, 'r')
    input_size = len(gene_id)
    network_fh.next()  # skip the layer count line
    network_fh.next()  # skip 'weight matrix' line
    W = []
    input_count = 0
    for line in network_fh:
        line = line.strip().split('\t')
        W.append(line)
        input_count += 1
        if input_count == input_size:
            break
    W = numpy.array(W, dtype=float)
    return gene_id, W
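# Example (hypothetical file names): W has one row per gene and one
# column per first-layer node, so gene_id[i] labels row W[i, :].
# gene_id, W = read_weight_matrix('expression.pcl', 'expression_net_SdA.txt')
# print W.shape  # (len(gene_id), number_of_first_layer_nodes)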
def train_SdA(training_epochs=15, train_lr=0.001, data_file=None,
              skip_col=2, batch_size=1, random_seed_1=89677,
              random_seed_2=123, net_structure=[1000, 1000, 1000],
              corruption_levels=[.1, .2, .3], output_file=None,
              net_file=None):
    logging.basicConfig(
        filename=output_file.replace('activity_SdA.txt', 'SdA.log'),
        level=logging.INFO)
    logging.info('Training the dataset:' + data_file)
    logging.info('The structure of the networks:' + str(net_structure))
    logging.info('Training epochs:' + str(training_epochs) + '\t' +
                 'Batch size:' + str(batch_size) + '\t' +
                 'Learning rate:' + str(train_lr) + '\t' +
                 'Corruption levels:' + str(corruption_levels) + '\n' +
                 'Random seed for training:' + str(random_seed_1) + '\t' +
                 'Random seed for permuting sample order:' +
                 str(random_seed_2))

    datasets = PCLfile(data_file, skip_col)
    # Permute the order of samples using random_seed_2
    train_set_x, sample_id = datasets.get_permuted_sample(seed=random_seed_2)
    print '... finish reading the data'
    train_set_x = theano.shared(train_set_x, borrow=True)

    # compute the number of minibatches for training
    train_size = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches = train_size / batch_size
    # numpy random generator
    numpy_rng = numpy.random.RandomState(random_seed_1)
    # the number of input nodes
    input_node = len(datasets.id_list)

    print '... building the model'
    # construct the stacked denoising autoencoder class
    sda = SdA(numpy_rng=numpy_rng, n_ins=input_node,
              hidden_layers_sizes=net_structure)

    #########################
    #  TRAINING THE MODEL   #
    #########################
    print '... getting the training functions'
    training_fns = sda.training_functions(train_set_x=train_set_x,
                                          batch_size=batch_size)

    print '... training the model'
    start_time = time.clock()
    # Train layer-wise
    for i in xrange(sda.n_layers):
        # go through the training epochs
        for epoch in xrange(training_epochs):
            # go through the training set
            c = []
            for batch_index in xrange(n_train_batches):
                c.append(training_fns[i](index=batch_index,
                                         corruption=corruption_levels[i],
                                         lr=train_lr))
            print 'Training layer %i, epoch %d, cost ' % (i, epoch),
            print numpy.mean(c)
            logging.info('Training layer %i, epoch %d, cost %f ' %
                         (i, epoch, numpy.mean(c)))
    end_time = time.clock()
    logging.info('The training code for file ' +
                 os.path.split(__file__)[1] +
                 ' ran for %.2fm' % ((end_time - start_time) / 60.))
    print '... training finished.'
    ##############################################################
    # Return the final activity value and raw activity value     #
    # for each node of each input sample                         #
    ##############################################################
    output_fh = open(output_file, 'w')
    raw_output_fh = open(output_file.replace('activity', 'rawActivity'), 'w')
    each_layer_output = sda.return_activity(train_set_x=train_set_x)
    each_layer_raw_output = sda.return_raw_activity(train_set_x=train_set_x)
    for i in xrange(sda.n_layers):
        output_fh.write('layer %i \n' % (i + 1))
        raw_output_fh.write('layer %i \n' % (i + 1))
        for train_sample in xrange(train_size):
            node_activation = each_layer_output[i](train_sample)
            node_raw_activation = each_layer_raw_output[i](train_sample)
            output_fh.write(sample_id[train_sample] + '\t')
            raw_output_fh.write(sample_id[train_sample] + '\t')
            numpy.savetxt(output_fh, node_activation,
                          fmt='%.8f', delimiter='\t')
            numpy.savetxt(raw_output_fh, node_raw_activation,
                          fmt='%.8f', delimiter='\t')

    ##############################################################
    # Return weight matrix and bias vectors of the final network #
    ##############################################################
    net_fh = open(net_file, 'w')
    weight_output, bias_output, bias_prime_output = sda.return_network()
    for i in xrange(len(weight_output)):
        net_fh.write('layer %i \n' % (i + 1))
        net_fh.write('weight matrix \n')
        numpy.savetxt(net_fh, weight_output[i], fmt='%.8f', delimiter='\t')
        net_fh.write('hidden bias vector \n')
        numpy.savetxt(net_fh, bias_output[i], fmt='%.8f', delimiter='\t')
        net_fh.write('visible bias vector \n')
        numpy.savetxt(net_fh, bias_prime_output[i], fmt='%.8f',
                      delimiter='\t')
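# Example training run (a sketch; file names are hypothetical, and
# output_file must contain 'activity_SdA.txt' so the log-file name can
# be derived from it):
# train_SdA(training_epochs=500, train_lr=0.01,
#           data_file='expression.pcl', skip_col=2, batch_size=10,
#           net_structure=[50], corruption_levels=[0.1],
#           output_file='expression_activity_SdA.txt',
#           net_file='expression_net_SdA.txt')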
def SdA_test(data_file, skip_col, network_file, net_structure):
    activity_file = data_file.replace('.pcl', '_activity') + \
        '_with_' + network_file.split('/')[-1]
    raw_activity_file = data_file.replace('.pcl', '_rawActivity') + \
        '_with_' + network_file.split('/')[-1]
    datasets = PCLfile(data_file, skip_col)
    input_data = datasets.get_sample()
    input_data = numpy.matrix(input_data)
    sample_id = datasets.sample_list
    network_fh = open(network_file, 'r')
    input_size = input_data.shape[1]  # input_size is the number of genes

    layer_para = []
    for each_layer in xrange(len(net_structure)):
        network_fh.next()  # skip the layer count line
        network_fh.next()  # skip 'weight matrix' line
        # Get the weight matrix
        W = []
        input_count = 0
        for line in network_fh:
            line = line.strip().split('\t')
            W.append(line)
            input_count += 1
            if input_count == input_size:  # stop at the number of genes
                break
        W = numpy.matrix(W, dtype=float)
        network_fh.next()  # skip 'hidden bias vector' line
        # Get the bias vector of the hidden layer
        h_bias = []
        output_count = 0
        for line in network_fh:
            line = line.strip().split('\t')
            h_bias.append(line)
            output_count += 1
            # stop at the number of nodes in this hidden layer
            if output_count == int(net_structure[each_layer]):
                break
        h_bias = numpy.matrix(h_bias, dtype=float)
        network_fh.next()  # skip 'visible bias vector' line
        # Read past the bias vector of the visible output layer; it is
        # only needed to advance the file pointer.
        v_bias = []
        input_count = 0
        for line in network_fh:
            v_bias.append(line)
            input_count += 1
            if input_count == input_size:
                break
        v_bias = numpy.matrix(v_bias, dtype=float)
        # The weight matrix and hidden bias vector are enough to
        # calculate activities.
        layer_para.append((W, h_bias))
        # The next layer's input size is this layer's node count
        input_size = net_structure[each_layer]

    activity_fh = open(activity_file, 'w')
    raw_activity_fh = open(raw_activity_file, 'w')
    for each_layer in xrange(len(net_structure)):
        activity_fh.write('layer %i \n' % (each_layer + 1))
        raw_activity_fh.write('layer %i \n' % (each_layer + 1))
        # raw activity, before the sigmoid transformation
        temp = input_data * layer_para[each_layer][0]
        # activity, after the sigmoid transformation (logit() is the
        # logistic sigmoid here, despite the name)
        output = logit(temp + layer_para[each_layer][1].T)
        for each_sample in xrange(output.shape[0]):
            activity_fh.write(sample_id[each_sample] + '\t')
            raw_activity_fh.write(sample_id[each_sample] + '\t')
            numpy.savetxt(activity_fh, output[each_sample, :],
                          fmt='%.8f', delimiter='\t')
            numpy.savetxt(raw_activity_fh, temp[each_sample, :],
                          fmt='%.8f', delimiter='\t')
        input_data = output
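# Example: push a held-out PCL data set through a previously trained
# network (hypothetical paths; net_structure must match the structure
# the network was trained with):
# SdA_test(data_file='test_set.pcl', skip_col=2,
#          network_file='expression_net_SdA.txt', net_structure=[50])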