def mergeBlock(self, data_ls_2d, mask_ls_2d, gene_id_set, bad_gene_id):
    """
    08-28-05
        merge the block of same genes
    02-24-06
        just average

    Collapse the rows of one block into a single row and choose the id that
    is reported for it.

    Arguments:
        data_ls_2d: list of rows of values, one row per member of the block.
        mask_ls_2d: list of rows of mask flags, parallel to data_ls_2d.
        gene_id_set: container holding the ids of the block (presumably a
            set -- confirm against the caller). It is modified in place:
            bad_gene_id is removed and the returned id is popped from it.
        bad_gene_id: id that must not be reported; remove() raises if it is
            not present in gene_id_set.

    Returns:
        (gene_id, merged). For a single-row block, merged is the masked array
        of that row. Otherwise it is average() applied to the masked 2-D
        array built from all rows.
    """
    gene_id_set.remove(bad_gene_id)
    gene_id = gene_id_set.pop()
    if self.debug:
        # The one-argument print(...) form behaves the same as a Python 2
        # print statement and as the print function.
        print(gene_id)

    if len(data_ls_2d) == 1:
        # No need to do average, and no need to build the 2-D array at all.
        single_row = array(data_ls_2d[0], mask=mask_ls_2d[0])
        if self.debug:
            print(single_row)
        return gene_id, single_row

    # 02-24-06 fill_value for graph_modeling
    ar = array(data_ls_2d, mask=mask_ls_2d, fill_value=100000000)
    # NOTE(review): average() is called with its default axis; presumably this
    # averages across the rows of the block -- confirm against the
    # masked-array library actually imported by this file.
    new_ar = average(ar)
    if self.debug:
        # The former debug output also printed max(max_ls), but max_ls is
        # never filled any more (the per-row normalisation is disabled), so
        # that call raised ValueError whenever debugging was switched on.
        print("average(ar) %s" % (new_ar,))
    return gene_id, new_ar
def data_read_in(self, infname, no_of_nas):
    """
    05-09-05

    Read a tab-delimited file into masked arrays, one per row.

    The first column of every row is taken as the row id. Each remaining
    column is either the literal 'NA' (stored as the placeholder 1e20 and
    masked) or a number parsed with float().

    Arguments:
        infname: path of the tab-delimited input file.
        no_of_nas: maximum number of 'NA' entries a row may have. Rows with
            more are dropped. A false value (0/None) disables the filter.

    Returns:
        (list_of_mas, list_of_gene_ids): the masked arrays of the rows that
        were kept and, in the same order, their first-column ids.

    Raises:
        ValueError: if an entry is neither 'NA' nor parseable by float().
    """
    sys.stderr.write("Reading data...")
    list_of_mas = []
    list_of_gene_ids = []
    inf = open(infname, 'r')
    try:
        for row in csv.reader(inf, delimiter='\t'):
            if not row:
                # csv yields [] for a blank line; row[0] would raise
                # IndexError on it.
                continue
            data_ls = []
            mask_ls = []
            for item in row[1:]:    # the first column is the id
                if item == 'NA':
                    data_ls.append(1e20)
                    mask_ls.append(1)
                else:
                    data_ls.append(float(item))
                    mask_ls.append(0)
            if no_of_nas and sum(mask_ls) > no_of_nas:
                # too many NAs
                continue
            list_of_gene_ids.append(row[0])
            list_of_mas.append(array(data_ls, mask=mask_ls))
    finally:
        # Close the handle explicitly instead of leaving it to the garbage
        # collector.
        inf.close()
    sys.stderr.write("Done.\n")
    return list_of_mas, list_of_gene_ids
def data_read_in(self, infname, no_of_nas):
    """
    05-09-05

    Read a tab-delimited file with one header line into masked arrays.

    The first line of the file and the first column of every row are
    ignored. Each remaining column is either the literal 'NA' (stored as the
    placeholder 1.1 and masked) or a number parsed with float().

    Arguments:
        infname: path of the tab-delimited input file.
        no_of_nas: maximum number of 'NA' entries a row may have. Rows with
            more are dropped. A false value (0/None) disables the filter.

    Returns:
        list of masked arrays, one per row that was kept. An empty file
        yields an empty list.

    Raises:
        ValueError: if an entry is neither 'NA' nor parseable by float().
    """
    sys.stderr.write("Reading data...")
    list_of_mas = []
    inf = open(infname, 'r')
    try:
        for row_no, row in enumerate(csv.reader(inf, delimiter='\t')):
            if row_no == 0:
                # Ignore the first line. Skipping it inside the loop also
                # copes with an empty file, where calling next() on the
                # reader up front would raise StopIteration.
                continue
            if not row:
                # A blank line carries no data; do not store an empty array.
                continue
            data_ls = []
            mask_ls = []
            for item in row[1:]:    # ignore the first edge id
                if item == 'NA':
                    data_ls.append(1.1)
                    mask_ls.append(1)
                else:
                    data_ls.append(float(item))
                    mask_ls.append(0)
            if no_of_nas and sum(mask_ls) > no_of_nas:
                # too many NAs
                continue
            list_of_mas.append(array(data_ls, mask=mask_ls))
    finally:
        # Close the handle explicitly instead of leaving it to the garbage
        # collector.
        inf.close()
    sys.stderr.write("Done.\n")
    return list_of_mas
def get_ma_array_out_of_list(self, expr_list, take_log, round_one=0):
    """
    12-22-05
    12-22-05
        in the second round, take random to avoid high correlation caused
        by a series of 10

    Convert a sequence of text entries into one masked array.

    Arguments:
        expr_list: iterable of strings. 'NA' becomes a masked entry (stored
            as the placeholder 1e20), '' is dropped altogether, anything
            else is parsed with float().
        take_log: if true, every parsed value is replaced by its natural
            logarithm; values <= 10 are first raised as described under
            round_one.
        round_one: only used when take_log is true. If true, values <= 10
            are set to exactly 10 before the log. Otherwise they are
            replaced by a random number drawn from [e, 10].

    Returns:
        masked array with one element per non-empty entry of expr_list.

    Raises:
        ValueError: if an entry is not 'NA', not '' and not a number.
    """
    new_row = []
    mask_ls = []
    for item in expr_list:
        if item == 'NA':
            new_row.append(1e20)
            mask_ls.append(1)
        elif item == '':
            # ignore empty entry
            continue
        else:
            value = float(item)
            if take_log:
                if value <= 10:
                    if round_one:
                        value = 10
                    else:
                        # 12-22-05 to avoid high correlation caused by a
                        # series of 10
                        value = random.uniform(math.e, 10)
                value = math.log(value)
            new_row.append(value)
            mask_ls.append(0)
    return array(new_row, mask=mask_ls)
def transform_one_file(self, src_pathname, delimiter, outputdir, b_instance, threshold, type, no_of_valids):
    """
    08-09-05
        add type
    08-29-05
        add no_of_valids to cut genes with too few valid values

    Compute the standard deviation of every row of one delimited file and,
    if more than 100 rows qualified, draw a histogram of those values into
    '<outputdir>/<basename of src_pathname>.png' through r.

    Arguments:
        src_pathname: path of the input file. The first column of each row
            is skipped. In the remaining columns 'NA' is a missing value and
            '' is ignored.
        delimiter: column delimiter handed to csv.reader.
        outputdir: directory that receives the png file.
        b_instance: not used by this method.
        threshold: not used by this method.
        type: if 1, values <= 10 are raised to 10 and the natural log of
            every value is taken before the standard deviation is computed.
        no_of_valids: minimum number of non-missing values a row needs in
            order to be included.

    Returns:
        None.

    Raises:
        ValueError: if an entry is not 'NA', not '' and not a number.
    """
    filename = os.path.basename(src_pathname)
    output_filename = os.path.join(outputdir, filename)
    std_list = []
    inf = open(src_pathname)
    try:
        for row in csv.reader(inf, delimiter=delimiter):
            if not row:
                # csv yields [] for a blank line; there is nothing to measure.
                continue
            new_row = []
            mask_ls = []
            for item in row[1:]:    # the first column is the gene id
                if item == 'NA':
                    new_row.append(1e20)
                    mask_ls.append(1)
                elif item == '':
                    # ignore empty entry
                    continue
                else:
                    value = float(item)
                    if type == 1:
                        if value <= 10:
                            value = 10
                        value = math.log(value)
                    new_row.append(value)
                    mask_ls.append(0)
            ma_array = array(new_row, mask=mask_ls)
            if self.debug:
                # The one-argument print(...) form behaves the same as a
                # Python 2 print statement and as the print function.
                print("The data vector is  %s" % (ma_array,))
                print("Its mask is  %s" % (ma_array.mask(),))
            valid_values = ma_array.compressed()    # disregard the NAs
            # at least two samples, otherwise, correlation can't be calculated
            # 08-29-05 no_of_valids controls not too many NA's, which is for
            # graph_modeling
            if len(valid_values) >= no_of_valids:
                std = MLab.std(valid_values)
                if self.debug:
                    print("std is  %s" % (std,))
                    raw_input("Continue?(Y/n)")
                std_list.append(std)
    finally:
        # Close the handle explicitly instead of leaving it to the garbage
        # collector.
        inf.close()
    if len(std_list) > 100:
        r.png('%s.png' % output_filename)
        r.hist(std_list, main='histogram', xlab='std', ylab='freq')
        r.dev_off()
def transpose_and_output(self, outfname, list_of_top_mas):
    """
    05-09-05
        --ls_NA_fillin()

    Write the transposed raw data of the given masked arrays to a
    tab-delimited file.

    Output layout:
        line 1: the shape of the transposed matrix
        line 2: 'column', 'column', then 0 .. (number of input arrays - 1)
        then one line per row of the transposed matrix: the row index twice,
        followed by whatever self.ls_NA_fillin() returns for that row.

    Arguments:
        outfname: path of the output file; it is overwritten.
        list_of_top_mas: masked arrays providing raw_data(). They are
            stacked as rows and then transposed, so presumably all have the
            same length -- confirm against the caller.

    Returns:
        None.
    """
    sys.stderr.write("Outputing the data...")
    matrix = transpose(array([ma.raw_data() for ma in list_of_top_mas]))
    outf = open(outfname, 'w')
    try:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(matrix.shape)
        # list(...) keeps the concatenation valid where range() is not a list.
        writer.writerow(["column", "column"] + list(range(len(matrix[0]))))
        for i in range(matrix.shape[0]):
            # NOTE(review): ls_NA_fillin() is expected to return a list, as
            # it is concatenated to [i, i] -- confirm.
            ls_with_NA_filled = self.ls_NA_fillin(matrix[i])
            writer.writerow([i, i] + ls_with_NA_filled)
    finally:
        # Close explicitly so that all buffered rows reach the disk before
        # the method reports "Done.".
        outf.close()
    sys.stderr.write("Done.\n")