def stat_output(self, stat_table_f, gene_no2no_of_predictions):
    """
    03-09-05
        Write summary statistics about the predicted genes.

    stat_table_f: object with a writerow() method; every row written here
        is a one-element list holding one formatted line.
    gene_no2no_of_predictions: mapping whose values are 2-item sequences.
        Item 0 is reported as the gene's number of functions and item 1 as
        its number of distinct contexts.

    Progress messages go to sys.stderr. An empty mapping is reported with
    zero counts, ratios and averages instead of raising.
    """
    sys.stderr.write("Outputting stats ... ")
    no_of_predicted_genes = len(gene_no2no_of_predictions)
    if no_of_predicted_genes:
        # list() keeps this working where dict.values() returns a view
        # instead of a list.
        no_of_predictions_array = array(list(gene_no2no_of_predictions.values()))
        before_array = no_of_predictions_array[:, 0]
        after_array = no_of_predictions_array[:, 1]
        denominator = float(no_of_predicted_genes)
        genes_with_multiple_functions = sum(greater(before_array, 1))
        genes_with_multiple_contexts = sum(greater(after_array, 1))
        multiple_functions_ratio = genes_with_multiple_functions / denominator
        multiple_contexts_ratio = genes_with_multiple_contexts / denominator
        avg_functions_per_gene = sum(before_array) / denominator
        avg_contexts_per_gene = sum(after_array) / denominator
    else:
        # Nothing was predicted: there is no 2-D array to slice and no
        # meaningful denominator, so report zeros.
        genes_with_multiple_functions = 0
        genes_with_multiple_contexts = 0
        multiple_functions_ratio = 0.0
        multiple_contexts_ratio = 0.0
        avg_functions_per_gene = 0.0
        avg_contexts_per_gene = 0.0

    stat_table_f.writerow(["Total genes predicted: %s" % no_of_predicted_genes])
    stat_table_f.writerow(
        [
            "\t%s(%f) of predicted genes with multiple functions"
            % (genes_with_multiple_functions, multiple_functions_ratio)
        ]
    )
    stat_table_f.writerow(
        [
            "\t%s(%f) of predicted genes with multiple distinct contexts"
            % (genes_with_multiple_contexts, multiple_contexts_ratio)
        ]
    )
    stat_table_f.writerow(["\taverage functions per gene: %f" % (avg_functions_per_gene)])
    stat_table_f.writerow(["\taverage distinct contexts per gene: %f" % (avg_contexts_per_gene)])
    sys.stderr.write("Done.\n")
def stat_output(self, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src):
    """
    03-09-05
        give an overview stats for distinct function group of each gene

    outf: object with a write() method; the report is appended to it after
        two blank lines.
    known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src: mappings
        whose values support len(); each value's length is counted as that
        gene's number of function groups.

    Progress messages go to sys.stderr. When a category (known, unknown, or
    both) is empty its average is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    def mean_or_zero(total, count):
        # An empty category has no meaningful average; report 0.0 rather
        # than dividing by zero.
        if count:
            return total / float(count)
        return 0.0

    sys.stderr.write("Outputting stats ... ")
    # make some blank lines
    outf.write("\n\n")

    # Real lists (not map objects) so they can be summed and compared
    # regardless of what map() returns.
    known_group_counts = [len(groups) for groups in known_gene_no2p_gene_id_src.values()]
    unknown_group_counts = [len(groups) for groups in unknown_gene_no2p_gene_id_src.values()]

    no_of_known_genes = len(known_gene_no2p_gene_id_src)
    no_of_unknown_genes = len(unknown_gene_no2p_gene_id_src)
    no_of_genes = no_of_known_genes + no_of_unknown_genes

    known_genes_with_multiple_function_groups = sum(greater(known_group_counts, 1))
    unknown_genes_with_multiple_function_groups = sum(greater(unknown_group_counts, 1))

    known_total = sum(known_group_counts)
    unknown_total = sum(unknown_group_counts)
    avg_functions_groups_per_known_gene = mean_or_zero(known_total, no_of_known_genes)
    avg_functions_groups_per_unknown_gene = mean_or_zero(unknown_total, no_of_unknown_genes)
    avg_functions_groups_per_gene = mean_or_zero(known_total + unknown_total, no_of_genes)

    outf.write("Total predicted genes: %s.\n"%no_of_genes)
    outf.write("\tAverage number of function groups per gene: %s.\n"%avg_functions_groups_per_gene)
    outf.write("Total known genes: %s. %s of them with multiple function groups\n"%\
        (no_of_known_genes, known_genes_with_multiple_function_groups))
    outf.write("\tAverage number of function groups per known gene: %s.\n"%avg_functions_groups_per_known_gene)
    outf.write("Total unknown genes: %s. %s of them with multiple function groups\n"%\
        (no_of_unknown_genes, unknown_genes_with_multiple_function_groups))
    outf.write("\tAverage number of function groups per unknown gene: %s.\n"%avg_functions_groups_per_unknown_gene)
    sys.stderr.write("Done.\n")
def get_cluster_accuracy(self, curs, p_gene_table, mcl_id_list, p_value_cut_off=0.01):
    """
    04-07-05
        get the accuracy, no_of_corrected predictions, known predictions, total predictions for each cluster

    curs: database cursor providing execute() and fetchall().
    p_gene_table: name of the table to query; it is interpolated straight
        into the SQL text, so it must come from a trusted source.
    mcl_id_list: iterable of cluster ids, queried one at a time.
    p_value_cut_off: only rows whose p_value_cut_off column is <= this
        value are fetched.

    Returns a list with one entry per cluster that has at least one row:
        [accuracy, no_of_correct, no_of_known, no_of_total, mcl_id]
    A row counts as correct when is_correct_lca > 0 and as known when
    is_correct_lca >= 0. A cluster with rows but no known prediction gets
    accuracy 0.0 (previously this raised ZeroDivisionError).
    """
    accuracy2cluster = []
    for mcl_id in mcl_id_list:
        curs.execute("select is_correct_lca from %s where mcl_id=%s and p_value_cut_off<=%s"%(p_gene_table,\
            mcl_id, p_value_cut_off))
        rows = curs.fetchall()
        if not rows:
            # clusters without any qualifying prediction are left out
            continue
        is_correct_lca = array(rows)[:, 0]
        no_of_correct = sum(greater(is_correct_lca, 0))
        no_of_known = sum(greater_equal(is_correct_lca, 0))
        if no_of_known:
            accuracy = float(no_of_correct) / float(no_of_known)
        else:
            # every prediction in this cluster is of unknown correctness
            accuracy = 0.0
        accuracy2cluster.append([accuracy, no_of_correct, no_of_known, len(is_correct_lca), mcl_id])
    return accuracy2cluster
def find_closest(input_array, target_array, tol):
    """
    Find the set of elements in input_array that are closest to
    elements in target_array.  Record the indices of the elements in
    target_array that are within tolerance, tol, of their closest
    match. Also record the indices of the elements in target_array
    that are outside tolerance, tol, of their match.

    For example, given an array of observations with irregular
    observation times along with an array of times of interest, this
    routine can be used to find those observations that are closest to
    the times of interest that are within a given time tolerance.

    NOTE: input_array must be sorted! The array, target_array, does not
    have to be sorted.

    Inputs:
      input_array:  a sorted Float64 numarray
      target_array: a Float64 numarray
      tol:          a tolerance

    Returns:
      closest_indices:  the array of indices of elements in input_array
                        that are closest to elements in target_array
      accept_indices:   the indices of elements in target_array that have
                        a match in input_array within tolerance
      reject_indices:   the indices of elements in target_array that do
                        not have a match in input_array within tolerance

    Details worth knowing:
      * A target is accepted when it equals an input element exactly, or
        when its distance to the nearest input element is strictly less
        than tol.
      * For a target whose nearest distance exceeds tol, closest_indices
        holds the searchsorted position (clamped to the last element)
        and is not moved down to the nearer lower neighbour.
      * If input_array is empty there is nothing to match: every entry
        of closest_indices is -1, accept_indices is empty and
        reject_indices lists every target.
    """
    input_array_len = len(input_array)
    no_of_targets = len(target_array)

    if input_array_len == 0:
        # Nothing to match against; indexing input_array below would fail.
        every_target = numarray.arange(no_of_targets)
        return (every_target * 0 - 1, every_target[:0], every_target)

    # determine the locations of target_array in input_array
    closest_indices = numarray.searchsorted(input_array, target_array)
    acc_rej_indices = [-1] * no_of_targets

    for i, target in enumerate(target_array):
        # used to adjust closest_indices[i] for best approximating element in input_array
        best_off = 0
        exact_match = False
        position = closest_indices[i]

        if position >= input_array_len:
            # target is larger than every element of input_array, so
            # compare it with the last element
            position = input_array_len - 1
            closest_indices[i] = position
            est_tol = target - input_array[position]
        elif target == input_array[position]:
            # target is in input_array
            est_tol = 0.0
            exact_match = True
        elif position == 0:
            # target is smaller than every element of input_array
            est_tol = input_array[0] - target
        else:
            # target lies between input_array[position-1] and
            # input_array[position], and position must be > 0
            top_tol = input_array[position] - target
            bot_tol = target - input_array[position - 1]
            if bot_tol <= top_tol:
                est_tol = bot_tol
                best_off = -1  # this is the only place where best_off != 0
            else:
                est_tol = top_tol

        # exact matches are accepted whatever tol is; otherwise the
        # distance has to be strictly below tol
        if exact_match or est_tol < tol:
            acc_rej_indices[i] = i
        if est_tol <= tol:
            closest_indices[i] += best_off

    accept_indices = numarray.compress(numarray.greater(acc_rej_indices, -1),
                                       acc_rej_indices)
    reject_indices = numarray.compress(numarray.equal(acc_rej_indices, -1),
                                       numarray.arange(len(acc_rej_indices)))
    return (closest_indices, accept_indices, reject_indices)
def matchum(file1, file2, tol=10, perr=4, aerr=1.0, nmax=40, im_masks1=[], im_masks2=[], debug=0, domags=0, xrange=None, yrange=None, sigma=4, aoffset=0):
    '''Take the output of two sextractor runs and match up the objects with
    each other (find out which objects in the first file match up with
    objects in the second file).

    Matching is done on object *pairs*: a pair (i,j) in file 1 is taken to
    agree with a pair (m,n) in file 2 when the difference of their
    separations is below perr and the absolute difference of their
    position angles (degrees, after adding aoffset) is below aerr.  Object
    i is matched to object m when the number of agreeing pairs for (i,m)
    exceeds half of the largest such count.  The matches are then
    sigma-clipped on their x and y offsets.

    Parameters:
       file1, file2:  passed to readsex(); the first catalog of each result
                      is used (columns X_IMAGE, Y_IMAGE, MAG_BEST, NUMBER,
                      FLAGS).
       tol:           accepted for compatibility; NOT used by the current
                      implementation.
       perr:          upper limit on the pair-separation difference.
       aerr:          upper limit on the absolute angle difference (deg).
       nmax:          if true, keep at most the nmax objects with the
                      smallest MAG_BEST in each file.
       im_masks1/2:   sequences of (xlo, xhi, ylo, yhi); objects inside a
                      rectangle are dropped.  Only iterated, never modified.
       debug:         if true, print the object numbers and plot with
                      pygplot.
       domags:        selects the return value (see below).
       xrange,yrange: (lo, hi) pairs; when BOTH are given only objects
                      strictly inside both ranges are kept.  (These names
                      shadow the Python 2 builtin inside this function.)
       sigma:         clipping width, in units of the x/y scatter.
       aoffset:       added to every angle difference.

    Only objects with FLAGS < 4 are considered.

    Returns (xx0,yy0,xx1,yy1,oo0,oo1), where oo0[i] in file 1 corresponds
    to oo1[i] in file 2.  If domags is true, returns instead
    (xx0,yy0,mm0,xx1,yy1,mm1,mshift,mscat,oo0,oo1); the magnitude arrays
    are filtered with the same sigma-clip as the positions so that they
    stay aligned with them, while mshift/mscat are computed before the
    clip.'''
    NA = num.NewAxis

    sexdata1 = readsex(file1)
    sexdata2 = readsex(file2)

    # Use the readsex data to get arrays of the (x,y) positions
    x1 = num.asarray(sexdata1[0]['X_IMAGE'])
    y1 = num.asarray(sexdata1[0]['Y_IMAGE'])
    x2 = num.asarray(sexdata2[0]['X_IMAGE'])
    y2 = num.asarray(sexdata2[0]['Y_IMAGE'])
    m1 = num.asarray(sexdata1[0]['MAG_BEST'])
    m2 = num.asarray(sexdata2[0]['MAG_BEST'])
    o1 = num.asarray(sexdata1[0]['NUMBER'])
    o2 = num.asarray(sexdata2[0]['NUMBER'])
    f1 = num.asarray(sexdata1[0]['FLAGS'])
    f2 = num.asarray(sexdata2[0]['FLAGS'])

    # First, make a cut on the flags:
    gids = num.where(f1 < 4)
    x1, y1, m1, o1 = [arr[gids] for arr in (x1, y1, m1, o1)]
    gids = num.where(f2 < 4)
    x2, y2, m2, o2 = [arr[gids] for arr in (x2, y2, m2, o2)]

    # next, if there is a range to use:
    if xrange is not None and yrange is not None:
        cond = num.greater(x1, xrange[0])*num.less(x1,xrange[1])*\
              num.greater(y1, yrange[0])*num.less(y1,yrange[1])
        gids = num.where(cond)
        x1, y1, m1, o1 = [arr[gids] for arr in (x1, y1, m1, o1)]
        cond = num.greater(x2, xrange[0])*num.less(x2,xrange[1])*\
              num.greater(y2, yrange[0])*num.less(y2,yrange[1])
        gids = num.where(cond)
        x2, y2, m2, o2 = [arr[gids] for arr in (x2, y2, m2, o2)]

    # Use the user masks: keep only the objects outside each rectangle
    for mask in im_masks1:
        print("applying mask (%d,%d,%d,%d)" % tuple(mask))
        condx = num.less(x1, mask[0]) + num.greater(x1, mask[1])
        condy = num.less(y1, mask[2]) + num.greater(y1, mask[3])
        gids = num.where(condx + condy)
        x1, y1, m1, o1 = [arr[gids] for arr in (x1, y1, m1, o1)]

    for mask in im_masks2:
        print("applying mask (%d,%d,%d,%d)" % tuple(mask))
        condx = num.less(x2, mask[0]) + num.greater(x2, mask[1])
        condy = num.less(y2, mask[2]) + num.greater(y2, mask[3])
        gids = num.where(condx + condy)
        x2, y2, m2, o2 = [arr[gids] for arr in (x2, y2, m2, o2)]

    # Limit each list to the nmax objects with the smallest magnitudes;
    # the 4-D arrays built below grow as len(x1)**2 * len(x2)**2.
    if nmax:
        if len(x1) > nmax:
            ids = num.argsort(m1)[0:nmax]
            x1, y1, m1, o1 = [arr[ids] for arr in (x1, y1, m1, o1)]
        if len(x2) > nmax:
            ids = num.argsort(m2)[0:nmax]
            x2, y2, m2, o2 = [arr[ids] for arr in (x2, y2, m2, o2)]

    if debug:
        print("objects in frame 1:")
        print(o1)
        print("objects in frame 2:")
        print(o2)
        mp = pygplot.MPlot(2, 1, device='/XWIN')
        p = pygplot.Plot()
        p.point(x1, y1)
        for k in range(len(x1)):
            p.label(x1[k], y1[k], "%d" % o1[k])
        mp.add(p)
        p = pygplot.Plot()
        p.point(x2, y2)
        for k in range(len(x2)):
            p.label(x2[k], y2[k], "%d" % o2[k])
        mp.add(p)
        mp.plot()
        mp.close()

    # Now, we make 2-D arrays of all the differences in x and y between each pair
    #  of objects.  e.g., dx1[n,m] is the delta-x between object n and m in file 1 and
    #  dy2[n,m] is the y-distance between object n and m in file 2.
    dx1 = x1[NA, :] - x1[:, NA]
    dx2 = x2[NA, :] - x2[:, NA]
    dy1 = y1[NA, :] - y1[:, NA]
    dy2 = y2[NA, :] - y2[:, NA]
    # Same, but with angles
    da1 = num.arctan2(dy1, dx1) * 180 / num.pi
    da2 = num.arctan2(dy2, dx2) * 180 / num.pi
    # Same, but with absolute distances
    ds1 = num.sqrt(num.power(dx1, 2) + num.power(dy1, 2))
    ds2 = num.sqrt(num.power(dx2, 2) + num.power(dy2, 2))

    # Here's the real magic:  this is a matrix of matrices (4-D).  Consider 4 objects:
    #  objects i and j in file 1 and objects m and n in file 2.  dx[i,j,m,n] is the
    #  difference between delta-xs for objects i,j in file 1 and m,n in file 2.  If object
    #  i corresponds to object m and object j corresponds to object n, this should be a small
    #  number, regardless of an overall shift in coordinate systems between file 1 and 2.
    #  Only da and ds take part in the selection below.
    da = da1[::, ::, NA, NA] - da2[NA, NA, ::, ::] + aoffset
    ds = ds1[::, ::, NA, NA] - ds2[NA, NA, ::, ::]

    # pick out close pairs.
    # NOTE(review): ds is a signed difference and is compared without abs(),
    #  so any pair with ds1 < ds2 passes the distance test and only the
    #  angle test constrains it -- confirm whether num.abs(ds) was intended.
    use = num.less(ds, perr) * num.less(num.abs(da), aerr)
    use = use.astype(num.Int32)

    # suse[i,m] = number of (j,n) combinations for which pair (i,j) in file 1
    #  agrees with pair (m,n) in file 2.
    suse = num.add.reduce(num.add.reduce(use, 3), 1)
    print(suse[0])
    # object i is taken to match object m when (i,m) collects more than half
    #  of the largest count
    guse = num.greater(suse, suse.flat.max() / 2)
    ids1 = [j for j in range(x1.shape[0]) if num.sum(guse[j])]
    # first file-2 object that passes, for every matched file-1 object
    ids2 = [num.argmax(guse[j]) for j in ids1]
    xx0, yy0, oo0, mm0 = num.take([x1, y1, o1, m1], ids1, 1)
    xx1, yy1, oo1, mm1 = num.take([x2, y2, o2, m2], ids2, 1)

    if debug:
        mp = pygplot.MPlot(2, 1, device='/XWIN')
        p = pygplot.Plot()
        p.point(xx0, yy0)
        for k in range(len(xx0)):
            p.label(xx0[k], yy0[k], "%d" % oo0[k])
        mp.add(p)
        p = pygplot.Plot()
        p.point(xx1, yy1)
        for k in range(len(xx1)):
            p.label(xx1[k], yy1[k], "%d" % oo1[k])
        mp.add(p)
        mp.plot()
        mp.close()

    # Shift and scatter of the offsets; the x/y scatter is floored at 1.0
    #  before it is used for clipping.
    xshift, xscat = stats.bwt(xx0 - xx1)
    xscat = max([1.0, xscat])
    yshift, yscat = stats.bwt(yy0 - yy1)
    yscat = max([1.0, yscat])
    mshift, mscat = stats.bwt(mm0 - mm1)
    # (two spaces after "=": same bytes as the statement form `print "x = ", v`)
    print("xscat =  %s" % (xscat,))
    print("yscat =  %s" % (yscat,))
    print("xshift =  %s" % (xshift,))
    print("yshift =  %s" % (yshift,))
    print("mshift =  %s" % (mshift,))
    print("mscat =  %s" % (mscat,))
    keep = num.less(num.abs(xx0-xx1-xshift),sigma*xscat)*\
          num.less(num.abs(yy0-yy1-yshift),sigma*yscat)
    # This is a list of x,y,object#,magnitude in each file.  The magnitudes
    #  are clipped together with the positions so that the domags return
    #  value stays aligned element by element.
    xx0, yy0, oo0, mm0, xx1, yy1, oo1, mm1 = num.compress(
        keep, [xx0, yy0, oo0, mm0, xx1, yy1, oo1, mm1], 1)

    if debug:
        print("%s %s" % (file1, oo0))
        print("%s %s" % (file2, oo1))
        mp = pygplot.MPlot(2, 1, device='temp.ps/CPS')
        p1 = pygplot.Plot()
        p1.point(xx0, yy0, symbol=25, color='red')
        for k in range(len(xx0)):
            p1.label(xx0[k], yy0[k], " %d" % oo0[k], color='red')
        mp.add(p1)
        p2 = pygplot.Plot()
        p2.point(xx1, yy1, symbol=25, color='green')
        for k in range(len(xx1)):
            p2.label(xx1[k], yy1[k], " %d" % oo1[k], color='green')
        mp.add(p2)
        mp.plot()
        mp.close()

    if domags:
        return (xx0, yy0, mm0, xx1, yy1, mm1, mshift, mscat, oo0, oo1)
    else:
        return (xx0, yy0, xx1, yy1, oo0, oo1)
def find_closest(input_array, target_array, tol):
    """
    Find the set of elements in input_array that are closest to
    elements in target_array.  Record the indices of the elements in
    target_array that are within tolerance, tol, of their closest
    match. Also record the indices of the elements in target_array
    that are outside tolerance, tol, of their match.

    For example, given an array of observations with irregular
    observation times along with an array of times of interest, this
    routine can be used to find those observations that are closest to
    the times of interest that are within a given time tolerance.

    NOTE: input_array must be sorted! The array, target_array, does not
    have to be sorted.

    Inputs:
      input_array:  a sorted Float64 numarray
      target_array: a Float64 numarray
      tol:          a tolerance

    Returns:
      closest_indices:  the array of indices of elements in input_array
                        that are closest to elements in target_array
      accept_indices:   the indices of elements in target_array that have
                        a match in input_array within tolerance
      reject_indices:   the indices of elements in target_array that do
                        not have a match in input_array within tolerance

    Details worth knowing:
      * A target is accepted when it equals an input element exactly, or
        when its distance to the nearest input element is strictly less
        than tol.
      * For a target whose nearest distance exceeds tol, closest_indices
        holds the searchsorted position (clamped to the last element)
        and is not moved down to the nearer lower neighbour.
      * If input_array is empty there is nothing to match: every entry
        of closest_indices is -1, accept_indices is empty and
        reject_indices lists every target.
    """
    input_array_len = len(input_array)
    no_of_targets = len(target_array)

    if input_array_len == 0:
        # Nothing to match against; indexing input_array below would fail.
        every_target = numarray.arange(no_of_targets)
        return (every_target * 0 - 1, every_target[:0], every_target)

    # determine the locations of target_array in input_array
    closest_indices = numarray.searchsorted(input_array, target_array)
    acc_rej_indices = [-1] * no_of_targets

    for i, target in enumerate(target_array):
        # used to adjust closest_indices[i] for best approximating element in input_array
        best_off = 0
        exact_match = False
        position = closest_indices[i]

        if position >= input_array_len:
            # target is larger than every element of input_array, so
            # compare it with the last element
            position = input_array_len - 1
            closest_indices[i] = position
            est_tol = target - input_array[position]
        elif target == input_array[position]:
            # target is in input_array
            est_tol = 0.0
            exact_match = True
        elif position == 0:
            # target is smaller than every element of input_array
            est_tol = input_array[0] - target
        else:
            # target lies between input_array[position-1] and
            # input_array[position], and position must be > 0
            top_tol = input_array[position] - target
            bot_tol = target - input_array[position - 1]
            if bot_tol <= top_tol:
                est_tol = bot_tol
                best_off = -1  # this is the only place where best_off != 0
            else:
                est_tol = top_tol

        # exact matches are accepted whatever tol is; otherwise the
        # distance has to be strictly below tol
        if exact_match or est_tol < tol:
            acc_rej_indices[i] = i
        if est_tol <= tol:
            closest_indices[i] += best_off

    accept_indices = numarray.compress(numarray.greater(acc_rej_indices, -1),
                                       acc_rej_indices)
    reject_indices = numarray.compress(numarray.equal(acc_rej_indices, -1),
                                       numarray.arange(len(acc_rej_indices)))
    return (closest_indices, accept_indices, reject_indices)