Esempio n. 1
0
    def stat_output(self, stat_table_f, gene_no2no_of_predictions):
        """
		03-09-05
			
		"""
        sys.stderr.write("Outputting stats ... ")
        no_of_predictons_array = array(gene_no2no_of_predictions.values())
        before_array = no_of_predictons_array[:, 0]
        after_array = no_of_predictons_array[:, 1]
        no_of_predicted_genes = len(gene_no2no_of_predictions)
        genes_with_multiple_functions = sum(greater(before_array, 1))
        genes_with_multiple_contexts = sum(greater(after_array, 1))

        avg_functions_per_gene = sum(before_array) / float(no_of_predicted_genes)
        avg_contexts_per_gene = sum(after_array) / float(no_of_predicted_genes)

        stat_table_f.writerow(["Total genes predicted: %s" % no_of_predicted_genes])
        stat_table_f.writerow(
            [
                "\t%s(%f) of predicted genes with multiple functions"
                % (genes_with_multiple_functions, genes_with_multiple_functions / float(no_of_predicted_genes))
            ]
        )
        stat_table_f.writerow(
            [
                "\t%s(%f) of predicted genes with multiple distinct contexts"
                % (genes_with_multiple_contexts, genes_with_multiple_contexts / float(no_of_predicted_genes))
            ]
        )
        stat_table_f.writerow(["\taverage functions per gene: %f" % (avg_functions_per_gene)])
        stat_table_f.writerow(["\taverage distinct contexts per gene: %f" % (avg_contexts_per_gene)])
        sys.stderr.write("Done.\n")
Esempio n. 2
0
	def stat_output(self, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src):
		"""
		03-09-05
			give an overview stats for distinct function group of each gene
		"""
		sys.stderr.write("Outputting stats ... ")
		#make some blank lines
		outf.write("\n\n")
		list_of_function_groups_of_known_genes = map(len, known_gene_no2p_gene_id_src.values())
		list_of_function_groups_of_unknown_genes = map(len, unknown_gene_no2p_gene_id_src.values())
		no_of_known_genes = len(known_gene_no2p_gene_id_src)
		known_genes_with_multiple_function_groups = sum(greater(list_of_function_groups_of_known_genes,1))
		no_of_unknown_genes = len(unknown_gene_no2p_gene_id_src)
		unknown_genes_with_multiple_function_groups = sum(greater(list_of_function_groups_of_unknown_genes,1))
		no_of_genes = no_of_known_genes +no_of_unknown_genes
		avg_functions_groups_per_known_gene = sum(list_of_function_groups_of_known_genes)/float(no_of_known_genes)
		avg_functions_groups_per_unknown_gene = sum(list_of_function_groups_of_unknown_genes)/float(no_of_unknown_genes)
		avg_functions_groups_per_gene = \
			sum(list_of_function_groups_of_known_genes + list_of_function_groups_of_unknown_genes)/float(no_of_genes)
		outf.write("Total predicted genes: %s.\n"%no_of_genes)
		outf.write("\tAverage number of function groups per gene: %s.\n"%avg_functions_groups_per_gene)
		outf.write("Total known genes: %s. %s of them with multiple function groups\n"%\
			(no_of_known_genes, known_genes_with_multiple_function_groups))
		outf.write("\tAverage number of function groups per known gene: %s.\n"%avg_functions_groups_per_known_gene)
		outf.write("Total unknown genes: %s. %s of them with multiple function groups\n"%\
			(no_of_unknown_genes, unknown_genes_with_multiple_function_groups))
		outf.write("\tAverage number of function groups per unknown gene: %s.\n"%avg_functions_groups_per_unknown_gene)

		sys.stderr.write("Done.\n")
Esempio n. 3
0
	def get_cluster_accuracy(self, curs, p_gene_table, mcl_id_list, p_value_cut_off=0.01):
		"""
		04-07-05
			get the accuracy, no_of_corrected predictions, known predictions, total predictions for each cluster
		"""
		accuracy2cluster = []
		for mcl_id in mcl_id_list:
			curs.execute("select is_correct_lca from %s where mcl_id=%s and p_value_cut_off<=%s"%(p_gene_table,\
				mcl_id, p_value_cut_off))
			rows = curs.fetchall()
			if rows:
				is_correct_lca_array = array(rows)
				correct_array = greater(is_correct_lca_array[:,0],0)
				known_array = greater_equal(is_correct_lca_array[:,0],0)
				accuracy = float(sum(correct_array))/float(sum(known_array))
				accuracy2cluster.append([accuracy, sum(correct_array), sum(known_array), len(correct_array), mcl_id]) 
		return accuracy2cluster
Esempio n. 4
0
def find_closest(input_array, target_array, tol):
    """

    Find the set of elements in input_array that are closest to
    elements in target_array.  Record the indices of the elements in
    target_array that are within tolerance, tol, of their closest
    match. Also record the indices of the elements in target_array
    that are outside tolerance, tol, of their match.

    For example, given an array of observations with irregular
    observation times along with an array of times of interest, this
    routine can be used to find those observations that are closest to
    the times of interest that are within a given time tolerance.

    NOTE: input_array must be sorted! The array, target_array, does not have to be sorted.

    Inputs:
      input_array:  a sorted Float64 numarray
      target_array: a Float64 numarray
      tol:          a tolerance

    Returns:
      closest_indices:  the array of indices of elements in input_array that are closest to elements in target_array
      accept_indices:  the indices of elements in target_array that have a match in input_array within tolerance
      reject_indices:  the indices of elements in target_array that do not have a match in input_array within tolerance
    """

    input_array_len = len(input_array)
    closest_indices = numarray.searchsorted(input_array, target_array) # determine the locations of target_array in input_array
    acc_rej_indices = [-1] * len(target_array)
    curr_tol = [tol] * len(target_array)

    est_tol = 0.0
    for i in xrange(len(target_array)):
        best_off = 0          # used to adjust closest_indices[i] for best approximating element in input_array

        if closest_indices[i] >= input_array_len:
            # the value target_array[i] is >= all elements in input_array so check whether it is within tolerance of the last element
            closest_indices[i] = input_array_len - 1
            est_tol = target_array[i] - input_array[closest_indices[i]]
            if est_tol < curr_tol[i]:
                curr_tol[i] = est_tol
                acc_rej_indices[i] = i
        elif target_array[i] == input_array[closest_indices[i]]:
            # target_array[i] is in input_array
            est_tol = 0.0
            curr_tol[i] = 0.0
            acc_rej_indices[i] = i
        elif closest_indices[i] == 0:
            # target_array[i] is <= all elements in input_array
            est_tol = input_array[0] - target_array[i]
            if est_tol < curr_tol[i]:
                curr_tol[i] = est_tol
                acc_rej_indices[i] = i
        else:
            # target_array[i] is between input_array[closest_indices[i]-1] and input_array[closest_indices[i]]
            # and closest_indices[i] must be > 0
            top_tol = input_array[closest_indices[i]] - target_array[i]
            bot_tol = target_array[i] - input_array[closest_indices[i]-1]
            if bot_tol <= top_tol:
                est_tol = bot_tol
                best_off = -1           # this is the only place where best_off != 0
            else:
                est_tol = top_tol

            if est_tol < curr_tol[i]:
                curr_tol[i] = est_tol
                acc_rej_indices[i] = i

        if est_tol <= tol:
            closest_indices[i] += best_off

    accept_indices = numarray.compress(numarray.greater(acc_rej_indices, -1), acc_rej_indices)
    reject_indices = numarray.compress(numarray.equal(acc_rej_indices, -1), numarray.arange(len(acc_rej_indices)))
    return (closest_indices, accept_indices, reject_indices)
Esempio n. 5
0
def matchum(file1,
            file2,
            tol=10,
            perr=4,
            aerr=1.0,
            nmax=40,
            im_masks1=[],
            im_masks2=[],
            debug=0,
            domags=0,
            xrange=None,
            yrange=None,
            sigma=4,
            aoffset=0):
    '''Take the output of two sextractor runs and match up the objects with
   each other (find out which objects in the first file match up with
   objects in the second file.  The routine considers a 'match' to be any 
   two objects that are closer than tol pixels (after applying the shift).  
   Returns a 6-tuple:  (x1,y1,x2,y2,o1,o2).  o1 and o2 are the ojbects
   numbers such that o1[i] in file 1 corresponds to o2[i] in file 2.'''
    NA = num.NewAxis

    sexdata1 = readsex(file1)
    sexdata2 = readsex(file2)

    # Use the readsex data to get arrays of the (x,y) positions
    x1 = num.asarray(sexdata1[0]['X_IMAGE'])
    y1 = num.asarray(sexdata1[0]['Y_IMAGE'])
    x2 = num.asarray(sexdata2[0]['X_IMAGE'])
    y2 = num.asarray(sexdata2[0]['Y_IMAGE'])
    m1 = num.asarray(sexdata1[0]['MAG_BEST'])
    m2 = num.asarray(sexdata2[0]['MAG_BEST'])
    o1 = num.asarray(sexdata1[0]['NUMBER'])
    o2 = num.asarray(sexdata2[0]['NUMBER'])
    f1 = num.asarray(sexdata1[0]['FLAGS'])
    f2 = num.asarray(sexdata2[0]['FLAGS'])

    # First, make a cut on the flags:
    gids = num.where(f1 < 4)
    x1 = x1[gids]
    y1 = y1[gids]
    m1 = m1[gids]
    o1 = o1[gids]
    gids = num.where(f2 < 4)
    x2 = x2[gids]
    y2 = y2[gids]
    m2 = m2[gids]
    o2 = o2[gids]

    # next, if there is a range to use:
    if xrange is not None and yrange is not None:
        cond = num.greater(x1, xrange[0])*num.less(x1,xrange[1])*\
              num.greater(y1, yrange[0])*num.less(y1,yrange[1])
        gids = num.where(cond)
        x1 = x1[gids]
        y1 = y1[gids]
        m1 = m1[gids]
        o1 = o1[gids]
        cond = num.greater(x2, xrange[0])*num.less(x2,xrange[1])*\
              num.greater(y2, yrange[0])*num.less(y2,yrange[1])
        gids = num.where(cond)
        x2 = x2[gids]
        y2 = y2[gids]
        m2 = m2[gids]
        o2 = o2[gids]

    # Use the user masks
    for m in im_masks1:
        print "applying mask (%d,%d,%d,%d)" % tuple(m)
        condx = num.less(x1, m[0]) + num.greater(x1, m[1])
        condy = num.less(y1, m[2]) + num.greater(y1, m[3])
        gids = num.where(condx + condy)
        x1 = x1[gids]
        y1 = y1[gids]
        m1 = m1[gids]
        o1 = o1[gids]

    for m in im_masks2:
        print "applying mask (%d,%d,%d,%d)" % tuple(m)
        condx = num.less(x2, m[0]) + num.greater(x2, m[1])
        condy = num.less(y2, m[2]) + num.greater(y2, m[3])
        gids = num.where(condx + condy)
        x2 = x2[gids]
        y2 = y2[gids]
        m2 = m2[gids]
        o2 = o2[gids]

    if nmax:
        if len(x1) > nmax:
            ids = num.argsort(m1)[0:nmax]
            x1 = x1[ids]
            y1 = y1[ids]
            m1 = m1[ids]
            o1 = o1[ids]
        if len(x2) > nmax:
            ids = num.argsort(m2)[0:nmax]
            x2 = x2[ids]
            y2 = y2[ids]
            m2 = m2[ids]
            o2 = o2[ids]
    if debug:
        print "objects in frame 1:"
        print o1
        print "objects in frame 2:"
        print o2
        mp = pygplot.MPlot(2, 1, device='/XWIN')
        p = pygplot.Plot()
        p.point(x1, y1)
        [p.label(x1[i], y1[i], "%d" % o1[i]) for i in range(len(x1))]
        mp.add(p)
        p = pygplot.Plot()
        p.point(x2, y2)
        [p.label(x2[i], y2[i], "%d" % o2[i]) for i in range(len(x2))]
        mp.add(p)
        mp.plot()
        mp.close()

    # Now, we make 2-D arrays of all the differences in x and y between each pair
    #  of objects.  e.g., dx1[n,m] is the delta-x between object n and m in file 1 and
    #  dy2[n,m] is the y-distance between object n and m in file 2.
    dx1 = x1[NA, :] - x1[:, NA]
    dx2 = x2[NA, :] - x2[:, NA]
    dy1 = y1[NA, :] - y1[:, NA]
    dy2 = y2[NA, :] - y2[:, NA]
    # Same, but with angles
    da1 = num.arctan2(dy1, dx1) * 180 / num.pi
    da2 = num.arctan2(dy2, dx2) * 180 / num.pi
    # Same, but with absolute distances
    ds1 = num.sqrt(num.power(dx1, 2) + num.power(dy1, 2))
    ds2 = num.sqrt(num.power(dx2, 2) + num.power(dy2, 2))

    # Here's the real magic:  this is a matrix of matrices (4-D).  Consider 4 objects:
    #  objects i and j in file 1 and objects m and n in file 2.  dx[i,j,m,n] is the
    #  difference between delta-xs for objects i,j in file 1 and m,n in file 2.  If object
    #  i corresponds to object m and object j corresponds to object n, this should be a small
    #  number, irregardless of an overall shift in coordinate systems between file 1 and 2.
    dx = dx1[::, ::, NA, NA] - dx2[NA, NA, ::, ::]
    dy = dy1[::, ::, NA, NA] - dy2[NA, NA, ::, ::]
    da = da1[::, ::, NA, NA] - da2[NA, NA, ::, ::] + aoffset
    ds = ds1[::, ::, NA, NA] - ds2[NA, NA, ::, ::]
    # pick out close pairs.
    #use = num.less(dy,perr)*num.less(dx,perr)*num.less(num.abs(da),aerr)
    use = num.less(ds, perr) * num.less(num.abs(da), aerr)
    use = use.astype(num.Int32)

    #use = num.less(num.abs(da),perr)
    suse = num.add.reduce(num.add.reduce(use, 3), 1)
    print suse[0]

    guse = num.greater(suse, suse.flat.max() / 2)
    i = [j for j in range(x1.shape[0]) if num.sum(guse[j])]
    m = [num.argmax(guse[j]) for j in range(x1.shape[0]) if num.sum(guse[j])]
    xx0, yy0, oo0, mm0 = num.take([x1, y1, o1, m1], i, 1)
    xx1, yy1, oo1, mm1 = num.take([x2, y2, o2, m2], m, 1)
    if debug:
        mp = pygplot.MPlot(2, 1, device='/XWIN')
        p = pygplot.Plot()
        p.point(xx0, yy0)
        [p.label(xx0[i], yy0[i], "%d" % oo0[i]) for i in range(len(xx0))]
        mp.add(p)
        p = pygplot.Plot()
        p.point(xx1, yy1)
        [p.label(xx1[i], yy1[i], "%d" % oo1[i]) for i in range(len(xx1))]
        mp.add(p)
        mp.plot()
        mp.close()
    xshift, xscat = stats.bwt(xx0 - xx1)
    xscat = max([1.0, xscat])
    yshift, yscat = stats.bwt(yy0 - yy1)
    yscat = max([1.0, yscat])
    mshift, mscat = stats.bwt(mm0 - mm1)
    print "xscat = ", xscat
    print "yscat = ", yscat
    print "xshift = ", xshift
    print "yshift = ", yshift
    print "mshift = ", mshift
    print "mscat = ", mscat
    keep = num.less(num.abs(xx0-xx1-xshift),sigma*xscat)*\
          num.less(num.abs(yy0-yy1-yshift),sigma*yscat)
    # This is a list of x,y,object# in each file.
    xx0, yy0, oo0, xx1, yy1, oo1 = num.compress(keep,
                                                [xx0, yy0, oo0, xx1, yy1, oo1],
                                                1)

    if debug:
        print file1, oo0
        print file2, oo1
        mp = pygplot.MPlot(2, 1, device='temp.ps/CPS')
        p1 = pygplot.Plot()
        p1.point(xx0, yy0, symbol=25, color='red')
        for i in range(len(xx0)):
            p1.label(xx0[i], yy0[i], " %d" % oo0[i], color='red')
        mp.add(p1)
        p2 = pygplot.Plot()
        p2.point(xx1, yy1, symbol=25, color='green')
        for i in range(len(xx1)):
            p2.label(xx1[i], yy1[i], " %d" % oo1[i], color='green')
        mp.add(p2)
        mp.plot()
        mp.close()

    if domags:
        return (xx0, yy0, mm0, xx1, yy1, mm1, mshift, mscat, oo0, oo1)
    else:
        return (xx0, yy0, xx1, yy1, oo0, oo1)
Esempio n. 6
0
def find_closest(input_array, target_array, tol):
    """

    Find the set of elements in input_array that are closest to
    elements in target_array.  Record the indices of the elements in
    target_array that are within tolerance, tol, of their closest
    match. Also record the indices of the elements in target_array
    that are outside tolerance, tol, of their match.

    For example, given an array of observations with irregular
    observation times along with an array of times of interest, this
    routine can be used to find those observations that are closest to
    the times of interest that are within a given time tolerance.

    NOTE: input_array must be sorted! The array, target_array, does not have to be sorted.

    Inputs:
      input_array:  a sorted Float64 numarray
      target_array: a Float64 numarray
      tol:          a tolerance

    Returns:
      closest_indices:  the array of indices of elements in input_array that are closest to elements in target_array
      accept_indices:  the indices of elements in target_array that have a match in input_array within tolerance
      reject_indices:  the indices of elements in target_array that do not have a match in input_array within tolerance
    """

    input_array_len = len(input_array)
    closest_indices = numarray.searchsorted(
        input_array,
        target_array)  # determine the locations of target_array in input_array
    acc_rej_indices = [-1] * len(target_array)
    curr_tol = [tol] * len(target_array)

    est_tol = 0.0
    for i in xrange(len(target_array)):
        best_off = 0  # used to adjust closest_indices[i] for best approximating element in input_array

        if closest_indices[i] >= input_array_len:
            # the value target_array[i] is >= all elements in input_array so check whether it is within tolerance of the last element
            closest_indices[i] = input_array_len - 1
            est_tol = target_array[i] - input_array[closest_indices[i]]
            if est_tol < curr_tol[i]:
                curr_tol[i] = est_tol
                acc_rej_indices[i] = i
        elif target_array[i] == input_array[closest_indices[i]]:
            # target_array[i] is in input_array
            est_tol = 0.0
            curr_tol[i] = 0.0
            acc_rej_indices[i] = i
        elif closest_indices[i] == 0:
            # target_array[i] is <= all elements in input_array
            est_tol = input_array[0] - target_array[i]
            if est_tol < curr_tol[i]:
                curr_tol[i] = est_tol
                acc_rej_indices[i] = i
        else:
            # target_array[i] is between input_array[closest_indices[i]-1] and input_array[closest_indices[i]]
            # and closest_indices[i] must be > 0
            top_tol = input_array[closest_indices[i]] - target_array[i]
            bot_tol = target_array[i] - input_array[closest_indices[i] - 1]
            if bot_tol <= top_tol:
                est_tol = bot_tol
                best_off = -1  # this is the only place where best_off != 0
            else:
                est_tol = top_tol

            if est_tol < curr_tol[i]:
                curr_tol[i] = est_tol
                acc_rej_indices[i] = i

        if est_tol <= tol:
            closest_indices[i] += best_off

    accept_indices = numarray.compress(numarray.greater(acc_rej_indices, -1),
                                       acc_rej_indices)
    reject_indices = numarray.compress(numarray.equal(acc_rej_indices, -1),
                                       numarray.arange(len(acc_rej_indices)))
    return (closest_indices, accept_indices, reject_indices)