Example #1
0
def distSpearman(x, y):
    """Spearman-correlation distance between two 1-D (possibly masked) arrays.

    Positions that are masked in either x or y are dropped from both inputs;
    the remaining values are passed to statc.spearmanr as plain lists.

    returns distance: 1 - spearman_r
    """
    xArr = MA.asarray(x)
    yArr = MA.asarray(y)
    assert MA.rank(xArr) == MA.rank(yArr) == 1
    # true wherever at least one of the two inputs is masked
    anyMasked = MA.logical_or(MA.getmaskarray(xArr), MA.getmaskarray(yArr))
    keep = MA.logical_not(anyMasked)
    xKnown = MA.compress(keep, xArr).tolist()
    yKnown = MA.compress(keep, yArr).tolist()
    # first element of the spearmanr result is used as the coefficient
    spearmanR = statc.spearmanr(xKnown, yKnown)[0]
    return 1 - spearmanR
Example #2
0
def _distSpearmanW_MA(x, y, w):
    """Weighted Spearman distance for inputs that may contain masked values.

    Positions masked in any of x, y or w are dropped from all three; the
    remaining x and y values are replaced by their ranks (statc.rankdata)
    and the result of distPearsonW on those ranks and the remaining weights
    is returned.
    """
    xArr = MA.asarray(x)
    yArr = MA.asarray(y)
    wArr = MA.asarray(w)
    assert MA.rank(xArr) == MA.rank(yArr) == MA.rank(wArr) == 1
    maskedXY = MA.logical_or(MA.getmaskarray(xArr), MA.getmaskarray(yArr))
    maskedAny = MA.logical_or(maskedXY, MA.getmaskarray(wArr))
    keep = MA.logical_not(maskedAny)
    # with MA, always compress before calling tolist()
    xRanks = Numeric.array(statc.rankdata(MA.compress(keep, xArr).tolist()))
    yRanks = Numeric.array(statc.rankdata(MA.compress(keep, yArr).tolist()))
    wKnown = MA.compress(keep, wArr)
    return distPearsonW(xRanks, yRanks, wKnown)
Example #3
0
def loessMA(m,
            windowSize,
            axis=0,
            approxMasked=True,
            verbose=False,
            callback=None):
    """Returns a new array with values at the given axis smoothed by loess;
    assumes equidistant spacing of points on the given axis.

    m: array-like (converted with MA.asarray and cast to float if needed).
    windowSize: loess window passed on to statc.loess; per the assertion
        message either a fraction in (0, 1] or a number of points up to
        m.shape[axis].
    axis: axis along which to smooth; negative values count from the end.
    approxMasked: if True the masked values are approximated by loess;
        if False the original mask is re-applied to the smoothed data.
    verbose: if True, print a warning for each profile that statc.loess
        fails on (such profiles are left unsmoothed).
    callback: optional zero-argument callable, invoked once per profile
        for which smoothing was attempted.

    The result has the same shape as m.
    """
    m = MA.asarray(m)
    numDims = len(m.shape)
    if axis < 0:
        axis += numDims
    assert 0 < windowSize <= m.shape[
        axis] + 0.1, "0 < windowSize[%s] <= 1 OR windowSize in range(1.1,m.shape[axis]+1) expected, got %f" % (
            "%", windowSize)
    if m.dtype.char != Numeric.Float:
        m = m.astype(Numeric.Float)
    shp_other = list(m.shape)
    shp_other.pop(axis)
    # permutation that moves `axis` to the front ...
    toFront = [axis] + list(range(0, axis)) + list(range(axis + 1, numDims))
    # ... and its inverse, which puts the leading axis back at `axis`
    # (the two coincide only for axis 0 and axis 1)
    fromFront = list(range(1, axis + 1)) + [0] + list(
        range(axis + 1, numDims))
    flatShape = (m.shape[axis], Numeric.multiply.reduce(shp_other))
    # transposed and reshaped mask and data: one column per profile
    mask = Numeric.reshape(
        Numeric.transpose(MA.getmaskarray(m), toFront), flatShape)
    data = MA.reshape(MA.transpose(m, toFront), flatShape)
    maskInv = -1 * (mask - 1)  # 1 where the value is known, 0 where masked
    xall = Numeric.arange(data.shape[0])
    xallList = xall.tolist()
    # run loess only on profiles that contain at least 2 known values
    smoothable = Numeric.compress(
        Numeric.add.reduce(maskInv, 0) > 1, range(data.shape[1]))
    for ii in smoothable:
        try:
            knownX = MA.compress(maskInv[:, ii], xall).tolist()
            knownY = MA.compress(maskInv[:, ii], data[:, ii]).tolist()
            data[:, ii] = MA.array(
                statc.loess(list(zip(knownX, knownY)), xallList,
                            windowSize))[:, 1]
        except Exception:
            # best effort: keep the unsmoothed profile, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare except would
            if verbose:
                print("Warning: loessMA: could not loess axis %i index %i" % (
                    axis, ii))
        if callback:
            callback()
    if not approxMasked:
        data = MA.array(data, mask=mask)
    return MA.transpose(
        MA.reshape(data, [m.shape[axis]] + shp_other), fromFront)
Example #4
0
def loessMA(m, windowSize, axis=0, approxMasked=True, verbose=False, callback=None):
    """Returns a new array with values at the given axis smoothed by loess;
    assumes equidistant spacing of points on the given axis.

    m: array-like (converted with MA.asarray and cast to float if needed).
    windowSize: loess window passed on to statc.loess; per the assertion
        message either a fraction in (0, 1] or a number of points up to
        m.shape[axis].
    axis: axis along which to smooth; negative values count from the end.
    approxMasked: if True the masked values are approximated by loess;
        if False the original mask is re-applied to the smoothed data.
    verbose: if True, print a warning for each profile that statc.loess
        fails on (such profiles are left unsmoothed).
    callback: optional zero-argument callable, invoked once per profile
        for which smoothing was attempted.

    The result has the same shape as m.
    """
    m = MA.asarray(m)
    numDims = len(m.shape)
    if axis < 0:
        axis += numDims
    assert 0 < windowSize <= m.shape[axis]+0.1, "0 < windowSize[%s] <= 1 OR windowSize in range(1.1,m.shape[axis]+1) expected, got %f" % ("%", windowSize)
    if m.dtype.char != Numeric.Float:
        m = m.astype(Numeric.Float)
    shp_other = list(m.shape)
    shp_other.pop(axis)
    # permutation moving `axis` to the front, and its inverse for the way back
    # (the two coincide only for axis 0 and axis 1)
    toFront = [axis] + list(range(0, axis)) + list(range(axis+1, numDims))
    fromFront = list(range(1, axis+1)) + [0] + list(range(axis+1, numDims))
    flatShape = (m.shape[axis], Numeric.multiply.reduce(shp_other))
    # transposed and reshaped mask and data: one column per profile
    mask = Numeric.reshape(Numeric.transpose(MA.getmaskarray(m), toFront), flatShape)
    data = MA.reshape(MA.transpose(m, toFront), flatShape)
    maskInv = -1*(mask-1)    # 1 where the value is known, 0 where masked
    xall = Numeric.arange(data.shape[0])
    xallList = xall.tolist()
    # run loess only on profiles that contain at least 2 known values
    for ii in Numeric.compress(Numeric.add.reduce(maskInv,0) > 1, range(data.shape[1])):
        try:
            knownX = MA.compress(maskInv[:,ii], xall).tolist()
            knownY = MA.compress(maskInv[:,ii], data[:,ii]).tolist()
            data[:,ii] = MA.array(statc.loess(list(zip(knownX, knownY)), xallList, windowSize))[:,1]
        except Exception:
            # best effort: keep the unsmoothed profile, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare except would
            if verbose:
                print("Warning: loessMA: could not loess axis %i index %i" % (axis, ii))
        if callback:
            callback()
    if not approxMasked:
        data = MA.array(data, mask=mask)
    return MA.transpose(MA.reshape(data, [m.shape[axis]] + shp_other), fromFront)
Example #5
0
 def ttest_rsmplA(self, ma3d, callback):
     """Conducts related samples t-test on individual examples wrt factor A (variables, ma3d axis 1).

     For each example, ma3d[eIdx][0] and ma3d[eIdx][1] are compared with
     scipy.stats.ttest_rel after dropping the positions masked in either one.
     Examples with fewer than 2 remaining pairs, or for which the test raises,
     get p = 1.0 and a warning is printed.

     callback: zero-argument callable, invoked once per example.

     Returns a Numeric array of p-values in shape (numExamples,).
     """
     numExamples = ma3d.shape[0]
     ps = -1*Numeric.ones((numExamples,), Numeric.Float)
     for eIdx in range(numExamples):
         a = ma3d[eIdx][0]
         b = ma3d[eIdx][1]
         # keep only the positions known in both samples
         cond = Numeric.logical_not(Numeric.logical_or(MA.getmaskarray(a), MA.getmaskarray(b)))
         a = Numeric.asarray(MA.compress(cond, a))
         b = Numeric.asarray(MA.compress(cond, b))
         if len(a) >= 2:
             try:
                 ps[eIdx] = scipy.stats.ttest_rel(a,b)[1]
             except Exception as inst:
                 print("Warning: %s" % str(inst))
                 print("Example %i:\n%s\n%s\n" % (eIdx, str(a), str(b)))
                 ps[eIdx] = 1.0
         else:
             print("Warning: removing example %i:\n%s\n%s\n" % (eIdx, str(a), str(b)))
             ps[eIdx] = 1.0
         callback()
     return ps
Example #6
0
def kNNimputeMA(arr2d, K=20, callback=None):
    """Returns a new 2D MA.array in which masked values are imputed from nearest-neighbour rows.

    For every row that has some, but not all, usable values, the other rows
    are ordered by root-mean-square difference over the jointly known columns.
    Each missing value is the weighted average of the first K known values of
    that column in this order; weights are a tricubic function of the distances.
    Columns without any known value are never imputed from neighbours.
    Rows without any known value are set to the average over all rows.
    callback, if given, is called without arguments once per processed row.
    Version: 30.8.2005
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # the copy that receives the imputed values
    imputed = MA.array(arr2d)
    allRows = Numeric.arange(arr2d.shape[0])
    allCols = Numeric.arange(arr2d.shape[1])
    # columns with at least one known value
    colHasData = Numeric.greater(MA.count(arr2d, axis=0), 0)
    numUsableCols = Numeric.compress(colHasData, allCols).shape[0]
    # rows with 0 < #known_values < #usable_columns are imputed from neighbours
    knownPerRow = MA.count(arr2d, axis=1)
    partialRowCond = Numeric.logical_and(Numeric.greater(knownPerRow, 0),
                                         Numeric.less(knownPerRow, numUsableCols))
    for rowIdx in Numeric.compress(partialRowCond, allRows):
        currRow = arr2d[rowIdx]
        diff = arr2d - MA.resize(currRow, arr2d.shape)
        distances = MA.sqrt(MA.add.reduce(diff**2, 1) / MA.count(diff, axis=1))
        # neighbour row indices by increasing distance; the first entry
        # (taken to be the current row itself) is skipped
        neighbours = MA.argsort(distances)[1:]
        neighDist = distances.take(neighbours)
        # how many of the neighbour distances are not masked
        numMaskedDist = Numeric.add.reduce(Numeric.asarray(MA.getmaskarray(neighDist), Numeric.Int))
        numNonMasked = neighDist.shape[0] - numMaskedDist
        if numNonMasked <= 1:
            weights = Numeric.ones(neighDist.shape[0])
        else:
            # tricubic weights, scaled by the largest non-masked distance
            farthest = neighDist[numNonMasked-1]
            weights = MA.power(1-MA.power(neighDist/farthest,3),3)
        # columns to fill in: masked in the current row, but known somewhere
        missingCols = Numeric.compress(Numeric.logical_and(MA.getmaskarray(currRow), colHasData), allCols)
        for colIdx in missingCols:
            # values of this column, ordered by neighbour distance
            columnVals = arr2d[:,colIdx].take(neighbours)
            # weights of the neighbours that have a known value in this column
            usableWeights = MA.compress(1-MA.getmaskarray(columnVals), weights)
            # average over the first K (or fewer) known values
            imputed[rowIdx,colIdx] = MA.average(columnVals.compressed()[:K], weights=usableWeights[:K])
        if callback:
            callback()
    # rows without a single known value receive the average profile
    avrgRow = MA.average(arr2d, 0)
    for rowIdx in Numeric.compress(Numeric.equal(knownPerRow, 0), allRows):
        imputed[rowIdx] = avrgRow
        if callback:
            callback()
    return imputed
Example #7
0
 def anova2(self, ma3d, groupLens, addInteraction, repMeasuresOnA, callback):
     """Runs a two-way ANOVA separately on every example (ma3d axis 0).

     Uses Anova.AnovaRM12LR if repMeasuresOnA is true, Anova.Anova2wayLR otherwise.
     groupLens gives the lengths of consecutive groups along the last axis of ma3d.
     Indices on ma3d axis 1 that have an empty cell (a group without any known
     value) are removed, first for all examples at once and then per example;
     a warning is printed in both cases.
     callback is called without arguments once per example.

     Returns a Numeric array of p-values in shape (3, numExamples) if
     addInteraction is true, (2, numExamples) otherwise.
     """
     groupLens = Numeric.asarray(groupLens)
     numGroups = groupLens.shape[0]
     # p-values, initialised to 1
     if addInteraction:
         numEffects = 3
     else:
         numEffects = 2
     ps = Numeric.ones((numEffects, ma3d.shape[0]), Numeric.Float)
     # non-repeated / repeated measures ANOVA
     if not repMeasuresOnA:
         fAnova = Anova.Anova2wayLR
     else:
         fAnova = Anova.AnovaRM12LR
     # (start, stop) index pairs of the groups on the last axis
     ax2Ind = Numeric.concatenate(([0], Numeric.add.accumulate(groupLens)))
     groupSlices = list(zip(ax2Ind[:-1], ax2Ind[1:]))
     # axis-1 indices where some group is empty across all examples
     emptyForAll = []
     for aIdx in range(ma3d.shape[1]):
         for i0, i1 in groupSlices:
             if Numeric.add.reduce(MA.count(ma3d[:,aIdx,i0:i1],1)) == 0:
                 emptyForAll.append(aIdx)
                 break
     if emptyForAll:
         print("Warning: removing time indices %s for all genes" % (str(emptyForAll)))
         keptInd = [aIdx for aIdx in range(ma3d.shape[1]) if aIdx not in emptyForAll]
         ma3d = ma3d.take(keptInd, 1)
     # per-example ANOVA
     for eIdx in range(ma3d.shape[0]):
         ma2d = ma3d[eIdx]
         # number of known values in each (axis-1 index, group) cell
         cellCount = MA.zeros((ma2d.shape[0], numGroups), Numeric.Int)
         for g, (i0, i1) in enumerate(groupSlices):
             cellCount[:,g] = MA.count(ma2d[:,i0:i1], 1)
         numEmptyCells = Numeric.add.reduce(Numeric.equal(cellCount,0),1)
         ma2dTakeInd = Numeric.logical_not(numEmptyCells) # 1 = keep, 0 = drop
         if Numeric.add.reduce(ma2dTakeInd) != ma2dTakeInd.shape[0]:
             droppedInd = Numeric.compress(ma2dTakeInd == 0, Numeric.arange(ma2dTakeInd.shape[0]))
             print("Warning: removing time indices %s for gene %i" % (str(droppedInd), eIdx))
             ma2d = MA.compress(ma2dTakeInd, ma2d, 0)
         an = fAnova(ma2d, groupLens, addInteraction, allowReductA=True, allowReductB=True)
         ps[:,eIdx] = an.ps
         callback()
     return ps
Example #8
0
def kNNimputeMA(arr2d, K=20, callback=None):
    """Returns a new 2D MA.array in which masked values are imputed from nearest-neighbour rows.

    For every row that has some, but not all, usable values, the other rows
    are ordered by root-mean-square difference over the jointly known columns.
    Each missing value is the weighted average of the first K known values of
    that column in this order; weights are a tricubic function of the distances.
    Columns without any known value are never imputed from neighbours.
    Rows without any known value are set to the average over all rows.
    callback, if given, is called without arguments once per processed row.
    Version: 30.8.2005
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # the copy that receives the imputed values
    imputed = MA.array(arr2d)
    allRows = Numeric.arange(arr2d.shape[0])
    allCols = Numeric.arange(arr2d.shape[1])
    # columns with at least one known value
    colHasData = Numeric.greater(MA.count(arr2d, axis=0), 0)
    numUsableCols = Numeric.compress(colHasData, allCols).shape[0]
    # rows with 0 < #known_values < #usable_columns are imputed from neighbours
    knownPerRow = MA.count(arr2d, axis=1)
    partialRowCond = Numeric.logical_and(
        Numeric.greater(knownPerRow, 0),
        Numeric.less(knownPerRow, numUsableCols))
    for rowIdx in Numeric.compress(partialRowCond, allRows):
        currRow = arr2d[rowIdx]
        diff = arr2d - MA.resize(currRow, arr2d.shape)
        sumSquares = MA.add.reduce(diff**2, 1)
        distances = MA.sqrt(sumSquares / MA.count(diff, axis=1))
        # neighbour row indices by increasing distance; the first entry
        # (taken to be the current row itself) is skipped
        neighbours = MA.argsort(distances)[1:]
        neighDist = distances.take(neighbours)
        # how many of the neighbour distances are not masked
        numMaskedDist = Numeric.add.reduce(
            Numeric.asarray(MA.getmaskarray(neighDist), Numeric.Int))
        numNonMasked = neighDist.shape[0] - numMaskedDist
        if numNonMasked <= 1:
            weights = Numeric.ones(neighDist.shape[0])
        else:
            # tricubic weights, scaled by the largest non-masked distance
            farthest = neighDist[numNonMasked - 1]
            weights = MA.power(1 - MA.power(neighDist / farthest, 3), 3)
        # columns to fill in: masked in the current row, but known somewhere
        missingCols = Numeric.compress(
            Numeric.logical_and(MA.getmaskarray(currRow), colHasData),
            allCols)
        for colIdx in missingCols:
            # values of this column, ordered by neighbour distance
            columnVals = arr2d[:, colIdx].take(neighbours)
            # weights of the neighbours that have a known value in this column
            usableWeights = MA.compress(1 - MA.getmaskarray(columnVals),
                                        weights)
            # average over the first K (or fewer) known values
            imputed[rowIdx, colIdx] = MA.average(
                columnVals.compressed()[:K], weights=usableWeights[:K])
        if callback:
            callback()
    # rows without a single known value receive the average profile
    avrgRow = MA.average(arr2d, 0)
    for rowIdx in Numeric.compress(Numeric.equal(knownPerRow, 0), allRows):
        imputed[rowIdx] = avrgRow
        if callback:
            callback()
    return imputed