Exemple #1
0
def kNNimputeMA(arr2d, K=20, callback=None):
    """Returns a new 2D MA.array with missing values imputed from K nearest neighbours.

    For every row that has some, but not all, values known, the other rows are ranked by
    distance = sqrt(sum of squared non-masked differences / number of non-masked differences).
    Each missing value is replaced by the weighted average of the values that the nearest
    rows have in that column; rows whose value in that column is masked are skipped, so up to
    K known values are used per imputed entry. Weights follow the tricubic function
    (1 - (d/dmax)**3)**3, where dmax is the largest non-masked distance to the other rows.
    Rows with no known values are replaced by the column-wise average over all rows.
    Columns with no known values are never imputed.

    arr2d: 2D array (converted with MA.asarray); it is not modified.
    K: maximum number of neighbour values averaged for one imputed entry.
    callback: optional callable, invoked without arguments after each processed row.

    Returns a copy of arr2d (as MA.array) with the imputed values filled in.
    Raises AssertionError if arr2d is not two-dimensional.
    Version: 30.8.2005
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # make a copy for imputation
    aImp2 = MA.array(arr2d)
    # leave out columns with 0 known values (columnInd: non-zero columns)
    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
    columnIndAll = Numeric.arange(arr2d.shape[1])
    columnInd = Numeric.compress(columnCond, columnIndAll)
    rowIndAll = Numeric.arange(arr2d.shape[0])
    # impute the rows where 0 < #known_values < #non_zero_columns, i.e. exclude the rows with 0 and all (non-zero-column) values
    countByRows = MA.count(arr2d, axis=1)
    partialRowCond = Numeric.logical_and(Numeric.greater(countByRows, 0), Numeric.less(countByRows, columnInd.shape[0]))
    for rowIdx in Numeric.compress(partialRowCond, rowIndAll):
        # distance of every row to the current row, computed over the non-masked differences only
        rowResized = MA.resize(arr2d[rowIdx], arr2d.shape)
        diff = arr2d - rowResized
        distances = MA.sqrt(MA.add.reduce((diff)**2, 1) / MA.count(diff, axis=1))
        # nearest neighbours row indices (without the current row index);
        # the current row is removed by index rather than by position, because another row
        # at distance 0 may be sorted in front of it and would otherwise be dropped instead
        indSortedAll = MA.argsort(distances)
        indSorted = Numeric.compress(Numeric.not_equal(indSortedAll, rowIdx), indSortedAll)
        distSorted = distances.take(indSorted)
        # number of distances different from MA.masked
        numNonMasked = distSorted.shape[0] - Numeric.add.reduce(Numeric.asarray(MA.getmaskarray(distSorted), Numeric.Int))
        # weights of all neighbours, scaled by the largest non-masked distance
        # NOTE(review): if that largest distance is 0 the division below is 0/0 - confirm how MA handles it
        if numNonMasked > 1:
            weightsSorted = MA.power(1-MA.power(distSorted/distSorted[numNonMasked-1],3),3) # tricubic distribution of all weights
        else:
            weightsSorted = Numeric.ones(distSorted.shape[0])
        # compute average for each column separately in order to account for K non-masked values
        # (only columns that are masked in the current row and have at least one known value)
        colInd4CurrRow = Numeric.compress(Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond), columnIndAll)
        for colIdx in colInd4CurrRow:
            # column values sorted by distances
            columnVals = arr2d[:,colIdx].take(indSorted)
            # take only those weights where columnVals does not equal MA.masked
            weightsSortedCompressed = MA.compress(1-MA.getmaskarray(columnVals), weightsSorted)
            # impute from K (or possibly less) values
            aImp2[rowIdx,colIdx] = MA.average(columnVals.compressed()[:K], weights=weightsSortedCompressed[:K])
        if callback:
            callback()
    # impute the unknown rows with average profile
    avrgRow = MA.average(arr2d, 0)
    for rowIdx in Numeric.compress(Numeric.equal(countByRows, 0), rowIndAll):
        aImp2[rowIdx] = avrgRow
        if callback:
            callback()
    return aImp2
Exemple #2
0
def rankDataMA(m, inverse=False):
    """Returns ranks of 1D masked array; masked values ignored, range 1...#non-masked_values.

    m: 1D array (converted with MA.asarray).
    inverse: if true, the ranks are reversed (computed as #non-masked_values - ascending_rank + 1 - 1,
        i.e. count - zero_based_rank), so the largest value gets rank 1.

    Returns a masked float array of the same length as m, masked wherever m is masked.
    Raises AssertionError if m is not one-dimensional.
    """
    m = MA.asarray(m)
    assert MA.rank(m) == 1
    # Temporarily set the fill value above the maximum, presumably so that MA.argsort places
    # the masked entries after all known values. m may be the caller's own array, so the
    # original fill value is restored even if ranking fails.
    fill_val = m.fill_value()
    m.set_fill_value(MA.maximum(m) + 1)
    try:
        r = MA.zeros(m.shape[0], Numeric.Float)
        # r[i] = zero-based position of element i in the sorted order
        MA.put(r, MA.argsort(m), Numeric.arange(m.shape[0]))
    finally:
        m.set_fill_value(fill_val)
    # hide the ranks that were assigned to masked entries
    r = MA.array(r, mask=MA.getmaskarray(m))
    if inverse:
        return -1*r+MA.count(m)
    else:
        return r+1
Exemple #3
0
def kNNimputeMA(arr2d, K=20, callback=None):
    """Returns a new 2D MA.array with missing values imputed from K nearest neighbours.

    For every row that has some, but not all, values known, the other rows are ranked by
    distance = sqrt(sum of squared non-masked differences / number of non-masked differences).
    Each missing value is replaced by the weighted average of the values that the nearest
    rows have in that column; rows whose value in that column is masked are skipped, so up to
    K known values are used per imputed entry. Weights follow the tricubic function
    (1 - (d/dmax)**3)**3, where dmax is the largest non-masked distance to the other rows.
    Rows with no known values are replaced by the column-wise average over all rows.
    Columns with no known values are never imputed.

    arr2d: 2D array (converted with MA.asarray); it is not modified.
    K: maximum number of neighbour values averaged for one imputed entry.
    callback: optional callable, invoked without arguments after each processed row.

    Returns a copy of arr2d (as MA.array) with the imputed values filled in.
    Raises AssertionError if arr2d is not two-dimensional.
    Version: 30.8.2005
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # make a copy for imputation
    aImp2 = MA.array(arr2d)
    # leave out columns with 0 known values (columnInd: non-zero columns)
    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
    columnIndAll = Numeric.arange(arr2d.shape[1])
    columnInd = Numeric.compress(columnCond, columnIndAll)
    rowIndAll = Numeric.arange(arr2d.shape[0])
    # impute the rows where 0 < #known_values < #non_zero_columns, i.e. exclude the rows with 0 and all (non-zero-column) values
    countByRows = MA.count(arr2d, axis=1)
    partialRowCond = Numeric.logical_and(
        Numeric.greater(countByRows, 0),
        Numeric.less(countByRows, columnInd.shape[0]))
    for rowIdx in Numeric.compress(partialRowCond, rowIndAll):
        # distance of every row to the current row, computed over the non-masked differences only
        rowResized = MA.resize(arr2d[rowIdx], arr2d.shape)
        diff = arr2d - rowResized
        distances = MA.sqrt(
            MA.add.reduce((diff)**2, 1) / MA.count(diff, axis=1))
        # nearest neighbours row indices (without the current row index);
        # the current row is removed by index rather than by position, because another row
        # at distance 0 may be sorted in front of it and would otherwise be dropped instead
        indSortedAll = MA.argsort(distances)
        indSorted = Numeric.compress(
            Numeric.not_equal(indSortedAll, rowIdx), indSortedAll)
        distSorted = distances.take(indSorted)
        # number of distances different from MA.masked
        numNonMasked = distSorted.shape[0] - Numeric.add.reduce(
            Numeric.asarray(MA.getmaskarray(distSorted), Numeric.Int))
        # weights of all neighbours, scaled by the largest non-masked distance
        # NOTE(review): if that largest distance is 0 the division below is 0/0 - confirm how MA handles it
        if numNonMasked > 1:
            weightsSorted = MA.power(
                1 - MA.power(distSorted / distSorted[numNonMasked - 1], 3),
                3)  # tricubic distribution of all weights
        else:
            weightsSorted = Numeric.ones(distSorted.shape[0])
        # compute average for each column separately in order to account for K non-masked values
        # (only columns that are masked in the current row and have at least one known value)
        colInd4CurrRow = Numeric.compress(
            Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond),
            columnIndAll)
        for colIdx in colInd4CurrRow:
            # column values sorted by distances
            columnVals = arr2d[:, colIdx].take(indSorted)
            # take only those weights where columnVals does not equal MA.masked
            weightsSortedCompressed = MA.compress(
                1 - MA.getmaskarray(columnVals), weightsSorted)
            # impute from K (or possibly less) values
            aImp2[rowIdx,
                  colIdx] = MA.average(columnVals.compressed()[:K],
                                       weights=weightsSortedCompressed[:K])
        if callback:
            callback()
    # impute the unknown rows with average profile
    avrgRow = MA.average(arr2d, 0)
    for rowIdx in Numeric.compress(Numeric.equal(countByRows, 0),
                                   rowIndAll):
        aImp2[rowIdx] = avrgRow
        if callback:
            callback()
    return aImp2