Example #1
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# `data`, `wellIndex`, `scaledArray`, and `configuration` are helpers from the
# surrounding module and are assumed to be in scope.
def clusters(dataSet, features, exemplars):
    ftrs = list(features)

    wI = wellIndex(dataSet)
    objectCount = len(wI)
    predicted = np.empty(objectCount, dtype=int)

    # Default to all features if none have been selected.
    if not ftrs:
        ftrs = data.imageFeatures(dataSet)

    if ftrs and exemplars:
        # Training feature data: objects as rows, features as columns.
        valueMatrix = np.array([scaledArray(dataSet, ftr) for ftr in ftrs]).transpose()

        # Construct from well type annotation.
        trainingLabels = np.copy(wI['type'].values)

        # Knock out a large part of the training labels (to speed up training).
        trainingSample = np.random.rand(trainingLabels.size) < configuration(dataSet).wellTypeSample
        trainingLabels = np.where(trainingSample, trainingLabels, np.nan)

        # Override well type annotations where exemplars have been chosen by the user.
        exemplarDict = dict(exemplars)

        for popId, exemplarList in exemplarDict.items():
            for exemplar in exemplarList:
                trainingLabels[exemplar] = popId

        # Prune training features and labels, based on presence of labels.
        trainingValues = valueMatrix[~np.isnan(trainingLabels)]
        trainingLabels = trainingLabels[~np.isnan(trainingLabels)]

        print "Begin training"
        #trainingValues = np.take(valueMatrix, exemplarObjects, axis=0)
        forest = RandomForestClassifier(
            n_estimators=10,
            n_jobs=-1,
            class_weight="balanced"#,
            #min_samples_split=0.01*trainingValues.size
        )
        forest = forest.fit(trainingValues, trainingLabels)    #forest.fit(trainingValues, exemplarLabels)
        print "End training"

        print "Begin classification"
        #predicted = forest.predict(valueMatrix)
        confidenceThreshold = data.config(dataSet).classifierConfidenceThreshold
        probabilities = forest.predict_proba(valueMatrix)
        maxProb = np.max(probabilities, axis=1)
        maxArgProb = np.argmax(probabilities, axis=1)
        predicted = np.where(maxProb > confidenceThreshold, np.choose(maxArgProb, forest.classes_), 2).astype(np.int)
        print "End classification"
    else:
        predicted.fill(2)   # Mark all objects as class 2 ("unsure") when no training input is given.

    # Predicted class label per object.
    return predicted
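
A minimal usage sketch for clusters(); the data set handle, feature names, and exemplar indices below are all hypothetical:

labels = clusters(
    someDataSet,                        # data set handle, as consumed by data.imageFeatures etc.
    ["cellArea", "nucleusIntensity"],   # hypothetical feature names
    {1: [10, 42], 3: [7]},              # population id -> exemplar object indices (hypothetical)
)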
Example #2
def featureOrdering(dataSet):
    from ordering.rearrange import rearrange

    print "Order features by correlation"
    objectSet = selectImageFeatures(dataSet, smallSample(dataSet))
    corr = objectSet.corr()
    distances = 1 - corr.abs()
    rearrangedSubset = rearrange(distances.values)

    ftrs = data.imageFeatures(dataSet)
    return [ftrs[i] for i in rearrangedSubset]
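
ordering.rearrange is not shown in these examples; assuming it returns an index order that places correlated features next to each other, a rough stand-in (not the actual implementation) could use SciPy's hierarchical clustering:

from scipy.cluster.hierarchy import leaves_list, linkage
from scipy.spatial.distance import squareform

def rearrangeSketch(distanceMatrix):
    # Condense the square distance matrix, then order features by the
    # leaf order of an average-linkage dendrogram.
    condensed = squareform(distanceMatrix, checks=False)
    return leaves_list(linkage(condensed, method="average"))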
Example #3
from multiprocessing import Pool

# `clustersAsMap` and `featureHistogram` are helpers from the surrounding module.
def featureHistograms(dataSet, featureSet, exemplars, bins):
    partition = clustersAsMap(dataSet, featureSet, exemplars)

    # One task per feature/cluster combination.
    tasks = [(dataSet, featureSet, exemplars, feature, cluster, bins)
             for feature in data.imageFeatures(dataSet)
             for cluster in partition]

    pool = Pool()
    results = pool.imap(featureHistogram, tasks)
    pool.close()
    pool.join()

    histograms = {cluster: {} for cluster in partition}
    for feature, cluster, histogram in results:
        histograms[cluster][feature] = histogram

    return histograms
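
featureHistogram itself is defined elsewhere; the unpacking loop above implies it takes one task tuple and returns a (feature, cluster, histogram) triple. A sketch under that assumption, with a hypothetical selection helper:

def featureHistogram(task):
    dataSet, featureSet, exemplars, feature, cluster, bins = task
    # clusterFeatureValues is a hypothetical helper returning the values of
    # `feature` for the objects assigned to `cluster`.
    values = clusterFeatureValues(dataSet, featureSet, exemplars, feature, cluster)
    counts, _ = np.histogram(values, bins=bins)
    return feature, cluster, counts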
Example #4
def featureColumns(dataSet):
    def column(colName):
        # Z-score normalize a single feature column.
        col = data.numpyDump(dataSet, colName).astype(np.float64)
        return (col - np.mean(col)) / np.std(col)

    return np.array([column(col) for col in data.imageFeatures(dataSet)])
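
featureColumns() divides by the per-column standard deviation, so a constant column would produce invalid values; a defensive variant (a behavior choice not present in the original) could map constant columns to zeros:

def columnSafe(dataSet, colName):
    col = data.numpyDump(dataSet, colName).astype(np.float64)
    std = np.std(col)
    # A constant column carries no signal; return zeros instead of dividing by zero.
    return (col - np.mean(col)) / std if std > 0 else np.zeros_like(col)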
Example #5
def selectImageFeatures(dataSet, subset):
    # Restrict a DataFrame-like subset to the data set's image feature columns.
    return subset[data.imageFeatures(dataSet)]
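
A hypothetical call, mirroring how Example #2 combines the same helpers (someDataSet is an assumption):

featureFrame = selectImageFeatures(someDataSet, smallSample(someDataSet))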