コード例 #1
0
# Hyper-parameter candidates handed to the autoencoder tuner, keyed by the
# names "hidden" and "l2".
hyperParameters = {"hidden": hiddenOpt, "l2": l2Opt}

# Tune over the candidates and train, using a Tanh autoencoder template with
# constant columns kept and 100 epochs.
bestModel = autoencoder.tuneAndTrain(
    hyperParameters,
    H2OAutoEncoderEstimator(activation="Tanh",
                            ignore_const_cols=False,
                            epochs=100), dataFramePreprocessed)

# Assign invalidity scores
invalidityScores = autoencoder.assignInvalidityScore(bestModel,
                                                     dataFramePreprocessed)

# Detect faulty records. The median invalidity score is passed as the third
# argument (presumably the cut-off threshold -- confirm against Testing).
testing = Testing()
faultyRecordFrame = testing.detectFaultyRecords(dataFrame, invalidityScores,
                                                np.median(invalidityScores))

# Show the faulty records, highest invalidity score first.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(faultyRecordFrame.sort_values(by=['invalidityScore'], ascending=False))

# Preprocess the faulty records again, after dropping the first column and
# the 'invalidityScore' column. The two lists name the columns forwarded to
# preprocess() (presumably categorical and numeric columns respectively --
# confirm against dataCollection.preprocess).
faultyRecordFramePreprocessed = dataCollection.preprocess(
    faultyRecordFrame.drop(
        [faultyRecordFrame.columns.values[0], 'invalidityScore'], axis=1),
    ['gender_concept_id', 'measurement_type_concept_id'],
    ['year_of_birth', 'value_as_number', 'range_low', 'range_high'])

# Cluster the faulty records
# Build a 5*5 SOM; the last argument is 400.
# NOTE(review): an earlier comment said "100 iterations" -- confirm whether
# the fourth SOM argument is the iteration count and which value is intended.
# Exclude id column and invalidity score for clustering (hence the "- 2").
# NOTE(review): the input dimension is taken from the raw faulty-record frame,
# not from faultyRecordFramePreprocessed; if preprocess() changes the number
# of columns these will disagree -- confirm.
som = SOM(5, 5, len(faultyRecordFrame.columns.values) - 2, 400)
print(som.clusterFaultyRecords(faultyRecordFramePreprocessed,
                               faultyRecordFrame))