Example #1
    def test_predict_explain(self, binary_problem=False):
        __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
        __nonlabeled_inp_df = self.nonlabeled_inp_df.copy(deep=True)
        __labeled_binary_inp_df = self.labeled_binary_inp_df.copy(deep=True)
        __schema_with_label = self.schema_with_label.copy()

        ac = Classifier(model_configuration=testModelConfiguration)

        if binary_problem:
            ac.train(input_df=__labeled_binary_inp_df,
                     schema=__schema_with_label)
        else:
            ac.train(input_df=__labeled_inp_df, schema=__schema_with_label)
        lr, fm, lm = ac.get_models()

        ac = Classifier(model_configuration=testModelConfiguration)
        ac.load_models(lr, fm, lm)

        res_df = ac.predict_explain(input_df=__nonlabeled_inp_df)

        self.assertIsInstance(res_df, pd.DataFrame)
        self.assertEqual(len(res_df.columns),
                         len(self.fields_without_label) + 4)
        self.assertEqual(res_df.dtypes.iloc[-1], "object")
        self.assertEqual(res_df.dtypes.iloc[-2], "float64")
        self.assertEqual(res_df.dtypes.iloc[-3], "object")
        self.assertEqual(res_df.dtypes.iloc[-4], "object")

        # Test that every top-contributing feature actually exists in the input
        def chk_contributor_existence(row):
            contributors = row["TopContributors"].split(';')
            features = [contrib.split('=')[0] for contrib in contributors]
            for feat in features:
                if '::' in feat:
                    field_name, field_value = feat.split('::')
                    self.assertIn(field_name, self.fields_without_label)
                    fld_no = list(res_df.columns).index(field_name)
                    if self.schema_without_label[fld_no] in ["text", "set"]:
                        # Tokenized field types ("text"/"set"): the contributor
                        # value may be a processed token, so no exact-match
                        # assertion is made here.
                        # self.assertIn(field_value, row[field_name].lower())
                        pass
                    elif self.schema_without_label[fld_no] in [
                            "string", "numeric", "boolean"
                    ]:
                        self.assertEqual(field_value, row[field_name])
                else:
                    field_name = feat
                    if len(field_name) > 0:
                        self.assertIn(field_name, self.fields_without_label)

        res_df.apply(chk_contributor_existence, axis=1)
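
Judging from the assertions above, the TopContributors column holds a ';'-joined string in which each entry is of the form field=... or field::value=... . Below is a minimal standalone parser for that format, written as a sketch against the behaviour the test asserts; the function name and return shape are our own, not part of the Classifier API:

def parse_top_contributors(cell):
    """Split a TopContributors cell into (field_name, field_value) pairs.

    Sketch only: assumes the ';'-separated, '='-delimited layout that
    test_predict_explain checks; field_value is None for plain fields.
    """
    pairs = []
    for entry in cell.split(';'):
        feat = entry.split('=')[0]
        if '::' in feat:
            field_name, field_value = feat.split('::')
        else:
            field_name, field_value = feat, None
        if field_name:
            pairs.append((field_name, field_value))
    return pairs

# Example: parse_top_contributors("age=0.4;country::DE=0.2")
# -> [('age', None), ('country', 'DE')]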
Example #2
def classify(cachedModelID, data):
    startedTime = datetime.datetime.now()
    assert cachedModelID in cachedMSR, "Model not found."
    model = cachedMSR[cachedModelID]['selectedModel']

    emptyResults = {
        'id': -1,
        'classSummaries': []
    }

    # debug
    print('Received a dataset with', len(data['features']), 'features to classify.')
    if len(data['features']) == 0:
        print('There are no features; an empty result set is returned.')
        return emptyResults
    print('Received a dataset with', len(data['features'][0]['data']), 'rows to classify.')
    if len(data['features'][0]['data']) == 0:
        print('There is no data; an empty result set is returned.')
        return emptyResults

    candidate = model["candidate"]
    features = candidate["features"]
    config = candidate["config"]

    unlabeled_df = datasetToDataframe(data)
    filtered_input_df = unlabeled_df.filter([f['name'] for f in features])

    lr, fm, lm = loadTrainedModel(model)

    ac = Classifier(model_configuration=config)
    ac.load_models(lr, fm, lm)

    res_df = ac.predict_explain(input_df=filtered_input_df, topN_features=10)
    recommend_df = ac.input_qlty(input_df=filtered_input_df, topN=10)
    res_df = pd.concat([res_df, recommend_df.filter(["SuggestedFeatures"])], axis=1)

    plCountSeries = res_df.groupby('PredictedLabel').PredictedLabel.count()
    labels = list(plCountSeries.index)

    classSummaries = []

    for label in labels:
        filtered_res_df = res_df[res_df.PredictedLabel == label]
        entropies = []
        probabilities = []
        results = []
        for data_index, row in filtered_res_df.iterrows():
            entropies.append(float(row.Entropy))
            probsDict, allLabels = unpackProbs(row.Probabilities)
            probabilities.append(float(probsDict[label]))
            contributors = unpackContribs(row.TopContributors)
            recommends = unpackSuggestedFeatures(row.SuggestedFeatures)

            # NOTE: id() below is assumed to be a module-local ID generator;
            # the builtin id() takes one argument, so a helper must shadow it.
            input_data = []
            for feat in data['features']:
                input_data.append({
                    'id': id(),
                    'feature': feat['feature'],
                    'data': [feat['data'][data_index]]
                })
            data_instance = {
                'id': id(),
                'dataset': { 'id': id(),
                             'features': input_data},
                'index': data_index
            }

            classificationResult = {
                'id': id(),
                'allLabels': allLabels,
                'entropy': float(row.Entropy),
                'contributors': contributors,
                'dataInstance': data_instance,
                'predictedLabel': {
                    'id': id(),
                    'label': label,
                    'probability': float(probsDict[label])
                },
                'recommends': recommends
            }

            results.append(classificationResult)
        
        classSummary = {
            'id': id(),
            'label': label,
            'numInstances': int(plCountSeries[label]),
            'probabilities': probabilities,
            'entropies': entropies,
            'results': results
        }

        classSummaries.append(classSummary)

    batchClassificationResult = {
        'id': id(),
        "classSummaries": classSummaries
    }

    print('Classification time: ' + str((datetime.datetime.now() - startedTime).total_seconds()) + ' seconds')

    return batchClassificationResult
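
classify() leans on several helpers the snippet does not define (datasetToDataframe, loadTrainedModel, the unpack* functions, and the cachedMSR registry). Below is a hedged sketch of datasetToDataframe plus a hypothetical invocation; it assumes only the access pattern visible above (data['features'] is a list of {'feature': name, 'data': [values]} dicts), and the feature names and cache key are made up for illustration:

import pandas as pd

def datasetToDataframe(data):
    # Sketch: classify() only reads feat['feature'] and feat['data'],
    # so one DataFrame column per feature entry is assumed.
    return pd.DataFrame({f['feature']: f['data'] for f in data['features']})

# Hypothetical call, assuming a model was cached under 'my-model':
payload = {
    'features': [
        {'feature': 'age', 'data': [34, 51]},          # made-up features
        {'feature': 'country', 'data': ['DE', 'US']},
    ]
}
result = classify('my-model', payload)
for summary in result['classSummaries']:
    print(summary['label'], summary['numInstances'])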