Example #1
def random_feature():
    feature_types = ['NUMERICAL', 'CATEGORICAL', 'TEXT', 'SET', 'BOOLEAN']
    fIndex = rd.randint(0, 100)
    fName = 'feature' + str(fIndex) + id()[:8]
    fType = rd.choice(feature_types)

    return {'id': id(), 'index': fIndex, 'name': fName, 'type': fType}
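
These snippets appear to share one test-utility module. Three names are used without being defined here: rd, id, and words (Example #8). Note that id shadows the Python builtin: it is called with no arguments and its result is sliced (Example #1), so it must be a project helper returning a string, presumably the iris_unique_id seen in Examples #6 and #22. A minimal sketch of that assumed context:

import random as rd
import uuid

def id():
    # Hypothetical stand-in for the project's unique-ID helper
    # (presumably iris_unique_id); returns a hex string.
    return uuid.uuid4().hex

# Placeholder corpus for TEXT features; the original module presumably
# defines a larger word list.
words = ['alpha', 'bravo', 'charlie', 'delta']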
Example #2
def random_labeled_dataset(num_classes=None):
    # A default of rd.randint(6, 8) would be evaluated once, at definition
    # time, and then reused for every call; draw it per call instead.
    if num_classes is None:
        num_classes = rd.randint(6, 8)
    dataset = random_dataset()
    num_features = len(dataset['features'])
    label = {
        'id': id(),
        'index': num_features,
        'name': "Label",
        'type': "LABEL"
    }
    data = []
    num_entries = len(dataset['features'][0]['data'])

    for _ in range(num_entries):
        data.append(random_labeldata(num_classes=num_classes))
    feature_data = {'id': id(), 'feature': label, 'data': data}

    # Must be given a unique ID.
    return {'id': id(), 'data': dataset, 'label': feature_data}
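
A usage sketch; the shape follows from the code above and random_labeldata (Example #23):

labeled = random_labeled_dataset(num_classes=4)
# labeled['data'] is the underlying random_dataset();
# labeled['label']['data'] holds one {'id': ..., 'text': 'class N'} entry per row.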
Example #3
def random_min_max_scaler():
    return {
        'id': id(),
        'minValue': rd.random(),
        'maxValue': rd.randint(1, 10) + rd.random(),
        'scale': rd.random(),
        'dataMin': rd.random(),
        'dataMax': rd.randint(1, 10) + rd.random()
    }
Example #4
def random_dataset():
    num_features = rd.randint(10, 20)
    num_entries = rd.randint(100, 200)
    feature_data = []

    feature = random_train_group_feature()
    fvalues = []
    for _ in range(num_entries):
        fvalues.append({'id': id(), 'numerical': rd.randint(1, 5)})
    feature_data.append({'id': id(), 'feature': feature, 'data': fvalues})

    for _ in range(num_features):
        feature = random_feature()
        fvalues = []
        for _ in range(num_entries):
            fvalues.append(random_dataentry(feature['type']))
        feature_data.append({'id': id(), 'feature': feature, 'data': fvalues})

    return {'id': id(), 'features': feature_data}
Example #5
def random_tfidf_vectorizer():
    num_terms = rd.randint(100, 600)
    term_feature_mapping = []
    idfs = []
    for tidx in range(num_terms):
        term = "term" + str(tidx)
        fidx = 100 + tidx
        term_feature_mapping.append({
            'id': id(),
            'term': term,
            'featureIdx': fidx
        })
        idfs.append(rd.random())

    return {
        'id': id(),
        'vocab': term_feature_mapping,
        'idf': idfs,
        'stopwords': ['the', 'this', 'a', 'an', 'those', 'these', 'at', 'on']
    }
Example #6
def flattenLabeledRiskToDataset(labeled_risks):
    allRisks = [labeled_risk['risk'] for labeled_risk in labeled_risks]
    labels = [{
        'id': id(),
        'text': ' '.join([labeled_risk['severity'], labeled_risk['likelihood']])
    } for labeled_risk in labeled_risks]

    label_feature = {
        'id': id(),
        'feature': {
            'id': id(),
            'index': 12,
            'name': 'severity likelihood',
            'type': "LABEL"
        },
        'data': labels
    }
    dataset = flattenRiskToDataset(allRisks)

    return {'id': iris_unique_id(), 'data': dataset, 'label': label_feature}
Example #7
def random_class_weights(num_features, labels):
    class_weights = []
    for label in labels:
        intercept = rd.randint(0, 100) * 1.0 / 100
        feature_weights = []
        for _ in range(num_features):
            num_weights = rd.randint(1, 10)
            feature_weights.append({
                'id': id(),
                'feature': random_feature(),
                'weights': [rd.randint(0, 100) * 1.0 / 100] * num_weights
            })
        class_weights.append({
            'id': id(),
            'weights': feature_weights,
            'class': label,
            'intercept': intercept
        })

    return class_weights
Example #8
def random_dataentry(ftype):
    if ftype == "CATEGORICAL":
        num_categories = rd.randint(6, 22)
        dval = "string " + str(rd.randint(1, num_categories))
        return {'id': id(), 'text': dval}
    elif ftype == "TEXT":
        random_words = []
        for _ in range(rd.randint(50, 300)):
            random_words.append(rd.choice(words))
        dval = " ".join(random_words)
        return {'id': id(), 'text': dval}
    elif ftype == "NUMERICAL":
        dval = rd.randint(0, 100) * 1.0 / 100
        return {'id': id(), 'numerical': dval}
    elif ftype == "BOOLEAN":
        dval = rd.randint(0, 1)
        return {'id': id(), 'numerical': dval}
    else:  # SET
        num_categories = rd.randint(3, 10)
        dval = []
        for _ in range(rd.randint(1, 3)):
            dval.append("string " + str(rd.randint(1, num_categories)))
        return {'id': id(), 'set': dval}
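
A quick usage sketch; the payload key depends on the feature type (values are illustrative):

entry = random_dataentry('SET')        # e.g. {'id': '...', 'set': ['string 2', 'string 7']}
entry = random_dataentry('NUMERICAL')  # e.g. {'id': '...', 'numerical': 0.42}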
Example #9
def random_model_performance(num_classes):
    class_performances = []
    for lblidx in range(num_classes):
        label = "Label " + str(lblidx + 1)
        buckets = []
        total_num_instances = 0
        for bucket_idx in range(num_classes):
            num_instances = rd.randint(20, 80)
            total_num_instances += num_instances
            buckets.append({
                'id': id(),
                'trueLabel': label,
                'predictedLabel': "Label " + str(bucket_idx + 1),
                'numInstances': num_instances,
                'weight': rd.random(),
            })
        perf = {
            'id': id(),
            'label': label,
            'weight': rd.random(),
            'numInstances': total_num_instances,
            'classifiedAs': buckets,
            'recall': rd.random(),
            'precision': rd.random(),
            'f1': rd.random()
        }
        class_performances.append(perf)

    return {
        'id': id(),
        'classPerformances': class_performances,
        'numInstances': rd.randint(50, 100),
        'avgRecall': rd.random(),
        'avgPrecision': rd.random(),
        'avgF1': rd.random()
    }
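
The result is shaped like a per-class confusion matrix: each class performance carries one 'classifiedAs' bucket per predicted label. For example:

perf = random_model_performance(3)
row = perf['classPerformances'][0]
# row['classifiedAs'] has 3 buckets, i.e. one row of a 3x3 confusion matrix.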
Example #10
def unpackProbs(prob):
    res_dict = {}
    all_labels_list = []

    probList = prob.split(',')
    
    for kv in probList:
        k, v = kv.split(':')
        v = float(v)
        res_dict[k] = v
        predictedLabel = {
            'id': id(),
            'label': k,
            'probability': v
        }
        all_labels_list.append(predictedLabel)
    
    return res_dict, all_labels_list
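
The expected input format, inferred from the parser, is comma-separated label:probability pairs:

probs, labels = unpackProbs('High Likely:0.7,Low Rare:0.3')
# probs  -> {'High Likely': 0.7, 'Low Rare': 0.3}
# labels -> [{'id': ..., 'label': 'High Likely', 'probability': 0.7}, ...]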
Example #11
def unpackSuggestedFeatures(suggestions):
    res = []
    if len(suggestions) > 0:
        suggested_features = suggestions.split(',')

        for feat in suggested_features:
            if '::' in feat:
                field_name, field_value = feat.split('::')
            else:
                field_name, field_value = feat, ''

            res.append({
                'id': id(),
                'featureName': field_name,
                'featureValue': field_value,
                'weight': 1.
            })

    return res
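
The input format, inferred from the parser, is a comma-separated list of feature names, each optionally carrying a value after '::':

unpackSuggestedFeatures('topology.oilGas::gas,cause')
# -> [{'id': ..., 'featureName': 'topology.oilGas', 'featureValue': 'gas', 'weight': 1.0},
#     {'id': ..., 'featureName': 'cause', 'featureValue': '', 'weight': 1.0}]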
Example #12
def batchClassificationResultToRiskProfile(batch_classification_result,
                                           profile_id):
    max_entr = -1
    for cls_sum in batch_classification_result['classSummaries']:
        max_entr = max(max_entr, max(cls_sum['entropies']))

    risk_scores = []
    risk_buckets = []
    for class_summary in batch_classification_result['classSummaries']:
        severity, likelihood = class_summary['label'].split()
        risks = []
        for res in class_summary['results']:
            res['entropy'] /= max_entr
            classifiedRisk = classificationResultToClassifiedRisk(res)
            risks.append(classifiedRisk)
            risk_scores.append(classifiedRisk['score'])
        bucket = {
            'id': id(),
            'severity': severity,
            'likelihood': likelihood,
            'numberOfRisks': class_summary['numInstances'],
            'averageConfidenceLevel':
                np.average(class_summary['entropies']) / max_entr,
            'numberOfLowConfidenceRisks': len([
                entropy for entropy in class_summary['entropies']
                if is_low_confidence(entropy / max_entr)
            ]),
            'risks': risks
        }
        risk_buckets.append(bucket)

    return {
        'id': profile_id,
        'compoundRisk': np.average(risk_scores),
        'riskBuckets': risk_buckets
    }
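
Here np is assumed to be NumPy, and is_low_confidence an externally defined threshold test on the normalized entropy. A plausible sketch (the 0.8 cutoff is an assumption, not taken from the source):

LOW_CONFIDENCE_THRESHOLD = 0.8  # assumed cutoff

def is_low_confidence(normalized_entropy):
    # Higher normalized entropy means a less certain prediction.
    return normalized_entropy >= LOW_CONFIDENCE_THRESHOLD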
Example #13
def unpackContribs(contrib):
    res = []
    if len(contrib) > 0:
        contributors = contrib.split(';')

        for contributor in contributors:
            assert '=' in contributor, (
                'bad contributor:-->{}<-- in "{}"'.format(contributor, contrib))
            feat, weight = contributor.split('=')
            if '::' in feat:
                field_name, field_value = feat.split('::')
            else:
                field_name, field_value = feat, ''

            res.append({
                'id': id(),
                'featureName': field_name,
                'featureValue': field_value,
                'weight': float(weight)
            })

    return res
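
The input format, inferred from the assert and the splits, is a semicolon-separated list of feature=weight entries, where the feature part may itself be field::value:

unpackContribs('topology.oilGas::gas=0.4;title=0.1')
# -> [{'id': ..., 'featureName': 'topology.oilGas', 'featureValue': 'gas', 'weight': 0.4},
#     {'id': ..., 'featureName': 'title', 'featureValue': '', 'weight': 0.1}]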
Example #14
def classificationResultToClassifiedRisk(classification_result):
    input_data = classification_result['dataInstance']['dataset']['features']
    risk = {}
    topology = {}
    discipline = {}
    for feat_data in input_data:
        name = feat_data['feature']['name']
        if name in ['id', 'title', 'description', 'cause', 'consequence']:
            risk[name] = feat_data['data'][0]['text']
        elif name.startswith('topology.'):
            topology[name.replace('topology.', '')] = feat_data['data'][0]['text']
        elif name.startswith('discipline.'):
            discipline[name.replace('discipline.', '')] = feat_data['data'][0]['text']
    risk['topology'] = topology
    risk['discipline'] = discipline

    severity, likelihood = classification_result['predictedLabel']['label'].split()
    entropy = classification_result['entropy']
    classified_risk = {
        'id': id(),
        'risk': risk,
        'severity': severity,
        'likelihood': likelihood,
        'confidenceLevel': entropy,
        'lowConfidence': is_low_confidence(entropy),
        'score': calculate_score(severity, likelihood),
        'contributors': classification_result['contributors'],
        'recommends': classification_result['recommends']
    }

    return classified_risk
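
calculate_score(severity, likelihood) is not shown in these examples; it presumably maps the two labels to a numeric risk score. A hypothetical risk-matrix sketch (the rank names and scale are assumptions):

SEVERITY_RANK = {'Low': 1, 'Medium': 2, 'High': 3}
LIKELIHOOD_RANK = {'Rare': 1, 'Possible': 2, 'Likely': 3}

def calculate_score(severity, likelihood):
    # Classic risk matrix: score = severity rank x likelihood rank.
    return SEVERITY_RANK.get(severity, 0) * LIKELIHOOD_RANK.get(likelihood, 0)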
Example #15
def random_train_group_feature():
    fIndex = rd.randint(0, 100)
    fName = 'TRAIN_GROUP'
    fType = 'NUMERICAL'

    return {'id': id(), 'index': fIndex, 'name': fName, 'type': fType}
Example #16
def random_batch_classification_results():
    dataset = random_dataset()
    num_classes = rd.randint(2, 5)
    probabilities = [1.0 / num_classes] * num_classes
    classes = ["Class " + str(idx + 1) for idx in range(num_classes)]

    allPredictedLabels = [{
        'id': id(),
        'label': lbl,
        'probability': prob
    } for (lbl, prob) in zip(classes, probabilities)]

    class_summaries = []
    for clsidx in range(num_classes):
        num_instances = rd.randint(3, 10)
        results = []
        for instidx in range(num_instances):
            data_idx = rd.randint(0, len(dataset['features'][0]['data']) - 1)
            input_data = []
            for feat in dataset['features']:
                input_data.append({
                    'id': id(),
                    'feature': feat['feature'],
                    'data': [feat['data'][data_idx]]
                })
            data_instance = {'id': id(), 'features': input_data}
            results.append({
                'id': id(),
                'dataInstance': {
                    'id': id(),
                    'dataset': data_instance,
                    'index': instidx
                },
                'allLabels': allPredictedLabels,
                'predictedLabel': allPredictedLabels[clsidx],
                'entropy': rd.random(),
                'contributors': [{
                    'id': id(),
                    'featureName': 'topology',
                    'featureValue': 'topsides',
                    'weight': .68
                }],
                'recommends': [{
                    'id': id(),
                    'featureName': 'topology',
                    'featureValue': 'subsea',
                    'weight': .86
                }]
            })
        class_summaries.append({
            'id': id(),
            'label': classes[clsidx],
            'numInstances': num_instances,
            'probabilities': [1.0 / num_classes] * num_instances,
            # One fresh entropy per instance ([rd.random()] * n repeats a single draw).
            'entropies': [rd.random() for _ in range(num_instances)],
            'results': results
        })

    return {'id': id(), 'classSummaries': class_summaries}
Example #17
def random_doc_to_vector():
    return {
        'id': id(),
        'modelFile': 'fullpathText2VecBinaryFileName',
        'maxNumWords': rd.randint(1000, 10000)
    }
Example #18
def classify(cachedModelID, data):
    startedTime = datetime.datetime.now()
    assert cachedModelID in cachedMSR, "Model not found."
    model = cachedMSR[cachedModelID]['selectedModel']

    emptyResults = {
        'id': -1,
        'classSummaries': []
    }

    # debug
    print('Received a dataset with', len(data['features']), 'features to classify.')
    if len(data['features']) == 0:
        print('There are no features; an empty result set is returned.')
        return emptyResults
    print('Received a dataset with', len(data['features'][0]['data']), 'rows to classify.')
    if len(data['features'][0]['data']) == 0:
        print('There is no data; an empty result set is returned.')
        return emptyResults

    candidate = model["candidate"]
    features = candidate["features"]
    config = candidate["config"]

    unlabeled_df = datasetToDataframe(data)
    filtered_input_df = unlabeled_df.filter([f['name'] for f in features])

    lr, fm, lm = loadTrainedModel(model)

    ac = Classifier(model_configuration=config)
    ac.load_models(lr, fm, lm)

    res_df = ac.predict_explain(input_df=filtered_input_df, topN_features=10)
    recomm_df = ac.input_qlty(input_df=filtered_input_df, topN=10)
    res_df = pd.concat([res_df, recomm_df.filter(["SuggestedFeatures"])], axis=1)

    plCountSeries = res_df.groupby('PredictedLabel').PredictedLabel.count()
    labels = list(plCountSeries.keys())

    classSummaries = []

    for label in labels:
        filtered_res_df = res_df[res_df.PredictedLabel == label]
        entropies = []
        probabilities = []
        results = []
        for data_index, row in filtered_res_df.iterrows():
            entropies.append(float(row.Entropy))
            probsDict, allLabels = unpackProbs(row.Probabilities)
            probabilities.append(float(probsDict[label]))
            contributors = unpackContribs(row.TopContributors)
            recommends = unpackSuggestedFeatures(row.SuggestedFeatures)

            input_data = []
            for feat in data['features']:
                input_data.append({'id': id(), 'feature': feat['feature'], 'data': [feat['data'][data_index]]})
            data_instance = {
                'id': id(),
                'dataset': {'id': id(), 'features': input_data},
                'index': data_index
            }

            classificationResult = {
                'id': id(),
                'allLabels': allLabels,
                'entropy': float(row.Entropy),
                'contributors': contributors,
                'dataInstance': data_instance,
                'predictedLabel': {
                    'id': id(),
                    'label': label,
                    'probability': float(probsDict[label])
                },
                'recommends': recommends
            }

            results.append(classificationResult)
        
        classSummary = {
            'id': id(),
            'label': label,
            'numInstances': int(plCountSeries[label]),
            'probabilities': probabilities,
            'entropies': entropies,
            'results': results
        }

        classSummaries.append(classSummary)

    batchClassificationResult = {
        'id': id(),
        "classSummaries": classSummaries
    }

    print('Classification time: ' + str((datetime.datetime.now() - startedTime).total_seconds()) + ' seconds')

    return batchClassificationResult
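
classify relies on several module-level names not shown in these examples: cachedMSR (a dict of cached model-selection results), datasetToDataframe, loadTrainedModel, the Classifier class, and the pd (pandas) and datetime imports, plus the unpack helpers from Examples #10, #11, and #13. A hypothetical call, assuming a model was cached under 'model-42':

result = classify('model-42', random_dataset())
for summary in result['classSummaries']:
    print(summary['label'], summary['numInstances'])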
Example #19
def random_label_encoder(num_classes):
    return {
        'id': id(),
        'labels': ["Label " + str(n + 1) for n in range(num_classes)]
    }
Example #20
def random_multilabel_binarizer(num_classes):
    return {
        'id': id(),
        'labels': ["Label " + str(n + 1) for n in range(num_classes)]
    }
Example #21
def random_noop():
    return {'id': id()}
Example #22
def flattenRiskToDataset(risks):

    features = []
    field_datas = [[] for _ in range(12)]

    # One flattened feature per risk field, in index order.
    field_specs = [
        ('id', 'TEXT'),
        ('title', 'TEXT'),
        ('description', 'TEXT'),
        ('cause', 'TEXT'),
        ('consequence', 'TEXT'),
        ('topology.id', 'CATEGORICAL'),
        ('topology.onshoreOffshore', 'CATEGORICAL'),
        ('topology.upstreamDownstream', 'CATEGORICAL'),
        ('topology.oilGas', 'CATEGORICAL'),
        ('topology.facilityType', 'CATEGORICAL'),
        ('discipline.id', 'CATEGORICAL'),
        ('discipline.name', 'CATEGORICAL'),
    ]
    for index, (name, ftype) in enumerate(field_specs):
        features.append({
            'id': id(),
            'feature': {
                'id': id(),
                'index': index,
                'name': name,
                'type': ftype
            },
            'data': field_datas[index]
        })

    for risk in risks:
        # Field values in the same index order as field_specs above.
        values = [
            risk['id'], risk['title'], risk['description'], risk['cause'],
            risk['consequence'], risk['topology']['id'],
            risk['topology']['onshoreOffshore'],
            risk['topology']['upstreamDownstream'],
            risk['topology']['oilGas'], risk['topology']['facilityType'],
            risk['discipline']['id'], risk['discipline']['name']
        ]
        for index, value in enumerate(values):
            field_datas[index].append({'id': id(), 'text': value})

    return {'id': iris_unique_id(), 'features': features}
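
A usage sketch with one hand-built risk record (field names taken from the function body; values are illustrative):

risk = {
    'id': 'R-1', 'title': 'Pump failure', 'description': 'Seal degradation',
    'cause': 'Wear', 'consequence': 'Unplanned shutdown',
    'topology': {'id': 'T-1', 'onshoreOffshore': 'offshore',
                 'upstreamDownstream': 'upstream', 'oilGas': 'oil',
                 'facilityType': 'platform'},
    'discipline': {'id': 'D-1', 'name': 'mechanical'}
}
dataset = flattenRiskToDataset([risk])
# dataset['features'][1]['data'][0]['text'] -> 'Pump failure'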
Example #23
def random_labeldata(num_classes):
    dval = "class " + str(rd.randint(1, num_classes))
    return {'id': id(), 'text': dval}
Example #24
    @classmethod
    def setUpClass(cls):
        cls.features = [{
            "feature": {
                "index": 0,
                "name": "feature 0",
                "type": "TEXT"
            },
            "data": [{
                "text": "Hello"
            }, {
                "text": "Hello"
            }]
        }, {
            "feature": {
                "index": 1,
                "name": "feature 1",
                "type": "NUMERICAL"
            },
            "data": [{
                "numerical": 1.2
            }, {
                "numerical": 2.5
            }]
        }, {
            "feature": {
                "index": 2,
                "name": "feature 2",
                "type": "SET"
            },
            "data": [{
                "set": ["a", "b"]
            }, {
                "set": ["d", "e"]
            }]
        }]

        cls.ds = {"features": cls.features}

        cls.labeled_ds = {
            'id': id(),
            "data": {
                'id': id(),
                "features": cls.features
            },
            "label": {
                'id': id(),
                "feature": {
                    'id': id(),
                    "index": 0,
                    "name": "label feature",
                    "type": "LABEL"
                },
                "data": [{
                    'id': id(),
                    "text": "value 1"
                }, {
                    'id': id(),
                    "text": "value 2"
                }]
            }
        }

        cls.df_dict = {
            "feature 0": ["Hello", "Hello"],
            "feature 1": [1.2, 2.5],
            "feature 2": [["a", "b"], ["d", "e"]]
        }

        cls.labeled_df_dict = {
            **cls.df_dict,
            "label feature": ["value 1", "value 2"],
        }
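
The indentation shows setUpClass is defined inside a test class; a minimal sketch of the assumed enclosure (the class name is hypothetical):

import unittest

class DatasetConversionTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        ...  # body as in Example #24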