def random_feature():
    """Build a random feature descriptor: index in 0..100, a unique-ish name, random type."""
    types = ['NUMERICAL', 'CATEGORICAL', 'TEXT', 'SET', 'BOOLEAN']
    index = rd.randint(0, 100)
    # id() presumably returns a string id; the first 8 characters keep names short.
    name = 'feature' + str(index) + id()[:8]
    # randint is inclusive on both ends, so every entry of `types` can be picked.
    ftype = types[rd.randint(0, len(types) - 1)]
    return {'id': id(), 'index': index, 'name': name, 'type': ftype}
def random_labeled_dataset(num_classes=None):
    """Build a random dataset plus a parallel LABEL column.

    Args:
        num_classes: number of distinct label classes; when None, a fresh
            random value in 6..8 is drawn per call.

    Returns:
        dict with 'id', 'data' (the dataset) and 'label' (the label column).

    Bug fix: the original default was ``num_classes=rd.randint(6, 8)``, which
    Python evaluates ONCE at definition time — every call without an argument
    got the same "random" class count. A ``None`` sentinel restores per-call
    randomness while staying backward compatible for callers that pass a value.
    """
    if num_classes is None:
        num_classes = rd.randint(6, 8)
    dataset = random_dataset()
    num_features = len(dataset['features'])
    # The label column is appended after the existing features, hence its index.
    label = {
        'id': id(),
        'index': num_features,
        'name': "Label",
        'type': "LABEL"
    }
    # One label value per row of the dataset.
    num_entries = len(dataset['features'][0]['data'])
    data = [random_labeldata(num_classes=num_classes) for _ in range(num_entries)]
    feature_data = {'id': id(), 'feature': label, 'data': data}  # Must be given unique ID.
    return {'id': id(), 'data': dataset, 'label': feature_data}
def random_min_max_scaler():
    """Fabricate a min-max scaler state with random parameters."""
    scaler = {}
    scaler['id'] = id()
    scaler['minValue'] = rd.random()
    # randint(1, 10) guarantees the max is strictly above the [0, 1) min.
    scaler['maxValue'] = rd.randint(1, 10) + rd.random()
    scaler['scale'] = rd.random()
    scaler['dataMin'] = rd.random()
    scaler['dataMax'] = rd.randint(1, 10) + rd.random()
    return scaler
def random_dataset():
    """Assemble a random dataset: one TRAIN_GROUP column plus 10-20 random feature columns,
    each holding 100-200 rows."""
    n_features = rd.randint(10, 20)
    n_rows = rd.randint(100, 200)
    columns = []
    # The TRAIN_GROUP column always comes first; its values are small group ids.
    group_feature = random_train_group_feature()
    group_values = [{'id': id(), 'numerical': rd.randint(1, 5)} for _ in range(n_rows)]
    columns.append({'id': id(), 'feature': group_feature, 'data': group_values})
    # Then the randomly-typed feature columns, with cells matching each type.
    for _ in range(n_features):
        feat = random_feature()
        values = [random_dataentry(feat['type']) for _ in range(n_rows)]
        columns.append({'id': id(), 'feature': feat, 'data': values})
    return {'id': id(), 'features': columns}
def random_tfidf_vectorizer():
    """Random TF-IDF vectorizer state: a term -> feature-index vocabulary plus idf weights."""
    n_terms = rd.randint(100, 600)
    vocab = []
    idf_weights = []
    for term_idx in range(n_terms):
        vocab.append({
            'id': id(),
            'term': "term" + str(term_idx),
            # Terms are mapped into the feature space starting at index 100.
            'featureIdx': 100 + term_idx
        })
        idf_weights.append(rd.random())
    return {
        'id': id(),
        'vocab': vocab,
        'idf': idf_weights,
        'stopwords': ['the', 'this', 'a', 'an', 'those', 'these', 'at', 'on']
    }
def flattenLabeledRiskToDataset(labeled_risks):
    """Turn labeled risk records into a labeled dataset.

    Each label is the string "<severity> <likelihood>"; the unlabeled part is
    produced by flattenRiskToDataset.
    """
    risks = []
    labels = []
    for labeled in labeled_risks:
        risks.append(labeled['risk'])
        labels.append({
            'id': id(),
            'text': labeled['severity'] + ' ' + labeled['likelihood']
        })
    label_feature = {
        'id': id(),
        'feature': {
            'id': id(),
            'index': 12,  # the label column sits after the 12 risk feature columns
            'name': 'severity likelihood',
            'type': "LABEL"
        },
        'data': labels
    }
    dataset = flattenRiskToDataset(risks)
    return {'id': iris_unique_id(), 'data': dataset, 'label': label_feature}
def random_class_weights(num_features, labels):
    """Random per-class weight vectors: one weight record per feature for each label."""
    class_weights = []
    for label in labels:
        intercept = rd.randint(0, 100) * 1.0 / 100
        per_feature = []
        for _ in range(num_features):
            n_weights = rd.randint(1, 10)
            per_feature.append({
                'id': id(),
                'feature': random_feature(),
                # Every weight in the record shares one random value.
                'weights': [rd.randint(0, 100) * 1.0 / 100] * n_weights
            })
        class_weights.append({
            'id': id(),
            'weights': per_feature,
            'class': label,
            'intercept': intercept
        })
    return class_weights
def random_dataentry(ftype):
    """Produce one random data cell whose payload matches the feature type.

    CATEGORICAL/TEXT -> {'text': ...}, NUMERICAL/BOOLEAN -> {'numerical': ...},
    anything else falls through to SET -> {'set': [...]}.
    """
    if ftype == "CATEGORICAL":
        n_categories = rd.randint(6, 22)
        text = "string " + str(rd.randint(1, n_categories))
        return {'id': id(), 'text': text}
    if ftype == "TEXT":
        # 50-300 words drawn (with replacement) from the global word list.
        picked = [words[rd.randint(0, len(words) - 1)]
                  for _ in range(rd.randint(50, 300))]
        return {'id': id(), 'text': " ".join(picked)}
    if ftype == "NUMERICAL":
        value = rd.randint(0, 100) * 1.0 / 100
        return {'id': id(), 'numerical': value}
    if ftype == "BOOLEAN":
        flag = rd.randint(0, 1)
        return {'id': id(), 'numerical': flag}
    # Fallback: SET — 1-3 members drawn from a small random category pool.
    n_categories = rd.randint(3, 10)
    members = ["string " + str(rd.randint(1, n_categories))
               for _ in range(rd.randint(1, 3))]
    return {'id': id(), 'set': members}
def random_model_performance(num_classes):
    """Fabricate a random multi-class model performance report.

    For each of *num_classes* labels, builds a confusion-matrix row
    ('classifiedAs' buckets) with random instance counts plus random
    recall/precision/f1, then wraps everything with random aggregate metrics.
    """
    per_class = []
    for class_idx in range(num_classes):
        true_label = "Label " + str(class_idx + 1)
        cells = []
        for predicted_idx in range(num_classes):
            count = rd.randint(20, 80)
            cells.append({
                'id': id(),
                'trueLabel': true_label,
                'predictedLabel': "Label " + str(predicted_idx + 1),
                'numInstances': count,
                'weight': rd.random(),
            })
        per_class.append({
            'id': id(),
            'label': true_label,
            'weight': rd.random(),
            'numInstances': sum(cell['numInstances'] for cell in cells),
            'classifiedAs': cells,
            'recall': rd.random(),
            'precision': rd.random(),
            'f1': rd.random()
        })
    return {
        'id': id(),
        'classPerformances': per_class,
        # NOTE(review): this top-level count is drawn independently of the
        # per-class sums above, mirroring the original behaviour.
        'numInstances': rd.randint(50, 100),
        'avgRecall': rd.random(),
        'avgPrecision': rd.random(),
        'avgF1': rd.random()
    }
def unpackProbs(prob):
    """Parse a "label:prob,label:prob,..." string.

    Returns a (dict, list) pair: probabilities keyed by label, and a list of
    predicted-label records (one dict per label, each with a fresh id).
    """
    probs_by_label = {}
    label_records = []
    for pair in prob.split(','):
        label, raw = pair.split(':')
        probability = float(raw)
        probs_by_label[label] = probability
        label_records.append({
            'id': id(),
            'label': label,
            'probability': probability
        })
    return probs_by_label, label_records
def unpackSuggestedFeatures(suggestions):
    """Parse a comma-separated list of suggested features.

    Each token is either "name::value" or a bare "name" (empty value).
    Every suggestion gets a fixed weight of 1.0. Empty input yields [].
    """
    parsed = []
    if len(suggestions) > 0:
        for token in suggestions.split(','):
            if '::' in token:
                name, value = token.split('::')
            else:
                name, value = token, ''
            parsed.append({
                'id': id(),
                'featureName': name,
                'featureValue': value,
                'weight': 1.
            })
    return parsed
def batchClassificationResultToRiskProfile(batch_classification_result, profile_id):
    """Convert a batch classification result into a risk profile keyed by *profile_id*.

    Entropies are normalised by the global maximum entropy across all class
    summaries before being interpreted as confidence levels.

    NOTE(review): mutates the input — each result's 'entropy' is divided in
    place by the global maximum. Also, if every entropy is 0 the division
    below raises ZeroDivisionError — confirm inputs always carry a positive
    entropy somewhere.
    """
    # Largest entropy over every class summary; used as the normalisation
    # denominator throughout.
    max_entr = -1
    for cls_sum in batch_classification_result['classSummaries']:
        max_entr = max(max_entr, max(cls_sum['entropies']))
    risk_scores = []
    risk_buckets = []
    for class_summary in batch_classification_result['classSummaries']:
        # Labels look like "<severity> <likelihood>" (whitespace separated).
        severity, likelihood = class_summary['label'].split()
        risks = []
        for res in class_summary['results']:
            # Normalise in place so classificationResultToClassifiedRisk sees
            # the scaled entropy.
            res['entropy'] /= max_entr
            classifiedRisk = classificationResultToClassifiedRisk(res)
            risks.append(classifiedRisk)
            risk_scores.append(classifiedRisk['score'])
        bucket = {
            'id': id(),
            'severity': severity,
            'likelihood': likelihood,
            'numberOfRisks': class_summary['numInstances'],
            # class_summary['entropies'] is still un-normalised, hence / max_entr.
            'averageConfidenceLevel': np.average(class_summary['entropies']) / max_entr,
            'numberOfLowConfidenceRisks': len([
                entropy for entropy in class_summary['entropies']
                if is_low_confidence(entropy / max_entr)
            ]),
            'risks': risks
        }
        risk_buckets.append(bucket)
    return {
        'id': profile_id,
        'compoundRisk': np.average(risk_scores),
        'riskBuckets': risk_buckets
    }
def unpackContribs(contrib):
    """Parse a ';'-separated list of "feature=weight" contributor tokens.

    Each feature is either "name::value" or a bare "name" (empty value).
    Empty input yields [].

    Raises:
        ValueError: if a contributor token does not contain '='.

    Fix: the original validated each token with ``assert``, which is stripped
    under ``python -O`` — the check is real input validation, so raise instead.
    """
    parsed = []
    if len(contrib) > 0:
        for contributor in contrib.split(';'):
            if '=' not in contributor:
                raise ValueError("bad contributor:" + '-->' + contributor +
                                 '<--' + ' in ' + '"' + contrib + '"')
            feat, weight = contributor.split('=')
            if '::' in feat:
                field_name, field_value = feat.split('::')
            else:
                field_name, field_value = feat, ''
            parsed.append({
                'id': id(),
                'featureName': field_name,
                'featureValue': field_value,
                'weight': float(weight)
            })
    return parsed
def classificationResultToClassifiedRisk(classification_result):
    """Reshape one classification result into a classified-risk record.

    Rebuilds the original risk fields from the instance's feature columns
    (direct fields, plus nested 'topology.*' and 'discipline.*' groups), then
    attaches the predicted severity/likelihood, confidence and contributors.
    """
    columns = classification_result['dataInstance']['dataset']['features']
    direct_fields = ['id', 'title', 'description', 'cause', 'consequence']
    risk = {}
    topology = {}
    discipline = {}
    for column in columns:
        fname = column['feature']['name']
        if fname in direct_fields:
            risk[fname] = column['data'][0]['text']
        elif fname.startswith('topology.'):
            topology[fname.replace('topology.', '')] = column['data'][0]['text']
        elif fname.startswith('discipline.'):
            discipline[fname.replace('discipline.', '')] = column['data'][0]['text']
    risk['topology'] = topology
    risk['discipline'] = discipline
    # Predicted label is "<severity> <likelihood>".
    severity, likelihood = classification_result['predictedLabel']['label'].split()
    entropy = classification_result['entropy']
    return {
        'id': id(),
        'risk': risk,
        'severity': severity,
        'likelihood': likelihood,
        'confidenceLevel': entropy,
        'lowConfidence': is_low_confidence(entropy),
        'score': calculate_score(severity, likelihood),
        'contributors': classification_result['contributors'],
        'recommends': classification_result['recommends']
    }
def random_train_group_feature():
    """Descriptor for the synthetic TRAIN_GROUP numerical feature (random index)."""
    index = rd.randint(0, 100)
    return {
        'id': id(),
        'index': index,
        'name': 'TRAIN_GROUP',
        'type': 'NUMERICAL'
    }
def random_batch_classification_results():
    """Build a random batch-classification result over a random dataset.

    Produces 2-5 classes with uniform probabilities; each class summary holds
    3-10 results, each wrapping one randomly-picked dataset row plus fixed
    example contributor/recommend records.
    """
    dataset = random_dataset()
    num_classes = rd.randint(2, 5)
    # Uniform probability over all classes.
    probabilities = [1.0 / num_classes] * num_classes
    classes = ["Class " + str(idx + 1) for idx in range(num_classes)]
    allPredictedLabels = [{
        'id': id(),
        'label': lbl,
        'probability': prob
    } for (lbl, prob) in zip(classes, probabilities)]
    class_summaries = []
    for clsidx in range(num_classes):
        num_instances = rd.randint(3, 10)
        results = []
        for instidx in range(num_instances):
            # Pick a random dataset row to serve as this instance's input.
            data_idx = rd.randint(0, len(dataset['features'][0]['data']) - 1)
            input_data = []
            for feat in dataset['features']:
                input_data.append({
                    'id': id(),
                    'feature': feat['feature'],
                    'data': [feat['data'][data_idx]]
                })
            data_instance = {'id': id(), 'features': input_data}
            results.append({
                'id': id(),
                'dataInstance': {
                    'id': id(),
                    'dataset': data_instance,
                    'index': instidx
                },
                'allLabels': allPredictedLabels,
                # Instances in summary clsidx are "predicted" as class clsidx.
                'predictedLabel': allPredictedLabels[clsidx],
                'entropy': rd.random(),
                'contributors': [{
                    'id': id(),
                    'featureName': 'topology',
                    'featureValue': 'topsides',
                    'weight': .68
                }],
                'recommends': [{
                    'id': id(),
                    'featureName': 'topology',
                    'featureValue': 'subsea',
                    'weight': .86
                }]
            })
        class_summaries.append({
            'id': id(),
            'label': classes[clsidx],
            'numInstances': num_instances,
            'probabilities': [1.0 / num_classes] * num_instances,
            # One shared random entropy repeated per instance.
            'entropies': [rd.random()] * num_instances,
            'results': results
        })
    return {'id': id(), 'classSummaries': class_summaries}
def random_doc_to_vector():
    """Random doc-to-vector transformer stub with a placeholder model path."""
    transformer = {'id': id()}
    transformer['modelFile'] = 'fullpathText2VecBinaryFileName'
    transformer['maxNumWords'] = rd.randint(1000, 10000)
    return transformer
def classify(cachedModelID, data):
    """Classify *data* with the cached model *cachedModelID* and return a
    batch-classification result grouped by predicted label.

    Returns an empty result ({'id': -1, 'classSummaries': []}) when the input
    has no features or no rows.

    NOTE(review): the model-existence check uses ``assert``, which is stripped
    under ``python -O`` — confirm callers guarantee the id is cached.
    """
    startedTime = datetime.datetime.now()
    assert(cachedModelID in cachedMSR), "Model not found."
    model = cachedMSR[cachedModelID]['selectedModel']
    emptyResults = {
        'id': -1,
        'classSummaries': []
    }
    #debug
    print('Received a dataset with ', len(data['features']), ' features to classify.')
    if (len(data['features']) == 0):
        print('There is no feature, empty result set is returned.')
        return emptyResults
    print('Received a dataset with ', len(data['features'][0]['data']), ' rows to classify.')
    if (len(data['features'][0]['data']) == 0):
        print('There is no data, empty result set is returned.')
        return emptyResults
    candidate = model["candidate"]
    features = candidate["features"]
    config = candidate["config"]
    # Keep only the columns the trained candidate model was built on.
    unlabeled_df = datasetToDataframe(data)
    filtered_input_df = unlabeled_df.filter([f['name'] for f in features])
    lr, fm, lm = loadTrainedModel(model)
    ac = Classifier(model_configuration=config)
    ac.load_models(lr, fm, lm)
    # Predictions with explanations, plus input-quality suggestions, merged
    # column-wise so each row carries its own SuggestedFeatures.
    res_df = ac.predict_explain(input_df=filtered_input_df, topN_features=10)
    reccom_df = ac.input_qlty(input_df=filtered_input_df, topN=10)
    res_df = pd.concat([res_df, reccom_df.filter(["SuggestedFeatures"])], axis=1)
    # Per-label instance counts; also fixes the set of labels to iterate.
    plCountSeries = res_df.groupby('PredictedLabel').PredictedLabel.count()
    labels = list(plCountSeries.keys())
    classSummaries = []
    for label in labels:
        filtered_res_df = res_df[res_df.PredictedLabel == label]
        entropies = []
        probabilities = []
        results = []
        for data_index, row in filtered_res_df.iterrows():
            entropies.append(float(row.Entropy))
            # Row columns used here: Entropy, Probabilities, TopContributors,
            # SuggestedFeatures (string-encoded; unpacked by the helpers).
            probsDict, allLabels = unpackProbs(row.Probabilities)
            probabilities.append(float(probsDict[label]))
            contributors = unpackContribs(row.TopContributors)
            recommends = unpackSuggestedFeatures(row.SuggestedFeatures)
            # Rebuild the single-row input instance from the original dataset
            # so the result is self-describing.
            input_data = []
            for feat in data['features']:
                input_data.append({
                    'id': id(),
                    'feature': feat['feature'],
                    'data': [feat['data'][data_index]]
                })
            data_instance = {
                'id': id(),
                'dataset': {
                    'id': id(),
                    'features': input_data
                },
                'index': data_index
            }
            classificationResult = {
                'id': id(),
                'allLabels': allLabels,
                'entropy': float(row.Entropy),
                'contributors': contributors,
                'dataInstance': data_instance,
                'predictedLabel': {
                    'id': id(),
                    'label': label,
                    'probability': float(probsDict[label])
                },
                'recommends': recommends
            }
            results.append(classificationResult)
        classSumary = {
            'id': id(),
            'label': label,
            'numInstances': int(plCountSeries[label]),
            'probabilities': probabilities,
            'entropies': entropies,
            'results': results
        }
        classSummaries.append(classSumary)
    batchClassificationResult = {
        'id': id(),
        "classSummaries": classSummaries
    }
    print('Classification time:' + str((datetime.datetime.now() - startedTime).total_seconds()) + ' seconds ')
    return batchClassificationResult
def random_label_encoder(num_classes):
    """Random label-encoder state: class names "Label 1" .. "Label <num_classes>"."""
    encoder_id = id()
    names = []
    for i in range(num_classes):
        names.append("Label " + str(i + 1))
    return {'id': encoder_id, 'labels': names}
def random_multilabel_binarizer(num_classes):
    """Random multilabel-binarizer state; same label naming as the label encoder."""
    labels = []
    label_no = 1
    while label_no <= num_classes:
        labels.append("Label " + str(label_no))
        label_no += 1
    return {'id': id(), 'labels': labels}
def random_noop():
    """No-op transformer stub: carries only a unique id."""
    noop = {'id': id()}
    return noop
# Column schema for flattened risks: (feature name, feature type, key path
# into a risk record). Column order defines the feature index.
_RISK_FIELDS = [
    ('id', 'TEXT', ('id',)),
    ('title', 'TEXT', ('title',)),
    ('description', 'TEXT', ('description',)),
    ('cause', 'TEXT', ('cause',)),
    ('consequence', 'TEXT', ('consequence',)),
    ('topology.id', 'CATEGORICAL', ('topology', 'id')),
    ('topology.onshoreOffshore', 'CATEGORICAL', ('topology', 'onshoreOffshore')),
    ('topology.upstreamDownstream', 'CATEGORICAL', ('topology', 'upstreamDownstream')),
    ('topology.oilGas', 'CATEGORICAL', ('topology', 'oilGas')),
    ('topology.facilityType', 'CATEGORICAL', ('topology', 'facilityType')),
    ('discipline.id', 'CATEGORICAL', ('discipline', 'id')),
    ('discipline.name', 'CATEGORICAL', ('discipline', 'name')),
]


def flattenRiskToDataset(risks):
    """Flatten risk records into a columnar dataset.

    One feature column per entry of ``_RISK_FIELDS`` (same names, types and
    indices as before); each risk contributes one text cell per column.

    Improvement: the original hand-wrote twelve near-identical column
    builders and twelve per-risk append statements; they are now driven from
    a single schema table, with identical output.
    """
    features = []
    columns = []
    for index, (name, ftype, _) in enumerate(_RISK_FIELDS):
        cells = []
        columns.append(cells)
        features.append({
            'id': id(),
            'feature': {
                'id': id(),
                'index': index,
                'name': name,
                'type': ftype
            },
            'data': cells
        })
    for risk in risks:
        for cells, (_, _, path) in zip(columns, _RISK_FIELDS):
            # Walk the key path ('topology', 'id') -> risk['topology']['id'].
            value = risk
            for key in path:
                value = value[key]
            cells.append({'id': id(), 'text': value})
    return {'id': iris_unique_id(), 'features': features}
def random_labeldata(num_classes):
    """Random class-label cell: "class K" with K drawn from 1..num_classes."""
    label_text = "class " + str(rd.randint(1, num_classes))
    return {'id': id(), 'text': label_text}
def setUpClass(cls):
    """Create shared test fixtures: a raw dataset, a labeled dataset, and the
    dict shapes they are expected to convert into.

    NOTE(review): presumably a unittest ``@classmethod setUpClass`` — the
    decorator and enclosing class are not visible in this chunk.
    """
    # Three feature columns (TEXT, NUMERICAL, SET), two rows each.
    cls.features = [{
        "feature": {
            "index": 0,
            "name": "feature 0",
            "type": "TEXT"
        },
        "data": [{
            "text": "Hello"
        }, {
            "text": "Hello"
        }]
    }, {
        "feature": {
            "index": 1,
            "name": "feature 1",
            "type": "NUMERICAL"
        },
        "data": [{
            "numerical": 1.2
        }, {
            "numerical": 2.5
        }]
    }, {
        "feature": {
            "index": 2,
            "name": "feature 2",
            "type": "SET"
        },
        "data": [{
            "set": ["a", "b"]
        }, {
            "set": ["d", "e"]
        }]
    }]
    # Unlabeled dataset wrapper.
    cls.ds = {"features": cls.features}
    # Same dataset plus a LABEL column holding one value per row.
    cls.labeled_ds = {
        'id': id(),
        "data": {
            'id': id(),
            "features": cls.features
        },
        "label": {
            'id': id(),
            "feature": {
                'id': id(),
                "index": 0,
                "name": "label feature",
                "type": "LABEL"
            },
            "data": [{
                'id': id(),
                "text": "value 1"
            }, {
                'id': id(),
                "text": "value 2"
            }]
        }
    }
    # Expected column-name -> values mapping after dataframe conversion.
    cls.df_dict = {
        "feature 0": ["Hello", "Hello"],
        "feature 1": [1.2, 2.5],
        "feature 2": [["a", "b"], ["d", "e"]]
    }
    cls.labeled_df_dict = {
        **cls.df_dict,
        "label feature": ["value 1", "value 2"],
    }