def test_kfolds_eval(self, binary_problem=False):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __labeled_binary_inp_df = self.labeled_binary_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    if binary_problem:
        res_df = ac.eval(input_df=__labeled_binary_inp_df,
                         schema=__schema_with_label,
                         mode="K_FOLDS",
                         nfolds=3)
    else:
        res_df = ac.eval(input_df=__labeled_inp_df,
                         schema=__schema_with_label,
                         mode="K_FOLDS",
                         nfolds=3)
    self.assertTrue(isinstance(res_df, pd.DataFrame))
    self.assertEqual(res_df.dtypes[0], "object")
    if binary_problem:
        self.assertEqual(len(res_df.columns),
                         max(1 + len(self.binary_labels), 5))
    else:
        self.assertEqual(len(res_df.columns), max(1 + len(self.labels), 5))

def test_numclasses(self):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    ac.train(input_df=__labeled_inp_df, schema=__schema_with_label)
    nclasses = ac.num_classes()
    self.assertEqual(nclasses, len(self.labels))

def test_predict_proba(self):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __nonlabeled_inp_df = self.nonlabeled_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    ac.train(input_df=__labeled_inp_df, schema=__schema_with_label)
    lr, fm, lm = ac.get_models()
    ac = Classifier(model_configuration=testModelConfiguration)
    ac.load_models(lr, fm, lm)
    for multilbl_pred in [True, False]:
        res_df = ac.predict_proba(input_df=__nonlabeled_inp_df,
                                  multilabel_pred=multilbl_pred)
        self.assertTrue(isinstance(res_df, pd.DataFrame))
        self.assertEqual(len(res_df.columns), len(self.fields_without_label) + 3)
        self.assertEqual(res_df.dtypes[-1], "float64")
        self.assertEqual(res_df.dtypes[-2], "object")
        self.assertEqual(res_df.dtypes[-3], "object")
        self.assertEqual(len(res_df), self.num_recs)
        if not multilbl_pred:
            # Every predicted label must be one of the training labels.
            self.assertFalse(
                any(
                    list(
                        map(lambda x: x[0] not in self.labels,
                            res_df.filter([res_df.columns[-3]]).values))))
        else:
            # Multilabel predictions are comma-separated; every label in each
            # prediction must be one of the training labels.
            list_lbls = list(
                map(lambda lbls: lbls[0].split(","),
                    res_df.filter([res_df.columns[-3]]).values))
            list_invalid_lbls = list(
                map(lambda lbls: map(lambda lbl: lbl not in self.labels, lbls),
                    list_lbls))
            self.assertFalse(any(list(map(any, list_invalid_lbls))))
        # Test that the probabilities sum to 1 within tolerance.
        prob_str = list(
            map(lambda p_str: p_str.split(','),
                res_df["Probabilities"].values))
        prob_float = list(
            map(
                lambda prob_with_label:
                [float(p.split(':')[1]) for p in prob_with_label], prob_str))
        tolerance = 0.005 * len(self.fields_without_label)
        self.assertFalse(
            any(
                list(
                    map(
                        lambda probs: sum(probs) >= 1.0 + tolerance or
                        sum(probs) <= 1.0 - tolerance, prob_float))))

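# The parsing in test_predict_proba implies the "Probabilities" column packs
# per-label scores into one string: label and probability joined by ':',
# pairs joined by ','. A minimal sketch of unpacking such a string into a
# dict; the helper name and the example labels ("spam"/"ham") are
# illustrative, not part of this codebase.
def _unpack_probabilities(prob_str):
    # e.g. "spam:0.91,ham:0.09" -> {'spam': 0.91, 'ham': 0.09}
    pairs = (p.split(':') for p in prob_str.split(','))
    return {label: float(prob) for label, prob in pairs}
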
def test_training(self):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    ac.train(input_df=__labeled_inp_df, schema=__schema_with_label)
    lr, fm, lm = ac.get_models()
    self.assertTrue(isinstance(lr, (LR, LSVC, Ensemble)))
    self.assertTrue(isinstance(fm, Featurizer))
    self.assertTrue(isinstance(lm, Featurizer))

def top_rfe_features(labeled_dataset, config, topN=None):
    labeled_inp_df = datasetToDataframe(labeled_dataset)
    features = defaultFeatures(dataset=labeled_dataset)
    featurizers = defaultFeaturizers(features)
    stop_words = ENGLISH_STOP_WORDS if config["stopwords"] == "ENGLISH" else []
    tokenizer = BaseTokenizer() if config["tokenizer"] == "WORD_TOKENIZER" \
        else PorterTokenizer() if config["tokenizer"] == "STEMMER" \
        else LemmaTokenizer() if config["tokenizer"] == "LEMMATIZER" \
        else None
    ngram_range = (1, 1) if config["ngrams"] == "UNIGRAM" \
        else (2, 2) if config["ngrams"] == "BIGRAM" \
        else (1, 2) if config["ngrams"] == "BOTH" \
        else None
    ac = Classifier(model_configuration={
        "type": config['type'],
        "class_weight": config['weighting'].lower(),
        "tokenizer": tokenizer,
        "ngram_range": ngram_range,
        "sublinear_tf": config['tf'] == "SUBLINEAR",
        "smooth_idf": config['df'] == "SMOOTH",
        "penalty": config['penalty'].lower(),
        "multi_class": config['multiclass'].lower(),
        "solver": config['solver'].lower(),
        "dual": config['primal_dual'] == "DUAL",
        "fit_intercept": config['fitIntercept'],
        'max_df': config['max_df'],
        'min_df': config['min_df'],
        'stopwords': stop_words,
        'C': config['C'],
        'max_iter': config['max_iter']
    })
    # Rank features via backward elimination.
    res_df = ac.feature_ranking(input_df=labeled_inp_df,
                                schema=featurizers,
                                mode=Classifier.CC_fs_backward)
    # Feature names look like "field::value"; sum the scores per field and
    # sort the fields by total score, descending.
    feature_names = pd.Series(
        map(lambda fname: fname.split('::')[0], res_df['Feature']))
    feature_scores = pd.concat([feature_names, res_df['Score']], axis=1)
    feature_scores.columns = ['Feature', 'Score']
    feature_sum_scores = feature_scores.groupby('Feature').sum()
    sorted_features = feature_sum_scores.sort_values(by=["Score"],
                                                     ascending=False)
    selected_feature_names = list(sorted_features.index)[:topN]
    selected_features = []
    for fname in selected_feature_names:
        selected_features += [feat for feat in features if feat['name'] == fname]
    # The last entry of the original feature list is always kept.
    return selected_features + [features[-1]]

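# A minimal usage sketch for top_rfe_features; `my_labeled_dataset` is a
# hypothetical dataset in the format expected by datasetToDataframe, and the
# config uses the keys this function reads (see the illustrative
# example_config after create_classifier below):
#
# top5 = top_rfe_features(my_labeled_dataset, example_config, topN=5)
# print([feat['name'] for feat in top5])
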
def test_LOO_eval_table_format(self):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    res_df = ac.eval(input_df=__labeled_inp_df,
                     schema=__schema_with_label,
                     mode="LEAVE_ONE_OUT",
                     nfolds=3)
    self.assertTrue(isinstance(res_df, pd.DataFrame))
    self.assertEqual(len(res_df.columns), max(1 + len(self.labels), 5))
    self.assertEqual(res_df.dtypes[0], "object")

def test_correlation_feature_selection(self):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __schema_with_label_nonnegative = self.schema_with_label_nonnegative.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    res_df = ac.feature_ranking(input_df=__labeled_inp_df,
                                schema=__schema_with_label_nonnegative,
                                mode=Classifier.CC_fs_correlation)
    self.assertTrue(isinstance(res_df, pd.DataFrame))
    self.assertEqual(len(res_df.columns), 2)
    self.assertEqual(res_df.dtypes[0], "object")
    self.assertIn(res_df.dtypes[1], ["int64", "float64"])

def test_labels(self):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    ac.train(input_df=__labeled_inp_df, schema=__schema_with_label)
    labels = ac.labels()
    # The trained labels and the expected labels must match as sets.
    diff1 = [elem for elem in labels if elem not in self.labels]
    diff2 = [elem for elem in self.labels if elem not in labels]
    self.assertEqual(len(diff1), 0)
    self.assertEqual(len(diff2), 0)
    self.assertEqual(len(labels), len(self.labels))

def create_classifier(config):
    # NOTE: id() with no argument is assumed to be a project-local ID
    # generator that shadows the builtin (the builtin id() requires an
    # argument).
    return Classifier(model_configuration={
        'id': id(),
        "type": config["type"],
        "class_weight": None if config['weighting'].lower() == 'none'
                        else config['weighting'].lower(),
        "tokenizer": BaseTokenizer() if config["tokenizer"] == "WORD_TOKENIZER"
                     else PorterTokenizer() if config["tokenizer"] == "STEMMER"
                     else LemmaTokenizer() if config["tokenizer"] == "LEMMATIZER"
                     else None,
        "ngram_range": (1, 1) if config["ngrams"] == "UNIGRAM"
                       else (2, 2) if config["ngrams"] == "BIGRAM"
                       else (1, 2) if config["ngrams"] == "BOTH"
                       else None,
        "sublinear_tf": config["tf"] == "SUBLINEAR",
        "smooth_idf": config["df"] == "SMOOTH",
        "penalty": config['penalty'].lower(),
        "multi_class": config['multiclass'].lower(),
        "solver": config['solver'].lower(),
        "dual": config['primal_dual'] == 'DUAL',
        "fit_intercept": config['fitIntercept'],
        'max_df': config['max_df'],
        'min_df': config['min_df'],
        'stopwords': ENGLISH_STOP_WORDS if config["stopwords"] == "ENGLISH" else [],
        'C': config['C'],
        'max_iter': config['max_iter']
    })

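# An illustrative config for create_classifier (and top_rfe_features above),
# assembled from the keys both functions read. The option strings for
# tokenizer/ngrams/tf/df/stopwords/primal_dual are the ones the code checks;
# the remaining values (type, penalty, solver, etc.) are illustrative
# assumptions, not values taken from this codebase:
#
# example_config = {
#     'type': 'LR',                    # assumed model-type identifier
#     'weighting': 'BALANCED',         # lowercased into class_weight
#     'tokenizer': 'WORD_TOKENIZER',   # or 'STEMMER' / 'LEMMATIZER'
#     'ngrams': 'UNIGRAM',             # or 'BIGRAM' / 'BOTH'
#     'tf': 'SUBLINEAR',
#     'df': 'SMOOTH',
#     'penalty': 'L2',
#     'multiclass': 'OVR',
#     'solver': 'LIBLINEAR',
#     'primal_dual': 'PRIMAL',         # anything but 'DUAL' maps to dual=False
#     'fitIntercept': True,
#     'max_df': 1.0,
#     'min_df': 1,
#     'stopwords': 'ENGLISH',          # anything else means no stopword list
#     'C': 1.0,
#     'max_iter': 1000,
# }
# ac = create_classifier(example_config)
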
def test_learn(self):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __nonlabeled_inp_df = self.nonlabeled_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    ac.train(input_df=__labeled_inp_df, schema=__schema_with_label)
    lr, fm, lm = ac.get_models()
    ac = Classifier(model_configuration=testModelConfiguration)
    ac.load_models(lr, fm, lm)
    res_df = ac.learn(input_df=__nonlabeled_inp_df)
    self.assertTrue(isinstance(res_df, pd.DataFrame))
    self.assertEqual(len(res_df.columns), len(self.fields_without_label) + 3)
    self.assertEqual(res_df.dtypes[-1], "float64")

def test_backward_feature_selection(self):
    # Skipped for ensemble model types.
    if testModelConfiguration['type'] in [
            Classifier.ENSEMBLE_SVC_MODEL_TYPE,
            Classifier.ENSEMBLE_LR_MODEL_TYPE
    ]:
        return
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    res_df = ac.feature_ranking(input_df=__labeled_inp_df,
                                schema=__schema_with_label,
                                mode=Classifier.CC_fs_backward)
    self.assertTrue(isinstance(res_df, pd.DataFrame))
    self.assertEqual(len(res_df.columns), 2)
    self.assertEqual(res_df.dtypes[0], "object")
    self.assertIn(res_df.dtypes[1], ["int64", "float64"])

def test_eval_data(self, binary_problem=False):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __labeled_binary_inp_df = self.labeled_binary_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    if binary_problem:
        labels, true_lbls, pred_lbls, conf_mat, cls_report = ac.eval_data(
            input_df=__labeled_binary_inp_df,
            schema=__schema_with_label,
            mode="LEAVE_ONE_OUT",
            nfolds=3)
    else:
        labels, true_lbls, pred_lbls, conf_mat, cls_report = ac.eval_data(
            input_df=__labeled_inp_df,
            schema=__schema_with_label,
            mode="LEAVE_ONE_OUT",
            nfolds=3)
    if binary_problem:
        self.assertEqual(len(labels), 2)
    else:
        self.assertEqual(len(labels), len(self.labels))
    self.assertEqual(len(true_lbls), self.num_recs)
    self.assertEqual(len(true_lbls), len(pred_lbls))
    # The confusion matrix must be square, with one row/column per label.
    self.assertEqual(len(conf_mat), len(labels))
    self.assertEqual(len(conf_mat[0]), len(labels))
    # The classification report must cover every label plus the averages.
    ext_labels = list(labels) + ['macro avg', 'weighted avg']
    for lbl in ext_labels:
        self.assertIn(lbl, cls_report)
        self.assertIn('precision', cls_report[lbl])
        self.assertIn('recall', cls_report[lbl])
        self.assertIn('f1-score', cls_report[lbl])
        self.assertIn('support', cls_report[lbl])
    self.assertIn('accuracy', cls_report)

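# The keys asserted above appear to match scikit-learn's
# classification_report(..., output_dict=True) layout, e.g. (values
# illustrative):
#
# {
#     'labelA': {'precision': 0.9, 'recall': 0.8, 'f1-score': 0.85, 'support': 10},
#     'labelB': {...},
#     'accuracy': 0.87,
#     'macro avg': {'precision': ..., 'recall': ..., 'f1-score': ..., 'support': ...},
#     'weighted avg': {...}
# }
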
def test_model_visualization(self, binary_problem=False):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __labeled_binary_inp_df = self.labeled_binary_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    if binary_problem:
        ac.train(input_df=__labeled_binary_inp_df, schema=__schema_with_label)
    else:
        ac.train(input_df=__labeled_inp_df, schema=__schema_with_label)
    lr, fm, lm = ac.get_models()
    ac = Classifier(model_configuration=testModelConfiguration)
    ac.load_models(lr, fm, lm)
    res_df = ac.model_visualization()
    self.assertTrue(isinstance(res_df, pd.DataFrame))
    self.assertEqual(len(res_df.columns), 3)
    self.assertEqual(res_df.dtypes[-1], "float64")
    self.assertEqual(res_df.dtypes[-2], "object")
    self.assertEqual(res_df.dtypes[-3], "object")

def test_predict_explain(self, binary_problem=False):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __nonlabeled_inp_df = self.nonlabeled_inp_df.copy(deep=True)
    __labeled_binary_inp_df = self.labeled_binary_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    if binary_problem:
        ac.train(input_df=__labeled_binary_inp_df, schema=__schema_with_label)
    else:
        ac.train(input_df=__labeled_inp_df, schema=__schema_with_label)
    lr, fm, lm = ac.get_models()
    ac = Classifier(model_configuration=testModelConfiguration)
    ac.load_models(lr, fm, lm)
    res_df = ac.predict_explain(input_df=__nonlabeled_inp_df)
    self.assertTrue(isinstance(res_df, pd.DataFrame))
    self.assertEqual(len(res_df.columns), len(self.fields_without_label) + 4)
    self.assertEqual(res_df.dtypes[-1], "object")
    self.assertEqual(res_df.dtypes[-2], "float64")
    self.assertEqual(res_df.dtypes[-3], "object")
    self.assertEqual(res_df.dtypes[-4], "object")

    # Test that all top-contributing features are present in the input row.
    def chk_contributor_existence(row):
        contributors = row["TopContributors"].split(';')
        features = [contrib.split('=')[0] for contrib in contributors]
        for feat in features:
            if '::' in feat:
                field_name, field_value = feat.split('::')
                self.assertIn(field_name, self.fields_without_label)
                fld_no = list(res_df.columns).index(field_name)
                if self.schema_without_label[fld_no] in ["text", "set"]:
                    # Substring check disabled:
                    # self.assertIn(field_value, row[field_name].lower())
                    pass
                elif self.schema_without_label[fld_no] in [
                        "string", "numeric", "boolean"
                ]:
                    self.assertEqual(field_value, row[field_name])
            else:
                field_name = feat
                if len(field_name) > 0:
                    self.assertIn(field_name, self.fields_without_label)

    res_df.apply(chk_contributor_existence, axis=1)

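# The parsing in chk_contributor_existence implies "TopContributors" packs
# entries as "field::value=score" pairs joined by ';' (a bare field name
# without '::' denotes a non-text feature), e.g. (illustrative):
#   "body::refund=0.42;subject::invoice=0.31;priceField=0.12"
# The part after '=' is presumably the feature's contribution score.
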
def test_input_qlty(self, binary_problem=False):
    __labeled_inp_df = self.labeled_inp_df.copy(deep=True)
    __labeled_binary_inp_df = self.labeled_binary_inp_df.copy(deep=True)
    __nonlabeled_inp_df = self.nonlabeled_inp_df.copy(deep=True)
    __schema_with_label = self.schema_with_label.copy()
    ac = Classifier(model_configuration=testModelConfiguration)
    if binary_problem:
        ac.train(input_df=__labeled_binary_inp_df, schema=__schema_with_label)
    else:
        ac.train(input_df=__labeled_inp_df, schema=__schema_with_label)
    lr, fm, lm = ac.get_models()
    ac = Classifier(model_configuration=testModelConfiguration)
    ac.load_models(lr, fm, lm)
    res_df = ac.input_qlty(input_df=__nonlabeled_inp_df)
    self.assertTrue(isinstance(res_df, pd.DataFrame))
    self.assertEqual(len(res_df.columns), len(self.fields_without_label) + 2)
    self.assertEqual(res_df.dtypes[-1], "object")
    self.assertEqual(res_df.dtypes[-2], "object")
    # Every quality rating must be one of "Good", "OK", or "Bad".
    self.assertFalse(
        any(
            list(
                map(lambda x: x not in ["Good", "Bad", "OK"],
                    res_df.filter([res_df.columns[-2]]).values))))

    # Test that none of the suggested features are already present.
    def chk_feature_nonexistence(row):
        suggested_features = row["SuggestedFeatures"].split(',')
        for feat in suggested_features:
            if '::' in feat:
                field_name, field_value = feat.split('::')
                self.assertIn(field_name, self.fields_without_label)
                fld_no = list(res_df.columns).index(field_name)
                if self.schema_without_label[fld_no] in ["text", "text2vec"]:
                    self.assertNotIn(' ' + field_value + ' ',
                                     row[field_name].lower())
                elif self.schema_without_label[fld_no] == "set":
                    if len(field_value) > 0:
                        self.assertNotIn(field_value, row[field_name])
                elif self.schema_without_label[fld_no] in [
                        "string", "numeric", "boolean"
                ]:
                    self.assertNotEqual(field_value, row[field_name])
            else:
                field_name = feat
                if len(field_name) > 0:
                    self.assertIn(field_name, self.fields_without_label)

    res_df.apply(chk_feature_nonexistence, axis=1)

def classify(cachedModelID, data):
    startedTime = datetime.datetime.now()
    assert cachedModelID in cachedMSR, "Model not found."
    model = cachedMSR[cachedModelID]['selectedModel']
    emptyResults = {'id': -1, 'classSummaries': []}

    # debug
    print('Received a dataset with', len(data['features']),
          'features to classify.')
    if len(data['features']) == 0:
        print('There are no features; returning an empty result set.')
        return emptyResults
    print('Received a dataset with', len(data['features'][0]['data']),
          'rows to classify.')
    if len(data['features'][0]['data']) == 0:
        print('There is no data; returning an empty result set.')
        return emptyResults

    candidate = model["candidate"]
    features = candidate["features"]
    config = candidate["config"]
    unlabeled_df = datasetToDataframe(data)
    filtered_input_df = unlabeled_df.filter([f['name'] for f in features])

    # Restore the trained model and classify the batch, attaching the
    # input-quality recommendations to the prediction table.
    lr, fm, lm = loadTrainedModel(model)
    ac = Classifier(model_configuration=config)
    ac.load_models(lr, fm, lm)
    res_df = ac.predict_explain(input_df=filtered_input_df, topN_features=10)
    reccom_df = ac.input_qlty(input_df=filtered_input_df, topN=10)
    res_df = pd.concat([res_df, reccom_df.filter(["SuggestedFeatures"])],
                       axis=1)

    # Build one summary per predicted label.
    plCountSeries = res_df.groupby('PredictedLabel').PredictedLabel.count()
    labels = list(plCountSeries.keys())
    classSummaries = []
    for label in labels:
        filtered_res_df = res_df[res_df.PredictedLabel == label]
        entropies = []
        probabilities = []
        results = []
        for data_index, row in filtered_res_df.iterrows():
            entropies.append(float(row.Entropy))
            probsDict, allLabels = unpackProbs(row.Probabilities)
            probabilities.append(float(probsDict[label]))
            contributors = unpackContribs(row.TopContributors)
            recommends = unpackSuggestedFeatures(row.SuggestedFeatures)
            input_data = []
            for feat in data['features']:
                input_data.append({
                    'id': id(),
                    'feature': feat['feature'],
                    'data': [feat['data'][data_index]]
                })
            data_instance = {
                'id': id(),
                'dataset': {'id': id(), 'features': input_data},
                'index': data_index
            }
            classificationResult = {
                'id': id(),
                'allLabels': allLabels,
                'entropy': float(row.Entropy),
                'contributors': contributors,
                'dataInstance': data_instance,
                'predictedLabel': {
                    'id': id(),
                    'label': label,
                    'probability': float(probsDict[label])
                },
                'recommends': recommends
            }
            results.append(classificationResult)
        classSummary = {
            'id': id(),
            'label': label,
            'numInstances': int(plCountSeries[label]),
            'probabilities': probabilities,
            'entropies': entropies,
            'results': results
        }
        classSummaries.append(classSummary)
    batchClassificationResult = {'id': id(), "classSummaries": classSummaries}
    print('Classification time: ' +
          str((datetime.datetime.now() - startedTime).total_seconds()) +
          ' seconds')
    return batchClassificationResult

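# A minimal usage sketch for classify; the model ID and payload are
# hypothetical, and the exact shape of each 'feature' descriptor is an
# assumption (the code above only requires 'feature' and 'data' keys, with
# the model already cached in cachedMSR under the given ID):
#
# payload = {
#     'features': [
#         {'feature': {'name': 'subject'}, 'data': ['re: invoice', 'hello']},
#         {'feature': {'name': 'body'}, 'data': ['see attached', 'hi there']},
#     ]
# }
# result = classify('cached-model-42', payload)
# for summary in result['classSummaries']:
#     print(summary['label'], summary['numInstances'])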