def nb_classifier(features, links_true, train_size=0.2, cv=None):
    """Fit a Naive Bayes record-linkage classifier and score every pair.

    Two modes are supported:

    * ``cv is None``: the model is fitted on the leading ``train_size``
      fraction of ``features`` and then used to predict and score all of
      ``features``.
    * ``cv`` given: prediction and probabilities are delegated to
      ``cross_val_predict``; ``train_size`` is ignored.

    Args:
        features: Comparison vectors. Must expose ``.index`` and support
            row slicing (presumably a pandas DataFrame -- confirm).
        links_true: Known true links. Only its ``.index`` is used when
            ``cv is None``; otherwise it is forwarded unchanged to
            ``cross_val_predict``.
        train_size: Fraction of ``features`` used for fitting when
            ``cv is None``. Must satisfy ``0 < train_size < 1``.
        cv: Forwarded to ``cross_val_predict`` when not ``None``.

    Returns:
        A ``(matches, df_nb)`` tuple: the predicted matches and the
        match probabilities.

    Raises:
        ValueError: If ``cv is None`` and ``train_size`` is not strictly
            between 0 and 1.
    """
    # Resolves the original FIXME: reject fractions that would yield an
    # empty training set (<= 0) or leave nothing held out (>= 1).
    if cv is None and not 0 < train_size < 1:
        raise ValueError(
            f"train_size must be greater than 0 and less than 1, "
            f"got {train_size!r}"
        )

    nb = rl.NaiveBayesClassifier(binarize=0.3)

    # Diagnostic output kept from the original implementation.
    print(features)
    print(len(features))

    if cv is None:
        # Explicit set intersection; the `&` operator on Index objects is
        # no longer a set operation in current pandas.
        golden_match_index = features.index.intersection(links_true.index)
        train_index = int(len(features) * train_size)
        print(train_index)

        # Train on the leading slice only.
        nb.fit(features[0:train_index], golden_match_index)

        # Predict the match status for all record pairs.
        matches = nb.predict(features)
        df_nb = pd.DataFrame(nb.prob(features))
        df_nb.columns = ['score']
    else:
        # NOTE(review): `cv` is passed positionally as the 4th argument.
        # If this is scikit-learn's cross_val_predict, that slot is
        # `groups`, not `cv` -- confirm which function is imported here.
        df_results = cross_val_predict(
            nb, features, links_true, cv, method='predict')
        matches = df_results.index
        df_nb = cross_val_predict(
            nb, features, links_true, cv, method='predict_proba')

    return matches, df_nb
def test_bernoulli_naive_bayes(self):
    """Smoke-test the Naive Bayes classifier on rounded comparison vectors.

    Trains on ``self.y_train`` with ``self.matches_index`` and then calls
    ``predict`` and ``prob`` on ``self.y``. No results are asserted; the
    test only checks that the calls complete.
    """
    classifier = recordlinkage.NaiveBayesClassifier()

    training_vectors = self.y_train.round()
    classifier.learn(training_vectors, self.matches_index)

    classifier.predict(self.y.round())
    classifier.prob(self.y.round())
def init_model(classifier, num_features, **kwargs):
    """Build the classifier identified by ``classifier``.

    Args:
        classifier: One of the ``keys.*`` classifier identifiers
            (presumably strings -- compared by equality).
        num_features: Passed as first argument to the perceptron
            classifiers; unused by the others.
        **kwargs: Forwarded to the selected classifier's constructor.
            For Naive Bayes, ``binarize`` defaults to
            ``constants.NAIVE_BAYES_BINARIZE`` when not supplied.

    Returns:
        The newly constructed model.

    Raises:
        ValueError: If ``classifier`` matches none of the supported keys.
    """
    # Compare by equality, not identity: `is` only matches when the caller
    # hands in the very same object as the constant, which is not
    # guaranteed for equal strings.
    if classifier == keys.NAIVE_BAYES:
        # Add the `binarize` threshold if not already specified.
        if "binarize" not in kwargs:
            kwargs["binarize"] = constants.NAIVE_BAYES_BINARIZE
        model = rl.NaiveBayesClassifier(**kwargs)

    elif classifier == keys.LINEAR_SVM:
        model = rl.SVMClassifier(**kwargs)

    elif classifier == keys.SVM:
        model = classifiers.SVCClassifier(**kwargs)

    elif classifier == keys.SINGLE_LAYER_PERCEPTRON:
        model = classifiers.SingleLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.MULTI_LAYER_PERCEPTRON:
        model = classifiers.MultiLayerPerceptron(num_features, **kwargs)

    else:
        err_msg = (
            f'Classifier not supported: {classifier}. '
            f'It should be one of {set(constants.CLASSIFIERS)}'
        )
        LOGGER.critical(err_msg)
        raise ValueError(err_msg)

    LOGGER.info('Model initialized: %s', model)

    return model
def init_model(classifier: str, num_features: int, **kwargs):
    """Build the classifier identified by ``classifier``.

    Args:
        classifier: One of the ``keys.*`` classifier identifiers.
        num_features: Passed as first argument to the perceptron and
            ensemble classifiers; unused by the others.
        **kwargs: Forwarded to the selected classifier's constructor.
            For Naive Bayes, logistic regression and linear SVM they are
            layered on top of the matching ``constants.*_PARAMS``
            defaults, so caller-supplied values win.

    Returns:
        The newly constructed model.

    Raises:
        ValueError: If ``classifier`` matches none of the supported keys.
    """
    # Compare by equality, not identity: `classifier` is a str, and `is`
    # only matches when the caller hands in the very same object as the
    # constant, which is not guaranteed for equal strings.
    if classifier == keys.NAIVE_BAYES:
        # Project defaults first, explicit keyword arguments override.
        kwargs = {**constants.NAIVE_BAYES_PARAMS, **kwargs}
        model = rl.NaiveBayesClassifier(**kwargs)

    elif classifier == keys.LOGISTIC_REGRESSION:
        kwargs = {**constants.LOGISTIC_REGRESSION_PARAMS, **kwargs}
        model = rl.LogisticRegressionClassifier(**kwargs)

    elif classifier == keys.LINEAR_SVM:
        kwargs = {**constants.LINEAR_SVM_PARAMS, **kwargs}
        model = rl.SVMClassifier(**kwargs)

    elif classifier == keys.SVM:
        model = classifiers.SVCClassifier(**kwargs)

    elif classifier == keys.RANDOM_FOREST:
        model = classifiers.RandomForest(**kwargs)

    elif classifier == keys.SINGLE_LAYER_PERCEPTRON:
        model = classifiers.SingleLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.MULTI_LAYER_PERCEPTRON:
        model = classifiers.MultiLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.VOTING_CLASSIFIER:
        model = classifiers.VotingClassifier(num_features, **kwargs)

    elif classifier == keys.GATED_CLASSIFIER:
        model = classifiers.GatedEnsembleClassifier(num_features, **kwargs)

    elif classifier == keys.STACKED_CLASSIFIER:
        model = classifiers.StackedEnsembleClassifier(num_features, **kwargs)

    else:
        err_msg = (
            f'Classifier not supported: {classifier}. '
            f'It should be one of {set(constants.CLASSIFIERS)}'
        )
        LOGGER.critical(err_msg)
        raise ValueError(err_msg)

    LOGGER.info('Model initialized: %s', model)

    return model
# Simulation settings: number of candidate links, number of true links, and
# the per-feature m and u probabilities handed to the generator.
n_pairs = 50000
n_matches = 7000
m_simulate = np.array([0.94, 0.81, 0.85, 0.90, 0.99, 0.70, 0.56, 0.92])
u_simulate = np.array([0.19, 0.23, 0.50, 0.11, 0.20, 0.14, 0.50, 0.09])

# Generate the binary comparison vectors together with the true links.
# The seed is fixed so the run is reproducible.
X_data, links_true = binary_vectors(
    n_pairs,
    n_matches,
    m=m_simulate,
    u=u_simulate,
    random_state=535,
    return_links=True,
)

# Fit a Naive Bayes classifier on the simulated data.
cl = rl.NaiveBayesClassifier()
cl.fit(X_data, links_true)

# Report the trained parameters, one attribute per line. Attributes are
# looked up lazily so each is read right before it is printed.
for _label, _attribute in (
    ("p probability P(Match):", "p"),
    ("m probabilities P(x_i=1|Match):", "m_probs"),
    ("u probabilities P(x_i=1|Non-Match):", "u_probs"),
    ("log m probabilities P(x_i=1|Match):", "log_m_probs"),
    ("log u probabilities P(x_i=1|Non-Match):", "log_u_probs"),
    ("log weights of features:", "log_weights"),
    ("weights of features:", "weights"),
):
    print(_label, getattr(cl, _attribute))

# Evaluate the model on the same data it was fitted on.
links_pred = cl.predict(X_data)
print("Predicted number of links:", len(links_pred))
def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    """Run one indexing / comparison configuration through the classifiers.

    Args:
        win_len: ``0`` selects blocking on ``'year'``; a positive value
            selects sorted-neighbourhood indexing on ``'year'`` with that
            window.
        preproc: Integer 0-5 selecting which pre-processed field variant
            (column-name suffix) is compared.
        comparison_variant: Integer 0-7 selecting exact comparison (0) or
            one of the string similarity methods (1-7).
        run_only: When not ``None``, only the classifier with this short
            name (``'logreg'``, ``'bayes'``, ``'svm'``, ``'kmeans'``,
            ``'ecm'``) is run.

    Returns:
        A list with one tuple per classifier run:
        ``(index description, preprocessing description, comparison
        description, classifier name, predicted matches, compare time,
        training time, classification time)``, the three timings being
        multiplied by 1000.

    Raises:
        ValueError: For a negative ``win_len`` or an unknown ``preproc`` /
            ``comparison_variant`` value.
    """
    # --- candidate generation --------------------------------------------
    if not (win_len == 0 or win_len > 0):
        raise ValueError(f"Invalid window length {win_len}")
    if win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    else:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)

    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # --- preprocessing variant -------------------------------------------
    # Position in the table is the variant code:
    # (banner, column suffix, short description).
    preproc_table = (
        ("No preprocesing", "", "none"),
        ("Cleaned fields", "_clean", "clean"),
        ("Soundex encoding", "_soundex", "soundex"),
        ("Nysiis encoding", "_nysiis", "nysiis"),
        ("Metaphone encoding", "_metaphone", "metaphone"),
        ("Match-rating encoding", "_match_rating", "match_rating"),
    )
    for code, (banner, field_suffix, preproc_description) in enumerate(
            preproc_table):
        if preproc == code:
            print(banner)
            break
    else:
        raise ValueError(f"Unknown preprocessing variant {preproc}")
    print(f"Preprocessing used: {preproc_description}")

    # --- comparison variant ----------------------------------------------
    comp = recordlinkage.Compare()
    # Code 0 is exact comparison; codes 1-7 are string methods whose name
    # doubles as the description.
    comparison_names = (
        "exact",
        "levenshtein",
        "damerau_levenshtein",
        "jaro",
        "jarowinkler",
        "qgram",
        "cosine",
        "smith_waterman",
    )
    for code, comp_description in enumerate(comparison_names):
        if comparison_variant == code:
            use_exact = code == 0
            break
    else:
        raise ValueError(f"Unknown comparison variant {comparison_variant}")

    # The same three fields are compared in every variant.
    for field in ('title', 'authors', 'venue'):
        column = field + field_suffix
        if use_exact:
            comp.add(compare.Exact(column, column))
        else:
            comp.add(compare.String(column, column, method=comp_description))
    print(f"String comparison: {comp_description}")

    # --- feature computation ---------------------------------------------
    print("Start compare for training data set")
    tick = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train,
                                dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - tick))

    print("Start compare for test data set")
    tick = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # Kept for the evaluation record of every classifier below.
    time_compare = time.time() - tick
    print("Compare on test data took %.2fs" % (time_compare))

    # --- classification --------------------------------------------------
    # (short name, banner, constructor, supervised?). Constructors are
    # wrapped in lambdas so a class is only looked up when actually used.
    classifier_table = (
        ('logreg', "Logistic Regression classifier",
         lambda: recordlinkage.LogisticRegressionClassifier(), True),
        ('bayes', "Naive Bayes classifier",
         lambda: recordlinkage.NaiveBayesClassifier(binarize=0.75), True),
        ('svm', "Support Vector Machine classifier",
         lambda: recordlinkage.SVMClassifier(), True),
        ('kmeans', "KMeans classifier",
         lambda: recordlinkage.KMeansClassifier(), False),
        ('ecm', "ECM classifier",
         lambda: recordlinkage.ECMClassifier(binarize=0.75), False),
    )

    matches = []
    for classifier_description, banner, build, supervised in classifier_table:
        # Skip others if only one classifier is requested.
        if run_only is not None and run_only != classifier_description:
            continue

        print(banner)
        classifier = build()

        if supervised:
            tick = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - tick

            tick = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - tick
        else:
            # Unsupervised models fit and predict in one step on the test
            # vectors, so no separate training time is recorded.
            tick = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - tick
            time_train = 0

        matches.append((
            index_description,
            preproc_description,
            comp_description,
            classifier_description,
            match,
            1000 * time_compare,
            1000 * time_train,
            1000 * time_classify,
        ))

        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match,
                "-".join((index_description, preproc_description,
                          comp_description)))

    return matches
def create_and_train_naive_bayes():
    """Build a default Naive Bayes classifier and train it.

    Returns:
        Whatever ``train_supervised_classifier`` returns for the new
        classifier.
    """
    untrained = rl.NaiveBayesClassifier()
    return train_supervised_classifier(untrained)