Exemple #1
0
def nb_classifier(features, links_true, train_size=0.2, cv=None):
    """Fit a Naive Bayes record-linkage classifier and score every pair.

    Two modes are supported:

    * ``cv is None`` (hold-out): the classifier is fitted on the first
      ``int(len(features) * train_size)`` rows of ``features`` and then
      applied to all of ``features``.
    * otherwise: ``cross_val_predict`` is called twice, once with
      ``method='predict'`` and once with ``method='predict_proba'``.

    Args:
        features: comparison vectors; must expose ``.index`` and support
            row slicing (presumably a pandas DataFrame -- confirm with
            the caller).
        links_true: known true links; only its ``.index`` is used in
            hold-out mode.
        train_size: fraction of ``features`` used for training in
            hold-out mode. Must satisfy ``0 < train_size < 1``.
        cv: forwarded to ``cross_val_predict``; ``None`` selects the
            hold-out mode.

    Returns:
        A ``(matches, df_nb)`` tuple. In hold-out mode ``matches`` is the
        result of ``nb.predict(features)`` and ``df_nb`` is a DataFrame
        with a single ``'score'`` column built from ``nb.prob(features)``.
        In cross-validation mode ``matches`` is the ``.index`` of the
        ``'predict'`` result and ``df_nb`` is the ``'predict_proba'``
        result.

    Raises:
        ValueError: in hold-out mode, if ``train_size`` is not strictly
            between 0 and 1.
    """
    # train_size is only consumed by the hold-out branch, so it is only
    # validated there; checking first avoids building a classifier that
    # would never be fitted.
    if cv is None and not 0 < train_size < 1:
        raise ValueError(
            "train_size must be greater than 0 and less than 1, "
            f"got {train_size!r}")

    nb = rl.NaiveBayesClassifier(binarize=0.3)
    # Debug output kept from the original implementation.
    print(features)
    print(len(features))
    if cv is None:
        # Index.intersection is the explicit set operation; the `&`
        # operator on an Index no longer means "intersection" in current
        # pandas releases.
        golden_match_index = features.index.intersection(links_true.index)
        train_index = int(len(features) * train_size)
        print(train_index)
        # Train on the leading slice only.
        nb.fit(features[0:train_index], golden_match_index)

        # Predict the match status for all record pairs
        matches = nb.predict(features)

        df_nb = pd.DataFrame(nb.prob(features))
        df_nb.columns = ['score']
    else:
        # NOTE(review): `cv` is passed positionally as the 4th argument.
        # If this is scikit-learn's cross_val_predict, that position is
        # not `cv` -- confirm against the function actually imported.
        df_results = cross_val_predict(nb,
                                       features,
                                       links_true,
                                       cv,
                                       method='predict')
        matches = df_results.index
        df_nb = cross_val_predict(nb,
                                  features,
                                  links_true,
                                  cv,
                                  method='predict_proba')

    return matches, df_nb
    def test_bernoulli_naive_bayes(self):
        """Smoke-test the Naive Bayes classifier on rounded vectors.

        Trains on the rounded ``self.y_train`` with ``self.matches_index``
        and then calls ``predict`` and ``prob`` on the rounded ``self.y``.
        No assertions are made; the test passes if nothing raises.
        """
        classifier = recordlinkage.NaiveBayesClassifier()

        # Training step.
        train_vectors = self.y_train.round()
        classifier.learn(train_vectors, self.matches_index)

        # Classification step.
        predict_vectors = self.y.round()
        classifier.predict(predict_vectors)

        # Probability step.
        prob_vectors = self.y.round()
        classifier.prob(prob_vectors)
Exemple #3
0
def init_model(classifier, num_features, **kwargs):
    """Instantiate the model that corresponds to a classifier key.

    Args:
        classifier: one of the ``keys.*`` classifier identifiers handled
            below.
        num_features: number of input features; only forwarded to the
            perceptron models.
        **kwargs: extra keyword arguments passed to the model
            constructor. For Naive Bayes, ``binarize`` defaults to
            ``constants.NAIVE_BAYES_BINARIZE`` when not supplied.

    Returns:
        The freshly constructed model.

    Raises:
        ValueError: if ``classifier`` matches none of the supported keys.
    """
    # Keys are compared by value (`==`), not identity (`is`): an equal
    # key that is a distinct object (e.g. a string read from a CLI or a
    # config file) must still select its model.
    if classifier == keys.NAIVE_BAYES:
        # add `binarize` threshold if not already specified
        kwargs.setdefault("binarize", constants.NAIVE_BAYES_BINARIZE)
        model = rl.NaiveBayesClassifier(**kwargs)

    elif classifier == keys.LINEAR_SVM:
        model = rl.SVMClassifier(**kwargs)

    elif classifier == keys.SVM:
        model = classifiers.SVCClassifier(**kwargs)

    elif classifier == keys.SINGLE_LAYER_PERCEPTRON:
        model = classifiers.SingleLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.MULTI_LAYER_PERCEPTRON:
        model = classifiers.MultiLayerPerceptron(num_features, **kwargs)

    else:
        err_msg = (
            f'Classifier not supported: {classifier}. '
            f'It should be one of {set(constants.CLASSIFIERS)}'
        )
        LOGGER.critical(err_msg)
        raise ValueError(err_msg)

    LOGGER.info('Model initialized: %s', model)

    return model
Exemple #4
0
def init_model(classifier: str, num_features: int, **kwargs):
    """Instantiate the model that corresponds to a classifier key.

    Args:
        classifier: one of the ``keys.*`` classifier identifiers handled
            below.
        num_features: number of input features; forwarded to the
            perceptron and ensemble models only.
        **kwargs: extra keyword arguments passed to the model
            constructor. For Naive Bayes, logistic regression and linear
            SVM they are layered on top of the matching
            ``constants.*_PARAMS`` defaults, with caller values winning.

    Returns:
        The freshly constructed model.

    Raises:
        ValueError: if ``classifier`` matches none of the supported keys.
    """
    # Keys are compared by value (`==`), not identity (`is`): two equal
    # strings are not guaranteed to be the same object, so an identity
    # test can reject a perfectly valid key.
    if classifier == keys.NAIVE_BAYES:
        # Project defaults first, caller overrides second.
        kwargs = {**constants.NAIVE_BAYES_PARAMS, **kwargs}
        model = rl.NaiveBayesClassifier(**kwargs)

    elif classifier == keys.LOGISTIC_REGRESSION:
        kwargs = {**constants.LOGISTIC_REGRESSION_PARAMS, **kwargs}
        model = rl.LogisticRegressionClassifier(**kwargs)

    elif classifier == keys.LINEAR_SVM:
        kwargs = {**constants.LINEAR_SVM_PARAMS, **kwargs}
        model = rl.SVMClassifier(**kwargs)

    elif classifier == keys.SVM:
        model = classifiers.SVCClassifier(**kwargs)

    elif classifier == keys.RANDOM_FOREST:
        model = classifiers.RandomForest(**kwargs)

    elif classifier == keys.SINGLE_LAYER_PERCEPTRON:
        model = classifiers.SingleLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.MULTI_LAYER_PERCEPTRON:
        model = classifiers.MultiLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.VOTING_CLASSIFIER:
        model = classifiers.VotingClassifier(num_features, **kwargs)

    elif classifier == keys.GATED_CLASSIFIER:
        model = classifiers.GatedEnsembleClassifier(num_features, **kwargs)

    elif classifier == keys.STACKED_CLASSIFIER:
        model = classifiers.StackedEnsembleClassifier(num_features, **kwargs)

    else:
        err_msg = (
            f'Classifier not supported: {classifier}. '
            f'It should be one of {set(constants.CLASSIFIERS)}'
        )
        LOGGER.critical(err_msg)
        raise ValueError(err_msg)

    LOGGER.info('Model initialized: %s', model)

    return model
Exemple #5
0
# Simulation settings: number of candidate pairs, number of true matches,
# and the per-feature m/u probabilities handed to the generator.
n_pairs = 50000
n_matches = 7000
m_simulate = np.array([.94, .81, .85, .90, .99, .70, .56, .92])
u_simulate = np.array([.19, .23, .50, .11, .20, .14, .50, .09])

# Generate the comparison vectors together with the true links
# (fixed seed so the run is reproducible).
X_data, links_true = binary_vectors(n_pairs,
                                    n_matches,
                                    m=m_simulate,
                                    u=u_simulate,
                                    random_state=535,
                                    return_links=True)

# Fit a NaiveBayesClassifier on the simulated data.
cl = rl.NaiveBayesClassifier()
cl.fit(X_data, links_true)

# Report the fitted parameters, one classifier attribute per line, so
# they can be compared with the simulation settings above.
for _label, _attribute in (
        ("p probability P(Match):", "p"),
        ("m probabilities P(x_i=1|Match):", "m_probs"),
        ("u probabilities P(x_i=1|Non-Match):", "u_probs"),
        ("log m probabilities P(x_i=1|Match):", "log_m_probs"),
        ("log u probabilities P(x_i=1|Non-Match):", "log_u_probs"),
        ("log weights of features:", "log_weights"),
        ("weights of features:", "weights"),
):
    print(_label, getattr(cl, _attribute))

# Classify the same data and report how many links were predicted.
links_pred = cl.predict(X_data)
print("Predicted number of links:", len(links_pred))
Exemple #6
0
# For each `preproc` code: (banner printed to stdout, suffix appended to the
# compared column names, short label used in result rows).
_PREPROC_VARIANTS = {
    0: ("No preprocesing", "", "none"),
    1: ("Cleaned fields", "_clean", "clean"),
    2: ("Soundex encoding", "_soundex", "soundex"),
    3: ("Nysiis encoding", "_nysiis", "nysiis"),
    4: ("Metaphone encoding", "_metaphone", "metaphone"),
    5: ("Match-rating encoding", "_match_rating", "match_rating"),
}

# For each `comparison_variant` code: its label. Every label except "exact"
# is also passed verbatim as the `method` argument of compare.String.
_COMPARISON_VARIANTS = {
    0: "exact",
    1: "levenshtein",
    2: "damerau_levenshtein",
    3: "jaro",
    4: "jarowinkler",
    5: "qgram",
    6: "cosine",
    7: "smith_waterman",
}

# Base names of the columns compared between the two data sets.
_COMPARED_FIELDS = ("title", "authors", "venue")

# (name, banner, factory, supervised) in the order the classifiers are run.
# The factories are lambdas so nothing is instantiated until needed.
_CLASSIFIER_VARIANTS = (
    ("logreg", "Logistic Regression classifier",
     lambda: recordlinkage.LogisticRegressionClassifier(), True),
    ("bayes", "Naive Bayes classifier",
     lambda: recordlinkage.NaiveBayesClassifier(binarize=0.75), True),
    ("svm", "Support Vector Machine classifier",
     lambda: recordlinkage.SVMClassifier(), True),
    ("kmeans", "KMeans classifier",
     lambda: recordlinkage.KMeansClassifier(), False),
    ("ecm", "ECM classifier",
     lambda: recordlinkage.ECMClassifier(binarize=0.75), False),
)


def _lookup_variant(table, key, what):
    """Return ``table[key]`` or raise ``ValueError`` naming the bad variant."""
    try:
        return table[key]
    except (KeyError, TypeError):
        # TypeError covers unhashable keys, which are just as invalid.
        raise ValueError(f"Unknown {what} variant {key}") from None


def _build_comparator(field_suffix, comp_description):
    """Create a Compare object with one comparison per compared field."""
    comp = recordlinkage.Compare()
    for field in _COMPARED_FIELDS:
        column = field + field_suffix
        if comp_description == "exact":
            comp.add(compare.Exact(column, column))
        else:
            comp.add(compare.String(column, column, method=comp_description))
    return comp


def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    """Run one indexing/comparison configuration through the classifiers.

    The function indexes the module-level train and test data sets,
    compares the ``title``, ``authors`` and ``venue`` columns (with the
    suffix selected by ``preproc``), and then runs each requested
    classifier. Supervised classifiers are fitted on the training
    comparison vectors with ``links_train`` and applied to the test
    vectors; unsupervised ones call ``fit_predict`` on the test vectors.

    It relies on these names being defined at module level:
    ``dataDBLP_train``, ``dataScholar_train``, ``dataDBLP_test``,
    ``dataScholar_test``, ``links_train``, ``debug`` and
    ``print_experiment_evaluation``.

    Args:
        win_len: ``0`` selects block indexing on ``'year'``; a positive
            value selects sorted-neighbourhood indexing on ``'year'``
            with that window.
        preproc: preprocessing code, 0-5 (none, clean, soundex, nysiis,
            metaphone, match_rating).
        comparison_variant: comparison code, 0-7 (exact, levenshtein,
            damerau_levenshtein, jaro, jarowinkler, qgram, cosine,
            smith_waterman).
        run_only: when given, only the classifier with this name is run
            (``'logreg'``, ``'bayes'``, ``'svm'``, ``'kmeans'`` or
            ``'ecm'``); ``None`` runs all five.

    Returns:
        A list with one tuple per classifier that was run:
        ``(index_description, preproc_description, comp_description,
        classifier_description, match, compare_ms, train_ms,
        classify_ms)``. The three timings are ``time.time()`` differences
        multiplied by 1000; ``train_ms`` is 0 for unsupervised
        classifiers.

    Raises:
        ValueError: if any argument does not name a known variant. All
            arguments are checked before any indexing or comparison is
            started, so a bad call fails immediately.
    """
    # Validate everything up front so a typo does not cost a full
    # indexing and comparison run before it is reported.
    if not (win_len == 0 or win_len > 0):
        raise ValueError(f"Invalid window length {win_len}")
    preproc_banner, field_suffix, preproc_description = _lookup_variant(
        _PREPROC_VARIANTS, preproc, "preprocessing")
    comp_description = _lookup_variant(
        _COMPARISON_VARIANTS, comparison_variant, "comparison")
    known_classifiers = [variant[0] for variant in _CLASSIFIER_VARIANTS]
    if run_only is not None and run_only not in known_classifiers:
        # Without this check an unknown name would skip every classifier
        # and silently return an empty list.
        raise ValueError(f"Unknown classifier variant {run_only}")

    # window length
    if win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    else:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)
    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # preprocessing
    print(preproc_banner)
    print(f"Preprocessing used: {preproc_description}")

    # comparator
    comp = _build_comparator(field_suffix, comp_description)
    print(f"String comparison: {comp_description}")

    print("Start compare for training data set")
    start = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train, dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - start))
    print("Start compare for test data set")
    start = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save time compare for evaluation
    time_compare = time.time() - start
    print("Compare on test data took %.2fs" % (time_compare))

    matches = []
    for (classifier_description, banner, make_classifier,
         supervised) in _CLASSIFIER_VARIANTS:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue
        print(banner)
        classifier = make_classifier()

        if supervised:
            start = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - start
            start = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - start
        else:
            start = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - start
            time_train = 0
        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match, "-".join((index_description, preproc_description,
                                 comp_description)))

    return matches
Exemple #7
0
def create_and_train_naive_bayes():
    """Build a default NaiveBayes classifier and train it.

    A fresh ``rl.NaiveBayesClassifier`` is handed to
    ``train_supervised_classifier``; whatever that call returns is
    returned unchanged.
    """
    naive_bayes = rl.NaiveBayesClassifier()
    trained = train_supervised_classifier(naive_bayes)
    return trained