Ejemplo n.º 1
0
def svm_classifier(features, links_true, train_size=0.2, cv=None):
    """Classify candidate record pairs with a support vector machine.

    Without ``cv`` the classifier is fitted on the leading ``train_size``
    fraction of ``features`` and then applied to every row of ``features``.
    With ``cv`` both the predictions and the decision scores are taken from
    ``cross_val_predict``.

    Args:
        features: Comparison vectors, one row per candidate record pair.
        links_true: Known true links; its ``.index`` is intersected with
            ``features.index`` to obtain the matches used for training.
        train_size: Fraction of ``features`` (counted from the first row)
            used for training when ``cv`` is None. Must satisfy
            ``0 < train_size < 1``.
        cv: Cross-validation setting forwarded to ``cross_val_predict``;
            None selects the simple split described above.

    Returns:
        A ``(matches, df_svm)`` tuple: the predicted matches and the
        decision scores. In the split branch ``df_svm`` is a DataFrame with
        a single ``'score'`` column.

    Raises:
        ValueError: If ``cv`` is None and ``train_size`` is not strictly
            between 0 and 1.
    """
    # train_size is only consumed by the split branch, so it is only
    # validated there; fail before any model is created.
    if cv is None and not 0 < train_size < 1:
        raise ValueError(
            "train_size must be greater than 0 and less than 1, "
            f"got {train_size!r}")

    svm = rl.SVMClassifier()

    if cv is None:
        # Use the explicit set operation: `&` between two indexes is no
        # longer a set intersection in current pandas.
        golden_match_index = features.index.intersection(links_true.index)
        train_index = int(len(features) * train_size)

        # Train on the leading slice only.
        svm.fit(features[0:train_index], golden_match_index)

        # Predict the match status for all record pairs.
        matches = svm.predict(features)

        # Expose the raw decision values of the underlying kernel.
        df_svm = pd.DataFrame(svm.kernel.decision_function(features))
        df_svm.columns = ['score']
    else:
        df_results = cross_val_predict(svm,
                                       features,
                                       links_true,
                                       cv,
                                       method='predict')
        matches = df_results.index
        df_svm = cross_val_predict(svm,
                                   features,
                                   links_true,
                                   cv,
                                   method='decision_function')

    return matches, df_svm
Ejemplo n.º 2
0
def init_model(classifier, num_features, **kwargs):
    """Instantiate the model identified by ``classifier``.

    Args:
        classifier: Identifier of the model to build; expected to equal one
            of the constants in ``keys`` handled below.
        num_features: Number of input features; only passed to the
            perceptron models.
        **kwargs: Extra keyword arguments forwarded to the model
            constructor. For naive Bayes, ``binarize`` defaults to
            ``constants.NAIVE_BAYES_BINARIZE`` when not supplied.

    Returns:
        The newly created model instance.

    Raises:
        ValueError: If ``classifier`` matches none of the supported keys.
    """
    # Keys are compared with `==`: identity (`is`) only succeeds when the
    # caller happens to pass the very same object as the constant.
    if classifier == keys.NAIVE_BAYES:
        # Add the `binarize` threshold if not already specified.
        kwargs.setdefault("binarize", constants.NAIVE_BAYES_BINARIZE)
        model = rl.NaiveBayesClassifier(**kwargs)

    elif classifier == keys.LINEAR_SVM:
        model = rl.SVMClassifier(**kwargs)

    elif classifier == keys.SVM:
        model = classifiers.SVCClassifier(**kwargs)

    elif classifier == keys.SINGLE_LAYER_PERCEPTRON:
        model = classifiers.SingleLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.MULTI_LAYER_PERCEPTRON:
        model = classifiers.MultiLayerPerceptron(num_features, **kwargs)

    else:
        err_msg = (
            f'Classifier not supported: {classifier}. '
            f'It should be one of {set(constants.CLASSIFIERS)}'
        )
        LOGGER.critical(err_msg)
        raise ValueError(err_msg)

    LOGGER.info('Model initialized: %s', model)

    return model
Ejemplo n.º 3
0
    def test_svm(self):
        """Fit and predict with the SVM classifier; ``prob`` must fail."""
        classifier = rl.SVMClassifier()
        classifier.fit(self.X_train, self.y_train)
        classifier.predict(self.X_test)

        # The SVM offers no probabilities, so asking for them must raise.
        with pytest.raises(AttributeError):
            classifier.prob(self.X_train)
Ejemplo n.º 4
0
    def test_svm(self):
        """Learn and predict with the SVM classifier; ``prob`` must fail."""
        classifier = recordlinkage.SVMClassifier()
        classifier.learn(self.y_train, self.matches_index)
        classifier.predict(self.y)

        # The SVM offers no probabilities, so asking for them must raise.
        with pytest.raises(AttributeError):
            classifier.prob(self.y)
Ejemplo n.º 5
0
def init_model(classifier: str, num_features: int, **kwargs):
    """Instantiate the model identified by ``classifier``.

    Args:
        classifier: Identifier of the model to build; expected to equal one
            of the constants in ``keys`` handled below.
        num_features: Number of input features; passed to the perceptron
            and ensemble models only.
        **kwargs: Extra keyword arguments forwarded to the model
            constructor. For naive Bayes, logistic regression and linear
            SVM they are layered on top of the matching defaults in
            ``constants``, so caller-supplied values win.

    Returns:
        The newly created model instance.

    Raises:
        ValueError: If ``classifier`` matches none of the supported keys.
    """
    # Keys are compared with `==`: identity (`is`) on strings only succeeds
    # when both sides happen to be the same object, which is not guaranteed
    # for values read from a command line or a configuration file.
    if classifier == keys.NAIVE_BAYES:
        # Defaults first, explicit keyword arguments override them.
        kwargs = {**constants.NAIVE_BAYES_PARAMS, **kwargs}
        model = rl.NaiveBayesClassifier(**kwargs)

    elif classifier == keys.LOGISTIC_REGRESSION:
        kwargs = {**constants.LOGISTIC_REGRESSION_PARAMS, **kwargs}
        model = rl.LogisticRegressionClassifier(**kwargs)

    elif classifier == keys.LINEAR_SVM:
        kwargs = {**constants.LINEAR_SVM_PARAMS, **kwargs}
        model = rl.SVMClassifier(**kwargs)

    elif classifier == keys.SVM:
        model = classifiers.SVCClassifier(**kwargs)

    elif classifier == keys.RANDOM_FOREST:
        model = classifiers.RandomForest(**kwargs)

    elif classifier == keys.SINGLE_LAYER_PERCEPTRON:
        model = classifiers.SingleLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.MULTI_LAYER_PERCEPTRON:
        model = classifiers.MultiLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.VOTING_CLASSIFIER:
        model = classifiers.VotingClassifier(num_features, **kwargs)

    elif classifier == keys.GATED_CLASSIFIER:
        model = classifiers.GatedEnsembleClassifier(num_features, **kwargs)

    elif classifier == keys.STACKED_CLASSIFIER:
        model = classifiers.StackedEnsembleClassifier(num_features, **kwargs)

    else:
        err_msg = (
            f'Classifier not supported: {classifier}. '
            f'It should be one of {set(constants.CLASSIFIERS)}'
        )
        LOGGER.critical(err_msg)
        raise ValueError(err_msg)

    LOGGER.info('Model initialized: %s', model)

    return model
Ejemplo n.º 6
0
# Draw a random sample of at most 2,000,000 rows from the comparison data.
data_sample = data.take(np.random.permutation(len(data))[:2000000])

# Tentative matches: rows whose values sum to more than 6.
# NOTE(review): `dfA_sample` is not defined in this snippet, while
# `data_sample` above is otherwise only used for the training set --
# confirm which frame is intended here.
matches = dfA_sample[dfA_sample.sum(axis=1) > 6]
# Tentative non-matches (disabled):
#nonmatches = data_sample[data_sample.sum(axis=1) < 4]
# Index of the tentative matches.
match_index = matches.index

# Training set and the tentative matches that fall inside it.
golden_pairs = data_sample[0:2000000]
# Use the explicit set operation: `&` between two indexes is no longer a
# set intersection in current pandas.
golden_matches_index = golden_pairs.index.intersection(match_index)

# Train the classifier.
svm = rl.SVMClassifier()
svm.learn(golden_pairs, golden_matches_index)
# Predict the match status for all record pairs.
result_svm = svm.predict(data)
len(result_svm)

# Confusion matrix of the tentative matches against the predictions.
conf_svm = rl.confusion_matrix(match_index, result_svm, len(data))
conf_svm

# The F-score for this classification.
rl.fscore(conf_svm)

m_last = pd.DataFrame(result_svm)

#loading data for review
Ejemplo n.º 7
0
def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    """Run one indexing/preprocessing/comparison setup through classifiers.

    The function works on the module-level data sets (``dataDBLP_train``,
    ``dataScholar_train``, ``dataDBLP_test``, ``dataScholar_test``), the
    training links ``links_train`` and the ``debug`` flag.

    Args:
        win_len: 0 selects blocking on 'year'; a positive value selects a
            sorted-neighbourhood index on 'year' with that window.
        preproc: Preprocessing variant 0-5; it selects the suffix of the
            columns that are compared.
        comparison_variant: 0 for exact comparison, 1-7 for one of the
            string similarity methods.
        run_only: When not None, only the classifier with this short name
            ('logreg', 'bayes', 'svm', 'kmeans' or 'ecm') is run.

    Returns:
        A list with one tuple per executed classifier:
        ``(index_description, preproc_description, comp_description,
        classifier_description, match, compare_ms, train_ms, classify_ms)``.

    Raises:
        ValueError: For a negative window length or an unknown
            preprocessing / comparison variant.
    """
    # --- candidate pairs -------------------------------------------------
    if win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    elif win_len > 0:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex(
            'year', window=win_len)
    else:
        raise ValueError(f"Invalid window length {win_len}")
    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # --- preprocessing ---------------------------------------------------
    # (variant, announcement, column suffix, short description)
    preproc_variants = (
        (0, "No preprocesing", "", "none"),
        (1, "Cleaned fields", "_clean", "clean"),
        (2, "Soundex encoding", "_soundex", "soundex"),
        (3, "Nysiis encoding", "_nysiis", "nysiis"),
        (4, "Metaphone encoding", "_metaphone", "metaphone"),
        (5, "Match-rating encoding", "_match_rating", "match_rating"),
    )
    for code, notice, field_suffix, preproc_description in preproc_variants:
        if preproc == code:
            print(notice)
            break
    else:
        raise ValueError(f"Unknown preprocessing variant {preproc}")
    print(f"Preprocessing used: {preproc_description}")

    # --- comparator ------------------------------------------------------
    # The same three columns are compared in every variant.
    columns = [base + field_suffix for base in ('title', 'authors', 'venue')]
    # For the string variants the description equals the method name.
    string_variants = (
        (1, "levenshtein"),
        (2, "damerau_levenshtein"),
        (3, "jaro"),
        (4, "jarowinkler"),
        (5, "qgram"),
        (6, "cosine"),
        (7, "smith_waterman"),
    )
    comp = recordlinkage.Compare()
    if comparison_variant == 0:
        comp_description = "exact"
        for column in columns:
            comp.add(compare.Exact(column, column))
    else:
        for code, comp_description in string_variants:
            if comparison_variant == code:
                break
        else:
            raise ValueError(
                f"Unknown comparison variant {comparison_variant}")
        for column in columns:
            comp.add(compare.String(column, column, method=comp_description))
    print(f"String comparison: {comp_description}")

    # --- feature computation ---------------------------------------------
    print("Start compare for training data set")
    started = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train, dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - started))
    print("Start compare for test data set")
    started = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # Only the test comparison time is kept for the evaluation.
    time_compare = time.time() - started
    print("Compare on test data took %.2fs" % (time_compare))

    # --- classification --------------------------------------------------
    # (short name, announcement, needs training links, lazy constructor)
    classifier_variants = (
        ('logreg', "Logistic Regression classifier", True,
         lambda: recordlinkage.LogisticRegressionClassifier()),
        ('bayes', "Naive Bayes classifier", True,
         lambda: recordlinkage.NaiveBayesClassifier(binarize=0.75)),
        ('svm', "Support Vector Machine classifier", True,
         lambda: recordlinkage.SVMClassifier()),
        ('kmeans', "KMeans classifier", False,
         lambda: recordlinkage.KMeansClassifier()),
        ('ecm', "ECM classifier", False,
         lambda: recordlinkage.ECMClassifier(binarize=0.75)),
    )
    matches = []
    for classifier_description, notice, supervised, build in \
            classifier_variants:
        # Skip the others if only one classifier is requested.
        if run_only is not None and run_only != classifier_description:
            continue
        print(notice)
        classifier = build()

        if supervised:
            started = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - started
            started = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - started
        else:
            # Unsupervised models fit and classify the test data in one go.
            started = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - started
            time_train = 0

        # Durations are recorded in milliseconds.
        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        if debug:
            print("%d matches" % len(match))
            experiment_name = "-".join(
                (index_description, preproc_description, comp_description))
            print_experiment_evaluation(match, experiment_name)

    return matches
def get_matches(locu_train_path, foursquare_train_path, matches_train_path,
                locu_test_path, foursquare_test_path,
                output_path='matches_test.csv'):
    """Train match classifiers on the training files and link the test files.

    The training and test data sets are loaded and preprocessed, candidate
    pairs are indexed and compared, a logistic regression and an SVM
    classifier are trained, and the pairs the SVM predicts as matches on the
    test data are written to ``output_path``.

    Args:
        locu_train_path: Path of the Locu training JSON file.
        foursquare_train_path: Path of the Foursquare training JSON file.
        matches_train_path: Path of the CSV file with the known training
            matches.
        locu_test_path: Path of the Locu test JSON file.
        foursquare_test_path: Path of the Foursquare test JSON file.
        output_path: Destination of the CSV with the predicted
            ``locu_id``/``foursquare_id`` pairs. Defaults to
            ``'matches_test.csv'``, the previously hard-coded file name.

    Returns:
        None. The result is the CSV file written to ``output_path``.
    """
    four_train = pd.read_json(foursquare_train_path)
    locu_train = pd.read_json(locu_train_path)

    four_test = pd.read_json(foursquare_test_path)
    locu_test = pd.read_json(locu_test_path)

    matches_train = pd.read_csv(matches_train_path)

    locu_train, four_train = preprocess(locu_train, four_train)
    locu_test, four_test = preprocess(locu_test, four_test)
    matches_train = preprocess_matches(matches_train)

    candidate_pairs = index_pairs(locu_train, four_train)
    test_candidate_pairs = index_pairs(locu_test, four_test)

    features = compare_strings(locu_train, four_train, candidate_pairs)
    test_features = compare_strings(locu_test, four_test, test_candidate_pairs)

    train_pairs, train_matches_index, all_matches_index = traintestsplit(
        features, matches_train)

    # Train Logistic Regression classifier
    logreg = recordlinkage.LogisticRegressionClassifier()
    logreg.learn(train_pairs, train_matches_index)

    # Train SVM classifier
    svm = recordlinkage.SVMClassifier()
    svm.learn(train_pairs, train_matches_index)

    # Predict on training data with both classifiers
    svm_results_index = predict(features, svm)
    logreg_results_index = predict(features, logreg)

    # Training diagnostics. They are computed but not used further in this
    # function; pass them to recordlinkage.precision / recall / accuracy /
    # fscore when inspecting the classifiers.
    svm_confn_matrix = recordlinkage.confusion_matrix(all_matches_index,
                                                      svm_results_index,
                                                      len(features))
    logreg_confn_matrix = recordlinkage.confusion_matrix(
        all_matches_index, logreg_results_index, len(features))

    # Predict on test data with SVM
    test_results_index = predict(test_features, svm)

    # Format and write to CSV: name the index levels, keep the predicted
    # pairs and drop every feature column so that only the id pairs remain.
    test_features.index = test_features.index.rename(
        ['locu_id', 'foursquare_id'])
    test_match_pairs = test_features.loc[test_results_index]
    matches_test = test_match_pairs.drop(test_match_pairs.columns[::], axis=1)
    matches_test.to_csv(output_path)

    # Build a side-by-side view of the matched Foursquare and Locu records.
    test_tuples = list(matches_test.index)
    test_locu_index = [i[0] for i in test_tuples]
    test_four_index = [i[1] for i in test_tuples]
    test_locu_matches = locu_test.loc[test_locu_index]
    test_four_matches = four_test.loc[test_four_index]

    temp = matches_test.reset_index().join(test_four_matches,
                                           on=['foursquare_id'])
    test_match_pairs = temp.join(test_locu_matches,
                                 on=['locu_id'],
                                 lsuffix='_foursquare',
                                 rsuffix='_locu').set_index(
                                     matches_test.index.names)

    # Interleave the columns of both sources.
    # NOTE(review): the fixed order assumes exactly 14 columns, presumably 7
    # per source -- confirm against the preprocessed schema.
    cols = np.array(test_match_pairs.columns.tolist())
    order = [0, 7, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13]
    cols = list(cols[order])
    # NOTE(review): this reordered view is neither returned nor displayed.
    test_matches_reordered = test_match_pairs[cols]
    return
Ejemplo n.º 9
0
def create_and_train_svm():
    """Build an SVM classifier and pass it to the supervised training helper.

    Returns whatever ``train_supervised_classifier`` returns for it.
    """
    untrained_svm = rl.SVMClassifier()
    return train_supervised_classifier(untrained_svm)