def test_census(self):
        logger = get_logger('RL.Test.KmeansClustering.CENSUS')

        census = Census()

        compare_cl = census.get_comparision_object()
        features = compare_cl.compute(census.candidate_links,
                                      census.trainDataA, census.trainDataB)
        logger.info("Features %s", str(features.describe()))

        # Train K-Means Classifier
        logrg = recordlinkage.KMeansClassifier(algorithm='full',
                                               max_iter=1000,
                                               random_state=42)
        logrg.fit(features)

        result = logrg.predict(features)
        log_quality_results(logger, result, census.true_links,
                            len(census.candidate_links))

        # Test the classifier
        compare_cl = census.get_comparision_object()
        features = compare_cl.compute(census.test_links, census.testDataA,
                                      census.testDataB)
        logger.info("Features %s", str(features.describe()))

        result = logrg.predict(features)
        log_quality_results(logger, result, census.true_test_links,
                            len(census.test_links))
    def test_febrl(self):
        logger = get_logger('RL.Test.KmeansClustering.FEBRL')

        febrl = FEBRL()

        compare_cl = febrl.get_comparision_object()
        features = compare_cl.compute(febrl.candidate_links, febrl.trainDataA,
                                      febrl.trainDataB)
        logger.info("Features %s", str(features.describe()))

        # Train K-Means Classifier
        logrg = recordlinkage.KMeansClassifier()
        logrg.fit(features)

        result = logrg.predict(features)
        log_quality_results(logger, result, febrl.true_links,
                            len(febrl.candidate_links))

        # Test the classifier
        compare_cl = febrl.get_comparision_object()
        features = compare_cl.compute(febrl.test_links, febrl.testDataA,
                                      febrl.testDataB)
        logger.info("Features %s", str(features.describe()))

        result = logrg.predict(features)
        log_quality_results(logger, result, febrl.true_test_links,
                            len(febrl.test_links))
    def test_kmeans_manual(self):
        """KMeansClassifier with manual cluster centers"""

        # Make random test data.
        np.random.seed(535)
        manual_mcc = list(np.random.randn(self.X_train.shape[1]))
        manual_nmcc = list(np.random.randn(self.X_train.shape[1]))

        # Initialize the KMeansClassifier
        kmeans = rl.KMeansClassifier()

        # The cluster centers should not be set yet
        assert not hasattr(kmeans, 'match_cluster_center')
        assert not hasattr(kmeans, 'nonmatch_cluster_center')

        # Set the cluster centers
        kmeans.match_cluster_center = manual_mcc
        kmeans.nonmatch_cluster_center = manual_nmcc

        # Perform the prediction
        kmeans.predict(self.X_test)

        # Check the match clusters
        mcc = kmeans.match_cluster_center
        nmcc = kmeans.nonmatch_cluster_center
        assert_almost_equal(mcc, manual_mcc)
        assert_almost_equal(nmcc, manual_nmcc)
Example #4
def create_and_train_kmeans(features):
    """
    Creates and trains a KMeans classifier on the given feature vectors.
    """
    classifier = rl.KMeansClassifier()
    classifier.fit(features)  # `fit` replaces the deprecated `learn` alias
    return classifier
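A minimal usage sketch for this helper; the DataFrames dfA and dfB and the 'name'/'city' columns below are illustrative assumptions, not part of the original snippet:

import recordlinkage as rl

# Build candidate pairs and comparison vectors for two hypothetical DataFrames dfA and dfB.
indexer = rl.Index()
indexer.full()  # full index; use blocking or sorted neighbourhood for larger data
candidate_links = indexer.index(dfA, dfB)

compare_cl = rl.Compare()
compare_cl.string('name', 'name', method='jarowinkler', threshold=0.85, label='name')
compare_cl.exact('city', 'city', label='city')
features = compare_cl.compute(candidate_links, dfA, dfB)

classifier = create_and_train_kmeans(features)
matches = classifier.predict(features)  # pandas (Multi)Index of predicted links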
    def test_cora(self):
        logger = get_logger('RL.Test.KmeansClustering.CORA')

        # Read train data for datasets A & B
        cora = Cora()

        ## Extract features
        compare_cl = cora.get_comparision_object()
        features = compare_cl.compute(cora.candidate_links, cora.trainDataA,
                                      cora.trainDataB)
        logger.info("Features %s", str(features.describe()))

        # Train K-Means Classifier
        logrg = recordlinkage.KMeansClassifier()
        logrg.fit(features)

        result = logrg.predict(features)
        log_quality_results(logger, result, cora.true_links,
                            len(cora.candidate_links))

        #Test the classifier
        compare_cl = cora.get_comparision_object()
        features = compare_cl.compute(cora.test_links, cora.testDataA,
                                      cora.testDataB)
        logger.info("Features %s", str(features.describe()))

        result = logrg.predict(features)
        log_quality_results(logger, result, cora.true_test_links,
                            len(cora.test_links))
    def test_kmeans_no_training_data(self):
        """ Kmeans, no training data"""

        kmeans = recordlinkage.KMeansClassifier()

        with pytest.raises(ValueError):
            kmeans.learn(pandas.DataFrame(columns=self.y_train.columns))
    def test_kmeans_manual(self):
        """KMeansClassifier with manual cluster centers"""

        # Make random test data.
        numpy.random.seed(535)
        manual_mcc = list(numpy.random.randn(self.y_train.shape[1]))
        manual_nmcc = list(numpy.random.randn(self.y_train.shape[1]))

        # Initialize the KMeansClassifier
        kmeans = recordlinkage.KMeansClassifier()

        # Check if the cluster centers are None
        assert kmeans.match_cluster_center is None
        assert kmeans.nonmatch_cluster_center is None

        # Set the cluster centers
        kmeans.match_cluster_center = manual_mcc
        kmeans.nonmatch_cluster_center = manual_nmcc

        # Perform the prediction
        kmeans.predict(self.y)

        # Check the match clusters
        mcc = kmeans.match_cluster_center
        nmcc = kmeans.nonmatch_cluster_center
        assert mcc == manual_mcc
        assert nmcc == manual_nmcc
    def test_kmeans_error(self):

        kmeans = rl.KMeansClassifier()
        kmeans.fit(self.X_train)

        # There are no probabilities
        with pytest.raises(AttributeError):
            kmeans.prob(self.X_train)
    def test_kmeans(self):

        kmeans = rl.KMeansClassifier()
        kmeans.fit(self.X_train)
        result = kmeans.predict(self.X_test)

        assert isinstance(result, pd.MultiIndex)
        assert result.shape[0] == 11670
    def test_kmeans(self):

        kmeans = recordlinkage.KMeansClassifier()
        kmeans.learn(self.y_train)
        kmeans.predict(self.y)

        # There are no probabilities
        with pytest.raises(AttributeError):
            kmeans.prob(self.y)
    def test_kmeans_not_trained(self):
        """
        Raise an error if the classifier is not trained, but a prediction is
        asked.
        """

        kmeans = recordlinkage.KMeansClassifier()

        with pytest.raises(Exception):
            kmeans.predict(self.y)
    def test_kmean_parameters(self):

        kmeans = rl.KMeansClassifier()
        kmeans.fit(self.X_train)

        _, n_features = self.X_train.shape

        assert isinstance(kmeans.match_cluster_center, np.ndarray)
        assert kmeans.match_cluster_center.shape == (n_features, )

        assert isinstance(kmeans.nonmatch_cluster_center, np.ndarray)
        assert kmeans.nonmatch_cluster_center.shape == (n_features, )
Example #13
def link_reduce(from_rest: str, dfs: dict, window: int, th: float, classifier: str, thFusion: float) -> pd.DataFrame:
    dfs_copy = {from_rest: dfs[from_rest]}
    dfs_reduce = dfs.copy()

    # Make copy of dfs with from_rest moved on top
    for rest, df in dfs.items():
        if rest == from_rest:
            continue
        else:
            dfs_copy[rest] = df.copy()

    for rest, df in dfs_copy.items():
        for rr, ddf in dfs_reduce.items():
            if rr == rest:
                continue
            else:
                columns_to_check = ['restaurant']
                print(f"{rest} -> {rr}")
                if df['addressGoogle'].isnull().sum() != len(df['addressGoogle']) and ddf['addressGoogle'].isnull().sum() != len(ddf['addressGoogle']):
                    columns_to_check.append('addressGoogle')
                if df['neighborhood'].isnull().sum() != len(df['neighborhood']) and ddf['neighborhood'].isnull().sum() != len(ddf['neighborhood']):
                    columns_to_check.append('neighborhood')

                #print(f"\tcheck: {columns_to_check}")
                indexer = recordlinkage.Index()

                # 1 - INDEXING
                for col in columns_to_check:
                    indexer.sortedneighbourhood(
                        left_on=col, right_on=col, window=window)
                candidate_links = indexer.index(df, ddf)

                # 2 - COMPARISON
                compare_cl = recordlinkage.Compare(n_jobs=-1)
                for col in columns_to_check:
                    if col == 'addressGoogle':
                        compare_cl.exact(col, col)
                    else:
                        compare_cl.string(col, col, label=col,
                                          threshold=th, method='jarowinkler')
                features = compare_cl.compute(candidate_links, df, ddf)

                # 3 - CLASSIFICATION
                if classifier == "ecm":
                    ecm = recordlinkage.ECMClassifier(
                        init='jaro', binarize=None, max_iter=100, atol=0.0001, use_col_names=True)
                    matches = ecm.fit_predict(features)
                elif classifier == "kmeans":
                    kmeans = recordlinkage.KMeansClassifier()
                    matches = kmeans.fit_predict(features)
                else:
                    raise ValueError(f"Unknown classifier: {classifier}")

                # 4 - COMBINE INFORMATION
                for left, right in matches:
                    if not combine(df.loc[left], ddf.loc[right], thFusion, th):
                        matches = matches.drop((left, right))

                print(f"\tmatches: {len(matches)}")
                dfs_copy[rest] = df.copy()

                # 5 - DROP RIGHT ON MATCHES INDEX
                index_to_drop = set(matches.get_level_values(1))
                print(f"\t{rr} before drop: {len(ddf.index)}")
                ddf.drop(index_to_drop, inplace=True)
                dfs_copy[rr] = ddf.copy()
                dfs_reduce[rr] = ddf.copy()
                print(f"\t{rr} after drop: {len(dfs_reduce[rr].index)}\n")

        del dfs_reduce[rest]

    final_df = pd.concat(list(dfs_copy.values()))
    final_df.dropna(subset=['addressGoogle'], inplace=True)
    final_df.drop_duplicates(inplace=True)
    return final_df
    def collect_identical_rows_alg(self, schema_id, table_name, sorting_key,
                                   fixed_column_names, var_column_names, alg):

        schema_name = 'schema-' + str(schema_id)
        dedup_table_name = '_dedup_' + table_name + "_grouped"

        # TODO: When the user selects rows to remove, collect them in a table.
        # After all clusters have been reviewed, delete those rows (UNDO).

        try:

            # Remove complete duplicates before full dedup
            self.remove_identical_rows(
                schema_id,
                table_name,
            )

            # SELECT id, 'column' FROM "schema_name"."table";
            data_query = 'SELECT * FROM {}.{}'.format(
                *_ci(schema_name, table_name))
            df = pd.read_sql(data_query, con=db.engine)
            df = df.set_index('id')

            # Clean dataset

            ## Remove leading whitespaces
            #df.columns = df.columns.to_series().apply(lambda x: x.strip())

            if sorting_key not in fixed_column_names:
                fixed_column_names.append(sorting_key)

            string_columns = list(df.select_dtypes(include=['object']).columns)
            numerical_columns = list(
                df.select_dtypes(include=['int64']).columns)
            numerical_columns.extend(
                list(df.select_dtypes(include=['float64']).columns))
            date_columns = list(
                df.select_dtypes(include=['datetime64[ns]']).columns)

            ## Clean string values
            for column_name in string_columns:
                df[column_name] = clean(df[column_name])

            # Indexation step
            indexer = recordlinkage.SortedNeighbourhoodIndex(on=sorting_key,
                                                             window=3)
            pairs = indexer.index(df)

            # Comparison step
            compare_cl = recordlinkage.Compare()

            ## Exact matches
            for column_name in fixed_column_names:
                compare_cl.exact(column_name, column_name, label=column_name)

            ## Matches on variable columns computed with the chosen algorithm (levenshtein / numeric / date)
            for column_name in var_column_names:
                if column_name in numerical_columns:
                    compare_cl.numeric(column_name,
                                       column_name,
                                       method='linear',
                                       offset=10,
                                       scale=10)
                elif column_name in date_columns:
                    compare_cl.date(column_name, column_name)
                elif column_name in string_columns:
                    compare_cl.string(column_name,
                                      column_name,
                                      method=alg,
                                      threshold=0.75,
                                      label=column_name)

            potential_pairs = compare_cl.compute(pairs, df)

            # Classification step
            kmeans = recordlinkage.KMeansClassifier()
            kmeans.learn(potential_pairs)
            matches = kmeans.predict(potential_pairs)

            if len(matches) == 0:
                return False

            # Grouping step
            ## Group matches (A,B), (B,C) into (A,B,C)
            groups = self.group_matches(matches)

            #TODO Create table _dedup_table_groups
            self.create_duplicate_table(schema_id, table_name, groups)

            return True

        except Exception as e:
            app.logger.error(
                "[ERROR] Unable to generate clusters of duplicate rows from table '{}'"
                .format(dedup_table_name))
            app.logger.exception(e)
            raise e
Example #15
    del data["Total_Score"]
    del data["rec_id"]
    del data["rec_id.1"]
    ###calculate known matches, then delete for classification
    del data['rec_num']
    data.to_csv('feature_vectors_clean.csv', sep=",", encoding='utf-8')
    return data


prepData()
####Evaluate vector scoring methodology###########################################################################################
known_matches = knownMatches()
missed_matches = missedMatches()
false_positives = findFalsePos()
pairs = ScoreRecords()
print "number of comparison pairs in index:", len(pairs)
print "number of matching pairs:", known_matches
print "number of missed matches:", missed_matches
print "number of false positives:", false_positives
#supervised#######################################################################################################
#unsupervised methods#############################################################################################
###k-means####################################################
data = prepData()
kmeans = rl.KMeansClassifier()
result_kmeans = kmeans.learn(data)
print('number of predicted pairs using K-means clustering:', len(result_kmeans))
###ECM Maximization###########################################
ecm = rl.ECMClassifier()
result_ecm = ecm.learn((data > 0.8).astype(int))
print('the number of predicted pairs using ECM Maximization:', len(result_ecm))
Example #16
def linkDB(df1, df2, type, classifier):

    # 1 - INDEXING

    indexer = recordlinkage.Index()

    if type == "sortedneighbourhood":
        indexer.sortedneighbourhood(left_on="0_restaurant",
                                    right_on="1_restaurant")
    elif type == "full":
        indexer.full()
    elif type == "block":
        indexer.block(left_on="0_addressGoogle", right_on="1_addressGoogle")

    candidate_links = indexer.index(df1, df2)

    test_pairs = candidate_links[0:100]

    #https://recordlinkage.readthedocs.io/en/latest/annotation.html
    """
	df1.columns = df1.columns.str.replace(r'0_', '')
	df2.columns = df2.columns.str.replace(r'1_', '')
	
	recordlinkage.write_annotation_file(
		"check_matches.json", candidate_links[0:100], df1, df2, dataset_a_name="firstDF", dataset_b_name="secondDF")
	
	df1 = df1.add_prefix('0_')
	df2 = df2.add_prefix('1_')
	"""

    annotations = recordlinkage.read_annotation_file('result.json')

    # 2 - COMPARISON
    comp = recordlinkage.Compare()
    comp.string('0_restaurant',
                '1_restaurant',
                threshold=0.95,
                method='jarowinkler',
                label='ristorante')
    comp.string('0_neighborhood',
                '1_neighborhood',
                method='jarowinkler',
                threshold=0.85,
                label='quartiere')
    comp.exact('0_addressGoogle', '1_addressGoogle', label='indirizzoGoogle')

    features = comp.compute(candidate_links, df1, df2)
    test_features = comp.compute(test_pairs, df1, df2)

    # 3 - CLASSIFICATION
    # https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html#unsupervised

    matches = []
    drop1 = []
    drop2 = []

    if classifier == "ecm":
        ecm = recordlinkage.ECMClassifier(init='jaro',
                                          binarize=None,
                                          max_iter=100,
                                          atol=0.0001,
                                          use_col_names=True)
        ecm.fit_predict(features, match_index=None)  # Train the classifier
        e_matches = ecm.predict(features)
        for i, j in e_matches:
            if i not in drop1:
                drop1.append(i)
            if j not in drop2:
                drop2.append(j)
            record_1 = df1.loc[i]
            record_2 = df2.loc[j]
            record = tuple(record_1) + tuple(record_2)
            matches.append(record)
    elif classifier == "kmeans":
        kmeans = recordlinkage.KMeansClassifier()
        kmeans.fit_predict(features)
        k_matches = kmeans.predict(features)
        for i, j in k_matches:
            if i not in drop1:
                drop1.append(i)
            if j not in drop2:
                drop2.append(j)
            record_1 = df1.loc[i]
            record_2 = df2.loc[j]
            record = tuple(record_1) + tuple(record_2)
            matches.append(record)

    head = tuple(df1.columns) + tuple(df2.columns)
    matches_result = pd.DataFrame(matches)
    matches_result.columns = head

    df1t = df1.drop(drop1, axis=0)
    df2t = df2.drop(drop2, axis=0)
    result = pd.concat([df1t, df2t, matches_result], ignore_index=True)

    # 4 - EVALUATION

    if classifier == "ecm":
        test_matches = ecm.predict(test_features)
        cm = recordlinkage.confusion_matrix(annotations.links,
                                            test_matches,
                                            total=100)
        acc = recordlinkage.accuracy(annotations.links,
                                     test_matches,
                                     total=100)
    elif classifier == "kmeans":
        test_matches = kmeans.fit_predict(test_features)
        cm = recordlinkage.confusion_matrix(annotations.links,
                                            test_matches,
                                            total=100)
        acc = recordlinkage.accuracy(annotations.links,
                                     test_matches,
                                     total=100)

    print(cm, acc)

    return result
Example #17
def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    # window length
    if win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    elif win_len > 0:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)
    else:
        raise ValueError(f"Invalid window length {win_len}")
    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # preprocessing
    if preproc == 0:
        print("No preprocesing")
        field_suffix = ""
        preproc_description = "none"
    elif preproc == 1:
        print("Cleaned fields")
        field_suffix = "_clean"
        preproc_description = "clean"
    elif preproc == 2:
        print("Soundex encoding")
        field_suffix = "_soundex"
        preproc_description = "soundex"
    elif preproc == 3:
        print("Nysiis encoding")
        field_suffix = "_nysiis"
        preproc_description = "nysiis"
    elif preproc == 4:
        print("Metaphone encoding")
        field_suffix = "_metaphone"
        preproc_description = "metaphone"
    elif preproc == 5:
        print("Match-rating encoding")
        field_suffix = "_match_rating"
        preproc_description = "match_rating"
    else:
        raise ValueError(f"Unknown preprocessing variant {preproc}")
    print(f"Preprocessing used: {preproc_description}")

    # comparator
    comp = recordlinkage.Compare()
    if comparison_variant == 0:
        comp_description = "exact"
        comp.add(compare.Exact('title' + field_suffix, 'title' + field_suffix))
        comp.add(
            compare.Exact('authors' + field_suffix, 'authors' + field_suffix))
        comp.add(compare.Exact('venue' + field_suffix, 'venue' + field_suffix))
    elif comparison_variant == 1:
        comp_description = "levenshtein"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='levenshtein'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='levenshtein'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='levenshtein'))
    elif comparison_variant == 2:
        comp_description = "damerau_levenshtein"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='damerau_levenshtein'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='damerau_levenshtein'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='damerau_levenshtein'))
    elif comparison_variant == 3:
        comp_description = "jaro"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='jaro'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='jaro'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='jaro'))
    elif comparison_variant == 4:
        comp_description = "jarowinkler"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='jarowinkler'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='jarowinkler'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='jarowinkler'))
    elif comparison_variant == 5:
        comp_description = "qgram"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='qgram'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='qgram'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='qgram'))
    elif comparison_variant == 6:
        comp_description = "cosine"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='cosine'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='cosine'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='cosine'))
    elif comparison_variant == 7:
        comp_description = "smith_waterman"
        comp.add(
            compare.String('title' + field_suffix,
                           'title' + field_suffix,
                           method='smith_waterman'))
        comp.add(
            compare.String('authors' + field_suffix,
                           'authors' + field_suffix,
                           method='smith_waterman'))
        comp.add(
            compare.String('venue' + field_suffix,
                           'venue' + field_suffix,
                           method='smith_waterman'))
    else:
        raise ValueError(f"Unknown comparison variant {comparison_variant}")
    print(f"String comparison: {comp_description}")

    print("Start compare for training data set")
    start = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train, dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - start))
    print("Start compare for test data set")
    start = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save time compare for evaluation
    time_compare = time.time() - start
    print("Compare on test data took %.2fs" % (time_compare))

    matches = []
    for classifier_description in ['logreg', 'bayes', 'svm', 'kmeans', 'ecm']:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue
        if classifier_description == 'logreg':
            print("Logistic Regression classifier")
            classifier = recordlinkage.LogisticRegressionClassifier()
            supervised = True
        elif classifier_description == 'bayes':
            print("Naive Bayes classifier")
            classifier = recordlinkage.NaiveBayesClassifier(binarize=0.75)
            supervised = True
        elif classifier_description == 'svm':
            print("Support Vector Machine classifier")
            classifier = recordlinkage.SVMClassifier()
            supervised = True
        elif classifier_description == 'kmeans':
            print("KMeans classifier")
            classifier = recordlinkage.KMeansClassifier()
            supervised = False
        elif classifier_description == 'ecm':
            print("ECM classifier")
            classifier = recordlinkage.ECMClassifier(binarize=0.75)
            supervised = False
        else:
            raise ValueError(
                f"Unknown classifier variant {classifier_description}")

        if supervised:
            start = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - start
            start = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - start
        else:
            start = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - start
            time_train = 0
        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match, "-".join((index_description, preproc_description,
                                 comp_description)))

    return matches
Example #18
def kmeans_classifier(features):
    """ Kmeans classifier """
    kmeans = rl.KMeansClassifier()
    matches = kmeans.fit_predict(features)

    return matches
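A minimal call sketch for this helper; the toy `features` frame below is an illustrative assumption, whereas in practice it would come from recordlinkage.Compare().compute() on a candidate-link index:

import pandas as pd
import recordlinkage as rl  # required by kmeans_classifier above

# Toy comparison vectors: 1.0 means the field agreed for the pair, 0.0 means it did not.
features = pd.DataFrame(
    {'name': [1.0, 0.9, 0.1, 0.0], 'address': [1.0, 0.8, 0.0, 0.1]},
    index=pd.MultiIndex.from_tuples([(0, 10), (1, 11), (2, 12), (3, 13)]),
)

matches = kmeans_classifier(features)
print(len(matches))  # number of candidate pairs classified as links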