Python SortedNeighbourhoodIndex Exemples, recordlinkage.SortedNeighbourhoodIndex Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : test_indexing.py Projet : mcmoots/recordlinkage

    def test_single_sorting_key(self):
        """SNI: equivalent spellings of one sorting key yield identical pairs."""

        # Each entry is one way of passing the single key 'var_arange' to the
        # constructor (positional, `on`, `left_on`/`right_on`, as a string or
        # as a one-element list). The first entry is the reference result.
        constructor_calls = [
            (('var_arange',), {}),
            ((), {'on': 'var_arange'}),
            ((), {'left_on': 'var_arange', 'right_on': 'var_arange'}),
            ((), {'on': ['var_arange']}),
            ((), {'left_on': ['var_arange'], 'right_on': ['var_arange']}),
        ]

        results = []
        for args, kwargs in constructor_calls:
            indexer = recordlinkage.SortedNeighbourhoodIndex(*args, **kwargs)
            results.append(indexer.index((self.a, self.b)))

        # every variant must produce the same index as the first one
        reference = results[0]
        for candidate in results[1:]:
            ptm.assert_index_equal(reference, candidate)

Exemple #2

0

Afficher le fichier

Fichier : bench_indexing.py Projet : wfranus/recordlinkage

    def time_sni_index(self):
        """Benchmark sorted-neighbourhood indexing of ``self.A`` on 'given_name'."""

        # setup class
        # The window size keyword is `window`; the previous spelling `w=5`
        # is not a documented constructor argument.
        c_pairs = rl.SortedNeighbourhoodIndex(on='given_name', window=5)

        # Make pairs
        c_pairs.index(self.A)

Exemple #3

0

Afficher le fichier

def ScoreRecords():
    """Score candidate record pairs between ``train`` and ``test``.

    Reads the module-level ``df``, ``train`` and ``test`` objects, rebinds the
    globals ``df`` (last column moved to the front) and ``features``, and
    writes the result to ``feature_vectors.csv``.

    Returns:
        The feature DataFrame with one column per comparison, a
        ``Total_Score`` column (row-wise sum of the comparison columns) and a
        ``target`` column that is 1 where ``Total_Score >= 7`` and 0 otherwise.
    """
    global features
    global df

    # Rotate the columns of df so that the last column comes first.
    cols = df.columns.tolist()
    df = df[cols[-1:] + cols[:-1]]

    ## create pairs
    indexer = rl.SortedNeighbourhoodIndex(on='given_name', window=5)
    pairs = indexer.index(train, test)

    ## methodology for scoring
    compare_cl = rl.Compare()
    compare_cl.exact('postcode', 'postcode', label='postcode')
    # (column, jaro threshold, output label) for each fuzzy comparison,
    # in the order the feature columns should appear.
    string_rules = [
        ('surname', .95, 'surname'),
        ('given_name', .95, 'name'),
        ('date_of_birth', 0.85, 'dob'),
        ('suburb', .85, 'suburb'),
        ('state', .85, 'state'),
        ('address_1', 0.9, 'address_1'),
    ]
    for column, threshold, label in string_rules:
        compare_cl.string(column,
                          column,
                          method='jaro',
                          threshold=threshold,
                          label=label)
    compare_cl.exact('rec_num', 'rec_num', label='rec_num')

    ##compute feature vector
    features = compare_cl.compute(pairs, train, test)
    features["Total_Score"] = features.sum(axis=1)
    # fillna returns a new frame; the result has to be kept, otherwise the
    # call has no effect.
    features = features.fillna(0)

    # Label a pair as a match when at least 7 comparisons agree.
    features["target"] = (features["Total_Score"] >= 7).astype('int64')

    features.to_csv('feature_vectors.csv', sep=",", encoding='utf-8')
    # NOTE(review): the original had a `ScoreRecords()` call after this
    # return; it could never execute and was dropped. If a module-level call
    # was intended it must be added outside the function.
    return (features)

Exemple #4

0

Afficher le fichier

Fichier : test_indexing.py Projet : mcmoots/recordlinkage

    def test_sni_with_blocking_link(self):
        """SNI: linking with the sorting key also used as blocking key."""

        n_left = len(self.a)
        right = self.b[0:n_left]

        # sorted neighbourhood on 'var_arange', additionally blocked on it
        indexer = recordlinkage.SortedNeighbourhoodIndex(
            on='var_arange', window=3, block_on='var_arange')
        candidate_pairs = indexer.index((self.a, right))

        # one pair is expected per record of self.a
        self.assertEqual(len(candidate_pairs), n_left)

Exemple #5

0

Afficher le fichier

Fichier : test_indexing.py Projet : mcmoots/recordlinkage

    def test_sni_with_blocking_dedup(self):
        """SNI: deduplication with the sorting key also used as blocking key."""

        # sorted neighbourhood on 'var_arange', additionally blocked on it
        options = {'on': 'var_arange', 'window': 3, 'block_on': 'var_arange'}
        indexer = recordlinkage.SortedNeighbourhoodIndex(**options)
        candidate_pairs = indexer.index(self.a)

        print(candidate_pairs.values)

        # no pairs are expected
        self.assertEqual(len(candidate_pairs), 0)

Exemple #6

0

Afficher le fichier

Fichier : test_indexing.py Projet : mcmoots/recordlinkage

    def test_sni_algorithm_dedup(self, window):
        """SNI: the number of pairs follows from the window size (dedup)."""

        # `window` is supplied by the parameterized test runner.
        indexer = recordlinkage.SortedNeighbourhoodIndex(on='var_arange',
                                                         window=window)
        found = indexer.index((self.a))

        # Expected count: sum of the descending terms starting at
        # len(self.a) - 1 and stopping before len(self.a) - (half_window + 1).
        half_window = (window - 1) / 2
        n_records = len(self.a)
        terms = np.arange(n_records - 1, n_records - (half_window + 1), -1)
        n_pairs_expected = np.sum(terms)

        # test
        self.assertEqual(len(found), n_pairs_expected)

Exemple #7

0

Afficher le fichier

Fichier : test_indexing.py Projet : mcmoots/recordlinkage

    def test_sni_algorithm_link(self, window):
        """SNI: the number of pairs follows from the window size (link)."""

        # `window` is supplied by the parameterized test runner.
        indexer = recordlinkage.SortedNeighbourhoodIndex(on='var_arange',
                                                         window=window)
        right = self.b[0:len(self.a)]
        found = indexer.index((self.a, right))

        # Expected count: len(self.a) plus twice the sum of the descending
        # terms from len(self.a) - 1 down to (exclusive)
        # len(self.a) - (half_window + 1).
        half_window = (window - 1) / 2
        n_records = len(self.a)
        off_diagonal = np.sum(
            np.arange(n_records - 1, n_records - (half_window + 1), -1))
        n_pairs_expected = len(self.a) + 2 * off_diagonal

        # test
        print('expected number of pairs: %s' % n_pairs_expected)
        print('number of pairs found: %s' % len(found))
        self.assertEqual(len(found), n_pairs_expected)

Exemple #8

0

Afficher le fichier

Fichier : matching1.py Projet : ychen216/Record-Linkage

import recordlinkage as rl
from IOoperation import load_people_record, write_result, load_people_record_test, write_pair, get_result_list
import time
import os

# Stage 1: load the people records and report the wall-clock time it took.
print("Load File Begins...")
load_start_time = time.time()
# dfA = load_people_record_test()
dfA = load_people_record()
load_end_time = time.time()
print("Load File Costs:%f s" % (load_end_time - load_start_time))

# Blocking
# Stage 2: build candidate pairs within dfA using a sorted-neighbourhood
# index on the 'EmplyeeID' column with a window of 3.
# NOTE(review): 'EmplyeeID' looks misspelled; presumably it matches the
# column name produced by load_people_record() - confirm before changing.
print("Blocking Begins...")
pcl = rl.SortedNeighbourhoodIndex(on=['EmplyeeID'], window=3)
pairs = pcl.index(dfA)
blocking_end_time = time.time()
# Elapsed time is measured from the end of loading, so it also includes the
# print and index construction above.
print("Blocking Costs:%f s" % (blocking_end_time - load_end_time))

# Comparison
# Stage 3: declare the per-column comparisons to run on the candidate pairs.

print("Comparison Begins...")
compare_helper = rl.Compare()

# Exact equality on SSN.
compare_helper.exact('SSN', 'SSN', label='SSN')
#compare_helper.string('EmplyeeID', 'EmplyeeID', threshold=0.85, label='EmplyeeID')
# Jaro-Winkler similarity on FNAME with a 0.85 threshold.
compare_helper.string('FNAME',
                      'FNAME',
                      method='jarowinkler',
                      threshold=0.85,
                      label='FNAME')

Exemple #9

0

Afficher le fichier

def ScoreRecords():
    """Score candidate record pairs between two random halves of ``train``.

    Reads the module-level ``df`` and ``train`` objects, rebinds the globals
    ``df`` (last column moved to the front) and ``features``, shows a
    histogram of the total scores and writes ``feature_vectors.csv``.

    Returns:
        The feature DataFrame with one column per comparison, a
        ``Total_Score`` column (row-wise sum of the comparison columns) and a
        ``y`` column that is 1 where ``Total_Score >= 6`` and 0 otherwise.
    """
    global features
    global df

    # Rotate the columns of df so that the last column comes first.
    column_order = df.columns.tolist()
    df = df[column_order[-1:] + column_order[:-1]]

    ##create pairs
    # Both names below are halves of `train` (train_test_split output order).
    x_train, y_train = train_test_split(train, test_size=0.5)
    indexer = rl.SortedNeighbourhoodIndex(on='given_name', window=3)
    pairs = indexer.index(x_train, y_train)

    # (column, jaro threshold, output label), registered in this order.
    # NOTE(review): 'postcode' is registered twice with the same label, so it
    # appears as two columns and counts twice in Total_Score - confirm this
    # is intended.
    string_rules = (
        ('postcode', 0.85, 'postcode'),
        ('surname', .95, 'surname'),
        ('given_name', .95, 'name'),
        ('date_of_birth', 0.85, 'dob'),
        ('postcode', 0.85, 'postcode'),
        ('suburb', .85, 'suburb'),
        ('state', .85, 'state'),
        ('address_1', 0.9, 'address_1'),
    )
    compare_cl = rl.Compare()
    for column, threshold, label in string_rules:
        compare_cl.string(column,
                          column,
                          method='jaro',
                          threshold=threshold,
                          label=label)
    compare_cl.exact('rec_num', 'rec_num', label='rec_num')

    ##compute feature vector
    features = compare_cl.compute(pairs, x_train, y_train)
    features["Total_Score"] = features.sum(axis=1)

    ##look at distribution of scores across record-pairs
    plt.hist(features['Total_Score'], bins=8)
    plt.title(r'Distribution of feature score totals for record pairs')
    plt.show()

    # Label a pair 1 when its total score reaches 6, else 0.
    features["y"] = (features["Total_Score"] >= 6).astype('int64')

    features.to_csv('feature_vectors.csv', sep=",", encoding='utf-8')
    return (features)

Exemple #10

0

Afficher le fichier

class TestIndexApi(TestData):
    """General unittest for the indexing API."""

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_repr(self, index_class):
        """str() and repr() agree and start with '<' plus the class name."""

        index_str = str(index_class)
        index_repr = repr(index_class)
        self.assertEqual(index_str, index_repr)

        start_str = '<{}'.format(index_class.__class__.__name__)
        self.assertTrue(index_str.startswith(start_str))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_arguments(self, index_class):
        """Test the index method arguments"""

        # The following should work
        index_class.index(self.a)
        index_class.index(self.a, self.b)
        index_class.index((self.a))
        index_class.index([self.a])
        index_class.index((self.a, self.b))
        index_class.index([self.a, self.b])
        index_class.index(x=(self.a, self.b))

    def test_iterative(self):
        """Test the iterative behaviour."""

        # SINGLE STEP
        index_class = recordlinkage.FullIndex()
        pairs = index_class.index((self.a, self.b))
        pairs = pd.DataFrame(index=pairs).sort_index()

        # MULTI STEP
        index_class = recordlinkage.FullIndex()

        pairs1 = index_class.index((self.a[0:50], self.b))
        pairs2 = index_class.index((self.a[50:100], self.b))

        pairs_split = pairs1.append(pairs2)
        pairs_split = pd.DataFrame(index=pairs_split).sort_index()

        # The indexes are wrapped in frames so they can be sorted before
        # being compared.
        ptm.assert_frame_equal(pairs, pairs_split)

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_empty_imput_dataframes(self, index_class):
        """Empty DataFrames"""

        # make an empty dataframe with the columns of self.a and self.b
        df_a = pd.DataFrame(columns=self.a.columns.tolist())
        df_b = pd.DataFrame(columns=self.b.columns.tolist())

        if not isinstance(index_class, recordlinkage.RandomIndex):
            # make an index
            pairs = index_class.index((df_a, df_b))

            # check if the MultiIndex has length 0
            self.assertIsInstance(pairs, pd.MultiIndex)
            self.assertEqual(len(pairs), 0)
        else:
            # RandomIndex is expected to reject empty input
            with self.assertRaises(ValueError):
                index_class.index((df_a, df_b))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_error_handling(self, index_class):
        """Test error handling on non-unique index."""

        # make a non_unique index: the second label is renamed to the first
        df_a = self.a.rename(
            index={self.a.index[1]: self.a.index[0]}, inplace=False)

        with self.assertRaises(ValueError):
            index_class.index(df_a)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_dedup(self, index_class):
        """Level names of dedup pairs are the input index name plus suffix."""

        # input index names and, position by position, the expected names
        # of the two levels of the resulting pairs
        index_names = ['dedup', None, 'index', int(1)]
        expected = [
            ['dedup_1', 'dedup_2'],
            [None, None],
            ['index_1', 'index_2'],
            ['1_1', '1_2'],
        ]

        for i, name in enumerate(index_names):

            index_A = pd.Index(self.a.index).rename(name)
            df_A = pd.DataFrame(self.a, index=index_A)

            pairs = index_class.index((df_A))

            self.assertEqual(pairs.names, expected[i])
            # the input frame must keep its original index name
            self.assertEqual(df_A.index.name, name)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_dedup(self, index_class):
        """Dedup level names use the default and then custom suffixes."""

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        # make the index
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

        # make the index
        # NOTE: this changes the parametrized instance and is not undone.
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_link(self, index_class):
        """Level names of link pairs equal the two input index names."""

        # tuples with the name of the first and second index
        index_names = [
            ('index1', 'index2'),
            ('index1', None),
            (None, 'index2'),
            (None, None),
            (10, 'index2'),
            (10, 11)
        ]

        for name_a, name_b in index_names:

            # make an index for each dataframe with a new index name
            index_a = pd.Index(self.a.index, name=name_a)
            df_a = pd.DataFrame(self.a, index=index_a)

            index_b = pd.Index(self.b.index, name=name_b)
            df_b = pd.DataFrame(self.b, index=index_b)

            pairs = index_class.index((df_a, df_b))
            self.assertEqual(pairs.names, [name_a, name_b])

            # check for inplace editing (not the intention)
            self.assertEqual(df_a.index.name, name_a)
            self.assertEqual(df_b.index.name, name_b)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_link(self, index_class):
        """Link level names get suffixes when both inputs share a name."""

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

        # make the index
        # NOTE: this changes the parametrized instance and is not undone.
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_pickle(self, index_class):
        """Test if it is possible to pickle the class."""

        pickle_path = os.path.join(self.test_dir, 'pickle_compare_obj.pickle')

        # pickle before indexing
        # (context manager so the file handle is closed deterministically)
        with open(pickle_path, 'wb') as handle:
            pickle.dump(index_class, handle)

        # compute the record pairs
        index_class.index(self.a, self.b)

        # pickle after indexing
        with open(pickle_path, 'wb') as handle:
            pickle.dump(index_class, handle)

Exemple #11

0

Afficher le fichier

		name = name.strip()
		auth.append(name)




# Report every non-empty investigator name that also occurs in `auth`.
for Investigator in investigators:
	if Investigator in auth and Investigator != "":
		print("Investigator " , Investigator , " present in Author List")
		
		
		



# Build candidate pairs between the two inputs, sorted on the "Title" column.
# NOTE(review): `pd` is passed as a data set here, so it presumably is a
# DataFrame rather than the pandas module alias - confirm against the code
# that defines it.
indexer = recordlinkage.SortedNeighbourhoodIndex(on="Title")
pairs = indexer.index(pd,ct)











# Declare a Levenshtein string comparison on the "Title" column.
compare_cl = recordlinkage.Compare()

compare_cl.string('Title', 'Title', method='levenshtein',label="Levenshtein - Method")

Exemple #12

0

Afficher le fichier

Fichier : test_indexing.py Projet : mcmoots/recordlinkage

import unittest
import tempfile
import shutil
import pickle

import numpy as np
import pandas as pd
import pandas.util.testing as ptm
from parameterized import parameterized, param

import recordlinkage

# One parameter set per index algorithm instance; intended for use with
# parameterized.expand.
TEST_INDEXATION_OBJECTS = [
    param(indexer) for indexer in (
        recordlinkage.FullIndex(),
        recordlinkage.BlockIndex(on='var_arange'),
        recordlinkage.SortedNeighbourhoodIndex(on='var_arange'),
        recordlinkage.RandomIndex(10, random_state=100, replace=True),
        recordlinkage.RandomIndex(10, random_state=100, replace=False),
    )
]


class TestData(unittest.TestCase):
    """Unittest object to setup test data."""
    @classmethod
    def setUpClass(cls):

        n_a = 100
        n_b = 150

        cls.index_a = ['rec_a_%s' % i for i in range(0, n_a)]
        cls.index_b = ['rec_b_%s' % i for i in range(0, n_b)]

Exemple #13

0

Afficher le fichier

Fichier : make_indices.py Projet : joelbecker/netlab-blog-contributions

import pandas as pd
import recordlinkage as rl
import labutils as lu

def _names_frame(values):
    """Return a one-column DataFrame whose column 'names' holds *values*."""
    return pd.DataFrame(pd.Series(values, name='names'))


# Toy data sets
names_1 = ['alfred', 'bob', 'calvin', 'hobbes', 'rusty']
names_2 = ['alfred', 'danny', 'callum', 'hobie', 'rusty']
df_a = _names_frame(names_1)
df_b = _names_frame(names_2)

df_c = _names_frame(['alexander', 'bob', 'bruce', 'bruce', 'alexander'])
df_d = _names_frame(['alice', 'beth', 'amy', 'beth', 'brittany'])

# Candidate links between df_c and df_d: sorted neighbourhood on 'names'
# with a window of 3.
indexer = rl.SortedNeighbourhoodIndex(on='names', window=3)
candidate_links = indexer.index(df_c, df_d)

lu.clip_df(candidate_links.to_frame())

Exemple #14

0

Afficher le fichier

    fieldnames.append(cfg.name)

# Load both input files restricted to `fieldnames`, plus the index of known
# perfect matches, using the paths given in the common configuration.
dfFile1 = tools.load_file_as_df(config.common.filename_1, fieldnames)
dfFile2 = tools.load_file_as_df(config.common.filename_2, fieldnames)
perfect_match_index = tools.load_perfect_match_as_index(
    config.common.filename_perfect_match)

# for each config item
for index, config_item in enumerate(config.items):
    # init Random with a fixed seed (for reproducibility)
    tools.init_random_with_seed()

    # Pick the indexer named by config_item.index_type; every variant is
    # keyed on config_item.index_field_name.
    print("Indexing")
    if config_item.index_type == "sorted_neighbourhood":
        indexer = rl.SortedNeighbourhoodIndex(
            config_item.index_field_name,
            window=config_item.sorted_neighborhood_window)
    elif config_item.index_type == "block":
        indexer = rl.BlockIndex(config_item.index_field_name)
    elif config_item.index_type == "canopy":
        indexer = tools.CanopyClusterIndex(
            config_item.index_field_name,
            threshold_add=config_item.canopy_threshold_add,
            threshold_remove=config_item.canopy_threshold_remove)
    elif config_item.index_type == "full":
        # NOTE(review): this uses tools.FullIndex (not rl.FullIndex) and
        # passes a field name - confirm that the project-local class
        # expects that argument.
        indexer = tools.FullIndex(config_item.index_field_name)
    else:
        # Unknown index types are rejected rather than silently skipped.
        raise ValueError(
            "index_type {0} is invalid: must be sorted_neighbourhood, block, canopy or full"
            .format(config_item.index_type))

Exemple #15

0

Afficher le fichier

import sys
import csv
import numpy as np
from matplotlib import pyplot

#reload(sys)
#sys.setdefaultencoding('utf8')

#pd , ct = pandas.DataFrame.from_csv("ArticleData.csv") , pandas.DataFrame.from_csv("ClinicalTrialsData.csv")

# Load the two data sets to link.
# NOTE(review): the name `pd` holds the article DataFrame here, not the
# usual pandas module alias - keep that in mind when reading code below.
pd = pandas.read_csv("ArticleData.csv")
ct = pandas.read_csv("ClinicalTrialsData.csv")

#print pd

# Candidate pairs between articles and clinical trials, sorted on the
# "DateRevised" column.
indexer = recordlinkage.SortedNeighbourhoodIndex(on="DateRevised")
#indexer = recordlinkage.FullIndex()
pairs = indexer.index(pd, ct)

#print (len(ct),len(pd),len(pairs))

# Declare two string comparisons of the "Title" column, one per similarity
# method; the labels name the resulting feature columns.
compare_cl = recordlinkage.Compare()

compare_cl.string('Title',
                  'Title',
                  method='levenshtein',
                  label="Levenshtein - Method")
compare_cl.string('Title',
                  'Title',
                  method='jarowinkler',
                  label="Jarowinkler - Method")

Exemple #16

0

Afficher le fichier

Fichier : models.py Projet : sergiofenoll/project-databases

    def collect_identical_rows_alg(self, schema_id, table_name, sorting_key,
                                   fixed_column_names, var_column_names, alg):
        """Find clusters of probable duplicate rows in a table and store them.

        The table is read into a DataFrame indexed by its ``id`` column,
        candidate pairs are built with a sorted-neighbourhood index on
        ``sorting_key`` (window 3), compared column by column, classified
        with a K-means classifier and grouped; the groups are handed to
        ``self.create_duplicate_table``.

        Args:
            schema_id: Identifier used to build the schema name
                ``'schema-<schema_id>'``.
            table_name: Name of the table to deduplicate.
            sorting_key: Column used to sort records for candidate pairing;
                it is always compared exactly as well.
            fixed_column_names: Columns that must match exactly. The
                caller's sequence is not modified.
            var_column_names: Columns compared by similarity: numeric
                columns linearly, date columns by date comparison, string
                columns with the string method ``alg``.
            alg: String similarity method passed to the string comparison.

        Returns:
            ``False`` when the classifier predicts no matches, ``True`` after
            the duplicate table has been created.

        Raises:
            Exception: Any error is logged and re-raised unchanged.
        """

        schema_name = 'schema-' + str(schema_id)
        dedup_table_name = '_dedup_' + table_name + "_grouped"

        # TODO When user selects rows to remove, collect in table.
        # Afterwards when finished selecting rows of all clusters, delete those rows (UNDO)

        try:

            # Remove complete duplicates before full dedup
            self.remove_identical_rows(
                schema_id,
                table_name,
            )

            # SELECT id, 'column' FROM "schema_name"."table";
            # NOTE(review): identifiers are interpolated through _ci(),
            # presumably an identifier-quoting helper - confirm it escapes
            # untrusted names.
            data_query = 'SELECT * FROM {}.{}'.format(
                *_ci(schema_name, table_name))
            df = pd.read_sql(data_query, con=db.engine)
            df = df.set_index('id')

            # Clean dataset

            ## Remove leading whitespaces
            #df.columns = df.columns.to_series().apply(lambda x: x.strip())

            # Work on a copy so that adding the sorting key does not mutate
            # the list owned by the caller.
            exact_column_names = list(fixed_column_names)
            if sorting_key not in exact_column_names:
                exact_column_names.append(sorting_key)

            # Classify the columns by dtype to choose a comparison method.
            string_columns = list(df.select_dtypes(include=['object']).columns)
            numerical_columns = list(
                df.select_dtypes(include=['int64']).columns)
            numerical_columns.extend(
                list(df.select_dtypes(include=['float64']).columns))
            date_columns = list(
                df.select_dtypes(include=['datetime64[ns]']).columns)

            ## Clean string values
            for column_name in string_columns:
                df[column_name] = clean(df[column_name])

            # Indexation step
            indexer = recordlinkage.SortedNeighbourhoodIndex(on=sorting_key,
                                                             window=3)
            pairs = indexer.index(df)

            # Comparison step
            compare_cl = recordlinkage.Compare()

            ## Exact matches
            for column_name in exact_column_names:
                compare_cl.exact(column_name, column_name, label=column_name)

            ## Variable matches calculated using an alg (levenshtein / numerical / date)
            # Columns whose dtype falls in none of the three groups are
            # not compared.
            for column_name in var_column_names:
                if column_name in numerical_columns:
                    compare_cl.numeric(column_name,
                                       column_name,
                                       method='linear',
                                       offset=10,
                                       scale=10)
                elif column_name in date_columns:
                    compare_cl.date(column_name, column_name)
                elif column_name in string_columns:
                    compare_cl.string(column_name,
                                      column_name,
                                      method=alg,
                                      threshold=0.75,
                                      label=column_name)

            potential_pairs = compare_cl.compute(pairs, df)

            # Classification step
            kmeans = recordlinkage.KMeansClassifier()
            kmeans.learn(potential_pairs)
            matches = kmeans.predict(potential_pairs)

            if len(matches) == 0:
                return False

            # Grouping step
            ## Group matches (A,B), (B,C) into (A,B,C)
            groups = self.group_matches(matches)

            #TODO Create table _dedup_table_groups
            self.create_duplicate_table(schema_id, table_name, groups)

            return True

        except Exception as e:
            app.logger.error(
                "[ERROR] Unable to generate clusters of duplicate rows from table '{}'"
                .format(dedup_table_name))
            app.logger.exception(e)
            # bare raise re-raises the active exception with its traceback
            raise

Exemple #17

0

Afficher le fichier

Fichier : dblp_scholar.py Projet : helmuthb/SPEML-2020

def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    """Run one indexing / preprocessing / comparison configuration.

    Uses the module-level data sets ``dataDBLP_train``, ``dataScholar_train``,
    ``dataDBLP_test``, ``dataScholar_test``, the labels ``links_train`` and
    the ``debug`` flag.

    Args:
        win_len: 0 selects a block index on 'year'; a positive value selects
            a sorted-neighbourhood index on 'year' with that window.
        preproc: Code 0-5 selecting which field variant to compare (plain,
            clean, soundex, nysiis, metaphone, match_rating).
        comparison_variant: 0 for exact comparison, 1-7 for one of the string
            similarity methods.
        run_only: When not None, only the classifier with this short name
            ('logreg', 'bayes', 'svm', 'kmeans', 'ecm') is run.

    Returns:
        A list with one tuple per classifier that was run:
        ``(index_description, preproc_description, comp_description,
        classifier_description, match, 1000 * time_compare,
        1000 * time_train, 1000 * time_classify)``.

    Raises:
        ValueError: For a negative window length or an unknown preprocessing
            or comparison code.
    """
    # ---- candidate generation -------------------------------------------
    if win_len > 0:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)
    elif win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    else:
        raise ValueError(f"Invalid window length {win_len}")
    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # ---- preprocessing ---------------------------------------------------
    # Position in this tuple is the preprocessing code; each entry is
    # (announcement, column-name suffix, short description).
    preproc_variants = (
        ("No preprocesing", "", "none"),
        ("Cleaned fields", "_clean", "clean"),
        ("Soundex encoding", "_soundex", "soundex"),
        ("Nysiis encoding", "_nysiis", "nysiis"),
        ("Metaphone encoding", "_metaphone", "metaphone"),
        ("Match-rating encoding", "_match_rating", "match_rating"),
    )
    for code, variant in enumerate(preproc_variants):
        if preproc == code:
            announcement, field_suffix, preproc_description = variant
            print(announcement)
            break
    else:
        raise ValueError(f"Unknown preprocessing variant {preproc}")
    print(f"Preprocessing used: {preproc_description}")

    # ---- comparator ------------------------------------------------------
    # Codes 1..7 map, in order, onto these string similarity methods; the
    # method name doubles as the description.
    string_methods = ('levenshtein', 'damerau_levenshtein', 'jaro',
                      'jarowinkler', 'qgram', 'cosine', 'smith_waterman')
    base_fields = ('title', 'authors', 'venue')

    comp = recordlinkage.Compare()
    if comparison_variant == 0:
        comp_description = "exact"
        for base in base_fields:
            comp.add(compare.Exact(base + field_suffix, base + field_suffix))
    else:
        for code, method in enumerate(string_methods, start=1):
            if comparison_variant == code:
                comp_description = method
                for base in base_fields:
                    comp.add(
                        compare.String(base + field_suffix,
                                       base + field_suffix,
                                       method=method))
                break
        else:
            raise ValueError(
                f"Unknown comparison variant {comparison_variant}")
    print(f"String comparison: {comp_description}")

    # ---- feature computation ---------------------------------------------
    print("Start compare for training data set")
    tick = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train, dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - tick))
    print("Start compare for test data set")
    tick = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save time compare for evaluation
    time_compare = time.time() - tick
    print("Compare on test data took %.2fs" % (time_compare))

    # ---- classification --------------------------------------------------
    # (short name, announcement, factory, needs labelled training data);
    # factories are called lazily so skipped classifiers are never built.
    classifier_specs = (
        ('logreg', "Logistic Regression classifier",
         lambda: recordlinkage.LogisticRegressionClassifier(), True),
        ('bayes', "Naive Bayes classifier",
         lambda: recordlinkage.NaiveBayesClassifier(binarize=0.75), True),
        ('svm', "Support Vector Machine classifier",
         lambda: recordlinkage.SVMClassifier(), True),
        ('kmeans', "KMeans classifier",
         lambda: recordlinkage.KMeansClassifier(), False),
        ('ecm', "ECM classifier",
         lambda: recordlinkage.ECMClassifier(binarize=0.75), False),
    )

    matches = []
    for classifier_description, announcement, build, supervised in \
            classifier_specs:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue
        print(announcement)
        classifier = build()

        if supervised:
            # train on the labelled training pairs, then time prediction
            tick = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - tick
            tick = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - tick
        else:
            # unsupervised: fit and predict on the test pairs in one step
            tick = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - tick
            time_train = 0
        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match, "-".join((index_description, preproc_description,
                                 comp_description)))

    return matches