Code Example #1
File: Linkage.py  Project: Ewen2015/GossipCat
    def __init__(self, data, new_data, name, n_sample=50000, bin_size=500, block=None, method='jarowinkler', threshold=0.93):
        super(Linkage, self).__init__()
        self.data = data
        self.new_data = pd.DataFrame(new_data)
        self.n_sample = n_sample
        self.bin_size = bin_size
        self.name = name
        self.block = block
        self.method = method
        self.threshold = threshold
        
        self.sample = data.sample(n=self.n_sample, replace=False, random_state=0)

        if self.block is not None:
            self.indexer = recordlinkage.BlockIndex(on=self.block)
        else:
            self.indexer = recordlinkage.FullIndex()
        self.compare_cl = recordlinkage.Compare(n_jobs=4)
        self.compare_cl.string(self.name, self.name, method=self.method, threshold=self.threshold, label=self.name)

        if self.new_data.empty:
            # np.split needs an integer number of sections; round(x, 0) returns a float
            self.List = list(itertools.combinations(np.split(self.sample, indices_or_sections=self.n_sample // self.bin_size), 2))
        else:
            self.List = list(np.split(self.sample, indices_or_sections=self.n_sample // self.bin_size))
        self.results_df = pd.DataFrame(columns=['pairs', 'company_1', 'company_2'])
        self.results_tmp = None
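A minimal construction sketch (toy data and parameter values are assumed here, and pandas/numpy/itertools/recordlinkage are imported as in the original module):

import pandas as pd

companies = pd.DataFrame({'company': ['acme co', 'acme corp', 'globex', 'initech']})

# an empty new_data selects the dedup branch: the sample is binned and
# compared against itself
linker = Linkage(data=companies, new_data=pd.DataFrame(), name='company',
                 n_sample=4, bin_size=2)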
Code Example #2
    def time_full_index(self):

        # setup class
        c_pairs = rl.FullIndex()

        # Make pairs
        c_pairs.index(self.A)
Code Example #3
    def test_iterative(self):
        """Test the iterative behaviour."""

        # SINGLE STEP
        index_class = recordlinkage.FullIndex()
        pairs = index_class.index((self.a, self.b))
        pairs = pd.DataFrame(index=pairs).sort_index()

        # MULTI STEP
        index_class = recordlinkage.FullIndex()

        pairs1 = index_class.index((self.a[0:50], self.b))
        pairs2 = index_class.index((self.a[50:100], self.b))

        pairs_split = pairs1.append(pairs2)
        pairs_split = pd.DataFrame(index=pairs_split).sort_index()

        ptm.assert_frame_equal(pairs, pairs_split)
Code Example #4
    def test_basic_link(self):
        """FULL: Test basic characteristics of full indexing (link)."""

        # pair every record of self.a with every record of self.b (link, not dedup)
        index_cl = recordlinkage.FullIndex()
        pairs = index_cl.index((self.a, self.b))

        self.assertIsInstance(pairs, pd.MultiIndex)
        self.assertEqual(len(pairs), len(self.a) * len(self.b))
        self.assertTrue(pairs.is_unique)
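For reference, the counts the assertions rely on, as a self-contained sketch (toy frames assumed): linking two frames yields len(a) * len(b) pairs, while a full dedup index over one frame yields n*(n-1)/2 pairs.

import pandas as pd
import recordlinkage

df_a = pd.DataFrame({'name': ['ann', 'bob', 'carl']})
df_b = pd.DataFrame({'name': ['anne', 'bobby']})

pairs_link = recordlinkage.FullIndex().index((df_a, df_b))
assert len(pairs_link) == len(df_a) * len(df_b)  # 3 * 2 = 6

pairs_dedup = recordlinkage.FullIndex().index(df_a)
assert len(pairs_dedup) == 3 * 2 // 2  # n * (n - 1) / 2 = 3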
Code Example #5
    def setup(self):

        # load data
        self.A, self.B = load_febrl4()

        # cast postcode to numeric for numeric comparisons
        self.A['postcode'] = self.A['postcode'].astype(float)
        self.B['postcode'] = self.B['postcode'].astype(float)

        # make pairs
        c_pairs = rl.FullIndex()
        self.pairs = c_pairs.index(self.A, self.B)[0:50000]  # slice bounds must be integers, not 5e4
Code Example #6
    def setup(self):

        # load data
        self.A = load_febrl1()

        # make pairs
        c_pairs = rl.FullIndex()
        pairs = c_pairs.index(self.A)

        # different sizes of pairs
        self.pairs_xsmall = pairs[0:5000]      # slice bounds must be integers
        self.pairs_small = pairs[0:50000]
        self.pairs_medium = pairs[0:500000]
        self.pairs_large = pairs[0:5000000]
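        # note: FEBRL1 ships 1,000 records, so the full dedup index holds
        # 1000 * 999 / 2 = 499,500 pairs; the two largest slices above are
        # therefore capped at the full set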
Code Example #7
File: import_data.py  Project: Sun-Kev/MACS30200proj
def record_link_schools():
    """
	This function performs record linkage on two dataframes: the critical
	mass dataframe and the retention rates dataframe. The record linkage
	is condicted on the name of the school. 

	Input: None
	Output:
		- link: a tuple containing the indices of retention dataframe
                and the critical mass dataframe; AND the best matches qgram scores
	"""
    critical_mass_df = calculate_critical_mass_var()
    retention_df = import_cleaned_retention(RETENTION_CLEAN)

    # set thresholds for comparing strings using qgram method
    school_name_thresh = 0.85

    # initialize a Record Linkage comparison object
    compare = rl.Compare()
    indexer = rl.FullIndex()  # No blocking available
    compare.string('school',
                   'school',
                   method='qgram',
                   threshold=school_name_thresh,
                   label='school_name_score')

    # make pairs
    pairs = indexer.index(retention_df, critical_mass_df)

    # compute record linkage scores
    features = compare.compute(pairs, retention_df, critical_mass_df)

    # set classification threshold
    school_name_classif_thresh = 1.0

    # Classification & Final Filtering
    best_matches = features[(features['school_name_score'] >=
                             school_name_classif_thresh)]

    # obtain the index values from best_matches
    index_array = best_matches.index.values

    # create tuple of indices and best matches df
    link = (index_array, best_matches)

    return link
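Note that compare.string with a threshold= argument returns a binary 0/1 score rather than the raw q-gram similarity, which is why the classification threshold above is exactly 1.0. A small illustration with made-up school names:

import pandas as pd
import recordlinkage as rl

a = pd.DataFrame({'school': ['north high', 'south high']})
b = pd.DataFrame({'school': ['north high school', 'west academy']})

comp = rl.Compare()
comp.string('school', 'school', method='qgram', threshold=0.85,
            label='school_name_score')
pairs = rl.FullIndex().index(a, b)
print(comp.compute(pairs, a, b))  # 1.0 where the similarity clears 0.85, else 0.0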
Code Example #8
    def match(self, df: pd.DataFrame, key='all'):  # returns (matches, metadata), not a single DataFrame
        
        metadata = {
            'size': len(df)
        }
        logger.debug('Indexing the data for matching!')
        indexer = rl.FullIndex()
        pairs = indexer.index(df)
        metadata['n_pairs'] = len(pairs)
        logger.debug(f"Number of pairs: {metadata['n_pairs']}")

        logger.debug(f"Initializing contrasting")
        contraster_obj = contraster.Contraster(self.contrast_rules)
        contrasts = contraster_obj.run(pairs, df)
        metadata['contraster_metadata'] = contraster_obj.metadata
        logger.debug(f"Contrasts created")

        contrasts.index.rename(['matcher_index_left', 'matcher_index_right'], inplace=True)
        contrasts = rules.compactify(contrasts, operation='mean')
        logger.debug('Summary distances generated. Making you some stats about them.')
        metadata['scores'] = utils.summarize_column(contrasts.matches)
        logger.debug('Caching those contrasts and distances for you.')
        ioutils.write_dataframe(contrasts.reset_index(), filepath=f'{self.base_data_directory}/match_cache/contrasts/{self.match_job_id}/{key}')

        logger.debug(f"Contrasts dataframe size: {contrasts.shape}")
        logger.debug(f"Contrasts data without duplicated indexes: {contrasts[~contrasts.index.duplicated(keep='first')].shape}")
        logger.debug("Duplicated keys:")
        logger.debug(f"{contrasts[contrasts.index.duplicated(keep=False)]}")

        matches = cluster.generate_matched_ids(
            distances=contrasts,
            DF=df,
            clustering_params=self.clustering_rules,
            base_data_directory=self.base_data_directory, # at some point, we may want to consider making the matcher into a class
            match_job_id=self.match_job_id,       # rather than passing around keys, match_job_ids, base_data_directorys, etc.
            block_name=str(key)
        )

        return matches, metadata
Code Example #9
import os
import unittest
import tempfile
import shutil
import pickle

import numpy as np
import pandas as pd
import pandas.util.testing as ptm
from parameterized import parameterized, param

import recordlinkage

TEST_INDEXATION_OBJECTS = [
    param(recordlinkage.FullIndex()),
    param(recordlinkage.BlockIndex(on='var_arange')),
    param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
    param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
    param(recordlinkage.RandomIndex(10, random_state=100, replace=False)),
]


class TestData(unittest.TestCase):
    """Unittest object to setup test data."""
    @classmethod
    def setUpClass(cls):

        n_a = 100
        n_b = 150
Code Example #10
assert (len(config.common.fields) == 1), "Only one Field is allowed for fields"
fieldname = config.common.fields[0].name

tools.ensure_directories(config.common.result_base_dir + "dummy")

# init random with a fixed seed (for reproducibility)
tools.init_random_with_seed()

# load files
print("Loading files")
df_1 = tools.load_file_as_df(config.common.filename_1, [fieldname])
df_2 = tools.load_file_as_df(config.common.filename_2, [fieldname])
idx_match = tools.load_perfect_match_as_index(config.common.filename_perfect_match)

# build a full index without the matches
idx_full = rl.FullIndex().index(df_1, df_2)
idx_distinct = sample_index(idx_full.difference(idx_match), len(idx_match) * 10)

# run compare
print("Compare matches")
df_match = run_compare(fieldname, df_1, df_2, idx_match)

print("Compare distincts")
df_distinct = run_compare(fieldname, df_1, df_2, idx_distinct)

# save result
save_result(df_match, 'cm_matches.csv')
save_result(df_distinct, 'cm_distinct.csv')
save_binned_result(df_match, df_distinct, 25, 'cm_bin_{0}.csv')

print('Time elapsed (hh:mm:ss.ms) {}'.format(datetime.now() - start_time))
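sample_index is a project helper that is not shown in this excerpt; a hypothetical stand-in with the assumed signature could look like:

import numpy as np

def sample_index(idx, n, seed=0):
    # hypothetical: draw n entries from a (Multi)Index without replacement
    rng = np.random.RandomState(seed)
    take = rng.choice(len(idx), size=min(n, len(idx)), replace=False)
    return idx[take]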
Code Example #11
File: matching.py  Project: tamos/nice_things
def link_datasets(yelp_results, dj_df, df_type="wages"):
    """
    (Assisted by the Record Linkage Toolkit library and documentation)
    This function compares the Yelp query results to database results and
    produces the best matches based on the qgram score. Depending on the
    specific database table, the qgram calculation uses the zip code,
    business name, address strings, latitude, longitude, or a combination
    of those characteristics.

    Inputs:
        - yelp_results: a pandas dataframe of yelp business results based 
                        on a user's input
        - dj_df: a pandas dataframe of django results.
                 Ex. labour statistics, healthcode violations, Divvy, etc.
        - df_type: a string of which specific dataframe is being passed to
                   be compared to the Yelp results

    Outputs:
        - link: a tuple containing the matched index pairs of the Yelp
                results dataframe and the database dataframe, and the
                best-matches dataframe with qgram scores
    """
    # set thresholds for comparing strings using qgram method
    name_thresh = 0.55
    addr_thresh = 0.55
    strong_addr_thresh = 0.90

    # initialize a Record Linkage comparison object
    compare = rl.Compare()
    
    # Labour & Food data comparisons to Yelp are made on zip, business name,
    # and address
    if df_type == "wages" or df_type == "food": 
        indexer = rl.BlockIndex(on='zip_code')  # block on zip code
        compare.numeric('zip_code', 'zip_code', method='linear',
                        scale=30.0, label='zip_score')
        compare.string('name', 'name', method='qgram',
                       threshold=name_thresh, label='name_score')
        compare.string('addr', 'addr', method='qgram',
                       threshold=addr_thresh, label='addr_score')
    
    # Environmental data comparisons to Yelp are made on address
    elif df_type == "enviro":
        indexer = rl.FullIndex() # no blocking available
        compare.string('addr', 'addr', method='qgram',
                       threshold=strong_addr_thresh, label='addr_score')
    
    # all other data comparisons to Yelp 
    else:
        indexer = rl.FullIndex()
    pairs = indexer.index(yelp_results, dj_df)

    # In addition to above comparisons, ALL data sets are also compared to
    # Yelp based on latitude and longitude
    compare.geo('latitude', 'longitude', 'latitude', 'longitude',
                method='linear', scale=30.0, label='coord_score')

    # compute record linkage scores
    features = compare.compute(pairs, yelp_results, dj_df)

    # set classification thresholds
    zip_classif_thresh = 1.0
    addr_classif_thresh = 1.0
    coord_classif_thresh = 0.99
    name_classif_thresh = 1.0

    # Classification and final filtering
    if df_type == "wages" or df_type == "food": 
        best_matches = features[(features['zip_score'] == zip_classif_thresh) &
                                (features['name_score'] == name_classif_thresh) &
                                (features['addr_score'] == addr_classif_thresh) &
                                (features['coord_score'] >= coord_classif_thresh)]
    elif df_type == "enviro":
        best_matches = features[(features['addr_score'] == addr_classif_thresh) &
                                (features['coord_score'] >= coord_classif_thresh)]
    else:
        best_matches = features[(features['coord_score'] >= coord_classif_thresh)]
    
    # obtain the index values from best_matches
    index_array = best_matches.index.values

    # create tuple of indices and best matches df
    link = (index_array, best_matches)
    
    return link
Code Example #12
import recordlinkage
import pandas as pd

path = "C:/Users/DELL/Desktop/Data-de-duplication/Deduplication Problem - Sample Dataset.csv"
df = pd.read_csv(path)

print(df)

indexer = recordlinkage.FullIndex()
pairs = indexer.index(df)

print(len(df), len(pairs))
print(pairs)

indexer = recordlinkage.BlockIndex(on=['dob', 'gn'])
pairs = indexer.index(df)

print(len(pairs))
# the earlier full index on 1000 records gives (1000*1000-1000)/2 = 499500 pairs;
# blocking on dob and gn leaves far fewer candidate pairs

# This cell can take some time to compute.
compare_cl = recordlinkage.Compare()

compare_cl.string('ln', 'ln', label='last_name')
compare_cl.string('fn',
                  'fn',
                  method='jarowinkler',
                  threshold=0.85,
                  label='first_name')
compare_cl.exact('dob', 'dob', label='date_of_birth')
compare_cl.exact('gn', 'gn', label='gender')
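The snippet stops before scoring. A possible continuation (not part of the original) computes the comparison vectors and keeps pairs that agree on at least three of the four features:

features = compare_cl.compute(pairs, df)
matches = features[features.sum(axis=1) >= 3]
print(len(matches))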
Code Example #13
class TestIndexApi(TestData):
    """General unittest for the indexing API."""

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_repr(self, index_class):

        index_str = str(index_class)
        index_repr = repr(index_class)
        self.assertEqual(index_str, index_repr)

        start_str = '<{}'.format(index_class.__class__.__name__)
        self.assertTrue(index_str.startswith(start_str))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_arguments(self, index_class):
        """Test the index method arguments"""

        # The following should work
        index_class.index(self.a)
        index_class.index(self.a, self.b)
        index_class.index((self.a))
        index_class.index([self.a])
        index_class.index((self.a, self.b))
        index_class.index([self.a, self.b])
        index_class.index(x=(self.a, self.b))

    def test_iterative(self):
        """Test the iterative behaviour."""

        # SINGLE STEP
        index_class = recordlinkage.FullIndex()
        pairs = index_class.index((self.a, self.b))
        pairs = pd.DataFrame(index=pairs).sort_index()

        # MULTI STEP
        index_class = recordlinkage.FullIndex()

        pairs1 = index_class.index((self.a[0:50], self.b))
        pairs2 = index_class.index((self.a[50:100], self.b))

        pairs_split = pairs1.append(pairs2)
        pairs_split = pd.DataFrame(index=pairs_split).sort_index()

        ptm.assert_frame_equal(pairs, pairs_split)
        # not possible to sort a MultiIndex directly, so we make a frame out of it

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_empty_input_dataframes(self, index_class):
        """Empty DataFrames"""

        # make an empty dataframe with the columns of self.a and self.b
        df_a = pd.DataFrame(columns=self.a.columns.tolist())
        df_b = pd.DataFrame(columns=self.b.columns.tolist())

        if not isinstance(index_class, recordlinkage.RandomIndex):
            # make an index
            pairs = index_class.index((df_a, df_b))

            # check if the MultiIndex has length 0
            self.assertIsInstance(pairs, pd.MultiIndex)
            self.assertEqual(len(pairs), 0)
        else:
            with self.assertRaises(ValueError):
                index_class.index((df_a, df_b))

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_error_handling(self, index_class):
        """Test error handling on non-unique index."""

        # make a non_unique index
        df_a = self.a.rename(
            index={self.a.index[1]: self.a.index[0]}, inplace=False)

        with self.assertRaises(ValueError):
            index_class.index(df_a)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_dedup(self, index_class):

        index_names = ['dedup', None, 'index', int(1)]
        expected = [
            ['dedup_1', 'dedup_2'],
            [None, None],
            ['index_1', 'index_2'],
            ['1_1', '1_2'],
        ]

        for i, name in enumerate(index_names):

            index_A = pd.Index(self.a.index).rename(name)
            df_A = pd.DataFrame(self.a, index=index_A)

            pairs = index_class.index((df_A))

            self.assertEqual(pairs.names, expected[i])
            self.assertEqual(df_A.index.name, name)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_dedup(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        # make the index
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index(df_a)
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_index_names_link(self, index_class):

        # tuples with the name of the first and second index
        index_names = [
            ('index1', 'index2'),
            ('index1', None),
            (None, 'index2'),
            (None, None),
            (10, 'index2'),
            (10, 11)
        ]

        for name_a, name_b in index_names:

            # make an index for each dataframe with a new index name
            index_a = pd.Index(self.a.index, name=name_a)
            df_a = pd.DataFrame(self.a, index=index_a)

            index_b = pd.Index(self.b.index, name=name_b)
            df_b = pd.DataFrame(self.b, index=index_b)

            pairs = index_class.index((df_a, df_b))
            self.assertEqual(pairs.names, [name_a, name_b])

            # check for inplace editing (not the intention)
            self.assertEqual(df_a.index.name, name_a)
            self.assertEqual(df_b.index.name, name_b)

    @parameterized.expand([
        param(recordlinkage.FullIndex()),
        param(recordlinkage.BlockIndex(on='var_arange')),
        param(recordlinkage.SortedNeighbourhoodIndex(on='var_arange')),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=True)),
        param(recordlinkage.RandomIndex(10, random_state=100, replace=False))
    ])
    def test_duplicated_index_names_link(self, index_class):

        # make an index for each dataframe with a new index name
        index_a = pd.Index(self.a.index, name='index')
        df_a = pd.DataFrame(self.a, index=index_a)

        index_b = pd.Index(self.b.index, name='index')
        df_b = pd.DataFrame(self.b, index=index_b)

        # make the index
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_1', 'index_2'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

        # make the index
        index_class.suffixes = ['_a', '_b']
        pairs = index_class.index((df_a, df_b))
        self.assertEqual(pairs.names, ['index_a', 'index_b'])

        # check for inplace editing (not the intention)
        self.assertEqual(df_a.index.name, 'index')
        self.assertEqual(df_b.index.name, 'index')

    @parameterized.expand(TEST_INDEXATION_OBJECTS)
    def test_pickle(self, index_class):
        """Test if it is possible to pickle the class."""

        pickle_path = os.path.join(self.test_dir, 'pickle_compare_obj.pickle')

        # pickle before indexing
        pickle.dump(index_class, open(pickle_path, 'wb'))

        # compute the record pairs
        index_class.index(self.a, self.b)

        # pickle after indexing
        pickle.dump(index_class, open(pickle_path, 'wb'))
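The suffix behaviour exercised by the duplicated-name tests, as a standalone sketch (toy frame assumed):

import pandas as pd
import recordlinkage

df = pd.DataFrame({'var_arange': range(3)},
                  index=pd.Index(range(3), name='index'))

index_class = recordlinkage.FullIndex()
print(index_class.index(df).names)  # ['index_1', 'index_2'] (default suffixes)

index_class.suffixes = ['_a', '_b']
print(index_class.index(df).names)  # ['index_a', 'index_b']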
Code Example #14
import pandas as pd
import recordlinkage as rl
import helpers as h

h.pandas_setup()

# pd.DataFrame.from_csv was removed from pandas; pd.read_csv is the replacement
df_a = pd.read_csv('./data/grid-df.csv')
df_b = pd.read_csv('./data/scraped/scraped-universities-subset.csv')

indexer = rl.FullIndex()
candidate_links = indexer.index(df_a, df_b)

comp = rl.Compare(candidate_links, df_a, df_b)
comp.string('grid-name', 'scraped-name')

preds = [(1 if i >= .9 else 0) for i in comp.vectors[0]]

fuse = rl.FuseLinks()
Code Example #15
def index_pairs(df1, df2):
    indexer = recordlinkage.FullIndex()  # BlockIndex(on='postal_code')
    return indexer.index(df1, df2)