def test_initialize(self):
    dist = dedupe.affinegap.normalizedAffineGapDistance

    deduper = dedupe.Dedupe({'name': {'type': 'String'}})
    assert deduper.data_model['fields']['name'] == {'Has Missing': False,
                                                    'type': 'String',
                                                    'comparator': dist}

    deduper = dedupe.Dedupe({'name': {'type': 'String',
                                      'Has Missing': True}})
    assert deduper.data_model['fields']['name'] == {'Has Missing': True,
                                                    'type': 'String',
                                                    'comparator': dist}

    deduper = dedupe.Dedupe({'name': {'type': 'Source',
                                      'Source Names': ['a', 'b'],
                                      'Has Missing': True}})
    source_comparator = deduper.data_model['fields']['name']['comparator']
    assert source_comparator('a', 'a') == 0
    assert source_comparator('b', 'b') == 1
    assert source_comparator('a', 'b') == 2
    assert source_comparator('b', 'a') == 2

    self.assertRaises(ValueError, source_comparator, 'b', 'c')
    self.assertRaises(ValueError, source_comparator, '', 'c')

    assert numpy.isnan(source_comparator('', 'b'))
def test_comparator(self):
    fieldDistances = dedupe.core.fieldDistances

    deduper = dedupe.Dedupe({'type': {'type': 'Categorical',
                                      'Categories': ['a', 'b', 'c']}},
                            [])
    record_pairs = (({'type': 'a'}, {'type': 'b'}),
                    ({'type': 'a'}, {'type': 'c'}))
    numpy.testing.assert_array_almost_equal(
        fieldDistances(record_pairs, deduper.data_model),
        numpy.array([[0, 0, 1, 0, 0],
                     [0, 0, 0, 1, 0]]),
        3)

    deduper = dedupe.Dedupe({'type': {'type': 'Categorical',
                                      'Categories': ['a', 'b', 'c']},
                             'source': {'type': 'Source',
                                        'Source Names': ['foo', 'bar']}},
                            [])
    record_pairs = (({'type': 'a', 'source': 'bar'},
                     {'type': 'b', 'source': 'bar'}),
                    ({'type': 'a', 'source': 'foo'},
                     {'type': 'c', 'source': 'bar'}))
    numpy.testing.assert_array_almost_equal(
        fieldDistances(record_pairs, deduper.data_model),
        numpy.array([[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0.],
                     [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0.]]),
        3)
def setUp(self):
    field_definition = [{'field': 'name', 'type': 'String'}]
    self.data_model = dedupe.Dedupe(field_definition).data_model
    self.training_pairs = {
        'match': [({"name": "Bob", "age": "50"},
                   {"name": "Bob", "age": "75"}),
                  ({"name": "Meredith", "age": "40"},
                   {"name": "Sue", "age": "10"})],
        'distinct': [({"name": "Jimmy", "age": "20"},
                      {"name": "Jimbo", "age": "21"}),
                     ({"name": "Willy", "age": "35"},
                      {"name": "William", "age": "35"}),
                     ({"name": "William", "age": "36"},
                      {"name": "William", "age": "35"})]
    }
    self.training = self.training_pairs['match'] + \
        self.training_pairs['distinct']

    self.training_records = []
    for pair in self.training:
        for record in pair:
            if record not in self.training_records:
                self.training_records.append(record)

    self.simple = lambda x: set([str(k) for k in x
                                 if "CompoundPredicate" not in str(k)])
def setUp(self):
    self.frozendict = dedupe.core.frozendict

    field_definition = [{'field': 'name', 'type': 'String'},
                        {'field': 'age', 'type': 'String'}]
    self.data_model = dedupe.Dedupe(field_definition).data_model
    self.training_pairs = {
        0: [((1, self.frozendict({"name": "Bob", "age": "50"})),
             (2, self.frozendict({"name": "Bob", "age": "75"}))),
            ((3, self.frozendict({"name": "Meredith", "age": "40"})),
             (4, self.frozendict({"name": "Sue", "age": "10"})))],
        1: [((5, self.frozendict({"name": "Jimmy", "age": "20"})),
             (6, self.frozendict({"name": "Jimbo", "age": "21"}))),
            ((7, self.frozendict({"name": "Willy", "age": "35"})),
             (8, self.frozendict({"name": "William", "age": "35"}))),
            ((9, self.frozendict({"name": "William", "age": "36"})),
             (8, self.frozendict({"name": "William", "age": "35"})))]
    }
    self.training = self.training_pairs[0] + self.training_pairs[1]
    self.distinct_ids = [(pair[0][0], pair[1][0])
                         for pair in self.training_pairs[0]]
    self.dupe_ids = [(pair[0][0], pair[1][0])
                     for pair in self.training_pairs[1]]

    self.simple = lambda x: set([str(k) for k in x
                                 if "CompoundPredicate" not in str(k)])
def write_config(self, distinct_config=None):
    '''Generate the training data file and then write the settings.'''
    print('creating deduper')
    print(datetime.now())
    deduper = dedupe.Dedupe(self.training_data.match_fields)

    print('getting sample matches')
    print(datetime.now())
    match_list, sample_data = self.training_data.get_sample_matches(.3)
    print('done- sample matches')
    print(datetime.now())

    if distinct_config:
        distincts = self.get_distinct_from_file(distinct_config)
    else:
        distincts = []

    training_dict = {"distinct": distincts, "match": match_list}

    deduper.sample(sample_data)
    print('done- sample')
    print(datetime.now())

    deduper.markPairs(training_dict)
    print('done- markPairs')
    print(datetime.now())

    deduper.train(recall=.9)
    print('done- train')
    print(datetime.now())

    with BytesIO() as sf:
        deduper.writeSettings(sf)
        self.settings_file.write_file(sf.getvalue())
def rundedupe(input_file_path, unique_col, dedupe_cols):
    global input_file, data_d, deduper, fields
    input_file = input_file_path

    print('importing data ...')
    data_d = readData(input_file, unique_col)

    if os.path.exists(os.getcwd() + "/media/settings_files/" + settings_file):
        print('reading from', settings_file)
        with open(os.getcwd() + "/media/settings_files/" + settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
        ret = False
    else:
        fields = []
        for i in dedupe_cols:
            fields.append({'field': i, 'type': 'String'})

        deduper = dedupe.Dedupe(fields)
        deduper.sample(data_d, 15000)

        if os.path.exists(os.getcwd() + "/media/training_files/" + training_file):
            print('reading labeled examples from ', training_file)
            with open(os.getcwd() + "/media/training_files/" + training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')
        ret = True

    fields = unique(field.field for field in deduper.data_model.primary_fields)
    return ret
def collect_labelled_data(data_d, fields, training_file, settings_file):
    """Collect labelled data via active labeling and return the trained deduper."""
    deduper = dedupe.Dedupe(fields)
    deduper.sample(data_d, 75000)

    # check if a training file exists; if it does, load it
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            deduper.readTraining(tf)

    # active labelling phase
    print('starting active labeling...')
    dedupe.consoleLabel(deduper)

    # train the deduper
    print("training deduper...")
    deduper.train()

    # save out the training data
    print("saving out training file...")
    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    # save out the settings (writeSettings expects a binary file handle)
    print("saving out settings file...")
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)

    return deduper
def test_field_distance_simple(self):
    fieldDistances = dedupe.core.fieldDistances

    deduper = dedupe.Dedupe({'name': {'type': 'String'},
                             'source': {'type': 'Source',
                                        'Source Names': ['a', 'b']}})

    record_pairs = (({'name': 'steve', 'source': 'a'},
                     {'name': 'steven', 'source': 'a'}),)
    numpy.testing.assert_array_almost_equal(
        fieldDistances(record_pairs, deduper.data_model),
        numpy.array([[0, 0.647, 0, 0, 0]]),
        3)

    record_pairs = (({'name': 'steve', 'source': 'b'},
                     {'name': 'steven', 'source': 'b'}),)
    numpy.testing.assert_array_almost_equal(
        fieldDistances(record_pairs, deduper.data_model),
        numpy.array([[1, 0.647, 0, 0.647, 0]]),
        3)

    record_pairs = (({'name': 'steve', 'source': 'a'},
                     {'name': 'steven', 'source': 'b'}),)
    numpy.testing.assert_array_almost_equal(
        fieldDistances(record_pairs, deduper.data_model),
        numpy.array([[0, 0.647, 1, 0, 0.647]]),
        3)
def test_initialize(self):
    fields = {'name': {'type': 'String'},
              'age': {'type': 'String'}}
    deduper = dedupe.Dedupe(fields)

    string_predicates = (dedupe.predicates.wholeFieldPredicate,
                         dedupe.predicates.tokenFieldPredicate,
                         dedupe.predicates.commonIntegerPredicate,
                         dedupe.predicates.sameThreeCharStartPredicate,
                         dedupe.predicates.sameFiveCharStartPredicate,
                         dedupe.predicates.sameSevenCharStartPredicate,
                         dedupe.predicates.nearIntegersPredicate,
                         dedupe.predicates.commonFourGram,
                         dedupe.predicates.commonSixGram)

    tfidf_string_predicates = tuple([dedupe.tfidf.TfidfPredicate(threshold)
                                     for threshold in [0.2, 0.4, 0.6, 0.8]])

    assert deduper.blocker_types == {
        'String': string_predicates + tfidf_string_predicates
    }
def test_comparator_interaction(self):
    deduper = dedupe.Dedupe([{'field': 'type',
                              'variable name': 'type',
                              'type': 'Categorical',
                              'categories': ['a', 'b']},
                             {'type': 'Interaction',
                              'interaction variables': ['type', 'name']},
                             {'field': 'name',
                              'variable name': 'name',
                              'type': 'Exact'}],
                            [])

    record_pairs = (({'name': 'steven', 'type': 'a'},
                     {'name': 'steven', 'type': 'b'}),
                    ({'name': 'steven', 'type': 'b'},
                     {'name': 'steven', 'type': 'b'}))

    numpy.testing.assert_array_almost_equal(
        deduper.data_model.distances(record_pairs),
        numpy.array([[0, 1, 1, 0, 1],
                     [1, 0, 1, 1, 0]]),
        3)
def active_training():
    print("MODE: Active training")

    # Load the file of records to deduplicate
    try:
        data_d = read_messy_data(CONFIG.PATHS.INPUT_FILE)
    except IOError:
        print("Could not open the input records file - " + CONFIG.PATHS.INPUT_FILE)
        raise IOError

    # Active training
    deduper = dedupe.Dedupe(CONFIG.DEDUPE.FIELDS)
    deduper.sample(data_d, CONFIG.DEDUPE.SAMPLE_SIZE)

    # Load labelled records from previous training sessions
    if CONFIG.GENERAL.LOAD_TRAINING:
        try:
            with open(CONFIG.PATHS.TRAINING_FILE) as f:
                deduper.readTraining(f)
        except IOError:
            print("Could not open the active-training file - " + CONFIG.PATHS.TRAINING_FILE)

    dedupe.consoleLabel(deduper)

    with open(CONFIG.PATHS.TRAINING_FILE, 'w') as tf:
        deduper.writeTraining(tf)
def setUp(self):
    random.seed(123)

    empty_set = set([])

    long_string = 'asa;sasdfjasdio;fio;asdnfasdvnvao;asduifvnavjasdfasdfasfasasdfasdfasdfasdfasdfsdfasgnuavpidcvaspdivnaspdivninasduinguipghauipsdfnvaspfighapsdifnasdifnasdpighuignpaguinpgiasidfjasdfjsdofgiongag'

    self.records = iter([((long_string, {'name': 'Margret', 'age': '32'}, empty_set),
                          ('2', {'name': 'Marga', 'age': '33'}, empty_set)),
                         (('2', {'name': 'Marga', 'age': '33'}, empty_set),
                          ('3', {'name': 'Maria', 'age': '19'}, empty_set)),
                         (('4', {'name': 'Maria', 'age': '19'}, empty_set),
                          ('5', {'name': 'Monica', 'age': '39'}, empty_set)),
                         (('6', {'name': 'Monica', 'age': '39'}, empty_set),
                          ('7', {'name': 'Mira', 'age': '47'}, empty_set)),
                         (('8', {'name': 'Mira', 'age': '47'}, empty_set),
                          ('9', {'name': 'Mona', 'age': '9'}, empty_set)),
                         ])

    self.data_model = dedupe.Dedupe([{'field': "name", 'type': 'String'}], ()).data_model
    self.data_model['fields'][0].weight = -1.0302742719650269
    self.data_model['bias'] = 4.76

    score_dtype = [('pairs', '<U192', 2), ('score', 'f4', 1)]
    self.desired_scored_pairs = numpy.array([((long_string, '2'), 0.96),
                                             (['2', '3'], 0.96),
                                             (['4', '5'], 0.78),
                                             (['6', '7'], 0.72),
                                             (['8', '9'], 0.84)],
                                            dtype=score_dtype)
def test_comparator_interaction(self):
    fieldDistances = dedupe.core.fieldDistances

    deduper = dedupe.Dedupe([{'field': 'type',
                              'variable name': 'type',
                              'type': 'Categorical',
                              'categories': ['a', 'b']},
                             {'type': 'Interaction',
                              'interaction variables': ['type', 'name']},
                             {'field': 'name',
                              'variable name': 'name',
                              'type': 'String'}],
                            [])

    record_pairs = (({'name': 'steven', 'type': 'a'},
                     {'name': 'steve', 'type': 'b'}),
                    ({'name': 'steven', 'type': 'b'},
                     {'name': 'steve', 'type': 'b'}))

    print(deduper.data_model)

    numpy.testing.assert_array_almost_equal(
        fieldDistances(record_pairs, deduper.data_model),
        numpy.array([[0, 1, 0.64772, 0, 0.64772],
                     [1, 0, 0.64772, 0.64772, 0]]),
        3)
def test_writeTraining(self):
    string = StringIO.StringIO()

    training_pairs = OrderedDict({
        "distinct": [(dedupe.core.frozendict({u'bar': frozenset([u'bar']),
                                              u'foo': u'baz'}),
                      dedupe.core.frozendict({u'foo': u'baz'}))],
        "match": []
    })

    json.dump(training_pairs,
              string,
              default=dedupe.serializer._to_json,
              ensure_ascii=False)

    string.seek(0)
    loaded_training_pairs = json.load(string,
                                      cls=dedupe.serializer.dedupe_decoder)

    assert loaded_training_pairs["distinct"][0] == \
        training_pairs["distinct"][0]
    assert isinstance(loaded_training_pairs["distinct"][0][0]["bar"],
                      frozenset)

    deduper = dedupe.Dedupe([{'field': 'foo', 'type': 'String'}])
    string.seek(0)
    deduper.readTraining(string)

    assert repr(deduper.training_pairs) == repr(training_pairs)

    string.close()
def dedupe_snippets():
    deduper = dedupe.Dedupe(fields)
    deduper.sample(snippets, 15000)
    dedupe.consoleLabel(deduper)
    deduper.train()

    threshold = deduper.threshold(snippets, recall_weight=1)
    clustered_dupes = deduper.match(snippets, threshold)

    # within each cluster, keep only the snippet with the highest like count
    # and drop the rest from the snippets dict
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        max_like_count = -1
        max_like_count_comment_id = ''
        for comment_id in id_set:
            like_count = snippets[comment_id]['likeCount']
            if like_count > max_like_count:
                snippets.pop(max_like_count_comment_id, None)
                max_like_count = like_count
                max_like_count_comment_id = comment_id
            else:
                snippets.pop(comment_id)
def setUp(self):
    field_definition = [{'field': 'name', 'type': 'String'}]
    self.data_model = dedupe.Dedupe(field_definition).data_model
    self.training_pairs = {
        'match': [({"name": "Bob", "age": "50"},
                   {"name": "Bob", "age": "75"}),
                  ({"name": "Meredith", "age": "40"},
                   {"name": "Sue", "age": "10"})],
        'distinct': [({"name": "Jimmy", "age": "20"},
                      {"name": "Jimbo", "age": "21"}),
                     ({"name": "Willy", "age": "35"},
                      {"name": "William", "age": "35"}),
                     ({"name": "William", "age": "36"},
                      {"name": "William", "age": "35"})]
    }
    self.training = self.training_pairs['match'] + \
        self.training_pairs['distinct']

    self.training_records = []
    for pair in self.training:
        for record in pair:
            if record not in self.training_records:
                self.training_records.append(record)

    self.simple = lambda x: set(
        [str(k) for k in x if "CompoundPredicate" not in str(k)])

    self.block_learner = training.BlockLearner
    self.block_learner.blocker = dedupe.blocking.Fingerprinter(
        self.data_model.predicates())
    self.block_learner.blocker.index_all(
        {i: x for i, x in enumerate(self.training_records)})
def deduplicate():
    print("MODE: Deduplicate")

    # Make sure there will be labelled records to train a model with:
    if not (CONFIG.GENERAL.LOAD_TRAINING or CONFIG.GENERAL.PERFORM_ACTIVE_TRAINING):
        print("ERROR: Active training and loading from file are both disabled")
        return

    # Load the file of records to deduplicate
    try:
        data_d = read_messy_data(CONFIG.PATHS.INPUT_FILE)
    except IOError:
        print("Could not open the input records file - " + CONFIG.PATHS.INPUT_FILE)
        raise IOError

    if CONFIG.GENERAL.LOAD_SETTINGS:
        try:
            with open(CONFIG.PATHS.SETTINGS_FILE, 'rb') as f:
                deduper = dedupe.StaticDedupe(f)
        except IOError:
            print("Could not open the dedupe settings file - " + CONFIG.PATHS.SETTINGS_FILE)
            raise IOError
    else:
        # Initialize the dedupe object
        deduper = dedupe.Dedupe(CONFIG.DEDUPE.FIELDS)

        # Load labelled records from previous training sessions
        if CONFIG.GENERAL.LOAD_TRAINING:
            try:
                with open(CONFIG.PATHS.TRAINING_FILE) as f:
                    deduper.readTraining(f)
            except IOError:
                print("Could not open the active-training file - " + CONFIG.PATHS.TRAINING_FILE)
                raise IOError

        if CONFIG.GENERAL.PERFORM_ACTIVE_TRAINING:
            # Sampling and active training
            deduper.sample(data_d, CONFIG.DEDUPE.SAMPLE_SIZE)
            dedupe.consoleLabel(deduper)

        # Train the predictive model (logistic regression by default)
        deduper.train(CONFIG.DEDUPE.USE_INDEX_PREDICATES)

        # Save the active-training labels, plus the predictive model and predicates
        with open(CONFIG.PATHS.TRAINING_FILE, 'w') as tf:
            deduper.writeTraining(tf)
        with open(CONFIG.PATHS.SETTINGS_FILE, 'wb') as sf:
            deduper.writeSettings(sf)

    try:
        # Compute the threshold for the logistic regression
        threshold = deduper.threshold(data_d, recall_weight=CONFIG.DEDUPE.RECALL_WEIGHT)
        # Group matches into clusters
        clustered_dupes = deduper.match(data_d, threshold)
        # Write results to file
        write_clusters(clustered_dupes)
    except NameError:
        print("Error - could not initialize the dedupe object")
        raise NameError
def setUp(self):
    random.seed(123)
    numpy.random.seed(456)

    field_definition = [{'field': 'name', 'type': 'String'},
                        {'field': 'age', 'type': 'String'}]
    self.deduper = dedupe.Dedupe(field_definition)
def setUp(self):
    random.seed(123)

    self.records = iter([(('1', {'name': 'Margret', 'age': '32'}),
                          ('2', {'name': 'Marga', 'age': '33'})),
                         (('2', {'name': 'Marga', 'age': '33'}),
                          ('3', {'name': 'Maria', 'age': '19'})),
                         (('4', {'name': 'Maria', 'age': '19'}),
                          ('5', {'name': 'Monica', 'age': '39'})),
                         (('6', {'name': 'Monica', 'age': '39'}),
                          ('7', {'name': 'Mira', 'age': '47'})),
                         (('8', {'name': 'Mira', 'age': '47'}),
                          ('9', {'name': 'Mona', 'age': '9'})),
                         ])

    self.data_model = dedupe.Dedupe({"name": {'type': 'String'}}, ()).data_model
    self.data_model['fields']['name']['weight'] = -1.0302742719650269
    self.data_model['bias'] = 4.76

    score_dtype = [('pairs', 'S4', 2), ('score', 'f4', 1)]
    self.desired_scored_pairs = numpy.array([(('1', '2'), 0.96),
                                             (['2', '3'], 0.96),
                                             (['4', '5'], 0.78),
                                             (['6', '7'], 0.72),
                                             (['8', '9'], 0.84)],
                                            dtype=score_dtype)
def setUp(self):
    field_definition = [{'field': 'name', 'type': 'String'},
                        {'field': 'age', 'type': 'String'}]
    self.deduper = dedupe.Dedupe(field_definition)
def setUp(self):
    random.seed(123)

    long_string = 'asa;sasdfjasdio;fio;asdnfasdvnvao;asduifvnavjasdfasdfasfasasdfasdfasdfasdfasdfsdfasgnuavpidcvaspdivnaspdivninasduinguipghauipsdfnvaspfighapsdifnasdifnasdpighuignpaguinpgiasidfjasdfjsdofgiongag'  # noqa: E501

    self.records = iter([((long_string, {'name': 'Margret', 'age': '32'}),
                          ('2', {'name': 'Marga', 'age': '33'})),
                         (('2', {'name': 'Marga', 'age': '33'}),
                          ('3', {'name': 'Maria', 'age': '19'})),
                         (('4', {'name': 'Maria', 'age': '19'}),
                          ('5', {'name': 'Monica', 'age': '39'})),
                         (('6', {'name': 'Monica', 'age': '39'}),
                          ('7', {'name': 'Mira', 'age': '47'})),
                         (('8', {'name': 'Mira', 'age': '47'}),
                          ('9', {'name': 'Mona', 'age': '9'})),
                         ])

    deduper = dedupe.Dedupe([{'field': "name", 'type': 'String'}])
    self.distances = deduper.distances
    self.classifier = deduper.classifier
    self.classifier.weights = [-1.0302742719650269]
    self.classifier.bias = 4.76

    score_dtype = [('pairs', '<U192', 2), ('score', 'f4')]
    self.desired_scored_pairs = numpy.array([((long_string, '2'), 0.96),
                                             (['2', '3'], 0.96),
                                             (['4', '5'], 0.78),
                                             (['6', '7'], 0.72),
                                             (['8', '9'], 0.84)],
                                            dtype=score_dtype)
def create_deduper(project):
    deduper = dedupe.Dedupe(VARIABLES, num_cores=4)
    data = {e['uid']: to_record(e) for e in project.entities}
    if len(data):
        deduper.sample(data)
        deduper.markPairs({
            'match': get_trainset(project, True, data),
            'distinct': get_trainset(project, False, data)
        })
    return deduper, data
def setUp(self):
    random.seed(123)
    fields = {'name': {'type': 'String'},
              'age': {'type': 'String'}}
    self.deduper = dedupe.Dedupe(fields)
def __init__(self, dataframe, entity):
    self.settings = entity["name"].lower() + "_settings"
    self.training = entity["name"].lower() + "_training.json"
    self.dictionary = dataframe.to_dict('index')
    self.dataframe = dataframe
    self.entity = entity

    if os.path.exists(self.settings):
        with open(self.settings, 'rb') as sf:
            self.deduper = dedupe.StaticDedupe(sf, num_cores=4)
    else:
        self.deduper = dedupe.Dedupe(entity["matching_fields"], num_cores=4)
def test_exact_comparator(self):
    deduper = dedupe.Dedupe([{'field': 'name', 'type': 'Exact'}])

    record_pairs = (({'name': 'Shmoo'}, {'name': 'Shmee'}),
                    ({'name': 'Shmoo'}, {'name': 'Shmoo'}))

    numpy.testing.assert_array_almost_equal(
        deduper.data_model.distances(record_pairs),
        numpy.array([[0.0], [1.0]]),
        3)
def test_writeTraining(self):
    if sys.version < '3':
        from StringIO import StringIO
        output = StringIO()
        encoded_file = codecs.EncodedFile(output,
                                          data_encoding='utf8',
                                          file_encoding='ascii')
    else:
        from io import StringIO
        encoded_file = StringIO()

    training_pairs = {
        u"distinct": [(dedupe.core.frozendict({u'bar': frozenset([u'barë']),
                                               'baz': (1, 2),
                                               'bang': [1, 2],
                                               u'foo': u'baz'}),
                       dedupe.core.frozendict({u'foo': u'baz'}))],
        u"match": []
    }

    json.dump(training_pairs,
              encoded_file,
              default=dedupe.serializer._to_json,
              tuple_as_array=False,
              ensure_ascii=True)

    encoded_file.seek(0)
    loaded_training_pairs = json.load(encoded_file,
                                      cls=dedupe.serializer.dedupe_decoder)

    assert loaded_training_pairs["distinct"][0][0] == \
        dict(training_pairs["distinct"][0][0])
    assert isinstance(loaded_training_pairs["distinct"][0][0]["bar"],
                      frozenset)
    assert isinstance(loaded_training_pairs['distinct'][0][0]['baz'],
                      tuple)

    deduper = dedupe.Dedupe([{'field': 'foo', 'type': 'String'}])
    deduper.classifier.cv = False

    encoded_file.seek(0)
    deduper.readTraining(encoded_file)

    print(deduper.training_pairs)
    print(training_pairs)

    assert deduper.training_pairs == training_pairs

    encoded_file.close()
def test_get_pair(self):
    fds = open(join(fixtures_path, 'field_defs.json'), 'rb').read()
    sample = open(join(fixtures_path, 'sample.dump'), 'rb').read()
    deduper = dedupe.Dedupe(json.loads(fds), cPickle.loads(sample))
    with self.app.test_request_context():
        self.login()
        with self.client as c:
            with c.session_transaction() as sess:
                sess['deduper'] = deduper
            rv = c.get('/get-pair/')
            assert set(['left', 'right', 'field']) == \
                set(json.loads(rv.data)[0].keys())
            assert session.get('current_pair') is not None
def train(self, data, training_file):
    fields = self.fields
    deduper = dedupe.Dedupe(fields)

    # training data: prepare_training samples record pairs and loads
    # any existing labelled examples from the training file
    if os.path.exists(training_file):
        logging.info('reading labeled examples from %s', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data, f)
    else:
        raise Exception('no training data')

    deduper.train()
    self.deduper = deduper
    return self
def drawSample(session_id):
    sess = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)
    fields = list(set([f['field'] for f in field_defs]))
    d = dedupe.Dedupe(field_defs)
    data_d = makeSampleDict(sess.id, fields=fields)
    if len(data_d) < 50001:
        sample_size = 5000
    else:  # pragma: no cover
        sample_size = round(int(len(data_d) * 0.01), -3)
    d.sample(data_d, sample_size=sample_size, blocked_proportion=1)
    sess.sample = cPickle.dumps(d.data_sample)
    worker_session.add(sess)
    worker_session.commit()
def deduper_setup(self, settings_file, training_file, field_list, selection, sample):
    """
    Train a new model (if the settings file does not exist); otherwise set up the deduper
    object from the saved settings.

    :param settings_file: settings file name
    :param training_file: training file name
    :param field_list: list of lists (field (string), comparator (string), missing? (bool))
    :param selection: SQL statement selecting all relevant columns to use in deduplication
    :param sample: sample size of data to be used for training
    :return: None; the configured deduper is stored on self.deduper
    """
    if os.path.exists(settings_file):
        print('Reading from ', settings_file)
        with open(settings_file, 'rb') as sf:
            self.deduper = dedupe.StaticDedupe(sf, num_cores=4)
    else:
        # Define the fields dedupe will pay attention to
        fields = []
        for field in field_list:
            fields.append({'field': field[0], 'type': field[1], 'has missing': field[2]})

        # Create a new deduper object and pass our data model to it.
        self.deduper = dedupe.Dedupe(fields, num_cores=4)

        data = db.pandas_read(selection).to_dict('index')

        print('Collecting sample data for active learning... this may take a while.')
        self.deduper.sample(data, sample)

        if os.path.exists(training_file):
            print('Reading labeled examples from ', training_file)
            with open(training_file) as tf:
                self.deduper.readTraining(tf)

        print('Starting active labeling...')
        dedupe.convenience.consoleLabel(self.deduper)

        # When finished, save our labeled training pairs to disk
        with open(training_file, 'w') as tf:
            self.deduper.writeTraining(tf)

        # `recall` is the proportion of true dupe pairs that the learned
        # rules must cover. You may want to reduce this if you are making
        # too many blocks and too many comparisons.
        self.deduper.train(recall=0.90)

        with open(settings_file, 'wb') as sf:
            self.deduper.writeSettings(sf)

        self.deduper.cleanupTraining()