Example #1
  def test_initialize(self) :
    dist = dedupe.affinegap.normalizedAffineGapDistance
    deduper = dedupe.Dedupe({'name' : {'type' : 'String'}})
    assert deduper.data_model['fields']['name'] == {'Has Missing': False, 
                                                    'type': 'String', 
                                                    'comparator': dist}

    deduper = dedupe.Dedupe({'name' : {'type' : 'String',
                                       'Has Missing' : True}})
    assert deduper.data_model['fields']['name'] == {'Has Missing': True, 
                                                    'type': 'String', 
                                                    'comparator': dist }

    deduper = dedupe.Dedupe({'name' : {'type' : 'Source',
                                       'Source Names' : ['a', 'b'],
                                       'Has Missing' : True}})

    source_comparator = deduper.data_model['fields']['name']['comparator']
    assert source_comparator('a', 'a') == 0
    assert source_comparator('b', 'b') == 1
    assert source_comparator('a', 'b') == 2
    assert source_comparator('b', 'a') == 2
    self.assertRaises(ValueError, source_comparator, 'b', 'c')
    self.assertRaises(ValueError, source_comparator, '', 'c')
    assert numpy.isnan(source_comparator('', 'b'))
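The assertions above pin down the contract of the 'Source' comparator. As a rough sketch only (not dedupe's own implementation), a comparator with that behaviour could look like this, with the source names hard-coded as an assumption:

import numpy

def sketch_source_comparator(field_1, field_2, source_names=('a', 'b')):
    # Same-source pairs score the source's index (0 or 1), cross-source
    # pairs score 2, a missing value gives NaN, unknown names raise.
    known = ('',) + source_names
    if field_1 not in known or field_2 not in known:
        raise ValueError('unknown source name')
    if field_1 == '' or field_2 == '':
        return numpy.nan
    if field_1 == field_2:
        return source_names.index(field_1)
    return 2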
Example #2
    def test_comparator(self):
        fieldDistances = dedupe.core.fieldDistances
        deduper = dedupe.Dedupe(
            {'type': {
                'type': 'Categorical',
                'Categories': ['a', 'b', 'c']
            }}, [])

        record_pairs = (({
            'type': 'a'
        }, {
            'type': 'b'
        }), ({
            'type': 'a'
        }, {
            'type': 'c'
        }))

        numpy.testing.assert_array_almost_equal(
            fieldDistances(record_pairs, deduper.data_model),
            numpy.array([[0, 0, 1, 0, 0], [0, 0, 0, 1, 0]]), 3)

        deduper = dedupe.Dedupe(
            {
                'type': {
                    'type': 'Categorical',
                    'Categories': ['a', 'b', 'c']
                },
                'source': {
                    'type': 'Source',
                    'Source Names': ['foo', 'bar']
                }
            }, [])

        record_pairs = (({
            'type': 'a',
            'source': 'bar'
        }, {
            'type': 'b',
            'source': 'bar'
        }), ({
            'type': 'a',
            'source': 'foo'
        }, {
            'type': 'c',
            'source': 'bar'
        }))

        numpy.testing.assert_array_almost_equal(
            fieldDistances(record_pairs, deduper.data_model),
            numpy.array([[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0.],
                         [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
                          0.]]), 3)
Example #3
    def setUp(self):

        field_definition = [{'field': 'name', 'type': 'String'}]
        self.data_model = dedupe.Dedupe(field_definition).data_model
        self.training_pairs = {
            'match': [({"name": "Bob", "age": "50"},
                       {"name": "Bob", "age": "75"}),
                      ({"name": "Meredith", "age": "40"},
                       {"name": "Sue", "age": "10"})],
            'distinct': [({"name": "Jimmy", "age": "20"},
                          {"name": "Jimbo", "age": "21"}),
                         ({"name": "Willy", "age": "35"},
                          {"name": "William", "age": "35"}),
                         ({"name": "William", "age": "36"},
                          {"name": "William", "age": "35"})]
        }

        self.training = self.training_pairs['match'] + \
            self.training_pairs['distinct']
        self.training_records = []
        for pair in self.training:
            for record in pair:
                if record not in self.training_records:
                    self.training_records.append(record)

        self.simple = lambda x: set([str(k) for k in x
                                     if "CompoundPredicate" not in str(k)])
Example #4
  def setUp(self):
    self.frozendict = dedupe.core.frozendict

    field_definition = [{'field' : 'name', 'type': 'String'}, 
                        {'field' :'age', 'type': 'String'}]
    self.data_model = dedupe.Dedupe(field_definition).data_model
    self.training_pairs = {
        0: [((1, self.frozendict({"name": "Bob", "age": "50"})),
             (2, self.frozendict({"name": "Bob", "age": "75"}))),
            ((3, self.frozendict({"name": "Meredith", "age": "40"})),
             (4, self.frozendict({"name": "Sue", "age": "10"})))], 
        1: [((5, self.frozendict({"name": "Jimmy", "age": "20"})),
             (6, self.frozendict({"name": "Jimbo", "age": "21"}))),
            ((7, self.frozendict({"name": "Willy", "age": "35"})),
             (8, self.frozendict({"name": "William", "age": "35"}))),
            ((9, self.frozendict({"name": "William", "age": "36"})),
             (8, self.frozendict({"name": "William", "age": "35"})))]
      }

    self.training = self.training_pairs[0] + self.training_pairs[1]
    self.distinct_ids = [tuple([pair[0][0], pair[1][0]])
                         for pair in
                         self.training_pairs[0]]
    self.dupe_ids = [tuple([pair[0][0], pair[1][0]])
                     for pair in
                     self.training_pairs[1]]

    self.simple = lambda x : set([str(k) for k in x 
                                  if "CompoundPredicate" not in str(k)])
Example #5
 def write_config(self, distinct_config=None):
     '''
     Generate the training data file and then write the settings
     '''
     print('creating deduper')
     print(datetime.now())
     deduper = dedupe.Dedupe(self.training_data.match_fields)
     print('getting sample matches')
     print(datetime.now())
     match_list, sample_data = self.training_data.get_sample_matches(.3)
     print('done- sample matches')
     print(datetime.now())
     if distinct_config:
         distincts = self.get_distinct_from_file(distinct_config)
     else:
         distincts = []
     training_dict = {"distinct": distincts, "match": match_list}
     deduper.sample(sample_data)
     print('done- sample')
     print(datetime.now())
     deduper.markPairs(training_dict)
     print('done- markPairs')
     print(datetime.now())
     deduper.train(recall=.9)
     print('done- train')
     print(datetime.now())
     with BytesIO() as sf:
         deduper.writeSettings(sf)
         self.settings_file.write_file(sf.getvalue())
Example #6
def rundedupe(input_file_path, unique_col, dedupe_cols):
    global input_file, data_d, deduper, fields
    input_file = input_file_path
    print('importing data ...')
    data_d = readData(input_file, unique_col)
    if os.path.exists(os.getcwd() + "/media/settings_files/" + settings_file):
        print('reading from', settings_file)
        with open(os.getcwd() + "/media/settings_files/" + settings_file,
                  'rb') as f:
            deduper = dedupe.StaticDedupe(f)
        ret = False
    else:
        fields = []
        for i in dedupe_cols:
            fields.append({'field': i, 'type': 'String'})

        deduper = dedupe.Dedupe(fields)
        deduper.sample(data_d, 15000)
        if os.path.exists(os.getcwd() + "/media/training_files/" +
                          training_file):
            print('reading labeled examples from ', training_file)
            with open(os.getcwd() + "/media/training_files/" + training_file,
                      'rb') as f:
                deduper.readTraining(f)
        print('starting active labeling...')
        ret = True
    fields = unique(field.field for field in deduper.data_model.primary_fields)
    return ret
Example #7
def collect_labelled_data(data_d, fields, training_file, settings_file):
    """collects labelled data, returns the deduper"""

    deduper = dedupe.Dedupe(fields)

    deduper.sample(data_d, 75000)

    # check if a training file exists. If it does load it.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            deduper.readTraining(tf)

    # active labelling phase
    print('starting active labeling...')
    dedupe.consoleLabel(deduper)

    # train the deduper
    print("training deduper...")
    deduper.train()

    # save out the training data
    print("saving out training file...")
    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    # save out the settings
    print("saving out settings file...")
    with open(settings_file, 'w') as sf:
        deduper.writeSettings(sf)

    return deduper
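A hedged usage sketch for collect_labelled_data, assuming the dedupe 1.x API used in the snippet; the field definitions, file names, and the tiny data_d dict are illustrative placeholders, not part of the original project:

# Hypothetical inputs; real data would come from your own source.
fields = [{'field': 'name', 'type': 'String'},
          {'field': 'address', 'type': 'String', 'has missing': True}]
data_d = {
    1: {'name': 'Bob Smith', 'address': '123 Main St'},
    2: {'name': 'Robert Smith', 'address': '123 Main Street'},
}
deduper = collect_labelled_data(data_d, fields,
                                training_file='training.json',
                                settings_file='learned_settings')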
Example #8
  def test_field_distance_simple(self) :
    fieldDistances = dedupe.core.fieldDistances
    deduper = dedupe.Dedupe({'name' : {'type' :'String'},
                             'source' : {'type' : 'Source',
                                         'Source Names' : ['a', 'b']}})

    record_pairs = (({'name' : 'steve', 'source' : 'a'}, 
                     {'name' : 'steven', 'source' : 'a'}),)


    numpy.testing.assert_array_almost_equal(fieldDistances(record_pairs, 
                                                           deduper.data_model),
                                            numpy.array([[0, 0.647, 0, 0, 0]]), 3)

    record_pairs = (({'name' : 'steve', 'source' : 'b'}, 
                     {'name' : 'steven', 'source' : 'b'}),)
    numpy.testing.assert_array_almost_equal(fieldDistances(record_pairs, 
                                                           deduper.data_model),
                                            numpy.array([[1, 0.647, 0, 0.647, 0]]), 3)

    record_pairs = (({'name' : 'steve', 'source' : 'a'}, 
                     {'name' : 'steven', 'source' : 'b'}),)
    numpy.testing.assert_array_almost_equal(fieldDistances(record_pairs, 
                                                           deduper.data_model),
                                            numpy.array([[0, 0.647, 1, 0, 0.647]]), 3)
Example #9
    def test_initialize(self):
        fields = {
            'name': {
                'type': 'String'
            },
            'age': {
                'type': 'String'
            },
        }
        deduper = dedupe.Dedupe(fields)

        string_predicates = (dedupe.predicates.wholeFieldPredicate,
                             dedupe.predicates.tokenFieldPredicate,
                             dedupe.predicates.commonIntegerPredicate,
                             dedupe.predicates.sameThreeCharStartPredicate,
                             dedupe.predicates.sameFiveCharStartPredicate,
                             dedupe.predicates.sameSevenCharStartPredicate,
                             dedupe.predicates.nearIntegersPredicate,
                             dedupe.predicates.commonFourGram,
                             dedupe.predicates.commonSixGram)

        tfidf_string_predicates = tuple([
            dedupe.tfidf.TfidfPredicate(threshold)
            for threshold in [0.2, 0.4, 0.6, 0.8]
        ])

        assert deduper.blocker_types == {
            'String': string_predicates + tfidf_string_predicates
        }
Example #10
    def test_comparator_interaction(self):
        deduper = dedupe.Dedupe([{'field' : 'type',
                                  'variable name' : 'type',
                                  'type' : 'Categorical',
                                  'categories' : ['a', 'b']},\
                                 {'type' : 'Interaction',
                                  'interaction variables' : ['type', 'name']},
                                 {'field' : 'name',
                                  'variable name' : 'name',
                                  'type' : 'Exact'}]
                                 , [])

        record_pairs = (({
            'name': 'steven',
            'type': 'a'
        }, {
            'name': 'steven',
            'type': 'b'
        }), ({
            'name': 'steven',
            'type': 'b'
        }, {
            'name': 'steven',
            'type': 'b'
        }))

        numpy.testing.assert_array_almost_equal(
            deduper.data_model.distances(record_pairs),
            numpy.array([[0, 1, 1, 0, 1], [1, 0, 1, 1, 0]]), 3)
Example #11
def active_training():
    print("MODE: Active training")

    # Load the data file to deduplicate
    try:
        data_d = read_messy_data(CONFIG.PATHS.INPUT_FILE)
    except IOError:
        print("No se pudo abrir el fichero de records de entrada - " + CONFIG.PATHS.INPUT_FILE)
        raise IOError

    # Active training
    deduper = dedupe.Dedupe(CONFIG.DEDUPE.FIELDS)
    deduper.sample(data_d, CONFIG.DEDUPE.SAMPLE_SIZE)

    # Load labeled records from previous training runs
    if CONFIG.GENERAL.LOAD_TRAINING:
        try:
            with open(CONFIG.PATHS.TRAINING_FILE) as f:
                deduper.readTraining(f)
        except IOError:
            print("No se pudo abrir el fichero de entrenamiento activo -" + CONFIG.PATHS.TRAINING_FILE)

    dedupe.consoleLabel(deduper)

    with open(CONFIG.PATHS.TRAINING_FILE, 'w') as tf:
        deduper.writeTraining(tf)
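active_training() above (and deduplicate() in Example #17 below) read every setting from a CONFIG object the snippets never define. A guess at the attributes it would need, with every value a placeholder:

from types import SimpleNamespace

# Placeholder CONFIG exposing just the attributes the two functions read.
CONFIG = SimpleNamespace(
    PATHS=SimpleNamespace(INPUT_FILE='input.csv',
                          TRAINING_FILE='training.json',
                          SETTINGS_FILE='learned_settings'),
    DEDUPE=SimpleNamespace(FIELDS=[{'field': 'name', 'type': 'String'}],
                           SAMPLE_SIZE=15000,
                           RECALL_WEIGHT=1,
                           USE_INDEX_PREDICATES=True),
    GENERAL=SimpleNamespace(LOAD_TRAINING=False,
                            LOAD_SETTINGS=False,
                            PERFORM_ACTIVE_TRAINING=True))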
Example #12
  def setUp(self) :
    random.seed(123)
    empty_set = set([])

    long_string ='asa;sasdfjasdio;fio;asdnfasdvnvao;asduifvnavjasdfasdfasfasasdfasdfasdfasdfasdfsdfasgnuavpidcvaspdivnaspdivninasduinguipghauipsdfnvaspfighapsdifnasdifnasdpighuignpaguinpgiasidfjasdfjsdofgiongag'

    self.records = iter([((long_string, {'name': 'Margret', 'age': '32'}, empty_set), 
                          ('2', {'name': 'Marga', 'age': '33'}, empty_set)), 
                         (('2', {'name': 'Marga', 'age': '33'}, empty_set), 
                          ('3', {'name': 'Maria', 'age': '19'}, empty_set)), 
                         (('4', {'name': 'Maria', 'age': '19'}, empty_set), 
                          ('5', {'name': 'Monica', 'age': '39'}, empty_set)), 
                         (('6', {'name': 'Monica', 'age': '39'}, empty_set), 
                          ('7', {'name': 'Mira', 'age': '47'}, empty_set)),
                         (('8', {'name': 'Mira', 'age': '47'}, empty_set), 
                          ('9', {'name': 'Mona', 'age': '9'}, empty_set)),
                        ])

    self.data_model = dedupe.Dedupe([{'field' : "name", 'type' : 'String'}], ()).data_model
    self.data_model['fields'][0].weight = -1.0302742719650269
    self.data_model['bias'] = 4.76

    score_dtype = [('pairs', '<U192', 2), ('score', 'f4', 1)]

    self.desired_scored_pairs = numpy.array([((long_string, '2'), 0.96), 
                                             (['2', '3'], 0.96), 
                                             (['4', '5'], 0.78), 
                                             (['6', '7'], 0.72), 
                                             (['8', '9'], 0.84)], 
                                            dtype=score_dtype)
Example #13
  def test_comparator_interaction(self) :
    fieldDistances = dedupe.core.fieldDistances      

    deduper = dedupe.Dedupe([{'field' : 'type', 
                              'variable name' : 'type',
                              'type' : 'Categorical',
                              'categories' : ['a', 'b']},\
                             {'type' : 'Interaction',
                              'interaction variables' : ['type', 'name']},
                             {'field' : 'name',
                              'variable name' : 'name',
                              'type' : 'String'}]
                             , [])

    record_pairs = (({'name' : 'steven', 'type' : 'a'},
                     {'name' : 'steve', 'type' : 'b'}),
                    ({'name' : 'steven', 'type' : 'b'},
                     {'name' : 'steve', 'type' : 'b'}))

    print(deduper.data_model)

    numpy.testing.assert_array_almost_equal(fieldDistances(record_pairs, 
                                                           deduper.data_model),
                                            numpy.array([[0, 1, 0.64772, 0, 0.64772],
                                                         [1, 0, 0.64772, 0.64772, 0]]), 3)
Example #14
    def test_writeTraining(self):
        string = StringIO.StringIO()
        training_pairs = OrderedDict({
            "distinct": [(dedupe.core.frozendict({
                u'bar': frozenset([u'bar']),
                u'foo': u'baz'
            }), dedupe.core.frozendict({u'foo': u'baz'}))],
            "match": []
        })

        json.dump(training_pairs,
                  string,
                  default=dedupe.serializer._to_json,
                  ensure_ascii=False)

        string.seek(0)

        loaded_training_pairs = json.load(string,
                                          cls=dedupe.serializer.dedupe_decoder)

        assert loaded_training_pairs["distinct"][0] ==\
            training_pairs["distinct"][0]

        assert isinstance(loaded_training_pairs["distinct"][0][0]["bar"],
                          frozenset)

        deduper = dedupe.Dedupe([{'field': 'foo', 'type': 'String'}])

        string.seek(0)

        deduper.readTraining(string)
        assert repr(deduper.training_pairs) == repr(training_pairs)

        string.close()
Example #15
def dedupe_snippets():

    deduper = dedupe.Dedupe(fields)
    deduper.sample(snippets, 15000)
    dedupe.consoleLabel(deduper)
    deduper.train()

    threshold = deduper.threshold(snippets, recall_weight=1)
    clustered_dupes = deduper.match(snippets, threshold)

    for (cluster_id, cluster) in enumerate(clustered_dupes):

        id_set, scores = cluster
        max_like_count = -1
        max_like_count_comment_id = ''

        for comment_id in id_set:

            like_count = snippets[comment_id]['likeCount']

            if like_count > max_like_count:

                snippets.pop(max_like_count_comment_id, None)
                max_like_count = like_count
                max_like_count_comment_id = comment_id

            else:
                snippets.pop(comment_id)
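The loop above keeps, for each cluster, only the comment with the highest likeCount and pops the rest out of snippets. The same reduction, written as a short sketch under the assumption that snippets maps comment_id to a dict with a 'likeCount' key:

def keep_most_liked(snippets, clustered_dupes):
    # For every cluster, keep the most-liked comment and drop the others.
    for id_set, _scores in clustered_dupes:
        best = max(id_set, key=lambda cid: snippets[cid]['likeCount'])
        for comment_id in id_set:
            if comment_id != best:
                snippets.pop(comment_id, None)
    return snippets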
Example #16
    def setUp(self):

        field_definition = [{'field': 'name', 'type': 'String'}]
        self.data_model = dedupe.Dedupe(field_definition).data_model
        self.training_pairs = {
            'match': [({
                "name": "Bob",
                "age": "50"
            }, {
                "name": "Bob",
                "age": "75"
            }),
                      ({
                          "name": "Meredith",
                          "age": "40"
                      }, {
                          "name": "Sue",
                          "age": "10"
                      })],
            'distinct': [({
                "name": "Jimmy",
                "age": "20"
            }, {
                "name": "Jimbo",
                "age": "21"
            }),
                         ({
                             "name": "Willy",
                             "age": "35"
                         }, {
                             "name": "William",
                             "age": "35"
                         }),
                         ({
                             "name": "William",
                             "age": "36"
                         }, {
                             "name": "William",
                             "age": "35"
                         })]
        }

        self.training = self.training_pairs['match'] + \
            self.training_pairs['distinct']
        self.training_records = []
        for pair in self.training:
            for record in pair:
                if record not in self.training_records:
                    self.training_records.append(record)

        self.simple = lambda x: set(
            [str(k) for k in x if "CompoundPredicate" not in str(k)])

        self.block_learner = training.BlockLearner
        self.block_learner.blocker = dedupe.blocking.Fingerprinter(
            self.data_model.predicates())
        self.block_learner.blocker.index_all(
            {i: x
             for i, x in enumerate(self.training_records)})
Example #17
def deduplicate():
    print("MODE: Deduplicate")

    # Check that there will be labeled records to train a model with:
    if not (CONFIG.GENERAL.LOAD_TRAINING or CONFIG.GENERAL.PERFORM_ACTIVE_TRAINING):
        print("ERROR: El entrenamiento activo y la carga desde fichero están desactivados")
        return

    # Load the data file to deduplicate
    try:
        data_d = read_messy_data(CONFIG.PATHS.INPUT_FILE)
    except IOError:
        print("No se pudo abrir el fichero de records de entrada - " + CONFIG.PATHS.INPUT_FILE)
        raise IOError

    if CONFIG.GENERAL.LOAD_SETTINGS:
        try:
            with open(CONFIG.PATHS.SETTINGS_FILE, 'rb') as f:
                deduper = dedupe.StaticDedupe(f)
        except IOError:
            print("No se pudo abrir el fichero de settings de dedupe - " + CONFIG.PATHS.SETTINGS_FILE)
            raise IOError
    else:
        # Initialize the dedupe object
        deduper = dedupe.Dedupe(CONFIG.DEDUPE.FIELDS)

        # Load labeled records from previous training runs
        if CONFIG.GENERAL.LOAD_TRAINING:
            try:
                with open(CONFIG.PATHS.TRAINING_FILE) as f:
                    deduper.readTraining(f)
            except IOError:
                print("No se pudo abrir el fichero de entrenamiento activo - " + CONFIG.PATHS.TRAINING_FILE)
                raise IOError

        if CONFIG.GENERAL.PERFORM_ACTIVE_TRAINING:
            # Sampling and active labeling
            deduper.sample(data_d, CONFIG.DEDUPE.SAMPLE_SIZE)
            dedupe.consoleLabel(deduper)

        # Train the predictive model (logistic regression by default)
        deduper.train(CONFIG.DEDUPE.USE_INDEX_PREDICATES)

        # Save the active-training labels, and the predictive model + predicates
        with open(CONFIG.PATHS.TRAINING_FILE, 'w') as tf:
            deduper.writeTraining(tf)
        with open(CONFIG.PATHS.SETTINGS_FILE, 'wb') as sf:
            deduper.writeSettings(sf)

    try:
        # Compute the threshold for the logistic regression
        threshold = deduper.threshold(data_d, recall_weight=CONFIG.DEDUPE.RECALL_WEIGHT)
        # Group matches into clusters
        clustered_dupes = deduper.match(data_d, threshold)
        # Write the clusters out to file
        write_clusters(clustered_dupes)
    except NameError:
        print("Error - No se pudo inicializar el objeto dedupe")
        raise NameError
Example #18
    def setUp(self):
        random.seed(123)
        numpy.random.seed(456)

        field_definition = [{'field': 'name', 'type': 'String'},
                            {'field': 'age', 'type': 'String'}]

        self.deduper = dedupe.Dedupe(field_definition)
Example #19
    def setUp(self):
        random.seed(123)

        self.records = iter([
            (('1', {
                'name': 'Margret',
                'age': '32'
            }), ('2', {
                'name': 'Marga',
                'age': '33'
            })),
            (('2', {
                'name': 'Marga',
                'age': '33'
            }), ('3', {
                'name': 'Maria',
                'age': '19'
            })),
            (('4', {
                'name': 'Maria',
                'age': '19'
            }), ('5', {
                'name': 'Monica',
                'age': '39'
            })),
            (('6', {
                'name': 'Monica',
                'age': '39'
            }), ('7', {
                'name': 'Mira',
                'age': '47'
            })),
            (('8', {
                'name': 'Mira',
                'age': '47'
            }), ('9', {
                'name': 'Mona',
                'age': '9'
            })),
        ])

        self.data_model = dedupe.Dedupe({
            "name": {
                'type': 'String'
            }
        }, ()).data_model
        self.data_model['fields']['name']['weight'] = -1.0302742719650269
        self.data_model['bias'] = 4.76

        score_dtype = [('pairs', 'S4', 2), ('score', 'f4', 1)]

        self.desired_scored_pairs = numpy.array([(('1', '2'), 0.96),
                                                 (['2', '3'], 0.96),
                                                 (['4', '5'], 0.78),
                                                 (['6', '7'], 0.72),
                                                 (['8', '9'], 0.84)],
                                                dtype=score_dtype)
Example #20
 def setUp(self):
     field_definition = [{
         'field': 'name',
         'type': 'String'
     }, {
         'field': 'age',
         'type': 'String'
     }]
     self.deduper = dedupe.Dedupe(field_definition)
Example #21
    def setUp(self):
        random.seed(123)

        long_string = 'asa;sasdfjasdio;fio;asdnfasdvnvao;asduifvnavjasdfasdfasfasasdfasdfasdfasdfasdfsdfasgnuavpidcvaspdivnaspdivninasduinguipghauipsdfnvaspfighapsdifnasdifnasdpighuignpaguinpgiasidfjasdfjsdofgiongag'  # noqa: E501

        self.records = iter([
            ((long_string, {
                'name': 'Margret',
                'age': '32'
            }), ('2', {
                'name': 'Marga',
                'age': '33'
            })),
            (('2', {
                'name': 'Marga',
                'age': '33'
            }), ('3', {
                'name': 'Maria',
                'age': '19'
            })),
            (('4', {
                'name': 'Maria',
                'age': '19'
            }), ('5', {
                'name': 'Monica',
                'age': '39'
            })),
            (('6', {
                'name': 'Monica',
                'age': '39'
            }), ('7', {
                'name': 'Mira',
                'age': '47'
            })),
            (('8', {
                'name': 'Mira',
                'age': '47'
            }), ('9', {
                'name': 'Mona',
                'age': '9'
            })),
        ])

        deduper = dedupe.Dedupe([{'field': "name", 'type': 'String'}])
        self.distances = deduper.distances
        self.classifier = deduper.classifier
        self.classifier.weights = [-1.0302742719650269]
        self.classifier.bias = 4.76

        score_dtype = [('pairs', '<U192', 2), ('score', 'f4')]

        self.desired_scored_pairs = numpy.array([((long_string, '2'), 0.96),
                                                 (['2', '3'], 0.96),
                                                 (['4', '5'], 0.78),
                                                 (['6', '7'], 0.72),
                                                 (['8', '9'], 0.84)],
                                                dtype=score_dtype)
Example #22
def create_deduper(project):
    deduper = dedupe.Dedupe(VARIABLES, num_cores=4)
    data = {e['uid']: to_record(e) for e in project.entities}
    if len(data):
        deduper.sample(data)
        deduper.markPairs({
            'match': get_trainset(project, True, data),
            'distinct': get_trainset(project, False, data)
        })
    return deduper, data
Example #23
 def setUp(self):
     random.seed(123)
     fields = {
         'name': {
             'type': 'String'
         },
         'age': {
             'type': 'String'
         },
     }
     self.deduper = dedupe.Dedupe(fields)
Example #24
 def __init__(self, dataframe, entity):
     self.settings = entity["name"].lower() + "_settings"
     self.training = entity["name"].lower() + "_training.json"
     self.dictionary = dataframe.to_dict('index')
     self.dataframe = dataframe
     self.entity = entity
     if os.path.exists(self.settings):
         with open(self.settings, 'rb') as sf:
             self.deduper = dedupe.StaticDedupe(sf, num_cores=4)
     else:
         self.deduper = dedupe.Dedupe(entity["matching_fields"],
                                      num_cores=4)
Example #25
    def test_exact_comparator(self):
        deduper = dedupe.Dedupe([{'field': 'name',
                                  'type': 'Exact'}
                                 ])

        record_pairs = (({'name': 'Shmoo'}, {'name': 'Shmee'}),
                        ({'name': 'Shmoo'}, {'name': 'Shmoo'}))

        numpy.testing.assert_array_almost_equal(deduper.data_model.distances(record_pairs),
                                                numpy.array([[0.0],
                                                             [1.0]]),
                                                3)
Example #26
    def test_writeTraining(self):
        if sys.version < '3':
            from StringIO import StringIO
            output = StringIO()
            encoded_file = codecs.EncodedFile(output,
                                              data_encoding='utf8',
                                              file_encoding='ascii')
        else:
            from io import StringIO
            encoded_file = StringIO()

        training_pairs = {
            u"distinct": [(dedupe.core.frozendict({
                u'bar': frozenset([u'barë']),
                'baz': (1, 2),
                'bang': [1, 2],
                u'foo': u'baz'
            }), dedupe.core.frozendict({u'foo': u'baz'}))],
            u"match": []
        }

        json.dump(training_pairs,
                  encoded_file,
                  default=dedupe.serializer._to_json,
                  tuple_as_array=False,
                  ensure_ascii=True)

        encoded_file.seek(0)

        loaded_training_pairs = json.load(encoded_file,
                                          cls=dedupe.serializer.dedupe_decoder)

        assert loaded_training_pairs["distinct"][0][0] ==\
            dict(training_pairs["distinct"][0][0])

        assert isinstance(loaded_training_pairs["distinct"][0][0]["bar"],
                          frozenset)

        assert isinstance(loaded_training_pairs['distinct'][0][0]['baz'],
                          tuple)

        deduper = dedupe.Dedupe([{'field': 'foo', 'type': 'String'}])
        deduper.classifier.cv = False

        encoded_file.seek(0)

        deduper.readTraining(encoded_file)
        print(deduper.training_pairs)
        print(training_pairs)
        assert deduper.training_pairs == training_pairs

        encoded_file.close()
Example #27
 def test_get_pair(self):
     fds = open(join(fixtures_path, 'field_defs.json'), 'rb').read()
     sample = open(join(fixtures_path, 'sample.dump'), 'rb').read()
     deduper = dedupe.Dedupe(json.loads(fds), cPickle.loads(sample))
     with self.app.test_request_context():
         self.login()
         with self.client as c:
             with c.session_transaction() as sess:
                 sess['deduper'] = deduper
             rv = c.get('/get-pair/')
             assert set(['left', 'right',
                         'field']) == set(json.loads(rv.data)[0].keys())
             assert session.get('current_pair') is not None
Example #28
    def train(self, data, training_file):
        fields = self.fields
        deduper = dedupe.Dedupe(fields)
        # ## training data
        if os.path.exists(training_file):
            logging.info('reading labeled examples from %s', training_file)
            with open(training_file, 'rb') as f:
                deduper.prepare_training(data, f)
        else:
            raise Exception('no training data')

        deduper.train()
        self.deduper = deduper
        return self
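A hedged sketch of what a caller might do after train() above, assuming the dedupe 2.x API that prepare_training comes from; `matcher` (an instance of the class defining this method) and `data` are hypothetical names:

# Hypothetical follow-up: cluster the records with the trained model.
matcher.train(data, 'training.json')
clustered = matcher.deduper.partition(data, threshold=0.5)
for cluster_id, (record_ids, scores) in enumerate(clustered):
    print(cluster_id, record_ids, scores)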
Example #29
def drawSample(session_id):
    sess = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)
    fields = list(set([f['field'] for f in field_defs]))
    d = dedupe.Dedupe(field_defs)
    data_d = makeSampleDict(sess.id, fields=fields)
    if len(data_d) < 50001:
        sample_size = 5000
    else: # pragma: no cover
        sample_size = round(int(len(data_d) * 0.01), -3)
    d.sample(data_d, sample_size=sample_size, blocked_proportion=1)
    sess.sample = cPickle.dumps(d.data_sample)
    worker_session.add(sess)
    worker_session.commit()
Example #30
    def deduper_setup(self, settings_file, training_file, field_list, selection, sample):
        """
        Trains (if training and settings files do not exist) otherwise set up deduper object
        :param settings_file: settings file name
        :param training_file: training file name
        :param field_list: list of lists (field(string), comparator(string), missing?(bool))
        :param selection: sql statement selecting all relevant columns to use in deduplication
        :param sample: sample size of data to be used for training
        :return: deduper object
        """
        if os.path.exists(settings_file):
            print('Reading from ', settings_file)
            with open(settings_file, 'rb') as sf:
                self.deduper = dedupe.StaticDedupe(sf, num_cores=4)
        else:
            # Define the fields dedupe will pay attention to
            fields = []
            for field in field_list:
                fields.append({'field': field[0], 'type': field[1], 'has missing': field[2]})

            # Create a new deduper object and pass our data model to it.
            self.deduper = dedupe.Dedupe(fields, num_cores=4)

            data = db.pandas_read(selection).to_dict('index')

            print('Collecting sample data for active learning... this may take a while.')
            self.deduper.sample(data, sample)

            if os.path.exists(training_file):
                print('Reading labeled examples from ', training_file)
                with open(training_file) as tf:
                    self.deduper.readTraining(tf)

            print('Starting active labeling...')
            dedupe.convenience.consoleLabel(self.deduper)

            # When finished, save our labeled, training pairs to disk
            with open(training_file, 'w') as tf:
                self.deduper.writeTraining(tf)

            # `recall` is the proportion of true dupes pairs that the learned
            # rules must cover. You may want to reduce this if your are making
            # too many blocks and too many comparisons.
            self.deduper.train(recall=0.90)

            with open(settings_file, 'wb') as sf:
                self.deduper.writeSettings(sf)

            self.deduper.cleanupTraining()
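A hedged call sketch for deduper_setup, mirroring the parameters described in its docstring; `matcher` (an instance of the owning class), the file names, field list, SQL statement, and sample size are all illustrative placeholders:

# Hypothetical call; every value below is made up for illustration.
field_list = [['name', 'String', True],      # (field, comparator type, has missing?)
              ['address', 'String', True]]
matcher.deduper_setup(settings_file='company_settings',
                      training_file='company_training.json',
                      field_list=field_list,
                      selection='SELECT id, name, address FROM companies',
                      sample=15000)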