Example #1
    def _blockerTypes(self):  # pragma: no cover
        string_predicates = (predicates.wholeFieldPredicate,
                             predicates.tokenFieldPredicate,
                             predicates.commonIntegerPredicate,
                             predicates.sameThreeCharStartPredicate,
                             predicates.sameFiveCharStartPredicate,
                             predicates.sameSevenCharStartPredicate,
                             predicates.nearIntegersPredicate,
                             predicates.commonFourGram,
                             predicates.commonSixGram)

        tfidf_string_predicates = tuple([tfidf.TfidfPredicate(threshold)
                                         for threshold
                                         in [0.2, 0.4, 0.6, 0.8]])

        return {'String': (string_predicates
                           + tfidf_string_predicates)}
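For orientation, each simple string predicate above is a function that maps a field value to one or more blocking keys, and the TF-IDF predicates are the same idea parameterized by a similarity threshold. The sketch below uses hypothetical stand-ins rather than the real `predicates` and `tfidf` modules, only to illustrate the shape of the table that `_blockerTypes` returns.

# Minimal sketch with hypothetical stand-in predicates; it does not import
# the real dedupe predicates or tfidf modules.

def whole_field(field):
    # One blocking key: the entire field value.
    return (field,)

def token_field(field):
    # One blocking key per whitespace-separated token.
    return tuple(field.split())

def same_three_char_start(field):
    # Records sharing their first three characters block together.
    return (field[:3],)

string_predicates = (whole_field, token_field, same_three_char_start)

# Canopy predicates are parameterized by a similarity threshold; they are
# represented here only by their thresholds.
tfidf_string_predicates = tuple(('tfidf', t) for t in (0.2, 0.4, 0.6, 0.8))

blocker_types = {'String': string_predicates + tfidf_string_predicates}

for predicate in blocker_types['String'][:3]:
    print(predicate('403 Main St'))
# ('403 Main St',)
# ('403', 'Main', 'St')
# ('403',)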
Example #2
def _initializeTraining(training_pairs, fields, predicate_functions,
                        tfidf_thresholds, df_index):
    """Prepare labeled training pairs and the candidate predicate set."""

    # Work on copies so the caller's training pairs are not mutated.
    training_dupes = training_pairs[1][:]
    training_distinct = training_pairs[0][:]

    # Pair every simple predicate function with every field.
    predicate_functions = list(product(predicate_functions, fields))

    # Build a TF-IDF canopy predicate for each threshold, then pair
    # each one with every field as well.
    tfidf_predicates = [
        tfidf.TfidfPredicate(threshold) for threshold in tfidf_thresholds
    ]
    tfidf_predicates = list(product(tfidf_predicates, fields))

    predicate_set = disjunctivePredicates(predicate_functions +
                                          tfidf_predicates)

    if tfidf_predicates:
        _overlap = canopyOverlap(predicate_set,
                                 training_dupes + training_distinct, df_index)
    else:
        # No TF-IDF predicates means no canopies to precompute; fall back
        # to a mapping that returns None for any key.
        _overlap = defaultdict(lambda: None)

    return (training_dupes, training_distinct, predicate_set, _overlap)
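Two standard-library pieces carry this function: `itertools.product` pairs every predicate with every field to form candidate blocking rules, and `defaultdict(lambda: None)` gives later lookups a harmless `None` when no TF-IDF canopies were computed. A small self-contained illustration (the stand-in predicates are hypothetical):

from collections import defaultdict
from itertools import product

def whole_field(value):    # hypothetical stand-in predicate
    return (value,)

def token_field(value):    # hypothetical stand-in predicate
    return tuple(value.split())

fields = ['name', 'address']

# Every (predicate, field) combination becomes a candidate blocking rule.
candidate_rules = list(product([whole_field, token_field], fields))
print(len(candidate_rules))        # 4
print(candidate_rules[0][1])       # name

# With no TF-IDF predicates there are no canopies; the overlap table is a
# defaultdict that answers None for any key it is asked about.
_overlap = defaultdict(lambda: None)
print(_overlap[('record_1', 'record_2')])    # None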
Example #3
    def __init__(self, init=None):
        """
        Load or initialize a data model.

        #### Example usage

            # initialize from a settings file
            deduper = dedupe.Dedupe('my_learned_settings')

        or

            # initialize from a defined set of fields
            fields = {'Site name': {'type': 'String'},
                      'Address':   {'type': 'String'},
                      'Zip':       {'type': 'String', 'Has Missing': True},
                      'Phone':     {'type': 'String', 'Has Missing': True},
                      }
            
            deduper = dedupe.Dedupe(fields)

        
        #### Keyword arguments
        
        `init`
        A field definition or a file location for a settings file.

        #### Additional detail
        A field definition is a dictionary where the keys are the fields
        that will be used for training a model and the values are the
        field specifications.

        Field types include

        - String

        A 'String' type field must have as its key the name of a field
        as it appears in the data dictionary, with a type declaration as
        its value, e.g. `{'Phone': {'type': 'String'}}`.

        Longer example of a field definition:


            fields = {'name':       {'type': 'String'},
                      'address':    {'type': 'String'},
                      'city':       {'type': 'String'},
                      'cuisine':    {'type': 'String'}
                      }


        Settings files are typically generated by saving the settings
        learned in a previous session. If you need details about this
        file, see the method [`writeSettings`][[api.py#writesettings]].
        """

        if init.__class__ is dict and init:
            self.data_model = _initializeDataModel(init)
            self.predicates = None
        elif init.__class__ is str and init:
            (self.data_model, self.predicates) = self._readSettings(init)
        elif init:
            raise ValueError('Incorrect Input Type: must supply either a '
                             'field definition or a settings file.')
        else:
            raise ValueError('No Input: must supply either a field '
                             'definition or a settings file.')

        self.training_data = None
        self.training_pairs = None
        self.data_sample = None
        self.dupes = None
        self.training_encoder = training_serializer._to_json
        self.training_decoder = training_serializer.dedupe_decoder

        string_predicates = (predicates.wholeFieldPredicate,
                             predicates.tokenFieldPredicate,
                             predicates.commonIntegerPredicate,
                             predicates.sameThreeCharStartPredicate,
                             predicates.sameFiveCharStartPredicate,
                             predicates.sameSevenCharStartPredicate,
                             predicates.nearIntegersPredicate,
                             predicates.commonFourGram,
                             predicates.commonSixGram)

        tfidf_string_predicates = tuple([
            tfidf.TfidfPredicate(threshold)
            for threshold in [0.2, 0.4, 0.6, 0.8]
        ])

        self.blocker_types = {
            'String': (string_predicates + tfidf_string_predicates)
        }
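Per the constructor above, `Dedupe` accepts either a non-empty field-definition dict or a settings-file path given as a string; any other input raises a `ValueError`. The usage sketch below is written against this legacy constructor, not necessarily the current dedupe release, and `'my_learned_settings'` is a placeholder path.

import dedupe  # assumes the legacy API shown above is installed

fields = {'Site name': {'type': 'String'},
          'Address':   {'type': 'String'},
          'Zip':       {'type': 'String', 'Has Missing': True}}

# Path 1: a field definition builds a fresh data model; predicates stay
# unset until training.
deduper = dedupe.Dedupe(fields)

# Path 2: a string is treated as the location of a settings file,
# typically produced by writeSettings in an earlier session.
# deduper = dedupe.Dedupe('my_learned_settings')

# Anything else is rejected up front.
try:
    dedupe.Dedupe(42)
except ValueError as error:
    print(error)    # Incorrect Input Type: must supply either a ...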