def _blockerTypes(self):  # pragma : no cover
    """
    Return the blocking predicates available for each field type.

    The result is a dictionary with a single key, `'String'`, whose
    value is one tuple: nine predicate functions taken from the
    `predicates` module, followed by four `tfidf.TfidfPredicate`
    objects built with thresholds 0.2, 0.4, 0.6 and 0.8.
    """
    field_blockers = (
        predicates.wholeFieldPredicate,
        predicates.tokenFieldPredicate,
        predicates.commonIntegerPredicate,
        predicates.sameThreeCharStartPredicate,
        predicates.sameFiveCharStartPredicate,
        predicates.sameSevenCharStartPredicate,
        predicates.nearIntegersPredicate,
        predicates.commonFourGram,
        predicates.commonSixGram,
    )

    # One TF-IDF predicate per threshold, in increasing threshold order.
    tfidf_cutoffs = (0.2, 0.4, 0.6, 0.8)
    canopy_blockers = tuple(tfidf.TfidfPredicate(cutoff)
                            for cutoff in tfidf_cutoffs)

    return {'String': field_blockers + canopy_blockers}
def _initializeTraining(training_pairs, fields, predicate_functions,
                        tfidf_thresholds, df_index):
    """
    Assemble the inputs needed to train blocking predicates.

    `training_pairs` is indexed by label: entry 1 supplies the
    duplicate pairs and entry 0 the distinct pairs. Both are
    slice-copied before use.

    Every predicate function, and one `tfidf.TfidfPredicate` per value
    in `tfidf_thresholds`, is paired with every field; the combined
    list is handed to `disjunctivePredicates` to build the predicate
    set.

    When at least one TF-IDF (predicate, field) pair exists, the
    overlap is computed by `canopyOverlap` over all training pairs
    using `df_index`. Otherwise the overlap is a `defaultdict` that
    yields `None` for any key.

    Returns the tuple
    `(training_dupes, training_distinct, predicate_set, overlap)`.
    """
    duplicate_pairs = training_pairs[1][:]
    distinct_pairs = training_pairs[0][:]

    # Cross every candidate predicate with every field.
    simple_candidates = list(product(predicate_functions, fields))
    tfidf_candidates = list(product(
        [tfidf.TfidfPredicate(cutoff) for cutoff in tfidf_thresholds],
        fields))

    predicate_set = disjunctivePredicates(simple_candidates
                                          + tfidf_candidates)

    if not tfidf_candidates:
        # No TF-IDF predicates: every overlap lookup answers None.
        overlap = defaultdict(lambda: None)
    else:
        overlap = canopyOverlap(predicate_set,
                                duplicate_pairs + distinct_pairs,
                                df_index)

    return (duplicate_pairs, distinct_pairs, predicate_set, overlap)
def __init__(self, init=None):
    """
    Load or initialize a data model.

    #### Example usage

        # initialize from a settings file
        deduper = dedupe.Dedupe('my_learned_settings')

    or

        # initialize from a defined set of fields
        fields = {'Site name': {'type': 'String'},
                  'Address': {'type': 'String'},
                  'Zip': {'type': 'String', 'Has Missing':True},
                  'Phone': {'type': 'String', 'Has Missing':True},
                  }

        deduper = dedupe.Dedupe(fields)

    #### Keyword arguments

    `init`
    A field definition or a file location for a settings file.
    A field definition may be a `dict` or any `dict` subclass; a
    settings file location may be a `str` or any `str` subclass.

    #### Raises

    `ValueError` if `init` is missing or empty, or if it is neither
    a field definition nor a settings file location.

    #### Additional detail

    A field definition is a dictionary where the keys are the fields
    that will be used for training a model and the values are the
    field specification

    Field types include

    - String

    A 'String' type field must have as its key a name of a field
    as it appears in the data dictionary and a type declaration
    ex. `{'Phone': {'type': 'String'}}`

    Longer example of a field definition:

        fields = {'name': {'type': 'String'},
                  'address': {'type': 'String'},
                  'city': {'type': 'String'},
                  'cuisine': {'type': 'String'}
                  }

    Settings files are typically generated by saving the settings
    learned in a previous session. If you need details for this
    file see the method [`writeSettings`][[api.py#writesettings]].
    """
    # Anything falsy (None, {}, '', ...) counts as "no input".
    if not init:
        raise ValueError('No Input: must supply either a field '
                         'definition or a settings file.')

    # isinstance, rather than an exact class comparison, so that
    # subclasses such as OrderedDict are accepted as field definitions.
    if isinstance(init, dict):
        self.data_model = _initializeDataModel(init)
        # Predicates are not known until they are learned or loaded.
        self.predicates = None
    elif isinstance(init, str):
        (self.data_model, self.predicates) = self._readSettings(init)
    else:
        raise ValueError('Incorrect Input Type: must supply either a '
                         'field definition or a settings file.')

    self.training_data = None
    self.training_pairs = None
    self.data_sample = None
    self.dupes = None

    self.training_encoder = training_serializer._to_json
    self.training_decoder = training_serializer.dedupe_decoder

    string_predicates = (predicates.wholeFieldPredicate,
                         predicates.tokenFieldPredicate,
                         predicates.commonIntegerPredicate,
                         predicates.sameThreeCharStartPredicate,
                         predicates.sameFiveCharStartPredicate,
                         predicates.sameSevenCharStartPredicate,
                         predicates.nearIntegersPredicate,
                         predicates.commonFourGram,
                         predicates.commonSixGram)

    # One TF-IDF predicate per similarity threshold.
    tfidf_string_predicates = tuple(tfidf.TfidfPredicate(threshold)
                                    for threshold in (0.2, 0.4, 0.6, 0.8))

    # Blocking predicates available for each supported field type.
    self.blocker_types = {
        'String': string_predicates + tfidf_string_predicates
    }