def __init__(self, variable_definition, data_sample=None, num_cores=None):
    """
    Initialize from a data model and data sample.

    #### Example usage

        # initialize from a defined set of fields
        fields = [{'field' : 'Site name', 'type': 'String'},
                  {'field' : 'Address', 'type': 'String'},
                  {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                  {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                  ]

        data_sample = [
                       (
                        (854, {'city': 'san francisco',
                               'address': '300 de haro st.',
                               'name': "sally's cafe & bakery",
                               'cuisine': 'american'}),
                        (855, {'city': 'san francisco',
                               'address': '1328 18th st.',
                               'name': 'san francisco bbq',
                               'cuisine': 'thai'})
                        )
                       ]

        deduper = dedupe.Dedupe(fields, data_sample)

    #### Additional detail

    A field definition is a list of dictionaries where each dictionary
    describes a variable to use for comparing records.

    For details about variable types, check the documentation at
    http://dedupe.readthedocs.org

    In the data_sample, each element is a tuple of two records. Each
    record is, in turn, a tuple of the record's key and a record
    dictionary.

    In the record dictionary the keys are the names of the record
    fields and the values are the record values.

    #### Keyword arguments

    variable_definition -- passed unchanged to `DataModel` to build
                           `self.data_model`.
    data_sample -- optional sample of record pairs. When it is truthy it
                   is validated with `self._checkDataSample` and used to
                   build `self.activeLearner`; otherwise
                   `self.data_sample` is set to `[]` and
                   `self.activeLearner` to `None`.
    num_cores -- value stored in `self.num_cores`; when `None`,
                 `multiprocessing.cpu_count()` is used instead.
    """
    self.data_model = DataModel(variable_definition)

    if num_cores is None:
        self.num_cores = multiprocessing.cpu_count()
    else:
        self.num_cores = num_cores

    self.data_sample = data_sample

    if self.data_sample:
        self._checkDataSample(self.data_sample)
        self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model,
                                                     self.num_cores)
    else:
        self.data_sample = []
        self.activeLearner = None

    # Empty structured array: one row per labelled pair, holding the
    # label and one 'f4' distance slot per entry of the data model.
    training_dtype = [('label', 'S8'),
                      ('distances', 'f4', (len(self.data_model), ))]
    self.training_data = numpy.zeros(0, dtype=training_dtype)

    # Seed from a sequence of pairs rather than a dict literal: a plain
    # dict does not guarantee key order on every supported interpreter,
    # which would make the OrderedDict's order arbitrary.
    self.training_pairs = OrderedDict([(u'distinct', []),
                                       (u'match', [])])

    self.blocker = None
def __init__(self, field_definition, data_sample=None, num_processes=1):
    """
    Initialize from a data model and data sample.

    #### Example usage

        # initialize from a defined set of fields
        fields = {'Site name': {'type': 'String'},
                  'Address': {'type': 'String'},
                  'Zip': {'type': 'String', 'Has Missing':True},
                  'Phone': {'type': 'String', 'Has Missing':True},
                  }

        data_sample = [
                       (
                        (854, {'city': 'san francisco',
                               'address': '300 de haro st.',
                               'name': "sally's cafe & bakery",
                               'cuisine': 'american'}),
                        (855, {'city': 'san francisco',
                               'address': '1328 18th st.',
                               'name': 'san francisco bbq',
                               'cuisine': 'thai'})
                        )
                       ]

        deduper = dedupe.Dedupe(fields, data_sample)

    #### Additional detail

    A field definition is a dictionary where the keys are the fields
    that will be used for training a model and the values are the
    field specification.

    Field types include

    - String

    A 'String' type field must have as its key a name of a field as it
    appears in the data dictionary and a type declaration,
    ex. `{'Phone': {'type': 'String'}}`

    Longer example of a field definition:

        fields = {'name': {'type': 'String'},
                  'address': {'type': 'String'},
                  'city': {'type': 'String'},
                  'cuisine': {'type': 'String'}
                  }

    In the data_sample, each element is a tuple of two records. Each
    record is, in turn, a tuple of the record's key and a record
    dictionary.

    In the record dictionary the keys are the names of the record
    fields and the values are the record values.

    #### Keyword arguments

    field_definition -- a dict (or dict subclass) passed to `DataModel`
                        to build `self.data_model`.
    data_sample -- optional sample of record pairs. When it is truthy it
                   is validated with `self._checkDataSample` and used to
                   build `self.activeLearner`; otherwise
                   `self.activeLearner` is `None`.
    num_processes -- stored unchanged in `self.num_processes`.

    Raises ValueError if field_definition is not a dictionary.
    """
    super(ActiveMatching, self).__init__()

    # isinstance rather than an exact class comparison, so that dict
    # subclasses (e.g. an OrderedDict of fields) are accepted as well.
    if not isinstance(field_definition, dict):
        raise ValueError('Incorrect Input Type: must supply '
                         'a field definition.')

    self.data_model = DataModel(field_definition)

    self.data_sample = data_sample

    if self.data_sample:
        self._checkDataSample(self.data_sample)
        self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model)
    else:
        self.activeLearner = None

    self.num_processes = num_processes

    # Empty structured array: one row per labelled pair, holding the
    # label and one 'f4' distance slot per entry of data_model['fields'].
    training_dtype = [('label', 'S8'),
                      ('distances',
                       'f4',
                       (len(self.data_model['fields']), ))]
    self.training_data = numpy.zeros(0, dtype=training_dtype)

    # Seed from a sequence of pairs rather than a dict literal: a plain
    # dict does not guarantee key order on every supported interpreter,
    # which would make the OrderedDict's order arbitrary.
    self.training_pairs = dedupe.backport.OrderedDict([('distinct', []),
                                                       ('match', [])])