コード例 #1
0
def activeLearning(data_d, data_model, labelPairFunction, num_questions) :
    """
    Run a fixed number of active-learning rounds and return what was learned.

    Every round asks findUncertainPairs for an ordering of the remaining
    record distances, takes the first entry of that ordering, has
    labelPairFunction label it, adds the labels to the training data and
    retrains the data model.

    Arguments:
    data_d -- the records; handed to blocking.allCandidates,
              core.recordDistances and labelPairFunction
    data_model -- the model that is retrained after every round
    labelPairFunction -- called as labelPairFunction(pairs, data_d,
                         data_model); its result is indexed with 0
                         (nonduplicates) and 1 (duplicates)
    num_questions -- how many rounds to run

    Returns a tuple (training_data, training_pairs, data_model) where
    training_pairs is {0: nonduplicates, 1: duplicates}.
    """
    num_iterations = 100
    training_data = []
    labeled_distinct = []
    labeled_same = []

    candidate_pairs = blocking.allCandidates(data_d)
    record_distances = core.recordDistances(candidate_pairs, data_d, data_model)

    for _ in range(num_questions) :
        print("finding the next uncertain pair ...")
        ordering = findUncertainPairs(record_distances, data_model)

        # Reorder by the indices findUncertainPairs returned, take the first
        # entry as this round's question and drop it from the pool.
        # NOTE(review): presumably a numpy structured array with a 'pairs'
        # field -- confirm against core.recordDistances.
        record_distances = record_distances[: , ordering]
        uncertain_pairs = record_distances['pairs'][0:1]
        record_distances = record_distances[1:]

        answers = labelPairFunction(uncertain_pairs, data_d, data_model)
        labeled_distinct.extend(answers[0])
        labeled_same.extend(answers[1])

        training_data = addTrainingData(answers, training_data, data_model)
        data_model = core.trainModel(training_data, num_iterations, data_model)

    return (training_data, {0 : labeled_distinct, 1 : labeled_same}, data_model)
コード例 #2
0
ファイル: training.py プロジェクト: nilesh-c/dedupe
def activeLearning(candidates,
                   data_model,
                   labelPairFunction,
                   training_data,
                   training_pairs=None):
    """
    Ask the user to label the record pair we are most uncertain of. Train the
    data model, and update our uncertainty. Repeat until the user tells us she
    is finished, or until every candidate pair has been presented once.

    Arguments:
    candidates -- indexable sequence of record pairs; each pair is read as
                  pair[0][1] and pair[1][1] (presumably (key, record)
                  tuples -- confirm against the caller)
    data_model -- mapping with a 'fields' entry; retrained after every round
    labelPairFunction -- called as labelPairFunction(pairs, fields); must
                         return (labeled_pairs, finished) where labeled_pairs
                         is indexable by 0 (nonduplicates) and 1 (duplicates)
    training_data -- existing training data, extended via addTrainingData
    training_pairs -- optional mapping with keys 0 and 1 holding previously
                      labeled pairs to carry over

    Returns a tuple (training_data, training_pairs, data_model).

    Raises ValueError if there is still no training data after a labeling
    round.
    """
    import time

    # Fields of type 'Missing Data' are not shown to the labeler.
    fields = [field for field in data_model['fields']
              if data_model['fields'][field]['type'] != 'Missing Data']

    duplicates = []
    nonduplicates = []

    if training_pairs:
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    t_train = time.time()
    field_distances = core.fieldDistances(candidates, data_model)
    logging.info('calculated fieldDistances in %s seconds',
                 str(time.time() - t_train))

    seen_indices = set()
    finished = False

    # Compared with == on purpose: only a falsy value equal to False keeps
    # the loop going, exactly as before (None, for instance, stops it).
    while finished == False:
        logging.info('finding the next uncertain pair ...')
        uncertain_indices = findUncertainPairs(field_distances, data_model)

        # Take the first index, in the order returned, that has not been
        # asked about yet.
        for uncertain_index in uncertain_indices:
            if uncertain_index not in seen_indices:
                seen_indices.add(uncertain_index)
                break
        else:
            # Nothing new to ask: previously this either failed on an unbound
            # name (no indices at all) or kept re-asking the last pair.
            logging.info('no unseen candidate pairs left to label')
            break

        uncertain_pairs = [(candidates[uncertain_index][0][1],
                            candidates[uncertain_index][1][1])]

        (labeled_pairs, finished) = labelPairFunction(uncertain_pairs, fields)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs, data_model, training_data)
        if len(training_data) > 0:
            data_model = core.trainModel(training_data, data_model, 1)
        else:
            raise ValueError('No training pairs given')

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)
コード例 #3
0
def activeLearning(candidates,
                   data_model,
                   labelPairFunction,
                   training_data,
                   training_pairs=None):
    """
    Ask the user to label the record pair we are most uncertain of. Train the
    data model, and update our uncertainty. Repeat until the user tells us she
    is finished, or until every candidate pair has been presented once.

    Arguments:
    candidates -- indexable sequence of record pairs; each pair is read as
                  pair[0][1] and pair[1][1] (presumably (key, record)
                  tuples -- confirm against the caller)
    data_model -- mapping with a 'fields' entry; retrained after every round
    labelPairFunction -- called as labelPairFunction(pairs, fields); must
                         return (labeled_pairs, finished) where labeled_pairs
                         is indexable by 0 (nonduplicates) and 1 (duplicates)
    training_data -- existing training data, extended via addTrainingData
    training_pairs -- optional mapping with keys 0 and 1 holding previously
                      labeled pairs to carry over

    Returns a tuple (training_data, training_pairs, data_model).

    Raises ValueError if there is still no training data after a labeling
    round.
    """
    import time

    # Fields of type 'Missing Data' are not shown to the labeler.
    fields = [field for field in data_model['fields']
              if data_model['fields'][field]['type'] != 'Missing Data']

    duplicates = []
    nonduplicates = []

    if training_pairs:
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    t_train = time.time()
    field_distances = core.fieldDistances(candidates, data_model)
    logging.info('calculated fieldDistances in %s seconds',
                 str(time.time() - t_train))

    seen_indices = set()
    finished = False

    # Compared with == on purpose: only a falsy value equal to False keeps
    # the loop going, exactly as before (None, for instance, stops it).
    while finished == False:
        logging.info('finding the next uncertain pair ...')
        uncertain_indices = findUncertainPairs(field_distances, data_model)

        # Take the first index, in the order returned, that has not been
        # asked about yet.
        for uncertain_index in uncertain_indices:
            if uncertain_index not in seen_indices:
                seen_indices.add(uncertain_index)
                break
        else:
            # Nothing new to ask: previously this either failed on an unbound
            # name (no indices at all) or kept re-asking the last pair.
            logging.info('no unseen candidate pairs left to label')
            break

        uncertain_pairs = [(candidates[uncertain_index][0][1],
                            candidates[uncertain_index][1][1])]

        (labeled_pairs, finished) = labelPairFunction(uncertain_pairs, fields)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs, data_model, training_data)
        if len(training_data) > 0:
            data_model = core.trainModel(training_data, data_model, 1)
        else:
            raise ValueError('No training pairs given')

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)
コード例 #4
0
ファイル: training_sample.py プロジェクト: derwiki/dedupe
def activeLearning(data_d,
                   data_model,
                   labelPairFunction,
                   training_data,
                   training_pairs = None,
                   key_groups = None
                   ):
    """
    Repeatedly ask labelPairFunction to label the next uncertain record pair,
    retraining the data model after every answer, until the labeling function
    reports that it is finished.

    Arguments:
    data_d -- mapping from record id to record; used to build the candidate
              pairs and to look up the records shown to the labeler
    data_model -- the model that is retrained after every round
    labelPairFunction -- called as labelPairFunction(pairs, data_model); must
                         return (labeled_pairs, finished) where labeled_pairs
                         is indexable by 0 (nonduplicates) and 1 (duplicates)
    training_data -- existing training data, extended via addTrainingData
    training_pairs -- optional mapping with keys 0 and 1 holding previously
                      labeled pairs to carry over
    key_groups -- passed through to blocking.allCandidates; defaults to an
                  empty list

    Returns a tuple (training_data, training_pairs, data_model).

    Raises ValueError if there is still no training data after a labeling
    round.
    """
    import time

    # A fresh list per call instead of a shared mutable default argument.
    if key_groups is None :
        key_groups = []

    duplicates = []
    nonduplicates = []

    if training_pairs :
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    finished = False
    candidates = blocking.allCandidates(data_d, key_groups)

    t_train = time.time()
    record_distances = core.recordDistances(candidates, data_model)
    # Single-argument print calls behave the same as a statement and as a
    # function; the double space reproduces the separator the original
    # comma-separated print statement emitted.
    print('calculated recordDistances in  %s seconds' % (time.time() - t_train))

    # Compared with == on purpose: only a value equal to False keeps the
    # loop going, exactly as before.
    while finished == False :
        print('finding the next uncertain pair ...')
        uncertain_indices = findUncertainPairs(record_distances,
                                               data_model)

        # Pop the next most uncertain pair off of record distances.
        # NOTE(review): presumably a numpy structured array with a 'pairs'
        # field -- confirm against core.recordDistances.
        record_distances = record_distances[:, uncertain_indices]
        uncertain_pair_ids = (record_distances['pairs'])[0:1]
        record_distances = record_distances[1:]

        # Turn each pair of record ids into a tuple of the records themselves.
        uncertain_pairs = [tuple([data_d[instance] for instance in pair])
                           for pair in uncertain_pair_ids]

        labeled_pairs, finished = labelPairFunction(uncertain_pairs,
                                                    data_model)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs,
                                        data_model,
                                        training_data)
        if len(training_data) > 0 :
            data_model = core.trainModel(training_data, data_model, 1)
        else :
            raise ValueError("No training pairs given")

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)
コード例 #5
0
ファイル: dedupe.py プロジェクト: derwiki/dedupe
    def train(self, data_d, training_source=None, key_groups=None) :
        """
        Learn field weights and blocking predicate from file of
        labeled examples or round of interactive labeling

        Keyword arguments:
        data_d -- a dictionary of records
        training_source -- either a path to a file of labeled examples or
                           a labeling function
        key_groups -- passed through to training_sample.activeLearning when
                      labeling interactively; defaults to an empty list

        Raises ValueError if training_source is neither a str nor a
        function.

        In the dictionary of records, the keys are unique identifiers
        for each record, the values are a dictionary where the keys
        are the names of the record field and values are the record
        values.

        For Example,
        {
         854: {'city': 'san francisco',
               'address': '300 de haro st.',
               'name': "sally's cafe & bakery",
               'cuisine': 'american'},
         855: {'city': 'san francisco',
               'address': '1328 18th st.',
               'name': 'san francisco bbq',
               'cuisine': 'thai'}
         }

        The labeling function will be used to do active learning. The
        function will be supplied a list of examples that the learner
        is the most 'curious' about, that is examples where we are most
        uncertain about how they should be labeled. The labeling function
        will label these, and based upon what we learn from these
        examples, the labeling function will be supplied with new
        examples that the learner is now most curious about.  This will
        continue until the labeling function sends a message that
        it is done labeling.

        The labeling function must be a function that takes two
        arguments.  The first argument is a sequence of pairs of
        records. The second argument is the data model.

        The labeling function must return two outputs. The function
        must return a dictionary of labeled pairs and a finished flag.

        The dictionary of labeled pairs must have two keys, 1 and 0,
        corresponding to record pairs that are duplicates or
        nonduplicates respectively. The values of the dictionary must
        be a sequence of records pairs, like the sequence that was
        passed in.

        The 'finished' flag should take the value False for active
        learning to continue, and the value True to stop active learning.

        i.e.

        labelFunction(record_pairs, data_model) :
            ...
            return (labeled_pairs, finished)

        For a working example, see consoleLabel in training_sample

        Labeled example files are typically generated by saving the
        examples labeled in a previous session. If you need details
        for this file see the method writeTraining.
        """
        # A fresh list per call instead of a shared mutable default argument.
        if key_groups is None :
            key_groups = []

        from_file = isinstance(training_source, str)
        from_labeler = isinstance(training_source, types.FunctionType)
        if not (from_file or from_labeler) :
            raise ValueError('training_source must be a path to a file of '
                             'labeled examples (str) or a labeling function')

        # data_d = core.sampleDict(data_d, 700) #we should consider changing this
        # The double space reproduces the separator that the original
        # comma-separated print statement emitted.
        print("data_d length:  %s" % len(data_d))

        # Freeze every record (core.frozendict) before storing it on self.
        self.data_d = dict((key, core.frozendict(value))
                           for key, value in data_d.items())

        if from_file :
            print('reading training from file')
            if not hasattr(self, 'training_data'):
                self.initializeTraining(training_source)

            self.training_pairs, self.training_data = self._readTraining(training_source,
                                                                        self.training_data)

        else :
            if not hasattr(self, 'training_data'):
                self.initializeTraining()

            (self.training_data,
            self.training_pairs,
            self.data_model) = training_sample.activeLearning(self.data_d,
                                                              self.data_model,
                                                              training_source,
                                                              self.training_data,
                                                              self.training_pairs,
                                                              key_groups)

        # Pick alpha by grid search (k=20), then retrain on all of the
        # training data with that alpha.
        self.alpha = crossvalidation.gridSearch(self.training_data,
                                                core.trainModel,
                                                self.data_model,
                                                k=20)

        self.data_model = core.trainModel(self.training_data,
                                          self.data_model,
                                          self.alpha)

        self._printLearnedWeights()
コード例 #6
0
ファイル: training.py プロジェクト: nikitsaraf/dedupe
def activeLearning(candidates,
                   data_model,
                   labelPairFunction,
                   training_data,
                   training_pairs=None):
    """
    Ask the user to label the record pair we are most uncertain of. Train the
    data model, and update our uncertainty. Repeat until the user tells us she
    is finished, or until every candidate pair has been presented once.

    Arguments:
    candidates -- indexable sequence of record pairs
    data_model -- mapping with a 'fields' entry; retrained after every round
    labelPairFunction -- called as labelPairFunction(pairs, fields); must
                         return (labeled_pairs, finished) where labeled_pairs
                         is indexable by 0 (nonduplicates) and 1 (duplicates)
    training_data -- existing training data; must expose .shape and is
                     extended via addTrainingData
    training_pairs -- optional mapping with keys 0 and 1 holding previously
                      labeled pairs to carry over

    Returns a tuple (training_data, training_pairs, data_model).

    Raises ValueError if there is still no training data after a labeling
    round, or if training_data is empty and there are no candidates to seed
    it from.
    """
    import time

    # These field types are not shown to the labeler.
    fields = [
        field for field in data_model['fields']
        if data_model['fields'][field]['type'] not in ('Missing Data',
                                                       'Interaction',
                                                       'Higher Categories')
    ]

    duplicates = []
    nonduplicates = []

    if training_pairs:
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    if training_data.shape[0] == 0:
        # Seed the training data with one randomly chosen candidate, added
        # twice as a duplicate example. randrange excludes its upper bound;
        # randint(0, len(candidates)) could return len(candidates) and index
        # one past the end.
        rand_int = random.randrange(len(candidates))
        exact_match = candidates[rand_int]
        training_data = addTrainingData({
            1: [exact_match] * 2,
            0: []
        }, data_model, training_data)

    data_model = core.trainModel(training_data, data_model, .1)

    t_train = time.time()
    field_distances = core.fieldDistances(candidates, data_model)
    logging.info('calculated fieldDistances in %s seconds',
                 str(time.time() - t_train))

    seen_indices = set()
    finished = False

    # Compared with == on purpose: only a falsy value equal to False keeps
    # the loop going, exactly as before (None, for instance, stops it).
    while finished == False:
        logging.info('finding the next uncertain pair ...')
        # The third argument is the ratio of duplicates to nonduplicates
        # labeled so far (+1.0 avoids dividing by zero and forces float
        # division).
        uncertain_indices = findUncertainPairs(field_distances, data_model,
                                               (len(duplicates) /
                                                (len(nonduplicates) + 1.0)))

        # Take the first index, in the order returned, that has not been
        # asked about yet.
        for uncertain_index in uncertain_indices:
            if uncertain_index not in seen_indices:
                seen_indices.add(uncertain_index)
                break
        else:
            # Nothing new to ask: previously this either failed on an unbound
            # name (no indices at all) or kept re-asking the last pair.
            logging.info('no unseen candidate pairs left to label')
            break

        uncertain_pairs = [candidates[uncertain_index]]

        (labeled_pairs, finished) = labelPairFunction(uncertain_pairs, fields)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs, data_model,
                                        training_data)

        if len(training_data) > 0:
            data_model = core.trainModel(training_data, data_model, .1)
        else:
            raise ValueError('No training pairs given')

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)
コード例 #7
0
ファイル: dedupe.py プロジェクト: JeffDonovan/dedupe
    def train(self, data_sample, training_source=None, key_groups=None):
        """
        Learn field weights and blocking predicate from file of
        labeled examples or round of interactive labeling

        Keyword arguments:
        data_sample -- a sample of record pairs
        training_source -- either a path to a file of labeled examples or
                           a labeling function
        key_groups -- passed through to training.activeLearning when
                      labeling interactively; defaults to an empty list

        Raises ValueError if training_source is neither a str nor a
        function.

        In the sample of record_pairs, each element is a tuple of two
        records. Each record is, in turn, a tuple of the record's key and
        a record dictionary.

        In the record dictionary the keys are the names of the
        record field and values are the record values.

        For example, a data_sample with only one pair of records,

        [
          (
           (854, {'city': 'san francisco',
                  'address': '300 de haro st.',
                  'name': "sally's cafe & bakery",
                  'cuisine': 'american'}),
           (855, {'city': 'san francisco',
                 'address': '1328 18th st.',
                 'name': 'san francisco bbq',
                 'cuisine': 'thai'})
           )
         ]

        The labeling function will be used to do active learning. The
        function will be supplied a list of examples that the learner
        is the most 'curious' about, that is examples where we are most
        uncertain about how they should be labeled. The labeling function
        will label these, and based upon what we learn from these
        examples, the labeling function will be supplied with new
        examples that the learner is now most curious about.  This will
        continue until the labeling function sends a message that
        it is done labeling.

        The labeling function must be a function that takes two
        arguments.  The first argument is a sequence of pairs of
        records. The second argument is the data model.

        The labeling function must return two outputs. The function
        must return a dictionary of labeled pairs and a finished flag.

        The dictionary of labeled pairs must have two keys, 1 and 0,
        corresponding to record pairs that are duplicates or
        nonduplicates respectively. The values of the dictionary must
        be a sequence of records pairs, like the sequence that was
        passed in.

        The 'finished' flag should take the value False for active
        learning to continue, and the value True to stop active learning.

        i.e.

        labelFunction(record_pairs, data_model) :
            ...
            return (labeled_pairs, finished)

        For a working example, see consoleLabel in training

        Labeled example files are typically generated by saving the
        examples labeled in a previous session. If you need details
        for this file see the method writeTraining.
        """
        # A fresh list per call instead of a shared mutable default argument.
        if key_groups is None:
            key_groups = []

        # Stored before validation, as before, so the attribute is set even
        # when training_source turns out to be invalid.
        self.data_sample = data_sample

        from_file = isinstance(training_source, str)
        from_labeler = isinstance(training_source, types.FunctionType)
        if not (from_file or from_labeler):
            raise ValueError("training_source must be a path to a file of "
                             "labeled examples (str) or a labeling function")

        if from_file:
            logging.info("reading training from file")
            if not hasattr(self, "training_data"):
                self.initializeTraining(training_source)

            self.training_pairs, self.training_data = self._readTraining(training_source, self.training_data)

        else:
            if not hasattr(self, "training_data"):
                self.initializeTraining()

            # NOTE(review): six positional arguments are passed here; confirm
            # that this project's training.activeLearning accepts key_groups.
            (self.training_data, self.training_pairs, self.data_model) = training.activeLearning(
                self.data_sample, self.data_model, training_source, self.training_data, self.training_pairs, key_groups
            )

        # Pick alpha by grid search (k=20), then retrain on all of the
        # training data with that alpha.
        self.alpha = crossvalidation.gridSearch(self.training_data, core.trainModel, self.data_model, k=20)

        self.data_model = core.trainModel(self.training_data, self.data_model, self.alpha)

        self._logLearnedWeights()
コード例 #8
0
ファイル: training.py プロジェクト: BrianSipple/dedupe
def activeLearning(candidates,
                   data_model,
                   labelPairFunction,
                   training_data,
                   training_pairs=None):
    """
    Ask the user to label the record pair we are most uncertain of. Train the
    data model, and update our uncertainty. Repeat until the user tells us she
    is finished, or until every candidate pair has been presented once.

    Arguments:
    candidates -- indexable sequence of record pairs
    data_model -- mapping with a 'fields' entry; retrained after every round
    labelPairFunction -- called as labelPairFunction(pairs, fields); must
                         return (labeled_pairs, finished) where labeled_pairs
                         is indexable by 0 (nonduplicates) and 1 (duplicates)
    training_data -- existing training data; must expose .shape and is
                     extended via addTrainingData
    training_pairs -- optional mapping with keys 0 and 1 holding previously
                      labeled pairs to carry over

    Returns a tuple (training_data, training_pairs, data_model).

    Raises ValueError if there is still no training data after a labeling
    round, or if training_data is empty and there are no candidates to seed
    it from.
    """
    import time

    # These field types are not shown to the labeler.
    hidden_types = ('Missing Data', 'Interaction', 'Higher Categories')
    fields = [field for field in data_model['fields']
              if data_model['fields'][field]['type'] not in hidden_types]

    duplicates = []
    nonduplicates = []

    if training_pairs:
        nonduplicates.extend(training_pairs[0])
        duplicates.extend(training_pairs[1])

    if training_data.shape[0] == 0:
        # Seed the training data with one randomly chosen candidate, added
        # twice as a duplicate example. randrange excludes its upper bound;
        # randint(0, len(candidates)) could return len(candidates) and index
        # one past the end.
        rand_int = random.randrange(len(candidates))
        exact_match = candidates[rand_int]
        training_data = addTrainingData({1: [exact_match] * 2,
                                         0: []},
                                        data_model,
                                        training_data)

    data_model = core.trainModel(training_data, data_model, .1)

    t_train = time.time()
    field_distances = core.fieldDistances(candidates, data_model)
    logging.info('calculated fieldDistances in %s seconds',
                 str(time.time() - t_train))

    seen_indices = set()
    finished = False

    # Compared with == on purpose: only a falsy value equal to False keeps
    # the loop going, exactly as before (None, for instance, stops it).
    while finished == False:
        logging.info('finding the next uncertain pair ...')
        # The third argument is the ratio of duplicates to nonduplicates
        # labeled so far (+1.0 avoids dividing by zero and forces float
        # division).
        uncertain_indices = findUncertainPairs(field_distances,
                                               data_model,
                                               (len(duplicates) /
                                                (len(nonduplicates) + 1.0)))

        # Take the first index, in the order returned, that has not been
        # asked about yet.
        for uncertain_index in uncertain_indices:
            if uncertain_index not in seen_indices:
                seen_indices.add(uncertain_index)
                break
        else:
            # Nothing new to ask: previously this either failed on an unbound
            # name (no indices at all) or kept re-asking the last pair.
            logging.info('no unseen candidate pairs left to label')
            break

        uncertain_pairs = [candidates[uncertain_index]]

        (labeled_pairs, finished) = labelPairFunction(uncertain_pairs, fields)

        nonduplicates.extend(labeled_pairs[0])
        duplicates.extend(labeled_pairs[1])

        training_data = addTrainingData(labeled_pairs, data_model, training_data)

        if len(training_data) > 0:
            data_model = core.trainModel(training_data, data_model, .1)
        else:
            raise ValueError('No training pairs given')

    training_pairs = {0: nonduplicates, 1: duplicates}

    return (training_data, training_pairs, data_model)