def train_linker(df1, df2):
    """Build and train a record linker over the two record collections.

    Either restores a fully-trained ``dedupe.StaticRecordLink`` from
    SETTINGS_FILE, or trains a fresh ``dedupe.RecordLink``: sample pairs
    from ``df1``/``df2``, label them (from TRAINING_FILE or interactively
    on the console), train, and optionally persist settings/training.

    Returns the ready-to-use linker.
    """
    if READ_FROM_SETTINGS_FILE:
        # A static linker is already trained; nothing further to do.
        with open(SETTINGS_FILE, 'rb') as settings_handle:
            return dedupe.StaticRecordLink(settings_handle)

    linker = dedupe.RecordLink(FIELDS)
    # It's terrible that you have to do this next line!!!!
    linker.classifier = rlr.RegularizedLogisticRegression()
    linker.sample(df1, df2, BLOCKING_TRAINING_SAMPLE_SIZE)

    if READ_FROM_TRAINING_FILE:
        print('reading labeled examples from ', TRAINING_FILE)
        with open(TRAINING_FILE, 'rb') as training_handle:
            linker.readTraining(training_handle)
    else:
        # No saved labels: ask a human to label pairs interactively.
        dedupe.consoleLabel(linker)

    linker.train()

    if WRITE_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'wb') as settings_handle:
            linker.writeSettings(settings_handle)
    if WRITE_TRAINING_FILE:
        with open(TRAINING_FILE, 'w') as training_handle:
            linker.writeTraining(training_handle)

    return linker
class ActiveMatching(Matching):
    """
    Class for training dedupe extends Matching.

    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    - cleanupTraining
    """
    # NOTE: the docstring above was previously placed after these class
    # attributes, where it was a bare string expression and never became
    # __doc__; it is now the first statement in the class body.

    classifier = rlr.RegularizedLogisticRegression()
    ActiveLearner = labeler.RLRLearner

    def __init__(self, variable_definition, data_sample=None, num_cores=None):
        """
        Initialize from a data model and data sample.

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'Site name', 'type': 'String'},
                      {'field' : 'Address', 'type': 'String'},
                      {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                      {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                      ]

            deduper = dedupe.Dedupe(fields)

        #### Additional detail

        A field definition is a list of dictionaries where each dictionary
        describes a variable to use for comparing records.

        For details about variable types, check the documentation.
        <https://dedupe.io/developers/library>`_
        """
        self.data_model = datamodel.DataModel(variable_definition)

        if data_sample is not None:
            # data_sample is accepted only so old call sites fail loudly.
            raise UserWarning(
                'data_sample is deprecated, use the .sample method')

        if num_cores is None:
            self.num_cores = multiprocessing.cpu_count()
        else:
            self.num_cores = num_cores

        # Created lazily (presumably by a sample/prepare step elsewhere in
        # the file) -- TODO confirm where active_learner gets assigned.
        self.active_learner = None

        self.training_pairs = OrderedDict({u'distinct': [],
                                           u'match': []})

        self.blocker = None
        self.loaded_indices = False

    def cleanupTraining(self):  # pragma: no cover
        '''
        Clean up data we used for training. Free up memory.
        '''
        del self.training_pairs
        del self.active_learner

    def readTraining(self, training_file):
        '''
        Read training from previously built training data file object

        Arguments:

        training_file -- file object containing the training data
        '''
        logger.info('reading training from file')
        training_pairs = json.load(training_file,
                                   cls=serializer.dedupe_decoder)
        self.markPairs(training_pairs)

    def train(self, recall=0.95, index_predicates=True):  # pragma: no cover
        """
        Fit the classifier on the labeled pairs, then learn blocking rules.

        Keyword arguments:

        recall -- The proportion of true dupe pairs in our training
                  data that the learned blocks must cover. If we lower
                  the recall, there will be pairs of true dupes that
                  we will never directly compare.

                  recall should be a float between 0.0 and 1.0, the
                  default is 0.95

        index_predicates -- Should dedupe consider predicates that
                            rely upon indexing the data. Index
                            predicates can be slower and take
                            substantial memory.

                            Defaults to True.
        """
        examples, y = flatten_training(self.training_pairs)
        self.classifier.fit(self.data_model.distances(examples), y)

        self._trainBlocker(recall, index_predicates)

    def _trainBlocker(self, recall, index_predicates):  # pragma: no cover
        # Learn a set of blocking predicates that covers the labeled
        # matches at the requested recall.
        matches = self.training_pairs['match'][:]

        predicate_set = self.data_model.predicates(index_predicates,
                                                   self.canopies)

        block_learner = self._blockLearner(predicate_set)

        self.predicates = block_learner.learn(matches, recall)

        self.blocker = blocking.Blocker(self.predicates)

    def writeTraining(self, file_obj):  # pragma: no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:

        file_obj -- file object to write training data to
        """
        json.dump(self.training_pairs,
                  file_obj,
                  default=serializer._to_json,
                  tuple_as_array=False,
                  ensure_ascii=True)

    def uncertainPairs(self):
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct. Useful for
        user labeling.
        '''
        return self.active_learner.get()

    def markPairs(self, labeled_pairs):
        '''
        Add labeled pairs to the training set and, if an active learner
        is attached, feed the new labels to it.

        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and `distinct`
                         the values are lists that can contain pairs of
                         records
        '''
        self._checkTrainingPairs(labeled_pairs)

        for label, examples in labeled_pairs.items():
            self.training_pairs[label].extend(examples)

        if self.active_learner:
            examples, y = flatten_training(labeled_pairs)
            self.active_learner.mark(examples, y)

    def _checkTrainingPairs(self, labeled_pairs):
        # Validate the shape of a labeled_pairs argument; raises
        # ValueError on malformed input.
        try:
            labeled_pairs.items()
            labeled_pairs[u'match']
            labeled_pairs[u'distinct']
        except (AttributeError, KeyError, TypeError):
            # Previously a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; only the failures that
            # signal a malformed argument are converted now.
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        if labeled_pairs[u'match']:
            pair = labeled_pairs[u'match'][0]
            self._checkRecordPair(pair)

        if labeled_pairs[u'distinct']:
            pair = labeled_pairs[u'distinct'][0]
            self._checkRecordPair(pair)

        if not labeled_pairs[u'distinct'] and not labeled_pairs[u'match']:
            warnings.warn("Didn't return any labeled record pairs")

    def _checkRecordPair(self, record_pair):
        # Validate that record_pair is a 2-sequence of record dicts and
        # that both records satisfy the data model.
        try:
            record_pair[0]
        except (TypeError, KeyError, IndexError):
            # was a bare `except:`; these are the failures of indexing
            # a non-sequence / empty / keyless object
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")

        if len(record_pair) != 2:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")

        try:
            record_pair[0].keys() and record_pair[1].keys()
        except AttributeError:
            # was a bare `except:`; only a non-mapping lacks .keys()
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self.data_model.check(record_pair[0])
        self.data_model.check(record_pair[1])
class ActiveMatching(Matching):
    # Shared default classifier; subclasses/instances may replace it.
    classifier = rlr.RegularizedLogisticRegression()

    """
    Class for training dedupe extends Matching.

    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    - cleanupTraining
    """

    def __init__(self, variable_definition, data_sample=None, num_cores=None):
        """
        Initialize from a data model and data sample.

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'Site name', 'type': 'String'},
                      {'field' : 'Address', 'type': 'String'},
                      {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                      {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                      ]

            data_sample = [
                           (
                            (854, {'city': 'san francisco',
                                   'address': '300 de haro st.',
                                   'name': "sally's cafe & bakery",
                                   'cuisine': 'american'}),
                            (855, {'city': 'san francisco',
                                   'address': '1328 18th st.',
                                   'name': 'san francisco bbq',
                                   'cuisine': 'thai'})
                            )
                           ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        A field definition is a list of dictionaries where each dictionary
        describes a variable to use for comparing records.

        For details about variable types, check the documentation.
        <http://dedupe.readthedocs.org>`_

        In the data_sample, each element is a tuple of two records. Each
        record is, in turn, a tuple of the record's key and a record
        dictionary. In the record dictionary the keys are the names of the
        record field and values are the record values.
        """
        self.data_model = datamodel.DataModel(variable_definition)

        if num_cores is None:
            self.num_cores = multiprocessing.cpu_count()
        else:
            self.num_cores = num_cores

        if data_sample:
            self._checkDataSample(data_sample)
            self.data_sample = data_sample
            self.activeLearner = training.ActiveLearning(self.data_sample,
                                                         self.data_model,
                                                         self.num_cores)
        else:
            self.data_sample = []
            self.activeLearner = None

        # Override _loadSampledRecords() to load blocking data from
        # data_sample.
        self._loadSampledRecords(data_sample)

        # Structured array layout: a label string plus one distance per
        # data-model field for each training pair.
        training_dtype = [('label', 'S8'),
                          ('distances', 'f4', (len(self.data_model), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)
        self.training_pairs = OrderedDict({u'distinct': [],
                                           u'match': []})

        self.blocker = None
        self.loaded_indices = False

    def cleanupTraining(self):  # pragma: no cover
        '''
        Clean up data we used for training. Free up memory.
        '''
        del self.training_data
        del self.training_pairs
        del self.activeLearner
        del self.data_sample

    def readTraining(self, training_file):
        '''
        Read training from previously built training data file object

        Arguments:

        training_file -- file object containing the training data
        '''
        logger.info('reading training from file')

        training_pairs = json.load(training_file,
                                   cls=serializer.dedupe_decoder)

        if not any(training_pairs.values()):
            raise EmptyTrainingException(
                "The training file seems to contain no training examples")

        for (label, examples) in training_pairs.items():
            if examples:
                # Only the first pair of each label is shape-checked.
                self._checkRecordPairType(examples[0])

            examples = core.freezeData(examples)

            training_pairs[label] = examples
            self.training_pairs[label].extend(examples)

        self._addTrainingData(training_pairs)

        self._trainClassifier()

    def train(self, ppc=None, uncovered_dupes=None,
              maximum_comparisons=1000000, recall=0.95,
              index_predicates=True):  # pragma: no cover
        """Keyword arguments:

        maximum_comparisons -- The maximum number of comparisons a
                               blocking rule is allowed to make.

                               Defaults to 1000000

        recall -- The proportion of true dupe pairs in our training
                  data that the learned blocks must cover. If we
                  lower the recall, there will be pairs of true dupes
                  that we will never directly compare.

                  recall should be a float between 0.0 and 1.0, the
                  default is 0.95

        index_predicates -- Should dedupe consider predicates that
                            rely upon indexing the data. Index
                            predicates can be slower and take
                            substantial memory.

                            Defaults to True.

        ppc and uncovered_dupes are deprecated and ignored apart from
        emitting a warning.
        """
        if ppc is not None:
            warnings.warn('`ppc` is a deprecated argument to train. '
                          'Use `maximum_comparisons` to set the maximum '
                          'number records a block is allowed to cover')
        if uncovered_dupes is not None:
            warnings.warn('`uncovered_dupes` is a deprecated argument '
                          'to train. Use recall to set the proportion '
                          'of true pairs that the blocking rules must cover')

        self._trainClassifier()
        self._trainBlocker(maximum_comparisons, recall, index_predicates)

    def _trainClassifier(self, **kwargs):  # pragma: no cover
        # Fit the classifier on the accumulated distance vectors; extra
        # kwargs are forwarded only if the classifier's fit() accepts them.
        labels = numpy.array(self.training_data['label'] == b'match',
                             dtype='int8')
        examples = self.training_data['distances']

        classifier_args = backport.signature(self.classifier.fit).parameters
        classifier_args = {k: kwargs[k]
                           for k in viewkeys(kwargs) & classifier_args}

        self.classifier.fit(examples, labels, **classifier_args)

    def _trainBlocker(self, maximum_comparisons, recall,
                      index_predicates):  # pragma: no cover
        # Learn blocking predicates that cover the labeled matches.
        matches = self.training_pairs['match'][:]

        predicate_set = self.data_model.predicates(index_predicates,
                                                   self.canopies)

        block_learner = self._blockLearner(predicate_set)

        self.predicates = block_learner.learn(matches,
                                              maximum_comparisons,
                                              recall)

        self.blocker = blocking.Blocker(self.predicates)

    def writeTraining(self, file_obj):  # pragma: no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:

        file_obj -- file object to write training data to
        """
        json.dump(self.training_pairs,
                  file_obj,
                  default=serializer._to_json,
                  tuple_as_array=False,
                  ensure_ascii=True)

    def uncertainPairs(self):
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct. Useful for
        user labeling.
        '''
        if self.training_data.shape[0] == 0:
            # Bootstrap: seed training with one synthetic exact match and
            # one random pair treated as distinct.
            rand_int = random.randint(0, len(self.data_sample) - 1)
            random_pair = self.data_sample[rand_int]
            exact_match = (random_pair[0], random_pair[0])
            self._addTrainingData({u'match': [exact_match, exact_match],
                                   u'distinct': [random_pair]})

        self._trainClassifier(cv=0)

        # Blend the observed match proportion toward 0.5 when we have few
        # examples, so early uncertainty sampling is not dominated by a
        # lopsided label count.
        bias = len(self.training_pairs[u'match'])
        if bias:
            bias /= (bias + len(self.training_pairs[u'distinct']))

        min_examples = min(len(self.training_pairs[u'match']),
                           len(self.training_pairs[u'distinct']))

        regularizer = 10
        bias = ((0.5 * min_examples + bias * regularizer) /
                (min_examples + regularizer))

        return self.activeLearner.uncertainPairs(self.classifier, bias)

    def markPairs(self, labeled_pairs):
        '''
        Add a labeled pairs of record to dedupes training set and update
        the matching model

        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and `distinct`
                         the values are lists that can contain pairs of
                         records
        '''
        try:
            labeled_pairs.items()
            labeled_pairs[u'match']
            labeled_pairs[u'distinct']
        except:
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        if labeled_pairs[u'match']:
            pair = labeled_pairs[u'match'][0]
            self._checkRecordPairType(pair)

        if labeled_pairs[u'distinct']:
            pair = labeled_pairs[u'distinct'][0]
            self._checkRecordPairType(pair)

        if not labeled_pairs[u'distinct'] and not labeled_pairs[u'match']:
            warnings.warn("Didn't return any labeled record pairs")

        for label, pairs in labeled_pairs.items():
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs)

    def _checkRecordPairType(self, record_pair):
        # Validate that record_pair is a 2-sequence of record dictionaries
        # and that both records satisfy the data model.
        try:
            record_pair[0]
        except:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")
        if len(record_pair) != 2:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")
        try:
            record_pair[0].keys() and record_pair[1].keys()
        except:
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self.data_model.check(record_pair[0])
        self.data_model.check(record_pair[1])

    def _checkDataSample(self, data_sample):
        # Validate that data_sample is a non-empty sequence of record pairs.
        try:
            len(data_sample)
        except TypeError:
            raise ValueError("data_sample must be a sequence")

        if len(data_sample):
            self._checkRecordPairType(data_sample[0])
        else:
            warnings.warn("You submitted an empty data_sample")

    def _addTrainingData(self, labeled_pairs):
        """
        Appends training data to the training data collection.
        """
        for label, examples in labeled_pairs.items():
            n_examples = len(examples)
            labels = [label] * n_examples

            new_data = numpy.empty(n_examples,
                                   dtype=self.training_data.dtype)

            new_data['label'] = labels
            new_data['distances'] = self.data_model.distances(examples)

            self.training_data = numpy.append(self.training_data,
                                              new_data)

    def _loadSample(self, data_sample):
        # Attach a (validated) sample and build the active learner on it.
        self._checkDataSample(data_sample)

        self.data_sample = data_sample

        self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model,
                                                     self.num_cores)

    def _loadSampledRecords(self, data_sample):
        """Override to load blocking data from data_sample."""
class ActiveMatching(Matching):
    # Shared default classifier; subclasses/instances may replace it.
    classifier = rlr.RegularizedLogisticRegression()

    """
    Class for training dedupe extends Matching.

    Public methods:
    - __init__
    - readTraining
    - train
    - writeSettings
    - writeTraining
    - uncertainPairs
    - markPairs
    - cleanupTraining
    """

    def __init__(self, variable_definition, data_sample=None, num_cores=None):
        """
        Initialize from a data model and data sample.

        #### Example usage

            # initialize from a defined set of fields
            fields = [{'field' : 'Site name', 'type': 'String'},
                      {'field' : 'Address', 'type': 'String'},
                      {'field' : 'Zip', 'type': 'String', 'Has Missing':True},
                      {'field' : 'Phone', 'type': 'String', 'Has Missing':True},
                      ]

            data_sample = [
                           (
                            (854, {'city': 'san francisco',
                                   'address': '300 de haro st.',
                                   'name': "sally's cafe & bakery",
                                   'cuisine': 'american'}),
                            (855, {'city': 'san francisco',
                                   'address': '1328 18th st.',
                                   'name': 'san francisco bbq',
                                   'cuisine': 'thai'})
                            )
                           ]

            deduper = dedupe.Dedupe(fields, data_sample)

        #### Additional detail

        A field definition is a list of dictionaries where each dictionary
        describes a variable to use for comparing records.

        For details about variable types, check the documentation.
        <http://dedupe.readthedocs.org>`_

        In the data_sample, each element is a tuple of two records. Each
        record is, in turn, a tuple of the record's key and a record
        dictionary. In the record dictionary the keys are the names of the
        record field and values are the record values.
        """
        self.data_model = DataModel(variable_definition)

        if num_cores is None:
            self.num_cores = multiprocessing.cpu_count()
        else:
            self.num_cores = num_cores

        self.data_sample = data_sample

        if self.data_sample:
            self._checkDataSample(self.data_sample)
            self.activeLearner = training.ActiveLearning(
                self.data_sample,
                self.data_model,
                self.num_cores)
        else:
            self.data_sample = []
            self.activeLearner = None

        # Structured array layout: a label string plus one distance per
        # data-model field for each training pair.
        training_dtype = [('label', 'S8'),
                          ('distances', 'f4', (len(self.data_model), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)
        self.training_pairs = OrderedDict({u'distinct': [],
                                           u'match': []})

        self.blocker = None

    def cleanupTraining(self):  # pragma : no cover
        '''
        Clean up data we used for training. Free up memory.
        '''
        del self.training_data
        del self.training_pairs
        del self.activeLearner
        del self.data_sample

    def readTraining(self, training_file):
        '''
        Read training from previously built training data file object

        Arguments:

        training_file -- file object containing the training data
        '''
        logger.info('reading training from file')

        training_pairs = json.load(training_file,
                                   cls=serializer.dedupe_decoder)

        if not any(training_pairs.values()):
            raise EmptyTrainingException(
                "The training file seems to contain no training examples")

        for (label, examples) in training_pairs.items():
            if examples:
                # Only the first pair of each label is shape-checked.
                self._checkRecordPairType(examples[0])

            examples = core.freezeData(examples)

            training_pairs[label] = examples
            self.training_pairs[label].extend(examples)

        self._addTrainingData(training_pairs)

        self._trainClassifier()

    def train(self, ppc=.1, uncovered_dupes=None, index_predicates=True,
              pud=0.025):  # pragma : no cover
        """Keyword arguments:

        ppc -- Limits the Proportion of Pairs Covered that we allow a
               predicate to cover. If a predicate puts together a
               fraction of possible pairs greater than the ppc, that
               predicate will be removed from consideration.

               As the size of the data increases, the user will
               generally want to reduce ppc.

               ppc should be a value between 0.0 and 1.0

        uncovered_dupes -- The number of true dupe pairs in our
                           training data that we can accept will not
                           be put into any block. If true duplicates
                           are never in the same block, we will never
                           compare them, and may never declare them to
                           be duplicates.

                           However, requiring that we cover every
                           single true dupe pair may mean that we have
                           to use blocks that put together many, many
                           distinct pairs that we'll have to
                           expensively compare as well.

                           uncovered_dupes is deprecated in favor of pud

        pud -- The proportion of true dupe pairs in our training data
               that we can accept will not be put into any block. If
               true duplicates are never in the same block, we will
               never compare them, and may never declare them to be
               duplicates.

               If both pud and uncovered_dupes are set,
               uncovered_dupes will take priority.

               pud should be a float between 0.0 and 1.0

        index_predicates -- Should dedupe consider predicates that
                            rely upon indexing the data. Index
                            predicates can be slower and take
                            substantial memory.

                            Defaults to True.
        """
        if uncovered_dupes is None:
            # Convert the proportion pud into an absolute count of
            # allowed-uncovered labeled matches.
            uncovered_dupes = int(pud * len(self.training_pairs['match']))
        else:
            warnings.warn(
                "The uncovered_dupes argument of the train method will be deprecated in dedupe 1.4, please use the pud argument instead",
                DeprecationWarning)

        self._trainClassifier()
        self._trainBlocker(ppc, uncovered_dupes, index_predicates)

    def _trainClassifier(self):  # pragma : no cover
        # Fit the classifier on the accumulated distance vectors.
        labels = numpy.array(self.training_data['label'] == b'match',
                             dtype='i4')
        examples = self.training_data['distances']

        self.classifier.fit(examples, labels)

    def _trainBlocker(self, ppc, uncovered_dupes,
                      index_predicates):  # pragma : no cover
        # Learn blocking predicates. The labeled distinct pairs are
        # augmented with confident non-duplicates mined from the sample
        # before block training.
        training_pairs = copy.deepcopy(self.training_pairs)

        confident_nonduplicates = training.semiSupervisedNonDuplicates(
            self.data_sample,
            self.data_model,
            self.classifier,
            sample_size=32000)

        training_pairs[u'distinct'].extend(confident_nonduplicates)

        predicate_set = self.data_model.predicates(index_predicates,
                                                   self.canopies)

        self.predicates = dedupe.training.blockTraining(training_pairs,
                                                        predicate_set,
                                                        ppc,
                                                        uncovered_dupes,
                                                        self._linkage_type)

        self.blocker = blocking.Blocker(self.predicates)

    def writeSettings(self, file_obj):  # pragma : no cover
        """
        Write a settings file containing the data model and predicates
        to a file object

        Keyword arguments:

        file_obj -- file object to write settings data into
        """
        pickle.dump(self.data_model, file_obj)
        pickle.dump(self.classifier, file_obj)
        pickle.dump(self.predicates, file_obj)

    def writeTraining(self, file_obj):  # pragma : no cover
        """
        Write to a json file that contains labeled examples

        Keyword arguments:

        file_obj -- file object to write training data to
        """
        json.dump(self.training_pairs,
                  file_obj,
                  default=serializer._to_json,
                  tuple_as_array=False,
                  ensure_ascii=True)

    def uncertainPairs(self):
        '''
        Provides a list of the pairs of records that dedupe is most
        curious to learn if they are matches or distinct. Useful for
        user labeling.
        '''
        if self.training_data.shape[0] == 0:
            # Bootstrap: seed training with one synthetic exact match and
            # one random pair treated as distinct.
            rand_int = random.randint(0, len(self.data_sample) - 1)
            random_pair = self.data_sample[rand_int]
            exact_match = (random_pair[0], random_pair[0])
            self._addTrainingData({
                u'match': [exact_match, exact_match],
                u'distinct': [random_pair]
            })

        self._trainClassifier()

        # Blend the observed match proportion toward 0.5 when we have few
        # examples, so early uncertainty sampling is not dominated by a
        # lopsided label count.
        bias = len(self.training_pairs[u'match'])
        if bias:
            bias /= (bias + len(self.training_pairs[u'distinct']))

        min_examples = min(len(self.training_pairs[u'match']),
                           len(self.training_pairs[u'distinct']))

        regularizer = 10
        bias = ((0.5 * min_examples + bias * regularizer) /
                (min_examples + regularizer))

        return self.activeLearner.uncertainPairs(self.classifier, bias)

    def markPairs(self, labeled_pairs):
        '''
        Add a labeled pairs of record to dedupes training set and update
        the matching model

        Argument :

        labeled_pairs -- A dictionary with two keys, `match` and `distinct`
                         the values are lists that can contain pairs of
                         records
        '''
        try:
            labeled_pairs.items()
            labeled_pairs[u'match']
            labeled_pairs[u'distinct']
        except:
            raise ValueError('labeled_pairs must be a dictionary with keys '
                             '"distinct" and "match"')

        if labeled_pairs[u'match']:
            pair = labeled_pairs[u'match'][0]
            self._checkRecordPairType(pair)

        if labeled_pairs[u'distinct']:
            pair = labeled_pairs[u'distinct'][0]
            self._checkRecordPairType(pair)

        if not labeled_pairs[u'distinct'] and not labeled_pairs[u'match']:
            warnings.warn("Didn't return any labeled record pairs")

        for label, pairs in labeled_pairs.items():
            self.training_pairs[label].extend(core.freezeData(pairs))

        self._addTrainingData(labeled_pairs)

    def _checkRecordPairType(self, record_pair):
        # Validate that record_pair is a 2-sequence of record dictionaries
        # and that both records satisfy the data model.
        try:
            record_pair[0]
        except:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs (ordered sequences of length 2)")
        if len(record_pair) != 2:
            raise ValueError("The elements of data_sample must be pairs "
                             "of record_pairs")
        try:
            record_pair[0].keys() and record_pair[1].keys()
        except:
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self.data_model.check(record_pair[0])
        self.data_model.check(record_pair[1])

    def _checkDataSample(self, data_sample):
        # Validate that data_sample is a non-empty sequence of record pairs.
        try:
            len(data_sample)
        except TypeError:
            raise ValueError("data_sample must be a sequence")

        if len(data_sample):
            self._checkRecordPairType(data_sample[0])
        else:
            warnings.warn("You submitted an empty data_sample")

    def _addTrainingData(self, labeled_pairs):
        """
        Appends training data to the training data collection.
        """
        for label, examples in labeled_pairs.items():
            n_examples = len(examples)
            labels = [label] * n_examples

            new_data = numpy.empty(n_examples,
                                   dtype=self.training_data.dtype)

            new_data['label'] = labels
            new_data['distances'] = self.data_model.distances(examples)

            self.training_data = numpy.append(self.training_data,
                                              new_data)

    def _loadSample(self, data_sample):
        # Attach a (validated) sample and build the active learner on it.
        self._checkDataSample(data_sample)

        self.data_sample = data_sample

        self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model,
                                                     self.num_cores)