Python TfidfPredicate Examples

Programming Language: Python

Namespace/Package Name: dedupe.tfidf

Method/Function: TfidfPredicate

Examples at hotexamples.com: 3

Python TfidfPredicate - 3 examples found. These are the top rated real world Python examples of dedupe.tfidf.TfidfPredicate extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: api.py Project: jtbates/dedupe

    def _blockerTypes(self) : # pragma : no cover
        """Build the mapping from field type to its candidate predicates.

        Returns a dict with the single key 'String'. Its value is a tuple
        holding the plain string predicates followed by one
        tfidf.TfidfPredicate for each of the thresholds 0.2, 0.4, 0.6
        and 0.8.
        """
        simple_blockers = (
            predicates.wholeFieldPredicate,
            predicates.tokenFieldPredicate,
            predicates.commonIntegerPredicate,
            predicates.sameThreeCharStartPredicate,
            predicates.sameFiveCharStartPredicate,
            predicates.sameSevenCharStartPredicate,
            predicates.nearIntegersPredicate,
            predicates.commonFourGram,
            predicates.commonSixGram,
        )

        # One TF-IDF predicate object per similarity cutoff.
        tfidf_blockers = tuple(tfidf.TfidfPredicate(cutoff)
                               for cutoff in (0.2, 0.4, 0.6, 0.8))

        blocker_types = {'String': simple_blockers + tfidf_blockers}
        return blocker_types

Example #2

Show file

def _initializeTraining(training_pairs, fields, predicate_functions,
                        tfidf_thresholds, df_index):
    """Prepare the training pairs and the predicate set built from them.

    Copies the pairs at training_pairs[1] (treated as duplicates) and
    training_pairs[0] (treated as distinct), crosses every predicate
    function and every TF-IDF predicate with every field, and wraps the
    combined list with disjunctivePredicates.

    Returns a 4-tuple (training_dupes, training_distinct, predicate_set,
    overlap). overlap is the result of canopyOverlap when at least one
    (TF-IDF predicate, field) pair exists; otherwise it is a defaultdict
    that yields None for every key.
    """
    # Slice copies so the returned sequences are new objects rather than
    # the ones held inside training_pairs.
    dupes = training_pairs[1][:]
    distinct = training_pairs[0][:]

    # Every (predicate, field) combination for the plain predicates.
    simple_pairs = list(product(predicate_functions, fields))

    # Same cross product, with one TfidfPredicate per threshold.
    tfidf_objects = [tfidf.TfidfPredicate(cutoff)
                     for cutoff in tfidf_thresholds]
    tfidf_pairs = list(product(tfidf_objects, fields))

    predicate_set = disjunctivePredicates(simple_pairs + tfidf_pairs)

    if not tfidf_pairs:
        # No TF-IDF predicates: lookups on the overlap simply give None.
        overlap = defaultdict(lambda: None)
    else:
        overlap = canopyOverlap(predicate_set, dupes + distinct, df_index)

    return (dupes, distinct, predicate_set, overlap)

Example #3

Show file

File: api.py Project: TinaCloud/dedupe-gaptor

    def __init__(self, init=None):
        """
        Load or initialize a data model.

        #### Example usage

            # initialize from a settings file
            deduper = dedupe.Dedupe('my_learned_settings')

        or

            # initialize from a defined set of fields
            fields = {'Site name': {'type': 'String'},
                      'Address':   {'type': 'String'},
                      'Zip':       {'type': 'String', 'Has Missing':True},
                      'Phone':     {'type': 'String', 'Has Missing':True},
                      }

            deduper = dedupe.Dedupe(fields)


        #### Keyword arguments

        `init`
        A field definition or a file location for a settings file.

        #### Raises

        `ValueError`
        If `init` is missing or empty, or if it is neither a dictionary
        nor a string.

        #### Additional detail
        A field definition is a dictionary where the keys are the fields
        that will be used for training a model and the values are the
        field specification

        Field types include

        - String

        A 'String' type field must have as its key a name of a field
        as it appears in the data dictionary and a type declaration
        ex. `{'Phone': {'type': 'String'}}`

        Longer example of a field definition:


            fields = {'name':       {'type': 'String'},
                      'address':    {'type': 'String'},
                      'city':       {'type': 'String'},
                      'cuisine':    {'type': 'String'}
                      }


        Settings files are typically generated by saving the settings
        learned in a previous session. If you need details for this
        file see the method [`writeSettings`][[api.py#writesettings]].
        """

        # isinstance rather than an exact class comparison, so dict and
        # str subclasses (e.g. collections.OrderedDict) are dispatched
        # like their base types instead of being rejected.
        if isinstance(init, dict) and init:
            self.data_model = _initializeDataModel(init)
            # Nothing has been learned yet for a fresh field definition.
            self.predicates = None
        elif isinstance(init, str) and init:
            (self.data_model, self.predicates) = self._readSettings(init)
        elif init:
            # Truthy, but neither a field definition nor a file location.
            raise ValueError('Incorrect Input Type: must supply either a '
                             'field definition or a settings file.')
        else:
            # None, or an empty dict / empty string.
            raise ValueError('No Input: must supply either a field '
                             'definition or a settings file.')

        self.training_data = None
        self.training_pairs = None
        self.data_sample = None
        self.dupes = None
        self.training_encoder = training_serializer._to_json
        self.training_decoder = training_serializer.dedupe_decoder

        string_predicates = (predicates.wholeFieldPredicate,
                             predicates.tokenFieldPredicate,
                             predicates.commonIntegerPredicate,
                             predicates.sameThreeCharStartPredicate,
                             predicates.sameFiveCharStartPredicate,
                             predicates.sameSevenCharStartPredicate,
                             predicates.nearIntegersPredicate,
                             predicates.commonFourGram,
                             predicates.commonSixGram)

        # One TF-IDF predicate per similarity threshold.
        tfidf_string_predicates = tuple(
            tfidf.TfidfPredicate(threshold)
            for threshold in (0.2, 0.4, 0.6, 0.8)
        )

        # Predicates available for each supported field type.
        self.blocker_types = {
            'String': (string_predicates + tfidf_string_predicates)
        }