Python ElementMultiplication examples: usages of beard.similarity.ElementMultiplication

Exemple #1

0

Afficher le fichier

def test_ElementMultiplication():
    """Check ElementMultiplication against a small hand-computed example."""
    # Three rows of four values each.
    pairs = np.array([
        [1.0, 1.0, 1.0, 2.0],
        [0.5, 1.0, 1.0, 0.5],
        [2.5, 0.2, 10.0, 2.0],
    ])

    # Each expected row is the first two columns of ``pairs`` multiplied
    # element by element with the last two (e.g. 2.5 * 10.0, 0.2 * 2.0).
    expected = np.array([
        [1.0, 2.0],
        [0.5, 0.5],
        [25.0, 0.4],
    ])

    combiner = ElementMultiplication()
    assert_array_almost_equal(combiner.fit_transform(pairs), expected)

Exemple #2

0

Afficher le fichier

Fichier : models.py Projet : lucianovilasboas/inspire-disambiguation

    def fit(self):
        """Fit the pairwise distance estimator on ``self.X`` and ``self.y``.

        A ``FeatureUnion`` of per-field similarity pipelines is chained with
        a random forest classifier; the resulting pipeline is stored on
        ``self.distance_estimator`` and then fitted.

        Reads ``self.X``, ``self.y`` and ``self.ethnicity_estimator.estimator``.
        """

        def pair_feature(element_transformer, combiner):
            """Transform both members of each pair, then combine them."""
            return Pipeline([
                (
                    "pairs",
                    PairTransformer(
                        element_transformer=element_transformer,
                        groupby=group_by_signature,
                    ),
                ),
                ("combiner", combiner),
            ])

        def tfidf_feature(step_name, func, char_ngrams=False):
            """Cosine similarity of TF-IDF vectors of the text from ``func``."""
            tfidf_options = {"dtype": np.float32, "decode_error": "replace"}
            if char_ngrams:
                # Short fields use the "char_wb" analyzer with 2- to 4-grams
                # instead of the vectorizer's default analyzer.
                tfidf_options.update(analyzer="char_wb", ngram_range=(2, 4))
            return pair_feature(
                Pipeline([
                    (step_name, FuncTransformer(func=func)),
                    ("shaper", Shaper(newshape=(-1, ))),
                    ("tf-idf", TfidfVectorizer(**tfidf_options)),
                ]),
                CosineSimilarity(),
            )

        def string_feature(func, **distance_options):
            """String distance between the values extracted by ``func``."""
            return pair_feature(
                FuncTransformer(func=func),
                StringDistance(**distance_options),
            )

        # Unlike the other features, the ethnicity pipeline has an extra
        # "sigmoid" step between the pairing and the combiner.
        ethnicity_feature = Pipeline([
            (
                "pairs",
                PairTransformer(
                    element_transformer=Pipeline([
                        ("name", FuncTransformer(func=get_author_full_name)),
                        ("shaper", Shaper(newshape=(-1, ))),
                        (
                            "classifier",
                            EstimatorTransformer(
                                self.ethnicity_estimator.estimator),
                        ),
                    ]),
                    groupby=group_by_signature,
                ),
            ),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])

        # Feature and step names are parameter keys of the fitted estimator,
        # so they are kept exactly as they were (including the historical
        # "subject_similairty" spelling and its "keywords" inner step name).
        transformer = FeatureUnion([
            (
                "author_full_name_similarity",
                tfidf_feature("full_name", get_author_full_name,
                              char_ngrams=True),
            ),
            (
                "author_second_initial_similarity",
                string_feature(get_second_initial,
                               similarity_function="character_equality"),
            ),
            (
                "author_first_given_name_similarity",
                string_feature(get_first_given_name),
            ),
            (
                "author_second_given_name_similarity",
                string_feature(get_second_given_name),
            ),
            (
                "author_other_names_similarity",
                tfidf_feature("other_names", get_author_other_names,
                              char_ngrams=True),
            ),
            (
                "affiliation_similarity",
                tfidf_feature("affiliation", get_normalized_affiliation,
                              char_ngrams=True),
            ),
            (
                "coauthors_similarity",
                tfidf_feature("coauthors", get_coauthors_neighborhood),
            ),
            (
                "abstract_similarity",
                tfidf_feature("abstract", get_abstract),
            ),
            (
                "keywords_similarity",
                tfidf_feature("keywords", get_keywords),
            ),
            (
                "collaborations_similarity",
                tfidf_feature("collaborations", get_collaborations),
            ),
            (
                "subject_similairty",
                tfidf_feature("keywords", get_topics),
            ),
            (
                "title_similarity",
                tfidf_feature("title", get_title, char_ngrams=True),
            ),
            ("author_ethnicity", ethnicity_feature),
        ])
        classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)

        self.distance_estimator = Pipeline([("transformer", transformer),
                                            ("classifier", classifier)])
        self.distance_estimator.fit(self.X, self.y)

Exemple #3

0

Afficher le fichier

def _pairwise_feature(element_transformer, combiner):
    """Transform both members of each pair, then combine them."""
    return Pipeline([
        ("pairs",
         PairTransformer(element_transformer=element_transformer,
                         groupby=group_by_signature)),
        ("combiner", combiner),
    ])


def _tfidf_cosine_feature(step_name, func, char_ngrams=False):
    """Cosine similarity of TF-IDF vectors of the text from ``func``."""
    tfidf_options = {"dtype": np.float32, "decode_error": "replace"}
    if char_ngrams:
        # Short fields use the "char_wb" analyzer with 2- to 4-grams instead
        # of the vectorizer's default analyzer.
        tfidf_options.update(analyzer="char_wb", ngram_range=(2, 4))
    return _pairwise_feature(
        Pipeline([
            (step_name, FuncTransformer(func=func)),
            ("shaper", Shaper(newshape=(-1, ))),
            ("tf-idf", TfidfVectorizer(**tfidf_options)),
        ]),
        CosineSimilarity())


def _string_distance_feature(func, **distance_options):
    """String distance between the values extracted by ``func``."""
    return _pairwise_feature(FuncTransformer(func=func),
                             StringDistance(**distance_options))


def _year_diff_feature():
    """Absolute difference between the years returned by ``get_year``."""
    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in recent NumPy releases, so the builtin is used directly.
    return Pipeline([("pairs", FuncTransformer(func=get_year, dtype=int)),
                     ("combiner", AbsoluteDifference())])


def _build_distance_estimator(X,
                              y,
                              verbose=0,
                              ethnicity_estimator=None,
                              fast=False):
    """Build a vector representation of a pair of signatures and fit on it.

    Parameters
    ----------
    X, y
        Training data, passed unchanged to ``Pipeline.fit``.
    verbose
        Forwarded to ``GradientBoostingClassifier``.
    ethnicity_estimator
        When not ``None``, it is wrapped in an ``EstimatorTransformer`` and
        added as an extra ``"author_ethnicity"`` feature.
    fast
        When true, only a reduced subset of the features is built.

    Returns
    -------
    The fitted ``Pipeline`` made of the feature union ("transformer") and
    the gradient boosting classifier ("classifier").
    """
    # Feature and step names are parameter keys of the fitted estimator, so
    # they are kept exactly as they were (including the historical
    # "subject_similairty" spelling and its "keywords" inner step name).
    if not fast:
        features = [
            ("author_full_name_similarity",
             _tfidf_cosine_feature("full_name", get_author_full_name,
                                   char_ngrams=True)),
            ("author_second_initial_similarity",
             _string_distance_feature(
                 get_second_initial,
                 similarity_function="character_equality")),
            ("author_first_given_name_similarity",
             _string_distance_feature(get_first_given_name)),
            ("author_second_given_name_similarity",
             _string_distance_feature(get_second_given_name)),
            ("author_other_names_similarity",
             _tfidf_cosine_feature("other_names", get_author_other_names,
                                   char_ngrams=True)),
            ("affiliation_similarity",
             _tfidf_cosine_feature("affiliation", get_author_affiliation,
                                   char_ngrams=True)),
            ("coauthors_similarity",
             _tfidf_cosine_feature("coauthors", get_coauthors_from_range)),
            ("title_similarity",
             _tfidf_cosine_feature("title", get_title, char_ngrams=True)),
            ("journal_similarity",
             _tfidf_cosine_feature("journal", get_journal,
                                   char_ngrams=True)),
            ("abstract_similarity",
             _tfidf_cosine_feature("abstract", get_abstract)),
            ("keywords_similarity",
             _tfidf_cosine_feature("keywords", get_keywords)),
            ("collaborations_similarity",
             _tfidf_cosine_feature("collaborations", get_collaborations)),
            ("subject_similairty",
             _tfidf_cosine_feature("keywords", get_topics)),
            ("year_diff", _year_diff_feature()),
        ]
    else:
        features = [
            ("author_full_name_similarity",
             _tfidf_cosine_feature("full_name", get_author_full_name,
                                   char_ngrams=True)),
            ("author_other_names_similarity",
             _tfidf_cosine_feature("other_names", get_author_other_names,
                                   char_ngrams=True)),
            ("affiliation_similarity",
             _tfidf_cosine_feature("affiliation", get_author_affiliation,
                                   char_ngrams=True)),
            ("coauthors_similarity",
             _tfidf_cosine_feature("coauthors", get_coauthors_from_range)),
            ("title_similarity",
             _tfidf_cosine_feature("title", get_title, char_ngrams=True)),
            ("year_diff", _year_diff_feature()),
        ]

    if ethnicity_estimator is not None:
        # Unlike the other features, this pipeline has an extra "sigmoid"
        # step between the pairing and the combiner.
        features.append(
            ("author_ethnicity",
             Pipeline([
                 ("pairs",
                  PairTransformer(element_transformer=Pipeline([
                      ("name", FuncTransformer(func=get_author_full_name)),
                      ("shaper", Shaper(newshape=(-1, ))),
                      ("classifier",
                       EstimatorTransformer(ethnicity_estimator)),
                  ]),
                                  groupby=group_by_signature)),
                 ("sigmoid", FuncTransformer(func=expit)),
                 ("combiner", ElementMultiplication()),
             ])))

    transformer = FeatureUnion(features)

    # Train a classifier on these vectors.
    # NOTE(review): max_features=10 is larger than the six features built in
    # fast mode without an ethnicity estimator -- confirm the installed
    # scikit-learn version accepts that combination.
    classifier = GradientBoostingClassifier(n_estimators=500,
                                            max_depth=9,
                                            max_features=10,
                                            learning_rate=0.125,
                                            verbose=verbose)

    # Return the whole fitted pipeline.
    estimator = Pipeline([("transformer", transformer),
                          ("classifier", classifier)]).fit(X, y)

    return estimator

Exemple #4

0

Afficher le fichier

    def fit(self):
        transformer = FeatureUnion([
            ('author_full_name_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('full_name',
                           FuncTransformer(func=get_author_full_name)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('author_second_initial_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=FuncTransformer(
                          func=get_second_initial),
                      groupby=group_by_signature,
                  )),
                 ('combiner',
                  StringDistance(similarity_function='character_equality')),
             ])),
            ('author_first_given_name_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(element_transformer=FuncTransformer(
                      func=get_first_given_name),
                                  groupby=group_by_signature)),
                 ('combiner', StringDistance()),
             ])),
            ('author_second_given_name_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=FuncTransformer(
                          func=get_second_given_name),
                      groupby=group_by_signature,
                  )),
                 ('combiner', StringDistance()),
             ])),
            ('author_other_names_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('other_names',
                           FuncTransformer(func=get_author_other_names)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('affiliation_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('affiliation',
                           FuncTransformer(func=get_author_affiliation)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('coauthors_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('coauthors',
                           FuncTransformer(func=get_coauthors_neighborhood)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('abstract_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('abstract', FuncTransformer(func=get_abstract)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('keywords_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('keywords', FuncTransformer(func=get_keywords)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('collaborations_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('collaborations',
                           FuncTransformer(func=get_collaborations)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('subject_similairty',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('keywords', FuncTransformer(func=get_topics)),
                          ('shaper', Shaper(newshape=(-1))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('title_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('title', FuncTransformer(func=get_title)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('author_ethnicity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('name', FuncTransformer(func=get_author_full_name)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('classifier',
                           EstimatorTransformer(
                               self.ethnicity_estimator.estimator)),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('sigmoid', FuncTransformer(func=expit)),
                 ('combiner', ElementMultiplication()),
             ])),
        ])
        classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)

        self.distance_estimator = Pipeline([('transformer', transformer),
                                            ('classifier', classifier)])
        self.distance_estimator.fit(self.X, self.y)

Exemple #5

0

Afficher le fichier

def _tfidf_similarity_feature(name, step, func, char_ngrams=False):
    """Return a ``(name, Pipeline)`` tf-idf cosine-similarity feature.

    Each element of a pair is passed through ``func``, flattened by
    ``Shaper`` and vectorized with tf-idf; the two resulting vectors are
    then combined with ``CosineSimilarity``.

    Parameters
    ----------
    name : str
        Name of the feature inside the enclosing ``FeatureUnion``.
    step : str
        Name of the accessor step inside the element pipeline.
    func : callable
        Accessor handed to ``FuncTransformer``.
    char_ngrams : bool, optional
        When true, tf-idf is computed over ``char_wb`` n-grams of length
        2 to 4 instead of ``TfidfVectorizer``'s default analyzer.
    """
    tfidf_params = {"dtype": np.float32, "decode_error": "replace"}
    if char_ngrams:
        tfidf_params.update(analyzer="char_wb", ngram_range=(2, 4))

    element_transformer = Pipeline([
        (step, FuncTransformer(func=func)),
        ("shaper", Shaper(newshape=(-1,))),
        ("tf-idf", TfidfVectorizer(**tfidf_params)),
    ])
    pairs = PairTransformer(element_transformer=element_transformer,
                            groupby=group_by_signature)
    return (name, Pipeline([
        ("pairs", pairs),
        ("combiner", CosineSimilarity()),
    ]))


def _build_distance_estimator(X, y, verbose=0, ethnicity_estimator=None):
    """Build and fit the estimator used on pairs of signatures.

    A ``FeatureUnion`` turns each input sample into a vector of
    similarity features, and a ``GradientBoostingClassifier`` is trained
    on those vectors.

    Parameters
    ----------
    X, y
        Training data and targets, forwarded unchanged to
        ``Pipeline.fit``.
    verbose : int, optional
        Forwarded to ``GradientBoostingClassifier(verbose=...)``.
    ethnicity_estimator : estimator or None, optional
        When given, an extra ``author_ethnicity`` feature is appended; it
        wraps this estimator in ``EstimatorTransformer`` and applies it
        to author full names.

    Returns
    -------
    The value returned by ``Pipeline.fit`` for the
    transformer + classifier pipeline.
    """
    # NOTE: the following features were previously experimented with and
    # are currently disabled: author_full_name_similarity,
    # author_second_initial_similarity, mesh_similarity (tf-idf over MeSH
    # terms), keywords_similarity and year_diff.
    features = [
        ("mesh_word2vec", Pipeline([
            ("pairs", FuncTransformer(func=get_mesh_word2vec)),
            ("combiner", MyCosineSimilarity()),
        ])),
        _tfidf_similarity_feature("affiliation_similarity", "affiliation",
                                  get_author_affiliation, char_ngrams=True),
        _tfidf_similarity_feature("title_similarity", "title",
                                  get_title, char_ngrams=True),
        _tfidf_similarity_feature("journal_similarity", "journal",
                                  get_journal, char_ngrams=True),
        _tfidf_similarity_feature("abstract_similarity", "abstract",
                                  get_abstract),
    ]

    if ethnicity_estimator is not None:
        # Collect the optional feature before the FeatureUnion is created
        # so the union is built once from its final list instead of being
        # mutated after construction.
        name_classifier = Pipeline([
            ("name", FuncTransformer(func=get_author_full_name)),
            ("shaper", Shaper(newshape=(-1,))),
            ("classifier", EstimatorTransformer(ethnicity_estimator)),
        ])
        features.append(("author_ethnicity", Pipeline([
            ("pairs", PairTransformer(element_transformer=name_classifier,
                                      groupby=group_by_signature)),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])))

    transformer = FeatureUnion(features)

    # Train a classifier on these vectors
    classifier = GradientBoostingClassifier(n_estimators=2000,
                                            max_depth=9,
                                            max_features=5,
                                            learning_rate=0.125,
                                            verbose=verbose)

    # Fit and return the whole pipeline
    return Pipeline([("transformer", transformer),
                     ("classifier", classifier)]).fit(X, y)