def fit(self):
        """Build the pairwise distance estimator and fit it on the stored data.

        A ``FeatureUnion`` of per-attribute similarity pipelines is chained
        with a ``RandomForestClassifier``. The resulting pipeline is stored in
        ``self.distance_estimator`` and fitted on ``self.X`` and ``self.y``.
        The ethnicity feature wraps ``self.ethnicity_estimator.estimator``.
        """

        def tfidf_cosine(step_name, getter, char_ngrams=False):
            """Return a ``PairTransformer`` + ``CosineSimilarity`` pipeline.

            Each element goes through ``getter`` (registered under
            ``step_name``), a ``Shaper`` and a ``TfidfVectorizer``. With
            ``char_ngrams`` the vectorizer uses ``analyzer="char_wb"`` and
            ``ngram_range=(2, 4)``; otherwise it keeps its default analyzer.
            """
            vectorizer_options = {
                "dtype": np.float32,
                "decode_error": "replace",
            }
            if char_ngrams:
                vectorizer_options["analyzer"] = "char_wb"
                vectorizer_options["ngram_range"] = (2, 4)
            extractor = Pipeline([
                (step_name, FuncTransformer(func=getter)),
                ("shaper", Shaper(newshape=(-1, ))),
                ("tf-idf", TfidfVectorizer(**vectorizer_options)),
            ])
            return Pipeline([
                (
                    "pairs",
                    PairTransformer(
                        element_transformer=extractor,
                        groupby=group_by_signature,
                    ),
                ),
                ("combiner", CosineSimilarity()),
            ])

        def name_distance(getter, **distance_options):
            """Return a ``PairTransformer`` + ``StringDistance`` pipeline.

            ``distance_options`` are forwarded unchanged to
            ``StringDistance``.
            """
            return Pipeline([
                (
                    "pairs",
                    PairTransformer(
                        element_transformer=FuncTransformer(func=getter),
                        groupby=group_by_signature,
                    ),
                ),
                ("combiner", StringDistance(**distance_options)),
            ])

        # Class scores of the ethnicity estimator for each author name,
        # passed through expit and combined with ElementMultiplication.
        ethnicity_feature = Pipeline([
            (
                "pairs",
                PairTransformer(
                    element_transformer=Pipeline([
                        ("name", FuncTransformer(func=get_author_full_name)),
                        ("shaper", Shaper(newshape=(-1, ))),
                        (
                            "classifier",
                            EstimatorTransformer(
                                self.ethnicity_estimator.estimator),
                        ),
                    ]),
                    groupby=group_by_signature,
                ),
            ),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])

        # The order of the entries fixes the column order of the feature
        # matrix handed to the classifier, so it must not be changed lightly.
        transformer = FeatureUnion([
            (
                "author_full_name_similarity",
                tfidf_cosine("full_name", get_author_full_name,
                             char_ngrams=True),
            ),
            (
                "author_second_initial_similarity",
                name_distance(get_second_initial,
                              similarity_function="character_equality"),
            ),
            (
                "author_first_given_name_similarity",
                name_distance(get_first_given_name),
            ),
            (
                "author_second_given_name_similarity",
                name_distance(get_second_given_name),
            ),
            (
                "author_other_names_similarity",
                tfidf_cosine("other_names", get_author_other_names,
                             char_ngrams=True),
            ),
            (
                "affiliation_similarity",
                tfidf_cosine("affiliation", get_normalized_affiliation,
                             char_ngrams=True),
            ),
            (
                "coauthors_similarity",
                tfidf_cosine("coauthors", get_coauthors_neighborhood),
            ),
            (
                "abstract_similarity",
                tfidf_cosine("abstract", get_abstract),
            ),
            (
                "keywords_similarity",
                tfidf_cosine("keywords", get_keywords),
            ),
            (
                "collaborations_similarity",
                tfidf_cosine("collaborations", get_collaborations),
            ),
            (
                # NOTE: the misspelled name and the "keywords" inner step
                # name are kept verbatim; step names are runtime strings.
                "subject_similairty",
                tfidf_cosine("keywords", get_topics),
            ),
            (
                "title_similarity",
                tfidf_cosine("title", get_title, char_ngrams=True),
            ),
            ("author_ethnicity", ethnicity_feature),
        ])
        classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)

        self.distance_estimator = Pipeline([("transformer", transformer),
                                            ("classifier", classifier)])
        self.distance_estimator.fit(self.X, self.y)
Ejemplo n.º 2
0
def _tfidf_similarity(step_name, getter, char_ngrams=False):
    """Return a ``PairTransformer`` + ``CosineSimilarity`` feature pipeline.

    Each element goes through ``getter`` (registered under ``step_name``), a
    ``Shaper`` and a ``TfidfVectorizer``. With ``char_ngrams`` the vectorizer
    uses ``analyzer="char_wb"`` and ``ngram_range=(2, 4)``; otherwise it keeps
    its default analyzer.
    """
    vectorizer_options = {"dtype": np.float32, "decode_error": "replace"}
    if char_ngrams:
        vectorizer_options["analyzer"] = "char_wb"
        vectorizer_options["ngram_range"] = (2, 4)
    extractor = Pipeline([
        (step_name, FuncTransformer(func=getter)),
        ("shaper", Shaper(newshape=(-1, ))),
        ("tf-idf", TfidfVectorizer(**vectorizer_options)),
    ])
    return Pipeline([
        ("pairs", PairTransformer(element_transformer=extractor,
                                  groupby=group_by_signature)),
        ("combiner", CosineSimilarity()),
    ])


def _name_distance(getter, **distance_options):
    """Return a ``PairTransformer`` + ``StringDistance`` feature pipeline.

    ``distance_options`` are forwarded unchanged to ``StringDistance``.
    """
    return Pipeline([
        ("pairs", PairTransformer(
            element_transformer=FuncTransformer(func=getter),
            groupby=group_by_signature)),
        ("combiner", StringDistance(**distance_options)),
    ])


def _ethnicity_feature(ethnicity_estimator):
    """Return the feature pipeline wrapping ``ethnicity_estimator``.

    The estimator is applied to author full names through
    ``EstimatorTransformer``; the result goes through ``expit`` and is
    combined with ``ElementMultiplication``.
    """
    return Pipeline([
        ("pairs", PairTransformer(
            element_transformer=Pipeline([
                ("name", FuncTransformer(func=get_author_full_name)),
                ("shaper", Shaper(newshape=(-1, ))),
                ("classifier", EstimatorTransformer(ethnicity_estimator)),
            ]),
            groupby=group_by_signature)),
        ("sigmoid", FuncTransformer(func=expit)),
        ("combiner", ElementMultiplication()),
    ])


def _build_distance_estimator(X,
                              y,
                              verbose=0,
                              ethnicity_estimator=None,
                              fast=False):
    """Build and fit the estimator used on pairs of signatures.

    A ``FeatureUnion`` turns each pair into a feature vector and a
    ``GradientBoostingClassifier`` is trained on these vectors.

    Args:
        X: Training input, passed as-is to ``Pipeline.fit``.
        y: Training targets, passed as-is to ``Pipeline.fit``.
        verbose: Verbosity forwarded to the classifier.
        ethnicity_estimator: Optional estimator; when not ``None`` an
            ``"author_ethnicity"`` feature wrapping it is appended.
        fast: When true, only a reduced set of features is used (full name,
            other names, affiliation, coauthors, title and year difference).

    Returns:
        The fitted ``Pipeline`` with steps ``"transformer"`` and
        ``"classifier"``.
    """
    # The order of the entries fixes the column order of the feature matrix,
    # so the full-only features are spliced in at their original positions.
    features = [
        ("author_full_name_similarity",
         _tfidf_similarity("full_name", get_author_full_name,
                           char_ngrams=True)),
    ]
    if not fast:
        features += [
            ("author_second_initial_similarity",
             _name_distance(get_second_initial,
                            similarity_function="character_equality")),
            ("author_first_given_name_similarity",
             _name_distance(get_first_given_name)),
            ("author_second_given_name_similarity",
             _name_distance(get_second_given_name)),
        ]
    features += [
        ("author_other_names_similarity",
         _tfidf_similarity("other_names", get_author_other_names,
                           char_ngrams=True)),
        ("affiliation_similarity",
         _tfidf_similarity("affiliation", get_author_affiliation,
                           char_ngrams=True)),
        ("coauthors_similarity",
         _tfidf_similarity("coauthors", get_coauthors_from_range)),
        ("title_similarity",
         _tfidf_similarity("title", get_title, char_ngrams=True)),
    ]
    if not fast:
        features += [
            ("journal_similarity",
             _tfidf_similarity("journal", get_journal, char_ngrams=True)),
            ("abstract_similarity",
             _tfidf_similarity("abstract", get_abstract)),
            ("keywords_similarity",
             _tfidf_similarity("keywords", get_keywords)),
            ("collaborations_similarity",
             _tfidf_similarity("collaborations", get_collaborations)),
            # NOTE: the misspelled name and the "keywords" inner step name
            # are kept verbatim; step names are runtime strings.
            ("subject_similairty",
             _tfidf_similarity("keywords", get_topics)),
        ]
    features.append(
        ("year_diff",
         Pipeline([
             # The builtin int replaces np.int, which was a plain alias of it
             # and has been removed from NumPy (AttributeError since 1.24).
             ("pairs", FuncTransformer(func=get_year, dtype=int)),
             ("combiner", AbsoluteDifference()),
         ])))

    if ethnicity_estimator is not None:
        features.append(
            ("author_ethnicity", _ethnicity_feature(ethnicity_estimator)))

    transformer = FeatureUnion(features)

    # Train a classifier on these vectors.
    # NOTE(review): max_features=10 is larger than the number of feature
    # pipelines in fast mode (6, or 7 with the ethnicity feature); confirm the
    # feature matrix has at least 10 columns there, or that the installed
    # scikit-learn accepts a larger value.
    # A RandomForestClassifier(n_estimators=500, verbose=verbose, n_jobs=8)
    # was previously left here, commented out, as an alternative.
    classifier = GradientBoostingClassifier(n_estimators=500,
                                            max_depth=9,
                                            max_features=10,
                                            learning_rate=0.125,
                                            verbose=verbose)

    # Return the whole fitted pipeline.
    estimator = Pipeline([("transformer", transformer),
                          ("classifier", classifier)]).fit(X, y)

    return estimator
Ejemplo n.º 3
0
    def fit(self):
        transformer = FeatureUnion([
            ('author_full_name_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('full_name',
                           FuncTransformer(func=get_author_full_name)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('author_second_initial_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=FuncTransformer(
                          func=get_second_initial),
                      groupby=group_by_signature,
                  )),
                 ('combiner',
                  StringDistance(similarity_function='character_equality')),
             ])),
            ('author_first_given_name_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(element_transformer=FuncTransformer(
                      func=get_first_given_name),
                                  groupby=group_by_signature)),
                 ('combiner', StringDistance()),
             ])),
            ('author_second_given_name_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=FuncTransformer(
                          func=get_second_given_name),
                      groupby=group_by_signature,
                  )),
                 ('combiner', StringDistance()),
             ])),
            ('author_other_names_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('other_names',
                           FuncTransformer(func=get_author_other_names)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('affiliation_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('affiliation',
                           FuncTransformer(func=get_author_affiliation)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('coauthors_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('coauthors',
                           FuncTransformer(func=get_coauthors_neighborhood)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('abstract_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('abstract', FuncTransformer(func=get_abstract)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('keywords_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('keywords', FuncTransformer(func=get_keywords)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('collaborations_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('collaborations',
                           FuncTransformer(func=get_collaborations)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('subject_similairty',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('keywords', FuncTransformer(func=get_topics)),
                          ('shaper', Shaper(newshape=(-1))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('title_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('title', FuncTransformer(func=get_title)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('author_ethnicity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('name', FuncTransformer(func=get_author_full_name)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('classifier',
                           EstimatorTransformer(
                               self.ethnicity_estimator.estimator)),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('sigmoid', FuncTransformer(func=expit)),
                 ('combiner', ElementMultiplication()),
             ])),
        ])
        classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)

        self.distance_estimator = Pipeline([('transformer', transformer),
                                            ('classifier', classifier)])
        self.distance_estimator.fit(self.X, self.y)
Ejemplo n.º 4
0
def train(records, use_categories=True):
    """Train a classifier on the given arXiv records.

    The records are turned into TF-IDF features (title/abstract text and,
    optionally, categories), L2-normalised, and used to tune a ``LinearSVC``
    by 3-fold cross-validated grid search over 20 values of ``C`` in
    ``[0.2, 0.5]``, scored on accuracy.

    :param records:
        Records are expected as a list of dictionaries with
        the following fields required: "title", "abstract", "categories"
        and "decision". The decision field should be either "CORE", "Non-CORE"
        or "Rejected".

        Example:
            records = [{u'decision': "CORE",
                        u'title': u'Effects of top compositeness',
                        u'abstract': u'We investigate the effects of (...)',
                        u'categories': [u'cond-mat.mes-hall',
                                        u'cond-mat.mtrl-sci']},
                        {...}, ...]

    :param use_categories:
        Whether the "categories" is used to build the classifier.

    :return: the trained pipeline, made of the transformer fitted on
        ``records`` followed by the best estimator found by the grid search.
    """
    # The builtin ``object`` is used instead of the ``np.object`` alias, which
    # was deprecated in NumPy 1.20 and removed in NumPy 1.24.
    # Records are stored as a single column so each row holds one dict.
    records = np.array(records, dtype=object).reshape((-1, 1))

    # Text features shared by both configurations.
    title_abstract_steps = [
        ("getter", FuncTransformer(func=_get_title_abstract)),
        ("shape", Shaper(newshape=(-1, ))),
        ("tfidf", TfidfVectorizer(min_df=3,
                                  max_df=0.1,
                                  norm="l2",
                                  ngram_range=(1, 1),
                                  stop_words="english",
                                  strip_accents="unicode",
                                  dtype=np.float32,
                                  decode_error="replace")),
    ]

    if use_categories:
        categories_steps = [
            ("getter", FuncTransformer(func=_get_categories)),
            ("shape", Shaper(newshape=(-1, ))),
            ("tfidf", TfidfVectorizer(norm="l2",
                                      dtype=np.float32,
                                      decode_error="replace")),
        ]
        transformer = Pipeline([
            ("features", FeatureUnion([
                ("title_abstract", Pipeline(title_abstract_steps)),
                ("categories", Pipeline(categories_steps)),
            ])),
            ("scaling", Normalizer()),
        ])
    else:
        transformer = Pipeline(
            title_abstract_steps + [("scaling", Normalizer())])

    X = transformer.fit_transform(records)
    # Each row of ``records`` is a one-element array wrapping the dict.
    y = np.array([r[0]["decision"] for r in records])

    grid = GridSearchCV(
        LinearSVC(),
        param_grid={"C": np.linspace(start=0.2, stop=0.5, num=20)},
        scoring="accuracy",
        cv=3,
        verbose=3)
    grid.fit(X, y)

    return Pipeline([("transformer", transformer),
                     ("classifier", grid.best_estimator_)])
Ejemplo n.º 5
0
def build_distance_estimator(X, y):
    """Fit and return the estimator used to compare pairs of signatures.

    A ``FeatureUnion`` builds a vector representation of a pair of
    signatures: one cosine similarity per textual attribute, plus the
    absolute difference of the publication years. A gradient boosting
    classifier is then trained on these vectors.

    :param X: training input, passed unchanged to ``Pipeline.fit``
        (presumably pairs of signatures -- confirm against the caller).
    :param y: training targets, passed unchanged to ``Pipeline.fit``.

    :return: the fitted pipeline (feature transformer + classifier).
    """

    def char_ngram_tfidf():
        # TF-IDF over character 2- to 4-grams within word boundaries.
        return TfidfVectorizer(analyzer="char_wb",
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error="replace")

    def word_tfidf():
        # TF-IDF with the vectorizer's default analyzer.
        return TfidfVectorizer(dtype=np.float32,
                               decode_error="replace")

    def cosine_feature(getter_name, getter, vectorizer,
                       vectorizer_name="tf-idf"):
        # Extract a field from each element of a pair, vectorize it, and
        # combine both sides with a cosine similarity.
        return Pipeline([
            ("pairs", PairTransformer(
                element_transformer=Pipeline([
                    (getter_name, FuncTransformer(func=getter)),
                    ("shaper", Shaper(newshape=(-1, ))),
                    (vectorizer_name, vectorizer),
                ]),
                groupby=group_by_signature)),
            ("combiner", CosineSimilarity()),
        ])

    # Build a vector representation of a pair of signatures
    transformer = FeatureUnion([
        ("author_full_name_similarity",
         cosine_feature("full_name", get_author_full_name,
                        char_ngram_tfidf())),
        ("author_other_names_similarity",
         cosine_feature("other_names", get_author_other_names,
                        char_ngram_tfidf())),
        ("author_initials_similarity",
         cosine_feature("initials", get_author_initials,
                        CountVectorizer(analyzer="char_wb",
                                        ngram_range=(1, 1),
                                        binary=True,
                                        decode_error="replace"),
                        vectorizer_name="count")),
        ("affiliation_similarity",
         cosine_feature("affiliation", get_author_affiliation,
                        char_ngram_tfidf())),
        ("coauthors_similarity",
         cosine_feature("coauthors", get_coauthors, word_tfidf())),
        ("title_similarity",
         cosine_feature("title", get_title, char_ngram_tfidf())),
        ("journal_similarity",
         cosine_feature("journal", get_journal, char_ngram_tfidf())),
        ("abstract_similarity",
         cosine_feature("abstract", get_abstract, word_tfidf())),
        ("keywords_similarity",
         cosine_feature("keywords", get_keywords, word_tfidf())),
        ("collaborations_similarity",
         cosine_feature("collaborations", get_collaborations,
                        word_tfidf())),
        ("references_similarity",
         cosine_feature("references", get_references, word_tfidf())),
        ("year_diff",
         Pipeline([
             # The builtin ``int`` is what the ``np.int`` alias (removed
             # in NumPy 1.24) stood for.
             ("pairs", FuncTransformer(func=get_year, dtype=int)),
             # FIXME: when one is missing
             ("combiner", AbsoluteDifference()),
         ])),
    ])

    # Train a classifier on these vectors
    classifier = GradientBoostingClassifier(n_estimators=500,
                                            max_depth=9,
                                            max_features=10,
                                            learning_rate=0.125,
                                            verbose=3)

    # Return the whole pipeline
    estimator = Pipeline([("transformer", transformer),
                          ("classifier", classifier)]).fit(X, y)

    return estimator
Ejemplo n.º 6
0
def _build_distance_estimator(X, y, verbose=0, ethnicity_estimator=None):
    """Fit and return the estimator used to compare pairs of signatures.

    Each pair is described by a MeSH word2vec similarity and by the cosine
    similarity of the TF-IDF vectors of the affiliation, title, journal and
    abstract. When ``ethnicity_estimator`` is given, an extra
    ``author_ethnicity`` feature derived from the author full names is
    appended. A gradient boosting classifier is trained on these features.

    :param X: training input, passed unchanged to ``Pipeline.fit``.
    :param y: training targets, passed unchanged to ``Pipeline.fit``.
    :param verbose: verbosity forwarded to the gradient boosting classifier.
    :param ethnicity_estimator: optional estimator wrapped in an
        ``EstimatorTransformer`` to build the ``author_ethnicity`` feature.

    :return: the fitted pipeline (feature transformer + classifier).
    """

    def tfidf_cosine(getter_name, getter, char_ngrams):
        # Extract one field per signature, vectorize it with TF-IDF and
        # compare both sides of the pair with a cosine similarity.
        if char_ngrams:
            vectorizer = TfidfVectorizer(analyzer="char_wb",
                                         ngram_range=(2, 4),
                                         dtype=np.float32,
                                         decode_error="replace")
        else:
            vectorizer = TfidfVectorizer(dtype=np.float32,
                                         decode_error="replace")

        per_signature = Pipeline([
            (getter_name, FuncTransformer(func=getter)),
            ("shaper", Shaper(newshape=(-1,))),
            ("tf-idf", vectorizer),
        ])
        return Pipeline([
            ("pairs", PairTransformer(element_transformer=per_signature,
                                      groupby=group_by_signature)),
            ("combiner", CosineSimilarity()),
        ])

    # NOTE: the author full name, second initial, MeSH terms TF-IDF,
    # keywords and year difference features were experimented with and are
    # currently disabled.
    features = [
        ("mesh_word2vec", Pipeline([
            ("pairs", FuncTransformer(func=get_mesh_word2vec)),
            ("combiner", MyCosineSimilarity()),
        ])),
        ("affiliation_similarity",
         tfidf_cosine("affiliation", get_author_affiliation, True)),
        ("title_similarity", tfidf_cosine("title", get_title, True)),
        ("journal_similarity", tfidf_cosine("journal", get_journal, True)),
        ("abstract_similarity",
         tfidf_cosine("abstract", get_abstract, False)),
    ]

    if ethnicity_estimator is not None:
        name_classifier = Pipeline([
            ("name", FuncTransformer(func=get_author_full_name)),
            ("shaper", Shaper(newshape=(-1,))),
            ("classifier", EstimatorTransformer(ethnicity_estimator)),
        ])
        features.append(("author_ethnicity", Pipeline([
            ("pairs", PairTransformer(element_transformer=name_classifier,
                                      groupby=group_by_signature)),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])))

    # Chain the pair features with the classifier and fit the whole thing.
    pipeline = Pipeline([
        ("transformer", FeatureUnion(features)),
        ("classifier", GradientBoostingClassifier(n_estimators=2000,
                                                  max_depth=9,
                                                  max_features=5,
                                                  learning_rate=0.125,
                                                  verbose=verbose)),
    ])
    return pipeline.fit(X, y)