def test_ElementMultiplication():
    """Test for ElementMultiplication.

    Each expected row holds the products of input columns 0 and 1 with
    input columns 2 and 3 respectively.
    """
    pairs = np.array([
        [1.0, 1.0, 1.0, 2.0],
        [0.5, 1.0, 1.0, 0.5],
        [2.5, 0.2, 10.0, 2.0],
    ])
    expected = np.array([
        [1.0, 2.0],
        [0.5, 0.5],
        [25.0, 0.4],
    ])
    assert_array_almost_equal(
        ElementMultiplication().fit_transform(pairs), expected)
def fit(self):
    """Fit data using the estimator.

    Assembles a ``FeatureUnion`` of pairwise features, chains it with a
    ``RandomForestClassifier`` and fits the resulting ``Pipeline`` on
    ``self.X`` and ``self.y``.  The fitted pipeline is stored in
    ``self.distance_estimator``; nothing is returned.

    The ``author_ethnicity`` feature reads
    ``self.ethnicity_estimator.estimator``, so that attribute must be set
    before this method is called.
    """

    def on_pairs(element_transformer):
        """Wrap ``element_transformer`` in the shared PairTransformer."""
        return PairTransformer(
            element_transformer=element_transformer,
            groupby=group_by_signature,
        )

    def tfidf_similarity(step_name, getter, char_ngrams=False):
        """Build the extract -> reshape -> TF-IDF -> CosineSimilarity chain.

        ``char_ngrams`` selects the ``char_wb`` analyzer with 2- to
        4-grams; otherwise TfidfVectorizer keeps its default analyzer.
        """
        tfidf_options = {"dtype": np.float32, "decode_error": "replace"}
        if char_ngrams:
            tfidf_options.update(analyzer="char_wb", ngram_range=(2, 4))
        elements = Pipeline([
            (step_name, FuncTransformer(func=getter)),
            # Always a 1-tuple: one step previously passed the bare int
            # ``(-1)``, unlike every sibling step.
            ("shaper", Shaper(newshape=(-1,))),
            ("tf-idf", TfidfVectorizer(**tfidf_options)),
        ])
        return Pipeline([
            ("pairs", on_pairs(elements)),
            ("combiner", CosineSimilarity()),
        ])

    def string_similarity(getter, **distance_options):
        """Build the extract -> StringDistance chain."""
        return Pipeline([
            ("pairs", on_pairs(FuncTransformer(func=getter))),
            ("combiner", StringDistance(**distance_options)),
        ])

    transformer = FeatureUnion([
        ("author_full_name_similarity",
         tfidf_similarity("full_name", get_author_full_name,
                          char_ngrams=True)),
        ("author_second_initial_similarity",
         string_similarity(get_second_initial,
                           similarity_function="character_equality")),
        ("author_first_given_name_similarity",
         string_similarity(get_first_given_name)),
        ("author_second_given_name_similarity",
         string_similarity(get_second_given_name)),
        ("author_other_names_similarity",
         tfidf_similarity("other_names", get_author_other_names,
                          char_ngrams=True)),
        ("affiliation_similarity",
         tfidf_similarity("affiliation", get_normalized_affiliation,
                          char_ngrams=True)),
        ("coauthors_similarity",
         tfidf_similarity("coauthors", get_coauthors_neighborhood)),
        ("abstract_similarity",
         tfidf_similarity("abstract", get_abstract)),
        ("keywords_similarity",
         tfidf_similarity("keywords", get_keywords)),
        ("collaborations_similarity",
         tfidf_similarity("collaborations", get_collaborations)),
        # The misspelled step name is a runtime identifier and is kept
        # verbatim; its inner step is named "keywords" as before.
        ("subject_similairty",
         tfidf_similarity("keywords", get_topics)),
        ("title_similarity",
         tfidf_similarity("title", get_title, char_ngrams=True)),
        ("author_ethnicity", Pipeline([
            ("pairs", on_pairs(Pipeline([
                ("name", FuncTransformer(func=get_author_full_name)),
                ("shaper", Shaper(newshape=(-1,))),
                ("classifier", EstimatorTransformer(
                    self.ethnicity_estimator.estimator)),
            ]))),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])),
    ])

    classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)
    self.distance_estimator = Pipeline([
        ("transformer", transformer),
        ("classifier", classifier),
    ])
    self.distance_estimator.fit(self.X, self.y)
def _build_distance_estimator(X, y, verbose=0, ethnicity_estimator=None,
                              fast=False):
    """Build a vector representation of a pair of signatures.

    Parameters
    ----------
    X, y
        Training data, handed unchanged to ``Pipeline.fit``.
    verbose : int
        Forwarded to ``GradientBoostingClassifier``.
    ethnicity_estimator
        When not ``None``, an ``author_ethnicity`` feature wrapping this
        estimator is added after all other features.
    fast : bool
        When true, only the features listed in ``fast_features`` below
        are used.

    Returns
    -------
    The fitted ``Pipeline`` made of the ``"transformer"`` feature union
    followed by the ``"classifier"``.
    """

    def on_pairs(element_transformer):
        """Wrap ``element_transformer`` in the shared PairTransformer."""
        return PairTransformer(element_transformer=element_transformer,
                               groupby=group_by_signature)

    def tfidf_similarity(step_name, getter, char_ngrams=False):
        """Build the extract -> reshape -> TF-IDF -> CosineSimilarity chain.

        ``char_ngrams`` selects the ``char_wb`` analyzer with 2- to
        4-grams; otherwise TfidfVectorizer keeps its default analyzer.
        """
        tfidf_options = {"dtype": np.float32, "decode_error": "replace"}
        if char_ngrams:
            tfidf_options.update(analyzer="char_wb", ngram_range=(2, 4))
        elements = Pipeline([
            (step_name, FuncTransformer(func=getter)),
            # Always a 1-tuple: one step previously passed the bare int
            # ``(-1)``, unlike every sibling step.
            ("shaper", Shaper(newshape=(-1,))),
            ("tf-idf", TfidfVectorizer(**tfidf_options)),
        ])
        return Pipeline([("pairs", on_pairs(elements)),
                         ("combiner", CosineSimilarity())])

    def string_similarity(getter, **distance_options):
        """Build the extract -> StringDistance chain."""
        return Pipeline([
            ("pairs", on_pairs(FuncTransformer(func=getter))),
            ("combiner", StringDistance(**distance_options)),
        ])

    # Full feature set, in the order the classifier sees the columns.
    features = [
        ("author_full_name_similarity",
         tfidf_similarity("full_name", get_author_full_name,
                          char_ngrams=True)),
        ("author_second_initial_similarity",
         string_similarity(get_second_initial,
                           similarity_function="character_equality")),
        ("author_first_given_name_similarity",
         string_similarity(get_first_given_name)),
        ("author_second_given_name_similarity",
         string_similarity(get_second_given_name)),
        ("author_other_names_similarity",
         tfidf_similarity("other_names", get_author_other_names,
                          char_ngrams=True)),
        ("affiliation_similarity",
         tfidf_similarity("affiliation", get_author_affiliation,
                          char_ngrams=True)),
        ("coauthors_similarity",
         tfidf_similarity("coauthors", get_coauthors_from_range)),
        ("title_similarity",
         tfidf_similarity("title", get_title, char_ngrams=True)),
        ("journal_similarity",
         tfidf_similarity("journal", get_journal, char_ngrams=True)),
        ("abstract_similarity",
         tfidf_similarity("abstract", get_abstract)),
        ("keywords_similarity",
         tfidf_similarity("keywords", get_keywords)),
        ("collaborations_similarity",
         tfidf_similarity("collaborations", get_collaborations)),
        # The misspelled step name is a runtime identifier and is kept
        # verbatim; its inner step is named "keywords" as before.
        ("subject_similairty",
         tfidf_similarity("keywords", get_topics)),
        ("year_diff", Pipeline([
            # Builtin int: the ``np.int`` alias used here before was
            # removed in NumPy 1.24 and raises AttributeError there.
            ("pairs", FuncTransformer(func=get_year, dtype=int)),
            ("combiner", AbsoluteDifference()),
        ])),
    ]

    if fast:
        # The fast variant is an order-preserving subset of the full list.
        fast_features = {
            "author_full_name_similarity",
            "author_other_names_similarity",
            "affiliation_similarity",
            "coauthors_similarity",
            "title_similarity",
            "year_diff",
        }
        features = [(name, step) for name, step in features
                    if name in fast_features]

    if ethnicity_estimator is not None:
        features.append(("author_ethnicity", Pipeline([
            ("pairs", on_pairs(Pipeline([
                ("name", FuncTransformer(func=get_author_full_name)),
                ("shaper", Shaper(newshape=(-1,))),
                ("classifier", EstimatorTransformer(ethnicity_estimator)),
            ]))),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])))

    transformer = FeatureUnion(features)

    # Train a classifier on these vectors.
    # NOTE(review): max_features=10 may exceed the number of columns
    # produced in fast mode -- confirm against the feature widths.
    classifier = GradientBoostingClassifier(n_estimators=500,
                                            max_depth=9,
                                            max_features=10,
                                            learning_rate=0.125,
                                            verbose=verbose)

    # Return the whole fitted pipeline.
    return Pipeline([("transformer", transformer),
                     ("classifier", classifier)]).fit(X, y)
def fit(self):
    """Build the pairwise distance pipeline and fit it.

    A ``FeatureUnion`` of pairwise features is chained with a
    ``RandomForestClassifier`` into a ``Pipeline``, stored as
    ``self.distance_estimator`` and fitted on ``self.X`` and ``self.y``.
    Nothing is returned.

    The ``author_ethnicity`` feature reads
    ``self.ethnicity_estimator.estimator``, so that attribute must be set
    before this method is called.
    """
    # Extra TfidfVectorizer options used by the character n-gram features.
    char_ngrams = {'analyzer': 'char_wb', 'ngram_range': (2, 4)}

    def pairwise(element_transformer):
        """Wrap ``element_transformer`` in the shared PairTransformer."""
        return PairTransformer(
            element_transformer=element_transformer,
            groupby=group_by_signature,
        )

    def cosine_of_tfidf(step_name, func, **vectorizer_options):
        """Build the extract -> reshape -> TF-IDF -> CosineSimilarity chain.

        ``vectorizer_options`` are passed to ``TfidfVectorizer`` on top of
        the ``dtype`` and ``decode_error`` settings shared by all features.
        """
        vectorizer = TfidfVectorizer(
            dtype=np.float32,
            decode_error='replace',
            **vectorizer_options
        )
        return Pipeline([
            ('pairs', pairwise(Pipeline([
                (step_name, FuncTransformer(func=func)),
                # Always a 1-tuple: one step previously passed the bare
                # int ``(-1)``, unlike every sibling step.
                ('shaper', Shaper(newshape=(-1,))),
                ('tf-idf', vectorizer),
            ]))),
            ('combiner', CosineSimilarity()),
        ])

    def string_distance(func, **distance_options):
        """Build the extract -> StringDistance chain."""
        return Pipeline([
            ('pairs', pairwise(FuncTransformer(func=func))),
            ('combiner', StringDistance(**distance_options)),
        ])

    transformer = FeatureUnion([
        ('author_full_name_similarity',
         cosine_of_tfidf('full_name', get_author_full_name,
                         **char_ngrams)),
        ('author_second_initial_similarity',
         string_distance(get_second_initial,
                         similarity_function='character_equality')),
        ('author_first_given_name_similarity',
         string_distance(get_first_given_name)),
        ('author_second_given_name_similarity',
         string_distance(get_second_given_name)),
        ('author_other_names_similarity',
         cosine_of_tfidf('other_names', get_author_other_names,
                         **char_ngrams)),
        ('affiliation_similarity',
         cosine_of_tfidf('affiliation', get_author_affiliation,
                         **char_ngrams)),
        ('coauthors_similarity',
         cosine_of_tfidf('coauthors', get_coauthors_neighborhood)),
        ('abstract_similarity',
         cosine_of_tfidf('abstract', get_abstract)),
        ('keywords_similarity',
         cosine_of_tfidf('keywords', get_keywords)),
        ('collaborations_similarity',
         cosine_of_tfidf('collaborations', get_collaborations)),
        # The misspelled step name is a runtime identifier and is kept
        # verbatim; its inner step is named 'keywords' as before.
        ('subject_similairty',
         cosine_of_tfidf('keywords', get_topics)),
        ('title_similarity',
         cosine_of_tfidf('title', get_title, **char_ngrams)),
        ('author_ethnicity', Pipeline([
            ('pairs', pairwise(Pipeline([
                ('name', FuncTransformer(func=get_author_full_name)),
                ('shaper', Shaper(newshape=(-1,))),
                ('classifier', EstimatorTransformer(
                    self.ethnicity_estimator.estimator)),
            ]))),
            ('sigmoid', FuncTransformer(func=expit)),
            ('combiner', ElementMultiplication()),
        ])),
    ])

    classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)
    self.distance_estimator = Pipeline([
        ('transformer', transformer),
        ('classifier', classifier),
    ])
    self.distance_estimator.fit(self.X, self.y)
def _build_distance_estimator(X, y, verbose=0, ethnicity_estimator=None):
    """Build a vector representation of a pair of signatures.

    A ``FeatureUnion`` of pairwise features is chained with a
    ``GradientBoostingClassifier`` and the resulting ``Pipeline`` is
    fitted on ``X`` and ``y`` and returned.  ``verbose`` is forwarded to
    the classifier.  When ``ethnicity_estimator`` is not ``None`` an
    ``author_ethnicity`` feature wrapping it is appended to the union.
    """

    def grouped_pairs(element_transformer):
        """Wrap ``element_transformer`` in the shared PairTransformer."""
        return PairTransformer(element_transformer=element_transformer,
                               groupby=group_by_signature)

    def tfidf_cosine(step_name, getter, **tfidf_options):
        """Build the extract -> reshape -> TF-IDF -> CosineSimilarity chain.

        ``tfidf_options`` are passed to ``TfidfVectorizer`` on top of the
        ``dtype`` and ``decode_error`` settings shared by all features.
        """
        per_element = Pipeline([
            (step_name, FuncTransformer(func=getter)),
            ("shaper", Shaper(newshape=(-1,))),
            ("tf-idf", TfidfVectorizer(dtype=np.float32,
                                       decode_error="replace",
                                       **tfidf_options)),
        ])
        return Pipeline([("pairs", grouped_pairs(per_element)),
                         ("combiner", CosineSimilarity())])

    # Extra TfidfVectorizer options for the character n-gram features.
    char_ngrams = {"analyzer": "char_wb", "ngram_range": (2, 4)}

    # Features disabled in this configuration (previously kept here as
    # commented-out code): author_full_name_similarity,
    # author_second_initial_similarity, mesh_similarity,
    # keywords_similarity and year_diff.
    mesh_word2vec = Pipeline([
        ("pairs", FuncTransformer(func=get_mesh_word2vec)),
        ("combiner", MyCosineSimilarity()),
    ])
    transformer = FeatureUnion([
        ("mesh_word2vec", mesh_word2vec),
        ("affiliation_similarity",
         tfidf_cosine("affiliation", get_author_affiliation,
                      **char_ngrams)),
        ("title_similarity",
         tfidf_cosine("title", get_title, **char_ngrams)),
        ("journal_similarity",
         tfidf_cosine("journal", get_journal, **char_ngrams)),
        ("abstract_similarity",
         tfidf_cosine("abstract", get_abstract)),
    ])

    if ethnicity_estimator is not None:
        ethnicity = Pipeline([
            ("pairs", grouped_pairs(Pipeline([
                ("name", FuncTransformer(func=get_author_full_name)),
                ("shaper", Shaper(newshape=(-1,))),
                ("classifier", EstimatorTransformer(ethnicity_estimator)),
            ]))),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])
        transformer.transformer_list.append(("author_ethnicity", ethnicity))

    # Train a classifier on these vectors.
    classifier = GradientBoostingClassifier(n_estimators=2000,
                                            max_depth=9,
                                            max_features=5,
                                            learning_rate=0.125,
                                            verbose=verbose)

    # Return the whole fitted pipeline.
    pipeline = Pipeline([("transformer", transformer),
                         ("classifier", classifier)])
    return pipeline.fit(X, y)