Ejemplo n.º 1
0
def test_feature_union_parallel():
    """Check that FeatureUnion gives the same output with and without n_jobs."""
    docs = JUNK_FOOD_DOCS

    def make_union(**union_kwargs):
        # Fresh vectorizers on every call so no fitted state is shared
        # between the unions under comparison.
        steps = [
            ("words", CountVectorizer(analyzer="word")),
            ("chars", CountVectorizer(analyzer="char")),
        ]
        return FeatureUnion(steps, **union_kwargs)

    serial = make_union()
    parallel = make_union(n_jobs=2)
    parallel_ft = make_union(n_jobs=2)

    # Serial baseline: one output row per input document.
    serial.fit(docs)
    baseline = serial.transform(docs)
    assert_equal(baseline.shape[0], len(docs))

    # Parallel fit followed by transform must match the baseline exactly.
    parallel.fit(docs)
    from_parallel = parallel.transform(docs)
    assert_equal(baseline.shape, from_parallel.shape)
    assert_array_equal(baseline.toarray(), from_parallel.toarray())

    # fit_transform should behave the same
    from_fit_transform = parallel_ft.fit_transform(docs)
    assert_array_equal(baseline.toarray(), from_fit_transform.toarray())

    # transformers should stay fit after fit_transform
    from_refit = parallel_ft.transform(docs)
    assert_array_equal(baseline.toarray(), from_refit.toarray())
Ejemplo n.º 2
0
def test_set_feature_union_steps():
    """FeatureUnion must honour steps replaced by attribute or set_params."""

    def named_mult(factor, feature_name):
        # Mult instance that also reports a single feature name.
        step = Mult(factor)
        step.get_feature_names = lambda: [feature_name]
        return step

    mult2 = named_mult(2, "x2")
    mult3 = named_mult(3, "x3")
    mult5 = named_mult(5, "x5")

    ft = FeatureUnion([("m2", mult2), ("m3", mult3)])

    def check(expected_values, expected_names):
        # Transform a fresh 1x1 input and compare values, then names.
        assert_array_equal(expected_values, ft.transform(np.asarray([[1]])))
        assert_equal(expected_names, ft.get_feature_names())

    check([[2, 3]], ["m2__x2", "m3__x3"])

    # Directly setting attr
    ft.transformer_list = [("m5", mult5)]
    check([[5]], ["m5__x5"])

    # Using set_params
    ft.set_params(transformer_list=[("mock", mult3)])
    check([[3]], ["mock__x3"])

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    check([[5]], ["mock__x5"])
Ejemplo n.º 3
0
 def test_feature_union(self):
     """Smoke-test a FeatureUnion built from several featurizer modules."""
     module_names = ["bag-of-words", "entities"]
     transformers, _unused = modules_to_dictionary(module_names)

     # Fit on the shared fixtures, then make sure unseen text transforms
     # without raising.
     union = FeatureUnion(transformers)
     union.fit(texts_entities, outcomes)
     union.transform(["unknown"])
Ejemplo n.º 4
0
def test_set_feature_union_steps():
    """Replacing FeatureUnion steps must change both output and names."""
    probe = lambda: np.asarray([[1]])  # fresh 1x1 input for every transform

    steps = {}
    for factor, label in ((2, 'x2'), (3, 'x3'), (5, 'x5')):
        step = Mult(factor)
        # Bind the label as a default so each lambda keeps its own name.
        step.get_feature_names = lambda label=label: [label]
        steps[factor] = step

    ft = FeatureUnion([('m2', steps[2]), ('m3', steps[3])])
    assert_array_equal([[2, 3]], ft.transform(probe()))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    # Directly setting attr
    ft.transformer_list = [('m5', steps[5])]
    assert_array_equal([[5]], ft.transform(probe()))
    assert_equal(['m5__x5'], ft.get_feature_names())

    # Using set_params
    ft.set_params(transformer_list=[('mock', steps[3])])
    assert_array_equal([[3]], ft.transform(probe()))
    assert_equal(['mock__x3'], ft.get_feature_names())

    # Using set_params to replace single step
    ft.set_params(mock=steps[5])
    assert_array_equal([[5]], ft.transform(probe()))
    assert_equal(['mock__x5'], ft.get_feature_names())
Ejemplo n.º 5
0
def pca(x, y, test_x, n_features=-1):
    """Combine PCA components with univariately selected features.

    A FeatureUnion of ``PCA(n_components=n_features)`` and
    ``SelectKBest(k=n_features // 2)`` is fitted on ``(x, y)`` and then
    applied to both ``x`` and ``test_x``.

    Parameters
    ----------
    x : training feature matrix; ``x.shape[1]`` is read when ``n_features``
        is left at its default.
    y : training targets, passed to the union's ``fit``.
    test_x : feature matrix transformed with the union fitted on ``x``.
    n_features : number of PCA components. ``-1`` (default) means
        ``ceil(sqrt(number of columns of x))``.

    Returns
    -------
    tuple
        ``(transformed_x, transformed_test_x)``.
    """
    if n_features == -1:
        n_features = int(np.ceil(np.sqrt(x.shape[1])))

    reducer = PCA(n_components=n_features)
    # Floor division keeps ``k`` an integer on Python 3 too; ``/`` would hand
    # SelectKBest a float there. On Python 2 ints the result is unchanged.
    selection = SelectKBest(k=n_features // 2)

    combined_features = FeatureUnion([("pca", reducer), ("univ_select", selection)])
    combined_features.fit(x, y)

    return combined_features.transform(x), combined_features.transform(test_x)
def prediction(train_df, test_df, MODEL):
    """Grid-search the model named ``MODEL`` and write a submission CSV.

    Steps, in order:

    1. Fit a FeatureUnion built from ``features.feature_list`` on
       ``train_df`` and use the ``"Sales"`` column as the target.
    2. Run ``GridSearchCV`` over ``clf_dict[MODEL]`` (scored with ``rmspe``)
       and print the best score.
    3. If the best estimator exposes ``coef_`` and/or
       ``feature_importances_``, dump them next to the feature names under
       ``SUBMISSION``.
    4. Transform ``test_df``, predict, and write
       ``SUBMISSION + "submission_<MODEL>.csv"`` indexed by ``test_df["Id"]``.

    Parameters
    ----------
    train_df : DataFrame with at least a ``"Sales"`` column.
    test_df : DataFrame with at least an ``"Id"`` column.
    MODEL : key into ``clf_dict``; also used in the output file names.

    Returns
    -------
    None. Results are written to disk.
    """
    # print(...) with a single argument prints the same on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print("... start prediction")

    fu_obj = FeatureUnion(transformer_list=features.feature_list)

    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()

    # NOTE: "paramteters" is the key actually read from clf_dict (defined
    # elsewhere); keep the spelling in sync with that dict.
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, verbose=1)
    clf.fit(train_X, train_y)
    print(clf.best_score_)

    # Feature names, used as the index of the coefficient/importance dumps.
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_,
                            name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s.csv" % MODEL
        coef_df.to_csv(coeffile)

    print("... start y_pred")
    test_X = fu_obj.transform(test_df)

    y_pred = clf.predict(test_X)
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
Ejemplo n.º 7
0
def test_feature_union_weights():
    """Transformer weights must scale only the named transformer's columns."""
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    # Path 1: fit followed by transform.
    union = FeatureUnion([("pca", pca), ("select", select)],
                         transformer_weights={"pca": 10})
    union.fit(X, y)
    via_fit_then_transform = union.transform(X)

    # Path 2: fit_transform in one call.
    union = FeatureUnion([("pca", pca), ("select", select)],
                         transformer_weights={"pca": 10})
    via_fit_transform = union.fit_transform(X, y)

    # Path 3: a leading transformer that lacks fit_transform.
    union = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                         transformer_weights={"mock": 10})
    with_mock = union.fit_transform(X, y)

    # Both weighted outputs must equal 10x the PCA block followed by the
    # untouched selected column. The shared pca object is refitted here;
    # random_state=0 controls its random stream.
    for weighted in (via_fit_then_transform, via_fit_transform):
        assert_array_almost_equal(weighted[:, :-1], 10 * pca.fit_transform(X))
        assert_array_equal(weighted[:, -1],
                           select.fit_transform(X, y).ravel())

    assert_equal(with_mock.shape, (X.shape[0], 7))
Ejemplo n.º 8
0
def test_feature_union():
    """Basic sanity checks for FeatureUnion on dense and sparse iris data."""
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)  # centre in place, as the data array is reused below
    y = iris.target
    n_rows = X.shape[0]

    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    union = FeatureUnion([("svd", svd), ("select", select)])
    union.fit(X, y)
    dense_out = union.transform(X)
    assert_equal(dense_out.shape, (n_rows, 3))

    # The first columns are the SVD output, the last one the selected feature.
    assert_array_almost_equal(dense_out[:, :-1], svd.fit_transform(X))
    assert_array_equal(dense_out[:, -1],
                       select.fit_transform(X, y).ravel())

    # Sparse input must give the same numbers; random_state=0 on svd keeps
    # the refit reproducible.
    union = FeatureUnion([("svd", svd), ("select", select)])
    sparse_out = union.fit_transform(sparse.csr_matrix(X), y)
    assert_array_almost_equal(dense_out, sparse_out.toarray())

    # Nested parameters can be changed through set_params.
    union.set_params(select__k=2)
    assert_equal(union.fit_transform(X, y).shape, (n_rows, 4))

    # A transformer without fit_transform must still work inside the union.
    union = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    with_mock = union.fit_transform(X, y)
    assert_equal(with_mock.shape, (n_rows, 8))
Ejemplo n.º 9
0
def test_feature_stacker():
    """Basic sanity checks for the feature stacker (FeatureUnion)."""
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)  # centre in place
    y = iris.target
    n_rows = X.shape[0]

    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    stacker = FeatureUnion([("pca", pca), ("select", select)])

    stacker.fit(X, y)
    stacked = stacker.transform(X)
    assert_equal(stacked.shape, (n_rows, 3))

    # Leading columns come from the PCA, the trailing one from SelectKBest.
    assert_array_almost_equal(stacked[:, :-1], pca.fit_transform(X))
    assert_array_equal(stacked[:, -1],
                       select.fit_transform(X, y).ravel())

    # Sparse input must reproduce the dense result.
    stacked_sparse = stacker.fit_transform(sparse.csr_matrix(X), y)
    assert_array_almost_equal(stacked, stacked_sparse.toarray())

    # Nested parameters are reachable through set_params.
    stacker.set_params(select__k=2)
    assert_equal(stacker.fit_transform(X, y).shape, (n_rows, 4))
    def test_same_result(self):
        """Local and Spark feature unions must agree on names and output."""
        X, Z = self.make_text_rdd(2)

        char_options = dict(analyzer="char_wb", ngram_range=(3, 3))
        loc_char = CountVectorizer(**char_options)
        dist_char = SparkCountVectorizer(**char_options)

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        # The same vectorizer instances are shared by the serial and the
        # parallel unions built below.
        local_steps = [("chars", loc_char), ("words", loc_word)]
        spark_steps = [("chars", dist_char), ("words", dist_word)]

        def assert_same_matrix(local_result, distributed_result):
            # Stack the collected blocks of the RDD and compare densely.
            stacked = sp.vstack(distributed_result.collect())
            assert_array_equal(local_result.toarray(), stacked.toarray())

        loc_union = FeatureUnion(list(local_steps))
        dist_union = SparkFeatureUnion(list(spark_steps))

        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names()
        )

        # test same results
        assert_same_matrix(loc_union.transform(X), dist_union.transform(Z))

        # test same results with fit_transform
        assert_same_matrix(loc_union.fit_transform(X),
                           dist_union.fit_transform(Z))

        # test same results in parallel
        loc_union_par = FeatureUnion(list(local_steps), n_jobs=2)
        dist_union_par = SparkFeatureUnion(list(spark_steps), n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        assert_same_matrix(loc_union_par.transform(X),
                           dist_union_par.transform(Z))
Ejemplo n.º 11
0
class q5_feature_UNION(BaseEstimator, RegressorMixin, TransformerMixin):
    """Average the outputs of three sub-models into one scalar.

    The sub-models (``q2_mlm_KNN``, ``q3_mlm_RIDGE``, ``q4_mlm_RIDGE``) are
    wrapped in a FeatureUnion stored on ``self.q5_feature_UNION``.

    NOTE(review): this class defines no ``fit``; the union is only ever
    transformed here, so the sub-models presumably need no fitting or are
    fitted elsewhere -- confirm.
    """

    def __init__(self):
        self.q5_feature_UNION = FeatureUnion([('q2_mlm_KNN', q2_mlm_KNN()), ('q3_mlm_RIDGE', q3_mlm_RIDGE()), ('q4_mlm_RIDGE', q4_mlm_RIDGE())])

    def transform(self, X):
        """Return the mean of every value the union produces for ``X``.

        The average is taken over the whole union output (no axis), so the
        result is a single Python scalar regardless of how many rows ``X``
        has.
        """
        model_union = self.q5_feature_UNION.transform(X)
        # np.asscalar is deprecated and has been removed from current NumPy;
        # .item() is the equivalent conversion to a Python scalar.
        prediction = np.average(model_union).item()
        return prediction
Ejemplo n.º 12
0
 def fit_logreg(self):
     """Build and train a logistic-regression estimator on word-vector features.

     A FeatureUnion of feature pipelines is assembled (all but one are
     currently commented out), the training documents are transformed with
     it, and a ``LogisticRegression`` is fitted on the result using
     ``self.train_labels()``.

     Returns:
         A ``(name, estimator)`` tuple. ``name`` is ``'logreg(...)'`` with the
         comma-joined names of the active union entries; ``estimator`` is a
         Pipeline of the feature union followed by the fitted classifier.

     NOTE(review): ``features.transform`` is called without ``features.fit``
     in this method -- presumably every active transformer is stateless or
     already fitted; confirm before enabling other entries.
     """
     # Shared tokenisation front-end, wrapped in CachedFitTransform with
     # self.memory.
     tokenize_sense = CachedFitTransform(Pipeline([
         ('tokenize', Map(compose(tokenize, normalize_special, unescape))),
         ('normalize', MapTokens(normalize_elongations)),
     ]), self.memory)
     features = FeatureUnion([
         # ('w2v_doc', ToCorporas(Pipeline([
         #     ('tokenize', MapCorporas(tokenize_sense)),
         #     ('feature', MergeSliceCorporas(Doc2VecTransform(CachedFitTransform(Doc2Vec(
         #         dm=0, dbow_words=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20,
         #         workers=16
         #     ), self.memory)))),
         # ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs]))),
         # ('w2v_word_avg', Pipeline([
         #     ('tokenize', tokenize_sense),
         #     ('feature', Word2VecAverage(CachedFitTransform(Word2Vec(
         #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
         #     ), self.memory))),
         # ]).fit(self.unsup_docs[:10**6])),
         # ('w2v_word_avg_google', Pipeline([
         #     ('tokenize', tokenize_sense),
         #     ('feature', Word2VecAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
         # ])),
         # ('w2v_word_norm_avg', Pipeline([
         #     ('tokenize', tokenize_sense),
         #     ('feature', Word2VecNormAverage(CachedFitTransform(Word2Vec(
         #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
         #     ), self.memory))),
         # ]).fit(self.unsup_docs[:10**6])),
         # The only active feature set: the object loaded from the pickle
         # path below, wrapped in Word2VecNormAverage.
         ('w2v_word_norm_avg_google', Pipeline([
             ('tokenize', tokenize_sense),
             ('feature', Word2VecNormAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
         ])),
         # ('w2v_word_max', Pipeline([
         #     ('tokenize', tokenize_sense),
         #     ('feature', Word2VecMax(CachedFitTransform(Word2Vec(
         #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
         #     ), self.memory))),
         # ]).fit(self.unsup_docs[:10**6])),
         # ('w2v_word_max_google', Pipeline([
         #     ('tokenize', tokenize_sense),
         #     ('feature', Word2VecMax(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
         # ])),
         # ('w2v_word_inv', ToCorporas(Pipeline([
         #     ('tokenize', MapCorporas(tokenize_sense)),
         #     ('feature', MergeSliceCorporas(Word2VecInverse(CachedFitTransform(Word2Vec(
         #         sg=1, size=100, window=10, hs=0, negative=5, sample=0, min_count=1, iter=20, workers=16
         #     ), self.memory)))),
         # ]).fit([self.train_docs, self.unsup_docs[:10**5], self.val_docs, self.test_docs]))),
     ])
     classifier = LogisticRegression()
     # The classifier is fitted inside temp_log_level, which is given INFO
     # for the 'gensim.models.word2vec' logger.
     with temp_log_level({'gensim.models.word2vec': logging.INFO}):
         classifier.fit(features.transform(self.train_docs), self.train_labels())
     # Bundle features and the already-fitted classifier so callers get a
     # single estimator.
     estimator = Pipeline([('features', features), ('classifier', classifier)])
     return 'logreg({})'.format(','.join(name for name, _ in features.transformer_list)), estimator
def prediction(train_df, test_df, MODEL):
    """Train ``MODEL`` on log-transformed sales and write a submission CSV.

    Steps, in order:

    1. Keep only training rows with ``Open == 1`` and ``Sales > 0``.
    2. Fit a FeatureUnion built from ``features.feature_list`` and use
       ``log1p(Sales)`` as the target.
    3. Drop training rows containing NaN, then dump the train and test
       feature matrices to ``train_dump.csv`` / ``test_dump.csv`` under
       ``SUBMISSION``.
    4. Grid-search ``clf_dict[MODEL]`` (scored with ``rmspe``). For
       ``MODEL == "XGB"`` a 5% validation split is held out and passed as
       ``eval_set`` together with early stopping.
    5. Dump ``coef_`` / ``feature_importances_`` when the best estimator has
       them.
    6. Predict on the test features, undo the log transform with ``expm1``
       and write ``submission_<MODEL>.csv`` indexed by ``test_df["Id"]``.

    Parameters
    ----------
    train_df : DataFrame with at least ``"Open"`` and ``"Sales"`` columns.
    test_df : DataFrame with at least an ``"Id"`` column.
    MODEL : key into ``clf_dict``; also used in the output file names.

    Returns
    -------
    None. Results are written to disk.
    """
    # print(...) with one argument prints the same on Python 2 and 3; the
    # bare print statement is a SyntaxError on Python 3.
    print("... start prediction")
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    train_df = train_df[(train_df["Open"] == 1) & (train_df["Sales"] > 0)]
    train_X = fu_obj.fit_transform(train_df)
    train_y = np.log1p(train_df["Sales"]).as_matrix()

    # Round-trip through a DataFrame so rows with NaN features can be dropped
    # together with their target.
    train_dump_df = pd.DataFrame(train_X, columns=get_split_feature_list(fu_obj))
    train_dump_df["target"] = train_y
    train_dump_df = train_dump_df.dropna(axis=0)
    print(train_dump_df.shape)
    train_X = train_dump_df[get_split_feature_list(fu_obj)].values
    train_y = train_dump_df["target"].values
    train_dump_df["ID"] = -1
    train_dump_df.to_csv(SUBMISSION + "train_dump.csv", index=False)

    test_X = fu_obj.transform(test_df)
    test_dump_df = pd.DataFrame(test_X, columns=get_split_feature_list(fu_obj))
    # Per-column count of zero entries. The extra parentheses matter: on
    # Python 3, ``print (df == 0).sum(...)`` would call .sum on print's None.
    print((test_dump_df == 0).sum(axis=0))
    test_dump_df["ID"] = test_df["Id"]
    test_dump_df.to_csv(SUBMISSION + "test_dump.csv", index=False)

    # NOTE: "paramteters" is the key actually read from clf_dict (defined
    # elsewhere); keep the spelling in sync with that dict.
    if MODEL == "XGB":
        train_X, valid_X, train_y, valid_y = train_test_split(
            train_X, train_y, test_size=0.05)
        fit_param = {"eval_set": [(train_X, train_y), (valid_X, valid_y)],
                     "eval_metric": rmspe_xg,
                     "early_stopping_rounds": 100}
        clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                           param_grid=clf_dict[MODEL]["paramteters"],
                           n_jobs=3, scoring=rmspe, verbose=1,
                           fit_params=fit_param)
    else:
        clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                           param_grid=clf_dict[MODEL]["paramteters"],
                           n_jobs=3, scoring=rmspe, verbose=1)
    clf.fit(train_X, train_y)
    print(clf.best_score_)

    # Feature names, used as the index of the coefficient/importance dumps.
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_,
                            name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s.csv" % MODEL
        coef_df.to_csv(coeffile)

    print("... start y_pred")
    # The model was trained on log1p(Sales); expm1 maps predictions back.
    y_pred = np.expm1(clf.predict(test_X))
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
Ejemplo n.º 14
0
class MuscleClassifier():
	""" Text classifier that maps exercise descriptions to muscle groups.

		Raw strings are vectorized with a FeatureUnion of character and word
		n-gram counts and classified with a BernoulliNB model. Labels are
		mapped to integers with a LabelEncoder.

		Attributes:
			model: the classifier (unpickled, or a fresh BernoulliNB).
			le: the label encoder; set by auto-loading or by train().
			vectorizer: the FeatureUnion; set by auto-loading or by train().
	"""

	def __init__(self, auto_load=True):
		""" Initializes our MuscleClassifier
			Option to preload it or start from fresh model

			With auto_load=False only self.model is created; self.le and
			self.vectorizer do not exist until train() has been called.
		"""

		#=====[ If auto_load, then we rehydrate our existing models ]=====
		if auto_load:

			self.model = self._load_pickle('modules/pickled/muscle_classifier.p')
			self.le = self._load_pickle('modules/pickled/muscle_classifier_le.p')
			self.vectorizer = self._load_pickle('modules/pickled/muscle_classifier_vectorizer.p')

		else:

			self.model = BernoulliNB()

	@staticmethod
	def _load_pickle(path):
		""" Unpickles and returns the object stored at path """

		#=====[ Binary mode is required for pickles on Python 3 (and on Windows); the with-block closes the handle ]=====
		# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
		with open(path, 'rb') as handle:
			return pickle.load(handle)

	def train(self, muscle_groups, labels):
		"""
			Vectorizes raw input and trains our classifier

			muscle_groups: iterable of raw text samples.
			labels: iterable of text labels, one per sample.

			Replaces self.le and self.vectorizer, fits self.model and prints
			the accuracy measured on the training data itself.
		"""

		#=====[ Instantiate label encoder to turn text labels into ints ]=====
		self.le = preprocessing.LabelEncoder()

		#=====[ Declare vectorizers and merge them via a FeatureUnion ]=====
		char_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(3,8), analyzer='char', encoding='utf-8')
		word_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(1,5), analyzer='word', encoding='utf-8')

		self.vectorizer = FeatureUnion([('char',char_vzr),('word',word_vzr)])

		#=====[ Transform our input and labels ]=====
		X = self.vectorizer.fit_transform(muscle_groups).toarray()
		Y = self.le.fit_transform(labels)

		#=====[ Fit our model and then run inference on training data ]=====
		self.model.fit(X,Y)
		y = self.model.predict(X)

		#=====[ Report Training Accuracy: fraction of predictions that MATCH the labels ]=====
		# (the previous y != Y comparison reported the error rate under the name "accuracy")
		print("Training Accuracy: %f " % (sum(y == Y)/float(len(Y))))

	def predict(self, exercises):
		""" Takes in raw input, vectorizes it, and reports back predicted muscle group

			Returns self.le.classes_ indexed by the model's integer predictions.
		"""

		X = self.vectorizer.transform(exercises).toarray()
		y = self.model.predict(X)

		return self.le.classes_[y]
Ejemplo n.º 15
0
def test_feature_stacker_weights():
    """A transformer weight must scale only that transformer's columns."""
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)

    steps = [("pca", pca), ("select", select)]
    stacker = FeatureUnion(steps, transformer_weights={"pca": 10})
    stacker.fit(X, y)
    weighted = stacker.transform(X)

    # PCA block scaled by 10, selected column left untouched.
    pca_block, selected_column = weighted[:, :-1], weighted[:, -1]
    assert_array_almost_equal(pca_block, 10 * pca.fit_transform(X))
    assert_array_equal(selected_column,
                       select.fit_transform(X, y).ravel())
Ejemplo n.º 16
0
def test_feature_union():
    """Sanity checks for FeatureUnion: output, clone, params and validation."""
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)  # centre in place
    y = iris.target
    n_rows = X.shape[0]

    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    union = FeatureUnion([("svd", svd), ("select", select)])
    union.fit(X, y)
    dense_out = union.transform(X)
    assert_equal(dense_out.shape, (n_rows, 3))

    # The first columns are the SVD output, the last one the selected feature.
    assert_array_almost_equal(dense_out[:, :-1], svd.fit_transform(X))
    assert_array_equal(dense_out[:, -1],
                       select.fit_transform(X, y).ravel())

    # Sparse input must give the same numbers; random_state=0 on svd keeps
    # the refit reproducible.
    union = FeatureUnion([("svd", svd), ("select", select)])
    sparse_out = union.fit_transform(sparse.csr_matrix(X), y)
    assert_array_almost_equal(dense_out, sparse_out.toarray())

    # Cloning must not warn and must copy the inner transformers.
    cloned = assert_no_warnings(clone, union)
    first_original = union.transformer_list[0][1]
    first_cloned = cloned.transformer_list[0][1]
    assert_false(first_original is first_cloned)

    # Nested parameters can be changed through set_params.
    union.set_params(select__k=2)
    assert_equal(union.fit_transform(X, y).shape, (n_rows, 4))

    # A transformer without fit_transform must still work inside the union.
    union = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    with_mock = union.fit_transform(X, y)
    assert_equal(with_mock.shape, (n_rows, 8))

    # A step without transform must be rejected at construction time.
    expected_message = ('All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b')
    invalid_steps = [("transform", Transf()), ("no_transform", NoTrans())]
    assert_raises_regex(TypeError, expected_message, FeatureUnion,
                        invalid_steps)

    # The step list may also be given as a tuple of tuples.
    union = FeatureUnion((("svd", svd), ("select", select)))
    union.fit(X, y)
Ejemplo n.º 17
0
 def test_reference_plusplus_legacy(self):
     """Two-scale legacy V1-like features must match the stored reference."""
     sample_paths = []
     for index in range(10):
         sample_paths.append('./v1like_ref/sample_{}.png'.format(index))
     expected = loadmat('./v1like_ref/reference_v1like_result_plusplus.mat')['feature_matrix']

     # Materialise the images as a list: per the original note, FeatureUnion
     # does not seem to accept an iterator for X.
     images = [imread(path) for path in sample_paths]

     scale_1 = v1like.V1Like(pars_baseline='simple_plus', legacy=True, debug=debug)
     scale_2 = v1like.V1Like(pars_baseline='simple_plusplus_2nd_scale', legacy=True, debug=debug)
     two_scale = FeatureUnion([('scale_1', scale_1), ('scale_2', scale_2)])

     with Timer('simple_plusplus legacy version'):
         actual = two_scale.transform(images)

     self.assertEqual(expected.dtype, actual.dtype)
     self.assertEqual(expected.shape, actual.shape)
     if debug:
         # Largest absolute deviation from the reference.
         print(abs(expected[:, :] - actual[:, :]).max())
     self.assertTrue(np.allclose(expected, actual, atol=tol))
Ejemplo n.º 18
0
def test_feature_union_weights():
    """A transformer weight must scale only that transformer's columns."""
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    union = FeatureUnion(
        [("pca", pca), ("select", select)],
        transformer_weights={"pca": 10},
    )
    union.fit(X, y)
    weighted = union.transform(X)

    # The shared pca object is refitted for the expected value;
    # random_state=0 controls its random stream.
    expected_pca_block = 10 * pca.fit_transform(X)
    assert_array_almost_equal(weighted[:, :-1], expected_pca_block)

    expected_selected = select.fit_transform(X, y).ravel()
    assert_array_equal(weighted[:, -1], expected_selected)
Ejemplo n.º 19
0
def build_pipeline():
    """Train, evaluate and package a TF-IDF + MultinomialNB text classifier.

    Steps, in order:

    1. Load the train/test split from ``get_training_data()``.
    2. Fit a FeatureUnion whose single branch selects item ``'x'``, applies
       TF-IDF and keeps the 1000 best features.
    3. Grid-search the smoothing parameter ``alpha`` of ``MultinomialNB`` on
       the extracted features.
    4. Score the best estimator on the test split (precision, recall,
       F-score, support, ROC AUC).
    5. Save everything through ``PackagedPipeline(...).save()``.

    Returns
    -------
    None. The packaged pipeline is persisted by ``p.save()``.
    """
    x_train, x_test, y_train, y_test = get_training_data()
    tfidf = TfidfVectorizer()

    feature_union = FeatureUnion(
                transformer_list=[
                    ('x', Pipeline([
                        ('selector', ItemSelector(key='x')),
                        ('tfidf', tfidf),
                        ('best', SelectKBest(k=1000))
                    ]))
                ])

    X_features = feature_union.fit(x_train, y_train).transform(x_train)

    # The searched estimator is a bare MultinomialNB, so grid keys must be its
    # own parameter names. The previous keys (``univ_select__k``,
    # ``mnb__alpha``) addressed pipeline steps that do not exist on it and
    # made ``grid.fit`` raise an invalid-parameter error. Feature selection
    # stays fixed at k=1000 inside ``feature_union``.
    param_grid = {'alpha': [0.01, 0.1, 1.0]}
    grid = GridSearchCV(MultinomialNB(), param_grid=param_grid)
    grid.fit(X_features, y_train)
    c = grid.best_estimator_

    X_test = feature_union.transform(x_test)
    pred = np.array(c.predict(X_test))
    # Probability of the second class (column 1); assumes a binary problem,
    # as the original per-row ``a[1]`` indexing did.
    pred_proba = np.asarray(c.predict_proba(X_test))[:, 1]

    # Evaluate against the held-out labels; the previous code referenced an
    # undefined name ``actual`` here.
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, pred)
    # ROC needs continuous scores to sweep thresholds; hard 0/1 predictions
    # collapse the curve to a single operating point.
    fpr, tpr, _ = roc_curve(y_test, pred_proba)
    auc_score = auc(fpr, tpr)

    now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')

    metadata = {
        'pipeline':      str(grid.best_estimator_),
        'created_at':    now,
        'git_hash':      0,
        'precision':     [float(p) for p in precision],
        'recall':        [float(r) for r in recall],
        'fscore':        [float(f) for f in fscore],
        'support':       [int(s) for s in support],
        'auc': auc_score
    }

    p = PackagedPipeline(pipeline=grid.best_estimator_, feature_union=feature_union, metadata=metadata,
        x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)
    p.save()
def validation_model(df, MODEL):
    """Train on the non-validation rows of ``df`` and dump validation results.

    Rows with ``valflag != 1`` are used for training and rows with
    ``valflag == 1`` for validation. Steps, in order:

    1. Fit a FeatureUnion built from ``features.feature_list`` on the
       training rows, with ``"Sales"`` as the target.
    2. Grid-search ``clf_dict[MODEL]`` (scored with ``rmspe``) and print the
       grid scores, best estimator, best score and best parameters.
    3. Dump ``coef_`` / ``feature_importances_`` when the best estimator has
       them (file names carry a ``_validation`` suffix).
    4. Predict the validation rows and write predicted vs. actual sales to
       ``submission_validation_<MODEL>.csv`` with a 1-based index.

    Parameters
    ----------
    df : DataFrame with at least ``"valflag"`` and ``"Sales"`` columns.
    MODEL : key into ``clf_dict``; also used in the output file names.

    Returns
    -------
    None. Results are written to disk.
    """
    # print(...) with one argument prints the same on Python 2 and 3; the
    # bare print statement is a SyntaxError on Python 3.
    print("... start validation")

    fu_obj = FeatureUnion(transformer_list=features.feature_list)

    train_df = df[(df["valflag"] != 1)]

    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()

    # NOTE: "paramteters" is the key actually read from clf_dict (defined
    # elsewhere); keep the spelling in sync with that dict.
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, cv=None)
    clf.fit(train_X, train_y)
    print(clf.grid_scores_)
    print(clf.best_estimator_)
    print(clf.best_score_)
    print(clf.best_params_)

    # Feature names, used as the index of the coefficient/importance dumps.
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s_validation.csv" % MODEL
        coef_df.to_csv(coeffile)
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_,
                            name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s_validation.csv" % MODEL
        coef_df.to_csv(coeffile)

    val_df = df[(df["valflag"] == 1)]
    test_X = fu_obj.transform(val_df)
    test_y = val_df["Sales"].as_matrix()

    y_pred = clf.predict(test_X)
    pred_sr = pd.Series(y_pred, name="Sales_Pred")
    y_sr = pd.Series(test_y, name="Sales")
    # Both series carry a fresh 0-based index; shift it so the CSV is 1-based.
    res = pd.concat([pred_sr, y_sr], axis=1).rename(index=lambda x: x + 1)
    submissionfile = SUBMISSION + "submission_validation_%s.csv" % MODEL
    res.to_csv(submissionfile)
Ejemplo n.º 21
0
class PerClassFeatureSelector:
    """Fit one copy of a transformer per class and concatenate the outputs.

    For every distinct normalised genre in ``y`` a deep copy of the first
    transformer passed to the constructor is fitted against a boolean
    "is this genre" target (one-vs-rest). The fitted copies are then
    combined in a FeatureUnion whose step names are the genres.

    Only ``transformers[0]`` is used; any further positional arguments are
    stored on ``self.transformers`` but never read by this class.
    """
    def __init__(self,*transformers):
        self.transformers=transformers
        # Set by fit(); the FeatureUnion of the per-genre transformers.
        self.transformer=None


    def fit(self,X,y):
        """Fit one transformer copy per genre found in ``y``.

        Parameters
        ----------
        X : training samples, passed unchanged to every copy's ``fit``.
        y : iterable of genre labels, one per sample; each label is
            normalised with ``normalize_genre_string(label, 1)``.

        Returns
        -------
        self, so calls can be chained.

        Raises
        ------
        IndexError
            If the selector was constructed without any transformer and
            ``y`` contains at least one label.
        """
        feature_logger.info("Fitting transformers for each class")
        # Normalise every label once instead of once per genre.
        normalized=[normalize_genre_string(g,1) for g in y]
        genre_set=set(normalized)

        #stage 1
        transformer_list=[] #list of all the transformers for each class/genre
        for g in genre_set:
            feature_logger.info("Fitting transformer for {}".format(g))
            transformer_obj=copy.deepcopy(self.transformers[0])

            # One-vs-rest target: True where the sample belongs to genre g.
            genre_matches=[g == label for label in normalized]

            transformer_obj.fit(X,genre_matches)
            transformer_list.append((g,transformer_obj))

        #now train the actual transformer
        # n_jobs is passed by keyword: current scikit-learn rejects it as a
        # positional argument. The union itself is not fitted here; its
        # members already are.
        self.transformer=FeatureUnion(transformer_list,n_jobs=1)
        return self

    def transform(self,X):
        """Return the concatenated output of every per-genre transformer."""
        return self.transformer.transform(X)

    def fit_transform(self,X,y):
        """Fit on ``(X, y)`` and return the transformed ``X``."""
        self.fit(X,y)
        return self.transform(X)
    def test_same_result_weight(self):
        """Weighted local and Spark unions must produce identical matrices."""
        X, Z = self.make_text_rdd(2)

        char_options = dict(analyzer="char_wb", ngram_range=(3, 3))
        loc_char = CountVectorizer(**char_options)
        dist_char = SparkCountVectorizer(**char_options)

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        # Same step names and the same weight on both sides.
        loc_union = FeatureUnion(
            [("chars", loc_char), ("words", loc_word)],
            transformer_weights={"words": 10},
        )
        dist_union = SparkFeatureUnion(
            [("chars", dist_char), ("words", dist_word)],
            transformer_weights={"words": 10},
        )

        loc_union.fit(X)
        dist_union.fit(Z)

        local_out = loc_union.transform(X)
        # Stack the collected blocks of the distributed result into one matrix.
        distributed_out = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(local_out.toarray(), distributed_out.toarray())
Ejemplo n.º 23
0
		return self.scaler.transform(float_df)

class SelectCategoryVars(BaseEstimator, TransformerMixin):
	"""Stateless transformer that keeps every column not listed in FLOAT_VARS."""

	def fit(self, X, y=None):
		"""Nothing is learned; return self unchanged."""
		return self

	def transform(self, X, y=None):
		"""Return the columns of X that are not in FLOAT_VARS as a NumPy array."""
		kept_columns = [column for column in X.columns if column not in FLOAT_VARS]
		return np.array(X.loc[:, kept_columns])

# Join the ScalingFloat branch with the SelectCategoryVars branch, side by side.
combine_feature = FeatureUnion([
    ("scalingfloat", ScalingFloat()),
    ("selectcategory", SelectCategoryVars()),
])
# Single-branch variants kept for comparison runs:
# combine_feature = FeatureUnion([("scalingfloat", ScalingFloat())])
# combine_feature = FeatureUnion([("selectcategory", SelectCategoryVars())])
combine_feature.fit(train)
# Both right-hand sides are evaluated (train first, then test) before the
# names are rebound to the transformed matrices.
train, test = combine_feature.transform(train), combine_feature.transform(test)
# print train.shape

# 10-fold grid search over the logistic-regression hyper-parameters.
lrc = GridSearchCV(
    LogisticRegression(),
    param_grid={
        "C": C,
        "penalty": penalty,
        "intercept_scaling": intercept_scaling,
    },
    cv=10,
)
lrc.fit(train, y_train)

y_train_predict = lrc.predict(train)
y_test_predict = lrc.predict(test)

#scaled results: it seems that though RF has higher accuracy, but logistic regression has no overfitting, which can be a huge advantage
# print 'the best parameter setting is:', lrc.best_estimator_
# # the best parameter setting is: LogisticRegression(C=9, class_weight=None, dual=False, fit_intercept=True,
#           # intercept_scaling=3, penalty='l1', random_state=None, tol=0.0001)
# print 'the best CV score in the GridSearchCV is:', lrc.best_score_
class Agent(object):
    """Q-learning agent with one SGD value approximator per discrete action.

    Every state is standardized by ``self.scaler`` and expanded by
    ``self.featurizer`` (a union of RBF samplers) before it reaches the
    regressors. Transitions are kept in a replay memory and replayed in
    mini-batches by ``update_estimator``.
    """

    def __init__(self,
                 num_actions,
                 gamma=0.98,
                 memory_size=5000,
                 batch_size=32):
        """Store the hyper-parameters and build scaler, featurizer and models.

        Args:
            num_actions: number of discrete actions (one regressor each).
            gamma: discount factor applied to the next-state value.
            memory_size: capacity handed to ``ReplayMemory``.
            batch_size: number of transitions replayed per update.
        """
        self.scaler = None
        self.featurizer = None
        self.q_functions = None
        self.gamma = gamma
        self.batch_size = batch_size
        self.num_actions = num_actions
        self.memory = ReplayMemory(memory_size)
        self.initialize_model()

    def initialize_model(self):
        """Fit the scaler and featurizer on random samples and prime the models."""
        # Draw some samples from the observation range and initialize the scaler
        obs_limit = np.array([4.8, 5, 0.5, 5])
        samples = np.random.uniform(-obs_limit, obs_limit,
                                    (1000, obs_limit.shape[0]))
        self.scaler = StandardScaler()
        self.scaler.fit(samples)

        # Initialize the RBF featurizer
        self.featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=80)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=50)),
        ])
        self.featurizer.fit(self.scaler.transform(samples))

        # Create a value approximator for each action
        self.q_functions = [
            SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3)
            for _ in range(self.num_actions)
        ]

        # Initialize every model with all-zero targets; the values are
        # irrelevant, the call only allocates the weights so that predict()
        # works. The features are identical for every action, so they are
        # computed once outside the loop.
        featurized_samples = self.featurize(samples)
        zero_targets = np.zeros((samples.shape[0], ))
        for q_a in self.q_functions:
            q_a.partial_fit(featurized_samples, zero_targets)

    def featurize(self, state):
        """Return the scaled RBF features of one state or a batch of states.

        A 1-D state is promoted to a single-row batch first, so the result
        always has one row per state.
        """
        if len(state.shape) == 1:
            state = state.reshape(1, -1)
        # Hand-crafted alternative kept for reference: (s, |s|) features,
        # i.e. np.hstack((state, np.abs(state))).
        return self.featurizer.transform(self.scaler.transform(state))

    def get_action(self, state, epsilon=0.0):
        """Pick an action epsilon-greedily with respect to the current Q-values.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest predicted Q-value.
        """
        if np.random.random() < epsilon:
            return int(np.random.random() * self.num_actions)
        featurized = self.featurize(state)
        qs = np.array([q.predict(featurized)[0] for q in self.q_functions])
        return np.argmax(qs, axis=0)

    def single_update(self, state, action, next_state, reward, done):
        """Perform one online Q-learning step for a single transition."""
        featurized_state = self.featurize(state)

        if done:
            # Terminal transition: there is no future value to bootstrap
            # from, so the next state does not need to be evaluated at all.
            next_qs = np.zeros(1)
        else:
            featurized_next_state = self.featurize(next_state)
            # Element-wise maximum over the actions (max_a Q(s', a)).
            next_qs = np.max(
                [q_a.predict(featurized_next_state)
                 for q_a in self.q_functions],
                axis=0)

        # Bootstrapped target: r + gamma * max_a Q(s', a)
        target = reward + self.gamma * next_qs

        # Update Q-value estimation
        self.q_functions[action].partial_fit(featurized_state, target)

    def update_estimator(self):
        """Replay a mini-batch from memory and update each action's model.

        Does nothing while the replay memory is still empty.
        """
        if len(self.memory) == 0:
            # Nothing stored yet; an empty batch cannot be featurized.
            return

        if len(self.memory) < self.batch_size:
            # Use the whole memory
            samples = self.memory.memory
        else:
            # Sample some data
            samples = self.memory.sample(self.batch_size)

        # Reformat the minibatch into one array per transition field
        states = np.array([sample.state for sample in samples])
        action = np.array([sample.action for sample in samples])
        next_states = np.array([sample.next_state for sample in samples])
        rewards = np.array([sample.reward for sample in samples])
        dones = np.array([sample.done for sample in samples])

        # max_a Q(s', a) for every transition; terminal ones contribute 0
        featurized_next_states = self.featurize(next_states)
        next_qs = np.array(
            [q_a.predict(featurized_next_states) for q_a in self.q_functions])
        next_qs = np.max(next_qs, axis=0)
        next_qs[dones.astype(bool)] = 0

        # Calculate the updated target values
        targets = rewards + self.gamma * next_qs

        # Calculate featurized states
        featurized_states = self.featurize(states)

        # Get new weights for each action separately
        for a in range(self.num_actions):
            # Find states where a was taken
            idx = action == a

            # If a not present in the batch, skip and move to the next action
            if np.any(idx):
                act_states = featurized_states[idx]
                act_targets = targets[idx]

                # Perform a single SGD step on the Q-function params
                self.q_functions[a].partial_fit(act_states, act_targets)

    def store_transition(self, *args):
        """Push one transition (passed through unchanged) into replay memory."""
        self.memory.push(*args)
Ejemplo n.º 25
0
    dataset = pd.DataFrame(data=catconversion.fit_transform(orig_dataset),
                           columns=fcols,
                           index=orig_dataset.index)
    target = FeatureColumnsExtractor(
        settings.TARGET).fit_transform(orig_dataset).apply(nonlinearity)

    import time
    before = time.time()
    pipeline = overall_pipeline()

    cv = KFold(len(target), n_folds=4, random_state=2, shuffle=False)
    submission = SqrtHazardSubmission(pipeline, 'XGB_Direct_OneHot', cv=cv)

    submission.fit(dataset,
                   target,
                   perform_cv=True,
                   scoring=scorer_normalized_gini,
                   n_jobs=2,
                   verbose=3)

    # print('fitted. time:', time.time() - before)

    original_test_set = pd.read_csv(settings.TEST_FILE)
    test_set = pd.DataFrame(data=catconversion.transform(original_test_set),
                            columns=fcols,
                            index=original_test_set.index)

    predictions = submission.predict(test_set)
    submission.create_submission(predictions, original_test_set,
                                 settings.SUBMIT_MY_XGB_DIRECT_ONE_HOT)
Ejemplo n.º 26
0
# Load the unseen documents; every line of a ".timetest" file is parsed as JSON.
new_data = get_data_frame(
    new_data_directory,
    lambda line: json.loads(line),
    extension=".timetest",
)

y_train = data_train.target
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.3,
    stop_words='english',
)

# Branch 1: TF-IDF over the extracted 'content' field.
content_pipeline = FeaturePipeline([
    ('cont1', TextExtractor('content')),
    ('vec', vectorizer),
])
# Branch 2: bigrams selected by ChiSqBigramFinder, hashed into a sparse matrix.
# NOTE(review): `non_negative` is a legacy FeatureHasher argument -- confirm
# that the installed scikit-learn release still accepts it.
colloc_pipeline = FeaturePipeline([
    ('cont1', TextExtractor('content')),
    ('coll', ChiSqBigramFinder(score_thr=70)),
    ('vectc', FeatureHasher(input_type="string", non_negative=True)),
])

# Stack both branches side by side.
preprocess = FeatureUnion([
    ('cp', content_pipeline),
    ('op', colloc_pipeline),
])
X_train = preprocess.fit_transform(data_train.data)
X_new = preprocess.transform(new_data.data)

# NOTE(review): loss='l2' is the legacy spelling of 'squared_hinge' -- confirm
# that the installed scikit-learn release still accepts it.
model = LinearSVC(loss='l2', penalty='l2', tol=1e-3)
trained_model = model.fit(X_train, y_train)
stuff = trained_model.decision_function(X_new)
Ejemplo n.º 27
0
# Standardize the features with statistics learned from the training set only,
# then apply the same transformation to the test set.
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

## PCA and Feature Selection

# The triple-quoted string below is a disabled, PCA-only variant of this step.
# As a bare string expression it has no effect when the script runs.
'''pca = PCA(n_components=100)  
pca.fit(X_train_scaled)
#print(pca.explained_variance_ratio_) 
X_train_reduced = pca.transform(X_train_scaled)
X_test_reduced = pca.transform(X_test_scaled)
'''

# Concatenate 800 PCA components with the 850 features kept by SelectKBest.
pca = PCA(n_components=800)
selection = SelectKBest(k=850)
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
# NOTE(review): the selector is fit against train_labels while the regressor
# below is fit against Y_train_raw -- confirm both refer to the same targets.
combined_features.fit(X_train_scaled, train_labels.ravel())
# print(pca.explained_variance_ratio_)
X_train_reduced = combined_features.transform(X_train_scaled)
X_test_reduced = combined_features.transform(X_test_scaled)



## Train final Classifiers
# Alternative that was tried: clf = Ridge(alpha=.5)
clf = Lasso(alpha=.03)
clf.fit(X_train_reduced, Y_train_raw)
Y_predicted = clf.predict(X_test_reduced)

## Save results to csv
# One prediction per line, formatted with five decimals.
np.savetxt('prediction.csv', Y_predicted, fmt='%.5f',delimiter=',')
Ejemplo n.º 28
0
        ("cat_pipeline", cat_pipeline),
    ])


# In[89]:

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit the preparation pipeline on the full training frame and transform it.
housing_prepared = preparation_pipeline.fit_transform(housing)

# print(housing_prepared)

# Linear regression baseline
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# Spot-check the model on the first five rows.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = preparation_pipeline.transform(some_data)

print("Predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))

# Training-set error: RMSE first, then MAE.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
print(lin_mae)
Ejemplo n.º 29
0
  for i in lde_params:
    lde1=lde(p_dropout=i)
    # lde1.fit(X_train)
    # X_ae = lde1.transform(X_train)
    for j in lde_params:
      lde2=lde(p_dropout=j)
      # lde2.fit(X_ae)
      # X_ae = lde2.transform(X_ae)
      # # lde3=lde(p_dropout=0.15)
      print("For p_dropout = ",i , j)
      # eval_model(clf, X_ae, y, cv=3, n_jobs=-1)
      univ_selection=SelectKBest(k=52)
      combined_features = FeatureUnion([("lde1", lde1), ("lde2", lde2),("univ_selection",univ_selection)])
      # Use combined features to transform dataset:
      combined_features.fit(X_train, y)
      X_features = combined_features.transform(X_train)
      eval_model(clf, X_features, y, cv=3, n_jobs=-1)


  print("linear AE done")

  # ae_params = {'p_dropout':[0.1, 0.3]}
  # estimator = GridSearchCV(p, cv = 2,
  #                        # param_grid = dict(lde1__p_dropout = lde_params,lde2__p_dropout = lde_params)
  #                        )

  # estimator.fit(X_train, y)
  # print("best_estimator_ ",estimator.best_estimator_ )
  # print("best_score_  ",estimator.best_score_ )

# Keyword arguments for the unigram TF-IDF vectorizer, taken from `parameters`.
vectorizer_param = {
    'preprocessor': preprocessor,
    'ngram_range': parameters['ngram_range'],
    'analyzer': 'word',
    'min_df': parameters['min_df'],
    'max_df': parameters['max_df'],
    'binary': parameters['TF_binary'],
    'norm': parameters['norm'],
    'sublinear_tf': parameters['sublinear_tf'],
    'max_features': parameters['max_features'],
}

if __name__ == "__main__":
    # Build every candidate feature extractor; only one pairing is active.
    unigram = StemmedTfidfVectorizer(**vectorizer_param)
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()

    log_state('combine unigram and avg strength features')
    combined_features = FeatureUnion([
        ('unigram', unigram),
        ('avg_strength', avg_strength),
    ])
    # Other pairings that were tried (enable one in place of the union above):
    # log_state('combine unigram and strength features')
    # combined_features = FeatureUnion([('unigram', unigram), ('strength', strength)])
    # log_state('combine unigram and anew features')
    # combined_features = FeatureUnion([('unigram', unigram), ('anew', anew)])
    # log_state('combine unigram and punctuation features')
    # combined_features = FeatureUnion([('unigram', unigram), ('pct', pct)])

    # Fit on the training texts, then reuse the fitted union on the test texts.
    texts, _ = load_train_data('Sentiment140')
    transformed_train = combined_features.fit_transform(texts)

    testdata, _ = load_test_data()
    transformed_test = combined_features.transform(testdata)

    # Persist the feature names and both transformed matrices.
    dump_picle(combined_features.get_feature_names(),
               './data/features/feature_names.p')
    dump_picle(transformed_train,
               "./data/transformed_data/transformed_train.p")
    dump_picle(transformed_test,
               "./data/transformed_data/transformed_test.p")
Ejemplo n.º 31
0
    pipe_feature = FeatureUnion([('window_transformer', WindowTransformerList(searched_words=search_words,
                                                                              n_jobs=n_jobs,
                                                                              min_df=min_df)),
                                 ('bag_of_words', BagOfWordInLine(searched_words=search_words,
                                                                  n_jobs=n_jobs,
                                                                  min_df=min_df)),
                                 ('is_date', IsDate(n_jobs=n_jobs)),
                                 ("position", BoxPositionGetter()),
                                 ('is_digit', ContainsDigit(n_jobs=n_jobs)),
                                 ('is_nom', IsNom(n_jobs=n_jobs)),
                                 ('is_prenom', IsPrenom(n_jobs=n_jobs)),
                                 ])

    X_train = pipe_feature.fit_transform(X_train)
    X_test = pipe_feature.transform(X_test)

    data = {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test
    }

    from pickle import dump
    with open(data_output, 'wb') as f1:
        dump(data, f1)

    with open(pipe_feature_output, 'wb') as f2:
        dump(pipe_feature, f2)
Ejemplo n.º 32
0
class AuthorshipAttribution:
    """Implements authorship attribution models.

    The constructor vectorizes the whole training corpus. Classifiers for
    the book ("sample") and the author, plus one language model per author,
    are trained lazily and cached on disk under ``models/``.
    """

    def __init__(self, data_set):
        """Collect texts and labels from ``data_set`` and build the features.

        Args:
            data_set: iterable of mappings with the keys ``"text"``,
                ``"book"``, ``"author"`` and ``"pos"``.
        """
        self.corpus = []
        self.book_labels = []
        self.author_labels = []
        self.tags = []
        for item in data_set:
            self.corpus.append(item["text"])
            self.book_labels.append(item["book"])
            self.author_labels.append(item["author"])
            self.tags.append(item["pos"])

        self.sample_clf = None
        self.author_clf = None
        self.author_lms = {}

        # word ngram feature generators
        self.word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2),
                                           max_features=2000, binary=False,
                                           decode_error='ignore')
        # char ngram feature generators
        self.char_vector = TfidfVectorizer(analyzer="char", ngram_range=(2, 3),
                                           max_features=2000, binary=False,
                                           decode_error='ignore', min_df=0)
        # POS ngram feature generators
        self.tag_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2),
                                          max_features=2000, binary=False,
                                          decode_error='ignore')
        # punctuation frequency feature generators
        self.punct_vector = TfidfVectorizer(analyzer='char',
                                            preprocessor=util.retain_punct,
                                            max_features=2000, binary=False,
                                            use_idf=False,
                                            decode_error='ignore')
        # concatenate features generators
        self.vectorizer = FeatureUnion([("chars", self.char_vector),
                                        ("words", self.word_vector),
                                        ("puncts", self.punct_vector)])

        # generate features
        print("- Generating features")
        X1 = self.vectorizer.fit_transform(self.corpus)
        X2 = self.tag_vector.fit_transform(self.tags)
        # concatenate two feature matrices
        matrix = sp.hstack((X1, X2))
        self.X = matrix.toarray()

    def generate_test_features(self, corpus, classes, tags):
        """Generate feature matrix of the test corpus passed as argument.

        Returns:
            Tuple ``(X, y)``: the dense feature matrix (text features followed
            by tag features) and ``classes`` converted to a NumPy array.
        """
        X1 = self.vectorizer.transform(corpus)
        X2 = self.tag_vector.transform(tags)
        # concatenate two matrices
        matrix = sp.hstack((X1, X2))
        X = matrix.toarray()
        y = np.asarray(classes)
        return (X, y)

    def train_sample_model(self):
        """Train classifier needed to predict the book using sample text."""
        print("- Training book/work model")
        save_location = os.path.join("models", "clf", "sample_model.p")
        # check and load if a saved model is present, if not train a new one
        if os.path.isfile(save_location):
            # NOTE: unpickling can execute arbitrary code; only trusted,
            # locally produced model files should live at this path.
            with open(save_location, "rb") as model_file:
                self.sample_clf = pickle.load(model_file)
        else:
            X_train, y_train = self.X, np.asarray(self.book_labels)
            model = SVC(kernel='rbf')
            self.sample_clf = model.fit(X_train, y_train)
            with open(save_location, "wb") as model_file:
                pickle.dump(self.sample_clf, model_file)

    def train_author_model(self):
        """Train classifier needed to predict the author using sample text."""
        print("- Training author model")
        save_location = os.path.join("models", "clf", "author_model.p")
        # check and load if a saved model is present, if not train a new one
        if os.path.isfile(save_location):
            # NOTE: unpickling can execute arbitrary code; only trusted,
            # locally produced model files should live at this path.
            with open(save_location, "rb") as model_file:
                self.author_clf = pickle.load(model_file)
        else:
            X_train, y_train = self.X, np.asarray(self.author_labels)
            model = LinearSVC(loss='hinge', dual=True)
            self.author_clf = model.fit(X_train, y_train)
            with open(save_location, "wb") as model_file:
                pickle.dump(self.author_clf, model_file)

    def train_lang_model(self):
        """Train language model needed to predict the next word.

        One model per author is loaded from ``models/lm/<author>.p`` when the
        file exists; otherwise it is trained on the concatenation of that
        author's texts and saved there.
        """
        # Local import: the progress line is written piecewise to stdout so
        # the method behaves the same on Python 2 and Python 3.
        import sys

        print("- Training language model")
        # Collect the texts of every author that has no saved model yet.
        # Pieces are gathered in lists and joined once to avoid repeated
        # string concatenation.
        author_texts = {}
        for author, book in zip(self.author_labels, self.corpus):
            save_location = os.path.join("models", "lm", author+".p")
            if not os.path.isfile(save_location):
                author_texts.setdefault(author, []).append(book)

        sys.stdout.write("  - LM for: [")
        for author in set(self.author_labels):
            sys.stdout.write(" " + author)
            save_location = os.path.join("models", "lm", author+".p")
            lm = LangModel()
            if os.path.isfile(save_location):
                lm.load(save_location)
                self.author_lms[author] = lm
            else:
                lm.train("".join(author_texts[author]))
                self.author_lms[author] = lm
                lm.save(save_location)
        sys.stdout.write("  ]\n")

    def predict_word(self, context, author=None):
        """Predict next word. This first predicts the author if author is not
        passed as an argument, then predicts the next word using author's
        language model.
        """
        if not author:
            author = self.recognize_author([context])[0]
        # print "- predicting word using {}'s language model.".format(author)
        author_lm = self.author_lms[author]
        return author_lm.predict(context)

    def _predict_with(self, clf, test_text, test_class, test_tags):
        """Featurize ``test_text`` and return ``clf``'s predictions for it.

        POS tags are computed with ``util.get_pos_tags`` when none are given.
        """
        if not test_tags:
            text_tags = [util.get_pos_tags(txt) for txt in test_text]
        else:
            text_tags = test_tags
        X_test, _ = self.generate_test_features(test_text,
                                                test_class,
                                                text_tags)
        return clf.predict(X_test)

    def recognize_sample(self, test_text, test_class=None, test_tags=None):
        """Interface to call the classifier and predict the work using sample
        text. The classifier is trained (or loaded) on first use.
        """
        if self.sample_clf is None:
            self.train_sample_model()
        return self._predict_with(self.sample_clf, test_text, test_class,
                                  test_tags)

    def recognize_author(self, test_text, test_class=None, test_tags=None):
        """Interface to call the classifier and predict the author using sample
        text. The classifier is trained (or loaded) on first use.
        """
        if self.author_clf is None:
            self.train_author_model()
        return self._predict_with(self.author_clf, test_text, test_class,
                                  test_tags)
# Model keys that get an early-stopping validation split before fitting.
_XGB_MODELS = ("XGB_REG", "XGB_RANK", "XGB_REG2")


def _fit_grid_search(model, X, y, xgb_verbose=2):
    """Grid-search the estimator registered under ``model`` on ``(X, y)``.

    For the XGB model keys, 20% of the data is split off and passed as an
    ``eval_set`` with early stopping; the search is then fit on the remaining
    80%. Returns the fitted ``GridSearchCV`` object.
    """
    estimator_cls = clf_dict[model]["clf"]
    param_grid = clf_dict[model]["paramteters"]
    if model in _XGB_MODELS:
        X, xgb_valid_X, y, xgb_valid_y = train_test_split(X, y, test_size=0.2)
        fit_param = {"eval_set": [(X, y), (xgb_valid_X, xgb_valid_y)],
                     "early_stopping_rounds": 50}
        search = GridSearchCV(estimator=estimator_cls(),
                              param_grid=param_grid,
                              n_jobs=3, verbose=xgb_verbose,
                              fit_params=fit_param)
    else:
        search = GridSearchCV(estimator=estimator_cls(),
                              param_grid=param_grid,
                              n_jobs=3, verbose=2)
    search.fit(X, y)
    return search


def _write_submission(optimizer, predictions, ids, filename):
    """Transform ``predictions`` with a fitted ``optimizer`` and save as CSV."""
    y_pred = optimizer.transform(predictions)
    pred_sr = pd.Series(y_pred, name="Response", index=ids)
    pred_sr.to_csv(SUBMISSION + filename, header=True, index_label="ID")


def prediction(train_df, test_df, MODELS):
    """Train the requested models and write the submission files.

    Args:
        train_df: training frame; must contain a ``"Response"`` column.
        test_df: test frame; must contain an ``"Id"`` column.
        MODELS: comma-separated keys into ``clf_dict``.

    Side effects:
        Writes ``train_dump.csv``, ``test_dump.csv`` and several
        ``submission_*.csv`` files under ``SUBMISSION``.
    """
    print("...create feature")
    fu_obj = FeatureUnion(transformer_list=features.get_feature_list())
    train_X = fu_obj.fit_transform(train_df, train_df["Response"])
    # `.values` replaces `Series.as_matrix()`, which newer pandas removed.
    train_y = train_df["Response"].values
    train_dump_df = pd.DataFrame(train_X, columns=get_split_feature_list(fu_obj))
    train_dump_df["target"] = train_y
    train_dump_df["ID"] = -1
    train_dump_df.to_csv(SUBMISSION + "train_dump.csv", index=False)
    test_X = fu_obj.transform(test_df)
    test_dump_df = pd.DataFrame(test_X, columns=get_split_feature_list(fu_obj))
    test_dump_df["ID"] = test_df["Id"]
    test_dump_df.to_csv(SUBMISSION + "test_dump.csv", index=False)

    # One post-processing object per (strategy, prediction source) pair; the
    # "2" variants are used for the plain average of the model predictions.
    oc_obj = oc.OptimCutPoint()
    oc_obj2 = oc.OptimCutPoint()
    oo_obj = oo.OptimOffset()
    oo_obj2 = oo.OptimOffset()
    oo_all_obj = oo.OptimOffset(True)
    oo_all_obj2 = oo.OptimOffset(True)

    model_list = MODELS.split(",")
    clf_list = []
    valid_list = []
    kf = KFold(len(train_X), random_state=7777, n_folds=5)
    print("...start fitting")
    for model in model_list:
        print("... start fit %s model" % model)
        # Out-of-fold predictions, used later to fit the blender and the
        # cut-point / offset optimizers.
        valid_pred_list = []
        valid_label_list = []
        for train_index, test_index in kf:
            f_clf = _fit_grid_search(model,
                                     train_X[train_index],
                                     train_y[train_index])
            valid_pred_list.append(f_clf.predict(train_X[test_index]))
            valid_label_list.append(train_y[test_index])
        valid_list.append(np.concatenate(valid_pred_list))
        concat_valid_y = np.concatenate(valid_label_list)

        # Final model for this key, fit on the full training set.
        clf_list.append(_fit_grid_search(model, train_X, train_y,
                                         xgb_verbose=1))

    print("... start optim cutting")
    if len(clf_list) > 1:
        # Blend the models twice: with a linear regression fit on the
        # out-of-fold predictions, and with a plain average.
        test_predict_list = [c.predict(test_X) for c in clf_list]
        valid_predict_X = np.c_[valid_list].T
        test_predict_X = np.c_[test_predict_list].T
        linear_reg = sklearn.linear_model.LinearRegression()
        linear_reg.fit(valid_predict_X, concat_valid_y)
        print(linear_reg.intercept_)
        print(linear_reg.coef_)
        valid_ave_predict = valid_predict_X.mean(axis=1)[None].T
        valid_predict = linear_reg.predict(valid_predict_X)[None].T
        test_ave_predict = test_predict_X.mean(axis=1)[None].T
        test_predict = linear_reg.predict(test_predict_X)[None].T
    else:
        # Single model: the optimizers are fit on its training-set predictions.
        use_clf = clf_list[0]
        concat_valid_y = train_y
        valid_predict = use_clf.predict(train_X)[None].T
        test_predict = use_clf.predict(test_X)[None].T

    print("...start y_pred")
    test_ids = test_df["Id"]
    oo_obj.fit(valid_predict, concat_valid_y)
    oo_all_obj.fit(valid_predict, concat_valid_y)
    oc_obj.fit(valid_predict, concat_valid_y)
    _write_submission(oo_obj, test_predict, test_ids,
                      "submission_offset_%s.csv" % MODELS)
    _write_submission(oo_all_obj, test_predict, test_ids,
                      "submission_offset_all_%s.csv" % MODELS)
    _write_submission(oc_obj, test_predict, test_ids,
                      "submission_cutpoint_%s.csv" % MODELS)

    if len(clf_list) > 1:
        oo_obj2.fit(valid_ave_predict, concat_valid_y)
        # Fit on the averaged predictions, matching the data this object is
        # applied to below (it was previously fit on the regression blend).
        oo_all_obj2.fit(valid_ave_predict, concat_valid_y)
        oc_obj2.fit(valid_ave_predict, concat_valid_y)
        _write_submission(oo_obj2, test_ave_predict, test_ids,
                          "submission_offset_ave_%s.csv" % MODELS)
        _write_submission(oo_all_obj2, test_ave_predict, test_ids,
                          "submission_offset_all_ave_%s.csv" % MODELS)
        _write_submission(oc_obj2, test_ave_predict, test_ids,
                          "submission_cutpoint_ave_%s.csv" % MODELS)
    print("... finish y_pred")
Ejemplo n.º 34
0
    Xtrain = vectorizer.fit_transform(Xtrain)
    print('Shape of Xtrain:', Xtrain.shape)

    print('Numerifying labels...')
    le = LabelEncoder()
    # Fit label encoder on Y for now, not Ytrain, to ensure we really 'know' all labels in the val set (This should usually be guaranteed by task)
    le.fit(Y)
    Ytrain = le.transform(Ytrain)
    print('Shape of Ytrain:', Ytrain.shape)

    print('Fitting SVM ...')
    clf = LinearSVC(random_state=0)
    clf.fit(Xtrain, Ytrain)

    print('Predicting...')
    Yguess_svm = clf.predict(vectorizer.transform(Xtest))

    # Transform Yguess back to nominal labels
    Yguess_svm = le.inverse_transform(Yguess_svm)
    # Evaluate on val set
    print()
    print('*' * 50)
    print('Results for SVM baseline:')
    evaluate(Ytest, Yguess_svm)
    print('*' * 50)
    '''
    # classifier_svm = Pipeline([('vec', vectorizer),
    #                             ('classify', SVC(kernel=Kernel, C=C_val))])
    X_mat = vectorizer.fit_transform(X)
    print('shape of X_mat:', X_mat.shape)