def train_model(trainset):
	word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), binary=False, max_features=2000, min_df=1, decode_error="ignore")
#	print word_vector	
	print "works fine"
	char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char", binary=False, min_df=1, max_features=2000, decode_error="ignore")
	vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])
	corpus = []
	classes = []

	for item in trainset:
		corpus.append(item['text'])
		classes.append(item['label'])

	print "Training instances : ", 0.8*len(classes)
	print "Testing instances : ", 0.2*len(classes) 
	
	matrix = vectorizer.fit_transform(corpus)
	print "feature count : ", len(vectorizer.get_feature_names())
	print "training model"
	X = matrix.toarray()
	y = numpy.asarray(classes)
	model = LinearSVC()
	X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
	y_pred = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test)
	#y_prob = OneVsRestClassifier(model).fit(X_train, y_train).decision_function(X_test)
	#print y_prob
	#con_matrix = []
	#for row in range(len(y_prob)):
	#	temp = [y_pred[row]]	
	#	for prob in y_prob[row]:
	#		temp.append(prob)
	#	con_matrix.append(temp)
	#for row in con_matrix:
	#	output.write(str(row)+"\n")
	#print y_pred		
	#print y_test
	
	# indices predicted as 'anonEdited' and indices whose true label is 'anonEdited'
	res1 = [i for i, j in enumerate(y_pred) if j == 'anonEdited']
	res2 = [i for i, j in enumerate(y_test) if j == 'anonEdited']
	# collect the labels involved in either kind of mismatch
	reset = []
	for r in res1:
		if y_test[r] != "anonEdited":  # predicted as anonEdited, but the true label differs
			reset.append(y_test[r])
	for r in res2:
		if y_pred[r] != "anonEdited":  # truly anonEdited, but predicted as something else
			reset.append(y_pred[r])
	
	
	output = open(sys.argv[2], "w")
	for suspect in reset:
		output.write(str(suspect) + "\n")
	output.close()
	cm = confusion_matrix(y_test, y_pred)
	print(cm)
	pl.matshow(cm)
	pl.title('Confusion matrix')
	pl.colorbar()
	pl.ylabel('True label')
	pl.xlabel('Predicted label')
	pl.show()
	print accuracy_score(y_test, y_pred)
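# A hedged usage sketch for train_model above (not part of the original code).
# The function expects a list of {'text': ..., 'label': ...} dicts and writes the
# mismatched labels to the file named in sys.argv[2], so it has to run as a script
# with that argument supplied. The tiny synthetic dataset is illustrative only; a
# real corpus needs enough documents per class for the 80/20 split to be useful.
import sys

if __name__ == "__main__" and len(sys.argv) > 2:
	sample_trainset = (
		[{'text': "sample edited text %d" % i, 'label': "anonEdited"} for i in range(10)] +
		[{'text': "sample authored text %d" % i, 'label': "someUser"} for i in range(10)]
	)
	train_model(sample_trainset)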
Example #2
def concat_feature_extractors(train_data, labels):
    # This dataset is way too high-dimensional. Better do PCA:
    pca = PCA(n_components = 2)

    # Maybe some original features were good, too?
    selection = SelectKBest(k = 1)

    # Build estimator from PCA and Univariate selection:

    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

    # Use combined features to transform dataset:
    X_features = combined_features.fit(train_data, labels).transform(train_data)

    # Classify:
    svm = SVC(kernel = "linear")
    svm.fit(X_features, labels)

    # Do grid search over k, n_components and C:

    pipeline = Pipeline([("features", combined_features), ("svm", svm)])

    param_grid = dict(features__pca__n_components = [1, 2, 3],
                      features__univ_select__k = [1, 2],
                      svm__C = [0.1, 1, 10])

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose = 10)
    grid_search.fit(train_data, labels)
    print(grid_search.best_estimator_)
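# Hedged usage sketch for concat_feature_extractors above: the iris dataset is just
# a convenient stand-in for train_data / labels; any (n_samples, n_features) array
# with matching labels would do.
from sklearn.datasets import load_iris

iris = load_iris()
concat_feature_extractors(iris.data, iris.target)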
 def test_feature_union(self):
     """Tests that combining multiple featurizers works as expected"""
     modules = ["bag-of-words", "entities"]
     modules_list, _ = modules_to_dictionary(modules)
     feature_union = FeatureUnion(modules_list)
     feature_union.fit(texts_entities, outcomes)
     feature_union.transform(["unknown"])
def testSVC(lbda=1.0, n_components=20, kbest=4):
	otto = load_otto()
	X = otto.data
	y = otto.target
	# X = otto.data[:10000, :10]
	# y = otto.target[:10000]

	scaler = StandardScaler().fit(X)
	X = scaler.transform(X)

	pca = PCA(n_components=n_components)
	selection = SelectKBest(k=kbest)

	combined_features = FeatureUnion(
		[("pca", pca), ("univ_select", selection)]
	)
	X_features = combined_features.fit(X, y).transform(X)

	svc = SVC(C=1.0/lbda, kernel='rbf', cache_size=400, probability=True)
	pipe = Pipeline(steps=[('features', combined_features), ('svc', svc)])
	trainData = X
	trainTarget = y
	pipe.fit(trainData, trainTarget)
	test_otto = load_testotto()
	testData = test_otto.data
	testData = scaler.transform(testData)
	# save the prediction
	prediction = pipe.predict_proba(testData)
	proba = pipe.predict_proba(testData)
	save_submission(lbda, proba, prediction)
Example #5
	def best_estimator(self, X, y):
		try:
			pca = PCA(n_components=2)
			selection = SelectKBest(k=2)
			combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
			X_features = combined_features.fit(X, y).transform(X)
			regr = linear_model.LassoCV()
			pipeline = Pipeline([("features", combined_features), ("regression", regr)])

			if 'batter' in self.player:
				param_grid = dict(features__pca__n_components=[1, 2, 3],
				                  features__univ_select__k=[1, 2])
			else:
				param_grid = dict(features__pca__n_components=[1, 2,3],
				                  features__univ_select__k=[1,2])

			grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=100)
			grid_search.fit(X, y)
			self.modelled = True
			regr = grid_search
			return regr
		except ValueError,e:
			print e
			self.modelled = False
			return None
def prediction(train_df, test_df, MODEL):

    print "... start prediction"

    fu_obj = FeatureUnion(transformer_list=features.feature_list)

    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()

    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, verbose=1)
    clf.fit(train_X, train_y)
    print clf.best_score_
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_,
                            name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s.csv" % MODEL
        coef_df.to_csv(coeffile)

    print "... start y_pred"
    test_X = fu_obj.transform(test_df)

    y_pred = clf.predict(test_X)
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
Example #7
	def trainItalianSexClassifier(self):
		#get correct labels from dictionary in trainY and testY
		trainX = self.italianTrainData[0]
		trainY = self.getYlabels(self.italianTrainData[1], 'sex')

		

		combined_features = FeatureUnion([("tfidf", TfidfVectorizer()),
										("ngrams", TfidfVectorizer(ngram_range=(3, 3), analyzer="char")), 
										("counts", CountVectorizer()),
										("latin", Latin()),	
										],transformer_weights={
											'latin': 1,
											'tfidf': 2,
											'ngrams': 2,
											'counts': 1,

        								})
		
		X_features = combined_features.fit(trainX, trainY).transform(trainX)
		classifier = svm.LinearSVC()
		pipeline = Pipeline([("features", combined_features), ("classifier", classifier)])
		pipeline.fit(trainX, trainY)
		
		return pipeline
Example #8
	def best_estimator(self, X, y):
		try:
			pca = PCA(n_components=2)
			selection = SelectKBest(k=2)
			combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
			X_features = combined_features.fit(X, y).transform(X)
			regr = linear_model.LassoCV()
			pipeline = Pipeline([("features", combined_features), ("regression", regr)])

			if 'batter' in self.player:
				param_grid = dict(features__pca__n_components=[1],
				                  features__univ_select__k=[1])
			else:
				param_grid = dict(features__pca__n_components=[1,2,3,4],
				                  features__univ_select__k=[1,2,3,4])

			grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=0)
			grid_search.fit(X, y)
			self.modelled = True
			regr = grid_search
			self.R2=r2_score(self.target_matrix,regr.predict(self.feature_matrix)) #Ian: should do R2 on predicted points vs. points on a given day
			return regr
		except ValueError,e:
			print e
			self.modelled = False
			return None
def rbf_kernels(env, n_samples=100000, gamma=[0.01, 0.1], n_components=100):
    """Represent observation samples using RBF-kernels.

    EXAMPLE
    -------
    >>> env = gym.make('MountainCar-v0')
    >>> n_params, rbf = rbf_kernels(env, n_components=100)
    >>> sample = env.observation_space.sample().reshape((1, env.observation_space.shape[0]))
    >>> rbf(sample).shape
    (1, 100)
    """
    observation_examples = np.array([env.observation_space.sample() for _ in range(n_samples)])

    # Fit feature scaler
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Fit feature extractor
    features = []
    for g in gamma:
        features.append(('gamma={}'.format(g), RBFSampler(n_components=n_components // len(gamma), gamma=g)))

    features = FeatureUnion(features)
    features.fit(scaler.transform(observation_examples))

    def _rbf_kernels(observation):
        return features.transform(scaler.transform(observation))

    return _rbf_kernels
Example #10
def pca_kpca(train_data, labels):
    # make_union already returns a FeatureUnion, so it is used directly here;
    # wrapping it in another FeatureUnion would break, since FeatureUnion expects
    # a list of (name, transformer) tuples.
    combined = make_union(PCA(), TruncatedSVD(), KernelPCA())
#    combined = FeatureUnion([('linear_pca', PCA()), ('kernel_pca', KernelPCA())])
    combined.fit(train_data, labels)  # or: combined.fit_transform(train_data, labels)

    return combined
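# Hedged usage sketch for pca_kpca above, again with iris standing in for the data:
# the returned, fitted union concatenates the PCA, TruncatedSVD and KernelPCA outputs.
from sklearn.datasets import load_iris

iris = load_iris()
union = pca_kpca(iris.data, iris.target)
reduced = union.transform(iris.data)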
 def fit(self, X, y=None):
     Trans2 = Q2Transformer()
     Trans3 = Q3Transformer()
     Trans4 = Q4Transformer()
     combined_features = FeatureUnion([("Q2", Trans2), ("Q3", Trans3), ("Q4", Trans4)])
     # note: this assignment replaces the fit method itself with the fitted
     # FeatureUnion; storing it under a separate attribute would be safer
     self.fit = combined_features.fit(X)
     return self
def testLogistic(lbda=1.0, n_components=20, kbest=4):
	# X = otto.data[:1000, :20]
	# y = otto.target[:1000]
	otto = load_otto()
	X = otto.data[:, :]
	y = otto.target[:]
	# n_components = 20
	# kbest = 4
#	print 'y.shape =', y.shape

	scalar = StandardScaler().fit(X)
	X = scalar.transform(X)

	pca = PCA(n_components=n_components)
	selection = SelectKBest(k=kbest)

	combined_features = FeatureUnion(
		[("pca", pca), ('univ_select', selection)]
	)
	X_features = combined_features.fit(X,y).transform(X)

	logistic = LogisticRegression(C=1.0/lbda)
	pipe = Pipeline(steps=[('features', combined_features), ('logistic', logistic)])
	trainData = X
	trainTarget = y
	pipe.fit(trainData, trainTarget)
	# print trainTarget
	test_otto = load_testotto()
	testData = test_otto.data
	testData = scalar.transform(testData)
	# logging.debug('lambda=%.3f: score is %.3f' % (lbda, pipe.score()))
	# save the prediction
	prediction = pipe.predict_proba(testData)
	proba = pipe.predict_proba(testData)
	save_submission(lbda, proba, prediction)
Example #13
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
def convert_testdata(test_gray_data, feature_rule=f.feature_transformer_rule):

    data_df = f.make_test_df(test_gray_data)
    fu = FeatureUnion(transformer_list=feature_rule)
    Std = preprocessing.StandardScaler()

    X_test = fu.fit_transform(data_df)
    #X_test = Std.fit_transform(X_test)

    return X_test
Example #15
def get_pca_transformer(train_x, train_y, n_components=-1):
    if n_components == -1:
        n_components = int(np.ceil(np.sqrt(train_x.shape[1])))

    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=n_components // 2)

    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

    return combined_features.fit(train_x, train_y)
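# Hedged usage sketch for get_pca_transformer above (iris stands in for train_x /
# train_y); the returned FeatureUnion is already fitted and can transform new data.
from sklearn.datasets import load_iris

iris = load_iris()
transformer = get_pca_transformer(iris.data, iris.target)
X_reduced = transformer.transform(iris.data)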
Example #16
 def fit_logreg(self):
     tokenize_sense = CachedFitTransform(Pipeline([
         ('tokenize', Map(compose(tokenize, normalize_special, unescape))),
         ('normalize', MapTokens(normalize_elongations)),
     ]), self.memory)
     features = FeatureUnion([
         # ('w2v_doc', ToCorporas(Pipeline([
         #     ('tokenize', MapCorporas(tokenize_sense)),
         #     ('feature', MergeSliceCorporas(Doc2VecTransform(CachedFitTransform(Doc2Vec(
         #         dm=0, dbow_words=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20,
         #         workers=16
         #     ), self.memory)))),
         # ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs]))),
         # ('w2v_word_avg', Pipeline([
         #     ('tokenize', tokenize_sense),
         #     ('feature', Word2VecAverage(CachedFitTransform(Word2Vec(
         #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
         #     ), self.memory))),
         # ]).fit(self.unsup_docs[:10**6])),
         # ('w2v_word_avg_google', Pipeline([
         #     ('tokenize', tokenize_sense),
         #     ('feature', Word2VecAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
         # ])),
         # ('w2v_word_norm_avg', Pipeline([
         #     ('tokenize', tokenize_sense),
         #     ('feature', Word2VecNormAverage(CachedFitTransform(Word2Vec(
         #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
         #     ), self.memory))),
         # ]).fit(self.unsup_docs[:10**6])),
         ('w2v_word_norm_avg_google', Pipeline([
             ('tokenize', tokenize_sense),
             ('feature', Word2VecNormAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
         ])),
         # ('w2v_word_max', Pipeline([
         #     ('tokenize', tokenize_sense),
         #     ('feature', Word2VecMax(CachedFitTransform(Word2Vec(
         #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
         #     ), self.memory))),
         # ]).fit(self.unsup_docs[:10**6])),
         # ('w2v_word_max_google', Pipeline([
         #     ('tokenize', tokenize_sense),
         #     ('feature', Word2VecMax(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
         # ])),
         # ('w2v_word_inv', ToCorporas(Pipeline([
         #     ('tokenize', MapCorporas(tokenize_sense)),
         #     ('feature', MergeSliceCorporas(Word2VecInverse(CachedFitTransform(Word2Vec(
         #         sg=1, size=100, window=10, hs=0, negative=5, sample=0, min_count=1, iter=20, workers=16
         #     ), self.memory)))),
         # ]).fit([self.train_docs, self.unsup_docs[:10**5], self.val_docs, self.test_docs]))),
     ])
     classifier = LogisticRegression()
     with temp_log_level({'gensim.models.word2vec': logging.INFO}):
         classifier.fit(features.transform(self.train_docs), self.train_labels())
     estimator = Pipeline([('features', features), ('classifier', classifier)])
     return 'logreg({})'.format(','.join(name for name, _ in features.transformer_list)), estimator
Example #17
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
            select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different pca object to control the random_state stream
    fs = FeatureUnion([("pca", pca), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
def set_traindata(df, key):

    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()

    X = fu.fit_transform(df)
    y = np.concatenate(df["label"].apply(lambda x: x.flatten()))

    X = Std.fit_transform(X)

    return (X, y)
def cv_score(train_df, MODEL):
    print "... start cross validation"

    fu_obj = FeatureUnion(transformer_list=features.feature_list)

    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=-1, scoring=rmspe, cv=None)
    print cross_val_score(clf, train_X, train_y, scoring=rmspe, cv=5, n_jobs=3)
def pca(x, y, test_x, n_features=-1):
    if n_features == -1:
        n_features = int(np.ceil(np.sqrt(x.shape[1])))

    pca = PCA(n_components=n_features)
    selection = SelectKBest(k=n_features // 2)

    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    combined_features.fit(x, y)

    return combined_features.transform(x), combined_features.transform(test_x)
def prediction(train_df, test_df, MODEL):

    print "... start prediction"
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    train_df = train_df[(train_df["Open"] == 1) & (train_df["Sales"] > 0)]
    train_X = fu_obj.fit_transform(train_df)
    train_y = np.log1p(train_df["Sales"]).as_matrix()
    train_dump_df = pd.DataFrame(train_X, columns=get_split_feature_list(fu_obj))
    train_dump_df["target"] = train_y
    train_dump_df = train_dump_df.dropna(axis=0)
    print train_dump_df.shape
    train_X = train_dump_df[get_split_feature_list(fu_obj)].values
    train_y = train_dump_df["target"].values
    train_dump_df["ID"] = -1
    train_dump_df.to_csv(SUBMISSION + "train_dump.csv", index=False)
    test_X = fu_obj.transform(test_df)
    test_dump_df = pd.DataFrame(test_X, columns=get_split_feature_list(fu_obj))
    print (test_dump_df == 0).sum(axis=0)
    test_dump_df["ID"] = test_df["Id"]
    test_dump_df.to_csv(SUBMISSION + "test_dump.csv", index=False)
    if MODEL == "XGB":
        train_X, valid_X, train_y, valid_y =\
            train_test_split(train_X, train_y, test_size=0.05)
        fit_param = {"eval_set": [(train_X, train_y), (valid_X, valid_y)],
                     "eval_metric": rmspe_xg,
                     "early_stopping_rounds": 100}
        clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                           param_grid=clf_dict[MODEL]["paramteters"],
                           n_jobs=3, scoring=rmspe, verbose=1,
                           fit_params=fit_param)
    else:
        clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                           param_grid=clf_dict[MODEL]["paramteters"],
                           n_jobs=3, scoring=rmspe, verbose=1)
    clf.fit(train_X, train_y)
    print clf.best_score_
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_,
                            name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s.csv" % MODEL
        coef_df.to_csv(coeffile)

    print "... start y_pred"
    y_pred = np.expm1(clf.predict(test_X))
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
def convert_traindata(train_gray_data, labels):

    data_df = f.make_data_df(train_gray_data, labels)
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()

    X_train = fu.fit_transform(data_df)
    y_train = np.concatenate(data_df["label"].apply(lambda x: x.flatten()))

    X_train = Std.fit_transform(X_train)

    return X_train, y_train
def get_data():
    '''
    get X, y data

    :rtype: tuple
    '''
    _, _, _, train_gray_data, _, _, labels = i_p.load_data()
    data_df = f.make_data_df(train_gray_data, labels)
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    X = fu.fit_transform(data_df)
    y = np.concatenate(data_df["label"].apply(lambda x: x.flatten()))

    return (X, y)
def train_model(trainset, testset):
	word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), binary=False, max_features=2000, min_df=1, decode_error="ignore")
#	print word_vector	
#	print "works fine"
	char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char", binary=False, min_df=1, max_features=2000, decode_error="ignore")
	vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])
	corpus = []
	classes = []
	testclasses = []
	testcorpus = []
	for item in trainset:
		corpus.append(item['text'])
		classes.append(item['label'])
	
	for item in testset:
		testcorpus.append(item['text'])
		testclasses.append(item['label'])

#	print "Training instances : ", len(classes)
#	print "Testing instances : ", len(set(classes)) 
	
	matrix = vectorizer.fit_transform(corpus)
	testmatrix = vectorizer.transform(testcorpus)  # transform only: re-fitting on the test corpus would yield a different feature space
#	print "feature count :. ", len(vectorizer.get_feature_names())
#	print "training model"
	X = matrix.toarray()
	TX = testmatrix.toarray()
	Ty= numpy.asarray(testclasses)
	y = numpy.asarray(classes)
	X_train, X_test, y_train, y_test= train_test_split(X,y,train_size=0.9999,test_size=.00001,random_state=0)
	model = LinearSVC(dual=True, loss='l1')
#	model = SVC()
#	model = NuSVC()
#	model = RandomForestClassifier() 
	#scores=cross_validation.cross_val_score(model,X,y)
	#print "Accuracy "+ str(scores.mean())
#	print y_pred
	y_pred = model.fit(X_train, y_train).predict(TX)
#	y_prob = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test)
#	print(y_prob)
#	cm = confusion_matrix(y_test, y_pred)
#	cr = classification_report(y_test, y_pred)
#	print cr
#	print(cm)
#	pl.matshow()
#	pl.title('Confusion matrix#')
#	pl.colorbar()
#	pl.ylabel('True label')
#	pl.xlabel('Predicted label')
#	pl.show()
	print accuracy_score(Ty, y_pred)
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(
        AttributeError, 'Transformer tr1 (type Transf) does not provide '
        'get_feature_names', ft.get_feature_names)
class MuscleClassifier():

	def __init__(self, auto_load=True):
		""" Initializes our MuscleClassifier
			Option to preload it or start from fresh model 
		"""

		#=====[ If auto_load, then we rehydrate our existing models ]=====
		if auto_load:

			self.model = pickle.load(open('modules/pickled/muscle_classifier.p','r'))
			self.le = pickle.load(open('modules/pickled/muscle_classifier_le.p','r'))
			self.vectorizer = pickle.load(open('modules/pickled/muscle_classifier_vectorizer.p','r'))

		else:

			self.model = BernoulliNB()

	def train(self, muscle_groups, labels):
		""" 
			Vectorizes raw input and trains our classifier 
		"""

		#=====[ Instantiate label encoder to turn text labels into ints ]=====
		self.le = preprocessing.LabelEncoder()

		#=====[ Declare vectorizers and merge them via a FeatureUnion ]=====
		char_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(3,8), analyzer='char', encoding='utf-8')
		word_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(1,5), analyzer='word', encoding='utf-8')

		self.vectorizer = FeatureUnion([('char',char_vzr),('word',word_vzr)])

		#=====[ Transform our input and labels ]=====
		X = self.vectorizer.fit_transform(muscle_groups).toarray()
		Y = self.le.fit_transform(labels)

		#=====[ Fit our model and then run inference on training data ]=====
		self.model.fit(X,Y)
		y = self.model.predict(X)

		#=====[ Report Training Accuracy ]=====
		print "Training Accuracy: %f " % (sum(y == Y)/float(len(Y)))

	def predict(self, exercises):
		""" Takes in raw input, vectorizes it, and reports back predicted muscle group """

		X = self.vectorizer.transform(exercises).toarray()
		y = self.model.predict(X)

		return self.le.classes_[y]
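# Hedged usage sketch for MuscleClassifier above (no pickled models are assumed,
# hence auto_load=False; the exercise names and muscle-group labels are illustrative):
muscle_clf = MuscleClassifier(auto_load=False)
muscle_clf.train(["barbell bench press", "back squat", "bicep curl", "deadlift"],
                 ["chest", "legs", "arms", "back"])
print muscle_clf.predict(["incline bench press"])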
Example #27
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def make_checkdata(mode="df"):
    
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()

    _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data()
    train_keys = train_gray_data.keys()[:2]
   
    train_inputs = {}
    train_labels = {}
    for i in xrange(len(train_keys)):
        input_ = train_gray_data[train_keys[i]]
        label = labels[train_keys[i]]

        train_inputs.update({train_keys[i]:input_})
        train_labels.update({train_keys[i]:label})
 
    test_keys = test_gray_data.keys()[:2]
    test_inputs = {}
    for i in xrange(len(test_keys)):
        input_ = test_gray_data[test_keys[i]]
        test_inputs.update({test_keys[i]:input_})
        
    train_df = f.make_data_df(train_inputs, train_labels)
    test_df = f.make_test_df(test_inputs) 
    

    if mode == "df":

        train_df = train_df.reset_index()
        test_df = test_df.reset_index()
        
        train_df.columns = ["pngname", "input", "label"]
        test_df.columns = ["pngname", "input"]

        return train_df, train_keys, test_df, test_keys


    elif mode == "feature":

        X_train = fu.fit_transform(train_df)
        X_train = Std.fit_transform(X_train)
        y_train = np.concatenate(train_df["label"].apply(lambda x: x.flatten()))
        
        
        
        X_test = fu.fit_transform(test_df)
        X_test = Std.fit_transform(X_test)    
        
        return X_train, y_train, X_test
def test_feature_stacker_weights():
    # test feature stacker with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
            transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # check against expected result
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
            select.fit_transform(X, y).ravel())
def ageClassifier(doc, age):
	""" A function that trains an age classifier """
	xTrain = doc
	yTrain = age

	unionOfFeatures = FeatureUnion([
									('normaltfidf', TfidfVectorizer(preprocessor = identity, tokenizer = identity)),
									('bigrams', TfidfVectorizer(preprocessor = identity, tokenizer = identity, ngram_range = (3,3), analyzer = 'char')),
									('counts', CountVectorizer(preprocessor = identity, tokenizer = identity))
									])

	featureFit = unionOfFeatures.fit(xTrain, yTrain).transform(xTrain)
	classifier = Pipeline([('featureunion', unionOfFeatures), ('cls', svm.SVC(kernel='linear', C=1.5))])
	classifier.fit(xTrain, yTrain)
	
	return classifier
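# Hedged usage sketch for ageClassifier above. It assumes `identity` (the no-op
# preprocessor/tokenizer used by the vectorizers) is defined in this module and
# that documents are passed as raw strings; the toy data is illustrative only.
docs = ["i love skating and gaming so much", "the quarterly report was submitted on time"]
ages = ["teen", "adult"]
age_clf = ageClassifier(docs, ages)
print(age_clf.predict(["my homework was due today"]))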
Example #31
class AICEnsemble(BaseEstimator, ClassifierMixin):
    def __init__(self, candidateFeatures: List[CandidateFeature], classifier):
        self.candidateFeatures = candidateFeatures
        self.classifier = classifier

        self.ensemble_pipeline = FeatureUnion(
            transformer_list=[(str(c), self.generate_pipeline(c))
                              for c in candidateFeatures])

        # calculate weights
        self.AICc = np.array([
            np.min(
                c.runtime_properties['additional_metrics']['AICc_complexity'])
            for c in candidateFeatures
        ])
        #self.AICc = [np.mean(c.runtime_properties['additional_metrics']['AICc_complexity']) for c in candidateFeatures]

        delta_i = self.AICc - np.min(self.AICc)
        summed = np.sum(
            np.array([np.exp(-delta_r / 2.0) for delta_r in delta_i]))
        self.weights = np.array(
            [np.exp(-d_i / 2.0) / summed for d_i in delta_i])

        print(candidateFeatures)
        print(self.weights)

    def generate_pipeline(self, rep):
        best_hyperparameters = rep.runtime_properties['hyperparameters']

        all_keys = list(best_hyperparameters.keys())
        for k in all_keys:
            if 'classifier__' in k:
                best_hyperparameters[k[12:]] = best_hyperparameters.pop(k)

        my_pipeline = Pipeline([
            (str(rep) + '_f', rep.pipeline),
            (str(rep) + '_c',
             ClassifierTransformer(self.classifier(**best_hyperparameters)))
        ])

        return my_pipeline

    def fit(self, X, y=None):
        self.ensemble_pipeline.fit(X, y)
        return self

    def predict_proba(self, X):
        ensemble_predictions = self.ensemble_pipeline.transform(X)

        print(ensemble_predictions)
        print(ensemble_predictions.shape)

        #weight these predictions
        weighted_predictions = np.multiply(ensemble_predictions, self.weights)

        averaged_predictions = np.sum(weighted_predictions, axis=1)

        averaged_predictions_proba = np.zeros(
            (averaged_predictions.shape[0], 2))

        averaged_predictions_proba[:, 0] = averaged_predictions
        averaged_predictions_proba[:, 1] = 1.0 - averaged_predictions
        return averaged_predictions_proba

    def predict(self, X):
        return self.predict_proba(X)[:, 0] < 0.5
Example #32

# Now, our new pipeline:

# In[ ]:

from sklearn.pipeline import FeatureUnion

pipe2 = Pipeline([
    ('u1',
     FeatureUnion([
         ('tfdif_features',
          Pipeline([
              ('cv', CountVectorizer()),
              ('tfidf', TfidfTransformer()),
          ])),
         ('pos_features',
          Pipeline([
              ('pos', PosTagMatrix(tokenizer=nltk.word_tokenize)),
          ])),
     ])),
    ('logit', LogisticRegression()),
])

# In[ ]:

pipe2.fit(X_train_part, y_train_part)
pred = pipe2.predict_proba(X_valid)
log_loss(y_valid, pred)

# Not an improvement, but hey, we learned something new!
Example #33
 feature_list = np.array(feature_list)
 stop_words = helper.read_stopwords()
 # feature_list = feature_list[:, 0]
 #
 union = FeatureUnion(
     transformer_list=[
         ("feature",
          Pipeline([('selector', ItemSelector(1)),
                    ("vec", DictVectorizer(sparse=False))])),
         (
             "content",
             Pipeline([
                 ('selector', ItemSelector(0)),
                 (
                     'cvec',
                     CountVectorizer(
                         # analyzer='char_wb',
                         token_pattern=r"(?u)\b\w+\b",
                         min_df=1,
                         stop_words=stop_words)),
                 ('tfidf', TfidfTransformer())
             ]))
     ],
     transformer_weights={
         "feature": 1.0,
         "content": 1.0
     })
 union.fit_transform(feature_list)
 pipe: Pipeline = union.transformer_list[1][1]
 cvec: CountVectorizer = pipe.named_steps["cvec"]
 arr = cvec.get_feature_names()
Example #34
    def default_pipeline(self,
                         name,
                         n_pca=10,
                         n_best=10,
                         lda_shrink=10,
                         svm_C=10,
                         svm_gamma=10,
                         fdr_alpha=[0.05],
                         fpr_alpha=[0.05]):
        """Use a default combination of parameters for building a pipeline

        Args:
            name: string
                The string for building a default pipeline (see examples below)

        Kwargs:
            n_pca: integer, optional, (def: 10)
                The number of components to search

            n_best: integer, optional, (def: 10)
                Number of best features to consider using a statistical method

            lda_shrink: integer, optional, (def: 10)
                Fit optimisation parameter for the lda

            svm_C/svm_gamma: integer, optional, (def: 10/10)
                Parameters to optimize for the svm

            fdr/fpr_alpha: list, optional, (def: [0.05])
                List of float for selecting features using a fdr or fpr

        Examples:
            >>> # Basic classifiers :
            >>> name = 'lda' # or name = 'svm_linear' for a linear SVM
            >>> # Combine a classifier with a feature selection method :
            >>> name = 'lda_fdr_fpr_kbest_pca'
            >>> # The method above will use an LDA for the feature evaluation
            >>> # and will combine FDR, FPR, k-Best and PCA feature selection.
            >>> # Now we can combine with classifier optimisation :
            >>> name = 'lda_optimized_pca' # will try to optimize an LDA with a pca
            >>> name = 'svm_kernel_C_gamma_kbest' # optimize a SVM by trying
            >>> # different kernels (linear/RBF), and optimize the C and gamma parameters
            >>> # combine with a k-Best features selection.
        """
        # ----------------------------------------------------------------
        # DEFINED COMBINORS
        # ----------------------------------------------------------------
        pca = PCA()
        selection = SelectKBest()
        scaler = StandardScaler()
        fdr = SelectFdr()
        fpr = SelectFpr()

        # ----------------------------------------------------------------
        # RANGE DEFINITION
        # ---------------------------------------------------------
        pca_range = np.arange(1, n_pca + 1)
        kbest_range = np.arange(1, n_best + 1)
        C_range = np.logspace(-5, 15, svm_C,
                              base=2.)  #np.logspace(-2, 2, svm_C)
        gamma_range = np.logspace(-15, 3, svm_gamma,
                                  base=2.)  #np.logspace(-9, 2, svm_gamma)

        # Check range :
        if not kbest_range.size: kbest_range = [1]
        if not pca_range.size: pca_range = [1]
        if not C_range.size: C_range = [1.]
        if not gamma_range.size: gamma_range = ['auto']

        # ----------------------------------------------------------------
        # DEFINED PIPELINE ELEMENTS
        # ----------------------------------------------------------------
        pipeline = []
        grid = {}
        combine = []

        # ----------------------------------------------------------------
        # BUILD CLASSIFIER
        # ----------------------------------------------------------------
        # -> SCALE :
        if name.lower().find('scale') != -1:
            pipeline.append(("scaler", scaler))

        # -> LDA :
        if name.lower().find('lda') != -1:

            # Default :
            if name.lower().find('optimized') == -1:
                clf = LinearDiscriminantAnalysis(
                    priors=np.array([1 / self._nclass] * self._nclass))

            # Optimized :
            elif name.lower().find('optimized') != -1:
                clf = LinearDiscriminantAnalysis(priors=np.array(
                    [1 / self._nclass] * self._nclass),
                                                 solver='lsqr')
                grid['clf__shrinkage'] = np.linspace(0., 1., lda_shrink)

        # -> SVM :
        elif name.lower().find('svm') != -1:

            # Linear/RBF standard kernel :
            if name.lower().find('linear') != -1:
                kwargs = {'kernel': 'linear'}
            elif name.lower().find('rbf') != -1:
                kwargs = {'kernel': 'rbf'}
            else:
                kwargs = {}

            # Optimized :
            if name.lower().find('optimized') != -1:

                # Kernel optimization :
                if name.lower().find('kernel') != -1:
                    grid['clf__kernel'] = ('linear', 'rbf')

                # C optimization :
                if name.lower().find('_c_') != -1:
                    grid['clf__C'] = C_range

                # Gamma optimization :
                if name.lower().find('gamma') != -1:
                    grid['clf__gamma'] = gamma_range

            clf = SVC(**kwargs)

        # ----------------------------------------------------------------
        # BUILD COMBINE
        # ----------------------------------------------------------------
        # -> FDR :
        if name.lower().find('fdr') != -1:
            combine.append(("fdr", fdr))
            grid['features__fdr__alpha'] = fdr_alpha

        # -> FPR :
        if name.lower().find('fpr') != -1:
            combine.append(("fpr", fpr))
            grid['features__fpr__alpha'] = fpr_alpha

        # -> PCA :
        if name.lower().find('pca') != -1:
            combine.append(("pca", pca))
            grid['features__pca__n_components'] = pca_range

        # -> kBest :
        if name.lower().find('kbest') != -1:
            combine.append(("kBest", selection))
            grid['features__kBest__k'] = kbest_range

        # -> RFECV :
        if name.lower().find('rfecv') != -1:
            rfecv = RFECV(clf)
            combine.append(("RFECV", rfecv))

        # if combine is empty, select all features :
        if not len(combine):
            combine.append(("kBest", SelectKBest(k='all')))

        self.combine = FeatureUnion(combine)

        # ----------------------------------------------------------------
        # SAVE PIPELINE
        # ----------------------------------------------------------------
        # Build ordered pipeline :
        if len(combine):
            pipeline.append(("features", self.combine))
        pipeline.append(("clf", clf))

        # Save pipeline :
        self.pipeline = Pipeline(pipeline)
        self.grid = grid
        self._pipename = name
Example #35
num_pipeline = Pipeline([
    ('selector', ds.DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', ca.CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# selection of the categorical attributes
cat_pipeline = Pipeline([
    ('selector', ds.DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizer(sparse_output=True)),
])

# concatenation
full_pipeline = FeatureUnion(
    transformer_list=[("num_pipeline",
                       num_pipeline), ("cat_pipeline", cat_pipeline)])

housing_prepared = full_pipeline.fit_transform(housing)

# test set
test_housing = strat_test_set.drop("median_house_value", axis=1)
test_housing_labels = strat_test_set["median_house_value"].copy()

test_housing_prepared = full_pipeline.transform(test_housing)  # transform only; the pipeline was already fitted on the training data

# Linear Reg
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
predict = lin_reg.predict(test_housing_prepared)
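# Hedged follow-up sketch (not in the original snippet): score the linear model's
# predictions on the held-out set with RMSE.
import numpy as np
from sklearn.metrics import mean_squared_error

lin_rmse = np.sqrt(mean_squared_error(test_housing_labels, predict))
print(lin_rmse)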
Example #36
num_attribs = list(sample_data_num)
# pull out the text (class-label) attribute
cat_attribs = ["class(OK/NG)"]

# Data-transformation pipeline: standardize the 13 numeric columns and leave the last (class) column untouched
num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_attribs)),
    ("std_scaler", StandardScaler()),
])

cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_attribs)),
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

# Feed the data through the pipeline to obtain the prepared data
sample_data_prepared = full_pipeline.fit_transform(sample_data)

# Take the last column (the class labels) separately and flatten it to a 1-D array
sample_data_label = sample_data_prepared[:, -1:]
sample_data_label = sample_data_label.flatten()
# Convert the label column to booleans: True where the label is "OK", False for "NG", to simplify later model metrics
label_train = (sample_data_label == "OK")

# Take the 13 feature columns separately, ready for training
sample_data_13 = sample_data_prepared[:, :13]

# load the data
clf = pipeline.Pipeline([
    (
        'union',
        FeatureUnion(
            transformer_list=[
                ('cst', cust_regression_vals()),
                ('txt1',
                 pipeline.Pipeline([('s1', cust_txt_col(key='search_term')),
                                    ('tfidf1', tfidf), ('tsvd1', tsvd)])),
                ('txt2',
                 pipeline.Pipeline([('s2', cust_txt_col(key='product_title')),
                                    ('tfidf2', tfidf), ('tsvd2', tsvd)])),
                ('txt3',
                 pipeline.Pipeline([('s3',
                                     cust_txt_col(key='product_description')),
                                    ('tfidf3', tfidf), ('tsvd3', tsvd)])),
                ('txt4',
                 pipeline.Pipeline([('s4', cust_txt_col(key='brand')),
                                    ('tfidf4', tfidf), ('tsvd4', tsvd)]))
            ],
            transformer_weights={
                'cst': 1.0,
                'txt1': 0.5,
                'txt2': 0.25,
                'txt3': 0.0,
                'txt4': 0.5
            },
            #n_jobs = -1
        )),
    ('rfr', rfr)
])
param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
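# Hedged sketch of how this pipeline and param_grid are typically wired together.
# X_train / y_train / X_test are assumptions (a DataFrame with the search_term,
# product_title, product_description and brand columns plus the relevance target);
# older scikit-learn releases exposed GridSearchCV under sklearn.grid_search instead.
from sklearn.model_selection import GridSearchCV

model = GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=2, verbose=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)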
def test_set_feature_union_step_none():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=None)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=None)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))
def test_set_feature_union_steps():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    mult5 = Mult(5)
    mult5.get_feature_names = lambda: ['x5']

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]])))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    # Directly setting attr
    ft.transformer_list = [('m5', mult5)]
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['m5__x5'], ft.get_feature_names())

    # Using set_params
    ft.set_params(transformer_list=[('mock', mult3)])
    assert_array_equal([[3]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x3'], ft.get_feature_names())

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x5'], ft.get_feature_names())
def test_feature_union_parallel():
    # test that n_jobs work for FeatureUnion
    X = JUNK_FOOD_DOCS

    fs = FeatureUnion([
        ("words", CountVectorizer(analyzer='word')),
        ("chars", CountVectorizer(analyzer='char')),
    ])

    fs_parallel = FeatureUnion([
        ("words", CountVectorizer(analyzer='word')),
        ("chars", CountVectorizer(analyzer='char')),
    ],
                               n_jobs=2)

    fs_parallel2 = FeatureUnion([
        ("words", CountVectorizer(analyzer='word')),
        ("chars", CountVectorizer(analyzer='char')),
    ],
                                n_jobs=2)

    fs.fit(X)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape[0], len(X))

    fs_parallel.fit(X)
    X_transformed_parallel = fs_parallel.transform(X)
    assert_equal(X_transformed.shape, X_transformed_parallel.shape)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel.toarray())

    # fit_transform should behave the same
    X_transformed_parallel2 = fs_parallel2.fit_transform(X)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel2.toarray())

    # transformers should stay fit after fit_transform
    X_transformed_parallel2 = fs_parallel2.transform(X)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel2.toarray())
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Test clone
    fs2 = assert_no_warnings(clone, fs)
    assert_false(fs.transformer_list[0][1] is fs2.transformer_list[0][1])

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))

    # test error if some elements do not support transform
    assert_raises_regex(
        TypeError, 'All estimators should implement fit and '
        'transform.*\\bNoTrans\\b', FeatureUnion,
        [("transform", Transf()), ("no_transform", NoTrans())])

    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
    def enhance_transactions(self):  # load training data
        self.training_data = ml.load_training_data(
            self.training_data,
            known_account=self.account,
            existing_entries=self.existing_entries)

        # train the machine learning model
        self._trained = False
        if not self.training_data:
            logger.warning("Cannot train the machine learning model "
                           "because the training data is empty.")
        elif len(self.training_data) < 2:
            logger.warning(
                "Cannot train the machine learning model "
                "because the training data consists of less than two elements."
            )
        else:
            self.pipeline = Pipeline([
                (
                    'union',
                    FeatureUnion(
                        transformer_list=[
                            ('narration',
                             Pipeline([
                                 ('getNarration', ml.GetNarration()),
                                 ('vect', CountVectorizer(ngram_range=(1, 3))),
                             ])),
                            (
                                'payee',
                                Pipeline([  # any existing payee, if one exists
                                    ('getPayee', ml.GetPayee()),
                                    ('vect', CountVectorizer(ngram_range=(1,
                                                                          3))),
                                ])),
                            (
                                'dayOfMonth',
                                Pipeline([
                                    ('getDayOfMonth', ml.GetDayOfMonth()),
                                    ('caster', ml.ArrayCaster()
                                     ),  # need for issue with data shape
                                ])),
                        ],
                        transformer_weights={
                            'narration': 0.8,
                            'payee': 0.5,
                            'dayOfMonth': 0.1
                        })),
                ('svc', SVC(kernel='linear')),
            ])
            logger.debug("About to train the machine learning model...")
            self.pipeline.fit(self.training_data,
                              ml.GetPayee().transform(self.training_data))
            logger.info("Finished training the machine learning model.")
            self._trained = True

        if not self._trained:
            logger.warning(
                "Cannot generate predictions or suggestions "
                "because there is no trained machine learning model.")
            return self.imported_transactions

        # predict payees
        self.transactions = self.imported_transactions
        if self.predict_payees:
            logger.debug("About to generate predictions for payees...")
            predicted_payees: List[str]
            predicted_payees = self.pipeline.predict(self.transactions)
            self.transactions = [
                ml.add_payee_to_transaction(
                    *t_p, overwrite=self.overwrite_existing_payees)
                for t_p in zip(self.transactions, predicted_payees)
            ]
            logger.debug(
                "Finished adding predicted payees to the transactions to be imported."
            )

        # suggest likely payees
        if self.suggest_payees:
            # get values from the SVC decision function
            logger.debug(
                "About to generate suggestions about likely payees...")
            decision_values = self.pipeline.decision_function(
                self.imported_transactions)

            # add a human-readable class label (i.e., payee's name) to each value, and sort by value:
            suggested_payees = [[
                payee for _, payee in sorted(list(
                    zip(distance_values, self.pipeline.classes_)),
                                             key=lambda x: x[0],
                                             reverse=True)
            ] for distance_values in decision_values]

            # add the suggested payees to each transaction:
            self.transactions = [
                ml.add_suggested_payees_to_transaction(*t_p)
                for t_p in zip(self.transactions, suggested_payees)
            ]
            logger.debug(
                "Finished adding suggested payees to the transactions to be imported."
            )

        return self.transactions
Example #44
class Kernel_Estimators(object):
    def __init__(self,
                 dataset,
                 obs_dim,
                 act_dim,
                 gamma,
                 horizon,
                 model_reg,
                 reward_reg,
                 value_reg,
                 default_length_scale=0.1,
                 random_feature_per_obs_dim=250,
                 norm=None,
                 scale_length_adjustment='median',
                 dtype=np.float64,
                 policy_net=None):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.gamma = gamma
        self.horizon = horizon
        self.norm = norm
        self.policy_net = policy_net
        self.model_reg = model_reg
        self.reward_reg = reward_reg
        self.value_reg = value_reg
        self.dtype = dtype

        self.n_samples = dataset['obs'].shape[0]
        self.n_episode = dataset['init_obs'].shape[0]
        self.data_acts = dataset['acts']
        if self.policy_net is not None:
            self.pi_current = self.policy_net.get_probabilities(dataset['obs'])
            self.pi_next = self.policy_net.get_probabilities(
                dataset['next_obs'])
            self.pi_init = self.policy_net.get_probabilities(
                dataset['init_obs'])
            self.pi_term = self.policy_net.get_probabilities(
                dataset['term_obs'])
        else:
            self.pi_current = dataset['target_prob_obs']
            self.pi_next = dataset['target_prob_next_obs']
            self.pi_init = dataset['target_prob_init_obs']
            self.pi_term = dataset['target_prob_term_obs']
        if self.norm is None:
            self.obs = dataset['obs']
            self.next_obs = dataset['next_obs']
            self.init_obs = dataset['init_obs']
            self.term_obs = dataset['term_obs']
        elif self.norm == 'std':
            self.obs_mean = np.mean(dataset['obs'], axis=0, keepdims=True)
            self.obs_std = np.std(dataset['obs'], axis=0, keepdims=True)
            self.obs = (dataset['obs'] - self.obs_mean) / self.obs_std
            self.next_obs = (dataset['next_obs'] -
                             self.obs_mean) / self.obs_std
            self.init_obs = (dataset['init_obs'] -
                             self.obs_mean) / self.obs_std
            self.term_obs = (dataset['term_obs'] -
                             self.obs_mean) / self.obs_std
        else:
            raise NotImplementedError
        if scale_length_adjustment == 'median':
            sample_num = 5000
            idx1 = np.random.choice(self.n_samples, sample_num)
            idx2 = np.random.choice(self.n_samples, sample_num)
            med_dist = np.median(np.square(self.obs[None, idx1, :] -
                                           self.obs[idx2, None, :]),
                                 axis=(0, 1))
            med_dist[med_dist < 0.01] = 0.01  # enforce an upper bound on the length scale of each observation dimension
            scale_length_vector = 1.0 / med_dist
        else:
            scale_length_vector = np.ones(self.obs_dim)
        # import pdb; pdb.set_trace()
        #* set the fourier feature
        transformer_list = []
        self.z_dim = random_feature_per_obs_dim * self.obs_dim
        models = [
            RBFSampler(n_components=random_feature_per_obs_dim,
                       gamma=default_length_scale * dist)
            for dist in scale_length_vector
        ]
        for model in models:
            model.fit([self.obs[0]])
            transformer_list.append((str(model), model))
        self.rff = FeatureUnion(transformer_list)

        # #* separate action set indexing
        # act_idx = []
        # for i in range(self.act_dim):
        #     act_idx.append(np.where(dataset['acts']==i)[0])
        # #* apply transformation
        # Z = self.rff.transform(self.obs).astype(self.dtype); Z_prime = self.rff.transform(self.next_obs).astype(self.dtype)
        # Z_init = self.rff.transform(self.init_obs).astype(self.dtype); Z_term = self.rff.transform(self.term_obs).astype(self.dtype)
        # assert self.z_dim == Z.shape[1]
        # self.Phi = np.zeros((Z.shape[0], Z.shape[1]* self.act_dim), dtype=self.dtype)
        # self.Phi_pi = np.zeros((Z.shape[0], Z.shape[1]* self.act_dim),dtype=self.dtype)
        # self.Phi_prime_pi = np.zeros((Z_prime.shape[0], Z_prime.shape[1]* self.act_dim),dtype=self.dtype)
        # self.Phi_init_pi = np.zeros((Z_init.shape[0], Z_init.shape[1]*self.act_dim), dtype=self.dtype)
        # self.Phi_term_pi = np.zeros((Z_term.shape[0], Z_term.shape[1]*self.act_dim),dtype=self.dtype)
        # for i in range(self.act_dim):
        #     self.Phi[act_idx[i], i*self.z_dim:(i+1)*self.z_dim] = Z[act_idx[i]]
        #     self.Phi_pi[:, i*self.z_dim:(i+1)*self.z_dim] = self.pi_current[:,i][:,None] * Z
        #     self.Phi_prime_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_next[:,i][:,None] * Z_prime
        #     self.Phi_init_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_init[:,i][:,None]*Z_init
        #     self.Phi_term_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_term[:,i][:,None]*Z_term

        #* Some commonly used variables
        # self.I_sa = np.eye(self.act_dim*self.z_dim)
        self.rews = dataset['rews']
        self.init_idx = np.arange(0, self.n_samples, self.horizon)
        self.end_idx = np.arange(self.horizon - 1, self.n_samples,
                                 self.horizon)

        self.rho = dataset[
            'ratio']  #* make sure that the importance weights are already calculated

    def estimate_model_based(self):
        #* separate action set indexing
        act_idx = []
        for i in range(self.act_dim):
            act_idx.append(np.where(self.data_acts == i)[0])
        #* apply transformation
        Z = self.rff.transform(self.obs).astype(self.dtype)
        Z_prime = self.rff.transform(self.next_obs).astype(self.dtype)
        Z_init = self.rff.transform(self.init_obs).astype(self.dtype)
        Z_term = self.rff.transform(self.term_obs).astype(self.dtype)
        assert self.z_dim == Z.shape[1]
        Phi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim),
                       dtype=self.dtype)
        Phi_pi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim),
                          dtype=self.dtype)
        Phi_prime_pi = np.zeros(
            (Z_prime.shape[0], Z_prime.shape[1] * self.act_dim),
            dtype=self.dtype)
        Phi_init_pi = np.zeros(
            (Z_init.shape[0], Z_init.shape[1] * self.act_dim),
            dtype=self.dtype)
        Phi_term_pi = np.zeros(
            (Z_term.shape[0], Z_term.shape[1] * self.act_dim),
            dtype=self.dtype)
        for i in range(self.act_dim):
            Phi[act_idx[i],
                i * self.z_dim:(i + 1) * self.z_dim] = Z[act_idx[i]]
            Phi_pi[:, i * self.z_dim:(i + 1) *
                   self.z_dim] = self.pi_current[:, i][:, None] * Z
            Phi_prime_pi[:, i * self.z_dim:(i + 1) *
                         self.z_dim] = self.pi_next[:, i][:, None] * Z_prime
            Phi_init_pi[:, i * self.z_dim:(i + 1) *
                        self.z_dim] = self.pi_init[:, i][:, None] * Z_init
            Phi_term_pi[:, i * self.z_dim:(i + 1) *
                        self.z_dim] = self.pi_term[:, i][:, None] * Z_term

        I_sa = np.eye(self.act_dim * self.z_dim)
        #* covariance weighting: H = I gives uncentered covariances; the commented line below would center the features instead
        H = np.eye(self.n_samples)
        # H = np.eye(self.n_samples) - 1.0/self.n_samples*np.ones((self.n_samples, self.n_samples))

        #* estimate reward function
        r_sa = np.linalg.inv(Phi.T @ Phi +
                             self.reward_reg * I_sa) @ Phi.T @ self.rews
        Sigma_yx = 1 / self.n_samples * Phi_prime_pi.T @ H @ Phi
        Sigma_xx = 1 / self.n_samples * Phi.T @ H @ Phi
        P = np.matmul(Sigma_yx,
                      np.linalg.inv(Sigma_xx + self.model_reg * I_sa))
        #* Now that we have the transition operator, we have that:
        #* E_{s'|s}[\phi(s')|s] = P \phi(s)
        #* This gives a clean mechanism to roll the model forward
        #* in particular, the next feature matrix will be
        #* Phi' = Phi P.T, where Phi = [phi_1, ..., phi_n].T \in R^{n\times p}
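        #* Putting this together, the quantity computed below is
        #*   V = Phi_pi (I - (gamma P^T)^H) (I - gamma P^T)^{-1} r_sa
        #* i.e. a geometric series of discounted model roll-outs truncated at horizon H,
        #* whose entries at the initial-state rows are averaged into the value estimate.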
        finite_horizon_correction = I_sa - np.linalg.matrix_power(
            self.gamma * P.T, self.horizon)
        transposed_transition_inverse = np.linalg.inv(I_sa - self.gamma * P.T)
        accumulated_feature = Phi_pi @ finite_horizon_correction @ transposed_transition_inverse

        V = accumulated_feature @ r_sa
        value_est = np.mean(V[self.init_idx])
        return value_est

    def estimate_LSTD(self):
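        #* LSTD with per-step importance ratios rho: the value-function weights solve
        #*   w = (Z^T (Z - gamma * rho * Z') + reg * I)^{-1} Z^T (rho * r)
        #* and each trajectory contributes V(s_0) - gamma^H * V(s_T) to the estimate.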
        reg = self.value_reg
        Z = self.rff.transform(self.obs)
        Z_prime = self.rff.transform(self.next_obs)
        R = self.rho * self.rews
        regularized_inverse = np.linalg.inv(
            np.matmul(Z.T, Z - self.gamma * self.rho * Z_prime) +
            reg * np.eye(self.z_dim))
        featurized_reward = np.matmul(Z.T, R)
        reward_coef = np.matmul(regularized_inverse, featurized_reward)
        V_init = Z[self.init_idx] @ reward_coef
        V_term = Z[self.end_idx] @ reward_coef
        V_traj = V_init - V_term * self.gamma**self.horizon
        value_est = np.mean(V_traj)
        return value_est

    def estimate_LSTDQ(self):
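        #* LSTDQ: the same least-squares fixed point as estimate_LSTD, but on
        #* state-action features Phi, with next-step features averaged under the
        #* target policy (Phi_prime_pi) instead of reweighted by importance ratios.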
        #* separate action set indexing
        act_idx = []
        for i in range(self.act_dim):
            act_idx.append(np.where(self.data_acts == i)[0])
        #* apply transformation
        Z = self.rff.transform(self.obs).astype(self.dtype)
        Z_prime = self.rff.transform(self.next_obs).astype(self.dtype)
        Z_init = self.rff.transform(self.init_obs).astype(self.dtype)
        Z_term = self.rff.transform(self.term_obs).astype(self.dtype)
        assert self.z_dim == Z.shape[1]
        Phi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim),
                       dtype=self.dtype)
        Phi_pi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim),
                          dtype=self.dtype)
        Phi_prime_pi = np.zeros(
            (Z_prime.shape[0], Z_prime.shape[1] * self.act_dim),
            dtype=self.dtype)
        Phi_init_pi = np.zeros(
            (Z_init.shape[0], Z_init.shape[1] * self.act_dim),
            dtype=self.dtype)
        Phi_term_pi = np.zeros(
            (Z_term.shape[0], Z_term.shape[1] * self.act_dim),
            dtype=self.dtype)
        for i in range(self.act_dim):
            Phi[act_idx[i],
                i * self.z_dim:(i + 1) * self.z_dim] = Z[act_idx[i]]
            Phi_pi[:, i * self.z_dim:(i + 1) *
                   self.z_dim] = self.pi_current[:, i][:, None] * Z
            Phi_prime_pi[:, i * self.z_dim:(i + 1) *
                         self.z_dim] = self.pi_next[:, i][:, None] * Z_prime
            Phi_init_pi[:, i * self.z_dim:(i + 1) *
                        self.z_dim] = self.pi_init[:, i][:, None] * Z_init
            Phi_term_pi[:, i * self.z_dim:(i + 1) *
                        self.z_dim] = self.pi_term[:, i][:, None] * Z_term

        I_sa = np.eye(self.act_dim * self.z_dim)

        regularized_inverse = np.linalg.inv(
            np.matmul(Phi.T, Phi - self.gamma * Phi_prime_pi) +
            self.value_reg * I_sa)
        featurized_reward = np.matmul(Phi.T, self.rews)
        reward_coef = np.matmul(regularized_inverse, featurized_reward)
        V_init = Phi_init_pi @ reward_coef
        V_term = Phi_term_pi @ reward_coef
        V_traj = V_init - V_term * self.gamma**self.horizon
        value_est = np.mean(V_traj)
        return value_est

    def estimate_LSTD_dual(self):
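        #* Dual (kernelized) LSTD: instead of explicit random features, work with the
        #* n x n Gram matrix K and solve (K - gamma * rho * K' + reg * I) beta = rho * r;
        #* state values are then read off as V(s) = k(., s)^T beta.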
        import kernel_util as ku
        sample_num = 5000
        idx1 = np.random.choice(self.n_samples, sample_num)
        idx2 = np.random.choice(self.n_samples, sample_num)
        med_dist = np.median(np.square(self.obs[None, idx1, :] -
                                       self.obs[idx2, None, :]),
                             axis=(0, 1))
        med_dist[med_dist < 0.01] = 0.01  # enforce an upper bound on the length scale of each observation dimension
        w = 1.0 / med_dist

        default_gamma = 0.1
        reg = 1e-2

        ratio_vector = self.rho.copy().astype(np.float32)

        K = ku.weighted_rbf_kernel(self.obs, w=w,
                                   gamma=default_gamma).astype(np.float32)
        K_prime = ku.weighted_rbf_kernel(self.next_obs,
                                         self.obs,
                                         w=w,
                                         gamma=default_gamma).astype(
                                             np.float32)
        K_prime = self.gamma * (K_prime *
                                ratio_vector.repeat(self.n_samples, axis=1))
        R = (ratio_vector * self.rews).astype(np.float32)
        beta = np.linalg.inv(K - K_prime + reg * np.eye(self.n_samples)).dot(R)

        K0 = ku.weighted_rbf_kernel(self.obs,
                                    self.init_obs,
                                    w=w,
                                    gamma=default_gamma)
        K_terminal = ku.weighted_rbf_kernel(self.obs,
                                            self.term_obs,
                                            w=w,
                                            gamma=default_gamma)

        # V_init = np.matmul(beta.T, K0)
        # V_term = np.matmul(beta.T, K_terminal)
        V_init = K0.T @ beta
        V_term = K_terminal.T @ beta
        V_traj = V_init - V_term * self.gamma**self.horizon
        value_est = np.mean(V_traj)
        # import pdb; pdb.set_trace()  # leftover debugging breakpoint
        return value_est
    lname_pipe = Pipeline([
        ('data', DataFrameColumnExtracter('Surname')),
        ('vectorizer', HashingVectorizer(non_negative=True))
    ])

    bio_pipe = Pipeline([
        ('data', DataFrameColumnExtracter('Bio')),
        ('preprocessor', StripHTMLTransformer()),
        ('vectorizer', CountVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1, 3))),
        ('tfidf', TfidfTransformer())
    ])

    features = FeatureUnion(
        n_jobs=1,
        transformer_list=[
            ('email_pipe', email_pipe),
            ('fname_pipe', fname_pipe),
            ('lname_pipe', lname_pipe),
            ('bio_pipe',   bio_pipe)
        ],
        transformer_weights=None)

    classifier = Pipeline([
        ('features', features),
        ('model', MultinomialNB(alpha=0.0001, fit_prior=True))
    ])

    classifier.fit(trainData, labels)

    filename = 'member_classifier.pickle'
    print "writing model to file %s" % (filename)
    pickle.dump(classifier, open(filename, 'wb'))
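
    # A hedged usage sketch: reload the persisted pipeline later and score fresh rows.
    # `new_members` is a hypothetical DataFrame with the same columns the extractors
    # above expect (e.g. 'Surname', 'Bio').
    # with open(filename, 'rb') as f:
    #     loaded_classifier = pickle.load(f)
    # predictions = loaded_classifier.predict(new_members)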
data_train, data_test, labels_train, labels_test = train_test_split(
    data, labels, test_size=0.3, random_state=100)
vectorizer = CountVectorizer()

# TruncatedSVD to select the principal components:
pca = TruncatedSVD(n_components=2)

# NMF as the dimensionality-reduction step for MultinomialNB (which requires non-negative features)
pca1 = NMF(n_components=2)

# K-best features to be selected
selection = SelectKBest(chi2, k=1)

# To combine the features for LinearSVC, Decision Trees and Logistic Regression
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
# To combine the features for MultinomialNB
combined_features1 = FeatureUnion([("pca", pca1), ("univ_select", selection)])

clf = DecisionTreeClassifier(criterion="gini", random_state=100)

# Training: build one pipeline per classifier, pairing the appropriate combined
# features with its classification algorithm
pipeline_logreg = Pipeline([("count_vectorizer", vectorizer),
                            ("features", combined_features),
                            ("Logreg", LogisticRegression())])
pipeline_svc = Pipeline([("count_vectorizer", vectorizer),
                         ("features", combined_features),
                         ("svm", LinearSVC())])
pipeline_dt = Pipeline([("count_vectorizer", vectorizer),
                        ("features", combined_features), ("dt", clf)])
def extract_features(train_data, test_data=None, model_persistor=None):
    """
    Does feature enrichment and enhancement (separate from modeling)
    if test_data is passed, it is included in the processing and split out again
    this is to account for encoding where feature categories aren't in the train set
    :param train_data: Training data
    :param test_data:  Test data
    :param model_persistor: An instance of PersistModel.  When passed, supporting objects can be added
    """

    # TODO: expand this note to explain what the feature extraction currently does
    if model_persistor:
        model_persistor.add_note(" Extract Features now includes...")

    if test_data is not None:
        data_to_process = pd.concat([
            train_data[the_settings.all_features],
            test_data[the_settings.all_features]
        ],
                                    ignore_index=True)
    else:
        data_to_process = train_data

    # I want to try some dimensionality reduction with this one.
    # First I'm going to start with all of the previous features and build from there.

    feature_extraction = Pipeline([
        ('initial features',
         FeatureUnion([
             ('numeric_features_standardized',
              Pipeline([('numeric_features_raw',
                         FeatureUnion([
                             ('numeric_features',
                              ColumnExtractor(the_settings.numeric_features)),
                             ('v22_letter_count',
                              LetterCountTransformer(
                                  the_settings.special_string_features)),
                             ('Nan_count', NaNCountTransformer())
                         ])), ('zero_na', NanToZeroTransformer())])),
             ('string_features_standardized',
              Pipeline([('string_features',
                         ColumnExtractor(the_settings.string_features)),
                        ('label', MultiColumnLabelEncoder()),
                        ('one_hot', OneHotEncoder(sparse=False))])),
             ('v22_standardized',
              Pipeline([('extract',
                         LetterExtractionTransformer(
                             the_settings.special_string_features)),
                        ('label', MultiColumnLabelEncoder()),
                        ('one_hot', OneHotEncoder(sparse=False))]))
         ]))
    ])

    fitted_feature_model = feature_extraction.fit(data_to_process)
    if model_persistor:
        model_persistor.add_object_to_save(fitted_feature_model,
                                           FileObjectType.feature_model)

    extracted_features = fitted_feature_model.transform(data_to_process)

    return (extracted_features[:len(train_data)],
            extracted_features[len(train_data):])
Beispiel #49
0
# Scikit-Learn provides a very useful API to create data transformation
# pipelines. Our data contains both numerical values and categorical/text
# values. So we'll need a pipeline for each type of data. Then we'll need
# a way to merge both pipelines together to build the final training set.
housing_num = housing.drop("ocean_proximity", axis=1)


# Calling list() on a dataframe returns the attribute names
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]


num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('label_binarizer', LabelBinarizer()),
    ])

preparation_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

housing_prepared = preparation_pipeline.fit_transform(housing)
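
# A hedged alternative sketch: in scikit-learn 0.20+ the same numeric/categorical split is
# usually written with ColumnTransformer instead of FeatureUnion + selectors. It reuses
# num_attribs / cat_attribs from above; the imputer and encoder choices are illustrative
# assumptions, and CombinedAttributesAdder is omitted for brevity.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

num_pipeline_ct = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
preparation_ct = ColumnTransformer([
    ("num", num_pipeline_ct, num_attribs),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])
# housing_prepared_ct = preparation_ct.fit_transform(housing)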
    X_test, Y_test = utils.create_x_y(test, dt=128, shift=64, verbose=0)
    X_train, Y_train = utils.create_x_y(train, dt=128, shift=64)

    print('X_test.shape:', X_test.shape)
    print('Y_test.shape:', Y_test.shape)
    print('X_train.shape:', X_train.shape)
    print('Y_train.shape:', Y_train.shape)
    print('\nTesting features:\n')

    std = STD()
    entrop = Entropy()
    quantiles = Quantiles(quantiles=[0.5, 0.25, 0.75])

    test_std = std.fit_transform(X_test)
    print('test_std.shape:', test_std.shape)

    test_entrop = entrop.fit_transform(X_test)
    print('test_entrop.shape:', test_entrop.shape)

    test_quantiles = quantiles.fit_transform(X_test)
    print('test_quantiles.shape:', test_quantiles.shape)

    union = FeatureUnion([
        ('STD', STD()),
        ('Entropy', Entropy()),
        ('Quantiles', Quantiles(quantiles=[0.5])),
    ])

    result = union.fit_transform(X_test)
    print('All in one:', result.shape)
Beispiel #51
0
# normalize 'White', 'Rural' and 'Urban' to lowercase so the category labels are consistent
df = df.replace('White', 'white')
df = df.replace('Rural', 'rural')
df = df.replace('Urban', 'urban')

# TODO: Process missing data in pipeline

categorical_pipeline = Pipeline(
    steps=[('cat_selector', FeatureSelector(['Sex', 'Race', 'RuralUrban'])
            ), ('one_hot_enc', OneHotEncoder(sparse=False))])

numerical_pipeline = Pipeline(steps=[('num_selector',
                                      FeatureSelector(['Age']))])

feature_pipeline = FeatureUnion(transformer_list=[(
    'numerical_pipeline',
    numerical_pipeline), ('categorical_pipeline', categorical_pipeline)])

final_pipeline = Pipeline(
    steps=[('feature_pipeline',
            feature_pipeline), ('model', LogisticRegression(C=0.001))])

le = LabelEncoder()
y = df[['ever_cigarettes']].to_numpy()
y = le.fit_transform(y.ravel())

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

final_pipeline.fit(X_train, y_train)

# final_pipeline.score(X_test, y_test)
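
# A hedged evaluation sketch for the fitted pipeline; the metric choice is an
# illustrative assumption.
from sklearn.metrics import classification_report

y_pred = final_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))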
housing_num = housing.drop("ocean_proximity", axis=1)

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', impute.SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
cat_pipelines = Pipeline([
    ('label_binarizer', DataFrameMapper([(cat_attribs, LabelBinarizer())])),
])
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeliine", num_pipeline),
    ("cat_pipeline", cat_pipelines),
])

housing_prepared = full_pipeline.fit_transform(housing)
#print(housing_prepared.shape)

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_label)

some_data = housing.iloc[:5]
some_label = housing_label.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
#print(lin_reg.predict(some_data_prepared))
#print(list(some_label))

housing_prediction = lin_reg.predict(housing_prepared)
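
# A hedged follow-up sketch: measure the training-set RMSE of the linear model,
# assuming housing_label holds the true target values as above.
import numpy as np
from sklearn.metrics import mean_squared_error

lin_mse = mean_squared_error(housing_label, housing_prediction)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)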
def train(poems, nonpoems, quick=False):
    """
    Train the model based on given training data
    :return:
    """
    #nonpoems = nonpoems[::1]

    print(len(poems))
    print(len(nonpoems))

    all_train_data = poems + nonpoems
    all_train_target = [1] * len(poems) + [0] * len(nonpoems)

    all_train_data = [
        textdata.replace('w', 'v').replace('W', 'V')
        for textdata in all_train_data
    ]

    tfidf = Pipeline([('vect', CountVectorizer(max_df=1.0,
                                               max_features=25400)),
                      ('tfidf', TfidfTransformer())])

    text_feats = Pipeline([
        ('stats', TextStats()),  # returns a list of dicts
        ('vect', DictVectorizer()),  # list of dicts -> feature matrix
        ('norm', Normalizer(norm='l2')),
    ])

    combined_feats = FeatureUnion([
        ('text_feats', text_feats),
        ('word_freq', tfidf),
    ])

    sgd = SGDClassifier(loss='hinge',
                        penalty='l2',
                        alpha=0.0001,
                        n_iter=6,
                        random_state=42)

    combined_clf = Pipeline([
        ('features', combined_feats),
        ('clf', sgd),
    ])

    if quick:
        gs_clf = GridSearchCV(combined_clf, {})
    else:
        parameters = {
            # 'features__word_freq__vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
            'features__word_freq__vect__max_df': [1.0, 0.5],
            'features__word_freq__vect__max_features':
            [None, 20000, 25000, 25200, 25400, 25600, 26000],
            'features__text_feats__norm__norm': ('l1', 'l2', 'max'),
            'clf__alpha': (1e-3, 1e-4, 1e-5, 1e-6),
            'clf__penalty': ('l2', 'elasticnet'),
            'clf__loss': ('hinge', 'log'),
            'clf__n_iter': (4, 5, 6, 7, 8),
        }

        gs_clf = GridSearchCV(combined_clf, parameters, n_jobs=-1)

    gs_clf.fit(all_train_data, all_train_target)

    predicted = gs_clf.predict(all_train_data)

    print(np.average(predicted))

    print('Final params: %s' % gs_clf.best_params_)
    print('Best score: %s' % gs_clf.best_score_)

    stop_words = gs_clf.best_estimator_.get_params()['features'].get_params(
    ).get('word_freq').named_steps['vect'].stop_words_
    print('Number of generated stopwords: %s' % len(stop_words))

    with open('generated_stopwords.txt', 'w', newline='') as fp:
        fp.write('\n'.join(sorted(stop_words)))

    print('Weights %s' %
          gs_clf.best_estimator_.named_steps['clf'].coef_[0][:4])

    return gs_clf
def extract_features(X,
                     sfreq,
                     selected_funcs,
                     funcs_params=None,
                     n_jobs=1,
                     return_as_df=False):
    """Extraction of temporal or spectral features from epoched EEG signals.

    Parameters
    ----------
    X : ndarray, shape (n_epochs, n_channels, n_times)
        Array of epoched EEG data.

    sfreq : float
        Sampling rate of the data.

    selected_funcs : list of str or tuples
        The elements of ``selected_funcs`` are either strings or tuples of
        the form ``(str, callable)``. If an element is of type ``str``, it is
        the alias of a feature function. The aliases are built from the
        feature functions' names by removing ``compute_``. For instance, the
        alias of the feature function :func:`compute_ptp_amp` is ``ptp_amp``.
        (See the documentation of mne-features). If an element is of type
        ``tuple``, the first element of the tuple should be a string
        (name/alias given to a user-defined feature function) and the second
        element should be a callable (a user-defined feature function which
        accepts Numpy arrays with shape ``(n_channels, n_times)``). The
        names/aliases given to user-defined feature functions should not
        intersect the aliases used by mne-features. If the name given to a
        user-defined feature function is already used as an alias in
        mne-features, an error will be raised.

    funcs_params : dict or None (default: None)
        If not None, dict of optional parameters to be passed to the feature
        functions. Each key of the ``funcs_params`` dict should be of the form:
        ``[alias_feature_function]__[optional_param]`` (for example:
        ``higuchi_fd__kmax``).

    n_jobs : int (default: 1)
        Number of CPU cores used when parallelizing the feature extraction.
        If given a value of -1, all cores are used.

    return_as_df : bool (default: False)
        If True, the extracted features will be returned as a Pandas DataFrame.
        The column index is a MultiIndex (see :class:`~pandas.MultiIndex`)
        which contains the alias of each feature function which was used.
        If False, the features are returned as a 2d Numpy array.

    Returns
    -------
    array-like, shape (n_epochs, n_features)
    """
    if sfreq <= 0:
        raise ValueError('Sampling rate `sfreq` must be positive.')
    univariate_funcs = get_univariate_funcs(sfreq)
    bivariate_funcs = get_bivariate_funcs(sfreq)
    feature_funcs = univariate_funcs.copy()
    feature_funcs.update(bivariate_funcs)
    sel_funcs = _check_funcs(selected_funcs, feature_funcs)

    # Feature extraction
    n_epochs = X.shape[0]
    _tr = [(n, FeatureFunctionTransformer(func=func)) for n, func in sel_funcs]
    extractor = FeatureUnion(transformer_list=_tr)
    if funcs_params is not None:
        extractor.set_params(**funcs_params)
    res = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(_apply_extractor)(extractor, X[j, :, :], return_as_df)
        for j in range(n_epochs))
    feature_names = res[0][1]
    res = list(zip(*res))[0]
    Xnew = np.vstack(res)
    if return_as_df:
        return _format_as_dataframe(Xnew, feature_names)
    else:
        return Xnew
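

# A minimal usage sketch, assuming the univariate aliases 'mean', 'ptp_amp' and
# 'higuchi_fd' are available as described above; the data array and parameter
# values are illustrative.
import numpy as np

epochs_data = np.random.RandomState(0).randn(10, 4, 256)  # (n_epochs, n_channels, n_times)
features = extract_features(epochs_data,
                            sfreq=128.,
                            selected_funcs=['mean', 'ptp_amp', 'higuchi_fd'],
                            funcs_params={'higuchi_fd__kmax': 5},
                            return_as_df=False)
print(features.shape)  # (n_epochs, n_features)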
Beispiel #55
0
 vectorizer = FeatureUnion([
     ('name',
      Pipeline([('select', ItemSelector('name', start_time=start_time)),
                ('transform',
                 HashingVectorizer(ngram_range=(1, 2),
                                   n_features=2**27,
                                   norm='l2',
                                   lowercase=False,
                                   stop_words=stopwords)),
                ('drop_cols', DropColumnsByDf(min_df=2))])),
     ('category_name',
      Pipeline([
          ('select', ItemSelector('category_name', start_time=start_time)),
          ('transform',
           HashingVectorizer(ngram_range=(1, 1),
                             token_pattern='.+',
                             tokenizer=split_cat,
                             n_features=2**27,
                             norm='l2',
                             lowercase=False)),
          ('drop_cols', DropColumnsByDf(min_df=2))
      ])),
     ('brand_name',
      Pipeline([
          ('select', ItemSelector('brand_name', start_time=start_time)),
          ('transform',
           CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)),
      ])),
     ('gencat_cond',
      Pipeline([
          ('select', ItemSelector('gencat_cond', start_time=start_time)),
          ('transform',
           CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)),
      ])),
     ('subcat_1_cond',
      Pipeline([
          ('select', ItemSelector('subcat_1_cond', start_time=start_time)),
          ('transform',
           CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)),
      ])),
     ('subcat_2_cond',
      Pipeline([
          ('select', ItemSelector('subcat_2_cond', start_time=start_time)),
          ('transform',
           CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)),
      ])),
     ('has_brand',
      Pipeline([('select', ItemSelector('has_brand',
                                        start_time=start_time)),
                ('ohe', OneHotEncoder())])),
     ('shipping',
      Pipeline([('select', ItemSelector('shipping', start_time=start_time)),
                ('ohe', OneHotEncoder())])),
     ('item_condition_id',
      Pipeline([('select',
                 ItemSelector('item_condition_id', start_time=start_time)),
                ('ohe', OneHotEncoder())])),
     ('item_description',
      Pipeline([
          ('select', ItemSelector('item_description',
                                  start_time=start_time)),
          ('hash',
           HashingVectorizer(ngram_range=(1, 3),
                             n_features=2**27,
                             dtype=np.float32,
                             norm='l2',
                             lowercase=False,
                             stop_words=stopwords)),
          ('drop_cols', DropColumnsByDf(min_df=2)),
      ]))
 ],
                           n_jobs=1)
Beispiel #56
0
#Romney_Test_dataset_NO_Label.csv

obama_output = 'Mayank_Raj_Chinmay_Nautiyal_Obama.txt'
romney_output = 'Mayank_Raj_Chinmay_Nautiyal_Romney.txt'

#test_data = pd.read_csv("Obama_Test_dataset_NO_Label.csv", encoding = "ISO-8859-1")
test_data = pd.read_csv("Romney_Test_dataset_NO_Label.csv", encoding = "ISO-8859-1")


test_data_fresh = test_data[['Tweet_ID', 'Tweet_text']]
test_data_fresh.head(10)

textcountscols = ['count_capital_words', 'count_emojis', 'count_excl_quest_marks',
                  'count_hashtags', 'count_mentions', 'count_urls', 'count_words']
features = FeatureUnion([
    ('textcounts', ColumnExtractor(cols=textcountscols)),
    ('pipe', Pipeline([
        ('cleantext', ColumnExtractor(cols='clean_text')),
        ('vect', CountVectorizer(max_df=0.25, min_df=2, ngram_range=(1, 3))),
    ])),
], n_jobs=-1)
pipeline = Pipeline([
    ('features', features),
    ('clf', LogisticRegression(C=1, penalty='l2')),
])
#best_model = pipeline.fit(df_model.drop('classes', axis=1), df_model.classes)
best_model = pipeline.fit(df2_model.drop('classes', axis=1), df2_model.classes)

  
df_counts_pos = tc.transform(test_data_fresh["Tweet_text"])
df_clean_pos = ct.transform(test_data_fresh["Tweet_text"])
df_model_pos = df_counts_pos
df_model_pos['clean_text'] = df_clean_pos
predictions = best_model.predict(df_model_pos).tolist()
final_result = pd.DataFrame({'id':test_data_fresh['Tweet_ID'],'label':predictions})
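
# A hedged final step: persist the predictions to the Romney output file declared above;
# the exact output format is an illustrative assumption.
final_result.to_csv(romney_output, index=False, header=False)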
            start = time.time()
            encoded_case = bucket_encoder.fit_transform(dt_test_bucket)
            _, knn_idxs = bucketer.kneighbors(encoded_case)
            knn_idxs = knn_idxs[0]

            relevant_cases_bucket = encoded_train.iloc[knn_idxs].index
            dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
                dt_train_prefixes, relevant_cases_bucket)  # one row per event
            train_y = dataset_manager.get_label_numeric(dt_train_bucket)

            if len(set(train_y)) < 2:
                preds_all.append(train_y[0])
            else:
                feature_combiner = FeatureUnion([
                    (method,
                     EncoderFactory.get_encoder(method, **cls_encoder_args))
                    for method in methods
                ])

                if cls_method == "rf":
                    cls = RandomForestClassifier(
                        n_estimators=500,
                        max_features=args['max_features'],
                        random_state=random_state)

                elif cls_method == "xgboost":
                    cls = xgb.XGBClassifier(
                        objective='binary:logistic',
                        n_estimators=500,
                        learning_rate=args['learning_rate'],
                        subsample=args['subsample'],
Beispiel #58
0
])

# To make a pipeline from all of our pipelines, we do the same thing, but now we use a FeatureUnion to join the feature processing pipelines.
#
# The syntax is the same as a regular pipeline: it's just an array of tuples in the (name, object) format.
#
# The feature union itself is not a pipeline, it's just a union, so you need to do *one more step* to make it usable: pass it to a pipeline with the same structure, an array of tuples in the simple (name, object) format. As you can see, we get a pipeline-ception going the more complex you get!
#
# You can then apply all those transformations at once with a single fit, transform, or fit_transform call. Nice, right?

# In[8]:

from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([('text', text), ('length', length), ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas)])

feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

# To add a model to the mix and generate predictions as well, you can add a model at the end of the pipeline. The syntax is, you guessed it, an array of tuples, merging the transformations with a model.
#
# We can see the raw accuracy is at 63%. Not bad for a start.
#

# In[12]:

from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
Beispiel #59
0
class LabelBinarizerPipelineFriendly(LabelBinarizer):
    # Wrapper that gives LabelBinarizer the (X, y) signature sklearn expects from transformers inside a Pipeline.
    def fit(self, X, y=None):
        super(LabelBinarizerPipelineFriendly, self).fit(X)

    def transform(self, X, y=None):
        return super(LabelBinarizerPipelineFriendly, self).transform(X)

    def fit_transform(self, X, y=None):
        return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)


num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline_2 = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizerPipelineFriendly()),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline_2),
    ('cat_pipeline', cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
Beispiel #60
0
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different pca object to control the random_state stream
    fs = FeatureUnion([("pca", pca), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))