def train_model(trainset):
    word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), binary=False,
                                  max_features=2000, min_df=1, decode_error="ignore")
    # print(word_vector)
    print("works fine")
    char_vector = TfidfVectorizer(analyzer="char", ngram_range=(2, 3), binary=False,
                                  max_features=2000, min_df=1, decode_error="ignore")
    vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])

    corpus = []
    classes = []
    for item in trainset:
        corpus.append(item['text'])
        classes.append(item['label'])

    print("Training instances : ", 0.8 * len(classes))
    print("Testing instances : ", 0.2 * len(classes))

    matrix = vectorizer.fit_transform(corpus)
    print("feature count : ", len(vectorizer.get_feature_names()))
    print("training model")

    X = matrix.toarray()
    y = numpy.asarray(classes)
    model = LinearSVC()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        test_size=0.2, random_state=0)
    y_pred = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test)

    # y_prob = OneVsRestClassifier(model).fit(X_train, y_train).decision_function(X_test)
    # print(y_prob)
    # con_matrix = []
    # for row in range(len(y_prob)):
    #     temp = [y_pred[row]]
    #     for prob in y_prob[row]:
    #         temp.append(prob)
    #     con_matrix.append(temp)
    # for row in con_matrix:
    #     output.write(str(row) + "\n")
    # print(y_pred)
    # print(y_test)

    # collect the labels that get confused with 'anonEdited' in either direction
    res1 = [i for i, j in enumerate(y_pred) if j == 'anonEdited']
    res2 = [i for i, j in enumerate(y_test) if j == 'anonEdited']
    reset = []
    for r in res1:
        if y_test[r] != "anonEdited":
            reset.append(y_test[r])
    for r in res2:
        if y_pred[r] != "anonEdited":
            reset.append(y_pred[r])

    output = open(sys.argv[2], "w")
    for suspect in reset:
        output.write(str(suspect) + "\n")

    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    pl.matshow(cm)
    pl.title('Confusion matrix')
    pl.colorbar()
    pl.ylabel('True label')
    pl.xlabel('Predicted label')
    pl.show()
    print(accuracy_score(y_pred, y_test))
def concat_feature_extractors(train_data, labels):
    # This dataset is way too high-dimensional. Better do PCA:
    pca = PCA(n_components=2)

    # Maybe some original features were good, too?
    selection = SelectKBest(k=1)

    # Build estimator from PCA and Univariate selection:
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

    # Use combined features to transform dataset:
    X_features = combined_features.fit(train_data, labels).transform(train_data)

    # Classify:
    svm = SVC(kernel="linear")
    svm.fit(X_features, labels)

    # Do grid search over k, n_components and C:
    pipeline = Pipeline([("features", combined_features), ("svm", svm)])

    param_grid = dict(features__pca__n_components=[1, 2, 3],
                      features__univ_select__k=[1, 2],
                      svm__C=[0.1, 1, 10])

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
    grid_search.fit(train_data, labels)
    print(grid_search.best_estimator_)
def test_feature_union(self):
    """Tests that combining multiple featurizers works as expected"""
    modules = ["bag-of-words", "entities"]
    modules_list, _ = modules_to_dictionary(modules)
    feature_union = FeatureUnion(modules_list)
    feature_union.fit(texts_entities, outcomes)
    feature_union.transform(["unknown"])
def testSVC(lbda=1.0, n_components=20, kbest=4):
    otto = load_otto()
    X = otto.data
    y = otto.target
    # X = otto.data[:10000, :10]
    # y = otto.target[:10000]
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=kbest)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    X_features = combined_features.fit(X, y).transform(X)

    svc = SVC(C=1.0 / lbda, kernel='rbf', cache_size=400, probability=True)
    pipe = Pipeline(steps=[('features', combined_features), ('svc', svc)])

    trainData = X
    trainTarget = y
    pipe.fit(trainData, trainTarget)

    test_otto = load_testotto()
    testData = test_otto.data
    testData = scaler.transform(testData)

    # save the prediction
    prediction = pipe.predict_proba(testData)
    proba = pipe.predict_proba(testData)
    save_submission(lbda, proba, prediction)
def best_estimator(self, X, y):
    try:
        pca = PCA(n_components=2)
        selection = SelectKBest(k=2)
        combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
        X_features = combined_features.fit(X, y).transform(X)
        regr = linear_model.LassoCV()
        pipeline = Pipeline([("features", combined_features), ("regression", regr)])
        if 'batter' in self.player:
            param_grid = dict(features__pca__n_components=[1, 2, 3],
                              features__univ_select__k=[1, 2])
        else:
            param_grid = dict(features__pca__n_components=[1, 2, 3],
                              features__univ_select__k=[1, 2])
        grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=100)
        grid_search.fit(X, y)
        self.modelled = True
        regr = grid_search
        return regr
    except ValueError as e:
        print(e)
        self.modelled = False
        return None
def prediction(train_df, test_df, MODEL):
    print("... start prediction")
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, verbose=1)
    clf.fit(train_X, train_y)
    print(clf.best_score_)

    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_, name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s.csv" % MODEL
        coef_df.to_csv(coeffile)

    print("... start y_pred")
    test_X = fu_obj.transform(test_df)
    y_pred = clf.predict(test_X)
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
def trainItalianSexClassifier(self):
    # get correct labels from dictionary in trainY and testY
    trainX = self.italianTrainData[0]
    trainY = self.getYlabels(self.italianTrainData[1], 'sex')

    combined_features = FeatureUnion([("tfidf", TfidfVectorizer()),
                                      ("ngrams", TfidfVectorizer(ngram_range=(3, 3),
                                                                 analyzer="char")),
                                      ("counts", CountVectorizer()),
                                      ("latin", Latin()),
                                      ],
                                     transformer_weights={
                                         'latin': 1,
                                         'tfidf': 2,
                                         'ngrams': 2,
                                         'counts': 1,
                                     })

    X_features = combined_features.fit(trainX, trainY).transform(trainX)

    classifier = svm.LinearSVC()
    pipeline = Pipeline([("features", combined_features), ("classifier", classifier)])
    pipeline.fit(trainX, trainY)

    return pipeline
def best_estimator(self, X, y):
    try:
        pca = PCA(n_components=2)
        selection = SelectKBest(k=2)
        combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
        X_features = combined_features.fit(X, y).transform(X)
        regr = linear_model.LassoCV()
        pipeline = Pipeline([("features", combined_features), ("regression", regr)])
        if 'batter' in self.player:
            param_grid = dict(features__pca__n_components=[1],
                              features__univ_select__k=[1])
        else:
            param_grid = dict(features__pca__n_components=[1, 2, 3, 4],
                              features__univ_select__k=[1, 2, 3, 4])
        grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=0)
        grid_search.fit(X, y)
        self.modelled = True
        regr = grid_search
        # Ian: should do R2 on predicted points vs. points on a given day
        self.R2 = r2_score(self.target_matrix, regr.predict(self.feature_matrix))
        return regr
    except ValueError as e:
        print(e)
        self.modelled = False
        return None
def rbf_kernels(env, n_samples=100000, gamma=[0.01, 0.1], n_components=100):
    """Represent observation samples using RBF-kernels.

    EXAMPLE
    -------
    >>> env = gym.make('MountainCar-v0')
    >>> n_params, rbf = rbf_kernels(env, n_components=100)
    >>> sample = env.observation_space.sample().reshape((1, env.observation_space.shape[0]))
    >>> rbf(sample).shape
    (1, 100)
    """
    observation_examples = np.array([env.observation_space.sample() for _ in range(n_samples)])

    # Fit feature scaler
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Fit feature extractor
    features = []
    for g in gamma:
        features.append(('gamma={}'.format(g),
                         RBFSampler(n_components=n_components // len(gamma), gamma=g)))
    features = FeatureUnion(features)
    features.fit(scaler.transform(observation_examples))

    def _rbf_kernels(observation):
        return features.transform(scaler.transform(observation))

    return _rbf_kernels
def pca_kpca(train_data, labels):
    # make_union already returns a FeatureUnion, so use it directly; wrapping its
    # result in another FeatureUnion would fail, because FeatureUnion expects a
    # list of (name, transformer) tuples.
    combined = make_union(PCA(), TruncatedSVD(), KernelPCA())
    # combined = FeatureUnion([('linear_pca', PCA()), ('kernel_pca', KernelPCA())])
    combined.fit(train_data, labels)
    # combined.fit_transform(train_data, labels)
    return combined
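# Hedged aside (not part of the original snippet): make_union is a convenience
# wrapper that builds a FeatureUnion and derives the transformer names from the
# lower-cased class names, so the two forms below should be interchangeable.
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import FeatureUnion, make_union

union_a = make_union(PCA(n_components=2), KernelPCA(n_components=2))
union_b = FeatureUnion([("pca", PCA(n_components=2)),
                        ("kernelpca", KernelPCA(n_components=2))])
# Both expose the same transformer_list interface:
print([name for name, _ in union_a.transformer_list])  # ['pca', 'kernelpca']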
def fit(self, X, y=None):
    Trans2 = Q2Transformer()
    Trans3 = Q3Transformer()
    Trans4 = Q4Transformer()
    combined_features = FeatureUnion([("Q2", Trans2), ("Q3", Trans3), ("Q4", Trans4)])
    # Store the fitted union on its own attribute; assigning it to self.fit
    # would shadow this method.
    self.combined_features_ = combined_features.fit(X)
    return self
def testLogistic(lbda=1.0, n_components=20, kbest=4):
    # X = otto.data[:1000, :20]
    # y = otto.target[:1000]
    otto = load_otto()
    X = otto.data[:, :]
    y = otto.target[:]
    # n_components = 20
    # kbest = 4
    # print('y.shape =', y.shape)
    scalar = StandardScaler().fit(X)
    X = scalar.transform(X)

    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=kbest)
    combined_features = FeatureUnion([("pca", pca), ('univ_select', selection)])
    X_features = combined_features.fit(X, y).transform(X)

    logistic = LogisticRegression(C=1.0 / lbda)
    pipe = Pipeline(steps=[('features', combined_features), ('logistic', logistic)])

    trainData = X
    trainTarget = y
    pipe.fit(trainData, trainTarget)
    # print(trainTarget)

    test_otto = load_testotto()
    testData = test_otto.data
    testData = scalar.transform(testData)
    # logging.debug('lambda=%.3f: score is %.3f' % (lbda, pipe.score()))

    # save the prediction
    prediction = pipe.predict_proba(testData)
    proba = pipe.predict_proba(testData)
    save_submission(lbda, proba, prediction)
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
def convert_testdata(test_gray_data, feature_rule=f.feature_transformer_rule):
    data_df = f.make_test_df(test_gray_data)
    fu = FeatureUnion(transformer_list=feature_rule)
    Std = preprocessing.StandardScaler()

    X_test = fu.fit_transform(data_df)
    # X_test = Std.fit_transform(X_test)

    return X_test
def get_pca_transformer(train_x, train_y, n_components=-1):
    if n_components == -1:
        n_components = int(np.ceil(np.sqrt(train_x.shape[1])))

    pca = PCA(n_components=n_components)
    # SelectKBest expects an integer k, so use floor division
    selection = SelectKBest(k=n_components // 2)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    return combined_features.fit(train_x, train_y)
def fit_logreg(self): tokenize_sense = CachedFitTransform(Pipeline([ ('tokenize', Map(compose(tokenize, normalize_special, unescape))), ('normalize', MapTokens(normalize_elongations)), ]), self.memory) features = FeatureUnion([ # ('w2v_doc', ToCorporas(Pipeline([ # ('tokenize', MapCorporas(tokenize_sense)), # ('feature', MergeSliceCorporas(Doc2VecTransform(CachedFitTransform(Doc2Vec( # dm=0, dbow_words=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, # workers=16 # ), self.memory)))), # ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs]))), # ('w2v_word_avg', Pipeline([ # ('tokenize', tokenize_sense), # ('feature', Word2VecAverage(CachedFitTransform(Word2Vec( # sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16 # ), self.memory))), # ]).fit(self.unsup_docs[:10**6])), # ('w2v_word_avg_google', Pipeline([ # ('tokenize', tokenize_sense), # ('feature', Word2VecAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))), # ])), # ('w2v_word_norm_avg', Pipeline([ # ('tokenize', tokenize_sense), # ('feature', Word2VecNormAverage(CachedFitTransform(Word2Vec( # sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16 # ), self.memory))), # ]).fit(self.unsup_docs[:10**6])), ('w2v_word_norm_avg_google', Pipeline([ ('tokenize', tokenize_sense), ('feature', Word2VecNormAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))), ])), # ('w2v_word_max', Pipeline([ # ('tokenize', tokenize_sense), # ('feature', Word2VecMax(CachedFitTransform(Word2Vec( # sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16 # ), self.memory))), # ]).fit(self.unsup_docs[:10**6])), # ('w2v_word_max_google', Pipeline([ # ('tokenize', tokenize_sense), # ('feature', Word2VecMax(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))), # ])), # ('w2v_word_inv', ToCorporas(Pipeline([ # ('tokenize', MapCorporas(tokenize_sense)), # ('feature', MergeSliceCorporas(Word2VecInverse(CachedFitTransform(Word2Vec( # sg=1, size=100, window=10, hs=0, negative=5, sample=0, min_count=1, iter=20, workers=16 # ), self.memory)))), # ]).fit([self.train_docs, self.unsup_docs[:10**5], self.val_docs, self.test_docs]))), ]) classifier = LogisticRegression() with temp_log_level({'gensim.models.word2vec': logging.INFO}): classifier.fit(features.transform(self.train_docs), self.train_labels()) estimator = Pipeline([('features', features), ('classifier', classifier)]) return 'logreg({})'.format(','.join(name for name, _ in features.transformer_list)), estimator
def test_feature_union(): # basic sanity check for feature union iris = load_iris() X = iris.data X -= X.mean(axis=0) y = iris.target pca = RandomizedPCA(n_components=2, random_state=0) select = SelectKBest(k=1) fs = FeatureUnion([("pca", pca), ("select", select)]) fs.fit(X, y) X_transformed = fs.transform(X) assert_equal(X_transformed.shape, (X.shape[0], 3)) # check if it does the expected thing assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) # test if it also works for sparse input # We use a different pca object to control the random_state stream fs = FeatureUnion([("pca", pca), ("select", select)]) X_sp = sparse.csr_matrix(X) X_sp_transformed = fs.fit_transform(X_sp, y) assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) # test setting parameters fs.set_params(select__k=2) assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
def set_traindata(df, key):
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()
    X = fu.fit_transform(df)
    y = np.concatenate(df["label"].apply(lambda x: x.flatten()))
    X = Std.fit_transform(X)
    return (X, y)
def cv_score(train_df, MODEL):
    print("... start cross validation")
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=-1, scoring=rmspe, cv=None)
    print(cross_val_score(clf, train_X, train_y, scoring=rmspe, cv=5, n_jobs=3))
def pca(x, y, test_x, n_features=-1):
    if n_features == -1:
        n_features = int(np.ceil(np.sqrt(x.shape[1])))

    pca = PCA(n_components=n_features)
    # SelectKBest expects an integer k, so use floor division
    selection = SelectKBest(k=n_features // 2)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    combined_features.fit(x, y)
    return combined_features.transform(x), combined_features.transform(test_x)
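# Hedged usage sketch (not from the original source; assumes the module-level
# imports used by pca() above are in scope): the helper fits the FeatureUnion on
# the training split only and reuses it to transform the held-out split, so the
# two outputs share the same column layout.
import numpy as np

rng = np.random.RandomState(0)
x_demo = rng.rand(100, 16)
y_demo = rng.randint(0, 2, size=100)
test_demo = rng.rand(20, 16)

x_feat, test_feat = pca(x_demo, y_demo, test_demo)
print(x_feat.shape, test_feat.shape)  # same number of columns in both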
def prediction(train_df, test_df, MODEL): print "... start prediction" fu_obj = FeatureUnion(transformer_list=features.feature_list) train_df = train_df[(train_df["Open"] == 1) & (train_df["Sales"] > 0)] train_X = fu_obj.fit_transform(train_df) train_y = np.log1p(train_df["Sales"]).as_matrix() train_dump_df = pd.DataFrame(train_X, columns=get_split_feature_list(fu_obj)) train_dump_df["target"] = train_y train_dump_df = train_dump_df.dropna(axis=0) print train_dump_df.shape train_X = train_dump_df[get_split_feature_list(fu_obj)].values train_y = train_dump_df["target"].values train_dump_df["ID"] = -1 train_dump_df.to_csv(SUBMISSION + "train_dump.csv", index=False) test_X = fu_obj.transform(test_df) test_dump_df = pd.DataFrame(test_X, columns=get_split_feature_list(fu_obj)) print (test_dump_df == 0).sum(axis=0) test_dump_df["ID"] = test_df["Id"] test_dump_df.to_csv(SUBMISSION + "test_dump.csv", index=False) if MODEL == "XGB": train_X, valid_X, train_y, valid_y =\ train_test_split(train_X, train_y, test_size=0.05) fit_param = {"eval_set": [(train_X, train_y), (valid_X, valid_y)], "eval_metric": rmspe_xg, "early_stopping_rounds": 100} clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"], param_grid=clf_dict[MODEL]["paramteters"], n_jobs=3, scoring=rmspe, verbose=1, fit_params=fit_param) else: clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"], param_grid=clf_dict[MODEL]["paramteters"], n_jobs=3, scoring=rmspe, verbose=1) clf.fit(train_X, train_y) print clf.best_score_ index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature") if hasattr(clf.best_estimator_, "coef_"): coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef") coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature") coeffile = SUBMISSION + "coef_%s.csv" % MODEL coef_df.to_csv(coeffile) if hasattr(clf.best_estimator_, "feature_importances_"): coef_sr = pd.Series(clf.best_estimator_.feature_importances_, name="Importance") coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature") coeffile = SUBMISSION + "importance_%s.csv" % MODEL coef_df.to_csv(coeffile) print "... start y_pred" y_pred = np.expm1(clf.predict(test_X)) pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"]) submissionfile = SUBMISSION + "submission_%s.csv" % MODEL pred_sr.to_csv(submissionfile, header=True, index_label="ID")
def convert_traindata(train_gray_data, labels):
    data_df = f.make_data_df(train_gray_data, labels)
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()
    X_train = fu.fit_transform(data_df)
    y_train = np.concatenate(data_df["label"].apply(lambda x: x.flatten()))
    X_train = Std.fit_transform(X_train)
    return X_train, y_train
def get_data():
    '''
    get X, y data

    :rtype: tuple
    '''
    _, _, _, train_gray_data, _, _, labels = i_p.load_data()
    data_df = f.make_data_df(train_gray_data, labels)
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    X = fu.fit_transform(data_df)
    y = np.concatenate(data_df["label"].apply(lambda x: x.flatten()))
    return (X, y)
def train_model(trainset, testset):
    word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), binary=False,
                                  max_features=2000, min_df=1, decode_error="ignore")
    # print(word_vector)
    # print("works fine")
    char_vector = TfidfVectorizer(analyzer="char", ngram_range=(2, 3), binary=False,
                                  max_features=2000, min_df=1, decode_error="ignore")
    vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])

    corpus = []
    classes = []
    testclasses = []
    testcorpus = []
    for item in trainset:
        corpus.append(item['text'])
        classes.append(item['label'])
    for item in testset:
        testcorpus.append(item['text'])
        testclasses.append(item['label'])
    # print("Training instances : ", len(classes))
    # print("Testing instances : ", len(set(classes)))

    matrix = vectorizer.fit_transform(corpus)
    # transform (not fit_transform) the test corpus so the test features use the
    # vocabulary learned from the training corpus
    testmatrix = vectorizer.transform(testcorpus)
    # print("feature count : ", len(vectorizer.get_feature_names()))
    # print("training model")

    X = matrix.toarray()
    TX = testmatrix.toarray()
    Ty = numpy.asarray(testclasses)
    y = numpy.asarray(classes)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9999,
                                                        test_size=0.00001, random_state=0)

    model = LinearSVC(dual=True, loss='hinge')  # this loss was spelled 'l1' in older scikit-learn
    # model = SVC()
    # model = NuSVC()
    # model = RandomForestClassifier()
    # scores = cross_validation.cross_val_score(model, X, y)
    # print("Accuracy " + str(scores.mean()))

    y_prob = model.fit(X_train, y_train).predict(TX)
    # y_prob = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test)
    # print(y_prob)
    # cm = confusion_matrix(y_test, y_pred)
    # cr = classification_report(y_test, y_pred)
    # print(cr)
    # print(cm)
    # pl.matshow()
    # pl.title('Confusion matrix#')
    # pl.colorbar()
    # pl.ylabel('True label')
    # pl.xlabel('Predicted label')
    # pl.show()

    print(accuracy_score(y_prob, Ty))
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(AttributeError,
                         'Transformer tr1 (type Transf) does not provide '
                         'get_feature_names',
                         ft.get_feature_names)
class MuscleClassifier():

    def __init__(self, auto_load=True):
        """ Initializes our MuscleClassifier. Option to preload it or start from a fresh model """

        # =====[ If auto_load, then we rehydrate our existing models ]=====
        if auto_load:
            self.model = pickle.load(open('modules/pickled/muscle_classifier.p', 'rb'))
            self.le = pickle.load(open('modules/pickled/muscle_classifier_le.p', 'rb'))
            self.vectorizer = pickle.load(open('modules/pickled/muscle_classifier_vectorizer.p', 'rb'))
        else:
            self.model = BernoulliNB()

    def train(self, muscle_groups, labels):
        """ Vectorizes raw input and trains our classifier """

        # =====[ Instantiate label encoder to turn text labels into ints ]=====
        self.le = preprocessing.LabelEncoder()

        # =====[ Declare vectorizers and merge them via a FeatureUnion ]=====
        char_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(3, 8),
                                                           analyzer='char', encoding='utf-8')
        word_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(1, 5),
                                                           analyzer='word', encoding='utf-8')
        self.vectorizer = FeatureUnion([('char', char_vzr), ('word', word_vzr)])

        # =====[ Transform our input and labels ]=====
        X = self.vectorizer.fit_transform(muscle_groups).toarray()
        Y = self.le.fit_transform(labels)

        # =====[ Fit our model and then run inference on training data ]=====
        self.model.fit(X, Y)
        y = self.model.predict(X)

        # =====[ Report training accuracy (fraction of correct predictions) ]=====
        print("Training Accuracy: %f " % (sum(y == Y) / float(len(Y))))

    def predict(self, exercises):
        """ Takes in raw input, vectorizes it, and reports back predicted muscle group """

        X = self.vectorizer.transform(exercises).toarray()
        y = self.model.predict(X)
        return self.le.classes_[y]
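# Hedged usage sketch (the exercise/muscle-group data below is illustrative, not
# from the original repo): train a fresh MuscleClassifier and query it.
clf = MuscleClassifier(auto_load=False)   # skip the pickled models, start fresh
clf.train(["bench press", "squat", "deadlift", "overhead press"],
          ["chest", "legs", "back", "shoulders"])
print(clf.predict(["incline bench press"]))  # expected to resemble one of the trained labels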
def test_feature_union_weights(): # test feature union with transformer weights iris = load_iris() X = iris.data y = iris.target pca = RandomizedPCA(n_components=2, random_state=0) select = SelectKBest(k=1) # test using fit followed by transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) fs.fit(X, y) X_transformed = fs.transform(X) # test using fit_transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) X_fit_transformed = fs.fit_transform(X, y) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)], transformer_weights={"mock": 10}) X_fit_transformed_wo_method = fs.fit_transform(X, y) # check against expected result # We use a different pca object to control the random_state stream assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def make_checkdata(mode="df"): fu = FeatureUnion(transformer_list=f.feature_transformer_rule) Std = preprocessing.StandardScaler() _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data() train_keys = train_gray_data.keys()[:2] train_inputs = {} train_labels = {} for i in xrange(len(train_keys)): input_ = train_gray_data[train_keys[i]] label = labels[train_keys[i]] train_inputs.update({train_keys[i]:input_}) train_labels.update({train_keys[i]:label}) test_keys = test_gray_data.keys()[:2] test_inputs = {} for i in xrange(len(test_keys)): input_ = test_gray_data[test_keys[i]] test_inputs.update({test_keys[i]:input_}) train_df = f.make_data_df(train_inputs, train_labels) test_df = f.make_test_df(test_inputs) if mode == "df": train_df = train_df.reset_index() test_df = test_df.reset_index() train_df.columns = ["pngname", "input", "label"] test_df.columns = ["pngname", "input"] return train_df, train_keys, test_df, test_keys elif mode == "feature": X_train = fu.fit_transform(train_df) X_train = Std.fit_transform(X_train) y_train = np.concatenate(train_df["label"].apply(lambda x: x.flatten())) X_test = fu.fit_transform(test_df) X_test = Std.fit_transform(X_test) return X_train, y_train, X_test
def test_feature_stacker_weights():
    # test feature stacker with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # check against expected result
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
def ageClassifier(doc, age):
    """ A function that trains an age classifier """
    xTrain = doc
    yTrain = age

    unionOfFeatures = FeatureUnion([
        ('normaltfidf', TfidfVectorizer(preprocessor=identity, tokenizer=identity)),
        ('bigrams', TfidfVectorizer(preprocessor=identity, tokenizer=identity,
                                    ngram_range=(3, 3), analyzer='char')),
        ('counts', CountVectorizer(preprocessor=identity, tokenizer=identity))
    ])

    featureFit = unionOfFeatures.fit(xTrain, yTrain).transform(xTrain)
    classifier = Pipeline([('featureunion', unionOfFeatures),
                           ('cls', svm.SVC(kernel='linear', C=1.5))])
    classifier.fit(xTrain, yTrain)

    return classifier
class AICEnsemble(BaseEstimator, ClassifierMixin): def __init__(self, candidateFeatures: List[CandidateFeature], classifier): self.candidateFeatures = candidateFeatures self.classifier = classifier self.ensemble_pipeline = FeatureUnion( transformer_list=[(str(c), self.generate_pipeline(c)) for c in candidateFeatures]) # calculate weights self.AICc = np.array([ np.min( c.runtime_properties['additional_metrics']['AICc_complexity']) for c in candidateFeatures ]) #self.AICc = [np.mean(c.runtime_properties['additional_metrics']['AICc_complexity']) for c in candidateFeatures] delta_i = self.AICc - np.min(self.AICc) summed = np.sum( np.array([np.exp(-delta_r / 2.0) for delta_r in delta_i])) self.weights = np.array( [np.exp(-d_i / 2.0) / summed for d_i in delta_i]) print(candidateFeatures) print(self.weights) def generate_pipeline(self, rep): best_hyperparameters = rep.runtime_properties['hyperparameters'] all_keys = list(best_hyperparameters.keys()) for k in all_keys: if 'classifier__' in k: best_hyperparameters[k[12:]] = best_hyperparameters.pop(k) my_pipeline = Pipeline([ (str(rep) + '_f', rep.pipeline), (str(rep) + '_c', ClassifierTransformer(self.classifier(**best_hyperparameters))) ]) return my_pipeline def fit(self, X, y=None): self.ensemble_pipeline.fit(X, y) return self def predict_proba(self, X): ensemble_predictions = self.ensemble_pipeline.transform(X) print(ensemble_predictions) print(ensemble_predictions.shape) #weight these predictions weighted_predictions = np.multiply(ensemble_predictions, self.weights) averaged_predictions = np.sum(weighted_predictions, axis=1) averaged_predictions_proba = np.zeros( (averaged_predictions.shape[0], 2)) averaged_predictions_proba[:, 0] = averaged_predictions averaged_predictions_proba[:, 1] = 1.0 - averaged_predictions return averaged_predictions_proba def predict(self, X): return self.predict_proba(X)[:, 0] < 0.5
# Now, our new pipeline:

# In[ ]:

from sklearn.pipeline import FeatureUnion

pipe2 = Pipeline([
    ('u1', FeatureUnion([
        ('tfdif_features', Pipeline([
            ('cv', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
        ])),
        ('pos_features', Pipeline([
            ('pos', PosTagMatrix(tokenizer=nltk.word_tokenize)),
        ])),
    ])),
    ('logit', LogisticRegression()),
])

# In[ ]:

pipe2.fit(X_train_part, y_train_part)
pred = pipe2.predict_proba(X_valid)
log_loss(y_valid, pred)

# Not an improvement, but hey, we learned something new!
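# Hedged follow-up sketch (not from the original notebook): because the
# FeatureUnion is a named step ('u1') inside pipe2, its transformer_weights can
# be adjusted with the usual nested set_params syntax before refitting, e.g. to
# down-weight the POS-tag block relative to the tf-idf block.
pipe2.set_params(u1__transformer_weights={'tfdif_features': 1.0, 'pos_features': 0.5})
pipe2.fit(X_train_part, y_train_part)
log_loss(y_valid, pipe2.predict_proba(X_valid))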
feature_list = np.array(feature_list)
stop_words = helper.read_stopwords()
# feature_list = feature_list[:, 0]
#
union = FeatureUnion(
    transformer_list=[
        ("feature", Pipeline([('selector', ItemSelector(1)),
                              ("vec", DictVectorizer(sparse=False))])),
        ("content", Pipeline([
            ('selector', ItemSelector(0)),
            ('cvec', CountVectorizer(
                # analyzer='char_wb',
                token_pattern=r"(?u)\b\w+\b",
                min_df=1,
                stop_words=stop_words)),
            ('tfidf', TfidfTransformer())
        ]))
    ],
    transformer_weights={
        "feature": 1.0,
        "content": 1.0
    })

union.fit_transform(feature_list)
pipe: Pipeline = union.transformer_list[1][1]
cvec: CountVectorizer = pipe.named_steps["cvec"]
arr = cvec.get_feature_names()
def default_pipeline(self, name, n_pca=10, n_best=10, lda_shrink=10, svm_C=10, svm_gamma=10, fdr_alpha=[0.05], fpr_alpha=[0.05]): """Use a default combination of parameters for building a pipeline Args: name: string The string for building a default pipeline (see examples below) Kargs: n_pca: integer, optional, (def: 10) The number of components to search n_best: integer, optional, (def: 10) Number of best features to consider using a statistical method lda_shrink: integer, optional, (def: 10) Fit optimisation parameter for the lda svm_C/svm_gamma: integer, optional, (def: 10/10) Parameters to optimize for the svm fdr/fpr_alpha: list, optional, (def: [0.05]) List of float for selecting features using a fdr or fpr Examples: >>> # Basic classifiers : >>> name = 'lda' # or name = 'svm_linear' for a linear SVM >>> # Combine a classifier with a feature selection method : >>> name = 'lda_fdr_fpr_kbest_pca' >>> # The method above will use an LDA for the features evaluation >>> # and will combine a FDR, FPR, k-Best and pca feature seelction. >>> # Now we can combine with classifier optimisation : >>> name = 'lda_optimized_pca' # will try to optimize an LDA with a pca >>> name = 'svm_kernel_C_gamma_kbest' # optimize a SVM by trying >>> # diffrent kernels (linear/RBF), and optimize C and gamma parameters >>> # combine with a k-Best features selection. """ # ---------------------------------------------------------------- # DEFINED COMBINORS # ---------------------------------------------------------------- pca = PCA() selection = SelectKBest() scaler = StandardScaler() fdr = SelectFdr() fpr = SelectFpr() # ---------------------------------------------------------------- # RANGE DEFINITION # --------------------------------------------------------- pca_range = np.arange(1, n_pca + 1) kbest_range = np.arange(1, n_best + 1) C_range = np.logspace(-5, 15, svm_C, base=2.) #np.logspace(-2, 2, svm_C) gamma_range = np.logspace(-15, 3, svm_gamma, base=2.) #np.logspace(-9, 2, svm_gamma) # Check range : if not kbest_range.size: kbest_range = [1] if not pca_range.size: pca_range = [1] if not C_range.size: C_range = [1.] 
if not gamma_range.size: gamma_range = ['auto'] # ---------------------------------------------------------------- # DEFINED PIPELINE ELEMENTS # ---------------------------------------------------------------- pipeline = [] grid = {} combine = [] # ---------------------------------------------------------------- # BUILD CLASSIFIER # ---------------------------------------------------------------- # -> SCALE : if name.lower().find('scale') != -1: pipeline.append(("scaler", scaler)) # -> LDA : if name.lower().find('lda') != -1: # Default : if name.lower().find('optimized') == -1: clf = LinearDiscriminantAnalysis( priors=np.array([1 / self._nclass] * self._nclass)) # Optimized : elif name.lower().find('optimized') != -1: clf = LinearDiscriminantAnalysis(priors=np.array( [1 / self._nclass] * self._nclass), solver='lsqr') grid['clf__shrinkage'] = np.linspace(0., 1., lda_shrink) # -> SVM : elif name.lower().find('svm') != -1: # Linear/RBF standard kernel : if name.lower().find('linear') != -1: kwargs = {'kernel': 'linear'} elif name.lower().find('rbf') != -1: kwargs = {'kernel': 'rbf'} else: kwargs = {} # Optimized : if name.lower().find('optimized') != -1: # Kernel optimization : if name.lower().find('kernel') != -1: grid['clf__kernel'] = ('linear', 'rbf') # C optimization : if name.lower().find('_c_') != -1: grid['clf__C'] = C_range # Gamma optimization : if name.lower().find('gamma') != -1: grid['clf__gamma'] = gamma_range clf = SVC(**kwargs) # ---------------------------------------------------------------- # BUILD COMBINE # ---------------------------------------------------------------- # -> FDR : if name.lower().find('fdr') != -1: combine.append(("fdr", fdr)) grid['features__fdr__alpha'] = fdr_alpha # -> FPR : if name.lower().find('fpr') != -1: combine.append(("fpr", fpr)) grid['features__fpr__alpha'] = fpr_alpha # -> PCA : if name.lower().find('pca') != -1: combine.append(("pca", pca)) grid['features__pca__n_components'] = pca_range # -> kBest : if name.lower().find('kbest') != -1: combine.append(("kBest", selection)) grid['features__kBest__k'] = kbest_range # -> RFECV : if name.lower().find('rfecv') != -1: rfecv = RFECV(clf) combine.append(("RFECV", rfecv)) # if combine is empty, select all features : if not len(combine): combine.append(("kBest", SelectKBest(k='all'))) self.combine = FeatureUnion(combine) # ---------------------------------------------------------------- # SAVE PIPELINE # ---------------------------------------------------------------- # Build ordered pipeline : if len(combine): pipeline.append(("features", self.combine)) pipeline.append(("clf", clf)) # Save pipeline : self.pipeline = Pipeline(pipeline) self.grid = grid self._pipename = name
num_pipeline = Pipeline([
    ('selector', ds.DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', ca.CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# select the categorical attributes
cat_pipeline = Pipeline([
    ('selector', ds.DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizer(sparse_output=True)),
])

# concatenate the two sub-pipelines
full_pipeline = FeatureUnion(
    transformer_list=[("num_pipeline", num_pipeline),
                      ("cat_pipeline", cat_pipeline)])

housing_prepared = full_pipeline.fit_transform(housing)

# test set: reuse the pipeline fitted on the training set instead of refitting it
test_housing = strat_test_set.drop("median_house_value", axis=1)
test_housing_labels = strat_test_set["median_house_value"].copy()
test_housing_prepared = full_pipeline.transform(test_housing)

# Linear Reg
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
predict = lin_reg.predict(test_housing_prepared)
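# Hedged caveat (based on scikit-learn's documented API, not on the original
# source): in recent scikit-learn versions LabelBinarizer.fit_transform only
# accepts a label vector, so it raises a TypeError when a Pipeline passes it
# (X, y). A thin adapter like the hypothetical PipelineLabelBinarizer below is
# one way to keep the cat_pipeline shape shown above.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer

class PipelineLabelBinarizer(BaseEstimator, TransformerMixin):
    """Adapter so LabelBinarizer can sit inside a Pipeline/FeatureUnion,
    which call fit/transform with (X, y) rather than a lone label vector."""
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        self.encoder_ = LabelBinarizer(sparse_output=self.sparse_output)
        self.encoder_.fit(np.asarray(X).ravel())
        return self

    def transform(self, X):
        return self.encoder_.transform(np.asarray(X).ravel())

# cat_pipeline = Pipeline([
#     ('selector', ds.DataFrameSelector(cat_attribs)),
#     ('label_binarizer', PipelineLabelBinarizer(sparse_output=True)),
# ])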
num_attribs = list(sample_data_num)
# extract the text attribute
cat_attribs = ["class(OK/NG)"]

# Data-transformation pipeline: standardize the 13 numeric columns,
# leave the last (label) column untouched
num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_attribs)),
    ("std_scaler", StandardScaler()),
])

cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_attribs)),
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

# Feed the data through the pipeline to obtain the prepared data
sample_data_prepared = full_pipeline.fit_transform(sample_data)

# Take the last column (the class label) out on its own and flatten it to a 1-D array
sample_data_label = sample_data_prepared[:, -1:]
sample_data_label = sample_data_label.flatten()

# Turn the label column into booleans: True for "OK", False for "NG",
# which makes the later model metrics easier to compute
label_train = (sample_data_label == "OK")

# Take the 13 feature columns out on their own, ready for training
sample_data_13 = sample_data_prepared[:, :13]

# load the data
clf = pipeline.Pipeline([ ( 'union', FeatureUnion( transformer_list=[ ('cst', cust_regression_vals()), ('txt1', pipeline.Pipeline([('s1', cust_txt_col(key='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd)])), ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])), ('txt3', pipeline.Pipeline([('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf), ('tsvd3', tsvd)])), ('txt4', pipeline.Pipeline([('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)])) ], transformer_weights={ 'cst': 1.0, 'txt1': 0.5, 'txt2': 0.25, 'txt3': 0.0, 'txt4': 0.5 }, #n_jobs = -1 )), ('rfr', rfr) ]) param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
def test_set_feature_union_step_none():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=None)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=None)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))
def test_set_feature_union_steps():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    mult5 = Mult(5)
    mult5.get_feature_names = lambda: ['x5']

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]])))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    # Directly setting attr
    ft.transformer_list = [('m5', mult5)]
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['m5__x5'], ft.get_feature_names())

    # Using set_params
    ft.set_params(transformer_list=[('mock', mult3)])
    assert_array_equal([[3]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x3'], ft.get_feature_names())

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x5'], ft.get_feature_names())
def test_feature_union_parallel(): # test that n_jobs work for FeatureUnion X = JUNK_FOOD_DOCS fs = FeatureUnion([ ("words", CountVectorizer(analyzer='word')), ("chars", CountVectorizer(analyzer='char')), ]) fs_parallel = FeatureUnion([ ("words", CountVectorizer(analyzer='word')), ("chars", CountVectorizer(analyzer='char')), ], n_jobs=2) fs_parallel2 = FeatureUnion([ ("words", CountVectorizer(analyzer='word')), ("chars", CountVectorizer(analyzer='char')), ], n_jobs=2) fs.fit(X) X_transformed = fs.transform(X) assert_equal(X_transformed.shape[0], len(X)) fs_parallel.fit(X) X_transformed_parallel = fs_parallel.transform(X) assert_equal(X_transformed.shape, X_transformed_parallel.shape) assert_array_equal(X_transformed.toarray(), X_transformed_parallel.toarray()) # fit_transform should behave the same X_transformed_parallel2 = fs_parallel2.fit_transform(X) assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray()) # transformers should stay fit after fit_transform X_transformed_parallel2 = fs_parallel2.transform(X) assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray())
def test_feature_union_weights(): # test feature union with transformer weights iris = load_iris() X = iris.data y = iris.target pca = PCA(n_components=2, svd_solver='randomized', random_state=0) select = SelectKBest(k=1) # test using fit followed by transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) fs.fit(X, y) X_transformed = fs.transform(X) # test using fit_transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) X_fit_transformed = fs.fit_transform(X, y) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)], transformer_weights={"mock": 10}) X_fit_transformed_wo_method = fs.fit_transform(X, y) # check against expected result # We use a different pca object to control the random_state stream assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Test clone
    fs2 = assert_no_warnings(clone, fs)
    assert_false(fs.transformer_list[0][1] is fs2.transformer_list[0][1])

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))

    # test error if some elements do not support transform
    assert_raises_regex(
        TypeError, 'All estimators should implement fit and '
        'transform.*\\bNoTrans\\b',
        FeatureUnion, [("transform", Transf()), ("no_transform", NoTrans())])

    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
def enhance_transactions(self): # load training data self.training_data = ml.load_training_data( self.training_data, known_account=self.account, existing_entries=self.existing_entries) # train the machine learning model self._trained = False if not self.training_data: logger.warning("Cannot train the machine learning model " "because the training data is empty.") elif len(self.training_data) < 2: logger.warning( "Cannot train the machine learning model " "because the training data consists of less than two elements." ) else: self.pipeline = Pipeline([ ( 'union', FeatureUnion( transformer_list=[ ('narration', Pipeline([ ('getNarration', ml.GetNarration()), ('vect', CountVectorizer(ngram_range=(1, 3))), ])), ( 'payee', Pipeline([ # any existing payee, if one exists ('getPayee', ml.GetPayee()), ('vect', CountVectorizer(ngram_range=(1, 3))), ])), ( 'dayOfMonth', Pipeline([ ('getDayOfMonth', ml.GetDayOfMonth()), ('caster', ml.ArrayCaster() ), # need for issue with data shape ])), ], transformer_weights={ 'narration': 0.8, 'payee': 0.5, 'dayOfMonth': 0.1 })), ('svc', SVC(kernel='linear')), ]) logger.debug("About to train the machine learning model...") self.pipeline.fit(self.training_data, ml.GetPayee().transform(self.training_data)) logger.info("Finished training the machine learning model.") self._trained = True if not self._trained: logger.warning( "Cannot generate predictions or suggestions " "because there is no trained machine learning model.") return self.imported_transactions # predict payees self.transactions = self.imported_transactions if self.predict_payees: logger.debug("About to generate predictions for payees...") predicted_payees: List[str] predicted_payees = self.pipeline.predict(self.transactions) self.transactions = [ ml.add_payee_to_transaction( *t_p, overwrite=self.overwrite_existing_payees) for t_p in zip(self.transactions, predicted_payees) ] logger.debug( "Finished adding predicted payees to the transactions to be imported." ) # suggest likely payees if self.suggest_payees: # get values from the SVC decision function logger.debug( "About to generate suggestions about likely payees...") decision_values = self.pipeline.decision_function( self.imported_transactions) # add a human-readable class label (i.e., payee's name) to each value, and sort by value: suggested_payees = [[ payee for _, payee in sorted(list( zip(distance_values, self.pipeline.classes_)), key=lambda x: x[0], reverse=True) ] for distance_values in decision_values] # add the suggested payees to each transaction: self.transactions = [ ml.add_suggested_payees_to_transaction(*t_p) for t_p in zip(self.transactions, suggested_payees) ] logger.debug( "Finished adding suggested payees to the transactions to be imported." ) return self.transactions
def __init__(self, dataset, obs_dim, act_dim, gamma, horizon, model_reg, reward_reg, value_reg, default_length_scale=0.1, random_feature_per_obs_dim=250, norm=None, scale_length_adjustment='median', dtype=np.float64, policy_net=None): self.obs_dim = obs_dim self.act_dim = act_dim self.gamma = gamma self.horizon = horizon self.norm = norm self.policy_net = policy_net self.model_reg = model_reg self.reward_reg = reward_reg self.value_reg = value_reg self.dtype = dtype self.n_samples = dataset['obs'].shape[0] self.n_episode = dataset['init_obs'].shape[0] self.data_acts = dataset['acts'] if self.policy_net is not None: self.pi_current = self.policy_net.get_probabilities(dataset['obs']) self.pi_next = self.policy_net.get_probabilities( dataset['next_obs']) self.pi_init = self.policy_net.get_probabilities( dataset['init_obs']) self.pi_term = self.policy_net.get_probabilities( dataset['term_obs']) else: self.pi_current = dataset['target_prob_obs'] self.pi_next = dataset['target_prob_next_obs'] self.pi_init = dataset['target_prob_init_obs'] self.pi_term = dataset['target_prob_term_obs'] if self.norm is None: self.obs = dataset['obs'] self.next_obs = dataset['next_obs'] self.init_obs = dataset['init_obs'] self.term_obs = dataset['term_obs'] elif self.norm == 'std': self.obs_mean = np.mean(dataset['obs'], axis=0, keepdims=True) self.obs_std = np.std(dataset['obs'], axis=0, keepdims=True) self.obs = (dataset['obs'] - self.obs_mean) / self.obs_std self.next_obs = (dataset['next_obs'] - self.obs_mean) / self.obs_std self.init_obs = (dataset['init_obs'] - self.obs_mean) / self.obs_std self.term_obs = (dataset['term_obs'] - self.obs_mean) / self.obs_std else: raise NotImplementedError if scale_length_adjustment == 'median': sample_num = 5000 idx1 = np.random.choice(self.n_samples, sample_num) idx2 = np.random.choice(self.n_samples, sample_num) med_dist = np.median(np.square(self.obs[None, idx1, :] - self.obs[idx2, None, :]), axis=(0, 1)) med_dist[ med_dist < 0.01] = 0.01 # enforce a upperbound on the scale-length of the action component scale_length_vector = 1.0 / med_dist else: scale_length_vector = np.ones(self.obs_dim) # import pdb; pdb.set_trace() #* set the fourier feature transformer_list = [] self.z_dim = random_feature_per_obs_dim * self.obs_dim models = [ RBFSampler(n_components=random_feature_per_obs_dim, gamma=default_length_scale * dist) for dist in scale_length_vector ] for model in models: model.fit([self.obs[0]]) transformer_list.append((str(model), model)) self.rff = FeatureUnion(transformer_list) # #* separate action set indexing # act_idx = [] # for i in range(self.act_dim): # act_idx.append(np.where(dataset['acts']==i)[0]) # #* apply transformation # Z = self.rff.transform(self.obs).astype(self.dtype); Z_prime = self.rff.transform(self.next_obs).astype(self.dtype) # Z_init = self.rff.transform(self.init_obs).astype(self.dtype); Z_term = self.rff.transform(self.term_obs).astype(self.dtype) # assert self.z_dim == Z.shape[1] # self.Phi = np.zeros((Z.shape[0], Z.shape[1]* self.act_dim), dtype=self.dtype) # self.Phi_pi = np.zeros((Z.shape[0], Z.shape[1]* self.act_dim),dtype=self.dtype) # self.Phi_prime_pi = np.zeros((Z_prime.shape[0], Z_prime.shape[1]* self.act_dim),dtype=self.dtype) # self.Phi_init_pi = np.zeros((Z_init.shape[0], Z_init.shape[1]*self.act_dim), dtype=self.dtype) # self.Phi_term_pi = np.zeros((Z_term.shape[0], Z_term.shape[1]*self.act_dim),dtype=self.dtype) # for i in range(self.act_dim): # self.Phi[act_idx[i], i*self.z_dim:(i+1)*self.z_dim] = Z[act_idx[i]] # self.Phi_pi[:, 
i*self.z_dim:(i+1)*self.z_dim] = self.pi_current[:,i][:,None] * Z # self.Phi_prime_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_next[:,i][:,None] * Z_prime # self.Phi_init_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_init[:,i][:,None]*Z_init # self.Phi_term_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_term[:,i][:,None]*Z_term #* Some commonly used variables # self.I_sa = np.eye(self.act_dim*self.z_dim) self.rews = dataset['rews'] self.init_idx = np.arange(0, self.n_samples, self.horizon) self.end_idx = np.arange(self.horizon - 1, self.n_samples, self.horizon) self.rho = dataset[ 'ratio'] #* make sure that the importance weights are already calculated
class Kernel_Estimators(object): def __init__(self, dataset, obs_dim, act_dim, gamma, horizon, model_reg, reward_reg, value_reg, default_length_scale=0.1, random_feature_per_obs_dim=250, norm=None, scale_length_adjustment='median', dtype=np.float64, policy_net=None): self.obs_dim = obs_dim self.act_dim = act_dim self.gamma = gamma self.horizon = horizon self.norm = norm self.policy_net = policy_net self.model_reg = model_reg self.reward_reg = reward_reg self.value_reg = value_reg self.dtype = dtype self.n_samples = dataset['obs'].shape[0] self.n_episode = dataset['init_obs'].shape[0] self.data_acts = dataset['acts'] if self.policy_net is not None: self.pi_current = self.policy_net.get_probabilities(dataset['obs']) self.pi_next = self.policy_net.get_probabilities( dataset['next_obs']) self.pi_init = self.policy_net.get_probabilities( dataset['init_obs']) self.pi_term = self.policy_net.get_probabilities( dataset['term_obs']) else: self.pi_current = dataset['target_prob_obs'] self.pi_next = dataset['target_prob_next_obs'] self.pi_init = dataset['target_prob_init_obs'] self.pi_term = dataset['target_prob_term_obs'] if self.norm is None: self.obs = dataset['obs'] self.next_obs = dataset['next_obs'] self.init_obs = dataset['init_obs'] self.term_obs = dataset['term_obs'] elif self.norm == 'std': self.obs_mean = np.mean(dataset['obs'], axis=0, keepdims=True) self.obs_std = np.std(dataset['obs'], axis=0, keepdims=True) self.obs = (dataset['obs'] - self.obs_mean) / self.obs_std self.next_obs = (dataset['next_obs'] - self.obs_mean) / self.obs_std self.init_obs = (dataset['init_obs'] - self.obs_mean) / self.obs_std self.term_obs = (dataset['term_obs'] - self.obs_mean) / self.obs_std else: raise NotImplementedError if scale_length_adjustment == 'median': sample_num = 5000 idx1 = np.random.choice(self.n_samples, sample_num) idx2 = np.random.choice(self.n_samples, sample_num) med_dist = np.median(np.square(self.obs[None, idx1, :] - self.obs[idx2, None, :]), axis=(0, 1)) med_dist[ med_dist < 0.01] = 0.01 # enforce a upperbound on the scale-length of the action component scale_length_vector = 1.0 / med_dist else: scale_length_vector = np.ones(self.obs_dim) # import pdb; pdb.set_trace() #* set the fourier feature transformer_list = [] self.z_dim = random_feature_per_obs_dim * self.obs_dim models = [ RBFSampler(n_components=random_feature_per_obs_dim, gamma=default_length_scale * dist) for dist in scale_length_vector ] for model in models: model.fit([self.obs[0]]) transformer_list.append((str(model), model)) self.rff = FeatureUnion(transformer_list) # #* separate action set indexing # act_idx = [] # for i in range(self.act_dim): # act_idx.append(np.where(dataset['acts']==i)[0]) # #* apply transformation # Z = self.rff.transform(self.obs).astype(self.dtype); Z_prime = self.rff.transform(self.next_obs).astype(self.dtype) # Z_init = self.rff.transform(self.init_obs).astype(self.dtype); Z_term = self.rff.transform(self.term_obs).astype(self.dtype) # assert self.z_dim == Z.shape[1] # self.Phi = np.zeros((Z.shape[0], Z.shape[1]* self.act_dim), dtype=self.dtype) # self.Phi_pi = np.zeros((Z.shape[0], Z.shape[1]* self.act_dim),dtype=self.dtype) # self.Phi_prime_pi = np.zeros((Z_prime.shape[0], Z_prime.shape[1]* self.act_dim),dtype=self.dtype) # self.Phi_init_pi = np.zeros((Z_init.shape[0], Z_init.shape[1]*self.act_dim), dtype=self.dtype) # self.Phi_term_pi = np.zeros((Z_term.shape[0], Z_term.shape[1]*self.act_dim),dtype=self.dtype) # for i in range(self.act_dim): # self.Phi[act_idx[i], i*self.z_dim:(i+1)*self.z_dim] 
= Z[act_idx[i]] # self.Phi_pi[:, i*self.z_dim:(i+1)*self.z_dim] = self.pi_current[:,i][:,None] * Z # self.Phi_prime_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_next[:,i][:,None] * Z_prime # self.Phi_init_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_init[:,i][:,None]*Z_init # self.Phi_term_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_term[:,i][:,None]*Z_term #* Some commonly used variables # self.I_sa = np.eye(self.act_dim*self.z_dim) self.rews = dataset['rews'] self.init_idx = np.arange(0, self.n_samples, self.horizon) self.end_idx = np.arange(self.horizon - 1, self.n_samples, self.horizon) self.rho = dataset[ 'ratio'] #* make sure that the importance weights are already calculated def estimate_model_based(self): #* separate action set indexing act_idx = [] for i in range(self.act_dim): act_idx.append(np.where(self.data_acts == i)[0]) #* apply transformation Z = self.rff.transform(self.obs).astype(self.dtype) Z_prime = self.rff.transform(self.next_obs).astype(self.dtype) Z_init = self.rff.transform(self.init_obs).astype(self.dtype) Z_term = self.rff.transform(self.term_obs).astype(self.dtype) assert self.z_dim == Z.shape[1] Phi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim), dtype=self.dtype) Phi_pi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim), dtype=self.dtype) Phi_prime_pi = np.zeros( (Z_prime.shape[0], Z_prime.shape[1] * self.act_dim), dtype=self.dtype) Phi_init_pi = np.zeros( (Z_init.shape[0], Z_init.shape[1] * self.act_dim), dtype=self.dtype) Phi_term_pi = np.zeros( (Z_term.shape[0], Z_term.shape[1] * self.act_dim), dtype=self.dtype) for i in range(self.act_dim): Phi[act_idx[i], i * self.z_dim:(i + 1) * self.z_dim] = Z[act_idx[i]] Phi_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_current[:, i][:, None] * Z Phi_prime_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_next[:, i][:, None] * Z_prime Phi_init_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_init[:, i][:, None] * Z_init Phi_term_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_term[:, i][:, None] * Z_term I_sa = np.eye(self.act_dim * self.z_dim) #* uncentered /center covariance identity: H = np.eye(self.n_samples) # H = np.eye(self.n_samples) - 1.0/self.n_samples*np.ones((self.n_samples, self.n_samples)) #* estimate reward function r_sa = np.linalg.inv(Phi.T @ Phi + self.reward_reg * I_sa) @ Phi.T @ self.rews Sigma_yx = 1 / self.n_samples * Phi_prime_pi.T @ H @ Phi Sigma_xx = 1 / self.n_samples * Phi.T @ H @ Phi P = np.matmul(Sigma_yx, np.linalg.inv(Sigma_xx + self.model_reg * I_sa)) #* Now that we have the transition operator, we have that: #* E_{s'|s}[\phi(s')|s] = P \phi(s) #* This gives a clean mechanism to roll the model forward #* in particular, the next feature matrix will be #* Phi' = Phi P.T, where Phi = [phi_1, ..., phi_n].T \in R^{n\times p} finite_horizon_correction = I_sa - np.linalg.matrix_power( self.gamma * P.T, self.horizon) transposed_transition_inverse = np.linalg.inv(I_sa - self.gamma * P.T) accumulated_feature = Phi_pi @ finite_horizon_correction @ transposed_transition_inverse V = accumulated_feature @ r_sa value_est = np.mean(V[self.init_idx]) return value_est def estimate_LSTD(self): reg = self.value_reg Z = self.rff.transform(self.obs) Z_prime = self.rff.transform(self.next_obs) R = self.rho * self.rews regularized_inverse = np.linalg.inv( np.matmul(Z.T, Z - self.gamma * self.rho * Z_prime) + reg * np.eye(self.z_dim)) featurized_reward = np.matmul(Z.T, R) reward_coef = np.matmul(regularized_inverse, featurized_reward) V_init = Z[self.init_idx] @ reward_coef V_term = 
Z[self.end_idx] @ reward_coef V_traj = V_init - V_term * self.gamma**self.horizon value_est = np.mean(V_traj) return value_est def estimate_LSTDQ(self): #* separate action set indexing act_idx = [] for i in range(self.act_dim): act_idx.append(np.where(self.data_acts == i)[0]) #* apply transformation Z = self.rff.transform(self.obs).astype(self.dtype) Z_prime = self.rff.transform(self.next_obs).astype(self.dtype) Z_init = self.rff.transform(self.init_obs).astype(self.dtype) Z_term = self.rff.transform(self.term_obs).astype(self.dtype) assert self.z_dim == Z.shape[1] Phi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim), dtype=self.dtype) Phi_pi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim), dtype=self.dtype) Phi_prime_pi = np.zeros( (Z_prime.shape[0], Z_prime.shape[1] * self.act_dim), dtype=self.dtype) Phi_init_pi = np.zeros( (Z_init.shape[0], Z_init.shape[1] * self.act_dim), dtype=self.dtype) Phi_term_pi = np.zeros( (Z_term.shape[0], Z_term.shape[1] * self.act_dim), dtype=self.dtype) for i in range(self.act_dim): Phi[act_idx[i], i * self.z_dim:(i + 1) * self.z_dim] = Z[act_idx[i]] Phi_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_current[:, i][:, None] * Z Phi_prime_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_next[:, i][:, None] * Z_prime Phi_init_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_init[:, i][:, None] * Z_init Phi_term_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_term[:, i][:, None] * Z_term I_sa = np.eye(self.act_dim * self.z_dim) regularized_inverse = np.linalg.inv( np.matmul(Phi.T, Phi - self.gamma * Phi_prime_pi) + self.value_reg * I_sa) featurized_reward = np.matmul(Phi.T, self.rews) reward_coef = np.matmul(regularized_inverse, featurized_reward) V_init = Phi_init_pi @ reward_coef V_term = Phi_term_pi @ reward_coef V_traj = V_init - V_term * self.gamma**self.horizon value_est = np.mean(V_traj) return value_est def estimate_LSTD_dual(self): import kernel_util as ku sample_num = 5000 idx1 = np.random.choice(self.n_samples, sample_num) idx2 = np.random.choice(self.n_samples, sample_num) med_dist = np.median(np.square(self.obs[None, idx1, :] - self.obs[idx2, None, :]), axis=(0, 1)) med_dist[ med_dist < 0.01] = 0.01 # enforce a upperbound on the scale-length of the action component w = 1.0 / med_dist default_gamma = 0.1 reg = 1e-2 ratio_vector = self.rho.copy().astype(np.float32) K = ku.weighted_rbf_kernel(self.obs, w=w, gamma=default_gamma).astype(np.float32) K_prime = ku.weighted_rbf_kernel(self.next_obs, self.obs, w=w, gamma=default_gamma).astype( np.float32) K_prime = self.gamma * (K_prime * ratio_vector.repeat(self.n_samples, axis=1)) R = (ratio_vector * self.rews).astype(np.float32) beta = np.linalg.inv(K - K_prime + reg * np.eye(self.n_samples)).dot(R) K0 = ku.weighted_rbf_kernel(self.obs, self.init_obs, w=w, gamma=default_gamma) K_terminal = ku.weighted_rbf_kernel(self.obs, self.term_obs, w=w, gamma=default_gamma) # V_init = np.matmul(beta.T, K0) # V_term = np.matmul(beta.T, K_terminal) V_init = K0.T @ beta V_term = K_terminal.T @ beta V_traj = V_init - V_term * self.gamma**self.horizon value_est = np.mean(V_traj) import pdb pdb.set_trace() return value_est
('data', DataFrameColumnExtracter('Surname')), ('vectorizer', HashingVectorizer(non_negative=True)) ]) bio_pipe = Pipeline([ ('data', DataFrameColumnExtracter('Bio')), ('preprocessor', StripHTMLTransformer()), ('vectorizer', CountVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1, 3))), ('tfidf', TfidfTransformer()) ]) features = FeatureUnion( n_jobs=1, transformer_list=[ ('email_pipe', email_pipe), ('fname_pipe', fname_pipe), ('lname_pipe', lname_pipe), ('bio_pipe', bio_pipe) ], transformer_weights=None) classifier = Pipeline([ ('features', features), ('model', MultinomialNB(alpha=0.0001, fit_prior=True)) ]) classifier.fit(trainData, labels) filename = 'member_classifier.pickle' print "writing model to file %s" % (filename) pickle.dump(classifier, open(filename, 'wb'))
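# A minimal sketch of the column-extractor pattern that DataFrameColumnExtracter above
# presumably implements (an assumption; the original class is not shown): pull a single
# DataFrame column so each downstream text vectorizer receives a 1-D sequence of strings.
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameColumnExtracterSketch(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.column]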
data_train, data_test, labels_train, labels_test = train_test_split( data, labels, test_size=0.3, random_state=100) vectorizer = CountVectorizer() # TruncatedSVD to select the principal components: pca = TruncatedSVD(n_components=2) #NMF for MultinomialNB as a PCA technique pca1 = NMF(n_components=2) # K-best features to be selected selection = SelectKBest(chi2, k=1) # To combine the features for LinearSVC, Decision Trees and Logistic Regression combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)]) # To combine the features for MultinomialNB combined_features1 = FeatureUnion([("pca", pca1), ("univ_select", selection)]) clf = DecisionTreeClassifier(criterion="gini", random_state=100) # Performing training by creating one pipeline per classifier according to the respective #combined features and classification algorithm pipeline_logreg = Pipeline([("count_vectorizer", vectorizer), ("features", combined_features), ("Logreg", LogisticRegression())]) pipeline_svc = Pipeline([("count_vectorizer", vectorizer), ("features", combined_features), ("svm", LinearSVC())]) pipeline_dt = Pipeline([("count_vectorizer", vectorizer), ("features", combined_features), ("dt", clf)])
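# Hypothetical continuation of the snippet above (not in the original): MultinomialNB
# requires non-negative inputs, which is why combined_features1 swaps TruncatedSVD for
# NMF. Assumes data_train/data_test hold raw text, as the CountVectorizer step implies.
from sklearn.naive_bayes import MultinomialNB

pipeline_nb = Pipeline([("count_vectorizer", vectorizer),
                        ("features", combined_features1),
                        ("nb", MultinomialNB())])
pipeline_nb.fit(data_train, labels_train)
print(pipeline_nb.score(data_test, labels_test))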
def extract_features(train_data, test_data=None, model_persistor=None): """ Does feature enrichment and enhancement (separate from modeling) if test_data is passed, it is included in the processing and split out again this is to account for encoding where feature categories aren't in the train set :param train_data: Training data :param test_data: Test data :param model_persistor: An instance of PersistModel. When passed, supporting objects can be added """ # ADD TO THE NOTES EXPLAINING WHAT YOU ARE DOING WITH THE FEATURE EXTRACTION model_persistor.add_note(" Extract Features now includes...") if test_data is not None: data_to_process = pd.concat([ train_data[the_settings.all_features], test_data[the_settings.all_features] ], ignore_index=True) else: data_to_process = train_data # I want to try some dimensionality reduction with this one. # First I'm going to start with all of the previous features and feature_extraction = Pipeline([ ('initial features', FeatureUnion([ ('numeric_features_standardized', Pipeline([('numeric_features_raw', FeatureUnion([ ('numeric_features', ColumnExtractor(the_settings.numeric_features)), ('v22_letter_count', LetterCountTransformer( the_settings.special_string_features)), ('Nan_count', NaNCountTransformer()) ])), ('zero_na', NanToZeroTransformer())])), ('string_features_standardized', Pipeline([('string_features', ColumnExtractor(the_settings.string_features)), ('label', MultiColumnLabelEncoder()), ('one_hot', OneHotEncoder(sparse=False))])), ('v22_standardized', Pipeline([('extract', LetterExtractionTransformer( the_settings.special_string_features)), ('label', MultiColumnLabelEncoder()), ('one_hot', OneHotEncoder(sparse=False))])) ])) ]) fitted_feature_model = feature_extraction.fit(data_to_process) if model_persistor: model_persistor.add_object_to_save(fitted_feature_model, FileObjectType.feature_model) extracted_features = fitted_feature_model.transform(data_to_process) return extracted_features[:len(train_data )], extracted_features[len(train_data):]
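# A minimal sketch of what a NaNCountTransformer like the one above might look like
# (an assumption; the original implementation is not shown): a single extra column
# holding the number of missing values in each row.
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class NaNCountSketch(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(X).isna().sum(axis=1).to_numpy().reshape(-1, 1)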
# Scikit-Learn provides a very useful API to create data transformation # pipelines. Our data contains both numerical values and categorical/text # values. So we'll need a pipeline for each type of data. Then we'll need # a way to merge both pipelines together to build the final training set. housing_num = housing.drop("ocean_proximity", axis=1) # Calling list() on a dataframe returns the attribute names num_attribs = list(housing_num) cat_attribs = ["ocean_proximity"] num_pipeline = Pipeline([ ('selector', DataFrameSelector(num_attribs)), ('imputer', Imputer(strategy="median")), ('attribs_adder', CombinedAttributesAdder()), ('std_scaler', StandardScaler()), ]) cat_pipeline = Pipeline([ ('selector', DataFrameSelector(cat_attribs)), ('label_binarizer', LabelBinarizer()), ]) preparation_pipeline = FeatureUnion(transformer_list=[ ("num_pipeline", num_pipeline), ("cat_pipeline", cat_pipeline), ]) housing_prepared = preparation_pipeline.fit_transform(housing)
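# In scikit-learn >= 0.20, LabelBinarizer can no longer sit inside a Pipeline (its
# fit_transform only accepts X), and this FeatureUnion-plus-DataFrameSelector merge is
# usually written with ColumnTransformer instead. A sketch under that assumption,
# reusing num_attribs/cat_attribs from above and omitting CombinedAttributesAdder:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline_ct = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
preparation_ct = ColumnTransformer([
    ("num", num_pipeline_ct, num_attribs),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])
# housing_prepared = preparation_ct.fit_transform(housing)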
X_test, Y_test = utils.create_x_y(test, dt=128, shift=64, verbose=0) X_train, Y_train = utils.create_x_y(train, dt=128, shift=64) print('X_test.shape:', X_test.shape) print('Y_test.shape:', Y_test.shape) print('X_train.shape:', X_train.shape) print('Y_train.shape:', Y_train.shape) print('\nTesting features:\n') std = STD() entrop = Entropy() quantiles = Quantiles(quantiles=[0.5, 0.25, 0.75]) test_std = std.fit_transform(X_test) print('test_std.shape:', test_std.shape) test_entrop = entrop.fit_transform(X_test) print('test_entrop.shape:', test_entrop.shape) test_quantiles = quantiles.fit_transform(X_test) print('test_quantiles.shape:', test_quantiles.shape) union = FeatureUnion([ ('STD', STD()), ('Entropy', Entropy()), ('Quantiles', Quantiles(quantiles=[0.5])), ]) result = union.fit_transform(X_test) print('All in one:', result.shape)
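# A minimal sketch of one of the window-feature transformers combined above (an
# assumption; STD/Entropy/Quantiles are not shown here). It reduces what is assumed to
# be the time axis (axis=1, length dt=128) to one statistic per channel, so FeatureUnion
# can hstack the per-window results.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class StdFeatureSketch(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # X: (n_windows, n_times, n_channels) -> (n_windows, n_channels)
        return np.std(X, axis=1)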
# replace White with white as well as urban and rural so it's consistent df = df.replace('White', 'white') df = df.replace('Rural', 'rural') df = df.replace('Urban', 'urban') # TODO: Process missing data in pipeline categorical_pipeline = Pipeline( steps=[('cat_selector', FeatureSelector(['Sex', 'Race', 'RuralUrban']) ), ('one_hot_enc', OneHotEncoder(sparse=False))]) numerical_pipeline = Pipeline(steps=[('num_selector', FeatureSelector(['Age']))]) feature_pipeline = FeatureUnion(transformer_list=[( 'numerical_pipeline', numerical_pipeline), ('categorical_pipeline', categorical_pipeline)]) final_pipeline = Pipeline( steps=[('feature_pipeline', feature_pipeline), ('model', LogisticRegression(C=0.001))]) le = LabelEncoder() y = df[['ever_cigarettes']].to_numpy() y = le.fit_transform(y.ravel()) X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2) final_pipeline.fit(X_train, y_train) # final_pipeline.score(X_test, y_test)
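# One way to address the "Process missing data in pipeline" TODO above (a sketch, not
# the original code): slot a SimpleImputer into each sub-pipeline before encoding.
from sklearn.impute import SimpleImputer

categorical_pipeline_imp = Pipeline(steps=[
    ('cat_selector', FeatureSelector(['Sex', 'Race', 'RuralUrban'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_enc', OneHotEncoder(sparse=False))])
numerical_pipeline_imp = Pipeline(steps=[
    ('num_selector', FeatureSelector(['Age'])),
    ('imputer', SimpleImputer(strategy='median'))])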
housing_num = housing.drop("ocean_proximity", axis=1) num_attribs = list(housing_num) cat_attribs = ["ocean_proximity"] num_pipeline = Pipeline([ ('selector', DataFrameSelector(num_attribs)), ('imputer', impute.SimpleImputer(strategy="median")), ('attribs_adder', CombinedAttributesAdder()), ('std_scaler', StandardScaler()), ]) cat_pipelines = Pipeline([ ('label_binarizer', DataFrameMapper([(cat_attribs, LabelBinarizer())])), ]) full_pipeline = FeatureUnion(transformer_list=[ ("num_pipeline", num_pipeline), ("cat_pipeline", cat_pipelines), ]) housing_prepared = full_pipeline.fit_transform(housing) #print(housing_prepared.shape) lin_reg = LinearRegression() lin_reg.fit(housing_prepared, housing_label) some_data = housing.iloc[:5] some_label = housing_label.iloc[:5] some_data_prepared = full_pipeline.transform(some_data) #print(lin_reg.predict(some_data_prepared)) #print(list(some_label)) housing_prediction = lin_reg.predict(housing_prepared)
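# A typical next step (a sketch, not part of the original snippet): measure the
# training RMSE of the linear model fitted above.
import numpy as np
from sklearn.metrics import mean_squared_error

lin_rmse = np.sqrt(mean_squared_error(housing_label, housing_prediction))
print(lin_rmse)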
def train(poems, nonpoems, quick=False): """ Train the model based on given training data :return: """ #nonpoems = nonpoems[::1] print(len(poems)) print(len(nonpoems)) all_train_data = poems + nonpoems all_train_target = [1] * len(poems) + [0] * len(nonpoems) all_train_data = [ textdata.replace('w', 'v').replace('W', 'V') for textdata in all_train_data ] tfidf = Pipeline([('vect', CountVectorizer(max_df=1.0, max_features=25400)), ('tfidf', TfidfTransformer())]) text_feats = Pipeline([ ('stats', TextStats()), # returns a list of dicts ('vect', DictVectorizer()), # list of dicts -> feature matrix ('norm', Normalizer(norm='l2')), ]) combined_feats = FeatureUnion([ ('text_feats', text_feats), ('word_freq', tfidf), ]) sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, n_iter=6, random_state=42) combined_clf = Pipeline([ ('features', combined_feats), ('clf', sgd), ]) if quick: gs_clf = GridSearchCV(combined_clf, {}) else: parameters = { # 'features__word_freq__vect__ngram_range': [(1, 1), (1, 2), (1, 3)], 'features__word_freq__vect__max_df': [1.0, 0.5], 'features__word_freq__vect__max_features': [None, 20000, 25000, 25200, 25400, 25600, 26000], 'features__text_feats__norm__norm': ('l1', 'l2', 'max'), 'clf__alpha': (1e-3, 1e-4, 1e-5, 1e-6), 'clf__penalty': ('l2', 'elasticnet'), 'clf__loss': ('hinge', 'log'), 'clf__n_iter': (4, 5, 6, 7, 8), } gs_clf = GridSearchCV(combined_clf, parameters, n_jobs=-1) gs_clf.fit(all_train_data, all_train_target) predicted = gs_clf.predict(all_train_data) print(np.average(predicted)) print('Final params: %s' % gs_clf.best_params_) print('Best score: %s' % gs_clf.best_score_) stop_words = gs_clf.best_estimator_.get_params()['features'].get_params( ).get('word_freq').named_steps['vect'].stop_words_ print('Number of generated stopwords: %s' % len(stop_words)) with open('generated_stopwords.txt', 'w', newline='') as fp: fp.write('\n'.join(sorted(stop_words))) print('Weights %s' % gs_clf.best_estimator_.named_steps['clf'].coef_[0][:4]) return gs_clf
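# A minimal sketch of a TextStats-style transformer (an assumption; the original class
# is not shown): it returns one dict of simple statistics per document so that the
# DictVectorizer step in text_feats can turn them into a feature matrix.
from sklearn.base import BaseEstimator, TransformerMixin

class TextStatsSketch(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, texts):
        return [{'length': len(t), 'num_lines': t.count('\n')} for t in texts]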
def extract_features(X, sfreq, selected_funcs, funcs_params=None, n_jobs=1, return_as_df=False): """Extraction of temporal or spectral features from epoched EEG signals. Parameters ---------- X : ndarray, shape (n_epochs, n_channels, n_times) Array of epoched EEG data. sfreq : float Sampling rate of the data. selected_funcs : list of str or tuples The elements of ``selected_features`` are either strings or tuples of the form ``(str, callable)``. If an element is of type ``str``, it is the alias of a feature function. The aliases are built from the feature functions' names by removing ``compute_``. For instance, the alias of the feature function :func:`compute_ptp_amp` is ``ptp_amp``. (See the documentation of mne-features). If an element is of type ``tuple``, the first element of the tuple should be a string (name/alias given to a user-defined feature function) and the second element should be a callable (a user-defined feature function which accepts Numpy arrays with shape ``(n_channels, n_times)``). The names/aliases given to user-defined feature functions should not intersect the aliases used by mne-features. If the name given to a user-defined feature function is already used as an alias in mne-features, an error will be raised. funcs_params : dict or None (default: None) If not None, dict of optional parameters to be passed to the feature functions. Each key of the ``funcs_params`` dict should be of the form: ``[alias_feature_function]__[optional_param]`` (for example: ``higuchi_fd__kmax``). n_jobs : int (default: 1) Number of CPU cores used when parallelizing the feature extraction. If given a value of -1, all cores are used. return_as_df : bool (default: False) If True, the extracted features will be returned as a Pandas DataFrame. The column index is a MultiIndex (see :class:`~pandas.MultiIndex`) which contains the alias of each feature function which was used. If False, the features are returned as a 2d Numpy array. Returns ------- array-like, shape (n_epochs, n_features) """ if sfreq <= 0: raise ValueError('Sampling rate `sfreq` must be positive.') univariate_funcs = get_univariate_funcs(sfreq) bivariate_funcs = get_bivariate_funcs(sfreq) feature_funcs = univariate_funcs.copy() feature_funcs.update(bivariate_funcs) sel_funcs = _check_funcs(selected_funcs, feature_funcs) # Feature extraction n_epochs = X.shape[0] _tr = [(n, FeatureFunctionTransformer(func=func)) for n, func in sel_funcs] extractor = FeatureUnion(transformer_list=_tr) if funcs_params is not None: extractor.set_params(**funcs_params) res = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(_apply_extractor)(extractor, X[j, :, :], return_as_df) for j in range(n_epochs)) feature_names = res[0][1] res = list(zip(*res))[0] Xnew = np.vstack(res) if return_as_df: return _format_as_dataframe(Xnew, feature_names) else: return Xnew
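# Hypothetical call of extract_features above on synthetic epochs (shapes only; the
# 'mean' and 'ptp_amp' aliases are assumed to be available in mne-features).
import numpy as np

rng = np.random.RandomState(42)
X_epochs = rng.randn(10, 4, 256)        # (n_epochs, n_channels, n_times)
feats = extract_features(X_epochs, sfreq=128., selected_funcs=['mean', 'ptp_amp'])
print(feats.shape)                      # (n_epochs, n_features)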
vectorizer = FeatureUnion([ ('name', Pipeline([('select', ItemSelector('name', start_time=start_time)), ('transform', HashingVectorizer(ngram_range=(1, 2), n_features=2**27, norm='l2', lowercase=False, stop_words=stopwords)), ('drop_cols', DropColumnsByDf(min_df=2))])), ('category_name', Pipeline([ ('select', ItemSelector('category_name', start_time=start_time)), ('transform', HashingVectorizer(ngram_range=(1, 1), token_pattern='.+', tokenizer=split_cat, n_features=2**27, norm='l2', lowercase=False)), ('drop_cols', DropColumnsByDf(min_df=2)) ])), ('brand_name', Pipeline([ ('select', ItemSelector('brand_name', start_time=start_time)), ('transform', CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)), ])), ('gencat_cond', Pipeline([ ('select', ItemSelector('gencat_cond', start_time=start_time)), ('transform', CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)), ])), ('subcat_1_cond', Pipeline([ ('select', ItemSelector('subcat_1_cond', start_time=start_time)), ('transform', CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)), ])), ('subcat_2_cond', Pipeline([ ('select', ItemSelector('subcat_2_cond', start_time=start_time)), ('transform', CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)), ])), ('has_brand', Pipeline([('select', ItemSelector('has_brand', start_time=start_time)), ('ohe', OneHotEncoder())])), ('shipping', Pipeline([('select', ItemSelector('shipping', start_time=start_time)), ('ohe', OneHotEncoder())])), ('item_condition_id', Pipeline([('select', ItemSelector('item_condition_id', start_time=start_time)), ('ohe', OneHotEncoder())])), ('item_description', Pipeline([ ('select', ItemSelector('item_description', start_time=start_time)), ('hash', HashingVectorizer(ngram_range=(1, 3), n_features=2**27, dtype=np.float32, norm='l2', lowercase=False, stop_words=stopwords)), ('drop_cols', DropColumnsByDf(min_df=2)), ])) ], n_jobs=1)
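# A plausible split_cat tokenizer for the 'category_name' branch above (an assumption;
# the original helper is not shown): split the slash-delimited category path into its
# levels so each level becomes a token.
def split_cat_sketch(text):
    return text.split("/") if isinstance(text, str) else ["missing"]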
#Romney_Test_dataset_NO_Label.csv obama_output = 'Mayank_Raj_Chinmay_Nautiyal_Obama.txt' romney_output = 'Mayank_Raj_Chinmay_Nautiyal_Romney.txt' #test_data = pd.read_csv("Obama_Test_dataset_NO_Label.csv", encoding = "ISO-8859-1") test_data = pd.read_csv("Romney_Test_dataset_NO_Label.csv", encoding = "ISO-8859-1") test_data_fresh = test_data[['Tweet_ID', 'Tweet_text']] test_data_fresh.head(10) textcountscols = ['count_capital_words','count_emojis','count_excl_quest_marks','count_hashtags' ,'count_mentions','count_urls','count_words'] features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols)) , ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text')) , ('vect', CountVectorizer(max_df=0.25, min_df=2, ngram_range=(1,3)))]))] , n_jobs=-1) pipeline = Pipeline([ ('features', features) , ('clf', LogisticRegression(C=1, penalty='l2')) ]) #best_model = pipeline.fit(df_model.drop('classes', axis=1), df_model.classes) best_model = pipeline.fit(df2_model.drop('classes', axis=1), df2_model.classes) df_counts_pos = tc.transform(test_data_fresh["Tweet_text"]) df_clean_pos = ct.transform(test_data_fresh["Tweet_text"]) df_model_pos = df_counts_pos df_model_pos['clean_text'] = df_clean_pos predictions = best_model.predict(df_model_pos).tolist() final_result = pd.DataFrame({'id':test_data_fresh['Tweet_ID'],'label':predictions})
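# Hypothetical final step for the snippet above (the submission format is assumed, not
# specified in the source): write the id/label pairs to the Romney output file named
# earlier.
final_result.to_csv(romney_output, sep=';', index=False, header=False)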
start = time.time() encoded_case = bucket_encoder.fit_transform(dt_test_bucket) _, knn_idxs = bucketer.kneighbors(encoded_case) knn_idxs = knn_idxs[0] relevant_cases_bucket = encoded_train.iloc[knn_idxs].index dt_train_bucket = dataset_manager.get_relevant_data_by_indexes( dt_train_prefixes, relevant_cases_bucket) # one row per event train_y = dataset_manager.get_label_numeric(dt_train_bucket) if len(set(train_y)) < 2: preds_all.append(train_y[0]) else: feature_combiner = FeatureUnion([ (method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods ]) if cls_method == "rf": cls = RandomForestClassifier( n_estimators=500, max_features=args['max_features'], random_state=random_state) elif cls_method == "xgboost": cls = xgb.XGBClassifier( objective='binary:logistic', n_estimators=500, learning_rate=args['learning_rate'], subsample=args['subsample'],
]) # To make a pipeline from all of our pipelines, we do the same thing, but now we use a FeatureUnion to join the feature processing pipelines. # # The syntax is the same as a regular pipeline: it's just an array of tuples, in the (name, object) format. # # The feature union itself is not a pipeline, it's just a union, so you need to do *one more step* to make it usable: pass it to a pipeline, with the same structure, an array of tuples in the simple (name, object) format. As you can see, we get a pipeline-ception going on as things get more complex! # # You can then apply all those transformations at once with a single fit, transform, or fit_transform call. Nice, right? # In[8]: from sklearn.pipeline import FeatureUnion feats = FeatureUnion([('text', text), ('length', length), ('words', words), ('words_not_stopword', words_not_stopword), ('avg_word_length', avg_word_length), ('commas', commas)]) feature_processing = Pipeline([('feats', feats)]) feature_processing.fit_transform(X_train) # To add a model to the mix and generate predictions as well, you can add a model at the end of the pipeline. The syntax is, you guessed it, an array of tuples, merging the transformations with a model. # # We can see the raw accuracy is at 63%. Not bad for a start. # # In[12]: from sklearn.ensemble import RandomForestClassifier pipeline = Pipeline([
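# A sketch of the model-bearing pipeline the text above describes (the original cell is
# cut off here, so the classifier settings are assumed rather than taken from the source):
pipeline_sketch = Pipeline([
    ('feats', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])
# pipeline_sketch.fit(X_train, y_train)
# predictions = pipeline_sketch.predict(X_test)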
super(LabelBinarizerPipelineFriendly, self).fit(X) def transform(self, X, y=None): return super(LabelBinarizerPipelineFriendly, self).transform(X) def fit_transform(self, X, y=None): return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X) num_attribs = list(housing_num) cat_attribs = ["ocean_proximity"] num_pipeline_2 = Pipeline([ ('selector', DataFrameSelector(num_attribs)), ('inputer', Imputer(strategy="median")), ('attribs_adder', CombinedAttributesAdder()), ('std_scaler', StandardScaler()), ]) cat_pipline = Pipeline([ ('selector', DataFrameSelector(cat_attribs)), ('label_binarizer', LabelBinarizerPipelineFriendly()), ]) full_pipeline = FeatureUnion(transformer_list=[ ('num_pipeline', num_pipeline_2), ('cat_pipeline', cat_pipline), ]) housing_prepared = full_pipeline.fit_transform(housing)
def test_feature_union(): # basic sanity check for feature union iris = load_iris() X = iris.data X -= X.mean(axis=0) y = iris.target pca = RandomizedPCA(n_components=2, random_state=0) select = SelectKBest(k=1) fs = FeatureUnion([("pca", pca), ("select", select)]) fs.fit(X, y) X_transformed = fs.transform(X) assert_equal(X_transformed.shape, (X.shape[0], 3)) # check if it does the expected thing assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) # test if it also works for sparse input # We use a different pca object to control the random_state stream fs = FeatureUnion([("pca", pca), ("select", select)]) X_sp = sparse.csr_matrix(X) X_sp_transformed = fs.fit_transform(X_sp, y) assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) # test setting parameters fs.set_params(select__k=2) assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4)) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)]) X_transformed = fs.fit_transform(X, y) assert_equal(X_transformed.shape, (X.shape[0], 8))
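# A minimal stand-in for the TransfT mock used above (an assumption; its real definition
# is not shown): a transformer that defines fit/transform but no fit_transform of its
# own, so FeatureUnion must call the two methods separately.
from sklearn.base import BaseEstimator

class TransfTSketch(BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X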