def train_model(trainset):
    word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), binary=False,
                                  max_features=2000, min_df=1, decode_error="ignore")
    # print(word_vector)
    print("works fine")
    char_vector = TfidfVectorizer(analyzer="char", ngram_range=(2, 3), binary=False,
                                  max_features=2000, min_df=1, decode_error="ignore")
    vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])

    corpus = []
    classes = []
    for item in trainset:
        corpus.append(item['text'])
        classes.append(item['label'])

    print("Training instances : ", 0.8 * len(classes))
    print("Testing instances : ", 0.2 * len(classes))

    matrix = vectorizer.fit_transform(corpus)
    print("feature count : ", len(vectorizer.get_feature_names()))
    print("training model")

    X = matrix.toarray()
    y = numpy.asarray(classes)
    model = LinearSVC()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        test_size=0.2, random_state=0)
    y_pred = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test)

    # y_prob = OneVsRestClassifier(model).fit(X_train, y_train).decision_function(X_test)
    # print(y_prob)
    # con_matrix = []
    # for row in range(len(y_prob)):
    #     temp = [y_pred[row]]
    #     for prob in y_prob[row]:
    #         temp.append(prob)
    #     con_matrix.append(temp)
    # for row in con_matrix:
    #     output.write(str(row) + "\n")
    # print(y_pred)
    # print(y_test)

    # collect the labels that get confused with 'anonEdited' in either direction
    res1 = [i for i, j in enumerate(y_pred) if j == 'anonEdited']
    res2 = [i for i, j in enumerate(y_test) if j == 'anonEdited']
    reset = []
    for r in res1:
        if y_test[r] != "anonEdited":
            reset.append(y_test[r])
    for r in res2:
        if y_pred[r] != "anonEdited":
            reset.append(y_pred[r])

    output = open(sys.argv[2], "w")
    for suspect in reset:
        output.write(str(suspect) + "\n")

    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    pl.matshow(cm)
    pl.title('Confusion matrix')
    pl.colorbar()
    pl.ylabel('True label')
    pl.xlabel('Predicted label')
    pl.show()
    print(accuracy_score(y_pred, y_test))
def concat_feature_extractors(train_data, labels):
    # This dataset is way too high-dimensional. Better do PCA:
    pca = PCA(n_components=2)

    # Maybe some original features were good, too?
    selection = SelectKBest(k=1)

    # Build estimator from PCA and Univariate selection:
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

    # Use combined features to transform dataset:
    X_features = combined_features.fit(train_data, labels).transform(train_data)

    # Classify:
    svm = SVC(kernel="linear")
    svm.fit(X_features, labels)

    # Do grid search over k, n_components and C:
    pipeline = Pipeline([("features", combined_features), ("svm", svm)])

    param_grid = dict(features__pca__n_components=[1, 2, 3],
                      features__univ_select__k=[1, 2],
                      svm__C=[0.1, 1, 10])

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
    grid_search.fit(train_data, labels)
    print(grid_search.best_estimator_)
def test_feature_union(self):
    """Tests that combining multiple featurizers works as expected"""
    modules = ["bag-of-words", "entities"]
    modules_list, _ = modules_to_dictionary(modules)
    feature_union = FeatureUnion(modules_list)
    feature_union.fit(texts_entities, outcomes)
    feature_union.transform(["unknown"])
def testSVC(lbda=1.0, n_components=20, kbest=4):
    otto = load_otto()
    X = otto.data
    y = otto.target
    # X = otto.data[:10000, :10]
    # y = otto.target[:10000]
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=kbest)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    X_features = combined_features.fit(X, y).transform(X)

    svc = SVC(C=1.0 / lbda, kernel='rbf', cache_size=400, probability=True)
    pipe = Pipeline(steps=[('features', combined_features), ('svc', svc)])

    trainData = X
    trainTarget = y
    pipe.fit(trainData, trainTarget)

    test_otto = load_testotto()
    testData = test_otto.data
    testData = scaler.transform(testData)

    # save the prediction
    prediction = pipe.predict_proba(testData)
    proba = pipe.predict_proba(testData)
    save_submission(lbda, proba, prediction)
def best_estimator(self, X, y):
    try:
        pca = PCA(n_components=2)
        selection = SelectKBest(k=2)
        combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
        X_features = combined_features.fit(X, y).transform(X)
        regr = linear_model.LassoCV()
        pipeline = Pipeline([("features", combined_features), ("regression", regr)])
        if 'batter' in self.player:
            param_grid = dict(features__pca__n_components=[1, 2, 3],
                              features__univ_select__k=[1, 2])
        else:
            param_grid = dict(features__pca__n_components=[1, 2, 3],
                              features__univ_select__k=[1, 2])
        grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=100)
        grid_search.fit(X, y)
        self.modelled = True
        regr = grid_search
        return regr
    except ValueError as e:
        print(e)
        self.modelled = False
        return None
def prediction(train_df, test_df, MODEL):
    print("... start prediction")
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, verbose=1)
    clf.fit(train_X, train_y)
    print(clf.best_score_)

    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_, name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s.csv" % MODEL
        coef_df.to_csv(coeffile)

    print("... start y_pred")
    test_X = fu_obj.transform(test_df)
    y_pred = clf.predict(test_X)
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
def trainItalianSexClassifier(self):
    # get correct labels from dictionary in trainY and testY
    trainX = self.italianTrainData[0]
    trainY = self.getYlabels(self.italianTrainData[1], 'sex')

    combined_features = FeatureUnion([("tfidf", TfidfVectorizer()),
                                      ("ngrams", TfidfVectorizer(ngram_range=(3, 3),
                                                                 analyzer="char")),
                                      ("counts", CountVectorizer()),
                                      ("latin", Latin()),
                                      ],
                                     transformer_weights={
                                         'latin': 1,
                                         'tfidf': 2,
                                         'ngrams': 2,
                                         'counts': 1,
                                     })

    X_features = combined_features.fit(trainX, trainY).transform(trainX)

    classifier = svm.LinearSVC()
    pipeline = Pipeline([("features", combined_features), ("classifier", classifier)])
    pipeline.fit(trainX, trainY)

    return pipeline
def best_estimator(self, X, y):
    try:
        pca = PCA(n_components=2)
        selection = SelectKBest(k=2)
        combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
        X_features = combined_features.fit(X, y).transform(X)
        regr = linear_model.LassoCV()
        pipeline = Pipeline([("features", combined_features), ("regression", regr)])
        if 'batter' in self.player:
            param_grid = dict(features__pca__n_components=[1],
                              features__univ_select__k=[1])
        else:
            param_grid = dict(features__pca__n_components=[1, 2, 3, 4],
                              features__univ_select__k=[1, 2, 3, 4])
        grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=0)
        grid_search.fit(X, y)
        self.modelled = True
        regr = grid_search
        # Ian: should do R2 on predicted points vs. points on a given day
        self.R2 = r2_score(self.target_matrix, regr.predict(self.feature_matrix))
        return regr
    except ValueError as e:
        print(e)
        self.modelled = False
        return None
def rbf_kernels(env, n_samples=100000, gamma=[0.01, 0.1], n_components=100):
    """Represent observation samples using RBF-kernels.

    EXAMPLE
    -------
    >>> env = gym.make('MountainCar-v0')
    >>> n_params, rbf = rbf_kernels(env, n_components=100)
    >>> sample = env.observation_space.sample().reshape((1, env.observation_space.shape[0]))
    >>> rbf(sample).shape
    (1, 100)
    """
    observation_examples = np.array([env.observation_space.sample() for _ in range(n_samples)])

    # Fit feature scaler
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Fit feature extractor
    features = []
    for g in gamma:
        features.append(('gamma={}'.format(g),
                         RBFSampler(n_components=n_components // len(gamma), gamma=g)))
    features = FeatureUnion(features)
    features.fit(scaler.transform(observation_examples))

    def _rbf_kernels(observation):
        return features.transform(scaler.transform(observation))

    return _rbf_kernels
def pca_kpca(train_data, labels):
    # make_union already returns a FeatureUnion, so use it directly; wrapping its
    # result in another FeatureUnion would fail, because FeatureUnion expects a
    # list of (name, transformer) tuples.
    combined = make_union(PCA(), TruncatedSVD(), KernelPCA())
    # combined = FeatureUnion([('linear_pca', PCA()), ('kernel_pca', KernelPCA())])
    combined.fit(train_data, labels)
    # combined.fit_transform(train_data, labels)
    return combined
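# Hedged aside (not part of the original snippet): make_union is a convenience
# wrapper that builds a FeatureUnion and derives the transformer names from the
# lower-cased class names, so the two forms below should be interchangeable.
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import FeatureUnion, make_union

union_a = make_union(PCA(n_components=2), KernelPCA(n_components=2))
union_b = FeatureUnion([("pca", PCA(n_components=2)),
                        ("kernelpca", KernelPCA(n_components=2))])
# Both expose the same transformer_list interface:
print([name for name, _ in union_a.transformer_list])  # ['pca', 'kernelpca']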
def fit(self, X, y=None):
    Trans2 = Q2Transformer()
    Trans3 = Q3Transformer()
    Trans4 = Q4Transformer()
    combined_features = FeatureUnion([("Q2", Trans2), ("Q3", Trans3), ("Q4", Trans4)])
    # Store the fitted union on its own attribute; assigning it to self.fit
    # would shadow this method.
    self.combined_features_ = combined_features.fit(X)
    return self
def testLogistic(lbda=1.0, n_components=20, kbest=4):
    # X = otto.data[:1000, :20]
    # y = otto.target[:1000]
    otto = load_otto()
    X = otto.data[:, :]
    y = otto.target[:]
    # n_components = 20
    # kbest = 4
    # print('y.shape =', y.shape)
    scalar = StandardScaler().fit(X)
    X = scalar.transform(X)

    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=kbest)
    combined_features = FeatureUnion([("pca", pca), ('univ_select', selection)])
    X_features = combined_features.fit(X, y).transform(X)

    logistic = LogisticRegression(C=1.0 / lbda)
    pipe = Pipeline(steps=[('features', combined_features), ('logistic', logistic)])

    trainData = X
    trainTarget = y
    pipe.fit(trainData, trainTarget)
    # print(trainTarget)

    test_otto = load_testotto()
    testData = test_otto.data
    testData = scalar.transform(testData)
    # logging.debug('lambda=%.3f: score is %.3f' % (lbda, pipe.score()))

    # save the prediction
    prediction = pipe.predict_proba(testData)
    proba = pipe.predict_proba(testData)
    save_submission(lbda, proba, prediction)
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
def convert_testdata(test_gray_data, feature_rule=f.feature_transformer_rule):
    data_df = f.make_test_df(test_gray_data)
    fu = FeatureUnion(transformer_list=feature_rule)
    Std = preprocessing.StandardScaler()

    X_test = fu.fit_transform(data_df)
    # X_test = Std.fit_transform(X_test)

    return X_test
def get_pca_transformer(train_x, train_y, n_components=-1):
    if n_components == -1:
        n_components = int(np.ceil(np.sqrt(train_x.shape[1])))

    pca = PCA(n_components=n_components)
    # SelectKBest expects an integer k, so use floor division
    selection = SelectKBest(k=n_components // 2)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    return combined_features.fit(train_x, train_y)
def fit_logreg(self): tokenize_sense = CachedFitTransform(Pipeline([ ('tokenize', Map(compose(tokenize, normalize_special, unescape))), ('normalize', MapTokens(normalize_elongations)), ]), self.memory) features = FeatureUnion([ # ('w2v_doc', ToCorporas(Pipeline([ # ('tokenize', MapCorporas(tokenize_sense)), # ('feature', MergeSliceCorporas(Doc2VecTransform(CachedFitTransform(Doc2Vec( # dm=0, dbow_words=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, # workers=16 # ), self.memory)))), # ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs]))), # ('w2v_word_avg', Pipeline([ # ('tokenize', tokenize_sense), # ('feature', Word2VecAverage(CachedFitTransform(Word2Vec( # sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16 # ), self.memory))), # ]).fit(self.unsup_docs[:10**6])), # ('w2v_word_avg_google', Pipeline([ # ('tokenize', tokenize_sense), # ('feature', Word2VecAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))), # ])), # ('w2v_word_norm_avg', Pipeline([ # ('tokenize', tokenize_sense), # ('feature', Word2VecNormAverage(CachedFitTransform(Word2Vec( # sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16 # ), self.memory))), # ]).fit(self.unsup_docs[:10**6])), ('w2v_word_norm_avg_google', Pipeline([ ('tokenize', tokenize_sense), ('feature', Word2VecNormAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))), ])), # ('w2v_word_max', Pipeline([ # ('tokenize', tokenize_sense), # ('feature', Word2VecMax(CachedFitTransform(Word2Vec( # sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16 # ), self.memory))), # ]).fit(self.unsup_docs[:10**6])), # ('w2v_word_max_google', Pipeline([ # ('tokenize', tokenize_sense), # ('feature', Word2VecMax(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))), # ])), # ('w2v_word_inv', ToCorporas(Pipeline([ # ('tokenize', MapCorporas(tokenize_sense)), # ('feature', MergeSliceCorporas(Word2VecInverse(CachedFitTransform(Word2Vec( # sg=1, size=100, window=10, hs=0, negative=5, sample=0, min_count=1, iter=20, workers=16 # ), self.memory)))), # ]).fit([self.train_docs, self.unsup_docs[:10**5], self.val_docs, self.test_docs]))), ]) classifier = LogisticRegression() with temp_log_level({'gensim.models.word2vec': logging.INFO}): classifier.fit(features.transform(self.train_docs), self.train_labels()) estimator = Pipeline([('features', features), ('classifier', classifier)]) return 'logreg({})'.format(','.join(name for name, _ in features.transformer_list)), estimator
def test_feature_union(): # basic sanity check for feature union iris = load_iris() X = iris.data X -= X.mean(axis=0) y = iris.target pca = RandomizedPCA(n_components=2, random_state=0) select = SelectKBest(k=1) fs = FeatureUnion([("pca", pca), ("select", select)]) fs.fit(X, y) X_transformed = fs.transform(X) assert_equal(X_transformed.shape, (X.shape[0], 3)) # check if it does the expected thing assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) # test if it also works for sparse input # We use a different pca object to control the random_state stream fs = FeatureUnion([("pca", pca), ("select", select)]) X_sp = sparse.csr_matrix(X) X_sp_transformed = fs.fit_transform(X_sp, y) assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) # test setting parameters fs.set_params(select__k=2) assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
def set_traindata(df, key):
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()
    X = fu.fit_transform(df)
    y = np.concatenate(df["label"].apply(lambda x: x.flatten()))
    X = Std.fit_transform(X)
    return (X, y)
def cv_score(train_df, MODEL):
    print("... start cross validation")
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=-1, scoring=rmspe, cv=None)
    print(cross_val_score(clf, train_X, train_y, scoring=rmspe, cv=5, n_jobs=3))
def pca(x, y, test_x, n_features=-1):
    if n_features == -1:
        n_features = int(np.ceil(np.sqrt(x.shape[1])))

    pca = PCA(n_components=n_features)
    # SelectKBest expects an integer k, so use floor division
    selection = SelectKBest(k=n_features // 2)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    combined_features.fit(x, y)
    return combined_features.transform(x), combined_features.transform(test_x)
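# Hedged usage sketch (not from the original source; assumes the module-level
# imports used by pca() above are in scope): the helper fits the FeatureUnion on
# the training split only and reuses it to transform the held-out split, so the
# two outputs share the same column layout.
import numpy as np

rng = np.random.RandomState(0)
x_demo = rng.rand(100, 16)
y_demo = rng.randint(0, 2, size=100)
test_demo = rng.rand(20, 16)

x_feat, test_feat = pca(x_demo, y_demo, test_demo)
print(x_feat.shape, test_feat.shape)  # same number of columns in both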
def prediction(train_df, test_df, MODEL): print "... start prediction" fu_obj = FeatureUnion(transformer_list=features.feature_list) train_df = train_df[(train_df["Open"] == 1) & (train_df["Sales"] > 0)] train_X = fu_obj.fit_transform(train_df) train_y = np.log1p(train_df["Sales"]).as_matrix() train_dump_df = pd.DataFrame(train_X, columns=get_split_feature_list(fu_obj)) train_dump_df["target"] = train_y train_dump_df = train_dump_df.dropna(axis=0) print train_dump_df.shape train_X = train_dump_df[get_split_feature_list(fu_obj)].values train_y = train_dump_df["target"].values train_dump_df["ID"] = -1 train_dump_df.to_csv(SUBMISSION + "train_dump.csv", index=False) test_X = fu_obj.transform(test_df) test_dump_df = pd.DataFrame(test_X, columns=get_split_feature_list(fu_obj)) print (test_dump_df == 0).sum(axis=0) test_dump_df["ID"] = test_df["Id"] test_dump_df.to_csv(SUBMISSION + "test_dump.csv", index=False) if MODEL == "XGB": train_X, valid_X, train_y, valid_y =\ train_test_split(train_X, train_y, test_size=0.05) fit_param = {"eval_set": [(train_X, train_y), (valid_X, valid_y)], "eval_metric": rmspe_xg, "early_stopping_rounds": 100} clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"], param_grid=clf_dict[MODEL]["paramteters"], n_jobs=3, scoring=rmspe, verbose=1, fit_params=fit_param) else: clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"], param_grid=clf_dict[MODEL]["paramteters"], n_jobs=3, scoring=rmspe, verbose=1) clf.fit(train_X, train_y) print clf.best_score_ index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature") if hasattr(clf.best_estimator_, "coef_"): coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef") coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature") coeffile = SUBMISSION + "coef_%s.csv" % MODEL coef_df.to_csv(coeffile) if hasattr(clf.best_estimator_, "feature_importances_"): coef_sr = pd.Series(clf.best_estimator_.feature_importances_, name="Importance") coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature") coeffile = SUBMISSION + "importance_%s.csv" % MODEL coef_df.to_csv(coeffile) print "... start y_pred" y_pred = np.expm1(clf.predict(test_X)) pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"]) submissionfile = SUBMISSION + "submission_%s.csv" % MODEL pred_sr.to_csv(submissionfile, header=True, index_label="ID")
def convert_traindata(train_gray_data, labels):
    data_df = f.make_data_df(train_gray_data, labels)
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()
    X_train = fu.fit_transform(data_df)
    y_train = np.concatenate(data_df["label"].apply(lambda x: x.flatten()))
    X_train = Std.fit_transform(X_train)
    return X_train, y_train
def get_data():
    '''
    get X, y data

    :rtype: tuple
    '''
    _, _, _, train_gray_data, _, _, labels = i_p.load_data()
    data_df = f.make_data_df(train_gray_data, labels)
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    X = fu.fit_transform(data_df)
    y = np.concatenate(data_df["label"].apply(lambda x: x.flatten()))
    return (X, y)
def train_model(trainset, testset):
    word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), binary=False,
                                  max_features=2000, min_df=1, decode_error="ignore")
    # print(word_vector)
    # print("works fine")
    char_vector = TfidfVectorizer(analyzer="char", ngram_range=(2, 3), binary=False,
                                  max_features=2000, min_df=1, decode_error="ignore")
    vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])

    corpus = []
    classes = []
    testclasses = []
    testcorpus = []
    for item in trainset:
        corpus.append(item['text'])
        classes.append(item['label'])
    for item in testset:
        testcorpus.append(item['text'])
        testclasses.append(item['label'])
    # print("Training instances : ", len(classes))
    # print("Testing instances : ", len(set(classes)))

    matrix = vectorizer.fit_transform(corpus)
    # transform (not fit_transform) the test corpus so the test features use the
    # vocabulary learned from the training corpus
    testmatrix = vectorizer.transform(testcorpus)
    # print("feature count : ", len(vectorizer.get_feature_names()))
    # print("training model")

    X = matrix.toarray()
    TX = testmatrix.toarray()
    Ty = numpy.asarray(testclasses)
    y = numpy.asarray(classes)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9999,
                                                        test_size=0.00001, random_state=0)

    model = LinearSVC(dual=True, loss='hinge')  # this loss was spelled 'l1' in older scikit-learn
    # model = SVC()
    # model = NuSVC()
    # model = RandomForestClassifier()
    # scores = cross_validation.cross_val_score(model, X, y)
    # print("Accuracy " + str(scores.mean()))

    y_prob = model.fit(X_train, y_train).predict(TX)
    # y_prob = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test)
    # print(y_prob)
    # cm = confusion_matrix(y_test, y_pred)
    # cr = classification_report(y_test, y_pred)
    # print(cr)
    # print(cm)
    # pl.matshow()
    # pl.title('Confusion matrix#')
    # pl.colorbar()
    # pl.ylabel('True label')
    # pl.xlabel('Predicted label')
    # pl.show()

    print(accuracy_score(y_prob, Ty))
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(AttributeError,
                         'Transformer tr1 (type Transf) does not provide '
                         'get_feature_names',
                         ft.get_feature_names)
class MuscleClassifier():

    def __init__(self, auto_load=True):
        """ Initializes our MuscleClassifier. Option to preload it or start from a fresh model """

        # =====[ If auto_load, then we rehydrate our existing models ]=====
        if auto_load:
            self.model = pickle.load(open('modules/pickled/muscle_classifier.p', 'rb'))
            self.le = pickle.load(open('modules/pickled/muscle_classifier_le.p', 'rb'))
            self.vectorizer = pickle.load(open('modules/pickled/muscle_classifier_vectorizer.p', 'rb'))
        else:
            self.model = BernoulliNB()

    def train(self, muscle_groups, labels):
        """ Vectorizes raw input and trains our classifier """

        # =====[ Instantiate label encoder to turn text labels into ints ]=====
        self.le = preprocessing.LabelEncoder()

        # =====[ Declare vectorizers and merge them via a FeatureUnion ]=====
        char_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(3, 8),
                                                           analyzer='char', encoding='utf-8')
        word_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(1, 5),
                                                           analyzer='word', encoding='utf-8')
        self.vectorizer = FeatureUnion([('char', char_vzr), ('word', word_vzr)])

        # =====[ Transform our input and labels ]=====
        X = self.vectorizer.fit_transform(muscle_groups).toarray()
        Y = self.le.fit_transform(labels)

        # =====[ Fit our model and then run inference on training data ]=====
        self.model.fit(X, Y)
        y = self.model.predict(X)

        # =====[ Report training accuracy (fraction of correct predictions) ]=====
        print("Training Accuracy: %f " % (sum(y == Y) / float(len(Y))))

    def predict(self, exercises):
        """ Takes in raw input, vectorizes it, and reports back predicted muscle group """

        X = self.vectorizer.transform(exercises).toarray()
        y = self.model.predict(X)
        return self.le.classes_[y]
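# Hedged usage sketch (the exercise/muscle-group data below is illustrative, not
# from the original repo): train a fresh MuscleClassifier and query it.
clf = MuscleClassifier(auto_load=False)   # skip the pickled models, start fresh
clf.train(["bench press", "squat", "deadlift", "overhead press"],
          ["chest", "legs", "back", "shoulders"])
print(clf.predict(["incline bench press"]))  # expected to resemble one of the trained labels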
def test_feature_union_weights(): # test feature union with transformer weights iris = load_iris() X = iris.data y = iris.target pca = RandomizedPCA(n_components=2, random_state=0) select = SelectKBest(k=1) # test using fit followed by transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) fs.fit(X, y) X_transformed = fs.transform(X) # test using fit_transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) X_fit_transformed = fs.fit_transform(X, y) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)], transformer_weights={"mock": 10}) X_fit_transformed_wo_method = fs.fit_transform(X, y) # check against expected result # We use a different pca object to control the random_state stream assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def make_checkdata(mode="df"): fu = FeatureUnion(transformer_list=f.feature_transformer_rule) Std = preprocessing.StandardScaler() _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data() train_keys = train_gray_data.keys()[:2] train_inputs = {} train_labels = {} for i in xrange(len(train_keys)): input_ = train_gray_data[train_keys[i]] label = labels[train_keys[i]] train_inputs.update({train_keys[i]:input_}) train_labels.update({train_keys[i]:label}) test_keys = test_gray_data.keys()[:2] test_inputs = {} for i in xrange(len(test_keys)): input_ = test_gray_data[test_keys[i]] test_inputs.update({test_keys[i]:input_}) train_df = f.make_data_df(train_inputs, train_labels) test_df = f.make_test_df(test_inputs) if mode == "df": train_df = train_df.reset_index() test_df = test_df.reset_index() train_df.columns = ["pngname", "input", "label"] test_df.columns = ["pngname", "input"] return train_df, train_keys, test_df, test_keys elif mode == "feature": X_train = fu.fit_transform(train_df) X_train = Std.fit_transform(X_train) y_train = np.concatenate(train_df["label"].apply(lambda x: x.flatten())) X_test = fu.fit_transform(test_df) X_test = Std.fit_transform(X_test) return X_train, y_train, X_test
def test_feature_stacker_weights():
    # test feature stacker with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # check against expected result
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
def ageClassifier(doc, age):
    """ A function that trains an age classifier """
    xTrain = doc
    yTrain = age

    unionOfFeatures = FeatureUnion([
        ('normaltfidf', TfidfVectorizer(preprocessor=identity, tokenizer=identity)),
        ('bigrams', TfidfVectorizer(preprocessor=identity, tokenizer=identity,
                                    ngram_range=(3, 3), analyzer='char')),
        ('counts', CountVectorizer(preprocessor=identity, tokenizer=identity))
    ])

    featureFit = unionOfFeatures.fit(xTrain, yTrain).transform(xTrain)
    classifier = Pipeline([('featureunion', unionOfFeatures),
                           ('cls', svm.SVC(kernel='linear', C=1.5))])
    classifier.fit(xTrain, yTrain)

    return classifier
class AICEnsemble(BaseEstimator, ClassifierMixin): def __init__(self, candidateFeatures: List[CandidateFeature], classifier): self.candidateFeatures = candidateFeatures self.classifier = classifier self.ensemble_pipeline = FeatureUnion( transformer_list=[(str(c), self.generate_pipeline(c)) for c in candidateFeatures]) # calculate weights self.AICc = np.array([ np.min( c.runtime_properties['additional_metrics']['AICc_complexity']) for c in candidateFeatures ]) #self.AICc = [np.mean(c.runtime_properties['additional_metrics']['AICc_complexity']) for c in candidateFeatures] delta_i = self.AICc - np.min(self.AICc) summed = np.sum( np.array([np.exp(-delta_r / 2.0) for delta_r in delta_i])) self.weights = np.array( [np.exp(-d_i / 2.0) / summed for d_i in delta_i]) print(candidateFeatures) print(self.weights) def generate_pipeline(self, rep): best_hyperparameters = rep.runtime_properties['hyperparameters'] all_keys = list(best_hyperparameters.keys()) for k in all_keys: if 'classifier__' in k: best_hyperparameters[k[12:]] = best_hyperparameters.pop(k) my_pipeline = Pipeline([ (str(rep) + '_f', rep.pipeline), (str(rep) + '_c', ClassifierTransformer(self.classifier(**best_hyperparameters))) ]) return my_pipeline def fit(self, X, y=None): self.ensemble_pipeline.fit(X, y) return self def predict_proba(self, X): ensemble_predictions = self.ensemble_pipeline.transform(X) print(ensemble_predictions) print(ensemble_predictions.shape) #weight these predictions weighted_predictions = np.multiply(ensemble_predictions, self.weights) averaged_predictions = np.sum(weighted_predictions, axis=1) averaged_predictions_proba = np.zeros( (averaged_predictions.shape[0], 2)) averaged_predictions_proba[:, 0] = averaged_predictions averaged_predictions_proba[:, 1] = 1.0 - averaged_predictions return averaged_predictions_proba def predict(self, X): return self.predict_proba(X)[:, 0] < 0.5
# Now, our new pipeline:

# In[ ]:

from sklearn.pipeline import FeatureUnion

pipe2 = Pipeline([
    ('u1', FeatureUnion([
        ('tfdif_features', Pipeline([
            ('cv', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
        ])),
        ('pos_features', Pipeline([
            ('pos', PosTagMatrix(tokenizer=nltk.word_tokenize)),
        ])),
    ])),
    ('logit', LogisticRegression()),
])

# In[ ]:

pipe2.fit(X_train_part, y_train_part)
pred = pipe2.predict_proba(X_valid)
log_loss(y_valid, pred)

# Not an improvement, but hey, we learned something new!
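# Hedged follow-up sketch (not from the original notebook): because the
# FeatureUnion is a named step ('u1') inside pipe2, its transformer_weights can
# be adjusted with the usual nested set_params syntax before refitting, e.g. to
# down-weight the POS-tag block relative to the tf-idf block.
pipe2.set_params(u1__transformer_weights={'tfdif_features': 1.0, 'pos_features': 0.5})
pipe2.fit(X_train_part, y_train_part)
log_loss(y_valid, pipe2.predict_proba(X_valid))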
feature_list = np.array(feature_list)
stop_words = helper.read_stopwords()
# feature_list = feature_list[:, 0]
#
union = FeatureUnion(
    transformer_list=[
        ("feature", Pipeline([('selector', ItemSelector(1)),
                              ("vec", DictVectorizer(sparse=False))])),
        ("content", Pipeline([
            ('selector', ItemSelector(0)),
            ('cvec', CountVectorizer(
                # analyzer='char_wb',
                token_pattern=r"(?u)\b\w+\b",
                min_df=1,
                stop_words=stop_words)),
            ('tfidf', TfidfTransformer())
        ]))
    ],
    transformer_weights={
        "feature": 1.0,
        "content": 1.0
    })

union.fit_transform(feature_list)
pipe: Pipeline = union.transformer_list[1][1]
cvec: CountVectorizer = pipe.named_steps["cvec"]
arr = cvec.get_feature_names()
def default_pipeline(self, name, n_pca=10, n_best=10, lda_shrink=10, svm_C=10, svm_gamma=10, fdr_alpha=[0.05], fpr_alpha=[0.05]): """Use a default combination of parameters for building a pipeline Args: name: string The string for building a default pipeline (see examples below) Kargs: n_pca: integer, optional, (def: 10) The number of components to search n_best: integer, optional, (def: 10) Number of best features to consider using a statistical method lda_shrink: integer, optional, (def: 10) Fit optimisation parameter for the lda svm_C/svm_gamma: integer, optional, (def: 10/10) Parameters to optimize for the svm fdr/fpr_alpha: list, optional, (def: [0.05]) List of float for selecting features using a fdr or fpr Examples: >>> # Basic classifiers : >>> name = 'lda' # or name = 'svm_linear' for a linear SVM >>> # Combine a classifier with a feature selection method : >>> name = 'lda_fdr_fpr_kbest_pca' >>> # The method above will use an LDA for the features evaluation >>> # and will combine a FDR, FPR, k-Best and pca feature seelction. >>> # Now we can combine with classifier optimisation : >>> name = 'lda_optimized_pca' # will try to optimize an LDA with a pca >>> name = 'svm_kernel_C_gamma_kbest' # optimize a SVM by trying >>> # diffrent kernels (linear/RBF), and optimize C and gamma parameters >>> # combine with a k-Best features selection. """ # ---------------------------------------------------------------- # DEFINED COMBINORS # ---------------------------------------------------------------- pca = PCA() selection = SelectKBest() scaler = StandardScaler() fdr = SelectFdr() fpr = SelectFpr() # ---------------------------------------------------------------- # RANGE DEFINITION # --------------------------------------------------------- pca_range = np.arange(1, n_pca + 1) kbest_range = np.arange(1, n_best + 1) C_range = np.logspace(-5, 15, svm_C, base=2.) #np.logspace(-2, 2, svm_C) gamma_range = np.logspace(-15, 3, svm_gamma, base=2.) #np.logspace(-9, 2, svm_gamma) # Check range : if not kbest_range.size: kbest_range = [1] if not pca_range.size: pca_range = [1] if not C_range.size: C_range = [1.] 
if not gamma_range.size: gamma_range = ['auto'] # ---------------------------------------------------------------- # DEFINED PIPELINE ELEMENTS # ---------------------------------------------------------------- pipeline = [] grid = {} combine = [] # ---------------------------------------------------------------- # BUILD CLASSIFIER # ---------------------------------------------------------------- # -> SCALE : if name.lower().find('scale') != -1: pipeline.append(("scaler", scaler)) # -> LDA : if name.lower().find('lda') != -1: # Default : if name.lower().find('optimized') == -1: clf = LinearDiscriminantAnalysis( priors=np.array([1 / self._nclass] * self._nclass)) # Optimized : elif name.lower().find('optimized') != -1: clf = LinearDiscriminantAnalysis(priors=np.array( [1 / self._nclass] * self._nclass), solver='lsqr') grid['clf__shrinkage'] = np.linspace(0., 1., lda_shrink) # -> SVM : elif name.lower().find('svm') != -1: # Linear/RBF standard kernel : if name.lower().find('linear') != -1: kwargs = {'kernel': 'linear'} elif name.lower().find('rbf') != -1: kwargs = {'kernel': 'rbf'} else: kwargs = {} # Optimized : if name.lower().find('optimized') != -1: # Kernel optimization : if name.lower().find('kernel') != -1: grid['clf__kernel'] = ('linear', 'rbf') # C optimization : if name.lower().find('_c_') != -1: grid['clf__C'] = C_range # Gamma optimization : if name.lower().find('gamma') != -1: grid['clf__gamma'] = gamma_range clf = SVC(**kwargs) # ---------------------------------------------------------------- # BUILD COMBINE # ---------------------------------------------------------------- # -> FDR : if name.lower().find('fdr') != -1: combine.append(("fdr", fdr)) grid['features__fdr__alpha'] = fdr_alpha # -> FPR : if name.lower().find('fpr') != -1: combine.append(("fpr", fpr)) grid['features__fpr__alpha'] = fpr_alpha # -> PCA : if name.lower().find('pca') != -1: combine.append(("pca", pca)) grid['features__pca__n_components'] = pca_range # -> kBest : if name.lower().find('kbest') != -1: combine.append(("kBest", selection)) grid['features__kBest__k'] = kbest_range # -> RFECV : if name.lower().find('rfecv') != -1: rfecv = RFECV(clf) combine.append(("RFECV", rfecv)) # if combine is empty, select all features : if not len(combine): combine.append(("kBest", SelectKBest(k='all'))) self.combine = FeatureUnion(combine) # ---------------------------------------------------------------- # SAVE PIPELINE # ---------------------------------------------------------------- # Build ordered pipeline : if len(combine): pipeline.append(("features", self.combine)) pipeline.append(("clf", clf)) # Save pipeline : self.pipeline = Pipeline(pipeline) self.grid = grid self._pipename = name
num_pipeline = Pipeline([
    ('selector', ds.DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', ca.CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# select the categorical attributes
cat_pipeline = Pipeline([
    ('selector', ds.DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizer(sparse_output=True)),
])

# concatenate the two sub-pipelines
full_pipeline = FeatureUnion(
    transformer_list=[("num_pipeline", num_pipeline),
                      ("cat_pipeline", cat_pipeline)])

housing_prepared = full_pipeline.fit_transform(housing)

# test set: reuse the pipeline fitted on the training set instead of refitting it
test_housing = strat_test_set.drop("median_house_value", axis=1)
test_housing_labels = strat_test_set["median_house_value"].copy()
test_housing_prepared = full_pipeline.transform(test_housing)

# Linear Reg
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
predict = lin_reg.predict(test_housing_prepared)
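# Hedged caveat (based on scikit-learn's documented API, not on the original
# source): in recent scikit-learn versions LabelBinarizer.fit_transform only
# accepts a label vector, so it raises a TypeError when a Pipeline passes it
# (X, y). A thin adapter like the hypothetical PipelineLabelBinarizer below is
# one way to keep the cat_pipeline shape shown above.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer

class PipelineLabelBinarizer(BaseEstimator, TransformerMixin):
    """Adapter so LabelBinarizer can sit inside a Pipeline/FeatureUnion,
    which call fit/transform with (X, y) rather than a lone label vector."""
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        self.encoder_ = LabelBinarizer(sparse_output=self.sparse_output)
        self.encoder_.fit(np.asarray(X).ravel())
        return self

    def transform(self, X):
        return self.encoder_.transform(np.asarray(X).ravel())

# cat_pipeline = Pipeline([
#     ('selector', ds.DataFrameSelector(cat_attribs)),
#     ('label_binarizer', PipelineLabelBinarizer(sparse_output=True)),
# ])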
num_attribs = list(sample_data_num)
# extract the text attribute
cat_attribs = ["class(OK/NG)"]

# Data-transformation pipeline: standardize the 13 numeric columns,
# leave the last (label) column untouched
num_pipeline = Pipeline([
    ("selector", DataFrameSelector(num_attribs)),
    ("std_scaler", StandardScaler()),
])

cat_pipeline = Pipeline([
    ("selector", DataFrameSelector(cat_attribs)),
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

# Feed the data through the pipeline to obtain the prepared data
sample_data_prepared = full_pipeline.fit_transform(sample_data)

# Take the last column (the class label) out on its own and flatten it to a 1-D array
sample_data_label = sample_data_prepared[:, -1:]
sample_data_label = sample_data_label.flatten()

# Turn the label column into booleans: True for "OK", False for "NG",
# which makes the later model metrics easier to compute
label_train = (sample_data_label == "OK")

# Take the 13 feature columns out on their own, ready for training
sample_data_13 = sample_data_prepared[:, :13]

# load the data
clf = pipeline.Pipeline([ ( 'union', FeatureUnion( transformer_list=[ ('cst', cust_regression_vals()), ('txt1', pipeline.Pipeline([('s1', cust_txt_col(key='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd)])), ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])), ('txt3', pipeline.Pipeline([('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf), ('tsvd3', tsvd)])), ('txt4', pipeline.Pipeline([('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)])) ], transformer_weights={ 'cst': 1.0, 'txt1': 0.5, 'txt2': 0.25, 'txt3': 0.0, 'txt4': 0.5 }, #n_jobs = -1 )), ('rfr', rfr) ]) param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
def test_set_feature_union_step_none():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=None)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=None)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))
def test_set_feature_union_steps():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    mult5 = Mult(5)
    mult5.get_feature_names = lambda: ['x5']

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]])))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    # Directly setting attr
    ft.transformer_list = [('m5', mult5)]
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['m5__x5'], ft.get_feature_names())

    # Using set_params
    ft.set_params(transformer_list=[('mock', mult3)])
    assert_array_equal([[3]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x3'], ft.get_feature_names())

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x5'], ft.get_feature_names())
def test_feature_union_parallel(): # test that n_jobs work for FeatureUnion X = JUNK_FOOD_DOCS fs = FeatureUnion([ ("words", CountVectorizer(analyzer='word')), ("chars", CountVectorizer(analyzer='char')), ]) fs_parallel = FeatureUnion([ ("words", CountVectorizer(analyzer='word')), ("chars", CountVectorizer(analyzer='char')), ], n_jobs=2) fs_parallel2 = FeatureUnion([ ("words", CountVectorizer(analyzer='word')), ("chars", CountVectorizer(analyzer='char')), ], n_jobs=2) fs.fit(X) X_transformed = fs.transform(X) assert_equal(X_transformed.shape[0], len(X)) fs_parallel.fit(X) X_transformed_parallel = fs_parallel.transform(X) assert_equal(X_transformed.shape, X_transformed_parallel.shape) assert_array_equal(X_transformed.toarray(), X_transformed_parallel.toarray()) # fit_transform should behave the same X_transformed_parallel2 = fs_parallel2.fit_transform(X) assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray()) # transformers should stay fit after fit_transform X_transformed_parallel2 = fs_parallel2.transform(X) assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray())
def test_feature_union_weights(): # test feature union with transformer weights iris = load_iris() X = iris.data y = iris.target pca = PCA(n_components=2, svd_solver='randomized', random_state=0) select = SelectKBest(k=1) # test using fit followed by transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) fs.fit(X, y) X_transformed = fs.transform(X) # test using fit_transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) X_fit_transformed = fs.fit_transform(X, y) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)], transformer_weights={"mock": 10}) X_fit_transformed_wo_method = fs.fit_transform(X, y) # check against expected result # We use a different pca object to control the random_state stream assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Test clone
    fs2 = assert_no_warnings(clone, fs)
    assert_false(fs.transformer_list[0][1] is fs2.transformer_list[0][1])

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))

    # test error if some elements do not support transform
    assert_raises_regex(
        TypeError, 'All estimators should implement fit and '
        'transform.*\\bNoTrans\\b',
        FeatureUnion, [("transform", Transf()), ("no_transform", NoTrans())])

    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
def enhance_transactions(self): # load training data self.training_data = ml.load_training_data( self.training_data, known_account=self.account, existing_entries=self.existing_entries) # train the machine learning model self._trained = False if not self.training_data: logger.warning("Cannot train the machine learning model " "because the training data is empty.") elif len(self.training_data) < 2: logger.warning( "Cannot train the machine learning model " "because the training data consists of less than two elements." ) else: self.pipeline = Pipeline([ ( 'union', FeatureUnion( transformer_list=[ ('narration', Pipeline([ ('getNarration', ml.GetNarration()), ('vect', CountVectorizer(ngram_range=(1, 3))), ])), ( 'payee', Pipeline([ # any existing payee, if one exists ('getPayee', ml.GetPayee()), ('vect', CountVectorizer(ngram_range=(1, 3))), ])), ( 'dayOfMonth', Pipeline([ ('getDayOfMonth', ml.GetDayOfMonth()), ('caster', ml.ArrayCaster() ), # need for issue with data shape ])), ], transformer_weights={ 'narration': 0.8, 'payee': 0.5, 'dayOfMonth': 0.1 })), ('svc', SVC(kernel='linear')), ]) logger.debug("About to train the machine learning model...") self.pipeline.fit(self.training_data, ml.GetPayee().transform(self.training_data)) logger.info("Finished training the machine learning model.") self._trained = True if not self._trained: logger.warning( "Cannot generate predictions or suggestions " "because there is no trained machine learning model.") return self.imported_transactions # predict payees self.transactions = self.imported_transactions if self.predict_payees: logger.debug("About to generate predictions for payees...") predicted_payees: List[str] predicted_payees = self.pipeline.predict(self.transactions) self.transactions = [ ml.add_payee_to_transaction( *t_p, overwrite=self.overwrite_existing_payees) for t_p in zip(self.transactions, predicted_payees) ] logger.debug( "Finished adding predicted payees to the transactions to be imported." ) # suggest likely payees if self.suggest_payees: # get values from the SVC decision function logger.debug( "About to generate suggestions about likely payees...") decision_values = self.pipeline.decision_function( self.imported_transactions) # add a human-readable class label (i.e., payee's name) to each value, and sort by value: suggested_payees = [[ payee for _, payee in sorted(list( zip(distance_values, self.pipeline.classes_)), key=lambda x: x[0], reverse=True) ] for distance_values in decision_values] # add the suggested payees to each transaction: self.transactions = [ ml.add_suggested_payees_to_transaction(*t_p) for t_p in zip(self.transactions, suggested_payees) ] logger.debug( "Finished adding suggested payees to the transactions to be imported." ) return self.transactions
def __init__(self, dataset, obs_dim, act_dim, gamma, horizon, model_reg, reward_reg, value_reg, default_length_scale=0.1, random_feature_per_obs_dim=250, norm=None, scale_length_adjustment='median', dtype=np.float64, policy_net=None): self.obs_dim = obs_dim self.act_dim = act_dim self.gamma = gamma self.horizon = horizon self.norm = norm self.policy_net = policy_net self.model_reg = model_reg self.reward_reg = reward_reg self.value_reg = value_reg self.dtype = dtype self.n_samples = dataset['obs'].shape[0] self.n_episode = dataset['init_obs'].shape[0] self.data_acts = dataset['acts'] if self.policy_net is not None: self.pi_current = self.policy_net.get_probabilities(dataset['obs']) self.pi_next = self.policy_net.get_probabilities( dataset['next_obs']) self.pi_init = self.policy_net.get_probabilities( dataset['init_obs']) self.pi_term = self.policy_net.get_probabilities( dataset['term_obs']) else: self.pi_current = dataset['target_prob_obs'] self.pi_next = dataset['target_prob_next_obs'] self.pi_init = dataset['target_prob_init_obs'] self.pi_term = dataset['target_prob_term_obs'] if self.norm is None: self.obs = dataset['obs'] self.next_obs = dataset['next_obs'] self.init_obs = dataset['init_obs'] self.term_obs = dataset['term_obs'] elif self.norm == 'std': self.obs_mean = np.mean(dataset['obs'], axis=0, keepdims=True) self.obs_std = np.std(dataset['obs'], axis=0, keepdims=True) self.obs = (dataset['obs'] - self.obs_mean) / self.obs_std self.next_obs = (dataset['next_obs'] - self.obs_mean) / self.obs_std self.init_obs = (dataset['init_obs'] - self.obs_mean) / self.obs_std self.term_obs = (dataset['term_obs'] - self.obs_mean) / self.obs_std else: raise NotImplementedError if scale_length_adjustment == 'median': sample_num = 5000 idx1 = np.random.choice(self.n_samples, sample_num) idx2 = np.random.choice(self.n_samples, sample_num) med_dist = np.median(np.square(self.obs[None, idx1, :] - self.obs[idx2, None, :]), axis=(0, 1)) med_dist[ med_dist < 0.01] = 0.01 # enforce a upperbound on the scale-length of the action component scale_length_vector = 1.0 / med_dist else: scale_length_vector = np.ones(self.obs_dim) # import pdb; pdb.set_trace() #* set the fourier feature transformer_list = [] self.z_dim = random_feature_per_obs_dim * self.obs_dim models = [ RBFSampler(n_components=random_feature_per_obs_dim, gamma=default_length_scale * dist) for dist in scale_length_vector ] for model in models: model.fit([self.obs[0]]) transformer_list.append((str(model), model)) self.rff = FeatureUnion(transformer_list) # #* separate action set indexing # act_idx = [] # for i in range(self.act_dim): # act_idx.append(np.where(dataset['acts']==i)[0]) # #* apply transformation # Z = self.rff.transform(self.obs).astype(self.dtype); Z_prime = self.rff.transform(self.next_obs).astype(self.dtype) # Z_init = self.rff.transform(self.init_obs).astype(self.dtype); Z_term = self.rff.transform(self.term_obs).astype(self.dtype) # assert self.z_dim == Z.shape[1] # self.Phi = np.zeros((Z.shape[0], Z.shape[1]* self.act_dim), dtype=self.dtype) # self.Phi_pi = np.zeros((Z.shape[0], Z.shape[1]* self.act_dim),dtype=self.dtype) # self.Phi_prime_pi = np.zeros((Z_prime.shape[0], Z_prime.shape[1]* self.act_dim),dtype=self.dtype) # self.Phi_init_pi = np.zeros((Z_init.shape[0], Z_init.shape[1]*self.act_dim), dtype=self.dtype) # self.Phi_term_pi = np.zeros((Z_term.shape[0], Z_term.shape[1]*self.act_dim),dtype=self.dtype) # for i in range(self.act_dim): # self.Phi[act_idx[i], i*self.z_dim:(i+1)*self.z_dim] = Z[act_idx[i]] # self.Phi_pi[:, 
i*self.z_dim:(i+1)*self.z_dim] = self.pi_current[:,i][:,None] * Z # self.Phi_prime_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_next[:,i][:,None] * Z_prime # self.Phi_init_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_init[:,i][:,None]*Z_init # self.Phi_term_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_term[:,i][:,None]*Z_term #* Some commonly used variables # self.I_sa = np.eye(self.act_dim*self.z_dim) self.rews = dataset['rews'] self.init_idx = np.arange(0, self.n_samples, self.horizon) self.end_idx = np.arange(self.horizon - 1, self.n_samples, self.horizon) self.rho = dataset[ 'ratio'] #* make sure that the importance weights are already calculated
class Kernel_Estimators(object): def __init__(self, dataset, obs_dim, act_dim, gamma, horizon, model_reg, reward_reg, value_reg, default_length_scale=0.1, random_feature_per_obs_dim=250, norm=None, scale_length_adjustment='median', dtype=np.float64, policy_net=None): self.obs_dim = obs_dim self.act_dim = act_dim self.gamma = gamma self.horizon = horizon self.norm = norm self.policy_net = policy_net self.model_reg = model_reg self.reward_reg = reward_reg self.value_reg = value_reg self.dtype = dtype self.n_samples = dataset['obs'].shape[0] self.n_episode = dataset['init_obs'].shape[0] self.data_acts = dataset['acts'] if self.policy_net is not None: self.pi_current = self.policy_net.get_probabilities(dataset['obs']) self.pi_next = self.policy_net.get_probabilities( dataset['next_obs']) self.pi_init = self.policy_net.get_probabilities( dataset['init_obs']) self.pi_term = self.policy_net.get_probabilities( dataset['term_obs']) else: self.pi_current = dataset['target_prob_obs'] self.pi_next = dataset['target_prob_next_obs'] self.pi_init = dataset['target_prob_init_obs'] self.pi_term = dataset['target_prob_term_obs'] if self.norm is None: self.obs = dataset['obs'] self.next_obs = dataset['next_obs'] self.init_obs = dataset['init_obs'] self.term_obs = dataset['term_obs'] elif self.norm == 'std': self.obs_mean = np.mean(dataset['obs'], axis=0, keepdims=True) self.obs_std = np.std(dataset['obs'], axis=0, keepdims=True) self.obs = (dataset['obs'] - self.obs_mean) / self.obs_std self.next_obs = (dataset['next_obs'] - self.obs_mean) / self.obs_std self.init_obs = (dataset['init_obs'] - self.obs_mean) / self.obs_std self.term_obs = (dataset['term_obs'] - self.obs_mean) / self.obs_std else: raise NotImplementedError if scale_length_adjustment == 'median': sample_num = 5000 idx1 = np.random.choice(self.n_samples, sample_num) idx2 = np.random.choice(self.n_samples, sample_num) med_dist = np.median(np.square(self.obs[None, idx1, :] - self.obs[idx2, None, :]), axis=(0, 1)) med_dist[ med_dist < 0.01] = 0.01 # enforce a upperbound on the scale-length of the action component scale_length_vector = 1.0 / med_dist else: scale_length_vector = np.ones(self.obs_dim) # import pdb; pdb.set_trace() #* set the fourier feature transformer_list = [] self.z_dim = random_feature_per_obs_dim * self.obs_dim models = [ RBFSampler(n_components=random_feature_per_obs_dim, gamma=default_length_scale * dist) for dist in scale_length_vector ] for model in models: model.fit([self.obs[0]]) transformer_list.append((str(model), model)) self.rff = FeatureUnion(transformer_list) # #* separate action set indexing # act_idx = [] # for i in range(self.act_dim): # act_idx.append(np.where(dataset['acts']==i)[0]) # #* apply transformation # Z = self.rff.transform(self.obs).astype(self.dtype); Z_prime = self.rff.transform(self.next_obs).astype(self.dtype) # Z_init = self.rff.transform(self.init_obs).astype(self.dtype); Z_term = self.rff.transform(self.term_obs).astype(self.dtype) # assert self.z_dim == Z.shape[1] # self.Phi = np.zeros((Z.shape[0], Z.shape[1]* self.act_dim), dtype=self.dtype) # self.Phi_pi = np.zeros((Z.shape[0], Z.shape[1]* self.act_dim),dtype=self.dtype) # self.Phi_prime_pi = np.zeros((Z_prime.shape[0], Z_prime.shape[1]* self.act_dim),dtype=self.dtype) # self.Phi_init_pi = np.zeros((Z_init.shape[0], Z_init.shape[1]*self.act_dim), dtype=self.dtype) # self.Phi_term_pi = np.zeros((Z_term.shape[0], Z_term.shape[1]*self.act_dim),dtype=self.dtype) # for i in range(self.act_dim): # self.Phi[act_idx[i], i*self.z_dim:(i+1)*self.z_dim] 
= Z[act_idx[i]] # self.Phi_pi[:, i*self.z_dim:(i+1)*self.z_dim] = self.pi_current[:,i][:,None] * Z # self.Phi_prime_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_next[:,i][:,None] * Z_prime # self.Phi_init_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_init[:,i][:,None]*Z_init # self.Phi_term_pi[:,i*self.z_dim:(i+1)*self.z_dim] = self.pi_term[:,i][:,None]*Z_term #* Some commonly used variables # self.I_sa = np.eye(self.act_dim*self.z_dim) self.rews = dataset['rews'] self.init_idx = np.arange(0, self.n_samples, self.horizon) self.end_idx = np.arange(self.horizon - 1, self.n_samples, self.horizon) self.rho = dataset[ 'ratio'] #* make sure that the importance weights are already calculated def estimate_model_based(self): #* separate action set indexing act_idx = [] for i in range(self.act_dim): act_idx.append(np.where(self.data_acts == i)[0]) #* apply transformation Z = self.rff.transform(self.obs).astype(self.dtype) Z_prime = self.rff.transform(self.next_obs).astype(self.dtype) Z_init = self.rff.transform(self.init_obs).astype(self.dtype) Z_term = self.rff.transform(self.term_obs).astype(self.dtype) assert self.z_dim == Z.shape[1] Phi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim), dtype=self.dtype) Phi_pi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim), dtype=self.dtype) Phi_prime_pi = np.zeros( (Z_prime.shape[0], Z_prime.shape[1] * self.act_dim), dtype=self.dtype) Phi_init_pi = np.zeros( (Z_init.shape[0], Z_init.shape[1] * self.act_dim), dtype=self.dtype) Phi_term_pi = np.zeros( (Z_term.shape[0], Z_term.shape[1] * self.act_dim), dtype=self.dtype) for i in range(self.act_dim): Phi[act_idx[i], i * self.z_dim:(i + 1) * self.z_dim] = Z[act_idx[i]] Phi_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_current[:, i][:, None] * Z Phi_prime_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_next[:, i][:, None] * Z_prime Phi_init_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_init[:, i][:, None] * Z_init Phi_term_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_term[:, i][:, None] * Z_term I_sa = np.eye(self.act_dim * self.z_dim) #* uncentered /center covariance identity: H = np.eye(self.n_samples) # H = np.eye(self.n_samples) - 1.0/self.n_samples*np.ones((self.n_samples, self.n_samples)) #* estimate reward function r_sa = np.linalg.inv(Phi.T @ Phi + self.reward_reg * I_sa) @ Phi.T @ self.rews Sigma_yx = 1 / self.n_samples * Phi_prime_pi.T @ H @ Phi Sigma_xx = 1 / self.n_samples * Phi.T @ H @ Phi P = np.matmul(Sigma_yx, np.linalg.inv(Sigma_xx + self.model_reg * I_sa)) #* Now that we have the transition operator, we have that: #* E_{s'|s}[\phi(s')|s] = P \phi(s) #* This gives a clean mechanism to roll the model forward #* in particular, the next feature matrix will be #* Phi' = Phi P.T, where Phi = [phi_1, ..., phi_n].T \in R^{n\times p} finite_horizon_correction = I_sa - np.linalg.matrix_power( self.gamma * P.T, self.horizon) transposed_transition_inverse = np.linalg.inv(I_sa - self.gamma * P.T) accumulated_feature = Phi_pi @ finite_horizon_correction @ transposed_transition_inverse V = accumulated_feature @ r_sa value_est = np.mean(V[self.init_idx]) return value_est def estimate_LSTD(self): reg = self.value_reg Z = self.rff.transform(self.obs) Z_prime = self.rff.transform(self.next_obs) R = self.rho * self.rews regularized_inverse = np.linalg.inv( np.matmul(Z.T, Z - self.gamma * self.rho * Z_prime) + reg * np.eye(self.z_dim)) featurized_reward = np.matmul(Z.T, R) reward_coef = np.matmul(regularized_inverse, featurized_reward) V_init = Z[self.init_idx] @ reward_coef V_term = 
Z[self.end_idx] @ reward_coef V_traj = V_init - V_term * self.gamma**self.horizon value_est = np.mean(V_traj) return value_est def estimate_LSTDQ(self): #* separate action set indexing act_idx = [] for i in range(self.act_dim): act_idx.append(np.where(self.data_acts == i)[0]) #* apply transformation Z = self.rff.transform(self.obs).astype(self.dtype) Z_prime = self.rff.transform(self.next_obs).astype(self.dtype) Z_init = self.rff.transform(self.init_obs).astype(self.dtype) Z_term = self.rff.transform(self.term_obs).astype(self.dtype) assert self.z_dim == Z.shape[1] Phi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim), dtype=self.dtype) Phi_pi = np.zeros((Z.shape[0], Z.shape[1] * self.act_dim), dtype=self.dtype) Phi_prime_pi = np.zeros( (Z_prime.shape[0], Z_prime.shape[1] * self.act_dim), dtype=self.dtype) Phi_init_pi = np.zeros( (Z_init.shape[0], Z_init.shape[1] * self.act_dim), dtype=self.dtype) Phi_term_pi = np.zeros( (Z_term.shape[0], Z_term.shape[1] * self.act_dim), dtype=self.dtype) for i in range(self.act_dim): Phi[act_idx[i], i * self.z_dim:(i + 1) * self.z_dim] = Z[act_idx[i]] Phi_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_current[:, i][:, None] * Z Phi_prime_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_next[:, i][:, None] * Z_prime Phi_init_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_init[:, i][:, None] * Z_init Phi_term_pi[:, i * self.z_dim:(i + 1) * self.z_dim] = self.pi_term[:, i][:, None] * Z_term I_sa = np.eye(self.act_dim * self.z_dim) regularized_inverse = np.linalg.inv( np.matmul(Phi.T, Phi - self.gamma * Phi_prime_pi) + self.value_reg * I_sa) featurized_reward = np.matmul(Phi.T, self.rews) reward_coef = np.matmul(regularized_inverse, featurized_reward) V_init = Phi_init_pi @ reward_coef V_term = Phi_term_pi @ reward_coef V_traj = V_init - V_term * self.gamma**self.horizon value_est = np.mean(V_traj) return value_est def estimate_LSTD_dual(self): import kernel_util as ku sample_num = 5000 idx1 = np.random.choice(self.n_samples, sample_num) idx2 = np.random.choice(self.n_samples, sample_num) med_dist = np.median(np.square(self.obs[None, idx1, :] - self.obs[idx2, None, :]), axis=(0, 1)) med_dist[ med_dist < 0.01] = 0.01 # enforce a upperbound on the scale-length of the action component w = 1.0 / med_dist default_gamma = 0.1 reg = 1e-2 ratio_vector = self.rho.copy().astype(np.float32) K = ku.weighted_rbf_kernel(self.obs, w=w, gamma=default_gamma).astype(np.float32) K_prime = ku.weighted_rbf_kernel(self.next_obs, self.obs, w=w, gamma=default_gamma).astype( np.float32) K_prime = self.gamma * (K_prime * ratio_vector.repeat(self.n_samples, axis=1)) R = (ratio_vector * self.rews).astype(np.float32) beta = np.linalg.inv(K - K_prime + reg * np.eye(self.n_samples)).dot(R) K0 = ku.weighted_rbf_kernel(self.obs, self.init_obs, w=w, gamma=default_gamma) K_terminal = ku.weighted_rbf_kernel(self.obs, self.term_obs, w=w, gamma=default_gamma) # V_init = np.matmul(beta.T, K0) # V_term = np.matmul(beta.T, K_terminal) V_init = K0.T @ beta V_term = K_terminal.T @ beta V_traj = V_init - V_term * self.gamma**self.horizon value_est = np.mean(V_traj) import pdb pdb.set_trace() return value_est
('data', DataFrameColumnExtracter('Surname')), ('vectorizer', HashingVectorizer(non_negative=True)) ]) bio_pipe = Pipeline([ ('data', DataFrameColumnExtracter('Bio')), ('preprocessor', StripHTMLTransformer()), ('vectorizer', CountVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1, 3))), ('tfidf', TfidfTransformer()) ]) features = FeatureUnion( n_jobs=1, transformer_list=[ ('email_pipe', email_pipe), ('fname_pipe', fname_pipe), ('lname_pipe', lname_pipe), ('bio_pipe', bio_pipe) ], transformer_weights=None) classifier = Pipeline([ ('features', features), ('model', MultinomialNB(alpha=0.0001, fit_prior=True)) ]) classifier.fit(trainData, labels) filename = 'member_classifier.pickle' print "writing model to file %s" % (filename) pickle.dump(classifier, open(filename, 'wb'))
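# A minimal sketch of the column-extractor pattern that DataFrameColumnExtracter above
# presumably implements (an assumption; the original class is not shown): pull a single
# DataFrame column so each downstream text vectorizer receives a 1-D sequence of strings.
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameColumnExtracterSketch(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.column]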
data_train, data_test, labels_train, labels_test = train_test_split( data, labels, test_size=0.3, random_state=100) vectorizer = CountVectorizer() # TruncatedSVD to select the principal components: pca = TruncatedSVD(n_components=2) #NMF for MultinomialNB as a PCA technique pca1 = NMF(n_components=2) # K-best features to be selected selection = SelectKBest(chi2, k=1) # To combine the features for LinearSVC, Decision Trees and Logistic Regression combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)]) # To combine the features for MultinomialNB combined_features1 = FeatureUnion([("pca", pca1), ("univ_select", selection)]) clf = DecisionTreeClassifier(criterion="gini", random_state=100) # Performing training by creating one pipeline per classifier according to the respective #combined features and classification algorithm pipeline_logreg = Pipeline([("count_vectorizer", vectorizer), ("features", combined_features), ("Logreg", LogisticRegression())]) pipeline_svc = Pipeline([("count_vectorizer", vectorizer), ("features", combined_features), ("svm", LinearSVC())]) pipeline_dt = Pipeline([("count_vectorizer", vectorizer), ("features", combined_features), ("dt", clf)])
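# Hypothetical continuation of the snippet above (not in the original): MultinomialNB
# requires non-negative inputs, which is why combined_features1 swaps TruncatedSVD for
# NMF. Assumes data_train/data_test hold raw text, as the CountVectorizer step implies.
from sklearn.naive_bayes import MultinomialNB

pipeline_nb = Pipeline([("count_vectorizer", vectorizer),
                        ("features", combined_features1),
                        ("nb", MultinomialNB())])
pipeline_nb.fit(data_train, labels_train)
print(pipeline_nb.score(data_test, labels_test))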
def extract_features(train_data, test_data=None, model_persistor=None): """ Does feature enrichment and enhancement (separate from modeling) if test_data is passed, it is included in the processing and split out again this is to account for encoding where feature categories aren't in the train set :param train_data: Training data :param test_data: Test data :param model_persistor: An instance of PersistModel. When passed, supporting objects can be added """ # ADD TO THE NOTES EXPLAINING WHAT YOU ARE DOING WITH THE FEATURE EXTRACTION model_persistor.add_note(" Extract Features now includes...") if test_data is not None: data_to_process = pd.concat([ train_data[the_settings.all_features], test_data[the_settings.all_features] ], ignore_index=True) else: data_to_process = train_data # I want to try some dimensionality reduction with this one. # First I'm going to start with all of the previous features and feature_extraction = Pipeline([ ('initial features', FeatureUnion([ ('numeric_features_standardized', Pipeline([('numeric_features_raw', FeatureUnion([ ('numeric_features', ColumnExtractor(the_settings.numeric_features)), ('v22_letter_count', LetterCountTransformer( the_settings.special_string_features)), ('Nan_count', NaNCountTransformer()) ])), ('zero_na', NanToZeroTransformer())])), ('string_features_standardized', Pipeline([('string_features', ColumnExtractor(the_settings.string_features)), ('label', MultiColumnLabelEncoder()), ('one_hot', OneHotEncoder(sparse=False))])), ('v22_standardized', Pipeline([('extract', LetterExtractionTransformer( the_settings.special_string_features)), ('label', MultiColumnLabelEncoder()), ('one_hot', OneHotEncoder(sparse=False))])) ])) ]) fitted_feature_model = feature_extraction.fit(data_to_process) if model_persistor: model_persistor.add_object_to_save(fitted_feature_model, FileObjectType.feature_model) extracted_features = fitted_feature_model.transform(data_to_process) return extracted_features[:len(train_data )], extracted_features[len(train_data):]
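# A minimal sketch of what a NaNCountTransformer like the one above might look like
# (an assumption; the original implementation is not shown): a single extra column
# holding the number of missing values in each row.
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class NaNCountSketch(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(X).isna().sum(axis=1).to_numpy().reshape(-1, 1)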
# Scikit-Learn provides a very useful API to create data transformation # pipelines. Our data contains both numerical values and categorical/text # values. So we'll need a pipeline for each type of data. Then we'll need # a way to merge both pipelines together to build the final training set. housing_num = housing.drop("ocean_proximity", axis=1) # Calling list() on a dataframe returns the attribute names num_attribs = list(housing_num) cat_attribs = ["ocean_proximity"] num_pipeline = Pipeline([ ('selector', DataFrameSelector(num_attribs)), ('imputer', Imputer(strategy="median")), ('attribs_adder', CombinedAttributesAdder()), ('std_scaler', StandardScaler()), ]) cat_pipeline = Pipeline([ ('selector', DataFrameSelector(cat_attribs)), ('label_binarizer', LabelBinarizer()), ]) preparation_pipeline = FeatureUnion(transformer_list=[ ("num_pipeline", num_pipeline), ("cat_pipeline", cat_pipeline), ]) housing_prepared = preparation_pipeline.fit_transform(housing)
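# In scikit-learn >= 0.20, LabelBinarizer can no longer sit inside a Pipeline (its
# fit_transform only accepts X), and this FeatureUnion-plus-DataFrameSelector merge is
# usually written with ColumnTransformer instead. A sketch under that assumption,
# reusing num_attribs/cat_attribs from above and omitting CombinedAttributesAdder:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline_ct = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
preparation_ct = ColumnTransformer([
    ("num", num_pipeline_ct, num_attribs),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])
# housing_prepared = preparation_ct.fit_transform(housing)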
X_test, Y_test = utils.create_x_y(test, dt=128, shift=64, verbose=0) X_train, Y_train = utils.create_x_y(train, dt=128, shift=64) print('X_test.shape:', X_test.shape) print('Y_test.shape:', Y_test.shape) print('X_train.shape:', X_train.shape) print('Y_train.shape:', Y_train.shape) print('\nTesting features:\n') std = STD() entrop = Entropy() quantiles = Quantiles(quantiles=[0.5, 0.25, 0.75]) test_std = std.fit_transform(X_test) print('test_std.shape:', test_std.shape) test_entrop = entrop.fit_transform(X_test) print('test_entrop.shape:', test_entrop.shape) test_quantiles = quantiles.fit_transform(X_test) print('test_quantiles.shape:', test_quantiles.shape) union = FeatureUnion([ ('STD', STD()), ('Entropy', Entropy()), ('Quantiles', Quantiles(quantiles=[0.5])), ]) result = union.fit_transform(X_test) print('All in one:', result.shape)
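# A minimal sketch of one of the window-feature transformers combined above (an
# assumption; STD/Entropy/Quantiles are not shown here). It reduces what is assumed to
# be the time axis (axis=1, length dt=128) to one statistic per channel, so FeatureUnion
# can hstack the per-window results.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class StdFeatureSketch(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # X: (n_windows, n_times, n_channels) -> (n_windows, n_channels)
        return np.std(X, axis=1)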
# replace White with white as well as urban and rural so it's consistent df = df.replace('White', 'white') df = df.replace('Rural', 'rural') df = df.replace('Urban', 'urban') # TODO: Process missing data in pipeline categorical_pipeline = Pipeline( steps=[('cat_selector', FeatureSelector(['Sex', 'Race', 'RuralUrban']) ), ('one_hot_enc', OneHotEncoder(sparse=False))]) numerical_pipeline = Pipeline(steps=[('num_selector', FeatureSelector(['Age']))]) feature_pipeline = FeatureUnion(transformer_list=[( 'numerical_pipeline', numerical_pipeline), ('categorical_pipeline', categorical_pipeline)]) final_pipeline = Pipeline( steps=[('feature_pipeline', feature_pipeline), ('model', LogisticRegression(C=0.001))]) le = LabelEncoder() y = df[['ever_cigarettes']].to_numpy() y = le.fit_transform(y.ravel()) X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2) final_pipeline.fit(X_train, y_train) # final_pipeline.score(X_test, y_test)
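# One way to address the "Process missing data in pipeline" TODO above (a sketch, not
# the original code): slot a SimpleImputer into each sub-pipeline before encoding.
from sklearn.impute import SimpleImputer

categorical_pipeline_imp = Pipeline(steps=[
    ('cat_selector', FeatureSelector(['Sex', 'Race', 'RuralUrban'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_enc', OneHotEncoder(sparse=False))])
numerical_pipeline_imp = Pipeline(steps=[
    ('num_selector', FeatureSelector(['Age'])),
    ('imputer', SimpleImputer(strategy='median'))])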
housing_num = housing.drop("ocean_proximity", axis=1) num_attribs = list(housing_num) cat_attribs = ["ocean_proximity"] num_pipeline = Pipeline([ ('selector', DataFrameSelector(num_attribs)), ('imputer', impute.SimpleImputer(strategy="median")), ('attribs_adder', CombinedAttributesAdder()), ('std_scaler', StandardScaler()), ]) cat_pipelines = Pipeline([ ('label_binarizer', DataFrameMapper([(cat_attribs, LabelBinarizer())])), ]) full_pipeline = FeatureUnion(transformer_list=[ ("num_pipeline", num_pipeline), ("cat_pipeline", cat_pipelines), ]) housing_prepared = full_pipeline.fit_transform(housing) #print(housing_prepared.shape) lin_reg = LinearRegression() lin_reg.fit(housing_prepared, housing_label) some_data = housing.iloc[:5] some_label = housing_label.iloc[:5] some_data_prepared = full_pipeline.transform(some_data) #print(lin_reg.predict(some_data_prepared)) #print(list(some_label)) housing_prediction = lin_reg.predict(housing_prepared)
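# A typical next step (a sketch, not part of the original snippet): measure the
# training RMSE of the linear model fitted above.
import numpy as np
from sklearn.metrics import mean_squared_error

lin_rmse = np.sqrt(mean_squared_error(housing_label, housing_prediction))
print(lin_rmse)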
def train(poems, nonpoems, quick=False): """ Train the model based on given training data :return: """ #nonpoems = nonpoems[::1] print(len(poems)) print(len(nonpoems)) all_train_data = poems + nonpoems all_train_target = [1] * len(poems) + [0] * len(nonpoems) all_train_data = [ textdata.replace('w', 'v').replace('W', 'V') for textdata in all_train_data ] tfidf = Pipeline([('vect', CountVectorizer(max_df=1.0, max_features=25400)), ('tfidf', TfidfTransformer())]) text_feats = Pipeline([ ('stats', TextStats()), # returns a list of dicts ('vect', DictVectorizer()), # list of dicts -> feature matrix ('norm', Normalizer(norm='l2')), ]) combined_feats = FeatureUnion([ ('text_feats', text_feats), ('word_freq', tfidf), ]) sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, n_iter=6, random_state=42) combined_clf = Pipeline([ ('features', combined_feats), ('clf', sgd), ]) if quick: gs_clf = GridSearchCV(combined_clf, {}) else: parameters = { # 'features__word_freq__vect__ngram_range': [(1, 1), (1, 2), (1, 3)], 'features__word_freq__vect__max_df': [1.0, 0.5], 'features__word_freq__vect__max_features': [None, 20000, 25000, 25200, 25400, 25600, 26000], 'features__text_feats__norm__norm': ('l1', 'l2', 'max'), 'clf__alpha': (1e-3, 1e-4, 1e-5, 1e-6), 'clf__penalty': ('l2', 'elasticnet'), 'clf__loss': ('hinge', 'log'), 'clf__n_iter': (4, 5, 6, 7, 8), } gs_clf = GridSearchCV(combined_clf, parameters, n_jobs=-1) gs_clf.fit(all_train_data, all_train_target) predicted = gs_clf.predict(all_train_data) print(np.average(predicted)) print('Final params: %s' % gs_clf.best_params_) print('Best score: %s' % gs_clf.best_score_) stop_words = gs_clf.best_estimator_.get_params()['features'].get_params( ).get('word_freq').named_steps['vect'].stop_words_ print('Number of generated stopwords: %s' % len(stop_words)) with open('generated_stopwords.txt', 'w', newline='') as fp: fp.write('\n'.join(sorted(stop_words))) print('Weights %s' % gs_clf.best_estimator_.named_steps['clf'].coef_[0][:4]) return gs_clf
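# A minimal sketch of a TextStats-style transformer (an assumption; the original class
# is not shown): it returns one dict of simple statistics per document so that the
# DictVectorizer step in text_feats can turn them into a feature matrix.
from sklearn.base import BaseEstimator, TransformerMixin

class TextStatsSketch(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, texts):
        return [{'length': len(t), 'num_lines': t.count('\n')} for t in texts]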
def extract_features(X, sfreq, selected_funcs, funcs_params=None, n_jobs=1, return_as_df=False): """Extraction of temporal or spectral features from epoched EEG signals. Parameters ---------- X : ndarray, shape (n_epochs, n_channels, n_times) Array of epoched EEG data. sfreq : float Sampling rate of the data. selected_funcs : list of str or tuples The elements of ``selected_features`` are either strings or tuples of the form ``(str, callable)``. If an element is of type ``str``, it is the alias of a feature function. The aliases are built from the feature functions' names by removing ``compute_``. For instance, the alias of the feature function :func:`compute_ptp_amp` is ``ptp_amp``. (See the documentation of mne-features). If an element is of type ``tuple``, the first element of the tuple should be a string (name/alias given to a user-defined feature function) and the second element should be a callable (a user-defined feature function which accepts Numpy arrays with shape ``(n_channels, n_times)``). The names/aliases given to user-defined feature functions should not intersect the aliases used by mne-features. If the name given to a user-defined feature function is already used as an alias in mne-features, an error will be raised. funcs_params : dict or None (default: None) If not None, dict of optional parameters to be passed to the feature functions. Each key of the ``funcs_params`` dict should be of the form: ``[alias_feature_function]__[optional_param]`` (for example: ``higuchi_fd__kmax``). n_jobs : int (default: 1) Number of CPU cores used when parallelizing the feature extraction. If given a value of -1, all cores are used. return_as_df : bool (default: False) If True, the extracted features will be returned as a Pandas DataFrame. The column index is a MultiIndex (see :class:`~pandas.MultiIndex`) which contains the alias of each feature function which was used. If False, the features are returned as a 2d Numpy array. Returns ------- array-like, shape (n_epochs, n_features) """ if sfreq <= 0: raise ValueError('Sampling rate `sfreq` must be positive.') univariate_funcs = get_univariate_funcs(sfreq) bivariate_funcs = get_bivariate_funcs(sfreq) feature_funcs = univariate_funcs.copy() feature_funcs.update(bivariate_funcs) sel_funcs = _check_funcs(selected_funcs, feature_funcs) # Feature extraction n_epochs = X.shape[0] _tr = [(n, FeatureFunctionTransformer(func=func)) for n, func in sel_funcs] extractor = FeatureUnion(transformer_list=_tr) if funcs_params is not None: extractor.set_params(**funcs_params) res = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(_apply_extractor)(extractor, X[j, :, :], return_as_df) for j in range(n_epochs)) feature_names = res[0][1] res = list(zip(*res))[0] Xnew = np.vstack(res) if return_as_df: return _format_as_dataframe(Xnew, feature_names) else: return Xnew
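# Hypothetical call of extract_features above on synthetic epochs (shapes only; the
# 'mean' and 'ptp_amp' aliases are assumed to be available in mne-features).
import numpy as np

rng = np.random.RandomState(42)
X_epochs = rng.randn(10, 4, 256)        # (n_epochs, n_channels, n_times)
feats = extract_features(X_epochs, sfreq=128., selected_funcs=['mean', 'ptp_amp'])
print(feats.shape)                      # (n_epochs, n_features)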
vectorizer = FeatureUnion([ ('name', Pipeline([('select', ItemSelector('name', start_time=start_time)), ('transform', HashingVectorizer(ngram_range=(1, 2), n_features=2**27, norm='l2', lowercase=False, stop_words=stopwords)), ('drop_cols', DropColumnsByDf(min_df=2))])), ('category_name', Pipeline([ ('select', ItemSelector('category_name', start_time=start_time)), ('transform', HashingVectorizer(ngram_range=(1, 1), token_pattern='.+', tokenizer=split_cat, n_features=2**27, norm='l2', lowercase=False)), ('drop_cols', DropColumnsByDf(min_df=2)) ])), ('brand_name', Pipeline([ ('select', ItemSelector('brand_name', start_time=start_time)), ('transform', CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)), ])), ('gencat_cond', Pipeline([ ('select', ItemSelector('gencat_cond', start_time=start_time)), ('transform', CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)), ])), ('subcat_1_cond', Pipeline([ ('select', ItemSelector('subcat_1_cond', start_time=start_time)), ('transform', CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)), ])), ('subcat_2_cond', Pipeline([ ('select', ItemSelector('subcat_2_cond', start_time=start_time)), ('transform', CountVectorizer(token_pattern='.+', min_df=2, lowercase=False)), ])), ('has_brand', Pipeline([('select', ItemSelector('has_brand', start_time=start_time)), ('ohe', OneHotEncoder())])), ('shipping', Pipeline([('select', ItemSelector('shipping', start_time=start_time)), ('ohe', OneHotEncoder())])), ('item_condition_id', Pipeline([('select', ItemSelector('item_condition_id', start_time=start_time)), ('ohe', OneHotEncoder())])), ('item_description', Pipeline([ ('select', ItemSelector('item_description', start_time=start_time)), ('hash', HashingVectorizer(ngram_range=(1, 3), n_features=2**27, dtype=np.float32, norm='l2', lowercase=False, stop_words=stopwords)), ('drop_cols', DropColumnsByDf(min_df=2)), ])) ], n_jobs=1)
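# A plausible split_cat tokenizer for the 'category_name' branch above (an assumption;
# the original helper is not shown): split the slash-delimited category path into its
# levels so each level becomes a token.
def split_cat_sketch(text):
    return text.split("/") if isinstance(text, str) else ["missing"]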
#Romney_Test_dataset_NO_Label.csv obama_output = 'Mayank_Raj_Chinmay_Nautiyal_Obama.txt' romney_output = 'Mayank_Raj_Chinmay_Nautiyal_Romney.txt' #test_data = pd.read_csv("Obama_Test_dataset_NO_Label.csv", encoding = "ISO-8859-1") test_data = pd.read_csv("Romney_Test_dataset_NO_Label.csv", encoding = "ISO-8859-1") test_data_fresh = test_data[['Tweet_ID', 'Tweet_text']] test_data_fresh.head(10) textcountscols = ['count_capital_words','count_emojis','count_excl_quest_marks','count_hashtags' ,'count_mentions','count_urls','count_words'] features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols)) , ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text')) , ('vect', CountVectorizer(max_df=0.25, min_df=2, ngram_range=(1,3)))]))] , n_jobs=-1) pipeline = Pipeline([ ('features', features) , ('clf', LogisticRegression(C=1, penalty='l2')) ]) #best_model = pipeline.fit(df_model.drop('classes', axis=1), df_model.classes) best_model = pipeline.fit(df2_model.drop('classes', axis=1), df2_model.classes) df_counts_pos = tc.transform(test_data_fresh["Tweet_text"]) df_clean_pos = ct.transform(test_data_fresh["Tweet_text"]) df_model_pos = df_counts_pos df_model_pos['clean_text'] = df_clean_pos predictions = best_model.predict(df_model_pos).tolist() final_result = pd.DataFrame({'id':test_data_fresh['Tweet_ID'],'label':predictions})
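# Hypothetical final step for the snippet above (the submission format is assumed, not
# specified in the source): write the id/label pairs to the Romney output file named
# earlier.
final_result.to_csv(romney_output, sep=';', index=False, header=False)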
start = time.time() encoded_case = bucket_encoder.fit_transform(dt_test_bucket) _, knn_idxs = bucketer.kneighbors(encoded_case) knn_idxs = knn_idxs[0] relevant_cases_bucket = encoded_train.iloc[knn_idxs].index dt_train_bucket = dataset_manager.get_relevant_data_by_indexes( dt_train_prefixes, relevant_cases_bucket) # one row per event train_y = dataset_manager.get_label_numeric(dt_train_bucket) if len(set(train_y)) < 2: preds_all.append(train_y[0]) else: feature_combiner = FeatureUnion([ (method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods ]) if cls_method == "rf": cls = RandomForestClassifier( n_estimators=500, max_features=args['max_features'], random_state=random_state) elif cls_method == "xgboost": cls = xgb.XGBClassifier( objective='binary:logistic', n_estimators=500, learning_rate=args['learning_rate'], subsample=args['subsample'],
]) # To make a pipeline from all of our pipelines, we do the same thing, but now we use a FeatureUnion to join the feature processing pipelines. # # The syntax is the same as a regular pipeline: it's just an array of tuples, in the (name, object) format. # # The feature union itself is not a pipeline, it's just a union, so you need to do *one more step* to make it usable: pass it to a pipeline, with the same structure, an array of tuples in the simple (name, object) format. As you can see, we get a pipeline-ception going on as things get more complex! # # You can then apply all those transformations at once with a single fit, transform, or fit_transform call. Nice, right? # In[8]: from sklearn.pipeline import FeatureUnion feats = FeatureUnion([('text', text), ('length', length), ('words', words), ('words_not_stopword', words_not_stopword), ('avg_word_length', avg_word_length), ('commas', commas)]) feature_processing = Pipeline([('feats', feats)]) feature_processing.fit_transform(X_train) # To add a model to the mix and generate predictions as well, you can add a model at the end of the pipeline. The syntax is, you guessed it, an array of tuples, merging the transformations with a model. # # We can see the raw accuracy is at 63%. Not bad for a start. # # In[12]: from sklearn.ensemble import RandomForestClassifier pipeline = Pipeline([
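# A sketch of the model-bearing pipeline the text above describes (the original cell is
# cut off here, so the classifier settings are assumed rather than taken from the source):
pipeline_sketch = Pipeline([
    ('feats', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])
# pipeline_sketch.fit(X_train, y_train)
# predictions = pipeline_sketch.predict(X_test)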
super(LabelBinarizerPipelineFriendly, self).fit(X) def transform(self, X, y=None): return super(LabelBinarizerPipelineFriendly, self).transform(X) def fit_transform(self, X, y=None): return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X) num_attribs = list(housing_num) cat_attribs = ["ocean_proximity"] num_pipeline_2 = Pipeline([ ('selector', DataFrameSelector(num_attribs)), ('inputer', Imputer(strategy="median")), ('attribs_adder', CombinedAttributesAdder()), ('std_scaler', StandardScaler()), ]) cat_pipline = Pipeline([ ('selector', DataFrameSelector(cat_attribs)), ('label_binarizer', LabelBinarizerPipelineFriendly()), ]) full_pipeline = FeatureUnion(transformer_list=[ ('num_pipeline', num_pipeline_2), ('cat_pipeline', cat_pipline), ]) housing_prepared = full_pipeline.fit_transform(housing)
def test_feature_union(): # basic sanity check for feature union iris = load_iris() X = iris.data X -= X.mean(axis=0) y = iris.target pca = RandomizedPCA(n_components=2, random_state=0) select = SelectKBest(k=1) fs = FeatureUnion([("pca", pca), ("select", select)]) fs.fit(X, y) X_transformed = fs.transform(X) assert_equal(X_transformed.shape, (X.shape[0], 3)) # check if it does the expected thing assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) # test if it also works for sparse input # We use a different pca object to control the random_state stream fs = FeatureUnion([("pca", pca), ("select", select)]) X_sp = sparse.csr_matrix(X) X_sp_transformed = fs.fit_transform(X_sp, y) assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) # test setting parameters fs.set_params(select__k=2) assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4)) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)]) X_transformed = fs.fit_transform(X, y) assert_equal(X_transformed.shape, (X.shape[0], 8))
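# A minimal stand-in for the TransfT mock used above (an assumption; its real definition
# is not shown): a transformer that defines fit/transform but no fit_transform of its
# own, so FeatureUnion must call the two methods separately.
from sklearn.base import BaseEstimator

class TransfTSketch(BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X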