def __init__(self, env):
    """
    SGD function approximator, with preprocessing steps from:
    https://github.com/dennybritz/reinforcement-learning/blob/master/FA/Q-Learning%20with%20Value%20Function%20Approximation%20Solution.ipynb
    """
    # Feature preprocessing: normalize to zero mean and unit variance.
    # We use a few samples from the observation space to do this.
    observation_examples = np.array(
        [env.observation_space.sample() for _ in range(10000)], dtype="float64"
    )
    self.scaler = preprocessing.StandardScaler()
    self.scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space.
    self.featurizer = pipeline.FeatureUnion(
        [
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=100)),
        ]
    )
    self.featurizer.fit(self.scaler.transform(observation_examples))

    # One linear model per action; an initial partial_fit sets up the weights.
    self.models = []
    for _ in range(env.action_space.n):
        model = SGDRegressor(learning_rate="constant")
        model.partial_fit([self.featurize_state(env.reset())], [0])
        self.models.append(model)
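# NOTE: the constructor above calls self.featurize_state, which is not shown in this
# snippet. A minimal sketch of such a helper, assuming the scaler and featurizer
# fitted above (illustrative, not the original definition):
def featurize_state(self, state):
    # Scale the raw observation, then map it through the fitted RBF FeatureUnion.
    scaled = self.scaler.transform([state])
    featurized = self.featurizer.transform(scaled)
    return featurized[0]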
def create_estimator(ml_obj, numeric_features, cat_features, date_features):
    estimator = pipeline.Pipeline(steps=[
        ('Feature_processing', pipeline.FeatureUnion(transformer_list=[
            ('Numeric_features', pipeline.Pipeline(steps=[
                ('selecting', preprocessing.FunctionTransformer(
                    lambda data: data[:, numeric_features], validate=True)),
                ('scaling', preprocessing.StandardScaler(with_mean=False, with_std=True))])),
            ('Categorical_features', pipeline.Pipeline(steps=[
                ('selecting', preprocessing.FunctionTransformer(
                    lambda data: data[:, cat_features], validate=True)),
                ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown='ignore'))])),
            ('Date_features', pipeline.Pipeline(steps=[
                ('selecting', preprocessing.FunctionTransformer(
                    lambda data: data[:, date_features], validate=True)),
                ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown='ignore'))])),
        ])),
        ('Model_fitting', ml_obj)
    ])
    return estimator

# TODO: make a custom scorer
def get_default_features(self):
    tfidfvect = TfidfVectorizer()
    # features = skpipeline.FeatureUnion([('word_tfidf', tfidfvect)])
    features = skpipeline.FeatureUnion([
        ('word_tfidf', tfidfvect),
    ])
    return features
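# NOTE: a hedged usage sketch of the union built above, on a toy corpus
# (the two documents below are illustration only; skpipeline is sklearn.pipeline).
from sklearn import pipeline as skpipeline
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat", "the dog barked"]
features = skpipeline.FeatureUnion([('word_tfidf', TfidfVectorizer())])
X = features.fit_transform(docs)
print(X.shape)  # (2, vocabulary size)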
def main():
    env = LunarLander()

    # Add transformers for tile coding or extra features.
    transformer = pipeline.FeatureUnion([
        # ('scaler', preprocessing.StandardScaler()),
        # ('square', preprocessing.FunctionTransformer(lambda x: x**2, validate=False)),
        # ('dummy', DummyTransformer()),
        # ('poly', preprocessing.PolynomialFeatures(2)),
        # ('cos', preprocessing.FunctionTransformer(np.cos, validate=False)),
        # ('inverter', preprocessing.FunctionTransformer(lambda x: 1. / (x + 1.), validate=False)),
        # ('quantile', preprocessing.KBinsDiscretizer(strategy='uniform', n_bins=20, encode='onehot')),
        # ('quantile-poly', pipeline.Pipeline([
        #     ('poly', preprocessing.PolynomialFeatures(2, interaction_only=True)),
        #     ('quantile', preprocessing.KBinsDiscretizer(strategy='quantile', n_bins=20, encode='onehot-dense')),
        # ])),
        # ('quantile', pipeline.Pipeline([
        #     # ('poly', preprocessing.PolynomialFeatures(2)),
        #     ('quantile', preprocessing.KBinsDiscretizer(strategy='uniform', n_bins=200, encode='ordinal')),
        #     # ('ohe', preprocessing.OneHotEncoder(sparse=False, categories='auto'))
        # ])),
        # ('power', preprocessing.PowerTransformer()),
    ])

    s = env.reset()
    # a = 1 / (1000 * s.T @ s)
    print('Learning rate:', LR)
    agent = GMCAgent(lr=LR, init_epsilon=EPS, max_steps=MAX_STEPS, gamma=GAMMA,
                     threshold=0.0, transformer=None,
                     success_count=SUCCESS_COUNT, success_criteria=SUCCESS_CRITERIA)
    # agent = SemiGradientAgent(lr=LR, init_epsilon=EPS, max_steps=800, gamma=GAMMA, threshold=0.0,
    #                           transformer=None, success_count=3, success_criteria=220)
    # agent = EpisodicSemiGradient(lr=a, init_epsilon=0.3, max_steps=800, gamma=0.9999, threshold=0.0,
    #                              transformer=None, success_count=3, success_criteria=220)
    agent.fit(env, render_train=False, verbose=True, episodes=10000)
    agent.land(env, verbose=False)

    now = datetime.now().strftime("%Y-%m-%d")
    filename = f'weights/trained_agent_weights_{agent.alpha}_{agent.max_steps}_{now}'
    np.save(filename, agent.get_weights())
    print("Weights saved to", filename)
def main():
    books = datasets.load_files("data/Book/", shuffle=True,
                                encoding="ISO-8859-1", random_state=1337)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        books.data, books.target, test_size=0.10)

    model = pipeline.Pipeline([
        ('union', pipeline.FeatureUnion(transformer_list=[
            ('other_features', util_q1.AddOtherFeatures(feature_to_add="pos_neg_count")),
            ('text_data', pipeline.Pipeline([
                ('remove_words', util_q1.RemoveWords(words_to_remove="none")),
                ('normalisation', util_q1.NormaliseWords(normalise_type="lemmatize")),
                ('preprocess', util_q1.PreprocessData(attribute="frequency_filtering",
                                                      attribute_values="tf-idf")),
            ]))
        ])),
        ('classifier', GradientBoostingClassifier(n_estimators=60, max_features="sqrt",
                                                  subsample=0.8))
    ])

    scoring = {
        "accuracy": "accuracy",
        "recall": "recall",
        "precision": "precision"
    }
    grid_search_model = model_selection.GridSearchCV(
        model,
        {
            "classifier__max_depth": range(5, 11, 5),
            "classifier__min_samples_split": range(5, 11, 5)
        },
        n_jobs=-1, verbose=10, scoring=scoring, refit=False)
    grid_search_model.fit(X_train, y_train)
    joblib.dump(grid_search_model, "outputs/gridsearch_xgboost_aws.pkl")
def main():
    model = pipeline.Pipeline([
        ("features", pipeline.FeatureUnion(transformer_list=[
            ("other_features", AddOtherFeatures(feature_to_add="None")),
            ("text_data", pipeline.Pipeline([
                ("remove_words", RemoveWords(words_to_remove="None")),
                ("normalize_words", NormalizeWords(normalize_type="None")),
                ("vectorize_text", VectorizeText(vectorize_type="None")),
                ("reduce_dimension", ReduceDimension(reduction_type="None")),
                ("normalize_features", NormalizeFeatures(normalize_type="None")),
            ]))
        ])),
        ("classifier", CurrentModel(model_name="knn"))
    ])

    scoring = calculate_score()
    # Grid keys address each step's constructor parameter, not the step name itself.
    grid_search_model = model_selection.GridSearchCV(model, {
        "features__other_features__feature_to_add": ["None"],
        "features__text_data__remove_words__words_to_remove":
            ["None", "tool_words", "closed_class", "tool_words_and_closed_class"],
        "features__text_data__normalize_words__normalize_type":
            ["None", "Stemming", "Lemmatization"],
        "features__text_data__vectorize_text__vectorize_type":
            ["None", "Presence", "Frequency", "td_idf"],
        "features__text_data__reduce_dimension__reduction_type": ["None", "PCA"],
        "features__text_data__normalize_features__normalize_type": ["None", "min_max_scale"]
    }, n_jobs=-1, verbose=10, scoring=scoring, refit=False)
    grid_search_model.fit(X_train, y_train)
    pickle.dump(grid_search_model, open("fitted_pipeline", "wb"))
def build_model():
    """
    Build an NLP pipeline for multi-label text classification.
    """
    # text processing and model pipeline
    pipeline = skpipe.Pipeline([
        ('nlp', skpipe.FeatureUnion([
            ('tfif', skpipe.Pipeline([
                ('feat', skfet.TfidfVectorizer(strip_accents='unicode', tokenizer=tokenize)),
                ('lsa', skdec.TruncatedSVD(n_components=200, algorithm='arpack'))])),
            ('uppr', RatioUpperExtractor()),
            ('verb', CountVerbExtractor()),
            ('noun', RatioNounExtractor())
        ])),
        ('norm', skprep.StandardScaler()),
        ('clf', MLPClassifier(activation='logistic', learning_rate='adaptive',
                              early_stopping=True, random_state=RANDOM_SEED, verbose=1))
    ])

    # define grid search parameters
    params = {
        'clf__learning_rate_init': [5e-3, 7.5e-3, 1e-2],
        'clf__hidden_layer_sizes': [(100,), (200,), (300,)]
    }

    # instantiate GridSearchCV object
    cv = skms.GridSearchCV(estimator=pipeline, param_grid=params, n_jobs=-1,
                           refit=True, return_train_score=True)

    return cv
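# NOTE: a hedged usage sketch of build_model(); X (raw documents) and Y (a binary
# label-indicator matrix) are placeholders standing in for the real data loading
# code, which is not part of this snippet.
cv_model = build_model()
cv_model.fit(X, Y)
print(cv_model.best_params_)
predictions = cv_model.predict(X)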
def get_estimator(self):
    binary = ('binary_variables_processing',
              preprocessing.FunctionTransformer(
                  lambda data: data[:, Model.binary_data_indices], validate=True))

    categorial = ('categorical_variables_processing', pipeline.Pipeline(steps=[
        ('selecting', preprocessing.FunctionTransformer(
            lambda data: data[:, Model.categorical_data_indices], validate=True)),
        ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False))
    ]))

    estimator = pipeline.Pipeline(steps=[
        ('feature_processing', pipeline.FeatureUnion(transformer_list=[binary, categorial])),
        ('model_fitting', self.regressor)
    ])
    return estimator
def __init__(self, env, use_kernel=False, **agent_params):
    self.env = env
    self.use_kernel = use_kernel

    if use_kernel:
        # Sample feature space and define scaler to detrend data
        observation_samples = np.array(
            [env.observation_space.sample() for x in range(10000)])
        self.detrend = preprocessing.StandardScaler()
        self.detrend.fit(observation_samples)

        # Use detrended data to generate feature space with RBF kernels
        self.featurizer = pipeline.FeatureUnion([
            ("rbf1", RBFSampler(gamma=3.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=100))
        ])
        self.featurizer.fit(self.detrend.transform(observation_samples))

    # Generate linear value function model for each action in our action space
    self.models = []
    initReward = np.array(0)
    for k in range(env.action_space.n):
        self.models.append(
            linear_model.SGDRegressor(learning_rate="constant"))
        random_features = self.map_to_features(self.env.reset())
        self.models[k].partial_fit(random_features.reshape(1, -1),
                                   initReward.ravel())

    self.agent_params = {
        "epsilon_min": 0.01,
        "decay_rate": 0.02,
        "discount": 0.99,
        "iter": 1000
    }
    self.agent_params.update(agent_params)
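# NOTE: the per-action SGDRegressors above are typically queried and updated as in
# this hedged sketch; map_to_features is the class's own featurizer (not shown here),
# and the method names below are illustrative, not from the original source.
def predict_q(self, state):
    # One Q-value estimate per action, stacked into a vector.
    features = self.map_to_features(state).reshape(1, -1)
    return np.array([m.predict(features)[0] for m in self.models])

def update(self, state, action, td_target):
    # Single SGD step toward the TD target for the chosen action's model.
    features = self.map_to_features(state).reshape(1, -1)
    self.models[action].partial_fit(features, np.array([td_target]))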
])
# transformed_data = cat_feature_pipeline.fit_transform(X_train[['ENRL_CERT_NBR']])

num_feature_pipeline = pipeline.Pipeline([
    ('imputation', impute.SimpleImputer()),
    ('standardscalar', preprocessing.StandardScaler())
])
# transformed_data = num_feature_pipeline.fit_transform(X_train[['TOT_BLNG_AMT']])

feature_preprocessing = compose.ColumnTransformer(
    [('cat_feature_pipeline', cat_feature_pipeline, cat_features_list),
     ('num_feature_pipeline', num_feature_pipeline, num_features_list)],
    n_jobs=10)

features_pipeline = pipeline.FeatureUnion(
    [('pca_selector', decomposition.PCA(n_components=0.90)),
     ('et_selector', feature_selection.SelectFromModel(ensemble.ExtraTreesClassifier()))],
    n_jobs=20)

classifier = tree.DecisionTreeClassifier()

# build complete pipeline with feature selection and ml algorithms
complete_pipeline = PMMLPipeline([('preprocess', feature_preprocessing),
                                  ('zv_filter', feature_selection.VarianceThreshold()),
                                  ('features', features_pipeline),
                                  ('tree', classifier)])

pipeline_grid = {}
grid_estimator = model_selection.GridSearchCV(complete_pipeline, pipeline_grid,
                                              scoring="accuracy",
def _generate_feature_extraction_pipeline(self): lang = self.feature_config.lang feature_weights = self.feature_config.weights prep_params = self.feature_config.prepchoice # features found in the processed tokens preprocessor = prep.Preprocessor( lang=lang, stopword=prep_params.stopword, more_stopwords=prep_params.more_stopwords, spellcheck=prep_params.spellcheck, stemming=prep_params.stemming, remove_numbers=prep_params.remove_numbers, deasciify=prep_params.deasciify, remove_punkt=prep_params.remove_punkt, lowercase=prep_params.lowercase) tfidfvect = sktext.TfidfVectorizer( tokenizer=prep.identity, preprocessor=None, lowercase=False, use_idf=prep_params.use_idf, ngram_range=prep_params.wordngramrange, max_features=prep_params.nmaxfeature) polpipe3 = toktrans.get_lexicon_count_pipeline(tokenizer=prep.identity, lexicontype=lang) token_weights = dict(tfidfvect=feature_weights["word_tfidf"], polpipe3=feature_weights["lexicon_count"]) token_transformers_dict = dict( tfidfvect= tfidfvect, # not to lose above integrity if we change variable names polpipe3=polpipe3) token_transformers = [(k, v) for k, v in token_transformers_dict.items()] tokenbasedpipe = skpipeline.Pipeline([ ('preprocessor', preprocessor), # ('nadropper', tbt.DropNATransformer()), ('union1', skpipeline.FeatureUnion(transformer_list=token_transformers, transformer_weights=token_weights)), ]) charngramvect = sktext.TfidfVectorizer( analyzer='char_wb', ngram_range=prep_params.charngramrange, lowercase=False) polpipe1 = txtrans.get_polylglot_polarity_count_pipe(lang) polpipe2 = txtrans.get_polylglot_polarity_value_pipe(lang) text_weights = dict( charngramvect=feature_weights["char_tfidf"], # @TODO hardcoded polpipe1=feature_weights["polyglot_count"], polpipe2=feature_weights["polyglot_value"]) text_transformers_dict = dict(charngramvect=charngramvect, polpipe1=polpipe1, polpipe2=polpipe2) text_transformers = [(k, v) for k, v in text_transformers_dict.items()] ''' textpipes = [('charngramvect', charngramvect),] textpweights = {'charngramvect' : 1.5} textpweights = dict(charngramvect = 1 if charngramvect else 0) ''' textbasedpipe = skpipeline.Pipeline([( 'union2', skpipeline.FeatureUnion(transformer_list=text_transformers, transformer_weights=text_weights), )]) print(text_weights) final_transformers_dict = dict(tokenbasedpipe=tokenbasedpipe, textbasedpipe=textbasedpipe) final_transformers = [(k, v) for k, v in final_transformers_dict.items()] ''' #tweights = {k : 1 if v else 0 for k,v in final_transformers.items()} check_zero = lambda x : 1 if sum(x) > 0 else 0 x = list(tokenbasedpipe.get_params(False).values()) print(len(x), x[0]) print(x[0][1]) # convert x[0] tuple to dict, then get transformer weights print("**") print(x,"\n--") print(list(textbasedpipe.get_params(False).values())) tweights = {k : check_zero(list(k.get_params(False).values())[0][0][1].get_params(False)["transformer_weights"].values()) for _, k in final_transformers_dict.items()} ''' feature_union = skpipeline.FeatureUnion( transformer_list=final_transformers, # transformer_weights=tweights # weight assignment is not necessary as the number of features is small ) return feature_union
def run_prep():
    # n_iter was renamed to max_iter in later scikit-learn releases.
    classifier = sklinear.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                        n_iter=5, random_state=42)

    lang = "tr"
    stopword_choice = True
    more_stopwords_list = None
    spellcheck_choice = False
    stemming_choice = False
    number_choice = False
    deasc_choice = True
    punct_choice = True
    case_choice = True
    ngramrange = (1, 2)  # tuple
    nmaxfeature = 10000  # int or None
    norm = "l2"
    use_idf = True

    preprocessor = Preprocessor(lang=lang,
                                stopword=stopword_choice,
                                more_stopwords=more_stopwords_list,
                                spellcheck=spellcheck_choice,
                                stemming=stemming_choice,
                                remove_numbers=number_choice,
                                deasciify=deasc_choice,
                                remove_punkt=punct_choice,
                                lowercase=case_choice)

    tfidfvect = TfidfVectorizer(tokenizer=identity,
                                preprocessor=None,
                                lowercase=False,
                                use_idf=use_idf,
                                ngram_range=ngramrange,
                                max_features=nmaxfeature)

    keyword = "arıza"
    apipe = tbt.get_keyword_pipeline(keyword)
    keyword2 = "pstn"
    pstnpipe = tbt.get_keyword_pipeline(keyword2)

    polpipe1 = tbt.get_polylglot_polarity_count_pipe(lang)
    polpipe2 = tbt.get_polylglot_polarity_value_pipe(lang)
    polpipe3 = obt.get_lexicon_count_pipeline(tokenizer=identity)

    tokenizedpipe = skpipeline.Pipeline([
        ('preprocessor', preprocessor),
        ('union1', skpipeline.FeatureUnion(transformer_list=[
            ('vect', tfidfvect),
            ('polarity3', polpipe3),
        ])),
    ])

    textbasedpipe = skpipeline.Pipeline([(
        'union2',
        skpipeline.FeatureUnion([
            ('has_ariza', apipe),
            ('has_pstn', pstnpipe),
            ('polarity1', polpipe1),
            ('polarity2', polpipe2),
        ]),
    )])

    model = skpipeline.Pipeline([
        # ('preprocessor', preprocessor),
        ("union", skpipeline.FeatureUnion(transformer_list=[
            ('tfidf', tokenizedpipe),
            ('txtpipe', textbasedpipe),
        ])),
        ('classifier', classifier),
    ])

    t0 = time()
    print("Read data")
    instances, labels = get_data.get_data()
    N = 100
    instances, labels = corpus_io.select_N_instances(N, instances, labels)

    # instances_train, instances_test, ytrain, ytest = cv.train_test_split(instances, labels, test_size=0.30, random_state=20)
    print("Start classification\n..")
    nfolds = 5
    ypred = cv.cross_val_predict(model, instances, labels, cv=nfolds)
    tc_utils.get_performance(labels, ypred, verbose=True)

    t1 = time()
    print("Classification took ", round(t1 - t0, 2), "sec.")
def _email_features_pipeline(lang, stopword_choice=True, more_stopwords_list=None, spellcheck_choice=False, stemming_choice=False, number_choice=False, deasc_choice=True, punct_choice=True, case_choice=True, ngramrange=(1, 2), # tuple nmaxfeature=10000, # int or None norm="l2", use_idf=True, keywords=[], # ["arıza", "pstn"] final_weights=dict(text_based=1, token_based=1) ): # use a list of (pipeline, pipeline_name, weight) # features found in the processed tokens token_features = [] token_weights = {} preprocessor = prep.Preprocessor(lang=lang, stopword=stopword_choice, more_stopwords=more_stopwords_list, spellcheck=spellcheck_choice, stemming=stemming_choice, remove_numbers=number_choice, deasciify=deasc_choice, remove_punkt=punct_choice, lowercase=case_choice ) tfidfvect = TfidfVectorizer(tokenizer=prep.identity, preprocessor=None, lowercase=False, use_idf=use_idf, ngram_range=ngramrange, max_features=nmaxfeature) tfidfvect_name = 'word_tfidfvect' token_features.append((tfidfvect_name, tfidfvect)) token_weights[tfidfvect_name] = 1 # features found in the whole raw text text_features = [] text_weights = {} # charngramvect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False) # keyword presence features if keywords: for keyword in keywords: keywordpipe = txbt.get_keyword_pipeline(keyword) feature_name = "has_" + keyword text_features.append((feature_name, keywordpipe)) text_weights[feature_name] = 1 tokenbasedpipe = skpipeline.Pipeline([('preprocessor', preprocessor), # ('nadropper', tbt.DropNATransformer()), ('union1', skpipeline.FeatureUnion( transformer_list=token_features , transformer_weights=token_weights )), ]) textbasedpipe = skpipeline.Pipeline([('union2', skpipeline.FeatureUnion( transformer_list=text_features, transformer_weights=text_weights ), ) ]) ####### # add the feature pipes to final_features if all the component weights are non-zero. ######## check_zero_list = lambda x : 1 if sum(x) > 0 else 0 # l = [0,0,0] => check_zero(l) gives 0 and l=[0,0,1] => check_zero(l) gives 1. final_features_dict = {} tkweights = list(token_weights.values()) if(check_zero_list(tkweights) != 0): final_features_dict["token_based"] = tokenbasedpipe else: final_weights["token_based"] = 0 txweights = list(text_weights.values()) if(check_zero_list(txweights) != 0): final_features_dict["text_based"] = textbasedpipe else: final_weights["text_based"] = 0 final_features = list(final_features_dict.items()) fweights = list(final_weights.values()) if((check_zero_list(fweights) == 0) or (len(final_features) == 0)): return None ''' features = skpipeline.FeatureUnion(transformer_list=[ ('tokenbasedfeatures', tokenbasedpipe), ('textbasedfeatures', textbasedpipe), ], transformer_weights=final_weights) ''' features = skpipeline.FeatureUnion(transformer_list=final_features, transformer_weights=final_weights) return features
def _tr_sentiment_features_pipeline( lang="tr", feature_weights={ "word_tfidf": 1, "polyglot_value": 0, "polyglot_count": 0, "lexicon_count": 0, "char_tfidf": 1 }, stopword_choice=True, more_stopwords_list=None, spellcheck_choice=False, stemming_choice=False, number_choice=False, deasc_choice=True, punct_choice=True, case_choice=True, word_ngramrange=(1, 2), # tuple char_ngramrange=(2, 2), nmaxfeature=10000, # int or None norm="l2", use_idf=True): preprocessor = prep.Preprocessor(lang=lang, stopword=stopword_choice, more_stopwords=more_stopwords_list, spellcheck=spellcheck_choice, stemming=stemming_choice, remove_numbers=number_choice, deasciify=deasc_choice, remove_punkt=punct_choice, lowercase=case_choice) tfidfvect = TfidfVectorizer(tokenizer=prep.identity, preprocessor=None, lowercase=False, use_idf=use_idf, ngram_range=word_ngramrange, max_features=nmaxfeature) polpipe3 = obt.get_lexicon_count_pipeline(tokenizer=prep.identity) token_weights = dict(tfidfvect=feature_weights["word_tfidf"], polpipe3=feature_weights["lexicon_count"]) token_transformers_dict = dict( tfidfvect= tfidfvect, # not to lose above integrity if we change variable names polpipe3=polpipe3) token_transformers = [(k, v) for k, v in token_transformers_dict.items()] tokenbasedpipe = skpipeline.Pipeline([ ('preprocessor', preprocessor), # ('nadropper', tbt.DropNATransformer()), ('union1', skpipeline.FeatureUnion(transformer_list=token_transformers, transformer_weights=token_weights)), ]) charngramvect = TfidfVectorizer(analyzer='char_wb', ngram_range=char_ngramrange, lowercase=False) polpipe1 = tbt.get_polylglot_polarity_count_pipe(lang) polpipe2 = tbt.get_polylglot_polarity_value_pipe(lang) text_weights = dict(charngramvect=feature_weights["char_tfidf"], polpipe1=feature_weights["polyglot_count"], polpipe2=feature_weights["polyglot_value"]) text_transformers_dict = dict(charngramvect=charngramvect, polpipe1=polpipe1, polpipe2=polpipe2) text_transformers = [(k, v) for k, v in text_transformers_dict.items()] ''' textpipes = [('charngramvect', charngramvect),] textpweights = {'charngramvect' : 1.5} textpweights = dict(charngramvect = 1 if charngramvect else 0) ''' textbasedpipe = skpipeline.Pipeline([( 'union2', skpipeline.FeatureUnion(transformer_list=text_transformers, transformer_weights=text_weights), )]) final_transformers_dict = dict(tokenbasedpipe=tokenbasedpipe, textbasedpipe=textbasedpipe) final_transformers = [(k, v) for k, v in final_transformers_dict.items()] ''' #tweights = {k : 1 if v else 0 for k,v in final_transformers.items()} check_zero = lambda x : 1 if sum(x) > 0 else 0 x = list(tokenbasedpipe.get_params(False).values()) print(len(x), x[0]) print(x[0][1]) # convert x[0] tuple to dict, then get transformer weights print("**") print(x,"\n--") print(list(textbasedpipe.get_params(False).values())) tweights = {k : check_zero(list(k.get_params(False).values())[0][0][1].get_params(False)["transformer_weights"].values()) for _, k in final_transformers_dict.items()} ''' features = skpipeline.FeatureUnion( transformer_list=final_transformers, # transformer_weights=tweights # weight assignment is not necessary as the number of features is small ) ''' tokenbasedpipe = skpipeline.Pipeline([('preprocessor', preprocessor), #('nadropper', tbt.DropNATransformer()), ('union1', skpipeline.FeatureUnion( transformer_list=[ ('tfidfvect', tfidfvect), #('polarity3', polpipe3), ])),] ) textbasedpipe = skpipeline.Pipeline([('union2', skpipeline.FeatureUnion([ #('polarity1', polpipe1), #('polarity2', polpipe2), ('charngramvect', 
charngramvect), ]),)]) features = skpipeline.FeatureUnion(transformer_list=[ ('tokenbasedfeatures', tokenbasedpipe), ('textbasedfeatures', textbasedpipe), ]) ''' return features
def _ar_txt_clf_features_pipeline2( feature_params_config_dict # {feature_params: {lang: .., weights : .., prep : {}, keywords : []}} see EMAIL_CONF for an example. ): lang = feature_params_config_dict[conf.lang_key] feature_weights = feature_params_config_dict[conf.weights_key] prep_params = feature_params_config_dict[conf.prep_key] #print(feature_weights) # features found in the processed tokens preprocessor = prep.Preprocessor( lang=lang, stopword=prep_params[conf.stopword_key], more_stopwords=prep_params[conf.more_stopwords_key], spellcheck=prep_params[conf.spellcheck_key], stemming=prep_params[conf.stemming_key], remove_numbers=prep_params[conf.remove_numbers_key], deasciify=prep_params[conf.deasciify_key], remove_punkt=prep_params[conf.remove_punkt_key], lowercase=prep_params[conf.lowercase_key]) tfidfvect = TfidfVectorizer( tokenizer=prep.identity, preprocessor=None, lowercase=False, use_idf=prep_params[conf.use_idf_key], ngram_range=prep_params[conf.wordngramrange_key], max_features=prep_params[conf.nmaxfeature_key]) token_weights = dict(tfidfvect=feature_weights["word_tfidf"], ) token_transformers_dict = dict( tfidfvect= tfidfvect, # not to lose above integrity if we change variable names ) token_transformers = [(k, v) for k, v in token_transformers_dict.items()] tokenbasedpipe = skpipeline.Pipeline([ ('preprocessor', preprocessor), # ('nadropper', tbt.DropNATransformer()), ('union1', skpipeline.FeatureUnion(transformer_list=token_transformers, transformer_weights=token_weights)), ]) charngramvect = TfidfVectorizer( analyzer='char_wb', ngram_range=prep_params[conf.charngramrange_key], lowercase=False) # stylistic ''' # BUG named_entity_pipe = tbt.get_named_entity_weight_pipeline(lang) text_weights = dict(charngramvect=feature_weights["char_tfidf"], # @TODO hardcoded polpipe1=feature_weights["polyglot_count"], polpipe2=feature_weights["polyglot_value"], named_entity_pipe=feature_weights["named_entity_rate"]) text_transformers_dict = dict(charngramvect=charngramvect, polpipe1=polpipe1, polpipe2=polpipe2, named_entity_pipe=named_entity_pipe) ''' text_weights = dict( charngramvect=feature_weights["char_tfidf"], # @TODO hardcoded ) text_transformers_dict = dict(charngramvect=charngramvect, ) text_transformers = [(k, v) for k, v in text_transformers_dict.items()] ''' textpipes = [('charngramvect', charngramvect),] textpweights = {'charngramvect' : 1.5} textpweights = dict(charngramvect = 1 if charngramvect else 0) ''' textbasedpipe = skpipeline.Pipeline([( 'union2', skpipeline.FeatureUnion(transformer_list=text_transformers, transformer_weights=text_weights), )]) final_transformers_dict = dict(tokenbasedpipe=tokenbasedpipe, textbasedpipe=textbasedpipe) final_transformers = [(k, v) for k, v in final_transformers_dict.items()] #print(textbasedpipe.named_steps) ''' #tweights = {k : 1 if v else 0 for k,v in final_transformers.items()} check_zero = lambda x : 1 if sum(x) > 0 else 0 x = list(tokenbasedpipe.get_params(False).values()) print(len(x), x[0]) print(x[0][1]) # convert x[0] tuple to dict, then get transformer weights print("**") print(x,"\n--") print(list(textbasedpipe.get_params(False).values())) tweights = {k : check_zero(list(k.get_params(False).values())[0][0][1].get_params(False)["transformer_weights"].values()) for _, k in final_transformers_dict.items()} ''' features = skpipeline.FeatureUnion( transformer_list=final_transformers, # transformer_weights=tweights # weight assignment is not necessary as the number of features is small ) #print("0000000000", 
feature_params_config_dict) return features
        return self

    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)


cat_pipeline = pipeline.Pipeline([
    ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(sparse=False)),
])
cat_pipeline.fit_transform(train_data)

preprocess_pipeline = pipeline.FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

X_train = preprocess_pipeline.fit_transform(train_data)
print('X_train')
print(X_train[:5])

y_train = train_data["Survived"]

# SVC
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)
print('svm_clf')
print(svm_clf)

# CHECK PREDICTION
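# NOTE: DataFrameSelector and MostFrequentImputer are assumed to be defined earlier in
# the original file; a minimal sketch of what such helpers usually look like
# (an assumption, not the original definitions):
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a subset of DataFrame columns."""
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with each column's most frequent value."""
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series(
            [X[c].value_counts().index[0] for c in X], index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)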
    INPUT: Dataframe with features (X), target variable dataframe (y), polynomial degree (parameter)
    OUTPUT: Score of XGB Regressor
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model_xgb = xgb.XGBRegressor(n_estimators=n_est, learning_rate=0.15, max_depth=2)
    model_xgb.fit(X_train, y_train, eval_metric='rmse')
    return model_xgb.score(X_test, y_test)


def regression_pipeline(x, y, degree, pca_comps):
    '''
    INPUT: Dataframe with features (X), target variable dataframe (y), polynomial degree, PCA components
    OUTPUT: Score of regression pipeline using polynomial features and PCA
    '''
    # let's try applying some feature transforms to our original X
    combined_features = pipeline.FeatureUnion([('poly', PolynomialFeatures(degree=degree)),
                                               ('pca', decomposition.PCA(n_components=pca_comps))])
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.3)

    # now let's lay out the steps of our model:
    # First we do the feature transforms with our feature union defined above
    # Second we do feature selection with the built-in SelectFromModel
    # Third we train the actual model
    steps = [
        ('features', combined_features),
        ('feature_selection', feature_selection.SelectFromModel(Lasso(alpha=.5))),
        ('model', LinearRegression())]

    # at this point we have only defined the regressor; no training has happened yet
    regression_pipeline = pipeline.Pipeline(steps)

    # calling fit here calls fit on the entire pipeline, which in turn fits all its members
    regression_pipeline.fit(X_train, y_train)
    # score the held-out split, as described in the docstring
    return regression_pipeline.score(X_test, y_test)
clf = XGBClassifier(max_depth=5, n_estimators=100)

estimator = pipeline.Pipeline(steps=[
    ('feature_preprocessing', pipeline.FeatureUnion(transformer_list=[
        # real-valued features
        ('numeric_variables_processing', pipeline.Pipeline(steps=[
            ('selecting', preprocessing.FunctionTransformer(
                lambda data: data[:, real_data_indices])),
            ('scaling', preprocessing.StandardScaler(with_mean=False))])),
        # categorical features
        ('categorical_variables_processing', pipeline.Pipeline(steps=[
            ('selecting', preprocessing.FunctionTransformer(
                lambda data: data[:, cat_data_indices])),
            ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown='ignore'))])),
    ])),
    ('model_fitting', clf)
])

estimator.fit(train_data, y)
pred = estimator.predict(test_data)
write_to_csv('check.csv', ['PassengerId', 'Survived'], pred)
counters_pipe = pipeline.FeatureUnion(
    n_jobs=-1,
    transformer_list=[
        ('chars_features', pipeline.Pipeline([
            ('chars_counter', TfidfVectorizer(analyzer=u'char',
                                              ngram_range=(2, 5),
                                              tokenizer=None,
                                              max_features=config.max_features,
                                              strip_accents=None,
                                              max_df=0.9,
                                              min_df=2,
                                              lowercase=False)),
            ('chars_tsvd', TruncatedSVD(n_components=config.svd_n_components,
                                        n_iter=25,
                                        random_state=42))])),
        ('words_features', pipeline.Pipeline([
            ('words_counter', TfidfVectorizer(analyzer=u'word',
                                              ngram_range=(1, 3),
                                              tokenizer=None,
                                              use_idf=True,
                                              max_features=config.max_features,
                                              strip_accents=None,
                                              max_df=0.9,
                                              min_df=2,
                                              lowercase=False)),
            ('words_tsvd', TruncatedSVD(n_components=config.svd_n_components,
                                        n_iter=25,
                                        random_state=42))])),
    ])
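# NOTE: a hedged usage sketch of the union above; `train_texts` stands in for the real
# corpus, and config.max_features / config.svd_n_components are assumed to be set elsewhere.
X = counters_pipe.fit_transform(train_texts)
# char-level and word-level SVD components are concatenated column-wise
print(X.shape)  # (len(train_texts), 2 * config.svd_n_components)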
def _generate_feature_extraction_pipeline(self): lang = self.feature_config.lang final_weights = self.feature_config.weights prep_params = self.feature_config.prepchoice # features found in the processed tokens token_features = [] token_weights = {} preprocessor = prep.Preprocessor( lang=lang, stopword=prep_params.stopword, more_stopwords=prep_params.more_stopwords, spellcheck=prep_params.spellcheck, stemming=prep_params.stemming, remove_numbers=prep_params.remove_numbers, deasciify=prep_params.deasciify, remove_punkt=prep_params.remove_punkt, lowercase=prep_params.lowercase) tfidfvect = TfidfVectorizer(tokenizer=prep.identity, preprocessor=None, lowercase=False, use_idf=prep_params.use_idf, ngram_range=prep_params.wordngramrange, max_features=prep_params.nmaxfeature) tfidfvect_name = 'word_tfidfvect' token_features.append((tfidfvect_name, tfidfvect)) token_weights[tfidfvect_name] = 1 # features found in the whole raw text text_features = [] text_weights = {} # charngramvect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False) tokenbasedpipe = skpipeline.Pipeline([ ('preprocessor', preprocessor), # ('nadropper', tbt.DropNATransformer()), ('union1', skpipeline.FeatureUnion(transformer_list=token_features, transformer_weights=token_weights)), ]) if text_weights: textbasedpipe = skpipeline.Pipeline([( 'union2', skpipeline.FeatureUnion(transformer_list=text_features, transformer_weights=text_weights), )]) ''' features = skpipeline.FeatureUnion(transformer_list=[ ('tokenbasedfeatures', tokenbasedpipe), ('textbasedfeatures', textbasedpipe), ], transformer_weights=final_weights) ''' ####### # add the feature pipes to final_features if all the component weights are non-zero. ######## check_zero_list = lambda x: 1 if sum(x) > 0 else 0 # l = [0,0,0] => check_zero(l) gives 0 and l=[0,0,1] => check_zero(l) gives 1. final_features_dict = {} tkweights = list(token_weights.values()) if (check_zero_list(tkweights) != 0): final_features_dict["token_based"] = tokenbasedpipe else: final_weights["token_based"] = 0 txweights = list(text_weights.values()) if (check_zero_list(txweights) != 0): final_features_dict["text_based"] = textbasedpipe else: final_weights["text_based"] = 0 final_features = list(final_features_dict.items()) fweights = list(final_weights.values()) #print(final_weights) if ((check_zero_list(fweights) == 0) or (len(final_features) == 0)): return None features = skpipeline.FeatureUnion(transformer_list=final_features, transformer_weights=final_weights) return features
def _build_feature_pipeline(self, sample_mode='rollouts', num_components=50, gammas=None, num_obs=10000, use_standard_scaler=True, featurizer_max_env_steps=10000): """Build the feature pipeline. Args: sample_mode: A string rerpresenting how to collect data from the environment to build features. Must be {'rollouts', 'reset', 'random'}. - `rollouts` will collect observations by executing a random policy in the env. - `reset` will collect rollouts by repeatedly resetting the env. - `random` will just sample the env observation space randomly. num_components: The number of components in each RBF. gammas: A list containing the frequency of each RBF. If None will default to `[0.5, 1.0, 2.5, 5.0]`. num_obs: The integer number of observations to use to fit the Kernels. use_standard_scaler: Boolean indicating if the observations should be normalized. featurizer_max_env_steps: Maximum number of steps to be taken in each rollout to estimate the kernels in the featurizer. Raises: ValueError: If the `sample_mode` is unknown. """ env = self._env._envs[0] # pylint: disable=protected-access if gammas is None: gammas = [0.5, 1.0, 2.5, 5.0] features = [] for i, gamma in enumerate(gammas): features.append( ('rbf{}'.format(i), kernel_approximation.RBFSampler(gamma=gamma, n_components=num_components))) self.featurizer = pipeline.FeatureUnion(features) if use_standard_scaler: self.scaler = skl_preprocessing.StandardScaler() if sample_mode == 'random': # Randomly sample from the observation space to fit the featurizers. observation_examples = np.array([env.observation_space.sample() for _ in range(num_obs)]) # pylint: disable=line-too-long elif sample_mode == 'reset': # Just reset the environment to obtain the observations. observation_examples = np.array( [env.reset() for _ in range(num_obs)]) elif sample_mode == 'rollouts': # Rollout mode. observations = [] while True: observations.append(env.reset()) done = False t = 0 while not done and t < featurizer_max_env_steps: action = env.action_space.sample() obs, _, done, _ = env.step(action) observations.append(obs) if len(observations) > num_obs: break # Collected enough observations. observation_examples = np.array(observations) else: raise ValueError('Unknown `sample_mode`!') if use_standard_scaler: self.scaler.fit(observation_examples) if use_standard_scaler: self.scaler.transform(observation_examples) self.featurizer.fit(observation_examples) self.use_standard_scaler = use_standard_scaler
def main():
    '''
    The main function reads in data and sets up the union of features.
    '''
    train_file = 'train_data.csv'
    blind_file = 'test_features_2013-03-07.csv'

    # read csv files into a dataframe
    train_df = pd.read_csv(train_file)
    blind_df = pd.read_csv(blind_file)

    # do print-out checks of the file
    print(train_df.shape)
    print(blind_df.shape)
    print(train_df.head(5))

    # fix column names in blind_df to be all lowercase, to match train_df
    for col in blind_df.columns:
        blind_df.rename(columns={col: col.lower()}, inplace=True)
    print(blind_df.head(5))

    # take out the zero-target values from the train data
    X_total = train_df[train_df['target'] > 0]
    # set the predicted y values to be the 'target' column from the train dataframe
    y_total = X_total['target']
    print(X_total.shape)
    print(y_total.shape)

    # divide train data into a 'train' set and a validation set (20% of the training dataframe)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X_total, y_total, test_size=0.2)

    # make a dictionary of weights for each step in the model pipeline
    transform_dict = {'job type': 1, 'years exp': 1, 'location': 1,
                      'degree': 1, 'major': 1, 'industry': 1}

    # combine features into one large set
    all_features = pipeline.FeatureUnion([
        ('job type', FullModelTransformer(CategTransformer('jobtype'))),
        ('years exp', FullModelTransformer(LinFitTransformer('yearsexperience'))),
        ('location', FullModelTransformer(LinFitTransformer('milesfrommetropolis'))),
        ('degree', FullModelTransformer(CategTransformer('degree'))),
        ('major', FullModelTransformer(CategTransformer('major'))),
        ('industry', FullModelTransformer(CategTransformer('industry')))
    ], transformer_weights=transform_dict)

    # make a pipeline which fits all features and then feeds those predictions
    # into an overall model (k nearest neighbors)
    k_union = pipeline.Pipeline([
        ("features", all_features),
        ('modelfit', KNeighborsRegressor(n_neighbors=3))
        # ("linreg", LinearRegression(fit_intercept=True))
    ])

    # fit the train data and print the R^2 score of the fit
    k_union.fit(X_train, y_train.values.reshape(-1, 1))
    print(k_union.score(X_train, y_train.values.reshape(-1, 1)))

    # fit the validation test data and print the R^2 score of the fit
    k_union.fit(X_test, y_test.values.reshape(-1, 1))
    print(k_union.score(X_test, y_test.values.reshape(-1, 1)))

    # predict on the blind data and add the prediction as a column in the blind dataframe
    result = k_union.predict(blind_df)
    blind_df['target'] = result

    # write out the resulting dataframe to a new csv file
    header = ["jobid", "target"]
    # blind_df.to_csv('test_target.csv', columns=header, index=False)

    # predict on the entire input dataset
    result = k_union.predict(X_total)
    X_total['target_pred'] = result

    # write the resulting dataframe to a new csv file
    header = ["jobid", "target_pred"]
    X_total.to_csv('train_target_pred.csv', columns=header, index=False)

    # send results to plot
    make_plot(X_total, blind_df)
    return
# specify cross-validation
k = 10  # number of folds
cvsplitter = ms.KFold(n_splits=k, shuffle=True, random_state=0)  # cross-validation splitter
score = ms.cross_val_score(model, X, y, cv=cvsplitter)
print('Standardized linear discriminant analysis mean accuracy: {0:.4f}'.format(score.mean()))

# define steps in a feature selection pipeline
features = list()
features.append(('pca', decomp.PCA(n_components=3)))  # use PCA to keep 3 components
features.append(('select_best', fs.SelectKBest(k=6)))  # keep the 6 best features by univariate test
feature_union = pipeline.FeatureUnion(features)  # create the feature selection pipeline

estimators = list()
estimators.append(('feature_union', feature_union))  # add the feature selection pipeline to a new pipeline
estimators.append(('logistic', sl.LogisticRegression(max_iter=1000)))  # use logistic regression as the model
model = pipeline.Pipeline(estimators)  # logistic regression with automatic feature selection

# specify cross-validation
score = ms.cross_val_score(model, X, y, cv=cvsplitter)
print('Logistic regression with automatic feature selection by PCA and chi2 test mean accuracy: {0:.4f}'.format(score.mean()))
pipeline.FeatureUnion(
    n_jobs=-1,
    transformer_list=[
        ('standard', cust_regression_vals()),
        ('pi1', pipeline.Pipeline([
            ('Gene', cust_txt_col('Gene')),
            ('count_Gene', feature_extraction.text.CountVectorizer(analyzer=u'char',
                                                                   ngram_range=(1, 8))),
            ('tsvd1', decomposition.TruncatedSVD(n_components=20, n_iter=25, random_state=12))
        ])),
        ('pi2', pipeline.Pipeline([
            ('Variation', cust_txt_col('Variation')),
            ('count_Variation', feature_extraction.text.CountVectorizer(analyzer=u'char',
                                                                        ngram_range=(1, 8))),
            ('tsvd2', decomposition.TruncatedSVD(n_components=20, n_iter=25, random_state=12))
        ])),
        ('pi3', pipeline.Pipeline([
            ('Text', cust_txt_col('Text')),
            ('hv', feature_extraction.text.HashingVectorizer(decode_error='ignore',
                                                             n_features=2 ** 16,
                                                             non_negative=True,
                                                             ngram_range=(1, 3))),
            ('tfidf_Text', feature_extraction.text.TfidfTransformer()),
            ('tsvd3', decomposition.TruncatedSVD(n_components=300, n_iter=25, random_state=12))
        ]))
    ]))
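# NOTE: the union above relies on custom transformers (cust_txt_col, cust_regression_vals)
# that are not included in this snippet. A minimal sketch of what DataFrame column
# selectors like these usually look like; the exact behavior is an assumption.
from sklearn.base import BaseEstimator, TransformerMixin

class cust_txt_col(BaseEstimator, TransformerMixin):
    """Pick a single text column out of a DataFrame and return it as strings."""
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        return x[self.key].apply(str)

class cust_regression_vals(BaseEstimator, TransformerMixin):
    """Drop the raw text columns and pass the remaining numeric features through."""
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        return x.drop(['Gene', 'Variation', 'Text'], axis=1).values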
    'NumUniqueBadges', 'NumPosts', 'MeanPostScore', 'MeanPostViews',
    'MeanPostFavorites', 'MeanPostComments', 'NumComments', 'MeanCommentScore'
]
feature_cols = numeric_cols + ['AboutMe']

model = skl_pipeline.Pipeline([
    ('feat', skl_pipeline.FeatureUnion([
        ('num', skl_pipeline.Pipeline([
            ('get', skl_preproc.FunctionTransformer(itemgetter(numeric_cols), validate=False)),
            ('poly', skl_preproc.PolynomialFeatures()),
            ('std', skl_preproc.StandardScaler()),
        ])),
        ('text', skl_pipeline.Pipeline([
            ('get', skl_preproc.FunctionTransformer(itemgetter('AboutMe'), validate=False)),
            ('tfidf', skl_featext.text.TfidfVectorizer()),
        ]))
    ])),
    ('reg', skl_linear.ElasticNet(alpha=1.0, l1_ratio=1.0))
])

param_grid = [
    dict(
        feat__num__poly__degree=[1, 2, 3],
        feat__num__poly__interaction_only=[True, False],
        feat__text__tfidf__max_df=[0.25, 0.5, 1.0],
_estimators.append(
    (cluster.KMeans(random_state=42),
     pickle.loads(pickle.dumps(pd_cluster.KMeans(random_state=42))),
     True))
_estimators.append((neighbors.KNeighborsClassifier(),
                    pd_neighbors.KNeighborsClassifier(),
                    True))
_estimators.append(
    (ensemble.GradientBoostingClassifier(random_state=42),
     pd_ensemble.GradientBoostingClassifier(random_state=42),
     True))
_estimators.append((pipeline.make_union(decomposition.PCA(n_components=2),
                                        feature_selection.SelectKBest(k=1)),
                    pd_decomposition.PCA(n_components=2) + pd_feature_selection.SelectKBest(k=1),
                    True))
_estimators.append(
    (pipeline.FeatureUnion([('pca', decomposition.PCA(n_components=2)),
                            ('kbest', feature_selection.SelectKBest(k=1))],
                           transformer_weights={'pca': 3, 'kbest': 4}),
     pd_pipeline.FeatureUnion([('pca', pd_decomposition.PCA(n_components=2)),
                               ('kbest', pd_feature_selection.SelectKBest(k=1))],
                              transformer_weights={'pca': 3, 'kbest': 4}),
     True))
_estimators.append((pipeline.make_union(decomposition.PCA(n_components=1),
                                        decomposition.PCA(n_components=2),
                                        feature_selection.SelectKBest(k=1)),
                    pd_decomposition.PCA(n_components=1) + pd_decomposition.PCA(n_components=2) +
                    pd_feature_selection.SelectKBest(k=1),
                    True))
def _email_features_pipeline2(feature_params_config_dict # {feature_params: {lang: .., weights : .., prep : {}, keywords : []}} see EMAIL_CONF for an example. ): lang = feature_params_config_dict[conf.lang_key] final_weights = feature_params_config_dict[conf.weights_key] prep_params = feature_params_config_dict[conf.prep_key] keywords = feature_params_config_dict[conf.keyword_key] # features found in the processed tokens token_features = [] token_weights = {} preprocessor = prep.Preprocessor(lang=lang, stopword=prep_params[conf.stopword_key], more_stopwords=prep_params[conf.more_stopwords_key], spellcheck=prep_params[conf.spellcheck_key], stemming=prep_params[conf.stemming_key], remove_numbers=prep_params[conf.remove_numbers_key], deasciify=prep_params[conf.deasciify_key], remove_punkt=prep_params[conf.remove_punkt_key], lowercase=prep_params[conf.lowercase_key] ) tfidfvect = TfidfVectorizer(tokenizer=prep.identity, preprocessor=None, lowercase=False, use_idf=prep_params[conf.use_idf_key], ngram_range=prep_params[conf.ngramrange_key], max_features=prep_params[conf.nmaxfeature_key]) tfidfvect_name = 'word_tfidfvect' token_features.append((tfidfvect_name, tfidfvect)) token_weights[tfidfvect_name] = 1 # features found in the whole raw text text_features = [] text_weights = {} # charngramvect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 2), lowercase=False) # keyword presence features if keywords: for keyword in keywords: keywordpipe = txbt.get_keyword_pipeline(keyword) feature_name = "has_" + keyword text_features.append((feature_name, keywordpipe)) text_weights[feature_name] = 1 tokenbasedpipe = skpipeline.Pipeline([('preprocessor', preprocessor), # ('nadropper', tbt.DropNATransformer()), ('union1', skpipeline.FeatureUnion( transformer_list=token_features , transformer_weights=token_weights )), ]) textbasedpipe = skpipeline.Pipeline([('union2', skpipeline.FeatureUnion( transformer_list=text_features, transformer_weights=text_weights ), ) ]) ''' features = skpipeline.FeatureUnion(transformer_list=[ ('tokenbasedfeatures', tokenbasedpipe), ('textbasedfeatures', textbasedpipe), ], transformer_weights=final_weights) ''' ####### # add the feature pipes to final_features if all the component weights are non-zero. ######## check_zero_list = lambda x : 1 if sum(x) > 0 else 0 # l = [0,0,0] => check_zero(l) gives 0 and l=[0,0,1] => check_zero(l) gives 1. final_features_dict = {} tkweights = list(token_weights.values()) if(check_zero_list(tkweights) != 0): final_features_dict["token_based"] = tokenbasedpipe else: final_weights["token_based"] = 0 txweights = list(text_weights.values()) if(check_zero_list(txweights) != 0): final_features_dict["text_based"] = textbasedpipe else: final_weights["text_based"] = 0 final_features = list(final_features_dict.items()) fweights = list(final_weights.values()) #print(final_weights) if((check_zero_list(fweights) == 0) or (len(final_features) == 0)): return None features = skpipeline.FeatureUnion(transformer_list=final_features, transformer_weights=final_weights) return features
def __init__(self, env, use_kernel=False, **agent_params): self.env = env self.use_kernel = use_kernel self.agent_params = { "epsilon_min": 0.01, "decay_rate": 0.01, "discount": 0.99, "iter": 200, } self.agent_params.update(agent_params) # Generating feature space of RBF kernels if self.use_kernel: observation_samples = np.array( [env.observation_space.sample() for x in range(10000)]) self.detrend = preprocessing.StandardScaler() self.detrend.fit(observation_samples) self.featurizer = pipeline.FeatureUnion([ ("rbf1", RBFSampler(gamma=5.0, n_components=100)), ("rbf2", RBFSampler(gamma=2.0, n_components=100)), ("rbf3", RBFSampler(gamma=1.0, n_components=100)), ("rbf4", RBFSampler(gamma=0.5, n_components=100)) ]) self.featurizer.fit(self.detrend.transform(observation_samples)) self.n_features = len( self.featurizer.transform(env.observation_space.sample())[0]) else: self.n_features = len(env.observation_space.sample()) print(self.n_features) # Generating linear model approximation for value function with tf.variable_scope("value_function"): self.value_features = tf.placeholder(tf.float32, [self.n_features], name="value_features") self.value_reward_target = tf.placeholder( tf.float32, name="value_reward_target") value_output_layer = tf.contrib.layers.fully_connected( inputs=tf.expand_dims(self.value_features, 0), num_outputs=1, activation_fn=None, weights_initializer=tf.zeros_initializer) self.value_estimate = tf.squeeze(value_output_layer) self.value_loss = tf.squared_difference(self.value_estimate, self.value_reward_target) self.value_optimizer = tf.train.AdamOptimizer() self.value_train_op = self.value_optimizer.minimize( self.value_loss) # Generating linear model approximation for policy function with tf.variable_scope("policy_function"): self.action = tf.placeholder(tf.int32, name="action") self.policy_features = tf.placeholder(tf.float32, [self.n_features], name="policy_features") self.policy_reward_target = tf.placeholder( tf.float32, name="policy_reward_target") policy_output_layer = tf.contrib.layers.fully_connected( inputs=tf.expand_dims(self.policy_features, 0), num_outputs=env.action_space.n, activation_fn=None, weights_initializer=tf.zeros_initializer) self.action_probabilities = tf.squeeze( tf.nn.softmax(policy_output_layer)) self.max_action_probability = tf.gather(self.action_probabilities, self.action) self.policy_loss = -tf.log( self.max_action_probability) * self.policy_reward_target self.policy_optimizer = tf.train.AdamOptimizer() self.policy_train_op = self.policy_optimizer.minimize( self.policy_loss)
train.fillna('', inplace=True)
test.fillna('', inplace=True)

counters_pipe = pipeline.FeatureUnion(
    n_jobs=-1,
    transformer_list=[
        ('chars_features', TfidfVectorizer(analyzer=u'char',
                                           ngram_range=(2, 5),
                                           tokenizer=None,
                                           max_features=config.max_features,
                                           strip_accents=None,
                                           max_df=0.9,
                                           min_df=2,
                                           lowercase=False)),
        ('words_features', TfidfVectorizer(analyzer=u'word',
                                           ngram_range=(1, 3),
                                           tokenizer=None,
                                           use_idf=True,
                                           max_features=config.max_features,
                                           strip_accents=None,
                                           max_df=0.9,
                                           min_df=2,
                                           lowercase=False)),
    ])

cv = KFold(n_splits=config.nfolds, shuffle=True, random_state=42)
splits = list(cv.split(train))