def partial_dependence(df, y):
    '''
    INPUT:  df = features
            y = target variable, binary, imbalanced classes
    OUTPUT: partial dependence plots for the six most important features
            of a gradient boosting classifier trained on oversampled data.

    Oversamples the minority class until each class makes up 50% of the data,
    engineers features, fits the model, and plots partial dependence.
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)

    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    # Use transform (not fit_transform) on the test set so it is encoded
    # with the transformations learned from the training set.
    X_test = feature_engineering.transform(X_test.copy())

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)
    most_imp = np.argsort(gbc.feature_importances_)[-6:]

    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
def make_features(self):
    features = Pipeline([
        ('count', self.build_vectorizer()),
        ('tfidf', TfidfTransformer())
    ])
    doc_vecs = features.fit_transform(self.docs)
    # Reuse the vocabulary fitted on the documents; refitting on self.rps
    # would put the two sets of vectors in different feature spaces.
    rp_vecs = features.transform(self.rps)
    return (doc_vecs, rp_vecs)
def create_store_transforms(rl): trnsfrm = Pipeline([ ('vbk', ValueByKey('wrd_list')), ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df = 2, stop_words='english')), ]) with open('transforms/just_txt.pickle', 'wb') as handle: pickle.dump(trnsfrm.fit_transform(rl), handle) trnsfrm = Pipeline([ ('vbk', ValueByKey('wrd_list')), ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df = 2, stop_words='english', tokenizer=brad_tokenizer_test)), ]) with open('transforms/just_txt_chunks.pickle', 'wb') as handle: pickle.dump(trnsfrm.fit_transform(rl), handle) trnsfrm = Pipeline([ ('union', FeatureUnion( transformer_list=[ ('cuisinetype', Pipeline([ ('vbk', ValueByKey('type_2')), ('labels', preprocessing.LabelBinarizer()), ])), # ('price_lev', Pipeline([ # ('vbk', ValueByKey('price_level')), # ('labels2', preprocessing.LabelBinarizer()), # ])), # # ('rating_lev', Pipeline([ # ('vbk', ValueByKey('rating_level')), # ('labels3', preprocessing.LabelBinarizer()), # ])), ('nlp', Pipeline([ ('vbk', ValueByKey('wrd_list')), ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df = 2, stop_words='english')) ])) ] )) ]) with open('transforms/txt_cat.pickle', 'wb') as handle: pickle.dump(trnsfrm.fit_transform(rl), handle) trnsfrm = Pipeline([ ('union', FeatureUnion( transformer_list=[ ('cuisinetype', Pipeline([ ('vbk', ValueByKey('type_2')), ('labels', preprocessing.LabelBinarizer()), ])), ('nlp', Pipeline([ ('vbk', ValueByKey('wrd_list')), ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df = 2, stop_words='english', tokenizer=brad_tokenizer_test)) ])) ] )) ]) with open('transforms/txt_cat_chunks.pickle', 'wb') as handle: pickle.dump(trnsfrm.fit_transform(rl), handle)
def transformCorpus(tdocuments, tentities):
    X1 = None
    # Treat the tasks as documents and calculate the tf-idf vector.
    # hasher = HashingVectorizer(stop_words='english', non_negative=True,
    #                            norm=None, binary=False)
    # vectorizer = Pipeline((
    #     ('hasher', hasher),
    #     ('tf_idf', TfidfTransformer())
    # ))
    # lsa = TruncatedSVD(1000)
    # X = lsa.fit_transform(vectorizer.fit_transform(tdocuments))
    # X1 = Normalizer(copy=False).fit_transform(X)
    # X1 = vectorizer.fit_transform(tdocuments)
    # print("n_samples: %d, n_features: %d" % X1.shape)

    vec = Pipeline((('dictText', DictVectorizer()),
                    ('tfIdf', TfidfTransformer())))
    X2 = vec.fit_transform(tentities)

    lsa = TruncatedSVD(1000)
    X = lsa.fit_transform(X2)
    X1 = Normalizer(copy=False).fit_transform(X)

    print('n_samples: %d, n_features: %d' % X.shape)
    print()
    return X1, X2
def test_set_pipeline_step_none(): # Test setting Pipeline steps to None X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) def make(): return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)]) pipeline = make() exp = 2 * 3 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline.set_params(m3=None) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) assert_dict_equal( pipeline.get_params(deep=True), {"steps": pipeline.steps, "m2": mult2, "m3": None, "last": mult5, "m2__mult": 2, "last__mult": 5}, ) pipeline.set_params(m2=None) exp = 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) # for other methods, ensure no AttributeErrors on None: other_methods = ["predict_proba", "predict_log_proba", "decision_function", "transform", "score"] for method in other_methods: getattr(pipeline, method)(X) pipeline.set_params(m2=mult2) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline = make() pipeline.set_params(last=None) # mult2 and mult3 are active exp = 6 assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) assert_raise_message(AttributeError, "'NoneType' object has no attribute 'predict'", getattr, pipeline, "predict") # Check None step at construction time exp = 2 * 5 pipeline = Pipeline([("m2", mult2), ("m3", None), ("last", mult5)]) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]]))
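# Editor's note: newer scikit-learn (>= 0.23) uses the string 'passthrough'
# instead of None to disable a Pipeline step; None support was deprecated.
# A minimal sketch, reusing X, y and the Mult helpers from the test above:
pipeline = Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)])
pipeline.set_params(m3="passthrough")  # the "m3" step is now skipped
assert_array_equal([[2 * 5]], pipeline.fit_transform(X, y))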
class MultinomialNB(Step):

    def __init__(self, percentile_threshold, bins):
        assert bins > 0
        bin_size = 1 / bins
        self.bins = np.arange(bin_size, 1, bin_size)
        self.lower = percentile_threshold
        self.upper = 100 - percentile_threshold
        self.distribution = None  # set by fit(); checked in transform()
        scaler = MinMaxScaler()
        discretizer = FunctionTransformer(Discretizer(self.bins))
        self.pipeline = Pipeline(
            [('scaler', scaler), ('discretizer', discretizer)])

    def fit(self, vectors):
        self.lower_clip = np.percentile(vectors, self.lower, axis=0)
        self.upper_clip = np.percentile(vectors, self.upper, axis=0)
        vectors = np.clip(vectors, self.lower_clip, self.upper_clip)
        vectors = self.pipeline.fit_transform(vectors)
        n_docs = vectors.shape[0]
        self.distribution = np.array(
            [np.bincount(v, minlength=len(self.bins)) / n_docs
             for v in vectors.T])

    def transform(self, vectors):
        assert self.distribution is not None
        vectors = np.clip(vectors, self.lower_clip, self.upper_clip)
        probabilities = []
        n_dim = vectors.shape[1]
        vectors = self.pipeline.transform(vectors)
        for bins in vectors:
            # np.prod replaces the deprecated np.product
            pr = np.prod(self.distribution[np.arange(n_dim), bins])
            probabilities.append(pr)
        return -np.log(np.maximum(1e-10, np.array(probabilities)))
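# Hypothetical usage of the MultinomialNB step above (assuming train_vectors
# and test_vectors are dense 2-D arrays of document vectors): fit learns a
# per-dimension histogram, transform scores vectors by negative log-likelihood.
step = MultinomialNB(percentile_threshold=5, bins=10)
step.fit(train_vectors)
scores = step.transform(test_vectors)  # higher score = less probable vector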
def main():
    with open('recipes.json') as f:
        recipes = json.load(f)

    # Initialize the accumulators before the loop (they were used unbound).
    ingredients = []
    items = []
    for recipe in recipes:
        ingredients.append(recipe['ingredients'])
        items.append(recipe['name'])

    pca = Pipeline([
        ('vect', DictVectorizer(sparse=False)),
        ('pca', PCA(n_components=2))
    ])
    X_pca = pca.fit_transform(ingredients)

    labels = Pipeline([
        ('vect', DictVectorizer(sparse=False)),
        ('pca', PCA(n_components=2)),
        ('agglom', AgglomerativeWrapper(
            AgglomerativeClustering(n_clusters=6, linkage='ward')))
    ])
    labels.fit(ingredients)
    clusters = labels.named_steps['agglom'].labels_
    print(clusters)

    plt.figure()
    for row, item in enumerate(items):
        plt.scatter(X_pca[row, 0], X_pca[row, 1], s=100,
                    c='rgbcyk'[clusters[row]])
        plt.annotate("{}".format(item),
                     xy=(X_pca[row, 0], X_pca[row, 1]),
                     textcoords='offset points', xytext=(10, 10), size=10,
                     arrowprops=dict(arrowstyle="->", facecolor='white'))
    plt.show()
def load_data_template(argv): # Train set data = np.load("data/train.npz") y_train = data["y_train"] X_train = data["X_train"] fu = FeatureUnion([ #('spec', FlattenTransformer(scale=1.0)), ('st1', StatsTransformer(axis=1)), #('st0', StatsTransformer(axis=0)) ]) tf = Pipeline(steps=[('specg', SpectrogramTransformer(NFFT=256, clip=500, noverlap=0.5, dtype=np.float32, log=False, flatten=False)), ('tm', TemplateMatcher(raw=True)), #('flatten', FlattenTransformer()), ('fu', fu), ]) X_train = tf.fit_transform(X_train, y_train) # Test set data = np.load("data/test.npz") y_test = None X_test = data['X_test'] X_test = tf.transform(X_test) return X_train, X_test, y_train, y_test
class BallTreeRecommender(object):
    """
    Given input terms, provide k recipe recommendations.
    """
    def __init__(self, k=3, **kwargs):
        self.k = k
        self.trans_path = "svd.pkl"
        self.tree_path = "tree.pkl"
        self.transformer = False
        self.tree = None
        self.load()

    def load(self):
        """
        Load a pickled transformer and tree from disk, if they exist.
        """
        if os.path.exists(self.trans_path):
            self.transformer = joblib.load(self.trans_path)
            self.tree = joblib.load(self.tree_path)
        else:
            self.transformer = False
            self.tree = None

    def save(self):
        """
        It takes a long time to fit, so just do it once!
        """
        joblib.dump(self.transformer, self.trans_path)
        joblib.dump(self.tree, self.tree_path)

    def fit_transform(self, documents):
        # Transformer will be False if the pipeline hasn't been fit yet;
        # trigger fit_transform and save the transformer and lexicon.
        if self.transformer is False:
            self.transformer = Pipeline([
                ('norm', TextNormalizer(minimum=50, maximum=200)),
                ('transform', Pipeline([
                    ('tfidf', TfidfVectorizer()),
                    ('svd', TruncatedSVD(n_components=200))
                ]))
            ])
            self.lexicon = self.transformer.fit_transform(documents)
            self.tree = BallTree(self.lexicon)
            self.save()

    def query(self, terms):
        """
        Given an input list of ingredient terms, return the k closest
        matching recipes.

        :param terms: list of strings
        :return: list of document indices of documents
        """
        # Use transform, not fit_transform: the query must be projected into
        # the space the tree was built in, not into a freshly fitted one.
        vect_doc = self.transformer.named_steps['transform'].transform(
            wordpunct_tokenize(' '.join(terms))
        )
        dists, inds = self.tree.query(vect_doc, k=self.k)
        return inds[0]
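# Hypothetical usage of BallTreeRecommender (assuming `documents` is an
# iterable of recipe texts compatible with TextNormalizer):
recommender = BallTreeRecommender(k=3)
recommender.fit_transform(documents)  # fits, or reloads the pickled artifacts
print(recommender.query(["tomato", "basil", "garlic"]))  # 3 nearest recipes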
def sgdfeature(self, data):
    newdata = pd.DataFrame()
    preproc = Pipeline([('fh', FeatureHasher(n_features=2**20,
                                             input_type='string'))])  # for SGDClassifier
    newdata['app_id_specs'] = data['app_id'].values + data['app_domain'].values + data['app_category'].values
    newdata['app_dom_specs'] = data['app_domain'].values + data['app_category'].values
    newdata['site_id_specs'] = data['site_id'].values + data['site_domain'].values + data['site_category'].values
    newdata['site_dom_specs'] = data['site_domain'].values + data['site_category'].values
    # data['device'] = data['device_model'].values + (data['device_type'].values.astype(str)) + (data['device_conn_type'].values.astype(str))
    newdata['type'] = data['device_type'].values + data['device_conn_type'].values
    newdata['domain'] = data['app_domain'].values + data['site_domain'].values
    newdata['category'] = data['app_category'].values + data['site_category'].values
    newdata['pos_cat'] = data['banner_pos'].values.astype(str) + data['app_category'].values + data['site_category'].values
    newdata['pos_dom'] = data['banner_pos'].values.astype(str) + data['app_domain'].values + data['site_domain'].values
    # data['pos_id'] = data['banner_pos'].values.astype(str) + data['app_id'].values + data['site_id'].values
    newdata['hour'] = data['hour'].map(lambda x: datetime.strptime(x.astype(str), "%y%m%d%H"))
    # weekday() is a method and must be called; the bare attribute would
    # store a bound method instead of the day of the week.
    newdata['dayoftheweek'] = newdata['hour'].map(lambda x: x.weekday())
    newdata['day'] = newdata['hour'].map(lambda x: x.day)
    # keep the extracted hour-of-day as a feature
    newdata['hour'] = newdata['hour'].map(lambda x: x.hour)
    newdata = newdata.astype(str)
    del data
    X_dict = np.asarray(newdata)
    self.X_train = preproc.fit_transform(X_dict)
    return self.X_train
def main(opt): with codecs.open(opt.vocab, encoding='utf-8') as f: vocab = load_vocab(f) id2word = build_id2word(vocab) _, docs_train, _ = load_all_data(opt.train_jsons) lda = Pipeline([ ('bow', BagOfWords(vocab=vocab)), ('lda', Lda(id2word=id2word, num_topics=opt.num_topics))]) lda_vec_train = lda.fit_transform(docs_train) sent_set = set() tmp_path = opt.lda_vec_path + '.tmp' with codecs.open(tmp_path, encoding='utf-8', mode='w') as f: dump_lda_vec(docs_train, lda_vec_train, sent_set, f) if opt.test_jsons: _, docs_test, _ = load_all_data(opt.test_jsons) lda_vec_test = lda.transform(docs_test) with codecs.open(tmp_path, encoding='utf-8', mode='a') as f: dump_lda_vec(docs_test, lda_vec_test, sent_set, f) with codecs.open(tmp_path, encoding='utf-8') as fin, \ codecs.open(opt.lda_vec_path, encoding='utf-8', mode='w') as fout: fout.write('{} {}\n'.format(len(sent_set), opt.num_topics)) for line in fin: fout.write(line) os.remove(tmp_path)
def XY9():
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dummy_cols = ['FinelineNumber']
    keep_cols = ['Weekday', 'Returns']
    mul_col = None

    dfta = ft.DataFrameToArray()
    add_returns = ft.NGAddReturns()
    print("starting grouping")
    grouper = ft.GDummyKeepAndMultiplierTransform(dummy_cols, mul_col, keep_cols)
    print("done grouping")

    transform_steps = [("imputer", ft.NGNAImputer()),
                       ("add_returns", add_returns),
                       ('grouper', grouper)]
    ### DON'T CHANGE AFTER
    transform_steps.append(("dfta", dfta))
    transform_pipe = Pipeline(steps=transform_steps)
    print("done with pipeline, now calculating")

    return {
        "X": transform_pipe.fit_transform(X),
        "y": y,
        "X_test": transform_pipe.transform(X_test),
        "X_test_index": X_test_index
    }
class Vectorizer(): def __init__(self, hash=False, min_df=0.015, max_df=0.9): """ `min_df` is set to filter out extremely rare words, since we don't want those to dominate the distance metric. `max_df` is set to filter out extremely common words, since they don't convey much information. """ if hash: args = [ ('vectorizer', HashingVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=Tokenizer())), ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)), ('feature_reducer', TruncatedSVD(n_components=400)), ('normalizer', Normalizer(copy=False)) ] else: args = [ ('vectorizer', CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=Tokenizer(), min_df=min_df, max_df=max_df)), ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)), ('normalizer', Normalizer(copy=False)) ] self.pipeline = Pipeline(args) def vectorize(self, docs, train=False): if train: return self.pipeline.fit_transform(docs) else: return self.pipeline.transform(docs) @property def vocabulary(self): return self.pipeline.named_steps['vectorizer'].get_feature_names()
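# Hypothetical usage of the Vectorizer wrapper above (assuming `train_docs`
# and `test_docs` are lists of raw text strings): fit on the training corpus,
# then reuse the fitted vocabulary for unseen documents.
vec = Vectorizer(hash=False, min_df=0.015, max_df=0.9)
train_vecs = vec.vectorize(train_docs, train=True)  # fits the pipeline
test_vecs = vec.vectorize(test_docs)                # transform only
print(len(vec.vocabulary))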
def test_sklearn_pipeline(self): df = pd.DataFrame.from_dict([{"a":"something","b":1},{"a":"something2"}]) t = bt.Exclude_features_transform(excluded=["b"]) transformers = [("exclude_transform",t)] p = Pipeline(transformers) df2 = p.fit_transform(df) self.assertEquals(len(df2.columns),1)
def makePlots(Z):
    imp = Imputer()
    scal = StandardScaler()
    vart = VarianceThreshold()
    pipe = Pipeline([("imputer", imp),
                     ("var threshold", vart),
                     ("scaler", scal)])
    # Requires Z
    X1 = pipe.fit_transform(Z)
    pca = PCA(n_components=2)
    x2d = pca.fit_transform(X1.T)
    labels = {}
    centers = []
    for n in [2, 3, 5, 10]:
        agglo = FeatureAgglomeration(n_clusters=n).fit(X1)
        labels['ag%d' % n] = agglo.labels_
        plot(x2d, agglo.labels_, "Feature Agglomeration")
        km = KMeans(n_clusters=n).fit(X1.T)
        labels['km%d' % n] = km.labels_
        plot(x2d, km.labels_, "K-Means")
        centers = km.cluster_centers_
    dbs = DBSCAN(eps=100, min_samples=10).fit(X1.T)
    labels['DBSCAN'] = dbs.labels_
    plot(x2d, dbs.labels_, "DBSCAN")
    return labels, centers
def train(self,sample): tTfidf = ptfidf.Tfidf_transform(input_feature="review",output_feature="tfidf",target_feature="sentiment",min_df=10,max_df=0.7,select_features=False,topn_features=50000,stop_words="english",ngram_range=[1,2]) tFilter2 = bt.Include_features_transform(included=["tfidf","sentiment"]) svmTransform = bt.Svmlight_transform(output_feature="svmfeatures",excluded=["sentiment"],zero_based=False) classifier_xg = xg.XGBoostClassifier(target="sentiment",svmlight_feature="svmfeatures",silent=1,max_depth=5,n_estimators=200,objective='binary:logistic',scale_pos_weight=0.2) cv = cf.Seldon_KFold(classifier_xg,metric='auc',save_folds_folder="./folds") transformers = [("tTfidf",tTfidf),("tFilter2",tFilter2),("svmTransform",svmTransform),("cv",cv)] p = Pipeline(transformers) pw = sutl.Pipeline_wrapper() df = pw.create_dataframe_from_files([self.data_folder],df_format="csv") if sample < 1.0: logger.info("sampling dataset to size %s ",sample) df = df.sample(frac=sample,random_state=1) logger.info("Data frame shape %d , %d",df.shape[0],df.shape[1]) df2 = p.fit_transform(df) pw.save_pipeline(p,self.model_folder) logger.info("cross validation scores %s",cv.get_scores()) return p
def test_l2density_basic():
    dim = 3
    bags = [np.random.randn(np.random.randint(30, 100), dim)
            for _ in range(50)]  # range: xrange is Python 2 only
    pipe = Pipeline([
        ('scale', BagMinMaxScaler([0, 1])),
        ('density', L2DensityTransformer(15)),
    ])
    l2ed = pipe.fit_transform(bags)

    assert np.all(np.isfinite(l2ed))
    # ||x - y||^2 = <x, x> - 2 <x, y> + <y, y>
    K = l2ed.dot(l2ed.T)
    row_norms_sq = np.diagonal(K)
    l2_dist_sq = row_norms_sq[:, None] - 2 * K + row_norms_sq[None, :]
    assert np.min(row_norms_sq) > 0
    assert np.min(l2_dist_sq) >= 0

    assert_raises(ValueError, lambda: L2DensityTransformer(10, basis='foo'))

    t = L2DensityTransformer(10)
    assert_raises(AttributeError, lambda: t.transform(bags))
    t.fit(dim)
    t.transform(BagMinMaxScaler([0, 1]).fit_transform(bags))
    assert_raises(ValueError, lambda: t.transform([b[:, :2] for b in bags]))
    assert_raises(ValueError, lambda: t.transform(bags))
    t.basis = 'haha snuck my way in'
    assert_raises(ValueError, lambda: t.transform(bags))
def XY1(): X, y, X_test, X_test_index = load_xy() ####### VARIABLES dummy_cols = ['Weekday', 'DepartmentDescription'] keep_cols = ['ScanCount', 'Returns'] funcs = [np.sum, np.count_nonzero] dfta = ft.DataFrameToArray() add_returns = ft.NGAddReturns() gdd = ft.GDummyAndKeepTransform(dummy_cols, keep_cols, funcs) # Doesn't work! transform_steps = [("imputer", ft.NGNAImputer())] + \ list(ft.wrapStep(("add_returns", add_returns))) + \ list(ft.wrapStep(('grouper', gdd))) + \ [("dfta", dfta)] transform_pipe = Pipeline(steps=transform_steps) kh.start_pipeline() kh.record_metric("validation", "start", "NA", "transform_pipeline", str(transform_pipe), "NA") return { "X": transform_pipe.fit_transform(X), "y": y, "X_test": transform_pipe.transform(X_test), "X_test_index": X_test_index }
def XY7():
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dummy_cols = ['DepartmentDescription']
    keep_cols = ['Weekday']
    mul_col = 'ScanCount'

    dfta = ft.DataFrameToArray()
    grouper = ft.GDummyKeepAndMultiplierTransform(dummy_cols, mul_col, keep_cols)

    transform_steps = [("imputer", ft.NGNAImputer())] + \
                      list(ft.wrapStep(('grouper', grouper)))
    ### DON'T CHANGE AFTER
    transform_steps.append(("dfta", dfta))
    transform_pipe = Pipeline(steps=transform_steps)

    kh.start_pipeline()
    kh.record_metric("validation", "start", "NA", "transform_pipeline",
                     str(transform_pipe), "NA")

    return {
        "X": transform_pipe.fit_transform(X),
        "y": y,
        "X_test": transform_pipe.transform(X_test),
        "X_test_index": X_test_index
    }
def XY8():
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dummy_cols = ['DepartmentDescription']
    keep_cols = ['Weekday', 'Returns']
    mul_col = 'ScanCount'

    dfta = ft.DataFrameToArray()
    add_returns = ft.NGAddReturns()
    grouper = ft.GDummyKeepAndMultiplierTransform(dummy_cols, mul_col, keep_cols)

    transform_steps = [("imputer", ft.NGNAImputer()),
                       ("add_returns", add_returns),
                       ('grouper', grouper)]
    ### DON'T CHANGE AFTER
    transform_steps.append(("dfta", dfta))
    transform_pipe = Pipeline(steps=transform_steps)

    return {
        "X": transform_pipe.fit_transform(X),
        "y": y,
        "X_test": transform_pipe.transform(X_test),
        "X_test_index": X_test_index
    }
class MultinomialDEP(Step): def __init__(self, percentile_threshold, bins): self.lower = percentile_threshold self.upper = 100 - percentile_threshold scaler = MinMaxScaler() discretizer = FunctionTransformer(Discretizer(bins)) self.pipeline = Pipeline( [('scaler', scaler), ('discretizer', discretizer)]) def fit(self, vectors): self.lower_clip = np.percentile(vectors, self.lower, axis=0) self.upper_clip = np.percentile(vectors, self.upper, axis=0) vectors = np.clip(vectors, self.lower_clip, self.upper_clip) self.transformed_vectors = self.pipeline.fit_transform(vectors) def transform(self, vectors): assert self.transformed_vectors is not None vectors = np.clip(vectors, self.lower_clip, self.upper_clip) probabilities = [] vectors = self.pipeline.transform(vectors) docs = self.transformed_vectors.shape[0] for x in vectors: count = np.count_nonzero( (self.transformed_vectors == x).all(axis=1)) pr = count / docs probabilities.append(pr) return -np.log(np.maximum(1e-10, np.array(probabilities)))
class SklearnTopicModels(object):

    def __init__(self, n_topics=50, estimator='LDA'):
        """
        n_topics is the desired number of topics.
        To use Latent Semantic Analysis, set estimator to 'LSA';
        to use Non-Negative Matrix Factorization, set estimator to 'NMF';
        otherwise, defaults to Latent Dirichlet Allocation ('LDA').
        """
        self.n_topics = n_topics

        if estimator == 'LSA':
            self.estimator = TruncatedSVD(n_components=self.n_topics)
        elif estimator == 'NMF':
            self.estimator = NMF(n_components=self.n_topics)
        else:
            # n_components replaces the deprecated n_topics parameter
            self.estimator = LatentDirichletAllocation(n_components=self.n_topics)

        self.model = Pipeline([
            ('norm', TextNormalizer()),
            ('tfidf', CountVectorizer(tokenizer=identity,
                                      preprocessor=None, lowercase=False)),
            ('model', self.estimator)
        ])

    def fit_transform(self, documents):
        self.model.fit_transform(documents)
        return self.model

    def get_topics(self, n=25):
        """
        n is the number of top terms to show for each topic.
        """
        vectorizer = self.model.named_steps['tfidf']
        model = self.model.steps[-1][1]
        names = vectorizer.get_feature_names()
        topics = dict()

        for idx, topic in enumerate(model.components_):
            # take the n highest-weighted terms for this topic
            features = topic.argsort()[:-(n + 1):-1]
            tokens = [names[i] for i in features]
            topics[idx] = tokens

        return topics
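# Hypothetical usage of SklearnTopicModels (assuming `documents` is an
# iterable of pre-tokenized documents compatible with TextNormalizer):
topic_model = SklearnTopicModels(n_topics=10, estimator='NMF')
topic_model.fit_transform(documents)
for topic_id, tokens in topic_model.get_topics(n=10).items():
    print(topic_id, tokens)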
def _do_feature_selection(must_be_in_thesaurus, k, handler='Base', vector_source='default', max_feature_len=1, delete_kid=False): """ Loads a data set, vectorizes it by extracting n-grams (default n=1) using a feature handler (default BaseFeatureHandler) and then performs feature selection based on either a vector source or on chi2 scores. Returns the encode/decode matrices and the stripped vocabulary of the Vectorizer after feature selection. The vector source by default has a unigrams source that covers all unigrams in the training set (feature vectors are made up), and does not know about n-grams. Optionally, another vector source can be passed in. """ handler_pattern = 'eval.pipeline.feature_handlers.{}FeatureHandler' raw_data, data_ids = load_text_data_into_memory( training_path='tests/resources/test-tr', test_path='tests/resources/test-ev', ) tokenizer = XmlTokenizer() x_train, y_train, x_test, y_test = tokenize_data(raw_data, tokenizer, data_ids) if vector_source == 'default': unigrams_vect = Vectors.from_tsv('tests/resources/thesauri/exp0-0a.txt.events-unfiltered.strings') vector_source = unigrams_vect if delete_kid: # the set of vectors we load from disk covers all unigrams in the training set, which makes it boring # let's remove one entry del unigrams_vect['kid/N'] unigrams_vect.matrix = unigrams_vect.matrix[:, :-1] if max_feature_len == 1: # extract only unigram features feat_extr_opts = {'extract_unigram_features': ['J', 'N', 'V'], 'extract_phrase_features': []} standard_ngram_features = 0 else: feat_extr_opts = {'extract_unigram_features': ['J', 'N', 'V'], 'extract_phrase_features': ['AN', 'NN', 'VO', 'SVO']} standard_ngram_features = max_feature_len feature_extractor = FeatureExtractor(standard_ngram_features=standard_ngram_features).update(**feat_extr_opts) pipeline_list = [ ('vect', ThesaurusVectorizer(min_df=1, use_tfidf=False, decode_token_handler=handler_pattern.format(handler))), ('fs', VectorBackedSelectKBest(must_be_in_thesaurus=must_be_in_thesaurus, k=k)), ('dumper', FeatureVectorsCsvDumper('fs-test')) ] p = Pipeline(pipeline_list) fit_params = {'vect__vector_source': vector_source, 'vect__train_time_extractor':feature_extractor, 'vect__decode_time_extractor':feature_extractor, 'fs__vector_source': vector_source} tr_matrix, tr_voc = p.fit_transform(x_train, y_train, **fit_params) if 'fs' in p.named_steps: p.named_steps['vect'].vocabulary_ = p.named_steps['fs'].vocabulary_ ev_matrix, ev_voc = p.transform(x_test) return tr_matrix.A, strip(tr_voc), ev_matrix.A, strip(ev_voc)
def test_countvectorizer_custom_vocabulary_pipeline(): what_we_like = ["pizza", "beer"] pipe = Pipeline([ ('count', CountVectorizer(vocabulary=what_we_like)), ('tfidf', TfidfTransformer())]) X = pipe.fit_transform(ALL_FOOD_DOCS) assert_equal(set(pipe.named_steps['count'].vocabulary), set(what_we_like)) assert_equal(X.shape[1], len(what_we_like))
def test_bace_2():
    assignments, ref_macrostate_assignments = _metastable_system()
    pipeline = Pipeline([
        ('msm', MarkovStateModel()),
        ('bace', BACE(n_macrostates=2))
    ])
    macro_assignments = pipeline.fit_transform(assignments)[0]
    # check the pipeline output, not the raw input assignments
    assert np.min(macro_assignments) >= 0
def test_sklearn_pipeline(self): df = pd.DataFrame.from_dict([{"a":"something"},{}]) t = bt.Binary_transform(input_feature="a",output_feature="abin") transformers = [("binary_transform",t)] p = Pipeline(transformers) df2 = p.fit_transform(df) self.assertEquals(df["abin"][0],1) self.assertEquals(df["abin"][1],0)
def pipeline(housing):
    housing_num = housing.drop("ocean_proximity", axis=1)
    num_pipeline = Pipeline([
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        # StandardScaler must be instantiated, not passed as a class
        ('std_scaler', StandardScaler())
    ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)
    return housing_num_tr
def main(): pipeline = Pipeline([ ('features', FeatureUnion([ ('name', Pipeline([ ('extract_columns', ColumnExtractor("Name")), ('binarize_names', ExistenceBinarizer()), ('one_hot', OneHotEncoder(sparse=False)) ])), ('weekday', Pipeline([ ('extract_columns', ColumnExtractor('DateTime')), ('weekday_extractor', WeekdayExtractor()), # ('one_hot', OneHotEncoder(sparse=False)) ])) ])) ]) train_df = pd.read_csv('data/train_updated_colors.csv', sep=',') train_features, train_labels = preprocess_data(train_df) pipeline.fit_transform(train_features, train_labels)
def train(neg=None, pos=None):
    the_file = os.path.dirname(os.path.abspath(__file__))
    if not neg:
        neg = os.path.join(the_file, '..', 'origin', 'neg.txt')
    if not pos:
        pos = os.path.join(the_file, '..', 'origin', 'pos.txt')

    tagger = crfseg.create_tagger()
    # plain parameter: the parenthesized (x) tuple-param form is Python 2 only
    tok_cn = lambda x: crfseg.cut_zh(x, tagger)

    tfidf = TfidfVectorizer(tokenizer=tok_cn, sublinear_tf=True, max_df=0.5)
    pipe = Pipeline([
        ('tfidf', tfidf),
        # ('svd', TruncatedSVD(32)),
        # ('normal', Normalizer(copy=False))
    ])
    '''
    hasher = HashingVectorizer(n_features=2**16, tokenizer=tok_cn,
                               non_negative=True, norm=None, binary=False)
    '''
    # clf = SGDClassifier(loss='log', penalty='l2', alpha=0.00001, n_iter=50, fit_intercept=True)
    # clf = MultinomialNB()
    clf = BernoulliNB()

    neg_file = codecs.open(neg, 'r', 'utf-8')
    pos_file = codecs.open(pos, 'r', 'utf-8')

    x_train = []
    y_train = []
    for line in neg_file:
        x_train.append(line)
        y_train.append(0)
    for line in pos_file:
        x_train.append(line)
        y_train.append(1)

    print('begin transform')
    # x_train = hasher.transform(x_train)
    x_train = pipe.fit_transform(x_train)
    print('begin fit')
    clf.fit(x_train, y_train)

    print('begin save')
    tfidf_file = os.path.join(the_file, 'data', 'tfidf.pkl')
    clf_file = os.path.join(the_file, 'data', 'sgdc_clf.pkl')
    # _ = joblib.dump(tfidf, tfidf_file, compress=9)
    _ = joblib.dump(clf, clf_file, compress=9)

    print('begin test')
    x_test = [u'这个东西真心很赞']
    # x_test = hasher.transform(x_test)
    x_test = pipe.transform(x_test)
    print(clf.predict(x_test))
def test_sklearn_pipeline_str_numbers(self):
    df = pd.DataFrame.from_dict([{"a": "2"}, {"a": "0"}])
    t = bt.BinaryTransform(input_feature="a", output_feature="abin")
    transformers = [("BinaryTransform", t)]
    p = Pipeline(transformers)
    df2 = p.fit_transform(df)
    print(df2)
    # assert on the pipeline output df2, not the input df
    self.assertEqual(df2["abin"][0], 1)
    self.assertEqual(df2["abin"][1], 0)
X_numeric = X.select_dtypes(exclude=["object"]) skewness = X_numeric.apply(lambda x: skew(x)) skewness_features = skewness[abs(skewness) >= self.skew].index X[skewness_features] = np.log1p(X[skewness_features]) X = pd.get_dummies(X) return X # build pipeline pipe = Pipeline([ ('labenc', labelenc()), ('skew_dummies', skew_dummies(skew=1)), ]) full2 = full.copy() data_pipe = pipe.fit_transform(full2) data_pipe.shape # + __use robustscaler since maybe there are other outliers.__ scaler = RobustScaler() n_train = train.shape[0] X = data_pipe[:n_train] test_X = data_pipe[n_train:] y = train.SalePrice X_scaled = scaler.fit(X).transform(X) y_log = np.log1p(train.SalePrice) test_X_scaled = scaler.transform(test_X)
train_size=0.8, random_state=0)

num_imputer = SimpleImputer(strategy='most_frequent')
cat_one_hot = OneHotEncoder(handle_unknown='ignore')
cols_preprocessor = ColumnTransformer(
    transformers=[('num', num_imputer, num_cols),
                  ('cat', cat_one_hot, cat_cols)])

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.1)

model_pipeline = Pipeline(steps=[('col_formatter', cols_preprocessor)])

x_train = x_train.fillna(method='ffill')
x_train_final = model_pipeline.fit_transform(x_train)
x_val_final = model_pipeline.transform(x_val)

# Fit the target imputer on the training labels, then apply it to validation;
# fitting on validation and transforming train would leak information.
y_train_final = (np.array(y_train)).reshape(-1, 1)
y_training = num_imputer.fit_transform(y_train_final)
y_val_final = (np.array(y_val)).reshape(-1, 1)
y_validation = num_imputer.transform(y_val_final)

my_model.fit(x_train_final, y_training,
             eval_set=[(x_val_final, y_validation)], verbose=0)

# Transform (not refit) the test set with the pipeline fitted on training data.
x_test_final = model_pipeline.transform(x_test)
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
# model_selection replaces the removed sklearn.cross_validation module
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import copy
# y1.return0 = y1.return0.apply(float)
from sklearn.ensemble import *
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier

gbc = GaussianNB()
ohc = Pipeline([('b', OneHotEncoder()), ('a', Densifier())])

X = g.iloc[:, 1:]
Y = g.iloc[:, 0]
X = ohc.fit_transform(X)

t = 0.9
pr = list((max(x) > t) for x in gbc.predict_proba(x_test))
# gbc.score(x_test, y_test, pr)
[
    gbc.score(x_test, y_test, pr),
    sum(list((max(x) > t) for x in gbc.predict_proba(x_test)))
]

h = g.groupby(by=['letter1', 'letter2', 'letter3', 'letter4', 'letter5']).sum()
h2 = g.groupby(by=['letter1', 'letter2', 'letter3', 'letter4', 'letter5']).mean()
h[h.return0 > 2]
# STEP 3
oof = np.zeros(len(train))
preds = np.zeros(len(test))

for i in tqdm_notebook(range(512)):
    train2 = train[train['wheezy-copper-turtle-magic'] == i]
    test2 = test[test['wheezy-copper-turtle-magic'] == i]
    idx1 = train2.index
    idx2 = test2.index
    train2.reset_index(drop=True, inplace=True)

    data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
    pipe = Pipeline([('vt', VarianceThreshold(threshold=2)),
                     ('scaler', StandardScaler())])
    data2 = pipe.fit_transform(data[cols])
    train3 = data2[:train2.shape[0]]
    test3 = data2[train2.shape[0]:]

    # shuffle=True is required for random_state to take effect
    # (recent scikit-learn versions raise an error otherwise)
    skf = StratifiedKFold(n_splits=11, shuffle=True, random_state=42)
    for train_index, test_index in skf.split(train2, train2['target']):
        clf = QuadraticDiscriminantAnalysis(0.5)
        clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
        oof[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1]
        preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

auc = roc_auc_score(train['target'], oof)
print(f'AUC: {auc:.5}')

# STEP 4
fig = plt.figure(figsize=(15, 8)) plt.suptitle("Manifold Learning with %i cases, %i variables, %i neighbors" % (len(y), np.count_nonzero(y), n_neighbors), fontsize=14) estimators = list() # estimators.append(('variance_thresholder', VarianceThreshold())) # estimators.append(('scaler', StandardScaler())) estimators.append(('ae', AETransform(dim=32))) tsne = manifold.TSNE(n_components=2, random_state=0, perplexity=100, early_exaggeration=4) estimators.append(('tsne', tsne)) pipeline = Pipeline(estimators) X_ = pipeline.fit_transform(X) X_0 = X_[y == 0] X_1 = X_[y == 1] ax = fig.add_subplot(241) ax.scatter(X_0[:, 0], X_0[:, 1], color='g', alpha=0.5) ax.scatter(X_1[:, 0], X_1[:, 1], color='r', alpha=0.5) plt.title("t-SNE") ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) plt.axis('tight') estimators = list() estimators.append(('variance_thresholder', VarianceThreshold())) estimators.append(('scaler', StandardScaler()))
np.random.seed(42) m = 100 X = 6 * np.random.rand(m, 1) - 3 y = 2 + X + 0.5 * X**2 + np.random.randn(m, 1) X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10) poly_scaler = Pipeline([ ("poly_features", PolynomialFeatures(degree=90, include_bias=False)), ("std_scaler", StandardScaler()), ]) X_train_poly_scaled = poly_scaler.fit_transform(X_train) X_val_poly_scaled = poly_scaler.transform(X_val) sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty, penalty=None, eta0=0.0005, warm_start=True, learning_rate="constant", random_state=42) n_epochs = 500 train_errors, val_errors = [], [] for epoch in range(n_epochs): sgd_reg.fit(X_train_poly_scaled, y_train) y_train_predict = sgd_reg.predict(X_train_poly_scaled)
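# The training loop above is cut off; a minimal sketch of how such an
# early-stopping loop is usually completed (assumption: mean squared error is
# tracked per epoch, and the epoch with the lowest validation error is kept):
from sklearn.metrics import mean_squared_error

best_epoch, best_val_error = None, float("inf")
for epoch in range(n_epochs):
    sgd_reg.fit(X_train_poly_scaled, y_train)  # warm_start=True resumes training
    y_train_predict = sgd_reg.predict(X_train_poly_scaled)
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    train_errors.append(mean_squared_error(y_train, y_train_predict))
    val_errors.append(mean_squared_error(y_val, y_val_predict))
    if val_errors[-1] < best_val_error:
        best_epoch, best_val_error = epoch, val_errors[-1]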
class Model(object): ''' Multi label classifier model ''' def __init__(self): #self.trainset = pd.read_csv("data/raw/train_set.csv") #self.testset = pd.read_csv("data/raw/test_set.csv") self.cv = CountVectorizer(ngram_range=(0, 2)) self.model = LogisticRegression() self.build_pipe() def build_pipe(self): sent_features = Pipeline([('select', Selector(key='Utterance')), ('extract', SentenceFeatures()), ('vectorize', DictVectorizer())]) hapax = Pipeline([('select', Selector(key='Utterance')), ('extract', HapaxLegomera()), ('vectorize', DictVectorizer())]) CV = Pipeline([('select', Selector(key='Utterance')), ('cv', CountVectorizer(ngram_range=(0, 2)))]) self.pipe = Pipeline([ ('union', FeatureUnion( transformer_list=[('features', sent_features), ('hapax', hapax), ('Ngrams', CV)])) ]) self.label_pipe = Pipeline([('lt', LabelTransformer()), ('MLJ', MultiLabelJoiner()), ('MLB', MyLabelEncoder())]) def train(self, trainset): X = self.pipe.fit_transform(trainset) y = self.label_pipe.fit_transform(trainset) self.model.fit(X, y) def test(self, testset): X = self.pipe.transform(testset) y = self.label_pipe.transform(testset) y_pred = self.model.predict(X) #self.print_scores(y, y_pred) return y, y_pred def distribution(self, which): if which == 'test': df = self.testset elif which == 'train': df = self.trainset labels = df.filter([ 'Stance category', 'second stance category', 'third', 'fourth', 'fifth' ]) labels = labels.stack() print(labels.value_counts(True)) def unique_labels(self): pass
# z.plot.hist(bins=50, ax=ax_133) # plt.xlabel('z') # plt.title('{:.0f} samples with z>3'.format(n_outliers)) return model, cv_score, grid_results data_train = reduce_mem_usage(data_train) # scatter_matrix(data_train, figsize=(20, 16)) num_pipeline = Pipeline([ # ('Imputer', Imputer("median")), ('StandardScaler', StandardScaler()), ]) data_train_std = num_pipeline.fit_transform(data_train) data_train_std = pd.DataFrame(data_train_std, index=data_train.index, columns=data_train.columns) # savfig_send() # data_train.iloc[:, :10].hist(bins=50, figsize=[20, 15]) linear_regression = LinearRegression() X = data_train_std.iloc[:, :-1] y = data_train_std['target'] # linear_model = linear_regression.fit(X, y) # outliers = find_outliers(Ridge(), X, y) model, cv_score, grid_results = train_model(LinearRegression(), {}, X=X,
# TSNE tsne = TSNE(n_components=2, random_state=seed) tsne_data = tsne.fit_transform(data_std) # Data for ploting data_sets = [pca_data, tsne_data] names = ["pca_data", "tsne_data"] colors = ["red", "green", "blue"] # Pipeline pipe = Pipeline([ ("std", StandardScaler()), ("pca", PCA(n_components=0.95, random_state=seed)), ("tsne", TSNE(n_components=2, random_state=seed)), ]) piped = pipe.fit_transform(test_data_02) def diabetes_classification() -> None: elements = [2, 4, 6] sum_of_elements = [] for element in elements: pca = PCA(n_components=element, random_state=seed) pca.fit(x_standard) ratio = pca.explained_variance_ratio_ print(ratio) ratio_sum = sum(ratio) sum_of_elements.append(ratio_sum)
from sklearn.base import BaseEstimator, TransformerMixin from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler, RobustScaler from sklearn.compose import ColumnTransformer from sklearn.preprocessing import FunctionTransformer columns = [ 'median_income', 'households', 'population', 'total_bedrooms', 'total_rooms' ] pipeline1 = Pipeline([ ('log', FunctionTransformer(np.log1p, validate=False)), ]) pipeline1.fit_transform(df) num_attribs = list(df.drop('ocean_proximity', axis=1)) cat_attribs = ["ocean_proximity"] full_pipeline = ColumnTransformer([ ("num", pipeline1, columns), ]) df1 = full_pipeline.fit_transform(df) df2 = pd.DataFrame(df1, columns=columns) df3 = df.copy() df3[columns] = df2 df.dtypes
def enet_path(est, x_train, x_test, y_train, y_test, num_alphas, eps, l1_ratio, target_score, n_tail, max_complexity): models = [] trafo = Pipeline(steps=est.steps[:-1]) final = est._final_estimator fit_intercept = final.fit_intercept normalize = final.normalize with warnings.catch_warnings(): warnings.filterwarnings("ignore") features = trafo.fit_transform(x_train) if isinstance(final, RationalFunctionMixin): features = est._final_estimator._transform(features, y_train) X, y, X_offset, y_offset, X_scale, precompute, Xy = _pre_fit( features, y_train, None, True, normalize=normalize, fit_intercept=fit_intercept, copy=True) n_samples = X.shape[0] alpha_max = np.abs(np.nanmax(X.T @ y) / (n_samples * l1_ratio)) est.set_params(regression__precompute=precompute, regression__fit_intercept=False, regression__normalize=False, regression__warm_start=True) est_ = FFXElasticNet() est_.set_params(**final.get_params()) for alpha in _get_alphas(alpha_max, num_alphas, eps): est_.set_params(l1_ratio=l1_ratio, alpha=alpha) with warnings.catch_warnings(): warnings.filterwarnings("ignore") est_.fit(X, y, check_input=False) model = deepcopy(est) model.set_params(regression__fit_intercept=fit_intercept, regression__normalize=normalize, regression__l1_ratio=l1_ratio, regression__alpha=alpha) for attr in ["coef_", "intercept_", "n_iter_"]: setattr(model._final_estimator, attr, getattr(est_, attr)) model._final_estimator._set_intercept(X_offset, y_offset, X_scale) if isinstance(model._final_estimator, RationalFunctionMixin): model._final_estimator._arrange_coef() model.train_score_ = model.score(x_train, y_train) model.test_score_ = model.score(x_test, y_test) model.complexity_ = np.count_nonzero(model._final_estimator.coef_) models.append(model) if model.train_score_ <= target_score: # print("Reached target score") break elif model.complexity_ >= max_complexity: # print("Reached target complexity") break elif _path_is_saturated(models, n_tail=n_tail): # print("Stagnation in train score") break return models
# Process the record and classify it
if konspekt:
    konspekt = False
    output_file.write(line)
    continue

# Filter the metadata and the title
text = filter_text(text)
title = filter_text(title)

# Build the data for classification; when classifying from the full text
# as well, the record's full text must also be assigned to data['title']
data = {}
data['meta_data'] = [text]
data['title'] = [title + ' ' + text]

# Process the data
X = pipeline.fit_transform(data)

# Classification
predicted = clf.predict(X)[0]
predicted_proba = clf.predict_proba(X)[0]

# Sort and write out the classification results
all_pred = []
for index, item in enumerate(predicted_proba):
    if item != 0:
        all_pred.append([cat_names[index], item])

for index, item in enumerate(
        sorted(all_pred, key=lambda x: x[1], reverse=True)):
    if index > 6:
        break
    output_file.write('072 c $a' + item[0] + '$b' + str(item[1]) + '\n')

output_file.write(line)
# Use on numeric columns in the data class NumberSelector(BaseEstimator, TransformerMixin): def __init__(self, key): self.key = key def fit(self, X, y=None): return self def transform(self, X): return X[[self.key]] desc = Pipeline([('selector', TextSelector(key='desc')), ('tfidf', TfidfVectorizer(stop_words='english'))]) desc.fit_transform(train_features) value = Pipeline([ ('selector', NumberSelector(key='value')), # ('standard', StandardScaler()) ]) value.fit_transform(train_features) feats = FeatureUnion([('desc', desc), ('value', value)]) feature_processing = Pipeline([('feats', feats)]) feature_processing.fit_transform(train_features) pipeline = Pipeline([ ('features', feats),
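# The final pipeline above is cut off; a minimal sketch of a typical
# completion, with a hypothetical classifier as the last step:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier()),  # assumption: any estimator fits here
])
pipeline.fit(train_features, train_labels)  # assumes train_labels exists alongside train_features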
print("finding best classifier") features = FEATURES_ARRAY2 scores = {} # for feature in [OUTPUT_FOLDER + 'lbp2' + FORMAT]: # features: for feature in features: print(""" ---------------------------------- getting feature: {} ---------------------------------- """.format(feature)) X = np.load(feature, allow_pickle=True) X = transformer_pipe.fit_transform(X) "Resampling" # rus = RandomUnderSampler(random_state=RANDOM_STATE) # X_res, y_res = rus.fit_resample(X, y) # RandomOverSampler(random_state=RANDOM_STATE) # ADASYN(random_state=RANDOM_STATE) smote = SMOTE(random_state=RANDOM_STATE) X_res, y_res = smote.fit_resample(X, y) X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, stratify=y_res) "One model for all" model = SVC(random_state=RANDOM_STATE) model.fit(X_train, y_train)
def main(): """extract features""" train = pd.read_csv(settings.train) y = train[settings.y] coldrop = ['Unnamed: 0','Gene','Variation','Variation_type','Gene_type'] train = train.drop([settings.y], axis=1) train = train.drop(coldrop, axis=1) test = pd.read_csv(settings.test) test = test.drop(coldrop, axis=1) pid = test[settings.id_colname] feat_p = Pipeline([ ('union', FeatureUnion( n_jobs = -1, transformer_list = [ ('standard', cust_regression_vals()), ('p1', Pipeline([ ('Text', cust_txt_col(settings.text_colname)), ('tfidf_Text', TfidfVectorizer(ngram_range=(1, 2))), ('tsvd1', TruncatedSVD(n_components=50, n_iter=25, random_state=12)), ('p2', Pipeline([(settings.gene_colname, cust_txt_col(settings.gene_colname)), ('count_Gene', CountVectorizer(analyzer=u'char', ngram_range=(1, 8))), ('tsvd2', TruncatedSVD(n_components=20, n_iter=25, random_state=12))])), ('p3', Pipeline([('Variation', cust_txt_col(settings.var_name)), ('count_Variation', CountVectorizer(analyzer=u'char', ngram_range=(1, 8))), ('tsvd3', TruncatedSVD(n_components=20, n_iter=25, random_state=12))])), ])) ]) )]) train = feat_p.fit_transform(train); print(train.shape) test = feat_p.transform(test); print(test.shape) """ init and run model""" y = y - 1 #fix for zero bound array denom = 0 fold = 20 for i in range(fold): params = { 'eta': 0.03333, 'max_depth': 4, 'objective': 'multi:softprob', 'eval_metric': 'mlogloss', 'num_class': 9, 'seed': i, 'silent': True } x1, x2, y1, y2 = train_test_split(train, y, test_size=0.18, random_state = i) watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')] model = xgb.train( params, xgb.DMatrix(x1, y1), 1000, watchlist, verbose_eval = 50, early_stopping_rounds=100 ) score = metrics.log_loss( y2, model.predict(xgb.DMatrix(x2), ntree_limit = model.best_ntree_limit), labels = list(range(9)) ) print(score) if denom != 0: pred = model.predict(xgb.DMatrix(test), ntree_limit = model.best_ntree_limit+80) preds += pred else: pred = model.predict(xgb.DMatrix(test), ntree_limit = model.best_ntree_limit+80) preds = pred.copy() denom += 1 submission = pd.DataFrame(pred, columns = ['class'+str(c+1) for c in range(9)]) submission[settings.id_colname] = pid submission.to_csv('submission_xgb_fold_' + str(i) + '.csv', index=False) preds /= denom submission = pd.DataFrame(preds, columns=['class' + str(c + 1) for c in range(9)]) submission[settings.id_colname] = pid submission.to_csv('submission_xgb.csv', index=False)
for shift_vector in [[0, -1], [0, 1]]:
    for image in X:
        image_copy = image.copy().reshape(28, 28)
        # scipy.ndimage.shift returns the shifted array; it does not modify
        # its input in place, so the result must be captured.
        image_copy = shift(image_copy, shift_vector, cval=0)
        X_augmented.append(image_copy.reshape([-1]))
return np.array(X_augmented)

# Data prep pipeline. Standardization included
pipeline = Pipeline([
    ("augmenter", DataAugmentation()),
    ("scaler", StandardScaler()),
])

# The attributes train set is run through the pipeline
X_train_prepared = pipeline.fit_transform(X_train.astype(np.float64))
# y_train_prepared accounts for the shifted images in X_train_prepared
y_train_prepared = np.array([label for label in y_train] * 3)

# Cross-validation is used to train and test various Random Forests using Grid Search
param_grid = [
    {
        "n_estimators": [3, 10, 30],
        "max_features": [4, 6, 8, 12],
    },
]

rf_clf = RandomForestClassifier(random_state=42)
search = GridSearchCV(rf_clf, param_grid, cv=5,
                      scoring='neg_mean_squared_error', return_train_score=True,
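# A hedged sketch of the enclosing transformer (assumption: the shift loop at
# the top of this snippet is the tail of its transform method, and the
# originals are kept so that the `* 3` label duplication above lines up):
from scipy.ndimage import shift
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class DataAugmentation(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_augmented = [image for image in X]  # keep the originals
        for shift_vector in [[0, -1], [0, 1]]:
            for image in X:
                shifted = shift(image.reshape(28, 28), shift_vector, cval=0)
                X_augmented.append(shifted.reshape(-1))
        return np.array(X_augmented)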
def main(): df_train = pd.read_csv('data/train_data.csv') df_valid = pd.read_csv('data/valid_data.csv') df_test = pd.read_csv('data/test_data.csv') feature_cols = list(df_train.columns[:-1]) target_col = df_train.columns[-1] X_train = df_train[feature_cols].values y_train = df_train[target_col].values X_valid = df_valid[feature_cols].values y_valid = df_valid[target_col].values X_test = df_test[feature_cols].values tsne_data_2d_5p = np.load('data/tsne_2d_5p.npz') tsne_data_2d_10p = np.load('data/tsne_2d_10p.npz') tsne_data_2d_15p = np.load('data/tsne_2d_15p.npz') tsne_data_2d_20p = np.load('data/tsne_2d_20p.npz') tsne_data_2d_30p = np.load('data/tsne_2d_30p.npz') tsne_data_2d_40p = np.load('data/tsne_2d_40p.npz') tsne_data_2d_50p = np.load('data/tsne_2d_50p.npz') tsne_data_3d_30p = np.load('data/tsne_3d_30p.npz') # concat features X_train_concat = { 'X': X_train, 'tsne_2d_5p': tsne_data_2d_5p['train'], 'tsne_2d_10p': tsne_data_2d_10p['train'], 'tsne_2d_15p': tsne_data_2d_15p['train'], 'tsne_2d_20p': tsne_data_2d_20p['train'], 'tsne_2d_30p': tsne_data_2d_30p['train'], 'tsne_2d_40p': tsne_data_2d_40p['train'], 'tsne_2d_50p': tsne_data_2d_50p['train'], 'tsne_3d_30p': tsne_data_3d_30p['train'], } X_valid_concat = { 'X': X_valid, 'tsne_2d_5p': tsne_data_2d_5p['valid'], 'tsne_2d_10p': tsne_data_2d_10p['valid'], 'tsne_2d_15p': tsne_data_2d_15p['valid'], 'tsne_2d_20p': tsne_data_2d_20p['valid'], 'tsne_2d_30p': tsne_data_2d_30p['valid'], 'tsne_2d_40p': tsne_data_2d_40p['valid'], 'tsne_2d_50p': tsne_data_2d_50p['valid'], 'tsne_3d_30p': tsne_data_3d_30p['valid'], } X_test_concat = { 'X': X_test, 'tsne_2d_5p': tsne_data_2d_5p['test'], 'tsne_2d_10p': tsne_data_2d_10p['test'], 'tsne_2d_15p': tsne_data_2d_15p['test'], 'tsne_2d_20p': tsne_data_2d_20p['test'], 'tsne_2d_30p': tsne_data_2d_30p['test'], 'tsne_2d_40p': tsne_data_2d_40p['test'], 'tsne_2d_50p': tsne_data_2d_50p['test'], 'tsne_3d_30p': tsne_data_3d_30p['test'], } pipeline = Pipeline(steps=[ ('features', FeatureUnion(transformer_list=[ ('X', ItemSelector('X')), ('tsne_2d_5p', ItemSelector('tsne_2d_5p')), ('tsne_2d_10p', ItemSelector('tsne_2d_10p')), ('tsne_2d_15p', ItemSelector('tsne_2d_15p')), ('tsne_2d_20p', ItemSelector('tsne_2d_20p')), ('tsne_2d_30p', ItemSelector('tsne_2d_30p')), ('tsne_2d_40p', ItemSelector('tsne_2d_40p')), ('tsne_2d_50p', ItemSelector('tsne_2d_50p')), ('tsne_3d_30p', ItemSelector('tsne_3d_30p')), ])), ('poly', PolynomialFeatures(degree=2)), ('scaler', MinMaxScaler()), ]) X_train_concat = pipeline.fit_transform(X_train_concat, y_train) X_valid_concat = pipeline.transform(X_valid_concat) X_test_concat = pipeline.transform(X_test_concat) X_valid_both, y_valid_both = divide_samples_train(X_valid_concat, y_valid) classifier = make_pipeline( make_union(ItemSelector(key='L'), ItemSelector(key='R')), LogisticRegression(penalty='l2', C=1e-2, n_jobs=-1, warm_start=True)) for i in trange(10): X_train_both, y_train_both = divide_samples_train( *shuffle(X_train_concat, y_train)) print('Fitting...') start_time = time.time() classifier.fit(X_train_both, y_train_both) print('Fit: {}s'.format(time.time() - start_time)) p_valid = classifier.predict_proba(X_valid_both) loss = log_loss(y_valid_both, p_valid[:, 1]) auc = roc_auc_score(y_valid_both, p_valid[:, 1]) print('Pairwise Loss: {}, AUC: {}'.format(loss, auc)) p_valids = [] for i in trange(100): X_valid_both = divide_samples_test(X_valid_concat) p_valid = classifier.predict_proba(X_valid_both) p_valids.append(p_valid) p_valid = np.array(p_valids) p_valid = np.mean(p_valid, axis=0) loss = 
log_loss(y_valid, p_valid[:, 1]) auc = roc_auc_score(y_valid, p_valid[:, 1]) print('Validation Loss: {}, AUC: {}'.format(loss, auc)) p_tests = [] for i in trange(100): X_test_both = divide_samples_test(X_test_concat) p_test = classifier.predict_proba(X_test_both) p_tests.append(p_test) p_test = np.array(p_tests) p_test = np.mean(p_test, axis=0) df_pred = pd.DataFrame({ 't_id': df_test['t_id'], 'probability': p_test[:, 1] }) csv_path = 'predictions/predictions_{}_{}.csv'.format( int(time.time()), loss) df_pred.to_csv(csv_path, columns=('t_id', 'probability'), index=None) print('Saved: {}'.format(csv_path))
]))]) comb_vectorizer = Pipeline([( 'features', FeatureUnion([ ('tfidf', tfidf_vectorizer), #find tfidf value ('tp', tp_vectorizer), #find term presence ('sv', sv_vectorizer) ]))]) tfidf_train = tfidf_vectorizer.fit_transform(x_train).todense() tfidf_test = tfidf_vectorizer.transform(x_test).todense() tp_train = tp_vectorizer.fit_transform(x_train) tp_test = tp_vectorizer.transform(x_test) sv_train = sv_vectorizer.fit_transform(x_train) sv_test = sv_vectorizer.transform(x_test) c_train = comb_vectorizer.fit_transform(x_train) c_test = comb_vectorizer.transform(x_test) path_result = "result/" file_res = open(path_result + "output_desc.txt", "w") file_res.write("data training: " + str(len(x_train)) + "\n") file_res.write("data testing: " + str(len(x_test)) + "\n") file_res.write("\n") file_res.write("data training pos: " + str(sum(i == "positive" for i in y_train)) + "\n") file_res.write("data training neg: " + str(sum(i == "negative" for i in y_train)) + "\n")
def dat_prep(nbd_train, nbd_test, k, vect_type, Type_train, Type_test,
             Chr_train, Chr_test, Label_train, Label_test,
             scaled_feats_train, scaled_feats_test, dummy_train, dummy_test):
    """
    Derives the Count Vectorizer or TFIDF scores for a given neighborhood sequence.

    Arguments:
    nbd_train = Column containing the neighborhood sequence from the training data
    nbd_test = Column containing the neighborhood sequence from the test data
    k = size of kmer
    vect_type = 'CV' for Count Vectorizer, or else TFIDF Vectorizer
    Type_train = Numerically encoded substitution type ("A>T" encoded as 1, "G>C" encoded as 2, and so on) from the training data
    Type_test = Numerically encoded substitution type from the test data
    Chr_train = Chromosome number from the training data
    Chr_test = Chromosome number from the test data
    Label_train = Binary label (training data), where 1=Passenger and 2=Driver
    Label_test = Binary label (test data), where 1=Passenger and 2=Driver
    scaled_feats_train = Scaled genomic features (conservation, amino acid etc.) for the training data
    scaled_feats_test = Scaled genomic features (conservation, amino acid etc.) for the test data
    dummy_train = One-hot-encoding-based feature matrix for the training data
    dummy_test = One-hot-encoding-based feature matrix for the test data

    Returns:
    df_comb_train = The complete dataframe (using training data) of TFIDF or CountVect scores plus other features such as chromosome number and substitution type
    df_comb_test = The complete dataframe (using test data) of TFIDF or CountVect scores plus other features such as chromosome number and substitution type
    count_vector_train = Just the TFIDF or CountVect features (training data), also known as the document-term matrix
    count_vector_test = Just the TFIDF or CountVect features (test data), also known as the document-term matrix
    cols = feature names
    vect = The vocabulary derived from the training data
    sc = The scaling variable derived from the training data
    """
    if vect_type == "CV":
        vect = Pipeline([('cv1', CountVectorizer(lowercase=False))])
    else:
        vect = Pipeline([('cv1', CountVectorizer(lowercase=False)),
                         ('tfidf_transformer', TfidfTransformer(smooth_idf=True, use_idf=True))])

    # fit_transform learns the vocabulary dictionary and returns the
    # document-term matrix for the training data
    count_vector_train = vect.fit_transform(preprocess(nbd_train, k))
    # transform reuses the pretrained vocabulary to derive the
    # document-term matrix for the test data
    count_vector_test = vect.transform(preprocess(nbd_test, k))

    df_train = pd.DataFrame(count_vector_train.todense(), columns=vect['cv1'].get_feature_names())
    df_test = pd.DataFrame(count_vector_test.todense(), columns=vect['cv1'].get_feature_names())

    sc = None  # remains None unless TFIDF scaling is applied below
    if vect_type == "tf":
        sc = MinMaxScaler()
        df_train = pd.DataFrame(sc.fit_transform(df_train), columns=df_train.columns)
        df_test = pd.DataFrame(sc.transform(df_test), columns=df_test.columns)

    df_train['Type'] = Type_train; df_test['Type'] = Type_test
    df_train['Label'] = Label_train; df_test['Label'] = Label_test
    df_train['Chr'] = Chr_train; df_test['Chr'] = Chr_test

    df_comb_train = pd.concat([df_train, scaled_feats_train, dummy_train], axis=1)
    df_comb_test = pd.concat([df_test, scaled_feats_test, dummy_test], axis=1)
    df_comb_train = df_comb_train.loc[:, ~df_comb_train.columns.duplicated()]
    df_comb_test = df_comb_test.loc[:, ~df_comb_test.columns.duplicated()]
    cols = vect['cv1'].get_feature_names()

    return df_comb_train, df_comb_test, count_vector_train, count_vector_test, cols, vect, sc
x_prepared = pd.DataFrame(stdsc.fit_transform(x), index=x.index, columns=x.columns)

# Repeat for the test set: transform only, with the scaler fitted on train,
# so the test data is scaled consistently and without leakage
x_prepared_test = pd.DataFrame(stdsc.transform(x_test),
                               index=x_test.index, columns=x_test.columns)

# Repeat for the "all" set
x_prepared_all = pd.DataFrame(stdsc.transform(x_all),
                              index=x_all.index, columns=x_all.columns)

#%% Generate "x_poly" sets -----------------------

# Pipeline uses Poly + Standard Scaling
pipeline = Pipeline([("poly_features", PolynomialFeatures(degree=2, include_bias=True)),
                     ('std_scaler', StandardScaler())])
pipeline.fit(x_prepared)

# Retrieve the column names, for book-keeping
poly_cols = pipeline.named_steps["poly_features"].get_feature_names(x_prepared.columns)

# Transform the data and re-frame the results as pandas DataFrame
x_poly = pd.DataFrame(pipeline.transform(x_prepared),
                      index=x_prepared.index, columns=poly_cols)

# Repeat for the test set (transform with the already-fitted pipeline)
x_poly_test = pd.DataFrame(pipeline.transform(x_prepared_test),
                           index=x_prepared_test.index, columns=poly_cols)

# Repeat for the "all" set
x_poly_all = pd.DataFrame(pipeline.transform(x_prepared_all),
                          index=x_prepared_all.index, columns=poly_cols)

#%% Take a look at the transformations before proceeding -----------------
# Need to recombine x and y for easy correlation plotting
df_temp = x_poly.copy()
df_temp["salary"] = y

plt.figure(figsize=(10, 3.5))

# Absolute Magnitude--
        return self

    def transform(self, x):
        output = x.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # items() replaces the removed DataFrame.iteritems()
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, x, y=None):
        return self.fit(x, y).transform(x)

encoding_pipeline = Pipeline([
    ('encoding', MultiColumnLabelEncoder(
        columns=['buying', 'maintain', 'lug_boot', 'safety', 'class']))
])
dataread = encoding_pipeline.fit_transform(dataread)

out = dataread.iloc[:, 6:7]   # .iloc replaces the removed .ix indexer
# out = np.array(out)
inp = dataread.iloc[:, 0:6]
# inp = np.array(inp)
inp.columns.tolist()
inp['person'] = inp['person'].replace(['5more'], 5)
inp['doors'] = inp['doors'].replace(['more'], 5)
print(inp)

# model_selection replaces the removed sklearn.cross_validation module
from sklearn.model_selection import train_test_split
train_inp, test_inp, train_out, test_out = train_test_split(inp, out,
                                                            train_size=0.66,
                                                            test_size=0.33)
print(np.shape(train_inp))
print(dataread.head())

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=4,
                               min_samples_split=2, min_samples_leaf=1,
                               min_weight_fraction_leaf=0.0, max_features='auto',
                               max_leaf_nodes=None, min_impurity_decrease=0.0,
                               min_impurity_split=None, bootstrap=True,
                               oob_score=False, n_jobs=1, random_state=None,
                               verbose=0, warm_start=False, class_weight=None)
model.fit(train_inp, train_out)
# Plot pairwise plot sns.set_context('notebook') sns.set_palette('dark') sns.set_style('white') sns.pairplot(data) plt.show() # Create a pipeline to pre-process the data and compare with previous result # The custom NumPy log transformer log_transformer = FunctionTransformer(np.log1p) # The pipeline estimators = [('log1p', log_transformer), ('minmaxscale', MinMaxScaler())] pipeline = Pipeline(estimators) # Convert the original data data_pipe = pipeline.fit_transform(data_orig) print("check two arrays (pipeline, no pipeline) are equal = ",np.allclose(data_pipe, data)) # Perform PCA with n_components ranging from 1 to 5. Find and plot the explained variance and feature importances pca_list = list() feature_weight_list = list() # Fit a range of PCA models for n in range(1, 6): # Create and fit the model PCAmod = PCA(n_components=n) PCAmod.fit(data) # Store the model and variance pca_list.append(pd.Series({'n': n, 'model': PCAmod, 'var': PCAmod.explained_variance_ratio_.sum()})) # Calculate and store feature importances abs_feature_values = np.abs(PCAmod.components_).sum(axis=0)
index_cols = features_df.columns[[0, 1, 2, -1]]
features_df.drop(index_cols, axis=1, inplace=True)
feature_names = features_df.columns
X = features_df

# Train-test split; outputs are wrapped in DataFrames below to keep column names
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

# Preprocess the data: impute NaN values with the column mean, then standard-scale
preprocess_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('std_scaler', StandardScaler()),
])
X_train = pd.DataFrame(preprocess_pipeline.fit_transform(X_train), columns=feature_names)
X_test = pd.DataFrame(preprocess_pipeline.transform(X_test), columns=feature_names)

clf = xgb.XGBClassifier(colsample_bytree=0.9, n_estimators=200, learning_rate=0.04,
                        max_depth=8, subsample=0.9, gamma=0.05,
                        objective="multi:softprob", tree_method="gpu_hist")
clf.fit(X_train, y_train)
xgb.plot_importance(clf, max_num_features=20)
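# A quick hold-out evaluation of the fitted classifier -- a minimal sketch,
# assuming the X_test / y_test split from above:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
print("test accuracy: %.3f" % accuracy_score(y_test, y_pred))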
def use_pipeline_with_fs(self):
    #####################
    # Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
    #####################
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words=stopwords, min_df=3, max_df=0.90)),
        ('selector', SelectPercentile()),
        ('clf', MultinomialNB()),
    ])

    # Build a grid search to find the best parameters
    # Fit the pipeline on the training set using grid search for the parameters
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__use_idf': (True, False),
        'selector__score_func': (chi2, f_classif),
        'selector__percentile': (85, 95, 100),
        'clf__alpha': (0.4, 0.5),
    }

    #################
    # Exhaustive search over specified parameter values for an estimator,
    # using cv to generate the evaluation splits. It implements the usual
    # estimator API: when "fitting" it on a dataset, all possible combinations
    # of parameter values are evaluated and the best combination is retained.
    #################
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv, n_jobs=-1)
    clf_gs = grid_search.fit(docs_train, y_train)

    ###############
    # print the cross-validated score and the best parameter set found
    # (grid_scores_ was removed; use best_params_ / best_score_)
    ###############
    best_parameters = clf_gs.best_params_
    score = clf_gs.best_score_
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
    print("Score for gridsearch is %0.2f" % score)

    # y_predicted = clf_gs.predict(docs_test)

    ###############
    # run the classifier again with the best parameters
    # in order to get 'clf' for the get_important_feature function
    ###############
    ngram_range = best_parameters['vect__ngram_range']
    use_idf = best_parameters['vect__use_idf']
    score_func = best_parameters['selector__score_func']
    percentile = best_parameters['selector__percentile']
    alpha = best_parameters['clf__alpha']

    # vectorization
    count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90,
                                 ngram_range=ngram_range)
    X_CV = count_vect.fit_transform(docs_train)
    # print number of unique words (n_features)
    print("Shape of train data is " + str(X_CV.shape))

    # tf-idf transformation
    tfidf_transformer = TfidfTransformer(use_idf=use_idf)
    X_tfidf = tfidf_transformer.fit_transform(X_CV)

    #################
    # feature selection
    #################
    selector = SelectPercentile(score_func=score_func, percentile=percentile)

    combined_features = Pipeline([
        ('vect', count_vect),
        ('tfidf', tfidf_transformer),
        ('feat_select', selector),
    ])

    X_features = combined_features.fit_transform(docs_train, y_train)
    X_test_features = combined_features.transform(docs_test)

    print("Shape of train data after feature selection is " + str(X_features.shape))
    print("Shape of test data after feature selection is " + str(X_test_features.shape))

    # run classifier on selected features
    clf = MultinomialNB(alpha=alpha).fit(X_features, y_train)

    # get the boolean mask of selected features and write it to file
    feature_boolean = selector.get_support(indices=False)
    with open(path_to_store_feature_selection_boolean_file, 'w') as f:
        for fb in feature_boolean:
            f.write(str(fb) + '\n')

    ##################
    # get cross validation score
    ##################
    scores = cross_val_score(clf, X_features, y_train, cv=10, scoring='f1_weighted')
    print("Cross validation score: " + str(scores))

    # Average performance of the classifier over 10-fold CV, with standard deviation
    print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    #################
    # run classifier on test data
    #################
    y_predicted = clf.predict(X_test_features)

    # print the mean accuracy on the given test data and labels
    print("Classifier score on test data is: %0.2f " % clf.score(X_test_features, y_test))

    # Print and plot the confusion matrix
    print(metrics.classification_report(y_test, y_predicted))
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)
    # import matplotlib.pyplot as plt
    # plt.matshow(cm)
    # plt.show()

    return clf, count_vect
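# To see which tokens survived feature selection (not just the boolean mask),
# the selected indices can be mapped back through the vectorizer's vocabulary.
# A minimal sketch, assuming access to the fitted count_vect and selector
# from use_pipeline_with_fs above:
selected_idx = selector.get_support(indices=True)
vocab = count_vect.get_feature_names_out()  # get_feature_names() on older sklearn
selected_tokens = [vocab[i] for i in selected_idx]
print(selected_tokens[:20])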
def train_and_evaluate(self):
    data_dir = self.datasetprop.dir
    filename = self.datasetprop.filename
    self.dataset = self.loadDatasets(data_dir, filename, self.localTrain)

    # train_set, test_set = self.create_train_test_df(self.dataset)
    train_set = self.dataset  # using the full dataset

    # Prepare the IDS dataset
    dataclean_pipeline = Pipeline([
        ('data_cleaner', CustomDataCleaner()),
    ])
    prepdata_pipeline = Pipeline([
        ('attribs_remover', AttributesRemover()),
        ('standard_scaler', StandardScaler()),
    ])
    ids_label_pipeline = Pipeline([
        ('label_encoder', MyLabelEncoder()),
        ('benign_encoder', BenignLabelEncoder()),
    ])

    train_x = train_set.copy()
    train_x = dataclean_pipeline.fit_transform(train_x)
    train_y = train_x["Label"].copy()
    train_x_prepared = prepdata_pipeline.fit_transform(train_x)
    train_y_prepared = ids_label_pipeline.fit_transform(train_y)

    # test_x = test_set.copy()
    # test_x = dataclean_pipeline.transform(test_x)
    # test_y = test_x["Label"].copy()
    # test_x_prepared = prepdata_pipeline.transform(test_x)
    # test_y_prepared = ids_label_pipeline.transform(test_y)

    # PredefinedSplit
    # my_test_fold = []
    # for _ in range(len(cleanset_prepared)):
    #     my_test_fold.append(-1)
    # for _ in range(len(anomalyset_prepared)):
    #     my_test_fold.append(0)

    # param_grid = [{'gamma': [0.05, 0.1, 0.2, 0.001, 0.02, 0.03],
    #                'kernel': ['rbf'],
    #                'nu': [0.01, 0.05, 0.1, 0.03, 0.3, 0.07]}]
    # estimator = OneClassSVM()
    # grid_search = GridSearchCV(estimator,
    #                            param_grid,
    #                            cv=PredefinedSplit(test_fold=my_test_fold),
    #                            scoring='f1_micro')
    # grid_search.fit(np.concatenate((cleanset_prepared, anomalyset_prepared), axis=0),
    #                 np.concatenate((cleanset_label_prepared, anomalyset_label_prepared), axis=0))
    # Print the cv scores:
    # cvres = grid_search.cv_results_
    # for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    #     print(mean_score, params)
    # return grid_search

    estimator = OneClassSVM(gamma=0.2, kernel='rbf', nu=0.07)
    # estimator = OneClassSVM(gamma=0.001, kernel='rbf', nu=0.001)  # hyperparams for test purposes
    estimator.fit(train_x_prepared)
    return estimator
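# OneClassSVM labels inliers as +1 and outliers as -1; a minimal sketch of
# inspecting the fitted estimator on the prepared training matrix (assuming
# the estimator and train_x_prepared from train_and_evaluate above):
pred = estimator.predict(train_x_prepared)  # +1 = inlier, -1 = anomaly
n_flagged = (pred == -1).sum()
print("flagged %d of %d samples as anomalous" % (n_flagged, len(pred)))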
# The first 100 instances are for training, the remainder for testing

# EARLY STOPPING
from sklearn.base import clone                    # to copy a model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler  # to scale the data
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

poly_scaler = Pipeline([
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),  # first, expand to polynomial features
    ('std_scaler', StandardScaler())                                      # then, scale the values
])
X_train_poly_scaled = poly_scaler.fit_transform(X_train)  # fit on the training set and transform it
X_val_poly_scaled = poly_scaler.transform(X_val)

# SOFTMAX REGRESSION
from sklearn.linear_model import LogisticRegression

softmax_reg = LogisticRegression(
    multi_class='multinomial', solver='lbfgs', C=10,
    max_iter=1, warm_start=True
)  # only 1 iteration per fit() call; warm_start lets fit() resume from the previous solution

minimum_val_error = float('inf')
best_epoch = None
best_model = None
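# The variables above set up a manual early-stopping loop: refit one epoch at
# a time and keep the model with the lowest validation error. A minimal sketch,
# assuming y_train / y_val exist alongside the scaled sets above (validation
# MSE on predicted labels, mirroring the mean_squared_error import):
for epoch in range(1000):
    softmax_reg.fit(X_train_poly_scaled, y_train)  # continues where it left off (warm_start)
    y_val_predict = softmax_reg.predict(X_val_poly_scaled)
    val_error = mean_squared_error(y_val, y_val_predict)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        best_model = clone(softmax_reg)  # keep a copy of the best model's settings so far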
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
df_extra_attribs = attr_adder.transform(housing.values)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
df_num_tr = num_pipeline.fit_transform(df_num)

from sklearn.compose import ColumnTransformer

num_attribs = list(df_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
df_prepared = full_pipeline.fit_transform(df)

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(df_prepared, df_labels)
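# Sanity-check the fitted regressor on a few rows -- a minimal sketch, assuming
# df and df_labels from above; new data must pass through the same full_pipeline:
some_data = df.iloc[:5]
some_labels = df_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:     ", list(some_labels))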
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn.preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # missing_values defaults to np.nan
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder())])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])
clf = Pipeline(steps=[('preprocessor', preprocessor)])
X = clf.fit_transform(X)

# splitting the dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_test)

# linear regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression(fit_intercept=False)
regressor.fit(X_train, y_train)
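# A quick evaluation of the fitted regressor on the held-out split -- a minimal
# sketch, assuming X_test / y_test from above:
from sklearn.metrics import r2_score

y_pred = regressor.predict(X_test)
print("R^2 on test set: %.3f" % r2_score(y_test, y_pred))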
# In[75]:
# Transformation pipelines -- housing_num is the numerical training data
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

# In[84]:
# Full pipeline to transform both numerical and categorical attributes
from sklearn.pipeline import FeatureUnion

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
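# DataFrameSelector is not a scikit-learn built-in; a common minimal
# implementation (a sketch: it simply pulls named columns out of a DataFrame
# so downstream steps can work on NumPy arrays):
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self  # nothing to learn

    def transform(self, X):
        return X[self.attribute_names].values  # select columns, return ndarray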
# delete everything with qc != 0 except values larger than alarming and max valid
missing = np.where((qc_cpc == 2) | (qc_cpc == 962) | (qc_cpc == 65474))[0]
df['cpc_con'].values[missing] = np.nan
df['diff_con'].values[missing] = np.nan
'''

#%%
'''PRE-PROCESS DATA'''
selected_features = df.columns
scaled_features = ['cpc_con', 'diff_con']
pipe = Pipeline([('RowDropper', DataSampleDropper()),
                 ('FeatureSelector', DataFrameSelector(selected_features)),
                 ('Scale', DataScaler(scaled_features))])
processed_data = pipe.fit_transform(df)

# TODO
print(processed_data.isnull().values.any())

# Plot the training data
fig = plt.figure(figsize=(15, 5))
myFmt = DateFormatter("%H:%M:%S")
ax = fig.gca()
ax.xaxis.set_major_formatter(myFmt)
ax.plot(processed_data['cpc_con'][4000:4400], '.',
        linewidth=1.0, color='grey', label='ori_data')
ax.plot(processed_data['diff_con'][4000:4400], '.',