def test_set_feature_union_steps():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ["x2"]
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ["x3"]
    mult5 = Mult(5)
    mult5.get_feature_names = lambda: ["x5"]

    ft = FeatureUnion([("m2", mult2), ("m3", mult3)])
    assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]])))
    assert_equal(["m2__x2", "m3__x3"], ft.get_feature_names())

    # Directly setting attr
    ft.transformer_list = [("m5", mult5)]
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(["m5__x5"], ft.get_feature_names())

    # Using set_params
    ft.set_params(transformer_list=[("mock", mult3)])
    assert_array_equal([[3]], ft.transform(np.asarray([[1]])))
    assert_equal(["mock__x3"], ft.get_feature_names())

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(["mock__x5"], ft.get_feature_names())
def test_set_feature_union_step_none():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=None)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=None)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))
# Parametrization inferred from the `drop` argument: the snippet takes `drop`
# as a pytest parameter, run for both the 'drop' sentinel and legacy None.
@pytest.mark.parametrize('drop', ['drop', None])
def test_set_feature_union_step_drop(drop):
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert ['m2__x2', 'm3__x3'] == ft.get_feature_names()

    ft.set_params(m2=drop)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert ['m3__x3'] == ft.get_feature_names()

    ft.set_params(m3=drop)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert [] == ft.get_feature_names()

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))

    # Check 'drop' step at construction time
    ft = FeatureUnion([('m2', drop), ('m3', mult3)])
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert ['m3__x3'] == ft.get_feature_names()
def test_feature_union_get_feature_names_deprecated():
    """Check that get_feature_names is deprecated"""
    msg = "get_feature_names is deprecated in 1.0"
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ["x2"]
    ft = FeatureUnion([("m2", mult2)])
    with pytest.warns(FutureWarning, match=msg):
        ft.get_feature_names()
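# -----------------------------------------------------------------------------
# Context for the deprecation tested above: a minimal sketch of the replacement
# API, assuming scikit-learn >= 1.1, where FeatureUnion exposes
# get_feature_names_out() and get_feature_names() is deprecated/removed.
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MinMaxScaler, StandardScaler

union = FeatureUnion([("std", StandardScaler()), ("minmax", MinMaxScaler())])
union.fit([[0.0, 1.0], [2.0, 3.0]])
# Output names are prefixed with the step name, e.g. "std__x0", "minmax__x1".
print(union.get_feature_names_out())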
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert "chars__" in feat or "words__" in feat
    assert len(feature_names) == 35

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])

    msg = re.escape("Transformer tr1 (type Transf) does not provide get_feature_names")
    with pytest.raises(AttributeError, match=msg):
        ft.get_feature_names()
def train_model(trainset):
    word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2),
                                  binary=False, max_features=2000,
                                  min_df=1, decode_error="ignore")
    # print word_vector
    print "works fine"
    char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char",
                                  binary=False, min_df=1,
                                  max_features=2000, decode_error="ignore")
    vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])

    corpus = []
    classes = []
    for item in trainset:
        corpus.append(item['text'])
        classes.append(item['label'])

    print "Training instances : ", 0.8 * len(classes)
    print "Testing instances : ", 0.2 * len(classes)

    matrix = vectorizer.fit_transform(corpus)
    print "feature count : ", len(vectorizer.get_feature_names())
    print "training model"

    X = matrix.toarray()
    y = numpy.asarray(classes)

    model = LinearSVC()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, test_size=.2, random_state=0)
    y_pred = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test)
    # y_prob = OneVsRestClassifier(model).fit(X_train, y_train).decision_function(X_test)
    # print y_prob
    # con_matrix = []
    # for row in range(len(y_prob)):
    #     temp = [y_pred[row]]
    #     for prob in y_prob[row]:
    #         temp.append(prob)
    #     con_matrix.append(temp)
    # for row in con_matrix:
    #     output.write(str(row)+"\n")
    # print y_pred
    # print y_test

    res1 = [i for i, j in enumerate(y_pred) if j == 'anonEdited']
    res2 = [i for i, j in enumerate(y_test) if j == 'anonEdited']
    reset = []
    for r in res1:
        if y_test[r] != "anonEdited":
            reset.append(y_test[r])
    for r in res2:
        if y_pred[r] != "anonEdited":
            reset.append(y_pred[r])
    output = open(sys.argv[2], "w")
    for suspect in reset:
        output.write(str(suspect) + "\n")

    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    pl.matshow(cm)
    pl.title('Confusion matrix')
    pl.colorbar()
    pl.ylabel('True label')
    pl.xlabel('Predicted label')
    pl.show()
    print accuracy_score(y_pred, y_test)
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
class FeatureUnionDataFrame(TransformerMixin):
    """Wrap a FeatureUnion so transform() returns a labelled DataFrame."""

    def __init__(self, *args, **kwargs):
        self.fu = FeatureUnion(*args, **kwargs)

    def fit(self, X, y=None, **kwargs):
        self.fu.fit(X, y, **kwargs)
        return self

    def transform(self, X, y=None, **fit_params):
        return pd.DataFrame(self.fu.transform(X),
                            columns=self.fu.get_feature_names())

    def get_feature_names(self):
        return self.fu.get_feature_names()

    def set_params(self, **kwargs):
        self.fu.set_params(**kwargs)
        return self  # return self, matching the scikit-learn estimator API

    def get_params(self, deep=False):
        return self.fu.get_params(deep=deep)
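# -----------------------------------------------------------------------------
# Hedged usage sketch for the wrapper above, assuming scikit-learn < 1.2
# (where FeatureUnion.get_feature_names() still exists) and a member that both
# provides get_feature_names() and returns a dense array, e.g.
# DictVectorizer(sparse=False). The toy records are illustrative.
from sklearn.feature_extraction import DictVectorizer

records = [{"a": 1.0, "b": 2.0}, {"a": 3.0, "b": 4.0}]
fu_df = FeatureUnionDataFrame([("dv", DictVectorizer(sparse=False))])
out = fu_df.fit(records).transform(records)
print(out.columns.tolist())  # ['dv__a', 'dv__b']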
def test_set_feature_union_step_drop():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ["x2"]
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ["x3"]
    X = np.asarray([[1]])

    ft = FeatureUnion([("m2", mult2), ("m3", mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert ["m2__x2", "m3__x3"] == ft.get_feature_names()

    with pytest.warns(None) as record:
        ft.set_params(m2="drop")
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert ["m3__x3"] == ft.get_feature_names()
    assert not record

    with pytest.warns(None) as record:
        ft.set_params(m3="drop")
        assert_array_equal([[]], ft.fit(X).transform(X))
        assert_array_equal([[]], ft.fit_transform(X))
    assert [] == ft.get_feature_names()
    assert not record

    with pytest.warns(None) as record:
        # check we can change back
        ft.set_params(m3=mult3)
        assert_array_equal([[3]], ft.fit(X).transform(X))
    assert not record

    with pytest.warns(None) as record:
        # Check 'drop' step at construction time
        ft = FeatureUnion([("m2", "drop"), ("m3", mult3)])
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert ["m3__x3"] == ft.get_feature_names()
    assert not record
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert "chars__" in feat or "words__" in feat
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(AttributeError,
                         'Transformer tr1 (type Transf) does not provide '
                         'get_feature_names',
                         ft.get_feature_names)
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(AttributeError,
                         'Transformer tr1 (type Transf) does not provide '
                         'get_feature_names',
                         ft.get_feature_names)
def run():
    df = pd.read_csv("../input/train.csv", usecols=["description", "title"])
    df_test = pd.read_csv("../input/test.csv", usecols=["description", "title"])
    df = pd.concat([df, df_test], axis=0)

    cleanup = Cleanup()
    df["title"] = df["title"].fillna("").apply(lambda x: cleanup.process2(x))
    df["description"] = (
        df["description"].fillna("").apply(lambda x: cleanup.process2(x)))

    tfidf_para = {
        "stop_words": set(stopwords.words("russian")),
        "analyzer": "word",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df": .05,
        # "max_df": .9,
        "smooth_idf": False,
    }

    vectorizer = FeatureUnion([
        (
            "description",
            TfidfVectorizer(ngram_range=(1, 2),
                            max_features=17000,
                            **tfidf_para,
                            preprocessor=get_col("description")),
        ),
        (
            "title",
            CountVectorizer(
                ngram_range=(1, 2),
                stop_words=set(stopwords.words("russian")),
                preprocessor=get_col("title"),
            ),
        ),
    ])

    vectorizer.fit(df.to_dict("records"))
    out_df = vectorizer.transform(df.to_dict("records"))
    vocab = vectorizer.get_feature_names()

    with open("../cache/feature_tfidf_names.pkl", "wb") as f:
        pickle.dump(vocab, f)

    return out_df
def data_vectorize(df):
    russian_stop = set(stopwords.words("russian"))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": "word",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df": 5,
        # "max_df": .9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ("description", TfidfVectorizer(ngram_range=(1, 2),
                                        max_features=36000,
                                        **tfidf_para,
                                        preprocessor=get_col("description"))),
        ("title_description", TfidfVectorizer(ngram_range=(1, 2),
                                              max_features=200000,
                                              **tfidf_para,
                                              preprocessor=get_col("title_description"))),
        ("text_feature", CountVectorizer(ngram_range=(1, 2),
                                         preprocessor=get_col("text_feature"))),
        ("title", TfidfVectorizer(ngram_range=(1, 2),
                                  **tfidf_para,
                                  preprocessor=get_col("title"))),
    ])

    vectorizer.fit(df.to_dict("records"))
    ready_full_df = vectorizer.transform(df.to_dict("records"))
    tfvocab = vectorizer.get_feature_names()

    df.drop([
        "text_feature", "text_feature_2", "description", "title",
        "title_description"
    ], axis=1, inplace=True)
    df.fillna(-1, inplace=True)
    return df, ready_full_df, tfvocab
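# -----------------------------------------------------------------------------
# Minimal, self-contained sketch of the dict-records pattern used above: each
# vectorizer receives the whole record and its `preprocessor` picks one field.
# The toy data and field names are illustrative; get_feature_names() assumes
# scikit-learn < 1.2.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion

records = [{"title": "red bike", "description": "a fast red bike"},
           {"title": "blue car", "description": "a slow blue car"}]

def get_col(col_name):
    return lambda record: record[col_name]

vec = FeatureUnion([
    ("title", CountVectorizer(preprocessor=get_col("title"))),
    ("description", TfidfVectorizer(preprocessor=get_col("description"))),
])
X = vec.fit_transform(records)
# Each name is prefixed with its step, e.g. "title__bike".
print(vec.get_feature_names())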
def test_same_result(self):
    X, Z = self.make_text_rdd(2)

    loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

    loc_word = CountVectorizer(analyzer="word")
    dist_word = SparkCountVectorizer(analyzer="word")

    loc_union = FeatureUnion([
        ("chars", loc_char),
        ("words", loc_word)
    ])
    dist_union = SparkFeatureUnion([
        ("chars", dist_char),
        ("words", dist_word)
    ])

    # test same feature names
    loc_union.fit(X)
    dist_union.fit(Z)
    assert_equal(
        loc_union.get_feature_names(),
        dist_union.get_feature_names()
    )

    # test same results
    X_transformed = loc_union.transform(X)
    Z_transformed = sp.vstack(dist_union.transform(Z).collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())

    # test same results with fit_transform
    X_transformed = loc_union.fit_transform(X)
    Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())

    # test same results in parallel
    loc_union_par = FeatureUnion([
        ("chars", loc_char),
        ("words", loc_word)
    ], n_jobs=2)
    dist_union_par = SparkFeatureUnion([
        ("chars", dist_char),
        ("words", dist_word)
    ], n_jobs=2)

    loc_union_par.fit(X)
    dist_union_par.fit(Z)
    X_transformed = loc_union_par.transform(X)
    Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
def train_model(trainset):
    # create 2 blocks of features, word and character ngrams, size of 2 (using TF-IDF method)
    # we can also append here multiple other features in general
    word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2),
                                  binary=False, max_features=2000)
    char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char",
                                  binary=False, min_df=0, max_features=2000)

    # our vectors are the feature union of word/char ngrams
    vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])

    corpus, classes = [], []
    for item in trainset:
        corpus.append(item['text'])
        classes.append(item['label'])

    print "num of training instances: ", len(classes)
    print "num of training classes: ", len(set(classes))

    # fit the model of tfidf vectors for the corpus
    matrix = vectorizer.fit_transform(corpus)
    print "num of features: ", len(vectorizer.get_feature_names())
    print "training model"

    X = matrix.toarray()
    y = np.asarray(classes)
    print X[0]

    # Here are results of several different models for Law corpus:
    # model = SVC(kernel='sigmoid')                      # -> 0.38
    # model = KNeighborsClassifier(algorithm='kd_tree')  # -> 0.41
    # model = AdaBoostClassifier()                       # -> 0.46
    # model = RandomForestClassifier()                   # -> 0.52
    # model = LogisticRegression()                       # -> 0.65
    model = LinearSVC(loss='l1', dual=True)              # -> 0.70
    # Results of several different models for Enron corpus:
    # model = LinearSVC(loss='l1', dual=True)            # -> 0.6

    scores = cross_validation.cross_val_score(estimator=model,
                                              X=matrix.toarray(),
                                              y=np.asarray(classes), cv=10)
    print "10-fold cross-validation results:", "mean score = ", scores.mean(), \
        "std=", scores.std(), ", num folds =", len(scores)
def test_feature_union_feature_names():
    JUNK_FOOD_DOCS = (
        "the pizza pizza beer copyright",
        "the pizza burger beer copyright",
        "the the pizza beer beer copyright",
        "the burger beer beer copyright",
        "the coke burger coke copyright",
        "the coke burger burger",
    )
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
def test_feature_stacker_feature_names():
    JUNK_FOOD_DOCS = (
        "the pizza pizza beer copyright",
        "the pizza burger beer copyright",
        "the the pizza beer beer copyright",
        "the burger beer beer copyright",
        "the coke burger coke copyright",
        "the coke burger burger",
    )
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
def make_tfidf(train, test):
    russian_stop = set(stopwords.words('russian'))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": 'word',
        "token_pattern": r'\w{1,}',
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": 'l2',
        # "min_df": 5,
        # "max_df": .9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ('description', TfidfVectorizer(ngram_range=(1, 2),
                                        max_features=100,
                                        **tfidf_para,
                                        preprocessor=get_col('description'))),
        # ('text_feat', CountVectorizer(
        #     ngram_range=(1, 2),
        #     # max_features=7000,
        #     preprocessor=get_col('text_feat'))),
        ('title', TfidfVectorizer(ngram_range=(1, 2),
                                  **tfidf_para,
                                  max_features=70,
                                  preprocessor=get_col('title')))
    ])

    vectorizer.fit(train)
    ret_df = vectorizer.transform(train)
    feature_names = vectorizer.get_feature_names()
    return ret_df, feature_names

    # vectorizer.fit(df.loc[traindex, :].to_dict('records'))
    # ready_df = vectorizer.transform(df.to_dict('records'))
    # tfvocab = vectorizer.get_feature_names()
    #
    # # get char count
    # length_of_words = len(df["len"])
def extract_features(dataset_dir):
    print_message("Extracting Features")
    X, y = get_data(dataset_dir)
    glbs.LABELS = y
    glbs.DATASET_DATA = X
    ########################################
    # X, y = zip(*list(zip(X, y))[:160])
    # from help_functions import get_fetuer_by_DF
    # get_fetuer_by_DF(X)
    ########################################
    feature_lst = []
    # add all the N-Grams features to the list
    for feature in glbs.FEATURES:
        if is_ngrams(feature):
            vectorizer = get_vectorizer(feature)
            feature_lst = add_feature(feature_lst, feature, vectorizer)
    # add all the stylistic features to the list
    for feature in glbs.STYLISTIC_FEATURES:
        vectorizers = get_stylistic_features_vectorizer(feature)
        for i in range(len(vectorizers)):
            feature_lst = add_feature(feature_lst, feature + str(i), vectorizers[i])
    # combine the list into one vectorizer using FeatureUnion
    if glbs.MULTIPROCESSING:
        n_jobs = -1
    else:
        n_jobs = None
    all_features = FeatureUnion(feature_lst, n_jobs=n_jobs)
    glbs.FEATURE_MODEL.append(all_features)
    all_features.fit(X, y)
    glbs.NUM_OF_FEATURE = len(all_features.get_feature_names())
    if glbs.SELECTION:
        from feature_selction import get_selected_features
        get_selected_features(X, y, all_features)
    return X, y
def main():
    qtrain = read_set()
    # X_train = gen_features(qtrain)
    Y_train = get_ans(qtrain)
    qtest = read_set()
    # X_test = gen_features(qtest)
    # (X_train, X_test), featkeys = dictVec(X_train, X_test)

    # tfidf_word = TfidfVectorizer(preprocessor=lambda x: x['question_text'].lower(),
    #                              ngram_range=(1, 3), analyzer="word",
    #                              binary=False, min_df=3)
    tfidf_word = TfidfVectorizer(preprocessor=exa, ngram_range=(1, 3),
                                 analyzer="word", binary=False, min_df=0.05)
    # feat_select = SelectPercentile(score_func=f_regression_, percentile=0.15)
    feat_select = SelectKBest(score_func=f_regression_,
                              k=QN_PARAMS[QUESTION]['features_select'])
    cf = CustomFeat()
    feat = FeatureUnion([('word_counts', tfidf_word), ('custom', cf)])
    # feat = FeatureUnion([('custom', cf)])
    # feat = FeatureUnion([('word_counts', tfidf_word)])
    # est = ESTIMATOR(**params[SETTINGS['EST']])
    w_model = Pipeline([('funion', feat), ('feat_select', feat_select)])  # , ('est', est)]

    # w_X_train = tfidf_word.fit_transform(qtrain)
    # w_X_test = tfidf_word.transform(qtest)
    # print_err(w_X_train[0])
    # X_train = w_X_train
    # X_test = w_X_test
    # featkeys = tfidf_word.get_feature_names()
    # feat_select
    # f_regression_(X_train[:,0], Y_train)
    # print_err('fitting')
    # w_model.fit(qtrain, Y_train)
    # print_err(feat_select.get_support(indices=True))
    X_train = w_model.fit_transform(qtrain, Y_train).toarray()
    X_test = w_model.transform(qtest).toarray()
    featkeys = np.asarray(feat.get_feature_names())[feat_select.get_support(indices=True)]
    # featkeys = []
    # Y_test = classify(w_model, qtest)
    # print_err(est.coef_.nonzero())
    clf = get_clf(X_train, Y_train, feat_indices=featkeys,
                  clf_used=SETTINGS['EST'],
                  grid_search=SETTINGS['GRIDSEARCH'])
    Y_test = classify(clf, X_test)
    for qn, pans in zip(qtest, Y_test):
        print json.dumps({
            'question_key': qn['question_key'].encode('ascii'),
            '__ans__': pans
        })
class POSTagsTransformer(TransformerMixin, BaseEstimator):

    def __init__(self):
        self.init_transformer = None

    def fit(self, X, y=None):
        """All SciKit-Learn compatible transformers and classifiers have the
        same interface. `fit` always returns the same object."""
        feature_lst = []
        all_posts_pos, all_pos = get_corpus_pos_tags(X)
        for i, pos in enumerate(all_pos):
            feature_lst += [("pos" + pos + str(i),
                             InitTransformer(pos, all_posts_pos))]
        self.init_transformer = FeatureUnion(feature_lst)
        return self

    def transform(self, X):
        return self.init_transformer.transform(X)

    def get_feature_names(self):
        """Array mapping from feature integer indices to feature name"""
        return self.init_transformer.get_feature_names()
def _transform(ds_url: str) -> pd.DataFrame:
    normal_list, anomalous_list = data_sets.get(ds_url)

    X_normal_list = []
    X_anomalous_list = []
    for tf_list in (NUMBERS_TF_LIST, RAW_DATA_TF_LIST):
        # make and fit feature union
        fu = FeatureUnion(
            [(class_.__name__.replace('Transformer', ''), class_())
             for class_ in tf_list],
            n_jobs=-1)
        fu.fit(normal_list)

        # create column MultiIndex
        col_tuples = []
        for s in fu.get_feature_names():
            s = '{}__{}'.format(s[:2], s[2:])
            source_0, tf_name, source_1, tf_part = s.split('__')
            col_tuples.append((tf_name, tf_part, source_0, source_1))
        idx = pd.MultiIndex.from_tuples(col_tuples, names=COLUMN_NAMES)

        # transform requests
        X_normal = pd.DataFrame(fu.transform(normal_list), columns=idx)
        X_anomalous = pd.DataFrame(fu.transform(anomalous_list), columns=idx)
        X_normal_list.append(X_normal)
        X_anomalous_list.append(X_anomalous)

    # concatenate all features
    X_normal = pd.concat(X_normal_list, axis=1)  # type: pd.DataFrame
    X_anomalous = pd.concat(X_anomalous_list, axis=1)  # type: pd.DataFrame

    # add meta
    X_normal[META_ID] = X_normal.index
    X_normal[META_TRUE_LABEL] = 'normal'
    X_anomalous[META_ID] = X_anomalous.index
    X_anomalous[META_TRUE_LABEL] = 'anomalous'

    return pd.concat([X_normal, X_anomalous], ignore_index=True)
def dump_train():
    _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data()
    train_df = f.make_data_df(train_gray_data, labels)
    test_df = f.make_test_df(test_gray_data)
    train_df = train_df.reset_index()
    test_df = test_df.reset_index()
    train_df.columns = ["pngname", "input", "label"]
    test_df.columns = ["pngname", "input"]

    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    feature_name_list = [s.split("__")[1] for s in fu.get_feature_names()]
    feature_name_list.append("target")

    train_X = fu.fit_transform(train_df)
    train_y = np.concatenate(train_df["label"].apply(lambda x: x.flatten()))
    train_X, train_y = cl.downsampling_data(train_X, train_y, 0.2)

    train_dump = pd.DataFrame(np.c_[train_X, train_y],
                              columns=feature_name_list)
    dump_path = os.path.abspath(os.path.dirname(__file__)) +\
        "/../tmp/train_dump"
    train_dump.to_csv(dump_path + "/train_dump.csv", index=False)
def test_feature_union():
    df = _create_test_data()
    features = FeatureUnion(transformer_list=[
        ('scale_A', series_pipeline('col_A', [ScalingTransformer(2.0)])),
        ('scale_B', series_pipeline('col_B', [ScalingTransformer(-5.0)])),
        ('null_A', series_pipeline('col_A', [NullTransformer()])),
        ('daysofweek', series_pipeline('dates',
                                       [DateAttributeTransformer('dayofweek')])),
        ('is_some_day', series_pipeline('dates', [
            MultiDateTransformer([
                date(2014, 10, 4),
                date(2016, 5, 30),
            ])
        ])),
        ('linear_date', series_pipeline('dates', [LinearDateTransformer()])),
        ('label_A', series_pipeline('col_A', [LabelEncoderWithUnknown()])),
    ])
    features.fit(df)
    ret = features.transform(df)
    feature_names = features.get_feature_names()

    assert ret.shape[0] == df.shape[0]
    assert ret.shape[1] == len(feature_names)
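# -----------------------------------------------------------------------------
# `series_pipeline` is assumed by the test above but not shown. A hypothetical
# minimal shape: a Pipeline that selects one DataFrame column, applies the
# given steps, and forwards get_feature_names() from the last step so
# FeatureUnion.get_feature_names() keeps working. ColumnSelector is
# illustrative, not from the original code.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.column]


class NamedPipeline(Pipeline):
    def get_feature_names(self):
        # forward to the last step so FeatureUnion can prefix its names
        return self.steps[-1][1].get_feature_names()


def series_pipeline(column, steps):
    named_steps = [('select', ColumnSelector(column))]
    named_steps += [(type(s).__name__.lower(), s) for s in steps]
    return NamedPipeline(named_steps)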
        TfidfVectorizer(max_features=20000,
                        **tfidf_para,
                        preprocessor=get_col('title'))),
])

dftrl.index = df.index
dftrl.fillna('', inplace=True)
dftrl.head()

start_vect = time.time()
text_cols = ['text', 'text_feat', 'title']
vectorizer1.fit(df[text_cols].loc[traindex, :].to_dict('records'))
text_cols = ['text', 'title']
vectorizer2.fit(dftrl[text_cols].loc[traindex, :].to_dict('records'))
ready_df = vectorizer1.transform(df.to_dict('records'))
ready_dftrl = vectorizer2.transform(dftrl.to_dict('records'))
tfvocab1 = vectorizer1.get_feature_names()
tfvocab2 = vectorizer2.get_feature_names()
print('[{}] Vectorisation completed'.format(time.time() - start_time))

# Drop Text Cols
df.drop(textfeats + ['text', 'all_titles'], axis=1, inplace=True)
del dftrl
gc.collect()

print('[{}] Drop all the categorical'.format(time.time() - start_time))
df.drop(categorical, axis=1, inplace=True)
gc.collect()

ready_df.shape
ready_dftrl.shape
})

X_train = dft["text"]
stop_word_lib = set(stopwords.words('english'))
tfidf_param = {
    "stop_words": stop_word_lib,
    "analyzer": 'word',
    "token_pattern": r'\w{1,}',
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    # "min_df": 5,
    # "max_df": .9,
    "smooth_idf": False
}

unioned = FeatureUnion([
    ("colA", TfidfVectorizer(ngram_range=(1, 2), max_features=50, **tfidf_param)),
    ("colB", CountVectorizer(min_df=0, stop_words=stop_word_lib))
])

print("here")
unioned.fit(dft)
res_df = unioned.transform(dft)
f_names = unioned.get_feature_names()
print(f_names)
print(res_df.toarray())
print("here")
unigram_vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=1)
temp_uni_tfidf = unigram_vectorizer.fit_transform(X_train).toarray()
# n_features = len(unigram_vectorizer.get_feature_names())
n_features = 9000
multigrams_vectorizer = TfidfVectorizer(ngram_range=(2, 2), min_df=2,
                                        max_features=n_features)
comb_vectorizer = FeatureUnion([("uni_vec", unigram_vectorizer),
                                ("multi_vec", multigrams_vectorizer)])
# comb_vectorizer.set_params(multi_vec=None)
X_tfidf = comb_vectorizer.fit_transform(X_train).toarray()

with open("vectorizer_NEALTA_Binary.pk", "wb") as vect_file:
    pickle.dump(comb_vectorizer, vect_file)

feature_names = comb_vectorizer.get_feature_names()
print("num_features: " + str(len(feature_names)))
# print(feature_names[:50])
print("features extracted & tfidf transformed")

# Transform documents to document-term matrix (.transform) - no learning
# involved as it is test data.
# For test data
# X_test_tfidf = vectorizer.transform(X_test).toarray()
X_test_tfidf = comb_vectorizer.transform(X_test).toarray()

print('Creating Linear SVC Model...')
# model = svm.LinearSVC(C=1000)
model = svm.SVC(kernel="linear", C=1000, cache_size=5000, probability=True)
# model = svm.SVC(kernel="rbf", C=1, cache_size=5000, probability=True)
print('Linear SVC Model created!')
# .fit(X, y[, sample_weight]): Fit the model according to the given training data
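# -----------------------------------------------------------------------------
# As the commented `set_params(multi_vec=None)` line above hints, a
# FeatureUnion member can be disabled via set_params. A minimal sketch, using
# the 'drop' sentinel that replaced None in newer scikit-learn; the toy docs
# are illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

docs = ["one two three", "two three four"]
union = FeatureUnion([("uni", TfidfVectorizer(ngram_range=(1, 1))),
                      ("bi", TfidfVectorizer(ngram_range=(2, 2)))])
print(union.fit_transform(docs).shape)  # unigram + bigram columns
union.set_params(bi="drop")
print(union.fit_transform(docs).shape)  # unigram columns only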
def preparTotalData(y, df, predictors, len_train, len_test, frm, to,
                    tot_filename):
    y, df, predictors, len_train, categorical, textfeats = preparBaseData(
        y, df, predictors, len_train, len_test, frm, to, tot_filename)

    print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
    russian_stop = set(stopwords.words('russian'))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": 'word',
        "token_pattern": r'\w{1,}',
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": 'l2',
        # "min_df": 5,
        # "max_df": .9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ('description', TfidfVectorizer(ngram_range=(1, 2),
                                        max_features=17000,
                                        **tfidf_para,
                                        preprocessor=get_col('description'))),
        ('title', TfidfVectorizer(ngram_range=(1, 2),
                                  **tfidf_para,
                                  # max_features=7000,
                                  preprocessor=get_col('title')))
    ])

    start_vect = time.time()
    # vectorizer.fit(df.loc[traindex, :].to_dict('records'))
    vectorizer.fit(df[:len_train].to_dict('records'))
    ready_df = vectorizer.transform(df.to_dict('records'))
    tfvocab = vectorizer.get_feature_names()
    print("Vectorization Runtime: %0.2f Minutes" %
          ((time.time() - start_vect) / 60))

    # Drop Text Cols
    df.drop(textfeats, axis=1, inplace=True)

    # from sklearn.metrics import mean_squared_error
    from math import sqrt

    kf = KFold(len_train, n_folds=NFOLDS, shuffle=True, random_state=SEED)
    ridge_params = {
        'alpha': 30.0,
        'fit_intercept': True,
        'normalize': False,
        'copy_X': True,
        'max_iter': None,
        'tol': 0.001,
        'solver': 'auto',
        'random_state': SEED
    }
    ridge = SklearnWrapper(clf=Ridge, seed=SEED, params=ridge_params)
    ridge_oof_train, ridge_oof_test = get_oof(ridge, ready_df[:len_train], y,
                                              ready_df[len_train:], len_train,
                                              len_test, kf)
    # rms = sqrt(mean_squared_error(y, ridge_oof_train))

    ridge_preds = np.concatenate([ridge_oof_train, ridge_oof_test])
    df['ridge_preds'] = ridge_preds
    predictors.append('ridge_preds')

    df = kaggle_util.reduce_mem_usage(df)
    return y, df, ready_df, tfvocab, predictors, len_train, categorical
vectorizer_param = {'preprocessor': preprocessor,
                    'ngram_range': parameters['ngram_range'],
                    'analyzer': 'word',
                    'min_df': parameters['min_df'],
                    'max_df': parameters['max_df'],
                    'binary': parameters['TF_binary'],
                    'norm': parameters['norm'],
                    'sublinear_tf': parameters['sublinear_tf'],
                    'max_features': parameters['max_features']}

if __name__ == "__main__":
    unigram = StemmedTfidfVectorizer(**vectorizer_param)
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()

    log_state('combine unigram and avg strength features')
    combined_features = FeatureUnion([('unigram', unigram),
                                      ('avg_strength', avg_strength)])
    # log_state('combine unigram and strength features')
    # combined_features = FeatureUnion([('unigram', unigram), ('strength', strength)])
    # log_state('combine unigram and anew features')
    # combined_features = FeatureUnion([('unigram', unigram), ('anew', anew)])
    # log_state('combine unigram and punctuation features')
    # combined_features = FeatureUnion([('unigram', unigram), ('pct', pct)])

    texts, _ = load_train_data('Sentiment140')
    transformed_train = combined_features.fit_transform(texts)

    testdata, _ = load_test_data()
    transformed_test = combined_features.transform(testdata)

    dump_picle(combined_features.get_feature_names(),
               './data/features/feature_names.p')
    dump_picle(transformed_train, "./data/transformed_data/transformed_train.p")
    dump_picle(transformed_test, "./data/transformed_data/transformed_test.p")
from Utils import load_test_data

X_test, Y_test = load_test_data()
X_test, Y_test = np.array(X_test), np.array(Y_test)

if parameters['combine_feature'] == True:
    from vectorizer_estimator import StatisticVectorizer
    from sklearn.pipeline import FeatureUnion
    statistic_vec = StatisticVectorizer()
    combined_features = FeatureUnion([('ngrams', vectorizer),
                                      ('statistic_vec', statistic_vec)])
else:
    combined_features = vectorizer

trian_vec = combined_features.fit_transform(X_train)
pickle.dump(combined_features.get_feature_names(),
            open('./debug/feature_names.p', 'wb'))
test_vec = combined_features.transform(X_test)  # use transform for test data, instead of fit_transform
# clf = pickle.load(open("./acc_tmp/clf_all_data_noclustering.p", "rb"))

if parameters['classifier'] == 'svm':
    from sklearn import svm
    clf = svm.SVC()
    clf.fit(trian_vec.toarray(), Y_train)
    pickle.dump(trian_vec, open("./debug/trian_vec.p", "wb"))
    pickle.dump(clf, open("./acc_tmp/clf.p", "wb"))
    true_labels = Y_test
    predict_labels = np.array(clf.predict(test_vec.toarray()))
    precision, recall, fbeta_score, support = precision_recall_fscore_support(
        true_labels, predict_labels, average='binary')
    print('Precision: %.3f\nRecall: %.3f\nF-score: %.3f'
          % (precision, recall, fbeta_score))
else:
def create_text_feature(df, todir, ext, language, max_features):
    print('\n>> doing Text Features')

    if language == 'russian':
        df = remove_english(df)
    else:
        df = remove_russian(df)
        df = rename_en_to_ru(df)

    if language == 'russian':
        filename = todir + 'text_feature_kernel_' + str(max_features) + ext
        filename_len = todir + 'len_feature_kernel' + ext
        filename_vocab = todir + 'vocab_' + str(max_features) + ext
        suffix_feature = ''
    else:
        filename = todir + 'text_feature_kernel_' + str(max_features) + '_en' + ext
        filename_len = todir + 'len_feature_kernel_en' + ext
        filename_vocab = todir + 'vocab_' + str(max_features) + '_en' + ext
        suffix_feature = '_en'

    if os.path.exists(filename):
        print('done already...')
        df = load_file(filename_len, ext)
        ready_df = load_file(filename, ext)
    else:
        df = get_original_data(
            df, ['param_1', 'param_2', 'param_3', 'description', 'title'])
        # Group Param Features
        df['text_feat'] = df.apply(lambda row: ' '.join(
            [str(row['param_1']), str(row['param_2']), str(row['param_3'])]),
            axis=1)
        print(df[['text_feat', 'param_1', 'param_2', 'param_3']].head())
        print(df[['text_feat', 'param_1', 'param_2', 'param_3']].tail())
        df.drop(["param_1", "param_2", "param_3"], axis=1, inplace=True)

        textfeats = ["description", "text_feat", "title"]
        for cols in textfeats:
            df[cols] = df[cols].astype(str)
            df[cols] = df[cols].astype(str).fillna('n/a')  # FILL NA
            df[cols] = df[cols].str.lower()  # Lowercase all text, so that capitalized words dont get treated differently
            df[cols + '_num_chars' + suffix_feature] = df[cols].apply(len)  # Count number of Characters
            df[cols + '_num_words' + suffix_feature] = df[cols].apply(
                lambda comment: len(comment.split()))  # Count number of Words
            df[cols + '_num_unique_words' + suffix_feature] = df[cols].apply(
                lambda comment: len(set(w for w in comment.split())))
            df[cols + '_words_vs_unique' + suffix_feature] = \
                df[cols + '_num_unique_words' + suffix_feature] / \
                df[cols + '_num_words' + suffix_feature] * 100  # Count Unique Words
        print_memory()

        print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
        language_stop = set(stopwords.words(language))
        tfidf_para = {
            "stop_words": language_stop,
            "analyzer": 'word',
            "token_pattern": r'\w{1,}',
            "sublinear_tf": True,
            "dtype": np.float32,
            "norm": 'l2',
            # "min_df": 5,
            # "max_df": .9,
            "smooth_idf": False
        }

        if max_features > 0:
            vectorizer = FeatureUnion([
                ('description', TfidfVectorizer(ngram_range=(1, 2),
                                                max_features=max_features,
                                                **tfidf_para,
                                                preprocessor=get_col('description'))),
                ('text_feat', CountVectorizer(ngram_range=(1, 2),
                                              preprocessor=get_col('text_feat'))),
                ('title', TfidfVectorizer(ngram_range=(1, 2),
                                          **tfidf_para,
                                          preprocessor=get_col('title')))
            ])
        else:
            vectorizer = FeatureUnion([
                ('description', TfidfVectorizer(ngram_range=(1, 2),
                                                **tfidf_para,
                                                preprocessor=get_col('description'))),
                ('text_feat', CountVectorizer(ngram_range=(1, 2),
                                              preprocessor=get_col('text_feat'))),
                ('title', TfidfVectorizer(ngram_range=(1, 2),
                                          **tfidf_para,
                                          preprocessor=get_col('title')))
            ])

        vectorizer.fit(df.to_dict('records'))
        ready_df = vectorizer.transform(df.to_dict('records'))
        tfvocab = vectorizer.get_feature_names()
        print_memory()

        # Drop Text Cols
        df.drop(textfeats, axis=1, inplace=True)

        print("Modeling Stage")
        # Combine Dense Features with Sparse Text Bag of Words Features
        # ready_df = ready_df.astype(np.float32)
        X = hstack([csr_matrix(df.values), ready_df])
        for shape in [X]:
            print("{} Rows and {} Cols".format(*shape.shape))
        print_memory()

        save_file(df, filename_len, ext)
        print('>> saving to', filename)
        with open(filename, "wb") as f:
            pickle.dump((ready_df, tfvocab), f)
        del ready_df, tfvocab
        gc.collect()
        with open(filename, "rb") as f:
            ready_df, tfvocab = pickle.load(f)

    return df, ready_df
X_test, X_test_labels = clustering_test_data_method(
    X_test, X_train, cluster_size)
X_test, Y_test = np.array(X_test), np.array(Y_test)

if parameters['combine_feature'] == True:
    from vectorizer_estimator import StatisticVectorizer
    from sklearn.pipeline import FeatureUnion
    statistic_vec = StatisticVectorizer()
    combined_features = FeatureUnion([('ngrams', vectorizer),
                                      ('statistic_vec', statistic_vec)])
else:
    combined_features = vectorizer

trian_vec = combined_features.fit_transform(X_train)
pickle.dump(combined_features.get_feature_names(),
            open('./debug/feature_names.p', 'wb'))
test_vec = combined_features.transform(
    X_test)  # use transform for test data, instead of fit_transform
# clf = pickle.load(open("./acc_tmp/clf_all_data_noclustering.p", "rb"))

if parameters['classifier'] == 'svm':
    from sklearn import svm
    clf = svm.SVC()
    clf.fit(trian_vec.toarray(), Y_train)
    pickle.dump(trian_vec, open("./debug/trian_vec.p", "wb"))
    pickle.dump(clf, open("./acc_tmp/clf.p", "wb"))
    true_labels = Y_test
    predict_labels = np.array(clf.predict(test_vec.toarray()))
    precision, recall, fbeta_score, support = precision_recall_fscore_support(
def train_model(trainset):
    # create two blocks of features, word and character ngrams, size of 2
    # we can also append here multiple other features in general
    word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2),
                                  binary=False, max_features=2000)
    char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char",
                                  binary=False, min_df=0, max_features=2000)

    # our vectors are the feature union of word/char ngrams
    vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])

    # corpus is a list with the n-word chunks
    corpus = []
    # classes is the labels of each chunk
    classes = []

    # load training sets, for males & females
    for item in trainset:
        corpus.append(item['text'])
        classes.append(item['label'])

    print "num of training instances: ", len(classes)
    print "num of training classes: ", len(set(classes))

    # fit the model of tfidf vectors for the corpus
    matrix = vectorizer.fit_transform(corpus)
    print "num of features: ", len(vectorizer.get_feature_names())
    print "training model"

    X = matrix.toarray()
    y = np.asarray(classes)
    model = LinearSVC(loss='l1', dual=True)

    # scores = cross_validation.cross_val_score(estimator=model,
    #                                           X=matrix.toarray(),
    #                                           y=np.asarray(classes), cv=10)
    # http://scikit-learn.org/stable/auto_examples/plot_confusion_matrix.html
    # print scores
    # print "10-fold cross validation results:", "mean score = ", scores.mean(), "std=", scores.std(), ", num folds =", len(scores)
    # model.fit(X=matrix.toarray(), y=np.asarray(classes))
    # predicted = model.predict(matrix.toarray())
    # cm = confusion_matrix(classes, predicted)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    y_pred = model.fit(X_train, y_train).predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    pl.matshow(cm)
    pl.title('Confusion matrix')
    pl.colorbar()
    pl.ylabel('True label')
    pl.xlabel('Predicted label')
    pl.show()
        **countv_para,
        preprocessor=get_col('text_feat'))),
    ('title', CountVectorizer(
        **countv_para,
        preprocessor=get_col('title'))),
    ('translation', TfidfVectorizer(
        # ngram_range=(1, 2),
        max_features=40000,
        **tfidf_para,
        preprocessor=get_col('translation'))),
])

start_vect = time.time()
vectorizer.fit(df.loc[traindex, :].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
tfvocab[:50]
print('[{}] Vectorisation completed'.format(time.time() - start_time))

# Drop Text Cols
df.drop(textfeats + ['text', 'all_titles', 'translation'], axis=1, inplace=True)
gc.collect()

print('[{}] Drop all the categorical'.format(time.time() - start_time))
df.drop(categorical, axis=1, inplace=True)

# Training and Validation Set
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
class FeatureExtractor(object):
    """Class for extracting features from dataframe object.

    Attributes
    ----------
    _transform_only: bool
        True, if extractor was initiated from previously serialized
        vectorizer and labelencoder.
    """

    def __init__(self, settings, dataframe, vectorizer=None, labelencoder=None):
        """Initiate a featureextractor.

        Parameters
        ----------
        settings: Settings
            Settings instance of the classification task.
        dataframe: Dataframe
            Pandas Dataframe object containing the data compatible with the
            settings.

        Keyword parameters
        ------------------
        vectorizer: str
            Pickled and base64-encoded vectorizer from previous serialization.
        labelencoder: str
            Pickled and base64-encoded labelencoder from previous
            serialization.
        """
        assert isinstance(settings, Settings)
        self._settings = settings
        self._dataframe = dataframe
        self._cache = {}
        self._transform_only = False

        if vectorizer is None:
            self._vectorizer = FeatureUnion(
                [
                    ("lemma", CountVectorizer(
                        analyzer=SimpleTextAnalyzer(settings.unifier),
                        binary=True, min_df=5)),
                    ("word", TfidfVectorizer(
                        analyzer="word", min_df=5, use_idf=False,
                        ngram_range=(2, 3))),
                ],
                n_jobs=NUM_CORES,
            )
        else:
            self._vectorizer = pickle.loads(base64.b64decode(vectorizer))
            self._transform_only = True

        if labelencoder is None:
            self._labelencoder = preprocessing.LabelEncoder()
        else:
            self._labelencoder = pickle.loads(base64.b64decode(labelencoder))
            self._transform_only = True

    def export(self):
        return {
            "settings": self.settings.export(),
            "vectorizer": base64.b64encode(
                pickle.dumps(self._vectorizer)).decode("ascii"),
            "labelencoder": base64.b64encode(
                pickle.dumps(self._labelencoder)).decode("ascii"),
        }

    @property
    def settings(self):
        return self._settings

    @property
    def vectorizer(self):
        return self._vectorizer

    @property
    def dataframe(self):
        return self._dataframe

    @property
    def labelencoder(self):
        return self._labelencoder

    @property
    def strings(self):
        """Get feature strings.

        Returns
        -------
        list[unicode]
            Dataframe columns concatenated to a single string.
        """
        if "strings" not in self._cache:
            self._cache["strings"] = [
                " ".join(row)
                for row in self._dataframe[self._settings.features].fillna("").values
            ]
        return self._cache["strings"]

    @property
    def X(self):
        """Returns
        -------
        scipy.sparse
            Sparse matrix containing textual features for classification.
        """
        if "X" not in self._cache:
            X = self.strings
            if self._transform_only:
                self._cache["X"] = self._vectorizer.transform(X)
            else:
                self._cache["X"] = self._vectorizer.fit_transform(X)
        return self._cache["X"]

    @property
    def y(self):
        """Returns
        -------
        numpy.array
            Labels encoded as integer values. Use get_labels() for mapping
            them back to strings.
        """
        if "y" not in self._cache:
            y = list(self._dataframe[self._settings.label].fillna(""))
            if self._transform_only:
                self._cache["y"] = np.array(self._labelencoder.transform(y))
            else:
                self._cache["y"] = np.array(self._labelencoder.fit_transform(y))
        return self._cache["y"]

    @property
    def feature_names(self):
        """Returns
        -------
        list[unicode]
            Meaningful feature names for vectorized feature matrix columns.
        """
        return self._vectorizer.get_feature_names()

    @property
    def labels(self):
        """Returns
        -------
        list[unicode]
            Labels for encoded labels (y).
        """
        self.y
        return [l for l in self._labelencoder.classes_]
def train_model(train_set, mode='linear'):
    """
    Train the models, using 10-fold-cv and different classifications.

    :param train_set: The set that is used for training
    :param mode: The mode that gets used for training
    """
    global used_countries

    # Create two blocks of features, word and character n-grams, size of 2
    # We can also append here multiple other features in general
    word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2),
                                  binary=False, max_features=2000)
    char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char",
                                  binary=False, min_df=0, max_features=2000)

    # Our vectors are the feature union of word/char n-grams
    inner_vectorizer = FeatureUnion([("chars", char_vector),
                                     ("words", word_vector)])

    # Corpus is a list with the n-word chunks
    corpus = []
    # Classes is the labels of each chunk
    classes = []

    # Load training set
    for key, country_list in train_set.items():
        # print("Processing " + '"' + key + '"')
        for item in country_list:
            corpus.append(item['text'])
            classes.append(item['label'])

    print()
    print("Size of corpus: " + str(sys.getsizeof(corpus)))
    print("Number of training instances: ", len(classes))
    print("Number of training classes: ", len(set(classes)))
    print("Processed Countries:\n" + str(used_countries))

    # Fit the model of tf-idf vectors for the corpus
    x1 = inner_vectorizer.fit_transform(corpus)
    print("Number of features: ", len(inner_vectorizer.get_feature_names()))

    x = x1.toarray()
    y = np.asarray(classes)

    print()
    print("Training model...")
    if mode == 'kernel_rbf':
        inner_model = SVC(kernel="rbf")
        inner_model.fit(x, y)
    elif mode == 'kernel_poly':
        inner_model = SVC(kernel="poly")
        inner_model.fit(x, y)
    elif mode == 'kernel_linear':
        inner_model = SVC(kernel="linear")
        inner_model.fit(x, y)
    else:
        inner_model = LinearSVC(loss='hinge', dual=True)
        inner_model.fit(x, y)
        mode = 'linear'

    print("Saving model...")
    # Create the model-folder if it does not exist
    if not isdir("../data/model/"):
        makedirs("../data/model/")
    pickle.dump(inner_model, open('../data/model/trained_model_' + mode, 'wb'))

    print("Saving tfidf-vectorizer...")
    pickle.dump(inner_vectorizer, open('../data/model/vectorizer_' + mode, 'wb'))
                 .format(args.H_matrix))
        nmf_model.fit(tfidf)
        W_matrix = nmf_model.components_
        sparse_H = tfidf.dot(csr_matrix(W_matrix).transpose())
        H_matrix = np.asarray(sparse_H.todense())
    else:
        LOG.info("Using provided H matrix ({})".format(args.H_matrix))
        H_matrix = nmf_model.fit_transform(tfidf)
        W_matrix = nmf_model.components_
        reconstruction_err = nmf_model.reconstruction_err_
    LOG.info("done in %0.3fs." % (time() - t0))

    LOG.info("Labeling topics...")
    feature_names = preprocess.get_feature_names()
    topic_names = []
    max_words = max(2, args.show_top_words) \
        if args.show_top_words is not None \
        else 2
    for j, topic in enumerate(W_matrix):
        weight_words = [(topic[i], feature_names[i])
                        for i in topic.argsort()[:-max_words - 1:-1]]
        word1, word2 = weight_words[:2]
        prefix = str(j) + '_'
        topic_name = prefix + word1[1] \
            if safe_div(abs(word1[0]), abs(word2[0])) >= args.word_ratio \
            else prefix + word1[1] + '-' + word2[1]
        topic_names.append(topic_name)
print ''
word_count = SpecialWordCounter()
word_count.fit(t)
print word_count.get_feature_names()
print word_count.transform(t)

combined_features = FeatureUnion([
    ('stats', TextStats()),
    ('special_word_stats', SpecialWordCounter())
])

# Use combined features to transform dataset:
X_features = combined_features.fit(t).transform(t)
print '\nfeature union'
print 'X:', X_features
print 'names:', combined_features.get_feature_names()
print

pipeline = Pipeline([
    # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[
            ('scaled_text_stats', Pipeline([
                ('stats', TextStats()),
                ('scaling', StandardScaler())
            ])),
            ('special_word_stats', SpecialWordCounter())
        ]
    ))
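# -----------------------------------------------------------------------------
# TextStats and SpecialWordCounter above are assumed custom transformers. A
# hypothetical minimal shape for such a transformer, so that
# FeatureUnion.get_feature_names() works, might look like this; it is a
# sketch, not the original implementation.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class TextStats(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # one row per document: [n_chars, n_words]
        return np.array([[len(doc), len(doc.split())] for doc in X])

    def get_feature_names(self):
        return ["n_chars", "n_words"]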
def data_vectorize(df):
    russian_stop1 = set(stopwords.words("russian"))
    russian_stop2 = read_stopwords()
    russian_stop = list(set(russian_stop1).intersection(set(russian_stop2)))

    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": "word",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df": 5,
        # "max_df": .9,
        "smooth_idf": False
    }
    tfidf_para2 = {
        "stop_words": russian_stop,
        "analyzer": "char",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df": 5,
        # "max_df": .9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        # ("description", TfidfVectorizer(
        #     ngram_range=(1, 2),
        #     max_features=40000,  # 40000, 18000
        #     **tfidf_para,
        #     preprocessor=get_col("description"))),
        # ("title_description", TfidfVectorizer(
        #     ngram_range=(1, 2),  # (1, 2)
        #     max_features=1800,  # 40000, 18000
        #     **tfidf_para,
        #     preprocessor=get_col("title_description"))),
        # ("text_feature", CountVectorizer(
        #     ngram_range=(1, 2),
        #     preprocessor=get_col("text_feature"))),
        # ("title", TfidfVectorizer(
        #     ngram_range=(1, 2),
        #     **tfidf_para,
        #     preprocessor=get_col("title"))),
        # two newly added text features: title2, title_char
        ("title2", TfidfVectorizer(ngram_range=(1, 1),
                                   **tfidf_para,
                                   preprocessor=get_col("title"))),
        # ("title", TfidfVectorizer(
        #     ngram_range=(1, 2),
        #     **tfidf_para,
        #     preprocessor=get_col("title_ru"))),
        # # two newly added text features: title2, title_char
        # ("title2", TfidfVectorizer(
        #     ngram_range=(1, 1),
        #     **tfidf_para,
        #     preprocessor=get_col("title_ru"))),
        # doubles the running time:
        # ("title_char", TfidfVectorizer(
        #     # ngram_range=(1, 4),  # (1, 4), (1, 6)
        #     max_features=16000,  # 16000
        #     **tfidf_para2,
        #     preprocessor=get_col("title"))),
        # # newly added 2018-6-3, very slow
        # ("description_feature", CountVectorizer(
        #     ngram_range=(1, 2),
        #     stop_words=russian_stop,
        #     max_features=8000,
        #     preprocessor=get_col("description"))),
    ])

    vectorizer.fit(df.to_dict("records"))
    ready_full_df = vectorizer.transform(df.to_dict("records"))
    tfvocab = vectorizer.get_feature_names()

    df.drop(
        [
            "text_feature", "text_feature_2", "description", "title",
            "title_ru",
            # "title_description"
        ],
        axis=1, inplace=True)
    df.fillna(-1, inplace=True)
    return df, ready_full_df, tfvocab
    # ('text_feat', CountVectorizer(
    #     **countv_para,
    #     preprocessor=get_col('text_feat'))),
    ('name_bi', CountVectorizer(**countv_para,
                                preprocessor=get_col('name_bi'))),
    # ('translation', TfidfVectorizer(
    #     # ngram_range=(1, 2),
    #     max_features=50000,
    #     **tfidf_para,
    #     preprocessor=get_col('translation'))),
])

start_vect = time.time()
vectorizer.fit(df.loc[traindex, :].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
tfvocab[:50]
print('[{}] Vectorisation completed'.format(time.time() - start_time))

# Drop Text Cols
df.drop(textfeats + ['text', 'all_titles', 'translation', 'name_bi'],
        axis=1, inplace=True)
# drop_cols = [c for c in textfeats + ['text', 'all_titles', 'translation'] if c in df.columns]
# df.drop(drop_cols, axis=1, inplace=True)
gc.collect()

print('[{}] Drop all the categorical'.format(time.time() - start_time))
df.drop(categorical, axis=1, inplace=True)

# Training and Validation Set
print("Extracting features from the test dataset using the same vectorizer") t0 = time() X_test = preprocess.transform(data_test.data) duration = time() - t0 print("X_test: n_samples: %d, n_features: %d" % X_test.shape) print() y_train, y_test = data_train.target, data_test.target # mapping from integer feature name to original token string if opts.vectorizer == "hashing": feature_names = None elif opts.vectorizer == "tfidf": feature_names = np.asarray(preprocess.get_feature_names()) assert feature_names.shape[0] == X_train.shape[1] == X_test.shape[1], \ ("feature_names-len: %d, X-train-len:%d, X-test-len: %d" % (feature_names.shape[0], X_train.shape[1], X_test.shape[1])) if opts.select_chi2: print("Extracting %d best features by a chi-squared test" % opts.select_chi2) t0 = time() ch2 = SelectKBest(chi2, k=opts.select_chi2) X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) print("done in %fs" % (time() - t0)) print() if feature_names is not None:
class Orchestrator(object):
    """
    Main singleton, implementing the Model part of Model-View-Presenter.
    """

    def __init__(self, mainPfcsamrApp):
        """
        Constructor.

        :param mainPfcsamrApp: singleton of MainPfcsamr2App
        :return: None
        """
        self.file_path = None
        """ Current file path. :type: str"""
        self.headings = []
        """ List of column headings. :type: list[str]"""
        self.rows = []
        """ Full data table. :type: np.ndarray"""
        self.preprocessed_rows = None
        """ Preprocessed data table. :type: np.ndarray"""
        self.main_pfcsamr_app = mainPfcsamrApp
        """:type: MainPfcsamr2App"""
        self.featured_rows = []
        """ Featured data table. :type: np.ndarray"""
        self.featured_rows_train = []
        """ Featured data table (train subset). :type: np.ndarray"""
        self.featured_rows_test = []
        """ Featured data table (test subset). :type: np.ndarray"""
        self.train_y = []
        """ Vector of classes. :type: np.ndarray"""
        self.train_y_train = []
        """ Vector of classes (train subset). :type: np.ndarray"""
        self.train_y_test = []
        """ Vector of classes (test subset). :type: np.ndarray"""
        self.feature_union = None
        """:type: FeatureUnion"""
        self.featured_headings = []
        """ Heading column names for the features data table. :type: list[str]"""
        self.estimators = {}
        """ Map of trained estimators indexed by name. :type: dict"""
        self.already_splitted = False
        """ Flag indicating whether the train set has been sub-split into train and test. :type: bool"""
        self.featured_support = []
        """ Support vector of selected features. :type: list[bool]"""
        self.featured_selected_headings = []
        """ Selected subset of feature headings. :type: list[str]"""
        self.classify_headings = []
        """ Headings of the classify stage. :type: list[str]"""
        self.classify_rows = []
        """ Data table of input for the classify stage. :type: np.ndarray"""
        self.classify_preprocessed_rows = []
        """ Data table of preprocessed data in the classify stage. :type: np.ndarray"""
        self.classify_featured_rows = []
        """ Data table of data vectorized to features in the classify stage. :type: np.ndarray"""
        self.predictions_headings = []
        """ List of heading names for predictions. :type: list[str]"""
        self.predictions_rows = []
        """ Data table of predictions to be saved and uploaded to Kaggle. :type: np.ndarray"""

    def do_load_train_tsv(self, file_path: str = None, max_rows=None):
        """
        Read and load ``train.tsv``.

        :param file_path: str - path to file
        :param max_rows: int or None - max number of rows to read, or None to read all
        :return: self
        """
        if file_path.startswith("file:///"):
            file_path = file_path[7:]
        self.file_path = file_path
        with open(file_path, "rt") as file:
            rdr = csv.reader(file, dialect="excel-tab")
            self.headings = next(rdr)
            no = 0
            self.main_pfcsamr_app.status_count = no
            for no, row in enumerate(rdr, 1):
                if no % 32 == 0:
                    self.main_pfcsamr_app.status_count = no
                self.rows.append(row)
                if max_rows is not None and no >= max_rows:
                    break
            self.main_pfcsamr_app.status_count = no
        self.main_pfcsamr_app.status_text = "Read {0} train samples".format(len(self.rows))
        self.main_pfcsamr_app.preproc_tab_enabled = True
        self.main_pfcsamr_app.features_tab_enabled = True
        self.main_pfcsamr_app.current_model = MyTableModel(self.headings, self.rows)
        return self

    def do_classify_test_tsv(self, file_path: str = None, max_rows=None):
        """
        Read and load ``test.tsv``.

        :param file_path: str - path to file
        :param max_rows: int or None - max number of rows to read, or None to read all
        :return: self
        """
        if file_path.startswith("file:///"):
            file_path = file_path[7:]
        self.file_path = file_path
        self.classify_rows = []
        with open(file_path, "rt") as file:
            rdr = csv.reader(file, dialect="excel-tab")
            self.classify_headings = next(rdr)
            no = 0
            self.main_pfcsamr_app.status_count = no
            for no, row in enumerate(rdr, 1):
                if no % 32 == 0:
                    self.main_pfcsamr_app.status_count = no
                self.classify_rows.append(row)
                if max_rows is not None and no >= max_rows:
                    break
            self.main_pfcsamr_app.status_count = no
        self.main_pfcsamr_app.status_text = "Read {0} test samples".format(len(self.classify_rows))
        self.main_pfcsamr_app.current_model = MyTableModel(self.classify_headings, self.classify_rows)
        return self

    def do_preprocess(self):
        """
        Do preprocessing (train stage).

        :return: self
        """
        from .replacers import RegexpReplacer as ContractionsExpander
        expander = ContractionsExpander()
        ws_tokenizer = nltk.WhitespaceTokenizer()
        stemmer = nltk.PorterStemmer()
        lemmatizer = nltk.WordNetLemmatizer()
        self.preprocessed_rows = []
        no = 0
        self.main_pfcsamr_app.status_count = no
        for no, row in enumerate(self.rows, 1):
            new_row = []
            for column in row:
                if is_text(column):
                    if self.main_pfcsamr_app.config["preproc_unsplit_contractions"]:
                        column = unsplit_contractions(column)
                    if self.main_pfcsamr_app.config["preproc_expand_contractions"]:
                        column = expander.replace(column)
                    if self.main_pfcsamr_app.config["preproc_remove_stopwords"]:
                        column = remove_stopwords(column)
                    if (self.main_pfcsamr_app.config["preproc_word_replacement"]
                            and self.main_pfcsamr_app.config["preproc_stemmize"]):
                        column = " ".join([stemmer.stem(w) for w in ws_tokenizer.tokenize(column)])
                    if (self.main_pfcsamr_app.config["preproc_word_replacement"]
                            and self.main_pfcsamr_app.config["preproc_lemmatize"]):
                        column = " ".join([lemmatizer.lemmatize(w) for w in ws_tokenizer.tokenize(column)])
                    if self.main_pfcsamr_app.config["preproc_pos_tag_words"]:
                        column = postag(column)
                new_row.append(column)
            self.preprocessed_rows.append(new_row)
            if no % 32 == 0:
                self.main_pfcsamr_app.status_count = no
        self.main_pfcsamr_app.status_count = no
        self.main_pfcsamr_app.status_text = "Preprocessing done"
        self.main_pfcsamr_app.features_tab_enabled = True
        self.main_pfcsamr_app.current_model = MyTableModel(self.headings, self.preprocessed_rows)
        return self

    def do_classify_preprocess(self):
        """
        Do preprocessing (classify stage).

        :return: self
        """
        from .replacers import RegexpReplacer as ContractionsExpander
        expander = ContractionsExpander()
        ws_tokenizer = nltk.WhitespaceTokenizer()
        stemmer = nltk.PorterStemmer()
        lemmatizer = nltk.WordNetLemmatizer()
        self.classify_preprocessed_rows = []
        no = 0
        self.main_pfcsamr_app.status_count = no
        for no, row in enumerate(self.classify_rows, 1):
            new_row = []
            for column in row:
                if is_text(column):
                    if self.main_pfcsamr_app.config["preproc_unsplit_contractions"]:
                        column = unsplit_contractions(column)
                    if self.main_pfcsamr_app.config["preproc_expand_contractions"]:
                        column = expander.replace(column)
                    if self.main_pfcsamr_app.config["preproc_remove_stopwords"]:
                        column = remove_stopwords(column)
                    if (self.main_pfcsamr_app.config["preproc_word_replacement"]
                            and self.main_pfcsamr_app.config["preproc_stemmize"]):
                        column = " ".join([stemmer.stem(w) for w in ws_tokenizer.tokenize(column)])
                    if (self.main_pfcsamr_app.config["preproc_word_replacement"]
                            and self.main_pfcsamr_app.config["preproc_lemmatize"]):
                        column = " ".join([lemmatizer.lemmatize(w) for w in ws_tokenizer.tokenize(column)])
                    if self.main_pfcsamr_app.config["preproc_pos_tag_words"]:
                        column = postag(column)
                new_row.append(column)
            self.classify_preprocessed_rows.append(new_row)
            if no % 32 == 0:
                self.main_pfcsamr_app.status_count = no
        self.main_pfcsamr_app.status_count = no
        self.main_pfcsamr_app.status_text = "Preprocessing done"
        self.main_pfcsamr_app.current_model = MyTableModel(self.classify_headings, self.classify_preprocessed_rows)
        return self

    def do_features_countvectorizer(self, variance_threshold=None, **kwargs):
        """
        Do feature extraction and selection (train stage).

        :param variance_threshold: float or None - threshold for feature selection
        :param kwargs: dict - options passed to CountVectorizer
        :return: self
        """
        self.main_pfcsamr_app.variance_warn_message = ""
        if not self.preprocessed_rows:
            self.preprocessed_rows = copy(self.rows)
        # TODO: do not hardcode the column layout
        columns_names = self.headings
        columns_is_text = [False, False, True, False]
        columns_is_class = [False, False, False, True]
        train_y = []
        steps = []
        # steps.append(('numeric_feats', MyPipeline([
        #     ('selector', SelectNumerics(columns_is_text, columns_names, columns_is_class)),
        #     ('dict', DictVectorizer()),
        # ])))
        for column_i, column_is_text in enumerate(columns_is_text):
            if columns_is_class[column_i]:
                train_y = map(lambda x: float(x[column_i]), self.preprocessed_rows)
                train_y = np.array(list(train_y))
            elif column_is_text:
                steps.append((columns_names[column_i], MyPipeline([
                    ("selector", SelectText(column_i=column_i)),
                    ("count_vector", CountVectorizer(**kwargs)),
                ])))
        self.feature_union = FeatureUnion(steps)
        self.featured_rows = self.feature_union.fit_transform(self.preprocessed_rows, train_y)
        self.featured_headings = deepcopy(self.feature_union.get_feature_names())
        self.train_y = train_y

        variance_too_high = False
        if variance_threshold is not None:
            thresholder = VarianceThreshold(threshold=variance_threshold)
            try:
                self.featured_rows = thresholder.fit_transform(self.featured_rows)
                self.featured_support = thresholder.get_support()
                self.featured_selected_headings = [self.featured_headings[i]
                                                   for i, v in enumerate(self.featured_support) if v]
                self.main_pfcsamr_app.variance_warn_message = ""
            except ValueError:
                traceback.print_exc()
                self.featured_rows = np.empty_like(self.featured_rows)
                self.featured_support = []
                self.featured_selected_headings = []
                self.main_pfcsamr_app.variance_warn_message = "threshold too high!!!"
                variance_too_high = True
        else:
            self.main_pfcsamr_app.variance_warn_message = ""
            self.featured_support = [True] * self.featured_rows.shape[1]
            self.featured_selected_headings = deepcopy(self.featured_headings)
        if not variance_too_high:
            self.main_pfcsamr_app.learn_tab_enabled = True
        self.main_pfcsamr_app.current_model = MyTableModel(self.featured_selected_headings, self.featured_rows)
        self.main_pfcsamr_app.status_text = "Feature extraction done. Shape of useful features: %s. Removed %d." % (
            str(self.featured_rows.shape),
            len(self.featured_headings) - len(self.featured_selected_headings),
        )
        return self

    def do_classify_features_countvectorizer(self):
        """
        Do feature extraction and selection (classify stage).

        :return: self
        """
        if not self.classify_preprocessed_rows:
            self.classify_preprocessed_rows = copy(self.classify_rows)
        self.classify_featured_rows = self.feature_union.transform(self.classify_preprocessed_rows)
        # apply the feature support mask computed during training
        self.classify_featured_rows = self.classify_featured_rows[:, self.featured_support]
        self.main_pfcsamr_app.current_model = MyTableModel(self.featured_selected_headings,
                                                           self.classify_featured_rows)
        self.main_pfcsamr_app.status_text = "Feature extraction done. Shape of useful features: %s." % (
            str(self.classify_featured_rows.shape),
        )
        return self

    def do_learn(self, estimator_klazz: type, train_split: float = 0.75, **estimator_klazz_params):
        """
        Do learning (train stage).

        :param estimator_klazz: class - class object of a sklearn estimator
        :param train_split: float - fraction used for the train/test subset split
        :param estimator_klazz_params: kwargs passed to the estimator_klazz constructor
        :return: None
        """
        if self.main_pfcsamr_app.learn_train_split_resplit and train_split is not None:
            self.already_splitted = False
            logger.debug("Re-Splitting")

            def gui_callback():
                print("callback has been called")
                self.main_pfcsamr_app.learn_train_split_resplit = False

            self.main_pfcsamr_app.queue.put_nowait(gui_callback)
        if not self.already_splitted:
            if train_split is not None:
                (self.featured_rows_train, self.featured_rows_test,
                 self.train_y_train, self.train_y_test) = train_test_split(
                    self.featured_rows, self.train_y, train_size=train_split)
                logger.debug("Split")
            else:
                (self.featured_rows_train, self.featured_rows_test,
                 self.train_y_train, self.train_y_test) = (self.featured_rows, [], self.train_y, [])
                logger.debug("Not split")
            self.already_splitted = True
        if estimator_klazz in [GaussianNB, LDA, QDA]:
            # these estimators require dense arrays
            x_train = self.featured_rows_train.toarray()
            x_test = self.featured_rows_test.toarray()
        else:
            x_train = self.featured_rows_train
            x_test = self.featured_rows_test
        self.estimators[estimator_klazz.__name__] = estimator_klazz(**estimator_klazz_params)
        self.estimators[estimator_klazz.__name__].fit(x_train, self.train_y_train)
        if train_split:
            score_name = "selftest_score_" + estimator_klazz.__name__.lower()

            def gui_callback():
                self.main_pfcsamr_app.config[score_name] = self.estimators[estimator_klazz.__name__].score(
                    x_test, self.train_y_test)
                self.main_pfcsamr_app.config = {score_name: str(self.main_pfcsamr_app.config[score_name])}
                self.main_pfcsamr_app.classify_tab_enabled = True
                self.main_pfcsamr_app.status_text = "Learned using {0}".format(estimator_klazz.__name__)
                print("{0}: {1}".format(score_name, self.main_pfcsamr_app.config[score_name]))

            self.main_pfcsamr_app.queue.put_nowait(gui_callback)

    def do_classify_classify(self, evaluate_using: str):
        """
        Compute predictions (classify stage).

        :param evaluate_using: str - class name of a trained estimator (key in self.estimators)
        :return: self
        """
        my_estimator = self.estimators[evaluate_using]
        """:type: LinearClassifierMixin"""
        predictions = my_estimator.predict(self.classify_featured_rows.toarray())
        # PhraseId, SentenceId, Phrase, *Sentiment*, **Features**
        self.predictions_headings = self.headings + self.featured_headings
        self.predictions_rows = np.c_[np.array(self.classify_rows), predictions.T.astype(int)]
        self.main_pfcsamr_app.current_model = MyTableModel(self.predictions_headings, self.predictions_rows)
        self.main_pfcsamr_app.status_text = "Predictions done using {0}".format(my_estimator.__class__.__name__)
        return self

    def classify_save_csv(self, filename: str):
        """
        Save ``submission.csv`` for uploading to Kaggle.

        :param filename: str - path of the file to save to
        :return: None
        """
        with open(filename, "wt") as file:
            writer = csv.writer(file)
            writer.writerow(["PhraseId", "Sentiment"])
            writer.writerows(self.predictions_rows[:, (0, 3)].astype(int))
        self.main_pfcsamr_app.status_text = "Saved predictions to {0}".format(filename)
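# --- Sketch (hypothetical): the intended call order of the Orchestrator stages,
# driven by a minimal stand-in for the GUI singleton. The attribute names mirror
# those the class assigns above; the file paths, the MultinomialNB choice and
# its import are assumptions, as is running this outside the real GUI.
import queue

from sklearn.naive_bayes import MultinomialNB

class _StubApp(object):
    def __init__(self):
        self.config = {
            "preproc_unsplit_contractions": False,
            "preproc_expand_contractions": False,
            "preproc_remove_stopwords": False,
            "preproc_word_replacement": False,
            "preproc_stemmize": False,
            "preproc_lemmatize": False,
            "preproc_pos_tag_words": False,
        }
        self.queue = queue.Queue()
        self.learn_train_split_resplit = False
        # status_text, status_count, *_tab_enabled, current_model and
        # variance_warn_message are set by plain attribute assignment

app = _StubApp()
orch = Orchestrator(app)
orch.do_load_train_tsv("train.tsv", max_rows=1000).do_preprocess()
orch.do_features_countvectorizer(variance_threshold=None, ngram_range=(1, 1))
orch.do_learn(MultinomialNB, train_split=0.75)
app.queue.get_nowait()()  # the GUI would drain this queue; run the score callback here
orch.do_classify_test_tsv("test.tsv").do_classify_preprocess()
orch.do_classify_features_countvectorizer()
orch.do_classify_classify("MultinomialNB").classify_save_csv("submission.csv")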
def test_same_result_withdictrdd(self):
    X, X_rdd = self.make_text_rdd(2)
    Y_rdd = ArrayRDD(self.sc.parallelize([None] * len(X), 4), bsize=2)
    Z = DictRDD([X_rdd, Y_rdd], columns=("X", "y"), bsize=2)
    loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    loc_word = CountVectorizer(analyzer="word")
    loc_word_2 = CountVectorizer(analyzer="word")
    dist_word = SparkCountVectorizer(analyzer="word")
    dist_word_2 = SparkCountVectorizer(analyzer="word")
    loc_union = FeatureUnion([
        ("chars", loc_char),
        ("words", loc_word),
        ("words2", loc_word_2),
    ])
    dist_union = SparkFeatureUnion([
        ("chars", dist_char),
        ("words", dist_word),
        ("words2", dist_word_2),
    ])

    # test same feature names
    loc_union.fit(X)
    dist_union.fit(Z)
    converted_union = dist_union.to_scikit()
    assert_equal(loc_union.get_feature_names(),
                 dist_union.get_feature_names())
    assert_equal(loc_union.get_feature_names(),
                 converted_union.get_feature_names())

    # test same results
    Z_transformed = sp.vstack(dist_union.transform(Z)[:, 'X'].collect())
    assert_array_equal(loc_union.transform(X).toarray(), Z_transformed.toarray())
    assert_array_equal(loc_union.transform(X).toarray(),
                       converted_union.transform(X).toarray())

    # test same results with fit_transform
    X_transformed = loc_union.fit_transform(X)
    X_converted_transformed = converted_union.fit_transform(X)
    Z_transformed = sp.vstack(dist_union.fit_transform(Z)[:, 'X'].collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
    assert_array_equal(X_transformed.toarray(), X_converted_transformed.toarray())

    # test same results in parallel
    loc_union_par = FeatureUnion([
        ("chars", loc_char),
        ("words", loc_word),
    ], n_jobs=2)
    dist_union_par = SparkFeatureUnion([
        ("chars", dist_char),
        ("words", dist_word),
    ], n_jobs=2)
    loc_union_par.fit(X)
    dist_union_par.fit(Z)
    converted_union = dist_union_par.to_scikit()
    X_transformed = loc_union_par.transform(X)
    Z_transformed = sp.vstack(dist_union_par.transform(Z)[:, 'X'].collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
    assert_array_equal(X_transformed.toarray(), converted_union.transform(X).toarray())
        preprocessor=get_col('description'))),
    ('title', CountVectorizer(
        ngram_range=(1, 2),
        stop_words=russian_stop,
        # max_features=7000,
        preprocessor=get_col('title')))
])

start_vect = time.time()
# Fit the vectorizer on the entire dataset instead of only the training rows;
# this improved the score by .0001.
vectorizer.fit(df.to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes" % ((time.time() - start_vect) / 60))

# Drop text columns
textfeats = ["description", "title"]
df.drop(textfeats, axis=1, inplace=True)

from sklearn.metrics import mean_squared_error
from math import sqrt

ridge_params = {'alpha': 30.0, 'fit_intercept': True, 'normalize': False,
                'copy_X': True, 'max_iter': None, 'tol': 0.001,
                'solver': 'auto', 'random_state': SEED}

# Ridge out-of-fold (OOF) method from Faron's kernel. Originally used to
# analyze the vectorization; the OOF predictions are also added back into the
# dataset as a feature. It barely moves the score, but it helps lightgbm
# converge faster.
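# --- Sketch (hypothetical): one way the ens_ridge helper called below could
# compute out-of-fold ridge predictions in the spirit of Faron's OOF kernel.
# The fold count and the exact body are assumptions; only the argument order
# (matching the call site), ridge_params and SEED come from this script.
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

def ens_ridge(X_train, y_train, train_len, X_test, test_len, n_folds=5):
    oof_train = np.zeros((train_len,))
    oof_test_folds = np.zeros((n_folds, test_len))
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=SEED)
    for i, (trn_idx, val_idx) in enumerate(kf.split(np.arange(train_len))):
        model = Ridge(**ridge_params)
        model.fit(X_train[trn_idx], y_train[trn_idx])
        oof_train[val_idx] = model.predict(X_train[val_idx])  # OOF preds for held-out train rows
        oof_test_folds[i, :] = model.predict(X_test)          # per-fold preds for test rows
    return oof_train, oof_test_folds.mean(axis=0)             # average the test folds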
# Ridge Feature Processing
ridge_train, ridge_test = ens_ridge(ready_df[:train_row], train_y, train_row,
                                    ready_df[train_row:], test_row)
ridge_preds = np.concatenate([ridge_train, ridge_test])
df['ridge_preds'] = ridge_preds

# NN Feature Processing
# nn_train, nn_test = ens_nn(ready_df[:train_row],
#                            train_y, train_row,
#                            ready_df[train_row:], test_row)
# nn_preds = np.concatenate([nn_train, nn_test])
# df['nn_preds'] = nn_preds

# Feature Stack
tfvocab = vectorizer.get_feature_names()
tfvocab = df.columns.tolist() + tfvocab
logger.info('Feature Names Length: {}'.format(len(tfvocab)))
csr_train_X = csr_matrix(df.loc[train_index, :].values)
csr_test_X = csr_matrix(df.loc[test_index, :].values)
train_X = hstack([csr_train_X, ready_df[:train_row]])
test_X = hstack([csr_test_X, ready_df[train_row:]])
# train_X = hstack([csr_matrix(df.loc[train_index, :].values), ready_df[:train_row]])
# test_X = hstack([csr_matrix(df.loc[test_index, :].values), ready_df[train_row:]])
del df
gc.collect()

# Train Data Split
# NOTE: the remaining arguments of this call are truncated in the source;
# train_y and the split parameters below are assumed for completeness.
train_X, valid_X, train_y, valid_y = train_test_split(
    train_X, train_y, test_size=0.10, random_state=SEED)
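# --- Sketch (standalone): the feature-stack step above relies on
# scipy.sparse.hstack aligning row counts across the dense-derived and text
# matrices; a minimal check of that invariant with made-up shapes.
import numpy as np
from scipy.sparse import csr_matrix, hstack

dense_part = csr_matrix(np.random.rand(4, 3))  # stands in for the df numeric columns
text_part = csr_matrix(np.random.rand(4, 7))   # stands in for the ready_df rows
stacked = hstack([dense_part, text_part]).tocsr()
assert stacked.shape == (4, 10)                # columns add up, row counts must match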
def data_vectorize(df):
    russian_stop = set(stopwords.words("russian"))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": "word",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df": 5,
        # "max_df": .9,
        "smooth_idf": False,
    }
    tfidf_para2 = {
        "stop_words": russian_stop,
        "analyzer": "char",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df": 5,
        # "max_df": .9,
        "smooth_idf": False,
    }
    # mean rmse is: 0.23865288181138436

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ("description", TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=40000,  # tried 40000 and 18000
            **tfidf_para,
            preprocessor=get_col("description"))),
        # ("title_description", TfidfVectorizer(
        #     ngram_range=(1, 2),
        #     max_features=1800,
        #     **tfidf_para,
        #     preprocessor=get_col("title_description"))),
        ("text_feature", CountVectorizer(
            ngram_range=(1, 2),
            preprocessor=get_col("text_feature"))),
        ("title", TfidfVectorizer(
            ngram_range=(1, 2),
            **tfidf_para,
            preprocessor=get_col("title"))),
        # two newly added text fields: title2 (word unigrams) and title_char (char n-grams)
        ("title2", TfidfVectorizer(
            ngram_range=(1, 1),
            **tfidf_para,
            preprocessor=get_col("title"))),
        ("title_char", TfidfVectorizer(
            ngram_range=(1, 4),  # tried (1, 4) and (1, 6)
            max_features=16000,
            **tfidf_para2,
            preprocessor=get_col("title"))),
    ])
    vectorizer.fit(df.to_dict("records"))
    ready_full_df = vectorizer.transform(df.to_dict("records"))
    tfvocab = vectorizer.get_feature_names()
    df.drop(["text_feature", "text_feature_2", "description", "title", "title_description"],
            axis=1, inplace=True)
    df.fillna(-1, inplace=True)
    return df, ready_full_df, tfvocab
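# --- Sketch (hypothetical): exercising data_vectorize on a tiny frame. The
# column names match those the function expects; the rows themselves are made
# up, and nltk's Russian stopword list must already be downloaded.
import pandas as pd

toy = pd.DataFrame({
    "description": ["первый хороший товар", "второй отличный товар"],
    "title": ["название первое", "название второе"],
    "text_feature": ["park metro", "metro center"],
    "text_feature_2": ["left", "right"],
    "title_description": ["combined text", "combined text"],
    "price": [10.0, None],
})
toy, toy_sparse, toy_vocab = data_vectorize(toy)
# text columns become one sparse block; the dense columns (price) stay in toy
print(toy_sparse.shape, len(toy_vocab), toy.columns.tolist())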