def main(full=False):
    vectorizer = make_union(
        # on_field('user', Tfidf(max_features=100000, token_pattern=r'\w+')),
        on_field('name', Tfidf(max_features=15000, token_pattern=r'\w+')),  # 100000
        on_field('text', Tfidf(max_features=60000, token_pattern=r'\w+', ngram_range=(1, 2))),  # 100000
        on_field(['region', 'city', 'price_cut', 'item_seq_number_cut', 'image_top_1',
                  'param_1', 'param_2', 'param_3', 'user_type', 'user'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    with timer('process train'):
        train, valid, test, y_train, y_valid, trndex, tstdex = load_data(full)
        y_train = y_train.values
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
        if full:
            X_test = vectorizer.transform(preprocess(test)).astype(np.float32)
    with ThreadPool(processes=4) as pool:
        if full:
            Xb_train, Xb_test = [x.astype(bool).astype(np.float32) for x in [X_train, X_test]]
            xs = [[Xb_train, Xb_test], [X_train, X_test]] * 2
        else:
            Xb_train, Xb_valid = [x.astype(bool).astype(np.float32) for x in [X_train, X_valid]]
            xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
    if not full:
        print('Valid RMSE: {:.4f}'.format(np.sqrt(metrics.mean_squared_error(y_valid.values, y_pred))))
        return y_pred, trndex
    return y_pred, tstdex
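# Note: this snippet and most of those below rely on shared helpers (on_field, to_records,
# timer) that are not defined in this excerpt. A minimal sketch of what they typically look
# like, assuming the usual itemgetter/to_dict pattern (compare the Vectorizer class further
# down); the exact signatures in the original modules may differ.
import time
from contextlib import contextmanager
from operator import itemgetter

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer


def on_field(field, *vec):
    # select one column (or list of columns) from a DataFrame, then apply the given transformers
    return make_pipeline(FunctionTransformer(itemgetter(field), validate=False), *vec)


def to_records(df: pd.DataFrame):
    # DictVectorizer expects a list of dicts, one per row
    return df.to_dict(orient='records')


@contextmanager
def timer(name):
    # wall-clock timer used to wrap each processing step
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')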
def main():
    vectorizer = make_union(
        on_field('title', Tfidf(max_features=100000, token_pattern=r'\w+', ngram_range=(1, 2))),
        on_field('text', Tfidf(max_features=100000, token_pattern=r'\w+', ngram_range=(1, 2))),
        on_field(['price', 'user_type', 'image_top_1', 'city', 'region'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    with timer('process train'):
        train = pd.read_csv('../input/train.csv', parse_dates=["activation_date"])
        train = train[train['deal_probability'] > 0.5].reset_index(drop=True)
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        y_train = y_scaler.fit_transform(np.log1p(train['deal_probability'].values.reshape(-1, 1)))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=4) as pool:
        Xb_train, Xb_valid = [x.astype(bool).astype(np.float32) for x in [X_train, X_valid]]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
    # undo the scaling and the log1p applied to the target before scoring
    y_pred = np.expm1(y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0])
    print('Valid RMSLE: {:.4f}'.format(np.sqrt(mean_squared_log_error(valid['deal_probability'], y_pred))))
def init_preprocesses():
    vectorizer = make_union(
        on_field('name', Tfidf(max_features=15000, token_pattern=r'\w+')),  # 100000
        on_field('all_titles', Tfidf(max_features=80000, token_pattern=r'\w+')),  # 100000
        # on_field('user_categories', Tfidf(max_features=10000, token_pattern=r'\w+')),  # 100000
        on_field('text', Tfidf(max_features=60000, token_pattern=r'\w+', ngram_range=(1, 2))),  # 100000
        on_field(['region', 'city', 'price_cut', 'item_seq_number_cut', 'image_top_1', 'user_avg_price_cut',
                  'param_1', 'param_2', 'param_3', 'user_type', 'user', 'user_ad_ct',
                  'usercat_avg_price_cut', 'usercat_ad_ct'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    return vectorizer, y_scaler
def evaluate(train):
    vectorizer = make_union(
        make_pipeline(PandasSelector("name"),
                      Tfidf(max_features=100000, token_pattern=r"\w+")),
        make_pipeline(PandasSelector("text"),
                      Tfidf(max_features=100000, token_pattern=r"\w+", ngram_range=(1, 2))),
        make_pipeline(PandasSelector(["shipping", "item_condition_id"], records=True),
                      DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    with timer("process train"):
        train = train[train["price"] > 0].reset_index(drop=True)
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        y_train = y_scaler.fit_transform(np.log1p(train["price"].values.reshape(-1, 1)))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        print(f"X_train: {X_train.shape} of {X_train.dtype}")
        del train
    with timer("process valid"):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=4) as pool:
        Xb_train, Xb_valid = [x.astype(bool).astype(np.float32) for x in [X_train, X_valid]]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
    y_pred = np.expm1(y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0])
    print("Valid RMSLE: {:.4f}".format(np.sqrt(mean_squared_log_error(valid["price"], y_pred))))
def train():
    vectorizer = make_union(
        on_field('title', Tfidf(max_features=1000000, min_df=5,
                                # token_pattern='[\w\?,\.;:\(\)\[\]]+',
                                token_pattern=r'\w+',
                                stop_words=stops,
                                ngram_range=(1, 2),
                                # lowercase=True,
                                # smooth_idf=False
                                )),
        on_field('description', Tfidf(max_features=1000000, min_df=5, lowercase=True,
                                      # token_pattern='[\w\?,\.;:\(\)\[\]]+',
                                      token_pattern=r'\w+',
                                      ngram_range=(1, 2),
                                      # stop_words=stops,
                                      # smooth_idf=False
                                      )),
        n_jobs=1)
    df = pd.DataFrame()
    list_size = []
    for path in ['../input/train.csv', '../input/test.csv',
                 '../input/train_active.csv', '../input/test_active.csv']:
        _df = pd.read_csv(path, usecols=['title', 'description', 'price', 'item_seq_number',
                                         'parent_category_name', 'region', 'city', 'category_name',
                                         'param_1', 'param_2', 'param_3', 'user_type'])
        list_size.append(_df.shape[0])
        df = pd.concat([df, _df], axis=0, ignore_index=True)
    data = vectorizer.fit_transform(preprocess(df)).astype(np.float32)
    X_train = data[:list_size[0], :]
    X_test = data[list_size[0]:list_size[0] + list_size[1], :]
    with open('train_tfidf_all.pkl', 'wb') as f:
        pickle.dump(X_train, f, -1)
    with open('test_tfidf_all.pkl', 'wb') as f:
        pickle.dump(X_test, f, -1)
    with open('vectorizer_tfidf_all.pkl', 'wb') as f:
        pickle.dump(vectorizer, f, -1)
def main():
    vectorizer = make_union(
        on_field('name', Tfidf(max_features=100000, token_pattern=r'\w+')),
        on_field('text', Tfidf(max_features=100000, token_pattern=r'\w+', ngram_range=(1, 2))),
        on_field(['shipping', 'item_condition_id'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()))
    y_scaler = StandardScaler()
    train = pd.read_table(TRAIN_FILE_MERCARI)
    train = train[train['price'] > 0].reset_index(drop=True)
    cv = KFold(n_splits=20, shuffle=True, random_state=42)
    train_ids, valid_ids = next(cv.split(train))
    train, valid = train.iloc[train_ids], train.iloc[valid_ids]
    y_train = y_scaler.fit_transform(np.log1p(train['price'].values.reshape(-1, 1)))
    X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
    print(f'X_train: {X_train.shape} of {X_train.dtype}')
    del train
    X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=8) as pool:
        Xb_train, Xb_valid = [x.astype(bool).astype(np.float32) for x in [X_train, X_valid]]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
    y_pred = np.expm1(y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0])
    print('Valid RMSLE: {:.4f}'.format(np.sqrt(mean_squared_log_error(valid['price'], y_pred))))
def create_vectorizers(train):
    train = preprocess(train)
    name_vectorizer = Tfidf(max_features=100000, lowercase=False, encoding="ascii",
                            decode_error="strict", analyzer="word")
    name_vectorizer.fit(train["name"])
    text_vectorizer = Tfidf(max_features=100000, ngram_range=(1, 1), lowercase=False,
                            encoding="ascii", decode_error="strict", analyzer="word")
    text_vectorizer.fit(train["text"])
    valid_records = to_records(train[["shipping", "item_condition_id"]])
    dict_vectorizer = DictVectorizer()
    dict_vectorizer.fit(valid_records)
    return name_vectorizer, text_vectorizer, dict_vectorizer
def _make_tfidf_NB_clf(self, **cfg):
    max_f = cfg.get('max_features', 1200)
    max_df = cfg.get('max_df', 0.7)
    sublin = cfg.get('sublin', True)
    vectorizer = Tfidf(stop_words='english', norm='l2', max_df=max_df,
                       max_features=max_f, sublinear_tf=sublin)
    model = MNB()
    clf = Pipeline(steps=[('v', vectorizer), ('nb', model)])
    return clf
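# A hedged usage sketch for the pipeline built above: the toy texts and labels are invented
# for illustration, and the pipeline is rebuilt standalone with the same defaults
# (max_features=1200, max_df=0.7, sublinear_tf=True) rather than calling the method on its class.
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.pipeline import Pipeline

clf = Pipeline(steps=[('v', Tfidf(stop_words='english', norm='l2', max_df=0.7,
                                  max_features=1200, sublinear_tf=True)),
                      ('nb', MNB())])
clf.fit(["great product, fast shipping", "terrible quality, do not buy"], [1, 0])
print(clf.predict(["fast shipping and great quality"]))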
def main():
    stopWords = stopwords.words('russian')
    vectorizer = make_union(
        on_field('title', Tfidf(max_features=100000, stop_words=stopWords)),  # token_pattern='\w+',
        on_field('text', Tfidf(max_features=100000, ngram_range=(1, 2), stop_words=stopWords)),
        on_field('param', Tfidf(max_features=1000, stop_words=stopWords)),
        # on_field(['shipping', 'item_condition_id'],
        #          FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    with timer('process train'):
        train = pd.read_csv('../data/train.csv')
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        # y_train = y_scaler.fit_transform(np.log1p(train['deal_probability'].values.reshape(-1, 1)))
        y_train = y_scaler.fit_transform(train['deal_probability'].values.reshape(-1, 1))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=4) as pool:
        Xb_train, Xb_valid = [x.astype(bool).astype(np.float32) for x in [X_train, X_valid]]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
    # the target was scaled without log1p (see above), so only the scaler is inverted here
    y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0]
    print('Valid RMSLE: {:.4f}'.format(np.sqrt(mean_squared_log_error(valid['deal_probability'], y_pred))))
def main():
    vectorizer = make_union(
        on_field('name', Tfidf(max_features=1000)),
        on_field('text', Tfidf(max_features=1000)),
        on_field(['#friends', '#followers', '#favorites'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    with timer('process train'):
        train, label, __, __ = load_data()
        train['label'] = label
        # train = train.sample(10000)
        train, valid = train_test_split(train, test_size=0.2, random_state=123)
        y_train = y_scaler.fit_transform(np.log1p(train['label'].values.reshape(-1, 1)))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with timer('fit predict'):
        Xb_train, Xb_valid = [x.astype(bool).astype(np.float32) for x in [X_train, X_valid]]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]]
        # y_pred0 = fit_predict(xs[0], y_train=y_train)
        y_pred1 = fit_predict(xs[1], y_train=y_train)
        y_pred = y_pred1
        y_pred = np.expm1(y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0])
        print('Valid MSLE: {:.4f}'.format(
            mean_squared_log_error(valid['label'], np.where(y_pred < 0, 0, y_pred))))
def lr_model():
    steps = [("vec", Count(max_features=20000, ngram_range=(1, 3))),
             ("tfidf", Tfidf()),
             ("clf", LR(penalty="l2", solver="saga", multi_class="auto", random_state=42,
                        max_iter=10000, n_jobs=-1, C=9.930234540337622, tol=0.0007738614855921237))]
    return Pipeline(steps=steps)
def main1():
    vectorizer = make_union(
        on_field('text', Tfidf(max_features=300000, token_pattern=r'\w+', ngram_range=(1, 2))),
        on_field(['len_text'], FunctionTransformer(to_records, validate=False), DictVectorizer()))
    with timer('process train'):
        if os.path.exists(resource_path + 'dataset.pik'):
            with open(resource_path + 'dataset.pik', 'rb') as f:
                trainX, testX, trainY = pickle.load(f)
        else:
            train = pd.read_csv(data_path + 'train.csv')
            train['Discuss'] = train['Discuss'].apply(lambda x: ' '.join(jieba.cut(x)))
            test = pd.read_csv(data_path + 'test.csv')
            test['Discuss'] = test['Discuss'].apply(lambda x: ' '.join(jieba.cut(x)))
            train = train[train['Score'] > 0].reset_index(drop=True)
            trainY = train['Score'].values
            trainX = vectorizer.fit_transform(get_dataset_x(train)).astype(np.float32)
            # transform (not fit_transform) so that test shares the training vocabulary
            testX = vectorizer.transform(get_dataset_x(test)).astype(np.float32)
            sk = SelectKBest(chi2, k=100000)
            trainX = sk.fit_transform(trainX, trainY)
            testX = sk.transform(testX)
            with open(resource_path + 'dataset.pik', 'wb') as f:
                pickle.dump((trainX, testX, trainY), f)
    print(f'trainX: {trainX.shape} of {trainX.dtype} with {type(trainX)}')
    print(f'testX: {testX.shape} of {testX.dtype} with {type(testX)}')
    # pred = model_lgb(trainX, testX, trainY)
    pred = model_svm(trainX, testX, trainY)
    store_result(pred)
    return [doc_names[docs_indices[0, x]] for x in range(num_items)]


def find_images(docs):
    images = []
    for d in docs:
        filename = d.split("/")[-1].split(".")[0]
        img_name = "/".join(filename.split("_")) + ".jpg"
        images.append(img_name)
    return images


if __name__ == "__main__":
    docs = os.listdir(docs_folder)
    doc_names = [os.path.join(docs_folder, x) for x in docs]
    words = {str(x): x for x in range(sum([codebook_size, codebook_size, codebook_size]))}
    print("Training model")
    model = Tfidf(input='filename', ngram_range=(1, 2), vocabulary=words)
    matrix = model.fit_transform(doc_names)
    pickle.dump((doc_names, matrix), open(tfidf_model, "wb"))
    test_words = [12, 23, 1323, 234, 214, 1224, 1532]
    docs_match = find_matching_docs(matrix, test_words, doc_names)
    imgs = find_images(docs_match)
    pdb.set_trace()
def train():
    russian_stop = set(stopwords.words('russian'))
    vectorizer = make_union(
        on_field('title', Tfidf(max_features=1000000, min_df=5, token_pattern=r'\w+',
                                stop_words=russian_stop, ngram_range=(1, 2))),
        on_field('description', Tfidf(max_features=1000000, min_df=5, token_pattern=r'\w+',
                                      ngram_range=(1, 2), stop_words=russian_stop)),
        on_field(['item_seq_number'], FunctionTransformer(to_records, validate=False), DictVectorizer()),
        FunctionTransformer(itemgetter(['price']), validate=False),
        n_jobs=4)
    df = pd.read_csv('../input/train.csv',
                     usecols=['title', 'description', 'price', 'item_seq_number',
                              'parent_category_name', 'region', 'city', 'category_name',
                              'param_1', 'param_2', 'param_3', 'user_type',
                              'deal_probability', 'activation_date'])
    print('load end')
    target = df['deal_probability']
    X_train = vectorizer.fit_transform(preprocess(df)).astype(np.float32)
    with open('train_nn.pkl', 'wb') as f:
        pickle.dump(X_train, f, -1)
    """
    with open('train_nn.pkl', 'rb') as f:
        X_train = pickle.load(f)
    """
    with open('vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorizer, f, -1)
    y_train = target.values
    metric = 'val_loss'
    mode = 'min'
    callbacks = [
        EarlyStopping(monitor=metric, patience=10, verbose=1, min_delta=1e-6, mode=mode),
        ReduceLROnPlateau(monitor=metric, factor=0.1, patience=2, verbose=1, epsilon=1e-4, mode=mode),
        ModelCheckpoint(monitor=metric, filepath='weights/best_weights.hdf5',
                        save_best_only=True, save_weights_only=True, mode=mode),
        TensorBoard(log_dir='logs'),
        LoggingCallback(),
    ]
    model = get_model(X_train.shape[1])
    train = df['activation_date'] < '2017-03-25'
    test = df['activation_date'] >= '2017-03-25'
    trn_x = X_train[[i for i in range(X_train.shape[0]) if train[i]]]
    val_x = X_train[[i for i in range(X_train.shape[0]) if test[i]]]
    trn_y = y_train[train]
    val_y = y_train[test]
    model.fit(x=trn_x, y=trn_y, validation_data=(val_x, val_y),
              batch_size=2**11, epochs=1000, callbacks=callbacks)
cv = KFold(n_splits=20, shuffle=True, random_state=42)
train_ids, valid_ids = next(cv.split(data))
train = data.iloc[train_ids]
valid = data.iloc[valid_ids]
del data
y_train = y_scaler.fit_transform(np.log1p(train['price'].values.reshape(-1, 1)))
train = preprocess(train)
name_tfidf = Tfidf(max_features=100000, token_pattern=r'\w+')
name_p: Pipeline = make_pipeline(FunctionTransformer(itemgetter('name'), validate=False), name_tfidf)
text_tfidf = Tfidf(max_features=100000, token_pattern=r'\w+', ngram_range=(1, 2))
text_p = make_pipeline(FunctionTransformer(itemgetter('text'), validate=False), text_tfidf)
shipping_p = make_pipeline(FunctionTransformer(itemgetter('shipping'), validate=False),
                           FunctionTransformer(to_records, validate=False), DictVectorizer())
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.feature_extraction import DictVectorizer
from operator import itemgetter
import pandas as pd


class Vectorizer():

    def __init__(self):
        self.vectorizer = None

    def on_field(self, f: str, *vec):
        return make_pipeline(FunctionTransformer(itemgetter(f), validate=False), *vec)

    def to_records(self, df: pd.DataFrame):
        return df.to_dict(orient='records')

    def tfidf_vectorizer(self, title_feat=100000, description_feat=500000):
        self.vectorizer = make_union(
            self.on_field("title", Tfidf(max_features=title_feat, token_pattern=r"\w+")),
            self.on_field("description", Tfidf(max_features=description_feat,
                                               token_pattern=r"\w+", ngram_range=(1, 2))),
            self.on_field(['shipping', 'status'],
                          FunctionTransformer(self.to_records, validate=False), DictVectorizer()))
        return self.vectorizer
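# A hedged usage sketch for the Vectorizer class above; the toy DataFrame, its column values,
# and the reduced feature counts are invented here for illustration only.
df = pd.DataFrame({
    'title': ['iphone 7 32gb', 'winter jacket'],
    'description': ['lightly used, works fine', 'warm, size M, worn once'],
    'shipping': [1, 0],
    'status': ['active', 'sold'],
})
vec = Vectorizer().tfidf_vectorizer(title_feat=1000, description_feat=5000)
X = vec.fit_transform(df)
print(X.shape)  # (2, n_title_terms + n_description_terms + n_dict_features)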
def main(evaluation=True):
    # make_union combines the individual feature pipelines
    vectorizer = make_union(
        # take the 'name' column and apply Tfidf, keeping the most frequent terms;
        # \w+ matches runs of letters, digits and underscores
        # on_field('name', Tfidf(max_features=1000, token_pattern=r'\w+')),
        # take the 'text' column, also Tfidf, but with n-grams
        on_field('text', Tfidf(max_features=300000, token_pattern=r'\w+', ngram_range=(1, 2))),
        on_field(['len_discuss'], FunctionTransformer(to_records, validate=False), DictVectorizer()),
        # on_field(['shipping', 'item_condition_id'],
        #          FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=1)
    y_scaler = StandardScaler()
    with timer('process train'):
        train = pd.read_csv(data_path + 'train_split.csv')
        train['len_discuss'] = train['Discuss'].apply(lambda x: len(x))
        train['Discuss'] = train['Discuss'].apply(lambda x: ' '.join(jieba.cut(x)))
        test = pd.read_csv(data_path + "dev_split.csv")
        test['len_discuss'] = test['Discuss'].apply(lambda x: len(x))
        test['Discuss'] = test['Discuss'].apply(lambda x: ' '.join(jieba.cut(x)))
        y_true = None
        if evaluation:
            y_true = test['Score'].values

        ##################### noun features (disabled)
        # print('load noun set...')
        # if os.path.exists(resource_path + 'noun_set.pik'):
        #     with open(resource_path + 'noun_set.pik', 'rb') as f:
        #         noun_set = pickle.load(f)
        #     noun_set = filter_noun(noun_set)
        # else:
        #     noun_set = get_nouns(train['Discuss'].values)
        #     with open(resource_path + 'noun_set.pik', 'wb') as f:
        #         pickle.dump(noun_set, f)
        #     noun_set = filter_noun(noun_set)
        # print(f'noun size:{len(noun_set)}')
        #######################

        ###################### keyword features
        print('load keyword set...')
        if os.path.exists(resource_path + 'keyword_set.pik'):
            with open(resource_path + 'keyword_set.pik', 'rb') as f:
                keyword_set = pickle.load(f)
        else:
            keyword_set = get_keywords(train['Discuss'].values)
            with open(resource_path + 'keyword_set.pik', 'wb') as f:
                pickle.dump(keyword_set, f)
        print(f'keyword size:{len(keyword_set)}')
        ######################

        train = train[train['Score'] > 0].reset_index(drop=True)  # keep only rows with a positive score
        # cv = KFold(n_splits=10, shuffle=True, random_state=42)  # k-fold split (disabled)
        # train_ids, valid_ids = next(cv.split(train))
        # valid = train.iloc[valid_ids]
        # train = train.iloc[train_ids]
        y_train_start = train['Score'].values
        y_train = y_scaler.fit_transform(train['Score'].values.reshape(-1, 1))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        X_test = vectorizer.transform(preprocess(test)).astype(np.float32)
        # y_test = valid['Score']
        sk = SelectKBest(chi2, k=100000)
        X_train = sk.fit_transform(X_train, y_train_start)
        X_test = sk.transform(X_test)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        print(f'X_test: {X_test.shape} of {X_test.dtype}')
    # del train
    # with timer('process valid'):
    #     X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=6) as pool:
        Xb_train, Xb_valid = [x.astype(bool).astype(np.float32) for x in [X_train, X_test]]

        ############################### noun features (disabled)
        # vec = CountVectorizer(binary=True, tokenizer=seg_sentence)
        # vec.fit(noun_set)
        # Xn_train, Xn_valid = [vec.transform(x) for x in [train['Discuss'].values, test['Discuss'].values]]
        ##################################

        ############################# keyword features
        if os.path.exists(resource_path + 'resource_train.pik'):
            with open(resource_path + 'resource_train.pik', 'rb') as f:
                Xk_train, Xk_valid = pickle.load(f)
        else:
            vec = CountVectorizer(binary=True, tokenizer=seg_sentence)
            vec.fit(keyword_set)
            Xk_train, Xk_valid = [vec.transform(x) for x in [train['Discuss'].values, test['Discuss'].values]]
            with open(resource_path + 'resource_train.pik', 'wb') as f:
                pickle.dump([Xk_train, Xk_valid], f)
        #############################

        ############################# concatenation variants (the Xn_train / Xn_valid below
        ############################# belong to the disabled noun features)
        # Xb_a_train = np.concatenate([Xb_train, Xk_train], axis=1)
        # Xb_a_valid = np.concatenate([Xb_valid, Xk_valid], axis=1)
        # X_a_train = np.concatenate([X_train, Xk_train], axis=1)
        # X_a_test = np.concatenate([X_test, Xk_valid], axis=1)

        # duplicate the list once; Xb holds binary word presence while X holds tf-idf weights
        xs = [[Xb_train, Xb_valid], [X_train, X_test], [Xk_train, Xk_valid]] * 2
        print(len(xs), len(xs[0]))
        # print(len(xs[1]))
        # predictions from the individual runs are averaged below
        xx = pool.map(partial(fit_predict, y_train=y_train), xs)
        print(len(xx))
        y_pred = np.mean(xx, axis=0)
        y_pred = y_scaler.inverse_transform(y_pred)
        # print(y_pred)
        pre = []
        for i in y_pred:
            if i > 4.7:
                pre.append(5)
            else:
                pre.append(i)
        if evaluation and y_true is not None:
            print('the score is :', evaluate(y_true, pre))
        result = pd.DataFrame({'ID': test.Id, 'Discuss': test.Discuss, 'Score': pre})
        result.to_csv('MLP_simple_jieba_stopword_chibest.csv', header=None, index=None)
def create_text_pipeline():
    steps = []
    steps.append(('td_text', Tfidf(max_features=200000, token_pattern=r'\w+', ngram_range=(1, 3))))
    return Pipeline(steps)
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xdict = X.to_dict('records')
        return Xdict


vec_name = FeatureUnion([
    ('continuous', Pipeline([
        ('extract', ColumnExtractor('name')),
        ('tfidf', Tfidf(max_features=100000, token_pattern=r'\w+')),
    ])),
])

vec_text = Pipeline([
    ('extract', ColumnExtractor('text')),
    ('tfidf', Tfidf(max_features=100000, token_pattern=r'\w+', ngram_range=(1, 2))),
])

vec_num = Pipeline([
    ('extract', ColumnExtractor(['shipping', 'item_condition_id'])),
    ('ToDict', ToDict(['shipping', 'item_condition_id'])),
    ('DictVectorizer', DictVectorizer()),
])

y_scaler = StandardScaler()


def load_data():
def main():
    vectorizer = make_union(
        on_field('title', Tfidf(max_features=100000, token_pattern=r'\w+')),
        on_field('text', Tfidf(max_features=100000, token_pattern=r'\w+', ngram_range=(1, 2))),
        on_field(['price', 'user_type', 'image_top_1'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    with timer('process train'):
        train = pd.read_csv('../input/train.csv', parse_dates=["activation_date"])
        # train = train[train['deal_probability'] > 0].reset_index(drop=True)
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        y_train = y_scaler.fit_transform(np.log1p(train['deal_probability'].values.reshape(-1, 1)))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        print('X_train: {} of {}'.format(X_train.shape, X_train.dtype))
        del train
    print('read test data ...')
    test = pd.read_csv('../input/test.csv', parse_dates=["activation_date"])
    del test['image']
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
        # TODO
        X_test = vectorizer.transform(preprocess(test))
    with ThreadPool(processes=4) as pool:
        # Xb_train, Xb_valid = [x.astype(np.bool).astype(np.float32) for x in [X_train, X_valid]]
        Xb_train, Xb_valid, Xb_test = [x.astype(bool) for x in [X_train, X_valid, X_test]]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        # TODO
        xs_test = [[Xb_train, Xb_test], [X_train, X_test]]
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
        # TODO
        y_pred_test = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs_test), axis=0)
        print("Start to join...")
        pool.close()  # close must precede join
        pool.join()
        y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0]
        print('Valid RMSLE: {:.4f}'.format(
            np.sqrt(mean_squared_log_error(valid['deal_probability'], y_pred))))
        # TODO
        y_pred_test = y_scaler.inverse_transform(y_pred_test.reshape(-1, 1))[:, 0]
    sub = pd.read_csv('../input/sample_submission.csv')
    sub['deal_probability'] = y_pred_test
    sub.to_csv('sub3.csv', index=False)
    print('all done!')
########### add target encoding (mean)
for cf in categorical_features_tobe[1:]:
    new_f = "{}_dl".format(cf)
    temp = train[[cf, "deal_probability"]].groupby(cf).mean().reset_index() \
        .rename(columns={"deal_probability": new_f})
    df = df.merge(temp, how="left", on=cf)
    df[new_f] = np.log1p(df[new_f])
    df[new_f] = df[new_f].fillna(df[new_f].mean())
    features.append(new_f)
    del temp
    gc.collect()

###### text features
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from scipy.sparse import hstack, csr_matrix

tfidf = Tfidf(ngram_range=(1, 2), max_features=20000, sublinear_tf=True)
textfeats = ["description", "title"]
for cols in textfeats:
    df[cols] = df[cols].astype(str)
    df[cols] = df[cols].astype(str).fillna('missing')  # fill NA
    df[cols] = df[cols].str.lower()  # lowercase all text so capitalized words don't get treated differently
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split()))  # number of words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))  # number of unique words
    df[cols + '_words_vs_unique'] = df[cols + '_num_unique_words'] / df[cols + '_num_words'] * 100
    features.extend([cols + '_num_words', cols + '_num_unique_words', cols + '_words_vs_unique'])

df["text"] = df["title"].astype(str) + " " + df["description"].astype(str)
x = tfidf.fit_transform(df["text"])
features.extend(categorical_features)

######### add aggregated features
def train():
    vectorizer = make_union(
        on_field('title', Tfidf(ngram_range=(3, 3), analyzer='char', max_features=1000000, min_df=5)),
        on_field('description', Tfidf(ngram_range=(3, 3), analyzer='char', max_features=1000000, min_df=5)),
        on_field(['item_seq_number'], FunctionTransformer(to_records, validate=False), DictVectorizer()),
        FunctionTransformer(itemgetter(['price']), validate=False),
        # n_jobs=4
    )
    df = pd.read_csv('../input/train.csv',
                     usecols=['title', 'description', 'price', 'item_seq_number',
                              'parent_category_name', 'region', 'city', 'category_name',
                              'param_1', 'param_2', 'param_3', 'user_type', 'deal_probability'])
    print('load end')
    target = df['deal_probability']
    X_train = vectorizer.fit_transform(preprocess(df)).astype(np.float32)
    with open('train_nn_chargram.pkl', 'wb') as f:
        pickle.dump(X_train, f, -1)
    with open('vectorizer_chargram.pkl', 'wb') as f:
        pickle.dump(vectorizer, f, -1)
    y_train = target.values
    metric = 'val_loss'
    mode = 'min'
    callbacks = [
        EarlyStopping(monitor=metric, patience=10, verbose=1, min_delta=1e-6, mode=mode),
        ReduceLROnPlateau(monitor=metric, factor=0.1, patience=2, verbose=1, epsilon=1e-4, mode=mode),
        ModelCheckpoint(monitor=metric, filepath='weights/best_weights_chargram.hdf5',
                        save_best_only=True, save_weights_only=True, mode=mode),
        TensorBoard(log_dir='logs'),
        LoggingCallback(),
    ]
    model = get_model(X_train.shape[1])
    cv = KFold(n_splits=5, shuffle=True, random_state=871)
    for train, test in cv.split(X_train, y_train):
        trn_x = X_train[train, :]
        val_x = X_train[test, :]
        trn_y = y_train[train]
        val_y = y_train[test]
        break
    model.fit(x=trn_x, y=trn_y, validation_data=(val_x, val_y),
              batch_size=2**11, epochs=1000, callbacks=callbacks)
        best_model = ks.models.clone_model(model)
        best_model.set_weights(model.get_weights())
    res = dict(pred_valid=pred_valid_best, pred_test=best_model.predict(X_test))
    return res


class_names = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]

## from https://www.kaggle.com/tunguz/logistic-regression-with-words-and-char-n-grams
word_vectorizer = Tfidf(sublinear_tf=True,
                        strip_accents='unicode',
                        analyzer='word',
                        token_pattern=r'\w{1,}',
                        stop_words='english',
                        ngram_range=(1, 2),
                        max_features=20000)
char_vectorizer = Tfidf(sublinear_tf=True,
                        strip_accents='unicode',
                        analyzer='char',
                        stop_words='english',
                        ngram_range=(2, 6),
                        max_features=30000)
vectorizer = make_union(on_field('comment_text', word_vectorizer),
                        on_field('comment_text', char_vectorizer),
                        n_jobs=4)

with timer('process train'):
    train = pd.read_csv('../input/train.csv')
    cv = KFold(n_splits=20, shuffle=True, random_state=42)
content = []
for newspaper in newspapers:
    c.execute(
        u'select title, body from {} where title IS NOT NULL and body IS NOT NULL and date >= "{}" and date <= "{}";'
        .format(newspaper, init_date, final_date))
    content += [row[0] + row[1] for row in c]
conn.close()

# common words (stopwords)
fp = codecs.open(stopwords, "r", encoding="utf-8")
data = fp.read()
fp.close()
if 'spanish' in stopwords:
    aux = data.split('\r\n')
elif 'english' in stopwords:
    aux = data.split('\n')
words = [a.lower() for a in aux]

"""
Training of the tf-idf weighting
"""
tfidf = Tfidf(min_df=2, max_df=0.95,
              stop_words=words,
              ngram_range=(1, 1))
tfidf.fit(content)
pk.dump(tfidf, open('idf.pk', 'wb'))
def main():
    vectorizer = make_union(
        on_field('description', Tfidf(max_features=100000, stop_words=sw, token_pattern=r'\w+',
                                      norm='l2', min_df=3, sublinear_tf=True, smooth_idf=False,
                                      ngram_range=(1, 2))),  # max_df=0.3,
        on_field('title', Tfidf(max_features=100000, stop_words=sw, token_pattern=r'\w+',
                                norm='l2', min_df=3, sublinear_tf=True, smooth_idf=False,
                                ngram_range=(1, 2))),
        on_field(['image_top_1', 'region', 'category_name', 'parent_category_name', 'user_type'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=1)
    with timer('reading data '):
        dtypes = {
            'category_name': 'category',
            'parent_category_name': 'category',
            'region': 'category',
            'item_seq_number': 'uint32',
            'user_type': 'category',
            'image_top_1': 'category',
            'price': 'float32',
            'deal_probability': 'float32',
        }
        train = pd.read_csv(path + 'train.csv', dtype=dtypes)
        test = pd.read_csv(path + 'test.csv', dtype=dtypes)
    with timer('add new features'):
        cat_cols = ['image_top_1', 'region', 'city', 'parent_category_name', 'category_name',
                    'param_1', 'param_2', 'param_3', 'user_type']
        num_cols = ['price', 'deal_probability']
        for c in cat_cols:
            for c2 in num_cols:
                enc = train.groupby(c)[c2].agg(['mean']).astype(np.float32).reset_index()
                enc.columns = ['_'.join([str(c), str(c2), str(c3)]) if c3 != c else c for c3 in enc.columns]
                train = pd.merge(train, enc, how='left', on=c)
                test = pd.merge(test, enc, how='left', on=c)
                del enc
    with timer('process train'):
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        y_train = train['deal_probability'].values
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
        gc.collect()
        print('train shape', X_train.shape)
        print('valid shape', X_valid.shape)
    with timer('process test'):
        X_test = vectorizer.transform(preprocess(test)).astype(np.float32)
        del test
        gc.collect()
        print('test shape', X_test.shape)
    valid_length = X_valid.shape[0]
    X_valid = vstack([X_valid, X_test])
    del X_test
    gc.collect()
    xs = [x.astype(bool).astype(np.float32) for x in [X_train, X_valid]]
    del X_train, X_valid
    gc.collect()
    y_pred = fit_predict(xs, y_train=y_train)
    test_pred = y_pred[valid_length:]
    y_pred = y_pred[:valid_length]
    print('Valid RMSLE: {:.4f}'.format(np.sqrt(mean_squared_error(valid['deal_probability'], y_pred))))
    submission = pd.read_csv(path + 'test.csv', usecols=["item_id"])
    submission["deal_probability"] = test_pred
    submission['deal_probability'].clip(0.0, 1.0, inplace=True)
    submission.to_csv(subpath + "MLP_V15.csv", index=False)
c = conn.cursor()
content = []
c.execute('select title, body from lanacion where title IS NOT NULL;')
for row in c:
    try:
        content.append(row[0] + row[1])
    except:
        content.append(row[0])
c.execute('select title, body from pagina12 where title IS NOT NULL;')
for row in c:
    try:
        content.append(row[0] + row[1])
    except:
        content.append(row[0])
conn.close()

"""
Training of the tf-idf weighting
"""
m = 0.00
while m <= 0.95:
    tfidf = Tfidf(min_df=m, max_df=m + 0.05, ngram_range=(1, 2))
    x_tfidf = tfidf.fit_transform(content)
    print(m, x_tfidf.shape[1])
    m += 0.05
logger = get_logger(exp=args.exp)
with timer("Load Data", logger):
    loader = DataLoader()
with timer("tokenize", logger):
    loader.tokenize(tokenizer, {"stopwords": get_stopwords(), "include_verb": True})
    train, test = loader.load()
    X = train["tokenized"].fillna("")
    X_test = test["tokenized"].fillna("")
    y = train["label"].values
    y_test = test["label"].values
with timer("vectorize", logger):
    tv = Tfidf(max_features=20000, ngram_range=(1, 3))
    X = tv.fit_transform(X)
    X_test = tv.transform(X_test)
with timer("optimize", logger):
    study = optuna.create_study()
    study.optimize(optimal_params, n_trials=args.ntrial, n_jobs=args.n_jobs)
    logger.info(f"Best params: {study.best_params}")
    logger.info(f"Best value: {study.best_value}")
    logger.info(f"Best trial: {study.best_trial}")
                  optimizer=ks.optimizers.Adam(lr=3e-3))
    for i in range(3):
        with timer(f'epoch {i + 1}'):
            model.fit(x=X_train, y=y_train, batch_size=2**(11 + i), epochs=1, verbose=0)
    y_pred = model.predict(X_test)[:, 0]
    return y_pred


stopWords = stopwords.words('russian')
vectorizer = make_union(
    on_field('title', Tfidf(max_features=100000, stop_words=stopWords)),  # token_pattern='\w+',
    on_field('text', Tfidf(max_features=100000, ngram_range=(1, 2), stop_words=stopWords)),
    on_field('param', Tfidf(max_features=1000, stop_words=stopWords)),
    # on_field(['shipping', 'item_condition_id'],
    #          FunctionTransformer(to_records, validate=False), DictVectorizer()),
    n_jobs=4)
y_scaler = StandardScaler()
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
cv = KFold(n_splits=5, shuffle=True, random_state=42)
train_ids, valid_ids = next(cv.split(train))
train, valid = train.iloc[train_ids], train.iloc[valid_ids]
# y_train = y_scaler.fit_transform(np.log1p(train['deal_probability'].values.reshape(-1, 1)))
y_train = y_scaler.fit_transform(train['deal_probability'].values.reshape(
def tfidf_fabric(x=1):
    return Tfidf(max_features=15000, token_pattern=r'\w+', ngram_range=(1, x))
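# A hedged usage sketch: fitting the factory's unigram+bigram variant on a toy corpus
# (the two example strings are invented for illustration).
tfidf = tfidf_fabric(x=2)
X = tfidf.fit_transform(["red cotton dress", "blue denim jacket"])
print(X.shape)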