def main():
    ########################################################################
    # use hyperopt to find the best parameters of the model
    # use 5 fold cross validation
    learner_name = 'vanila_ensemble'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')
    optimizer = TaskOptimizer(learner_name, logger)
    optimizer.run()
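# ----------------------------------------------------------------------
# Illustration only (not part of the original pipeline): TaskOptimizer's
# internals are not shown in this excerpt. Below is a minimal sketch of
# the hyperopt + 5-fold-CV pattern it presumably wraps, using hyperopt's
# public API; Ridge is a stand-in for the real ensemble learner, and the
# search space is hypothetical.
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold


def cv_objective(params, X, y, n_folds=5):
    # score one hyper-parameter setting with k-fold cross validation
    rmses = []
    for tr, va in KFold(n_splits=n_folds, shuffle=True,
                        random_state=0).split(X):
        model = Ridge(alpha=params['alpha']).fit(X[tr], y[tr])
        pred = model.predict(X[va])
        rmses.append(np.sqrt(np.mean((pred - y[va]) ** 2)))
    return {'loss': float(np.mean(rmses)), 'status': STATUS_OK}


# space = {'alpha': hp.loguniform('alpha', -5, 2)}
# best = fmin(lambda p: cv_objective(p, X, y), space,
#             algo=tpe.suggest, max_evals=50, trials=Trials())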
# Assumed third-party imports for this script; project-local helpers
# (_load, _save, get_extract_feature, TRAIN_SIZE, param_space_best_WordBatch,
# TaskOptimizer, time_utils, logging_utils) are defined elsewhere in the repo.
import os
import time

import numpy as np
import pandas as pd
import wordbatch
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from wordbatch.extractors import WordBag


def main():
    feature_vectorized_file_name = 'Data/feature_vectorized2'
    if os.path.exists(feature_vectorized_file_name):
        # cached features found: load them instead of recomputing
        sparse_merge, price = _load(feature_vectorized_file_name)
        print(sparse_merge.shape)
    else:
        ########################################################################
        start_time = time.time()
        merge, submission, price = get_extract_feature()
        merge = merge[:TRAIN_SIZE]
        # merge['item_condition_id'] = merge['item_condition_id'].astype('category')
        # print('[{}] Convert categorical completed'.format(time.time() - start_time))

        # vectorize features
        wb = CountVectorizer()
        X_category2 = wb.fit_transform(merge['category_2'])
        X_category3 = wb.fit_transform(merge['category_name'])
        X_brand2 = wb.fit_transform(merge['brand_name'])
        print('[{}] Count vectorize `categories` completed.'.format(
            time.time() - start_time))

        lb = LabelBinarizer(sparse_output=True)
        X_brand = lb.fit_transform(merge['brand_name'])
        X_category1 = lb.fit_transform(merge['category_1'])
        X_category4 = lb.fit_transform(merge['category_name'])
        print('[{}] Label binarize `brand_name` completed.'.format(
            time.time() - start_time))

        X_dummies = csr_matrix(
            pd.get_dummies(merge[['item_condition_id', 'shipping']],
                           sparse=True).values)

        # hand features: log-scale and normalize the length/frequency columns
        for col in merge.columns:
            if ('Len' in col) or ('Frec' in col):
                merge[col] = np.log1p(merge[col])
                merge[col] = merge[col] / merge[col].max()

        hand_feature = ['brand_name_Frec', 'item_description_wordLen',
                        'brand_name_name_Intsct',
                        'brand_name_item_description_Intsct']
        X_hand_feature = merge[hand_feature].values

        name_w1 = param_space_best_WordBatch['name_w1']
        name_w2 = param_space_best_WordBatch['name_w2']
        desc_w1 = param_space_best_WordBatch['desc_w1']
        desc_w2 = param_space_best_WordBatch['desc_w2']

        wb = wordbatch.WordBatch(normalize_text=None,
                                 extractor=(WordBag, {
                                     "hash_ngrams": 2,
                                     "hash_ngrams_weights": [name_w1, name_w2],
                                     "hash_size": 2 ** 28,
                                     "norm": None,
                                     "tf": 'binary',
                                     "idf": None,
                                 }), procs=8)
        wb.dictionary_freeze = True
        X_name = wb.fit_transform(merge['name'])
        del wb
        # keep name columns that are non-zero in more than 2 documents
        X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 2, 0, 1),
                                    dtype=bool)]
        print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

        # prepend category and name so the description vectorizer sees them too
        merge['item_description'] = merge['category_2'].map(str) + ' E ' + \
            merge['name'].map(str) + ' E ' + \
            merge['item_description'].map(str)

        wb = wordbatch.WordBatch(normalize_text=None,
                                 extractor=(WordBag, {
                                     "hash_ngrams": 3,
                                     "hash_ngrams_weights": [desc_w1, desc_w2, 0.7],
                                     "hash_size": 2 ** 28,
                                     "norm": "l2",
                                     "tf": 1.0,
                                     "idf": None
                                 }), procs=8)
        wb.dictionary_freeze = True
        X_description = wb.fit_transform(merge['item_description'])
        del wb
        # keep description columns that are non-zero in more than 6 documents
        X_description = X_description[:, np.array(np.clip(
            X_description.getnnz(axis=0) - 6, 0, 1), dtype=bool)]
        print('[{}] Vectorize `item_description` completed.'.format(
            time.time() - start_time))
        print(X_description.shape)

        sparse_merge = hstack(
            (X_dummies, X_brand, X_brand2, X_category1, X_category2,
             X_category3, X_category4, X_hand_feature, X_name,
             X_description)).tocsr()

        print(X_dummies.shape, X_brand.shape, X_brand2.shape,
              X_category1.shape, X_category2.shape, X_category3.shape,
              X_category4.shape, X_hand_feature.shape, X_name.shape,
              X_description.shape, sparse_merge.shape)

        _save(feature_vectorized_file_name, [sparse_merge, price])
        print('[{}] data saved.'.format(time.time() - start_time))

    ########################################################################
    # use hyperopt to find the best parameters of the model
    # use 3 fold cross validation
    # learner_name = 'best_FTRL'
    # learner_name = 'FTRL'
    learner_name = 'best_FM_FTRL'
    # learner_name = 'FM_FTRL'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')
    optimizer = TaskOptimizer(learner_name, sparse_merge, price, logger)
    optimizer.run()
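# ----------------------------------------------------------------------
# Illustration only: the getnnz-based pruning above keeps a hashed-feature
# column only if it is non-zero in enough rows (document frequency > k,
# with k = 2 for `name` and k = 6 for `item_description`). Standalone toy
# demo of the same mask construction:
import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([[1, 0, 2],
                         [0, 0, 3],
                         [4, 0, 0]]))
# keep columns with more than 1 non-zero entry
mask = np.array(np.clip(X.getnnz(axis=0) - 1, 0, 1), dtype=bool)
print(X.shape, '->', X[:, mask].shape)  # (3, 3) -> (3, 2)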
# Assumed third-party imports; project-local helpers (_load, _save,
# get_extract_feature, TRAIN_SIZE, brands_filling, TaskOptimizer,
# time_utils, logging_utils) are defined elsewhere in the repo.
import os
import time

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer


def main():
    feature_vectorized_file_name = 'Data/feature_vectorized'
    if os.path.exists(feature_vectorized_file_name):
        # cached features found: load them instead of recomputing
        sparse_merge, price, desc, name = _load(feature_vectorized_file_name)
        print(sparse_merge.shape)
    else:
        ########################################################################
        start_time = time.time()
        merge, submission, price = get_extract_feature()
        merge = merge[:TRAIN_SIZE]
        merge['item_condition_id'] = merge['item_condition_id'].astype(
            'category')
        print('[{}] Convert categorical completed'.format(
            time.time() - start_time))

        brands_filling(merge)

        # cross each category level with item_condition_id
        merge['gencat_cond'] = merge['category_1'].map(
            str) + '_' + merge['item_condition_id'].astype(str)
        merge['subcat_1_cond'] = merge['category_2'].map(
            str) + '_' + merge['item_condition_id'].astype(str)
        merge['subcat_2_cond'] = merge['category_name'].map(
            str) + '_' + merge['item_condition_id'].astype(str)
        print(f'[{time.time() - start_time}] Categories and '
              f'item_condition_id concatenated.')

        # vectorize features
        wb = CountVectorizer()
        X_category2 = wb.fit_transform(merge['category_2'])
        X_category3 = wb.fit_transform(merge['category_name'])
        X_brand2 = wb.fit_transform(merge['brand_name'])
        # token_pattern='.+' keeps each whole string as a single token
        wb = CountVectorizer(token_pattern='.+', min_df=2)
        X_gencat_cond = wb.fit_transform(merge['gencat_cond'])
        X_subcat_1_cond = wb.fit_transform(merge['subcat_1_cond'])
        X_subcat_2_cond = wb.fit_transform(merge['subcat_2_cond'])
        print('[{}] Count vectorize `categories` completed.'.format(
            time.time() - start_time))

        lb = LabelBinarizer(sparse_output=True)
        X_brand = lb.fit_transform(merge['brand_name'])
        X_category1 = lb.fit_transform(merge['category_1'])
        X_category4 = lb.fit_transform(merge['category_name'])
        print('[{}] Label binarize `brand_name` completed.'.format(
            time.time() - start_time))

        X_dummies = csr_matrix(
            pd.get_dummies(merge[['item_condition_id', 'shipping']],
                           sparse=True).values)

        # hand features: log-scale and normalize the length/frequency columns
        columns = ['brand_name_Frec', 'item_description_wordLen']
        for col in columns:
            merge[col] = np.log1p(merge[col])
            merge[col] = merge[col] / merge[col].max()
        hand_feature = [
            'brand_name_Frec', 'item_description_wordLen',
            'brand_name_name_Intsct', 'brand_name_item_description_Intsct',
            'has_brand'
        ]
        X_hand_feature = merge[hand_feature].values

        sparse_merge = hstack(
            (X_dummies, X_brand, X_brand2, X_category1, X_category2,
             X_category3, X_category4, X_hand_feature, X_gencat_cond,
             X_subcat_1_cond, X_subcat_2_cond)).tocsr()

        print(X_dummies.shape, X_brand.shape, X_brand2.shape,
              X_category1.shape, X_category2.shape, X_category3.shape,
              X_category4.shape, X_hand_feature.shape, sparse_merge.shape,
              X_gencat_cond.shape, X_subcat_1_cond.shape,
              X_subcat_2_cond.shape)

        # prepend category and name so the description vectorizer sees them too
        merge['item_description'] = merge['category_2'].map(str) + ' . . ' + \
            merge['name'].map(str) + ' . . ' + \
            merge['item_description'].map(str)
        desc = merge['item_description']
        name = merge['name']

        _save(feature_vectorized_file_name, [sparse_merge, price, desc, name])
        print('[{}] data saved.'.format(time.time() - start_time))

    ########################################################################
    # use hyperopt to find the best parameters of the model
    learner_name = 'best_WordBatch'
    # learner_name = 'WordBatch'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')
    optimizer = TaskOptimizer(learner_name, sparse_merge, price, desc, name,
                              logger)
    optimizer.run()
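# ----------------------------------------------------------------------
# Illustration only: with token_pattern='.+' CountVectorizer treats each
# whole string as a single token (no word splitting), so min_df=2 drops
# category/condition combinations seen only once. Toy data:
from sklearn.feature_extraction.text import CountVectorizer

combos = ['Women_1', 'Women_1', 'Men_3', 'Women_2']
cv = CountVectorizer(token_pattern='.+', min_df=2)
X = cv.fit_transform(combos)
print(cv.get_feature_names_out())  # ['women_1'] -- the only combo seen twice
print(X.shape)                     # (4, 1)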
# Assumed imports; project-local helpers (_load, _save, get_extract_feature,
# Label_Encoder, Item_Tokenizer, TaskOptimizer, time_utils, logging_utils)
# are defined elsewhere in the repo.
import multiprocessing
import os
import time

import numpy as np


def main():
    ########################################################################
    file_name = 'Data/feature_processed'
    if os.path.exists(file_name):
        # cached features found: load them instead of recomputing
        merge, Item_size, hand_feature = _load(file_name)
        print(hand_feature)
    else:
        start_time = time.time()
        merge, submission, y = get_extract_feature()
        merge['item_condition_id'] = merge['item_condition_id'].astype(
            'category')
        print('[{}] Convert categorical completed'.format(
            time.time() - start_time))

        Item_size = {}

        # label-encode brand_name + categories in parallel
        columns = ['category_1', 'category_2', 'category_name', 'brand_name']
        with multiprocessing.Pool(4) as p:
            dfs = p.imap(Label_Encoder, [merge[col] for col in columns])
            for col, df in zip(columns, dfs):
                merge[col] = df
                Item_size[col] = merge[col].max() + 1
        print('[{}] Label Encode `brand_name` and `categories` completed.'.
              format(time.time() - start_time))

        # turn item_description and name into integer token sequences
        columns = ['item_description', 'name']
        with multiprocessing.Pool(4) as p:
            dfs = p.imap(Item_Tokenizer, [merge[col] for col in columns])
            for col, df in zip(columns, dfs):
                merge['Seq_' + col], Item_size[col] = df
        print('[{}] Sequence `item_description` and `name` completed.'.format(
            time.time() - start_time))
        print(Item_size)

        # hand features: log-scale and normalize the length/frequency columns
        columns = ['brand_name_Frec', 'item_description_wordLen']
        for col in columns:
            merge[col] = np.log1p(merge[col])
            merge[col] = merge[col] / merge[col].max()
        hand_feature = [
            'brand_name_Frec', 'item_description_wordLen',
            'brand_name_name_Intsct', 'brand_name_item_description_Intsct'
        ]

        _save(file_name, [merge, Item_size, hand_feature])
        print('[{}] data saved.'.format(time.time() - start_time))

    ########################################################################
    # use hyperopt to find the best parameters of the model
    # use 3 fold cross validation
    # learner_name = 'best_GRU_ensemble'
    # learner_name = 'GRU_ensemble'
    # learner_name = 'best_con1d_ensemble'
    # learner_name = 'con1d_ensemble'
    learner_name = 'best_vanila_con1d'
    # learner_name = 'best_vanila_GRU'
    # learner_name = 'vanila_con1d'
    # learner_name = 'vanila_GRU'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')
    optimizer = TaskOptimizer(learner_name, merge, Item_size, hand_feature,
                              logger)
    optimizer.run()
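# ----------------------------------------------------------------------
# Illustration only (hypothetical): Label_Encoder and Item_Tokenizer are
# project-local helpers not shown in this excerpt. A minimal stand-in for
# Label_Encoder, matching the call pattern above (one Series in, one
# integer-coded Series out) via pandas.factorize; the real implementation
# may differ.
import pandas as pd


def Label_Encoder(series):
    # map each distinct value (NaN included, as 'missing') to a dense id
    codes, _ = pd.factorize(series.fillna('missing'))
    return pd.Series(codes, index=series.index)


# Label_Encoder(pd.Series(['Nike', None, 'Nike'])).tolist()  # -> [0, 1, 0]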
# coding: utf-8
import os

import config
import logging_utils
import pkl_utils
import time_utils

feature_name = "stack1"
logname = "feature_combiner_%s_%s.log" % (feature_name,
                                          time_utils._timestamp())
logger = logging_utils._get_logger(config.LOG_DIR, logname)

# combiner spec: the base training table plus optional per-feature
# train/test files (empty here)
data_dict = {
    "train_basic": "newdata/train_v20.csv",
    "train_files": [],
    "test_files": [],
}

fname = os.path.join(config.FEAT_DIR + "/Combine",
                     feature_name + config.FEAT_FILE_SUFFIX)
pkl_utils._save(fname, data_dict)
logger.info("Save to %s" % fname)
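# ----------------------------------------------------------------------
# Illustration only (assumption): pkl_utils is a project-local wrapper
# that is not shown in this excerpt. A minimal sketch of what its
# _save/_load pair presumably wraps, using the standard-library pickle:
import pickle


def _save(fname, obj):
    with open(fname, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def _load(fname):
    with open(fname, 'rb') as f:
        return pickle.load(f)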