def predict(self, X_test, time_remain):
    """Run the test-time pipeline: merge tables, apply the fitted
    transforms, and score with the trained model.

    Timing of the merge and predict phases is recorded in ``self.time``.
    """
    tables = self.tables
    self.istrain = False
    tables[MAIN_TABLE_NAME] = X_test

    clean_tables(tables)

    t0 = time.time()
    X = merge_table(tables, self.config)
    self.time['merging_test'] = time.time() - t0

    clean_df(X)
    # Re-apply the numeric / categorical transforms fitted during training.
    transform_numeric(X, self.dropcols, self.numericmap, self.istrain,
                      self.square_cubic_transform, self.skewness)
    transform_categorical_hash(X, self.dropcols, self.istrain)

    t0 = time.time()
    result = self.model.predict(X, self.diff_info, self.start_time)
    self.time['result_predict'] = time.time() - t0

    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Score ``X_test`` using the artifacts produced by ``fit``.

    Fix: ``self.selected_features_0`` is only created during ``fit`` when
    ``FEATURE_SELECTION_SWITCH`` is on; the old code read it
    unconditionally and raised AttributeError with selection disabled.
    The access is now guarded, mirroring the switch branches used
    elsewhere in this file. Behavior with the switch on is unchanged.
    """
    Xs = self.tables
    main_table, len_X_train = Xs[MAIN_TABLE_NAME], len(Xs[MAIN_TABLE_NAME])
    # Stack train + test so feature engineering sees one consistent frame.
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'], sort=True)
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table
    clean_df(Xs[MAIN_TABLE_NAME])
    X = merge_table(Xs, self.config)
    clean_df(X)
    # Keep only the columns selected at fit time (plus time / multi-cat).
    if FEATURE_SELECTION_SWITCH:
        selected_features = list(
            self.selected_features_0
        ) + self.time_feature_list + self.mul_feature_list
    else:
        selected_features = self.time_feature_list + self.mul_feature_list
    X = feature_engineer_rewrite(X.filter(selected_features), self.config)
    # Test rows are the tail of the stacked frame.
    X = X.iloc[len_X_train:, ]
    X.sort_index(inplace=True)
    if FEATURE_SELECTION_SWITCH:
        X = X[self.selected_features_1]
    result = predict(X, self.config)
    # Free the training tables; this method is the last consumer.
    del self.tables, X_test
    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Feature-engineer the combined train/test table, train under the
    remaining time budget, and return predictions for the test rows."""
    self.Time_data_info['time_ramain_so_far'] = time_remain

    t_feat = time.time()
    Xs = self.tables

    log(f"Merge train and test tables...")
    combined = pd.concat([Xs[MAIN_TABLE_NAME], X_test], keys=['train', 'test'])
    combined.index = combined.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = combined

    log(f"Feature engineering...")
    clean_tables(Xs)
    X = clean_df(merge_table(Xs, self.config))
    X = feature_engineer(X, self.config)

    def _take(prefix):
        # Slice one half of the stacked frame and restore integer row ids.
        part = X[X.index.str.startswith(prefix)]
        part.index = part.index.map(lambda s: int(s.split('_')[1]))
        part.sort_index(inplace=True)
        return part

    X_train = _take("train")
    y_train = self.targets

    feat_elapsed = time.time() - t_feat
    self.Time_data_info['time_for_feature_engineering'] = feat_elapsed
    self.Time_data_info['time_ramain_so_far'] = self.Time_data_info[
        'time_ramain_so_far'] - feat_elapsed
    print(f"TIME info:", self.Time_data_info)

    log(f"Training...")
    t_train = time.time()
    timetrain(X_train, y_train, self.config, self.Time_data_info)
    train_elapsed = time.time() - t_train
    self.Time_data_info['time_ramain_so_far'] = self.Time_data_info[
        'time_ramain_so_far'] - train_elapsed
    self.Time_data_info['time_for_model_train'] = train_elapsed
    print("TIME info:", self.Time_data_info)

    log(f"Predicting...")
    result = predict(_take("test"), self.config)
    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Merge train/test tables, engineer features, then train with time
    control and score the test rows."""
    Xs = self.tables
    stacked = pd.concat([Xs[MAIN_TABLE_NAME], X_test], keys=['train', 'test'])
    stacked.index = stacked.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = stacked

    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    feature_engineer(X, self.config)

    # Multi-categorical columns from related tables are dropped for now.
    X.drop([c for c in X.columns if c.startswith("mul_")],
           axis=1, inplace=True)
    remove_trivial_features(X)

    # Frequency-encode the plain categorical columns.
    cat_features = [c for c in X.columns
                    if "c_" in c and "ROLLING" not in c and "cnt" not in c]
    X, _ = cat_value_counts(X, cat_features)

    # Split the stacked frame back into train and test halves.
    X_train = X[X.index.str.startswith("train")]
    X = X[X.index.str.startswith("test")]
    X.index = X.index.map(lambda x: int(x.split('_')[1]))
    X.sort_index(inplace=True)

    train_with_time_control(X_train, self.y, self.config)
    result = predict(X, self.config)
    return pd.Series(result)
def baseline_features_test(Xs, X_test, config, m_features, mlbs, one_hot_model):
    """Build the sparse test-set feature matrix, mirroring the training
    pipeline: merge/clean the tables, engineer features on the test rows,
    and (budget permitting) append the fitted multi-value encodings."""
    combined = pd.concat([Xs[MAIN_TABLE_NAME], X_test], keys=['train', 'test'])
    combined.index = combined.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = combined

    clean_tables(Xs)
    X = merge_table(Xs, config)
    clean_df(X)

    from feature_for_test import multi_features_for_test
    from scipy.sparse import hstack, csr_matrix

    X = X[X.index.str.startswith("test")]
    feature_engineer(X, config)

    def _restore_row_order(df):
        # Recover the original integer row ids and sort by them.
        df.index = df.index.map(lambda x: int(x.split('_')[1]))
        df.sort_index(inplace=True)

    new_features = None
    if m_features and int(config["time_budget"]) > 300:
        # Expensive multi-value encodings only when the budget allows.
        # NOTE(review): new_features rows follow the pre-sort order of X —
        # confirm alignment with the sorted matrix below.
        new_features = multi_features_for_test(X, m_features, mlbs, one_hot_model)
        X.drop(m_features, inplace=True, axis=1)
        _restore_row_order(X)
        X = hstack([csr_matrix(X), new_features]).tocsr()
        print("------------------")
        print(X.shape)
    elif m_features:
        _restore_row_order(X)
        X.drop(m_features, inplace=True, axis=1)
        X = csr_matrix(X)
    else:
        _restore_row_order(X)
        X = csr_matrix(X)
    return X
def fit(self, Xs, y, time_remain):
    """Training pipeline: clean/merge tables, (optionally) drop outliers,
    run feature engineering with up to two feature-selection rounds, and
    train the model.

    Side effects: stores the feature-column lists and selected-feature
    sets on ``self`` for use in ``predict``, and writes an estimated
    prediction time into ``self.config``.

    ``duration`` accumulates weighted phase timings (the 2x factors
    presumably reflect that the same phases run again over train+test at
    predict time — TODO confirm).
    """
    print('', flush=True)
    time_manager = TimeManager(self.config, time_remain)
    duration = 0
    # self.tables = copy.deepcopy(Xs)
    self.tables = Xs
    clean_tables(self.tables)
    duration += 2*time_manager.check("clean tables")
    if DROP_OUTLIER:
        # the percentage of outliers dropped is around 15% to 20%
        inlier_lable = drop_outlier(self.tables[MAIN_TABLE_NAME])
        self.tables[MAIN_TABLE_NAME] = self.tables[MAIN_TABLE_NAME][inlier_lable == 1].reset_index(drop=True)
        # Keep the labels aligned with the filtered rows.
        y = y[inlier_lable == 1].reset_index(drop=True)
        duration += time_manager.check("drop outlier")
    X = merge_table(self.tables, self.config)
    duration += 2*time_manager.check("merge table")
    clean_df(X)
    duration += 2*time_manager.check("clean data before learning")
    # Column groups identified by prefix; reused at predict time.
    self.time_feature_list = [c for c in X if c.startswith(TIME_PREFIX)]
    self.mul_feature_list = [c for c in X if c.startswith(MULTI_CAT_PREFIX)]
    self.num_feature_list = [c for c in X if c.startswith(NUMERICAL_PREFIX)]
    print('', flush=True)
    if FEATURE_SELECTION_SWITCH:
        # First selection round runs on the plain (categorical) columns only;
        # time/multi-cat/numeric columns are always kept.
        _, self.selected_features_0 = feature_selection(X.drop(columns=self.time_feature_list+self.mul_feature_list+self.num_feature_list), y, self.config, FEATURE_RATIO_1)
        time_manager.check("first feature selection")
        selected_features = list(self.selected_features_0) + self.time_feature_list + self.mul_feature_list + self.num_feature_list
    else:
        selected_features = self.time_feature_list + self.mul_feature_list + self.num_feature_list
    st = time.time()
    X = feature_engineer_rewrite(X.filter(selected_features), self.config, time_manager)
    duration += 2*(time.time() - st)
    time_manager.check("exit feature engineering")
    if FEATURE_SELECTION_SWITCH:
        # Second round runs on the engineered feature matrix.
        X, self.selected_features_1 = feature_selection(X, y, self.config, FEATURE_RATIO_2)
        duration += time_manager.check("second feature selection")
    print('', flush=True)
    # Heuristic scale factor for the predict-phase time estimate.
    self.config["prediction_estimated"] = 1.57 * duration
    print(f"estimated prediction time: {self.config['prediction_estimated']}")
    train(X, y, self.config, time_manager)
    time_manager.check("model training")
    print('', flush=True)
def predict(self, X_test, time_remain):
    """Merge train/test, engineer features, then average the predictions
    of three independently trained models (simple bagging).

    Fix: the accumulator check used ``result == None``. After the first
    iteration ``result`` is a pandas Series, so ``==`` compares
    element-wise and ``if`` raises "truth value of a Series is ambiguous"
    on the second iteration, crashing the ensemble loop. An identity
    check (``is None``) is the correct test.
    """
    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    # Stack train + test so feature engineering sees one consistent frame.
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table
    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    feature_engineer(X, self.config)
    # Frequency-encode plain categorical columns (skip rolling/count features).
    cat_features = [col for col in X.columns
                    if "c_" in col and "ROLLING" not in col and "cnt" not in col]
    X, _ = cat_value_counts(X, cat_features)
    # Split back into train / test and restore integer row ids.
    X_train = X[X.index.str.startswith("train")]
    X = X[X.index.str.startswith("test")]
    X.index = X.index.map(lambda x: int(x.split('_')[1]))
    X.sort_index(inplace=True)
    # Train three models and average their predictions.
    result = None
    for i in range(0, 3):
        train(X_train, self.y, self.config)
        tmp = predict(X, self.config)
        if result is None:  # was `result == None`: ambiguous on a Series
            result = tmp
        else:
            result = result + tmp
    result = result / float(3)
    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Test-time pipeline: clean and merge the test tables, re-apply the
    feature engineering fitted during training, and score the model.

    Relies on attributes set by ``fit``: the per-prefix feature lists and
    ``selected_features_0`` / ``selected_features_1``.
    """
    time_manager = TimeManager(self.config, time_remain)
    print(f"prediction remaining time: {time_remain}")
    print('', flush=True)
    Xs = self.tables
    # main_table, len_X_train = Xs[MAIN_TABLE_NAME], len(Xs[MAIN_TABLE_NAME])
    # main_table = pd.concat([main_table, X_test], keys=['train', 'test'], sort=True)
    # time_manager.check("concat X_train and X_test")
    # main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    # Xs[MAIN_TABLE_NAME] = main_table
    # Predict on the test set alone (the train+test concat path above is disabled).
    Xs[MAIN_TABLE_NAME] = X_test
    clean_df(Xs[MAIN_TABLE_NAME])
    time_manager.check("clean main table")
    X = merge_table(Xs, self.config)
    time_manager.check("merge table")
    clean_df(X)
    time_manager.check("clean data before learning")
    print('', flush=True)
    # Keep exactly the columns chosen during training.
    if FEATURE_SELECTION_SWITCH:
        selected_features = list(self.selected_features_0) + self.time_feature_list + self.mul_feature_list + self.num_feature_list
    else:
        selected_features = self.time_feature_list + self.mul_feature_list + self.num_feature_list
    X = feature_engineer_rewrite(X.filter(selected_features), self.config, time_manager)
    time_manager.check("exit feature engineering")
    print('', flush=True)
    # X = X[X.index.str.startswith("test")]
    # X = X.iloc[len_X_train:, ]
    X.sort_index(inplace=True)
    time_manager.check("X sorting")
    if FEATURE_SELECTION_SWITCH:
        # Align the engineered test columns with the second-round selection.
        test_data_feature_selection(X, self.selected_features_1)
        X = X[self.selected_features_1]
        time_manager.check("test data feature selection")
    print('', flush=True)
    result = predict(X, self.config)
    time_manager.check("prediction")
    print('', flush=True)
    return pd.Series(result)
def fit(self, Xs, y, time_ramain):
    """Training pipeline: optional downsampling, clean/merge tables,
    feature engineering with up to two feature-selection rounds, train.

    Side effects: stores the feature-column lists and selected-feature
    sets on ``self`` for the companion ``predict``.

    Fix: ``selected_features`` was assigned only inside the
    ``FEATURE_SELECTION_SWITCH`` branch, so disabling selection raised a
    NameError at ``X.filter(...)``. An ``else`` branch now falls back to
    the time + multi-categorical columns, mirroring the guarded pattern
    used by the other fit/predict pair in this file. Behavior with the
    switch on is unchanged.
    """
    # self.tables = copy.deepcopy(Xs)
    self.tables = Xs
    if DATA_DOWNSAMPLING_SWITCH:
        self.tables[MAIN_TABLE_NAME], y = data_downsampling(
            self.tables[MAIN_TABLE_NAME], y, self.config)
    print("before clean table mem used")
    os.system("free -m")
    clean_tables(self.tables)
    print("before merge table mem used")
    os.system("free -m")
    X = merge_table(self.tables, self.config)
    print("before engineer mem used")
    os.system("free -m")
    # Column groups identified by prefix; reused at predict time.
    self.time_feature_list = [c for c in X if c.startswith(TIME_PREFIX)]
    self.mul_feature_list = [
        c for c in X if c.startswith(MULTI_CAT_PREFIX)
    ]
    clean_df(X)
    if FEATURE_SELECTION_SWITCH:
        _, self.selected_features_0 = feature_selection(
            X.drop(columns=self.time_feature_list + self.mul_feature_list),
            y, self.config)
        selected_features = list(
            self.selected_features_0
        ) + self.time_feature_list + self.mul_feature_list
    else:
        # Fallback so X.filter() below always has a column list.
        selected_features = self.time_feature_list + self.mul_feature_list
    X = feature_engineer_rewrite(X.filter(selected_features), self.config)
    # clean_df(X)
    if FEATURE_SELECTION_SWITCH:
        X, self.selected_features_1 = feature_selection(X, y, self.config)
    print("before feature selection mem used")
    os.system("free -m")
    print("before train mem used")
    os.system("free -m")
    train(X, y, self.config)
def baseline_features(Xs, y, config):
    """Build the training feature matrix for the baseline model.

    Sorts the main table by its timestamp column (labels are carried
    along inside the frame so they stay aligned), merges the related
    tables, engineers features, and converts the result to a sparse
    matrix — optionally one-hot-encoding multi-value ("mul_feature_")
    columns when the time budget allows.

    Returns (X, y, feature_names, cat_feature_map, stampcol,
    one_hot_features, one_hot_models, m_features, mlbs); the fitted
    encoders are needed again by baseline_features_test.
    """
    clean_tables(Xs)
    # Unix timestamps of the main-table time column, used for time splits.
    stampcol = Xs[CONSTANT.MAIN_TABLE_NAME][config["time_col"]].apply(
        lambda x: int(x.timestamp()))
    main_table = Xs[CONSTANT.MAIN_TABLE_NAME]
    # Embed label + timestamp in the frame so the sort keeps them aligned.
    main_table["label"] = y
    main_table["timestamp"] = stampcol
    main_table.sort_values("timestamp", inplace=True)
    # Rebuild the frame to reset to a clean 0..n-1 positional index.
    tmp_columns = main_table.columns
    main_table = pd.DataFrame(main_table.values)
    main_table.columns = tmp_columns
    #main_table = main_table.iloc[0:40000]
    Xs[CONSTANT.MAIN_TABLE_NAME] = main_table
    y = main_table["label"]
    stampcol = main_table["timestamp"]
    X = merge_table(Xs, config)
    print(X.columns)
    X.drop(["label", "timestamp"], axis=1, inplace=True)
    clean_df(X)
    feature_engineer(X, config)
    # Distinct values per plain categorical column (skip rolling/count cols).
    cat_feature_map = {}
    for col in X.columns:
        if "c_" in col and "ROLLING" not in col and "cnt" not in col:
            cat_feature_map[col] = set(X[col])
    feature_names = X.columns
    # Multi-value columns that may get one-hot expansion below.
    m_features = []
    for feature in feature_names:
        if "mul_feature_" in feature:
            m_features.append(feature)
    one_hot_features = None
    one_hot_models = None
    mlbs = None
    if len(m_features) > 0 and int(config["time_budget"]) > 200000:
        # Budget threshold is very high — this branch is effectively rare.
        one_hot_features, one_hot_models, mlbs = onehot_feature_selection_m(
            X, y, m_features, feature_num_everyiter=len(m_features))
        X.drop(m_features, inplace=True, axis=1)
        #X = pd.concat([X, one_hot_features], axis=1)
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
        X = hstack([X, one_hot_features]).tocsr()
    elif len(m_features) > 0:
        X.drop(m_features, inplace=True, axis=1)
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
    else:
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
    print("---------------------------------")
    print(X.shape)
    #X.drop(m_features,inplace=True,axis=1)
    # one_hot_features=None
    # one_hot_models = None
    # import random
    # X_tmp = [X]
    # y_tmp = [y]
    # for i in range(5):
    #     cols = list(X.columns)
    #     random.shuffle(cols)
    #     cols_tmp = cols[0:int(len(cols)*0.5)]
    #     X_tmp.append(X[cols_tmp])
    #     y_tmp.append(y)
    #
    # y = pd.concat(y_tmp,axis=0)
    # X = pd.concat(X_tmp, axis=0)
    return X, y, feature_names, cat_feature_map, stampcol, one_hot_features, one_hot_models, m_features, mlbs
def predict(self, X_test, time_remain):
    """Time-budgeted bagging predictor over LightGBM.

    Sorts the training table by its timestamp column, stacks it with
    X_test, builds categorical / multi-categorical encodings, then runs
    repeated trials (sample -> one-hot multi-cat -> train -> predict)
    until ``self.config.time_left()`` drops below a ~50s safety margin.
    Accumulated predictions are averaged over the completed trials.
    In ``self.test_mode`` the tail 20% of training data is held out and
    scored with AUC instead; the returned Series is then empty.
    """
    ##--------Calculate sample size----------
    '''main_table=self.tables[MAIN_TABLE_NAME]
    print(main_table.shape[0])
    print(X_test.shape[0])
    return None'''
    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    ## -------------Sort X_train according to the timestamp-------------------
    time_col = self.config['time_col']
    '''print(main_table[time_col])
    print(main_table[time_col].dtype)
    input()'''
    main_table.sort_values(time_col, inplace=True)
    index = main_table.index
    # Keep labels aligned with the time-sorted rows, then reset both.
    self.y = self.y.reindex(index)
    main_table.reset_index(inplace=True, drop=True)
    self.y.reset_index(inplace=True, drop=True)
    #print(main_table.index)
    #print(self.y.index)
    #input()
    #print(main_table.columns)
    #print(self.y.columns)
    #input()
    # Local-evaluation mode: hold out the most recent 20% of training rows
    # as a labelled test set (the incoming X_test is replaced).
    if self.test_mode:
        train_ratio = 0.8
        train_size = int(train_ratio * main_table.shape[0])
        X_test = main_table[train_size:]
        main_table = main_table[0:train_size]
        y_test = self.y[train_size:]
        self.y = self.y[0:train_size]
    ##----------concat and merge tables------------------------
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table
    ## Clean tables
    clean_tables(Xs)
    #remove_trivial_features_in_tables(Xs)
    ## Merge tables and remove trivial features
    print(main_table.index)
    X = merge_table(Xs, self.config)
    train_index = X.index.str.startswith("train")
    test_index = X.index.str.startswith("test")
    ##------convert m_ features to c_---------##
    # Plain multi-value columns ("m_...") are re-labelled categorical
    # ("c_m_..."); expanded "mul_" columns are collected for one-hot below.
    new_columns = []
    mul_features = []
    for col in X.columns:
        if "m_" in col and "mul_" not in col:
            new_columns.append("c_" + col)
        elif "mul_" in col:
            mul_features.append(col)
            new_columns.append(col)
        else:
            new_columns.append(col)
    X.columns = new_columns
    print(X.columns)
    clean_df(X)
    print(X.shape)
    ##-------------- Add number frequency feature ----------------
    cat_features = []
    for col in X.columns:
        #if "c_" in col and "ROLLING" not in col and "cnt" not in col:
        if col.split('_')[0] == 'c':
            cat_features.append(col)
    print("cat_features", cat_features)
    X, _ = cat_value_counts(X, cat_features)
    print(X.shape)
    #print(X.dtypes)
    # ##-------------- Reserve multi-cat features ------------------
    # all_features = X.columns
    # tmp_c = None
    # mul_features = []
    #
    # for c in all_features:
    #     if c.split('_')[0] == 'm':
    #         tmp_c = X[c].copy()
    #         tmp_c.fillna("0",inplace=True)
    #         tmp_c = tmp_c.apply(lambda x: str(x).split(","))
    #         X["mul_"+tmp_c.name] = tmp_c
    #         mul_features.append("mul_"+tmp_c.name)
    #         tmp_c = tmp_c.apply(lambda x: int(x[0]))
    #         tmp_c.name = f"{CONSTANT.CATEGORY_PREFIX}{tmp_c.name}"
    #         X = pd.concat([X, tmp_c], axis=1)
    #         print(c)
    #
    # #input()
    # if not (tmp_c is None):
    #     del tmp_c
    # #print(X.columns)
    # print(X.shape)
    # #input()
    ###--------------Change data type------------------
    X.drop([self.config['time_col']], axis=1, inplace=True)
    X = normalize_categorical_features(X)
    # Cast by column-prefix convention: n_=numeric, c_=categorical,
    # t_=time; m_/mul_ (multi-value) columns are left untouched.
    for c in X.columns:
        if c.split('_')[0] == 'n':
            X[c] = X[c].astype('float32')
        elif c.split('_')[0] == 'c':
            #X[c] = X[c].apply(lambda x: int(x))
            X[c] = X[c].astype('int32')
        elif c.split('_')[0] == 't':
            X[c] = X[c].values.astype('float32')
        elif c.split('_')[0] == 'm':
            continue
        elif c.split('_')[0] == 'mul':
            continue
        else:
            raise ValueError('Undefined column type: %s' % c)
    print(X.shape)
    ##---------------features split--------------------
    # "Main" features = everything except the expanded multi-value columns.
    main_features = []
    for feature in X.columns:
        if "mul_" not in feature:
            main_features.append(feature)
    print(main_features)
    print(X.shape)
    ##--------------Remove trivial features-------------------
    # NOTE(review): X[main_features] is a copy, so in-place drops inside
    # remove_trivial_features may not propagate back to X — confirm.
    remove_trivial_features(X[main_features])
    print(X.shape)
    # ##---------------Multi-cat features--------------------
    # multi_cat_features=[]
    # for c in X.columns:
    #     if c.split('_')[0] == 'm':
    #         multi_cat_features.append(c)
    #
    # if len(multi_cat_features) > 0:
    #     X_multi_cat = X[multi_cat_features]
    #     X.drop(multi_cat_features, axis=1, inplace=True)
    ###-------------Train lightgbm to get an initial result-------------
    result = None
    num_trails = 0
    skip_multi_cat = False   # downgrade flag: drop multi-cat work when slow
    selection = False
    one_hot_features_m = None
    mlbs_m = None
    mlbs = None
    for i in range(1000):
        random_state = np.random.RandomState(i)
        num_trails = num_trails + 1
        if self.config.time_left() < 50:
            num_trails = num_trails - 1
            break
        X_train = X[X.index.str.startswith("train")]
        X_test = X[X.index.str.startswith("test")]
        ##-------------data sample ----------------#
        from data_sample import data_sample
        if i == 0:
            # First pass: probe sample to estimate the nonzero volume and
            # derive a positive/negative ratio that caps the train size.
            X_train_sample, y_train_sample = data_sample(
                X_train, self.y, p_n_ratio=1, ratio=1, random_state_seed=i)
            mul_size = 0
            for mul_feature in mul_features:
                mul_count_data_tmp = X_train_sample[mul_feature].apply(
                    lambda x: len(x))
                #print(mul_feature,mul_count_data_tmp.sum(axis=0))
                mul_size = mul_size + mul_count_data_tmp.sum(axis=0)
            size_fo_train = 60000000
            train_size = csr_matrix(
                X_train_sample[main_features]).nonzero()[0].size + mul_size
            p_n_ratio = size_fo_train / train_size
        X_train_sample, y_train_sample = data_sample(X_train,
                                                     self.y,
                                                     random_state_seed=i,
                                                     p_n_ratio=p_n_ratio,
                                                     ratio=1)
        #print(y_train_sample)
        #input()
        if self.config.time_left() < 50:
            num_trails = num_trails - 1
            break
        print(X_train_sample.shape)
        # Stack sampled train rows on top of test rows so encoders see both.
        X_tmp = pd.concat([X_train_sample, X_test])
        print("train test_train shape:train", X_tmp.shape)
        print("train test_train shape:test",
              X_tmp[0:y_train_sample.shape[0]].shape)
        print("train test_train shape:y",
              X_tmp[y_train_sample.shape[0]:].shape)
        from feature_expansion import onehot_feature_selection_m
        main_body_start_time = time.time()
        if i >= 0:
            if len(mul_features) > 0:
                # if i>1 :
                #     selection=False
                #     one_hot_features_m, _, mlbs_m = onehot_feature_selection_m(X_tmp, y_train_sample, mlbs_m.keys(),
                #                                             feature_num_everyiter=len(
                #                                             mlbs_m.keys()),
                #                                             selection=selection)
                #
                # else:
                #     selection = False
                if i == 0:
                    first_iter_flag = True
                else:
                    first_iter_flag = False
                if skip_multi_cat:
                    # Reuse already-fitted binarizers; no refit.
                    one_hot_features_m, mul_features = onehot_encoding_without_fit(
                        X_tmp, mul_features, mlbs_m, config=self.config)
                else:
                    one_hot_features_m, one_hot_models, mlbs_m, mul_features = onehot_feature_selection_m(
                        X_tmp, y_train_sample, mul_features,
                        config=self.config,
                        is_first_iter=first_iter_flag,
                        feature_num_everyiter=len(mul_features),
                        selection=selection)
                if self.config.time_left() < 50:
                    num_trails = num_trails - 1
                    break
                if one_hot_features_m is not None:
                    one_hot_features_m = csr_matrix(one_hot_features_m,
                                                    dtype=np.float32)
                    # Keep only the training-row slice of the encoding.
                    one_hot_features_m = one_hot_features_m[
                        0:y_train_sample.shape[0], :]
                    print("mul_features shape sparse:",
                          one_hot_features_m.shape)
        ###--------------------data concat--------------------###
        X_train_sample = X_tmp[0:y_train_sample.shape[0]]
        X_test = X_tmp[y_train_sample.shape[0]:]
        X_train_sample = X_train_sample[main_features]
        X_train_sample = csr_matrix(X_train_sample)
        y_train_sample = y_train_sample.values
        if one_hot_features_m is not None:
            X_train_sample = hstack([X_train_sample, one_hot_features_m]).tocsr()
        #if self.config.time_left() < 60:
        #    num_trails = num_trails - 1
        #    break
        if result is None:
            # First completed trial: also measure the full trial duration so
            # later iterations can decide whether another pass fits.
            #model = train_and_predict_with_time_control_basic(X_train_sample, X_test, y_train_sample, self.config, random_state=random_state)
            model = train_lightgbm(X_train_sample, y_train_sample,
                                   self.config, random_state=random_state,
                                   mode="timestamp")
            del X_train_sample
            result = lightgbm_predict_by_split(model, X_test, main_features,
                                               mlbs_m, mul_features,
                                               self.config)
            whole_process_time = time.time() - main_body_start_time
            print("Time for the whole process time is: %.4f" % whole_process_time)
            if result is None:
                # Prediction failed — drop multi-cat features for retries.
                mul_features = list()
        else:
            #model= train_and_predict_with_time_control_basic(X_train_sample, X_test, y_train_sample, self.config, random_state=random_state)
            model = train_lightgbm(X_train_sample, y_train_sample,
                                   self.config, random_state=random_state,
                                   mode="timestamp")
            del X_train_sample
            print("timeleft", self.config.time_left())
            if self.config.time_left() < 50 or model is None:
                num_trails = num_trails - 1
                break
            result_tmp = lightgbm_predict_by_split(model, X_test,
                                                   main_features, mlbs_m,
                                                   mul_features, self.config)
            if self.config.time_left() < 50 or result_tmp is None:
                num_trails = num_trails - 1
                break
            # Accumulate; averaged after the loop.
            result = result_tmp + result
        '''
        if i>0:
        ###-------------- get sparse training and testing matrix---------
            numeric_table = X_tmp[main_features]
            numeric_table = min_max_func(numeric_table)
            numeric_table = csr_matrix(numeric_table)
            X_train_sample = numeric_table[0:y_train_sample.shape[0],:]
            if len(cat_features) > 0 :
                if i > 1:
                    selection = False
                    one_hot_features, _, mlbs, le_models = onehot_feature_selection(X_tmp, y_train_sample, mlbs.keys(),
                                                            feature_num_everyiter=len(
                                                            mlbs.keys()),
                                                            selection=selection)
                else:
                    one_hot_features, one_hot_models, mlbs, le_models = onehot_feature_selection(X_tmp, y_train_sample,
                                                            cat_features,
                                                            feature_num_everyiter=len(
                                                            cat_features),
                                                            selection=selection)
            if one_hot_features is not None:
                X_train_sample = hstack([X_train_sample, one_hot_features]).tocsr()
            if one_hot_features_m is not None:
                X_train_sample = hstack([X_train_sample, one_hot_features_m]).tocsr()
            train_fastfm_batch(X_train_sample, y_train_sample, self.config)
        '''
        ####---------------If there is not enough time for the whole process, skip multi-cat-------------
        if self.config.time_left() < whole_process_time:
            if skip_multi_cat:
                print("Time is not enough even using basic features!")
                break
            else:
                skip_multi_cat = True
        print("Time remaining: %.4f" % self.config.time_left())
        print("-" * 50)
        if self.config.time_left() < 50:
            break
    if result is None:
        print("Time is not enough for a complete iteration!")
        result = np.zeros(X_test.shape[0])
    else:
        result = result / num_trails
    print(result)
    #result /= num_trails
    ###-------------Ensemble-------------------------
    # Disabled FM-ensemble path (flag is hard-coded False).
    # NOTE(review): inside this dead block, multi_cat_features / X_multi_cat
    # are undefined at this scope — it would fail if re-enabled as-is.
    flag = False
    if flag:
        ###--------------Multi-cat features processing-----------
        if len(multi_cat_features) > 0:
            X_multi_cat_sparse = None
            for c in multi_cat_features:
                sparse_out = get_tfidf_vector(X_multi_cat[c],
                                              max_features=100, sparse=True)
                if X_multi_cat_sparse is None:
                    X_multi_cat_sparse = sparse_out
                else:
                    X_multi_cat_sparse = hstack(
                        [X_multi_cat_sparse, sparse_out]).tocsr()
                ## Warning appears here, but there is no effect.
                X_multi_cat.drop([c], axis=1, inplace=True)
            print("Time remaining: %.4f" % self.config.time_left())
            print("-" * 50)
        ###-------------- get sparse training and testing matrix---------
        numeric_table = X[[
            c for c in X.columns
            if c.startswith(CONSTANT.NUMERICAL_PREFIX)
            or c.startswith(CONSTANT.TIME_PREFIX)
        ]]
        X = X[[
            c for c in X.columns
            if not (c.startswith(CONSTANT.NUMERICAL_PREFIX)
                    or c.startswith(CONSTANT.TIME_PREFIX))
        ]]
        numeric_table = (numeric_table - numeric_table.min()) / (
            numeric_table.max() - numeric_table.min())
        enc = OneHotEncoder(sparse=True, dtype=np.float32, categories="auto")
        X = enc.fit_transform(X)
        X = hstack((X, numeric_table.values), dtype=np.float32).tocsr()
        del numeric_table
        del enc
        if len(multi_cat_features) > 0:
            X = hstack([X, X_multi_cat_sparse]).tocsr()
            del X_multi_cat_sparse
        print("Time remaining: %.4f" % self.config.time_left())
        print("-" * 50)
        ###--------------- train FM and merge result -------------------
        weight = 1.0
        result *= weight
        #result += (1 - weight)*train_fastfm_batch(X[train_index], X[test_index], self.y.values, self.config)
    ## Training process
    #train_with_time_control(X_train, self.y, self.config)
    ## Testing process
    #result = predict(X, self.config)
    ###-------------Train and Predict--------------------
    #result = train_and_predict(X, self.y, self.config)
    #result = train_and_predict_with_concept_drift_zhiqiang(X, self.y, self.config)
    # Can not install autosklearn
    '''from autosklearn import classification
    import sklearn
    X, X_test, y, y_test = sklearn.model_selection.train_test_split(X, self.y, test_size=0.2, random_state=1)
    automl = classification.AutoSklearnClassifier()
    automl.fit(X, y)
    pred = automl.predict(X_test)
    score = roc_auc_score(y_test.values, pred)
    print("Test auc on hold-out dataset: %.4f"%score)
    result=None'''
    if self.test_mode:
        # Local evaluation: report AUC on the hold-out tail and return empty.
        score = roc_auc_score(y_test.values, result)
        print("Test auc on hold-out dataset: %.4f" % score)
        result = None
    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Memory-conscious test-time pipeline under a hard time limit.

    Concatenates train+test, pre-processes and merges the tables,
    engineers features (with aggressive dtype compression via
    ``table_memory_cut``), optionally adds label-count features for
    categorical and multi-value columns, then trains on an 80/20
    train/validation split and scores the test rows.
    """
    timer = Timer()
    timer.set(time_remain)
    with timer.time_limit('ProProcess'):
        # fetch information of test dataset
        self.config[TEST_DATA_LENGTH] = len(X_test)
        self.config['test_time'] = self._fectch_time_range(X_test)
        self.config[STAGE] = 'test'
        Xs = self.tables
        # Train rows first, test rows appended after; positional split below.
        main_table = pd.concat([Xs[MAIN_TABLE_NAME], X_test],
                               axis=0, copy=False)
        main_table.reset_index(drop=True, inplace=True)
        del Xs[MAIN_TABLE_NAME]
        Xs[MAIN_TABLE_NAME] = main_table
        pre_process(Xs, self.config)
        clean_tables(Xs)
        pre_feature_extract(Xs)
        pre_tables_memory_cut(Xs)
        X = merge_table(Xs, self.config)
        # clean datas
        del self.tables, Xs
        gc.collect()
        self.null_count_sum(X, self.config)
        clean_df(X, fill_time=True)
        # compress data for memory problem
        X = table_memory_cut(X)
        # feature engineering
        print('overall X size', X.shape)
        X, add_feature = feature_engineer(X, self.config)
        # memory issue (~11 GB); compress both frames before concatenating
        X = table_memory_cut(X)
        add_feature = table_memory_cut(add_feature)
        X = pd.concat([X, add_feature], axis=1, copy=False)
        del add_feature
        print(X.shape)
        # re compress data
        # Split back into train/validation vs. test by stored train length.
        X_train_val, y_train_val = X.iloc[:self.config[
            TRAIN_DATA_LENGTH]], self.train_label
        X_test = X.iloc[self.config[TRAIN_DATA_LENGTH]:]
        train_len = int(self.config[TRAIN_DATA_LENGTH] * 0.8)
        valid_len = self.config[TRAIN_DATA_LENGTH] - train_len
        self.config[TRAIN_LEN_OF_TRAIN_VAL] = train_len
        self.config[VAL_LEN_OF_TRAIN_VAL] = valid_len
        del X
        gc.collect()
        # Label-count features for categorical and multi-value columns;
        # either helper may return None when not applicable.
        all_label_count_feature_list = cat_Lable_Cnt_Fun(
            X_train_val, y_train_val, X_test, self.config)
        all_mutlicat_feature_data_list = Mv_Label_Cnt_Func(
            X_train_val, y_train_val, X_test, self.config)
        if (all_label_count_feature_list is None) & (all_mutlicat_feature_data_list is None):
            # No extra features: plain 80/20 split.
            X_train, y_train = X_train_val.iloc[:train_len], self.train_label[:train_len]
            X_val, y_val = X_train_val.iloc[train_len:], self.train_label[train_len:]
        else:
            all_feature_list = []
            if all_label_count_feature_list is not None:
                all_feature_list += all_label_count_feature_list
            if all_mutlicat_feature_data_list is not None:
                all_feature_list += all_mutlicat_feature_data_list
            add_feature_data = pd.concat(all_feature_list, axis=1, copy=False)
            add_feature_data.sort_index(inplace=True)
            del all_label_count_feature_list, all_mutlicat_feature_data_list, all_feature_list
            gc.collect()
            X_train = pd.concat(
                [X_train_val[:train_len], add_feature_data[:train_len]],
                axis=1, copy=False)
            X_val = pd.concat([
                X_train_val[train_len:self.config[TRAIN_DATA_LENGTH]],
                add_feature_data[train_len:self.config[TRAIN_DATA_LENGTH]]
            ], axis=1, copy=False)
            y_train = self.train_label[:train_len]
            y_val = self.train_label[train_len:]
            X_test = pd.concat([
                X_test, add_feature_data[self.config[TRAIN_DATA_LENGTH]:]
            ], axis=1, copy=False)
            # NOTE(review): this del references add_feature_data, which only
            # exists on this branch; the no-extra-features path above does
            # not free the train/val frames — confirm intended placement.
            del X_train_val, y_train_val, add_feature_data, self.train_label
            gc.collect()
    # train() returns the column set actually used, so prediction can
    # align the test frame to it.
    train_columns = train(X_train, X_val, y_train, y_val, self.config,
                          timer.remain)
    del X_train, X_val, y_train, y_val
    gc.collect()
    result = predict(X_test[train_columns], self.config)
    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Concatenate train/test, engineer features, train under the
    remaining time budget, then score the test rows."""
    info = self.Time_data_info
    info['time_ramain_so_far'] = time_remain  # key spelling kept for compatibility

    fe_start = time.time()
    Xs = self.tables
    merged = pd.concat([Xs[MAIN_TABLE_NAME], X_test], keys=['train', 'test'])
    merged.index = merged.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = merged

    log(f"Feature engineering...")
    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    X = clean_df(X)
    X = feature_engineer(X, self.config)

    X_train = X[X.index.str.startswith("train")]
    X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
    X_train.sort_index(inplace=True)
    y_train = self.targets

    info['time_for_feature_engineering'] = time.time() - fe_start
    info['time_ramain_so_far'] = info['time_ramain_so_far'] - info['time_for_feature_engineering']
    print(f"TIME info:", info)

    log(f"Training...")
    fit_start = time.time()
    timetrain(X_train, y_train, self.config, info)
    fit_elapsed = time.time() - fit_start
    info['time_ramain_so_far'] = info['time_ramain_so_far'] - fit_elapsed
    info['time_for_model_train'] = fit_elapsed
    print("TIME info:", info)

    log(f"Predicting...")
    X_test = X[X.index.str.startswith("test")]
    X_test.index = X_test.index.map(lambda x: int(x.split('_')[1]))
    X_test.sort_index(inplace=True)
    result = predict(X_test, self.config)
    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Predict on X_test.

    Train and test rows are concatenated so feature engineering sees one
    frame; the split is later recovered positionally (the first len(self.y)
    rows are train).  The configured time budget selects between sampling
    strategies, whether one-hot expansion runs, and whether three
    train/predict rounds are averaged.  Returns a pd.Series of predictions.
    """
    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]  #.iloc[0:4000]
    #X_test = X_test#.iloc[0:4000]
    #self.y = self.y#.iloc[0:4000]
    # Very large budget: sample the raw main table up front.
    # (ratio=1 semantics are defined inside data_sample — TODO confirm.)
    if int(self.config["time_budget"]) > 2000:
        from data_sample import data_sample
        main_table, self.y = data_sample(main_table, self.y, ratio=1)
    # main_table = Xs[MAIN_TABLE_NAME].iloc[-1000000:]
    # self.y = self.y.iloc[-1000000:]
    # Tag rows as "train_<i>" / "test_<i>" so partitions can be recovered.
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table
    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    feature_engineer(X, self.config)

    ###-------------------- cat feature -----------------------###
    # Categorical columns ("c_" in the name, excluding ROLLING aggregates)
    # get value-count encoding.
    cat_features = []
    for col in X.columns:
        if "ROLLING" not in col and "c_" in col:
            cat_features.append(col)
    X, _ = cat_value_counts(X, cat_features)
    ###--------------------------------------------------------###

    ###------------------- data sample ------------------###
    # Small/medium budgets: sample the engineered training partition instead
    # of the raw table (flag=True branch for the tightest budgets).
    if int(self.config["time_budget"]) <= 300:
        X_train = X[X.index.str.startswith("train")]
        X_test = X[X.index.str.startswith("test")]
        from data_sample import data_sample
        X_train, self.y = data_sample(X_train, self.y, flag=True)
        X = pd.concat([X_train, X_test], keys=['train', 'test'])
    elif int(self.config["time_budget"]) < 2000:
        X_train = X[X.index.str.startswith("train")]
        X_test = X[X.index.str.startswith("test")]
        from data_sample import data_sample
        X_train, self.y = data_sample(X_train, self.y)
        X = pd.concat([X_train, X_test], keys=['train', 'test'])
    #X.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")

    ###------------------- mul onehot feature -----------------###
    # Multi-value categorical columns produced by feature engineering.
    m_features = []
    for col in X.columns:
        if ("ROLLING" not in col) and ("mul_feature_" in col):
            m_features.append(col)
    # if len(self.mlbs)>0 or self.mlbs is not None:
    #     m_features = list(self.mlbs.keys())
    # else:
    #     m_features = []
    one_hot_features = None
    one_hot_models = None
    mlbs = None
    one_hot_features_m = None
    from feature_expansion import onehot_feature_selection_m
    # With enough budget, expand multi-value columns into selected one-hot
    # features; in either branch the raw columns are dropped afterwards.
    if len(m_features) > 0 and int(self.config["time_budget"]) > 100:
        one_hot_features_m, one_hot_models, mlbs = onehot_feature_selection_m(
            X, self.y, m_features,
            feature_num_everyiter=len(m_features),
            selection=True)
        X.drop(m_features, inplace=True, axis=1)
    elif len(m_features) > 0:
        X.drop(m_features, inplace=True, axis=1)
    ###-------------------------------------------------###

    ###------------------- onehot encoder ------------------###
    from feature_expansion import onehot_feature_selection
    one_hot_features = None
    # Only the largest budgets can afford one-hot expansion of plain
    # categorical columns; categoricals the selector did not keep (absent
    # from mlbs) are dropped from the dense frame.
    if len(cat_features) > 0 and int(self.config["time_budget"]) > 4000:
        one_hot_features, one_hot_models, mlbs = onehot_feature_selection(
            X, self.y, cat_features,
            feature_num_everyiter=len(cat_features),
            selection=True)
        for cat_col in cat_features:
            if cat_col not in mlbs:
                X.drop(cat_col, inplace=True, axis=1)

    ###-----------------------concat--------------------###
    # Switch to sparse CSR and append any one-hot blocks column-wise.
    from scipy.sparse import hstack, csr_matrix
    X = csr_matrix(X)
    if one_hot_features is not None:
        X = hstack([X, one_hot_features]).tocsr()
    if one_hot_features_m is not None:
        X = hstack([X, one_hot_features_m]).tocsr()
    ###-------------------------------------------------###

    # ###------------------drop mul_feature---------------###
    # m_features = []
    # for feature in X.columns:
    #     if "mul_feature_" in feature:
    #         m_features.append(feature)
    #
    # X.drop(m_features,inplace=True,axis=1)
    # ###-------------------------------------------------###

    # Positional split: train rows were concatenated before test rows.
    X_train = X[0:self.y.shape[0]]
    X = X[self.y.shape[0]:]
    result = None
    # Mid-range budgets: average three independent train/predict rounds.
    if int(self.config["time_budget"]) < 2000 and int(
            self.config["time_budget"]) > 300:
        for i in range(0, 3):
            train(X_train, self.y, self.config)
            tmp = predict(X, self.config)
            if result is None:
                result = tmp
                continue
            else:
                result = result + tmp
        result = result / float(3)
    else:
        train(X_train, self.y, self.config)
        result = predict(X, self.config)
    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Predict on X_test with the FM/Keras pipeline.

    Merges train+test, engineers features, drops near-constant columns,
    one-hot encodes into sparse CSR matrices, then trains and predicts in
    batches via train_fm_keras_batch.  Returns a pd.Series of predictions.

    The triple-quoted string blocks below are dead code the author kept for
    reference (they are no-op string-literal statements, never executed).
    """
    ### calculate time range
    # Dead diagnostic: print min/max timestamps of train vs. test tables.
    '''Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    print(main_table.columns)
    input()
    min_train_time = np.min(main_table[[c for c in main_table.columns if c.startswith(CONSTANT.TIME_PREFIX)]])
    max_train_time = np.max(main_table[[c for c in main_table.columns if c.startswith(CONSTANT.TIME_PREFIX)]])
    min_test_time = np.min(X_test[[c for c in X_test.columns if c.startswith(CONSTANT.TIME_PREFIX)]])
    max_test_time = np.max(X_test[[c for c in X_test.columns if c.startswith(CONSTANT.TIME_PREFIX)]])
    print("minimum time in training dataset %s"%str(min_train_time))
    print("maximum time in training dataset %s"%str(max_train_time))
    print("minimum time in testing dataset %s"%str(min_test_time))
    print("maximum time in testing dataset %s"%str(max_test_time))
    return None'''
    ### test concept drift
    # Dead experiment: hold out the latest 20% of rows by time, train on the
    # rest, and report AUC on the held-out tail.
    '''Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    #main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    #main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    main_table = pd.concat([main_table, self.y], axis=1)
    time_feature = [c for c in main_table.columns if c.startswith(CONSTANT.TIME_PREFIX)]
    main_table = main_table.sort_values(time_feature)
    number_test = int(main_table.shape[0]*0.2)
    X_test = main_table.tail(number_test)
    X_test.index = range(X_test.shape[0])
    main_table = main_table.head(main_table.shape[0] - number_test)
    main_table.index = range(main_table.shape[0])
    min_train_time = np.min(main_table[time_feature])
    max_train_time = np.max(main_table[time_feature])
    min_test_time = np.min(X_test[time_feature])
    max_test_time = np.max(X_test[time_feature])
    print("minimum time in training dataset %s"%str(min_train_time))
    print("maximum time in training dataset %s"%str(max_train_time))
    print("minimum time in testing dataset %s"%str(min_test_time))
    print("maximum time in testing dataset %s"%str(max_test_time))
    y_test = X_test[X_test.columns[-1]]
    X_test = X_test[X_test.columns[0:-1]]
    y_train = main_table[main_table.columns[-1]]
    main_table = main_table[main_table.columns[0:-1]]
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table
    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    feature_engineer(X, self.config)
    cat_features = []
    for col in X.columns:
        if "c_" in col and "ROLLING" not in col and "cnt" not in col:
            cat_features.append(col)
    X, _ = cat_value_counts(X, cat_features)
    X_train = X[X.index.str.startswith("train")]
    X_test = X[X.index.str.startswith("test")]
    train(X_train, y_train, self.config)
    result = predict(X_test, self.config)
    fpr, tpr, thresholds=metrics.roc_curve(y_test.values, result, pos_label=1)
    print("test auc is %.4f"%(metrics.auc(fpr, tpr)))
    return None'''
    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    # Tag rows "train_<i>" / "test_<i>" so the split survives merging.
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table
    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    feature_engineer(X, self.config)
    # Drop "trivial" columns whose value range is effectively zero.
    diff = X.max() - X.min()
    threshold = 1e-6
    X = X[X.columns[diff > threshold]]
    print("There are %d columns of trivial features" %
          (diff.shape[0] - X.shape[1]))
    '''cat_features = []
    for col in X.columns:
        if "c_" in col and "ROLLING" not in col and "cnt" not in col:
            cat_features.append(col)'''
    #X, _ = cat_value_counts(X, cat_features)
    #X = pd.get_dummies(X, columns = X.columns, sparse=True)
    #cumulative_shift, X = oneHotEncoding(X)
    #self.config["cumulative_shift"] = cumulative_shift
    # Sparse one-hot encode; all_features is the total encoded feature count
    # (presumably the FM input width — TODO confirm in oneHotEncodingCSRMatrix).
    X_train, X, one_hot_features, all_features = oneHotEncodingCSRMatrix(X)
    #cumulative_shift = X.shape[1]
    self.config["cumulative_shift"] = all_features
    y = self.y.values
    result = None
    #X_train = X[X.index.str.startswith("train")]
    #train(X_train, y, self.config)
    #X = X[X.index.str.startswith("test")]
    #X.index = X.index.map(lambda x: int(x.split('_')[1]))
    #X.sort_index(inplace=True)
    #result = predict(X, self.config)
    #result = train_fm_keras(X_train, X, y, self.config, one_hot_features)
    #input()
    result = train_fm_keras_batch(X_train, X, y, self.config,
                                  one_hot_features)
    #result = train_and_predict(X_train, y, X, self.config, one_hot_features)
    # Dead TensorFlow session/model-directory cleanup experiments:
    '''tf.reset_default_graph()
    from tensorflow.python.summary.writer import writer_cache
    #print(writer_cache.FileWriterCache.get('./models/eval'))
    writer_cache.FileWriterCache.clear()
    input()
    os.system("rm -r ./models/*")'''
    '''os.system("rm -r ./models/model.*")
    os.system("rm -r ./models/check*")
    os.system("rm -r ./models/graph.*")
    os.system("rm -r ./models/eval/*")'''
    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Predict on X_test.

    Pipeline: merge train+test; run feature engineering twice — first on a
    ~10% sample of the training partition to pick columns via
    feature_selector, then on the full frame restricted to the selected
    columns (reusing encoders fitted on the sample) — and finally train
    repeatedly on fresh samples while the measured per-round wall time
    fits the remaining budget.  Returns a pd.Series in X_test row order.
    """
    time_1 = time.time()
    Xs = self.tables
    main_table_tmp = Xs[MAIN_TABLE_NAME]
    # Tag rows "train_<i>" / "test_<i>" so the split survives merging.
    main_table = pd.concat([main_table_tmp, X_test], keys=['train', 'test'])
    # main_table = pd.concat([X_test], keys=['test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table
    # Free large intermediates eagerly — memory is the bottleneck here.
    del main_table_tmp
    del X_test
    gc.collect()
    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    del Xs
    gc.collect()
    ##############################################################################3
    ##############################################################################3
    print(
        "########################################################################\n"
        "# select feature #\n"
        "########################################################################\n"
    )
    # Column selection runs on a 1/10 sample of the training partition.
    X_to_select = X[X.index.str.startswith("train")]
    big_df_memory = X_to_select.memory_usage().sum()
    big_df_len = X_to_select.shape[0]
    sample_num = int(len(self.y) / 10)
    part_X, part_y = data_sample_new(X_to_select, self.y, sample_num)
    del X_to_select
    # del y
    gc.collect()
    part_X = part_X.reset_index(drop=True)
    part_y = part_y.reset_index(drop=True)
    # First feature-engineering pass (on the sample): fits the column
    # generators/encoders and returns them so the full pass can reuse them.
    tmp_part_X, \
    self.two_order_cols, \
    self.two_group_cols, \
    self.mv_encs, \
    self.c_one_order_cols, \
    self.c_two_order_cols, \
    self.c_two_order_group_cnt_cols, \
    self.c_two_order_n_groupby_cat_cols, \
    self.n_minus_mean_cols, \
    max_numb_cols_to_select, \
    fe_model \
        = feature_engineer(part_X, self.config, part_y,
                           two_order_cols=self.two_order_cols,
                           two_group_cols=self.two_group_cols,
                           mv_encs=self.mv_encs,
                           c_one_order_cols=self.c_one_order_cols,
                           c_two_order_cols=self.c_two_order_cols,
                           c_two_order_group_cnt_cols=self.c_two_order_group_cnt_cols,
                           c_two_order_n_groupby_cat_cols=self.c_two_order_n_groupby_cat_cols,
                           n_minus_mean_cols=self.n_minus_mean_cols,
                           cols_selected=self.cols_selected,
                           big_df_memory=big_df_memory,
                           big_df_len=big_df_len,
                           fe_model=None
                           )
    tmp_part_X_d, self.cols_selected = feature_selector(
        tmp_part_X, part_y,
        max_numb_cols_to_select=max_numb_cols_to_select)
    # (Author's memory-usage debug prints were left here commented out;
    # condensed away for readability.)
    # mv_encs must be refitted on the full frame in the second pass.
    self.mv_encs = None
    del tmp_part_X
    del tmp_part_X_d
    del part_X
    del part_y
    gc.collect()
    print(
        "########################################################################\n"
        "# after select feature use all of data to train #\n"
        "########################################################################\n"
    )
    ##############################################################################3
    ##############################################################################3
    # Second pass over the full frame, reusing state fitted on the sample.
    X, \
    self.two_order_cols, \
    self.two_group_cols, \
    self.mv_encs, \
    self.c_one_order_cols, \
    self.c_two_order_cols, \
    self.c_two_order_group_cnt_cols, \
    self.c_two_order_n_groupby_cat_cols, \
    self.n_minus_mean_cols, \
    max_numb_cols_to_select, \
    fe_model \
        = feature_engineer(X, self.config,
                           two_order_cols=self.two_order_cols,
                           two_group_cols=self.two_group_cols,
                           mv_encs=self.mv_encs,
                           c_one_order_cols=self.c_one_order_cols,
                           c_two_order_cols=self.c_two_order_cols,
                           c_two_order_group_cnt_cols=self.c_two_order_group_cnt_cols,
                           c_two_order_n_groupby_cat_cols=self.c_two_order_n_groupby_cat_cols,
                           n_minus_mean_cols=self.n_minus_mean_cols,
                           cols_selected=self.cols_selected,
                           fe_model=fe_model
                           )
    # Restrict to the columns chosen by the selector.
    X = X[self.cols_selected]
    print(X.columns.tolist())
    print(self.cols_selected)
    X_train = X[X.index.str.startswith("train")]
    X = X[X.index.str.startswith("test")]
    gc.collect()
    X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
    X_train.sort_index(inplace=True)
    gc.collect()
    time_2 = time.time()
    time_left_to_train = time_remain - (time_2 - time_1)
    tmp_time = time_left_to_train
    run_flag = True
    a_time = 0
    train_count = 0
    train_num = 0
    run_num = 1
    # while run_flag:
    change_flag = True
    print(tmp_time)
    # Adaptive training loop: each round trains on a fresh sample and
    # measures its wall time.  After the first round (change_flag still
    # True) the total round count is chosen from the budget/round-time
    # ratio (25 / 3 / 2 rounds); later rounds can only stop early when the
    # remaining time gets too small.
    while run_num > 0:
        for i in range(1):
            t_1 = time.time()
            part_X, part_y = data_sample_for_train(X_train, self.y)
            print("*" * 10)
            print(len(part_y))
            print("*" * 10)
            train(part_X, part_y, self.config)
            t_2 = time.time()
            a_time = t_2 - t_1
            time_left_to_train = time_left_to_train - a_time
            print('a_time: ', a_time,
                  'time_left_to_train: ', time_left_to_train)
            if tmp_time / a_time > 60:
                if change_flag:
                    run_num = 25
                    print("###25###")
            elif tmp_time / a_time < 5 and tmp_time > 3 * a_time:
                if change_flag:
                    run_num = 2
                    print("###2###")
            elif time_left_to_train <= 3 * a_time:
                run_num = 0
                print("###stop###")
            elif time_left_to_train < 50:
                run_num = 0
                print("###stop###")
            else:
                if change_flag:
                    run_num = 3
                    print("###3###")
            change_flag = False
            run_num = run_num - 1
        # (Author's earlier run_flag/train_count stopping logic, kept
        # commented for reference.)
        # if a_time * 5 + 30 >= time_left_to_train:
        #     run_flag = False
        # train_count = train_count + 1
        # if train_count > 25:
        #     run_flag = False
        # if train_count < 4:
        #     run_flag = True
        # if time_left_to_train / a_time < 3:
        #     run_flag = False
    # train(X_train, self.y, self.config)
    gc.collect()
    del X_train
    gc.collect()
    # X = X[X.index.str.startswith("test")]
    # Restore the original test row order before predicting.
    X.index = X.index.map(lambda x: int(x.split('_')[1]))
    X.sort_index(inplace=True)
    gc.collect()
    result = predict(X, self.config)
    return pd.Series(result)
def predict(self, X_test, time_remain):
    """Predict on X_test.

    Instead of string index tags, train rows carry their label in a
    temporary 'y_sorted' column and test rows get the sentinel -1, so the
    train/test split survives merging and feature engineering.
    NOTE(review): this assumes -1 never occurs as a real label — confirm
    against the targets this solution is run on.
    Returns a pd.Series of predictions.
    """
    Xs = self.tables
    main_table = Xs[MAIN_TABLE_NAME]
    # Attach labels, then order training rows by the time column.
    main_table['y_sorted'] = self.y
    main_table.sort_values(self.ts_col, inplace=True)
    #y_trn = main_table.y_sorted.copy()
    #main_table.drop('y_sorted', axis=1, inplace=True)
    #main_table['data_type'] = 'train'
    #X_test['data_type'] = 'test'
    X_test['y_sorted'] = -1  # sentinel marking test rows
    main_table = pd.concat([main_table, X_test],
                           ignore_index=True).reset_index()
    del X_test
    gc.collect()
    # main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    # main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
    Xs[MAIN_TABLE_NAME] = main_table
    log('memory usage of main_table: {:.2f}MB'.format(
        df_memory_usage(main_table) // 1e6))
    log('memory usage of process: {:.2f}MB'.format(get_process_memory()))
    clean_tables(Xs)
    X = merge_table(Xs, self.config)
    clean_df(X)
    del Xs, main_table
    gc.collect()
    log('memory usage of X: {:.2f}MB'.format(df_memory_usage(X) // 1e6))
    log('memory usage of process: {:.2f}MB'.format(get_process_memory()))
    # Cache column groups by type prefix for self.feature_engineer.
    self.cat_cols = sorted(
        [c for c in X.columns if c.startswith(CATEGORY_PREFIX)])
    self.mcat_cols = sorted(
        [c for c in X.columns if c.startswith(MULTI_CAT_PREFIX)])
    self.num_cols = sorted(
        [c for c in X.columns if c.startswith(NUMERICAL_PREFIX)])
    self.ts_cols = sorted(
        [c for c in X.columns if c.startswith(TIME_PREFIX)])
    X = self.feature_engineer(X, train=True)
    # X_trn = X[X.index.str.startswith("train")]
    # X_trn.index = X_trn.index.map(lambda x: int(x.split('_')[1]))
    # Recover the training rows via the label column and split off y.
    X_trn = X[X['y_sorted'] != -1]
    y_trn = X_trn.y_sorted.copy()
    X_trn = X_trn.drop('y_sorted', axis=1)
    # X_tst = X[X.index.str.startswith("test")]
    # X_tst.index = X_tst.index.map(lambda x: int(x.split('_')[1]))
    X_tst = X[X['y_sorted'] == -1]
    X_tst = X_tst.drop('y_sorted', axis=1)
    X_tst.sort_index(inplace=True)  # restore original test row order
    del X
    gc.collect()
    log('memory usage of X_trn: {:.2f}MB'.format(
        df_memory_usage(X_trn) // 1e6))
    log('memory usage of process: {:.2f}MB'.format(get_process_memory()))
    train(X_trn, y_trn, self.config)
    del X_trn, y_trn
    gc.collect()
    log('memory usage of X_tst: {:.2f}MB'.format(
        df_memory_usage(X_tst) // 1e6))
    log('memory usage of process: {:.2f}MB'.format(get_process_memory()))
    result = predict(X_tst, self.config)
    del X_tst
    gc.collect()
    return pd.Series(result)
def fit(self, Xs, y, time_ramain):
    """Prepare training state: merge the input tables, transform features,
    build the datainfo descriptor, and fit the underlying NIPS model.

    Side effects: populates self.tables, self.time, self.diff_info,
    self.training_data, self.ohe, and self.model.
    """
    # Keep an untouched copy of the raw tables for predict-time reuse;
    # clean_tables below mutates Xs in place.
    self.tables = copy.deepcopy(Xs)
    self.dropcols = []
    self.istrain = True
    self.numericmap = {}
    self.square_cubic_transform = True
    self.skewness = True

    clean_tables(Xs)
    self.ohe = OneHotEncoder(handle_unknown='ignore')

    merge_start = time.time()
    X = merge_table(Xs, self.config)
    self.time['merging_train'] = time.time() - merge_start
    clean_df(X)

    fe_start = time.time()
    transform_numeric(X, self.dropcols, self.numericmap, self.istrain,
                      self.square_cubic_transform, self.skewness)
    transform_categorical_hash(X, self.dropcols, self.istrain)
    self.time['feature_engineer'] = time.time() - fe_start

    # Record column positions grouped by dtype family
    # (datetime / numeric / category), as Model_NIPS expects.
    date_time, numerical_list, categorical = [], [], []
    for position, col in enumerate(X.columns):
        dtype_name = X[col].dtype.name
        if dtype_name in ("int64", "float64"):
            numerical_list.append(position)
        if dtype_name == "datetime64[ns]":
            date_time.append(position)
        if dtype_name == "category":
            categorical.append(position)

    datainfo = {
        'loaded_feat_types': [date_time, numerical_list, categorical],
        'time_budget': self.config['time_budget'],
    }
    self.diff_info = datainfo
    self.training_data = X
    self.model = Model_NIPS(datainfo)

    fit_start = time.time()
    self.model.fit(X, y, datainfo)
    self.time['fitting'] = time.time() - fit_start