def go(self):
    """Generate, save and score one feature set per observation field.

    For every name in ``self.obs_fields`` that is a column of ``self.dfAll``,
    the column values are handed to ``self.generator`` (with ``None`` in the
    target slot) and the output of ``transform()`` is saved under
    ``self.feat_dir``.  When the generator's ``__name__()`` returns a list,
    each output column is saved as its own 1-D feature; otherwise the whole
    output is saved as one feature.  The correlation between the first
    ``TRAIN_SIZE`` rows and the ``relevance`` column is logged (for
    multi-column output only when ``self.force_corr`` is truthy).
    """
    labels = self.dfAll["relevance"].values[:TRAIN_SIZE]

    for obs_field in self.obs_fields:
        if obs_field not in self.dfAll.columns:
            self.logger.info("Skip %s" % obs_field)
            continue

        corpus = self.dfAll[obs_field].values
        extractor = self.generator(corpus, None, *self.param_list)
        feats = extractor.transform()
        label = extractor.__name__()

        if isinstance(label, list):
            # One single-column feature file per reported name.
            for col, feat_name in enumerate(label):
                fname = "%s_%s_%dD" % (feat_name, obs_field, 1)
                out_path = os.path.join(
                    self.feat_dir, fname + config.FEAT_FILE_SUFFIX)
                pkl_utils._save(out_path, feats[:, col])
                corr = np_utils._corr(feats[:TRAIN_SIZE, col], labels)
                self.logger.info("%s (%dD): corr = %.6f" % (fname, 1, corr))
            continue

        # Single name: the whole output goes into one file.
        dim = np_utils._dim(feats)
        fname = "%s_%s_%dD" % (label, obs_field, dim)
        out_path = os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX)
        pkl_utils._save(out_path, feats)

        if dim == 1:
            corr = np_utils._corr(feats[:TRAIN_SIZE], labels)
            self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
        elif self.force_corr:
            # Per-column correlations are only reported on request.
            for position in range(1, dim + 1):
                corr = np_utils._corr(feats[:TRAIN_SIZE, position - 1], labels)
                self.logger.info(
                    "%s (%d/%dD): corr = %.6f" % (fname, position, dim, corr))
def go(self):
    """Generate, save and score features for every (obs, target) field pair.

    Fields that are not columns of ``self.dfAll`` are skipped with a log
    line.  For each remaining pair, ``self.generator`` receives both column
    value arrays and the output of ``transform()`` is saved under
    ``self.feat_dir``.  When the generator's ``__name__()`` returns a list,
    each output column is saved as its own 1-D feature; otherwise the whole
    output is saved as one feature.  The correlation between the first
    ``TRAIN_SIZE`` rows and the ``relevance`` column is logged (for
    multi-column output only when ``self.force_corr`` is truthy).
    """
    labels = self.dfAll["relevance"].values[:TRAIN_SIZE]

    for obs_field in self.obs_fields:
        if obs_field not in self.dfAll.columns:
            self.logger.info("Skip %s"%obs_field)
            continue
        obs_corpus = self.dfAll[obs_field].values

        for target_field in self.target_fields:
            if target_field not in self.dfAll.columns:
                self.logger.info("Skip %s"%target_field)
                continue
            target_corpus = self.dfAll[target_field].values

            extractor = self.generator(obs_corpus, target_corpus, *self.param_list)
            feats = extractor.transform()
            label = extractor.__name__()

            if isinstance(label, list):
                # One single-column feature file per reported name.
                for col, feat_name in enumerate(label):
                    fname = "%s_%s_x_%s_%dD" % (
                        feat_name, obs_field, target_field, 1)
                    out_path = os.path.join(
                        self.feat_dir, fname + config.FEAT_FILE_SUFFIX)
                    pkl_utils._save(out_path, feats[:, col])
                    corr = np_utils._corr(feats[:TRAIN_SIZE, col], labels)
                    self.logger.info(
                        "%s (%dD): corr = %.6f" % (fname, 1, corr))
                continue

            # Single name: the whole output goes into one file.
            dim = np_utils._dim(feats)
            fname = "%s_%s_x_%s_%dD" % (label, obs_field, target_field, dim)
            out_path = os.path.join(
                self.feat_dir, fname + config.FEAT_FILE_SUFFIX)
            pkl_utils._save(out_path, feats)

            if dim == 1:
                corr = np_utils._corr(feats[:TRAIN_SIZE], labels)
                self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
            elif self.force_corr:
                # Per-column correlations are only reported on request.
                for position in range(1, dim + 1):
                    corr = np_utils._corr(
                        feats[:TRAIN_SIZE, position - 1], labels)
                    self.logger.info(
                        "%s (%d/%dD): corr = %.6f"
                        % (fname, position, dim, corr))
def main():
    """Generate query-quality features and the Google-dict feature.

    Collects the ``search_term`` column at several processing stages (raw,
    lemmatized, extracted product name, auto-corrected when that column
    exists, stemmed), runs ``QueryQuality`` on every ordered pair of earlier
    vs. later variants, saves each result under ``config.FEAT_DIR`` and logs
    its correlation with ``relevance``.  Finally runs ``IsInGoogleDict`` on
    the raw ``search_term`` column through ``StandaloneFeatureWrapper``.
    """
    logname = "generate_feature_query_quality_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # (suffix, corpus) pairs, in the order the variants are produced.
    variants = []

    frame = pkl_utils._load(config.ALL_DATA_RAW)
    variants.append(("raw", frame["search_term"].values))

    frame = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    variants.append(("lemmatized", frame["search_term"].values))
    variants.append(("product_name", frame["search_term_product_name"].values))
    if "search_term_auto_corrected" in frame.columns:
        variants.append(
            ("corrected", frame["search_term_auto_corrected"].values))

    frame = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    variants.append(("stemmed", frame["search_term"].values))

    # Labels are taken from the last table loaded (the stemmed one).
    labels = frame["relevance"].values[:TRAIN_SIZE]

    for pos, (first_suffix, first_corpus) in enumerate(variants):
        for second_suffix, second_corpus in variants[pos + 1:]:
            extractor = QueryQuality(first_corpus, second_corpus)
            feats = extractor.transform()
            dim = np_utils._dim(feats)
            fname = "%s_%s_x_%s_%dD" % (
                extractor.__name__(), first_suffix, second_suffix, dim)
            out_path = os.path.join(
                config.FEAT_DIR, fname + config.FEAT_FILE_SUFFIX)
            pkl_utils._save(out_path, feats)
            corr = np_utils._corr(feats[:TRAIN_SIZE], labels)
            logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))

    # Dictionary-membership feature on the raw search term.
    frame = pkl_utils._load(config.ALL_DATA_RAW)
    wrapper = StandaloneFeatureWrapper(
        IsInGoogleDict, frame, ["search_term"], [], config.FEAT_DIR, logger)
    wrapper.go()
def go(self):
    """Compute and save features for every (obs, target) field pair.

    ``make_transformer`` is asked for a transformer and a transformed field
    for each field; a ``None`` transformer causes the field to be skipped
    with a log line.  For each remaining pair the corpora are deduplicated,
    transformed, fed to ``self.generator``, and the estimator output is
    expanded back through ``deduplicator.reduplicate`` before being passed
    to ``self.save_feature``.  When the estimator's ``__name__()`` returns a
    list, each output column is saved as its own 1-D feature.
    """
    labels = self.dfAll['relevance'].values

    for obs_field in self.obs_fields:
        obs_transformer, obs_field_transformed = make_transformer(
            self.dfAll, obs_field)
        if obs_transformer is None:
            self.logger.info("Skip %s" % (obs_field))
            continue

        for target_field in self.target_fields:
            target_transformer, target_field_transformed = make_transformer(
                self.dfAll, target_field)
            if target_transformer is None:
                self.logger.info("Skip %s" % (target_field))
                continue

            dedup = self.make_deduplicator(
                obs_field_transformed, target_field_transformed)
            obs_unique, target_unique = dedup.deduplicate()
            obs_ready = obs_transformer(obs_unique)
            target_ready = target_transformer(target_unique)

            estimator = self.generator(
                obs_ready, target_ready, *self.param_list)
            feats = dedup.reduplicate(
                obs_unique, target_unique, estimator.transform())

            label = estimator.__name__()
            if isinstance(label, list):
                for col, feat_name in enumerate(label):
                    self.save_feature(feat_name, obs_field, target_field,
                                      1, feats[:, col], labels)
            else:
                dim = np_utils._dim(feats)
                self.save_feature(label, obs_field, target_field,
                                  dim, feats, labels)

            # Drop the per-pair intermediates before the next iteration;
            # kept from the original as a precaution against memory growth.
            del obs_unique, obs_ready, target_unique, target_ready, feats
def go(self):
    """Compute and save features for every observation field.

    ``make_transformer`` is asked for a transformer and a transformed field
    for each entry of ``self.obs_fields``; a ``None`` transformer causes the
    field to be skipped with a log line.  Otherwise the corpus is
    deduplicated, transformed, fed to ``self.generator`` (with ``None`` in
    the target slot), and the estimator output is expanded back through
    ``deduplicator.reduplicate`` before being passed to
    ``self.save_feature``.  When the estimator's ``__name__()`` returns a
    list, each output column is saved as its own 1-D feature.
    """
    labels = self.dfAll["relevance"].values

    for obs_field in self.obs_fields:
        transformer, transformed_field = make_transformer(
            self.dfAll, obs_field)
        if transformer is None:
            self.logger.info("Skip %s" % (obs_field))
            continue

        dedup = self.make_deduplicator(transformed_field, None)
        unique_corpus, _ = dedup.deduplicate()
        prepared = transformer(unique_corpus)
        estimator = self.generator(prepared, None, *self.param_list)
        feats = dedup.reduplicate(unique_corpus, None, estimator.transform())

        label = estimator.__name__()
        if not isinstance(label, list):
            dim = np_utils._dim(feats)
            self.save_feature(label, obs_field, dim, feats, labels)
            continue

        for col, feat_name in enumerate(label):
            self.save_feature(feat_name, obs_field, 1, feats[:, col], labels)
def combine(self):
    """Build the basic, "All" and per-run CV feature matrices.

    Three passes are made over stored features.  Each pass only considers
    files whose name contains ``self.feature_suffix`` and whose stem (text
    before the first ".") is a key of ``self.feature_dict``:

    1. ``config.FEAT_DIR`` fills ``self.X_train``, ``self.X_test``,
       ``self.y_train`` and ``self.id_test``.
    2. ``config.FEAT_DIR/All`` fills ``self.X_train_cv_all`` and appends its
       test rows to ``self.X_test`` (``X_train_cv_all`` is ``None`` when
       nothing was combined).
    3. ``config.FEAT_DIR/Run<k>`` for ``k = 1..self.n_iter`` fills
       ``self.X_train_cv`` (rows x columns x runs); only features accepted
       in pass 2 are considered.

    In passes 1 and 2 a 1-D feature that is not flagged mandatory is dropped
    when its absolute correlation with the training labels is below
    ``self.corr_threshold``.

    Returns:
        ``self``.
    """
    info = pkl_utils._load(config.INFO_DATA)
    info_raw = info.copy()
    labels = info["relevance"].values[:TRAIN_SIZE]

    def fetch(directory, name):
        """Load and transform one feature.

        Returns ``(values, dim, mandatory)``, or ``None`` (after logging)
        when NaNs are still present after ``np.nan_to_num``.
        """
        values = np.nan_to_num(self.load_feature(directory, name))
        if np.isnan(values).any():
            self.logger.info("%s nan"%name)
            return None
        mandatory = self.feature_dict[name][0]
        transformer = self.feature_dict[name][1]
        values = transformer.fit_transform(values)
        return values, np_utils._dim(values), mandatory

    def is_dropped(name, dim, corr, mandatory):
        """Log and return True for a weakly correlated optional feature."""
        if not mandatory and abs(corr) < self.corr_threshold:
            self.logger.info("Drop: {} ({}D) (abs corr = {}, < threshold = {})".format(
                name, dim, abs(corr), self.corr_threshold))
            return True
        return False

    # ---- pass 1: basic features ------------------------------------
    n_feat = 0
    self.logger.info("Run for basic...")
    for file_name in sorted(os.listdir(config.FEAT_DIR)):
        if self.feature_suffix not in file_name:
            continue
        fname = file_name.split(".")[0]
        if fname not in self.feature_dict:
            continue
        loaded = fetch(config.FEAT_DIR, fname)
        if loaded is None:
            continue
        feat, dim, mandatory = loaded

        if dim == 1:
            corr = np_utils._corr(feat[:TRAIN_SIZE], labels)
            if is_dropped(fname, dim, corr, mandatory):
                continue
            info[fname] = feat
            self.feature_names.append(fname)
        else:
            col_names = ["%s_%d"%(fname, k) for k in range(dim)]
            info = pd.concat(
                [info, pd.DataFrame(feat, columns=col_names)], axis=1)
            self.feature_names.extend(col_names)

        n_feat += 1
        self.feature_names_basic.append(fname)
        if dim == 1:
            self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                n_feat, len(self.feature_dict.keys()), fname, dim, corr))
        else:
            self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                n_feat, len(self.feature_dict.keys()), fname, dim))
    info.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)

    # Split the combined table at TRAIN_SIZE into train and test parts.
    train_part = info.iloc[:TRAIN_SIZE].copy()
    self.y_train = train_part["relevance"].values.astype(float)
    train_part.drop(["id","relevance"], axis=1, inplace=True)
    self.X_train = train_part.values.astype(float)

    test_part = info.iloc[TRAIN_SIZE:].copy()
    self.id_test = test_part["id"].values.astype(int)
    test_part.drop(["id","relevance"], axis=1, inplace=True)
    self.X_test = test_part.values.astype(float)

    # ---- pass 2: features under <FEAT_DIR>/All ---------------------
    announce = True
    n_cv = 0
    frame_all = info_raw.copy()
    feature_dir = "%s/All" % (config.FEAT_DIR)
    for file_name in sorted(os.listdir(feature_dir)):
        if self.feature_suffix not in file_name:
            continue
        fname = file_name.split(".")[0]
        if fname not in self.feature_dict:
            continue
        if announce:
            self.logger.info("Run for all...")
            announce = False
        loaded = fetch(feature_dir, fname)
        if loaded is None:
            continue
        feat, dim, mandatory = loaded

        if dim == 1:
            corr = np_utils._corr(feat[:TRAIN_SIZE], labels)
            if is_dropped(fname, dim, corr, mandatory):
                continue
            frame_all[fname] = feat
            self.feature_names.append(fname)
        else:
            col_names = ["%s_%d"%(fname, k) for k in range(dim)]
            frame_all = pd.concat(
                [frame_all, pd.DataFrame(feat, columns=col_names)], axis=1)
            self.feature_names.extend(col_names)

        n_cv += 1
        self.feature_names_cv.append(fname)
        if dim == 1:
            self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                n_feat+n_cv, len(self.feature_dict.keys()), fname, dim, corr))
        else:
            self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                n_feat+n_cv, len(self.feature_dict.keys()), fname, dim))

    if n_cv > 0:
        frame_all.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        matrix = frame_all.drop(["id","relevance"], axis=1).values.astype(float)
        self.X_train_cv_all = matrix[:TRAIN_SIZE]
        self.X_test = np.hstack((self.X_test, matrix[TRAIN_SIZE:]))
    else:
        self.X_train_cv_all = None
    n_feat += n_cv

    # ---- pass 3: per-run CV features -------------------------------
    announce = True
    for run in range(1, self.n_iter+1):
        n_cv = 0
        frame_run = info_raw.copy()
        feature_dir = "%s/Run%d" % (config.FEAT_DIR, run)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix not in file_name:
                continue
            fname = file_name.split(".")[0]
            if (fname not in self.feature_dict) or (fname not in self.feature_names_cv):
                continue
            if announce:
                self.logger.info("Run for cv...")
                announce = False
            if n_cv == 0:
                self.logger.info("Run %d"%run)
            loaded = fetch(feature_dir, fname)
            if loaded is None:
                continue
            feat, dim, _ = loaded

            if dim == 1:
                frame_run[fname] = feat
            else:
                col_names = ["%s_%d"%(fname, k) for k in range(dim)]
                frame_run = pd.concat(
                    [frame_run, pd.DataFrame(feat, columns=col_names)], axis=1)
            n_cv += 1
            self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                n_feat+n_cv, len(self.feature_dict.keys()), fname, dim))

        if n_cv > 0:
            frame_run.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            train_run = frame_run.iloc[:TRAIN_SIZE].copy()
            matrix = train_run.drop(["id","relevance"], axis=1).values.astype(float)
            if run == 1:
                # The 3-D buffer is allocated from the first run's shape.
                self.X_train_cv = np.zeros(
                    (matrix.shape[0], matrix.shape[1], self.n_iter), dtype=float)
            self.X_train_cv[:,:,run-1] = matrix

    # n_cv holds the count of the last pass executed above.
    if n_cv == 0:
        self.X_train_cv = None
        self.basic_only = 1

    # ---- report ----------------------------------------------------
    if self.basic_only:
        self.logger.info("Overall Shape: %d x %d"%(len(self.y_train), self.X_train.shape[1]))
    else:
        self.logger.info("Overall Shape: %d x %d"%(
            len(self.y_train), self.X_train.shape[1]+self.X_train_cv_all.shape[1]))
    self.logger.info("Done combinning.")

    return self
def combine(self):
    """Combine optional meta features with stored model predictions.

    Steps, in order:

    1. When ``self.meta_feature_dict`` is truthy, a ``Combiner`` is run over
       it and its train/test matrices and feature names are copied to
       ``self.X_train_basic`` / ``self.X_test_basic`` /
       ``self.feature_names_basic``; otherwise both matrices are ``None``.
    2. Test predictions (``test.pred.<MODEL>``) and, when loadable,
       probabilities (``test.proba.<MODEL>``) are read from
       ``config.OUTPUT_DIR/All`` for every model in ``self.feature_list``
       and stored in ``self.X_test``.
    3. For each run ``1..self.n_iter`` the validation predictions and
       probabilities are read from ``config.OUTPUT_DIR/Run<k>`` and stored
       in ``self.X_train_cv[k-1]`` / ``self.y_train_cv[k-1]``; the row
       indices used are recorded in ``self.splitter_prev[k-1]``.

    Returns:
        ``self``.

    Raises:
        ValueError: if ``self.feature_level`` is neither 2 nor 3 (raised on
            the first run iteration).
    """
    # combine meta features
    if self.meta_feature_dict:
        cb = Combiner(feature_dict=self.meta_feature_dict,
                      feature_name=self.feature_name,
                      feature_suffix=".pkl",
                      corr_threshold=self.corr_threshold)
        cb.combine()
        self.X_train_basic = cb.X_train
        self.X_test_basic = cb.X_test
        self.feature_names_basic = cb.feature_names_basic
        self.feature_names.extend(cb.feature_names)
    else:
        self.X_train_basic = None
        self.X_test_basic = None

    # combine other features
    dfAll = pkl_utils._load(config.INFO_DATA)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
    dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
    self.id_test = dfTest["id"].values.astype(int)

    ## all
    first = True
    feat_cnt = 0
    feature_dir = "%s/All" % (config.OUTPUT_DIR)
    for file_name in sorted(os.listdir(feature_dir)):
        if self.feature_suffix not in file_name:
            continue
        ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
        fname = file_name.split(".")[2]
        if fname not in self.feature_list:
            continue
        if first:
            self.logger.info("Run for all...")
            first = False
        # load prediction
        x = self.load_feature(feature_dir, "test.pred."+fname)
        x = np.nan_to_num(x)
        dim = np_utils._dim(x)
        dfTest[fname] = x
        feat_cnt += 1
        self.feature_names_cv.append(fname)
        self.feature_names.append(fname)
        self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
            feat_cnt, len(self.feature_list), fname, dim))
        # load probability if any
        try:
            x = self.load_feature(feature_dir, "test.proba."+fname,
                                  columns=None, columns_pattern="proba")
            x = np.nan_to_num(x)
            dim = np_utils._dim(x)
            for i in range(dim):
                dfTest["%s_proba%d"%(fname, i)] = x[:,i]
            self.logger.info("Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                feat_cnt, len(self.feature_list), fname, dim))
            self.feature_names.extend(["%s_proba%d"%(fname, i) for i in range(dim)])
        except Exception:
            # Probabilities are optional, so failures here are ignored on
            # purpose.  Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            pass

    dfTest.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
    self.X_test = dfTest.drop(["id","relevance"], axis=1).values.astype(float)
    if self.meta_feature_dict:
        self.X_test = np.hstack([self.X_test_basic, self.X_test])

    ## for cv features
    first = True
    for run in range(1,self.n_iter+1):
        feat_cnt = 0
        idx1 = splitter_level1[run-1][1]
        idx2 = splitter_level2[run-1][1]
        if self.feature_level == 2:
            idx = idx1
        elif self.feature_level == 3:
            # idx2 holds positions into idx1; map them back through idx1.
            idx = [ idx1[i] for i in idx2 ]
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on `idx`; fail with a clear message instead.
            raise ValueError(
                "Unsupported feature_level: %r (expected 2 or 3)"
                % (self.feature_level,))
        self.splitter_prev[run-1] = idx
        dfTrain_cv = dfTrain.iloc[idx].copy()
        feature_dir = "%s/Run%d" % (config.OUTPUT_DIR, run)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix not in file_name:
                continue
            ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
            fname = file_name.split(".")[2]
            if (fname not in self.feature_list) or (fname not in self.feature_names_cv):
                continue
            if first:
                self.logger.info("Run for cv...")
                first = False
            if feat_cnt == 0:
                self.logger.info("Run %d"%run)
            # load prediction
            x = self.load_feature(feature_dir, "valid.pred."+fname)
            x = np.nan_to_num(x)
            dim = np_utils._dim(x)
            # also including level 1 models' predictions: when the stored
            # array has more rows than idx, keep only the idx2 rows
            if x.shape[0] > len(idx):
                x = x[idx2]
            dfTrain_cv[fname] = x
            feat_cnt += 1
            self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                feat_cnt, len(self.feature_list), fname, dim))
            # load probability if any
            try:
                x = self.load_feature(feature_dir, "valid.proba."+fname,
                                      columns=None, columns_pattern="proba")
                x = np.nan_to_num(x)
                dim = np_utils._dim(x)
                # also including level 1 models' predictions
                if x.shape[0] > len(idx):
                    x = x[idx2]
                for i in range(dim):
                    dfTrain_cv["%s_proba%d"%(fname, i)] = x[:,i]
                self.logger.info("Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                    feat_cnt, len(self.feature_list), fname, dim))
            except Exception:
                # Optional probabilities: ignored on purpose, but without
                # swallowing KeyboardInterrupt / SystemExit.
                pass

        dfTrain_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        if run == 1:
            # Per-run containers are allocated on the first run.
            self.y_train_cv = [0]*self.n_iter
            self.X_train_cv = [0]*self.n_iter
        self.y_train_cv[run-1] = dfTrain_cv["relevance"].values.astype(float)
        self.X_train_cv[run-1] = dfTrain_cv.drop(["id","relevance"], axis=1).values.astype(float)

    # report the shape of the last run
    if self.has_basic:
        self.logger.info("Overall Shape: %d x %d"%(
            len(self.y_train_cv[self.n_iter-1]),
            self.X_train_basic.shape[1] + self.X_train_cv[self.n_iter-1].shape[1]))
    else:
        self.logger.info("Overall Shape: %d x %d"%(
            len(self.y_train_cv[self.n_iter-1]),
            self.X_train_cv[self.n_iter-1].shape[1]))
    self.logger.info("Done combinning.")

    return self
def combine(self):
    """Build the basic, "All" and per-run CV feature matrices.

    Three passes are made over stored features.  Each pass only considers
    files whose name contains ``self.feature_suffix`` and whose stem (text
    before the first ".") is a key of ``self.feature_dict``:

    1. ``config.FEAT_DIR`` fills ``self.X_train``, ``self.X_test``,
       ``self.y_train`` and ``self.id_test``.
    2. ``config.FEAT_DIR/All`` fills ``self.X_train_cv_all`` and appends its
       test rows to ``self.X_test`` (``X_train_cv_all`` is ``None`` when
       nothing was combined).
    3. ``config.FEAT_DIR/Run<k>`` for ``k = 1..self.n_iter`` fills
       ``self.X_train_cv`` (rows x columns x runs); only features accepted
       in pass 2 are considered.

    In passes 1 and 2 a 1-D feature that is not flagged mandatory is dropped
    when its absolute correlation with the training labels is below
    ``self.corr_threshold``.

    Returns:
        ``self``.
    """
    info = pkl_utils._load(config.INFO_DATA)
    info_raw = info.copy()
    labels = info["relevance"].values[:TRAIN_SIZE]

    def fetch(directory, name):
        """Load and transform one feature.

        Returns ``(values, dim, mandatory)``, or ``None`` (after logging)
        when NaNs are still present after ``np.nan_to_num``.
        """
        values = np.nan_to_num(self.load_feature(directory, name))
        if np.isnan(values).any():
            self.logger.info("%s nan" % name)
            return None
        mandatory = self.feature_dict[name][0]
        transformer = self.feature_dict[name][1]
        values = transformer.fit_transform(values)
        return values, np_utils._dim(values), mandatory

    def is_dropped(name, dim, corr, mandatory):
        """Log and return True for a weakly correlated optional feature."""
        if not mandatory and abs(corr) < self.corr_threshold:
            self.logger.info(
                "Drop: {} ({}D) (abs corr = {}, < threshold = {})".format(
                    name, dim, abs(corr), self.corr_threshold))
            return True
        return False

    # ---- pass 1: basic features ------------------------------------
    n_feat = 0
    self.logger.info("Run for basic...")
    for file_name in sorted(os.listdir(config.FEAT_DIR)):
        if self.feature_suffix not in file_name:
            continue
        fname = file_name.split(".")[0]
        if fname not in self.feature_dict:
            continue
        loaded = fetch(config.FEAT_DIR, fname)
        if loaded is None:
            continue
        feat, dim, mandatory = loaded

        if dim == 1:
            corr = np_utils._corr(feat[:TRAIN_SIZE], labels)
            if is_dropped(fname, dim, corr, mandatory):
                continue
            info[fname] = feat
            self.feature_names.append(fname)
        else:
            col_names = ["%s_%d" % (fname, k) for k in range(dim)]
            info = pd.concat(
                [info, pd.DataFrame(feat, columns=col_names)], axis=1)
            self.feature_names.extend(col_names)

        n_feat += 1
        self.feature_names_basic.append(fname)
        if dim == 1:
            self.logger.info(
                "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                    n_feat, len(self.feature_dict.keys()), fname, dim, corr))
        else:
            self.logger.info(
                "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    n_feat, len(self.feature_dict.keys()), fname, dim))
    info.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)

    # Split the combined table at TRAIN_SIZE into train and test parts.
    train_part = info.iloc[:TRAIN_SIZE].copy()
    self.y_train = train_part["relevance"].values.astype(float)
    train_part.drop(["id", "relevance"], axis=1, inplace=True)
    self.X_train = train_part.values.astype(float)

    test_part = info.iloc[TRAIN_SIZE:].copy()
    self.id_test = test_part["id"].values.astype(int)
    test_part.drop(["id", "relevance"], axis=1, inplace=True)
    self.X_test = test_part.values.astype(float)

    # ---- pass 2: features under <FEAT_DIR>/All ---------------------
    announce = True
    n_cv = 0
    frame_all = info_raw.copy()
    feature_dir = "%s/All" % (config.FEAT_DIR)
    for file_name in sorted(os.listdir(feature_dir)):
        if self.feature_suffix not in file_name:
            continue
        fname = file_name.split(".")[0]
        if fname not in self.feature_dict:
            continue
        if announce:
            self.logger.info("Run for all...")
            announce = False
        loaded = fetch(feature_dir, fname)
        if loaded is None:
            continue
        feat, dim, mandatory = loaded

        if dim == 1:
            corr = np_utils._corr(feat[:TRAIN_SIZE], labels)
            if is_dropped(fname, dim, corr, mandatory):
                continue
            frame_all[fname] = feat
            self.feature_names.append(fname)
        else:
            col_names = ["%s_%d" % (fname, k) for k in range(dim)]
            frame_all = pd.concat(
                [frame_all, pd.DataFrame(feat, columns=col_names)], axis=1)
            self.feature_names.extend(col_names)

        n_cv += 1
        self.feature_names_cv.append(fname)
        if dim == 1:
            self.logger.info(
                "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                    n_feat + n_cv, len(self.feature_dict.keys()),
                    fname, dim, corr))
        else:
            self.logger.info(
                "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    n_feat + n_cv, len(self.feature_dict.keys()),
                    fname, dim))

    if n_cv > 0:
        frame_all.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        matrix = frame_all.drop(
            ["id", "relevance"], axis=1).values.astype(float)
        self.X_train_cv_all = matrix[:TRAIN_SIZE]
        self.X_test = np.hstack((self.X_test, matrix[TRAIN_SIZE:]))
    else:
        self.X_train_cv_all = None
    n_feat += n_cv

    # ---- pass 3: per-run CV features -------------------------------
    announce = True
    for run in range(1, self.n_iter + 1):
        n_cv = 0
        frame_run = info_raw.copy()
        feature_dir = "%s/Run%d" % (config.FEAT_DIR, run)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix not in file_name:
                continue
            fname = file_name.split(".")[0]
            if (fname not in self.feature_dict) or (
                    fname not in self.feature_names_cv):
                continue
            if announce:
                self.logger.info("Run for cv...")
                announce = False
            if n_cv == 0:
                self.logger.info("Run %d" % run)
            loaded = fetch(feature_dir, fname)
            if loaded is None:
                continue
            feat, dim, _ = loaded

            if dim == 1:
                frame_run[fname] = feat
            else:
                col_names = ["%s_%d" % (fname, k) for k in range(dim)]
                frame_run = pd.concat(
                    [frame_run, pd.DataFrame(feat, columns=col_names)],
                    axis=1)
            n_cv += 1
            self.logger.info(
                "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    n_feat + n_cv, len(self.feature_dict.keys()),
                    fname, dim))

        if n_cv > 0:
            frame_run.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            train_run = frame_run.iloc[:TRAIN_SIZE].copy()
            matrix = train_run.drop(
                ["id", "relevance"], axis=1).values.astype(float)
            if run == 1:
                # The 3-D buffer is allocated from the first run's shape.
                self.X_train_cv = np.zeros(
                    (matrix.shape[0], matrix.shape[1], self.n_iter),
                    dtype=float)
            self.X_train_cv[:, :, run - 1] = matrix

    # n_cv holds the count of the last pass executed above.
    if n_cv == 0:
        self.X_train_cv = None
        self.basic_only = 1

    # ---- report ----------------------------------------------------
    if self.basic_only:
        self.logger.info("Overall Shape: %d x %d" %
                         (len(self.y_train), self.X_train.shape[1]))
    else:
        self.logger.info("Overall Shape: %d x %d" %
                         (len(self.y_train), self.X_train.shape[1] +
                          self.X_train_cv_all.shape[1]))
    self.logger.info("Done combinning.")

    return self
def combine(self):
    """Combine optional meta features with stored model predictions.

    Steps, in order:

    1. When ``self.meta_feature_dict`` is truthy, a ``Combiner`` is run over
       it and its train/test matrices and feature names are copied to
       ``self.X_train_basic`` / ``self.X_test_basic`` /
       ``self.feature_names_basic``; otherwise both matrices are ``None``.
    2. Test predictions (``test.pred.<MODEL>``) and, when loadable,
       probabilities (``test.proba.<MODEL>``) are read from
       ``config.OUTPUT_DIR/All`` for every model in ``self.feature_list``
       and stored in ``self.X_test``.
    3. For each run ``1..self.n_iter`` the validation predictions and
       probabilities are read from ``config.OUTPUT_DIR/Run<k>`` and stored
       in ``self.X_train_cv[k-1]`` / ``self.y_train_cv[k-1]``; the row
       indices used are recorded in ``self.splitter_prev[k-1]``.

    Returns:
        ``self``.

    Raises:
        ValueError: if ``self.feature_level`` is neither 2 nor 3 (raised on
            the first run iteration).
    """
    # combine meta features
    if self.meta_feature_dict:
        cb = Combiner(feature_dict=self.meta_feature_dict,
                      feature_name=self.feature_name,
                      feature_suffix=".pkl",
                      corr_threshold=self.corr_threshold)
        cb.combine()
        self.X_train_basic = cb.X_train
        self.X_test_basic = cb.X_test
        self.feature_names_basic = cb.feature_names_basic
        self.feature_names.extend(cb.feature_names)
    else:
        self.X_train_basic = None
        self.X_test_basic = None

    # combine other features
    dfAll = pkl_utils._load(config.INFO_DATA)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
    dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
    self.id_test = dfTest["id"].values.astype(int)

    ## all
    first = True
    feat_cnt = 0
    feature_dir = "%s/All" % (config.OUTPUT_DIR)
    for file_name in sorted(os.listdir(feature_dir)):
        if self.feature_suffix not in file_name:
            continue
        ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
        fname = file_name.split(".")[2]
        if fname not in self.feature_list:
            continue
        if first:
            self.logger.info("Run for all...")
            first = False
        # load prediction
        x = self.load_feature(feature_dir, "test.pred." + fname)
        x = np.nan_to_num(x)
        dim = np_utils._dim(x)
        dfTest[fname] = x
        feat_cnt += 1
        self.feature_names_cv.append(fname)
        self.feature_names.append(fname)
        self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
            feat_cnt, len(self.feature_list), fname, dim))
        # load probability if any
        try:
            x = self.load_feature(feature_dir, "test.proba." + fname,
                                  columns=None, columns_pattern="proba")
            x = np.nan_to_num(x)
            dim = np_utils._dim(x)
            for i in range(dim):
                dfTest["%s_proba%d" % (fname, i)] = x[:, i]
            self.logger.info(
                "Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                    feat_cnt, len(self.feature_list), fname, dim))
            self.feature_names.extend(
                ["%s_proba%d" % (fname, i) for i in range(dim)])
        except Exception:
            # Probabilities are optional, so failures here are ignored on
            # purpose.  Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            pass

    dfTest.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
    self.X_test = dfTest.drop(["id", "relevance"],
                              axis=1).values.astype(float)
    if self.meta_feature_dict:
        self.X_test = np.hstack([self.X_test_basic, self.X_test])

    ## for cv features
    first = True
    for run in range(1, self.n_iter + 1):
        feat_cnt = 0
        idx1 = splitter_level1[run - 1][1]
        idx2 = splitter_level2[run - 1][1]
        if self.feature_level == 2:
            idx = idx1
        elif self.feature_level == 3:
            # idx2 holds positions into idx1; map them back through idx1.
            idx = [idx1[i] for i in idx2]
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on `idx`; fail with a clear message instead.
            raise ValueError(
                "Unsupported feature_level: %r (expected 2 or 3)"
                % (self.feature_level,))
        self.splitter_prev[run - 1] = idx
        dfTrain_cv = dfTrain.iloc[idx].copy()
        feature_dir = "%s/Run%d" % (config.OUTPUT_DIR, run)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix not in file_name:
                continue
            ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
            fname = file_name.split(".")[2]
            if (fname not in self.feature_list) or (
                    fname not in self.feature_names_cv):
                continue
            if first:
                self.logger.info("Run for cv...")
                first = False
            if feat_cnt == 0:
                self.logger.info("Run %d" % run)
            # load prediction
            x = self.load_feature(feature_dir, "valid.pred." + fname)
            x = np.nan_to_num(x)
            dim = np_utils._dim(x)
            # also including level 1 models' predictions: when the stored
            # array has more rows than idx, keep only the idx2 rows
            if x.shape[0] > len(idx):
                x = x[idx2]
            dfTrain_cv[fname] = x
            feat_cnt += 1
            self.logger.info(
                "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    feat_cnt, len(self.feature_list), fname, dim))
            # load probability if any
            try:
                x = self.load_feature(feature_dir, "valid.proba." + fname,
                                      columns=None, columns_pattern="proba")
                x = np.nan_to_num(x)
                dim = np_utils._dim(x)
                # also including level 1 models' predictions
                if x.shape[0] > len(idx):
                    x = x[idx2]
                for i in range(dim):
                    dfTrain_cv["%s_proba%d" % (fname, i)] = x[:, i]
                self.logger.info(
                    "Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                        feat_cnt, len(self.feature_list), fname, dim))
            except Exception:
                # Optional probabilities: ignored on purpose, but without
                # swallowing KeyboardInterrupt / SystemExit.
                pass

        dfTrain_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        if run == 1:
            # Per-run containers are allocated on the first run.
            self.y_train_cv = [0] * self.n_iter
            self.X_train_cv = [0] * self.n_iter
        self.y_train_cv[run - 1] = dfTrain_cv["relevance"].values.astype(float)
        self.X_train_cv[run - 1] = dfTrain_cv.drop(
            ["id", "relevance"], axis=1).values.astype(float)

    # report the shape of the last run
    if self.has_basic:
        self.logger.info("Overall Shape: %d x %d" %
                         (len(self.y_train_cv[self.n_iter - 1]),
                          self.X_train_basic.shape[1] +
                          self.X_train_cv[self.n_iter - 1].shape[1]))
    else:
        self.logger.info("Overall Shape: %d x %d" %
                         (len(self.y_train_cv[self.n_iter - 1]),
                          self.X_train_cv[self.n_iter - 1].shape[1]))
    self.logger.info("Done combinning.")

    return self
def combine(self):
    """Combine stored features into a single design matrix.

    Reads the info table from ``config.INFO_DATA`` and, for every file in
    ``config.FEAT_DIR`` whose name contains ``config.FEAT_FILE_SUFFIX`` and
    whose stem is a key of ``self.feature_dict``, loads the feature, applies
    the transformer registered for it and appends it as one or more columns.
    A 1-D feature that is not flagged mandatory is dropped when its
    correlation with ``relevance`` is NaN or its absolute value is below
    ``self.corr_threshold``.

    On completion the following attributes are set: ``self.y`` (relevance),
    ``self.weights`` (``weight`` column), ``self.query_ids``
    (``norm_query_id`` column) and ``self.X`` (all remaining columns as
    floats).  Accepted names are appended to ``self.feature_names`` and
    ``self.feature_names_basic``.
    """
    # The original also made a full `.copy()` of the table here that was
    # never read; it has been removed to avoid duplicating the data.
    dfAll = table_utils._read(config.INFO_DATA)
    y = dfAll['relevance'].values

    feat_cnt = 0
    self.logger.info('Run for basic...')
    for file_name in sorted(os.listdir(config.FEAT_DIR)):
        if config.FEAT_FILE_SUFFIX not in file_name:
            continue
        fname = os.path.splitext(file_name)[0]
        if fname not in self.feature_dict:
            continue

        x = self.load_feature(config.FEAT_DIR, fname)
        x = np.nan_to_num(x)  # Still necessary?
        # NOTE(review): after nan_to_num this check looks unreachable for
        # float input; kept as-is pending confirmation.
        if np.isnan(x).any():
            self.logger.info("%s nan" % (fname))
            continue

        # feature_dict values are indexed as (mandatory, transformer).
        mandatory = self.feature_dict[fname][0]
        transformer = self.feature_dict[fname][1]
        x = transformer.fit_transform(x)
        dim = np_utils._dim(x)

        if dim == 1:
            corr = np_utils._corr(x, y)
            if not mandatory and (np.isnan(corr)
                                  or abs(corr) < self.corr_threshold):
                self.logger.info(
                    "Drop: {} ({}D) (abs_corr = {}, < threshold {})".
                    format(fname, dim, abs(corr), self.corr_threshold))
                continue
            dfAll[fname] = x
            self.feature_names.append(fname)
        else:
            # Use a dedicated loop variable so the feature array `x` is not
            # shadowed by the column index.
            columns = ["%s_%d" % (fname, col) for col in range(dim)]
            df = pd.DataFrame(x, columns=columns)
            dfAll = pd.concat([dfAll, df], axis=1)
            self.feature_names.extend(columns)

        feat_cnt += 1
        self.feature_names_basic.append(fname)
        if dim == 1:
            self.logger.info(
                "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                    feat_cnt, len(self.feature_dict.keys()), fname, dim,
                    corr))
        else:
            self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                feat_cnt, len(self.feature_dict.keys()), fname, dim))

    dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)

    # Pull out label / weight / query id, then everything left is X.
    self.y = dfAll["relevance"].values.astype(float)
    self.weights = dfAll['weight'].values
    self.query_ids = dfAll['norm_query_id'].values
    dfAll.drop(["relevance", "weight", "norm_query_id"], axis=1,
               inplace=True)
    self.X = dfAll.values.astype(float)

    self.logger.info("Overall Shape: %d x %d" % (len(self.y),
                                                 self.X.shape[1]))
    self.logger.info("Done combining")