def read_and_save_features(
    self,
    train_table_name: str,
    test_table_name: str,
    train_output_path: str,
    test_output_path: str,
) -> None:
    df_train_input = self._read_from_bigquery(train_table_name)
    df_test_input = self._read_from_bigquery(test_table_name)
    df_train_features, df_test_features = self.make_features(
        df_train_input, df_test_input)
    assert (df_train_input.shape[0] == df_train_features.shape[0]
            ), "generated train features are not compatible with the input table"
    assert (df_test_input.shape[0] == df_test_features.shape[0]
            ), "generated test features are not compatible with the input table"

    df_train_features.columns = f"{self.name}_" + df_train_features.columns
    df_test_features.columns = f"{self.name}_" + df_test_features.columns

    if self.save_memory:
        self._logger.info("Reduce memory size - train data")
        df_train_features = reduce_mem_usage(df_train_features)
        self._logger.info("Reduce memory size - test data")
        df_test_features = reduce_mem_usage(df_test_features)

    self._logger.info(f"Saving features to {train_output_path}")
    df_train_features.to_feather(train_output_path)
    self._logger.info(f"Saving features to {test_output_path}")
    df_test_features.to_feather(test_output_path)
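# All of the snippets in this collection call a `reduce_mem_usage` helper that is
# defined elsewhere. The function below is only a minimal sketch of what such a
# helper typically does (downcast numeric columns to the smallest dtype that fits),
# not the authors' implementation. Calling conventions also differ between snippets:
# some expect a DataFrame back, some unpack a tuple (`df, _ = reduce_mem_usage(df)`),
# and some rely on in-place modification; this sketch covers the single-return form.
import pandas as pd


def reduce_mem_usage(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Minimal sketch: downcast numeric columns to shrink memory usage."""
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased from {:.2f} MB to {:.2f} MB ({:.1f}% reduction)'.format(
            start_mem, end_mem, 100 * (start_mem - end_mem) / max(start_mem, 1e-9)))
    return df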
def read_data():
    print('\n\nRunning read_data')

    # date, wm_yr_wk, weekday, wday, month, year, d, event_name_1, event_type_1, snap_CA, snap_TX, snap_WI
    calendar_df = pd.read_csv('./input/calendar.csv')
    calendar_df = reduce_mem_usage(calendar_df)
    print('Calendar has {} rows and {} columns'.format(calendar_df.shape[0], calendar_df.shape[1]))

    # 'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1' ... 'd_1941'
    # only 'id' is unique (same key as the submission file), ~30k rows
    sales_train_evaluation_df = pd.read_csv('./input/sales_train_evaluation.csv')
    print('Sales train evaluation has {} rows and {} columns'.format(
        sales_train_evaluation_df.shape[0], sales_train_evaluation_df.shape[1]))

    # store_id, item_id, wm_yr_wk, sell_price
    # no unique key, ~6M rows
    sell_prices_df = pd.read_csv('./input/sell_prices.csv')
    sell_prices_df = reduce_mem_usage(sell_prices_df)
    print('Sell prices has {} rows and {} columns'.format(sell_prices_df.shape[0], sell_prices_df.shape[1]))

    submission_df = pd.read_csv('./input/sample_submission.csv')

    calendar_df = encode_categorical(
        calendar_df, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
    ).pipe(reduce_mem_usage)
    sales_train_evaluation_df = encode_categorical(
        sales_train_evaluation_df, ["item_id", "dept_id", "cat_id", "store_id", "state_id"]
    ).pipe(reduce_mem_usage)
    sell_prices_df = encode_categorical(
        sell_prices_df, ["item_id", "store_id"]
    ).pipe(reduce_mem_usage)

    # PICKLES
    calendar_df.to_pickle('./data/calendar_df.pkl.compress', compression="gzip")
    sales_train_evaluation_df.to_pickle('./data/sales_train_evaluation_df.pkl.compress', compression="gzip")
    sell_prices_df.to_pickle('./data/sell_prices_df.pkl.compress', compression="gzip")

    return calendar_df, sell_prices_df, sales_train_evaluation_df, submission_df
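# `encode_categorical` in read_data() above is not defined in this collection. A
# minimal sketch is given below, assuming it simply integer-encodes the listed
# columns while keeping missing values as NaN; the real helper may differ.
import numpy as np
import pandas as pd


def encode_categorical(df: pd.DataFrame, cols) -> pd.DataFrame:
    """Minimal sketch: integer-encode `cols`, preserving NaN."""
    for col in cols:
        codes, _ = pd.factorize(df[col])  # pd.factorize maps NaN to -1
        df[col] = pd.Series(codes, index=df.index).replace(-1, np.nan)
    return df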
def extract_features(train_df):
    identity_df = pd.read_csv(config.DATA_PATH + 'train_identity.csv')

    target = train_df['isFraud']
    train_df.drop(['isFraud'], axis=1, inplace=True)
    train_df = train_df.merge(identity_df, on='TransactionID', how='left')

    handl_P_emaildomain(train_df)
    handle_NaN(train_df)
    transfer_cat_2_int(train_df)
    drop_corr_column(train_df)
    reduce_mem_usage(train_df)

    # del train, df
    # gc.collect()
    # return X_train, y_train, test

    X = train_df  # .to_numpy()
    y = target  # .to_numpy()
    return X, y
def fe(df, path):
    # time delta
    df['AvSigVersion-m-Census_OSVersion'] = (df['AvSigVersion'] - df['Census_OSVersion']).dt.seconds
    df['Census_OSVersion-m-OsBuildLab'] = (df['Census_OSVersion'] - df['OsBuildLab']).dt.seconds

    # min max
    col = ['AvSigVersion', 'Census_OSVersion', 'OsBuildLab']
    df['date_min'] = df[col].min(1)
    df['date_max'] = df[col].max(1)
    df['date_max-m-min'] = (df['date_max'] - df['date_min']).dt.seconds

    # from max
    d_max = df[col].max().max()
    df['max-m-AvSigVersion'] = (d_max - df['AvSigVersion']).dt.seconds
    df['max-m-Census_OSVersion'] = (d_max - df['Census_OSVersion']).dt.seconds
    df['max-m-OsBuildLab'] = (d_max - df['OsBuildLab']).dt.seconds

    # save dt
    df[['AvSigVersion', 'Census_OSVersion', 'OsBuildLab']].to_feather(path)

    # into int64 and rank
    for c in ['AvSigVersion', 'Census_OSVersion', 'OsBuildLab', 'date_min', 'date_max']:
        df[c] = df[c].astype(np.int64) // 10**9
        df.loc[df[c] < 0, c] = np.nan
        df[c] = df[c].rank(pct=True)

    utils.reduce_mem_usage(df)
    return
def multi(args):
    c, outpath_tr, outpath_te = args
    tr_f = pd.DataFrame(index=tr.index)
    te_f = pd.DataFrame(index=te.index)

    # count train + test
    di_tr = frequency_encoding(c, False, True)
    di_trte = frequency_encoding(c, True, True)
    tr_f[c + '_ts'] = tr[c].map(lambda x: di_tr.get(x, np.nan))
    te_f[c + '_ts'] = te[c].map(lambda x: di_trte.get(x, np.nan))

    # count train + test and na
    di_tr = frequency_encoding(c, False, False)
    di_trte = frequency_encoding(c, True, False)
    tr_f[c + '_ts_na'] = tr[c].map(lambda x: di_tr.get(x, np.nan))
    te_f[c + '_ts_na'] = te[c].map(lambda x: di_trte.get(x, np.nan))

    utils.reduce_mem_usage(tr_f)
    utils.reduce_mem_usage(te_f)

    # output
    tr_f.add_prefix(PREF + '_').to_feather(outpath_tr)
    te_f.add_prefix(PREF + '_').to_feather(outpath_te)
    return
def transform(self, df):
    with Timer('features.FeatureGenerator.transform', verbose=self.verbose):
        # Hand Written Features
        simple_feature_generator = SimpleFeatureGenerator(
            numeric=self.numeric, verbose=self.verbose)
        df_features = pd.concat(
            [df, simple_feature_generator.fit_transform(df)], axis=1)
        df_features = reduce_mem_usage(df_features)

        # 1-st level
        features = self.numeric + simple_feature_generator.get_feature_names()
        df_features = pd.concat([
            df_features,
            GroupAggregatedFeatureGenerator(
                features, verbose=self.verbose).fit_transform(df_features),
        ], axis=1)
        df_features = reduce_mem_usage(df_features)

        if self.created_features is None:
            # keep only the columns this generator added on top of the input frame
            self.created_features = [
                col for col in df_features.columns if col not in df.columns
            ]
        else:
            # TODO: test
            pass

        return df_features
def multi(args):
    c, outpath_tr, outpath_te = args
    tr_f = pd.DataFrame(index=tr.index)
    te_f = pd.DataFrame(index=te.index)

    di = frequency_encoding(c, False, False)
    tr_f[c + '_trte_00'] = tr[c].map(lambda x: di.get(x, np.nan))
    te_f[c + '_trte_00'] = te[c].map(lambda x: di.get(x, np.nan))

    di = frequency_encoding(c, False, True)
    tr_f[c + '_trte_01'] = tr[c].map(lambda x: di.get(x, np.nan))
    te_f[c + '_trte_01'] = te[c].map(lambda x: di.get(x, np.nan))

    di = frequency_encoding(c, True, False)
    tr_f[c + '_trte_10'] = tr[c].map(lambda x: di.get(x, np.nan))
    te_f[c + '_trte_10'] = te[c].map(lambda x: di.get(x, np.nan))

    di = frequency_encoding(c, True, True)
    tr_f[c + '_trte_11'] = tr[c].map(lambda x: di.get(x, np.nan))
    te_f[c + '_trte_11'] = te[c].map(lambda x: di.get(x, np.nan))

    utils.reduce_mem_usage(tr_f)
    utils.reduce_mem_usage(te_f)

    # output
    tr_f.add_prefix(PREF + '_').to_feather(outpath_tr)
    te_f.add_prefix(PREF + '_').to_feather(outpath_te)
    return
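# The two `multi(args)` snippets above map each category value through a dict
# returned by `frequency_encoding(c, flag1, flag2)`, which is not shown here. The
# sketch below is an assumption based on the variable names and comments: the first
# flag pools the global test frame `te` with the train frame `tr`, and the second
# controls whether NaN is dropped before counting. The real helper (and the exact
# meaning of the flags) may differ.
import pandas as pd


def frequency_encoding(col: str, include_test: bool, dropna: bool) -> dict:
    """Minimal sketch: map each category value of `col` to its occurrence count."""
    # `tr` and `te` are assumed to be module-level DataFrames, as in the callers.
    s = pd.concat([tr[col], te[col]]) if include_test else tr[col]
    return s.value_counts(dropna=dropna).to_dict()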
def gen_level_aggs(col, updata=False):
    feat_path = os.path.join(feats_root, 'level_aggs_{}.pkl'.format(col))
    if os.path.exists(feat_path) and updata == False:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        dfal = get_nominal_dfal()[[col, 'da'] + level_cols]
        dmax = dfal.da.max()
        dmin = dfal.da.min()
        level_agg = None
        for da in sorted(dfal.da.unique())[1:]:
            da_agg = None
            for win_das in [1, 2, 3]:
                if da - win_das < dmin:
                    continue
                agg = gen_level_agg_features(dfal, da, win_das, col)
                print('Generated {} {} {}'.format(col, da, win_das))
                if da_agg is None:
                    da_agg = agg
                else:
                    da_agg = da_agg.merge(agg, how='outer')
            if level_agg is None:
                level_agg = da_agg
            else:
                level_agg = pd.concat([level_agg, da_agg], axis=0)
        level_agg.fillna(0, inplace=True)
        level_agg, _ = reduce_mem_usage(level_agg)
        print(level_agg.shape)
        dump_pickle(level_agg, feat_path)
def data_loader():
    train = pd.read_csv("data/")
    test = pd.read_csv("data/")
    train2 = utils.reduce_mem_usage(train)
    train2.to_csv("data/train2.csv")
    test2 = utils.reduce_mem_usage(test)
    test2.to_csv("data/test2.csv")
    exit()
    return train, test
def fe(df):
    df['AvSigVersion'] = (df['AvSigVersion'] - date_min).dt.days
    df['AvSigVersion'] = df['AvSigVersion'] // 7
    df.rename(columns={'AvSigVersion': 'key'}, inplace=True)

    df = pd.merge(df[['key']], report, on='key', how='left')
    del df['key']

    df = df.rank(pct=True)
    utils.reduce_mem_usage(df)
    return df
def multi_te(args):
    cat, outpath = args
    tbl = te.groupby(cat).agg(num_agg)
    tbl.columns = [f'{"-".join(cat)}_{i}_{j}' for i, j in tbl.columns]
    tbl.reset_index(inplace=True)

    # `cat` is a list of key columns, so select and drop it as a list
    te_f = pd.merge(te[cat], tbl, on=cat, how='left')
    te_f.drop(columns=cat, inplace=True)

    utils.reduce_mem_usage(te_f)
    te_f.add_prefix(PREF + '_').to_feather(outpath)
    return
def load_data(file_name, directory='../input/', sample_size=None, normilize_names=True):
    """
    Load data from a .csv file.
    Optionally transform column names from CamelCase to _underscore notation.

    :param file_name: file name
    :param directory: path to the directory with the file
    :param sample_size: number of rows to read
    :param normilize_names: convert CamelCase column names to underscore notation
    :return: DataFrame
    """
    if file_name.startswith('train'):
        full_file_name = 'train_V2.csv'
    elif file_name.startswith('test'):
        full_file_name = 'test_V2.csv'
    elif 'sub' in file_name:
        full_file_name = 'sample_submission_V2.csv'
    else:
        full_file_name = file_name

    with Timer('Data Loading:'):
        df = pd.read_csv(os.path.join(directory, full_file_name), nrows=sample_size)
        df = reduce_mem_usage(df)
        gc.collect()

        if normilize_names:
            df.columns = [camelcase_to_underscore(col) for col in df.columns]

    return df
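# `camelcase_to_underscore` used in load_data() above is assumed to be a small
# regex helper like the sketch below; a column name such as 'matchDuration' becomes
# 'match_duration'. The exact implementation in the source repository may differ.
import re


def camelcase_to_underscore(name: str) -> str:
    """Minimal sketch: convert a CamelCase identifier to snake_case."""
    s = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s).lower()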
def transform(self, df):
    """
    Used to test/submit.

    :param df: DataFrame
    :return: DataFrame
    """
    with Timer('preprocessing.Preprocessor.transform', verbose=self.verbose):
        # Drop ID and categorical columns
        to_drop = [
            col for col in df.columns
            if col in self.id + [self.target] + self.categorical
        ]
        x = df.drop(to_drop, axis=1).copy()

        # # Feature Selection
        # non_selected = [col for col in x.columns if col not in self.SELECTED_FEATURES]
        # x.drop(non_selected, axis=1, inplace=True)

        # Fill missing values
        x.fillna(0, inplace=True)

        # # Normalize
        # x = x.astype(np.float64)
        # # x = pd.DataFrame(self.scaler.transform(x), columns=[
        # #     col for col in self.features if col in self.SELECTED_FEATURES])
        # x = pd.DataFrame(self.scaler.transform(x), columns=self.features)

        x = reduce_mem_usage(x)

    return x
def gen_dfal():
    dump_nominal_file = os.path.join(utils.cache_root, 'dfda_nominal.pkl')
    dump_textual_file = os.path.join(utils.cache_root, 'dfda_textual.pkl')
    if not os.path.exists(dump_nominal_file):
        tr = pd.read_csv('./input/round1_ijcai_18_train_20180301.txt',
                         sep=' ',
                         dtype={'is_trade': np.uint8})
        tr.is_trade = tr.is_trade.astype(np.int8)
        te = pd.read_csv('./input/round1_ijcai_18_test_b_20180418.txt', sep=' ')
        da = pd.concat([tr, te], axis=0)
        da = utils.add_time_fields(da)
        for col in utils.nominal_cate_cols + utils.identity_cols:
            da[col] = LabelEncoder().fit_transform(da[col])
        for col in utils.ordinal_cate_cols:
            levels = sorted(da[col].unique())
            da[col] = da[col].apply(lambda x: levels.index(x)).astype(np.uint8)
        del da['context_id']
        del da['context_timestamp']
        del da['ts']
        da, _ = utils.reduce_mem_usage(da)
        utils.dump_pickle(da[utils.textual_cols], dump_textual_file)
        utils.dump_pickle(da.drop(utils.textual_cols, axis=1), dump_nominal_file)
    print('gen dfal ok.')
def make_features(self, df_train_input, df_test_input):
    df_train_features = pd.DataFrame()
    df_test_features = pd.DataFrame()

    df_train_input = reduce_mem_usage(df_train_input)
    df_train_input["timestamp"] = pd.to_datetime(
        df_train_input["timestamp"], unit="s")

    # 2020-02-06 00:00 ~ 2020-02-12 23:59
    df_train_input["timestamp_bin"] = -9999
    df_train_input.loc[
        df_train_input["timestamp"] <= "2020-02-08 08:00:00",
        "timestamp_bin"] = 0
    df_train_input.loc[
        (df_train_input["timestamp"] > "2020-02-08 08:00:00") &
        (df_train_input["timestamp"] <= "2020-02-10 16:00:00"),
        "timestamp_bin"] = 1
    df_train_input.loc[
        df_train_input["timestamp"] > "2020-02-10 16:00:00",
        "timestamp_bin"] = 2
    print(df_train_input["timestamp_bin"].value_counts().sort_index())

    val_position = np.zeros(len(df_train_input)).astype(np.int8)
    for i_fold, bin_number in enumerate([0, 1, 2]):
        is_trn = df_train_input["timestamp_bin"] != bin_number
        is_val = df_train_input["timestamp_bin"] == bin_number
        trn_idx = df_train_input[is_trn].index
        val_idx = df_train_input[is_val].index
        val_position[val_idx] = i_fold
        print(f"{i_fold}fold: n_trn={len(trn_idx)}, n_val={len(val_idx)}")

    df_train_features["val_position"] = val_position
    print(df_train_features["val_position"].value_counts().sort_index())

    return df_train_features, df_test_features
def read_data():
    print('Reading files...')

    calendar = pd.read_csv('./m5-forecasting-accuracy/calendar.csv')
    calendar = reduce_mem_usage(calendar)
    print('Calendar has {} rows and {} columns'.format(calendar.shape[0], calendar.shape[1]))

    sell_prices = pd.read_csv('./m5-forecasting-accuracy/sell_prices.csv')
    sell_prices = reduce_mem_usage(sell_prices)
    print('Sell prices has {} rows and {} columns'.format(
        sell_prices.shape[0], sell_prices.shape[1]))

    train_data = pd.read_csv('./m5-forecasting-accuracy/sales_train_validation.csv')
    print('Sales train validation has {} rows and {} columns'.format(
        train_data.shape[0], train_data.shape[1]))

    return calendar, sell_prices, train_data
def melt_and_merge(calendar, sell_prices, sales_train_evaluation, submission, nrows=55000000, merge=False):
    print('\n\n Running melt and merge\n')

    # melt sales data, get it ready for training
    sales_train_evaluation = pd.melt(
        sales_train_evaluation,
        id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
        var_name='day', value_name='demand')
    print('Melted sales train evaluation has {} rows and {} columns'.format(
        sales_train_evaluation.shape[0], sales_train_evaluation.shape[1]))
    sales_train_evaluation = reduce_mem_usage(sales_train_evaluation)
    sales_train_evaluation = sales_train_evaluation.iloc[-nrows:, :]

    # separate test dataframes
    test1_rows = [row for row in submission['id'] if 'validation' in row]
    test2_rows = [row for row in submission['id'] if 'evaluation' in row]
    test1 = submission[submission['id'].isin(test1_rows)].copy()
    test2 = submission[submission['id'].isin(test2_rows)].copy()

    # change column names
    test1.columns = ['id', 'd_1914', 'd_1915', 'd_1916', 'd_1917', 'd_1918', 'd_1919', 'd_1920',
                     'd_1921', 'd_1922', 'd_1923', 'd_1924', 'd_1925', 'd_1926', 'd_1927', 'd_1928',
                     'd_1929', 'd_1930', 'd_1931', 'd_1932', 'd_1933', 'd_1934', 'd_1935', 'd_1936',
                     'd_1937', 'd_1938', 'd_1939', 'd_1940', 'd_1941']
    test2.columns = ['id', 'd_1942', 'd_1943', 'd_1944', 'd_1945', 'd_1946', 'd_1947', 'd_1948',
                     'd_1949', 'd_1950', 'd_1951', 'd_1952', 'd_1953', 'd_1954', 'd_1955', 'd_1956',
                     'd_1957', 'd_1958', 'd_1959', 'd_1960', 'd_1961', 'd_1962', 'd_1963', 'd_1964',
                     'd_1965', 'd_1966', 'd_1967', 'd_1968', 'd_1969']

    # get product table
    product = sales_train_evaluation[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']].drop_duplicates()

    # merge with product table
    test1 = test1.merge(product, how='left', on='id')
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation')
    test2 = test2.merge(product, how='left', on='id')
    test2['id'] = test2['id'].str.replace('_validation', '_evaluation')

    test1 = pd.melt(test1, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                    var_name='day', value_name='demand')
    test2 = pd.melt(test2, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                    var_name='day', value_name='demand')

    sales_train_evaluation['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'

    data = pd.concat([sales_train_evaluation, test1, test2], axis=0)
    del sales_train_evaluation, test1, test2
    print(data.shape)

    # drop some calendar features
    print(calendar.columns)
    calendar.drop(['weekday'], inplace=True, axis=1)

    # drop the validation horizon (test1); this script predicts the evaluation horizon (test2)
    data = data[data['part'] != 'test1']

    if merge:
        # notebook crashes with the entire dataset (maybe use tensorflow, dask, pyspark xD)
        data = pd.merge(data, calendar, how='left', left_on=['day'], right_on=['d'])
        data.drop(['d', 'day'], inplace=True, axis=1)
        # get the sell price data (this feature should be very important)
        data = data.merge(sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))
    else:
        print('Merge with calendar and sell prices skipped')

    gc.collect()
    return data
def get_features(features, cfg):
    dfs = [
        pd.read_feather(f'../features/{f}_{cfg.data_type}.feather')
        for f in features if f is not None
    ]
    df = pd.concat(dfs, axis=1)
    if cfg.reduce:
        df = reduce_mem_usage(df)
    return df
def main():
    # load pkls
    df = read_pickles('../feats/sales_diff')
    df_calendar = loadpkl('../feats/calendar.pkl')
    df_sell_prices = loadpkl('../feats/sell_prices.pkl')

    # merge
    df = df.merge(df_calendar, on='d', how='left')
    df = df.merge(df_sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    del df_calendar, df_sell_prices
    gc.collect()

    # drop pre-release rows
    df = df[df['wm_yr_wk'] >= df['release']]

    # make lag features
    df = make_lags(df, 28)

    # label encoding
    cols_string = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c].replace(-1, np.nan, inplace=True)

    # add price features
    df_grouped = df[['id', 'sell_price']].groupby('id')['sell_price']
    df['shift_price_t1'] = df_grouped.transform(lambda x: x.shift(1))
    df['price_change_t1'] = (df['shift_price_t1'] - df['sell_price']) / (df['shift_price_t1'])
    df['rolling_price_max_t365'] = df_grouped.transform(lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) / (df['rolling_price_max_t365'])
    df['rolling_price_std_t7'] = df_grouped.transform(lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = df_grouped.transform(lambda x: x.rolling(30).std())

    # features release date
    df['release'] = df['release'] - df['release'].min()

    # price momentum by month & year
    df['price_momentum_m'] = df['sell_price'] / df.groupby(['store_id', 'item_id', 'month'])['sell_price'].transform('mean')
    df['price_momentum_y'] = df['sell_price'] / df.groupby(['store_id', 'item_id', 'year'])['sell_price'].transform('mean')

    # days for CustomTimeSeriesSplitter
    df['d_numeric'] = df['d'].apply(lambda x: str(x)[2:]).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save as feather
    to_feature(df, '../feats/f105')

    # save feature name list
    features_json = {'features': df.columns.tolist()}
    to_json(features_json, '../configs/105_all_features_diff.json')

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
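# main() above calls `make_lags(df, 28)`, which is defined elsewhere. The sketch
# below only illustrates the usual pattern in M5 pipelines: shift the per-item
# demand by the prediction horizon and add rolling statistics on the shifted
# series. Column names ('id', 'demand') and window sizes are assumptions.
def make_lags(df, days):
    """Minimal sketch: horizon-shifted lag and rolling-mean features per 'id'."""
    # assumes rows are ordered by day within each 'id'
    grouped = df.groupby('id')['demand']
    df[f'demand_lag_{days}'] = grouped.transform(lambda x: x.shift(days))
    for window in (7, 30):
        df[f'demand_rolling_mean_{days}_{window}'] = grouped.transform(
            lambda x: x.shift(days).rolling(window).mean())
    return df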
def main(args):
    """
    input_path, output_path = '../data/train.f', f'../data/train_{PREF}.f'
    """
    input_path, output_path = args
    df = pd.read_feather(input_path)[col]

    df['Census_PrimaryDiskTotalCapacity-m-Census_SystemVolumeTotalCapacity'] = (
        df['Census_PrimaryDiskTotalCapacity'] - df['Census_SystemVolumeTotalCapacity'])
    df['Census_PrimaryDiskTotalCapacity-d-Census_SystemVolumeTotalCapacity'] = (
        df['Census_PrimaryDiskTotalCapacity'] / df['Census_SystemVolumeTotalCapacity'])
    df['Census_InternalPrimaryDisplayResolution'] = (
        df['Census_InternalPrimaryDisplayResolutionHorizontal'] * df['Census_InternalPrimaryDisplayResolutionVertical'])

    utils.reduce_mem_usage(df)
    df.add_prefix(PREF + '_').to_feather(output_path)
    return
def main(args):
    input_path, output_path = args
    df = pd.read_csv(input_path, dtype=utils.DTYPES)

    di = {'on': 0, 'audit': 1}
    df['PuaMode'] = df['PuaMode'].map(lambda x: di.get(x, np.nan))

    # parse OsBuildLab
    # weird record in test -> 17134.1*amd64fre.rs4_release.180410-1804
    tmp = df['OsBuildLab'].map(lambda x: x.replace('*', '.').split('.')
                               if isinstance(x, str) else [np.nan] * 5)
    print(tmp.map(len).describe())
    df['OsBuildLab_major'] = tmp.map(lambda x: x[0]).astype(np.float64)
    df['OsBuildLab_minor'] = tmp.map(lambda x: x[1]).astype(np.float64)
    df['OsBuildLab_build'] = tmp.map(lambda x: x[2])
    df['OsBuildLab_architecture'] = tmp.map(lambda x: x[3])
    df['OsBuildLab_date'] = tmp.map(lambda x: x[4].split('-')[0]
                                    if isinstance(x[4], str) else np.nan).astype(np.float64)
    df['OsBuildLab_time'] = tmp.map(lambda x: x[4].split('-')[1]
                                    if isinstance(x[4], str) else np.nan).astype(np.float64)

    # SmartScreen: normalize obvious misspellings and synonyms
    di = {
        '00000000': '0',  # ad hoc mapping
        'enabled': 'on',  # ad hoc mapping
        'requiredadmin': 'requireadmin',
        'deny': 'off',  # ad hoc mapping
        'of': 'off',
        'promprt': 'prompt',
    }
    df['SmartScreen'] = df['SmartScreen'].str.lower()
    df['SmartScreen'].replace(di, inplace=True)

    utils.reduce_mem_usage(df)
    df.to_feather(output_path)

    if 'train' in input_path:
        df[['HasDetections']].to_feather('../data/target.f')

    return
def add_subject_feature(df, subject):
    sid2lcnum = {
        i: j
        for i, j in subject[['SubjectId', 'Level__SubjectId_cnum']].values
    }
    sid2lev = {i: j for i, j in subject[['SubjectId', 'Level']].values}
    sid2cnum = {
        i: j
        for i, j in subject[['SubjectId', 'SubjectId_cnum']].values
    }

    level_cnum_oht = []
    subject_meta = []
    for slist in tqdm(df['SubjectId'].values, total=len(df)):
        lsoht = np.zeros(len(level_cnum_list), dtype=int)
        for sid in slist:
            i = level_cnum_list.index(sid2lcnum[sid])
            lsoht[i] += 1
        snum = len(slist)
        levlist = [sid2lev[sid] for sid in slist]
        cnumlist = [sid2cnum[sid] for sid in slist]
        level_cnum_oht.append(lsoht)
        subject_meta.append(
            [snum, max(levlist), sum(levlist), max(cnumlist), sum(cnumlist)])

    level_subject_oht = pd.DataFrame(
        level_cnum_oht, columns=level_cnum_list).add_prefix('subj_')
    level_subject_oht = reduce_mem_usage(level_subject_oht)
    subject_meta = pd.DataFrame(
        subject_meta, columns=subject_meta_cols).add_prefix('subj_')
    subject_meta = reduce_mem_usage(subject_meta)

    df = pd.concat([df, level_subject_oht, subject_meta], axis=1)
    return df
def create_feature(self, random_state=None, devmode=False):
    trn_dir, tst_dir = self.get_feature_dir(random_state)

    if os.path.exists(trn_dir) and os.path.exists(tst_dir) and devmode is False:
        print("There are cache dirs for feature [{}] (train_cache_dir=[{}], test_cache_dir=[{}])".format(
            self.__class__.__name__, trn_dir, tst_dir))
        trn_feature_files = list(Path(trn_dir).glob('*.f'))
        tst_feature_files = list(Path(tst_dir).glob('*.f'))
        return trn_feature_files, tst_feature_files

    print("Start computing feature [{}] (train_cache_dir=[{}], test_cache_dir=[{}])".format(
        self.__class__.__name__, trn_dir, tst_dir))

    if isinstance(self.fin, list):
        # if the input is a list of files, pass a list of DataFrames
        df_list = [pd.read_feather(f) for f in self.fin]
        feat = self.create_feature_impl(df_list, random_state)
        del df_list
        gc.collect()
    else:
        df = pd.read_feather(self.fin)
        feat = self.create_feature_impl(df, random_state)
        del df
        gc.collect()

    feat = utils.reduce_mem_usage(feat)
    trn = self.trn_base.merge(feat, on=CONST.KEY, how='left').drop(columns=CONST.KEY)
    tst = self.tst_base.merge(feat, on=CONST.KEY, how='left').drop(columns=CONST.KEY)
    trn = trn.add_prefix(self.pref)
    tst = tst.add_prefix(self.pref)

    # Save ...
    if not devmode:
        os.makedirs(trn_dir)
        os.makedirs(tst_dir)
        utils.to_feature(trn, trn_dir)
        utils.to_feature(tst, tst_dir)
        trn_feature_files = list(Path(trn_dir).glob('*.f'))
        tst_feature_files = list(Path(tst_dir).glob('*.f'))
        return trn_feature_files, tst_feature_files
    else:
        return trn, tst
def transform(self, df):
    with Timer('features.GroupAggregatedFeatureGenerator.transform', verbose=self.verbose):
        df_features = []

        # Aggregate by Group
        # for agg_type in ('mean', 'max', 'min', 'count', 'std'):
        for agg_type in ('mean', 'max', 'min', 'count'):
            df_aggregated = df.groupby(
                ['match_id', 'group_id'], as_index=False)[self.features].agg(agg_type)
            df_aggregated = self.restore_row_order(
                df, df_aggregated, on=['match_id', 'group_id'])
            agg_column_names = {
                col: f'{agg_type}_group_{col}'
                for col in self.features
            }
            df_aggregated.rename(columns=agg_column_names, inplace=True)

            # # TODO: Computational problems
            # # Rank Groups by Match
            # columns_to_select = list(agg_column_names.values())
            # # Anyway deletes match_id
            # df_ranked = df_aggregated.groupby('match_id', as_index=False)[columns_to_select].rank(pct=True)
            # ranked_column_names = {col: f'rank_{col}' for col in columns_to_select}
            # df_ranked.rename(columns=ranked_column_names, inplace=True)
            # # Unsafe merge because of rank, which deletes match_id
            # df_aggregated_ranked = pd.concat([df_aggregated, df_ranked], axis=1)
            # df_features.append(df_aggregated_ranked)
            # del df_aggregated, df_ranked, df_aggregated_ranked
            # gc.collect()

            df_aggregated = reduce_mem_usage(df_aggregated)
            df_features.append(df_aggregated)

        df_features = pd.concat(df_features, axis=1)

        if self.created_features is None:
            self.created_features = list(df_features.columns)
        else:
            # warn when the generated columns differ from the ones created on fit
            if self.created_features != list(df_features.columns):
                if self.verbose == 2:
                    print('Lost features')
                for col in df_features.columns:
                    if col not in self.created_features:
                        if self.verbose == 2:
                            print(col)

        return df_features
def melt_train_data(input_data):
    data = pd.melt(
        input_data,
        id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
        var_name='day',
        value_name='demand')
    data = reduce_mem_usage(data)
    print('melt_train_data has {} rows and {} columns'.format(
        data.shape[0], data.shape[1]))
    del input_data
    gc.collect()
    return data
def run(self):
    # Now we have 3 sets of features
    data = pd.concat([
        self.load('data4'),
        self.load('data2').iloc[:, 2:],
        self.load('data3').iloc[:, 2:]
    ], axis=1)
    data = reduce_mem_usage(data)

    # Let's check memory usage again
    print("{:>20}: {:>8}".format(
        'Full Grid', sizeof_fmt(data.memory_usage(index=True).sum())))
    print('Size:', data.shape)

    self.save(data)
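# run() above prints memory usage through `sizeof_fmt`, which is not shown. A
# common recipe for such a formatter is sketched below (bytes -> human-readable
# string, e.g. 1536 -> '1.5KiB'); the authors' version may differ.
def sizeof_fmt(num, suffix='B'):
    """Minimal sketch: format a byte count as a human-readable string."""
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti']:
        if abs(num) < 1024.0:
            return '{:3.1f}{}{}'.format(num, unit, suffix)
        num /= 1024.0
    return '{:.1f}{}{}'.format(num, 'Pi', suffix)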
def main(is_eval=False):
    # load csv
    df = pd.read_csv('../input/sell_prices.csv')

    # release week ref https://www.kaggle.com/kyakovlev/m5-simple-fe
    release_df = df.groupby(['store_id', 'item_id'])['wm_yr_wk'].agg(['min']).reset_index()
    release_df.columns = ['store_id', 'item_id', 'release']

    # merge release week
    df = df.merge(release_df, on=['store_id', 'item_id'], how='left')

    # days from release
    df['days_from_release'] = df['wm_yr_wk'] - df['release']

    # basic aggregations
    df['price_max'] = df.groupby(['store_id', 'item_id'])['sell_price'].transform('max')
    df['price_min'] = df.groupby(['store_id', 'item_id'])['sell_price'].transform('min')
    df['price_std'] = df.groupby(['store_id', 'item_id'])['sell_price'].transform('std')
    df['price_mean'] = df.groupby(['store_id', 'item_id'])['sell_price'].transform('mean')

    # normalized price
    df['price_norm'] = df['sell_price'] / df['price_max']

    # nunique features
    df['price_nunique'] = df.groupby(['store_id', 'item_id'])['sell_price'].transform('nunique')
    df['item_nunique'] = df.groupby(['store_id', 'sell_price'])['item_id'].transform('nunique')

    # momentum
    df['price_momentum'] = df['sell_price'] / df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform(lambda x: x.shift(1))

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save pkl
    save2pkl('../feats/sell_prices.pkl', df)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
def get_train_test(conf):
    df = Base.get_df(conf)  # pd.DataFrame

    feature_classes = [KEY_FEATURE_MAP[key] for key in conf.features]
    for feature in feature_classes:
        with timer(f"process {feature.__name__}"):
            f = feature.get_df(conf)
            if "drop_duplicate_column_on_merge" in conf.options and conf.options.drop_duplicate_column_on_merge:
                cols_to_drop = [
                    c for c in f.columns
                    if (c in df.columns) and (c != 'SK_ID_CURR')
                ]
                if cols_to_drop:
                    print(f"drop columns: {cols_to_drop}")
                    f = f.drop(cols_to_drop, axis=1)
            if "reduce_mem_usage" in conf.options and conf.options.reduce_mem_usage:
                with timer("reduce_mem_usage"):
                    f = reduce_mem_usage(f)
            df = df.merge(f, how='left', on='SK_ID_CURR')
            del f
            gc.collect()

    if "stacking_features" in conf:
        StackingFeaturesWithPasses.set_result_dirs(conf.stacking_features)
        f = StackingFeaturesWithPasses.get_df(conf)
        df = df.merge(f, how='left', on='SK_ID_CURR')

    if "drop_features_list_file" in conf.options:
        with open(conf.options.drop_features_list_file, "r") as fp:
            line = fp.read()
            feature_to_drop = eval(line)
        print(f"drop columns in {conf.options.drop_features_list_file}")
        df = df.drop(feature_to_drop, axis=1)

    if "clean_data" in conf.options and conf.options.clean_data:
        with timer("clean_data"):
            df = clean_data(df)

    train_df = df[df['TARGET'].notnull()].copy()
    test_df = df[df['TARGET'].isnull()].copy()
    del df
    gc.collect()
    return train_df, test_df
def gen_target_aggs(col, updata=False):
    feat_path = os.path.join(feats_root, 'target_aggs_{}.pkl'.format(col))
    if os.path.exists(feat_path) and updata == False:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        dfal = get_nominal_dfal()[[col, 'da', 'is_trade']]
        dmax = dfal.da.max()
        dmin = dfal.da.min()
        for da in sorted(dfal.da.unique())[1:]:
            for win_das in [1, 2, 3]:
                if da - win_das < dmin:
                    continue
                dfal = gen_target_agg_features(dfal, da, win_das, col)
        dfal = dfal.loc[dfal.da > 17, :]
        dfal.drop(['is_trade'], inplace=True, axis=1)
        dfal.drop_duplicates([col, 'da'], inplace=True)
        dfal.fillna(0, inplace=True)
        dfal, _ = reduce_mem_usage(dfal)
        dump_pickle(dfal, feat_path)
def input_to_feather():
    files = [f for f in os.listdir(CONST.INDIR) if '.csv' in f]
    for f in files:
        # if os.path.exists(os.path.join(CONST.INDIR, f.split('.')[0] + '.feather')):
        #     print("File '{}' is already exist".format(os.path.join(CONST.INDIR, f.split('.')[0] + '.feather')))
        # else:
        print("to feather '{}'...".format(f))
        df = pd.read_csv(os.path.join(CONST.INDIR, f))

        # some columns need converting to datetime
        if 'purchase_date' in df.columns:
            df['purchase_date'] = pd.to_datetime(df['purchase_date'])
        if 'first_active_month' in df.columns:
            df['first_active_month'] = pd.to_datetime(df['first_active_month'])

        # some columns carry Y/N flags that should be binarized
        if 'authorized_flag' in df.columns or 'category_1' in df.columns or 'category_4' in df.columns:
            df = binarize(df)

        df = reduce_mem_usage(df)
        df.to_feather(os.path.join(CONST.INDIR, f.split('.')[0] + '.feather'))