def main(conf):
    """Compute every configured factor and store the result as columns.

    Reads the ``factors_pd`` section of *conf*: ``dump.dir`` is created,
    ``source`` is the column store directory, and ``factors`` maps a group
    name to a mapping of factor name -> factor spec.  For each factor the
    ``id`` column plus ``spec['columns']`` are loaded, the method of
    ``Factors`` named after the group is called with the frame, the factor
    name and the spec as keyword arguments, and the returned frame is sorted
    by ``id`` before its output column(s) are written back to the store.

    A spec without a ``factors`` entry yields one column named after the
    factor; otherwise one column ``<factor>_<suffix>`` is written per entry.
    """
    settings = conf['factors_pd']

    dump_dir = abspath(settings['dump']['dir'])
    makedirs(dump_dir)

    data_dir = abspath(settings['source'])
    dfc = DataFrameCols(data_dir)

    computer = Factors()

    for group in settings['factors']:
        logging.info('Compute factors group: %s', group)

        for factor in settings['factors'][group]:
            logging.info('Compute factor: %s', factor)

            spec = settings['factors'][group][factor]
            frame = dfc.load_df(['id'] + spec['columns'])
            frame = getattr(computer, group)(frame, factor, **spec)
            frame.sort_values(by=['id'], inplace=True)

            # An explicit default is passed on purpose: the config object may
            # not behave like a plain dict when the key is absent.
            suffixes = spec.get('factors', None)
            if suffixes is None:
                out_names = [factor]
            else:
                out_names = [factor + '_' + suffix for suffix in suffixes]

            for out_name in out_names:
                dfc.write_column(out_name, frame[out_name].values)

            # Drop the frame before the next factor is loaded.
            del frame
            gc.collect()
import numpy as np

from lib.columns import DataFrameCols

if __name__ == '__main__':
    # Print how many rows of the 'submission' index fall on each day and
    # on each hour.
    columns = DataFrameCols('../data/columns')
    submission_rows = columns.load_index('submission')

    submission_days = columns.load_column('day', index=submission_rows)
    submission_hours = columns.load_column('hour', index=submission_rows)

    print('Submission days:')
    day_values, day_counts = np.unique(submission_days, return_counts=True)
    for value, count in zip(day_values, day_counts):
        print(value, count)

    print('Submission hours')
    hour_values, hour_counts = np.unique(submission_hours, return_counts=True)
    for value, count in zip(hour_values, hour_counts):
        print(value, count)

# Output recorded from a previous run:
# Submission days:
# 10 18790469
# Submission hours
# 4 3344125
# 5 2858427
# 6 381
# 9 2984808
# 10 3127993
# 11 413
# 13 3212566
import numpy as np

from lib.columns import DataFrameCols

if __name__ == '__main__':
    # Store the positions of all rows with is_attributed >= 0 and day > 7
    # as the index 'days_8_9_attributed'.
    columns = DataFrameCols('../data/columns')

    days = columns.load_column('day')
    labels = columns.load_column('is_attributed')

    has_label = labels >= 0
    late_day = days > 7
    selected_rows = np.where(has_label & late_day)[0]

    columns.write_index('days_8_9_attributed', selected_rows)
# NOTE(review): this is a fragment. ArgumentParser, abspath, join_path,
# pickle and np are used below but their imports are not visible here, and
# the fold loop at the bottom is cut off before p_valid / p_test_avg are used.
from scipy.special import logit
from sklearn.metrics import roc_auc_score, log_loss
from lib.columns import DataFrameCols

if __name__ == '__main__':
    # Single positional argument: the dump directory that holds folds.pkl
    # and the per-fold prediction files.
    parser = ArgumentParser()
    parser.add_argument('dump')
    args = parser.parse_args()

    dumpdir = abspath(args.dump)
    datadir = abspath('../data/columns')
    dfc = DataFrameCols(datadir)

    df = dfc.load_df(columns=['id', 'is_attributed'])
    df['p'] = 0  # prediction column, initialised to zero for every row

    # Rows with is_attributed >= 0 are treated as train, rows with -1 as test.
    df_train = df[df['is_attributed'] >= 0]
    df_test = df[df['is_attributed'] == -1]
    print(df_test.shape[0])

    # NOTE(review): pickle.load runs code embedded in the file; only use it
    # on dump directories produced by this project.
    with open(join_path(dumpdir, 'folds.pkl'), 'rb') as f:
        folds = pickle.load(f)

    # One accumulator slot per test row.
    p_test_avg = np.zeros(df_test.shape[0])

    for j_fold, (fold_idx, valid_idx) in enumerate(folds):
        # Predictions are stored one float per line in valid_pred_<fold>.txt.
        valid_pred_file = join_path(dumpdir, 'valid_pred_%d.txt' % j_fold)
        with open(valid_pred_file, 'r') as f:
            p_valid = np.array([float(s) for s in f.readlines()])
# NOTE(review): fragment; `df`, `pd`, `np`, `makedirs` and `DataFrameCols`
# are defined outside the visible part of this script.

# Rows are ordered by id before any column is written.
df.sort_values(by=['id'], inplace=True)

# Storage dtype for each column that is written as-is.
dtypes = {
    'id': 'uint32',
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'click_id': 'int32',
    'click_id_submission': 'int32',
    'is_attributed': 'int8'
}

test_dir = '../data/columns'
makedirs(test_dir)
dfc = DataFrameCols(test_dir)

# Cast each column to its storage dtype and write it to the column store.
for col, dtype in dtypes.items():
    print(col, dtype)
    dfc.write_column(col, df[col].astype(dtype).values)

# Time-derived columns: day of month and hour of day from click_time.
dfc.write_column(
    'day',
    pd.to_datetime(df['click_time']).dt.day.astype('uint8').values)
dfc.write_column(
    'hour',
    pd.to_datetime(df['click_time']).dt.hour.astype('uint8').values)
# click_time cast to int64 and integer-divided by 10**9.
# NOTE(review): presumably nanoseconds -> epoch seconds, which assumes
# click_time already has a datetime64[ns] dtype — confirm where df is loaded.
dfc.write_column('epoch',
                 (df['click_time'].astype(np.int64) // 10**9).values)
import numpy as np

from lib.columns import DataFrameCols

if __name__ == '__main__':
    # For every hour value present in the data, print the number of rows on
    # day 9 and how many of those have is_attributed >= 0.
    workdir = '../data/columns'
    dfc = DataFrameCols(workdir)

    day_col = dfc.load_column('day')
    hour_col = dfc.load_column('hour')
    is_attributed_col = dfc.load_column('is_attributed')

    # Both masks are independent of the hour, so build them once.
    day_mask = day_col == 9
    attributed_day_mask = day_mask & (is_attributed_col >= 0)

    for h in np.unique(hour_col):
        hour_mask = hour_col == h
        # Count matches directly instead of materialising index arrays; the
        # name `total` avoids shadowing the builtin `all`.
        total = np.count_nonzero(day_mask & hour_mask)
        attributed = np.count_nonzero(attributed_day_mask & hour_mask)
        print(h, total, attributed)

# Output recorded from a previous run:
# hour all attributed
# 0 3318301 3318301
# 1 3082862 3082862
# 2 3068887 3068887
# 3 3351149 3351149
# 4 4032691 4032691
# 5 3671741 3671741
# 6 3570940 3570940
# 7 3186240 3186240
# 8 2804701 2804701
# 9 2986204 2986204
# 10 3304199 3304199
# NOTE(review): fragment of a larger function; `conf` and `dump_dir` are
# defined above the visible part, and the `if` at the bottom continues below.

# Keep a JSON copy of the configuration next to the results and mirror the
# log output into <dump_dir>/application.log.
write_config(conf, join_path(dump_dir, 'application.json'), 'json')
logging.getLogger().addHandler(
    logging.FileHandler(join_path(dump_dir, 'application.log')))

logging.info('Kaggle Talking Data')
logging.info('Train Catboost')
logging.info('Dump: %s', dump_dir)

target = conf['catboost']['target']
features = conf['catboost']['features']
categorical_features = conf['catboost']['categorical_features']
logging.info('Target: %s', target)
logging.info('Features: %s', config2json(features))
logging.info('Categorical features: %s', categorical_features)

data_dir = abspath(conf['catboost']['data']['dir'])
dfc = DataFrameCols(data_dir)

# Load the target and feature columns restricted to the configured
# training index.
train_index_name = conf['catboost']['data']['train']['index']
train_index = dfc.load_index(train_index_name)
train_df = dfc.load_df(columns=[target] + features, index=train_index)

# Hold out 10% of the loaded rows for validation.
# NOTE(review): no random_state is passed, so the split is presumably
# different on every run — confirm this is intended.
train_df, valid_df = train_test_split(train_df, test_size=0.1)

catboost_options = conf['catboost']['options']
logging.info('Using catboost options: %s', catboost_options)

# Remember the current directory, then switch into the dump directory.
# NOTE(review): presumably restored from work_dir further down — not
# visible in this fragment.
work_dir = getcwd()
chdir(dump_dir)

hyperopt_options = conf['catboost']['hyperopt']
if hyperopt_options['enabled']:
    train_quality, valid_quality, model = train_catboost_with_hyperopt(
        train_df, valid_df, target, features, categorical_features,
        catboost_options, hyperopt_options)
import numpy as np

from lib.columns import DataFrameCols

if __name__ == '__main__':
    # Store the positions of rows with is_attributed >= 0, day > 7 and an
    # hour in {4, 5, 9, 10, 13, 14} as a named index.
    columns = DataFrameCols('../data/columns')

    days = columns.load_column('day')
    hours = columns.load_column('hour')
    labels = columns.load_column('is_attributed')

    in_selected_hours = np.isin(hours, [4, 5, 9, 10, 13, 14])
    selected_rows = np.where(
        (labels >= 0) & (days > 7) & in_selected_hours)[0]

    columns.write_index('days_8_9_hours_4_5_9_10_13_14_attributed',
                        selected_rows)
# NOTE(review): this is a fragment. ArgumentParser, abspath, join_path and
# pickle are used below but their imports are not visible here, and the
# script is cut off before the loaded predictions are evaluated.
import numpy as np
from sklearn.metrics import roc_auc_score, log_loss
from lib.columns import DataFrameCols

if __name__ == '__main__':
    # Arguments: the dump directory and the number of the fold to inspect.
    parser = ArgumentParser()
    parser.add_argument('dump', type=str)
    parser.add_argument('fold', type=int)
    args = parser.parse_args()

    dumpdir = abspath(args.dump)
    datadir = abspath('../data/columns')
    dfc = DataFrameCols(datadir)

    # Keep only rows with is_attributed >= 0.
    df = dfc.load_df(columns=['id', 'is_attributed'])
    df = df[df['is_attributed'] >= 0]

    # NOTE(review): pickle.load runs code embedded in the file; only use it
    # on dump directories produced by this project.
    with open(join_path(dumpdir, 'folds.pkl'), 'rb') as f:
        folds = pickle.load(f)

    # Prediction files hold one float per line.
    train_pred_file = join_path(dumpdir, 'train_pred_%d.txt' % args.fold)
    with open(train_pred_file, 'r') as f:
        p_train = np.array([float(s) for s in f.readlines()])

    valid_pred_file = join_path(dumpdir, 'valid_pred_%d.txt' % args.fold)
    with open(valid_pred_file, 'r') as f:
        p_valid = np.array([float(s) for s in f.readlines()])

    # First element of the selected fold entry.
    # NOTE(review): presumably the training-row indices — confirm against
    # the code that wrote folds.pkl.
    fold_idx = folds[args.fold][0]
def _run_logged(command):
    """Run *command* to completion, log its command line and return code.

    stdout is captured; a non-zero exit status raises
    ``subprocess.CalledProcessError`` (``check=True``).  Returns the
    ``CompletedProcess``.
    """
    proc = subprocess.run(command, stdout=subprocess.PIPE, check=True)
    logging.info('Running command %s', ' '.join(proc.args))
    logging.info('Process return code %d', proc.returncode)
    return proc


def _read_predictions(path):
    """Read a file with one float per line into a float32 array."""
    with open(path, 'r') as f:
        return np.array([float(s) for s in f], dtype=np.float32)


def main(conf):
    """Train and evaluate one libffm model per cross-validation fold.

    Reads the ``libffm`` section of *conf*: ``dump.dir`` (working directory,
    created and made the current directory), ``data.dir`` (column store) and
    ``options`` (``lambda``, ``factor``, ``learning_rate``, ``num_iter``,
    passed to ``ffm-train`` as ``-l``, ``-k``, ``-r``, ``-t``).

    The dump directory must already contain ``folds.pkl`` and the files
    ``train_fold_<k>.txt`` / ``valid_fold_<k>.txt``; ``test.txt`` is passed to
    ``ffm-predict`` as well.  The commented-out steps 1) and 2) below are the
    code that produces them.  For every fold the function writes
    ``model_<k>.bin`` and ``train_pred_<k>.txt`` / ``valid_pred_<k>.txt`` /
    ``test_pred_<k>.txt`` and logs the train and validation AUC.

    Raises:
        subprocess.CalledProcessError: if ``ffm-train`` or ``ffm-predict``
            exits with a non-zero status.
    """
    dump_dir = abspath(conf['libffm']['dump']['dir'])
    makedirs(dump_dir)

    data_dir = abspath(conf['libffm']['data']['dir'])
    dfc = DataFrameCols(data_dir)

    target = 'is_attributed'
    # `fields` and `shifts` are only referenced by the commented-out data
    # preparation steps below.
    fields = {'ip': 0, 'app': 1, 'device': 2, 'os': 3, 'channel': 4}
    shifts = {
        'ip': 0,
        'app': 364779,
        'device': 365548,
        'os': 369776,
        'channel': 370733
    }

    # 1) write test data
    # logging.info('Writing test data in libffm format')
    # df = dfc.load_df(columns=['id', target] + list(fields.keys()))
    # df = df[df[target] == -1]
    # df[target] = 0  # do we need this?
    # df = write_libffm_data(df, target, fields, shifts)
    test_fname = join_path(dump_dir, 'test.txt')
    # df[['data']].to_csv(test_fname, header=False, index=False, quoting=csv.QUOTE_NONE)
    # del df
    # gc.collect()
    # exit()

    # 2) write training folds
    # logging.info('Writing k-fold training data')
    # df = dfc.load_df(columns=['id', target] + list(fields.keys()))
    # df = df[df[target] >= 0]
    # df = write_libffm_data(df, target, fields, shifts)
    #
    # folds = []
    # skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)
    # for fold_idx, valid_idx in skf.split(df['id'].values, df[target].values):
    #     folds.append((fold_idx, valid_idx))
    #
    # with open(join_path(dump_dir, 'folds.pkl'), 'wb') as f:
    #     pickle.dump(folds, f)
    #
    # for j_fold, (fold_idx, valid_idx) in enumerate(folds):
    #     logging.info('Writing fold %d in libffm format', j_fold)
    #     train_fname = join_path(dump_dir, 'train_fold_%d.txt' % j_fold)
    #     df.loc[fold_idx, ['data']].to_csv(train_fname, header=False, index=False, quoting=csv.QUOTE_NONE)
    #     valid_fname = join_path(dump_dir, 'valid_fold_%d.txt' % j_fold)
    #     df.loc[valid_idx, ['data']].to_csv(valid_fname, header=False, index=False, quoting=csv.QUOTE_NONE)
    #
    # del df
    # gc.collect()
    # exit()

    # 3) train and evaluate on the folds written by step 2
    df = dfc.load_df(columns=['id', target])
    df = df[df[target] >= 0]

    # NOTE(review): pickle.load runs code embedded in the file; only use it
    # on dump directories produced by this project.
    with open(join_path(dump_dir, 'folds.pkl'), 'rb') as f:
        folds = pickle.load(f)

    chdir(dump_dir)

    options = conf['libffm']['options']

    for j_fold, (fold_idx, valid_idx) in enumerate(folds):
        logging.info('Training on fold %d', j_fold)

        train_fname = join_path(dump_dir, 'train_fold_%d.txt' % j_fold)
        valid_fname = join_path(dump_dir, 'valid_fold_%d.txt' % j_fold)
        model_fname = join_path(dump_dir, 'model_%d.bin' % j_fold)

        proc = _run_logged([
            'ffm-train',
            '-p', valid_fname,
            '-l', str(options['lambda']),
            '-k', str(options['factor']),
            '-r', str(options['learning_rate']),
            '-t', str(options['num_iter']),
            train_fname,
            model_fname
        ])
        # Only the training run's captured stdout is copied into the log.
        logging.info(proc.stdout.decode('utf-8'))

        # Rows are selected with .loc, matching how the commented-out step 2
        # selected the rows it wrote into the fold files.
        train_pred_file = join_path(dump_dir, 'train_pred_%d.txt' % j_fold)
        _run_logged(
            ['ffm-predict', train_fname, model_fname, train_pred_file])
        p_train = _read_predictions(train_pred_file)
        auc_train = roc_auc_score(df.loc[fold_idx, target].values, p_train)

        valid_pred_file = join_path(dump_dir, 'valid_pred_%d.txt' % j_fold)
        _run_logged(
            ['ffm-predict', valid_fname, model_fname, valid_pred_file])
        p_valid = _read_predictions(valid_pred_file)
        auc_valid = roc_auc_score(df.loc[valid_idx, target].values, p_valid)

        logging.info('Fold quality: auc_train=%f auc_valid=%f', auc_train,
                     auc_valid)

        # Test predictions are only written to disk, not read back here.
        test_pred_file = join_path(dump_dir, 'test_pred_%d.txt' % j_fold)
        _run_logged(
            ['ffm-predict', test_fname, model_fname, test_pred_file])
import numpy as np

from lib.columns import DataFrameCols

if __name__ == '__main__':
    # Store the positions of all rows with is_attributed >= 0 as the
    # index 'train' and print how many there are.
    columns = DataFrameCols('../data/columns')
    labels = columns.load_column('is_attributed')

    train_rows = np.where(labels >= 0)[0]
    print(len(train_rows))

    columns.write_index('train', train_rows)
import numpy as np

from lib.columns import DataFrameCols

if __name__ == '__main__':
    # Build an index that keeps every row with is_attributed == 1 and each
    # row with is_attributed == 0 with probability 0.5.  Rows with any other
    # label value are left out.  The draw uses numpy's global RNG and no
    # seed is set here.
    columns = DataFrameCols('../data/columns')
    labels = columns.load_column('is_attributed')

    coin = np.random.choice([0, 1], size=labels.shape[0], p=[0.5, 0.5])

    positives = labels == 1
    sampled_negatives = (labels == 0) & (coin == 1)
    kept_rows = np.where(positives | sampled_negatives)[0]

    print(len(kept_rows))
    columns.write_index('subsample_not_attributed_50pct_2', kept_rows)
def main(conf):
    """Train a LightGBM model, write a submission and feature importances.

    Reads the ``lightgbm`` section of *conf*: ``dump.dir`` (output
    directory), ``label``, ``features``, ``categorical_features``,
    ``data.dir`` (column store), ``data.train.index`` / ``data.test.index``
    (names of stored row indices), ``valid_size``, ``params`` and
    ``options``.

    Files written into the dump directory: ``application.conf``,
    ``application.json``, ``application.log``, ``model.bin``,
    ``submission.csv`` (``click_id``, ``is_attributed``, sorted by
    ``click_id``) and the tab-separated ``feature_strength.csv``.

    When ``valid_size`` is greater than zero, that share of the training rows
    is split off as a validation set; otherwise training runs without one.
    """
    dump_dir = conf['lightgbm']['dump']['dir']
    makedirs(dump_dir)

    write_config(conf, join_path(dump_dir, 'application.conf'), 'hocon')
    write_config(conf, join_path(dump_dir, 'application.json'), 'json')
    logging.getLogger().addHandler(
        logging.FileHandler(join_path(dump_dir, 'application.log')))

    logging.info('Kaggle Talking Data')

    label = conf['lightgbm']['label']
    features = conf['lightgbm']['features']
    categorical_features = conf['lightgbm']['categorical_features']
    logging.info('Label: %s', label)
    logging.info('Features: %s', features)
    logging.info('Categorical features: %s', categorical_features)

    data_dir = abspath(conf['lightgbm']['data']['dir'])
    dfc = DataFrameCols(data_dir)

    train_index_name = conf['lightgbm']['data']['train']['index']
    train_index = dfc.load_index(train_index_name)
    df = dfc.load_df(columns=[label] + features, index=train_index)

    def make_dataset(frame):
        # Single place that defines how a frame becomes a LightGBM dataset.
        return lgb.Dataset(data=frame[features].values,
                           label=frame[label].values,
                           feature_name=features,
                           categorical_feature=categorical_features)

    if conf['lightgbm']['valid_size'] > 0:
        train_df, valid_df = train_test_split(
            df, test_size=conf['lightgbm']['valid_size'])
        train_dataset = make_dataset(train_df)
        valid_dataset = make_dataset(valid_df)
        del train_df
        del valid_df
    else:
        train_dataset = make_dataset(df)
        valid_dataset = None

    # The datasets were built from extracted value arrays, so the source
    # frame is no longer needed; release it before training.
    del df
    gc.collect()

    params = conf['lightgbm']['params']
    options = conf['lightgbm']['options']
    model = train_lightgbm(params, train_dataset, valid_dataset, **options)
    model.save_model(join_path(dump_dir, 'model.bin'))

    del train_dataset
    del valid_dataset
    gc.collect()

    # load model
    # model = lgb.Booster(model_file=join_path(dump_dir, 'model.bin'))

    # train_label = train_df[label].values
    # train_pred = model.predict(train_df[features])
    # train_quality = quality(train_label, train_pred)
    # logging.info('Train quality: %s', train_quality)
    #
    # valid_label = valid_df[label].values
    # valid_pred = model.predict(valid_df[features])
    # valid_quality = quality(valid_label, valid_pred)
    # logging.info('Valid quality: %s', valid_quality)

    # Score the test rows and write them in submission format.
    test_index_name = conf['lightgbm']['data']['test']['index']
    test_index = dfc.load_index(test_index_name)
    test_df = dfc.load_df(columns=features + ['click_id_submission'],
                          index=test_index)
    test_df['is_attributed'] = model.predict(test_df[features])
    test_df = test_df[['click_id_submission', 'is_attributed'
                       ]].rename(columns={'click_id_submission': 'click_id'})
    test_df.sort_values(by='click_id', inplace=True)
    test_df.to_csv(join_path(dump_dir, 'submission.csv'),
                   header=True,
                   index=False)

    # Per-feature split counts and gain, with gain expressed as a percentage
    # of the total and the table sorted by it in descending order.
    gain = model.feature_importance('gain')
    ft = pd.DataFrame({
        'feature': model.feature_name(),
        'split': model.feature_importance('split'),
        'gain': 100 * gain / gain.sum()
    }).sort_values('gain', ascending=False)
    ft.to_csv(join_path(dump_dir, 'feature_strength.csv'),
              header=True,
              index=False,
              sep='\t')