import gc
import logging
from os import makedirs
from os.path import abspath

from lib.columns import DataFrameCols
# Factors is a project-local class; its import is not shown in this snippet


def main(conf):
    dump_dir = abspath(conf['factors_pd']['dump']['dir'])
    makedirs(dump_dir)

    data_dir = abspath(conf['factors_pd']['source'])
    dfc = DataFrameCols(data_dir)

    computer = Factors()
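    # conf['factors_pd']['factors'] maps group -> factor -> spec; each group
    # name doubles as a method on the Factors computer, dispatched via getattr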
    for group in conf['factors_pd']['factors']:
        logging.info('Compute factors group: %s', group)
        for factor in conf['factors_pd']['factors'][group]:
            logging.info('Compute factor: %s', factor)
            spec = conf['factors_pd']['factors'][group][factor]
            df = dfc.load_df(['id'] + spec['columns'])
            df = getattr(computer, group)(df, factor, **spec)
            df.sort_values(by=['id'], inplace=True)
            sub_factors = spec.get('factors')
            if sub_factors is None:
                dfc.write_column(factor, df[factor].values)
            else:
                for fout in sub_factors:
                    fname = factor + '_' + fout
                    dfc.write_column(fname, df[fname].values)
            del df
            gc.collect()
Example 2
import numpy as np
from lib.columns import DataFrameCols

if __name__ == '__main__':
    workdir = '../data/columns'
    dfc = DataFrameCols(workdir)

    # load day/hour restricted to the stored 'submission' (test) row index
    submission_idx = dfc.load_index('submission')
    day_col = dfc.load_column('day', index=submission_idx)
    hour_col = dfc.load_column('hour', index=submission_idx)

    print('Submission days:')
    days = np.unique(day_col, return_counts=True)
    for (d, c) in zip(days[0], days[1]):
        print(d, c)

    print('Submission hours:')
    hours = np.unique(hour_col, return_counts=True)
    for (h, c) in zip(hours[0], hours[1]):
        print(h, c)

# Submission days:
# 10 18790469
# Submission hours
# 4 3344125
# 5 2858427
# 6 381
# 9 2984808
# 10 3127993
# 11 413
# 13 3212566
Example 3
import numpy as np
from lib.columns import DataFrameCols

if __name__ == '__main__':
    workdir = '../data/columns'
    dfc = DataFrameCols(workdir)

    day_col = dfc.load_column('day')
    is_attributed_col = dfc.load_column('is_attributed')
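    # labelled rows (is_attributed >= 0) from days 8 and 9 (day > 7)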
    index = np.where((is_attributed_col >= 0) & (day_col > 7))[0]

    dfc.write_index('days_8_9_attributed', index)
Example 4
import pickle
from argparse import ArgumentParser
from os.path import abspath, join as join_path  # assumed alias used throughout

import numpy as np
from scipy.special import logit
from sklearn.metrics import roc_auc_score, log_loss

from lib.columns import DataFrameCols


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('dump')
    args = parser.parse_args()

    dumpdir = abspath(args.dump)
    datadir = abspath('../data/columns')

    dfc = DataFrameCols(datadir)
    df = dfc.load_df(columns=['id', 'is_attributed'])
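    # 'p' presumably accumulates out-of-fold predictions (snippet is truncated)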
    df['p'] = 0

    df_train = df[df['is_attributed'] >= 0]
    df_test = df[df['is_attributed'] == -1]
    print(df_test.shape[0])

    with open(join_path(dumpdir, 'folds.pkl'), 'rb') as f:
        folds = pickle.load(f)

    p_test_avg = np.zeros(df_test.shape[0])
    for j_fold, (fold_idx, valid_idx) in enumerate(folds):
        valid_pred_file = join_path(dumpdir, 'valid_pred_%d.txt' % j_fold)
        with open(valid_pred_file, 'r') as f:
            p_valid = np.array([float(s) for s in f.readlines()])
Example 5
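    # fragment: `df` is assumed to already hold the combined click data with a
    # 'click_time' column; the loading code and imports are not shown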
    df.sort_values(by=['id'], inplace=True)

    dtypes = {
        'id': 'uint32',
        'ip': 'uint32',
        'app': 'uint16',
        'device': 'uint16',
        'os': 'uint16',
        'channel': 'uint16',
        'click_id': 'int32',
        'click_id_submission': 'int32',
        'is_attributed': 'int8'
    }

    test_dir = '../data/columns'
    makedirs(test_dir)

    dfc = DataFrameCols(test_dir)
    for col, dtype in dtypes.items():
        print(col, dtype)
        dfc.write_column(col, df[col].astype(dtype).values)

    dfc.write_column(
        'day',
        pd.to_datetime(df['click_time']).dt.day.astype('uint8').values)
    dfc.write_column(
        'hour',
        pd.to_datetime(df['click_time']).dt.hour.astype('uint8').values)
    dfc.write_column('epoch',
                     (df['click_time'].astype(np.int64) // 10**9).values)
Example 6
import numpy as np
from lib.columns import DataFrameCols

if __name__ == '__main__':
    workdir = '../data/columns'
    dfc = DataFrameCols(workdir)
    day_col = dfc.load_column('day')
    hour_col = dfc.load_column('hour')
    is_attributed_col = dfc.load_column('is_attributed')

    hours = np.unique(hour_col, return_counts=True)[0]

    # for each hour of day 9, compare total rows with labelled (train) rows
    for h in hours:
        n_all = np.where((day_col == 9) & (hour_col == h))[0].shape[0]
        n_attributed = np.where((day_col == 9) & (hour_col == h)
                                & (is_attributed_col >= 0))[0].shape[0]

        print(h, n_all, n_attributed)

# hour all attributed
# 0 3318301 3318301
# 1 3082862 3082862
# 2 3068887 3068887
# 3 3351149 3351149
# 4 4032691 4032691
# 5 3671741 3671741
# 6 3570940 3570940
# 7 3186240 3186240
# 8 2804701 2804701
# 9 2986204 2986204
# 10 3304199 3304199
Example 7
    # fragment: the creation of dump_dir and the imports are not shown
    write_config(conf, join_path(dump_dir, 'application.json'), 'json')
    logging.getLogger().addHandler(
        logging.FileHandler(join_path(dump_dir, 'application.log')))

    logging.info('Kaggle Talking Data')
    logging.info('Train Catboost')
    logging.info('Dump: %s', dump_dir)

    target = conf['catboost']['target']
    features = conf['catboost']['features']
    categorical_features = conf['catboost']['categorical_features']
    logging.info('Target: %s', target)
    logging.info('Features: %s', config2json(features))
    logging.info('Categorical features: %s', categorical_features)

    data_dir = abspath(conf['catboost']['data']['dir'])
    dfc = DataFrameCols(data_dir)

    train_index_name = conf['catboost']['data']['train']['index']
    train_index = dfc.load_index(train_index_name)
    train_df = dfc.load_df(columns=[target] + features, index=train_index)
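    # hold out a random 10% of the training rows for validation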
    train_df, valid_df = train_test_split(train_df, test_size=0.1)

    catboost_options = conf['catboost']['options']
    logging.info('Using catboost options: %s', catboost_options)

    # CatBoost writes its training artifacts into the working directory, so
    # remember the cwd and switch to the dump dir
    work_dir = getcwd()
    chdir(dump_dir)

    hyperopt_options = conf['catboost']['hyperopt']
    if hyperopt_options['enabled']:
        train_quality, valid_quality, model = train_catboost_with_hyperopt(
            train_df, valid_df, target, features, categorical_features,
            catboost_options, hyperopt_options)
Example 8
import numpy as np
from lib.columns import DataFrameCols

if __name__ == '__main__':
    workdir = '../data/columns'
    dfc = DataFrameCols(workdir)

    day_col = dfc.load_column('day')
    hour_col = dfc.load_column('hour')
    is_attributed_col = dfc.load_column('is_attributed')

    # keep only the hours that match those present in the submission data
    hidx = np.isin(hour_col, [4, 5, 9, 10, 13, 14])
    index = np.where((is_attributed_col >= 0) & (day_col > 7) & hidx)[0]

    dfc.write_index('days_8_9_hours_4_5_9_10_13_14_attributed', index)
Example 9
import pickle
from argparse import ArgumentParser
from os.path import abspath, join as join_path  # assumed alias used throughout

import numpy as np
from sklearn.metrics import roc_auc_score, log_loss

from lib.columns import DataFrameCols

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('dump', type=str)
    parser.add_argument('fold', type=int)
    args = parser.parse_args()

    dumpdir = abspath(args.dump)
    datadir = abspath('../data/columns')

    dfc = DataFrameCols(datadir)
    df = dfc.load_df(columns=['id', 'is_attributed'])
    df = df[df['is_attributed'] >= 0]

    with open(join_path(dumpdir, 'folds.pkl'), 'rb') as f:
        folds = pickle.load(f)

    train_pred_file = join_path(dumpdir, 'train_pred_%d.txt' % args.fold)
    with open(train_pred_file, 'r') as f:
        p_train = np.array([float(s) for s in f.readlines()])

    valid_pred_file = join_path(dumpdir, 'valid_pred_%d.txt' % args.fold)
    with open(valid_pred_file, 'r') as f:
        p_valid = np.array([float(s) for s in f.readlines()])

    fold_idx = folds[args.fold][0]
Example 10
import logging
import pickle
import subprocess
from os import chdir, makedirs
from os.path import abspath, join as join_path  # assumed alias used throughout

import numpy as np
from sklearn.metrics import roc_auc_score

from lib.columns import DataFrameCols
# the commented-out data-preparation blocks below additionally need csv, gc,
# sklearn.model_selection.StratifiedKFold and the project-local write_libffm_data


def main(conf):
    dump_dir = abspath(conf['libffm']['dump']['dir'])
    makedirs(dump_dir)

    data_dir = abspath(conf['libffm']['data']['dir'])
    dfc = DataFrameCols(data_dir)

    target = 'is_attributed'
    fields = {'ip': 0, 'app': 1, 'device': 2, 'os': 3, 'channel': 4}
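    # shifts offset each field's raw values into disjoint libffm feature-id ranges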
    shifts = {
        'ip': 0,
        'app': 364779,
        'device': 365548,
        'os': 369776,
        'channel': 370733
    }

    # 1) write test data
    # logging.info('Writing test data in libffm format')
    # df = dfc.load_df(columns=['id', target] + list(fields.keys()))
    # df = df[df[target] == -1]
    # df[target] = 0  # do we need this?
    # df = write_libffm_data(df, target, fields, shifts)
    test_fname = join_path(dump_dir, 'test.txt')
    # df[['data']].to_csv(test_fname, header=False, index=False, quoting=csv.QUOTE_NONE)
    # del df
    # gc.collect()
    # exit()

    # 2) write training folds
    # logging.info('Writing k-fold training data')
    # df = dfc.load_df(columns=['id', target] + list(fields.keys()))
    # df = df[df[target] >= 0]
    # df = write_libffm_data(df, target, fields, shifts)
    #
    # folds = []
    # skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)
    # for fold_idx, valid_idx in skf.split(df['id'].values, df[target].values):
    #     folds.append((fold_idx, valid_idx))
    #
    # with open(join_path(dump_dir, 'folds.pkl'), 'wb') as f:
    #     pickle.dump(folds, f)
    #
    # for j_fold, (fold_idx, valid_idx) in enumerate(folds):
    #     logging.info('Writing fold %d in libffm format', j_fold)
    #     train_fname = join_path(dump_dir, 'train_fold_%d.txt' % j_fold)
    #     df.loc[fold_idx, ['data']].to_csv(train_fname, header=False, index=False, quoting=csv.QUOTE_NONE)
    #     valid_fname = join_path(dump_dir, 'valid_fold_%d.txt' % j_fold)
    #     df.loc[valid_idx, ['data']].to_csv(valid_fname, header=False, index=False, quoting=csv.QUOTE_NONE)
    #
    # del df
    # gc.collect()
    # exit()

    df = dfc.load_df(columns=['id', target])
    df = df[df[target] >= 0]
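    # folds hold positional indices from StratifiedKFold; the filtered df is
    # assumed to keep a 0..n-1 index so the .loc lookups below line up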

    with open(join_path(dump_dir, 'folds.pkl'), 'rb') as f:
        folds = pickle.load(f)

    chdir(dump_dir)
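    # for each fold: train a libffm model, then predict the train, valid and
    # test splits and log the fold AUCs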
    for j_fold, (fold_idx, valid_idx) in enumerate(folds):
        logging.info('Training on fold %d', j_fold)
        train_fname = join_path(dump_dir, 'train_fold_%d.txt' % j_fold)
        valid_fname = join_path(dump_dir, 'valid_fold_%d.txt' % j_fold)
        model_fname = join_path(dump_dir, 'model_%d.bin' % j_fold)
        proc = subprocess.run([
            'ffm-train', '-p', valid_fname, '-l',
            str(conf['libffm']['options']['lambda']), '-k',
            str(conf['libffm']['options']['factor']), '-r',
            str(conf['libffm']['options']['learning_rate']), '-t',
            str(conf['libffm']['options']['num_iter']), train_fname,
            model_fname
        ],
                              stdout=subprocess.PIPE,
                              check=True)

        logging.info('Running command %s', ' '.join(proc.args))
        logging.info('Process return code %d', proc.returncode)
        logging.info(proc.stdout.decode('utf-8'))

        train_pred_file = join_path(dump_dir, 'train_pred_%d.txt' % j_fold)
        proc = subprocess.run(
            ['ffm-predict', train_fname, model_fname, train_pred_file],
            stdout=subprocess.PIPE,
            check=True)

        logging.info('Running command %s', ' '.join(proc.args))
        logging.info('Process return code %d', proc.returncode)

        with open(train_pred_file, 'r') as f:
            p_train = np.array([float(s) for s in f.readlines()],
                               dtype=np.float32)
            auc_train = roc_auc_score(df.loc[fold_idx, target].values, p_train)

        valid_pred_file = join_path(dump_dir, 'valid_pred_%d.txt' % j_fold)
        proc = subprocess.run(
            ['ffm-predict', valid_fname, model_fname, valid_pred_file],
            stdout=subprocess.PIPE,
            check=True)

        logging.info('Running command %s', ' '.join(proc.args))
        logging.info('Process return code %d', proc.returncode)

        with open(valid_pred_file, 'r') as f:
            p_valid = np.array([float(s) for s in f.readlines()],
                               dtype=np.float32)
            auc_valid = roc_auc_score(df.loc[valid_idx, target].values,
                                      p_valid)

        logging.info('Fold quality: auc_train=%f auc_valid=%f', auc_train,
                     auc_valid)

        test_pred_file = join_path(dump_dir, 'test_pred_%d.txt' % j_fold)
        proc = subprocess.run(
            ['ffm-predict', test_fname, model_fname, test_pred_file],
            stdout=subprocess.PIPE,
            check=True)

        logging.info('Running command %s', ' '.join(proc.args))
        logging.info('Process return code %d', proc.returncode)
Example 11
import numpy as np
from lib.columns import DataFrameCols

if __name__ == '__main__':
    workdir = '../data/columns'
    dfc = DataFrameCols(workdir)

    is_attributed_col = dfc.load_column('is_attributed')
    index = np.where(is_attributed_col >= 0)[0]

    print(index.shape[0])
    dfc.write_index('train', index)
Example 12
import numpy as np
from lib.columns import DataFrameCols

if __name__ == '__main__':
    workdir = '../data/columns'
    dfc = DataFrameCols(workdir)

    is_attributed_col = dfc.load_column('is_attributed')
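    # keep all positives and an unseeded ~50% random sample of the negatives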
    subsample = np.random.choice([0, 1], size=is_attributed_col.shape[0],
                                 p=[0.5, 0.5])
    subsample_idx = np.where((is_attributed_col == 1)
                             | ((is_attributed_col == 0) & (subsample == 1)))[0]

    print(subsample_idx.shape[0])
    dfc.write_index('subsample_not_attributed_50pct_2', subsample_idx)
Example 13
import gc
import logging
from os import makedirs
from os.path import abspath, join as join_path  # assumed alias used throughout

import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split

from lib.columns import DataFrameCols
# write_config and train_lightgbm are project-local helpers; their imports are
# not shown in this snippet


def main(conf):
    dump_dir = conf['lightgbm']['dump']['dir']
    makedirs(dump_dir)

    write_config(conf, join_path(dump_dir, 'application.conf'), 'hocon')
    write_config(conf, join_path(dump_dir, 'application.json'), 'json')
    logging.getLogger().addHandler(
        logging.FileHandler(join_path(dump_dir, 'application.log')))

    logging.info('Kaggle Talking Data')

    label = conf['lightgbm']['label']
    features = conf['lightgbm']['features']
    categorical_features = conf['lightgbm']['categorical_features']
    logging.info('Label: %s', label)
    logging.info('Features: %s', features)
    logging.info('Categorical features: %s', categorical_features)

    data_dir = abspath(conf['lightgbm']['data']['dir'])
    dfc = DataFrameCols(data_dir)
    train_index_name = conf['lightgbm']['data']['train']['index']
    train_index = dfc.load_index(train_index_name)

    df = dfc.load_df(columns=[label] + features, index=train_index)

    if conf['lightgbm']['valid_size'] > 0:
        train_df, valid_df = train_test_split(
            df, test_size=conf['lightgbm']['valid_size'])

        train_dataset = lgb.Dataset(data=train_df[features].values,
                                    label=train_df[label].values,
                                    feature_name=features,
                                    categorical_feature=categorical_features)
        valid_dataset = lgb.Dataset(data=valid_df[features].values,
                                    label=valid_df[label].values,
                                    feature_name=features,
                                    categorical_feature=categorical_features)

        del train_df
        del valid_df
        gc.collect()
    else:
        train_dataset = lgb.Dataset(data=df[features].values,
                                    label=df[label].values,
                                    feature_name=features,
                                    categorical_feature=categorical_features)
        valid_dataset = None

    params = conf['lightgbm']['params']
    options = conf['lightgbm']['options']
    model = train_lightgbm(params, train_dataset, valid_dataset, **options)
    model.save_model(join_path(dump_dir, 'model.bin'))
    del train_dataset
    del valid_dataset
    gc.collect()

    # load model
    # model = lgb.Booster(model_file=join_path(dump_dir, 'model.bin'))

    # train_label = train_df[label].values
    # train_pred = model.predict(train_df[features])
    # train_quality = quality(train_label, train_pred)
    # logging.info('Train quality: %s', train_quality)
    #
    # valid_label = valid_df[label].values
    # valid_pred = model.predict(valid_df[features])
    # valid_quality = quality(valid_label, valid_pred)
    # logging.info('Valid quality: %s', valid_quality)

    # score the test rows and write the submission keyed by the original click_id
    test_index_name = conf['lightgbm']['data']['test']['index']
    test_index = dfc.load_index(test_index_name)
    test_df = dfc.load_df(columns=features + ['click_id_submission'],
                          index=test_index)
    test_df['is_attributed'] = model.predict(test_df[features])
    test_df = test_df[['click_id_submission', 'is_attributed'
                       ]].rename(columns={'click_id_submission': 'click_id'})
    test_df.sort_values(by='click_id', inplace=True)
    test_df.to_csv(join_path(dump_dir, 'submission.csv'),
                   header=True,
                   index=False)

    # per-feature split counts and gain share (percent of total gain)
    gain = model.feature_importance('gain')
    ft = pd.DataFrame({
        'feature': model.feature_name(),
        'split': model.feature_importance('split'),
        'gain': 100 * gain / gain.sum()
    }).sort_values('gain', ascending=False)
    ft.to_csv(join_path(dump_dir, 'feature_strength.csv'),
              header=True,
              index=False,
              sep='\t')