# Shared imports for the model scripts below. The project-local helpers
# (common_model_parser, scale_dataset, log_result, add_feature_importance,
# add_train_score, calculate_statistics, nn_model, calibration) are assumed
# to be importable from the project's own utility modules.
import json
import sys

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler


# Variant 1: XGBoost on a single train/valid split, resampling both splits
# to the target positive ratio before training.
def main():
    options = common_model_parser().parse_args()
    with open(options.config_file) as f:
        config = json.load(f)
    data = pd.read_csv(options.data_file)
    feature_columns = data.columns[data.columns != 'y']

    if options.train:
        train, valid = train_test_split(data, test_size=0.2, random_state=334)
        train = scale_dataset(data=train, target_positive_ratio=0.191)
        valid = scale_dataset(data=valid, target_positive_ratio=0.191)
        X_train, y_train = train[feature_columns], train['y']
        X_valid, y_valid = valid[feature_columns], valid['y']
        d_train = xgb.DMatrix(X_train, label=y_train)
        d_valid = xgb.DMatrix(X_valid, label=y_valid)
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        params = config['model']['params']
        bst = xgb.train(params=params['booster'], dtrain=d_train,
                        evals=watchlist, **params['train'])
        joblib.dump(bst, options.model_file)
        p_valid = bst.predict(d_valid)
        log_result(y_valid, p_valid, config, options.log_file)
        add_feature_importance(bst, options.log_file)
    else:
        bst = joblib.load(options.model_file)
        data['is_duplicate'] = bst.predict(xgb.DMatrix(data[feature_columns]))
        data[['is_duplicate']].to_csv(options.submission_file,
                                      index_label='test_id')
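# `scale_dataset` above is a project helper whose implementation is not shown
# in this section. The sketch below illustrates what it presumably does --
# resample the negative class until positives make up `target_positive_ratio`
# of the frame; the body is an assumption for illustration, not the project's
# actual code.
def scale_dataset_sketch(data, target_positive_ratio, random_state=334):
    positives = data[data['y'] == 1]
    negatives = data[data['y'] == 0]
    # Solve n_neg from n_pos / (n_pos + n_neg) == target_positive_ratio;
    # sample with replacement when negatives must be oversampled.
    n_neg = int(round(len(positives)
                      * (1 - target_positive_ratio) / target_positive_ratio))
    negatives = negatives.sample(n=n_neg, replace=n_neg > len(negatives),
                                 random_state=random_state)
    return pd.concat([positives, negatives]).sample(frac=1,
                                                    random_state=random_state)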
# Variant 2: LightGBM with 5-fold stratified CV; negatives are reweighted so
# the weighted positive ratio matches the configured target, and out-of-fold
# predictions are saved alongside the fold models.
def main():
    options = common_model_parser().parse_args()
    with open(options.config_file) as f:
        config = json.load(f)
    data = pd.read_csv(options.data_file)
    feature_columns = data.columns[data.columns != 'y']
    categorical_feature_columns = [
        feature for feature in feature_columns if feature.endswith('.cat')
    ]
    print("Categorical features: {}".format(categorical_feature_columns),
          file=sys.stderr)

    if options.train:
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=114514)
        # Weight w on negatives solving P / (P + w * N) == target ratio.
        negative_weight = (data.y.sum() / config['model']['target_positive_ratio']
                           - data.y.sum()) / (data.y == 0).sum()
        models = []
        stats = {"results": [], "config": config}
        data['prediction'] = np.zeros(data.shape[0])
        for train, valid in skf.split(data[feature_columns], data['y']):
            train_data = data.iloc[train]  # .ix was removed from pandas
            valid_data = data.iloc[valid]
            X_train, y_train = train_data[feature_columns], train_data['y']
            X_valid, y_valid = valid_data[feature_columns], valid_data['y']
            w_train = np.ones(X_train.shape[0])
            w_train[y_train == 0] *= negative_weight
            w_valid = np.ones(X_valid.shape[0])
            w_valid[y_valid == 0] *= negative_weight
            d_train = lgb.Dataset(data=X_train, label=y_train, weight=w_train,
                                  categorical_feature=categorical_feature_columns)
            d_valid = lgb.Dataset(data=X_valid, label=y_valid, weight=w_valid,
                                  categorical_feature=categorical_feature_columns)
            params = config['model']['params']
            gbm = lgb.train(params['booster'], d_train, valid_sets=d_valid,
                            **params['train'])
            models.append(gbm)
            p_train = gbm.predict(X_train, num_iteration=gbm.best_iteration)
            p_valid = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
            data.iloc[valid, data.columns.get_loc('prediction')] = p_valid
            stat = calculate_statistics(pred=p_valid, true=y_valid, weight=w_valid)
            stat['results']['train_log_loss'] = log_loss(y_train, p_train,
                                                         sample_weight=w_train)
            stats["results"].append(stat["results"])
        stats['sum_log_loss'] = sum(r['log_loss'] for r in stats['results'])
        joblib.dump(models, options.model_file)
        data[['prediction']].to_csv(options.model_file + '.train.pred',
                                    index=False)
        with open(options.log_file, 'w') as f:
            json.dump(stats, f, sort_keys=True, indent=4)
    else:
        models = joblib.load(options.model_file)
        preds = np.zeros((data.shape[0], len(models)))
        BATCH_SIZE = 300000
        # Predict in batches to bound memory on the large test set,
        # then average the fold models.
        for begin in range(0, data.shape[0], BATCH_SIZE):
            end = min(begin + BATCH_SIZE, data.shape[0])
            for i, gbm in enumerate(models):
                preds[begin:end, i] = gbm.predict(
                    data.iloc[begin:end][feature_columns])
        data['is_duplicate'] = preds.mean(axis=1)
        data[['is_duplicate']].to_csv(options.submission_file,
                                      index_label='test_id')
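# The `negative_weight` expression above solves, for a weight w applied to
# negatives,
#     P / (P + w * N) == target_positive_ratio,
# where P and N are the positive and negative counts, so the *weighted*
# positive ratio hits the target without discarding any rows. A small
# self-contained check (the example counts are illustrative only):
def negative_weight_for(P, N, target_positive_ratio):
    w = (P / target_positive_ratio - P) / N
    assert abs(P / (P + w * N) - target_positive_ratio) < 1e-12
    return w

# e.g. negative_weight_for(37, 63, 0.174) ~= 2.79: reweighting 63 negatives
# by 2.79 makes 37 positives exactly 17.4% of the weighted total.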
# Variant 3: XGBoost on a single stratified split; instead of resampling,
# negatives are reweighted toward the target positive ratio.
def main():
    options = common_model_parser().parse_args()
    with open(options.config_file) as f:
        config = json.load(f)
    data = pd.read_csv(options.data_file)
    feature_columns = data.columns[data.columns != 'y']

    if options.train:
        train, valid = train_test_split(data, test_size=0.2, random_state=334,
                                        stratify=data.y)
        X_train, y_train = train[feature_columns], train['y']
        X_valid, y_valid = valid[feature_columns], valid['y']
        negative_weight = (data.y.sum() / config['model']['target_positive_ratio']
                           - data.y.sum()) / (data.y == 0).sum()
        w_train = np.ones(X_train.shape[0])
        w_train[y_train == 0] *= negative_weight
        w_valid = np.ones(X_valid.shape[0])
        w_valid[y_valid == 0] *= negative_weight
        d_train = xgb.DMatrix(X_train, label=y_train, weight=w_train)
        d_valid = xgb.DMatrix(X_valid, label=y_valid, weight=w_valid)
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        params = config['model']['params']
        bst = xgb.train(params=params['booster'], dtrain=d_train,
                        evals=watchlist, **params['train'])
        joblib.dump(bst, options.model_file)
        p_train = bst.predict(d_train)
        p_valid = bst.predict(d_valid)
        log_result(y_valid, p_valid, config, options.log_file, weight=w_valid)
        add_feature_importance(bst, options.log_file)
        add_train_score(y_train=y_train, p_train=p_train,
                        log_file=options.log_file, weight=w_train)
    else:
        bst = joblib.load(options.model_file)
        data['is_duplicate'] = bst.predict(xgb.DMatrix(data[feature_columns]))
        data[['is_duplicate']].to_csv(options.submission_file,
                                      index_label='test_id')
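# All variants read hyper-parameters from a JSON config whose schema is not
# shown in this section. From the accesses above (config['model']['params']
# ['booster'] is forwarded as xgb.train's params, ...['train'] is expanded as
# keyword arguments, and config['model']['target_positive_ratio'] feeds the
# weight formula), a plausible file looks like this sketch -- every value is
# illustrative, not taken from the project:
EXAMPLE_CONFIG = {
    "model": {
        "target_positive_ratio": 0.174,
        "params": {
            "booster": {  # forwarded to xgb.train(params=...)
                "objective": "binary:logistic",
                "eval_metric": "logloss",
                "eta": 0.02,
                "max_depth": 7,
            },
            "train": {  # expanded as **kwargs to xgb.train
                "num_boost_round": 2000,
                "early_stopping_rounds": 50,
            },
        },
    },
}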
# Variant 4: LightGBM on a single stratified split with reweighted negatives;
# also dumps predictions on the full training file for later stacking.
def main():
    options = common_model_parser().parse_args()
    with open(options.config_file) as f:
        config = json.load(f)
    data = pd.read_csv(options.data_file)
    feature_columns = data.columns[data.columns != 'y']
    categorical_feature_columns = [
        feature for feature in feature_columns if feature.endswith('.cat')
    ]
    print("Categorical features: {}".format(categorical_feature_columns),
          file=sys.stderr)

    if options.train:
        train, valid = train_test_split(data, test_size=0.2, random_state=334,
                                        stratify=data.y)
        X_train, y_train = train[feature_columns], train['y']
        X_valid, y_valid = valid[feature_columns], valid['y']
        negative_weight = (data.y.sum() / config['model']['target_positive_ratio']
                           - data.y.sum()) / (data.y == 0).sum()
        w_train = np.ones(X_train.shape[0])
        w_train[y_train == 0] *= negative_weight
        w_valid = np.ones(X_valid.shape[0])
        w_valid[y_valid == 0] *= negative_weight
        d_train = lgb.Dataset(data=X_train, label=y_train, weight=w_train,
                              categorical_feature=categorical_feature_columns)
        d_valid = lgb.Dataset(data=X_valid, label=y_valid, weight=w_valid,
                              categorical_feature=categorical_feature_columns)
        params = config['model']['params']
        gbm = lgb.train(params['booster'], d_train, valid_sets=d_valid,
                        **params['train'])
        joblib.dump(gbm, options.model_file)
        p_train = gbm.predict(X_train, num_iteration=gbm.best_iteration)
        p_valid = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
        log_result(y_valid, p_valid, config, options.log_file, weight=w_valid)
        add_feature_importance(gbm, options.log_file)
        add_train_score(y_train=y_train, p_train=p_train,
                        log_file=options.log_file, weight=w_train)
        # Predictions on the full training file, for stacking.
        temp_df = pd.DataFrame()
        temp_df['is_duplicate'] = gbm.predict(data[feature_columns])
        temp_df[['is_duplicate']].to_csv(options.submission_file + ".train.pred.csv",
                                         index_label='id')
    else:
        gbm = joblib.load(options.model_file)
        data['is_duplicate'] = gbm.predict(data[feature_columns])
        data[['is_duplicate']].to_csv(options.submission_file,
                                      index_label='test_id')
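# LightGBM's `categorical_feature` argument expects the named columns to
# contain non-negative integer codes rather than raw strings, so the '.cat'
# columns are presumably integer-encoded upstream of these scripts. A
# hypothetical sketch of that encoding step (the helper name is ours, not
# the project's):
def encode_categoricals(df):
    for col in df.columns:
        if col.endswith('.cat'):
            # .cat.codes yields -1 for NaN, which LightGBM treats as missing.
            df[col] = df[col].astype('category').cat.codes
    return df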
# Variant 5: Keras neural net with 5-fold stratified CV. This variant
# hard-codes the class weights rather than deriving them from
# config['model']['target_positive_ratio'], and calibrates raw predictions
# before averaging the folds.
def main():
    options = common_model_parser().parse_args()
    with open(options.config_file) as f:
        config = json.load(f)
    data = pd.read_csv(options.data_file)
    feature_columns = data.columns[data.columns != 'y']
    class_weight = {0: 1.309028344, 1: 0.472001959}

    if options.train:
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=114514)
        scalers = []
        stats = {"results": [], "config": config}
        data['prediction'] = np.zeros(data.shape[0])
        for i, (train, valid) in enumerate(skf.split(data[feature_columns],
                                                     data['y'])):
            train_data = data.iloc[train]  # .ix was removed from pandas
            valid_data = data.iloc[valid]
            X_train, y_train = train_data[feature_columns], train_data['y']
            X_valid, y_valid = valid_data[feature_columns], valid_data['y']
            w_train = np.full(X_train.shape[0], 0.472001959)
            w_train[y_train == 0] = 1.309028344
            w_valid = np.full(X_valid.shape[0], 0.472001959)
            w_valid[y_valid == 0] = 1.309028344
            # The net needs standardized inputs; keep one scaler per fold.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train.values)
            X_valid = scaler.transform(X_valid.values)
            model = nn_model(X_train)
            bst_model_path = options.model_file + '.h5'
            model.fit(X_train, y_train,
                      validation_data=(X_valid, y_valid, w_valid),
                      epochs=200, batch_size=2048, shuffle=True,
                      class_weight=class_weight,
                      callbacks=[
                          EarlyStopping(monitor='val_loss', patience=10),
                          ModelCheckpoint(bst_model_path, save_best_only=True,
                                          save_weights_only=True),
                      ])
            p_valid = calibration(
                model.predict(X_valid, batch_size=8192, verbose=1).ravel())
            p_train = calibration(
                model.predict(X_train, batch_size=8192, verbose=1).ravel())
            # NOTE: saves the final-epoch model; the best weights per early
            # stopping live in bst_model_path.
            model.save(options.model_file + '.{}.h5'.format(i))
            scalers.append(scaler)
            data.iloc[valid, data.columns.get_loc('prediction')] = p_valid
            stat = calculate_statistics(pred=p_valid, true=y_valid,
                                        weight=w_valid)
            stat['results']['train_log_loss'] = log_loss(y_train, p_train,
                                                         sample_weight=w_train)
            stats["results"].append(stat["results"])
        joblib.dump(scalers, options.model_file + '.scaler')
        data[['prediction']].to_csv(options.model_file + '.train.pred',
                                    index=False)
        with open(options.log_file, 'w') as f:
            json.dump(stats, f, sort_keys=True, indent=4)
    else:
        scalers = joblib.load(options.model_file + '.scaler')
        preds = np.zeros((data.shape[0], len(scalers)))
        for i, scaler in enumerate(scalers):
            model = load_model(options.model_file + '.{}.h5'.format(i))
            X = scaler.transform(data[feature_columns])
            preds[:, i] = calibration(
                model.predict(X, batch_size=8192, verbose=1).ravel())
        data['is_duplicate'] = preds.mean(axis=1)
        data[['is_duplicate']].to_csv(options.submission_file,
                                      index_label='test_id')
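# `calibration` is a project helper whose body is not shown here. Given the
# hard-coded class weights above (~0.472 on positives, ~1.309 on negatives,
# i.e. the ratios between the assumed test and train class priors), it
# presumably applies the standard prior-correction formula that maps raw
# sigmoid outputs back to the test distribution. An illustrative sketch,
# not the project's actual code:
def calibration_sketch(p, a=0.472001959, b=1.309028344):
    # a = r_test / r_train, b = (1 - r_test) / (1 - r_train)
    return a * p / (a * p + b * (1.0 - p))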