def main():
    baskets.time_me.set_default_mode('print')
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('folds', nargs='+')
    parser.add_argument('-b', '--boost', help='Tag of model to boost from')
    parser.add_argument('--testmode', action='store_true')
    parser.add_argument('--uniprods', action='store_true')
    parser.add_argument('--tag', default='pairs')
    args = parser.parse_args()
    assert args.uniprods
    with time_me('Loaded pair lookup'):
        lookup = count_pairs.load_pair_lookup()
    for fold in args.folds:
        users = iterate_wrapped_users(fold, ktest=args.testmode)
        if args.boost:
            logits = user_wrapper.logits_for_tag(args.boost, fold)
        else:
            logits = None
        with time_me('Vectorized'):
            X, y = vectorize(users, lookup, logits, args.uniprods)
        logging.info('Loaded X with shape {} and y with shape {}'.format(
            X.shape, y.shape))
        logging.info('Mean # of non-zero features per instance = {:.1f}'.format(
            X.sum(axis=1).mean()))
        save_fold(fold, X, y, args.tag)
def vectorize_fold(fold, tags, meta_df, use_metafeats=True):
    with time_me('Loaded pdicts'):
        scoreses = [common.pdict_for_tag(tag, fold) for tag in tags]
    df = meta_df[meta_df['fold'] == fold]
    assert len(df)
    y = df['label']
    n_predictors = len(scoreses)
    with time_me('Munged scores for {} predictors'.format(n_predictors), mode='print'):
        # TODO: could use the logit loading fn added to user_wrapper module
        scores = munge_scoreses(scoreses, df)
    if not use_metafeats:
        X = scores
    else:
        meta_cols = metavectorize.metafeature_columns
        meta = df[meta_cols].values
        # Special f_0 dummy meta feature for learning a vanilla weight term per predictor
        metafeats = np.hstack([np.ones((len(df), 1)), meta])
        # (Gave up on expressing this with np.einsum. Worth noting that
        # sklearn.preprocessing has a 'PolynomialFeatures' utility that might
        # have been useful here. But this is fine.)
        n_metafeats = metafeats.shape[1]
        logging.info('{} predictors x {} metafeatures -> {} coefs'.format(
            n_predictors, n_metafeats, n_predictors * n_metafeats))
        # X is 'metafeat major': the first n_p values of each vector are the raw
        # scores for each predictor, followed by each predictor's score
        # multiplied by the first metafeature, and so on. (A toy demo follows
        # after this function.)
        X = np.tile(scores, n_metafeats) * np.repeat(metafeats, n_predictors, axis=1)
    return X, y
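
# A minimal standalone demo (hypothetical sizes, not part of the pipeline) of
# the tile/repeat trick used in vectorize_fold above. With n_predictors=2 and
# n_metafeats=3, np.tile cycles the scores while np.repeat stretches each
# metafeature column, so their elementwise product enumerates every
# score*metafeature pair in 'metafeat major' order.
def _demo_metafeat_crossing():
    scores = np.array([[2., 3.]])            # one instance, two predictor scores
    metafeats = np.array([[1., 10., 100.]])  # f_0 dummy plus two real metafeats
    # tile:   [[2., 3., 2., 3., 2., 3.]]
    # repeat: [[1., 1., 10., 10., 100., 100.]]
    X = np.tile(scores, 3) * np.repeat(metafeats, 2, axis=1)
    assert (X == np.array([[2., 3., 20., 30., 200., 300.]])).all()
    return X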
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', nargs='+')
    parser.add_argument('--dest-tag', default='stacked',
        help='Tag for generated pdict (default: "stacked")')
    parser.add_argument('--fold', default='test')
    args = parser.parse_args()

    metavec = load_metavectors(args.fold)
    #clf = train.load_model()
    clf = joblib.load('model.pkl')
    with time_me('Vectorized fold {}'.format(args.fold)):
        # TODO: this fn is not a thing?
        X, y = train.vectorize_fold(args.fold, args.tags, metavec)
    if hasattr(clf, 'predict_proba'):
        # predict_proba returns an array of shape (n, 2), where each length-2
        # row holds the probability of the negative and positive class; we only
        # want the positive column.
        probs = clf.predict_proba(X)[:, 1]
    else:
        # e.g. LinearSVC has no predict_proba. expit(x) = 1/(1+exp(-x)) squashes
        # decision scores into (0, 1): order-preserving, though not calibrated
        # probabilities.
        scores = clf.decision_function(X)
        probs = expit(scores)
    pdict = pdictify(probs, metavec)
    common.save_pdict_for_tag(args.dest_tag, pdict, args.fold)

if __name__ == '__main__':
    with time_me(mode='print'):
        main()
def main():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tag')
    parser.add_argument('--train-recordfile', default='train',
        help='identifier for file with the users to train on (default: train). '
        'Deprecated: specify in hps.')
    parser.add_argument('-n', '--n-rounds', type=int, default=50,
        help='Number of rounds of boosting. Deprecated: specify in hp config file.')
    parser.add_argument('--weight', action='store_true',
        help='Whether to do per-instance weighting. Deprecated: specify in hps.')
    args = parser.parse_args()
    try:
        hps = hypers.hps_for_tag(args.tag)
    except hypers.NoHpsDefinedException:
        logging.warn('No hps found for tag {}. Creating and saving some.'.format(args.tag))
        hps = hypers.get_default_hparams()
        hps.train_file = args.train_recordfile
        hps.rounds = args.n_rounds
        hps.weight = args.weight
        hypers.save_hps(args.tag, hps)
    validate_hps(hps)
    dataset = Dataset(hps.train_file, hps)
    with time_me(mode='stderr'):
        train(dataset, args.tag, hps)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', metavar='tag', nargs='+')
    parser.add_argument('-t', '--thresh', type=float, default=.2,
        help='Probability threshold for taking a product when using the '
        'threshold predictor (default: .2)')
    parser.add_argument('--fold', default='test.tfrecords',
        help='tfrecords file with the users to test on (default: test.tfrecords)')
    parser.add_argument('--mc-trials', type=int, default=50,
        help='Number of rounds of monte carlo sim to perform per '
        'product/threshold (default: 50)')
    parser.add_argument('-n', '--n-users', type=int,
        help='Limit number of users tested on (default: no limit)')
    parser.add_argument('--baseline', action='store_true',
        help='Run a dumb predict-previous-order baseline for comparison')
    parser.add_argument('--tp', action='store_true',
        help='Run a basic thresholded predictor for each tag using --thresh threshold')
    parser.add_argument('--mc', action='store_true', dest='montecarlo', default=True,
        help='Run a monte-carlo thresh predictor per tag')
    parser.add_argument('--no-mc', action='store_false', dest='montecarlo',
        help="Don't run a monte-carlo thresh predictor per tag")
    parser.add_argument('--save', action='store_true',
        help='Serialize predictions to a file')
    parser.add_argument('--quick', action='store_true', help='Cut some corners')
    args = parser.parse_args()

    predictors = {}
    if args.baseline:
        predictors['baseline'] = pred.PreviousOrderPredictor()
    for tag in args.tags:
        try:
            pmap = common.pdict_for_tag(tag, args.fold)
        except common.NoPdictException as err:
            logging.warning(err.message + "\nPrecomputing and saving probabilities")
            # Not clear whether this 'recovery' mode should be on by default.
            # Might cause more problems than it solves: not every tag belongs
            # to an rnn model.
            with time_me('Precomputed probabilities', mode='stderr'):
                pmap = precompute_probs.precompute_probs_for_tag(tag, args.fold)
        if args.tp:
            predictors['{}-tp'.format(tag)] = pred.ThresholdPredictor(pmap, args.thresh)
        if args.montecarlo:
            predictors['{}-mc'.format(tag)] = pred.HybridThresholdPredictor(
                pmap,
                ntrials=args.mc_trials,
                save=args.save,
                optimization_level=(0 if args.quick else 10),
            )
    assert predictors

    user_iterator = iterate_wrapped_users(args.fold)
    judge = evaluator.Evaluator(user_iterator)
    # TODO: would be real nice to use tensorboard to look at dist. of
    # probabilities/logits/fscores/precisions, stuff like that
    results = judge.evaluate(predictors, limit=args.n_users, save=args.save)

    for pname, res in results.iteritems():
        print '{}:'.format(pname)
        df = res.to_df()
        print df.mean()

if __name__ == '__main__':
    with time_me('Finished evaluation', mode='stderr'):
        main()
def train(traindat, tag, hps):
    valdat = Dataset('validation', hps, mode=Mode.eval)
    # TODO: try set_base_margin (sketched after this function)
    # (https://github.com/dmlc/xgboost/blob/master/demo/guide-python/boost_from_prediction.py)
    with time_me('Made training dmatrix', mode='stderr'):
        dtrain = traindat.as_dmatrix()

    def quick_fscore(preds, _notused_dtrain):
        global counter
        counter += 1
        # Flip the 0 to 1 to only compute the (slow) fscore on every 5th call.
        if 0 and counter % 5 != 0:
            return 'fscore', 0.0
        with time_me('calculated validation fscore', mode='print'):
            user_counts = defaultdict(lambda: dict(tpos=0, fpos=0, fneg=0))
            uids = valdat.uids
            labels = dval.get_label()
            for i, prob in enumerate(preds):
                uid = uids[i]
                pred = prob >= THRESH
                label = labels[i]
                if pred and label:
                    user_counts[uid]['tpos'] += 1
                elif pred and not label:
                    user_counts[uid]['fpos'] += 1
                elif label and not pred:
                    user_counts[uid]['fneg'] += 1
            fscore_sum = 0
            for uid, res in user_counts.iteritems():
                numerator = 2 * res['tpos']
                denom = numerator + res['fpos'] + res['fneg']
                if denom == 0:
                    # Convention: no true or predicted positives = perfect score.
                    fscore = 1.0
                else:
                    # Float division (avoid py2 integer division).
                    fscore = float(numerator) / denom
                fscore_sum += fscore
            return 'fscore', fscore_sum / len(user_counts)

    dval = valdat.as_dmatrix()
    # If you pass in more than one value to evals, early stopping uses the
    # last one. Because why not.
    watchlist = [(dtrain, 'train'), (dval, 'validation')]
    #watchlist = [(dval, 'validation'),]
    xgb_params = hypers.xgb_params_from_hps(hps)
    evals_result = {}
    t0 = time.time()
    model = xgb.train(xgb_params, dtrain, hps.rounds, evals=watchlist,
        early_stopping_rounds=hps.early_stopping_rounds,
        evals_result=evals_result)  # , feval=quick_fscore, maximize=True)
    t1 = time.time()

    model_path = common.resolve_xgboostmodel_path(tag)
    model.save_model(model_path)
    preds = model.predict(dval)
    _, fscore = quick_fscore(preds, None)
    logging.info('Final validation (quick) fscore = {}'.format(fscore))
    resultsdict = dict(fscore=fscore, evals=evals_result, duration=t1 - t0)
    res_path = os.path.join(common.XGBOOST_DIR, 'results', tag + '.pickle')
    with open(res_path, 'w') as f:
        pickle.dump(resultsdict, f)
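
# A sketch (untested) of the set_base_margin TODO in train() above, following
# the linked xgboost demo. `base_model` is a hypothetical previously-trained
# Booster: seeding dtrain with its raw margins makes the new rounds boost from
# its predictions rather than from scratch.
#
#   margin = base_model.predict(dtrain, output_margin=True)
#   dtrain.set_base_margin(margin)
#   model = xgb.train(xgb_params, dtrain, hps.rounds, evals=watchlist)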
def main():
    baskets.time_me.set_default_mode('print')
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', nargs='+')
    parser.add_argument('-f', '--train-fold', default='train')
    parser.add_argument('--validation-fold', help='Fold for validation (default: None)')
    parser.add_argument('--no-metafeats', action='store_true')
    parser.add_argument('--svm', action='store_true')
    args = parser.parse_args()
    with time_me("Loaded metavectors"):
        meta_df = pd.read_pickle(METAVECTORS_PICKLEPATH)
    with time_me("Made training vectors"):
        X, y = vectorize_fold(args.train_fold, args.tags, meta_df,
            use_metafeats=not args.no_metafeats)
    if args.svm:
        # SVC is slow (sklearn docs warn it's hard to scale past ~20k examples),
        # hence the linear variant rather than e.g.:
        #   model = sklearn.svm.SVC(verbose=True, probability=True, C=1.0)
        model = sklearn.svm.LinearSVC(penalty='l2', loss='hinge', C=.001, verbose=1)
    else:
        # TODO: tune C
        model = LogisticRegression(verbose=1)
    with time_me('Trained model', mode='print'):
        model.fit(X, y)
    model_fname = 'model.pkl'
    joblib.dump(model, model_fname)
    return model
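
# An alternative sketch (not what this script does): wrapping LinearSVC in
# sklearn's CalibratedClassifierCV would give the stacked model a real
# predict_proba, instead of relying on the expit fallback at prediction time.
#
#   from sklearn.calibration import CalibratedClassifierCV
#   model = CalibratedClassifierCV(sklearn.svm.LinearSVC(C=.001), cv=3)
#   model.fit(X, y)  # model.predict_proba(X) now works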
def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tag')
    parser.add_argument('--recordfile', default='train.tfrecords',
        help='tfrecords file with the users to train on (default: train.tfrecords)')
    parser.add_argument('-r', '--resume', metavar='TAG',
        help='Load existing checkpoint with the given tag name and resume training')
    parser.add_argument('--finetune', action='store_true')
    parser.add_argument('--logdir', default='logs')
    parser.add_argument('--toy', action='store_true')
    args = parser.parse_args()
    hps = hypers.hps_for_tag(args.tag, save_full=True)
    logdir = args.logdir
    if args.toy:
        hps.num_steps = 100
        hps.log_every = 50
        logdir = 'toylogs'

    tf.logging.info('Building model')
    train_dat = BasketDataset(hps, args.recordfile)
    model = RNNModel(hps, train_dat)
    eval_hps = hypers.as_eval(hps)
    eval_dat = BasketDataset(eval_hps)
    eval_model = RNNModel(eval_hps, eval_dat, reuse=True)
    # TODO: I think there's something fancy I could do to use a single Dataset
    # for both train and validation (reinitializable iterator? see the sketch
    # below). But probably fine this way.
    sess = tf.InteractiveSession()
    if args.resume:
        tf.logging.info('Loading saved weights')
        utils.load_checkpoint_for_tag(args.resume, sess)
    else:
        sess.run(tf.global_variables_initializer())
    with time_me("Completed training"):
        train(sess, model, args.tag, eval_model, logdir)
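
# A sketch (untested, plain TF 1.x tf.data API) of the reinitializable-iterator
# idea from the TODO above: one iterator built from the common structure of the
# two datasets, re-pointed at train or validation data via its init ops.
#
#   train_ds = tf.data.TFRecordDataset('train.tfrecords')
#   val_ds = tf.data.TFRecordDataset('validation.tfrecords')
#   it = tf.data.Iterator.from_structure(train_ds.output_types, train_ds.output_shapes)
#   next_element = it.get_next()  # feeds both the training and eval graphs
#   train_init = it.make_initializer(train_ds)
#   val_init = it.make_initializer(val_ds)
#   # sess.run(train_init) before training; sess.run(val_init) before eval.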
def main():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', metavar='tag', nargs='+')
    parser.add_argument('--fold', default='test',
        help='identifier for file with the users to test on (default: test)')
    args = parser.parse_args()
    for model_tag in args.tags:
        hps = hypers.hps_for_tag(model_tag)
        dataset = Dataset(args.fold, hps, mode=Mode.inference)
        path = common.resolve_xgboostmodel_path(model_tag)
        logging.info('Loading model with tag {}'.format(model_tag))
        model = xgb.Booster(model_file=path)
        logging.info('Computing probs for tag {}'.format(model_tag))
        with time_me('Computed probs for {}'.format(model_tag), mode='stderr'):
            pdict = get_pdict(model, dataset)
        logging.info('Got probs for {} users'.format(len(pdict)))
        # TODO: might want to enforce some namespace separation between
        # rnn-generated pdicts and ones coming from xgboost models?
        common.save_pdict_for_tag(model_tag, pdict, args.fold)
    # Hack because of silly reasons.
    if userfold == 'validation_full':
        userfold = 'validation'
    common.save_pdict_for_tag(tag, probmap, userfold)
    sess.close()
    tf.reset_default_graph()
    return probmap

def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', nargs='+')
    parser.add_argument('--fold', default='test.tfrecords',
        help='fold of users to compute probs for (should correspond to the '
        'name of a vector file)')
    args = parser.parse_args()
    for tag in args.tags:
        tf.logging.info('Computing probs for tag {}'.format(tag))
        with time_me('Computed probs for {}'.format(tag)):
            precompute_probs_for_tag(tag, args.fold)

if __name__ == '__main__':
    with time_me():
        main()
nprods = constants.N_PRODUCTS
if BOOST:
    boost_coef = coef[0]
    print "Coef on boosted score: {:.3f}".format(boost_coef)
    offset = 1
else:
    offset = 0
uniprod_coefs = coef[offset:nprods + offset]
biprod_coefs = coef[nprods + offset:]
print "{:,} uni prod feats, {:,} bi prod feats".format(nprods, len(biprod_coefs))

for coefs, name in [(uniprod_coefs, 'uniprod'), (biprod_coefs, 'biprod')]:
    nz = coefs[coefs != 0]
    print "{} {} features are non-zero".format(len(nz), name)
    print "Dist of nonzero {} features...".format(name)
    s = pd.Series(nz)
    print s.describe()

with time_me("Loaded feature lookup", mode='print'):
    featmap = count_pairs.load_pair_lookup()
pair_lookup = {v: k for (k, v) in featmap.iteritems()}
prod_df = utils.load_product_df()
poke_uniprods(uniprod_coefs, prod_df)
poke_biprods(biprod_coefs, pair_lookup, prod_df, 10)
    t0 = time.time()
    user_iterator = user_wrapper.iterate_wrapped_users(args.recordfile, ktest=True)
    outname = 'submission_{}.csv'.format(args.save_tag)
    f = open(outname, 'w')
    writer = csv.DictWriter(f, fieldnames=['order_id', 'products'])
    writer.writeheader()

    def predcol(pids):
        stringify = lambda pid: 'None' if pid == -1 else str(pid)
        return ' '.join(map(stringify, pids))

    pmap = common.pdict_for_tag(args.tag, args.recordfile)
    predictor = pred.HybridThresholdPredictor(pmap, ntrials=args.mc_trials)
    for i, user in enumerate(user_iterator):
        predicted = predictor.predict_last_order(user)
        oid = user.user.testorder.orderid
        row = {'order_id': oid, 'products': predcol(predicted)}
        writer.writerow(row)
        if args.n_users and i >= args.n_users:
            break
    f.close()
    t1 = time.time()
    print "Finished predictions in {:.1f}s".format(t1 - t0)

if __name__ == '__main__':
    with time_me(mode="print"):
        main()
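
# For reference, the generated csv follows the Kaggle Instacart submission
# format that predcol above produces: space-separated product ids per order,
# with the literal string 'None' standing in for "no reordered products".
# E.g. (made-up ids):
#
#   order_id,products
#   17,None
#   34,39276 29259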