def main():
  baskets.time_me.set_default_mode('print')
  logging.basicConfig(level=logging.INFO)
  parser = argparse.ArgumentParser()
  parser.add_argument('folds', nargs='+')
  parser.add_argument('-b', '--boost', help='Tag of model to boost from')
  parser.add_argument('--testmode', action='store_true')
  parser.add_argument('--uniprods', action='store_true')
  parser.add_argument('--tag', default='pairs')
  args = parser.parse_args()
  assert args.uniprods

  with time_me('Loaded pair lookup'):
    lookup = count_pairs.load_pair_lookup()
  for fold in args.folds:
    users = iterate_wrapped_users(fold, ktest=args.testmode)
    if args.boost:
      logits = user_wrapper.logits_for_tag(args.boost, fold)
    else:
      logits = None
    with time_me('Vectorized'):
      X, y = vectorize(users, lookup, logits, args.uniprods)
    logging.info('Loaded X with shape {} and y with shape {}'.format(
      X.shape, y.shape))
    logging.info('Mean # of non-zero features per instance = {:.1f}'.format(
      X.sum(axis=1).mean()
      ))
    save_fold(fold, X, y, args.tag)
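One caveat on the "mean non-zero features" log line above: X.sum(axis=1) only equals the non-zero count when the features are binary indicators. If X is a scipy sparse matrix, here is a tiny illustration (not from the source project) of counting non-zeros directly:

import numpy as np
import scipy.sparse as sp

X_demo = sp.csr_matrix(np.array([[0., 1., 2.],
                                 [0., 0., 3.]]))
print(X_demo.sum(axis=1).mean())      # 3.0  -- sums the stored values
print(X_demo.getnnz(axis=1).mean())   # 1.5  -- actual mean non-zeros per row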
Example #2
def vectorize_fold(fold, tags, meta_df, use_metafeats=True):
    with time_me('Loaded pdicts'):
        scoreses = [common.pdict_for_tag(tag, fold) for tag in tags]
    df = meta_df[meta_df['fold'] == fold]
    assert len(df)
    y = df['label']
    n_predictors = len(scoreses)
    with time_me('Munged scores for {} predictors'.format(n_predictors),
                 mode='print'):
        # TODO: could use the logit loading fn added to user_wrapper module
        scores = munge_scoreses(scoreses, df)
    if not use_metafeats:
        X = scores
    else:
        meta_cols = metavectorize.metafeature_columns
        meta = df[meta_cols].values
        # Special f_0 dummy meta feature for learning vanilla weight term per predictor
        metafeats = np.hstack([np.ones((len(df), 1)), meta])
        # Oh f**k this, I've spent too long trying to understand np.einsum...
        # (Worth noting that sklearn.preprocessing has a 'PolynomialFeatures' utility
        # that might have been useful here. But this is fine.)
        n_metafeats = metafeats.shape[1]
        logging.info('{} predictors x {} metafeatures -> {} coefs'.format(
            n_predictors, n_metafeats, n_predictors * n_metafeats))
        # X is 'metafeat major'. i.e. the first n_p values for each vector are the
        # raw scores for each predictor, they're followed by each predictor's score
        # multiplied by the first metafeature and so on.
        X = np.tile(scores, n_metafeats) * np.repeat(
            metafeats, n_predictors, axis=1)
    return X, y
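To make the "metafeat major" comment above concrete, here is a tiny standalone check (illustrative values, not from the source project) of the tile/repeat trick with 2 predictors and 3 metafeatures:

import numpy as np

scores = np.array([[0.2, 0.9]])            # (n, n_predictors)
metafeats = np.array([[1.0, 0.5, -1.0]])   # (n, n_metafeats); first column is the dummy f_0
n_predictors = scores.shape[1]
n_metafeats = metafeats.shape[1]
X = np.tile(scores, n_metafeats) * np.repeat(metafeats, n_predictors, axis=1)
# X == [[0.2, 0.9, 0.1, 0.45, -0.2, -0.9]]
# i.e. [s1*m1, s2*m1, s1*m2, s2*m2, s1*m3, s2*m3], with m1 the dummy 1.0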
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('tags', nargs='+')
  parser.add_argument('--dest-tag', default='stacked', 
      help='Tag for generated pdict (default: "stacked")')
  parser.add_argument('--fold', default='test')
  args = parser.parse_args()

  metavec = load_metavectors(args.fold)

  #clf = train.load_model()
  clf = joblib.load('model.pkl')

  with time_me('Vectorized fold {}'.format(args.fold)):
    # TODO: this fn is not a thing?
    X, y = train.vectorize_fold(args.fold, args.tags, metavec)

  if hasattr(clf, 'predict_proba'):
    probs = clf.predict_proba(X)
    # returns an array of shape (n, 2), where each len-2 subarray
    # has the probability of the negative and positive classes. which is silly.
    probs = probs[:,1]
  else:
    scores = clf.decision_function(X)
    probs = expit(scores)

  pdict = pdictify(probs, metavec)
  common.save_pdict_for_tag(args.dest_tag, pdict, args.fold)
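For a plain binary LogisticRegression the two branches above agree: predict_proba(X)[:, 1] is exactly the logistic sigmoid of decision_function(X). A small self-contained check (illustrative data only):

import numpy as np
from scipy.special import expit
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 5)
y_demo = (X_demo[:, 0] > 0).astype(int)
clf_demo = LogisticRegression().fit(X_demo, y_demo)
assert np.allclose(clf_demo.predict_proba(X_demo)[:, 1],
                   expit(clf_demo.decision_function(X_demo)))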
Example #4
def main():
  logging.basicConfig(level=logging.INFO)
  parser = argparse.ArgumentParser()
  parser.add_argument('tag')
  parser.add_argument('--train-recordfile', default='train', 
      help='identifier for file with the users to train on (default: train). deprecated: specify in hps...')
  parser.add_argument('-n', '--n-rounds', type=int, default=50,
      help='Number of rounds of boosting. Deprecated: specify this in hp config file')
  parser.add_argument('--weight', action='store_true',
      help='Whether to do per-instance weighting. Deprecated: specify in hps')
  args = parser.parse_args()

  try:
    hps = hypers.hps_for_tag(args.tag)
  except hypers.NoHpsDefinedException:
    logging.warn('No hps found for tag {}. Creating and saving some.'.format(args.tag))
    hps = hypers.get_default_hparams()
    hps.train_file = args.train_recordfile
    hps.rounds = args.n_rounds
    hps.weight = args.weight
    hypers.save_hps(args.tag, hps)
  validate_hps(hps)
  dataset = Dataset(hps.train_file, hps)
  with time_me(mode='stderr'):
    train(dataset, args.tag, hps)
Example #5
def quick_fscore(preds, _notused_dtrain):
  global counter
  counter += 1
  if 0 and counter % 5 != 0:
    return 'fscore', 0.0
  with time_me('calculated validation fscore', mode='print'):
    user_counts = defaultdict(lambda : dict(tpos=0, fpos=0, fneg=0))
    uids = valdat.uids
    labels = dval.get_label()
    for i, prob in enumerate(preds):
      uid = uids[i]
      pred = prob >= THRESH
      label = labels[i]
      if pred and label:
        user_counts[uid]['tpos'] += 1
      elif pred and not label:
        user_counts[uid]['fpos'] += 1
      elif label and not pred:
        user_counts[uid]['fneg'] += 1
    fscore_sum = 0
    for uid, res in user_counts.iteritems():
      numerator = 2 * res['tpos']
      denom = numerator + res['fpos'] + res['fneg']
      if denom == 0:
        fscore = 1
      else:
        fscore = numerator / denom
      fscore_sum += fscore
    return 'fscore', fscore_sum / len(user_counts)
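The per-user metric above is the standard F1 rewritten as 2*TP / (2*TP + FP + FN), averaged over users, with an empty user (denominator 0) counted as a perfect 1. A tiny standalone version of the same computation, assuming true division:

from __future__ import division

def user_f1(tpos, fpos, fneg):
    # F1 = 2*TP / (2*TP + FP + FN); defined as 1 when there is nothing to
    # predict and nothing was predicted.
    denom = 2 * tpos + fpos + fneg
    return 1.0 if denom == 0 else 2 * tpos / denom

assert abs(user_f1(2, 1, 1) - 2 / 3.) < 1e-9   # 4/6
assert user_f1(0, 0, 0) == 1.0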
Example #6
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('tags', metavar='tag', nargs='+')
  parser.add_argument('-t', '--thresh', default=.2, help='Probability threshold '+
      'for taking a product when using threshold predictor (default=.2)', type=float)
  parser.add_argument('--fold', default='test.tfrecords', 
      help='tfrecords file with the users to test on (default: test.tfrecords)')
  parser.add_argument('--mc-trials', type=int, default=50,
      help='Number of rounds of monte carlo sim to perform per product/threshold (default:50)')
  parser.add_argument('-n', '--n-users', type=int, 
      help='Limit number of users tested on (default: no limit)')
  parser.add_argument('--baseline', action='store_true', 
      help='Run a dumb baseline predict-previous-order predictor for comparison')
  parser.add_argument('--tp', action='store_true', 
      help='Run a basic thresholded predictor for each tag using --thresh threshold')
  parser.add_argument('--mc', action='store_true', dest='montecarlo', default=True,
      help='Run a monte-carlo thresh predictor per tag')
  parser.add_argument('--no-mc', action='store_false', dest='montecarlo',
      help='Don\'t run a monte-carlo thresh predictor per tag')
  parser.add_argument('--save', action='store_true', help='Serialize predictions to a file')
  parser.add_argument('--quick', action='store_true', help='Cut some corners')
  args = parser.parse_args()
  
  predictors = {}
  if args.baseline:
    predictors['baseline'] = pred.PreviousOrderPredictor()

  for tag in args.tags:
    try:
      pmap = common.pdict_for_tag(tag, args.fold)
    except common.NoPdictException as err:
      raise
      logging.warning(err.message + "\nPrecomputing and saving probabilities")
      # Not clear whether this 'recovery' mode should be on by default. Might cause more problems than it solves.
      # Not every tag belongs to an rnn model.
      with time_me('Precomputed probabilities', mode='stderr'):
        pmap = precompute_probs.precompute_probs_for_tag(tag, args.fold)
    if args.tp:
      predictors['{}-tp'.format(tag)] = pred.ThresholdPredictor(pmap, args.thresh)
    if args.montecarlo:
      predictors['{}-mc'.format(tag)] = \
          pred.HybridThresholdPredictor(pmap, 
              ntrials=args.mc_trials, 
              save=args.save,
              optimization_level=(0 if args.quick else 10)
              )

  assert predictors

  user_iterator = iterate_wrapped_users(args.fold)
  judge = evaluator.Evaluator(user_iterator)
  # TODO: would be real nice to use tensorboard to look at dist. of
  # probabilities/logits/fscores/precisions stuff like that
  results = judge.evaluate(predictors, limit=args.n_users, save=args.save)

  for pname, res in results.iteritems():
    print '{}:'.format(pname)
    df = res.to_df()
    print df.mean()
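pred.ThresholdPredictor itself isn't shown here; conceptually it keeps every product whose predicted reorder probability is at least --thresh. A rough sketch of that idea, assuming a per-user {product_id: probability} mapping (the actual pdict structure in the project may differ):

def threshold_predict(prob_by_pid, thresh=0.2):
    # Keep every product whose predicted probability clears the threshold.
    return sorted(pid for pid, p in prob_by_pid.items() if p >= thresh)

assert threshold_predict({101: 0.9, 202: 0.05, 303: 0.4}) == [101, 303]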
Example #7
def train(traindat, tag, hps):
  valdat = Dataset('validation', hps, mode=Mode.eval)
  # TODO: try set_base_margin (https://github.com/dmlc/xgboost/blob/master/demo/guide-python/boost_from_prediction.py)
  with time_me('Made training dmatrix', mode='stderr'):
    dtrain = traindat.as_dmatrix()
  def quick_fscore(preds, _notused_dtrain):
    global counter
    counter += 1
    if 0 and counter % 5 != 0:
      return 'fscore', 0.0
    with time_me('calculated validation fscore', mode='print'):
      user_counts = defaultdict(lambda : dict(tpos=0, fpos=0, fneg=0))
      uids = valdat.uids
      labels = dval.get_label()
      for i, prob in enumerate(preds):
        uid = uids[i]
        pred = prob >= THRESH
        label = labels[i]
        if pred and label:
          user_counts[uid]['tpos'] += 1
        elif pred and not label:
          user_counts[uid]['fpos'] += 1
        elif label and not pred:
          user_counts[uid]['fneg'] += 1
      fscore_sum = 0
      for uid, res in user_counts.iteritems():
        numerator = 2 * res['tpos']
        denom = numerator + res['fpos'] + res['fneg']
        if denom == 0:
          fscore = 1
        else:
          fscore = numerator / denom
        fscore_sum += fscore
      return 'fscore', fscore_sum / len(user_counts)
    
  dval = valdat.as_dmatrix()
  # If you pass in more than one value to evals, early stopping uses the
  # last one. Because why not.
  watchlist = [(dtrain, 'train'), (dval, 'validation'),]
  #watchlist = [(dval, 'validation'),]

  xgb_params = hypers.xgb_params_from_hps(hps)
  evals_result = {}
  t0 = time.time()
  model = xgb.train(xgb_params, dtrain, hps.rounds, evals=watchlist, 
      early_stopping_rounds=hps.early_stopping_rounds, evals_result=evals_result) #, feval=quick_fscore, maximize=True)

  t1 = time.time()
  model_path = common.resolve_xgboostmodel_path(tag)
  model.save_model(model_path)
  preds = model.predict(dval)
  _, fscore = quick_fscore(preds, None)
  logging.info('Final validation (quick) fscore = {}'.format(fscore))
  resultsdict = dict(fscore=fscore, evals=evals_result, duration=t1-t0)
  res_path = os.path.join(common.XGBOOST_DIR, 'results', tag+'.pickle')
  with open(res_path, 'w') as f:
    pickle.dump(resultsdict, f)
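Since resultsdict pickles evals_result above, its shape is worth noting: xgb.train fills it as a dict keyed by the watchlist names ('train', 'validation' here), each mapping a metric name to the per-round values. A self-contained illustration with made-up numbers (the real metric key depends on eval_metric in xgb_params):

evals_result_demo = {
    'train':      {'logloss': [0.69, 0.61, 0.55]},
    'validation': {'logloss': [0.69, 0.63, 0.60]},
}
metric, curve = list(evals_result_demo['validation'].items())[0]
# min() is appropriate for loss metrics; use max() for auc-style metrics.
print('best validation {} = {:.3f}'.format(metric, min(curve)))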
Example #8
def main():
    baskets.time_me.set_default_mode('print')
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', nargs='+')
    parser.add_argument('-f', '--train-fold', default='train')
    parser.add_argument('--validation-fold',
                        help='Fold for validation (default: None)')
    parser.add_argument('--no-metafeats', action='store_true')
    parser.add_argument('--svm', action='store_true')
    args = parser.parse_args()

    with time_me("Loaded metavectors"):
        meta_df = pd.read_pickle(METAVECTORS_PICKLEPATH)

    with time_me("Made training vectors"):
        X, y = vectorize_fold(args.train_fold,
                              args.tags,
                              meta_df,
                              use_metafeats=not args.no_metafeats)

    # This sucks.
    if args.svm:
        # slooow :( (sklearn docs say hard to scale to dataset w more than like 20k examples)
        #model = sklearn.svm.SVC(verbose=True, probability=True, C=1.0)
        model = sklearn.svm.LinearSVC(
            penalty='l2',
            loss='hinge',
            C=.001,
            verbose=1,
        )
    else:
        # TODO: C
        model = LogisticRegression(verbose=1)
    with time_me('Trained model', mode='print'):
        model.fit(X, y)

    model_fname = 'model.pkl'
    joblib.dump(model, model_fname)
    return model
Example #9
def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', nargs='+')
    parser.add_argument(
        '--fold',
        default='test.tfrecords',
        help=
        'fold of users to compute probs for (should correspond to name of a vector file)'
    )
    args = parser.parse_args()

    for tag in args.tags:
        tf.logging.info('Computing probs for tag {}'.format(tag))
        with time_me('Computed probs for {}'.format(tag)):
            precompute_probs_for_tag(tag, args.fold)
Example #10
def main():
  tf.logging.set_verbosity(tf.logging.INFO)
  parser = argparse.ArgumentParser()
  parser.add_argument('tag')
  parser.add_argument('--recordfile', default='train.tfrecords', 
      help='tfrecords file with the users to train on (default: train.tfrecords)')
  parser.add_argument('-r', '--resume', metavar='TAG',
      help='Load existing checkpoint with the given tag name and resume training')
  parser.add_argument('--finetune', action='store_true')
  parser.add_argument('--logdir', default='logs')
  parser.add_argument('--toy', action='store_true')
  args = parser.parse_args()

  hps = hypers.hps_for_tag(args.tag, save_full=True)
  logdir = args.logdir
  if args.toy:
    hps.num_steps = 100
    hps.log_every = 50
    logdir = 'toylogs'
  tf.logging.info('Building model')
  train_dat = BasketDataset(hps, args.recordfile)
  model = RNNModel(hps, train_dat)

  eval_hps = hypers.as_eval(hps)
  eval_dat = BasketDataset(eval_hps)
  eval_model = RNNModel(eval_hps, eval_dat, reuse=True)
  # TODO: I think there's something fancy I could do to use a single Dataset
  # for both train and validation (reinitializable iterator?). But probably fine this way.

  sess = tf.InteractiveSession()

  if args.resume:
    tf.logging.info('Loading saved weights')
    utils.load_checkpoint_for_tag(args.resume, sess)
  else:
    sess.run(tf.global_variables_initializer())

  with time_me("Completed training"):
    train(sess, model, args.tag, eval_model, args.logdir)
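Regarding the TODO above about sharing one pipeline between train and validation: a rough sketch of the reinitializable-iterator idea under TF 1.x tf.data. The function name and arguments here are illustrative; BasketDataset wraps its own input pipeline, so this would not drop in directly:

import tensorflow as tf

def make_shared_iterator(train_ds, val_ds, sess):
    # One iterator whose structure matches both datasets; switching folds is
    # just a matter of running the corresponding initializer op.
    iterator = tf.data.Iterator.from_structure(train_ds.output_types,
                                               train_ds.output_shapes)
    next_batch = iterator.get_next()
    train_init = iterator.make_initializer(train_ds)
    val_init = iterator.make_initializer(val_ds)
    sess.run(train_init)   # start on the training fold
    return next_batch, train_init, val_init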
Example #11
def main():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', metavar='tag', nargs='+')
    parser.add_argument(
        '--fold',
        default='test',
        help='identifier for file with the users to test on (default: test)')
    args = parser.parse_args()

    for model_tag in args.tags:
        hps = hypers.hps_for_tag(model_tag)
        dataset = Dataset(args.fold, hps, mode=Mode.inference)
        path = common.resolve_xgboostmodel_path(model_tag)
        logging.info('Loading model with tag {}'.format(model_tag))
        model = xgb.Booster(model_file=path)
        logging.info('Computing probs for tag {}'.format(model_tag))
        with time_me('Computed probs for {}'.format(model_tag), mode='stderr'):
            pdict = get_pdict(model, dataset)
            logging.info('Got probs for {} users'.format(len(pdict)))
            # TODO: might want to enforce some namespace separation between
            # rnn-generated pdicts and ones coming from xgboost models?
            common.save_pdict_for_tag(model_tag, pdict, args.fold)
Example #12
  parser.add_argument('tags', nargs='+')
  parser.add_argument('--dest-tag', default='stacked', 
      help='Tag for generated pdict (default: "stacked")')
  parser.add_argument('--fold', default='test')
  args = parser.parse_args()

  metavec = load_metavectors(args.fold)

  #clf = train.load_model()
  clf = joblib.load('model.pkl')

  with time_me('Vectorized fold {}'.format(args.fold)):
    # TODO: this fn is not a thing?
    X, y = train.vectorize_fold(args.fold, args.tags, metavec)

  if hasattr(clf, 'predict_proba'):
    probs = clf.predict_proba(X)
    # returns an array of shape (n, 2), where each len-2 subarray
    # has the probability of the negative and positive classes. which is silly.
    probs = probs[:,1]
  else:
    scores = clf.decision_function(X)
    probs = expit(scores)

  pdict = pdictify(probs, metavec)
  common.save_pdict_for_tag(args.dest_tag, pdict, args.fold)

if __name__ == '__main__':
  with time_me(mode='print'):
    main()
Example #13
    # Hack because of silly reasons.
    if userfold == 'validation_full':
        userfold = 'validation'
    common.save_pdict_for_tag(tag, probmap, userfold)
    sess.close()
    tf.reset_default_graph()
    return probmap


def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', nargs='+')
    parser.add_argument(
        '--fold',
        default='test.tfrecords',
        help=
        'fold of users to compute probs for (should correspond to name of a vector file)'
    )
    args = parser.parse_args()

    for tag in args.tags:
        tf.logging.info('Computing probs for tag {}'.format(tag))
        with time_me('Computed probs for {}'.format(tag)):
            precompute_probs_for_tag(tag, args.fold)


if __name__ == '__main__':
    with time_me():
        main()
Example #14
      # Not clear whether this 'recovery' mode should be on by default. Might cause more problems than it solves.
      # Not every tag belongs to an rnn model.
      with time_me('Precomputed probabilities', mode='stderr'):
        pmap = precompute_probs.precompute_probs_for_tag(tag, args.fold)
    if args.tp:
      predictors['{}-tp'.format(tag)] = pred.ThresholdPredictor(pmap, args.thresh)
    if args.montecarlo:
      predictors['{}-mc'.format(tag)] = \
          pred.HybridThresholdPredictor(pmap, 
              ntrials=args.mc_trials, 
              save=args.save,
              optimization_level=(0 if args.quick else 10)
              )

  assert predictors

  user_iterator = iterate_wrapped_users(args.fold)
  judge = evaluator.Evaluator(user_iterator)
  # TODO: would be real nice to use tensorboard to look at dist. of
  # probabilities/logits/fscores/precisions stuff like that
  results = judge.evaluate(predictors, limit=args.n_users, save=args.save)

  for pname, res in results.iteritems():
    print '{}:'.format(pname)
    df = res.to_df()
    print df.mean()
  
if __name__ == '__main__':
  with time_me('Finished evaluation', mode='stderr'):
    main()
Example #15
nprods = constants.N_PRODUCTS
if BOOST:
    boost_coef = coef[0]
    print "Coef on boosted score: {:.3f}".format(boost_coef)
    offset = 1
else:
    offset = 0
uniprod_coefs = coef[offset:nprods + offset]
biprod_coefs = coef[nprods + offset:]

print "{:,} uni prod feats, {:,} bi prod feats".format(nprods,
                                                       len(biprod_coefs))

for coefs, name in [(uniprod_coefs, 'uniprod'), (biprod_coefs, 'biprod')]:
    nz = coefs[coefs != 0]
    print "{} {} features are non-zero".format(len(nz), name)
    print "Dist of nonzero {} features...".format(name)
    s = pd.Series(nz)
    print s.describe()

with time_me("Loaded feature lookup", mode='print'):
    featmap = count_pairs.load_pair_lookup()

pair_lookup = {v: k for (k, v) in featmap.iteritems()}

prod_df = utils.load_product_df()

poke_uniprods(uniprod_coefs, prod_df)

poke_biprods(biprod_coefs, pair_lookup, prod_df, 10)
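The slicing above assumes the coefficient vector is laid out as [optional boost coef, nprods uni-product coefs, then all bi-product coefs]. A tiny illustration with made-up numbers and nprods = 3:

coef = [1.7,             # boost coef (only present when BOOST is on)
        0.1, 0.0, -0.3,  # uni-product coefs
        0.5, 0.2]        # bi-product coefs
offset = 1
uniprod_coefs = coef[offset:3 + offset]   # [0.1, 0.0, -0.3]
biprod_coefs = coef[3 + offset:]          # [0.5, 0.2]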
Example #16
    t0 = time.time()

    user_iterator = user_wrapper.iterate_wrapped_users(args.recordfile,
                                                       ktest=True)
    outname = 'submission_{}.csv'.format(args.save_tag)
    f = open(outname, 'w')
    writer = csv.DictWriter(f, fieldnames=['order_id', 'products'])
    writer.writeheader()

    def predcol(pids):
        stringify = lambda pid: 'None' if pid == -1 else str(pid)
        return ' '.join(map(stringify, pids))

    pmap = common.pdict_for_tag(args.tag, args.recordfile)
    predictor = pred.HybridThresholdPredictor(pmap, ntrials=args.mc_trials)
    for i, user in enumerate(user_iterator):
        predicted = predictor.predict_last_order(user)
        oid = user.user.testorder.orderid
        row = {'order_id': oid, 'products': predcol(predicted)}
        writer.writerow(row)
        if args.n_users and i >= args.n_users:
            break
    t1 = time.time()
    print "Finished predictions in {:.1f}s".format(t1 - t0)


if __name__ == '__main__':
    with time_me(mode="print"):
        main()
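For reference, predcol above turns a list of product ids into the space-separated submission cell, with -1 standing in for the literal 'None' token (no reordered products). A quick self-contained illustration with made-up ids:

def predcol(pids):
    # -1 is the sentinel for "no reordered products" -> the token 'None'
    stringify = lambda pid: 'None' if pid == -1 else str(pid)
    return ' '.join(map(stringify, pids))

assert predcol([24852, 13176]) == '24852 13176'
assert predcol([-1]) == 'None'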