Example no. 1
def main():
    # TODO: would be kind of nice to have an optional tag to be able to juggle
    # between sets of vectors corresponding to different user folds or sets
    # of features. (Though don't want too many lying around. Vectorizing just
    # 1% of users can already produce files as big as a GB.)
    parser = argparse.ArgumentParser()
    parser.add_argument('user_fold')
    parser.add_argument(
        '--tag',
        action='store_true',
        help='Whether to tag the generated vectors by the user fold used')
    parser.add_argument('--lim',
                        type=int,
                        help='Limit number of users vectorized')
    args = parser.parse_args()

    #featspec = FeatureSpec.all_features_spec()
    featspec = FeatureSpec.basic_spec()
    #featspec.add_feature(features.PrevOrderPids)
    users = iterate_wrapped_users(args.user_fold)
    if args.tag:
        affix = '_' + args.user_fold
    else:
        affix = ''
    victor = Vectorizer(featspec, affix)
    n = victor.vectorize_users(users, limit=args.lim)
    print 'Vectorized {} users from fold {}'.format(n, args.user_fold)
Example no. 2
def main():
  baskets.time_me.set_default_mode('print')
  logging.basicConfig(level=logging.INFO)
  parser = argparse.ArgumentParser()
  parser.add_argument('folds', nargs='+')
  parser.add_argument('-b', '--boost', help='Tag of model to boost from')
  parser.add_argument('--testmode', action='store_true')
  parser.add_argument('--uniprods', action='store_true')
  parser.add_argument('--tag', default='pairs')
  args = parser.parse_args()
  assert args.uniprods  # NOTE: the --uniprods flag is effectively required here

  with time_me('Loaded pair lookup'):
    lookup = count_pairs.load_pair_lookup()
  for fold in args.folds:
    users = iterate_wrapped_users(fold, ktest=args.testmode)
    if args.boost:
      logits = user_wrapper.logits_for_tag(args.boost, fold)
    else:
      logits = None
    with time_me('Vectorized'):
      X, y = vectorize(users, lookup, logits, args.uniprods)
    logging.info('Loaded X with shape {} and y with shape {}'.format(
      X.shape, y.shape))
    logging.info('Mean # of non-zero features per instance = {:.1f}'.format(
      X.sum(axis=1).mean()
      ))
    save_fold(fold, X, y, args.tag)
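The "Mean # of non-zero features per instance" message above reports a row sum, which matches a count of non-zero features only if X is a binary indicator matrix (likely the case here, but an assumption). A minimal standalone sketch, not part of the original pipeline, showing the two computations on a small scipy.sparse matrix:

import numpy as np
import scipy.sparse

# Row sums equal per-row non-zero counts only for 0/1 data; getnnz counts the
# stored entries directly and works for arbitrary feature values.
X = scipy.sparse.csr_matrix(np.array([[1, 0, 1],
                                      [0, 1, 0]]))
print X.sum(axis=1).mean()     # 1.5 -- row-sum version used in the example above
print X.getnnz(axis=1).mean()  # 1.5 -- explicit per-row non-zero count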
Example no. 3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('user_records_file')
    parser.add_argument(
        '-n',
        '--n-users',
        type=int,
        help='limit on number of users vectorized (default: none)')
    parser.add_argument(
        '--max-prods',
        type=int,
        default=None,
        help='Max number of products to take per user (default: no limit)')
    parser.add_argument('--testmode', action='store_true')
    parser.add_argument('--outname',
                        help='Identifier used to name the generated npy file. '
                        'Default name is based on user_records_file')
    args = parser.parse_args()
    random.seed(1337)

    if args.testmode:
        assert args.max_prods is None

    prod_lookup = load_product_lookup()
    user_iter = iterate_wrapped_users(args.user_records_file,
                                      ktest=args.testmode)
    vecs = accumulate_user_vectors(user_iter, args.max_prods, prod_lookup,
                                   args.n_users, args.testmode)
    output_tag = args.outname or args.user_records_file
    outpath = common.resolve_scalarvector_path(output_tag)
    np.save(outpath, vecs)
Example no. 4
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('tags', metavar='tag', nargs='+')
  parser.add_argument('-t', '--thresh', default=.2, help='Probability threshold '+
      'for taking a product when using threshold predictor (default=.2)', type=float)
  parser.add_argument('--fold', default='test.tfrecords', 
      help='tfrecords file with the users to test on (default: test.tfrecords)')
  parser.add_argument('--mc-trials', type=int, default=50,
      help='Number of rounds of monte carlo sim to perform per product/threshold (default:50)')
  parser.add_argument('-n', '--n-users', type=int, 
      help='Limit number of users tested on (default: no limit)')
  parser.add_argument('--baseline', action='store_true', 
      help='Run a dumb baseline predict-previous-order predictor for comparison')
  parser.add_argument('--tp', action='store_true', 
      help='Run a basic thresholded predictor for each tag using --thresh threshold')
  parser.add_argument('--mc', action='store_true', dest='montecarlo', default=True,
      help='Run a monte-carlo thresh predictor per tag')
  parser.add_argument('--no-mc', action='store_false', dest='montecarlo',
      help='Don\'t run a monte-carlo thresh predictor per tag')
  parser.add_argument('--save', action='store_true', help='Serialize predictions to a file')
  parser.add_argument('--quick', action='store_true', help='Cut some corners')
  args = parser.parse_args()
  
  predictors = {}
  if args.baseline:
    predictors['baseline'] = pred.PreviousOrderPredictor()

  for tag in args.tags:
    try:
      pmap = common.pdict_for_tag(tag, args.fold)
    except common.NoPdictException as err:
      raise  # NOTE: this bare raise makes the recovery path below unreachable
      logging.warning(err.message + "\nPrecomputing and saving probabilities")
      # Not clear whether this 'recovery' mode should be on by default. Might cause more problems than it solves.
      # Not every tag belongs to an rnn model.
      with time_me('Precomputed probabilities', mode='stderr'):
        pmap = precompute_probs.precompute_probs_for_tag(tag, args.fold)
    if args.tp:
      predictors['{}-tp'.format(tag)] = pred.ThresholdPredictor(pmap, args.thresh)
    if args.montecarlo:
      predictors['{}-mc'.format(tag)] = \
          pred.HybridThresholdPredictor(pmap, 
              ntrials=args.mc_trials, 
              save=args.save,
              optimization_level=(0 if args.quick else 10)
              )

  assert predictors

  user_iterator = iterate_wrapped_users(args.fold)
  judge = evaluator.Evaluator(user_iterator)
  # TODO: would be real nice to use tensorboard to look at dist. of
  # probabilities/logits/fscores/precisions stuff like that
  results = judge.evaluate(predictors, limit=args.n_users, save=args.save)

  for pname, res in results.iteritems():
    print '{}:'.format(pname)
    df = res.to_df()
    print df.mean()
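The --thresh flag above is what pred.ThresholdPredictor consumes. As a rough illustration only (the real class may differ), thresholded prediction amounts to keeping every product whose predicted reorder probability clears the threshold, with -1 (rendered as the string 'None' in the submission example further below) as the fallback when nothing does. The dict structure assumed for `probs` here is an assumption, not the project's actual API:

# Illustrative sketch, not the project's pred.ThresholdPredictor: `probs` is
# assumed to map product id -> predicted reorder probability for one user.
def threshold_predict(probs, thresh=.2):
    picked = [pid for pid, p in probs.iteritems() if p >= thresh]
    return picked if picked else [-1]  # -1 == 'no reordered products'

print threshold_predict({24852: .7, 13176: .15, 21137: .4})
# -> [24852, 21137] (order depends on dict iteration; ids are made up)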
Example no. 5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('folds', nargs='+')
    args = parser.parse_args()

    users = {fold: iterate_wrapped_users(fold) for fold in args.folds}
    df = vectorize(users)
    df.to_pickle(METAVECTORS_PICKLEPATH)
    print 'Wrote df with {} metavectors'.format(len(df))
Example no. 6
def get_pair_lookup(fold, threshold=1, force=False):
    """
  Return a dictionary of form
    (focal, other) -> idx
  idx is the index of the feature representing 'product other came after the last
  occurrence of product focal'. (idx is 'relative' to the set of product pair features,
  and might not correspond to the same 'global' feature index in the ultimate train matrix

  The only pairs of products that make it into the returned dicts are ones that occur
  at least threshold times.
  """
    try:
        assert not force  # this is some hacky control flow right here
        M = load_cooc_matrix(fold)
        logging.info('Loaded cooc matrix')
    except (AssertionError, IOError):
        logging.info('Building cooc matrix')
        users = iterate_wrapped_users(fold)
        with time_me('Built cooc matrix', mode='print'):
            M = build_cooc_matrix(users)
            M = M.tocoo()
        # converting to coo at this point might provide some memory relief
        save_cooc_matrix(fold, M)

    logging.info('Loaded cooc matrix with {:,} nonzero entries'.format(M.nnz))
    next_ix = 0
    lookup = {}
    # Turns out using scipy.sparse.find is slow af
    #with time_me('finded'):
    #  focals, others, counts = scipy.sparse.find(M)
    assert isinstance(M, scipy.sparse.coo_matrix)
    focals, others, counts = M.row, M.col, M.data
    #for i in xrange(len(counts)):
    #  focal_pidx = focals[i]
    #  other_pidx = others[i]
    #  count = counts[i]
    for (focal_pidx, other_pidx,
         count) in itertools.izip(focals, others, counts):

        if count >= threshold:
            k = (focal_pidx + 1, other_pidx + 1)
            lookup[k] = next_ix
            next_ix += 1
            if next_ix % 10**6 == 0:
                print '{:,}... '.format(next_ix)

    logging.info(
        'Went from {:,} non-zero entries to {:,} pairs after thresholding by {}'
        .format(M.nnz, next_ix, threshold))
    if next_ix + constants.N_PRODUCTS > 2**31:
        logging.warn(
            "Too many features even after thresholding. You're gonna have a bad time."
        )
    return lookup
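A toy illustration of the lookup this function builds, using synthetic counts: the +1 shift converts 0-based matrix indices back to 1-based product ids, and pairs below the threshold are dropped.

import scipy.sparse

# Three co-occurrence counts; with threshold=2 the middle (count 1) pair is dropped.
M = scipy.sparse.coo_matrix(([3, 1, 2], ([0, 0, 2], [4, 7, 5])))
threshold = 2
lookup, next_ix = {}, 0
for focal_pidx, other_pidx, count in zip(M.row, M.col, M.data):
    if count >= threshold:
        lookup[(focal_pidx + 1, other_pidx + 1)] = next_ix
        next_ix += 1
print lookup  # {(1, 5): 0, (3, 6): 1}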
Example no. 7
def main():
    parser = argparse.ArgumentParser(
        description="Generate a Kaggle submission file")
    parser.add_argument('tag')
    parser.add_argument(
        '--recordfile',
        default='ktest.tfrecords',
        help=
        'tfrecords file with the users to make predictions on (default: ktest.tfrecords)'
    )
    parser.add_argument(
        '--mc-trials',
        type=int,
        default=50,
        help=
        'Number of rounds of monte carlo sim to perform per product/threshold (default:50)'
    )
    parser.add_argument(
        '-n',
        '--n-users',
        type=int,
        help='Limit number of users tested on (default: no limit)')
    parser.add_argument('--quick', action='store_true')  # TODO: hook me up
    parser.add_argument('--save-tag', default='')
    args = parser.parse_args()

    t0 = time.time()

    user_iterator = user_wrapper.iterate_wrapped_users(args.recordfile,
                                                       ktest=True)
    outname = 'submission_{}.csv'.format(args.save_tag)
    f = open(outname, 'w')
    writer = csv.DictWriter(f, fieldnames=['order_id', 'products'])
    writer.writeheader()

    def predcol(pids):
        stringify = lambda pid: 'None' if pid == -1 else str(pid)
        return ' '.join(map(stringify, pids))

    pmap = common.pdict_for_tag(args.tag, args.recordfile)
    predictor = pred.HybridThresholdPredictor(pmap, ntrials=args.mc_trials)
    for i, user in enumerate(user_iterator):
        predicted = predictor.predict_last_order(user)
        oid = user.user.testorder.orderid
        row = {'order_id': oid, 'products': predcol(predicted)}
        writer.writerow(row)
        if args.n_users and i + 1 >= args.n_users:  # i is 0-based; stop once n_users rows are written
            break
    t1 = time.time()
    print "Finished predictions in {:.1f}s".format(t1 - t0)
Example no. 8
def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('user_records_file')
    parser.add_argument('--out',
                        help='Name to use for saved tfrecords file. '
                        'Defaults to a name based on input tfrecords file.')
    parser.add_argument(
        '--test-mode',
        action='store_true',
        help=
        'Include final "testorder" in sequences, and only vectorize test users.'
    )
    parser.add_argument(
        '-n',
        '--n-users',
        type=int,
        help='limit on number of users vectorized (default: none)')
    parser.add_argument(
        '--max-prods',
        type=int,
        default=None,
        help='Max number of products to take per user (default: no limit)')
    args = parser.parse_args()
    # (For reproducibility when sampling pids)
    random.seed(1337)

    if args.test_mode:
        assert not args.max_prods

    outpath = common.resolve_vector_recordpath(args.out
                                               or args.user_records_file)
    tf.logging.info("Writing vectors to {}".format(outpath))
    writer_options = tf.python_io.TFRecordOptions(
        compression_type=common.VECTOR_COMPRESSION_TYPE)
    writer = tf.python_io.TFRecordWriter(outpath, options=writer_options)
    prod_lookup = load_product_lookup()
    i = 0
    nseqs = 0
    for user in iterate_wrapped_users(args.user_records_file,
                                      ktest=args.test_mode):
        nseqs += write_user_vectors(user, writer, prod_lookup, args.test_mode,
                                    args.max_prods)
        i += 1
        if args.n_users and i >= args.n_users:
            break
        if (i % 10000) == 0:
            print "i={}... ".format(i)
    print "Wrote {} sequences for {} users".format(nseqs, i)
Example no. 9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('tag')
    parser.add_argument('--recordfile',
                        default='test',
                        help='identifier for user stratum')
    args = parser.parse_args()

    product_lookup = utils.load_product_df()

    pmap = common.pdict_for_tag(args.tag, args.recordfile)
    user_iterator = iterate_wrapped_users(args.recordfile)
    predictor = pred.HybridThresholdPredictor(pmap, optimization_level=0)

    skip = 10
    for i, user in enumerate(user_iterator):
        if i < skip:
            continue
        foo(user, predictor, product_lookup)