argparser.add_argument("--aol_data", action='store_true', default=False)
    argparser.add_argument("--count_sketch", action='store_true', default=False)
    args = argparser.parse_args()

    # Start the run log with the exact command line followed by whatever
    # git_log() returns, then echo it to stdout.
    command = ' '.join(sys.argv) + '\n'
    log_str = command
    log_str += git_log() + '\n'
    print(log_str)
    # Seed NumPy's global RNG from args.seed.
    np.random.seed(args.seed)

    # Choose the loader by dataset type. The AOL loader is handed a single
    # path (args.data[0]), so exactly one data entry is required in that mode;
    # the other loader receives the whole args.data list.
    if args.aol_data:
        assert len(args.data) == 1
        x, y = get_data_aol_query(args.data[0])
    else:
        x, y = get_data_str_with_ports_list(args.data)
    # NOTE(review): get_stat's return value is discarded here -- confirm that
    # it reports the statistics itself (e.g. prints them).
    get_stat(args.data, x, y)

    # Results are grouped by sketch variant under param_results/<name>/.
    name = 'count_sketch' if args.count_sketch else 'count_min'
    # The trailing '' makes os.path.join end the path with a separator.
    folder = os.path.join('param_results', name, '')
    # exist_ok=True replaces the exists()-then-makedirs() check, which could
    # raise if the directory was created between the two calls (for example
    # by another run started at the same time).
    os.makedirs(folder, exist_ok=True)

    nb_all = []
    nh_all = []
    for n_hash in args.n_hashes_list:
        for space in args.space_list:
            n_buckets = int(space * 1e6 / (n_hash * 4))
            nh_all.append(n_hash)
Exemple #2
0
    # Load the training and validation sets and print their shapes together
    # with the label range.
    train_x, train_y = get_data(args.train, feat_idx, args.n_examples)
    print('train x shape:', train_x.shape, 'y max', np.max(train_y), 'y min',
          np.min(train_y))

    valid_x, valid_y = get_data(args.valid, feat_idx, args.n_examples)
    print('valid x shape:', valid_x.shape, 'y max', np.max(valid_y), 'y min',
          np.min(valid_y))

    # Test data comes from get_data_list; the code below indexes test_y entry
    # by entry, so it is treated as a sequence of label arrays.
    test_x, test_y = get_data_list(args.test, feat_idx, args.n_examples)
    print('Load data time %.1f seconds' % (time.time() - start_t))
    # Unless args.evaluate is set, exactly one test entry is expected.
    if not args.evaluate:
        assert len(
            test_x) == 1, 'test on more than 1 minute (forgot --evaluate?)'

    # Capture statistics on the raw training labels, then move every label
    # set (train, valid, each test entry) to natural-log space.
    data_stat = get_stat('train before log', train_x, train_y)
    train_y = np.log(train_y)
    valid_y = np.log(valid_y)
    for i in range(len(test_y)):
        test_y[i] = np.log(test_y[i])
    # Lower bound for the labels, expressed in the same log space.
    rmin = np.log(args.regress_min)

    # Statistics after the log transform but before clamping, plus the number
    # of training labels that fall below rmin.
    data_stat += get_stat('train before rmin', train_x, train_y)
    s = 'rmin %.2f, # train_y < min %.2f\n\n' % (rmin, np.sum(train_y < rmin))
    data_stat += s
    print(s)

    # Clamp every label set from below at rmin (in place).
    train_y[train_y < rmin] = rmin
    valid_y[valid_y < rmin] = rmin
    for i in range(len(test_y)):
        test_y[i][test_y[i] < rmin] = rmin
        name = 'lookup_table_%s' % sketch_type
    else:
        name = 'cutoff_%s_param' % sketch_type

    # Output directory is param_results/<name>/; the trailing '' makes
    # os.path.join end the path with a separator.
    folder = os.path.join('param_results', name, '')
    # exist_ok=True replaces the exists()-then-makedirs() check, which could
    # raise if the directory was created between the two calls (for example
    # by another run started at the same time).
    os.makedirs(folder, exist_ok=True)

    # Time the data loading that follows.
    start_t = time.time()
    # Load the validation and test sets with the loader that matches the
    # dataset type.
    if args.aol_data:
        x_valid, y_valid = get_data_aol_query_list(args.valid_data)
        x_test, y_test = get_data_aol_query_list(args.test_data)
    else:
        x_valid, y_valid = get_data_str_with_ports_list(args.valid_data)
        x_test, y_test = get_data_str_with_ports_list(args.test_data)
    # Append get_stat()'s output for each split to the run log, labelled with
    # the source paths (one per line).
    log_str += get_stat('valid data:\n'+'\n'.join(args.valid_data), x_valid, y_valid)
    log_str += get_stat('test data:\n'+'\n'.join(args.test_data), x_test, y_test)

    # Optional lookup data, loaded into x_train / y_train with the same
    # loader choice. These names are bound only when args.lookup_data is set.
    if args.lookup_data:
        if args.aol_data:
            x_train, y_train = get_data_aol_query_list(args.lookup_data)
        else:
            x_train, y_train = get_data_str_with_ports_list(args.lookup_data)
        log_str += get_stat('lookup data:\n'+'\n'.join(args.lookup_data), x_train, y_train)
    print('data loading time: %.1f sec' % (time.time() - start_t))

    # When validation results are supplied, pass y_valid and those results to
    # order_y_wkey_list with the 'valid_output' key.
    # NOTE(review): y_valid_ordered / valid_scores are bound only inside this
    # branch -- confirm later code guards on args.valid_results before use.
    if args.valid_results:
        key = 'valid_output'
        y_valid_ordered, valid_scores = order_y_wkey_list(y_valid, args.valid_results, key)

    if args.test_results: