# Boolean switches: --aol_data selects the AOL query loader below,
# --count_sketch selects the 'count_sketch' results folder instead of
# 'count_min'.
argparser.add_argument("--aol_data", action='store_true', default=False)
argparser.add_argument("--count_sketch", action='store_true', default=False)
args = argparser.parse_args()

# Log header: the invoking command line followed by the output of git_log().
command = ' '.join(sys.argv) + '\n'
log_str = command
log_str += git_log() + '\n'
print(log_str)

# Seed NumPy's global RNG from args.seed.
np.random.seed(args.seed)

if args.aol_data:
    # The AOL loader is given a single entry (args.data[0]), so reject any
    # other count up front. argparser.error() is used instead of `assert`
    # because asserts are stripped when Python runs with -O.
    if len(args.data) != 1:
        argparser.error(
            '--aol_data requires exactly one data entry, got %d'
            % len(args.data))
    x, y = get_data_aol_query(args.data[0])
else:
    x, y = get_data_str_with_ports_list(args.data)
get_stat(args.data, x, y)

# Results go under param_results/<name>/; the trailing '' makes os.path.join
# end the path with a separator.
name = 'count_sketch' if args.count_sketch else 'count_min'
folder = os.path.join('param_results', name, '')
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(folder, exist_ok=True)

# Parameter grid: one entry per (n_hash, space) combination.
nb_all = []
nh_all = []
for n_hash in args.n_hashes_list:
    for space in args.space_list:
        # NOTE(review): presumably `space` is in MB and each bucket takes
        # 4 bytes, split evenly across n_hash rows - confirm.
        n_buckets = int(space * 1e6 / (n_hash * 4))
        nh_all.append(n_hash)
# Load the train / validation / test splits. Train and validation come from
# get_data; the test split comes from get_data_list, whose results are
# indexed per entry below.
train_x, train_y = get_data(args.train, feat_idx, args.n_examples)
print('train x shape:', train_x.shape,
      'y max', np.max(train_y), 'y min', np.min(train_y))
valid_x, valid_y = get_data(args.valid, feat_idx, args.n_examples)
print('valid x shape:', valid_x.shape,
      'y max', np.max(valid_y), 'y min', np.min(valid_y))
test_x, test_y = get_data_list(args.test, feat_idx, args.n_examples)
print('Load data time %.1f seconds' % (time.time() - start_t,))

# Outside of evaluation mode only a single test entry is allowed.
assert args.evaluate or len(test_x) == 1, \
    'test on more than 1 minute (forgot --evaluate?)'

# Move every target into log space; test targets are converted entry by entry.
data_stat = get_stat('train before log', train_x, train_y)
train_y, valid_y = np.log(train_y), np.log(valid_y)
for i, _test_entry in enumerate(test_y):
    test_y[i] = np.log(_test_entry)

# Floor for the (log-space) regression targets.
rmin = np.log(args.regress_min)
data_stat += get_stat('train before rmin', train_x, train_y)
_train_below = train_y < rmin
s = 'rmin %.2f, # train_y < min %.2f\n\n' % (rmin, np.sum(_train_below))
data_stat += s
print(s)

# Clamp every target below the floor up to the floor, in place.
train_y[_train_below] = rmin
valid_y[valid_y < rmin] = rmin
for _test_entry in test_y:
    _test_entry[_test_entry < rmin] = rmin
# Script fragment; the original line breaks and indentation were lost, and the
# `if` that owns the first assignment / `else:` starts before this view.
#
# What the visible statements do, in order:
#   * Pick the results folder name: 'lookup_table_<sketch_type>' in the branch
#     before the `else`, 'cutoff_<sketch_type>_param' otherwise, and create
#     'param_results/<name>/' when it does not exist yet.
#   * Start a timer, then load the validation and test data with
#     get_data_aol_query_list when args.aol_data is set, or with
#     get_data_str_with_ports_list otherwise.
#   * Append get_stat(...) output for the validation and test data to log_str;
#     the label passed to get_stat lists the input entries one per line.
#   * When args.lookup_data is given, load it with the same loader choice into
#     x_train / y_train and append its get_stat(...) output to log_str as well.
#   * Print the elapsed data loading time.
#   * When args.valid_results is given, call order_y_wkey_list on y_valid with
#     key 'valid_output' to obtain y_valid_ordered and valid_scores
#     (presumably the labels re-ordered to match saved model outputs - confirm
#     against order_y_wkey_list).
#   * The trailing `if args.test_results:` continues past this view.
#
# NOTE(review): exists() followed by makedirs() is a check-then-create race;
# os.makedirs(folder, exist_ok=True) would avoid it on Python 3.
name = 'lookup_table_%s' % sketch_type else: name = 'cutoff_%s_param' % sketch_type folder = os.path.join('param_results', name, '') if not os.path.exists(folder): os.makedirs(folder) start_t = time.time() if args.aol_data: x_valid, y_valid = get_data_aol_query_list(args.valid_data) x_test, y_test = get_data_aol_query_list(args.test_data) else: x_valid, y_valid = get_data_str_with_ports_list(args.valid_data) x_test, y_test = get_data_str_with_ports_list(args.test_data) log_str += get_stat('valid data:\n'+'\n'.join(args.valid_data), x_valid, y_valid) log_str += get_stat('test data:\n'+'\n'.join(args.test_data), x_test, y_test) if args.lookup_data: if args.aol_data: x_train, y_train = get_data_aol_query_list(args.lookup_data) else: x_train, y_train = get_data_str_with_ports_list(args.lookup_data) log_str += get_stat('lookup data:\n'+'\n'.join(args.lookup_data), x_train, y_train) print('data loading time: %.1f sec' % (time.time() - start_t)) if args.valid_results: key = 'valid_output' y_valid_ordered, valid_scores = order_y_wkey_list(y_valid, args.valid_results, key) if args.test_results: