def solve_opga(directory, name, depot, loc, prize, max_length, disable_cache=False):
    """Solve one instance with ``run_opga_alg``, caching the outcome on disk.

    The result is pickled to ``<directory>/<name>.opga.pkl`` and reloaded on
    later calls unless ``disable_cache`` is true.

    Args:
        directory: Directory holding the cache file.
        name: Instance name used to build the cache file name.
        depot: Depot coordinates (prepended twice to ``loc``).
        loc: List of node coordinates.
        prize: List of node prizes, in the same order as ``loc``.
        max_length: Length budget forwarded to the solver.
        disable_cache: When true, ignore an existing cache file and re-solve.

    Returns:
        ``(-prize, tour, duration)`` where ``prize`` is the value reported by
        the solver (or the cache), ``tour`` lists the visited node indices
        shifted down by one, and ``duration`` is the wall-clock solve time.
    """
    cache_path = os.path.join(directory, "{}.opga.pkl".format(name))
    cache_hit = os.path.isfile(cache_path) and not disable_cache

    if cache_hit:
        prize, tour, duration = load_dataset(cache_path)
    else:
        started = time.time()
        # The depot appears twice: index 0 is the start node, index 1 the end node.
        node_prizes = [0, 0] + prize
        node_coords = [depot, depot] + loc
        nodes = [(*xy, value) for value, xy in zip(node_prizes, node_coords)]
        prize, tour, duration = run_opga_alg(
            nodes, max_length, return_sol=True, verbose=False)
        # Overwrite the solver-reported duration with measured wall-clock time.
        duration = time.time() - started
        save_dataset((prize, tour, duration), cache_path)

    # The fourth field of every tour entry is the node index; the tour has to
    # start at depot copy 0 and finish at depot copy 1.
    assert tour[0][3] == 0
    assert tour[-1][3] == 1

    # Customers are numbered from 2 because of the duplicated depot; shift by
    # one so that the depot is 0 and the first customer is 1.
    visited = [idx - 1 for _x, _y, _p, idx, _t in tour[1:-1]]
    return -prize, visited, duration
def main():
    """Add fastest-route and weather features to the scope-5 data and save it as scope 6.

    Reads the three ``fastest_routes_*.csv`` files, left-joins them onto the
    train/test frames by ``id``, lets ``generate_street_heavy`` process both
    frames, drops the raw per-step route columns and finally calls
    ``add_weather_features``.
    """
    raw_route_columns = [
        'starting_street', 'end_street', 'street_for_each_step',
        'distance_per_step', 'travel_time_per_step', 'step_maneuvers',
        'step_direction', 'step_location_list'
    ]

    train, test = data_utils.load_dataset(op_scope='5')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    print('add fastest routes features...')
    train_part_1 = pd.read_csv('../input/fastest_routes_train_part_1.csv')
    train_part_2 = pd.read_csv('../input/fastest_routes_train_part_2.csv')
    test_routes = pd.read_csv('../input/fastest_routes_test.csv')
    train_routes = pd.concat((train_part_1, train_part_2))

    train = train.merge(train_routes, how='left', on='id')
    test = test.merge(test_routes, how='left', on='id')

    generate_street_heavy(train, test)

    # The raw route description columns are removed in place from both frames.
    for frame in (train, test):
        frame.drop(list(raw_route_columns), axis=1, inplace=True)

    print('add weather features...')
    train, test = add_weather_features(train, test)

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='6')
def main():
    """Build the scope-1 dataset (PCA, datetime, distance and location-bin features).

    Does nothing when the scope-1 training file already exists. Otherwise the
    scope-0 data is loaded, the target column is set aside, the feature
    generators are run, the target is re-attached and the result is saved.
    """
    if os.path.exists(Configure.processed_train_path.format('1')):
        return

    train, test = data_utils.load_dataset(op_scope='0')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    # Set the target aside so train and test share the same columns.
    trip_durations = train.pop('trip_duration')
    n_train = train.shape[0]
    conbined_data = pd.concat([train, test])

    print('generate geography pca features...')
    generate_pca_features(conbined_data)

    print('generate datetime features...')
    generate_date_features(conbined_data)

    # Split the stacked frame back into its train and test rows.
    train = conbined_data.iloc[:n_train]
    test = conbined_data.iloc[n_train:]

    print('generate distance features...')
    generate_distance_features(train, test, loc1='latitude', loc2='longitude',
                               fea_name='lat_long_')

    print('generate pca distance features...')
    generate_distance_features(train, test, loc1='pca0', loc2='pca1',
                               fea_name='pca_')

    print('generate location bin features...')
    generate_location_bin_features(train, test, loc1='latitude',
                                   loc2='longitude', fea_name='lat_long_',
                                   round_num=2)

    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='1')
def main():
    """Build the scope-2 dataset by down-casting the training columns to compact dtypes.

    Does nothing when the scope-2 training file already exists. The memory
    footprint of the training frame is printed before and after the casts.
    Only the training frame is modified; the test frame is saved unchanged.
    """
    if os.path.exists(Configure.processed_train_path.format('2')):
        return

    train, test = data_utils.load_dataset(op_scope='1')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    print('data clean according to lat_long_distance_haversine & trip_duration...')
    # Outlier filtering is intentionally disabled:
    # train = train[train['lat_long_distance_haversine'] < 300]
    # train = train[train['trip_duration'] <= 1800000].reset_index(drop=True)  # leads to overfitting
    print('train: {}, test: {}'.format(train.shape, test.shape))

    # Divide by a float so the megabyte figure is not truncated by integer
    # division on interpreters where `/` floors integer operands.
    bytes_per_mb = 2.0 ** 20
    print('Memory usage, Mb: {:.2f}'.format(
        train.memory_usage().sum() / bytes_per_mb))

    print('optimize dtypes...')
    uint8_columns = [
        'is_store_and_fwd_flag', 'passenger_count', 'vendor_id',
        'pickup_month', 'pickup_day', 'pickup_hour', 'pickup_weekofyear',
        'pickup_weekday', 'is_weekend'
    ]
    for column in uint8_columns:
        train[column] = train[column].astype(np.uint8)
    train['trip_duration'] = train['trip_duration'].astype(np.uint32)

    print('After optimized memory usage, Mb: {:.2f}'.format(
        train.memory_usage().sum() / bytes_per_mb))

    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='2')
def main():
    """Build the scope-3 dataset: location clustering plus group-by speed features.

    Does nothing when the scope-3 training file already exists.
    """
    if os.path.exists(Configure.processed_train_path.format('3')):
        return

    train, test = data_utils.load_dataset(op_scope='2')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    # Set the target aside so train and test can be stacked for clustering.
    trip_durations = train.pop('trip_duration')
    n_train = train.shape[0]
    conbined_data = pd.concat([train, test])

    n_clusters = 10**2
    print('location clustering n_clusters = {}...'.format(n_clusters))
    location_clustering(conbined_data, n_clusters=n_clusters,
                        batch_size=64**3, random_state=1000)

    train = conbined_data.iloc[:n_train]
    test = conbined_data.iloc[n_train:]
    # The target is restored before the speed features are generated.
    train['trip_duration'] = trip_durations

    print('generate lat_long groupby speed features...')
    train, test = generate_groupby_speed_features(
        train, test, n_clusters, loc1='latitude', loc2='longitude',
        fea_name='lat_long_')

    # Disabled variant working on the PCA coordinates:
    # print 'generate pca groupby speed features...'
    # train, test = generate_groupby_speed_features(train, test, n_clusters, loc1='pca0', loc2='pca1', fea_name='pca_')

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='3')
def solve_ortools(directory, name, depot, loc, penalty, deterministic_prize,
                  stochastic_prize, sec_local_search=0, disable_cache=False):
    """Solve a PCTSP instance with OR-Tools, caching the result on disk.

    Args:
        directory: Directory holding the cache file.
        name: Instance name used to build the cache file name.
        depot: Depot coordinates.
        loc: Node coordinates.
        penalty: Per-node penalties.
        deterministic_prize: Per-node prizes used by the solver.
        stochastic_prize: Unused by this solver.
        sec_local_search: Forwarded to the solver; also part of the cache name.
        disable_cache: When true, ignore an existing cache file and re-solve.

    Returns:
        ``(total_cost, tour, duration)`` with the leading depot stripped from
        ``tour``, or ``None`` when anything fails (the error is printed).
    """
    # Imported lazily so ortools is only required when this solver is used.
    from .pctsp_ortools import solve_pctsp_ortools

    try:
        cache_path = os.path.join(
            directory, "{}.ortools{}.pkl".format(name, sec_local_search))

        if not disable_cache and os.path.isfile(cache_path):
            objval, tour, duration = load_dataset(cache_path)
        else:
            started = time.time()
            # The solver must collect the whole prize sum when it is below 1, else 1.
            min_prize = min(sum(deterministic_prize), 1.)
            objval, tour = solve_pctsp_ortools(
                depot, loc, deterministic_prize, penalty, min_prize,
                sec_local_search=sec_local_search)
            duration = time.time() - started
            save_dataset((objval, tour, duration), cache_path)

        assert tour[0] == 0, "Tour must start with depot"
        tour = tour[1:]

        # Re-evaluate the tour independently and compare with the solver's objective.
        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour)
        assert abs(total_cost - objval) <= 1e-5, "Cost is incorrect"
        return total_cost, tour, duration
    except Exception as e:
        # OR-Tools sometimes fails to find a feasible solution. Failed
        # instances yield None; successful ones are cached, so a re-run only
        # has to retry the failures.
        print("Exception occured")
        print(e)
        return None
def main():
    """Build the scope-8 dataset by adding a pickup-to-dropoff great-circle distance.

    Does nothing when the scope-8 training file already exists. The new column
    is named ``osmnx_distance`` and holds ``great_circle(...).miles`` for each
    row.
    """
    if os.path.exists(Configure.processed_train_path.format('8')):
        return

    train, test = data_utils.load_dataset(op_scope='7')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train.pop('trip_duration')
    n_train = train.shape[0]
    conbined_data = pd.concat([train, test])

    def driving_distance(raw):
        # Great-circle distance in miles between the pickup and dropoff points.
        pickup = (raw['pickup_latitude'], raw['pickup_longitude'])
        dropoff = (raw['dropoff_latitude'], raw['dropoff_longitude'])
        return great_circle(pickup, dropoff).miles

    print('calc geopy distance features...')
    coordinate_columns = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
        'dropoff_longitude'
    ]
    coordinates = conbined_data[coordinate_columns]
    conbined_data['osmnx_distance'] = coordinates.apply(driving_distance, axis=1)

    train = conbined_data.iloc[:n_train]
    test = conbined_data.iloc[n_train:]
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='8')
def main():
    """Add binary features to the scope-4 data and save the result as scope 5.

    The target column is set aside while ``generate_binary_features`` works on
    the stacked train+test frame and is re-attached afterwards.
    """
    train, test = data_utils.load_dataset(op_scope='4')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train.pop('trip_duration')
    n_train = train.shape[0]
    conbined_data = pd.concat([train, test])

    generate_binary_features(conbined_data)

    # Disabled experiment: extra clustering / group-by speed features.
    # for n_clusters in [6**2]:
    #     print 'location clustering n_clusters = {}...'.format(n_clusters)
    #     location_clustering(conbined_data, n_clusters=n_clusters, batch_size=64 ** 3, random_state=1000)
    #
    #     train = conbined_data.iloc[:train.shape[0], :]
    #     test = conbined_data.iloc[train.shape[0]:, :]
    #     train['trip_duration'] = trip_durations
    #
    #     print 'generate lat_long groupby speed features...'
    #     train, test = generate_groupby_speed_features(train, test, n_clusters, loc1='latitude', loc2='longitude',
    #                                                   fea_name='lat_long_')
    #     del train['trip_duration']
    #     print 'train: {}, test: {}'.format(train.shape, test.shape)
    #     conbined_data = pd.concat([train, test])

    train = conbined_data.iloc[:n_train]
    test = conbined_data.iloc[n_train:]
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='5')
def eval_dataset(dataset_path, decode_strategy, width, softmax_temp, opts):
    """Evaluate a model on a supervised dataset and report its optimality gap.

    Loads the model named by ``opts.model``, runs ``_eval_dataset`` on the
    dataset at ``dataset_path``, compares the resulting costs with the
    ground-truth costs from ``rollout_groundtruth``, prints summary statistics
    and saves the per-instance results.

    Args:
        dataset_path: Path of the dataset file to evaluate on.
        decode_strategy: Decoding strategy name; also part of the output name.
        width: Decoding width; omitted from the output name for ``'greedy'``.
        softmax_temp: Softmax temperature forwarded to ``_eval_dataset``.
        opts: Options object; reads ``model``, ``no_cuda``, ``batch_size``,
            ``val_size``, ``results_dir``, ``offset`` and ``f``.

    Returns:
        A LaTeX table fragment with mean/std cost, mean/std gap and the total
        duration.

    Raises:
        AssertionError: If the output file exists and ``opts.f`` is not set.
    """
    model, model_args = load_model(opts.model)
    use_cuda = torch.cuda.is_available() and not opts.no_cuda
    device = torch.device("cuda:0" if use_cuda else "cpu")

    dataset = model.problem.make_dataset(
        filename=dataset_path, batch_size=opts.batch_size,
        num_samples=opts.val_size, neighbors=model_args['neighbors'],
        knn_strat=model_args['knn_strat'], supervised=True)

    results = _eval_dataset(model, dataset, decode_strategy, width,
                            softmax_temp, opts, device)

    costs, tours, durations = zip(*results)
    costs, tours, durations = np.array(costs), np.array(tours), np.array(
        durations)
    gt_tours = dataset.tour_nodes
    gt_costs = rollout_groundtruth(model.problem, dataset, opts).cpu().numpy()
    # Optimality gap in percent relative to the ground-truth cost.
    opt_gap = ((costs / gt_costs - 1) * 100)

    # Materialise the rows: a bare zip object is a one-shot iterator, which is
    # fragile to hand to a serialiser and useless after a single pass.
    results = list(zip(costs, gt_costs, tours, gt_tours, opt_gap, durations))

    # Reported as sum of per-instance durations divided by the batch size.
    total_duration = np.sum(durations) / opts.batch_size

    print('Validation groundtruth cost: {:.3f} +- {:.3f}'.format(
        gt_costs.mean(), np.std(gt_costs)))
    print('Validation average cost: {:.3f} +- {:.3f}'.format(
        costs.mean(), np.std(costs)))
    print('Validation optimality gap: {:.3f}% +- {:.3f}'.format(
        opt_gap.mean(), np.std(opt_gap)))
    print('Average duration: {:.3f}s +- {:.3f}'.format(durations.mean(),
                                                       np.std(durations)))
    print('Total duration: {}s'.format(total_duration))

    dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1])
    # Model name is built from the last two path components, extension removed.
    model_name = "_".join(
        os.path.normpath(os.path.splitext(opts.model)[0]).split(os.sep)[-2:])

    results_dir = os.path.join(opts.results_dir, dataset_basename)
    os.makedirs(results_dir, exist_ok=True)

    out_file = os.path.join(
        results_dir, "{}-{}-{}{}-t{}-{}-{}{}".format(
            dataset_basename, model_name, decode_strategy,
            width if decode_strategy != 'greedy' else '', softmax_temp,
            opts.offset, opts.offset + len(costs), ext))

    assert opts.f or not os.path.isfile(
        out_file
    ), "File already exists! Try running with -f option to overwrite."

    save_dataset(results, out_file)

    # Raw string: "\p" and "\%" are not valid escape sequences, so a plain
    # literal triggers a warning on current Python. The resulting text is the
    # same.
    latex_str = r' & ${:.3f}\pm{:.3f}$ & ${:.3f}\%\pm{:.3f}$ & ${:.3f}$s'.format(
        costs.mean(), np.std(costs), opt_gap.mean(), np.std(opt_gap),
        total_duration)

    return latex_str
def main():
    """Build the scope-7 dataset by adding holiday indicator columns.

    Does nothing when the scope-7 training file already exists. Two 0/1
    columns are added to the stacked train+test frame: ``is_holyday`` and
    ``is_day_before_holyday``, both derived from ``pickup_month``,
    ``pickup_day`` and ``pickup_weekday``.
    """
    if os.path.exists(Configure.processed_train_path.format('7')):
        return

    train, test = data_utils.load_dataset(op_scope='6')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train['trip_duration']
    del train['trip_duration']
    n_train = train.shape[0]
    conbined_data = pd.concat([train, test])

    conbined_data['is_holyday'] = conbined_data.apply(_is_holiday, axis=1)
    conbined_data['is_day_before_holyday'] = conbined_data.apply(
        _is_day_before_holiday, axis=1)

    train = conbined_data.iloc[:n_train, :]
    test = conbined_data.iloc[n_train:, :]
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='7')


# Fixed-date holidays as (month, day).
_FIXED_HOLIDAYS = ((1, 1), (7, 4), (11, 11), (12, 25))

# Weekday-anchored holidays as (month, first_day, last_day, weekday).
# NOTE(review): weekday 0 is presumably Monday (pandas convention) - confirm
# against the code that creates `pickup_weekday`.
_FLOATING_HOLIDAYS = (
    (1, 15, 21, 0),
    (2, 15, 21, 0),
    (5, 25, 31, 0),
    (9, 1, 7, 0),
    (10, 8, 14, 0),
    (11, 22, 28, 3),
)

# The day before each fixed-date holiday.
_FIXED_HOLIDAY_EVES = ((12, 31), (7, 3), (11, 10), (12, 24))

# The day before each weekday-anchored holiday. The eve of the September
# holiday window spans a month boundary (Aug 31 or Sep 1-6), so it takes two
# entries; both require weekday 6.
_FLOATING_HOLIDAY_EVES = (
    (1, 14, 20, 6),
    (2, 14, 20, 6),
    (5, 24, 30, 6),
    (8, 31, 31, 6),
    (9, 1, 6, 6),
    (10, 7, 13, 6),
    (11, 21, 27, 2),
)


def _matches_calendar(row, fixed_dates, weekday_windows):
    """Return 1 when the row's pickup date matches any listed date or window, else 0."""
    month = row['pickup_month']
    day = row['pickup_day']
    weekday = row['pickup_weekday']

    if any(month == m and day == d for m, d in fixed_dates):
        return 1
    if any(month == m and first <= day <= last and weekday == wd
           for m, first, last, wd in weekday_windows):
        return 1
    return 0


def _is_holiday(row):
    """Return 1 when the row's pickup date is one of the encoded holidays, else 0."""
    return _matches_calendar(row, _FIXED_HOLIDAYS, _FLOATING_HOLIDAYS)


def _is_day_before_holiday(row):
    """Return 1 when the row's pickup date is the day before an encoded holiday, else 0.

    The previous inline expression evaluated the September clause as
    ``sep_1_to_6 or (aug_31 and weekday == 6)`` because ``and`` binds tighter
    than ``or``, so every September 1-6 was flagged regardless of weekday.
    The weekday test now applies to both halves of that window.
    """
    return _matches_calendar(row, _FIXED_HOLIDAY_EVES, _FLOATING_HOLIDAY_EVES)
def solve_lkh_log(executable, directory, name, depot, loc, demand, capacity,
                  grid_size=1, runs=1, disable_cache=False):
    """Solve a VRP instance with the LKH executable, caching the tour on disk.

    The problem, parameter, tour, log and cache files are all written to
    ``directory`` with the prefix ``<name>.lkh<runs>``.

    Args:
        executable: Path of the LKH binary.
        directory: Working directory for all generated files.
        name: Instance name used in the file names.
        depot: Depot coordinates.
        loc: Customer coordinates.
        demand: Per-customer demands; its length is passed to the tour reader.
        capacity: Vehicle capacity.
        grid_size: Forwarded to ``write_vrplib``.
        runs: Value of the ``RUNS`` parameter; also part of the file names.
        disable_cache: When true, ignore an existing cache file and re-solve.

    Returns:
        ``(cost, tour, duration)``.

    Raises:
        Exception: Failures are not swallowed here. The previous handler
            re-raised unconditionally, leaving its ``print``/``return None``
            unreachable, so the dead code and the no-op ``try`` are removed
            and errors propagate exactly as before.
    """
    problem_filename = os.path.join(directory, "{}.lkh{}.vrp".format(name, runs))
    tour_filename = os.path.join(directory, "{}.lkh{}.tour".format(name, runs))
    output_filename = os.path.join(directory, "{}.lkh{}.pkl".format(name, runs))
    param_filename = os.path.join(directory, "{}.lkh{}.par".format(name, runs))
    log_filename = os.path.join(directory, "{}.lkh{}.log".format(name, runs))

    # May have already been run
    if os.path.isfile(output_filename) and not disable_cache:
        tour, duration = load_dataset(output_filename)
    else:
        write_vrplib(problem_filename, depot, loc, demand, capacity,
                     grid_size, name=name)

        params = {
            "PROBLEM_FILE": problem_filename,
            "OUTPUT_TOUR_FILE": tour_filename,
            "RUNS": runs,
            "SEED": 1234
        }
        write_lkh_par(param_filename, params)

        # Solver stdout and stderr both go to the log file; only the
        # subprocess call itself is timed.
        with open(log_filename, 'w') as f:
            start = time.time()
            check_call([executable, param_filename], stdout=f, stderr=f)
            duration = time.time() - start

        tour = read_vrplib(tour_filename, n=len(demand))
        save_dataset((tour, duration), output_filename)

    return calc_vrp_cost(depot, loc, tour), tour, duration
def solve_salesman(directory, name, depot, loc, penalty, deterministic_prize,
                   stochastic_prize, runs=10):
    """Solve a PCTSP instance with iterated local search, caching the result.

    Args:
        directory: Directory for the problem and cache files.
        name: Instance name used in the file names.
        depot: Depot coordinates.
        loc: Node coordinates.
        penalty: Per-node penalties.
        deterministic_prize: Per-node prizes written to the problem file.
        stochastic_prize: Unused by this solver.
        runs: Number of local-search runs; also part of the file names.

    Returns:
        ``(total_cost, tour, duration)`` with the leading depot stripped from
        ``tour``, or ``None`` when anything fails (the error is printed).
    """
    problem_filename = os.path.join(
        directory, "{}.salesman{}.pctsp".format(name, runs))
    output_filename = os.path.join(
        directory, "{}.salesman{}.pkl".format(name, runs))

    try:
        # May have already been run
        if not os.path.isfile(output_filename):
            write_pctsp(problem_filename, depot, loc, penalty,
                        deterministic_prize, name=name)

            start = time.time()

            # Fixed seed so the random start solution is reproducible.
            random.seed(1234)

            pctsp = Pctsp()
            pctsp.load(problem_filename, float_to_scaled_int(1.))
            s = solution.random(pctsp, start_size=int(len(pctsp.prize) * 0.7))
            s = ilocal_search(s, n_runs=runs)

            output = (s.route[:s.size], s.quality)

            duration = time.time() - start

            save_dataset((output, duration), output_filename)
        else:
            output, duration = load_dataset(output_filename)

        # Now parse output
        tour = output[0][:]
        assert tour[0] == 0, "Tour should start with depot"
        assert tour[-1] != 0, "Tour should not end with depot"
        tour = tour[1:]  # Strip off depot

        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize,
                                     tour)
        # Relative error against the solver's own (scaled integer) quality.
        # abs() is required: without it any recomputed cost below the
        # reported one passes the check.
        relative_error = abs(
            float_to_scaled_int(total_cost) - output[1]) / float(output[1])
        assert relative_error < 1e-5
        return total_cost, tour, duration
    except Exception as e:
        print("Exception occured")
        print(e)
        return None
def solve_gurobi(directory, name, depot, loc, penalty, deterministic_prize,
                 stochastic_prize, disable_cache=False, timeout=None, gap=None):
    """Solve a PCTSP instance with Gurobi, caching the result on disk.

    Args:
        directory: Directory holding the cache file.
        name: Instance name used to build the cache file name.
        depot: Depot coordinates.
        loc: Node coordinates.
        penalty: Per-node penalties.
        deterministic_prize: Per-node prizes used by the solver.
        stochastic_prize: Unused by this solver.
        disable_cache: When true, ignore an existing cache file and re-solve.
        timeout: Optional solver timeout; encoded in the cache name when set.
        gap: Optional solver gap; encoded in the cache name when set.

    Returns:
        ``(total_cost, tour, duration)`` with the leading depot stripped from
        ``tour``, or ``None`` when anything fails (the error is printed).
    """
    # Imported lazily so gurobi is only required when this solver is used.
    from .pctsp_gurobi import \
        solve_euclidian_pctsp as solve_euclidian_pctsp_gurobi

    try:
        timeout_tag = "" if timeout is None else "t{}".format(timeout)
        gap_tag = "" if gap is None else "gap{}".format(gap)
        cache_path = os.path.join(
            directory, "{}.gurobi{}{}.pkl".format(name, timeout_tag, gap_tag))

        if not disable_cache and os.path.isfile(cache_path):
            cost, tour, duration = load_dataset(cache_path)
        else:
            started = time.time()
            # Must collect 1, or the sum of the prizes if that is less than 1.
            min_prize = min(sum(deterministic_prize), 1.)
            cost, tour = solve_euclidian_pctsp_gurobi(
                depot, loc, penalty, deterministic_prize, min_prize,
                threads=1, timeout=timeout, gap=gap)
            duration = time.time() - started  # wall-clock time
            save_dataset((cost, tour, duration), cache_path)

        # The tour starts at the depot (node 0), which is stripped before the
        # cost is recomputed.
        assert tour[0] == 0
        tour = tour[1:]

        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize,
                                     tour)
        assert abs(total_cost - cost) <= 1e-5, "Cost is incorrect"
        return total_cost, tour, duration
    except Exception as e:
        # Failed instances yield None; successful ones are cached, so a re-run
        # only has to retry the failures.
        print("Exception occured")
        print(e)
        return None
def eval_dataset(dataset_path, width, softmax_temp, opts):
    """Evaluate a model on a dataset, print timing/cost statistics and save the results.

    With ``opts.multiprocessing`` the work is split across one process per
    CUDA device via ``eval_dataset_mp``; otherwise ``_eval_dataset`` runs in
    this process.

    Args:
        dataset_path: Path of the dataset file to evaluate on.
        width: Decoding width; omitted from the output name for ``'greedy'``.
        softmax_temp: Softmax temperature forwarded to the evaluation.
        opts: Options object; reads ``model``, ``no_cuda``, ``multiprocessing``,
            ``val_size``, ``offset``, ``eval_batch_size``, ``decode_strategy``,
            ``results_dir``, ``o`` and ``f``.

    Returns:
        ``(costs, tours, durations)`` as tuples, one entry per instance.

    Raises:
        AssertionError: If multiprocessing is requested without CUDA, if
            ``val_size`` is not divisible by the device count, or if the
            output file exists and ``opts.f`` is not set.
    """
    # The model is loaded even in the multiprocessing case because its problem
    # name is needed to decide where the results are written.
    model, _ = load_model(opts.model)
    use_cuda = torch.cuda.is_available() and not opts.no_cuda

    if not opts.multiprocessing:
        device = torch.device("cuda:0" if use_cuda else "cpu")
        dataset = model.problem.make_dataset(
            filename=dataset_path, num_samples=opts.val_size,
            offset=opts.offset)
        results = _eval_dataset(model, dataset, width, softmax_temp, opts,
                                device)
    else:
        assert use_cuda, "Can only do multiprocessing with cuda"
        num_processes = torch.cuda.device_count()
        assert opts.val_size % num_processes == 0

        worker_args = [
            (dataset_path, width, softmax_temp, opts, i, num_processes)
            for i in range(num_processes)
        ]
        with mp.Pool(num_processes) as pool:
            per_worker = pool.map(eval_dataset_mp, worker_args)
            results = list(itertools.chain.from_iterable(per_worker))

    # Reported as if a single device were used, even with multiprocessing.
    parallelism = opts.eval_batch_size

    # Not really costs since they should be negative
    costs, tours, durations = zip(*results)

    cost_interval = 2 * np.std(costs) / np.sqrt(len(costs))
    print("Average cost: {} +- {}".format(np.mean(costs), cost_interval))
    duration_interval = 2 * np.std(durations) / np.sqrt(len(durations))
    print("Average serial duration: {} +- {}".format(
        np.mean(durations), duration_interval))
    print("Average parallel duration: {}".format(
        np.mean(durations) / parallelism))
    total_seconds = int(np.sum(durations) / parallelism)
    print("Calculated total duration: {}".format(
        timedelta(seconds=total_seconds)))

    dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1])
    model_path_parts = os.path.normpath(
        os.path.splitext(opts.model)[0]).split(os.sep)
    model_name = "_".join(model_path_parts[-2:])

    if opts.o is not None:
        out_file = opts.o
    else:
        results_dir = os.path.join(opts.results_dir, model.problem.NAME,
                                   dataset_basename)
        os.makedirs(results_dir, exist_ok=True)

        width_tag = width if opts.decode_strategy != 'greedy' else ''
        out_file = os.path.join(results_dir, "{}-{}-{}{}-t{}-{}-{}{}".format(
            dataset_basename, model_name, opts.decode_strategy, width_tag,
            softmax_temp, opts.offset, opts.offset + len(costs), ext))

    assert opts.f or not os.path.isfile(
        out_file), "File already exists! Try running with -f option to overwrite."

    save_dataset((results, parallelism), out_file)

    return costs, tours, durations
def solve_compass_log(executable, directory, name, depot, loc, prize,
                      max_length, disable_cache=False):
    """Solve an orienteering instance with the Compass executable, caching the tour.

    Args:
        executable: Path of the Compass binary.
        directory: Working directory for the problem, tour, log and cache files.
        name: Instance name used in the file names.
        depot: Depot coordinates.
        loc: Node coordinates.
        prize: Per-node prizes; its length is passed to the tour reader.
        max_length: Maximum allowed tour length.
        disable_cache: When true, ignore an existing cache file and re-solve.

    Returns:
        ``(-total_prize, tour, duration)``, or ``None`` when anything fails
        (the error is printed).
    """
    problem_filename = os.path.join(directory, "{}.oplib".format(name))
    tour_filename = os.path.join(directory, "{}.tour".format(name))
    output_filename = os.path.join(directory, "{}.compass.pkl".format(name))
    log_filename = os.path.join(directory, "{}.log".format(name))

    try:
        # May have already been run
        if os.path.isfile(output_filename) and not disable_cache:
            tour, duration = load_dataset(output_filename)
        else:
            write_oplib(problem_filename, depot, loc, prize, max_length,
                        name=name)

            # Solver stdout and stderr both go to the log file; only the
            # subprocess call itself is timed.
            with open(log_filename, 'w') as f:
                start = time.time()
                check_call([
                    executable, '--op', '--op-ea4op', problem_filename, '-o',
                    tour_filename
                ], stdout=f, stderr=f)
                duration = time.time() - start

            tour = read_oplib(tour_filename, n=len(prize))

            # Compute the length once and reuse it for the warning and the
            # hard check, instead of re-evaluating the tour each time.
            tour_length = calc_op_length(depot, loc, tour)
            if not tour_length <= max_length:
                print("Warning: length exceeds max length:", tour_length,
                      max_length)
            # A small tolerance above max_length is accepted.
            assert tour_length <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!"

            save_dataset((tour, duration), output_filename)

        return -calc_op_total(prize, tour), tour, duration
    except Exception as e:
        print("Exception occured")
        print(e)
        return None
def solve_gurobi(directory, name, depot, loc, prize, max_length,
                 disable_cache=False, timeout=None, gap=None):
    """Solve an orienteering instance with Gurobi, caching the result on disk.

    Args:
        directory: Directory holding the cache file.
        name: Instance name used to build the cache file name.
        depot: Depot coordinates.
        loc: Node coordinates.
        prize: Per-node prizes.
        max_length: Maximum allowed tour length.
        disable_cache: When true, ignore an existing cache file and re-solve.
        timeout: Optional solver timeout; encoded in the cache name when set.
        gap: Optional solver gap; encoded in the cache name when set.

    Returns:
        ``(total_cost, tour, duration)`` where ``total_cost`` is the negated
        collected prize and the leading depot is stripped from ``tour``, or
        ``None`` when anything fails (the error is printed).
    """
    # Imported lazily so gurobi is only required when this solver is used.
    from problems.op.op_gurobi import \
        solve_euclidian_op as solve_euclidian_op_gurobi

    try:
        timeout_tag = "" if timeout is None else "t{}".format(timeout)
        gap_tag = "" if gap is None else "gap{}".format(gap)
        cache_path = os.path.join(
            directory, "{}.gurobi{}{}.pkl".format(name, timeout_tag, gap_tag))

        if not disable_cache and os.path.isfile(cache_path):
            cost, tour, duration = load_dataset(cache_path)
        else:
            started = time.time()
            cost, tour = solve_euclidian_op_gurobi(
                depot, loc, prize, max_length, threads=1, timeout=timeout,
                gap=gap)
            duration = time.time() - started  # wall-clock time
            save_dataset((cost, tour, duration), cache_path)

        # The tour starts at the depot (node 0), which is stripped before the
        # result is validated.
        assert tour[0] == 0
        tour = tour[1:]

        assert calc_op_length(
            depot, loc,
            tour) <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!"

        total_cost = -calc_op_total(prize, tour)
        assert abs(total_cost - cost) <= 1e-4, "Cost is incorrect"
        return total_cost, tour, duration
    except Exception as e:
        # Failed instances yield None; successful ones are cached, so a re-run
        # only has to retry the failures.
        print("Exception occured")
        print(e)
        return None
def main(base_data_dir):
    """Build the scope-1 question dataset: length statistics, unigram features, text cleaning.

    Does nothing when the scope-1 training file already exists under
    ``base_data_dir``.

    Args:
        base_data_dir: Base data directory. When its name contains
            ``'no_stem_words'`` the non-stemming cleaner is used, otherwise
            the stemming one.
    """
    op_scope = 0
    if os.path.exists(
            Configure.processed_train_path.format(base_data_dir,
                                                  op_scope + 1)):
        return

    print("---> load datasets from scope {}".format(op_scope))
    train, test = data_utils.load_dataset(base_data_dir, op_scope)
    print("train: {}, test: {}".format(train.shape, test.shape))

    print("---> generate basic statistic features")
    questions = (('q1', 'question1'), ('q2', 'question2'))
    # Character counts first, then word counts, for train and then test.
    for frame in (train, test):
        for tag, column in questions:
            frame['num_of_chars_' + tag] = frame[column].apply(
                lambda x: len(str(x)))
    for frame in (train, test):
        for tag, column in questions:
            frame['num_of_words_' + tag] = frame[column].apply(
                lambda x: len(str(x).split()))

    print('---> generate unigram_words features before cleaned')
    train = jobs.parallelize_dataframe(train, generate_unigram_words_features)
    test = jobs.parallelize_dataframe(test, generate_unigram_words_features)

    print('---> clean text')
    # Wall-clock timing: time.clock() no longer exists on Python 3.8+.
    start = time.time()
    if 'no_stem_words' in base_data_dir:
        clean_text_func = clean_text_func_no_stem_words
    else:
        clean_text_func = clean_text_func_stem_words
    print('clean train question')
    train = jobs.parallelize_dataframe(train, clean_text_func)
    print('clean test question')
    test = jobs.parallelize_dataframe(test, clean_text_func)
    stop = time.time()
    # Report the elapsed time; the old call passed two arguments to a single
    # placeholder and printed the raw end timestamp instead.
    print("text cleaned, cost {}s".format(stop - start))

    print('---> generate unigram_words features after cleaned')
    train = jobs.parallelize_dataframe(
        train, generate_cleaned_unigram_words_features)
    test = jobs.parallelize_dataframe(
        test, generate_cleaned_unigram_words_features)

    print("train: {}, test: {}".format(train.shape, test.shape))
    print("---> save datasets")
    data_utils.save_dataset(base_data_dir, train, test, op_scope + 1)
def main():
    """Build the scope-0 dataset by encoding ``store_and_fwd_flag`` as a 0/1 column.

    Does nothing when the scope-0 training file already exists. The new column
    ``is_store_and_fwd_flag`` is 1 where the flag equals ``'Y'`` and 0
    otherwise; the original column is removed.
    """
    if os.path.exists(Configure.processed_train_path.format('0')):
        return

    train, test = data_utils.load_dataset(op_scope='0')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    def _flag_to_int(flag):
        # Anything other than 'Y' maps to 0.
        if flag == 'Y':
            return 1
        return 0

    # store_and_fwd_flag
    for frame in (train, test):
        frame['is_store_and_fwd_flag'] = frame['store_and_fwd_flag'].map(
            _flag_to_int)
    for frame in (train, test):
        del frame['store_and_fwd_flag']

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='0')
def main(base_data_dir):
    """Build the scope-5 question dataset by adding common-word-count features.

    Does nothing when the scope-5 training file already exists under
    ``base_data_dir``.

    Args:
        base_data_dir: Base data directory passed to the load/save helpers.
    """
    op_scope = 4
    next_scope = op_scope + 1

    target_path = Configure.processed_train_path.format(base_data_dir,
                                                        next_scope)
    if os.path.exists(target_path):
        return

    print("---> load datasets from scope {}".format(op_scope))
    train, test = data_utils.load_dataset(base_data_dir, op_scope)
    print("train: {}, test: {}".format(train.shape, test.shape))

    print('---> generate common word count')
    train = jobs.parallelize_dataframe(train, generate_common_word_count)
    test = jobs.parallelize_dataframe(test, generate_common_word_count)

    print("train: {}, test: {}".format(train.shape, test.shape))
    print("---> save datasets")
    data_utils.save_dataset(base_data_dir, train, test, next_scope)
def eval_dataset(dataset_path, opts):
    """Evaluate a model on a dataset, print timing/cost statistics and save the results.

    Args:
        dataset_path: Path of the dataset file to evaluate on.
        opts: Options object; reads ``model``, ``no_cuda``, ``val_size``,
            ``offset``, ``eval_batch_size``, ``decode_strategy``,
            ``results_dir`` and ``o``.

    Returns:
        ``(costs, tours, durations)`` as tuples, one entry per instance.
    """
    model = load_model(opts.model)
    # CUDA is used when it is available and NOT disabled by the user. The
    # previous condition lacked the `not`, so the GPU was selected only when
    # `no_cuda` was set.
    use_cuda = torch.cuda.is_available() and not opts.no_cuda
    device = torch.device('cuda:0' if use_cuda else 'cpu')

    dataset = model.problem.make_dataset(filename=dataset_path,
                                         num_samples=opts.val_size,
                                         offset=opts.offset)
    results = _eval_dataset(model, dataset, opts, device)

    # Durations are divided by the evaluation batch size for the "parallel" figures.
    parallelism = opts.eval_batch_size

    # Not really costs since they should be negative
    costs, tours, durations = zip(*results)

    print("Average cost: {} +- {}".format(
        np.mean(costs), 2 * np.std(costs) / np.sqrt(len(costs))))
    print("Average serial duration: {} +- {}".format(
        np.mean(durations),
        2 * np.std(durations) / np.sqrt(len(durations))))
    print("Average parallel duration: {}".format(
        np.mean(durations) / parallelism))
    print("Calculated total duration: {}".format(
        timedelta(seconds=int(np.sum(durations) / parallelism))))

    dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1])
    # Model name is built from the last two path components, extension removed.
    model_name = "_".join(
        os.path.normpath(os.path.splitext(opts.model)[0]).split(os.sep)[-2:])

    if opts.o is None:
        results_dir = os.path.join(opts.results_dir, model.problem.NAME,
                                   dataset_basename)
        os.makedirs(results_dir, exist_ok=True)

        out_file = os.path.join(
            results_dir,
            "{}-{}-{}-{}-{}{}".format(dataset_basename, model_name,
                                      opts.decode_strategy, opts.offset,
                                      opts.offset + len(costs), ext))
    else:
        out_file = opts.o

    save_dataset((results, parallelism), out_file)

    return costs, tours, durations
def solve_pctsp_log(executable, directory, name, depot, loc, penalty,
                    deterministic_prize, stochastic_prize, runs=10):
    """Solve a PCTSP instance with an external executable, caching its raw output.

    Args:
        executable: Path of the solver binary.
        directory: Working directory for the problem, log and cache files.
        name: Instance name used in the file names.
        depot: Depot coordinates.
        loc: Node coordinates.
        penalty: Per-node penalties.
        deterministic_prize: Per-node prizes written to the problem file.
        stochastic_prize: Unused by this solver.
        runs: Number of runs passed to the binary; also part of the file names.

    Returns:
        ``(cost, tour, duration)`` with both depot visits stripped from
        ``tour`` (a plain list), or ``None`` when anything fails (the error is
        printed).
    """
    problem_filename = os.path.join(directory,
                                    "{}.pctsp{}.pctsp".format(name, runs))
    output_filename = os.path.join(directory,
                                   "{}.pctsp{}.pkl".format(name, runs))
    log_filename = os.path.join(directory,
                                "{}.pctsp{}.log".format(name, runs))

    try:
        if os.path.isfile(output_filename):
            # Already solved: reuse the cached solver output.
            output, duration = load_dataset(output_filename)
        else:
            write_pctsp(problem_filename, depot, loc, penalty,
                        deterministic_prize, name=name)

            with open(log_filename, 'w') as f:
                started = time.time()
                # Arguments: exe, filename, min_total_prize (=1), num_runs
                command = [
                    executable, problem_filename,
                    float_to_scaled_int_str(1.),
                    str(runs)
                ]
                raw_output = check_output(command, stderr=f)
                output = raw_output.decode('utf-8')
                duration = time.time() - started
                # stderr is streamed into the log; stdout is appended afterwards.
                f.write(output)

            save_dataset((output, duration), output_filename)

        # The tour is on the first line that starts with the heading below.
        heading = "Best Result Route: "
        tour = None
        for line in output.splitlines():
            if line.startswith(heading):
                tour = np.array(line[len(heading):].split(" ")).astype(int)
                break

        assert tour is not None, "Could not find tour in output!"
        assert tour[0] == 0, "Tour should start with depot"
        assert tour[-1] == 0, "Tour should end with depot"
        tour = tour[1:-1]  # drop the depot at both ends

        cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour)
        return cost, tour.tolist(), duration
    except Exception as e:
        print("Exception occured")
        print(e)
        return None
def main():
    """Add time-window features to the scope-4 data and save the result as scope 5.

    The target column is set aside while ``perform_time_window`` works on the
    stacked, re-indexed train+test frame and is re-attached afterwards.
    """
    train, test = data_utils.load_dataset(op_scope='4')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train.pop('trip_duration')
    n_train = train.shape[0]

    conbined_data = pd.concat([train, test])
    conbined_data.columns = test.columns.values
    # Replace the concatenated index with a contiguous 0..n-1 range.
    conbined_data.index = range(conbined_data.shape[0])

    # Window sizes passed to perform_time_window.
    # NOTE(review): the original comment said "minutes" but the variable is
    # named *_days - confirm the unit.
    timewindow_days = [10, 15]
    conbined_data = perform_time_window(conbined_data, timewindow_days)

    train = conbined_data.iloc[:n_train]
    test = conbined_data.iloc[n_train:]
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='5')
def solve_concorde_log(executable, directory, name, loc, disable_cache=False):
    """Solve a TSP instance with the Concorde executable, caching the tour on disk.

    Args:
        executable: Path of the Concorde binary.
        directory: Working directory for the problem, tour, log and cache files.
        name: Instance name used in the file names.
        loc: Node coordinates.
        disable_cache: When true, ignore an existing cache file and re-solve.

    Returns:
        ``(length, tour, duration)``, or ``None`` when anything fails (the
        error is printed).
    """
    problem_filename = os.path.join(directory, "{}.tsp".format(name))
    tour_filename = os.path.join(directory, "{}.tour".format(name))
    output_filename = os.path.join(directory, "{}.concorde.pkl".format(name))
    log_filename = os.path.join(directory, "{}.log".format(name))

    try:
        cache_hit = os.path.isfile(output_filename) and not disable_cache
        if cache_hit:
            tour, duration = load_dataset(output_filename)
        else:
            write_tsplib(problem_filename, loc, name=name)

            # Concorde leaves solution traces in its current directory, so it
            # is run from the target directory with absolute file paths.
            command = [
                executable, '-s', '1234', '-x', '-o',
                os.path.abspath(tour_filename),
                os.path.abspath(problem_filename)
            ]
            with open(log_filename, 'w') as f:
                started = time.time()
                try:
                    check_call(command, stdout=f, stderr=f, cwd=directory)
                except CalledProcessError as e:
                    # Exit status 255 is tolerated; anything else fails the assert.
                    assert e.returncode == 255
                duration = time.time() - started

            tour = read_concorde_tour(tour_filename)
            save_dataset((tour, duration), output_filename)

        return calc_tsp_length(loc, tour), tour, duration
    except Exception as e:
        print("Exception occured")
        print(e)
        return None
def main():
    """Build the scope-4 dataset by dropping features via ``drop_some_features``.

    Does nothing when the scope-4 training file already exists. The drop
    threshold passed to the helper is a missing rate of 1.
    """
    if os.path.exists(Configure.processed_train_path.format('4')):
        return

    train, test = data_utils.load_dataset(op_scope='3')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train.pop('trip_duration')
    n_train = train.shape[0]
    conbined_data = pd.concat([train, test])

    drop_missing_rate = 1
    print('drop some features, missing_rate > {}'.format(drop_missing_rate))
    conbined_data = drop_some_features(conbined_data,
                                       drop_missing_rate=drop_missing_rate)

    train = conbined_data.iloc[:n_train]
    test = conbined_data.iloc[n_train:]
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='4')
def solve_ortools(directory, name, depot, loc, prize, max_length,
                  sec_local_search=0, disable_cache=False):
    """Solve an orienteering instance with OR-Tools, caching the result on disk.

    Args:
        directory: Directory holding the cache file.
        name: Instance name used to build the cache file name.
        depot: Depot coordinates.
        loc: Node coordinates.
        prize: Per-node prizes.
        max_length: Maximum allowed tour length.
        sec_local_search: Forwarded to the solver; also part of the cache name.
        disable_cache: When true, ignore an existing cache file and re-solve.

    Returns:
        ``(-total_prize, tour, duration)`` with the leading depot stripped
        from ``tour``, or ``None`` when anything fails (the error is printed).
    """
    # Imported lazily so ortools is only required when this solver is used.
    from problems.op.op_ortools import solve_op_ortools

    try:
        cache_path = os.path.join(
            directory, "{}.ortools{}.pkl".format(name, sec_local_search))

        if not disable_cache and os.path.isfile(cache_path):
            objval, tour, duration = load_dataset(cache_path)
        else:
            started = time.time()
            objval, tour = solve_op_ortools(
                depot, loc, prize, max_length,
                sec_local_search=sec_local_search)
            duration = time.time() - started
            save_dataset((objval, tour, duration), cache_path)

        assert tour[0] == 0, "Tour must start with depot"
        tour = tour[1:]

        tour_length = calc_op_length(depot, loc, tour)
        assert tour_length <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!"

        # The objective is the negated collected prize; compare it with an
        # independent re-evaluation of the tour.
        assert abs(-calc_op_total(prize, tour) -
                   objval) <= 1e-5, "Cost is incorrect"
        return -calc_op_total(prize, tour), tour, duration
    except Exception as e:
        # OR-Tools sometimes fails to find a feasible solution. Failed
        # instances yield None; successful ones are cached, so a re-run only
        # has to retry the failures.
        print("Exception occured")
        print(e)
        return None
def main():
    """Build the scope-5 dataset by adding heavy-traffic cluster distances.

    Loads scope 4, computes ``calc_heavy_traffic_cluster_distances`` on the
    concatenated train/test frame, left-joins the result on ``id``, splits the
    frame back into train and test, restores ``trip_duration`` on train and
    saves the result as scope 5.
    """
    train, test = data_utils.load_dataset(op_scope='4')
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('train: {}, test: {}'.format(train.shape, test.shape))

    # The target only exists for train; set it aside while features are
    # computed on the combined frame.
    trip_durations = train['trip_duration']
    del train['trip_duration']

    conbined_data = pd.concat([train, test])

    n_clusters = 10**2
    result_df = calc_heavy_traffic_cluster_distances(
        conbined_data,
        n_clusters=n_clusters,
        batch_size=64**3,
        most_traffic_quantile=0.80,
        random_state=1000)
    conbined_data = pd.merge(conbined_data, result_df, how='left', on='id')

    n_train = train.shape[0]
    # Copy the train slice: a column is assigned to it below, and assigning to
    # a slice of another frame triggers pandas' SettingWithCopy warning.
    train = conbined_data.iloc[:n_train, :].copy()
    test = conbined_data.iloc[n_train:, :]

    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='5')
def main(base_data_dir):
    """Generate the scope-3 dataset from scope 2 by adding text features.

    Skips all work when the processed train file for the next scope already
    exists. Otherwise each feature generator is applied, through
    ``jobs.parallelize_dataframe``, first to train and then to test, and the
    resulting frames are saved under the next scope.
    """
    op_scope = 2
    next_scope = op_scope + 1

    processed_train = Configure.processed_train_path.format(base_data_dir, next_scope)
    if os.path.exists(processed_train):
        return

    print("---> load datasets from scope {}".format(op_scope))
    train, test = data_utils.load_dataset(base_data_dir, op_scope)
    print("train: {}, test: {}".format(train.shape, test.shape))

    # (progress message, feature generator) pairs, applied in this order.
    feature_steps = (
        ('---> generate question introducer word features',
         generate_question_introducer_word_features),
        ('---> generate symbol features', generate_symbol_count),
        ('---> generate char count features', generate_char_count),
    )
    for banner, feature_fn in feature_steps:
        print(banner)
        train = jobs.parallelize_dataframe(train, feature_fn)
        test = jobs.parallelize_dataframe(test, feature_fn)

    print("train: {}, test: {}".format(train.shape, test.shape))
    print("---> save datasets")
    data_utils.save_dataset(base_data_dir, train, test, next_scope)
def solve_stochastic_pctsp_log(executable, directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize, runs=10, append='all'):
    """Solve a stochastic PCTSP instance by repeatedly re-planning with an external solver.

    In each iteration the not-yet-visited nodes are written to a temporary
    ``.pctsp`` file (using the deterministic prizes), ``executable`` is run on
    it, and part or all of the returned tour is appended to the final tour.
    The prize actually collected so far is then recomputed from
    ``stochastic_prize`` and the loop repeats until every node has been visited
    or the solver returns a tour containing only the depot.

    The result is cached in ``<directory>/<name>.stochpctsp<append><runs>.pkl``;
    if that file exists it is loaded instead of solving again.

    Args:
        executable: Path of the solver binary; it is called with the problem
            file, the minimum prize (scaled int) and ``runs`` as arguments.
        directory: Directory for the problem, log and cache files.
        name: Instance name used in the file names.
        depot: Depot coordinate; it becomes node 0.
        loc: List of node coordinates (concatenated after the depot).
        penalty: Per-node penalties (converted to a numpy array).
        deterministic_prize: Per-node prizes used for planning (converted to a
            numpy array).
        stochastic_prize: Per-node prizes used to compute what was actually
            collected and the final cost.
        runs: Passed to the solver as its last argument (num_runs).
        append: How much of each planned tour to commit: ``'first'`` (first
            node only), ``'half'`` (first half, at least one node) or ``'all'``.

    Returns:
        ``(final_cost, final_tour, total_duration)``, or ``None`` if any
        exception occurred (the exception is printed, not propagated).
    """
    try:
        problem_filename = os.path.join(
            directory, "{}.stochpctsp{}{}.pctsp".format(name, append, runs))
        output_filename = os.path.join(
            directory, "{}.stochpctsp{}{}.pkl".format(name, append, runs))
        log_filename = os.path.join(
            directory, "{}.stochpctsp{}{}.log".format(name, append, runs))

        # May have already been run
        if not os.path.isfile(output_filename):
            total_start = time.time()

            # Raw solver output and wall-clock time of every solver call
            outputs = []
            durations = []
            # Visited node ids, in visiting order (index 0 is the depot and is never added)
            final_tour = []

            coord = [depot] + loc

            # mask[i] is True once node i has been committed to final_tour
            mask = np.zeros(len(coord), dtype=bool)
            dist = distance_matrix(coord, coord)
            penalty = np.array(penalty)
            deterministic_prize = np.array(deterministic_prize)

            # NOTE(review): `it` is incremented below but never read.
            it = 0
            total_collected_prize = 0.
            # As long as we have not visited all nodes we repeat
            # even though we have already satisfied the total prize collected constraint
            # since the algorithm may decide to include more nodes to avoid further penalties
            while len(final_tour) < len(stochastic_prize):

                # Mask all nodes already visited (not the depot)
                mask[final_tour] = True

                # The distance from the 'start' or 'depot' is the distance from the 'current node'
                # this way we mimic as if we have a separate start and end by the asymmetric distance matrix
                # Note: this violates the triangle inequality and the distance from 'depot to depot' becomes nonzero
                # but the program seems to deal with this well
                if len(
                        final_tour
                ) > 0:  # in the first iteration we are at depot and distance matrix is ok
                    dist[0, :] = dist[final_tour[-1], :]

                # penalty / deterministic_prize have no depot entry, hence the mask[1:] offset
                remaining_deterministic_prize = deterministic_prize[~mask[1:]]
                write_pctsp_dist(problem_filename,
                                 dist[np.ix_(~mask, ~mask)],
                                 penalty[~mask[1:]],
                                 remaining_deterministic_prize)

                # If the remaining deterministic prize is less than the prize we should still collect
                # set this lower value as constraint since otherwise problem is infeasible
                # compute total remaining deterministic prize after converting to ints
                # otherwise we may still have problems with rounding
                # Note we need to clip 1 - total_collected_prize between 0 (constraint can already be satisfied)
                # and the maximum achievable with the remaining_deterministic_prize
                min_prize_int = max(
                    0,
                    min(
                        float_to_scaled_int(1. - total_collected_prize),
                        sum([
                            float_to_scaled_int(v)
                            for v in remaining_deterministic_prize
                        ])))
                # Log is opened in append mode so stderr of all iterations accumulates in one file
                with open(log_filename, 'a') as f:
                    start = time.time()
                    output = check_output(
                        # exe, filename, min_total_prize (=1), num_runs
                        [
                            executable, problem_filename,
                            str(min_prize_int),
                            str(runs)
                        ],
                        stderr=f).decode('utf-8')
                    durations.append(time.time() - start)
                    outputs.append(output)

                # Now parse output
                tour = None
                for line in output.splitlines():
                    heading = "Best Result Route: "
                    if line[:len(heading)] == heading:
                        # Remainder of the line is the space-separated node indices of the tour
                        tour = np.array(
                            line[len(heading):].split(" ")).astype(int)
                        break
                assert tour is not None, "Could not find tour in output!"

                assert tour[0] == 0, "Tour should start with depot"
                assert tour[-1] == 0, "Tour should end with depot"
                tour = tour[1:-1]  # Strip off depot

                # Now find to which nodes these correspond
                # (solver indices refer to the unmasked sub-problem, so map them back to original ids)
                tour_node_ids = np.arange(len(coord), dtype=int)[~mask][tour]

                if len(tour_node_ids) == 0:
                    # The inner algorithm can decide to stop, but does not have to
                    assert total_collected_prize > 1 - 1e-5, "Collected prize should be one"
                    break

                if append == 'first':
                    final_tour.append(tour_node_ids[0])
                elif append == 'half':
                    # At least one node is committed so the loop always makes progress
                    final_tour.extend(
                        tour_node_ids[:max(len(tour_node_ids) // 2, 1)])
                else:
                    assert append == 'all'
                    final_tour.extend(tour_node_ids)

                total_collected_prize = calc_pctsp_total(
                    stochastic_prize, final_tour)
                it = it + 1

            # Remove the temporary problem file written in the last iteration.
            # NOTE(review): if the loop body never ran (empty stochastic_prize) no file
            # was written and this raises, which the handler below turns into None.
            os.remove(problem_filename)

            final_cost = calc_pctsp_cost(depot, loc, penalty, stochastic_prize,
                                         final_tour)
            total_duration = time.time() - total_start
            save_dataset(
                (final_cost, final_tour, total_duration, outputs, durations),
                output_filename)

        else:
            final_cost, final_tour, total_duration, outputs, durations = load_dataset(
                output_filename)

        return final_cost, final_tour, total_duration

    except Exception as e:
        # Best-effort: report the failure and return None; nothing is cached in that case
        print("Exception occured")
        print(e)
        return None
def run_func(args): return solve_stochastic_pctsp_log(executable, *args, runs=runs, append=append) results, parallelism = run_all_in_pool( run_func, target_dir, dataset, opts, use_multiprocessing=use_multiprocessing) else: assert False, "Unknown method: {}".format(opts.method) costs, tours, durations = zip( *results) # Not really costs since they should be negative print("Average cost: {} +- {}".format( np.mean(costs), 2 * np.std(costs) / np.sqrt(len(costs)))) print("Average serial duration: {} +- {}".format( np.mean(durations), 2 * np.std(durations) / np.sqrt(len(durations)))) print("Average parallel duration: {}".format( np.mean(durations) / parallelism)) print("Calculated total duration: {}".format( timedelta(seconds=int(np.sum(durations) / parallelism)))) save_dataset((results, parallelism), out_file)
full_data_perform_stem_words, full_data_perform_no_stem_words""") options, _ = parser.parse_args() print("========== generate word vector features ==========") base_data_dir = options.base_data_dir op_scope = 5 if os.path.exists( Configure.processed_train_path.format(base_data_dir, op_scope + 1)): exit() print("---> load datasets from scope {}".format(op_scope)) train, test = data_utils.load_dataset(base_data_dir, op_scope) print("train: {}, test: {}".format(train.shape, test.shape)) print('---> generate word vector mapping') embeddings_index = generate_word_vector_map() print('---> generate wordvectors features') train = jobs.parallelize_dataframe(train, generate_wordvectors_features) test = jobs.parallelize_dataframe(test, generate_wordvectors_features) print('---> generate wordvector distance features') train = jobs.parallelize_dataframe(train, generate_wordvector_distance) test = jobs.parallelize_dataframe(test, generate_wordvector_distance) print("train: {}, test: {}".format(train.shape, test.shape)) print("---> save datasets") data_utils.save_dataset(base_data_dir, train, test, op_scope + 1)