def __init__(self, config):
    super(ProstateDistDvhDataLoader, self).__init__(config)
    # Resolve the datapack filename from the configured organ and input name.
    filename = get_datapack_filename(str(config.data_loader.organ), str(config.data_loader.x_name))
    # Load inputs and targets from the HDF5 datapack.
    self.X = load_dataset(config.data_loader.h5py_dir, filename, config.data_loader.x_groupname)
    self.y = load_dataset(config.data_loader.h5py_dir, filename, config.data_loader.y_groupname)
    # Record the per-sample input shape for downstream model construction.
    config.data_loader.input_shape = self.X.shape[1:]
def _init_dataloaders(self):
    self.train_ds = load_dataset(self.config['data_dir'], self.config.get('dataset_name', 'MNIST'), train=True)
    self.test_ds = load_dataset(self.config['data_dir'], self.config.get('dataset_name', 'MNIST'), train=False)

    # Keep the first num_good_points + num_bad_points samples, then spoil the "bad" part.
    num_good_points = self.config.get('num_good_points', len(self.train_ds))
    num_bad_points = self.config.get('num_bad_points', 0)
    self.train_ds = Subset(self.train_ds, range(num_good_points + num_bad_points))
    self.train_ds = spoil_dataset(self.train_ds, num_good_points, num_bad_points)

    self.train_dataloader = build_dataloader(self.train_ds, self.config['batch_size'], sequential=True)
    self.test_dataloader = build_dataloader(self.test_ds, self.config['batch_size'], sequential=True)
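# Hypothetical config sketch (not from the original codebase): the keys mirror the ones
# read in _init_dataloaders above; the concrete values are illustrative assumptions.
config = {
    'data_dir': './data',
    'dataset_name': 'MNIST',
    'batch_size': 64,
    'num_good_points': 5000,  # clean samples kept at the front of the training subset
    'num_bad_points': 500,    # samples handed to spoil_dataset for corruption
}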
def load_data_h5(train_file_paths=None, test_file_paths=None, remap_config='Neo',
                 orientation=preprocessor.ORIENTATION['coronal']):
    print("START")
    if train_file_paths:
        print("Train dataset size: %d" % len(train_file_paths))
        # Load, pre-process and stack the train volumes into (slices, H, W) arrays.
        print("===Train data===")
        data_train, label_train, class_weights_train, weights_train, _ = du.load_dataset(
            train_file_paths, orientation, remap_config=remap_config,
            return_weights=True, reduce_slices=True, remove_black=True)
        no_slices, H, W = data_train[0].shape
        data_train = np.concatenate(data_train).reshape((-1, H, W))
        label_train = np.concatenate(label_train).reshape((-1, H, W))
        class_weights_train = np.concatenate(class_weights_train).reshape((-1, H, W))
        print("END")
        return ImdbData(data_train, label_train, class_weights_train, transforms=transform_train)
    if test_file_paths:
        print("Test dataset size: %d" % len(test_file_paths))
        # Load, pre-process and stack the test volumes.
        print("===Test data===")
        data_test, label_test, class_weights_test, weights_test, _ = du.load_dataset(
            test_file_paths, orientation, remap_config=remap_config,
            return_weights=True, reduce_slices=True, remove_black=True)
        no_slices, H, W = data_test[0].shape
        data_test = np.concatenate(data_test).reshape((-1, H, W))
        label_test = np.concatenate(label_test).reshape((-1, H, W))
        class_weights_test = np.concatenate(class_weights_test).reshape((-1, H, W))
        print("END")
        return ImdbData(data_test, label_test, class_weights_test)
    raise ValueError('You must provide a train or test dataset list')
def main():
    if os.path.exists(Configure.processed_train_path.format('1')):
        return

    train, test = data_utils.load_dataset(op_scope='0')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

    print('generate geography pca features...')
    generate_pca_features(conbined_data)
    print('generate datetime features...')
    generate_date_features(conbined_data)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    print('generate distance features...')
    generate_distance_features(train, test, loc1='latitude', loc2='longitude', fea_name='lat_long_')
    print('generate pca distance features...')
    generate_distance_features(train, test, loc1='pca0', loc2='pca1', fea_name='pca_')
    print('generate location bin features...')
    generate_location_bin_features(train, test, loc1='latitude', loc2='longitude', fea_name='lat_long_', round_num=2)

    train['trip_duration'] = trip_durations
    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='1')
def main():
    train, test = data_utils.load_dataset(op_scope='4')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

    generate_binary_features(conbined_data)

    # Disabled: location clustering plus lat_long groupby speed features.
    # for n_clusters in [6**2]:
    #     print('location clustering n_clusters = {}...'.format(n_clusters))
    #     location_clustering(conbined_data, n_clusters=n_clusters, batch_size=64**3, random_state=1000)
    #
    #     train = conbined_data.iloc[:train.shape[0], :]
    #     test = conbined_data.iloc[train.shape[0]:, :]
    #     train['trip_duration'] = trip_durations
    #
    #     print('generate lat_long groupby speed features...')
    #     train, test = generate_groupby_speed_features(train, test, n_clusters, loc1='latitude', loc2='longitude',
    #                                                   fea_name='lat_long_')
    #     del train['trip_duration']
    #     print('train: {}, test: {}'.format(train.shape, test.shape))
    #     conbined_data = pd.concat([train, test])

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='5')
def main():
    if os.path.exists(Configure.processed_train_path.format('2')):
        return

    train, test = data_utils.load_dataset(op_scope='1')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    print('data clean according to lat_long_distance_haversine & trip_duration...')
    # Disabled: the hard outlier cut below led to overfitting.
    # train = train[train['lat_long_distance_haversine'] < 300]
    # train = train[train['trip_duration'] <= 1800000].reset_index(drop=True)
    print('train: {}, test: {}'.format(train.shape, test.shape))

    # Optimize dtypes to shrink the in-memory footprint.
    print('Memory usage, Mb: {:.2f}'.format(train.memory_usage().sum() / 2**20))
    print('optimize dtypes...')
    train['is_store_and_fwd_flag'] = train['is_store_and_fwd_flag'].astype(np.uint8)
    train['passenger_count'] = train['passenger_count'].astype(np.uint8)
    train['vendor_id'] = train['vendor_id'].astype(np.uint8)
    train['pickup_month'] = train['pickup_month'].astype(np.uint8)
    train['pickup_day'] = train['pickup_day'].astype(np.uint8)
    train['pickup_hour'] = train['pickup_hour'].astype(np.uint8)
    train['pickup_weekofyear'] = train['pickup_weekofyear'].astype(np.uint8)
    train['pickup_weekday'] = train['pickup_weekday'].astype(np.uint8)
    train['is_weekend'] = train['is_weekend'].astype(np.uint8)
    train['trip_duration'] = train['trip_duration'].astype(np.uint32)
    print('After optimized memory usage, Mb: {:.2f}'.format(train.memory_usage().sum() / 2**20))

    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='2')
def main():
    if os.path.exists(Configure.processed_train_path.format('3')):
        return

    train, test = data_utils.load_dataset(op_scope='2')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

    n_clusters = 10**2
    print('location clustering n_clusters = {}...'.format(n_clusters))
    location_clustering(conbined_data, n_clusters=n_clusters, batch_size=64**3, random_state=1000)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['trip_duration'] = trip_durations

    print('generate lat_long groupby speed features...')
    train, test = generate_groupby_speed_features(train, test, n_clusters, loc1='latitude', loc2='longitude',
                                                  fea_name='lat_long_')
    # Disabled: the same speed features over the PCA-rotated coordinates.
    # print('generate pca groupby speed features...')
    # train, test = generate_groupby_speed_features(train, test, n_clusters, loc1='pca0', loc2='pca1', fea_name='pca_')

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='3')
def solve_opga(directory, name, depot, loc, prize, max_length, disable_cache=False):
    problem_filename = os.path.join(directory, "{}.opga.pkl".format(name))
    if os.path.isfile(problem_filename) and not disable_cache:
        (prize, tour, duration) = load_dataset(problem_filename)
    else:
        # 0 = start, 1 = end so add the depot twice
        start = time.time()
        prize, tour, duration = run_opga_alg(
            [(*pos, p) for p, pos in zip([0, 0] + prize, [depot, depot] + loc)],
            max_length, return_sol=True, verbose=False)
        duration = time.time() - start  # Measure clock time
        save_dataset((prize, tour, duration), problem_filename)

    # First and last nodes are the duplicated depots (indices 0 and 1), so customers start
    # at index 2; strip the depots and subtract 1 so customer indices start at 1 (depot = 0).
    assert tour[0][3] == 0
    assert tour[-1][3] == 1
    return -prize, [i - 1 for x, y, p, i, t in tour[1:-1]], duration
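# Hypothetical usage sketch for solve_opga above; the directory, instance name and
# coordinates are illustrative assumptions, not values from the original benchmarks.
depot = (0.5, 0.5)
loc = [(0.2, 0.3), (0.8, 0.1), (0.4, 0.9)]
prize = [1.0, 2.0, 1.5]
neg_prize, tour, duration = solve_opga('results/op', 'toy_instance', depot, loc, prize, max_length=2.0)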
def solve_ortools(directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize,
                  sec_local_search=0, disable_cache=False):
    # Lazy import so we do not require ortools by default
    from .pctsp_ortools import solve_pctsp_ortools
    try:
        problem_filename = os.path.join(directory, "{}.ortools{}.pkl".format(name, sec_local_search))
        if os.path.isfile(problem_filename) and not disable_cache:
            objval, tour, duration = load_dataset(problem_filename)
        else:
            start = time.time()
            # Must collect a total prize of 1, or the sum of all prizes if that is less than 1.
            objval, tour = solve_pctsp_ortools(depot, loc, deterministic_prize, penalty,
                                               min(sum(deterministic_prize), 1.),
                                               sec_local_search=sec_local_search)
            duration = time.time() - start
            save_dataset((objval, tour, duration), problem_filename)

        assert tour[0] == 0, "Tour must start with depot"
        tour = tour[1:]
        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour)
        assert abs(total_cost - objval) <= 1e-5, "Cost is incorrect"
        return total_cost, tour, duration
    except Exception as e:
        # OR-Tools occasionally fails to find a feasible solution. Letting the call fail
        # here means the totals are incomplete, but the caching mechanism lets us retry.
        print("Exception occurred")
        print(e)
        return None
def main():
    if os.path.exists(Configure.processed_train_path.format('8')):
        return

    train, test = data_utils.load_dataset(op_scope='7')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

    def driving_distance(raw):
        # Great-circle distance (in miles) between the pickup and dropoff points.
        startpoint = (raw['pickup_latitude'], raw['pickup_longitude'])
        endpoint = (raw['dropoff_latitude'], raw['dropoff_longitude'])
        distance = great_circle(startpoint, endpoint).miles
        return distance

    print('calc geopy distance features...')
    conbined_data['osmnx_distance'] = conbined_data[[
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'
    ]].apply(driving_distance, axis=1)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='8')
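# Standalone illustration of the geopy call used by driving_distance above; the
# coordinates are arbitrary NYC-area points chosen for the example.
from geopy.distance import great_circle

pickup = (40.7569, -73.9903)    # (latitude, longitude)
dropoff = (40.6413, -73.7781)
print(great_circle(pickup, dropoff).miles)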
def main():
    train, test = data_utils.load_dataset(op_scope='5')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    print('add fastest routes features...')
    train_fr_1 = pd.read_csv('../input/fastest_routes_train_part_1.csv')
    train_fr_2 = pd.read_csv('../input/fastest_routes_train_part_2.csv')
    test_fr = pd.read_csv('../input/fastest_routes_test.csv')
    train_fr = pd.concat((train_fr_1, train_fr_2))

    train = train.merge(train_fr, how='left', on='id')
    test = test.merge(test_fr, how='left', on='id')

    generate_street_heavy(train, test)

    # Drop the raw per-step route columns once the aggregate features are built.
    step_columns = [
        'starting_street', 'end_street', 'street_for_each_step', 'distance_per_step',
        'travel_time_per_step', 'step_maneuvers', 'step_direction', 'step_location_list'
    ]
    train.drop(step_columns, axis=1, inplace=True)
    test.drop(step_columns, axis=1, inplace=True)

    print('add weather features...')
    train, test = add_weather_features(train, test)

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='6')
def main():
    if os.path.exists(Configure.processed_train_path.format('7')):
        return

    train, test = data_utils.load_dataset(op_scope='6')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

    # US holidays: fixed dates (New Year, July 4, Veterans Day, Christmas) plus the
    # Monday observances (MLK, Presidents, Memorial, Labor, Columbus Day) and Thanksgiving.
    conbined_data['is_holyday'] = conbined_data.apply(
        lambda row: 1 if (row['pickup_month'] == 1 and row['pickup_day'] == 1) or
                         (row['pickup_month'] == 7 and row['pickup_day'] == 4) or
                         (row['pickup_month'] == 11 and row['pickup_day'] == 11) or
                         (row['pickup_month'] == 12 and row['pickup_day'] == 25) or
                         (row['pickup_month'] == 1 and 15 <= row['pickup_day'] <= 21 and row['pickup_weekday'] == 0) or
                         (row['pickup_month'] == 2 and 15 <= row['pickup_day'] <= 21 and row['pickup_weekday'] == 0) or
                         (row['pickup_month'] == 5 and 25 <= row['pickup_day'] <= 31 and row['pickup_weekday'] == 0) or
                         (row['pickup_month'] == 9 and 1 <= row['pickup_day'] <= 7 and row['pickup_weekday'] == 0) or
                         (row['pickup_month'] == 10 and 8 <= row['pickup_day'] <= 14 and row['pickup_weekday'] == 0) or
                         (row['pickup_month'] == 11 and 22 <= row['pickup_day'] <= 28 and row['pickup_weekday'] == 3)
                    else 0,
        axis=1)
    # Day-before-holiday flag: the same rules shifted one day back.
    conbined_data['is_day_before_holyday'] = conbined_data.apply(
        lambda row: 1 if (row['pickup_month'] == 12 and row['pickup_day'] == 31) or
                         (row['pickup_month'] == 7 and row['pickup_day'] == 3) or
                         (row['pickup_month'] == 11 and row['pickup_day'] == 10) or
                         (row['pickup_month'] == 12 and row['pickup_day'] == 24) or
                         (row['pickup_month'] == 1 and 14 <= row['pickup_day'] <= 20 and row['pickup_weekday'] == 6) or
                         (row['pickup_month'] == 2 and 14 <= row['pickup_day'] <= 20 and row['pickup_weekday'] == 6) or
                         (row['pickup_month'] == 5 and 24 <= row['pickup_day'] <= 30 and row['pickup_weekday'] == 6) or
                         ((row['pickup_month'] == 9 and 1 <= row['pickup_day'] <= 6) or
                          (row['pickup_month'] == 8 and row['pickup_day'] == 31) and row['pickup_weekday'] == 6) or
                         (row['pickup_month'] == 10 and 7 <= row['pickup_day'] <= 13 and row['pickup_weekday'] == 6) or
                         (row['pickup_month'] == 11 and 21 <= row['pickup_day'] <= 27 and row['pickup_weekday'] == 2)
                    else 0,
        axis=1)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='7')
def solve_lkh_log(executable, directory, name, depot, loc, demand, capacity, grid_size=1, runs=1,
                  disable_cache=False):
    problem_filename = os.path.join(directory, "{}.lkh{}.vrp".format(name, runs))
    tour_filename = os.path.join(directory, "{}.lkh{}.tour".format(name, runs))
    output_filename = os.path.join(directory, "{}.lkh{}.pkl".format(name, runs))
    param_filename = os.path.join(directory, "{}.lkh{}.par".format(name, runs))
    log_filename = os.path.join(directory, "{}.lkh{}.log".format(name, runs))

    try:
        # May have already been run
        if os.path.isfile(output_filename) and not disable_cache:
            tour, duration = load_dataset(output_filename)
        else:
            write_vrplib(problem_filename, depot, loc, demand, capacity, grid_size, name=name)
            params = {
                "PROBLEM_FILE": problem_filename,
                "OUTPUT_TOUR_FILE": tour_filename,
                "RUNS": runs,
                "SEED": 1234
            }
            write_lkh_par(param_filename, params)

            with open(log_filename, 'w') as f:
                start = time.time()
                check_call([executable, param_filename], stdout=f, stderr=f)
                duration = time.time() - start

            tour = read_vrplib(tour_filename, n=len(demand))
            save_dataset((tour, duration), output_filename)

        return calc_vrp_cost(depot, loc, tour), tour, duration
    except Exception as e:
        print("Exception occurred")
        print(e)
        return None
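# For reference, write_lkh_par is expected to emit a plain-text LKH parameter file of
# "KEY = VALUE" lines, roughly as below; the concrete paths are illustrative assumptions.
#
#   PROBLEM_FILE = results/vrp/instance0.lkh1.vrp
#   OUTPUT_TOUR_FILE = results/vrp/instance0.lkh1.tour
#   RUNS = 1
#   SEED = 1234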
def solve_salesman(directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize, runs=10):
    problem_filename = os.path.join(directory, "{}.salesman{}.pctsp".format(name, runs))
    output_filename = os.path.join(directory, "{}.salesman{}.pkl".format(name, runs))

    try:
        # May have already been run
        if not os.path.isfile(output_filename):
            write_pctsp(problem_filename, depot, loc, penalty, deterministic_prize, name=name)

            start = time.time()
            random.seed(1234)
            pctsp = Pctsp()
            pctsp.load(problem_filename, float_to_scaled_int(1.))
            s = solution.random(pctsp, start_size=int(len(pctsp.prize) * 0.7))
            s = ilocal_search(s, n_runs=runs)
            output = (s.route[:s.size], s.quality)
            duration = time.time() - start

            save_dataset((output, duration), output_filename)
        else:
            output, duration = load_dataset(output_filename)

        # Now parse output
        tour = output[0][:]
        assert tour[0] == 0, "Tour should start with depot"
        assert tour[-1] != 0, "Tour should not end with depot"
        tour = tour[1:]  # Strip off depot

        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour)
        assert (float_to_scaled_int(total_cost) - output[1]) / float(output[1]) < 1e-5
        return total_cost, tour, duration
    except Exception as e:
        print("Exception occurred")
        print(e)
        return None
def solve_gurobi(directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize,
                 disable_cache=False, timeout=None, gap=None):
    # Lazy import so we do not need to have gurobi installed to run this script
    from .pctsp_gurobi import solve_euclidian_pctsp as solve_euclidian_pctsp_gurobi

    try:
        problem_filename = os.path.join(
            directory, "{}.gurobi{}{}.pkl".format(
                name,
                "" if timeout is None else "t{}".format(timeout),
                "" if gap is None else "gap{}".format(gap)))

        if os.path.isfile(problem_filename) and not disable_cache:
            (cost, tour, duration) = load_dataset(problem_filename)
        else:
            start = time.time()
            # Must collect a total prize of 1, or the sum of all prizes if that is less than 1.
            cost, tour = solve_euclidian_pctsp_gurobi(
                depot, loc, penalty, deterministic_prize, min(sum(deterministic_prize), 1.),
                threads=1, timeout=timeout, gap=gap)
            duration = time.time() - start  # Measure clock time
            save_dataset((cost, tour, duration), problem_filename)

        # The tour starts at the depot (node 0); strip it before computing the cost.
        assert tour[0] == 0
        tour = tour[1:]

        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour)
        assert abs(total_cost - cost) <= 1e-5, "Cost is incorrect"
        return total_cost, tour, duration
    except Exception as e:
        # The solver sometimes fails to find a feasible solution. Letting the call fail
        # here means the totals are incomplete, but the caching mechanism lets us retry.
        print("Exception occurred")
        print(e)
        return None
def __init__(self, num_nodes, num_neighbors, batch_size, filepath, target_filepath=None,
             do_shuffle=False, do_prep=True):
    """
    Args:
        num_nodes: Number of nodes in TSP tours
        num_neighbors: Number of neighbors to consider for each node in graph
        batch_size: Batch size
        filepath: Path to dataset file (.txt file)
    """
    self.num_nodes = num_nodes
    self.num_neighbors = num_neighbors
    self.batch_size = batch_size
    self.filepath = filepath
    filedata = load_dataset(filepath)  # open(filepath, "r").readlines()

    self.target_filepath = target_filepath
    if target_filepath is not None:
        self.has_target = True
        target_filedata, parallelism = load_dataset(target_filepath)
        self.filedata = list([
            (inst, sol) for inst, sol in zip(filedata, target_filedata)
            if sol is not None
        ])
    else:
        self.has_target = False
        self.filedata = list([(inst, None) for inst in filedata])

    self.do_prep = do_prep
    if do_shuffle:
        self.shuffle()

    self.max_iter = (len(self.filedata) // batch_size)
    assert self.max_iter > 0, "Not enough instances ({}) for batch size ({})".format(
        len(self.filedata), batch_size)
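# Hypothetical usage sketch: the owning class name (GraphReader here) and the dataset
# filenames are assumptions; only the constructor arguments come from __init__ above.
reader = GraphReader(num_nodes=20, num_neighbors=10, batch_size=32,
                     filepath='tsp20_instances.pkl',
                     target_filepath='tsp20_solutions.pkl',
                     do_shuffle=True)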
def pre_train():
    train_all, test = load_dataset(0)
    train_all.fillna(-1, inplace=True)
    test.fillna(-1, inplace=True)

    y_train_all = train_all['orderType']
    id_train = train_all['userid']
    train_all.drop(['orderType'], axis=1, inplace=True)

    id_test = test['userid']

    print("train_all: ({}), test: ({})".format(train_all.shape, test.shape))
    return train_all, y_train_all, id_train, test, id_test
def __init__(self, num_nodes, num_neighbors, batch_size, filepath, target_filepath=None, do_shuffle=False):
    """
    Args:
        num_nodes: Number of nodes in VRP tours (excl depot)
        num_neighbors: Number of neighbors to consider for each node in graph
        batch_size: Batch size
        filepath: Path to dataset file (.txt file)
    """
    self.num_nodes = num_nodes
    self.num_neighbors = num_neighbors
    self.batch_size = batch_size
    self.filepath = filepath
    filedata = load_dataset(filepath)

    self.target_filepath = target_filepath
    if target_filepath is not None:
        self.has_target = True
        target_filedata, parallelism = load_dataset(target_filepath)
        self.filedata = list([
            (inst, sol) for inst, sol in zip(filedata, target_filedata)
            if sol is not None
        ])
    else:
        self.has_target = False
        self.filedata = list([(inst, None) for inst in filedata])

    if do_shuffle:
        self.shuffle()

    self.max_iter = (len(self.filedata) // batch_size)
    assert self.max_iter > 0, "Not enough instances ({}) for batch size ({})".format(
        len(self.filedata), batch_size)
def load_data(args):
    df = load_dataset()
    unique_img_ids = get_unique_img_ids(df)
    balanced_dataset = get_balanced_dataset(unique_img_ids)
    if args['drop_empty_images']:
        balanced_dataset = drop_empty_images(balanced_dataset)
    train_df, valid_df = get_train_val_datasets(df, balanced_dataset)
    valid_x, valid_y = split_validation_dataset(valid_df, classify=args['classify_mode'])
    return train_df, valid_x, valid_y
def solve_compass_log(executable, directory, name, depot, loc, prize, max_length, disable_cache=False):
    problem_filename = os.path.join(directory, "{}.oplib".format(name))
    tour_filename = os.path.join(directory, "{}.tour".format(name))
    output_filename = os.path.join(directory, "{}.compass.pkl".format(name))
    log_filename = os.path.join(directory, "{}.log".format(name))

    try:
        # May have already been run
        if os.path.isfile(output_filename) and not disable_cache:
            tour, duration = load_dataset(output_filename)
        else:
            write_oplib(problem_filename, depot, loc, prize, max_length, name=name)

            with open(log_filename, 'w') as f:
                start = time.time()
                check_call([executable, '--op', '--op-ea4op', problem_filename, '-o', tour_filename],
                           stdout=f, stderr=f)
                duration = time.time() - start

            tour = read_oplib(tour_filename, n=len(prize))
            if not calc_op_length(depot, loc, tour) <= max_length:
                print("Warning: length exceeds max length:", calc_op_length(depot, loc, tour), max_length)
            assert calc_op_length(depot, loc, tour) <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!"
            save_dataset((tour, duration), output_filename)

        return -calc_op_total(prize, tour), tour, duration
    except Exception as e:
        print("Exception occurred")
        print(e)
        return None
def solve_gurobi(directory, name, depot, loc, prize, max_length, disable_cache=False, timeout=None, gap=None):
    # Lazy import so we do not need to have gurobi installed to run this script
    from problems.op.op_gurobi import solve_euclidian_op as solve_euclidian_op_gurobi

    try:
        problem_filename = os.path.join(
            directory, "{}.gurobi{}{}.pkl".format(
                name,
                "" if timeout is None else "t{}".format(timeout),
                "" if gap is None else "gap{}".format(gap)))

        if os.path.isfile(problem_filename) and not disable_cache:
            (cost, tour, duration) = load_dataset(problem_filename)
        else:
            start = time.time()
            cost, tour = solve_euclidian_op_gurobi(
                depot, loc, prize, max_length, threads=1, timeout=timeout, gap=gap)
            duration = time.time() - start  # Measure clock time
            save_dataset((cost, tour, duration), problem_filename)

        # The tour starts at the depot (node 0); strip it before computing the total prize.
        assert tour[0] == 0
        tour = tour[1:]

        assert calc_op_length(depot, loc, tour) <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!"
        total_cost = -calc_op_total(prize, tour)
        assert abs(total_cost - cost) <= 1e-4, "Cost is incorrect"
        return total_cost, tour, duration
    except Exception as e:
        # The solver sometimes fails to find a feasible solution. Letting the call fail
        # here means the totals are incomplete, but the caching mechanism lets us retry.
        print("Exception occurred")
        print(e)
        return None
def main(base_data_dir):
    op_scope = 0
    if os.path.exists(Configure.processed_train_path.format(base_data_dir, op_scope + 1)):
        return

    print("---> load datasets from scope {}".format(op_scope))
    train, test = data_utils.load_dataset(base_data_dir, op_scope)
    print("train: {}, test: {}".format(train.shape, test.shape))

    print("---> generate basic statistic features")
    train['num_of_chars_q1'] = train['question1'].apply(lambda x: len(str(x)))
    train['num_of_chars_q2'] = train['question2'].apply(lambda x: len(str(x)))
    test['num_of_chars_q1'] = test['question1'].apply(lambda x: len(str(x)))
    test['num_of_chars_q2'] = test['question2'].apply(lambda x: len(str(x)))

    train['num_of_words_q1'] = train['question1'].apply(lambda x: len(str(x).split()))
    train['num_of_words_q2'] = train['question2'].apply(lambda x: len(str(x).split()))
    test['num_of_words_q1'] = test['question1'].apply(lambda x: len(str(x).split()))
    test['num_of_words_q2'] = test['question2'].apply(lambda x: len(str(x).split()))

    print('---> generate unigram_words features before cleaned')
    train = jobs.parallelize_dataframe(train, generate_unigram_words_features)
    test = jobs.parallelize_dataframe(test, generate_unigram_words_features)

    print('---> clean text')
    start = time.clock()
    if 'no_stem_words' in base_data_dir:
        print('clean train question')
        train = jobs.parallelize_dataframe(train, clean_text_func_no_stem_words)
        print('clean test question')
        test = jobs.parallelize_dataframe(test, clean_text_func_no_stem_words)
    else:
        print('clean train question')
        train = jobs.parallelize_dataframe(train, clean_text_func_stem_words)
        print('clean test question')
        test = jobs.parallelize_dataframe(test, clean_text_func_stem_words)
    stop = time.clock()
    print("text cleaned, cost {}s".format(stop - start))

    print('---> generate unigram_words features after cleaned')
    train = jobs.parallelize_dataframe(train, generate_cleaned_unigram_words_features)
    test = jobs.parallelize_dataframe(test, generate_cleaned_unigram_words_features)

    print("train: {}, test: {}".format(train.shape, test.shape))
    print("---> save datasets")
    data_utils.save_dataset(base_data_dir, train, test, op_scope + 1)
def main(base_data_dir):
    op_scope = 4
    if os.path.exists(Configure.processed_train_path.format(base_data_dir, op_scope + 1)):
        return

    print("---> load datasets from scope {}".format(op_scope))
    train, test = data_utils.load_dataset(base_data_dir, op_scope)
    print("train: {}, test: {}".format(train.shape, test.shape))

    print('---> generate common word count')
    train = jobs.parallelize_dataframe(train, generate_common_word_count)
    test = jobs.parallelize_dataframe(test, generate_common_word_count)

    print("train: {}, test: {}".format(train.shape, test.shape))
    print("---> save datasets")
    data_utils.save_dataset(base_data_dir, train, test, op_scope + 1)
def main():
    if os.path.exists(Configure.processed_train_path.format('0')):
        return

    train, test = data_utils.load_dataset(op_scope='0')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    # Encode store_and_fwd_flag as a binary feature and drop the raw column.
    train['is_store_and_fwd_flag'] = train['store_and_fwd_flag'].map(lambda s: 1 if s == 'Y' else 0)
    test['is_store_and_fwd_flag'] = test['store_and_fwd_flag'].map(lambda s: 1 if s == 'Y' else 0)
    del train['store_and_fwd_flag']
    del test['store_and_fwd_flag']

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='0')
def run(traces, outfname):
    X, Y, W, _, _ = load_dataset(traces)
    sizes = encode_sizes(X)

    log('Computing pairwise distances')
    D = pairwise_levenshtein_distances(sizes)

    log('Storing distances into {}'.format(outfname))
    data = {
        'webpage-id': W,
        'label': np.array(Y),
        'pairdist': D,
    }
    with open(outfname, 'wb') as f:
        dill.dump(data, f)
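# Hypothetical sketch of reading back the file written by run() above; the filename is
# an assumption, while the dictionary keys match the ones dumped with dill.
import dill

with open('pairwise_distances.dill', 'rb') as f:
    data = dill.load(f)
webpage_ids, labels, dists = data['webpage-id'], data['label'], data['pairdist']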
def solve_pctsp_log(executable, directory, name, depot, loc, penalty, deterministic_prize,
                    stochastic_prize, runs=10):
    problem_filename = os.path.join(directory, "{}.pctsp{}.pctsp".format(name, runs))
    output_filename = os.path.join(directory, "{}.pctsp{}.pkl".format(name, runs))
    log_filename = os.path.join(directory, "{}.pctsp{}.log".format(name, runs))

    try:
        # May have already been run
        if not os.path.isfile(output_filename):
            write_pctsp(problem_filename, depot, loc, penalty, deterministic_prize, name=name)

            with open(log_filename, 'w') as f:
                start = time.time()
                output = check_output(
                    # exe, filename, min_total_prize (=1), num_runs
                    [executable, problem_filename, float_to_scaled_int_str(1.), str(runs)],
                    stderr=f
                ).decode('utf-8')
                duration = time.time() - start
                f.write(output)

            save_dataset((output, duration), output_filename)
        else:
            output, duration = load_dataset(output_filename)

        # Now parse output
        tour = None
        for line in output.splitlines():
            heading = "Best Result Route: "
            if line[:len(heading)] == heading:
                tour = np.array(line[len(heading):].split(" ")).astype(int)
                break
        assert tour is not None, "Could not find tour in output!"

        assert tour[0] == 0, "Tour should start with depot"
        assert tour[-1] == 0, "Tour should end with depot"
        tour = tour[1:-1]  # Strip off depot

        return calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour), tour.tolist(), duration
    except Exception as e:
        print("Exception occurred")
        print(e)
        return None
def main():
    train, test = data_utils.load_dataset(op_scope='4')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])
    conbined_data.columns = test.columns.values
    conbined_data.index = range(conbined_data.shape[0])

    # time window sizes in minutes
    timewindow_days = [10, 15]
    conbined_data = perform_time_window(conbined_data, timewindow_days)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='5')
def solve_concorde_log(executable, directory, name, loc, disable_cache=False):
    problem_filename = os.path.join(directory, "{}.tsp".format(name))
    tour_filename = os.path.join(directory, "{}.tour".format(name))
    output_filename = os.path.join(directory, "{}.concorde.pkl".format(name))
    log_filename = os.path.join(directory, "{}.log".format(name))

    try:
        # May have already been run
        if os.path.isfile(output_filename) and not disable_cache:
            tour, duration = load_dataset(output_filename)
        else:
            write_tsplib(problem_filename, loc, name=name)

            with open(log_filename, 'w') as f:
                start = time.time()
                try:
                    # Concorde is weird, will leave traces of solution in current directory so call from target dir
                    check_call([executable, '-s', '1234', '-x', '-o',
                                os.path.abspath(tour_filename), os.path.abspath(problem_filename)],
                               stdout=f, stderr=f, cwd=directory)
                except CalledProcessError as e:
                    # Somehow Concorde returns 255
                    assert e.returncode == 255
                duration = time.time() - start

            tour = read_concorde_tour(tour_filename)
            save_dataset((tour, duration), output_filename)

        return calc_tsp_length(loc, tour), tour, duration
    except Exception as e:
        print("Exception occurred")
        print(e)
        return None
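# Hypothetical usage sketch; the executable name, output directory and coordinates are
# illustrative assumptions.
loc = [(0.1, 0.2), (0.5, 0.5), (0.9, 0.1), (0.3, 0.8)]
result = solve_concorde_log('concorde', 'results/tsp', 'toy_instance', loc)
if result is not None:
    length, tour, duration = result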
def main():
    if os.path.exists(Configure.processed_train_path.format('4')):
        return

    train, test = data_utils.load_dataset(op_scope='3')
    print('train: {}, test: {}'.format(train.shape, test.shape))

    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

    drop_missing_rate = 1
    print('drop some features, missing_rate > {}'.format(drop_missing_rate))
    conbined_data = drop_some_features(conbined_data, drop_missing_rate=drop_missing_rate)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='4')
def solve_ortools(directory, name, depot, loc, prize, max_length, sec_local_search=0, disable_cache=False):
    # Lazy import so we do not require ortools by default
    from problems.op.op_ortools import solve_op_ortools

    try:
        problem_filename = os.path.join(directory, "{}.ortools{}.pkl".format(name, sec_local_search))
        if os.path.isfile(problem_filename) and not disable_cache:
            objval, tour, duration = load_dataset(problem_filename)
        else:
            start = time.time()
            objval, tour = solve_op_ortools(depot, loc, prize, max_length, sec_local_search=sec_local_search)
            duration = time.time() - start
            save_dataset((objval, tour, duration), problem_filename)

        assert tour[0] == 0, "Tour must start with depot"
        tour = tour[1:]

        assert calc_op_length(depot, loc, tour) <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!"
        assert abs(-calc_op_total(prize, tour) - objval) <= 1e-5, "Cost is incorrect"
        return -calc_op_total(prize, tour), tour, duration
    except Exception as e:
        # OR-Tools occasionally fails to find a feasible solution. Letting the call fail
        # here means the totals are incomplete, but the caching mechanism lets us retry.
        print("Exception occurred")
        print(e)
        return None