Python save_dataset Examples, utils.data_utils.save_dataset Python Examples

Example #1

0

Show file

def solve_opga(directory,
               name,
               depot,
               loc,
               prize,
               max_length,
               disable_cache=False):
    """Solve one orienteering instance with ``run_opga_alg``, caching the result.

    The result is stored as ``<directory>/<name>.opga.pkl``. When that file
    exists and ``disable_cache`` is false it is loaded instead of re-solving.

    Returns:
        A tuple ``(-prize, tour, duration)`` where ``tour`` lists the visited
        node indices shifted down by one and without the two depot entries.
    """
    cache_path = os.path.join(directory, "{}.opga.pkl".format(name))
    have_cached_result = os.path.isfile(cache_path) and not disable_cache

    if have_cached_result:
        prize, tour, duration = load_dataset(cache_path)
    else:
        started_at = time.time()
        # The depot is passed twice (as start and as end), each with zero prize.
        all_prizes = [0, 0] + prize
        all_positions = [depot, depot] + loc
        nodes = [(*position, node_prize)
                 for node_prize, position in zip(all_prizes, all_positions)]
        prize, tour, duration = run_opga_alg(nodes,
                                             max_length,
                                             return_sol=True,
                                             verbose=False)
        # Report wall-clock time rather than the duration the solver returned.
        duration = time.time() - started_at
        save_dataset((prize, tour, duration), cache_path)

    # The tour must begin at node 0 (start depot) and finish at node 1 (end depot).
    first_stop, last_stop = tour[0], tour[-1]
    assert first_stop[3] == 0
    assert last_stop[3] == 1

    # Drop both depot entries and shift indices by one, so the first
    # customer (solver index 2) becomes 1 and the depot is 0.
    visited = []
    for _x, _y, _p, node_index, _t in tour[1:-1]:
        visited.append(node_index - 1)
    return -prize, visited, duration

Example #2

0

Show file

def main():
    """Add fastest-route and weather features (dataset scope '5' -> scope '6').

    Loads the scope-'5' train/test frames, left-joins the fastest-route CSVs
    on ``id``, derives street features via ``generate_street_heavy``, drops
    the raw per-step route columns, adds weather features and saves the
    result as scope '6'.
    """
    train, test = data_utils.load_dataset(op_scope='5')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('add fastest routes features...')
    train_fr_1 = pd.read_csv('../input/fastest_routes_train_part_1.csv')
    train_fr_2 = pd.read_csv('../input/fastest_routes_train_part_2.csv')
    test_fr = pd.read_csv('../input/fastest_routes_test.csv')

    # The training routes are shipped as two files; stack them before joining.
    train_fr = pd.concat((train_fr_1, train_fr_2))

    train = train.merge(train_fr, how='left', on='id')
    test = test.merge(test_fr, how='left', on='id')

    generate_street_heavy(train, test)

    # Raw route columns are removed from both frames once the street
    # features have been generated.
    raw_route_columns = [
        'starting_street', 'end_street', 'street_for_each_step',
        'distance_per_step', 'travel_time_per_step', 'step_maneuvers',
        'step_direction', 'step_location_list',
    ]
    train.drop(raw_route_columns, axis=1, inplace=True)
    test.drop(raw_route_columns, axis=1, inplace=True)

    print('add weather features...')
    train, test = add_weather_features(train, test)
    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='6')

Example #3

0

Show file

File: basic_feature_engineering.py Project: SunnyMarkLiu/Kaggle_NYC_Taxi_Trip_Duration

def main():
    """Build the basic engineered features (dataset scope '0' -> scope '1').

    Does nothing when the processed scope-'1' training file already exists.
    Otherwise adds PCA and datetime features on the combined train+test
    frame, then distance and location-bin features on the split frames, and
    saves the result as scope '1'.
    """
    if os.path.exists(Configure.processed_train_path.format('1')):
        return

    train, test = data_utils.load_dataset(op_scope='0')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('train: {}, test: {}'.format(train.shape, test.shape))

    # Set the target aside while train and test are processed as one frame.
    trip_durations = train['trip_duration']
    del train['trip_duration']
    n_train = train.shape[0]
    conbined_data = pd.concat([train, test])

    print('generate geography pca features...')
    generate_pca_features(conbined_data)

    print('generate datetime features...')
    generate_date_features(conbined_data)

    # Split back by position. The explicit copies make the frames independent
    # of conbined_data, so the column assignments below never act on a view
    # (which triggers pandas' SettingWithCopyWarning).
    train = conbined_data.iloc[:n_train, :].copy()
    test = conbined_data.iloc[n_train:, :].copy()

    print('generate distance features...')
    generate_distance_features(train, test, loc1='latitude', loc2='longitude', fea_name='lat_long_')

    print('generate pca distance features...')
    generate_distance_features(train, test, loc1='pca0', loc2='pca1', fea_name='pca_')

    print('generate location bin features...')
    generate_location_bin_features(train, test, loc1='latitude', loc2='longitude',
                                   fea_name='lat_long_', round_num=2)

    train['trip_duration'] = trip_durations
    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='1')

Example #4

0

Show file

File: data_cleaning.py Project: SunnyMarkLiu/Kaggle_NYC_Taxi_Trip_Duration

def main():
    """Down-cast training column dtypes (dataset scope '1' -> scope '2').

    Does nothing when the processed scope-'2' training file already exists.
    Only the training frame is modified; the test frame is saved unchanged.
    """
    if os.path.exists(Configure.processed_train_path.format('2')):
        return

    train, test = data_utils.load_dataset(op_scope='1')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('data clean according to lat_long_distance_haversine & trip_duration...')
    # Outlier filters, currently disabled (the trip_duration filter was
    # annotated as causing overfitting):
    # train = train[train['lat_long_distance_haversine'] < 300]
    # train = train[train['trip_duration'] <= 1800000].reset_index(drop=True)

    print('train: {}, test: {}'.format(train.shape, test.shape))

    # optimize dtypes
    # Float divisor: on Python 2 an integer byte count divided by 2**20 would
    # be floor-divided and the ':.2f' report would lose its fraction.
    bytes_per_mb = 2.0 ** 20
    print('Memory usage, Mb: {:.2f}'.format(
        train.memory_usage().sum() / bytes_per_mb))
    print('optimize dtypes...')
    uint8_columns = (
        'is_store_and_fwd_flag',
        'passenger_count',
        'vendor_id',
        'pickup_month',
        'pickup_day',
        'pickup_hour',
        'pickup_weekofyear',
        'pickup_weekday',
        'is_weekend',
    )
    for column in uint8_columns:
        train[column] = train[column].astype(np.uint8)
    train['trip_duration'] = train['trip_duration'].astype(np.uint32)
    print('After optimized memory usage, Mb: {:.2f}'.format(
        train.memory_usage().sum() / bytes_per_mb))

    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='2')

Example #5

0

Show file

File: perform_geography_clustering.py Project: SunnyMarkLiu/Kaggle_NYC_Taxi_Trip_Duration

def main():
    """Add location-cluster and group-by speed features (scope '2' -> scope '3').

    Does nothing when the processed scope-'3' training file already exists.
    """
    if os.path.exists(Configure.processed_train_path.format('3')):
        return

    train, test = data_utils.load_dataset(op_scope='2')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('train: {}, test: {}'.format(train.shape, test.shape))

    # Set the target aside while train and test are clustered as one frame.
    trip_durations = train['trip_duration']
    del train['trip_duration']
    n_train = train.shape[0]
    conbined_data = pd.concat([train, test])

    n_clusters = 10**2
    print('location clustering n_clusters = {}...'.format(n_clusters))
    location_clustering(conbined_data,
                        n_clusters=n_clusters,
                        batch_size=64**3,
                        random_state=1000)

    # Split back by position. The explicit copies make the frames independent
    # of conbined_data, so assigning the target column does not act on a view
    # (which triggers pandas' SettingWithCopyWarning).
    train = conbined_data.iloc[:n_train, :].copy()
    test = conbined_data.iloc[n_train:, :].copy()
    train['trip_duration'] = trip_durations

    print('generate lat_long groupby speed features...')
    train, test = generate_groupby_speed_features(train,
                                                  test,
                                                  n_clusters,
                                                  loc1='latitude',
                                                  loc2='longitude',
                                                  fea_name='lat_long_')
    # The PCA-based variant of the same features is currently disabled:
    # print('generate pca groupby speed features...')
    # train, test = generate_groupby_speed_features(train, test, n_clusters, loc1='pca0', loc2='pca1', fea_name='pca_')

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='3')

Example #6

0

Show file

def solve_ortools(directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize,
                  sec_local_search=0, disable_cache=False):
    """Solve one PCTSP instance with the OR-tools wrapper, caching the result.

    The result is stored as ``<directory>/<name>.ortools<sec_local_search>.pkl``
    and reused unless ``disable_cache`` is set. ``stochastic_prize`` is
    accepted but not used here.

    Returns:
        ``(total_cost, tour, duration)`` with the leading depot removed from
        ``tour``, or ``None`` if anything raised along the way (the exception
        is printed, not propagated).
    """
    # Deferred import: ortools is only required when this solver is used.
    from .pctsp_ortools import solve_pctsp_ortools

    try:
        cache_path = os.path.join(
            directory, "{}.ortools{}.pkl".format(name, sec_local_search))
        have_cached_result = os.path.isfile(cache_path) and not disable_cache

        if not have_cached_result:
            started_at = time.time()
            # Required prize: the total available prize, capped at 1.
            required_prize = min(sum(deterministic_prize), 1.)
            objval, tour = solve_pctsp_ortools(
                depot, loc, deterministic_prize, penalty,
                required_prize, sec_local_search=sec_local_search)
            duration = time.time() - started_at
            save_dataset((objval, tour, duration), cache_path)
        else:
            objval, tour, duration = load_dataset(cache_path)

        assert tour[0] == 0, "Tour must start with depot"
        tour = tour[1:]

        # Recompute the cost independently and compare with the solver's objective.
        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour)
        assert abs(total_cost - objval) <= 1e-5, "Cost is incorrect"
        return total_cost, tour, duration
    except Exception as e:
        # The solver sometimes fails to find a feasible solution. Nothing is
        # cached in that case, so a later run simply retries the instance.
        print("Exception occured")
        print(e)
        return None

Example #7

0

Show file

def main():
    """Add a great-circle trip distance feature (dataset scope '7' -> scope '8').

    Does nothing when the processed scope-'8' training file already exists.
    The new column is named ``osmnx_distance`` but holds the great-circle
    distance, in miles, between pickup and dropoff coordinates.
    """
    if os.path.exists(Configure.processed_train_path.format('8')):
        return

    train, test = data_utils.load_dataset(op_scope='7')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('train: {}, test: {}'.format(train.shape, test.shape))

    # Set the target aside while train and test are processed as one frame.
    trip_durations = train['trip_duration']
    del train['trip_duration']
    n_train = train.shape[0]
    conbined_data = pd.concat([train, test])

    def driving_distance(raw):
        """Return the great-circle distance in miles for one trip row."""
        startpoint = (raw['pickup_latitude'], raw['pickup_longitude'])
        endpoint = (raw['dropoff_latitude'], raw['dropoff_longitude'])
        return great_circle(startpoint, endpoint).miles

    print('calc geopy distance features...')
    coordinate_columns = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
        'dropoff_longitude'
    ]
    conbined_data['osmnx_distance'] = conbined_data[coordinate_columns].apply(
        driving_distance, axis=1)

    # Split back by position. The explicit copies make the frames independent
    # of conbined_data, so assigning the target column does not act on a view
    # (which triggers pandas' SettingWithCopyWarning).
    train = conbined_data.iloc[:n_train, :].copy()
    test = conbined_data.iloc[n_train:, :].copy()
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='8')

Example #8

0

Show file

def main():
    """Add binary features on the combined frame (dataset scope '4' -> scope '5')."""
    train, test = data_utils.load_dataset(op_scope='4')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('train: {}, test: {}'.format(train.shape, test.shape))

    # Set the target aside while train and test are processed as one frame.
    trip_durations = train['trip_duration']
    del train['trip_duration']
    n_train = train.shape[0]
    conbined_data = pd.concat([train, test])

    generate_binary_features(conbined_data)

    # Disabled experiment: extra clustering / group-by speed features.
    # for n_clusters in [6**2]:
    #     print('location clustering n_clusters = {}...'.format(n_clusters))
    #     location_clustering(conbined_data, n_clusters=n_clusters, batch_size=64 ** 3, random_state=1000)
    #
    #     train = conbined_data.iloc[:train.shape[0], :]
    #     test = conbined_data.iloc[train.shape[0]:, :]
    #     train['trip_duration'] = trip_durations
    #
    #     print('generate lat_long groupby speed features...')
    #     train, test = generate_groupby_speed_features(train, test, n_clusters, loc1='latitude', loc2='longitude',
    #                                                   fea_name='lat_long_')
    #     del train['trip_duration']
    #     print('train: {}, test: {}'.format(train.shape, test.shape))
    #     conbined_data = pd.concat([train, test])

    # Split back by position. The explicit copies make the frames independent
    # of conbined_data, so assigning the target column does not act on a view
    # (which triggers pandas' SettingWithCopyWarning).
    train = conbined_data.iloc[:n_train, :].copy()
    test = conbined_data.iloc[n_train:, :].copy()

    train['trip_duration'] = trip_durations
    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='5')

Example #9

0

Show file

def eval_dataset(dataset_path, decode_strategy, width, softmax_temp, opts):
    """Evaluate a model on a supervised dataset, report its gap and save results.

    Loads the model named by ``opts.model``, evaluates it on ``dataset_path``
    through ``_eval_dataset``, compares the costs with the ground-truth
    rollout, prints summary statistics and writes the per-instance results
    under ``opts.results_dir``.

    Args:
        dataset_path: Path of the dataset file to evaluate.
        decode_strategy: Decoding strategy name; ``width`` is omitted from
            the output file name when this is ``'greedy'``.
        width: Decoding width, forwarded to ``_eval_dataset``.
        softmax_temp: Softmax temperature, forwarded to ``_eval_dataset``.
        opts: Options object; reads ``model``, ``no_cuda``, ``batch_size``,
            ``val_size``, ``results_dir``, ``offset`` and ``f``.

    Returns:
        A LaTeX table-row fragment with the mean cost, the mean optimality
        gap and the total duration divided by the batch size.

    Raises:
        AssertionError: If the output file already exists and ``opts.f`` is
            not set.
    """
    model, model_args = load_model(opts.model)
    use_cuda = torch.cuda.is_available() and not opts.no_cuda

    device = torch.device("cuda:0" if use_cuda else "cpu")
    dataset = model.problem.make_dataset(filename=dataset_path,
                                         batch_size=opts.batch_size,
                                         num_samples=opts.val_size,
                                         neighbors=model_args['neighbors'],
                                         knn_strat=model_args['knn_strat'],
                                         supervised=True)

    results = _eval_dataset(model, dataset, decode_strategy, width,
                            softmax_temp, opts, device)

    costs, tours, durations = zip(*results)
    costs, tours, durations = np.array(costs), np.array(tours), np.array(
        durations)
    gt_tours = dataset.tour_nodes
    gt_costs = rollout_groundtruth(model.problem, dataset, opts).cpu().numpy()
    # Optimality gap in percent, relative to the ground-truth cost.
    opt_gap = ((costs / gt_costs - 1) * 100)

    # Materialise the rows: zip() is a lazy one-shot iterator on Python 3,
    # which is a fragile thing to hand to a serialiser.
    results = list(zip(costs, gt_costs, tours, gt_tours, opt_gap, durations))

    print('Validation groundtruth cost: {:.3f} +- {:.3f}'.format(
        gt_costs.mean(), np.std(gt_costs)))
    print('Validation average cost: {:.3f} +- {:.3f}'.format(
        costs.mean(), np.std(costs)))
    print('Validation optimality gap: {:.3f}% +- {:.3f}'.format(
        opt_gap.mean(), np.std(opt_gap)))
    print('Average duration: {:.3f}s +- {:.3f}'.format(durations.mean(),
                                                       np.std(durations)))
    print('Total duration: {}s'.format(np.sum(durations) / opts.batch_size))

    dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1])

    # Model name = last two path components of the model path, minus extension.
    model_name = "_".join(
        os.path.normpath(os.path.splitext(opts.model)[0]).split(os.sep)[-2:])

    results_dir = os.path.join(opts.results_dir, dataset_basename)
    os.makedirs(results_dir, exist_ok=True)

    out_file = os.path.join(
        results_dir, "{}-{}-{}{}-t{}-{}-{}{}".format(
            dataset_basename, model_name, decode_strategy,
            width if decode_strategy != 'greedy' else '', softmax_temp,
            opts.offset, opts.offset + len(costs), ext))

    assert opts.f or not os.path.isfile(
        out_file
    ), "File already exists! Try running with -f option to overwrite."

    save_dataset(results, out_file)

    # Raw string: '\p' and '\%' are invalid escape sequences in a normal
    # literal. The resulting text is unchanged.
    latex_str = r' & ${:.3f}\pm{:.3f}$ & ${:.3f}\%\pm{:.3f}$ & ${:.3f}$s'.format(
        costs.mean(), np.std(costs), opt_gap.mean(), np.std(opt_gap),
        np.sum(durations) / opts.batch_size)

    return latex_str

Example #10

0

Show file

File: final_feature_engineering.py Project: SunnyMarkLiu/Kaggle_NYC_Taxi_Trip_Duration

# Fixed-date rules as (month, day); floating rules as
# (month, first_day, last_day, weekday). The weekday numbers presumably follow
# the datetime/pandas convention (Monday == 0) -- confirm against the code
# that generates 'pickup_weekday'.
_HOLIDAY_FIXED_DATES = ((1, 1), (7, 4), (11, 11), (12, 25))
_HOLIDAY_FLOATING_RULES = (
    (1, 15, 21, 0),
    (2, 15, 21, 0),
    (5, 25, 31, 0),
    (9, 1, 7, 0),
    (10, 8, 14, 0),
    (11, 22, 28, 3),
)

_HOLIDAY_EVE_FIXED_DATES = ((12, 31), (7, 3), (11, 10), (12, 24))
_HOLIDAY_EVE_FLOATING_RULES = (
    (1, 14, 20, 6),
    (2, 14, 20, 6),
    (5, 24, 30, 6),
    # The eve of the September 1-7 weekday-0 holiday falls on Sep 1-6 or Aug 31.
    (9, 1, 6, 6),
    (8, 31, 31, 6),
    (10, 7, 13, 6),
    (11, 21, 27, 2),
)


def _matches_calendar_rules(month, day, weekday, fixed_dates, floating_rules):
    """Return 1 if the date matches any fixed date or floating rule, else 0."""
    for rule_month, rule_day in fixed_dates:
        if month == rule_month and day == rule_day:
            return 1
    for rule_month, first_day, last_day, rule_weekday in floating_rules:
        if (month == rule_month and first_day <= day <= last_day
                and weekday == rule_weekday):
            return 1
    return 0


def _is_holiday(month, day, weekday):
    """Return 1 if the date matches one of the holiday rules, else 0."""
    return _matches_calendar_rules(month, day, weekday, _HOLIDAY_FIXED_DATES,
                                   _HOLIDAY_FLOATING_RULES)


def _is_day_before_holiday(month, day, weekday):
    """Return 1 if the date is the day before one of the holidays, else 0."""
    return _matches_calendar_rules(month, day, weekday,
                                   _HOLIDAY_EVE_FIXED_DATES,
                                   _HOLIDAY_EVE_FLOATING_RULES)


def main():
    """Add holiday indicator features (dataset scope '6' -> scope '7').

    Does nothing when the processed scope-'7' training file already exists.
    Adds two 0/1 columns computed from ``pickup_month``, ``pickup_day`` and
    ``pickup_weekday``: ``is_holyday`` and ``is_day_before_holyday``.

    The previous inline expression applied the weekday test only to the
    Aug 31 case, because ``and`` binds tighter than ``or``. Every Sep 1-6 row
    was therefore flagged as a holiday eve regardless of weekday. The weekday
    test now covers both cases.
    """
    if os.path.exists(Configure.processed_train_path.format('7')):
        return

    train, test = data_utils.load_dataset(op_scope='6')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('train: {}, test: {}'.format(train.shape, test.shape))

    # Set the target aside while train and test are processed as one frame.
    trip_durations = train['trip_duration']
    del train['trip_duration']
    n_train = train.shape[0]
    conbined_data = pd.concat([train, test])

    conbined_data['is_holyday'] = conbined_data.apply(
        lambda row: _is_holiday(row['pickup_month'], row['pickup_day'],
                                row['pickup_weekday']),
        axis=1)
    conbined_data['is_day_before_holyday'] = conbined_data.apply(
        lambda row: _is_day_before_holiday(row['pickup_month'],
                                           row['pickup_day'],
                                           row['pickup_weekday']),
        axis=1)

    # Split back by position. The explicit copies make the frames independent
    # of conbined_data, so assigning the target column does not act on a view
    # (which triggers pandas' SettingWithCopyWarning).
    train = conbined_data.iloc[:n_train, :].copy()
    test = conbined_data.iloc[n_train:, :].copy()
    train['trip_duration'] = trip_durations

    print('train: {}, test: {}'.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='7')

Example #11

0

Show file

File: vrp_baseline.py Project: fmxFranky/attention-learn-to-route

def solve_lkh_log(executable,
                  directory,
                  name,
                  depot,
                  loc,
                  demand,
                  capacity,
                  grid_size=1,
                  runs=1,
                  disable_cache=False):
    """Solve one VRP instance with an external LKH executable, caching the result.

    All working files live in ``directory`` and are named
    ``<name>.lkh<runs>.<ext>`` (``vrp`` problem, ``par`` parameters, ``tour``
    solver output, ``log`` solver stdout/stderr, ``pkl`` cached result). When
    the ``pkl`` file exists and ``disable_cache`` is false the solver is not
    run again.

    Returns:
        ``(cost, tour, duration)`` where ``cost`` is recomputed from the tour
        with ``calc_vrp_cost``.

    Raises:
        Any exception raised while writing the inputs, running the solver or
        reading its output propagates to the caller. The original handler
        only re-raised, so the print/return code after it was unreachable and
        has been removed.
    """
    problem_filename = os.path.join(directory,
                                    "{}.lkh{}.vrp".format(name, runs))
    tour_filename = os.path.join(directory, "{}.lkh{}.tour".format(name, runs))
    output_filename = os.path.join(directory,
                                   "{}.lkh{}.pkl".format(name, runs))
    param_filename = os.path.join(directory, "{}.lkh{}.par".format(name, runs))
    log_filename = os.path.join(directory, "{}.lkh{}.log".format(name, runs))

    # May have already been run
    if os.path.isfile(output_filename) and not disable_cache:
        tour, duration = load_dataset(output_filename)
    else:
        write_vrplib(problem_filename,
                     depot,
                     loc,
                     demand,
                     capacity,
                     grid_size,
                     name=name)

        params = {
            "PROBLEM_FILE": problem_filename,
            "OUTPUT_TOUR_FILE": tour_filename,
            "RUNS": runs,
            "SEED": 1234
        }
        write_lkh_par(param_filename, params)

        # Only the solver invocation itself is timed; its output goes to the log.
        with open(log_filename, 'w') as f:
            start = time.time()
            check_call([executable, param_filename], stdout=f, stderr=f)
            duration = time.time() - start

        tour = read_vrplib(tour_filename, n=len(demand))

        save_dataset((tour, duration), output_filename)

    return calc_vrp_cost(depot, loc, tour), tour, duration

Example #12

0

Show file

def solve_salesman(directory,
                   name,
                   depot,
                   loc,
                   penalty,
                   deterministic_prize,
                   stochastic_prize,
                   runs=10):
    """Solve one PCTSP instance with iterated local search, caching the result.

    The problem is written to ``<directory>/<name>.salesman<runs>.pctsp`` and
    the result cached in ``<directory>/<name>.salesman<runs>.pkl``. An
    existing result file is always reused. ``stochastic_prize`` is accepted
    but not used here.

    Returns:
        ``(total_cost, tour, duration)`` with the leading depot removed from
        ``tour``, or ``None`` if anything raised along the way (the exception
        is printed, not propagated).
    """
    problem_filename = os.path.join(directory,
                                    "{}.salesman{}.pctsp".format(name, runs))
    output_filename = os.path.join(directory,
                                   "{}.salesman{}.pkl".format(name, runs))

    try:
        # May have already been run
        if not os.path.isfile(output_filename):
            write_pctsp(problem_filename,
                        depot,
                        loc,
                        penalty,
                        deterministic_prize,
                        name=name)

            start = time.time()

            # Fixed seed so repeated runs build the same random start solution.
            random.seed(1234)
            pctsp = Pctsp()
            pctsp.load(problem_filename, float_to_scaled_int(1.))
            s = solution.random(pctsp, start_size=int(len(pctsp.prize) * 0.7))
            s = ilocal_search(s, n_runs=runs)

            output = (s.route[:s.size], s.quality)

            duration = time.time() - start

            save_dataset((output, duration), output_filename)
        else:
            output, duration = load_dataset(output_filename)

        # Now parse output
        route, quality = output[0], output[1]
        tour = route[:]
        assert tour[0] == 0, "Tour should start with depot"
        assert tour[-1] != 0, "Tour should not end with depot"
        tour = tour[1:]  # Strip off depot

        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize,
                                     tour)
        # Two-sided relative check. Without abs() a recomputed cost far
        # *below* the solver's reported quality would pass unnoticed.
        relative_error = (float_to_scaled_int(total_cost) - quality) / float(
            quality)
        assert abs(relative_error) < 1e-5
        return total_cost, tour, duration
    except Exception as e:
        print("Exception occured")
        print(e)
        return None

Example #13

0

Show file

def solve_gurobi(directory,
                 name,
                 depot,
                 loc,
                 penalty,
                 deterministic_prize,
                 stochastic_prize,
                 disable_cache=False,
                 timeout=None,
                 gap=None):
    """Solve one PCTSP instance with the Gurobi wrapper, caching the result.

    The cache file is ``<directory>/<name>.gurobi[t<timeout>][gap<gap>].pkl``,
    where the bracketed parts appear only when the option is given.
    ``stochastic_prize`` is accepted but not used here.

    Returns:
        ``(total_cost, tour, duration)`` with the leading depot removed from
        ``tour``, or ``None`` if anything raised along the way (the exception
        is printed, not propagated).
    """
    # Deferred import: gurobi is only required when this solver is used.
    from .pctsp_gurobi import \
        solve_euclidian_pctsp as solve_euclidian_pctsp_gurobi

    try:
        timeout_tag = "" if timeout is None else "t{}".format(timeout)
        gap_tag = "" if gap is None else "gap{}".format(gap)
        cache_path = os.path.join(
            directory, "{}.gurobi{}{}.pkl".format(name, timeout_tag, gap_tag))
        have_cached_result = os.path.isfile(cache_path) and not disable_cache

        if not have_cached_result:
            started_at = time.time()
            # Required prize: the total available prize, capped at 1.
            required_prize = min(sum(deterministic_prize), 1.)
            cost, tour = solve_euclidian_pctsp_gurobi(depot,
                                                      loc,
                                                      penalty,
                                                      deterministic_prize,
                                                      required_prize,
                                                      threads=1,
                                                      timeout=timeout,
                                                      gap=gap)
            duration = time.time() - started_at  # wall-clock time
            save_dataset((cost, tour, duration), cache_path)
        else:
            cost, tour, duration = load_dataset(cache_path)

        # The solver's tour starts at the depot (node 0); strip it.
        assert tour[0] == 0
        tour = tour[1:]

        # Recompute the cost independently and compare with the solver's value.
        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize,
                                     tour)
        assert abs(total_cost - cost) <= 1e-5, "Cost is incorrect"
        return total_cost, tour, duration

    except Exception as e:
        # A failed solve is not cached, so a later run simply retries it.
        print("Exception occured")
        print(e)
        return None

Example #14

0

Show file

def eval_dataset(dataset_path, width, softmax_temp, opts):
    """Evaluate a model on a dataset, print timing/cost summaries and save results.

    Runs either in-process or, with ``opts.multiprocessing``, with one worker
    process per CUDA device. The saved object is ``(results, parallelism)``.

    Returns:
        ``(costs, tours, durations)`` as tuples, one entry per instance.

    Raises:
        AssertionError: If multiprocessing is requested without CUDA, if
            ``opts.val_size`` is not divisible by the device count, or if the
            output file exists and ``opts.f`` is not set.
    """
    # The model is loaded here even in multiprocessing mode, because the
    # output location is derived from it.
    model, _ = load_model(opts.model)
    use_cuda = torch.cuda.is_available() and not opts.no_cuda
    model.use_cuda = use_cuda

    if not opts.multiprocessing:
        device = torch.device("cuda:0" if use_cuda else "cpu")
        dataset = model.problem.make_dataset(
            filename=dataset_path, num_samples=opts.val_size, offset=opts.offset)
        results = _eval_dataset(model, dataset, width, softmax_temp, opts, device)
    else:
        assert use_cuda, "Can only do multiprocessing with cuda"
        num_processes = torch.cuda.device_count()
        assert opts.val_size % num_processes == 0

        worker_args = [
            (dataset_path, width, softmax_temp, opts, rank, num_processes)
            for rank in range(num_processes)
        ]
        with mp.Pool(num_processes) as pool:
            per_worker = pool.map(eval_dataset_mp, worker_args)
            results = list(itertools.chain.from_iterable(per_worker))

    # Durations are reported as if a single device were used, so the
    # parallelism is the batch size regardless of multiprocessing.
    parallelism = opts.eval_batch_size

    # For some problems these "costs" are negated objectives.
    costs, tours, durations = zip(*results)

    mean_duration = np.mean(durations)
    cost_margin = 2 * np.std(costs) / np.sqrt(len(costs))
    duration_margin = 2 * np.std(durations) / np.sqrt(len(durations))
    total_seconds = int(np.sum(durations) / parallelism)

    print("Average cost: {} +- {}".format(np.mean(costs), cost_margin))
    print("Average serial duration: {} +- {}".format(mean_duration, duration_margin))
    print("Average parallel duration: {}".format(mean_duration / parallelism))
    print("Calculated total duration: {}".format(timedelta(seconds=total_seconds)))

    dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1])
    # Model name = last two path components of the model path, minus extension.
    model_path_parts = os.path.normpath(os.path.splitext(opts.model)[0]).split(os.sep)
    model_name = "_".join(model_path_parts[-2:])

    if opts.o is not None:
        out_file = opts.o
    else:
        results_dir = os.path.join(opts.results_dir, model.problem.NAME, dataset_basename)
        os.makedirs(results_dir, exist_ok=True)

        width_tag = width if opts.decode_strategy != 'greedy' else ''
        file_name = "{}-{}-{}{}-t{}-{}-{}{}".format(
            dataset_basename, model_name,
            opts.decode_strategy, width_tag,
            softmax_temp, opts.offset, opts.offset + len(costs), ext)
        out_file = os.path.join(results_dir, file_name)

    assert opts.f or not os.path.isfile(
        out_file), "File already exists! Try running with -f option to overwrite."

    save_dataset((results, parallelism), out_file)

    return costs, tours, durations

Example #15

0

Show file

def solve_compass_log(executable,
                      directory,
                      name,
                      depot,
                      loc,
                      prize,
                      max_length,
                      disable_cache=False):
    """Solve one orienteering instance with an external solver, caching the result.

    Working files live in ``directory``: ``<name>.oplib`` (problem),
    ``<name>.tour`` (solver output), ``<name>.log`` (solver stdout/stderr)
    and ``<name>.compass.pkl`` (cached result). The solver is skipped when
    the cached result exists and ``disable_cache`` is false.

    Returns:
        ``(-total_prize, tour, duration)``, or ``None`` if anything raised
        along the way (the exception is printed, not propagated).
    """
    problem_filename = os.path.join(directory, "{}.oplib".format(name))
    tour_filename = os.path.join(directory, "{}.tour".format(name))
    output_filename = os.path.join(directory, "{}.compass.pkl".format(name))
    log_filename = os.path.join(directory, "{}.log".format(name))

    try:
        # May have already been run
        if os.path.isfile(output_filename) and not disable_cache:
            tour, duration = load_dataset(output_filename)
        else:
            write_oplib(problem_filename,
                        depot,
                        loc,
                        prize,
                        max_length,
                        name=name)

            # Only the solver invocation itself is timed; its output goes to the log.
            with open(log_filename, 'w') as f:
                start = time.time()
                check_call([
                    executable, '--op', '--op-ea4op', problem_filename, '-o',
                    tour_filename
                ],
                           stdout=f,
                           stderr=f)
                duration = time.time() - start

            tour = read_oplib(tour_filename, n=len(prize))

            # Compute the tour length once; it feeds both the warning and the check.
            tour_length = calc_op_length(depot, loc, tour)
            if not tour_length <= max_length:
                print("Warning: length exceeds max length:", tour_length,
                      max_length)
            # Lengths within MAX_LENGTH_TOL of the limit are still accepted.
            assert tour_length <= max_length + MAX_LENGTH_TOL, \
                "Tour exceeds max_length!"
            save_dataset((tour, duration), output_filename)

        return -calc_op_total(prize, tour), tour, duration

    except Exception as e:
        print("Exception occured")
        print(e)
        return None

Example #16

0

Show file

def solve_gurobi(directory,
                 name,
                 depot,
                 loc,
                 prize,
                 max_length,
                 disable_cache=False,
                 timeout=None,
                 gap=None):
    """Solve one orienteering instance with the Gurobi wrapper, caching the result.

    The cache file is ``<directory>/<name>.gurobi[t<timeout>][gap<gap>].pkl``,
    where the bracketed parts appear only when the option is given.

    Returns:
        ``(total_cost, tour, duration)`` where ``total_cost`` is the negated
        collected prize and ``tour`` has its leading depot removed, or
        ``None`` if anything raised along the way (the exception is printed,
        not propagated).
    """
    # Deferred import: gurobi is only required when this solver is used.
    from problems.op.op_gurobi import \
        solve_euclidian_op as solve_euclidian_op_gurobi

    try:
        timeout_tag = "" if timeout is None else "t{}".format(timeout)
        gap_tag = "" if gap is None else "gap{}".format(gap)
        cache_path = os.path.join(
            directory, "{}.gurobi{}{}.pkl".format(name, timeout_tag, gap_tag))
        have_cached_result = os.path.isfile(cache_path) and not disable_cache

        if not have_cached_result:
            started_at = time.time()
            cost, tour = solve_euclidian_op_gurobi(depot,
                                                   loc,
                                                   prize,
                                                   max_length,
                                                   threads=1,
                                                   timeout=timeout,
                                                   gap=gap)
            duration = time.time() - started_at  # wall-clock time
            save_dataset((cost, tour, duration), cache_path)
        else:
            cost, tour, duration = load_dataset(cache_path)

        # The solver's tour starts at the depot (node 0); strip it.
        assert tour[0] == 0
        tour = tour[1:]

        # Lengths within MAX_LENGTH_TOL of the limit are still accepted.
        length_limit = max_length + MAX_LENGTH_TOL
        assert calc_op_length(depot, loc,
                              tour) <= length_limit, "Tour exceeds max_length!"

        # Recompute the (negated) prize and compare with the solver's value.
        total_cost = -calc_op_total(prize, tour)
        assert abs(total_cost - cost) <= 1e-4, "Cost is incorrect"
        return total_cost, tour, duration

    except Exception as e:
        # A failed solve is not cached, so a later run simply retries it.
        print("Exception occured")
        print(e)
        return None

Example #17

0

Show file

File: preprocess_cleaning.py Project: SunnyMarkLiu/Kaggle_Quora_Question_Pairs_Intent

def main(base_data_dir):
    """Add basic text statistics and clean the question text (scope 0 -> scope 1).

    Does nothing when the processed scope-1 training file already exists.
    Adds character/word counts for both questions, generates unigram-word
    features before and after text cleaning, and saves the result as scope 1.

    Args:
        base_data_dir: Base data directory. If it contains the substring
            ``'no_stem_words'`` the non-stemming cleaner is used, otherwise
            the stemming one.
    """
    op_scope = 0
    if os.path.exists(Configure.processed_train_path.format(base_data_dir, op_scope+1)):
        return

    print("---> load datasets from scope {}".format(op_scope))
    train, test = data_utils.load_dataset(base_data_dir, op_scope)
    print("train: {}, test: {}".format(train.shape, test.shape))
    print("---> generate basic statistic features")
    questions = (('q1', 'question1'), ('q2', 'question2'))
    for frame in (train, test):
        for suffix, column in questions:
            frame['num_of_chars_' + suffix] = frame[column].apply(lambda x: len(str(x)))
        for suffix, column in questions:
            frame['num_of_words_' + suffix] = frame[column].apply(lambda x: len(str(x).split()))

    print('---> generate unigram_words features before cleaned')
    train = jobs.parallelize_dataframe(train, generate_unigram_words_features)
    test = jobs.parallelize_dataframe(test, generate_unigram_words_features)

    print('---> clean text')
    # Wall-clock timing: time.clock() was removed in Python 3.8.
    start = time.time()
    if 'no_stem_words' in base_data_dir:
        clean_text_func = clean_text_func_no_stem_words
    else:
        clean_text_func = clean_text_func_stem_words
    print('clean train question')
    train = jobs.parallelize_dataframe(train, clean_text_func)
    print('clean test question')
    test = jobs.parallelize_dataframe(test, clean_text_func)

    # Report the elapsed time; the original formatted the raw clock reading
    # into the single placeholder and never printed the difference.
    elapsed = time.time() - start
    print("text cleaned, cost {}s".format(elapsed))

    print('---> generate unigram_words features after cleaned')
    train = jobs.parallelize_dataframe(train, generate_cleaned_unigram_words_features)
    test = jobs.parallelize_dataframe(test, generate_cleaned_unigram_words_features)

    print("train: {}, test: {}".format(train.shape, test.shape))
    print("---> save datasets")
    data_utils.save_dataset(base_data_dir, train, test, op_scope + 1)

Example #18

0

Show file

File: train_test_preprocess.py Project: SunnyMarkLiu/Kaggle_NYC_Taxi_Trip_Duration

def main():
    """Turn ``store_and_fwd_flag`` into a 0/1 column and save the datasets.

    Returns immediately when the processed train file for scope '0' already
    exists. Otherwise loads train/test for scope '0', adds
    ``is_store_and_fwd_flag`` (1 where the flag equals 'Y', else 0), removes
    the original column and saves both frames back under scope '0'.

    ``print`` is called with a single parenthesised argument, which prints
    the same text under Python 2 and Python 3.
    """
    scope = '0'
    if os.path.exists(Configure.processed_train_path.format(scope)):
        return

    train, test = data_utils.load_dataset(op_scope=scope)
    shape_report = 'train: {}, test: {}'
    print(shape_report.format(train.shape, test.shape))

    def flag_to_int(flag):
        # 'Y' -> 1, anything else -> 0
        return 1 if flag == 'Y' else 0

    # The two frames are independent, so each is handled in one pass.
    for frame in (train, test):
        frame['is_store_and_fwd_flag'] = frame['store_and_fwd_flag'].map(
            flag_to_int)
        del frame['store_and_fwd_flag']

    print(shape_report.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope=scope)

Example #19

0

Show file

File: generate_magic_features.py Project: SunnyMarkLiu/Kaggle_Quora_Question_Pairs_Intent

def main(base_data_dir):
    """Add common-word-count features to the scope-4 datasets.

    Skips all work when the processed train file for the next scope (5)
    already exists under ``base_data_dir``. Otherwise loads train/test from
    scope 4, applies ``generate_common_word_count`` to each frame through
    ``jobs.parallelize_dataframe`` and saves the result as scope 5.
    """
    op_scope = 4
    next_scope = op_scope + 1

    already_done = os.path.exists(
        Configure.processed_train_path.format(base_data_dir, next_scope))
    if already_done:
        return

    shape_report = "train: {}, test: {}"

    print("---> load datasets from scope {}".format(op_scope))
    train, test = data_utils.load_dataset(base_data_dir, op_scope)
    print(shape_report.format(train.shape, test.shape))

    print('---> generate common word count')
    train = jobs.parallelize_dataframe(train, generate_common_word_count)
    test = jobs.parallelize_dataframe(test, generate_common_word_count)

    print(shape_report.format(train.shape, test.shape))
    print("---> save datasets")
    data_utils.save_dataset(base_data_dir, train, test, next_scope)

Example #20

0

Show file

def eval_dataset(dataset_path, opts):
    """Evaluate the model at ``opts.model`` on one dataset file and save the results.

    Args:
        dataset_path: Path of the dataset file handed to
            ``model.problem.make_dataset``.
        opts: Options object. This function reads ``model``, ``no_cuda``,
            ``val_size``, ``offset``, ``eval_batch_size``, ``o``,
            ``results_dir`` and ``decode_strategy``.

    Returns:
        A ``(costs, tours, durations)`` triple obtained by unzipping the
        per-instance results returned by ``_eval_dataset``.

    Side effects:
        Prints summary statistics and writes ``(results, parallelism)`` with
        ``save_dataset`` to ``opts.o`` or, when that is ``None``, to a file
        under ``opts.results_dir/<problem name>/<dataset basename>/`` (the
        directory is created if needed).
    """
    model = load_model(opts.model)
    # ``no_cuda`` is an opt-out flag: run on the GPU only when one is
    # available AND the user has not disabled it. (The previous condition
    # lacked the ``not`` and therefore picked CUDA only when it was disabled.)
    use_cuda = torch.cuda.is_available() and not opts.no_cuda

    device = torch.device('cuda:0' if use_cuda else 'cpu')
    dataset = model.problem.make_dataset(filename=dataset_path,
                                         num_samples=opts.val_size,
                                         offset=opts.offset)
    results = _eval_dataset(model, dataset, opts, device)

    # The batch size is used as the degree of parallelism when converting
    # summed per-instance durations into an estimated parallel duration.
    parallelism = opts.eval_batch_size

    costs, tours, durations = zip(
        *results)  # Not really costs since they should be negative

    print("Average cost: {} +- {}".format(
        np.mean(costs), 2 * np.std(costs) / np.sqrt(len(costs))))
    print("Average serial duration: {} +- {}".format(
        np.mean(durations), 2 * np.std(durations) / np.sqrt(len(durations))))
    print("Average parallel duration: {}".format(
        np.mean(durations) / parallelism))
    print("Calculated total duration: {}".format(
        timedelta(seconds=int(np.sum(durations) / parallelism))))

    dataset_basename, ext = os.path.splitext(os.path.split(dataset_path)[-1])
    # Model name = last two path components of the model path, without the
    # extension, joined by an underscore.
    model_name = "_".join(
        os.path.normpath(os.path.splitext(opts.model)[0]).split(os.sep)[-2:])
    if opts.o is None:
        results_dir = os.path.join(opts.results_dir, model.problem.NAME,
                                   dataset_basename)
        os.makedirs(results_dir, exist_ok=True)

        out_file = os.path.join(
            results_dir,
            "{}-{}-{}-{}-{}{}".format(dataset_basename, model_name,
                                      opts.decode_strategy, opts.offset,
                                      opts.offset + len(costs), ext))
    else:
        out_file = opts.o

    save_dataset((results, parallelism), out_file)

    return costs, tours, durations

Example #21

0

Show file

def solve_pctsp_log(executable, directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize, runs=10):
    """Run the external PCTSP solver on one instance and cache its raw output.

    If ``<name>.pctsp<runs>.pkl`` already exists in ``directory`` the stored
    ``(output, duration)`` pair is loaded; otherwise the problem file is
    written, ``executable`` is invoked (stderr goes to the log file, stdout is
    captured and then appended to the same log) and the pair is saved.

    The solver output is scanned for the first line starting with
    ``"Best Result Route: "``; the route must start and end with node 0,
    and those two entries are stripped from the returned tour.

    ``stochastic_prize`` is accepted but not used in this function.

    Returns:
        ``(cost, tour_as_list, duration)`` on success, or ``None`` if any
        exception was raised (the exception is printed).
    """

    def artifact(extension):
        # All files of this instance share the "<name>.pctsp<runs>" stem.
        return os.path.join(
            directory, "{}.pctsp{}.{}".format(name, runs, extension))

    problem_filename = artifact("pctsp")
    output_filename = artifact("pkl")
    log_filename = artifact("log")

    try:
        if os.path.isfile(output_filename):
            # May have already been run
            output, duration = load_dataset(output_filename)
        else:
            write_pctsp(problem_filename, depot, loc, penalty, deterministic_prize, name=name)
            with open(log_filename, 'w') as log_file:
                start = time.time()
                # exe, filename, min_total_prize (=1), num_runs
                raw = check_output(
                    [executable, problem_filename, float_to_scaled_int_str(1.), str(runs)],
                    stderr=log_file
                )
                output = raw.decode('utf-8')
                duration = time.time() - start
                log_file.write(output)

            save_dataset((output, duration), output_filename)

        # Now parse output: take the first line carrying the route heading
        heading = "Best Result Route: "
        route_line = next(
            (line for line in output.splitlines() if line.startswith(heading)),
            None)
        tour = None
        if route_line is not None:
            tour = np.array(route_line[len(heading):].split(" ")).astype(int)
        assert tour is not None, "Could not find tour in output!"

        assert tour[0] == 0, "Tour should start with depot"
        assert tour[-1] == 0, "Tour should end with depot"
        tour = tour[1:-1]  # Strip off depot

        cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour)
        return cost, tour.tolist(), duration
    except Exception as exc:
        print("Exception occured")
        print(exc)
        return None

Example #22

0

Show file

def main():
    """Add time-window features to the scope-'4' data and save it as scope '5'.

    The target column ``trip_duration`` is removed from train, train and test
    are stacked into one frame (columns taken from test, index reset to
    0..n-1), ``perform_time_window`` is applied, and the frame is split back
    at the original train row count before the target is re-attached.
    """
    train, test = data_utils.load_dataset(op_scope='4')
    shape_report = 'train: {}, test: {}'
    print(shape_report.format(train.shape, test.shape))

    # Set the target aside so the combined frame has only shared columns.
    trip_durations = train.pop('trip_duration')
    n_train = train.shape[0]

    merged = pd.concat([train, test])
    merged.columns = test.columns.values
    merged.index = range(merged.shape[0])

    # Window sizes handed to perform_time_window.
    # NOTE(review): the variable name says days, the original comment said
    # minutes; confirm the unit against perform_time_window.
    timewindow_days = [10, 15]
    merged = perform_time_window(merged, timewindow_days)

    # Split back at the original train/test boundary.
    train = merged.iloc[:n_train, :]
    test = merged.iloc[n_train:, :]

    train['trip_duration'] = trip_durations
    print(shape_report.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='5')

Example #23

0

Show file

def solve_concorde_log(executable, directory, name, loc, disable_cache=False):
    """Solve one TSP instance with the Concorde executable, with result caching.

    When ``<name>.concorde.pkl`` exists in ``directory`` and caching is not
    disabled, the stored ``(tour, duration)`` is loaded. Otherwise the
    instance is written with ``write_tsplib``, ``executable`` is run from
    ``directory`` with stdout/stderr redirected to ``<name>.log``, the tour
    file is read back and the pair is saved.

    Returns:
        ``(tour_length, tour, duration)`` on success, or ``None`` if any
        exception was raised (the exception is printed).
    """

    def artifact(suffix):
        # Every artifact is "<directory>/<name>.<suffix>".
        return os.path.join(directory, "{}.{}".format(name, suffix))

    problem_filename = artifact("tsp")
    tour_filename = artifact("tour")
    output_filename = artifact("concorde.pkl")
    log_filename = artifact("log")

    try:
        if not os.path.isfile(output_filename) or disable_cache:
            write_tsplib(problem_filename, loc, name=name)

            with open(log_filename, 'w') as log_file:
                start = time.time()
                try:
                    # Concorde is weird, will leave traces of solution in
                    # current directory so call from target dir (hence the
                    # absolute paths together with cwd=directory).
                    check_call(
                        [
                            executable, '-s', '1234', '-x', '-o',
                            os.path.abspath(tour_filename),
                            os.path.abspath(problem_filename)
                        ],
                        stdout=log_file,
                        stderr=log_file,
                        cwd=directory)
                except CalledProcessError as proc_err:
                    # Somehow Concorde returns 255
                    assert proc_err.returncode == 255
                duration = time.time() - start

            tour = read_concorde_tour(tour_filename)
            save_dataset((tour, duration), output_filename)
        else:
            # May have already been run
            tour, duration = load_dataset(output_filename)

        return calc_tsp_length(loc, tour), tour, duration

    except Exception as exc:
        print("Exception occured")
        print(exc)
        return None

Example #24

0

Show file

def main():
    """Drop features by missing rate from the scope-'3' data; save as scope '4'.

    Returns immediately when the processed train file for scope '4' already
    exists. The target ``trip_duration`` is removed before train and test are
    concatenated, ``drop_some_features`` is applied to the combined frame, and
    the frame is split back at the original train row count before the target
    is re-attached.
    """
    if os.path.exists(Configure.processed_train_path.format('4')):
        return

    train, test = data_utils.load_dataset(op_scope='3')
    shape_report = 'train: {}, test: {}'
    print(shape_report.format(train.shape, test.shape))

    # Keep the target out of the combined frame.
    trip_durations = train.pop('trip_duration')
    n_train = train.shape[0]
    merged = pd.concat([train, test])

    drop_missing_rate = 1
    print('drop some features, missing_rate > {}'.format(drop_missing_rate))
    merged = drop_some_features(merged, drop_missing_rate=drop_missing_rate)

    # Split back at the original train/test boundary.
    train = merged.iloc[:n_train, :]
    test = merged.iloc[n_train:, :]
    train['trip_duration'] = trip_durations

    print(shape_report.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='4')

Example #25

0

Show file

def solve_ortools(directory,
                  name,
                  depot,
                  loc,
                  prize,
                  max_length,
                  sec_local_search=0,
                  disable_cache=False):
    """Solve one OP instance with OR-tools, caching the result on disk.

    The cache file is ``<name>.ortools<sec_local_search>.pkl`` in
    ``directory``. When it exists and caching is not disabled, the stored
    ``(objval, tour, duration)`` is loaded; otherwise ``solve_op_ortools`` is
    called, timed and its result saved.

    The tour must start with node 0, which is stripped. The remaining tour is
    checked against ``max_length`` (plus ``MAX_LENGTH_TOL``) and against the
    reported objective value.

    Returns:
        ``(-total_prize, tour, duration)`` on success, or ``None`` if any
        exception (including a failed assertion) was raised; the exception
        is printed.
    """
    # Lazy import so we do not require ortools by default
    from problems.op.op_ortools import solve_op_ortools

    try:
        cache_name = "{}.ortools{}.pkl".format(name, sec_local_search)
        problem_filename = os.path.join(directory, cache_name)

        if not os.path.isfile(problem_filename) or disable_cache:
            start = time.time()
            objval, tour = solve_op_ortools(
                depot, loc, prize, max_length,
                sec_local_search=sec_local_search)
            duration = time.time() - start
            save_dataset((objval, tour, duration), problem_filename)
        else:
            objval, tour, duration = load_dataset(problem_filename)

        assert tour[0] == 0, "Tour must start with depot"
        tour = tour[1:]
        assert (calc_op_length(depot, loc, tour)
                <= max_length + MAX_LENGTH_TOL), "Tour exceeds max_length!"
        assert abs(-calc_op_total(prize, tour)
                   - objval) <= 1e-5, "Cost is incorrect"
        return -calc_op_total(prize, tour), tour, duration
    except Exception as exc:
        # For some stupid reason, sometimes OR tools cannot find a feasible solution?
        # By letting it fail we do not get total results, but we can retry by the caching mechanism
        print("Exception occured")
        print(exc)
        return None

Example #26

0

Show file

def main():
    """Add heavy-traffic cluster distance features; save the data as scope '5'.

    Loads scope '4', removes the target ``trip_duration`` from train, stacks
    train and test, left-merges the output of
    ``calc_heavy_traffic_cluster_distances`` on ``id``, then splits the frame
    back at the original train row count and re-attaches the target.
    """
    train, test = data_utils.load_dataset(op_scope='4')
    shape_report = 'train: {}, test: {}'
    print(shape_report.format(train.shape, test.shape))

    # Keep the target out of the combined frame.
    trip_durations = train.pop('trip_duration')
    n_train = train.shape[0]
    merged = pd.concat([train, test])

    cluster_options = dict(
        n_clusters=10**2,
        batch_size=64**3,
        most_traffic_quantile=0.80,
        random_state=1000,
    )
    result_df = calc_heavy_traffic_cluster_distances(merged, **cluster_options)
    merged = pd.merge(merged, result_df, how='left', on='id')

    # Split back at the original train/test boundary.
    train = merged.iloc[:n_train, :]
    test = merged.iloc[n_train:, :]
    train['trip_duration'] = trip_durations

    print(shape_report.format(train.shape, test.shape))
    print('save dataset...')
    data_utils.save_dataset(train, test, op_scope='5')

Example #27

0

Show file

def main(base_data_dir):
    """Add introducer-word, symbol-count and char-count features (scope 2 -> 3).

    Skips all work when the processed train file for scope 3 already exists
    under ``base_data_dir``. Each feature generator is applied to train and
    then to test through ``jobs.parallelize_dataframe``, in the order listed
    below, and the result is saved as scope 3.
    """
    op_scope = 2
    next_scope = op_scope + 1
    if os.path.exists(Configure.processed_train_path.format(base_data_dir, next_scope)):
        return

    shape_report = "train: {}, test: {}"

    print("---> load datasets from scope {}".format(op_scope))
    train, test = data_utils.load_dataset(base_data_dir, op_scope)
    print(shape_report.format(train.shape, test.shape))

    # (progress banner, feature generator) pairs, run in this order
    steps = (
        ('---> generate question introducer word features',
         generate_question_introducer_word_features),
        ('---> generate symbol features', generate_symbol_count),
        ('---> generate char count features', generate_char_count),
    )
    for banner, feature_func in steps:
        print(banner)
        train = jobs.parallelize_dataframe(train, feature_func)
        test = jobs.parallelize_dataframe(test, feature_func)

    print(shape_report.format(train.shape, test.shape))
    print("---> save datasets")
    data_utils.save_dataset(base_data_dir, train, test, next_scope)

Example #28

0

Show file

def solve_stochastic_pctsp_log(executable,
                               directory,
                               name,
                               depot,
                               loc,
                               penalty,
                               deterministic_prize,
                               stochastic_prize,
                               runs=10,
                               append='all'):
    """Solve a stochastic PCTSP instance by repeatedly calling an external solver.

    If ``<name>.stochpctsp<append><runs>.pkl`` already exists in ``directory``
    the stored result is loaded. Otherwise the function loops: it writes a
    sub-problem restricted to the not-yet-visited nodes (plus the depot),
    runs ``executable`` on it, parses the ``"Best Result Route: "`` line from
    the output and appends part of that route to the final tour. After each
    round the collected prize is recomputed from ``stochastic_prize``. The
    loop ends when the final tour has as many nodes as ``stochastic_prize``
    or when the solver returns a route with no nodes besides the depot.

    Args:
        executable: Solver command, used as the first element of the argument
            list passed to ``check_output``.
        directory: Directory holding the problem, result and log files.
        name: Instance name used in the file names.
        depot: Depot coordinate; prepended to ``loc`` to form node 0.
        loc: List of node coordinates (concatenated with ``[depot]``).
        penalty: Per-node penalties; converted to a NumPy array.
        deterministic_prize: Per-node prizes given to the solver; converted
            to a NumPy array.
        stochastic_prize: Per-node prizes used to compute the collected prize
            and the final cost.
        runs: Passed to the solver as its last argument.
        append: How much of each returned route is kept: ``'first'`` (first
            node only), ``'half'`` (first half, at least one node) or
            ``'all'``.

    Returns:
        ``(final_cost, final_tour, total_duration)`` on success, or ``None``
        if any exception (including a failed assertion) was raised; the
        exception is printed.
    """

    try:

        # All artifacts share the "<name>.stochpctsp<append><runs>" stem.
        problem_filename = os.path.join(
            directory, "{}.stochpctsp{}{}.pctsp".format(name, append, runs))
        output_filename = os.path.join(
            directory, "{}.stochpctsp{}{}.pkl".format(name, append, runs))
        log_filename = os.path.join(
            directory, "{}.stochpctsp{}{}.log".format(name, append, runs))

        # May have already been run
        if not os.path.isfile(output_filename):

            total_start = time.time()

            # Raw solver output and wall-clock time of every round
            outputs = []
            durations = []
            # Node ids (indices into coord, depot = 0) visited so far, in order
            final_tour = []

            coord = [depot] + loc

            # mask[i] is True once node i has been added to final_tour
            mask = np.zeros(len(coord), dtype=bool)
            dist = distance_matrix(coord, coord)
            penalty = np.array(penalty)
            deterministic_prize = np.array(deterministic_prize)

            # Round counter; incremented below but not read in this function
            it = 0
            total_collected_prize = 0.
            # As long as we have not visited all nodes we repeat
            # even though we have already satisfied the total prize collected constraint
            # since the algorithm may decide to include more nodes to avoid further penalties
            while len(final_tour) < len(stochastic_prize):

                # Mask all nodes already visited (not the depot)
                mask[final_tour] = True

                # The distance from the 'start' or 'depot' is the distance from the 'current node'
                # this way we mimic as if we have a separate start and end by the asymmetric distance matrix
                # Note: this violates the triangle inequality and the distance from 'depot to depot' becomes nonzero
                # but the program seems to deal with this well
                if len(
                        final_tour
                ) > 0:  # in the first iteration we are at depot and distance matrix is ok
                    dist[0, :] = dist[final_tour[-1], :]

                # penalty / prize arrays have no depot entry, hence mask[1:]
                remaining_deterministic_prize = deterministic_prize[~mask[1:]]
                write_pctsp_dist(problem_filename, dist[np.ix_(~mask, ~mask)],
                                 penalty[~mask[1:]],
                                 remaining_deterministic_prize)
                # If the remaining deterministic prize is less than the prize we should still collect
                # set this lower value as constraint since otherwise problem is infeasible
                # compute total remaining deterministic prize after converting to ints
                # otherwise we may still have problems with rounding
                # Note we need to clip 1 - total_collected_prize between 0 (constraint can already be satisfied)
                # and the maximum achievable with the remaining_deterministic_prize
                min_prize_int = max(
                    0,
                    min(
                        float_to_scaled_int(1. - total_collected_prize),
                        sum([
                            float_to_scaled_int(v)
                            for v in remaining_deterministic_prize
                        ])))
                # Append mode: the log accumulates the solver's stderr over all rounds
                with open(log_filename, 'a') as f:
                    start = time.time()
                    output = check_output(
                        # exe, filename, min_total_prize (clipped value computed above), num_runs
                        [
                            executable, problem_filename,
                            str(min_prize_int),
                            str(runs)
                        ],
                        stderr=f).decode('utf-8')
                    durations.append(time.time() - start)
                    outputs.append(output)

                # Now parse output
                tour = None
                for line in output.splitlines():
                    heading = "Best Result Route: "
                    if line[:len(heading)] == heading:
                        tour = np.array(
                            line[len(heading):].split(" ")).astype(int)
                        break
                assert tour is not None, "Could not find tour in output!"

                assert tour[0] == 0, "Tour should start with depot"
                assert tour[-1] == 0, "Tour should end with depot"
                tour = tour[1:-1]  # Strip off depot

                # Now find to which nodes these correspond
                # (sub-problem indices -> ids of the unmasked nodes in coord)
                tour_node_ids = np.arange(len(coord), dtype=int)[~mask][tour]

                if len(tour_node_ids) == 0:
                    # The inner algorithm can decide to stop, but does not have to
                    assert total_collected_prize > 1 - 1e-5, "Collected prize should be one"
                    break

                # Commit only the requested part of this round's route
                if append == 'first':
                    final_tour.append(tour_node_ids[0])
                elif append == 'half':
                    final_tour.extend(
                        tour_node_ids[:max(len(tour_node_ids) // 2, 1)])
                else:
                    assert append == 'all'
                    final_tour.extend(tour_node_ids)

                total_collected_prize = calc_pctsp_total(
                    stochastic_prize, final_tour)
                it = it + 1

            # The problem file is rewritten every round; remove the last one
            os.remove(problem_filename)
            final_cost = calc_pctsp_cost(depot, loc, penalty, stochastic_prize,
                                         final_tour)
            total_duration = time.time() - total_start
            save_dataset(
                (final_cost, final_tour, total_duration, outputs, durations),
                output_filename)

        else:
            # Cached result: outputs and durations are loaded but not returned
            final_cost, final_tour, total_duration, outputs, durations = load_dataset(
                output_filename)

        return final_cost, final_tour, total_duration
    except Exception as e:
        print("Exception occured")
        print(e)
        return None

Example #29

0

Show file

                def run_func(args):
                    """Solve one instance; ``args`` is unpacked after ``executable``."""
                    # executable, runs and append come from the enclosing scope
                    outcome = solve_stochastic_pctsp_log(
                        executable, *args, runs=runs, append=append)
                    return outcome

            results, parallelism = run_all_in_pool(
                run_func,
                target_dir,
                dataset,
                opts,
                use_multiprocessing=use_multiprocessing)

        else:
            assert False, "Unknown method: {}".format(opts.method)

        costs, tours, durations = zip(
            *results)  # Not really costs since they should be negative
        print("Average cost: {} +- {}".format(
            np.mean(costs), 2 * np.std(costs) / np.sqrt(len(costs))))
        print("Average serial duration: {} +- {}".format(
            np.mean(durations),
            2 * np.std(durations) / np.sqrt(len(durations))))
        print("Average parallel duration: {}".format(
            np.mean(durations) / parallelism))
        print("Calculated total duration: {}".format(
            timedelta(seconds=int(np.sum(durations) / parallelism))))

        save_dataset((results, parallelism), out_file)

Example #30

0

Show file

                full_data_perform_stem_words,
                full_data_perform_no_stem_words""")

# Script body: add word-vector features to the scope-5 datasets and save them
# as scope 6. `parser` is configured earlier in the file.
options, _ = parser.parse_args()
print("========== generate word vector features ==========")
base_data_dir = options.base_data_dir

op_scope = 5
# Nothing to do when the output of this step (scope op_scope + 1) already exists.
# NOTE(review): `exit()` is the helper installed by the `site` module;
# `sys.exit()` is the conventional choice in scripts.
if os.path.exists(
        Configure.processed_train_path.format(base_data_dir, op_scope + 1)):
    exit()

print("---> load datasets from scope {}".format(op_scope))
train, test = data_utils.load_dataset(base_data_dir, op_scope)
print("train: {}, test: {}".format(train.shape, test.shape))

print('---> generate word vector mapping')
# NOTE(review): embeddings_index is not passed to the feature functions below;
# presumably they read it as a module-level global (confirm before renaming).
embeddings_index = generate_word_vector_map()

print('---> generate wordvectors features')
train = jobs.parallelize_dataframe(train, generate_wordvectors_features)
test = jobs.parallelize_dataframe(test, generate_wordvectors_features)

print('---> generate wordvector distance features')
train = jobs.parallelize_dataframe(train, generate_wordvector_distance)
test = jobs.parallelize_dataframe(test, generate_wordvector_distance)

print("train: {}, test: {}".format(train.shape, test.shape))
print("---> save datasets")
# Saved under the next scope so the existence check above skips a re-run.
data_utils.save_dataset(base_data_dir, train, test, op_scope + 1)