Ejemplo n.º 1
0
def get_data_heading(model_id,
                     driver_id,
                     repeat,
                     test=False,
                     moving_average_window=3,
                     stops=False,
                     version=1):
    """Build n-gram count matrices from ride heading strings.

    The training set is the driver's training rides followed by a random
    sample of ``settings.BIG_CHUNK * repeat`` rides; the test set is the
    driver's remaining rides followed by ``settings.SMALL_CHUNK`` random
    rides (presumably rides of other drivers -- confirm against
    ``DataAccess.get_random_rides``). Each ride is turned into a heading
    sequence, rendered as a string and vectorized with a
    ``CountVectorizer`` that is fitted on the training strings only.

    Args:
        model_id: Added to ``driver_id`` to seed the random generator used
            for the training sample.
        driver_id: Driver whose rides are split into train and test parts.
        repeat: Multiplier applied to the size of the random training sample.
        test: Unsupported here; any truthy value raises.
        moving_average_window: Forwarded to ``heading.get_ride_heading``.
        stops: Forwarded to ``heading.get_ride_heading``.
        version: Forwarded to ``heading.get_ride_heading``.

    Returns:
        A ``(train, test)`` tuple with the vectorizer output for the
        training strings and the test strings.

    Raises:
        NotImplementedError: If ``test`` is truthy.
    """
    # Fail fast: reject the unsupported mode before touching the data layer.
    if test:
        raise NotImplementedError(
            'get_data_heading does not support test=True')

    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()

    driver_train, driver_test = da.get_rides_split(driver_id,
                                                   settings.BIG_CHUNK)
    other_train = list(
        da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
    # NOTE(review): no seed is passed for the test sample, unlike the training
    # sample above -- confirm this is intentional.
    other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

    heading_options = dict(moving_average_window=moving_average_window,
                           stops=stops,
                           version=version)

    # With variations=True every ride yields several heading sequences.
    train_variations = [
        heading.get_ride_heading(ride, variations=True, **heading_options)
        for ride in driver_train + other_train
    ]
    test_docs = [
        util.get_list_string(heading.get_ride_heading(ride, **heading_options))
        for ride in driver_test + other_test
    ]

    # Flatten the per-ride variations so each one becomes its own document.
    train_docs = [
        util.get_list_string(sequence)
        for sequence in itertools.chain.from_iterable(train_variations)
    ]

    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)

    return train_matrix, test_matrix
def get_data_heading(model_id, driver_id, repeat, test=False, moving_average_window=3, stops=False, version=1):
  """Build n-gram count matrices from ride heading strings.

  Training documents come from the driver's training rides followed by a
  random sample of ``settings.BIG_CHUNK * repeat`` rides; test documents come
  from the driver's remaining rides followed by ``settings.SMALL_CHUNK``
  random rides (presumably rides of other drivers -- confirm against
  ``DataAccess.get_random_rides``). A ``CountVectorizer`` is fitted on the
  training documents only and then applied to the test documents.

  Args:
    model_id: Added to ``driver_id`` to seed the random generator used for
      the training sample.
    driver_id: Driver whose rides are split into train and test parts.
    repeat: Multiplier applied to the size of the random training sample.
    test: Unsupported here; any truthy value raises.
    moving_average_window: Forwarded to ``heading.get_ride_heading``.
    stops: Forwarded to ``heading.get_ride_heading``.
    version: Forwarded to ``heading.get_ride_heading``.

  Returns:
    A ``(train, test)`` tuple with the vectorizer output for the training
    documents and the test documents.

  Raises:
    NotImplementedError: If ``test`` is truthy.
  """
  # Fail fast: reject the unsupported mode before touching the data layer.
  if test:
    raise NotImplementedError('get_data_heading does not support test=True')

  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()

  driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
  other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=seed))
  # NOTE(review): no seed is passed for the test sample, unlike the training
  # sample above -- confirm this is intentional.
  other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))

  heading_options = dict(moving_average_window=moving_average_window, stops=stops, version=version)

  # With variations=True every ride yields several heading sequences.
  train_variations = [
      heading.get_ride_heading(ride, variations=True, **heading_options)
      for ride in driver_train + other_train
  ]
  test_docs = [
      util.get_list_string(heading.get_ride_heading(ride, **heading_options))
      for ride in driver_test + other_test
  ]

  # Flatten the per-ride variations so each one becomes its own document.
  train_docs = [
      util.get_list_string(sequence)
      for sequence in itertools.chain.from_iterable(train_variations)
  ]

  vectorizer = CountVectorizer(min_df=2, ngram_range=(1,15), max_df=1000000)
  train_matrix = vectorizer.fit_transform(train_docs)
  test_matrix = vectorizer.transform(test_docs)

  return train_matrix, test_matrix
Ejemplo n.º 3
0
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
    """Cross-validate ``Model`` on heading n-gram features for one driver.

    The driver's rides are the positive class and a seeded random sample of
    ``settings.BIG_CHUNK_TEST * repeat`` rides is the negative class. Every
    ride is expanded into heading variations (the label vector assumes four
    per ride), each variation is vectorized, and a K-fold loop trains a fresh
    model per fold and predicts the first variation of each held-out ride.

    Args:
        model_id: Added to ``driver_id`` to seed the negative sample.
        driver_id: Driver under test; also the K-fold ``random_state``.
        Model: Callable invoked as ``Model(trainX, trainY, driver_id)`` whose
            result exposes ``predict``.
        get_data: Selects the moving-average window (6 for
            ``get_data_heading_v2``, otherwise 3) and is passed to the cache.
        repeat: Scales the negative sample and the number of times the
            positive training rows are repeated.

    Returns:
        ``(driver_id, predictions)`` where ``predictions`` is a numpy array
        with one entry per ride index.
    """
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()

    driver_rides = list(da.get_rides(driver_id))
    sampled_rides = list(
        da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                            driver_id,
                            seed=seed))

    if get_data == get_data_heading_v2:
        moving_average_window = 6
    else:
        moving_average_window = 3

    def _headings(ride_list):
        # One pack of heading variations per ride.
        return [
            heading.get_ride_heading(
                ride,
                variations=True,
                moving_average_window=moving_average_window)
            for ride in ride_list
        ]

    def _stringify(packs):
        # Render every variation of every ride as a single string.
        return [[util.get_list_string(variation) for variation in pack]
                for pack in packs]

    driver_packs = _headings(driver_rides)
    sampled_packs = _headings(sampled_rides)
    driver_docs = _stringify(driver_packs)
    sampled_docs = _stringify(sampled_packs)

    # The vocabulary is learned from the first variation of the driver's
    # rides only.
    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
    vectorizer.fit([pack[0] for pack in driver_docs])

    def _vectorize(packs):
        # Transform each string on its own and keep its single result row.
        return [[vectorizer.transform([doc])[0] for doc in pack]
                for pack in packs]

    rides = _vectorize(driver_docs)
    other_rides = list(itertools.chain.from_iterable(_vectorize(sampled_docs)))

    rides = np.array(rides)

    positive_labels = [1] * settings.BIG_CHUNK_TEST * 4 * repeat
    negative_labels = [0] * settings.BIG_CHUNK_TEST * 4 * repeat
    labels = positive_labels + negative_labels

    folds = KFold(200,
                  n_folds=settings.FOLDS,
                  shuffle=True,
                  random_state=driver_id)
    # Placeholder entries make any ride index that never got a prediction
    # stand out in the result.
    predictions = ['bug'] * 200
    for train_idx, test_idx in folds:
        positive_rows = list(itertools.chain.from_iterable(rides[train_idx]))
        train_matrix = scipy.sparse.vstack(positive_rows * repeat + other_rides)
        test_matrix = scipy.sparse.vstack(
            [pack[0] for pack in rides[test_idx]])

        assert train_matrix.shape[0] == len(labels)
        assert test_matrix.shape[0] == settings.SMALL_CHUNK_TEST

        model = Model(train_matrix, labels, driver_id)
        fold_predictions = model.predict(test_matrix)
        for position, ride_index in enumerate(test_idx):
            predictions[ride_index] = fold_predictions[position]

    predictions = np.array(predictions)
    if settings.ENABLE_CACHE:
        util.cache_results(Model, get_data, driver_id, True, predictions,
                           repeat)
    return driver_id, predictions
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
  """Cross-validate ``Model`` on heading n-gram features for one driver.

  Positive examples are the driver's own rides; negative examples are a
  seeded random sample of ``settings.BIG_CHUNK_TEST * repeat`` rides. Rides
  are expanded into heading variations (the label vector assumes four per
  ride) and vectorized one string at a time. For every fold a new model is
  trained and asked to predict the first variation of each held-out ride.

  Args:
    model_id: Added to ``driver_id`` to seed the negative sample.
    driver_id: Driver under test; also the K-fold ``random_state``.
    Model: Callable invoked as ``Model(trainX, trainY, driver_id)`` whose
      result exposes ``predict``.
    get_data: Selects the moving-average window (6 for
      ``get_data_heading_v2``, otherwise 3) and is passed to the cache.
    repeat: Scales the negative sample and the number of times the positive
      training rows are repeated.

  Returns:
    ``(driver_id, predictions)`` where ``predictions`` is a numpy array with
    one entry per ride index.
  """
  seed = random.Random(x=driver_id+model_id)
  da = DataAccess()

  own_rides = list(da.get_rides(driver_id))
  foreign_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id, seed=seed))

  if get_data == get_data_heading_v2:
    moving_average_window = 6
  else:
    moving_average_window = 3

  def _headings(ride_list):
    # One pack of heading variations per ride.
    return [
        heading.get_ride_heading(ride, variations=True, moving_average_window=moving_average_window)
        for ride in ride_list
    ]

  def _stringify(packs):
    # Render every variation of every ride as a single string.
    return [[util.get_list_string(variation) for variation in pack] for pack in packs]

  own_packs = _headings(own_rides)
  foreign_packs = _headings(foreign_rides)
  own_docs = _stringify(own_packs)
  foreign_docs = _stringify(foreign_packs)

  # The vocabulary is learned from the first variation of the driver's rides only.
  vectorizer = CountVectorizer(min_df=2, ngram_range=(1,15), max_df=1000000)
  vectorizer.fit([pack[0] for pack in own_docs])

  def _vectorize(packs):
    # Transform each string on its own and keep its single result row.
    return [[vectorizer.transform([doc])[0] for doc in pack] for pack in packs]

  rides = _vectorize(own_docs)
  other_rides = list(itertools.chain.from_iterable(_vectorize(foreign_docs)))

  rides = np.array(rides)

  ones = [1] * settings.BIG_CHUNK_TEST * 4 * repeat
  zeros = [0] * settings.BIG_CHUNK_TEST * 4 * repeat
  targets = ones + zeros

  folds = KFold(200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id)
  # Placeholder entries make any ride index that never got a prediction stand out.
  predictions = ['bug'] * 200
  for train_idx, test_idx in folds:
    own_rows = list(itertools.chain.from_iterable(rides[train_idx]))
    train_matrix = scipy.sparse.vstack(own_rows * repeat + other_rides)
    test_matrix = scipy.sparse.vstack([pack[0] for pack in rides[test_idx]])

    assert train_matrix.shape[0] == len(targets)
    assert test_matrix.shape[0] == settings.SMALL_CHUNK_TEST

    model = Model(train_matrix, targets, driver_id)
    fold_predictions = model.predict(test_matrix)
    for position, ride_index in enumerate(test_idx):
      predictions[ride_index] = fold_predictions[position]

  predictions = np.array(predictions)
  if settings.ENABLE_CACHE:
    util.cache_results(Model, get_data, driver_id, True, predictions, repeat)
  return driver_id, predictions