def run_model(args):
  """Train ``Model`` on one driver's data and predict the held-out test rows.

  The five inputs arrive packed in a single tuple.  The original signature
  unpacked that tuple inside the parameter list, which is Python 2-only
  syntax (removed by PEP 3113) and a SyntaxError on Python 3.  Unpacking in
  the body accepts exactly the same call: ``run_model((a, b, c, d, e))``.

  Args:
    args: ``(model_id, driver_id, Model, get_data, repeat)`` where
      * ``model_id`` and ``driver_id`` are forwarded to ``get_data``;
        ``driver_id`` is also passed to ``Model`` and to the cache helpers.
      * ``Model`` is called as ``Model(trainX, trainY, driver_id)`` and the
        result must expose ``predict(testX)``.
      * ``get_data`` is called as ``get_data(model_id, driver_id, repeat)``
        and must return ``(trainX, testX)``.
      * ``repeat`` is how many times the leading block of training rows is
        stacked.

  Returns:
    ``(predictions, testY)``; ``testY`` is ``settings.SMALL_CHUNK`` ones
    followed by ``settings.SMALL_CHUNK`` zeros.

  Raises:
    AssertionError: if the stacked training/test row counts do not match the
      number of generated labels.
  """
  model_id, driver_id, Model, get_data, repeat = args
  testY = [1] * settings.SMALL_CHUNK + [0] * settings.SMALL_CHUNK

  if settings.ENABLE_CACHE:
    predictions = util.get_results(Model, get_data, driver_id, False, repeat)
    # Anything other than the literal False is treated as a cache hit.
    if predictions is not False:
      return predictions, testY

  # Heading-based data functions get four times as many labels per ride.
  multiplier = 4 if get_data in HEADING_DATA_FUNCTIONS else 1
  split = settings.BIG_CHUNK * multiplier

  trainY = [1] * (split * repeat) + [0] * (split * repeat)
  trainX, testX = get_data(model_id, driver_id, repeat)

  # Rows before `split` are duplicated `repeat` times; the remaining rows are
  # appended once.  issparse() recognises every SciPy sparse container, so no
  # sparse input can fall through to the dense branch, and it avoids the
  # deprecated scipy.sparse.csr / scipy.sparse.coo submodule paths.
  if scipy.sparse.issparse(trainX):
    trainX = scipy.sparse.vstack(
        [trainX[:split]] * repeat + [trainX[split:]]
    )
  else:
    trainX = np.vstack((
        # Tiling the transpose along its last axis and transposing back
        # repeats the rows `repeat` times.
        np.tile(np.array(trainX[:split]).T, repeat).T,
        trainX[split:],
    ))

  assert trainX.shape[0] == len(trainY)
  assert testX.shape[0] == len(testY)

  model = Model(trainX, trainY, driver_id)
  predictions = model.predict(testX)

  if settings.ENABLE_CACHE:
    util.cache_results(Model, get_data, driver_id, False, predictions, repeat)

  return predictions, testY
# Example no. 2
# 0
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
    """Produce out-of-fold predictions for a driver from heading features.

    The driver's own rides (label 1) and a seeded random sample of other
    rides (label 0) are converted to heading strings and vectorised with a
    ``CountVectorizer`` fitted on the first variation of each of the
    driver's rides.  A model is then trained and evaluated once per
    ``KFold`` split.

    Args:
        model_id: Added to ``driver_id`` to seed the random ride sampler.
        driver_id: Driver whose rides are scored; also the KFold
            ``random_state`` and the third argument to ``Model``.
        Model: Called as ``Model(trainX, trainY, driver_id)``; the result
            must provide ``predict(testX)``.
        get_data: Compared against ``get_data_heading_v2`` to choose the
            moving-average window, and forwarded to the cache helper.
        repeat: Number of times the driver's training rows are duplicated;
            also scales how many other rides are sampled.

    Returns:
        ``(driver_id, predictions)`` where ``predictions`` is a NumPy array
        built from a 200-slot list filled at the test-fold indices.
    """
    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()

    # Materialise both ride sources before any feature extraction starts.
    own = list(access.get_rides(driver_id))
    foreign = list(
        access.get_random_rides(
            settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng
        )
    )

    if get_data == get_data_heading_v2:
        window = 6
    else:
        window = 3

    def to_heading_packs(raw_rides):
        """Return the heading variations for every ride, in order."""
        packs = []
        for ride in raw_rides:
            packs.append(
                heading.get_ride_heading(
                    ride, variations=True, moving_average_window=window
                )
            )
        return packs

    def to_string_packs(packs):
        """Turn every variation inside every pack into its string form."""
        return [
            [util.get_list_string(item) for item in pack] for pack in packs
        ]

    own = to_heading_packs(own)
    foreign = to_heading_packs(foreign)
    own = to_string_packs(own)
    foreign = to_string_packs(foreign)

    vectorizer = CountVectorizer(
        min_df=2, ngram_range=(1, 15), max_df=1000000
    )
    # Only the first variation of each of the driver's rides is used to fit.
    vectorizer.fit([pack[0] for pack in own])

    def to_row_packs(packs):
        """Vectorise each string separately, keeping the pack grouping."""
        return [
            [vectorizer.transform([text])[0] for text in pack]
            for pack in packs
        ]

    rides = to_row_packs(own)
    other_rides = list(itertools.chain.from_iterable(to_row_packs(foreign)))
    rides = np.array(rides)

    per_class = settings.BIG_CHUNK_TEST * 4 * repeat
    trainY = [1] * per_class + [0] * per_class

    kf = KFold(
        200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id
    )
    predictions = ['bug'] * 200
    for train_fold, test_fold in kf:
        # Every variation of the training rides, repeated, then the sample.
        positives = list(itertools.chain.from_iterable(rides[train_fold]))
        trainX = scipy.sparse.vstack(positives * repeat + other_rides)
        # Test rides are represented by their first variation only.
        testX = scipy.sparse.vstack([pack[0] for pack in rides[test_fold]])

        assert trainX.shape[0] == len(trainY)
        assert testX.shape[0] == settings.SMALL_CHUNK_TEST

        fitted = Model(trainX, trainY, driver_id)
        fold_predictions = fitted.predict(testX)
        for offset, ride_index in enumerate(test_fold):
            predictions[ride_index] = fold_predictions[offset]

    predictions = np.array(predictions)
    if settings.ENABLE_CACHE:
        util.cache_results(
            Model, get_data, driver_id, True, predictions, repeat
        )
    return driver_id, predictions
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
  """Cross-validate ``Model`` on heading-string features for one driver.

  Rides of ``driver_id`` are the positive class and a random sample of other
  rides (seeded with ``driver_id + model_id``) is the negative class.  Each
  ride is expanded into heading variations, stringified, and vectorised with
  a ``CountVectorizer`` that is fitted on the first variation of the
  driver's rides.

  Args:
    model_id: Combined with ``driver_id`` to seed the ride sampler.
    driver_id: Driver under test; also the KFold ``random_state`` and an
      argument to ``Model``.
    Model: Factory called as ``Model(trainX, trainY, driver_id)``; its result
      must offer ``predict(testX)``.
    get_data: Selects the moving-average window (6 when it equals
      ``get_data_heading_v2``, otherwise 3) and is passed to the cache helper.
    repeat: Duplication factor for the driver's training rows and scale
      factor for the number of sampled rides.

  Returns:
    ``(driver_id, predictions)``; ``predictions`` is a NumPy array created
    from a 200-element list written at the test-fold indices.
  """
  sampler_rng = random.Random(x=driver_id + model_id)
  data_access = DataAccess()

  driver_rides = list(data_access.get_rides(driver_id))
  sample_size = settings.BIG_CHUNK_TEST * repeat
  sampled_rides = list(
      data_access.get_random_rides(sample_size, driver_id, seed=sampler_rng)
  )

  window = 6 if get_data == get_data_heading_v2 else 3
  ride_heading = lambda ride: heading.get_ride_heading(
      ride, variations=True, moving_average_window=window)

  # Stage 1: heading variations (driver first, then the sampled rides).
  driver_rides = list(map(ride_heading, driver_rides))
  sampled_rides = list(map(ride_heading, sampled_rides))

  # Stage 2: string form of every variation, same ordering as stage 1.
  stringify = lambda pack: [util.get_list_string(entry) for entry in pack]
  driver_rides = list(map(stringify, driver_rides))
  sampled_rides = list(map(stringify, sampled_rides))

  vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
  first_variations = [pack[0] for pack in driver_rides]
  vectorizer.fit(first_variations)

  # Stage 3: one transform call per string; packs stay grouped for the driver.
  rides = []
  for pack in driver_rides:
    rides.append([vectorizer.transform([text])[0] for text in pack])

  other_rides = []
  for pack in sampled_rides:
    other_rides.append([vectorizer.transform([text])[0] for text in pack])
  other_rides = list(itertools.chain.from_iterable(other_rides))

  rides = np.array(rides)

  positives = [1] * settings.BIG_CHUNK_TEST * 4 * repeat
  negatives = [0] * settings.BIG_CHUNK_TEST * 4 * repeat
  trainY = positives + negatives

  kf = KFold(200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id)
  predictions = ['bug'] * 200
  for train_fold, test_fold in kf:
    train_rows = list(itertools.chain.from_iterable(rides[train_fold]))
    trainX = scipy.sparse.vstack(train_rows * repeat + other_rides)

    # Only the first variation of each held-out ride is scored.
    test_rows = [pack[0] for pack in rides[test_fold]]
    testX = scipy.sparse.vstack(test_rows)

    assert trainX.shape[0] == len(trainY)
    assert testX.shape[0] == settings.SMALL_CHUNK_TEST

    fold_model = Model(trainX, trainY, driver_id)
    fold_predictions = fold_model.predict(testX)
    for position, target in enumerate(test_fold):
      predictions[target] = fold_predictions[position]

  predictions = np.array(predictions)
  if settings.ENABLE_CACHE:
    util.cache_results(Model, get_data, driver_id, True, predictions, repeat)
  return driver_id, predictions
# Example no. 4
# 0
def test_model(xxx_todo_changeme1):
    """Return out-of-fold predictions for one driver.

    Args:
        xxx_todo_changeme1: The packed tuple
            ``(model_id, driver_id, Model, get_data, repeat)``.

    Returns:
        ``(driver_id, predictions)``.  ``predictions`` comes from the cache
        when caching is enabled and a result is stored, from
        ``test_model_heading`` when ``get_data`` is one of
        ``HEADING_DATA_FUNCTIONS``, and otherwise from the KFold loop below
        (a NumPy array built from a 200-slot list).
    """
    model_id, driver_id, Model, get_data, repeat = xxx_todo_changeme1

    if settings.ENABLE_CACHE:
        cached = util.get_results(Model, get_data, driver_id, True, repeat)
        # Only the literal False counts as "nothing cached".
        if cached is not False:
            return driver_id, cached

    # Heading features have their own evaluation routine.
    if get_data in HEADING_DATA_FUNCTIONS:
        return test_model_heading(model_id, driver_id, Model, get_data, repeat)

    rides, other_rides = get_data(model_id, driver_id, repeat, test=True)

    per_class = settings.BIG_CHUNK_TEST * repeat
    trainY = [1] * per_class + [0] * per_class

    kf = KFold(
        200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id
    )
    predictions = ['bug'] * 200
    for train_fold, test_fold in kf:
        fold_rows = rides[train_fold]
        testX = rides[test_fold]

        if type(fold_rows) not in (
                scipy.sparse.csr.csr_matrix, scipy.sparse.coo.coo_matrix):
            # Dense path: tiling the transpose and transposing back repeats
            # the rows `repeat` times.
            dense_rows = np.array(fold_rows)
            repeated = np.tile(dense_rows.T, repeat).T
            trainX = np.vstack((repeated, other_rides))
        else:
            trainX = scipy.sparse.vstack([fold_rows] * repeat + [other_rides])

        assert trainX.shape[0] == len(trainY)
        assert testX.shape[0] == settings.SMALL_CHUNK_TEST

        fitted = Model(trainX, trainY, driver_id)
        fold_predictions = fitted.predict(testX)
        for offset, ride_index in enumerate(test_fold):
            predictions[ride_index] = fold_predictions[offset]

    predictions = np.array(predictions)
    if settings.ENABLE_CACHE:
        util.cache_results(
            Model, get_data, driver_id, True, predictions, repeat
        )
    return driver_id, predictions
# Example no. 5
# 0
def run_model(xxx_todo_changeme):
    """Fit ``Model`` on a driver's training data and predict the test rows.

    Args:
        xxx_todo_changeme: The packed tuple
            ``(model_id, driver_id, Model, get_data, repeat)``.

    Returns:
        ``(predictions, testY)`` where ``testY`` is ``settings.SMALL_CHUNK``
        ones followed by ``settings.SMALL_CHUNK`` zeros.  When caching is
        enabled and a stored result exists, that result is returned without
        training.
    """
    model_id, driver_id, Model, get_data, repeat = xxx_todo_changeme

    ones = [1] * settings.SMALL_CHUNK
    zeros = [0] * settings.SMALL_CHUNK
    testY = ones + zeros

    if settings.ENABLE_CACHE:
        cached = util.get_results(Model, get_data, driver_id, False, repeat)
        # Only the literal False counts as "nothing cached".
        if cached is not False:
            return cached, testY

    if get_data in HEADING_DATA_FUNCTIONS:
        multiplier = 4
    else:
        multiplier = 1

    per_class = settings.BIG_CHUNK * multiplier * repeat
    trainY = [1] * per_class + [0] * per_class
    trainX, testX = get_data(model_id, driver_id, repeat)

    # Rows before `split` are stacked `repeat` times; the rest follow once.
    split = settings.BIG_CHUNK * multiplier
    if type(trainX) not in (
            scipy.sparse.csr.csr_matrix, scipy.sparse.coo.coo_matrix):
        head = np.array(trainX[:split])
        repeated = np.tile(head.T, repeat).T
        trainX = np.vstack((repeated, trainX[split:]))
    else:
        pieces = [trainX[:split]] * repeat
        pieces = pieces + [trainX[split:]]
        trainX = scipy.sparse.vstack(pieces)

    assert trainX.shape[0] == len(trainY)
    assert testX.shape[0] == len(testY)

    fitted = Model(trainX, trainY, driver_id)
    predictions = fitted.predict(testX)

    if settings.ENABLE_CACHE:
        util.cache_results(
            Model, get_data, driver_id, False, predictions, repeat
        )

    return predictions, testY
def test_model(args):
  """Return out-of-fold predictions for one driver.

  The five inputs arrive packed in a single tuple.  The original signature
  unpacked that tuple inside the parameter list, which is Python 2-only
  syntax (removed by PEP 3113) and a SyntaxError on Python 3.  Unpacking in
  the body accepts exactly the same call: ``test_model((a, b, c, d, e))``.

  Args:
    args: ``(model_id, driver_id, Model, get_data, repeat)`` where
      * ``model_id`` and ``driver_id`` are forwarded to ``get_data``;
        ``driver_id`` is also the KFold ``random_state`` and an argument to
        ``Model`` and the cache helpers.
      * ``Model`` is called as ``Model(trainX, trainY, driver_id)`` and the
        result must expose ``predict(testX)``.
      * ``get_data`` is called as
        ``get_data(model_id, driver_id, repeat, test=True)`` and must return
        ``(rides, other_rides)``; ``rides`` must support index-array lookup.
      * ``repeat`` is how many times each training fold is stacked.

  Returns:
    ``(driver_id, predictions)``.  ``predictions`` is the cached value when
    one exists, the result of ``test_model_heading`` for heading data
    functions, and otherwise a NumPy array built from a 200-slot list filled
    at the test-fold indices.

  Raises:
    AssertionError: if a fold's stacked training rows do not match the label
      count, or its test rows do not number ``settings.SMALL_CHUNK_TEST``.
  """
  model_id, driver_id, Model, get_data, repeat = args

  if settings.ENABLE_CACHE:
    predictions = util.get_results(Model, get_data, driver_id, True, repeat)
    # Anything other than the literal False is treated as a cache hit.
    if predictions is not False:
      return driver_id, predictions

  # Heading features have their own evaluation routine.
  if get_data in HEADING_DATA_FUNCTIONS:
    return test_model_heading(model_id, driver_id, Model, get_data, repeat)

  rides, other_rides = get_data(model_id, driver_id, repeat, test=True)
  per_class = settings.BIG_CHUNK_TEST * repeat
  trainY = [1] * per_class + [0] * per_class

  # NOTE(review): this constructor form (n, n_folds=...) with direct iteration
  # looks like the legacy scikit-learn cross_validation API; confirm against
  # the file's KFold import before upgrading scikit-learn.
  kf = KFold(200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id)
  # The placeholder string makes any slot never written by a fold stand out.
  predictions = ['bug'] * 200
  for train_fold, test_fold in kf:
    trainX = rides[train_fold]
    testX = rides[test_fold]

    # issparse() recognises every SciPy sparse container, so no sparse input
    # can fall through to the dense branch, and it avoids the deprecated
    # scipy.sparse.csr / scipy.sparse.coo submodule paths.
    if scipy.sparse.issparse(trainX):
      trainX = scipy.sparse.vstack([trainX] * repeat + [other_rides])
    else:
      trainX = np.vstack((
          # Tiling the transpose along its last axis and transposing back
          # repeats the rows `repeat` times.
          np.tile(np.array(trainX).T, repeat).T,
          other_rides,
      ))

    assert trainX.shape[0] == len(trainY)
    assert testX.shape[0] == settings.SMALL_CHUNK_TEST

    model = Model(trainX, trainY, driver_id)
    fold_predictions = model.predict(testX)
    for i, v in enumerate(test_fold):
      predictions[v] = fold_predictions[i]

  predictions = np.array(predictions)
  if settings.ENABLE_CACHE:
    util.cache_results(Model, get_data, driver_id, True, predictions, repeat)
  return driver_id, predictions