def run_model(args):
    """Train ``Model`` for one driver and predict labels for its test rows.

    The five arguments arrive packed in a single tuple and are unpacked in
    the body (tuple parameters in the ``def`` line are not valid Python 3).

    Args:
        args: ``(model_id, driver_id, Model, get_data, repeat)``.
            ``get_data(model_id, driver_id, repeat)`` must return
            ``(trainX, testX)``; ``Model(trainX, trainY, driver_id)`` must
            return an object exposing ``predict``; ``repeat`` is the number
            of times the leading chunk of ``trainX`` is stacked.

    Returns:
        ``(predictions, testY)``. ``testY`` is ``settings.SMALL_CHUNK`` ones
        followed by ``settings.SMALL_CHUNK`` zeros. When caching is enabled
        and ``util.get_results`` returns anything other than ``False``, that
        value is returned as ``predictions`` without training.
    """
    model_id, driver_id, Model, get_data, repeat = args

    testY = [1] * settings.SMALL_CHUNK + [0] * settings.SMALL_CHUNK

    if settings.ENABLE_CACHE:
        predictions = util.get_results(Model, get_data, driver_id, False, repeat)
        # ``False`` is the cache-miss marker; any other value is a hit.
        if predictions is not False:
            return predictions, testY

    # Heading data functions yield four rows per ride instead of one.
    multiplier = 4 if get_data in HEADING_DATA_FUNCTIONS else 1
    chunk = settings.BIG_CHUNK * multiplier
    trainY = [1] * chunk * repeat + [0] * chunk * repeat

    trainX, testX = get_data(model_id, driver_id, repeat)

    # The first ``chunk`` rows (labelled 1) are repeated ``repeat`` times; the
    # remaining rows (labelled 0) are appended once. Sparse matrices are
    # stacked with scipy, everything else with numpy.
    if isinstance(trainX, (scipy.sparse.csr_matrix, scipy.sparse.coo_matrix)):
        trainX = scipy.sparse.vstack(
            [trainX[:chunk]] * repeat + [trainX[chunk:]]
        )
    else:
        # For 2-D input, tiling the transpose along its last axis and
        # transposing back repeats the rows ``repeat`` times.
        trainX = np.vstack((
            np.tile(np.array(trainX[:chunk]).T, repeat).T,
            trainX[chunk:],
        ))

    assert trainX.shape[0] == len(trainY)
    assert testX.shape[0] == len(testY)

    model = Model(trainX, trainY, driver_id)
    predictions = model.predict(testX)

    if settings.ENABLE_CACHE:
        util.cache_results(Model, get_data, driver_id, False, predictions, repeat)

    return predictions, testY
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
    """Cross-validate ``Model`` on heading n-gram features for one driver.

    Each ride is turned into a pack of heading strings (the label count
    below assumes four per ride -- TODO confirm against
    ``heading.get_ride_heading``), vectorised with a ``CountVectorizer``
    fitted on the first string of each of the driver's own rides, and then
    scored fold by fold.

    Args:
        model_id: Added to ``driver_id`` to seed the random ride selection.
        driver_id: Driver whose rides form the positive class; also used as
            the ``KFold`` random state.
        Model: Callable ``Model(trainX, trainY, driver_id)`` returning an
            object exposing ``predict``.
        get_data: Only compared against ``get_data_heading_v2`` to choose the
            moving-average window, and passed through to the cache.
        repeat: How many times the driver's training rows are stacked, and
            the multiplier for the number of random rides requested.

    Returns:
        ``(driver_id, predictions)`` where ``predictions`` is a numpy array
        with one entry per ride (200 slots).
    """

    def to_headings(raw_rides, window):
        # One pack of heading variations per ride.
        return [
            heading.get_ride_heading(
                ride, variations=True, moving_average_window=window)
            for ride in raw_rides
        ]

    def to_strings(packs):
        return [[util.get_list_string(item) for item in pack] for pack in packs]

    def to_vectors(packs, fitted):
        # ``transform`` returns a one-row matrix; ``[0]` keeps that row.
        return [[fitted.transform([text])[0] for text in pack] for pack in packs]

    rng = random.Random(x=driver_id + model_id)
    access = DataAccess()
    # first half of the train set
    own = list(access.get_rides(driver_id))
    # second half of the train set
    others = list(access.get_random_rides(
        settings.BIG_CHUNK_TEST * repeat, driver_id, seed=rng))

    if get_data == get_data_heading_v2:
        window = 6
    else:
        window = 3

    own = to_headings(own, window)
    others = to_headings(others, window)
    own = to_strings(own)
    others = to_strings(others)

    # The vocabulary is learnt only from the first string of each own ride.
    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
    vectorizer.fit([pack[0] for pack in own])

    rides = to_vectors(own, vectorizer)
    other_rides = to_vectors(others, vectorizer)
    other_rides = list(itertools.chain.from_iterable(other_rides))
    rides = np.array(rides)

    per_class = settings.BIG_CHUNK_TEST * 4 * repeat
    trainY = [1] * per_class + [0] * per_class

    folds = KFold(200, n_folds=settings.FOLDS, shuffle=True,
                  random_state=driver_id)
    # Slots never written by a fold keep the 'bug' placeholder.
    predictions = ['bug'] * 200

    for train_idx, test_idx in folds:
        fold_rides = rides[train_idx]
        # Every variation of the training rides, repeated, then random rides.
        trainX = scipy.sparse.vstack(
            list(itertools.chain.from_iterable(fold_rides)) * repeat
            + other_rides
        )
        # Held-out rides are scored on their first variation only.
        testX = scipy.sparse.vstack([pack[0] for pack in rides[test_idx]])

        assert trainX.shape[0] == len(trainY)
        assert testX.shape[0] == settings.SMALL_CHUNK_TEST

        fold_predictions = Model(trainX, trainY, driver_id).predict(testX)
        for offset, ride_index in enumerate(test_idx):
            predictions[ride_index] = fold_predictions[offset]

    predictions = np.array(predictions)

    if settings.ENABLE_CACHE:
        util.cache_results(Model, get_data, driver_id, True, predictions, repeat)

    return driver_id, predictions
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
    """Run K-fold evaluation of ``Model`` on heading n-gram count vectors.

    The driver's own rides are labelled 1 and a random sample of rides is
    labelled 0. Rides are converted to heading strings, vectorised, and each
    fold trains on the remaining own rides (all variations, stacked
    ``repeat`` times) plus every random ride, then predicts the first
    variation of the held-out rides.

    Args:
        model_id: Combined with ``driver_id`` to seed the random sampler.
        driver_id: Driver under evaluation; also the ``KFold`` random state.
        Model: Called as ``Model(trainX, trainY, driver_id)``; the result
            must provide ``predict``.
        get_data: Selects the moving-average window (6 when it equals
            ``get_data_heading_v2``, otherwise 3) and keys the cache entry.
        repeat: Stack count for the driver's rows and multiplier for the
            number of random rides fetched.

    Returns:
        Tuple ``(driver_id, predictions)``; ``predictions`` is a numpy array
        built from a 200-slot list.
    """
    seed = random.Random(x=driver_id + model_id)
    da = DataAccess()

    # first half of the train set
    driver_rides = list(da.get_rides(driver_id))
    # second half of the train set
    n_random = settings.BIG_CHUNK_TEST * repeat
    random_rides = list(da.get_random_rides(n_random, driver_id, seed=seed))

    moving_average_window = 3
    if get_data == get_data_heading_v2:
        moving_average_window = 6

    # Stage 1: headings (a pack of variations per ride), own rides first.
    driver_headings = [
        heading.get_ride_heading(
            ride,
            variations=True,
            moving_average_window=moving_average_window,
        )
        for ride in driver_rides
    ]
    random_headings = [
        heading.get_ride_heading(
            ride,
            variations=True,
            moving_average_window=moving_average_window,
        )
        for ride in random_rides
    ]

    # Stage 2: every variation becomes a string the vectorizer can consume.
    driver_texts = [
        [util.get_list_string(variation) for variation in pack]
        for pack in driver_headings
    ]
    random_texts = [
        [util.get_list_string(variation) for variation in pack]
        for pack in random_headings
    ]

    # Stage 3: fit on the first variation of the driver's rides only, then
    # transform every variation of both sets into a one-row sparse matrix.
    vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 15), max_df=1000000)
    first_variations = [pack[0] for pack in driver_texts]
    vectorizer.fit(first_variations)

    rides = [
        [vectorizer.transform([text])[0] for text in pack]
        for pack in driver_texts
    ]
    other_packs = [
        [vectorizer.transform([text])[0] for text in pack]
        for pack in random_texts
    ]
    other_rides = [row for pack in other_packs for row in pack]
    rides = np.array(rides)

    positives = [1] * settings.BIG_CHUNK_TEST * 4 * repeat
    negatives = [0] * settings.BIG_CHUNK_TEST * 4 * repeat
    trainY = positives + negatives

    kf = KFold(
        200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id)
    # 'bug' marks any slot that no fold fills in.
    predictions = ['bug'] * 200

    for train_fold, test_fold in kf:
        train_rows = [row for pack in rides[train_fold] for row in pack]
        trainX = scipy.sparse.vstack(train_rows * repeat + other_rides)

        test_rows = [pack[0] for pack in rides[test_fold]]
        testX = scipy.sparse.vstack(test_rows)

        assert trainX.shape[0] == len(trainY)
        assert testX.shape[0] == settings.SMALL_CHUNK_TEST

        model = Model(trainX, trainY, driver_id)
        fold_predictions = model.predict(testX)

        # Scatter fold results back to the rides' original positions.
        for position, ride_index in enumerate(test_fold):
            predictions[ride_index] = fold_predictions[position]

    predictions = np.array(predictions)

    if settings.ENABLE_CACHE:
        util.cache_results(Model, get_data, driver_id, True, predictions, repeat)

    return driver_id, predictions
def test_model(xxx_todo_changeme1):
    """Cross-validate ``Model`` on one driver's rides.

    Args:
        xxx_todo_changeme1: Tuple
            ``(model_id, driver_id, Model, get_data, repeat)``.
            ``get_data(model_id, driver_id, repeat, test=True)`` must return
            ``(rides, other_rides)``; ``Model(trainX, trainY, driver_id)``
            must return an object exposing ``predict``.

    Returns:
        ``(driver_id, predictions)``. A cached result is returned as-is when
        caching is enabled and ``util.get_results`` does not return
        ``False``. Heading data functions are delegated to
        ``test_model_heading``.
    """
    model_id, driver_id, Model, get_data, repeat = xxx_todo_changeme1

    if settings.ENABLE_CACHE:
        cached = util.get_results(Model, get_data, driver_id, True, repeat)
        # ``False`` signals a cache miss.
        if cached is not False:
            return driver_id, cached

    if get_data in HEADING_DATA_FUNCTIONS:
        return test_model_heading(model_id, driver_id, Model, get_data, repeat)

    rides, other_rides = get_data(model_id, driver_id, repeat, test=True)

    positives = [1] * settings.BIG_CHUNK_TEST * repeat
    negatives = [0] * settings.BIG_CHUNK_TEST * repeat
    trainY = positives + negatives

    folds = KFold(
        200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id)
    # Slots that no fold overwrites keep the 'bug' placeholder.
    predictions = ['bug'] * 200

    for train_idx, test_idx in folds:
        fold_train = rides[train_idx]
        testX = rides[test_idx]

        # Training rides are stacked ``repeat`` times, followed by the other
        # rides once; sparse input goes through scipy, the rest through numpy.
        if type(fold_train) in (
                scipy.sparse.csr.csr_matrix, scipy.sparse.coo.coo_matrix):
            pieces = [fold_train] * repeat
            pieces.append(other_rides)
            trainX = scipy.sparse.vstack(pieces)
        else:
            repeated = np.tile(np.array(fold_train).T, repeat).T
            trainX = np.vstack((repeated, other_rides))

        assert trainX.shape[0] == len(trainY)
        assert testX.shape[0] == settings.SMALL_CHUNK_TEST

        fold_predictions = Model(trainX, trainY, driver_id).predict(testX)
        for offset, ride_index in enumerate(test_idx):
            predictions[ride_index] = fold_predictions[offset]

    predictions = np.array(predictions)

    if settings.ENABLE_CACHE:
        util.cache_results(Model, get_data, driver_id, True, predictions, repeat)

    return driver_id, predictions
def run_model(xxx_todo_changeme):
    """Fit ``Model`` on one driver's training data and predict its test set.

    Args:
        xxx_todo_changeme: Tuple
            ``(model_id, driver_id, Model, get_data, repeat)``.
            ``get_data(model_id, driver_id, repeat)`` must return
            ``(trainX, testX)``; ``Model(trainX, trainY, driver_id)`` must
            return an object exposing ``predict``.

    Returns:
        ``(predictions, testY)`` where ``testY`` is ``settings.SMALL_CHUNK``
        ones followed by ``settings.SMALL_CHUNK`` zeros. With caching enabled,
        a value other than ``False`` from ``util.get_results`` is returned
        directly as ``predictions``.
    """
    model_id, driver_id, Model, get_data, repeat = xxx_todo_changeme

    half = settings.SMALL_CHUNK
    testY = [1] * half + [0] * half

    if settings.ENABLE_CACHE:
        cached = util.get_results(Model, get_data, driver_id, False, repeat)
        # ``False`` signals a cache miss.
        if cached is not False:
            return cached, testY

    # Heading data functions produce four rows per ride.
    if get_data in HEADING_DATA_FUNCTIONS:
        multiplier = 4
    else:
        multiplier = 1
    split = settings.BIG_CHUNK * multiplier
    trainY = [1] * split * repeat + [0] * split * repeat

    trainX, testX = get_data(model_id, driver_id, repeat)

    # Rows before ``split`` are stacked ``repeat`` times; the rest follow once.
    if type(trainX) in (
            scipy.sparse.csr.csr_matrix, scipy.sparse.coo.coo_matrix):
        blocks = [trainX[:split]] * repeat
        blocks.append(trainX[split:])
        trainX = scipy.sparse.vstack(blocks)
    else:
        repeated = np.tile(np.array(trainX[:split]).T, repeat).T
        trainX = np.vstack((repeated, trainX[split:]))

    assert trainX.shape[0] == len(trainY)
    assert testX.shape[0] == len(testY)

    predictions = Model(trainX, trainY, driver_id).predict(testX)

    if settings.ENABLE_CACHE:
        util.cache_results(Model, get_data, driver_id, False, predictions, repeat)

    return predictions, testY
def test_model(args):
    """Cross-validate ``Model`` on one driver's rides and collect predictions.

    The five arguments arrive packed in a single tuple and are unpacked in
    the body (tuple parameters in the ``def`` line are not valid Python 3).

    Args:
        args: ``(model_id, driver_id, Model, get_data, repeat)``.
            ``get_data(model_id, driver_id, repeat, test=True)`` must return
            ``(rides, other_rides)``; ``Model(trainX, trainY, driver_id)``
            must return an object exposing ``predict``; ``repeat`` is the
            number of times each fold's training rides are stacked.

    Returns:
        ``(driver_id, predictions)``. When caching is enabled and
        ``util.get_results`` returns anything other than ``False``, that
        value is returned as ``predictions``. When ``get_data`` is one of
        ``HEADING_DATA_FUNCTIONS`` the work is delegated to
        ``test_model_heading``.
    """
    model_id, driver_id, Model, get_data, repeat = args

    if settings.ENABLE_CACHE:
        predictions = util.get_results(Model, get_data, driver_id, True, repeat)
        # ``False`` is the cache-miss marker; any other value is a hit.
        if predictions is not False:
            return driver_id, predictions

    if get_data in HEADING_DATA_FUNCTIONS:
        return test_model_heading(model_id, driver_id, Model, get_data, repeat)

    rides, other_rides = get_data(model_id, driver_id, repeat, test=True)
    trainY = (
        [1] * settings.BIG_CHUNK_TEST * repeat
        + [0] * settings.BIG_CHUNK_TEST * repeat
    )

    kf = KFold(200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id)
    # Slots that no fold overwrites keep the 'bug' placeholder.
    predictions = ['bug'] * 200

    for train_fold, test_fold in kf:
        trainX = rides[train_fold]
        testX = rides[test_fold]

        # Training rides (labelled 1) are stacked ``repeat`` times, followed
        # by the other rides (labelled 0). Sparse matrices are stacked with
        # scipy, everything else with numpy.
        if isinstance(trainX, (scipy.sparse.csr_matrix, scipy.sparse.coo_matrix)):
            trainX = scipy.sparse.vstack([trainX] * repeat + [other_rides])
        else:
            # For 2-D input, tiling the transpose along its last axis and
            # transposing back repeats the rows ``repeat`` times.
            trainX = np.vstack((
                np.tile(np.array(trainX).T, repeat).T,
                other_rides,
            ))

        assert trainX.shape[0] == len(trainY)
        assert testX.shape[0] == settings.SMALL_CHUNK_TEST

        model = Model(trainX, trainY, driver_id)
        fold_predictions = model.predict(testX)

        # Scatter fold results back to the rides' original positions.
        for i, v in enumerate(test_fold):
            predictions[v] = fold_predictions[i]

    predictions = np.array(predictions)

    if settings.ENABLE_CACHE:
        util.cache_results(Model, get_data, driver_id, True, predictions, repeat)

    return driver_id, predictions