def get_data_g_forces_v1(model_id, driver_id, repeat, test=False, min_df=1, ngram_range=(1,10), digitize=0):
    """Bag-of-n-gram g-force features for one driver.

    Builds a training and a testing document set from the driver's rides
    plus random rides from other drivers, then vectorizes both with a
    CountVectorizer fitted on the training documents.

    NOTE(review): this function name is defined twice in this module; the
    later definition shadows this one — confirm which is intended.

    Returns:
        (train_matrix, test_matrix) sparse matrices.
    """
    def ride_to_text(ride, digitize):
        # Serialize the ride's g-force sequence into a space-separated
        # string, optionally bucketing values into bins of width `digitize`.
        forces = util.get_g_forces(ride)
        if digitize:
            forces = np.digitize(forces, range(0, 800, digitize))
        return util.get_list_string(forces)

    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        set1 = list(da.get_rides(driver_id))  # first half of the train set
        set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                        driver_id, seed=rng))  # second half of the train set
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat,
                                               driver_id, seed=rng))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        set1 = driver_train + other_train  # used for training
        set2 = driver_test + other_test  # used for testing

    train_docs = [ride_to_text(ride, digitize) for ride in set1]
    test_docs = [ride_to_text(ride, digitize) for ride in set2]

    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    return vectorizer.fit_transform(train_docs), vectorizer.transform(test_docs)
def segment_driver(driver_id):
    ''' this generated the segments in settings.SEGMENTS_FOLDER[1] '''
    da = DataAccess()
    for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
        ride_id = ride_id_minus_1 + 1  # ride ids are 1-based
        if da.skip_segment(driver_id, ride_id):
            continue

        # apply the Ramer-Douglas-Peucker algorithm
        ride = [p + [i] for i, p in enumerate(smoothen(ride))]  # enrich with timestamp
        ride = rdp(ride, epsilon=10)

        # per-segment measurements between consecutive simplified points
        # (fix: range() instead of the Python-2-only xrange(), matching the
        # segment_driver_v2 implementation elsewhere in this file)
        lengths = [util.euclidian_distance(ride[i-1], ride[i]) for i in range(1, len(ride))]
        times = [ride[i][2] - ride[i-1][2] for i in range(1, len(ride))]
        angles = [util.get_angle(ride[i-2], ride[i-1], ride[i]) for i in range(2, len(ride))]

        # bucket the values
        lengths = util.bucket(np.log(lengths), 25, [2.2,8]) # [int(l) for l in lengths]
        times = util.bucket(np.log(times), 20, [1,5.5]) # [int(t) for t in times]
        angles = util.bucket(angles, 30, [0,180]) # [int(a) for a in angles]

        # write results
        da.write_ride_segments(driver_id, ride_id, lengths, times, angles)
    logging.info('finished segmenting driver %s' % driver_id)
def get_data_movements_accel(model_id, driver_id, repeat, test=False, step=3, tf=False, extra=((1,15),2), version=1):
    """N-gram features over acceleration-movement words (util.build_features4).

    `extra` packs (ngram_range, min_df) for the vectorizer; `tf` selects
    TfidfVectorizer over CountVectorizer.

    Returns:
        (train_matrix, test_matrix) sparse matrices.
    """
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        train_rides = list(da.get_rides(driver_id))
        test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                              driver_id, segments=False, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK,
                                                       segments=False)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat,
                                               driver_id, segments=False, seed=rng))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id,
                                              segments=False))
        train_rides = driver_train + other_train
        test_rides = driver_test + other_test

    train_docs = [util.build_features4(r, step=step, version=version) for r in train_rides]
    test_docs = [util.build_features4(r, step=step, version=version) for r in test_rides]

    vectorizer_cls = TfidfVectorizer if tf else CountVectorizer
    vectorizer = vectorizer_cls(min_df=min_df, ngram_range=ngram_range)
    return vectorizer.fit_transform(train_docs), vectorizer.transform(test_docs)
def get_data_movements_v1(model_id, driver_id, repeat, test=False, step=5, tf=False, version=1, extra=((1, 5), 2)):
    """N-gram features over movement-length words (util.build_features3).

    `extra` packs (ngram_range, min_df) for the vectorizer; `tf` selects
    TfidfVectorizer over CountVectorizer.

    Returns:
        (train_matrix, test_matrix) sparse matrices.
    """
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()
    ngram_range, min_df = extra

    if test:
        train_rides = list(da.get_rides(driver_id))
        test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                              driver_id, segments=False, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK,
                                                       segments=False)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat,
                                               driver_id, segments=False, seed=rng))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id,
                                              segments=False))
        train_rides = driver_train + other_train
        test_rides = driver_test + other_test

    # keep only lengths and convert to text
    train_docs = [util.build_features3(r, step=step, version=version) for r in train_rides]
    test_docs = [util.build_features3(r, step=step, version=version) for r in test_rides]

    vec_cls = TfidfVectorizer if tf else CountVectorizer
    vectorizer = vec_cls(min_df=min_df, ngram_range=ngram_range)
    return vectorizer.fit_transform(train_docs), vectorizer.transform(test_docs)
def test_model_heading(model_id, driver_id, Model, get_data, repeat):
    """Cross-validated evaluation of `Model` on heading n-gram features.

    Each ride is expanded into heading variations (the "four_pack" — the
    `* 4` label math below implies four per ride); all variations are used
    for training, but only the first one for prediction.

    Returns:
        (driver_id, predictions) — one prediction per driver ride
        (200 rides assumed by the KFold and placeholder sizes below).
    """
    seed = random.Random(x=driver_id+model_id)
    da = DataAccess()
    set1 = list(da.get_rides(driver_id))  # first half of the train set
    set2 = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                    driver_id, seed=seed))  # second half of the train set
    # heading v2 gets a wider moving-average smoothing window than the rest
    moving_average_window = 6 if get_data == get_data_heading_v2 else 3
    set1 = [heading.get_ride_heading(ride, variations=True,
                                     moving_average_window=moving_average_window) for ride in set1]
    set2 = [heading.get_ride_heading(ride, variations=True,
                                     moving_average_window=moving_average_window) for ride in set2]
    # serialize each heading variation into a text document for the vectorizer
    set1 = [[util.get_list_string(r) for r in four_pack] for four_pack in set1]
    set2 = [[util.get_list_string(r) for r in four_pack] for four_pack in set2]
    vectorizer = CountVectorizer(min_df=2, ngram_range=(1,15), max_df=1000000)
    # vocabulary is fitted only on the first variation of the driver's rides
    vectorizer.fit([r[0] for r in set1])
    rides = [[vectorizer.transform([r])[0] for r in four_pack] for four_pack in set1]
    other_rides = [[vectorizer.transform([r])[0] for r in four_pack] for four_pack in set2]
    other_rides = list(itertools.chain(*other_rides))  # flatten variations into one list
    rides = np.array(rides)
    # labels: this driver's rides (all variations, repeated) = 1, others = 0
    trainY = [1] * settings.BIG_CHUNK_TEST * 4 * repeat + [0] * settings.BIG_CHUNK_TEST * 4 * repeat
    kf = KFold(200, n_folds=settings.FOLDS, shuffle=True, random_state=driver_id)
    predictions = ['bug'] * 200  # sentinel placeholder; every slot is overwritten in the folds
    for train_fold, test_fold in kf:
        trainX = rides[train_fold]
        # stack every variation of the in-fold driver rides (times `repeat`)
        # followed by the non-driver rides, matching trainY's layout
        trainX = scipy.sparse.vstack(
            list(itertools.chain(*trainX)) * repeat + \
            other_rides
        )
        # predict only on the first variation of the held-out rides
        testX = scipy.sparse.vstack([r[0] for r in rides[test_fold]])
        assert(trainX.shape[0] == len(trainY))
        assert(testX.shape[0] == settings.SMALL_CHUNK_TEST)
        model = Model(trainX, trainY, driver_id)
        fold_predictions = model.predict(testX)
        # scatter fold predictions back to their original ride positions
        for i, v in enumerate(test_fold):
            predictions[v] = fold_predictions[i]
    predictions = np.array(predictions)
    if settings.ENABLE_CACHE:
        util.cache_results(Model, get_data, driver_id, True, predictions, repeat)
    return driver_id, predictions
def get_data_basic_accel(model_id, driver_id, repeat, test=False, version=1):
    """Dense acceleration feature matrices (util.build_features_acc).

    Returns:
        (train_array, test_array) numpy arrays.
    """
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        train_rides = list(da.get_rides(driver_id))
        test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                              driver_id, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat,
                                               driver_id, seed=rng))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        train_rides = driver_train + other_train
        test_rides = driver_test + other_test

    train_feats = [util.build_features_acc(r, version=version) for r in train_rides]
    test_feats = [util.build_features_acc(r, version=version) for r in test_rides]
    return np.array(train_feats), np.array(test_feats)
def get_data_g_forces_v1(model_id, driver_id, repeat, test=False, min_df=1, ngram_range=(1, 10), digitize=0):
    """Bag-of-n-gram g-force features for one driver (train, test matrices).

    NOTE(review): a second, identically-named definition of this function
    exists earlier in this module — confirm which one callers should get.
    """
    def to_document(ride):
        # One text document per ride: the (optionally digitized) g-forces.
        values = util.get_g_forces(ride)
        if digitize:
            values = np.digitize(values, range(0, 800, digitize))
        return util.get_list_string(values)

    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        set1 = list(da.get_rides(driver_id))  # first half of the train set
        set2 = list(
            da.get_random_rides(settings.BIG_CHUNK_TEST * repeat, driver_id,
                                seed=rng))  # second half of the train set
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        other_train = list(
            da.get_random_rides(settings.BIG_CHUNK * repeat, driver_id, seed=rng))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        set1 = driver_train + other_train  # used for training
        set2 = driver_test + other_test  # used for testing

    docs_train = [to_document(ride) for ride in set1]
    docs_test = [to_document(ride) for ride in set2]

    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    docs_train = vectorizer.fit_transform(docs_train)
    docs_test = vectorizer.transform(docs_test)
    return docs_train, docs_test
def get_data_dist_acc(model_id, driver_id, repeat, test=False):
    """N-gram counts over distance/acceleration words (util.get_distance_acc_words).

    Returns:
        (train_matrix, test_matrix) sparse matrices.
    """
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        train_rides = list(da.get_rides(driver_id))
        test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                              driver_id, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat,
                                               driver_id, seed=rng))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        train_rides = driver_train + other_train
        test_rides = driver_test + other_test

    train_docs = [util.get_distance_acc_words(r, step=3) for r in train_rides]
    test_docs = [util.get_distance_acc_words(r, step=3) for r in test_rides]

    vectorizer = CountVectorizer(min_df=1, ngram_range=(1,15))
    return vectorizer.fit_transform(train_docs), vectorizer.transform(test_docs)
def get_data_g_forces_v6(model_id, driver_id, repeat, test=False, version=1):
    """N-gram counts over g-force words (util.get_g_forces_v4).

    Returns:
        (train_matrix, test_matrix) sparse matrices.
    """
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        train_rides = list(da.get_rides(driver_id))  # first half of the train set
        test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                              driver_id, seed=rng))  # second half of the train set
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat,
                                               driver_id, seed=rng))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        train_rides = driver_train + other_train  # used for training
        test_rides = driver_test + other_test  # used for testing

    train_docs = [util.get_g_forces_v4(r, version=version) for r in train_rides]
    test_docs = [util.get_g_forces_v4(r, version=version) for r in test_rides]

    vectorizer = CountVectorizer(min_df=1, ngram_range=(1,20))
    return vectorizer.fit_transform(train_docs), vectorizer.transform(test_docs)
def get_data_fft(model_id, driver_id, repeat, test=False, version=1):
    """FFT-based dense features; version 1 uses util.fft, otherwise util.fft_strip.

    Returns:
        (train_array, test_array) numpy arrays.
    """
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        train_rides = list(da.get_rides(driver_id))
        test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                              driver_id, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat,
                                               driver_id, seed=rng))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        train_rides = driver_train + other_train
        test_rides = driver_test + other_test

    transform = util.fft if version == 1 else util.fft_strip
    return (np.array([transform(r) for r in train_rides]),
            np.array([transform(r) for r in test_rides]))
def segment_driver_v2(driver_id):
    ''' this generated the segments in settings.SEGMENTS_FOLDER[2] '''
    da = DataAccess()
    for idx, ride in enumerate(da.get_rides(driver_id)):
        ride_id = idx + 1  # ride ids are 1-based
        if da.skip_segment(driver_id, ride_id, version=2):
            continue

        # apply the Ramer-Douglas-Peucker algorithm
        ride = [point + [t] for t, point in enumerate(ride)]  # enrich with timestamp
        ride = rdp(ride, epsilon=4)

        # per-segment measurements between consecutive simplified points
        lengths, times, angles = [], [], []
        for i in range(1, len(ride)):
            lengths.append(util.euclidian_distance(ride[i - 1], ride[i]))
            times.append(ride[i][2] - ride[i - 1][2])
        for i in range(2, len(ride)):
            angles.append(util.get_angle(ride[i - 2], ride[i - 1], ride[i]))

        # histogram into fixed bins; the huge last edge acts as a catch-all
        lengths = np.histogram(lengths, bins=list(range(0, 700, 20)) + [1000000000])[0]
        times = np.histogram(times, bins=list(range(0, 60, 4)) + [1000000000])[0]
        angles = np.histogram(angles, bins=list(range(0, 181, 20)))[0]

        # write results
        da.write_ride_segments(driver_id, ride_id, lengths, times, angles, version=2)
    logging.info('finished segmenting driver %s' % driver_id)
def get_data_basic(model_id, driver_id, repeat, test=False, normalized=False, version=1):
    """Dense basic feature matrices (util.build_features).

    Returns:
        (train_array, test_array) numpy arrays.
    """
    rng = random.Random(x=driver_id + model_id)
    da = DataAccess()

    if test:
        train_rides = list(da.get_rides(driver_id))
        test_rides = list(da.get_random_rides(settings.BIG_CHUNK_TEST * repeat,
                                              driver_id, seed=rng))
    else:
        driver_train, driver_test = da.get_rides_split(driver_id, settings.BIG_CHUNK)
        other_train = list(da.get_random_rides(settings.BIG_CHUNK * repeat,
                                               driver_id, seed=rng))
        other_test = list(da.get_random_rides(settings.SMALL_CHUNK, driver_id))
        train_rides = driver_train + other_train
        test_rides = driver_test + other_test

    def featurize(ride):
        # single place to keep the build_features call arguments consistent
        return util.build_features(ride, normalized=normalized, version=version)

    return (np.array([featurize(r) for r in train_rides]),
            np.array([featurize(r) for r in test_rides]))
def segment_driver_v2(driver_id):
    ''' this generated the segments in settings.SEGMENTS_FOLDER[2] '''
    da = DataAccess()
    for ride_id_minus_1, ride in enumerate(da.get_rides(driver_id)):
        ride_id = ride_id_minus_1 + 1  # ride ids are 1-based
        if da.skip_segment(driver_id, ride_id, version=2):
            continue

        # apply the Ramer-Douglas-Peucker algorithm
        ride = [p + [i] for i, p in enumerate(ride)]  # enrich with timestamp
        ride = rdp(ride, epsilon=4)

        # per-segment measurements between consecutive simplified points.
        # Python-3 fixes: xrange -> range, and the histogram bins below need
        # an explicit list() because a range object cannot be concatenated
        # with a list (this matches the other segment_driver_v2 definition).
        lengths = [util.euclidian_distance(ride[i-1], ride[i]) for i in range(1, len(ride))]
        times = [ride[i][2] - ride[i-1][2] for i in range(1, len(ride))]
        angles = [util.get_angle(ride[i-2], ride[i-1], ride[i]) for i in range(2, len(ride))]

        # histogram into fixed bins; the huge last edge acts as a catch-all
        lengths = np.histogram(lengths, bins=list(range(0, 700, 20)) + [1000000000])[0]
        times = np.histogram(times, bins=list(range(0, 60, 4)) + [1000000000])[0]
        angles = np.histogram(angles, bins=list(range(0, 181, 20)))[0]

        # write results
        da.write_ride_segments(driver_id, ride_id, lengths, times, angles, version=2)
    logging.info('finished segmenting driver %s' % driver_id)