def predict(self, predictor, columns, data_dict, time_steps_back, lines_reservations=None):
    """
    Prepares feature vectors for a prediction algorithm and generates a prediction.

    One feature row is built per prediction step (``self.prediction_steps`` rows).
    Consecutive rows are 5 minutes apart and start at the time stored in
    ``data_dict``. The caller's ``data_dict`` is not modified.

    :param predictor: sklearn predictor or other object with a similar interface
    :param columns: List with names of columns that must be in the feature vector
    :param data_dict: Dictionary with time data containing column names and default
        values; must contain the keys 'minute', 'minute_of_day' and 'hour'
    :param time_steps_back: Number of time steps for one input to the prediction algorithm
    :param lines_reservations: List with line reservations for the predicted day generated
        by the get_lines_usage_for_day method (one comma-terminated string of
        organisation names per 15-minute slot, 64 slots starting at minute 360);
        None means no reservations
    :return: Vector with prediction for the day (288 items)
    """
    first_slot_minute = 360   # reservation slots start at 06:00
    slot_minutes = 15         # length of one reservation slot
    n_slots = 64              # number of reservation slots per day
    step_minutes = 5          # time distance between two feature rows

    # Work on a copy: the time fields are advanced while the rows are built and
    # the caller's dictionary must stay reusable for further calls.
    time_data = dict(data_dict)

    # Positions of the reservation-related columns inside one feature row.
    reserved_column_ids = {}
    lines_reserved_id = -1
    for column_id, column in enumerate(columns):
        if column.startswith('reserved_'):
            reserved_column_ids[column] = column_id
        if column == 'lines_reserved':
            lines_reserved_id = column_id
    # Fallback column for organisations without a dedicated column (may be absent).
    other_id = reserved_column_ids.get('reserved_other')

    if lines_reservations is None:
        lines_reservations = [''] * n_slots

    data = []
    for _ in range(self.prediction_steps):
        # Columns known to the time data take its value, all others start at 0.
        row = [time_data.get(column, 0) for column in columns]

        slot_id = (time_data['minute_of_day'] - first_slot_minute) // slot_minutes
        if 0 <= slot_id < n_slots:
            # Entries are comma-terminated, so the last split item is dropped.
            for name in lines_reservations[slot_id].split(',')[:-1]:
                target_id = reserved_column_ids.get('reserved_' + name, other_id)
                if target_id is not None:
                    row[target_id] += 1
                if lines_reserved_id >= 0:
                    row[lines_reserved_id] += 1
        data.append(row)

        # Advance the clock to the next step.
        time_data['minute'] += step_minutes
        time_data['minute_of_day'] += step_minutes
        if time_data['minute'] == 60:
            time_data['minute'] = 0
            time_data['hour'] += 1

    day = Day('ts')
    day.data = pd.DataFrame(data, columns=columns)
    # Only the feature part of the returned pair is needed for prediction.
    x, _ = self.dh.get_feature_vectors_from_days([day], [], time_steps_back, 1, True)
    return self.dh.predict_day_from_features(x, predictor, time_steps_back)
def prepare_days_data(self):
    """
    Loads the train/test/validation split of Days, generating it first if needed.

    If ``self.days_data_path`` exists, the split is unpickled from it. Otherwise
    the split is built from the CSV at ``self.csv_path``: rows are grouped into
    consecutive runs sharing the same date (first 10 characters of 'time'), only
    runs with exactly 288 rows are kept, the kept days are shuffled with
    ``RANDOM_SEED`` and split 40 % / 20 % / 40 % into train / validation / test,
    and the result is pickled to ``self.days_data_path``.

    In both cases ``self.days_train``, ``self.days_test``, ``self.days_valid``
    and ``self.columns`` are set.

    :raises Exception: when neither the pickle nor the CSV file exists
    """
    if os.path.isfile(self.days_data_path):
        # NOTE: unpickling can execute arbitrary code; days_data_path must only
        # point at files written by this method.
        with open(self.days_data_path, 'rb') as input_file:
            self.days_train, self.days_test, self.days_valid = pickle.load(input_file)
        self.columns = self.days_train[0].data.columns
        return

    if not os.path.isfile(self.csv_path):
        raise Exception(
            'Missing days.pickle and dataset.csv.\nGenerate dataset.csv in preprocess_data.py first.'
        )

    print('Preparing days.pickle')
    steps_per_day = 288             # 24 hours in 5-minute steps
    days_stats = [0] * 7            # number of kept days per weekday
    days_list = []
    n_bad_days = 0
    last_date = 'start'
    day_start_id = 0

    data_frame = pd.read_csv(self.csv_path)
    # Zero the pool value after minute 1320 (22:00). A single .loc assignment
    # is used because chained ``frame[col].iloc[i] = v`` may write to a copy.
    data_frame.loc[data_frame['minute_of_day'] > 1320, 'pool'] = 0

    for index, timestamp in enumerate(data_frame['time']):
        new_date = timestamp[:10]
        if new_date == last_date:
            continue

        # A new date starts at `index`; rows day_start_id..index-1 form the
        # previous day (nothing to close on the very first row).
        if index > 0:
            day_length = index - day_start_id
            if day_length == steps_per_day:
                new_day = Day(last_date)
                new_day.data = data_frame.iloc[day_start_id:index]
                days_list.append(new_day)
                # Weekday of the day being closed, not of the one that starts.
                days_stats[data_frame['day_of_week'].iloc[day_start_id]] += 1
            elif abs(day_length - steps_per_day) < 15:
                # Nearly complete day: report where its 5-minute grid breaks.
                n_bad_days += 1
                print('Error in day %s, length of day is %d'
                      % (last_date, day_length))
                expected = 0
                for value in data_frame['minute_of_day'].iloc[day_start_id:index]:
                    if not value == expected:
                        print('Should be %d is %d' % (expected, value))
                        expected = value
                    expected += 5
                print('\n\n')
                # TODO: Most of them have less than 10 missing values.
                # If the missing values are out of opening hours - fill with zeros and use
                # Many other missing values can be filled in
                # Also change of time from summer to winter makes 1 hour gap or duplicate hour
                # Move this function to data preprocessing

        last_date = new_date
        day_start_id = index
    # NOTE(review): the last date in the CSV is never closed by a following
    # date change, so it is never added to days_list - confirm this is intended.

    Random(RANDOM_SEED).shuffle(days_list)
    train_portion = 0.4
    validation_portion = 0.2
    n_days = len(days_list)
    print('Generated %d days. (%d days removed)' % (n_days, n_bad_days))
    print('Number of days from Monday to Sunday', days_stats)

    n_train_days = int(n_days * train_portion)
    n_validation_days = int(n_days * validation_portion)
    validation_end = n_train_days + n_validation_days
    train_days = days_list[:n_train_days]
    validation_days = days_list[n_train_days:validation_end]
    test_days = days_list[validation_end:]

    # Stored order (train, test, validation) matches the unpacking order above.
    with open(self.days_data_path, 'wb') as output_file:
        pickle.dump([train_days, test_days, validation_days], output_file)

    # Populate the instance right away so a first run behaves like later runs
    # that load the pickle.
    self.days_train = train_days
    self.days_test = test_days
    self.days_valid = validation_days
    if train_days:
        self.columns = train_days[0].data.columns