コード例 #1
0
    def predict(self, predictor, columns, data_dict, time_steps_back, lines_reservations=None):
        """
        Prepare the feature rows for the prediction algorithm and generate the prediction.

        One row is built per step in ``self.prediction_steps``. Each row starts
        from the values currently held for the columns found in ``data_dict``,
        reservation counters are added on top, and the clock fields are then
        advanced by 5 minutes for the next row.

        The caller's ``data_dict`` is not modified: the clock is advanced on a
        private copy, so the same dictionary can safely be reused for another call.

        :param predictor: sklearn predictor or other object with a similar interface
        :param columns: List with names of columns that must be in the feature vector
        :param data_dict: Dictionary with time data containing column names and default
            values; must contain 'minute' and 'minute_of_day' (and 'hour' once the
            minute counter rolls over)
        :param time_steps_back: Number of time steps for one input to the prediction algorithm
        :param lines_reservations: List with line reservations for the predicted day generated
            by the get_lines_usage_for_day method. Entry ``k`` covers the 15-minute slot
            starting at minute ``360 + 15 * k``; the last comma-separated element of each
            entry is discarded, so entries are expected to end with a comma.
            ``None`` means no reservations.
        :return: Vector with prediction for the day (288 items)
        """
        max_slots = 64           # reservation slots considered per day
        first_slot_minute = 360  # minute of day at which slot 0 starts
        slot_minutes = 15
        step_minutes = 5

        # Index every 'reserved_*' column once so the per-reservation lookup is O(1).
        reserved_ids = {}
        lines_reserved_id = -1
        for col_id, column in enumerate(columns):
            if column.startswith('reserved_'):
                reserved_ids[column] = col_id
            if column == 'lines_reserved':
                lines_reserved_id = col_id
        # Fallback counter for organisations without a dedicated column (None if absent).
        other_id = reserved_ids.get('reserved_other')

        if lines_reservations is None:
            lines_reservations = [''] * max_slots
        # Never index past the end of a reservation list shorter than max_slots.
        usable_slots = min(max_slots, len(lines_reservations))

        # Work on a copy so the caller's dictionary keeps its original clock values.
        state = dict(data_dict)
        # The set of keys does not change while stepping, only their values do.
        defaulted = [(col_id, column) for col_id, column in enumerate(columns)
                     if column in state]

        data = []
        for _ in range(self.prediction_steps):
            row = [0] * len(columns)
            for col_id, column in defaulted:
                row[col_id] = state[column]

            slot_id = (state['minute_of_day'] - first_slot_minute) // slot_minutes
            if 0 <= slot_id < usable_slots:
                for name in lines_reservations[slot_id].split(',')[:-1]:
                    col_id = reserved_ids.get('reserved_' + name, other_id)
                    if col_id is not None:
                        row[col_id] += 1
                    # The total counter is bumped for every reservation, known or not.
                    if lines_reserved_id >= 0:
                        row[lines_reserved_id] += 1
            data.append(row)

            state['minute'] += step_minutes
            state['minute_of_day'] += step_minutes
            # '>=' (not '==') so a start minute that is not a multiple of 5 still rolls over.
            if state['minute'] >= 60:
                state['minute'] -= 60
                state['hour'] += 1

        day = Day('ts')
        day.data = pd.DataFrame(data, columns=columns)
        # Only the feature part of the returned pair is needed for prediction.
        x, _ = self.dh.get_feature_vectors_from_days([day], [], time_steps_back, 1, True)
        return self.dh.predict_day_from_features(x, predictor, time_steps_back)
コード例 #2
0
    def prepare_days_data(self):
        """
        Load the train/test/validation Days, generating the pickle first if it is missing.

        If ``self.days_data_path`` exists, the three lists are unpickled from it.
        Otherwise ``self.csv_path`` is read and split into consecutive runs of rows
        sharing the same date (first 10 characters of the 'time' column). Every run
        with exactly 288 rows becomes a Day; all other runs are discarded. The days
        are shuffled with ``RANDOM_SEED``, split 40 % / 20 % / 40 % into
        train / validation / test and written to ``self.days_data_path``.

        On both paths ``self.days_train``, ``self.days_test``, ``self.days_valid``
        and ``self.columns`` are set.

        :raises FileNotFoundError: if neither the pickle nor the CSV file exists
        """
        if os.path.isfile(self.days_data_path):
            # NOTE: unpickling can execute arbitrary code; only load files written by this project.
            with open(self.days_data_path, 'rb') as input_file:
                self.days_train, self.days_test, self.days_valid = pickle.load(
                    input_file)
            self.columns = self.days_train[0].data.columns
            return

        if not os.path.isfile(self.csv_path):
            raise FileNotFoundError(
                'Missing days.pickle and dataset.csv.\nGenerate dataset.csv in preprocess_data.py first.'
            )

        print('Preparing days.pickle')
        rows_per_day = 288  # 24 h in 5-minute steps

        data_frame = pd.read_csv(self.csv_path)
        # Zero 'pool' for rows after minute 1320. A single .loc assignment is used
        # because chained 'df[col].iloc[i] = v' is not guaranteed to write through.
        data_frame.loc[data_frame['minute_of_day'] > 1320, 'pool'] = 0

        dates = [timestamp[:10] for timestamp in data_frame['time']]
        n_rows = len(dates)

        days_list = []
        days_stats = [0] * 7
        n_bad_days = 0
        day_start = 0
        # 'pos' is the first row after the current run; pos == n_rows closes the
        # last day, which would otherwise never be emitted.
        for pos in range(1, n_rows + 1):
            if pos < n_rows and dates[pos] == dates[day_start]:
                continue

            date = dates[day_start]
            day_length = pos - day_start
            if day_length == rows_per_day:
                new_day = Day(date)
                new_day.data = data_frame.iloc[day_start:pos]
                days_list.append(new_day)
                # Count the weekday of the stored day itself, not of the day that follows it.
                days_stats[int(data_frame['day_of_week'].iloc[day_start])] += 1
            else:
                # Every incomplete day is dropped, so every one counts as removed.
                n_bad_days += 1
                if abs(day_length - rows_per_day) < 15:
                    # Nearly complete day: report where the 5-minute sequence breaks.
                    print('Error in day %s, length of day is %d'
                          % (date, day_length))
                    expected = 0
                    for value in data_frame['minute_of_day'].iloc[day_start:pos]:
                        if value != expected:
                            print('Should be %d is %d' % (expected, value))
                            expected = value
                        expected += 5
                    print('\n\n')

                # TODO: Most of them have less than 10 missing values.
                # If the missing values are outside opening hours - fill with zeros and use
                # Many other missing values can be filled in
                # Also the change of time from summer to winter makes a 1 hour gap or duplicate hour
                # Move this function to data preprocessing

            day_start = pos

        Random(RANDOM_SEED).shuffle(days_list)
        train_portion = 0.4
        validation_portion = 0.2
        n_days = len(days_list)
        print('Generated %d days. (%d days removed)' % (n_days, n_bad_days))
        print('Number of days from Monday to Sunday', days_stats)

        n_train_days = int(n_days * train_portion)
        n_validation_days = int(n_days * validation_portion)
        split_at = n_train_days + n_validation_days
        train_days = days_list[:n_train_days]
        validation_days = days_list[n_train_days:split_at]
        test_days = days_list[split_at:]

        # Stored order (train, test, validation) must match the unpacking order used when loading.
        with open(self.days_data_path, 'wb') as output_file:
            pickle.dump([train_days, test_days, validation_days], output_file)

        # Leave the instance in the same state as after loading the pickle.
        self.days_train = train_days
        self.days_test = test_days
        self.days_valid = validation_days
        self.columns = data_frame.columns