# Example #1
# 0
class Observation:
    """Load, clean and concatenate per-sensor observation time series.

    One CSV file per suffix is read from ``path`` (named
    ``<reactor_site>-<suffix>.txt``, ';'-separated, two columns: date, value),
    indexed by parsed date, and concatenated column-wise into ``self.df``.
    Isolated wrong values (0 or MAX_VALUE) are blanked and filled; longer
    bad intervals on ``remove_on`` columns are collected for removal.
    """

    def __init__(self, path, reactor_site, suffix_list, format="%Y-%m-%dT%H:%M:%S.000Z",
                 hours_backfill=1, verbose=0, ignore_keys=None, remove_on=None):
        """Build the concatenated, cleaned observation DataFrame.

        Parameters
        ----------
        path : str
            Directory prefix where the per-suffix files live.
        reactor_site : str
            Site identifier used as the file-name prefix.
        suffix_list : list of str
            One suffix per sensor/series; also used as column names.
        format : str
            Datetime format of the 'date' column (name kept for backward
            compatibility even though it shadows the builtin).
        hours_backfill : int
            Bad-value runs no longer than this many hours are NaN-ed and
            filled; longer runs are handled per ``remove_on``.
        verbose : int
            Non-zero enables progress printing.
        ignore_keys : list of str, optional
            Keys skipped when building intervals to remove (deprecated).
            Defaults to an empty list.
        remove_on : list of str, optional
            Columns whose long bad intervals are recorded for removal
            instead of blanked. Defaults to ``[deb1[0]]``.
        """
        # BUG FIX: the original defaults (ignore_keys=[], remove_on=[deb1[0]])
        # were mutable and evaluated once at import time (the latter also
        # freezing the value of the global deb1); resolve them per call.
        if ignore_keys is None:
            ignore_keys = []
        if remove_on is None:
            remove_on = [deb1[0]]
        self.verboseprint = print if verbose else lambda *a, **k: None
        self.verboseprint("Loading in memory %i observations..." % (len(suffix_list),))
        self.hours_backfill = hours_backfill
        files_name = [reactor_site + "-" + suffix + ".txt" for suffix in suffix_list]
        list_df = [pd.read_csv(path + file_name, sep=";") for file_name in files_name]
        self.ignore_keys = ignore_keys  # TODO : remove deprecated
        self.remove_on = remove_on
        for df, tag in zip(list_df, suffix_list):
            df.columns = ["date", tag]
            # Duplicate timestamps would break the column-wise concat below.
            df.drop_duplicates(subset="date", inplace=True)
            df['date'] = pd.to_datetime(df['date'], format=format)
            df.set_index('date', inplace=True)
        self.verboseprint("Concatenation...")
        self.df = pd.concat(list_df, axis=1)
        self.bad_labels_dict = {}
        self.change_isolated_wrong_values()
        self.verboseprint("Forward Filling...")
        # ffill()/bfill() replace the deprecated fillna(method=...) form.
        self.df.ffill(inplace=True)
        self.verboseprint("Backward Filling...")
        self.df.bfill(inplace=True)

        self.compute_intervals_to_remove()
        self.compute_full_concatenated_df()
        self.compute_low_regime_intervals()

    def change_isolated_wrong_values(self):
        """Blank short runs of wrong values; record or blank the long ones.

        A value is "wrong" when it equals MAX_VALUE or 0. Consecutive wrong
        values (within 10 minutes of each other) form an interval. Intervals
        no longer than ``hours_backfill`` hours are set to NaN everywhere
        (to be filled later). Longer intervals are kept in
        ``bad_labels_dict`` for ``remove_on`` columns, NaN-ed otherwise.
        """
        self.verboseprint("Changing isolated wrong values...")
        for column in self.df:
            bad_labels = self.df.index[((self.df[column] == MAX_VALUE) | (self.df[column] == 0))]  ## >= THRESHOLD
            bad_labels = sequence_to_interval(bad_labels, timedelta(minutes=10))  # Strictly consecutive wrong values
            to_change_index = (bad_labels[:, 1] - bad_labels[:, 0]) <= timedelta(hours=self.hours_backfill)
            for begin, end in bad_labels[to_change_index]:
                # BUG FIX: .loc replaces chained indexing df[col][a:b] = ...,
                # which may silently assign to a copy (SettingWithCopy).
                self.df.loc[begin:end, column] = np.nan
            if column in self.remove_on:
                self.bad_labels_dict[column] = bad_labels[~to_change_index]
            else:
                for begin, end in bad_labels[~to_change_index]:
                    self.df.loc[begin:end, column] = np.nan

    def compute_intervals_to_remove(self):
        """Union all recorded long bad intervals, skipping ignored keys."""
        self.intervals_to_remove = Interval([])
        for key, intervals_bad_level in self.bad_labels_dict.items():
            if (key not in self.ignore_keys):
                self.intervals_to_remove.update(intervals_bad_level)

    def compute_low_regime_intervals(self):
        """Detect low-regime periods (signal < 200) on the deb1[0] column.

        Nearby low periods (within 15 days) are merged, intervals adjacent
        to removed data (10-minute margin) are absorbed conditionally, and
        sub-hour spikes are filtered out.
        """
        #time_precision = '10m'#'6H'
        low_regime_merge_time = timedelta(days=15)  # Merging time for low regime
        margin_intervals_to_remove = timedelta(minutes=10)  # Be careful, a high time_precision can make this wrong !
        filter_spike = timedelta(hours=1)  # Below that, the interval is considered as a spike !

        #subsample = self.full_concatenated_df[deb1[0]].resample(time_precision, label='right').min()
        subsample = self.full_concatenated_df[deb1[0]]
        self.low_regime_intervals = sequence_to_interval(subsample.index[(subsample < 200)],
                                                         low_regime_merge_time)
        self.low_regime_intervals = Interval(self.low_regime_intervals)
        self.low_regime_intervals.update_conditionally(
            self.intervals_to_remove.enlarge(margin_intervals_to_remove))
        self.low_regime_intervals.filter(filter_spike)

    def compute_full_concatenated_df(self):
        """Re-assemble the DataFrame from the segments kept between removed intervals."""
        self.full_concatenated_df = pd.concat(self.intervals_to_remove.split_between(self.df), axis=0)