def finalize(self):
        """ Merges together any nonzero sections which span multiple segments.
        Whether there are gaps in between does not matter.

        Returns
        -------
        sections : TimeFrameGroup (a subclass of Python's list class)
        """

        # Merge the results of all chunks
        starts = []
        ends = []
        for index, row in self._data.iterrows():
            starts.append(row['sections']['start'])
            ends.append(row['sections']['end'])

        if len(starts) == 0 == len(ends):
            self._data = TimeFrameGroup()
            return

        starts = pd.concat(starts)
        ends = pd.concat(ends)

        rate = pd.Timedelta(seconds=self.max_sample_rate)
        self._data = TimeFrameGroup(starts_and_ends={
            'starts': starts,
            'ends': ends
        })  #.merge_shorter_gaps_than(rate) TODO: Merge needed?
    def import_from_cache(self, cached_stat, sections):
        ''' Loads the cached statistic from the nilmtk.DataStore into this
        Results instance.

        Note
        ----
        I do not know whether this is still an issue:
        THIS IS THE PROBLEM WHEN READING THE STATISTICS!
        They are stored chunk-wise, but this code relies on the whole
        section being present as a single piece.
        '''
        self._data = TimeFrameGroup(cached_stat)
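A minimal, self-contained sketch (pandas only, with made-up chunk data) of the concatenation step performed in finalize() above: the per-chunk 'start' and 'end' Series are stitched into one table of sections. The TimeFrameGroup constructor itself is not reproduced here.

import pandas as pd

# Hypothetical per-chunk results, mirroring row['sections']['start'] / ['end']
chunk_a = {'start': pd.Series(pd.to_datetime(['2015-01-01 00:00', '2015-01-01 06:00'])),
           'end':   pd.Series(pd.to_datetime(['2015-01-01 03:00', '2015-01-01 09:00']))}
chunk_b = {'start': pd.Series(pd.to_datetime(['2015-01-02 00:00'])),
           'end':   pd.Series(pd.to_datetime(['2015-01-02 12:00']))}

starts = pd.concat([chunk_a['start'], chunk_b['start']], ignore_index=True)
ends = pd.concat([chunk_a['end'], chunk_b['end']], ignore_index=True)

# TimeFrameGroup(starts_and_ends=...) would be built from these two Series;
# here we just show them side by side.
sections = pd.DataFrame({'section_start': starts, 'section_end': ends})
print(sections)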
Example #3
    def finalize(self):
        """Merges together any good sections which span multiple segments,
        as long as those segments are adjacent 
        (previous.end - max_sample_period <= next.start <= previous.end).
        This may happen if we merge cached sections and non-cached sections.

        Returns
        -------
        sections : TimeFrameGroup (a subclass of Python's list class)
        """
        sections = TimeFrameGroup()
        end_date_of_prev_row = None
        for index, row in self._data.iterrows():
            row_sections = row['sections']

            # Check if first TimeFrame of row_sections needs to be merged with
            # last TimeFrame of previous section
            if (end_date_of_prev_row is not None):

                rows_are_adjacent = (
                    (end_date_of_prev_row - self.max_sample_period_td) <= index
                    <= end_date_of_prev_row)

                if rows_are_adjacent and row_sections[0].start is None:
                    assert sections[-1].end is None
                    # Note: take care to actually overwrite the stored end here!
                    sections._df.iloc[-1, 1] = row_sections[0].end
                    row_sections.pop(0)
                else:
                    # row_sections[0] and sections[-1] were not in adjacent chunks
                    # so check if they are both open-ended and close them...
                    if sections and sections[-1].end is None:
                        try:
                            sections[-1].end = end_date_of_prev_row
                        except ValueError:  # end_date_of_prev_row before sections[-1].start
                            pass
                    if row_sections and row_sections[0].start is None:
                        try:
                            row_sections[0].start = index
                        except ValueError:
                            pass

            end_date_of_prev_row = row['end']
            sections.extend(row_sections)

        if sections and sections.count() > 0:
            sections[-1].include_end = True
            if sections[-1].end is None:
                # Note: take care to actually overwrite the stored end here!
                sections._df.iloc[-1, 1] = end_date_of_prev_row

        sections._df.reset_index(drop=True, inplace=True)
        self._data = sections
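The merge decision above hinges on one adjacency test: the next chunk may only be merged if its start falls within one max_sample_period of the previous chunk's end. A tiny illustrative check with invented timestamps:

import pandas as pd

max_sample_period_td = pd.Timedelta(seconds=30)
end_date_of_prev_row = pd.Timestamp('2015-01-01 10:00:00')
next_chunk_start = pd.Timestamp('2015-01-01 09:59:50')

# Same chained comparison as in the code above
rows_are_adjacent = ((end_date_of_prev_row - max_sample_period_td)
                     <= next_chunk_start <= end_date_of_prev_row)
print(rows_are_adjacent)  # True: the 10 s gap is within one max sample period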
Example #4
    def combined(self):
        """Merges together any good sections which span multiple segments,
        as long as those segments are adjacent 
        (previous.end - max_sample_period <= next.start <= previous.end).

        Returns
        -------
        sections : TimeFrameGroup (a subclass of Python's list class)
        """
        sections = TimeFrameGroup()
        end_date_of_prev_row = None
        for index, row in self._data.iterrows():
            row_sections = row['sections']

            # Check if first TimeFrame of row_sections needs to be merged with
            # last TimeFrame of previous section
            if (end_date_of_prev_row is not None):

                rows_are_adjacent = (
                    (end_date_of_prev_row - self.max_sample_period_td)
                    <= index <=
                    end_date_of_prev_row)

                if rows_are_adjacent and row_sections[0].start is None:
                    assert sections[-1].end is None
                    sections[-1].end = row_sections[0].end
                    row_sections.pop(0)
                else:
                    # row_sections[0] and sections[-1] were not in adjacent chunks
                    # so check if they are both open-ended and close them...
                    if sections and sections[-1].end is None:
                        try:
                            sections[-1].end = end_date_of_prev_row
                        except ValueError: # end_date_of_prev_row before sections[-1].start
                            pass
                    if row_sections and row_sections[0].start is None:
                        try:
                            row_sections[0].start = index
                        except ValueError:
                            pass
                
            end_date_of_prev_row = row['end']
            sections.extend(row_sections)

        if sections:
            sections[-1].include_end = True
            if sections[-1].end is None:
                sections[-1].end = end_date_of_prev_row

        return sections
Example #5
    def combined(self):
        """Merges together any good sections which span multiple segments,
        as long as those segments are adjacent 
        (previous.end - max_sample_period <= next.start <= previous.end).

        Returns
        -------
        sections : TimeFrameGroup (a subclass of Python's list class)
        """
        sections = TimeFrameGroup()
        end_date_of_prev_row = None
        for index, row in self._data.iterrows():
            row_sections = row['sections']

            # Check if first TimeFrame of row_sections needs to be merged with
            # last TimeFrame of previous section
            if (end_date_of_prev_row is not None):

                rows_are_adjacent = (
                    (end_date_of_prev_row - self.max_sample_period_td) <= index
                    <= end_date_of_prev_row)

                if rows_are_adjacent and row_sections[0].start is None:
                    assert sections[-1].end is None
                    sections[-1].end = row_sections[0].end
                    row_sections.pop(0)
                else:
                    # row_sections[0] and sections[-1] were not in adjacent chunks
                    # so check if they are both open-ended and close them...
                    if sections and sections[-1].end is None:
                        try:
                            sections[-1].end = end_date_of_prev_row
                        except ValueError:  # end_date_of_prev_row before sections[-1].start
                            pass
                    if row_sections and row_sections[0].start is None:
                        try:
                            row_sections[0].start = index
                        except ValueError:
                            pass

            end_date_of_prev_row = row['end']
            sections.extend(row_sections)

        if sections:
            sections[-1].include_end = True
            if sections[-1].end is None:
                sections[-1].end = end_date_of_prev_row

        return sections
Example #6
    def load(self, key, columns=None, sections=None, n_look_ahead_rows=0,
             chunksize=MAX_MEM_ALLOWANCE_IN_BYTES):
             
        file_path = self._key_to_abs_path(key)
        
        # Set `sections` variable
        sections = [TimeFrame()] if sections is None else sections
        sections = TimeFrameGroup(sections)

        self.all_sections_smaller_than_chunksize = True
        
        # iterate through parameter sections
        # requires 1 pass through file for each section
        for section in sections:
            window_intersect = self.window.intersection(section)
            header_rows = [0,1]
            text_file_reader = pd.read_csv(file_path, 
                                            index_col=0, 
                                            header=header_rows, 
                                            parse_dates=True,
                                            chunksize=chunksize)
                                            
            # iterate through all chunks in file
            for chunk_idx, chunk in enumerate(text_file_reader):
                
                # filter dataframe by specified columns
                if columns:
                    chunk = chunk[columns]
                
                # mask chunk by window and section intersect
                subchunk_idx = [True]*len(chunk)
                if window_intersect.start:
                    subchunk_idx = np.logical_and(subchunk_idx, (chunk.index>=window_intersect.start))
                if window_intersect.end:
                    subchunk_idx = np.logical_and(subchunk_idx, (chunk.index<window_intersect.end))
                if window_intersect.empty:
                    subchunk_idx = [False]*len(chunk)
                subchunk = chunk[subchunk_idx]
                
                if len(subchunk)>0:
                    subchunk_end = np.max(np.nonzero(subchunk_idx))
                    subchunk.timeframe = TimeFrame(subchunk.index[0], subchunk.index[-1])
                    # Load look ahead if necessary
                    if n_look_ahead_rows > 0:
                        if len(subchunk.index) > 0:
                            rows_to_skip = (len(header_rows)+1)+(chunk_idx*chunksize)+subchunk_end+1
                            try:
                                subchunk.look_ahead = pd.read_csv(file_path, 
                                                index_col=0, 
                                                header=None, 
                                                parse_dates=True,
                                                skiprows=rows_to_skip,
                                                nrows=n_look_ahead_rows)
                            except ValueError:
                                subchunk.look_ahead = pd.DataFrame()
                        else:
                            subchunk.look_ahead = pd.DataFrame()
                    
                    yield subchunk
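The loader above is a generator that yields window-masked sub-chunks of a large CSV. A stripped-down sketch of the same pattern (pandas only; the file path and window bounds are placeholders):

import pandas as pd

def load_csv_sections(file_path, window_start, window_end, chunksize=10000):
    """Yield chunks of a timestamp-indexed CSV restricted to [window_start, window_end)."""
    reader = pd.read_csv(file_path, index_col=0, parse_dates=True, chunksize=chunksize)
    for chunk in reader:
        # Keep only the rows falling inside the requested window
        mask = (chunk.index >= window_start) & (chunk.index < window_end)
        subchunk = chunk[mask]
        if len(subchunk) > 0:
            yield subchunk

# Hypothetical usage:
# for subchunk in load_csv_sections('meter1.csv',
#                                   pd.Timestamp('2015-01-01'),
#                                   pd.Timestamp('2015-02-01')):
#     print(subchunk.index[0], subchunk.index[-1], len(subchunk))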
Example #7
 def _find_sections_with_no_target(self):
     """Finds the intersections of the mains good sections with the gaps
     between target appliance activations.
     """
     self.sections_with_no_target = {}
     seq_length_secs = self.seq_length * self.sample_period
     for fold, sects_per_building in self.mains_good_sections.items():
         for building, good_sections in sects_per_building.items():
             activations = (
                 self.activations[fold][self.target_appliance][building])
             mains = self.mains[fold][building]
             mains_good_sections = self.mains_good_sections[fold][building]
             gaps_between_activations = TimeFrameGroup()
             prev_end = mains.index[0]
             for activation in activations:
                 gap = TimeFrame(prev_end, activation.index[0])
                 gaps_between_activations.append(gap)
                 prev_end = activation.index[-1]
             gap = TimeFrame(prev_end, mains.index[-1])
             gaps_between_activations.append(gap)
             intersection = (
                 gaps_between_activations.intersection(mains_good_sections))
             intersection = intersection.remove_shorter_than(
                 seq_length_secs)
             self.sections_with_no_target.setdefault(
                 fold, {})[building] = (intersection)
             logger.info(
                 "Found {} sections without target for {} {}.".format(
                     len(intersection), fold, building))
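The gap construction above walks through the activations in time order and records the span between the end of one activation and the start of the next (plus the stretches before the first and after the last). A self-contained sketch with plain (start, end) timestamp pairs standing in for activations and TimeFrames:

import pandas as pd

# Hypothetical activations as (start, end) timestamp pairs, sorted by start
activations = [
    (pd.Timestamp('2015-01-01 08:00'), pd.Timestamp('2015-01-01 08:30')),
    (pd.Timestamp('2015-01-01 12:00'), pd.Timestamp('2015-01-01 12:45')),
]
mains_start = pd.Timestamp('2015-01-01 00:00')
mains_end = pd.Timestamp('2015-01-01 23:59')

gaps = []
prev_end = mains_start
for act_start, act_end in activations:
    if prev_end < act_start:
        gaps.append((prev_end, act_start))
    prev_end = act_end
if prev_end < mains_end:
    gaps.append((prev_end, mains_end))

print(gaps)  # three gaps: before, between and after the two activations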
Example #8
 def _find_sections_with_no_target(self):
     """Finds the intersections of the mains good sections with the gaps
     between target appliance activations.
     """
     self.sections_with_no_target = {}
     seq_length_secs = self.seq_length * self.sample_period
     for fold, sects_per_building in self.data_good_sections.items():
         for building, good_sections in sects_per_building.items():
             if building not in self.all_activations[fold][self.target_appliance]:
                 continue
             activations = (
                 self.all_activations[fold][self.target_appliance][building])
             data = self.data[fold][building]
             data_good_sections = good_sections
             gaps_between_activations = TimeFrameGroup()
             prev_end = data.index[0]
             for activation in activations:
                 activation_start = activation.start
                 if prev_end < activation_start:
                     gap = TimeFrame(prev_end, activation_start)
                     gaps_between_activations.append(gap)
                 prev_end = activation.end
             data_end = data.index[-1]
             if prev_end < data_end:
                 gap = TimeFrame(prev_end, data_end)
                 gaps_between_activations.append(gap)
             intersection = (
                 gaps_between_activations.intersection(data_good_sections))
             intersection = intersection.remove_shorter_than(
                 seq_length_secs)
             self.sections_with_no_target.setdefault(fold, {})[building] = (
                 intersection)
             logger.info("Found {} sections without target for {} {}."
                         .format(len(intersection), fold, building))
Example #9
    def append(self, timeframe, new_results):
        """Append a single result.

        Parameters
        ----------
        timeframe : nilmtk.TimeFrame
        new_results : {'sections': list of TimeFrame objects}
        """
        new_results['sections'] = [TimeFrameGroup(new_results['sections'][0])]
        super(AboveFreqSectionsResults, self).append(timeframe, new_results)
Example #10
    def _delete_phony_sections(self):
        filtered_data = {}
        for fold, data_per_building in self.data.items():
            for building, data in data_per_building.items():
                if building not in self.phony_active_timeframes[fold][self.target_appliance]:
                    continue
                activations = (
                    self.phony_active_timeframes[fold][self.target_appliance][building])
                data_between_phony_activations = TimeFrameGroup()
                prev_end = data.index[0]
                for activation in activations:
                    activation_start = activation.start
                    if prev_end < activation_start:
                        gap = TimeFrame(prev_end, activation_start)
                        data_between_phony_activations.append(gap)
                    prev_end = activation.end
                data_end = data.index[-1] + pd.Timedelta(seconds=self.sample_period)
                if prev_end < data_end:
                    gap = TimeFrame(prev_end, data_end)
                    data_between_phony_activations.append(gap)
                dfs = []
                for section in data_between_phony_activations:
                    dfs.append(section.slice(data))
                data = pd.concat(dfs)
                filtered_data.setdefault(fold, {})[building] = (
                    data)
                logger.info("Found {} good sections for {} {}."
                            .format(len(data_between_phony_activations), fold, building))

        self.data = filtered_data
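The filtering above keeps only the data lying between phony activations: each remaining section is sliced out of the DataFrame and the pieces are concatenated. A pandas-only sketch of that slice-and-concat step, with invented section boundaries:

import numpy as np
import pandas as pd

# Hypothetical 1-minute power readings for one day
index = pd.date_range('2015-01-01', periods=24 * 60, freq='1min')
data = pd.Series(np.random.rand(len(index)) * 100, index=index)

# Hypothetical sections to keep, as (start, end) pairs
sections_to_keep = [
    (pd.Timestamp('2015-01-01 00:00'), pd.Timestamp('2015-01-01 08:00')),
    (pd.Timestamp('2015-01-01 12:45'), pd.Timestamp('2015-01-01 23:59')),
]

# Slice each section out of the series and stitch the pieces back together
dfs = [data[start:end] for start, end in sections_to_keep]
filtered = pd.concat(dfs)
print(len(data), '->', len(filtered))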
Example #11
 def _find_sections_with_no_target(self):
     """Finds the intersections of the mains good sections with the gaps
     between target appliance activations.
     """
     self.sections_with_no_target = {}
     seq_length_secs = self.seq_length * self.sample_period
     for fold, sects_per_building in self.mains_good_sections.items():
         for building, good_sections in sects_per_building.items():
             activations = (
                 self.activations[fold][self.target_appliance][building])
             mains = self.mains[fold][building]
             mains_good_sections = self.mains_good_sections[fold][building]
             gaps_between_activations = TimeFrameGroup()
             prev_end = mains.index[0]
             for activation in activations:
                 gap = TimeFrame(prev_end, activation.index[0])
                 gaps_between_activations.append(gap)
                 prev_end = activation.index[-1]
             gap = TimeFrame(prev_end, mains.index[-1])
             gaps_between_activations.append(gap)
             intersection = (
                 gaps_between_activations.intersection(mains_good_sections))
             intersection = intersection.remove_shorter_than(
                 seq_length_secs)
             self.sections_with_no_target.setdefault(fold, {})[building] = (
                 intersection)
             logger.info("Found {} sections without target for {} {}."
                         .format(len(intersection), fold, building))
Example #12
    def _load_mains_into_memory(self):
        logger.info("Loading NILMTK mains...")

        # Load dataset
        #dataset = nilmtk.DataSet(self.filename)

        self.mains = {}
        self.mains_good_sections = {}
        for fold, buildings_and_windows in self.windows.items():
            for building_i, window in buildings_and_windows.items():
                dataset = load_mains_dataset(int(building_i))
                dataset = set_window(dataset, window[0], window[1])
                #elec = dataset.buildings[building_i].elec
                building_name = (
                    'REDD' +
                    '_building_{}'.format(building_i))

                logger.info(
                    "Loading mains for {}...".format(building_name))

                #mains_meter = elec.mains()
                good_sections_interval = find_good_sections(dataset,30)
                good_sections_interval = TimeFrameGroup(good_sections_interval)
                #meter = dataset
                resample_kwargs = {}
                resample_kwargs['rule'] = '{:d}S'.format(self.sample_period)
                dataset = dataset['power']['apparent']
                dataset = safe_resample(dataset, resample_kwargs)
                dataset = dataset.agg(np.mean)
                dataset = dataset.interpolate()
                mains_data = power_series_all_data(dataset, good_sections_interval).dropna()
                print(mains_data.index[0])
                print(fold)
                print(building_i)

                def set_mains_data(dictionary, data):
                    dictionary.setdefault(fold, {})[building_name] = data

                if not mains_data.empty:
                    set_mains_data(self.mains, mains_data)
                    set_mains_data(self.mains_good_sections, good_sections_interval)

                logger.info(
                    "Loaded mains data from building {} for fold {}"
                    " from {} to {}."
                    .format(building_name, fold,
                            mains_data.index[0], mains_data.index[-1]))

        #dataset.store.close()
        logger.info("Done loading NILMTK mains data.")
Example #13
    def __init__(self, **config):
        if 'filename' not in config.keys():
            self.dataSet = nilmtk.DataSet("ukdale.h5")
        else:
            self.dataSet = nilmtk.DataSet(config['filename'])

        if 'startTime' not in config.keys() or 'endTime' not in config.keys():
            self.dataSet.set_window("2012-11-01", "2015-01-31")
        else:
            self.dataSet.set_window(config['startTime'], config['endTime'])

        if 'trainBuildings' not in config.keys():
            self.trainBuildings = [1, 3, 4, 5]
        else:
            self.trainBuildings = config['trainBuildings']
        if 'testBuildings' not in config.keys():
            self.testBuildings = [2]
        else:
            self.testBuildings = config['testBuildings']

        if 'applications' not in config.keys():
            raise KeyError("please input applications")
        self.applications = config['applications']

        if 'targetapplication' not in config.keys():
            raise KeyError("please input targetapplication")
        self.targetApplication = config['targetapplication']

        if 'randSeed' not in config.keys():
            randSeed = 0
        else:
            randSeed = config['randSeed']

        self.otherApplications = [
            i for i in self.applications if i not in [self.targetApplication]
        ]
        self.allBuildings = set(self.trainBuildings + self.testBuildings)
        self.window = 599
        self.inputSeqs = []
        self.targetSeqs = []
        self.rng = np.random.RandomState(randSeed)
        activationConfig = {
            'fridge': {
                'min_off_duration': 18,  # 12 in paper here
                'min_on_duration': 60,
                'on_power_threshold': 50,
                'sample_period': 6,
            },
            'kettle': {
                'min_off_duration': 18,  # 0 in paper here
                'min_on_duration': 12,
                'on_power_threshold': 2000,
                'sample_period': 6,
            },
            'washing machine': {
                'min_off_duration': 160,
                'min_on_duration': 1800,
                'on_power_threshold': 20,
                'sample_period': 6,
            },
            'microwave': {
                'min_off_duration': 30,
                'min_on_duration': 12,
                'on_power_threshold': 200,
                'sample_period': 6,
            },
            'dish washer': {
                'min_off_duration': 1800,
                'min_on_duration': 1800,
                'on_power_threshold': 10,
                'sample_period': 6,
            }
        }

        self.elecMains = {}
        self.goodSections = {}
        for building in self.allBuildings:
            self.goodSections[building] = self.dataSet.buildings[
                building].elec.mains().good_sections()
            self.elecMains[building] = self.dataSet.buildings[
                building].elec.mains().power_series_all_data(
                    sample_period=6,
                    sections=self.goodSections[building]).dropna()

        self.numApp = {}
        self.elecApp = {}
        self.activationsApp = {}
        self.activationAppSections = {}
        for app in self.applications:
            self.elecApp[app] = {}
            self.activationsApp[app] = {}
            self.numApp[app] = 0
            self.activationAppSections[app] = {}
            for building in self.allBuildings:
                try:
                    self.elecApp[app][building] = self.dataSet.buildings[
                        building].elec[app].power_series_all_data(
                            sample_period=6).dropna()

                    self.activationsApp[app][
                        building] = self.dataSet.buildings[building].elec[
                            app].get_activations(**activationConfig[app])
                    self.activationsApp[app][building] = [
                        activation.astype(np.float32)
                        for activation in self.activationsApp[app][building]
                    ]
                    self.numApp[app] += len(self.activationsApp[app][building])
                    self.activationAppSections[app][building] = TimeFrameGroup(
                    )
                    for activation in self.activationsApp[app][building]:
                        self.activationAppSections[app][building].append(
                            TimeFrame(activation.index[0],
                                      activation.index[-1]))
                except KeyError as exception:
                    logger.info(
                        str(building) + " has no " + app +
                        ". Full exception: {}".format(exception))
                    continue
        logger.info("Done loading NILMTK data.")

        for building in self.allBuildings:
            activationsToRemove = []
            try:
                activations = self.activationsApp[
                    self.targetApplication][building]
                mains = self.elecMains[building]
                for i, activation in enumerate(activations):
                    activationDuration = (activation.index[-1] -
                                          activation.index[0])
                    start = (activation.index[0] - activationDuration)
                    end = (activation.index[-1] + activationDuration)
                    if start < mains.index[0] or end > mains.index[-1]:
                        activationsToRemove.append(i)
                    else:
                        mainsForAct = mains[start:end]
                        if not self._hasSufficientSamples(
                                start, end, mainsForAct):
                            activationsToRemove.append(i)
                activationsToRemove.reverse()
                for i in activationsToRemove:
                    activations.pop(i)
                self.activationsApp[
                    self.targetApplication][building] = activations
            except KeyError as exception:
                continue

        self.sectionsWithNoTarget = {}
        for building in self.allBuildings:
            try:
                activationsTarget = self.activationsApp[
                    self.targetApplication][building]
                mainGoodSections = self.goodSections[building]
                mains = self.elecMains[building]
                gapsBetweenActivations = TimeFrameGroup()
                prev = mains.index[0]
                for activation in activationsTarget:
                    try:
                        p2 = prev
                        gapsBetweenActivations.append(
                            TimeFrame(prev, activation.index[0]))
                        prev = activation.index[-1]
                        p1 = activation.index[0]
                    except ValueError:
                        logger.debug("----------------------")
                        logger.debug(p1)
                        logger.debug(p2)
                        logger.debug(activation.index[0])
                        logger.debug(activation.index[-1])

                gapsBetweenActivations.append(TimeFrame(prev, mains.index[-1]))

                intersection = gapsBetweenActivations.intersection(
                    mainGoodSections)
                intersection = intersection.remove_shorter_than(6 *
                                                                self.window)
                self.sectionsWithNoTarget[building] = intersection
            except KeyError:
                continue
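A step in the constructor above that is easy to miss is dropping target activations whose padded context (one activation length on each side) would extend beyond the mains recording. A self-contained sketch of that check, using invented (start, end) pairs rather than real nilmtk series:

import pandas as pd

mains_start = pd.Timestamp('2015-01-01 00:00')
mains_end = pd.Timestamp('2015-01-31 23:59')

# Hypothetical activations as (start, end) pairs
activations = [
    (pd.Timestamp('2015-01-01 00:10'), pd.Timestamp('2015-01-01 01:10')),  # too close to the start
    (pd.Timestamp('2015-01-15 10:00'), pd.Timestamp('2015-01-15 11:00')),  # fine
]

kept = []
for start, end in activations:
    duration = end - start
    padded_start = start - duration
    padded_end = end + duration
    # Keep only activations whose padded window fits inside the mains recording
    if padded_start >= mains_start and padded_end <= mains_end:
        kept.append((start, end))

print(len(kept))  # 1: only the second activation survives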
Example #14
    def _get_stat_from_cache_or_compute(self, nodes, results_obj,
                                        loader_kwargs):
        """General function for computing statistics and/or loading them from
        cache.

        Cached statistics lives in the DataStore at
        'building<I>/elec/cache/meter<K>/<statistic_name>' e.g.
        'building1/elec/cache/meter1/total_energy'.  We store the
        'full' statistic... i.e. we store a representation of the `Results._data`
        DataFrame. Sometimes we need to do some conversion to store
        `Results._data` on disk.  The logic for doing this conversion lives
        in the `Results` class or subclass.  The cache can be cleared by calling
        `ElecMeter.clear_cache()`.
        When 'preprocessing' is set, then the cache is not used because the cache
        is only valid for the version without preprocessing.

        Parameters
        ----------
        nodes : list of nilmtk.Node classes
        results_obj : instance of nilmtk.Results subclass. This is the Results
                      instance which is subsequently filled with the results
                      coming from the different chunks.
        loader_kwargs : dict

        Returns
        -------
        if `full_results` is True then return nilmtk.Results subclass
        instance otherwise return nilmtk.Results.simple().

        See Also
        --------
        clear_cache
        _compute_stat
        key_for_cached_stat
        get_cached_stat
        """
        full_results = loader_kwargs.pop('full_results', False)
        verbose = loader_kwargs.get('verbose')
        if 'ac_type' in loader_kwargs or 'physical_quantity' in loader_kwargs:
            loader_kwargs = self._convert_physical_quantity_and_ac_type_to_cols(
                **loader_kwargs)
        columns = loader_kwargs.get('columns', [])
        ac_types = set([m[1] for m in columns if m[1]])
        results_obj_copy = deepcopy(results_obj)

        # Prepare `sections` list
        sections = loader_kwargs.get('sections')
        if sections is None:
            tf = self.get_timeframe()
            tf.include_end = True
            sections = [tf]
        sections = TimeFrameGroup(sections)  # Ensure we have a nilmtk TimeFrameGroup
        sections = [s for s in sections if not s.empty]

        # Retrieve usable stats from cache
        key_for_cached_stat = self.key_for_cached_stat(results_obj.name)
        cached_stat = None
        if loader_kwargs.get('preprocessing') is None:
            cached_stat = self.get_cached_stat(key_for_cached_stat)
            #results_obj.import_from_cache(cached_stat, sections) # Fill results_obj with cache

            #def find_sections_to_compute():
            #    # Get sections_to_compute
            #    results_obj_timeframes = results_obj.timeframes()
            #    sections_to_compute = set(sections) - set(results_obj_timeframes)
            #    t1 = TimeFrameGroup(sections)
            #    t2 = TimeFrameGroup(results_obj_timeframes)
            #    sections_to_compute = t1.diff(t2)   # This is the new diff I built; why does it no longer work?
            #    sections_to_compute = sorted(sections_to_compute)
            #    return sections_to_compute
            #try:
            #    ac_type_keys = results_obj.keys() #.simple().keys()
            #except:
            #    sections_to_compute = find_sections_to_compute()
            #else:
            #    if ac_types.issubset(ac_type_keys):
            #        # IF ac_type in cache, only calculate remaining sections
            #        sections_to_compute = find_sections_to_compute()
            #    else:
            #        # If false ac_type cached, still have to compute all
            #        sections_to_compute = sections
            #        results_obj = results_obj_copy
        #else:
        #    sections_to_compute = sections
        if verbose and cached_stat is not None:
            print("Using cached result.")

        # If necessary compute stats for missing sections
        if cached_stat is None:  #sections_to_compute:
            # If we need everything either way, we don't need expensive index lookup during load
            #if not self.get_timeframe() in sections_to_compute:
            #    loader_kwargs['sections'] = sections_to_compute

            #computed_result = self._compute_stat(nodes, loader_kwargs)

            # Merge newly computed stats into the main stat result
            # This would be better built directly into the node, so that its result gets extended.
            # Then the cached result could still be taken and merged in.
            # The result of every node element would then be passed along with the pipeline elements,
            # so they can be pulled out at the end. That way the computation can be done in one pass.
            # => It is actually done that way, just separately for each section.
            # => The only extension needed would be passing the Results through.
            #results_obj.update(computed_result.results)

            results_obj = self._compute_stat(nodes, loader_kwargs).results

            # For Nonzero section exclude where there are not good sections
            if results_obj.name in ('nonzero_sections', 'overbasepower_sections'):
                good_sections = self.good_sections(**loader_kwargs)  #_data
                results_obj._data = results_obj._data.intersection(
                    good_sections)

            # Save to disk newly computed stats
            stat_for_store = results_obj.export_to_cache()
            try:
                #self.store.remove(key_for_cached_stat)
                self.store.put(key_for_cached_stat, stat_for_store, fixed=True)
                # Temporary workaround to also store the good sections for the other meters. TODO
                if results_obj.name == 'good_sections':
                    for i in range(2, 4):
                        self.store.put(key_for_cached_stat.replace(
                            'meter1', 'meter' + str(i)),
                                       stat_for_store,
                                       fixed=True)

            except ValueError:
                # the old table probably had different columns
                self.store.remove(key_for_cached_stat)
                self.store.put(key_for_cached_stat,
                               results_obj.export_to_cache())
        else:
            results_obj.import_from_cache(
                cached_stat, sections)  # Fill results_obj with cache

        # Return the correct value depending on options
        if full_results:
            return results_obj
        res = results_obj  #.simple()
        if ac_types:
            try:
                ac_type_keys = res.keys()
            except:
                return res
            else:
                if res.empty:
                    return res
                else:
                    return pd.Series(res[ac_types], index=ac_types)
        return res._data
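Stripped of the NILMTK- and caching-format specifics, the method above follows a plain cache-or-compute pattern: look the statistic up under its cache key, compute and store it only on a miss. A generic sketch of that control flow, with placeholder names rather than the nilmtk API:

def get_stat(cache, key, compute):
    """Return a cached statistic if present, otherwise compute and cache it."""
    cached = cache.get(key)
    if cached is not None:
        # Cache hit: reuse the stored result
        return cached
    # Cache miss: compute, then store for next time
    result = compute()
    cache[key] = result
    return result

# Hypothetical usage with a dict as the cache backend:
cache = {}
stat = get_stat(cache, 'building1/elec/cache/meter1/total_energy',
                compute=lambda: 42.0)
print(stat)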
Example #15
class OverBasepowerSectionsResults(Results):
    """ The result of the Non zero section statistic.
    Attributes
    ----------
    _data : pd.DataFrame
        index is start date for the whole chunk
        `end` is end date for the whole chunk
        `sections` is a TimeFrameGroups object (a list of nilmtk.TimeFrame objects)
    """

    name = "overbasepower_sections"

    def __init__(self, max_sample_rate):
        # Used to know when to combine
        self.max_sample_rate = max_sample_rate
        super(OverBasepowerSectionsResults, self).__init__()

    def append(self, timeframe, new_results):
        """Append a single result.

        Parameters
        ----------
        timeframe : nilmtk.TimeFrame
        new_results : {'sections': list of TimeFrame objects}
        """
        super(OverBasepowerSectionsResults,
              self).append(timeframe, new_results)

    def finalize(self):
        """ Merges together any nonzero sections which span multiple segments.
        Whether there are gaps in between does not matter.

        Returns
        -------
        sections : TimeFrameGroup (a subclass of Python's list class)
        """

        # Merge the results of all chunks
        starts = []
        ends = []
        for index, row in self._data.iterrows():
            starts.append(row['sections']['start'])
            ends.append(row['sections']['end'])

        if len(starts) == 0 == len(ends):
            self._data = TimeFrameGroup()
            return

        starts = pd.concat(starts)
        ends = pd.concat(ends)

        rate = pd.Timedelta(seconds=self.max_sample_rate)
        self._data = TimeFrameGroup(starts_and_ends={
            'starts': starts,
            'ends': ends
        })  #.merge_shorter_gaps_than(rate) TODO: Merge needed?

    def unify(self, other):
        raise Exception("Did not try this yet for the new nonzeroresults")
        super(OverBasepowerSectionsResults, self).unify(other)
        for start, row in self._data.iterrows():
            other_sections = other._data['sections'].loc[start]
            intersection = row['sections'].intersection(other_sections)
            self._data['sections'].loc[start] = intersection

    def to_dict(self):
        overbasepower_sections = self._data
        overbasepower_sections_list_of_dicts = [
            timeframe.to_dict() for timeframe in overbasepower_sections
        ]
        return {
            'statistics': {
                'overbasepower_sections': overbasepower_sections_list_of_dicts
            }
        }

    def plot(self, **plot_kwargs):
        timeframes = self
        return timeframes.plot(**plot_kwargs)

    def import_from_cache(self, cached_stat, sections):
        ''' Loads the cached statistic from the nilmtk.DataStore into this
        Results instance.

        Note
        ----
        I do not know whether this is still an issue:
        THIS IS THE PROBLEM WHEN READING THE STATISTICS!
        They are stored chunk-wise, but this code relies on the whole
        section being present as a single piece.
        '''
        self._data = TimeFrameGroup(cached_stat)

    def export_to_cache(self):
        """
        Returns the DataFrame to be written into cache.

        Returns
        -------
        df: pd.DataFrame
            With three columns: 'end', 'section_end', 'section_start'.
        """
        return self._data._df
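export_to_cache and import_from_cache above round-trip the sections DataFrame through the DataStore. A rough pandas-only sketch of such a round trip through an HDF5 file (requires PyTables; the file name and key are placeholders, and the real code goes through nilmtk's DataStore rather than a raw HDFStore):

import pandas as pd

sections_df = pd.DataFrame({
    'section_start': pd.to_datetime(['2015-01-01 00:00']),
    'section_end':   pd.to_datetime(['2015-01-01 12:00']),
})

# Write the sections table under a cache-style key...
with pd.HDFStore('cache_sketch.h5') as store:
    store.put('/building1/elec/cache/meter1/overbasepower_sections', sections_df)

# ...and read it back, as import_from_cache would before rebuilding a TimeFrameGroup.
with pd.HDFStore('cache_sketch.h5') as store:
    restored = store.get('/building1/elec/cache/meter1/overbasepower_sections')
print(restored)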
Example #16
 def import_from_cache(self, cached_stat, sections):
     self._data = TimeFrameGroup(cached_stat)
Example #17
    def _save_metadata_for_disaggregation(self,
                                          output_datastore,
                                          sample_period,
                                          measurement,
                                          timeframes,
                                          building,
                                          meters=None,
                                          num_meters=None,
                                          supervised=True,
                                          original_building_meta=None,
                                          rest_powerflow_included=False):
        """Add metadata for disaggregated appliance estimates to datastore.

        REMINDER: Originally I wanted to do this differently and also store the metadata alongside.
                  For time reasons I dropped that and kept it the way it was before.

        This function first checks whether meta-information is already present in the file.
        If yes, it extends it; otherwise it creates it from scratch.

        Note that `self.MODEL_NAME` needs to be set to a string before
        calling this method.  For example, we use `self.MODEL_NAME = 'CO'`
        for Combinatorial Optimisation.

        TODO:`preprocessing_applied` for all meters
        TODO: submeter measurement should probably be the mains
              measurement we used to train on, not the mains measurement.

        Parameters
        ----------
        output_datastore : nilmtk.DataStore subclass object
            The datastore to write metadata into.
        sample_period : int
            The sample period, in seconds, used for both the
            mains and the disaggregated appliance estimates.
        measurement : 2-tuple of strings
            In the form (<physical_quantity>, <type>) e.g.
            ("power", "active")
        timeframes : list of nilmtk.TimeFrames or nilmtk.TimeFrameGroup
            The TimeFrames over which this data is valid for.
        building : int
            The building instance number (starting from 1)
        supervised : bool, defaults to True
            Is this a supervised NILM algorithm?
        meters : list of nilmtk.ElecMeters, optional
            Required if `supervised=True`
        num_meters : [int]
            Required if `supervised=False`, Gives for each phase amount of meters
        """

        # DataSet and MeterDevice metadata only when not already available
        try:
            metadata = output_datastore.load_metadata()
            timeframes.append(
                TimeFrame(start=metadata["timeframe"]["start"],
                          end=metadata["timeframe"]["end"]))
            total_timeframe = TimeFrameGroup(timeframes).get_timeframe()
            dataset_metadata = {
                'name': metadata["name"],
                'date': metadata["date"],
                'meter_devices': metadata["meter_devices"],
                'timeframe': total_timeframe.to_dict()
            }
            output_datastore.save_metadata('/', dataset_metadata)
        except:
            pq = 3
            meter_devices = {
                'disaggregate': {
                    'model':
                    type(self),  #self.model.MODEL_NAME,
                    'sample_period':
                    sample_period if rest_powerflow_included else
                    0,  # Makes it possible to use special load functionality
                    'max_sample_period':
                    sample_period,
                    'measurements': [{
                        'physical_quantity':
                        'power',  #measurement.levels[0][0],
                        'type': 'active'  #measurement.levels, #[1][0]
                    }]
                }
            }

            if rest_powerflow_included:
                meter_devices['rest'] = {
                    'model':
                    'rest',
                    'sample_period':
                    sample_period,
                    'max_sample_period':
                    sample_period,
                    'measurements': [{
                        'physical_quantity':
                        'power',  #measurement.levels, #[0][0],
                        'type': 'active'  #measurement.levels, #[1][0]
                    }]
                }
            total_timeframe = TimeFrameGroup(timeframes).get_timeframe()

            date_now = datetime.now().isoformat().split('.')[0]
            dataset_metadata = {
                'name': type(self),
                'date': date_now,
                'meter_devices': meter_devices,
                'timeframe': total_timeframe.to_dict()
            }
            output_datastore.save_metadata('/', dataset_metadata)

        # Building metadata always stored for the new buildings
        for i in range(len(num_meters)):
            phase_building = building * 10 + i
            building_path = '/building{}'.format(phase_building)
            mains_data_location = building_path + '/elec/meter1'

            # Rest meter:
            elec_meters = {}
            if rest_powerflow_included:
                elec_meters[1] = {
                    'device_model': 'rest',
                    #'site_meter': True,
                    'data_location': mains_data_location,
                    'preprocessing_applied': {},  # TODO
                    'statistics': {
                        'timeframe': total_timeframe.to_dict()
                    }
                }

            def update_elec_meters(meter_instance):
                elec_meters.update({
                    meter_instance: {
                        'device_model':
                        'disaggregate',  # self.MODEL_NAME,
                        'submeter_of':
                        1,
                        'data_location':
                        ('{}/elec/meter{}'.format(building_path,
                                                  meter_instance)),
                        'preprocessing_applied': {},  # TODO
                        'statistics': {
                            'timeframe': total_timeframe.to_dict()
                        }
                    }
                })

            # Appliances and submeters:
            appliances = []
            if supervised:
                for meter in meters:
                    meter_instance = meter.instance()
                    update_elec_meters(meter_instance)

                    for app in meter.appliances:
                        appliance = {
                            'meters': [meter_instance],
                            'type': app.identifier.type,
                            'instance': app.identifier.instance
                            # TODO this `instance` will only be correct when the
                            # model is trained on the same house as it is tested on
                            # https://github.com/nilmtk/nilmtk/issues/194
                        }
                        appliances.append(appliance)

                    # Setting the name if it exists
                    if meter.name:
                        if len(meter.name) > 0:
                            elec_meters[meter_instance]['name'] = meter.name
            else:  # Unsupervised
                # Submeters:
                # Starts at 2 because meter 1 is mains.
                for chan in range(2, num_meters[i] +
                                  1):  # Additional + 1 because index 0 skipped
                    update_elec_meters(meter_instance=chan)
                    appliance = {
                        'meters': [chan],
                        'type': 'unknown',
                        'instance': chan - 1
                        # TODO this `instance` will only be correct when the
                        # model is trained on the same house as it is tested on
                        # https://github.com/nilmtk/nilmtk/issues/194
                    }
                    appliances.append(appliance)

            if len(appliances) == 0:
                continue

            building_metadata = {
                'instance': (phase_building),
                'elec_meters':
                elec_meters,
                'appliances':
                appliances,
                'original_name':
                original_building_meta['original_name']
                if 'original_name' in original_building_meta else None,
                'geo_location':
                original_building_meta['geo_location']
                if 'geo_location' in original_building_meta else None,
                'zip':
                original_building_meta['zip']
                if 'zip' in original_building_meta else None,
            }
            print(building_path)
            output_datastore.save_metadata(building_path, building_metadata)
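The dataset-level metadata written above is a plain nested dict (name, date, meter_devices, timeframe). A stripped-down sketch of assembling such a dict, with placeholder values rather than the full nilmtk metadata schema:

from datetime import datetime

total_timeframe = {'start': '2015-01-01T00:00:00', 'end': '2015-02-01T00:00:00'}
dataset_metadata = {
    'name': 'MyDisaggregator',  # placeholder model name
    'date': datetime.now().isoformat().split('.')[0],
    'meter_devices': {
        'disaggregate': {
            'model': 'MyDisaggregator',
            'sample_period': 60,
            'max_sample_period': 60,
            'measurements': [{'physical_quantity': 'power', 'type': 'active'}],
        }
    },
    'timeframe': total_timeframe,
}
# output_datastore.save_metadata('/', dataset_metadata) would persist this dict.
print(dataset_metadata['name'], dataset_metadata['date'])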
Example #18
    def load(self,
             key,
             columns=None,
             sections=None,
             n_look_ahead_rows=0,
             chunksize=MAX_MEM_ALLOWANCE_IN_BYTES,
             verbose=False,
             **additionalLoaderKwargs):
        # TODO: calculate chunksize default based on physical
        # memory installed and number of columns

        # Make sure key has a slash at the front but not at the end.
        if key[0] != '/':
            key = '/' + key
        if len(key) > 1 and key[-1] == '/':
            key = key[:-1]

        # Make sure chunksize is an int otherwise `range` complains later.
        chunksize = np.int64(chunksize)

        # Set `sections` variable
        sections = [TimeFrame()] if sections is None else sections
        sections = TimeFrameGroup(sections)

        # Replace any Nones with '' in cols:
        if columns is not None:
            columns = [('' if pq is None else pq, '' if ac is None else ac)
                       for pq, ac in columns]
            cols_idx = pd.MultiIndex.from_tuples(
                columns, names=['physical_quantity', 'type'])

        if verbose:
            print("HDFDataStore.load(key='{}', columns='{}', sections='{}',"
                  " n_look_ahead_rows='{}', chunksize='{}')".format(
                      key, columns, sections, n_look_ahead_rows, chunksize))

        self.all_sections_smaller_than_chunksize = True

        for section in sections:
            if verbose:
                print("   ", section)
            window_intersect = self.window.intersection(section)

            if window_intersect.empty:  # If the requested time section is not contained in the data series
                data = pd.DataFrame(columns=cols_idx)
                data.timeframe = section
                yield data
                continue

            terms = window_intersect.query_terms('window_intersect')
            if terms is None:
                section_start_i = 0
                section_end_i = self.store.get_storer(key).nrows
                if section_end_i <= 1:
                    data = pd.DataFrame(columns=cols_idx)
                    data.timeframe = section
                    yield data
                    continue
            else:
                try:
                    coords = self.store.select_as_coordinates(key=key,
                                                              where=terms)
                except AttributeError as e:
                    if str(e) == ("'NoneType' object has no attribute "
                                  "'read_coordinates'"):
                        raise KeyError("key '{}' not found".format(key))
                    else:
                        raise
                n_coords = len(coords)
                if n_coords == 0:
                    data = pd.DataFrame(columns=cols_idx)
                    data.timeframe = window_intersect
                    yield data
                    continue

                section_start_i = coords[0]
                section_end_i = coords[-1]
                if section_start_i == section_end_i:  # For corner cases where there is really only a single entry.
                    section_end_i += 1

                del coords
            slice_starts = range(section_start_i, section_end_i, chunksize)
            n_chunks = int(
                np.ceil((section_end_i - section_start_i) / chunksize))

            if n_chunks > 1:
                self.all_sections_smaller_than_chunksize = False

            for chunk_i, chunk_start_i in enumerate(slice_starts):
                chunk_end_i = chunk_start_i + chunksize
                there_are_more_subchunks = (chunk_i < n_chunks - 1)

                if chunk_end_i > section_end_i:
                    chunk_end_i = section_end_i
                chunk_end_i += 1

                data = self.store.select(key=key,
                                         columns=cols_idx,
                                         start=chunk_start_i,
                                         stop=chunk_end_i)

                if len(data) <= 2:
                    data = pd.DataFrame(columns=cols_idx)
                    data.timeframe = section
                    yield data

                # Load look ahead if necessary
                if n_look_ahead_rows > 0:
                    if len(data.index) > 0:
                        look_ahead_start_i = chunk_end_i
                        look_ahead_end_i = look_ahead_start_i + n_look_ahead_rows
                        try:
                            data.look_ahead = self.store.select(
                                key=key,
                                columns=columns,
                                start=look_ahead_start_i,
                                stop=look_ahead_end_i)
                        except ValueError:
                            data.look_ahead = pd.DataFrame()
                    else:
                        data.look_ahead = pd.DataFrame()

                data.timeframe = _timeframe_for_chunk(there_are_more_subchunks,
                                                      chunk_i,
                                                      window_intersect,
                                                      data.index)
                yield data
                del data
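The chunking above leans on pandas' HDFStore positional slicing: select() accepts integer start/stop row positions, so each chunk is one bounded read from disk. A minimal sketch of that call pattern (store path and key are placeholders):

import pandas as pd

chunksize = 100000
with pd.HDFStore('mydata.h5', mode='r') as store:
    # Total number of rows stored under the key
    nrows = store.get_storer('/building1/elec/meter1').nrows
    for chunk_start in range(0, nrows, chunksize):
        chunk = store.select('/building1/elec/meter1',
                             start=chunk_start,
                             stop=min(chunk_start + chunksize, nrows))
        # process `chunk` here
        print(chunk.index[0], chunk.index[-1], len(chunk))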
Example #19
    def _get_stat_from_cache_or_compute(self, nodes, results_obj,
                                        loader_kwargs):
        """General function for computing statistics and/or loading them from
        cache.

        Cached statistics lives in the DataStore at
        'building<I>/elec/cache/meter<K>/<statistic_name>' e.g.
        'building1/elec/cache/meter1/total_energy'.  We store the
        'full' statistic... i.e. we store a representation of the `Results._data`
        DataFrame. Sometimes we need to do some conversion to store
        `Results._data` on disk.  The logic for doing this conversion lives
        in the `Results` class or subclass.  The cache can be cleared by calling
        `ElecMeter.clear_cache()`.

        Parameters
        ----------
        nodes : list of nilmtk.Node classes
        results_obj : instance of nilmtk.Results subclass
        loader_kwargs : dict

        Returns
        -------
        if `full_results` is True then return nilmtk.Results subclass
        instance otherwise return nilmtk.Results.simple().

        See Also
        --------
        clear_cache
        _compute_stat
        key_for_cached_stat
        get_cached_stat
        """
        full_results = loader_kwargs.pop('full_results', False)
        verbose = loader_kwargs.get('verbose')
        if 'ac_type' in loader_kwargs or 'physical_quantity' in loader_kwargs:
            loader_kwargs = self._convert_physical_quantity_and_ac_type_to_cols(
                **loader_kwargs)
        columns = loader_kwargs.get('columns', [])
        ac_types = set([m[1] for m in columns if m[1]])
        results_obj_copy = deepcopy(results_obj)

        # Prepare `sections` list
        sections = loader_kwargs.get('sections')
        if sections is None:
            tf = self.get_timeframe()
            tf.include_end = True
            sections = [tf]
        sections = TimeFrameGroup(sections)
        sections = [s for s in sections if not s.empty]

        # Retrieve usable stats from cache
        key_for_cached_stat = self.key_for_cached_stat(results_obj.name)
        if loader_kwargs.get('preprocessing') is None:
            cached_stat = self.get_cached_stat(key_for_cached_stat)
            results_obj.import_from_cache(cached_stat, sections)

            def find_sections_to_compute():
                # Get sections_to_compute
                results_obj_timeframes = results_obj.timeframes()
                sections_to_compute = set(sections) - set(
                    results_obj_timeframes)
                sections_to_compute = sorted(sections_to_compute)
                return sections_to_compute

            try:
                ac_type_keys = results_obj.simple().keys()
            except:
                sections_to_compute = find_sections_to_compute()
            else:
                if ac_types.issubset(ac_type_keys):
                    sections_to_compute = find_sections_to_compute()
                else:
                    sections_to_compute = sections
                    results_obj = results_obj_copy
        else:
            sections_to_compute = sections

        if verbose and not results_obj._data.empty:
            print("Using cached result.")

        # If we get to here then we have to compute some stats
        if sections_to_compute:
            loader_kwargs['sections'] = sections_to_compute
            computed_result = self._compute_stat(nodes, loader_kwargs)

            # Merge cached results with newly computed
            results_obj.update(computed_result.results)

            # Save to disk newly computed stats
            stat_for_store = computed_result.results.export_to_cache()
            try:
                self.store.append(key_for_cached_stat, stat_for_store)
            except ValueError:
                # the old table probably had different columns
                self.store.remove(key_for_cached_stat)
                self.store.put(key_for_cached_stat,
                               results_obj.export_to_cache())

        if full_results:
            return results_obj
        else:
            res = results_obj.simple()
            if ac_types:
                try:
                    ac_type_keys = res.keys()
                except:
                    return res
                else:
                    if res.empty:
                        return res
                    else:
                        return pd.Series(res[ac_types], index=ac_types)
            else:
                return res
Ejemplo n.º 20
    def load(self,
             key,
             columns=None,
             sections=None,
             n_look_ahead_rows=0,
             chunksize=MAX_MEM_ALLOWANCE_IN_BYTES,
             verbose=False,
             **additionalLoaderKwargs):
        '''
        Load measurements over a certain period of time.
        Resampling takes place on the server side to save bandwidth. This differs
        from the HDF datastore, where all data is always loaded first and then
        resampled.
        '''
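        # Hedged usage sketch (the key and sample_period below are illustrative,
        # not taken from this codebase):
        #
        #     for chunk in store.load('/building1/elec/meter1',
        #                             columns=[('power', 'active')],
        #                             sections=[TimeFrame(start, end)],
        #                             sample_period=60):
        #         process(chunk)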
        # TODO: calculate chunksize default based on physical
        # memory installed and number of columns

        # Make sure key has a slash at the front but not at the end.
        if key[0] != '/':
            key = '/' + key
        if len(key) > 1 and key[-1] == '/':
            key = key[:-1]

        sample_period = additionalLoaderKwargs["sample_period"]

        # Make sure chunksize is an int otherwise `range` complains later.
        chunksize = np.int64(chunksize)

        # Set `sections` variable
        sections = [TimeFrame()] if sections is None else sections
        sections = TimeFrameGroup(sections)

        # Replace any Nones with '' in cols:
        if columns is not None:
            columns = [('' if pq is None else pq, '' if ac is None else ac)
                       for pq, ac in columns]
            cols_idx = pd.MultiIndex.from_tuples(
                columns, names=['physical_quantity', 'type'])

        columnsStr = str([str(col) for col in columns])

        if verbose:
            print("{}.load(key='{}', columns='{}', sections='{}',"
                  " n_look_ahead_rows='{}', chunksize='{}')".format(
                      self.__class__.__name__, key, columns, sections,
                      n_look_ahead_rows, chunksize))

        self.all_sections_smaller_than_chunksize = True

        for section in sections:
            if verbose:
                print("   ", section)
            window_intersect = self.window.intersection(section)

            if window_intersect.empty:  # the requested time section is not contained in the data series
                data = pd.DataFrame(columns=cols_idx)
                data.timeframe = section
                yield data
                continue

            # Split the section into equal time chunks rather than estimating
            # row-based slices (as the HDF datastore does).
            delta = section.end - section.start
            n_chunks = int(
                np.ceil((delta.total_seconds() / sample_period / chunksize)))
            delta = delta / n_chunks
            slice_starts = []
            for i in range(n_chunks):
                slice_starts.append(section.start + delta * i)
            if n_chunks > 1:
                self.all_sections_smaller_than_chunksize = False
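            # Example with illustrative numbers: a 1-day section at a 6 s
            # sample_period holds 86400 / 6 = 14400 rows; with chunksize = 10000
            # this gives n_chunks = ceil(14400 / 10000) = 2 chunks of 12 hours each.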

            # Load the sections
            for chunk_i, chunk_start_i in enumerate(slice_starts):
                chunk_end_i = chunk_start_i + delta  # one time chunk long; clamped to section.end below
                there_are_more_subchunks = (chunk_i < n_chunks - 1)

                if chunk_end_i > section.end:
                    chunk_end_i = section.end

                # The required parameter form is: base={lat}/{lng}/{deviceKey}/{deviceType} + {start}/{end}/{columns}/{sample_rate}
                iso_chunk_start = chunk_start_i.isoformat()
                iso_chunk_end = chunk_end_i.isoformat()
                data = self._execute_request("load",
                                             type="GET",
                                             parameters={
                                                 "url":
                                                 key,
                                                 "start":
                                                 iso_chunk_start,
                                                 "end":
                                                 iso_chunk_end,
                                                 "columns":
                                                 columnsStr,
                                                 "sample_period":
                                                 str(sample_period)
                                             })
                data = self._jsonDataToPandasDF(columns, data)

                if len(data) <= 2:
                    data = pd.DataFrame(columns=cols_idx)
                    data.timeframe = section
                    yield data
                    continue
                # Load look-ahead rows if requested; the look-ahead is attached
                # to the chunk as `data.look_ahead` rather than replacing `data`.
                if n_look_ahead_rows > 0:
                    if len(data.index) > 0:
                        look_ahead_start_i = chunk_end_i
                        look_ahead_end_i = look_ahead_start_i + datetime.timedelta(
                            seconds=int(n_look_ahead_rows * sample_period))
                        try:
                            look_ahead_json = self._execute_request(
                                "load",
                                type="GET",
                                parameters={
                                    "url": key,
                                    "start": look_ahead_start_i.isoformat(),
                                    "end": look_ahead_end_i.isoformat(),
                                    "columns": columnsStr,
                                    "sample_period": str(sample_period)
                                })
                            data.look_ahead = self._jsonDataToPandasDF(
                                columns, look_ahead_json)
                        except ValueError:
                            data.look_ahead = pd.DataFrame()
                    else:
                        data.look_ahead = pd.DataFrame()

                data.timeframe = _timeframe_for_chunk(there_are_more_subchunks,
                                                      chunk_i,
                                                      window_intersect,
                                                      data.index)
                yield data
                del data
Ejemplo n.º 21
    def load(self,
             key,
             columns=None,
             sections=None,
             n_look_ahead_rows=0,
             chunksize=MAX_MEM_ALLOWANCE_IN_BYTES,
             verbose=False):
        # TODO: calculate chunksize default based on physical
        # memory installed and number of columns

        # Make sure key has a slash at the front but not at the end.
        if key[0] != '/':
            key = '/' + key
        if len(key) > 1 and key[-1] == '/':
            key = key[:-1]

        # Make sure chunksize is an int otherwise `range` complains later.
        chunksize = np.int64(chunksize)

        # Set `sections` variable
        sections = [TimeFrame()] if sections is None else sections
        sections = TimeFrameGroup(sections)

        # Replace any Nones with '' in columns:
        if columns is not None:
            columns = [('' if pq is None else pq, '' if ac is None else ac)
                       for pq, ac in columns]

        if verbose:
            print("HDFDataStore.load(key='{}', columns='{}', sections='{}',"
                  " n_look_ahead_rows='{}', chunksize='{}')".format(
                      key, columns, sections, n_look_ahead_rows, chunksize))

        self.all_sections_smaller_than_chunksize = True

        for section in sections:
            if verbose:
                print("   ", section)
            window_intersect = self.window.intersection(section)

            if window_intersect.empty:
                data = pd.DataFrame()
                data.timeframe = section
                yield data
                continue

            terms = window_intersect.query_terms('window_intersect')
            if terms is None:
                section_start_i = 0
                section_end_i = self.store.get_storer(key).nrows
                if section_end_i <= 1:
                    data = pd.DataFrame()
                    data.timeframe = section
                    yield data
                    continue
            else:
                try:
                    coords = self.store.select_as_coordinates(key=key,
                                                              where=terms)
                except AttributeError as e:
                    if str(e) == ("'NoneType' object has no attribute "
                                  "'read_coordinates'"):
                        raise KeyError("key '{}' not found".format(key))
                    else:
                        raise
                n_coords = len(coords)
                if n_coords == 0:
                    data = pd.DataFrame()
                    data.timeframe = window_intersect
                    yield data
                    continue

                section_start_i = coords[0]
                section_end_i = coords[-1]
                del coords

            slice_starts = range(section_start_i, section_end_i, chunksize)
            n_chunks = int(
                np.ceil((section_end_i - section_start_i) / chunksize))

            if n_chunks > 1:
                self.all_sections_smaller_than_chunksize = False

            for chunk_i, chunk_start_i in enumerate(slice_starts):
                chunk_end_i = chunk_start_i + chunksize
                there_are_more_subchunks = (chunk_i < n_chunks - 1)

                if chunk_end_i > section_end_i:
                    chunk_end_i = section_end_i
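                # HDFStore.select treats `stop` as exclusive, so extend by one
                # row to include the last coordinate of the chunk.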
                chunk_end_i += 1

                data = self.store.select(key=key,
                                         columns=columns,
                                         start=chunk_start_i,
                                         stop=chunk_end_i)

                # if len(data) <= 2:
                #     yield pd.DataFrame()

                # Load look ahead if necessary
                if n_look_ahead_rows > 0:
                    if len(data.index) > 0:
                        look_ahead_start_i = chunk_end_i
                        look_ahead_end_i = look_ahead_start_i + n_look_ahead_rows
                        try:
                            look_ahead = self.store.select(
                                key=key,
                                columns=columns,
                                start=look_ahead_start_i,
                                stop=look_ahead_end_i)
                        except ValueError:
                            look_ahead = pd.DataFrame()
                    else:
                        look_ahead = pd.DataFrame()

                    with warnings.catch_warnings():
                        # Silence "Pandas doesn't allow columns to be created via a new attribute name"
                        # since we're not adding a column
                        warnings.filterwarnings(
                            'ignore',
                            category=UserWarning,
                            message=".*Pandas doesn't allow columns.*")
                        setattr(data, 'look_ahead', look_ahead)

                data.timeframe = _timeframe_for_chunk(there_are_more_subchunks,
                                                      chunk_i,
                                                      window_intersect,
                                                      data.index)
                yield data
                del data
Ejemplo n.º 22
    def _classify_activation_quality(self, nilmtk_activations):
        def get_stale_seconds(act):
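            # Resample to the target rate, forward-fill gaps, then count samples
            # whose value did not change from the previous one ("stale" samples)
            # and convert that count into seconds.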
            actdiff = act.resample("{:d}S".format(self.sample_period)).mean().ffill().diff()
            return (actdiff == 0.0).sum() * self.sample_period

        def activation_filter(tf, building_data):
            start_time = tf.start
            end_time = tf.end
            df = building_data[start_time:end_time]
            if df.empty:
                return False
            else:
                act_stale_seconds = get_stale_seconds(df['target'])
                act_duration = (end_time - start_time).total_seconds()
                act_stale_pct = act_stale_seconds / act_duration
                mains_stale_seconds = get_stale_seconds(df['mains'])
                mains_stale_pct = mains_stale_seconds / act_duration
                return ((act_stale_pct < self.activation_max_stale_pct)
                        and (mains_stale_pct < self.mains_max_stale_pct))
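        # Worked example with illustrative numbers: a 600 s activation at
        # sample_period = 6 s has 100 resampled values; if 50 of them repeat the
        # previous value, get_stale_seconds returns 50 * 6 = 300 s, i.e. a stale
        # percentage of 0.5, so the activation is kept only when
        # activation_max_stale_pct exceeds 0.5 and the mains check also passes.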

        good_timeframes = {}
        bad_timeframes = {}
        all_timeframes = {}
        for fold, buildings_per_appliances in nilmtk_activations.items():
            good_timeframes[fold] = {}
            bad_timeframes[fold] = {}
            all_timeframes[fold] = {}
            for appliance, activations_per_building in buildings_per_appliances.items():
                good_timeframes[fold][appliance] = {}
                bad_timeframes[fold][appliance] = {}
                all_timeframes[fold][appliance] = {}
                for building, activations in activations_per_building.items():
                    building_data = self.data[fold][building]
                    good_timeframes_per_building = TimeFrameGroup()
                    bad_timeframes_per_building = TimeFrameGroup()
                    all_timeframes_per_building = TimeFrameGroup()
                    for i, activation in enumerate(activations):
                        tf = TimeFrame(
                            start=activation.index[0],
                            end=activation.index[-1] + pd.Timedelta(seconds=self.sample_period))
                        all_timeframes_per_building.append(tf)
                        if activation_filter(tf, building_data):
                            good_timeframes_per_building.append(tf)
                        else:
                            bad_timeframes_per_building.append(tf)
                    good_timeframes[fold][appliance][building] = good_timeframes_per_building
                    bad_timeframes[fold][appliance][building] = bad_timeframes_per_building
                    all_timeframes[fold][appliance][building] = all_timeframes_per_building
        #
        self.clean_active_timeframes = good_timeframes
        self.all_active_timeframes = all_timeframes
        self.phony_active_timeframes = bad_timeframes