Example no. 1
 def __init__(self,
              stream_batcher,
              batchidx2paths,
              batchidx2start_end,
              randomize=False,
              paths=None,
              shard2batchidx=None,
              seed=None,
              shard_fractions=None,
              cache_size_GB=4):
     super(DataLoaderSlave, self).__init__()
     if randomize:
         assert seed is not None, 'For randomized data loading a seed needs to be set!'
     self.cache_size_GB = cache_size_GB
     self.stream_batcher = stream_batcher
     self.batchidx2paths = batchidx2paths
     self.batchidx2start_end = batchidx2start_end
     self.current_data = {}
     self.randomize = randomize
     self.num_batches = len(list(batchidx2paths.keys()))
     self.rdm = np.random.RandomState(234 + seed)
     self.shard_fractions = shard_fractions
     self.shard2batchidx = shard2batchidx
     self.paths = paths
     self._stop = threading.Event()
     self.daemon = True
     self.t = Timer()
     self.batches_processes = 0
     self.cache_order = []
Example no. 2
class AbstractProcessor(object):
    def __init__(self):
        self.state = None
        self.execution_state = set(['fit', 'transform'])
        self.sample_counter = 0
        self.timer = Timer(silent=True)

    def link_with_pipeline(self, state):
        self.state = state

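    # Run process() and, when benchmark is set, time it and count samples;
    # after 10000 samples the accumulated time is logged once.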
    def abstract_process(self, inputs, inp_type, benchmark):
        if benchmark:
            self.sample_counter += 1
            self.timer.tick()
        result = self.process(inputs, inp_type)
        if benchmark:
            self.timer.tick()
            if self.sample_counter == 10000:
                log.info_once(
                    'Time taken for 10000 samples for input type {0} for processor {1}: '
                    .format(inp_type,
                            type(self).__name__) + '{0} seconds',
                    round(self.timer.tock(), 2))
        return result

    def process(self, inputs, inp_type):
        raise NotImplementedError(
            'Classes that inherit from AbstractProcessor need to implement the process method'
        )
Example no. 3
class ETAHook(AbstractHook, IAtEpochStartObservable):
    def __init__(self, name='', print_every_x_batches=1000):
        super(ETAHook, self).__init__(name, 'ETA', print_every_x_batches)
        self.t = Timer(silent=True)
        self.cumulative_t = 0.0
        self.skipped_first = False

    def get_time_string(self, seconds):
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)
        if h < 0: h = 0
        if m < 0: m = 0
        if s < 0: s = 0
        return "%d:%02d:%02d" % (h, m, s)

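    # Extrapolate the total epoch time from the time spent on the batches
    # processed so far.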
    def calculate_metric(self, state):
        n = state.num_batches
        i = state.current_idx
        cumulative_t = self.t.tick('ETA')
        total_time_estimate = (cumulative_t / i) * n
        self.t.tick('ETA')
        self.cumulative_t = cumulative_t

        return total_time_estimate

    def print_statistic(self):
        if not self.skipped_first:
            # the first estimation is very unreliable for time measures
            self.skipped_first = True
            return 0, 0, 0, 0
        n, lower, m, upper = self.get_confidence_intervals()
        lower -= self.cumulative_t
        m -= self.cumulative_t
        upper -= self.cumulative_t
        lower, m, upper = self.get_time_string(lower), self.get_time_string(
            m), self.get_time_string(upper)
        log.info('{3} {4}: {2}\t99% CI: ({0}, {1}), n={5}'.format(
            lower, upper, m, self.name, self.metric_name, n))
        return lower, upper, m, n

    def at_start_of_epoch_event(self, batcher_state):
        self.t.tick('ETA')
        self.t.tick('Epoch')

    def at_end_of_epoch_event(self, state):
        self.t.tock('ETA')
        epoch_time = self.t.tock('Epoch')
        self.epoch_errors.append([epoch_time])
        log.info('Total epoch time: {0}'.format(
            self.get_time_string(epoch_time)))
        del self.current_scores[:]
        self.n = 0
        self.mean = 0
        self.M2 = 0
        self.skipped_first = False
        self.epoch += 1
        return epoch_time
Example no. 4
 def __init__(self, name='', print_every_x_batches=1000):
     super(ETAHook, self).__init__(name, 'ETA', print_every_x_batches)
     self.t = Timer(silent=True)
     self.cumulative_t = 0.0
     self.skipped_first = False
Example no. 5
import os
import shutil
import json
import zipfile
import numpy as np

from spodernet.preprocessing.vocab import Vocab
from spodernet.utils.util import Timer
from spodernet.preprocessing.processors import SaveLengthsToState
from sklearn.feature_extraction.text import TfidfVectorizer

from spodernet.utils.logger import Logger
log = Logger('pipeline.py.txt')

t = Timer()
class StreamMethods:
    files = 'FILES'
    data = 'DATA'

class DatasetStreamer(object):
    def __init__(self, input_keys=None, output_keys=None, stream_method=StreamMethods.files):
        self.stream_processors = []
        self.input_keys = input_keys or ['input', 'support', 'target']
        self.output_keys = output_keys or self.input_keys
        self.paths = []
        self.stream_method = stream_method
        self.data = []

    def add_stream_processor(self, stream):
        self.stream_processors.append(stream)
Example no. 6
class DataLoaderSlave(threading.Thread):
    def __init__(self,
                 stream_batcher,
                 batchidx2paths,
                 batchidx2start_end,
                 randomize=False,
                 paths=None,
                 shard2batchidx=None,
                 seed=None,
                 shard_fractions=None,
                 cache_size_GB=4):
        super(DataLoaderSlave, self).__init__()
        if randomize:
            assert seed is not None, 'For randomized data loading a seed needs to be set!'
        self.cache_size_GB = cache_size_GB
        self.stream_batcher = stream_batcher
        self.batchidx2paths = batchidx2paths
        self.batchidx2start_end = batchidx2start_end
        self.current_data = {}
        self.randomize = randomize
        self.num_batches = len(list(batchidx2paths.keys()))
        self.rdm = np.random.RandomState(234 + seed)
        self.shard_fractions = shard_fractions
        self.shard2batchidx = shard2batchidx
        self.paths = paths
        self._stop = threading.Event()
        self.daemon = True
        self.t = Timer()
        self.batches_processes = 0
        self.cache_order = []

    def stop(self):
        self._stop.set()

    def stopped(self):
        return self._stop.is_set()

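    # Load any shards referenced by current_paths into the in-memory cache;
    # when randomize is set, newly loaded shards are shuffled with a seeded
    # permutation so that all parts of a shard stay aligned.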
    def load_files_if_needed(self, current_paths):
        if isinstance(current_paths[0], list):
            for paths in current_paths:
                shuffle_idx = None
                for path in paths:
                    if path not in self.current_data:
                        ordered_data = load_data(path)
                        self.cache_order.append(path)
                        if shuffle_idx is None and self.randomize:
                            shuffle_idx = np.arange(ordered_data.shape[0])
                            self.rdm.shuffle(shuffle_idx)

                        if self.randomize:
                            # be careful with pointers here, or we have trouble
                            # with garbage collection
                            data = np.copy(ordered_data[shuffle_idx])
                            del ordered_data
                            self.current_data[path] = data
                        else:
                            self.current_data[path] = ordered_data

                shuffle_idx = None
        else:
            shuffle_idx = None
            for path in current_paths:
                if path not in self.current_data:
                    ordered_data = load_data(path)
                    self.cache_order.append(path)
                    if shuffle_idx is None and self.randomize:
                        shuffle_idx = np.arange(ordered_data.shape[0])
                        self.rdm.shuffle(shuffle_idx)

                    if self.randomize:
                        # be careful with pointers here, or we have trouble
                        # with garbage collection
                        data = np.copy(ordered_data[shuffle_idx])
                        del ordered_data
                        self.current_data[path] = data
                    else:
                        self.current_data[path] = ordered_data

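    # Slice the cached shard arrays into the parts of a single minibatch;
    # a batch that spans two shards is stitched together with hstack/vstack.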
    def create_batch_parts(self, current_paths, start, end):
        # index loaded data for minibatch
        batch_parts = []
        if isinstance(current_paths[0], list):
            start = start[0]
            end = end[1]
            for i in range(len(current_paths[0])):
                x1 = self.current_data[current_paths[0][i]][start:]
                x2 = self.current_data[current_paths[1][i]][:end]
                if len(x1.shape) == 1:
                    x = np.hstack([x1, x2])
                else:
                    x = np.vstack([x1, x2])
                batch_parts.append(x)
        else:
            for path in current_paths:
                batch_parts.append(self.current_data[path][start:end])

        return batch_parts

    def determine_cache_size(self):
        total_bytes = 0
        for path, shard in self.current_data.items():
            total_bytes += shard.nbytes
        return total_bytes / (1024.0**3.0)

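    # Evict cached shards that are not needed for the current batch, oldest
    # first, until cache usage drops below cache_size_GB.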
    def clean_cache(self, current_paths):
        # delete unused cached data
        i = 0
        n = len(self.cache_order)
        while i < n:
            if self.cache_order[i] in current_paths:
                i += 1
                continue
            path = self.cache_order.pop(i)
            self.current_data.pop(path, None)
            GB_usage = self.determine_cache_size()
            n -= 1
            if GB_usage < self.cache_size_GB: break

    def publish_at_prepared_batch_event(self, batch_parts):
        for i, obs in enumerate(
                self.stream_batcher.at_batch_prepared_observers):
            self.t.tick(str(i))
            batch_parts = obs.at_batch_prepared(batch_parts)
            self.t.tick(str(i))
        return batch_parts

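    # Worker loop: take a batch index from the work queue, load the required
    # shards, assemble the batch, run the at-batch-prepared observers, and
    # hand the prepared batch back to the stream batcher.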
    def run(self):
        while not self.stopped():

            # non-blocking get lets us terminate threads gracefully:
            # with daemon threads alone the termination signal might not be
            # heard while files are loading, causing ugly exceptions
            try:
                batch_idx = self.stream_batcher.work.get(block=False)
            except queue.Empty:
                continue

            if self.randomize:
                n = 0
                while (n - self.stream_batcher.batch_size + 1) <= 0:
                    shard_idx = self.rdm.choice(len(
                        list(self.shard2batchidx.keys())),
                                                1,
                                                p=self.shard_fractions)[0]
                    current_paths = self.paths[shard_idx]

                    self.load_files_if_needed(current_paths)

                    n = self.current_data[current_paths[0]].shape[0]
                start = self.rdm.randint(
                    0, n - self.stream_batcher.batch_size + 1)
                end = start + self.stream_batcher.batch_size

                batch_parts = self.create_batch_parts(current_paths, start,
                                                      end)
            else:
                if batch_idx not in self.batchidx2paths:
                    log.error('{0}, {1}', batch_idx,
                              list(self.batchidx2paths.keys()))
                current_paths = self.batchidx2paths[batch_idx]
                start, end = self.batchidx2start_end[batch_idx]

                self.load_files_if_needed(current_paths)
                batch_parts = self.create_batch_parts(current_paths, start,
                                                      end)

            batch_parts = self.publish_at_prepared_batch_event(batch_parts)
            # pass data to streambatcher
            self.stream_batcher.prepared_batches[batch_idx] = batch_parts
            try:
                self.stream_batcher.prepared_batchidx.put(batch_idx,
                                                          block=False)
            except queue.Full:
                continue

            GB_usage = self.determine_cache_size()
            if GB_usage > self.cache_size_GB:
                self.clean_cache(current_paths)
            self.batches_processes += 1
            if self.batches_processes % 100 == 0:
                if benchmark:
                    for i, obs in enumerate(
                            self.stream_batcher.at_batch_prepared_observers):
                        t = self.t.tock(str(i))
Example no. 7
    def __init__(self,
                 pipeline_name,
                 name,
                 batch_size,
                 mnt_name='',
                 loader_threads=4,
                 randomize=False,
                 seed=None,
                 keys=['input', 'support', 'target'],
                 is_volatile=False,
                 cache_size_GB=4):
        config_path = join(get_data_path(mnt_name=mnt_name), pipeline_name,
                           name, 'hdf5_config.pkl')
        if not exists(config_path):
            log.error(
                'Path {0} does not exist! Have you forgotten to preprocess your dataset?',
                config_path)
        config = pickle.load(open(config_path, 'rb'))
        self.paths = config['paths']
        self.fractions = config['fractions']
        self.num_batches = int(np.sum(config['counts']) / batch_size)
        self.max_lengths = config['max_lengths']
        self.batch_size = batch_size
        self.batch_idx = 0
        self.prefetch_batch_idx = 0
        self.loaders = []
        self.prepared_batches = {}
        self.prepared_batchidx = queue.Queue()
        self.work = queue.Queue()
        self.cached_batches = {}
        self.end_iter_observers = []
        self.end_epoch_observers = []
        self.start_epoch_observers = []
        self.at_batch_prepared_observers = []
        self.state = BatcherState()
        self.current_iter = 0
        self.current_epoch = 0
        self.timer = Timer()
        self.loader_threads = loader_threads
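        # register converters that adapt prepared numpy batches to the
        # configured backend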
        if Config.backend == Backends.TORCH:
            from spodernet.backends.torchbackend import TorchConverter, TorchCUDAConverter
            self.subscribe_to_batch_prepared_event(DictConverter(keys))
            self.subscribe_to_batch_prepared_event(TorchConverter(is_volatile))
            if Config.cuda:
                import torch
                self.subscribe_to_batch_prepared_event(
                    TorchCUDAConverter(torch.cuda.current_device()))
        elif Config.backend == Backends.TENSORFLOW:
            from spodernet.backends.tfbackend import TensorFlowConverter
            self.subscribe_to_batch_prepared_event(TensorFlowConverter())
        elif Config.backend == Backends.TEST:
            pass
        elif Config.backend == Backends.CNTK:
            self.subscribe_to_batch_prepared_event(DictConverter(keys))
        else:
            raise Exception('Backend has unsupported value {0}'.format(
                Config.backend))

        batchidx2paths, batchidx2start_end, shard2batchidx = self.create_batchidx_maps(
            config['counts'])

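        # start the loader threads, each with its own deterministic seed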
        for i in range(loader_threads):
            seed = 2345 + (i * 83)
            self.loaders.append(
                DataLoaderSlave(self, batchidx2paths, batchidx2start_end,
                                randomize, self.paths, shard2batchidx, seed,
                                self.fractions, cache_size_GB))
            self.loaders[-1].start()
Example no. 8
 def __init__(self):
     self.state = None
     self.execution_state = set(['fit', 'transform'])
     self.sample_counter = 0
     self.timer = Timer(silent=True)
Example no. 9
from spodernet.utils.global_config import Config
from past.builtins import basestring, long

import numpy as np
import os
import copy
import spacy
import nltk
import json
import pickle

from spodernet.utils.util import Timer
from spodernet.utils.logger import Logger
log = Logger('processors.py.txt')

nlp = spacy.load('en')
timer = Timer()


class KeyToKeyMapper(IAtBatchPreparedObservable):
    def __init__(self, key2key):
        self.key2key = key2key

    def at_batch_prepared(self, batch_parts):
        str2var = batch_parts
        new_str2var = {}
        for key1, key2 in self.key2key.items():
            new_str2var[key2] = str2var[key1]

        return new_str2var