class AbstractProcessor(object):
    def __init__(self):
        self.state = None
        self.execution_state = set(['fit', 'transform'])
        self.sample_counter = 0
        self.timer = Timer(silent=True)

    def link_with_pipeline(self, state):
        self.state = state

    def abstract_process(self, inputs, inp_type, benchmark):
        # wraps the subclass's process() call with optional timing
        if benchmark:
            self.sample_counter += 1
            self.timer.tick()
        result = self.process(inputs, inp_type)
        if benchmark:
            self.timer.tick()
            if self.sample_counter == 10000:
                log.info_once(
                    'Time taken for 10000 samples for input type {0} for processor {1}: '
                    .format(inp_type, type(self).__name__) + '{0} seconds',
                    round(self.timer.tock(), 2))
        return result

    def process(self, inputs, inp_type):
        raise NotImplementedError(
            'Classes that inherit from AbstractProcessor need to implement the process method')
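# A minimal sketch of a concrete processor, assuming only the contract above
# (subclasses implement process(inputs, inp_type); abstract_process adds the
# timing). The name LowercaseProcessor and its behaviour are illustrative and
# not part of the library.
class LowercaseProcessor(AbstractProcessor):
    def __init__(self):
        super(LowercaseProcessor, self).__init__()

    def process(self, inputs, inp_type):
        # lowercase string inputs, pass everything else through unchanged
        if isinstance(inputs, str):
            return inputs.lower()
        return inputs

# e.g. LowercaseProcessor().process('Hello World', 'input') -> 'hello world'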
class ETAHook(AbstractHook, IAtEpochStartObservable):
    def __init__(self, name='', print_every_x_batches=1000):
        super(ETAHook, self).__init__(name, 'ETA', print_every_x_batches)
        self.t = Timer(silent=True)
        self.cumulative_t = 0.0
        self.skipped_first = False

    def get_time_string(self, seconds):
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)
        h, m, s = max(h, 0), max(m, 0), max(s, 0)
        return "%d:%02d:%02d" % (h, m, s)

    def calculate_metric(self, state):
        n = state.num_batches
        i = state.current_idx

        cumulative_t = self.t.tick('ETA')
        total_time_estimate = (cumulative_t / i) * n
        self.t.tick('ETA')
        self.cumulative_t = cumulative_t

        return total_time_estimate

    def print_statistic(self):
        if not self.skipped_first:
            # the first estimation is very unreliable for time measures
            self.skipped_first = True
            return 0, 0, 0, 0
        n, lower, m, upper = self.get_confidence_intervals()
        lower -= self.cumulative_t
        m -= self.cumulative_t
        upper -= self.cumulative_t
        lower, m, upper = self.get_time_string(lower), self.get_time_string(m), self.get_time_string(upper)
        log.info('{3} {4}: {2}\t99% CI: ({0}, {1}), n={5}'.format(
            lower, upper, m, self.name, self.metric_name, n))
        return lower, upper, m, n

    def at_start_of_epoch_event(self, batcher_state):
        self.t.tick('ETA')
        self.t.tick('Epoch')

    def at_end_of_epoch_event(self, state):
        self.t.tock('ETA')
        epoch_time = self.t.tock('Epoch')
        self.epoch_errors.append([epoch_time])
        log.info('Total epoch time: {0}'.format(self.get_time_string(epoch_time)))
        del self.current_scores[:]
        self.n = 0
        self.mean = 0
        self.M2 = 0
        self.skipped_first = False
        self.epoch += 1
        return epoch_time
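# Worked example of the estimate computed by calculate_metric and the
# formatting done by get_time_string above; the numbers are made up. If 25 of
# 100 batches have taken 90 seconds so far, the extrapolated total is
# (90 / 25) * 100 = 360 seconds, which get_time_string renders as "0:06:00".
seconds = (90.0 / 25) * 100
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
assert "%d:%02d:%02d" % (h, m, s) == "0:06:00"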
import os
import shutil
import json
import zipfile

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from spodernet.preprocessing.vocab import Vocab
from spodernet.preprocessing.processors import SaveLengthsToState
from spodernet.utils.util import Timer
from spodernet.utils.logger import Logger

log = Logger('pipeline.py.txt')

t = Timer()


class StreamMethods:
    files = 'FILES'
    data = 'DATA'


class DatasetStreamer(object):
    def __init__(self, input_keys=None, output_keys=None, stream_method=StreamMethods.files):
        self.stream_processors = []
        self.input_keys = input_keys or ['input', 'support', 'target']
        self.output_keys = output_keys or self.input_keys
        self.paths = []
        self.stream_method = stream_method
        self.data = []

    def add_stream_processor(self, stream):
        self.stream_processors.append(stream)
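# A minimal usage sketch for DatasetStreamer, using only the constructor and
# add_stream_processor shown above. DummyStreamProcessor is a hypothetical
# placeholder; the real stream processors live in
# spodernet.preprocessing.processors and follow their own interface.
class DummyStreamProcessor(object):
    pass

streamer = DatasetStreamer(input_keys=['input', 'target'], stream_method=StreamMethods.data)
streamer.add_stream_processor(DummyStreamProcessor())
# output_keys falls back to input_keys when not given
assert streamer.output_keys == ['input', 'target']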
class DataLoaderSlave(threading.Thread):
    def __init__(self, stream_batcher, batchidx2paths, batchidx2start_end, randomize=False, paths=None,
                 shard2batchidx=None, seed=None, shard_fractions=None, cache_size_GB=4):
        super(DataLoaderSlave, self).__init__()
        if randomize:
            assert seed is not None, 'For randomized data loading a seed needs to be set!'
        self.cache_size_GB = cache_size_GB
        self.stream_batcher = stream_batcher
        self.batchidx2paths = batchidx2paths
        self.batchidx2start_end = batchidx2start_end
        self.current_data = {}
        self.randomize = randomize
        self.num_batches = len(list(batchidx2paths.keys()))
        self.rdm = np.random.RandomState(234 + seed)
        self.shard_fractions = shard_fractions
        self.shard2batchidx = shard2batchidx
        self.paths = paths
        self._stop = threading.Event()
        self.daemon = True
        self.t = Timer()
        self.batches_processes = 0
        self.cache_order = []

    def stop(self):
        self._stop.set()

    def stopped(self):
        return self._stop.is_set()

    def load_files_if_needed(self, current_paths):
        if isinstance(current_paths[0], list):
            # batch spans two shards: current_paths is a pair of path lists
            for paths in current_paths:
                shuffle_idx = None
                for path in paths:
                    if path not in self.current_data:
                        ordered_data = load_data(path)
                        self.cache_order.append(path)
                        if shuffle_idx is None and self.randomize:
                            shuffle_idx = np.arange(ordered_data.shape[0])
                            self.rdm.shuffle(shuffle_idx)
                        if self.randomize:
                            # be careful with pointers here, or we have trouble
                            # with garbage collection
                            data = np.copy(ordered_data[shuffle_idx])
                            del ordered_data
                            self.current_data[path] = data
                        else:
                            self.current_data[path] = ordered_data
                shuffle_idx = None
        else:
            shuffle_idx = None
            for path in current_paths:
                if path not in self.current_data:
                    ordered_data = load_data(path)
                    self.cache_order.append(path)
                    if shuffle_idx is None and self.randomize:
                        shuffle_idx = np.arange(ordered_data.shape[0])
                        self.rdm.shuffle(shuffle_idx)
                    if self.randomize:
                        # be careful with pointers here, or we have trouble
                        # with garbage collection
                        data = np.copy(ordered_data[shuffle_idx])
                        del ordered_data
                        self.current_data[path] = data
                    else:
                        self.current_data[path] = ordered_data

    def create_batch_parts(self, current_paths, start, end):
        # index loaded data for minibatch
        batch_parts = []
        if isinstance(current_paths[0], list):
            # batch straddles a shard boundary: take the tail of the first
            # shard and the head of the second one
            start = start[0]
            end = end[1]
            for i in range(len(current_paths[0])):
                x1 = self.current_data[current_paths[0][i]][start:]
                x2 = self.current_data[current_paths[1][i]][:end]
                if len(x1.shape) == 1:
                    x = np.hstack([x1, x2])
                else:
                    x = np.vstack([x1, x2])
                batch_parts.append(x)
        else:
            for path in current_paths:
                batch_parts.append(self.current_data[path][start:end])
        return batch_parts

    def determine_cache_size(self):
        total_bytes = 0
        for path, shard in self.current_data.items():
            total_bytes += shard.nbytes
        return total_bytes / (1024.0**3.0)

    def clean_cache(self, current_paths):
        # delete unused cached data
        i = 0
        n = len(self.cache_order)
        while i < n:
            if self.cache_order[i] in current_paths:
                i += 1
                continue
            path = self.cache_order.pop(i)
            self.current_data.pop(path, None)
            GB_usage = self.determine_cache_size()
            n -= 1
            if GB_usage < self.cache_size_GB:
                break

    def publish_at_prepared_batch_event(self, batch_parts):
        for i, obs in enumerate(self.stream_batcher.at_batch_prepared_observers):
            self.t.tick(str(i))
            batch_parts = obs.at_batch_prepared(batch_parts)
            self.t.tick(str(i))
        return batch_parts

    def run(self):
        while not self.stopped():
            # we have this to terminate threads gracefully
            # if we use daemons then the termination signal might not be heard while loading files
            # thus causing ugly exceptions
            try:
                batch_idx = self.stream_batcher.work.get(block=False, timeout=1.0)
            except:
                # work queue is empty: check the stop flag and retry
                continue

            if self.randomize:
                n = 0
                while (n - self.stream_batcher.batch_size + 1) <= 0:
                    shard_idx = self.rdm.choice(len(list(self.shard2batchidx.keys())), 1,
                                                p=self.shard_fractions)[0]
                    current_paths = self.paths[shard_idx]
                    self.load_files_if_needed(current_paths)
                    n = self.current_data[current_paths[0]].shape[0]
                start = self.rdm.randint(0, n - self.stream_batcher.batch_size + 1)
                end = start + self.stream_batcher.batch_size
                batch_parts = self.create_batch_parts(current_paths, start, end)
            else:
                if batch_idx not in self.batchidx2paths:
                    log.error('{0}, {1}', batch_idx, list(self.batchidx2paths.keys()))
                current_paths = self.batchidx2paths[batch_idx]
                start, end = self.batchidx2start_end[batch_idx]

                self.load_files_if_needed(current_paths)
                batch_parts = self.create_batch_parts(current_paths, start, end)

            batch_parts = self.publish_at_prepared_batch_event(batch_parts)

            # pass data to streambatcher
            self.stream_batcher.prepared_batches[batch_idx] = batch_parts
            try:
                self.stream_batcher.prepared_batchidx.put(batch_idx, block=False, timeout=1.0)
            except:
                continue

            GB_usage = self.determine_cache_size()
            if GB_usage > self.cache_size_GB:
                self.clean_cache(current_paths)

            self.batches_processes += 1
            if self.batches_processes % 100 == 0:
                # the benchmark flag is defined at module level (not shown in this excerpt)
                if benchmark:
                    for i, obs in enumerate(self.stream_batcher.at_batch_prepared_observers):
                        self.t.tock(str(i))
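# The handshake implemented by run() above: the batcher puts batch indices on
# stream_batcher.work, a slave thread loads the shards, runs the
# at_batch_prepared observers, stores the result under
# stream_batcher.prepared_batches[batch_idx] and announces the index on
# stream_batcher.prepared_batchidx. A sketch of the consuming side, assuming
# only those attributes (they appear in the batcher __init__ below);
# fetch_prepared_batch is an illustrative helper, not library API.
def fetch_prepared_batch(stream_batcher):
    # block until any loader thread announces a finished batch index
    batch_idx = stream_batcher.prepared_batchidx.get()
    # claim the prepared data and free the slot
    return stream_batcher.prepared_batches.pop(batch_idx)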
def __init__(self, pipeline_name, name, batch_size, mnt_name='', loader_threads=4, randomize=False, seed=None,
             keys=['input', 'support', 'target'], is_volatile=False, cache_size_GB=4):
    config_path = join(get_data_path(mnt_name=mnt_name), pipeline_name, name, 'hdf5_config.pkl')
    if not exists(config_path):
        log.error('Path {0} does not exist! Have you forgotten to preprocess your dataset?', config_path)
    config = pickle.load(open(config_path, 'rb'))
    self.paths = config['paths']
    self.fractions = config['fractions']
    self.num_batches = int(np.sum(config['counts']) / batch_size)
    self.max_lengths = config['max_lengths']
    self.batch_size = batch_size
    self.batch_idx = 0
    self.prefetch_batch_idx = 0
    self.loaders = []
    self.prepared_batches = {}
    self.prepared_batchidx = queue.Queue()
    self.work = queue.Queue()
    self.cached_batches = {}
    self.end_iter_observers = []
    self.end_epoch_observers = []
    self.start_epoch_observers = []
    self.at_batch_prepared_observers = []
    self.state = BatcherState()
    self.current_iter = 0
    self.current_epoch = 0
    self.timer = Timer()
    self.loader_threads = loader_threads

    if Config.backend == Backends.TORCH:
        from spodernet.backends.torchbackend import TorchConverter, TorchCUDAConverter
        self.subscribe_to_batch_prepared_event(DictConverter(keys))
        self.subscribe_to_batch_prepared_event(TorchConverter(is_volatile))
        if Config.cuda:
            import torch
            self.subscribe_to_batch_prepared_event(TorchCUDAConverter(torch.cuda.current_device()))
    elif Config.backend == Backends.TENSORFLOW:
        from spodernet.backends.tfbackend import TensorFlowConverter
        self.subscribe_to_batch_prepared_event(TensorFlowConverter())
    elif Config.backend == Backends.TEST:
        pass
    elif Config.backend == Backends.CNTK:
        self.subscribe_to_batch_prepared_event(DictConverter(keys))
    else:
        raise Exception('Backend has unsupported value {0}'.format(Config.backend))

    batchidx2paths, batchidx2start_end, shard2batchidx = self.create_batchidx_maps(config['counts'])

    for i in range(loader_threads):
        seed = 2345 + (i * 83)
        self.loaders.append(
            DataLoaderSlave(self, batchidx2paths, batchidx2start_end, randomize, self.paths,
                            shard2batchidx, seed, self.fractions, cache_size_GB))
        self.loaders[-1].start()
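# Two derived quantities from the constructor above, with made-up numbers:
# counts = [50000, 30000] samples across two shards and batch_size = 128 give
# int(80000 / 128) = 625 batches, and the loader threads receive deterministic
# but distinct seeds 2345 + i * 83 (each DataLoaderSlave then adds 234 for its
# RandomState).
import numpy as np
counts = [50000, 30000]
batch_size = 128
assert int(np.sum(counts) / batch_size) == 625
assert [2345 + (i * 83) for i in range(4)] == [2345, 2428, 2511, 2594]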
import os
import copy
import json
import pickle

import numpy as np
import spacy
import nltk
from past.builtins import basestring, long

from spodernet.utils.global_config import Config
from spodernet.utils.util import Timer
from spodernet.utils.logger import Logger

log = Logger('processors.py.txt')

nlp = spacy.load('en')
timer = Timer()


class KeyToKeyMapper(IAtBatchPreparedObservable):
    def __init__(self, key2key):
        self.key2key = key2key

    def at_batch_prepared(self, batch_parts):
        str2var = batch_parts
        new_str2var = {}
        for key1, key2 in self.key2key.items():
            new_str2var[key2] = str2var[key1]
        return new_str2var
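# A small usage sketch for KeyToKeyMapper, assuming the batch arrives as a dict
# keyed by string names (as produced by a DictConverter-style observer); the
# keys and values below are illustrative.
mapper = KeyToKeyMapper({'input': 'question', 'target': 'answer'})
batch = {'input': [1, 2, 3], 'target': [0]}
renamed = mapper.at_batch_prepared(batch)
assert set(renamed.keys()) == set(['question', 'answer'])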