def dump(self, data_list, file_prefix, valid_batches=None, shuffle=True):
    """Dump parallel data lists to stream-pickled files.

    Writes "<prefix>_train.pkl" and "<prefix>_valid.pkl" when
    ``valid_batches`` is given, otherwise a single "<prefix>.pkl".

    :param data_list: list of parallel data lists (one item per batch each)
    :param file_prefix: prefix of output files
    :param valid_batches: number of leading batches routed to the valid file
    :param shuffle: shuffle each list in place before dumping
    """
    if shuffle:
        # Re-seeding with the same constant for every list makes each list
        # receive the identical permutation, keeping parallel lists aligned.
        for items in data_list:
            random.Random(3).shuffle(items)
    if valid_batches:
        train_file = open("%s_train.pkl" % file_prefix, "wb")
        valid_file = open("%s_valid.pkl" % file_prefix, "wb")
    else:
        train_file = open("%s.pkl" % file_prefix, "wb")
        valid_file = None
    try:
        for i in range(len(data_list[0])):
            row = [d[i] for d in data_list]
            if valid_batches and i < valid_batches:
                StreamPickler.dump_one(row, valid_file)
            else:
                StreamPickler.dump_one(row, train_file)
    finally:
        # Close output files even if dumping raises, so no handle leaks.
        train_file.close()
        if valid_file:
            valid_file.close()
def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
             cached=False, post_processing=None, shuffle_memory=False,
             curriculum=None):
    """Initialize the dataset.

    :param train_path: path of the stream-pickled training data
    :param valid_path: path of the validation data
    :param test_path: path of the test data
    :param train_size: number of training batches; overwritten with the
        actual count when data is cached on memory
    :param cached: load the whole training set into memory up front
    :param post_processing: callable applied to every loaded item
        (defaults to identity)
    :param shuffle_memory: shuffle the cached training data once after loading
    :param curriculum: callable implementing curriculum learning; requires
        ``cached`` to be True
    :raises Exception: if curriculum is not callable or data is not cached
    """
    self._train_path = train_path
    self._valid_path = valid_path
    self._test_path = test_path
    self._train_size = train_size
    self._cache_on_memory = cached
    self._cached_train_data = None
    self._post_processing = post_processing if post_processing else lambda x: x
    self._shuffle_memory = shuffle_memory
    self._curriculum = curriculum
    self._curriculum_count = 0
    if curriculum and not callable(curriculum):
        raise Exception("curriculum function must be callable")
    if curriculum and not cached:
        raise Exception(
            "curriculum learning needs training data to be cached")
    if self._cache_on_memory:
        logging.info("Cache on memory")
        # Close the file deterministically instead of leaking the handle.
        # NOTE(review): pickled data normally needs "rb" mode — confirm
        # against StreamPickler before changing the mode here.
        with open(self._train_path) as train_file:
            self._cached_train_data = list(
                map(self._post_processing, StreamPickler.load(train_file)))
        self._train_size = len(self._cached_train_data)
        if self._shuffle_memory:
            logging.info("Shuffle on-memory data")
            global_rand.shuffle(self._cached_train_data)
def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
             cache_on_memory=False):
    """Initialize the dataset.

    :param train_path: path of the stream-pickled training data
    :param valid_path: path of the validation data
    :param test_path: path of the test data
    :param train_size: number of training batches, if known
    :param cache_on_memory: load the whole training set into memory up front
    """
    self._train_path = train_path
    self._valid_path = valid_path
    self._test_path = test_path
    self._train_size = train_size
    self._cache_on_memory = cache_on_memory
    self._cached_train_data = None
    if self._cache_on_memory:
        logging.info("Cache on memory")
        # Close the file deterministically instead of leaking the handle.
        with open(self._train_path) as train_file:
            self._cached_train_data = list(StreamPickler.load(train_file))
def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
             cached=False, post_processing=None, shuffle_memory=False):
    """Initialize the dataset.

    :param train_path: path of the stream-pickled training data
    :param valid_path: path of the validation data
    :param test_path: path of the test data
    :param train_size: number of training batches, if known
    :param cached: load the whole training set into memory up front
    :param post_processing: callable applied to every loaded item
        (defaults to identity)
    :param shuffle_memory: shuffle the cached training data once after loading
    """
    self._train_path = train_path
    self._valid_path = valid_path
    self._test_path = test_path
    self._train_size = train_size
    self._cache_on_memory = cached
    self._cached_train_data = None
    self._post_processing = post_processing if post_processing else lambda x: x
    self._shuffle_memory = shuffle_memory
    if self._cache_on_memory:
        logging.info("Cache on memory")
        # Close the file deterministically instead of leaking the handle.
        with open(self._train_path) as train_file:
            self._cached_train_data = list(
                map(self._post_processing, StreamPickler.load(train_file)))
        if self._shuffle_memory:
            logging.info("Shuffle on-memory data")
            global_rand.shuffle(self._cached_train_data)
def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
             cached=False, post_processing=None, shuffle_memory=False,
             curriculum=None):
    """Initialize the dataset.

    :param train_path: path of the stream-pickled training data
    :param valid_path: path of the validation data
    :param test_path: path of the test data
    :param train_size: number of training batches, if known
    :param cached: load the whole training set into memory up front
    :param post_processing: callable applied to every loaded item
        (defaults to identity)
    :param shuffle_memory: shuffle the cached training data once after loading
    :param curriculum: callable implementing curriculum learning; requires
        ``cached`` to be True
    :raises Exception: if curriculum is not callable or data is not cached
    """
    self._train_path = train_path
    self._valid_path = valid_path
    self._test_path = test_path
    self._train_size = train_size
    self._cache_on_memory = cached
    self._cached_train_data = None
    self._post_processing = post_processing if post_processing else lambda x: x
    self._shuffle_memory = shuffle_memory
    self._curriculum = curriculum
    self._curriculum_count = 0
    if curriculum and not callable(curriculum):
        raise Exception("curriculum function must be callable")
    if curriculum and not cached:
        raise Exception("curriculum learning needs training data to be cached")
    if self._cache_on_memory:
        logging.info("Cache on memory")
        # Close the file deterministically instead of leaking the handle.
        with open(self._train_path) as train_file:
            self._cached_train_data = list(
                map(self._post_processing, StreamPickler.load(train_file)))
        if self._shuffle_memory:
            logging.info("Shuffle on-memory data")
            global_rand.shuffle(self._cached_train_data)
def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
             cached=False, post_processing=None, shuffle_memory=False,
             data_processor=None):
    """Initialize the dataset.

    :param train_path: path of the stream-pickled training data
    :param valid_path: path of the validation data
    :param test_path: path of the test data
    :param train_size: number of training batches; overwritten with the
        actual count when data is cached on memory
    :param cached: load the whole training set into memory up front
    :param post_processing: callable applied to every loaded item
        (defaults to identity)
    :param shuffle_memory: shuffle the cached training data once after loading
    :param data_processor: optional processor applied to the data stream
    :type data_processor: DataProcessor
    :raises Exception: if data_processor is not a DataProcessor instance
    """
    self._train_path = train_path
    self._valid_path = valid_path
    self._test_path = test_path
    self._train_size = train_size
    self._cache_on_memory = cached
    self._cached_train_data = None
    self._post_processing = post_processing if post_processing else lambda x: x
    self._shuffle_memory = shuffle_memory
    self._epoch = 0
    self._data_processor = data_processor
    if data_processor and not isinstance(data_processor, DataProcessor):
        raise Exception("data_processor must be an instance of DataProcessor.")
    if self._cache_on_memory:
        logging.info("Cache on memory")
        # Close the file deterministically instead of leaking the handle.
        with open(self._train_path) as train_file:
            self._cached_train_data = list(
                map(self._post_processing, StreamPickler.load(train_file)))
        self._train_size = len(self._cached_train_data)
        if self._shuffle_memory:
            logging.info("Shuffle on-memory data")
            env.numpy_rand.shuffle(self._cached_train_data)
def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
             cache_on_memory=False, post_processing=None, shuffle_memory=False):
    """Initialize the dataset.

    :param train_path: path of the stream-pickled training data
    :param valid_path: path of the validation data
    :param test_path: path of the test data
    :param train_size: number of training batches, if known
    :param cache_on_memory: load the whole training set into memory up front
    :param post_processing: callable applied to every loaded item
        (defaults to identity)
    :param shuffle_memory: shuffle the cached training data once after loading
    """
    self._train_path = train_path
    self._valid_path = valid_path
    self._test_path = test_path
    self._train_size = train_size
    self._cache_on_memory = cache_on_memory
    self._cached_train_data = None
    self._post_processing = post_processing if post_processing else lambda x: x
    self._shuffle_memory = shuffle_memory
    if self._cache_on_memory:
        logging.info("Cache on memory")
        # Close the file deterministically instead of leaking the handle.
        with open(self._train_path) as train_file:
            self._cached_train_data = list(
                map(self._post_processing, StreamPickler.load(train_file)))
        if self._shuffle_memory:
            logging.info("Shuffle on-memory data")
            global_rand.shuffle(self._cached_train_data)
def generate_test_data(self):
    """Yield post-processed test items streamed from the test file.

    The file is closed when the generator is exhausted or closed, instead
    of leaking the handle until garbage collection.
    """
    with open(self._test_path) as test_file:
        for data in StreamPickler.load(test_file):
            yield self._post_processing(data)
def generate_train_data(self):
    """Yield post-processed training items streamed from the train file.

    Consumes ``self._skip_amount`` leading items without yielding them
    (used to resume training mid-epoch). The file is closed when the
    generator is exhausted or closed, instead of leaking the handle.
    """
    with open(self._train_path) as train_file:
        for data in StreamPickler.load(train_file):
            if self._skip_amount > 0:
                self._skip_amount -= 1
                continue
            yield self._post_processing(data)
def generate_test_data(self):
    """Yield test items streamed from the test file.

    The file is closed when the generator is exhausted or closed, instead
    of leaking the handle until garbage collection.
    """
    with open(self._test_path) as test_file:
        for data in StreamPickler.load(test_file):
            yield data
def generate_valid_data(self):
    """Yield validation items streamed from the valid file.

    The file is closed when the generator is exhausted or closed, instead
    of leaking the handle until garbage collection.
    """
    with open(self._valid_path) as valid_file:
        for data in StreamPickler.load(valid_file):
            yield data
def generate_test_data(self):
    """Yield post-processed test items, routed through the data processor.

    The stream is passed to ``self._process_data`` tagged as 'test' for the
    current epoch. The file is closed when the generator is exhausted or
    closed, instead of leaking the handle.
    """
    with open(self._test_path) as test_file:
        data_source = StreamPickler.load(test_file)
        for data in self._process_data('test', self._epoch, data_source):
            yield self._post_processing(data)
def generate_train_data(self):
    """Yield post-processed training items for one epoch.

    Increments the epoch counter, then reads either the in-memory cache or
    the on-disk stream, routing it through ``self._process_data`` tagged as
    'train'. When reading from disk, the file is closed once the generator
    is exhausted or closed, instead of leaking the handle.
    """
    self._epoch += 1
    if self._cache_on_memory:
        for data in self._process_data('train', self._epoch,
                                       self._cached_train_data):
            yield self._post_processing(data)
    else:
        with open(self._train_path) as train_file:
            data_source = StreamPickler.load(train_file)
            for data in self._process_data('train', self._epoch, data_source):
                yield self._post_processing(data)