Example #1
 def dump(self, data_list, file_prefix, valid_batches=None, shuffle=True):
     """
     Dump parallel data lists to pickle files, optionally splitting off a validation set.
     :param data_list: list of parallel data lists (one list per field, all the same length)
     :param file_prefix: prefix of the output files
     :param valid_batches: number of items to write to the validation file
     :param shuffle: whether to shuffle each list (with a fixed seed) before dumping
     """
     if shuffle:
         for i in range(len(data_list)):
             random.Random(3).shuffle(data_list[i])
     if valid_batches:
         train_file = open("%s_train.pkl" % file_prefix, "wb")
         valid_file = open("%s_valid.pkl" % file_prefix, "wb")
     else:
         train_file = open("%s.pkl" % file_prefix, "wb")
         valid_file = None
     for i in range(len(data_list[0])):
         if valid_batches and i < valid_batches:
             StreamPickler.dump_one([d[i] for d in data_list], valid_file)
         else:
             StreamPickler.dump_one([d[i] for d in data_list], train_file)
     train_file.close()
     if valid_file:
         valid_file.close()
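A minimal usage sketch for this dump method, assuming it is a method of a dataset-builder object; the builder variable, field contents, and file prefix below are illustrative placeholders, not from the source:

 inputs = [[1, 2], [3, 4], [5, 6], [7, 8]]
 labels = [0, 1, 0, 1]
 # Writes my_data_train.pkl and my_data_valid.pkl; after the seeded shuffle,
 # the first valid_batches items of each field go to the validation file.
 builder.dump([inputs, labels], "my_data", valid_batches=1, shuffle=True)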
Example #2
 def __init__(self,
              train_path,
              valid_path=None,
              test_path=None,
              train_size=None,
              cached=False,
              post_processing=None,
              shuffle_memory=False,
              curriculum=None):
     self._train_path = train_path
     self._valid_path = valid_path
     self._test_path = test_path
     self._train_size = train_size
     self._cache_on_memory = cached
     self._cached_train_data = None
     self._post_processing = post_processing if post_processing else lambda x: x
     self._shuffle_memory = shuffle_memory
     self._curriculum = curriculum
     self._curriculum_count = 0
     if curriculum and not callable(curriculum):
         raise Exception("curriculum function must be callable")
     if curriculum and not cached:
         raise Exception(
             "curriculum learning needs training data to be cached")
     if self._cache_on_memory:
         logging.info("Cache on memory")
         self._cached_train_data = list(
             map(self._post_processing,
                 StreamPickler.load(open(self._train_path, "rb"))))
         self._train_size = len(self._cached_train_data)
         if self._shuffle_memory:
             logging.info("Shuffle on-memory data")
             global_rand.shuffle(self._cached_train_data)
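The constructor in Example #2 requires curriculum to be callable and only allows it together with cached=True. Below is a sketch of wiring one up; the arguments the curriculum callable receives are not shown in these snippets, so its signature, the sorting criterion, and the OnDiskDataset class name are assumptions for illustration:

 def curriculum(cached_data, curriculum_count):
     # Hypothetical callable: reorder cached items, e.g. shortest sequences first.
     return sorted(cached_data, key=lambda item: len(item[0]))

 dataset = OnDiskDataset("train.pkl",
                         valid_path="valid.pkl",
                         cached=True,           # curriculum requires cached data
                         curriculum=curriculum)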
Example #3
 def __init__(self, train_path, valid_path=None, test_path=None, train_size=None, cache_on_memory=False):
     self._train_path = train_path
     self._valid_path = valid_path
     self._test_path = test_path
     self._train_size = train_size
     self._cache_on_memory = cache_on_memory
     self._cached_train_data = None
     if self._cache_on_memory:
         logging.info("Cache on memory")
         self._cached_train_data = list(StreamPickler.load(open(self._train_path, "rb")))
Example #4
 def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
              cached=False, post_processing=None, shuffle_memory=False):
     self._train_path = train_path
     self._valid_path = valid_path
     self._test_path = test_path
     self._train_size = train_size
     self._cache_on_memory = cached
     self._cached_train_data = None
     self._post_processing = post_processing if post_processing else lambda x: x
     self._shuffle_memory = shuffle_memory
     if self._cache_on_memory:
         logging.info("Cache on memory")
         self._cached_train_data = list(map(self._post_processing, StreamPickler.load(open(self._train_path, "rb"))))
         if self._shuffle_memory:
             logging.info("Shuffle on-memory data")
             global_rand.shuffle(self._cached_train_data)
Example #5
 def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
              cached=False, post_processing=None, shuffle_memory=False, curriculum=None):
     self._train_path = train_path
     self._valid_path = valid_path
     self._test_path = test_path
     self._train_size = train_size
     self._cache_on_memory = cached
     self._cached_train_data = None
     self._post_processing = post_processing if post_processing else lambda x: x
     self._shuffle_memory = shuffle_memory
     self._curriculum = curriculum
     self._curriculum_count = 0
     if curriculum and not callable(curriculum):
         raise Exception("curriculum function must be callable")
     if curriculum and not cached:
         raise Exception("curriculum learning needs training data to be cached")
     if self._cache_on_memory:
         logging.info("Cache on memory")
         self._cached_train_data = list(map(self._post_processing, StreamPickler.load(open(self._train_path, "rb"))))
         if self._shuffle_memory:
             logging.info("Shuffle on-memory data")
             global_rand.shuffle(self._cached_train_data)
Example #6
 def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
              cached=False, post_processing=None, shuffle_memory=False, data_processor=None):
     """
     :type data_processor: DataProcessor
     """
     self._train_path = train_path
     self._valid_path = valid_path
     self._test_path = test_path
     self._train_size = train_size
     self._cache_on_memory = cached
     self._cached_train_data = None
     self._post_processing = post_processing if post_processing else lambda x: x
     self._shuffle_memory = shuffle_memory
     self._epoch = 0
     self._data_processor = data_processor
     if data_processor and not isinstance(data_processor, DataProcessor):
         raise Exception("data_processor must be an instance of DataProcessor.")
     if self._cache_on_memory:
         logging.info("Cache on memory")
         self._cached_train_data = list(map(self._post_processing, StreamPickler.load(open(self._train_path, "rb"))))
         self._train_size = len(self._cached_train_data)
         if self._shuffle_memory:
             logging.info("Shuffle on-memory data")
             env.numpy_rand.shuffle(self._cached_train_data)
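Example #6 only verifies that data_processor is a DataProcessor instance; the DataProcessor interface itself is not shown in these snippets. The sketch below assumes a hypothetical hook that receives the split name, epoch number, and a data iterator, mirroring the self._process_data('train', self._epoch, data_source) call in Example #13; the method name and signature are assumptions:

 class LowercaseProcessor(DataProcessor):
     # Hypothetical hook; the real DataProcessor API may differ.
     def process(self, split, epoch, data_iterator):
         for item in data_iterator:
             yield [f.lower() if isinstance(f, str) else f for f in item]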
Example #7
 def __init__(self,
              train_path,
              valid_path=None,
              test_path=None,
              train_size=None,
              cache_on_memory=False,
              post_processing=None,
              shuffle_memory=False):
     self._train_path = train_path
     self._valid_path = valid_path
     self._test_path = test_path
     self._train_size = train_size
     self._cache_on_memory = cache_on_memory
     self._cached_train_data = None
     self._post_processing = post_processing if post_processing else lambda x: x
     self._shuffle_memory = shuffle_memory
     if self._cache_on_memory:
         logging.info("Cache on memory")
         self._cached_train_data = list(
             map(self._post_processing,
                 StreamPickler.load(open(self._train_path, "rb"))))
         if self._shuffle_memory:
             logging.info("Shuffle on-memory data")
             global_rand.shuffle(self._cached_train_data)
Example #8
 def generate_test_data(self):
     for data in StreamPickler.load(open(self._test_path, "rb")):
         yield self._post_processing(data)
Example #9
 def generate_train_data(self):
     for data in StreamPickler.load(open(self._train_path, "rb")):
         if self._skip_amount > 0:
             self._skip_amount -= 1
             continue
         yield self._post_processing(data)
Example #10
 def generate_test_data(self):
     for data in StreamPickler.load(open(self._test_path, "rb")):
         yield data
Example #11
 def generate_valid_data(self):
     for data in StreamPickler.load(open(self._valid_path, "rb")):
         yield data
Example #12
 def generate_test_data(self):
     data_source = StreamPickler.load(open(self._test_path, "rb"))
     for data in self._process_data('test', self._epoch, data_source):
         yield self._post_processing(data)
Example #13
 def generate_train_data(self):
     self._epoch += 1
     data_source = self._cached_train_data if self._cache_on_memory else StreamPickler.load(open(self._train_path, "rb"))
     for data in self._process_data('train', self._epoch, data_source):
         yield self._post_processing(data)
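A minimal sketch of consuming these generators in a training loop; the dataset and model variables and their method calls are illustrative placeholders, not part of the source:

 for epoch in range(10):
     for item in dataset.generate_train_data():
         model.train_on(item)      # placeholder for the actual update step
     for item in dataset.generate_valid_data():
         model.evaluate(item)      # placeholder for validation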