def generate_one_preprocessed_sample(self):
    """
    Yield preprocessed samples from the dataset one by one.

    If all samples have already been preprocessed, they are yielded from
    memory; otherwise each sample is preprocessed on the fly with
    `self.preprocessor` (which must implement a `preprocess` method) and
    cached in `self._preprocessed_samples`.

    Yields:
        (preprocessed_sentence, entities, data): one preprocessed sample at a time.
    """
    if len(self._preprocessed_samples) == len(self._data):
        # Everything is already preprocessed: serve samples from memory.
        for preprocessed_sentence, entities, data in self._preprocessed_samples:
            logging.debug(
                "Yielding Following Data from memory:\npreprocessed_sentence:{}\nentities:{}\ndata:{}"
                .format(preprocessed_sentence.encode("utf8"), entities,
                        data.encode("utf8")))
            yield preprocessed_sentence, entities, data
    else:
        # Preprocess lazily and rebuild the in-memory cache.
        self._preprocessed_samples = []
        for data in self.generate_one_sample():
            if self.data_type == "one_line_one_sentence":
                preprocessed_sentence, entities = self.preprocessor.preprocess(data)
                logging.debug(
                    "Yielding and Saving Following Data:\npreprocessed_sentence:{}\nentities:{}\ndata:{}"
                    .format(preprocessed_sentence.encode("utf8"), entities,
                            data.encode("utf8")))
                self._preprocessed_samples.append(
                    (preprocessed_sentence, entities, data))
                yield preprocessed_sentence, entities, data
            else:
                raise NotImplementedError
def generate_one_sample(self):
    """
    Yield raw data samples one by one.
    """
    if self.data_type == "one_line_one_sentence":
        for data in self._data:
            logging.debug("Yielding sentence: {}".format(data.encode("utf8")))
            yield data
    else:
        raise NotImplementedError
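# Usage sketch (assumption): the two generators above are assumed to be methods
# of a dataset class holding `self._data` (one unicode sentence per entry),
# `self.data_type`, `self.preprocessor`, and `self._file_path`. The helper below
# is hypothetical and only illustrates how the generators might be consumed.
def _example_iterate_samples(dataset):
    # Stream raw sentences first, then the preprocessed triples.
    for sentence in dataset.generate_one_sample():
        print(sentence)
    for preprocessed_sentence, entities, data in dataset.generate_one_preprocessed_sample():
        print(preprocessed_sentence, entities)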
def _train(cls, config, train_state, examples):
    """Run the training loop indefinitely, periodically evaluating, dumping
    interpolations (for the SVAE model), and checkpointing."""
    model = train_state.model
    optimizer = train_state.optimizer
    # Group training examples into batches of roughly similar length.
    train_batches = similar_size_batches(examples.train,
                                         config.optim.batch_size,
                                         size=lambda ex: len(ex))
    while True:
        random.shuffle(train_batches)
        i = 0  # cannot enumerate(verboserate(...))
        for batch in verboserate(train_batches,
                                 desc='Streaming training examples'):
            loss = model.loss(batch, cls._train_state.train_steps)
            cls._take_grad_step(train_state, loss)
            if (i % 5) == 0:
                cls.evaluate()
            if (i % 1000) == 0:
                if config.model.type == 1:  # SVAE
                    # Write interpolations between random example pairs to file.
                    fname = "interps_batches_{}".format(i)
                    num_ex = 10
                    a_idx = np.random.randint(len(batch), size=num_ex)
                    b_idx = np.random.randint(len(batch), size=num_ex)
                    interps = []
                    for a, b in zip(a_idx, b_idx):
                        ex_a = batch[a]
                        ex_b = batch[b]
                        interpolation = model._interpolate_examples(ex_a, ex_b)
                        interpolation_repr = []
                        interpolation_repr.append(" ".join(ex_a))
                        interpolation_repr.extend(
                            [" ".join(ex) for ex in interpolation])
                        interpolation_repr.append(" ".join(ex_b))
                        interps.append(interpolation_repr)
                    with open(join(cls._interps_dir, fname), 'w') as fout:
                        data = "\n\n".join(
                            ["\n".join(ex) for ex in interps])
                        fout.write(data.encode('utf-8'))
            if (i % 5000) == 0:
                cls.checkpoints.save(train_state)
            i += 1
def preprocess_all(self, force_preprocessing):
    """
    Preprocess all the data, or load it from a pickle file.

    Args:
        force_preprocessing: bool, whether to force preprocessing or to
            load automatically from the pickle file if it exists.
    """
    if not force_preprocessing:
        try:
            with open(self._file_path + "_preprocessed.pickle", "rb") as f:
                self._preprocessed_samples = pickle.load(f)
            logging.info(
                "Preprocessed sample pickle file successfully loaded.")
            return True
        except IOError:
            logging.info("Couldn't find a preprocessed pickle file.")
    logging.info("Preprocessing all the data and saving a new pickle.")
    for data in self.generate_one_sample():
        if self.data_type == "one_line_one_sentence":
            preprocessed_sentence, entities = self.preprocessor.preprocess(data)
            logging.debug(
                "Saving Following Preprocessed Data:\npreprocessed_sentence:{}\nentities:{}\ndata:{}"
                .format(preprocessed_sentence.encode("utf8"), entities,
                        data.encode("utf8")))
            self._preprocessed_samples.append(
                (preprocessed_sentence, entities, data))
        else:
            raise NotImplementedError
    with open(self._file_path + "_preprocessed.pickle", "wb") as f:
        pickle.dump(self._preprocessed_samples, f)
    logging.info(
        "Preprocessing done and preprocessed samples saved in `{}`".format(
            self._file_path + "_preprocessed.pickle"))
    return True
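# Caching sketch (assumption): `preprocess_all` persists the preprocessed
# samples next to the raw corpus as `<file_path>_preprocessed.pickle`, so later
# runs can load them instead of re-preprocessing. The helper below is
# hypothetical and only illustrates the intended call order.
def _example_prepare(dataset, force_preprocessing=False):
    # Either loads the existing pickle or preprocesses everything and saves it ...
    dataset.preprocess_all(force_preprocessing)
    # ... after which the generator above serves the cached samples from memory.
    return list(dataset.generate_one_preprocessed_sample())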