def write(cls, path, records: Iterator[ParallelSeqRecord]):
    if path.exists():
        log.warning(f"Overwriting {path} with new records")
        os.remove(str(path))
    maybe_tmp = IO.maybe_tmpfs(path)
    log.info(f'Creating {maybe_tmp}')
    conn = sqlite3.connect(str(maybe_tmp))
    cur = conn.cursor()
    cur.execute(cls.TABLE_STATEMENT)
    cur.execute(cls.INDEX_X_LEN)
    cur.execute(cls.INDEX_Y_LEN)
    cur.execute(f"PRAGMA user_version = {cls.CUR_VERSION};")

    count = 0
    for x_seq, y_seq in records:
        # use numpy arrays; they serialize a lot more efficiently
        if not isinstance(x_seq, np.ndarray):
            x_seq = np.array(x_seq, dtype=np.int32)
        if y_seq is not None and not isinstance(y_seq, np.ndarray):
            y_seq = np.array(y_seq, dtype=np.int32)
        values = (x_seq.tobytes(),
                  None if y_seq is None else y_seq.tobytes(),
                  len(x_seq),
                  len(y_seq) if y_seq is not None else -1)
        cur.execute(cls.INSERT_STMT, values)
        count += 1
    cur.close()
    conn.commit()
    conn.close()   # close before copying, so the copy sees a fully flushed file
    if maybe_tmp != path:
        # bring the file back to the original location where it should be
        IO.copy_file(maybe_tmp, path)
    log.info(f"Stored {count} rows in {path}")
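@classmethod
def read_all(cls, path) -> Iterator[ParallelSeqRecord]:
    # A minimal reader sketch showing how the blobs stored by write() round-trip.
    # This helper and the "SELECT x, y FROM data" query are hypothetical additions
    # (not part of the original API): the real table and column names come from
    # cls.TABLE_STATEMENT, so adjust the query to match. Decoding must use the
    # same dtype that write() used when encoding (np.int32).
    conn = sqlite3.connect(str(path))
    try:
        for x_blob, y_blob in conn.execute("SELECT x, y FROM data"):
            x_seq = np.frombuffer(x_blob, dtype=np.int32)
            y_seq = None if y_blob is None else np.frombuffer(y_blob, dtype=np.int32)
            yield x_seq, y_seq
    finally:
        conn.close()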
def get_train_data(self, batch_size: Union[int, Tuple[int, int]], steps: int = 0,
                   sort_by='eq_len_rand_batch', batch_first=True, shuffle=False,
                   fine_tune=False, keep_in_mem=False, split_ratio: float = 0.,
                   dynamic_epoch=False):
    data_path = self.train_db if self.train_db.exists() else self.train_file
    if fine_tune:
        if not self.finetune_file.exists():
            # user may have added the fine-tuning files after initial preparation
            self._pre_process_parallel('finetune_src', 'finetune_tgt', self.finetune_file)
        log.info("Using fine-tuning corpus instead of training corpus")
        data_path = self.finetune_file

    if split_ratio > 0:
        data_path = IO.maybe_tmpfs(data_path)
        train_file = data_path.with_suffix('.db.tmp')
        file_creator = partial(self.file_creator, train_file=train_file,
                               split_ratio=split_ratio)
        train_data = GenerativeBatchIterable(
            file_creator=file_creator, batches=steps, batch_size=batch_size,
            field=self.tgt_vocab, dynamic_epoch=dynamic_epoch, batch_first=batch_first,
            shuffle=shuffle, sort_by=sort_by, **self._get_batch_args())
    else:
        data = BatchIterable(
            data_path=data_path, batch_size=batch_size, field=self.tgt_vocab,
            sort_by=sort_by, batch_first=batch_first, shuffle=shuffle,
            **self._get_batch_args())
        train_data = LoopingIterable(data, steps)
    return train_data
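# Usage sketch (hypothetical caller code; `exp` stands for an instance of this
# class, and the step/batch-size values are arbitrary). The returned iterable
# yields batches until `steps` is exhausted:
#
#   train_data = exp.get_train_data(batch_size=4096, steps=100_000, shuffle=True)
#   for batch in train_data:
#       ...  # feed the batch to the trainer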
def get_combo_data(self, batch_size: int, steps: int = 0, sort_desc=False,
                   batch_first=True, shuffle=False):
    if not self.combo_file.exists():
        # user may have added the combo files after initial preparation
        self._pre_process_parallel('combo_src', 'combo_tgt', self.combo_file)
    combo_file = IO.maybe_tmpfs(self.combo_file)
    data = BatchIterable(
        combo_file, batch_size=batch_size, sort_desc=sort_desc, field=self.tgt_vocab,
        batch_first=batch_first, shuffle=shuffle, **self._get_batch_args())
    if steps > 0:
        data = LoopingIterable(data, steps)
    return data