Example #1
    # Requires os, sqlite3, numpy as np, typing.Iterator, and this project's
    # log and IO helpers; the imports are elided in this excerpt.
    @classmethod
    def write(cls, path, records: Iterator[ParallelSeqRecord]):
        if path.exists():
            log.warning(f"Overwriting {path} with new records")
            os.remove(str(path))
        maybe_tmp = IO.maybe_tmpfs(path)
        log.info(f'Creating {maybe_tmp}')
        conn = sqlite3.connect(str(maybe_tmp))
        cur = conn.cursor()
        cur.execute(cls.TABLE_STATEMENT)
        cur.execute(cls.INDEX_X_LEN)
        cur.execute(cls.INDEX_Y_LEN)
        cur.execute(f"PRAGMA user_version = {cls.CUR_VERSION};")

        count = 0
        for x_seq, y_seq in records:
            # use numpy; it's a lot more efficient
            if not isinstance(x_seq, np.ndarray):
                x_seq = np.array(x_seq, dtype=np.int32)
            if y_seq is not None and not isinstance(y_seq, np.ndarray):
                y_seq = np.array(y_seq, dtype=np.int32)
            values = (x_seq.tobytes(),
                      None if y_seq is None else y_seq.tobytes(), len(x_seq),
                      len(y_seq) if y_seq is not None else -1)
            cur.execute(cls.INSERT_STMT, values)
            count += 1
        cur.close()
        conn.commit()
        conn.close()  # flush and release the db before the copy below
        if maybe_tmp != path:
            # copy the file back to its original location
            IO.copy_file(maybe_tmp, path)
        log.info(f"stored {count} rows in {path}")
Example #2
File: exp.py Project: isi-nlp/rtg
    def get_train_data(self,
                       batch_size: Union[int, Tuple[int, int]],
                       steps: int = 0,
                       sort_by='eq_len_rand_batch',
                       batch_first=True,
                       shuffle=False,
                       fine_tune=False,
                       keep_in_mem=False,
                       split_ratio: float = 0.,
                       dynamic_epoch=False):

        data_path = self.train_db if self.train_db.exists() else self.train_file
        if fine_tune:
            if not self.finetune_file.exists():
                # the user may have added the fine-tune file later
                self._pre_process_parallel('finetune_src', 'finetune_tgt',
                                           self.finetune_file)
            log.info("Using Fine tuning corpus instead of training corpus")
            data_path = self.finetune_file

        if split_ratio > 0:
            data_path = IO.maybe_tmpfs(data_path)
            train_file = data_path.with_suffix('.db.tmp')
            file_creator = partial(self.file_creator,
                                   train_file=train_file,
                                   split_ratio=split_ratio)
            train_data = GenerativeBatchIterable(file_creator=file_creator,
                                                 batches=steps,
                                                 batch_size=batch_size,
                                                 field=self.tgt_vocab,
                                                 dynamic_epoch=dynamic_epoch,
                                                 batch_first=batch_first,
                                                 shuffle=shuffle,
                                                 sort_by=sort_by,
                                                 **self._get_batch_args())
        else:
            data = BatchIterable(data_path=data_path,
                                 batch_size=batch_size,
                                 field=self.tgt_vocab,
                                 sort_by=sort_by,
                                 batch_first=batch_first,
                                 shuffle=shuffle,
                                 **self._get_batch_args())
            train_data = LoopingIterable(data, steps)

        return train_data
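
A hedged sketch of how a trainer might consume this iterator. Here exp stands for an already-prepared experiment object exposing the method above, and the (max_tokens, max_sentences) reading of the tuple batch size is an assumption based on the Union[int, Tuple[int, int]] annotation:

    # exp: a prepared experiment object (assumed) exposing get_train_data()
    train_data = exp.get_train_data(batch_size=(4096, 512),
                                    steps=100_000,
                                    shuffle=True)
    for batch in train_data:
        ...  # one optimizer step per batch; iteration ends after `steps` batches
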
Example #3
File: exp.py Project: isi-nlp/rtg
    def get_combo_data(self,
                       batch_size: int,
                       steps: int = 0,
                       sort_desc=False,
                       batch_first=True,
                       shuffle=False):
        if not self.combo_file.exists():
            # the user may have added the combo file later
            self._pre_process_parallel('combo_src', 'combo_tgt',
                                       self.combo_file)
        combo_file = IO.maybe_tmpfs(self.combo_file)
        data = BatchIterable(combo_file,
                             batch_size=batch_size,
                             sort_desc=sort_desc,
                             field=self.tgt_vocab,
                             batch_first=batch_first,
                             shuffle=shuffle,
                             **self._get_batch_args())
        if steps > 0:
            data = LoopingIterable(data, steps)
        return data
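
In both readers, steps > 0 wraps the underlying iterable in LoopingIterable, which re-reads a finite dataset until a fixed number of batches has been yielded, so an epoch boundary never interrupts a long training run. A simplified sketch of that behaviour, not the project's actual implementation:

    class LoopingIterable:
        """Yield items from `iterable` repeatedly until `steps` items are produced."""

        def __init__(self, iterable, steps: int):
            self.iterable, self.steps = iterable, steps

        def __iter__(self):
            count = 0
            while count < self.steps:
                yielded = False
                for item in self.iterable:
                    yielded = True
                    yield item
                    count += 1
                    if count >= self.steps:
                        return
                if not yielded:  # guard against an empty underlying iterable
                    return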