コード例 #1
0
ファイル: batch.py プロジェクト: pegasus-lynx/nlcodec
    def read_all(self):
        """
        Iterate over ``self.data`` in order and yield ``Batch`` objects.

        Records are packed greedily: a record joins the current batch only while
        ``(records in batch) * (longest record in batch)`` stays within
        ``self.max_toks`` and the batch holds fewer than ``self.max_sents`` records.
        Otherwise the current batch is yielded and the record starts a new one.
        Records whose source (``x``) or target (``y``) is empty are skipped with a
        warning.

        :return: generator of ``Batch``
        :raises Exception: if a non-empty record is longer than ``self.max_toks``
        """
        pending = []
        longest = 0
        for ex in self.data:
            if len(ex.x) == 0 or len(ex.y) == 0:
                log.warning("Skipping a record,  either source or target is empty")
                continue

            cur_len = max(len(ex.x), len(ex.y))
            # longest length the batch would have if this record were added
            grown = max(longest, cur_len)
            within_toks = (len(pending) + 1) * grown <= self.max_toks
            if within_toks and len(pending) < self.max_sents:
                # this one can go in
                pending.append(ex)
                longest = grown
                continue

            # the record does not fit; it can never fit if it alone exceeds the budget
            if cur_len > self.max_toks:
                raise Exception(f'Unable to make a batch of {self.max_toks} toks'
                                f' with a seq of x_len:{len(ex.x)} y_len:{len(ex.y)}')
            # flush what has been collected so far, then start over with this record
            yield Batch(pending, sort_dec=self.sort_desc, batch_first=self.batch_first,
                        meta=self.batch_meta)
            pending, longest = [ex], cur_len

        if pending:
            log.debug(f"\nLast batch, size={len(pending)}")
            yield Batch(pending, sort_dec=self.sort_desc, batch_first=self.batch_first,
                        meta=self.batch_meta)
コード例 #2
0
ファイル: core.py プロジェクト: pegasus-lynx/nlcodec
 def load(cls, path, rec_type=None) -> 'Db':
     """
     Load a pickled object from ``path`` and check that it is an instance of ``cls``.

     :param path: path of the pickle file to read
     :param rec_type: optional; when truthy, it is stored on the loaded object
         as ``_rec_type``
     :return: the unpickled object
     :raises AssertionError: if the unpickled object is not an instance of ``cls``
     """
     log.debug(f"Loading from {path}")
     # SECURITY: pickle.load can execute arbitrary code embedded in the file;
     # only load files that come from a trusted source.
     with open(path, 'rb') as f:
         obj = pickle.load(f)
     # Raise explicitly instead of using `assert`, so the check is not stripped
     # under `python -O`. AssertionError is kept so callers see the same exception type.
     if not isinstance(obj, cls):
         raise AssertionError(
             f"Expected an instance of {cls.__name__} in {path},"
             f" but found {type(obj).__name__}")
     if rec_type:
         obj._rec_type = rec_type
     return obj
コード例 #3
0
ファイル: codec.py プロジェクト: marcelomata/nlcodec
    def __init__(self, table: List[Type], validate=True, invertible=True):
        """
        Build the index<->string lookup structures for a vocabulary table.

        :param table: list of `Type`s
        :param validate: when true, run ``Reseved.validate`` on the table; otherwise
            only check that the UNK entry sits at ``Reseved.UNK_IDX``
        :param invertible: when true, assert that every name in the table is unique so
            that idx->str and str->idx are invertible; when false, duplicate names are
            allowed and the first occurrence wins
        """
        if not validate:
            # at least UNK should be available
            assert table[Reseved.UNK_IDX].name == Reseved.UNK_TOK[0]
        else:
            Reseved.validate(table)
        # TODO: reverse lookup UNK IDX based on UNK_TOK name
        self.unk_idx = Reseved.UNK_IDX

        self.vocab_size = len(table)
        self.table = table
        names = [entry.name for entry in table]
        self.idx_to_str = names
        if invertible:
            # on duplicate names the later index overwrites the earlier one,
            # which the length check below detects
            self.str_to_idx = dict(zip(names, range(len(names))))
            assert len(self.idx_to_str) == len(self.str_to_idx)
        else:
            # keep the first occurrence TODO: maybe keep both and do random; str_to_idx be multiset
            lookup = self.str_to_idx = {}
            for idx, typ in enumerate(table):
                first_idx = lookup.setdefault(typ.name, idx)
                if first_idx != idx:
                    # name was already registered by an earlier entry
                    typ2 = table[first_idx]
                    log.debug(
                        f"skip:: {typ.signature()}; it conflicts with {typ2.signature()}"
                    )
        self.invertible = invertible
コード例 #4
0
ファイル: core.py プロジェクト: pegasus-lynx/nlcodec
 def save(self, path):
     """
     Pickle this object to ``path``.

     The file is opened in binary write mode, so any existing file at ``path``
     is truncated.

     :param path: destination path of the pickle file
     """
     log.debug(f"Saving to {path}")
     with open(path, mode='wb') as sink:
         pickle.dump(self, sink)