def read_all(self):
    """Group the examples of ``self.data`` into batches and yield them lazily.

    An example joins the batch under construction only while both limits
    hold: the number of examples times the longest sequence length stays
    within ``self.max_toks``, and the batch holds fewer than
    ``self.max_sents`` examples. Otherwise the batch built so far is
    emitted and the example starts a new one.

    Examples whose ``x`` or ``y`` is empty are skipped with a warning.

    :return: generator of ``Batch`` objects
    :raises Exception: if a single example is longer than ``self.max_toks``
    """
    pending = []  # examples collected for the batch under construction
    widest = 0  # longest sequence length among `pending`
    for ex in self.data:
        if len(ex.x) == 0 or len(ex.y) == 0:
            log.warning("Skipping a record, either source or target is empty")
            continue

        this_len = max(len(ex.x), len(ex.y))
        grown_toks = (len(pending) + 1) * max(widest, this_len)
        if grown_toks <= self.max_toks and len(pending) < self.max_sents:
            # this one can go in
            pending.append(ex)
            widest = max(widest, this_len)
            continue

        # The example does not fit; it can never fit if it alone is too long.
        if this_len > self.max_toks:
            raise Exception(f'Unable to make a batch of {self.max_toks} toks'
                            f' with a seq of x_len:{len(ex.x)} y_len:{len(ex.y)}')

        # Emit what has been collected, then start over with this example.
        yield Batch(pending, sort_dec=self.sort_desc,
                    batch_first=self.batch_first, meta=self.batch_meta)
        pending = [ex]
        widest = this_len

    if pending:
        # Leftover examples that never triggered an emit inside the loop.
        log.debug(f"\nLast batch, size={len(pending)}")
        yield Batch(pending, sort_dec=self.sort_desc,
                    batch_first=self.batch_first, meta=self.batch_meta)
def load(cls, path, rec_type=None) -> 'Db':
    """Restore a pickled instance of ``cls`` from ``path``.

    SECURITY: unpickling can execute arbitrary code embedded in the file,
    so ``path`` must point to a trusted file.

    :param path: location of the pickle file to read
    :param rec_type: when truthy, stored on the loaded object as ``_rec_type``
    :return: the unpickled object
    :raises AssertionError: if the unpickled object is not an instance of ``cls``
    """
    log.debug(f"Loading from {path}")
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    # Raised explicitly rather than via `assert`, so the check still runs
    # under `python -O`; the exception type is unchanged for callers.
    if not isinstance(obj, cls):
        raise AssertionError(
            f"Expected a pickled {cls.__name__} in {path}, "
            f"found {type(obj).__name__}")
    if rec_type:
        obj._rec_type = rec_type
    return obj
def __init__(self, table: List[Type], validate=True, invertible=True):
    """
    Build the index->name and name->index lookups for a table of types.

    :param table: list of `Type`s; the position in the list is the index
    :param validate: run ``Reseved.validate`` on the table; when False only
      the UNK entry is checked
    :param invertible: require every name to be unique, so that idx->str and
      str->idx are inverses; when False the first occurrence of a repeated
      name keeps the mapping
    """
    if validate:
        Reseved.validate(table)
    else:
        # at least UNK should be available
        assert table[Reseved.UNK_IDX].name == Reseved.UNK_TOK[0]
    # TODO: reverse lookup UNK IDX based on UNK_TOK name
    self.unk_idx = Reseved.UNK_IDX

    self.vocab_size = len(table)
    self.table = table
    self.idx_to_str = [t.name for t in table]

    if invertible:
        # A repeated name collapses into one key, so the sizes would differ.
        self.str_to_idx = dict(zip(self.idx_to_str, range(self.vocab_size)))
        assert len(self.idx_to_str) == len(self.str_to_idx)
    else:
        # keep the first occurrence TODO: maybe keep both and do random; str_to_idx be multiset
        self.str_to_idx = first_seen = {}
        for idx, typ in enumerate(table):
            kept_idx = first_seen.setdefault(typ.name, idx)
            if kept_idx != idx:
                # The name was registered earlier; this entry is not indexed.
                earlier = table[kept_idx]
                log.debug(
                    f"skip:: {typ.signature()}; it conflicts with {earlier.signature()}"
                )
    self.invertible = invertible
def save(self, path):
    """Serialize this object with pickle and write it to ``path``.

    The file is opened in binary write mode, so an existing file at
    ``path`` is overwritten.

    :param path: destination of the pickle file
    """
    log.debug(f"Saving to {path}")
    with open(path, mode='wb') as sink:
        pickle.dump(self, sink)