def encode_file(self, path, ordered=False, verbose=False,
                add_eos=False, add_double_eos=False):
    """Tokenize a text file line by line and convert each line to an array.

    Lines that are empty after stripping whitespace are skipped. Every other
    line is passed to ``self.tokenize`` and the resulting symbols are handed
    to ``self.convert_to_nparray``.

    Args:
        path: Path of the text file to read; it must exist.
        ordered: If true, the per-line results are joined into one array
            with ``np.concatenate``; otherwise they are returned as a list.
        verbose: If true, print a start message and a progress message
            every 500000 lines.
        add_eos: Forwarded to ``self.tokenize``.
        add_double_eos: Forwarded to ``self.tokenize``.

    Returns:
        A list with one ``self.convert_to_nparray`` result per non-blank
        line, or the ``np.concatenate`` of that list when ``ordered`` is
        true.

    Raises:
        AssertionError: If ``path`` does not exist.
    """
    if verbose:
        print('encoding file {} ...'.format(path))
    assert exists(path)

    per_line = []
    with open(path, 'r') as handle:
        for line_no, raw in enumerate(handle):
            if verbose and line_no > 0 and line_no % 500000 == 0:
                print(' line {}'.format(line_no))
            if not raw.strip():
                # Whitespace-only lines contribute nothing to the output.
                continue
            tokens = self.tokenize(
                raw, add_eos=add_eos, add_double_eos=add_double_eos)
            per_line.append(self.convert_to_nparray(tokens))

    return np.concatenate(per_line) if ordered else per_line
def save_vocab(self, path, vocab_name):
    """Write the vocabulary symbols to a text file, one symbol per line.

    The file is named ``vocabulary_<vocab_name>.txt`` and is created inside
    ``path``. Symbols are written in the order they appear in
    ``self.idx2sym``. A confirmation message with the absolute directory
    path is printed afterwards.

    Args:
        path: Directory in which the vocabulary file is written.
        vocab_name: Name inserted into the file name and the message.
    """
    target = os.path.join(path, "vocabulary_%s.txt" % vocab_name)
    with open(target, "w") as out_file:
        out_file.writelines(symbol + "\n" for symbol in self.idx2sym)

    saved_under = os.path.abspath(path)
    print(" Vocabulary of {} is saved under path: {}".format(
        vocab_name, saved_under))
def _build_from_file(self, vocab_file):
    """Rebuild the vocabulary from a file with one symbol per line.

    ``self.idx2sym`` and ``self.sym2idx`` are reset, then the first
    whitespace-separated field of every line is passed to
    ``self.add_symbol``; anything after that field on the line is ignored.
    Blank and whitespace-only lines are skipped. Finally ``self.unk_idx``
    is set from the ``'<UNK>'`` entry of ``self.sym2idx``.

    Args:
        vocab_file: Path of the vocabulary file to read.

    Raises:
        KeyError: If ``'<UNK>'`` is not a key of ``self.sym2idx`` once the
            file has been read.
    """
    self.idx2sym = []
    self.sym2idx = OrderedDict()

    with open(vocab_file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line (e.g. a trailing empty line) carries no
                # symbol; indexing its first field would raise IndexError.
                continue
            self.add_symbol(fields[0])

    self.unk_idx = self.sym2idx['<UNK>']
def count_file(self, path, verbose=False, add_eos=False):
    """Add the symbols of every line in a text file to ``self.counter``.

    Each line, blank ones included, is passed to ``self.tokenize`` and the
    returned symbols are fed to ``self.counter.update``.

    Args:
        path: Path of the text file to read; it must exist.
        verbose: If true, print a start message and a progress message
            every 500000 lines.
        add_eos: Forwarded to ``self.tokenize``.

    Raises:
        AssertionError: If ``path`` does not exist.
    """
    if verbose:
        print('counting file {} ...'.format(path))
    assert exists(path)

    with open(path, 'r') as handle:
        for line_no, raw in enumerate(handle):
            at_checkpoint = line_no > 0 and line_no % 500000 == 0
            if verbose and at_checkpoint:
                print(' line {}'.format(line_no))
            self.counter.update(self.tokenize(raw, add_eos=add_eos))
def _build_from_file(self, vocab_file):
    """Rebuild the vocabulary from a file holding one symbol per line.

    ``self.idx2sym`` and ``self.sym2idx`` are reset, then every line, with
    its newline character removed, is passed unchanged to
    ``self.add_symbol``. Inner and surrounding spaces are kept, and an empty
    line yields the empty string as a symbol. Finally ``self.unk_idx`` is
    set from the ``'<UNK>'`` entry of ``self.sym2idx``.

    Args:
        vocab_file: Path of the vocabulary file to read.

    Raises:
        KeyError: If ``'<UNK>'`` is not a key of ``self.sym2idx`` once the
            file has been read.
    """
    self.idx2sym = []
    self.sym2idx = OrderedDict()

    with open(vocab_file, 'r') as handle:
        for raw in handle:
            # Only the newline is dropped; the rest of the line is the symbol.
            self.add_symbol(raw.replace("\n", ""))

    self.unk_idx = self.sym2idx['<UNK>']