def process(self, corpora, valid_corpora, output_path, bpe_symbols, max_vocab_size,
            working_dir='.', dump_dicts=False):
    class _ReaderWrapper:
        """Adapts a corpus to the context-manager interface expected by the BPE builder."""

        def __init__(self, _corpus, _langs):
            self.corpus = _corpus
            self.langs = _langs
            self._reader = None

        def __enter__(self):
            self._reader = self.corpus.reader(self.langs).__enter__()
            return self._reader

        def __exit__(self, exc_type, exc_val, exc_tb):
            # Delegate cleanup; do not return a truthy value here, or exceptions
            # raised inside the with-block would be silently suppressed.
            self._reader.__exit__(exc_type, exc_val, exc_tb)

    self._logger.info('Creating BPE vocabulary')
    vb_builder = SubwordTextProcessor.Builder(symbols=bpe_symbols,
                                              max_vocabulary_size=max_vocab_size)
    bpe_encoder = vb_builder.build([_ReaderWrapper(c, [self._source_lang, self._target_lang])
                                    for c in corpora])
    bpe_encoder.save_to_file(self._bpe_model)

    self._logger.info('Creating vocabularies')
    src_vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                           onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=False)
    trg_vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                           onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=False)

    for word in bpe_encoder.get_source_terms():
        src_vocab.add(word)
    for word in bpe_encoder.get_target_terms():
        trg_vocab.add(word)

    self._logger.info('Preparing training corpora')
    src_train, trg_train = self._prepare_corpora(corpora, bpe_encoder, src_vocab, trg_vocab)

    self._logger.info('Preparing validation corpora')
    src_valid, trg_valid = self._prepare_corpora(valid_corpora, bpe_encoder, src_vocab, trg_vocab)

    output_file = os.path.join(output_path, 'train_processed.train.pt')
    self._logger.info('Storing OpenNMT preprocessed training and validation data to "%s"' % output_file)
    torch.save({
        'dicts': {'src': src_vocab, 'tgt': trg_vocab},
        'train': {'src': src_train, 'tgt': trg_train},
        'valid': {'src': src_valid, 'tgt': trg_valid},
    }, output_file)

    if dump_dicts:
        src_dict_file = os.path.join(working_dir, 'train_processed.src.dict')
        trg_dict_file = os.path.join(working_dir, 'train_processed.trg.dict')

        self._logger.info('Storing OpenNMT preprocessed source dictionary "%s"' % src_dict_file)
        src_vocab.writeFile(src_dict_file)

        self._logger.info('Storing OpenNMT preprocessed target dictionary "%s"' % trg_dict_file)
        trg_vocab.writeFile(trg_dict_file)
def initVocabularyWithEmb(name, dataFile, vocabFile, embFile, vocabSize,
                          embFile2=None, phraseFile=None):
    vocab = None
    if name == "source":
        if embFile is None:
            raise ValueError("Please provide an embedding file for source")
        if vocabFile is not None:
            # If given, load existing word dictionary.
            print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
            vocab = onmt.Dict()
            vocab.loadFile(vocabFile)
            print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')
        if vocab is None:
            # If a dictionary is still missing, generate it.
            print('Building ' + name + ' vocabulary...')
            genWordVocab = makeVocabulary_src(dataFile, vocabSize, embFile is not None, embFile)
            vocab = genWordVocab
    elif name == "target":
        if embFile is None:
            raise ValueError("Please provide an embedding file for target")
        if phraseFile is not None:
            with open(phraseFile, 'r') as file:
                phrase_list = set(l.strip() for l in file)
        else:
            phrase_list = None
        if vocabFile is not None:
            # If given, load existing word dictionary.
            print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
            vocab = onmt.Dict()
            vocab.loadFile(vocabFile)
            print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')
        if vocab is None:
            # If a dictionary is still missing, generate it.
            print('Building ' + name + ' vocabulary...')
            genWordVocab = makeVocabulary(dataFile, vocabSize, embFile is not None,
                                          embFile, embFile2, phrase_list)
            vocab = genWordVocab

    print("sanity check: vocab is None -> " + str(vocab is None))
    return vocab
def process(self, corpora, valid_corpora, output_path, checkpoint=None):
    bpe_output_path = os.path.join(output_path, 'vocab.bpe')
    voc_output_path = os.path.join(output_path, 'vocab.pt')

    if checkpoint is not None:
        existing_bpe_path = checkpoint + '.bpe'
        existing_vcb_path = checkpoint + '.vcb'

        with _log_timed_action(self._logger, 'Loading BPE model from %s' % existing_bpe_path):
            shutil.copy(existing_bpe_path, bpe_output_path)
            bpe_encoder = SubwordTextProcessor.load_from_file(bpe_output_path)

        with _log_timed_action(self._logger, 'Loading vocabularies from %s' % existing_vcb_path):
            checkpoint_vcb = torch.load(existing_vcb_path,
                                        map_location=lambda storage, loc: storage)
            src_vocab = checkpoint_vcb['src']
            trg_vocab = checkpoint_vcb['tgt']
    else:
        with _log_timed_action(self._logger, 'Creating BPE model'):
            vb_builder = SubwordTextProcessor.Builder(
                symbols=self._bpe_symbols,
                max_vocabulary_size=self._max_vocab_size,
                vocab_pruning_threshold=self._vocab_pruning_threshold)
            bpe_encoder = vb_builder.build(
                [c.reader([self._source_lang, self._target_lang]) for c in corpora])
            bpe_encoder.save_to_file(bpe_output_path)

        with _log_timed_action(self._logger, 'Creating vocabularies'):
            src_vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                                   onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=False)
            trg_vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                                   onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=False)

            for word in bpe_encoder.get_source_terms():
                src_vocab.add(word)
            for word in bpe_encoder.get_target_terms():
                trg_vocab.add(word)

            torch.save({'src': src_vocab, 'tgt': trg_vocab}, voc_output_path)

    with _log_timed_action(self._logger, 'Preparing training corpora'):
        train_output_path = os.path.join(output_path, 'train_dataset')
        self._prepare_corpora(corpora, bpe_encoder, src_vocab, trg_vocab, train_output_path)

    with _log_timed_action(self._logger, 'Preparing validation corpora'):
        valid_output_path = os.path.join(output_path, 'valid_dataset')
        self._prepare_corpora(valid_corpora, bpe_encoder, src_vocab, trg_vocab, valid_output_path)
def main():
    dicts = {}
    dicts['src'] = initVocabulary('source', opt.train_txt, opt.src_vocab, opt.src_vocab_size)
    dicts['tgt'] = dicts['src']  # source and target share one vocabulary

    if opt.src_vocab is None:
        saveVocabulary('source', dicts['src'], opt.save_data + '.dict')

    print('Preparing training ...')
    train_data = makeData(opt.train_txt, dicts)

    print('Preparing validation ...')
    valid_data = makeData(opt.valid_txt, dicts)

    # One dataset file is written per context size found in the data.
    for cs in train_data.keys():
        print('Saving data to \'' + opt.save_data + '_cs' + str(cs) + '.train.pt\'...')
        save_data = {
            'dicts': dicts,
            'train': train_data[cs],
            'valid': valid_data[cs]
        }
        save_str = '_cs' + str(cs)
        torch.save(save_data, opt.save_data + save_str + '.train.pt')
def init_vocab(name, dataFiles, vocabFile, vocabSize, tokenizer, num_workers=1, join=False):
    vocab = None
    if vocabFile is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = onmt.Dict()
        vocab.loadFile(vocabFile)
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building ' + name + ' vocabulary...')
        gen_word_vocab = make_vocab(dataFiles, vocabSize, tokenizer, num_workers=num_workers)
        vocab = gen_word_vocab

    print()
    return vocab
def main():
    dicts = {}
    dicts['src'] = onmt.Dict()
    if opt.src_type == "text":
        dicts['src'] = initVocabulary('source', opt.train_src, opt.src_vocab,
                                      opt.src_vocab_size)
    dicts['tgt'] = initVocabulary('target', opt.train_tgt, opt.tgt_vocab,
                                  opt.tgt_vocab_size)

    print('Preparing training ...')
    train = {}
    train['src'], train['tgt'] = makeData(opt.train_src, opt.train_tgt,
                                          dicts['src'], dicts['tgt'])

    print('Preparing validation ...')
    valid = {}
    valid['src'], valid['tgt'] = makeData(opt.valid_src, opt.valid_tgt,
                                          dicts['src'], dicts['tgt'])

    if opt.src_vocab is None:
        saveVocabulary('source', dicts['src'], opt.save_data + '.src.dict')
    if opt.tgt_vocab is None:
        saveVocabulary('target', dicts['tgt'], opt.save_data + '.tgt.dict')

    print('Saving data to \'' + opt.save_data + '.train.pt\'...')
    save_data = {
        'dicts': dicts,
        'type': opt.src_type,
        'train': train,
        'valid': valid
    }
    torch.save(save_data, opt.save_data + '.train.pt')
def collect_attributes(atb_files):
    # The files can contain multiple space-separated attributes per line;
    # each attribute column gets its own onmt.Dict.
    print("* Reading attributes ...")
    atb_dicts = dict()

    for file_ in atb_files:
        if not file_:
            continue
        # Use a with-block so the file handle is always closed.
        with open(file_) as reader:
            for line in reader:
                # attributes are split by space
                atbs = line.strip().split()
                for i, atb in enumerate(atbs):
                    if i not in atb_dicts:
                        atb_dicts[i] = onmt.Dict()
                    atb_dicts[i].add(atb)

    return atb_dicts
def makeVocabulary(filenames, size):
    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                      lower=opt.lower)

    for filename in filenames:
        print("Reading file " + filename)
        with open(filename) as f:
            for sent in f.readlines():
                for word in sent.split():
                    vocab.add(word)

    # Record the size before pruning; the original referenced originalSize
    # without ever assigning it.
    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)'
          % (vocab.size(), originalSize))

    return vocab
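# Usage sketch for the builder above (hypothetical file names; assumes opt,
# onmt and makeVocabulary are already defined by the surrounding script).
# A single dictionary is built over both sides of the data and written out
# with onmt.Dict.writeFile, the counterpart of the loadFile calls used
# elsewhere in this file.
shared_vocab = makeVocabulary(['train.src.tok', 'train.tgt.tok'], 50000)
shared_vocab.writeFile('shared.dict')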
def make_join_vocab(filenames, size, input_type="word"):
    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                      lower=opt.lower)

    for filename in filenames:
        print("Reading file %s ... " % filename)
        with open(filename) as f:
            for sent in f.readlines():
                if input_type == "word":
                    for word in sent.split():
                        vocab.add(word)
                elif input_type == "char":
                    chars = split_line_by_char(sent)
                    for char in chars:
                        vocab.add(char)
                else:
                    raise NotImplementedError("Input type not implemented")

    original_size = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)'
          % (vocab.size(), original_size))

    return vocab
def make_vocab(filename, size, input_type='word'):
    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                      lower=opt.lower)
    unk_count = 0

    with open(filename) as f:
        for sent in f.readlines():
            if input_type == "word":
                for word in sent.split():
                    idx = vocab.add(word)
            elif input_type == "char":
                chars = split_line_by_char(sent)
                for char in chars:
                    idx = vocab.add(char)
            else:
                raise NotImplementedError("Input type not implemented")
            # Compare against the constant; the original compared against the
            # string literal 'onmt.Constants.UNK', which never matches.
            if idx == onmt.Constants.UNK:
                unk_count += 1

    original_size = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)'
          % (vocab.size(), original_size))

    return vocab
def init_vocab(name, dataFile, vocabFile, vocabSize, join=False, input_type='word'):
    vocab = None
    if vocabFile is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = onmt.Dict()
        vocab.loadFile(vocabFile)
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        if join:
            print('Building shared vocabulary...')
            gen_word_vocab = make_join_vocab(dataFile, vocabSize, input_type=input_type)
        else:
            print('Building ' + name + ' vocabulary...')
            gen_word_vocab = make_vocab(dataFile, vocabSize, input_type=input_type)
        vocab = gen_word_vocab

    print()
    return vocab
def makeVocabulary(filename, size, input_type='word'):
    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                      lower=opt.lower)

    with open(filename) as f:
        for sent in f.readlines():
            if input_type == "word":
                for word in sent.split():
                    vocab.add(word)
            elif input_type == "char":
                sent = sent.strip()
                for char in sent:
                    vocab.add(char)
            else:
                raise NotImplementedError("Input type not implemented")

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)'
          % (vocab.size(), originalSize))

    return vocab
def initVocabulary(name, dataFile, vocabFile, vocabSize):
    vocab = None
    if vocabFile is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = onmt.Dict()
        vocab.loadFile(vocabFile)
        # vocab.size() is an int, so convert before concatenating.
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building ' + name + ' vocabulary...')
        if opt.prune_by_freq:
            if name == 'source':
                genWordVocab = makeVocabulary(dataFile, vocabSize, opt.src_min_freq)
            elif name == 'target':
                genWordVocab = makeVocabulary(dataFile, vocabSize, opt.tgt_min_freq)
        else:
            genWordVocab = makeVocabulary(dataFile, vocabSize)
        vocab = genWordVocab

    print()
    return vocab
def initVocabulary(name, srcFile, vocabFile, vocabSize, dacts_vocab=0):
    vocab = None
    if vocabFile is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = onmt.Dict()
        vocab.loadFile(vocabFile)
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building ' + name + ' vocabulary...')
        if not dacts_vocab:
            genWordVocab = makeVocabulary(srcFile, vocabSize)
        else:
            # Passing 0 requests an UNK-only dictionary (see makeVocabulary's
            # vocab parameter).
            genWordVocab = makeVocabulary(srcFile, vocabSize, 0)
        vocab = genWordVocab

        originalSize = vocab.size()
        vocab = vocab.prune(vocabSize)
        print('Created dictionary of size %d (pruned from %d)'
              % (vocab.size(), originalSize))

    print()
    return vocab
def main():
    dicts = {}
    dicts['src'] = initVocabulary('source', opt.train_src, opt.src_vocab, opt.src_vocab_size)
    dicts['tgt'] = dicts['src']  # shared source/target vocabulary

    print('Preparing training ...')
    train = {}
    train['src'], train['tgt'] = makeData(opt.train_src, dicts['src'])

    print('Preparing validation ...')
    valid = {}
    valid['src'], valid['tgt'] = makeData(opt.valid_src, dicts['src'])

    if opt.src_vocab is None:
        saveVocabulary('source', dicts['src'], opt.save_data + '.src.dict')

    # Use one path for both the message and the actual save; the original
    # announced a '_cs<N>' file but saved to a different name.
    save_path = opt.save_data + '_cs' + str(opt.context_size) + '.train.pt'
    print('Saving data to \'' + save_path + '\'...')
    save_data = {'dicts': dicts, 'train': train, 'valid': valid}
    torch.save(save_data, save_path)
def init_vocab(name, data_files, vocab_file, vocab_size, tokenizer, num_workers=1):
    vocab = None
    if vocab_file is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocab_file + '\'...')
        if not opt.load_bpe_voc:
            vocab = onmt.Dict()
        else:
            # BPE vocabularies come with their own special tokens.
            if name == "target":
                vocab = onmt.Dict([opt.tgt_pad_token, opt.tgt_unk_token,
                                   opt.tgt_bos_token, opt.tgt_eos_token],
                                  lower=opt.lower)
            elif name == "source":
                vocab = onmt.Dict([opt.src_pad_token, opt.src_unk_token,
                                   opt.src_bos_token, opt.src_eos_token],
                                  lower=opt.lower)
            else:
                print("Warning: name should be source or target")
                exit(-1)
        vocab.loadFile(vocab_file)
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        print('Building ' + name + ' vocabulary...')
        gen_word_vocab = make_vocab(name, data_files, vocab_size, tokenizer,
                                    num_workers=num_workers)
        vocab = gen_word_vocab

    print()
    return vocab
def makeVocabulary(filename, size, vocab=None):
    if vocab is None:
        vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                           onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                          lower=opt.lower)
    elif vocab == 0:
        # Passing 0 requests a dictionary with UNK as its only special token.
        vocab = onmt.Dict([onmt.Constants.UNK_WORD], lower=opt.lower)

    with open(filename) as f:
        for sent in f.readlines():
            words = sent.split()
            # Skip empty lines (the original indexed words[0] unguarded)
            # and '===' separator lines.
            if words and words[0] != '===':
                for word in words:
                    vocab.add(word)

    return vocab
def makeFeature(data):
    vocabs = {}
    for key, value in data.items():
        in_d = {}
        in_d[onmt.Constants.UNK] = onmt.Constants.UNK_WORD
        in_d[onmt.Constants.PAD] = onmt.Constants.PAD_WORD
        # Order the special tokens by their constant indices.
        vocab = onmt.Dict([v for _, v in sorted(in_d.items())], lower=opt.lower)
        # Don't shadow the outer 'data' while filling the vocabulary.
        for item in value:
            vocab.add(item)
        vocabs[key] = vocab
    return vocabs
def makeVocabulary(filename, size):
    "Construct the word and feature vocabs."
    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                      lower=opt.lower)
    featuresVocabs = []

    with codecs.open(filename, "r", "utf-8") as f:
        cnt = 0
        for sent in f.readlines():
            cnt += 1
            # Tokens may carry factored annotations (e.g. 'word￨feat1￨feat2'),
            # which onmt.IO.extractFeatures splits into words and feature columns.
            words, features, numFeatures = onmt.IO.extractFeatures(sent.split())

            if len(featuresVocabs) == 0 and numFeatures > 0:
                for j in range(numFeatures):
                    featuresVocabs.append(
                        onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                                   onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD]))
            else:
                print('cnt : ', cnt)
                assert len(featuresVocabs) == numFeatures, \
                    "all sentences must have the same number of features"

            for i in range(len(words)):
                vocab.add(words[i])
                for j in range(numFeatures):
                    featuresVocabs[j].add(features[j][i])

    originalSize = vocab.size()
    if size != 0:
        vocab = vocab.prune(size)
        print('Created dictionary of size %d (pruned from %d)'
              % (vocab.size(), originalSize))
    else:
        print('Created dictionary of size %d' % (vocab.size()))

    return vocab, featuresVocabs
def initVocabulary(dataFile, src_vocab_file, src_vocabSize, tgt_vocab_file, tgt_vocabSize):
    src_vocab = None
    tgt_vocab = None

    if src_vocab_file is not None:
        # If given, load existing word dictionary.
        print('Reading vocabulary from \'' + src_vocab_file + '\'...')
        src_vocab = onmt.Dict()
        src_vocab.loadFile(src_vocab_file)
        print('Loaded ' + str(src_vocab.size()) + ' source words')

    if tgt_vocab_file is not None:
        # If given, load existing word dictionary.
        print('Reading vocabulary from \'' + tgt_vocab_file + '\'...')
        tgt_vocab = onmt.Dict()
        tgt_vocab.loadFile(tgt_vocab_file)
        print('Loaded ' + str(tgt_vocab.size()) + ' target words')

    if src_vocab and tgt_vocab:
        # Both dictionaries were loaded; no need to scan the data file.
        return src_vocab, tgt_vocab

    # Each line holds a tab-separated source/target pair. Open as UTF-8
    # instead of calling str.decode, which no longer exists in Python 3.
    with codecs.open(dataFile, 'r', 'utf-8') as dataFileHandle:
        lines_in_file = [l.strip().split('\t') for l in dataFileHandle.readlines()]

    line_num = 1
    for l in lines_in_file:
        if len(l) != 2:
            print("Error on line %d" % line_num)
        line_num += 1

    if src_vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building source vocabulary...')
        src_lines = [l[0] for l in lines_in_file]
        src_vocab = makeVocabulary(src_lines, src_vocabSize)

    if tgt_vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building target vocabulary...')
        tgt_lines = [l[1] for l in lines_in_file]
        tgt_vocab = makeVocabulary(tgt_lines, tgt_vocabSize)

    return src_vocab, tgt_vocab
def main():
    conn = redis.Redis(host=opt.host, port=opt.port, db=opt.db)

    if not opt.dbExist:
        dicts = {}
        # if opt.src_type == "text":  (original condition, currently always taken)
        dicts['src'], dicts['src_features'] = \
            initVocabulary('source', opt.train_src, opt.src_vocab, opt.src_vocab_size)
        dicts['tgt'], dicts['tgt_features'] = \
            initVocabulary('target', opt.train_tgt, opt.tgt_vocab, opt.tgt_vocab_size)
        print('Preparing training ...')
    else:
        save_data = torch.load(opt.save_data + ".dicts.pt")
        dicts = save_data['dicts']

    train = {}
    conn.set('bucket_size', opt.bucket)
    train['src'], train['tgt'], \
        train['src_features'], train['tgt_features'], \
        train['alignments'] \
        = makeData(conn, "train", opt.train_src, opt.train_tgt,
                   dicts['src'], dicts['tgt'],
                   dicts['src_features'], dicts['tgt_features'],
                   bucket_size=opt.bucket, dbExist=opt.dbExist)

    print('Preparing validation ...')
    valid = {}
    valid['src'], valid['tgt'], \
        valid['src_features'], valid['tgt_features'], \
        valid['alignments'] \
        = makeData(conn, "valid", opt.valid_src, opt.valid_tgt,
                   dicts['src'], dicts['tgt'],
                   dicts['src_features'], dicts['tgt_features'],
                   bucket_size=opt.bucket, dbExist=opt.dbExist)

    if not opt.dbExist:
        if opt.src_vocab is None:
            saveVocabulary('source', dicts['src'], opt.save_data + '.src.dict')
        if opt.tgt_vocab is None:
            saveVocabulary('target', dicts['tgt'], opt.save_data + '.tgt.dict')
        if opt.features_vocabs_prefix:
            saveFeaturesVocabularies('source', dicts['src_features'], opt.save_data)
            saveFeaturesVocabularies('target', dicts['tgt_features'], opt.save_data)

        print('Saving data to \'' + opt.save_data + '.dicts.pt\'...')
        save_data = {'dicts': dicts, 'type': "text"}
        torch.save(save_data, opt.save_data + '.dicts.pt')
def makeVocabulary(filename, size, embGiven=False, embFile=None):
    special_embeddings = None
    if embGiven:
        # PAD/UNK/BOS get zero vectors, EOS a ones vector.
        special_embeddings = [np.zeros(opt.emb_dim, ), np.zeros(opt.emb_dim, ),
                              np.zeros(opt.emb_dim, ), np.ones(opt.emb_dim, )]

    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                      lower=opt.lower, special_embeddings=special_embeddings)

    with codecs.open(filename, "r", "utf-8") as f:
        for sent in f.readlines():
            for word in sent.split():
                vocab.add(word)

    if embGiven:
        n = 0
        with codecs.open(embFile, "r", "utf-8") as f:
            for l in f:
                items = l.strip().split()
                # Skip malformed rows: a valid row is a word followed by at
                # least 300 values (300-dimensional embeddings assumed here).
                if len(items) < 301:
                    continue
                try:
                    v = np.array(items[1:], dtype=np.float32)
                except Exception:
                    print(items)
                    continue
                vocab.add_embedding(items[0], v, onmt.Constants.UNK_WORD, opt.normalize)
                n += 1

    originalSize = vocab.size()
    vocab, c = vocab.prune(size, embGiven)
    if embGiven:
        # Fold the embeddings of the (n - c) pruned words into UNK.
        vocab.average_unk(onmt.Constants.UNK_WORD, n - c, opt.normalize)
        vocab.convert_embeddings_to_torch(dim=opt.emb_dim)

    print('Created dictionary of size %d (pruned from %d)'
          % (vocab.size(), originalSize))
    return vocab
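# A minimal sketch of what vocab.average_unk above presumably does (the real
# method lives in this project's onmt.Dict, so the name and details here are
# assumptions, not the actual implementation): average the vectors of words
# that had pretrained embeddings but were pruned away, and use that mean as
# the UNK embedding so rare words still map to a sensible region of the space.
import numpy as np

def average_pruned_into_unk(unk_vector, pruned_vectors, normalize=False):
    # unk_vector: current embedding of the UNK token
    # pruned_vectors: list of vectors for words dropped by prune()
    if not pruned_vectors:
        return unk_vector
    mean = np.mean(np.stack(pruned_vectors), axis=0)
    if normalize:
        mean = mean / (np.linalg.norm(mean) + 1e-8)
    return mean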
def makeVocabulary(filename, size):
    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD])

    with open(filename) as f:
        for sent in f.readlines():
            for word in sent.split():
                vocab.add(word)

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)'
          % (vocab.size(), originalSize))

    return vocab
def makeVocabulary(data, size, freq):
    in_d = {}
    in_d[onmt.Constants.UNK] = onmt.Constants.UNK_WORD
    in_d[onmt.Constants.PAD] = onmt.Constants.PAD_WORD
    vocab = onmt.Dict([value for key, value in sorted(in_d.items())], lower=opt.lower)

    # 'data' maps each token to its pre-computed frequency.
    for key, value in data.items():
        vocab.add(key, value)

    originalSize = vocab.size()
    vocab = vocab.prune(size, freq)
    print('Created dictionary of size %d (pruned from %d)'
          % (vocab.size(), originalSize))

    return vocab
def initVocabulary(name, dataFile, vocabFile, vocabSize):
    vocab = None
    if vocabFile is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = onmt.Dict()
        vocab.loadFile(vocabFile)
        # vocab.size() is an int, so convert before concatenating.
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building ' + name + ' vocabulary...')
        genWordVocab = makeVocabulary(dataFile, vocabSize)
        vocab = genWordVocab

    print()
    return vocab
def makeVocabulary(filename, size):
    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                      lower=opt.lower, seq_len=opt.seq_length)

    def addWordsFromTsv(path):
        # Columns 1 and 2 of each TSV row hold the two sentences of a pair.
        lines = 0
        with codecs.open(path, "r", "utf-8") as f:
            tsv_reader = csv.reader(f, delimiter='\t')
            for line in tsv_reader:
                for sent in (line[1], line[2]):
                    for word in sent.split():
                        vocab.add(word)
                lines += 1
        return lines

    count = addWordsFromTsv(filename)
    addWordsFromTsv(opt.valid_src)
    # Derive the test-set path from the validation path (assumes the name
    # before '.tsv' ends in a 3-character tag such as 'dev').
    fname = opt.valid_src.split('.tsv')[0][:-3] + 'test.tsv'
    addWordsFromTsv(fname)

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)'
          % (vocab.size(), originalSize))

    return vocab
def makeVocabulary(filename, size, min_freq):
    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                      lower=opt.lower)

    with codecs.open(filename, 'r', encoding='utf-8') as f:
        for sent in f.readlines():
            for word in sent.split():
                vocab.add(word)

    originalSize = vocab.size()
    # First drop words below the frequency threshold, then cap the size.
    vocab = vocab.prune_by_freq(min_freq)
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)'
          % (vocab.size(), originalSize))

    return vocab
def main():
    dicts = {}
    dicts['src'] = initVocabulary('source', opt.train_src, opt.src_vocab,
                                  opt.src_vocab_size, opt.src_min_freq)
    dicts['tgt'] = initVocabulary('target', opt.train_tgt, opt.tgt_vocab,
                                  opt.tgt_vocab_size, opt.tgt_min_freq)

    print('Preparing training ...')
    train = {}
    train['src'], train['tgt'] = makeData(opt.train_src, opt.train_tgt,
                                          dicts['src'], dicts['tgt'])

    print('Preparing test ...')
    test = {}
    test['src'], test['tgt'] = makeData(opt.test_src, opt.test_tgt,
                                        dicts['src'], dicts['tgt'], True)

    print('Preparing validation ...')
    if '' not in (opt.valid_src, opt.valid_tgt):
        valid = {}
        valid['src'], valid['tgt'] = makeData(opt.valid_src, opt.valid_tgt,
                                              dicts['src'], dicts['tgt'])
    else:
        print('Empty validation')
        valid = test

    if opt.src_vocab is None:
        saveVocabulary('source', dicts['src'], opt.save_data + '.src.dict')
    if opt.tgt_vocab is None:
        saveVocabulary('target', dicts['tgt'], opt.save_data + '.tgt.dict')

    print('Saving data to \'' + opt.save_data + '\'...')
    save_data = {
        'dicts': dicts,
        'type': opt.src_type,
        'train': train,
        'valid': valid,
        'test': test
    }
    torch.save(save_data, opt.save_data)
def main():
    # Exactly one of the two context sources may be active.
    assert opt.keys != opt.acts

    dicts = {}
    dicts['src'] = initVocabulary('source', opt.train_txt, opt.train_key,
                                  opt.src_vocab, opt.src_vocab_size)
    dicts['tgt'] = dicts['src']

    if opt.src_vocab is None:
        saveVocabulary('source', dicts['src'], opt.save_data + '.dict')

    dicts['das'] = make_act_dict('dialog act', opt.train_act, opt.da_dict)
    if opt.da_dict is None:
        saveVocabulary('dialog act', dicts['das'], opt.save_data + '.da_dict')

    if opt.acts:
        train_context_file = opt.train_act
        valid_context_file = opt.valid_act
    elif opt.keys:
        train_context_file = opt.train_key
        valid_context_file = opt.valid_key

    print('Preparing training ...')
    train_data = makeData(opt.train_txt, train_context_file, dicts)

    print('Preparing validation ...')
    valid_data = makeData(opt.valid_txt, valid_context_file, dicts)

    for cs in train_data.keys():
        print('Saving data to \'' + opt.save_data + '_cs' + str(cs) + '.train.pt\'...')
        save_data = {
            'dicts': dicts,
            'train': train_data[cs],
            'valid': valid_data[cs]
        }
        save_str = '_cs' + str(cs)
        torch.save(save_data, opt.save_data + save_str + '.train.pt')
def makeVocabulary(lines, size):
    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                      lower=opt.lower)

    sent_num = 1
    for sent in lines:
        # The original wrote 'sent is not list', an identity test against the
        # type object that is always true; use isinstance instead.
        if not isinstance(sent, list):
            sent = list(sent)
        for word in sent:
            vocab.add(word)
        sent_num += 1

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)'
          % (vocab.size(), originalSize))

    return vocab
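# A minimal, self-contained sketch of the onmt.Dict interface the snippets
# above rely on (add / size / prune / loadFile / writeFile). This is an
# illustration of the pattern under assumptions, not OpenNMT's actual
# implementation; the real class also handles special tokens, frequencies
# and embeddings in more detail.
class MiniDict(object):
    def __init__(self, specials=None, lower=False):
        self.lower = lower
        self.labelToIdx = {}
        self.idxToLabel = {}
        self.frequencies = {}
        self.specials = list(specials or [])
        for label in self.specials:
            self.add(label)

    def size(self):
        return len(self.idxToLabel)

    def add(self, label):
        # Register the label if new, bump its frequency, return its index.
        if self.lower:
            label = label.lower()
        if label not in self.labelToIdx:
            idx = len(self.idxToLabel)
            self.labelToIdx[label] = idx
            self.idxToLabel[idx] = label
            self.frequencies[idx] = 0
        idx = self.labelToIdx[label]
        self.frequencies[idx] += 1
        return idx

    def prune(self, size):
        # Keep the special tokens plus the most frequent entries.
        if self.size() <= size:
            return self
        by_freq = sorted(self.frequencies, key=self.frequencies.get, reverse=True)
        pruned = MiniDict(self.specials, lower=self.lower)
        for idx in by_freq:
            if pruned.size() >= size:
                break
            pruned.add(self.idxToLabel[idx])
        return pruned

    def loadFile(self, path):
        # One 'label index' pair per line, mirroring writeFile below.
        with open(path, encoding='utf-8') as f:
            for line in f:
                label, idx = line.rstrip('\n').rsplit(' ', 1)
                idx = int(idx)
                self.labelToIdx[label] = idx
                self.idxToLabel[idx] = label
                self.frequencies.setdefault(idx, 1)

    def writeFile(self, path):
        with open(path, 'w', encoding='utf-8') as f:
            for idx in range(self.size()):
                f.write('%s %d\n' % (self.idxToLabel[idx], idx))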