def learn_bpe(self, item_list, from_filenames=True):
    """Learn BPE codes from files or in-memory text and write them to self.codes_file.

    item_list - filename(s) when from_filenames is True, otherwise raw text items
    from_filenames - treat item_list as path(s) rather than text
    """
    logging.info('generating bpe codes file. saving to %s' % self.codes_file)

    # Build one combined vocabulary, either from file contents or from
    # the in-memory items themselves.
    full_vocab = OrderedCounter()
    if from_filenames:
        filenames = [item_list] if isinstance(item_list, str) else item_list
        for fname in filenames:
            with codecs.open(fname, encoding='UTF-8') as f:
                full_vocab += learn_bpe.get_vocabulary(f)
    else:
        full_vocab += learn_bpe.get_vocabulary(item_list)

    vocab_list = ['{0} {1}'.format(word, count)
                  for word, count in full_vocab.items()]

    # Learn the BPE merge operations on the combined vocabulary and
    # persist them to the codes file, then load them for use.
    with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
        learn_bpe.main(vocab_list, output, self.num_symbols,
                       self.min_frequency, False, is_dict=True)
    self.set_bpe(self.codes_file)
def learn_joint_codes(source_corp, dest_corp, joint_codes, num_sequences, verbose=False):
    """Learn a single joint BPE code file over two concatenated corpora.

    source_corp - file path to source corpus
    dest_corp - file path to destination corpus
    joint_codes - file path to write joint codes
    num_sequences - maximum number of sequences to learn
    verbose - print all messages
    """
    temp_file = "temp_cat_file"
    with open(source_corp, "r", encoding="utf-8") as s, \
            open(dest_corp, "r", encoding="utf-8") as d, \
            open(temp_file, "w", encoding="utf-8") as t:
        t.writelines(s.readlines() + d.readlines())
    if verbose:
        print("Concatenated %s and %s to temporary file %s" %
              (source_corp, dest_corp, temp_file))
    try:
        with open(temp_file, "r", encoding="utf-8") as temp, \
                open(joint_codes, "w", encoding="utf-8") as joint:
            learn_bpe.main(temp, joint, num_sequences, verbose=verbose)
    finally:
        # Always remove the temporary concatenation, even when learning
        # fails (the original leaked it on any exception).
        os.remove(temp_file)
    if verbose:
        print("Deleted temporary file %s" % temp_file)
        print("Wrote joint codes file %s" % joint_codes)
def learn_codes(source_corp, dest_corp, source_codes, dest_codes, num_sequences, verbose=False):
    """Learn separate BPE codes for the source and destination corpora.

    source_corp - file path to source corpus
    dest_corp - file path to destination corpus
    source_codes - file path to write source codes
    dest_codes - file path to write destination codes
    num_sequences - maximum number of sequences to learn
    verbose - print all messages
    """
    # Same procedure for both sides: learn codes from the corpus, report.
    pairs = ((source_corp, source_codes), (dest_corp, dest_codes))
    for corpus_path, codes_path in pairs:
        with open(corpus_path, "r", encoding="utf-8") as corp, \
                open(codes_path, "w", encoding="utf-8") as codes:
            learn_bpe.main(corp, codes, num_sequences, verbose=verbose)
        if verbose:
            print("Wrote codes file %s" % codes_path)
def build_subword_vocab(operation_num=100):
    """Learn `operation_num` BPE merge operations over the generated word file.

    Reads <datagen_path>/word and writes the learned codes to
    <datagen_path>/word_subword.

    operation_num - number of BPE merge operations to learn
    """
    infile = os.path.join(datagen_path, 'word')
    outfile = infile + '_subword'
    # Context managers guarantee both handles are closed even if learning
    # fails (the original leaked the codecs.open() handles).
    with codecs.open(infile, encoding='utf-8') as fin, \
            codecs.open(outfile, 'w', encoding='utf-8') as fout:
        learn_bpe.main(fin, fout, operation_num, 2, verbose=True,
                       vocab_name=vocab_file)
def test_learn_bpe(self):
    """Learn 1000 BPE operations on corpus.en and compare output to bpe.ref."""
    infile_path = os.path.join(currentdir, 'data', 'corpus.en')
    outfile_path = os.path.join(currentdir, 'data', 'bpe.out')
    # Context managers close the handles even when learning or an
    # assertion fails (the original leaked them in that case).
    with codecs.open(infile_path, encoding='utf-8') as infile, \
            codecs.open(outfile_path, 'w', encoding='utf-8') as outfile:
        learn_bpe.main(infile, outfile, 1000)
    with open(outfile_path) as outlines, \
            open(os.path.join(currentdir, 'data', 'bpe.ref')) as reflines:
        # Line-by-line comparison against the reference output.
        for line, line2 in zip(outlines, reflines):
            self.assertEqual(line, line2)
def main(input, output_name, vocab, symbols, separator='@@', min_frequency=2, verbose=False):
    """Learn joint BPE codes over all input corpora, then segment each corpus
    and write a per-corpus subword vocabulary.

    input - list of training-corpus file paths (parallel to `vocab`)
    output_name - path to write the learned BPE codes
    vocab - list of vocabulary output file paths
    symbols - number of BPE merge operations to learn
    separator - subword separator used when segmenting
    min_frequency - minimum pair frequency for a merge
    verbose - forwarded to learn_bpe.main
    """
    # read/write files as UTF-8
    input = [codecs.open(f, encoding='UTF-8') for f in input]
    vocab = [codecs.open(f, 'w', encoding='UTF-8') for f in vocab]
    try:
        # get combined vocabulary of all input texts
        full_vocab = Counter()
        for f in input:
            full_vocab += learn_bpe.get_vocabulary(f)
            f.seek(0)

        vocab_list = ['{0} {1}'.format(key, freq)
                      for (key, freq) in full_vocab.items()]

        # learn BPE on combined vocabulary
        with codecs.open(output_name, 'w', encoding='UTF-8') as output:
            learn_bpe.main(vocab_list, output, symbols, min_frequency,
                           verbose, is_dict=True)

        with codecs.open(output_name, encoding='UTF-8') as codes:
            bpe = apply_bpe.BPE(codes, separator, None)

        # apply BPE to each training corpus and count the resulting subwords
        for train_file, vocab_file in zip(input, vocab):
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            try:
                with codecs.open(tmp.name, 'w', encoding='UTF-8') as tmpout:
                    train_file.seek(0)
                    for line in train_file:
                        tmpout.write(bpe.segment(line).strip())
                        tmpout.write('\n')
                with codecs.open(tmp.name, encoding='UTF-8') as tmpin:
                    # Distinct local name: the original rebound the `vocab`
                    # parameter here, shadowing the output-file list.
                    subword_counts = learn_bpe.get_vocabulary(tmpin)
            finally:
                # remove the temp segmentation even if counting failed
                os.remove(tmp.name)
            for key, freq in sorted(subword_counts.items(),
                                    key=lambda x: x[1], reverse=True):
                vocab_file.write("{0} {1}\n".format(key, freq))
    finally:
        # Close every handle we opened; the original never closed the input
        # handles and leaked all of them on any exception.
        for f in input + vocab:
            f.close()
# NOTE(review): collapsed, truncated script fragment — it ends mid-loop
# (the body after train_file.seek(0) is not visible here), so it is left
# byte-identical rather than reformatted; confirm against the full file.
# get combined vocabulary of all input texts full_vocab = Counter() for f in args.input: full_vocab += learn_bpe.get_vocabulary(f) f.seek(0) vocab_list = [ '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items() ] # learn BPE on combined vocabulary with codecs.open(args.output.name, 'w', encoding='UTF-8') as output: learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True) with codecs.open(args.output.name, encoding='UTF-8') as codes: bpe = apply_bpe.BPE(codes, separator=args.separator) # apply BPE to each training corpus and get vocabulary for train_file, vocab_file in zip(args.input, args.vocab): tmp = tempfile.NamedTemporaryFile(delete=False) tmp.close() tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8') train_file.seek(0)
# NOTE(review): collapsed, truncated script fragment — it ends on a bare
# `for ...:` header whose body is not visible here, so it is left
# byte-identical rather than reformatted; confirm against the full file.
# get combined vocabulary of all input texts full_vocab = Counter() for f in args.input: full_vocab += learn_bpe.get_vocabulary(f) f.seek(0) vocab_list = [ '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items() ] # learn BPE on combined vocabulary with io.open(args.output.name, 'w', encoding='UTF-8') as output: learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True, case_insensitive=args.case_insensitive) separator = '@@' if args.opennmt_separator: separator = '■' with io.open(args.output.name, encoding='UTF-8') as codes: bpe = apply_bpe.BPE(codes, separator, None, case_feature=args.case_insensitive) # apply BPE to each training corpus and get vocabulary for train_file, vocab_file in zip(args.input, args.vocab):
# NOTE(review): collapsed, truncated script fragment — it ends mid-loop
# (the write loop's continuation is not visible here), so it is left
# byte-identical rather than reformatted; confirm against the full file.
# read/write files as UTF-8 args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input] args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab] # get combined vocabulary of all input texts full_vocab = Counter() for f in args.input: full_vocab += learn_bpe.get_vocabulary(f) f.seek(0) vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()] # learn BPE on combined vocabulary with codecs.open(args.output.name, 'w', encoding='UTF-8') as output: learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True) with codecs.open(args.output.name, encoding='UTF-8') as codes: bpe = apply_bpe.BPE(codes, separator=args.separator) # apply BPE to each training corpus and get vocabulary for train_file, vocab_file in zip(args.input, args.vocab): tmp = tempfile.NamedTemporaryFile(delete=False) tmp.close() tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8') train_file.seek(0) for line in train_file: tmpout.write(bpe.segment(line).strip())
def main(args_list: list = None):
    """CLI entry point: learn joint BPE codes over the input corpora, then
    segment each corpus and write its subword vocabulary (tab-separated).

    args_list - optional argv-style argument list; None means sys.argv.
    """
    parser = create_parser()
    args = parser.parse_args(args_list)

    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write(
            'Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8')
                  for f in args.vocab]
    try:
        # get combined vocabulary of all input texts
        full_vocab = Counter()
        for f in args.input:
            full_vocab += learn_bpe.get_vocabulary(f)
            f.seek(0)

        vocab_list = ['{0} {1}'.format(key, freq)
                      for (key, freq) in full_vocab.items()]

        # learn BPE on combined vocabulary
        with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
            learn_bpe.main(vocab_list, output, args.symbols,
                           args.min_frequency, args.verbose, is_dict=True)

        with codecs.open(args.output.name, encoding='UTF-8') as codes:
            bpe = apply_bpe.BPE(codes, separator=args.separator)

        # apply BPE to each training corpus and write its vocabulary
        for train_file, vocab_file in zip(args.input, args.vocab):
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            try:
                with codecs.open(tmp.name, 'w', encoding='UTF-8') as tmpout:
                    train_file.seek(0)
                    for line in train_file:
                        tmpout.write(bpe.segment(line).strip())
                        tmpout.write('\n')
                with codecs.open(tmp.name, encoding='UTF-8') as tmpin:
                    subword_counts = learn_bpe.get_vocabulary(tmpin)
            finally:
                # remove the temp segmentation even if counting failed
                # (the original leaked it on any exception)
                os.remove(tmp.name)
            for key, freq in sorted(subword_counts.items(),
                                    key=lambda x: x[1], reverse=True):
                vocab_file.write("{0}\t{1}\n".format(key, freq))
    finally:
        # Close every handle we opened; the original never closed the input
        # handles and leaked all handles if anything above raised.
        for f in args.input + args.vocab:
            f.close()