def bpe_processing(filenames: list[str]):
    """Learn separate BPE rules for Russian and Chinese and write a token map.

    Every line of every input file must contain exactly one ``" ||| "``
    separator: the text before it is collected as the Russian sentence, the
    text after it (with all spaces removed) as the Chinese sentence.

    Side effects: writes ``bpe_rules.ru``, ``bpe_rules.zh`` and
    ``token_map.txt`` in the current working directory and echoes every
    segmented sentence pair to stdout.

    :param filenames: paths of the parallel-corpus files to read.
    :raises ValueError: if a line does not split into exactly two parts.
    """
    ru_sentences = []
    zh_sentences = []
    for filename in filenames:
        with open(filename, "r") as f:
            for line in f:
                ru, zh = line.split(" ||| ")
                ru_sentences.append(ru)
                zh_sentences.append(zh.replace(" ", ""))

    bpe = {}
    for lang, sentences in (("ru", ru_sentences), ("zh", zh_sentences)):
        rules_name = "bpe_rules." + lang
        # Close the rules file before it is read back so that all merge
        # operations are guaranteed to be flushed to disk.
        with open(rules_name, "w") as rules_out:
            learn_bpe(StringIO("\n".join(sentences)), rules_out,
                      num_symbols=8000)
        with open("./" + rules_name) as rules_in:
            bpe[lang] = BPE(rules_in)

    with open("token_map.txt", "w") as f:
        for ru, zh in zip(ru_sentences, zh_sentences):
            ru_tokens = bpe["ru"].process_line(ru.strip())
            zh_tokens = bpe["zh"].process_line(zh.strip())
            ru_map = ru_token_map(ru_tokens)
            zh_map = zh_token_map(zh_tokens)
            print(*ru_map, "|||", *zh_map, file=f)
            print(ru_tokens + " ||| " + zh_tokens)
def finalize(self, frequencies, num_symbols=30000, minfreq=2):
    """Learn the BPE codecs from token frequencies and load them.

    :param frequencies: dictionary of (token: frequency) pairs.
    :param num_symbols: number of BPE symbols; values <= 0 fall back to
        30000.
    :param minfreq: minimum token frequency handed to subword-nmt; values
        <= 0 fall back to 2.
    :return: False when a ``bpe`` attribute already exists (nothing is
        done), True after the codecs were written and loaded.
    """
    already_finalized = hasattr(self, 'bpe')
    if already_finalized:
        return False

    print('Dictionary: saving bpe codecs to {}'.format(self.codecs))

    # One "<token> <count>" entry per item, the dictionary input format
    # selected by is_dict=True below.
    entries = (
        "{} {}".format(token, count) for token, count in frequencies.items()
    )
    symbol_count = 30000 if num_symbols <= 0 else num_symbols
    min_count = 2 if minfreq <= 0 else minfreq

    with open(self.codecs, 'w') as codec_file:
        learn_bpe.learn_bpe(
            entries,
            codec_file,
            num_symbols=symbol_count,
            min_frequency=min_count,
            is_dict=True,
        )

    self._load_from_codecs()
    return True
def subword_gen(infile, outfile, num_symbols):
    """Learn BPE codes from a UTF-8 text file and write them to ``outfile``.

    :param infile: path of the training text (read as UTF-8).
    :param outfile: path the learned codes are written to (UTF-8).
    :param num_symbols: symbol budget passed to ``learn_bpe``; the call sets
        ``total_symbols=True``.
    """
    # Both streams are closed on exit, which also flushes the codes file
    # before any caller reads it back.
    with codecs.open(infile, encoding='utf-8') as infile_stream, \
            codecs.open(outfile, 'w', encoding='utf-8') as outfile_stream:
        learn_bpe(infile_stream, outfile_stream, num_symbols,
                  is_dict=False, total_symbols=True)
def build_vocab(imgs, params):
    """Learn BPE on all captions, segment them, and build the vocabulary.

    Every ``img`` in ``imgs`` gets a new ``'final_captions'`` entry: one list
    of BPE sub-tokens per sentence in ``img['sentences']``.

    :param imgs: list of dicts, each with a ``'sentences'`` list whose items
        carry a ``'tokens'`` list of strings.
    :param params: mapping providing ``'symbol_count'`` (passed to
        ``learn_bpe``).
    :return: ``(vocab, bpe)`` where ``vocab`` is the list of sub-tokens
        sorted by descending frequency with ``'UNK'`` appended, and ``bpe``
        is the text of the learned codes file.
    """
    captions = '\n'.join(
        ' '.join(sent['tokens']) for img in imgs for sent in img['sentences'])

    # Only the paths are needed: the files are reopened by name below and
    # removed in the ``finally`` clause, even when a step fails.
    all_captions = tempfile.NamedTemporaryFile(delete=False)
    all_captions.close()
    codecs_output = tempfile.NamedTemporaryFile(delete=False)
    codecs_output.close()
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()

    try:
        # Written as UTF-8 because it is read back as UTF-8 right below.
        with codecs.open(all_captions.name, 'w', encoding='UTF-8') as txt_file:
            txt_file.write(captions)

        with codecs.open(all_captions.name, encoding='UTF-8') as corpus, \
                codecs.open(codecs_output.name, 'w',
                            encoding='UTF-8') as output:
            learn_bpe.learn_bpe(corpus, output, params['symbol_count'])

        with codecs.open(codecs_output.name, encoding='UTF-8') as codes:
            bpe = apply_bpe.BPE(codes)

        with codecs.open(tmp.name, 'w', encoding='UTF-8') as tmpout:
            for img_idx, img in enumerate(imgs):
                img['final_captions'] = []
                for sent in img['sentences']:
                    txt = ' '.join(sent['tokens'])
                    txt = bpe.segment(txt).strip()
                    img['final_captions'].append(txt.split(' '))
                    tmpout.write(txt)
                    tmpout.write('\n')
                    # Echo the captions of the first 20 images as a sample.
                    if img_idx < 20:
                        print(txt)

        with codecs.open(tmp.name, encoding='UTF-8') as tmpin:
            counts = learn_bpe.get_vocabulary(tmpin)
        vocab = sorted(counts.keys(), key=lambda x: counts[x], reverse=True)

        # Always insert UNK
        print('inserting the special UNK token')
        vocab.append('UNK')
        print('Vocab size:', len(vocab))

        with codecs.open(codecs_output.name, encoding='UTF-8') as codes:
            bpe_codes = codes.read()
    finally:
        for path in (all_captions.name, codecs_output.name, tmp.name):
            os.remove(path)

    return vocab, bpe_codes
def __learn(self):
    """Train BPE codes and then load them.

    Reads these instance attributes:

    * ``trainfile`` -- path of the UTF-8 text to learn from.
    * ``codesfile`` -- path the learned codes are written to (UTF-8).
    * ``num_symbols`` -- passed to ``learn_bpe`` as the symbol count.
    * ``min_frequency`` -- passed to ``learn_bpe`` as the minimum frequency.
    """
    # The codes file must be closed (and therefore flushed) before
    # __open_bpe() runs, otherwise buffered codes may still be missing.
    with codecs.open(self.trainfile, encoding='utf-8') as trainfile, \
            codecs.open(self.codesfile, mode='w',
                        encoding='utf-8') as codesfile:
        learn_bpe(trainfile, codesfile, self.num_symbols, self.min_frequency)
    self.__open_bpe()
def learn_bpe_function(raw_train_file, bpe_codes_file):
    """Learn BPE codes using the defaults of the ``learn_bpe`` CLI parser.

    The parser is only used to obtain default values for ``symbols``,
    ``min_frequency``, ``verbose``, ``dict_input`` and ``total_symbols``.

    :param raw_train_file: path of the training text (read as UTF-8).
    :param bpe_codes_file: path the learned codes are written to (UTF-8).
    """
    parser = learn_bpe.create_parser()
    args = parser.parse_args(
        ["--input", raw_train_file, "--output", bpe_codes_file])

    # parse_args hands back already-open file objects (only their ``.name``
    # is needed). Close them instead of leaking them, then reopen the same
    # paths with an explicit UTF-8 encoding.
    input_path = args.input.name
    output_path = args.output.name
    args.input.close()
    args.output.close()

    with codecs.open(input_path, encoding='utf-8') as infile, \
            codecs.open(output_path, 'w', encoding='utf-8') as outfile:
        learn_bpe.learn_bpe(infile, outfile, args.symbols,
                            args.min_frequency, args.verbose,
                            is_dict=args.dict_input,
                            total_symbols=args.total_symbols)
def build_wikitext_bpe_encoder(
        special_tokens: Optional[Sequence[str]] = None) -> BPEEncoder:
    """Train BPE merges on the WikiText training set and build an encoder.

    Steps: count regex-tokenized words (numbers are replaced by
    ``TOKEN_FOR_NUMBERS``), learn ``NUM_BPE_MERGES`` merges from those
    counts, count the resulting BPE tokens, and wrap tokenizer and BPE
    vocabulary in a ``BPEEncoder``.

    :param special_tokens: forwarded unchanged to ``BPEEncoder``.
    :return: the constructed ``BPEEncoder``.
    """
    base_tokenizer = RegexTokenizer()

    def tokenize_with_regex(text: str) -> Iterable[str]:
        # Numeric tokens are collapsed into one placeholder symbol.
        for tok in base_tokenizer.apply(text):
            yield TOKEN_FOR_NUMBERS if tok.number else str(tok)

    def wikitext_tokens(tokenizer: Callable[[str], Iterable[str]],
                        description: str):
        # Stream the tokens of every training line, with a progress bar.
        lines = read_wikitext_file(TRAINING_SET_NAME).splitlines()
        for text_line in tqdm.tqdm(lines, desc=description):
            yield from tokenizer(text_line)

    def count_lines(vocabulary):
        # "<word> <count>" per entry, joined by newlines.
        return '\n'.join(
            '{} {}'.format(word, counter) for word, counter in vocabulary)

    word_counts = build_vocabulary(
        wikitext_tokens(tokenize_with_regex, 'Building vocabulary'))
    vocabulary_file = io.StringIO(count_lines(word_counts))

    with io.StringIO() as merges_buffer:
        print('Learning BPE...', flush=True, end='')
        learn_bpe(vocabulary_file, merges_buffer, NUM_BPE_MERGES,
                  min_frequency=3, verbose=False, is_dict=True,
                  total_symbols=False)
        merges_buffer.seek(0)
        print('Done', flush=True)
        merges = BPEMerges.load_from_file(merges_buffer)

    bpe_tokenizer = BPETokenizer(merges, tokenize_with_regex,
                                 mark_sequence_edges=True)
    bpe_vocabulary = build_vocabulary(
        wikitext_tokens(bpe_tokenizer.apply, 'Building BPE vocabulary'))
    print('BPE Vocabulary size:', len(bpe_vocabulary))

    bpe_vocabulary_file = io.StringIO(count_lines(bpe_vocabulary))
    return BPEEncoder(bpe_tokenizer, bpe_vocabulary_file,
                      special_tokens=special_tokens)
def fix_file(self, corpus, pair, locale_code, src_file, target_file, action):
    """Learn BPE codes from ``src_file`` and copy it unchanged to the target.

    The codes are written to ``self.code_files[pair][locale_code]``. The
    source text itself is not segmented here; it is copied verbatim to
    ``target_file``.

    :param corpus: unused in this method.
    :param pair: language-pair key into ``self.code_files``.
    :param locale_code: locale key into ``self.code_files[pair]``.
    :param src_file: path of the text the codes are learned from.
    :param target_file: path ``src_file`` is copied to.
    :param action: unused in this method.
    :return: always an empty set.
    """
    debug(f" -[{pair}/{locale_code}]: processing {src_file} now!")
    # Context managers close both handles so the codes file is fully
    # written before anything else reads it.
    with open(src_file) as infile, \
            open(self.code_files[pair][locale_code], "w") as outfile:
        learn_bpe(infile=infile,
                  outfile=outfile,
                  verbose=self.verbose,
                  num_symbols=self.n_symbols,
                  min_frequency=self.min_frequency)
    copyfile(src_file, target_file)
    return set()
def finalize(self,
             frequencies: Dict[str, int],
             num_symbols: int = 30000,
             minfreq: int = 2) -> bool:
    """
    Learn the BPE codecs from token frequencies and load them.

    :param frequencies: dictionary of (token: frequency) pairs.
    :param num_symbols: number of BPE symbols; values <= 0 fall back to
        30000.
    :param minfreq: minimum token frequency handed to subword-nmt; values
        <= 0 fall back to 2.
    :return did_finalize: False when a ``bpe`` attribute already exists,
        True when the codecs were written and loaded by this call.
    """
    if hasattr(self, 'bpe'):
        # Codecs were finalized earlier; nothing to do.
        return False

    logging.debug(f'Saving bpe codecs to {self.codecs}')

    # One "<token> <count>" entry per item (is_dict=True input format).
    entries = (
        "{} {}".format(token, count) for token, count in frequencies.items()
    )
    symbol_count = 30000 if num_symbols <= 0 else num_symbols
    min_count = 2 if minfreq <= 0 else minfreq

    # Make sure the directory that will hold the codecs file exists.
    PathManager.mkdirs(os.path.dirname(self.codecs))
    with PathManager.open(self.codecs, 'w', encoding='utf-8') as codec_file:
        learn_bpe.learn_bpe(
            entries,
            codec_file,
            num_symbols=symbol_count,
            min_frequency=min_count,
            is_dict=True,
        )

    self._load_from_codecs()
    return True
def train_bpe(config):
    """Train BPE codes on the contexts and questions of a JSON dataset.

    Reads ``config.train_file`` (JSON), tokenizes and preprocesses every
    paragraph ``context`` and every ``question`` under
    ``data[0].paragraphs``, and writes the learned codes to
    ``config.bpe_codes_file``.

    :param config: object providing ``train_file``, ``remove_unicode``,
        ``bpe_codes_file`` and ``bpe_merges_count``.
    """
    print('Start BPE training...')
    from subword_nmt.learn_bpe import main as learn_bpe

    with open(config.train_file, 'r') as train_fp:
        train = json.load(train_fp)

    def clean(text):
        # Word-tokenize, re-join with single spaces, then preprocess.
        return preprocess_string(' '.join(word_tokenize(text)),
                                 remove_unicode=config.remove_unicode)

    train_texts = []
    # NOTE(review): only the first entry of train['data'] is used -- confirm
    # that this is intended.
    for p in train['data'][0]['paragraphs']:
        train_texts.append(clean(p['context']))
        train_texts.extend(clean(qas['question']) for qas in p['qas'])

    # Close the codes file explicitly so it is flushed before the success
    # message is printed.
    with open(config.bpe_codes_file, 'w') as codes_fp:
        learn_bpe(train_texts, outfile=codes_fp,
                  num_symbols=config.bpe_merges_count)
    print('BPE trained. BPE codes saved to {}'.format(config.bpe_codes_file))
def create_train_bpe(train_loc, bpe_voc=['en', 'ru'], num_symbols=10000):
    """Learn BPE rules per language and write BPE-segmented training files.

    For every ``lang`` in ``bpe_voc`` this reads ``train.<lang>``, writes
    ``bpe_rules.<lang>`` and then ``train.bpe.<lang>``. All three paths are
    built by plain string concatenation with ``train_loc``, so ``train_loc``
    has to end with a path separator when it names a directory.

    args:
        train_loc: location prefix of the ``train.<lang>`` files
        bpe_voc: language suffixes to process
        num_symbols: symbol count passed to ``learn_bpe``
    returns:
        None; the files described above are written as a side effect
    """
    # build and apply bpe vocs
    bpe = {}
    for lang in bpe_voc:
        train_path = train_loc + 'train.' + lang
        rules_path = train_loc + 'bpe_rules.' + lang

        print("Learning BPE...")
        # Close the rules file before reading it back so it is fully flushed.
        with open(train_path) as train_in, open(rules_path, 'w') as rules_out:
            learn_bpe(train_in, rules_out, num_symbols=num_symbols)
        with open(rules_path) as rules_in:
            bpe[lang] = BPE(rules_in)

        print("Writing train files...")
        with open(train_path) as f_in, \
                open(train_loc + 'train.bpe.' + lang, 'w') as f_out:
            for line in f_in:
                f_out.write(bpe[lang].process_line(line.strip()) + '\n')
def train_subword_model(src_text, trg_text, nb_symbols=10000):
    """Learn a joint BPE model and segment source and target with it.

    In-memory equivalent of the subword-nmt command-line recipe:

    1. learn-bpe on the concatenation of source and target text,
    2. apply-bpe + get-vocab for each side separately,
    3. apply-bpe again for each side, restricted to that side's vocabulary.

    :param src_text: source-side text lines.
    :param trg_text: target-side text lines.
    :param nb_symbols: symbol count for learning, also used as the
        ``merges`` argument in step 2.
    :return: ``(bpe_model, src_vocab, trg_vocab, src_text_tok,
        trg_text_tok)`` where ``bpe_model`` is the codes text and the
        tokenized texts come from step 3.
    """
    joint_corpus = [*src_text, *trg_text]

    codes_buffer = io.StringIO()
    src_vocab_buffer = io.StringIO()
    trg_vocab_buffer = io.StringIO()

    # Step 1: joint codes (min_frequency=0, verbose/is_dict/total_symbols
    # all off).
    learn_bpe(joint_corpus, codes_buffer, nb_symbols, 0, False, False, False)

    def side_vocabulary(text, vocab_buffer):
        # Step 2 for one side: segment, count sub-tokens, read them back.
        segmented = apply_bpe(codes_buffer, text, merges=nb_symbols)
        get_vocab(segmented, vocab_buffer)
        vocab_buffer.seek(0)
        return read_vocabulary(vocab_buffer, 0)

    src_vocab = side_vocabulary(src_text, src_vocab_buffer)
    trg_vocab = side_vocabulary(trg_text, trg_vocab_buffer)

    # Step 3: final segmentation, constrained by the per-side vocabulary.
    src_text_tok = apply_bpe(codes_buffer, src_text, vocab=src_vocab)
    trg_text_tok = apply_bpe(codes_buffer, trg_text, vocab=trg_vocab)

    bpe_model = codes_buffer.getvalue()
    for buffer in (codes_buffer, src_vocab_buffer, trg_vocab_buffer):
        buffer.close()

    return bpe_model, src_vocab, trg_vocab, src_text_tok, trg_text_tok
def subword(self, cleaned_filepaths, overwrite):
    """Learn (for training corpora) and apply BPE to the cleaned files.

    For ``corpora_type == 'training'`` the cleaned files are concatenated
    and BPE codes are learned from the result, unless the codes file already
    exists and ``overwrite`` is False. The codes are then applied to each
    language file; for training corpora a vocabulary file is written from
    each processed file as well.

    :param cleaned_filepaths: cleaned input paths, indexed in the same order
        as ``self.langs``.
    :param overwrite: when False, existing codes and processed files are
        kept and not regenerated.
    """
    bpe_filepath = get_bpe_path(self.experiment_name, self.merge_ops)

    if self.corpora_type == 'training':
        # BPE is learned from a single file holding all cleaned corpora.
        concatenated_filepath = get_concat_path(self.file_prefix)
        concatenate_files(cleaned_filepaths, concatenated_filepath,
                          overwrite=overwrite)

        if not (os.path.exists(bpe_filepath) and overwrite == False):
            print('Learning BPE encoding. This may take a while.')
            with open(concatenated_filepath, 'r',
                      encoding='utf-8') as corpus, \
                    open(bpe_filepath, 'w', encoding='utf-8') as codes_out:
                # Learns the codecs and writes them to codes_out.
                learn_bpe.learn_bpe(corpus, codes_out,
                                    num_symbols=self.merge_ops)
        else:
            print(bpe_filepath, 'already exists')

    print('Applying')
    with open(bpe_filepath, 'r', encoding='utf-8') as codec:
        bpe = apply_bpe.BPE(codec)

    print('Writing bpe')
    for idx, lang in enumerate(self.langs):
        source_path = cleaned_filepaths[idx]
        processed_filepath = get_processed_data_path(
            self.experiment_name, self.corpora_type, lang)

        # An existing processed file is kept; the vocabulary step is
        # skipped for it as well.
        if overwrite == False and os.path.exists(processed_filepath):
            continue

        with open(source_path, 'r', encoding='utf-8') as raw, \
                open(processed_filepath, 'w', encoding='utf-8') as processed:
            for line in raw:
                processed.write(bpe.process_line(line))

        if self.corpora_type != 'training':
            continue

        vocab_filepath = get_vocab_path(self.experiment_name, lang)
        with open(processed_filepath, 'r', encoding='utf-8') as train_file, \
                open(vocab_filepath, 'w', encoding='utf-8') as vocab_file:
            get_vocab.get_vocab(train_file, vocab_file)
def learn_bpe(self, item_list, bin_file=False, from_filenames=True):
    """Learn BPE codes from files or in-memory text and load them.

    The word vocabulary of all inputs is accumulated first; the codes are
    then learned from that vocabulary, written to ``self.codes_file`` and
    loaded through ``self.set_bpe``.

    :param item_list: a filename or list of filenames when
        ``from_filenames`` is True, otherwise the text passed directly to
        ``get_vocabulary``.
    :param bin_file: when True, each file is read as a sequence of records:
        an 8-byte length (``struct`` format ``'q'``) followed by that many
        bytes holding a serialized ``Example`` with ``'article'`` and
        ``'abstract'`` features.
    :param from_filenames: whether ``item_list`` holds filenames.
    """
    logging.info('generating bpe codes file. saving to %s' % self.codes_file)

    full_vocab = OrderedCounter()
    if from_filenames:
        filenames = item_list
        if isinstance(filenames, str):
            filenames = [filenames]

        # get combined vocabulary of all input files
        for fname in filenames:
            if bin_file:
                with open(fname, 'rb') as reader:
                    # Read every record of the file. Hitting end of file
                    # only ends this file; the remaining files are still
                    # processed.
                    while True:
                        len_bytes = reader.read(8)
                        if not len_bytes:
                            break  # finished reading this file
                        str_len = struct.unpack('q', len_bytes)[0]
                        example_str = struct.unpack(
                            '%ds' % str_len, reader.read(str_len))[0]
                        example = example_pb2.Example.FromString(example_str)
                        features = example.features.feature
                        # the article text was saved under the key 'article'
                        article_text = features[
                            'article'].bytes_list.value[0].decode()
                        # the abstract text was saved under the key
                        # 'abstract'
                        abstract_text = features[
                            'abstract'].bytes_list.value[0].decode()
                        full_vocab += learn_bpe.get_vocabulary(
                            abstract2sents(article_text) +
                            abstract2sents(abstract_text))
            else:
                with codecs.open(fname, encoding='UTF-8') as f:
                    full_vocab += learn_bpe.get_vocabulary(f)
    else:
        # get combined vocabulary of all input texts
        full_vocab += learn_bpe.get_vocabulary(item_list)

    vocab_list = [
        '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()
    ]

    # learn BPE on combined vocabulary
    with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, self.num_symbols,
                            self.min_frequency, False, is_dict=True)
    self.set_bpe(self.codes_file)
def finalize(self):
    """Learn the BPE codecs from ``self.training_data`` exactly once.

    :return: False when the codecs were already built, True after they were
        learned, written to ``self.codecs`` and loaded.
    """
    if self.built:
        return False

    # Marked as built before training starts.
    self.built = True

    # The raw training data is handed over as-is (is_dict=False). Passing
    # "<word> <count>" data with is_dict=True might save memory, but would
    # need more marshalling of data back and forth.
    with open(self.codecs, 'w') as codec_file:
        learn_bpe.learn_bpe(
            self.training_data,
            codec_file,
            num_symbols=self.num_symbols,
            min_frequency=self.minfreq,
            is_dict=False,
        )

    self._load_from_codecs()
    return True
def tokenize_corpus_hw(data='data.txt', train_loc='./', bpe_voc=['en', 'ru'],
                       num_symbols=10000):
    """Tokenize a tab-separated parallel corpus and BPE-segment both sides.

    Each line of ``data`` must consist of exactly two tab-separated fields.
    The first is written (tokenized) to ``train.<bpe_voc[0]>``, the second to
    ``train.<bpe_voc[1]>``. For every language in ``bpe_voc`` BPE rules are
    then learned into ``bpe_rules.<lang>`` and the segmented text is written
    to ``train.bpe.<lang>``. All output paths are built by concatenating
    ``train_loc`` with the file name.

    :param data: path of the tab-separated input file.
    :param train_loc: location prefix for all generated files.
    :param bpe_voc: language suffixes; the first two name source and target.
    :param num_symbols: symbol count passed to ``learn_bpe``.
    :raises ValueError: if a line does not contain exactly two fields.
    """
    tokenizer = WordPunctTokenizer()
    with open(data) as f_in, \
            open(train_loc + 'train.' + bpe_voc[0], 'w') as f_src, \
            open(train_loc + 'train.' + bpe_voc[1], 'w') as f_dst:
        for line in f_in:
            src_line, dst_line = line.strip().split('\t')
            f_src.write(tokenize(src_line, tokenizer) + '\n')
            f_dst.write(tokenize(dst_line, tokenizer) + '\n')

    # build and apply bpe vocs
    bpe = {}
    for lang in bpe_voc:
        train_path = train_loc + 'train.' + lang
        # The rules are written to the same path they are read back from.
        # Writing them to the working directory instead only works when
        # train_loc is './'.
        rules_path = train_loc + 'bpe_rules.' + lang

        with open(train_path) as train_in, open(rules_path, 'w') as rules_out:
            learn_bpe(train_in, rules_out, num_symbols=num_symbols)
        with open(rules_path) as rules_in:
            bpe[lang] = BPE(rules_in)

        with open(train_path) as lang_in, \
                open(train_loc + 'train.bpe.' + lang, 'w') as f_out:
            for line in lang_in:
                f_out.write(bpe[lang].process_line(line.strip()) + '\n')
def learn_bpe(self, item_list, from_filenames=True):
    """Learn BPE codes from the word vocabulary of the inputs and load them.

    :param item_list: forwarded to ``_get_vocabulary`` together with
        ``from_filenames``.
    :param from_filenames: forwarded to ``_get_vocabulary``.
    """
    logging.info('generating bpe codes file. saving to %s' % self.codes_file)

    # Word-level vocabulary (before BPE); lines are split into words with
    # this instance's pre-tokenization setting.
    word_counts = _get_vocabulary(
        item_list,
        from_filenames=from_filenames,
        segment=lambda line: _segment_words(line, self.pre_tokenize))

    # "<word> <count>" entries, the dictionary input format of learn_bpe.
    count_lines = []
    for word, count in word_counts.items():
        count_lines.append('{0} {1}'.format(word, count))

    with codecs.open(self.codes_file, 'w', encoding='UTF-8') as codes_out:
        learn_bpe.learn_bpe(count_lines,
                            codes_out,
                            num_symbols=self.num_symbols,
                            min_frequency=self.min_frequency,
                            verbose=False,
                            is_dict=True,
                            total_symbols=self.total_symbols)

    self.set_bpe(self.codes_file)
def __init__(self, elements, prune, max_num, start=True, stop=True, pad=True, unk=True, rule=False, bpe=-1):
    """Build the ``itos`` list and ``stoi`` mapping from ``elements``.

    :param elements: iterable of tokens (or rules) to count.
    :param prune: in non-rule mode a token is kept only when its count is
        strictly greater than this value.
    :param max_num: in non-rule mode, the number of most frequent entries
        considered; in rule mode, the ``itos`` size limit applied to
        terminal rules.
    :param start: register the ``'<s>'`` symbol and store the flag.
    :param stop: register the ``'</s>'`` symbol and store the flag.
    :param pad: register the ``'<blank>'`` symbol.
    :param unk: register the ``'<unk>'`` symbol.
    :param rule: treat ``elements`` as grammar rules (see the pruning loop).
    :param bpe: when >= 0, passed to ``learn_bpe.learn_bpe`` as its second
        argument and ``elements`` is BPE-segmented before counting.
    """
    self.start = start
    self.stop = stop
    self.codes = None
    vocab = Counter()
    self.max_num = max_num
    self.itos = []
    self.stoi = {}
    # Special symbols are registered first, in this fixed order.
    if pad:
        self.addSymbol('<blank>')
    if unk:
        self.addSymbol('<unk>')
    if start:
        self.addSymbol('<s>')
    if stop:
        self.addSymbol('</s>')
    self.rule = rule
    if rule:
        # Adding these for both ATIS and CONCODE. Extra things in the vocab are ok.
        for pre_terminal in CDDataset.pre_terminal_symbols:
            self.addSymbol(CDDataset._unk_rule_from_Nt(pre_terminal))
    if bpe >= 0:
        self.codes = learn_bpe.learn_bpe(elements, bpe, 0)  # last is min freq
        b = apply_bpe.BPE(self.codes)
        elements = b.segment_tokens(elements)
    for w in elements:
        vocab[w] += 1
    if bpe >= 0:
        print('Vocab size {}'.format(len(vocab)))
    # prune low frequency words
    # In rule mode the cap is effectively unlimited so every rule is visited.
    max_vocab = self.max_num if not rule else 100000000000
    for (w, f) in vocab.most_common(max_vocab):
        # Kept when: a non-rule token is frequent enough, or a rule is not a
        # terminal rule, or a terminal rule still fits under max_num, or the
        # entry ends with "_concodeNT".
        if ((rule == False and f > prune) or (rule == True and not CDDataset._is_terminal_rule(w)) or (rule == True and CDDataset._is_terminal_rule(w) and len(self.itos) < self.max_num) or w.endswith("_concodeNT")):
            word = w.replace('concodeclass_', '').replace('concodefunc_', '')
            self.itos.append(word)
            self.stoi[word] = len(self.itos) - 1
        else:  # map everything else to unk
            if rule:
                # We need the right kind of UNK rule here
                mapped_to_known_unk = False
                for pre_terminal in CDDataset.pre_terminal_symbols:
                    if pre_terminal in w:
                        self.stoi[w] = self.stoi[CDDataset._unk_rule_from_Nt(pre_terminal)]
                        mapped_to_known_unk = True
                        break
                if not mapped_to_known_unk:
                    # An unk type we dont know about. Investigate.
                    # NOTE(review): this drops into an interactive debugger
                    # and needs the third-party ipdb package at runtime.
                    import ipdb
                    ipdb.set_trace()
                    # For next_rules, we cannot have any other type of unk
                    self.stoi[w] = self.stoi['<unk>']
            else:
                self.stoi[w] = self.stoi['<unk>']
def get_dict(args):
    """Learn joint BPE codes and write one BPE vocabulary file per language.

    The word vocabularies of both training files are combined, BPE codes are
    learned from the combination, and each training file is then segmented
    with those codes to produce its ``dict.<lang>`` file (one
    ``"<token> <count>"`` line per token, most frequent first).

    :param args: object providing ``train_prefix``, ``source_lang`` and
        ``target_lang``.
    :return: ``(vocab_args, output)`` -- the two vocabulary file paths and
        the path of the codes file.
    """
    input_args = [
        args.train_prefix + '.' + args.source_lang,
        args.train_prefix + '.' + args.target_lang
    ]
    # NOTE(review): output directory is hard-coded to one developer's
    # machine.
    path = '/Users/chaofeng/atmt/assignment3/baseline/raw_data_back20000/'
    vocab_args = [
        path + "dict" + '.' + args.source_lang,
        path + "dict" + '.' + args.target_lang
    ]
    separator = '@@'
    symbols = 10000
    min_frequency = 1
    output = path + "code"

    # get combined vocabulary of all input texts (files read as UTF-8)
    full_vocab = Counter()
    for train_path in input_args:
        with codecs.open(train_path, encoding='UTF-8') as train_file:
            full_vocab += learn_bpe.get_vocabulary(train_file)
    vocab_list = [
        '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()
    ]

    # learn BPE on combined vocabulary
    with codecs.open(output, mode='w', encoding='UTF-8') as codes_out:
        learn_bpe.learn_bpe(vocab_list, codes_out, symbols, min_frequency)
    with codecs.open(output, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=separator)

    # apply BPE to each training corpus and get vocabulary
    for train_path, vocab_path in zip(input_args, vocab_args):
        # The segmented corpus goes through a temporary file that is always
        # removed, even when segmentation fails.
        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()
        try:
            with codecs.open(train_path, encoding='UTF-8') as train_file, \
                    codecs.open(tmp.name, 'w', encoding='UTF-8') as tmpout:
                for line in train_file:
                    tmpout.write(bpe.segment(line).strip())
                    tmpout.write('\n')
            with codecs.open(tmp.name, encoding='UTF-8') as tmpin:
                bpe_vocab = learn_bpe.get_vocabulary(tmpin)
        finally:
            os.remove(tmp.name)

        # Closing the vocabulary file here guarantees it is complete when
        # the caller reads it.
        with codecs.open(vocab_path, mode='w',
                         encoding='UTF-8') as vocab_file:
            for key, freq in sorted(bpe_vocab.items(), key=lambda x: x[1],
                                    reverse=True):
                vocab_file.write("{0} {1}\n".format(key, freq))

    return vocab_args, output
def main(argv):
    """Transform a column-formatted dataset into source/target text files.

    Pipeline, in order:

    1. parse ``argv`` (io, representation, context, bpe and boundary groups);
    2. derive the output folder and file paths (skipped in ``--debug`` mode);
    3. load the word/lemma/tag columns with pandas;
    4. optionally segment words/lemmas into char-n-grams or BPE units;
    5. emit source/target lines, either per word with context
       (``word_and_context``) or per sentence (``sentence_to_sentence``),
       to the output files or, in debug mode, to stdout.

    Rows in which every column is NaN are treated as sentence separators.

    :param argv: list of command-line arguments (without the program name).
    :raises ValueError: when the output location cannot be derived, when the
        output files already exist and ``--overwrite`` is not given, or when
        no path for the BPE codes can be determined.
    """
    import argparse
    from io import StringIO

    # argument parsing
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="adapts an UD dataset to context-sensitive lemmatization")
    io_group = parser.add_argument_group('io')
    io_group.add_argument("--input", help="file to be transformed", type=str)
    io_group.add_argument("--output", help="output source and target files", nargs='+', type=str, default=None)
    io_group.add_argument(
        "--transform_appendix",
        help=
        "appendix to transform folder name (e.g. SLURM_JOB_ID or datetime)",
        type=str, default=None)
    io_group.add_argument(
        "--word_column_index",
        help="index of word column in the file (zero-indexed)",
        type=int, default=0)
    io_group.add_argument(
        "--lemma_column_index",
        help="index of lemma column in the file (zero-indexed)",
        type=int, default=1)
    io_group.add_argument(
        "--tag_column_index",
        help="index of tag column in the file (zero-indexed)",
        type=int, default=2)
    io_group.add_argument('--debug', dest='debug',
                          help="debug mode prints target/source file to stdout"
                          " instead of writing to the file system",
                          action='store_true')
    io_group.add_argument('--overwrite', dest='overwrite', action='store_true')
    io_group.add_argument(
        "--print_file",
        help="which file to output (source/target) in debug mode",
        choices=['source', 'target'], type=str, default=defaults["PRINT_FILE"])
    repr_group = parser.add_argument_group('representation')
    repr_group.add_argument(
        "--mode", help="mode of transformation",
        choices=['word_and_context', 'sentence_to_sentence'],
        type=str, default=defaults["MODE"])
    repr_group.add_argument("--word_unit", help="type of word representation",
                            choices=['char', 'word', 'bpe'], type=str,
                            default=defaults["WORD_UNIT"])
    repr_group.add_argument("--tag_unit", help="type of tag representation",
                            choices=['char', 'word'], type=str,
                            default=defaults["TAG_UNIT"])
    repr_group.add_argument("--context_unit",
                            help="type of context representation",
                            choices=['char', 'bpe', 'word'],
                            type=str, default=defaults["CONTEXT_UNIT"])
    repr_group.add_argument(
        "--char_n_gram_mode",
        help="size of char-n-grams (only used if --context_unit is char"
        "or if --mode is sentence_to_sentence and --word_unit is char, default: %(default)s)",
        type=int, default=defaults["CHAR_N_GRAM"])
    # default=SUPPRESS: the attribute is absent unless the flag is given;
    # later code checks for it with hasattr().
    repr_group.add_argument(
        "--sentence_size",
        help="maximum size of sentence in sentence_to_sentence mode",
        type=int, default=argparse.SUPPRESS)
    repr_group.add_argument('--tag_first', action='store_true',
                            help="if true tags will be printed before "
                            "words in source and target files")
    ctx_group = parser.add_argument_group('context')
    ctx_group.add_argument(
        "--context_size",
        help=
        "size of context representation (in respective units) on left and right (0 to use full span)",
        type=int, default=defaults["CONTEXT_SIZE"])
    ctx_group.add_argument(
        "--context_char_size",
        help=
        "size of context representation (in characters) on left and right (0 to use full span, has precedence over --context_size)",
        type=int, default=argparse.SUPPRESS)
    ctx_group.add_argument(
        "--context_span",
        help=
        "maximum span of a word in number of sentences on left and right of the sentence of the word, default: %(default)s))",
        type=int, default=defaults["CONTEXT_SPAN"])
    ctx_group.add_argument(
        "--context_tags",
        help="whether and where to include tag in the context",
        choices=['none', 'left'], type=str, default=defaults["CONTEXT_TAGS"])
    bpe_group = parser.add_argument_group('bpe')
    bpe_group.add_argument(
        "--bpe_operations",
        help="number of BPE merge operations to be learned "
        "(corresponds to number of symbols/char-n-grams/codes)",
        type=int, default=defaults["BPE_OPERATIONS"])
    bpe_group.add_argument(
        "--bpe_codes_path",
        help=
        "full file path to export BPE codes to or to read them from if available",
        type=str, default=None)
    boundary_group = parser.add_argument_group('boundaries')
    boundary_group.add_argument(
        "--left_context_boundary",
        help="left context boundary special symbol (default: %(default)s)",
        type=str, default=defaults["LEFT_CONTEXT_BOUNDARY"])
    boundary_group.add_argument(
        "--example_boundary",
        help="example boundary special symbol (default: %(default)s)",
        type=str, default=defaults["EXAMPLE_BOUNDARY"])
    boundary_group.add_argument(
        "--right_context_boundary",
        help="right context boundary special symbol (default: %(default)s)",
        type=str, default=defaults["RIGHT_CONTEXT_BOUNDARY"])
    boundary_group.add_argument(
        "--word_boundary",
        help="word boundary special symbol (default: %(default)s)",
        type=str, default=defaults["WORD_BOUNDARY"])
    boundary_group.add_argument(
        "--tag_boundary",
        help="tag boundary special symbol (default: %(default)s)",
        type=str, default=defaults["TAG_BOUNDARY"])
    boundary_group.add_argument(
        '--subword_separator', type=str,
        default=defaults["SUBWORD_SEPARATOR"], metavar='STR',
        help=
        "separator between non-final BPE subword units (default: '%(default)s'))"
    )
    args = parser.parse_args(argv)

    # determining input
    # NOTE(review): input_folder / input_filename are only assigned in the
    # else branch below, yet the single---output naming code further down
    # reads them -- confirm stdin input combined with one --output value.
    if args.input is None:
        if args.output is None:
            raise ValueError(
                "Can't decide how to name the transformation because you feed from stdin. Use --output to specify path."
            )
        args.input = sys.stdin
    else:
        # Split the path on runs of forward or backward slashes.
        input_folders = re.split("/+|\\\\+", args.input)
        if len(input_folders) < 2:
            raise ValueError(
                "Can't decide how to name the transformation. Use --output to specify path."
            )
        else:
            input_folder = input_folders[-2]
            input_filename = input_folders[-1].split(".")[0]

    # determining output
    # NOTE(review): in --debug mode none of the output paths (including
    # full_transform_folder_path) are assigned; the BPE branch below reads
    # full_transform_folder_path when --bpe_codes_path is not given.
    if not args.debug:
        if args.output is None or (type(args.output) is list and len(args.output) == 1):
            # Folder name encodes the transformation settings.
            transform_folder = "{}_{}_{}{}{}{}{}{}{}{}{}".format(
                input_folder,
                "w" + args.word_unit,
                "t" + args.tag_unit,
                ("_" + (("{:02d}u".format(args.context_size)) if not hasattr(args, 'context_char_size') else (str(args.context_char_size) + "ch"))) if args.mode == 'word_and_context' else "",
                ("_" + ("c" + args.context_unit)) if args.mode == "word_and_context" else "",
                "_n{}".format(args.char_n_gram_mode) if ((args.mode == 'word_and_context' and args.context_unit == "char") or (args.mode == 'sentence_to_sentence' and args.word_unit == 'char')) else "",
                "_n{}".format(args.bpe_operations) if ((args.mode == 'word_and_context' and args.context_unit == "bpe") or (args.mode == 'sentence_to_sentence' and args.word_unit == 'bpe')) else "",
                "_ct" if args.context_tags == 'left' else "",
                "_tf" if args.tag_first else "",
                "_cs{}".format(args.context_span) if args.mode == 'word_and_context' else "",
                ".{}".format(args.transform_appendix) if args.transform_appendix else "")
            if args.output is None or not args.output or '' in args.output:
                # Default location: <this file's directory>/input/<mode>/<folder>
                full_transform_folder_path = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), 'input',
                    args.mode, transform_folder)
            else:
                full_transform_folder_path = os.path.join(
                    args.output[0], transform_folder)
            os.makedirs(full_transform_folder_path, exist_ok=True)
            output_source_path = os.path.join(
                full_transform_folder_path, '{}_source'.format(input_filename))
            output_target_path = os.path.join(
                full_transform_folder_path, '{}_target'.format(input_filename))
            print(full_transform_folder_path)
            if not args.overwrite and (os.path.isfile(output_source_path) or os.path.isfile(output_target_path)):
                raise ValueError(
                    "Output files for {} already exist in {}. Pass --overwrite or delete them."
                    .format(input_filename, full_transform_folder_path))
            # truncate output files or create them anew
            open(output_source_path, 'w').close()
            open(output_target_path, 'w').close()
        else:
            if len(args.output) != 2:
                raise ValueError(
                    "You must specify full target and source output file paths (including file name)."
                )
            full_transform_folder_path = None
            output_source_path = args.output[0]
            output_target_path = args.output[1]
    print(args, file=sys.stderr)

    # loading file
    # quoting=3 is the numeric value of csv.QUOTE_NONE.
    infile_df = preprocess_dataset_for_train(
        pd.read_csv(args.input, sep='\s+', names=cols,
                    usecols=[
                        args.word_column_index, args.lemma_column_index,
                        args.tag_column_index
                    ],
                    skip_blank_lines=False, comment='#', quoting=3)[cols])
    infile_df = infile_df.reset_index(drop=True)

    # subword preprocessing of the input file
    if (args.mode == 'word_and_context' and args.context_unit == 'char') or (
            args.mode == 'sentence_to_sentence' and args.word_unit == 'char'):
        # uses subword-nmt to segment text into chargrams
        from types import SimpleNamespace
        import numpy as np
        sys.path.append(
            os.path.join(os.path.dirname(__file__), '..', 'subword-nmt'))
        from subword_nmt.segment_char_ngrams import segment_char_ngrams

        def segment(col):
            # Segment the non-null values of one column and write the
            # segmented lines back into that column.
            subword_nmt_output = StringIO()
            segment_char_ngrams(
                SimpleNamespace(input=infile_df[col].dropna().astype(str),
                                vocab={},
                                n=args.char_n_gram_mode,
                                output=subword_nmt_output,
                                separator=args.subword_separator))
            subword_nmt_output.seek(0)
            infile_df.loc[infile_df[col].notnull(), [col]] = np.array([
                line.rstrip(' \t\n\r') for line in subword_nmt_output
            ])[:, np.newaxis]
            subword_nmt_output.truncate(0)

        segment("word")
        if args.mode == 'sentence_to_sentence' and args.word_unit == 'char':
            segment("lemma")
    elif (args.mode == 'word_and_context' and args.context_unit == 'bpe') or (
            args.mode == 'sentence_to_sentence' and args.word_unit == 'bpe'):
        if args.bpe_codes_path:
            bpe_codes_file_path = args.bpe_codes_path
        elif full_transform_folder_path:
            bpe_codes_file_path = os.path.join(full_transform_folder_path,
                                               "bpe_codes")
        else:
            raise ValueError(
                "Specify transformation output folder or bpe output file path in order to export BPE codes."
            )
        # BPE processing
        sys.path.append(
            os.path.join(os.path.dirname(__file__), '..', 'subword-nmt'))
        from subword_nmt.apply_bpe import BPE
        # only learn BPEs if bpe_codes file is unavailable
        if not os.path.isfile(bpe_codes_file_path):
            # as advised in subword-nmt's readme, we learn BPE jointly on the sources and targets
            # because they share an alphabet (for the most part)
            from subword_nmt.learn_bpe import learn_bpe
            bpe_codes = open(bpe_codes_file_path, "w", encoding='utf-8')
            learn_bpe(
                infile_df[["word", "lemma"]].dropna().astype(str).to_string(
                    index=False, header=False).splitlines(), bpe_codes,
                args.bpe_operations)
            bpe_codes.close()
        with open(bpe_codes_file_path, encoding='utf-8') as bpe_codes:
            # apply all merge operations, without vocabulary and glossaries
            bpe = BPE(bpe_codes, -1, args.subword_separator, [], [])
            infile_df.loc[infile_df["word"].notnull(), ["word", "lemma"]] = \
                infile_df.loc[infile_df["word"].notnull(), ["word", "lemma"]].applymap(bpe.process_line)

    # Rows where every column is NaN mark the end of a sentence.
    sentence_indices = pd.isna(infile_df).all(axis=1)
    sentence_end_iterator = (i for i, e in sentence_indices.to_dict().items()
                             if e is True)

    # per-mode specific processing
    if args.mode == 'word_and_context':
        sentence_dfs = []
        transformer_args = {
            'word_unit': args.word_unit,
            'tag_unit': args.tag_unit,
            'context_size': args.context_size,
            'context_char_size': args.context_char_size if hasattr(args, 'context_char_size') else None,
            'context_tags': args.context_tags,
            'tag_first': args.tag_first,
            'left_context_boundary': args.left_context_boundary,
            'tag_boundary': args.tag_boundary,
            'right_context_boundary': args.right_context_boundary,
            'word_boundary': args.word_boundary,
            'example_boundary': args.example_boundary,
            'subword_separator': args.subword_separator
        }
        transformer = Transformer(**transformer_args)
        # Slice the frame into one sub-frame per sentence (.loc slicing is
        # end-inclusive, hence sentence_end - 1).
        sentence_start = 0
        for sentence_end in sentence_end_iterator:
            sentence_dfs.append(infile_df.loc[sentence_start:sentence_end - 1])
            sentence_start = sentence_end + 1
        for sentence_df_idx, sentence_df in enumerate(sentence_dfs):
            # adds additional context according to CONTEXT_SPAN to sentence below
            lc_df = pd.DataFrame()
            rc_df = pd.DataFrame()
            if args.context_span > 0:
                lc_df_ls = sentence_dfs[
                    max(sentence_df_idx - args.context_span, 0):sentence_df_idx]
                if lc_df_ls:
                    lc_df = pd.concat(lc_df_ls)
                # NOTE(review): the upper bound len(sentence_dfs) - 1 keeps
                # the last sentence out of every right context -- confirm
                # this is intended.
                rc_df_ls = sentence_dfs[
                    sentence_df_idx + 1:min(sentence_df_idx + 1 + args.context_span, len(sentence_dfs) - 1)]
                if rc_df_ls:
                    rc_df = pd.concat(rc_df_ls)
            output_source_lines, output_target_lines = transformer.process_sentence(
                sentence_df, lc_df, rc_df)
            if not (output_source_lines or output_target_lines):
                continue
            if args.debug:
                if args.print_file == 'source':
                    print("\n".join(output_source_lines))
                else:
                    print("\n".join(output_target_lines))
            else:
                # Files are reopened in append mode for every sentence.
                with open(output_source_path, 'a+', encoding='utf-8') as outsourcefile, \
                        open(output_target_path, 'a+', encoding='utf-8') as outtargetfile:
                    outsourcefile.write("\n".join(output_source_lines) + "\n")
                    outtargetfile.write("\n".join(output_target_lines) + "\n")
    elif args.mode == 'sentence_to_sentence':
        sentence_start = 0
        # NOTE(review): open_tag / close_tag are only assigned when
        # example_boundary is not None, but are used unconditionally below.
        if args.example_boundary is not None:
            # The closing tag is the opening tag with '/' inserted right
            # after its first '<'.
            pos_close_tag = args.example_boundary.find('<') + 1
            open_tag = args.example_boundary
            close_tag = open_tag[:pos_close_tag] + '/' + open_tag[
                pos_close_tag:]
        for sentence_end in sentence_end_iterator:
            output_source_line = [open_tag]
            output_target_line = [open_tag]
            last_split_pos = 0
            for sentence_idx in range(sentence_start, sentence_end):
                subwords = re.split("\s*{}\s*".format(args.subword_separator),
                                    infile_df.at[sentence_idx, "word"])
                lemma = re.split("\s*{}\s*".format(args.subword_separator),
                                 infile_df.at[sentence_idx, "lemma"])
                tag = infile_df.at[sentence_idx, "tag"]
                # inserts a breaking point at the position before this word+tag were inserted in both the source and the target
                output_source_insertion_point = len(output_source_line)
                output_target_insertion_point = len(output_target_line)
                output_source_line.extend(subwords)
                output_source_line.append(args.word_boundary)
                if not args.tag_first:
                    output_target_line.extend(lemma)
                    output_target_line.append(args.tag_boundary)
                if args.tag_unit == "word":
                    output_target_line.append(tag)
                else:
                    # extend() adds the tag item by item (character by
                    # character when tag is a string).
                    output_target_line.extend(tag)
                if args.tag_first:
                    output_target_line.append(args.tag_boundary)
                    output_target_line.extend(lemma)
                output_target_line.append(args.word_boundary)
                # if the target translation overflows (target sentence is guaranteed to be longer in size)
                # sanity check: awk 'NF > 50 { print NR, NF }' dev_source | wc -l
                last_split_size = len(output_target_line) - last_split_pos
                if hasattr(args, 'sentence_size') and last_split_size > args.sentence_size:
                    output_source_line.insert(output_source_insertion_point,
                                              defaults["SENTENCE_SPLIT_TAG"])
                    output_target_line.insert(output_target_insertion_point,
                                              defaults["SENTENCE_SPLIT_TAG"])
                    # set to 1, for internal slices to account for the opening <w> sentence boundary tag
                    last_split_pos = output_target_insertion_point
            sentence_start = sentence_end + 1
            # Replace the trailing word boundary with the closing tag.
            output_source_line.pop()
            output_target_line.pop()
            output_source_line.append(close_tag)
            output_target_line.append(close_tag)
            assert output_source_line.count(defaults["SENTENCE_SPLIT_TAG"]) == output_target_line.count(defaults["SENTENCE_SPLIT_TAG"]), \
                "Sentence splits in sentence_to_sentence mode are wrong."
            # split sentences if necessary
            split_cond = True
            end_source_line_split_pos = 0
            end_target_line_split_pos = 0
            while split_cond:
                try:
                    start_source_line_pos = end_source_line_split_pos
                    start_target_line_pos = end_target_line_split_pos
                    end_source_line_split_pos = output_source_line.index(
                        defaults["SENTENCE_SPLIT_TAG"],
                        end_source_line_split_pos) + 1
                    end_target_line_split_pos = output_target_line.index(
                        defaults["SENTENCE_SPLIT_TAG"],
                        end_target_line_split_pos) + 1
                except ValueError:
                    # No further split tag: emit the remainder and stop.
                    split_cond = False
                    end_source_line_split_pos = len(output_source_line) + 1
                    end_target_line_split_pos = len(output_target_line) + 1
                # "- 1" leaves the split tag itself out of the slice.
                output_source_line_split = output_source_line[
                    start_source_line_pos:end_source_line_split_pos - 1]
                output_target_line_split = output_target_line[
                    start_target_line_pos:end_target_line_split_pos - 1]
                # Make every piece start with the open tag and end with the
                # close tag.
                if output_source_line_split[-1] == defaults["WORD_BOUNDARY"]:
                    output_source_line_split[-1] = close_tag
                if output_source_line_split[0] != open_tag:
                    output_source_line_split.insert(0, open_tag)
                if output_target_line_split[-1] == defaults["WORD_BOUNDARY"]:
                    output_target_line_split[-1] = close_tag
                if output_target_line_split[0] != open_tag:
                    output_target_line_split.insert(0, open_tag)
                if args.debug:
                    if args.print_file == 'source':
                        print(" ".join(output_source_line_split))
                    else:
                        print(" ".join(output_target_line_split))
                    print("\n")
                else:
                    with open(output_source_path, 'a+', encoding='utf-8') as outsourcefile, \
                            open(output_target_path, 'a+', encoding='utf-8') as outtargetfile:
                        outsourcefile.write(
                            " ".join(output_source_line_split) + "\n")
                        outtargetfile.write(
                            " ".join(output_target_line_split) + "\n")
def main(args):
    """Train a subword model on the given corpora and save its vocabulary.

    The model family is selected by ``args.model``: ``'spm'``,
    ``'subword_nmt'``, ``'yttm'``, ``'hf_bpe'``, ``'hf_bytebpe'`` or
    ``'hf_wordpiece'``. Artifacts are stored under the prefix
    ``os.path.join(args.save_dir, args.model)``; the vocabulary is always
    written to ``<prefix>.vocab``.

    Parameters
    ----------
    args
        Parsed command-line namespace. Attributes read here: ``corpus``
        (list of paths), ``save_dir`` (defaults to ``args.model`` when
        ``None``), ``model``, ``vocab_size``, ``disable_unk``,
        ``disable_bos``, ``disable_eos``, ``disable_pad``,
        ``custom_special_tokens`` (``'key=value'`` strings), and, depending
        on the model, ``coverage``, ``input_sentence_size``, ``n_threads``,
        ``lowercase``, ``bert_normalizer`` and ``split_punctuation``.

    Raises
    ------
    ValueError
        If a corpus path does not exist, a custom special token is not of
        the form ``key=value``, custom special tokens are combined with the
        ``yttm`` model, or (for ``spm``/``yttm``) the unk token was disabled
        and therefore cannot be located.
    NotImplementedError
        If ``args.model`` is not one of the supported models.
    """
    corpus_path_list = args.corpus
    if args.save_dir is None:
        args.save_dir = args.model
    for corpus_path in corpus_path_list:
        if not os.path.exists(corpus_path):
            raise ValueError(
                'The path="{}" provided by --corpus does not exist!'.format(
                    corpus_path))
    print('Learn the "{}"s subword model based on {}.'.format(
        args.model, args.corpus))
    os.makedirs(args.save_dir, exist_ok=True)
    model_prefix = os.path.join(args.save_dir, args.model)
    print('Save the subword model to {}.model'.format(model_prefix))
    print('Save the vocabulary to {}.vocab'.format(model_prefix))
    print()
    print('------- Start Training -------------')

    # Insertion order matters: the position of each key is later used as the
    # token id handed to the trainers.
    special_tokens_kv = OrderedDict()
    if not args.disable_unk:
        special_tokens_kv['unk_token'] = Vocab.UNK_TOKEN
    if not args.disable_bos:
        special_tokens_kv['bos_token'] = Vocab.BOS_TOKEN
    if not args.disable_eos:
        special_tokens_kv['eos_token'] = Vocab.EOS_TOKEN
    if not args.disable_pad:
        special_tokens_kv['pad_token'] = Vocab.PAD_TOKEN

    # split custom special tokens
    if args.model == 'yttm' and len(args.custom_special_tokens) > 0:
        raise ValueError(
            'model {} do not support custom_special_tokens'.format(args.model))
    additional_custom_special_token = OrderedDict()
    for custom_special_token in args.custom_special_tokens:
        kv = custom_special_token.split('=')
        if len(kv) != 2:
            raise ValueError(
                'parameter {} has wrong format'.format(custom_special_token))
        k, v = kv
        if k in special_tokens_kv:
            warnings.warn(
                f'There are overlaps between the custom special tokens and the'
                f' unk, bos, eos, pad tokens. Currently, we will overwrite the '
                f'default tokens. We will overwrite "{k}" to "{v}"')
        special_tokens_kv[k] = v
        additional_custom_special_token[k] = v

    if args.model == 'hf_wordpiece':
        tokenizers = try_import_huggingface_tokenizers()
        if special_tokens_kv.get('unk_token') != '[UNK]':
            # TODO, HF Tokenizer must have the unk token.
            special_tokens_kv['unk_token'] = '[UNK]'
        if parse_version(tokenizers.__version__) < parse_version('0.8'):
            # The older version of Tokenizers
            # hf_wordpiece must contain mask, cls and sep tokens
            # the custom defined mask,cls,sep can overwrite the default settings
            if 'mask_token' not in special_tokens_kv:
                special_tokens_kv['mask_token'] = Vocab.MASK_TOKEN
            if 'cls_token' not in special_tokens_kv:
                special_tokens_kv['cls_token'] = Vocab.CLS_TOKEN
            if 'sep_token' not in special_tokens_kv:
                special_tokens_kv['sep_token'] = Vocab.SEP_TOKEN

    special_tokens = list(special_tokens_kv.values())
    print('special tokens: ' + ', '.join(special_tokens))
    vocab = []

    if args.model == 'spm':
        try_import_sentencepiece()
        import sentencepiece as spm
        corpus_path = ','.join(corpus_path_list)
        token_keys = list(special_tokens_kv)
        script = ('--input={} --model_prefix={} --vocab_size={}'
                  ' --character_coverage={} --input_sentence_size={}').format(
                      corpus_path, model_prefix, args.vocab_size,
                      args.coverage, args.input_sentence_size)
        # Raises ValueError when the unk token has been disabled.
        script += ' --unk_id=' + str(token_keys.index('unk_token'))
        script += ' --bos_id=' + (
            '-1' if args.disable_bos else str(token_keys.index('bos_token')))
        script += ' --eos_id=' + (
            '-1' if args.disable_eos else str(token_keys.index('eos_token')))
        script += ' --pad_id=' + (
            '-1' if args.disable_pad else str(token_keys.index('pad_token')))
        if len(additional_custom_special_token) > 0:
            script += (' --control_symbols='
                       + ','.join(additional_custom_special_token.values()))
        print(script)
        spm.SentencePieceTrainer.Train(script)
        # After training, record '<s>' / '</s>' as the bos / eos tokens that
        # are handed to Vocab below.
        if 'bos_token' in special_tokens_kv:
            special_tokens_kv['bos_token'] = '<s>'
        if 'eos_token' in special_tokens_kv:
            special_tokens_kv['eos_token'] = '</s>'
        # build spm vocab
        spm_model = spm.SentencePieceProcessor()
        spm_model.load(model_prefix + '.model')
        vocab = [spm_model.id_to_piece(i) for i in range(len(spm_model))]
        # The trainer's own .vocab file is replaced by our Vocab file below.
        os.remove(model_prefix + '.vocab')
    elif args.model == 'subword_nmt':
        try_import_subword_nmt()
        from subword_nmt import learn_bpe
        corpus_path = cat_corpus(corpus_path_list) \
            if len(corpus_path_list) > 1 else corpus_path_list[0]
        # build model
        with open(corpus_path, 'r', encoding='utf-8') as fc, \
                open(model_prefix + '.model', 'w', encoding='utf-8') as fm:
            learn_bpe.learn_bpe(fc, fm, args.vocab_size - len(special_tokens),
                                total_symbols=True)
        # build vocab
        with open(corpus_path, 'r', encoding='utf-8') as fc, \
                open(model_prefix + '.model', 'r', encoding='utf-8') as fm:
            vocab.extend(special_tokens)
            uniq_chars_internal = set()
            uniq_chars_final = set()
            uniq_words = set()
            for line in fc:
                for word in line.strip('\r\n ').split(' '):
                    if word:
                        uniq_words.add(word)
            # this code piece is same as
            # https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/learn_bpe.py shows
            uniq_words = [
                tuple(x[:-1]) + (x[-1] + '</w>', ) for x in uniq_words
            ]
            for word in uniq_words:
                uniq_chars_internal.update(word[:-1])
                uniq_chars_final.add(word[-1])
            # sort to ensure the same settings produce the same vocab
            vocab.extend(sorted(uniq_chars_internal))
            vocab.extend(sorted(uniq_chars_final))
            # The first line of the model file is skipped; every following
            # line is a merge pair whose first space is removed to form the
            # merged symbol.
            fm.readline()
            for pair in fm:
                vocab.append(pair.replace(' ', '', 1).strip())
        if len(corpus_path_list) > 1:
            os.remove(corpus_path)
    elif args.model == 'yttm':
        try_import_yttm()
        import youtokentome as yttm
        corpus_path = cat_corpus(corpus_path_list) \
            if len(corpus_path_list) > 1 else corpus_path_list[0]
        tokenizer = yttm.BPE.train(
            data=corpus_path,
            model=model_prefix + '.model',
            vocab_size=args.vocab_size,
            coverage=args.coverage,
            n_threads=args.n_threads,
            unk_id=special_tokens.index(Vocab.UNK_TOKEN),
            bos_id=-1 if args.disable_bos
            else special_tokens.index(Vocab.BOS_TOKEN),
            eos_id=-1 if args.disable_eos
            else special_tokens.index(Vocab.EOS_TOKEN),
            pad_id=-1 if args.disable_pad
            else special_tokens.index(Vocab.PAD_TOKEN))
        vocab = tokenizer.vocab()
        # After training, record these upper-case spellings as the special
        # tokens that are handed to Vocab below.
        if 'unk_token' in special_tokens_kv:
            special_tokens_kv['unk_token'] = '<UNK>'
        if 'bos_token' in special_tokens_kv:
            special_tokens_kv['bos_token'] = '<BOS>'
        if 'eos_token' in special_tokens_kv:
            special_tokens_kv['eos_token'] = '<EOS>'
        if 'pad_token' in special_tokens_kv:
            special_tokens_kv['pad_token'] = '<PAD>'
        if len(corpus_path_list) > 1:
            os.remove(corpus_path)
    elif args.model in ['hf_bpe', 'hf_bytebpe', 'hf_wordpiece']:
        tokenizers = try_import_huggingface_tokenizers()
        if args.model == 'hf_bpe':
            split_on_whitespace_only = not args.split_punctuation
            tokenizer = tokenizers.CharBPETokenizer(
                lowercase=args.lowercase,
                bert_normalizer=args.bert_normalizer,
                split_on_whitespace_only=split_on_whitespace_only)
        elif args.model == 'hf_bytebpe':
            tokenizer = tokenizers.ByteLevelBPETokenizer(
                lowercase=args.lowercase)
        elif args.model == 'hf_wordpiece':
            unk_token = special_tokens_kv.get('unk_token', None)
            sep_token = special_tokens_kv.get('sep_token', None)
            cls_token = special_tokens_kv.get('cls_token', None)
            pad_token = special_tokens_kv.get('pad_token', None)
            mask_token = special_tokens_kv.get('mask_token', None)
            if args.bert_normalizer:
                strip_accents = None
                clean_text = True
                handle_chinese_chars = True
            else:
                strip_accents = False
                clean_text = False
                handle_chinese_chars = False
            tokenizer = tokenizers.BertWordPieceTokenizer(
                unk_token=unk_token,
                sep_token=sep_token,
                cls_token=cls_token,
                pad_token=pad_token,
                mask_token=mask_token,
                lowercase=args.lowercase,
                strip_accents=strip_accents,
                handle_chinese_chars=handle_chinese_chars,
                clean_text=clean_text)
        else:
            raise NotImplementedError
        tokenizer.train(corpus_path_list,
                        vocab_size=args.vocab_size,
                        show_progress=True,
                        special_tokens=special_tokens)
        # Deal with the API change of tokenizers >= 0.8
        if version.parse(tokenizers.__version__) >= version.parse('0.8'):
            save_model_path = model_prefix + '.model'
            tokenizer.save(save_model_path)
            # Use a context manager so the model file is closed promptly.
            with open(save_model_path, encoding='utf-8') as f_model:
                model_info = json.load(f_model)
            special_tokens_in_tokenizer = model_info['added_tokens']
            assert len(special_tokens_in_tokenizer) == len(special_tokens)
            hf_vocab = model_info['model']['vocab']
            hf_vocab_sorted = sorted(hf_vocab.items(), key=lambda x: x[1])
            hf_vocab_ids = [ele[1] for ele in hf_vocab_sorted]
            # The ids must form the contiguous range 0..len-1.
            assert min(hf_vocab_ids) == 0 and max(
                hf_vocab_ids) == len(hf_vocab_ids) - 1
            vocab = [ele[0] for ele in hf_vocab_sorted]
        else:
            tokenizer.save(args.save_dir, args.model)
            # we replace the huggingface vocab file with our Vocab implementation
            if args.model == 'hf_wordpiece':
                hf_vocab_file = model_prefix + '-vocab.txt'
                with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
                    for line in fv:
                        vocab.append(line.strip())
            else:
                # Move the hf_${model}-merges.txt to hf_${model}.models
                os.rename(
                    os.path.join(args.save_dir,
                                 '{}-merges.txt'.format(args.model)),
                    os.path.join(args.save_dir,
                                 '{}.model'.format(args.model)))
                hf_vocab_file = model_prefix + '-vocab.json'
                with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
                    vocab_kv = json.load(fv)
                vocab_kv = sorted(vocab_kv.items(), key=lambda x: x[1])
                for kv in vocab_kv:
                    vocab.append(kv[0])
            os.remove(hf_vocab_file)
    else:
        raise NotImplementedError
    vocab_obj = Vocab(vocab, **special_tokens_kv)
    vocab_obj.save(model_prefix + '.vocab')
    print('-------- Done Training -------------')
def create_corpus(src, trg, en_test_sents, filedir, lc=False, seed=42,
                  bpe_size=10000, dev_size=1000, test_dir="test"):
    """Create a BPE-preprocessed corpus with random train/dev splits.

    Downloads the JW300 ``src``-``trg`` pair via ``opustools_pkg``, removes
    sentence pairs whose source side occurs in ``en_test_sents``, drops
    duplicate and conflicting translations, shuffles, splits off a dev set,
    learns one BPE model per language on the training split and applies it to
    the train, dev and (pre-existing) test files.

    :param src: source language code, used in file names and for the download.
    :param trg: target language code, used in file names and for the download.
    :param en_test_sents: container queried with ``in``; stripped source lines
        found in it are excluded from the corpus. NOTE(review): a ``set`` keeps
        this lookup fast -- confirm what callers pass.
    :param filedir: directory for the raw, split and BPE-encoded files.
    :param lc: if true, lower-case both sides before splitting.
    :param seed: random state used for shuffling.
    :param bpe_size: number of BPE symbols passed to ``learn_bpe``.
    :param dev_size: number of sentence pairs taken (from the tail of the
        shuffled data) as the dev set.
    :param test_dir: directory holding the existing ``test.<src>-<trg>.<side>``
        files; their BPE-encoded versions are written to ``filedir``.
    """
    source_file = "{}/jw300.{}-{}.{}".format(filedir, src, trg, src)
    target_file = "{}/jw300.{}-{}.{}".format(filedir, src, trg, trg)

    # download
    opus_reader = opustools_pkg.OpusRead(directory="JW300",
                                         source=src,
                                         target=trg,
                                         write_mode="moses",
                                         write=[source_file, target_file],
                                         suppress_prompts=True)
    opus_reader.printPairs()

    # unzip; wait for the child so it is reaped instead of left dangling.
    # A non-zero exit status is tolerated, as before.
    subprocess.run(
        ['gunzip', 'JW300_latest_xml_{}-{}.xml.gz'.format(src, trg)],
        check=False)

    # TMX file to dataframe
    source = []
    skip_lines = set()  # set: O(1) membership test for every target line
    with open(source_file, encoding='utf-8') as f:
        for i, line in enumerate(f):
            sentence = line.strip()
            # skip sentences that are contained in the test set
            if sentence not in en_test_sents:
                source.append(sentence)
            else:
                skip_lines.add(i)
    with open(target_file, encoding='utf-8') as f:
        # only add to corpus if corresponding source was not skipped
        target = [line.strip() for j, line in enumerate(f)
                  if j not in skip_lines]
    print(
        'Loaded data and skipped {} lines since contained in test set.'.format(
            len(skip_lines)))

    df = pd.DataFrame(zip(source, target),
                      columns=['source_sentence', 'target_sentence'])
    # drop duplicate translations
    df_pp = df.drop_duplicates()
    # drop conflicting translations (reassign instead of mutating a derived
    # frame in place, which triggers pandas' chained-assignment warning)
    df_pp = df_pp.drop_duplicates(subset='source_sentence')
    df_pp = df_pp.drop_duplicates(subset='target_sentence')
    # shuffle
    df_pp = df_pp.sample(frac=1, random_state=seed).reset_index(drop=True)

    # do the split between dev/train and create parallel corpora;
    # test data is loaded from file
    num_dev_patterns = dev_size

    # Lower case the corpora
    if lc:
        df_pp["source_sentence"] = df_pp["source_sentence"].str.lower()
        df_pp["target_sentence"] = df_pp["target_sentence"].str.lower()

    dev = df_pp.tail(num_dev_patterns)
    stripped = df_pp.drop(dev.index)

    train_src_file = "{}/train.{}-{}.{}".format(filedir, src, trg, src)
    train_trg_file = "{}/train.{}-{}.{}".format(filedir, src, trg, trg)
    dev_src_file = "{}/dev.{}-{}.{}".format(filedir, src, trg, src)
    dev_trg_file = "{}/dev.{}-{}.{}".format(filedir, src, trg, trg)

    def write_column(frame, column, path):
        # One sentence per line, with quoting disabled.
        frame[[column]].to_csv(path, header=False, index=False, quotechar="",
                               quoting=csv.QUOTE_NONE, escapechar="\\",
                               sep="§")

    write_column(stripped, "source_sentence", train_src_file)
    write_column(stripped, "target_sentence", train_trg_file)
    write_column(dev, "source_sentence", dev_src_file)
    write_column(dev, "target_sentence", dev_trg_file)

    # train bpe (separately for src and trg)
    src_bpe_file = "{}/{}-{}.{}.bpe".format(filedir, src, trg, src)
    trg_bpe_file = "{}/{}-{}.{}.bpe".format(filedir, src, trg, trg)
    for train_file, bpe_file in ((train_src_file, src_bpe_file),
                                 (train_trg_file, trg_bpe_file)):
        # Close (and thereby flush) the codes file before it is read back.
        with codecs.open(train_file, encoding='utf-8') as corpus, \
                codecs.open(bpe_file, "w", encoding='utf-8') as codes_out:
            learn_bpe.learn_bpe(corpus, codes_out, bpe_size)

    # apply bpe
    def bpe_process(inp, outp, codes):
        with codecs.open(codes, encoding='utf-8') as codes_file, \
                codecs.open(inp, encoding='utf-8') as fin, \
                codecs.open(outp, "w", encoding='utf-8') as fout:
            bpe = apply_bpe.BPE(codes_file)
            for line in fin:
                fout.write(bpe.process_line(line))

    # train/dev inputs live in filedir, test inputs in test_dir; all encoded
    # outputs are written to filedir.
    for split, in_dir in (("train", filedir), ("dev", filedir),
                          ("test", test_dir)):
        for side in [src, trg]:
            bpe_process(
                "{}/{}.{}-{}.{}".format(in_dir, split, src, trg, side),
                "{}/{}.{}-{}.bpe.{}".format(filedir, split, src, trg, side),
                "{}/{}-{}.{}.bpe".format(filedir, src, trg, side))
def main(args):
    """Train a subword model on the given corpora and save its vocabulary.

    The model family is selected by ``args.model``: ``'spm'``,
    ``'subword_nmt'``, ``'yttm'``, ``'hf_bpe'``, ``'hf_bytebpe'`` or
    ``'hf_wordpiece'``. Artifacts are stored under the prefix
    ``os.path.join(args.save_dir, args.model)``; the vocabulary is always
    written to ``<prefix>.vocab``.

    Parameters
    ----------
    args
        Parsed command-line namespace. Attributes read here: ``corpus``
        (list of paths), ``save_dir``, ``model``, ``vocab_size``,
        ``disable_bos``, ``disable_eos``, ``disable_pad``,
        ``custom_special_tokens`` (``'key=value'`` strings), and, depending
        on the model, ``coverage``, ``input_sentence_size``, ``n_threads``,
        ``lowercase`` and ``strip_accents``.

    Raises
    ------
    ValueError
        If a custom special token is not of the form ``key=value``, reuses an
        already defined key, or is combined with the ``yttm`` model.
    NotImplementedError
        If ``args.model`` is not one of the supported models.
    """
    corpus_path_list = args.corpus
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.save_dir, exist_ok=True)
    model_prefix = os.path.join(args.save_dir, args.model)

    special_tokens_kv = OrderedDict()
    # unk is always required
    special_tokens_kv['unk_token'] = Vocab.UNK_TOKEN
    if not args.disable_bos:
        special_tokens_kv['bos_token'] = Vocab.BOS_TOKEN
    if not args.disable_eos:
        special_tokens_kv['eos_token'] = Vocab.EOS_TOKEN
    if not args.disable_pad:
        special_tokens_kv['pad_token'] = Vocab.PAD_TOKEN

    # split custom special tokens
    if args.model == 'yttm' and len(args.custom_special_tokens) > 0:
        raise ValueError(
            'model {} do not support custom_special_tokens'.format(args.model))
    # Values of the user-supplied tokens, in order. Tracked explicitly so
    # the spm branch does not have to guess where they start inside
    # `special_tokens`.
    custom_token_values = []
    for custom_special_token in args.custom_special_tokens:
        kv = custom_special_token.split('=')
        if len(kv) != 2:
            raise ValueError(
                'parameter {} has wrong format'.format(custom_special_token))
        k, v = kv
        if k in special_tokens_kv:
            raise ValueError(
                'There are overlaps between the custom special tokens and the'
                ' unk, bos, eos, pad tokens')
        special_tokens_kv[k] = v
        custom_token_values.append(v)

    # hf_wordpiece must contain mask, cls and sep tokens
    # the custom defined mask,cls,sep can overwrite the default settings
    if args.model == 'hf_wordpiece':
        if 'mask_token' not in special_tokens_kv:
            special_tokens_kv['mask_token'] = Vocab.MASK_TOKEN
        if 'cls_token' not in special_tokens_kv:
            special_tokens_kv['cls_token'] = Vocab.CLS_TOKEN
        if 'sep_token' not in special_tokens_kv:
            special_tokens_kv['sep_token'] = Vocab.SEP_TOKEN

    special_tokens = list(special_tokens_kv.values())
    print('special tokens: ' + ', '.join(special_tokens))
    vocab = []

    if args.model == 'spm':
        try_import_sentencepiece()
        import sentencepiece as spm
        corpus_path = ','.join(corpus_path_list)
        script = ('--input={} --model_prefix={} --vocab_size={}'
                  ' --character_coverage={} --input_sentence_size={}').format(
                      corpus_path, model_prefix, args.vocab_size,
                      args.coverage, args.input_sentence_size)
        script += ' --unk_id=' + str(special_tokens.index(Vocab.UNK_TOKEN))
        script += ' --bos_id=' + (
            '-1' if args.disable_bos
            else str(special_tokens.index(Vocab.BOS_TOKEN)))
        script += ' --eos_id=' + (
            '-1' if args.disable_eos
            else str(special_tokens.index(Vocab.EOS_TOKEN)))
        script += ' --pad_id=' + (
            '-1' if args.disable_pad
            else str(special_tokens.index(Vocab.PAD_TOKEN)))
        if custom_token_values:
            # Previously the custom tokens were taken as
            # special_tokens[script.count('_id'):]. That count is always >= 4
            # (disabled ids are still emitted as "-1") and grows if a path
            # contains "_id", so custom tokens were dropped whenever
            # bos/eos/pad was disabled.
            script += ' --control_symbols=' + ','.join(custom_token_values)
        print(script)
        spm.SentencePieceTrainer.Train(script)
        # After training, record '<s>' / '</s>' as the bos / eos tokens that
        # are handed to Vocab below.
        if 'bos_token' in special_tokens_kv:
            special_tokens_kv['bos_token'] = '<s>'
        if 'eos_token' in special_tokens_kv:
            special_tokens_kv['eos_token'] = '</s>'
        # build spm vocab
        spm_model = spm.SentencePieceProcessor()
        spm_model.load(model_prefix + '.model')
        vocab = [spm_model.id_to_piece(i) for i in range(len(spm_model))]
        # The trainer's own .vocab file is replaced by our Vocab file below.
        os.remove(model_prefix + '.vocab')
    elif args.model == 'subword_nmt':
        try_import_subword_nmt()
        from subword_nmt import learn_bpe
        corpus_path = cat_corpus(corpus_path_list) \
            if len(corpus_path_list) > 1 else corpus_path_list[0]
        # build model
        with open(corpus_path, 'r', encoding='utf-8') as fc, \
                open(model_prefix + '.model', 'w', encoding='utf-8') as fm:
            learn_bpe.learn_bpe(fc, fm, args.vocab_size - len(special_tokens),
                                total_symbols=True)
        # build vocab
        with open(corpus_path, 'r', encoding='utf-8') as fc, \
                open(model_prefix + '.model', 'r', encoding='utf-8') as fm:
            vocab.extend(special_tokens)
            uniq_chars_internal = set()
            uniq_chars_final = set()
            uniq_words = set()
            for line in fc:
                for word in line.strip('\r\n ').split(' '):
                    if word:
                        uniq_words.add(word)
            # this code piece is same as
            # https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/learn_bpe.py shows
            uniq_words = [
                tuple(x[:-1]) + (x[-1] + '</w>', ) for x in uniq_words
            ]
            for word in uniq_words:
                uniq_chars_internal.update(word[:-1])
                uniq_chars_final.add(word[-1])
            # sort to ensure the same settings produce the same vocab
            vocab.extend(sorted(uniq_chars_internal))
            vocab.extend(sorted(uniq_chars_final))
            # The first line of the model file is skipped; every following
            # line is a merge pair whose first space is removed to form the
            # merged symbol.
            fm.readline()
            for pair in fm:
                vocab.append(pair.replace(' ', '', 1).strip())
        if len(corpus_path_list) > 1:
            os.remove(corpus_path)
    elif args.model == 'yttm':
        try_import_yttm()
        import youtokentome as yttm
        corpus_path = cat_corpus(corpus_path_list) \
            if len(corpus_path_list) > 1 else corpus_path_list[0]
        tokenizer = yttm.BPE.train(
            data=corpus_path,
            model=model_prefix + '.model',
            vocab_size=args.vocab_size,
            coverage=args.coverage,
            n_threads=args.n_threads,
            unk_id=special_tokens.index(Vocab.UNK_TOKEN),
            bos_id=-1 if args.disable_bos
            else special_tokens.index(Vocab.BOS_TOKEN),
            eos_id=-1 if args.disable_eos
            else special_tokens.index(Vocab.EOS_TOKEN),
            pad_id=-1 if args.disable_pad
            else special_tokens.index(Vocab.PAD_TOKEN))
        vocab = tokenizer.vocab()
        # After training, record these upper-case spellings as the special
        # tokens that are handed to Vocab below.
        if 'unk_token' in special_tokens_kv:
            special_tokens_kv['unk_token'] = '<UNK>'
        if 'bos_token' in special_tokens_kv:
            special_tokens_kv['bos_token'] = '<BOS>'
        if 'eos_token' in special_tokens_kv:
            special_tokens_kv['eos_token'] = '<EOS>'
        if 'pad_token' in special_tokens_kv:
            special_tokens_kv['pad_token'] = '<PAD>'
        if len(corpus_path_list) > 1:
            os.remove(corpus_path)
    elif args.model in ['hf_bpe', 'hf_bytebpe', 'hf_wordpiece']:
        tokenizers = try_import_huggingface_tokenizers()
        if args.model == 'hf_bpe':
            tokenizer = tokenizers.CharBPETokenizer(lowercase=args.lowercase)
        elif args.model == 'hf_bytebpe':
            tokenizer = tokenizers.ByteLevelBPETokenizer(
                lowercase=args.lowercase)
        elif args.model == 'hf_wordpiece':
            tokenizer = tokenizers.BertWordPieceTokenizer(
                lowercase=args.lowercase,
                strip_accents=args.strip_accents)
        else:
            raise NotImplementedError
        tokenizer.train(corpus_path_list,
                        vocab_size=args.vocab_size,
                        show_progress=True,
                        special_tokens=special_tokens)
        tokenizer.save(args.save_dir, args.model)
        # we replace the huggingface vocab file with our Vocab implementation
        if args.model == 'hf_wordpiece':
            hf_vocab_file = model_prefix + '-vocab.txt'
            with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
                for line in fv:
                    vocab.append(line.strip())
        else:
            # Move the hf_${model}-merges.txt to hf_${model}.models
            os.rename(
                os.path.join(args.save_dir,
                             '{}-merges.txt'.format(args.model)),
                os.path.join(args.save_dir, '{}.model'.format(args.model)))
            hf_vocab_file = model_prefix + '-vocab.json'
            with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
                vocab_kv = json.load(fv)
            vocab_kv = sorted(vocab_kv.items(), key=lambda x: x[1])
            for kv in vocab_kv:
                vocab.append(kv[0])
        os.remove(hf_vocab_file)
    else:
        raise NotImplementedError
    unk_token = special_tokens_kv.pop('unk_token')
    vocab_obj = Vocab(vocab, unk_token=unk_token, **special_tokens_kv)
    vocab_obj.save(model_prefix + '.vocab')
def main():
    """Build BERT-style and GPT-2-style vocabularies from a TSV text file.

    Reads the first column of ``options.input``, trains a SentencePiece model
    on it and writes into the output directory (``options.out_dir`` or
    ``data/<basename>/``):

    * ``<basename>.tsv`` -- the first column of the input,
    * ``spiece.model`` / ``spiece.vocab`` -- SentencePiece trainer output,
    * ``vocab.txt`` -- BERT-style vocabulary padded with ``unused<i>`` entries
      up to the vocabulary size,
    * ``vocab.json`` and ``merges.txt`` -- GPT-2-style vocabulary and BPE
      merges learned on the SentencePiece-encoded text.
    """
    options = parse_args()
    torch.manual_seed(options.seed)
    basename = os.path.splitext(os.path.basename(options.input))[0]
    out_dir = options.out_dir or "data/{}/".format(basename)
    spinner = Halo(spinner="dots", placement="right")

    # Tab-separated, no quoting at all. `quotechar=None` is the portable way
    # to say "no quote character"; an empty string is rejected by the csv
    # module on Python 3.11+.
    tsv_format = dict(delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=None)

    with open(options.input, "r", encoding="utf8", newline="") as fd:
        reader = csv.reader(fd, **tsv_format)
        # Blank lines yield empty rows; skip them instead of failing on row[0].
        lines = [[row[0]] for row in reader if row]

    os.makedirs(out_dir, exist_ok=True)

    output_full = os.path.join(out_dir, "{}.tsv".format(basename))
    with open(output_full, "w", encoding="utf8", newline="") as fd:
        csv.writer(fd, **tsv_format).writerows(lines)

    vocab_size = 32000
    spiece_out = os.path.join(out_dir, "spiece")
    spiece_args = (
        "--input={} "
        "--model_prefix={} "
        "--vocab_size={} "
        "--character_coverage=1.0"
    ).format(output_full, spiece_out, vocab_size)
    SentencePieceTrainer.Train(spiece_args)

    # Load the generated vocabulary
    with open("{}.vocab".format(spiece_out), "r", encoding="utf8",
              newline="") as fd:
        reader = csv.reader(fd, **tsv_format)
        vocab = [row[0] for row in reader]
    # Remove the special tokens <unk>, <s>, </s>
    vocab = vocab[3:]

    # Convert to BERT style
    bert_vocab = [
        v[1:] if v.startswith("▁") else "##{}".format(v)
        for v in vocab
        if v != "▁"
    ]
    # Add BERT's special tokens to the beginning
    bert_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + bert_vocab
    # Fill up with unused tokens
    pad_size = vocab_size - len(bert_vocab)
    bert_vocab += ["unused{}".format(i) for i in range(pad_size)]
    with open(os.path.join(out_dir, "vocab.txt"), "w", encoding="utf8",
              newline="") as fd:
        csv.writer(fd, **tsv_format).writerows([[b] for b in bert_vocab])

    # Convert to GPT-2 style
    # Unfortunately it's slow and tedious.
    spinner.start(text="Generating BPE vocabulary")
    try:
        gpt2_vocab = [
            "Ġ{}".format(v[1:]) if v.startswith("▁") else v for v in vocab
        ]
        # Add the GPT-2 special token to the end
        gpt2_vocab.append("<|endoftext|>")
        with open(os.path.join(out_dir, "vocab.json"), "w",
                  encoding="utf8") as fd:
            json.dump({v: i for i, v in enumerate(gpt2_vocab)}, fd,
                      ensure_ascii=False)

        spiece_processor = SentencePieceProcessor()
        spiece_processor.Load("{}.model".format(spiece_out))
        # Encode the whole text
        encoded = [
            [" ".join(spiece_processor.EncodeAsPieces(row[0]))
             .replace("▁", "Ġ")]
            for row in lines
        ]

        tmp_encoded_fd, tmp_encoded_path = tempfile.mkstemp()
        tmp_bpe_fd, tmp_bpe_path = tempfile.mkstemp()
        # Only the path of the second temp file is used; close its descriptor
        # right away so it is not leaked.
        os.close(tmp_bpe_fd)
        try:
            # Write the encoded text to a temporary file.
            with os.fdopen(tmp_encoded_fd, "w", encoding="utf8",
                           newline="") as fd:
                csv.writer(fd, **tsv_format).writerows(encoded)

            # Close (and thereby flush) the merges file before reading it.
            with open(tmp_encoded_path, "r", encoding="utf8") as bpe_in, \
                    open(tmp_bpe_path, "w", encoding="utf8") as bpe_out:
                learn_bpe(bpe_in, bpe_out, num_symbols=vocab_size)

            with open(tmp_bpe_path, "r", encoding="utf8", newline="") as fd:
                reader = csv.reader(fd, **tsv_format)
                seen = set()
                merges = []
                for row in reader:
                    if not row:
                        continue
                    # Get rid of the </w> tokens
                    merge = row[0].replace("</w>", "")
                    # Remove duplicates (due to </w> tokens)
                    if merge not in seen:
                        seen.add(merge)
                        merges.append([merge])

            with open(os.path.join(out_dir, "merges.txt"), "w",
                      encoding="utf8", newline="") as fd:
                csv.writer(fd, **tsv_format).writerows(merges)
        finally:
            os.remove(tmp_encoded_path)
            os.remove(tmp_bpe_path)
    finally:
        # Always stop the spinner, even when BPE generation fails.
        spinner.stop()