import multiprocessing
import os
import shutil
from itertools import islice

# SubwordDictionary, options and data_utils are project/fairseq imports
# assumed to be available from the surrounding package.


def bpe_create(self):
    os.makedirs(self.args.output_path, exist_ok=True)
    self.state.vocab = os.path.join(self.args.output_path, 'model.vcb')

    # Copy existing model if specified
    if self.args.vocabulary_path is not None:
        shutil.copyfile(self.args.vocabulary_path, self.state.vocab)
        return

    # Create custom tokens list: ${DNT0}..${DNT9} placeholders, plus one
    # language tag per target language in the multilingual case
    custom_tokens = [('${DNT%d}' % i) for i in range(10)]
    if len(self._target_langs) > 1:
        custom_tokens = [SubwordDictionary.language_tag(l)
                         for l in self._target_langs] + custom_tokens

    # Collect all training files
    all_files = []
    for src_lang, tgt_lang in self._mono_pairs:
        lang_dir = '%s__%s' % (src_lang, tgt_lang)
        train_path = os.path.join(self.state.tokenized_corpora, lang_dir, 'train')
        dev_path = os.path.join(self.state.tokenized_corpora, lang_dir, 'dev')

        all_src, all_tgt = collect_parallel_files(src_lang, tgt_lang,
                                                  [train_path, dev_path])
        all_files.extend(all_src)
        all_files.extend(all_tgt)

    # Build SubwordDictionary
    builder = SubwordDictionary.Factory(self.args.voc_size,
                                        vocab_threads=self.args.threads,
                                        custom_tokens=custom_tokens,
                                        padding_factor=8,
                                        count_threshold=self.args.count_threshold)
    dictionary = builder.build(all_files, tmp_path=self.wdir('bpe_temp'))
    dictionary.save(self.state.vocab)
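# --- Hedged sketch (not in the original source) -----------------------------
# `collect_parallel_files` is called above but not defined in this section.
# Assuming the tokenized corpora are stored as '<name>.<src_lang>' /
# '<name>.<tgt_lang>' file pairs inside each directory, a minimal
# implementation could look like this; the real helper may differ.
def collect_parallel_files(src_lang, tgt_lang, paths):
    src_files, tgt_files = [], []
    for path in paths:
        if not os.path.isdir(path):
            continue
        for name in sorted(os.listdir(path)):
            if not name.endswith('.' + src_lang):
                continue
            stem = name[:-len(src_lang) - 1]
            tgt_name = stem + '.' + tgt_lang
            # Keep the pair only if the matching target-side file exists
            if os.path.isfile(os.path.join(path, tgt_name)):
                src_files.append(os.path.join(path, name))
                tgt_files.append(os.path.join(path, tgt_name))
    return src_files, tgt_files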
def _bpe_encode_files(self, pool, src_lang, tgt_lang, in_src_files, in_tgt_files,
                      out_src_file_obj, out_tgt_file_obj):
    src_prefix, tgt_prefix = None, None
    if len(self._target_langs) > 1:
        src_prefix = SubwordDictionary.language_tag(tgt_lang) + '_ '
        tgt_prefix = SubwordDictionary.language_tag(src_lang) + '_ '

    batch_size = (multiprocessing.cpu_count() or 1) * 100
    bidirectional = ((src_lang, tgt_lang) in self._langs) and \
                    ((tgt_lang, src_lang) in self._langs)

    fwd_seq, bwd_seq = _Sequence(), _Sequence()

    for in_src_file, in_tgt_file in zip(in_src_files, in_tgt_files):
        with open(in_src_file, 'r', encoding='utf-8') as in_src_file_obj, \
                open(in_tgt_file, 'r', encoding='utf-8') as in_tgt_file_obj:
            # Read the parallel files in fixed-size batches until exhausted
            for batch in iter(lambda: tuple(
                    islice(zip(in_src_file_obj, in_tgt_file_obj), batch_size)), ()):
                for src_line, tgt_line, src_len, tgt_len in pool.map(_apply_bpe, batch):
                    if src_line is None or tgt_line is None:
                        continue

                    # Track source-to-target and target-to-source length ratios
                    s2t_rate, t2s_rate = tgt_len / src_len, src_len / tgt_len
                    fwd_seq.add(s2t_rate)
                    bwd_seq.add(t2s_rate)

                    if src_prefix is not None:
                        out_src_file_obj.write(src_prefix)
                    out_src_file_obj.write(src_line)
                    out_tgt_file_obj.write(tgt_line)

                    # For bidirectional pairs, also emit the reversed direction
                    if bidirectional:
                        if tgt_prefix is not None:
                            out_src_file_obj.write(tgt_prefix)
                        out_src_file_obj.write(tgt_line)
                        out_tgt_file_obj.write(src_line)

    return fwd_seq, bwd_seq
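# --- Hedged sketch (not in the original source) -----------------------------
# `_apply_bpe` and `_Sequence` are used above but not defined in this section.
# Assuming `_apply_bpe` encodes one (src_line, tgt_line) pair with the global
# `bpe_vocab` installed by `_pool_initializer` (defined below), and that
# SubwordDictionary exposes a hypothetical `encode_line` method returning the
# subword tokens, a worker could look roughly like this:
def _apply_bpe(entry):
    src_line, tgt_line = entry
    src_tokens = bpe_vocab.encode_line(src_line.strip())  # hypothetical API
    tgt_tokens = bpe_vocab.encode_line(tgt_line.strip())  # hypothetical API
    if not src_tokens or not tgt_tokens:
        # Returning None makes the caller skip the pair (and avoids a
        # division by zero when computing length ratios)
        return None, None, 0, 0
    return (' '.join(src_tokens) + '\n', ' '.join(tgt_tokens) + '\n',
            len(src_tokens), len(tgt_tokens))


# Assuming `_Sequence` accumulates running statistics over the observed
# length ratios, a minimal Welford-style accumulator would suffice:
class _Sequence:
    def __init__(self):
        self.count, self.mean, self._m2 = 0, 0.0, 0.0

    def add(self, value):
        # Welford's online algorithm: numerically stable running mean/variance
        self.count += 1
        delta = value - self.mean
        self.mean += delta / self.count
        self._m2 += delta * (value - self.mean)

    @property
    def variance(self):
        return self._m2 / self.count if self.count > 0 else 0.0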
@classmethod
def setup_task(cls, args, **kwargs):
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)

    # Find the language pair automatically if not given explicitly
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data[0])

    # Load dictionary
    subword_dict = SubwordDictionary.load(os.path.join(args.data[0], 'model.vcb'))

    return cls(args, subword_dict)
@classmethod
def load_dictionary(cls, filename):
    # Always load the shared subword model, whatever filename is passed in
    if os.path.basename(filename) != 'model.vcb':
        filename = os.path.join(os.path.dirname(filename), 'model.vcb')
    return SubwordDictionary.load(filename)
@classmethod
def load_dictionary(cls, filename):
    return SubwordDictionary.load(filename)
def _pool_initializer(vocab_path):
    # Load the subword vocabulary once per worker process and expose it as a
    # module-level global, so it is not re-loaded for every mapped batch
    global bpe_vocab
    bpe_vocab = SubwordDictionary.load(vocab_path)
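# --- Hedged usage sketch (not in the original source) -----------------------
# The initializer above is designed for multiprocessing.Pool: each worker
# loads the vocabulary exactly once at startup rather than once per line.
# Variable names here (vocab_path, trainer, the file lists) are illustrative,
# not taken from the original source.
#
#   with multiprocessing.Pool(initializer=_pool_initializer,
#                             initargs=(vocab_path,)) as pool:
#       fwd_seq, bwd_seq = trainer._bpe_encode_files(
#           pool, src_lang, tgt_lang, in_src_files, in_tgt_files,
#           out_src_file_obj, out_tgt_file_obj)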