Example #1
0
    def bpe_create(self):
        """Produce the subword vocabulary file ``model.vcb`` in the output dir.

        The output directory is created if missing and ``self.state.vocab`` is
        set to ``<output_path>/model.vcb``. When ``self.args.vocabulary_path``
        is given, that file is copied to the destination and nothing else is
        done. Otherwise a new dictionary is built from the ``train`` and
        ``dev`` folders of every pair in ``self._mono_pairs`` and saved there.
        """
        out_dir = self.args.output_path
        os.makedirs(out_dir, exist_ok=True)
        self.state.vocab = os.path.join(out_dir, 'model.vcb')

        # Reuse a ready-made vocabulary instead of training a new one.
        existing_vocab = self.args.vocabulary_path
        if existing_vocab is not None:
            shutil.copyfile(existing_vocab, self.state.vocab)
            return

        # Ten numbered placeholder tokens are always reserved; language tags
        # go in front of them only when there is more than one target language.
        placeholder_tokens = ['${DNT%d}' % idx for idx in range(10)]
        if len(self._target_langs) > 1:
            language_tags = [
                SubwordDictionary.language_tag(lang)
                for lang in self._target_langs
            ]
            special_tokens = language_tags + placeholder_tokens
        else:
            special_tokens = placeholder_tokens

        # Gather source and target files of every language pair.
        corpus_files = []
        for src_lang, tgt_lang in self._mono_pairs:
            pair_root = os.path.join(self.state.tokenized_corpora,
                                     '%s__%s' % (src_lang, tgt_lang))
            split_dirs = [
                os.path.join(pair_root, split) for split in ('train', 'dev')
            ]
            src_files, tgt_files = collect_parallel_files(
                src_lang, tgt_lang, split_dirs)
            corpus_files += src_files
            corpus_files += tgt_files

        # Train the dictionary and persist it at the vocabulary location.
        factory = SubwordDictionary.Factory(
            self.args.voc_size,
            vocab_threads=self.args.threads,
            custom_tokens=special_tokens,
            padding_factor=8,
            count_threshold=self.args.count_threshold)
        vocabulary = factory.build(corpus_files,
                                   tmp_path=self.wdir('bpe_temp'))
        vocabulary.save(self.state.vocab)
Example #2
0
    def _bpe_encode_files(self, pool, src_lang, tgt_lang, in_src_files,
                          in_tgt_files, out_src_file_obj, out_tgt_file_obj):
        """Encode paired corpus files through ``pool`` and write the results.

        Source and target files are read in lockstep, line by line, in batches
        that are handed to ``pool.map(_apply_bpe, batch)``. Pairs for which
        either encoded line is ``None`` are skipped. Each kept pair is written
        to the two output objects; when both ``(src_lang, tgt_lang)`` and
        ``(tgt_lang, src_lang)`` are in ``self._langs`` the swapped pair is
        written right after it. With more than one target language, lines
        written to the source output are prefixed with a language tag
        followed by ``'_ '``.

        Returns:
            A ``(fwd_seq, bwd_seq)`` pair of ``_Sequence`` objects fed with
            the target/source and source/target length ratios of every kept
            pair.
        """
        # NOTE(review): the ratios below divide by the lengths returned by
        # `_apply_bpe`; presumably it never reports a zero length for a kept
        # pair - confirm against its implementation.
        needs_lang_tag = len(self._target_langs) > 1
        src_prefix = (SubwordDictionary.language_tag(tgt_lang) + '_ '
                      if needs_lang_tag else None)
        tgt_prefix = (SubwordDictionary.language_tag(src_lang) + '_ '
                      if needs_lang_tag else None)

        batch_size = (multiprocessing.cpu_count() or 1) * 100
        bidirectional = ((src_lang, tgt_lang) in self._langs
                         and (tgt_lang, src_lang) in self._langs)

        fwd_seq = _Sequence()
        bwd_seq = _Sequence()

        write_src = out_src_file_obj.write
        write_tgt = out_tgt_file_obj.write

        for src_path, tgt_path in zip(in_src_files, in_tgt_files):
            with open(src_path, 'r', encoding='utf-8') as src_reader, \
                    open(tgt_path, 'r', encoding='utf-8') as tgt_reader:
                while True:
                    # Pull the next chunk of aligned line pairs; an empty
                    # chunk means one of the two files is exhausted.
                    batch = tuple(
                        islice(zip(src_reader, tgt_reader), batch_size))
                    if not batch:
                        break

                    encoded = pool.map(_apply_bpe, batch)
                    for src_line, tgt_line, src_len, tgt_len in encoded:
                        if src_line is None or tgt_line is None:
                            continue

                        s2t_rate = tgt_len / src_len
                        t2s_rate = src_len / tgt_len
                        fwd_seq.add(s2t_rate)
                        bwd_seq.add(t2s_rate)

                        # Forward direction: source -> target.
                        if src_prefix is not None:
                            write_src(src_prefix)
                        write_src(src_line)
                        write_tgt(tgt_line)

                        if not bidirectional:
                            continue

                        # Reverse direction: target -> source.
                        if tgt_prefix is not None:
                            write_src(tgt_prefix)
                        write_src(tgt_line)
                        write_tgt(src_line)

        return fwd_seq, bwd_seq
Example #3
0
    def setup_task(cls, args, **kwargs):
        """Build a task instance from parsed arguments.

        Normalises the two padding flags through ``options.eval_bool``, fills
        in ``args.source_lang`` / ``args.target_lang`` from the first data
        directory when either is missing, and loads the subword dictionary
        stored as ``model.vcb`` in that same directory.

        Returns:
            ``cls(args, subword_dict)``.
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        data_dir = args.data[0]

        # Infer the language pair unless both sides were given explicitly.
        pair_given = (args.source_lang is not None
                      and args.target_lang is not None)
        if not pair_given:
            inferred_pair = data_utils.infer_language_pair(data_dir)
            args.source_lang, args.target_lang = inferred_pair

        vocab_path = os.path.join(data_dir, 'model.vcb')
        subword_dict = SubwordDictionary.load(vocab_path)
        return cls(args, subword_dict)
Example #4
0
 def load_dictionary(cls, filename):
     """Load the subword dictionary located next to ``filename``.

     Whatever file name is passed, the dictionary is read from ``model.vcb``
     in the same directory; a path already ending in ``model.vcb`` is used
     unchanged.
     """
     vocab_name = 'model.vcb'
     parent_dir, base_name = os.path.split(filename)
     if base_name == vocab_name:
         return SubwordDictionary.load(filename)
     return SubwordDictionary.load(os.path.join(parent_dir, vocab_name))
Example #5
0
 def load_dictionary(cls, filename):
     """Return the ``SubwordDictionary`` loaded from ``filename``."""
     dictionary = SubwordDictionary.load(filename)
     return dictionary
Example #6
0
def _pool_initializer(vocab_path):
    """Load the dictionary at ``vocab_path`` into the global ``bpe_vocab``.

    Judging by the name, this is meant to be used as a worker-pool
    initializer so each worker holds its own loaded vocabulary - confirm at
    the call site.
    """
    global bpe_vocab
    loaded_vocab = SubwordDictionary.load(vocab_path)
    bpe_vocab = loaded_vocab