Python SubwordDictionary Examples

Programming Language: Python

Namespace/Package Name: mmt.textencoder

Examples at hotexamples.com: 6

Python SubwordDictionary - 6 examples found. These are the top-rated real-world Python examples of mmt.textencoder.SubwordDictionary, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

load(4)

language_tag(2)

Factory(1)

Frequently Used Methods

load (4)

language_tag (2)

Factory (1)

Example #1

Show file

    def bpe_create(self):
        """Produce the vocabulary file ``model.vcb`` inside the output path.

        The output directory is created if missing and the resulting file
        path is stored in ``self.state.vocab``. When
        ``self.args.vocabulary_path`` is set, that file is copied as-is and
        nothing else happens. Otherwise a ``SubwordDictionary`` is built from
        the ``train`` and ``dev`` files of every pair in ``self._mono_pairs``
        and saved to ``self.state.vocab``.
        """
        out_dir = self.args.output_path
        os.makedirs(out_dir, exist_ok=True)
        self.state.vocab = os.path.join(out_dir, 'model.vcb')

        # A ready-made vocabulary was supplied: reuse it verbatim.
        existing_vocab = self.args.vocabulary_path
        if existing_vocab is not None:
            shutil.copyfile(existing_vocab, self.state.vocab)
            return

        # Custom tokens: ten ${DNT<i>} entries, preceded by one language tag
        # per target language when there is more than one target language.
        dnt_tokens = ['${DNT%d}' % index for index in range(10)]
        if len(self._target_langs) > 1:
            lang_tags = [SubwordDictionary.language_tag(lang)
                         for lang in self._target_langs]
        else:
            lang_tags = []
        custom_tokens = lang_tags + dnt_tokens

        # Gather source and target files of both splits for every pair.
        training_files = []
        for src_lang, tgt_lang in self._mono_pairs:
            pair_dir = '%s__%s' % (src_lang, tgt_lang)
            split_dirs = [
                os.path.join(self.state.tokenized_corpora, pair_dir, split)
                for split in ('train', 'dev')
            ]
            src_files, tgt_files = collect_parallel_files(
                src_lang, tgt_lang, split_dirs)
            training_files += src_files
            training_files += tgt_files

        # Build the dictionary and persist it at the vocabulary path.
        factory = SubwordDictionary.Factory(
            self.args.voc_size,
            vocab_threads=self.args.threads,
            custom_tokens=custom_tokens,
            padding_factor=8,
            count_threshold=self.args.count_threshold)
        vocabulary = factory.build(training_files,
                                   tmp_path=self.wdir('bpe_temp'))
        vocabulary.save(self.state.vocab)

Example #2

Show file

    def _bpe_encode_files(self, pool, src_lang, tgt_lang, in_src_files,
                          in_tgt_files, out_src_file_obj, out_tgt_file_obj):
        """Encode parallel files through ``pool`` and write the results.

        Each pair of input files is read line by line in lockstep, grouped
        into batches and passed to ``_apply_bpe`` via ``pool.map``. Results
        whose source or target line is ``None`` are skipped. Every kept pair
        is written to the output file objects; when both ``(src_lang,
        tgt_lang)`` and ``(tgt_lang, src_lang)`` are in ``self._langs`` the
        swapped pair is written as well. With more than one target language,
        the line written to the source output is preceded by the language tag
        of the side being translated into, followed by ``'_ '``.

        Returns:
            A ``(fwd_seq, bwd_seq)`` tuple of ``_Sequence`` objects that
            received ``tgt_len / src_len`` and ``src_len / tgt_len``
            respectively for every kept pair.
        """
        if len(self._target_langs) > 1:
            src_prefix = SubwordDictionary.language_tag(tgt_lang) + '_ '
            tgt_prefix = SubwordDictionary.language_tag(src_lang) + '_ '
        else:
            src_prefix = tgt_prefix = None

        batch_size = (multiprocessing.cpu_count() or 1) * 100
        forward_known = (src_lang, tgt_lang) in self._langs
        bidirectional = forward_known and (tgt_lang, src_lang) in self._langs

        fwd_seq = _Sequence()
        bwd_seq = _Sequence()

        def emit(prefix, source_text, target_text):
            # Write one example: optional tag, then the source-side line,
            # and the matching line on the target side.
            if prefix is not None:
                out_src_file_obj.write(prefix)
            out_src_file_obj.write(source_text)
            out_tgt_file_obj.write(target_text)

        for src_path, tgt_path in zip(in_src_files, in_tgt_files):
            with open(src_path, 'r', encoding='utf-8') as src_reader, \
                    open(tgt_path, 'r', encoding='utf-8') as tgt_reader:
                while True:
                    # Pull up to batch_size line pairs; an empty batch means
                    # at least one of the two files is exhausted.
                    batch = tuple(
                        islice(zip(src_reader, tgt_reader), batch_size))
                    if not batch:
                        break

                    encoded = pool.map(_apply_bpe, batch)
                    for src_line, tgt_line, src_len, tgt_len in encoded:
                        if src_line is None or tgt_line is None:
                            continue

                        # Compute both ratios before recording either one.
                        s2t_rate = tgt_len / src_len
                        t2s_rate = src_len / tgt_len
                        fwd_seq.add(s2t_rate)
                        bwd_seq.add(t2s_rate)

                        emit(src_prefix, src_line, tgt_line)
                        if bidirectional:
                            emit(tgt_prefix, tgt_line, src_line)

        return fwd_seq, bwd_seq

Example #3

Show file

    def setup_task(cls, args, **kwargs):
        """Create a task instance from ``args``.

        The ``left_pad_source`` and ``left_pad_target`` options are passed
        through ``options.eval_bool``. If either language is unset, both are
        inferred from the first data path. The dictionary is loaded from
        ``model.vcb`` in that same path. ``kwargs`` is accepted but not used
        here.

        Returns:
            ``cls(args, subword_dict)``.
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        data_dir = args.data[0]

        # Fill in the language pair when it was not fully specified.
        pair_is_known = (args.source_lang is not None
                         and args.target_lang is not None)
        if not pair_is_known:
            source, target = data_utils.infer_language_pair(data_dir)
            args.source_lang = source
            args.target_lang = target

        # The vocabulary lives next to the data, under a fixed file name.
        vocab_file = os.path.join(data_dir, 'model.vcb')
        subword_dict = SubwordDictionary.load(vocab_file)

        return cls(args, subword_dict)

Example #4

Show file

 def load_dictionary(cls, filename):
     """Load the ``model.vcb`` dictionary located alongside ``filename``.

     If ``filename`` is not itself named ``model.vcb``, the file of that
     name in the same directory is loaded instead.
     """
     vocab_name = 'model.vcb'
     if os.path.basename(filename) == vocab_name:
         return SubwordDictionary.load(filename)

     sibling = os.path.join(os.path.dirname(filename), vocab_name)
     return SubwordDictionary.load(sibling)

Example #5

Show file

 def load_dictionary(cls, filename):
     """Load and return the ``SubwordDictionary`` stored at ``filename``."""
     dictionary = SubwordDictionary.load(filename)
     return dictionary

Example #6

Show file

def _pool_initializer(vocab_path):
    """Load the dictionary at ``vocab_path`` into the global ``bpe_vocab``.

    NOTE(review): judging by the name, this is presumably passed as the
    ``initializer`` of a worker pool so that each worker loads its own
    copy; confirm against the caller.
    """
    global bpe_vocab
    loaded = SubwordDictionary.load(vocab_path)
    bpe_vocab = loaded