Ejemplo n.º 1
0
    def test_gpt2standin(self):
        """
        Exercise the 'slow_bytelevel_bpe' and 'bytelevelbpe' dictionary agents.

        A dictionary file is built once (task 'babi') inside a temporary
        directory, both agents are created from that same file and handed to
        ``self._run_test``. The slow agent is then recreated with
        ``bpe_add_prefix_space`` switched on and passed to
        ``self._run_prefix_space_test``.
        """
        with testing_utils.tempdir() as workdir:
            fast_opt = self._get_dict_opt('bytelevelbpe')
            slow_opt = self._get_dict_opt('slow_bytelevel_bpe')

            # The dictionary file has to exist before either agent is created,
            # so build it first from the 'bytelevelbpe' options.
            dict_path = os.path.join(workdir, "dict")
            parser = build_dict.setup_args()
            parser.set_defaults(**fast_opt)
            parser.set_defaults(task='babi')
            build_opt = parser.parse_args([])
            build_opt['dict_file'] = dict_path
            build_dict.build_dict(build_opt)

            # Both agents are pointed at the same freshly built file.
            fast_opt['dict_file'] = dict_path
            fast_agent = DictionaryAgent(fast_opt)
            slow_opt['dict_file'] = dict_path
            slow_agent = DictionaryAgent(slow_opt)
            self._run_test(slow_agent, fast_agent)

            # Second pass: same options, but with the prefix-space flag set.
            slow_opt['bpe_add_prefix_space'] = True
            slow_agent = DictionaryAgent(slow_opt)
            self._run_prefix_space_test(slow_agent)
Ejemplo n.º 2
0
 def _run_test(self, opt):
     """
     Prepare dictionary-building options from ``opt``.

     A ``build_dict`` parser is seeded with ``opt`` as defaults plus
     ``task='babi'`` and parsed with an empty argument list. The result is
     pointed at a dictionary file inside a temporary directory, and every
     entry of ``opt`` is then written over the parsed options.

     NOTE(review): the visible body stops once the overrides are applied;
     nothing is built or returned here -- confirm against the full source.

     :param opt: mapping of option names to values.
     """
     with testing_utils.tempdir() as workdir:
         parser = build_dict.setup_args()
         parser.set_defaults(**opt)
         parser.set_defaults(task='babi')
         parsed = parser.parse_args([])
         parsed['dict_file'] = os.path.join(workdir, "dict")
         # Explicit values from ``opt`` win over anything the parser produced.
         for name, value in opt.items():
             parsed[name] = value
Ejemplo n.º 3
0
def download(datapath):
    """
    Build the Twitter dictionary under ``datapath``.

    Nothing is actually downloaded: the dictionary is built locally via
    ``main_build_dict`` and written to
    ``<datapath>/models/twitter/dict/dict_30k``.

    :param datapath: root data directory the dictionary path is derived from.
    :return: whatever ``main_build_dict`` returns.
    """
    dict_path = os.path.join(datapath, 'models', 'twitter', 'dict', 'dict_30k')
    # Defaults for a build on the twitter task.
    defaults = dict(
        task='twitter',
        dict_lower=True,
        dict_file=dict_path,
        dict_maxtokens=30000,
    )
    parser = setup_args(ParlaiParser())
    parser.set_defaults(**defaults)
    # Parse an empty argument string so only the defaults above apply.
    opt = parser.parse_args(args='')
    return main_build_dict(opt)
Ejemplo n.º 4
0
def build_dict_30k():
    """
    Build the Twitter dictionary stored at ``DICT_FILE_30K``.

    The ``build_dict`` parser is given task 'twitter', lower-casing and
    ``dict_maxtokens=30000`` as defaults, then parsed with an empty argument
    string.

    :return: whatever ``main_build_dict`` returns.
    """
    defaults = {
        'task': 'twitter',
        'dict_lower': True,
        'dict_file': DICT_FILE_30K,
        'dict_maxtokens': 30000,
    }
    parser = setup_args()
    parser.set_defaults(**defaults)
    return main_build_dict(parser.parse_args(args=''))
Ejemplo n.º 5
0
def build_dict():
    """
    Build the 'convai2:self' dictionary stored at ``DICT_FILE``.

    Defaults set on the ``build_dict`` parser: lower-casing enabled,
    ``dict_include_valid=True`` and the 'split' tokenizer. The parser is then
    run on an empty argument string.

    :return: whatever ``main_build_dict`` returns.
    """
    defaults = {
        'task': 'convai2:self',
        'dict_lower': True,
        'dict_file': DICT_FILE,
        'dict_include_valid': True,
        'dict_tokenizer': 'split',
    }
    parser = setup_args()
    parser.set_defaults(**defaults)
    parsed_opt = parser.parse_args(args="")
    return main_build_dict(parsed_opt)
def build_dict():
    """
    Build the 'convai2:self' dictionary at 'models:convai2/dict_self'.

    Defaults set on the ``build_dict`` parser: lower-casing enabled and
    ``dict_include_valid=True``.

    NOTE(review): ``parse_args()`` is called without an explicit argument
    list here, so it presumably falls back to the process command line
    (the argparse default) -- confirm this is intended.

    :return: whatever ``main_build_dict`` returns.
    """
    dict_final = 'models:convai2/dict_self'
    defaults = {
        'task': 'convai2:self',
        'dict_lower': True,
        'dict_file': dict_final,
        'dict_include_valid': True,
    }

    parser = setup_args()
    parser.set_defaults(**defaults)
    return main_build_dict(parser.parse_args())
Ejemplo n.º 7
0
    def _distributed_train_model(self, opt):
        """
        Run ``mp_train.launch_and_train`` inside a temporary directory.

        ``opt`` is modified in place: ``model_file`` and ``dict_file`` are
        pointed into the temporary directory when the caller has not set
        them. A dictionary is built from the parsed options before training
        is launched.

        :param opt: option mapping for the training run.
        :return: the ``(valid, test)`` pair from ``launch_and_train``.
        """
        fallback_names = (
            ('model_file', 'model'),
            ('dict_file', 'model.dict'),
        )
        with testing_utils.tempdir() as workdir:
            for key, filename in fallback_names:
                if key not in opt:
                    opt[key] = os.path.join(workdir, filename)

            train_parser = mp_train.setup_args()
            train_opt = _forced_parse(train_parser, opt)

            # The dictionary must already exist when training starts.
            # NOTE(review): the parser created on the next line is never
            # used; build_dict consumes the training options directly.
            dict_parser = build_dict.setup_args()
            build_dict.build_dict(train_opt)

            valid, test = mp_train.launch_and_train(train_opt, 31337)

        return (valid, test)
Ejemplo n.º 8
0
    def _distributed_train_model(self, **overrides):
        """
        Run ``mp_train.launch_and_train`` inside a temporary directory.

        The options are ``self.base_config`` with ``overrides`` layered on
        top. ``model_file`` and ``dict_file`` default to paths inside the
        temporary directory, and a dictionary is built from the parsed
        options before training is launched.

        :param overrides: option values that take precedence over
            ``self.base_config``.
        :return: the ``(valid, test)`` pair from ``launch_and_train``.
        """
        config = dict(self.base_config)
        config.update(overrides)
        with testing_utils.tempdir() as workdir:
            if 'model_file' not in config:
                config['model_file'] = os.path.join(workdir, 'model')
            if 'dict_file' not in config:
                config['dict_file'] = os.path.join(workdir, 'model.dict')

            train_parser = mp_train.setup_args()
            train_opt = train_parser.parse_kwargs(**config)

            # The dictionary must already exist when training starts.
            # NOTE(review): the parser created on the next line is never
            # used; build_dict consumes the training options directly.
            dict_parser = build_dict.setup_args()
            build_dict.build_dict(train_opt)

            valid, test = mp_train.launch_and_train(train_opt)

        return (valid, test)
Ejemplo n.º 9
0
    def _distributed_train_model(self, opt):
        """
        Run ``mp_train.launch_and_train`` inside a temporary directory.

        ``opt`` is modified in place: ``model_file`` and ``dict_file`` are
        pointed into the temporary directory when the caller has not set
        them. A dictionary is built from the parsed options before training,
        and ``dist.destroy_process_group()`` is called once training returns.

        :param opt: option mapping for the training run.
        :return: the ``(valid, test)`` pair from ``launch_and_train``.
        """
        fallback_names = (
            ('model_file', 'model'),
            ('dict_file', 'model.dict'),
        )
        with testing_utils.tempdir() as workdir:
            for key, filename in fallback_names:
                if key not in opt:
                    opt[key] = os.path.join(workdir, filename)

            train_parser = mp_train.setup_args()
            # TODO: Kill this after dictionaries build correctly
            train_opt = self._forced_parse(train_parser, opt)

            # The dictionary must already exist when training starts.
            # NOTE(review): the parser created on the next line is never
            # used; build_dict consumes the training options directly.
            dict_parser = build_dict.setup_args()
            build_dict.build_dict(train_opt)

            valid, test = mp_train.launch_and_train(train_opt, 31338)
            dist.destroy_process_group()

        return (valid, test)
Ejemplo n.º 10
0
    def _distributed_train_model(self, opt):
        """
        Run ``mp_train.launch_and_train`` with output captured.

        ``opt`` is modified in place: ``model_file`` and ``dict_file`` are
        pointed into a temporary directory when the caller has not set them.
        A dictionary is built from the parsed options before training.

        :param opt: option mapping for the training run.
        :return: ``(captured_output, valid, test)`` where the first item is
            ``output.getvalue()`` and the other two come from
            ``launch_and_train``.
        """
        # The import is deliberately delayed until here: the set_spawn_method
        # call inside multiprocessing_train breaks the multithreading tests,
        # even when this test is skipped.
        import parlai.scripts.multiprocessing_train as mp_train

        with testing_utils.capture_output() as output, testing_utils.tempdir() as workdir:
            if 'model_file' not in opt:
                opt['model_file'] = os.path.join(workdir, 'model')
            if 'dict_file' not in opt:
                opt['dict_file'] = os.path.join(workdir, 'model.dict')

            train_parser = mp_train.setup_args()
            train_opt = _forced_parse(train_parser, opt)

            # The dictionary must already exist when training starts.
            # NOTE(review): the parser created on the next line is never
            # used; build_dict consumes the training options directly.
            dict_parser = build_dict.setup_args()
            build_dict.build_dict(train_opt)

            valid, test = mp_train.launch_and_train(train_opt, 31337)

        captured = output.getvalue()
        return (captured, valid, test)
Ejemplo n.º 11
0
 def _get_parlai_opt(self, options: List[str] = None, print_args=False):
     """
     Parse ``options`` with the ``build_dict`` argument parser.

     :param options: command-line style arguments. ``None`` (the default) is
         treated as an empty argument list.
     :param print_args: forwarded unchanged to ``parse_args``.
     :return: the options produced by ``parse_args``.
     """
     from parlai.scripts.build_dict import setup_args

     # Use a fresh list per call instead of a mutable ``[]`` default, which
     # would be one shared object across every call of this method.
     args = [] if options is None else options
     parser = setup_args()
     return parser.parse_args(args, print_args=print_args)
Ejemplo n.º 12
0
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Generates a dictionary file from the training data.

For more documentation, see `parlai.scripts.build_dict`.
"""

from parlai.scripts.build_dict import setup_args, build_dict

if __name__ == '__main__':
    # Script entry point: parse with the build_dict parser (no explicit
    # argument list is supplied) and pass the resulting options to build_dict.
    build_dict(setup_args().parse_args())