def test_gpt2standin(self):
    """
    Verify the slow_bytelevel_bpe tokenizer agrees with the HuggingFace
    bytelevelbpe tokenizer, with and without prefix spaces.
    """
    with testing_utils.tempdir() as tmpdir:
        hf_opt = self._get_dict_opt('bytelevelbpe')
        slow_opt = self._get_dict_opt('slow_bytelevel_bpe')

        # both agents share a single dict file, which must be built first
        dict_path = os.path.join(tmpdir, "dict")
        parser = build_dict.setup_args()
        parser.set_defaults(**hf_opt)
        parser.set_defaults(task='babi')
        popt = parser.parse_args([])
        popt['dict_file'] = dict_path
        build_dict.build_dict(popt)

        hf_opt['dict_file'] = dict_path
        hf_agent = DictionaryAgent(hf_opt)

        slow_opt['dict_file'] = dict_path
        slow_agent = DictionaryAgent(slow_opt)

        self._run_test(slow_agent, hf_agent)

        # rebuild the slow agent with prefix spaces enabled and re-check
        slow_opt['bpe_add_prefix_space'] = True
        slow_agent = DictionaryAgent(slow_opt)
        self._run_prefix_space_test(slow_agent)
def _run_test(self, opt):
    """
    Build a babi dictionary in a temp dir with *opt* overlaid onto the
    parsed options.

    NOTE(review): ``popt`` is constructed here but never used or returned —
    confirm whether this helper was truncated or is intentionally a no-op.
    """
    with testing_utils.tempdir() as tmpdir:
        parser = build_dict.setup_args()
        parser.set_defaults(**opt)
        parser.set_defaults(task='babi')
        popt = parser.parse_args([])
        popt['dict_file'] = os.path.join(tmpdir, "dict")
        # force every caller-supplied option to win over the parsed defaults
        for key, value in opt.items():
            popt[key] = value
def download(datapath):
    """
    Build (rather than download) the 30k-token twitter dictionary.

    :param datapath: root data directory the dict file is placed under.
    """
    dict_path = os.path.join(datapath, 'models', 'twitter', 'dict', 'dict_30k')
    # don't actually download--build it
    parser = setup_args(ParlaiParser())
    # first build on standard train and validation
    parser.set_defaults(
        task='twitter',
        dict_lower=True,
        dict_file=dict_path,
        dict_maxtokens=30000,
    )
    return main_build_dict(parser.parse_args(args=''))
def build_dict_30k():
    """Build the lowercased, 30k-entry twitter dictionary at DICT_FILE_30K."""
    parser = setup_args()
    # the dictionary is built over the standard train and validation splits
    defaults = dict(
        task='twitter',
        dict_lower=True,
        dict_file=DICT_FILE_30K,
        dict_maxtokens=30000,
    )
    parser.set_defaults(**defaults)
    return main_build_dict(parser.parse_args(args=''))
def build_dict():
    """Build the convai2 self-dialogue dictionary at DICT_FILE."""
    parser = setup_args()
    # built over standard train and validation, with validation included
    defaults = dict(
        task='convai2:self',
        dict_lower=True,
        dict_file=DICT_FILE,
        dict_include_valid=True,
        dict_tokenizer='split',
    )
    parser.set_defaults(**defaults)
    return main_build_dict(parser.parse_args(args=""))
def build_dict():
    """
    Build the convai2 self-dialogue dictionary at ``models:convai2/dict_self``.

    :return: the result of the main ``build_dict`` script.
    """
    DICT_FINAL = 'models:convai2/dict_self'
    parser = setup_args()
    # first build on standard train and validation
    parser.set_defaults(
        task='convai2:self',
        dict_lower=True,
        dict_file=DICT_FINAL,
        dict_include_valid=True,
    )
    # parse with an explicit empty arg string: a bare parse_args() reads
    # sys.argv, so this programmatic build would depend on whatever CLI flags
    # the invoking process happened to receive (sibling build scripts all
    # pass args='')
    opt = parser.parse_args(args='')
    return main_build_dict(opt)
def _distributed_train_model(self, opt):
    """
    Run multiprocessing training on *opt*; return the (valid, test) reports.
    """
    with testing_utils.tempdir() as tmpdir:
        # default all artifacts into the temporary directory
        if 'model_file' not in opt:
            opt['model_file'] = os.path.join(tmpdir, 'model')
        if 'dict_file' not in opt:
            opt['dict_file'] = os.path.join(tmpdir, 'model.dict')

        popt = _forced_parse(mp_train.setup_args(), opt)

        # the dictionary must exist before the workers are launched
        parser = build_dict.setup_args()
        build_dict.build_dict(popt)

        valid_report, test_report = mp_train.launch_and_train(popt, 31337)
        return (valid_report, test_report)
def _distributed_train_model(self, **overrides):
    """
    Train with ``self.base_config`` plus *overrides*; return (valid, test)
    reports.
    """
    opt = {**self.base_config, **overrides}
    with testing_utils.tempdir() as tmpdir:
        # default all artifacts into the temporary directory
        if 'model_file' not in opt:
            opt['model_file'] = os.path.join(tmpdir, 'model')
        if 'dict_file' not in opt:
            opt['dict_file'] = os.path.join(tmpdir, 'model.dict')

        popt = mp_train.setup_args().parse_kwargs(**opt)

        # the dictionary must exist before the workers are launched
        parser = build_dict.setup_args()
        build_dict.build_dict(popt)

        valid_report, test_report = mp_train.launch_and_train(popt)
        return (valid_report, test_report)
def _distributed_train_model(self, opt):
    """
    Run multiprocessing training on *opt*, tearing down the torch process
    group afterwards; return the (valid, test) reports.
    """
    with testing_utils.tempdir() as tmpdir:
        # default all artifacts into the temporary directory
        if 'model_file' not in opt:
            opt['model_file'] = os.path.join(tmpdir, 'model')
        if 'dict_file' not in opt:
            opt['dict_file'] = os.path.join(tmpdir, 'model.dict')

        # TODO: Kill this after dictionaries build correctly
        popt = self._forced_parse(mp_train.setup_args(), opt)

        # we need a prebuilt dictionary
        parser = build_dict.setup_args()
        build_dict.build_dict(popt)

        valid_report, test_report = mp_train.launch_and_train(popt, 31338)
        dist.destroy_process_group()
        return (valid_report, test_report)
def _distributed_train_model(self, opt):
    """
    Run multiprocessing training on *opt*; return (captured_output, valid,
    test).
    """
    # delayed import: the set_spawn_method call inside multiprocessing_train
    # breaks the multithreading tests at import time, even when this test is
    # skipped
    import parlai.scripts.multiprocessing_train as mp_train

    with testing_utils.capture_output() as output:
        with testing_utils.tempdir() as tmpdir:
            # default all artifacts into the temporary directory
            if 'model_file' not in opt:
                opt['model_file'] = os.path.join(tmpdir, 'model')
            if 'dict_file' not in opt:
                opt['dict_file'] = os.path.join(tmpdir, 'model.dict')

            popt = _forced_parse(mp_train.setup_args(), opt)

            # we need a prebuilt dictionary
            parser = build_dict.setup_args()
            build_dict.build_dict(popt)

            valid_report, test_report = mp_train.launch_and_train(popt, 31337)
            return (output.getvalue(), valid_report, test_report)
def _get_parlai_opt(self, options: 'Optional[List[str]]' = None, print_args=False):
    """
    Parse *options* with the build_dict parser and return the resulting opt.

    :param options: command-line style argument list; defaults to no arguments.
    :param print_args: forwarded to the parser's ``parse_args``.
    :return: the parsed opt.
    """
    from parlai.scripts.build_dict import setup_args

    # avoid the mutable-default-argument pitfall: the previous default of []
    # was a single list object shared across every call to this method
    # (the annotation is a string so no extra typing import is required)
    if options is None:
        options = []
    parser = setup_args()
    opt = parser.parse_args(options, print_args=print_args)
    return opt
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Generates a dictionary file from the training data.

For more documentation, see `parlai.scripts.build_dict`.
"""

from parlai.scripts.build_dict import setup_args, build_dict

if __name__ == '__main__':
    # parse CLI options with the canonical build_dict parser, then build
    build_dict(setup_args().parse_args())