def setup_args(parser=None):
    """
    Create or extend the argument parser used by the dictionary-building loop.

    :param parser: parser to extend. When ``None``, a new
        ``ParlaiParser(True, True)`` is created.
    :return: the parser, with the "Dictionary Loop Arguments" group added and
        the command-line arguments of the selected dictionary class
        registered on it.
    """
    if parser is None:
        parser = ParlaiParser(True, True)

    group = parser.add_argument_group('Dictionary Loop Arguments')
    group.add_argument(
        '--dict-maxexs',
        default=-1,
        type=int,
        help='max number of examples to build dict on',
    )
    group.add_argument(
        '--dict-include-valid',
        default=False,
        type='bool',
        help='Include validation set in dictionary building for task.',
    )
    group.add_argument(
        '--dict-include-test',
        default=False,
        type='bool',
        help='Include test set in dictionary building for task.',
    )
    group.add_argument('-ltim', '--log-every-n-secs', type=float, default=2)

    # Parse what is known so far to learn whether a custom dictionary class
    # was requested; that class then contributes its own arguments.
    known, _ = parser.parse_known_args(nohelp=True)
    dict_class_name = vars(known).get('dict_class')
    if dict_class_name:
        dict_cls = str2class(dict_class_name)
    else:
        dict_cls = DictionaryAgent
    dict_cls.add_cmdline_args(parser)
    return parser
def build_dict(opt):
    """
    Build a dictionary over the task's training data and save it to disk.

    Returns early, without building anything, when ``opt['dict_file']`` is
    unset/empty or already names an existing file.

    :param opt: option mapping. Reads ``dict_file``, ``dict_class`` and
        ``dict_maxexs``; the task is created from a deep copy with
        ``datatype``, ``numthreads`` and ``batchsize`` overridden.
    :return: always ``None``.
    """
    dict_path = opt.get('dict_file')
    if not dict_path:
        print(
            'Tried to build dictionary but `--dict-file` is not set. Set '
            'this param so the dictionary can be saved.'
        )
        return

    print('[ setting up dictionary. ]')
    if os.path.isfile(dict_path):
        # Nothing to do: a dictionary file is already present.
        print("[ dictionary already built .]")
        return

    # Use the custom dictionary class when one is configured, otherwise the
    # default agent.
    custom_class = opt.get('dict_class')
    agent_cls = str2class(custom_class) if custom_class else DictionaryAgent
    dictionary = agent_cls(opt)

    # The dictionary is built from an ordered, single-threaded,
    # batchsize-1 pass over the train set.
    ordered_opt = copy.deepcopy(opt)
    ordered_opt['datatype'] = 'train:ordered'
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    world_dict = create_task(ordered_opt, dictionary)

    # Each parley passes one example to the dictionary.
    for seen, _ in enumerate(world_dict, 1):
        limit = opt['dict_maxexs']
        if 0 < limit < seen:
            # don't wait too long...
            print('Processed {} exs, moving on.'.format(limit))
            break
        world_dict.parley()

    print('[ dictionary built. ]')
    dictionary.save(opt['dict_file'], sort=True)
def build_dict(opt):
    """
    Build a dictionary over the task's training data and save it to disk.

    Returns early, without building anything, when ``opt['dict_file']`` is
    unset/empty or already names an existing file.

    :param opt: option mapping. Reads ``dict_file``, ``dict_class``,
        ``datatype`` and ``dict_maxexs``. The task is created from a deep
        copy whose datatype is ``train:ordered`` (with ``:stream`` appended
        when the caller's datatype contains ``stream``).
    :return: always ``None``.
    """
    target = opt.get('dict_file')
    if not target:
        print(
            'Tried to build dictionary but `--dict-file` is not set. Set '
            'this param so the dictionary can be saved.'
        )
        return

    print('[ setting up dictionary. ]')
    if os.path.isfile(target):
        # A dictionary file already exists, so there is nothing to build.
        print("[ dictionary already built .]")
        return

    # Custom dictionary class if configured, default class otherwise.
    class_name = opt.get('dict_class')
    make_dictionary = str2class(class_name) if class_name else DictionaryAgent
    dictionary = make_dictionary(opt)

    ordered_opt = copy.deepcopy(opt)

    # Build from the train set, keeping streaming if the caller asked for it.
    build_datatype = 'train:ordered'
    if 'stream' in opt['datatype']:
        build_datatype += ':stream'
    ordered_opt['datatype'] = build_datatype
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1

    world_dict = create_task(ordered_opt, dictionary)

    # Each parley passes one example to the dictionary.
    for position, _ in enumerate(world_dict, 1):
        cap = opt['dict_maxexs']
        if 0 < cap < position:
            # don't wait too long...
            print('Processed {} exs, moving on.'.format(cap))
            break
        world_dict.parley()

    print('[ dictionary built. ]')
    dictionary.save(opt['dict_file'], sort=True)
def build_dict(opt, skip_if_built=False):
    """
    Build (or load) the dictionary described by ``opt`` and save it to disk.

    :param opt: option mapping. Passing a ``ParlaiParser`` is still accepted
        but deprecated; it is parsed into options first. Reads ``dict_file``,
        ``dict_class``, ``task``, ``pytorch_buildteacher``,
        ``dict_include_valid``, ``dict_include_test`` and ``dict_maxexs``.
    :param skip_if_built: when true and ``opt['dict_file']`` already exists,
        return ``None`` without instantiating the dictionary agent.
    :return: ``None`` when ``dict_file`` is unset or the build was skipped;
        otherwise the dictionary agent (constructed while the file already
        exists, or freshly built and saved).
    """
    if isinstance(opt, ParlaiParser):
        print('[ Deprecated Warning: should be passed opt not Parser ]')
        opt = opt.parse_args()
    if not opt.get('dict_file'):
        print(
            'Tried to build dictionary but `--dict-file` is not set. Set '
            'this param so the dictionary can be saved.'
        )
        return
    print('[ setting up dictionary. ]')
    if skip_if_built and os.path.isfile(opt['dict_file']):
        # Dictionary already built, skip all loading or setup
        print("[ dictionary already built .]")
        return None

    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)

    if os.path.isfile(opt['dict_file']):
        # Dictionary already built, return loaded dictionary agent
        print("[ dictionary already built .]")
        return dictionary

    # The build runs on a private copy of the options.
    ordered_opt = copy.deepcopy(opt)
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    ordered_opt['image_mode'] = 'none'
    if ordered_opt['task'] == 'pytorch_teacher':
        # Build from the underlying task named by `pytorch_buildteacher`
        # when one is given.
        pytorch_buildteacher_task = ordered_opt.get('pytorch_buildteacher', '')
        if pytorch_buildteacher_task != '':
            ordered_opt['task'] = pytorch_buildteacher_task

    # The train set is always used; valid/test only on request.
    datatypes = ['train:ordered:stream']
    if opt.get('dict_include_valid'):
        datatypes.append('valid:stream')
    if opt.get('dict_include_test'):
        datatypes.append('test:stream')

    cnt = 0  # examples seen so far, shared across all datatypes
    cap_reached = False
    for dt in datatypes:
        ordered_opt['datatype'] = dt
        world_dict = create_task(ordered_opt, dictionary)
        # pass examples to dictionary
        while not world_dict.epoch_done():
            cnt += 1
            if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] > 0:
                print('Processed {} exs, moving on.'.format(opt['dict_maxexs']))
                # don't wait too long...
                cap_reached = True
                break
            world_dict.parley()
        if cap_reached:
            # The cap is global: no later datatype could contribute anything,
            # so do not create its task or repeat the message.
            break

    dictionary.save(opt['dict_file'], sort=True)
    print('[ dictionary built with {} tokens ]'.format(len(dictionary)))
    return dictionary
def __build_bag_of_words(opt):
    """
    Build and save a dictionary up front for models that request one.

    Does nothing unless ``opt['dict_build_first']`` is truthy and ``opt``
    has a ``dict_file`` key. When ``dict_file`` is ``None`` it is filled in
    (in ``opt`` itself) from ``pretrained_model`` or, failing that,
    ``model_file``, with ``'.dict'`` appended.

    :param opt: option mapping as produced by argument parsing.
    :return: always ``None``.
    """
    if not (opt['dict_build_first'] and 'dict_file' in opt):
        return

    # Derive a dictionary path from the first available model path.
    for source_key in ('pretrained_model', 'model_file'):
        if opt['dict_file'] is None and opt.get(source_key):
            opt['dict_file'] = opt[source_key] + '.dict'

    print("[ building dictionary first... ]")

    dict_path = opt.get('dict_file')
    if not dict_path:
        print(
            'Tried to build dictionary but `--dict-file` is not set. Set '
            'this param so the dictionary can be saved.'
        )
        return

    print('[ setting up dictionary. ]')
    if os.path.isfile(dict_path):
        # A dictionary file already exists, so there is nothing to build.
        print("[ dictionary already built .]")
        return

    # Custom dictionary class if configured, default class otherwise.
    class_name = opt.get('dict_class')
    agent_cls = str2class(class_name) if class_name else DictionaryAgent
    dictionary = agent_cls(opt)

    ordered_opt = copy.deepcopy(opt)

    # Build from the train set, keeping streaming if the caller asked for it.
    build_datatype = 'train:ordered'
    if 'stream' in opt['datatype']:
        build_datatype += ':stream'
    ordered_opt['datatype'] = build_datatype
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1

    world_dict = create_task(ordered_opt, dictionary)

    # Each parley passes one example to the dictionary.
    for seen, _ in enumerate(world_dict, 1):
        limit = opt['dict_maxexs']
        if 0 < limit < seen:
            # don't wait too long...
            print('Processed {} exs, moving on.'.format(limit))
            break
        world_dict.parley()

    print('[ dictionary built. ]')
    dictionary.save(opt['dict_file'], sort=True)
def build_dict(opt):
    """
    Build a dictionary over the task's training data and save it to disk.

    Returns early, without building anything, when ``opt['dict_file']`` is
    unset/empty or already names an existing file.

    :param opt: option mapping. Reads ``dict_file``, ``dict_class``,
        ``task``, ``pytorch_preprocess``, ``pytorch_buildteacher`` and
        ``dict_maxexs``. The task is created from a deep copy with
        ``datatype``, ``numthreads``, ``batchsize`` and ``image_mode``
        overridden.
    :return: always ``None``.
    """
    out_path = opt.get('dict_file')
    if not out_path:
        print(
            'Tried to build dictionary but `--dict-file` is not set. Set '
            'this param so the dictionary can be saved.'
        )
        return

    print('[ setting up dictionary. ]')
    if os.path.isfile(out_path):
        # A dictionary file already exists, so there is nothing to build.
        print("[ dictionary already built .]")
        return

    # Custom dictionary class if configured, default class otherwise.
    class_name = opt.get('dict_class')
    agent_cls = str2class(class_name) if class_name else DictionaryAgent
    dictionary = agent_cls(opt)

    # Build from an ordered, streamed pass over the train set with image
    # loading switched off.
    ordered_opt = copy.deepcopy(opt)
    ordered_opt['datatype'] = 'train:ordered:stream'
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    ordered_opt['image_mode'] = 'none'

    # With a preprocessing pytorch_teacher, iterate the task named by
    # `pytorch_buildteacher` instead (when one is given).
    is_pytorch_teacher = ordered_opt['task'] == 'pytorch_teacher'
    if is_pytorch_teacher and ordered_opt.get('pytorch_preprocess', False):
        build_task = ordered_opt.get('pytorch_buildteacher', '')
        if build_task != '':
            ordered_opt['task'] = build_task

    world_dict = create_task(ordered_opt, dictionary)

    # Each parley passes one example to the dictionary.
    processed = 0
    while not world_dict.epoch_done():
        processed += 1
        limit = opt['dict_maxexs']
        if 0 < limit < processed:
            # don't wait too long...
            print('Processed {} exs, moving on.'.format(limit))
            break
        world_dict.parley()

    print('[ dictionary built. ]')
    dictionary.save(opt['dict_file'], sort=True)
def build_dict(opt, skip_if_built=False):
    """
    Build (or load) the dictionary described by ``opt`` and save it to disk.

    :param opt: options object. Passing a ``ParlaiParser`` is logged as an
        error but still accepted; it is parsed into options first. Reads
        ``dict_file``, ``dict_class``, ``dict_include_valid``,
        ``dict_include_test``, ``dict_maxexs`` and ``log_every_n_secs``.
    :param skip_if_built: when true and ``opt['dict_file']`` already exists,
        return ``None`` without instantiating the dictionary agent.
    :return: ``None`` when ``dict_file`` is unset or the build was skipped;
        otherwise the dictionary agent (already built, or freshly built and
        saved).
    :raises ValueError: if a dictionary has to be built while
        ``is_distributed()`` is true.
    """
    if isinstance(opt, ParlaiParser):
        logging.error('Should be passed opt not Parser')
        opt = opt.parse_args()
    if not opt.get('dict_file'):
        logging.error(
            'Tried to build dictionary but `--dict-file` is not set. Set '
            'this param so the dictionary can be saved.'
        )
        return
    if skip_if_built and PathManager.exists(opt['dict_file']):
        # Dictionary already built, skip all loading or setup
        logging.debug("dictionary already built.")
        return None

    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)

    if PathManager.exists(opt['dict_file']) or (
        hasattr(dictionary, 'is_prebuilt') and dictionary.is_prebuilt()
    ):
        # Dictionary already built, return loaded dictionary agent
        logging.debug("dictionary already built.")
        return dictionary

    if is_distributed():
        raise ValueError('Dictionaries should be pre-built before distributed train.')

    # The build runs on a private copy of the options.
    ordered_opt = copy.deepcopy(opt)
    ordered_opt['batchsize'] = 1
    # Set this to none so that image features are not calculated when Teacher is
    # instantiated while building the dict
    ordered_opt['image_mode'] = 'no_image_model'

    ordered_opt.log()

    # The train set is always used; valid/test only on request.
    datatypes = ['train:ordered:stream']
    if opt.get('dict_include_valid'):
        datatypes.append('valid:stream')
    if opt.get('dict_include_test'):
        datatypes.append('test:stream')

    # A falsy logging interval disables the progress bar entirely.
    log_every_n_secs = opt.get('log_every_n_secs', None)

    cnt = 0  # examples seen so far, shared across all datatypes
    cap_reached = False
    for dt in datatypes:
        ordered_opt['datatype'] = dt
        world_dict = create_task(ordered_opt, dictionary)
        # pass examples to dictionary
        # NOTE(review): the timer is re-created for every datatype, so the
        # duration logged at the end presumably covers only the last one --
        # confirm against TimeLogger.
        log_time = TimeLogger()
        total = world_dict.num_examples()
        if opt['dict_maxexs'] >= 0:
            total = min(total, opt['dict_maxexs'])

        if log_every_n_secs:
            pbar = tqdm.tqdm(
                total=total, desc='Building dictionary', unit='ex', unit_scale=True
            )
        else:
            pbar = None

        while not world_dict.epoch_done():
            cnt += 1
            if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] >= 0:
                logging.info('Processed {} exs, moving on.'.format(opt['dict_maxexs']))
                # don't wait too long...
                cap_reached = True
                break
            world_dict.parley()
            # Compare against None: a tqdm bar's truth value depends on its
            # total, so `if pbar:` would skip a bar whose total is 0.
            if pbar is not None:
                pbar.update(1)
        if pbar is not None:
            pbar.close()
        if cap_reached:
            # The cap is global: no later datatype could contribute anything,
            # so do not create its task or repeat the message.
            break

    dictionary.save(opt['dict_file'], sort=True)
    logging.info(
        f'dictionary built with {len(dictionary)} tokens '
        f'in {log_time.total_time():.1f}s'
    )
    return dictionary
def build_dict(opt, skip_if_built=False):
    """
    Build (or load) the dictionary described by ``opt`` and save it to disk.

    :param opt: option mapping. Passing a ``ParlaiParser`` is still accepted
        but deprecated; it is parsed into options first. Reads ``dict_file``,
        ``dict_class``, ``task``, ``pytorch_teacher_task``,
        ``dict_include_valid``, ``dict_include_test``, ``dict_maxexs`` and
        ``log_every_n_secs``.
    :param skip_if_built: when true and ``opt['dict_file']`` already exists,
        return ``None`` without instantiating the dictionary agent.
    :return: ``None`` when ``dict_file`` is unset or the build was skipped;
        otherwise the dictionary agent (constructed while the file already
        exists, or freshly built and saved).
    :raises ValueError: if a dictionary has to be built while
        ``is_distributed()`` is true.
    """
    if isinstance(opt, ParlaiParser):
        print('[ Deprecated Warning: should be passed opt not Parser ]')
        opt = opt.parse_args()
    if not opt.get('dict_file'):
        print(
            'Tried to build dictionary but `--dict-file` is not set. Set '
            'this param so the dictionary can be saved.'
        )
        return
    if skip_if_built and os.path.isfile(opt['dict_file']):
        # Dictionary already built, skip all loading or setup
        print("[ dictionary already built .]")
        return None

    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)

    if os.path.isfile(opt['dict_file']):
        # Dictionary already built, return loaded dictionary agent
        print("[ dictionary already built .]")
        return dictionary

    # Only refuse once we know a build is actually required: a dictionary
    # file that already exists satisfies the "pre-built" requirement and has
    # been returned above.
    if is_distributed():
        raise ValueError('Dictionaries should be pre-built before distributed train.')

    # The build runs on a private copy of the options.
    ordered_opt = copy.deepcopy(opt)
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    # Set this to none so that image features are not calculated when Teacher is
    # instantiated while building the dict
    ordered_opt['image_mode'] = 'no_image_model'
    ordered_opt['pytorch_teacher_batch_sort'] = False
    if ordered_opt['task'] == 'pytorch_teacher' or not ordered_opt['task']:
        # Build from the task named by `pytorch_teacher_task` when given.
        pytorch_teacher_task = ordered_opt.get('pytorch_teacher_task', '')
        if pytorch_teacher_task != '':
            ordered_opt['task'] = pytorch_teacher_task

    # The train set is always used; valid/test only on request.
    datatypes = ['train:ordered:stream']
    if opt.get('dict_include_valid'):
        datatypes.append('valid:stream')
    if opt.get('dict_include_test'):
        datatypes.append('test:stream')

    # A falsy logging interval disables the progress bar entirely.
    log_every_n_secs = opt.get('log_every_n_secs', None)

    cnt = 0  # examples seen so far, shared across all datatypes
    cap_reached = False
    for dt in datatypes:
        ordered_opt['datatype'] = dt
        world_dict = create_task(ordered_opt, dictionary)
        # pass examples to dictionary
        print('[ running dictionary over data.. ]')
        # NOTE(review): the timer is re-created for every datatype, so the
        # duration printed at the end presumably covers only the last one --
        # confirm against TimeLogger.
        log_time = TimeLogger()
        total = world_dict.num_examples()
        if opt['dict_maxexs'] >= 0:
            total = min(total, opt['dict_maxexs'])

        if log_every_n_secs:
            pbar = tqdm.tqdm(
                total=total, desc='Building dictionary', unit='ex', unit_scale=True
            )
        else:
            pbar = None

        while not world_dict.epoch_done():
            cnt += 1
            if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] >= 0:
                print('Processed {} exs, moving on.'.format(opt['dict_maxexs']))
                # don't wait too long...
                cap_reached = True
                break
            world_dict.parley()
            # Compare against None: a tqdm bar's truth value depends on its
            # total, so `if pbar:` would skip a bar whose total is 0.
            if pbar is not None:
                pbar.update(1)
        if pbar is not None:
            pbar.close()
        if cap_reached:
            # The cap is global: no later datatype could contribute anything,
            # so do not create its task or repeat the message.
            break

    dictionary.save(opt['dict_file'], sort=True)
    print(
        '[ dictionary built with {} tokens in {}s ]'.format(
            len(dictionary), round(log_time.total_time(), 2)
        )
    )
    return dictionary
def build_dict(opt, skip_if_built=False):
    """
    Build (or load) the dictionary described by ``opt`` and save it to disk.

    :param opt: option mapping. Passing a ``ParlaiParser`` is still accepted
        but deprecated; it is parsed into options first. Reads ``dict_file``,
        ``dict_class``, ``task``, ``pytorch_teacher_task``,
        ``dict_include_valid``, ``dict_include_test``, ``dict_maxexs`` and
        ``log_every_n_secs``.
    :param skip_if_built: when true and ``opt['dict_file']`` already exists,
        return ``None`` without instantiating the dictionary agent.
    :return: ``None`` when ``dict_file`` is unset or the build was skipped;
        otherwise the dictionary agent (constructed while the file already
        exists, or freshly built and saved).
    """
    if isinstance(opt, ParlaiParser):
        print('[ Deprecated Warning: should be passed opt not Parser ]')
        opt = opt.parse_args()
    if not opt.get('dict_file'):
        print(
            'Tried to build dictionary but `--dict-file` is not set. Set '
            'this param so the dictionary can be saved.'
        )
        return
    if skip_if_built and os.path.isfile(opt['dict_file']):
        # Dictionary already built, skip all loading or setup
        print("[ dictionary already built .]")
        return None

    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)

    if os.path.isfile(opt['dict_file']):
        # Dictionary already built, return loaded dictionary agent
        print("[ dictionary already built .]")
        return dictionary

    # The build runs on a private copy of the options.
    ordered_opt = copy.deepcopy(opt)
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    ordered_opt['image_mode'] = 'none'
    if ordered_opt['task'] == 'pytorch_teacher':
        # Build from the task named by `pytorch_teacher_task` when given.
        pytorch_teacher_task = ordered_opt.get('pytorch_teacher_task', '')
        if pytorch_teacher_task != '':
            ordered_opt['task'] = pytorch_teacher_task

    # The train set is always used; valid/test only on request.
    datatypes = ['train:ordered:stream']
    if opt.get('dict_include_valid'):
        datatypes.append('valid:stream')
    if opt.get('dict_include_test'):
        datatypes.append('test:stream')

    # A non-positive interval means "never print progress".
    log_every_n_secs = opt.get('log_every_n_secs', -1)
    if log_every_n_secs <= 0:
        log_every_n_secs = float('inf')

    cnt = 0  # examples seen so far, shared across all datatypes
    cap_reached = False
    for dt in datatypes:
        ordered_opt['datatype'] = dt
        world_dict = create_task(ordered_opt, dictionary)
        # pass examples to dictionary
        print('[ running dictionary over data.. ]')
        # NOTE(review): the timer is re-created for every datatype, so the
        # duration printed at the end presumably covers only the last one --
        # confirm against TimeLogger.
        log_time = TimeLogger()
        while not world_dict.epoch_done():
            cnt += 1
            if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] > 0:
                print('Processed {} exs, moving on.'.format(opt['dict_maxexs']))
                # don't wait too long...
                cap_reached = True
                break
            world_dict.parley()
            if log_time.time() > log_every_n_secs:
                # The amount of work is bounded by the example cap when one
                # is active, so report progress against the smaller figure.
                total = world_dict.num_examples()
                if opt['dict_maxexs'] > 0:
                    total = min(total, opt['dict_maxexs'])
                # Carriage return so the progress text overwrites in place.
                sys.stdout.write('\r')
                text, _log = log_time.log(cnt, total)
                sys.stdout.write(text)
                sys.stdout.flush()
        if cap_reached:
            # The cap is global: no later datatype could contribute anything,
            # so do not create its task or repeat the message.
            break

    dictionary.save(opt['dict_file'], sort=True)
    print(
        '[ dictionary built with {} tokens in {}s ]'.format(
            len(dictionary), round(log_time.total_time(), 2)
        )
    )
    return dictionary