def build_dict(opt, skip_if_built=False):
    """Build the dictionary for ``opt`` from task data and save it to disk.

    Examples from the training split (plus the validation / test splits when
    ``dict_include_valid`` / ``dict_include_test`` are set) are run through
    the dictionary agent, which is then saved to ``opt['dict_file']``.

    :param opt: options mapping. Passing a ``ParlaiParser`` is deprecated but
        still accepted; it is parsed into options first.
    :param skip_if_built: when true and ``opt['dict_file']`` already exists,
        return immediately without constructing a dictionary agent.

    :returns: the dictionary agent, or ``None`` when ``dict_file`` is not set
        or when ``skip_if_built`` short-circuits.

    :raises ValueError: if called while ``is_distributed()`` is true and the
        dictionary still has to be constructed.
    """
    if isinstance(opt, ParlaiParser):
        print('[ Deprecated Warning: should be passed opt not Parser ]')
        opt = opt.parse_args()
    if not opt.get('dict_file'):
        print('Tried to build dictionary but `--dict-file` is not set. Set ' +
              'this param so the dictionary can be saved.')
        return None
    if skip_if_built and os.path.isfile(opt['dict_file']):
        # Dictionary already built, skip all loading or setup
        print("[ dictionary already built .]")
        return None

    if is_distributed():
        raise ValueError(
            'Dictionaries should be pre-built before distributed train.')

    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)

    if os.path.isfile(opt['dict_file']):
        # Dictionary already built, return loaded dictionary agent
        print("[ dictionary already built .]")
        return dictionary

    # Work on a copy so the caller's options are left untouched.
    ordered_opt = copy.deepcopy(opt)
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    ordered_opt['image_mode'] = 'none'
    ordered_opt['pytorch_teacher_batch_sort'] = False
    if ordered_opt['task'] == 'pytorch_teacher' or not ordered_opt['task']:
        # Fall back to the task wrapped by the pytorch teacher, if one is set.
        pytorch_teacher_task = ordered_opt.get('pytorch_teacher_task', '')
        if pytorch_teacher_task != '':
            ordered_opt['task'] = pytorch_teacher_task

    # we use train set to build dictionary
    datatypes = ['train:ordered:stream']
    if opt.get('dict_include_valid'):
        datatypes.append('valid:stream')
    if opt.get('dict_include_test'):
        datatypes.append('test:stream')

    max_exs = opt['dict_maxexs']  # negative means "no cap"
    show_progress = bool(opt.get('log_every_n_secs', None))

    # One timer for the whole build so the final message covers every split,
    # not only the last one processed.
    log_time = TimeLogger()
    # The counter is shared across splits: the cap applies to the total.
    cnt = 0
    for dt in datatypes:
        ordered_opt['datatype'] = dt
        world_dict = create_task(ordered_opt, dictionary)
        # pass examples to dictionary
        print('[ running dictionary over data.. ]')
        total = world_dict.num_examples()
        if max_exs >= 0:
            total = min(total, max_exs)

        pbar = None
        if show_progress:
            pbar = tqdm.tqdm(total=total, desc='Building dictionary',
                             unit='ex', unit_scale=True)
        try:
            while not world_dict.epoch_done():
                cnt += 1
                if 0 <= max_exs < cnt:
                    print('Processed {} exs, moving on.'.format(max_exs))
                    # don't wait too long...
                    break
                world_dict.parley()
                # Compare against None: a tqdm bar's truth value is derived
                # from its total, so `if pbar:` would skip a zero-total bar.
                if pbar is not None:
                    pbar.update(1)
        finally:
            # Release the bar even if parley() raises.
            if pbar is not None:
                pbar.close()

    dictionary.save(opt['dict_file'], sort=True)
    print('[ dictionary built with {} tokens in {}s ]'.format(
        len(dictionary), round(log_time.total_time(), 2)))
    return dictionary
def build_dict(opt, skip_if_built=False):
    """Build the dictionary for ``opt`` from task data and save it to disk.

    Examples from the training split (plus the validation / test splits when
    ``dict_include_valid`` / ``dict_include_test`` are set) are run through
    the dictionary agent, which is then saved to ``opt['dict_file']``.
    Progress is written to stdout at most every ``log_every_n_secs`` seconds
    when that option is positive.

    :param opt: options mapping. Passing a ``ParlaiParser`` is deprecated but
        still accepted; it is parsed into options first.
    :param skip_if_built: when true and ``opt['dict_file']`` already exists,
        return immediately without constructing a dictionary agent.

    :returns: the dictionary agent, or ``None`` when ``dict_file`` is not set
        or when ``skip_if_built`` short-circuits.
    """
    if isinstance(opt, ParlaiParser):
        print('[ Deprecated Warning: should be passed opt not Parser ]')
        opt = opt.parse_args()
    if not opt.get('dict_file'):
        print('Tried to build dictionary but `--dict-file` is not set. Set ' +
              'this param so the dictionary can be saved.')
        return None
    if skip_if_built and os.path.isfile(opt['dict_file']):
        # Dictionary already built, skip all loading or setup
        print("[ dictionary already built .]")
        return None

    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)

    if os.path.isfile(opt['dict_file']):
        # Dictionary already built, return loaded dictionary agent
        print("[ dictionary already built .]")
        return dictionary

    # Work on a copy so the caller's options are left untouched.
    ordered_opt = copy.deepcopy(opt)
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    ordered_opt['image_mode'] = 'none'
    if ordered_opt['task'] == 'pytorch_teacher':
        # Use the task wrapped by the pytorch teacher, if one is set.
        pytorch_teacher_task = ordered_opt.get('pytorch_teacher_task', '')
        if pytorch_teacher_task != '':
            ordered_opt['task'] = pytorch_teacher_task

    # we use train set to build dictionary
    datatypes = ['train:ordered:stream']
    if opt.get('dict_include_valid'):
        datatypes.append('valid:stream')
    if opt.get('dict_include_test'):
        datatypes.append('test:stream')

    # Zero or negative means "no cap"; a missing key is treated as no cap.
    max_exs = opt.get('dict_maxexs', 0)

    # Missing, None, zero or negative all disable periodic progress output.
    log_every_n_secs = opt.get('log_every_n_secs', -1)
    if not log_every_n_secs or log_every_n_secs <= 0:
        log_every_n_secs = float('inf')

    # The counter is shared across splits: the cap applies to the total.
    cnt = 0
    for dt in datatypes:
        ordered_opt['datatype'] = dt
        world_dict = create_task(ordered_opt, dictionary)
        # pass examples to dictionary
        print('[ running dictionary over data.. ]')
        # NOTE(review): the timer is recreated per split, so the time in the
        # final message reflects only the last split - confirm this is wanted.
        log_time = TimeLogger()
        while not world_dict.epoch_done():
            cnt += 1
            if 0 < max_exs < cnt:
                print('Processed {} exs, moving on.'.format(max_exs))
                # don't wait too long...
                break
            world_dict.parley()
            if log_time.time() > log_every_n_secs:
                # The loop stops at the cap, so the progress total is the
                # smaller of the cap and the dataset size, not the larger.
                total = world_dict.num_examples()
                if max_exs > 0:
                    total = min(total, max_exs)
                text, _ = log_time.log(cnt, total)
                sys.stdout.write('\r')
                sys.stdout.write(text)
                sys.stdout.flush()

    dictionary.save(opt['dict_file'], sort=True)
    print('[ dictionary built with {} tokens in {}s ]'.format(
        len(dictionary), round(log_time.total_time(), 2)))
    return dictionary