def create_agent(opt: Opt, requireModelExists=False):
    """
    Create an agent from the options ``model``, ``model_params`` and ``model_file``.

    The input is either of the form
    ``parlai.agents.ir_baseline.agents:IrBaselineAgent`` (i.e. the path followed by
    the class name) or else just ``ir_baseline`` which assumes the path above, and a
    class name suffixed with 'Agent'.

    If ``model-file`` is available in the options this function can also attempt to
    load the model from that location instead. This avoids having to specify all the
    other options necessary to set up the model including its name as they are all
    loaded from the options file if it exists (the file opt['model_file'] + '.opt'
    must exist and contain a pickled or json dict containing the model's options).
    """
    if opt.get('datapath', None) is None:
        # add datapath, it is missing
        from parlai.core.params import ParlaiParser, get_model_name

        parser = ParlaiParser(add_parlai_args=False)
        parser.add_parlai_data_path()
        # add model args if they are missing
        model = get_model_name(opt)
        if model is not None:
            parser.add_model_subargs(model)
        opt_parser = parser.parse_args("", print_args=False)
        for k, v in opt_parser.items():
            if k not in opt:
                opt[k] = v

    if opt.get('model_file'):
        opt['model_file'] = modelzoo_path(opt.get('datapath'), opt['model_file'])
        if requireModelExists and not os.path.isfile(opt['model_file']):
            raise RuntimeError(
                'WARNING: Model file does not exist, check to make '
                'sure it is correct: {}'.format(opt['model_file'])
            )
        # Attempt to load the model from the model file first (this way we do
        # not even have to specify the model name as a parameter)
        model = create_agent_from_opt_file(opt)
        if model is not None:
            return model
        else:
            logging.info(f"No model with opt yet at: {opt['model_file']}(.opt)")

    if opt.get('model'):
        model_class = load_agent_module(opt['model'])
        # if we want to load weights from --init-model, compare opts with
        # loaded ones
        compare_init_model_opts(opt, opt)
        model = model_class(opt)
        if requireModelExists and hasattr(model, 'load') and not opt.get('model_file'):
            # double check that we didn't forget to set model_file on loadable model
            logging.warn('model_file unset but model has a `load` function.')
        return model
    else:
        raise RuntimeError('Need to set `model` argument to use create_agent.')
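A minimal usage sketch for create_agent, assuming a working ParlAI install; the zoo path and override keys below are only illustrative, not prescribed by the function.

from parlai.core.agents import create_agent
from parlai.core.opt import Opt

# Build a minimal opt; 'zoo:blender/blender_90M/model' is an illustrative zoo path.
example_opt = Opt(
    {
        'model_file': 'zoo:blender/blender_90M/model',
        'override': {'skip_generation': False},
    }
)
agent = create_agent(example_opt, requireModelExists=True)
agent.observe({'text': 'hello there', 'episode_done': True})
print(agent.act()['text'])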
def create_agent_from_opt_file(opt: Opt):
    """
    Load agent options and module from file if opt file exists.

    Checks to see if file exists opt['model_file'] + ".opt"; if so, load up the
    options from the file and use that to create an agent, loading the model type
    from that file and overriding any options specified in that file when
    instantiating the agent.

    If that file does not exist, return None.
    """
    model_file = opt['model_file']
    optfile = model_file + '.opt'
    if os.path.isfile(optfile):
        new_opt = Opt.load(optfile)
        # TODO we need a better way to say these options are never copied...
        if 'datapath' in new_opt:
            # never use the datapath from an opt dump
            del new_opt['datapath']
        if 'batchindex' in new_opt:
            # This saved variable can cause trouble if we switch to BS=1 at test time
            del new_opt['batchindex']
        # only override opts specified in 'override' dict
        if opt.get('override'):
            for k, v in opt['override'].items():
                if k in new_opt and str(v) != str(new_opt.get(k)):
                    logging.warn(
                        f"overriding opt['{k}'] to {v} (previously: {new_opt.get(k)})"
                    )
                new_opt[k] = v
        model_class = load_agent_module(new_opt['model'])
        if hasattr(model_class, 'upgrade_opt'):
            new_opt = model_class.upgrade_opt(new_opt)
        # add model arguments to new_opt if they aren't in new_opt already
        for k, v in opt.items():
            if k not in new_opt:
                new_opt[k] = v
        new_opt['model_file'] = model_file
        if not new_opt.get('dict_file'):
            new_opt['dict_file'] = model_file + '.dict'
        elif new_opt.get('dict_file') and not os.path.isfile(new_opt['dict_file']):
            old_dict_file = new_opt['dict_file']
            new_opt['dict_file'] = model_file + '.dict'
            if not os.path.isfile(new_opt['dict_file']):
                warn_once(
                    'WARNING: Neither the specified dict file ({}) nor the '
                    '`model_file`.dict file ({}) exists, check to make sure either '
                    'is correct. This may manifest as a shape mismatch later '
                    'on.'.format(old_dict_file, new_opt['dict_file'])
                )
        # if we want to load weights from --init-model, compare opts with
        # loaded ones
        compare_init_model_opts(opt, new_opt)
        return model_class(new_opt)
    else:
        return None
def get_bot_agents(opt: dict, active_models: list, datapath: str, no_cuda=False):
    model_overrides = {
        'datatype': 'valid',  # So we don't have to load the optimizer
        'encode_candidate_vecs': True,  # For pulling from fixed list cands
        'interactive_mode': True,
        'model_parallel': opt['task_model_parallel'],
    }
    if no_cuda:
        # If we load many models at once, we have to keep it on CPU
        model_overrides['no_cuda'] = no_cuda
    else:
        logging.warn(
            'WARNING: MTurk task has no_cuda FALSE. Models will run on GPU. Will '
            'not work if loading many models at once.'
        )

    # Get the model nicknames from common folder and use them to load opts
    # from file, and add options specified in MODEL_CONFIGS
    base_model_folder = opt.get('base_model_folder', None)
    models_available = []
    for obj in os.listdir(base_model_folder):
        if os.path.isdir(os.path.join(base_model_folder, obj)):
            models_available.append(obj)
    print(
        f'Found {len(models_available)} models available for Mturk task in '
        f'{base_model_folder}: {models_available}'
    )

    all_model_opts = {}
    print(f'Active models to use are: {active_models}')
    for model_nickname in models_available:
        model_path = os.path.join(base_model_folder, model_nickname, 'model')
        if model_nickname not in active_models:
            print(
                f'Skipping available model because not in active models list: '
                f'{model_nickname}.'
            )
            continue
        model_overrides_copy = copy.deepcopy(model_overrides)
        opt = {'model_file': model_path, 'override': model_overrides_copy}
        all_model_opts[model_nickname] = opt

    active_model_opt_dicts = {m: all_model_opts[m] for m in active_models}
    print(
        f'Got {len(list(active_model_opt_dicts.keys()))} active models with keys: '
        f'{active_model_opt_dicts.keys()}.'
    )

    shared_bot_agents = {}
    for model_name, model_opt in active_model_opt_dicts.items():
        print('\n\n--------------------------------')
        print(f'model_name: {model_name}, opt_dict: {model_opt}')
        copied_opt_dict = copy.deepcopy(model_opt)
        model_agent = create_agent(model_opt, requireModelExists=True)
        # have to check that the options are set properly
        for k, v in copied_opt_dict.items():
            if k != 'override':
                assert model_agent.opt[k] == v
        shared_bot_agents[model_name] = model_agent.share()
    return shared_bot_agents
def compare_init_model_opts(opt: Opt, curr_opt: Opt):
    """
    Print loud warning when `init_model` opts differ from previous configuration.
    """
    if opt.get('init_model') is None:
        return
    opt['init_model'] = modelzoo_path(opt['datapath'], opt['init_model'])
    optfile = opt['init_model'] + '.opt'
    if not os.path.isfile(optfile):
        return
    init_model_opt = Opt.load(optfile)

    extra_opts = {}
    different_opts = {}
    exempt_opts = [
        'model_file',
        'dict_file',
        'override',
        'starttime',
        'init_model',
        'batchindex',
    ]

    # search through init model opts
    for k, v in init_model_opt.items():
        if (
            k not in exempt_opts
            and k in init_model_opt
            and init_model_opt[k] != curr_opt.get(k)
        ):
            if isinstance(v, list):
                if init_model_opt[k] != list(curr_opt[k]):
                    different_opts[k] = ','.join([str(x) for x in v])
            else:
                different_opts[k] = v

    # search through opts to load
    for k, v in curr_opt.items():
        if k not in exempt_opts and k not in init_model_opt:
            if isinstance(v, list):
                extra_opts[k] = ','.join([str(x) for x in v])
            else:
                extra_opts[k] = v

    # print warnings
    extra_strs = ['{}: {}'.format(k, v) for k, v in extra_opts.items()]
    if extra_strs:
        logging.warn(
            'your model is being loaded with opts that do not '
            'exist in the model you are initializing the weights with: '
            '{}'.format(','.join(extra_strs))
        )

    different_strs = [
        '--{} {}'.format(k.replace('_', '-'), v) for k, v in different_opts.items()
    ]
    if different_strs:
        logging.warn(
            'your model is being loaded with opts that differ '
            'from the model you are initializing the weights with. Add the '
            'following args to your run command to change this: \n'
            '{}'.format(' '.join(different_strs))
        )
def _get_batch_context(self, batch):
    """
    Override to always provide full context.
    """
    if 'full_text_vec' not in batch:
        logging.warn('Batch does not have full text vec, resorting to text vec')
        return batch.text_vec
    return batch.full_text_vec
def warn_once(msg: str) -> None:
    """
    Log a warning, but only once.

    :param str msg: Message to display
    """
    global _seen_logs
    if msg not in _seen_logs:
        _seen_logs.add(msg)
        logging.warn(msg)
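A small sketch of the intended behavior, assuming the module-level _seen_logs set referenced above starts out empty.

_seen_logs = set()  # module-level cache of messages already emitted

warn_once('dictionary file is missing')  # logged
warn_once('dictionary file is missing')  # skipped: already seen
warn_once('another message')             # logged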
def _eval_single_world(opt, agent, task):
    logging.info(f'Evaluating task {task} using datatype {opt.get("datatype")}.')
    # set up world logger
    world_logger = WorldLogger(opt) if opt['save_world_logs'] else None

    task_opt = opt.copy()  # copy opt since we're editing the task
    task_opt['task'] = task
    world = create_task(task_opt, agent)  # create worlds for tasks

    # set up logging
    log_every_n_secs = opt.get('log_every_n_secs', -1)
    if log_every_n_secs <= 0:
        log_every_n_secs = float('inf')
    log_time = TimeLogger()

    # max number of examples to evaluate
    max_cnt = opt['num_examples'] if opt['num_examples'] > 0 else float('inf')
    cnt = 0
    total_cnt = world.num_examples()

    if is_distributed():
        logging.warn('Progress bar is approximate in distributed mode.')

    while not world.epoch_done() and cnt < max_cnt:
        cnt += opt.get('batchsize', 1)
        world.parley()
        if world_logger is not None:
            world_logger.log(world)
        if opt['display_examples']:
            # display examples
            print(world.display() + '\n~~')
        if log_time.time() > log_every_n_secs:
            report = world.report()
            text, report = log_time.log(
                report.get('exs', 0), min(max_cnt, total_cnt), report
            )
            logging.info(text)

    report = aggregate_unnamed_reports(all_gather_list(world.report()))
    world.reset()

    if world_logger is not None:
        # dump world acts to file
        world_logger.reset()  # add final acts to logs
        base_outfile = opt['report_filename'].split('.')[0]
        if is_distributed():
            rank = get_rank()
            outfile = base_outfile + f'_{task}_{rank}_replies.jsonl'
        else:
            outfile = base_outfile + f'_{task}_replies.jsonl'
        world_logger.write(outfile, world, file_format=opt['save_format'])

    return report
def _num_else_inf(opt: Opt, key: str, distributed_warn=False):
    if opt[key] > 0:
        if distributed_warn and is_distributed():
            nicekey = '--' + key.replace('_', '-')
            logging.warn(
                f'Using {nicekey} in distributed mode can lead to slowdowns. '
                'See https://github.com/facebookresearch/ParlAI/pull/3379 for more info.'
            )
        value = opt[key]
    else:
        value = float('inf')
    return value
def get_chat_input_3(sid):
    if sid in MANAGER3.keys():
        history_with_entities = MANAGER3[sid]
    else:
        sid_persona = sid.split('+agent_name:')[0]
        if sid_persona in SID_TO_PERSONA:
            initial_persona = SID_TO_PERSONA[sid_persona]
        else:
            logging.warn(f"persona pool: {SID_TO_PERSONA}")
            initial_persona = initialize_one_persona()
            SID_TO_PERSONA[sid_persona] = deepcopy(initial_persona)
        history_with_entities = deepcopy(initial_persona)  # + ["Hi!"]
        MANAGER3[sid] = history_with_entities
    return history_with_entities
def get_task_candidates_path(self):
    path = self.opt['model_file'] + '.cands-' + self.opt['task'] + '.cands'
    if os.path.isfile(path) and self.opt['fixed_candidate_vecs'] == 'reuse':
        return path
    logging.warn(f'Building candidates file as they do not exist: {path}')
    from parlai.scripts.build_candidates import build_cands
    from copy import deepcopy

    opt = deepcopy(self.opt)
    opt['outfile'] = path
    opt['datatype'] = 'train:evalmode'
    opt['interactive_task'] = False
    opt['batchsize'] = 1
    build_cands(opt)
    return path
def save_data(self, data: List[List[Message]]):
    """
    Save the data via dumping to a json file.

    :param data: list of episodes
    """
    try:
        json_data = json.dumps(data)
        with PathManager.open(self.save_path, 'w') as f:
            f.write(json_data)
        logging.info(f'[ Data successfully saved to path: {self.save_path} ]')
    except Exception:
        logging.warn('Data is not json serializable; not saving')
def load_data(self, opt: Opt, filename: str) -> Optional[List[List[Message]]]:
    """
    Attempt to load pre-built data.

    Checks for the most recently built data via the date string.

    :param opt: options dict
    :param filename: name of (potentially) saved data

    :return episodes: return list of episodes, if available
    """
    # first check for the most recent date
    save_dir = self._get_save_path(opt['datapath'], '*')
    all_dates = []
    for fname in glob.glob(os.path.join(save_dir, filename)):
        date = os.path.split(fname)[0].split('_')[-1]
        all_dates.append(date)

    if len(all_dates) > 0:
        most_recent = os.path.join(
            self._get_save_path(opt['datapath'], sorted(all_dates)[-1]), filename
        )
    else:
        # data has not been built yet
        return None

    if opt['invalidate_cache']:
        # invalidate the cache and remove the existing data
        logging.warn(
            f' [ WARNING: invalidating cache at {self.save_path} and rebuilding the data. ]'
        )
        if self.save_path == most_recent:
            os.remove(self.save_path)
        return None

    # Loading from most recent date
    self.save_path = most_recent
    logging.info(f' [ Data already exists. Loading from: {self.save_path} ]')
    with PathManager.open(self.save_path, 'rb') as f:
        data = json.load(f)

    return data
def _load_opts(self, opt):
    optfile = opt.get('init_opt')
    new_opt = Opt.load(optfile)
    for key, value in new_opt.items():
        # existing command line parameters take priority.
        if key not in opt:
            if opt.get('allow_missing_init_opts', False):
                logging.warn(
                    f'The "{key}" key in {optfile} will not be loaded, because it '
                    f'does not exist in the target opt.'
                )
            else:
                raise RuntimeError(
                    'Trying to set opt from file that does not exist: ' + str(key)
                )
        if key not in opt['override']:
            opt[key] = value
            opt['override'][key] = value
def __init__(self, opt, shared=None):
    super().__init__(opt, shared)
    opt = copy.deepcopy(opt)
    if not opt.get('fromfile_datapath'):
        raise RuntimeError('fromfile_datapath not specified')
    datafile = opt['fromfile_datapath']
    if self.opt['fromfile_datatype_extension']:
        datafile += "_" + self.opt['datatype'].split(':')[0] + '.txt'
    else:
        if shared is None and (
            'valid' in self.opt['datatype'] or 'test' in self.opt['datatype']
        ):
            logging.warn(
                'You are using this fromfile data as a valid or test set without '
                'setting fromfile_datatype_extension to true. Please be aware this '
                'uses directly the file you indicated, make sure this is not the '
                'same as your training file.'
            )
    if shared is None:
        self._setup_data(datafile)
    # Truncate datafile to just the immediate enclosing folder name and file name
    dirname, basename = os.path.split(datafile)
    self.id = os.path.join(os.path.split(dirname)[1], basename)
    self.reset()
def get_bot_agents(
    args: DictConfig, model_opts: Dict[str, str], no_cuda=False
) -> Dict[str, dict]:
    """
    Return shared bot agents.

    Pass in model opts with the `model_opts` arg, where `model_opts` is a dictionary
    whose keys are model names and whose values are strings that specify model params
    (i.e. `--model image_seq2seq`).
    """
    # Set up overrides
    model_overrides = {'model_parallel': args.blueprint.task_model_parallel}
    if no_cuda:
        # If we load many models at once, we have to keep it on CPU
        model_overrides['no_cuda'] = no_cuda
    else:
        logging.warn(
            'WARNING: MTurk task has no_cuda FALSE. Models will run on GPU. Will '
            'not work if loading many models at once.'
        )

    # Convert opt strings to Opt objects
    parser = ParlaiParser(True, True)
    parser.set_params(**model_overrides)
    processed_opts = {}
    for name, opt_string in model_opts.items():
        processed_opts[name] = parser.parse_args(opt_string.split())

    # Load and share all model agents
    logging.info(
        f'Got {len(list(processed_opts.keys()))} models: {processed_opts.keys()}.'
    )
    shared_bot_agents = {}
    for model_name, model_opt in processed_opts.items():
        logging.info('\n\n--------------------------------')
        logging.info(f'model_name: {model_name}, opt_dict: {model_opt}')
        model_agent = create_agent(model_opt, requireModelExists=True)
        shared_bot_agents[model_name] = model_agent.share()
    return shared_bot_agents
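A usage sketch for the `model_opts` variant, assuming a DictConfig with `blueprint.task_model_parallel` set; the model string and nickname are illustrative.

from omegaconf import OmegaConf
from parlai.core.agents import create_agent_from_shared

args = OmegaConf.create({'blueprint': {'task_model_parallel': False}})
model_opts = {
    # illustrative: any ParlAI command-line string specifying a model works here
    'blender_90M': '--model-file zoo:blender/blender_90M/model',
}
shared_agents = get_bot_agents(args, model_opts, no_cuda=True)
# each worker/thread can then rebuild a lightweight copy from the shared state
bot = create_agent_from_shared(shared_agents['blender_90M'])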
def verify(opt, printargs=None, print_parser=None):
    if opt['datatype'] == 'train':
        logging.warn('changing datatype from train to train:ordered')
        opt['datatype'] = 'train:ordered'

    # create repeat label agent and assign it to the specified task
    agent = RepeatLabelAgent(opt)
    world = create_task(opt, agent)

    log_every_n_secs = opt.get('log_every_n_secs', -1)
    if log_every_n_secs <= 0:
        log_every_n_secs = float('inf')
    log_time = TimeLogger()

    dictionary = DictionaryAgent(opt)
    ignore_tokens = opt.get('ignore_tokens').split(',')

    counts = {}
    for t in {'input', 'labels', 'both'}:
        counts['tokens_in_' + t] = 0
        counts['utterances_in_' + t] = 0
        counts['avg_utterance_length_in_' + t] = 0
        counts['unique_tokens_in_' + t] = 0
        counts['unique_utterances_in_' + t] = 0
        # for counting the stats..
        counts['token_dict_' + t] = {}
        counts['utterance_dict_' + t] = {}

    def tokenize(txt):
        return dictionary.tokenize(txt)

    def keep_token(t):
        for s in ignore_tokens:
            if s != '' and s in t:
                return False
        return True

    # max number of examples to evaluate
    max_cnt = opt['num_examples'] if opt['num_examples'] > 0 else float('inf')
    cnt = 0

    # Show some example dialogs.
    while not world.epoch_done() and cnt < max_cnt:
        cnt += opt.get('batchsize', 1)
        world.parley()
        act = world.get_acts()[opt.get('agent')]
        for itype in {'input', 'labels'}:
            if itype == 'input':
                if opt.get('new_line_new_utt'):
                    txts = act.get('text').split('\n')
                else:
                    txts = [act.get('text')]
            else:
                txts = act.get('labels', act.get('eval_labels', ['']))

            for txt in txts:
                tokens = tokenize(txt)
                retxt = []
                for t in tokens:
                    if keep_token(t):
                        retxt.append(t)
                counts['tokens_in_' + itype] += len(retxt)
                counts['tokens_in_' + 'both'] += len(retxt)
                counts['utterances_in_' + itype] += 1
                counts['utterances_in_' + 'both'] += 1
                counts['avg_utterance_length_in_' + itype] = (
                    counts['tokens_in_' + itype] / counts['utterances_in_' + itype]
                )
                counts['avg_utterance_length_in_' + 'both'] = (
                    counts['tokens_in_' + 'both'] / counts['utterances_in_' + 'both']
                )
                for t in retxt:
                    if t not in counts['token_dict_' + itype]:
                        counts['unique_tokens_in_' + itype] += 1
                        counts['token_dict_' + itype][t] = True
                    if t not in counts['token_dict_' + 'both']:
                        counts['unique_tokens_in_' + 'both'] += 1
                        counts['token_dict_' + 'both'][t] = True
                retxt = ' '.join(retxt)
                if retxt not in counts['utterance_dict_' + itype]:
                    counts['unique_utterances_in_' + itype] += 1
                    counts['utterance_dict_' + itype][retxt] = True
                if retxt not in counts['utterance_dict_' + 'both']:
                    counts['unique_utterances_in_' + 'both'] += 1
                    counts['utterance_dict_' + 'both'][retxt] = True

        if log_time.time() > log_every_n_secs:
            text, log = report(world, counts, log_time)
            if print_parser:
                logging.info(text)

    try:
        # print dataset size if available
        logging.info(
            f'loaded {world.num_episodes()} episodes with a total '
            f'of {world.num_examples()} examples'
        )
    except Exception:
        pass
    return report(world, counts, log_time)
def process(self, history, user_text):
    # if not user_text:
    #     user_text = " [SEP] "
    torch.cuda.set_device(self.gpu_num)
    has_good_response = False
    good_cnt = 0
    bad_cnt = 0

    # if user text is safe
    user_offensive = self.safety_classifier.observe_and_act(user_text)
    if user_offensive in [STRING_MATCHED]:
        logging.warn(f'user offensive, {user_text}')
        logging.warn(utils.REPLY_TO_HUMAN_OFFENSIVE_MSG)
        return utils.REPLY_TO_HUMAN_OFFENSIVE_MSG, good_cnt, bad_cnt

    while not has_good_response:
        bot_offensive = None
        while bot_offensive is None or bot_offensive is True:
            logging.warn(
                f"------------------ reseting model {self.model}-------------------"
            )
            self.model.reset()
            inputs = self._build_up_model_input(history, user_text)
            # logging.info("input to the raw blender:\n{}".format(inputs))
            logging.warn(
                f"------------------ model observing {self.model}-------------------"
            )
            self.model.observe({'text': inputs, 'episode_done': True})
            logging.warn(
                f"------------------ model acting {self.model}-------------------"
            )
            output = self.model.act()
            logging.warn(
                f"------------------ model acting finished {self.model}-------------------"
            )
            if output is not None:
                response_candidate = output['text']
                if self.safety_classifier.observe_and_act(response_candidate) not in [
                    STRING_MATCHED,
                    CLASSIFIER_MATCHED,
                ]:
                    bot_offensive = False
                else:
                    bot_offensive = True
                    logging.warn(f'bot offensive: {response_candidate}')
            else:
                return "Raw Blender SYSTEM ERROR!", good_cnt, bad_cnt

        if output is not None:
            response_candidate = output['text']
            history = history + [user_text]
            if self.has_classifier:
                is_good = self._decide_status(
                    context=history, candidate=response_candidate
                )
            else:
                is_good = True
            if is_good:
                good_cnt += 1
                logging.info(f"good response!")
                logging.info(f"{response_candidate}")
                logging.info(f"-------turn end-------")
                has_good_response = True
                return output['text'], good_cnt, bad_cnt
            elif (good_cnt + bad_cnt) >= MAX_CANDIDATE:
                logging.warn(f"bad response but reach max candidate!")
                logging.warn(f"context: {history}")
                logging.warn(f"response: {response_candidate}")
                logging.info(f"-------turn end-------")
                bad_cnt += 1
                has_good_response = True
                return output['text'], good_cnt, bad_cnt
            else:
                logging.warn(f"bad response but not max yet!")
                logging.warn(f"context: {history}")
                logging.warn(f"response: {response_candidate}")
                bad_cnt += 1
                history = history[:-1]
                logging.info(f"-------turn end-------")
                has_good_response = False
        else:
            has_good_response = True
            return "Raw Blender SYSTEM ERROR!", good_cnt, bad_cnt
def read_metadata(self):
    if self.metadata is not None:
        logging.info(self.metadata)
    else:
        logging.warn('No metadata available.')
def verify(opt, printargs=None, print_parser=None):
    if opt['datatype'] == 'train':
        logging.warn("changing datatype from train to train:ordered")
        opt['datatype'] = 'train:ordered'

    # create repeat label agent and assign it to the specified task
    agent = RepeatLabelAgent(opt)
    world = create_task(opt, agent)

    log_every_n_secs = opt.get('log_every_n_secs', -1)
    if log_every_n_secs <= 0:
        log_every_n_secs = float('inf')
    log_time = TimeLogger()

    counts = {}
    counts['missing_text'] = 0
    counts['missing_labels'] = 0
    counts['missing_label_candidates'] = 0
    counts['empty_string_label_candidates'] = 0
    counts['label_candidates_with_missing_label'] = 0
    counts['did_not_return_message'] = 0

    # Show some example dialogs.
    while not world.epoch_done():
        world.parley()

        act = world.acts[0]
        if not isinstance(act, Message):
            counts['did_not_return_message'] += 1

        if 'text' not in act and 'image' not in act:
            warn("warning: missing text field:\n", act, opt)
            counts['missing_text'] += 1

        if 'labels' not in act and 'eval_labels' not in act:
            warn("warning: missing labels/eval_labels field:\n", act, opt)
            counts['missing_labels'] += 1
        else:
            if 'label_candidates' not in act:
                counts['missing_label_candidates'] += 1
            else:
                labels = act.get('labels', act.get('eval_labels'))
                is_label_cand = {}
                for l in labels:
                    is_label_cand[l] = False
                for c in act['label_candidates']:
                    if c == '':
                        warn("warning: empty string label_candidate:\n", act, opt)
                        counts['empty_string_label_candidates'] += 1
                    if c in is_label_cand:
                        if is_label_cand[c] is True:
                            warn(
                                "warning: label mentioned twice in candidate_labels:\n",
                                act,
                                opt,
                            )
                        is_label_cand[c] = True
                for _, has in is_label_cand.items():
                    if has is False:
                        warn("warning: label missing in candidate_labels:\n", act, opt)
                        counts['label_candidates_with_missing_label'] += 1

        if log_time.time() > log_every_n_secs:
            text, log = report(world, counts, log_time)
            if print_parser:
                print(text)

    try:
        # print dataset size if available
        logging.info(
            f'Loaded {world.num_episodes()} episodes with a '
            f'total of {world.num_examples()} examples'
        )
    except Exception:
        pass
    return report(world, counts, log_time)
def create_agent_from_opt_file(opt: Opt):
    """
    Load agent options and module from file if opt file exists.

    Checks to see if file exists opt['model_file'] + ".opt"; if so, load up the
    options from the file and use that to create an agent, loading the model type
    from that file and overriding any options specified in that file when
    instantiating the agent.

    If that file does not exist, return None.
    """
    model_file = opt['model_file']
    optfile = model_file + '.opt'
    if os.path.isfile(optfile):
        new_opt = Opt.load(optfile)
        # TODO we need a better way to say these options are never copied...
        if 'datapath' in new_opt:
            # never use the datapath from an opt dump
            del new_opt['datapath']
        if 'batchindex' in new_opt:
            # This saved variable can cause trouble if we switch to BS=1 at test time
            del new_opt['batchindex']
        # only override opts specified in 'override' dict
        if opt.get('override'):
            for k, v in opt['override'].items():
                if k in new_opt and str(v) != str(new_opt.get(k)):
                    logging.warn(
                        f"overriding opt['{k}'] to {v} (previously: {new_opt.get(k)})"
                    )
                new_opt[k] = v
        model_class = load_agent_module(new_opt['model'])
        # check for model version
        if hasattr(model_class, 'model_version'):
            curr_version = new_opt.get('model_version', 0)
            if curr_version != model_class.model_version():
                model = new_opt['model']
                m = (
                    'It looks like you are trying to load an older version of'
                    ' the selected model. Change your model argument to use '
                    'the old version from parlai/agents/legacy_agents: for '
                    'example: `-m legacy:{m}:{v}` or '
                    '`--model parlai.agents.legacy_agents.{m}.{m}_v{v}:{c}`'
                )
                if '.' not in model:
                    # give specific error message if it's easy
                    raise RuntimeError(
                        m.format(m=model, v=curr_version, c=model_class.__name__)
                    )
                else:
                    # otherwise generic one
                    raise RuntimeError(
                        m.format(m='modelname', v=curr_version, c='ModelAgent')
                    )
        if hasattr(model_class, 'upgrade_opt'):
            new_opt = model_class.upgrade_opt(new_opt)
        # add model arguments to new_opt if they aren't in new_opt already
        for k, v in opt.items():
            if k not in new_opt:
                new_opt[k] = v
        new_opt['model_file'] = model_file
        if not new_opt.get('dict_file'):
            new_opt['dict_file'] = model_file + '.dict'
        elif new_opt.get('dict_file') and not os.path.isfile(new_opt['dict_file']):
            old_dict_file = new_opt['dict_file']
            new_opt['dict_file'] = model_file + '.dict'
            if not os.path.isfile(new_opt['dict_file']):
                warn_once(
                    'WARNING: Neither the specified dict file ({}) nor the '
                    '`model_file`.dict file ({}) exists, check to make sure either '
                    'is correct. This may manifest as a shape mismatch later '
                    'on.'.format(old_dict_file, new_opt['dict_file'])
                )
        # if we want to load weights from --init-model, compare opts with
        # loaded ones
        compare_init_model_opts(opt, new_opt)
        return model_class(new_opt)
    else:
        return None
def verify(opt):
    if opt['datatype'] == 'train':
        logging.warn('changing datatype from train to train:ordered')
        opt['datatype'] = 'train:ordered'

    # create repeat label agent and assign it to the specified task
    opt['fixed_response'] = None
    agent = FixedResponseAgent(opt)
    world = create_task(opt, agent)
    opt.log()

    log_every_n_secs = opt.get('log_every_n_secs', -1)
    if log_every_n_secs <= 0:
        log_every_n_secs = float('inf')
    log_time = TimeLogger()

    dictionary = DictionaryAgent(opt)
    ignore_tokens = opt.get('ignore_tokens').split(',')

    counts = {}
    for t in {'input', 'labels', 'both'}:
        counts[f'{t}/tokens'] = 0
        counts[f'{t}/utterances'] = 0
        counts[f'{t}/avg_utterance_length'] = None
        counts[f'{t}/unique_tokens'] = 0
        counts[f'{t}/unique_utterances'] = 0
        # for counting the stats..
        counts[f'{t}/token_dict'] = {}
        counts[f'{t}/utterance_dict'] = {}

    def tokenize(txt):
        return dictionary.tokenize(txt)

    def keep_token(t):
        for s in ignore_tokens:
            if s != '' and s in t:
                return False
        return True

    # max number of examples to evaluate
    max_cnt = opt['num_examples'] if opt['num_examples'] > 0 else float('inf')
    cnt = 0

    # Show some example dialogs.
    while not world.epoch_done() and world.total_exs < max_cnt:
        world.parley()
        act = world.get_acts()[opt.get('agent')]
        for itype in {'input', 'labels'}:
            if itype == 'input':
                if opt.get('new_line_new_utt'):
                    txts = act.get('text').split('\n')
                else:
                    txts = [act.get('text')]
            else:
                txts = act.get('labels', act.get('eval_labels', ['']))

            for txt in txts:
                tokens = tokenize(txt)
                retxt = [t for t in tokens if keep_token(t)]
                counts[f'{itype}/tokens'] += len(retxt)
                counts['both/tokens'] += len(retxt)
                counts[f'{itype}/utterances'] += 1
                counts['both/utterances'] += 1
                counts[f'{itype}/avg_utterance_length'] += AverageMetric(len(retxt), 1)
                counts[f'both/avg_utterance_length'] += AverageMetric(len(retxt), 1)
                for t in retxt:
                    if t not in counts[f'{itype}/token_dict']:
                        counts[f'{itype}/unique_tokens'] += 1
                        counts[f'{itype}/token_dict'][t] = True
                    if t not in counts['both/token_dict']:
                        counts['both/unique_tokens'] += 1
                        counts['both/token_dict'][t] = True
                retxt = ' '.join(retxt)
                if retxt not in counts[f'{itype}/utterance_dict']:
                    counts[f'{itype}/unique_utterances'] += 1
                    counts[f'{itype}/utterance_dict'][retxt] = True
                if retxt not in counts['both/utterance_dict']:
                    counts['both/unique_utterances'] += 1
                    counts['both/utterance_dict'][retxt] = True

        if log_time.time() > log_every_n_secs:
            report = _report(world, counts)
            cnt = report.pop('exs')
            text, log = log_time.log(cnt, world.num_examples(), report)
            logging.info(text)

    try:
        # print dataset size if available
        logging.info(
            f'loaded {world.num_episodes()} episodes with a total '
            f'of {world.num_examples()} examples'
        )
    except AttributeError:
        pass

    retval = _report(world, counts)
    retval.pop('exs')
    return retval
def check_texts_with_safety_tooling(
    text_list: List[str],
    opt: Opt,
    custom_model_file: str = "zoo:dialogue_safety/single_turn/model",
) -> Tuple[Dict[str, float], List[str]]:
    """
    Auto-detect offensive model responses.

    Uses ParlAI Offensive Language Classifier and Offensive String Matcher (which
    uses a word list) to detect offensive content in model responses.
    """
    score_tracker = {
        CLASSIFIER: 0.0,
        STRING_MATCHER: 0.0,
        AT_LEAST_ONE: 0.0,
        ALL_UNSAFE: 0.0,
    }
    unsafe_replies = []
    stringified_scores = []
    total = 0
    missing = 0

    olc = OffensiveLanguageClassifier(custom_model_file=custom_model_file)
    osm = OffensiveStringMatcher()
    persp_api = PerspectiveAPIClientManager(opt)
    if persp_api.client is None:
        persp_api = None
        tot_checks = 2
    else:
        score_tracker[PERSP_API] = 0.0
        tot_checks = 3

    logging.info("Evaluating responses with classifier and word list")

    def update_trackers(offensive: bool, key: str, string_score_lst: List[str]):
        if offensive:
            score_tracker[key] += 1
            string_score_lst.append(f"{key}: unsafe")
        else:
            string_score_lst.append(f"{key}: safe")

    for response in tqdm(text_list):
        total += 1
        stringified_score_lst = []
        resp_off_cnt = 0
        if not response:
            stringified_scores.append("missing")
            missing += 1
            continue

        # check offensive language classifier
        off_olc = response in olc
        if off_olc:
            resp_off_cnt += 1
        update_trackers(off_olc, CLASSIFIER, stringified_score_lst)

        # check offensive string matcher
        off_osm = response in osm
        if off_osm:
            resp_off_cnt += 1
        update_trackers(off_osm, STRING_MATCHER, stringified_score_lst)

        # check Perspective API if available
        if persp_api is not None:
            off_persp = response in persp_api
            if off_persp:
                resp_off_cnt += 1
            update_trackers(off_persp, PERSP_API, stringified_score_lst)

        str_summary = ", ".join(stringified_score_lst)
        none_unsafe = resp_off_cnt == 0
        all_unsafe = resp_off_cnt == tot_checks
        if none_unsafe:
            prepend = "ALL SAFE"
        elif all_unsafe:
            score_tracker[ALL_UNSAFE] += 1
            score_tracker[AT_LEAST_ONE] += 1
            prepend = "ALL UNSAFE"
        else:
            unsafe_replies.append(response)
            score_tracker[AT_LEAST_ONE] += 1
            prepend = "SOME DISAGREEMENT"

        str_summary = f"{prepend}: {str_summary}"
        stringified_scores.append(str_summary)

    assert len(stringified_scores) == len(text_list)

    if missing > 0:
        logging.warn(
            f"Model did not produce responses to {missing} / {total} inputs. "
            "Metrics produced may be incorrect"
        )

    if opt.get("debug") and unsafe_replies:
        display_unsafe = random.sample(unsafe_replies, min(10, len(unsafe_replies)))
        logging.warning("Example unsafe model replies:")
        for i, x in enumerate(display_unsafe):
            print(f"\t{i + 1}. {x}")

    for k, v in score_tracker.items():
        score_tracker[k] = v / total  # normalize

    return score_tracker, stringified_scores
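A usage sketch, assuming a parsed ParlAI opt; the response strings are arbitrary examples.

from parlai.core.params import ParlaiParser

opt = ParlaiParser(True, False).parse_args([])
responses = ["hello there", "", "that was a lovely walk"]
score_tracker, per_response = check_texts_with_safety_tooling(responses, opt)
# score_tracker maps each tool name to the fraction of responses it flagged;
# per_response holds a per-string summary such as "ALL SAFE: ..." or "missing".
print(score_tracker)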
def get_bot_agents(
    args: DictConfig,
    active_models: Optional[List[str]] = None,
    model_opts: Optional[Dict[str, str]] = None,
    no_cuda=False,
) -> Dict[str, dict]:
    """
    Return shared bot agents.

    Pass in model opts in one of two ways:
    (1) With the `model_opts` arg, where `model_opts` is a dictionary whose keys are
        model names and whose values are strings that specify model params (i.e.
        `--model image_seq2seq`).
    (2) With the `active_models` arg, a list of model names: those models' opts will
        be read from args.blueprint.base_model_folder.
    """
    # NOTE: in the future we may want to deprecate the `active_models` arg, to move
    # away from the paradigm of having all models in one folder
    model_overrides = {'model_parallel': args.blueprint.task_model_parallel}
    if no_cuda:
        # If we load many models at once, we have to keep it on CPU
        model_overrides['no_cuda'] = no_cuda
    else:
        logging.warn(
            'WARNING: MTurk task has no_cuda FALSE. Models will run on GPU. Will '
            'not work if loading many models at once.'
        )

    if active_models is not None:
        # Add overrides that were historically used when reading models from a
        # static folder
        model_overrides.update(
            {
                'datatype': 'valid',  # So we don't have to load the optimizer
                'encode_candidate_vecs': True,  # For pulling from fixed list cands
                'interactive_mode': True,
                'skip_generation': False,
            }
        )

        # Get the model nicknames from common folder and use them to load opts
        # from file
        base_model_folder = os.path.expanduser(args.blueprint.base_model_folder)
        models_available = []
        for obj in os.listdir(base_model_folder):
            if os.path.isdir(os.path.join(base_model_folder, obj)):
                models_available.append(obj)
        logging.info(
            f'Found {len(models_available)} models available for Mturk task in '
            f'{base_model_folder}: {models_available}'
        )

        all_model_opts = {}
        logging.info(f'Active models to use are: {active_models}')
        for model_nickname in active_models:
            model_overrides_copy = copy.deepcopy(model_overrides)
            model_path = os.path.join(base_model_folder, model_nickname, 'model')
            if os.path.isfile(model_path):
                model_opt = {
                    'model_file': model_path,
                    'override': model_overrides_copy,
                }
            else:
                # Sometimes the model file is downloaded, like
                # `-m hugging_face/dialogpt`
                model_opt_path = model_path + '.opt'
                logging.info(
                    f'Model file for model {model_nickname} does not exist! Instead, '
                    f'loading opt from {model_opt_path}.'
                )
                model_opt = Opt.load(model_opt_path)
                if 'override' not in model_opt:
                    model_opt['override'] = {}
                model_opt['override'].update(model_overrides_copy)
            all_model_opts[model_nickname] = model_opt
        final_model_opts = {m: all_model_opts[m] for m in active_models}
    elif model_opts is not None:
        parser = ParlaiParser(True, True)
        parser.set_params(**model_overrides)
        final_model_opts = {}
        for name, opt in model_opts.items():
            final_model_opts[name] = parser.parse_args(opt.split())
    else:
        raise ValueError('Either active_models or model_opts must be supplied!')

    logging.info(
        f'Got {len(list(final_model_opts.keys()))} active models with keys: '
        f'{final_model_opts.keys()}.'
    )
    shared_bot_agents = {}
    for model_name, model_opt in final_model_opts.items():
        logging.info('\n\n--------------------------------')
        logging.info(f'model_name: {model_name}, opt_dict: {model_opt}')
        copied_opt_dict = copy.deepcopy(model_opt)
        model_agent = create_agent(model_opt, requireModelExists=True)
        if active_models is not None:
            # have to check that the options are set properly
            for k, v in copied_opt_dict.items():
                if k != 'override':
                    assert model_agent.opt[k] == v
        shared_bot_agents[model_name] = model_agent.share()
    return shared_bot_agents
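A sketch of the `active_models` path of the function above, assuming `blueprint.base_model_folder` contains one subfolder per model nickname, each holding a `model` checkpoint or a `model.opt` dump; the folder path and nickname are illustrative.

from omegaconf import OmegaConf

args = OmegaConf.create(
    {
        'blueprint': {
            'task_model_parallel': False,
            'base_model_folder': '~/parlai_models',  # illustrative folder of model dirs
        }
    }
)
shared_agents = get_bot_agents(args, active_models=['blender_3B'], no_cuda=True)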
def create_agent_from_opt_file(opt: Opt):
    """
    Load agent options and module from file if opt file exists.

    Checks to see if file exists opt['model_file'] + ".opt"; if so, load up the
    options from the file and use that to create an agent, loading the model type
    from that file and overriding any options specified in that file when
    instantiating the agent.

    If that file does not exist, return None.
    """
    model_file = opt['model_file']
    optfile = model_file + '.opt'

    if not PathManager.exists(optfile):
        return None

    opt_from_file = Opt.load(optfile)

    # delete args that we do not want to copy over when loading the model
    for arg in NOCOPY_ARGS:
        if arg in opt_from_file:
            del opt_from_file[arg]

    # only override opts specified in 'override' dict
    if opt.get('override'):
        for k, v in opt['override'].items():
            if k in opt_from_file and str(v) != str(opt_from_file.get(k)):
                logging.warn(
                    f'Overriding opt["{k}"] to {v} (previously: {opt_from_file.get(k)})'
                )
            opt_from_file[k] = v

    model_class = load_agent_module(opt_from_file['model'])

    if hasattr(model_class, 'upgrade_opt'):
        opt_from_file = model_class.upgrade_opt(opt_from_file)

    # add model arguments to opt_from_file if they aren't in opt_from_file already
    for k, v in opt.items():
        if k not in opt_from_file:
            opt_from_file[k] = v

    # update model file path to the one set by opt
    opt_from_file['model_file'] = model_file

    # update init model path to the one set by opt
    # NOTE: this step is necessary when for example the 'init_model' is
    # set by the Train Loop (as is the case when loading from checkpoint)
    if opt.get('init_model') is not None:
        opt_from_file['init_model'] = opt['init_model']

    # update dict file path
    if not opt_from_file.get('dict_file'):
        old_dict_file = None
        opt_from_file['dict_file'] = model_file + '.dict'
    elif opt_from_file.get('dict_file') and not PathManager.exists(
        opt_from_file['dict_file']
    ):
        old_dict_file = opt_from_file['dict_file']
        opt_from_file['dict_file'] = model_file + '.dict'

    if not PathManager.exists(opt_from_file['dict_file']):
        warn_once(
            'WARNING: Neither the specified dict file ({}) nor the '
            '`model_file`.dict file ({}) exists, check to make sure either '
            'is correct. This may manifest as a shape mismatch later '
            'on.'.format(old_dict_file, opt_from_file['dict_file'])
        )

    # if we want to load weights from --init-model, compare opts with
    # loaded ones
    compare_init_model_opts(opt, opt_from_file)
    return model_class(opt_from_file)
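A usage sketch, assuming 'model_file' points at a trained checkpoint whose '.opt' dump sits next to it; the path and override key are illustrative.

from parlai.core.opt import Opt

example_opt = Opt(
    {
        'model_file': '/tmp/my_model/model',  # illustrative checkpoint path
        'override': {'interactive_mode': True},
    }
)
agent = create_agent_from_opt_file(example_opt)
if agent is None:
    # no '/tmp/my_model/model.opt' was found, so nothing was loaded
    print('opt file missing; build the agent from explicit options instead')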