def create_agent_from_opt_file(opt: Opt):
    """
    Load agent options and module from file if opt file exists.

    Checks to see if file exists opt['model_file'] + ".opt"; if so, load up the
    options from the file and use that to create an agent, loading the model
    type from that file and overriding any options specified in that file when
    instantiating the agent.

    If that file does not exist, return None.
    """
    model_file = opt['model_file']
    optfile = model_file + '.opt'
    if os.path.isfile(optfile):
        new_opt = load_opt_file(optfile)
        # TODO we need a better way to say these options are never copied...
        if 'datapath' in new_opt:
            # never use the datapath from an opt dump
            del new_opt['datapath']
        if 'batchindex' in new_opt:
            # This saved variable can cause trouble if we switch to BS=1 at test time
            del new_opt['batchindex']
        # only override opts specified in 'override' dict
        if opt.get('override'):
            for k, v in opt['override'].items():
                if str(v) != str(new_opt.get(k, None)):
                    print(
                        "[ warning: overriding opt['{}'] to {} "
                        "(previously: {}) ]".format(k, v, new_opt.get(k, None))
                    )
                new_opt[k] = v

        model_class = load_agent_module(new_opt['model'])

        # check for model version
        if hasattr(model_class, 'model_version'):
            curr_version = new_opt.get('model_version', 0)
            if curr_version != model_class.model_version():
                model = new_opt['model']
                m = (
                    'It looks like you are trying to load an older version of'
                    ' the selected model. Change your model argument to use '
                    'the old version from parlai/agents/legacy_agents: for '
                    'example: `-m legacy:{m}:{v}` or '
                    '`--model parlai.agents.legacy_agents.{m}.{m}_v{v}:{c}`'
                )
                if '.' not in model:
                    # give a specific error message if it's easy
                    raise RuntimeError(
                        m.format(m=model, v=curr_version, c=model_class.__name__)
                    )
                else:
                    # otherwise a generic one
                    raise RuntimeError(
                        m.format(m='modelname', v=curr_version, c='ModelAgent')
                    )

        if hasattr(model_class, 'upgrade_opt'):
            new_opt = model_class.upgrade_opt(new_opt)

        # add model arguments to new_opt if they aren't in new_opt already
        for k, v in opt.items():
            if k not in new_opt:
                new_opt[k] = v
        new_opt['model_file'] = model_file
        if not new_opt.get('dict_file'):
            new_opt['dict_file'] = model_file + '.dict'
        elif new_opt.get('dict_file') and not os.path.isfile(new_opt['dict_file']):
            old_dict_file = new_opt['dict_file']
            new_opt['dict_file'] = model_file + '.dict'
            if not os.path.isfile(new_opt['dict_file']):
                warn_once(
                    'WARNING: Neither the specified dict file ({}) nor the '
                    '`model_file`.dict file ({}) exists, check to make sure either '
                    'is correct. This may manifest as a shape mismatch later '
                    'on.'.format(old_dict_file, new_opt['dict_file'])
                )

        # if we want to load weights from --init-model, compare opts with
        # loaded ones
        compare_init_model_opts(opt, new_opt)
        return model_class(new_opt)
    else:
        return None
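# A minimal standalone sketch (hypothetical values, not part of the original
# file) of the override-merge behavior above: options saved in the .opt dump
# win over plain runtime options, unless they are explicitly listed in
# opt['override'].

saved_opt = {'model': 'transformer/generator', 'lr': 1e-3, 'batchsize': 32}
runtime_opt = {'override': {'lr': 5e-4}, 'batchsize': 64}

for k, v in runtime_opt.get('override', {}).items():
    if str(v) != str(saved_opt.get(k)):
        print("[ warning: overriding opt['{}'] to {} "
              "(previously: {}) ]".format(k, v, saved_opt.get(k)))
    saved_opt[k] = v

# non-override runtime keys only fill gaps; they never clobber saved values
for k, v in runtime_opt.items():
    saved_opt.setdefault(k, v)

assert saved_opt['lr'] == 5e-4       # overridden explicitly
assert saved_opt['batchsize'] == 32  # saved value wins over the runtime value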
def _setup_data(self, opt):
    counts = {
        'partner': {gend_utils.UNKNOWN: 0, gend_utils.FEM: 0, gend_utils.MASC: 0},
        'self': {gend_utils.UNKNOWN: 0, gend_utils.FEM: 0, gend_utils.MASC: 0},
    }

    dt = opt['datatype'].split(':')[0]
    if dt == 'test':
        warn_once('No test set; switching to valid')
        dt = 'valid'

    # build data
    print('[ Building data ... ]')
    new_eps = []
    orig_teacher = OrigConvai2Teacher(opt)
    total_exs = orig_teacher.num_examples()
    num_exs = 0
    while num_exs < total_exs:
        current_episode = []
        episode_done = False

        while not episode_done:
            # TODO: eventually all teachers should return Messages, so
            # we should assert this
            action = Message(orig_teacher.act())
            current_episode.append(action)
            episode_done = action.get('episode_done', False)
            num_exs += 1

        # now that we have the entire episode, process it
        first_ex = current_episode[0]
        first_ex_text = []
        partner_persona = []
        your_persona = []
        for line in first_ex['text'].split('\n'):
            # NOTE: we flip "your" and "partner" here since we are taking
            # the 'text' field instead of the 'label'
            if 'partner\'s persona: ' in line:
                your_persona.append(line.split('partner\'s persona: ')[1])
            elif 'your persona: ' in line:
                partner_persona.append(line.split('your persona: ')[1])
            else:
                first_ex_text.append(line)

        your, your_prob, partner, partner_prob = self.get_genders(
            your_persona, partner_persona
        )
        for i, ex in enumerate(current_episode):
            counts['self'][your] += 1
            counts['partner'][partner] += 1
            if i == 0:
                text = '\n'.join(first_ex_text)
            else:
                text = ex['text']
            new_ex = {
                'text': text,
                'episode_done': True,
                'your_persona': '\n'.join(your_persona),
                'partner_persona': '\n'.join(partner_persona),
                'id': 'ConvAI2 Gender',
            }
            if not self.use_probably:
                new_ex['partner_prob'] = partner_prob
                new_ex['your_prob'] = your_prob

            if your is not None and self.labels_to_use != 'partner':
                # add the example for the 'self' task
                labels = [f'SELF:{your}']
                your_ex = deepcopy(new_ex)
                your_ex['labels'] = labels
                your_ex['class_type'] = 'self'
                new_eps.append(your_ex)

            if partner is not None and self.labels_to_use != 'self':
                # add the example for the 'partner' task
                labels = [f'PARTNER:{partner}']
                partner_ex = deepcopy(new_ex)
                partner_ex['labels'] = labels
                partner_ex['class_type'] = 'partner'
                new_eps.append(partner_ex)

    if self.labels_to_use == 'all' and self.add_unknown_classes:
        # load about data
        all_about_data = gend_utils.get_inferred_about_data(
            self.opt['task'], self.opt
        )
        sample_rate = self.opt['unknown_temp']
        if sample_rate < 1.0:
            to_samp = int(sample_rate * len(all_about_data))
            sampled = random.sample(all_about_data, to_samp)
            new_eps += sampled
        else:
            new_eps += all_about_data

    if self.is_train:
        random.shuffle(new_eps)

    self.data = new_eps
    print(f'Missing cnt: {self.missing_cnt} / {len(self.data) * 2}')
    for x in ['self', 'partner']:
        print(f'Totals for {x}:')
        subtot = sum(counts[x].values())
        for k, v in counts[x].items():
            print(f'\t{k}: {v} ({v / subtot})')
def forward(self, input, positions=None, segments=None):
    """
    Forward pass.

    :param LongTensor[batch,seqlen] input:
        The input IDs
    :param LongTensor[batch,seqlen] positions:
        Positions for the input IDs. If None, computed from the padding mask.
    :param LongTensor[batch,seqlen] segments:
        If provided, additionally adds ``segments`` as extra embedding features.
    """
    mask = input != self.padding_idx
    if positions is None:
        positions = (mask.cumsum(dim=1, dtype=torch.int64) - 1).clamp_(min=0)
    tensor = self.embeddings(input)
    if self.embeddings_scale:
        tensor = tensor * np.sqrt(self.dim)

    if positions.max().item() > self.n_positions:
        warn_once(
            'You are inputting a sequence of {x} length, but only have '
            '--n-positions {y}. Set --truncate or increase --n-positions'.format(
                x=positions.max().item(), y=self.n_positions
            )
        )
    position_embs = self.position_embeddings(positions).expand_as(tensor)
    tensor = tensor + position_embs

    if self.n_segments >= 1:
        if segments is None:
            segments = torch.zeros_like(input)
        tensor = tensor + self.segment_embeddings(segments)

    if self.variant == 'xlm':
        tensor = _normalize(tensor, self.norm_embeddings)

    # --dropout on the embeddings
    tensor = self.dropout(tensor)

    tensor *= mask.unsqueeze(-1).type_as(tensor)

    if getattr(self.layers, 'is_model_parallel', False):
        # factored out for readability. It is equivalent to the other condition
        tensor = self._apply_model_parallel(tensor, mask)
    else:
        for i in range(self.n_layers):
            tensor = self.layers[i](tensor, mask)

    if self.variant == 'prelayernorm':
        tensor = _normalize(tensor, self.norm_embeddings)

    tensor *= self.output_scaling
    if self.reduction_type == 'first':
        return tensor[:, 0, :]
    elif self.reduction_type == 'max':
        return tensor.max(dim=1)[0]
    elif self.reduction_type == 'mean':
        divisor = mask.float().sum(dim=1).unsqueeze(-1).clamp(min=1).type_as(tensor)
        output = tensor.sum(dim=1) / divisor
        return output
    elif self.reduction_type is None or 'none' in self.reduction_type:
        return tensor, mask
    else:
        raise ValueError(
            "Can't handle --reduction-type {}".format(self.reduction_type)
        )
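# Hedged toy illustration (not from the original file) of the pooling options
# handled at the end of forward(), assuming a [batch, seqlen, dim] output
# tensor and a [batch, seqlen] boolean mask with padding already zeroed out.
import torch

tensor = torch.randn(2, 4, 8)
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=torch.bool)
tensor = tensor * mask.unsqueeze(-1).type_as(tensor)  # zero the padded steps

first = tensor[:, 0, :]               # --reduction-type first (CLS-style)
maxed = tensor.max(dim=1)[0]          # --reduction-type max
divisor = mask.float().sum(dim=1).unsqueeze(-1).clamp(min=1)
mean = tensor.sum(dim=1) / divisor    # --reduction-type mean, mask-aware

assert first.shape == maxed.shape == mean.shape == (2, 8)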
def __init__(self, opt: Opt, shared=None):
    init_model, self.is_finetune = self._get_init_model(opt, shared)
    super().__init__(opt, shared)

    # set up classes
    if opt.get('classes') is None and opt.get('classes_from_file') is None:
        raise RuntimeError(
            'Must specify --classes or --classes-from-file argument.'
        )
    if not shared:
        if opt['classes_from_file'] is not None:
            with open(opt['classes_from_file']) as f:
                self.class_list = f.read().splitlines()
        else:
            self.class_list = opt['classes']
        self.class_dict = {val: i for i, val in enumerate(self.class_list)}
        if opt.get('class_weights', None) is not None:
            self.class_weights = opt['class_weights']
        else:
            self.class_weights = [1.0 for c in self.class_list]
        self.reset_metrics()
    else:
        self.class_list = shared['class_list']
        self.class_dict = shared['class_dict']
        self.class_weights = shared['class_weights']

    # get the reference class; if opt['get_all_metrics'] is False, this is
    # used to compute metrics.
    # in binary classification, opt['threshold'] applies to the ref class
    if opt['ref_class'] is None or opt['ref_class'] not in self.class_dict:
        self.ref_class = self.class_list[0]
    else:
        self.ref_class = opt['ref_class']
        ref_class_id = self.class_list.index(self.ref_class)
        if ref_class_id != 0:
            # move to the front of the class list
            self.class_list.insert(0, self.class_list.pop(ref_class_id))
    if not opt['get_all_metrics']:
        warn_once(
            'Using %s as the class for computing P, R, and F1' % self.ref_class
        )

    # set up the threshold, only used in binary classification
    if len(self.class_list) == 2 and opt.get('threshold', 0.5) != 0.5:
        self.threshold = opt['threshold']
    else:
        self.threshold = None

    # set up model and optimizers
    if shared:
        self.model = shared['model']
    else:
        self.model = self.build_model()
        self.criterion = self.build_criterion()
        if self.model is None or self.criterion is None:
            raise AttributeError(
                'build_model() and build_criterion() need to return the model '
                'and criterion'
            )
        if self.use_cuda:
            self.model.cuda()
            self.criterion.cuda()
        if init_model:
            print('Loading existing model parameters from ' + init_model)
            self.load(init_model)
        if self.use_cuda:
            if self.opt['data_parallel']:
                if is_distributed():
                    raise ValueError(
                        'Cannot combine --data-parallel and distributed mode'
                    )
                self.model = torch.nn.DataParallel(self.model)

    if shared:
        # We don't use get here because hasattr is used on optimizer later.
        if 'optimizer' in shared:
            self.optimizer = shared['optimizer']
    else:
        optim_params = [p for p in self.model.parameters() if p.requires_grad]
        self.init_optim(optim_params)
        self.build_lr_scheduler()
def __init__(
    self,
    opt: Opt,
    embedding: Optional[nn.Embedding] = None,
    n_positions: Optional[int] = None,
):
    super().__init__()

    def _default(val, default):
        return val if val is not None else default

    self.embedding_size = opt['embedding_size']
    self.ffn_size = opt['ffn_size']
    self.n_layers = (
        opt['n_decoder_layers']
        if opt.get('n_decoder_layers', -1) > 0
        else opt['n_layers']
    )
    self.n_heads = opt['n_heads']
    self.dim = self.embedding_size
    self.activation = opt.get('activation', 'relu')
    self.variant = opt.get('variant', 'aiayn')

    self.embeddings_scale = opt.get('embeddings_scale', True)
    dropout_frac = opt.get('dropout', 0.0)
    self.dropout = nn.Dropout(p=dropout_frac)  # --dropout

    self.n_positions = _default(n_positions, get_n_positions_from_options(opt))
    self.out_dim = self.embedding_size
    assert (
        self.embedding_size % self.n_heads == 0
    ), 'Transformer embedding size must be a multiple of n_heads'

    self.embeddings = embedding

    if (
        self.variant == 'xlm'
        or self.variant == 'prelayernorm'
        or self.variant == 'bart'
    ):
        self.norm_embeddings = torch.nn.LayerNorm(self.dim, eps=LAYER_NORM_EPS)
        if self.variant == 'xlm':
            warn_once(
                'DEPRECATED: XLM should only be used for backwards compatibility, '
                'as it involves a less-stable layernorm operation.'
            )
    elif self.variant == 'aiayn':
        pass
    else:
        raise ValueError("Can't handle --variant {}".format(self.variant))

    # create the positional embeddings
    self.position_embeddings = nn.Embedding(self.n_positions, self.embedding_size)
    if not opt.get('learn_positional_embeddings', False):
        create_position_codes(
            self.n_positions,
            self.embedding_size,
            out=self.position_embeddings.weight,
        )
    else:
        nn.init.normal_(
            self.position_embeddings.weight, 0, self.embedding_size ** -0.5
        )

    # build the model
    self.layers = nn.ModuleList()
    for _ in range(self.n_layers):
        self.layers.append(
            TransformerDecoderLayer(
                self.n_heads,
                self.embedding_size,
                self.ffn_size,
                attention_dropout=opt.get('attention_dropout', 0.0),
                relu_dropout=opt.get('relu_dropout', 0.0),
                dropout=dropout_frac,
                activation=self.activation,
                variant=self.variant,
            )
        )
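# Hedged sketch of sinusoidal position codes in the spirit of
# create_position_codes() used above. The real helper lives elsewhere in the
# repo; the constants here are the standard "Attention Is All You Need"
# recipe, so treat this as an approximation rather than the repo's exact code.
import numpy as np
import torch

def sinusoidal_position_codes(n_positions: int, dim: int) -> torch.Tensor:
    position = np.arange(n_positions)[:, None]
    div_term = np.exp(np.arange(0, dim, 2) * -(np.log(10000.0) / dim))
    out = np.zeros((n_positions, dim), dtype=np.float32)
    out[:, 0::2] = np.sin(position * div_term)  # even dims: sine
    out[:, 1::2] = np.cos(position * div_term)  # odd dims: cosine
    return torch.from_numpy(out)

codes = sinusoidal_position_codes(1024, 512)
assert codes.shape == (1024, 512)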
def load_from_chunk(self, chunk_idx: int):
    """
    [Abstract] Given the chunk index, load examples from that chunk.

    Return a list of tuples. The function ``_create_message`` will take these
    tuples to form the Message object that is returned by the teacher.
    """
    output = []
    chunk_path = self.chunk_idx_to_file[chunk_idx]
    extra_data = []
    with open(chunk_path) as wf:
        for article_json in wf:
            article = json.loads(article_json)
            title = article['title']
            text = article['text']
            title = title.split(' (')[0]
            is_person = check_if_person(title)
            if not is_person:
                continue
            gender = get_gender(text)
            label = f'ABOUT:{gender}'
            for par in text.split('\n'):
                if par:
                    output.append((par, title, label, gender, 'about'))
                    self.counts[gender] += 1
                    if self.add_unknown_classes:
                        extra_data.append(
                            (par, title, f'SELF:{gend_utils.UNKNOWN}', gender, 'self')
                        )
                        extra_data.append(
                            (
                                par,
                                title,
                                f'PARTNER:{gend_utils.NEUTRAL}',
                                gender,
                                'partner',
                            )
                        )

    if len(extra_data) > 0:
        # possibly sample unknown classes
        sample_rate = self.opt['unknown_temp']
        if sample_rate < 1.0:
            to_samp = int(sample_rate * len(extra_data))
            sampled = random.sample(extra_data, to_samp)
            output += sampled
        else:
            output += extra_data

    if DEBUG:
        print('\n\nGender count update:')
        for k, v in self.counts.items():
            print(f'{k}: {v}')

    if (self.is_train and self.opt['balance']) or (
        self.is_valid and self.opt['balance_valid']
    ):
        exclude_lst = [
            f'ABOUT:{gend_utils.NONBINARY}',
            f'SELF:{gend_utils.UNKNOWN}',
            f'PARTNER:{gend_utils.NEUTRAL}',
        ]  # not enough of each of these examples to balance
        output = gend_utils.balance_data(output, key=2, exclude_labels=exclude_lst)

    if len(output) == 0:
        warn_once(f'CHUNK {chunk_idx} is empty')

    return output
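# Standalone sketch (toy data) of the unknown_temp subsampling used in both
# teachers above: a rate below 1.0 keeps only that fraction of the extra
# unknown/neutral examples.
import random

extra_data = [('par', 'title', 'SELF:unknown', 'gender', 'self')] * 10
sample_rate = 0.3  # hypothetical --unknown-temp value
if sample_rate < 1.0:
    sampled = random.sample(extra_data, int(sample_rate * len(extra_data)))
else:
    sampled = extra_data
assert len(sampled) == 3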
def lr_scheduler_factory(cls, opt, optimizer, states, hard_reset=False):
    """
    Create the learning rate scheduler, and assign it to self.scheduler.

    This scheduler will be updated upon a call to receive_metrics. May also
    create self.warmup_scheduler, if appropriate.

    :param opt opt:
        Arguments received by torch_agent
    :param optimizer optimizer:
        Optimizer being used for training. May be wrapped in
        fp16_optimizer_wrapper depending on whether fp16 is used.
    :param state_dict states:
        Possible state_dict provided by model checkpoint, for restoring
        LR state.
    :param bool hard_reset:
        If true, the LR scheduler should ignore the state dictionary.
    :return: ParlAILRScheduler object
    """
    patience = opt.get('lr_scheduler_patience', 3)
    decay = opt.get('lr_scheduler_decay', 0.5)
    warmup_updates = opt.get('warmup_updates', -1)
    warmup_rate = opt.get('warmup_rate', 1e-4)
    max_lr_steps = opt.get('max_train_steps', -1)
    if opt.get('max_lr_steps', -1) > 0:
        raise ValueError(
            '--max-lr-steps is **DEPRECATED**; please set --max-train-steps '
            'directly'
        )
    invsqrt_lr_decay_gamma = opt.get('invsqrt_lr_decay_gamma', -1)

    if opt.get('lr_scheduler') == 'none':
        return None
    elif decay == 1.0:
        warn_once(
            "Your LR decay is set to 1.0. Assuming you meant you wanted "
            "to disable learning rate scheduling. Adjust --lr-scheduler-decay "
            "if this is not correct."
        )
        return None
    elif opt.get('lr_scheduler') == 'reduceonplateau':
        scheduler = ReduceOnPlateauLRScheduler(
            optimizer, hard_reset, patience, decay, warmup_updates, warmup_rate
        )
    elif opt.get('lr_scheduler') == 'fixed':
        scheduler = FixedLRScheduler(
            optimizer, hard_reset, patience, decay, warmup_updates, warmup_rate
        )
    elif opt.get('lr_scheduler') == 'invsqrt':
        scheduler = InvSqrtLRScheduler(
            optimizer,
            hard_reset,
            patience,
            decay,
            warmup_updates,
            warmup_rate,
            invsqrt_lr_decay_gamma,
            max_lr_steps,
        )
    elif opt.get('lr_scheduler') == 'cosine':
        scheduler = CosineLRScheduler(
            optimizer,
            hard_reset,
            patience,
            decay,
            warmup_updates,
            warmup_rate,
            max_lr_steps,
        )
    elif opt.get('lr_scheduler') == 'linear':
        scheduler = LinearLRScheduler(
            optimizer,
            hard_reset,
            patience,
            decay,
            warmup_updates,
            warmup_rate,
            max_lr_steps,
        )
    else:
        raise ValueError(
            "Don't know what to do with --lr-scheduler '{}'".format(
                opt.get('lr_scheduler')
            )
        )

    # time to load LR state from the checkpoint, if possible.
    if (
        # there is already an old LR scheduler saved on disk
        states
        # and there was a scheduler in the dump
        and 'lr_scheduler_type' in states
        # and the old LR scheduler is different
        and states.get('lr_scheduler_type') != opt['lr_scheduler']
        # and we're not already using a fresh scheduler
        and not hard_reset
    ):
        # the LR scheduler changed, start things fresh
        warn_once(
            f"LR scheduler ({opt['lr_scheduler']}) is different from saved "
            f"({states.get('lr_scheduler_type')}). Starting fresh!"
        )
        hard_reset = True

    if not hard_reset:
        # do the actual loading (if possible)
        scheduler.load_state(states)

    # setup warmup scheduler after loading saved scheduler
    scheduler._init_warmup_scheduler(optimizer, states)

    return scheduler
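# Hedged sketch (hypothetical helper, not the repo's scheduler classes) of the
# shape of an invsqrt schedule like the one the factory can build: linear
# warmup for warmup_updates steps, then decay proportional to 1/sqrt(step).

def invsqrt_lr(step: int, base_lr: float, warmup_updates: int) -> float:
    if warmup_updates > 0 and step < warmup_updates:
        # linear warmup from 0 up to base_lr
        return base_lr * step / warmup_updates
    # after warmup, decay with the inverse square root of the step count
    return base_lr * (max(warmup_updates, 1) / max(step, 1)) ** 0.5

for step in (1, 100, 1_000, 10_000):
    print(step, invsqrt_lr(step, base_lr=1e-3, warmup_updates=100))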
def forward(self, input, encoder_state, embedded_input=None, incr_state=None):
    """
    Forward pass with the ability to pass in token-embedded inputs.
    """
    encoder_output, encoder_mask = encoder_state

    if input is not None:
        seq_len = input.size(1)
        positions = input.new(seq_len).long()
    else:
        seq_len = embedded_input.size(1)
        positions = embedded_input.new(seq_len).long()
    positions = torch.arange(seq_len, out=positions).unsqueeze(0)

    if incr_state is not None:
        # We're doing incremental decoding, so select only the most recent position
        if input is not None:
            input = input[:, -1:]
        if embedded_input is not None:
            embedded_input = embedded_input[:, -1:, :]
        if positions is not None:
            positions = positions[:, -1:]
    else:
        incr_state = {}

    if embedded_input is not None:
        tensor = embedded_input  # No need to copy because we only reassign below
    else:
        tensor = self.embeddings(input)
    if self.embeddings_scale:
        tensor = tensor * np.sqrt(self.dim)
    if self.variant == 'xlm':
        tensor = _normalize(tensor, self.norm_embeddings)
    if positions.max().item() > self.n_positions:
        warn_once(
            'You are inputting a sequence of {x} length, but only have '
            '--n-positions {y}. Set --truncate or increase --n-positions'.format(
                x=positions.max().item(), y=self.n_positions
            )
        )
    tensor = tensor + self.position_embeddings(positions).expand_as(tensor)
    tensor = self.dropout(tensor)  # --dropout

    new_incr_state = {}
    if getattr(self.layers, 'is_model_parallel', False):
        tensor, new_incr_state = self._apply_model_parallel(
            tensor, encoder_output, encoder_mask, incr_state
        )
    else:
        for idx, layer in enumerate(self.layers):
            tensor, new_incr_state[idx] = layer(
                x=tensor,
                encoder_output=encoder_output,
                encoder_mask=encoder_mask,
                incr_state=incr_state.get(idx),
            )

    if self.variant == 'prelayernorm':
        tensor = _normalize(tensor, self.norm_embeddings)

    return tensor, new_incr_state
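# Hedged toy illustration (not from the original file) of why incremental
# decoding slices positions to the last index: past layer states are cached in
# incr_state, so each step feeds only the newest token, but that token still
# needs its true absolute position for the position embeddings.
import torch

seq_len = 5
positions = torch.arange(seq_len).unsqueeze(0)  # [[0, 1, 2, 3, 4]]
last_position = positions[:, -1:]               # [[4]] -- not [[0]]
assert last_position.item() == seq_len - 1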
def warn(txt, act, opt):
    if opt.get('display_examples'):
        print(txt + ":\n" + str(act))
    else:
        warn_once(txt)
def train(self):
    """
    Perform a training run.

    :return: tuple of reports (validation_report, test_report)
    """
    if is_distributed():
        warn_once(
            "Distributed training outputs average-per-worker metrics during "
            "training, and may be slightly distorted. Validation/test are "
            "unadulterated."
        )
    opt = self.opt
    world = self.world
    with world:
        while True:
            # do one example / batch of examples
            try:
                world.parley()
            except StopTrainException:
                if is_distributed():
                    raise RuntimeError(
                        "StopTrainException not supported for distributed mode"
                    )
                break

            self.parleys += 1

            # get the total training examples done, compute epochs
            self._total_epochs = (
                self._preempted_epochs
                + num_workers() * self.world.get_total_epochs()
            )
            exs_per_epoch = self.world.num_examples()
            self._total_exs = int(np.round(self._total_epochs * exs_per_epoch))

            # and use the primary worker's timings for everything
            train_time, log_time, validate_time = sync_object(
                (
                    self.train_time.time(),
                    self.log_time.time(),
                    self.validate_time.time(),
                )
            )

            # check counters and timers
            if self._total_epochs >= self.max_num_epochs:
                self.log()
                print(
                    '[ num_epochs completed:{} time elapsed:{}s ]'.format(
                        self.max_num_epochs, train_time
                    )
                )
                break
            if train_time > self.max_train_time:
                print('[ max_train_time elapsed:{}s ]'.format(train_time))
                break
            if log_time > self.log_every_n_secs:
                self.log()
            if (
                validate_time > self.val_every_n_secs
                or self._total_epochs - self.last_valid_epoch
                >= self.val_every_n_epochs
            ):
                try:
                    stop_training = self.validate()
                except StopTrainException:
                    if is_distributed():
                        raise RuntimeError(
                            "StopTrainException not supported for "
                            "distributed mode"
                        )
                    break
                self.last_valid_epoch = self._total_epochs
                if stop_training:
                    break
            if (
                self.save_time.time() > self.save_every_n_secs
                and opt.get('model_file')
                and is_primary_worker()
            ):
                print(
                    "[ saving model checkpoint: {}.checkpoint ]".format(
                        opt['model_file']
                    )
                )
                self.save_model('.checkpoint')
                self.save_time.reset()

    if not self.saved and is_primary_worker():
        # save agent
        self.save_model()
    elif opt.get('model_file'):
        # reload best validation model
        self.agent = create_agent(opt)

    valid_worlds = _maybe_load_eval_worlds(self.agent, opt, 'valid')
    max_exs = opt['validation_max_exs'] if opt.get('short_final_eval') else -1
    v_report = run_eval(valid_worlds, opt, 'valid', max_exs, write_log=True)
    test_worlds = _maybe_load_eval_worlds(self.agent, opt, 'test')
    t_report = run_eval(test_worlds, opt, 'test', max_exs, write_log=True)
    if valid_worlds:
        for valid_world in valid_worlds:
            valid_world.shutdown()
    if test_worlds:
        for test_world in test_worlds:
            test_world.shutdown()

    print_announcements(opt)

    return v_report, t_report
def __init__(self, opt, shared=None):
    opt['batch_sort'] = False
    super().__init__(opt, shared)
    self.use_batch_act = self.bsz > 1
    self.num_workers = opt['numworkers']
    self.batch_sort = (
        opt.get('pytorch_teacher_batch_sort') and 'train' in self.datatype
    )
    self.batch_cache_type = opt.get('batch_sort_cache_type')
    self.batch_sort_field = opt.get('batch_sort_field')
    # One can specify a collate function to use for preparing a batch
    self.opt = opt.copy()
    self.is_shared = shared is not None
    dataset_classes = self._get_dataset_class(opt)
    self.ordered = 'ordered' in self.datatype or (
        'stream' in self.datatype and not opt.get('shuffle')
    )
    if self.ordered:
        # force index for ordered, so that we see every example
        warn_once(
            '\nNote: You are using PytorchDataTeacher with ordered '
            'examples. Please specify `--shuffle` if you would like '
            'to have examples loaded in randomized order.\n'
        )
        self.batch_cache_type = 'index'

    if not shared:
        BatchSortCache.create()
        if len(dataset_classes) > 1:
            datasets = []
            for class_name, collate_fn, task_name in dataset_classes:
                dataset_opt = opt.copy()
                dataset_opt['pytorch_teacher_task'] = task_name
                dataset_opt['task'] = task_name
                datasets.append(class_name(dataset_opt))
                self.collate_fn = collate_fn
            self.id = ','.join([d[2] for d in dataset_classes])
            self.dataset = ParlAIConcatDataset(datasets)
        else:
            class_name, self.collate_fn, task_name = dataset_classes[0]
            self.id = task_name
            self.dataset = class_name(opt)
        if self.ordered or not self.training:
            data_sampler = sampler.SequentialSampler(self.dataset)
        else:
            data_sampler = sampler.RandomSampler(self.dataset)

        self.pytorch_dataloader = DataLoader(
            self.dataset,
            batch_size=self.bsz,
            sampler=data_sampler,
            num_workers=self.num_workers,
            collate_fn=self.collate_fn,
            pin_memory=False,
            drop_last=False,
        )

        self.lastYs = [None] * self.bsz
        if self.batch_sort:
            self.loader_process = LoaderProcess(opt)
            self.loader_process.start()
        self.data = enumerate(self.pytorch_dataloader)
    else:
        self.dataset = shared['dataset']
        self.pytorch_dataloader = shared['pytorch_dataloader']
        self.lastYs = shared['lastYs']
        self.data = shared['data']
        self.id = shared['id']

    self.num_batches = math.ceil(self.dataset.num_episodes() / self.bsz)
    self.reset()
import math
from typing import Dict, Tuple, Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from parlai.core.torch_generator_agent import TorchGeneratorModel
from parlai.utils.misc import warn_once
from parlai.utils.torch import neginf

try:
    from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
except ImportError:
    warn_once("Installing APEX can give a significant speed boost.")
    from torch.nn import LayerNorm

LAYER_NORM_EPS = 1e-5  # Epsilon for layer norm.


def _normalize(tensor, norm_layer):
    """
    Broadcast layer norm.
    """
    size = tensor.size()
    return norm_layer(tensor.view(-1, size[-1])).view(size)


def _create_embeddings(dictionary, embedding_size, padding_idx):
    """
    Create and initialize word embeddings.
    """
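# Hedged usage sketch for _normalize() above: layer norm over the last
# dimension of an arbitrarily shaped tensor, by flattening every leading
# dimension and restoring the shape afterwards. Relies on the LayerNorm and
# LAYER_NORM_EPS definitions in this module.
_example_norm = LayerNorm(8, eps=LAYER_NORM_EPS)
_example_input = torch.randn(2, 4, 8)
assert _normalize(_example_input, _example_norm).shape == _example_input.shape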
def self_chat(opt, print_parser=None):
    if print_parser is not None:
        if print_parser is True and isinstance(opt, ParlaiParser):
            print_parser = opt
        elif print_parser is False:
            print_parser = None
    if isinstance(opt, ParlaiParser):
        print('[ Deprecated Warning: self_chat should be passed opt not Parser ]')
        opt = opt.parse_args()

    random.seed(opt['seed'])

    # Create models
    agent1 = create_agent(opt, requireModelExists=True)
    agent2 = agent1.clone()
    if hasattr(agent2, 'id'):
        agent2.id = agent2.id + "2"

    # Check for `selfchat` in the task name
    if 'selfchat' not in opt['task']:
        warn_once(
            'You are using self chat with task {}. '.format(opt['task'])
            + 'If your task has an existing self chat world, then run with '
            '-t {}:selfchat'.format(opt['task'])
        )

    world = create_task(opt, [agent1, agent2])

    if print_parser:
        # Show arguments after loading model
        print_parser.opt = agent1.opt
        print_parser.print_args()

    # set up logging
    log_every_n_secs = opt.get('log_every_n_secs', -1)
    if log_every_n_secs <= 0:
        log_every_n_secs = float('inf')
    log_time = TimeLogger()
    logger = WorldLogger(opt)

    # Run some self chats.
    max_cnt = int(
        opt['num_examples'] * opt.get('selfchat_max_turns') / opt.get('batchsize')
    )
    cnt = 0
    for _ in tqdm.trange(max_cnt):
        cnt += opt.get('batchsize', 1)
        world.parley()
        logger.log(world)

        if opt.get('display_examples'):
            print(world.display())
        if log_time.time() > log_every_n_secs:
            text = log_time.log(cnt, max_cnt)
            print(text)

    if opt.get('display_examples'):
        print('-- end of episode --')

    logger.reset_world()  # flush last episode
    indent = opt['indent'] if opt['indent'] >= 0 else None
    logger.write(opt['outfile'], opt['format'], indent=indent)
    return logger.get_logs()
def validate(self):
    """
    Perform a validation run, checking whether we should stop training.

    :return: boolean indicating whether training should stop
    :rtype: bool
    """
    opt = self.opt

    if self.valid_worlds is None:
        # we need to load the world now
        self.valid_worlds = load_eval_worlds(self.agent, opt, 'valid')

    # run evaluation on valid set
    # TODO(MW): replace sync_object with self._sync_metrics. You'll need some
    # logic to handle 'validation_max_exs' properly
    valid_report = run_eval(
        self.valid_worlds, opt, 'valid', opt['validation_max_exs']
    )
    v = valid_report.copy()
    v['train_time'] = self.train_time.time()
    self.valid_reports.append(v)

    # logging
    if opt['tensorboard_log'] and is_primary_worker():
        valid_report['total_exs'] = self._total_exs
        self.tb_logger.log_metrics('valid', self.parleys, valid_report)
        # flush on a validation
        self.tb_logger.flush()

    # saving
    if (
        opt.get('model_file')
        and opt.get('save_after_valid')
        and is_primary_worker()
    ):
        print("[ saving model checkpoint: " + opt['model_file'] + ".checkpoint ]")
        self.save_model('.checkpoint')

    # send valid metrics to agent if the agent wants them
    if hasattr(self.agent, 'receive_metrics'):
        self.agent.receive_metrics(valid_report)

    # --------------- change by hengyicai -------------------------
    teacher_agent = self.return_teacher_agent()
    if teacher_agent:
        teacher_agent.receive_metrics(valid_report)
    # --------------- change by hengyicai -------------------------

    # check which metric to look at
    new_valid = valid_report[opt['validation_metric']]

    if isinstance(new_valid, Metric):
        new_valid = new_valid.value()

    # check if this is the best validation so far
    if (
        self.best_valid is None
        or self.valid_optim * new_valid > self.valid_optim * self.best_valid
    ):
        print(
            '[ new best {}: {}{} ]'.format(
                opt['validation_metric'],
                new_valid,
                ' (previous best was {})'.format(self.best_valid)
                if self.best_valid is not None
                else '',
            )
        )
        self.best_valid = new_valid
        self.impatience = 0
        if opt.get('model_file') and is_primary_worker():
            print("[ saving best valid model: " + opt['model_file'] + " ]")
            self.save_model()
            self.saved = True
        if (
            opt['validation_metric'] == 'accuracy'
            and self.best_valid >= opt['validation_cutoff']
        ):
            print('[ task solved! stopping. ]')
            return True
    else:
        self.impatience += 1
        print(
            '[ did not beat best {}: {} impatience: {} ]'.format(
                opt['validation_metric'],
                round(self.best_valid, 4),
                self.impatience,
            )
        )

    # --------------- change by hengyicai -------------------------
    if self.opt.get('cutoff_metric_name', 'none') != 'none':
        cutoff_metric_name = self.opt['cutoff_metric_name']
        cutoff_metric_val = self.opt['cutoff_metric_val']
        if cutoff_metric_name in valid_report and cutoff_metric_val > 0:
            if valid_report[cutoff_metric_name] >= cutoff_metric_val:
                print(
                    '[ {} >= {}, stopping. ]'.format(
                        cutoff_metric_name, cutoff_metric_val
                    )
                )
                return True
        elif cutoff_metric_name not in valid_report:
            warn_once(
                '[ {} is not in the validation report! '
                'cannot do metric cutoff stopping! ]'.format(cutoff_metric_name)
            )
        else:
            warn_once(
                '[ you asked to do metric cutoff stopping, '
                'but the cutoff_metric_val <= 0! ]'
            )
    # --------------- change by hengyicai -------------------------

    self.validate_time.reset()

    # check if we are out of patience
    if (
        opt['validation_patience'] > 0
        and self.impatience >= opt['validation_patience']
    ):
        print('[ ran out of patience! stopping training. ]')
        return True
    return False
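# Hedged standalone sketch (hypothetical metric values) of the early-stopping
# bookkeeping above: impatience counts consecutive validations without a new
# best metric, and training stops once it reaches validation_patience.
best_valid, impatience, patience = None, 0, 3
for new_valid in [0.50, 0.55, 0.54, 0.53, 0.52]:
    if best_valid is None or new_valid > best_valid:
        best_valid, impatience = new_valid, 0
    else:
        impatience += 1
    if impatience >= patience:
        print('[ ran out of patience! stopping training. ]')
        break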
def create_agent_from_opt_file(opt: Opt):
    """
    Load agent options and module from file if opt file exists.

    Checks to see if file exists opt['model_file'] + ".opt"; if so, load up the
    options from the file and use that to create an agent, loading the model
    type from that file and overriding any options specified in that file when
    instantiating the agent.

    If that file does not exist, return None.
    """
    model_file = opt['model_file']
    optfile = model_file + '.opt'

    if not PathManager.exists(optfile):
        return None

    opt_from_file = Opt.load(optfile)

    # delete args that we do not want to copy over when loading the model
    for arg in NOCOPY_ARGS:
        if arg in opt_from_file:
            del opt_from_file[arg]

    # only override opts specified in 'override' dict
    if opt.get('override'):
        for k, v in opt['override'].items():
            if k in opt_from_file and str(v) != str(opt_from_file.get(k)):
                logging.warn(
                    f'Overriding opt["{k}"] to {v} '
                    f'(previously: {opt_from_file.get(k)})'
                )
            opt_from_file[k] = v

    model_class = load_agent_module(opt_from_file['model'])

    if hasattr(model_class, 'upgrade_opt'):
        opt_from_file = model_class.upgrade_opt(opt_from_file)

    # add model arguments to opt_from_file if they aren't in opt_from_file already
    for k, v in opt.items():
        if k not in opt_from_file:
            opt_from_file[k] = v

    opt_from_file['model_file'] = model_file  # update model file path

    # update dict file path
    if not opt_from_file.get('dict_file'):
        old_dict_file = None
        opt_from_file['dict_file'] = model_file + '.dict'
    elif opt_from_file.get('dict_file') and not PathManager.exists(
        opt_from_file['dict_file']
    ):
        old_dict_file = opt_from_file['dict_file']
        opt_from_file['dict_file'] = model_file + '.dict'
    if not PathManager.exists(opt_from_file['dict_file']):
        warn_once(
            'WARNING: Neither the specified dict file ({}) nor the '
            '`model_file`.dict file ({}) exists, check to make sure either '
            'is correct. This may manifest as a shape mismatch later '
            'on.'.format(old_dict_file, opt_from_file['dict_file'])
        )

    # if we want to load weights from --init-model, compare opts with
    # loaded ones
    compare_init_model_opts(opt, opt_from_file)
    return model_class(opt_from_file)
def _build_candidates(self, batch, source, mode):
    """
    Build a candidate set for this batch.

    :param batch:
        a Batch object (defined in torch_agent.py)
    :param source:
        the source from which candidates should be built, one of
        ['batch', 'batch-all-cands', 'inline', 'fixed', 'vocab']
    :param mode:
        'train' or 'eval'

    :return: tuple of tensors (cands, cand_vecs, label_inds)

        cands: A [num_cands] list of (text) candidates
            OR a [batchsize] list of such lists if source=='inline'
        cand_vecs: A padded [num_cands, seqlen] LongTensor of vectorized
            candidates OR a [batchsize, num_cands, seqlen] LongTensor if
            source=='inline'
        label_inds: A [bsz] LongTensor of the indices of the labels for each
            example from its respective candidate set

    Possible sources of candidates:

        * batch: the set of all labels in this batch
            Use all labels in the batch as the candidate set (with all but the
            example's label being treated as negatives).
            Note: with this setting, the candidate set is identical for all
            examples in a batch. This option may be undesirable if it is
            possible for duplicate labels to occur in a batch, since the
            second instance of the correct label will be treated as a
            negative.
        * batch-all-cands: the set of all candidates in this batch
            Use all candidates in the batch as candidate set.
            Note 1: This can result in a very large number of candidates.
            Note 2: In this case we will deduplicate candidates.
            Note 3: just like with 'batch' the candidate set is identical
            for all examples in a batch.
        * inline: batch_size lists, one list per example
            If each example comes with a list of possible candidates, use
            those.
            Note: With this setting, each example will have its own candidate
            set.
        * fixed: one global candidate list, provided in a file from the user
            If self.fixed_candidates is not None, use a set of fixed
            candidates for all examples.
            Note: this setting is not recommended for training unless the
            universe of possible candidates is very small.
        * vocab: one global candidate list, extracted from the vocabulary with
            the exception of self.NULL_IDX.
    """
    label_vecs = batch.label_vec  # [bsz] list of lists of LongTensors
    label_inds = None
    batchsize = (
        batch.text_vec.size(0)
        if batch.text_vec is not None
        else batch.image.size(0)
    )

    if label_vecs is not None:
        assert label_vecs.dim() == 2

    if source == 'batch':
        warn_once(
            '[ Executing {} mode with batch labels as set of candidates. ]'
            ''.format(mode)
        )
        if batchsize == 1:
            warn_once(
                "[ Warning: using candidate source 'batch' and observed a "
                "batch of size 1. This may be due to uneven batch sizes at "
                "the end of an epoch. ]"
            )
        if label_vecs is None:
            raise ValueError(
                "If using candidate source 'batch', then batch.label_vec "
                "cannot be None."
            )

        cands = batch.labels
        cand_vecs = label_vecs
        label_inds = label_vecs.new_tensor(range(batchsize))

    elif source == 'batch-all-cands':
        warn_once(
            '[ Executing {} mode with all candidates provided in the batch ]'
            ''.format(mode)
        )
        if batch.candidate_vecs is None:
            raise ValueError(
                "If using candidate source 'batch-all-cands', then batch."
                "candidate_vecs cannot be None. If your task does not have "
                "inline candidates, consider using one of "
                "--{m}={{'batch','fixed','vocab'}}."
                "".format(m='candidates' if mode == 'train' else 'eval-candidates')
            )
        # initialize the list of cands with the labels
        cands = []
        all_cands_vecs = []
        # dictionary used for deduplication
        cands_to_id = {}
        for i, cands_for_sample in enumerate(batch.candidates):
            for j, cand in enumerate(cands_for_sample):
                if cand not in cands_to_id:
                    cands.append(cand)
                    cands_to_id[cand] = len(cands_to_id)
                    all_cands_vecs.append(batch.candidate_vecs[i][j])
        cand_vecs, _ = padded_tensor(
            all_cands_vecs,
            self.NULL_IDX,
            use_cuda=self.use_cuda,
            fp16friendly=self.fp16,
        )
        label_inds = label_vecs.new_tensor(
            [cands_to_id[label] for label in batch.labels]
        )

    elif source == 'inline':
        warn_once(
            '[ Executing {} mode with provided inline set of candidates ]'
            ''.format(mode)
        )
        if batch.candidate_vecs is None:
            raise ValueError(
                "If using candidate source 'inline', then batch.candidate_vecs "
                "cannot be None. If your task does not have inline candidates, "
                "consider using one of --{m}={{'batch','fixed','vocab'}}."
                "".format(m='candidates' if mode == 'train' else 'eval-candidates')
            )

        cands = batch.candidates
        cand_vecs = padded_3d(
            batch.candidate_vecs,
            self.NULL_IDX,
            use_cuda=self.use_cuda,
            fp16friendly=self.fp16,
        )
        if label_vecs is not None:
            label_inds = label_vecs.new_empty((batchsize))
            bad_batch = False
            for i, label_vec in enumerate(label_vecs):
                label_vec_pad = label_vec.new_zeros(cand_vecs[i].size(1)).fill_(
                    self.NULL_IDX
                )
                if cand_vecs[i].size(1) < len(label_vec):
                    label_vec = label_vec[0 : cand_vecs[i].size(1)]
                label_vec_pad[0 : label_vec.size(0)] = label_vec
                label_inds[i] = self._find_match(cand_vecs[i], label_vec_pad)
                if label_inds[i] == -1:
                    bad_batch = True
            if bad_batch:
                if self.ignore_bad_candidates and not self.is_training:
                    label_inds = None
                else:
                    raise RuntimeError(
                        'At least one of your examples has a set of label '
                        'candidates that does not contain the label. To ignore '
                        'this error set `--ignore-bad-candidates True`.'
                    )

    elif source == 'fixed':
        if self.fixed_candidates is None:
            raise ValueError(
                "If using candidate source 'fixed', then you must provide the "
                "path to a file of candidates with the flag "
                "--fixed-candidates-path or the name of a task with "
                "--fixed-candidates-task."
            )
        warn_once(
            "[ Executing {} mode with a common set of fixed candidates "
            "(n = {}). ]".format(mode, len(self.fixed_candidates))
        )

        cands = self.fixed_candidates
        cand_vecs = self.fixed_candidate_vecs

        if label_vecs is not None:
            label_inds = label_vecs.new_empty((batchsize))
            bad_batch = False
            for batch_idx, label_vec in enumerate(label_vecs):
                max_c_len = cand_vecs.size(1)
                label_vec_pad = label_vec.new_zeros(max_c_len).fill_(self.NULL_IDX)
                if max_c_len < len(label_vec):
                    label_vec = label_vec[0:max_c_len]
                label_vec_pad[0 : label_vec.size(0)] = label_vec
                label_inds[batch_idx] = self._find_match(cand_vecs, label_vec_pad)
                if label_inds[batch_idx] == -1:
                    bad_batch = True
            if bad_batch:
                if self.ignore_bad_candidates and not self.is_training:
                    label_inds = None
                else:
                    raise RuntimeError(
                        'At least one of your examples has a set of label '
                        'candidates that does not contain the label. To ignore '
                        'this error set `--ignore-bad-candidates True`.'
                    )

    elif source == 'vocab':
        warn_once(
            '[ Executing {} mode with tokens from vocabulary as candidates. ]'
            ''.format(mode)
        )
        cands = self.vocab_candidates
        cand_vecs = self.vocab_candidate_vecs
        # NOTE: label_inds is None here, as we will not find the label in
        # the set of vocab candidates

    else:
        raise Exception("Unrecognized source: %s" % source)

    return (cands, cand_vecs, label_inds)
def eval_step(self, batch):
    """
    Evaluate a single batch of examples.
    """
    if batch.text_vec is None and batch.image is None:
        return
    if batch.text_vec is not None:
        bsz = batch.text_vec.size(0)
    else:
        bsz = len(batch.image)
    self.model.eval()
    cand_scores = None
    token_losses = None

    if batch.label_vec is not None:
        # calculate loss on targets with teacher forcing
        loss, model_output = self.compute_loss(batch, return_output=True)
        if self.output_token_losses:
            token_losses = self._construct_token_losses(
                batch.label_vec, model_output
            )

    preds = None
    if self.skip_generation:
        warn_once(
            "--skip-generation does not produce accurate metrics beyond ppl",
            RuntimeWarning,
        )
    else:
        maxlen = self.label_truncate or 256
        beam_preds_scores, _ = self._generate(batch, self.beam_size, maxlen)
        preds, scores = zip(*beam_preds_scores)

    cand_choices = None
    # TODO: abstract out the scoring here
    if self.rank_candidates:
        # compute roughly ppl to rank candidates
        cand_choices = []
        encoder_states = self.model.encoder(*self._encoder_input(batch))
        for i in range(bsz):
            num_cands = len(batch.candidate_vecs[i])
            enc = self.model.reorder_encoder_states(encoder_states, [i] * num_cands)
            cands, _ = self._pad_tensor(batch.candidate_vecs[i])
            scores, _ = self.model.decode_forced(enc, cands)
            cand_losses = F.cross_entropy(
                scores.view(num_cands * cands.size(1), -1),
                cands.view(-1),
                reduction='none',
            ).view(num_cands, cands.size(1))
            # now cand_losses is cands x seqlen size, but we still need to
            # check padding and such
            mask = (cands != self.NULL_IDX).float()
            cand_scores = (cand_losses * mask).sum(dim=1) / (mask.sum(dim=1) + 1e-9)
            _, ordering = cand_scores.sort()
            cand_choices.append([batch.candidates[i][o] for o in ordering])

    text = [self._v2t(p) for p in preds] if preds is not None else None
    if text and self.compute_tokenized_bleu:
        # compute additional bleu scores
        self._compute_fairseq_bleu(batch, preds)
        self._compute_nltk_bleu(batch, text)
    return Output(text, cand_choices, token_losses=token_losses)
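# Hedged sketch (toy tensors, hypothetical values) of the candidate-scoring
# arithmetic above: per-token losses are masked over padding and averaged per
# candidate, so longer candidates are not penalized for extra tokens, then
# sorted ascending (lower average loss ranks higher).
import torch

NULL_IDX = 0
cands = torch.tensor([[5, 6, 0], [7, 8, 9]])  # [num_cands, seqlen], 0 is pad
cand_losses = torch.tensor([[0.5, 0.7, 9.0],  # loss at the pad slot is ignored
                            [0.2, 0.3, 0.4]])
mask = (cands != NULL_IDX).float()
cand_scores = (cand_losses * mask).sum(dim=1) / (mask.sum(dim=1) + 1e-9)
_, ordering = cand_scores.sort()
assert ordering.tolist() == [1, 0]  # second candidate (0.3) beats first (0.6)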
def __init__(
    self,
    n_heads,
    n_layers,
    embedding_size,
    ffn_size,
    vocabulary_size,
    embedding=None,
    dropout=0.0,
    attention_dropout=0.0,
    relu_dropout=0.0,
    embeddings_scale=True,
    learn_positional_embeddings=False,
    padding_idx=None,
    n_positions=1024,
    n_segments=0,
    variant='aiayn',
    activation='relu',
):
    super().__init__()
    self.embedding_size = embedding_size
    self.ffn_size = ffn_size
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.dim = embedding_size
    self.activation = activation
    self.variant = variant
    self.embeddings_scale = embeddings_scale
    self.dropout = nn.Dropout(p=dropout)  # --dropout
    self.n_positions = n_positions
    self.out_dim = embedding_size
    assert (
        embedding_size % n_heads == 0
    ), 'Transformer embedding size must be a multiple of n_heads'

    self.embeddings = embedding

    if (
        self.variant == 'xlm'
        or self.variant == 'prelayernorm'
        or self.variant == 'bart'
    ):
        self.norm_embeddings = LayerNorm(self.dim, eps=LAYER_NORM_EPS)
        if self.variant == 'xlm':
            warn_once(
                'DEPRECATED: XLM should only be used for backwards compatibility, '
                'as it involves a less-stable layernorm operation.'
            )
    elif self.variant == 'aiayn':
        pass
    else:
        raise ValueError("Can't handle --variant {}".format(self.variant))

    # create the positional embeddings
    self.position_embeddings = nn.Embedding(n_positions, embedding_size)
    if not learn_positional_embeddings:
        create_position_codes(
            n_positions, embedding_size, out=self.position_embeddings.weight
        )
    else:
        nn.init.normal_(self.position_embeddings.weight, 0, embedding_size ** -0.5)

    # build the model
    self.layers = nn.ModuleList()
    for _ in range(self.n_layers):
        self.layers.append(
            TransformerDecoderLayer(
                n_heads,
                embedding_size,
                ffn_size,
                attention_dropout=attention_dropout,
                relu_dropout=relu_dropout,
                dropout=dropout,
                activation=activation,
                variant=variant,
            )
        )