def __init__(
    self,
    optimizer,
    hard_reset,
    patience,
    decay,
    warmup_updates,
    warmup_rate,
    invsqrt_lr_decay_gamma,
):
    """
    invsqrt_lr_decay_gamma determines the cycle length of the inverse square
    root scheduler.

    When steps taken == invsqrt_lr_decay_gamma, the lr multiplier is 1.
    """
    super().__init__(hard_reset, warmup_updates, warmup_rate)
    self.invsqrt_lr_decay_gamma = invsqrt_lr_decay_gamma
    if invsqrt_lr_decay_gamma <= 0:
        warn_once(
            '--lr-scheduler invsqrt requires a value for '
            '--invsqrt-lr-decay-gamma. Defaulting to set gamma to '
            '--warmup-updates value for backwards compatibility.'
        )
        self.invsqrt_lr_decay_gamma = self.warmup_updates

    self.decay_factor = np.sqrt(max(1, self.invsqrt_lr_decay_gamma))
    self.scheduler = optim.lr_scheduler.LambdaLR(optimizer, self._invsqrt_lr)
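# A minimal sketch (an assumption, not necessarily ParlAI's exact _invsqrt_lr)
# of the lambda the LambdaLR above wraps, consistent with the docstring: the
# multiplier decays as 1/sqrt(step), and decay_factor = sqrt(gamma) normalizes
# it so the multiplier equals 1 exactly when step == invsqrt_lr_decay_gamma.
def _invsqrt_lr_sketch(self, step):
    return self.decay_factor / np.sqrt(max(1, step))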
def is_valid(self, obs):
    """
    Override from TorchAgent.

    Check to see if the label candidates contain the label.
    """
    if not self.ignore_bad_candidates:
        return super().is_valid(obs)

    if not super().is_valid(obs):
        return False

    # skip examples for which the set of label candidates does not
    # contain the label
    if 'labels_vec' in obs and 'label_candidates_vecs' in obs:
        cand_vecs = obs['label_candidates_vecs']
        label_vec = obs['labels_vec']
        matches = [x for x in cand_vecs if torch.equal(x, label_vec)]
        if len(matches) == 0:
            warn_once(
                'At least one example has a set of label candidates that '
                'does not contain the label.'
            )
            return False

    return True
def forward(self, input, encoder_state, incr_state=None):
    """
    Forward pass.

    :param LongTensor[batch,seqlen] input:
        The decoder inputs (partial or full decoded token IDs).
    :param encoder_state:
        Output from the encoder module forward pass.
    :param incr_state:
        The incremental state: a dictionary whose keys index the layers and
        whose values contain the incremental state for each layer.

    :return:
        (tensor, new_incr_state) tuple of the decoder output and the updated
        incremental state.
    """
    encoder_output, encoder_mask = encoder_state

    seq_len = input.size(1)
    positions = input.new(seq_len).long()
    positions = torch.arange(seq_len, out=positions).unsqueeze(0)

    if incr_state is not None:
        # We're doing incremental decoding, so select only the most recent position
        input = input[:, -1:]
        if positions is not None:
            positions = positions[:, -1:]
    else:
        incr_state = {}

    tensor = self.embeddings(input)
    if self.embeddings_scale:
        tensor = tensor * np.sqrt(self.dim)
    if self.variant == 'xlm':
        tensor = _normalize(tensor, self.norm_embeddings)
    if positions.max().item() > self.n_positions:
        warn_once(
            'You are inputting a sequence of {x} length, but only have '
            '--n-positions {y}. Set --truncate or increase --n-positions'.format(
                x=positions.max().item(), y=self.n_positions
            )
        )
    tensor = tensor + self.position_embeddings(positions).expand_as(tensor)
    tensor = self.dropout(tensor)  # --dropout

    new_incr_state = {}
    if getattr(self.layers, 'is_model_parallel', False):
        tensor, new_incr_state = self._apply_model_parallel(
            tensor, encoder_output, encoder_mask, incr_state
        )
    else:
        for idx, layer in enumerate(self.layers):
            tensor, new_incr_state[idx] = layer(
                x=tensor,
                encoder_output=encoder_output,
                encoder_mask=encoder_mask,
                incr_state=incr_state.get(idx),
            )

    if self.variant == 'prelayernorm':
        tensor = _normalize(tensor, self.norm_embeddings)

    return tensor, new_incr_state
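# Hedged usage sketch of the incremental-decoding path above (not ParlAI's
# actual generation loop; `decoder`, `output_proj`, and `start_idx` are
# hypothetical names). On the first call incr_state is None, so the full
# prefix is processed; on later calls the decoder itself slices input[:, -1:].
def greedy_decode_sketch(decoder, output_proj, encoder_state, start_idx, max_len):
    seq = torch.full((1, 1), start_idx, dtype=torch.long)
    incr_state = None
    for _ in range(max_len):
        tensor, incr_state = decoder(seq, encoder_state, incr_state)
        logits = output_proj(tensor[:, -1, :])  # score only the newest position
        next_tok = logits.argmax(dim=-1, keepdim=True)
        seq = torch.cat([seq, next_tok], dim=1)
    return seq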
def valid_step(self, metrics_dict):
    if self._is_lr_warming_up():
        # we're not done warming up, so don't start using validation
        # metrics to adjust schedule
        return
    if 'loss' not in metrics_dict:
        # nothing to step on, just skip
        warn_once("LR scheduler expected to see loss metric, but didn't.")
        return
    self.scheduler.step(metrics_dict['loss'])
def _path(opt, persona, use_cands):
    # Build the data if it doesn't exist.
    build(opt)
    datatype = opt['datatype'].split(':')[0]
    if datatype == 'test':
        warn_once("WARNING: Test set not included. Setting datatype to valid.")
        datatype = 'valid'
    dt = datatype + '_' + persona
    cands = '' if use_cands else '_no_cands'
    return os.path.join(opt['datapath'], 'ConvAI2', dt + cands + '.txt')
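# Hedged example of the resulting path (the persona value 'self_original' is
# illustrative; the exact set of personas depends on the ConvAI2 task variant):
#   _path(opt, 'self_original', use_cands=False)
#   -> <datapath>/ConvAI2/valid_self_original_no_cands.txt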
def __choose_persona_from_topic(self, topic):
    topic = topic.strip()
    persona_strings = self.wow_topics_to_persona_strings_map[topic]
    for p in persona_strings:
        for persona in self.personas:
            if p in persona:
                return persona
    if self.no_persona_is_error:
        raise ValueError(f'ERROR: Found no persona for topic: {topic}.')
    else:
        warn_once(
            f'Found no persona for topic: {topic}. Returning first persona.'
        )
        return self.personas[0]
def __init__(self, opt, shared=None):
    if 'stream' in opt['datatype']:
        warn_once(
            'Warning: this teacher is not compatible with StreamDialogData!'
        )
        # StreamDialogData works by reading directly from a text file without any
        # alteration, but this teacher must append a WoW topic string to the context
        # of the first example of each episode.
        assert opt['datatype'].endswith(':stream')
        opt['datatype'] = opt['datatype'][: -len(':stream')]
    self.persona_topicifier = PersonaTopicifier(
        opt=opt, should_have_personas=True, should_have_topics=False
    )
    super().__init__(opt, shared=shared)
def _add_knowledge_to_act(self, act):
    act = super()._add_knowledge_to_act(act)
    if self.opt.get('prepend_gold_knowledge', False):
        warn_once(
            'Prepending selected knowledge to dialogue input. '
            'If this was not intended behavior, please run with the '
            'flag --prepend-gold-knowledge False'
        )
        knowledge_text = ' '.join(
            [TOKEN_KNOWLEDGE, act['checked_sentence'], TOKEN_END_KNOWLEDGE]
        )
        new_text = '\n'.join([knowledge_text, act['text']])
        act.force_set('text', new_text)
    else:
        warn_once(
            'Not prepending selected knowledge to dialogue input. '
            'If this was not intended behavior, please run with the '
            'flag --prepend-gold-knowledge True'
        )
    return act
def evaluate_response(self, observation: Message, labels: List[str]) -> None:
    """
    Compute all required text-based metrics based on an observation and labels.
    """
    prediction = observation.get('text', None)

    self.add('exs', SumMetric(1))

    if prediction is not None:
        self.add('accuracy', ExactMatchMetric.compute(prediction, labels))
        self.add('f1', F1Metric.compute(prediction, labels))

        for k in range(1, 5):  # 1..4
            if f'bleu-{k}' in self._metrics_list:
                self.add(f'bleu-{k}', BleuMetric.compute(prediction, labels, k))
        # if any of the rouges are in the list
        if self._metrics_list & ROUGE_METRICS:
            r1, r2, rL = RougeMetric.compute_many(prediction, labels)
            if 'rouge-1' in self._metrics_list:
                self.add('rouge-1', r1)
            if 'rouge-2' in self._metrics_list:
                self.add('rouge-2', r2)
            if 'rouge-L' in self._metrics_list:
                self.add('rouge-L', rL)

    # Ranking metrics.
    self._update_ranking_metrics(observation, labels)

    # User-reported metrics
    if 'metrics' in observation:
        for uk, v in observation['metrics'].items():
            if uk in ALL_METRICS:
                # don't let the user override our metrics
                uk = f'USER_{uk}'
            assert isinstance(uk, str), type(uk)
            if not isinstance(v, Metric):
                warn_once(f'Metric {uk} is assumed to be averaged per example.')
                v = AverageMetric(v)
            assert isinstance(v, Metric)
            self.add(uk, v)

    # always flush at the end of processing this response
    self.flush()
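# Hedged illustration of the user-reported metrics path above: plain numbers in
# a Message's 'metrics' dict are wrapped in AverageMetric, and names that
# collide with built-in metrics (assuming 'f1' is in ALL_METRICS) are
# namespaced with a 'USER_' prefix, e.g.:
#   obs = Message({'text': 'hi', 'metrics': {'my_score': 0.75, 'f1': 0.5}})
#   evaluate_response(obs, ['hi'])  # records 'my_score' and 'USER_f1'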
def _get_labels(self, batch):
    """
    Obtain the correct labels.

    Raises a ``KeyError`` if one of the labels is not in the class list.
    """
    try:
        labels_indices_list = [self.class_dict[label] for label in batch.labels]
    except KeyError as e:
        warn_once('One of your labels is not in the class list.')
        raise e

    labels_tensor = torch.LongTensor(labels_indices_list)
    if self.use_cuda:
        labels_tensor = labels_tensor.cuda()

    return labels_tensor
def _setup_personas_to_wow_topics(
    self,
) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
    persona_strings_to_topics = defaultdict(list)
    topics_to_persona_strings = defaultdict(list)
    with open(self.topic_to_persona_path, 'r') as f:
        for line in f:
            match = re.fullmatch(r'([^[]+): (\[.+\])\n', line)
            topic = match.group(1)
            persona_strings = eval(match.group(2))
            assert isinstance(persona_strings, list)
            topics_to_persona_strings[topic] = persona_strings
            for str_ in persona_strings:
                persona_strings_to_topics[str_].append(topic)

    warn_once(
        f'FINISHED MAPPING personas to topics, got: '
        f'{len(persona_strings_to_topics)} persona strings to map to topics.'
    )
    return topics_to_persona_strings, persona_strings_to_topics
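# Expected line format for the file at self.topic_to_persona_path, inferred
# from the regex and the eval() above (a topic, then a Python-style list of
# persona strings), e.g.:
#   cooking: ['i love to cook.', 'i am a chef.']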
def _compile_data(self) -> List[List[dict]]:
    """
    Compile data to be saved for faster future use.
    """
    warn_once(f'Starting to compile {self.num_episodes():d} episodes.')
    all_data = []
    for episode_idx in tqdm(range(self.num_episodes())):
        episode_data = []
        entry_idx = 0
        while True:
            example_data = self._get_example(
                episode_idx=episode_idx, entry_idx=entry_idx
            )
            episode_data.append(example_data)
            if example_data['episode_done']:
                all_data.append(episode_data)
                break
            else:
                entry_idx += 1
    return all_data
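# Structure of the compiled data, per the loop above: a list of episodes, each
# a list of example dicts, where only the last example of an episode has
# example_data['episode_done'] == True.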
def __init__(self, opt, shared=None):
    self.persona_topicifier = PersonaTopicifier(
        opt=opt, should_have_personas=False, should_have_topics=False
    )
    super().__init__(opt, shared=shared)

    if (
        self.remove_political_convos is True
        or self.opt.get('deepmoji') is not None
        or self.opt.get('fasttextloc') is not None
        or self.opt.get('prepend', -1) > 0
    ):
        raise NotImplementedError(
            'Removing political conversations or using deepmoji, fasttextloc, or '
            'prepend not supported with this teacher.'
        )

    # Running over all examples is really slow because the process of finding a WoW
    # topic is expensive, so let's load cached data with personas and topics unless
    # --recompile-persona-topic-data is True
    if opt.get('recompile_persona_topic_data', self.RECOMPILE_DEFAULT):
        self.data_path = (
            _cached_data_path(
                opt=self.opt, experiencer_side_only=self.experiencer_side_only
            )
            + '.recompiled'
        )
        warn_once(f'Compiling data file for {self.data_path}.')
        self.persona_topic_data = self._compile_data()
        warn_once(f'Saving data to {self.data_path}.')
        with open(self.data_path, 'w') as f_write:
            json.dump(self.persona_topic_data, f_write)
    else:
        self.data_path = _cached_data_path(
            opt=self.opt, experiencer_side_only=self.experiencer_side_only
        )
        warn_once(f'Loading cached data from {self.data_path}.')
        with open(self.data_path, 'r') as f_read:
            self.persona_topic_data = json.load(f_read)
def train_step(self, batch):
    """
    Train on a single batch of examples.
    """
    self._maybe_invalidate_fixed_encs_cache()
    if batch.text_vec is None and batch.image is None:
        return
    self.model.train()
    self.zero_grad()

    cands, cand_vecs, label_inds = self._build_candidates(
        batch, source=self.candidates, mode='train'
    )
    try:
        scores = self.score_candidates(batch, cand_vecs)
        loss = self.criterion(scores, label_inds)
        self.record_local_metric('mean_loss', AverageMetric.many(loss))
        loss = loss.mean()
        self.backward(loss)
        self.update_params()
    except RuntimeError as e:
        # catch out of memory exceptions during fwd/bck (skip batch)
        if 'out of memory' in str(e):
            print(
                '| WARNING: ran out of memory, skipping batch. '
                'if this happens frequently, decrease batchsize or '
                'truncate the inputs to the model.'
            )
            return Output()
        else:
            raise e

    # Get train predictions
    if self.candidates == 'batch':
        self._get_batch_train_metrics(scores)
        return Output()
    if not self.opt.get('train_predict', False):
        warn_once(
            'Some training metrics are omitted for speed. Set the flag '
            '`--train-predict` to calculate train metrics.'
        )
        return Output()
    return self._get_train_preds(scores, label_inds, cands, cand_vecs)
def batchify_image_features(self, batch: Batch) -> Batch:
    """
    Return the image features as a Tensor of the correct type.

    Fill in missing feature vectors. Here, we require image features to be saved
    in `batch` as a Tensor for passing through the image encoder. This is
    required for data_parallel.
    """
    # Checks/formatting of batch.image
    bsz = self._get_batch_size(batch)
    if batch.image is None or len(batch.image) == 0:
        batch.image = [None] * bsz
    else:
        assert len(batch.image) == bsz

    # Process all image feature vectors, or add in zero vectors if missing
    processed_features_list = []
    processed_zero_features = self._process_image_features(
        torch.zeros((self.image_features_dim,))
    )
    for orig_features in batch.image:
        if isinstance(orig_features, torch.Tensor):
            processed_features_list.append(
                self._process_image_features(orig_features)
            )
        else:
            if orig_features is not None:
                warn_once(
                    'Unsupported image feature format. Image features will be ignored!'
                )
            processed_features_list.append(processed_zero_features)

    # Turn into batchsize x image_features_dim for DataParallel
    batch.image = torch.stack(processed_features_list)

    return batch
def compute_many(
    guess: str, answers: List[str]
) -> Tuple[Optional['RougeMetric'], Optional['RougeMetric'], Optional['RougeMetric']]:
    """
    Compute ROUGE score between guess and *any* answer.

    Done with compute_many due to increased efficiency.

    :return: (rouge-1, rouge-2, rouge-L)
    """
    # possible global initialization
    global rouge
    if rouge is None:
        return None, None, None
    if RougeMetric._evaluator is None:
        RougeMetric._evaluator = rouge.Rouge(
            metrics=['rouge-n', 'rouge-l'], max_n=2
        )
    try:
        scores = [
            RougeMetric._evaluator.get_scores(
                normalize_answer(guess), normalize_answer(a)
            )
            for a in answers
        ]
    except LookupError:
        warn_once(
            'ROUGE requires nltk punkt tokenizer. Please run '
            '`python -c "import nltk; nltk.download(\'punkt\')"`'
        )
        return None, None, None

    scores_rouge1 = max(score['rouge-1']['r'] for score in scores)
    scores_rouge2 = max(score['rouge-2']['r'] for score in scores)
    scores_rougeL = max(score['rouge-l']['r'] for score in scores)
    return (
        RougeMetric(scores_rouge1),
        RougeMetric(scores_rouge2),
        RougeMetric(scores_rougeL),
    )
def _build_candidates(self, batch, source, mode):
    """
    Build a candidate set for this batch.

    :param batch:
        a Batch object (defined in torch_agent.py)
    :param source:
        the source from which candidates should be built, one of
        ['batch', 'batch-all-cands', 'inline', 'fixed']
    :param mode:
        'train' or 'eval'

    :return: tuple of (cands, cand_vecs, label_inds)

        cands: a [num_cands] list of (text) candidates
            OR a [batchsize] list of such lists if source=='inline'
        cand_vecs: a padded [num_cands, seqlen] LongTensor of vectorized candidates
            OR a [batchsize, num_cands, seqlen] LongTensor if source=='inline'
        label_inds: a [bsz] LongTensor of the indices of the labels for each
            example from its respective candidate set

    Possible sources of candidates:

    * batch: the set of all labels in this batch
        Use all labels in the batch as the candidate set (with all but the
        example's label being treated as negatives).
        Note: with this setting, the candidate set is identical for all
        examples in a batch. This option may be undesirable if it is possible
        for duplicate labels to occur in a batch, since the second instance of
        the correct label will be treated as a negative.
    * batch-all-cands: the set of all candidates in this batch
        Use all candidates in the batch as the candidate set.
        Note 1: This can result in a very large number of candidates.
        Note 2: In this case we will deduplicate candidates.
        Note 3: just like with 'batch', the candidate set is identical for all
        examples in a batch.
    * inline: batch_size lists, one list per example
        If each example comes with a list of possible candidates, use those.
        Note: With this setting, each example will have its own candidate set.
    * fixed: one global candidate list, provided in a file from the user
        If self.fixed_candidates is not None, use a set of fixed candidates for
        all examples.
        Note: this setting is not recommended for training unless the universe
        of possible candidates is very small.
    * vocab: one global candidate list, extracted from the vocabulary with the
        exception of self.NULL_IDX.
    """
    label_vecs = batch.label_vec  # [bsz] list of lists of LongTensors
    label_inds = None
    batchsize = (
        batch.text_vec.size(0)
        if batch.text_vec is not None
        else batch.image.size(0)
    )

    if label_vecs is not None:
        assert label_vecs.dim() == 2

    if source == 'batch':
        warn_once(
            '[ Executing {} mode with batch labels as set of candidates. ]'
            ''.format(mode)
        )
        if batchsize == 1:
            warn_once(
                "[ Warning: using candidate source 'batch' and observed a "
                "batch of size 1. This may be due to uneven batch sizes at "
                "the end of an epoch. ]"
            )
        if label_vecs is None:
            raise ValueError(
                "If using candidate source 'batch', then batch.label_vec cannot "
                "be None."
            )

        cands = batch.labels
        cand_vecs = label_vecs
        label_inds = label_vecs.new_tensor(range(batchsize))

    elif source == 'batch-all-cands':
        warn_once(
            '[ Executing {} mode with all candidates provided in the batch ]'
            ''.format(mode)
        )
        if batch.candidate_vecs is None:
            raise ValueError(
                "If using candidate source 'batch-all-cands', then batch."
                "candidate_vecs cannot be None. If your task does not have "
                "inline candidates, consider using one of "
                "--{m}={{'batch','fixed','vocab'}}."
                "".format(m='candidates' if mode == 'train' else 'eval-candidates')
            )
        # initialize the list of cands with the labels
        cands = []
        all_cands_vecs = []
        # dictionary used for deduplication
        cands_to_id = {}
        for i, cands_for_sample in enumerate(batch.candidates):
            for j, cand in enumerate(cands_for_sample):
                if cand not in cands_to_id:
                    cands.append(cand)
                    cands_to_id[cand] = len(cands_to_id)
                    all_cands_vecs.append(batch.candidate_vecs[i][j])
        cand_vecs, _ = self._pad_tensor(all_cands_vecs)
        label_inds = label_vecs.new_tensor(
            [cands_to_id[label] for label in batch.labels]
        )

    elif source == 'inline':
        warn_once(
            '[ Executing {} mode with provided inline set of candidates ]'
            ''.format(mode)
        )
        if batch.candidate_vecs is None:
            raise ValueError(
                "If using candidate source 'inline', then batch.candidate_vecs "
                "cannot be None. If your task does not have inline candidates, "
                "consider using one of --{m}={{'batch','fixed','vocab'}}."
                "".format(m='candidates' if mode == 'train' else 'eval-candidates')
            )

        cands = batch.candidates
        cand_vecs = padded_3d(
            batch.candidate_vecs,
            self.NULL_IDX,
            use_cuda=self.use_cuda,
            fp16friendly=self.fp16,
        )
        if label_vecs is not None:
            label_inds = label_vecs.new_empty((batchsize,))
            bad_batch = False
            for i, label_vec in enumerate(label_vecs):
                label_vec_pad = label_vec.new_zeros(cand_vecs[i].size(1)).fill_(
                    self.NULL_IDX
                )
                if cand_vecs[i].size(1) < len(label_vec):
                    label_vec = label_vec[0 : cand_vecs[i].size(1)]
                label_vec_pad[0 : label_vec.size(0)] = label_vec
                label_inds[i] = self._find_match(cand_vecs[i], label_vec_pad)
                if label_inds[i] == -1:
                    bad_batch = True
            if bad_batch:
                if self.ignore_bad_candidates and not self.is_training:
                    label_inds = None
                else:
                    raise RuntimeError(
                        'At least one of your examples has a set of label candidates '
                        'that does not contain the label. To ignore this error '
                        'set `--ignore-bad-candidates True`.'
                    )

    elif source == 'fixed':
        if self.fixed_candidates is None:
            raise ValueError(
                "If using candidate source 'fixed', then you must provide the path "
                "to a file of candidates with the flag --fixed-candidates-path or "
                "the name of a task with --fixed-candidates-task."
            )
        warn_once(
            "[ Executing {} mode with a common set of fixed candidates "
            "(n = {}). ]".format(mode, len(self.fixed_candidates))
        )

        cands = self.fixed_candidates
        cand_vecs = self.fixed_candidate_vecs
        if label_vecs is not None:
            label_inds = label_vecs.new_empty((batchsize,))
            bad_batch = False
            for batch_idx, label_vec in enumerate(label_vecs):
                max_c_len = cand_vecs.size(1)
                label_vec_pad = label_vec.new_zeros(max_c_len).fill_(self.NULL_IDX)
                if max_c_len < len(label_vec):
                    label_vec = label_vec[0:max_c_len]
                label_vec_pad[0 : label_vec.size(0)] = label_vec
                label_inds[batch_idx] = self._find_match(cand_vecs, label_vec_pad)
                if label_inds[batch_idx] == -1:
                    bad_batch = True
            if bad_batch:
                if self.ignore_bad_candidates and not self.is_training:
                    label_inds = None
                else:
                    raise RuntimeError(
                        'At least one of your examples has a set of label candidates '
                        'that does not contain the label. To ignore this error '
                        'set `--ignore-bad-candidates True`.'
                    )

    elif source == 'vocab':
        warn_once(
            '[ Executing {} mode with tokens from vocabulary as candidates. ]'
            ''.format(mode)
        )
        cands = self.vocab_candidates
        cand_vecs = self.vocab_candidate_vecs
        # NOTE: label_inds is None here, as we will not find the label in
        # the set of vocab candidates

    else:
        raise Exception("Unrecognized source: %s" % source)

    return (cands, cand_vecs, label_inds)
def create_agent_from_opt_file(opt: Opt):
    """
    Load agent options and module from file if opt file exists.

    Checks to see if the file opt['model_file'] + ".opt" exists; if so, load up
    the options from that file and use them to create an agent, loading the
    model type from the file and overriding any options specified in it when
    instantiating the agent.

    If that file does not exist, return None.
    """
    model_file = opt['model_file']
    optfile = model_file + '.opt'
    if os.path.isfile(optfile):
        new_opt = Opt.load(optfile)
        # TODO we need a better way to say these options are never copied...
        if 'datapath' in new_opt:
            # never use the datapath from an opt dump
            del new_opt['datapath']
        if 'batchindex' in new_opt:
            # This saved variable can cause trouble if we switch to BS=1 at test time
            del new_opt['batchindex']
        # only override opts specified in 'override' dict
        if opt.get('override'):
            for k, v in opt['override'].items():
                if str(v) != str(new_opt.get(k, None)):
                    print(
                        "[ warning: overriding opt['{}'] to {} ("
                        "previously: {} )]".format(k, v, new_opt.get(k, None))
                    )
                new_opt[k] = v
        model_class = load_agent_module(new_opt['model'])

        # check for model version
        if hasattr(model_class, 'model_version'):
            curr_version = new_opt.get('model_version', 0)
            if curr_version != model_class.model_version():
                model = new_opt['model']
                m = (
                    'It looks like you are trying to load an older version of'
                    ' the selected model. Change your model argument to use '
                    'the old version from core/agents/legacy_agents: for '
                    'example: `-m legacy:{m}:{v}` or '
                    '`--model parlai.agents.legacy_agents.{m}.{m}_v{v}:{c}`'
                )
                if '.' not in model:
                    # give a specific error message if it's easy
                    raise RuntimeError(
                        m.format(m=model, v=curr_version, c=model_class.__name__)
                    )
                else:
                    # otherwise a generic one
                    raise RuntimeError(
                        m.format(m='modelname', v=curr_version, c='ModelAgent')
                    )

        if hasattr(model_class, 'upgrade_opt'):
            new_opt = model_class.upgrade_opt(new_opt)

        # add model arguments to new_opt if they aren't in new_opt already
        for k, v in opt.items():
            if k not in new_opt:
                new_opt[k] = v
        new_opt['model_file'] = model_file
        if not new_opt.get('dict_file'):
            new_opt['dict_file'] = model_file + '.dict'
        elif new_opt.get('dict_file') and not os.path.isfile(new_opt['dict_file']):
            old_dict_file = new_opt['dict_file']
            new_opt['dict_file'] = model_file + '.dict'
            if not os.path.isfile(new_opt['dict_file']):
                warn_once(
                    'WARNING: Neither the specified dict file ({}) nor the '
                    '`model_file`.dict file ({}) exists, check to make sure either '
                    'is correct. This may manifest as a shape mismatch later '
                    'on.'.format(old_dict_file, new_opt['dict_file'])
                )

        # if we want to load weights from --init-model, compare opts with
        # loaded ones
        compare_init_model_opts(opt, new_opt)
        return model_class(new_opt)
    else:
        return None
def lr_scheduler_factory(cls, opt, optimizer, states, hard_reset=False):
    """
    Create the learning rate scheduler, and assign it to self.scheduler.

    This scheduler will be updated upon a call to receive_metrics. May also
    create self.warmup_scheduler, if appropriate.

    :param opt opt:
        Arguments received by torch_agent
    :param optimizer optimizer:
        Optimizer being used for training. May be wrapped in
        fp16_optimizer_wrapper depending on whether fp16 is used.
    :param state_dict states:
        Possible state_dict provided by model checkpoint, for restoring LR state.
    :param bool hard_reset:
        If true, the LR scheduler should ignore the state dictionary.

    :return: ParlAILRScheduler object
    """
    patience = opt.get('lr_scheduler_patience', 3)
    decay = opt.get('lr_scheduler_decay', 0.5)
    warmup_updates = opt.get('warmup_updates', -1)
    warmup_rate = opt.get('warmup_rate', 1e-4)
    max_lr_steps = opt.get('max_lr_steps', -1)
    invsqrt_lr_decay_gamma = opt.get('invsqrt_lr_decay_gamma', -1)

    if opt.get('lr_scheduler') == 'none':
        return None
    elif decay == 1.0:
        warn_once(
            "Your LR decay is set to 1.0. Assuming you meant you wanted "
            "to disable learning rate scheduling. Adjust --lr-scheduler-decay "
            "if this is not correct."
        )
        return None
    elif opt.get('lr_scheduler') == 'reduceonplateau':
        scheduler = ReduceOnPlateauLRScheduler(
            optimizer, hard_reset, patience, decay, warmup_updates, warmup_rate
        )
    elif opt.get('lr_scheduler') == 'fixed':
        scheduler = FixedLRScheduler(
            optimizer, hard_reset, patience, decay, warmup_updates, warmup_rate
        )
    elif opt.get('lr_scheduler') == 'invsqrt':
        scheduler = InvSqrtLRScheduler(
            optimizer,
            hard_reset,
            patience,
            decay,
            warmup_updates,
            warmup_rate,
            invsqrt_lr_decay_gamma,
        )
    elif opt.get('lr_scheduler') == 'cosine':
        scheduler = CosineLRScheduler(
            optimizer,
            hard_reset,
            patience,
            decay,
            warmup_updates,
            warmup_rate,
            max_lr_steps,
        )
    elif opt.get('lr_scheduler') == 'linear':
        scheduler = LinearLRScheduler(
            optimizer,
            hard_reset,
            patience,
            decay,
            warmup_updates,
            warmup_rate,
            max_lr_steps,
        )
    else:
        raise ValueError(
            "Don't know what to do with --lr-scheduler '{}'".format(
                opt.get('lr_scheduler')
            )
        )

    # time to load LR state from the checkpoint, if possible.
    if (
        # there is already an old LR scheduler saved on disk
        states
        # and there was a scheduler in the dump
        and 'lr_scheduler_type' in states
        # and the old LR scheduler is different
        and states.get('lr_scheduler_type') != opt['lr_scheduler']
        # and we're not already using a fresh scheduler
        and not hard_reset
    ):
        # the LR scheduler changed, start things fresh
        warn_once(
            f"LR scheduler ({opt['lr_scheduler']}) is different from saved "
            f"({states.get('lr_scheduler_type')}). Starting fresh!"
        )
        hard_reset = True

    if not hard_reset:
        # do the actual loading (if possible)
        scheduler.load_state(states)

    # setup warmup scheduler after loading saved scheduler
    scheduler._init_warmup_scheduler(optimizer, states)

    return scheduler
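# Hedged usage sketch: building an invsqrt scheduler from a minimal opt dict.
# The keys mirror the opt.get(...) calls above; omitted keys fall back to the
# defaults shown there. The owning class name is assumed here.
opt_sketch = {
    'lr_scheduler': 'invsqrt',
    'warmup_updates': 4000,
    'warmup_rate': 1e-4,
    'invsqrt_lr_decay_gamma': 4000,
}
# scheduler = ParlAILRScheduler.lr_scheduler_factory(
#     opt_sketch, optimizer, states={}, hard_reset=False
# )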
def forward(self, input, positions=None, segments=None):
    """
    Forward pass.

    :param LongTensor[batch,seqlen] input:
        The input IDs
    :param LongTensor[batch,seqlen] positions:
        If provided, positions to use for the position embeddings; otherwise
        they are computed from the non-padding token positions.
    :param LongTensor[batch,seqlen] segments:
        If provided, additionally adds ``segments`` as extra embedding features.
    """
    mask = input != self.padding_idx
    if positions is None:
        positions = (mask.cumsum(dim=1, dtype=torch.int64) - 1).clamp_(min=0)
    tensor = self.embeddings(input)
    if self.embeddings_scale:
        tensor = tensor * np.sqrt(self.dim)

    if positions.max().item() > self.n_positions:
        warn_once(
            'You are inputting a sequence of {x} length, but only have '
            '--n-positions {y}. Set --truncate or increase --n-positions'.format(
                x=positions.max().item(), y=self.n_positions
            )
        )
    position_embs = self.position_embeddings(positions).expand_as(tensor)
    tensor = tensor + position_embs

    if self.n_segments >= 1:
        if segments is None:
            segments = torch.zeros_like(input)
        tensor = tensor + self.segment_embeddings(segments)

    if self.variant == 'xlm':
        tensor = _normalize(tensor, self.norm_embeddings)

    # --dropout on the embeddings
    tensor = self.dropout(tensor)

    tensor *= mask.unsqueeze(-1).type_as(tensor)
    if getattr(self.layers, 'is_model_parallel', False):
        # factored out for readability. It is equivalent to the other condition
        tensor = self._apply_model_parallel(tensor, mask)
    else:
        for i in range(self.n_layers):
            tensor = self.layers[i](tensor, mask)

    if self.variant == 'prelayernorm':
        tensor = _normalize(tensor, self.norm_embeddings)

    tensor *= self.output_scaling
    if self.reduction_type == 'first':
        return tensor[:, 0, :]
    elif self.reduction_type == 'max':
        return tensor.max(dim=1)[0]
    elif self.reduction_type == 'mean':
        divisor = mask.float().sum(dim=1).unsqueeze(-1).clamp(min=1).type_as(tensor)
        output = tensor.sum(dim=1) / divisor
        return output
    elif self.reduction_type is None or 'none' in self.reduction_type:
        return tensor, mask
    else:
        raise ValueError(
            "Can't handle --reduction-type {}".format(self.reduction_type)
        )
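# Output contract summary for the encoder forward above, with hidden size dim:
#   reduction_type='first'     -> [batch, dim]  (first token's state)
#   reduction_type='max'       -> [batch, dim]  (max over time)
#   reduction_type='mean'      -> [batch, dim]  (mask-aware mean over time)
#   reduction_type=None/'none' -> ([batch, seqlen, dim], mask)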