Example #1
    def __init__(
        self,
        optimizer,
        hard_reset,
        patience,
        decay,
        warmup_updates,
        warmup_rate,
        invsqrt_lr_decay_gamma,
    ):
        """
        invsqrt_lr_decay_gamma determines the cycle length of the inverse square root
        scheduler.

        When steps taken == invsqrt_lr_decay_gamma, the lr multiplier is 1
        """
        super().__init__(hard_reset, warmup_updates, warmup_rate)
        self.invsqrt_lr_decay_gamma = invsqrt_lr_decay_gamma
        if invsqrt_lr_decay_gamma <= 0:
            warn_once('--lr-scheduler invsqrt requires a value for '
                      '--invsqrt-lr-decay-gamma. Defaulting to set gamma to '
                      '--warmup-updates value for backwards compatibility.')
            self.invsqrt_lr_decay_gamma = self.warmup_updates

        self.decay_factor = np.sqrt(max(1, self.invsqrt_lr_decay_gamma))
        self.scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                     self._invsqrt_lr)
Example #2
    def is_valid(self, obs):
        """
        Override from TorchAgent.

        Check to see if label candidates contain the label.
        """
        if not self.ignore_bad_candidates:
            return super().is_valid(obs)

        if not super().is_valid(obs):
            return False

        # skip examples for which the set of label candidates do not
        # contain the label
        if 'labels_vec' in obs and 'label_candidates_vecs' in obs:
            cand_vecs = obs['label_candidates_vecs']
            label_vec = obs['labels_vec']
            matches = [x for x in cand_vecs if torch.equal(x, label_vec)]
            if len(matches) == 0:
                warn_once(
                    'At least one example has a set of label candidates that '
                    'does not contain the label.')
                return False

        return True
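A standalone demonstration (made-up tensors) of the containment test above: torch.equal returns True only for a candidate identical to the label vector, so the comprehension collects exact matches.

import torch

label_vec = torch.tensor([7, 3, 9])
cand_vecs = [torch.tensor([1, 2, 3]), torch.tensor([7, 3, 9])]
matches = [x for x in cand_vecs if torch.equal(x, label_vec)]
assert len(matches) == 1  # the label is present in the candidate set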
Example #3
    def forward(self, input, encoder_state, incr_state=None):
        """
        Forward pass.

        :param LongTensor[batch,seqlen] input:
            The decoder inputs (partial or full decoded token IDs).
        :param encoder_state:
            Output from the encoder module forward pass.
        :param incr_state:
            The incremental state: a dictionary whose keys index the layers and whose
            values contain the incremental state for each layer.
        """
        encoder_output, encoder_mask = encoder_state

        seq_len = input.size(1)
        positions = input.new(seq_len).long()
        positions = torch.arange(seq_len, out=positions).unsqueeze(0)

        if incr_state is not None:
            # We're doing incremental decoding, so select only the most recent position
            input = input[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]
        else:
            incr_state = {}

        tensor = self.embeddings(input)
        if self.embeddings_scale:
            tensor = tensor * np.sqrt(self.dim)
        if self.variant == 'xlm':
            tensor = _normalize(tensor, self.norm_embeddings)
        if positions.max().item() > self.n_positions:
            warn_once(
                'You are inputting a sequence of {x} length, but only have '
                '--n-positions {y}. Set --truncate or increase --n-positions'.
                format(x=positions.max().item(), y=self.n_positions))
        tensor = tensor + self.position_embeddings(positions).expand_as(tensor)
        tensor = self.dropout(tensor)  # --dropout

        new_incr_state = {}
        if getattr(self.layers, 'is_model_parallel', False):
            tensor, new_incr_state = self._apply_model_parallel(
                tensor, encoder_output, encoder_mask, incr_state)
        else:
            for idx, layer in enumerate(self.layers):
                tensor, new_incr_state[idx] = layer(
                    x=tensor,
                    encoder_output=encoder_output,
                    encoder_mask=encoder_mask,
                    incr_state=incr_state.get(idx),
                )

        if self.variant == 'prelayernorm':
            tensor = _normalize(tensor, self.norm_embeddings)

        return tensor, new_incr_state
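A standalone sketch of the position handling above: on an incremental decoding step only the newest token is embedded, so the position slice keeps just index seq_len - 1 (values here are made up).

import torch

seq_len = 5
positions = torch.arange(seq_len).unsqueeze(0)  # [[0, 1, 2, 3, 4]]
print(positions[:, -1:])                        # tensor([[4]]): position of the newest token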
Example #4
    def valid_step(self, metrics_dict):
        if self._is_lr_warming_up():
            # we're not done warming up, so don't start using validation
            # metrics to adjust schedule
            return
        if 'loss' not in metrics_dict:
            # nothing to step on, just skip
            warn_once("LR scheduler expected to see loss metric, but didn't.")
            return
        self.scheduler.step(metrics_dict['loss'])
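If self.scheduler here is torch's ReduceLROnPlateau (as in the 'reduceonplateau' option of Example #19), step expects the validation loss on every call; a hedged sketch of that underlying scheduler, with a made-up model and hyperparameters:

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.5, patience=3)
scheduler.step(0.42)  # pass the validation loss, like metrics_dict['loss'] above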
Example #5
def _path(opt, persona, use_cands):
    # Build the data if it doesn't exist.
    build(opt)
    datatype = opt['datatype'].split(':')[0]
    if datatype == 'test':
        warn_once("WARNING: Test set not included. Setting datatype to valid.")
        datatype = 'valid'
    dt = datatype + '_' + persona
    cands = '' if use_cands else '_no_cands'
    return os.path.join(opt['datapath'], 'ConvAI2', dt + cands + '.txt')
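A worked example of the fallback above, with made-up opt values (note that calling _path also triggers build(opt)):

opt = {'datatype': 'test:stream', 'datapath': '/data'}
path = _path(opt, 'self_original', use_cands=False)
# warns that the test set is not included, falls back to 'valid', and returns
# '/data/ConvAI2/valid_self_original_no_cands.txt'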
Example #6
    def __choose_persona_from_topic(self, topic):
        topic = topic.strip()
        persona_strings = self.wow_topics_to_persona_strings_map[topic]
        for p in persona_strings:
            for persona in self.personas:
                if p in persona:
                    return persona
        if self.no_persona_is_error:
            raise ValueError(f'ERROR: Found no persona for topic: {topic}.')
        else:
            warn_once(
                f'Found no persona for topic: {topic}. Returning first persona.'
            )
            return self.personas[0]
Example #7
    def __init__(self, opt, shared=None):
        if 'stream' in opt['datatype']:
            warn_once(
                'Warning: this teacher is not compatible with StreamDialogData!'
            )
            # StreamDialogData works by reading directly from a text file without any
            # alteration, but this teacher must append a WoW topic string to the context
            # of the first example of each episode.
            assert opt['datatype'].endswith(':stream')
            opt['datatype'] = opt['datatype'][:-len(':stream')]
        self.persona_topicifier = PersonaTopicifier(opt=opt,
                                                    should_have_personas=True,
                                                    should_have_topics=False)
        super().__init__(opt, shared=shared)
Example #8
    def _add_knowledge_to_act(self, act):
        act = super()._add_knowledge_to_act(act)
        if self.opt.get('prepend_gold_knowledge', False):
            warn_once('Prepending selected knowledge to dialogue input. '
                      'If this was not intended behavior, please run with the '
                      'flag --prepend-gold-knowledge False')
            knowledge_text = ' '.join([
                TOKEN_KNOWLEDGE, act['checked_sentence'], TOKEN_END_KNOWLEDGE
            ])
            new_text = '\n'.join([knowledge_text, act['text']])
            act.force_set('text', new_text)
        else:
            warn_once('Not prepending selected knowledge to dialogue input. '
                      'If this was not intended behavior, please run with the '
                      'flag --prepend-gold-knowledge True')
        return act
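A sketch of the resulting input format; the token string values below are assumptions, not taken from the source:

TOKEN_KNOWLEDGE, TOKEN_END_KNOWLEDGE = '__knowledge__', '__endknowledge__'  # assumed values
knowledge_text = ' '.join(
    [TOKEN_KNOWLEDGE, 'The Eiffel Tower is in Paris.', TOKEN_END_KNOWLEDGE])
print('\n'.join([knowledge_text, 'Tell me about the Eiffel Tower.']))
# __knowledge__ The Eiffel Tower is in Paris. __endknowledge__
# Tell me about the Eiffel Tower.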
Example #9
    def evaluate_response(self, observation: Message,
                          labels: List[str]) -> None:
        """
        Compute all required text-based metrics based on an observation and labels.
        """
        prediction = observation.get('text', None)

        self.add('exs', SumMetric(1))

        if prediction is not None:
            self.add('accuracy', ExactMatchMetric.compute(prediction, labels))
            self.add('f1', F1Metric.compute(prediction, labels))

            for k in range(1, 5):  # 1..4
                if f'bleu-{k}' in self._metrics_list:
                    self.add(f'bleu-{k}',
                             BleuMetric.compute(prediction, labels, k))
            # if any of the rouges are in the list
            if self._metrics_list & ROUGE_METRICS:
                r1, r2, rL = RougeMetric.compute_many(prediction, labels)
                if 'rouge-1' in self._metrics_list:
                    self.add('rouge-1', r1)
                if 'rouge-2' in self._metrics_list:
                    self.add('rouge-2', r2)
                if 'rouge-L' in self._metrics_list:
                    self.add('rouge-L', rL)

        # Ranking metrics.
        self._update_ranking_metrics(observation, labels)

        # User-reported metrics
        if 'metrics' in observation:
            for uk, v in observation['metrics'].items():
                if uk in ALL_METRICS:
                    # don't let the user override our metrics
                    uk = f'USER_{uk}'
                assert isinstance(uk, str), type(uk)
                if not isinstance(v, Metric):
                    warn_once(
                        f'Metric {uk} is assumed to be averaged per example.')
                    v = AverageMetric(v)
                assert isinstance(v, Metric)
                self.add(uk, v)

        # always flush at the end of processing this response
        self.flush()
Example #10
    def _get_labels(self, batch):
        """
        Obtain the correct labels.

        Raises a ``KeyError`` if one of the labels is not in the class list.
        """
        try:
            labels_indices_list = [
                self.class_dict[label] for label in batch.labels
            ]
        except KeyError as e:
            warn_once('One of your labels is not in the class list.')
            raise e

        labels_tensor = torch.LongTensor(labels_indices_list)
        if self.use_cuda:
            labels_tensor = labels_tensor.cuda()
        return labels_tensor
Example #11
    def _setup_personas_to_wow_topics(
        self,
    ) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
        persona_strings_to_topics = defaultdict(list)
        topics_to_persona_strings = defaultdict(list)
        with open(self.topic_to_persona_path, 'r') as f:
            for line in f:
                match = re.fullmatch(r'([^[]+): (\[.+\])\n', line)
                topic = match.group(1)
                persona_strings = eval(match.group(2))
                assert isinstance(persona_strings, list)
                topics_to_persona_strings[topic] = persona_strings
                for str_ in persona_strings:
                    persona_strings_to_topics[str_].append(topic)

        warn_once(
            f'FINISHED MAPPING personas to topics, got: '
            f'{len(persona_strings_to_topics)} persona strings to map to topics.'
        )
        return topics_to_persona_strings, persona_strings_to_topics
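A worked example of the fullmatch pattern above, on a made-up mapping line (the real file contents are not shown in the source):

import re

line = "Cooking: ['I love to cook.', 'I went to culinary school.']\n"
m = re.fullmatch(r'([^[]+): (\[.+\])\n', line)
print(m.group(1))        # Cooking
print(eval(m.group(2)))  # ['I love to cook.', 'I went to culinary school.']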
Example #12
    def _compile_data(self) -> List[List[dict]]:
        """
        Compile data to be saved for faster future use.
        """
        warn_once(f'Starting to compile {self.num_episodes():d} episodes.')
        all_data = []
        for episode_idx in tqdm(range(self.num_episodes())):
            episode_data = []
            entry_idx = 0
            while True:
                example_data = self._get_example(episode_idx=episode_idx,
                                                 entry_idx=entry_idx)
                episode_data.append(example_data)
                if example_data['episode_done']:
                    all_data.append(episode_data)
                    break
                else:
                    entry_idx += 1

        return all_data
Example #13
    def __init__(self, opt, shared=None):
        self.persona_topicifier = PersonaTopicifier(opt=opt,
                                                    should_have_personas=False,
                                                    should_have_topics=False)
        super().__init__(opt, shared=shared)

        if (self.remove_political_convos is True
                or self.opt.get('deepmoji') is not None
                or self.opt.get('fasttextloc') is not None
                or self.opt.get('prepend', -1) > 0):
            raise NotImplementedError(
                'Removing political conversations or using deepmoji, fasttextloc, or '
                'prepend not supported with this teacher.')

        # Running over all examples is really slow because the process of finding a WoW
        # topic is expensive, so let's load cached data with personas and topics unless
        # --recompile-persona-topic-data is True
        if opt.get('recompile_persona_topic_data', self.RECOMPILE_DEFAULT):
            self.data_path = (_cached_data_path(
                opt=self.opt, experiencer_side_only=self.experiencer_side_only)
                              + '.recompiled')
            warn_once(f'Compiling data file for {self.data_path}.')
            self.persona_topic_data = self._compile_data()
            warn_once(f'Saving data to {self.data_path}.')
            with open(self.data_path, 'w') as f_write:
                json.dump(self.persona_topic_data, f_write)
        else:
            self.data_path = _cached_data_path(
                opt=self.opt, experiencer_side_only=self.experiencer_side_only)
            warn_once(f'Loading cached data from {self.data_path}.')
            with open(self.data_path, 'r') as f_read:
                self.persona_topic_data = json.load(f_read)
Example #14
    def train_step(self, batch):
        """
        Train on a single batch of examples.
        """
        self._maybe_invalidate_fixed_encs_cache()
        if batch.text_vec is None and batch.image is None:
            return
        self.model.train()
        self.zero_grad()

        cands, cand_vecs, label_inds = self._build_candidates(
            batch, source=self.candidates, mode='train')
        try:
            scores = self.score_candidates(batch, cand_vecs)
            loss = self.criterion(scores, label_inds)
            self.record_local_metric('mean_loss', AverageMetric.many(loss))
            loss = loss.mean()
            self.backward(loss)
            self.update_params()
        except RuntimeError as e:
            # catch out of memory exceptions during fwd/bck (skip batch)
            if 'out of memory' in str(e):
                print('| WARNING: ran out of memory, skipping batch. '
                      'if this happens frequently, decrease batchsize or '
                      'truncate the inputs to the model.')
                return Output()
            else:
                raise e

        # Get train predictions
        if self.candidates == 'batch':
            self._get_batch_train_metrics(scores)
            return Output()
        if not self.opt.get('train_predict', False):
            warn_once(
                "Some training metrics are omitted for speed. Set the flag "
                "`--train-predict` to calculate train metrics.")
            return Output()
        return self._get_train_preds(scores, label_inds, cands, cand_vecs)
Example #15
    def batchify_image_features(self, batch: Batch) -> Batch:
        """
        Return the image features as a Tensor of the correct type.

        Fill in missing feature vectors. Here, we require image features to be saved in
        `batch` as a Tensor for passing through the image encoder. This is required for
        data_parallel.
        """

        # Checks/formatting of batch.image
        bsz = self._get_batch_size(batch)
        if batch.image is None or len(batch.image) == 0:
            batch.image = [None] * bsz
        else:
            assert len(batch.image) == bsz

        # Process all image feature vectors, or add in zero vectors if missing
        processed_features_list = []
        processed_zero_features = self._process_image_features(
            torch.zeros((self.image_features_dim,))
        )
        for orig_features in batch.image:
            if isinstance(orig_features, torch.Tensor):
                processed_features_list.append(
                    self._process_image_features(orig_features)
                )
            else:
                if orig_features is not None:
                    warn_once(
                        'Unsupported image feature format. Image features will be ignored!'
                    )
                processed_features_list.append(processed_zero_features)

        # Turn into batchsize x image_features_dim for DataParallel
        batch.image = torch.stack(processed_features_list)

        return batch
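A minimal sketch of the zero-fill behavior above: a missing entry becomes a zero feature vector so torch.stack can build a dense [batchsize, image_features_dim] tensor (the dimension below is made up):

import torch

dim = 4
zero = torch.zeros(dim)
feats = [torch.ones(dim), None, 2 * torch.ones(dim)]  # one missing image
stacked = torch.stack(
    [f if isinstance(f, torch.Tensor) else zero for f in feats])
print(stacked.shape)  # torch.Size([3, 4])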
Example #16
    def compute_many(
        guess: str, answers: List[str]
    ) -> Tuple[Optional['RougeMetric'], Optional['RougeMetric'],
               Optional['RougeMetric']]:
        """
        Compute ROUGE score between guess and *any* answer.

        Done with compute_many due to increased efficiency.

        :return: (rouge-1, rouge-2, rouge-L)
        """
        # possible global initialization
        global rouge
        if rouge is None:
            return None, None, None
        if RougeMetric._evaluator is None:
            RougeMetric._evaluator = rouge.Rouge(
                metrics=['rouge-n', 'rouge-l'], max_n=2)
        try:
            scores = [
                RougeMetric._evaluator.get_scores(normalize_answer(guess),
                                                  normalize_answer(a))
                for a in answers
            ]
        except LookupError:
            warn_once('ROUGE requires nltk punkt tokenizer. Please run '
                      '`python -c "import nltk; nltk.download(\'punkt\')"`')
            return None, None, None

        scores_rouge1 = max(score['rouge-1']['r'] for score in scores)
        scores_rouge2 = max(score['rouge-2']['r'] for score in scores)
        scores_rougeL = max(score['rouge-l']['r'] for score in scores)
        return (
            RougeMetric(scores_rouge1),
            RougeMetric(scores_rouge2),
            RougeMetric(scores_rougeL),
        )
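A hedged usage sketch; it assumes the py-rouge backend and nltk punkt data are installed (otherwise the branches above return three Nones):

r1, r2, rL = RougeMetric.compute_many(
    'the cat sat on the mat', ['a cat sat on a mat'])
if r1 is not None:
    print(r1, r2, rL)  # recall-based rouge-1, rouge-2, and rouge-L scores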
Example #17
    def _build_candidates(self, batch, source, mode):
        """
        Build a candidate set for this batch.

        :param batch:
            a Batch object (defined in torch_agent.py)
        :param source:
            the source from which candidates should be built, one of
            ['batch', 'batch-all-cands', 'inline', 'fixed']
        :param mode:
            'train' or 'eval'

        :return: tuple of tensors (cands, cand_vecs, label_inds)

            label_inds: A [bsz] LongTensor of the indices of the labels for each
                example from its respective candidate set
            cands: A [num_cands] list of (text) candidates
                OR a [batchsize] list of such lists if source=='inline'
            cand_vecs: A padded [num_cands, seqlen] LongTensor of vectorized candidates
                OR a [batchsize, num_cands, seqlen] LongTensor if source=='inline'

        Possible sources of candidates:

            * batch: the set of all labels in this batch
                Use all labels in the batch as the candidate set (with all but the
                example's label being treated as negatives).
                Note: with this setting, the candidate set is identical for all
                examples in a batch. This option may be undesirable if it is possible
                for duplicate labels to occur in a batch, since the second instance of
                the correct label will be treated as a negative.
            * batch-all-cands: the set of all candidates in this batch
                Use all candidates in the batch as candidate set.
                Note 1: This can result in a very large number of candidates.
                Note 2: In this case we will deduplicate candidates.
                Note 3: just like with 'batch' the candidate set is identical
                for all examples in a batch.
            * inline: batch_size lists, one list per example
                If each example comes with a list of possible candidates, use those.
                Note: With this setting, each example will have its own candidate set.
            * fixed: one global candidate list, provided in a file from the user
                If self.fixed_candidates is not None, use a set of fixed candidates for
                all examples.
                Note: this setting is not recommended for training unless the
                universe of possible candidates is very small.
            * vocab: one global candidate list, extracted from the vocabulary with the
                exception of self.NULL_IDX.
        """
        label_vecs = batch.label_vec  # [bsz] list of lists of LongTensors
        label_inds = None
        batchsize = (batch.text_vec.size(0)
                     if batch.text_vec is not None else batch.image.size(0))

        if label_vecs is not None:
            assert label_vecs.dim() == 2

        if source == 'batch':
            warn_once(
                '[ Executing {} mode with batch labels as set of candidates. ]'
                ''.format(mode))
            if batchsize == 1:
                warn_once(
                    "[ Warning: using candidate source 'batch' and observed a "
                    "batch of size 1. This may be due to uneven batch sizes at "
                    "the end of an epoch. ]")
            if label_vecs is None:
                raise ValueError(
                    "If using candidate source 'batch', then batch.label_vec cannot be "
                    "None.")

            cands = batch.labels
            cand_vecs = label_vecs
            label_inds = label_vecs.new_tensor(range(batchsize))

        elif source == 'batch-all-cands':
            warn_once(
                '[ Executing {} mode with all candidates provided in the batch ]'
                ''.format(mode))
            if batch.candidate_vecs is None:
                raise ValueError(
                    "If using candidate source 'batch-all-cands', then batch."
                    "candidate_vecs cannot be None. If your task does not have "
                    "inline candidates, consider using one of "
                    "--{m}={{'batch','fixed','vocab'}}."
                    "".format(m='candidates' if mode ==
                              'train' else 'eval-candidates'))
            # initialize the list of cands with the labels
            cands = []
            all_cands_vecs = []
            # dictionary used for deduplication
            cands_to_id = {}
            for i, cands_for_sample in enumerate(batch.candidates):
                for j, cand in enumerate(cands_for_sample):
                    if cand not in cands_to_id:
                        cands.append(cand)
                        cands_to_id[cand] = len(cands_to_id)
                        all_cands_vecs.append(batch.candidate_vecs[i][j])
            cand_vecs, _ = self._pad_tensor(all_cands_vecs)
            label_inds = label_vecs.new_tensor(
                [cands_to_id[label] for label in batch.labels])

        elif source == 'inline':
            warn_once(
                '[ Executing {} mode with provided inline set of candidates ]'
                ''.format(mode))
            if batch.candidate_vecs is None:
                raise ValueError(
                    "If using candidate source 'inline', then batch.candidate_vecs "
                    "cannot be None. If your task does not have inline candidates, "
                    "consider using one of --{m}={{'batch','fixed','vocab'}}."
                    "".format(m='candidates' if mode ==
                              'train' else 'eval-candidates'))

            cands = batch.candidates
            cand_vecs = padded_3d(
                batch.candidate_vecs,
                self.NULL_IDX,
                use_cuda=self.use_cuda,
                fp16friendly=self.fp16,
            )
            if label_vecs is not None:
                label_inds = label_vecs.new_empty((batchsize))
                bad_batch = False
                for i, label_vec in enumerate(label_vecs):
                    label_vec_pad = label_vec.new_zeros(
                        cand_vecs[i].size(1)).fill_(self.NULL_IDX)
                    if cand_vecs[i].size(1) < len(label_vec):
                        label_vec = label_vec[0:cand_vecs[i].size(1)]
                    label_vec_pad[0:label_vec.size(0)] = label_vec
                    label_inds[i] = self._find_match(cand_vecs[i],
                                                     label_vec_pad)
                    if label_inds[i] == -1:
                        bad_batch = True
                if bad_batch:
                    if self.ignore_bad_candidates and not self.is_training:
                        label_inds = None
                    else:
                        raise RuntimeError(
                            'At least one of your examples has a set of label candidates '
                            'that does not contain the label. To ignore this error '
                            'set `--ignore-bad-candidates True`.')

        elif source == 'fixed':
            if self.fixed_candidates is None:
                raise ValueError(
                    "If using candidate source 'fixed', then you must provide the path "
                    "to a file of candidates with the flag --fixed-candidates-path or "
                    "the name of a task with --fixed-candidates-task.")
            warn_once(
                "[ Executing {} mode with a common set of fixed candidates "
                "(n = {}). ]".format(mode, len(self.fixed_candidates)))

            cands = self.fixed_candidates
            cand_vecs = self.fixed_candidate_vecs

            if label_vecs is not None:
                label_inds = label_vecs.new_empty((batchsize))
                bad_batch = False
                for batch_idx, label_vec in enumerate(label_vecs):
                    max_c_len = cand_vecs.size(1)
                    label_vec_pad = label_vec.new_zeros(max_c_len).fill_(
                        self.NULL_IDX)
                    if max_c_len < len(label_vec):
                        label_vec = label_vec[0:max_c_len]
                    label_vec_pad[0:label_vec.size(0)] = label_vec
                    label_inds[batch_idx] = self._find_match(
                        cand_vecs, label_vec_pad)
                    if label_inds[batch_idx] == -1:
                        bad_batch = True
                if bad_batch:
                    if self.ignore_bad_candidates and not self.is_training:
                        label_inds = None
                    else:
                        raise RuntimeError(
                            'At least one of your examples has a set of label candidates '
                            'that does not contain the label. To ignore this error '
                            'set `--ignore-bad-candidates True`.')

        elif source == 'vocab':
            warn_once(
                '[ Executing {} mode with tokens from vocabulary as candidates. ]'
                ''.format(mode))
            cands = self.vocab_candidates
            cand_vecs = self.vocab_candidate_vecs
            # NOTE: label_inds is None here, as we will not find the label in
            # the set of vocab candidates
        else:
            raise Exception("Unrecognized source: %s" % source)

        return (cands, cand_vecs, label_inds)
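A standalone sketch of the 'batch' source described in the docstring: scores form a [bsz, bsz] matrix whose diagonal holds each example's own label, so label_inds is simply arange(bsz) and every other label in the batch acts as a negative.

import torch
import torch.nn.functional as F

bsz = 3
scores = torch.randn(bsz, bsz)  # [examples x in-batch label candidates]
label_inds = torch.arange(bsz)  # example i's label is candidate i
loss = F.cross_entropy(scores, label_inds, reduction='none')  # one loss per example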
Example #18
def create_agent_from_opt_file(opt: Opt):
    """
    Load agent options and module from file if opt file exists.

    Check to see if the file opt['model_file'] + ".opt" exists; if so, load the
    options from that file and use them to create an agent, loading the model
    type from the file and overriding any of its options with values from
    opt['override'] when instantiating the agent.

    If that file does not exist, return None.
    """
    model_file = opt['model_file']
    optfile = model_file + '.opt'
    if os.path.isfile(optfile):
        new_opt = Opt.load(optfile)
        # TODO we need a better way to say these options are never copied...
        if 'datapath' in new_opt:
            # never use the datapath from an opt dump
            del new_opt['datapath']
        if 'batchindex' in new_opt:
            # This saved variable can cause trouble if we switch to BS=1 at test time
            del new_opt['batchindex']
        # only override opts specified in 'override' dict
        if opt.get('override'):
            for k, v in opt['override'].items():
                if str(v) != str(new_opt.get(k, None)):
                    print("[ warning: overriding opt['{}'] to {} ("
                          "previously: {} )]".format(k, v,
                                                     new_opt.get(k, None)))
                new_opt[k] = v

        model_class = load_agent_module(new_opt['model'])

        # check for model version
        if hasattr(model_class, 'model_version'):
            curr_version = new_opt.get('model_version', 0)
            if curr_version != model_class.model_version():
                model = new_opt['model']
                m = ('It looks like you are trying to load an older version of'
                     ' the selected model. Change your model argument to use '
                     'the old version from core/agents/legacy_agents: for '
                     'example: `-m legacy:{m}:{v}` or '
                     '`--model parlai.agents.legacy_agents.{m}.{m}_v{v}:{c}`')
                if '.' not in model:
                    # give specific error message if it's easy
                    raise RuntimeError(
                        m.format(m=model,
                                 v=curr_version,
                                 c=model_class.__name__))
                else:
                    # otherwise generic one
                    raise RuntimeError(
                        m.format(m='modelname', v=curr_version,
                                 c='ModelAgent'))

        if hasattr(model_class, 'upgrade_opt'):
            new_opt = model_class.upgrade_opt(new_opt)

        # add model arguments to new_opt if they aren't in new_opt already
        for k, v in opt.items():
            if k not in new_opt:
                new_opt[k] = v
        new_opt['model_file'] = model_file
        if not new_opt.get('dict_file'):
            # initialize old_dict_file so the warning below can reference it
            old_dict_file = None
            new_opt['dict_file'] = model_file + '.dict'
        elif new_opt.get('dict_file') and not os.path.isfile(
                new_opt['dict_file']):
            old_dict_file = new_opt['dict_file']
            new_opt['dict_file'] = model_file + '.dict'
        if not os.path.isfile(new_opt['dict_file']):
            warn_once(
                'WARNING: Neither the specified dict file ({}) nor the '
                '`model_file`.dict file ({}) exists, check to make sure either '
                'is correct. This may manifest as a shape mismatch later '
                'on.'.format(old_dict_file, new_opt['dict_file']))

        # if we want to load weights from --init-model, compare opts with
        # loaded ones
        compare_init_model_opts(opt, new_opt)
        return model_class(new_opt)
    else:
        return None
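A toy illustration of the override semantics above, using a plain dict in place of Opt (values are made up):

new_opt = {'model': 'transformer/generator', 'lr': 0.001}
override = {'lr': 0.01}
for k, v in override.items():
    if str(v) != str(new_opt.get(k, None)):
        print("[ warning: overriding opt['{}'] to {} ("
              "previously: {} )]".format(k, v, new_opt.get(k, None)))
    new_opt[k] = v  # only keys listed in 'override' are replaced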
Example #19
    def lr_scheduler_factory(cls, opt, optimizer, states, hard_reset=False):
        """
        Create the learning rate scheduler, and assign it to self.scheduler. This
        scheduler will be updated upon a call to receive_metrics. May also create
        self.warmup_scheduler, if appropriate.

        :param opt opt:
            Arguments received by torch_agent
        :param optimizer optimizer:
            Optimizer being used for training. May be wrapped in
            fp16_optimizer_wrapper depending on whether fp16 is used.
        :param state_dict states:
            Possible state_dict provided by model checkpoint, for restoring
            LR state.
        :param bool hard_reset:
            If true, the LR scheduler should ignore the state dictionary.
        :return: ParlAILRScheduler object
        """

        patience = opt.get('lr_scheduler_patience', 3)
        decay = opt.get('lr_scheduler_decay', 0.5)
        warmup_updates = opt.get('warmup_updates', -1)
        warmup_rate = opt.get('warmup_rate', 1e-4)
        max_lr_steps = opt.get('max_lr_steps', -1)
        invsqrt_lr_decay_gamma = opt.get('invsqrt_lr_decay_gamma', -1)

        if opt.get('lr_scheduler') == 'none':
            return None
        elif decay == 1.0:
            warn_once(
                "Your LR decay is set to 1.0. Assuming you meant you wanted "
                "to disable learning rate scheduling. Adjust --lr-scheduler-decay "
                "if this is not correct.")
            return None
        elif opt.get('lr_scheduler') == 'reduceonplateau':
            scheduler = ReduceOnPlateauLRScheduler(optimizer, hard_reset,
                                                   patience, decay,
                                                   warmup_updates, warmup_rate)
        elif opt.get('lr_scheduler') == 'fixed':
            scheduler = FixedLRScheduler(optimizer, hard_reset, patience,
                                         decay, warmup_updates, warmup_rate)
        elif opt.get('lr_scheduler') == 'invsqrt':
            scheduler = InvSqrtLRScheduler(
                optimizer,
                hard_reset,
                patience,
                decay,
                warmup_updates,
                warmup_rate,
                invsqrt_lr_decay_gamma,
            )
        elif opt.get('lr_scheduler') == 'cosine':
            scheduler = CosineLRScheduler(
                optimizer,
                hard_reset,
                patience,
                decay,
                warmup_updates,
                warmup_rate,
                max_lr_steps,
            )
        elif opt.get('lr_scheduler') == 'linear':
            scheduler = LinearLRScheduler(
                optimizer,
                hard_reset,
                patience,
                decay,
                warmup_updates,
                warmup_rate,
                max_lr_steps,
            )
        else:
            raise ValueError(
                "Don't know what to do with --lr-scheduler '{}'".format(
                    opt.get('lr_scheduler')))

        # time to load LR state from the checkpoint, if possible.
        if (
                # there is already an old LR scheduler saved on disk
                states
                # and there was a scheduler in the dump
                and 'lr_scheduler_type' in states
                # and the old LR scheduler is different
                and states.get('lr_scheduler_type') != opt['lr_scheduler']
                # and we're not already using a fresh scheduler
                and not hard_reset):
            # the LR scheduler changed, start things fresh
            warn_once(
                f"LR scheduler ({opt['lr_scheduler']}) is different from saved "
                f"({states.get('lr_scheduler_type')}). Starting fresh!")
            hard_reset = True

        if not hard_reset:
            # do the actual loading (if possible)
            scheduler.load_state(states)

        # setup warmup scheduler after loading saved scheduler
        scheduler._init_warmup_scheduler(optimizer, states)

        return scheduler
Example #20
    def forward(self, input, positions=None, segments=None):
        """
        Forward pass.

        :param LongTensor[batch,seqlen] input:
            The input IDs
        :param LongTensor[batch,seqlen] positions:
            Positions for the input IDs. If None, computed from the padding
            mask (1 means attend, 0 means ignore).
        :param LongTensor[batch,seqlen] segments:
            If provided, additionally adds ``segments`` as extra embedding features.
        """
        mask = input != self.padding_idx
        if positions is None:
            positions = (mask.cumsum(dim=1, dtype=torch.int64) -
                         1).clamp_(min=0)
        tensor = self.embeddings(input)
        if self.embeddings_scale:
            tensor = tensor * np.sqrt(self.dim)

        if positions.max().item() > self.n_positions:
            warn_once(
                'You are inputting a sequence of {x} length, but only have '
                '--n-positions {y}. Set --truncate or increase --n-positions'.
                format(x=positions.max().item(), y=self.n_positions))
        position_embs = self.position_embeddings(positions).expand_as(tensor)
        tensor = tensor + position_embs

        if self.n_segments >= 1:
            if segments is None:
                segments = torch.zeros_like(input)
            tensor = tensor + self.segment_embeddings(segments)

        if self.variant == 'xlm':
            tensor = _normalize(tensor, self.norm_embeddings)

        # --dropout on the embeddings
        tensor = self.dropout(tensor)

        tensor *= mask.unsqueeze(-1).type_as(tensor)

        if getattr(self.layers, 'is_model_parallel', False):
            # factored out for readability; equivalent to the per-layer loop
            # in the else branch
            tensor = self._apply_model_parallel(tensor, mask)
        else:
            for i in range(self.n_layers):
                tensor = self.layers[i](tensor, mask)

        if self.variant == 'prelayernorm':
            tensor = _normalize(tensor, self.norm_embeddings)
        tensor *= self.output_scaling
        if self.reduction_type == 'first':
            return tensor[:, 0, :]
        elif self.reduction_type == 'max':
            return tensor.max(dim=1)[0]
        elif self.reduction_type == 'mean':
            divisor = mask.float().sum(dim=1).unsqueeze(-1).clamp(
                min=1).type_as(tensor)
            output = tensor.sum(dim=1) / divisor
            return output
        elif self.reduction_type is None or 'none' in self.reduction_type:
            return tensor, mask
        else:
            raise ValueError("Can't handle --reduction-type {}".format(
                self.reduction_type))
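A standalone sketch of the 'mean' reduction above: padding positions are zeroed by the mask and the sum is divided by the count of real tokens (shapes are made up):

import torch

tensor = torch.randn(2, 4, 8)                      # [batch, seqlen, dim]
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # 1 = token, 0 = padding
tensor = tensor * mask.unsqueeze(-1).type_as(tensor)
divisor = mask.float().sum(dim=1).unsqueeze(-1).clamp(min=1).type_as(tensor)
print((tensor.sum(dim=1) / divisor).shape)         # torch.Size([2, 8])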