Example #1
    def index(self, vocab: Vocabulary):
        if self._indexed_labels is None:
            self._indexed_labels = [
                vocab.get_token_index(label,
                                      self._label_namespace)  # type: ignore
                for label in self.labels
            ]
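
For orientation, Vocabulary.get_token_index maps a string to an integer id inside a named namespace. A minimal sketch of that lookup, assuming an AllenNLP 0.x-style Vocabulary (the "labels" namespace here is illustrative):

from allennlp.data import Vocabulary

vocab = Vocabulary()
# In training this namespace is usually built by Vocabulary.from_instances;
# here we populate it by hand for illustration.
for label in ["PER", "ORG", "LOC"]:
    vocab.add_token_to_namespace(label, namespace="labels")

label_id = vocab.get_token_index("ORG", namespace="labels")
print(label_id)  # a small integer id, stable for this vocabulary
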
Example #2
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        dep_labels = [token.dep_ or 'NONE' for token in tokens]

        return {
            index_name: [
                vocabulary.get_token_index(dep_label, self.namespace)
                for dep_label in dep_labels
            ]
        }
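
A quick usage sketch for the dependency-label indexer above, assuming AllenNLP 0.x's Token (whose dep_ field stays None unless a parser filled it in); the namespace name is whatever self.namespace holds, shown here as "dep_labels":

from allennlp.data import Token, Vocabulary

vocab = Vocabulary()
for dep in ["nsubj", "dobj", "NONE"]:
    vocab.add_token_to_namespace(dep, namespace="dep_labels")

tokens = [Token(text="cats", dep_="nsubj"), Token(text="sleep")]
# The second token has no dep_, so it falls back to 'NONE'.
dep_labels = [t.dep_ or "NONE" for t in tokens]
indices = [vocab.get_token_index(d, "dep_labels") for d in dep_labels]
print(indices)
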
Example #3
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        tags = [
            'NONE' if token.ent_type_ is None else token.ent_type_
            for token in tokens
        ]

        return {
            index_name:
            [vocabulary.get_token_index(tag, self._namespace) for tag in tags]
        }
Example #4
    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        tags: List[str] = []

        for token in tokens:
            if self._coarse_tags:
                tag = token.pos_
            else:
                tag = token.tag_
            if tag is None:
                tag = 'NONE'

            tags.append(tag)

        return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
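
The _coarse_tags flag chooses between spaCy-style coarse tags (token.pos_, e.g. NOUN) and fine-grained tags (token.tag_, e.g. NNS). A small sketch of the two paths, assuming AllenNLP 0.x Token fields:

from allennlp.data import Token

token = Token(text="cats", pos_="NOUN", tag_="NNS")

coarse = token.pos_ if token.pos_ is not None else "NONE"  # used when _coarse_tags is True
fine = token.tag_ if token.tag_ is not None else "NONE"    # used when _coarse_tags is False
print(coarse, fine)  # NOUN NNS
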
Example #5
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        indices: List[int] = []

        for token in tokens:
            if getattr(token, 'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just use
                # this id instead.
                indices.append(token.text_id)
            else:
                text = token.text
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(
                    text, self.namespace))

        return {index_name: indices}
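
A sketch of the text_id shortcut in the indexer above: when a token already carries a precomputed id (for instance from a pretrained subword vocabulary), the Vocabulary lookup is skipped entirely. Assumes AllenNLP 0.x Token:

from allennlp.data import Token, Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("hello", namespace="tokens")

tokens = [Token(text="hello"), Token(text="hello", text_id=42)]
indices = []
for t in tokens:
    if getattr(t, "text_id", None) is not None:
        indices.append(t.text_id)  # bypasses the vocabulary entirely
    else:
        indices.append(vocab.get_token_index(t.text, "tokens"))
print(indices)  # the first id comes from the vocab, the second is 42 verbatim
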
Example #6
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[List[int]]]:
        indices: List[List[int]] = []
        for token in tokens:
            token_indices: List[int] = []
            if token.text is None:
                raise ConfigurationError(
                    'TokenCharactersIndexer needs a tokenizer that retains text'
                )
            # Strip a three-character language prefix (e.g. 'en_') before
            # splitting the word into characters.
            if re.search(r'^(en_|it_|es_|de_|zh_)', token.text):
                token_text = token.text[3:]
            else:
                token_text = token.text
            for character in self._character_tokenizer.tokenize(token_text):
                if getattr(character, 'text_id', None) is not None:
                    # `text_id` being set on the token means that we aren't using
                    # the vocab; we just use this id instead.
                    index = character.text_id
                else:
                    index = vocabulary.get_token_index(character.text,
                                                       self._namespace)
                token_indices.append(index)
            indices.append(token_indices)
        return {index_name: indices}
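
A usage sketch for the character-level path, assuming AllenNLP 0.x's CharacterTokenizer and a hand-built Vocabulary; note how a three-character language prefix such as 'en_' is stripped before the word is split into characters:

import re

from allennlp.data import Vocabulary
from allennlp.data.tokenizers import CharacterTokenizer

vocab = Vocabulary()
for ch in "chat":
    vocab.add_token_to_namespace(ch, namespace="token_characters")

tokenizer = CharacterTokenizer()
text = "en_chat"
if re.search(r'^(en_|it_|es_|de_|zh_)', text):
    text = text[3:]  # drop the language prefix, as the indexer above does

char_ids = [vocab.get_token_index(c.text, "token_characters")
            for c in tokenizer.tokenize(text)]
print(char_ids)  # one id per character of "chat"
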
Example #7
    @classmethod
    def _load(cls,
              config: Params,
              serialization_dir: str,
              weights_file: str = None,
              device=None) -> 'Model':
        """
        Instantiates an already-trained model, based on the experiment
        configuration and some optional overrides.
        """
        weights_file = weights_file or os.path.join(serialization_dir,
                                                    _DEFAULT_WEIGHTS)

        # Load the vocabulary that was saved alongside the model weights.
        vocab_dir = os.path.join(serialization_dir, 'vocabulary')
        vocab = Vocabulary.from_files(vocab_dir)

        model_params = config['model']

        # The experiment config tells us how to _train_ a model, including where to get pre-trained
        # embeddings from.  We're now _loading_ the model, so those embeddings will already be
        # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
        # want the code to look for it, so we remove it from the parameters here.
        remove_pretrained_embedding_params(model_params)
        model = cls.from_params(vocab=vocab, params=model_params)
        model_state = torch.load(weights_file, map_location=device_mapping(-1))
        if not isinstance(model, torch.nn.DataParallel):
            model_state = {
                re.sub(r'^module\.', '', k): v
                for k, v in model_state.items()
            }
        model.load_state_dict(model_state)
        model.set_vocab(vocab)
        model.to(device)

        return model
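
_load is normally reached through an archive-loading helper, but a direct call looks roughly like the following. A hedged sketch: the paths, config filename, and MyModel class are illustrative stand-ins for whatever cls is bound to in practice:

import os

from allennlp.common import Params

serialization_dir = "runs/experiment1"  # illustrative path
config = Params.from_file(os.path.join(serialization_dir, "config.json"))
# MyModel is a hypothetical Model subclass on which _load is defined.
model = MyModel._load(config,
                      serialization_dir=serialization_dir,
                      device="cpu")
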
Example #8
def train_model(params: Params):
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results.
    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    # Set up the environment.
    environment_params = params['environment']
    environment.set_seed(environment_params)
    create_serialization_dir(params)
    environment.prepare_global_logging(environment_params)
    environment.check_for_gpu(environment_params)
    if environment_params['gpu']:
        device = torch.device('cuda:{}'.format(environment_params['cuda_device']))
        environment.occupy_gpu(device)
    else:
        device = torch.device('cpu')
    params['trainer']['device'] = device

    # Load data.
    data_params = params['data']
    dataset = dataset_from_params(data_params,
                                  universal_postags=params['model'].get('universal_postags', False),
                                  generator_source_copy=data_params.get('source_copy', True),
                                  multilingual=params['model'].get('multilingual', False),
                                  extra_check=data_params.get('extra_check', False))
    train_data = dataset['train']
    dev_data = dataset.get('dev')
    test_data = dataset.get('test')
    train_mappings = dataset.get('train_mappings')
    train_replacements = dataset.get('train_replacements')

    # Vocabulary and iterator are created here.
    vocab_params = params.get('vocab', {})
    if "fixed_vocab" in vocab_params and vocab_params["fixed_vocab"]:
        vocab = Vocabulary.from_files("data/vocabulary")
    else:
        vocab = Vocabulary.from_instances(instances=train_data, **vocab_params)

    # Save the vocabulary now; initializing the model below can have the side
    # effect of expanding it.
    vocab.save_to_files(os.path.join(environment_params['serialization_dir'], "vocabulary"))
    train_iterator, dev_iterator, test_iterator = iterator_from_params(vocab, data_params['iterator'])
    if train_mappings is not None and train_replacements is not None:
        with open(os.path.join(environment_params['serialization_dir'], "trns_lex_missing.json"), "w", encoding='utf-8') as outfile:
            json.dump(train_mappings[-1], outfile, indent=4, default=serialize_sets)
        with open(os.path.join(environment_params['serialization_dir'], "trns_lexicalizations.json"), "w", encoding='utf-8') as outfile:
            json.dump(train_mappings[-2], outfile, indent=4, default=serialize_sets)
        with open(os.path.join(environment_params['serialization_dir'], "trns_rep.json"), "w", encoding='utf-8') as outfile:
            json.dump(train_replacements, outfile, indent=4, default=serialize_sets)
    # Build the model.
    model_params = params['model']
    model = getattr(Models, model_params['model_type']).from_params(vocab, model_params, environment_params['gpu'], train_mappings, train_replacements)
    logger.info(model)

    # Train
    trainer_params = params['trainer']
    no_grad_regexes = trainer_params['no_grad']
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        environment.get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    logger.info("Total nr of parameters Tunable (with gradient):")
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(pytorch_total_params)

    trainer = Trainer.from_params(model, train_data, dev_data, train_iterator, dev_iterator, trainer_params)

    serialization_dir = trainer_params['serialization_dir']
    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logger.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir)
        raise

    # Now tar up results
    archive_model(serialization_dir)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    if not isinstance(best_model, torch.nn.DataParallel):
        best_model_state = {re.sub(r'^module\.', '', k): v for k, v in best_model_state.items()}
    best_model.load_state_dict(best_model_state)

    return best_model
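
A hedged sketch of driving train_model from a parameter file, assuming an AllenNLP-style Params object loaded with Params.from_file; the path is illustrative, and the keys inside the file must match what the function reads (environment, data, vocab, model, trainer):

from allennlp.common import Params

params = Params.from_file("experiments/my_experiment.json")  # illustrative path
best_model = train_model(params)
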
Example #9
    def index(self, vocab: Vocabulary):
        if self.is_global_rule and self._rule_id is None:
            self._rule_id = vocab.get_token_index(self.rule, self._vocab_namespace)
Example #10
    def index(self, vocab: Vocabulary):
        if self._label_id is None:
            self._label_id = vocab.get_token_index(
                self.label, self._label_namespace)  # type: ignore
Example #11
    def index(self, vocab: Vocabulary):
        if self._indexed_labels is None and self.labels is not None:
            self._indexed_labels = [
                vocab.get_token_index(label, self._label_namespace)
                for label in self.labels
            ]
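
Examples #9 through #11 all follow the same lazy-caching pattern: the vocabulary lookup runs at most once, and later index calls are no-ops. A generic, self-contained sketch of the pattern (ToyLabelField is illustrative, not an AllenNLP class):

from allennlp.data import Vocabulary

class ToyLabelField:
    def __init__(self, label: str, namespace: str = "labels"):
        self.label = label
        self._label_namespace = namespace
        self._label_id = None

    def index(self, vocab: Vocabulary):
        # Only hit the vocabulary the first time; later calls reuse the cache.
        if self._label_id is None:
            self._label_id = vocab.get_token_index(self.label, self._label_namespace)

vocab = Vocabulary()
vocab.add_token_to_namespace("positive", namespace="labels")
field = ToyLabelField("positive")
field.index(vocab)
field.index(vocab)  # no-op: the id is already cached
print(field._label_id)
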