Code example #1
    def pop_choice(self, key: str, choices: List[Any], default_to_first_choice: bool = False) -> Any:
        """
        Gets the value of ``key`` in the ``params`` dictionary, ensuring that the value is one of
        the given choices. Note that this `pops` the key from params, modifying the dictionary,
        consistent with how parameters are processed in this codebase.

        Parameters
        ----------
        key: str
            Key to get the value from in the param dictionary
        choices: List[Any]
            A list of valid options for values corresponding to ``key``.  For example, if you're
            specifying the type of encoder to use for some part of your model, the choices might be
            the list of encoder classes we know about and can instantiate.  If the value we find in
            the param dictionary is not in ``choices``, we raise a ``ConfigurationError``, because
            the user specified an invalid value in their parameter file.
        default_to_first_choice: bool, optional (default=False)
            If this is ``True``, we allow the ``key`` to not be present in the parameter
            dictionary.  If the key is not present, we will return the first choice
            in the ``choices`` list.  If this is ``False``, we raise a
            ``ConfigurationError``, because specifying the ``key`` is required (e.g., you `have` to
            specify your model class when running an experiment, but you can feel free to use
            default settings for encoders if you want).
        """
        default = choices[0] if default_to_first_choice else self.DEFAULT
        value = self.pop(key, default)
        if value not in choices:
            key_str = self.history + key
            message = '%s not in acceptable choices for %s: %s' % (value, key_str, str(choices))
            raise ConfigurationError(message)
        return value
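
A minimal usage sketch (the Params construction and key names are illustrative; the class itself appears in code examples #10 and #16):

params = Params({"encoder_type": "lstm"})
encoder_type = params.pop_choice("encoder_type", ["lstm", "gru"])  # -> "lstm"
# "encoder_type" has now been popped from params.

# With default_to_first_choice=True, a missing key falls back to choices[0]:
activation = Params({}).pop_choice("activation", ["relu", "tanh"],
                                   default_to_first_choice=True)   # -> "relu"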
Code example #2
File: elmo.py Project: fulQuan/ELMo
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 num_output_representations: int,
                 requires_grad: bool = False,
                 do_layer_norm: bool = False,
                 dropout: float = 0.5,
                 module: torch.nn.Module = None) -> None:
        super(Elmo, self).__init__()

        logging.info("Initializing ELMo")
        if module is not None:
            if options_file is not None or weight_file is not None:
                raise ConfigurationError(
                    "Don't provide options_file or weight_file with module")
            self._elmo_lstm = module
        else:
            self._elmo_lstm = _ElmoBiLm(options_file,
                                        weight_file,
                                        requires_grad=requires_grad)
        self._dropout = Dropout(p=dropout)
        self._scalar_mixes: Any = []
        for k in range(num_output_representations):
            scalar_mix = ScalarMix(self._elmo_lstm.num_layers,
                                   do_layer_norm=do_layer_norm)
            self.add_module('scalar_mix_{}'.format(k), scalar_mix)
            self._scalar_mixes.append(scalar_mix)
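
A hedged construction sketch: the constructor accepts either the two pretrained-model files or a ready-made module, never both (the paths below are placeholders):

elmo = Elmo(options_file="path/to/options.json",
            weight_file="path/to/weights.hdf5",
            num_output_representations=2,
            dropout=0.5)
# num_output_representations controls how many independently weighted
# ScalarMix combinations of the biLM layers are created.

# Supplying a module together with either file raises a ConfigurationError:
# Elmo(options_file="path/to/options.json", weight_file=None,
#      num_output_representations=1, module=my_bilm)  # -> error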
Code example #3
File: elmo.py Project: fulQuan/ELMo
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False) -> None:
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(
            options_file, weight_file, requires_grad=requires_grad)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError(
                'We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(
            input_size=options['lstm']['projection_dim'],
            hidden_size=options['lstm']['projection_dim'],
            cell_size=options['lstm']['dim'],
            num_layers=options['lstm']['n_layers'],
            memory_cell_clip_value=options['lstm']['cell_clip'],
            state_projection_clip_value=options['lstm']['proj_clip'],
            requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1
Code example #4
 def add_subclass_to_registry(subclass: Type[T]):
     # Add to registry, raise an error if key has already been used.
     if name in registry:
         message = "Cannot register %s as %s; name already in use for %s" % (
             name, cls.__name__, registry[name].__name__)
         raise ConfigurationError(message)
     registry[name] = subclass
     return subclass
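
Note that add_subclass_to_registry is a closure: name, cls, and registry come from an enclosing register classmethod. A reconstructed sketch of that enclosing scope (not the verbatim source):

    @classmethod
    def register(cls: Type[T], name: str):
        registry = Registrable._registry[cls]

        def add_subclass_to_registry(subclass: Type[T]):
            # ... the body shown above ...
            registry[name] = subclass
            return subclass

        return add_subclass_to_registry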
Code example #5
 def assert_empty(self, class_name: str):
     """
     Raises a ``ConfigurationError`` if ``self.params`` is not empty.  We take ``class_name`` as
     an argument so that the error message gives some idea of where an error happened, if there
     was one.  ``class_name`` should be the name of the `calling` class, the one that got extra
     parameters (if there are any).
     """
     if self.params:
         raise ConfigurationError("Extra parameters passed to {}: {}".format(class_name, self.params))
Code example #6
File: text_field.py Project: fulQuan/ELMo
    def __init__(self, tokens: List[Token],
                 token_indexers: Dict[str, TokenIndexer]) -> None:
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens: Optional[Dict[str, TokenList]] = None

        if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
            raise ConfigurationError("TextFields must be passed Tokens. "
                                     "Found: {} with types {}.".format(
                                         tokens, [type(x) for x in tokens]))
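
A sketch of the type check in action (SingleIdTokenIndexer stands in for any TokenIndexer implementation; the name is an assumption, not taken from this file):

field = TextField([Token("The"), Token("cat")],
                  token_indexers={"tokens": SingleIdTokenIndexer()})

TextField(["The", "cat"], token_indexers={})  # plain strings -> ConfigurationError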
Code example #7
 def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str,
                                                                   int]]):
     if token.text is None:
         raise ConfigurationError(
             'TokenCharactersIndexer needs a tokenizer that retains text')
     for character in self._character_tokenizer.tokenize(token.text):
         # If `text_id` is set on the character token (e.g., if we're using byte encoding), we
         # will not be using the vocab for this character.
         if getattr(character, 'text_id', None) is None:
             counter[self._namespace][character.text] += 1
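
A sketch of how the counter fills up (the "token_characters" namespace is an assumed default for TokenCharactersIndexer):

from collections import defaultdict

counter: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
indexer = TokenCharactersIndexer()
indexer.count_vocab_items(Token("cat"), counter)
# counter["token_characters"] now holds {"c": 1, "a": 1, "t": 1}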
Code example #8
    def list_available(cls) -> List[str]:
        """List default first if it exists"""
        keys = list(Registrable._registry[cls].keys())
        default = cls.default_implementation

        if default is None:
            return keys
        elif default not in keys:
            message = "Default implementation %s is not registered" % default
            raise ConfigurationError(message)
        else:
            return [default] + [k for k in keys if k != default]
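
How this interacts with registration, as a self-contained sketch (Encoder and the registered names are invented for illustration):

class Encoder(Registrable):
    default_implementation = "lstm"

@Encoder.register("gru")
class GruEncoder(Encoder):
    pass

@Encoder.register("lstm")
class LstmEncoder(Encoder):
    pass

Encoder.list_available()  # -> ["lstm", "gru"]: the default is moved to the front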
Code example #9
def block_orthogonal(tensor: NDArray,
                     split_sizes: List[int],
                     gain: float = 1.0) -> None:
    """
    An initializer which allows initializing model parameters in "blocks". This is helpful
    in the case of recurrent models which use multiple gates applied to linear projections,
    which can be computed efficiently if they are concatenated together. However, they are
    separate parameters which should be initialized independently.

    Parameters
    ----------
    tensor : ``mxnet.ndarray.ndarray.NDArray``, required.
        A tensor to initialize.
    split_sizes : List[int], required.
        A list of length ``tensor.ndim`` specifying the size of the
        blocks along that particular dimension. E.g. ``[10, 20]`` would
        result in the tensor being split into chunks of size 10 along the
        first dimension and 20 along the second.
    gain : float, optional (default = 1.0)
        The gain (scaling) applied to the orthogonal initialization.
    """

    sizes = list(tensor.shape)
    if any([a % b != 0 for a, b in zip(sizes, split_sizes)]):
        raise ConfigurationError(
            "tensor dimensions must be divisible by their respective "
            "split_sizes. Found size: {} and split_sizes: {}".format(
                sizes, split_sizes))
    indexes = [
        list(range(0, max_size, split))
        for max_size, split in zip(sizes, split_sizes)
    ]
    # Iterate over all possible blocks within the tensor.
    for block_start_indices in itertools.product(*indexes):
        # A list of tuples containing the index to start at for this block
        # and the appropriate step size (i.e split_size[i] for dimension i).
        index_and_step_tuples = zip(block_start_indices, split_sizes)
        # This is a tuple of slices corresponding to:
        # tensor[index: index + step_size, ...]. This is
        # required because we could have an arbitrary number
        # of dimensions. The actual slices we need are the
        # start_index: start_index + step for each dimension in the tensor.
        block_slice = tuple([
            slice(start_index, start_index + step)
            for start_index, step in index_and_step_tuples
        ])
        orthogonal = init.Orthogonal(scale=gain)
        temp = tensor[block_slice]
        orthogonal._init_weight(None, temp)
        tensor[block_slice] = temp
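
Usage sketch with mxnet (shapes are illustrative):

from mxnet import nd

weight = nd.zeros((40, 20))
block_orthogonal(weight, split_sizes=[10, 20])
# weight is now four vertically stacked, independently orthogonal (10, 20)
# blocks, as used for the stacked gates of an LSTM weight matrix.

block_orthogonal(weight, split_sizes=[7, 20])  # -> ConfigurationError (40 % 7 != 0)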
Code example #10
 def get(self, key: str, default: Any = DEFAULT):
     """
     Performs the functionality associated with dict.get(key) but also checks for returned
     dicts and returns a Params object in their place with an updated history.
     """
     if default is self.DEFAULT:
         try:
             # NB: dict.get never raises KeyError, so index directly here.
             value = self.params[key]
         except KeyError:
             raise ConfigurationError("key \"{}\" is required at location \"{}\"".format(key, self.history))
     else:
         value = self.params.get(key, default)
     return self._check_is_dict(key, value)
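
Usage mirrors dict.get, except that nested dicts come back wrapped (keys invented):

params = Params({"trainer": {"patience": 5}})
params.get("seed", 13)           # -> 13, the supplied default
trainer = params.get("trainer")  # -> a Params object with an updated history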
Code example #11
File: label_field.py Project: fulQuan/ELMo
    def __init__(self,
                 label: Union[str, int],
                 label_namespace: str = 'labels',
                 skip_indexing: bool = False) -> None:
        self.label = label
        self._label_namespace = label_namespace
        self._label_id = None
        self._maybe_warn_for_namespace(label_namespace)

        if skip_indexing:
            if not isinstance(label, int):
                raise ConfigurationError(
                    "In order to skip indexing, your labels must be integers. "
                    "Found label = {}".format(label))
            else:
                self._label_id = label
        else:
            if not isinstance(label, str):
                raise ConfigurationError(
                    "LabelFields must be passed a string label if skip_indexing=False. "
                    "Found label: {} with type: {}.".format(
                        label, type(label)))
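
The two accepted input shapes, sketched:

LabelField("positive")                 # a string label, indexed later via a Vocabulary
LabelField(3, skip_indexing=True)      # already an integer id, stored as-is
LabelField(3)                          # -> ConfigurationError (int needs skip_indexing)
LabelField("positive", skip_indexing=True)  # -> ConfigurationError (must be an int)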
Code example #12
File: dataset.py Project: fulQuan/ELMo
 def _check_types(self) -> None:
     """
     Check that all the instances have the same types.
     """
     all_instance_fields_and_types: List[Dict[str, str]] = [{
         k: v.__class__.__name__
         for k, v in x.fields.items()
     } for x in self.instances]
     # Check all the field names and Field types are the same for every instance.
     if not all([
             all_instance_fields_and_types[0] == x
             for x in all_instance_fields_and_types
     ]):
         raise ConfigurationError(
             "You cannot construct a Batch with non-homogeneous Instances.")
Code example #13
    def from_params(cls,
                    params: Params,
                    instances: Iterable['adi.Instance'] = None):
        """
        There are two possible ways to build a vocabulary; from a
        collection of instances, using :func:`Vocabulary.from_instances`, or
        from a pre-saved vocabulary, using :func:`Vocabulary.from_files`.
        This method wraps both of these options, allowing their specification
        from a ``Params`` object, generated from a JSON configuration file.

        Parameters
        ----------
        params: Params, required.
        instances: Iterable[Instance], optional.
            If ``params`` doesn't contain a ``directory_path`` key,
            the ``Vocabulary`` can be built directly from a collection
            of ``instances``.

        Returns
        -------
        A ``Vocabulary``.
        """
        vocabulary_directory = params.pop("directory_path", None)
        if not vocabulary_directory and not instances:
            raise ConfigurationError(
                "You must provide either a Params object containing a "
                "vocab_directory key or a Dataset to build a vocabulary from.")
        if vocabulary_directory and instances:
            logger.info("Loading Vocab from files instead of dataset.")

        if vocabulary_directory:
            params.assert_empty("Vocabulary - from files")
            return Vocabulary.from_files(vocabulary_directory)

        min_count = params.pop("min_count", None)
        max_vocab_size = params.pop_int("max_vocab_size", None)
        non_padded_namespaces = params.pop("non_padded_namespaces",
                                           DEFAULT_NON_PADDED_NAMESPACES)
        pretrained_files = params.pop("pretrained_files", {})
        only_include_pretrained_words = params.pop_bool(
            "only_include_pretrained_words", False)
        params.assert_empty("Vocabulary - from dataset")
        return Vocabulary.from_instances(
            instances=instances,
            min_count=min_count,
            max_vocab_size=max_vocab_size,
            non_padded_namespaces=non_padded_namespaces,
            pretrained_files=pretrained_files,
            only_include_pretrained_words=only_include_pretrained_words)
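
Two sketched invocations (my_instances stands for any iterable of Instance objects; the directory path is a placeholder):

# From a pre-saved vocabulary on disk:
vocab = Vocabulary.from_params(Params({"directory_path": "/path/to/vocab"}))

# From data, with the remaining keys forwarded to Vocabulary.from_instances:
params = Params({"min_count": 2, "max_vocab_size": 50000})
vocab = Vocabulary.from_params(params, instances=my_instances)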
Code example #14
 def token_to_indices(self, token: Token,
                      vocabulary: Vocabulary) -> List[int]:
     indices = []
     if token.text is None:
         raise ConfigurationError(
             'TokenCharactersIndexer needs a tokenizer that retains text')
     for character in self._character_tokenizer.tokenize(token.text):
         if getattr(character, 'text_id', None) is not None:
             # `text_id` being set on the token means that we aren't using the vocab, we just
             # use this id instead.
             index = character.text_id
         else:
             index = vocabulary.get_token_index(character.text,
                                                self._namespace)
         indices.append(index)
     return indices
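
Usage sketch (vocab is a Vocabulary whose character namespace was populated by count_vocab_items, as in code example #7):

indexer = TokenCharactersIndexer()
indexer.token_to_indices(Token("cat"), vocab)
# -> the vocabulary ids of "c", "a", and "t" in the indexer's namespace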
Code example #15
    def forward(
            self,
            tensors: List[NDArray],  # pylint: disable=arguments-differ
            mask: NDArray = None) -> NDArray:
        """
        Compute a weighted average of the ``tensors``.  The input tensors can be any shape
        with at least two dimensions, but must all be the same shape.

        When ``do_layer_norm=True``, the ``mask`` is a required input.  If the ``tensors`` are
        dimensioned  ``(dim_0, ..., dim_{n-1}, dim_n)``, then the ``mask`` is dimensioned
        ``(dim_0, ..., dim_{n-1})``, as in the typical case with ``tensors`` of shape
        ``(batch_size, timesteps, dim)`` and ``mask`` of shape ``(batch_size, timesteps)``.

        When ``do_layer_norm=False`` the ``mask`` is ignored.
        """
        if len(tensors) != self.mixture_size:
            raise ConfigurationError(
                "{} tensors were passed, but the module was initialized to "
                "mix {} tensors.".format(len(tensors), self.mixture_size))

        def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked):
            tensor_masked = tensor * broadcast_mask
            mean = nd.sum(tensor_masked) / num_elements_not_masked
            variance = (nd.sum(((tensor_masked - mean) * broadcast_mask) ** 2)
                        / num_elements_not_masked)
            return (tensor - mean) / nd.sqrt(variance + 1E-12)

        normed_weights = nd.softmax(self.scalar_parameters, axis=0)

        if not self.do_layer_norm:
            pieces = []
            for weight, tensor in zip(normed_weights, tensors):
                pieces.append(weight * tensor)
            return self.gamma * sum(pieces)

        else:
            broadcast_mask = mask.expand_dims(-1)
            input_dim = tensors[0].shape[-1]
            num_elements_not_masked = nd.sum(mask) * input_dim

            pieces = []
            for weight, tensor in zip(normed_weights, tensors):
                pieces.append(weight * _do_layer_norm(tensor, broadcast_mask,
                                                      num_elements_not_masked))
            return self.gamma * sum(pieces)
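
A hedged sketch with mxnet ndarrays (ScalarMix(mixture_size) is the assumed constructor, matching how code example #2 builds one per output representation; forward is called directly to stay framework-agnostic):

from mxnet import nd

mix = ScalarMix(3)
tensors = [nd.random.uniform(shape=(2, 5, 4)) for _ in range(3)]
out = mix.forward(tensors)  # mask omitted: it is ignored when do_layer_norm=False
# out.shape == (2, 5, 4): gamma * sum_k softmax(s)_k * tensors[k]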
Code example #16
    def pop(self, key: str, default: Any = DEFAULT) -> Any:
        """
        Performs the functionality associated with dict.pop(key), along with checking for
        returned dictionaries, replacing them with Param objects with an updated history.

        If ``key`` is not present in the dictionary, and no default was specified, we raise a
        ``ConfigurationError``, instead of the typical ``KeyError``.
        """
        if default is self.DEFAULT:
            try:
                value = self.params.pop(key)
            except KeyError:
                raise ConfigurationError("key \"{}\" is required at location \"{}\"".format(key, self.history))
        else:
            value = self.params.pop(key, default)
        if not isinstance(value, dict):
            logger.info(self.history + key + " = " + str(value))  # type: ignore
        return self._check_is_dict(key, value)
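
Sketch of the behaviour (keys invented): consumed values are logged, nested dicts come back as Params, and a missing required key fails with its full location:

params = Params({"model": {"type": "lstm"}})
model = params.pop("model")  # a Params object whose history is "model."
model.pop("type")            # -> "lstm", logged as "model.type = lstm"
model.pop("hidden_size")     # -> ConfigurationError: key "hidden_size" is
                             #    required at location "model."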
Code example #17
File: text_field.py Project: fulQuan/ELMo
    def get_padding_lengths(self) -> Dict[str, int]:
        """
        The ``TextField`` has a list of ``Tokens``, and each ``Token`` gets converted into arrays by
        (potentially) several ``TokenIndexers``.  This method gets the max length (over tokens)
        associated with each of these arrays.
        """
        # Our basic outline: we will iterate over `TokenIndexers`, and aggregate lengths over tokens
        # for each indexer separately.  Then we will combine the results for each indexer into a single
        # dictionary, resolving any (unlikely) key conflicts by taking a max.
        lengths = []
        if self._indexed_tokens is None:
            raise ConfigurationError(
                "You must call .index(vocabulary) on a "
                "field before determining padding lengths.")
        for indexer_name, indexer in self._token_indexers.items():
            indexer_lengths = {}

            # This is a list of dicts, one for each token in the field.
            token_lengths = [
                indexer.get_padding_lengths(token)
                for token in self._indexed_tokens[indexer_name]
            ]
            if not token_lengths:
                # This is a padding edge case and occurs when we want to pad a ListField of
                # TextFields. In order to pad the list field, we need to be able to have an
                # _empty_ TextField, but if this is the case, token_lengths will be an empty
                # list, so we add the default empty padding dictionary to the list instead.
                token_lengths = [{}]
            # Iterate over the keys and find the maximum token length.
            # It's fine to iterate over the keys of the first token since all tokens have the same keys.
            for key in token_lengths[0].keys():
                indexer_lengths[key] = max(x[key] if key in x else 0
                                           for x in token_lengths)
            lengths.append(indexer_lengths)
        any_indexed_token_key = list(self._indexed_tokens.keys())[0]
        padding_lengths = {
            'num_tokens': len(self._indexed_tokens[any_indexed_token_key])
        }
        # Get all keys which have been used for padding for each indexer and take the max if there are duplicates.
        padding_keys = {key for d in lengths for key in d.keys()}
        for padding_key in padding_keys:
            padding_lengths[padding_key] = max(
                x[padding_key] if padding_key in x else 0 for x in lengths)
        return padding_lengths
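
Continuing the TextField sketch from code example #6 (vocab is a previously built Vocabulary; the exact keys depend on the indexers used):

field.index(vocab)           # must be called first, or a ConfigurationError is raised
field.get_padding_lengths()  # e.g. {'num_tokens': 2} plus indexer-specific keys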
Code example #18
 def by_name(cls: Type[T], name: str) -> Type[T]:
     if name not in Registrable._registry[cls]:
         raise ConfigurationError("%s is not a registered name for %s" %
                                  (name, cls.__name__))
     return Registrable._registry[cls].get(name)
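
With the registration sketch from code example #8 in scope:

Encoder.by_name("lstm")  # -> LstmEncoder
Encoder.by_name("cnn")   # -> ConfigurationError: cnn is not a registered name for Encoder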
Code example #19
File: elmo.py Project: fulQuan/ELMo
    def forward(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]:  # pylint: disable=arguments-differ
        """
        Compute context insensitive token embeddings for ELMo representations.

        Parameters
        ----------
        inputs: ``torch.autograd.Variable``
            Shape ``(batch_size, sequence_length, 50)`` of character ids representing the
            current batch.

        Returns
        -------
        Dict with keys:
        ``'token_embedding'``: ``torch.autograd.Variable``
            Shape ``(batch_size, sequence_length + 2, embedding_dim)`` tensor with context
            insensitive token representations.
        ``'mask'``:  ``torch.autograd.Variable``
            Shape ``(batch_size, sequence_length + 2)`` long tensor with sequence mask.
        """
        # Add BOS/EOS
        mask = ((inputs > 0).long().sum(dim=-1) > 0).long()
        character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
            inputs, mask, self._beginning_of_sentence_characters,
            self._end_of_sentence_characters)

        # the character id embedding
        max_chars_per_token = self._options['char_cnn'][
            'max_characters_per_token']
        # (batch_size * sequence_length, max_chars_per_token, embed_dim)
        character_embedding = torch.nn.functional.embedding(
            character_ids_with_bos_eos.view(-1, max_chars_per_token),
            self._char_embedding_weights)

        # run convolutions
        cnn_options = self._options['char_cnn']
        if cnn_options['activation'] == 'tanh':
            activation = torch.nn.functional.tanh
        elif cnn_options['activation'] == 'relu':
            activation = torch.nn.functional.relu
        else:
            raise ConfigurationError("Unknown activation")

        # (batch_size * sequence_length, embed_dim, max_chars_per_token)
        character_embedding = torch.transpose(character_embedding, 1, 2)
        convs = []
        for i in range(len(self._convolutions)):
            conv = getattr(self, 'char_conv_{}'.format(i))
            convolved = conv(character_embedding)
            # (batch_size * sequence_length, n_filters for this width)
            convolved, _ = torch.max(convolved, dim=-1)
            convolved = activation(convolved)
            convs.append(convolved)

        # (batch_size * sequence_length, n_filters)
        token_embedding = torch.cat(convs, dim=-1)

        # apply the highway layers (batch_size * sequence_length, n_filters)
        token_embedding = self._highways(token_embedding)

        # final projection  (batch_size * sequence_length, embedding_dim)
        token_embedding = self._projection(token_embedding)

        # reshape to (batch_size, sequence_length, embedding_dim)
        batch_size, sequence_length, _ = character_ids_with_bos_eos.size()

        return {
            'mask': mask_with_bos_eos,
            'token_embedding': token_embedding.view(batch_size,
                                                    sequence_length, -1),
        }
Code example #20
 def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> List[int]:
     # pylint: disable=unused-argument
     if token.text is None:
         raise ConfigurationError('ELMoTokenCharactersIndexer needs a tokenizer '
                                  'that retains text')
     return ELMoCharacterMapper.convert_word_to_char_ids(token.text)
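
A sketch of the mapper output, assuming convert_word_to_char_ids is a class-level helper as the call above suggests (the fixed width of 50 matches the character-id shape documented in code example #19):

ids = ELMoCharacterMapper.convert_word_to_char_ids("hello")
len(ids)  # -> 50: the word's character bytes plus begin/end-of-word markers and padding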