Example #1
    def __init__(self,
                 indices: List[Tuple[int, int]],
                 sequence_field: SequenceField,
                 labels: List[str] = None,
                 label_namespace: str = 'labels',
                 padding_value: int = -1) -> None:
        self.indices = indices
        self.labels = labels
        self.sequence_field = sequence_field
        self._label_namespace = label_namespace
        self._padding_value = padding_value
        self._indexed_labels: List[int] = None

        self._maybe_warn_for_namespace(label_namespace)
        field_length = sequence_field.sequence_length()

        if len(set(indices)) != len(indices):
            raise ConfigurationError(
                f"Indices must be unique, but found {indices}")

        if not all([
                0 <= index[1] < field_length and 0 <= index[0] < field_length
                for index in indices
        ]):
            raise ConfigurationError(
                f"Label indices and sequence length "
                f"are incompatible: {indices} and {field_length}")

        if labels is not None and len(indices) != len(labels):
            raise ConfigurationError(
                f"Labelled indices were passed, but their lengths do not match: "
                f" {labels}, {indices}")
Example #2
 def _get_instance_data(self) -> Iterator[Instance]:
     if self._input_file == "-":
         raise ConfigurationError("stdin is not an option when using a DatasetReader.")
     elif self._dataset_reader is None:
         raise ConfigurationError("To generate instances directly, pass a DatasetReader.")
     else:
         yield from self._dataset_reader.read(self._input_file)
Example #3
def create_serialization_dir(params: Params) -> None:
    """
    This function creates the serialization directory if it doesn't exist.  If it already exists
    and is non-empty, then it verifies that we're recovering from a training with an identical configuration.
    Parameters
    ----------
    params: ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: ``str``
        The directory in which to save results and logs.
    recover: ``bool``
        If ``True``, we will try to recover from an existing serialization directory, and crash if
        the directory doesn't exist, or doesn't match the configuration we're given.
    """
    serialization_dir = params['environment']['serialization_dir']
    recover = params['environment']['recover']
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        if not recover:
            raise ConfigurationError(
                f"Serialization directory ({serialization_dir}) already exists and is "
                f"not empty. Specify --recover to recover training from existing output."
            )

        logger.info(f"Recovering from prior training at {serialization_dir}.")

        recovered_config_file = os.path.join(serialization_dir, CONFIG_NAME)
        if not os.path.exists(recovered_config_file):
            raise ConfigurationError(
                "The serialization directory already exists but doesn't "
                "contain a config.json. You probably gave the wrong directory."
            )
        else:
            loaded_params = Params.from_file(recovered_config_file)

            if params != loaded_params:
                raise ConfigurationError(
                    "Training configuration does not match the configuration we're "
                    "recovering from.")

            # In the recover mode, we don't need to reload the pre-trained embeddings.
            remove_pretrained_embedding_params(params)
    else:
        if recover:
            raise ConfigurationError(
                f"--recover specified but serialization_dir ({serialization_dir}) "
                "does not exist.  There is nothing to recover from.")
        os.makedirs(serialization_dir, exist_ok=True)
        params.to_file(os.path.join(serialization_dir, CONFIG_NAME))
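The function above only receives ``params`` and pulls both the target directory and the recover flag from ``params['environment']``. A minimal usage sketch, assuming the standard ``allennlp.common.params.Params`` class; the path and extra keys are placeholders:

from allennlp.common.params import Params

params = Params({
    "environment": {
        "serialization_dir": "/tmp/my_experiment",   # hypothetical path
        "recover": False,
    },
    # ... the rest of the experiment configuration ...
})

# Creates /tmp/my_experiment (if needed) and writes the config there; with
# recover=True it would instead check the existing config.json against `params`.
create_serialization_dir(params)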
Example #4
    def from_archive(cls, archive, predictor_name: str = None) -> 'Predictor':
        """
        Instantiate a :class:`Predictor` from an :class:`~allennlp.models.archival.Archive`;
        that is, from the result of training a model. Optionally specify which `Predictor`
        subclass; otherwise, the default one for the model will be used.
        """
        # Duplicate the config so that the config inside the archive doesn't get consumed
        config = archive.config.duplicate()

        if not predictor_name:
            model_type = config.get("model").get("model_type")
            if model_type not in DEFAULT_PREDICTORS:
                raise ConfigurationError(f"No default predictor for model type {model_type}.\n"
                                         f"Please specify a predictor explicitly.")
            predictor_name = DEFAULT_PREDICTORS[model_type]

        word_splitter = None
        if config['model'].get('use_bert', False):
            word_splitter = config['data'].get('word_splitter', None)
        dataset_reader = load_dataset_reader(config["data"]["data_type"],
                                             word_splitter=word_splitter)
        if hasattr(dataset_reader, 'set_evaluation'):
            dataset_reader.set_evaluation()

        model = archive.model
        model.eval()

        return Predictor.by_name(predictor_name)(model, dataset_reader)
Example #5
def check_for_gpu(params) -> None:
    device_id = params['cuda_device']
    if device_id is not None and device_id >= cuda.device_count():
        raise ConfigurationError(
            "Experiment specified a GPU but none is available;"
            " if you want to run on CPU use the override"
            " 'trainer.cuda_device=-1' in the json config file.")
Example #6
    def read(self, file_path: str) -> Iterable[Instance]:
        """
        Returns an ``Iterable`` containing all the instances
        in the specified dataset.

        If ``self.lazy`` is False, this calls ``self._read()``,
        ensures that the result is a list, then returns the resulting list.

        If ``self.lazy`` is True, this returns an object whose
        ``__iter__`` method calls ``self._read()`` each iteration.
        In this case your implementation of ``_read()`` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a ``ConfigurationError``.

        In either case, the returned ``Iterable`` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        lazy = getattr(self, 'lazy', None)
        if lazy is None:
            logger.warning(
                "DatasetReader.lazy is not set, "
                "did you forget to call the superclass constructor?")

        if lazy:
            return _LazyInstances(lambda: iter(self._read(file_path)))
        else:
            instances = self._read(file_path)
            if not isinstance(instances, list):
                instances = [instance for instance in Tqdm.tqdm(instances)]
            if not instances:
                raise ConfigurationError(
                    "No instances were read from the given filepath {}. "
                    "Is the path correct?".format(file_path))
            return instances
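Since a lazy reader's ``_read()`` must itself be lazy, the usual pattern is to write it as a generator. A minimal sketch of such a reader; the class name and one-instance-per-line format are made up for illustration, and the AllenNLP 0.x import paths below are assumptions:

from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer


class LineReader(DatasetReader):
    def __init__(self, lazy: bool = False) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = WordTokenizer()
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path: str):
        # Yielding (rather than returning a list) keeps lazy=True memory-friendly;
        # with lazy=False, read() above materializes this generator into a list.
        with open(file_path) as data_file:
            for line in data_file:
                tokens = self._tokenizer.tokenize(line.strip())
                yield Instance({"tokens": TextField(tokens, self._token_indexers)})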
Example #7
 def __init__(self, module: torch.nn.modules.RNNBase) -> None:
     # Seq2VecEncoders cannot be stateful.
     super(PytorchSeq2VecWrapper, self).__init__(stateful=False)
     self._module = module
     try:
         if not self._module.batch_first:
             raise ConfigurationError(
                 "Our encoder semantics assumes batch is always first!")
     except AttributeError:
         pass
Example #8
 def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str,
                                                                   int]]):
     if token.text is None:
         raise ConfigurationError(
             'TokenCharactersIndexer needs a tokenizer that retains text')
     for character in self._character_tokenizer.tokenize(token.text):
         # If `text_id` is set on the character token (e.g., if we're using byte encoding), we
         # will not be using the vocab for this character.
         if getattr(character, 'text_id', None) is None:
             counter[self._namespace][character.text] += 1
Example #9
 def _check_types(self) -> None:
     """
     Check that all the instances have the same types.
     """
     all_instance_fields_and_types: List[Dict[str, str]] = [{k: v.__class__.__name__
                                                             for k, v in x.fields.items()}
                                                            for x in self.instances]
     # Check all the field names and Field types are the same for every instance.
     if not all([all_instance_fields_and_types[0] == x for x in all_instance_fields_and_types]):
         raise ConfigurationError("You cannot construct a Batch with non-homogeneous Instances.")
Example #10
    def __init__(self,
                 base_iterator: DataIterator,
                 num_workers: int = 1,
                 output_queue_size: int = 1000) -> None:
        # pylint: disable=protected-access
        super().__init__()
        self.num_workers = num_workers
        self.batch_size = base_iterator._batch_size
        self.output_queue_size = output_queue_size

        # These two options make the iterator stateful, which means it can't be shared
        # across multiple processes.
        if base_iterator._cache_instances:
            raise ConfigurationError("cannot use Multiprocess iterator with cache_instances")
        if base_iterator._instances_per_epoch:
            raise ConfigurationError("cannot use instances_per_epoch with Multiprocess iterator")

        self.iterator = base_iterator

        self.processes: List[Process] = []
        self.queuer: Optional[Process] = None
Example #11
    def __init__(self,
                 labels: Union[List[str], List[int]],
                 sequence_field: SequenceField,
                 label_namespace: str = 'labels',
                 strip_sentence_symbols: bool = False) -> None:
        self.labels = labels
        self.sequence_field = sequence_field
        self._label_namespace = label_namespace
        self._indexed_labels = None
        self._maybe_warn_for_namespace(label_namespace)
        if len(labels) != sequence_field.sequence_length() and not strip_sentence_symbols:
            raise ConfigurationError("Label length and sequence length "
                                     "don't match: %d and %d" % (len(labels), sequence_field.sequence_length()))

        if all([isinstance(x, int) for x in labels]):
            self._indexed_labels = labels

        elif not all([isinstance(x, str) for x in labels]):
            raise ConfigurationError("SequenceLabelFields must be passed either all "
                                     "strings or all ints. Found labels {} with "
                                     "types: {}.".format(labels, [type(x) for x in labels]))
Example #12
    def __init__(self, tokens: List[Token],
                 token_indexers: Dict[str, TokenIndexer]) -> None:
        self.tokens = tokens
        self._token_indexers = token_indexers
        self._indexed_tokens: Optional[Dict[str, TokenList]] = None
        self._indexer_name_to_indexed_token: Optional[Dict[str,
                                                           List[str]]] = None

        if not all([isinstance(x, (Token, SpacyToken)) for x in tokens]):
            raise ConfigurationError("TextFields must be passed Tokens. "
                                     "Found: {} with types {}.".format(
                                         tokens, [type(x) for x in tokens]))
Example #13
    def __init__(self,
                 label: Union[str, int],
                 label_namespace: str = 'labels',
                 skip_indexing: bool = False) -> None:
        self.label = label
        self._label_namespace = label_namespace
        self._label_id = None
        self._maybe_warn_for_namespace(label_namespace)

        if skip_indexing:
            if not isinstance(label, int):
                raise ConfigurationError(
                    "In order to skip indexing, your labels must be integers. "
                    "Found label = {}".format(label))
            else:
                self._label_id = label
        else:
            if not isinstance(label, str):
                raise ConfigurationError(
                    "LabelFields must be passed a string label if skip_indexing=False. "
                    "Found label: {} with type: {}.".format(
                        label, type(label)))
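The two branches above correspond to the two ways the field can be constructed; a short sketch, assuming this ``__init__`` belongs to AllenNLP's ``LabelField``:

# A string label is indexed later against the 'labels' vocabulary namespace.
sentiment = LabelField("positive")

# An already-indexed integer label requires skip_indexing=True; passing an int
# without it fails the isinstance(label, str) check and raises a ConfigurationError.
gold_class = LabelField(3, skip_indexing=True)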
Example #14
def takes_arg(obj, arg: str) -> bool:
    """
    Checks whether the provided obj takes a certain arg.
    If it's a class, we're really checking whether its constructor does.
    If it's a function or method, we're checking the object itself.
    Otherwise, we raise an error.
    """
    if inspect.isclass(obj):
        signature = inspect.signature(obj.__init__)
    elif inspect.ismethod(obj) or inspect.isfunction(obj):
        signature = inspect.signature(obj)
    else:
        raise ConfigurationError(f"object {obj} is not callable")
    return arg in signature.parameters
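A quick illustration of the three branches, using throwaway objects defined just for this example:

class Projection:
    def __init__(self, in_features: int, out_features: int) -> None:
        self.in_features, self.out_features = in_features, out_features

def scale(tensor, factor: float = 1.0):
    return tensor * factor

takes_arg(Projection, "in_features")   # True: inspects Projection.__init__
takes_arg(scale, "factor")             # True: inspects the function itself
takes_arg(42, "anything")              # raises ConfigurationError: not callable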
Example #15
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[List[int]]]:
        # pylint: disable=unused-argument
        texts = [token.text for token in tokens]

        if any(text is None for text in texts):
            raise ConfigurationError(
                'ELMoTokenCharactersIndexer needs a tokenizer '
                'that retains text')
        return {
            index_name: [
                ELMoCharacterMapper.convert_word_to_char_ids(text)
                for text in texts
            ]
        }
Example #16
    def __call__(self,
                 instances: Iterable[Instance],
                 num_epochs: int = None,
                 shuffle: bool = True) -> Iterator[TensorDict]:

        # If you run it forever, the multiprocesses won't shut down correctly.
        # TODO(joelgrus) find a solution for this
        if num_epochs is None:
            raise ConfigurationError(
                "Multiprocess Iterator must be run for a fixed number of epochs"
            )

        manager = Manager()
        output_queue = manager.Queue(self.output_queue_size)
        input_queue = manager.Queue(self.output_queue_size * self.batch_size)

        # Start process that populates the queue.
        self.queuer = Process(target=_queuer,
                              args=(instances, input_queue, self.num_workers,
                                    num_epochs))
        self.queuer.start()

        # Start the tensor-dict workers.
        for i in range(self.num_workers):
            args = (input_queue, output_queue, self.iterator, shuffle, i)
            process = Process(target=_create_tensor_dicts, args=args)
            process.start()
            self.processes.append(process)

        num_finished = 0
        while num_finished < self.num_workers:
            item = output_queue.get()
            if isinstance(item, int):
                num_finished += 1
                logger.info(
                    f"worker {item} finished ({num_finished} / {self.num_workers})"
                )
            else:
                yield item

        for process in self.processes:
            process.join()
        self.processes.clear()

        if self.queuer is not None:
            self.queuer.join()
            self.queuer = None
Example #17
    def forward(self,  # pylint: disable=arguments-differ
                inputs: PackedSequence,
                initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
        """
        Parameters
        ----------
        inputs : ``PackedSequence``, required.
            A batch first ``PackedSequence`` to run the stacked LSTM over.
        initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
            A tuple (state, memory) representing the initial hidden state and memory
            of the LSTM. Each tensor has shape (1, batch_size, output_dimension * 2).
        Returns
        -------
        output_sequence : PackedSequence
            The encoded sequence of shape (batch_size, sequence_length, hidden_size * 2)
        final_states: torch.Tensor
            The per-layer final (state, memory) states of the LSTM, each with shape
            (num_layers, batch_size, hidden_size * 2).
        """
        if not initial_state:
            hidden_states = [None] * len(self.lstm_layers)
        elif initial_state[0].size()[0] != len(self.lstm_layers):
            raise ConfigurationError("Initial states were passed to forward() but the number of "
                                     "initial states does not match the number of layers.")
        else:
            hidden_states = list(zip(initial_state[0].split(1, 0),
                                     initial_state[1].split(1, 0)))

        output_sequence = inputs
        final_states = []
        for i, state in enumerate(hidden_states):
            forward_layer = getattr(self, 'forward_layer_{}'.format(i))
            backward_layer = getattr(self, 'backward_layer_{}'.format(i))
            # The state is duplicated to mirror the Pytorch API for LSTMs.
            forward_output, final_forward_state = forward_layer(output_sequence, state)
            backward_output, final_backward_state = backward_layer(output_sequence, state)

            forward_output, lengths = pad_packed_sequence(forward_output, batch_first=True)
            backward_output, _ = pad_packed_sequence(backward_output, batch_first=True)

            output_sequence = torch.cat([forward_output, backward_output], -1)
            output_sequence = pack_padded_sequence(output_sequence, lengths, batch_first=True)
            final_states.append([torch.cat(both_direction_states, -1) for both_direction_states
                                 in zip(final_forward_state, final_backward_state)])

        final_state_tuple = [torch.cat(state_list, 0) for state_list in zip(*final_states)]
        return output_sequence, final_state_tuple
Example #18
    def _get_prediction_device(self):
        """
        This method checks the device of the model parameters to determine the cuda_device
        this model should be run on for predictions.  If there are no parameters, it returns -1.
        Returns
        -------
        The cuda device this model should run on for predictions.
        """
        devices = {get_device_of(param) for param in self.parameters()}

        if len(devices) > 1:
            devices_string = ", ".join(str(x) for x in devices)
            raise ConfigurationError(
                f"Parameters have mismatching cuda_devices: {devices_string}")
        elif len(devices) == 1 and all(i >= 0 for i in devices):
            device = torch.device('cuda:{}'.format(devices.pop()))
        else:
            device = torch.device('cpu')
        return device
Example #19
 def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                       index_name: str) -> Dict[str, List[List[int]]]:
     indices: List[List[int]] = []
     for token in tokens:
         token_indices: List[int] = []
         if token.text is None:
             raise ConfigurationError(
                 'TokenCharactersIndexer needs a tokenizer that retains text'
             )
         for character in self._character_tokenizer.tokenize(token.text):
             if getattr(character, 'text_id', None) is not None:
                 # `text_id` being set on the token means that we aren't using the vocab, we just
                 # use this id instead.
                 index = character.text_id
             else:
                 index = vocabulary.get_token_index(character.text,
                                                    self._namespace)
             token_indices.append(index)
         indices.append(token_indices)
     return {index_name: indices}
Example #20
    def __init__(self,
                 module: torch.nn.Module,
                 stateful: bool = False) -> None:
        super(PytorchSeq2SeqWrapper, self).__init__(stateful)
        self._module = module
        try:
            if not self._module.batch_first:
                raise ConfigurationError(
                    "Our encoder semantics assumes batch is always first!")
        except AttributeError:
            pass

        try:
            self._is_bidirectional = self._module.bidirectional
        except AttributeError:
            self._is_bidirectional = False
        if self._is_bidirectional:
            self._num_directions = 2
        else:
            self._num_directions = 1
Example #21
    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 projection_dim: int = None,
                 weight: torch.FloatTensor = None,
                 padding_index: int = None,
                 trainable: bool = True,
                 max_norm: float = None,
                 norm_type: float = 2.,
                 scale_grad_by_freq: bool = False,
                 sparse: bool = False) -> None:
        super(Embedding, self).__init__()
        self.num_embeddings = num_embeddings
        self.padding_index = padding_index
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        self.sparse = sparse
        self.trainable = trainable
        self.embedding_dim = embedding_dim

        self.output_dim = projection_dim or embedding_dim

        if weight is None:
            weight = torch.FloatTensor(num_embeddings, embedding_dim)
            self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
            torch.nn.init.xavier_uniform_(self.weight)
        else:
            if weight.size() != (num_embeddings, embedding_dim):
                raise ConfigurationError(
                    "A weight matrix was passed with contradictory embedding shapes."
                )
            self.weight = torch.nn.Parameter(weight, requires_grad=trainable)

        if self.padding_index is not None:
            self.weight.data[self.padding_index].fill_(0)

        if projection_dim:
            self._projection = torch.nn.Linear(embedding_dim, projection_dim)
        else:
            self._projection = None
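The weight branch above is exercised when loading pretrained vectors: the passed matrix has to match ``(num_embeddings, embedding_dim)`` exactly. A short sketch, assuming this ``__init__`` is AllenNLP's ``Embedding`` module; the sizes are placeholders:

import torch

# Randomly initialised, trainable, projected down from 300 to 50 dimensions.
embedding = Embedding(num_embeddings=10000, embedding_dim=300, projection_dim=50)

# Frozen pretrained weights; any shape other than (10000, 300) would raise
# the "contradictory embedding shapes" ConfigurationError above.
pretrained = torch.randn(10000, 300)
frozen = Embedding(num_embeddings=10000, embedding_dim=300,
                   weight=pretrained, trainable=False)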
Example #22
    def print_statistics(self) -> None:
        # Make sure it has been indexed first
        sequence_field_lengths: Dict[str, List] = defaultdict(list)
        for instance in self.instances:
            if not instance.indexed:
                raise ConfigurationError("Instances must be indexed with vocabulary "
                                         "before asking to print dataset statistics.")
            for field, field_padding_lengths in instance.get_padding_lengths().items():
                for key, value in field_padding_lengths.items():
                    sequence_field_lengths[f"{field}.{key}"].append(value)

        print("\n\n----Dataset Statistics----\n")
        for name, lengths in sequence_field_lengths.items():
            print(f"Statistics for {name}:")
            print(f"\tLengths: Mean: {numpy.mean(lengths)}, Standard Dev: {numpy.std(lengths)}, "
                  f"Max: {numpy.max(lengths)}, Min: {numpy.min(lengths)}")

        print("\n10 Random instances: ")
        for i in list(numpy.random.randint(len(self.instances), size=10)):
            print(f"Instance {i}:")
            print(f"\t{self.instances[i]}")
Example #23
def _read_embeddings_from_hdf5(embeddings_filename: str,
                               embedding_dim: int,
                               vocab: Vocabulary,
                               namespace: str = "tokens",
                               amr: bool = False) -> torch.FloatTensor:
    """
    Reads from an hdf5-formatted file. The embedding matrix is assumed to
    be keyed by 'embedding' and of size ``(num_tokens, embedding_dim)``.
    """
    with h5py.File(embeddings_filename, 'r') as fin:
        embeddings = fin['embedding'][...]

    if list(embeddings.shape) != [
            vocab.get_vocab_size(namespace), embedding_dim
    ]:
        raise ConfigurationError(
            "Read shape {0} embeddings from the file, but expected {1}".format(
                list(embeddings.shape),
                [vocab.get_vocab_size(namespace), embedding_dim]))

    return torch.FloatTensor(embeddings)
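The expected file layout is a single dataset named 'embedding' with shape ``(num_tokens, embedding_dim)``. A minimal sketch of writing a compatible file with h5py; the sizes must match ``vocab.get_vocab_size(namespace)`` and ``embedding_dim``, and the filename is a placeholder:

import h5py
import numpy

num_tokens, embedding_dim = 25000, 100
matrix = numpy.random.randn(num_tokens, embedding_dim).astype("float32")

with h5py.File("embeddings.hdf5", "w") as fout:
    fout.create_dataset("embedding", data=matrix)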
Example #24
def block_orthogonal(tensor: torch.Tensor,
                     split_sizes: List[int],
                     gain: float = 1.0) -> None:
    """
    An initializer which allows initializing model parameters in "blocks". This is helpful
    in the case of recurrent models which use multiple gates applied to linear projections,
    which can be computed efficiently if they are concatenated together. However, they are
    separate parameters which should be initialized independently.
    Parameters
    ----------
    tensor : ``torch.Tensor``, required.
        A tensor to initialize.
    split_sizes : List[int], required.
        A list of length ``tensor.dim()`` specifying the size of the
        blocks along that particular dimension. E.g. ``[10, 20]`` would
        result in the tensor being split into chunks of size 10 along the
        first dimension and 20 along the second.
    gain : float, optional (default = 1.0)
        The gain (scaling) applied to the orthogonal initialization.
    """
    data = tensor.data
    sizes = list(tensor.size())
    if any([a % b != 0 for a, b in zip(sizes, split_sizes)]):
        raise ConfigurationError("tensor dimensions must be divisible by their respective "
                                 "split_sizes. Found size: {} and split_sizes: {}".format(sizes, split_sizes))
    indexes = [list(range(0, max_size, split))
               for max_size, split in zip(sizes, split_sizes)]
    # Iterate over all possible blocks within the tensor.
    for block_start_indices in itertools.product(*indexes):
        # A list of tuples containing the index to start at for this block
        # and the appropriate step size (i.e split_size[i] for dimension i).
        index_and_step_tuples = zip(block_start_indices, split_sizes)
        # This is a tuple of slices corresponding to:
        # tensor[index: index + step_size, ...]. This is
        # required because we could have an arbitrary number
        # of dimensions. The actual slices we need are the
        # start_index: start_index + step for each dimension in the tensor.
        block_slice = tuple([slice(start_index, start_index + step)
                             for start_index, step in index_and_step_tuples])
        data[block_slice] = torch.nn.init.orthogonal_(tensor[block_slice].contiguous(), gain=gain)
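A typical use is an LSTM weight matrix where four gate projections are stacked along the first dimension; each ``(hidden_size, input_size)`` block then gets its own orthogonal initialization. A minimal sketch with placeholder sizes:

import torch

hidden_size, input_size = 200, 100
# Input-to-hidden weights of an LSTM stack the 4 gates along dim 0.
weight = torch.empty(4 * hidden_size, input_size)

block_orthogonal(weight, split_sizes=[hidden_size, input_size])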
Example #25
    def __init__(self,
                 sorting_keys: List[Tuple[str, str]],
                 padding_noise: float = 0.1,
                 biggest_batch_first: bool = False,
                 batch_size: int = 32,
                 instances_per_epoch: int = None,
                 max_instances_in_memory: int = None,
                 cache_instances: bool = False,
                 track_epoch: bool = False,
                 maximum_samples_per_batch: Tuple[str, int] = None) -> None:
        if not sorting_keys:
            raise ConfigurationError(
                "BucketIterator requires sorting_keys to be specified")

        super().__init__(cache_instances=cache_instances,
                         track_epoch=track_epoch,
                         batch_size=batch_size,
                         instances_per_epoch=instances_per_epoch,
                         max_instances_in_memory=max_instances_in_memory,
                         maximum_samples_per_batch=maximum_samples_per_batch)
        self._sorting_keys = sorting_keys
        self._padding_noise = padding_noise
        self._biggest_batch_first = biggest_batch_first
Example #26
def decode_mst(energy: numpy.ndarray,
               length: int,
               has_labels: bool = True) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """
    Note: Counter to typical intuition, this function decodes the _maximum_
    spanning tree.
    Decode the optimal MST tree with the Chu-Liu-Edmonds algorithm for
    maximum spanning arboresences on graphs.
    Parameters
    ----------
    energy : ``numpy.ndarray``, required.
        A tensor with shape (num_labels, timesteps, timesteps)
        containing the energy of each edge. If has_labels is ``False``,
        the tensor should have shape (timesteps, timesteps) instead.
    length : ``int``, required.
        The length of this sequence, as the energy may have come
        from a padded batch.
    has_labels : ``bool``, optional, (default = True)
        Whether the graph has labels or not.
    """
    if has_labels and energy.ndim != 3:
        raise ConfigurationError(
            "The dimension of the energy array is not equal to 3.")
    elif not has_labels and energy.ndim != 2:
        raise ConfigurationError(
            "The dimension of the energy array is not equal to 2.")
    input_shape = energy.shape
    max_length = input_shape[-1]

    # Our energy matrix might have been batched -
    # here we clip it to contain only non padded tokens.
    if has_labels:
        energy = energy[:, :length, :length]
        # get best label for each edge.
        label_id_matrix = energy.argmax(axis=0)
        energy = energy.max(axis=0)
    else:
        energy = energy[:length, :length]
        label_id_matrix = None
    # get original score matrix
    original_score_matrix = energy
    # initialize score matrix to original score matrix
    score_matrix = numpy.array(original_score_matrix, copy=True)

    old_input = numpy.zeros([length, length], dtype=numpy.int32)
    old_output = numpy.zeros([length, length], dtype=numpy.int32)
    current_nodes = [True for _ in range(length)]
    representatives: List[Set[int]] = []

    for node1 in range(length):
        original_score_matrix[node1, node1] = 0.0
        score_matrix[node1, node1] = 0.0
        representatives.append({node1})

        for node2 in range(node1 + 1, length):
            old_input[node1, node2] = node1
            old_output[node1, node2] = node2

            old_input[node2, node1] = node2
            old_output[node2, node1] = node1

    final_edges: Dict[int, int] = {}

    # The main algorithm operates inplace.
    chu_liu_edmonds(length, score_matrix, current_nodes, final_edges,
                    old_input, old_output, representatives)

    heads = numpy.zeros([max_length], numpy.int32)
    if has_labels:
        head_type = numpy.ones([max_length], numpy.int32)
    else:
        head_type = None

    for child, parent in final_edges.items():
        heads[child] = parent
        if has_labels:
            head_type[child] = label_id_matrix[parent, child]

    return heads, head_type
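A minimal unlabelled call, with a random square score matrix standing in for real edge energies; whether rows index heads or children follows the convention of the implementation above:

import numpy

length = 4
energy = numpy.random.rand(length, length)   # placeholder edge scores

heads, _ = decode_mst(energy, length=length, has_labels=False)
print(heads)   # one predicted head index per token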
Example #27
    def forward(
            self,  # pylint: disable=arguments-differ
            inputs: PackedSequence,
            initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
        """
        Parameters
        ----------
        inputs : PackedSequence, required.
            A tensor of shape (batch_size, num_timesteps, input_size)
            to apply the LSTM over.
        initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
            A tuple (state, memory) representing the initial hidden state and memory
            of the LSTM. Each tensor has shape (1, batch_size, output_dimension).
        Returns
        -------
        A PackedSequence containing a torch.FloatTensor of shape
        (batch_size, num_timesteps, output_dimension) representing
        the outputs of the LSTM per timestep and a tuple containing
        the LSTM state, with shape (1, batch_size, hidden_size) to
        match the Pytorch API.
        """
        if not isinstance(inputs, PackedSequence):
            raise ConfigurationError(
                'inputs must be PackedSequence but got %s' % (type(inputs)))

        sequence_tensor, batch_lengths = pad_packed_sequence(inputs,
                                                             batch_first=True)
        batch_size = sequence_tensor.size()[0]
        total_timesteps = sequence_tensor.size()[1]

        output_accumulator = sequence_tensor.new_zeros(batch_size,
                                                       total_timesteps,
                                                       self.hidden_size)
        if initial_state is None:
            full_batch_previous_memory = sequence_tensor.new_zeros(
                batch_size, self.hidden_size)
            full_batch_previous_state = sequence_tensor.data.new_zeros(
                batch_size, self.hidden_size)
        else:
            full_batch_previous_state = initial_state[0].squeeze(0)
            full_batch_previous_memory = initial_state[1].squeeze(0)

        current_length_index = batch_size - 1 if self.go_forward else 0
        if self.recurrent_dropout_probability > 0.0:
            dropout_mask = get_dropout_mask(self.recurrent_dropout_probability,
                                            full_batch_previous_memory)
        else:
            dropout_mask = None

        for timestep in range(total_timesteps):
            # The index depends on which end we start.
            index = timestep if self.go_forward else total_timesteps - timestep - 1

            # What we are doing here is finding the index into the batch dimension
            # which we need to use for this timestep, because the sequences have
            # variable length, so once the index is greater than the length of this
            # particular batch sequence, we no longer need to do the computation for
            # this sequence. The key thing to recognise here is that the batch inputs
            # must be _ordered_ by length from longest (first in batch) to shortest
            # (last) so initially, we are going forwards with every sequence and as we
            # pass the index at which the shortest elements of the batch finish,
            # we stop picking them up for the computation.
            if self.go_forward:
                while batch_lengths[current_length_index] <= index:
                    current_length_index -= 1
            # If we're going backwards, we are _picking up_ more indices.
            else:
                # First conditional: Are we already at the maximum number of elements in the batch?
                # Second conditional: Does the next shortest sequence beyond the current batch
                # index require computation at this timestep?
                while current_length_index < (len(batch_lengths) - 1) and \
                                batch_lengths[current_length_index + 1] > index:
                    current_length_index += 1

            # Actually get the slices of the batch which we need for the computation at this timestep.
            previous_memory = full_batch_previous_memory[
                0:current_length_index + 1].clone()
            previous_state = full_batch_previous_state[0:current_length_index +
                                                       1].clone()
            timestep_input = sequence_tensor[0:current_length_index + 1, index]

            # Do the projections for all the gates all at once.
            projected_input = self.input_linearity(timestep_input)
            projected_state = self.state_linearity(previous_state)

            # Main LSTM equations using relevant chunks of the big linear
            # projections of the hidden state and inputs.
            input_gate = torch.sigmoid(
                projected_input[:, 0 * self.hidden_size:1 * self.hidden_size] +
                projected_state[:, 0 * self.hidden_size:1 * self.hidden_size])
            forget_gate = torch.sigmoid(
                projected_input[:, 1 * self.hidden_size:2 * self.hidden_size] +
                projected_state[:, 1 * self.hidden_size:2 * self.hidden_size])
            memory_init = torch.tanh(
                projected_input[:, 2 * self.hidden_size:3 * self.hidden_size] +
                projected_state[:, 2 * self.hidden_size:3 * self.hidden_size])
            output_gate = torch.sigmoid(
                projected_input[:, 3 * self.hidden_size:4 * self.hidden_size] +
                projected_state[:, 3 * self.hidden_size:4 * self.hidden_size])
            memory = input_gate * memory_init + forget_gate * previous_memory
            timestep_output = output_gate * torch.tanh(memory)

            if self.use_highway:
                highway_gate = torch.sigmoid(
                    projected_input[:, 4 * self.hidden_size:5 *
                                    self.hidden_size] +
                    projected_state[:,
                                    4 * self.hidden_size:5 * self.hidden_size])
                highway_input_projection = projected_input[:, 5 *
                                                           self.hidden_size:6 *
                                                           self.hidden_size]
                timestep_output = highway_gate * timestep_output + (
                    1 - highway_gate) * highway_input_projection

            # Only do dropout if the dropout prob is > 0.0 and we are in training mode.
            if dropout_mask is not None and self.training:
                timestep_output = timestep_output * dropout_mask[
                    0:current_length_index + 1]

            # We've been doing computation with less than the full batch, so here we create a new
            # variable for the whole batch at this timestep and insert the result for the
            # relevant elements of the batch into it.
            full_batch_previous_memory = full_batch_previous_memory.data.clone(
            )
            full_batch_previous_state = full_batch_previous_state.data.clone()
            full_batch_previous_memory[0:current_length_index + 1] = memory
            full_batch_previous_state[0:current_length_index +
                                      1] = timestep_output
            output_accumulator[0:current_length_index + 1,
                               index] = timestep_output

        output_accumulator = pack_padded_sequence(output_accumulator,
                                                  batch_lengths,
                                                  batch_first=True)

        # Mimic the pytorch API by returning state in the following shape:
        # (num_layers * num_directions, batch_size, hidden_size). As this
        # LSTM cannot be stacked, the first dimension here is just 1.
        final_state = (full_batch_previous_state.unsqueeze(0),
                       full_batch_previous_memory.unsqueeze(0))

        return output_accumulator, final_state
Example #28
    def train(self):
        """Trains the supplied model with the supplied parameters.
        """
        try:
            epoch_counter, dev_metric_per_epoch = self._restore_checkpoint()
        except RuntimeError:
            traceback.print_exc()
            raise ConfigurationError(
                "Could not recover training from the checkpoint.  Did you mean to output to "
                "a different serialization directory or delete the existing serialization "
                "directory?")

        self._enable_gradient_clipping()

        logger.info('Start training...')

        # Init.
        training_start_time = time.time()
        epochs_trained_this_time = 0
        metrics = {}
        training_metrics = {}
        dev_metrics = {}
        is_best_so_far = True
        best_epoch_dev_metrics = {}

        for epoch in range(epoch_counter, self._num_epochs):
            epoch_start_time = time.time()
            training_metrics = self._train_epoch(epoch)
            # Validate on the dev set.
            if self._dev_dataset is not None:
                with torch.no_grad():
                    dev_metrics = self._validate_dev(epoch)

                    # Check dev metric for early stopping
                    this_epoch_dev_metric = dev_metrics[self._dev_metric]

                    # Check dev metric to see if it's the best so far
                    is_best_so_far = self._is_best_so_far(
                        this_epoch_dev_metric, dev_metric_per_epoch)
                    if is_best_so_far:
                        best_epoch_dev_metrics = dev_metrics.copy()
                    dev_metric_per_epoch.append(this_epoch_dev_metric)
                    if self._should_stop_early(dev_metric_per_epoch):
                        logger.info("Ran out of patience.  Stopping training.")
                        break

            # Save status.
            self._save_checkpoint(epoch,
                                  dev_metric_per_epoch,
                                  is_best=is_best_so_far)
            self._metrics_to_tensorboard(epoch,
                                         training_metrics,
                                         dev_metrics=dev_metrics)
            self._metrics_to_console(training_metrics, dev_metrics=dev_metrics)
            self._tensorboard.add_dev_scalar('learning_rate',
                                             self._optimizer.lr, epoch)

            if is_best_so_far:
                # We may not have had validation data, so we need to hide this behind an if.
                metrics['best_epoch'] = epoch
                metrics.update({
                    f"best_dev_{k}": v
                    for k, v in best_epoch_dev_metrics.items()
                })

            # Estimate ETA.
            epoch_elapsed_time = time.time() - epoch_start_time
            logger.info(
                "Epoch duration: %s",
                time.strftime("%H:%M:%S", time.gmtime(epoch_elapsed_time)))

            if epoch < self._num_epochs - 1:
                training_elapsed_time = time.time() - training_start_time
                estimated_time_remaining = training_elapsed_time * \
                    ((self._num_epochs - epoch_counter) / float(epoch - epoch_counter + 1) - 1)
                formatted_time = str(
                    datetime.timedelta(seconds=int(estimated_time_remaining)))
                logger.info("Estimated training time remaining: %s",
                            formatted_time)

            epochs_trained_this_time += 1

        # Finish training, and summarize the status.
        training_elapsed_time = time.time() - training_start_time
        metrics.update(
            dict(training_duration=time.strftime(
                "%H:%M:%S", time.gmtime(training_elapsed_time)),
                 training_start_epoch=epoch_counter,
                 training_epochs=epochs_trained_this_time))
        for key, value in training_metrics.items():
            metrics["training_" + key] = value
        for key, value in dev_metrics.items():
            metrics["dev_" + key] = value
        return metrics
Example #29
def create_kwargs(cls: Type[T], params: Params, **extras) -> Dict[str, Any]:
    """
    Given some class, a `Params` object, and potentially other keyword arguments,
    create a dict of keyword args suitable for passing to the class's constructor.

    The function does this by finding the class's constructor, matching the constructor
    arguments to entries in the `params` object, and instantiating values for the parameters
    using the type annotation and possibly a from_params method.

    Any values that are provided in the `extras` will just be used as is.
    For instance, you might provide an existing `Vocabulary` this way.
    """
    # Get the signature of the constructor.
    signature = inspect.signature(cls.__init__)
    kwargs: Dict[str, Any] = {}

    # Iterate over all the constructor parameters and their annotations.
    for name, param in signature.parameters.items():
        # Skip "self". You're not *required* to call the first parameter "self",
        # so in theory this logic is fragile, but if you don't call the self parameter
        # "self" you kind of deserve what happens.
        if name == "self":
            continue

        # If the annotation is a compound type like typing.Dict[str, int],
        # it will have an __origin__ field indicating `typing.Dict`
        # and an __args__ field indicating `(str, int)`. We capture both.
        annotation = remove_optional(param.annotation)
        origin = getattr(annotation, '__origin__', None)
        args = getattr(annotation, '__args__', [])

        # The parameter is optional if its default value is not the "no default" sentinel.
        default = param.default
        optional = default != _NO_DEFAULT

        # Some constructors expect extra non-parameter items, e.g. vocab: Vocabulary.
        # We check the provided `extras` for these and just use them if they exist.
        if name in extras:
            kwargs[name] = extras[name]

        # The next case is when the parameter type is itself constructible from_params.
        elif hasattr(annotation, 'from_params'):
            if name in params:
                # Our params have an entry for this, so we use that.
                subparams = params.pop(name)

                if takes_arg(annotation.from_params, 'extras'):
                    # If annotation.from_params accepts **extras, we need to pass them all along.
                    # For example, `BasicTextFieldEmbedder.from_params` requires a Vocabulary
                    # object, but `TextFieldEmbedder.from_params` does not.
                    subextras = extras
                else:
                    # Otherwise, only supply the ones that are actual args; any additional ones
                    # will cause a TypeError.
                    subextras = {
                        k: v
                        for k, v in extras.items()
                        if takes_arg(annotation.from_params, k)
                    }

                # In some cases we allow a string instead of a param dict, so
                # we need to handle that case separately.
                if isinstance(subparams, str):
                    kwargs[name] = annotation.by_name(subparams)()
                else:
                    kwargs[name] = annotation.from_params(params=subparams,
                                                          **subextras)
            elif not optional:
                # Not optional and not supplied, that's an error!
                raise ConfigurationError(
                    f"expected key {name} for {cls.__name__}")
            else:
                kwargs[name] = default

        # If the parameter type is a Python primitive, just pop it off
        # using the correct casting pop_xyz operation.
        elif annotation == str:
            kwargs[name] = (params.pop(name, default)
                            if optional else params.pop(name))
        elif annotation == int:
            kwargs[name] = (params.pop_int(name, default)
                            if optional else params.pop_int(name))
        elif annotation == bool:
            kwargs[name] = (params.pop_bool(name, default)
                            if optional else params.pop_bool(name))
        elif annotation == float:
            kwargs[name] = (params.pop_float(name, default)
                            if optional else params.pop_float(name))

        # This is special logic for handling types like Dict[str, TokenIndexer], which it creates by
        # instantiating each value from_params and returning the resulting dict.
        elif origin in (Dict, dict) and len(args) == 2 and hasattr(
                args[-1], 'from_params'):
            value_cls = annotation.__args__[-1]

            value_dict = {}

            for key, value_params in params.pop(name, Params({})).items():
                value_dict[key] = value_cls.from_params(params=value_params,
                                                        **extras)

            kwargs[name] = value_dict

        else:
            # Pass it on as is and hope for the best.   ¯\_(ツ)_/¯
            if optional:
                kwargs[name] = params.pop(name, default)
            else:
                kwargs[name] = params.pop(name)

    params.assert_empty(cls.__name__)
    return kwargs
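A sketch of the mechanism with toy classes (the class names are made up for illustration; only ``Params`` is assumed to be the usual AllenNLP class): primitive parameters are popped with the typed ``pop_*`` helpers, and any annotation that has ``from_params`` is built recursively.

from allennlp.common.params import Params

class ToyEncoder:
    @classmethod
    def from_params(cls, params: Params) -> "ToyEncoder":
        return cls()

class ToyModel:
    def __init__(self, hidden_dim: int, dropout: float = 0.0,
                 encoder: ToyEncoder = None) -> None:
        self.hidden_dim, self.dropout, self.encoder = hidden_dim, dropout, encoder

params = Params({"hidden_dim": 200, "encoder": {}})

# hidden_dim -> pop_int, dropout -> its default of 0.0,
# encoder -> ToyEncoder.from_params(Params({})).
kwargs = create_kwargs(ToyModel, params)
model = ToyModel(**kwargs)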
Example #30
 def __iter__(self) -> Iterator[Instance]:
     instances = self.instance_generator()
     if isinstance(instances, list):
         raise ConfigurationError(
             "For a lazy dataset reader, _read() must return a graphQA")
     return instances