Example #1
    def test_cached_path(self):
        url = 'http://fake.datastore.com/glove.txt.gz'
        set_up_glove(url, self.glove_bytes)

        # non-existent file
        with pytest.raises(FileNotFoundError):
            filename = cached_path(
                "tests/fixtures/does_not_exist/fake_file.tar.gz")

        # unparsable URI
        with pytest.raises(ValueError):
            filename = cached_path("fakescheme://path/to/fake/file.tar.gz")

        # existing file as path
        assert cached_path(self.glove_file) == self.glove_file

        # caches urls
        filename = cached_path(url, cache_dir=self.TEST_DIR)

        assert len(responses.calls) == 2
        assert filename == os.path.join(self.TEST_DIR,
                                        url_to_filename(url, etag="0"))

        with open(filename, 'rb') as cached_file:
            assert cached_file.read() == self.glove_bytes
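
Example #1 above exercises the main behaviours of cached_path: a missing local file raises FileNotFoundError, an unknown URI scheme raises ValueError, an existing local path is returned unchanged, and a URL is downloaded into a cache directory under a filename derived from the URL and its ETag. A minimal usage sketch, assuming cached_path is importable from the project's file utilities (allennlp.common.file_utils in AllenNLP) and using a hypothetical URL and cache directory:

from allennlp.common.file_utils import cached_path

# A remote URL is downloaded once; repeated calls return the cached local filename.
local_copy = cached_path('http://example.com/glove.txt.gz',
                         cache_dir='/tmp/embedding_cache')

# An existing local path is returned unchanged.
local_path = cached_path('/data/glove.txt')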
Example #2
    def __init__(self,
                 file_uri: str,
                 encoding: str = DEFAULT_ENCODING,
                 cache_dir: str = None) -> None:

        self.uri = file_uri
        self._encoding = encoding
        self._cache_dir = cache_dir
        self._archive_handle: Any = None  # only if the file is inside an archive

        main_file_uri, path_inside_archive = parse_embeddings_file_uri(
            file_uri)
        main_file_local_path = cached_path(main_file_uri, cache_dir=cache_dir)

        if zipfile.is_zipfile(main_file_local_path):  # ZIP archive
            self._open_inside_zip(main_file_uri, path_inside_archive)

        elif tarfile.is_tarfile(main_file_local_path):  # TAR archive
            self._open_inside_tar(main_file_uri, path_inside_archive)

        else:  # all the other supported formats, including uncompressed files
            if path_inside_archive:
                raise ValueError('Unsupported archive format: %s' %
                                 main_file_uri)

            # All the python packages for compressed files share the same interface as io.open
            extension = get_file_extension(main_file_uri)
            package = {
                '.txt': io,
                '.vec': io,
                '.gz': gzip,
                '.bz2': bz2,
                '.lzma': lzma,
            }.get(extension, None)

            if package is None:
                warn(
                    'The embeddings file has an unknown file extension "%s". '
                    'We will assume the file is an (uncompressed) text file',
                    extension)
                package = io

            self._handle = package.open(main_file_local_path,
                                        'rt',
                                        encoding=encoding)  # type: ignore

        # To use this with tqdm we'd like to know the number of tokens. It's possible that the
        # first line of the embeddings file contains this: if it does, we want to start iteration
        # from the 2nd line, otherwise we want to start from the 1st.
        # Unfortunately, once we read the first line, we cannot move back the file iterator
        # because the underlying file may be "not seekable"; we use itertools.chain instead.
        first_line = next(self._handle)  # this moves the iterator forward
        self.num_tokens = EmbeddingsTextFile._get_num_tokens_from_first_line(
            first_line)
        if self.num_tokens:
            # the first line is a header line: start iterating from the 2nd line
            self._iterator = self._handle
        else:
            # the first line is not a header line: start iterating from the 1st line
            self._iterator = itertools.chain([first_line], self._handle)
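
A hedged usage sketch for the constructor above, assuming iteration over an EmbeddingsTextFile yields one line per token (delegating to the _iterator built in __init__); the URL and cache directory are hypothetical:

# Hypothetical URL and cache directory.
embeddings_file = EmbeddingsTextFile('http://example.com/glove.txt.gz',
                                     cache_dir='/tmp/embedding_cache')
print(embeddings_file.num_tokens)  # token count parsed from a header line, if any
# Assumes the class is iterable over the _iterator set up in __init__ above.
tokens = [line.split(' ', 1)[0] for line in embeddings_file]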
Example #3
    def add_file_to_archive(self, name: str) -> None:
        """
        Any class in its ``from_params`` method can request that some of its
        input files be added to the archive by calling this method.

        For example, if some class ``A`` had an ``input_file`` parameter, it could call

        ```
        params.add_file_to_archive("input_file")
        ```

        which would store the supplied value for ``input_file`` at the key
        ``previous.history.and.then.input_file``. The ``files_to_archive`` dict
        is shared with child instances via the ``_check_is_dict`` method, so that
        the final mapping can be retrieved from the top-level ``Params`` object.

        NOTE: You must call ``add_file_to_archive`` before you ``pop()``
        the parameter, because the ``Params`` instance looks up the value
        of the filename inside itself.

        If the ``loading_from_archive`` flag is True, this will be a no-op.
        """
        if not self.loading_from_archive:
            self.files_to_archive[f"{self.history}{name}"] = cached_path(
                self.get(name))
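
A hedged sketch of the call order the docstring requires, using a hypothetical MyReader class and a hypothetical "input_file" key; Params and cached_path are assumed to be importable as in the surrounding examples:

class MyReader:
    def __init__(self, input_file: str) -> None:
        self.input_file = input_file

    @classmethod
    def from_params(cls, params: 'Params') -> 'MyReader':
        # Record the file for archiving *before* popping the parameter:
        # add_file_to_archive looks the value up inside the Params object itself.
        params.add_file_to_archive("input_file")
        return cls(input_file=cached_path(params.pop("input_file")))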
Example #4
    def _load_cnn_weights(self):
        cnn_options = self._options['char_cnn']
        filters = cnn_options['filters']
        char_embed_dim = cnn_options['embedding']['dim']

        convolutions = []
        for i, (width, num) in enumerate(filters):
            conv = torch.nn.Conv1d(in_channels=char_embed_dim,
                                   out_channels=num,
                                   kernel_size=width,
                                   bias=True)
            # load the weights
            with h5py.File(cached_path(self._weight_file), 'r') as fin:
                weight = fin['CNN']['W_cnn_{}'.format(i)][...]
                bias = fin['CNN']['b_cnn_{}'.format(i)][...]

            w_reshaped = numpy.transpose(weight.squeeze(axis=0),
                                         axes=(2, 1, 0))
            if w_reshaped.shape != tuple(conv.weight.data.shape):
                raise ValueError("Invalid weight file")
            conv.weight.data.copy_(torch.FloatTensor(w_reshaped))
            conv.bias.data.copy_(torch.FloatTensor(bias))

            conv.weight.requires_grad = self.requires_grad
            conv.bias.requires_grad = self.requires_grad

            convolutions.append(conv)
            self.add_module('char_conv_{}'.format(i), conv)

        self._convolutions = convolutions
Example #5
def _read_pretrained_words(embeddings_filename: str) -> Set[str]:
    words = set()
    with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split(' ')
            word = fields[0]
            words.add(word)
    return words
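
A brief usage sketch, with a hypothetical path; note that the function above opens the file with gzip.open, so it expects a gzip-compressed, whitespace-separated embeddings file:

# Hypothetical path to a gzipped embeddings file.
pretrained_words = _read_pretrained_words('/data/glove.6B.100d.txt.gz')
print(len(pretrained_words))
print('the' in pretrained_words)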
Example #6
    def _load_char_embedding(self):
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            char_embed_weights = fin['char_embed'][...]

        weights = numpy.zeros(
            (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]),
            dtype='float32')
        weights[1:, :] = char_embed_weights

        self._char_embedding_weights = torch.nn.Parameter(
            torch.FloatTensor(weights), requires_grad=self.requires_grad)
Example #7
    def from_file(params_file: str, params_overrides: str = "") -> 'Params':
        """
        Load a `Params` object from a configuration file.
        """
        # redirect to cache, if necessary
        params_file = cached_path(params_file)

        file_dict = pyhocon.ConfigFactory.parse_file(params_file)

        overrides_dict = pyhocon.ConfigFactory.parse_string(params_overrides)
        param_dict = overrides_dict.with_fallback(file_dict)
        return Params(param_dict)
Example #8
    def _open_inside_zip(self,
                         archive_path: str,
                         member_path: Optional[str] = None) -> None:
        cached_archive_path = cached_path(archive_path,
                                          cache_dir=self._cache_dir)
        archive = zipfile.ZipFile(cached_archive_path, 'r')
        if member_path is None:
            members_list = archive.namelist()
            member_path = self._get_the_only_file_in_the_archive(
                members_list, archive_path)
        member_path = cast(str, member_path)
        member_file = archive.open(member_path, 'r')
        self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
        self._archive_handle = archive
Example #9
    def from_file(params_file: str, params_overrides: str = "") -> 'Params':
        """
        Load a `Params` object from a configuration file.
        """
        # redirect to cache, if necessary
        params_file = cached_path(params_file)
        ext_vars = dict(os.environ)

        file_dict = json.loads(evaluate_file(params_file, ext_vars=ext_vars))

        overrides_dict = parse_overrides(params_overrides)
        param_dict = with_fallback(preferred=overrides_dict,
                                   fallback=file_dict)

        return Params(param_dict)
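
A hedged usage sketch for from_file, with a hypothetical configuration path; because with_fallback is called with preferred=overrides_dict, keys in the overrides string take precedence over the file contents:

# Hypothetical configuration file; the overrides string is parsed and merged
# on top of the file, so num_epochs here wins over the value in the file.
params = Params.from_file('experiments/my_model.jsonnet',
                          params_overrides='{"trainer": {"num_epochs": 1}}')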
Example #10
    def _open_inside_tar(self,
                         archive_path: str,
                         member_path: Optional[str] = None) -> None:
        cached_archive_path = cached_path(archive_path,
                                          cache_dir=self._cache_dir)
        archive = tarfile.open(cached_archive_path, 'r')
        if member_path is None:
            members_list = archive.getnames()
            member_path = self._get_the_only_file_in_the_archive(
                members_list, archive_path)
        member_path = cast(str, member_path)
        member = archive.getmember(
            member_path)  # raises exception if not present
        member_file = cast(IO[bytes], archive.extractfile(member))
        self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
        self._archive_handle = archive
Example #11
    def _load_projection(self):
        cnn_options = self._options['char_cnn']
        filters = cnn_options['filters']
        n_filters = sum(f[1] for f in filters)

        self._projection = torch.nn.Linear(n_filters,
                                           self.output_dim,
                                           bias=True)
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            weight = fin['CNN_proj']['W_proj'][...]
            bias = fin['CNN_proj']['b_proj'][...]
            self._projection.weight.data.copy_(
                torch.FloatTensor(numpy.transpose(weight)))
            self._projection.bias.data.copy_(torch.FloatTensor(bias))

            self._projection.weight.requires_grad = self.requires_grad
            self._projection.bias.requires_grad = self.requires_grad
Example #12
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False,
                 vocab_to_cache: List[str] = None) -> None:
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(
            options_file, weight_file, requires_grad=requires_grad)

        self._requires_grad = requires_grad
        if requires_grad and vocab_to_cache:
            warn(
                "You are fine tuning ELMo and caching char CNN word vectors. "
                "This behaviour is not guaranteed to be well defined, particularly. "
                "if not all of your inputs will occur in the vocabulary cache."
            )
        # This is an embedding, used to look up cached
        # word vectors built from character level cnn embeddings.
        self._word_embedding = None
        self._bos_embedding: torch.Tensor = None
        self._eos_embedding: torch.Tensor = None
        if vocab_to_cache:
            info("Caching character cnn layers for words in vocabulary.")
            # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
            # They are set in the method so they can be accessed from outside the
            # constructor.
            self.create_cached_cnn_embeddings(vocab_to_cache)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError(
                'We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(
            input_size=options['lstm']['projection_dim'],
            hidden_size=options['lstm']['projection_dim'],
            cell_size=options['lstm']['dim'],
            num_layers=options['lstm']['n_layers'],
            memory_cell_clip_value=options['lstm']['cell_clip'],
            state_projection_clip_value=options['lstm']['proj_clip'],
            requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1
Example #13
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False) -> None:
        super(_ElmoCharacterEncoder, self).__init__()

        with open(cached_path(options_file), 'r') as fin:
            self._options = json.load(fin)
        self._weight_file = weight_file

        self.output_dim = self._options['lstm']['projection_dim']
        self.requires_grad = requires_grad

        self._load_weights()

        # Cache the arrays for use in forward -- +1 due to masking.
        self._beginning_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) +
            1)
        self._end_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)
Example #14
    def _load_highway(self):
        # pylint: disable=protected-access
        # the highway layers have the same dimensionality as the number of cnn filters
        cnn_options = self._options['char_cnn']
        filters = cnn_options['filters']
        n_filters = sum(f[1] for f in filters)
        n_highway = cnn_options['n_highway']

        # create the layers, and load the weights
        self._highways = Highway(n_filters,
                                 n_highway,
                                 activation=torch.nn.functional.relu)
        for k in range(n_highway):
            # The AllenNLP highway is one matrix multiplication with concatenation of
            # transform and carry weights.
            with h5py.File(cached_path(self._weight_file), 'r') as fin:
                # The weights are transposed due to multiplication order assumptions in tf
                # vs pytorch (tf.matmul(X, W) vs pytorch.matmul(W, X))
                w_transform = numpy.transpose(
                    fin['CNN_high_{}'.format(k)]['W_transform'][...])
                # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but tf is (1 - g) * x + g * f(x)
                w_carry = -1.0 * numpy.transpose(
                    fin['CNN_high_{}'.format(k)]['W_carry'][...])
                weight = numpy.concatenate([w_transform, w_carry], axis=0)
                self._highways._layers[k].weight.data.copy_(
                    torch.FloatTensor(weight))
                self._highways._layers[
                    k].weight.requires_grad = self.requires_grad

                b_transform = fin['CNN_high_{}'.format(k)]['b_transform'][...]
                b_carry = -1.0 * fin['CNN_high_{}'.format(k)]['b_carry'][...]
                bias = numpy.concatenate([b_transform, b_carry], axis=0)
                self._highways._layers[k].bias.data.copy_(
                    torch.FloatTensor(bias))
                self._highways._layers[
                    k].bias.requires_grad = self.requires_grad
Example #15
    def load_weights(self, weight_file: str) -> None:
        """
        Load the pre-trained weights from the file.
        """
        requires_grad = self.requires_grad

        with h5py.File(cached_path(weight_file), 'r') as fin:
            for i_layer, lstms in enumerate(
                    zip(self.forward_layers, self.backward_layers)):
                for j_direction, lstm in enumerate(lstms):
                    # lstm is an instance of LSTMCellWithProjection
                    cell_size = lstm.cell_size

                    dataset = fin['RNN_%s' %
                                  j_direction]['RNN']['MultiRNNCell'][
                                      'Cell%s' % i_layer]['LSTMCell']

                    # tensorflow packs together both W and U matrices into one matrix,
                    # but pytorch maintains individual matrices.  In addition, tensorflow
                    # packs the gates as input, memory, forget, output but pytorch
                    # uses input, forget, memory, output.  So we need to modify the weights.
                    tf_weights = numpy.transpose(dataset['W_0'][...])
                    torch_weights = tf_weights.copy()

                    # split the W from U matrices
                    input_size = lstm.input_size
                    input_weights = torch_weights[:, :input_size]
                    recurrent_weights = torch_weights[:, input_size:]
                    tf_input_weights = tf_weights[:, :input_size]
                    tf_recurrent_weights = tf_weights[:, input_size:]

                    # handle the different gate order convention
                    for torch_w, tf_w in [[input_weights, tf_input_weights],
                                          [
                                              recurrent_weights,
                                              tf_recurrent_weights
                                          ]]:
                        torch_w[(1 * cell_size):(2 * cell_size), :] = tf_w[(
                            2 * cell_size):(3 * cell_size), :]
                        torch_w[(2 * cell_size):(3 * cell_size), :] = tf_w[(
                            1 * cell_size):(2 * cell_size), :]

                    lstm.input_linearity.weight.data.copy_(
                        torch.FloatTensor(input_weights))
                    lstm.state_linearity.weight.data.copy_(
                        torch.FloatTensor(recurrent_weights))
                    lstm.input_linearity.weight.requires_grad = requires_grad
                    lstm.state_linearity.weight.requires_grad = requires_grad

                    # the bias weights
                    tf_bias = dataset['B'][...]
                    # tensorflow adds 1.0 to forget gate bias instead of modifying the
                    # parameters...
                    tf_bias[(2 * cell_size):(3 * cell_size)] += 1
                    torch_bias = tf_bias.copy()
                    torch_bias[(1 * cell_size):(2 * cell_size)] = tf_bias[(
                        2 * cell_size):(3 * cell_size)]
                    torch_bias[(2 * cell_size):(3 * cell_size)] = tf_bias[(
                        1 * cell_size):(2 * cell_size)]
                    lstm.state_linearity.bias.data.copy_(
                        torch.FloatTensor(torch_bias))
                    lstm.state_linearity.bias.requires_grad = requires_grad

                    # the projection weights
                    proj_weights = numpy.transpose(dataset['W_P_0'][...])
                    lstm.state_projection.weight.data.copy_(
                        torch.FloatTensor(proj_weights))
                    lstm.state_projection.weight.requires_grad = requires_grad
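
The weight surgery above relies on two conventions: the TensorFlow checkpoint packs W and U into a single matrix, split here at input_size columns, and packs the gates as (input, memory, forget, output) while this code expects (input, forget, memory, output), so the second and third blocks of cell_size rows are swapped. A small numpy-only sketch of that gate swap, with hypothetical sizes:

import numpy

# Hypothetical sizes, for illustration only.
cell_size, width = 2, 3

# Rows stacked per gate in TensorFlow order: [input, memory, forget, output].
tf_w = numpy.arange(4 * cell_size * width).reshape(4 * cell_size, width)

torch_w = tf_w.copy()
# Swap the memory and forget blocks to obtain [input, forget, memory, output].
torch_w[1 * cell_size:2 * cell_size, :] = tf_w[2 * cell_size:3 * cell_size, :]
torch_w[2 * cell_size:3 * cell_size, :] = tf_w[1 * cell_size:2 * cell_size, :]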