Example #1
    def test_embeddings_text_file(self):
        txt_path = str(self.FIXTURES_ROOT / "utf-8_sample/utf-8_sample.txt")

        # This is a known-correct way to read a UTF-8 encoded text file
        with open(txt_path, "rt", encoding="utf-8") as f:
            correct_text = f.read()

        # Check that we get the correct text from plain and compressed versions of the file
        paths = [txt_path] + [txt_path + ext for ext in [".gz", ".zip"]]
        for path in paths:
            with EmbeddingsTextFile(path) as f:
                text = f.read()
            assert text == correct_text, "Test failed for file: " + path

        # Check for a file contained inside an archive with multiple files
        for ext in [".zip", ".tar.gz", ".tar.bz2", ".tar.lzma"]:
            archive_path = str(
                self.FIXTURES_ROOT / "utf-8_sample/archives/utf-8") + ext
            file_uri = format_embeddings_file_uri(archive_path,
                                                  "folder/utf-8_sample.txt")
            with EmbeddingsTextFile(file_uri) as f:
                text = f.read()
            assert text == correct_text, "Test failed for file: " + archive_path

        # Passing a second-level path when the target is not an archive should raise
        with pytest.raises(ValueError):
            with EmbeddingsTextFile(
                    format_embeddings_file_uri(txt_path, "a/fake/path")):
                pass
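
The compressed variants read above (.gz, .zip) are assumed to exist as pre-built fixtures next to the plain file. For reference, a minimal sketch of how the .gz variant of such a fixture could be produced with the standard library (the helper name is hypothetical, not part of the library):

    import gzip
    import shutil

    # Hypothetical helper: build the .gz variant of a plain-text fixture so
    # that EmbeddingsTextFile can be exercised against it.
    def make_gz_variant(txt_path):
        gz_path = txt_path + ".gz"
        with open(txt_path, "rb") as src, gzip.open(gz_path, "wb") as dst:
            shutil.copyfileobj(src, dst)
        return gz_path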
Example #2
    def test_embeddings_text_file(self):
        txt_path = str(self.FIXTURES_ROOT / 'utf-8_sample/utf-8_sample.txt')

        # This is a known-correct way to read a UTF-8 encoded text file
        with open(txt_path, 'rt', encoding='utf-8') as f:
            correct_text = f.read()

        # Check that we get the correct text from plain and compressed versions of the file
        paths = [txt_path] + [txt_path + ext for ext in ['.gz', '.zip']]
        for path in paths:
            with EmbeddingsTextFile(path) as f:
                text = f.read()
            assert text == correct_text, "Test failed for file: " + path

        # Check for a file contained inside an archive with multiple files
        for ext in ['.zip', '.tar.gz', '.tar.bz2', '.tar.lzma']:
            archive_path = str(self.FIXTURES_ROOT / 'utf-8_sample/archives/utf-8') + ext
            file_uri = format_embeddings_file_uri(archive_path, 'folder/utf-8_sample.txt')
            with EmbeddingsTextFile(file_uri) as f:
                text = f.read()
            assert text == correct_text, "Test failed for file: " + archive_path

        # Passing a second-level path when the target is not an archive should raise
        with pytest.raises(ValueError):
            with EmbeddingsTextFile(format_embeddings_file_uri(txt_path, 'a/fake/path')):
                pass
Example #3
    def test_read_pretrained_words(self):
        # The fixture "fake_embeddings.5d.txt" was generated using the words in this random quote
        words = set("If you think you are too small to make a difference "
                    "try to sleeping with a mosquito àèìòù".split(" "))

        # Reading from a single (compressed) file or a single-file archive
        base_path = str(self.FIXTURES_ROOT /
                        "embeddings/fake_embeddings.5d.txt")
        for ext in ["", ".gz", ".lzma", ".bz2", ".zip", ".tar.gz"]:
            file_path = base_path + ext
            words_read = set(_read_pretrained_tokens(file_path))
            assert words_read == words, (f"Wrong words for file {file_path}\n"
                                         f"   Read: {sorted(words_read)}\n"
                                         f"Correct: {sorted(words)}")

        # Reading from a multi-file archive
        base_path = str(self.FIXTURES_ROOT / "embeddings/multi-file-archive")
        file_path = "folder/fake_embeddings.5d.txt"
        for ext in [".zip", ".tar.gz"]:
            archive_path = base_path + ext
            embeddings_file_uri = format_embeddings_file_uri(
                archive_path, file_path)
            words_read = set(_read_pretrained_tokens(embeddings_file_uri))
            assert words_read == words, (
                f"Wrong words for file {archive_path}\n"
                f"   Read: {sorted(words_read)}\n"
                f"Correct: {sorted(words)}")
Example #4
    def test_from_instances_exclusive_embeddings_file_inside_archive(self):
        """ Just for ensuring there are no problems when reading pretrained tokens from an archive """
        # Build an archive containing the embeddings file plus a second, dummy file
        archive_path = str(self.TEST_DIR / "embeddings-archive.zip")

        with zipfile.ZipFile(archive_path, 'w') as archive:
            file_path = 'embedding.3d.vec'
            with archive.open(file_path, 'w') as embeddings_file:
                embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
                embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

            with archive.open('dummy.vec', 'w') as dummy_file:
                dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))

        embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
        vocab = Vocabulary.from_instances(self.dataset,
                                          min_count={'tokens': 4},
                                          pretrained_files={'tokens': embeddings_file_uri},
                                          only_include_pretrained_words=True)

        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' not in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset,
                                          pretrained_files={'tokens': embeddings_file_uri},
                                          only_include_pretrained_words=True)
        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' in words
        assert 'c' not in words
Example #5
    def test_from_instances_exclusive_embeddings_file_inside_archive(self):
        """ Just for ensuring there are no problems when reading pretrained tokens from an archive """
        # Build an archive containing the embeddings file plus a second, dummy file
        archive_path = str(self.TEST_DIR / "embeddings-archive.zip")

        with zipfile.ZipFile(archive_path, 'w') as archive:
            file_path = 'embedding.3d.vec'
            with archive.open(file_path, 'w') as embeddings_file:
                embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
                embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

            with archive.open('dummy.vec', 'w') as dummy_file:
                dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))

        embeddings_file_uri = format_embeddings_file_uri(
            archive_path, file_path)
        vocab = Vocabulary.from_instances(
            self.dataset,
            min_count={'tokens': 4},
            pretrained_files={'tokens': embeddings_file_uri},
            only_include_pretrained_words=True)

        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' not in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(
            self.dataset,
            pretrained_files={'tokens': embeddings_file_uri},
            only_include_pretrained_words=True)
        words = set(vocab.get_index_to_token_vocabulary().values())
        assert 'a' in words
        assert 'b' in words
        assert 'c' not in words
Example #6
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
                'embedding_dim': 5
                })
        # pretrained_file is a multi-file archive without an inner path, so this must raise
        with pytest.raises(ValueError):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({
                    'pretrained_file': file_uri,
                    'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
Example #7
    def test_read_pretrained_words(self):
        # The fixture "fake_embeddings.5d.txt" was generated using the words in this random quote
        words = set(u"If you think you are too small to make a difference "
                    u"try to sleeping with a mosquito àèìòù".split(u' '))

        # Reading from a single (compressed) file or a single-file archive
        base_path = unicode(self.FIXTURES_ROOT /
                            u"embeddings/fake_embeddings.5d.txt")
        for ext in [u'', u'.gz', u'.lzma', u'.bz2', u'.zip', u'.tar.gz']:
            file_path = base_path + ext
            words_read = _read_pretrained_tokens(file_path)
            assert words_read == words, "Wrong words for file {file_path}\n"\
                                        "   Read: {sorted(words_read)}\n"\
                                        "Correct: {sorted(words)}"

        # Reading from a multi-file archive
        base_path = unicode(self.FIXTURES_ROOT /
                            u"embeddings/multi-file-archive")
        file_path = u'folder/fake_embeddings.5d.txt'
        for ext in [u'.zip', u'.tar.gz']:
            archive_path = base_path + ext
            embeddings_file_uri = format_embeddings_file_uri(
                archive_path, file_path)
            words_read = _read_pretrained_tokens(embeddings_file_uri)
            assert words_read == words, "Wrong words for file {archive_path}\n"\
                                        "   Read: {sorted(words_read)}\n"\
                                        "Correct: {sorted(words)}"
Example #8
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
                'embedding_dim': 5
                })
        with pytest.raises(ValueError,
                           match="The archive .*/embeddings/multi-file-archive.zip contains multiple files, "
                                 "so you must select one of the files inside "
                                 "providing a uri of the type: "
                                 "\\(path_or_url_to_archive\\)#path_inside_archive\\."):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({
                    'pretrained_file': file_uri,
                    'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
Example #9
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
                'embedding_dim': 5
                })
        # pretrained_file is a multi-file archive without an inner path, so this must raise
        with pytest.raises(ValueError):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({
                    'pretrained_file': file_uri,
                    'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
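
Judging from the in-test fixture written in Example #4 ("a 1.0 2.3 -1.0"), each line of an embeddings file such as fake_embeddings.5d.txt appears to hold a token followed by whitespace-separated floats. A minimal sketch of parsing one such line (the parser is hypothetical, not part of the library):

    import torch

    # Hypothetical parser for one embeddings line, assuming the
    # "token v1 v2 ... vN" layout shown in Example #4.
    def parse_embedding_line(line):
        token, *values = line.rstrip("\n").split(" ")
        return token, torch.Tensor([float(v) for v in values])

    token, vec = parse_embedding_line("think 0.143 0.189 0.555 0.361 0.472\n")
    assert token == "think" and vec.shape == (5,)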
Example #10
    def test_decode_embeddings_file_uri(self):
        first_level_paths = [
            "path/to/embeddings.gz",
            "unicode/path/òàè+ù.vec",
            "http://www.embeddings.com/path/to/embeddings.gz",
            "http://www.embeddings.com/àèìòù?query=blabla.zip",
        ]
        second_level_paths = [
            "path/to/glove.27B.300d.vec", "òàè+ù.vec", "crawl-300d-2M.vec"
        ]

        for simple_path in first_level_paths:
            assert parse_embeddings_file_uri(simple_path) == (simple_path,
                                                              None)

        for path1, path2 in zip(first_level_paths, second_level_paths):
            uri = format_embeddings_file_uri(path1, path2)
            decoded = parse_embeddings_file_uri(uri)
            assert decoded == (path1, path2)
Example #11
    def test_decode_embeddings_file_uri(self):
        first_level_paths = [
                'path/to/embeddings.gz',
                'unicode/path/òàè+ù.vec',
                'http://www.embeddings.com/path/to/embeddings.gz',
                'http://www.embeddings.com/àèìòù?query=blabla.zip',
                ]
        second_level_paths = [
                'path/to/glove.27B.300d.vec',
                'òàè+ù.vec',
                'crawl-300d-2M.vec'
                ]

        for simple_path in first_level_paths:
            assert parse_embeddings_file_uri(simple_path) == (simple_path, None)

        for path1, path2 in zip(first_level_paths, second_level_paths):
            uri = format_embeddings_file_uri(path1, path2)
            decoded = parse_embeddings_file_uri(uri)
            assert decoded == (path1, path2)
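
The match pattern in Example #8 pins down the URI layout as (path_or_url_to_archive)#path_inside_archive. A hypothetical re-implementation of the round trip these two tests exercise, shown only to make the convention concrete (the function names are stand-ins, not the library's API):

    # Hypothetical stand-ins for format_embeddings_file_uri /
    # parse_embeddings_file_uri, assuming the "(archive)#inner_path" layout
    # named in the error message matched in Example #8.
    def my_format_uri(archive_path, inner_path):
        return f"({archive_path})#{inner_path}"

    def my_parse_uri(uri):
        if uri.startswith("(") and ")#" in uri:
            archive_path, _, inner_path = uri[1:].partition(")#")
            return archive_path, inner_path
        return uri, None  # a plain first-level path, no file inside an archive

    uri = my_format_uri("embeddings.zip", "folder/vectors.txt")
    assert my_parse_uri(uri) == ("embeddings.zip", "folder/vectors.txt")
    assert my_parse_uri("path/to/embeddings.gz") == ("path/to/embeddings.gz", None)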
Example #12
    def test_read_pretrained_words(self):
        # The fixture "fake_embeddings.5d.txt" was generated using the words in this random quote
        words = set("If you think you are too small to make a difference "
                    "try to sleeping with a mosquito àèìòù".split(' '))

        # Reading from a single (compressed) file or a single-file archive
        base_path = str(self.FIXTURES_ROOT / "embeddings/fake_embeddings.5d.txt")
        for ext in ['', '.gz', '.lzma', '.bz2', '.zip', '.tar.gz']:
            file_path = base_path + ext
            words_read = set(_read_pretrained_tokens(file_path))
            assert words_read == words, f"Wrong words for file {file_path}\n" \
                                        f"   Read: {sorted(words_read)}\n" \
                                        f"Correct: {sorted(words)}"

        # Reading from a multi-file archive
        base_path = str(self.FIXTURES_ROOT / "embeddings/multi-file-archive")
        file_path = 'folder/fake_embeddings.5d.txt'
        for ext in ['.zip', '.tar.gz']:
            archive_path = base_path + ext
            embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
            words_read = set(_read_pretrained_tokens(embeddings_file_uri))
            assert words_read == words, f"Wrong words for file {archive_path}\n" \
                                        f"   Read: {sorted(words_read)}\n" \
                                        f"Correct: {sorted(words)}"