def test_gzip_bad_input():
    """Check that gzip() and gunzip() raise their dedicated exceptions on bad input."""
    with pytest.raises(McGzipException):
        # noinspection PyTypeChecker
        gzip(None)

    # gunzip() must refuse None, empty input and bytes that are not Gzip data
    invalid_gunzip_inputs = (
        None,
        b'',
        b'No way this is valid Gzip data',
    )
    for invalid_input in invalid_gunzip_inputs:
        with pytest.raises(McGunzipException):
            # noinspection PyTypeChecker
            gunzip(invalid_input)
Exemple #2
0
def test_gzip_bad_input():
    """Check that gzip() and gunzip() raise their dedicated exceptions on bad input."""
    with pytest.raises(McGzipException):
        # noinspection PyTypeChecker
        gzip(None)

    # gunzip() must refuse None, empty input and bytes that are not Gzip data
    invalid_gunzip_inputs = (
        None,
        b'',
        b'No way this is valid Gzip data',
    )
    for invalid_input in invalid_gunzip_inputs:
        with pytest.raises(McGunzipException):
            # noinspection PyTypeChecker
            gunzip(invalid_input)
Exemple #3
0
    def _compress_data_for_method(data: Union[bytes, str],
                                  compression_method: Compression) -> bytes:
        """Compress data with the requested compression method.

        A str is encoded as UTF-8 before compression. Compression.NONE hands
        the (encoded) bytes back unchanged.

        Raises McKeyValueStoreCompressionException when data is None, when it
        is neither str nor bytes, or when the compression method is not one of
        NONE, GZIP or BZIP2.
        """

        if data is None:
            raise McKeyValueStoreCompressionException("Data is None.")

        # Normalize to bytes; anything that is still not bytes afterwards is rejected
        payload = data.encode('utf-8') if isinstance(data, str) else data
        if not isinstance(payload, bytes):
            raise McKeyValueStoreCompressionException(
                "Data is not str or bytes: %s" % str(payload))

        methods = KeyValueStore.Compression

        if compression_method == methods.NONE:
            return payload

        if compression_method == methods.GZIP:
            return gzip(payload)

        if compression_method == methods.BZIP2:
            return bzip2(payload)

        raise McKeyValueStoreCompressionException(
            "Invalid compression method: %s" % compression_method)
Exemple #4
0
    def __inner_test_gzip(data_: bytes) -> None:
        """Round-trip data_ through gzip() and gunzip(), checking each step."""
        compressed = gzip(data_)

        # Compression must produce non-empty bytes that differ from the input
        assert len(compressed) > 0
        assert isinstance(compressed, bytes)
        assert compressed != data_

        # Decompression must give back exactly what went in
        assert gunzip(compressed) == data_
    def __inner_test_gzip(data_: bytes) -> None:
        """Round-trip data_ through gzip() and gunzip(), checking each step."""
        compressed = gzip(data_)

        # Compression must produce non-empty bytes that differ from the input
        assert len(compressed) > 0
        assert isinstance(compressed, bytes)
        assert compressed != data_

        # Decompression must give back exactly what went in
        assert gunzip(compressed) == data_
    def store_model(self, model_data: bytes) -> int:
        """Gzip the model data, insert it into snap.word2vec_models and return the new row's ID.

        The returned value is the first element of the flattened query result,
        i.e. the snap_word2vec_models_id produced by the RETURNING clause.
        """
        raw_data = gzip(model_data)

        insert_sql = """
            INSERT INTO snap.word2vec_models (topics_id, snapshots_id, raw_data)
            VALUES (%(topics_id)s, %(snapshots_id)s, %(raw_data)s)
            RETURNING snap_word2vec_models_id
        """
        insert_params = {
            'topics_id': self.__topics_id,
            'snapshots_id': self.__snapshots_id,
            'raw_data': raw_data,
        }

        returned_ids = self.__db.query(insert_sql, insert_params).flat()
        return returned_ids[0]
Exemple #7
0
    def _compress_data_for_method(data: Union[bytes, str], compression_method: Compression) -> bytes:
        """Compress data with the requested compression method.

        A str is encoded as UTF-8 before compression. Compression.NONE hands
        the (encoded) bytes back unchanged.

        Raises McKeyValueStoreCompressionException when data is None, when it
        is neither str nor bytes, or when the compression method is not one of
        NONE, GZIP or BZIP2.
        """

        if data is None:
            raise McKeyValueStoreCompressionException("Data is None.")

        # Normalize to bytes; anything that is still not bytes afterwards is rejected
        payload = data.encode('utf-8') if isinstance(data, str) else data
        if not isinstance(payload, bytes):
            raise McKeyValueStoreCompressionException("Data is not str or bytes: %s" % str(payload))

        methods = KeyValueStore.Compression

        if compression_method == methods.NONE:
            return payload

        if compression_method == methods.GZIP:
            return gzip(payload)

        if compression_method == methods.BZIP2:
            return bzip2(payload)

        raise McKeyValueStoreCompressionException("Invalid compression method: %s" % compression_method)
Exemple #8
0
    def test_sitemap_tree_for_homepage_huge_sitemap(self):
        """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling).

        Builds an XML sitemap with ``page_count`` generated URLs, serves it
        gzipped from a local HashServer and checks that every page is returned
        by the resulting sitemap tree.
        """

        page_count = 1000

        sitemap_header = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                    xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
                    xmlns:xhtml="http://www.w3.org/1999/xhtml">
        """

        url_entry_template = """
                <url>
                    <loc>{base_url}/news/page_{x}.html</loc>

                    <!-- Element present but empty -->
                    <lastmod />

                    <!-- Some other XML namespace -->
                    <xhtml:link rel="alternate"
                                media="only screen and (max-width: 640px)"
                                href="{base_url}/news/page_{x}.html?mobile=1" />

                    <news:news>
                        <news:publication>
                            <news:name>{publication_name}</news:name>
                            <news:language>{publication_language}</news:language>
                        </news:publication>
                        <news:publication_date>{publication_date}</news:publication_date>
                        <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                    </news:news>
                </url>
            """

        # Collect the pieces and join once instead of growing a string inside the loop
        sitemap_parts = [sitemap_header]
        for x in range(page_count):
            sitemap_parts.append(url_entry_template.format(
                x=x,
                base_url=self.__test_url,
                publication_name=self.TEST_PUBLICATION_NAME,
                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                publication_date=self.TEST_DATE_STR,
            ))
        sitemap_parts.append("</urlset>")
        sitemap_xml = "".join(sitemap_parts)

        pages = {
            '/': 'This is a homepage.',
            '/robots.txt': {
                'header':
                'Content-Type: text/plain',
                'content':
                textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap.xml.gz
                    """.format(base_url=self.__test_url)).strip(),
            },
            '/sitemap.xml.gz': {
                'header': 'Content-Type: application/x-gzip',
                'content': gzip(sitemap_xml),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()
        try:
            actual_sitemap_tree = sitemap_tree_for_homepage(
                homepage_url=self.__test_url)
        finally:
            # Stop the server even if fetching / parsing raised, so that it
            # does not keep running after a failed test
            hs.stop()

        assert len(actual_sitemap_tree.all_pages()) == page_count
Exemple #9
0
    def test_sitemap_tree_for_homepage_plain_text(self):
        """Test sitemap_tree_for_homepage() with plain text sitemaps.

        robots.txt points to two plain text sitemaps: an uncompressed one and
        a gzipped one served without a ".gz" extension. Both list "bar.html",
        so the tree is expected to report three distinct pages overall.
        """

        pages = {
            '/': 'This is a homepage.',
            '/robots.txt': {
                'header':
                'Content-Type: text/plain',
                'content':
                textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap_1.txt
                        Sitemap: {base_url}/sitemap_2.txt.dat
                    """.format(base_url=self.__test_url)).strip(),
            },

            # Plain text uncompressed sitemap
            '/sitemap_1.txt': {
                'content':
                textwrap.dedent("""

                    {base_url}/news/foo.html


                    {base_url}/news/bar.html

                    Some other stuff which totally doesn't look like an URL
                """.format(base_url=self.__test_url)).strip(),
            },

            # Plain text compressed sitemap without .gz extension
            '/sitemap_2.txt.dat': {
                'header':
                'Content-Type: application/x-gzip',
                'content':
                gzip(
                    textwrap.dedent("""
                    {base_url}/news/bar.html
                        {base_url}/news/baz.html
                """.format(base_url=self.__test_url)).strip()),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()
        try:
            actual_sitemap_tree = sitemap_tree_for_homepage(
                homepage_url=self.__test_url)
        finally:
            # Stop the server even if fetching / parsing raised, so that it
            # does not keep running after a failed test
            hs.stop()

        assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
        assert len(actual_sitemap_tree.sub_sitemaps) == 2

        sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
        assert isinstance(sitemap_1, PagesTextSitemap)
        assert len(sitemap_1.pages) == 2

        sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
        assert isinstance(sitemap_2, PagesTextSitemap)
        assert len(sitemap_2.pages) == 2

        # Separate name so the served "pages" fixture above is not rebound
        all_pages = actual_sitemap_tree.all_pages()
        assert len(all_pages) == 3
        assert SitemapPage(
            url='{}/news/foo.html'.format(self.__test_url)) in all_pages
        assert SitemapPage(
            url='{}/news/bar.html'.format(self.__test_url)) in all_pages
        assert SitemapPage(
            url='{}/news/baz.html'.format(self.__test_url)) in all_pages
Exemple #10
0
    def test_sitemap_tree_for_homepage_gzip(self):
        """Test sitemap_tree_for_homepage() with gzipped sitemaps.

        Two gzipped XML sitemaps are served: one identifiable only by its
        ".gz" extension, the other only by its "application/x-gzip" header.
        """

        pages = {
            '/': 'This is a homepage.',
            '/robots.txt': {
                'header':
                'Content-Type: text/plain',
                'content':
                textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap_1.gz
                        Sitemap: {base_url}/sitemap_2.dat
                    """.format(base_url=self.__test_url)).strip(),
            },

            # Gzipped sitemap without correct HTTP header but with .gz extension
            '/sitemap_1.gz': {
                'content':
                gzip(
                    textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/foo.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                            </news:news>
                        </url>
                    </urlset>
                """.format(
                        base_url=self.__test_url,
                        publication_name=self.TEST_PUBLICATION_NAME,
                        publication_language=self.TEST_PUBLICATION_LANGUAGE,
                        publication_date=self.TEST_DATE_STR,
                    )).strip()),
            },

            # Gzipped sitemap with correct HTTP header but without .gz extension
            '/sitemap_2.dat': {
                'header':
                'Content-Type: application/x-gzip',
                'content':
                gzip(
                    textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/baz.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title><![CDATA[Bąž]]></news:title>    <!-- CDATA and UTF-8 -->
                            </news:news>
                        </url>
                    </urlset>
                """.format(
                        base_url=self.__test_url,
                        publication_name=self.TEST_PUBLICATION_NAME,
                        publication_language=self.TEST_PUBLICATION_LANGUAGE,
                        publication_date=self.TEST_DATE_STR,
                    )).strip()),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()
        try:
            actual_sitemap_tree = sitemap_tree_for_homepage(
                homepage_url=self.__test_url)
        finally:
            # Stop the server even if fetching / parsing raised, so that it
            # does not keep running after a failed test
            hs.stop()

        # Don't do an in-depth check, we just need to make sure that gunzip works
        assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
        assert len(actual_sitemap_tree.sub_sitemaps) == 2

        sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
        assert isinstance(sitemap_1, PagesXMLSitemap)
        assert len(sitemap_1.pages) == 1

        sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
        assert isinstance(sitemap_2, PagesXMLSitemap)
        assert len(sitemap_2.pages) == 1
Exemple #11
0
 def __inner_test_wrong_algorithm(data_: bytes) -> None:
     """Check that decompressing with the wrong algorithm raises that decompressor's exception."""
     # (expected exception, decompressor under test, mismatching compressor)
     mismatched_cases = (
         (McBunzip2Exception, bunzip2, gzip),
         (McGunzipException, gunzip, bzip2),
     )
     for expected_exception, decompress, compress in mismatched_cases:
         with pytest.raises(expected_exception):
             decompress(compress(data_))
 def __inner_test_wrong_algorithm(data_: bytes) -> None:
     """Check that decompressing with the wrong algorithm raises that decompressor's exception."""
     # (expected exception, decompressor under test, mismatching compressor)
     mismatched_cases = (
         (McBunzip2Exception, bunzip2, gzip),
         (McGunzipException, gunzip, bzip2),
     )
     for expected_exception, decompress, compress in mismatched_cases:
         with pytest.raises(expected_exception):
             decompress(compress(data_))
Exemple #13
0
    def test_sitemap_tree_for_homepage_huge_sitemap(self):
        """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling).

        Builds an XML sitemap with ``page_count`` generated URLs, serves it
        gzipped from a local HashServer and checks that every page is returned
        by the resulting sitemap tree.
        """

        page_count = 1000

        sitemap_header = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                    xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
                    xmlns:xhtml="http://www.w3.org/1999/xhtml">
        """

        url_entry_template = """
                <url>
                    <loc>{base_url}/news/page_{x}.html</loc>

                    <!-- Element present but empty -->
                    <lastmod />

                    <!-- Some other XML namespace -->
                    <xhtml:link rel="alternate"
                                media="only screen and (max-width: 640px)"
                                href="{base_url}/news/page_{x}.html?mobile=1" />

                    <news:news>
                        <news:publication>
                            <news:name>{publication_name}</news:name>
                            <news:language>{publication_language}</news:language>
                        </news:publication>
                        <news:publication_date>{publication_date}</news:publication_date>
                        <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                    </news:news>
                </url>
            """

        # Collect the pieces and join once instead of growing a string inside the loop
        sitemap_parts = [sitemap_header]
        for x in range(page_count):
            sitemap_parts.append(url_entry_template.format(
                x=x,
                base_url=self.__test_url,
                publication_name=self.TEST_PUBLICATION_NAME,
                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                publication_date=self.TEST_DATE_STR,
            ))
        sitemap_parts.append("</urlset>")
        sitemap_xml = "".join(sitemap_parts)

        pages = {
            '/': 'This is a homepage.',

            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap.xml.gz
                    """.format(base_url=self.__test_url)).strip(),
            },

            '/sitemap.xml.gz': {
                'header': 'Content-Type: application/x-gzip',
                'content': gzip(sitemap_xml),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()
        try:
            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)
        finally:
            # Stop the server even if fetching / parsing raised, so that it
            # does not keep running after a failed test
            hs.stop()

        assert len(actual_sitemap_tree.all_pages()) == page_count
Exemple #14
0
    def test_sitemap_tree_for_homepage_plain_text(self):
        """Test sitemap_tree_for_homepage() with plain text sitemaps.

        robots.txt points to two plain text sitemaps: an uncompressed one and
        a gzipped one served without a ".gz" extension. Both list "bar.html",
        so the tree is expected to report three distinct pages overall.
        """

        pages = {
            '/': 'This is a homepage.',

            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap_1.txt
                        Sitemap: {base_url}/sitemap_2.txt.dat
                    """.format(base_url=self.__test_url)).strip(),
            },

            # Plain text uncompressed sitemap
            '/sitemap_1.txt': {
                'content': textwrap.dedent("""

                    {base_url}/news/foo.html


                    {base_url}/news/bar.html

                    Some other stuff which totally doesn't look like an URL
                """.format(base_url=self.__test_url)).strip(),
            },

            # Plain text compressed sitemap without .gz extension
            '/sitemap_2.txt.dat': {
                'header': 'Content-Type: application/x-gzip',
                'content': gzip(textwrap.dedent("""
                    {base_url}/news/bar.html
                        {base_url}/news/baz.html
                """.format(base_url=self.__test_url)).strip()),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()
        try:
            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)
        finally:
            # Stop the server even if fetching / parsing raised, so that it
            # does not keep running after a failed test
            hs.stop()

        assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
        assert len(actual_sitemap_tree.sub_sitemaps) == 2

        sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
        assert isinstance(sitemap_1, PagesTextSitemap)
        assert len(sitemap_1.pages) == 2

        sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
        assert isinstance(sitemap_2, PagesTextSitemap)
        assert len(sitemap_2.pages) == 2

        # Separate name so the served "pages" fixture above is not rebound
        all_pages = actual_sitemap_tree.all_pages()
        assert len(all_pages) == 3
        assert SitemapPage(url='{}/news/foo.html'.format(self.__test_url)) in all_pages
        assert SitemapPage(url='{}/news/bar.html'.format(self.__test_url)) in all_pages
        assert SitemapPage(url='{}/news/baz.html'.format(self.__test_url)) in all_pages
Exemple #15
0
    def test_sitemap_tree_for_homepage_gzip(self):
        """Test sitemap_tree_for_homepage() with gzipped sitemaps.

        Two gzipped XML sitemaps are served: one identifiable only by its
        ".gz" extension, the other only by its "application/x-gzip" header.
        """

        pages = {
            '/': 'This is a homepage.',

            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap_1.gz
                        Sitemap: {base_url}/sitemap_2.dat
                    """.format(base_url=self.__test_url)).strip(),
            },

            # Gzipped sitemap without correct HTTP header but with .gz extension
            '/sitemap_1.gz': {
                'content': gzip(textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/foo.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                            </news:news>
                        </url>
                    </urlset>
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip()),
            },

            # Gzipped sitemap with correct HTTP header but without .gz extension
            '/sitemap_2.dat': {
                'header': 'Content-Type: application/x-gzip',
                'content': gzip(textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/baz.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title><![CDATA[Bąž]]></news:title>    <!-- CDATA and UTF-8 -->
                            </news:news>
                        </url>
                    </urlset>
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip()),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()
        try:
            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)
        finally:
            # Stop the server even if fetching / parsing raised, so that it
            # does not keep running after a failed test
            hs.stop()

        # Don't do an in-depth check, we just need to make sure that gunzip works
        assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
        assert len(actual_sitemap_tree.sub_sitemaps) == 2

        sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
        assert isinstance(sitemap_1, PagesXMLSitemap)
        assert len(sitemap_1.pages) == 1

        sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
        assert isinstance(sitemap_2, PagesXMLSitemap)
        assert len(sitemap_2.pages) == 1