Beispiel #1
0
    def test_run_block_with_large_work_mem(self):
        """Check that work_mem is raised only inside run_block_with_large_work_mem().

        The test overrides the 'large_work_mem' configuration value, sets the
        session's work_mem to a smaller value, and then verifies that a
        statement executed inside the block sees the large value while the
        session is back at the smaller value afterwards.

        The configuration override is undone in a ``finally`` clause so that a
        failing assertion or query does not leave it in place.
        """
        normal_work_mem = 256  # MB
        large_work_mem = 512  # MB

        def _session_work_mem() -> int:
            """Return the session's current work_mem as listed in pg_settings."""
            return int(self.db().query("""
                SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
            """).flat()[0])

        def _record_work_mem_inside_block():
            """Store the work_mem that is in effect while the block is running."""
            self.db().execute_with_large_work_mem("""
                INSERT INTO execute_large_work_mem (work_mem)
                SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
            """)

        config = py_get_config()

        # Remember whether the key existed at all, so that the cleanup can tell
        # "was unset" apart from "was set to None".
        had_large_work_mem = 'large_work_mem' in config['mediawords']
        old_large_work_mem = None
        if had_large_work_mem:
            old_large_work_mem = config['mediawords']['large_work_mem']

        config['mediawords']['large_work_mem'] = '%dMB' % large_work_mem
        py_set_config(config)

        try:
            self.db().query("SET work_mem TO %s", ('%sMB' % normal_work_mem, ))

            # pg_settings lists work_mem in kB, hence the MB -> kB conversion
            assert _session_work_mem() == normal_work_mem * 1024

            self.db().query(
                'CREATE TEMPORARY TABLE execute_large_work_mem (work_mem INT NOT NULL)'
            )
            self.db().run_block_with_large_work_mem(_record_work_mem_inside_block)

            statement_work_mem = int(self.db().query("""
                SELECT work_mem FROM execute_large_work_mem
            """).flat()[0])
            assert statement_work_mem == large_work_mem * 1024

            # Once the block has returned, the session value must be back to normal
            assert _session_work_mem() == normal_work_mem * 1024
        finally:
            # Put the configuration back exactly as it was found
            if had_large_work_mem:
                config['mediawords']['large_work_mem'] = old_large_work_mem
            else:
                del config['mediawords']['large_work_mem']
            py_set_config(config)
    def test_run_block_with_large_work_mem(self):
        """Check that work_mem is raised only inside run_block_with_large_work_mem().

        Steps:

        1. Override the 'large_work_mem' configuration value.
        2. Set the session's work_mem to a smaller ("normal") value.
        3. Run a block that records the work_mem it observes into a temporary table.
        4. Assert that the block saw the large value and that the session is
           back at the normal value afterwards.

        The configuration override is reverted in a ``finally`` clause, so it is
        undone even when one of the assertions or queries fails.
        """
        normal_work_mem = 256  # MB
        large_work_mem = 512  # MB

        def _session_work_mem() -> int:
            """Return the session's current work_mem as listed in pg_settings."""
            return int(self.db().query("""
                SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
            """).flat()[0])

        def _record_work_mem_inside_block():
            """Store the work_mem that is in effect while the block is running."""
            self.db().execute_with_large_work_mem("""
                INSERT INTO execute_large_work_mem (work_mem)
                SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
            """)

        config = py_get_config()

        # Track key presence separately from its value: an absent key has to be
        # removed again on cleanup rather than being written back as None.
        had_large_work_mem = 'large_work_mem' in config['mediawords']
        old_large_work_mem = None
        if had_large_work_mem:
            old_large_work_mem = config['mediawords']['large_work_mem']

        config['mediawords']['large_work_mem'] = '%dMB' % large_work_mem
        py_set_config(config)

        try:
            self.db().query("SET work_mem TO %s", ('%sMB' % normal_work_mem,))

            # pg_settings lists work_mem in kB, hence the MB -> kB conversion
            assert _session_work_mem() == normal_work_mem * 1024

            self.db().query('CREATE TEMPORARY TABLE execute_large_work_mem (work_mem INT NOT NULL)')
            self.db().run_block_with_large_work_mem(_record_work_mem_inside_block)

            statement_work_mem = int(self.db().query("""
                SELECT work_mem FROM execute_large_work_mem
            """).flat()[0])
            assert statement_work_mem == large_work_mem * 1024

            # Once the block has returned, the session value must be back to normal
            assert _session_work_mem() == normal_work_mem * 1024
        finally:
            # Put the configuration back exactly as it was found
            if had_large_work_mem:
                config['mediawords']['large_work_mem'] = old_large_work_mem
            else:
                del config['mediawords']['large_work_mem']
            py_set_config(config)
    def test_nyt_labels_annotator(self):
        """Annotate a test story via a mock NYTLabels server and verify the results.

        The test creates a medium, a story and one story sentence, starts a
        local HashServer that answers '/predict.json' with a sample response,
        points the 'nytlabels' configuration at it, and runs the annotator.
        Afterwards it asserts that an annotation row was found for the story
        and that the story's tags equal the expected ones.

        The mock server is stopped and the configuration is reset in
        ``finally`` clauses, so neither is left behind when the annotator
        raises.
        """
        media = self.db().create(table='media', insert_hash={
            'name': "test medium",
            'url': "url://test/medium",
        })

        story = self.db().create(table='stories', insert_hash={
            'media_id': media['media_id'],
            'url': 'url://story/a',
            'guid': 'guid://story/a',
            'title': 'story a',
            'description': 'description a',
            'publish_date': sql_now(),
            'collect_date': sql_now(),
            'full_text_rss': True,
        })
        stories_id = story['stories_id']

        self.db().create(table='story_sentences', insert_hash={
            'stories_id': stories_id,
            'sentence_number': 1,
            'sentence': 'I hope that the CLIFF annotator is working.',
            'media_id': media['media_id'],
            'publish_date': sql_now(),
            'language': 'en'
        })

        def __nyt_labels_sample_response(_: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator: return a raw HTTP response carrying the sample JSON."""
            headers = (
                "HTTP/1.0 200 OK\r\n"
                "Content-Type: application/json; charset=UTF-8\r\n"
                "\r\n"
            )
            return headers + encode_json(self.__sample_nyt_labels_response())

        pages = {
            '/predict.json': {
                'callback': __nyt_labels_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/predict.json' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()
        try:
            # Inject NYTLabels credentials into configuration
            config = py_get_config()
            new_config = copy.deepcopy(config)
            new_config['nytlabels'] = {
                'enabled': True,
                'annotator_url': annotator_url,
            }
            py_set_config(new_config)

            try:
                nytlabels = NYTLabelsAnnotator()
                nytlabels.annotate_and_store_for_story(db=self.db(), stories_id=stories_id)
                nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)
            finally:
                # Reset configuration even if the annotator failed
                py_set_config(config)
        finally:
            # Always shut the mock server down so that it does not outlive the test
            hs.stop()

        annotation_exists = self.db().query("""
            SELECT 1
            FROM nytlabels_annotations
            WHERE object_id = %(object_id)s
        """, {'object_id': stories_id}).hash()
        assert annotation_exists is not None

        story_tags = self.db().query("""
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """, {'stories_id': stories_id}).hashes()

        expected_tags = self.__expected_tags()

        assert story_tags == expected_tags
Beispiel #4
0
    def test_nyt_labels_annotator(self):
        """Annotate a test story via a mock NYTLabels server and verify the results.

        A medium, a story and a single story sentence are created; a local
        HashServer serving a sample response at '/predict.json' stands in for
        the annotator; the 'nytlabels' configuration is pointed at that server
        while the annotator runs. The test then asserts that an annotation row
        was found for the story and that the story's tags equal the expected
        ones.

        Stopping the mock server and resetting the configuration both happen
        in ``finally`` clauses, so a failure inside the annotator does not
        leave either of them behind.
        """
        media = self.db().create(table='media',
                                 insert_hash={
                                     'name': "test medium",
                                     'url': "url://test/medium",
                                 })

        story = self.db().create(table='stories',
                                 insert_hash={
                                     'media_id': media['media_id'],
                                     'url': 'url://story/a',
                                     'guid': 'guid://story/a',
                                     'title': 'story a',
                                     'description': 'description a',
                                     'publish_date': sql_now(),
                                     'collect_date': sql_now(),
                                     'full_text_rss': True,
                                 })
        stories_id = story['stories_id']

        self.db().create(table='story_sentences',
                         insert_hash={
                             'stories_id': stories_id,
                             'sentence_number': 1,
                             'sentence':
                             'I hope that the CLIFF annotator is working.',
                             'media_id': media['media_id'],
                             'publish_date': sql_now(),
                             'language': 'en'
                         })

        def __nyt_labels_sample_response(
                _: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator: return a raw HTTP response carrying the sample JSON."""
            headers = ("HTTP/1.0 200 OK\r\n"
                       "Content-Type: application/json; charset=UTF-8\r\n"
                       "\r\n")
            return headers + encode_json(self.__sample_nyt_labels_response())

        pages = {
            '/predict.json': {
                'callback': __nyt_labels_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/predict.json' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()
        try:
            # Inject NYTLabels credentials into configuration
            config = py_get_config()
            new_config = copy.deepcopy(config)
            new_config['nytlabels'] = {
                'enabled': True,
                'annotator_url': annotator_url,
            }
            py_set_config(new_config)

            try:
                nytlabels = NYTLabelsAnnotator()
                nytlabels.annotate_and_store_for_story(db=self.db(),
                                                       stories_id=stories_id)
                nytlabels.update_tags_for_story(db=self.db(),
                                                stories_id=stories_id)
            finally:
                # Reset configuration even if the annotator failed
                py_set_config(config)
        finally:
            # Always shut the mock server down so that it does not outlive the test
            hs.stop()

        annotation_exists = self.db().query(
            """
            SELECT 1
            FROM nytlabels_annotations
            WHERE object_id = %(object_id)s
        """, {
                'object_id': stories_id
            }).hash()
        assert annotation_exists is not None

        story_tags = self.db().query(
            """
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """, {
                'stories_id': stories_id
            }).hashes()

        expected_tags = self.__expected_tags()

        assert story_tags == expected_tags