Ejemplo n.º 1
0
    def test_ignores_documents_it_couldnt_load(self, dc_mock):
        """Run the processor while the mocked document lookup raises.

        ``dc_mock().documents.get`` is set up to raise a plain ``Exception``.
        The test has no explicit assertion: it passes as long as
        ``processor.process`` lets nothing propagate.
        """
        database = mock.Mock()
        database.query.return_value = [
            dict(id='file_id', documentcloud_id='100-foo'),
        ]
        dc_mock().documents.get.side_effect = Exception()

        processor.process({}, {'database': database})
Ejemplo n.º 2
0
    def test_updates_files_with_documentcloud_stripped_pages(
            self, dc_mock, conn, fda_file):
        """Check that the stored pages are the fetched page texts, stripped.

        The fake document reports three pages whose text ends in a newline.
        After processing, the ``files`` row looked up by ``fda_file`` must
        hold the same texts without the trailing newline.
        """
        def fake_page_text(num):
            # Trailing newline is deliberate: the expected pages lack it.
            return 'page %d\n' % num

        _enable_documentcloud_mock(dc_mock)
        document = dc_mock.documents.get()
        document.pages = 3
        document.get_page_text.side_effect = fake_page_text
        dc_mock().documents.get.return_value = document

        processor.process({}, conn)

        files_table = conn['database']['files']
        stored = files_table.find_one(id=fda_file)
        assert stored['pages'] == ['page 1', 'page 2', 'page 3']
Ejemplo n.º 3
0
    def test_ignores_documents_without_fulltext(self, dc_mock):
        """Verify nothing is updated when full text is unavailable.

        The mocked document raises ``NotImplementedError`` from
        ``get_full_text``; the database mock's ``update`` must stay uncalled.
        """
        database = mock.Mock()
        database.query.return_value = [
            dict(id='file_id', documentcloud_id='100-foo'),
        ]
        document = dc_mock().documents.get()
        document.get_full_text.side_effect = NotImplementedError()

        processor.process({}, {'database': database})

        database.update.assert_not_called()
Ejemplo n.º 4
0
    def test_ignores_documents_it_couldnt_load(self, dc_mock):
        """Process one queued file whose document lookup always fails.

        No assertion follows the call: the test succeeds only if
        ``processor.process`` does not let the generic ``Exception`` raised
        by ``dc_mock().documents.get`` escape.
        """
        rows = [{'id': 'file_id', 'documentcloud_id': '100-foo'}]
        connections = dict(database=mock.Mock())
        connections['database'].query.return_value = rows
        dc_mock().documents.get.side_effect = Exception()

        settings = {}
        processor.process(settings, connections)
Ejemplo n.º 5
0
    def test_raises_stuff(self, dc_mock):
        """Expect processing to raise when the lookup fails with code 403.

        The exception handed to ``documents.get`` carries ``code = 403``;
        ``processor.process`` must raise rather than return normally.
        """
        failure = Exception()
        failure.code = 403

        database = mock.Mock()
        database.query.return_value = [
            dict(id='file_id', documentcloud_id='100-foo'),
        ]
        dc_mock().documents.get.side_effect = failure

        with pytest.raises(Exception):
            processor.process({}, {'database': database})
Ejemplo n.º 6
0
    def test_raises_stuff(self, dc_mock):
        """Check that an exception tagged with code 403 is not swallowed.

        ``dc_mock().documents.get`` raises an ``Exception`` whose ``code``
        attribute is 403, and the call to ``processor.process`` is expected
        to raise.
        """
        rows = [{'id': 'file_id', 'documentcloud_id': '100-foo'}]
        connections = dict(database=mock.Mock())
        connections['database'].query.return_value = rows

        coded_error = Exception()
        setattr(coded_error, 'code', 403)
        dc_mock().documents.get.side_effect = coded_error

        settings = {}
        with pytest.raises(Exception):
            processor.process(settings, connections)
Ejemplo n.º 7
0
    def test_ignores_documents_without_fulltext(self, dc_mock):
        """Ensure a document lacking full text triggers no database update.

        ``get_full_text`` on the mocked document raises
        ``NotImplementedError``; afterwards ``update`` on the database mock
        must not have been called.
        """
        rows = [{'id': 'file_id', 'documentcloud_id': '100-foo'}]
        connections = dict(database=mock.Mock())
        connections['database'].query.return_value = rows

        fetched = dc_mock().documents.get()
        fetched.get_full_text.side_effect = NotImplementedError()

        settings = {}
        processor.process(settings, connections)

        connections['database'].update.assert_not_called()
Ejemplo n.º 8
0
    def test_updates_files_with_documentcloud_stripped_pages(
            self, dc_mock, write_file_mock):
        """Check the document is fetched by id and stripped pages are written.

        The fake document has three pages whose text ends in a newline. The
        test expects ``documents.get`` to be called with the row's
        ``documentcloud_id`` and ``write_file_mock`` to receive the row id as
        hex together with the page texts minus the trailing newline.
        """
        def fake_page_text(num):
            # Trailing newline is deliberate: the expected pages lack it.
            return 'page %d\n' % num

        record = dict(id=uuid.uuid1(), documentcloud_id='100-foo')
        database = mock.Mock()
        database.query.return_value = [record]
        conn = {'database': database}

        _enable_documentcloud_mock(dc_mock)
        document = dc_mock.documents.get()
        document.pages = 3
        document.get_page_text.side_effect = fake_page_text
        dc_mock().documents.get.return_value = document

        processor.process({}, conn)

        # Values are read from the row after processing, as the row object
        # itself was handed to the processor via the query result.
        dc_mock().documents.get.assert_called_with(record['documentcloud_id'])
        expected_payload = {
            'id': record['id'].hex,
            'pages': ['page 1', 'page 2', 'page 3'],
        }
        write_file_mock.assert_called_with(conn, expected_payload)