def test_handle_response_for_invalid_content(mock_get, response_dir):
    """If invalid content is returned, store warning log entry.

    The mocked GET answers with status 200, a ``text/plain`` content type
    and a body of ``b'foo bar'``.  After the download, the test checks that
    a WARNING was emitted on the ``ocrd_models.utils.handle_oai_response``
    logger.

    :param mock_get: mock standing in for the HTTP GET call (injected by
        the test harness; presumably a patched ``requests.get``).
    :param response_dir: target directory passed to
        ``Resolver.download_to_directory``.
    """

    # arrange
    url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=foo'
    mock_get.return_value.status_code = 200
    mock_get.return_value.content = b'foo bar'
    headers = {'Content-Type': 'text/plain'}
    mock_get.return_value.headers = headers
    resolver = Resolver()
    initLogging()

    # capture log
    log = getLogger('ocrd_models.utils.handle_oai_response')
    capt = FIFOIO(256)
    sh = StreamHandler(capt)
    sh.setFormatter(Formatter(LOG_FORMAT))
    log.addHandler(sh)

    try:
        # act
        resolver.download_to_directory(response_dir, url)
    finally:
        # Loggers are shared between tests, so always detach the capture
        # handler, even when the download raises; otherwise it would keep
        # receiving records from every later test.
        log.removeHandler(sh)

    # assert
    mock_get.assert_called_once_with(url)
    log_output = capt.getvalue()
    assert 'WARNING ocrd_models.utils.handle_oai_response' in log_output
def test_handle_common_oai_response(mock_get, response_dir, oai_response_content):
    """Happy path: an XML OAI response is downloaded and stored as 'oai'."""
    initLogging()

    # arrange: the mocked GET answers 200 with the OAI payload as text/xml
    oai_url = 'http://digital.bibliothek.uni-halle.de/hd/oai/?verb=GetRecord&metadataPrefix=mets&mode=xml&identifier=9049'
    fake_response = mock_get.return_value
    fake_response.status_code = 200
    fake_response.content = oai_response_content
    fake_response.headers = {'Content-Type': 'text/xml'}
    oai_resolver = Resolver()

    # act
    downloaded_name = oai_resolver.download_to_directory(response_dir, oai_url)

    # assert: exactly one request was made and the result is named 'oai'
    mock_get.assert_called_once_with(oai_url)
    assert downloaded_name == 'oai'
Example #3
0
class TestResolver(TestCase):
    """Exercise ``Resolver`` and the ``Workspace`` objects it produces.

    Covers ``workspace_from_url``, ``workspace_from_nothing`` and
    ``download_to_directory`` on the resolver, plus ``add_file``,
    ``download_file``, ``resolve_image_as_pil`` and METS save/reload on the
    workspace.  Several tests reference remote ``https://`` URLs and so
    presumably need network access.
    """

    def setUp(self):
        """Create a fresh resolver and a pristine copy of the Kant fixture."""
        self.resolver = Resolver()
        self.folder = join(TMP_FOLDER, 'kant_aufklaerung_1784')
        if exists(TMP_FOLDER):
            # Wipe leftovers of a previous test so copytree() below finds
            # no pre-existing destination.
            rmtree(TMP_FOLDER)
            os.makedirs(TMP_FOLDER)
        # copytree() creates missing parent directories of the destination,
        # so this also works when TMP_FOLDER did not exist beforehand.
        copytree(FOLDER_KANT, self.folder)

    def test_workspace_from_url_bad(self):
        """Passing neither METS URL nor base URL must raise."""
        with self.assertRaisesRegex(Exception,
                                    "Must pass mets_url and/or baseurl"):
            self.resolver.workspace_from_url(None)

    def test_workspace_from_url_tempdir(self):
        """A workspace can be created from a remote METS without dst_dir."""
        self.resolver.workspace_from_url(
            mets_basename='foo.xml',
            mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
        )

    def test_workspace_from_url_download(self):
        """A workspace can be created with ``download=True`` into dst_dir."""
        with TemporaryDirectory() as dst_dir:
            self.resolver.workspace_from_url(
                mets_basename='foo.xml',
                dst_dir=dst_dir,
                download=True,
                mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
            )

    def test_workspace_from_url_no_clobber(self):
        """An existing ``mets.xml`` in dst_dir must not be overwritten."""
        with self.assertRaisesRegex(
                Exception, "already exists but clobber_mets is false"):
            with TemporaryDirectory() as dst_dir:
                # Pre-populate the target so the resolver finds a METS file.
                with open(join(dst_dir, 'mets.xml'), 'w') as f:
                    f.write('CONTENT')
                self.resolver.workspace_from_url(
                    dst_dir=dst_dir,
                    mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
                )

    def test_workspace_from_url_404(self):
        """A METS URL that does not exist must raise 'Not found'."""
        with self.assertRaisesRegex(Exception, "Not found"):
            self.resolver.workspace_from_url(
                mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xmlX'
            )

    def test_workspace_from_url_rel_dir(self):
        """A relative ``baseurl`` and a relative ``dst_dir`` are accepted.

        ``dst_dir`` is expressed as many ``..`` hops followed by the
        temporary directory's absolute path without its leading separator.
        """
        with TemporaryDirectory() as dst_dir:
            # Remember the working directory locally and restore it in
            # ``finally`` so a failure here cannot leave later tests
            # running from inside FOLDER_KANT.
            previous_cwd = os.getcwd()
            os.chdir(FOLDER_KANT)
            try:
                self.resolver.workspace_from_url(
                    None,
                    baseurl='data',
                    dst_dir='../../../../../../../../../../../../../../../../' +
                    dst_dir[1:])
            finally:
                os.chdir(previous_cwd)

    def test_workspace_from_url(self):
        """The first OCR-D-IMG file of the Herold METS can be downloaded."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        image_file = input_files[0]
        f = workspace.download_file(image_file)
        self.assertEqual(f.ID, 'FILE_0001_IMAGE')

    def test_resolve_image(self):
        """An image resolves to full size and to a 1x1 crop."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        f = input_files[0]
        img_pil1 = workspace.resolve_image_as_pil(f.url)
        self.assertEqual(img_pil1.size, (2875, 3749))
        img_pil2 = workspace.resolve_image_as_pil(f.url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))
        # Same crop requested a second time; presumably exercises a cached
        # code path -- TODO confirm.  Only checks that the call succeeds.
        img_pil2 = workspace.resolve_image_as_pil(f.url, [[0, 0], [1, 1]])

    def test_resolve_image_grayscale(self):
        """An image from the OCR-D-IMG-NRM group resolves and crops."""
        img_url = assets.url_of(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017'
        )
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        img_pil1 = workspace.resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace.resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    def test_resolve_image_bitonal(self):
        """An image from the OCR-D-IMG-1BIT group resolves and crops."""
        img_url = assets.url_of(
            'kant_aufklaerung_1784-binarized/data/OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017'
        )
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        img_pil1 = workspace.resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace.resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    def test_workspace_from_nothing(self):
        """Empty workspaces can be created, but not twice in one directory."""
        ws1 = self.resolver.workspace_from_nothing(None)
        self.assertIsNotNone(ws1.mets)
        tmp_dir = join(TMP_FOLDER, 'from-nothing')
        ws2 = self.resolver.workspace_from_nothing(tmp_dir)
        self.assertEqual(ws2.directory, tmp_dir)
        # A second creation in the same directory must refuse to overwrite.
        # assertRaisesRegex reports a missing exception directly instead of
        # routing an AssertionError through a broad ``except Exception``.
        with self.assertRaisesRegex(Exception, 'Not clobbering'):
            self.resolver.workspace_from_nothing(tmp_dir)

    def test_download_to_directory_badargs_url(self):
        """A non-string ``url`` is rejected."""
        with self.assertRaisesRegex(Exception, "'url' must be a string"):
            self.resolver.download_to_directory(None, None)

    def test_download_to_directory_badargs_directory(self):
        """A non-string ``directory`` is rejected."""
        with self.assertRaisesRegex(Exception, "'directory' must be a string"):
            self.resolver.download_to_directory(None, 'foo')

    def test_download_to_directory_default(self):
        """Without basename/subdir the file name is derived from the URL."""
        tmp_dir = join(TMP_FOLDER, 'target')
        fn = self.resolver.download_to_directory(
            tmp_dir, 'file://' + join(self.folder, 'data/mets.xml'))
        # Expected name: the source path with '/', '_', '.' and '-' each
        # replaced by '.', prefixed with 'file'.
        self.assertEqual(
            fn,
            join(tmp_dir,
                 'file%s.data.mets.xml' % sub(r'[/_\.\-]', '.', self.folder)))

    def test_download_to_directory_basename(self):
        """An explicit ``basename`` becomes the target file name."""
        tmp_dir = join(TMP_FOLDER, 'target')
        fn = self.resolver.download_to_directory(
            tmp_dir,
            'file://' + join(self.folder, 'data/mets.xml'),
            basename='foo')
        self.assertEqual(fn, join(tmp_dir, 'foo'))

    def test_download_to_directory_subdir(self):
        """An explicit ``subdir`` places the file below that directory."""
        tmp_dir = join(TMP_FOLDER, 'target')
        fn = self.resolver.download_to_directory(
            tmp_dir,
            'file://' + join(self.folder, 'data/mets.xml'),
            subdir='baz')
        self.assertEqual(fn, join(tmp_dir, 'baz', 'mets.xml'))

    def test_workspace_add_file(self):
        """Adding a file with content registers it and writes it to disk."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'ID1.tif')
            ws1.add_file('GRP',
                         ID='ID1',
                         mimetype='image/tiff',
                         content='CONTENT',
                         local_filename=fpath)
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.ID, 'ID1')
            self.assertEqual(f.mimetype, 'image/tiff')
            self.assertEqual(f.url, fpath)
            self.assertEqual(f.local_filename, fpath)
            self.assertTrue(exists(fpath))

    def test_workspace_add_file_basename_no_content(self):
        """A file added without content or filename has an empty URL."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.add_file('GRP', ID='ID1', mimetype='image/tiff')
            f = ws1.mets.find_files()[0]
            self.assertEqual(f.url, '')

    def test_workspace_add_file_binary_content(self):
        """Binary content is written, including into a new subdirectory."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            fpath = join(tempdir, 'subdir', 'ID1.tif')
            ws1.add_file('GRP',
                         ID='ID1',
                         content=b'CONTENT',
                         local_filename=fpath,
                         url='http://foo/bar')
            self.assertTrue(exists(fpath))

    def test_workspacec_add_file_content_wo_local_filename(self):
        """Content without ``local_filename`` is rejected."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            with self.assertRaisesRegex(
                    Exception, "'content' was set but no 'local_filename'"):
                ws1.add_file('GRP', ID='ID1', content=b'CONTENT')

    def test_workspace_str(self):
        """``str(workspace)`` of an empty, saved and reloaded workspace."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(
                str(ws1),
                'Workspace[directory=%s, file_groups=[], files=[]]' % tempdir)

    def test_workspace_backup(self):
        """Saving with ``automatic_backup`` on leaves the workspace intact."""
        with TemporaryDirectory() as tempdir:
            ws1 = self.resolver.workspace_from_nothing(directory=tempdir)
            ws1.automatic_backup = True
            ws1.save_mets()
            ws1.reload_mets()
            self.assertEqual(
                str(ws1),
                'Workspace[directory=%s, file_groups=[], files=[]]' % tempdir)

    def test_227_1(self):
        """Downloading the single METS file adds exactly one file on disk.

        The name presumably refers to an issue number -- TODO confirm.
        """
        def find_recursive(root):
            # Flat list of all file names (not paths) below ``root``.
            return [name
                    for _, _, names in os.walk(root)
                    for name in names]

        with TemporaryDirectory() as wsdir:
            # Seed the workspace directory with a METS that lists one file.
            with open(
                    assets.path_to(
                        'SBB0000F29300010000/data/mets_one_file.xml'),
                    'r') as f_in:
                with open(join(wsdir, 'mets.xml'), 'w') as f_out:
                    f_out.write(f_in.read())
            self.assertEqual(len(find_recursive(wsdir)), 1)
            ws1 = Workspace(self.resolver, wsdir)
            for file in ws1.mets.find_files():
                ws1.download_file(file)
            # mets.xml plus the one downloaded image
            self.assertEqual(len(find_recursive(wsdir)), 2)
            self.assertTrue(exists(join(wsdir, 'OCR-D-IMG/FILE_0005_IMAGE')))
Example #4
0
class TestResolver(TestCase):
    """Exercise ``Resolver``: workspace creation and ``download_to_directory``.

    Several tests reference remote ``https://`` URLs and so presumably need
    network access.
    """

    def setUp(self):
        """Give every test its own ``Resolver`` instance."""
        self.resolver = Resolver()

    def test_workspace_from_url_bad(self):
        """Omitting the METS URL must raise."""
        with self.assertRaisesRegex(Exception, "Must pass 'mets_url'"):
            self.resolver.workspace_from_url(None)

    def test_workspace_from_url_tempdir(self):
        """A workspace can be created from a remote METS without dst_dir."""
        self.resolver.workspace_from_url(
            mets_basename='foo.xml',
            mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml')

    def test_workspace_from_url_download(self):
        """A workspace can be created with ``download=True`` into dst_dir."""
        with TemporaryDirectory() as dst_dir:
            self.resolver.workspace_from_url(
                'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml',
                mets_basename='foo.xml',
                dst_dir=dst_dir,
                download=True)

    def test_workspace_from_url_no_clobber(self):
        """With ``clobber_mets=False`` an existing METS does not cause an error."""
        with TemporaryDirectory() as dst_dir:
            # Pre-populate dst_dir with a METS file from the local assets.
            src_mets = Path(assets.path_to('kant_aufklaerung_1784-binarized/data/mets.xml'))
            dst_mets = Path(dst_dir, 'mets.xml')
            dst_mets.write_text(src_mets.read_text())
            self.resolver.workspace_from_url(
                    'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml',
                    clobber_mets=False,
                    dst_dir=dst_dir)

    def test_workspace_from_url_404(self):
        """A METS URL that does not exist must raise 'HTTP request failed'."""
        with self.assertRaisesRegex(Exception, "HTTP request failed"):
            self.resolver.workspace_from_url(mets_url='https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xmlX')

    def test_workspace_from_url_rel_dir(self):
        """A relative ``dst_dir`` full of ``..`` hops resolves to the real dir."""
        with TemporaryDirectory() as dst_dir:
            # Many '..' hops followed by the absolute temp path without its
            # leading separator: a relative spelling of dst_dir.
            bogus_dst_dir = '../../../../../../../../../../../../../../../../%s'  % dst_dir[1:]
            with pushd_popd(FOLDER_KANT):
                ws1 = self.resolver.workspace_from_url('data/mets.xml', dst_dir=bogus_dst_dir)
                self.assertEqual(ws1.mets_target, pjoin(dst_dir, 'mets.xml'))
                self.assertEqual(ws1.directory, dst_dir)

    def test_workspace_from_url0(self):
        """The first OCR-D-IMG file of the Herold METS can be downloaded."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        image_file = input_files[0]
        f = workspace.download_file(image_file)
        self.assertEqual('%s.tif' % f.ID, 'FILE_0001_IMAGE.tif')
        self.assertEqual(f.local_filename, 'OCR-D-IMG/FILE_0001_IMAGE.tif')

    # pylint: disable=protected-access
    def test_resolve_image0(self):
        """An image resolves to full size and to a 1x1 crop."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        f = input_files[0]
        img_pil1 = workspace._resolve_image_as_pil(f.url)
        self.assertEqual(img_pil1.size, (2875, 3749))
        img_pil2 = workspace._resolve_image_as_pil(f.url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))
        # Same crop requested a second time; presumably exercises a cached
        # code path -- TODO confirm.  Only checks that the call succeeds.
        img_pil2 = workspace._resolve_image_as_pil(f.url, [[0, 0], [1, 1]])

    # pylint: disable=protected-access
    def test_resolve_image_grayscale(self):
        """An image from the OCR-D-IMG-NRM group resolves and crops."""
        img_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')
        workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
        # NOTE(review): this call uses the public name while the others in
        # this class use ``_resolve_image_as_pil`` -- confirm it is intended.
        img_pil1 = workspace.resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace._resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    # pylint: disable=protected-access
    def test_resolve_image_bitonal(self):
        """An image from the OCR-D-IMG-1BIT group resolves and crops."""
        img_url = assets.url_of('kant_aufklaerung_1784-binarized/data/OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png')
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        img_pil1 = workspace._resolve_image_as_pil(img_url)
        self.assertEqual(img_pil1.size, (1457, 2083))
        img_pil2 = workspace._resolve_image_as_pil(img_url, [[0, 0], [1, 1]])
        self.assertEqual(img_pil2.size, (1, 1))

    def test_workspace_from_nothing(self):
        """An empty workspace created without a directory has a METS."""
        ws1 = self.resolver.workspace_from_nothing(None)
        self.assertIsNotNone(ws1.mets)

    def test_workspace_from_nothing_makedirs(self):
        """A not-yet-existing target directory is accepted."""
        with TemporaryDirectory() as tempdir:
            non_existant_dir = Path(tempdir, 'target')
            ws1 = self.resolver.workspace_from_nothing(non_existant_dir)
            self.assertEqual(ws1.directory, non_existant_dir)

    def test_workspace_from_nothing_noclobber(self):
        """Creating a second empty workspace in the same directory fails."""
        import re  # local import: only needed to escape the path below

        with TemporaryDirectory() as tempdir:
            ws2 = self.resolver.workspace_from_nothing(tempdir)
            self.assertEqual(ws2.directory, tempdir)
            # The expected message embeds a filesystem path; escape it so
            # regex metacharacters or backslashes in the path are matched
            # literally instead of being interpreted as a pattern.
            expected = re.escape(
                "METS 'mets.xml' already exists in '%s' and clobber_mets not set" % tempdir)
            with self.assertRaisesRegex(Exception, expected):
                # must fail because the first call above already wrote a
                # mets.xml into tempdir
                self.resolver.workspace_from_nothing(tempdir)

    def test_download_to_directory_badargs_url(self):
        """A non-string ``url`` is rejected."""
        with self.assertRaisesRegex(Exception, "'url' must be a string"):
            self.resolver.download_to_directory(None, None)

    def test_download_to_directory_badargs_directory(self):
        """A non-string ``directory`` is rejected."""
        with self.assertRaisesRegex(Exception, "'directory' must be a string"):
            self.resolver.download_to_directory(None, 'foo')

    def test_download_to_directory_default(self):
        """Without basename/subdir the source file name is kept."""
        with copy_of_directory(FOLDER_KANT) as src:
            with TemporaryDirectory() as dst:
                fn = self.resolver.download_to_directory(dst, pjoin(src, 'data/mets.xml'))
                self.assertEqual(fn, 'mets.xml')
                self.assertTrue(Path(dst, fn).exists())

    def test_download_to_directory_basename(self):
        """An explicit ``basename`` becomes the returned file name."""
        with copy_of_directory(FOLDER_KANT) as src:
            with TemporaryDirectory() as dst:
                fn = self.resolver.download_to_directory(dst, pjoin(src, 'data/mets.xml'), basename='foo')
                self.assertEqual(fn, 'foo')
                self.assertTrue(Path(dst, fn).exists())

    def test_download_to_directory_subdir(self):
        """An explicit ``subdir`` is prepended to the returned path."""
        with copy_of_directory(FOLDER_KANT) as src:
            with TemporaryDirectory() as dst:
                fn = self.resolver.download_to_directory(dst, pjoin(src, 'data/mets.xml'), subdir='baz')
                self.assertEqual(fn, pjoin('baz', 'mets.xml'))
                self.assertTrue(Path(dst, fn).exists())