Example #1
    def _query(self, base_url, qargs, response_xmlclass):
        '''Utility method: Adds required query arguments, returns response
        as a caller-specified :class:`~eulxml.xmlmap.XmlObject`. Delays if
        necessary to enforce EUtils query speed policy.
        qargs = qargs.copy()
        if 'tool' not in qargs:
            qargs['tool'] = self.EUTILS_TOOL
        if 'email' not in qargs:
            qargs['email'] = self.EUTILS_EMAIL
        # TODO: When we start making more than one query we need to sleep to
        # avoid making more than 3 requests per second per E-Utilities
        # policies.
        qurl = base_url + urlencode(qargs)
        logger.debug('EntrezClient querying: ' + qurl)

        # use a url validator to examine if the qurl is a file location or a url
        # open the remote file with urllib.urlopen if it is a url
        # or open it as a file
        url_validator = URLValidator()
            target_file = urlopen(qurl)
            return xmlmap.load_xmlobject_from_file(target_file.read(), xmlclass=response_xmlclass)
        except ValidationError, e:
            return xmlmap.load_xmlobject_from_file(qurl, xmlclass=response_xmlclass)
Example #2
    def test_get_fulltext(self):
        with patch.object(self.vol, 'ocr') as mockocr:
            mockocr.exists = True
            # abbyy finereader v8
            ocr_xml = load_xmlobject_from_file(os.path.join(FIXTURE_DIR,
            mockocr.content = ocr_xml

            text = self.vol.get_fulltext()
            # check for arbitrary text content
            self.assert_('In presenting this,  the initial volume of  the' in text,
                'ocr text content should be present in plain text')
            self.assert_('Now, kind reader, we ask that you do not crit' in text,
                'ocr text content should be present in plain text')
            self.assert_(re.search(r'Baldwin\s+Dellinger\s+Brice', text),
                'table row content should be displayed on a single line')

            # abbyy finereader v6
            ocr_xml = load_xmlobject_from_file(os.path.join(FIXTURE_DIR,
            mockocr.content = ocr_xml

            text = self.vol.get_fulltext()
            # check for arbitrary text content
            self.assert_('was late in the autumn, the vines yet kept their leaves,' in text,
                'ocr text content should be present in plain text')
            self.assert_('walked up the steps. The lady had not moved, and made' in text,
                'ocr text content should be present in plain text')
            self.assert_(re.search(r'Modern\.\s+New Standard\.\s+Popular\.', text),
                'table row content should be displayed on a single line')
Example #3
 def setUp(self):
     """Load the two TEI facsimile fixtures shared by the tests."""
     # TEI generated from METS/ALTO
     alto_path = os.path.join(FIXTURE_DIR, 'teifacsimile.xml')
     self.alto_tei = load_xmlobject_from_file(alto_path, tei.Facsimile)
     # TEI generated from ABBYY OCR
     abbyy_path = os.path.join(FIXTURE_DIR, 'teifacsimile_abbyy.xml')
     self.abbyy_tei = load_xmlobject_from_file(abbyy_path, tei.Facsimile)
Example #4
File: tei.py Project: WSULib/readux
 def setUp(self):
     # tei generated from mets alto
     self.alto_tei = load_xmlobject_from_file(os.path.join(FIXTURE_DIR, 'teifacsimile.xml'),
     # tei generated from abbyy ocr
     self.abbyy_tei = load_xmlobject_from_file(os.path.join(FIXTURE_DIR, 'teifacsimile_abbyy.xml'),
Example #5
    def setUp(self):
        search_fixture_path = self.fixture_path('esearch-response-withhist.xml')
        self.search_response = xmlmap.load_xmlobject_from_file(search_fixture_path,

        fetch_fixture_path = self.fixture_path('efetch-retrieval-from-hist.xml')
        self.fetch_response = xmlmap.load_xmlobject_from_file(fetch_fixture_path,

        self.mock_client = Mock(spec=EntrezClient)
Example #6
    def setUp(self):
        search_fixture_path = self.fixture_path(
        self.search_response = xmlmap.load_xmlobject_from_file(
            search_fixture_path, xmlclass=ESearchResponse)

        fetch_fixture_path = self.fixture_path(
        self.fetch_response = xmlmap.load_xmlobject_from_file(
            fetch_fixture_path, xmlclass=EFetchResponse)

        self.mock_client = Mock(spec=EntrezClient)
Example #7
 def setUp(self):
     """Run the parent setup, then load the EFetch fixture and its first article."""
     super(HarvestRecordTest, self).setUp()
     self.fetch_response = xmlmap.load_xmlobject_from_file(
         fixture_path('efetch-retrieval-from-hist.xml'),
         xmlclass=EFetchResponse)
     # first article: one corresponding author with an emory email
     self.article = self.fetch_response.articles[0]
Example #8
    def test_consolidate_bibl(self):
        teidoc = load_xmlobject_from_file(os.path.join(FIXTURE_DIR,
        teinote = annotation_to_tei(self.zotero_note, teidoc)

        self.assertEqual(2, len(teidoc.citations),
            'annotation citations should be present in main document bibl')
        teinote = teidoc.annotations[0]
        self.assertEqual(0, len(teinote.citations),
            'citations should not be present on individual annotation')
        self.assertEqual(None, teinote.works_cited)
        self.assertEqual(None, teinote.zotero_items)
        self.assertEqual(None, teinote.works_cited_milestone)
        teinote_xml = teinote.serialize()
        self.assertFalse('<item><anchor xml:id="zotero-' in teinote_xml)
        self.assertFalse('<listBibl/>' in teinote_xml)

        # repeated zotero ids should only appear once in document bibl
        # load the same note and add it again
        teinote = annotation_to_tei(self.zotero_note, teidoc)
        self.assertEqual(2, len(teidoc.citations),
            'citations repeated in annotations should only appear once')
Example #9
    def language(self):
        wrapper around field that chooses the preferred source
         :returns: a tuple containing language code and name
        marc_languages_xml = 'http://www.loc.gov/standards/codelists/languages.xml'
        langs =  xmlmap.load_xmlobject_from_file(marc_languages_xml)

        ns = {'lang':'info:lc/xmlns/codelist-v1'}

        if self.wos and self.wos.language:
            lang = self.wos.language
        elif self.scopus and self.scopus.language:
            lang = self.scopus.language
        elif self.pubmed and self.pubmed.language:
            lang = self.pubmed.language
        elif self.crossref and self.crossref.language:
            lang = self.crossref.language
        elif self.arxiv and self.arxiv.language:
            lang = self.arxiv.language
        elif self.repec and self.repec.language:
            lang = self.repec.language
        elif self.dblp and self.dblp.language:
            lang = self.dblp.language
        else: lang = ''

        nodes = langs.node.xpath("//lang:language[lang:name='%s' or lang:code='%s']" % (lang, lang), namespaces=ns)
        if nodes:
            return (nodes[0].findtext('lang:code', namespaces=ns), nodes[0].findtext('lang:name', namespaces=ns))

            return ('', '')
Example #10
def test_can_serialize_xsd300_ds():
    """Check that ``serializers._xsd300_serializer`` returns bytes.

    Loads the ``1023796ar.xml`` fixture as an XmlObject, passes it to the
    serializer and asserts on the type of the result.
    """
    from eulxml.xmlmap import load_xmlobject_from_file

    fixture = os.path.join(FIXTURE_ROOT, '1023796ar.xml')
    with open(fixture) as f:
        xmlobject = load_xmlobject_from_file(f)
        serialized_object = serializers._xsd300_serializer(xmlobject)
    # isinstance is the idiomatic type check; unlike ``type(x) == bytes`` it
    # also accepts bytes subclasses.
    assert isinstance(serialized_object, bytes)
Example #11
 def setUp(self):
     """Load every fixture in ``self.FIXTURES`` into ``self.docs``.

     Each document is keyed by the fixture filename up to its first dot.
     """
     self.docs = {}
     for fixture_name in self.FIXTURES:
         key = fixture_name.partition('.')[0]
         fixture_file = path.join(exist_fixture_path, fixture_name)
         self.docs[key] = xmlmap.load_xmlobject_from_file(
             fixture_file, TestDocTitle)
def process(spreadsheet, xml_files_dir, sheet=1, control_row=None, force_dates=False,
        object_type='parent', input_encoding='utf8', copy_parent_to_children=False):
    '''Function to go through all the data and process it.'''
    #make sure we have a directory to put the mods files in
    os.makedirs(xml_files_dir, exist_ok=True)
    data_handler = DataHandler(spreadsheet, sheet=sheet, control_row=control_row, force_dates=force_dates,
            object_type=object_type, input_encoding=input_encoding)
    index = 1
    for record in data_handler.get_xml_records():
        filename = '%s.%s.xml' % (record.xml_id, record.record_type)
        full_path = os.path.join(xml_files_dir, filename)
        if os.path.exists(full_path):
            raise DataError('%s file already exists from previous record! Possible duplicate %s IDs?' % (filename, record.xml_id))
        if copy_parent_to_children:
            #load parent mods object if desired (& it exists)
            parent_filename = os.path.join(xml_files_dir, u'%s.%s' % (record.group_id, record.record_type))
            parent_xml = None
            if os.path.exists(parent_filename):
                parent_xml = load_xmlobject_from_file(parent_filename, mods.Mods)
                mapper = Mapper(record.record_type, record.field_data(), parent_mods=parent_xml)
            mapper = Mapper(record.record_type, record.field_data())
        xml_obj = mapper.get_xml()
        xml_bytes = xml_obj.serializeDocument(pretty=True) #serializes as UTF-8
        with open(full_path, 'wb') as f:
        index = index + 1
Example #13
    def test_can_retrieve_xml_of_existing_articles(self, mock_ds, mock_pdf):

        with open(os.path.join(FIXTURE_ROOT, '1023796ar.xml'), 'r') as f:
            from eulxml.xmlmap import load_xmlobject_from_file
            mock_pdf.content = load_xmlobject_from_file(f)
        mock_ds = ['ERUDITXSD300', ]  # noqa

        issue = IssueFactory.create(
            journal=self.journal, year=2010,
            date_published=dt.datetime.now() - dt.timedelta(days=1000))
        article = ArticleFactory.create(issue=issue)
        journal_id = self.journal.localidentifier
        issue_id = issue.localidentifier
        article_id = article.localidentifier
        url = reverse('public:journal:article_raw_xml', args=(
            journal_id, issue.volume_slug, issue_id, article_id
        request = self.factory.get(url)
        request.user = AnonymousUser()
        request.subscription = None

        # Run
        response = ArticleXmlView.as_view()(
            request, journal_code=journal_id, issue_slug=issue.volume_slug, issue_localid=issue_id,

        # Check
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/xml')
Example #14
 def setUp(self):
     super(HarvestRecordTest, self).setUp()
     article_fixture_path = fixture_path('efetch-retrieval-from-hist.xml')
     self.fetch_response = xmlmap.load_xmlobject_from_file(article_fixture_path,
     # one corresponding author with an emory email
     self.article = self.fetch_response.articles[0]
Example #15
    def test_rdf_type(self):
        # not enough information to determine type
        self.assertEqual(None, self.c1.rdf_type)
        # infer book, article, etc from title attributes
        self.assertEqual('bibo:Book', self.c3.rdf_type)
        self.assertEqual('bibo:Article', self.c4.rdf_type)

        # type inferred based on series; requires access to series, so load from fixtures
        # - bailey findingaid contains printed material, photographs, and audiovisual
        bailey = load_xmlobject_from_file(path.join(exist_fixture_path, 'bailey807.xml'),

        # patch in unittitles so it looks as though items have semantic data
        with patch('findingaids.fa.models.Series.unittitle_titles', new=[Title()]):

            # series 4 is printed material
            self.assertEqual('bibo:Document', bailey.dsc.c[3].c[0].rdf_type,
                'items in printed materials series should default to document type')

            # series 5 is photographs
            self.assertEqual('bibo:Image', bailey.dsc.c[4].c[0].rdf_type,
                'items in photograph series should default to image type')

            # series 9 is audiovisual
            self.assertEqual('bibo:AudioVisualDocument', bailey.dsc.c[8].c[0].rdf_type,
                'items in audiovisual series should default to audiovisualdocument type')

            # fallback type is manuscript
            self.assertEqual('bibo:Manuscript', bailey.dsc.c[0].c[0].rdf_type,
                'items in photograph series should default to image type')
Example #16
    def add_xml_datastream(self, xml_path, ds_id, label, control_group,
                           mimetype, checksum_type):
        """Add XML object."""
        xml_object = xmlmap.load_xmlobject_from_file(xml_path)

        if checksum_type == "SHA-512":
            checksum = self.generate_checksum(xml_path)

            checksum = None
                "Unable to generate checksum for specified type: {0}".format(

        logging.info("----adding datastream {0}: {1}".format(ds_id, label))

        new_datastream = DatastreamObject(self.obj,

        new_datastream.content = xml_object
        new_datastream.label = label
    def setUp(self):
        """Load every fixture in ``self.FIXTURES`` into ``self.issue``.

        Each issue object is keyed by the fixture filename up to its first dot.
        """
        self.issue = {}
        for fixture_name in self.FIXTURES:
            key = fixture_name.partition('.')[0]
            fixture_file = path.join(exist_fixture_path, fixture_name)
            self.issue[key] = xmlmap.load_xmlobject_from_file(
                fixture_file, TestIssue)
Example #18
def process(dataHandler, copy_parent_to_children=False):
    '''Function to go through all the data and process it.'''
    #get dicts of columns that should be mapped & where they go in MODS
    index = 1
    for record in dataHandler.get_mods_records():
        filename = record.mods_filename
        if os.path.exists(os.path.join(MODS_DIR, filename)):
            raise Exception('%s already exists!' % filename)
        logger.info('Processing row %d to %s.' % (index, filename))
        if copy_parent_to_children:
            #load parent mods object if desired (& it exists)
            parent_filename = os.path.join(MODS_DIR, record.parent_mods_filename)
            parent_mods = None
            if os.path.exists(parent_filename):
                parent_mods = load_xmlobject_from_file(parent_filename, mods.Mods)
                mapper = Mapper(parent_mods=parent_mods)
            mapper = Mapper()
        for field in record.field_data():
            mapper.add_data(field['mods_path'], field['data'])
        mods_obj = mapper.get_mods()
        mods_data = unicode(mods_obj.serializeDocument(pretty=True), 'utf-8')
        with codecs.open(os.path.join(MODS_DIR, filename), 'w', 'utf-8') as f:
        index = index + 1
 def setUp(self):
     # load the fixture file as a generic tei document
     self.tei = xmlmap.load_xmlobject_from_file(self.simmons_xml,
     # find the first groupsheet via xpath and load
     groups = self.tei.node.xpath('//t:text/t:group/t:group',
                                  namespaces={'t': teimap.TEI_NAMESPACE})
     self.groupsheet = TeiGroupSheet(groups[0])
def lsdibag():
    """Create and return a LsdiBaggee object to use in tests.

    The baggee is built from the first item of the DigWF ``getitems``
    fixture, after its MARC path is pointed at the local MARC XML fixture.
    """
    fixture_file = os.path.join(FIXTURE_DIR, 'digwf_getitems_3031.xml')
    items_response = load_xmlobject_from_file(fixture_file, digwf.Items)
    # swap in the local fixture for the marc xml
    first_item = items_response.items[0]
    first_item.marc_path = os.path.join(FIXTURE_DIR, 'ocm08951025_MRC.xml')
    # the item is looked up on the response again, as before
    return LsdiBaggee(items_response.items[0])
Example #21
 def set_attr_xml_content(self, attr, path):
     """Add xml content to datastream.

     Reads content from ``path`` and hands it to ``self.set_attr``.

     :param attr: attribute name; only the value ``"dc"`` triggers any
         ``set_attr`` call in the code below
     :param path: filesystem path of the XML file to load
     """
     # NOTE(review): the XmlObject parsed here is immediately replaced by the
     # raw file handle on the next line, so only the open file is passed on.
     # This looks like a try/except fallback that lost its structure -- confirm.
     xml_object = xmlmap.load_xmlobject_from_file(path)
     # NOTE(review): this file handle is not closed anywhere in this method.
     xml_object = open(path)
     if attr == "dc":
         # NOTE(review): both calls run when attr == "dc" and nothing runs
         # otherwise; the second call may have belonged to an else branch --
         # confirm against the intended behavior.
         self.set_attr(attr + ".content", xml_object, sub_attr="dc")
         self.set_attr(attr, xml_object, sub_attr="content")
Example #22
def xml(request):
  "Display xml of a single issue."
    doc = xmlmap.load_xmlobject_from_file(filename=os.path.join(settings.BASE_DIR, 'static', 'xml', 'luther_text.xml'))
    raise Http404
  tei_xml = doc.serializeDocument(pretty=True)
  return HttpResponse(tei_xml, mimetype='application/xml')  
def test_no_content():
    """``no_content`` is false for the fixture and true once it is emptied."""
    doc = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    # the fixture file has text content
    assert not doc.no_content()

    # with lines and labels removed, nothing is left
    for field_name in ("lines", "labels"):
        setattr(doc, field_name, [])
    assert doc.no_content()
Example #24
    def setUp(self):
        """Prepare a volume, annotated TEI fixture, temp dir and exporter."""
        # a real volume object, with a Mock standing in for the api
        volume = Volume(Mock())
        volume.pid = 'testvol:123'
        self.vol = volume

        tei_fixture = os.path.join(FIXTURE_DIR, 'teifacsimile.xml')
        self.tei = load_xmlobject_from_file(tei_fixture, AnnotatedFacsimile)

        self.tmpdir = tempfile.mkdtemp(prefix='rdx-export-test')
        # defaults are used for page one, callback, and images for now
        self.exporter = VolumeExport(self.vol, self.tei)
def test_fields():
    """Spot-check the mapped fields of the GenizaTei fixture."""
    doc = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    assert doc.pgpid == 968
    # text, lines, and labels should all be populated
    for field_name in ("text", "lines", "labels"):
        assert getattr(doc, field_name)
    assert len(doc.labels) == 4
    assert doc.source_authors == ["Gil"]
Example #26
    def from_file(cls, file_path, validate=True):
        """ Creates a Python object from a XML file

        :param file_path: Path to the XML file
        :param validate: XML should be validated against the embedded XSD definition
        :type validate: Boolean
        :returns: the Python object
        return xmlmap.load_xmlobject_from_file(file_path, xmlclass=cls, validate=validate)
Example #27
    def setUp(self):
        self.vol = Volume(Mock())   # use a real volume, but Mock for api
        self.vol.pid = 'testvol:123'
        self.tei = load_xmlobject_from_file(os.path.join(FIXTURE_DIR,

        self.tmpdir = tempfile.mkdtemp(prefix='rdx-export-test')
        # for now, use defaults for page one, callback, images
        self.exporter = VolumeExport(self.vol, self.tei)
Example #28
def update_999a(path, kdip_id, enumcron):
    Method to update the 999a MARC field if/when it is changed
    in the database.
    marc_file = '%s/%s/marc.xml' %(path, kdip_id)
    marc = load_xmlobject_from_file(marc_file, models.Marc)
    marc.tag_999a = enumcron
    with open(marc_file, 'w') as marcxml:
Example #29
    def test_annotation_to_tei(self):
        teidoc = load_xmlobject_from_file(os.path.join(FIXTURE_DIR, 'teifacsimile.xml'),

        note = Annotation(text="Here's the thing", quote="really",
            extra_data=json.dumps({'sample data': 'foobar',
                'tags': ['test', 'one', 'two']}))

        teinote = annotation_to_tei(note, teidoc)
        self.assert_(isinstance(teinote, tei.Note))
        self.assertEqual('annotation-%s' % note.id, teinote.id)
        self.assertEqual(note.text, teinote.paragraphs[0])

        # todo: add a schema validation once we get the output to be valid
        # teidoc.schema_valid()
        # access errors with teidoc.schema_validation_errors()

        # annotation user should be set as note response
        user = get_user_model()(username='******')
        note.user = user
        teinote = annotation_to_tei(note, teidoc)
        self.assertEqual(user.username, teinote.resp)

        # tags should be set as interp ids ana attribute
        for tag in note.info()['tags']:
            self.assert_('#%s' % tag in teinote.ana)

        # test that markdown formatting is coming through
        footnote = '''Footnotes[^1] have a label and content.

[^1]: This is some footnote content.'''
        note.text = footnote
        teinote = annotation_to_tei(note, teidoc)
        self.assert_('<ref target="#fn1" type="noteAnchor">1</ref>' in

        # markdown should be included in a code element
        self.assertEqual(note.text, teinote.markdown)

        # related page references
        rel_pages = [
        note.extra_data = json.dumps({'related_pages': rel_pages})
        teinote = annotation_to_tei(note, teidoc)
        self.assertEqual(len(rel_pages), len(teinote.related_pages))
        # first ark has a corresponding id in the fixture, should be converted
        self.assertEqual('#%s' % teidoc.page_id_by_xlink(rel_pages[0]),
        for idx in range(len(rel_pages)):
            self.assertEqual(rel_pages[idx], teinote.related_pages[idx].text)
Example #30
    def test_ocr_ids(self):
        # patch in fixture ocr content
        with patch.object(self.vol, 'ocr') as mockocr:
            mockocr.exists = True
            ocr_xml = load_xmlobject_from_file(os.path.join(FIXTURE_DIR,
            mockocr.content = ocr_xml

Example #31
    def test_ocr_ids(self):
        """Patch the volume's ``ocr`` with the ABBYY OCR fixture content."""
        fixture_file = os.path.join(FIXTURE_DIR, 'abbyyocr_fr8v2.xml')
        # patch in fixture ocr content
        with patch.object(self.vol, 'ocr') as mockocr:
            mockocr.exists = True
            mockocr.content = load_xmlobject_from_file(fixture_file)

Example #32
    def from_file(cls, file_path, validate=True):
        """ Creates a Python object from a XML file

        :param file_path: Path to the XML file
        :param validate: XML should be validated against the embedded XSD definition
        :type validate: Boolean
        :returns: the Python object
        return xmlmap.load_xmlobject_from_file(file_path,
def test_text_to_plaintext_longlines():
    """One very long line must not widen the other plain-text lines."""
    doc = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    # Overwrite the last line with an excessively long one. The xmlobject
    # isn't configured with updates in mind, so the lxml node text is set
    # directly.
    final_line = doc.lines[-1]
    final_line.node.text = "superlongline" * 100
    second_line = doc.text_to_plaintext().split("\n")[1]
    # slightly over 100 is fine (ltr/rtl marks and line number), but the
    # line should NOT be padded out to match the superlongline
    assert len(second_line) < 110
def test_text_to_plaintext():
    """Convert the fixture transcription to plain text and spot-check it."""
    doc = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    output = doc.text_to_plaintext()
    assert output.count("\n") == 43
    # section breaks show up as double newlines
    assert output.count("\n\n") == 4
    # label and line text is included
    for expected_text in ("Right Margin", "מא", "الحسن بن ابرهيم"):
        assert expected_text in output
    # line numbers and ltr/rtl marks are included
    numbered_line = (
        "\u200f        כתאבי אטאל אללה בקא מולי אלשיך ואדאם \u200e   1\n")
    assert numbered_line in output
Example #35
    def test_items_xml(self):
        """Inspect the digwf.Items xml mapping on a sample and an empty result."""
        populated = load_xmlobject_from_file(self.item_response, digwf.Items)
        assert populated.count == 1
        assert len(populated.items) == 1
        assert isinstance(populated.items[0], digwf.Item)

        # expected field values for the single item, checked in this order
        expected_fields = [
            ('pid', '7svgb'),
            ('item_id', '3031'),
            ('control_key', 'ocm08951025'),
            ('display_image_path', '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output'),
            ('display_image_count', 2218),
            ('ocr_file_path', '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output'),
            ('ocr_file_count', 2218),
            ('pdf', '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output/Output.pdf'),
            ('marc_path', '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/ocm08951025_MRC.xml'),
            ('ocr_file', '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output/Output.xml'),
            ('collection_id', 10),
            ('collection_name', 'Atlanta City Directories'),
        ]
        item = populated.items[0]
        for field_name, expected_value in expected_fields:
            assert getattr(item, field_name) == expected_value

        # the empty response fixture reports a count of zero
        empty = load_xmlobject_from_file(self.empty_response, digwf.Items)
        assert empty.count == 0
Example #36
    def test_annotation_citation_to_tei(self):
        teidoc = load_xmlobject_from_file(os.path.join(FIXTURE_DIR, 'teifacsimile.xml'),

        teinote = annotation_to_tei(self.zotero_note, teidoc)
        # print teinote.serialize(pretty=True)
        # number of citations should match
        # minimal inspection to check that values carried through as expected
        self.assertEqual('webpage', teinote.citations[0].type)
        self.assertEqual('journalArticle', teinote.citations[1].type)

        self.assertEqual('zotero-7CBCH6E8', teinote.citations[0].id)
        self.assertEqual('zotero-MUXAEE89', teinote.citations[1].id)
Example #37
    def test_get_fulltext(self):
        with patch.object(self.vol, 'ocr') as mockocr:
            mockocr.exists = True
            # abbyy finereader v8
            ocr_xml = load_xmlobject_from_file(
                os.path.join(FIXTURE_DIR, 'abbyyocr_fr8v2.xml'))
            mockocr.content = ocr_xml

            text = self.vol.get_fulltext()
            # check for arbitrary text content
                'In presenting this,  the initial volume of  the' in text,
                'ocr text content should be present in plain text')
                'Now, kind reader, we ask that you do not crit' in text,
                'ocr text content should be present in plain text')
                re.search(r'Baldwin\s+Dellinger\s+Brice', text),
                'table row content should be displayed on a single line')

            # abbyy finereader v6
            ocr_xml = load_xmlobject_from_file(
                os.path.join(FIXTURE_DIR, 'abbyyocr_fr6v1.xml'))
            mockocr.content = ocr_xml

            text = self.vol.get_fulltext()
            # check for arbitrary text content
                'was late in the autumn, the vines yet kept their leaves,'
                in text, 'ocr text content should be present in plain text')
                'walked up the steps. The lady had not moved, and made'
                in text, 'ocr text content should be present in plain text')
                re.search(r'Modern\.\s+New Standard\.\s+Popular\.', text),
                'table row content should be displayed on a single line')
Example #38
        def mock_load(url, xmlclass):
            '''mock-like method wrapping load_xmlobject_from_file without
            actually making a network query, but still calling the requested
            xmlclass constructor.
            # figure out what fixture to return
            fixture = (mock_load.return_fixtures[mock_load.call_count]
                       if mock_load.call_count < len(mock_load.return_fixtures)
                       else mock_load.return_fixtures[-1])

            mock_load.call_count += 1
            test_response_path = fixture_path(fixture)
            test_response_obj = xmlmap.load_xmlobject_from_file(
                test_response_path, xmlclass=xmlclass)
            return test_response_obj
Example #39
        def mock_load(url, xmlclass):
            '''mock-like method wrapping load_xmlobject_from_file without
            actually making a network query, but still calling the requested
            xmlclass constructor.
            # figure out what fixture to return
            fixture = (mock_load.return_fixtures[mock_load.call_count]
                       if mock_load.call_count < len(mock_load.return_fixtures)
                       else mock_load.return_fixtures[-1])

            mock_load.call_count += 1
            test_response_path = fixture_path(fixture)
            test_response_obj = xmlmap.load_xmlobject_from_file(test_response_path,
            return test_response_obj
Example #40
    def init_xml_object(self):
        '''Initialize an xmlobject based on user-specified arguments
        for filename and type.  Returns an instance of the
        appropriate :class:`~eulxml.xmlmap.XmlObject`, or displays
        an error message if the document could not be parsed as XML.'''

        if self.args.input == 'ead':
            xmlobj_class = EAD
        elif self.args.input == 'tei':
            xmlobj_class = Tei

            return load_xmlobject_from_file(self.args.filename, xmlobj_class)
        except Exception as err:
            print 'Error loading %s as XML: %s' % (self.args.filename, err)
    def init_xml_object(self):
        '''Initialize an xmlobject based on user-specified arguments
        for filename and type.  Returns an instance of the
        appropriate :class:`~eulxml.xmlmap.XmlObject`, or displays
        an error message if the document could not be parsed as XML.'''

        if self.args.input == 'ead':
            xmlobj_class = EAD
        elif self.args.input == 'tei':
            xmlobj_class = Tei

            return load_xmlobject_from_file(self.args.filename, xmlobj_class)
        except Exception as err:
            print 'Error loading %s as XML: %s' % (self.args.filename, err)
def test_html():
    """Render the fixture transcription as HTML and spot-check the markup."""
    doc = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    rendered = doc.text_to_html()
    # three sections are expected
    assert rendered.count("<section>") == 3
    assert "<h1>Right Margin</h1>" in rendered
    assert "<li value='1'>מא</li>" in rendered
    # three separate lines carry the number 1
    assert rendered.count("<li value='1'>") == 3

    # the last line of the last block must be present
    assert "<li value='6'>الحسن بن ابرهيم</li>" in rendered

    # a missing line number must not be rendered as "None" or as empty
    for bad_markup in ("<li value='None'>", "<li value=''>"):
        assert bad_markup not in rendered
Example #43
    def test_page_index_data(self, mockzipfile):
        mockzip_obj = mockzipfile.return_value.__enter__.return_value
        page_files = ['0001.txt', '00002.txt']
        mockzip_obj.namelist.return_value = page_files
        # simulate reading zip file contents
        contents = ('page content for one', 'hello! pshaw! what?')
        mockzip_obj.open.return_value.__enter__.return_value \
            .read.return_value.decode.side_effect = contents

        work = DigitizedWork(source_id='chi.79279237')

        # page data comes from mets
        mets = load_xmlobject_from_file(self.metsfile, hathi.MinimalMETS)
        with patch.object(DigitizedWork, 'hathi') as mock_hathiobj:
            mock_hathiobj.zipfile_path.return_value = '/path/to/79279237.zip'
            mock_hathiobj.metsfile_path.return_value = self.metsfile
            mock_hathiobj.content_dir = 'data'

            page_data = work.page_index_data()
            assert isinstance(page_data, types.GeneratorType)

            for i, data in enumerate(page_data):
                mets_page = mets.structmap_pages[i]
                assert data['id'] == '.'.join([work.source_id, mets_page.text_file.sequence])
                assert data['source_id'] == work.source_id
                assert data['content'] == contents[i]
                assert data['order'] == mets_page.order
                assert data['item_type'] == 'page'
                assert data['label'] == mets_page.display_label
                assert 'tags' in data
                assert data['tags'] == mets_page.label.split(', ')

            # not suppressed by no data
            mock_hathiobj.metsfile_path.side_effect = \
            # should log an error, not currently tested
            assert not list(work.page_index_data())

        # if item is suppressed - no page data
        work.status = DigitizedWork.SUPPRESSED
        assert not list(work.page_index_data())

        # non hathi item - no page data
        nonhathi_work = DigitizedWork(source=DigitizedWork.OTHER)
        assert not list(nonhathi_work.page_index_data())
Example #44
# Walks every record produced by a DataHandler built from `spreadsheet` and
# prepares one XML file path per record inside `xml_files_dir`.
# NOTE(review): this copy of the function has lost several lines (the rest of
# the signature, the tails of multi-line calls, an `else:` and the body of the
# final `with`); it does not parse as-is and must be restored from upstream.
def process(spreadsheet,
    # NOTE(review): remaining parameters are missing here; the body reads
    # `xml_files_dir` and `copy_parent_to_children`, presumably parameters.
    '''Function to go through all the data and process it.'''
    #make sure we have a directory to put the mods files in
    os.makedirs(xml_files_dir, exist_ok=True)
    data_handler = DataHandler(spreadsheet,
    # NOTE(review): the DataHandler(...) argument list above is cut off.
    index = 1
    for record in data_handler.get_xml_records():
        # output name is "<xml_id>.<record_type>.xml" inside xml_files_dir
        filename = '%s.%s.xml' % (record.xml_id, record.record_type)
        full_path = os.path.join(xml_files_dir, filename)
        # refuse to overwrite: an existing file is treated as a duplicate ID
        if os.path.exists(full_path):
            raise DataError(
                '%s file already exists from previous record! Possible duplicate %s IDs?'
                % (filename, record.xml_id))
        if copy_parent_to_children:
            #load parent mods object if desired (& it exists)
            # NOTE(review): os.path.join() is given a single argument here, so
            # no directory is prepended -- a leading argument may be missing.
            parent_filename = os.path.join(
                u'%s.%s' % (record.group_id, record.record_type))
            parent_xml = None
            if os.path.exists(parent_filename):
                parent_xml = load_xmlobject_from_file(parent_filename,
                # NOTE(review): both calls around this point are truncated, and
                # the `else:` that should introduce the next line is missing.
                mapper = Mapper(record.record_type,
            mapper = Mapper(record.record_type, record.field_data())
        xml_obj = mapper.get_xml()
        xml_bytes = xml_obj.serializeDocument(
            pretty=True)  #serializes as UTF-8
        # NOTE(review): the body of this `with` (presumably writing xml_bytes
        # to the file) is missing from this copy.
        with open(full_path, 'wb') as f:
        index = index + 1
Example #45
    # For each pid in options['pid'], fetches the Volume, obtains TEI (loaded
    # from the file named by options['tei'] or generated from the volume),
    # exports a website zipfile and copies it to "<noid>-annotated-site.zip"
    # in the current directory.
    # NOTE(review): this copy has lost lines (an `else:`, a `try:` and the tail
    # of the annotated_tei(...) call); it does not parse as-is.
    def handle(self, *args, **options):
        repo = Repository()
        for pid in options['pid']:
            vol = repo.get_object(pid, type=Volume)
            if options['tei']:
                tei = load_xmlobject_from_file(options['tei'], Facsimile)
                # NOTE(review): an `else:` presumably belongs here; as written
                # the loaded TEI would be overwritten immediately -- confirm.
                tei = annotate.annotated_tei(vol.generate_volume_tei(),
                # NOTE(review): the call above is cut off, and the `try:` that
                # pairs with the `except` below is missing.
                zipfile = export.website(vol, tei)
            except export.ExportException as err:
                # surface export failures as a management-command error
                raise CommandError(err)

            zipfilename = '%s-annotated-site.zip' % vol.noid
            # export.website returns an object with a .name path; copy it out
            shutil.copyfile(zipfile.name, zipfilename)

            print 'Export for %s complete, zipfile is %s' % (vol.noid, zipfilename)
Example #46
 def _query(self, base_url, qargs, response_xmlclass):
     '''Utility method: Adds required query arguments, returns response
     as a caller-specified :class:`~eulxml.xmlmap.XmlObject`.

     No delay is applied yet to enforce the EUtils query speed policy
     (see the TODO below).

     :param base_url: URL prefix; the url-encoded query string is
         appended to it directly
     :param qargs: dict of query arguments; it is copied, so the
         caller's dict is never modified
     :param response_xmlclass: class passed as ``xmlclass`` to
         :func:`xmlmap.load_xmlobject_from_file`
     :returns: whatever :func:`xmlmap.load_xmlobject_from_file` returns
         for the constructed URL
     '''
     # work on a copy so defaults are not injected into the caller's dict
     qargs = qargs.copy()
     if 'tool' not in qargs:
         qargs['tool'] = self.EUTILS_TOOL
     if 'email' not in qargs:
         qargs['email'] = self.EUTILS_EMAIL
     # TODO: When we start making more than one query we need to sleep to
     # avoid making more than 3 requests per second per E-Utilities
     # policies.
     qurl = base_url + urlencode(qargs)
     logger.debug('EntrezClient querying: %s', qurl)
     return xmlmap.load_xmlobject_from_file(qurl,
                                            xmlclass=response_xmlclass)
Example #47
 def _query(self, base_url, qargs, response_xmlclass):
     '''Utility method: Adds required query arguments, returns response
     as a caller-specified :class:`~eulxml.xmlmap.XmlObject`.

     No delay is applied yet to enforce the EUtils query speed policy
     (see the TODO below).

     :param base_url: URL prefix; the url-encoded query string is
         appended to it directly
     :param qargs: dict of query arguments; it is copied, so the
         caller's dict is never modified
     :param response_xmlclass: class passed as ``xmlclass`` to
         :func:`xmlmap.load_xmlobject_from_file`
     :returns: whatever :func:`xmlmap.load_xmlobject_from_file` returns
         for the constructed URL
     '''
     # work on a copy so defaults are not injected into the caller's dict
     qargs = qargs.copy()
     if 'tool' not in qargs:
         qargs['tool'] = self.EUTILS_TOOL
     if 'email' not in qargs:
         qargs['email'] = self.EUTILS_EMAIL
     # TODO: When we start making more than one query we need to sleep to
     # avoid making more than 3 requests per second per E-Utilities
     # policies.
     qurl = base_url + urlencode(qargs)
     logger.debug('EntrezClient querying: %s', qurl)
     return xmlmap.load_xmlobject_from_file(qurl,
                                            xmlclass=response_xmlclass)
Example #48
    # For each pid in options['pid'], fetches the Volume, obtains TEI (loaded
    # from the file named by options['tei'] or generated from the volume),
    # exports a website zipfile and copies it to "<noid>-annotated-site.zip"
    # in the current directory.
    # NOTE(review): this copy has lost lines (an `else:`, a `try:` and the
    # tails of two multi-line statements); it does not parse as-is.
    def handle(self, *args, **options):
        repo = Repository()
        for pid in options['pid']:
            vol = repo.get_object(pid, type=Volume)
            if options['tei']:
                tei = load_xmlobject_from_file(options['tei'], Facsimile)
                # NOTE(review): an `else:` presumably belongs here; as written
                # the loaded TEI would be overwritten immediately -- confirm.
                tei = annotate.annotated_tei(vol.generate_volume_tei(),
                # NOTE(review): the call above is cut off, and the `try:` that
                # pairs with the `except` below is missing.
                zipfile = export.website(vol, tei)
            except export.ExportException as err:
                # surface export failures as a management-command error
                raise CommandError(err)

            zipfilename = '%s-annotated-site.zip' % vol.noid
            # export.website returns an object with a .name path; copy it out
            shutil.copyfile(zipfile.name, zipfilename)

            # NOTE(review): the tuple on the next line is cut off mid-statement.
            print 'Export for %s complete, zipfile is %s' % (vol.noid,
Example #49
def create_ht_marc(kdip):
    '''Write the MARC XML for a KDip to ``<KDIP_DIR>/<barcode>/marc.xml``
    and return it loaded as a :class:`models.Marc`.

    :param kdip: either a barcode string, or an object exposing the
        barcode as its ``kdip_id`` attribute
    :returns: result of ``load_xmlobject_from_file(marc_file, models.Marc)``
    '''
    # Accept a bare barcode or a KDip-like object. Without the ``else`` the
    # string case fell through to ``kdip.kdip_id`` and raised AttributeError.
    if isinstance(kdip, basestring):
        barcode = kdip
    else:
        barcode = kdip.kdip_id

    record = load_bib_record(barcode)
    remove_most_999_fields(record, barcode)

    marc_file = '%s/%s/marc.xml' % (settings.KDIP_DIR, barcode)

    # Write the marc.xml to disk.
    with open(marc_file, 'w') as marcxml:
        # When the 035 field is inserted in position, an empty datafield is
        # inserted at the bottom, so we get rid of that.
        # Raw string: '<', '/' and '>' need no escaping in a regex.
        marcxml.write(re.sub(r'<datafield/>\n', '', record.serialize(pretty=True)))

    return load_xmlobject_from_file(marc_file, models.Marc)
Example #50
    def as_publication_article(self, repo=None):
        '''Initialize (but do not save) a new
        :class:`~openemory.publication.models.Article` instance
        based on harvested record information and Article XML.

        :param repo: optional; pass in an existing
           :class:`eulfedora.server.Repository` object initialized
           with the desired credentials

        :returns: unsaved :class:`~openemory.publication.models.Article`
        '''
        if repo is None:
            repo = Repository()
        article = repo.get_object(type=Article)
        # using comma-delimited usernames to indicate object has multiple owners
        # should work with existing XACML owner policy;
        # for more detail, see https://jira.duraspace.org/browse/FCREPO-82
        article.owner = ', '.join(auth.username for auth in self.authors.all())
        # VERY preliminary, minimal metadata mapping
        article.label = self.title
        article.dc.content.title = self.title
        # FIXME(review): two statements were lost from this copy; only their
        # continuation lines survive (kept verbatim below). They presumably
        # filled further article.dc.content fields from the authors and the
        # PMC id -- restore them from upstream before relying on the DC output.
        #     ... for auth in self.authors.all()])
        #     ... 'PMC%d' % self.pmcid])

        # set the XML article content as the contentMetadata datastream
        # - record content is a file field with a read method, which should be
        #   handled correctly by eulfedora for ingest
        if hasattr(self.content, 'read'):
            article.contentMetadata.content = load_xmlobject_from_file(self.content, NlmArticle)

        # derive descriptive metadata only when article XML is actually set
        if article.contentMetadata.content:
            article.descMetadata.content = article.contentMetadata.content.as_article_mods()

        # FIXME: datastream checksum!
        # TODO: format uri for this datastream ?

        return article
Example #51
    def as_publication_article(self, repo=None):
        '''Initialize (but do not save) a new
        :class:`~openemory.publication.models.Article` instance
        based on harvested record information and Article XML.

        :param repo: optional; pass in an existing
           :class:`eulfedora.server.Repository` object initialized
           with the desired credentials

        :returns: unsaved :class:`~openemory.publication.models.Article`
        '''
        if repo is None:
            repo = Repository()
        article = repo.get_object(type=Article)
        # using comma-delimited usernames to indicate object has multiple owners
        # should work with existing XACML owner policy;
        # for more detail, see https://jira.duraspace.org/browse/FCREPO-82
        article.owner = ', '.join(auth.username for auth in self.authors.all())
        # VERY preliminary, minimal metadata mapping
        article.label = self.title
        article.dc.content.title = self.title
        # FIXME(review): two statements were lost from this copy; only their
        # continuation lines survive (kept verbatim below). They presumably
        # filled further article.dc.content fields from the authors and the
        # PMC id -- restore them from upstream before relying on the DC output.
        #     ... for auth in self.authors.all()])
        #     ... 'PMC%d' % self.pmcid])

        # set the XML article content as the contentMetadata datastream
        # - record content is a file field with a read method, which should be
        #   handled correctly by eulfedora for ingest
        if hasattr(self.content, 'read'):
            article.contentMetadata.content = load_xmlobject_from_file(self.content, NlmArticle)

        # derive descriptive metadata only when article XML is actually set
        if article.contentMetadata.content:
            article.descMetadata.content = article.contentMetadata.content.as_article_mods()

        # FIXME: datastream checksum!
        # TODO: format uri for this datastream ?

        return article
Example #52
    # NOTE(review): this copy has lost lines (the body of the first `if`, a
    # `try:`, the tail of the METS load call, an inner `try:`, the closing
    # brace of the yielded dict and the `except StopIteration` body); it does
    # not parse as-is and must be restored from upstream.
    def page_index_data(self):
        '''Get page content for this work from Hathi pairtree and return
        data to be indexed in solr.'''

        # If an item has been suppressed or is from a source other than
        # hathi, bail out. No pages to index.
        # NOTE(review): the statement that bails out is missing below.
        if self.is_suppressed or self.source != self.HATHI:

        # load mets record to pull metadata about the images
        # NOTE(review): the `try:` for the `except` below is missing and the
        # load call is cut off after its first argument.
            mmets = load_xmlobject_from_file(self.hathi.metsfile_path(),
        except storage_exceptions.ObjectNotFoundException:
            # log with lazy %-style arguments; nothing follows in this copy,
            # presumably an early exit belonged here -- confirm
            logger.error('Pairtree data for %s not found but status is %s',
                         self.source_id, self.get_status_display())

        # read zipfile contents in place, without unzipping
        with ZipFile(self.hathi.zipfile_path()) as ht_zip:

            # yield a generator of index data for each page; iterate
            # over pages in METS structmap
            for page in mmets.structmap_pages:
                # zipfile spec uses / for path regardless of OS
                pagefilename = '/'.join([self.hathi.content_dir, page.text_file_location])
                with ht_zip.open(pagefilename) as pagefile:
                        # one dict per page; id is "<source_id>.<sequence>",
                        # tags come from the comma-separated page label
                        yield {
                            'id': '%s.%s' % (self.source_id, page.text_file.sequence),
                            'source_id': self.source_id,   # for grouping with work record
                            'content': pagefile.read().decode('utf-8'),
                            'order': page.order,
                            'label': page.display_label,
                            'tags': page.label.split(', ') if page.label else [],
                            'item_type': 'page'
                    except StopIteration:
Example #53
    # Exercises the check_ht / Utils MARC helpers against three KDip rows:
    # an 856 field is added, 999 fields are removed, 583 is updated and a
    # 590 note is added.
    # NOTE(review): the contents of `test_xml` and the body of the
    # `for tag856` loop are missing from this copy; it does not parse as-is.
    def test_check_ht(self):
        test_xml = [

        job = Job(pk=1)
        # `note` holds each fixture's position so it can be looked up by index
        kdip0 = KDip.objects.create(kdip_id='10002350302', oclc="12345", note='0', pid='r8d9b', create_date = '2015-12-30 15:43:17', job_id=1)
        kdip1 = KDip.objects.create(kdip_id='10002350304', oclc="12345", note='1', pid='r8d9y', create_date = '2015-12-30 15:43:17', job_id=1)
        kdip2 = KDip.objects.create(kdip_id='10002350306', oclc="67890", note='2', pid='r8d9s', create_date = '2015-12-30 15:43:17', job_id=1)
        text590 = "The online edition of this book in the public domain, i.e., not protected by copyright, has been produced by the Emory University Digital library Publications Program."

        for xml in test_xml:
            # pair each fixture with the KDip whose note equals its list index
            index = test_xml.index(xml)
            kdip = KDip.objects.get(note=index)
            marc = load_xmlobject_from_file(xml, AlmaBibRecord)
            marc = check_ht.add_856(marc, kdip)
            marc = Utils.remove_all_999_fields(marc)
            marc = Utils.update_583(marc)

            # NOTE(review): the expected string spells the last element
            # "subfiled"; confirm this matches what add_856 actually emits.
            text_856 = '<datafield tag="856" ind1="4" ind2="1"><subfield code="3">%s</subfield><subfield code="u">http://pid.emory.edu/ark:/25593/%s/HT</subfield><subfiled code="y">HathiTrust version</subfiled></datafield>' % (index, kdip.pid)
            field856s = []
            # NOTE(review): loop body missing; presumably it collected each
            # 856 field's serialization into field856s -- confirm.
            for tag856 in marc.field856:

            self.assertIn(text_856, field856s)

            self.assertEqual(len(marc.field999), 0)

            # NOTE(review): assertNotIn(member, container) checks that the whole
            # serialized record is not a substring of text590; the arguments
            # look swapped -- confirm the intended direction.
            self.assertNotIn(marc.serialize().lower(), text590.lower())

            marc = check_ht.add_590(marc)

            self.assertEqual(marc.field590, text590)

            self.assertEqual(marc.tag583a, 'digitized')
Example #54
    # Requests the raw-XML view for an article as an anonymous user and
    # expects a 200 response with an application/xml content type.
    # NOTE(review): several statements in this copy are cut off (the mock_ds
    # list contents, the IssueFactory/reverse/as_view call tails); it does
    # not parse as-is.
    def test_can_retrieve_xml_of_existing_articles(self, mock_ds, mock_pdf):

        with open(os.path.join(FIXTURE_ROOT, '1023796ar.xml'), 'r') as f:
            from eulxml.xmlmap import load_xmlobject_from_file
            # parse the fixture and expose it as the mocked object's content
            mock_pdf.content = load_xmlobject_from_file(f)
        # rebinding the parameter; the list's contents are missing in this copy
        mock_ds = [
        ]  # noqa

        # NOTE(review): the date_published expression below is cut off.
        issue = IssueFactory.create(journal=self.journal,
                                    date_published=dt.datetime.now() -
        article = ArticleFactory.create(issue=issue)
        journal_id = self.journal.localidentifier
        issue_id = issue.localidentifier
        article_id = article.localidentifier
        # NOTE(review): the args tuple below is cut off.
        url = reverse('public:journal:article_raw_xml',
                      args=(journal_id, issue.volume_slug, issue_id,
        request = self.factory.get(url)
        # anonymous user with no subscription attached to the request
        request.user = AnonymousUser()
        request.subscription = None

        # Run
        # NOTE(review): the view call's remaining arguments are cut off.
        response = ArticleXmlView.as_view()(request,

        # Check
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/xml')
Example #55
    # Imports a single BagIt directory as a Book, a Volume (v1.1) and one
    # Page (v1.1) per jp2/jpf image. Steps, in order: validate options and
    # collection, load the bag, locate the pdf / marc xml / dc xml payload
    # files and their MD5s, find or ingest the book, find or ingest the
    # volume, then ingest pages and set the first page as the volume's
    # primary image. Raises CommandError on any unrecoverable problem; when
    # `dry_run` is set nothing is saved to the repository.
    # NOTE(review): this copy has lost many lines (`try:` / `else:` headers
    # and a few statements); each gap found is marked below. It does not
    # parse as-is and must be restored from upstream.
    def handle(self, *paths, **options):

        if not len(paths):
            raise CommandError('Please specify path to content for import.')
        if len(paths) > 1:
            # this limitation is kind of arbitrary, but keep thing simple for now
            raise CommandError('Import currently only supports a single volume.')
        path = paths[0]

        dry_run = options.get('dry_run', False)
        verbosity = options.get('verbosity', self.v_normal)

        repo = ManagementRepository()

        # make collection required to avoid accidentally forgetting it
        coll = options.get('collection', None)
        if coll is None:
            raise CommandError('Please specify collection pid')

        collection = repo.get_object(coll, type=Collection)
        if not collection.exists:
            raise CommandError('Collection %s does not exist' % coll)
        if not collection.has_requisite_content_models:
            raise CommandError('%s is not a collection' % coll)

        # NOTE(review): the `try:` for the two `except bagit...` clauses below
        # is missing here.
            start = time.time()
            bag = bagit.Bag(path)
            # NOTE: could consider using fast validation, but files probably are
            # not so large or so numerous that this will be an issue
            if verbosity > self.v_normal:
                self.stdout.write('Validating bag %s' % path)
            fast_validate = options.get('fast_validate')
            # NOTE(review): no validation call is visible after this point,
            # yet the next message reports the bag as validated -- presumably
            # a bag validation statement was lost; confirm.
            if verbosity >= self.v_normal:
                self.stdout.write('Validated %s in %.02fs %s' % (path, time.time() - start,
                    '(fast validation enabled)' if fast_validate else ''))
        except bagit.BagError as err:
            # failed to load directory as a bag
            raise CommandError('Please supply a valid BagIt as input. %s' % err)
        except bagit.BagValidationError as err:
            # bag is not valid
            raise CommandError('Input is not a valid bag. %s' % err)

        # path and checksum of each required component, filled in below
        files = {'pdf': None, 'marcxml': None, 'dc': None}
        checksums = {}

        # this is potentially a long list, but go ahead and store since we will
        # be consulting it multiple times
        payload_files = list(bag.payload_files())

        # identify required contents within the bag by extension and name
        for data_path in payload_files:
            # path is relative to bag root dir
            filename = os.path.join(path, data_path)

            # get extension and name
            basename = os.path.basename(filename)
            basefile, ext = os.path.splitext(basename)
            # NOTE: splitext leaves . on the ext portion

            if ext.lower() == '.pdf':
                files['pdf'] = filename
                checksums['pdf'] = bag.entries[data_path].get('md5', None)

            elif ext.lower() == '.xml':

                if basefile.lower() == 'marc':
                    files['marcxml'] = filename
                    checksums['marcxml'] = bag.entries[data_path].get('md5', None)

                elif basefile.lower() == 'dc':
                    files['dc'] = filename
                    checksums['dc'] = bag.entries[data_path].get('md5', None)

        # check that required components are present
        # (report every missing piece before failing, rather than the first)
        err = False
        for label, filepath in files.iteritems():
            if filepath is None:
                self.stderr.write('%s not found' % label.upper())
                err = True

            elif checksums[label] is None:
                self.stderr.write('No MD5 checksum found for %s' % label.upper())
                err = True

        if err:
            raise CommandError('Cannot import without all required files and checksums.')

        # all pieces are available, so proceed with ingest

        # construct book and ingest
        if verbosity > self.v_normal:
            self.stdout.write('Creating book object with marxml %s' % files['marcxml'])
            # NOTE(review): a `try:` at the outer level is missing before the
            # next statement (it is paired with the `except` that follows).
            marcxml = load_xmlobject_from_file(files['marcxml'], MinMarcxml)
        except XMLSyntaxError as err:
            raise CommandError('Failed to load %s as xml: %s' % (files['marcxml'], err))
            # NOTE(review): a `try:` is missing before the next statement too.
            dcxml = load_xmlobject_from_file(files['dc'], DublinCore)
        except XMLSyntaxError as err:
            raise CommandError('Failed to load %s as xml: %s' % (files['dc'], err))

        # look for book by ocm number first, in case a previous ingest failed
        book_pids = Book.pids_by_label(marcxml.ocm_number)
        # error if we find more than one
        if len(book_pids) > 1:
            raise CommandError('Multiple books exist with label %s. Please correct this first.' \
                                % marcxml.ocm_number)

        # if we find exactly one, use that instead of creating a new book
        elif len(book_pids) == 1:
            book = repo.get_object(book_pids[0], type=Book)
            if verbosity >= self.v_normal:
                self.stdout.write('Using existing book %s with ocm number %s' % \
                    (book.pid, marcxml.ocm_number))

        # otherwise, ingest new book
        # NOTE(review): the `else:` header for this branch is missing.
            book = repo.get_object(type=Book)
            # set book label to ocm number from the marc
            book.label = marcxml.ocm_number
            if verbosity > self.v_normal:
                self.stdout.write('Book label %s' % book.label)

            # associate with collection
            if collection is not None:
                book.collection = collection
                if verbosity > self.v_normal:
                    self.stdout.write('Associating with collection %s' % collection.short_label)
            book.marcxml.content = marcxml
            # NOTE: import checksum can't be used because xml may be serialized differently
            # book.marcxml.checksum = checksums['marcxml']
            book.dc.content = dcxml
            # NOTE: import checksum can't be used because DC is modified to add ARK
            # book.dc.checksum = checksums['dc']

            # save; bail if error
            if not dry_run:
                # NOTE(review): the `try:` for `except RequestFailed` below is
                # missing here.
                    saved = book.save('ingest')
                    if not saved:
                        raise CommandError('Failed to ingest book into repository')
                    if verbosity >= self.v_normal:
                        self.stdout.write('Successfully ingested book %s' \
                                    % book.pid)
                except RequestFailed as err:
                    raise CommandError('Error ingesting book: %s' % err)

        # in case of pre-existing book object, check for existing volume
        if book.volume_set:
            if len(book.volume_set) > 1:
                raise CommandError('Book %s has multiple volumes; import not supported' \
                    % book.pid)
                # NOTE(review): an `else:` header is missing before the
                # "use existing volume" branch below.
                # use existing volume object
                vol = book.volume_set[0]
                if verbosity >= self.v_normal:
                    self.stdout.write('Using existing volume %s' % vol.pid)

        # otherwise, create new volume object
        # NOTE(review): the `else:` header for this branch is missing.
            # construct volume (v1.1), associate with book, and ingest
            if verbosity > self.v_normal:
                self.stdout.write('Creating volume with %s' % files['pdf'])
            # the pdf must stay open until the volume has been saved, since the
            # open file object itself is assigned as datastream content
            with open(files['pdf']) as pdf_file:
                vol = repo.get_object(type=VolumeV1_1)
                # set volume label to ocm number from the marc + volume number
                # for consistency with lsdi content, use ocm_v# notation
                # V.0 indicates single-volume book
                vol.label = '%s_V.0' % marcxml.ocm_number
                # set pdf content
                vol.pdf.content = pdf_file
                vol.pdf.checksum = checksums['pdf']
                # set relation to parent book object
                vol.book = book
                # minimal DC metadata derived from book metadata
                vol.dc.content.title = book.dc.content.title
                # NOTE(review): the body of this loop is missing; presumably it
                # copied each book dc:type onto the volume -- confirm.
                for t in book.dc.content.type_list:
                vol.dc.content.format = book.dc.content.format
                vol.dc.content.language = book.dc.content.language
                vol.dc.content.rights = book.dc.content.rights

                if not dry_run:
                    # NOTE(review): the `try:` for `except RequestFailed` below
                    # and an `else:` before the success message are missing.
                        saved = vol.save('ingest')
                        if not saved:
                            # NOTE: possibly, if this fails, we should deactivate the book object
                            # but will leave that to manual processing for now
                            raise CommandError('Failed to ingest volume into repository')
                            if verbosity >= self.v_normal:
                                self.stdout.write('Successfully ingested volume %s' \
                                    % vol.pid)
                    except RequestFailed as err:
                        raise CommandError('Error ingesting volume: %s' % err)

        #### page import

        # if volume has existing pages, bail
        if len(vol.pages):
            raise CommandError('Volume %s already has %s page%s' % \
                (vol.pid, len(vol.pages), '' if len(vol.pages) == 1 else 's'))

        # should page import happen here?
        # - identify numeric jp2/jpf files in the bag and get total count
        # - identify numeric .xml files in the bag and get total count
        # - make sure counts match up
        # Question: can we assume no start/end blank pages for now?
        # - start looping through, create page-1.1 and associate with book,
        #   and ingest
        # - set first page as primary image on the volume
        # - report number of pages ingested

        image_files = []

        # identify page files (images and ocr xml)
        # NOTE(review): nothing in this copy ever adds to image_files, so the
        # page loop further down would never run; presumably an append of
        # data_path was lost from the `if ext in ...` branch -- confirm.
        for data_path in payload_files:
            # get extension and name
            basename = os.path.basename(data_path)
            basefile, ext = os.path.splitext(basename)
            # unlike the pdf/xml check above, this comparison is case-sensitive
            if ext in ['.jp2', '.jpf']:
                # check that MD5 is present and bail if not
                # - this is probably redundant since by this point validation
                # has passed and previous content has checksums, but
                # ingest will assume checksums are available so better to error
                # *before* starting to ingest page-level content
                if bag.entries[data_path].get('md5', None) is None:
                    raise CommandError('No MD5 checksum for %s' % data_path)

        # ensure pages are sorted into page-order
        # NOTE(review): the statement that performs this sort is missing.

        # NOTE: disabled for now; tunebook does not appear to include alto
        # for pages with no text content
        ## find matching page ocr files
        # for imgfile in image_files:
        #     basefile, ext = os.path.splitext(imgfile)
        #     ocrfile = '%s.xml' % basefile
        #     if ocrfile not in payload_files:
        #         raise CommandError('No OCR xml page present for %s (expected %s)' % \
        #             (imgfile, ocrfile))

        # pre-generate empty xml in case we need it to force eulfedora to not
        # create ocr datastream when no ocr is present
        emptyxml = load_xmlobject_from_string('<empty/>')

        # iterate through page images and put into fedora
        # pageindex is 1-based and doubles as the page order
        pageindex = 1
        for imgfile in image_files:
            if verbosity > self.v_normal:
                print 'Creating Page object for %s' % imgfile
            # path is relative to bag root dir
            img_filename = os.path.join(path, imgfile)

            page = repo.get_object(type=PageV1_1)
            # set page label
            page.label = '%s page %d' % (vol.label, pageindex)
            # set the relation to the volume object
            page.volume = vol
            logger.debug('Page %s volume %s' % (page.pid, page.volume.pid))
            # set a dc:title based on volume title
            page.dc.content.title = '%s page %d' % (vol.dc.content.title, pageindex)
            # set page order
            page.page_order = pageindex

            # the image stays open until the page has been saved, since the
            # open file object itself is assigned as datastream content
            with open(img_filename) as img_content:
                # set image content
                page.image.content = img_content
                page.image.checksum = bag.entries[imgfile]['md5']
                # assume jpeg2000 for now (only looking for jp2/jpf)
                page.image.mimetype = 'image/jp2'

                # check for ocr xml within the bag, same base name as image
                basefile, ext = os.path.splitext(imgfile)
                ocrfile = '%s.xml' % basefile

                if ocrfile in payload_files:
                    page.ocr.content = load_xmlobject_from_file(os.path.join(path, ocrfile))
                    # NOTE: can't use MD5 from bag because XML may be
                    # serialized differently when sent to Fedora
                    # (unless we treat as file instead of xml...)
                    # page.ocr.checksum = bag.entries[ocrfile]['md5']

                    if verbosity > self.v_normal:
                        print 'Setting OCR for Page from %s' % ocrfile

                    # NOTE(review): the `else:` header for the "no OCR" branch
                    # is missing; as written the warning and the empty-xml
                    # override would apply to pages that do have OCR.
                    # warn but do not error if ocr xml is not found
                    self.stdout.write('Warning: no OCR xml found for %s' % imgfile)
                    # explicitly set xml content to empty so eulfedora doesn't
                    # attempt to bootstrap & ingest (and error)
                    page.ocr.content = emptyxml

                if not dry_run:
                    # NOTE(review): the `try:` for `except RequestFailed` below
                    # is missing here.
                        # for now, if any page ingest errors, bail out
                        # (unclear what would cause load to fail midway)

                        saved = page.save()

                        if not saved:
                            raise CommandError('Failed to ingest page %d into repository' \
                                % pageindex)

                    except RequestFailed as err:
                        raise CommandError('Error ingesting page %d: %s' % (pageindex, err))

            # set first page as primary image for the volume
            if not dry_run and pageindex == 1:
                vol.primary_image = page
                vol.save('adding primary image relation')

            # increase page index for next page
            pageindex += 1

        if verbosity >= self.v_normal:
            # total is pageindex - 1 since pageindex incremented at end of loop
            self.stdout.write('Created %d pages' % (pageindex - 1))
Example #56
 def setUp(self):
     # Parse the fixture once, then keep handles on its first folder and on
     # that folder's first message for use by the individual tests.
     self.account = fixture_account = load_xmlobject_from_file(
         self.FIXTURE_FILE, cerp.Account)
     self.folder = first_folder = fixture_account.folders[0]
     self.message = first_folder.messages[0]
Example #57
 def setUp(self):
     self.fr6v1 = load_xmlobject_from_file(self.fr6v1_doc,
     self.fr8v2 = load_xmlobject_from_file(self.fr8v2_doc,