def _query(self, base_url, qargs, response_xmlclass):
    '''Utility method: Adds required query arguments, returns response
    as a caller-specified :class:`~eulxml.xmlmap.XmlObject`. Delays if
    necessary to enforce EUtils query speed policy.
    '''
    self._enforce_query_timing()
    qargs = qargs.copy()
    if 'tool' not in qargs:
        qargs['tool'] = self.EUTILS_TOOL
    if 'email' not in qargs:
        qargs['email'] = self.EUTILS_EMAIL
    qurl = base_url + urlencode(qargs)
    logger.debug('EntrezClient querying: ' + qurl)
    # use a URL validator to determine whether qurl is a remote URL or a
    # file location; open remote content with urlopen if it is a URL,
    # otherwise load it directly as a file
    url_validator = URLValidator()
    try:
        url_validator(qurl)
        target_file = urlopen(qurl)
        # pass the file-like response object (not its string content)
        # to the xml loader
        return xmlmap.load_xmlobject_from_file(target_file,
                                               xmlclass=response_xmlclass)
    except ValidationError:
        return xmlmap.load_xmlobject_from_file(qurl,
                                               xmlclass=response_xmlclass)
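# --- Illustration (standalone sketch; assumed, not part of the original module) ---
# The try/except above uses Django's URLValidator to decide whether the
# assembled query string is a remote URL (fetched with urlopen) or a local
# file path (loaded directly). A minimal demonstration of that dispatch:
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator

def is_remote(location):
    '''Return True if location is a well-formed URL, False otherwise
    (e.g. for a local fixture path).'''
    try:
        URLValidator()(location)
        return True
    except ValidationError:
        return False

# is_remote('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi')  # True
# is_remote('fixtures/esearch-response-withhist.xml')                     # False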
def test_get_fulltext(self):
    with patch.object(self.vol, 'ocr') as mockocr:
        mockocr.exists = True
        # abbyy finereader v8
        ocr_xml = load_xmlobject_from_file(
            os.path.join(FIXTURE_DIR, 'abbyyocr_fr8v2.xml'))
        mockocr.content = ocr_xml
        text = self.vol.get_fulltext()
        # check for arbitrary text content
        self.assert_('In presenting this, the initial volume of the' in text,
                     'ocr text content should be present in plain text')
        self.assert_('Now, kind reader, we ask that you do not crit' in text,
                     'ocr text content should be present in plain text')
        self.assert_(re.search(r'Baldwin\s+Dellinger\s+Brice', text),
                     'table row content should be displayed on a single line')
        # abbyy finereader v6
        ocr_xml = load_xmlobject_from_file(
            os.path.join(FIXTURE_DIR, 'abbyyocr_fr6v1.xml'))
        mockocr.content = ocr_xml
        text = self.vol.get_fulltext()
        # check for arbitrary text content
        self.assert_('was late in the autumn, the vines yet kept their leaves,' in text,
                     'ocr text content should be present in plain text')
        self.assert_('walked up the steps. The lady had not moved, and made' in text,
                     'ocr text content should be present in plain text')
        self.assert_(re.search(r'Modern\.\s+New Standard\.\s+Popular\.', text),
                     'table row content should be displayed on a single line')
def setUp(self):
    # tei generated from mets alto
    self.alto_tei = load_xmlobject_from_file(
        os.path.join(FIXTURE_DIR, 'teifacsimile.xml'), tei.Facsimile)
    # tei generated from abbyy ocr
    self.abbyy_tei = load_xmlobject_from_file(
        os.path.join(FIXTURE_DIR, 'teifacsimile_abbyy.xml'), tei.Facsimile)
def setUp(self):
    search_fixture_path = self.fixture_path('esearch-response-withhist.xml')
    self.search_response = xmlmap.load_xmlobject_from_file(
        search_fixture_path, xmlclass=ESearchResponse)
    fetch_fixture_path = self.fixture_path('efetch-retrieval-from-hist.xml')
    self.fetch_response = xmlmap.load_xmlobject_from_file(
        fetch_fixture_path, xmlclass=EFetchResponse)
    self.mock_client = Mock(spec=EntrezClient)
def setUp(self):
    super(HarvestRecordTest, self).setUp()
    article_fixture_path = fixture_path('efetch-retrieval-from-hist.xml')
    self.fetch_response = xmlmap.load_xmlobject_from_file(
        article_fixture_path, xmlclass=EFetchResponse)
    # one corresponding author with an emory email
    self.article = self.fetch_response.articles[0]
def test_consolidate_bibl(self):
    teidoc = load_xmlobject_from_file(
        os.path.join(FIXTURE_DIR, 'teifacsimile.xml'), tei.AnnotatedFacsimile)
    teinote = annotation_to_tei(self.zotero_note, teidoc)
    teidoc.annotations.append(teinote)
    consolidate_bibliography(teidoc)
    self.assertEqual(2, len(teidoc.citations),
                     'annotation citations should be present in main document bibl')
    teinote = teidoc.annotations[0]
    self.assertEqual(0, len(teinote.citations),
                     'citations should not be present on individual annotation')
    self.assertEqual(None, teinote.works_cited)
    self.assertEqual(None, teinote.zotero_items)
    self.assertEqual(None, teinote.works_cited_milestone)
    teinote_xml = teinote.serialize()
    self.assertFalse('<item><anchor xml:id="zotero-' in teinote_xml)
    self.assertFalse('<listBibl/>' in teinote_xml)
    # repeated zotero ids should only appear once in document bibl;
    # load the same note and add it again
    teinote = annotation_to_tei(self.zotero_note, teidoc)
    teidoc.annotations.append(teinote)
    consolidate_bibliography(teidoc)
    self.assertEqual(2, len(teidoc.citations),
                     'citations repeated in annotations should only appear once')
def language(self):
    '''
    Wrapper around the language fields that chooses the preferred source.

    :returns: a tuple containing language code and name
    '''
    marc_languages_xml = 'http://www.loc.gov/standards/codelists/languages.xml'
    langs = xmlmap.load_xmlobject_from_file(marc_languages_xml)
    ns = {'lang': 'info:lc/xmlns/codelist-v1'}
    if self.wos and self.wos.language:
        lang = self.wos.language
    elif self.scopus and self.scopus.language:
        lang = self.scopus.language
    elif self.pubmed and self.pubmed.language:
        lang = self.pubmed.language
    elif self.crossref and self.crossref.language:
        lang = self.crossref.language
    elif self.arxiv and self.arxiv.language:
        lang = self.arxiv.language
    elif self.repec and self.repec.language:
        lang = self.repec.language
    elif self.dblp and self.dblp.language:
        lang = self.dblp.language
    else:
        lang = ''
    nodes = langs.node.xpath(
        "//lang:language[lang:name='%s' or lang:code='%s']" % (lang, lang),
        namespaces=ns)
    if nodes:
        return (nodes[0].findtext('lang:code', namespaces=ns),
                nodes[0].findtext('lang:name', namespaces=ns))
    else:
        return ('', '')
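# --- Sketch (assumed helper, for illustration only) ---
# The property above re-downloads the LOC codelist on every access; the same
# code/name resolution can be done once against a cached XmlObject. A minimal
# standalone version of the lookup:
from eulxml import xmlmap

MARC_LANGUAGES_XML = 'http://www.loc.gov/standards/codelists/languages.xml'
_CODELIST_NS = {'lang': 'info:lc/xmlns/codelist-v1'}
_codelist = None  # module-level cache, loaded on first use

def lookup_language(lang):
    '''Resolve a language name or code against the LOC MARC codelist,
    returning a (code, name) tuple, or ('', '') if not found.'''
    global _codelist
    if _codelist is None:
        _codelist = xmlmap.load_xmlobject_from_file(MARC_LANGUAGES_XML)
    nodes = _codelist.node.xpath(
        "//lang:language[lang:name='%s' or lang:code='%s']" % (lang, lang),
        namespaces=_CODELIST_NS)
    if nodes:
        return (nodes[0].findtext('lang:code', namespaces=_CODELIST_NS),
                nodes[0].findtext('lang:name', namespaces=_CODELIST_NS))
    return ('', '')

# lookup_language('eng')  # -> ('eng', 'English'), assuming current codelist content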
def test_can_serialize_xsd300_ds():
    from eulxml.xmlmap import load_xmlobject_from_file
    with open(os.path.join(FIXTURE_ROOT, '1023796ar.xml')) as f:
        xmlobject = load_xmlobject_from_file(f)
    serialized_object = serializers._xsd300_serializer(xmlobject)
    assert type(serialized_object) == bytes
def setUp(self):
    # load the three xml doc objects
    self.docs = dict()
    for file in self.FIXTURES:
        filebase = file.split('.')[0]
        self.docs[filebase] = xmlmap.load_xmlobject_from_file(
            path.join(exist_fixture_path, file), TestDocTitle)
def process(spreadsheet, xml_files_dir, sheet=1, control_row=None,
            force_dates=False, object_type='parent', input_encoding='utf8',
            copy_parent_to_children=False):
    '''Function to go through all the data and process it.'''
    # make sure we have a directory to put the mods files in
    os.makedirs(xml_files_dir, exist_ok=True)
    data_handler = DataHandler(spreadsheet, sheet=sheet, control_row=control_row,
                               force_dates=force_dates, object_type=object_type,
                               input_encoding=input_encoding)
    index = 1
    for record in data_handler.get_xml_records():
        filename = '%s.%s.xml' % (record.xml_id, record.record_type)
        full_path = os.path.join(xml_files_dir, filename)
        if os.path.exists(full_path):
            raise DataError('%s file already exists from previous record! '
                            'Possible duplicate %s IDs?' % (filename, record.xml_id))
        if copy_parent_to_children:
            # load parent mods object if desired (& it exists)
            parent_filename = os.path.join(
                xml_files_dir, u'%s.%s' % (record.group_id, record.record_type))
            parent_xml = None
            if os.path.exists(parent_filename):
                parent_xml = load_xmlobject_from_file(parent_filename, mods.Mods)
            mapper = Mapper(record.record_type, record.field_data(),
                            parent_mods=parent_xml)
        else:
            mapper = Mapper(record.record_type, record.field_data())
        xml_obj = mapper.get_xml()
        xml_bytes = xml_obj.serializeDocument(pretty=True)  # serializes as UTF-8
        with open(full_path, 'wb') as f:
            f.write(xml_bytes)
        index = index + 1
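# --- Example invocation (hypothetical file names and arguments) ---
# process() reads spreadsheet rows via DataHandler, maps each record through
# Mapper, and writes one <xml_id>.<record_type>.xml file per record, raising
# DataError on duplicate IDs. A typical call might look like:
#
# process('objects.xlsx', 'output/mods', sheet=1, force_dates=True,
#         object_type='parent', copy_parent_to_children=True)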
def test_can_retrieve_xml_of_existing_articles(self, mock_ds, mock_pdf):
    with open(os.path.join(FIXTURE_ROOT, '1023796ar.xml'), 'r') as f:
        from eulxml.xmlmap import load_xmlobject_from_file
        mock_pdf.content = load_xmlobject_from_file(f)
    mock_ds = ['ERUDITXSD300', ]  # noqa
    issue = IssueFactory.create(
        journal=self.journal, year=2010,
        date_published=dt.datetime.now() - dt.timedelta(days=1000))
    article = ArticleFactory.create(issue=issue)
    journal_id = self.journal.localidentifier
    issue_id = issue.localidentifier
    article_id = article.localidentifier
    url = reverse('public:journal:article_raw_xml', args=(
        journal_id, issue.volume_slug, issue_id, article_id
    ))
    request = self.factory.get(url)
    request.user = AnonymousUser()
    request.subscription = None
    # Run
    response = ArticleXmlView.as_view()(
        request, journal_code=journal_id, issue_slug=issue.volume_slug,
        issue_localid=issue_id, localid=article_id)
    # Check
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response['Content-Type'], 'application/xml')
def test_rdf_type(self):
    # not enough information to determine type
    self.assertEqual(None, self.c1.rdf_type)
    # infer book, article, etc from title attributes
    self.assertEqual('bibo:Book', self.c3.rdf_type)
    self.assertEqual('bibo:Article', self.c4.rdf_type)
    # type inferred based on series; requires access to series, so load from fixtures
    # - bailey findingaid contains printed material, photographs, and audiovisual
    bailey = load_xmlobject_from_file(
        path.join(exist_fixture_path, 'bailey807.xml'), FindingAid)
    # patch in unittitles so it looks as though items have semantic data
    with patch('findingaids.fa.models.Series.unittitle_titles', new=[Title()]):
        # series 4 is printed material
        self.assertEqual('bibo:Document', bailey.dsc.c[3].c[0].rdf_type,
                         'items in printed materials series should default to document type')
        # series 5 is photographs
        self.assertEqual('bibo:Image', bailey.dsc.c[4].c[0].rdf_type,
                         'items in photograph series should default to image type')
        # series 9 is audiovisual
        self.assertEqual('bibo:AudioVisualDocument', bailey.dsc.c[8].c[0].rdf_type,
                         'items in audiovisual series should default to audiovisualdocument type')
        # fallback type is manuscript
        self.assertEqual('bibo:Manuscript', bailey.dsc.c[0].c[0].rdf_type,
                         'items without a series-based type should default to manuscript type')
def add_xml_datastream(self, xml_path, ds_id, label, control_group,
                       mimetype, checksum_type):
    """Add XML object."""
    xml_object = xmlmap.load_xmlobject_from_file(xml_path)
    if checksum_type == "SHA-512":
        checksum = self.generate_checksum(xml_path)
    else:
        checksum = None
        logging.warning(
            "Unable to generate checksum for specified type: {0}".format(
                checksum_type))
    logging.info("----adding datastream {0}: {1}".format(ds_id, label))
    new_datastream = DatastreamObject(self.obj, ds_id, label,
                                      mimetype=mimetype,
                                      control_group=control_group,
                                      checksum_type=checksum_type,
                                      checksum=checksum)
    new_datastream.content = xml_object
    new_datastream.label = label
    new_datastream.save()
def setUp(self):
    # load the three xml issue objects
    self.issue = dict()
    for file in self.FIXTURES:
        filebase = file.split('.')[0]
        self.issue[filebase] = xmlmap.load_xmlobject_from_file(
            path.join(exist_fixture_path, file), TestIssue)
def process(dataHandler, copy_parent_to_children=False):
    '''Function to go through all the data and process it.'''
    # get dicts of columns that should be mapped & where they go in MODS
    index = 1
    for record in dataHandler.get_mods_records():
        filename = record.mods_filename
        if os.path.exists(os.path.join(MODS_DIR, filename)):
            raise Exception('%s already exists!' % filename)
        logger.info('Processing row %d to %s.' % (index, filename))
        if copy_parent_to_children:
            # load parent mods object if desired (& it exists)
            parent_filename = os.path.join(MODS_DIR, record.parent_mods_filename)
            parent_mods = None
            if os.path.exists(parent_filename):
                parent_mods = load_xmlobject_from_file(parent_filename, mods.Mods)
            mapper = Mapper(parent_mods=parent_mods)
        else:
            mapper = Mapper()
        for field in record.field_data():
            mapper.add_data(field['mods_path'], field['data'])
        mods_obj = mapper.get_mods()
        mods_data = unicode(mods_obj.serializeDocument(pretty=True), 'utf-8')
        with codecs.open(os.path.join(MODS_DIR, filename), 'w', 'utf-8') as f:
            f.write(mods_data)
        index = index + 1
def setUp(self):
    # load the fixture file as a generic tei document
    self.tei = xmlmap.load_xmlobject_from_file(self.simmons_xml, teimap.Tei)
    # find the first groupsheet via xpath and load
    groups = self.tei.node.xpath('//t:text/t:group/t:group',
                                 namespaces={'t': teimap.TEI_NAMESPACE})
    self.groupsheet = TeiGroupSheet(groups[0])
def lsdibag():
    # create and return a LsdiBaggee object to use in tests
    digwf_item_response = os.path.join(FIXTURE_DIR, 'digwf_getitems_3031.xml')
    response = load_xmlobject_from_file(digwf_item_response, digwf.Items)
    # update path to use local fixture for marc xml
    item = response.items[0]
    item.marc_path = os.path.join(FIXTURE_DIR, 'ocm08951025_MRC.xml')
    return LsdiBaggee(item)
def set_attr_xml_content(self, attr, path):
    """Add xml content to datastream."""
    # load the file as an XmlObject; do not re-open the path afterwards,
    # which would clobber the parsed content with a raw file handle
    xml_object = xmlmap.load_xmlobject_from_file(path)
    if attr == "dc":
        self.set_attr(attr + ".content", xml_object, sub_attr="dc")
    else:
        self.set_attr(attr, xml_object, sub_attr="content")
def xml(request):
    "Display xml of a single issue."
    try:
        doc = xmlmap.load_xmlobject_from_file(
            filename=os.path.join(settings.BASE_DIR, 'static', 'xml',
                                  'luther_text.xml'))
    except Exception:
        raise Http404
    tei_xml = doc.serializeDocument(pretty=True)
    return HttpResponse(tei_xml, mimetype='application/xml')
def test_no_content():
    tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    # this file has text content
    assert not tei.no_content()
    # if we delete the lines and labels, it does not
    tei.lines = []
    tei.labels = []
    assert tei.no_content()
def setUp(self):
    self.vol = Volume(Mock())  # use a real volume, but Mock for api
    self.vol.pid = 'testvol:123'
    self.tei = load_xmlobject_from_file(
        os.path.join(FIXTURE_DIR, 'teifacsimile.xml'), AnnotatedFacsimile)
    self.tmpdir = tempfile.mkdtemp(prefix='rdx-export-test')
    # for now, use defaults for page one, callback, images
    self.exporter = VolumeExport(self.vol, self.tei)
def test_fields():
    tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    assert tei.pgpid == 968
    # should have text, lines, and labels
    assert tei.text
    assert tei.lines
    assert tei.labels
    assert len(tei.labels) == 4
    assert tei.source_authors == ["Gil"]
def from_file(cls, file_path, validate=True):
    """
    Creates a Python object from an XML file.

    :param file_path: Path to the XML file
    :param validate: XML should be validated against the embedded XSD definition
    :type validate: Boolean
    :returns: the Python object
    """
    return xmlmap.load_xmlobject_from_file(file_path, xmlclass=cls,
                                           validate=validate)
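# --- Usage sketch (class name and fixture path are assumptions) ---
# from_file is a thin convenience wrapper over eulxml's loader: any XmlObject
# subclass that carries it as a classmethod gains a one-line constructor from
# a path. Note that validate=True relies on a schema being declared on the
# class (eulxml's XSD_SCHEMA), so a schema-less class would pass validate=False:
#
# class Invoice(xmlmap.XmlObject):
#     number = xmlmap.StringField('number')
#     from_file = classmethod(from_file)
#
# invoice = Invoice.from_file('invoice.xml', validate=False)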
def update_999a(path, kdip_id, enumcron):
    """
    Method to update the 999a MARC field if/when it is changed in the database.
    """
    marc_file = '%s/%s/marc.xml' % (path, kdip_id)
    marc = load_xmlobject_from_file(marc_file, models.Marc)
    marc.tag_999a = enumcron
    with open(marc_file, 'w') as marcxml:
        marcxml.write(marc.serialize(pretty=True))
def test_annotation_to_tei(self):
    teidoc = load_xmlobject_from_file(
        os.path.join(FIXTURE_DIR, 'teifacsimile.xml'), tei.AnnotatedFacsimile)
    note = Annotation(text="Here's the thing", quote="really",
                      extra_data=json.dumps({'sample data': 'foobar',
                                             'tags': ['test', 'one', 'two']}))
    teinote = annotation_to_tei(note, teidoc)
    self.assert_(isinstance(teinote, tei.Note))
    self.assertEqual('annotation-%s' % note.id, teinote.id)
    self.assert_(teinote.href.endswith(note.get_absolute_url()))
    self.assertEqual(note.text, teinote.paragraphs[0])
    # todo: add a schema validation once we get the output to be valid
    # teidoc.schema_valid()
    # access errors with teidoc.schema_validation_errors()

    # annotation user should be set as note response
    user = get_user_model()(username='******')
    user.save()
    note.user = user
    teinote = annotation_to_tei(note, teidoc)
    self.assertEqual(user.username, teinote.resp)

    # tags should be set as interp ids in the ana attribute
    for tag in note.info()['tags']:
        self.assert_('#%s' % tag in teinote.ana)

    # test that markdown formatting is coming through
    footnote = '''Footnotes[^1] have a label and content.

[^1]: This is some footnote content.'''
    note.text = footnote
    teinote = annotation_to_tei(note, teidoc)
    self.assert_('<ref target="#fn1" type="noteAnchor">1</ref>'
                 in teinote.serialize())
    # markdown should be included in a code element
    self.assertEqual(note.text, teinote.markdown)

    # related page references
    rel_pages = [
        'http://testpid.co/ark:/1234/11',
        'http://testpid.co/ark:/1234/22',
        'http://testpid.co/ark:/1234/qq'
    ]
    note.extra_data = json.dumps({'related_pages': rel_pages})
    teinote = annotation_to_tei(note, teidoc)
    self.assertEqual(len(rel_pages), len(teinote.related_pages))
    # first ark has a corresponding id in the fixture, should be converted
    self.assertEqual('#%s' % teidoc.page_id_by_xlink(rel_pages[0]),
                     teinote.related_pages[0].target)
    for idx in range(len(rel_pages)):
        self.assertEqual(rel_pages[idx], teinote.related_pages[idx].text)
def test_ocr_ids(self):
    # patch in fixture ocr content
    with patch.object(self.vol, 'ocr') as mockocr:
        mockocr.exists = True
        ocr_xml = load_xmlobject_from_file(
            os.path.join(FIXTURE_DIR, 'abbyyocr_fr8v2.xml'))
        mockocr.content = ocr_xml
        self.assertFalse(self.vol.ocr_has_ids)
        self.vol.add_ocr_ids()
        self.assertTrue(self.vol.ocr_has_ids)
def test_text_to_plaintext_longlines():
    tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    # replace the text of the last line with an excessively long line
    # - because the xmlobject isn't configured with an eye to updates,
    #   update the lxml node text directly
    tei.lines[-1].node.text = "superlongline" * 100
    plaintext = tei.text_to_plaintext()
    plaintext_lines = plaintext.split("\n")
    # line is slightly more than 100 because of ltr/rtl marks & line number,
    # but should NOT be padded to match the superlongline
    assert len(plaintext_lines[1]) < 110
def test_text_to_plaintext():
    tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    plaintext = tei.text_to_plaintext()
    assert plaintext.count("\n") == 43
    # two section breaks
    assert plaintext.count("\n\n") == 4
    # includes labels
    assert "Right Margin" in plaintext
    assert "מא" in plaintext
    assert "الحسن بن ابرهيم" in plaintext
    # includes line numbers and ltr/rtl marks
    assert ("\u200f כתאבי אטאל אללה בקא מולי אלשיך ואדאם \u200e 1\n"
            in plaintext)
def test_items_xml(self):
    # basic inspection of sample result / xml mapping
    response = load_xmlobject_from_file(self.item_response, digwf.Items)
    assert response.count == 1
    assert len(response.items) == 1
    assert isinstance(response.items[0], digwf.Item)
    item = response.items[0]
    assert item.pid == '7svgb'
    assert item.item_id == '3031'
    assert item.control_key == 'ocm08951025'
    assert item.display_image_path == '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output'
    assert item.display_image_count == 2218
    assert item.ocr_file_path == '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output'
    assert item.ocr_file_count == 2218
    assert item.pdf == '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output/Output.pdf'
    assert item.marc_path == '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/ocm08951025_MRC.xml'
    assert item.ocr_file == '/mnt/lsdi/diesel/lts_new/ocm08951025-3031/ocm08951025/Output/Output.xml'
    assert item.collection_id == 10
    assert item.collection_name == 'Atlanta City Directories'

    response = load_xmlobject_from_file(self.empty_response, digwf.Items)
    assert response.count == 0
def test_annotation_citation_to_tei(self):
    teidoc = load_xmlobject_from_file(
        os.path.join(FIXTURE_DIR, 'teifacsimile.xml'), tei.AnnotatedFacsimile)
    teinote = annotation_to_tei(self.zotero_note, teidoc)
    # print teinote.serialize(pretty=True)
    # number of citations should match
    self.assertEqual(len(self.zotero_note.extra_data['citations']),
                     len(teinote.citations))
    # minimal inspection to check that values carried through as expected
    self.assertEqual('webpage', teinote.citations[0].type)
    self.assertEqual('journalArticle', teinote.citations[1].type)
    self.assertEqual('zotero-7CBCH6E8', teinote.citations[0].id)
    self.assertEqual('zotero-MUXAEE89', teinote.citations[1].id)
def mock_load(url, xmlclass):
    '''mock-like method wrapping load_xmlobject_from_file without actually
    making a network query, but still calling the requested xmlclass
    constructor.
    '''
    # figure out what fixture to return
    fixture = (mock_load.return_fixtures[mock_load.call_count]
               if mock_load.call_count < len(mock_load.return_fixtures)
               else mock_load.return_fixtures[-1])
    mock_load.call_count += 1
    mock_load.urls.append(url)
    test_response_path = fixture_path(fixture)
    test_response_obj = xmlmap.load_xmlobject_from_file(
        test_response_path, xmlclass=xmlclass)
    return test_response_obj
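# --- Usage sketch (the patch target is an assumption) ---
# mock_load keeps its state in function attributes, so a test primes those
# attributes and then patches it in place of the real loader:
#
# mock_load.return_fixtures = ['esearch-response-withhist.xml',
#                              'efetch-retrieval-from-hist.xml']
# mock_load.call_count = 0
# mock_load.urls = []
# with patch('eulxml.xmlmap.load_xmlobject_from_file', new=mock_load):
#     ...  # code under test; mock_load.urls records each URL requested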
def init_xml_object(self):
    '''Initialize an xmlobject based on user-specified arguments for
    filename and type. Returns an instance of the appropriate
    :class:`~eulxml.xmlmap.XmlObject`, or displays an error message if
    the document could not be parsed as XML.'''
    if self.args.input == 'ead':
        xmlobj_class = EAD
    elif self.args.input == 'tei':
        xmlobj_class = Tei
    else:
        # guard against an unrecognized input type, which would otherwise
        # leave xmlobj_class unbound below
        print 'Unknown input type: %s' % self.args.input
        exit(-1)
    try:
        return load_xmlobject_from_file(self.args.filename, xmlobj_class)
    except Exception as err:
        print 'Error loading %s as XML: %s' % (self.args.filename, err)
        exit(-1)
def test_html():
    tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
    html = tei.text_to_html()
    # should result in 3 sections
    assert html.count("<section>") == 3
    assert "<h1>Right Margin</h1>" in html
    assert "<li value='1'>מא</li>" in html
    # three different lines that are # 1
    assert html.count("<li value='1'>") == 3
    # check that the last line / last block is included
    assert "<li value='6'>الحسن بن ابرهيم</li>" in html
    # assert that missing line number does not result in a line number of "None"
    assert "<li value='None'>" not in html
    assert "<li value=''>" not in html
def test_page_index_data(self, mockzipfile):
    mockzip_obj = mockzipfile.return_value.__enter__.return_value
    page_files = ['0001.txt', '00002.txt']
    mockzip_obj.namelist.return_value = page_files
    # simulate reading zip file contents
    contents = ('page content for one', 'hello! pshaw! what?')
    mockzip_obj.open.return_value.__enter__.return_value \
        .read.return_value.decode.side_effect = contents
    work = DigitizedWork(source_id='chi.79279237')
    # page data comes from mets
    mets = load_xmlobject_from_file(self.metsfile, hathi.MinimalMETS)
    with patch.object(DigitizedWork, 'hathi') as mock_hathiobj:
        mock_hathiobj.zipfile_path.return_value = '/path/to/79279237.zip'
        mock_hathiobj.metsfile_path.return_value = self.metsfile
        mock_hathiobj.content_dir = 'data'
        page_data = work.page_index_data()
        assert isinstance(page_data, types.GeneratorType)
        for i, data in enumerate(page_data):
            mets_page = mets.structmap_pages[i]
            assert data['id'] == '.'.join([work.source_id,
                                           mets_page.text_file.sequence])
            assert data['source_id'] == work.source_id
            assert data['content'] == contents[i]
            assert data['order'] == mets_page.order
            assert data['item_type'] == 'page'
            assert data['label'] == mets_page.display_label
            assert 'tags' in data
            assert data['tags'] == mets_page.label.split(', ')
        # not suppressed, but no mets data available
        mock_hathiobj.metsfile_path.side_effect = \
            storage_exceptions.ObjectNotFoundException
        # should log an error, not currently tested
        assert not list(work.page_index_data())
        # if item is suppressed - no page data
        work.status = DigitizedWork.SUPPRESSED
        assert not list(work.page_index_data())
        # non hathi item - no page data
        nonhathi_work = DigitizedWork(source=DigitizedWork.OTHER)
        assert not list(nonhathi_work.page_index_data())
def handle(self, *args, **options):
    repo = Repository()
    for pid in options['pid']:
        vol = repo.get_object(pid, type=Volume)
        if options['tei']:
            tei = load_xmlobject_from_file(options['tei'], Facsimile)
        else:
            tei = annotate.annotated_tei(vol.generate_volume_tei(),
                                         vol.annotations())
        try:
            zipfile = export.website(vol, tei)
        except export.ExportException as err:
            raise CommandError(err)
        zipfilename = '%s-annotated-site.zip' % vol.noid
        shutil.copyfile(zipfile.name, zipfilename)
        print 'Export for %s complete, zipfile is %s' % (vol.noid, zipfilename)
def create_ht_marc(kdip):
    if isinstance(kdip, basestring):
        barcode = kdip
    else:
        barcode = kdip.kdip_id
    record = load_bib_record(barcode)
    cleanup_035s(record)
    remove_most_999_fields(record, barcode)
    transform_035(record)
    marc_file = '%s/%s/marc.xml' % (settings.KDIP_DIR, barcode)
    # Write the marc.xml to disk.
    with open(marc_file, 'w') as marcxml:
        # When we insert the 035 field, an empty datafield is inserted
        # at the bottom, so we get rid of that.
        marcxml.write(re.sub('\<datafield\/\>\\n', '',
                             record.serialize(pretty=True)))
    return load_xmlobject_from_file(marc_file, models.Marc)
def as_publication_article(self, repo=None):
    '''Initialize (but do not save) a new
    :class:`~openemory.publication.models.Article` instance based on
    harvested record information and Article XML.

    :param repo: optional; pass in an existing
        :class:`eulfedora.server.Repository` object initialized with the
        desired credentials
    :returns: unsaved :class:`~openemory.publication.models.Article`
    '''
    if repo is None:
        repo = Repository()
    article = repo.get_object(type=Article)
    # using comma-delimited usernames to indicate object has multiple owners
    # should work with existing XACML owner policy;
    # for more detail, see https://jira.duraspace.org/browse/FCREPO-82
    article.owner = ', '.join(auth.username for auth in self.authors.all())
    # VERY preliminary, minimal metadata mapping
    article.label = self.title
    article.dc.content.title = self.title
    article.dc.content.creator_list.extend([auth.get_full_name()
                                            for auth in self.authors.all()])
    article.dc.content.identifier_list.extend([self.access_url,
                                               'PMC%d' % self.pmcid])
    # set the XML article content as the contentMetadata datastream
    # - record content is a file field with a read method, which should be
    #   handled correctly by eulfedora for ingest
    if hasattr(self.content, 'read'):
        article.contentMetadata.content = load_xmlobject_from_file(
            self.content, NlmArticle)
    if article.contentMetadata.content:
        article.descMetadata.content = \
            article.contentMetadata.content.as_article_mods()
    # FIXME: datastream checksum!
    # TODO: format uri for this datastream ?
    return article
def page_index_data(self):
    '''Get page content for this work from Hathi pairtree and return
    data to be indexed in solr.'''
    # If an item has been suppressed or is from a source other than
    # hathi, bail out. No pages to index.
    if self.is_suppressed or self.source != self.HATHI:
        return
    # load mets record to pull metadata about the images
    try:
        mmets = load_xmlobject_from_file(self.hathi.metsfile_path(),
                                         MinimalMETS)
    except storage_exceptions.ObjectNotFoundException:
        logger.error('Pairtree data for %s not found but status is %s',
                     self.source_id, self.get_status_display())
        return
    # read zipfile contents in place, without unzipping
    with ZipFile(self.hathi.zipfile_path()) as ht_zip:
        # yield a generator of index data for each page; iterate
        # over pages in METS structmap
        for page in mmets.structmap_pages:
            # zipfile spec uses / for path regardless of OS
            pagefilename = '/'.join([self.hathi.content_dir,
                                     page.text_file_location])
            with ht_zip.open(pagefilename) as pagefile:
                try:
                    yield {
                        'id': '%s.%s' % (self.source_id,
                                         page.text_file.sequence),
                        'source_id': self.source_id,  # for grouping with work record
                        'content': pagefile.read().decode('utf-8'),
                        'order': page.order,
                        'label': page.display_label,
                        'tags': page.label.split(', ') if page.label else [],
                        'item_type': 'page'
                    }
                except StopIteration:
                    return
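# --- Consumption sketch (query and variable names are assumptions) ---
# page_index_data() yields one solr-ready dict per page listed in the METS
# structmap, and yields nothing for suppressed or non-HathiTrust items, so
# callers can consume the generator unconditionally:
#
# work = DigitizedWork.objects.get(source_id='chi.79279237')
# docs = list(work.page_index_data())  # [] if suppressed or non-hathi
# # each doc carries id, source_id, content, order, label, tags, item_type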
def test_check_ht(self):
    test_xml = [
        'digitizedbooks/apps/publish/fixtures/bib1.xml',
        'digitizedbooks/apps/publish/fixtures/bib2.xml',
        'digitizedbooks/apps/publish/fixtures/bib3.xml'
    ]
    job = Job(pk=1)
    job.save()
    kdip0 = KDip.objects.create(kdip_id='10002350302', oclc="12345", note='0',
                                pid='r8d9b', create_date='2015-12-30 15:43:17',
                                job_id=1)
    kdip1 = KDip.objects.create(kdip_id='10002350304', oclc="12345", note='1',
                                pid='r8d9y', create_date='2015-12-30 15:43:17',
                                job_id=1)
    kdip2 = KDip.objects.create(kdip_id='10002350306', oclc="67890", note='2',
                                pid='r8d9s', create_date='2015-12-30 15:43:17',
                                job_id=1)
    text590 = ("The online edition of this book in the public domain, i.e., "
               "not protected by copyright, has been produced by the Emory "
               "University Digital library Publications Program.")
    for xml in test_xml:
        index = test_xml.index(xml)
        kdip = KDip.objects.get(note=index)
        marc = load_xmlobject_from_file(xml, AlmaBibRecord)
        marc = check_ht.add_856(marc, kdip)
        marc = Utils.remove_all_999_fields(marc)
        marc = Utils.update_583(marc)
        text_856 = ('<datafield tag="856" ind1="4" ind2="1">'
                    '<subfield code="3">%s</subfield>'
                    '<subfield code="u">http://pid.emory.edu/ark:/25593/%s/HT</subfield>'
                    '<subfiled code="y">HathiTrust version</subfiled>'
                    '</datafield>') % (index, kdip.pid)
        field856s = []
        for tag856 in marc.field856:
            field856s.append(tag856.serialize())
        self.assertIn(text_856, field856s)
        self.assertEqual(len(marc.field999), 0)
        # 590 text should not be present until add_590 is called
        self.assertNotIn(text590.lower(), marc.serialize().lower())
        marc = check_ht.add_590(marc)
        self.assertEqual(marc.field590, text590)
        self.assertEqual(marc.tag583a, 'digitized')
def test_can_retrieve_xml_of_existing_articles(self, mock_ds, mock_pdf):
    with open(os.path.join(FIXTURE_ROOT, '1023796ar.xml'), 'r') as f:
        from eulxml.xmlmap import load_xmlobject_from_file
        mock_pdf.content = load_xmlobject_from_file(f)
    mock_ds = ['ERUDITXSD300', ]  # noqa
    issue = IssueFactory.create(
        journal=self.journal, year=2010,
        date_published=dt.datetime.now() - dt.timedelta(days=1000))
    IssueFactory.create(journal=self.journal, year=2010,
                        date_published=dt.datetime.now())
    article = ArticleFactory.create(issue=issue)
    journal_id = self.journal.localidentifier
    issue_id = issue.localidentifier
    article_id = article.localidentifier
    url = reverse('public:journal:article_raw_xml',
                  args=(journal_id, issue.volume_slug, issue_id, article_id))
    request = self.factory.get(url)
    request.user = AnonymousUser()
    request.subscription = None
    # Run
    response = ArticleXmlView.as_view()(request, journal_code=journal_id,
                                        issue_slug=issue.volume_slug,
                                        issue_localid=issue_id,
                                        localid=article_id)
    # Check
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response['Content-Type'], 'application/xml')
def handle(self, *paths, **options):
    if not len(paths):
        raise CommandError('Please specify path to content for import.')
    if len(paths) > 1:
        # this limitation is kind of arbitrary, but keep things simple for now
        raise CommandError('Import currently only supports a single volume.')
    path = paths[0]
    dry_run = options.get('dry_run', False)
    verbosity = options.get('verbosity', self.v_normal)
    repo = ManagementRepository()
    # make collection required to avoid accidentally forgetting it
    coll = options.get('collection', None)
    if coll is None:
        raise CommandError('Please specify collection pid')
    collection = repo.get_object(coll, type=Collection)
    if not collection.exists:
        raise CommandError('Collection %s does not exist' % coll)
    if not collection.has_requisite_content_models:
        raise CommandError('%s is not a collection' % coll)
    try:
        start = time.time()
        bag = bagit.Bag(path)
        # NOTE: could consider using fast validation, but files probably are
        # not so large or so numerous that this will be an issue
        if verbosity > self.v_normal:
            self.stdout.write('Validating bag %s' % path)
        fast_validate = options.get('fast_validate')
        bag.validate(fast=fast_validate)
        if verbosity >= self.v_normal:
            self.stdout.write('Validated %s in %.02fs %s' %
                              (path, time.time() - start,
                               '(fast validation enabled)' if fast_validate else ''))
    except bagit.BagError as err:
        # failed to load directory as a bag
        raise CommandError('Please supply a valid BagIt as input. %s' % err)
    except bagit.BagValidationError as err:
        # bag is not valid
        raise CommandError('Input is not a valid bag. %s' % err)

    files = {'pdf': None, 'marcxml': None, 'dc': None}
    checksums = {}
    # this is potentially a long list, but go ahead and store since we will
    # be consulting it multiple times
    payload_files = list(bag.payload_files())
    # identify required contents within the bag by extension and name
    for data_path in payload_files:
        # path is relative to bag root dir
        filename = os.path.join(path, data_path)
        # get extension and name
        basename = os.path.basename(filename)
        basefile, ext = os.path.splitext(basename)
        # NOTE: splitext leaves . on the ext portion
        if ext.lower() == '.pdf':
            files['pdf'] = filename
            checksums['pdf'] = bag.entries[data_path].get('md5', None)
        elif ext.lower() == '.xml':
            if basefile.lower() == 'marc':
                files['marcxml'] = filename
                checksums['marcxml'] = bag.entries[data_path].get('md5', None)
            elif basefile.lower() == 'dc':
                files['dc'] = filename
                checksums['dc'] = bag.entries[data_path].get('md5', None)

    # check that required components are present
    err = False
    for label, filepath in files.iteritems():
        if filepath is None:
            self.stderr.write('%s not found' % label.upper())
            err = True
        elif checksums[label] is None:
            self.stderr.write('No MD5 checksum found for %s' % label.upper())
            err = True
    if err:
        raise CommandError('Cannot import without all required files and checksums.')

    # all pieces are available, so proceed with ingest
    # construct book and ingest
    if verbosity > self.v_normal:
        self.stdout.write('Creating book object with marcxml %s' % files['marcxml'])
    try:
        marcxml = load_xmlobject_from_file(files['marcxml'], MinMarcxml)
    except XMLSyntaxError as err:
        raise CommandError('Failed to load %s as xml: %s' % (files['marcxml'], err))
    try:
        dcxml = load_xmlobject_from_file(files['dc'], DublinCore)
    except XMLSyntaxError as err:
        raise CommandError('Failed to load %s as xml: %s' % (files['dc'], err))

    # look for book by ocm number first, in case a previous ingest failed
    book_pids = Book.pids_by_label(marcxml.ocm_number)
    # error if we find more than one
    if len(book_pids) > 1:
        raise CommandError('Multiple books exist with label %s. Please correct this first.'
                           % marcxml.ocm_number)
    # if we find exactly one, use that instead of creating a new book
    elif len(book_pids) == 1:
        book = repo.get_object(book_pids[0], type=Book)
        if verbosity >= self.v_normal:
            self.stdout.write('Using existing book %s with ocm number %s' %
                              (book.pid, marcxml.ocm_number))
    # otherwise, ingest new book
    else:
        book = repo.get_object(type=Book)
        # set book label to ocm number from the marc
        book.label = marcxml.ocm_number
        if verbosity > self.v_normal:
            self.stdout.write('Book label %s' % book.label)
        # associate with collection
        if collection is not None:
            book.collection = collection
            if verbosity > self.v_normal:
                self.stdout.write('Associating with collection %s' %
                                  collection.short_label)
        book.marcxml.content = marcxml
        # NOTE: import checksum can't be used because xml may be serialized differently
        # book.marcxml.checksum = checksums['marcxml']
        book.dc.content = dcxml
        # NOTE: import checksum can't be used because DC is modified to add ARK
        # book.dc.checksum = checksums['dc']
        # save; bail if error
        if not dry_run:
            try:
                saved = book.save('ingest')
                if not saved:
                    raise CommandError('Failed to ingest book into repository')
                if verbosity >= self.v_normal:
                    self.stdout.write('Successfully ingested book %s' % book.pid)
            except RequestFailed as err:
                raise CommandError('Error ingesting book: %s' % err)

    # in case of pre-existing book object, check for existing volume
    if book.volume_set:
        if len(book.volume_set) > 1:
            raise CommandError('Book %s has multiple volumes; import not supported'
                               % book.pid)
        else:
            # use existing volume object
            vol = book.volume_set[0]
            if verbosity >= self.v_normal:
                self.stdout.write('Using existing volume %s' % vol.pid)
    # otherwise, create new volume object
    else:
        # construct volume (v1.1), associate with book, and ingest
        if verbosity > self.v_normal:
            self.stdout.write('Creating volume with %s' % files['pdf'])
        with open(files['pdf']) as pdf_file:
            vol = repo.get_object(type=VolumeV1_1)
            # set volume label to ocm number from the marc + volume number;
            # for consistency with lsdi content, use ocm_v# notation
            # V.0 indicates single-volume book
            vol.label = '%s_V.0' % marcxml.ocm_number
            # set pdf content
            vol.pdf.content = pdf_file
            vol.pdf.checksum = checksums['pdf']
            # set relation to parent book object
            vol.book = book
            # minimal DC metadata derived from book metadata
            vol.dc.content.title = book.dc.content.title
            for t in book.dc.content.type_list:
                vol.dc.content.type_list.append(t)
            vol.dc.content.format = book.dc.content.format
            vol.dc.content.language = book.dc.content.language
            vol.dc.content.rights = book.dc.content.rights
            if not dry_run:
                try:
                    saved = vol.save('ingest')
                    if not saved:
                        # NOTE: possibly, if this fails, we should deactivate the
                        # book object, but will leave that to manual processing for now
                        raise CommandError('Failed to ingest volume into repository')
                    else:
                        if verbosity >= self.v_normal:
                            self.stdout.write('Successfully ingested volume %s' % vol.pid)
                except RequestFailed as err:
                    raise CommandError('Error ingesting volume: %s' % err)

    #### page import
    # if volume has existing pages, bail
    if len(vol.pages):
        raise CommandError('Volume %s already has %s page%s' %
                           (vol.pid, len(vol.pages),
                            '' if len(vol.pages) == 1 else 's'))
    # should page import happen here?
    # - identify numeric jp2/jpf files in the bag and get total count
    # - identify numeric .xml files in the bag and get total count
    # - make sure counts match up
    # Question: can we assume no start/end blank pages for now?
    # - start looping through, create page-1.1 and associate with book,
    #   and ingest
    # - set first page as primary image on the volume
    # - report number of pages ingested
    image_files = []
    # identify page files (images and ocr xml)
    for data_path in payload_files:
        # get extension and name
        basename = os.path.basename(data_path)
        basefile, ext = os.path.splitext(basename)
        if ext in ['.jp2', '.jpf']:
            image_files.append(data_path)
            # check that MD5 is present and bail if not
            # - this is probably redundant since by this point validation
            #   has passed and previous content has checksums, but
            #   ingest will assume checksums are available so better to error
            #   *before* starting to ingest page-level content
            if bag.entries[data_path].get('md5', None) is None:
                raise CommandError('No MD5 checksum for %s' % data_path)
    # ensure pages are sorted into page-order
    image_files.sort()
    # NOTE: disabled for now; tunebook does not appear to include alto
    # for pages with no text content
    ## find matching page ocr files
    # for imgfile in image_files:
    #     basefile, ext = os.path.splitext(imgfile)
    #     ocrfile = '%s.xml' % basefile
    #     if ocrfile not in payload_files:
    #         raise CommandError('No OCR xml page present for %s (expected %s)' %
    #                            (imgfile, ocrfile))

    # pre-generate empty xml in case we need it to force eulfedora to not
    # create ocr datastream when no ocr is present
    emptyxml = load_xmlobject_from_string('<empty/>')
    # iterate through page images and put into fedora
    pageindex = 1
    for imgfile in image_files:
        if verbosity > self.v_normal:
            print 'Creating Page object for %s' % imgfile
        # path is relative to bag root dir
        img_filename = os.path.join(path, imgfile)
        page = repo.get_object(type=PageV1_1)
        # set page label
        page.label = '%s page %d' % (vol.label, pageindex)
        # set the relation to the volume object
        page.volume = vol
        logger.debug('Page %s volume %s' % (page.pid, page.volume.pid))
        # set a dc:title based on volume title
        page.dc.content.title = '%s page %d' % (vol.dc.content.title, pageindex)
        # set page order
        page.page_order = pageindex
        with open(img_filename) as img_content:
            # set image content
            page.image.content = img_content
            page.image.checksum = bag.entries[imgfile]['md5']
            # assume jpeg2000 for now (only looking for jp2/jpf)
            page.image.mimetype = 'image/jp2'
            # check for ocr xml within the bag, same base name as image
            basefile, ext = os.path.splitext(imgfile)
            ocrfile = '%s.xml' % basefile
            if ocrfile in payload_files:
                page.ocr.content = load_xmlobject_from_file(
                    os.path.join(path, ocrfile))
                # NOTE: can't use MD5 from bag because XML may be
                # serialized differently when sent to Fedora
                # (unless we treat as file instead of xml...)
                # page.ocr.checksum = bag.entries[ocrfile]['md5']
                if verbosity > self.v_normal:
                    print 'Setting OCR for Page from %s' % ocrfile
            else:
                # warn but do not error if ocr xml is not found
                self.stdout.write('Warning: no OCR xml found for %s' % imgfile)
                # explicitly set xml content to empty so eulfedora doesn't
                # attempt to bootstrap & ingest (and error)
                page.ocr.content = emptyxml
            if not dry_run:
                try:
                    # for now, if any page ingest errors, bail out
                    # (unclear what would cause load to fail midway)
                    saved = page.save()
                    if not saved:
                        raise CommandError('Failed to ingest page %d into repository'
                                           % pageindex)
                except RequestFailed as err:
                    raise CommandError('Error ingesting page %d: %s' % (pageindex, err))
        # set first page as primary image for the volume
        if not dry_run and pageindex == 1:
            vol.primary_image = page
            vol.save('adding primary image relation')
        # increase page index for next page
        pageindex += 1
    if verbosity >= self.v_normal:
        # total is pageindex - 1 since pageindex incremented at end of loop
        self.stdout.write('Created %d pages' % (pageindex - 1))
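# --- Invocation sketch (the command name is an assumption) ---
# As a Django management command, the bag import above would be run roughly as:
#
#   python manage.py import_volume /path/to/bag --collection <collection pid>
#
# where the dry-run option (options['dry_run']) walks validation and all ingest
# steps without writing to Fedora, and the fast-validate option
# (options['fast_validate']) uses bagit's faster payload-oxum-only validation
# instead of verifying every file checksum.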
def setUp(self):
    self.account = load_xmlobject_from_file(self.FIXTURE_FILE, cerp.Account)
    self.folder = self.account.folders[0]
    self.message = self.folder.messages[0]
def setUp(self):
    self.fr6v1 = load_xmlobject_from_file(self.fr6v1_doc, abbyyocr.Document)
    self.fr8v2 = load_xmlobject_from_file(self.fr8v2_doc, abbyyocr.Document)