def _parse(soup):
    """Collect journal, DOI, title and keywords from a parsed Elsevier XML tree.

    Tag layout follows the Elsevier XML definition:
    https://www.elsevier.com/__data/assets/pdf_file/0003/58872/ja5_tagbytag5_v1.9.5.pdf

    :param soup: parsed document object; ``find`` and ``find_all`` are
        called on it.
    :return: a tuple ``(soup, metadata)`` where ``soup`` is the object that
        was passed in and ``metadata`` is a dict with the keys ``'Journal'``,
        ``'DOI'``, ``'Title'`` and ``'Keywords'`` (a list of strings).
    """
    lookup = ElsevierReadMetaData.get_text_or_none

    # Journal name: prefer xocs:srctitle, use prism:publicationName when the
    # first lookup gives a falsy result.
    journal_name = (
        lookup(soup, 'xocs:srctitle')
        or lookup(soup, 'prism:publicationName')
    )
    doi = lookup(soup, 'xocs:doi')

    # Elsevier XML definition pp. 46: the title lives under <head>.
    head_node = soup.find('head')
    title = (
        lookup(head_node, 'ce:title', extract_ce_title)
        or lookup(soup, 'dc:title')
    )

    keywords = []
    if head_node is not None:
        # Elsevier XML definition pp. 366: each ce:keyword carries a ce:text.
        # Keywords without a ce:text child are skipped.
        text_nodes = (
            keyword_node.find('ce:text')
            for keyword_node in head_node.find_all('ce:keyword')
        )
        keywords = [
            remove_consecutive_whitespaces(
                extract_ce_text(text_node), keep_newline=False).strip()
            for text_node in text_nodes
            if text_node is not None
        ]

    # Nothing found under <head>: fall back to the dcterms:subject entries.
    if not keywords:
        keywords = [
            subject.get_text().strip()
            for subject in soup.find_all('dcterms:subject')
        ]

    metadata = {
        'Journal': journal_name,
        'DOI': doi,
        'Title': title,
        'Keywords': keywords
    }
    return soup, metadata
def test_simple_text(self):
    """A ``ce:text`` element holding only plain text yields that text."""
    markup = """<root xmlns:ce="http://www.elsevier.com/xml/common/dtd"> <ce:text>Example</ce:text> </root>"""
    document = BeautifulSoup(markup, 'xml')
    extracted = extract_ce_text(document.find('ce:text'))
    self.assertEqual(extracted, 'Example')
def test_space(self):
    """A ``ce:hsp`` element inside ``ce:text`` comes out as a single space."""
    markup = """<root xmlns:ce="http://www.elsevier.com/xml/common/dtd"> <ce:text>Sub-2<ce:hsp sp="0.25"/>μm core–shell particles</ce:text> </root>"""
    document = BeautifulSoup(markup, 'xml')
    extracted = extract_ce_text(document.find('ce:text'))
    self.assertEqual(extracted, 'Sub-2 μm core–shell particles')
def test_named_entities(self):
    """Entities are resolved before parsing and the text is then extracted.

    The markup goes through ``resolve_elsevier_entities`` first, so the
    ``&z.dshfnc;`` reference is expected to come out as the '┆' character.
    """
    markup = """<root xmlns:ce="http://www.elsevier.com/xml/common/dtd"> <ce:text>&z.dshfnc; ◻ ⊨ 𝓇 ј</ce:text> </root>"""
    resolved_markup = resolve_elsevier_entities(markup)
    document = BeautifulSoup(resolved_markup, 'xml')
    extracted = extract_ce_text(document.find('ce:text'))
    self.assertEqual(extracted, '┆ ◻ ⊨ 𝓇 ј')
def test_named_entity(self):
    """Text run through entity resolution and extraction reads 'é é é'."""
    markup = """<root xmlns:ce="http://www.elsevier.com/xml/common/dtd"> <ce:text>é é é</ce:text> </root>"""
    resolved_markup = resolve_elsevier_entities(markup)
    document = BeautifulSoup(resolved_markup, 'xml')
    extracted = extract_ce_text(document.find('ce:text'))
    self.assertEqual(extracted, 'é é é')
def test_font_change(self):
    """Font-changing wrappers are dropped and only their text is kept.

    Covers ``ce:bold``, ``ce:italic``, ``ce:monospace``, ``ce:sans-serif``
    and ``ce:small-caps``; the spaces between them (including the trailing
    one) are expected to survive.
    """
    markup = (
        '<root xmlns:ce="http://www.elsevier.com/xml/common/dtd">'
        '<ce:text>Test for <ce:bold>bold</ce:bold> '
        '<ce:italic>italic</ce:italic> <ce:monospace>monospace</ce:monospace> '
        '<ce:sans-serif>sans-serif</ce:sans-serif> <ce:small-caps>small-caps</ce:small-caps> '
        '</ce:text></root>'
    )
    document = BeautifulSoup(markup, 'xml')
    extracted = extract_ce_text(document.find('ce:text'))
    self.assertEqual(
        extracted,
        'Test for bold italic monospace sans-serif small-caps ')