Beispiel #1
0
    def _parse(soup):
        """Collect journal, DOI, title and keyword metadata from a parsed article.

        Lookups that come back falsy fall through to a second element:
        ``xocs:srctitle`` -> ``prism:publicationName`` for the journal name,
        ``ce:title`` (inside ``head``) -> ``dc:title`` for the title, and
        ``ce:keyword`` (inside ``head``) -> ``dcterms:subject`` for keywords.

        :param soup: parsed document exposing ``find`` and ``find_all``.
        :return: a tuple of the same ``soup`` object and a dict with the keys
            ``'Journal'``, ``'DOI'``, ``'Title'`` and ``'Keywords'``.
        """
        read_text = ElsevierReadMetaData.get_text_or_none

        # Journal name, with a fallback element.
        journal_name = read_text(soup, 'xocs:srctitle')
        if not journal_name:
            journal_name = read_text(soup, 'prism:publicationName')

        doi = read_text(soup, 'xocs:doi')

        # Elsevier XML definition, pp. 46:
        # https://www.elsevier.com/__data/assets/pdf_file/0003/58872/ja5_tagbytag5_v1.9.5.pdf
        head_node = soup.find('head')

        title = read_text(head_node, 'ce:title', extract_ce_title)
        if not title:
            title = read_text(soup, 'dc:title')

        keywords = []
        if head_node is not None:
            # Elsevier XML definition, pp. 366: each keyword carries its
            # content in a ce:text child; keywords without one are skipped.
            keyword_texts = (
                keyword_node.find('ce:text')
                for keyword_node in head_node.find_all('ce:keyword')
            )
            keywords.extend(
                remove_consecutive_whitespaces(
                    extract_ce_text(keyword_text),
                    keep_newline=False).strip()
                for keyword_text in keyword_texts
                if keyword_text is not None
            )

        if not keywords:
            # Nothing found under <head>: use the document-level subjects.
            keywords.extend(
                subject.get_text().strip()
                for subject in soup.find_all('dcterms:subject')
            )

        metadata = {
            'Journal': journal_name,
            'DOI': doi,
            'Title': title,
            'Keywords': keywords
        }
        return soup, metadata
Beispiel #2
0
    def test_simple_text(self):
        """Plain character data inside ``ce:text`` is returned as-is."""
        markup = """<root xmlns:ce="http://www.elsevier.com/xml/common/dtd">
        <ce:text>Example</ce:text>
        </root>"""

        document = BeautifulSoup(markup, 'xml')
        extracted = extract_ce_text(document.find('ce:text'))

        self.assertEqual(extracted, 'Example')
Beispiel #3
0
    def test_space(self):
        """A ``ce:hsp`` element shows up as a single space in the output."""
        markup = """<root xmlns:ce="http://www.elsevier.com/xml/common/dtd">
            <ce:text>Sub-2<ce:hsp sp="0.25"/>μm core–shell particles</ce:text>
        </root>"""
        expected = 'Sub-2 μm core–shell particles'

        document = BeautifulSoup(markup, 'xml')
        extracted = extract_ce_text(document.find('ce:text'))

        self.assertEqual(extracted, expected)
Beispiel #4
0
    def test_named_entities(self):
        """Named entities are replaced by their characters before extraction."""
        markup = """<root xmlns:ce="http://www.elsevier.com/xml/common/dtd">
            <ce:text>&z.dshfnc; &EmptySmallSquare; &DoubleRightTee; &rscr; &jsercy;</ce:text>
        </root>"""
        expected = '┆ ◻ ⊨ 𝓇 ј'

        # Entities are resolved on the raw string, before XML parsing.
        resolved_markup = resolve_elsevier_entities(markup)
        document = BeautifulSoup(resolved_markup, 'xml')
        extracted = extract_ce_text(document.find('ce:text'))

        self.assertEqual(extracted, expected)
Beispiel #5
0
    def test_named_entity(self):
        """A literal, a named entity and a numeric reference all give 'é'."""
        markup = """<root xmlns:ce="http://www.elsevier.com/xml/common/dtd">
            <ce:text>é &eacute; &#x000E9;</ce:text>
        </root>"""
        expected = 'é é é'

        # Entities are resolved on the raw string, before XML parsing.
        resolved_markup = resolve_elsevier_entities(markup)
        document = BeautifulSoup(resolved_markup, 'xml')
        extracted = extract_ce_text(document.find('ce:text'))

        self.assertEqual(extracted, expected)
Beispiel #6
0
    def test_font_change(self):
        """Font-change wrappers are dropped; their text and spacing remain."""
        # One wrapper element per line; adjacent literals concatenate into a
        # single-line document with no added whitespace.
        markup = (
            '<root xmlns:ce="http://www.elsevier.com/xml/common/dtd">'
            '<ce:text>Test for '
            '<ce:bold>bold</ce:bold> '
            '<ce:italic>italic</ce:italic> '
            '<ce:monospace>monospace</ce:monospace> '
            '<ce:sans-serif>sans-serif</ce:sans-serif> '
            '<ce:small-caps>small-caps</ce:small-caps> '
            '</ce:text></root>'
        )
        # The trailing space before </ce:text> is expected to survive.
        expected = 'Test for bold italic monospace sans-serif small-caps '

        document = BeautifulSoup(markup, 'xml')
        extracted = extract_ce_text(document.find('ce:text'))

        self.assertEqual(extracted, expected)