def test_remove_tags_unicode():
    """Check that a non-ASCII character survives removal of its wrapping tag."""
    markup = u'<p>😋</p>'

    assert remove_tags(markup) == u'😋'
def abstract(self):
    """Return the tag-free text of the abstract, or ``None`` if there is none.

    The first node matched by ``./front//abstract[1]`` is passed to
    ``remove_tags`` together with ``self.remove_tags_config_abstract``, and
    the result is stripped of surrounding whitespace.
    """
    matches = self.root.xpath('./front//abstract[1]')
    if matches:
        cleaned = remove_tags(matches[0], **self.remove_tags_config_abstract)
        return cleaned.strip()

    return None
def test_remove_tags_strip_keeps_tails():
    """Check that stripping an element drops its content but keeps its tail."""
    markup = '<foo>This goes</foo> but this remains.'

    stripped = remove_tags(markup, strip='self::foo')

    assert stripped == u' but this remains.'
def test_remove_tags_allowed_tags_strip():
    """Check ``allowed_tags`` and ``strip`` used together.

    Only ``<b>`` is allowed, and the element with ``class="hidden"`` is
    stripped; the expected output keeps the ``<b>`` tag and the visible text.
    """
    markup = (
        '<p><b><i>Only</i></b> this text remains.'
        '<span class="hidden">Not this one.</span></p>'
    )

    cleaned = remove_tags(
        markup,
        allowed_tags=('b', ),
        strip='@class="hidden"',
    )

    assert cleaned == u'<b>Only</b> this text remains.'
def abstract(self):
    """Return the abstract as one string, or ``None`` if there is none.

    Every ``simple-para`` of a non-graphical abstract is cleaned with
    ``remove_tags`` (configured by ``self.remove_tags_config_abstract``),
    trimmed of slashes, spaces and newlines at both ends, and the
    paragraphs are joined with single spaces.
    """
    paragraph_nodes = self.root.xpath(
        ".//head/abstract[not(@graphical)]/abstract-sec/simple-para")
    if paragraph_nodes:
        return ' '.join(
            remove_tags(
                paragraph_node, **self.remove_tags_config_abstract
            ).strip("/ \n")
            for paragraph_node in paragraph_nodes
        )

    return None
def get_affiliation(self, id_):
    """Look up an affiliation by the ``id`` attribute of its ``<aff>`` node.

    Args:
        id_(str): the value of the ``id`` attribute of the affiliation.

    Returns:
        Optional[str]: the cleaned-up text of the first matching
            affiliation, or ``None`` when no ``<aff>`` has that id.
    """
    matches = self.root.xpath("//aff[@id=$id_]", id_=id_)
    if not matches:
        return None

    cleaned = remove_tags(matches[0], strip="self::label | self::email")
    return cleaned.strip()
def get_affiliation(self, id_):
    """Get the affiliation with the specified id.

    Args:
        id_(str): the value of the ``id`` attribute of the affiliation.

    Returns:
        Optional[str]: the affiliation with that id or ``None`` if there is
            no match.
    """
    affiliation_nodes = self.root.xpath('//aff[@id=$id_]', id_=id_)
    if not affiliation_nodes:
        # No <aff> carries this id: return the documented ``None`` instead
        # of raising IndexError by indexing an empty match list.
        return None

    affiliation = remove_tags(
        affiliation_nodes[0],
        strip='self::label | self::email'
    ).strip()

    return affiliation
def get_reference(self, ref_node):
    """Extract the references held by one ``<ref>`` element.

    Args:
        ref_node(scrapy.selector.Selector): a selector on a single
            reference, i.e. ``<ref>``.

    Yields:
        dict: one parsed reference per ``<mixed-citation>`` child, as
            generated by :class:`inspire_schemas.api.ReferenceBuilder`
    """
    for citation_node in ref_node.xpath('./mixed-citation'):
        builder = ReferenceBuilder()

        builder.add_raw_reference(
            ref_node.extract().strip(),
            source=self.builder.source,
            ref_format='JATS'
        )

        # (xpath, handler) pairs: the first result of each xpath, when
        # non-empty, is handed to the matching builder method.
        fields = [
            (
                (
                    'self::node()[@publication-type="journal" '
                    'or @publication-type="eprint"]/source/text()'
                ),
                builder.set_journal_title,
            ),
            (
                'self::node()[@publication-type="book"]/source/text()',
                builder.add_parent_title,
            ),
            ('./publisher-name/text()', builder.set_publisher),
            ('./volume/text()', builder.set_journal_volume),
            ('./issue/text()', builder.set_journal_issue),
            ('./year/text()', builder.set_year),
            ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
            ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
            (
                'pub-id[@pub-id-type="other"]'
                '[contains(preceding-sibling::text(),"Report No")]/text()',
                builder.add_report_number
            ),
            ('./article-title/text()', builder.add_title),
            # The label lives on the parent <ref>; surrounding brackets
            # and dots are trimmed before it is stored.
            ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
        ]

        for xpath, field_handler in fields:
            # Evaluate each xpath exactly once; the original ran the same
            # query a second time and threw the result away.
            value = citation_node.xpath(xpath).extract_first()
            if value:
                field_handler(value)

        # Whatever text is left once the listed elements are stripped is
        # kept as free-form "misc" information.
        remainder = remove_tags(
            citation_node,
            strip='self::person-group'
                  '|self::pub-id'
                  '|self::article-title'
                  '|self::volume'
                  '|self::issue'
                  '|self::year'
                  '|self::label'
                  '|self::publisher-name'
                  '|self::source[../@publication-type!="proc"]'
                  '|self::object-id'
                  '|self::page-range'
                  '|self::issn'
        ).strip('"\';,. \t\n\r').replace('()', '')
        if remainder:
            builder.add_misc(remainder)

        for editor in self.get_reference_authors(citation_node, 'editor'):
            builder.add_author(editor, 'editor')

        for author in self.get_reference_authors(citation_node, 'author'):
            builder.add_author(author, 'author')

        page_range = citation_node.xpath('./page-range/text()').extract_first()
        if page_range:
            page_artid = split_page_artid(page_range)
            builder.set_page_artid(*page_artid)

        yield builder.obj
def get_reference_iter(self, ref_node):
    """Extract the references held by one reference element.

    Args:
        ref_node(scrapy.selector.Selector): a selector on a single
            reference, i.e. ``<ref>``.

    Yields:
        dict: one parsed reference per ``reference`` or ``other-ref``
            child, as generated by
            :class:`inspire_schemas.api.ReferenceBuilder`
    """
    # handle also unstructured refs
    for citation_node in ref_node.xpath("./reference|./other-ref"):
        builder = ReferenceBuilder()

        builder.add_raw_reference(
            ref_node.extract().strip(),
            source=self.builder.source,
            ref_format="Elsevier",
        )

        # (xpath, handler) pairs: the first result of each xpath, when
        # non-empty, is handed to the matching builder method.
        fields = [
            (
                ("string(.//series/title/maintitle[1])"),
                builder.set_journal_title,
            ),
            (
                "string(.//title[parent::edited-book|parent::book]/maintitle[1])",
                builder.add_parent_title,
            ),
            ("string(./publisher/name[1])", builder.set_publisher),
            ("string(.//volume-nr[1])", builder.set_journal_volume),
            ("string(.//issue-nr[1])", builder.set_journal_issue),
            ("string(.//date[1])", builder.set_year),
            ("string(.//inter-ref[1])", builder.add_url),
            ("string(.//doi[1])", builder.add_uid),
            (
                'string(pub-id[@pub-id-type="other"]'
                '[contains(preceding-sibling::text(),"Report No")][1])',
                builder.add_report_number,
            ),
            ("string(./title/maintitle[1])", builder.add_title),
        ]

        for xpath, field_handler in fields:
            # Evaluate each xpath exactly once; the original ran the same
            # query a second time and threw the result away.
            value = citation_node.xpath(xpath).extract_first()
            if value:
                field_handler(value)

        # The label is read from the enclosing reference node, with any
        # surrounding square brackets trimmed.
        label_value = ref_node.xpath("string(./label[1])").extract_first()
        builder.set_label(label_value.strip("[]"))

        pages = self.get_reference_pages(citation_node)
        artid = self.get_reference_artid(citation_node)
        if artid:
            builder.set_page_artid(artid=artid)
        if any(pages):
            builder.set_page_artid(*pages)

        # Whatever text is left once the listed elements are stripped is
        # kept as free-form "misc" information.
        remainder = (remove_tags(
            citation_node,
            strip="self::authors"
                  "|self::article-number"
                  "|self::volume-nr"
                  "|self::issue-nr"
                  "|self::inter-ref"
                  "|self::maintitle"
                  "|self::date"
                  "|self::label"
                  "|self::publisher"
                  "|self::doi"
                  "|self::pages").strip("\"';,. \t\n\r").replace("()", ""))
        if remainder:
            builder.add_misc(remainder)

        for editor in self.get_reference_editors(citation_node):
            builder.add_author(editor, "editor")

        for author in self.get_reference_authors(citation_node):
            builder.add_author(author, "author")

        yield builder.obj