def conflict(soup):
    """Gather the conflict-of-interest footnote nodes found in ``soup``.

    Two ``extract_nodes`` lookups are made for ``fn`` tags, one per accepted
    ``fn-type`` value ("conflict" and "COI-statement"). The second result is
    added onto the first, so "conflict" matches come before "COI-statement"
    matches in the returned collection.
    """
    coi_nodes = extract_nodes(soup, "fn", attr="fn-type", value="conflict")
    coi_nodes += extract_nodes(
        soup, "fn", attr="fn-type", value="COI-statement"
    )
    return coi_nodes
def custom_meta(soup, meta_name=None):
    """Look up ``custom-meta`` tags, optionally restricted by meta name.

    Args:
        soup: node to search with ``extract_nodes``.
        meta_name: when not ``None``, keep only the ``custom-meta`` tags
            whose first ``meta-name`` child, rendered with
            ``node_contents_str``, equals this value.

    Returns:
        The unfiltered ``extract_nodes`` result when ``meta_name`` is
        ``None``, otherwise a new list with the matching tags.
    """
    all_meta_tags = extract_nodes(soup, "custom-meta")
    if meta_name is None:
        return all_meta_tags

    matching = []
    for meta_tag in all_meta_tags:
        name_tag = first(extract_nodes(meta_tag, "meta-name"))
        if node_contents_str(name_tag) == meta_name:
            matching.append(meta_tag)
    return matching
def fn_group(soup, content_type=None):
    """Look up ``fn-group`` tags in ``soup``.

    Args:
        soup: node to search with ``extract_nodes``.
        content_type: when truthy, only tags whose ``content-type``
            attribute matches are requested; any falsy value (``None``,
            ``""``) requests every ``fn-group`` tag.
    """
    if not content_type:
        return extract_nodes(soup, "fn-group")
    return extract_nodes(
        soup, "fn-group", attr="content-type", value=content_type
    )
def pub_id(soup, pub_id_type=None):
    """Look up ``pub-id`` tags in ``soup``.

    Args:
        soup: node to search with ``extract_nodes``.
        pub_id_type: when truthy, only tags whose ``pub-id-type`` attribute
            matches are requested; any falsy value requests every
            ``pub-id`` tag.
    """
    if not pub_id_type:
        return extract_nodes(soup, "pub-id")
    return extract_nodes(
        soup, "pub-id", attr="pub-id-type", value=pub_id_type
    )
def ext_link(soup, ext_link_type=None):
    """Look up ``ext-link`` tags in ``soup``.

    Args:
        soup: node to search with ``extract_nodes``.
        ext_link_type: when truthy, only tags whose ``ext-link-type``
            attribute matches are requested; any falsy value requests
            every ``ext-link`` tag.
    """
    if not ext_link_type:
        return extract_nodes(soup, "ext-link")
    return extract_nodes(
        soup, "ext-link", attr="ext-link-type", value=ext_link_type
    )
def abstract(soup, abstract_type=None):
    """Look up ``abstract`` tags in ``soup``.

    Args:
        soup: node to search with ``extract_nodes``.
        abstract_type: when truthy, only tags whose ``abstract-type``
            attribute matches are requested; any falsy value requests
            every ``abstract`` tag.
    """
    if not abstract_type:
        return extract_nodes(soup, "abstract")
    return extract_nodes(
        soup, "abstract", attr="abstract-type", value=abstract_type
    )
def authors(soup, contrib_type="author"):
    """Look up ``contrib`` tags in ``soup``.

    Args:
        soup: node to search with ``extract_nodes``.
        contrib_type: ``contrib-type`` attribute value to match; defaults
            to ``"author"``. Passing a falsy value (``None`` or ``""``)
            drops the attribute filter and requests every ``contrib`` tag.
    """
    if not contrib_type:
        return extract_nodes(soup, "contrib")
    return extract_nodes(
        soup, "contrib", attr="contrib-type", value=contrib_type
    )
def pub_date(soup, date_type=None, pub_type=None):
    """Look up ``pub-date`` tags in ``soup``.

    Args:
        soup: node to search with ``extract_nodes``.
        date_type: when not ``None``, match on the ``date-type`` attribute.
            Takes precedence over ``pub_type`` if both are supplied.
        pub_type: when not ``None`` (and ``date_type`` is ``None``), match
            on the ``pub-type`` attribute.

    With both arguments ``None`` every ``pub-date`` tag is requested.
    """
    if date_type is not None:
        attr_name, attr_value = "date-type", date_type
    elif pub_type is not None:
        attr_name, attr_value = "pub-type", pub_type
    else:
        return extract_nodes(soup, "pub-date")
    return extract_nodes(soup, "pub-date", attr=attr_name, value=attr_value)
def journal_issn(soup, pub_format, pub_type):
    """Return ``first()`` of the ``issn`` tags selected by the arguments.

    Args:
        soup: node to search with ``extract_nodes``.
        pub_format: when not ``None``, match on the ``publication-format``
            attribute. Takes precedence over ``pub_type``.
        pub_type: when not ``None`` (and ``pub_format`` is ``None``), match
            on the ``pub-type`` attribute.
    """
    if pub_format is not None:
        issn_tags = extract_nodes(
            soup, "issn", attr="publication-format", value=pub_format
        )
    elif pub_type is not None:
        issn_tags = extract_nodes(
            soup, "issn", attr="pub-type", value=pub_type
        )
    else:
        # neither filter given: take the first issn tag of any type
        issn_tags = extract_nodes(soup, "issn")
    return first(issn_tags)
def journal_id(soup):
    """Return ``firstnn()`` of the publisher-id ``journal-id`` tags.

    The candidates are the ``journal-id`` tags whose ``journal-id-type``
    attribute is ``"publisher-id"``.
    """
    publisher_id_tags = extract_nodes(
        soup, "journal-id", attr="journal-id-type", value="publisher-id"
    )
    # the first non-nil tag
    return firstnn(publisher_id_tags)
def author_keywords(soup):
    """Collect ``kwd`` children of the author keyword groups in ``soup``.

    A ``kwd-group`` contributes its direct children named ``kwd`` when its
    ``kwd-group-type`` attribute is ``"author-keywords"`` or is absent.

    Returns:
        A list of the collected ``kwd`` nodes (empty when none match).
    """
    collected = []
    for group in extract_nodes(soup, "kwd-group"):
        group_type = group.get("kwd-group-type")
        # A few articles have kwd-group with no kwd-group-type, so
        # untyped groups are accepted as well
        if group_type == "author-keywords" or group_type is None:
            collected.extend(
                child for child in group if child.name == "kwd"
            )
    return collected
def article_contributors(soup):
    """List contributor tags that sit directly inside a ``contrib-group``.

    The search is limited to the node returned by ``article_meta(soup)``
    and covers both ``contrib`` and ``on-behalf-of`` tags.

    Returns:
        A list of matching tags, or ``None`` when ``article_meta(soup)``
        yields a falsy value.
    """
    meta_tag = article_meta(soup)
    if not meta_tag:
        return None

    candidates = extract_nodes(meta_tag, ["contrib", "on-behalf-of"])
    direct_members = []
    for candidate in candidates:
        if candidate.parent.name == "contrib-group":
            direct_members.append(candidate)
    return direct_members
def research_organism_keywords(soup):
    """Return the ``kwd`` children of the research-organism keyword group.

    Only ``first()`` of the ``kwd-group`` tags whose ``kwd-group-type`` is
    ``"research-organism"`` is inspected.

    Returns:
        A non-empty list of ``kwd`` nodes, or ``None`` when there is no
        such group or it has no ``kwd`` children.
    """
    organism_group = first(
        extract_nodes(
            soup,
            "kwd-group",
            attr="kwd-group-type",
            value="research-organism",
        )
    )
    if not organism_group:
        return None

    keywords = [child for child in organism_group if child.name == "kwd"]
    if not keywords:
        return None
    return keywords
def full_subject_area(soup, subject_group_type=None):
    """Return article-level ``subj-group`` tags, optionally filtered by type.

    A ``subj-group`` tag is kept only when its parent is
    ``article-categories`` and its grandparent is ``article-meta``.

    Args:
        soup: node to search with ``extract_nodes``.
        subject_group_type: when truthy, additionally keep only the tags
            whose ``subj-group-type`` attribute equals this value.

    Returns:
        A list of the matching ``subj-group`` tags.
    """
    subject_group_tags = [
        tag
        for tag in extract_nodes(soup, "subj-group")
        if tag.parent.name == "article-categories"
        and tag.parent.parent.name == "article-meta"
    ]
    if subject_group_type:
        # A comprehension is used on purpose: the previous
        # ``list(filter(...))`` passed no iterable to ``filter`` (TypeError),
        # compared inside ``tag.get(...)`` instead of comparing its result,
        # and ``list`` in this module is rebound to a tag-lookup function.
        subject_group_tags = [
            tag
            for tag in subject_group_tags
            if tag.get("subj-group-type") == subject_group_type
        ]
    return subject_group_tags
def subject_area(soup, subject_group_type=None):
    """Return ``subject`` tags, either article-level ones or those of a type.

    Supports all subject areas or just particular ones filtered by
    ``subject_group_type``.

    Args:
        soup: node to search with ``extract_nodes``.
        subject_group_type: when truthy, return every ``subject`` tag whose
            parent has this ``subj-group-type`` attribute value.

    Returns:
        Without a type: the ``subject`` tags nested as
        ``article-meta > article-categories > subj-group > subject``.
        With a type: the ``subject`` tags selected by the parent's
        ``subj-group-type`` only.
    """
    subject_tags = extract_nodes(soup, "subject")

    article_level = []
    for node in subject_tags:
        group = node.parent
        if group.name != "subj-group":
            continue
        categories = group.parent
        if categories.name != "article-categories":
            continue
        if categories.parent.name == "article-meta":
            article_level.append(node)

    if not subject_group_type:
        return article_level

    # NOTE(review): the typed lookup filters the full ``subject`` set, not
    # ``article_level``, so the article-meta ancestry check does not apply
    # here -- confirm whether that is intended before changing it.
    return [
        node
        for node in subject_tags
        if node.parent.get("subj-group-type") == subject_group_type
    ]
def fig_group(soup):
    """Return the ``extract_nodes`` result for ``fig-group`` tags in ``soup``."""
    fig_group_tags = extract_nodes(soup, "fig-group")
    return fig_group_tags
def string_name(soup):
    """Return the ``extract_nodes`` result for ``string-name`` tags in ``soup``."""
    string_name_tags = extract_nodes(soup, "string-name")
    return string_name_tags
def principal_award_recipient(soup):
    """Return the ``extract_nodes`` result for ``principal-award-recipient`` tags."""
    recipient_tags = extract_nodes(soup, "principal-award-recipient")
    return recipient_tags
def award_group(soup):
    """Return the ``extract_nodes`` result for ``award-group`` tags in ``soup``."""
    award_group_tags = extract_nodes(soup, "award-group")
    return award_group_tags
def funding_group(soup):
    """Return the ``extract_nodes`` result for ``funding-group`` tags in ``soup``."""
    funding_group_tags = extract_nodes(soup, "funding-group")
    return funding_group_tags
def journal_title(soup):
    """Return ``first()`` of the ``journal-title`` tags found in ``soup``."""
    title_tags = extract_nodes(soup, "journal-title")
    return first(title_tags)
def article_type(soup):
    """Return the ``article-type`` attribute of the first ``article`` tag.

    Returns raw data, just that the data doesn't contain any BS nodes.

    Returns:
        The attribute value from ``.get("article-type")``, or ``None`` when
        ``first()`` finds no ``article`` tag in ``soup``.
    """
    article_tag = first(extract_nodes(soup, "article"))
    if article_tag is None:
        # No article tag: report "no type" rather than failing with
        # AttributeError on ``None.get``.
        return None
    return article_tag.get("article-type")
def article_meta(soup):
    """Return ``first()`` of the ``article-meta`` tags found in ``soup``."""
    meta_tags = extract_nodes(soup, "article-meta")
    return first(meta_tags)
def fig(soup):
    """Return the ``extract_nodes`` result for ``fig`` tags in ``soup``."""
    fig_tags = extract_nodes(soup, "fig")
    return fig_tags
def math(soup):
    """Return the ``extract_nodes`` result for ``math`` tags in ``soup``."""
    math_tags = extract_nodes(soup, "math")
    return math_tags
def disp_formula(soup):
    """Return the ``extract_nodes`` result for ``disp-formula`` tags in ``soup``."""
    formula_tags = extract_nodes(soup, "disp-formula")
    return formula_tags
def publisher(soup):
    """Return ``first()`` of the ``publisher-name`` tags found in ``soup``."""
    publisher_name_tags = extract_nodes(soup, "publisher-name")
    return first(publisher_name_tags)
def list(soup):
    """Return the ``extract_nodes`` result for ``list`` tags in ``soup``.

    Redefining ``list`` could be problematic: inside this module the name
    refers to this function rather than the builtin type, so code here
    must not call ``list(...)`` expecting a sequence conversion.
    """
    list_tags = extract_nodes(soup, "list")
    return list_tags
def boxed_text(soup):
    """Return the ``extract_nodes`` result for ``boxed-text`` tags in ``soup``."""
    boxed_text_tags = extract_nodes(soup, "boxed-text")
    return boxed_text_tags
def list_item(soup):
    """Return the ``extract_nodes`` result for ``list-item`` tags in ``soup``."""
    list_item_tags = extract_nodes(soup, "list-item")
    return list_item_tags