def parse_article_info(medline, year_info_only, nlm_category, author_list):
    """Parse an article node from a Medline document into a dictionary.

    Parameters
    ----------
    medline: Element
        The lxml node pointing to a medline document
    year_info_only: bool
        see: date_extractor()
    nlm_category: bool
        see: parse_medline_xml()
    author_list: bool
        if True, return authors as the raw list of author dictionaries;
        else return a semicolon-joined string plus a separate
        ``affiliations`` field

    Returns
    -------
    article: dict
        Dictionary containing information about the article, including
        `title`, `abstract`, `journal`, `authors`, `affiliations`,
        `pubdate`, `pmid`, `other_id`, `mesh_terms`, and `keywords`.
        The field `delete` is always `False` because this function
        parses articles that by definition are not deleted.
    """
    article = medline.find('Article')

    if article.find('ArticleTitle') is not None:
        title = stringify_children(article.find('ArticleTitle')).strip() or ''
    else:
        title = ''

    # Structured abstracts label their sections with either NlmCategory
    # or Label attributes, selected by the nlm_category flag.
    category = 'NlmCategory' if nlm_category else 'Label'
    if article.find('Abstract/AbstractText') is not None:
        # parsing structured abstract
        if len(article.findall('Abstract/AbstractText')) > 1:
            abstract_list = list()
            for abstract in article.findall('Abstract/AbstractText'):
                section = abstract.attrib.get(category, '')
                if section != 'UNASSIGNED':
                    abstract_list.append('\n')
                    abstract_list.append(abstract.attrib.get(category, ''))
                section_text = stringify_children(abstract).strip()
                abstract_list.append(section_text)
            abstract = '\n'.join(abstract_list).strip()
        else:
            abstract = stringify_children(article.find('Abstract/AbstractText')).strip() or ''
    elif article.find('Abstract') is not None:
        abstract = stringify_children(article.find('Abstract')).strip() or ''
    else:
        abstract = ''

    authors_dict = parse_author_affiliation(medline)
    if not author_list:
        # BUG FIX: original filtered with `is not ''` (identity comparison
        # against a string literal) — implementation-dependent and a
        # SyntaxWarning on Python >= 3.8. Truthiness gives the intended
        # "skip empty affiliations" semantics.
        affiliations = ';'.join(
            author.get('affiliation', '')
            for author in authors_dict
            if author.get('affiliation', '')
        )
        authors = ';'.join(
            author.get('firstname', '') + ' ' + author.get('lastname', '')
            for author in authors_dict
        )
    else:
        authors = authors_dict

    journal = article.find('Journal')
    journal_name = ' '.join(journal.xpath('Title/text()'))
    pubdate = date_extractor(journal, year_info_only)
    pmid = parse_pmid(medline)
    doi = parse_doi(medline)
    mesh_terms = parse_mesh_terms(medline)
    publication_types = parse_publication_types(medline)
    chemical_list = parse_chemical_list(medline)
    keywords = parse_keywords(medline)
    other_id_dict = parse_other_id(medline)
    journal_info_dict = parse_journal_info(medline)

    dict_out = {
        'title': title,
        'abstract': abstract,
        'journal': journal_name,
        'authors': authors,
        'pubdate': pubdate,
        'pmid': pmid,
        'mesh_terms': mesh_terms,
        'publication_types': publication_types,
        'chemical_list': chemical_list,
        'keywords': keywords,
        'doi': doi,
        'delete': False
    }
    # Affiliations only exist in the flattened (string) representation.
    if not author_list:
        dict_out.update({'affiliations': affiliations})
    dict_out.update(other_id_dict)
    dict_out.update(journal_info_dict)
    return dict_out
def parse_article_info(pubmed_article, year_info_only, nlm_category,
                       author_list, reference_list):
    """Parse an article node from a Medline document into a dictionary.

    Parameters
    ----------
    pubmed_article: Element
        The lxml element pointing to a medline document
    year_info_only: bool
        see more details in date_extractor()
    nlm_category: bool
        see more details in parse_medline_xml()
    author_list: bool
        if True, return authors as the raw list of author dictionaries;
        else return a semicolon-joined string plus a separate
        ``affiliations`` field
    reference_list: bool
        if True, parse reference list as an output

    Returns
    -------
    article: dict
        Dictionary containing information about the article, including
        `title`, `abstract`, `journal`, `authors`, `affiliations`,
        `pubdate`, `pmid`, `other_id`, `mesh_terms`, and `keywords`.
        The field `delete` is always `False` because this function
        parses articles that by definition are not deleted.
    """
    medline = pubmed_article.find("MedlineCitation")
    article = medline.find("Article")

    if article.find("ArticleTitle") is not None:
        title = stringify_children(article.find("ArticleTitle")).strip() or ""
    else:
        title = ""

    # Structured abstracts label their sections with either NlmCategory
    # or Label attributes, selected by the nlm_category flag.
    category = "NlmCategory" if nlm_category else "Label"
    if article.find("Abstract/AbstractText") is not None:
        # parsing structured abstract
        if len(article.findall("Abstract/AbstractText")) > 1:
            abstract_list = list()
            for abstract in article.findall("Abstract/AbstractText"):
                section = abstract.attrib.get(category, "")
                if section != "UNASSIGNED":
                    abstract_list.append("\n")
                    abstract_list.append(abstract.attrib.get(category, ""))
                section_text = stringify_children(abstract).strip()
                abstract_list.append(section_text)
            abstract = "\n".join(abstract_list).strip()
        else:
            abstract = (stringify_children(
                article.find("Abstract/AbstractText")).strip() or "")
    elif article.find("Abstract") is not None:
        abstract = stringify_children(article.find("Abstract")).strip() or ""
    else:
        abstract = ""

    authors_dict = parse_author_affiliation(medline)
    if not author_list:
        # BUG FIX: original filtered with `is not ""` (identity comparison
        # against a string literal) — implementation-dependent and a
        # SyntaxWarning on Python >= 3.8. Truthiness gives the intended
        # "skip empty affiliations" semantics.
        affiliations = ";".join(
            author.get("affiliation", "")
            for author in authors_dict
            if author.get("affiliation", "")
        )
        authors = ";".join(
            author.get("firstname", "") + " " + author.get("lastname", "")
            for author in authors_dict
        )
    else:
        authors = authors_dict

    journal = article.find("Journal")
    journal_name = " ".join(journal.xpath("Title/text()"))
    # PMID/DOI/references live on the PubmedArticle wrapper, not the
    # MedlineCitation node.
    pmid = parse_pmid(pubmed_article)
    doi = parse_doi(pubmed_article)
    references = parse_references(pubmed_article, reference_list)
    pubdate = date_extractor(journal, year_info_only)
    mesh_terms = parse_mesh_terms(medline)
    publication_types = parse_publication_types(medline)
    chemical_list = parse_chemical_list(medline)
    keywords = parse_keywords(medline)
    other_id_dict = parse_other_id(medline)
    journal_info_dict = parse_journal_info(medline)

    dict_out = {
        "title": title,
        "abstract": abstract,
        "journal": journal_name,
        "authors": authors,
        "pubdate": pubdate,
        "pmid": pmid,
        "mesh_terms": mesh_terms,
        "publication_types": publication_types,
        "chemical_list": chemical_list,
        "keywords": keywords,
        "doi": doi,
        "references": references,
        "delete": False,
    }
    # Affiliations only exist in the flattened (string) representation.
    if not author_list:
        dict_out.update({"affiliations": affiliations})
    dict_out.update(other_id_dict)
    dict_out.update(journal_info_dict)
    return dict_out
def parse_article_info(medline, year_info_only):
    """Parse an article node from a Medline document into a dictionary.

    Parameters
    ----------
    medline: Element
        The lxml node pointing to a medline document
    year_info_only: bool
        see: date_extractor().

    Returns
    -------
    article: dict
        Dictionary containing information about the article, including
        `title`, `abstract`, `journal`, `author`, `affiliation`,
        `pubdate`, `pmid`, `other_id`, `mesh_terms`, and `keywords`.
        The field `delete` is always `False` because this function
        parses articles that by definition are not deleted.
    """
    article = medline.find('Article')

    if article.find('ArticleTitle') is not None:
        title = stringify_children(article.find('ArticleTitle')).strip() or ''
    else:
        title = ''

    if article.find('Abstract/AbstractText') is not None:
        abstract = stringify_children(article.find('Abstract/AbstractText')).strip() or ''
    elif article.find('Abstract') is not None:
        abstract = stringify_children(article.find('Abstract')).strip() or ''
    else:
        abstract = ''

    if article.find('AuthorList') is not None:
        authors = article.find('AuthorList').getchildren()
        authors_info = list()
        affiliations_info = list()
        for author in authors:
            # Medline stores given-name initials in <Initials>.
            if author.find('Initials') is not None:
                firstname = author.find('Initials').text or ''
            else:
                firstname = ''
            if author.find('LastName') is not None:
                lastname = author.find('LastName').text or ''
            else:
                lastname = ''
            if author.find('AffiliationInfo/Affiliation') is not None:
                affiliation = author.find('AffiliationInfo/Affiliation').text or ''
            else:
                affiliation = ''
            authors_info.append((firstname + ' ' + lastname).strip())
            affiliations_info.append(affiliation)
        # BUG FIX: original filtered with `a is not ''` (identity comparison
        # against a string literal) — implementation-dependent and a
        # SyntaxWarning on Python >= 3.8. Truthiness drops empty entries.
        affiliations_info = ' '.join([a for a in affiliations_info if a])
        authors_info = '; '.join(authors_info)
    else:
        affiliations_info = ''
        authors_info = ''

    journal = article.find('Journal')
    journal_name = ' '.join(journal.xpath('Title/text()'))
    pubdate = date_extractor(journal, year_info_only)
    pmid = parse_pmid(medline)
    mesh_terms = parse_mesh_terms(medline)
    keywords = parse_keywords(medline)
    other_id_dict = parse_other_id(medline)
    journal_info_dict = parse_journal_info(medline)

    dict_out = {'title': title,
                'abstract': abstract,
                'journal': journal_name,
                'author': authors_info,
                'affiliation': affiliations_info,
                'pubdate': pubdate,
                'pmid': pmid,
                'mesh_terms': mesh_terms,
                'keywords': keywords,
                'delete': False}
    dict_out.update(other_id_dict)
    dict_out.update(journal_info_dict)
    return dict_out
def parse_article_info_abcam(pubmed_article):
    """Parse a PubmedArticle node into the dictionary shape Abcam requires.

    Parameters
    ----------
    pubmed_article: Element
        The lxml element pointing to a medline document

    Returns
    -------
    dict
        Article fields keyed per the Abcam schema: PMID, PMCID, DOI,
        Title, Abstract, Language, Journal, JournalAbv, Year, Authors,
        References, IngestionTime, and delete (always False).
    """
    # Fixed parsing options for this pipeline.
    year_info_only = True
    nlm_category = True
    reference_list = True

    medline = pubmed_article.find("MedlineCitation")
    article = medline.find("Article")

    title_node = article.find("ArticleTitle")
    if title_node is None:
        title = ""
    else:
        title = stringify_children(title_node).strip() or ""

    category = "NlmCategory" if nlm_category else "Label"
    abstract_nodes = article.findall("Abstract/AbstractText")
    if abstract_nodes:
        if len(abstract_nodes) > 1:
            # Structured abstract: keep each section label unless it is
            # UNASSIGNED, then append the section body text.
            pieces = []
            for node in abstract_nodes:
                label = node.attrib.get(category, "")
                if label != "UNASSIGNED":
                    pieces.append("\n")
                    pieces.append(node.attrib.get(category, ""))
                pieces.append(stringify_children(node).strip())
            abstract = "\n".join(pieces).strip()
        else:
            abstract = stringify_children(abstract_nodes[0]).strip() or ""
    elif article.find("Abstract") is not None:
        abstract = stringify_children(article.find("Abstract")).strip() or ""
    else:
        abstract = ""

    authors = parse_author_affiliation(medline)
    journal = article.find("Journal")
    journal_name = " ".join(journal.xpath("Title/text()"))
    language = ["".join(node.itertext())
                for node in article.findall("Language")]
    pmid = parse_pmid(pubmed_article)
    doi = parse_doi(pubmed_article)
    pmcid = parse_pmcid(pubmed_article)
    references = parse_references(pubmed_article, reference_list)
    year = date_extractor(journal, year_info_only)
    other_id_dict = parse_other_id(medline)
    journal_info_dict = parse_journal_info(medline)
    ingestion_time = str(datetime.now())

    return {
        "PMID": pmid,
        "PMCID": pmcid,
        "DOI": doi,
        "Title": title,
        "Abstract": abstract,
        "Language": language,
        "Journal": journal_name,
        "JournalAbv": journal_info_dict.get('medline_ta'),
        "Year": year,
        "Authors": authors,
        "References": references,
        "IngestionTime": ingestion_time,
        "delete": False,
    }
def parse_article_info(medline, year_info_only, nlm_category,
                       subscpt=None, supscpt=None, incl_sections=False):
    """Parse an article node from a Medline document into a dictionary.

    Parameters
    ----------
    medline: Element
        The lxml node pointing to a medline document
    year_info_only: bool
        see: date_extractor()
    nlm_category: bool
        see: parse_medline_xml()
    subscpt, supscpt:
        passed through to stringify_children() to control how
        subscript/superscript markup is rendered (see stringify_children)
    incl_sections: bool
        if True, prepend structured-abstract section labels to the text

    Returns
    -------
    article: dict
        Dictionary containing information about the article, including
        `title`, `abstract`, `journal`, `author`, `affiliation`,
        `pubdate`, `pmid`, `other_id`, `mesh_terms`, and `keywords`.
        The field `delete` is always `False` because this function
        parses articles that by definition are not deleted.
    """
    article = medline.find('Article')

    if article.find('ArticleTitle') is not None:
        title = stringify_children(article.find('ArticleTitle'),
                                   subscpt, supscpt).strip() or ''
    else:
        title = ''
    # Normalize: apply module-level substitutions, drop newlines,
    # collapse runs of spaces.
    title = replace_multiple(inp_list, title)
    title = re.sub(' +', ' ', title.replace("\n", "")).strip()

    # Section labels are only needed when incl_sections is requested.
    if incl_sections:
        category = 'NlmCategory' if nlm_category else 'Label'
    if article.find('Abstract/AbstractText') is not None:
        # parsing structured abstract
        if len(article.findall('Abstract/AbstractText')) > 1:
            abstract_list = list()
            for abstract in article.findall('Abstract/AbstractText'):
                if incl_sections:
                    section = abstract.attrib.get(category, '')
                    if section != 'UNASSIGNED':
                        abstract_list.append('\n')
                        abstract_list.append(abstract.attrib.get(category, ''))
                section_text = stringify_children(abstract,
                                                  subscpt, supscpt).strip()
                abstract_list.append(section_text)
            abstract = ' '.join(abstract_list).strip()
        else:
            abstract = stringify_children(
                article.find('Abstract/AbstractText'),
                subscpt, supscpt).strip() or ''
    elif article.find('Abstract') is not None:
        abstract = stringify_children(article.find('Abstract'),
                                      subscpt, supscpt).strip() or ''
    else:
        abstract = ''
    abstract = replace_multiple(inp_list, abstract)
    abstract = re.sub(' +', ' ', abstract.replace("\n", "")).strip()

    if article.find('AuthorList') is not None:
        authors = article.find('AuthorList').getchildren()
        authors_info = list()
        affiliations_info = list()
        for author in authors:
            # Medline stores given-name initials in <Initials>.
            if author.find('Initials') is not None:
                firstname = author.find('Initials').text or ''
            else:
                firstname = ''
            if author.find('LastName') is not None:
                lastname = author.find('LastName').text or ''
            else:
                lastname = ''
            if author.find('AffiliationInfo/Affiliation') is not None:
                affiliation = author.find(
                    'AffiliationInfo/Affiliation').text or ''
            else:
                affiliation = ''
            authors_info.append((firstname + ' ' + lastname).strip())
            affiliations_info.append(affiliation)
        # BUG FIX: original filtered with `a is not ''` (identity comparison
        # against a string literal) — implementation-dependent and a
        # SyntaxWarning on Python >= 3.8. Truthiness drops empty entries.
        affiliations_info = '\n'.join(
            [a for a in affiliations_info if a])
        authors_info = '; '.join(authors_info)
    else:
        affiliations_info = ''
        authors_info = ''

    journal = article.find('Journal')
    journal_name = ' '.join(journal.xpath('Title/text()'))
    pubdate = date_extractor(journal, year_info_only)
    pmid = parse_pmid(medline)
    doi = parse_doi(medline)
    mesh_terms = parse_mesh_terms(medline)
    publication_types = parse_publication_types(medline)
    chemical_list = parse_chemical_list(medline)
    keywords = parse_keywords(medline)
    other_id_dict = parse_other_id(medline)
    journal_info_dict = parse_journal_info(medline)

    dict_out = {
        'title': title,
        'abstract': abstract,
        'journal': journal_name,
        'author': authors_info,
        'affiliation': affiliations_info,
        'pubdate': pubdate,
        'pmid': pmid,
        'mesh_terms': mesh_terms,
        'publication_types': publication_types,
        'chemical_list': chemical_list,
        'keywords': keywords,
        'doi': doi,
        'delete': False
    }
    dict_out.update(other_id_dict)
    dict_out.update(journal_info_dict)
    return dict_out
def parse_article_info(medline, year_info_only):
    """Parse an article node from a Medline document into a dictionary.

    Parameters
    ----------
    medline: Element
        The lxml node pointing to a medline document
    year_info_only: bool
        see: date_extractor().

    Returns
    -------
    article: dict
        Dictionary containing information about the article, including
        `title`, `abstract`, `journal`, `author`, `affiliation`,
        `pubdate`, `pmid`, `other_id`, `mesh_terms`, and `keywords`.
        The field `delete` is always `False` because this function
        parses articles that by definition are not deleted.
    """
    article = medline.find('Article')

    if article.find('ArticleTitle') is not None:
        title = stringify_children(article.find('ArticleTitle')).strip()
    else:
        title = ''

    if article.find('Abstract') is not None:
        abstract = stringify_children(article.find('Abstract'))
    else:
        abstract = ''

    if article.find('AuthorList') is not None:
        authors = article.find('AuthorList').getchildren()
        authors_info = list()
        affiliations_info = list()
        for author in authors:
            # Medline stores given-name initials in <Initials>.
            if author.find('Initials') is not None:
                firstname = author.find('Initials').text or ''
            else:
                firstname = ''
            if author.find('LastName') is not None:
                lastname = author.find('LastName').text or ''
            else:
                lastname = ''
            if author.find('AffiliationInfo/Affiliation') is not None:
                affiliation = author.find('AffiliationInfo/Affiliation').text or ''
            else:
                affiliation = ''
            authors_info.append((firstname + ' ' + lastname).strip())
            affiliations_info.append(affiliation)
        # BUG FIX: original filtered with `a is not ''` (identity comparison
        # against a string literal) — implementation-dependent and a
        # SyntaxWarning on Python >= 3.8. Truthiness drops empty entries.
        affiliations_info = ' '.join([a for a in affiliations_info if a])
        authors_info = '; '.join(authors_info)
    else:
        affiliations_info = ''
        authors_info = ''

    journal = article.find('Journal')
    journal_name = ' '.join(journal.xpath('Title/text()'))
    pubdate = date_extractor(journal, year_info_only)
    pmid = parse_pmid(medline)
    mesh_terms = parse_mesh_terms(medline)
    keywords = parse_keywords(medline)
    other_id_dict = parse_other_id(medline)

    dict_out = {'title': title,
                'abstract': abstract,
                'journal': journal_name,
                'author': authors_info,
                'affiliation': affiliations_info,
                'pubdate': pubdate,
                'pmid': pmid,
                'mesh_terms': mesh_terms,
                'keywords': keywords,
                'delete': False}
    dict_out.update(other_id_dict)
    return dict_out