Exemple #1
0
def parse_article_info(medline, year_info_only, nlm_category, author_list):
    """Parse article nodes from Medline dataset

    Parameters
    ----------
    medline: Element
        The lxml node pointing to a medline document
    year_info_only: bool
        Forwarded to date_extractor(); see that function.
    nlm_category: bool
        If True, sections of a structured abstract are labelled with the
        `NlmCategory` attribute of each `AbstractText` node, otherwise
        with its `Label` attribute. See also: parse_medline_xml()
    author_list: bool
        If True, `authors` is the value returned by
        parse_author_affiliation() unchanged and no `affiliations` key is
        produced. If False, `authors` is a ';'-joined string of
        "firstname lastname" entries and `affiliations` is a ';'-joined
        string of the non-empty affiliations.

    Returns
    -------
    article: dict
        Dictionary containing information about the article, including
        `title`, `abstract`, `journal`, `authors`, `pubdate`, `pmid`,
        `mesh_terms`, `publication_types`, `chemical_list`, `keywords`,
        `doi` and, only when `author_list` is False, `affiliations`.
        The dictionaries returned by parse_other_id() and
        parse_journal_info() are merged into the result. The field
        `delete` is always `False` because this function parses
        articles that by definition are not deleted.
    """
    article = medline.find('Article')

    title_node = article.find('ArticleTitle')
    if title_node is not None:
        title = stringify_children(title_node).strip() or ''
    else:
        title = ''

    category = 'NlmCategory' if nlm_category else 'Label'
    abstract_nodes = article.findall('Abstract/AbstractText')
    if len(abstract_nodes) > 1:
        # Structured abstract: each section is preceded by a blank line and
        # its label, except sections labelled 'UNASSIGNED'.
        abstract_list = []
        for node in abstract_nodes:
            section = node.attrib.get(category, '')
            if section != 'UNASSIGNED':
                abstract_list.append('\n')
                abstract_list.append(section)
            abstract_list.append(stringify_children(node).strip())
        abstract = '\n'.join(abstract_list).strip()
    elif abstract_nodes:
        abstract = stringify_children(abstract_nodes[0]).strip() or ''
    elif article.find('Abstract') is not None:
        # No AbstractText children: fall back to the whole Abstract node.
        abstract = stringify_children(article.find('Abstract')).strip() or ''
    else:
        abstract = ''

    author_records = parse_author_affiliation(medline)
    if not author_list:
        # Filter by value, not identity: `x is not ''` depends on string
        # interning and emits a SyntaxWarning on Python 3.8+.
        affiliations = ';'.join(
            author.get('affiliation', '')
            for author in author_records
            if author.get('affiliation', '')
        )
        authors = ';'.join(
            author.get('firstname', '') + ' ' + author.get('lastname', '')
            for author in author_records
        )
    else:
        authors = author_records
    journal = article.find('Journal')
    journal_name = ' '.join(journal.xpath('Title/text()'))

    pubdate = date_extractor(journal, year_info_only)
    pmid = parse_pmid(medline)
    doi = parse_doi(medline)
    mesh_terms = parse_mesh_terms(medline)
    publication_types = parse_publication_types(medline)
    chemical_list = parse_chemical_list(medline)
    keywords = parse_keywords(medline)
    other_id_dict = parse_other_id(medline)
    journal_info_dict = parse_journal_info(medline)
    dict_out = {
        'title': title,
        'abstract': abstract,
        'journal': journal_name,
        'authors': authors,
        'pubdate': pubdate,
        'pmid': pmid,
        'mesh_terms': mesh_terms,
        'publication_types': publication_types,
        'chemical_list': chemical_list,
        'keywords': keywords,
        'doi': doi,
        'delete': False,
    }
    if not author_list:
        dict_out['affiliations'] = affiliations
    dict_out.update(other_id_dict)
    dict_out.update(journal_info_dict)
    return dict_out
Exemple #2
0
def parse_article_info(pubmed_article, year_info_only, nlm_category,
                       author_list, reference_list):
    """Parse article nodes from Medline dataset

    Parameters
    ----------
    pubmed_article: Element
        The lxml element pointing to a medline document; its
        `MedlineCitation` child is the node most fields are read from.
    year_info_only: bool
        Forwarded to date_extractor(); see more details there.
    nlm_category: bool
        If True, sections of a structured abstract are labelled with the
        `NlmCategory` attribute of each `AbstractText` node, otherwise
        with its `Label` attribute. See more details in
        parse_medline_xml()
    author_list: bool
        If True, `authors` is the value returned by
        parse_author_affiliation() unchanged and no `affiliations` key is
        produced. If False, `authors` is a ";"-joined string of
        "firstname lastname" entries and `affiliations` is a ";"-joined
        string of the non-empty affiliations.
    reference_list: bool
        Forwarded to parse_references(); if True, parse reference list
        as an output

    Returns
    -------
    article: dict
        Dictionary containing information about the article, including
        `title`, `abstract`, `journal`, `authors`, `pubdate`, `pmid`,
        `mesh_terms`, `publication_types`, `chemical_list`, `keywords`,
        `doi`, `references` and, only when `author_list` is False,
        `affiliations`. The dictionaries returned by parse_other_id()
        and parse_journal_info() are merged into the result. The field
        `delete` is always `False` because this function parses
        articles that by definition are not deleted.
    """
    medline = pubmed_article.find("MedlineCitation")
    article = medline.find("Article")

    title_node = article.find("ArticleTitle")
    if title_node is not None:
        title = stringify_children(title_node).strip() or ""
    else:
        title = ""

    category = "NlmCategory" if nlm_category else "Label"
    abstract_nodes = article.findall("Abstract/AbstractText")
    if len(abstract_nodes) > 1:
        # Structured abstract: each section is preceded by a blank line and
        # its label, except sections labelled "UNASSIGNED".
        abstract_list = []
        for node in abstract_nodes:
            section = node.attrib.get(category, "")
            if section != "UNASSIGNED":
                abstract_list.append("\n")
                abstract_list.append(section)
            abstract_list.append(stringify_children(node).strip())
        abstract = "\n".join(abstract_list).strip()
    elif abstract_nodes:
        abstract = stringify_children(abstract_nodes[0]).strip() or ""
    elif article.find("Abstract") is not None:
        # No AbstractText children: fall back to the whole Abstract node.
        abstract = stringify_children(article.find("Abstract")).strip() or ""
    else:
        abstract = ""

    author_records = parse_author_affiliation(medline)
    if not author_list:
        # Filter by value, not identity: `x is not ""` depends on string
        # interning and emits a SyntaxWarning on Python 3.8+.
        affiliations = ";".join(
            author.get("affiliation", "")
            for author in author_records
            if author.get("affiliation", "")
        )
        authors = ";".join(
            author.get("firstname", "") + " " + author.get("lastname", "")
            for author in author_records
        )
    else:
        authors = author_records
    journal = article.find("Journal")
    journal_name = " ".join(journal.xpath("Title/text()"))

    # pmid, doi and references are read from the outer PubmedArticle
    # element; the remaining fields come from MedlineCitation.
    pmid = parse_pmid(pubmed_article)
    doi = parse_doi(pubmed_article)
    references = parse_references(pubmed_article, reference_list)
    pubdate = date_extractor(journal, year_info_only)
    mesh_terms = parse_mesh_terms(medline)
    publication_types = parse_publication_types(medline)
    chemical_list = parse_chemical_list(medline)
    keywords = parse_keywords(medline)
    other_id_dict = parse_other_id(medline)
    journal_info_dict = parse_journal_info(medline)
    dict_out = {
        "title": title,
        "abstract": abstract,
        "journal": journal_name,
        "authors": authors,
        "pubdate": pubdate,
        "pmid": pmid,
        "mesh_terms": mesh_terms,
        "publication_types": publication_types,
        "chemical_list": chemical_list,
        "keywords": keywords,
        "doi": doi,
        "references": references,
        "delete": False,
    }
    if not author_list:
        dict_out["affiliations"] = affiliations
    dict_out.update(other_id_dict)
    dict_out.update(journal_info_dict)
    return dict_out
def parse_article_info(medline, year_info_only):
    """Parse article nodes from Medline dataset

    Parameters
    ----------
    medline: Element
        The lxml node pointing to a medline document
    year_info_only: bool
        Forwarded to date_extractor(); see that function.

    Returns
    -------
    article: dict
        Dictionary containing information about the article, including
        `title`, `abstract`, `journal`, `author`, `affiliation`, `pubdate`,
        `pmid`, `mesh_terms`, and `keywords`. `author` is a '; '-joined
        string of "Initials LastName" entries and `affiliation` is a
        space-joined string of the non-empty affiliations. The
        dictionaries returned by parse_other_id() and
        parse_journal_info() are merged into the result. The field
        `delete` is always `False` because this function parses
        articles that by definition are not deleted.
    """
    article = medline.find('Article')

    title_node = article.find('ArticleTitle')
    if title_node is not None:
        title = stringify_children(title_node).strip() or ''
    else:
        title = ''

    # Prefer the first AbstractText node; fall back to the whole Abstract.
    abstract_node = article.find('Abstract/AbstractText')
    if abstract_node is None:
        abstract_node = article.find('Abstract')
    if abstract_node is not None:
        abstract = stringify_children(abstract_node).strip() or ''
    else:
        abstract = ''

    author_list_node = article.find('AuthorList')
    if author_list_node is not None:
        author_names = []
        affiliation_texts = []
        # Iterate the element directly; getchildren() is deprecated.
        for author in author_list_node:
            initials_node = author.find('Initials')
            firstname = (initials_node.text or '') if initials_node is not None else ''
            lastname_node = author.find('LastName')
            lastname = (lastname_node.text or '') if lastname_node is not None else ''
            affiliation_node = author.find('AffiliationInfo/Affiliation')
            affiliation = (affiliation_node.text or '') if affiliation_node is not None else ''
            author_names.append((firstname + ' ' + lastname).strip())
            affiliation_texts.append(affiliation)
        # Filter by value, not identity: `a is not ''` depends on string
        # interning and emits a SyntaxWarning on Python 3.8+.
        affiliations_info = ' '.join(a for a in affiliation_texts if a)
        authors_info = '; '.join(author_names)
    else:
        affiliations_info = ''
        authors_info = ''

    journal = article.find('Journal')
    journal_name = ' '.join(journal.xpath('Title/text()'))
    pubdate = date_extractor(journal, year_info_only)

    pmid = parse_pmid(medline)
    mesh_terms = parse_mesh_terms(medline)
    keywords = parse_keywords(medline)
    other_id_dict = parse_other_id(medline)
    journal_info_dict = parse_journal_info(medline)

    dict_out = {
        'title': title,
        'abstract': abstract,
        'journal': journal_name,
        'author': authors_info,
        'affiliation': affiliations_info,
        'pubdate': pubdate,
        'pmid': pmid,
        'mesh_terms': mesh_terms,
        'keywords': keywords,
        'delete': False,
    }
    dict_out.update(other_id_dict)
    dict_out.update(journal_info_dict)
    return dict_out
Exemple #4
0
def parse_article_info_abcam(pubmed_article):
    """Parse one PubMed article into the dictionary layout used for Abcam.

    Parameters
    ----------
    pubmed_article: Element
        The lxml element pointing to a medline document; its
        `MedlineCitation` child is the node most fields are read from.

    Returns
    -------
    article: dict
        Dictionary with the keys `PMID`, `PMCID`, `DOI`, `Title`,
        `Abstract`, `Language`, `Journal`, `JournalAbv`, `Year`,
        `Authors`, `References`, `IngestionTime` and `delete`.
        `IngestionTime` is the string form of the current local time and
        `delete` is always `False`.
    """

    # Fixed settings for this variant (not exposed as parameters).
    year_info_only = True
    nlm_category = True
    reference_list = True

    medline = pubmed_article.find("MedlineCitation")
    article = medline.find("Article")

    title_node = article.find("ArticleTitle")
    title = ""
    if title_node is not None:
        title = stringify_children(title_node).strip() or ""

    category = "NlmCategory" if nlm_category else "Label"
    abstract_nodes = article.findall("Abstract/AbstractText")
    if len(abstract_nodes) > 1:
        # Structured abstract: emit a blank line and the section label
        # before each section, except for sections labelled "UNASSIGNED".
        pieces = []
        for node in abstract_nodes:
            label = node.attrib.get(category, "")
            if label != "UNASSIGNED":
                pieces.extend(["\n", label])
            pieces.append(stringify_children(node).strip())
        abstract = "\n".join(pieces).strip()
    elif abstract_nodes:
        abstract = stringify_children(abstract_nodes[0]).strip() or ""
    else:
        # No AbstractText children: fall back to the whole Abstract node.
        whole_abstract = article.find("Abstract")
        if whole_abstract is None:
            abstract = ""
        else:
            abstract = stringify_children(whole_abstract).strip() or ""

    authors = parse_author_affiliation(medline)

    journal = article.find("Journal")
    journal_name = " ".join(journal.xpath("Title/text()"))

    # One entry per <Language> element, each flattened to its text content.
    language = []
    for lang_node in article.findall("Language"):
        language.append("".join(lang_node.itertext()))

    pmid = parse_pmid(pubmed_article)
    doi = parse_doi(pubmed_article)
    pmcid = parse_pmcid(pubmed_article)
    references = parse_references(pubmed_article, reference_list)
    year = date_extractor(journal, year_info_only)
    # The result is not part of the output; the call is retained as-is.
    other_id_dict = parse_other_id(medline)
    journal_info = parse_journal_info(medline)
    ingestion_time = str(datetime.now())

    return {
        "PMID": pmid,
        "PMCID": pmcid,
        "DOI": doi,
        "Title": title,
        "Abstract": abstract,
        "Language": language,
        "Journal": journal_name,
        "JournalAbv": journal_info.get('medline_ta'),
        "Year": year,
        "Authors": authors,
        "References": references,
        "IngestionTime": ingestion_time,
        "delete": False,
    }
Exemple #5
0
def parse_article_info(medline,
                       year_info_only,
                       nlm_category,
                       subscpt=None,
                       supscpt=None,
                       incl_sections=False):
    """Parse article nodes from Medline dataset

    Parameters
    ----------
    medline: Element
        The lxml node pointing to a medline document
    year_info_only: bool
        Forwarded to date_extractor(); see that function.
    nlm_category: bool
        Only used when `incl_sections` is True: selects the
        `NlmCategory` attribute (True) or the `Label` attribute (False)
        of each `AbstractText` node as the section label. See also:
        parse_medline_xml()
    subscpt: optional
        Forwarded unchanged to stringify_children() for the title and
        abstract; its meaning is defined there.
    supscpt: optional
        Forwarded unchanged to stringify_children() for the title and
        abstract; its meaning is defined there.
    incl_sections: bool
        If True, the label of each section of a structured abstract
        (unless it is 'UNASSIGNED') is inserted before the section text.

    Returns
    -------
    article: dict
        Dictionary containing information about the article, including
        `title`, `abstract`, `journal`, `author`, `affiliation`, `pubdate`,
        `pmid`, `mesh_terms`, `publication_types`, `chemical_list`,
        `keywords` and `doi`. `title` and `abstract` have newlines
        removed and runs of spaces collapsed. `author` is a '; '-joined
        string of "Initials LastName" entries and `affiliation` is a
        newline-joined string of the non-empty affiliations. The
        dictionaries returned by parse_other_id() and
        parse_journal_info() are merged into the result. The field
        `delete` is always `False` because this function parses
        articles that by definition are not deleted.
    """
    article = medline.find('Article')

    title_node = article.find('ArticleTitle')
    if title_node is not None:
        title = stringify_children(title_node, subscpt, supscpt).strip() or ''
    else:
        title = ''

    # `inp_list` is a module-level name defined outside this function.
    title = replace_multiple(inp_list, title)
    title = re.sub(' +', ' ', title.replace("\n", "")).strip()

    # Only consulted when incl_sections is True.
    category = 'NlmCategory' if nlm_category else 'Label'
    abstract_nodes = article.findall('Abstract/AbstractText')
    if len(abstract_nodes) > 1:
        # Structured abstract: sections are joined with single spaces.
        abstract_list = []
        for node in abstract_nodes:
            if incl_sections:
                section = node.attrib.get(category, '')
                if section != 'UNASSIGNED':
                    abstract_list.append('\n')
                    abstract_list.append(section)
            abstract_list.append(
                stringify_children(node, subscpt, supscpt).strip())
        abstract = ' '.join(abstract_list).strip()
    elif abstract_nodes:
        abstract = stringify_children(abstract_nodes[0], subscpt,
                                      supscpt).strip() or ''
    elif article.find('Abstract') is not None:
        # No AbstractText children: fall back to the whole Abstract node.
        abstract = stringify_children(article.find('Abstract'), subscpt,
                                      supscpt).strip() or ''
    else:
        abstract = ''

    abstract = replace_multiple(inp_list, abstract)
    abstract = re.sub(' +', ' ', abstract.replace("\n", "")).strip()

    author_list_node = article.find('AuthorList')
    if author_list_node is not None:
        author_names = []
        affiliation_texts = []
        # Iterate the element directly; getchildren() is deprecated.
        for author in author_list_node:
            initials_node = author.find('Initials')
            firstname = (initials_node.text or '') if initials_node is not None else ''
            lastname_node = author.find('LastName')
            lastname = (lastname_node.text or '') if lastname_node is not None else ''
            affiliation_node = author.find('AffiliationInfo/Affiliation')
            affiliation = (affiliation_node.text or '') if affiliation_node is not None else ''
            author_names.append((firstname + ' ' + lastname).strip())
            affiliation_texts.append(affiliation)
        # Filter by value, not identity: `a is not ''` depends on string
        # interning and emits a SyntaxWarning on Python 3.8+.
        affiliations_info = '\n'.join(a for a in affiliation_texts if a)
        authors_info = '; '.join(author_names)
    else:
        affiliations_info = ''
        authors_info = ''

    journal = article.find('Journal')
    journal_name = ' '.join(journal.xpath('Title/text()'))
    pubdate = date_extractor(journal, year_info_only)

    pmid = parse_pmid(medline)
    doi = parse_doi(medline)
    mesh_terms = parse_mesh_terms(medline)
    publication_types = parse_publication_types(medline)
    chemical_list = parse_chemical_list(medline)
    keywords = parse_keywords(medline)
    other_id_dict = parse_other_id(medline)
    journal_info_dict = parse_journal_info(medline)
    dict_out = {
        'title': title,
        'abstract': abstract,
        'journal': journal_name,
        'author': authors_info,
        'affiliation': affiliations_info,
        'pubdate': pubdate,
        'pmid': pmid,
        'mesh_terms': mesh_terms,
        'publication_types': publication_types,
        'chemical_list': chemical_list,
        'keywords': keywords,
        'doi': doi,
        'delete': False,
    }
    dict_out.update(other_id_dict)
    dict_out.update(journal_info_dict)
    return dict_out
def parse_article_info(medline, year_info_only):
    """Parse article nodes from Medline dataset

    Parameters
    ----------
    medline: Element
        The lxml node pointing to a medline document
    year_info_only: bool
        Forwarded to date_extractor(); see that function.

    Returns
    -------
    article: dict
        Dictionary containing information about the article, including
        `title`, `abstract`, `journal`, `author`, `affiliation`, `pubdate`,
        `pmid`, `mesh_terms`, and `keywords`. `author` is a '; '-joined
        string of "Initials LastName" entries and `affiliation` is a
        space-joined string of the non-empty affiliations. The
        dictionary returned by parse_other_id() is merged into the
        result. The field `delete` is always `False` because this
        function parses articles that by definition are not deleted.
    """
    article = medline.find('Article')

    title_node = article.find('ArticleTitle')
    if title_node is not None:
        title = stringify_children(title_node).strip()
    else:
        title = ''

    # The abstract is taken from the whole Abstract node and, unlike the
    # title, is not stripped.
    abstract_node = article.find('Abstract')
    if abstract_node is not None:
        abstract = stringify_children(abstract_node)
    else:
        abstract = ''

    author_list_node = article.find('AuthorList')
    if author_list_node is not None:
        author_names = []
        affiliation_texts = []
        # Iterate the element directly; getchildren() is deprecated.
        for author in author_list_node:
            initials_node = author.find('Initials')
            firstname = (initials_node.text or '') if initials_node is not None else ''
            lastname_node = author.find('LastName')
            lastname = (lastname_node.text or '') if lastname_node is not None else ''
            affiliation_node = author.find('AffiliationInfo/Affiliation')
            affiliation = (affiliation_node.text or '') if affiliation_node is not None else ''
            author_names.append((firstname + ' ' + lastname).strip())
            affiliation_texts.append(affiliation)
        # Filter by value, not identity: `a is not ''` depends on string
        # interning and emits a SyntaxWarning on Python 3.8+.
        affiliations_info = ' '.join(a for a in affiliation_texts if a)
        authors_info = '; '.join(author_names)
    else:
        affiliations_info = ''
        authors_info = ''

    journal = article.find('Journal')
    journal_name = ' '.join(journal.xpath('Title/text()'))
    pubdate = date_extractor(journal, year_info_only)

    pmid = parse_pmid(medline)
    mesh_terms = parse_mesh_terms(medline)
    keywords = parse_keywords(medline)
    other_id_dict = parse_other_id(medline)

    dict_out = {
        'title': title,
        'abstract': abstract,
        'journal': journal_name,
        'author': authors_info,
        'affiliation': affiliations_info,
        'pubdate': pubdate,
        'pmid': pmid,
        'mesh_terms': mesh_terms,
        'keywords': keywords,
        'delete': False,
    }
    dict_out.update(other_id_dict)
    return dict_out