Python read_xml Examples, pubmed_parser.utils.read_xml Python Examples

Example #1

0

Show file

def parse_medline_grant_id(path):
    """Collect the grant records of every citation in a Medline XML file.

    Parameters
    ----------
    path: str
        Location of the Medline XML file; it is handed to `read_xml`

    Return
    ------
    grant_id_list: list
        A single flat list made of whatever `parse_grant_id` yields for each
        citation. Each entry is a dictionary with the keys 'pmid',
        'grant_id', 'grant_acronym', 'country', and 'agency'

    >>> pubmed_parser.parse_medline_grant_id('data/pubmed20n0014.xml.gz')
    [{
        'pmid': '399300',
        'grant_id': 'HL17731',
        'grant_acronym': 'HL',
        'country': 'United States',
        'agency': 'NHLBI NIH HHS'
    }, ...
    ]
    """
    tree = read_xml(path)
    citations = tree.findall("//MedlineCitationSet/MedlineCitation")
    if not citations:
        # Nothing under a MedlineCitationSet wrapper: look for PubmedArticle nodes
        citations = tree.findall("//PubmedArticle")
    grants_per_citation = [parse_grant_id(citation) for citation in citations]
    # Merge the per-citation lists into one list
    return list(chain.from_iterable(grants_per_citation))

Example #2

0

Show file

def get_medline_tree(path, to_string=False, encoding='utf-8'):
    """Read a Medline XML file and return its article elements.

    Parameters
    ----------
    path: str
        Location of the XML file; it is handed to `read_xml`

    to_string: bool
        When True, every element is serialized with `lxml.etree.tostring`
        and the serialized values are returned instead of the elements

    encoding: str
        Encoding passed to `lxml.etree.tostring`; only used if `to_string=True`

    Return
    ------
    medline_citations: list
        The matched elements (one per article), or their serialized form
        when `to_string=True`
    """
    tree = read_xml(path)
    articles = tree.findall("//MedlineCitationSet/MedlineCitation")
    if not articles:
        # Nothing under a MedlineCitationSet wrapper: look for PubmedArticle nodes
        articles = tree.findall("//PubmedArticle")

    if not to_string:
        return articles

    serialized = []
    for article in articles:
        serialized.append(lxml.etree.tostring(article, encoding=encoding))
    return serialized

Example #3

0

Show file

File: medline_parser.py Project: lucian-whu/pubmed_parser

def parse_medline_xml(path, year_info_only=True, nlm_category=False):
    """Parse a Medline XML file, as distributed at
    ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/

    Parameters
    ----------
    path: str
        Location of the XML file; it is handed to `read_xml`
    year_info_only: bool
        Forwarded to `parse_article_info`.
        If True, only the year is extracted from PubDate.
        If False, all available PubDate information is harvested: year and
        month give 'YYYY-MM'; year, month and day give 'YYYY-MM-DD'.
        NOTE: the resolution of PubDate information in the Medline(R)
        database varies between articles.
        Defaults to True.
    nlm_category: bool, default False
        Forwarded to `parse_article_info`; selects whether the sections of a
        structured abstract are identified by their original Label or by
        their NLM category

    Returns
    -------
    article_list: list
        One dictionary per article, as produced by `parse_article_info`,
        followed by one placeholder dictionary per deleted citation. A
        placeholder carries only `pmid` and `delete` set to `True`; every
        other field is `None`
    """
    # Field order of a placeholder row for a deleted citation
    placeholder_fields = (
        'title', 'abstract', 'journal', 'author', 'affiliation', 'pubdate',
        'pmid', 'other_id', 'pmc', 'mesh_terms', 'keywords', 'delete',
        'medline_ta', 'nlm_unique_id', 'issn_linking', 'country',
    )

    tree = read_xml(path)
    citations = tree.findall('//MedlineCitationSet/MedlineCitation')
    if not citations:
        # Nothing under a MedlineCitationSet wrapper: take any MedlineCitation
        citations = tree.findall('//MedlineCitation')

    article_list = [
        parse_article_info(citation, year_info_only, nlm_category)
        for citation in citations
    ]

    for deleted_pmid in tree.findall('//DeleteCitation/PMID'):
        placeholder = dict.fromkeys(placeholder_fields)  # every field None
        placeholder['pmid'] = deleted_pmid.text
        placeholder['delete'] = True
        article_list.append(placeholder)
    return article_list

Example #4

0

Show file

File: medline_parser.py Project: H-Plus-Time/pubmed_parser

def parse_medline_xml(path, year_info_only=True):
    """Parse a Medline XML file, as distributed at
    ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/

    Parameters
    ----------
    path: str
        Location of the XML file; it is handed to `read_xml`
    year_info_only: bool
        Forwarded to `parse_article_info`.
        If True, only the year is extracted from PubDate.
        If False, all available PubDate information is harvested: year and
        month give 'YYYY-MM'; year, month and day give 'YYYY-MM-DD'.
        NOTE: the resolution of PubDate information in the Medline(R)
        database varies between articles.
        Defaults to True.

    Returns
    -------
    article_list: list
        One dictionary per article, as produced by `parse_article_info`,
        followed by one placeholder dictionary per deleted citation. A
        placeholder carries only `pmid` and `delete` set to `True`; every
        other field is `None`
    """
    # Blank row whose key order fixes the layout of each placeholder
    blank_row = {
        'title': None,
        'abstract': None,
        'journal': None,
        'author': None,
        'affiliation': None,
        'pubdate': None,
        'pmid': None,
        'other_id': None,
        'pmc': None,
        'mesh_terms': None,
        'keywords': None,
        'delete': None,
    }

    tree = read_xml(path)
    citations = tree.findall('//MedlineCitationSet/MedlineCitation')
    if not citations:
        # Nothing under a MedlineCitationSet wrapper: take any MedlineCitation
        citations = tree.findall('//MedlineCitation')

    article_list = [parse_article_info(c, year_info_only) for c in citations]

    deleted_pmids = tree.findall('//DeleteCitation/PMID')
    article_list.extend(
        {**blank_row, 'pmid': node.text, 'delete': True}
        for node in deleted_pmids
    )
    return article_list

Example #5

0

Show file

File: medline_parser.py Project: dterg/pubmed_parser

def parse_medline_grant_id(path):
    """Parse grant id from Medline XML file

    Parameters
    ----------
    path: str
        The path to the XML with the information

    Returns
    -------
    grant_id_list: list
        Flat list of dictionaries covering every citation found in the file
        at `path`. Each dictionary holds the information returned by
        `parse_grant_id`
    """
    tree = read_xml(path)
    medline_citations = tree.xpath('//MedlineCitationSet/MedlineCitation')
    if not medline_citations:
        # No citation sits under a MedlineCitationSet wrapper: fall back to
        # any MedlineCitation element so such files do not silently yield an
        # empty result. The elements are of the same kind as above, so
        # parse_grant_id receives what it already expects.
        medline_citations = tree.xpath('//MedlineCitation')
    # parse_grant_id returns one list per citation; flatten them into one list
    return list(chain.from_iterable(map(parse_grant_id, medline_citations)))

Example #6

0

Show file

def parse_medline_grant_id(path):
    """Gather the grant records of every citation in a Medline XML file.

    Parameters
    ----------
    path: str
        Location of the Medline XML file; it is handed to `read_xml`

    Return
    ------
    grant_id_list: list
        A single flat list made of whatever `parse_grant_id` yields for each
        citation. Each entry is a dictionary with the keys 'pmid',
        'grant_id', 'grant_acronym', 'country', and 'agency'
    """
    tree = read_xml(path)
    # Prefer citations under a MedlineCitationSet wrapper; when there are
    # none, use the PubmedArticle nodes instead
    citations = (
        tree.findall("//MedlineCitationSet/MedlineCitation")
        or tree.findall("//PubmedArticle")
    )
    grant_id_list = []
    for grants in [parse_grant_id(citation) for citation in citations]:
        grant_id_list.extend(grants)
    return grant_id_list

Example #7

0

Show file

File: medline_parser.py Project: H-Plus-Time/pubmed_parser

def parse_medline_grant_id(path):
    """Extract the grant records of every citation in a Medline XML file.

    Parameters
    ----------
    path: str
        Location of the Medline XML file; it is handed to `read_xml`

    Returns
    -------
    grant_id_list: list
        A single flat list of the dictionaries that `parse_grant_id`
        returns for each citation in the file
    """
    tree = read_xml(path)
    citations = tree.findall('//MedlineCitationSet/MedlineCitation')
    if not citations:
        # Nothing under a MedlineCitationSet wrapper: take any MedlineCitation
        citations = tree.findall('//MedlineCitation')
    per_citation = [parse_grant_id(citation) for citation in citations]
    # Flatten the list of per-citation lists
    return [grant for grants in per_citation for grant in grants]

Example #8

0

Show file

def parse_medline_xml(
    path,
    year_info_only=True,
    nlm_category=False,
    author_list=False,
    reference_list=False,
):
    """Parse a Medline XML file, as distributed at
    ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/

    Parameters
    ----------
    path: str
        Location of the XML file; it is handed to `read_xml`
    year_info_only: bool
        Forwarded to `parse_article_info`.
        If True, only the year is extracted from PubDate.
        If False, all available PubDate information is harvested: year and
        month give 'YYYY-MM'; year, month and day give 'YYYY-MM-DD'.
        NOTE: the resolution of PubDate information in the Medline(R)
        database varies between articles.
        default: True
    nlm_category: bool
        Forwarded to `parse_article_info`; selects whether the sections of a
        structured abstract are identified by their original Label or by
        their NLM category
        default: False
    author_list: bool
        Forwarded to `parse_article_info`.
        If True, authors are returned as a list.
        If False, authors are returned as one string joined with ``;``
        default: False
    reference_list: bool
        Forwarded to `parse_article_info`.
        If True, the reference list is parsed into the output.
        If False, references are a string of PMIDs joined with ;
        default: False

    Return
    ------
    article_list: list
        One dictionary per article, as produced by `parse_article_info`,
        followed by one placeholder dictionary per deleted citation. A
        placeholder carries only `pmid` (whitespace-stripped) and `delete`
        set to `True`; every other field is `np.nan`
    """
    # Field order of a placeholder row for a deleted citation
    placeholder_fields = (
        "title", "abstract", "journal", "authors", "affiliations",
        "pubdate", "pmid", "doi", "other_id", "pmc", "mesh_terms",
        "keywords", "publication_types", "chemical_list", "delete",
        "medline_ta", "nlm_unique_id", "issn_linking", "country",
        "references",
    )

    tree = read_xml(path)
    citations = tree.findall("//MedlineCitationSet/MedlineCitation")
    if not citations:
        # Nothing under a MedlineCitationSet wrapper: look for PubmedArticle nodes
        citations = tree.findall("//PubmedArticle")

    article_list = [
        parse_article_info(
            citation, year_info_only, nlm_category, author_list, reference_list
        )
        for citation in citations
    ]

    for deleted_pmid in tree.findall("//DeleteCitation/PMID"):
        placeholder = dict.fromkeys(placeholder_fields, np.nan)
        placeholder["pmid"] = deleted_pmid.text.strip()
        placeholder["delete"] = True
        article_list.append(placeholder)
    return article_list