Exemple #1
0
def get_asbmb_full_text(url):
    """Try to retrieve the full text page of an ASBMB article.

    The page at ``url`` is downloaded and searched for the
    ``citation_fulltext_html_url`` meta tag. The page that tag points to is
    then downloaded and parsed. Extracting content from the parsed full
    text page is not implemented here, so every code path currently
    returns ``(None, None)``.

    Parameters
    ----------
    url : str
        URL of the article page whose meta tags are inspected.

    Returns
    -------
    tuple
        Always ``(None, None)`` at present.
    """
    # Get the location of the full text page from the target URL
    req = requests.get(url)
    if req.status_code != 200:
        logger.warning(
            'ASBMB full text query returned status code %s: URL %s' %
            (req.status_code, url))
        return (None, None)
    # If we're here that means that we successfully got the paper URL
    xml_str = req.text
    tree = ET.XML(xml_str, parser=UTB())
    fulltext_elem = tree.find('.//{http://www.w3.org/1999/xhtml}meta'
                              '[@name="citation_fulltext_html_url"]')
    # Couldn't find the element containing the full text URL
    if fulltext_elem is None:
        logger.warning("ASBMB full text: couldn't find the full text URL "
                       "element among the meta tags.")
        return (None, None)
    fulltext_url = fulltext_elem.attrib['content']
    # Now, get the full text HTML page
    req2 = requests.get(fulltext_url)
    if req2.status_code != 200:
        # Report the status of the *second* request; the first one is known
        # to have returned 200 at this point.
        logger.warning(
            'ASBMB full text query returned status code %s: URL %s' %
            (req2.status_code, fulltext_url))
        return (None, None)
    # We've got the full text page!
    # The page is parsed, but nothing is extracted from the tree yet.
    xml_str2 = req2.text
    tree2 = ET.XML(xml_str2, parser=UTB())
    return None, None
Exemple #2
0
def get_dois(query_str, count=100):
    """Return the DOIs of ScienceDirect articles matching a search query.

    The query syntax is described at
    http://api.elsevier.com/content/search/fields/scidir, for example
    'abstract(BRAF) AND all("colorectal cancer")'.

    Parameters
    ----------
    query_str : str
        The search query, sent both as part of the URL path and as the
        'query' request parameter.
    count : int
        Value of the 'count' request parameter. Default: 100

    Returns
    -------
    list or None
        The text of every ``atom:entry/prism:doi`` element in the response,
        or None if the API key is missing or the response status is not 200.
    """
    search_url = '%s/%s' % (elsevier_search_url, query_str)
    # Without an API key the search cannot be performed
    if elsevier_keys is None:
        logger.error('Missing API key at %s, could not perform search.' %
                     api_key_file)
        return None
    query_params = dict(
        query=query_str,
        count=count,
        httpAccept='application/xml',
        sort='-coverdate',
        field='doi',
    )
    response = requests.get(search_url, query_params)
    if response.status_code != 200:
        return None
    root = ET.XML(response.content, parser=UTB())
    return [tag.text
            for tag in root.findall('atom:entry/prism:doi', elsevier_ns)]
Exemple #3
0
    def __init__(self, xml_string):
        """Parse an EKB XML string and extract statements from it.

        If the string cannot be parsed, an error is logged, ``tree`` is set
        to None and initialization stops, leaving only ``statements`` and
        ``tree`` set on the instance.
        """
        self.statements = []
        try:
            root = ET.XML(xml_string, parser=UTB())
        except ET.ParseError:
            logger.error('Could not parse XML string')
            self.tree = None
            return
        self.tree = root

        # The document ID is read from the root tag of the EKB
        self.doc_id = root.attrib.get('id')

        # Index paragraph and sentence texts by ID, and record the section
        # type (if any) of each paragraph
        par_elems = root.findall('input/paragraphs/paragraph')
        sent_elems = root.findall('input/sentences/sentence')
        par_texts = {}
        for par in par_elems:
            par_texts[par.attrib['id']] = par.text
        self.paragraphs = par_texts
        sent_texts = {}
        for sent in sent_elems:
            sent_texts[sent.attrib['id']] = sent.text
        self.sentences = sent_texts
        self.par_to_sec = {par.attrib['id']: par.attrib.get('sec-type')
                           for par in par_elems}

        # Extract statements for each relation tag type in turn
        for tag_type in ('CC', 'EVENT'):
            self.extract_noun_relations(tag_type)
Exemple #4
0
def get_xml(pmc_id):
    """Return the XML of the article corresponding to a PMC ID.

    Parameters
    ----------
    pmc_id : str
        The PMC ID, with or without a leading 'PMC' (case-insensitive).

    Returns
    -------
    str or None
        The response body decoded as UTF-8, or None if the request did not
        return status 200 or the response contains an OAI error element.
    """
    # The identifier sent to the service omits the 'PMC' prefix
    if pmc_id.upper().startswith('PMC'):
        pmc_id = pmc_id[3:]
    query = {
        'verb': 'GetRecord',
        'identifier': 'oai:pubmedcentral.nih.gov:%s' % pmc_id,
        'metadataPrefix': 'pmc',
    }
    response = requests.get(pmc_url, query)
    if response.status_code != 200:
        logger.warning("Couldn't download %s" % pmc_id)
        return None
    # Parse the raw bytes only to look for an error element; the payload
    # itself is returned as text
    payload = response.content
    root = ET.XML(payload, parser=UTB())
    oai_ns = "http://www.openarchives.org/OAI/2.0/"
    error_elem = root.find('{%s}error' % oai_ns)
    if error_elem is None:
        return payload.decode('utf-8')
    code = error_elem.attrib['code']
    message = error_elem.text
    logger.warning('PMC client returned with error %s: %s' %
                   (code, message))
    return None
Exemple #5
0
    def __init__(self, xml_string):
        """Parse an EKB XML string and extract statements from it.

        If the string cannot be parsed, an error is logged, ``tree`` is set
        to None and initialization stops, leaving only ``statements`` and
        ``tree`` set on the instance.
        """
        self.statements = []
        try:
            root = ET.XML(xml_string, parser=UTB())
        except ET.ParseError:
            logger.error('Could not parse XML string')
            self.tree = None
            return
        self.tree = root

        # The document ID is read from the root tag of the EKB
        self.doc_id = root.attrib.get('id')

        # Index paragraph and sentence texts by ID, and record the section
        # type (if any) of each paragraph
        par_elems = root.findall('input/paragraphs/paragraph')
        sent_elems = root.findall('input/sentences/sentence')
        self.paragraphs = dict(
            (par.attrib['id'], par.text) for par in par_elems)
        self.sentences = dict(
            (sent.attrib['id'], sent.text) for sent in sent_elems)
        self.par_to_sec = dict(
            (par.attrib['id'], par.attrib.get('sec-type'))
            for par in par_elems)

        # Unhandled event types are collected for development purposes
        self._unhandled_events = []

        # Extract statements for each relation tag type in turn
        for tag_type in ('CC', 'EVENT'):
            self.extract_noun_relations(tag_type)

        # Some EKBs contain two redundant relations over the same
        # arguments; these are eliminated here
        self._remove_multi_extraction_artifacts()

        # Report the distinct unhandled event types in sorted order
        unhandled = sorted(set(self._unhandled_events))
        logger.debug('Unhandled event types: %s' % (', '.join(unhandled)))
Exemple #6
0
    def __init__(self, xml_string):
        """Parse an EKB XML string and preprocess its events.

        If the string cannot be parsed, an error is logged, ``tree`` is set
        to None and initialization stops, leaving only ``statements`` and
        ``tree`` set on the instance.
        """
        self.statements = []
        try:
            root = ET.XML(xml_string, parser=UTB())
        except ET.ParseError:
            logger.error('Could not parse XML string')
            self.tree = None
            return
        self.tree = root

        # The document ID is read from the root tag of the EKB
        self.doc_id = root.attrib.get('id')

        # Index paragraph and sentence texts by ID, and record the section
        # type (if any) of each paragraph
        par_elems = root.findall('input/paragraphs/paragraph')
        sent_elems = root.findall('input/sentences/sentence')
        par_texts = {}
        for par in par_elems:
            par_texts[par.attrib['id']] = par.text
        self.paragraphs = par_texts
        sent_texts = {}
        for sent in sent_elems:
            sent_texts[sent.attrib['id']] = sent.text
        self.sentences = sent_texts
        self.par_to_sec = {par.attrib['id']: par.attrib.get('sec-type')
                           for par in par_elems}

        # Events that are part of relations, and events subsumed by other
        # events, are tracked in these sets
        self.relation_events = set()
        self.subsumed_events = set()

        # Unhandled events are collected for development purposes
        self._unhandled_events = set()

        self._preprocess_events()
Exemple #7
0
def process_xml(xml_str):
    """Parse an XML string and process the resulting element tree.

    Returns the result of ``_process_elementtree`` on the parsed tree, or
    None (after logging an error) if the string cannot be parsed.
    """
    try:
        root = ET.XML(xml_str, parser=UTB())
    except ET.ParseError:
        logger.error('Could not parse XML string')
        return None
    else:
        return _process_elementtree(root)
Exemple #8
0
def get_abstract(doi):
    """Get the abstract text of an article from Elsevier given a doi.

    Parameters
    ----------
    doi : str
        The DOI passed on to ``download_article``.

    Returns
    -------
    str or None
        The text of the ``dc:description`` element found under
        ``article:coredata``. None is returned if the article could not be
        downloaded, the XML could not be parsed, or either element is
        missing from the document.
    """
    xml_string = download_article(doi)
    if xml_string is None:
        return None
    assert isinstance(xml_string, str)
    # ET.XML raises on malformed input rather than returning None, so a
    # parse failure has to be caught to yield the None result.
    try:
        xml_tree = ET.XML(xml_string.encode('utf-8'), parser=UTB())
    except ET.ParseError as e:
        logger.error('Could not parse XML for DOI %s: %s' % (doi, e))
        return None
    # find() returns None when no element matches; guard each lookup so a
    # document without these elements gives None instead of AttributeError.
    coredata = xml_tree.find('article:coredata', elsevier_ns)
    if coredata is None:
        logger.info('Could not find element article:coredata')
        return None
    abstract = coredata.find('dc:description', elsevier_ns)
    if abstract is None:
        logger.info('Could not find abstract element dc:description')
        return None
    return abstract.text
Exemple #9
0
def extract_paragraphs(xml_string):
    """Return paragraphs from the body of the given Elsevier XML.

    The result of ``_get_article_body`` is returned if it is non-empty;
    otherwise the result of ``_get_raw_text`` is returned as a
    single-element list. None is returned if the XML has no
    ``article:originalText`` element or neither helper yields content.
    """
    assert isinstance(xml_string, str)
    root = ET.XML(xml_string.encode('utf-8'), parser=UTB())
    original_text = root.find('article:originalText', elsevier_ns)
    if original_text is None:
        logger.info('Could not find full text element article:originalText')
        return None
    # Prefer the structured article body and fall back to the raw text
    body = _get_article_body(original_text)
    if body:
        return body
    raw = _get_raw_text(original_text)
    return [raw] if raw else None
Exemple #10
0
def send_request(url, data):
    """Send a GET request and return the response parsed as XML.

    Returns None if the request raises a ``requests`` exception (which is
    logged) or if the response status code is not 200.
    """
    try:
        response = requests.get(url, params=data)
    except requests.exceptions.RequestException as err:
        # Timeouts are reported with their own message; any other request
        # failure gets the generic one
        if isinstance(err, requests.exceptions.Timeout):
            logger.error('PubMed request timed out')
        else:
            logger.error('PubMed request exception')
        logger.error('url: %s, data: %s' % (url, data))
        logger.error(err)
        return None
    if response.status_code != 200:
        return None
    return ET.XML(response.content, parser=UTB())
Exemple #11
0
def get_hgnc_entry(hgnc_id):
    """Return the HGNC entry for the given HGNC ID from the web service.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID whose entry is requested.

    Returns
    -------
    xml_tree : ElementTree
        The parsed XML of the response for the given HGNC ID, or None if
        the response status code is not 200.
    """
    entry_url = hgnc_url + 'hgnc_id/%s' % hgnc_id
    response = requests.get(entry_url, headers={'Accept': '*/*'})
    if response.status_code != 200:
        return None
    return ET.XML(response.content, parser=UTB())
Exemple #12
0
def extract_text(xml_string):
    """Return the text content of the given Elsevier XML.

    The result of ``_get_article_body`` is returned if it is non-empty;
    otherwise the result of ``_get_raw_text`` is returned if non-empty.
    None is returned if the input is None, if the XML has no
    ``article:originalText`` element, or if neither helper yields content.
    """
    if xml_string is None:
        return None
    assert isinstance(xml_string, str)
    root = ET.XML(xml_string.encode('utf-8'), parser=UTB())
    original_text = root.find('article:originalText', elsevier_ns)
    if original_text is None:
        logger.info('Could not find full text element article:originalText')
        return None
    # Prefer the structured article body and fall back to the raw text
    body = _get_article_body(original_text)
    if body:
        return body
    raw = _get_raw_text(original_text)
    return raw if raw else None
Exemple #13
0
def process_xml(xml_str):
    """Return processor with Statements extracted from a Sparser XML.

    Parameters
    ----------
    xml_str : str
        The XML string obtained by reading content with Sparser, using the
        'xml' output mode.

    Returns
    -------
    sp : SparserXMLProcessor
        The result of processing the parsed XML tree, or None if the
        string could not be parsed (the parse error is logged).
    """
    try:
        root = ET.XML(xml_str, parser=UTB())
    except ET.ParseError as err:
        logger.error('Could not parse XML string')
        logger.error(err)
        return None
    else:
        return _process_elementtree(root)
Exemple #14
0
def test_unicode_tree_builder():
    """Parse a small UTF-8 encoded document with UTB and check the result.

    The ``bar`` element found in the parsed tree must satisfy
    ``unicode_strs``.
    """
    encoded = u'<html><bar>asdf</bar></html>'.encode('utf-8')
    parsed = ET.parse(BytesIO(encoded), parser=UTB())
    bar_elem = parsed.find('.//bar')
    assert unicode_strs(bar_elem)
Exemple #15
0
def send_request(url, data):
    """Send a GET request and return the response parsed as XML.

    Returns None if the response status code is not 200.
    """
    response = requests.get(url, params=data)
    if response.status_code != 200:
        return None
    return ET.XML(response.content, parser=UTB())
Exemple #16
0
 def get_xml_file(self, xml_file):
     """Return the parsed XML content of the given file.

     The bytes are obtained from ``get_uncompressed_bytes`` (with
     ``force_str=False``) and parsed with ``ET.XML``.
     """
     logger.info("Downloading %s" % (xml_file))
     raw_bytes = self.get_uncompressed_bytes(xml_file, force_str=False)
     logger.info("Parsing XML metadata")
     root = ET.XML(raw_bytes, parser=UTB())
     return root