def getPubmedDoi(pmid):
    """ Look up the DOI of a PubMed article through NCBI eutils (efetch).

    Downloads the XML record for `pmid` over HTTP and returns the text of the
    first ArticleId element whose IdType attribute is 'doi'. getTextFirst is
    called with default=None, so presumably None comes back when the record
    carries no DOI.
    """
    # NOTE(review): the "[email protected]" fragment in the query string looks
    # like a mangled e-mail parameter -- confirm against the intended URL.
    efetchUrl = (
        'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        '?db=pubmed&[email protected]&retmode=xml&id=%s'
    ) % pmid
    parser = maxXml.XmlParser(url=efetchUrl)
    articleDoi = parser.getTextFirst(
        "PubmedArticle/PubmedData/ArticleIdList/ArticleId",
        reqAttrDict={'IdType': 'doi'},
        default=None)
    logging.debug("Found DOI: %s" % articleDoi)
    return articleDoi
def getPubmedOutlinks(pmid, preferPmc=True):
    """ Pick one fulltext outlink URL for a PubMed article via eutils elink.

    Every ObjUrl entry of the elink "llinks" response is sorted into one of
    two lists:
      * publisher links: SubjectType is "publishers/providers" and the entry
        has a "full-text online" or "full-text PDF" attribute
      * aggregator links: SubjectType is "aggregators"
    All other entries are ignored.

    If there is at least one publisher link, the first one is returned, after
    dropping a leading "ukpmc" link and then a leading "swetswise" link as
    long as another link remains. Otherwise the aggregator links are searched
    for a PMC URL, which is only returned when `preferPmc` is true.

    Returns the chosen URL, or None if nothing suitable was found.
    """
    logging.debug("%s: Getting outlink from pubmed" % (pmid))
    elinkUrl = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=%s&retmode=llinks&cmd=llinks" % pmid
    parser = maxXml.XmlParser(url=elinkUrl)

    publisherLinks = []
    aggregatorLinks = []
    for linkEl in parser.getXmlAll("LinkSet/IdUrlList/IdUrlSet/ObjUrl"):
        linkUrl = linkEl.getTextFirst("Url")
        subjectType = linkEl.getTextFirst("SubjectType")

        if subjectType == "publishers/providers":
            # publisher links only count if they advertise fulltext
            attributes = list(linkEl.getTextAll("Attribute"))
            if "full-text online" in attributes or \
                    "full-text PDF" in attributes:
                publisherLinks.append(linkUrl)
            else:
                logging.log(
                    5,
                    "skipping url %s, does not seem to provide fulltext"
                    % linkUrl)
            continue

        logging.log(
            5,
            "skipping url %s, is not a URL to a provider/publisher" % linkUrl)
        if subjectType == "aggregators":
            # remembered as a fallback in case no publisher link exists
            aggregatorLinks.append(linkUrl)

    logging.debug("Found %d outlinks" % len(publisherLinks))
    logging.log(5, "Outlinks: %s" % str(publisherLinks))

    if publisherLinks:
        # skip an unwanted first link, but never empty the list
        for unwanted in ("ukpmc", "swetswise"):
            if unwanted in publisherLinks[0] and len(publisherLinks) > 1:
                publisherLinks.pop(0)
        chosen = publisherLinks[0]
        logging.debug("Using outlink: %s" % chosen)
        return chosen

    logging.debug("No Outlinks found, checking for PMC aggregator")
    # let's get rid of UKPMC, unless it is the only aggregator link
    if len(aggregatorLinks) > 1 and "ukpmc" in aggregatorLinks[0]:
        aggregatorLinks.pop(0)
    for candidate in aggregatorLinks:
        # httpStartsWith is defined elsewhere; presumably a URL prefix check
        if preferPmc and httpStartsWith("http://www.ncbi.nlm.nih.gov/pmc",
                                        candidate):
            logging.debug("Found PMC outlink")
            return candidate
    logging.debug("No PMC outlink")
    return None
# Example #3
# 0
def parsePubmedMedlineIter(xml):
    """
    Parse pubmed xml format and yield one dictionary per article, see
    parseMedline.

    Only the Pubmed layout <PubmedArticleSet><PubmedArticle>... is handled:
    for every PubmedArticle element, the MedlineCitation child is parsed with
    parseMedline and the resulting dictionary is then extended with the
    PubmedData child by parsePubmedFields.

    NCBI eutils sometimes omits the enclosing <PubmedArticleSet> tags, so a
    missing opening or closing tag is added before parsing.

    This is a generator: nothing is checked or parsed until iteration starts,
    so the errors below are raised on the first next(), not by the call.

    Raises:
        PubmedError: if `xml` is empty or only whitespace.
        ParseError, ParseError2: re-raised unchanged if the XML cannot be
            parsed (the offending XML is logged at debug level first).
    """
    recordTag = "PubmedArticle"
    openTag = "<PubmedArticleSet>"
    closeTag = "</PubmedArticleSet>"

    if xml.strip() == "":
        logging.error("Got empty XML file from NCBI")
        raise PubmedError("Got empty XML from NCBI", "pubmedEmptyXml")

    # NCBI eutils sometimes "forgets" the opening/closing tags
    if openTag not in xml:
        # logging.warn is a deprecated alias, removed in Python 3.13
        logging.warning("Addding opening tag")
        xml = openTag + "\n" + xml

    if closeTag not in xml:
        logging.warning("Addding closing tag")
        xml = xml + "\n" + closeTag

    logging.debug("Parsing pubmed file")
    try:
        topEl = maxXml.XmlParser(string=xml)
    except (ParseError, ParseError2):
        # lazy %-args: the (possibly large) XML is only formatted into the
        # message if debug logging is actually enabled
        logging.debug("Error on parsing this XML: %s", xml)
        raise

    for artEl in topEl.getXmlAll(recordTag):
        medlineCitEl = artEl.getXmlFirst("MedlineCitation")
        dataDict = parseMedline(medlineCitEl)

        pubmedCitEl = artEl.getXmlFirst("PubmedData")
        dataDict = parsePubmedFields(pubmedCitEl, dataDict)
        yield dataDict