def getPubmedDoi(pmid):
    """ Look up the DOI of a PubMed article through the NCBI eutils efetch service.

    pmid is interpolated into the efetch URL with %s. The response is parsed
    by maxXml.XmlParser, and the first ArticleId element carrying the
    attribute IdType="doi" is requested, with None passed as the default
    (presumably returned when no such element exists -- depends on
    maxXml.XmlParser.getTextFirst).
    """
    # NOTE(review): the "[email protected]" fragment looks like a mangled
    # "email=..." query parameter -- confirm against the intended address.
    efetchTemplate = ('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
                      '?db=pubmed&[email protected]&retmode=xml&id=%s')
    parser = maxXml.XmlParser(url=efetchTemplate % pmid)

    articleIdPath = "PubmedArticle/PubmedData/ArticleIdList/ArticleId"
    doi = parser.getTextFirst(articleIdPath,
                              reqAttrDict={'IdType': 'doi'},
                              default=None)
    logging.debug("Found DOI: %s" % doi)
    return doi
def getPubmedOutlinks(pmid, preferPmc=True):
    """ Pick one fulltext outlink URL for a PubMed article via the eutils elink service.

    Every ObjUrl element of the elink "llinks" response is sorted into one of
    two lists:
      * publisher links: SubjectType is "publishers/providers" and the
        attributes include "full-text online" or "full-text PDF"
      * aggregator links: SubjectType is "aggregators"
    Everything else is skipped (logged at level 5).

    If there are publisher links, the first one is returned, after dropping a
    leading "ukpmc" and then a leading "swetswise" entry as long as another
    link remains. Otherwise, the aggregator links are searched for a PMC URL
    (only when preferPmc is true); if none matches, None is returned.
    """
    logging.debug("%s: Getting outlink from pubmed" % pmid)
    elinkUrl = ("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
                "?dbfrom=pubmed&id=%s&retmode=llinks&cmd=llinks") % pmid
    parser = maxXml.XmlParser(url=elinkUrl)

    publisherLinks = []
    aggregatorLinks = []
    for linkEl in parser.getXmlAll("LinkSet/IdUrlList/IdUrlSet/ObjUrl"):
        linkUrl = linkEl.getTextFirst("Url")
        subjectType = linkEl.getTextFirst("SubjectType")

        if subjectType == "publishers/providers":
            # publisher link: keep it only if it advertises fulltext
            attributes = list(linkEl.getTextAll("Attribute"))
            if "full-text online" in attributes or "full-text PDF" in attributes:
                publisherLinks.append(linkUrl)
            else:
                logging.log(5, "skipping url %s, does not seem to provide fulltext" % linkUrl)
            continue

        # not a publisher: remember aggregators as a fallback, drop the rest
        logging.log(5, "skipping url %s, is not a URL to a provider/publisher" % linkUrl)
        if subjectType == "aggregators":
            aggregatorLinks.append(linkUrl)

    logging.debug("Found %d outlinks" % len(publisherLinks))
    logging.log(5, "Outlinks: %s" % str(publisherLinks))

    if publisherLinks:
        # skip over a leading UKPMC / swetswise link when there is an alternative
        for unwantedHost in ("ukpmc", "swetswise"):
            if unwantedHost in publisherLinks[0] and len(publisherLinks) > 1:
                publisherLinks.pop(0)
        chosen = publisherLinks[0]
        logging.debug("Using outlink: %s" % chosen)
        return chosen

    logging.debug("No Outlinks found, checking for PMC aggregator")
    # length is tested first so an empty list is never indexed
    if len(aggregatorLinks) > 1 and "ukpmc" in aggregatorLinks[0]:
        aggregatorLinks.pop(0)

    for candidate in aggregatorLinks:
        if preferPmc and httpStartsWith("http://www.ncbi.nlm.nih.gov/pmc", candidate):
            logging.debug("Found PMC outlink")
            return candidate

    logging.debug("No PMC outlink")
    return None
def parsePubmedMedlineIter(xml):
    """ Parse a PubMed XML string and yield one parsed record per PubmedArticle.

    Records come either from Pubmed as a
    <PubmedArticleSet><PubmedArticle><MedlineCitation>...
    or from Medline as
    <MedlineCitationSet><MedlineCitation>...</MedlineCitation>
    Only the PubmedArticle layout is handled by this function.

    For each PubmedArticle element, its MedlineCitation child is passed to
    parseMedline and the result is then passed, together with the PubmedData
    child, to parsePubmedFields; the value returned by parsePubmedFields is
    yielded (see parseMedline for the record layout).

    xml: the XML document as a string. If it lacks the literal
         "<PubmedArticleSet>" or "</PubmedArticleSet>" tag, the missing tag
         is added before parsing.

    Raises PubmedError if xml is empty or whitespace-only. ParseError and
    ParseError2 raised while parsing are logged at debug level and re-raised.
    As this is a generator, none of this happens before the first iteration.
    """
    recordTag = "PubmedArticle"
    closeTag = "</PubmedArticleSet>"
    openTag = "<PubmedArticleSet>"

    if xml.strip() == "":
        logging.error("Got empty XML file from NCBI")
        raise PubmedError("Got empty XML from NCBI", "pubmedEmptyXml")

    # NCBI eutils sometimes "forgets" the opening/closing tags
    # (logging.warning is used because the logging.warn alias is deprecated)
    if openTag not in xml:
        logging.warning("Addding opening tag")
        xml = openTag + "\n" + xml
    if closeTag not in xml:
        logging.warning("Addding closing tag")
        xml = xml + "\n" + closeTag

    logging.debug("Parsing pubmed file")
    try:
        topEl = maxXml.XmlParser(string=xml)
    except (ParseError, ParseError2):
        # dump the offending document for debugging, then let the caller decide
        logging.debug("Error on parsing this XML: %s" % xml)
        raise

    for artEl in topEl.getXmlAll(recordTag):
        medlineCitEl = artEl.getXmlFirst("MedlineCitation")
        dataDict = parseMedline(medlineCitEl)

        pubmedCitEl = artEl.getXmlFirst("PubmedData")
        dataDict = parsePubmedFields(pubmedCitEl, dataDict)
        yield dataDict