def _extractClassInfo(self, cls):
    """Fill ``cls`` with the data found in its compounddef element.

    Returns True immediately when ``cls.dataAvailable`` is already set.
    Otherwise the compounddef for ``cls.refid`` is looked up and passed to
    ``XMLParser._extractCompoundTypeInfo``. When the resulting
    ``cls.compoundname`` is non-empty, the inheritance list and the short
    ``cls.name`` are set, every ``innerclass`` element is registered with
    the project with ``cls`` as parent, and each entry of
    ``cls.innerclasses`` is passed to ``_makeCompoundDataAvailable``.

    Returns False when no compounddef is found or the compound name is
    empty.
    """
    assert isinstance(cls, Class)
    if cls.dataAvailable:
        return True

    compounddef = self._findCompounddef(cls.refid)
    if compounddef is None:  # identity test for None, per PEP 8
        return False

    XMLParser._extractCompoundTypeInfo(cls, compounddef)
    if cls.compoundname == "":
        return False

    inheritInfo = [
        Class.InheritInfo(
            xmlutil.getText(basecompoundref, "refid"),
            basecompoundref.text,
            xmlutil.getText(basecompoundref, "prot"),
            xmlutil.getText(basecompoundref, "virt") == "virtual")
        for basecompoundref in compounddef.iter("basecompoundref")
    ]
    if inheritInfo:
        cls.inheritInfo = inheritInfo

    # rfind() yields -1 when there is no colon, so the slice then starts at
    # index 0 and keeps the whole compound name.
    cls.name = cls.compoundname[cls.compoundname.rfind(":") + 1:]

    for innerclass in compounddef.iter("innerclass"):
        self.project.addClass(innerclass.get("refid")).setParent(cls)
    for innerclass in cls.innerclasses:
        self._makeCompoundDataAvailable(
            innerclass, XMLParser._extractClassInfo)
    return True
def __init__(self, node):
    """Build a concept from a UMLS DOM node.

    Reads the ``id``, ``snomed``, ``score`` and ``negated`` attributes of
    ``node``, collects the text of every ``type`` descendant into
    ``self.types`` and sets ``inSnomed`` / ``inRxnorm`` when a ``source``
    descendant names SNOMEDCT / RXNORM.
    """
    # Defaults; ``sources`` is initialised but not filled by this method.
    self.types = set()
    self.sources = []
    self.inSnomed = False
    self.inRxnorm = False

    self.id = node.getAttribute("id")
    self.snomed = node.getAttribute("snomed")
    self.score = int(node.getAttribute("score"))
    self.isNegated = node.getAttribute("negated") == "true"

    for typeNode in node.getElementsByTagName("type"):
        self.types.add(xmlutil.getText(typeNode))

    for sourceNode in node.getElementsByTagName("source"):
        sourceName = xmlutil.getText(sourceNode)
        if sourceName == "SNOMEDCT":
            self.inSnomed = True
        elif sourceName == "RXNORM":
            self.inRxnorm = True
def load(self, node):
    """Read the exit-code range from the children of ``node``.

    The text of an ``l`` child becomes ``self.lower`` and the text of a
    ``u`` child becomes ``self.upper``, both converted with ``int``.
    Children with any other tag are ignored.
    """
    # (tag, attribute) pairs; compared with == because the tag name may
    # not be a plain str.
    targets = (("l", "lower"), ("u", "upper"))
    current = node.firstChild()
    while True:
        if current.isNull():
            break
        name = current.toElement().tagName()
        for wanted, attr in targets:
            if name == wanted:
                setattr(self, attr, int(xmlutil.getText(current)))
                break
        current = current.nextSibling()
def load(self, node):
    """Read an environment variable from the children of ``node``.

    The text of a ``name`` child is stored in ``self.e_name`` and the text
    of a ``value`` child in ``self.e_value``; other children are skipped.
    """
    entry = node.firstChild()
    while not entry.isNull():
        tag = entry.toElement().tagName()
        isName = tag == "name"
        if isName or tag == "value":
            text = xmlutil.getText(entry)
            if isName:
                self.e_name = text
            else:
                self.e_value = text
        entry = entry.nextSibling()
def load(self, node):
    """Load permissions from XML DOM.

    ``uperm``, ``gperm`` and ``operm`` children are converted with ``int``
    and stored in ``self.u_flags``, ``self.g_flags`` and ``self.o_flags``
    respectively. Other children are ignored.
    """
    # (tag, attribute) pairs; matched with == against the tag name.
    flagTargets = (
        ("uperm", "u_flags"),
        ("gperm", "g_flags"),
        ("operm", "o_flags"),
    )
    current = node.firstChild()
    while not current.isNull():
        tag = current.toElement().tagName()
        for wanted, attr in flagTargets:
            if tag == wanted:
                setattr(self, attr, int(xmlutil.getText(current)))
                break
        current = current.nextSibling()
def load(self, node):
    """Read a redirection from ``node``.

    ``self.action`` comes from the element's ``type`` attribute (default
    "0"), converted with ``int``. The ``fd`` and ``fd2`` children are
    stored as integers in ``self.fd`` / ``self.fd2`` and the ``file``
    child as text in ``self.filename``.
    """
    typeAttr = node.toElement().attribute("type", "0")
    self.action = int(str(typeAttr))

    part = node.firstChild()
    while True:
        if part.isNull():
            return
        tag = part.toElement().tagName()
        if tag == "file":
            self.filename = xmlutil.getText(part)
        elif tag == "fd":
            self.fd = int(xmlutil.getText(part))
        elif tag == "fd2":
            self.fd2 = int(xmlutil.getText(part))
        part = part.nextSibling()
def load(self, node):
    """Read a constant value from the children of ``node``.

    An ``intval`` child marks the constant as defined and integer and
    stores ``int`` of its text in ``self.value``; a ``textval`` child marks
    it as defined and non-integer and stores the text itself.
    """
    item = node.firstChild()
    while not item.isNull():
        tag = item.toElement().tagName()
        if tag == "intval":
            self.isdef, self.isint = True, True
            self.value = int(xmlutil.getText(item))
        elif tag == "textval":
            self.isdef, self.isint = True, False
            self.value = xmlutil.getText(item)
        item = item.nextSibling()
def __init__(self, pubInfoNode):
    """Parse a PublicationInformation element.

    Keeps deep copies of the first ``Journal``, ``AuthorList`` and
    ``PublicationTypeList`` descendants (None when absent) and the text of
    the first ``Country`` descendant ("" when absent).
    """
    assert pubInfoNode is not None
    assert isinstance(pubInfoNode, xml.dom.minidom.Element)

    def firstOrNone(tagName):
        """Return the first descendant called tagName, or None."""
        found = pubInfoNode.getElementsByTagName(tagName)
        return found[0] if len(found) > 0 else None

    def deepCopy(element):
        """Return a deep clone of element, passing None through."""
        return None if element is None else element.cloneNode(deep=True)

    self._journalNode = deepCopy(firstOrNone('Journal'))

    countryNode = firstOrNone('Country')
    self._country = "" if countryNode is None else xmlutil.getText(countryNode)

    self._authorListNode = deepCopy(firstOrNone('AuthorList'))
    self._publicationTypeListNode = deepCopy(firstOrNone('PublicationTypeList'))
def keepForDiabetesCorpus(xmldoc):
    """Return True if this abstract should be kept for the diabetes corpus.

    Only the ``AbstractText`` elements of the first ``Abstract`` element
    are examined. The abstract is kept when it contains at least one cost
    value or cost term and more than 100 tokens overall. A document
    without ``Abstract`` or ``AbstractText`` elements is rejected.
    """
    abstractNodes = xmldoc.getElementsByTagName('Abstract')
    if abstractNodes is None or len(abstractNodes) == 0:
        return False
    textNodeList = abstractNodes[0].getElementsByTagName('AbstractText')
    if textNodeList is None or len(textNodeList) == 0:
        return False

    cueLemmas = {"cost", "QALY", "QALYs"}
    costValueHits = 0
    costTermHits = 0
    seenTokens = 0
    for textNode in textNodeList:
        for sentenceText in sentenceSplitter.tokenize(xmlutil.getText(textNode)):
            words = tokenizer.tokenize(sentenceText)
            wordList = tokenlist.TokenList()
            wordList.convertStringList(words)
            for token in sentence.Sentence(wordList):
                seenTokens += 1
                lemmatizeabstracts.lemmatizeToken(token)
                # A cue lemma, or any token containing the substring "cost".
                if token.lemma in cueLemmas or 'cost' in token.text:
                    costTermHits += 1
                if cvFinder.tokenIsCostValue(token):
                    costValueHits += 1

    if seenTokens <= 100:
        return False
    return costValueHits > 0 or costTermHits > 0
def gethostvarname(node):
    """Extract a variable name in ``host:variable`` form.

    When the first child of ``node`` is a text node its data is returned
    as a str. Otherwise the ``host`` and ``name`` children are read; the
    result is the name alone when no host text was found, else
    ``host + ':' + name``.
    """
    first = node.firstChild()
    if first.isText():
        return str(first.toText().data())

    host = ""
    name = ""
    current = first
    while not current.isNull():
        tag = current.toElement().tagName()
        if tag == "host":
            host = xmlutil.getText(current)
        elif tag == "name":
            name = xmlutil.getText(current)
        current = current.nextSibling()

    return name if len(host) == 0 else host + ':' + name
def _extractCodeUnitInfo(codeunit, xmlelem):
    """Copy kind, name and source location from ``xmlelem`` into ``codeunit``.

    A fresh ``CodeUnit.Location`` is created first when
    ``codeunit.location`` is None. The location's ``file`` and ``line``
    come from the ``location`` tag; the line value is passed through
    ``_toLineNumber``.
    """
    assert isinstance(codeunit, CodeUnit)
    if codeunit.location is None:  # identity test for None, per PEP 8
        codeunit.location = CodeUnit.Location()
    location = codeunit.location

    codeunit.kind = xmlutil.getText(xmlelem, "kind")
    codeunit.name = xmlutil.findText(xmlelem, "name")
    location.file = xmlutil.findTagProp(xmlelem, "location", "file")
    location.line = _toLineNumber(
        xmlutil.findTagProp(xmlelem, "location", "line"))
def load(self, node):
    """Reset the limits and read them from the children of ``node``.

    ``inclnull`` is set to True when an ``inclnull`` child is present; the
    text of ``limuser``, ``limgroup`` and ``limqueue`` children is stored
    in the attributes of the same name. Absent values stay at False / "".
    """
    self.inclnull = False
    self.limuser = self.limgroup = self.limqueue = ""

    # Tags whose text is copied into the attribute of the same name.
    textTags = ("limuser", "limgroup", "limqueue")
    current = node.firstChild()
    while not current.isNull():
        tag = current.toElement().tagName()
        if tag == "inclnull":
            self.inclnull = True
        else:
            for wanted in textTags:
                if tag == wanted:
                    setattr(self, wanted, xmlutil.getText(current))
                    break
        current = current.nextSibling()
def load(self, node):
    """Load variable from XML DOM.

    ``name`` and ``comment`` children are stored as text, ``type`` and
    ``flags`` as integers, and ``value`` / ``vmode`` children are handed
    to the ``load`` method of ``self.var_value`` / ``self.var_mode``.
    Children with any other tag are ignored.
    """
    child = node.firstChild()
    while not child.isNull():
        tagn = child.toElement().tagName()
        if tagn == "name":
            self.var_name = xmlutil.getText(child)
        elif tagn == "comment":
            self.var_comment = xmlutil.getText(child)
        elif tagn == "value":
            self.var_value.load(child)
        elif tagn == "vmode":
            self.var_mode.load(child)
        elif tagn == "type":
            self.var_type = int(xmlutil.getText(child))
        elif tagn == "flags":
            # Was misspelled "xmutil", which raised NameError on any
            # <flags> child.
            self.var_flags = int(xmlutil.getText(child))
        child = child.nextSibling()
def parseXML(self, node=None):
    """Load information from an xml node.

    Does nothing when ``node`` is None. Otherwise ``self.type`` is set to
    the lower-cased ``type`` attribute, and for every element child the
    child's text is stored in ``self.attributes`` under its tag name.
    """
    if node is None:  # identity test for None, per PEP 8
        return

    self.type = node.getAttribute('type').lower()
    for childNode in node.childNodes:
        # Skip text, comment and other non-element children.
        if childNode.nodeType == xml.dom.Node.ELEMENT_NODE:
            self.attributes[childNode.tagName] = xmlutil.getText(childNode)
def load(self, node):
    """Read one server entry from ``node``.

    When the first child is an ``ip`` element the server is identified by
    address: ``servname`` is "", ``namebyip`` is True and ``servip`` is
    built from the child's text. Otherwise the child's text is the server
    name, which is resolved with ``socket.gethostbyname``; when that
    raises ``socket.gaierror`` a default ``gbnetid`` is stored instead.
    The optional ``alias`` attribute is stored in ``self.alias`` and
    ``self.autoconn`` is True only when the ``autoconn`` attribute is "y".
    """
    first = node.firstChild()
    byAddress = first.toElement().tagName() == "ip"
    if not byAddress:
        self.servname = xmlutil.getText(first)
        self.namebyip = False
        try:
            self.servip = gbnetid.gbnetid(socket.gethostbyname(self.servname))
        except socket.gaierror:
            # Name did not resolve: fall back to a default gbnetid.
            self.servip = gbnetid.gbnetid()
    else:
        self.servname = ""
        self.namebyip = True
        self.servip = gbnetid.gbnetid(xmlutil.getText(first))

    element = node.toElement()
    if element.hasAttribute("alias"):
        self.alias = str(element.attribute("alias"))
    self.autoconn = bool(element.hasAttribute("autoconn")
                         and element.attribute("autoconn") == "y")
def keepForIschemiaCorpus(xmldoc):
    """Return True if this abstract should be kept for the ischemia corpus.

    The abstract is kept when the tokens of all its ``AbstractText``
    elements include at least 4 for which ``isInteger()`` is true.
    """
    integerCount = sum(
        1
        for textNode in xmldoc.getElementsByTagName('AbstractText')
        for token in tokenizer.tokenize(xmlutil.getText(textNode))
        if token.isInteger()
    )
    return integerCount > 3
def getPublicationTypes(self):
    """Return the publication types of the abstract as a list of strings.

    Returns an empty list when no publication type list node was stored.
    ``PublicationType`` elements whose text is None or empty are skipped.
    """
    if self._publicationTypeListNode is None:
        return []

    pTypes = []
    for node in self._publicationTypeListNode.getElementsByTagName('PublicationType'):
        pType = xmlutil.getText(node)
        # Compare by value: the previous `is not ""` tested object identity
        # against a literal, whose outcome is implementation dependent.
        if pType is not None and pType != "":
            pTypes.append(pType)
    return pTypes
def parseXML(self, tNode, index, sentence):
    """Create a new token from an xml token element.

    tNode = xml token element
    index = the index of the element in the sentence (0 indexed)
    sentence = the Sentence object containing this token

    Sets text, lemma, part of speech, dependents, governors, annotations
    and labels, and adds to ``self.semanticTags`` and ``self.umlsConcepts``
    (both are expected to exist already).
    """
    self.sentence = sentence
    self.index = index
    self.text = xmlutil.normalizeText(tNode.getAttribute('text'))

    # Lower-case a sentence-initial word that is capitalised but not part
    # of an acronym (single capital, or capital followed by a lower-case
    # letter). The truthiness check avoids indexing an empty text.
    if self.index == 0 and self.text and 'A' <= self.text[0] <= 'Z' \
            and (len(self.text) == 1 or 'a' <= self.text[1] <= 'z'):
        self.text = self.text.lower()

    self.lemma = xmlutil.normalizeText(tNode.getAttribute('lemma'))
    if len(self.lemma) == 0:
        self.lemma = self.text

    self.pos = tNode.getAttribute('pos')
    if self.pos is None:
        self.pos = ''

    dNodes = tNode.getElementsByTagName('dep')
    self.dependents = parsetree.DependencyList(dNodes)
    gNodes = tNode.getElementsByTagName('gov')
    self.governors = parsetree.DependencyList(gNodes)

    # Drop governors that point back at this token. Iterate over a
    # snapshot: removing from the collection being iterated skips the
    # element that follows each removal.
    for gov in list(self.governors):
        if gov.index == self.index:
            self.governors.remove(gov)

    aNodes = tNode.getElementsByTagName('annotation')
    self.annotations = AnnotationList(aNodes)
    lNodes = tNode.getElementsByTagName('label')
    self.labels = AnnotationList(lNodes)

    for node in tNode.getElementsByTagName('semantic'):
        self.semanticTags.add(xmlutil.getText(node))

    for node in tNode.getElementsByTagName('umls'):
        self.umlsConcepts.append(umlsconcept.UMLSConcept(node))
def __init__(self, filename):
    """Parse the XML summary stored in the file called ``filename``.

    Sets ``self.id`` from the first ``Name`` element, ``self.groupNodes``
    to the ``Group`` elements of the first ``Subjects`` element (an empty
    list when there is none) and ``self.outcomeListNode`` to the single
    ``Outcomes`` element (None otherwise). ``self.htmlData`` is assigned
    only when exactly one ``HTMLData`` element exists.
    """
    xmldoc = xml.dom.minidom.parse(filename)

    pmidNodes = xmldoc.getElementsByTagName('Name')
    self.id = int(xmlutil.getText(pmidNodes[0]))

    subjectNodes = xmldoc.getElementsByTagName('Subjects')
    # The test used to be inverted and called getElementsByTagName on the
    # node list itself: documents without <Subjects> raised
    # AttributeError and documents with it never got their groups.
    if len(subjectNodes) > 0:
        self.groupNodes = subjectNodes[0].getElementsByTagName('Group')
    else:
        self.groupNodes = []

    olistNodes = xmldoc.getElementsByTagName('Outcomes')
    if len(olistNodes) == 1:
        self.outcomeListNode = olistNodes[0]
    else:
        self.outcomeListNode = None

    htmlSummaryNodes = xmldoc.getElementsByTagName('HTMLData')
    if len(htmlSummaryNodes) == 1:
        self.htmlData = xmlutil.getText(htmlSummaryNodes[0])
def parseXML(self, sNode, index, abstract):
    """Populate this sentence from an xml sentence element.

    sNode = xml sentence element
    index = stored as ``self.index``
    abstract = stored as ``self.abstract``

    Appends one token per ``token`` element, rewrites a final "." token as
    the ``-EOS-`` marker, builds the parse tree when a parse string is
    available, links dependency entries to their token objects, records
    the UMLS chunks and finally calls ``self.findSpecialValues()``.
    """
    self.section = sNode.getAttribute('section').replace(' ', '_')
    self.index = index
    self.abstract = abstract
    self.nlmCategory = sNode.getAttribute('nlmCategory')

    for i, node in enumerate(sNode.getElementsByTagName('token')):
        t = sentencetoken.Token()
        t.parseXML(node, i, self)
        self.tokens.append(t)

    # Guard against a sentence without tokens before looking at the last
    # one (previously an IndexError).
    if self.tokens and self.tokens[-1].text == '.':
        lastToken = self.tokens[-1]
        lastToken.text = '-EOS-'
        lastToken.lemma = '-EOS-'
        lastToken.pos = 'eos'

    # parse the parse tree
    pNodes = sNode.getElementsByTagName('parse')
    if len(pNodes) == 1:
        self.parseString = xmlutil.getText(pNodes[0])

    # build parse trees
    if len(self.parseString) > 0:
        self.parseTree = parsetree.ParseTreeNode()
        self.parseTree.buildParseTree(self.parseString, self.tokens)

    # Replace the indices in the dependency entries with token references
    # and collect the tokens that report themselves as roots.
    for token in self.tokens:
        for dep in token.dependents:
            dep.token = self.tokens[dep.index]
        for gov in token.governors:
            gov.token = self.tokens[gov.index]
        if token.isRoot():
            self.dependencyGraphRoot.append(token)

    # build list of umls terms in sentence
    for uNode in sNode.getElementsByTagName('umlsChunk'):
        umlsChunk = umlschunk.UMLSChunk(uNode, self)
        self.umlsChunks.append(umlsChunk)
        # endIdx is inclusive.
        for i in range(umlsChunk.startIdx, umlsChunk.endIdx + 1):
            self.tokens[i].umlsChunks.append(umlsChunk)

    # see if we can determine the types of some of the numbers
    self.findSpecialValues()
def load(self, node):
    """Read the server list from the children of ``node``.

    ``self.getlocaddr`` is reset to False. ``lahost`` is stored as text
    and ``laport`` as an integer. A ``servers`` child resets the lookup
    dictionaries and caches, then each of its children is loaded into a
    new ``gbserver`` that is passed to ``self.add``.
    """
    self.getlocaddr = False
    entry = node.firstChild()
    while not entry.isNull():
        tag = entry.toElement().tagName()
        if tag == "servers":
            # Start from empty lookup tables before re-adding servers.
            self.serversanyname = dict()
            self.serversbyname = dict()
            self.serversbyip = dict()
            self.namecache = dict()
            self.ipcache = dict()
            serverNode = entry.firstChild()
            while not serverNode.isNull():
                server = gbserver()
                server.load(serverNode)
                self.add(server)
                serverNode = serverNode.nextSibling()
        elif tag == "lahost":
            self.lahost = xmlutil.getText(entry)
        elif tag == "laport":
            self.laport = int(xmlutil.getText(entry))
        entry = entry.nextSibling()
def __init__(self, node):
    """Build a report from a DOM node.

    Scalar fields (id, gender, minAge, maxAge) are read with
    ``xmlutil.getTextFromNodeCalled``. Locations are the ``country`` texts
    under the first ``location_countries`` element. Conditions,
    interventions and outcomes wrap every matching descendant, and the
    eligibility / inclusion / exclusion lists wrap the ``criteria``
    elements under the first element of that name.
    """
    def criteriaUnder(tagName):
        """ReportEntry list for the criteria of the first tagName element."""
        parents = node.getElementsByTagName(tagName)
        if len(parents) == 0:
            return []
        return [ReportEntry(criterion)
                for criterion in parents[0].getElementsByTagName('criteria')]

    self.id = xmlutil.getTextFromNodeCalled('id', node)
    self.gender = xmlutil.getTextFromNodeCalled('gender', node)
    self.minAge = xmlutil.getTextFromNodeCalled('minAge', node)
    self.maxAge = xmlutil.getTextFromNodeCalled('maxAge', node)

    self.locations = []
    countryLists = node.getElementsByTagName('location_countries')
    if len(countryLists) > 0:
        self.locations = [
            xmlutil.getText(country)
            for country in countryLists[0].getElementsByTagName('country')]

    self.conditions = [ReportEntry(condition)
                       for condition in node.getElementsByTagName('condition')]

    self.eligibilityCriteria = criteriaUnder('eligibility')
    self.inclusionCriteria = criteriaUnder('inclusion')
    self.exclusionCriteria = criteriaUnder('exclusion')

    self.interventions = [Intervention(item)
                          for item in node.getElementsByTagName('intervention')]
    self.outcomes = [Outcome(item)
                     for item in node.getElementsByTagName('outcome')]
def load(self, node):
    """Read an assignment from ``node``.

    ``self.bja_op`` comes from the element's ``type`` attribute (default
    "0"), converted with ``int``. ``self.bja_iscrit`` is True only when an
    ``iscrit`` child exists. ``vname`` is decoded with ``gethostvarname``,
    ``const`` is passed to ``self.bja_con.load`` and ``flags`` is stored
    as an integer.
    """
    typeAttr = node.toElement().attribute("type", "0")
    self.bja_op = int(str(typeAttr))
    self.bja_iscrit = False

    part = node.firstChild()
    while True:
        if part.isNull():
            break
        tag = part.toElement().tagName()
        if tag == "iscrit":
            self.bja_iscrit = True
        elif tag == "flags":
            self.bja_flags = int(xmlutil.getText(part))
        elif tag == "vname":
            self.bja_varname = gethostvarname(part)
        elif tag == "const":
            self.bja_con.load(part)
        part = part.nextSibling()
def keepForDiabetesCorpusCostValue(xmldoc):
    """Return True if this abstract should be kept for the diabetes corpus.

    The abstract is kept when at least *one* token in its ``AbstractText``
    elements is recognised by ``cvFinder.tokenIsCostValue``. Every token
    is lemmatized before it is tested and all tokens are visited.
    """
    costValueHits = 0
    for textNode in xmldoc.getElementsByTagName('AbstractText'):
        for sentenceText in sentenceSplitter.tokenize(xmlutil.getText(textNode)):
            words = tokenizer.tokenize(sentenceText)
            wordList = tokenlist.TokenList()
            wordList.convertStringList(words)
            for token in sentence.Sentence(wordList):
                lemmatizeabstracts.lemmatizeToken(token)
                if cvFinder.tokenIsCostValue(token):
                    costValueHits += 1
    return costValueHits > 0
def load(self, node):
    """Load save time details from XML file.

    ``self.tc_istime`` is True when the element's ``timeset`` attribute is
    "y" (default "n"). The text of every child is converted with ``int``
    and stored in the ``tc_*`` attribute named after its tag
    (``nexttime``, ``repeat``, ``rate``, ``mday``, ``nvaldays``,
    ``nposs``). The conversion happens before the tag is checked, so it
    applies to every child.
    """
    self.tc_istime = node.toElement().attribute("timeset", "n") == "y"
    child = node.firstChild()
    while not child.isNull():
        tagn = child.toElement().tagName()
        value = int(xmlutil.getText(child))
        if tagn == "nexttime":
            self.tc_nexttime = value
        elif tagn == "repeat":
            self.tc_repeat = value
        elif tagn == "rate":
            self.tc_rate = value
        elif tagn == "mday":
            # Previously assigned to tc_rate (copy-paste slip), which
            # clobbered the rate and never stored the day of month.
            self.tc_mday = value
        elif tagn == "nvaldays":
            self.tc_nvaldays = value
        elif tagn == "nposs":
            self.tc_nposs = value
        child = child.nextSibling()
def _extractHeaderInfo(self, header):
    """Fill ``header`` with the data found in its compounddef element.

    Returns True immediately when ``header.dataAvailable`` is already set
    and False when no compounddef is found for ``header.refid``.
    Otherwise the common compound data is extracted, the header name is
    updated, and the ``includes``, ``innernamespace`` and ``innerclass``
    elements are appended to the header's lists; True is then returned.
    """
    assert isinstance(header, Header)
    if header.dataAvailable:
        return True

    compounddef = self._findCompounddef(header.refid)
    if compounddef is None:  # identity test for None, per PEP 8
        return False

    XMLParser._extractCompoundTypeInfo(header, compounddef)
    header.updateName()

    for include in compounddef.iter("includes"):
        isLocal = xmlutil.getText(include, "local") == "yes"
        header.includes.append(Header.Include(include.text, isLocal))

    for innernamespaceDB in compounddef.iter("innernamespace"):
        header.namespaces.append(
            self.project.addNamespace(innernamespaceDB.get("refid")))

    for innerclassDB in compounddef.iter("innerclass"):
        header.innerclasses.append(
            self.project.addClass(innerclassDB.get("refid")))

    return True
def load(self, parent):
    """Read the saved port numbers from the children of ``parent``.

    Each recognised tag is converted with ``int`` and stored in the
    matching attribute; children with other tags are ignored.
    """
    # (tag, attribute) pairs; matched with == against the tag name.
    portTargets = (
        ("CONN_TCP", "connect_tcp"),
        ("CONN_UDP", "connect_udp"),
        ("CLIENT_ACCESS", "client_access"),
        ("JOBVIEW", "jobview"),
        ("API_TCP", "api_tcp"),
        ("API_UDP", "api_udp"),
    )
    current = parent.firstChild()
    while not current.isNull():
        tag = current.toElement().tagName()
        for wanted, attr in portTargets:
            if tag == wanted:
                setattr(self, attr, int(xmlutil.getText(current)))
                break
        current = current.nextSibling()
print "downloaded registry information is written to the file '<PMID>.nct.xml'" sys.exit() nctCount = 0 isrctnCount = 0 nctPattern = re.compile('.*NCT\s*\d+.*') for i in range(1, len(sys.argv)): file = sys.argv[i] print file xmldoc = minidom.parse(file) idNodeList = xmldoc.getElementsByTagName('AccessionNumber') for node in idNodeList: id = xmlutil.getText(node) if len(id) > 3 and id[0:3] == 'NCT': try: # fetchCmd = 'http://clinicaltrials.gov/show/'+id+'?resultsxml=true' fetchCmd = 'http://clinicaltrials.gov/show/'+id+'?displayxml=true' print 'Downloading:', fetchCmd doc = urllib2.urlopen(fetchCmd) out = open(id+'.xml', 'w') out.write(doc.read()) out.close() nctCount += 1 except: print '***Could not download:', fetchCmd elif len(id) > 6 and id[0:6] == 'ISRCTN': print id isrctnID = id
# Collect the ids to skip. Only the part of each line before the first '.'
# is used. The remainder used to be unpacked into a variable called `xml`,
# which rebound the name of the xml package and broke the
# xml.dom.minidom.parse() call below; unpacking also raised ValueError on
# any line without exactly one '.'.
for line in file.readlines():
    ignoredPmid = line.split('.', 1)[0].strip()
    if ignoredPmid:
        ignoreSet.add(ignoredPmid)

# Both paths are later used as prefixes, so make sure they end in '/'.
if inputPath[-1] != '/':
    inputPath += '/'
if outputPath[-1] != '/':
    outputPath += '/'

# initialize sentence splitter and tokenizer
sentenceSplitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
cvFinder = costvaluefinder.CostValueFinder()

# Copy every abstract that has a PMID, is not in the ignore set and passes
# the diabetes-corpus filter. Files without a PMID element are skipped.
fileList = glob.glob(inputPath + '*.xml')
for filename in fileList:
    xmldoc = xml.dom.minidom.parse(filename)
    pmidNodes = xmldoc.getElementsByTagName('PMID')
    if len(pmidNodes) > 0:
        pmid = xmlutil.getText(pmidNodes[0])
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        if pmid in ignoreSet:
            print(' '.join([pmid, 'already annotated']))
        elif keepForDiabetesCorpus(xmldoc):
            # copy abstract
            print(' '.join(['Copying: ', filename]))
            shutil.copy(filename, outputPath)
print "downloaded registry information is written to the file '<PMID>.nct.xml'" sys.exit() nctCount = 0 isrctnCount = 0 nctPattern = re.compile('.*NCT\s*\d+.*') for i in range(1, len(sys.argv)): file = sys.argv[i] print file xmldoc = minidom.parse(file) idNodeList = xmldoc.getElementsByTagName('AccessionNumber') for node in idNodeList: id = xmlutil.getText(node) if len(id) > 3 and id[0:3] == 'NCT': try: # fetchCmd = 'http://clinicaltrials.gov/show/'+id+'?resultsxml=true' fetchCmd = 'http://clinicaltrials.gov/show/' + id + '?displayxml=true' print 'Downloading:', fetchCmd doc = urllib2.urlopen(fetchCmd) out = open(id + '.xml', 'w') out.write(doc.read()) out.close() nctCount += 1 except: print '***Could not download:', fetchCmd elif len(id) > 6 and id[0:6] == 'ISRCTN': print id isrctnID = id
def load(self, node):
    """Load job from XML DOM.

    Walks the direct children of ``node`` and dispatches on the tag name.
    Scalar tags are stored in ``bj_*`` attributes (converted with ``int``
    for the numeric ones); ``jmode``, ``times``, ``nexit`` and ``eexit``
    are handed to the ``load`` method of the matching sub-object; the
    container tags ``conds``, ``asses``, ``args``, ``envs`` and ``redirs``
    replace the corresponding list with one entry per matching grandchild.
    Children with any other tag are ignored.
    """
    def named(parent, wanted):
        """Yield the direct children of parent whose tag equals wanted."""
        kid = parent.firstChild()
        while not kid.isNull():
            if kid.toElement().tagName() == wanted:
                yield kid
            kid = kid.nextSibling()

    def attrFor(tag, table):
        """Return the attribute paired with tag in table, or None."""
        for known, target in table:
            if tag == known:
                return target
        return None

    # (tag, attribute) pairs whose text is stored as an integer.
    intTags = (
        ("progress", "bj_progress"),
        ("pri", "bj_pri"),
        ("ll", "bj_ll"),
        ("umask", "bj_umask"),
        ("ulimit", "bj_ulimit"),
        ("jflags", "bj_jflags"),
        ("runtime", "bj_runtime"),
        ("autoksig", "bj_autoksig"),
        ("runon", "bj_runon"),
        ("deltime", "bj_deltime"),
    )
    # (tag, attribute) pairs whose text is stored unchanged.
    textTags = (
        ("title", "bj_title"),
        ("direct", "bj_direct"),
        ("cmdinterp", "bj_cmdinterp"),
    )

    current = node.firstChild()
    while not current.isNull():
        tag = current.toElement().tagName()
        intAttr = attrFor(tag, intTags)
        textAttr = attrFor(tag, textTags)
        if intAttr is not None:
            setattr(self, intAttr, int(xmlutil.getText(current)))
        elif textAttr is not None:
            setattr(self, textAttr, xmlutil.getText(current))
        elif tag == "jmode":
            self.bj_mode.load(current)
        elif tag == "times":
            self.bj_times.load(current)
        elif tag == "nexit":
            self.exitn.load(current)
        elif tag == "eexit":
            self.exite.load(current)
        elif tag == "conds":
            self.bj_conds = []
            for condNode in named(current, "cond"):
                condition = jcond()
                condition.load(condNode)
                self.bj_conds.append(condition)
        elif tag == "asses":
            self.bj_asses = []
            for assNode in named(current, "ass"):
                assignment = jass()
                assignment.load(assNode)
                self.bj_asses.append(assignment)
        elif tag == "args":
            self.bj_arg = []
            for argNode in named(current, "arg"):
                self.bj_arg.append(xmlutil.getText(argNode))
        elif tag == "envs":
            self.bj_env = []
            for envNode in named(current, "env"):
                variable = envir()
                variable.load(envNode)
                self.bj_env.append(variable)
        elif tag == "redirs":
            self.bj_redirs = []
            for redirNode in named(current, "redir"):
                redirection = redir()
                redirection.load(redirNode)
                self.bj_redirs.append(redirection)
        current = current.nextSibling()
else: targetIdSet = set([]) searchCmd = eutils + 'esearch.fcgi?db=pubmed' + searchArgs print searchCmd searchResults = urllib2.urlopen(searchCmd) # parse results xmldoc = xml.dom.minidom.parseString(searchResults.read()) idNodeList = xmldoc.getElementsByTagName('Id') print 'Number of documents found =', len(idNodeList) #idNodeList = open('abs.txt', 'r').readlines() searchResultSet = set([]) for idNode in idNodeList: id = xmlutil.getText(idNode) id.strip() searchResultSet.add(id) if len(targetIdSet) > 0: missingAbstracts = list(targetIdSet - searchResultSet) missingAbstracts.sort() foundIds = list(targetIdSet.intersection(searchResultSet)) foundIds.sort() print len(foundIds), 'abstracts found' for pmid in foundIds: print pmid print len(missingAbstracts), 'Missing abstracts:' for pmid in missingAbstracts: print pmid
# Collect the ids to skip. Only the part of each line before the first '.'
# is used. The remainder used to be unpacked into a variable called `xml`,
# which rebound the name of the xml package and broke the
# xml.dom.minidom.parse() call below; unpacking also raised ValueError on
# any line without exactly one '.'.
for line in file.readlines():
    ignoredPmid = line.split('.', 1)[0].strip()
    if ignoredPmid:
        ignoreSet.add(ignoredPmid)

# Both paths are later used as prefixes, so make sure they end in '/'.
if inputPath[-1] != '/':
    inputPath += '/'
if outputPath[-1] != '/':
    outputPath += '/'

# initialize sentence splitter and tokenizer
sentenceSplitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
cvFinder = costvaluefinder.CostValueFinder()

# Copy every abstract that has a PMID, is not in the ignore set and passes
# the diabetes-corpus filter. Files without a PMID element are skipped.
fileList = glob.glob(inputPath + '*.xml')
for filename in fileList:
    xmldoc = xml.dom.minidom.parse(filename)
    pmidNodes = xmldoc.getElementsByTagName('PMID')
    if len(pmidNodes) > 0:
        pmid = xmlutil.getText(pmidNodes[0])
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        if pmid in ignoreSet:
            print(' '.join([pmid, 'already annotated']))
        elif keepForDiabetesCorpus(xmldoc):
            # copy abstract
            print(' '.join(['Copying: ', filename]))
            shutil.copy(filename, outputPath)
def _extractMemberInfo(member, xmlMemberdef):
    """Fill ``member`` from its memberdef element.

    The shared code-unit fields are set by
    ``XMLParser._extractCodeUnitInfo``; ``member.isStatic`` is True when
    the ``static`` value is "yes", and ``member.scope`` is the ``prot``
    value.
    """
    assert isinstance(member, Member)
    XMLParser._extractCodeUnitInfo(member, xmlMemberdef)

    staticText = xmlutil.getText(xmlMemberdef, "static")
    member.isStatic = staticText == "yes"
    member.scope = xmlutil.getText(xmlMemberdef, "prot")
from abstract import Abstract

# Script body: write the sorted, lower-cased text of every DescriptorName
# element found in the *.xml files of <INPUT_PATH> to <OUTPUT_FILE>, one
# term per line.
if len(sys.argv) < 3:
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Usage: meshlist.py <INPUT_PATH> <OUTPUT_FILE>")
    print("output list of mesh terms for all abstracts")
    print("in the directory specified by <INPUT_PATH>")
    sys.exit()

inputPath = sys.argv[1]
fileList = glob.glob(inputPath+'/*.xml')

meshTerms = set()
for filename in fileList:
    print(filename)
    xmldoc = minidom.parse(filename)
    dNodes = xmldoc.getElementsByTagName('DescriptorName')
    for node in dNodes:
        meshTerms.add(xmlutil.getText(node).lower())
    # NOTE: terms are read straight from the XML; an earlier variant
    # (disabled in the original) went through Abstract(filename) and its
    # meshHeadingList instead.

meshTerms = sorted(meshTerms)

# `with` closes the output file even when a write fails.
with open(sys.argv[2], 'w') as out:
    for term in meshTerms:
        out.write(term+'\n')
else: targetIdSet = set([]) searchCmd = eutils + 'esearch.fcgi?db=pubmed'+ searchArgs print searchCmd searchResults = urllib2.urlopen(searchCmd) # parse results xmldoc = xml.dom.minidom.parseString(searchResults.read()) idNodeList = xmldoc.getElementsByTagName('Id') print 'Number of documents found =', len(idNodeList) #idNodeList = open('abs.txt', 'r').readlines() searchResultSet = set([]) for idNode in idNodeList: id = xmlutil.getText(idNode) id.strip() searchResultSet.add(id) if len(targetIdSet) > 0: missingAbstracts = list(targetIdSet - searchResultSet) missingAbstracts.sort() foundIds = list(targetIdSet.intersection(searchResultSet)) foundIds.sort() print len(foundIds), 'abstracts found' for pmid in foundIds: print pmid print len(missingAbstracts), 'Missing abstracts:' for pmid in missingAbstracts: print pmid