Python getText Examples, xmlutil.getText Python Examples

Example #1

0

Show file

File: xmlparser.py Project: sepcon/Scripting

    def _extractClassInfo(self, cls):
        """Populate ``cls`` from its ``compounddef`` XML element.

        Copies the compound type information, builds the inheritance list
        from the ``basecompoundref`` elements, derives the unqualified class
        name and registers every ``innerclass`` with the project before
        making the inner classes' data available.

        Returns True when the data is already available or was extracted,
        False when no compound definition is found or the compound name is
        empty.
        """
        assert (isinstance(cls, Class))
        if cls.dataAvailable:
            return True

        compounddef = self._findCompounddef(cls.refid)
        if compounddef is None:
            return False

        XMLParser._extractCompoundTypeInfo(cls, compounddef)
        if cls.compoundname == "":
            return False

        inheritInfo = [
            Class.InheritInfo(
                xmlutil.getText(basecompoundref, "refid"),
                basecompoundref.text,
                xmlutil.getText(basecompoundref, "prot"),
                xmlutil.getText(basecompoundref, "virt") == "virtual")
            for basecompoundref in compounddef.iter("basecompoundref")
        ]
        if inheritInfo:
            cls.inheritInfo = inheritInfo

        # The unqualified name is whatever follows the last ':'.  rfind()
        # yields -1 when there is no ':', which makes the slice start at 0
        # and keeps the whole compound name.
        cls.name = cls.compoundname[cls.compoundname.rfind(":") + 1:]

        # Inner classes are handled for every class.  Previously this part
        # (and the final "return True") only ran for names containing ':',
        # so an unqualified class skipped its inner classes and the method
        # fell through returning None.
        for innerclass in compounddef.iter("innerclass"):
            self.project.addClass(innerclass.get("refid")).setParent(cls)
        for innerclass in cls.innerclasses:
            self._makeCompoundDataAvailable(
                innerclass, XMLParser._extractClassInfo)

        return True

Example #2

0

Show file

File: umlsconcept.py Project: rlsummerscales/acres

    def __init__(self, node):
        """ Build a concept element from a UMLS XML node.

        Reads the "id", "snomed", "score" and "negated" attributes, gathers
        the text of every "type" child into ``self.types`` and records
        whether "SNOMEDCT" / "RXNORM" occur among the "source" children.
        """
        # defaults, in place before anything is read from the node
        self.id = ""
        self.types = set()
        self.snomed = ""
        self.sources = []
        self.score = 0
        self.isNegated = False
        self.inSnomed = False
        self.inRxnorm = False

        # attribute values
        self.id = node.getAttribute("id")
        self.snomed = node.getAttribute("snomed")
        self.score = int(node.getAttribute("score"))
        self.isNegated = (node.getAttribute("negated") == "true")

        # semantic types
        for typeNode in node.getElementsByTagName("type"):
            self.types.add(xmlutil.getText(typeNode))

        # vocabularies the concept was found in
        for sourceNode in node.getElementsByTagName("source"):
            sourceName = xmlutil.getText(sourceNode)
            if sourceName == "SNOMEDCT":
                self.inSnomed = True
            elif sourceName == "RXNORM":
                self.inRxnorm = True

Example #3

0

Show file

File: btclasses.py Project: gitGNU/gnu_gnubatch

 def load(self, node):
     """Read the "l" (lower) and "u" (upper) exit code bounds from the children of node"""
     elem = node.firstChild()
     while True:
         if elem.isNull():
             break
         tag = elem.toElement().tagName()
         if tag == "u":
             self.upper = int(xmlutil.getText(elem))
         elif tag == "l":
             self.lower = int(xmlutil.getText(elem))
         elem = elem.nextSibling()

Example #4

0

Show file

File: btclasses.py Project: gitGNU/gnu_gnubatch

 def load(self, node):
     """Fill e_name / e_value from the "name" and "value" children of node"""
     def children(parent):
         # walk the sibling chain starting at the first child
         cur = parent.firstChild()
         while not cur.isNull():
             yield cur
             cur = cur.nextSibling()

     for elem in children(node):
         tag = elem.toElement().tagName()
         if tag == "name":
             self.e_name = xmlutil.getText(elem)
         elif tag == "value":
             self.e_value = xmlutil.getText(elem)

Example #5

0

Show file

File: btmode.py Project: gitGNU/gnu_gnubatch

 def load(self, node):
     """Load the user, group and other permission flags from an XML DOM node"""
     elem = node.firstChild()
     while True:
         if elem.isNull():
             return
         tag = elem.toElement().tagName()
         if tag == "uperm":
             self.u_flags = int(xmlutil.getText(elem))
         elif tag == "gperm":
             self.g_flags = int(xmlutil.getText(elem))
         elif tag == "operm":
             self.o_flags = int(xmlutil.getText(elem))
         elem = elem.nextSibling()

Example #6

0

Show file

File: btclasses.py Project: gitGNU/gnu_gnubatch

 def load(self, node):
     """Load a redirection: the action from the "type" attribute, the rest from child elements"""
     typeAttr = node.toElement().attribute("type", "0")
     self.action = int(str(typeAttr))

     def children(parent):
         # walk the sibling chain starting at the first child
         cur = parent.firstChild()
         while not cur.isNull():
             yield cur
             cur = cur.nextSibling()

     for elem in children(node):
         tag = elem.toElement().tagName()
         if tag == "file":
             self.filename = xmlutil.getText(elem)
         elif tag == "fd":
             self.fd = int(xmlutil.getText(elem))
         elif tag == "fd2":
             self.fd2 = int(xmlutil.getText(elem))

Example #7

0

Show file

File: btclasses.py Project: gitGNU/gnu_gnubatch

 def load(self, node):
     """Load a constant from XML DOM: an "intval" child gives an integer, a "textval" child a string"""
     elem = node.firstChild()
     while True:
         if elem.isNull():
             break
         tag = elem.toElement().tagName()
         if tag == "intval":
             self.isdef, self.isint = True, True
             self.value = int(xmlutil.getText(elem))
         elif tag == "textval":
             self.isdef, self.isint = True, False
             self.value = xmlutil.getText(elem)
         elem = elem.nextSibling()

Example #8

0

Show file

File: publicationinfo.py Project: rlsummerscales/acres

    def __init__(self, pubInfoNode):
        """ Capture the parts of a PublicationInformation element.

            Keeps deep copies of the first Journal, AuthorList and
            PublicationTypeList descendants (None when absent) and the text
            of the first Country descendant ("" when absent).
        """
        assert pubInfoNode is not None
        assert isinstance(pubInfoNode, xml.dom.minidom.Element)

        def firstTagged(tagName):
            # first descendant with the given tag name, or None
            matches = pubInfoNode.getElementsByTagName(tagName)
            if len(matches) > 0:
                return matches[0]
            return None

        self._journalNode = None
        self._authorListNode = None
        self._country = ""
        self._publicationTypeListNode = None

        journal = firstTagged('Journal')
        if journal is not None:
            self._journalNode = journal.cloneNode(deep=True)

        country = firstTagged('Country')
        if country is not None:
            self._country = xmlutil.getText(country)

        authorList = firstTagged('AuthorList')
        if authorList is not None:
            self._authorListNode = authorList.cloneNode(deep=True)

        publicationTypeList = firstTagged('PublicationTypeList')
        if publicationTypeList is not None:
            self._publicationTypeListNode = publicationTypeList.cloneNode(deep=True)

Example #9

0

Show file

File: filterabstracts.py Project: olabknbit/acres

def keepForDiabetesCorpus(xmldoc):
    """ Decide whether an abstract is kept for the diabetes corpus.
        True only when the AbstractText of the first Abstract element contains
        at least one cost value or cost term AND more than 100 tokens in total.
    """
    abstractNodes = xmldoc.getElementsByTagName('Abstract')
    if not abstractNodes:
        return False

    textNodes = abstractNodes[0].getElementsByTagName('AbstractText')
    if not textNodes:
        return False

    cueLemmas = {"cost", "QALY", "QALYs"}
    costTerms = 0
    costValues = 0
    seenTokens = 0

    for textNode in textNodes:
        for sentenceText in sentenceSplitter.tokenize(xmlutil.getText(textNode)):
            words = tokenizer.tokenize(sentenceText)
            sentenceTokens = tokenlist.TokenList()
            sentenceTokens.convertStringList(words)
            for token in sentence.Sentence(sentenceTokens):
                seenTokens += 1
                lemmatizeabstracts.lemmatizeToken(token)
                # a cue lemma, or any token text containing "cost"
                if token.lemma in cueLemmas or token.text.find('cost') >= 0:
                    costTerms += 1
                if cvFinder.tokenIsCostValue(token):
                    costValues += 1

    mentionsCost = costValues > 0 or costTerms > 0
    return mentionsCost and seenTokens > 100

Example #10

0

Show file

File: filterabstracts.py Project: rlsummerscales/acres

def keepForDiabetesCorpus(xmldoc):
    """ Decide whether an abstract is kept for the diabetes corpus.
        True only when the AbstractText of the first Abstract element contains
        at least one cost value or cost term AND more than 100 tokens in total.
    """
    abstractNodes = xmldoc.getElementsByTagName('Abstract')
    if not abstractNodes:
        return False

    textNodes = abstractNodes[0].getElementsByTagName('AbstractText')
    if not textNodes:
        return False

    cueLemmas = {"cost", "QALY", "QALYs"}
    costTerms = 0
    costValues = 0
    seenTokens = 0

    for textNode in textNodes:
        for sentenceText in sentenceSplitter.tokenize(xmlutil.getText(textNode)):
            words = tokenizer.tokenize(sentenceText)
            sentenceTokens = tokenlist.TokenList()
            sentenceTokens.convertStringList(words)
            for token in sentence.Sentence(sentenceTokens):
                seenTokens += 1
                lemmatizeabstracts.lemmatizeToken(token)
                # a cue lemma, or any token text containing "cost"
                if token.lemma in cueLemmas or token.text.find('cost') >= 0:
                    costTerms += 1
                if cvFinder.tokenIsCostValue(token):
                    costValues += 1

    mentionsCost = costValues > 0 or costTerms > 0
    return mentionsCost and seenTokens > 100

Example #11

0

Show file

File: btclasses.py Project: gitGNU/gnu_gnubatch

def gethostvarname(node):
    """Return the variable name stored under node, as "host:name" when a host is present.

When the first child of node is a text node its data is returned as is;
otherwise the "host" and "name" child elements are read and combined."""
    first = node.firstChild()
    if first.isText():
        return str(first.toText().data())

    host = ""
    name = ""
    elem = first
    while True:
        if elem.isNull():
            break
        tag = elem.toElement().tagName()
        if tag == "host":
            host = xmlutil.getText(elem)
        elif tag == "name":
            name = xmlutil.getText(elem)
        elem = elem.nextSibling()

    # no host part: just the bare variable name
    if len(host) == 0:
        return name
    return host + ':' + name

Example #12

0

Show file

File: xmlparser.py Project: sepcon/Scripting

 def _extractCodeUnitInfo(codeunit, xmlelem):
     """Copy kind, name and source location from xmlelem into codeunit.

     A missing ``codeunit.location`` is replaced by a fresh
     ``CodeUnit.Location`` before the file and line are stored.
     """
     assert (isinstance(codeunit, CodeUnit))
     # identity test: "== None" would go through any custom __eq__
     if codeunit.location is None:
         codeunit.location = CodeUnit.Location()
     codeunit.kind = xmlutil.getText(xmlelem, "kind")
     codeunit.name = xmlutil.findText(xmlelem, "name")
     codeunit.location.file = xmlutil.findTagProp(xmlelem, "location",
                                                  "file")
     codeunit.location.line = _toLineNumber(
         xmlutil.findTagProp(xmlelem, "location", "line"))

Example #13

0

Show file

File: dispopts.py Project: gitGNU/gnu_gnubatch

 def load(self, node):
     """Reset the display limits, then read them back from the children of an XML node"""
     self.inclnull = False
     self.limuser = self.limgroup = self.limqueue = ""

     elem = node.firstChild()
     while True:
         if elem.isNull():
             break
         tag = elem.toElement().tagName()
         if tag == "inclnull":
             # the mere presence of the element switches the flag on
             self.inclnull = True
         elif tag == "limuser":
             self.limuser = xmlutil.getText(elem)
         elif tag == "limgroup":
             self.limgroup = xmlutil.getText(elem)
         elif tag == "limqueue":
             self.limqueue = xmlutil.getText(elem)
         elem = elem.nextSibling()

Example #14

0

Show file

File: btclasses.py Project: gitGNU/gnu_gnubatch

 def load(self, node):
     """Load variable from XML DOM.

     Walks the children of node: "name" and "comment" are stored as text,
     "type" and "flags" as integers, while "value" and "vmode" are handed
     to the load() methods of self.var_value and self.var_mode.
     """
     child = node.firstChild()
     while not child.isNull():
         tagn = child.toElement().tagName()
         if tagn == "name":
             self.var_name = xmlutil.getText(child)
         elif tagn == "comment":
             self.var_comment = xmlutil.getText(child)
         elif tagn == "value":
             self.var_value.load(child)
         elif tagn == "vmode":
             self.var_mode.load(child)
         elif tagn == "type":
             self.var_type = int(xmlutil.getText(child))
         elif tagn == "flags":
             # was "xmutil.getText", a misspelling that raised NameError
             # as soon as a "flags" element was encountered
             self.var_flags = int(xmlutil.getText(child))
         child = child.nextSibling()

Example #15

0

Show file

File: annotation.py Project: rlsummerscales/acres

 def parseXML(self, node=None):
     """ Load information from an xml node.

     Does nothing when node is None.  Otherwise the lower-cased "type"
     attribute becomes self.type and every child *element* contributes one
     entry to self.attributes, keyed by its tag name and holding its text.
     """
     # identity test instead of "!= None", which would go through __ne__
     if node is None:
         return
     self.type = node.getAttribute('type').lower()
     for childNode in node.childNodes:
         # text, comment, ... nodes carry no attribute values
         if childNode.nodeType == xml.dom.Node.ELEMENT_NODE:
             self.attributes[childNode.tagName] = xmlutil.getText(childNode)

Example #16

0

Show file

File: annotation.py Project: olabknbit/acres

 def parseXML(self, node=None):
     """ Load information from an xml node.

     Does nothing when node is None.  Otherwise the lower-cased "type"
     attribute becomes self.type and every child *element* contributes one
     entry to self.attributes, keyed by its tag name and holding its text.
     """
     # identity test instead of "!= None", which would go through __ne__
     if node is None:
         return
     self.type = node.getAttribute('type').lower()
     for childNode in node.childNodes:
         # text, comment, ... nodes carry no attribute values
         if childNode.nodeType == xml.dom.Node.ELEMENT_NODE:
             self.attributes[childNode.tagName] = xmlutil.getText(childNode)

Example #17

0

Show file

 def load(self, node):
     """Restore the server state from an XML node.

     The first child is either an "ip" element, whose text is used as the
     address directly, or an element holding the server name, which is then
     looked up (an empty gbnetid is stored when the lookup fails).  The
     "alias" and "autoconn" settings come from attributes of node itself.
     """
     first = node.firstChild()
     givenByIp = first.toElement().tagName() == "ip"
     if not givenByIp:
         self.servname = xmlutil.getText(first)
         self.namebyip = False
         try:
             resolved = socket.gethostbyname(self.servname)
             self.servip = gbnetid.gbnetid(resolved)
         except socket.gaierror:
             self.servip = gbnetid.gbnetid()
     else:
         self.servname = ""
         self.namebyip = True
         self.servip = gbnetid.gbnetid(xmlutil.getText(first))

     elem = node.toElement()
     if elem.hasAttribute("alias"):
         self.alias = str(elem.attribute("alias"))
     # only the exact value "y" switches auto-connect on
     self.autoconn = bool(elem.hasAttribute("autoconn")
                          and elem.attribute("autoconn") == "y")

Example #18

0

Show file

File: filterabstracts.py Project: olabknbit/acres

def keepForIschemiaCorpus(xmldoc):
    """ Decide whether an abstract is kept for the ischemia corpus.
        True when the AbstractText elements contain four or more integer tokens.
    """
    integerCount = 0
    for textNode in xmldoc.getElementsByTagName('AbstractText'):
        words = tokenizer.tokenize(xmlutil.getText(textNode))
        # NOTE(review): relies on the tokens offering isInteger() - confirm
        # this against what tokenizer.tokenize() actually returns
        integerCount += sum(1 for word in words if word.isInteger())

    return integerCount >= 4

Example #19

0

Show file

File: publicationinfo.py Project: rlsummerscales/acres

    def getPublicationTypes(self):
        """
         Return list of strings describing the type of publication that the abstract is.

         The list is empty when no PublicationTypeList node was stored;
         PublicationType entries without text are left out.
        """
        if self._publicationTypeListNode is None:
            return []
        pTypes = []
        publicationTypeNodes = self._publicationTypeListNode.getElementsByTagName('PublicationType')
        for node in publicationTypeNodes:
            pType = xmlutil.getText(node)
            # compare by value: 'is not ""' tested object identity, so an
            # empty string that is a different object slipped through
            if pType is not None and pType != "":
                pTypes.append(pType)

        return pTypes

Example #20

0

Show file

File: sentencetoken.py Project: rlsummerscales/acres

    def parseXML(self, tNode, index, sentence):
        """ create a new token from an xml token element.
            tNode = xml token element
            index = the index of the element in the sentence (0 indexed)
            sentence = the Sentence object containing this token

            Reads the text, lemma and part of speech attributes, the
            dependency ("dep"/"gov") lists, annotations, labels, semantic
            tags and UMLS concepts found below tNode.
            """
        self.sentence = sentence
        self.index = index

        self.text = xmlutil.normalizeText(tNode.getAttribute('text'))
        # Lower-case a sentence-initial word that is capitalized but does not
        # look like part of an acronym.  The length check keeps an empty
        # text attribute from raising IndexError.
        if self.index == 0 and len(self.text) > 0 \
                and 'A' <= self.text[0] <= 'Z' \
                and (len(self.text) == 1 or 'a' <= self.text[1] <= 'z'):
            self.text = self.text.lower()

        self.lemma = xmlutil.normalizeText(tNode.getAttribute('lemma'))
        if len(self.lemma) == 0:
            self.lemma = self.text

        self.pos = tNode.getAttribute('pos')
        if self.pos is None:
            self.pos = ''

        dNodes = tNode.getElementsByTagName('dep')
        self.dependents = parsetree.DependencyList(dNodes)

        gNodes = tNode.getElementsByTagName('gov')
        self.governors = parsetree.DependencyList(gNodes)
        # Drop governors that point back at this token.  Iterate over a
        # snapshot: removing from the collection being iterated skips the
        # element that follows each removal.
        for gov in list(self.governors):
            if gov.index == self.index:
                self.governors.remove(gov)

        aNodes = tNode.getElementsByTagName('annotation')
        self.annotations = AnnotationList(aNodes)

        lNodes = tNode.getElementsByTagName('label')
        self.labels = AnnotationList(lNodes)

        for node in tNode.getElementsByTagName('semantic'):
            self.semanticTags.add(xmlutil.getText(node))

        for node in tNode.getElementsByTagName('umls'):
            self.umlsConcepts.append(umlsconcept.UMLSConcept(node))

Example #21

0

Show file

File: filterabstracts.py Project: rlsummerscales/acres

def keepForIschemiaCorpus(xmldoc):
    """ Decide whether an abstract is kept for the ischemia corpus.
        True when the AbstractText elements contain four or more integer tokens.
    """
    integerCount = 0
    for textNode in xmldoc.getElementsByTagName('AbstractText'):
        words = tokenizer.tokenize(xmlutil.getText(textNode))
        # NOTE(review): relies on the tokens offering isInteger() - confirm
        # this against what tokenizer.tokenize() actually returns
        integerCount += sum(1 for word in words if word.isInteger())

    return integerCount >= 4

Example #22

0

Show file

File: rankabstracts.py Project: rlsummerscales/acres

    def __init__(self, filename):
        """Given the name of a file containing the XML summary,
         parse the file and read its contents.

         Sets self.id from the first 'Name' element, self.groupNodes from
         the 'Group' elements below the first 'Subjects' element (empty list
         when there is none), self.outcomeListNode when exactly one
         'Outcomes' element exists (None otherwise) and self.htmlData when
         exactly one 'HTMLData' element exists.
        """
        xmldoc = xml.dom.minidom.parse(filename)
        pmidNodes = xmldoc.getElementsByTagName('Name')
        self.id = int(xmlutil.getText(pmidNodes[0]))
        subjectNodes = xmldoc.getElementsByTagName('Subjects')

        # The test used to be "== 0" and called getElementsByTagName on the
        # node *list*: documents without 'Subjects' raised AttributeError and
        # documents with one never had their groups read.
        if len(subjectNodes) > 0:
            self.groupNodes = subjectNodes[0].getElementsByTagName('Group')
        else:
            self.groupNodes = []

        olistNodes = xmldoc.getElementsByTagName('Outcomes')
        if len(olistNodes) == 1:
            self.outcomeListNode = olistNodes[0]
        else:
            self.outcomeListNode = None

        htmlSummaryNodes = xmldoc.getElementsByTagName('HTMLData')
        if len(htmlSummaryNodes) == 1:
            self.htmlData = xmlutil.getText(htmlSummaryNodes[0])

Example #23

0

Show file

File: sentence.py Project: rlsummerscales/acres

    def parseXML(self, sNode, index, abstract):
        """ Fill this sentence from an xml sentence element.
            sNode = xml sentence element
            index = value stored as self.index
            abstract = value stored as self.abstract

            Parses the tokens, the parse tree (when exactly one 'parse'
            element is present), the dependency links between tokens and
            the UMLS chunks, then calls findSpecialValues().
        """
        self.section = sNode.getAttribute('section').replace(' ', '_')
        self.index = index
        self.abstract = abstract
        self.nlmCategory = sNode.getAttribute('nlmCategory')

        for tokenIdx, node in enumerate(sNode.getElementsByTagName('token')):
            t = sentencetoken.Token()
            t.parseXML(node, tokenIdx, self)
            self.tokens.append(t)
        # Mark a final '.' as the end-of-sentence token.  The emptiness check
        # keeps a sentence element without tokens from raising IndexError.
        if self.tokens and self.tokens[-1].text == '.':
            self.tokens[-1].text = '-EOS-'
            self.tokens[-1].lemma = '-EOS-'
            self.tokens[-1].pos = 'eos'

        # parse the parse tree
        pNodes = sNode.getElementsByTagName('parse')
        if len(pNodes) == 1:
            self.parseString = xmlutil.getText(pNodes[0])
            # build parse trees
            if len(self.parseString) > 0:
                self.parseTree = parsetree.ParseTreeNode()
                self.parseTree.buildParseTree(self.parseString, self.tokens)

                # resolve dependency indices into token references
                for token in self.tokens:
                    for dep in token.dependents:
                        dep.token = self.tokens[dep.index]
                    for gov in token.governors:
                        gov.token = self.tokens[gov.index]
                    if token.isRoot():
                        self.dependencyGraphRoot.append(token)

        # build list of umls terms in sentence
        for uNode in sNode.getElementsByTagName('umlsChunk'):
            umlsChunk = umlschunk.UMLSChunk(uNode, self)
            self.umlsChunks.append(umlsChunk)
            # every token inside the chunk's inclusive index range gets a
            # back reference to the chunk
            for chunkIdx in range(umlsChunk.startIdx, umlsChunk.endIdx + 1):
                self.tokens[chunkIdx].umlsChunks.append(umlsChunk)

        # see if we can determine the types of some of the numbers
        self.findSpecialValues()

Example #24

0

Show file

 def load(self, node):
     """Load the server list from an XML node.

     "lahost" and "laport" children are stored directly; a "servers" child
     resets the lookup tables and adds one gbserver per element below it.
     """
     self.getlocaddr = False
     elem = node.firstChild()
     while True:
         if elem.isNull():
             break
         tag = elem.toElement().tagName()
         if tag == "lahost":
             self.lahost = xmlutil.getText(elem)
         elif tag == "laport":
             self.laport = int(xmlutil.getText(elem))
         elif tag == "servers":
             # start from empty tables before the servers are added
             self.serversanyname = {}
             self.serversbyname = {}
             self.serversbyip = {}
             self.namecache = {}
             self.ipcache = {}
             entry = elem.firstChild()
             while not entry.isNull():
                 server = gbserver()
                 server.load(entry)
                 self.add(server)
                 entry = entry.nextSibling()
         elem = elem.nextSibling()

Example #25

0

Show file

File: nctreport.py Project: rlsummerscales/acres

    def __init__(self, node):
        """ Collect the fields of a report from its XML node: id, gender,
            age limits, location countries, conditions, the three criteria
            lists, interventions and outcomes.
        """
        self.id = xmlutil.getTextFromNodeCalled('id', node)
        self.gender = xmlutil.getTextFromNodeCalled('gender', node)
        self.minAge = xmlutil.getTextFromNodeCalled('minAge', node)
        self.maxAge = xmlutil.getTextFromNodeCalled('maxAge', node)

        def criteriaUnder(tagName):
            # ReportEntry for each 'criteria' below the first tagName element
            holders = node.getElementsByTagName(tagName)
            if len(holders) == 0:
                return []
            return [ReportEntry(entry)
                    for entry in holders[0].getElementsByTagName('criteria')]

        # countries listed below the first 'location_countries' element
        countryHolders = node.getElementsByTagName('location_countries')
        if len(countryHolders) > 0:
            self.locations = [
                xmlutil.getText(countryNode)
                for countryNode in countryHolders[0].getElementsByTagName('country')]
        else:
            self.locations = []

        self.conditions = [ReportEntry(conditionNode)
                           for conditionNode in node.getElementsByTagName('condition')]

        self.eligibilityCriteria = criteriaUnder('eligibility')
        self.inclusionCriteria = criteriaUnder('inclusion')
        self.exclusionCriteria = criteriaUnder('exclusion')

        self.interventions = [Intervention(interventionNode)
                              for interventionNode in node.getElementsByTagName('intervention')]

        self.outcomes = [Outcome(outcomeNode)
                         for outcomeNode in node.getElementsByTagName('outcome')]

Example #26

0

Show file

File: btclasses.py Project: gitGNU/gnu_gnubatch

 def load(self, node):
     """Load an assignment from XML DOM: operation from the "type" attribute, details from the children"""
     typeAttr = node.toElement().attribute("type", "0")
     self.bja_op = int(str(typeAttr))
     self.bja_iscrit = False

     elem = node.firstChild()
     while True:
         if elem.isNull():
             break
         tag = elem.toElement().tagName()
         if tag == "iscrit":
             # presence of the element is enough to set the flag
             self.bja_iscrit = True
         elif tag == "vname":
             self.bja_varname = gethostvarname(elem)
         elif tag == "const":
             self.bja_con.load(elem)
         elif tag == "flags":
             self.bja_flags = int(xmlutil.getText(elem))
         elem = elem.nextSibling()

Example #27

0

Show file

File: filterabstracts.py Project: rlsummerscales/acres

def keepForDiabetesCorpusCostValue(xmldoc):
    """ Decide whether an abstract is kept for the diabetes corpus.
        True when the AbstractText elements contain at least *one* currency value.
    """
    foundValues = 0
    for textNode in xmldoc.getElementsByTagName('AbstractText'):
        for sentenceText in sentenceSplitter.tokenize(xmlutil.getText(textNode)):
            words = tokenizer.tokenize(sentenceText)
            sentenceTokens = tokenlist.TokenList()
            sentenceTokens.convertStringList(words)
            for token in sentence.Sentence(sentenceTokens):
                lemmatizeabstracts.lemmatizeToken(token)
                if cvFinder.tokenIsCostValue(token):
                    foundValues += 1

    return foundValues > 0

Example #28

0

Show file

File: timecon.py Project: gitGNU/gnu_gnubatch

 def load(self, node):
     """Load save time details from XML file.

     tc_istime comes from the "timeset" attribute ("y" means set); every
     child element is read as an integer and stored in the tc_* attribute
     named after its tag.
     """
     self.tc_istime = node.toElement().attribute("timeset", "n") == "y"
     child = node.firstChild()
     while not child.isNull():
         tagn = child.toElement().tagName()
         # every child, whatever its tag, is expected to hold an integer
         value = int(xmlutil.getText(child))
         if tagn == "nexttime":
             self.tc_nexttime = value
         elif tagn == "repeat":
             self.tc_repeat = value
         elif tagn == "rate":
             self.tc_rate = value
         elif tagn == "mday":
             # was "self.tc_rate = value", which silently overwrote the
             # rate with the day of month; every other tag maps onto the
             # attribute of the same name
             # NOTE(review): confirm the attribute is spelled tc_mday in the class
             self.tc_mday = value
         elif tagn == "nvaldays":
             self.tc_nvaldays = value
         elif tagn == "nposs":
             self.tc_nposs = value
         child = child.nextSibling()

Example #29

0

Show file

File: filterabstracts.py Project: olabknbit/acres

def keepForDiabetesCorpusCostValue(xmldoc):
    """ Decide whether an abstract is kept for the diabetes corpus.
        True when the AbstractText elements contain at least *one* currency value.
    """
    foundValues = 0
    for textNode in xmldoc.getElementsByTagName('AbstractText'):
        for sentenceText in sentenceSplitter.tokenize(xmlutil.getText(textNode)):
            words = tokenizer.tokenize(sentenceText)
            sentenceTokens = tokenlist.TokenList()
            sentenceTokens.convertStringList(words)
            for token in sentence.Sentence(sentenceTokens):
                lemmatizeabstracts.lemmatizeToken(token)
                if cvFinder.tokenIsCostValue(token):
                    foundValues += 1

    return foundValues > 0

Example #30

0

Show file

File: xmlparser.py Project: sepcon/Scripting

    def _extractHeaderInfo(self, header):
        """Populate ``header`` from its ``compounddef`` XML element.

        Copies the compound type information, refreshes the header name and
        records its includes, inner namespaces and inner classes.

        Returns True when the data is already available or was extracted,
        False when no compound definition is found for ``header.refid``.
        """
        assert (isinstance(header, Header))
        if header.dataAvailable:
            return True

        compounddef = self._findCompounddef(header.refid)
        # identity test instead of "== None"
        if compounddef is None:
            return False

        XMLParser._extractCompoundTypeInfo(header, compounddef)
        header.updateName()
        for include in compounddef.iter("includes"):
            # second argument: whether the "local" property reads "yes"
            header.includes.append(
                Header.Include(include.text,
                               xmlutil.getText(include, "local") == "yes"))

        for innernamespaceDB in compounddef.iter("innernamespace"):
            header.namespaces.append(
                self.project.addNamespace(innernamespaceDB.get("refid")))
        for innerclassDB in compounddef.iter("innerclass"):
            header.innerclasses.append(
                self.project.addClass(innerclassDB.get("refid")))

        return True

Example #31

0

Show file

    def load(self, parent):
        """Read the saved port numbers from the child elements of parent"""

        # element tag -> attribute that receives its integer value
        portFields = (
            ("CONN_TCP", "connect_tcp"),
            ("CONN_UDP", "connect_udp"),
            ("CLIENT_ACCESS", "client_access"),
            ("JOBVIEW", "jobview"),
            ("API_TCP", "api_tcp"),
            ("API_UDP", "api_udp"),
        )

        elem = parent.firstChild()
        while not elem.isNull():
            tag = elem.toElement().tagName()
            for wanted, attr in portFields:
                if tag == wanted:
                    setattr(self, attr, int(xmlutil.getText(elem)))
                    break
            elem = elem.nextSibling()

Example #32

0

Show file

File: downloadregistry.py Project: rlsummerscales/acres

  print "downloaded registry information is written to the file '<PMID>.nct.xml'" 
  sys.exit()

nctCount = 0
isrctnCount = 0
  
nctPattern = re.compile('.*NCT\s*\d+.*')

for i in range(1, len(sys.argv)):
  file = sys.argv[i]
  print file
  
  xmldoc = minidom.parse(file)
  idNodeList = xmldoc.getElementsByTagName('AccessionNumber')
  for node in idNodeList:
    id = xmlutil.getText(node)    
    if len(id) > 3 and id[0:3] == 'NCT':
      try:
#        fetchCmd = 'http://clinicaltrials.gov/show/'+id+'?resultsxml=true'
        fetchCmd = 'http://clinicaltrials.gov/show/'+id+'?displayxml=true'
        print 'Downloading:', fetchCmd
        doc = urllib2.urlopen(fetchCmd)
        out = open(id+'.xml', 'w')
        out.write(doc.read())
        out.close()
        nctCount += 1
      except:
        print '***Could not download:', fetchCmd
    elif len(id) > 6 and id[0:6] == 'ISRCTN':
      print id
      isrctnID = id

Example #33

0

Show file

File: filterabstracts.py Project: rlsummerscales/acres

    for line in file.readlines():
        # Each line is expected to look like "<pmid>.xml"; only the PMID part
        # is kept.  The extension used to be unpacked into a variable named
        # "xml", which (when this runs at module scope) rebinds the xml
        # package name that the script needs later for xml.dom.minidom.
        entry = line.strip()
        if not entry:
            # blank lines carry no PMID
            continue
        pmid = entry.split('.')[0]
        ignoreSet.add(pmid)

if inputPath[-1] != '/':
    inputPath += '/'
if outputPath[-1] != '/':
    outputPath += '/'

# initialize sentence splitter and tokenizer
sentenceSplitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

cvFinder = costvaluefinder.CostValueFinder()

fileList = glob.glob(inputPath+'*.xml')
for filename in fileList:
    xmldoc = xml.dom.minidom.parse(filename)
    pmidNodes = xmldoc.getElementsByTagName('PMID')
    if len(pmidNodes) > 0:
        pmid = xmlutil.getText(pmidNodes[0])
        if pmid in ignoreSet:
            print pmid, 'already annotated'
        else:
            #        if keepForIschemiaCorpus(xmldoc):
            if keepForDiabetesCorpus(xmldoc):
                # copy abstract
                print 'Copying: ', filename
                shutil.copy(filename, outputPath)

Example #34

0

Show file

File: downloadregistry.py Project: olabknbit/acres

    print "downloaded registry information is written to the file '<PMID>.nct.xml'"
    sys.exit()

nctCount = 0
isrctnCount = 0

nctPattern = re.compile('.*NCT\s*\d+.*')

for i in range(1, len(sys.argv)):
    file = sys.argv[i]
    print file

    xmldoc = minidom.parse(file)
    idNodeList = xmldoc.getElementsByTagName('AccessionNumber')
    for node in idNodeList:
        id = xmlutil.getText(node)
        if len(id) > 3 and id[0:3] == 'NCT':
            try:
                #        fetchCmd = 'http://clinicaltrials.gov/show/'+id+'?resultsxml=true'
                fetchCmd = 'http://clinicaltrials.gov/show/' + id + '?displayxml=true'
                print 'Downloading:', fetchCmd
                doc = urllib2.urlopen(fetchCmd)
                out = open(id + '.xml', 'w')
                out.write(doc.read())
                out.close()
                nctCount += 1
            except:
                print '***Could not download:', fetchCmd
        elif len(id) > 6 and id[0:6] == 'ISRCTN':
            print id
            isrctnID = id

Example #35

0

Show file

File: btclasses.py Project: gitGNU/gnu_gnubatch

 def load(self, node):
     """Load a job from XML DOM.

     Scalar children are stored in the bj_* attribute named after their tag
     (as integer or text), "jmode"/"times"/"nexit"/"eexit" are delegated to
     the load() of the matching member, and the list containers ("conds",
     "asses", "args", "envs", "redirs") are rebuilt from their entries.
     """
     intTags = ("progress", "pri", "ll", "umask", "ulimit", "jflags",
                "runtime", "autoksig", "runon", "deltime")
     textTags = ("title", "direct", "cmdinterp")

     def named(tag, names):
         # the entry of names equal to tag, or None
         for name in names:
             if tag == name:
                 return name
         return None

     def matching(parent, wanted):
         # children of parent whose tag is wanted, in document order
         sub = parent.firstChild()
         while not sub.isNull():
             if sub.toElement().tagName() == wanted:
                 yield sub
             sub = sub.nextSibling()

     item = node.firstChild()
     while not item.isNull():
         tag = item.toElement().tagName()
         intName = named(tag, intTags)
         textName = named(tag, textTags)
         if intName is not None:
             setattr(self, "bj_" + intName, int(xmlutil.getText(item)))
         elif textName is not None:
             setattr(self, "bj_" + textName, xmlutil.getText(item))
         elif tag == "jmode":
             self.bj_mode.load(item)
         elif tag == "times":
             self.bj_times.load(item)
         elif tag == "conds":
             self.bj_conds = []
             for sub in matching(item, "cond"):
                 cond = jcond()
                 cond.load(sub)
                 self.bj_conds.append(cond)
         elif tag == "asses":
             self.bj_asses = []
             for sub in matching(item, "ass"):
                 ass = jass()
                 ass.load(sub)
                 self.bj_asses.append(ass)
         elif tag == "args":
             self.bj_arg = []
             for sub in matching(item, "arg"):
                 self.bj_arg.append(xmlutil.getText(sub))
         elif tag == "envs":
             self.bj_env = []
             for sub in matching(item, "env"):
                 env = envir()
                 env.load(sub)
                 self.bj_env.append(env)
         elif tag == "redirs":
             self.bj_redirs = []
             for sub in matching(item, "redir"):
                 rd = redir()
                 rd.load(sub)
                 self.bj_redirs.append(rd)
         elif tag == "nexit":
             self.exitn.load(item)
         elif tag == "eexit":
             self.exite.load(item)
         item = item.nextSibling()

Example #36

0

Show file

File: getabstracts.py Project: olabknbit/acres

else:
    targetIdSet = set([])

searchCmd = eutils + 'esearch.fcgi?db=pubmed' + searchArgs
print searchCmd
searchResults = urllib2.urlopen(searchCmd)

# parse results
xmldoc = xml.dom.minidom.parseString(searchResults.read())
idNodeList = xmldoc.getElementsByTagName('Id')
print 'Number of documents found =', len(idNodeList)

#idNodeList = open('abs.txt', 'r').readlines()
searchResultSet = set([])
for idNode in idNodeList:
    id = xmlutil.getText(idNode)
    id.strip()
    searchResultSet.add(id)

if len(targetIdSet) > 0:
    missingAbstracts = list(targetIdSet - searchResultSet)
    missingAbstracts.sort()
    foundIds = list(targetIdSet.intersection(searchResultSet))
    foundIds.sort()
    print len(foundIds), 'abstracts found'
    for pmid in foundIds:
        print pmid

    print len(missingAbstracts), 'Missing abstracts:'
    for pmid in missingAbstracts:
        print pmid

Example #37

0

Show file

File: filterabstracts.py Project: olabknbit/acres

    # Each line is expected to have the form "<pmid>.<extension>"; any other
    # shape raises ValueError from the two-way unpacking, as before.
    for line in file.readlines():
        # Do not unpack into a variable called `xml`: in a shared scope that
        # would rebind the `xml` module name that is used further down for
        # xml.dom.minidom.parse().
        pmid, _extension = line.split('.')
        ignoreSet.add(pmid)

if inputPath[-1] != '/':
    inputPath += '/'
if outputPath[-1] != '/':
    outputPath += '/'

# initialize sentence splitter and tokenizer
sentenceSplitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

cvFinder = costvaluefinder.CostValueFinder()

fileList = glob.glob(inputPath + '*.xml')
for filename in fileList:
    xmldoc = xml.dom.minidom.parse(filename)
    pmidNodes = xmldoc.getElementsByTagName('PMID')
    if len(pmidNodes) > 0:
        pmid = xmlutil.getText(pmidNodes[0])
        if pmid in ignoreSet:
            print pmid, 'already annotated'
        else:
            #        if keepForIschemiaCorpus(xmldoc):
            if keepForDiabetesCorpus(xmldoc):
                # copy abstract
                print 'Copying: ', filename
                shutil.copy(filename, outputPath)

Example #38

0

Show file

File: xmlparser.py Project: sepcon/Scripting

 def _extractMemberInfo(member, xmlMemberdef):
     """Fill in the member-specific fields of *member* from *xmlMemberdef*.

     Delegates the shared fields to XMLParser._extractCodeUnitInfo, then
     sets ``member.isStatic`` (True when the "static" value read through
     xmlutil.getText equals "yes") and ``member.scope`` (the "prot" value).
     *member* must be a Member instance.
     """
     assert isinstance(member, Member)
     XMLParser._extractCodeUnitInfo(member, xmlMemberdef)
     staticText = xmlutil.getText(xmlMemberdef, "static")
     member.isStatic = staticText == "yes"
     member.scope = xmlutil.getText(xmlMemberdef, "prot")

Example #39

0

Show file

File: meshlist.py Project: rlsummerscales/acres

from abstract import Abstract

if len(sys.argv) < 3:
  print "Usage: meshlist.py <INPUT_PATH> <OUTPUT_FILE>"
  print "output list of mesh terms for all abstracts"
  print "in the directory specified by <INPUT_PATH>"
  sys.exit()

  
inputPath = sys.argv[1]
fileList = glob.glob(inputPath+'/*.xml')
meshTerms = set([])

for filename in fileList:
  print filename
  xmldoc = minidom.parse(filename)
  dNodes = xmldoc.getElementsByTagName('DescriptorName')
  for node in dNodes:
    meshTerms.add(xmlutil.getText(node).lower())
#   abs = Abstract(filename)
#   for meshHeading in abs.meshHeadingList:
#     meshTerms.add(meshHeading.descriptorName.name.lower())
    
meshTerms = sorted(list(meshTerms))    
out = open(sys.argv[2], 'w')
for term in meshTerms:
  out.write(term+'\n')
  
out.close()

Example #40

0

Show file

File: getabstracts.py Project: rlsummerscales/acres

else:
    targetIdSet = set([])

searchCmd = eutils + 'esearch.fcgi?db=pubmed'+ searchArgs
print searchCmd
searchResults = urllib2.urlopen(searchCmd)

# parse results
xmldoc = xml.dom.minidom.parseString(searchResults.read())
idNodeList = xmldoc.getElementsByTagName('Id')
print 'Number of documents found =', len(idNodeList)

#idNodeList = open('abs.txt', 'r').readlines()
searchResultSet = set([])
for idNode in idNodeList:
    id = xmlutil.getText(idNode)
    id.strip()
    searchResultSet.add(id)

if len(targetIdSet) > 0:
    missingAbstracts = list(targetIdSet - searchResultSet)
    missingAbstracts.sort()
    foundIds = list(targetIdSet.intersection(searchResultSet))
    foundIds.sort()
    print len(foundIds), 'abstracts found'
    for pmid in foundIds:
        print pmid

    print len(missingAbstracts), 'Missing abstracts:'
    for pmid in missingAbstracts:
        print pmid