def parseXML(self, tNode, index, sentence):
    """Populate this token from an xml token element.

    tNode    -- xml token element
    index    -- the index of the element in the sentence (0 indexed)
    sentence -- the Sentence object containing this token
    """
    self.sentence = sentence
    self.index = index
    self.text = xmlutil.normalizeText(tNode.getAttribute('text'))
    # Lowercase the first word of the sentence when it is capitalized and
    # not part of an acronym (i.e. second letter, if any, is lowercase).
    # Length guard added: the original indexed text[0] unconditionally.
    if (self.index == 0 and len(self.text) > 0
            and 'A' <= self.text[0] <= 'Z'
            and (len(self.text) == 1 or 'a' <= self.text[1] <= 'z')):
        self.text = self.text.lower()

    self.lemma = xmlutil.normalizeText(tNode.getAttribute('lemma'))
    if len(self.lemma) == 0:
        self.lemma = self.text

    self.pos = tNode.getAttribute('pos')
    if self.pos is None:
        self.pos = ''

    dNodes = tNode.getElementsByTagName('dep')
    self.dependents = parsetree.DependencyList(dNodes)
    gNodes = tNode.getElementsByTagName('gov')
    self.governors = parsetree.DependencyList(gNodes)
    # Drop malformed self-loop dependencies (a token governing itself).
    # Iterate over a snapshot: removing from the sequence being iterated
    # silently skips the element after each removal.
    for gov in list(self.governors):
        if gov.index == self.index:
            self.governors.remove(gov)

    aNodes = tNode.getElementsByTagName('annotation')
    self.annotations = AnnotationList(aNodes)
    lNodes = tNode.getElementsByTagName('label')
    self.labels = AnnotationList(lNodes)

    sNodes = tNode.getElementsByTagName('semantic')
    for node in sNodes:
        self.semanticTags.add(xmlutil.getText(node))

    uNodes = tNode.getElementsByTagName('umls')
    for node in uNodes:
        self.umlsConcepts.append(umlsconcept.UMLSConcept(node))
def removeAllLabels(self, labelList=None):
    """Remove ALL labels assigned by a classifier.

    If given a non-empty list of labels, remove only those labels that
    appear on the list. labelList defaults to None (the original used a
    mutable default []) so no default object is shared across calls;
    passing None or [] behaves identically to the original default.
    """
    if not labelList:
        self.labels = AnnotationList()
    else:
        for label in labelList:
            self.removeLabel(label)
class Token:
    """All relevant information about a single token: its text, lemma,
    part-of-speech tag, dependency links, corpus annotations, classifier
    labels, semantic tags, and UMLS concepts."""

    # Class-level declarations document the per-instance fields; each one
    # is re-assigned in __init__ (or parseXML), so instances do not share
    # the mutable defaults below.
    text = ''                 # the token itself
    pos = ''                  # part of speech tag
    lemma = ''                # common lemma for the word
    __units = ''              # cached measurement/currency units (see getUnits)
    index = -1                # position of token in sentence (0-indexed)
    governors = None          # dependencies where this token is the dependent
    dependents = None         # dependencies where this token is the governor
    __visited = False         # has the node been officially "visited"
    __discovered = False      # has the node been discovered yet?
    parent = False            # parent on shortest path to a node in dependency graph
    features = {}
    allFeatures = {}
    semanticTags = None
    annotations = None        # annotations from original corpus (ground truth)
    labels = None             # annotations detected by later systems
    specialValueType = None   # token is a number of a special type that we do
                              # not classify, e.g. HR, ARR, NNT, CI
    sentence = None
    parseTreeNode = None
    simplifiedTreeNode = None
    umlsChunks = None
    umlsConcepts = None

    # an acronym is all caps, optionally with digits after the first letter
    acronymPattern = re.compile(r'[A-Z]+[A-Z0-9]*$')

    # maps special-token annotation names to human-readable display text
    specialTokenAnnotations = {
        'itt_analysis': 'intention to treat analysis',
        'per_protocol_analysis': 'per protocol analysis',
        '95_confidence_interval': '95% confidence interval',
        'odds_ratio': 'odds ratio',
        'hazard_ratio': 'hazard ratio',
        'absolute_risk_reduction': 'absolute risk reduction',
        'absolute_risk_increase': 'absolute risk increase',
        'number_needed_to_treat': 'number needed to treat',
        'number_needed_to_harm': 'number needed to harm',
        'relative_risk_reduction': 'relative risk reduction',
        'relative_risk_increase': 'relative risk increase',
        'relative_risk': 'relative risk',
    }

    # maps value acronyms to the corresponding annotation names
    valueAcronyms = {
        'OR': 'odds_ratio',
        'HR': 'hazard_ratio',
        'ARR': 'absolute_risk_reduction',
        'ARI': 'absolute_risk_increase',
        'NNT': 'number_needed_to_treat',
        'NNH': 'number_needed_to_harm',
        'RRR': 'relative_risk_reduction',
        'RRI': 'relative_risk_increase',
        'RR': 'relative_risk',
        'CI': '95_confidence_interval',
        'SD': 'standard_deviation',
    }

    symbolTokens = {'~', '`', '!', '@', '#', '$', '%', '^', '&', '*',
                    '-LRB-', '-RRB-', '-', '+', '=', '<', '>', 'plus_minus',
                    ',', '.', '/', '?', "'", ':', ';'}
    currencyWordSet = {"pound", "dollar", "euro", "$"}
    # wrapped in set(): nltk returns a list, but this is only ever used for
    # membership tests (isStopWord), so O(1) lookup is free to take
    stopWordSet = set(nltk.corpus.stopwords.words('english'))
    negationWordSet = {'not', 'no', 'without', 'never', 'neither', 'none', 'non'}
    nullNumberWords = {'none', 'no'}
    timeFrequencyWords = {'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'diem'}

    def __init__(self, text='', lemma=None, pos=None):
        """Create a new token.

        text  -- surface string for the token
        lemma -- lemma for the word (defaults to text)
        pos   -- part-of-speech tag (defaults to '')
        """
        self.text = text
        self.lemma = text if lemma is None else lemma
        self.pos = '' if pos is None else pos
        self.__units = ''
        self.features = {}
        self.allFeatures = {}
        self.topKLabels = {}
        self.index = None
        self.semanticTags = set()
        self.umlsChunks = []
        self.umlsConcepts = []
        self.parseTreeNode = None
        self.sentence = None
        self.simplifiedTreeNode = None
        self.__visited = False
        self.parent = None
        self.__discovered = False
        self.specialValueType = None
        self.dependents = None
        self.governors = None
        # NOTE(review): these start out as sets but parseXML/removeAllLabels
        # replace them with AnnotationList objects; both support the
        # membership/iteration operations used elsewhere in this class.
        self.annotations = set()
        self.labels = set()

    def parseXML(self, tNode, index, sentence):
        """Populate this token from an xml token element.

        tNode    -- xml token element
        index    -- the index of the element in the sentence (0 indexed)
        sentence -- the Sentence object containing this token
        """
        self.sentence = sentence
        self.index = index
        self.text = xmlutil.normalizeText(tNode.getAttribute('text'))
        # Lowercase the first word of the sentence when it is capitalized
        # and not part of an acronym (second letter, if any, is lowercase).
        # Length guard added: the original indexed text[0] unconditionally.
        if (self.index == 0 and len(self.text) > 0
                and 'A' <= self.text[0] <= 'Z'
                and (len(self.text) == 1 or 'a' <= self.text[1] <= 'z')):
            self.text = self.text.lower()

        self.lemma = xmlutil.normalizeText(tNode.getAttribute('lemma'))
        if len(self.lemma) == 0:
            self.lemma = self.text

        self.pos = tNode.getAttribute('pos')
        if self.pos is None:
            self.pos = ''

        dNodes = tNode.getElementsByTagName('dep')
        self.dependents = parsetree.DependencyList(dNodes)
        gNodes = tNode.getElementsByTagName('gov')
        self.governors = parsetree.DependencyList(gNodes)
        # Drop malformed self-loop dependencies (a token governing itself).
        # Iterate over a snapshot: removing from the sequence being
        # iterated silently skips the element after each removal.
        for gov in list(self.governors):
            if gov.index == self.index:
                self.governors.remove(gov)

        aNodes = tNode.getElementsByTagName('annotation')
        self.annotations = AnnotationList(aNodes)
        lNodes = tNode.getElementsByTagName('label')
        self.labels = AnnotationList(lNodes)

        sNodes = tNode.getElementsByTagName('semantic')
        for node in sNodes:
            self.semanticTags.add(xmlutil.getText(node))

        uNodes = tNode.getElementsByTagName('umls')
        for node in uNodes:
            self.umlsConcepts.append(umlsconcept.UMLSConcept(node))

    def isRoot(self):
        """Return True if this token has a dependency from the root."""
        # a token with dependents but no governors is treated as the root
        if len(self.governors) == 0 and len(self.dependents) > 0:
            return True
        for gov in self.governors:
            if gov.isRoot():
                return True
        return False

    def listOfNextTokens(self, nTokens):
        """Return the list of nTokens tokens following this token, or []
        if that many tokens do not exist."""
        startIndex = self.index + 1
        stopIndex = startIndex + nTokens
        # stopIndex is an exclusive slice bound, so stopIndex == len(sentence)
        # is valid; the original used '<', which wrongly rejected a request
        # for tokens reaching exactly the end of the sentence.
        if nTokens > 0 and startIndex > 0 and stopIndex <= len(self.sentence):
            return self.sentence.tokens[startIndex:stopIndex]
        return []

    def listOfPreviousTokens(self, nTokens):
        """Return the list of nTokens tokens before this token, or [] if
        that many tokens do not exist."""
        startIndex = self.index - nTokens
        stopIndex = self.index
        if nTokens > 0 and startIndex >= 0 and stopIndex < len(self.sentence):
            return self.sentence.tokens[startIndex:stopIndex]
        return []

    def nextToken(self):
        """Return the next token in the sentence, or None if this is the
        last token in the sentence."""
        nextIndex = self.index + 1
        if nextIndex < len(self.sentence):
            return self.sentence[nextIndex]
        return None

    def previousToken(self):
        """Return the previous token in the sentence, or None if this is
        the first token in the sentence."""
        prevIndex = self.index - 1
        if prevIndex >= 0:
            return self.sentence[prevIndex]
        return None

    def hasAnnotation(self, name):
        """Return True if the token has the given annotation."""
        return name in self.annotations

    def getAnnotationAttribute(self, name, attrib):
        """Return the attribute value for a given annotation, or '' if the
        token does not have the annotation or attribute.

        name   -- name of the annotation
        attrib -- name of the attribute
        """
        annotation = self.annotations.get(name)
        if annotation is not None:
            return annotation.getAttributeValue(attrib)
        return ''

    def getLabelAttribute(self, name, attrib):
        """Return the attribute value for a given label, or '' if the
        token does not have the label or attribute.

        name   -- name of the label
        attrib -- name of the attribute
        """
        label = self.labels.get(name)
        if label is not None:
            return label.getAttributeValue(attrib)
        return ''

    def hasLabel(self, name, mode='test'):
        """Return True if the token has the given label.

        In 'train' mode, check the ground-truth annotations instead of the
        classifier-assigned labels. (default mode is 'test')
        """
        if mode == 'train':
            return self.hasAnnotation(name)
        return name in self.labels

    def getAnnotationMatches(self, nameSet):
        """Return the set of annotations matching a name in nameSet."""
        return {name for name in nameSet if self.hasAnnotation(name)}

    def getLabelMatches(self, nameSet):
        """Return the set of labels matching a name in nameSet."""
        return {name for name in nameSet if self.hasLabel(name)}

    def addLabel(self, name):
        """Add a new label (assigned by a classifier)."""
        self.labels.add(Annotation(name))

    def setLabelAttribute(self, name, attrib, value):
        """Set an attribute value on a given label, creating the label
        first if it does not exist.

        name   -- name of label
        attrib -- name of attribute
        value  -- value of attribute
        """
        label = self.labels.get(name)
        if label is None:
            self.addLabel(name)
            label = self.labels.get(name)
        label.setAttributeValue(attrib, value)

    def addAnnotation(self, name):
        """Add a new annotation (treated as ground truth; use wisely).
        Does nothing if the annotation already exists."""
        if not self.hasAnnotation(name):
            self.annotations.add(Annotation(name))

    def setAnnotationAttribute(self, name, attrib, value):
        """Set an attribute value on a given annotation (treated as ground
        truth; use wisely), creating the annotation first if needed.

        name   -- name of annotation
        attrib -- name of attribute
        value  -- value of attribute
        """
        annotation = self.annotations.get(name)
        if annotation is None:
            self.addAnnotation(name)
            annotation = self.annotations.get(name)
        annotation.setAttributeValue(attrib, value)

    def copyAnnotation(self, token, name):
        """Copy the named annotation from the given token, unless this
        token already has it or the other token lacks it."""
        if token.hasAnnotation(name) and not self.hasAnnotation(name):
            newAnnotation = Annotation(name)
            newAnnotation.copy(token.annotations.get(name))
            self.annotations.add(newAnnotation)

    def convertLabelToAnnotation(self, name):
        """Promote the named label to a ground-truth annotation, then
        remove it from the label list."""
        if name in self.labels:
            self.annotations.add(self.labels.get(name))
            self.labels.remove(name)

    def removeAnnotation(self, name):
        """Remove an annotation that was manually assigned."""
        if name in self.annotations:
            self.annotations.remove(name)

    def removeLabel(self, name):
        """Remove a label assigned by a classifier."""
        if name in self.labels:
            self.labels.remove(name)

    def removeAllLabels(self, labelList=None):
        """Remove ALL labels assigned by a classifier.

        If given a non-empty list of labels, remove only those labels that
        appear on the list. labelList defaults to None (the original used
        a mutable default []) so no default object is shared across calls;
        passing None or [] behaves identically to the original default.
        """
        if not labelList:
            self.labels = AnnotationList()
        else:
            for label in labelList:
                self.removeLabel(label)

    def addSemanticTag(self, tag):
        """Add a new semantic tag to this token."""
        self.semanticTags.add(tag)

    def hasSemanticTag(self, tag):
        """Return True if the token has the given semantic tag."""
        return tag in self.semanticTags

    def getSemanticTagMatches(self, tagList):
        """Return the list of tags from tagList that this token has."""
        return [tag for tag in tagList if self.hasSemanticTag(tag)]

    def getFeatureSet(self):
        """Return the set of all features across all feature groups."""
        features = set()
        for featureSet in self.features.values():
            features.update(featureSet)
        return features

    def filterFeatures(self, targetFeatures):
        """Keep only features that also appear in targetFeatures.

        The full feature map is saved in allFeatures so restoreFeatures
        can undo the filtering.
        """
        self.allFeatures = self.features
        newFeatureSet = self.getFeatureSet().intersection(targetFeatures)
        if len(newFeatureSet) == 0:
            # keep a placeholder so the token is never featureless
            newFeatureSet.add('NO_FEATURES')
        self.features = {'filtered': newFeatureSet}

    def restoreFeatures(self):
        """Restore all features, undoing any filterFeatures call."""
        self.features = self.allFeatures

    def getAbstract(self):
        """Return the abstract that contains this token, or None."""
        if self.sentence is not None:
            return self.sentence.abstract
        return None

    def getAcronymExpansion(self):
        """If this token is an acronym with a known expansion, return the
        list of tokens in its expansion; otherwise return []."""
        abstract = self.getAbstract()
        if (abstract is not None and self.isAcronym()
                and self.text in abstract.acronyms):
            return abstract.acronyms[self.text]
        return []

    def getTerm(self):
        """Return the base term used for this token in n-gram counts."""
        return self.lemma

    def getFeatureText(self):
        """Return the text string used for generating features: numbers
        collapse to one placeholder, other tokens use their lemma."""
        if self.isNumber():
            return 'numeric_value'
        return self.lemma

    def getSpecialTokenAnnotation(self):
        """Return the special-token annotation for this token, or None if
        it does not have one."""
        if not self.isSpecialToken():
            return None
        for annotation in self.specialTokenAnnotations:
            if self.hasAnnotation(annotation):
                return annotation
        return None

    def getDisplayText(self, capitalizeFirstLetter):
        """Return the version of the token text meant for human reading."""
        if self.text == '-LRB-':
            return '('
        if self.text == '-RRB-':
            return ')'
        if self.text == 'plus_minus':
            return '±'
        if self.text == '-EOS-':
            return '.'
        annotation = self.getSpecialTokenAnnotation()
        if annotation is not None:
            return self.specialTokenAnnotations[annotation]
        if self.hasAnnotation('percentage'):
            return self.text + '%'
        if capitalizeFirstLetter and len(self.text) > 0:
            return self.text[0].upper() + self.text[1:]
        return self.text

    def getXML(self, doc):
        """Return an xml element containing all token information."""
        node = doc.createElement('token')
        node.setAttribute('id', str(self.index))
        node.setAttribute('text', self.text)
        # only write lemma/pos when they carry information
        if len(self.lemma) > 0 and self.lemma != self.text:
            node.setAttribute('lemma', self.lemma)
        if len(self.pos) > 0:
            node.setAttribute('pos', self.pos)
        for gov in self.governors:
            node.appendChild(gov.getXML(doc, 'gov'))
        for dep in self.dependents:
            node.appendChild(dep.getXML(doc, 'dep'))
        for tag in self.semanticTags:
            node.appendChild(xmlutil.createNodeWithTextChild(doc, 'semantic', tag))
        for annotation in self.annotations:
            node.appendChild(annotation.getXML(doc, 'annotation'))
        for label in self.labels:
            node.appendChild(label.getXML(doc, 'label'))
        for uc in self.umlsConcepts:
            node.appendChild(uc.getXML(doc))
        return node

    def isImportantInteger(self):
        """Return True if the token is an integer that could be an outcome
        number or group size."""
        return (self.specialValueType is None and self.isInteger()
                and self.getValue() >= 0 and not self.isPercentage())

    def isImportantNumber(self):
        """Return True if the token is a number that could be an outcome
        number, group size, or event rate."""
        return (self.isNumber() and self.specialValueType is None
                and self.getValue() >= 0)

    def isInteger(self):
        """Return True if the token is a non-negative integer or a null
        number word ('none'/'no')."""
        return self.text.isdigit() or self.isNullNumberWord()

    def isNumber(self):
        """Return True if the token is a number (integer or floating
        point) or a null number word."""
        if self.isNullNumberWord():
            return True
        try:
            # NOTE: float() also accepts 'inf'/'nan' spellings; such
            # tokens are unlikely in this corpus. ValueError is the only
            # error float() raises for a str argument (the original used a
            # bare except, which also hid unrelated failures).
            float(self.text)
            return True
        except ValueError:
            return False

    def isNullNumberWord(self):
        """Return True for words sometimes interpreted as zero
        ('none', 'no')."""
        return self.text.lower() in self.nullNumberWords

    def isPercentage(self):
        """Return True if the token is a percentage number (e.g. 50%)."""
        return self.hasAnnotation('percentage')

    def isSymbol(self):
        """Return True if the token is a symbol token (e.g. !, @, /, <, >)."""
        return self.text in self.symbolTokens

    def isAcronym(self):
        """Return True if the token looks like an acronym (at least two
        characters, all caps with optional digits)."""
        return len(self.text) > 1 and self.acronymPattern.match(self.text) is not None

    def isLocation(self):
        """Return True if the token is a location word."""
        if self.text == 'USA' or self.text == 'UK':
            return True
        # capitalized non-acronym word mapped to a UMLS geographic area;
        # length guard added: the original indexed text[0] unconditionally
        if (len(self.text) > 0 and 'A' <= self.text[0] <= 'Z'
                and not self.isAcronym()):
            for uc in self.umlsConcepts:
                if 'geoa' in uc.types:
                    return True
        return False

    def isStopWord(self):
        """Return True if the token is in the stop word list."""
        return self.text in self.stopWordSet or self.text == 'versus'

    def isSpecialToken(self):
        """Return True if the token is a special placeholder added during
        the preprocessing phase."""
        return self.text == 'SPECIAL_TERM'

    def isSpecialValueTerm(self):
        """Return True if this token is a term referring to a type of
        value found in medical research papers (e.g. NNT, confidence
        interval)."""
        return ((self.isSpecialToken()
                 and not self.hasAnnotation('itt_analysis')
                 and not self.hasAnnotation('per_protocol_analysis'))
                or self.text in self.valueAcronyms)

    def getSpecialValueAnnotation(self):
        """If this token refers to a special value, return the
        corresponding annotation name; otherwise return None."""
        if not self.isSpecialValueTerm():
            return None
        if self.isSpecialToken():
            return self.getSpecialTokenAnnotation()
        if self.text in self.valueAcronyms:
            return self.valueAcronyms[self.text]
        return None

    def isTimeWord(self):
        """Return True if this token is a common time word."""
        return 'time' in self.semanticTags

    def isTimeUnitWord(self):
        """Return True for time UNITS (e.g. seconds, minutes, yrs),
        excluding frequency words (daily, monthly, yearly)."""
        return ('time' in self.semanticTags
                and self.text.lower() not in self.timeFrequencyWords)

    def isMeasurementWord(self):
        """Return True if this token is a common unit of measurement, or
        a slash-separated ratio of units (e.g. mg/ml)."""
        if 'measurement' in self.semanticTags:
            return True
        if '/' in self.text:
            # assume letter-initial parts on each side of '/' are units
            for part in self.text.split('/'):
                part = part.lower()
                if len(part) == 0 or not ('a' <= part[0] <= 'z'):
                    return False
            return True
        return False

    def isCurrencyWord(self):
        """Return True if this token names a common currency
        (e.g. dollars, pounds, euros)."""
        return self.lemma in self.currencyWordSet

    def isLeftParenthesis(self):
        """Return True if this token is a left parenthesis character."""
        return self.text == '-LRB-'

    def isRightParenthesis(self):
        """Return True if this token is a right parenthesis character."""
        return self.text == '-RRB-'

    def getUnits(self):
        """If this token is a measurement, return its units (cached after
        first lookup), otherwise ''. For now, assume an adjacent token
        holds the units."""
        # '!=' replaces the original "is not ''" identity test, which only
        # worked by accident of CPython small-string interning
        if self.__units != '':
            return self.__units
        if self.isNumber():
            nextToken = self.nextToken()
            prevToken = self.previousToken()
            if nextToken is not None and (nextToken.isMeasurementWord()
                                          or nextToken.isCurrencyWord()):
                self.__units = nextToken.text
            elif prevToken is not None and prevToken.isCurrencyWord():
                self.__units = prevToken.text
            if self.__units == '$':
                self.__units = 'dollars'
        return self.__units

    def isGroupWord(self):
        """Return True if this token is a common group word."""
        return 'group' in self.semanticTags

    def isOutcomeWord(self):
        """Return True if this token is a common outcome word."""
        return 'outcome' in self.semanticTags

    def isNegationWord(self):
        """Return True if this token is a common negation word."""
        return self.text in self.negationWordSet

    def isValueAcronym(self):
        """Return True if this token is a common value acronym
        (e.g. ARR, NNT, HR)."""
        return self.text in self.valueAcronyms

    def isVerb(self):
        """Return True if this token is tagged as a verb."""
        return self.pos[0:2] == 'VB'

    def getValue(self):
        """Return the numeric value for this token, or None if it is not
        a number. Null number words ('none', 'no') count as 0."""
        if self.isNullNumberWord():
            return 0
        if self.isInteger():
            return int(self.text)
        if self.isNumber():
            return float(self.text)
        return None

    def visit(self):
        """Mark this token as visited (and therefore discovered)."""
        self.__visited = True
        self.__discovered = True

    def unvisit(self):
        """Mark token as unvisited and clear its search parent."""
        self.__visited = False
        self.__discovered = False
        self.parent = None

    def discover(self):
        """Mark token as discovered, but unvisited."""
        self.__visited = False
        self.__discovered = True

    def isVisited(self):
        """Return True if the token has been visited."""
        # BUG FIX: the original returned self.__visited() -- calling a
        # bool, which raised TypeError whenever this method ran.
        return self.__visited

    def isDiscovered(self):
        """Return True if the token has been discovered."""
        return self.__discovered

    def pathToRoot(self):
        """Return the '<-' separated chain of token texts from the search
        root down to this token."""
        if self.parent is None:
            return self.text
        return self.parent.token.pathToRoot() + '<-' + self.text

    def pathToVerb(self, includeSpecific=True):
        """Find the path from this word to the nearest verb, following
        governors; return the string of dependency relationships leading
        to the verb."""
        if self.parent is None and not self.isVerb():
            return 'ROOT<-' + self.lemma
        if self.isVerb():
            return 'VB_' + self.lemma
        if (includeSpecific and self.parent.specific is not None
                and len(self.parent.specific) > 0):
            parentType = self.parent.type + '_' + self.parent.specific
        else:
            parentType = self.parent.type
        # propagate includeSpecific: the original recursed with the
        # default, so includeSpecific=False only affected the first hop
        return self.parent.token.pathToVerb(includeSpecific) + '<-' + parentType

    def parentVerb(self, includeSpecific=True):
        """Follow governors from this word and return the nearest verb
        token, or None if the path ends at the root without a verb.
        (includeSpecific is accepted for symmetry with pathToVerb but is
        not used.)"""
        if self.parent is None and not self.isVerb():
            return None
        if self.isVerb():
            return self
        return self.parent.token.parentVerb(includeSpecific)