def conlist(self):
    """ Useful during evaluation """
    # Serve the memoized result if a previous call already built it
    if self.concepts:
        return self.concepts

    # Start every token off with the default label 'none' (one row per line)
    for sentence in self.data:
        self.concepts.append(['none'] * len(sentence))

    # Overwrite 'none' with the real concept wherever a classification applies
    for concept, char_spans in self.derived_note.getClassificationTuples():
        # Assumption - assumes no clustering third pass
        inds = self.derived_note.getLineIndices()
        sents = self.derived_note.getTokenizedSentences()
        doc_text = self.derived_note.getText()
        for char_span in char_spans:
            lineno, (first, last) = lineno_and_tokspan(inds, sents,
                                                       doc_text, char_span)
            # Label every token of the span (inclusive of both endpoints)
            for tok_ind in range(first, last + 1):
                self.concepts[lineno][tok_ind] = concept

    return self.concepts
def getIOBLabels(self):
    """
    Purpose: return a list of list of IOB labels

    One label ('O', 'B', or 'I') per token, dimensioned 1:1 with self.data.
    Spans are resolved with the "i2b2" format flag.  Raises AssertionError
    if two concept spans overlap on the same token.  Result is memoized in
    self.iob_labels.
    """
    # Only compute if not already memoized
    if self.iob_labels:
        return self.iob_labels

    # Ensure self.data is populated, then default every token to 'O'
    self.getTokenizedSentences()
    iobs = [['O' for tok in sent] for sent in self.data]

    line_inds = self.derived_note.getLineIndices()
    data = self.derived_note.data
    text = self.derived_note.text

    # Add 'B's and 'I's from concept spans
    for concept, char_spans in self.derived_note.getClassificationTuples():
        # Each span (a concept could be a noncontiguous span)
        for span in char_spans:
            lineno, tokspan = lineno_and_tokspan(line_inds, data, text,
                                                 span, "i2b2")
            start, end = tokspan

            # Update concept tokens to 'B's and 'I's; a token that is
            # already labeled would mean two spans overlap, which the
            # sanity asserts disallow
            assert iobs[lineno][start] == 'O'
            iobs[lineno][start] = 'B'
            for i in range(start + 1, end + 1):
                assert iobs[lineno][i] == 'O'
                iobs[lineno][i] = 'I'

    # Memoize for next call
    self.iob_labels = iobs
    return iobs
def getIOBLabels(self):
    """ Purpose: return a list of list of IOB labels """
    # Return the memoized labels when a previous call already built them
    if self.iob_labels:
        return self.iob_labels

    # Make sure self.data exists, then initialize every token as 'O'
    self.getTokenizedSentences()
    labels = []
    for sentence in self.data:
        labels.append(['O'] * len(sentence))

    note = self.derived_note
    inds = note.getLineIndices()
    sents = note.data
    full_text = note.text

    # Mark each concept span: 'B' on its first token, 'I' on the rest
    for concept, char_spans in note.getClassificationTuples():
        for char_span in char_spans:
            lineno, (first, last) = lineno_and_tokspan(inds, sents,
                                                       full_text, char_span)
            labels[lineno][first] = 'B'
            labels[lineno][first + 1:last + 1] = ['I'] * (last - first)

    # Memoize for next call
    self.iob_labels = labels
    return labels
def conlist(self):
    """ Useful during evaluation """
    # Cached for later calls
    if self.concepts:
        return self.concepts

    # Seed every token with the placeholder label 'none'
    for sent in self.data:
        row = []
        for _tok in sent:
            row.append('none')
        self.concepts.append(row)

    # Replace the placeholders using the gold classifications
    for tup in self.derived_note.getClassificationTuples():
        concept, char_spans = tup[0], tup[1]
        # Assumption - assumes no clustering third pass
        line_inds = self.derived_note.getLineIndices()
        sents = self.derived_note.getTokenizedSentences()
        text = self.derived_note.getText()
        for char_span in char_spans:
            lineno, tokspan = lineno_and_tokspan(line_inds, sents,
                                                 text, char_span)
            begin, stop = tokspan
            # Walk the token span (inclusive) and stamp the concept
            position = begin
            while position <= stop:
                self.concepts[lineno][position] = concept
                position += 1

    return self.concepts
def getIOBLabels(self):
    """ Purpose: return a list of list of IOB labels """
    # Short-circuit when the labels were computed on an earlier call
    if self.iob_labels:
        return self.iob_labels

    # Populate self.data, then label all tokens 'O' (outside any concept)
    self.getTokenizedSentences()
    iobs = [list('O' * len(sent)) for sent in self.data]

    line_inds = self.derived_note.getLineIndices()
    tok_sents = self.derived_note.data
    raw_text = self.derived_note.text

    # Stamp 'B'/'I' over every concept span
    for concept, char_spans in self.derived_note.getClassificationTuples():
        for char_span in char_spans:
            lineno, tokspan = lineno_and_tokspan(line_inds, tok_sents,
                                                 raw_text, char_span)
            begin, stop = tokspan
            for idx in range(begin, stop + 1):
                iobs[lineno][idx] = 'B' if idx == begin else 'I'

    # Memoize for next call
    self.iob_labels = iobs
    return iobs
def read_standard(self, txt, con=None):
    """
    Note_xml::read_standard()

    Purpose: Read a tokenized medical record (and optionally its
             standardized concept annotations) into this object.

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the standardized annotated concepts for txt

    Side effects: populates self.text, self.data, self.line_inds and,
    when con is given, self.classifications.
    """
    # Local import keeps the py2-only sorted(cmp=...) call portable
    from functools import cmp_to_key

    start = 0
    end = 0

    with open(txt) as f:
        # Get entire file
        text = f.read()
    self.text = text

    # Split into lines of whitespace-separated tokens.
    # BUGFIX: the original used map(), which returns a one-shot iterator
    # under Python 3 and left self.data exhausted after the loop below.
    self.data = [s.split() for s in text.split('\n')]

    # Keep track of the character-offset span each line occupies
    for sent in self.data:
        # Each token contributes its length plus one separator character
        for word in sent:
            end += len(word) + 1
        self.line_inds.append((start, end - 1))
        start = end

        # Skip ahead to next non-whitespace
        while (start < len(text)) and text[start].isspace():
            start += 1

    # If an accompanying concept file was specified, read it
    if con:
        classifications = []
        with open(con) as f:
            for line in f:
                # Empty line
                if line == '\n':
                    continue

                # Parse concept file line: concept||start||end[||start||end...]
                fields = line.strip().split('||')
                concept = fields[0]
                span_inds = []
                for i in range(1, len(fields), 2):
                    span = int(fields[i]), int(fields[i + 1])
                    span_inds.append(span)

                # FIXME - For now, treat non-contiguous spans as separate
                for span in span_inds:
                    # Add the classification to the Note object
                    l, (start, end) = lineno_and_tokspan(self.line_inds,
                                                         self.data,
                                                         self.text,
                                                         span)
                    classifications.append((concept, l + 1, start, end))

        # Safe guard against concept file having duplicate entries
        classifications = list(set(classifications))

        # Concept file does not guarantee ordering by line number
        # (cmp_to_key keeps the Python 2 comparator working under Python 3)
        self.classifications = sorted(classifications,
                                      key=cmp_to_key(classification_cmp))
def read(self, txt, con=None):
    """
    Purpose: Read a SemEval-formatted medical record (and optionally its
             pipe-delimited concept annotations) into this object.

    @param txt. A file path for the medical record text
    @param con. A file path for the annotated concepts of txt

    Side effects: populates self.fileName, self.text, self.data,
    self.line_inds and, when con is given, self.classifications.
    """
    # Filename
    self.fileName = txt

    start = 0
    end = 0
    with open(txt) as f:
        # Get entire file; non-ascii characters are stripped so the
        # character offsets computed below stay consistent with self.text
        original_text = f.read()
        text = remove_non_ascii(original_text)
        self.text = text

        # Sentence splitter
        # NOTE(review): tokenize() is handed the *path* txt, not the text
        # read above -- presumably it re-reads the file itself; verify.
        sents = self.sent_tokenizer.tokenize(txt, "semeval")

        # Tokenize each sentence into words (and save line number indices)
        toks = []
        gold = []  # Actual lines
        i = 0
        for s in sents:
            i += 1
            gold.append(s)
            b = False  # leftover debug flag; never read

            # Store data
            toks = self.word_tokenizer.tokenize(s, "semeval")
            self.data.append(toks)

            # Keep track of which indices each line has
            end = start + len(s)
            # EQUAL?
            # assert( text[start:end] == s ), 'data and text must agree'
            self.line_inds.append((start, end))
            start = end

            # Skip ahead to next non-whitespace
            while (start < len(text)) and text[start].isspace():
                start += 1

        '''
        for line,inds in zip(gold,self.line_inds):
            print '!!!' + line + '!!!'
            print '\t', 'xx'*10
            print inds
            print '\t', 'xx'*10
            print '!!!' + text[inds[0]: inds[1]] + '!!!'
            print '---'
            print '\n'
            print 'Xx' * 20
        '''

    # If an accompanying concept file was specified, read it
    if con:
        classifications = []
        with open(con) as f:
            for line in f:
                # Empty line
                if line == '\n':
                    continue

                # Parse concept file line (fields are pipe-delimited;
                # field 1 holds comma-separated "start-end" char spans)
                fields = line.strip().split('|')
                cui = fields[2]
                span_inds = []
                spans = fields[1]
                spans = spans.split(',')
                spans = [s.split('-') for s in spans]
                for span in spans:
                    span = int(span[0]), int(span[1])
                    span_inds.append(span)

                #for i in range(3,len(fields),2):
                #    span = int(fields[i]), int(fields[i+1])
                #    span_inds.append( span )

                # Everything is a Disease_Disorder
                concept = 'problem'
                classifications.append((concept, span_inds))

        # Safe guard against concept file having duplicate entries
        classifications = sorted(classifications, cmp=concept_cmp)

        # Hack: Throw away noncontiguous spans that cross line numbers
        newClassifications = []
        for classification in classifications:
            concept, char_spans = classification

            # Each span (could be noncontiguous span)
            tok_spans = []
            first_lineno = None
            ignore = False
            for span in char_spans:
                # character offset span --> lineno and list of token index spans
                lineno, tokspan = lineno_and_tokspan(
                    self.line_inds, self.data, self.text, span, "semeval")
                tok_spans.append(tokspan)

                # Ensure all noncontig spans are together on one line
                if first_lineno == None:
                    first_lineno = lineno

                # Throw away noncontig spans that cross lines
                if lineno != first_lineno:
                    ignore = True

            if not ignore:
                newClassifications.append(classification)

        # Copy changes over
        classifications = newClassifications

        # Hack: Throw away subsumed spans
        # ex. "left and right atrial dilitation" from 02136-017465.text
        classifs = reduce(lambda a, b: a + b,
                          map(lambda t: t[1], classifications))
        classifs = list(set(classifs))
        classifs = sorted(classifs, key=lambda s: s[0])

        from utilities_for_notes import span_relationship
        newClassifications = []
        for c in classifications:
            ignore = False
            for span in c[1]:
                # Slow!  O(spans * candidates) pairwise comparison.
                # Determine if any part of span is subsumed by other span
                for cand in classifs:
                    # Don't let identity spans mess up comparison
                    if span == cand:
                        continue

                    # Is current span subsumed?
                    rel = span_relationship(span, cand)
                    if rel == 'subsumes':
                        ignore = True

            # Only add if no spans are subsumed by others
            if not ignore:
                newClassifications.append(c)

        self.classifications = newClassifications
def get_disjoint_IOBLabels(self):
    """
    Purpose: return a list of list of IOB labels

    Same contract as getIOBLabels(), but character spans are resolved
    with the "semeval" format flag.  One label ('O', 'B', or 'I') per
    token, dimensioned 1:1 with self.data; the result is memoized in
    self.iob_labels.
    """
    # Only compute if not already memoized
    if self.iob_labels:
        return self.iob_labels

    # Build list of proper dimensions (1:1 with self.data)
    self.getTokenizedSentences()
    iobs = [['O' for tok in sent] for sent in self.data]

    line_inds = self.derived_note.getLineIndices()
    data = self.derived_note.data
    text = self.derived_note.text

    # Add 'B's and 'I's from concept spans
    for concept, char_spans in self.derived_note.getClassificationTuples():
        # Each span (could be noncontiguous span)
        for span in char_spans:
            lineno, tokspan = lineno_and_tokspan(line_inds, data, text,
                                                 span, "semeval")
            start, end = tokspan

            # Update concept tokens to 'B's and 'I's
            iobs[lineno][start] = 'B'
            for i in range(start + 1, end + 1):
                iobs[lineno][i] = 'I'

    # Memoize for next call
    self.iob_labels = iobs
    return iobs
def getNonContiguousSpans(self):
    """
    Purpose: Return a list of classification tuples (contains noncontig info)

    How to Use: return value is a list of (concept,lineno,chunk_inds)
        *** where chunk_inds is a list of indices of chunks that
              belong in the same noncontiguous span                ***
    """
    j = 0
    data = self.derived_note.data

    # Create mapping from token indices to chunk indices.
    # A "chunk" here is either a single 'O' token or a whole 'B'+'I'*
    # phrase, counted left to right; 'I' tokens never start a new chunk.
    tok_to_chunk_index_map = []
    for iobs in self.getIOBLabels():
        # One line of chunked phrases
        line = {}
        seen_chunks = 0

        # Word-by-word grouping
        for i, iob in enumerate(iobs):
            if iob == 'O':
                seen_chunks += 1
            if iob == 'B':
                # Map the phrase's first token index to its chunk index
                line[i] = seen_chunks
                seen_chunks += 1
                j += 1  # NOTE(review): j is never read; leftover counter
        tok_to_chunk_index_map.append(line)

    # Return value
    tok_classifications = []

    # Used for converting character offset -> token index
    self.getTokenizedSentences()
    line_inds = self.derived_note.getLineIndices()
    data = self.derived_note.data
    text = self.derived_note.text

    # For each concept instance
    for classification in self.derived_note.getClassificationTuples():
        concept, char_spans = classification

        # Each span (could be noncontiguous span)
        tok_spans = []
        first_lineno = None
        for span in char_spans:
            # character offset span --> lineno and list of token index spans
            lineno, tokspan = lineno_and_tokspan(line_inds, data, text,
                                                 span, self.format)
            tok_spans.append(tokspan)

            # Ensure all noncontig spans are together on one line
            if first_lineno == None:
                first_lineno = lineno
            assert (lineno == first_lineno)

        # list of token index spans --> list of chunk indices
        chunk_inds = []
        for span in tok_spans:
            ind = tok_to_chunk_index_map[lineno][span[0]]
            chunk_inds.append(ind)

        tok_classifications.append((concept, lineno, chunk_inds))

    return tok_classifications
def write(self, labels=None):
    """
    Note_i2b2::write()

    Purpose: Return the given concept label predictions in i2b2 format

    @param labels. A list of classifications
    @return        A string of i2b2-concept-file-formatted data

    Raises Exception when neither labels nor self.classifications is
    available, or when a 'none' classification is encountered.
    """
    # Local import keeps the py2-only sorted(cmp=...) call portable
    from functools import cmp_to_key

    # Return value
    retStr = ''

    # List of list of words (line-by-line)
    tlist = self.data

    # If given labels to write, use them. Default to self.classifications
    if labels != None:
        classifications = labels
    elif self.classifications != None:
        # Translate character spans into (concept, lineno, start, end)
        line_inds = self.line_inds
        data = self.data
        text = self.text
        classifications = []
        for concept, char_span in self.classifications:
            lineno, tokspan = lineno_and_tokspan(line_inds, data, text,
                                                 char_span)
            classifications.append((concept, lineno + 1,
                                    tokspan[0], tokspan[1]))
        # (cmp_to_key keeps the Python 2 comparator usable under Python 3)
        classifications = sorted(classifications,
                                 key=cmp_to_key(classification_cmp))
    else:
        raise Exception('Cannot write concept file: must specify labels')

    # For each classification
    for classification in classifications:
        # Ensure 'none' classifications are skipped
        if classification[0] == 'none':
            # BUGFIX: the original raised a bare string, which is itself a
            # TypeError at runtime; raise a real exception instead
            raise Exception('Classification label "none" should never happen')

        concept = classification[0]
        lineno = classification[1]
        start = classification[2]
        end = classification[3]

        # A list of words (corresponding line from the text file);
        # kept because the index also acts as a bounds check on lineno
        text = tlist[lineno - 1]

        # Find the character span that the concept refers to
        span = lno_and_tokspan__to__char_span(self.line_inds, self.data,
                                              self.text, lineno - 1,
                                              (start, end))
        span_start, span_end = span
        datum = self.text[span_start:span_end].replace('\n', '\t')

        # Classification
        label = concept

        # Print format (very similar to i2b2)
        retStr += "c=\"%s\" %d %d||t=\"%s\"\n" % (datum, span_start,
                                                  span_end, label)

    # return formatted data
    return retStr.strip()