Example #1
    def conlist(self):
        """
        Return a list of concept-label lists (one label per token).
        Useful during evaluation.
        """

        # Cached from an earlier call
        if self.concepts: return self.concepts

        # For each word, store a corresponding concept label
        # Initially, all labels are 'none'
        for line in self.data:
            self.concepts.append(['none' for word in line])

        # Lookup tables for mapping character spans to token spans
        # (Assumption - assumes no clustering third pass)
        line_inds = self.derived_note.getLineIndices()
        data      = self.derived_note.getTokenizedSentences()
        text      = self.derived_note.getText()

        # Use the classifications to correct all mislabeled 'none's
        for classification in self.derived_note.getClassificationTuples():
            concept    = classification[0]
            char_spans = classification[1]

            for span in char_spans:
                lineno, tokspan = lineno_and_tokspan(line_inds, data, text,
                                                     span)
                start, end = tokspan

                # Label every token covered by the (inclusive) token span
                for i in range(start, end + 1):
                    self.concepts[lineno][i] = concept

        return self.concepts
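
Every example on this page leans on lineno_and_tokspan(), which comes from the project's utilities and is not shown here. As a rough, hypothetical sketch only (assuming whitespace tokenization and inclusive character offsets, and ignoring the real helper's format-specific handling), the mapping it performs could look like this:

# Toy stand-in for lineno_and_tokspan(); illustrative only.
# span is an inclusive (start, end) pair of character offsets.
def toy_lineno_and_tokspan(line_inds, data, text, span):
    span_start, span_end = span

    # Find the line whose character range contains the span's start
    for lineno, (line_start, line_end) in enumerate(line_inds):
        if line_start <= span_start <= line_end:
            break

    # Walk the line's tokens, tracking each token's character offsets
    tok_start = None
    offset = line_start
    for i, tok in enumerate(data[lineno]):
        offset = text.index(tok, offset)
        if offset <= span_start < offset + len(tok):
            tok_start = i
        if offset <= span_end < offset + len(tok):
            return lineno, (tok_start, i)
        offset += len(tok)
    return lineno, (tok_start, len(data[lineno]) - 1)

text = 'Patient denies chest pain .\nNo acute distress .'
data = [s.split() for s in text.split('\n')]
line_inds = [(0, 26), (28, 46)]
print(toy_lineno_and_tokspan(line_inds, data, text, (15, 24)))  # -> (0, (2, 3))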
Example #2
    def getIOBLabels(self):
        """
        Purpose: return a list of lists of IOB labels
        """

        # Only compute if not already memoized
        if self.iob_labels: return self.iob_labels

        # Build list of proper dimensions (1:1 with self.data)
        self.getTokenizedSentences()
        iobs = [ ['O' for tok in sent] for sent in self.data ]

        line_inds = self.derived_note.getLineIndices()
        data = self.derived_note.data
        text = self.derived_note.text

        # Add 'B's and 'I's from concept spans
        for classification in self.derived_note.getClassificationTuples():
            concept, char_spans = classification

            # Each span (could be part of a noncontiguous span)
            for span in char_spans:
                lineno, tokspan = lineno_and_tokspan(line_inds, data, text,
                                                     span, "i2b2")
                start, end = tokspan

                # Update concept tokens to 'B's and 'I's
                # (asserts guard against overlapping concept spans)
                assert iobs[lineno][start] == 'O'
                iobs[lineno][start] = 'B'
                for i in range(start + 1, end + 1):
                    assert iobs[lineno][i] == 'O'
                    iobs[lineno][i] = 'I'

        # Memoize for next call
        self.iob_labels = iobs
        return iobs
Example #3
    def getIOBLabels(self):
        """
        Purpose: return a list of lists of IOB labels
        """

        # Only compute if not already memoized
        if self.iob_labels: return self.iob_labels

        # Build list of proper dimensions (1:1 with self.data)
        self.getTokenizedSentences()
        iobs = [['O' for tok in sent] for sent in self.data]

        line_inds = self.derived_note.getLineIndices()
        data = self.derived_note.data
        text = self.derived_note.text

        # Add 'B's and 'I's from concept spans
        for classification in self.derived_note.getClassificationTuples():
            concept, char_spans = classification

            # Each span (could be part of a noncontiguous span)
            for span in char_spans:
                lineno, tokspan = lineno_and_tokspan(line_inds, data, text,
                                                     span)
                start, end = tokspan

                # Update concept tokens to 'B's and 'I's
                iobs[lineno][start] = 'B'
                for i in range(start + 1, end + 1):
                    iobs[lineno][i] = 'I'

        # Memoize for next call
        self.iob_labels = iobs
        return iobs
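
As a standalone illustration of the labeling step above (made-up sentences and token spans, not output from the class), inclusive token spans turn into IOB labels like this:

# Made-up inputs; spans are (lineno, start, end) with inclusive ends
sentences = [['Patient', 'denies', 'chest', 'pain', '.'],
             ['No', 'acute', 'distress', '.']]
tok_spans = [(0, 2, 3),   # "chest pain"
             (1, 1, 2)]   # "acute distress"

iobs = [['O' for tok in sent] for sent in sentences]
for lineno, start, end in tok_spans:
    iobs[lineno][start] = 'B'            # first token of the concept
    for i in range(start + 1, end + 1):  # remaining tokens, inclusive end
        iobs[lineno][i] = 'I'

print(iobs)  # [['O', 'O', 'B', 'I', 'O'], ['O', 'B', 'I', 'O']]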
Example #4
    def read_standard(self, txt, con=None):

        """
        Note_xml::read_standard()

        @param txt. A file path for the tokenized medical record
        @param con. A file path for the standardized annotated concepts for txt
        """

        start = 0
        end = 0

        with open(txt) as f:

            # Get entire file
            text = f.read()
            self.text = text

            # Split each line into whitespace-separated tokens
            self.data = [s.split() for s in text.split('\n')]

            # Save the character offsets spanned by each line
            for sent in self.data:

                # Keep track of which indices each line has
                for word in sent:
                    end += len(word) + 1

                self.line_inds.append( (start,end-1) )
                start = end

                # Skip ahead to next non-whitespace
                while (start < len(text)) and text[start].isspace(): start += 1


        # If an accompanying concept file was specified, read it
        if con:
            classifications = []
            with open(con) as f:
                for line in f:

                    # Empty line
                    if line == '\n': continue

                    # Parse concept file line
                    fields = line.strip().split('||')
                    concept = fields[0]
                    span_inds = []
                    for i in range(1, len(fields), 2):
                        span = int(fields[i]), int(fields[i+1])
                        span_inds.append(span)

                    # FIXME - For now, treat non-contiguous spans as separate
                    for span in span_inds:
                        # Add the classification to the Note object
                        l, (start, end) = lineno_and_tokspan(self.line_inds,
                                                             self.data,
                                                             self.text, span)
                        classifications.append((concept, l + 1, start, end))

            # Safeguard against concept file having duplicate entries
            classifications = list(set(classifications))

            # Concept file does not guarantee ordering by line number
            self.classifications = sorted(classifications, cmp=classification_cmp)
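
For reference, the concept lines this reader expects are '||'-delimited: a concept label followed by one or more pairs of character offsets. A hypothetical line (values made up) parses as follows:

# Hypothetical line in the '||'-delimited standard concept format
line = 'problem||15||24||30||37'

fields = line.strip().split('||')
concept = fields[0]
span_inds = [(int(fields[i]), int(fields[i + 1]))
             for i in range(1, len(fields), 2)]

print(concept)    # problem
print(span_inds)  # [(15, 24), (30, 37)]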
Example #5
    def read(self, txt, con=None):

        # Filename
        self.fileName = txt

        start = 0
        end = 0
        with open(txt) as f:

            # Get entire file
            original_text = f.read()
            text = remove_non_ascii(original_text)
            self.text = text

            # Sentence splitter

            sents = self.sent_tokenizer.tokenize(txt, "semeval")

            # Tokenize each sentence into words (and save line number indices)
            for s in sents:

                # Store tokenized sentence
                toks = self.word_tokenizer.tokenize(s, "semeval")
                self.data.append(toks)

                # Keep track of which indices each line has
                end = start + len(s)
                self.line_inds.append((start, end))
                start = end

                # Skip ahead to next non-whitespace
                while (start < len(text)) and text[start].isspace():
                    start += 1

        # If an accompanying concept file was specified, read it
        if con:

            classifications = []
            with open(con) as f:
                for line in f:

                    # Empty line
                    if line == '\n': continue

                    # Parse concept file line
                    fields = line.strip().split('|')

                    # CUI identifying the concept (not used below)
                    cui = fields[2]

                    # Spans field holds comma-separated "start-end" pairs
                    span_inds = []
                    spans = fields[1].split(',')
                    spans = [s.split('-') for s in spans]
                    for span in spans:
                        span = int(span[0]), int(span[1])
                        span_inds.append(span)

                    # Everything is a Disease_Disorder
                    concept = 'problem'
                    classifications.append((concept, span_inds))


            # Concept file does not guarantee ordering
            classifications = sorted(classifications, cmp=concept_cmp)

            # Hack: Throw away noncontiguous spans that cross line numbers
            newClassifications = []

            for classification in classifications:
                concept, char_spans = classification

                # Each span (could be part of a noncontiguous span)
                tok_spans = []
                first_lineno = None

                ignore = False
                for span in char_spans:
                    # character offset span --> lineno and token index span
                    lineno, tokspan = lineno_and_tokspan(
                        self.line_inds, self.data, self.text, span, "semeval")
                    tok_spans.append(tokspan)

                    # Ensure all noncontig spans are together on one line
                    if first_lineno is None: first_lineno = lineno

                    # Throw away noncontig spans that cross lines
                    if lineno != first_lineno:
                        ignore = True

                if not ignore:
                    newClassifications.append(classification)

            # Copy changes over
            classifications = newClassifications

            # Hack: Throw away subsumed spans
            # ex. "left and right atrial dilitation" from 02136-017465.text
            classifs = reduce(lambda a, b: a + b,
                              map(lambda t: t[1], classifications))
            classifs = list(set(classifs))
            classifs = sorted(classifs, key=lambda s: s[0])

            from utilities_for_notes import span_relationship

            newClassifications = []
            for c in classifications:

                ignore = False
                for span in c[1]:

                    # Slow!
                    # Determine if any part of span is subsumed by another span
                    for cand in classifs:
                        # Don't let identity spans mess up comparison
                        if span == cand: continue

                        # Is current span subsumed?
                        rel = span_relationship(span, cand)
                        if rel == 'subsumes':
                            ignore = True

                # Only add if no spans are subsumed by others
                if not ignore:
                    newClassifications.append(c)

            self.classifications = newClassifications
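
For reference, this reader consumes pipe-delimited SemEval concept lines, using only the second field (comma-separated "start-end" character spans) and the third (the concept's CUI); the remaining fields are ignored, and the first field is assumed here to be the source file name. A hypothetical line (values made up) parses as follows:

# Hypothetical SemEval concept line; only fields[1] and fields[2] are used
line = '02136-017465.text|15-24,30-37|C0008031'

fields = line.strip().split('|')
cui = fields[2]
span_inds = [tuple(int(x) for x in s.split('-'))
             for s in fields[1].split(',')]

print(cui)        # C0008031
print(span_inds)  # [(15, 24), (30, 37)]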
Example #6
    def get_disjoint_IOBLabels(self):
        """
        Purpose: return a list of lists of IOB labels
        """

        # Only compute if not already memoized
        if self.iob_labels: return self.iob_labels


        # Build list of proper dimensions (1:1 with self.data)
        self.getTokenizedSentences()
        iobs = [ ['O' for tok in sent] for sent in self.data ]

        line_inds = self.derived_note.getLineIndices()
        data = self.derived_note.data
        text = self.derived_note.text

        # Add 'B's and 'I's from concept spans
        for classification in self.derived_note.getClassificationTuples():
            concept,char_spans = classification

            # Each span (could be part of a noncontiguous span)
            for span in char_spans:
                lineno, tokspan = lineno_and_tokspan(line_inds, data, text,
                                                     span, "semeval")
                start, end = tokspan

                # Update concept tokens to 'B's and 'I's
                iobs[lineno][start] = 'B'
                for i in range(start + 1, end + 1):
                    iobs[lineno][i] = 'I'

        # Memoize for next call
        self.iob_labels = iobs
        return iobs
Example #7
    def getNonContiguousSpans(self):
        """
        Purpose: Return a list of classification tuples (contains noncontig info)

        How to Use: return value is a list of (concept,lineno,chunk_inds)
                    *** where chunk_inds is a list of indices of chunks
                        that belong in the same noncontiguous span      ***
        """

        # Create mapping from token indices to chunk indices
        tok_to_chunk_index_map = []
        for iobs in self.getIOBLabels():
            # One line of chunked phrases
            line = {}
            seen_chunks = 0

            # Word-by-word grouping
            for i, iob in enumerate(iobs):
                if iob == 'O':
                    seen_chunks += 1
                if iob == 'B':
                    line[i] = seen_chunks
                    seen_chunks += 1

            tok_to_chunk_index_map.append(line)


        # Return value
        tok_classifications = []

        # Used for converting character offset -> token index
        self.getTokenizedSentences()
        line_inds = self.derived_note.getLineIndices()
        data = self.derived_note.data
        text = self.derived_note.text

        # For each concept instance
        for classification in self.derived_note.getClassificationTuples():
            concept, char_spans = classification

            # Each span (could be part of a noncontiguous span)
            tok_spans = []
            first_lineno = None

            for span in char_spans:
                # character offset span --> lineno and token index span
                lineno, tokspan = lineno_and_tokspan(line_inds, data, text,
                                                     span, self.format)
                tok_spans.append(tokspan)

                # Ensure all noncontig spans are together on one line
                if first_lineno is None: first_lineno = lineno
                assert lineno == first_lineno

            # list of token index spans --> list of chunk indices
            chunk_inds = []
            for span in tok_spans:
                ind = tok_to_chunk_index_map[lineno][span[0]]
                chunk_inds.append(ind)

            tok_classifications.append((concept, lineno, chunk_inds))


        return tok_classifications
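
The token-to-chunk mapping above can be shown standalone: each 'O' token and each 'B'-initiated chunk consumes one chunk index, while 'I' tokens are absorbed into the preceding 'B' chunk, so only 'B' positions appear in the map (made-up labels below):

# Made-up IOB line, run through the same mapping loop as above
iob_line = ['O', 'B', 'I', 'O', 'B', 'O']

tok_to_chunk = {}
seen_chunks = 0
for i, iob in enumerate(iob_line):
    if iob == 'O':
        seen_chunks += 1
    if iob == 'B':
        tok_to_chunk[i] = seen_chunks
        seen_chunks += 1

print(tok_to_chunk)  # {1: 1, 4: 3}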
Example #8
    def write(self, labels=None):
        """
        Note_i2b2::write()

        Purpose: Return the given concept label predictions in i2b2 format

        @param  labels. A list of classifications
        @return         A string of i2b2-concept-file-formatted data
        """

        # Return value
        retStr = ''

        # List of list of words (line-by-line)
        tlist = self.data

        # If given labels to write, use them. Default to self.classifications
        if labels is not None:
            classifications = labels
        elif self.classifications is not None:
            line_inds = self.line_inds
            data = self.data
            text = self.text
            classifications = []
            for concept, char_span in self.classifications:
                lineno, tokspan = lineno_and_tokspan(line_inds, data, text,
                                                     char_span)
                classifications.append(
                    (concept, lineno + 1, tokspan[0], tokspan[1]))
            classifications = sorted(classifications, cmp=classification_cmp)
        else:
            raise Exception('Cannot write concept file: must specify labels')
        # For each classification
        for classification in classifications:

            # Ensure 'none' classifications are skipped
            if classification[0] == 'none':
                raise Exception('Classification label "none" should never happen')

            concept = classification[0]
            lineno = classification[1]
            start = classification[2]
            end = classification[3]

            # A list of words (corresponding line from the text file)
            text = tlist[lineno - 1]

            # Find the text string that the concept refers to
            span = lno_and_tokspan__to__char_span(self.line_inds, self.data,
                                                  self.text, lineno - 1,
                                                  (start, end))
            span_start, span_end = span

            datum = self.text[span_start:span_end].replace('\n', '\t')

            # Classification
            label = concept

            # Print format (very similar to i2b2)
            retStr += "c=\"%s\" %d %d||t=\"%s\"\n" % (datum, span_start,
                                                      span_end, label)

        # return formatted data
        return retStr.strip()
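
Each emitted line pairs the covered text and its character span with the concept label. With made-up values, the output format looks like this:

# Hypothetical values showing the line format produced by write()
datum, span_start, span_end, label = 'chest pain', 15, 25, 'problem'
print('c="%s" %d %d||t="%s"' % (datum, span_start, span_end, label))
# -> c="chest pain" 15 25||t="problem"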