Example 1
class Sentence(object):
    """Sentence from a document, to be annotated"""
    def __init__(self, text, offset=0, **kwargs):
        self.text = text
        self.sid = kwargs.get("sid")
        self.did = kwargs.get("did")
        self.entities = Entities(sid=self.sid, did=self.did)
        self.offset = offset
        self.pairs = Pairs()
        self.parsetree = None
        self.depparse = None
        self.tokens = []
        self.regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)')

    def tokenize_words(self):
        pass

    def process_sentence(self, corenlpserver, doctype="biomedical"):
        corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={
            'ssplit.eolonly': True,
            'annotators': 'tokenize,ssplit,pos,ner,lemma',
            #'annotators': 'tokenize,ssplit,pos,parse,ner,lemma,depparse',
            'outputFormat': 'json',
        })
        if isinstance(corenlpres, basestring):
            print corenlpres
            corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={
                'ssplit.eolonly': True,
                # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                'annotators': 'tokenize,ssplit,pos,lemma',
                'outputFormat': 'json',
            })
        if isinstance(corenlpres, basestring):
            print "could not process this sentence:", self.text.encode("utf8")
            print corenlpres
        else:
            self.process_corenlp_output(corenlpres)
        return corenlpres

    def process_corenlp_output(self, corenlpres):
        """
        Process the results obtained with CoreNLP for this sentence
        :param corenlpres:
        :return:
        """
        # self.sentences = []
        if len(corenlpres['sentences']) > 1:
            print self.text
            sys.exit("Number of sentences from CoreNLP is not 1.")
        if len(corenlpres['sentences']) == 0:
            self.tokens = []
            self.create_newtoken("", {})
            logging.debug("no sentences")
            logging.debug(self.text)
            return
        sentence = corenlpres['sentences'][0]
        #logging.debug(str(sentence.keys()))
        #print "sentence", self.text.encode("utf8")
        #print "parse", pp.pprint(sentence["parse"])
        #print "basic", pp.pprint(sentence["basic-dependencies"])
        #print "collapsed", pp.pprint(sentence["collapsed-dependencies"])
        #print "ccprocessed", pp.pprint(sentence["collapsed-ccprocessed-dependencies"])
        self.parsetree = sentence.get('parse')
        self.depparse = sentence.get('basic-dependencies')
        for t in sentence['tokens']:
            if t["word"]:
                # TODO: specific rules for each corpus
                #if ""
                token_seq = self.regex_tokens.split(t["originalText"])#, flags=re.U)
                #token_seq = rext.split(r'(\w+)(/|\\|\+|\.)(\w+)', t[0])
                #token_seq = [t[0]]
                # print t[0], token_seq
                if len(token_seq) > 3: #and t["word"] not in stanford_coding.keys():
                    # logging.info("{}: {}".format(t["word"], "&".join(token_seq)))
                    for its, ts in enumerate(token_seq):
                        if ts.strip() != "":
                            charoffset_begin = int(t["characterOffsetBegin"])
                            if token_seq[:its]: # not the first token
                                charoffset_begin += sum([len(x) for x in token_seq[:its]])
                            # charoffset_begin += its
                            charoffset_end = len(ts) + charoffset_begin
                            #logging.info(str(charoffset_begin) + ":" + str(charoffset_end))
                            # slice the lemma with offsets relative to the start of the original token
                            rel_begin = charoffset_begin - int(t["characterOffsetBegin"])
                            ts_props = {"characterOffsetBegin": charoffset_begin,
                                        "characterOffsetEnd": charoffset_end,
                                        "pos": t["pos"],
                                        "ner": t["ner"],
                                        "lemma": t["lemma"][rel_begin:rel_begin + len(ts)]}
                            self.create_newtoken(ts, ts_props)

                else:
                    self.create_newtoken(t["word"], t)

    def create_newtoken(self, text, props):
        newtoken = Token2(text, order=len(self.tokens))
        try:
            newtoken.start = int(props["characterOffsetBegin"])
            newtoken.dstart = newtoken.start + self.offset
            newtoken.end = int(props["characterOffsetEnd"])
            newtoken.dend = newtoken.end + self.offset
            newtoken.pos = props["pos"]
            newtoken.tag = props["ner"]
            newtoken.lemma = props["lemma"]
            # newtoken.stem = porter.stem_word(newtoken.text)
            newtoken.tid = self.sid + ".t" + str(len(self.tokens))
            self.tokens.append(newtoken)
            # print "|{}| <=> |{}|".format(text, self.text[newtoken.start:newtoken.end])
        except KeyError:
            logging.debug("error: text={} props={}".format(text, props))
            return None
        # logging.debug(newtoken.text)
        return newtoken

    def add_relation(self, entity1, entity2, subtype, source="goldstandard", **kwargs):
        if self.pairs.pairs:
            pid = self.sid + ".p" + str(len(self.pairs.pairs))
        else:
            pid = self.sid + ".p0"
        if subtype == "tlink":
            p = TLink(entity1, entity2, original_id=kwargs.get("original_id"),
                                     did=self.did, pid=pid, rtype=subtype)
        else:
            p = Pair((entity1, entity2), subtype, pid=pid, sid=self.sid, did=self.did)
        self.pairs.add_pair(p, source)
        return p

    def exclude_entity(self, start, end, source):
        """
        Exclude all entities matching start-end relative to sentence
        :param start:
        :param end:
        """
        to_delete = []
        for e in self.entities.elist[source]:
            if e.start == start and e.end == end:
                to_delete.append(e)
                for t in e.tokens:
                    tagkeys = t.tags.keys()
                    for tag in tagkeys:
                        if tag.startswith(source):
                            del t.tags[tag]
        for e in to_delete:
            #print "removing {}".format(e)
            self.entities.elist[source].remove(e)
            #print [(ee.start, ee.end) for ee in self.entities.elist[source]]


    def tag_entity(self, start, end, etype, entity=None, source="goldstandard", exclude=None,
                   text=None, **kwargs):
        """Find the tokens that match this entity. start and end are relative to the sentence.
           Totalchars is the offset of the sentence on the document."""
        tlist = []
        # print self.tokens
        nextword = ""
        for t in self.tokens:
            # discard tokens that intersect the entity for now
            # print t.start, t.end, t.text
            if t.start >= start and t.end <= end:
                tlist.append(t)
            elif (t.start == start and t.end > end) or (t.start < start and t.end == end):
                tlist.append(t)
                break
            elif t.start == end+1:
                nextword = t.text
        # apply any exclusion spans once, after scanning the tokens
        exclude_list = []
        if exclude is not None:
            for t in tlist:
                for e in exclude:
                    if t.start >= e[0] and t.end <= e[1]-1:
                        exclude_list.append(t.tid)
        tlist = [t for t in tlist if t.tid not in exclude_list]
        if tlist:
            if exclude is not None:
                newtext = self.text[tlist[0].start:exclude[0][0]]
                #print self.text[exclude[0][0]:exclude[0][1]], exclude
                last_exclude = exclude[0]
                for e in exclude[1:]:
                    if not self.text[e[1]].isspace() and not newtext[-1].isspace():
                        newtext += " "
                    newtext += self.text[last_exclude[1]:e[0]]
                    last_exclude = e
                if not self.text[exclude[-1][1]].isspace() and not newtext[-1].isspace():
                    newtext += " "
                newtext += self.text[exclude[-1][1]:tlist[-1].end]
                # self.text[exclude[1]:tlist[-1].end]
            else:
                newtext = self.text[tlist[0].start:tlist[-1].end]
            if entity:
                entity.text = newtext
            if "text" in kwargs and newtext != kwargs["text"]:
                if newtext not in kwargs["text"] and kwargs["text"] not in newtext:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                                                                                   start, end, self.sid,
                                                                                                   self.text))
                    logging.info("text does not match: {}=>{}".format(newtext, kwargs["text"]))
                    #sys.exit()
                    #return None
                else:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                 start, end, self.sid, self.text))
                    #for t in self.tokens:
                    #    print (t.start, t.end, t.text),
                    #print
                    #return None
                    # print exclude, self.text[tlist[0].start:tlist[-1].end]
            #     print "tokens found:", [t.text for t in tlist]
                    # sys.exit()
            # else:
            # print "found the tokens!", start, end, kwargs["text"], self.sid

            if self.entities.elist.get(source):
                eid = self.sid + ".e" + str(len(self.entities.elist[source]))
            else:
                eid = self.sid + ".e0"
            subtype = kwargs.get("subtype", "all")
            if entity is None:
                if "text" in kwargs:
                    newtext = kwargs["text"]
                kwargs["eid"] = eid
                entity = create_entity(tlist, self.sid, did=self.did, text=newtext, score=kwargs.get("score"),
                                       etype=etype, eid=eid, subtype=kwargs.get("subtype"),
                                       original_id=kwargs.get("original_id"), nextword=nextword)

                entity.normalize()
            self.entities.add_entity(entity, source)
            # print self.entities.elist["goldstandard"]
            self.label_tokens(tlist, source, etype, subtype=subtype)
            #logging.debug("added {} to {}, now with {} entities".format(newtext, self.sid,
            #                                                                 len(self.entities.elist[source])))
            return eid
        else:
            logging.info("no tokens found:")
            logging.info("{} {} {} {}".format(self.sid, start, end, kwargs.get("text")))
            logging.info(str([(t.start, t.end, t.text) for t in self.tokens]))

    def label_tokens(self, tlist, source, etype, subtype="all"):
        if len(tlist) == 1:
            tlist[0].tags[source] = "single"
            tlist[0].tags[source + "_subtype"] = etype
            tlist[0].tags[source + "_" + etype] = "single"
            if subtype != "all":
                #print subtype
                tlist[0].tags[source + "_" + etype + "-" + subtype] = "single"
        else:
            for t in range(len(tlist)):
                if t == 0:
                    tlist[t].tags[source] = "start"
                    tlist[t].tags[source + "_" + etype] = "start"
                    tlist[t].tags[source + "_subtype"] = etype
                    if subtype != "all":
                        tlist[t].tags[source + "_" + etype + "-" + subtype] = "start"
                elif t == len(tlist) - 1:
                    tlist[t].tags[source] = "end"
                    tlist[t].tags[source + "_" + etype] = "end"
                    tlist[t].tags[source + "_subtype"] = etype
                    if subtype != "all":
                        tlist[t].tags[source + "_" + etype + "-" + subtype] = "end"
                else:
                    tlist[t].tags[source] = "middle"
                    tlist[t].tags[source + "_" + etype] = "middle"
                    tlist[t].tags[source + "_subtype"] = etype
                    if subtype != "all":
                        tlist[t].tags[source + "_" + etype + "-" + subtype] = "middle"
        # logging.debug([t.tags for t in tlist])

    def write_bioc_results(self, parent, source):
        bioc_sentence = ET.SubElement(parent, "sentence")
        bioc_sentence_offset = ET.SubElement(bioc_sentence, "offset")
        bioc_sentence_offset.text = str(self.tokens[0].dstart)
        bioc_sentence_text = ET.SubElement(bioc_sentence, "text")
        bioc_sentence_text.text = self.text

        if source in self.entities.elist:
            for entity in self.entities.elist[source]:
                bioc_annotation = entity.write_bioc_annotation(bioc_sentence)
        return bioc_sentence

    def get_dic(self, source):
        dic = {}
        dic["id"] = self.sid
        dic["offset"] = str(self.tokens[0].dstart)
        dic["text"] = self.text
        dic["entities"] = []
        if source in self.entities.elist:
            for entity in self.entities.elist[source]:
                dic["entities"].append(entity.get_dic())
            dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset'])
            for ei, e in enumerate(dic["entities"]):
                e["eid"] = self.sid + ".e{}".format(ei)
        elif source == "all":
            offsets = Offsets()
            for esource in self.entities.elist:
                for entity in self.entities.elist[esource]:
                    toadd, v, overlapping, to_exclude = offsets.add_offset(Offset(entity.start, entity.end),
                                                                           exclude_this_if=[1, -1, 2, -3],
                                                                           exclude_others_if=[2])
                    if toadd:
                        dic["entities"].append(entity.get_dic())
                dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset'])
                for ei, e in enumerate(dic["entities"]):
                    e["eid"] = self.sid + ".e{}".format(ei)
        dic["pairs"] = self.pairs.get_dic()
        return dic

    def find_tokens(self, text, start, end, count, relativeto="doc"):
        candidates = []
        for t in self.tokens:
            if t.text == text:
                print t.text, text
                candidates.append(t)
        print text, candidates
        if len(candidates) == 0:
            print "could not find tokens!"
        elif len(candidates) == 1:
            return candidates
        elif len(candidates)-1 > count:
            return [candidates[count]]
        """else:
            dist = []
            for c in candidates:
                if relativeto == "doc":
                    d = c.dstart
                else:
                    d = c.start
                dist.append(abs(d-start))
            return [candidates[dist.index(min(dist))]]"""

    def find_tokens_between(self, start, end, relativeto="doc"):
        """Return list of tokens between offsets. Use relativeto to consider doc indexes or
           sentence indexes."""
        foundtokens = []
        for t in self.tokens:
            if relativeto.startswith("doc") and t.dstart >= start and t.dend <= end:
                foundtokens.append(t)
            elif relativeto.startswith("sent") and t.start >= start and t.end <= end:
                foundtokens.append(t)
        return foundtokens

    def test_relations(self, pairs, basemodel, classifiers=[relations.SLK_PRED, relations.SST_PRED],
                       tag="", backup=False, printstd=False):
        #data =  ddi_train_slk.model, ddi_train_sst.model
        tempfiles = []

        if relations.SLK_PRED in classifiers:
            logging.info("**Testing SLK classifier %s ..." % (tag,))
            #testpairdic = ddi_kernels.fromddiDic(testdocs)
            ddi_kernels.generatejSREdata(pairs, self, basemodel, tag + "ddi_test_jsre.txt")
            ddi_kernels.testjSRE(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt",
                                 model=tag + "all_ddi_train_slk.model")
            self.pairs.pairs = ddi_kernels.getjSREPredicitons(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt",
                                                      self.pairs.pairs)
            tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_jsre.txt")
            tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_result.txt")

        if relations.SST_PRED in classifiers:
            logging.info("****Testing SST classifier %s ..." % (tag,))
            self.pairs.pairs = ddi_kernels.testSVMTK(self, self.pairs.pairs, pairs,
                                             model=tag + "all_ddi_train_sst.model", tag=tag)
        for p in self.pairs.pairs:
            for r in self.pairs.pairs[p].recognized_by:
                if self.pairs.pairs[p].recognized_by[r] == 1:
                    p.relation = True
        return tempfiles

    def get_entitites_between(self, entity1, entity2, source):
        if entity1.start > entity2.start:  # entity1 should always be the first entity
            entity1, entity2 = entity2, entity1
        first_between = entity1.end
        last_between = entity2.start
        entities = []
        for entity in self.entities.elist[source]:
            if entity.start >= first_between and entity.end <= last_between:
                entities.append(entity)
        return entities
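
A minimal usage sketch for the Sentence class above. It assumes a local CoreNLP server wrapped with pycorenlp (whose annotate() returns the parsed JSON as a dict, or a plain string on failure, which is exactly what process_sentence() checks for); the URL, the sample text and the "chemical" entity type are illustrative assumptions, and tag_entity() additionally depends on the package's create_entity() helper.

# Sketch only: the pycorenlp wrapper, URL and entity type are assumptions, not part of the original code.
from pycorenlp import StanfordCoreNLP

corenlp = StanfordCoreNLP("http://localhost:9000")
sent = Sentence(u"Caffeine blocks adenosine receptors.", offset=0, sid="d0.s1", did="d0")
sent.process_sentence(corenlp)              # tokenize, POS-tag and lemmatize via CoreNLP
for token in sent.tokens:                   # tokens built by create_newtoken()
    print token.tid, token.text, token.pos, token.lemma
# sentence-relative offsets 0..8 cover "Caffeine"
eid = sent.tag_entity(0, 8, "chemical", text=u"Caffeine")
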
Example 2
class Document(object):
    """A document is constituted by one or more sentences. It should have an ID and
    title. s0, the first sentence, is always the title sentence."""

    def __init__(self, text, process=False, doctype="biomedical", ssplit=False, **kwargs):
        self.text = text
        self.title = kwargs.get("title")
        self.sentences = kwargs.get("sentences", [])
        self.did = kwargs.get("did", "d0")
        self.invalid_sids = []
        self.title_sids = []
        self.source = kwargs.get("source")
        self.pairs = Pairs()
        if ssplit:
            self.sentence_tokenize(doctype)
        if process:
            self.process_document(doctype)

    def sentence_tokenize(self, doctype):
        """
        Split the document text into sentences, add to self.sentences list
        :param doctype: Can be used in the future to choose different methods
        """
        # first sentence should be the title if it exists
        #if self.title:
        #    sid = self.did + ".s0"
        #    self.sentences.append(Sentence(self.title, sid=sid, did=self.did))
        # inputtext = clean_whitespace(self.text)
        inputtext = self.text
        with codecs.open("/tmp/geniainput.txt", 'w', 'utf-8') as geniainput:
            geniainput.write(inputtext)
        current_dir = os.getcwd()
        os.chdir(geniass_path)
        geniaargs = ["./geniass", "/tmp/geniainput.txt", "/tmp/geniaoutput.txt"]
        Popen(geniaargs, stdout=PIPE, stderr=PIPE).communicate()
        os.chdir(current_dir)
        offset = 0
        with codecs.open("/tmp/geniaoutput.txt", 'r', "utf-8") as geniaoutput:
            for l in geniaoutput:
                stext = l.strip()
                if stext == "":
                    offset = self.get_space_between_sentences(offset)
                    continue
                sid = self.did + ".s" + str(len(self.sentences))
                self.sentences.append(Sentence(stext, offset=offset, sid=sid, did=self.did))
                offset += len(stext)
                offset = self.get_space_between_sentences(offset)

    def process_document(self, corenlpserver, doctype="biomedical"):
        """
        Process each sentence of the text with Stanford CoreNLP, splitting the text into sentences first if there are none
        :param corenlpserver:
        :param doctype:
        :return:
        """
        if len(self.sentences) == 0:
            # use specific sentence splitter
            self.sentence_tokenize(doctype)
        for s in self.sentences:
            #corenlpres = corenlpserver.raw_parse(s.text)
            corenlpres = corenlpserver.annotate(s.text.encode("utf8"), properties={
                'ssplit.eolonly': True,
                #'annotators': 'tokenize,ssplit,pos,ner,lemma',
                'annotators': 'tokenize,ssplit,pos,parse,ner,lemma,depparse',
                'outputFormat': 'json',
            })
            if isinstance(corenlpres, basestring):
                print corenlpres
                corenlpres = corenlpserver.annotate(s.text.encode("utf8"), properties={
                    'ssplit.eolonly': True,
                    # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                    'annotators': 'tokenize,ssplit,pos,ner,lemma',
                    'outputFormat': 'json',
                })
            if isinstance(corenlpres, basestring):
                print "could not process this sentence:", s.text.encode("utf8")
                print corenlpres
                continue
            else:
                s.process_corenlp_output(corenlpres)


    def tag_chemdner_entity(self, start, end, subtype, **kwargs):
        """
        Create a CHEMDNER entity relative to this document.
        :param start: Start index of entity
        :param end: End index of entity
        :param subtype: Subtype of CHEMDNER entity
        :param kwargs: Extra arguments, such as the entity text
        :return:
        """
        doct = kwargs.get("doct")
        if doct == "T": # If it's in the title, we already know the sentence (it's the first)
            self.sentences[0].tag_entity(start, end, subtype, **kwargs)
        else: # we have to find the sentence
            found = False
            totalchars = 0
            for s in self.sentences[1:]:
                if totalchars <= start and totalchars + len(s.text) >= end:  # entity is in this sentence
                    s.tag_entity(start-totalchars, end-totalchars, subtype,
                                 totalchars=totalchars, **kwargs)
                    # print "found entity on sentence %s" % s.sid
                    found = True
                    break

                totalchars += len(s.text)
                totalchars = self.get_space_between_sentences(totalchars)
            if not found:
                print "could not find sentence for %s:%s on %s!" % (start,
                                                                       end, self.did)
                # sys.exit()

    def add_relation(self, entity1, entity2, subtype, relation, source="goldstandard", **kwargs):
        if self.pairs.pairs:
            pid = self.did + ".p" + str(len(self.pairs.pairs))
        else:
            pid = self.did + ".p0"
        between_text = self.text[entity1.dend:entity2.start]
        logging.info("adding {}:{}=>{}".format(pid, entity1.text.encode("utf8"), entity2.text.encode("utf8")))
        # print between_text
        if subtype == "tlink":
            pair = TLink(entity1, entity2, relation=relation, original_id=kwargs.get("original_id"),
                                     did=self.did, pid=pid, rtype=subtype, between_text=between_text)
        else:
            pair = Pair((entity1, entity2), subtype, did=self.did, pid=pid, original_id=kwargs.get("original_id"), between_text=between_text)
        self.pairs.add_pair(pair, source)
        return pair

    def get_space_between_sentences(self, totalchars):
        """
        When the sentences are split, the whitespace between each sentence is not preserved, so we need to get it back
        :param totalchars: offset of the end of the sentence
        :return: Index where the next sentence starts
        """
        while totalchars < len(self.text) and self.text[totalchars].isspace():
            totalchars += 1
        return totalchars

    def get_unique_results(self, source, ths, rules, mode):
        doc_entities = {}
        for s in self.sentences:
            sentence_entitites = {}
            if s.entities:
                if mode == "ner":
                    sentence_entitites = s.entities.get_unique_entities(source, ths, rules)
                    for e in sentence_entitites:
                        sentence_entitites[e].append(s.text[int(sentence_entitites[e][1]):int(sentence_entitites[e][2])])
                    # print sentence_entitites
                elif mode == "re":
                    sentence_entitites = s.entities.get_unique_relations(source)
            # print doc_entities, sentence_entitites
            doc_entities.update(sentence_entitites)
            # print doc_entities
            # print
        logging.info("{} has {} unique entities".format(self.did, len(doc_entities)))
        return doc_entities

    def write_chemdner_results(self, source, outfile, ths={"chebi":0.0}, rules=[]):
        lines = []
        totalentities = 0
        for s in self.sentences:
            # print "processing", s.sid, "with", len(s.entities.elist[source]), "entities"
            if s.entities:
                res = s.entities.write_chemdner_results(source, outfile, ths, rules, totalentities+1)
                lines += res[0]
                totalentities = res[1]
        return lines

    def write_bioc_results(self, parent, source, ths={}):
        bioc_document = ET.SubElement(parent, "document")
        bioc_id = ET.SubElement(bioc_document, "id")
        bioc_id.text = self.did

        bioc_title_passage = ET.SubElement(bioc_document, "passage")
        bioc_title_info = ET.SubElement(bioc_title_passage, "infon", {"key":"type"})
        bioc_title_info.text = "title"
        bioc_title_offset = ET.SubElement(bioc_title_passage, "offset")
        bioc_title_offset.text = str(0)
        bioc_title = self.sentences[0].write_bioc_results(bioc_title_passage, source)

        bioc_abstract_passage = ET.SubElement(bioc_document, "passage")
        bioc_abstract_info = ET.SubElement(bioc_abstract_passage, "infon", {"key":"type"})
        bioc_abstract_info.text = "abstract"
        bioc_abstract_offset = ET.SubElement(bioc_abstract_passage, "offset")
        bioc_abstract_offset.text = str(len(self.sentences[0].text) + 1)
        for i, sentence in enumerate(self.sentences[1:]):
            bioc_sentence = sentence.write_bioc_results(bioc_abstract_passage, source)
        return bioc_document

    def get_dic(self, source, ths={}):
        dic = {"title":{}, "abstract":{}}
        dic = {"abstract":{}}
        # dic["title"]["offset"] = "0"
        # dic["title"]["sentences"] = self.sentences[0].get_dic(source)

        dic["abstract"]["offset"] = str(len(self.sentences[0].text) + 1)
        dic["abstract"]["sentences"] = []
        for i, sentence in enumerate(self.sentences[1:]):
            dic["abstract"]["sentences"].append(sentence.get_dic(source))
        return dic

    def get_sentence(self, sid):
        """
        Get the sentence by sentence ID
        :param sid: sentence ID
        :return: the sentence object if it exists
        """
        for s in self.sentences:
            # logging.debug([(t.start, t.end) for t in s.tokens])
            if s.sid == sid:
                # logging.debug("found sid: {}".format(sid))
                return s
        return None

    def find_sentence_containing(self, start, end, chemdner=True):
        """
            Find the sentence that contains the span from start to end. If chemdner, do not consider the
            first sentence, which is the title.
        """
        if chemdner:
            firstsent = 1
        else:
            firstsent = 0
        for i, s in enumerate(self.sentences[firstsent:]):
            if len(s.tokens) == 0:
                logging.debug("sentence without tokens: {} {}".format(s.sid, s.text))
                continue
            if s.tokens[0].dstart <= start and s.tokens[-1].dend >= end:
                # print "found it!"
                return s
        for s in self.sentences:
            logging.debug("{} {} {} {} {}".format(s.tokens[0].dstart <= start, s.tokens[-1].dend >= end,
                                                s.tokens[0].dstart, s.tokens[-1].dend, s.text))
        return None

    def get_entity_offsets(self, esource, ths, rules):
        offsets = []
        for s in self.sentences:
            if s.entities:
                offsets += s.entities.get_entity_offsets(esource, ths, rules)
        return offsets

    def get_entity(self, eid, source="goldstandard"):
        for sentence in self.sentences:
            for e in sentence.entities.elist[source]:
                if e.eid == eid:
                   return e
        print "no entity found for eid {}".format(eid)
        return None

    def get_entities(self, source):
        entities = []
        for s in self.sentences:
            if source in s.entities.elist:
                for e in s.entities.elist[source]:
                    entities.append(e)
        return entities

    def get_abbreviations(self):
        self.abbreviations = {}
        first_elem = []
        second_elem = []
        open_paren = False
        for sentence in self.sentences:
            # print sentence.text
            for i, t in enumerate(sentence.tokens):
                if t.text == "-LRB-":
                    open_paren = True
                    last_token = sentence.tokens[i-1]
                    while last_token.pos.startswith("NN") or last_token.pos.startswith("JJ"): # use nouns before the parenthesis
                        first_elem.insert(0, last_token)
                        if last_token.order == 0:
                            break
                        else:
                            last_token = sentence.tokens[last_token.order - 1]  # check the token before this one
                    if len(first_elem) > 0:
                        logging.info("starting abbreviation for this text: " + str([tt.text for tt in first_elem]))
                    else:
                        open_paren = False
                elif t.text == "-RRB-" and open_paren == True:
                    first_text = sentence.text[first_elem[0].start:first_elem[-1].end]
                    second_text = sentence.text[second_elem[0].start:second_elem[-1].end]
                    if len(first_text) > len(second_text): #abbreviation is the smallest word
                        second_text, first_text = first_text, second_text
                    # rules
                    if not first_text.islower() and len(first_text) > 1:
                        self.abbreviations[first_text] = second_text
                    open_paren = False
                    first_elem = []
                    second_elem = []
                elif open_paren:
                    second_elem.append(t)
        for abv in self.abbreviations:
            if not any([c.isalpha() for c in abv]):
                print abv, ":", self.abbreviations[abv]
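
A short usage sketch for the Document class above. It assumes the GENIA sentence splitter is installed at geniass_path (sentence_tokenize() shells out to ./geniass via /tmp files) and reuses the corenlp wrapper from the Sentence example; the text and the expected abbreviation are illustrative, so the exact output depends on the CoreNLP models.

# Sketch only: geniass_path, the corenlp wrapper and the sample text are assumptions.
abstract = u"Acetylsalicylic acid (ASA) is an NSAID. It interacts with ibuprofen."
doc = Document(abstract, did="d1", ssplit=True)   # splits with geniass, storing per-sentence offsets
doc.process_document(corenlp)                     # runs CoreNLP on every sentence
for s in doc.sentences:
    # token.dstart = token.start + sentence.offset, so tokens can be located in doc.text
    print s.sid, s.offset, s.text
doc.get_abbreviations()   # pairs "-LRB- ... -RRB-" spans with the preceding noun phrase
print doc.abbreviations   # expected to contain something like {u"ASA": u"Acetylsalicylic acid"}
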
Example 3
class Document(object):
    """A document is constituted by one or more sentences. It should have an ID and
    title. s0, the first sentence, is always the title sentence."""
    def __init__(self,
                 text,
                 process=False,
                 doctype="biomedical",
                 ssplit=False,
                 **kwargs):
        self.text = text
        self.title = kwargs.get("title")
        self.sentences = kwargs.get("sentences", [])
        self.did = kwargs.get("did", "d0")
        self.invalid_sids = []
        self.title_sids = []
        self.pairs = Pairs()
        if ssplit:
            self.sentence_tokenize(doctype)
        if process:
            self.process_document(doctype)

    def sentence_tokenize(self, doctype):
        """
        Split the document text into sentences, add to self.sentences list
        :param doctype: Can be used in the future to choose different methods
        """
        # first sentence should be the title if it exists
        if self.title:
            sid = self.did + ".s0"
            self.sentences.append(Sentence(self.title, sid=sid, did=self.did))
        # inputtext = clean_whitespace(self.text)
        inputtext = self.text
        with codecs.open("/tmp/geniainput.txt", 'w', 'utf-8') as geniainput:
            geniainput.write(inputtext)
        current_dir = os.getcwd()
        os.chdir(geniass_path)
        geniaargs = [
            "./geniass", "/tmp/geniainput.txt", "/tmp/geniaoutput.txt"
        ]
        Popen(geniaargs, stdout=PIPE, stderr=PIPE).communicate()
        os.chdir(current_dir)
        offset = 0
        with codecs.open("/tmp/geniaoutput.txt", 'r', "utf-8") as geniaoutput:
            for l in geniaoutput:
                stext = l.strip()
                if stext == "":
                    offset = self.get_space_between_sentences(offset)
                    continue
                sid = self.did + ".s" + str(len(self.sentences))
                self.sentences.append(
                    Sentence(stext, offset=offset, sid=sid, did=self.did))
                offset += len(stext)
                offset = self.get_space_between_sentences(offset)

    def process_document(self, corenlpserver, doctype="biomedical"):
        """
        Process each sentence of the text with Stanford CoreNLP, splitting the text into sentences first if there are none
        :param corenlpserver:
        :param doctype:
        :return:
        """
        if len(self.sentences) == 0:
            # use specific sentence splitter
            self.sentence_tokenize(doctype)
        for s in self.sentences:
            #corenlpres = corenlpserver.raw_parse(s.text)
            corenlpres = corenlpserver.annotate(
                s.text.encode("utf8"),
                properties={
                    'ssplit.eolonly': True,
                    # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                    'annotators':
                    'tokenize,ssplit,pos,parse,ner,lemma,depparse',
                    'gazetteer':
                    '/scr/nlp/data/machine-reading/Machine_Reading_P1_Reading_Task_V2.0/data/SportsDomain/NFLScoring_UseCase/NFLgazetteer.txt',
                    'outputFormat': 'json',
                })
            if isinstance(corenlpres, basestring):
                print corenlpres
                corenlpres = corenlpserver.annotate(
                    s.text.encode("utf8"),
                    properties={
                        'ssplit.eolonly': True,
                        # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                        'nfl.gazetteer':
                        '/scr/nlp/data/machine-reading/Machine_Reading_P1_Reading_Task_V2.0/data/SportsDomain/NFLScoring_UseCase/NFLgazetteer.txt',
                        'annotators': 'tokenize,ssplit,pos,ner,lemma',
                        'outputFormat': 'json',
                    })
            s.process_corenlp_sentence(corenlpres)

    def tag_chemdner_entity(self, start, end, subtype, **kwargs):
        """
        Create a CHEMDNER entity relative to this document.
        :param start: Start index of entity
        :param end: End index of entity
        :param subtype: Subtype of CHEMDNER entity
        :param kwargs: Extra arguments, such as the entity text
        :return:
        """
        doct = kwargs.get("doct")
        if doct == "T":  # If it's in the title, we already know the sentence (it's the first)
            self.sentences[0].tag_entity(start, end, subtype, **kwargs)
        else:  # we have to find the sentence
            found = False
            totalchars = 0
            for s in self.sentences[1:]:
                if totalchars <= start and totalchars + len(
                        s.text) >= end:  # entity is in this sentence
                    s.tag_entity(start - totalchars,
                                 end - totalchars,
                                 subtype,
                                 totalchars=totalchars,
                                 **kwargs)
                    # print "found entity on sentence %s" % s.sid
                    found = True
                    break

                totalchars += len(s.text)
                totalchars = self.get_space_between_sentences(totalchars)
            if not found:
                print "could not find sentence for %s:%s on %s!" % (start, end,
                                                                    self.did)
                # sys.exit()

    def add_relation(self,
                     entity1,
                     entity2,
                     subtype,
                     relation,
                     source="goldstandard",
                     **kwargs):
        if self.pairs.pairs:
            pid = self.did + ".p" + str(len(self.pairs.pairs))
        else:
            pid = self.did + ".p0"
        between_text = self.text[entity1.dend:entity2.start]
        logging.info("adding {}:{}=>{}".format(pid,
                                               entity1.text.encode("utf8"),
                                               entity2.text.encode("utf8")))
        # print between_text
        if subtype == "tlink":
            pair = TLink(entity1,
                         entity2,
                         relation=relation,
                         original_id=kwargs.get("original_id"),
                         did=self.did,
                         pid=pid,
                         rtype=subtype,
                         between_text=between_text)
        else:
            pair = Pair((entity1, entity2),
                        subtype,
                        did=self.did,
                        pid=pid,
                        original_id=kwargs.get("original_id"),
                        between_text=between_text)
        self.pairs.add_pair(pair, source)
        return pair

    def get_space_between_sentences(self, totalchars):
        """
        When the sentences are split, the whitespace between each sentence is not preserved, so we need to get it back
        :param totalchars: offset of the end of the sentence
        :return: Index where the next sentence starts
        """
        while totalchars < len(self.text) and self.text[totalchars].isspace():
            totalchars += 1
        return totalchars

    def get_unique_results(self, source, ths, rules, mode):
        entries = set()
        for s in self.sentences:
            if s.entities:
                if mode == "ner":
                    sentence_entries = s.entities.get_unique_entities(
                        source, ths, rules)
                elif mode == "re":
                    sentence_entries = s.entities.get_unique_relations(source)
                entries.update(sentence_entries)
        return entries

    def write_chemdner_results(self,
                               source,
                               outfile,
                               ths={"chebi": 0.0},
                               rules=[]):
        lines = []
        totalentities = 0
        for s in self.sentences:
            # print "processing", s.sid, "with", len(s.entities.elist[source]), "entities"
            if s.entities:
                res = s.entities.write_chemdner_results(
                    source, outfile, ths, rules, totalentities + 1)
                lines += res[0]
                totalentities = res[1]
        return lines

    def write_bioc_results(self, parent, source, ths={}):
        bioc_document = ET.SubElement(parent, "document")
        bioc_id = ET.SubElement(bioc_document, "id")
        bioc_id.text = self.did

        bioc_title_passage = ET.SubElement(bioc_document, "passage")
        bioc_title_info = ET.SubElement(bioc_title_passage, "infon",
                                        {"key": "type"})
        bioc_title_info.text = "title"
        bioc_title_offset = ET.SubElement(bioc_title_passage, "offset")
        bioc_title_offset.text = str(0)
        bioc_title = self.sentences[0].write_bioc_results(
            bioc_title_passage, source)

        bioc_abstract_passage = ET.SubElement(bioc_document, "passage")
        bioc_abstract_info = ET.SubElement(bioc_abstract_passage, "infon",
                                           {"key": "type"})
        bioc_abstract_info.text = "abstract"
        bioc_abstract_offset = ET.SubElement(bioc_abstract_passage, "offset")
        bioc_abstract_offset.text = str(len(self.sentences[0].text) + 1)
        for i, sentence in enumerate(self.sentences[1:]):
            bioc_sentence = sentence.write_bioc_results(
                bioc_abstract_passage, source)
        return bioc_document

    def get_dic(self, source, ths={}):
        dic = {"title": {}, "abstract": {}}
        dic = {"abstract": {}}
        # dic["title"]["offset"] = "0"
        # dic["title"]["sentences"] = self.sentences[0].get_dic(source)

        dic["abstract"]["offset"] = str(len(self.sentences[0].text) + 1)
        dic["abstract"]["sentences"] = []
        for i, sentence in enumerate(self.sentences[1:]):
            dic["abstract"]["sentences"].append(sentence.get_dic(source))
        return dic

    def get_sentence(self, sid):
        """
        Get the sentence by sentence ID
        :param sid: sentence ID
        :return: the sentence object if it exists
        """
        for s in self.sentences:
            # logging.debug([(t.start, t.end) for t in s.tokens])
            if s.sid == sid:
                # logging.debug("found sid: {}".format(sid))
                return s
        return None

    def find_sentence_containing(self, start, end, chemdner=True):
        """
            Find the sentence that contains the span from start to end. If chemdner, do not consider the
            first sentence, which is the title.
        """
        if chemdner:
            firstsent = 1
        else:
            firstsent = 0
        for i, s in enumerate(self.sentences[firstsent:]):
            if len(s.tokens) == 0:
                logging.debug("sentence without tokens: {} {}".format(
                    s.sid, s.text))
                continue
            if s.tokens[0].dstart <= start and s.tokens[-1].dend >= end:
                # print "found it!"
                return s
        for s in self.sentences:
            print s.tokens[0].dstart <= start, s.tokens[
                -1].dend >= end, s.tokens[0].dstart, s.tokens[-1].dend, s.text
        return None

    def get_offsets(self, esource, ths, rules, off_list=None):
        #print esource

        offsets = []
        for s in self.sentences:
            #print s.text
            offies = gazette.easy_search_terms(s, esource, ths, rules,
                                               off_list)
            if len(offies) == 1:
                offsets += offies  #Check it doesn't affect normal results
            else:
                if s.entities:
                    offsets += s.entities.get_offsets2(esource, ths, rules)
                    offsets += offies

        return list(set(offsets))

    def get_entity(self, eid, source="goldstandard"):
        for sentence in self.sentences:
            for e in sentence.entities.elist[source]:
                if e.eid == eid:
                    return e
        print "no entity found for eid {}".format(eid)
        return None

    def get_entities(self, source):
        entities = []
        for s in self.sentences:
            if source in s.entities.elist:
                for e in s.entities.elist[source]:
                    entities.append(e)
        return entities
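
A small sketch of the offset bookkeeping in this Document variant, where sentence_tokenize() keeps the title as sentence s0 and stores a document-relative offset for every abstract sentence, with get_space_between_sentences() recovering the whitespace that the splitter drops. It assumes geniass is installed at geniass_path and splits the illustrative text at the sentence boundaries shown.

# Sketch only: geniass availability and the sample text are assumptions.
doc = Document(u"Caffeine is a stimulant.  It blocks adenosine receptors.",
               did="d3", title=u"Effects of caffeine", ssplit=True)
for s in doc.sentences[1:]:   # s0 is the title and is not part of doc.text
    # each abstract sentence should be recoverable from doc.text at its stored offset
    print s.sid, s.offset, doc.text[s.offset:s.offset + len(s.text)] == s.text
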
Example 4
class Sentence(object):
    """Sentence from a document, to be annotated"""
    def __init__(self, text, offset=0, **kwargs):
        self.text = text
        self.sid = kwargs.get("sid")
        self.did = kwargs.get("did")
        self.entities = Entities(sid=self.sid, did=self.did)
        self.offset = offset
        self.pairs = Pairs()
        self.parsetree = None
        self.depparse = None
        self.tokens = []
        self.regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)')

    def tokenize_words(self):
        pass

    def process_sentence(self, corenlpserver, doctype="biomedical"):
        corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={
            'ssplit.eolonly': True,
            # 'annotators': 'tokenize,ssplit,pos,ner,lemma',
            'annotators': 'tokenize,ssplit,pos,parse,ner,lemma,depparse',
            'outputFormat': 'json',
        })
        if isinstance(corenlpres, basestring):
            print corenlpres
            corenlpres = corenlpserver.annotate(self.text.encode("utf8"), properties={
                'ssplit.eolonly': True,
                # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                'annotators': 'tokenize,ssplit,pos,ner,lemma',
                'outputFormat': 'json',
            })
        if isinstance(corenlpres, basestring):
            print "could not process this sentence:", self.text.encode("utf8")
            print corenlpres
        else:
            self.process_corenlp_output(corenlpres)
        return corenlpres

    def process_corenlp_output(self, corenlpres):
        """
        Process the results obtained with CoreNLP for this sentence
        :param corenlpres:
        :return:
        """
        # self.sentences = []
        if len(corenlpres['sentences']) > 1:
            print self.text
            sys.exit("Number of sentences from CoreNLP is not 1.")
        if len(corenlpres['sentences']) == 0:
            self.tokens = []
            self.create_newtoken("", {})
            logging.debug("no sentences")
            logging.debug(self.text)
            return
        sentence = corenlpres['sentences'][0]
        #logging.debug(str(sentence.keys()))
        #print "sentence", self.text.encode("utf8")
        #print "parse", pp.pprint(sentence["parse"])
        #print "basic", pp.pprint(sentence["basic-dependencies"])
        #print "collapsed", pp.pprint(sentence["collapsed-dependencies"])
        #print "ccprocessed", pp.pprint(sentence["collapsed-ccprocessed-dependencies"])
        self.parsetree = sentence.get('parse')
        self.depparse = sentence.get('basic-dependencies')
        for t in sentence['tokens']:
            # print t[0]
            if t["word"]:
                # TODO: specific rules for each corpus
                #if ""
                token_seq = self.regex_tokens.split(t["word"])#, flags=re.U)
                #token_seq = rext.split(r'(\w+)(/|\\|\+|\.)(\w+)', t[0])
                #token_seq = [t[0]]
                # print t[0], token_seq
                if len(token_seq) > 3 and t["word"] not in stanford_coding.keys():
                    # logging.info("{}: {}".format(t["word"], "&".join(token_seq)))
                    for its, ts in enumerate(token_seq):
                        if ts.strip() != "":
                            charoffset_begin = int(t["characterOffsetBegin"])
                            if token_seq[:its]: # not the first token
                                charoffset_begin += sum([len(x) for x in token_seq[:its]])
                            # charoffset_begin += its
                            charoffset_end = len(ts) + charoffset_begin
                            #logging.info(str(charoffset_begin) + ":" + str(charoffset_end))
                            ts_props = {"characterOffsetBegin": charoffset_begin,
                                        "characterOffsetEnd": charoffset_end,
                                        "pos": t["pos"],
                                        "ner": t["ner"],
                                        "lemma": t["lemma"][charoffset_begin:charoffset_end]}
                            self.create_newtoken(ts, ts_props)

                else:
                    self.create_newtoken(t["word"], t)

    def create_newtoken(self, text, props):
        newtoken = Token2(text, order=len(self.tokens))
        try:
            newtoken.start = int(props["characterOffsetBegin"])
            newtoken.dstart = newtoken.start + self.offset
            newtoken.end = int(props["characterOffsetEnd"])
            newtoken.dend = newtoken.end + self.offset
            newtoken.pos = props["pos"]
            newtoken.tag = props["ner"]
            newtoken.lemma = props["lemma"]
            # newtoken.stem = porter.stem_word(newtoken.text)
            newtoken.tid = self.sid + ".t" + str(len(self.tokens))
            self.tokens.append(newtoken)
            # print "|{}| <=> |{}|".format(text, self.text[newtoken.start:newtoken.end])
        except KeyError:
            logging.debug("error: text={} props={}".format(text, props))
            return None
        # logging.debug(newtoken.text)
        return newtoken

    def add_relation(self, entity1, entity2, subtype, source="goldstandard", **kwargs):
        if self.pairs.pairs:
            pid = self.sid + ".p" + str(len(self.pairs.pairs))
        else:
            pid = self.sid + ".p0"
        if subtype == "tlink":
            p = TLink(entity1, entity2, original_id=kwargs.get("original_id"),
                                     did=self.did, pid=pid, rtype=subtype)
        else:
            p = Pair((entity1, entity2), subtype, pid=pid, sid=self.sid, did=self.did)
        self.pairs.add_pair(p, source)
        return p

    def exclude_entity(self, start, end, source):
        """
        Exclude all entities matching start-end relative to sentence
        :param start:
        :param end:
        """
        to_delete = []
        for e in self.entities.elist[source]:
            if e.start == start and e.end == end:
                to_delete.append(e)
                for t in e.tokens:
                    tagkeys = t.tags.keys()
                    for tag in tagkeys:
                        if tag.startswith(source):
                            del t.tags[tag]
        for e in to_delete:
            #print "removing {}".format(e)
            self.entities.elist[source].remove(e)
            #print [(ee.start, ee.end) for ee in self.entities.elist[source]]


    def tag_entity(self, start, end, etype, entity=None, source="goldstandard", exclude=None, **kwargs):
        """Find the tokens that match this entity. start and end are relative to the sentence.
           Totalchars is the offset of the sentence on the document."""
        tlist = []
        # print self.tokens
        nextword = ""
        for t in self.tokens:
            # discard tokens that intersect the entity for now
            # print t.start, t.end, t.text
            if t.start >= start and t.end <= end:
                tlist.append(t)
            elif (t.start == start and t.end > end) or (t.start < start and t.end == end):
                tlist.append(t)
                break
            elif t.start == end+1:
                nextword = t.text
        # apply any exclusion spans once, after scanning the tokens
        exclude_list = []
        if exclude is not None:
            for t in tlist:
                for e in exclude:
                    if t.start >= e[0] and t.end <= e[1]-1:
                        exclude_list.append(t.tid)
        tlist = [t for t in tlist if t.tid not in exclude_list]
        if tlist:
            if exclude is not None:
                newtext = self.text[tlist[0].start:exclude[0][0]]
                #print self.text[exclude[0][0]:exclude[0][1]], exclude
                last_exclude = exclude[0]
                for e in exclude[1:]:
                    if not self.text[e[1]].isspace() and not newtext[-1].isspace():
                        newtext += " "
                    newtext += self.text[last_exclude[1]:e[0]]
                    last_exclude = e
                if not self.text[exclude[-1][1]].isspace() and not newtext[-1].isspace():
                    newtext += " "
                newtext += self.text[exclude[-1][1]:tlist[-1].end]
                # self.text[exclude[1]:tlist[-1].end]
            else:
                newtext = self.text[tlist[0].start:tlist[-1].end]
            if entity:
                entity.text = newtext
            if "text" in kwargs and newtext != kwargs["text"]:
                if newtext not in kwargs["text"] and kwargs["text"] not in newtext:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                                                                                   start, end, self.sid,
                                                                                                   self.text))
                    logging.info("text does not match: {}=>{}".format(newtext, kwargs["text"]))
                    #sys.exit()
                    return None
                else:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                 start, end, self.sid, self.text))
                    return None
                    # print exclude, self.text[tlist[0].start:tlist[-1].end]
            #     print "tokens found:", [t.text for t in tlist]
                    # sys.exit()
            # else:
            # print "found the tokens!", start, end, kwargs["text"], self.sid

            if self.entities.elist.get(source):
                eid = self.sid + ".e" + str(len(self.entities.elist[source]))
            else:
                eid = self.sid + ".e0"
            if entity is None:
                if "text" in kwargs:
                    newtext = kwargs["text"]
                entity = create_entity(tlist, self.sid, did=self.did, text=newtext, score=kwargs.get("score"),
                                       etype=etype, eid=eid, subtype=kwargs.get("subtype"),
                                       original_id=kwargs.get("original_id"), nextword=nextword)

                entity.normalize()
            self.entities.add_entity(entity, source)
            self.label_tokens(tlist, source, etype)
            #logging.debug("added {} to {}, now with {} entities".format(newtext, self.sid,
            #                                                                 len(self.entities.elist[source])))
            return eid
        else:
            logging.info("no tokens found:")
            logging.info("{} {} {} {}".format(self.sid, start, end, kwargs.get("text")))
            logging.info(str([(t.start, t.end, t.text) for t in self.tokens]))

    def label_tokens(self, tlist, source, etype):
        if len(tlist) == 1:
            tlist[0].tags[source] = "single"
            tlist[0].tags[source + "_subtype"] = etype
            tlist[0].tags[source + "_" + etype] = "single"
        else:
            for t in range(len(tlist)):
                if t == 0:
                    tlist[t].tags[source] = "start"
                    tlist[t].tags[source + "_" + etype] = "start"
                    tlist[t].tags[source + "_subtype"] = etype
                elif t == len(tlist) - 1:
                    tlist[t].tags[source] = "end"
                    tlist[t].tags[source + "_" + etype] = "end"
                    tlist[t].tags[source + "_subtype"] = etype
                else:
                    tlist[t].tags[source] = "middle"
                    tlist[t].tags[source + "_" + etype] = "middle"
                    tlist[t].tags[source + "_subtype"] = etype
        # logging.debug([t.tags for t in tlist])

    def write_bioc_results(self, parent, source):
        bioc_sentence = ET.SubElement(parent, "sentence")
        bioc_sentence_offset = ET.SubElement(bioc_sentence, "offset")
        bioc_sentence_offset.text = str(self.tokens[0].dstart)
        bioc_sentence_text = ET.SubElement(bioc_sentence, "text")
        bioc_sentence_text.text = self.text

        if source in self.entities.elist:
            for entity in self.entities.elist[source]:
                bioc_annotation = entity.write_bioc_annotation(bioc_sentence)
        return bioc_sentence

    def get_dic(self, source):
        dic = {}
        dic["id"] = self.sid
        dic["offset"] = str(self.tokens[0].dstart)
        dic["text"] = self.text
        dic["entities"] = []
        if source in self.entities.elist:
            for entity in self.entities.elist[source]:
                dic["entities"].append(entity.get_dic())
            dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset'])
            for ei, e in enumerate(dic["entities"]):
                e["eid"] = self.sid + ".e{}".format(ei)
        elif source == "all":
            offsets = Offsets()
            for esource in self.entities.elist:
                for entity in self.entities.elist[esource]:
                    toadd, v, overlapping, to_exclude = offsets.add_offset(Offset(entity.start, entity.end),
                                                                           exclude_this_if=[1, -1, 2, -3],
                                                                           exclude_others_if=[2])
                    if toadd:
                        dic["entities"].append(entity.get_dic())
                dic["entities"] = sorted(dic["entities"], key=lambda k: k['offset'])
                for ei, e in enumerate(dic["entities"]):
                    e["eid"] = self.sid + ".e{}".format(ei)
        dic["pairs"] = self.pairs.get_dic()
        return dic

    def find_tokens(self, text, start, end, count, relativeto="doc"):
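        """
        Return the tokens of this sentence whose text matches exactly. With a
        single match, return it as a one-element list; with several matches,
        return the count-th one (end and relativeto are currently unused).
        """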
        candidates = []
        for t in self.tokens:
            if t.text == text:
                print t.text, text
                candidates.append(t)
        print text, candidates
        if len(candidates) == 0:
            print "could not find tokens!"
        elif len(candidates) == 1:
            return candidates
        elif len(candidates) > count:
            return [candidates[count]]
        """else:
            dist = []
            for c in candidates:
                if relativeto == "doc":
                    d = c.dstart
                else:
                    d = c.start
                dist.append(abs(d-start))
            return [candidates[dist.index(min(dist))]]"""

    def find_tokens_between(self, start, end, relativeto="doc"):
        """Return list of tokens between offsets. Use relativeto to consider doc indexes or
           sentence indexes."""
        foundtokens = []
        for t in self.tokens:
            if relativeto.startswith("doc") and t.dstart >= start and t.dend <= end:
                foundtokens.append(t)
            elif relativeto.startswith("sent") and t.start >= start and t.end <= end:
                foundtokens.append(t)
        return foundtokens
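
    # Minimal usage sketch (illustrative values only): assuming `sentence` is a
    # processed Sentence whose tokens cover document offsets 100-180 and an
    # entity mention spans offsets 120-128, its tokens could be fetched with:
    #     tokens = sentence.find_tokens_between(120, 128, relativeto="doc")
    #     mention = " ".join(t.text for t in tokens)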

    def test_relations(self, pairs, basemodel, classifiers=[relations.SLK_PRED, relations.SST_PRED],
                       tag="", backup=False, printstd=False):
        #data =  ddi_train_slk.model, ddi_train_sst.model
        tempfiles = []

        if relations.SLK_PRED in classifiers:
            logging.info("**Testing SLK classifier %s ..." % (tag,))
            #testpairdic = ddi_kernels.fromddiDic(testdocs)
            ddi_kernels.generatejSREdata(pairs, self, basemodel, tag + "ddi_test_jsre.txt")
            ddi_kernels.testjSRE(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt",
                                 model=tag + "all_ddi_train_slk.model")
            self.pairs.pairs = ddi_kernels.getjSREPredicitons(tag + "ddi_test_jsre.txt", tag + "ddi_test_result.txt",
                                                      self.pairs.pairs)
            tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_jsre.txt")
            tempfiles.append(ddi_kernels.basedir + tag + "ddi_test_result.txt")

        if relations.SST_PRED in classifiers:
            logging.info("****Testing SST classifier %s ..." % (tag,))
            self.pairs.pairs = ddi_kernels.testSVMTK(self, self.pairs.pairs, pairs,
                                             model=tag + "all_ddi_train_sst.model", tag=tag)
        for p in self.pairs.pairs:
            for r in self.pairs.pairs[p].recognized_by:
                if self.pairs.pairs[p].recognized_by[r] == 1:
                    p.relation = True
        return tempfiles
Example n. 5
class Document(object):
    """A document is constituted by one or more sentences. It should have an ID and
    title. s0, the first sentence, is always the title sentence."""
    def __init__(self,
                 text,
                 process=False,
                 doctype="biomedical",
                 ssplit=False,
                 **kwargs):
        self.text = text
        self.title = kwargs.get("title")
        self.sentences = kwargs.get("sentences", [])
        self.did = kwargs.get("did", "d0")
        self.invalid_sids = []
        self.title_sids = []
        self.source = kwargs.get("source")
        self.pairs = Pairs()
        if ssplit:
            self.sentence_tokenize(doctype)
        if process:
            self.process_document(doctype)

    def sentence_tokenize(self, doctype):
        """
        Split the document text into sentences, add to self.sentences list
        :param doctype: Can be used in the future to choose different methods
        """
        # first sentence should be the title if it exists
        #if self.title:
        #    sid = self.did + ".s0"
        #    self.sentences.append(Sentence(self.title, sid=sid, did=self.did))
        # inputtext = clean_whitespace(self.text)
        inputtext = self.text
        with io.open("/tmp/geniainput.txt", 'w',
                     encoding='utf-8') as geniainput:
            geniainput.write(inputtext)
        current_dir = os.getcwd()
        os.chdir(geniass_path)
        geniaargs = [
            "./geniass", "/tmp/geniainput.txt", "/tmp/geniaoutput.txt"
        ]
        Popen(geniaargs, stdout=PIPE, stderr=PIPE).communicate()
        os.chdir(current_dir)
        offset = 0
        with io.open("/tmp/geniaoutput.txt", 'r',
                     encoding="utf-8") as geniaoutput:
            for l in geniaoutput:
                stext = l.strip()
                if stext == "":
                    offset = self.get_space_between_sentences(offset)
                    continue
                sid = self.did + ".s" + str(len(self.sentences))
                self.sentences.append(
                    Sentence(stext, offset=offset, sid=sid, did=self.did))
                offset += len(stext)
                offset = self.get_space_between_sentences(offset)

    def process_document(self, corenlpserver, doctype="biomedical"):
        """
        Process each sentence of the text with Stanford CoreNLP, splitting the text into sentences first if needed.
        :param corenlpserver:
        :param doctype:
        :return:
        """
        if len(self.sentences) == 0:
            # use specific sentence splitter
            self.sentence_tokenize(doctype)
        for s in self.sentences:
            #corenlpres = corenlpserver.raw_parse(s.text)
            corenlpres = corenlpserver.annotate(
                s.text.encode("utf8"),
                properties={
                    'ssplit.eolonly': True,
                    #'annotators': 'tokenize,ssplit,pos,ner,lemma',
                    'annotators':
                    'tokenize,ssplit,pos,parse,ner,lemma,depparse',
                    'outputFormat': 'json',
                })
            if isinstance(corenlpres, basestring):
                print corenlpres
                corenlpres = corenlpserver.annotate(
                    s.text.encode("utf8"),
                    properties={
                        'ssplit.eolonly': True,
                        # 'annotators': 'tokenize,ssplit,pos,depparse,parse',
                        'annotators': 'tokenize,ssplit,pos,ner,lemma',
                        'outputFormat': 'json',
                    })
            if isinstance(corenlpres, basestring):
                print "could not process this sentence:", s.text.encode("utf8")
                print corenlpres
                continue
            else:
                s.process_corenlp_output(corenlpres)

    def tag_chemdner_entity(self,
                            start,
                            end,
                            subtype,
                            source="goldstandard",
                            **kwargs):
        """
        Create a CHEMDNER entity relative to this document.
        :param start: Start index of the entity
        :param end: End index of the entity
        :param subtype: Subtype of the CHEMDNER entity
        :param kwargs: Extra arguments such as the entity text
        :return:
        """
        doct = kwargs.get("doct")
        title_offset = 0
        if doct == "A":
            title_offset = len(self.title) + 1  # account for extra .
        start, end = start + title_offset, end + title_offset
        sentence = self.find_sentence_containing(start, end, chemdner=False)
        if sentence:
            eid = sentence.tag_entity(start - sentence.offset,
                                      end - sentence.offset,
                                      "chemical",
                                      source=source,
                                      text=kwargs.get("text"),
                                      subtype=subtype,
                                      score=kwargs.get("score"))
            if eid:
                entity = sentence.entities.get_entity(eid, source)
                return entity
        else:
            print "sentence not found between:", start, end
            print "ignored ", kwargs.get("text")
            # print len(self.documents[pmid].title), self.documents[pmid].title
            # for s in self.documents[pmid].sentences:
            #    print s.sid, s.tokens[0].dstart, s.tokens[-1].dend, s.text

    def add_relation(self,
                     entity1,
                     entity2,
                     subtype,
                     relation,
                     source="goldstandard",
                     **kwargs):
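        """
        Create a pair between two entities of this document (a TLink when
        subtype == "tlink", a generic Pair otherwise), keep the text between
        them, add it to self.pairs under the given source and return it.
        """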
        if self.pairs.pairs:
            pid = self.did + ".p" + str(len(self.pairs.pairs))
        else:
            pid = self.did + ".p0"
        between_text = self.text[entity1.dend:entity2.start]
        logging.debug("adding {}:{}=>{}".format(pid,
                                                entity1.text.encode("utf8"),
                                                entity2.text.encode("utf8")))
        # print between_text
        if subtype == "tlink":
            pair = TLink(entity1,
                         entity2,
                         relation=relation,
                         original_id=kwargs.get("original_id"),
                         did=self.did,
                         pid=pid,
                         rtype=subtype,
                         between_text=between_text)
        else:
            pair = Pair((entity1, entity2),
                        subtype,
                        did=self.did,
                        pid=pid,
                        original_id=kwargs.get("original_id"),
                        between_text=between_text)
        self.pairs.add_pair(pair, source)
        return pair

    def get_space_between_sentences(self, totalchars):
        """
        Sentence splitting does not preserve the whitespace between sentences, so recover it here.
        :param totalchars: offset of the end of sentence
        :return: Index where the next sentence starts
        """
        while totalchars < len(self.text) and self.text[totalchars].isspace():
            totalchars += 1
        return totalchars

    def get_unique_results(self, source, ths, rules, mode):
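        """
        Gather the unique entities (mode == "ner") or relations (mode == "re")
        of every sentence for the given source, applying the score thresholds
        and rules, and return them as a single dict for this document.
        """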
        doc_entities = {}
        for s in self.sentences:
            sentence_entities = {}
            if s.entities:
                if mode == "ner":
                    sentence_entities = s.entities.get_unique_entities(
                        source, ths, rules)
                    # attach the mention text to each entity entry
                    for e in sentence_entities:
                        sentence_entities[e].append(
                            s.text[int(sentence_entities[e][1]):int(sentence_entities[e][2])])
                elif mode == "re":
                    sentence_entities = s.entities.get_unique_relations(source)
            doc_entities.update(sentence_entities)
        logging.info("{} has {} unique entities".format(
            self.did, len(doc_entities)))
        return doc_entities

    def write_chemdner_results(self,
                               source,
                               outfile,
                               ths={"chebi": 0.0},
                               rules=[]):
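        """
        Write the entities of every sentence to outfile in CHEMDNER format,
        numbering entities consecutively across the document, and return the
        generated lines.
        """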
        lines = []
        totalentities = 0
        for s in self.sentences:
            # print "processing", s.sid, "with", len(s.entities.elist[source]), "entities"
            if s.entities:
                res = s.entities.write_chemdner_results(
                    source, outfile, len(self.sentences[0].text), ths, rules,
                    totalentities + 1)
                lines += res[0]
                totalentities = res[1]
        return lines

    def write_bioc_results(self, parent, source, ths={}):
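        """
        Add a BioC <document> element to parent with a title passage (the
        first sentence) and an abstract passage (the remaining sentences),
        including the entities of the given source.
        """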
        bioc_document = ET.SubElement(parent, "document")
        bioc_id = ET.SubElement(bioc_document, "id")
        bioc_id.text = self.did

        bioc_title_passage = ET.SubElement(bioc_document, "passage")
        bioc_title_info = ET.SubElement(bioc_title_passage, "infon",
                                        {"key": "type"})
        bioc_title_info.text = "title"
        bioc_title_offset = ET.SubElement(bioc_title_passage, "offset")
        bioc_title_offset.text = str(0)
        bioc_title = self.sentences[0].write_bioc_results(
            bioc_title_passage, source)

        bioc_abstract_passage = ET.SubElement(bioc_document, "passage")
        bioc_abstract_info = ET.SubElement(bioc_abstract_passage, "infon",
                                           {"key": "type"})
        bioc_abstract_info.text = "abstract"
        bioc_abstract_offset = ET.SubElement(bioc_abstract_passage, "offset")
        bioc_abstract_offset.text = str(len(self.sentences[0].text) + 1)
        for i, sentence in enumerate(self.sentences[1:]):
            bioc_sentence = sentence.write_bioc_results(
                bioc_abstract_passage, source)
        return bioc_document

    def get_dic(self, source, ths={}):
        dic = {"title": {}, "abstract": {}}
        dic = {"abstract": {}}
        # dic["title"]["offset"] = "0"
        # dic["title"]["sentences"] = self.sentences[0].get_dic(source)

        dic["abstract"]["offset"] = str(len(self.sentences[0].text) + 1)
        dic["abstract"]["sentences"] = []
        for i, sentence in enumerate(self.sentences[1:]):
            dic["abstract"]["sentences"].append(sentence.get_dic(source))
        return dic

    def get_sentence(self, sid):
        """
        Get the sentence by sentence ID
        :param sid: sentence ID
        :return: the sentence object if it exists
        """
        for s in self.sentences:
            # logging.debug([(t.start, t.end) for t in s.tokens])
            if s.sid == sid:
                # logging.debug("found sid: {}".format(sid))
                return s
        return None

    def find_sentence_containing(self, start, end, chemdner=True):
        """
            Find the sentence between start and end. If chemdner, do not consider the first sentence, which
            is the title.
        """
        if chemdner:
            firstsent = 1
        else:
            firstsent = 0
        for i, s in enumerate(self.sentences[firstsent:]):
            if len(s.tokens) == 0:
                #logging.debug("sentence without tokens: {} {}".format(s.sid, s.text.encoding("utf-8")))
                continue
            if s.tokens[0].dstart <= start and s.tokens[-1].dend >= end:
                # print "found it!"
                return s
        logging.debug("sentence not found: {}-{}".format(start, end))
        for s in self.sentences:
            if len(s.tokens) > 0:
                logging.debug("{} {} {} {} {}".format(
                    s.tokens[0].dstart <= start, s.tokens[-1].dend >= end,
                    s.tokens[0].dstart, s.tokens[-1].dend,
                    s.text.encode("utf-8")))
        return None

    def get_entity_offsets(self, esource, ths, rules):
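        """
        Collect the offsets of every entity of esource in this document,
        applying the score thresholds and rules of each sentence.
        """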
        offsets = []
        for s in self.sentences:
            if s.entities:
                offsets += s.entities.get_entity_offsets(
                    esource, ths, rules, s.tokens)
        return offsets

    def get_entity(self, eid, source="goldstandard"):
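        """
        Return the entity with the given eid for this source, or None if no
        sentence contains it.
        """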
        for sentence in self.sentences:
            for e in sentence.entities.elist.get(source, []):
                if e.eid == eid:
                    return e
        print "no entity found for eid {}".format(eid)
        return None

    def get_entities(self, source):
        entities = []
        for s in self.sentences:
            if source in s.entities.elist:
                for e in s.entities.elist[source]:
                    entities.append(e)
        return entities

    def get_abbreviations(self):
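        """
        Heuristically detect abbreviations defined in parentheses: the noun and
        adjective tokens right before an opening "-LRB-" are paired with the
        tokens up to the matching "-RRB-", and the shorter of the two strings
        is stored as the abbreviation in self.abbreviations.
        """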
        self.abbreviations = {}
        first_elem = []
        second_elem = []
        open_paren = False
        for sentence in self.sentences:
            # print sentence.text
            for i, t in enumerate(sentence.tokens):
                if t.text == "-LRB-":
                    open_paren = True
                    last_token = sentence.tokens[i - 1]
                    # collect the nouns/adjectives immediately before the parenthesis
                    while last_token.pos.startswith("NN") or last_token.pos.startswith("JJ"):
                        first_elem.insert(0, last_token)
                        if last_token.order == 0:
                            break
                        else:
                            # check the token before this one
                            last_token = sentence.tokens[last_token.order - 1]
                    if len(first_elem) > 0:
                        logging.info("starting abbreviation for this text: " +
                                     str([tt.text for tt in first_elem]))
                    else:
                        open_paren = False
                elif t.text == "-RRB-" and open_paren == True:
                    first_text = sentence.text[first_elem[0].
                                               start:first_elem[-1].end]
                    second_text = sentence.text[second_elem[0].
                                                start:second_elem[-1].end]
                    if len(first_text) > len(
                            second_text):  #abbreviation is the smallest word
                        second_text, first_text = first_text, second_text
                    # rules
                    if not first_text.islower() and len(first_text) > 1:
                        self.abbreviations[first_text] = second_text
                    open_paren = False
                    first_elem = []
                    second_elem = []
                elif open_paren:
                    second_elem.append(t)
        for abv in self.abbreviations:
            if not any([c.isalpha() for c in abv]):
                print abv, ":", self.abbreviations[abv]
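
# Minimal usage sketch (hedged): `corenlp_client` stands for any CoreNLP
# wrapper exposing the annotate() call used above, and ssplit=True assumes the
# GENIA sentence splitter is configured via geniass_path; the text and ids
# below are purely illustrative.
#     doc = Document(u"Title.\nAspirin (ASA) reduces fever.", did="d1", ssplit=True)
#     doc.process_document(corenlp_client)
#     doc.get_abbreviations()
#     print doc.abbreviations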