Example 1
    def get_predictions(self, corpus):
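        # self.predicted holds one CRF label sequence per candidate; every token
        # tagged B-TARGET links the source entity to its candidate target entities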
        results = ResultsRE(self.modelname)
        temppreds = {}
        for i in range(len(self.entities)):
            # did = ".".join(self.pairs[i][0].sid.split(".")[:-1])
            # pid = did + ".p" + str(i)
            # if "B-TARGET" in self.predicted[i]:
            #     print self.predicted[i]
            # print self.scores
            did = self.entities[i][0].did
            if did not in results.document_pairs:
                results.document_pairs[did] = Pairs()
            for it, label in enumerate(self.predicted[i]):
                if label.endswith("B-TARGET"):
                    # print self.entities[i][0].text, [(e.text, e.type) for e in self.entities[i][1][it]]
                    for target in self.entities[i][1][it]:
                        pid = did + ".p" + str(i)
                        # if self.pred[i]:
                        #     did = '.'.join(pid.split(".")[:-1])
                        if did not in results.document_pairs:
                            results.document_pairs[did] = Pairs()
                        pair = corpus.documents[did].add_relation(self.entities[i][0], target, self.pairtype, relation=True)
                        results.document_pairs[did].add_pair(pair, "crf")
                        #pair = self.get_pair(pid, corpus)
                        #results.pairs[pid] = pair

                        # logging.debug("{} - {} SLK: {}".format(pair.entities[0], pair.entities[1], p))
                        #if pair not in temppreds:
                        #    temppreds[pair] = []
                        #temppreds[pair].append(p)
                        results.pairs[pid] = pair
                        results.pairs[pid].recognized_by["crf"] = 1
        results.corpus = corpus
        return results
Example 2
    def get_predictions(self, corpus):
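        # self.pred holds one binary decision per candidate pair; each positive
        # prediction is added to the corpus as a relation tagged "scikit"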
        #real_pair_type = config.event_types[self.pairtype]["subtypes"][0]
        results = ResultsRE(self.modelname)
        temppreds = {}
        for i in range(len(self.pred)):
            did = ".".join(self.pairs[i][0].sid.split(".")[:-1])
            pid = did + ".p" + str(i)
            if self.pred[i]:
                did = '.'.join(pid.split(".")[:-1])
                if did not in results.document_pairs:
                    results.document_pairs[did] = Pairs()
                #pair = corpus.documents[did].add_relation(self.pairs[i][0], self.pairs[i][1], real_pair_type, relation=True)
                pair = corpus.documents[did].add_relation(self.pairs[i][0],
                                                          self.pairs[i][1],
                                                          self.pairtype,
                                                          relation=True)
                results.document_pairs[did].add_pair(pair, "scikit")
                #pair = self.get_pair(pid, corpus)
                results.pairs[pid] = pair

                # logging.debug("{} - {} SLK: {}".format(pair.entities[0], pair.entities[1], p))
                #if pair not in temppreds:
                #    temppreds[pair] = []
                #temppreds[pair].append(p)
                results.pairs[pid].recognized_by["scikit"] = 1
        results.corpus = corpus
        return results
Example 3
    def get_predictions(self, corpus):
        # real_pair_type = config.event_types[self.pairtype]["subtypes"][0]
        #pred_y = []
        with open(self.resultsfile, 'r') as resfile:
            pred = resfile.readlines()

        with codecs.open(self.examplesfile, 'r', 'utf-8') as trainfile:
            original = trainfile.readlines()

        if len(pred) != len(original):
            print "different number of predictions!"
            sys.exit()
        results = ResultsRE(self.resultsfile)
        temppreds = {}
        for i in range(len(pred)):
            original_tsv = original[i].split('\t')
            # logging.debug(original_tsv)
            pid = '.'.join(original_tsv[1].split('.')[:-1])

            p = float(pred[i].strip())
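            # label mapping: 0 becomes -1 (negative), 2 is folded into 1, and
            # only positive predictions (p == 1) are turned into "jsre" relations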
            if p == 0:
                p = -1
            if p == 2:
                print "p=2!"
                p = 1
            if p == 1:
                did = '.'.join(pid.split(".")[:-1])
                if did not in results.document_pairs:
                    results.document_pairs[did] = Pairs()
                pair = corpus.documents[did].add_relation(self.pairs[pid][0],
                                                          self.pairs[pid][1],
                                                          self.pairtype,
                                                          relation=True)
                # pair = corpus.documents[did].add_relation(self.pairs[pid][0], self.pairs[pid][1], real_pair_type, relation=True)
                #pair = self.get_pair(pid, corpus)
                results.pairs[pid] = pair
                results.document_pairs[did].add_pair(pair, "jsre")
                # logging.debug("{} - {} SLK: {}".format(pair.entities[0], pair.entities[1], p))
                #if pair not in temppreds:
                #    temppreds[pair] = []
                #temppreds[pair].append(p)
                results.pairs[pid].recognized_by["jsre"] = p
        '''for pair in temppreds:
            if relations.SLK_PRED not in pairs[pair]:
                pairs[pair][relations.SLK_PRED] = {}
            p = mode(temppreds[pair])[0][0]
            if len(set(temppreds[pair])) > 1:
                print temppreds[pair], p
            pairs[pair][relations.SLK_PRED][dditype] = p
            #if pairs[pair][ddi.SLK_PRED][dditype] and not pairs[pair][ddi.SLK_PRED]["all"]:
            #    logging.info("type classifier %s found a new true pair: %s", dditype, pair)

        for pair in pairs:
            if relations.SLK_PRED not in pairs[pair]:
                pairs[pair][relations.SLK_PRED] = {}
            if dditype not in pairs[pair][relations.SLK_PRED]:
                 pairs[pair][relations.SLK_PRED][dditype] = -1'''
        results.corpus = corpus
        return results
Example 4
 def __init__(self, text, offset=0, **kwargs):
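     # keep the raw sentence text, its sentence/document ids, empty entity and
     # pair containers, and a simple regex used to split the text into tokens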
     self.text = text
     self.sid = kwargs.get("sid")
     self.did = kwargs.get("did")
     self.entities = Entities(sid=self.sid, did=self.did)
     self.offset = offset
     self.pairs = Pairs()
     self.parsetree = None
     self.depparse = None
     self.tokens = []
     self.regex_tokens = re.compile(r'(-|/|\\|\+|\.|\w+)')
Example 5
 def __init__(self,
              text,
              process=False,
              doctype="biomedical",
              ssplit=False,
              **kwargs):
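     # keep the document text and metadata; optionally sentence-split and
     # fully process the document for the given doctype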
     self.text = text
     self.title = kwargs.get("title")
     self.sentences = kwargs.get("sentences", [])
     self.did = kwargs.get("did", "d0")
     self.invalid_sids = []
     self.title_sids = []
     self.pairs = Pairs()
     if ssplit:
         self.sentence_tokenize(doctype)
     if process:
         self.process_document(doctype)
Example 6
 def get_predictions(self, corpus):
     results = ResultsRE(self.resultsfile)
     for i, pred in enumerate(self.predicted):
         if pred >= 0:
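             # logistic transform of the raw prediction (only non-negative
             # predictions are kept); every pair in the bag gets this confidence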
             score = 1.0 / (1.0 + math.exp(-pred))
             bag = self.bag_pairs[i]
             pairs = self.pairs[bag]
             for pair in pairs:
                 #did = bag[0]
                 did = pair[0].did
                 if did not in results.document_pairs:
                     results.document_pairs[did] = Pairs()
                 new_pair = corpus.documents[did].add_relation(
                     pair[0], pair[1], self.pairtype, relation=True)
                 results.document_pairs[did].add_pair(new_pair, "mil")
                 pid = did + ".p" + str(len(results.pairs))
                 results.pairs[pid] = new_pair
                 results.pairs[pid].recognized_by["mil"] = score
     results.corpus = corpus
     return results
Example 7
 def get_predictions(self, corpus):
     results = ResultsRE("")
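     # every pair produced by the rule-based matcher is accepted: register it on
     # the corpus document, tag it as "mirtex_rules" and log the linked entities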
     # print len(self.pids)
     for p, pid in enumerate(self.pids):
         did = self.pids[pid][0].did
         if did not in results.document_pairs:
             results.document_pairs[did] = Pairs()
         pair = corpus.documents[did].add_relation(self.pids[pid][0],
                                                   self.pids[pid][1],
                                                   self.ptype,
                                                   relation=True)
         # print pair, pair[0], pair[1]
         #pair = self.get_pair(pid, corpus)
         results.document_pairs[did].add_pair(pair, "mirtex_rules")
         results.pairs[pid] = pair
         pair.recognized_by["mirtex_rules"] = 1
         logging.info("{0.eid}:{0.text} => {1.eid}:{1.text}".format(
             pair.entities[0], pair.entities[1]))
     #logging.info("{} - {} SST: {}".format(pair.entities[0], pair.entities[0], score))
     results.corpus = corpus
     return results
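Apart from how each classifier's output is read, the get_predictions variants in Examples 1, 2, 3, 6 and 7 share the same skeleton: create a ResultsRE container, group accepted pairs per document id in a Pairs object, register each relation on the corpus document, and record which method recognized it. The sketch below condenses that skeleton using only calls that appear in these excerpts; the iteration over (e1, e2) pairs, the e1.did attribute and the "method_name" tag are placeholders for whatever the concrete classifier provides.

    def get_predictions(self, corpus):
        # condensed version of the pattern shared by the examples above
        results = ResultsRE(self.modelname)
        for i, (e1, e2) in enumerate(self.pairs):
            if not self.predicted[i]:      # keep positive predictions only
                continue
            did = e1.did
            if did not in results.document_pairs:
                results.document_pairs[did] = Pairs()
            pair = corpus.documents[did].add_relation(e1, e2, self.pairtype,
                                                      relation=True)
            results.document_pairs[did].add_pair(pair, "method_name")
            pid = did + ".p" + str(i)
            results.pairs[pid] = pair
            results.pairs[pid].recognized_by["method_name"] = 1
        results.corpus = corpus
        return results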
Example 8
    def load_relations(self, annotations_tag, did, allwords):
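        # collect per-document statistics (relation counts, entity counts,
        # character distances, inter-entity word frequencies) for the relations
        # annotated under this XML element, split into pathology ("path") and
        # clinical ("clinic") documents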
        stats = {
            "path_count": 0,
            "clinic_count": 0,
            "path_doc_chars": 0,
            "clinic_doc_chars": 0,
            "path_nentities": 0,
            "clinic_nentities": 0,
            "path_nrelations": 0,
            "clinic_nrelations": 0,
            "path_relation_dist": 0,
            "clinic_relation_dist": 0,
            "path_event_time": 0,
            "path_time_event": 0,
            "path_time_time": 0,
            "path_event_event": 0,
            "clinic_event_time": 0,
            "clinic_time_event": 0,
            "clinic_time_time": 0,
            "clinic_event_event": 0,
            "path_nevent_source": 0,
            "path_ntime_source": 0,
            "clinic_nevent_source": 0,
            "clinic_ntime_source": 0,
            "path_nevent_target": 0,
            "path_ntime_target": 0,
            "clinic_nevent_target": 0,
            "clinic_ntime_target": 0,
            "clinic_multisentence": 0,
            "path_multisentence": 0
        }

        wordsdic = {
            "path_event_time": {},
            "path_time_event": {},
            "path_time_time": {},
            "path_event_event": {},
            "clinic_event_time": {},
            "clinic_time_event": {},
            "clinic_time_time": {},
            "clinic_event_event": {}
        }
        if "path" in did:
            doc_type = "path_"
        else:
            doc_type = "clinic_"
        stats[doc_type + "count"] += 1
        stats[doc_type + "doc_chars"] += len(self.documents[did].text)
        source_relation = {
        }  # (source original id, target original id, relation original id)
        entity_list = {}  # all entities of this document original_id => entity
        for relation in annotations_tag.findall("relation"):
            stats[doc_type + "nrelations"] += 1
            props = relation.find("properties")
            source_id = props.find("Source").text
            target_id = props.find("Target").text
            relation_type = relation.find("type").text
            relation_id = relation.find("id").text
            if source_id not in source_relation:
                source_relation[source_id] = []
            source_relation[source_id].append(target_id)
        self.documents[did].pairs = Pairs()
        for sentence in self.documents[did].sentences:
            if "goldstandard" in sentence.entities.elist:
                for entity in sentence.entities.elist["goldstandard"]:
                    entity_list[entity.original_id] = entity
                    stats[doc_type + "nentities"] += 1
        for eid in entity_list:
            entity = entity_list[eid]
            entity.targets = []
            if entity.original_id in source_relation:
                for target in source_relation[entity.original_id]:
                    if target not in entity_list:
                        print "target not in entity list:", target
                    else:
                        pairwordsdic = {}
                        entity.targets.append(entity_list[target].eid)
                        e2 = get_entity(self.documents[did],
                                        entity_list[target].eid)
                        # print "{}:{}=>{}:{}".format(entity.type, entity.text, e2.type, e2.text)
                        # print "||{}||".format(self.documents[did].text[entity.dstart:e2.dend])

                        stats[doc_type + "relation_dist"] += len(
                            self.documents[did].text[entity.dend:e2.dstart])
                        stats[doc_type + "n{}_source".format(entity.type)] += 1
                        stats[doc_type + "n{}_target".format(e2.type)] += 1
                        stats[doc_type +
                              "{}_{}".format(entity.type, e2.type)] += 1

                        words = re.split(
                            r"\W",
                            self.documents[did].text[entity.dend:e2.dstart].lower())
                        #stems = set()
                        stems = []
                        for w in words:
                            if w.strip() == "":
                                continue
                            #if w.isdigit():
                            #    stem = "#digit#"
                            #else:
                            #stem = self.stemmer.stem(w)
                            #    stem = w
                            #stems.add(stem)
                            stems.append(w)
                        for stem in stems:
                            if stem not in pairwordsdic:
                                pairwordsdic[stem] = 0
                            pairwordsdic[stem] += 1

                        if e2.sid != entity.sid:
                            stats[doc_type + "multisentence"] += 1
                        pair_key = doc_type + "{}_{}".format(entity.type, e2.type)
                        for stem in pairwordsdic:
                            if stem not in wordsdic[pair_key]:
                                wordsdic[pair_key][stem] = 0
                            wordsdic[pair_key][stem] += pairwordsdic[stem] * 1.0 / allwords[stem]
                """        # logging.debug("multi-sentence:{}+{}".format(sentence1.text, sentence2.text))
                        chardist = e2.dstart - e1.dend
                        if chardist > maxdist[0] and e1.type != "time" and not e1.text.isupper():
                            print e1.type
                            maxdist = (chardist, "{}=>{}".format(e1, e2))
                        # logging.debug("dist between entities: {}".format(chardist))"""
                # logging.debug("|{}|=>|{}|".format(e1.text, e2.text))
                #self.documents[did].add_relation(e1, e2, "tlink", relation=True)
                """    npairs += 1
                elif '\n' not in self.documents[did].text[e1.dstart:e2.dend] or e1.text.isupper() or e1.type == "time":
                    self.documents[did].add_relation(e1, e2, "tlink", relation=False)
                    npairs += 1
                if (e2.original_id, e1.original_id) in relation_list:
                    inverted += 1"""
                """    if e1.sid != e2.sid:
                        sentence1 = self.documents[did].get_sentence(e1.sid)
                        sentence2 = self.documents[did].get_sentence(e2.sid)
                        # logging.debug("multi-sentence:{}+{}".format(sentence1.text, sentence2.text))
                        chardist = e2.dstart - e1.dend
                        if chardist > maxdist[0] and e2.type != "timex3" and not e2.text.isupper():
                            #print e2.type
                            maxdist = (chardist, "{}<={}".format(e1, e2))
                        # logging.debug("dist between entities: {}".format(chardist))

                    # logging.debug("|{}|<=|{}|".format(e1.text, e2.text))
                    self.documents[did].add_relation(e2, e1, "tlink", relation=True, original_id=relation_id)
                else:
                    self.documents[did].add_relation(e2, e1, "tlink", relation=False, original_id=relation_id)"""
        return stats, wordsdic
Example 9
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions",
                        default="classify",
                        help="Actions to be performed.")
    parser.add_argument(
        "--goldstd",
        default="",
        dest="goldstd",
        nargs="+",
        help="Gold standard to be used. Will override corpus, annotations",
        choices=config.paths.keys())
    parser.add_argument("--submodels",
                        default="",
                        nargs='+',
                        help="sub types of classifiers"),
    parser.add_argument("--models",
                        dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--pairtype",
                        dest="ptype",
                        help="type of pairs to be considered",
                        default="all")
    parser.add_argument("--doctype",
                        dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument(
        "-o",
        "--output",
        "--format",
        dest="output",
        nargs=2,
        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel",
                        action="store",
                        dest="kernel",
                        default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()
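    # the positional "actions" argument selects one of the branches below:
    # load_corpus, annotate, add_sentences, train_relations, test_relations,
    # train_sentences or test_sentences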

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions,
                                                       options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = config.paths[options.goldstd]["format"]
        corpus_path = config.paths[options.goldstd]["text"]
        corpus_ann = config.paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        # corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(config.paths[options.goldstd]["corpus"])
        if corpus_ann:  #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(config.paths[options.goldstd]["corpus"])

    elif options.actions == "annotate":  # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = config.paths[options.goldstd]["corpus"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        # corpus.clear_annotations("all")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(config.paths[options.goldstd]["corpus"])
    else:
        #corpus = SeeDevCorpus("corpus/" + "&".join(options.goldstd))
        corpus_path = config.paths[options.goldstd[0]]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))

        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)

        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = config.pair_types.keys()
                # ptypes = config.event_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=True)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "multir":
                    model = MultiR(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p)
                model.train()
        # testing

        elif options.actions == "test_relations":
            if options.ptype == "all":
                ptypes = config.pair_types.keys()
                # ptypes = config.event_types.keys()
                all_results = ResultsRE(options.output[1])
                all_results.corpus = corpus
                all_results.path = options.output[1]
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=False)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "rules":
                    model = RuleClassifier(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p, test=True)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                # results.save(options.output[1] + "_" + p.lower() + ".pickle")
                # results.load_corpus(options.goldstd[0])
                results.path = options.output[1] + "_" + p.lower()
                goldset = get_gold_ann_set(
                    config.paths[options.goldstd[0]]["format"],
                    config.paths[options.goldstd[0]]["annotations"], "all", p,
                    config.paths[options.goldstd[0]]["text"])
                get_relations_results(results, options.models, goldset[1], [],
                                      [])
                if options.ptype == "all":
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[
                            did].pairs += results.document_pairs[did].pairs
            if options.ptype == "all":
                goldset = get_gold_ann_set(
                    config.paths[options.goldstd[0]]["format"],
                    config.paths[options.goldstd[0]]["annotations"], "all",
                    "all", config.paths[options.goldstd[0]]["text"])
                get_relations_results(all_results, options.models, goldset[1],
                                      [], [])
                write_seedev_results(all_results, options.output[1])
        elif options.actions == "train_sentences":  #and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in config.pair_types:
                    print p
                    tps, fps, fns = corpus.train_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (fps + tps)
                        recall = 1.0 * tps / (fns + tps)
                        fscore = 2.0 * precision * recall / (recall +
                                                             precision)
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                #print [a/len(config.pair_types) for a in avg]
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
            else:
                res = corpus.train_sentence_classifier(options.ptype)
                print res
            corpus.save(config.paths[options.goldstd[0]]["corpus"])
        elif options.actions == "test_sentences":  #and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in config.pair_types:
                    print p
                    tps, fps, fns = corpus.test_sentence_classifier(p)
                if tps == 0 and fns == 0:
                    precision, recall, fscore = 0, 1, 1
                else:
                    precision = 1.0 * tps / (fps + tps)
                    recall = 1.0 * fns / (fns + tps)
                    fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
                avg[0] += tps
                avg[1] += fps
                avg[2] += fns
            #print [a/len(config.pair_types) for a in avg]
            precision = 1.0 * avg[1] / (avg[0] + avg[1])
            recall = 1.0 * avg[2] / (avg[0] + avg[2])
            fscore = 2.0 * precision * recall / (recall + precision)
            print precision, recall, fscore
        else:
            res = corpus.test_sentence_classifier(options.ptype)
            print res
        corpus.save(config.paths[options.goldstd[0]]["corpus"])

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)