Example #1
# Standard-library imports used in this snippet; the remaining names (config,
# StanfordCoreNLP, SeeDevCorpus, the relation-extraction kernels, ResultsRE,
# Pairs, get_gold_ann_set, get_relations_results, write_seedev_results, ...)
# are assumed to come from the surrounding project.
import argparse
import logging
import pickle
import sys
import time


def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions",
                        default="classify",
                        help="Actions to be performed.")
    parser.add_argument(
        "--goldstd",
        default="",
        dest="goldstd",
        nargs="+",
        help="Gold standard to be used. Will override corpus, annotations",
        choices=config.paths.keys())
    parser.add_argument("--submodels",
                        default="",
                        nargs='+',
                        help="sub types of classifiers"),
    parser.add_argument("--models",
                        dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--pairtype",
                        dest="ptype",
                        help="type of pairs to be considered",
                        default="all")
    parser.add_argument("--doctype",
                        dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument(
        "-o",
        "--output",
        "--format",
        dest="output",
        nargs=2,
        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel",
                        action="store",
                        dest="kernel",
                        default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions,
                                                       options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = config.paths[options.goldstd]["format"]
        corpus_path = config.paths[options.goldstd]["text"]
        corpus_ann = config.paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        # corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(config.paths[options.goldstd]["corpus"])
        if corpus_ann:  #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(config.paths[options.goldstd]["corpus"])

    elif options.actions == "annotate":  # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = config.paths[options.goldstd]["corpus"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        # corpus.clear_annotations("all")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(config.paths[options.goldstd]["corpus"])
    else:
        #corpus = SeeDevCorpus("corpus/" + "&".join(options.goldstd))
        corpus_path = config.paths[options.goldstd[0]]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))

        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)

        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = config.pair_types.keys()
                # ptypes = config.event_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=True)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "multir":
                    model = MultiR(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p)
                model.train()
        # testing

        elif options.actions == "test_relations":
            if options.ptype == "all":
                ptypes = config.pair_types.keys()
                # ptypes = config.event_types.keys()
                all_results = ResultsRE(options.output[1])
                all_results.corpus = corpus
                all_results.path = options.output[1]
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=False)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "rules":
                    model = RuleClassifier(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p, test=True)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                # results.save(options.output[1] + "_" + p.lower() + ".pickle")
                # results.load_corpus(options.goldstd[0])
                results.path = options.output[1] + "_" + p.lower()
                goldset = get_gold_ann_set(
                    config.paths[options.goldstd[0]]["format"],
                    config.paths[options.goldstd[0]]["annotations"], "all", p,
                    config.paths[options.goldstd[0]]["text"])
                get_relations_results(results, options.models, goldset[1], [],
                                      [])
                if options.ptype == "all":
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[
                            did].pairs += results.document_pairs[did].pairs
            if options.ptype == "all":
                goldset = get_gold_ann_set(
                    config.paths[options.goldstd[0]]["format"],
                    config.paths[options.goldstd[0]]["annotations"], "all",
                    "all", config.paths[options.goldstd[0]]["text"])
                get_relations_results(all_results, options.models, goldset[1],
                                      [], [])
                write_seedev_results(all_results, options.output[1])
        elif options.actions == "train_sentences":  #and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in config.pair_types:
                    print p
                    tps, fps, fns = corpus.train_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (fps + tps)
                        recall = 1.0 * tps / (tps + fns)
                        fscore = 2.0 * precision * recall / (recall +
                                                             precision)
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                #print [a/len(config.pair_types) for a in avg]
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
            else:
                res = corpus.train_sentence_classifier(options.ptype)
                print res
            corpus.save(config.paths[options.goldstd[0]]["corpus"])
        elif options.actions == "test_sentences":  #and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in config.pair_types:
                    print p
                    tps, fps, fns = corpus.test_sentence_classifier(p)
                if tps == 0 and fns == 0:
                    precision, recall, fscore = 0, 1, 1
                else:
                    precision = 1.0 * tps / (fps + tps)
                    recall = 1.0 * fns / (fns + tps)
                    fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
                avg[0] += tps
                avg[1] += fps
                avg[2] += fns
            #print [a/len(config.pair_types) for a in avg]
            precision = 1.0 * avg[1] / (avg[0] + avg[1])
            recall = 1.0 * avg[2] / (avg[0] + avg[2])
            fscore = 2.0 * precision * recall / (recall + precision)
            print precision, recall, fscore
        else:
            res = corpus.test_sentence_classifier(options.ptype)
            print res
        corpus.save(config.paths[options.goldstd[0]]["corpus"])

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Example #2
# Standard-library imports used in this snippet; project-specific names
# (paths, MILClassifier, get_gold_ann_set, get_list_results,
# get_relations_results, ...) are assumed to come from the surrounding project.
import argparse
import logging
import pickle
import time
from collections import OrderedDict


def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("--train",
                        nargs="+",
                        help="Gold standards to be used for training")
    parser.add_argument("--test",
                        nargs="+",
                        help="Gold standards to be used for testing")
    parser.add_argument("--tag",
                        dest="tag",
                        default="0",
                        help="Tag to identify the experiment")
    parser.add_argument("--emodels",
                        dest="emodels",
                        help="model destination path, without extension",
                        nargs="+")
    parser.add_argument("--rmodels",
                        dest="rmodels",
                        help="model destination path, without extension")
    parser.add_argument("--entitytype",
                        dest="etype",
                        help="type of entities to be considered",
                        default="all")
    parser.add_argument("--pairtype",
                        dest="ptype",
                        help="type of pairs to be considered",
                        default="all")
    parser.add_argument("--crf",
                        dest="crf",
                        help="CRF implementation",
                        default="stanford",
                        choices=["stanford", "crfsuite", "banner"])
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("--rules",
                        default=[],
                        nargs='+',
                        help="aditional post processing rules")
    parser.add_argument("--kernel",
                        action="store",
                        dest="kernel",
                        default="svmtk",
                        help="Kernel for relation extraction")
    parser.add_argument("--results",
                        dest="results",
                        help="Results object pickle.")

    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    # logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    relations = set()
    with open("corpora/transmir/transmir_relations.txt") as rfile:
        for l in rfile:
            relations.add(tuple(l.strip().split('\t')))
    with open("temp/mil.train", 'w') as f:
        f.write("")
    # train_corpus = Corpus("corpus/" + "&".join(options.goldstd[0]))
    total_entities = 0
    for goldstd in options.train:
        corpus_path = paths[goldstd]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        train_corpus = pickle.load(open(corpus_path, 'rb'))
        for sentence in train_corpus.get_sentences(options.emodels[0]):
            for e in sentence.entities.elist[options.emodels[0]]:
                if e.normalized_score > 0:
                    total_entities += 1
        # with open("mirna_ds-pmids.txt", 'w') as pmidfile:
        #     for did in train_corpus.documents:
        #         pmidfile.write(did + "\n")
        train_model = MILClassifier(train_corpus,
                                    options.ptype,
                                    relations,
                                    ner=options.emodels[0])
        train_model.load_kb("corpora/transmir/transmir_relations.txt")
        train_model.generateMILdata(test=False)
        train_model.write_to_file("temp/mil.train")
        train_model = None
        train_corpus = None
    print "total entities:", total_entities
    train_model = MILClassifier(None,
                                options.ptype,
                                relations,
                                ner=options.emodels[0],
                                generate=False,
                                modelname=options.rmodels)
    train_model.load_kb("corpora/transmir/transmir_relations.txt")
    train_model.load_from_file("temp/mil.train")
    #train_model.generateMILdata(test=False)
    train_model.train()

    # test_corpus = Corpus("corpus/" + "&".join(options.goldstd[1]))
    test_sets = []
    for g in options.test:
        corpus_path = paths[g]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        test_corpus = pickle.load(open(corpus_path, 'rb'))
        test_sets.append(test_corpus)

    for i, test_corpus in enumerate(test_sets):
        logging.info("evaluation {}".format(options.test[i]))
        test_model = MILClassifier(test_corpus,
                                   options.ptype,
                                   relations,
                                   test=True,
                                   ner=options.emodels[i + 1],
                                   modelname=options.rmodels)
        test_model.load_kb("corpora/transmir/transmir_relations.txt")
        test_model.generateMILdata(test=False)
        test_model.load_classifier()
        #test_model.vectorizer = train_model.vectorizer
        #test_model.classifier = train_model.classifier

        test_model.test()
        results = test_model.get_predictions(test_corpus)
        results.path = options.results + "-" + options.test[i]
        results.save(options.results + "-" + options.test[i] + ".pickle")
        results.load_corpus(options.test[i])
        if options.test[i] != "mirna_cf_annotated":
            logging.info("loading gold standard %s" %
                         paths[options.test[i]]["annotations"])
            goldset = get_gold_ann_set(paths[options.test[i]]["format"],
                                       paths[options.test[i]]["annotations"],
                                       options.etype, options.ptype,
                                       paths[options.test[i]]["text"])
            if options.test[i] in ("transmir_annotated", "miRTex_test"):
                get_list_results(results,
                                 options.kernel,
                                 goldset[1], {},
                                 options.rules,
                                 mode="re")
            else:
                get_relations_results(results, options.kernel, goldset[1], {},
                                      options.rules)
        else:
            total_entities = 0
            for sentence in test_corpus.get_sentences(options.emodels[1]):
                for e in sentence.entities.elist[options.emodels[1]]:
                    if e.normalized_score > 0:
                        total_entities += 1
            print "total entities:", total_entities
            #sysresults = results.corpus.get_unique_results(options.kernel, {}, options.rules, mode="re")
            sysresults = []
            for did in results.corpus.documents:
                for p in results.corpus.documents[did].pairs.pairs:
                    if options.kernel in p.recognized_by:
                        sentence = results.corpus.documents[did].get_sentence(
                            p.entities[0].sid)
                        # print p.entities[0].text, p.entities[1].text
                        sysresults.append(
                            (p.entities[0].sid, p.entities[0].normalized,
                             p.entities[1].normalized,
                             "{}=>{}".format(p.entities[0].normalized,
                                             p.entities[1].normalized),
                             p.recognized_by[options.kernel], sentence.text))
            rels = {}
            for x in sysresults:
                pair = (x[1], x[2])
                if pair not in rels:
                    rels[pair] = []
                #print x[0], x[-1], x[3]
                stext = x[0] + ": " + x[-1] + " (" + x[3] + ")"
                # stext = ""
                add = True
                for existing in rels[pair]:
                    if existing[0].startswith(x[0]):
                        add = False
                        break
                if add:
                    rels[pair].append((stext, x[4]))
            # for t in rels.items():
            #     print t
            o = OrderedDict(
                sorted(rels.items(), key=lambda t: t[1][0][1], reverse=True))
            for x in o:
                print x, len(o[x])
                for s in o[x]:
                    print "\t", s[0].encode("utf-8")
                print
            for x in o:
                conf = round(o[x][0][1], 3)
                unique_dids = set([sid[0].split(".")[0] for sid in o[x]])
                # print unique_dids
                print x[0] + "\t" + x[1] + "\t" + str(len(o[x])) + "\t" + str(
                    len(unique_dids)) + "\t" + str(conf)
            #max_width = table_instance.column_max_width(2)
            #for i, line in enumerate(table_instance.table_data):
            #    wrapped_string = '\n'.join(wrap(line[2], max_width))
            #    table_instance.table_data[i][2] = wrapped_string

    total_time = time.time() - start_time
    print "Total time: %ss" % total_time
Example #3
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify",  help="Actions to be performed.")
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"),
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("-o", "--output", "--format", dest="output",
                        nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = paths[options.goldstd]["format"]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        # corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann: #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(paths[options.goldstd]["corpus"])

    elif options.actions == "annotate": # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        # corpus.clear_annotations("all")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(paths[options.goldstd]["corpus"])
    else:
        #corpus = SeeDevCorpus("corpus/" + "&".join(options.goldstd))
        corpus_path = paths[options.goldstd[0]]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        basecorpus = pickle.load(open(corpus_path, 'rb'))
        corpus = SeeDevCorpus(corpus_path)
        corpus.documents = basecorpus.documents
        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)
        elif options.actions == "add_goldstandard":
            corpus.convert_entities_to_goldstandard()
            corpus.find_ds_relations()
            #corpus.save(config.paths[options.goldstd[0]]["corpus"])
        elif options.actions == "train_multiple":  # Train one classifier for each type of entity in this corpus
            # logging.info(corpus.subtypes)
            models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=all_entity_types)
            models.train_types()
        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = pair_types.keys()
                # ptypes = config.event_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=True)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "multir":
                    model = MultiR(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p)
                # model.train()
        # testing
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels)))
            models = TaggerCollection(basepath=options.models, subtypes=all_entity_types)
            models.load_models()
            results = models.test_types(corpus)
            final_results = results.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.ptype == "all":
                ptypes = pair_types.keys()
                # ptypes = config.event_types.keys()
                all_results = ResultsRE(options.output[1])
                all_results.corpus = corpus
                all_results.path = options.output[1]
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=False)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "rules":
                    model = RuleClassifier(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p, test=True)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                # results.save(options.output[1] + "_" + p.lower() + ".pickle")
                # results.load_corpus(options.goldstd[0])
                results.path = options.output[1] + "_" + p.lower()
                goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"], paths[options.goldstd[0]]["annotations"],
                                       "all", p, paths[options.goldstd[0]]["text"])
                get_relations_results(results, options.models, goldset[1],[], [])
                if options.ptype == "all":
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[did].pairs += results.document_pairs[did].pairs
            if options.ptype == "all":
                goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"], paths[options.goldstd[0]]["annotations"],
                                       "all", "all", paths[options.goldstd[0]]["text"])
                get_relations_results(all_results, options.models, goldset[1],[], [])
                write_seedev_results(all_results, options.output[1])
        elif options.actions == "train_sentences": #and evaluate
            if options.ptype == "all":
                avg = [0,0,0]
                for p in pair_types:
                    print p
                    tps, fps, fns = corpus.train_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (fps + tps)
                        recall = 1.0 * tps / (tps + fns)
                        fscore = 2.0 * precision * recall / (recall + precision)
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                #print [a/len(config.pair_types) for a in avg]
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
            else:
                res = corpus.train_sentence_classifier(options.ptype)
                print res
            corpus.save(paths[options.goldstd[0]]["corpus"])
        elif options.actions == "test_sentences": #and evaluate
            if options.ptype == "all":
                avg = [0,0,0]
                for p in pair_types:
                    print p
                    tps, fps, fns = corpus.test_sentence_classifier(p)
                if tps == 0 and fns == 0:
                    precision, recall, fscore = 0, 1, 1
                else:
                    precision = 1.0 * tps / (fps + tps)
                    recall = 1.0 * fns / (fns + tps)
                    fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
                avg[0] += tps
                avg[1] += fps
                avg[2] += fns
            #print [a/len(config.pair_types) for a in avg]
            precision = 1.0 * avg[1] / (avg[0] + avg[1])
            recall = 1.0 * avg[2] / (avg[0] + avg[2])
            fscore = 2.0 * precision * recall / (recall + precision)
            print precision, recall, fscore
        #else:
        #    res = corpus.test_sentence_classifier(options.ptype)
        #    print res
        elif options.actions == "evaluate_ner":
            if os.path.exists(options.output[1] + ".pickle"):
                results = pickle.load(open(options.output[1] + ".pickle", 'rb'))
                results.load_corpus(options.goldstd[0])
                results.path = options.output[1]
            logging.info("loading gold standard %s" % paths[options.goldstd[0]]["annotations"])
            for t in all_entity_types:
                print t
                results.path = options.output[1] + "_" + t
                goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"],
                                           paths[options.goldstd[0]]["annotations"],
                                           t, options.ptype, paths[options.goldstd[0]]["text"])
                get_results(results, options.models + "_" + t, goldset[0], {}, {})

        corpus.save(paths[options.goldstd[0]]["corpus"])


    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Example #4
# Standard-library imports used in this snippet; project-specific names
# (Corpus, ResultsNER, StanfordNERModel, CrfSuiteModel, feature_extractors,
# chunks, config, get_gold_ann_set, get_results, ...) are assumed to come
# from the surrounding project.
import itertools
import logging
import random


def run_crossvalidation(goldstd_list,
                        corpus,
                        model,
                        cv,
                        crf="stanford",
                        entity_type="all",
                        cvlog="cv.log"):
    logfile = open(cvlog, 'w')
    doclist = corpus.documents.keys()
    random.shuffle(doclist)
    size = int(len(doclist) / cv)
    sublists = chunks(doclist, size)
    logging.debug("Chunks:")
    logging.debug(sublists)
    p, r = [], []
    all_results = ResultsNER(model)
    all_results.path = model + "_results"
    for nlist in range(cv):
        testids, trainids = None, None
        testids = sublists[nlist]
        trainids = list(itertools.chain.from_iterable(sublists[:nlist]))
        trainids += list(itertools.chain.from_iterable(sublists[nlist + 1:]))
        train_corpus, test_corpus = None, None
        print 'CV{} - test set: {}; train set: {}'.format(
            nlist, len(testids), len(trainids))
        train_corpus = Corpus(
            corpus.path + "_train",
            documents={did: corpus.documents[did]
                       for did in trainids})
        test_corpus = Corpus(
            corpus.path + "_test",
            documents={did: corpus.documents[did]
                       for did in testids})
        # logging.debug("train corpus docs: {}".format("\n".join(train_corpus.documents.keys())))
        #test_entities = len(test_corpus.get_all_entities("goldstandard"))
        #train_entities = len(train_corpus.get_all_entities("goldstandard"))
        #logging.info("test set entities: {}; train set entities: {}".format(test_entities, train_entities))
        basemodel = model + "_cv{}".format(nlist)
        logging.debug('CV{} - test set: {}; train set: {}'.format(
            nlist, len(test_corpus.documents), len(train_corpus.documents)))
        '''for d in train_corpus.documents:
            for s in train_corpus.documents[d].sentences:
                print len([t.tags.get("goldstandard") for t in s.tokens if t.tags.get("goldstandard") != "other"])
        sys.exit()'''
        # train
        logging.info('CV{} - TRAIN'.format(nlist))
        # train_model = StanfordNERModel(basemodel)
        train_model = None
        if crf == "stanford":
            train_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            train_model = CrfSuiteModel(basemodel, entity_type)
        train_model.load_data(train_corpus, feature_extractors.keys())
        train_model.train()

        # test
        logging.info('CV{} - TEST'.format(nlist))
        test_model = None
        if crf == "stanford":
            test_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            test_model = CrfSuiteModel(basemodel, entity_type)
        test_model.load_tagger(port=9191 + nlist)
        test_model.load_data(test_corpus,
                             feature_extractors.keys(),
                             mode="test")
        final_results = None
        final_results = test_model.test(test_corpus, port=9191 + nlist)
        if crf == "stanford":
            test_model.kill_process()
        final_results.basepath = basemodel + "_results"
        final_results.path = basemodel

        all_results.entities.update(final_results.entities)
        all_results.corpus.documents.update(final_results.corpus.documents)
        # validate
        """if config.use_chebi:
            logging.info('CV{} - VALIDATE'.format(nlist))
            final_results = add_chebi_mappings(final_results, basemodel)
            final_results = add_ssm_score(final_results, basemodel)
            final_results.combine_results(basemodel, basemodel)"""

        # evaluate
        logging.info('CV{} - EVALUATE'.format(nlist))
        test_goldset = set()
        for gs in goldstd_list:
            goldset = get_gold_ann_set(config.paths[gs]["format"],
                                       config.paths[gs]["annotations"],
                                       entity_type, "pairtype",
                                       config.paths[gs]["text"])
            for g in goldset[0]:
                if g[0] in testids:
                    test_goldset.add(g)
        precision, recall = get_results(final_results, basemodel, test_goldset,
                                        {}, [])
        # evaluation = run_chemdner_evaluation(config.paths[goldstd]["cem"], basemodel + "_results.txt", "-t")
        # values = evaluation.split("\n")[1].split('\t')
        p.append(precision)
        r.append(recall)
        # logging.info("precision: {} recall:{}".format(str(values[13]), str(values[14])))
    pavg = sum(p) / cv
    ravg = sum(r) / cv
    print "precision: average={} all={}".format(
        str(pavg), '|'.join([str(pp) for pp in p]))
    print "recall: average={}  all={}".format(str(ravg),
                                              '|'.join([str(rr) for rr in r]))
    all_goldset = set()
    for gs in goldstd_list:
        goldset = get_gold_ann_set(config.paths[gs]["format"],
                                   config.paths[gs]["annotations"],
                                   entity_type, config.paths[gs]["text"])
        for g in goldset:
            all_goldset.add(g)
    get_results(all_results, model, all_goldset, {}, [])
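
run_crossvalidation relies on a chunks() helper that is not shown in this snippet; it must return an indexable list of sublists, since the result is sliced and indexed per fold. A minimal version consistent with that usage, offered as an assumption rather than the project's actual implementation:

def chunks(items, size):
    """Split items into consecutive sublists of length size (the last may be shorter)."""
    return [items[i:i + size] for i in range(0, len(items), size)]
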
Example #5
def run_crossvalidation(goldstd_list, corpus, model, cv, crf="stanford", entity_type="all", cvlog="cv.log"):
    logfile = open(cvlog, 'w')
    doclist = corpus.documents.keys()
    random.shuffle(doclist)
    size = int(len(doclist)/cv)
    sublists = chunks(doclist, size)
    logging.debug("Chunks:")
    logging.debug(sublists)
    p, r = [], []
    all_results = ResultsNER(model)
    all_results.path = model + "_results"
    for nlist in range(cv):
        testids, trainids = None, None
        testids = sublists[nlist]
        trainids = list(itertools.chain.from_iterable(sublists[:nlist]))
        trainids += list(itertools.chain.from_iterable(sublists[nlist+1:]))
        train_corpus, test_corpus = None, None
        print 'CV{} - test set: {}; train set: {}'.format(nlist, len(testids), len(trainids))
        train_corpus = Corpus(corpus.path + "_train", documents={did: corpus.documents[did] for did in trainids})
        test_corpus = Corpus(corpus.path + "_test", documents={did: corpus.documents[did] for did in testids})
        # logging.debug("train corpus docs: {}".format("\n".join(train_corpus.documents.keys())))
        #test_entities = len(test_corpus.get_all_entities("goldstandard"))
        #train_entities = len(train_corpus.get_all_entities("goldstandard"))
        #logging.info("test set entities: {}; train set entities: {}".format(test_entities, train_entities))
        basemodel = model + "_cv{}".format(nlist)
        logging.debug('CV{} - test set: {}; train set: {}'.format(nlist, len(test_corpus.documents), len(train_corpus.documents)))
        '''for d in train_corpus.documents:
            for s in train_corpus.documents[d].sentences:
                print len([t.tags.get("goldstandard") for t in s.tokens if t.tags.get("goldstandard") != "other"])
        sys.exit()'''
        # train
        logging.info('CV{} - TRAIN'.format(nlist))
        # train_model = StanfordNERModel(basemodel)
        train_model = None
        if crf == "stanford":
            train_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            train_model = CrfSuiteModel(basemodel, entity_type)
        train_model.load_data(train_corpus, feature_extractors.keys())
        train_model.train()

        # test
        logging.info('CV{} - TEST'.format(nlist))
        test_model = None
        if crf == "stanford":
            test_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            test_model = CrfSuiteModel(basemodel, entity_type)
        test_model.load_tagger(port=9191+nlist)
        test_model.load_data(test_corpus, feature_extractors.keys(), mode="test")
        final_results = None
        final_results = test_model.test(test_corpus, port=9191+nlist)
        if crf == "stanford":
            test_model.kill_process()
        final_results.basepath = basemodel + "_results"
        final_results.path = basemodel

        all_results.entities.update(final_results.entities)
        all_results.corpus.documents.update(final_results.corpus.documents)
        # validate
        """if config.use_chebi:
            logging.info('CV{} - VALIDATE'.format(nlist))
            final_results = add_chebi_mappings(final_results, basemodel)
            final_results = add_ssm_score(final_results, basemodel)
            final_results.combine_results(basemodel, basemodel)"""

        # evaluate
        logging.info('CV{} - EVALUATE'.format(nlist))
        test_goldset = set()
        for gs in goldstd_list:
            goldset = get_gold_ann_set(config.corpus_paths.paths[gs]["format"], config.corpus_paths.paths[gs]["annotations"], entity_type,
                                       "pairtype", config.corpus_paths.paths[gs]["text"])
            for g in goldset[0]:
                if g[0] in testids:
                    test_goldset.add(g)
        precision, recall = get_results(final_results, basemodel, test_goldset, {}, [])
        # evaluation = run_chemdner_evaluation(config.paths[goldstd]["cem"], basemodel + "_results.txt", "-t")
        # values = evaluation.split("\n")[1].split('\t')
        p.append(precision)
        r.append(recall)
        # logging.info("precision: {} recall:{}".format(str(values[13]), str(values[14])))
    pavg = sum(p)/cv
    ravg = sum(r)/cv
    print "precision: average={} all={}".format(str(pavg), '|'.join([str(pp) for pp in p]))
    print "recall: average={}  all={}".format(str(ravg), '|'.join([str(rr) for rr in r]))
    all_goldset = set()
    for gs in goldstd_list:
        goldset = get_gold_ann_set(config.corpus_paths.paths[gs]["format"], config.corpus_paths.paths[gs]["annotations"], entity_type, "",
                                   config.corpus_paths.paths[gs]["text"])
        for g in goldset[0]:
            all_goldset.add(g)
    get_results(all_results, model, all_goldset, {}, [])
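
A hedged usage sketch for the cross-validation routine above; the corpus pickle path, gold-standard key, model basename and fold count are placeholders, not values taken from the project:

import pickle

with open("corpora/mycorpus.pickle", "rb") as f:
    corpus = pickle.load(f)
# 5-fold cross-validation with a CRFsuite model over all entity types
run_crossvalidation(["my_goldstd"], corpus, "models/ner_cv", cv=5,
                    crf="crfsuite", entity_type="all", cvlog="cv.log")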