Code example #1
File: normalize.py  Project: lasigeBioTM/IBEnt
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("action", default="map",
                      help="Actions to be performed.")
    parser.add_argument("goldstd", default="chemdner_sample",
                        help="Gold standard to be used.",
                        choices=paths.keys())
    parser.add_argument("--corpus", dest="corpus",
                      default="data/chemdner_sample_abstracts.txt.pickle",
                      help="format path")
    parser.add_argument("--results", dest="results", help="Results object pickle.")
    parser.add_argument("--models", dest="models", help="model destination path, without extension", default="combined")
    parser.add_argument("--ensemble", dest="ensemble", help="name/path of ensemble classifier", default="combined")
    parser.add_argument("--chebi", dest="chebi", help="Chebi mapping threshold.", default=0, type=float)
    parser.add_argument("--ssm", dest="ssm", help="SSM threshold.", default=0, type=float)
    parser.add_argument("--measure", dest="measure", help="semantic similarity measure", default="simui")
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    options = parser.parse_args()

    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.info("Processing action {0} on {1}".format(options.action, options.goldstd))
    logging.info("loading results %s" % options.results + ".pickle")
    if os.path.exists(options.results + ".pickle"):
        results = pickle.load(open(options.results + ".pickle", 'rb'))
        results.path = options.results
    else:
        print "results not found"
        results = None

    if options.action == "chebi":
        if not config.use_chebi:
            print "If you want to use ChEBI, please re-run config.py and set use_chebi to true"
            sys.exit()
        add_chebi_mappings(results, options.results + ".pickle", options.models)
    # if options.action == "go":
    #    add_go_mappings(results, options.results + ".pickle", options.models)
    elif options.action in ("mirna", "protein", "all"):
        normalize_entities(results, options.results + ".pickle", options.models)
    elif options.action == "ssm":
        if options.measure.endswith("go"):
            ontology = "go"
        else:
            ontology = "chebi"
        add_ssm_score(results, options.results + ".pickle", options.models, options.measure, ontology)
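
Every entry point in these examples configures logging the same way. The following standalone sketch (not part of the IBEnt sources) restates that pattern: the level name is resolved with getattr, rejected if unknown, and the root logger's handlers are cleared so that basicConfig takes effect even when logging was already configured.

import logging

def configure_logging(level_name):
    # Map a name such as "WARNING" to its numeric value; getattr returns None
    # for unknown names, which is rejected explicitly.
    numeric_level = getattr(logging, level_name.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError("Invalid log level: %s" % level_name)
    # Remove any handlers installed earlier, otherwise basicConfig is a no-op.
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging.basicConfig(level=numeric_level,
                        format="%(asctime)s %(levelname)s %(message)s")

configure_logging("INFO")
logging.info("logger configured")
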
Code example #2
File: crossvalidation.py  Project: lasigeBioTM/IBEnt
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"),
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                      default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                      help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the text.")
    parser.add_argument("--cv", dest="cv", default=5, help="Number of folds.", type=int)
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("-o", "--output", "--format", dest="output",
                        nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction")
    parser.add_argument("--pairtype1", action="store", dest="pairtype1")
    parser.add_argument("--pairtype2", action="store", dest="pairtype2")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.info("Crossvalidation on {0}".format(options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    corpus_name = "&".join(options.goldstd)
    corpus = Corpus("corpus/" + corpus_name)
    for g in options.goldstd:
        corpus_path = paths[g]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        this_corpus = pickle.load(open(corpus_path, 'rb'))
        #docs = this_corpus.documents
        docs = dict((k, this_corpus.documents[k]) for k in this_corpus.documents.keys())
        corpus.documents.update(docs)
    run_crossvalidation(options.goldstd, corpus, options.models, options.cv, options.crf, options.etype)

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Code example #3
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("action",
                        default="evaluate",
                        help="Actions to be performed.")
    parser.add_argument("goldstd",
                        default="chemdner_sample",
                        help="Gold standard to be used.",
                        choices=paths.keys())
    parser.add_argument("--corpus",
                        dest="corpus",
                        default="data/chemdner_sample_abstracts.txt.pickle",
                        help="format path")
    parser.add_argument("--results",
                        dest="results",
                        help="Results object pickle.",
                        nargs='+')
    parser.add_argument("--models",
                        dest="models",
                        help="model destination path, without extension",
                        default="combined")
    parser.add_argument("--ensemble",
                        dest="ensemble",
                        help="name/path of ensemble classifier",
                        default="combined")
    parser.add_argument("--chebi",
                        dest="chebi",
                        help="Chebi mapping threshold.",
                        default=0,
                        type=float)
    parser.add_argument("--ssm",
                        dest="ssm",
                        help="SSM threshold.",
                        default=0,
                        type=float)
    parser.add_argument("--measure",
                        dest="measure",
                        help="semantic similarity measure",
                        default="simui")
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("--submodels",
                        default="",
                        nargs='+',
                        help="sub types of classifiers"),
    parser.add_argument("--rules",
                        default=[],
                        nargs='+',
                        help="aditional post processing rules")
    parser.add_argument("--features",
                        default=[
                            "chebi", "case", "number", "greek", "dashes",
                            "commas", "length", "chemwords", "bow"
                        ],
                        nargs='+',
                        help="aditional features for ensemble classifier")
    parser.add_argument("--doctype",
                        dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument("--entitytype",
                        dest="etype",
                        help="type of entities to be considered",
                        default="all")
    parser.add_argument("--pairtype",
                        dest="ptype",
                        help="type of pairs to be considered",
                        default=None)
    parser.add_argument(
        "--external",
        action="store_true",
        default=False,
        help="Run external evaluation script, depends on corpus type")
    parser.add_argument("--output",
                        dest="output",
                        help="Final output",
                        default=None)
    options = parser.parse_args()

    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.info("Processing action {0} on {1}".format(options.action,
                                                       options.goldstd))
    results_list = []
    for results_path in options.results:
        logging.info("loading results %s" % results_path + ".pickle")
        if os.path.exists(results_path + ".pickle"):
            results = pickle.load(open(results_path + ".pickle", 'rb'))
            results.load_corpus(options.goldstd)
            results.path = results_path
            results_list.append(results)
        else:
            print "results not found"
            print results_path
            sys.exit()

    if options.action in ("combine", "train_ensemble", "test_ensemble",
                          "savetocorpus"):
        # merge the results of various results corresponding to different classifiers
        # the entities of each sentence are added according to the classifier of each result
        # every result should correspond to the same gold standard
        # save to the first results path
        #results.load_corpus(options.goldstd)
        #logging.info("combining results...")
        #results.combine_results(options.models, options.models + "_combined")
        #results.save(options.results + "_combined.pickle")
        base_result = results_list[0]
        for result in results_list[1:]:
            logging.info("adding {}...".format(result.path))
            base_result.add_results(result)

        if options.action == "combine":
            base_result.combine_results(options.etype, options.models)
            n_sentences, n_docs, n_entities, n_relations = 0, 0, 0, 0
            for did in base_result.corpus.documents:
                n_docs += 1
                for sentence in base_result.corpus.documents[did].sentences:
                    n_sentences += 1
                    for e in sentence.entities.elist[options.models]:
                        n_entities += 1
            logging.info("Combined {} docs, {} sentences, {} entities".format(
                n_docs, n_sentences, n_entities))
            base_result.save(options.models + ".pickle")
        elif options.action == "savetocorpus":
            base_result.corpus.save(options.output + ".pickle")
        elif options.action == "train_ensemble":
            pipeline = Pipeline([
                #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                #('clf', SGDClassifier())
                #('clf', svm.NuSVC(nu=0.01 ))
                # ('clf', RandomForestClassifier(class_weight={False:1, True:1}, n_jobs=-1, criterion="entropy", warm_start=True))
                #('clf', tree.DecisionTreeClassifier(criterion="entropy")),
                #('clf', MultinomialNB())
                #('clf', GaussianNB())
                ('clf', svm.SVC(kernel="rbf", degree=2, C=1))
                #('clf', DummyClassifier(strategy="constant", constant=True))
            ])
            print pipeline
            base_result.train_ensemble(pipeline, options.models, options.etype)
        elif options.action == "test_ensemble":
            pipeline = joblib.load("{}/{}/{}.pkl".format(
                "models/ensemble/", options.models, options.models))
            print pipeline
            base_result.test_ensemble(pipeline, options.models, options.etype)
            base_result.save("results/" + options.models + ".pickle")

    elif options.action in ("evaluate", "evaluate_list", "count_entities"):
        counts = {}
        if options.action == "count_entities":
            for did in results_list[0].corpus.documents:
                for sentence in results_list[0].corpus.documents[
                        did].sentences:
                    print sentence.entities.elist.keys()
                    if options.models in sentence.entities.elist:
                        for e in sentence.entities.elist[options.models]:
                            if e.type not in counts:
                                counts[e.type] = 0
                            counts[e.type] += 1
            print counts
            sys.exit()
        if paths[options.goldstd].get("annotations"):
            logging.info("loading gold standard %s" %
                         paths[options.goldstd]["annotations"])
            goldset = get_gold_ann_set(paths[options.goldstd]["format"],
                                       paths[options.goldstd]["annotations"],
                                       options.etype, options.ptype,
                                       paths[options.goldstd]["text"])
        else:
            goldset = ({}, {})
        logging.info("using thresholds: chebi > {!s} ssm > {!s}".format(
            options.chebi, options.ssm))
        #results.load_corpus(options.goldstd)
        #results.path = options.results
        ths = {"chebi": options.chebi}
        if options.ssm > 0:
            ths["ssm"] = options.ssm
        if options.action == "evaluate":
            for result in results_list:
                if options.ptype:  # evaluate this pair type
                    get_relations_results(result, options.models, goldset[1],
                                          ths, options.rules)
                else:  # evaluate an entity type
                    get_results(result, options.models, goldset[0], ths,
                                options.rules)
            if options.external:
                write_chemdner_files(results, options.models, goldset, ths,
                                     options.rules)
                #evaluation = run_chemdner_evaluation(paths[options.goldstd]["cem"],
                #                                     options.results[0] + ".tsv")
                #print evaluation
        elif options.action == "evaluate_list":  # ignore the spans, the gold standard is a list of unique entities
            for result in results_list:
                if options.ptype:
                    get_list_results(result,
                                     options.models,
                                     goldset[1],
                                     ths,
                                     options.rules,
                                     mode="re")
                else:
                    get_list_results(result, options.models, goldset[0], ths,
                                     options.rules)
    elif options.action == "anafora":
        for result in results_list:
            run_anafora(result, options.models,
                        paths[options.goldstd]["annotations"],
                        paths[options.goldstd]["text"], {}, options.rules,
                        options.etype)
    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Code example #4
File: results.py  Project: lasigeBioTM/IBEnt
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("action",
                        default="evaluate",
                        help="Actions to be performed.")
    parser.add_argument("goldstd",
                        default="chemdner_sample",
                        help="Gold standard to be used.",
                        choices=paths.keys())
    parser.add_argument("--corpus",
                        dest="corpus",
                        default="data/chemdner_sample_abstracts.txt.pickle",
                        help="format path")
    parser.add_argument("--results",
                        dest="results",
                        help="Results object pickle.",
                        nargs='+')
    parser.add_argument("--models",
                        dest="models",
                        help="model destination path, without extension",
                        nargs='+')
    parser.add_argument(
        "--finalmodel",
        dest="finalmodel",
        help="model destination path, without extension")  #,  nargs='+')
    parser.add_argument("--ensemble",
                        dest="ensemble",
                        help="name/path of ensemble classifier",
                        default="combined")
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("-o", "--output", action="store", dest="output")
    parser.add_argument("--submodels",
                        default="",
                        nargs='+',
                        help="sub types of classifiers"),
    parser.add_argument("--features",
                        default=[
                            "chebi", "case", "number", "greek", "dashes",
                            "commas", "length", "chemwords", "bow"
                        ],
                        nargs='+',
                        help="aditional features for ensemble classifier")
    parser.add_argument("--doctype",
                        dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument("--entitytype",
                        dest="etype",
                        help="type of entities to be considered",
                        default="all")
    parser.add_argument(
        "--external",
        action="store_true",
        default=False,
        help="Run external evaluation script, depends on corpus type")
    parser.add_argument("-i",
                        "--input",
                        action="store",
                        help="input file to be convert to IBEnt results.")
    options = parser.parse_args()

    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.info("Processing action {0} on {1}".format(options.action,
                                                       options.goldstd))
    logging.info("loading results %s" % options.results + ".pickle")
    results_list = []
    for r in options.results:
        if os.path.exists(r + ".pickle"):
            results = pickle.load(open(r + ".pickle", 'rb'))
            results.path = r
            results.load_corpus(options.goldstd)
            results_list.append(results)
        else:
            print "results not found"

    if options.action == "combine":
        # add another set of annotations to each sentence, ending in combined
        # each entity from this dataset should have a unique ID and a recognized_by attribute
        logging.info("combining results...")
        #new_name = "_".join([m.split("/")[-1] for m in options.results])
        #print new_name
        results = combine_results(options.finalmodel, results_list,
                                  options.output, options.etype,
                                  options.models)
        results.save(options.output + ".pickle")
    if options.action == "import":
        # import results from a different format to IBEnt
        # for now assume CHEMDNER format
        results = ResultsNER(options.results[0])
        logging.info("loading corpus...")
        results.corpus = pickle.load(open(paths[options.goldstd]["corpus"]))
        results.model = options.models[0]
        results.import_chemdner(options.input)
        results.save(results.name + ".pickle")
    """elif options.action in ("train_ensemble", "test_ensemble"):
        if "annotations" in config.paths[options.goldstd]:
            logging.info("loading gold standard %s" % config.paths[options.goldstd]["annotations"])
            goldset = get_gold_ann_set(config.paths[options.goldstd]["format"], config.paths[options.goldstd]["annotations"],
                                       options.etype,  config.paths[options.goldstd]["text"])
        else:
            goldset = None
        logging.info("using thresholds: chebi > {!s} ssm > {!s}".format(options.chebi, options.ssm))
        results.load_corpus(options.goldstd)
        results.path = options.results
        ths = {"chebi": options.chebi, "ssm": options.ssm}
        if "ensemble" in options.action:
            if len(options.submodels) > 1:
                submodels = []
                for s in options.submodels:
                    submodels += ['_'.join(options.models.split("_")[:-1]) + "_" + s + "_" + t for t in results.corpus.subtypes]
            else:
                submodels = ['_'.join(options.models.split("_")[:-1]) + "_" + t for t in results.corpus.subtypes]
            logging.info("using these features: {}".format(' '.join(submodels)))
        if options.action == "train_ensemble":
            ensemble = EnsembleNER(options.ensemble, goldset, options.models, types=submodels,
                                   features=options.features)
            ensemble.generate_data(results)
            ensemble.train()
            ensemble.save()
        if options.action == "test_ensemble":
            ensemble = EnsembleNER(options.ensemble, [], options.models, types=submodels,
                                   features=options.features)
            ensemble.load()
            ensemble.generate_data(results, supervisioned=False)
            ensemble.test()
            ensemble_results = ResultsNER(options.models + "_ensemble")
            # process the results
            ensemble_results.get_ensemble_results(ensemble, results.corpus, options.models)
            ensemble_results.path = options.results + "_ensemble"
            get_results(ensemble_results, options.models + "_ensemble", goldset, ths, options.rules)"""

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Code example #5
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions",
                        default="classify",
                        help="Actions to be performed.")
    parser.add_argument(
        "--goldstd",
        default="",
        dest="goldstd",
        nargs="+",
        help="Gold standard to be used. Will override corpus, annotations",
        choices=paths.keys())
    parser.add_argument("--submodels",
                        default="",
                        nargs='+',
                        help="sub types of classifiers"),
    parser.add_argument("--models",
                        dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--pairtype",
                        dest="ptype",
                        help="type of pairs to be considered",
                        default="all")
    parser.add_argument("--doctype",
                        dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument(
        "-o",
        "--output",
        "--format",
        dest="output",
        nargs=2,
        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel",
                        action="store",
                        dest="kernel",
                        default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions,
                                                       options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = paths[options.goldstd]["format"]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        # corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann:  #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(paths[options.goldstd]["corpus"])

    elif options.actions == "annotate":  # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        # corpus.clear_annotations("all")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(paths[options.goldstd]["corpus"])
    else:
        #corpus = SeeDevCorpus("corpus/" + "&".join(options.goldstd))
        corpus_path = paths[options.goldstd[0]]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        basecorpus = pickle.load(open(corpus_path, 'rb'))
        corpus = SeeDevCorpus(corpus_path)
        corpus.documents = basecorpus.documents
        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)
        elif options.actions == "add_goldstandard":
            corpus.convert_entities_to_goldstandard()
            corpus.find_ds_relations()
            #corpus.save(config.paths[options.goldstd[0]]["corpus"])
        elif options.actions == "train_multiple":  # Train one classifier for each type of entity in this corpus
            # logging.info(corpus.subtypes)
            models = TaggerCollection(basepath=options.models,
                                      corpus=corpus,
                                      subtypes=all_entity_types)
            models.train_types()
        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = pair_types.keys()
                # ptypes = config.event_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=True)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "multir":
                    model = MultiR(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p)
                # model.train()
        # testing
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(
                ' '.join(options.submodels)))
            models = TaggerCollection(basepath=options.models,
                                      subtypes=all_entity_types)
            models.load_models()
            results = models.test_types(corpus)
            final_results = results.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.ptype == "all":
                ptypes = pair_types.keys()
                # ptypes = config.event_types.keys()
                all_results = ResultsRE(options.output[1])
                all_results.corpus = corpus
                all_results.path = options.output[1]
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=False)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "rules":
                    model = RuleClassifier(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p, test=True)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                # results.save(options.output[1] + "_" + p.lower() + ".pickle")
                # results.load_corpus(options.goldstd[0])
                results.path = options.output[1] + "_" + p.lower()
                goldset = get_gold_ann_set(
                    paths[options.goldstd[0]]["format"],
                    paths[options.goldstd[0]]["annotations"], "all", p,
                    paths[options.goldstd[0]]["text"])
                get_relations_results(results, options.models, goldset[1], [],
                                      [])
                if options.ptype == "all":
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[
                            did].pairs += results.document_pairs[did].pairs
            if options.ptype == "all":
                goldset = get_gold_ann_set(
                    paths[options.goldstd[0]]["format"],
                    paths[options.goldstd[0]]["annotations"], "all", "all",
                    paths[options.goldstd[0]]["text"])
                get_relations_results(all_results, options.models, goldset[1],
                                      [], [])
                write_seedev_results(all_results, options.output[1])
        elif options.actions == "train_sentences":  #and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in pair_types:
                    print p
                    tps, fps, fns = corpus.train_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (fps + tps)
                        recall = 1.0 * tps / (fns + tps)
                        fscore = 2.0 * precision * recall / (recall +
                                                             precision)
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                #print [a/len(config.pair_types) for a in avg]
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
            else:
                res = corpus.train_sentence_classifier(options.ptype)
                print res
            corpus.save(paths[options.goldstd[0]]["corpus"])
        elif options.actions == "test_sentences":  #and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in pair_types:
                    print p
                    tps, fps, fns = corpus.test_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (fps + tps)
                        recall = 1.0 * tps / (fns + tps)
                        fscore = 2.0 * precision * recall / (recall + precision)
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                #print [a/len(config.pair_types) for a in avg]
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
        #else:
        #    res = corpus.test_sentence_classifier(options.ptype)
        #    print res
        elif options.actions == "evaluate_ner":
            if os.path.exists(options.output[1] + ".pickle"):
                results = pickle.load(open(options.output[1] + ".pickle",
                                           'rb'))
                results.load_corpus(options.goldstd[0])
                results.path = options.output[1]
            logging.info("loading gold standard %s" %
                         paths[options.goldstd[0]]["annotations"])
            for t in all_entity_types:
                print t
                results.path = options.output[1] + "_" + t
                goldset = get_gold_ann_set(
                    paths[options.goldstd[0]]["format"],
                    paths[options.goldstd[0]]["annotations"], t, options.ptype,
                    paths[options.goldstd[0]]["text"])
                get_results(results, options.models + "_" + t, goldset[0], {},
                            {})

        corpus.save(paths[options.goldstd[0]]["corpus"])

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Code example #6
File: evaluate.py  Project: AndreLamurias/IBEnt
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("action", default="evaluate",
                      help="Actions to be performed.")
    parser.add_argument("goldstd", default="chemdner_sample",
                        help="Gold standard to be used.",
                        choices=paths.keys())
    parser.add_argument("--corpus", dest="corpus",
                      default="data/chemdner_sample_abstracts.txt.pickle",
                      help="format path")
    parser.add_argument("--results", dest="results", help="Results object pickle.", nargs='+')
    parser.add_argument("--models", dest="models", help="model destination path, without extension", default="combined")
    parser.add_argument("--ensemble", dest="ensemble", help="name/path of ensemble classifier", default="combined")
    parser.add_argument("--chebi", dest="chebi", help="Chebi mapping threshold.", default=0, type=float)
    parser.add_argument("--ssm", dest="ssm", help="SSM threshold.", default=0, type=float)
    parser.add_argument("--measure", dest="measure", help="semantic similarity measure", default="simui")
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"),
    parser.add_argument("--rules", default=[], nargs='+', help="aditional post processing rules")
    parser.add_argument("--features", default=["chebi", "case", "number", "greek", "dashes", "commas", "length", "chemwords", "bow"],
                        nargs='+', help="aditional features for ensemble classifier")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default=None)
    parser.add_argument("--external", action="store_true", default=False, help="Run external evaluation script, depends on corpus type")
    parser.add_argument("--output", dest="output", help="Final output", default=None)
    options = parser.parse_args()

    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.info("Processing action {0} on {1}".format(options.action, options.goldstd))
    results_list = []
    for results_path in options.results:
        logging.info("loading results %s" % results_path + ".pickle")
        if os.path.exists(results_path + ".pickle"):
            results = pickle.load(open(results_path + ".pickle", 'rb'))
            results.load_corpus(options.goldstd)
            results.path = results_path
            results_list.append(results)
        else:
            print "results not found"
            print results_path
            sys.exit()

    if options.action in ("combine", "train_ensemble", "test_ensemble", "savetocorpus"):
        # merge the results of various results corresponding to different classifiers
        # the entities of each sentence are added according to the classifier of each result
        # every result should correspond to the same gold standard
        # save to the first results path
        #results.load_corpus(options.goldstd)
        #logging.info("combining results...")
        #results.combine_results(options.models, options.models + "_combined")
        #results.save(options.results + "_combined.pickle")
        base_result = results_list[0]
        for result in results_list[1:]:
            logging.info("adding {}...".format(result.path))
            base_result.add_results(result)

        if options.action == "combine":
            base_result.combine_results(options.etype, options.models)
            n_sentences, n_docs, n_entities, n_relations = 0, 0, 0, 0
            for did in base_result.corpus.documents:
                n_docs += 1
                for sentence in base_result.corpus.documents[did].sentences:
                    n_sentences += 1
                    for e in sentence.entities.elist[options.models]:
                        n_entities += 1
            logging.info("Combined {} docs, {} sentences, {} entities".format(n_docs, n_sentences, n_entities))
            base_result.save(options.models + ".pickle")
        elif options.action == "savetocorpus":
            base_result.corpus.save(options.output + ".pickle")
        elif options.action == "train_ensemble":
            pipeline = Pipeline(
                [
                    #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                    #('clf', SGDClassifier())
                    #('clf', svm.NuSVC(nu=0.01 ))
                    # ('clf', RandomForestClassifier(class_weight={False:1, True:1}, n_jobs=-1, criterion="entropy", warm_start=True))
                    #('clf', tree.DecisionTreeClassifier(criterion="entropy")),
                    #('clf', MultinomialNB())
                    #('clf', GaussianNB())
                    ('clf', svm.SVC(kernel="rbf", degree=2, C=1))
                    #('clf', DummyClassifier(strategy="constant", constant=True))
                ])
            print pipeline
            base_result.train_ensemble(pipeline, options.models, options.etype)
        elif options.action == "test_ensemble":
            pipeline = joblib.load("{}/{}/{}.pkl".format("models/ensemble/", options.models, options.models))
            print pipeline
            base_result.test_ensemble(pipeline, options.models, options.etype)
            base_result.save("results/" + options.models + ".pickle")

    elif options.action in ("evaluate", "evaluate_list", "count_entities"):
        counts = {}
        if options.action == "count_entities":
            for did in results_list[0].corpus.documents:
                for sentence in results_list[0].corpus.documents[did].sentences:
                    print sentence.entities.elist.keys()
                    if options.models in sentence.entities.elist:
                        for e in sentence.entities.elist[options.models]:
                            if e.type not in counts:
                                counts[e.type] = 0
                            counts[e.type] += 1
            print counts
            sys.exit()
        if "annotations" in paths[options.goldstd]:
            logging.info("loading gold standard %s" % paths[options.goldstd]["annotations"])
            goldset = get_gold_ann_set(paths[options.goldstd]["format"], paths[options.goldstd]["annotations"],
                                       options.etype, options.ptype, paths[options.goldstd]["text"])
        else:
            goldset = None
        logging.info("using thresholds: chebi > {!s} ssm > {!s}".format(options.chebi, options.ssm))
        #results.load_corpus(options.goldstd)
        #results.path = options.results
        ths = {"chebi": options.chebi}
        if options.ssm > 0:
            ths["ssm"] = options.ssm
        if options.action == "evaluate":
            for result in results_list:
                if options.ptype: # evaluate this pair type
                    get_relations_results(result, options.models, goldset[1], ths, options.rules)
                else: # evaluate an entity type
                    get_results(result, options.models, goldset[0], ths, options.rules)
            #if options.bceval:
            #    write_chemdner_files(results, options.models, goldset, ths, options.rules)
            #    evaluation = run_chemdner_evaluation(config.paths[options.goldstd]["cem"],
            #                                         options.results + ".tsv")
            #    print evaluation
        elif options.action == "evaluate_list": # ignore the spans, the gold standard is a list of unique entities
            for result in results_list:
                if options.ptype:
                    get_list_results(result, options.models, goldset[1], ths, options.rules, mode="re")
                else:
                    get_list_results(result, options.models, goldset[0], ths, options.rules)

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Code example #7
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify",  help="Actions to be performed.")
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"),
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("-o", "--output", "--format", dest="output",
                        nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = paths[options.goldstd]["format"]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        # corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann: #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(paths[options.goldstd]["corpus"])

    elif options.actions == "annotate": # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        # corpus.clear_annotations("all")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(paths[options.goldstd]["corpus"])
    else:
        #corpus = SeeDevCorpus("corpus/" + "&".join(options.goldstd))
        corpus_path = paths[options.goldstd[0]]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        basecorpus = pickle.load(open(corpus_path, 'rb'))
        corpus = SeeDevCorpus(corpus_path)
        corpus.documents = basecorpus.documents
        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)
        elif options.actions == "add_goldstandard":
            corpus.convert_entities_to_goldstandard()
            corpus.find_ds_relations()
            #corpus.save(config.paths[options.goldstd[0]]["corpus"])
        elif options.actions == "train_multiple":  # Train one classifier for each type of entity in this corpus
            # logging.info(corpus.subtypes)
            models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=all_entity_types)
            models.train_types()
        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = pair_types.keys()
                # ptypes = config.event_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=True)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "multir":
                    model = MultiR(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p)
                # model.train()
        # testing
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels)))
            models = TaggerCollection(basepath=options.models, subtypes=all_entity_types)
            models.load_models()
            results = models.test_types(corpus)
            final_results = results.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.ptype == "all":
                ptypes = pair_types.keys()
                # ptypes = config.event_types.keys()
                all_results = ResultsRE(options.output[1])
                all_results.corpus = corpus
                all_results.path = options.output[1]
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=False)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "rules":
                    model = RuleClassifier(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p, test=True)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                # results.save(options.output[1] + "_" + p.lower() + ".pickle")
                # results.load_corpus(options.goldstd[0])
                results.path = options.output[1] + "_" + p.lower()
                goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"], paths[options.goldstd[0]]["annotations"],
                                       "all", p, paths[options.goldstd[0]]["text"])
                get_relations_results(results, options.models, goldset[1],[], [])
                if options.ptype == "all":
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[did].pairs += results.document_pairs[did].pairs
            if options.ptype == "all":
                goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"], paths[options.goldstd[0]]["annotations"],
                                       "all", "all", paths[options.goldstd[0]]["text"])
                get_relations_results(all_results, options.models, goldset[1],[], [])
                write_seedev_results(all_results, options.output[1])
        elif options.actions == "train_sentences": #and evaluate
            if options.ptype == "all":
                avg = [0,0,0]
                for p in pair_types:
                    print p
                    tps, fps, fns = corpus.train_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (fps + tps)
                        recall = 1.0 * tps / (fns + tps)
                        fscore = 2.0 * precision * recall / (recall + precision)
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                #print [a/len(config.pair_types) for a in avg]
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
            else:
                res = corpus.train_sentence_classifier(options.ptype)
                print res
            corpus.save(paths[options.goldstd[0]]["corpus"])
        elif options.actions == "test_sentences": #and evaluate
            if options.ptype == "all":
                avg = [0,0,0]
                for p in pair_types:
                    print p
                    tps, fps, fns = corpus.test_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (fps + tps)
                        recall = 1.0 * tps / (fns + tps)
                        fscore = 2.0 * precision * recall / (recall + precision)
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                #print [a/len(config.pair_types) for a in avg]
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (recall + precision)
                print precision, recall, fscore
        #else:
        #    res = corpus.test_sentence_classifier(options.ptype)
        #    print res
        elif options.actions == "evaluate_ner":
            if os.path.exists(options.output[1] + ".pickle"):
                results = pickle.load(open(options.output[1] + ".pickle", 'rb'))
                results.load_corpus(options.goldstd[0])
                results.path = options.output[1]
            else:
                print "results not found"
                sys.exit()
            logging.info("loading gold standard %s" % paths[options.goldstd[0]]["annotations"])
            for t in all_entity_types:
                print t
                results.path = options.output[1] + "_" + t
                goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"],
                                           paths[options.goldstd[0]]["annotations"],
                                           t, options.ptype, paths[options.goldstd[0]]["text"])
                get_results(results, options.models + "_" + t, goldset[0], {}, {})

        corpus.save(paths[options.goldstd[0]]["corpus"])


    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Code example #8
0
File: main.py Project: lasigeBioTM/IBEnt
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify",  help="Actions to be performed.",
                      choices=["load_corpus", "annotate", "classify", "write_results", "write_goldstandard",
                               "train", "test", "train_multiple", "test_multiple", "train_matcher", "test_matcher",
                               "crossvalidation", "train_relations", "test_relations", "load_genia", "load_biomodel",
                               "merge_corpus", "tuples", "generate_data"])
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"),
    parser.add_argument("-i", "--input", dest="input", action="store",
                      default='''Administration of a higher dose of indinavir should be \\
considered when coadministering with megestrol acetate.''',
                      help="Text to classify.")
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                      default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                      help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the experiment")
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--entitysubtype", dest="subtype", help="subtype of entities to be considered", default="all")

    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("--annotated", action="store_true", default=False, dest="annotated",
                      help="True if the input has <entity> tags.")
    parser.add_argument("-o", "--output", "--format", dest="output",
                        nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite", "banner", "ensemble"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = paths[options.goldstd]["format"]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        #corenlp_process.kill()
        #corpus.load_genia() #TODO optional genia
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann: #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_genia":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_genia()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_biomodel":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_biomodel()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "tuples":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.info("converting to tuples...")
        corpus.to_tuple()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "annotate": # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.name = options.goldstd
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(paths[options.goldstd]["corpus"])
    else:
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = pickle.load(open(corpus_path, 'rb'))
            #logging.info("adding {} documents".format(len(documents)))
            # docs = this_corpus.documents
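            # cap the merge at 13000 documents per source corpus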
            docs = dict((k, this_corpus.documents[k]) for k in this_corpus.documents.keys()[:13000])
            corpus.documents.update(docs)
        if options.actions == "write_goldstandard":
            model = BiasModel(options.output[1])
            model.load_data(corpus, [])
            results = model.test()
            #results = ResultsNER(options.output[1])
            #results.get_ner_results(corpus, model)
            results.save(options.output[1] + ".pickle")
            #logging.info("saved gold standard results to " + options.output[1] + ".txt")
        elif options.actions == "merge_corpus":
            corpus.save(paths[options.output[1]]["corpus"])
        # training
        elif options.actions == "generate_data":
            corpus_path = paths[options.goldstd[0]]["corpus"]
            print "writing to " + options.goldstd[0] + "_event_time_contains.txt"
            with open(options.goldstd[0] + "_event_time_contains.txt", 'w') as datafile:
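                # one tab-separated line per event/time candidate pair: label, event id, event text,
                # time id, time text, and the sentence span from the event start to the time end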
                for sentence in corpus.get_sentences("goldstandard"):
                    sentence_entities = [entity for entity in sentence.entities.elist["goldstandard"]]
                    for pair in itertools.combinations(sentence_entities, 2):
                        if pair[0].type == "event" and pair[1].type == "time":
                            pair_label = (pair[1].eid, "temporal") in pair[0].targets
                            between_text = sentence.text[pair[0].start:pair[1].end]
                            datafile.write("{0}\t{1.original_id}\t{1.text}\t{2.original_id}\t{2.text}\t{3}\n".format(pair_label,
                                            pair[0], pair[1], between_text))

        elif options.actions == "train":
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype, subtype=options.subtype)
            elif options.crf == "ensemble":
                model = EnsembleModel(options.models, options.etype, goldstd=options.goldstd[0])
            features = feature_extractors.keys()
            if options.etype.startswith("time"):
                features = time_features
            elif options.etype.startswith("event"):
                features = event_features
            model.load_data(corpus, features, options.etype, subtype=options.subtype)
            model.train()
        elif options.actions == "train_matcher": # Train a simple classifier based on string matching
            model = MatcherModel(options.models, options.etype)
            model.train_list("temporal_list.txt")
            # TODO: term list option
            #model.train("TermList.txt")
        elif options.actions == "train_multiple": # Train one classifier for each type of entity in this corpus
            # logging.info(corpus.subtypes)
            models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=corpus.subtypes)
            models.train_types()
        elif options.actions == "train_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, modelname=options.tag)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag)
            #elif options.kernel == "stanfordre":
            #    model = StanfordRE(corpus, options.ptype)
            #elif options.kernel == "multir":
            #    model = MultiR(corpus, options.ptype)
            #elif options.kernel == "scikit":
            #    model = ScikitRE(corpus, options.ptype)
            #elif options.kernel == "crf":
            #    model = CrfSuiteRE(corpus, options.ptype)
            elif options.kernel == "mil":
                relations = set()
                with open("corpora/transmir/transmir_relations.txt") as rfile:
                    for l in rfile:
                        relations.add(tuple(l.strip().split('\t')))
                model = MILClassifier(corpus, options.ptype, relations, ner=options.models)
            model.train()
        # testing
        elif options.actions == "test":
            base_port = 9191
            if len(options.submodels) > 1:
                allresults = ResultSetNER(corpus, options.output[1])
                for i, submodel in enumerate(options.submodels):
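                    # each submodel runs its own Stanford NER tagger server on a consecutive port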
                    model = StanfordNERModel(options.models + "_" + submodel)
                    model.load_tagger(base_port + i)
                    # load data into the model format
                    model.load_data(corpus, feature_extractors.keys(), mode="test")
                    # run the classifier on the data
                    results = model.test(corpus, port=base_port + i)
                    allresults.add_results(results)
                    model.kill_process()
                # save the results to an object that can be read again, and log files to debug
                final_results = allresults.combine_results()
            else:
                if options.crf == "stanford":
                    model = StanfordNERModel(options.models + "_stanford", options.etype)
                elif options.crf == "crfsuite":
                    model = CrfSuiteModel(options.models + "_crfsuite", options.etype, subtype=options.subtype)
                elif options.crf == "banner":
                    model = BANNERModel(options.models, options.etype)
                elif options.crf == "ensemble":
                    model = EnsembleModel(options.models, options.etype, goldstd=options.goldstd[0])
                model.load_tagger()
                features = feature_extractors.keys()
                if options.etype.startswith("time"):
                    features = time_features
                elif options.etype.startswith("event"):
                    features = event_features
                model.load_data(corpus, features, options.etype, mode="test", subtype=options.subtype)
                final_results = model.test(corpus)
            #with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
            #    lines = final_results.corpus.write_chemdner_results(options.models, outfile)
            #final_results.lines = lines
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_matcher":
            if "mirna" in options.models:
                model = MirnaMatcher(options.models)
            else:
                model = MatcherModel(options.models, options.etype)
            results = ResultsNER(options.models)
            results.corpus, results.entities = model.test(corpus)
            allentities = set()
            for e in results.entities:
                allentities.add(results.entities[e].text)
            with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
                outfile.write('\n'.join(allentities))

            results.save(options.output[1] + ".pickle")
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels)))
            allresults = ResultSetNER(corpus, options.output[1])
            if len(options.submodels) < 2:
                models = TaggerCollection(basepath=options.models)
                models.load_models()
                results = models.test_types(corpus)
                final_results = results.combine_results()
            else:
                base_port = 9191
                for submodel in options.submodels:
                    models = TaggerCollection(basepath=options.models + "_" + submodel, baseport = base_port)
                    models.load_models()
                    results = models.test_types(corpus)
                    logging.info("combining results...")
                    submodel_results = results.combine_results()
                    allresults.add_results(submodel_results)
                    base_port += len(models.models)
                final_results = allresults.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, train=False, modelname=options.tag, ner=options.models)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag, ner=options.models)
            elif options.kernel == "rules":
                model = RuleClassifier(corpus, options.ptype, ner=options.models)
            elif options.kernel == "mirtex_rules":
                model = MirtexClassifier(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype, test=True)
            elif options.kernel == "mil":
                relations = set()
                with open("corpora/transmir/transmir_relations.txt") as rfile:
                    for l in rfile:
                        relations.add(tuple(l.strip().split('\t')))
                model = MILClassifier(corpus, options.ptype, relations, test=True, ner=options.models)
            model.load_classifier()
            model.test()
            results = model.get_predictions(corpus)
            results.save(options.output[1] + ".pickle")

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Code example #9
0
File: main.py Project: AndreLamurias/IBEnt
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify",  help="Actions to be performed.",
                      choices=["load_corpus", "annotate", "classify", "write_results", "write_goldstandard",
                               "train", "test", "train_multiple", "test_multiple", "train_matcher", "test_matcher",
                               "crossvalidation", "train_relations", "test_relations", "load_genia", "load_biomodel",
                               "merge_corpus"])
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers"),
    parser.add_argument("-i", "--input", dest="input", action="store",
                      default='''Administration of a higher dose of indinavir should be \\
considered when coadministering with megestrol acetate.''',
                      help="Text to classify.")
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                      default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                      help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the experiment")
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("--annotated", action="store_true", default=False, dest="annotated",
                      help="True if the input has <entity> tags.")
    parser.add_argument("-o", "--output", "--format", dest="output",
                        nargs=2, help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite", "banner"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk", help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = paths[options.goldstd]["format"]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        #corpus.load_genia() #TODO optional genia
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann: #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_genia":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_genia()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_biomodel":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_biomodel()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "annotate": # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(paths[options.goldstd]["corpus"])
    else:
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = pickle.load(open(corpus_path, 'rb'))
            corpus.documents.update(this_corpus.documents)
        if options.actions == "write_goldstandard":
            model = BiasModel(options.output[1])
            model.load_data(corpus, [])
            results = model.test()
            #results = ResultsNER(options.output[1])
            #results.get_ner_results(corpus, model)
            results.save(options.output[1] + ".pickle")
            #logging.info("saved gold standard results to " + options.output[1] + ".txt")
        elif options.actions == "merge_corpus":
            corpus.save(paths[options.output[1]]["corpus"])
        # training
        elif options.actions == "train":
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype)
            model.load_data(corpus, feature_extractors.keys(), options.etype)
            model.train()
        elif options.actions == "train_matcher": # Train a simple classifier based on string matching
            model = MatcherModel(options.models)
            model.train(corpus)
            # TODO: term list option
            #model.train("TermList.txt")
        elif options.actions == "train_multiple": # Train one classifier for each type of entity in this corpus
            # logging.info(corpus.subtypes)
            models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=corpus.subtypes)
            models.train_types()
        elif options.actions == "train_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, modelname=options.tag)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag)
            #elif options.kernel == "stanfordre":
            #    model = StanfordRE(corpus, options.ptype)
            #elif options.kernel == "multir":
            #    model = MultiR(corpus, options.ptype)
            #elif options.kernel == "scikit":
            #    model = ScikitRE(corpus, options.ptype)
            #elif options.kernel == "crf":
            #    model = CrfSuiteRE(corpus, options.ptype)
            elif options.kernel == "mil":
                relations = set()
                with open("corpora/transmir/transmir_relations.txt") as rfile:
                    for l in rfile:
                        relations.add(tuple(l.strip().split('\t')))
                model = MILClassifier(corpus, options.ptype, relations, ner=options.models)
            model.train()
        # testing
        elif options.actions == "test":
            base_port = 9191
            if len(options.submodels) > 1:
                allresults = ResultSetNER(corpus, options.output[1])
                for i, submodel in enumerate(options.submodels):
                    model = StanfordNERModel(options.models + "_" + submodel)
                    model.load_tagger(base_port + i)
                    # load data into the model format
                    model.load_data(corpus, feature_extractors.keys(), mode="test")
                    # run the classifier on the data
                    results = model.test(corpus, port=base_port + i)
                    allresults.add_results(results)
                    model.kill_process()
                # save the results to an object that can be read again, and log files to debug
                final_results = allresults.combine_results()
            else:
                if options.crf == "stanford":
                    model = StanfordNERModel(options.models, options.etype)
                elif options.crf == "crfsuite":
                    model = CrfSuiteModel(options.models, options.etype)
                elif options.crf == "banner":
                    model = BANNERModel(options.models, options.etype)
                model.load_tagger()
                model.load_data(corpus, feature_extractors.keys(), mode="test")
                final_results = model.test(corpus)
            #with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
            #    lines = final_results.corpus.write_chemdner_results(options.models, outfile)
            #final_results.lines = lines
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_matcher":
            if "mirna" in options.models:
                model = MirnaMatcher(options.models)
            else:
                model = MatcherModel(options.models)
            results = ResultsNER(options.models)
            results.corpus, results.entities = model.test(corpus)
            allentities = set()
            for e in results.entities:
                allentities.add(results.entities[e].text)
            with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
                outfile.write('\n'.join(allentities))

            results.save(options.output[1] + ".pickle")
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels)))
            allresults = ResultSetNER(corpus, options.output[1])
            if len(options.submodels) < 2:
                models = TaggerCollection(basepath=options.models)
                models.load_models()
                results = models.test_types(corpus)
                final_results = results.combine_results()
            else:
                base_port = 9191
                for submodel in options.submodels:
                    models = TaggerCollection(basepath=options.models + "_" + submodel, baseport = base_port)
                    models.load_models()
                    results = models.test_types(corpus)
                    logging.info("combining results...")
                    submodel_results = results.combine_results()
                    allresults.add_results(submodel_results)
                    base_port += len(models.models)
                final_results = allresults.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, train=False, modelname=options.tag, ner=options.models)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag, ner=options.models)
            elif options.kernel == "rules":
                model = RuleClassifier(corpus, options.ptype, ner=options.models)
            elif options.kernel == "mirtex_rules":
                model = MirtexClassifier(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype, test=True)
            elif options.kernel == "mil":
                relations = set()
                with open("corpora/transmir/transmir_relations.txt") as rfile:
                    for l in rfile:
                        relations.add(tuple(l.strip().split('\t')))
                model = MILClassifier(corpus, options.ptype, relations, test=True, ner=options.models)
            model.load_classifier()
            model.test()
            results = model.get_predictions(corpus)
            results.save(options.output[1] + ".pickle")

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)