Esempio n. 1
0
class TaggerCollection(object):
    """
    Collection of tagger classifiers used to train and test specific subtype models.

    One StanfordNERModel is kept per entity subtype in ``self.models`` (keyed
    by subtype name), next to a general "all" model in ``self.basemodel``.
    The list of subtypes is selected from the prefix of the last
    "/"-separated component of ``basepath``.
    """
    # Subtype labels available for each supported family of corpora.
    CHEMDNER_TYPES = [
        "IDENTIFIER", "MULTIPLE", "FAMILY", "FORMULA", "SYSTEMATIC",
        "ABBREVIATION", "TRIVIAL"
    ]
    GPRO_TYPES = ["NESTED", "IDENTIFIER", "FULL_NAME", "ABBREVIATION"]
    DDI_TYPES = ["drug", "group", "brand", "drug_n"]

    def __init__(self, basepath, baseport=9191, **kwargs):
        """
        :param basepath: base path of the models; the prefix of its last
                         "/"-separated component selects the subtype list
        :param baseport: first port; the i-th subtype model uses baseport + i
        :param kwargs: may carry "corpus", the corpus used by train_types
        """
        self.models = {}
        self.basepath = basepath
        self.corpus = kwargs.get("corpus")
        self.baseport = baseport
        name = basepath.split("/")[-1]
        # "chemdner+ddi" has to be checked before the plain "chemdner" prefix.
        if name.startswith("chemdner+ddi"):
            self.types = self.DDI_TYPES + self.CHEMDNER_TYPES + [
                "chemdner", "ddi"
            ]
        elif name.startswith("ddi"):
            self.types = self.DDI_TYPES + ["all"]
        elif name.startswith(("chemdner", "cemp")):
            self.types = ["all"] + self.CHEMDNER_TYPES
        elif name.startswith("gpro"):
            self.types = self.GPRO_TYPES + ["all"]
        else:
            # Unrecognised prefix: no subtype models are handled.
            self.types = []
        self.basemodel = StanfordNERModel(self.basepath, "all")

    def train_types(self):
        """
        Train one model for each subtype listed in self.types.

        The data of self.corpus is loaded once into the general model and
        then copied into every subtype model before it is trained. Trained
        models are stored in self.models under their subtype name.
        """
        self.basemodel.load_data(self.corpus,
                                 feature_extractors.keys(),
                                 subtype="all")
        for t in self.types:
            typepath = self.basepath + "_" + t
            model = StanfordNERModel(typepath,
                                     subtypes=self.basemodel.subtypes)
            model.copy_data(self.basemodel, t)
            logging.info("training subtype %s", t)
            model.train()
            self.models[t] = model

    def load_models(self):
        """
        Create the model of every subtype and load its tagger.

        The tagger of the i-th subtype is loaded on port self.baseport + i and
        the model is stored in self.models under the subtype name.
        """
        for i, t in enumerate(self.types):
            model = StanfordNERModel(self.basepath + "_" + t,
                                     t,
                                     subtypes=self.basemodel.subtypes)
            model.load_tagger(self.baseport + i)
            self.models[t] = model

    def process_type(self, modelst, t, corpus, basemodel, basepath, port):
        """
        Test a single subtype model on a corpus.

        :param modelst: model of the subtype
        :param t: subtype name, used only in the log messages
        :param corpus: corpus to classify
        :param basemodel: model whose already loaded data is copied to modelst
        :param basepath: not used by this method
        :param port: port passed on to the test call of the model
        :return: whatever modelst.test returns
        """
        # load data only for one model since this takes at least 5 minutes each time
        logging.debug("{}: copying data...".format(t))
        # NOTE(review): train_types passes the subtype as a second argument to
        # copy_data, here it is left out - confirm this is intended.
        modelst.copy_data(basemodel)
        logging.debug("{}: testing...".format(t))
        res = modelst.test(corpus, port)
        logging.info("{}:done...".format(t))
        return res

    def test_types(self, corpus):
        """
        Classify the corpus with multiple classifiers from different subtypes
        :param corpus: corpus to classify
        :return ResultSetNER object with the results obtained for the models
        """
        # TODO: parallelize
        results = ResultSetNER(corpus, self.basepath)
        self.basemodel.load_data(corpus, feature_extractors.keys())
        tasks = [(self.models[t], t, corpus, self.basemodel, self.basepath,
                  self.baseport + i) for i, t in enumerate(self.types)]

        all_results = [self.process_type(*task) for task in tasks]
        logging.info("adding results...")
        # Add the result objects themselves; the previous enumerate() unpacking
        # was swapped and handed the integer index to add_results.
        for res in all_results:
            results.add_results(res)
        return results
Esempio n. 2
0
def run_crossvalidation(goldstd_list,
                        corpus,
                        model,
                        cv,
                        crf="stanford",
                        entity_type="all",
                        cvlog="cv.log"):
    """
    Run a cross-validation of an entity tagger over the documents of a corpus.

    The document ids are shuffled and split with chunks(); for each of the cv
    folds a model is trained on the other chunks, tested on the held-out one
    and evaluated with get_results against the gold annotations of the
    held-out documents. Per-fold and average precision/recall are printed and
    the merged results of all folds are evaluated once more at the end.

    NOTE(review): the chunk size is len(documents) / cv rounded down, so when
    the division is not exact chunks() presumably yields more than cv chunks
    and the surplus documents are only ever used for training - confirm.

    :param goldstd_list: keys of config.paths naming the gold standards
    :param corpus: object with a ``documents`` dict and a ``path`` string
    :param model: base path (string) of the models and results
    :param cv: number of folds
    :param crf: "stanford" or "crfsuite"
    :param entity_type: entity type given to the models and the gold loader
    :param cvlog: path of a log file that is created (truncated) by the run
    :raises ValueError: if crf is not one of the supported implementations
    """
    # Reject an unknown implementation up front; it used to leave the model as
    # None and fail with an AttributeError after the corpora had been built.
    if crf == "stanford":
        model_class = StanfordNERModel
    elif crf == "crfsuite":
        model_class = CrfSuiteModel
    else:
        raise ValueError("unsupported crf implementation: {!r}".format(crf))

    # The log file is only created/truncated, nothing is written to it, so the
    # handle is released right away instead of being leaked.
    open(cvlog, 'w').close()

    # random.shuffle needs a real list (dict.keys() is a view on Python 3).
    doclist = list(corpus.documents.keys())
    random.shuffle(doclist)
    size = int(len(doclist) / cv)
    # list() keeps the indexing and slicing below valid if chunks() is lazy.
    sublists = list(chunks(doclist, size))
    logging.debug("Chunks:")
    logging.debug(sublists)
    p, r = [], []
    all_results = ResultsNER(model)
    all_results.path = model + "_results"
    for nlist in range(cv):
        testids = sublists[nlist]
        trainids = list(itertools.chain.from_iterable(
            sublists[:nlist] + sublists[nlist + 1:]))
        print('CV{} - test set: {}; train set: {}'.format(
            nlist, len(testids), len(trainids)))
        train_corpus = Corpus(
            corpus.path + "_train",
            documents={did: corpus.documents[did]
                       for did in trainids})
        test_corpus = Corpus(
            corpus.path + "_test",
            documents={did: corpus.documents[did]
                       for did in testids})
        basemodel = model + "_cv{}".format(nlist)
        port = 9191 + nlist  # one port per fold
        logging.debug('CV{} - test set: {}; train set: {}'.format(
            nlist, len(test_corpus.documents), len(train_corpus.documents)))

        # train
        logging.info('CV{} - TRAIN'.format(nlist))
        train_model = model_class(basemodel, entity_type)
        train_model.load_data(train_corpus, feature_extractors.keys())
        train_model.train()

        # test
        logging.info('CV{} - TEST'.format(nlist))
        test_model = model_class(basemodel, entity_type)
        test_model.load_tagger(port=port)
        test_model.load_data(test_corpus,
                             feature_extractors.keys(),
                             mode="test")
        final_results = test_model.test(test_corpus, port=port)
        if crf == "stanford":
            test_model.kill_process()
        final_results.basepath = basemodel + "_results"
        final_results.path = basemodel

        all_results.entities.update(final_results.entities)
        all_results.corpus.documents.update(final_results.corpus.documents)

        # evaluate
        logging.info('CV{} - EVALUATE'.format(nlist))
        test_idset = set(testids)  # constant-time membership checks
        test_goldset = set()
        # NOTE(review): the arguments of get_gold_ann_set do not depend on the
        # fold; it could be called once before the loop if it has no side
        # effects.
        for gs in goldstd_list:
            goldset = get_gold_ann_set(config.paths[gs]["format"],
                                       config.paths[gs]["annotations"],
                                       entity_type, "pairtype",
                                       config.paths[gs]["text"])
            # The first element holds the annotations; each one starts with
            # the id that is matched against the test document ids.
            for g in goldset[0]:
                if g[0] in test_idset:
                    test_goldset.add(g)
        precision, recall = get_results(final_results, basemodel, test_goldset,
                                        {}, [])
        p.append(precision)
        r.append(recall)
    pavg = sum(p) / cv
    ravg = sum(r) / cv
    print("precision: average={} all={}".format(
        str(pavg), '|'.join([str(pp) for pp in p])))
    print("recall: average={}  all={}".format(str(ravg),
                                              '|'.join([str(rr) for rr in r])))

    # Evaluate the merged results of all folds against the complete gold set.
    all_goldset = set()
    for gs in goldstd_list:
        # Same five-argument call as in the folds (the pair type used to be
        # left out, which shifted the text path into its position); the pair
        # type is left empty for the overall evaluation.
        goldset = get_gold_ann_set(config.paths[gs]["format"],
                                   config.paths[gs]["annotations"],
                                   entity_type, "",
                                   config.paths[gs]["text"])
        # Only the first element holds the annotations; iterating the returned
        # tuple itself would add its members instead of the annotations.
        all_goldset.update(goldset[0])
    get_results(all_results, model, all_goldset, {}, [])
Esempio n. 3
0
def run_crossvalidation(goldstd_list, corpus, model, cv, crf="stanford", entity_type="all", cvlog="cv.log"):
    """
    Run a cross-validation of an entity tagger over the documents of a corpus.

    The document ids are shuffled and split with chunks(); for each of the cv
    folds a model is trained on the other chunks, tested on the held-out one
    and evaluated with get_results against the gold annotations of the
    held-out documents. Per-fold and average precision/recall are printed and
    the merged results of all folds are evaluated once more at the end.

    NOTE(review): the chunk size is len(documents) / cv rounded down, so when
    the division is not exact chunks() presumably yields more than cv chunks
    and the surplus documents are only ever used for training - confirm.

    :param goldstd_list: keys of config.corpus_paths.paths naming the gold standards
    :param corpus: object with a ``documents`` dict and a ``path`` string
    :param model: base path (string) of the models and results
    :param cv: number of folds
    :param crf: "stanford" or "crfsuite"
    :param entity_type: entity type given to the models and the gold loader
    :param cvlog: path of a log file that is created (truncated) by the run
    :raises ValueError: if crf is not one of the supported implementations
    """
    # Pick the model class once; an unknown value used to leave the model as
    # None and fail with an AttributeError only after the corpora were built.
    if crf == "stanford":
        model_class = StanfordNERModel
    elif crf == "crfsuite":
        model_class = CrfSuiteModel
    else:
        raise ValueError("unsupported crf implementation: {!r}".format(crf))

    # Nothing is written to the log file; create/truncate it as before but
    # close the handle instead of leaking it.
    open(cvlog, 'w').close()

    # random.shuffle needs a real list (dict.keys() is a view on Python 3).
    doclist = list(corpus.documents.keys())
    random.shuffle(doclist)
    size = int(len(doclist)/cv)
    # list() keeps the indexing and slicing below valid if chunks() is lazy.
    sublists = list(chunks(doclist, size))
    logging.debug("Chunks:")
    logging.debug(sublists)
    paths = config.corpus_paths.paths
    p, r = [], []
    all_results = ResultsNER(model)
    all_results.path = model + "_results"
    for nlist in range(cv):
        testids = sublists[nlist]
        trainids = list(itertools.chain.from_iterable(sublists[:nlist] + sublists[nlist+1:]))
        print('CV{} - test set: {}; train set: {}'.format(nlist, len(testids), len(trainids)))
        train_corpus = Corpus(corpus.path + "_train", documents={did: corpus.documents[did] for did in trainids})
        test_corpus = Corpus(corpus.path + "_test", documents={did: corpus.documents[did] for did in testids})
        basemodel = model + "_cv{}".format(nlist)
        port = 9191 + nlist  # one port per fold
        logging.debug('CV{} - test set: {}; train set: {}'.format(nlist, len(test_corpus.documents), len(train_corpus.documents)))

        # train
        logging.info('CV{} - TRAIN'.format(nlist))
        train_model = model_class(basemodel, entity_type)
        train_model.load_data(train_corpus, feature_extractors.keys())
        train_model.train()

        # test
        logging.info('CV{} - TEST'.format(nlist))
        test_model = model_class(basemodel, entity_type)
        test_model.load_tagger(port=port)
        test_model.load_data(test_corpus, feature_extractors.keys(), mode="test")
        final_results = test_model.test(test_corpus, port=port)
        if crf == "stanford":
            test_model.kill_process()
        final_results.basepath = basemodel + "_results"
        final_results.path = basemodel

        all_results.entities.update(final_results.entities)
        all_results.corpus.documents.update(final_results.corpus.documents)

        # evaluate
        logging.info('CV{} - EVALUATE'.format(nlist))
        test_idset = set(testids)  # constant-time membership checks
        test_goldset = set()
        # NOTE(review): the arguments of get_gold_ann_set do not depend on the
        # fold; it could be called once before the loop if it has no side
        # effects.
        for gs in goldstd_list:
            goldset = get_gold_ann_set(paths[gs]["format"], paths[gs]["annotations"], entity_type,
                                       "pairtype", paths[gs]["text"])
            # The first element holds the annotations; each one starts with
            # the id that is matched against the test document ids.
            for g in goldset[0]:
                if g[0] in test_idset:
                    test_goldset.add(g)
        precision, recall = get_results(final_results, basemodel, test_goldset, {}, [])
        p.append(precision)
        r.append(recall)
    pavg = sum(p)/cv
    ravg = sum(r)/cv
    print("precision: average={} all={}".format(str(pavg), '|'.join([str(pp) for pp in p])))
    print("recall: average={}  all={}".format(str(ravg), '|'.join([str(rr) for rr in r])))

    # Evaluate the merged results of all folds against the complete gold set.
    all_goldset = set()
    for gs in goldstd_list:
        goldset = get_gold_ann_set(paths[gs]["format"], paths[gs]["annotations"], entity_type, "",
                                   paths[gs]["text"])
        all_goldset.update(goldset[0])
    get_results(all_results, model, all_goldset, {}, [])