Example #1
    def __read_dictionaries(dic_paths, read_function, string_tokenizer,
                            case_sensitive, stop_words):
        stop_words = DictionaryFeatureGenerator.__normalize_stop_words(
            stop_words)

        ret = []

        for dic_path in dic_paths:
            try:
                reader = read_function(dic_path)
                try:
                    name = DictionaryFeatureGenerator.__get_filename(dic_path)
                    words_set = DictionaryFeatureGenerator.construct_words_set(
                        reader, string_tokenizer, case_sensitive, stop_words)
                    generator = DictionaryFeatureGenerator(
                        name, words_set, case_sensitive)
                    ret.append(generator)
                finally:
                    reader.close()
            except Exception as e:
                traceback.print_exc()
                print_debug("Could not read dictionary: {}".format(dic_path),
                            e)
                continue

        print_verbose("Using dictionaries: {}".format(", ".join(
            (repr(x) for x in ret))))

        return ret
Example #2
    def read_predictions(self,
                         dataset,
                         predictionsfile,
                         classification_threshold=None):
        if classification_threshold is None:
            classification_threshold = self.classification_threshold

        values = []
        with predictionsfile:
            predictionsfile.seek(0)

            for line in predictionsfile:
                prediction = float(line.strip())
                print_verbose("  pred: " + str(prediction))

                if prediction > classification_threshold:
                    values.append(+1)
                else:
                    values.append(-1)

            if len(values) > 0:
                for index, edge in enumerate(dataset.edges()):
                    edge.pred_target = values[index]
            else:
                if next(dataset.edges(), None) is not None:
                    raise Exception(
                        "EMPTY PREDICTIONS FILE -- this may be due to too small a dataset or too few features. Predictions file: "
                        + predictionsfile.name)

        return dataset.form_predicted_relations()
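
The score-to-label mapping above is easy to get wrong at the boundary; as a minimal sketch (the threshold value 0.0 here is an assumption, not the class default), the same rule in isolation:

    # Hedged sketch of the mapping used in read_predictions above.
    def to_labels(scores, classification_threshold=0.0):
        return [+1 if s > classification_threshold else -1 for s in scores]

    assert to_labels([0.7, -0.2, 0.0]) == [+1, -1, -1]  # strict >, so 0.0 maps to -1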
Example #3
    def add_to_feature_set(self, feature_set, edge, feature_name, value=1):
        """
        Return True if feature was added to feature_set. False, otherwise

        If the feature_name is None, the feature is not added in anycase. See: self.mk_feature_name
        """
        if feature_name is None:
            return False

        else:
            feature_name = self.__set_final_name(feature_name)

            if not feature_set.is_locked:
                feature_index = feature_set.get(feature_name, None)

                if feature_index is None:
                    feature_index = len(feature_set)
                    feature_set[feature_name] = feature_index
                    print_verbose(
                        "Feature map: {} == {} -- _1st_ value: {}".format(
                            str(feature_index), feature_name, str(value)))

                edge.features[feature_index] = value
                return True

            else:
                feature_index = feature_set.get(feature_name, None)

                if feature_index is not None:
                    edge.features[feature_index] = value
                    return True
                else:
                    return False
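
The two branches above implement a growable-then-frozen feature map: while the feature set is unlocked (training), unseen feature names get fresh indices; once locked (prediction), unknown names are silently dropped. A toy sketch of that contract, with a hypothetical dict-based stand-in rather than the real nalaf class:

    # Illustrative only: a hypothetical stand-in for nalaf's feature set.
    class ToyFeatureSet(dict):
        def __init__(self):
            super().__init__()
            self.is_locked = False  # growable during training

    fs = ToyFeatureSet()
    fs["token_is_digit"] = len(fs)  # unlocked: a new name gets the next index
    fs.is_locked = True
    assert fs.get("unseen_feature") is None  # locked: unknown names are skipped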
Example #4
 def filter(self, documents):
     pycrf = PyCRFSuite(self.binary_model)
     for pmid, doc in documents:
         dataset = Dataset()
         dataset.documents[pmid] = doc
         self.pipeline.execute(dataset)
         self.labeler.label(dataset)
         pycrf.tag(dataset, MUT_CLASS_ID)
         PostProcessing().process(dataset)
         ExclusiveNLDefiner().define(dataset)
         total_nl_mentions = []
         for part in doc:
             # print(part.annotations)
             print_verbose('predicted_annotations:',
                           part.predicted_annotations)
             nl_mentions = [
                 (ann.text, ann.subclass, ann.confidence)
                 for ann in part.predicted_annotations
                 if ann.subclass != 0 and ann.confidence <= self.threshold
             ]
             total_nl_mentions += nl_mentions
         if any(total_nl_mentions):
             print('nl mentions', json.dumps(total_nl_mentions, indent=4))
             yield pmid, doc
         else:
             print_verbose('nothing found')
Example #5
 def __exit__(self, exc_type, exc_val, exc_tb):
     if self.cache:
         print_verbose('writing the cache {}'.format(self.cache_filename))
         if not os.path.exists(self.cache_directory):
             os.makedirs(self.cache_directory)
         with open(self.cache_filename, 'w') as file:
             json.dump(self.cache, file)
Example #6
    def __init__(self, model_file, n_bins=300):
        import numpy as np
        self.model = Word2Vec.load(model_file)

        data = np.vstack([self.model[word] for word in self.model.vocab])
        hist, self.bin_edges = np.histogram(data.flatten(), bins=n_bins)

        print_verbose('word embeddings loaded with vocab size:',
                      len(self.model.vocab))
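
The histogram's bin_edges presumably serve to discretize continuous embedding values into one of n_bins categorical features later on; a minimal sketch of that lookup with numpy.digitize (the data here is random and purely illustrative):

    import numpy as np

    # Sketch: recover a bin index for a single embedding value.
    bin_edges = np.histogram(np.random.randn(10000), bins=300)[1]
    value = 0.25
    bin_index = int(np.digitize(value, bin_edges))  # index of the bin the value falls into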
Example #7
    def execute(self, dataset):
        """
        :type dataset: nalaf.structures.data.Dataset
        """

        self.splitter.split(dataset)
        self.tokenizer.tokenize(dataset)
        for feature_generator in self.feature_generators:
            print_verbose('Apply feature generator:', type(feature_generator))
            feature_generator.generate(dataset)
Example #8
    def __init__(self, model_file):
        import numpy as np
        self.model = Word2Vec.load(model_file)

        data = np.vstack([self.model[word] for word in self.model.vocab])
        self.pos_means = np.average(data, axis=0, weights=(data > 0))
        self.neg_means = np.average(data, axis=0, weights=(data < 0))

        print_verbose('word embeddings loaded with vocab size:',
                      len(self.model.vocab))
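
Note that np.average with a boolean weights mask computes, per column, the mean over only the masked-in entries, so pos_means and neg_means above are the column-wise means of the positive and negative values respectively. A tiny self-check:

    import numpy as np

    data = np.array([[1.0, -2.0],
                     [3.0, -4.0],
                     [-5.0, 6.0]])
    pos_means = np.average(data, axis=0, weights=(data > 0))
    # column 0: mean of 1 and 3 -> 2.0; column 1: mean of 6 -> 6.0
    assert np.allclose(pos_means, [2.0, 6.0])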
Example #9
def get_word_embeddings_feature_generator(model_location=None,
                                          additive=None,
                                          multiplicative=None):
    """
    :returns: nalaf.features.embeddings.WordEmbeddingsFeatureGenerator
    """
    global _SINGLETON_WE_GENERATOR

    if _SINGLETON_WE_GENERATOR is None:
        additive = 0 if additive is None else additive
        multiplicative = 1 if multiplicative is None else multiplicative

        import tarfile

        import pkg_resources
        import requests
        from nalaf.features.embeddings import WordEmbeddingsFeatureGenerator
        from nalaf import print_verbose, print_warning

        if model_location is None:
            # D=100, no discretization, epoch=1, window=10
            last_model = "word_embeddings_2016-03-28"
            we_model = pkg_resources.resource_filename(
                'nala.data', os.path.join(last_model, 'word_embeddings.model'))
            if not os.path.exists(we_model):
                print_warning(
                    'Downloading Word Embeddings Model (this may take a long time). Expected path: '
                    + we_model)
                # TODO requests doesn't support ftp, but better use: ftp://rostlab.org/jmcejuela/...last_model...
                tar = '{}.tar.gz'.format(last_model)
                model_url = '{}/{}'.format('https://rostlab.org/~cejuela', tar)
                we_model_tar_gz = pkg_resources.resource_filename(
                    'nala.data', tar)

                response = requests.get(url=model_url, stream=True)
                with open(we_model_tar_gz, 'wb') as file:
                    for chunk in response.iter_content(8048):
                        if chunk:
                            print('.', end="", flush=True)
                            file.write(chunk)
                    print()
                # Unpack the model
                print_verbose('Extracting')

                with tarfile.open(we_model_tar_gz) as tar_file:
                    tar_file.extractall(
                        path=pkg_resources.resource_filename('nala.data', ''))
            _SINGLETON_WE_GENERATOR = WordEmbeddingsFeatureGenerator(
                we_model, additive, multiplicative)
        else:
            _SINGLETON_WE_GENERATOR = WordEmbeddingsFeatureGenerator(
                model_location, additive, multiplicative)

    return _SINGLETON_WE_GENERATOR
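
Typical use of this lazy singleton might look as follows (a sketch: `dataset` is an assumed, already tokenized nalaf Dataset, and `generate` is the feature-generator entry point seen in Example #7):

    # Hypothetical usage of the accessor above; the first call may download the model.
    generator = get_word_embeddings_feature_generator()
    generator.generate(dataset)  # adds word-embedding features to the dataset's tokens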
Example #10
 def export_ann_json(self, threshold_val=None):
     """
     Creates all Annotation files in the corresponding ann.json format.
     Description of ann.json-format: "https://github.com/tagtog/tagtog-doc/wiki/ann.json"
     :return:
     """
     for docid in self.data.documents.keys():
         fname = os.path.join(self.annjson_path, docid + ".ann.json")
         print_verbose(fname)
         with open(fname, 'w', encoding='utf-8') as f:
             json_obj = self.get_single_ann_json(threshold_val, docid)
             json.dump(json_obj, f)
Example #11
    def __init__(self,
                 dataset,
                 use_predicted,
                 to_save_to="resources/corpora/sample/anndoc",
                 who="ml:nalaf",
                 _annjson_folder="annjson",
                 _html_folder="html",
                 use_original_partids=True):
        """
        init function that does prepare annjson folder and html folder
        :param to_save_to: usually resources/corpora/[name of corpus]/anndoc/
        :type dataset: nalaf.structures.data.Dataset
        :param who:
        :param _annjson_folder:
        :param _html_folder:
        :return:
        """
        self.location = to_save_to
        """ root folder that documents are saved into """
        self.data = dataset
        """ the dataset to export """
        self.who = who
        """ annotator identifier """
        self.use_original_partids = use_original_partids
        self.use_predicted = use_predicted

        # Possibility to use instance without writing files to disk
        if to_save_to:
            # check for root folder for files to save to
            if not os.path.isdir(self.location):
                print_verbose("mkdir", os.path.abspath(self.location))
                try:
                    os.makedirs(self.location)
                except FileExistsError:
                    pass

            # create subfolders if not existent
            # annjson folder
            self.annjson_path = os.path.join(self.location, _annjson_folder)
            """ subfolder where ann.json files are saved into """
            if not os.path.isdir(self.annjson_path):
                os.mkdir(self.annjson_path)
            # html folder
            self.html_folder = os.path.join(self.location, _html_folder)
            """ subfolder where html files are saved into """
            if not os.path.isdir(self.html_folder):
                os.mkdir(self.html_folder)
Example #12
    def evaluate(self, dataset):
        """
        :type dataset: nala.structures.data.Dataset
        :returns Evaluations
        """

        subcounts = ['tp', 'fp', 'fn']
        counts = {docid: dict.fromkeys(subcounts, 0) for docid in dataset.documents.keys()}

        print_verbose()

        for docid, doc in dataset.documents.items():
            if self.evaluate_only_on_edges_plausible_relations:
                # a set would be better, but so far Relation is unhashable
                relations_search_space = list(dataset.plausible_relations_from_generated_edges())
            else:
                relations_search_space = None

            gold = doc.map_relations(use_predicted=False, relation_type=self.rel_type, entity_map_fun=self.entity_map_fun, relations_search_space=relations_search_space).keys()
            pred = doc.map_relations(use_predicted=True, relation_type=self.rel_type, entity_map_fun=self.entity_map_fun).keys()

            for r_pred in pred:

                accept_decisions = {self.relation_accept_fun(r_gold, r_pred) for r_gold in gold}
                assert set.issubset(accept_decisions, {True, False, None}), "`relation_accept_fun` cannot return: " + str(accept_decisions)

                if True in accept_decisions:
                    # Count the true positives while iterating on gold
                    pass

                elif None in accept_decisions:
                    # Ignore as documented
                    pass

                else:
                    # either False or the set is empty, meaning that there are no gold annotations
                    print_debug("    ", docid, ": FALSE POSITIV", r_pred)
                    counts[docid]['fp'] += 1

            for r_gold in gold:

                r_preds = [r_pred for r_pred in pred if self.relation_accept_fun(r_gold, r_pred)]

                if len(r_preds) > 0:  # we could also do any(...); we have this in place only for debugging purposes
                    print_verbose("    ", docid, ": true positive", r_gold)
                    counts[docid]['tp'] += 1

                else:
                    print_debug("    ", docid, ": FALSE NEGATIVE", r_gold)
                    counts[docid]['fn'] += 1

        print_verbose()

        evaluations = Evaluations()
        evaluations.add(EvaluationWithStandardError(self.rel_type, counts))
        return evaluations
Example #13
    def __enter__(self):
        self.cache_directory = os.path.join(os.path.expanduser('~'), '.nalaf')
        self.cache_filename = '{}_cache.json'.format(os.path.join(self.cache_directory, self.__class__.__name__))
        if os.path.exists(self.cache_filename):

            # if the file is too old reset the cache
            if self.is_timed and (time.time() - os.path.getctime(self.cache_filename)) > self.max_time_in_seconds:
                print_verbose('resetting the cache {}'.format(self.cache_filename))
                os.remove(self.cache_filename)
                self.cache = {}
            else:
                print_verbose('reading from cache {}'.format(self.cache_filename))
                with open(self.cache_filename) as f:
                    self.cache = json.load(f)
        else:
            print_verbose('no cache found {}'.format(self.cache_filename))
            self.cache = {}
        return self
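
Examples #5 and #13 together form a JSON file cache bound to a context manager: __enter__ loads (or resets) ~/.nalaf/<ClassName>_cache.json into self.cache, and __exit__ writes it back. A hedged sketch of the intended use, with a hypothetical subclass name:

    # Sketch: any class defining the __enter__/__exit__ pair above persists
    # its cache dict across runs.
    with SomeCachedTagger() as tagger:  # hypothetical subclass
        if "P12345" not in tagger.cache:
            tagger.cache["P12345"] = "expensive lookup result"
    # on exit, __exit__ dumps tagger.cache back to the JSON file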
Example #14
    def filter(self, documents, min_found=1, use_nala=False):
        """
        :type documents: collections.Iterable[(str, nalaf.structures.data.Document)]
        """

        _progress = 1
        _start_time = time.time()
        _total_time = 0

        _time_avg_per_pattern = 0
        _pattern_calls = 0
        _time_reg_pattern_total = 0
        _time_max_pattern = 0
        _low_performant_pattern = ""

        # NLDefiners init
        exclusive_definer = ExclusiveNLDefiner()
        _e_array = [0, 0, 0]
        inclusive_definer = InclusiveNLDefiner()
        _i_array = [0, 0]

        last_found = 0
        crf = PyCRFSuite(self.location_binary_model)

        # counter_to_stop_for_caching = 0

        for pmid, doc in documents:
            # if any part of the document contains any of the keywords
            # yield that document

            # if counter_to_stop_for_caching > 400:
            #     break
            # counter_to_stop_for_caching += 1
            # print(counter_to_stop_for_caching)

            part_offset = 0
            data_tmp = Dataset()
            data_tmp.documents[pmid] = doc
            data_nala = deepcopy(data_tmp)
            NLTKSplitter().split(data_tmp)
            # data_tmvar = TmVarTagger().generate_abstracts([pmid])
            if use_nala:
                self.pipeline.execute(data_nala)
                self.labeler.label(data_nala)
                crf.tag(data_nala, MUT_CLASS_ID)
                PostProcessing().process(data_nala)
                ExclusiveNLDefiner().define(data_nala)

            used_regexs = {}

            positive_sentences = 0
            for i, x in enumerate(doc.parts):
                # print("Part", i)
                sent_offset = 0
                cur_part = doc.parts.get(x)
                sentences = cur_part.sentences_

                for sent in sentences:
                    sent_length = len(sent)
                    new_text = sent.lower()
                    new_text = re.sub(r'[./\-(){}\[\],%]', ' ', new_text)
                    # new_text = re.sub('\W+', ' ', new_text)

                    found_in_sentence = False

                    for i, reg in enumerate(self.patterns):
                        _lasttime = time.time()  # time start var
                        match = reg.search(new_text)

                        # debug bottleneck patterns
                        _time_current_reg = time.time() - _lasttime  # time end var
                        _pattern_calls += 1  # pattern calls already occurred
                        _time_reg_pattern_total += _time_current_reg  # total time spent on searching with patterns
                        if _time_reg_pattern_total > 0:
                            _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls  # avg spent time per pattern call
                        # todo create pattern performance eval for descending amount of recognized patterns
                        # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                        #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                        # if _time_max_pattern < _time_current_reg:
                        #     _time_max_pattern = _time_current_reg
                        #     _low_performant_pattern = reg.pattern
                        #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                        # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                        #     if _time_current_reg > _time_avg_per_pattern * 10:
                        #         # print(_time_avg_per_pattern, _time_current_reg)
                        #         f.write("BAD_PATTERN\n")
                        #         f.write(sent + "\n")
                        #         f.write(new_text + "\n")
                        if match:
                            # if pmid in data_tmvar.documents:
                            #     anti_doc = data_tmvar.documents.get(pmid)
                            nala_doc = data_nala.documents.get(pmid)

                            start = part_offset + sent_offset + match.span()[0]
                            end = part_offset + sent_offset + match.span()[1]
                            # print("TmVar is not overlapping?:", not anti_doc.overlaps_with_mention(start, end))
                            # print(not nala_doc.overlaps_with_mention(start, end, annotated=False))

                            if reg.pattern in used_regexs:
                                used_regexs[reg.pattern] += 1
                            else:
                                used_regexs[reg.pattern] = 1
                            print(color.PURPLE + new_text.replace(
                                match.group(), color.BOLD + color.DARKCYAN +
                                color.UNDERLINE + match.group() + color.END +
                                color.PURPLE) + color.END)
                            if not found_in_sentence:
                                positive_sentences += 1
                                found_in_sentence = True
                                # if not anti_doc.overlaps_with_mention(start,
                                #                                       end) \
                                #         and not nala_doc.overlaps_with_mention(start, end, annotated=False):
                                #     _e_result = exclusive_definer.define_string(
                                #         new_text[match.span()[0]:match.span()[1]])
                                #     _e_array[_e_result] += 1
                                #     _i_result = inclusive_definer.define_string(
                                #         new_text[match.span()[0]:match.span()[1]])
                                #     _i_array[_i_result] += 1
                                # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                                # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))

                                # last_found += 1
                                # found_in_sentence = True
                                # else:
                                #     # if nala not used only tmvar considered
                                #     if not anti_doc.overlaps_with_mention(start, end):
                                #         _e_result = exclusive_definer.define_string(
                                #             new_text[match.span()[0]:match.span()[1]])
                                #         _e_array[_e_result] += 1
                                #         _i_result = inclusive_definer.define_string(
                                #             new_text[match.span()[0]:match.span()[1]])
                                #         _i_array[_i_result] += 1
                                #         # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                                #         # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                                #         last_found += 1
                                #         found_in_sentence = True

                            if use_nala:
                                nala_found_mention = nala_doc.overlaps_with_mention(
                                    start, end, annotated=False)
                                if nala_found_mention:
                                    print_verbose(nala_found_mention)
                                    if nala_found_mention.subclass > 0 and nala_found_mention.confidence <= self.threshold:
                                        yield pmid, doc

                        if time.time() - _lasttime > 1:
                            print_verbose('time intensive regex', i)
                    sent_offset += 2 + sent_length

                    # for per sentence positives
                    if found_in_sentence:
                        positive_sentences += 1

                part_offset += sent_offset
            if use_nala:
                for part in nala_doc:
                    for ann in part.predicted_annotations:
                        if ann.subclass > 0:
                            print_verbose(part.text[:ann.offset] + color.BOLD +
                                          ann.text + color.END +
                                          part.text[ann.offset +
                                                    len(ann.text):])
                            positive_sentences += min_found
            _old_time = _start_time
            _start_time = time.time()
            _one_time = _start_time - _old_time

            if _one_time > 0.3 and positive_sentences > min_found:
                _progress += 1
                _total_time += _one_time

            _time_per_doc = _total_time / _progress
            print_verbose(
                "PROGRESS: {:.2f} secs total -- avg time per positive document:"
                " {:.2f} secs".format(_total_time, _time_per_doc))
            print_debug('used regular expressions:',
                        json.dumps(used_regexs, indent=4))
            if positive_sentences >= min_found:
                last_found = 0
                print_verbose('YEP', pmid)
                yield pmid, doc
            else:
                print_verbose('NOPE', pmid)
Example #15
 def __init__(self, model_file, additive=0, multiplicative=1):
     self.model = Word2Vec.load(model_file)
     self.additive = additive
     self.multiplicative = multiplicative
      print_verbose('word embeddings loaded with vocab size:',
                   len(self.model.vocab))
Example #16
    def evaluate(self, dataset):
        """
        :type dataset: nalaf.structures.data.Dataset
        :returns (tp, fp, fn, precision, recall, f_measure): (int, int, int, float, float, float)

        Calculates precision, recall and subsequently F1 measure, defined as:
            * precision: number of correctly predicted items as a percentage of the total number of predicted items:
                len(predicted items that are also real) / len(predicted),
                i.e. tp / (tp + fp)
            * recall: number of correctly predicted items as a percentage of the total number of correct items:
                len(real items that are also predicted) / len(real),
                i.e. tp / (tp + fn)
        """
        TOTAL = EntityEvaluator.TOTAL_LABEL
        labels = [TOTAL]

        # find all possible subclasses or otherwise full classes
        labels += list(set(__class__._labelize(e) for e in dataset.entities()))
        labels += list(
            set(__class__._labelize(e) for e in dataset.predicted_entities()))

        docids = dataset.documents.keys()
        subcounts = ['tp', 'fp', 'fn']
        counts = {
            label: {docid: dict.fromkeys(subcounts, 0)
                    for docid in docids}
            for label in labels
        }

        for docid, doc in dataset.documents.items():
            for partid, part in doc.parts.items():

                gold_anns = set(
                    filter(None,
                           (self.entity_map_fun(e) for e in part.annotations)))
                pred_anns = set(
                    filter(None, (self.entity_map_fun(e)
                                  for e in part.predicted_annotations)))

                for pred in pred_anns:
                    accept_decisions = {
                        self.entity_accept_fun(gold, pred)
                        for gold in gold_anns
                    }
                    assert accept_decisions.issubset({True, False, None}), \
                        "did not expect: " + str(accept_decisions)

                    if True in accept_decisions:
                        # Count the true positives while iterating on gold
                        pass

                    elif None in accept_decisions:
                        pass

                    else:
                        # either False or the set is empty, meaning that there are no gold annotations
                        print_debug("    ", docid, ": FALSE POSITIVE", pred)
                        counts[TOTAL][docid]['fp'] += 1
                        counts[__class__._labelize(pred)][docid]['fp'] += 1

                for gold in gold_anns:

                    accept_decisions = {
                        self.entity_accept_fun(gold, pred)
                        for pred in pred_anns
                    }

                    if True in accept_decisions:
                        print_verbose("    ", docid, ": true positive", gold)
                        counts[TOTAL][docid]['tp'] += 1
                        counts[__class__._labelize(gold)][docid]['tp'] += 1

                    elif "UNKNOWN:" in gold:  # Pass when unknown normalization
                        pass

                    else:
                        print_debug("    ", docid, ": FALSE NEGATIVE", gold)
                        counts[TOTAL][docid]['fn'] += 1
                        counts[__class__._labelize(gold)][docid]['fn'] += 1

        evaluations = Evaluations()

        for label in labels:
            evaluations.add(EvaluationWithStandardError(label, counts[label]))

        return evaluations
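
From the tp/fp/fn counts gathered per document, precision, recall and F1 follow directly per the docstring's definitions; a worked example with made-up counts:

    # Worked example (assumed counts, not real output).
    tp, fp, fn = 8, 2, 4
    precision = tp / (tp + fp)  # 8 / 10 = 0.8
    recall = tp / (tp + fn)     # 8 / 12 ~= 0.667
    f1 = 2 * precision * recall / (precision + recall)  # ~= 0.727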
Example #17
    def evaluate(self, dataset):
        """
        :type dataset: nalaf.structures.data.Dataset
        :returns (tp, fp, fn, tp_overlapping, precision, recall, f_measure): (int, int, int, int, float, float, float)

        Calculates precision, recall and subsequently F1 measure, defined as:
            * precision: number of correctly predicted items as a percentage of the total number of predicted items:
                len(predicted items that are also real) / len(predicted),
                i.e. tp / (tp + fp)
            * recall: number of correctly predicted items as a percentage of the total number of correct items:
                len(real items that are also predicted) / len(real),
                i.e. tp / (tp + fn)
            * possibly considers overlapping matches as well
        """

        TOTAL = MentionLevelEvaluator.TOTAL_LABEL
        labels = [TOTAL]

        def labelize(e):
            """
            Use this to represent an entity subclass as string and, if this is None or False (but not 0!), represent the entity with its class_id

            Convert to subclasses / classes ids to avoid the misstep of comparing possible subclass '0' with False, which in python breaks the universe
            --> info: https://twitter.com/juanmirocks/status/802209750612054016
            """
            return str(e.subclass) if str(
                e.subclass) not in ['None', 'False'] else str(e.class_id)

        if self.subclass_analysis:
            # find all possible subclasses or otherwise full classes

            subclasses = set(labelize(e) for e in dataset.entities())
            subclasses.update(
                set(labelize(e) for e in dataset.predicted_entities()))

            for x in subclasses:
                labels.append(x)

        docids = dataset.documents.keys()
        subcounts = ['tp', 'fp', 'fn', 'fp_ov', 'fn_ov']
        counts = {
            label: {docid: dict.fromkeys(subcounts, 0)
                    for docid in docids}
            for label in labels
        }

        for docid, doc in dataset.documents.items():
            for partid, part in doc.parts.items():

                overlap_real = {label: [] for label in labels}
                overlap_predicted = {label: [] for label in labels}

                Entity.equality_operator = 'overlapping'
                for ann_a in part.annotations:
                    for ann_b in part.predicted_annotations:
                        if ann_a == ann_b:  # equal according to the overlapping equality operator (not exact)
                            overlap_real[TOTAL].append(ann_a)
                            overlap_predicted[TOTAL].append(ann_b)

                            if self.subclass_analysis:
                                if labelize(ann_a) != labelize(ann_b):
                                    print_debug(
                                        'overlapping subclasses do not match',
                                        ann_a.subclass, ann_b.subclass)
                                    ann_b.subclass = ann_a.subclass

                                overlap_real[labelize(ann_a)].append(ann_a)
                                overlap_predicted[labelize(ann_b)].append(
                                    ann_b)

                Entity.equality_operator = 'exact'
                for ann in part.predicted_annotations:
                    if ann in part.annotations:
                        counts[TOTAL][docid]['tp'] += 1
                        print_verbose("    ", docid, ": TRUE POSITVE", ann)

                        if self.subclass_analysis:
                            counts[labelize(ann)][docid]['tp'] += 1

                    else:
                        counts[TOTAL][docid]['fp'] += 1

                        if ann in overlap_predicted[TOTAL]:
                            counts[TOTAL][docid]['fp_ov'] += 1
                        else:
                            print_debug("    ", docid, ": FALSE POSITIVE", ann)

                        if self.subclass_analysis:
                            counts[labelize(ann)][docid]['fp'] += 1
                            if ann in overlap_predicted[labelize(ann)]:
                                counts[labelize(ann)][docid]['fp_ov'] += 1

                for ann in part.annotations:
                    if ann not in part.predicted_annotations:
                        counts[TOTAL][docid]['fn'] += 1

                        if ann in overlap_real[TOTAL]:
                            counts[TOTAL][docid]['fn_ov'] += 1
                        else:
                            print_debug("    ", docid, ": FALSE NEGATIVE", ann)

                        if self.subclass_analysis:
                            counts[labelize(ann)][docid]['fn'] += 1
                            if ann in overlap_real[labelize(ann)]:
                                counts[labelize(ann)][docid]['fn_ov'] += 1

        evaluations = Evaluations()

        for label in labels:
            evaluations.add(EvaluationWithStandardError(label, counts[label]))

        return evaluations
Example #18
    def create_nalaf_entity(self,
                            tagger_entity,
                            original_text,
                            offset_adjustment=0):

        offset = tagger_entity["start"] + offset_adjustment
        end = tagger_entity["end"] + offset_adjustment
        entity_text = original_text[offset:end]

        e_class_id = n_class_id = None
        norms = []
        organisms_proteins = {}

        for norm in tagger_entity["ids"]:
            # assumption: the e_class_id and n_class_id once set will not change

            norm_id = norm["id"]

            if norm["type"] == "-3":
                e_class_id = self.organism_id
                n_class_id = self.taxonomy_norm_id
                norms.append(norm_id)

            elif norm["type"] == "-22":
                try:
                    matches_in = any(
                        are_go_parent_and_child(in_parent, norm_id)
                        for in_parent in self.filter_in_go_localizations)
                    matches_out = any(
                        are_go_parent_and_child(out_parent, norm_id)
                        for out_parent in self.filter_out_go_localizations)

                    if matches_in and not matches_out:
                        e_class_id = self.localization_id
                        n_class_id = self.go_norm_id
                        norms.append(norm_id)
                    else:  # reject
                        print_verbose("REJECT", norm_id,
                                      get_localization_name(norm_id))

                except KeyError:  # reject
                    print_verbose("REJECT", norm_id,
                                  get_localization_name(norm_id))

            elif norm["type"].startswith("uniprot_ac:"):
                organism = int(norm["type"].split(":")[1])
                organisms_proteins.setdefault(organism, set()).add(norm_id)

                e_class_id = self.protein_id
                n_class_id = self.uniprot_norm_id
                norms.append(norm_id)

            elif norm["type"].startswith("string_id:"):
                # Set e_class_id so as not to reject the protein; this happens in the few cases where the string id cannot be normalized to uniprot
                e_class_id = self.protein_id

        if not e_class_id:
            return None  # reject

        else:
            # convert to a set first, just in case the original tagger returns repeated ids (has happened)
            norms = set(norms)

            if self.remove_ambiguous_proteins:
                # Remove ambiguous ids; heuristic: different normalizations for a same organism are considered ambiguous
                for organism, proteins in organisms_proteins.items():
                    if len(proteins) > 1:
                        for ambiguous_protein in proteins:
                            norms.remove(ambiguous_protein)

            if not norms:
                norms = None
            else:
                norms = ",".join(norms)

            if n_class_id:
                norms_dic = {n_class_id: norms}
            else:
                norms_dic = None

            pred_entity = Entity(class_id=e_class_id,
                                 offset=offset,
                                 text=entity_text,
                                 norms=norms_dic)

            return pred_entity