    def __merge(self, dataset, annotators):
        for doc_id in list(dataset.documents):
            doc = dataset.documents[doc_id]
            annotator_entities = {}
            # find the annotations that are marked complete by any annotator
            filenames = []

            doc_is_read = False
            annotatable_parts = set()
            for annotator in annotators:
                # there is either zero or one matching file per annotator
                for filename in glob.glob(os.path.join(os.path.join(self.directory, annotator), '*{}*.ann.json'.format(doc_id))):
                    with open(filename, 'r', encoding='utf-8') as file:
                        ann_json = json.load(file)
                        if ann_json['anncomplete'] or not self.delete_incomplete_docs:
                            doc_is_read = True
                            filenames.append(filename)
                            annotatable_parts |= set(ann_json['annotatable']['parts'])
                            annotator_entities[annotator] = ann_json['entities']
            if self.filter_below_iaa_threshold and not self.__is_acceptable(doc_id, doc, filenames):
                del dataset.documents[doc_id]
                continue

            # if there is at least one set of annotations
            if len(annotator_entities) > 0:
                Entity.equality_operator = 'exact_or_overlapping'
                if self.entity_strategy == 'priority':
                    merged = reduce(self.__merge_priority, [annotator_entities[x] for x in self.priority
                                                            if x in annotator_entities])
                else:
                    merged = reduce(self.__merge_pair, annotator_entities.values())

                for entity in merged:
                    try:
                        part = doc.parts[entity['part']]
                    except KeyError:
                        # TODO: Remove once the tagtog bug is fixed
                        break
                    if not self.read_only_class_id or entity['classId'] == self.read_only_class_id:
                        if self.is_predicted:
                            part.predicted_annotations.append(
                                Entity(entity['classId'], entity['offsets'][0]['start'], entity['offsets'][0]['text']))
                        else:
                            part.annotations.append(
                                Entity(entity['classId'], entity['offsets'][0]['start'], entity['offsets'][0]['text']))

                # delete parts that are not annotatable
                part_ids_to_del = []
                for part_id, part in doc.parts.items():
                    if part_id not in annotatable_parts:
                        part_ids_to_del.append(part_id)
                for part_id in part_ids_to_del:
                    del doc.parts[part_id]

            # Delete docs with no ann.jsons
            elif not doc_is_read:
                del dataset.documents[doc_id]

            else:
                continue  # keep the document
    def __merge_priority(self, entities_x, entities_y):
        merged = []
        merged_indices_x = []
        merged_indices_y = []

        for index_x, entity_x in enumerate(entities_x):
            for index_y, entity_y in enumerate(entities_y):
                if entity_x['part'] == entity_y['part']:
                    ann_x = Entity(entity_x['classId'],
                                   entity_x['offsets'][0]['start'],
                                   entity_x['offsets'][0]['text'])
                    ann_y = Entity(entity_y['classId'],
                                   entity_y['offsets'][0]['start'],
                                   entity_y['offsets'][0]['text'])

                    # if they are the same or overlap,
                    # use the first one since it has higher priority
                    if ann_x == ann_y:
                        if index_x not in merged_indices_x and index_y not in merged_indices_y:
                            merged_indices_x.append(index_x)
                            merged_indices_y.append(index_y)
                            merged.append(entity_x)

        self.__append_union(merged, entities_x, entities_y)
        return merged
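For illustration, a self-contained sketch (toy tagtog-style dicts with made-up offsets, not the real data) of what the 'priority' strategy above amounts to: fold the per-annotator entity lists with functools.reduce, keeping higher-priority entities and adding lower-priority ones only where they do not overlap.

from functools import reduce

# Toy per-annotator entity lists, highest priority first (all values made up).
entities_by_annotator = [
    [{'classId': 'e_1', 'part': 'p1', 'offsets': [{'start': 0, 'text': 'BRCA1'}]}],
    [{'classId': 'e_1', 'part': 'p1', 'offsets': [{'start': 0, 'text': 'BRCA1 gene'}]},
     {'classId': 'e_1', 'part': 'p1', 'offsets': [{'start': 20, 'text': 'TP53'}]}],
]

def overlaps(a, b):
    # simple overlap test on the first offset of each entity dict
    sa, sb = a['offsets'][0]['start'], b['offsets'][0]['start']
    ea, eb = sa + len(a['offsets'][0]['text']), sb + len(b['offsets'][0]['text'])
    return a['part'] == b['part'] and sa < eb and sb < ea

def merge_priority(higher, lower):
    # keep every higher-priority entity; add lower-priority ones only if they do not overlap
    return higher + [e for e in lower if not any(overlaps(e, h) for h in higher)]

merged = reduce(merge_priority, entities_by_annotator)
# -> keeps 'BRCA1' (higher priority) and adds the non-overlapping 'TP53'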
Example #3
    def test_DocumentLevelRelationEvaluator_arbitrary_relation_accept_fun_order_does_not_matter(self):

        entity_map_fun = (lambda e: "SAME")

        def relation_accept_fun(gold, pred):
            print('gold:', gold, ' <---> ', 'pred:', pred)
            return gold == pred

        r1 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "yin"), Entity(STUB_E_ID_2, 0, "yan"))
        r2 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "yan"), Entity(STUB_E_ID_2, 0, "yin"))

        self.assertTrue(relation_accept_fun(r1.map(entity_map_fun), r1.map(entity_map_fun)))
        self.assertTrue(relation_accept_fun(r1.map(entity_map_fun), r2.map(entity_map_fun)))
        self.assertTrue(relation_accept_fun(r2.map(entity_map_fun), r1.map(entity_map_fun)))

        evaluator = DocumentLevelRelationEvaluator(STUB_R_ID_1, entity_map_fun, relation_accept_fun)

        (dataset, part) = self._create_basic_dataset()

        # -

        part.relations = [r1]
        part.predicted_relations = [r1]

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        print(evaluation)
        self.assertEqual(evaluation.tp, 1)
        self.assertEqual(evaluation.fn, 0)
        self.assertEqual(evaluation.fp, 0)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)
Example #4
    def test_DocumentLevelRelationEvaluator_parts_irrelevant(self):

        evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

        dataset = Dataset()
        doc_1 = Document()
        part_1 = Part('_irrelevant_ PART *1*')
        dataset.documents['doc_1'] = doc_1
        doc_1.parts['part_1'] = part_1

        part_2 = Part('_irrelevant_ PART *2*')
        dataset.documents['doc_1'] = doc_1
        doc_1.parts['part_2'] = part_2

        part_1.relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"), Entity(STUB_E_ID_2, 0, "maynard")),
        ]

        # -

        part_2.predicted_relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_2, 0, "maynard"), Entity(STUB_E_ID_1, 0, "TOOL")),
        ]

        self._apply_pipeline(dataset)

        # ---

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 1)
        self.assertEqual(evaluation.fn, 0)
        self.assertEqual(evaluation.fp, 0)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)
    def annotate(self, dataset):
        """
        :type dataset: nalaf.structures.data.Dataset
        """
        for filename in glob.glob(str(self.directory + "/*.ann")):
            with open(filename, 'r', encoding='utf-8') as file:
                reader = csv.reader(file, delimiter='\t')

                pmid = os.path.basename(filename).replace('.ann', '')
                document = dataset.documents[pmid]
                for row in reader:
                    if row[0].startswith('T'):
                        entity_type, start, end = row[1].split()
                        start = int(start)
                        end = int(end)

                        title_len = len(document.parts['title'].text)
                        if 0 <= start < end <= title_len:
                            part = document.parts['title']
                        else:
                            part = document.parts['abstract']
                            start -= title_len + 1
                            end -= title_len + 1

                        if entity_type == 'SNP' or entity_type == 'RS':
                            ann = Entity(self.mut_class_id, start, row[2])
                            part.annotations.append(ann)

                        elif self.gene_class_id is not None and entity_type == 'Gene':
                            ann = Entity(self.gene_class_id, start, row[2])
                            part.annotations.append(ann)
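A standalone sketch (toy strings only, no nalaf) of the offset arithmetic used above: a global .ann offset that falls past the title is shifted into abstract-local coordinates by subtracting len(title) + 1 for the separating character.

title = "BRCA1 mutations in breast cancer"
abstract = "We report a novel BRCA1 variant here."
full_text = title + " " + abstract   # the global offsets refer to this concatenation

start, end = 51, 56                  # toy global offsets pointing at "BRCA1" in the abstract
if 0 <= start < end <= len(title):
    part_text = title
else:
    part_text = abstract
    start -= len(title) + 1          # shift past the title and the separator
    end -= len(title) + 1

assert full_text[51:56] == part_text[start:end] == "BRCA1"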
Example #6
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        part1 = Part('Sentence 1: e_1_yolo may be related to e_2_tool plus hey, e_2_coco. Sentence 2: e_1_nin. Sentence 3: e_2_musk. Sentence 4: nothing')

        entities = [
            # Sent 1
            Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=12, text='e_1_yolo', confidence=0),
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=39, text='e_2_tool', confidence=0),
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=58, text='e_2_coco', confidence=0),
            # Sent 2
            Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=80, text='e_1_nin', confidence=0),
            # Sent 3
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=101, text='e_2_musk', confidence=0),
            # Sent 4

        ]

        for e in entities:
            part1.annotations.append(e)

        cls.doc.parts['s1h1'] = part1

        cls.splitter = NLTKSplitter()
        cls.tokenizer = NLTK_TOKENIZER

        cls.splitter.split(cls.dataset)
        cls.tokenizer.tokenize(cls.dataset)

        # assert False, str(list(cls.dataset.sentences()))
        assert 4 == len(list(cls.dataset.sentences())), str(list(cls.dataset.sentences()))
    def __append_union(self, merged, entities_x, entities_y):
        # if the strategy is union,
        # append the ones that are not overlapping with the already merged ones
        if self.strategy == 'union':
            existing = [Entity(entity['classId'], entity['offsets'][0]['start'], entity['offsets'][0]['text'])
                        for entity in merged]
            for entity in chain(entities_x, entities_y):
                ann = Entity(entity['classId'], entity['offsets'][0]['start'], entity['offsets'][0]['text'])
                if ann not in existing:
                    merged.append(entity)
Example #8
    def setUp(self):
        self.dataset = StringReader(
            'some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text').read()
        NLTKSplitter().split(self.dataset)
        TmVarTokenizer().tokenize(self.dataset)
        part = list(self.dataset.parts())[0]
        part.annotations.append(
            Entity(STUB_ENTITY_CLASS_ID, 15, 'c.2708_2711delTTAG'))
        part.annotations.append(
            Entity(STUB_ENTITY_CLASS_ID, 35, 'p.V903GfsX905'))
    def annotate(self, dataset):
        """
        :type dataset: nalaf.structures.data.Dataset
        """
        for filename in glob.glob(str(self.directory + "/*.ann")):
            with open(filename, 'r', encoding='utf-8') as file:
                reader = csv.reader(file, delimiter='\t')

                docid, partid = os.path.basename(filename).replace('.ann', '').split('-', 1)

                for row in reader:
                    if row[0].startswith('T'):
                        entity_type, start, end = row[1].split()
                        text = row[2]

                        if entity_type == 'mutation':
                            ann = Entity(self.entity_class_id, int(start),
                                         text)
                            if self.is_predicted:
                                dataset.documents[docid].parts[
                                    partid].predicted_annotations.append(ann)
                            else:
                                dataset.documents[docid].parts[
                                    partid].annotations.append(ann)
Example #10
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()

        with open(self.corpus_file, encoding='utf-8') as file:

            for row in file:
                columns = row.split("\t")

                docid = columns[0]
                typ = columns[1]
                start = columns[2]
                end = columns[3]
                entity_text = columns[7]

                class_id = None
                if typ == 'Mutation':
                    class_id = self.mut_class_id
                elif typ == 'AminoacidResidue':
                    class_id = self.residue_class_id

                if class_id:
                    document = dataset.documents.get(docid, Document())

                    part = Part(entity_text)
                    document.parts[typ + '|' + start + '|' + end] = part

                    part.annotations.append(
                        Entity(class_id, int(start), entity_text))

                    dataset.documents[docid] = document

        return dataset
Example #11
    def _parse_pubtator(doc_id, doc, response_text):
        lines = response_text.strip().splitlines()
        if len(lines) >= 2 and len(doc.parts) == 2:
            tm_var_title = re.search(r'{}\|t\|(.*)'.format(doc_id),
                                     lines[0]).group(1)
            tm_var_abstract = re.search(r'{}\|a\|(.*)'.format(doc_id),
                                        lines[1]).group(1)

            parts = iter(doc.parts.values())
            title = next(parts)
            abstract = next(parts)

            for line in lines[2:]:
                _, start, end, _, _, _ = line.split('\t')
                start = int(start)
                end = int(end)

                if 0 <= start < end <= len(tm_var_title):
                    part = title
                    tm_part = tm_var_title
                else:
                    part = abstract
                    tm_part = tm_var_abstract
                    start -= len(tm_var_title) + 1
                    end -= len(tm_var_title) + 1

                start, end = TmVarTagger._adjust_offsets(
                    part.text, tm_part, start, end)

                part.predicted_annotations.append(
                    Entity(MUT_CLASS_ID, start, part.text[start:end]))
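For context, a minimal standalone parse of a toy PubTator-style record (the record and its offsets are made up to match the regexes above); it shows the same title/abstract split without the nalaf data structures or the _adjust_offsets step.

import re

response_text = (
    "12345|t|A p.V600E mutation study\n"
    "12345|a|We describe the p.V600E substitution in detail.\n"
    "12345\t2\t9\tp.V600E\tMutation\tc|SUB|V|600|E\n"
)

lines = response_text.strip().splitlines()
title = re.search(r'\d+\|t\|(.*)', lines[0]).group(1)
abstract = re.search(r'\d+\|a\|(.*)', lines[1]).group(1)

for line in lines[2:]:
    _, start, end, text, *_ = line.split('\t')
    start, end = int(start), int(end)
    if 0 <= start < end <= len(title):
        source = title
    else:
        source = abstract
        start -= len(title) + 1
        end -= len(title) + 1
    assert source[start:end] == text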
Example #12
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        # TEXT = "123 45678"
        # POS  = "012345678"
        # ANN1 = " X       "
        # ANN2 = "     XXX "
        # PAR1 = "XXX      "
        # PAR1 = "    XXXXX"

        cls.part = Part(
            'Here is a random sentence for the benefit of your mamma')
        cls.entity = Entity(class_id=STUB_ENTITY_CLASS_ID,
                            offset=10,
                            text='random sentence',
                            confidence=0)
        cls.part.annotations.append(cls.entity)
        cls.doc.parts['s1h1'] = cls.part

        # Apply through pipeline

        NLTKSplitter().split(cls.dataset)
        NLTK_TOKENIZER.tokenize(cls.dataset)

        nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(nlp)
        cls.parser.parse(cls.dataset)
        # cls.part.percolate_tokens_to_entities()

        cls.sentence = cls.part.sentences[0]
Example #13
    def process(self, dataset, class_id=MUT_CLASS_ID):
        for doc_id, doc in dataset.documents.items():
            for part_id, part in doc.parts.items():
                self.__fix_issues(part)
                for regex in self.patterns:
                    for match in regex.finditer(part.text):
                        start = match.start()
                        end = match.end()
                        matched_text = part.text[start:end]
                        ann = Entity(class_id, start, matched_text)

                        Entity.equality_operator = 'exact_or_overlapping'
                        if ann not in part.predicted_annotations:
                            part.predicted_annotations.append(
                                Entity(class_id, start, matched_text))
                        Entity.equality_operator = 'overlapping'
                        if ann in part.predicted_annotations:
                            for index, ann_b in enumerate(
                                    part.predicted_annotations):
                                if ann == ann_b and len(matched_text) > len(
                                        ann_b.text):
                                    part.predicted_annotations[index] = ann

                to_delete = [
                    index
                    for index, ann in enumerate(part.predicted_annotations)
                    if any(r.search(ann.text) for r in self.negative_patterns)
                    or (not self.keep_silent and self.__is_silent(ann)) or
                    (not self.keep_unnumbered and not self._is_numbered(ann))
                ]

                part.predicted_annotations = [
                    ann for index, ann in enumerate(part.predicted_annotations)
                    if index not in to_delete
                ]

        # sanity check, make sure annotations match their offset
        for part in dataset.parts():
            for ann in part.predicted_annotations:
                assert ann.text == part.text[ann.offset:ann.offset +
                                             len(ann.text)]
                while ann.text[0] == ' ':
                    ann.offset += 1
                    ann.text = ann.text[1:]
                while ann.text[-1] == ' ':
                    ann.text = ann.text[:-1]
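A standalone sketch (plain tuples instead of Entity, a made-up pattern pair) of the core idea in process() above: collect regex matches from several patterns and, among overlapping matches, keep the longest one.

import re

text = "We found p.V600Efs in the sample."
patterns = [re.compile(r'p\.[A-Z]\d+[A-Z]'),        # matches the shorter 'p.V600E'
            re.compile(r'p\.[A-Z]\d+[A-Za-z]+')]    # matches the longer 'p.V600Efs'

matches = []
for regex in patterns:
    for m in regex.finditer(text):
        matches.append((m.start(), m.end(), m.group()))

def overlaps(a, b):
    return a[0] < b[1] and b[0] < a[1]

kept = []
for cand in matches:
    for i, existing in enumerate(kept):
        if overlaps(cand, existing):
            if len(cand[2]) > len(existing[2]):
                kept[i] = cand       # prefer the longer of two overlapping matches
            break
    else:
        kept.append(cand)            # no overlap with anything kept so far

print(kept)                          # [(9, 18, 'p.V600Efs')]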
    def __merge_pair(self, entities_x, entities_y):
        merged = []
        merged_indices_x = {}
        merged_indices_y = {}

        for index_x, entity_x in enumerate(entities_x):
            for index_y, entity_y in enumerate(entities_y):
                # if they have the same part_id
                if entity_x['part'] == entity_y['part']:
                    ann_x = Entity(entity_x['classId'],
                                   entity_x['offsets'][0]['start'],
                                   entity_x['offsets'][0]['text'])
                    ann_y = Entity(entity_y['classId'],
                                   entity_y['offsets'][0]['start'],
                                   entity_y['offsets'][0]['text'])

                    # if they are the same or overlap
                    if ann_x == ann_y:
                        # if neither of them has been matched before
                        if index_x not in merged_indices_x and index_y not in merged_indices_y:
                            if self.operator(len(ann_x.text), len(ann_y.text)):
                                merged.append(entity_x)
                                merged_indices_x[index_x] = len(merged), ann_x
                                merged_indices_y[index_y] = len(merged), ann_x
                            else:
                                merged.append(entity_y)
                                merged_indices_x[index_x] = len(merged), ann_y
                                merged_indices_y[index_y] = len(merged), ann_y
                        # if we already matched them before
                        else:
                            # try to see if we have a more suitable match now
                            if index_x in merged_indices_x:
                                index, ann_existing = merged_indices_x[index_x]
                            else:
                                index, ann_existing = merged_indices_y[index_y]
                            if self.operator(len(ann_x.text), len(ann_y.text)):
                                ann_new, entity_new = ann_x, entity_x
                            else:
                                ann_new, entity_new = ann_y, entity_y
                            if self.operator(len(ann_new.text),
                                             len(ann_existing.text)):
                                merged[index - 1] = entity_new

        self.__append_union(merged, entities_x, entities_y)
        return merged
Example #15
    def generate_abstracts(self, list_of_pmids):
        """
        Generates a list of documents from PMIDs using the tmTools REST API.
        Source: "http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/"
        :param list_of_pmids: strings
        :return nalaf.structures.Dataset: dataset
        """
        url_tmvar = 'http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/Mutation/{0}/JSON/'
        url_converter = 'http://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/'

        # load cache.json if exists
        if os.path.exists('cache.json'):
            with open('cache.json', 'r', encoding='utf-8') as f:
                tm_var = json.load(f)
        else:
            tm_var = {}

        for pmid in list_of_pmids:
            if pmid not in tm_var:  # if pmid was not already downloaded from tmTools
                req = requests.get(url_tmvar.format(pmid))
                try:
                    tm_var[pmid] = req.json()
                except ValueError:
                    pass
        # cache the tmVar annotations so we don't pull them every time
        with open('cache.json', 'w') as file:
            json.dump(tm_var, file, indent=4)

        # for key in tm_var:
        #     print(json.dumps(tm_var[key], indent=4))

        dataset = Dataset()
        for doc_id in list_of_pmids:
            if doc_id in tm_var:
                doc = Document()
                text = tm_var[doc_id]['text']
                part = Part(text)
                denotations = tm_var[doc_id]['denotations']
                annotations = []
                for deno in denotations:
                    ann = Entity(
                        class_id=self.mut_class_id,
                        offset=int(deno['span']['begin']),
                        text=text[deno['span']['begin']:deno['span']['end']])
                    annotations.append(ann)
                    # note should the annotations from tmvar go to predicted_annotations or annotations?
                part.annotations = annotations
                doc.parts['abstract'] = part
                dataset.documents[doc_id] = doc

        return dataset
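A self-contained sketch of the caching pattern used above (fetch is a stand-in for the tmTools request, and the file name is arbitrary): load the JSON cache if present, fetch only the missing PMIDs, and write the cache back.

import json
import os

def fetch(pmid):
    # stand-in for requests.get(url_tmvar.format(pmid)).json()
    return {"text": "toy abstract for " + pmid, "denotations": []}

cache_path = 'cache_demo.json'
cache = {}
if os.path.exists(cache_path):
    with open(cache_path, 'r', encoding='utf-8') as f:
        cache = json.load(f)

for pmid in ['111', '222']:
    if pmid not in cache:            # only fetch what is not cached yet
        cache[pmid] = fetch(pmid)

with open(cache_path, 'w', encoding='utf-8') as f:
    json.dump(cache, f, indent=4)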
Example #16
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        # TEXT = "123 45678"
        # POS  = "012345678"
        # ANN1 = " X       "
        # ANN2 = "     XXX "
        # PAR1 = "XXX      "
        # PAR1 = "    XXXXX"

        part1 = Part('123')
        part2 = Part('45678')
        ann1 = Entity(class_id=STUB_ENTITY_CLASS_ID,
                      offset=1,
                      text='2',
                      confidence=0)
        ann2 = Entity(class_id=STUB_ENTITY_CLASS_ID,
                      offset=1,
                      text='567',
                      confidence=1)
        ann1.subclass = 0
        ann2.subclass = 2
        part1.annotations.append(ann1)
        part2.annotations.append(ann2)
        cls.doc.parts['s1h1'] = part1
        cls.doc.parts['s2p1'] = part2

        doc2 = Document()
        doc3 = Document()
        doc3.parts['someid'] = Part('marmor stein und eisen')
        cls.dataset2 = Dataset()
        cls.dataset2.documents['newid'] = doc3
        cls.dataset2.documents['testid'] = doc2
Example #17
    def annotate(self, dataset):
        """
        :type dataset: nalaf.structures.data.Dataset
        """
        for filename in glob.glob(str(self.directory + "/*.ann")):
            with open(filename, 'r', encoding='utf-8') as file:
                reader = csv.reader(file, delimiter='\t')

                pmid = os.path.basename(filename).replace('.ann', '')
                document = dataset.documents[pmid]

                for row in reader:
                    if row[0].startswith('T'):
                        entity_type, start, end = row[1].split()
                        start = int(start)

                        if entity_type == 'SNP' or entity_type == 'RS':
                            ann = Entity(MUT_CLASS_ID, start, row[2])
                            document.parts['abstract'].annotations.append(ann)

                        elif entity_type == 'Gene':
                            ann = Entity(self.gene_class_id, start, row[2])
                            document.parts['abstract'].annotations.append(ann)
Example #18
    def test_DocumentLevelRelationEvaluator_default_entities_case_irrelevant(
            self):

        evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

        dataset = Dataset()
        doc_1 = Document()
        part_1 = Part('_irrelevant_')
        dataset.documents['doc_1'] = doc_1
        doc_1.parts['part_1'] = part_1

        part_1.relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"),
                     Entity(STUB_E_ID_2, 0, "maynard")),
        ]

        # -

        part_1.predicted_relations = [
            # empty
        ]

        self._apply_pipeline(dataset)

        # -

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 0)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 0.0)

        # ---

        part_1.predicted_relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"),
                     Entity(STUB_E_ID_2, 0, "maynard")),
        ]

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 1)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)

        # -

        part_1.predicted_relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "tool"),
                     Entity(STUB_E_ID_2, 0, "MAYNARD")),
        ]

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 1)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)
Example #19
    def test_DocumentLevelRelationEvaluator_arbitrary_relation_accept_fun_ignore_some_predictions(self):

        entity_map_fun = (lambda e: e.text)

        def relation_accept_fun(gold, pred):
            gold_pred_char_num = int(gold[-1])
            pred_last_char_num = int(pred[-1])
            print('gold:', gold, ' <---> ', 'pred:', pred,)

            if gold == pred:  # 1 == 1
                return True
            elif gold < pred:  # 1 < 2
                return None
            else:
                return False   # 1 !<= 0

            return gold == pred

        r1 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "1"))

        r2 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "1"))  # Accept
        r3 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "2"))  # Ignore
        r4 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "0"))  # Reject

        self.assertEqual(True, relation_accept_fun(r1.map(entity_map_fun), r2.map(entity_map_fun)))
        self.assertEqual(None, relation_accept_fun(r1.map(entity_map_fun), r3.map(entity_map_fun)))
        self.assertEqual(False, relation_accept_fun(r1.map(entity_map_fun), r4.map(entity_map_fun)))

        evaluator = DocumentLevelRelationEvaluator(STUB_R_ID_1, entity_map_fun, relation_accept_fun)

        (dataset, part) = self._create_basic_dataset()

        # -

        part.relations = [r1]
        part.predicted_relations = [r2, r4] + [r3, r3, r3, r3, r3]  # All the r3's should be ignored

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        print(evaluation)
        self.assertEqual(evaluation.tp, 1)
        self.assertEqual(evaluation.fn, 0)
        self.assertEqual(evaluation.fp, 1)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 0.6666666666666666)
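For reference, the expected F-measure above follows from the standard precision/recall arithmetic; this is only a sanity check of the number, not a claim about the evaluator's internals.

def f1(tp, fp, fn):
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

# tp=1, fp=1, fn=0, as asserted in the test above
assert abs(f1(tp=1, fp=1, fn=0) - 0.6666666666666666) < 1e-12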
Example #20
    def test_overlapping(self):

        e1 = Entity(class_id="e_x", offset=987, text="PKB/Akt")
        e2 = Entity(class_id="e_x", offset=987, text="PKB")

        Entity.equality_operator = 'exact_or_overlapping'

        print(e1.offset, e1.end_offset())
        print(e2.offset, e2.end_offset())

        self.assertEqual(e1, e2)
Example #21
    def _parse_json(doc_id, doc, response_text):
        try:
            for pred_part in json.loads(response_text, strict=False):
                partid = pred_part['sourceid']
                part = doc.parts[partid]
                for pred in pred_part['denotations']:
                    start = pred['span']['begin']
                    end = pred['span']['end']

                    start, end = TmVarTagger._adjust_offsets(
                        part.text, pred_part['text'], start, end)

                    part.predicted_annotations.append(
                        Entity(MUT_CLASS_ID, start, part.text[start:end]))
        except Exception:
            print("ERROR PARSING JSON", response_text)
            raise
Example #22
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()

        with open(self.corpus_file, encoding='utf-8') as file:
            documents = file.read().strip().split('\n\n')
            for document_text in documents:
                lines = document_text.strip().splitlines()

                first_line = re.search(r'(\d+)\|t\|(.*)', lines[0])
                doc_id = first_line.group(1)
                tmvar_title = first_line.group(2)
                tmvar_abstract = re.search(r'(\d+)\|a\|(.*)', lines[1]).group(2)

                document = Document()
                title = Part(tmvar_title)
                abstract = Part(tmvar_abstract)
                document.parts['title'] = title
                document.parts['abstract'] = abstract

                for line in lines[2:]:
                    _, start, end, _, _, _ = line.split('\t')
                    start = int(start)
                    end = int(end)

                    if 0 <= start < end <= len(tmvar_title):
                        part = title
                    else:
                        part = abstract
                        start -= len(tmvar_title) + 1
                        end -= len(tmvar_title) + 1

                    part.annotations.append(
                        Entity(self.mut_class_id, start, part.text[start:end]))

                dataset.documents[doc_id] = document

        return dataset
Example #23
    def _get_test_data(self, entity_sentence, assumed_tokens_words=None):
        if assumed_tokens_words is None:
            assumed_tokens_words = entity_sentence.split(' ')

        # Create dataset

        dataset = StringReader(entity_sentence).read()
        part = next(dataset.parts())
        entity = Entity(class_id=STUB_ENTITY_CLASS_ID,
                        offset=0,
                        text=entity_sentence)
        part.annotations.append(entity)

        # Apply through pipeline

        NLTKSplitter().split(dataset)
        NLTK_TOKENIZER.tokenize(dataset)
        self.parser.parse(dataset)

        # Rest

        sentences = part.sentences
        assert len(sentences) == 1
        sentence = sentences[0]

        assert len(assumed_tokens_words) == len(sentence)
        for (assumed_token_word, actual_token) in zip(assumed_tokens_words,
                                                      sentence):
            assert assumed_token_word == actual_token.word

        part.compute_tokens_depth()
        roots = Part.get_sentence_roots(sentence)
        for r in roots:
            self._assert_depth_eq(r, 0)

        part.set_entities_head_tokens()

        return (dataset, sentence, entity, roots)
Example #24
    def read(self):
        """
        read each text file in the directory, parse it, and create an instance of Document;
        form a dataset consisting of every document parsed and return it

        Note that the text files may contain multiple paragraphs. The reader
        converts these paragraphs into different parts. Because of necessary offset corrections,
        the reader reads at the same time both the content and the annotations.

        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()

        ids_per_file_array = [1]

        file_list = glob.glob(str(self.directory + "/*.txt"))

        for file_path in file_list:
            file_name = os.path.basename(file_path)

            docid, partid_prefix = file_name.replace('.txt', '').split('-', 1)
            # partid_prefix is not the final part id, since a single .txt file may be split into multiple parts

            if 'Abstract' in partid_prefix:
                is_abstract = True
            else:
                is_abstract = False

            with open(file_path, encoding='utf-8') as file:
                text_raw = file.read()

            text = text_raw.replace('** IGNORE LINE **\n', '')
            paragraph_list = text.split('\n\n')

            # initial offset for raw_text
            tot_offset = text_raw.count('** IGNORE LINE **\n') * 18
            offsets = [tot_offset]

            for i, text_part in enumerate(paragraph_list):
                # skip empty text (usually the last chunk after splitting on "\n\n")
                if text_part != "":
                    partid = "{}-p{}".format(partid_prefix, i + 1)

                    if docid in dataset:
                        dataset.documents[docid].parts[partid] = Part(
                            text_part, is_abstract=is_abstract)
                    else:
                        document = Document()
                        document.parts[partid] = Part(text_part,
                                                      is_abstract=is_abstract)
                        dataset.documents[docid] = document

                    # add offset for next paragraph
                    tot_offset += len(text_part) + 2
                    offsets.append(tot_offset)

            # drop the trailing offset (it points past the last part)
            del offsets[-1]

            # annotations
            with open(file_path.replace('.txt', '.ann'),
                      encoding='utf-8') as f:
                reader = csv.reader(f, delimiter='\t')
                for row in reader:
                    if row[0].startswith('T'):
                        entity_type, start, end = row[1].split()
                        start = int(start)
                        end = int(end)
                        text = row[2]

                        partid = None
                        part_index = None

                        for i in range(len(offsets) - 1):
                            if offsets[i + 1] > start:
                                part_index = i
                                break

                        if part_index is None:
                            part_index = len(offsets) - 1

                        partid = "{}-p{}".format(partid_prefix, part_index + 1)
                        real_start = start - offsets[part_index]
                        real_end = end - offsets[part_index]
                        calc_ann_text = document.parts[partid].text[
                            real_start:real_end]

                        if calc_ann_text != text:
                            print("   ERROR", docid, part_index, partid, start,
                                  offsets, real_start, "\n\t", text, "\n\t",
                                  calc_ann_text, "\n\t",
                                  document.parts[partid].text)

                        if entity_type == 'mutation':
                            ann = Entity(self.mut_class_id, real_start, text)
                            dataset.documents[docid].parts[
                                partid].annotations.append(ann)

                        elif entity_type == 'gene':
                            ann = Entity(self.gene_class_id, real_start, text)
                            dataset.documents[docid].parts[
                                partid].annotations.append(ann)

        return dataset
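A standalone sketch (toy text, hypothetical offsets) of the paragraph-offset bookkeeping this reader performs: record where each paragraph starts in the raw text, then map a global annotation offset to a (paragraph index, local offset) pair.

raw_text = "First paragraph about BRCA1.\n\nSecond paragraph about TP53."
paragraphs = raw_text.split('\n\n')

offsets, total = [], 0
for p in paragraphs:
    offsets.append(total)
    total += len(p) + 2              # +2 for the '\n\n' separator

def to_local(global_start):
    part_index = max(i for i, off in enumerate(offsets) if off <= global_start)
    return part_index, global_start - offsets[part_index]

start = raw_text.index("TP53")
part_index, local_start = to_local(start)
assert paragraphs[part_index][local_start:local_start + 4] == "TP53"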
Example #25
    def tag(self,
            dataset,
            annotated=False,
            uniprot=False,
            process_only_abstract=True):
        """
        :type dataset: nalaf.structures.data.Dataset
        :param annotated: if True then saved into annotations otherwise into predicted_annotations
        """
        with GNormPlus() as gnorm:
            for doc_id, doc in dataset.documents.items():
                if process_only_abstract:
                    genes, gnorm_title, gnorm_abstract = gnorm.get_genes_for_pmid(
                        doc_id, postproc=True)

                    if uniprot:
                        with Uniprot() as uprot:
                            list_of_ids = gnorm.uniquify_genes(genes)
                            genes_mapping = uprot.get_uniprotid_for_entrez_geneid(
                                list_of_ids)
                    else:
                        genes_mapping = {}

                    # find the title and the abstract
                    parts = iter(doc.parts.values())
                    title = next(parts)
                    abstract = next(parts)
                    adjustment_offsets = []
                    if title.text != gnorm_title:
                        adjustment_offsets += self.__find_offset_adjustments(
                            title.text, gnorm_title, 0)
                    if abstract.text != gnorm_abstract:
                        adjustment_offsets += self.__find_offset_adjustments(
                            abstract.text, gnorm_abstract, len(gnorm_title))

                    for start, end, text, gene_id in genes:
                        if 0 <= start < end <= len(title.text):
                            part = title
                        else:
                            part = abstract
                            # we have to readjust the offset since GnormPlus provides
                            # offsets for title and abstract together
                            offset = len(title.text) + 1
                            start -= offset
                            end -= offset

                        for adjustment_offset, adjustment in adjustment_offsets:
                            if start > adjustment_offset:
                                start -= adjustment

                        # open question: which confidence value to use for GNormPlus, since it supplies none (0.5 used here)
                        ann = Entity(class_id=self.predicts_classes[0],
                                     offset=start,
                                     text=text,
                                     confidence=0.5)
                        try:
                            norm_dict = {
                                self.predicts_classes[1]: gene_id,
                                self.predicts_classes[2]:
                                genes_mapping[gene_id]
                            }
                        except KeyError:
                            norm_dict = {self.predicts_classes[1]: gene_id}

                        norm_string = ''  # todo normalized_text (stemming ... ?)
                        ann.norms = norm_dict
                        ann.normalized_text = norm_string
                        if annotated:
                            part.annotations.append(ann)
                        else:
                            part.predicted_annotations.append(ann)
                else:
                    # todo this is not used for now anywhere, might need to be re-worked or excluded
                    # genes = gnorm.get_genes_for_text(part.text)
                    pass
Example #26
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        from functools import reduce
        dataset = Dataset()
        for filename in glob.glob(self.path + '/*.txt'):
            with open(filename, 'r') as f:
                data = f.read()

                content = data.split("\n")
                try:
                    pmid = int(content[0])
                except ValueError:
                    continue

                doc = Document()

                title = content[2]
                part_title = Part(title, is_abstract=True)
                body = content[4]
                part_abstract = Part(body, is_abstract=True)

                title_offset = len(str(pmid)) + 2  # +2 for the two newlines
                body_offset = title_offset + len(title) + 2  # +2 for the two newlines

                # temporary state for linking consecutive annotation spans
                current_annotation = []
                last_element = None

                # print(filename, pmid, title)
                with open(filename + '.ann', 'r') as fa:
                    tree = ET.parse(fa)
                    for element in tree.iterfind(
                            'Annotation/Annotation[@type]'):
                        # if gene annotation skip
                        if element.attrib['type'] == 'ge':
                            continue

                        # if there is no previous element yet (beginning of a new doc), save it as last_element and skip
                        if last_element is None:
                            last_element = element
                            continue

                        span = last_element.attrib['span'].split('..')
                        start = int(span[0])
                        end = int(span[1])
                        text = data[start:end]

                        if start >= body_offset:
                            norm_start = start - body_offset
                            norm_end = end - body_offset
                        else:
                            norm_start = start - title_offset
                            norm_end = end - title_offset

                        # TODO bugfix: still wrong if a space splits the whole annotation, e.g. case "#1632 T"
                        if end + 1 == int(element.attrib['span'].split('..')[0]):
                            if len(current_annotation) == 0:  # if no series of annotations is linked yet
                                current_annotation.append(norm_start)
                                current_annotation.append(norm_end)
                                current_annotation.append(text)
                                current_annotation.append(
                                    (start >= body_offset))  # if is_body
                            else:  # a series of linked annotations is already in progress
                                current_annotation[1] = norm_end
                                current_annotation[2] += text
                        else:
                            if len(current_annotation) > 0:
                                entity = Entity(self.mut_class_id,
                                                current_annotation[0],
                                                current_annotation[2])
                                if current_annotation[3]:
                                    part_abstract.annotations.append(entity)
                                else:
                                    part_title.annotations.append(entity)
                                current_annotation = []

                            entity = Entity(self.mut_class_id, norm_start,
                                            text)
                            if start >= body_offset:
                                part_abstract.annotations.append(entity)
                            else:
                                part_title.annotations.append(entity)

                        last_element = element

                    span = last_element.attrib['span'].split('..')
                    start = int(span[0])
                    end = int(span[1])
                    text = data[start:end]
                    if len(current_annotation) == 0:  # if no series of annotations is linked
                        if start >= body_offset:
                            norm_start = start - body_offset
                            is_body = True
                        else:
                            norm_start = start - title_offset
                            is_body = False

                        entity = Entity(self.mut_class_id, norm_start, text)

                        if is_body:
                            part_abstract.annotations.append(entity)
                        else:
                            part_title.annotations.append(entity)

                    else:  # a series of linked annotations is already in progress
                        current_annotation[2] += text
                        entity = Entity(self.mut_class_id,
                                        current_annotation[0],
                                        current_annotation[2])
                        if current_annotation[3]:
                            part_abstract.annotations.append(entity)
                        else:
                            part_title.annotations.append(entity)

                doc.parts['title'] = part_title
                doc.parts['abstract'] = part_abstract
                # print(part_title)
                # print(part_body)
                dataset.documents[pmid] = doc
                # print(doc)

        return dataset
Example #27
    def test_DocumentLevelRelationEvaluator_arbitrary_relation_accept_fun_order_matters(self):

        entity_map_fun = (lambda e: e.text)

        def relation_accept_fun(gold, pred):
            print('gold:', gold, ' <---> ', 'pred:', pred)
            return gold < pred

        r1 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "1"), Entity(STUB_E_ID_2, 0, "2"))
        r2 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "2"), Entity(STUB_E_ID_2, 0, "1"))

        # r1 is not accepted against r1 because the function tests strict less-than, not equality (r1 is not < r1)
        self.assertFalse(relation_accept_fun(r1.map(entity_map_fun), r1.map(entity_map_fun)))
        # r1 < r2
        self.assertTrue(relation_accept_fun(r1.map(entity_map_fun), r2.map(entity_map_fun)))
        # r2 not < r1
        self.assertFalse(relation_accept_fun(r2.map(entity_map_fun), r1.map(entity_map_fun)))

        evaluator = DocumentLevelRelationEvaluator(STUB_R_ID_1, entity_map_fun, relation_accept_fun)

        (dataset, part) = self._create_basic_dataset()

        # -

        part.relations = [r1]
        part.predicted_relations = [r1]

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        print(evaluation)
        self.assertEqual(evaluation.tp, 0)
        self.assertEqual(evaluation.fn, 1)
        self.assertEqual(evaluation.fp, 1)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 0.0)

        # -

        part.relations = [r1]
        part.predicted_relations = [r2]

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        print(evaluation)
        self.assertEqual(evaluation.tp, 1)
        self.assertEqual(evaluation.fn, 0)
        self.assertEqual(evaluation.fp, 0)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)

        # -

        part.relations = [r2]
        part.predicted_relations = [r1]

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 0)
        self.assertEqual(evaluation.fn, 1)
        self.assertEqual(evaluation.fp, 1)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 0.0)
Example #28
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        from functools import reduce
        dataset = Dataset()
        with open(self.path, 'r') as f:

            tree = ET.parse(f)
            # level document
            for element in tree.iterfind('Article'):
                doc = Document()

                # pmid <Pmid>
                pmid = element[0].text

                # title <Title>
                title = element[1].text
                if not title:
                    title = ""
                title_annotations = []
                for child in element[1]:
                    if child.tag == 'variant':
                        entity = Entity(self.mut_class_id, len(title),
                                        child.text)
                        title_annotations.append(entity)
                    # unfortunately child.text or child.tail can be None, which cannot be concatenated to a string
                    try:
                        title += child.text
                    except TypeError:
                        pass
                    try:
                        title += child.tail
                    except TypeError:
                        pass
                part_title = Part(title)
                part_title.annotations.extend(title_annotations)

                # body - abstract <Abstract>
                abstract = element[2].text
                if not abstract:
                    abstract = ""
                abstract_annotations = []
                for child in element[2]:
                    if child.tag == 'variant':
                        entity = Entity(self.mut_class_id, len(abstract),
                                        child.text)
                        abstract_annotations.append(entity)
                    # unfortunately child.text or child.tail can be None, which cannot be concatenated to a string
                    try:
                        abstract += child.text
                    except TypeError:
                        pass
                    try:
                        abstract += child.tail
                    except TypeError:
                        pass
                part_abstract = Part(abstract)
                part_abstract.annotations.extend(abstract_annotations)

                # save part to document
                doc.parts['title'] = part_title
                doc.parts['abstract'] = part_abstract
                dataset.documents[pmid] = doc  # save document to dataset
        return dataset
Example #29
    def __read_annjson(self, reader, filename, dataset):
        try:
            doc_id = os.path.basename(filename).replace('.ann.json', '').replace('.json', '')
            if not self.whole_basename_as_docid and '-' in doc_id:
                doc_id = doc_id.split('-')[-1]

            ann_json = json.load(reader)

            try:
                document = dataset.documents[doc_id]
            except Exception as err:
                print_warning("The annjson with docid={} was not in the whole plain dataset.".format(doc_id))
                return doc_id

            if not (ann_json['anncomplete'] or self.is_predicted) and self.delete_incomplete_docs:
                del dataset.documents[doc_id]

            else:

                for e in ann_json['entities']:

                    if self.read_only_class_id is None or e['classId'] in self.read_only_class_id:

                        part = document.parts[e['part']]

                        try:
                            normalizations = {key: obj['source']['id'] for key, obj in e['normalizations'].items()}
                        except KeyError as err:
                            print_warning("The normalization is badly formatted: (docid={}) {}".format(doc_id, str(e['normalizations'])))
                            normalizations = None

                        entity = Entity(
                            e['classId'],
                            e['offsets'][0]['start'],
                            e['offsets'][0]['text'],
                            e['confidence']['prob'],
                            norms=normalizations)

                        if self.is_predicted:
                            part.predicted_annotations.append(entity)
                        else:
                            part.annotations.append(entity)

                if self.read_relations:
                    for relation in ann_json['relations']:
                        # Note: no distinction with predicted_relations yet

                        part = document.parts[relation['entities'][0].split('|')[0]]

                        e1_start = int(relation['entities'][0].split('|')[1].split(',')[0])
                        e2_start = int(relation['entities'][1].split('|')[1].split(',')[0])

                        rel_id = relation['classId']

                        e1 = part.get_entity(e1_start, use_pred=False, raise_exception_on_incosistencies=self.raise_exception_on_incosistencies)
                        e2 = part.get_entity(e2_start, use_pred=False, raise_exception_on_incosistencies=self.raise_exception_on_incosistencies)

                        if (not self.raise_exception_on_incosistencies and (e1 is None or e2 is None)):
                            continue

                        rel = Relation(rel_id, e1, e2)

                        part.relations.append(rel)

                # delete parts that are not annotatable
                annotatable_parts = set(ann_json['annotatable']['parts'])
                part_ids_to_del = []
                for part_id, part in document.parts.items():
                    if part_id not in annotatable_parts:
                        part_ids_to_del.append(part_id)
                for part_id in part_ids_to_del:
                    del document.parts[part_id]

            return doc_id

        except Exception as err:
            if self.raise_exception_on_incosistencies:
                raise err
            else:
                pass
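For orientation, a minimal toy dict shaped like the ann.json fields this reader actually touches (all values made up), together with the normalization extraction used above.

ann_json = {
    "anncomplete": True,
    "annotatable": {"parts": ["s1p1"]},
    "entities": [{
        "classId": "e_1",
        "part": "s1p1",
        "offsets": [{"start": 10, "text": "BRCA1"}],
        "confidence": {"prob": 0.98},
        "normalizations": {"n_1": {"source": {"id": "672"}}},
    }],
    "relations": [
        {"classId": "r_1", "entities": ["s1p1|10,15", "s1p1|30,34"]}
    ],
}

e = ann_json["entities"][0]
normalizations = {key: obj["source"]["id"] for key, obj in e["normalizations"].items()}
print(e["classId"], e["offsets"][0]["start"], e["offsets"][0]["text"], normalizations)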
Example #30
    def test_DocumentLevelRelationEvaluator_arbitrary_relation_accept_fun_dont_count_multiple_same_hits(self):

        entity_map_fun = (lambda e: e.text)

        def relation_accept_fun(gold, pred):
            print('gold:', gold, ' <---> ', 'pred:', pred,)
            gold = int(gold[-1])
            pred = int(pred[-1])


            if gold <= pred and ((pred - gold) < 3):  # e.g., 1 <= 1, 2, 3
                return True
            else:
                return False

            return gold == pred

        r1 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "1"))
        r5 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "9"))  # Missing == fn
        r6 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "5"))
        r8 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "2"))  # (maps to 1) Own repetition in gold, so 1 should be counted twice

        r2 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "1"))  # Accept 1 --> do count == tp
        r3 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "2"))  # repeated Accept 1,2 --> do count because of own repetition in gold == tp
        r4 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "3"))  # repeated Accept 1,2 --> do not count because it's over repetition
        r7 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "6"))  # Accept 5 --> do count == tp
        r9 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "5"))  # Accept 5 --> do not count because it's over repetition

        self.assertEqual(True, relation_accept_fun(r1.map(entity_map_fun), r2.map(entity_map_fun)))
        self.assertEqual(True, relation_accept_fun(r1.map(entity_map_fun), r3.map(entity_map_fun)))
        self.assertEqual(True, relation_accept_fun(r1.map(entity_map_fun), r4.map(entity_map_fun)))
        self.assertEqual(False, relation_accept_fun(r1.map(entity_map_fun), r7.map(entity_map_fun)))

        self.assertEqual(False, relation_accept_fun(r5.map(entity_map_fun), r2.map(entity_map_fun)))
        self.assertEqual(False, relation_accept_fun(r5.map(entity_map_fun), r3.map(entity_map_fun)))
        self.assertEqual(False, relation_accept_fun(r5.map(entity_map_fun), r4.map(entity_map_fun)))
        self.assertEqual(False, relation_accept_fun(r5.map(entity_map_fun), r7.map(entity_map_fun)))

        self.assertEqual(True, relation_accept_fun(r6.map(entity_map_fun), r7.map(entity_map_fun)))

        self.assertEqual(False, relation_accept_fun(r8.map(entity_map_fun), r2.map(entity_map_fun)))
        self.assertEqual(True, relation_accept_fun(r8.map(entity_map_fun), r3.map(entity_map_fun)))
        self.assertEqual(True, relation_accept_fun(r8.map(entity_map_fun), r4.map(entity_map_fun)))
        self.assertEqual(False, relation_accept_fun(r8.map(entity_map_fun), r7.map(entity_map_fun)))

        evaluator = DocumentLevelRelationEvaluator(STUB_R_ID_1, entity_map_fun, relation_accept_fun)

        (dataset, part) = self._create_basic_dataset()

        # -

        part.relations = [r1, r5, r6, r8]
        part.predicted_relations = [r2, r3, r4, r7, r9]  # Only one should be accepted

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        print(evaluation)
        self.assertEqual(evaluation.tp, 3, evaluation)
        self.assertEqual(evaluation.fn, 1)
        self.assertEqual(evaluation.fp, 0)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 0.8571428571428571)