Example #1
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        xmls = []
        if os.path.isdir(self.path):
            xmls = [
                os.path.join(root, file)
                for root, _, files in os.walk(self.path) for file in files
                if file.startswith('medline') and file.endswith('xml')
            ]
        elif self.path.startswith('medline') and self.path.endswith('xml'):
            xmls = [self.path]

        dataset = Dataset()

        for xml in xmls:
            for child in ET.parse(xml).getroot():
                pmid = next(child.iter('PMID')).text

                document = Document()
                article = next(child.iter('Article'))
                title = next(article.iter('ArticleTitle')).text
                document.parts['title'] = Part(title, is_abstract=False)
                try:
                    abstract = next(article.iter('AbstractText')).text
                    document.parts['abstract'] = Part(abstract)
                except StopIteration:
                    pass
                dataset.documents[pmid] = document

        return dataset
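
A minimal usage sketch for a reader like the one above, assuming only the Dataset accessors shown elsewhere in this listing; the class name MedlineXMLReader and the path are placeholders, not names taken from the source.

    # Hypothetical usage; 'MedlineXMLReader' stands in for whatever class defines
    # the read() method above, and the path is a placeholder.
    reader = MedlineXMLReader('path/to/medline_xml_dir')
    dataset = reader.read()
    for pmid, document in dataset.documents.items():
        # every document gets a 'title' part; 'abstract' may be absent
        print(pmid, document.parts['title'].text)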
Example #2
    def setUpClass(cls):
        cls.dataset = Dataset()

        doc1 = Document()
        cls.dataset.documents['TEST_SENTENCES_SINGLE_ROOT'] = doc1

        for s in TEST_SENTENCES_SINGLE_ROOT:
            part = Part(s)
            doc1.parts[s] = part

        doc2 = Document()
        cls.dataset.documents['TEST_SENTENCES_MULTI_ROOT'] = doc2

        for s in TEST_SENTENCES_MULTI_ROOT:
            part = Part(s)
            doc2.parts[s] = part

        cls.nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(cls.nlp)
        cls.splitter = NLTKSplitter()
        cls.tokenizer = GenericTokenizer(
            lambda string: (tok.text for tok in cls.nlp.tokenizer(string)))

        cls.splitter.split(cls.dataset)
        cls.tokenizer.tokenize(cls.dataset)
        cls.parser.parse(cls.dataset)

        cls.computed_sentences = []

        for sentence in cls.dataset.sentences():
            dist, then = compute_shortest_paths(sentence)
            cls.computed_sentences.append((dist, then, sentence))
Example #3
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        # TEXT = "123 45678"
        # POS  = "012345678"
        # ANN1 = " X       "
        # ANN2 = "     XXX "
        # PAR1 = "XXX      "
        # PAR2 = "    XXXXX"

        part1 = Part('123')
        part2 = Part('45678')
        ann1 = Entity(class_id=STUB_ENTITY_CLASS_ID,
                      offset=1,
                      text='2',
                      confidence=0)
        ann2 = Entity(class_id=STUB_ENTITY_CLASS_ID,
                      offset=1,
                      text='567',
                      confidence=1)
        ann1.subclass = 0
        ann2.subclass = 2
        part1.annotations.append(ann1)
        part2.annotations.append(ann2)
        cls.doc.parts['s1h1'] = part1
        cls.doc.parts['s2p1'] = part2

        doc2 = Document()
        doc3 = Document()
        doc3.parts['someid'] = Part('marmor stein und eisen')
        cls.dataset2 = Dataset()
        cls.dataset2.documents['newid'] = doc3
        cls.dataset2.documents['testid'] = doc2
Example #4
    def test_DocumentLevelRelationEvaluator_order_irrelevant(self):

        evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

        dataset = Dataset()
        doc_1 = Document()
        part_1 = Part('_irrelevant_')
        dataset.documents['doc_1'] = doc_1
        doc_1.parts['part_1'] = part_1

        part_1.relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"),
                     Entity(STUB_E_ID_2, 0, "maynard")),
        ]

        # -

        part_1.predicted_relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_2, 0, "maynard"),
                     Entity(STUB_E_ID_1, 0, "TOOL")),
        ]

        self._apply_pipeline(dataset)

        # ---

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 1)
        self.assertEqual(evaluation.fn, 0)
        self.assertEqual(evaluation.fp, 0)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)
Example #5
    def setUp(self):
        part = Part('Make making made. Try tried tries.')
        part.sentences = [[Token('Make', 0), Token('making', 5), Token('made', 12)],
                          [Token('Try', 18), Token('tried', 22), Token('tries', 28)]]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.generator = PorterStemFeatureGenerator()
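
A hedged sketch of a test body that could follow this fixture, reusing the generate(dataset=...) call and the tokens() iterator seen in other examples of this listing; the method name and the idea of printing rather than asserting are assumptions.

    def test_stem_features(self):
        # Hypothetical test body: run the generator over the fixture dataset
        # and inspect each token's feature dict (no specific keys are asserted).
        self.generator.generate(dataset=self.dataset)
        for token in self.dataset.tokens():
            print(token.word, token.features)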
Example #6
    def generate_abstracts(self, list_of_pmids):
        """
        Generates a list of documents from PMIDs using the RESTful interface of tmTools.
        Source: "http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/"
        :param list_of_pmids: list of PMID strings
        :returns: nalaf.structures.data.Dataset
        """
        # if os.path.isfile('cache.json'):
        #     with open('cache.json') as f:
        #           tm_var = json.load()
        # else:
        url_tmvar = 'http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/Mutation/{0}/JSON/'
        url_converter = 'http://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/'

        # load cache.json if exists
        if os.path.exists('cache.json'):
            with open('cache.json', 'r', encoding='utf-8') as f:
                tm_var = json.load(f)
        else:
            tm_var = {}

        for pmid in list_of_pmids:
            if pmid not in tm_var:  # if pmid was not already downloaded from tmTools
                req = requests.get(url_tmvar.format(pmid))
                try:
                    tm_var[pmid] = req.json()
                except ValueError:
                    pass
        # cache the tmVar annotations so we don't pull them every time
        with open('cache.json', 'w') as file:
            json.dump(tm_var, file, indent=4)

        # for key in tm_var:
        #     print(json.dumps(tm_var[key], indent=4))

        dataset = Dataset()
        for doc_id in list_of_pmids:
            if doc_id in tm_var:
                doc = Document()
                text = tm_var[doc_id]['text']
                part = Part(text)
                denotations = tm_var[doc_id]['denotations']
                annotations = []
                for deno in denotations:
                    ann = Entity(
                        class_id=self.mut_class_id,
                        offset=int(deno['span']['begin']),
                        text=text[deno['span']['begin']:deno['span']['end']])
                    annotations.append(ann)
                    # note should the annotations from tmvar go to predicted_annotations or annotations?
                part.annotations = annotations
                doc.parts['abstract'] = part
                dataset.documents[doc_id] = doc

        return dataset
Example #7
    def setUpClass(cls):
        # create a sample dataset1 (1) to test
        cls.dataset1 = Dataset()
        doc_1 = Document()

        text = '.... aaaa .... bbbb .... cccc .... dddd .... eeee .... ffff .... gggg .... hhhh .... jjjj'
        part_1 = Part(text)

        cls.dataset1.documents['doc_1'] = doc_1
        doc_1.parts['part_1'] = part_1

        exact_1 = Entity(STUB_E_ID_1, 5, 'aaaa')
        exact_1.subclass = 1
        exact_2 = Entity(STUB_E_ID_1, 55, 'ffff')
        exact_2.subclass = 2
        exact_3 = Entity(STUB_E_ID_1, 75, 'hhhh')
        exact_3.subclass = 2

        overlap_1_1 = Entity(STUB_E_ID_1, 25, 'cccc')
        overlap_1_1.subclass = 1
        overlap_1_2 = Entity(STUB_E_ID_1, 26, 'cc')
        overlap_1_2.subclass = 1

        overlap_2_1 = Entity(STUB_E_ID_1, 32, '.. ddd')
        overlap_2_1.subclass = 2
        overlap_2_2 = Entity(STUB_E_ID_1, 36, 'ddd ...')
        overlap_2_2.subclass = 2

        overlap_3_1 = Entity(STUB_E_ID_1, 65, 'gggg')
        overlap_3_1.subclass = 1
        overlap_3_2 = Entity(STUB_E_ID_1, 62, '.. gggg ..')
        overlap_3_2.subclass = 2

        missing_1 = Entity('e2', 45, 'eeee')
        missing_1.subclass = 1
        missing_2 = Entity('e2', 84, 'jjjj')
        missing_2.subclass = 1

        spurious = Entity('e2', 15, 'bbbb')
        spurious.subclass = 1

        part_1.annotations = [
            exact_1, exact_2, exact_3, overlap_1_1, overlap_2_1, overlap_3_1,
            missing_1, missing_2
        ]
        part_1.predicted_annotations = [
            exact_1, exact_2, exact_3, overlap_1_2, overlap_2_2, overlap_3_2,
            spurious
        ]
Example #8
    def setUp(self):
        part = Part('Make making made. Try tried tries.')
        part.sentences = [[
            Token('Make', 0),
            Token('making', 5),
            Token('made', 12)
        ], [Token('Try', 18),
            Token('tried', 22),
            Token('tries', 28)]]
        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        for token in self.dataset.tokens():
            token.features['a'] = 'a'
            token.features['b'] = 'b'
Example #9
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        # TEXT = "123 45678"
        # POS  = "012345678"
        # ANN1 = " X       "
        # ANN2 = "     XXX "
        # PAR1 = "XXX      "
        # PAR2 = "    XXXXX"

        cls.part = Part(
            'Here is a random sentence for the benefit of your mamma')
        cls.entity = Entity(class_id=STUB_ENTITY_CLASS_ID,
                            offset=10,
                            text='random sentence',
                            confidence=0)
        cls.part.annotations.append(cls.entity)
        cls.doc.parts['s1h1'] = cls.part

        # Apply through pipeline

        NLTKSplitter().split(cls.dataset)
        NLTK_TOKENIZER.tokenize(cls.dataset)

        nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(nlp)
        cls.parser.parse(cls.dataset)
        # cls.part.percolate_tokens_to_entities()

        cls.sentence = cls.part.sentences[0]
Example #10
    def setUp(self):
        part = Part('Word1 word2 word3. Word4 word5 word6.')
        part.sentences = [[
            Token('Word1', 0),
            Token('word2', 6),
            Token('word3', 12)
        ], [Token('Word4', 19),
            Token('word5', 25),
            Token('word6', 31)]]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.simple_generator = SimpleFeatureGenerator()
        self.sentence_generator = SentenceMarkerFeatureGenerator()
Example #11
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()

        with open(self.corpus_file, encoding='utf-8') as file:

            for row in file:
                columns = row.split("\t")

                docid = columns[0]
                typ = columns[1]
                start = columns[2]
                end = columns[3]
                entity_text = columns[7]

                class_id = None
                if typ == 'Mutation':
                    class_id = self.mut_class_id
                elif typ == 'AminoacidResidue':
                    class_id = self.residue_class_id

                if class_id:
                    document = dataset.documents.get(docid, Document())

                    part = Part(entity_text)
                    document.parts[typ + '|' + start + '|' + end] = part

                    part.annotations.append(
                        Entity(class_id, int(start), entity_text))

                    dataset.documents[docid] = document

        return dataset
Example #12
    def read_file(a_file,
                  filename,
                  dataset=None,
                  whole_basename_as_docid=False):
        if dataset is None:
            dataset = Dataset()

        soup = BeautifulSoup(a_file, "html.parser")
        document = Document()

        for part in soup.find_all(id=re.compile('^s')):
            # ids starting with s3-s9 are treated as body text, everything else as abstract
            is_abstract = not re.match(r'^s[3-9]', part['id'])
            document.parts[part['id']] = Part(str(part.string),
                                              is_abstract=is_abstract)

        doc_id = os.path.basename(filename).replace('.plain.html', '').replace(
            '.html', '').replace('.xml', '')
        if not whole_basename_as_docid and '-' in doc_id:
            doc_id = doc_id.split('-')[-1]

        dataset.documents[doc_id] = document

        return dataset
Example #13
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        part1 = Part('Sentence 1: e_1_yolo may be related to e_2_tool plus hey, e_2_coco. Sentence 2: e_1_nin. Sentence 3: e_2_musk. Sentence 4: nothing')

        entities = [
            # Sent 1
            Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=12, text='e_1_yolo', confidence=0),
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=39, text='e_2_tool', confidence=0),
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=58, text='e_2_coco', confidence=0),
            # Sent 2
            Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=80, text='e_1_nin', confidence=0),
            # Sent 3
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=101, text='e_2_musk', confidence=0),
            # Sent 4

        ]

        for e in entities:
            part1.annotations.append(e)

        cls.doc.parts['s1h1'] = part1

        cls.splitter = NLTKSplitter()
        cls.tokenizer = NLTK_TOKENIZER

        cls.splitter.split(cls.dataset)
        cls.tokenizer.tokenize(cls.dataset)

        # assert False, str(list(cls.dataset.sentences()))
        assert 4 == len(list(cls.dataset.sentences())), str(list(cls.dataset.sentences()))
Example #14
 def setUpClass(cls):
     cls.dataset = Dataset()
     doc = Document()
     part = Part(
         'This is one sentence. This is another one.\n This is the third one; here continues.'
     )
     cls.dataset.documents['doc_1'] = doc
     doc.parts['part_1'] = part
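
A hedged continuation of this fixture, splitting and tokenizing with the helpers used by the other examples; the test method itself is an assumption, and nothing is asserted because the exact sentence/token counts depend on NLTK.

 def test_inspect_sentences(self):
     # Hypothetical continuation: split and tokenize the fixture, then print
     # the token words of each resulting sentence.
     NLTKSplitter().split(self.dataset)
     NLTK_TOKENIZER.tokenize(self.dataset)
     for sentence in self.dataset.sentences():
         print([token.word for token in sentence])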
Example #15
 def _create_basic_dataset(self):
     dataset = Dataset()
     doc_1 = Document()
     part_1 = Part('_irrelevant_')
     dataset.documents['doc_1'] = doc_1
     doc_1.parts['part_1'] = part_1
     self._apply_pipeline(dataset)
     return (dataset, part_1)
Example #16
    def download(self, pmids):
        for pmid in pmids:
            if pmid in self.cache:
                xml = ET.fromstring(self.cache[pmid])
            else:
                req = requests.get(self.pubmed_url, {
                    'db': 'pubmed',
                    'retmode': 'xml',
                    'id': pmid
                })
                text = req.text
                xml = ET.fromstring(text)
                self.cache[pmid] = text

            doc = Document()

            if self.one_part:
                joined_text = '\n'.join(
                    element.text
                    for element in chain(xml.findall('.//ArticleTitle'),
                                         xml.findall('.//AbstractText')))
                doc.parts['title_and_abstract'] = Part(joined_text)
            else:
                # for now only include title and abstract
                title_elem = xml.find('.//ArticleTitle')
                if title_elem is not None:
                    doc.parts['title'] = Part(title_elem.text)

                abstract_elements = xml.findall('.//AbstractText')
                if abstract_elements:  # findall() returns a (possibly empty) list, never None
                    abstract_elems = []
                    for elem in abstract_elements:
                        if 'Label' in elem.attrib and elem.attrib[
                                'Label'] != 'UNLABELLED':
                            abstract_elems.append('{}: {}'.format(
                                elem.attrib['Label'], elem.text))
                        else:
                            abstract_elems.append(elem.text)

                    abstract_elems = filter(None, abstract_elems)

                    doc.parts['abstract'] = Part(' '.join(abstract_elems))

            # yield the document but only if you found anything
            if len(doc.parts) > 0:
                yield pmid, doc
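
A hedged sketch of how the (pmid, doc) pairs yielded above might be collected into a Dataset, using only accessors already shown in this listing; 'downloader' and the PMID value are placeholders.

    # Hypothetical usage; 'downloader' is whatever object defines the download() method above.
    dataset = Dataset()
    for pmid, doc in downloader.download(['12345678']):  # placeholder PMID
        dataset.documents[pmid] = doc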
Example #17
    def test_main_verbs(self):

        for _, _, sentence in self.computed_sentences:
            print()
            print(sentence)
            verbs = set(
                Part.get_main_verbs(sentence,
                                    token_map=lambda t: t.features["lemma"]))
            print("\t", verbs)
Example #18
    def __process_file(filename):
        document = Document()
        with open(filename) as file:
            part_id = 1
            for part in re.split('\n\n', file.read()):
                if part.strip():
                    document.parts['{}'.format(part_id)] = Part(part)
                    part_id += 1

        return os.path.split(filename)[-1], document
Example #19
    def setUpClass(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()

        doc_id1 = Document()
        # 2 tokens in 1 sentence
        doc_id1.parts['p1'] = Part('insertionefsA dup23.23')
        doc_id1.parts['p1'].sentences = [[Token('insertionefsA', 0), Token('dup23.23', 14)]]
        cls.dataset.documents['doc_id1'] = doc_id1

        cls.feature = TmVarFeatureGenerator()
        cls.feature.generate(dataset=cls.dataset)
Example #20
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        part = Part(self.string)
        document = Document()
        dataset = Dataset()

        dataset.documents['doc_1'] = document
        document.parts['part_1'] = part

        return dataset
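
If this read() belongs to the StringReader used in a later example of this listing, usage would presumably look like the following sketch; the sample sentence is arbitrary.

    # Hypothetical usage, mirroring the StringReader(...).read() call that appears later.
    dataset = StringReader('This is a single test sentence.').read()
    part = next(dataset.parts())
    print(part.text)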
Example #21
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()

        with open(self.corpus_file, encoding='utf-8') as file:
            documents = file.read().strip().split('\n\n')
            for document_text in documents:
                lines = document_text.strip().splitlines()

                first_line = re.search(r'(\d+)\|t\|(.*)', lines[0])
                doc_id = first_line.group(1)
                tmvar_title = first_line.group(2)
                tmvar_abstract = re.search(r'(\d+)\|a\|(.*)', lines[1]).group(2)

                document = Document()
                title = Part(tmvar_title)
                abstract = Part(tmvar_abstract)
                document.parts['title'] = title
                document.parts['abstract'] = abstract

                for line in lines[2:]:
                    _, start, end, _, _, _ = line.split('\t')
                    start = int(start)
                    end = int(end)

                    if 0 <= start < end <= len(tmvar_title):
                        part = title
                    else:
                        part = abstract
                        start -= len(tmvar_title) + 1
                        end -= len(tmvar_title) + 1

                    part.annotations.append(
                        Entity(self.mut_class_id, start, part.text[start:end]))

                dataset.documents[doc_id] = document

        return dataset
Example #22
    def setUpClass(cls):
        text1 = "Flowers in the Rain. Are absolutely marvellous. Though i would say this text is stupid. Cheers!"

        part1 = Part(text1)
        doc = Document()
        doc.parts['firstpart'] = part1
        dataset = Dataset()
        dataset.documents['firstdocument'] = doc

        NLTKSplitter().split(dataset)
        # TmVarTokenizer().tokenize(dataset)
        cls.data = dataset
        cls.testpart = dataset.documents['firstdocument'].parts['firstpart']
Example #23
    def setUpClass(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()

        doc_id1 = Document()

        doc_id1.parts['t1'] = Part('This title blows your mind')

        text = (
            'This magic only exists in your dreams. To become reality, you have to work at it. '
            'Thr is only available with the residue threonine and a mutation, '
            'though things can change positions '
            'when adding some more replacements. Between me being sorry '
            'and you being an insertion.')
        doc_id1.parts['p1'] = Part(text)

        cls.dataset.documents['doc_id1'] = doc_id1

        NLTKSplitter().split(cls.dataset)
        TmVarTokenizer().tokenize(cls.dataset)

        cls.feature = NLMentionFeatureGenerator(thr=4)
        cls.feature.generate(dataset=cls.dataset)
Example #24
    def read(self):
        """
        Read the corpus TSV file (one row per document: docid, title, abstract),
        create a Document for each row, and return a Dataset containing all of them.

        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()
        with open(self.corpus_folder, encoding='utf-8') as file:
            reader = csv.reader(file, delimiter='\t')
            for row in reader:
                docid, title, abstract = row
                title = title.strip()
                abstract = abstract.strip()

                document = Document()
                if title:
                    document.parts['title'] = Part(title)
                if abstract and abstract != 'null':
                    document.parts['abstract'] = Part(abstract)

                dataset.documents[docid] = document

        return dataset
Example #25
    def read(self):
        """
        Read the corpus TSV file (one row per document: docid, abstract),
        create a Document for each row, and return a Dataset containing all of them.

        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()
        with open(self.corpus_file, encoding='utf-8') as file:
            reader = csv.reader(file, delimiter='\t')
            for row in reader:
                document = Document()
                document.parts['abstract'] = Part(row[1])
                dataset.documents[row[0]] = document

        return dataset
Example #26
    def setup_class(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()

        doc_id1 = Document()
        # 15 tokens in 2 sentences
        doc_id1.parts['p1'] = Part(
            'This is some sample text. This is another, sample sentence with coma.'
        )
        doc_id1.parts['p1'].sentences_ = [
            'This is some sample text.',
            'This is another, sample sentence with coma.'
        ]

        cls.dataset.documents['doc_id1'] = doc_id1

        cls.tokenizer = NLTK_TOKENIZER
        cls.tokenizer.tokenize(cls.dataset)
Example #27
    def setup_class(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()

        doc_id1 = Document()
        # 15 tokens in 2 sentences
        doc_id1.parts['p1'] = Part(
            'this is some sample text. it contains this c.2708_2711delTTAG mutation.'
        )
        doc_id1.parts['p1'].sentences_ = [
            'this is some sample text.',
            'it contains this c.2708_2711delTTAG mutation.'
        ]

        cls.dataset.documents['doc_id1'] = doc_id1

        cls.tokenizer = TmVarTokenizer()
        cls.tokenizer.tokenize(cls.dataset)
Example #28
    def setup_class(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()
        part = Part('some text c.A100G p.V100Q some text')
        part.sentences = [[
            Token('some', 0),
            Token('text', 5),
            Token('c', 10),
            Token('.', 11),
            Token('A', 12),
            Token('100', 13),
            Token('G', 16),
            Token('p', 18),
            Token('.', 19),
            Token('V', 20),
            Token('100', 21),
            Token('Q', 24),
            Token('some', 26),
            Token('text', 31)
        ]]

        predicted_labels = [
            'O', 'O', 'B', 'I', 'I', 'I', 'E', 'A', 'I', 'I', 'I', 'E', 'O',
            'O'
        ]

        for index, label in enumerate(predicted_labels):
            part.sentences[0][index].predicted_labels = [Label(label)]

        cls.dataset.documents['doc_1'] = Document()
        cls.dataset.documents['doc_1'].parts['p1'] = part

        part = Part('test edge case DNA A927B test')
        part.sentences = [[
            Token('test', 0),
            Token('edge', 5),
            Token('case', 10),
            Token('DNA', 15),
            Token('A', 19),
            Token('927', 20),
            Token('B', 23),
            Token('test', 25)
        ]]

        predicted_labels = ['O', 'O', 'O', 'O', 'M', 'P', 'M', 'O']

        for index, label in enumerate(predicted_labels):
            part.sentences[0][index].predicted_labels = [Label(label)]

        cls.dataset.documents['doc_1'].parts['p2'] = part
Example #29
    def _get_test_data(self, entity_sentence, assumed_tokens_words=None):
        if assumed_tokens_words is None:
            assumed_tokens_words = entity_sentence.split(' ')

        # Create dataset

        dataset = StringReader(entity_sentence).read()
        part = next(dataset.parts())
        entity = Entity(class_id=STUB_ENTITY_CLASS_ID,
                        offset=0,
                        text=entity_sentence)
        part.annotations.append(entity)

        # Apply through pipeline

        NLTKSplitter().split(dataset)
        NLTK_TOKENIZER.tokenize(dataset)
        self.parser.parse(dataset)

        # Rest

        sentences = part.sentences
        assert len(sentences) == 1
        sentence = sentences[0]

        assert len(assumed_tokens_words) == len(sentence)
        for (assumed_token_word, actual_token) in zip(assumed_tokens_words,
                                                      sentence):
            assert assumed_token_word == actual_token.word

        part.compute_tokens_depth()
        roots = Part.get_sentence_roots(sentence)
        for r in roots:
            self._assert_depth_eq(r, 0)

        part.set_entities_head_tokens()

        return (dataset, sentence, entity, roots)
Example #30
    def generate(self, corpus, f_set, use_gold, use_pred):
        assert not (use_gold and use_pred), "No support for both"

        self.extract_abbreviation_synonyms(corpus, use_gold, use_pred)

        for docid, document in corpus.documents.items():
            for edge in document.edges():

                sentence = edge.get_combined_sentence()

                entities_in_sentences = edge.get_any_entities_in_sentences(predicted=use_pred)
                total_count = 0
                # We sort to have a deterministic order creation of the features
                for e_class_id in sorted(entities_in_sentences):
                    entities = entities_in_sentences[e_class_id]
                    # TODO this is wrong for other entity types not appearing in the edge
                    # TODO also, what if the same entity type appears on both ends of the same edge, as in a protein-protein relation? --> just subtract the edge's own counts
                    individual_count = len(entities) - 1  # subtract 1, as one entity is already one of the edge's entities
                    assert individual_count >= 0
                    total_count += individual_count
                    self.add_with_value(f_set, edge, 'f_counts_individual', individual_count, 'int', 'individual', e_class_id)

                self.add_with_value(f_set, edge, 'f_counts_total', total_count, 'int', 'total (all classes)')

                entities_between_entities = edge.get_any_entities_between_entities(predicted=use_pred)
                total_count = 0
                # We sort to have a deterministic order creation of the features
                for e_class_id in sorted(entities_between_entities):
                    entities = entities_between_entities[e_class_id]
                    individual_count = len(entities)
                    total_count += individual_count
                    self.add_with_value(f_set, edge, 'f_counts_in_between_individual', individual_count, 'int', 'individual', e_class_id)

                self.add_with_value(f_set, edge, 'f_counts_in_between_total', total_count, 'int', 'total (all classes)')

                order = edge.entity1.class_id < edge.entity2.class_id
                if order:
                    self.add(f_set, edge, 'f_order')

                for token in sentence:
                    self.add(f_set, edge, 'f_bow', masked_text(token, edge.same_part, use_gold, use_pred, token_map=lambda t: t.features['lemma'], token_is_number_fun=lambda _: "NUM"))
                    self.add(f_set, edge, 'f_pos', token.features['coarsed_pos'])

                self.add_with_value(f_set, edge, 'f_tokens_count', len(sentence))

                # Remember, the edge's entities are sorted, i.e. e1.offset < e2.offset
                _e1_first_token_index = edge.entity1.tokens[0].features['tmp_id']
                _e2_last_token_index = edge.entity2.tokens[-1].features['tmp_id']
                assert _e1_first_token_index < _e2_last_token_index, (docid, sentence, edge.entity1.text, edge.entity2.text, _e1_first_token_index, _e2_last_token_index)

                self.add_with_value(f_set, edge, 'f_tokens_count_before', len(sentence[:_e1_first_token_index]))
                self.add_with_value(f_set, edge, 'f_tokens_count_after', len(sentence[(_e2_last_token_index+1):]))

                #

                if Part.is_negated(sentence):
                    self.add(f_set, edge, "f_sentence_is_negated")

                #

                verbs = set(Part.get_main_verbs(sentence, token_map=lambda t: t.features["lemma"]))

                if len(verbs) == 0:
                    self.add(f_set, edge, "f_main_verbs", "NO_MAIN_VERB")
                else:
                    for v in verbs:
                        self.add(f_set, edge, "f_main_verbs", v)

                counters = {}
                for part in document:
                    for entity in (part.annotations if use_gold else part.predicted_annotations):
                        ent_type_counter = counters.get(entity.class_id, Counter())
                        ent_key = __class__.entity2key(entity)
                        ent_type_counter.update([ent_key])
                        counters[entity.class_id] = ent_type_counter

                e1_key = __class__.entity2key(edge.entity1)
                e1_count = counters[edge.entity1.class_id][e1_key]
                self.add_with_value(f_set, edge, 'f_entity1_count', e1_count)

                e2_key = __class__.entity2key(edge.entity2)
                e2_count = counters[edge.entity2.class_id][e2_key]
                self.add_with_value(f_set, edge, 'f_entity2_count', e2_count)

                together_counter = Counter()
                diff_sentences = {}
                for aux_edge in document.edges():
                    if aux_edge.e1_sentence_id == aux_edge.e2_sentence_id:
                        together_key = __class__.edge2key(aux_edge)

                        sents = diff_sentences.get(together_key, [])
                        if aux_edge.e1_sentence_id not in sents:
                            sents.append(aux_edge.e1_sentence_id)
                            diff_sentences[together_key] = sents
                            together_counter.update([together_key])

                together_key = __class__.edge2key(edge)
                together_count = together_counter[together_key]
                if together_count > 0:
                    self.add_with_value(f_set, edge, 'f_diff_sents_together_count', together_count)