Example #1
    def read_file(a_file,
                  filename,
                  dataset=None,
                  whole_basename_as_docid=False):
        if dataset is None:
            dataset = Dataset()

        soup = BeautifulSoup(a_file, "html.parser")
        document = Document()

        for part in soup.find_all(id=re.compile('^s')):
            if re.match(r'^s[3-9]', part['id']):
                is_abstract = False
            else:
                is_abstract = True
            document.parts[part['id']] = Part(str(part.string),
                                              is_abstract=is_abstract)

        doc_id = os.path.basename(filename).replace('.plain.html', '').replace(
            '.html', '').replace('.xml', '')
        if not whole_basename_as_docid and '-' in doc_id:
            doc_id = doc_id.split('-')[-1]

        dataset.documents[doc_id] = document

        return dataset
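A minimal usage sketch for this reader (the filename is made up, and read_file is assumed to be importable from the module that defines it): with the default whole_basename_as_docid=False, only the suffix after the last '-' is kept as the document id.

    # hedged sketch; 'read_file' and the input file name are assumptions
    with open('SomeCorpus-4321098.plain.html', encoding='utf-8') as f:
        dataset = read_file(f, 'SomeCorpus-4321098.plain.html')

    print(list(dataset.documents.keys()))  # expected: ['4321098']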
Example #2
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        xmls = []
        if os.path.isdir(self.path):
            xmls = [
                os.path.join(root, file)
                for root, _, files in os.walk(self.path) for file in files
                if file.startswith('medline') and file.endswith('xml')
            ]
        elif self.path.startswith('medline') and self.path.endswith('xml'):
            xmls = [self.path]

        dataset = Dataset()

        for xml in xmls:
            for child in ET.parse(xml).getroot():
                pmid = next(child.iter('PMID')).text

                document = Document()
                article = next(child.iter('Article'))
                title = next(article.iter('ArticleTitle')).text
                document.parts['title'] = Part(title, is_abstract=False)
                try:
                    abstract = next(article.iter('AbstractText')).text
                    document.parts['abstract'] = Part(abstract)
                except StopIteration:
                    pass
                dataset.documents[pmid] = document

        return dataset
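For reference, a made-up miniature of the Medline XML shape this reader walks (real medline*.xml files wrap citations in further elements, but the iter() calls above depend only on the tags shown here):

    import xml.etree.ElementTree as ET

    sample = """
    <PubmedArticleSet>
      <MedlineCitation>
        <PMID>12345</PMID>
        <Article>
          <ArticleTitle>A sample title.</ArticleTitle>
          <Abstract><AbstractText>A sample abstract.</AbstractText></Abstract>
        </Article>
      </MedlineCitation>
    </PubmedArticleSet>
    """
    child = ET.fromstring(sample)[0]              # one citation element
    print(next(child.iter('PMID')).text)          # 12345
    print(next(child.iter('ArticleTitle')).text)  # A sample title.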
Example #3
    def test_DocumentLevelRelationEvaluator_parts_irrelevant(self):

        evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

        dataset = Dataset()
        doc_1 = Document()
        part_1 = Part('_irrelevant_ PART *1*')
        dataset.documents['doc_1'] = doc_1
        doc_1.parts['part_1'] = part_1

        part_2 = Part('_irrelevant_ PART *2*')
        doc_1.parts['part_2'] = part_2

        part_1.relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"), Entity(STUB_E_ID_2, 0, "maynard")),
        ]

        # -

        part_2.predicted_relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_2, 0, "maynard"), Entity(STUB_E_ID_1, 0, "TOOL")),
        ]

        self._apply_pipeline(dataset)

        # ---

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 1)
        self.assertEqual(evaluation.fn, 0)
        self.assertEqual(evaluation.fp, 0)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)
Example #4
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        # TEXT = "123 45678"
        # POS  = "012345678"
        # ANN1 = " X       "
        # ANN2 = "     XXX "
        # PAR1 = "XXX      "
        # PAR2 = "    XXXXX"

        part1 = Part('123')
        part2 = Part('45678')
        ann1 = Entity(class_id=STUB_ENTITY_CLASS_ID,
                      offset=1,
                      text='2',
                      confidence=0)
        ann2 = Entity(class_id=STUB_ENTITY_CLASS_ID,
                      offset=1,
                      text='567',
                      confidence=1)
        ann1.subclass = 0
        ann2.subclass = 2
        part1.annotations.append(ann1)
        part2.annotations.append(ann2)
        cls.doc.parts['s1h1'] = part1
        cls.doc.parts['s2p1'] = part2

        doc2 = Document()
        doc3 = Document()
        doc3.parts['someid'] = Part('marmor stein und eisen')
        cls.dataset2 = Dataset()
        cls.dataset2.documents['newid'] = doc3
        cls.dataset2.documents['testid'] = doc2
Example #5
 def setUpClass(cls):
     cls.dataset = Dataset()
     doc = Document()
     part = Part(
         'This is one sentence. This is another one.\n This is the third one; here continues.'
     )
     cls.dataset.documents['doc_1'] = doc
     doc.parts['part_1'] = part
Example #6
 def _create_basic_dataset(self):
     dataset = Dataset()
     doc_1 = Document()
     part_1 = Part('_irrelevant_')
     dataset.documents['doc_1'] = doc_1
     doc_1.parts['part_1'] = part_1
     self._apply_pipeline(dataset)
     return (dataset, part_1)
Example #7
    def test_DocumentLevelRelationEvaluator_default_entities_case_irrelevant(
            self):

        evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

        dataset = Dataset()
        doc_1 = Document()
        part_1 = Part('_irrelevant_')
        dataset.documents['doc_1'] = doc_1
        doc_1.parts['part_1'] = part_1

        part_1.relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"),
                     Entity(STUB_E_ID_2, 0, "maynard")),
        ]

        # -

        part_1.predicted_relations = [
            # empty
        ]

        self._apply_pipeline(dataset)

        # -

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 0)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 0.0)

        # ---

        part_1.predicted_relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"),
                     Entity(STUB_E_ID_2, 0, "maynard")),
        ]

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 1)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)

        # -

        part_1.predicted_relations = [
            Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "tool"),
                     Entity(STUB_E_ID_2, 0, "MAYNARD")),
        ]

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, 1)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, 1.0)
Example #8
    def generate_abstracts(self, list_of_pmids):
        """
        Generates a list of documents from the given PMIDs using the tmTools REST API.
        Source: "http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/"
        :param list_of_pmids: list of PMID strings
        :returns: nalaf.structures.data.Dataset
        """
        url_tmvar = 'http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/Mutation/{0}/JSON/'
        url_converter = 'http://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/'

        # load cache.json if exists
        if os.path.exists('cache.json'):
            with open('cache.json', 'r', encoding='utf-8') as f:
                tm_var = json.load(f)
        else:
            tm_var = {}

        for pmid in list_of_pmids:
            if pmid not in tm_var:  # if pmid was not already downloaded from tmTools
                req = requests.get(url_tmvar.format(pmid))
                try:
                    tm_var[pmid] = req.json()
                except ValueError:
                    pass
        # cache the tmVar annotations so we don't pull them every time
        with open('cache.json', 'w', encoding='utf-8') as file:
            json.dump(tm_var, file, indent=4)

        dataset = Dataset()
        for doc_id in list_of_pmids:
            if doc_id in tm_var:
                doc = Document()
                text = tm_var[doc_id]['text']
                part = Part(text)
                denotations = tm_var[doc_id]['denotations']
                annotations = []
                for deno in denotations:
                    ann = Entity(
                        class_id=self.mut_class_id,
                        offset=int(deno['span']['begin']),
                        text=text[deno['span']['begin']:deno['span']['end']])
                    annotations.append(ann)
                    # note: should the annotations from tmVar go to predicted_annotations or annotations?
                part.annotations = annotations
                doc.parts['abstract'] = part
                dataset.documents[doc_id] = doc

        return dataset
Example #9
    def setUpClass(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()

        doc_id1 = Document()
        # 2 tokens in 1 sentence
        doc_id1.parts['p1'] = Part('insertionefsA dup23.23')
        doc_id1.parts['p1'].sentences = [[Token('insertionefsA', 0), Token('dup23.23', 14)]]
        cls.dataset.documents['doc_id1'] = doc_id1

        cls.feature = TmVarFeatureGenerator()
        cls.feature.generate(dataset=cls.dataset)
Example #10
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        part = Part(self.string)
        document = Document()
        dataset = Dataset()

        dataset.documents['doc_1'] = document
        document.parts['part_1'] = part

        return dataset
Example #11
    def setUpClass(cls):
        text1 = "Flowers in the Rain. Are absolutely marvellous. Though i would say this text is stupid. Cheers!"

        part1 = Part(text1)
        doc = Document()
        doc.parts['firstpart'] = part1
        dataset = Dataset()
        dataset.documents['firstdocument'] = doc

        NLTKSplitter().split(dataset)
        # TmVarTokenizer().tokenize(dataset)
        cls.data = dataset
        cls.testpart = dataset.documents['firstdocument'].parts['firstpart']
Example #12
    def setUpClass(cls):
        # create a sample dataset1 (1) to test
        cls.dataset1 = Dataset()
        doc_1 = Document()

        text = '.... aaaa .... bbbb .... cccc .... dddd .... eeee .... ffff .... gggg .... hhhh .... jjjj'
        part_1 = Part(text)

        cls.dataset1.documents['doc_1'] = doc_1
        doc_1.parts['part_1'] = part_1

        exact_1 = Entity(STUB_E_ID_1, 5, 'aaaa')
        exact_1.subclass = 1
        exact_2 = Entity(STUB_E_ID_1, 55, 'ffff')
        exact_2.subclass = 2
        exact_3 = Entity(STUB_E_ID_1, 75, 'hhhh')
        exact_3.subclass = 2

        overlap_1_1 = Entity(STUB_E_ID_1, 25, 'cccc')
        overlap_1_1.subclass = 1
        overlap_1_2 = Entity(STUB_E_ID_1, 26, 'cc')
        overlap_1_2.subclass = 1

        overlap_2_1 = Entity(STUB_E_ID_1, 32, '.. ddd')
        overlap_2_1.subclass = 2
        overlap_2_2 = Entity(STUB_E_ID_1, 36, 'ddd ...')
        overlap_2_2.subclass = 2

        overlap_3_1 = Entity(STUB_E_ID_1, 65, 'gggg')
        overlap_3_1.subclass = 1
        overlap_3_2 = Entity(STUB_E_ID_1, 62, '.. gggg ..')
        overlap_3_2.subclass = 2

        missing_1 = Entity('e2', 45, 'eeee')
        missing_1.subclass = 1
        missing_2 = Entity('e2', 84, 'jjjj')
        missing_2.subclass = 1

        spurious = Entity('e2', 15, 'bbbb')
        spurious.subclass = 1

        part_1.annotations = [
            exact_1, exact_2, exact_3, overlap_1_1, overlap_2_1, overlap_3_1,
            missing_1, missing_2
        ]
        part_1.predicted_annotations = [
            exact_1, exact_2, exact_3, overlap_1_2, overlap_2_2, overlap_3_2,
            spurious
        ]
Example #13
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        cls.part = Part(
            'Here is a random sentence for the benefit of your mamma')
        cls.entity = Entity(class_id=STUB_ENTITY_CLASS_ID,
                            offset=10,
                            text='random sentence',
                            confidence=0)
        cls.part.annotations.append(cls.entity)
        cls.doc.parts['s1h1'] = cls.part

        # Apply through pipeline

        NLTKSplitter().split(cls.dataset)
        NLTK_TOKENIZER.tokenize(cls.dataset)

        nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(nlp)
        cls.parser.parse(cls.dataset)
        # cls.part.percolate_tokens_to_entities()

        cls.sentence = cls.part.sentences[0]
Example #14
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        part1 = Part('Sentence 1: e_1_yolo may be related to e_2_tool plus hey, e_2_coco. Sentence 2: e_1_nin. Sentence 3: e_2_musk. Sentence 4: nothing')

        entities = [
            # Sent 1
            Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=12, text='e_1_yolo', confidence=0),
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=39, text='e_2_tool', confidence=0),
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=58, text='e_2_coco', confidence=0),
            # Sent 2
            Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=80, text='e_1_nin', confidence=0),
            # Sent 3
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=101, text='e_2_musk', confidence=0),
            # Sent 4

        ]

        for e in entities:
            part1.annotations.append(e)

        cls.doc.parts['s1h1'] = part1

        cls.splitter = NLTKSplitter()
        cls.tokenizer = NLTK_TOKENIZER

        cls.splitter.split(cls.dataset)
        cls.tokenizer.tokenize(cls.dataset)

        assert 4 == len(list(cls.dataset.sentences())), str(list(cls.dataset.sentences()))
Example #15
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()

        with open(self.corpus_file, encoding='utf-8') as file:

            for row in file:
                columns = row.split("\t")

                docid = columns[0]
                typ = columns[1]
                start = columns[2]
                end = columns[3]
                entity_text = columns[7]

                class_id = None
                if typ == 'Mutation':
                    class_id = self.mut_class_id
                elif typ == 'AminoacidResidue':
                    class_id = self.residue_class_id

                if class_id:
                    document = dataset.documents.get(docid, Document())

                    part = Part(entity_text)
                    document.parts[typ + '|' + start + '|' + end] = part

                    part.annotations.append(
                        Entity(class_id, int(start), entity_text))

                    dataset.documents[docid] = document

        return dataset
Example #16
    def read(self):
        """
        read each .txt file in the directory, parse it and create and instance of Document
        form a dataset consisting of every document parsed and return it

        :returns structures.data.Dataset
        """
        dataset = Dataset()
        with open(self.corpus_file, encoding='utf-8') as file:
            reader = csv.reader(file, delimiter='\t')
            for row in reader:
                document = Document()
                document.parts['abstract'] = Part(row[1])
                dataset.documents[row[0]] = document

        return dataset
Example #17
    def download(self, pmids):
        for pmid in pmids:
            if pmid in self.cache:
                xml = ET.fromstring(self.cache[pmid])
            else:
                req = requests.get(self.pubmed_url, {
                    'db': 'pubmed',
                    'retmode': 'xml',
                    'id': pmid
                })
                text = req.text
                xml = ET.fromstring(text)
                self.cache[pmid] = text

            doc = Document()

            if self.one_part:
                joined_text = '\n'.join(
                    element.text
                    for element in chain(xml.findall('.//ArticleTitle'),
                                         xml.findall('.//AbstractText')))
                doc.parts['title_and_abstract'] = Part(joined_text)
            else:
                # for now only include title and abstract
                title_elem = xml.find('.//ArticleTitle')
                if title_elem is not None:
                    doc.parts['title'] = Part(title_elem.text)

                abstract_elem = xml.findall('.//AbstractText')
                if abstract_elem:  # findall returns a (possibly empty) list, never None
                    abstract_elems = []
                    for elem in abstract_elem:
                        if 'Label' in elem.attrib and elem.attrib[
                                'Label'] != 'UNLABELLED':
                            abstract_elems.append('{}: {}'.format(
                                elem.attrib['Label'], elem.text))
                        else:
                            abstract_elems.append(elem.text)

                    abstract_elems = filter(None, abstract_elems)

                    doc.parts['abstract'] = Part(' '.join(abstract_elems))

            # yield the document but only if you found anything
            if len(doc.parts) > 0:
                yield pmid, doc
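A minimal consumption sketch for this generator; 'downloader' is assumed to be an instance of the (unshown) class that defines download, and the PMIDs are made up:

    from nalaf.structures.data import Dataset

    # downloader: instance of the class defining download() above (assumption)
    dataset = Dataset()
    for pmid, doc in downloader.download(['12345678', '23456789']):
        dataset.documents[pmid] = doc
    print(len(dataset.documents))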
Example #18
    def setup_class(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()

        doc_id1 = Document()
        # 15 tokens in 2 sentences
        doc_id1.parts['p1'] = Part(
            'This is some sample text. This is another, sample sentence with coma.'
        )
        doc_id1.parts['p1'].sentences_ = [
            'This is some sample text.',
            'This is another, sample sentence with coma.'
        ]

        cls.dataset.documents['doc_id1'] = doc_id1

        cls.tokenizer = NLTK_TOKENIZER
        cls.tokenizer.tokenize(cls.dataset)
Example #19
    def setup_class(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()

        doc_id1 = Document()
        # 2 sentences
        doc_id1.parts['p1'] = Part(
            'this is some sample text. it contains this c.2708_2711delTTAG mutation.'
        )
        doc_id1.parts['p1'].sentences_ = [
            'this is some sample text.',
            'it contains this c.2708_2711delTTAG mutation.'
        ]

        cls.dataset.documents['doc_id1'] = doc_id1

        cls.tokenizer = TmVarTokenizer()
        cls.tokenizer.tokenize(cls.dataset)
Example #20
    def __process_file(filename):
        document = Document()
        with open(filename, encoding='utf-8') as file:
            part_id = 1
            for part in re.split('\n\n', file.read()):
                if part.strip():
                    document.parts['{}'.format(part_id)] = Part(part)
                    part_id += 1

        return os.path.split(filename)[-1], document
Example #21
    def setUp(self):
        part = Part('Make making made. Try tried tries.')
        part.sentences = [[Token('Make', 0), Token('making', 5), Token('made', 12)],
                          [Token('Try', 18), Token('tried', 22), Token('tries', 28)]]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.generator = PorterStemFeatureGenerator()
Example #22
    def setUpClass(cls):
        cls.dataset = Dataset()

        doc1 = Document()
        cls.dataset.documents['TEST_SENTENCES_SINGLE_ROOT'] = doc1

        for s in TEST_SENTENCES_SINGLE_ROOT:
            part = Part(s)
            doc1.parts[s] = part

        doc2 = Document()
        cls.dataset.documents['TEST_SENTENCES_MULTI_ROOT'] = doc2

        for s in TEST_SENTENCES_MULTI_ROOT:
            part = Part(s)
            doc2.parts[s] = part

        cls.nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(cls.nlp)
        cls.splitter = NLTKSplitter()
        cls.tokenizer = GenericTokenizer(
            lambda string: (tok.text for tok in cls.nlp.tokenizer(string)))

        cls.splitter.split(cls.dataset)
        cls.tokenizer.tokenize(cls.dataset)
        cls.parser.parse(cls.dataset)

        cls.computed_sentences = []

        for sentence in cls.dataset.sentences():
            dist, then = compute_shortest_paths(sentence)
            cls.computed_sentences.append((dist, then, sentence))
Example #23
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()

        with open(self.corpus_file, encoding='utf-8') as file:
            documents = file.read().strip().split('\n\n')
            for document_text in documents:
                lines = document_text.strip().splitlines()

                first_line = re.search(r'(\d+)\|t\|(.*)', lines[0])
                doc_id = first_line.group(1)
                tmvar_title = first_line.group(2)
                tmvar_abstract = re.search(r'(\d+)\|a\|(.*)', lines[1]).group(2)

                document = Document()
                title = Part(tmvar_title)
                abstract = Part(tmvar_abstract)
                document.parts['title'] = title
                document.parts['abstract'] = abstract

                for line in lines[2:]:
                    _, start, end, _, _, _ = line.split('\t')
                    start = int(start)
                    end = int(end)

                    if 0 <= start < end <= len(tmvar_title):
                        part = title
                    else:
                        part = abstract
                        start -= len(tmvar_title) + 1
                        end -= len(tmvar_title) + 1

                    part.annotations.append(
                        Entity(self.mut_class_id, start, part.text[start:end]))

                dataset.documents[doc_id] = document

        return dataset
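For reference, a made-up miniature of the tmVar/PubTator-style input this reader expects: a '|t|' title line, an '|a|' abstract line, then tab-separated annotation lines with document-global character offsets (fields after id/start/end are ignored above):

    sample = (
        "12345|t|A BRCA1 mutation study.\n"
        "12345|a|We report the c.68_69delAG variant.\n"
        "12345\t38\t50\tc.68_69delAG\tMutation\tNone\n"
    )
    # Offsets 38..50 lie past the 23-character title, so the reader maps the
    # annotation into the abstract at local offsets 14..26 ('c.68_69delAG').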
Example #24
    def setUpClass(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()

        doc_id1 = Document()

        doc_id1.parts['t1'] = Part('This title blows your mind')

        text = str(
            'This magic only exists in your dreams. To become reality, you have to work at it. '
            'Thr is only available with the residue threonine and a mutation, '
            'though things can change positions '
            'when adding some more replacements. Between me being sorry '
            'and you being an insertion.')
        doc_id1.parts['p1'] = Part(text.replace('\n', ''))

        cls.dataset.documents['doc_id1'] = doc_id1

        NLTKSplitter().split(cls.dataset)
        TmVarTokenizer().tokenize(cls.dataset)

        cls.feature = NLMentionFeatureGenerator(thr=4)
        cls.feature.generate(dataset=cls.dataset)
Example #25
    def setup_class(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()
        part = Part('some text c.A100G p.V100Q some text')
        part.sentences = [[
            Token('some', 0),
            Token('text', 5),
            Token('c', 10),
            Token('.', 11),
            Token('A', 12),
            Token('100', 13),
            Token('G', 16),
            Token('p', 18),
            Token('.', 19),
            Token('V', 20),
            Token('100', 21),
            Token('Q', 24),
            Token('some', 26),
            Token('text', 31)
        ]]

        predicted_labels = [
            'O', 'O', 'B', 'I', 'I', 'I', 'E', 'A', 'I', 'I', 'I', 'E', 'O',
            'O'
        ]

        for index, label in enumerate(predicted_labels):
            part.sentences[0][index].predicted_labels = [Label(label)]

        cls.dataset.documents['doc_1'] = Document()
        cls.dataset.documents['doc_1'].parts['p1'] = part

        part = Part('test edge case DNA A927B test')
        part.sentences = [[
            Token('test', 0),
            Token('edge', 5),
            Token('case', 10),
            Token('DNA', 15),
            Token('A', 19),
            Token('927', 20),
            Token('B', 23),
            Token('test', 25)
        ]]

        predicted_labels = ['O', 'O', 'O', 'O', 'M', 'P', 'M', 'O']

        for index, label in enumerate(predicted_labels):
            part.sentences[0][index].predicted_labels = [Label(label)]

        cls.dataset.documents['doc_1'].parts['p2'] = part
Example #26
    def read(self):
        """
        read each .txt file in the directory, parse it and create and instance of Document
        form a dataset consisting of every document parsed and return it

        :returns structures.data.Dataset
        """
        dataset = Dataset()
        with open(self.corpus_folder, encoding='utf-8') as file:
            reader = csv.reader(file, delimiter='\t')
            for row in reader:
                docid, title, abstract = row
                title = title.strip()
                abstract = abstract.strip()

                document = Document()
                if title:
                    document.parts['title'] = Part(title)
                if abstract and abstract != 'null':
                    document.parts['abstract'] = Part(abstract)

                dataset.documents[docid] = document

        return dataset
Example #27
    def setUp(self):
        part = Part('Make making made. Try tried tries.')
        part.sentences = [[
            Token('Make', 0),
            Token('making', 5),
            Token('made', 12)
        ], [Token('Try', 18),
            Token('tried', 22),
            Token('tries', 28)]]
        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        for token in self.dataset.tokens():
            token.features['a'] = 'a'
            token.features['b'] = 'b'
Example #28
    def setUp(self):
        part = Part('Word1 word2 word3. Word4 word5 word6.')
        part.sentences = [[
            Token('Word1', 0),
            Token('word2', 6),
            Token('word3', 12)
        ], [Token('Word4', 19),
            Token('word5', 25),
            Token('word6', 31)]]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.simple_generator = SimpleFeatureGenerator()
        self.sentence_generator = SentenceMarkerFeatureGenerator()
Example #29
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()
        with open(self.path, 'r', encoding='utf-8') as f:

            tree = ET.parse(f)
            # level document
            for element in tree.iterfind('Article'):
                doc = Document()

                # pmid <Pmid>
                pmid = element[0].text

                # title <Title>
                title = element[1].text
                if not title:
                    title = ""
                title_annotations = []
                for child in element[1]:
                    if child.tag == 'variant':
                        entity = Entity(self.mut_class_id, len(title),
                                        child.text)
                        title_annotations.append(entity)
                    # unfortunately child.text or child.tail can be None, which cannot be concatenated to a string
                    try:
                        title += child.text
                    except TypeError:
                        pass
                    try:
                        title += child.tail
                    except TypeError:
                        pass
                part_title = Part(title)
                part_title.annotations.extend(title_annotations)

                # body - abstract <Abstract>
                abstract = element[2].text
                if not abstract:
                    abstract = ""
                abstract_annotations = []
                for child in element[2]:
                    if child.tag == 'variant':
                        entity = Entity(self.mut_class_id, len(abstract),
                                        child.text)
                        abstract_annotations.append(entity)
                    # unfortunately child.text or child.tail can be None, which cannot be concatenated to a string
                    try:
                        abstract += child.text
                    except TypeError:
                        pass
                    try:
                        abstract += child.tail
                    except TypeError:
                        pass
                part_abstract = Part(abstract)
                part_abstract.annotations.extend(abstract_annotations)

                # save part to document
                doc.parts['title'] = part_title
                doc.parts['abstract'] = part_abstract
                dataset.documents[pmid] = doc  # save document to dataset
        return dataset
Example #30
    def read(self):
        """
        :returns: nalaf.structures.data.Dataset
        """
        dataset = Dataset()
        for filename in glob.glob(self.path + '/*.txt'):
            with open(filename, 'r', encoding='utf-8') as f:
                data = f.read()

                content = data.split("\n")
                try:
                    pmid = int(content[0])
                except ValueError:
                    continue

                doc = Document()

                title = content[2]
                part_title = Part(title, is_abstract=True)
                body = content[4]
                part_abstract = Part(body, is_abstract=True)

                title_offset = len(str(pmid)) + 2  # +2 for twice newline
                body_offset = title_offset + len(title) + 2  # +2 for twice newline

                # temporary state for merging adjacent annotation spans
                current_annotation = []
                last_element = None

                with open(filename + '.ann', 'r') as fa:
                    tree = ET.parse(fa)
                    for element in tree.iterfind(
                            'Annotation/Annotation[@type]'):
                        # if gene annotation skip
                        if element.attrib['type'] == 'ge':
                            continue

                        # if last element is empty (beginning of new doc) save as last_element and skip
                        if last_element is None:
                            last_element = element
                            continue

                        span = last_element.attrib['span'].split('..')
                        start = int(span[0])
                        end = int(span[1])
                        text = data[start:end]

                        if start >= body_offset:
                            norm_start = start - body_offset
                            norm_end = end - body_offset
                        else:
                            norm_start = start - title_offset
                            norm_end = end - title_offset

                        if end + 1 == int(element.attrib['span'].split('..')[0]):
                            # todo bugfix: still wrong if a space falls inside the whole annotation, case: "#1632 T"
                            if len(current_annotation) == 0:  # no series of annotations linked yet
                                current_annotation.append(norm_start)
                                current_annotation.append(norm_end)
                                current_annotation.append(text)
                                current_annotation.append(start >= body_offset)  # if is_body
                            else:  # a series is already accumulating; extend it
                                current_annotation[1] = norm_end
                                current_annotation[2] += text
                        else:
                            if len(current_annotation) > 0:
                                entity = Entity(self.mut_class_id,
                                                current_annotation[0],
                                                current_annotation[2])
                                if current_annotation[3]:
                                    part_abstract.annotations.append(entity)
                                else:
                                    part_title.annotations.append(entity)
                                current_annotation = []

                            entity = Entity(self.mut_class_id, norm_start,
                                            text)
                            if start >= body_offset:
                                part_abstract.annotations.append(entity)
                            else:
                                part_title.annotations.append(entity)

                        last_element = element

                    span = last_element.attrib['span'].split('..')
                    start = int(span[0])
                    end = int(span[1])
                    text = data[start:end]
                    if len(current_annotation) == 0:  # if no series of annotations linked
                        if start >= body_offset:
                            norm_start = start - body_offset
                            is_body = True
                        else:
                            norm_start = start - title_offset
                            is_body = False

                        entity = Entity(self.mut_class_id, norm_start, text)

                        if is_body:
                            part_abstract.annotations.append(entity)
                        else:
                            part_title.annotations.append(entity)

                    else:  # a series is already accumulating; extend and flush it
                        current_annotation[2] += text
                        entity = Entity(self.mut_class_id,
                                        current_annotation[0],
                                        current_annotation[2])
                        if current_annotation[3]:
                            part_abstract.annotations.append(entity)
                        else:
                            part_title.annotations.append(entity)

                doc.parts['title'] = part_title
                doc.parts['abstract'] = part_abstract
                dataset.documents[pmid] = doc

        return dataset