Example No. 1
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        part1 = Part('Sentence 1: e_1_yolo may be related to e_2_tool plus hey, e_2_coco. Sentence 2: e_1_nin. Sentence 3: e_2_musk. Sentence 4: nothing')

        entities = [
            # Sent 1
            Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=12, text='e_1_yolo', confidence=0),
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=39, text='e_2_tool', confidence=0),
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=58, text='e_2_coco', confidence=0),
            # Sent 2
            Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=80, text='e_1_nin', confidence=0),
            # Sent 3
            Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=101, text='e_2_musk', confidence=0),
            # Sent 4

        ]

        for e in entities:
            part1.annotations.append(e)

        cls.doc.parts['s1h1'] = part1

        cls.splitter = NLTKSplitter()
        cls.tokenizer = NLTK_TOKENIZER

        cls.splitter.split(cls.dataset)
        cls.tokenizer.tokenize(cls.dataset)

        # assert False, str(list(cls.dataset.sentences()))
        assert 4 == len(list(cls.dataset.sentences())), str(list(cls.dataset.sentences()))
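Most of the examples on this page follow the same preprocessing pattern: build a Dataset of Document/Part objects, split it into sentences, then tokenize. A minimal, self-contained sketch of that pipeline is shown below; the nalaf.structures.data import path is an assumption based on the usual nalaf layout, while the splitter and tokenizer imports match Example No. 13.

# Minimal split/tokenize sketch (nalaf.structures.data path is assumed; adjust to your nalaf version)
from nalaf.structures.data import Dataset, Document, Part
from nalaf.preprocessing.spliters import NLTKSplitter
from nalaf.preprocessing.tokenizers import TmVarTokenizer

dataset = Dataset()
doc = Document()
doc.parts['body'] = Part('This is one sentence. This is another one.')
dataset.documents['demo'] = doc

NLTKSplitter().split(dataset)        # fills each part's sentences_ with raw sentence strings
TmVarTokenizer().tokenize(dataset)   # fills each part's sentences with token objects

for sentence in dataset.sentences():
    print([token.word for token in sentence])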
Example No. 2
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        # TEXT = "123 45678"
        # POS  = "012345678"
        # ANN1 = " X       "
        # ANN2 = "     XXX "
        # PAR1 = "XXX      "
        # PAR2 = "    XXXXX"

        cls.part = Part(
            'Here is a random sentence for the benefit of your mamma')
        cls.entity = Entity(class_id=STUB_ENTITY_CLASS_ID,
                            offset=10,
                            text='random sentence',
                            confidence=0)
        cls.part.annotations.append(cls.entity)
        cls.doc.parts['s1h1'] = cls.part

        # Apply through pipeline

        NLTKSplitter().split(cls.dataset)
        NLTK_TOKENIZER.tokenize(cls.dataset)

        nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(nlp)
        cls.parser.parse(cls.dataset)
        # cls.part.percolate_tokens_to_entities()

        cls.sentence = cls.part.sentences[0]
Example No. 3
    def setUpClass(cls):
        cls.dataset = Dataset()

        doc1 = Document()
        cls.dataset.documents['TEST_SENTENCES_SINGLE_ROOT'] = doc1

        for s in TEST_SENTENCES_SINGLE_ROOT:
            part = Part(s)
            doc1.parts[s] = part

        doc2 = Document()
        cls.dataset.documents['TEST_SENTENCES_MULTI_ROOT'] = doc2

        for s in TEST_SENTENCES_MULTI_ROOT:
            part = Part(s)
            doc2.parts[s] = part

        cls.nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(cls.nlp)
        cls.splitter = NLTKSplitter()
        cls.tokenizer = GenericTokenizer(
            lambda string: (tok.text for tok in cls.nlp.tokenizer(string)))

        cls.splitter.split(cls.dataset)
        cls.tokenizer.tokenize(cls.dataset)
        cls.parser.parse(cls.dataset)

        cls.computed_sentences = []

        for sentence in cls.dataset.sentences():
            dist, then = compute_shortest_paths(sentence)
            cls.computed_sentences.append((dist, then, sentence))
Example No. 4
    def __init__(self, splitter=None, tokenizer=None, feature_generators=None):
        if not splitter:
            splitter = NLTKSplitter()
        if not tokenizer:
            tokenizer = TmVarTokenizer()
        if feature_generators is None:
            feature_generators = [SimpleFeatureGenerator(), PorterStemFeatureGenerator(),
                                  WindowFeatureGenerator((-3, -2, -1, 1, 2, 3), ['stem[0]'])]

        if isinstance(splitter, Splitter):
            self.splitter = splitter
        else:
            raise TypeError('not an instance that implements Splitter')

        if isinstance(tokenizer, Tokenizer):
            self.tokenizer = tokenizer
        else:
            raise TypeError('not an instance that implements Tokenizer')

        if hasattr(feature_generators, '__iter__'):
            for index, feature_generator in enumerate(feature_generators):
                if not isinstance(feature_generator, FeatureGenerator):
                    raise TypeError('not an instance that implements FeatureGenerator at index {}'.format(index))
            self.feature_generators = feature_generators
        elif isinstance(feature_generators, FeatureGenerator):
            self.feature_generators = [feature_generators]
        else:
            raise TypeError('not an instance or iterable of instances that implements FeatureGenerator')
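The constructor above accepts either a single FeatureGenerator or an iterable of them, falls back to defaults when nothing is passed, and rejects anything that does not implement the expected interfaces. A hedged usage sketch follows; the class name PrepPipeline is hypothetical, since the example only shows the __init__ body.

# Hypothetical usage of the constructor above ('PrepPipeline' is a made-up class name)
pipeline = PrepPipeline()   # defaults: NLTKSplitter, TmVarTokenizer, and the three default feature generators

# a single FeatureGenerator is accepted and wrapped into a one-element list
pipeline = PrepPipeline(feature_generators=SimpleFeatureGenerator())

# anything that does not implement Splitter/Tokenizer/FeatureGenerator raises TypeError
try:
    PrepPipeline(splitter='not a Splitter instance')
except TypeError as err:
    print(err)   # not an instance that implements Splitter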
Example No. 5
    def _apply_pipeline(self, dataset):
        # Apply through pipeline

        NLTKSplitter().split(dataset)
        NLTK_TOKENIZER.tokenize(dataset)
        # nlp = get_spacy_nlp_english(load_parser=False)
        # cls.parser = SpacyParser(nlp)
        # cls.parser.parse(cls.dataset)
        return dataset
Example No. 6
 def setUp(self):
     self.dataset = StringReader(
         'some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text').read()
     NLTKSplitter().split(self.dataset)
     TmVarTokenizer().tokenize(self.dataset)
     part = list(self.dataset.parts())[0]
     part.annotations.append(
         Entity(STUB_ENTITY_CLASS_ID, 15, 'c.2708_2711delTTAG'))
     part.annotations.append(
         Entity(STUB_ENTITY_CLASS_ID, 35, 'p.V903GfsX905'))
Example No. 7
    def test_split(self):
        NLTKSplitter().split(self.dataset)

        sentences_ = []
        for document in self.dataset.documents.values():
            for part in document.parts.values():
                sentences_ += part.sentences_

        expected = ['This is one sentence.', 'This is another one.', 'This is the third one; here continues.']

        self.assertEqual(sentences_, expected)
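For reference, the nested loop over documents and parts above can be flattened with the dataset's own part iterator (the same Dataset.parts() call used in Examples No. 6 and No. 11); a small equivalent sketch follows.

# equivalent collection of the raw sentence strings produced by the splitter
sentences_ = [raw_sentence
              for part in self.dataset.parts()
              for raw_sentence in part.sentences_]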
Example No. 8
    def setUpClass(cls):
        text1 = "Flowers in the Rain. Are absolutely marvellous. Though i would say this text is stupid. Cheers!"

        part1 = Part(text1)
        doc = Document()
        doc.parts['firstpart'] = part1
        dataset = Dataset()
        dataset.documents['firstdocument'] = doc

        NLTKSplitter().split(dataset)
        # TmVarTokenizer().tokenize(dataset)
        cls.data = dataset
        cls.testpart = dataset.documents['firstdocument'].parts['firstpart']
Example No. 9
    def test_generate_patterns_245(self):
        dataset = StringReader('token c.A436C token').read()
        NLTKSplitter().split(dataset)
        TmVarTokenizer().tokenize(dataset)
        TmVarDictionaryFeatureGenerator().generate(dataset)

        token_features = [{key: value for key, value in token.features.items() if value != 'O'}
                          for token in dataset.tokens()]
        self.assertEqual(token_features[0], {})
        self.assertEqual(token_features[1], {'pattern4[0]': 'B', 'pattern2[0]': 'B'})
        self.assertEqual(token_features[2], {'pattern4[0]': 'I', 'pattern2[0]': 'I'})
        self.assertEqual(token_features[3], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'B'})
        self.assertEqual(token_features[4], {'pattern4[0]': 'I', 'pattern2[0]': 'I', 'pattern5[0]': 'I'})
        self.assertEqual(token_features[5], {'pattern4[0]': 'E', 'pattern2[0]': 'I', 'pattern5[0]': 'E'})
        self.assertEqual(token_features[6], {})
Example No. 10
 def test_tag(self):
     # todo question is that the proper way? with predicts_classes
     GNormPlusGeneTagger().tag(self.data, uniprot=True)
     NLTKSplitter().split(self.data)
     TmVarTokenizer().tokenize(self.data)
     StubSameSentenceRelationExtractor(PRO_CLASS_ID, MUT_CLASS_ID, PRO_REL_MUT_CLASS_ID).annotate(self.data)
     self.assertEqual(len([x for x in self.data.annotations() if x.class_id == PRO_CLASS_ID]), 0)
     self.assertEqual(len([x for x in self.data.annotations() if x.class_id == MUT_CLASS_ID]), 2)
     self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)
     self.data.purge_false_relationships()
     self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)
     del self.data.documents['15878741'].parts['abstract'].annotations[0]
     self.assertEqual(len([x for x in self.data.annotations() if x.class_id == MUT_CLASS_ID]), 1)
     self.data.purge_false_relationships()
     self.assertEqual(len([x for x in self.data.relations() if x.class_id == PRO_REL_MUT_CLASS_ID]), 0)
Example No. 11
    def _get_test_data(self, entity_sentence, assumed_tokens_words=None):
        if assumed_tokens_words is None:
            assumed_tokens_words = entity_sentence.split(' ')

        # Create dataset

        dataset = StringReader(entity_sentence).read()
        part = next(dataset.parts())
        entity = Entity(class_id=STUB_ENTITY_CLASS_ID,
                        offset=0,
                        text=entity_sentence)
        part.annotations.append(entity)

        # Apply through pipeline

        NLTKSplitter().split(dataset)
        NLTK_TOKENIZER.tokenize(dataset)
        self.parser.parse(dataset)

        # Rest

        sentences = part.sentences
        assert len(sentences) == 1
        sentence = sentences[0]

        assert len(assumed_tokens_words) == len(sentence)
        for (assumed_token_word, actual_token) in zip(assumed_tokens_words,
                                                      sentence):
            assert assumed_token_word == actual_token.word

        part.compute_tokens_depth()
        roots = Part.get_sentence_roots(sentence)
        for r in roots:
            self._assert_depth_eq(r, 0)

        part.set_entities_head_tokens()

        return (dataset, sentence, entity, roots)
Example No. 12
    def setUpClass(cls):
        # create a sample dataset to test
        cls.dataset = Dataset()

        doc_id1 = Document()

        doc_id1.parts['t1'] = Part('This title blows your mind')

        text = str(
            'This magic only exists in your dreams. To become reality, you have to work at it. '
            'Thr is only available with the residue threonine and a mutation, '
            'though things can change positions '
            'when adding some more replacements. Between me being sorry '
            'and you being an insertion.')
        doc_id1.parts['p1'] = Part(text.replace('\n', ''))

        cls.dataset.documents['doc_id1'] = doc_id1

        NLTKSplitter().split(cls.dataset)
        TmVarTokenizer().tokenize(cls.dataset)

        cls.feature = NLMentionFeatureGenerator(thr=4)
        cls.feature.generate(dataset=cls.dataset)
Example No. 13
from nala.utils.corpora import get_corpus
from nalaf.preprocessing.spliters import NLTKSplitter
from nalaf.preprocessing.tokenizers import TmVarTokenizer

data = get_corpus('nala_training_1')

NLTKSplitter().split(data)
TmVarTokenizer().tokenize(data)
from nalaf.features.embeddings import BinarizedWordEmbeddingsFeatureGenerator

BinarizedWordEmbeddingsFeatureGenerator(
    '/home/abojchevski/projects/nala/nala/data/word_embeddings_2016-03-28/word_embeddings.model'
).generate(data)

for token in data.tokens():
    print(token.features, token.end)
Example No. 14
 def __init__(self):
     self.data = get_corpus('IDP4+')
     NLTKSplitter().split(self.data)
     TmVarTokenizer().tokenize(self.data)
Example No. 15
    def filter(self, documents, min_found=1, use_nala=False):
        """
        :type documents: collections.Iterable[(str, nalaf.structures.data.Document)]
        """

        _progress = 1
        _start_time = time.time()
        _total_time = 0

        _time_avg_per_pattern = 0
        _pattern_calls = 0
        _time_reg_pattern_total = 0
        _time_max_pattern = 0
        _low_performant_pattern = ""

        # NLDefiners init
        exclusive_definer = ExclusiveNLDefiner()
        _e_array = [0, 0, 0]
        inclusive_definer = InclusiveNLDefiner()
        _i_array = [0, 0]

        last_found = 0
        crf = PyCRFSuite(self.location_binary_model)

        # counter_to_stop_for_caching = 0

        for pmid, doc in documents:
            # if any part of the document contains any of the keywords
            # yield that document

            # if counter_to_stop_for_caching > 400:
            #     break
            # counter_to_stop_for_caching += 1
            # print(counter_to_stop_for_caching)

            part_offset = 0
            data_tmp = Dataset()
            data_tmp.documents[pmid] = doc
            data_nala = deepcopy(data_tmp)
            NLTKSplitter().split(data_tmp)
            # data_tmvar = TmVarTagger().generate_abstracts([pmid])
            if use_nala:
                self.pipeline.execute(data_nala)
                self.labeler.label(data_nala)
                crf.tag(data_nala, MUT_CLASS_ID)
                PostProcessing().process(data_nala)
                ExclusiveNLDefiner().define(data_nala)

            used_regexs = {}

            positive_sentences = 0
            for i, x in enumerate(doc.parts):
                # print("Part", i)
                sent_offset = 0
                cur_part = doc.parts.get(x)
                sentences = cur_part.sentences_

                for sent in sentences:
                    sent_length = len(sent)
                    new_text = sent.lower()
                    new_text = re.sub(r'[\./\-(){}\[\],%]', ' ', new_text)
                    # new_text = re.sub('\W+', ' ', new_text)

                    found_in_sentence = False

                    for i, reg in enumerate(self.patterns):
                        _lasttime = time.time()  # time start var
                        match = reg.search(new_text)

                        # debug bottleneck patterns
                        _time_current_reg = time.time() - _lasttime  # time end var
                        _pattern_calls += 1  # pattern calls already occured
                        _time_reg_pattern_total += _time_current_reg  # total time spent on searching with patterns
                        if _time_reg_pattern_total > 0:
                            _time_avg_per_pattern = _time_reg_pattern_total / _pattern_calls  # avg spent time per pattern call
                        # todo create pattern performance eval for descending amount of recognized patterns
                        # if _pattern_calls > len(patterns) * 20 and _time_avg_per_pattern * 10000 < _time_current_reg:
                        #     print("BAD_PATTERN_PERFORMANCE:", _time_avg_per_pattern, _time_current_reg, reg.pattern)
                        # if _time_max_pattern < _time_current_reg:
                        #     _time_max_pattern = _time_current_reg
                        #     _low_performant_pattern = reg.pattern
                        #     print(_time_avg_per_pattern, _low_performant_pattern, _time_max_pattern)

                        # if reg.pattern == r'(\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (\b\w*\d+\w*\b\s?){1,3} (\b\w+\b\s?){1,4} (deletion|deleting|deleted)':
                        #     if _time_current_reg > _time_avg_per_pattern * 10:
                        #         # print(_time_avg_per_pattern, _time_current_reg)
                        #         f.write("BAD_PATTERN\n")
                        #         f.write(sent + "\n")
                        #         f.write(new_text + "\n")
                        if match:
                            # if pmid in data_tmvar.documents:
                            #     anti_doc = data_tmvar.documents.get(pmid)
                            nala_doc = data_nala.documents.get(pmid)

                            start = part_offset + sent_offset + match.span()[0]
                            end = part_offset + sent_offset + match.span()[1]
                            # print("TmVar is not overlapping?:", not anti_doc.overlaps_with_mention(start, end))
                            # print(not nala_doc.overlaps_with_mention(start, end, annotated=False))

                            if reg.pattern in used_regexs:
                                used_regexs[reg.pattern] += 1
                            else:
                                used_regexs[reg.pattern] = 1
                            print(color.PURPLE + new_text.replace(
                                match.group(), color.BOLD + color.DARKCYAN +
                                color.UNDERLINE + match.group() + color.END +
                                color.PURPLE) + color.END)
                            if not found_in_sentence:
                                positive_sentences += 1
                                found_in_sentence = True
                                # if not anti_doc.overlaps_with_mention(start,
                                #                                       end) \
                                #         and not nala_doc.overlaps_with_mention(start, end, annotated=False):
                                #     _e_result = exclusive_definer.define_string(
                                #         new_text[match.span()[0]:match.span()[1]])
                                #     _e_array[_e_result] += 1
                                #     _i_result = inclusive_definer.define_string(
                                #         new_text[match.span()[0]:match.span()[1]])
                                #     _i_array[_i_result] += 1
                                # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                                # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))

                                # last_found += 1
                                # found_in_sentence = True
                                # else:
                                #     # if nala not used only tmvar considered
                                #     if not anti_doc.overlaps_with_mention(start, end):
                                #         _e_result = exclusive_definer.define_string(
                                #             new_text[match.span()[0]:match.span()[1]])
                                #         _e_array[_e_result] += 1
                                #         _i_result = inclusive_definer.define_string(
                                #             new_text[match.span()[0]:match.span()[1]])
                                #         _i_array[_i_result] += 1
                                #         # todo write to file param + saving to manually annotate and find tp + fp for performance eval on each pattern
                                #         # print("e{}\ti{}\t{}\t{}\t{}\n".format(_e_result, _i_result, sent, match, reg.pattern))
                                #         last_found += 1
                                #         found_in_sentence = True

                            if use_nala:
                                nala_found_mention = nala_doc.overlaps_with_mention(
                                    start, end, annotated=False)
                                if nala_found_mention:
                                    print_verbose(nala_found_mention)
                                    if nala_found_mention.subclass > 0 and nala_found_mention.confidence <= self.threshold:
                                        yield pmid, doc

                        if time.time() - _lasttime > 1:
                            print_verbose('time intensive regex', i)
                    sent_offset += 2 + sent_length

                    # for per sentence positives
                    if found_in_sentence:
                        positive_sentences += 1

                part_offset += sent_offset
            if use_nala:
                for part in nala_doc:
                    for ann in part.predicted_annotations:
                        if ann.subclass > 0:
                            print_verbose(part.text[:ann.offset] + color.BOLD +
                                          ann.text + color.END +
                                          part.text[ann.offset +
                                                    len(ann.text):])
                            positive_sentences += min_found
            _old_time = _start_time
            _start_time = time.time()
            _one_time = _start_time - _old_time

            if _one_time > 0.3 and positive_sentences > min_found:
                _progress += 1
                _total_time += _one_time

            _time_per_doc = _total_time / _progress
            print_verbose(
                "PROGRESS: {:.2f} secs ETA per one positive document:"
                " {:.2f} secs".format(_total_time, _time_per_doc))
            print_debug('used regular expressions:',
                        json.dumps(used_regexs, indent=4))
            if positive_sentences >= min_found:
                last_found = 0
                print_verbose('YEP', pmid)
                yield pmid, doc
            else:
                print_verbose('NOPE', pmid)
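Since filter is a generator yielding (pmid, doc) pairs, it is consumed lazily. A brief consumption sketch is shown below; the class name NLFilter and its constructor argument are hypothetical placeholders, as the example does not show the enclosing class.

# hypothetical consumption of the generator above ('NLFilter' is a made-up class name)
nl_filter = NLFilter(location_binary_model='path/to/crf_model')
for pmid, doc in nl_filter.filter(documents, min_found=1, use_nala=False):
    # only documents with at least min_found pattern-positive sentences are yielded
    print('kept', pmid)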