Example #1
0
    def setUpClass(cls):
        """Build a one-document sample dataset and run the TmVar feature generator on it."""
        document = Document()
        part = Part('insertionefsA dup23.23')
        # one sentence, pre-tokenized into 2 tokens with their character offsets
        part.sentences = [[Token('insertionefsA', 0), Token('dup23.23', 14)]]
        document.parts['p1'] = part

        cls.dataset = Dataset()
        cls.dataset.documents['doc_id1'] = document

        cls.feature = TmVarFeatureGenerator()
        cls.feature.generate(dataset=cls.dataset)
Example #2
0
    def tokenize(self, dataset):
        """
        Split each raw sentence string of every part into Token objects,
        recording each token's character offset within the part text.

        :type dataset: nalaf.structures.data.Dataset
        """
        for part in dataset.parts():
            search_from = 0
            tokenized_sentences = []
            for raw_sentence in part.sentences_:
                sentence_tokens = []
                for word in self.tokenize_string(raw_sentence):
                    # find the token in the part text starting after the previous
                    # token, so repeated words map to successive occurrences
                    start = part.text.find(word, search_from)
                    search_from = start + len(word)
                    sentence_tokens.append(Token(word, start))
                tokenized_sentences.append(sentence_tokens)
            part.sentences = tokenized_sentences
Example #3
0
    def setUp(self):
        """Two-sentence fixture for exercising the Porter-stem feature generator."""
        part = Part('Make making made. Try tried tries.')
        # pre-tokenized sentences with hand-computed character offsets
        part.sentences = [
            [Token('Make', 0), Token('making', 5), Token('made', 12)],
            [Token('Try', 18), Token('tried', 22), Token('tries', 28)],
        ]

        document = Document()
        document.parts['part_1'] = part

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = document

        self.generator = PorterStemFeatureGenerator()
Example #4
0
    def setUp(self):
        """Fixture: one part, two sentences; every token pre-seeded with features 'a' and 'b'."""
        part = Part('Make making made. Try tried tries.')
        # pre-tokenized sentences with hand-computed character offsets
        part.sentences = [
            [Token('Make', 0), Token('making', 5), Token('made', 12)],
            [Token('Try', 18), Token('tried', 22), Token('tries', 28)],
        ]

        document = Document()
        document.parts['part_1'] = part

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = document

        # seed each token with two dummy features
        for token in self.dataset.tokens():
            token.features['a'] = 'a'
            token.features['b'] = 'b'
Example #5
0
    def setUp(self):
        """Fixture with two generic-word sentences for the simple and sentence-marker generators."""
        part = Part('Word1 word2 word3. Word4 word5 word6.')
        # pre-tokenized sentences with hand-computed character offsets
        part.sentences = [
            [Token('Word1', 0), Token('word2', 6), Token('word3', 12)],
            [Token('Word4', 19), Token('word5', 25), Token('word6', 31)],
        ]

        document = Document()
        document.parts['part_1'] = part

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = document

        self.simple_generator = SimpleFeatureGenerator()
        self.sentence_generator = SentenceMarkerFeatureGenerator()
Example #6
0
    def setup_class(cls):
        """Build a one-document, two-part dataset whose tokens carry predicted labels."""
        cls.dataset = Dataset()
        cls.dataset.documents['doc_1'] = Document()

        # part p1: 'c.A100G' / 'p.V100Q' mention text, tokenized with offsets,
        # each token paired with its predicted label
        words_and_offsets = [
            ('some', 0), ('text', 5), ('c', 10), ('.', 11), ('A', 12),
            ('100', 13), ('G', 16), ('p', 18), ('.', 19), ('V', 20),
            ('100', 21), ('Q', 24), ('some', 26), ('text', 31),
        ]
        labels = ['O', 'O', 'B', 'I', 'I', 'I', 'E', 'A', 'I', 'I', 'I', 'E', 'O', 'O']

        part = Part('some text c.A100G p.V100Q some text')
        part.sentences = [[Token(word, offset) for word, offset in words_and_offsets]]
        for token, label in zip(part.sentences[0], labels):
            token.predicted_labels = [Label(label)]
        cls.dataset.documents['doc_1'].parts['p1'] = part

        # part p2: edge case with the 'A927B' mention near the end of the text
        words_and_offsets = [
            ('test', 0), ('edge', 5), ('case', 10), ('DNA', 15),
            ('A', 19), ('927', 20), ('B', 23), ('test', 25),
        ]
        labels = ['O', 'O', 'O', 'O', 'M', 'P', 'M', 'O']

        part = Part('test edge case DNA A927B test')
        part.sentences = [[Token(word, offset) for word, offset in words_and_offsets]]
        for token, label in zip(part.sentences[0], labels):
            token.predicted_labels = [Label(label)]
        cls.dataset.documents['doc_1'].parts['p2'] = part