Example #1
 def test_sentence_boundaries_simple(self):
     sentences = [
         sentence_factory("x x x\n" * 3),  # 3 words
         sentence_factory("x x x\n" * 2),  # 2 words
         sentence_factory("x x x\n" * 4),  # 4 words
     ]
     #          1st 2nd 3rd   end
     expected = [0, 3, 3 + 2, 3 + 2 + 4]
     self.assertEqual(get_sentence_boundaries(sentences), expected)
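
This test pins down the contract: boundary i is the cumulative token offset at which sentence i starts, with a final entry equal to the total token count. Below is a minimal sketch of a get_sentence_boundaries that satisfies it (and the empty-document test in Example #4); it assumes each sentence object built by sentence_factory reports its token count via len(), which the snippets here do not confirm.

from itertools import accumulate

def get_sentence_boundaries(sentences):
    # Running total of per-sentence token counts, starting at 0.
    # initial=0 (Python 3.8+) makes the result [0] for an empty document.
    return list(accumulate((len(s) for s in sentences), initial=0))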
Example #2
 def test_offsets_and_tokens_work_together(self):
     sentences = [
         sentence_factory("a x x\n" * 3),  # 3 words
         sentence_factory("b x x\n" * 2),  # 2 words
         sentence_factory("c x x\n" * 4),  # 4 words
         sentence_factory("d x x\n" * 5),  # 5 words
     ]
     words = get_tokens(sentences)
     offsets = get_sentence_boundaries(sentences)
     self.assertEqual(len(words), offsets[-1])
     self.assertEqual(words[offsets[1]], "b")
     self.assertEqual(words[offsets[1] - 1], "a")
     self.assertEqual(words[offsets[3]], "d")
     self.assertEqual(words[offsets[3] - 1], "c")
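
A matching sketch of get_tokens follows. It assumes each analysed sentence is an iterable of per-token dicts keyed by "token"; the exact key produced by the pipeline is not visible in these examples, so treat it as a placeholder.

def get_tokens(sentences):
    # Flatten the per-sentence tokens into one document-ordered list, so
    # that combined with get_sentence_boundaries(), words[offsets[i]] is
    # the first word of sentence i, as the assertions above require.
    return [token["token"] for sentence in sentences for token in sentence]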
Example #3
    def __call__(self, document):
        steps = [
            PreProcessSteps.tokenization,
            PreProcessSteps.sentencer,
            PreProcessSteps.tagging,
            PreProcessSteps.ner
        ]
        if not self.override and all(document.was_preprocess_step_done(step) for step in steps):
            return
        if not self.override and document.was_preprocess_step_done(PreProcessSteps.tokenization):
            raise NotImplementedError(
                "Running with mixed preprocess steps not supported, "
                "must be 100% StanfordMultiStepRunner")

        analysis = get_analizer().analize(document.text)
        analized_sentences = analysis_to_sentences(analysis)

        # Tokenization
        tokens = get_tokens(analized_sentences)
        offsets = get_token_offsets(analized_sentences)
        document.set_tokenization_result(list(zip(offsets, tokens)))

        # "Sentencing" (splitting into sentences)
        document.set_sentencer_result(get_sentence_boundaries(analized_sentences))

        # POS tagging
        document.set_tagging_result(get_pos(analized_sentences))

        # NER
        # Read from the tagged corpus instead of using NER results
        # xs = get_tagged_entity_occurrences(document, tokens)
        # ys = get_entity_occurrences(analized_sentences)
        # zs = [FoundEntity(key="{} {} {} {}".format(document.human_identifier, kind, i, j),
        #                   kind_name=kind,
        #                   alias=" ".join(tokens[i:j]),
        #                   offset=i,
        #                   offset_end=j)
        #       for i, j, kind in merge_entities(xs, ys)]
        
        # document.set_ner_result(zs)

        # Save progress so far, next step doesn't modify `document`
        document.save()

        # Coreference resolution
        for coref in get_coreferences(analysis):
            try:
                apply_coreferences(document, coref)
            except CoreferenceError as e:
                logger.warning(e)
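
For context, a hypothetical usage sketch of the runner above. The class name StanfordMultiStepRunner comes from the error message in __call__, and the constructor signature (an override flag) is an assumption, not something the snippet confirms.

runner = StanfordMultiStepRunner(override=False)
for document in documents:  # any iterable of not-yet-preprocessed documents
    # Runs tokenization, sentencing and tagging, saves the document,
    # then applies coreference resolution (NER is commented out here).
    runner(document)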
Example #4
 def test_sentence_boundaries_empty(self):
     self.assertEqual(get_sentence_boundaries([]), [0])
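
Note that even an empty document has one boundary: [0] marks both the start and the end of the (empty) token stream, so the invariant len(tokens) == offsets[-1] from Example #2 still holds. The accumulate-based sketch after Example #1 returns exactly this, since initial=0 is emitted even when there are no sentences.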