def testNotEqual(self):
    """Checks the inequality operator of C{KBTextualUnit}.

    A unit built with identical fields (tu1) must not compare unequal to
    self._tu; units differing in normalized form (tu2) or corpus name
    (tu3) must compare unequal.
    """
    tu1 = model.KBTextualUnit("test-corpus",   # corpus name
                              "fr",            # language
                              "test",          # normalized form
                              ["test"],        # tokens
                              ["test"],        # lemmas
                              ["test"],        # stems
                              ["N"])           # POS tags
    tu2 = model.KBTextualUnit("test-corpus",   # corpus name
                              "fr",            # language
                              "test2",         # normalized form (differs)
                              ["test2"],       # tokens
                              ["test2"],       # lemmas
                              ["test2"],       # stems
                              ["N"])           # POS tags
    tu3 = model.KBTextualUnit("test-corpus2",  # corpus name (differs)
                              "fr",            # language
                              "test",          # normalized form
                              ["test"],        # tokens
                              ["test"],        # lemmas
                              ["test"],        # stems
                              ["N"])           # POS tags
    # failIf/failUnless are deprecated aliases (removed in Python 3.12);
    # use assertFalse/assertTrue.
    self.assertFalse(self._tu != tu1)
    self.assertTrue(self._tu != tu2)
    self.assertTrue(self._tu != tu3)
def setUp(self):
    """Builds the reference textual unit compared against in each test."""
    corpus, language, form = "test-corpus", "fr", "test"
    tokens = ["test"]    # also reused as lemmas and stems below
    pos_tags = ["N"]
    self._tu = model.KBTextualUnit(corpus,     # corpus name
                                   language,   # language
                                   form,       # normalized form
                                   tokens,     # tokens
                                   tokens,     # lemmas
                                   tokens,     # stems
                                   pos_tags)   # POS tags
def testNotEqual(self):
    """Checks the inequality operator of C{KBTextualUnitCluster}.

    A cluster with the same units and centroid (tuc1) must not compare
    unequal to self._tuc; clusters with a different centroid (tuc2) or a
    different unit set (tuc3) must compare unequal.
    """
    tuc1 = model.KBTextualUnitCluster()
    tuc2 = model.KBTextualUnitCluster()
    tuc3 = model.KBTextualUnitCluster()
    tu1 = model.KBTextualUnit("test-corpus",   # corpus name
                              "fr",            # language
                              "test",          # normalized form
                              ["test"],        # tokens
                              ["test"],        # lemmas
                              ["test"],        # stems
                              ["N"])           # POS tags
    tu2 = model.KBTextualUnit("test-corpus",   # corpus name
                              "fr",            # language
                              "test2",         # normalized form (differs)
                              ["test 2"],      # tokens
                              ["test 2"],      # lemmas
                              ["test 2"],      # stems
                              ["N"])           # POS tags
    tu3 = model.KBTextualUnit("test-corpus2",  # corpus name (differs)
                              "fr",            # language
                              "test",          # normalized form
                              ["test"],        # tokens
                              ["test"],        # lemmas
                              ["test"],        # stems
                              ["N"])           # POS tags
    # reference cluster: {tu1, tu2}, centroid tu2
    self._tuc.addTextualUnit(tu1)
    self._tuc.addTextualUnit(tu2)
    self._tuc.centroid = tu2
    # same units, same centroid -> equal
    tuc1.addTextualUnit(tu1)
    tuc1.addTextualUnit(tu2)
    tuc1.centroid = tu2
    # same units, different centroid -> not equal
    tuc2.addTextualUnit(tu1)
    tuc2.addTextualUnit(tu2)
    tuc2.centroid = tu1
    # different unit set -> not equal
    tuc3.addTextualUnit(tu2)
    tuc3.centroid = tu2
    # failIf/failUnless are deprecated aliases (removed in Python 3.12);
    # use assertFalse/assertTrue.
    self.assertFalse(self._tuc != tuc1)
    self.assertTrue(self._tuc != tuc2)
    self.assertTrue(self._tuc != tuc3)
def testSetWrongCentroid(self):
    """Checks that a unit absent from the cluster is rejected as centroid."""
    outside_unit = model.KBTextualUnit("test-corpus",  # corpus name
                                       "fr",           # language
                                       "test",         # normalized form
                                       ["test"],       # tokens
                                       ["test"],       # lemmas
                                       ["test"],       # stems
                                       ["N"])          # POS tags
    # the unit was never added to the cluster, so assigning it as
    # centroid must raise
    with self.assertRaises(exception.KBTextualUnitClusterException):
        self._tuc.centroid = outside_unit
def testAddExistingTextualUnit(self):
    """Checks that adding a duplicate textual unit to the cluster raises."""
    first_unit = model.KBTextualUnit("test-corpus",  # corpus name
                                     "fr",           # language
                                     "test",         # normalized form
                                     ["test"],       # tokens
                                     ["test"],       # lemmas
                                     ["test"],       # stems
                                     ["N"])          # POS tags
    duplicate_unit = model.KBTextualUnit("test-corpus",  # corpus name
                                         "fr",           # language
                                         "test",         # normalized form
                                         ["test"],       # tokens
                                         ["test"],       # lemmas
                                         ["test"],       # stems
                                         ["N"])          # POS tags
    self._tuc.addTextualUnit(first_unit)
    # a second, field-for-field identical unit must be rejected
    with self.assertRaises(exception.KBTextualUnitClusterException):
        self._tuc.addTextualUnit(duplicate_unit)
def testSetCentroid(self):
    """Checks that a unit belonging to the cluster can be set as centroid."""
    tu = model.KBTextualUnit("test-corpus",  # corpus name
                             "fr",           # language
                             "test",         # normalized form
                             ["test"],       # tokens
                             ["test"],       # lemmas
                             ["test"],       # stems
                             ["N"])          # POS tags
    self._tuc.addTextualUnit(tu)
    self._tuc.centroid = tu
    # failUnless is a deprecated alias (removed in Python 3.12);
    # assertEqual also gives a better failure message than a bare ==.
    self.assertEqual(self._tuc.centroid, tu)
def testAddTextualUnit(self):
    """Checks that added units are counted and stored in insertion order."""
    tu1 = model.KBTextualUnit("test-corpus",  # corpus name
                              "fr",           # language
                              "test",         # normalized form
                              ["test"],       # tokens
                              ["test"],       # lemmas
                              ["test"],       # stems
                              ["N"])          # POS tags
    tu2 = model.KBTextualUnit("test-corpus",  # corpus name
                              "fr",           # language
                              "test2",        # normalized form
                              ["test 2"],     # tokens
                              ["test 2"],     # lemmas
                              ["test 2"],     # stems
                              ["N"])          # POS tags
    self._tuc.addTextualUnit(tu1)
    self._tuc.addTextualUnit(tu2)
    # failUnless is a deprecated alias (removed in Python 3.12);
    # assertEqual also gives a better failure message than a bare ==.
    self.assertEqual(self._tuc.numberOfTextualUnits(), 2)
    self.assertEqual(self._tuc.textual_units, [tu1, tu2])
def _updateCandidateDictionary(self, candidates, document, sentence_offset,
                               starting_token, ending_token):
    """Adds or updates a newly extracted candidate form in a candidate
    dictionary.

    Args:
      candidates: The candidate dictionary to update. Keys are mixtures of
        candidate forms and POS tags.
      document: The C{KBDocument} where the candidate is extracted from.
      sentence_offset: The index of the sentence of the document where the
        candidate is extracted.
      starting_token: The index of the first token of the candidate within
        the sentence it is extracted from.
      ending_token: The index of the last token of the candidate within
        the sentence it is extracted from.
    """
    # BUG FIX: the parameter was misspelled "candiates" while the body
    # referenced "candidates" (NameError at runtime).
    tool_factory = core.KBBenchmark.singleton().run_tools[self._run_name]
    normalizer = tool_factory.normalizer(document.language)
    #---------------------------------------------------------------------------
    tokenized_sentence = document.full_text_sentence_tokens[sentence_offset]
    pos_tagged_sentence = document.full_text_sentence_pos_tags[sentence_offset]
    #---------------------------------------------------------------------------
    # NOTE(review): slices below exclude ending_token, whereas the docstring
    # calls it the index of the last token -- confirm the caller's convention.
    candidate_string = " ".join(tokenized_sentence[starting_token:ending_token])
    candidate_seen_form = candidate_string # FIXME tokenized form :{
    candidate_normalized_form = normalizer.normalize(candidate_string)
    candidate_normalized_tokens = candidate_normalized_form.split(" ")
    candidate_normalized_lemmas = document.full_text_token_lemmas[sentence_offset][starting_token:ending_token]
    candidate_normalized_stems = document.full_text_token_stems[sentence_offset][starting_token:ending_token]
    candidate_pos_tags = pos_tagged_sentence[starting_token:ending_token]
    #---------------------------------------------------------------------------
    candidate = model.KBTextualUnit(document.corpus_name,
                                    document.language,
                                    candidate_normalized_form,
                                    candidate_normalized_tokens,
                                    candidate_normalized_lemmas,
                                    candidate_normalized_stems,
                                    candidate_pos_tags)
    # BUG FIX: "identifier" was used without being defined; the key is the
    # candidate's own identifier.
    identifier = candidate.identifier
    if identifier not in candidates:
        candidates[identifier] = candidate
    # record this occurrence on the (possibly pre-existing) dictionary entry
    candidates[identifier].addOccurrence(candidate_seen_form,
                                         sentence_offset,
                                         starting_token)
def _candidateExtraction(self, document):
    """Extracts the candidates of a given document.

    Candidates whose trailing adjective fails the C{_check_adjective} test
    are replaced in place by a copy stripped of that adjective, with all
    their occurrences re-registered on the stripped form.

    Args:
      document: The C{KBDocument} from which the candidates must be
        extracted.

    Returns:
      The C{list} of extracted, and filtered, candidates
      (C{KBTextualUnit}s).
    """
    candidates = super(FrenchRefinedNounPhraseExtractor,
                       self)._candidateExtraction(document)
    for index, candidate in enumerate(candidates):
        # WARNING works only for N+A? (one adjective at the right)
        # check if the adjective must be filtered out or not
        if not self._check_adjective(candidate):
            # create a new candidate without the trailing adjective
            candidate_normalized_tokens = candidate.normalized_tokens[:-1]
            candidate_normalized_form = " ".join(candidate_normalized_tokens)
            candidate_normalized_lemmas = candidate.normalized_lemmas[:-1]
            candidate_normalized_stems = candidate.normalized_stems[:-1]
            candidate_pos_tags = candidate.pos_tags[:-1]
            new_candidate = model.KBTextualUnit(document.corpus_name,
                                                document.language,
                                                candidate_normalized_form,
                                                candidate_normalized_tokens,
                                                candidate_normalized_lemmas,
                                                candidate_normalized_stems,
                                                candidate_pos_tags)
            # re-register every occurrence of the original candidate
            seen_forms = candidate.seen_forms
            for seen_form in seen_forms:
                # FIXME tokenized form, not seen form :{
                new_seen_form = " ".join(seen_form.split(" ")[:-1])
                for sentence_offset, inner_sentence_offsets \
                        in seen_forms[seen_form].items():
                    for inner_sentence_offset in inner_sentence_offsets:
                        new_candidate.addOccurrence(new_seen_form,
                                                    sentence_offset,
                                                    inner_sentence_offset)
            candidates[index] = new_candidate
    # BUG FIX: candidates is a list (it is indexed and assigned by position
    # above, and documented as a list), so calling .values() on it raised
    # AttributeError. Return the list directly.
    return candidates