def test_vanity(self):
    """Check that Segments accepts extend/append and yields items in the order added."""
    segments = Segments()
    segments.extend([Segment('The'), Segment('end')])
    segments.append(Segment('is'))
    segments.append(Segment('nigh'))
    # The expected sentence must be built from exactly the words added
    # above; the previous expectation contained a word ('stop') that is
    # never inserted into the container.
    self.assertEqual(
        ' '.join([s.text for s in segments]),
        'The end is nigh'
    )
def base_segments(self) -> Iterable[Segment]:
    """
    Split ``self.text`` into Segments, tagging entities and key phrases.

    Side effect: ``self.text`` is overwritten with the result of
    ``self.preprocess_text``.

    Multi-word units are protected from the word tokenizer by replacing
    their spaces with ``'___'`` in a working copy of the text. The tag for
    each protected unit is stored under that joined form so it can be
    looked up per token afterwards.

    :return: One Segment per token of the modified text; tokens with no
             mapping entry are created with ``tag=None``.
    """
    self.text = self.preprocess_text(self.text)
    sentence = get_en_core_web_sm(self.text)

    mod_text = self.text
    mapping = {}
    for entity in sentence.ents:
        to_put = entity.text.replace(' ', '___')
        mod_text = mod_text.replace(entity.text, to_put)
        mapping[to_put] = entity.label_

    try:
        doc = make_spacy_doc(mod_text)
    except Exception as ex:
        # Best effort: without a doc we skip key phrase detection and
        # still return the entity-aware tokens.
        logger.warning('Could not create spacy doc: %s', ex)
    else:
        textphrases = [
            k[0] for k in textacy.ke.textrank(doc, normalize='lemma', topn=10)
            if ' ' in k[0] or '_' in k[0]  # only really care about multi word phrases
        ]
        for textphrase in textphrases:
            to_put = textphrase.replace(' ', '___')
            mod_text = mod_text.replace(textphrase, to_put)
            # Key by the joined form: that is what appears in mod_text and
            # therefore what the lookup below sees. Keying by the raw
            # phrase (with spaces) could never match a token.
            # NOTE(review): this overwrites an entity label when a phrase
            # equals an already-joined entity - confirm that is intended.
            mapping[to_put] = 'SOMETHING'

    # presumably word_tokenize keeps '___'-joined units as single tokens;
    # verify against the tokenizer's behaviour
    return [
        Segment(text, tag=mapping.get(text, None))
        for text in nltk.word_tokenize(mod_text)
    ]
def segments(self) -> Iterable[Segment]:
    """
    Return the base segments with each person's name merged into one Segment.

    For every person in ``self.people`` the tokens of ``dirty_name`` are
    replaced by a single PERSON-tagged Segment; the same is then done for
    the name preceded by 'the' and by 'a'.

    :return: The result of the final ``replace_sub`` call, or the base
             segments unchanged when there are no people.
    """
    result = self.base_segments
    everyone = self.people

    # (leading tokens to match, text prepended to the merged name),
    # in the order the replacements must be applied.
    variants = (
        ([], ''),
        (['the'], 'the '),
        (['a'], 'a '),
    )

    for person in everyone:
        for lead_tokens, lead_text in variants:
            # Fresh lists for every call so no call can see another's objects.
            needle = lead_tokens + [
                Segment(part) for part in person.dirty_name.split(' ')
            ]
            merged = [Segment(lead_text + person.dirty_name, tag='PERSON')]
            result = replace_sub(result, needle, merged)  # keep the object

    return result
def segment(self):
    """Build a Segment from this object's ``text``, tagged with its ``pos``."""
    # Function-scope import (presumably to avoid a circular import at
    # module load time - TODO confirm).
    from mauve.models.segment import Segment

    text = self.text
    tag = self.pos
    return Segment(text, tag=tag)
def test_lem_stem(self):
    """``lem_stem`` of the plural 'bats' is expected to be 'bat'."""
    plural = Segment('bats')
    self.assertEqual(plural.lem_stem, 'bat')
def test_is_word(self):
    """Trailing punctuation makes a segment non-wordy; a trailing space does not."""
    for punctuated in ('asd.', 'asd,'):
        self.assertFalse(Segment(punctuated).is_wordy)

    padded = Segment('asd ')
    self.assertTrue(padded.is_wordy)
def test_is_entity(self):
    """A 'DATE' tag counts as an entity, an 'NN' tag does not."""
    dated = Segment('asd.', tag='DATE')
    self.assertTrue(dated.is_entity)

    noun = Segment('asd.', tag='NN')
    self.assertFalse(noun.is_entity)
def test_is_verb(self):
    """'looking' is expected to be a verb, 'blue' is not."""
    verb = Segment('looking')
    self.assertTrue(verb.is_verb)

    not_verb = Segment('blue')
    self.assertFalse(not_verb.is_verb)
def test_is_adv(self):
    """'very' is expected to be an adverb, 'trumpet' is not."""
    adverb = Segment('very')
    self.assertTrue(adverb.is_adv)

    not_adverb = Segment('trumpet')
    self.assertFalse(not_adverb.is_adv)
def test_is_adj(self):
    """'big' is expected to be an adjective, 'house' is not."""
    adjective = Segment('big')
    self.assertTrue(adjective.is_adj)

    not_adjective = Segment('house')
    self.assertFalse(not_adjective.is_adj)
def test_tag(self):
    """Check ``tag`` for an explicit tag, a single word and a multi-word text."""
    # A tag passed to the constructor is returned as given.
    explicit = Segment('blah', tag='wooooo')
    self.assertEqual(explicit.tag, 'wooooo')

    # With no tag supplied, a single word is expected to get a POS tag.
    pronoun = Segment('I')
    self.assertEqual(pronoun.tag, 'PRP')

    # With no tag supplied, a multi-word text is expected to get 'dunno'.
    phrase = Segment('a phrase')
    self.assertEqual(phrase.tag, 'dunno')