def test_tag_ner_str_str_latin(self):
    """Test make_ner(), str, str."""
    source = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
    # Normalize j/v -> i/u before tagging, as the NER model expects.
    result = ner.tag_ner("lat", input_text=replace_jv(source), output_type=str)
    expected = " ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis."
    self.assertEqual(result, expected)
def test_tag_ner_str_list_latin(self):
    """Test make_ner(), str, list."""
    source = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
    normalized = JVReplacer().replace(source)
    result = ner.tag_ner("lat", input_text=normalized, output_type=list)
    expected = [
        ("ut",),
        ("Uenus", "Entity"),
        (",",),
        ("ut",),
        ("Sirius", "Entity"),
        (",",),
        ("ut",),
        ("Spica", "Entity"),
        (",",),
        ("ut",),
        ("aliae",),
        ("quae",),
        ("primae",),
        ("dicuntur",),
        ("esse",),
        ("mangitudinis",),
        (".",),
    ]
    self.assertEqual(result, expected)
def test_tag_ner_list_str_latin(self):
    """Test make_ner(), list, str."""
    words = [replace_jv(word) for word in ["ut", "Venus", "Sirius"]]
    result = ner.tag_ner("lat", input_text=words, output_type=str)
    self.assertEqual(result, " ut Uenus/Entity Sirius/Entity")
def test_tag_ner_list_list_latin(self):
    """Test make_ner(), list, list."""
    words = [replace_jv(word) for word in ["ut", "Venus", "Sirius"]]
    result = ner.tag_ner("lat", input_text=words, output_type=list)
    self.assertEqual(
        result, [("ut",), ("Uenus", "Entity"), ("Sirius", "Entity")]
    )
def test_tag_ner_list_str_latin(self):
    """Test make_ner(), list, str."""
    replacer = JVReplacer()
    words = [replacer.replace(token) for token in ['ut', 'Venus', 'Sirius']]
    result = ner.tag_ner('latin', input_text=words, output_type=str)
    self.assertEqual(result, ' ut Uenus/Entity Sirius/Entity')
def test_tag_ner_str_list_latin(self):
    """Test make_ner(), str, list."""
    source = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
    normalized = JVReplacer().replace(source)
    result = ner.tag_ner('latin', input_text=normalized, output_type=list)
    expected = [
        ('ut',), ('Uenus', 'Entity'), (',',),
        ('ut',), ('Sirius', 'Entity'), (',',),
        ('ut',), ('Spica', 'Entity'), (',',),
        ('ut',), ('aliae',), ('quae',), ('primae',),
        ('dicuntur',), ('esse',), ('mangitudinis',), ('.',),
    ]
    self.assertEqual(result, expected)
def test_tag_ner_list_list_latin(self):
    """Test make_ner(), list, list."""
    replacer = JVReplacer()
    words = [replacer.replace(token) for token in ['ut', 'Venus', 'Sirius']]
    result = ner.tag_ner('latin', input_text=words, output_type=list)
    self.assertEqual(result, [('ut',), ('Uenus', 'Entity'), ('Sirius', 'Entity')])
def test_tag_ner_str_list_greek(self):
    """Test make_ner(), str, list."""
    source = 'τὰ Σίλαριν Σιννᾶν Κάππαρος Πρωτογενείας Διονυσιάδες τὴν'
    result = ner.tag_ner('greek', input_text=source, output_type=list)
    expected = [
        ('τὰ',),
        ('Σίλαριν', 'Entity'),
        ('Σιννᾶν', 'Entity'),
        ('Κάππαρος', 'Entity'),
        ('Πρωτογενείας', 'Entity'),
        ('Διονυσιάδες', 'Entity'),
        ('τὴν',),
    ]
    self.assertEqual(result, expected)
def test_tag_ner_str_str_latin(self):
    """Test make_ner(), str, str."""
    # Fix: the original constructed JVReplacer twice; the first instance
    # was immediately overwritten and never used.
    jv_replacer = JVReplacer()
    text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
    text_str_iu = jv_replacer.replace(text_str)
    text = ner.tag_ner('latin', input_text=text_str_iu, output_type=str)
    target = ' ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis.'
    self.assertEqual(text, target)
def count_names_latin(target, sents):
    """Tally entity mentions across *sents* with CLTK Latin NER.

    Returns a ``(total_name_count, target_name_count)`` tuple: the total
    number of entity tokens seen, and the number of sentences in which
    *target* (case-insensitive) appears as an entity.
    """
    total_name_count = 0
    target_name_count = 0
    target_lower = target.lower()  # invariant, hoisted out of the loop
    for sentence in sents:
        # Strip inline "#<digit>" markers before tagging.
        cleaned = re.sub(r'#\d', "", sentence)
        ner_tags = ner.tag_ner('latin', input_text=cleaned)
        # Entity tokens come back as 2-tuples tagged 'Entity'.
        names = [
            tag[0].lower()
            for tag in ner_tags
            if len(tag) > 1 and tag[1] == 'Entity'
        ]
        total_name_count += len(names)
        if target_lower in names:
            target_name_count += 1
    return total_name_count, target_name_count
def test_tag_ner_str_list_greek(self):
    """Test make_ner(), str, list."""
    source = "τὰ Σίλαριν Σιννᾶν Κάππαρος Πρωτογενείας Διονυσιάδες τὴν"
    result = ner.tag_ner("grc", input_text=source, output_type=list)
    expected = [
        ("τὰ",),
        ("Σίλαριν", "Entity"),
        ("Σιννᾶν", "Entity"),
        ("Κάππαρος", "Entity"),
        ("Πρωτογενείας", "Entity"),
        ("Διονυσιάδες", "Entity"),
        ("τὴν",),
    ]
    self.assertEqual(result, expected)
def createNERListFromCorpus(string):
    """
    Will use CLTK NER method on a corpus (as string).
    Will perform jv replacement in the process.
    """
    # j/v -> i/u normalization, then tag the whole corpus.
    normalized = JVReplacer().replace(string)
    corpus_ner = ner.tag_ner('latin', input_text=normalized)
    # Tuples longer than 1 carry an entity tag; keep their token text.
    ner_list = [pair[0] for pair in corpus_ner if len(pair) > 1]
    NER_unique_values = set(ner_list)
    print('These NER were found in the given corpus:')
    print(NER_unique_values)
    return ner_list
def entities(self, lemmatize=False, unique=False):
    """Returns a list of entities recognized in the text.

    Uses cltk's built in named-entity recognition. Reorganizes cltk's
    raw output from list of tuples to list of strings. Every entity
    recognized is added to the list returned. Unless unique option is
    set, entities which appear multiple times will be returned multiple
    times in the list.

    Args:
        lemmatize (:obj:`bool`, optional) Set True to lemmatize text before searching for entities
        unique (:obj:`bool`, optional) Set True and no entity appears in the return list more than once

    Example:
        >>> text = LatinText('Gallia est omnis divisa in partes tres')
        >>> print(text.entities())
        ['Gallia']
    """ # noqa
    from cltk.stem.lemma import LemmaReplacer
    from cltk.tag import ner
    entity_list = []
    # filtering non-entities
    for result in ner.tag_ner(
        self.options['language'], input_text=self.data, output_type=list
    ):
        # appending if item flagged as entity in tuple[1]
        try:
            if result[1] == 'Entity':
                entity_list.append(result[0])
        # Fix: was a bare `except:` which silently swallowed every error.
        # Untagged tokens are 1-tuples, so only IndexError is expected here.
        except IndexError:
            pass
    # removing duplicate entities if unique option specified
    if unique:
        entity_list = list(set(entity_list))
    # lemmatizing entities if option has been specified
    if lemmatize:
        entity_list = LemmaReplacer(self.options['language']).lemmatize(
            entity_list, return_string=False, return_raw=False
        )
    return entity_list
def entities(self, lemmatize=False, unique=False):
    """Return a list of entity strings recognized in ``self.data``.

    Flattens CLTK's list-of-tuples NER output into a list of token
    strings tagged 'Entity'.

    Args:
        lemmatize (bool, optional): lemmatize the entities before returning.
        unique (bool, optional): drop duplicate entities from the result.
    """
    entity_list = []
    # filtering non-entities
    for result in ner.tag_ner(self.language, input_text=self.data, output_type=list):
        # appending if item flagged as entity in tuple[1]
        try:
            if result[1] == 'Entity':
                entity_list.append(result[0])
        # Fix: was a bare `except:` which silently swallowed every error.
        # Untagged tokens are 1-tuples, so only IndexError is expected here.
        except IndexError:
            pass
    # removing duplicate entities if unique option specified
    if unique:
        entity_list = list(set(entity_list))
    # lemmatizing entities if option has been specified
    if lemmatize:
        entity_list = LemmaReplacer(self.language).lemmatize(
            entity_list, return_string=False, return_raw=False)
    return entity_list
def test_tag_ner_list_str_greek(self):
    """Test make_ner(), list, str."""
    words = ['τὰ', 'Σίλαριν', 'Σιννᾶν']
    result = ner.tag_ner('greek', input_text=words, output_type=str)
    self.assertEqual(result, ' τὰ Σίλαριν/Entity Σιννᾶν/Entity')
def test_tag_ner_str_str_greek(self):
    """Test make_ner(), str, str."""
    source = 'τὰ Σίλαριν Σιννᾶν Κάππαρος Πρωτογενείας Διονυσιάδες τὴν'
    result = ner.tag_ner('greek', input_text=source, output_type=str)
    expected = ' τὰ Σίλαριν/Entity Σιννᾶν/Entity Κάππαρος/Entity Πρωτογενείας/Entity Διονυσιάδες/Entity τὴν'
    self.assertEqual(result, expected)
philippians_reader._fileids = [ 'new-testament__letter-to-the-philippians__grc.json' ] # print(list(perseus_reader.sents())) sentences = list(philippians_reader.sents()) sentence = cltk_normalize(sentences[0]) lemmatizer = LemmaReplacer('greek') word_list = lemmatizer.lemmatize(sentence) tagger = POSTag('greek') parts_of_speech = tagger.tag_ngram_123_backoff(sentence) # This is not a great lemmatizer standard_list = lemmatizer.lemmatize(list(philippians_reader.words()), return_raw=True) lemmatizer2 = BackoffGreekLemmatizer() # this one seems better backoff_list = lemmatizer2.lemmatize(list(philippians_reader.words())) # Find most names names_in_first_sentence = ner.tag_ner('greek', input_text=sentence, output_type=list) transcriber = Transcriber(dialect="Attic", reconstruction="Probert") ipa = transcriber.transcribe(sentence)
def test_tag_ner_list_list_greek(self):
    """Test make_ner(), list, list."""
    words = ['τὰ', 'Σίλαριν', 'Σιννᾶν']
    result = ner.tag_ner('greek', input_text=words, output_type=list)
    self.assertEqual(result, [('τὰ',), ('Σίλαριν', 'Entity'), ('Σιννᾶν', 'Entity')])
def test_tag_ner_list_list_greek(self):
    """Test make_ner(), list, list."""
    words = ["τὰ", "Σίλαριν", "Σιννᾶν"]
    expected = [("τὰ",), ("Σίλαριν", "Entity"), ("Σιννᾶν", "Entity")]
    self.assertEqual(
        ner.tag_ner("grc", input_text=words, output_type=list), expected
    )
# ===================================================================== infile = "/Users/stellafritzell/mythodikos/canonical-greekLit-master/data/tlg0001/tlg001/tlg0001.tlg001.perseus-grc2.xml" soup = BeautifulSoup(open(infile), features="lxml") personlist = ['Ἀμφιδάμας', 'Μελέαγρος', 'Ζήτης'] # pull the contents of each 'l' tag in the .xml file and ignore other text (i.e. 'title', 'author') file_text = soup.find_all('l') for t in file_text: text = t.get_text() # Apply NER to text (comment out if testing other elements, takes time) ner_crawl = ner.tag_ner( 'greek', input_text=text, output_type=list ) # this action returns a string of tuples for each line of text *** FAILS TO IDENTIFY ALL ENTITIES # NEXT: merege ALL of the lists OR create loop to iterate through each list at a time -- tuples need to remain distinct print(ner_crawl) """ for e in entities: if 'Entity' in e == True: print(e) else: continue """ # Testing CLTK lemmatizer tokens = 'τοῖσιν δʼ Ἀμφιδάμας μυθήσατο, παῖς Ἀλεοῖο·'.split( ) # reads sentence as a list of strings
def test_tag_ner_list_list_greek(self):
    """Test make_ner(), list, list."""
    observed = ner.tag_ner(
        'greek', input_text=['τὰ', 'Σίλαριν', 'Σιννᾶν'], output_type=list
    )
    self.assertEqual(observed, [('τὰ',), ('Σίλαριν', 'Entity'), ('Σιννᾶν', 'Entity')])
def test_tag_ner_str_list_greek(self):
    """Test make_ner(), str, list."""
    observed = ner.tag_ner(
        'greek',
        input_text='τὰ Σίλαριν Σιννᾶν Κάππαρος Πρωτογενείας Διονυσιάδες τὴν',
        output_type=list,
    )
    expected = [
        ('τὰ',), ('Σίλαριν', 'Entity'), ('Σιννᾶν', 'Entity'),
        ('Κάππαρος', 'Entity'), ('Πρωτογενείας', 'Entity'),
        ('Διονυσιάδες', 'Entity'), ('τὴν',),
    ]
    self.assertEqual(observed, expected)
def test_tag_ner_list_str_greek(self):
    """Test make_ner(), list, str."""
    words = ["τὰ", "Σίλαριν", "Σιννᾶν"]
    self.assertEqual(
        ner.tag_ner("grc", input_text=words, output_type=str),
        " τὰ Σίλαριν/Entity Σιννᾶν/Entity",
    )