コード例 #1
0
 def test_tag_ner_str_str_latin(self):
     """Check tag_ner() on Latin: string input, string output."""
     expected = " ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis."
     # The sentence goes through replace_jv() before tagging, which is why
     # the expected output spells "Venus" as "Uenus".
     sentence = replace_jv(
         """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     )
     tagged = ner.tag_ner("lat", input_text=sentence, output_type=str)
     self.assertEqual(tagged, expected)
コード例 #2
0
ファイル: test_tag.py プロジェクト: kylepjohnson/cltkv1
 def test_tag_ner_str_list_latin(self):
     """Check tag_ner() on Latin: string input, list-of-tuples output."""
     # Tagged tokens are (token, "Entity") pairs; everything else,
     # punctuation included, is a 1-tuple.
     expected = [
         ("ut",), ("Uenus", "Entity"), (",",),
         ("ut",), ("Sirius", "Entity"), (",",),
         ("ut",), ("Spica", "Entity"), (",",),
         ("ut",), ("aliae",), ("quae",), ("primae",),
         ("dicuntur",), ("esse",), ("mangitudinis",), (".",),
     ]
     normalized = JVReplacer().replace(
         """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     )
     tagged = ner.tag_ner("lat", input_text=normalized, output_type=list)
     self.assertEqual(tagged, expected)
コード例 #3
0
 def test_tag_ner_list_str_latin(self):
     """Check tag_ner() on Latin: token-list input, string output."""
     # Each token is passed through replace_jv() on its own before tagging.
     words = list(map(replace_jv, ["ut", "Venus", "Sirius"]))
     tagged = ner.tag_ner("lat", input_text=words, output_type=str)
     self.assertEqual(tagged, " ut Uenus/Entity Sirius/Entity")
コード例 #4
0
 def test_tag_ner_list_list_latin(self):
     """Check tag_ner() on Latin: token-list input, list-of-tuples output."""
     words = list(map(replace_jv, ["ut", "Venus", "Sirius"]))
     tagged = ner.tag_ner("lat", input_text=words, output_type=list)
     expected = [
         ("ut",),
         ("Uenus", "Entity"),
         ("Sirius", "Entity"),
     ]
     self.assertEqual(tagged, expected)
コード例 #5
0
ファイル: test_tag.py プロジェクト: tnmsahu/cltk
 def test_tag_ner_list_str_latin(self):
     """Check tag_ner() on Latin: token-list input, string output."""
     replacer = JVReplacer()
     # Each token is run through the replacer individually before tagging.
     words = list(map(replacer.replace, ['ut', 'Venus', 'Sirius']))
     tagged = ner.tag_ner('latin', input_text=words, output_type=str)
     self.assertEqual(tagged, ' ut Uenus/Entity Sirius/Entity')
コード例 #6
0
ファイル: test_tag.py プロジェクト: TylerKirby/cltk
 def test_tag_ner_str_list_latin(self):
     """Check tag_ner() on Latin: string input, list-of-tuples output."""
     # Tagged tokens are (token, 'Entity') pairs; everything else,
     # punctuation included, is a 1-tuple.
     expected = [
         ('ut',), ('Uenus', 'Entity'), (',',),
         ('ut',), ('Sirius', 'Entity'), (',',),
         ('ut',), ('Spica', 'Entity'), (',',),
         ('ut',), ('aliae',), ('quae',), ('primae',),
         ('dicuntur',), ('esse',), ('mangitudinis',), ('.',),
     ]
     normalized = JVReplacer().replace(
         """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     )
     tagged = ner.tag_ner('latin', input_text=normalized, output_type=list)
     self.assertEqual(tagged, expected)
コード例 #7
0
ファイル: test_tag.py プロジェクト: TylerKirby/cltk
 def test_tag_ner_list_str_latin(self):
     """Check tag_ner() on Latin: token-list input, string output."""
     expected = ' ut Uenus/Entity Sirius/Entity'
     replacer = JVReplacer()
     words = []
     for raw_word in ['ut', 'Venus', 'Sirius']:
         # Tokens are normalized one by one before being handed to the tagger.
         words.append(replacer.replace(raw_word))
     tagged = ner.tag_ner('latin', input_text=words, output_type=str)
     self.assertEqual(tagged, expected)
コード例 #8
0
ファイル: test_tag.py プロジェクト: ykl7/cltk
 def test_tag_ner_list_list_latin(self):
     """Check tag_ner() on Latin: token-list input, list-of-tuples output."""
     replacer = JVReplacer()
     words = list(map(replacer.replace, ['ut', 'Venus', 'Sirius']))
     tagged = ner.tag_ner('latin', input_text=words, output_type=list)
     expected = [
         ('ut',),
         ('Uenus', 'Entity'),
         ('Sirius', 'Entity'),
     ]
     self.assertEqual(tagged, expected)
コード例 #9
0
ファイル: test_tag.py プロジェクト: ykl7/cltk
 def test_tag_ner_str_list_latin(self):
     """Check tag_ner() on Latin: string input, list-of-tuples output."""
     sentence = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     normalized = JVReplacer().replace(sentence)
     tagged = ner.tag_ner('latin', input_text=normalized, output_type=list)
     # One row per comma-separated clause of the input sentence.
     expected = [
         ('ut',), ('Uenus', 'Entity'), (',',),
         ('ut',), ('Sirius', 'Entity'), (',',),
         ('ut',), ('Spica', 'Entity'), (',',),
         ('ut',), ('aliae',), ('quae',), ('primae',), ('dicuntur',), ('esse',), ('mangitudinis',), ('.',),
     ]
     self.assertEqual(tagged, expected)
コード例 #10
0
ファイル: test_tag.py プロジェクト: tnmsahu/cltk
 def test_tag_ner_str_list_greek(self):
     """Check tag_ner() on Greek: string input, list-of-tuples output."""
     expected = [
         ('τὰ',),
         ('Σίλαριν', 'Entity'),
         ('Σιννᾶν', 'Entity'),
         ('Κάππαρος', 'Entity'),
         ('Πρωτογενείας', 'Entity'),
         ('Διονυσιάδες', 'Entity'),
         ('τὴν',),
     ]
     tagged = ner.tag_ner(
         'greek',
         input_text='τὰ Σίλαριν Σιννᾶν Κάππαρος Πρωτογενείας Διονυσιάδες τὴν',
         output_type=list,
     )
     self.assertEqual(tagged, expected)
コード例 #11
0
ファイル: test_tag.py プロジェクト: TylerKirby/cltk
 def test_tag_ner_list_list_latin(self):
     """Check tag_ner() on Latin: token-list input, list-of-tuples output."""
     expected = [('ut',), ('Uenus', 'Entity'), ('Sirius', 'Entity')]
     replacer = JVReplacer()
     words = []
     for raw_word in ['ut', 'Venus', 'Sirius']:
         # Tokens are normalized one by one before being handed to the tagger.
         words.append(replacer.replace(raw_word))
     tagged = ner.tag_ner('latin', input_text=words, output_type=list)
     self.assertEqual(tagged, expected)
コード例 #12
0
ファイル: test_tag.py プロジェクト: tnmsahu/cltk
 def test_tag_ner_str_str_latin(self):
     """Check tag_ner() on Latin: string input, string output."""
     text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     # A single replacer is enough; the earlier version built two and
     # threw the first one away.
     jv_replacer = JVReplacer()
     text_str_iu = jv_replacer.replace(text_str)
     text = ner.tag_ner('latin', input_text=text_str_iu, output_type=str)
     # Entities carry a "/Entity" suffix and the result has a leading space.
     target = ' ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis.'
     self.assertEqual(text, target)
コード例 #13
0
ファイル: test_tag.py プロジェクト: TylerKirby/cltk
 def test_tag_ner_str_str_latin(self):
     """Check tag_ner() on Latin: string input, string output."""
     # Only one JVReplacer is needed; a redundant second instantiation
     # has been removed.
     jv_replacer = JVReplacer()
     text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     text_str_iu = jv_replacer.replace(text_str)
     text = ner.tag_ner('latin', input_text=text_str_iu, output_type=str)
     # Entities carry a "/Entity" suffix and the result has a leading space.
     target = ' ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis.'
     self.assertEqual(text, target)
コード例 #14
0
def count_names_latin(target, sents):
    """Count NER-tagged names across Latin sentences.

    Every ``#<digit>`` marker is stripped from a sentence before it is
    handed to ``ner.tag_ner('latin', ...)``. Tokens tagged ``'Entity'``
    are lower-cased and counted.

    Args:
        target: name to look for; compared case-insensitively.
        sents: iterable of sentence strings.

    Returns:
        A ``(total_entities, sentences_containing_target)`` tuple.
    """
    entity_total = 0
    sentences_with_target = 0
    for sentence in sents:
        cleaned = re.sub(r'#\d', "", sentence)
        entity_names = []
        for tag in ner.tag_ner('latin', input_text=cleaned):
            # Only tuples that carry a second element can be entities.
            if len(tag) > 1 and tag[1] == 'Entity':
                entity_names.append(tag[0].lower())
        entity_total += len(entity_names)
        # A sentence counts once, however often the target occurs in it.
        sentences_with_target += int(target.lower() in entity_names)
    return entity_total, sentences_with_target
コード例 #15
0
ファイル: test_tag.py プロジェクト: kylepjohnson/cltkv1
 def test_tag_ner_str_list_greek(self):
     """Check tag_ner() on Greek: string input, list-of-tuples output."""
     sentence = "τὰ Σίλαριν Σιννᾶν Κάππαρος Πρωτογενείας Διονυσιάδες τὴν"
     # Only the first and last tokens are expected to come back untagged.
     expected = [
         ("τὰ",),
         ("Σίλαριν", "Entity"), ("Σιννᾶν", "Entity"), ("Κάππαρος", "Entity"),
         ("Πρωτογενείας", "Entity"), ("Διονυσιάδες", "Entity"),
         ("τὴν",),
     ]
     tagged = ner.tag_ner("grc", input_text=sentence, output_type=list)
     self.assertEqual(tagged, expected)
コード例 #16
0
def createNERListFromCorpus(string):
    """
    Run CLTK's Latin NER over a corpus given as one string.

    The corpus goes through JVReplacer first. The first element of every
    result tuple with more than one element is collected, duplicates
    included. The set of distinct values is printed, and the full list is
    returned.
    """
    normalized = JVReplacer().replace(string)
    tagged = ner.tag_ner('latin', input_text=normalized)
    # Tuples longer than one element are the ones the tagger annotated.
    ner_list = [entry[0] for entry in tagged if len(entry) > 1]
    print('These NER were found in the given corpus:')
    print(set(ner_list))
    return ner_list
コード例 #17
0
ファイル: cltk.py プロジェクト: thePortus/dhelp
    def entities(self, lemmatize=False, unique=False):
        """Returns a list of entities recognized in the text.

        Uses cltk's built in named-entity recognition. Reorganizes cltk's raw
        output from list of tuples to list of strings. Every entity recognized
        is added to the list returned. Unless unique option is set, entities
        which appear multiple times will be returned multiple times in the
        list.

        Args:
            lemmatize (:obj:`bool`, optional) Set True to lemmatize text before searching for entities
            unique (:obj:`bool`, optional) Set True and no entity appears in the return list more than once
        Returns:
            list: entity strings in order of appearance; with ``unique`` set,
            each entity is kept at its first occurrence. With ``lemmatize``
            set, whatever ``LemmaReplacer.lemmatize`` returns for that list.
        Example:
            >>> text = LatinText('Gallia est omnis divisa in partes tres')
            >>> print(text.entities())
            ['Gallia']
        """ # noqa
        from cltk.stem.lemma import LemmaReplacer
        from cltk.tag import ner
        tagged = ner.tag_ner(
            self.options['language'],
            input_text=self.data,
            output_type=list
        )
        # Tuples without a tag have no second element. Checking the length
        # replaces the former bare ``except: pass``, which also hid unrelated
        # errors (and KeyboardInterrupt).
        entity_list = [
            result[0] for result in tagged
            if len(result) > 1 and result[1] == 'Entity'
        ]
        # removing duplicate entities if unique option specified
        if unique:
            # dict.fromkeys keeps first-occurrence order, so the result is
            # deterministic; list(set(...)) returned an arbitrary order.
            entity_list = list(dict.fromkeys(entity_list))
        # lemmatizing entities if option has been specified
        if lemmatize:
            entity_list = LemmaReplacer(self.options['language']).lemmatize(
                entity_list,
                return_string=False,
                return_raw=False
            )
        return entity_list
コード例 #18
0
ファイル: cltk_doc.py プロジェクト: thePortus/arakhne
 def entities(self, lemmatize=False, unique=False):
     """Return the tokens that cltk's NER tagger flags as entities.

     Args:
         lemmatize: when true, pass the entity list through
             ``LemmaReplacer(self.language).lemmatize`` before returning.
         unique: when true, drop repeated entities, keeping the first
             occurrence of each.

     Returns:
         A list of entity strings in order of appearance, or the
         lemmatizer's output for that list when ``lemmatize`` is set.
     """
     tagged = ner.tag_ner(self.language,
                          input_text=self.data,
                          output_type=list)
     # Tuples without a tag have no second element. Checking the length
     # replaces the former bare ``except: pass``, which also hid unrelated
     # errors (and KeyboardInterrupt).
     entity_list = [
         result[0] for result in tagged
         if len(result) > 1 and result[1] == 'Entity'
     ]
     # removing duplicate entities if unique option specified
     if unique:
         # dict.fromkeys keeps first-occurrence order, so the result is
         # deterministic; list(set(...)) returned an arbitrary order.
         entity_list = list(dict.fromkeys(entity_list))
     # lemmatizing entities if option has been specified
     if lemmatize:
         entity_list = LemmaReplacer(self.language).lemmatize(
             entity_list, return_string=False, return_raw=False)
     return entity_list
コード例 #19
0
ファイル: test_tag.py プロジェクト: tnmsahu/cltk
 def test_tag_ner_list_str_greek(self):
     """Check tag_ner() on Greek: token-list input, string output."""
     expected = ' τὰ Σίλαριν/Entity Σιννᾶν/Entity'
     tagged = ner.tag_ner(
         'greek',
         input_text=['τὰ', 'Σίλαριν', 'Σιννᾶν'],
         output_type=str,
     )
     self.assertEqual(tagged, expected)
コード例 #20
0
ファイル: test_tag.py プロジェクト: tnmsahu/cltk
 def test_tag_ner_str_str_greek(self):
     """Check tag_ner() on Greek: string input, string output."""
     # Entities carry a "/Entity" suffix and the result has a leading space.
     expected = ' τὰ Σίλαριν/Entity Σιννᾶν/Entity Κάππαρος/Entity Πρωτογενείας/Entity Διονυσιάδες/Entity τὴν'
     tagged = ner.tag_ner(
         'greek',
         input_text='τὰ Σίλαριν Σιννᾶν Κάππαρος Πρωτογενείας Διονυσιάδες τὴν',
         output_type=str,
     )
     self.assertEqual(tagged, expected)
コード例 #21
0
ファイル: test.py プロジェクト: michaelhagedon/cltk-test
# Exploratory script: runs several CLTK tools over the text exposed by
# `philippians_reader` (created before this excerpt).

# Restrict the reader to a single file id, the Philippians JSON file.
# NOTE(review): this overwrites a private attribute of the reader.
philippians_reader._fileids = [
    'new-testament__letter-to-the-philippians__grc.json'
]

# print(list(perseus_reader.sents()))

# Materialize all sentences, then work with the first one only.
sentences = list(philippians_reader.sents())
# presumably Unicode normalization of the sentence; confirm against cltk_normalize
sentence = cltk_normalize(sentences[0])
lemmatizer = LemmaReplacer('greek')
word_list = lemmatizer.lemmatize(sentence)

tagger = POSTag('greek')

# POS-tag the first sentence with the tagger's 1-2-3-gram backoff method.
parts_of_speech = tagger.tag_ngram_123_backoff(sentence)

# This is not a great lemmatizer
standard_list = lemmatizer.lemmatize(list(philippians_reader.words()),
                                     return_raw=True)

lemmatizer2 = BackoffGreekLemmatizer()

# this one seems better
backoff_list = lemmatizer2.lemmatize(list(philippians_reader.words()))

# Find most names
# Requests list output from the NER tagger for the first sentence.
names_in_first_sentence = ner.tag_ner('greek',
                                      input_text=sentence,
                                      output_type=list)

# Transcribe the first sentence with the Attic/Probert settings.
# NOTE(review): variable name suggests IPA output; confirm.
transcriber = Transcriber(dialect="Attic", reconstruction="Probert")
ipa = transcriber.transcribe(sentence)
コード例 #22
0
ファイル: test_tag.py プロジェクト: tnmsahu/cltk
 def test_tag_ner_list_list_greek(self):
     """Check tag_ner() on Greek: token-list input, list-of-tuples output."""
     expected = [
         ('τὰ',),
         ('Σίλαριν', 'Entity'),
         ('Σιννᾶν', 'Entity'),
     ]
     tagged = ner.tag_ner(
         'greek',
         input_text=['τὰ', 'Σίλαριν', 'Σιννᾶν'],
         output_type=list,
     )
     self.assertEqual(tagged, expected)
コード例 #23
0
ファイル: test_tag.py プロジェクト: kylepjohnson/cltkv1
 def test_tag_ner_list_list_greek(self):
     """Check tag_ner() on Greek: token-list input, list-of-tuples output."""
     words = ["τὰ", "Σίλαριν", "Σιννᾶν"]
     # The first token is expected untagged, the other two as entities.
     expected = [
         ("τὰ",),
         ("Σίλαριν", "Entity"),
         ("Σιννᾶν", "Entity"),
     ]
     tagged = ner.tag_ner("grc", input_text=words, output_type=list)
     self.assertEqual(tagged, expected)
コード例 #24
0
ファイル: lemma_search.py プロジェクト: atmcgrath/mythodikos
# =====================================================================

infile = "/Users/stellafritzell/mythodikos/canonical-greekLit-master/data/tlg0001/tlg001/tlg0001.tlg001.perseus-grc2.xml"

# Parse inside a context manager so the file handle is closed once the tree
# is built (the previous bare open() was never closed).
# NOTE(review): assumes the Perseus XML is UTF-8 encoded; confirm.
with open(infile, encoding="utf-8") as xml_file:
    soup = BeautifulSoup(xml_file, features="lxml")

personlist = ['Ἀμφιδάμας', 'Μελέαγρος', 'Ζήτης']

# pull the contents of each 'l' tag in the .xml file and ignore other text (i.e. 'title', 'author')
file_text = soup.find_all('l')
for t in file_text:
    text = t.get_text()

    # Apply NER to text (comment out if testing other elements, takes time)
    # With output_type=list this yields one list of tuples per line of text.
    # *** FAILS TO IDENTIFY ALL ENTITIES
    ner_crawl = ner.tag_ner('greek', input_text=text, output_type=list)
    # NEXT: merge ALL of the lists OR create loop to iterate through each list at a time -- tuples need to remain distinct
    print(ner_crawl)

# Sketch of the intended entity filter (not wired in yet; `entities` is not
# defined anywhere in this script). The earlier draft tested
# `'Entity' in e == True`, which Python chains as
# `('Entity' in e) and (e == True)` and is therefore never true for a tuple:
#
#     for e in entities:
#         if 'Entity' in e:
#             print(e)

# Testing CLTK lemmatizer
tokens = 'τοῖσιν δʼ Ἀμφιδάμας μυθήσατο, παῖς Ἀλεοῖο·'.split()  # reads sentence as a list of strings
コード例 #25
0
ファイル: test_tag.py プロジェクト: TylerKirby/cltk
 def test_tag_ner_str_str_greek(self):
     """Check tag_ner() on Greek: string input, string output."""
     sentence = 'τὰ Σίλαριν Σιννᾶν Κάππαρος Πρωτογενείας Διονυσιάδες τὴν'
     tagged = ner.tag_ner('greek', input_text=sentence, output_type=str)
     # Entities carry a "/Entity" suffix and the result has a leading space.
     self.assertEqual(
         tagged,
         ' τὰ Σίλαριν/Entity Σιννᾶν/Entity Κάππαρος/Entity Πρωτογενείας/Entity Διονυσιάδες/Entity τὴν',
     )
コード例 #26
0
ファイル: test_tag.py プロジェクト: TylerKirby/cltk
 def test_tag_ner_list_str_greek(self):
     """Check tag_ner() on Greek: token-list input, string output."""
     words = ['τὰ', 'Σίλαριν', 'Σιννᾶν']
     tagged = ner.tag_ner('greek', input_text=words, output_type=str)
     # Tokens are space-joined, with a leading space and "/Entity" suffixes.
     self.assertEqual(tagged, ' τὰ Σίλαριν/Entity Σιννᾶν/Entity')
コード例 #27
0
ファイル: test_tag.py プロジェクト: TylerKirby/cltk
 def test_tag_ner_list_list_greek(self):
     """Check tag_ner() on Greek: token-list input, list-of-tuples output."""
     words = ['τὰ', 'Σίλαριν', 'Σιννᾶν']
     tagged = ner.tag_ner('greek', input_text=words, output_type=list)
     # The first token is expected untagged, the other two as entities.
     self.assertEqual(
         tagged,
         [('τὰ',), ('Σίλαριν', 'Entity'), ('Σιννᾶν', 'Entity')],
     )
コード例 #28
0
ファイル: test_tag.py プロジェクト: TylerKirby/cltk
 def test_tag_ner_str_list_greek(self):
     """Check tag_ner() on Greek: string input, list-of-tuples output."""
     sentence = 'τὰ Σίλαριν Σιννᾶν Κάππαρος Πρωτογενείας Διονυσιάδες τὴν'
     # Only the first and last tokens are expected to come back untagged.
     expected = [
         ('τὰ',),
         ('Σίλαριν', 'Entity'),
         ('Σιννᾶν', 'Entity'),
         ('Κάππαρος', 'Entity'),
         ('Πρωτογενείας', 'Entity'),
         ('Διονυσιάδες', 'Entity'),
         ('τὴν',),
     ]
     tagged = ner.tag_ner('greek', input_text=sentence, output_type=list)
     self.assertEqual(tagged, expected)
コード例 #29
0
ファイル: test_tag.py プロジェクト: kylepjohnson/cltkv1
 def test_tag_ner_list_str_greek(self):
     """Check tag_ner() on Greek: token-list input, string output."""
     expected = " τὰ Σίλαριν/Entity Σιννᾶν/Entity"
     tagged = ner.tag_ner(
         "grc",
         input_text=["τὰ", "Σίλαριν", "Σιννᾶν"],
         output_type=str,
     )
     self.assertEqual(tagged, expected)