Example 1
 def test_line_tokenizer_include_blanks(self):
     """Test LineTokenizer on Latin text, preserving blank lines."""
     text = """48. Cum tibi contigerit studio cognoscere multa,\nFac discas multa, vita nil discere velle.\n\n49. Miraris verbis nudis me scribere versus?\nHoc brevitas fecit, sensus coniungere binos."""
     target = ['48. Cum tibi contigerit studio cognoscere multa,', 'Fac discas multa, vita nil discere velle.', '', '49. Miraris verbis nudis me scribere versus?', 'Hoc brevitas fecit, sensus coniungere binos.']
     tokenizer = LineTokenizer('latin')
     tokenized_lines = tokenizer.tokenize(text, include_blanks=True)
     self.assertEqual(tokenized_lines, target)
Example 2
 def test_french_line_tokenizer_include_blanks(self):
     """Test LineTokenizer on Old French text, preserving blank lines."""
     text = """Ki de bone matire traite,\nmult li peise, se bien n’est faite.\nOëz, seignur, que dit Marie,\nki en sun tens pas ne s’oblie.\n\nLes contes que jo sai verais,\ndunt li Bretun unt fait les lais,\nvos conterai assez briefment."""  # pylint: disable=line-too-long
     target = ['Ki de bone matire traite,', 'mult li peise, se bien n’est faite.', 'Oëz, seignur, que dit Marie,', 'ki en sun tens pas ne s’oblie.', '', 'Les contes que jo sai verais,', 'dunt li Bretun unt fait les lais,', 'vos conterai assez briefment.']  # pylint: disable=line-too-long
     tokenizer = LineTokenizer('french')
     tokenized_lines = tokenizer.tokenize(text, include_blanks=True)
     self.assertEqual(tokenized_lines, target)
Example 3
 def test_line_tokenizer(self):
     """Test LineTokenizer on Latin text with default settings."""
     text = """49. Miraris verbis nudis me scribere versus?\nHoc brevitas fecit, sensus coniungere binos."""
     target = ['49. Miraris verbis nudis me scribere versus?', 'Hoc brevitas fecit, sensus coniungere binos.']
     tokenizer = LineTokenizer('latin')
     tokenized_lines = tokenizer.tokenize(text)
     self.assertEqual(tokenized_lines, target)
Example 4
 def test_french_line_tokenizer(self):
     """Test LineTokenizer on Old French text with default settings."""
     text = """Ki de bone matire traite,\nmult li peise, se bien n’est faite.\nOëz, seignur, que dit Marie,\nki en sun tens pas ne s’oblie. """  # pylint: disable=line-too-long
     target = ['Ki de bone matire traite,', 'mult li peise, se bien n’est faite.', 'Oëz, seignur, que dit Marie,', 'ki en sun tens pas ne s’oblie. ']  # pylint: disable=line-too-long
     tokenizer = LineTokenizer('french')
     tokenized_lines = tokenizer.tokenize(text)
     self.assertEqual(tokenized_lines, target)
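The same API can be exercised outside a test class. A minimal standalone sketch, assuming cltk is installed and exposes cltk.tokenize.line.LineTokenizer exactly as in the examples above:

from cltk.tokenize.line import LineTokenizer

tokenizer = LineTokenizer('latin')
text = "48. Cum tibi contigerit studio cognoscere multa,\nFac discas multa, vita nil discere velle.\n\n49. Miraris verbis nudis me scribere versus?"

# By default, blank lines are dropped from the output.
print(tokenizer.tokenize(text))

# With include_blanks=True, empty lines are kept as '' entries,
# preserving stanza boundaries.
print(tokenizer.tokenize(text, include_blanks=True))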
Example 5
from cltk.tokenize.line import LineTokenizer
from os import listdir, path

# initialize tokenizer
tokenizer = LineTokenizer('latin')

# master list of lines
whole_met = []

list_of_files = sorted(
    [file for file in listdir('la') if path.isfile(path.join('la/', file))])

# iterate through files/books of the Metamorphoses
for file in list_of_files:

    if file.startswith('ovid'):

        # get text from each file
        with open('la/' + file) as f:
            raw = f.read()

            # add line-tokenized text to the master list of lines
            whole_met += tokenizer.tokenize(raw)

# normalize tabs to spaces in every line
clean_met = [line.replace('\t', ' ') for line in whole_met]
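The resulting list can then be checked for blank entries before further processing. A small sketch; the helper name find_empty_lines is hypothetical, not part of the original script:

# Hypothetical helper: report the indices of blank or
# whitespace-only entries in the tokenized output.
def find_empty_lines(lines):
    return [i for i, line in enumerate(lines) if not line.strip()]

empty = find_empty_lines(clean_met)
print(len(empty), 'blank lines at indices:', empty)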
Example 6
import random
import os
from cltk.tokenize.line import LineTokenizer

# initialize tokenizer
tokenizer = LineTokenizer('latin')

# master list of lines
whole_met = []

list_of_files = [
    file for file in os.listdir('la') if os.path.isfile(os.path.join('la/', file))
]
print(list_of_files)
'''
# the block below is disabled by the triple-quoted string:
# iterate through files/books of the Metamorphoses
for file in [file for file in os.listdir('la') if os.path.isfile(os.path.join('la/', file))]:

    if file.startswith('ovid'):

        # get text from each file
        with open('la/' + file) as f:
            raw = f.read()

            # add line-tokenized text to the master list of lines
            whole_met += tokenizer.tokenize(raw)

# whole_met is a list and strings are immutable, so rebuild the list
# instead of calling .replace on it
whole_met = [line.replace('\t', ' ') for line in whole_met]

# test if there are any empty lines
def test_for_empty(list):