Example #1
def build_corpus(raw_text_path: str, processed_text_path: str) -> None:
    if not os.path.exists(raw_text_path):
        print("Downloading corpus...")
        zip_file_path: str = raw_text_path + ".zip"
        response: requests.Response = requests.get(
            "https://box.hu-berlin.de/f/056b874a12cf44de82ab/?dl=1",
            stream=True)
        total_length: int = int(response.headers.get("content-length"))
        done_count: int = 0
        chunk_size: int = 1024
        with open(zip_file_path, "wb+") as f:
            for data in tqdm(
                    response.iter_content(chunk_size=chunk_size),
                    total=math.ceil(total_length / chunk_size),
                    unit="MB",
                    unit_scale=0.001):
                done_count += len(data)
                f.write(data)
        print("Extracting corpus...")
        zip_file: ZipFile = ZipFile(zip_file_path)
        file_path_parts: Tuple[str, str] = os.path.split(raw_text_path)
        zip_file.extract(file_path_parts[1], file_path_parts[0])
        zip_file.close()
    print("Segmenting and tokenizing corpus...")
    raw_text: str
    with open(raw_text_path) as f:
        raw_text = f.read()
    language: str = "latin"
    raw_sentences: List[str] = nltk.sent_tokenize(raw_text, language=language)
    del raw_text
    word_tokenizer = WordTokenizer(language)
    with open(processed_text_path, "a+") as f:
        raw_text_tokenized = []
        for sent in tqdm(raw_sentences):
            raw_text_tokenized.append(word_tokenizer.tokenize(sent))
            if len(raw_text_tokenized) == 1000:
                for sentence in raw_text_tokenized:
                    f.write("\t".join(sentence) + "\n")
                raw_text_tokenized = []
        # flush any remaining sentences that did not fill a final batch of 1000
        for sentence in raw_text_tokenized:
            f.write("\t".join(sentence) + "\n")
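
# Usage sketch (not part of the original example): the file paths are hypothetical
# placeholders, and the imports the excerpt relies on (os, math, requests, nltk, tqdm,
# ZipFile, cltk's WordTokenizer) are assumed to be in scope.
build_corpus("latin_raw.txt", "latin_tokenized.txt")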
Example #2
    def test_tokenize_arabic_words(self):
        word_tokenizer = WordTokenizer('arabic')
        tests = [
            'اللُّغَةُ الْعَرَبِيَّةُ جَمِيلَةٌ.',
            'انما الْمُؤْمِنُونَ اخوه فاصلحوا بَيْنَ اخويكم',
            'الْعَجُزُ عَنِ الْإِدْرَاكِ إِدْرَاكٌ، وَالْبَحْثَ فِي ذاتِ اللَّه اشراك.',
            'اللَّهُمُّ اُسْتُرْ عُيُوبَنَا وَأَحْسَنَ خَوَاتِيمَنَا الْكَاتِبِ: نَبِيلُ جلهوم',
            'الرَّأْي قَبْلَ شَجَاعَة الشّجعَانِ',
            'فَأَنْزَلْنَا مِنْ السَّمَاء مَاء فَأَسْقَيْنَاكُمُوهُ',
            'سُئِلَ بَعْضُ الْكُتَّابِ عَنِ الْخَطّ، مَتَى يَسْتَحِقُّ أَنْ يُوصَفَ بِالْجَوْدَةِ ؟'
        ]

        results = []
        for test in tests:
            result = word_tokenizer.tokenize(test)
            results.append(result)

        target = [
            ['اللُّغَةُ', 'الْعَرَبِيَّةُ', 'جَمِيلَةٌ', '.'],
            ['انما', 'الْمُؤْمِنُونَ', 'اخوه', 'فاصلحوا', 'بَيْنَ', 'اخويكم'],
            [
                'الْعَجُزُ', 'عَنِ', 'الْإِدْرَاكِ', 'إِدْرَاكٌ', '،',
                'وَالْبَحْثَ', 'فِي', 'ذاتِ', 'اللَّه', 'اشراك', '.'
            ],  # pylint: disable=line-too-long
            [
                'اللَّهُمُّ', 'اُسْتُرْ', 'عُيُوبَنَا', 'وَأَحْسَنَ',
                'خَوَاتِيمَنَا', 'الْكَاتِبِ', ':', 'نَبِيلُ', 'جلهوم'
            ],  # pylint: disable=line-too-long
            ['الرَّأْي', 'قَبْلَ', 'شَجَاعَة', 'الشّجعَانِ'],
            [
                'فَأَنْزَلْنَا', 'مِنْ', 'السَّمَاء', 'مَاء',
                'فَأَسْقَيْنَاكُمُوهُ'
            ],
            [
                'سُئِلَ', 'بَعْضُ', 'الْكُتَّابِ', 'عَنِ', 'الْخَطّ', '،',
                'مَتَى', 'يَسْتَحِقُّ', 'أَنْ', 'يُوصَفَ', 'بِالْجَوْدَةِ', '؟'
            ]  # pylint: disable=line-too-long
        ]
        self.assertEqual(results, target)
Example #3
    def tag_ner_fr(self, input_text, output_type=list):

        entities = self.entities

        word_tokenizer = WordTokenizer('french')
        tokenized_text = word_tokenizer.tokenize(input_text)
        ner_tuple_list = []

        match = False
        for word in tokenized_text:
            for name, kind in entities:
                if word == name:
                    named_things = ([(name, 'entity', kind)])
                    ner_tuple_list.append(named_things)
                    match = True
                    break
            else:
                ner_tuple_list.append((word, ))
        return ner_tuple_list
Example #4
 def test_french_lemmatizer(self):
     text = "Li rois pense que par folie, Sire Tristran, vos aie amé ; Mais Dé plevis ma loiauté, Qui sor mon cors mete flaele, S'onques fors cil qui m’ot pucele Out m'amistié encor nul jor !"
     text = str.lower(text)
     tokenizer = WordTokenizer('french')
     lemmatizer = LemmaReplacer()
     tokens = tokenizer.tokenize(text)
     lemmas = lemmatizer.lemmatize(tokens)
     target = [('li', 'li'), ('rois', 'rois'), ('pense', 'pense'),
               ('que', 'que'), ('par', 'par'), ('folie', 'folie'),
               (',', ['PUNK']), ('sire', 'sire'), ('tristran', 'None'),
               (',', ['PUNK']), ('vos', 'vos'), ('aie', ['avoir']),
               ('amé', 'amer'), (';', ['PUNK']), ('mais', 'mais'),
               ('dé', 'dé'), ('plevis', 'plevir'), ('ma', 'ma'),
               ('loiauté', 'loiauté'), (',', ['PUNK']), ('qui', 'qui'),
               ('sor', 'sor'), ('mon', 'mon'), ('cors', 'cors'),
               ('mete', 'mete'), ('flaele', 'flaele'), (',', ['PUNK']),
               ("s'", "s'"), ('onques', 'onques'), ('fors', 'fors'),
               ('cil', 'cil'), ('qui', 'qui'), ("m'", "m'"), ('ot', 'ot'),
               ('pucele', 'pucele'), ('out', ['avoir']), ("m'", "m'"),
               ('amistié', 'amistié'), ('encor', 'encor'), ('nul', 'nul'),
               ('jor', 'jor'), ('!', ['PUNK'])]
     self.assertEqual(lemmas, target)
Example #5
def tokenize(request):
    language = request['Content-Language']
    src_data = request['Payload']
    print(language)

    word_tokenizer = WordTokenizer(language)
    data = word_tokenizer.tokenize(src_data)
    clean_data = list(map(cltk_normalize, [w for w in data if w.isalpha()]))
    # and not w in STOPS_LIST]

    # lemma = LemmaReplacer(language).lemmatize(clean_data)
    lemma = None
    if language == 'greek':
        lemma = BackoffGreekLemmatizer().lemmatize(clean_data)
    elif language == 'latin':
        lemma = BackoffLatinLemmatizer().lemmatize(clean_data)

    result = []
    for i, elem in enumerate(lemma):
        w, l = elem
        result.append({'index': i + 1, 'word': w, 'lemma': l})

    return result
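
# Usage sketch (not part of the original example): a hypothetical request mapping in the
# shape tokenize() expects; assumes cltk_normalize and the CLTK backoff lemmatizers used
# above are importable.
sample_request = {
    "Content-Language": "latin",
    "Payload": "Gallia est omnis divisa in partes tres",
}
for row in tokenize(sample_request):
    print(row["index"], row["word"], row["lemma"])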
Example #6
def runTest(text):
   '''Test cltk tools for latin'''
   print('Test phrase:')
   print(' -> ' + text)
   print()

#   print('[1/3] Testing JVReplacer')
#   jv = JVReplacer()
#   text = jv.replace(text)
#   print(' -> ' + text)
#   print()

   print('[2/3] Testing WordTokenizer')
   tokenizer = WordTokenizer('latin')
   tok = tokenizer.tokenize(text)
   print(' -> ' + ', '.join(["'{}'".format(t) for t in tok]))
   print()

   print('[3/3] Testing LemmaReplacer')
   lemmatizer = LemmaReplacer('latin')
   lem = lemmatizer.lemmatize(tok)
   print(' -> ' + ', '.join(["'{}'".format(l) for l in lem]))
   print()
Example #7
    def test_syllabification_old_norse(self):
        old_norse_syllabifier = Syllabifier(language="old_norse",
                                            break_geminants=True)
        text = "Gefjun dró frá Gylfa glöð djúpröðul óðla, svá at af rennirauknum rauk, Danmarkar auka. Báru öxn ok " \
               "átta ennitungl, þars gengu fyrir vineyjar víðri valrauf, fjögur höfuð."
        tokenizer = WordTokenizer('old_norse')
        words = tokenizer.tokenize(text)
        old_norse_syllabifier.set_invalid_onsets(invalid_onsets)

        syllabified_words = [
            old_norse_syllabifier.syllabify_ssp(word.lower()) for word in words
            if word not in ",."
        ]

        target = [['gef', 'jun'], ['dró'], ['frá'], ['gyl', 'fa'], ['glöð'],
                  ['djúp', 'rö', 'ðul'], ['óðl', 'a'], ['svá'], ['at'], ['af'],
                  ['ren', 'ni', 'rauk', 'num'], ['rauk'],
                  ['dan', 'mar', 'kar'], ['auk', 'a'], ['bár', 'u'], ['öxn'],
                  ['ok'], ['át', 'ta'], ['en', 'ni', 'tungl'], ['þars'],
                  ['geng', 'u'], ['fy', 'rir'], ['vi', 'ney', 'jar'],
                  ['víðr', 'i'], ['val', 'rauf'], ['fjö', 'gur'], ['hö', 'fuð']]
        self.assertListEqual(syllabified_words, target)
Example #8
 def test_dict_lemmatizer(self):
     """Test model_lemmatizer()"""
     lemmas = {
         "ceterum": "ceterus",
         "antequam": "antequam",
         "destinata": "destino",
         "componam": "compono",
     }  # pylint: disable=line-too-long
     lemmatizer = DictLemmatizer(lemmas=lemmas)
     test_str = "Ceterum antequam destinata componam"
     target = [
         ("ceterum", "ceterus"),
         ("antequam", "antequam"),
         ("destinata", "destino"),
         ("componam", "compono"),
     ]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer("latin")
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #9
def convert_to_toks(sents):

    sent_tokenizer = SentenceTokenizer()
    word_tokenizer = WordTokenizer('latin')

    all_sents = []

    for data in sents:
        text = data.lower()

        # use a distinct name so the outer `sents` argument is not shadowed
        sentences = sent_tokenizer.tokenize(text)
        for sent in sentences:
            tokens = word_tokenizer.tokenize(sent)
            filt_toks = []
            filt_toks.append("[CLS]")
            for tok in tokens:
                if tok != "":
                    filt_toks.append(tok)
            filt_toks.append("[SEP]")

            all_sents.append(filt_toks)

    return all_sents
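
# Usage sketch (not part of the original example): assumes the Latin SentenceTokenizer and
# WordTokenizer imports used above are in scope. Each returned list is one sentence wrapped
# in [CLS]/[SEP] markers.
docs = ["Gallia est omnis divisa in partes tres. Quarum unam incolunt Belgae."]
for toks in convert_to_toks(docs):
    print(toks)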
Example #10
    def test_greek_word_tokenizer(self):
        """Test Latin-specific word tokenizer."""
        word_tokenizer = WordTokenizer('greek')

        # Test sources:
        # - Thuc. 1.1.1

        test = "Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν Πελοποννησίων καὶ Ἀθηναίων, ὡς ἐπολέμησαν πρὸς ἀλλήλους, ἀρξάμενος εὐθὺς καθισταμένου καὶ ἐλπίσας μέγαν τε ἔσεσθαι καὶ ἀξιολογώτατον τῶν προγεγενημένων, τεκμαιρόμενος ὅτι ἀκμάζοντές τε ᾖσαν ἐς αὐτὸν ἀμφότεροι παρασκευῇ τῇ πάσῃ καὶ τὸ ἄλλο Ἑλληνικὸν ὁρῶν ξυνιστάμενον πρὸς ἑκατέρους, τὸ μὲν εὐθύς, τὸ δὲ καὶ διανοούμενον."

        target = [
            'Θουκυδίδης', 'Ἀθηναῖος', 'ξυνέγραψε', 'τὸν', 'πόλεμον', 'τῶν',
            'Πελοποννησίων', 'καὶ', 'Ἀθηναίων', ',', 'ὡς', 'ἐπολέμησαν',
            'πρὸς', 'ἀλλήλους', ',', 'ἀρξάμενος', 'εὐθὺς', 'καθισταμένου',
            'καὶ', 'ἐλπίσας', 'μέγαν', 'τε', 'ἔσεσθαι', 'καὶ', 'ἀξιολογώτατον',
            'τῶν', 'προγεγενημένων', ',', 'τεκμαιρόμενος', 'ὅτι', 'ἀκμάζοντές',
            'τε', 'ᾖσαν', 'ἐς', 'αὐτὸν', 'ἀμφότεροι', 'παρασκευῇ', 'τῇ',
            'πάσῃ', 'καὶ', 'τὸ', 'ἄλλο', 'Ἑλληνικὸν', 'ὁρῶν', 'ξυνιστάμενον',
            'πρὸς', 'ἑκατέρους', ',', 'τὸ', 'μὲν', 'εὐθύς', ',', 'τὸ', 'δὲ',
            'καὶ', 'διανοούμενον', '.'
        ]

        result = word_tokenizer.tokenize(test)

        self.assertEqual(result, target)
Example #11
def read_file(filename):
    sent_tokenizer = SentenceTokenizer()
    word_tokenizer = WordTokenizer('latin')

    all_sents = []
    with open(filename, encoding="utf-8") as file:
        data = file.read()

        # BERT model is lowercase
        text = data.lower()

        sents = sent_tokenizer.tokenize(text)
        for sent in sents:
            tokens = word_tokenizer.tokenize(sent)
            filt_toks = []
            for tok in tokens:
                if tok != "":
                    filt_toks.append(tok)
            filt_toks.insert(0, "[CLS]")
            filt_toks.append("[SEP]")

            all_sents.append(filt_toks)

    return all_sents
Example #12
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from cltk.corpus.utils.formatter import normalize_fr
from cltk.stop.french.stops import STOPS_LIST as FRENCH_STOPS
from cltk.tokenize.word import WordTokenizer
from cltk.lemmatize.french.lemma import LemmaReplacer
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import graphviz
import pydotplus
import collections
from openpyxl import load_workbook  # needed for load_workbook() below; not shown in the original excerpt

json_data = None
tokenizer = WordTokenizer('french')
wb = load_workbook(filename='classif_questions.xlsx')

# Select the current worksheet
ws = wb['Feuil2']

# Create a new worksheet
ws1 = wb.create_sheet()
ws1.title = ws.title
stemmer = FrenchStemmer()
words = set(stopwords.words("french"))  # load stopwords
words.add("la")
words.add("le")
words.add("dans")
words.add("un")
words.add("une")
Example #13
# script to figure out frequency of each type of subjunctive

from cltk.tag.pos import POSTag
from cltk.tokenize.word import WordTokenizer
from os import listdir

tagger = POSTag('latin')
wt = WordTokenizer('latin')

filelist = sorted([f for f in listdir('./ovid_metamorphoses') if f.endswith('txt')])

present, imperfect, perfect, pluperfect = 0, 0, 0, 0
pres_ex, impf_ex, perf_ex, plup_ex = [], [], [], []

def count_subj(filename):

    global present, imperfect, perfect, pluperfect
    global pres_ex, impf_ex, perf_ex, plup_ex

    infile = open(filename)
    raw = infile.read()
    infile.close()

    tokenized = wt.tokenize(raw)
    tokenized = [t for t in tokenized if t]  # the original 'if not None' filtered nothing out

    for t in tokenized:

        tagged = tagger.tag_crf(t)

        if len(tagged) > 1 or len(tagged) == 0 or tagged[0][1] is None:
            continue  # the excerpt is cut off here; skipping such tokens is the assumed intent
Example #14
__author__ = [
    "Clément Besnier <*****@*****.**>",
]
__license__ = "MIT License"

poetic_edda = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           "Sæmundar-Edda")
poetic_edda_titles = [
    'Rígsþula', 'Helreið Brynhildar', 'Gróttasöngr', 'Sigrdrífumál',
    'Hárbarðsljóð', 'Grímnismál', 'Þrymskviða', 'Völuspá',
    'Atlamál in grænlenzku', 'Hyndluljóð', 'Skírnismál', 'Hymiskviða',
    'Atlakviða', 'Vafþrúðnismál', 'Oddrúnarkviða', 'Völundarkviða',
    'Alvíssmál', 'Fáfnismál', 'Dráp Niflunga', 'Hávamál', 'Guðrúnarhvöt',
    'Hamðismál', 'Baldrs draumar', 'Lokasenna', 'Guðrúnarkviða'
]

old_norse_tokenizer = WordTokenizer("old_norse")  # underscore form, as in the other Old Norse examples here


class Converter:
    @staticmethod
    def converts_html_to_txt():
        """
        >>> Converter.converts_html_to_txt()

        :return:
        """
        book = "Sæmundar-Edda"
        for text_name in os.listdir(book):
            text_extractor("html", "txt", os.path.join(book, text_name),
                           ["complete.html"], ["complete.txt"], extract_text)
Example #15
from IPython.display import Image
from cltk.stop.latin import STOPS_LIST

# See http://docs.cltk.org/en/latest/latin.html#sentence-tokenization

cato_agri_praef = "Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum. Maiores nostri sic habuerunt et ita in legibus posiverunt: furem dupli condemnari, foeneratorem quadrupli. Quanto peiorem civem existimarint foeneratorem quam furem, hinc licet existimare. Et virum bonum quom laudabant, ita laudabant: bonum agricolam bonumque colonum; amplissime laudari existimabatur qui ita laudabatur. Mercatorem autem strenuum studiosumque rei quaerendae existimo, verum, ut supra dixi, periculosum et calamitosum. At ex agricolis et viri fortissimi et milites strenuissimi gignuntur, maximeque pius quaestus stabilissimusque consequitur minimeque invidiosus, minimeque male cogitantes sunt qui in eo studio occupati sunt. Nunc, ut ad rem redeam, quod promisi institutum principium hoc erit."
cato_agri_praef_lowered = cato_agri_praef.lower()
# create a tokenizer instance of the TokenizeSentence Class
latin_sentence_tokenizer = TokenizeSentence('latin')

#tokenize the text into sentence tokens
cato_sentence_tokens = latin_sentence_tokenizer.tokenize_sentences(
    cato_agri_praef)

# tokenize the text (or specific sentences) into specific words
latin_word_tokenizer = WordTokenizer('latin')
cato_word_tokens = latin_word_tokenizer.tokenize(cato_agri_praef_lowered)
cato_word_tokens_WO_punt = [
    token for token in cato_word_tokens if token not in ['.', ',', ':', ';']
]

#print the tokens and the number of tokens
num_of_sentences = len(cato_sentence_tokens)
num_of_words = len(cato_word_tokens_WO_punt)
#print("There are " + str(num_of_sentences) + " sentences in the text")
#print("There are " + str(num_of_words) + " words in the text")
# for sentence in cato_sentence_tokens:
#     print(sentence)
#     print()

#print(cato_word_tokens_WO_punt)
Example #16
stopwords.has_lang("th")  # check if there is a stopwords for the language
stopwords.langs()  # return a set of all the supported languages
#stopwords.stopwords("en")  # English stopwords
import emoji
from tqdm import tqdm_notebook
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize
import re
import nltk
from cltk.tokenize.word import WordTokenizer
#from nltk.corpus import stopwords
from cltk.stop.classical_hindi.stops import STOPS_LIST

# Cell
tok = WordTokenizer(language='multilingual')
## libraries that can be used
hi_stopwords = []
with open('../Data/Data/hindi_stopwords.txt', 'r') as fp:
    for w in fp.readlines():
        hi_stopwords.append(str(w[:-1]))
puncts = [
    ">", "+", ":", ";", "*", "’", "●", "•", "-", ".", "''", "``", "'", "|",
    "​", "!", ",", "@", "?", "\u200d", "#", "(", ")", "|", "%", "।", "=", "``",
    "&", "[", "]", "/", "'"
]
stop_for_this = hi_stopwords + list(
    stopwords.stopwords(["en", "hi", "ta", "te", "bn"])) + [
        "आएगा", "गए", "गई", "करे", "नही", "हम", "वो", "follow", "दे", "₹",
        "हर", "••••", "▀▄▀", "नही", "अब", "व्हाट्सएप", "॥", "–", "ov", "डॉ",
        "ॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐ", "क्या", "जी", "वो", "╬═╬", "_",
]  # closes stop_for_this; the list is cut off in the source excerpt
Example #17
def createCorpus(text, save=True):
    '''
    :param text: the raw text

    returns  + the corpus, a list of lists of tokenized sentences
             + the vocab, a dictionary with the frequency of each token scaled by the total number of words

    '''
    # load stopwords
    with open('../data/stopwords.txt', 'r', encoding="UTF-8") as src:
        stopwords = src.read()

    # add punctuation signs
    stopwords = stopwords.split('\n')
    stopwords.extend(
        [".", ",", "?", "!", "-", ":", ";", "·", "”", "“", "«", "»"])

    # tokenize sentences and then words
    Stokenizer = TokenizeSentence('greek')
    Wtokenizer = WordTokenizer('greek')

    sentences = Stokenizer.tokenize(text)
    new_sentences = []
    vocab = dict()

    print('Building corpus and freqDictionary')
    total_tokens = 0
    check = 0
    # for each sentence
    for sent in tqdm(sentences, desc="Sentences"):
        # extract the words
        new_sent = Wtokenizer.tokenize(sent)
        check += len(new_sent)
        # Stopword deletion
        new_sent = [w for w in new_sent if w not in stopwords]
        new_sentences.append(new_sent)
        total_tokens += len(new_sent)
        # add each word to dictionary or update count
        for w in new_sent:
            # Increment tokens count
            if w not in vocab:
                vocab[w] = 1
            else:
                vocab[w] += 1
    vocab_size = len(vocab)

    print("total tokens: ", total_tokens)
    print("total token (incl. stopwords)", check)
    print("vocab_size : ", vocab_size)
    # Subsampling
    threshold = 10e-05
    for k, v in vocab.items():
        # http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/
        # Not really used for subsampling here but to generate the noise distribution
        frac = v / total_tokens
        p_w = (1 + math.sqrt(frac / threshold)) * (threshold / frac)
        vocab[k] = p_w

    if save:
        print('Saving the frequencies')
        with open(args.word_frequencies, 'w', encoding='utf-8') as fp:
            json.dump(vocab, fp, ensure_ascii=False)

        print('Saving the corpus')
        arr = np.array(new_sentences, dtype=object)
        np.save('../data/Homer_tokenized_accented.npy', arr)

        with open('../data/vocabs/Homer_wordList.csv', "w",
                  encoding="utf-8") as fp:
            for idx, word in tqdm(enumerate(vocab)):
                fp.write(str(idx) + "," + word + "\n")

    return new_sentences, vocab
Example #18
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.word import WordTokenizer
import re

p = PunktLanguageVars()
w = WordTokenizer('greek')

p._re_word_tokenizer = re.compile(PunktLanguageVars._word_tokenize_fmt % {
    'NonWord': r"(?:[?!.)\";;}\]\*:@\'\({\[])", #Incorporates period and greek question mark to exclude from word tokens (PunktLanguageVars._re_non_word_chars includes these in word tokens)
    'MultiChar': PunktLanguageVars._re_multi_char_punct,
    'WordStart': PunktLanguageVars._re_word_start,
}, re.UNICODE | re.VERBOSE)


s = 'test test test. test test; test test? test test test test test. test. test test. test? test test.'
assert p.word_tokenize(s) == w.tokenize(s)

d = [('tesserae/texts/grc/achilles_tatius.leucippe_et_clitophon.tess', 'feature_data/a.txt', 'feature_data/b.txt'), 
	('tesserae/texts/grc/bacchylides.epinicians.tess', 'feature_data/m.txt', 'feature_data/n.txt'), 
	('tesserae/texts/grc/polybius.histories.tess', 'feature_data/x.txt', 'feature_data/y.txt')]

for t in d:
	with open(t[0], mode='r', encoding='utf-8') as f:
		from io import StringIO
		file_text = StringIO()
		for line in f:
			#Ignore lines without tess tags, or parse the tag out and strip whitespace
			if not line.startswith('<'):
				continue
			assert '>' in line
			file_text.write(line[line.index('>') + 1:].strip())
Example #19
def main(args):

    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    tokenizers = {
        "en": spacy.load("en_core_web_sm"),
        "zh": spacy.load("zh_core_web_sm"),
        "ru": Russian(),
        "fr": spacy.load("fr_core_news_sm"),
        "es": spacy.load("es_core_news_sm"),
        "ar": WordTokenizer("arabic"),
    }

    src_tokenizer = None
    if args.src_tok is not None:
        src_tok = tokenizers[args.src_tok]
        if args.src_tok == "ar":

            def tokenize_src(text):
                return [tok for tok in src_tok.tokenize(text)]

        else:

            def tokenize_src(text):
                return [tok.text for tok in src_tok.tokenizer(text)]

        src_tokenizer = tokenize_src

    trg_tokenizer = None
    if args.trg_tok is not None:
        trg_tok = tokenizers[args.trg_tok]
        if args.trg_tok == "ar":

            def tokenize_trg(text):
                return [tok for tok in trg_tok.tokenize(text)]

        else:

            def tokenize_trg(text):
                return [tok.text for tok in trg_tok.tokenizer(text)]  # 'tokz' in the excerpt was undefined

        trg_tokenizer = tokenize_trg

    if args.task == "translation":
        indices = prep_trans_files(
            args.src_file,
            args.trg_file,
            args.save_path,
            src_tok=src_tokenizer,
            trg_tok=trg_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )
    elif args.task == "tagging":
        indices = prep_tag_files(
            args.src_file,
            args.save_path,
            src_tok=src_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )

    train, indices = train_test_split(indices, test_size=0.3, random_state=42)
    valid, test = train_test_split(indices, test_size=0.5, random_state=42)

    split_to_tsv("train", train, args.save_path)
    split_to_tsv("test", test, args.save_path)
    split_to_tsv("valid", valid, args.save_path)

    # delete temporary files
    os.remove(os.path.join(args.save_path, "temp_src.txt"))
    os.remove(os.path.join(args.save_path, "temp_trg.txt"))
Example #20
 def test_middle_high_german_tokenizer(self):
     text = "Gâwân het êre unde heil,\nieweders volleclîchen teil:\nnu nâht och sînes kampfes zît."
     target = ['Gâwân', 'het', 'êre', 'unde', 'heil', ',', 'ieweders', 'volleclîchen', 'teil', ':', 'nu', 'nâht', 'och', 'sînes', 'kampfes', 'zît', '.']
     tokenizer = WordTokenizer('middle_high_german')
     tokenized_lines = tokenizer.tokenize(text)
     self.assertEqual(tokenized_lines, target)
Example #21
    "ΤΙ",
    "Ἀ",
    "ΘΕΟ",
    "ΛΑ",
    "ΓΟΡ",
]
numbers = ["α ́", "β ́", "γ ́", "δ ́", "ε ́", "ζ ́", "η ́", "θ ́", "ι ́", "ς ́", "κ ́"]
other_abbreviations = [
    "ὑπ",
    "Ἰνδ",
    "γρ",
]
abbreviations = speakers + numbers + other_abbreviations
ABBREVIATIONS = [i.lower() for i in abbreviations]

word_tokenizer = WordTokenizer('greek')


def get_corpus():
    with open("Ancient_Greek_ML.txt", "r") as f:
        texts = f.read()
    return texts

def strip_accents_from_sentence(sent):
    words = sent.split()
    words = list(map(lambda x: strip_accents(x), words))
    return " ".join(words)


def non_destructive_split(t, delim):
    """Splits the text t after the delimiter delim, retaining delim with the part of t that preceeded it. Returns a list of strings"""
Example #22
    def test_latin_word_tokenizer(self):
        """Test Latin-specific word tokenizer."""
        word_tokenizer = WordTokenizer('latin')

        #Test sources:
        # - V. Aen. 1.1
        # - Prop. 2.5.1-2
        # - Ov. Am. 1.8.65-66
        # - Cic. Phillip. 13.14
        # - Plaut. Capt. 937
        # - Lucr. DRN. 5.1351-53
        # - Plaut. Bacch. 837-38
        # - Plaut. Amph. 823
        # - Caes. Bel. 6.29.2

        tests = [
            'Arma virumque cano, Troiae qui primus ab oris.',
            'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
            'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!',
            'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.',
            'Quid opust verbis? lingua nullast qua negem quidquid roges.',
            'Textile post ferrumst, quia ferro tela paratur, nec ratione alia possunt tam levia gigni insilia ac fusi, radii, scapique sonantes.',  # pylint: disable=line-too-long
            'Dic sodes mihi, bellan videtur specie mulier?',
            'Cenavin ego heri in navi in portu Persico?',
            'quae ripas Ubiorum contingebat in longitudinem pedum ducentorum rescindit'
        ]

        results = []

        for test in tests:
            result = word_tokenizer.tokenize(test)
            results.append(result)

        target = [
            [
                'Arma', 'virum', '-que', 'cano', ',', 'Troiae', 'qui',
                'primus', 'ab', 'oris', '.'
            ],
            [
                'Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',',
                'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere',
                'nequitia', '?'
            ],  # pylint: disable=line-too-long
            [
                'Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria',
                'cerae', '.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper',
                'amator', ',', 'avos', '!'
            ],  # pylint: disable=line-too-long
            [
                'Neque', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id',
                'ei', 'licet', ',', 'nec', ',', 'si', 'non', 'obstatur', ',',
                'propterea', 'etiam', 'permittitur', '.'
            ],  # pylint: disable=line-too-long
            [
                'Quid', 'opus', 'est', 'verbis', '?', 'lingua', 'nulla', 'est',
                'qua', 'negem', 'quidquid', 'roges', '.'
            ],  # pylint: disable=line-too-long
            [
                'Textile', 'post', 'ferrum', 'est', ',', 'quia', 'ferro',
                'tela', 'paratur', ',', 'nec', 'ratione', 'alia', 'possunt',
                'tam', 'levia', 'gigni', 'insilia', 'ac', 'fusi', ',', 'radii',
                ',', 'scapi', '-que', 'sonantes', '.'
            ],  # pylint: disable=line-too-long
            [
                'Dic', 'si', 'audes', 'mihi', ',', 'bella', '-ne', 'videtur',
                'specie', 'mulier', '?'
            ],
            [
                'Cenavi', '-ne', 'ego', 'heri', 'in', 'navi', 'in', 'portu',
                'Persico', '?'
            ],
            [
                'quae', "ripas", "Ubiorum", "contingebat", "in",
                "longitudinem", "pedum", "ducentorum", "rescindit"
            ]
        ]

        self.assertEqual(results, target)
Example #23
import json
import os
# from nltk.tokenize import TweetTokenizer
from cltk.tokenize.word import WordTokenizer

src = "/home/du0/15CS30016/MTP2/DrQA/data/datasets/bengali/v1.2/SQuAD-v1.1-train.json"
dest = "SQuAD_mrqa_bengali_train-v1.2.jsonl"
dataset_name = "squad_bengali"
split_type = "train"  # train/dev

with open(src, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# tknzr = TweetTokenizer()
tknzr = WordTokenizer('multilingual')
f = open(dest, 'w')
# f.write ('{"header": {"dataset": "{dname}", "split": "{dataset_type}"}}'.format (dname = dataset_name, dataset_type = dataset_type))
f.write(json.dumps({"header": {"dataset": dataset_name, "split": split_type}}))
f.write('\n')

data = json_data['data']


def getTokenSpans(s: str, tokens: list):
    offset = 0
    spans = list()
    for token in tokens:
        position = s.find(token, offset)
        spans.append([token, position])
        offset = position + len(token)
    return spans  # assumed return; the excerpt ends mid-function in the source
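
# Usage sketch (not part of the original example): character offsets for each token,
# relying on the return statement assumed above.
print(getTokenSpans("abc def abc", ["abc", "def", "abc"]))
# -> [['abc', 0], ['def', 4], ['abc', 8]]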
Example #24
def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus. Return a list of sentences for an author. Each sentence
    is itself a list of tokenized words.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')

        if rm_stops:
            stops = greek_stops  # the excerpt read 'latin_stops'; a Greek stop list is assumed for TLG texts
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        # doc_sentences = []
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence,
                                    rm_punctuation=True,
                                    rm_periods=True)
            sentence = word_tokenizer.tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [
                    w[1:] if w.startswith('-') else w for w in sentence
                ]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
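
# Usage sketch (not part of the original example): gen_docs() is a generator, so sentences
# are produced lazily; this assumes a local PHI5 corpus plus the CLTK imports the excerpt
# relies on (JVReplacer, LemmaReplacer, TokenizeSentence, stop lists, corpus helpers).
for sentence in gen_docs('phi5', lemmatize=False, rm_stops=True):
    print(sentence[:10])  # first ten tokens of the first cleaned sentence
    break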
Example #25
from cltk.corpus.old_norse.syllabifier import hierarchy, invalid_onsets
from cltk.text_reuse.levenshtein import Levenshtein

from zoegas.constants import postags, dictionary_name, pos_verbose

# phonetic transcriber
phonetic_transcriber = phu.Transcriber(ont.DIPHTHONGS_IPA,
                                       ont.DIPHTHONGS_IPA_class, ont.IPA_class,
                                       ont.old_norse_rules)

# Old Norse syllabifier
s = Syllabifier(language="old_norse", break_geminants=True)
s.set_invalid_onsets(invalid_onsets)
s.set_hierarchy(hierarchy)

old_norse_word_tokenizer = WordTokenizer("old_norse")


def clean(text: str) -> Optional[str]:
    """
    Remove tab and newline characters from text.

    :param text: the string to clean, or None
    :return: the cleaned string, or None if text is None
    """
    if text is not None:
        text = re.sub(r"\t", "", text)
        text = re.sub(r"\n", "", text)
        return text
    else:
        return None
Example #26
## Ingests texts on a corpus or single level basis from FileImport
#
RE = Reader(FI.filedata)
RE.ingest_corpus()
#RE.print_toc()
RE.print_single_text_sentences('Q003497', 'normalization')
print()
#RE.print_single_text('Q003497')
# Toying with ability to use tokenizers on text...


from collections import Counter
from cltk.corpus.akkadian.tokenizer import Tokenizer
line_tokenizer = Tokenizer(preserve_damage=False)
from cltk.tokenize.word import WordTokenizer
word_tokenizer = WordTokenizer('akkadian')
from nltk import Text

sennacherib = line_tokenizer.string_tokenizer(RE.lines)
sennacherib_tokens = word_tokenizer.tokenize('\n'.join(sennacherib))
s_tokens = [word[0] for word in sennacherib_tokens]
word_count = Counter(s_tokens)

running = 0
print("Top 25 words in the Taylor's Prism:\n")
print("{number:>5}  {word:<20}     {count:<12}{percent:<12}{running:<12}". \
      format(number="", word="TOKEN", count="COUNT", percent="TOKEN %", running = "RUNNING %"))
for i, pair in enumerate(word_count.most_common(25)):
    running += pair[1]
    print("{number:>5}. {word:<20}      {count:<12}{percent:<12}{running:<12}". \
          format(number=i+1, word=pair[0], count=pair[1], \
Example #27
def word_tokenize(text):
    print("Word Tokenizer triggered")
    word_tokenizer = WordTokenizer('sanskrit')
    # print("word tokenize: ", word_tokenizer.tokenize(self.sentence))
    return word_tokenizer.tokenize(text)
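
# Usage sketch (not part of the original example): the Devanagari sample is illustrative
# input only; the printed tokens are whatever the CLTK Sanskrit tokenizer returns.
print(word_tokenize("धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः"))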
Example #28
 def word_tokenize(self):
     word_tokenizer = WordTokenizer('sanskrit')
     return word_tokenizer.tokenize(self.sentence)