def process_document(self, doc):
    cleaned_sents = []
    for paragraph in doc['text'].values():
        if not isinstance(paragraph, str):
            # nested section: iterate over its sub-values below
            paragraph = paragraph.values()
        else:
            paragraph = self.sent_tokenizer.tokenize(paragraph)
        for sent in paragraph:
            if isinstance(sent, dict):
                for subsent in sent.values():
                    tokenized = self.sent_tokenizer.tokenize(subsent)
                    for token in tokenized:
                        cleaned = tlg_plaintext_cleanup(
                            token, rm_punctuation=True, rm_periods=True)
                        sentence = cltk_normalize(cleaned)
                        # keep only sentences longer than five words
                        if len(self.word_tokenizer.tokenize(sentence)) > 5:
                            cleaned_sents.append(sentence)
            else:
                tokenized = self.sent_tokenizer.tokenize(sent)
                for token in tokenized:
                    cleaned = tlg_plaintext_cleanup(
                        token, rm_punctuation=True, rm_periods=True)
                    sentence = cltk_normalize(cleaned)
                    if len(self.word_tokenizer.tokenize(sentence)) > 5:
                        cleaned_sents.append(sentence)
    return cleaned_sents
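# A minimal usage sketch for `process_document` above, assuming it is
# available as a module-level function. The wrapper class and tokenizer
# choices here are assumptions, not part of the original: the method only
# needs an object exposing `sent_tokenizer`/`word_tokenizer` and a
# Perseus-style dict of paragraphs under doc['text'] (pre-1.0 CLTK API,
# with the greek_models_cltk corpus installed).
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer

class DocProcessor:
    sent_tokenizer = TokenizeSentence('greek')
    word_tokenizer = WordTokenizer('greek')
    process_document = process_document  # reuse the function defined above

doc = {'text': {'1': 'ἄνδρα μοι ἔννεπε μοῦσα πολύτροπον ὃς μάλα πολλὰ '
                     'πλάγχθη ἐπεὶ Τροίης ἱερὸν πτολίεθρον ἔπερσεν.'}}
print(DocProcessor().process_document(doc))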
def test_cltk_normalize_compatible(self):
    """Test Normalizing Text with compatibility True"""
    s1 = "café"
    s2 = "cafe\u0301"
    normalized_text = cltk_normalize(s1, compatibility=True)
    target = normalize("NFKC", s2)
    self.assertEqual(normalized_text, target)
def test_cltk_normalize_noncompatible(self):
    """Test Normalizing Text with compatibility False"""
    s1 = 'café'
    s2 = 'cafe\u0301'
    normalized_text = cltk_normalize(s1, compatibility=False)
    target = normalize('NFC', s2)
    self.assertEqual(normalized_text, target)
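# The two tests above turn on Unicode equivalence: "café" with a
# precomposed é (U+00E9) and "cafe\u0301" with a combining acute accent
# look identical but compare unequal until both are normalized. A minimal
# stdlib-only sketch of the same point:
from unicodedata import normalize

s1 = "caf\u00e9"   # precomposed é
s2 = "cafe\u0301"  # e + combining acute accent
assert s1 != s2                                      # raw strings differ
assert normalize("NFC", s1) == normalize("NFC", s2)  # canonical forms agree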
def main():
    if len(sys.argv) < 2:
        print("Please supply an inflected word on the command line. "
              "Example: search_by_lemma.py κύνεσσιν\n")
        sys.exit()
    infl = sys.argv[1]
    lem = lemmatize(infl)[0]  # lemmatized form
    print("searching for " + lem + " <- " + infl)
    index = {}
    for work in ["iliad", "odyssey"]:
        for book in range(1, 24 + 1):  # books 1 through 24
            filename = 'texts/homer.' + work + '.part.' + str(book) + '.tess'
            reader = get_corpus_reader(corpus_name='greek_text_tesserae',
                                       language='greek')
            reader._fileids = [filename]
            sentences = list(reader.sents([filename]))
            sentences = [cltk_normalize(s) for s in sentences]
            count_sentences = 0
            for s in sentences:
                count_sentences += 1
                # Remove punctuation, which the lemmatizer treats as
                # independent words.
                no_punct = re.sub(r"[,;:\.']", '', s)
                words = re.split(r"\s+", no_punct)
                count_words = 0
                for word in lemmatize(no_punct):
                    count_words += 1
                    if lem == word:
                        i = count_words - 1
                        w = words[i]
                        # Three words of context on either side; slicing past
                        # the end of the list is safe in Python, so no upper
                        # bound is needed (min(i + 4, len(words) - 1) would
                        # wrongly drop the final word).
                        context = " ".join(words[max(i - 3, 0):i + 4])
                        # Tag the words in the sentence with parts of speech:
                        # https://github.com/cltk/tutorials/blob/master/8%20Part-of-speech%20tagging.ipynb
                        # For what the POS tags mean, see
                        # https://linguistics.stackexchange.com/questions/12803/what-do-the-labels-mean-in-this-latin-pos-tagging
                        pos_tagged = tagger.tag_tnt(no_punct)
                        describe = w
                        for t in pos_tagged:
                            if t[0] == w:
                                describe = t[0] + " " + pos_tag_to_description(t[1])
                                break
                        print(work + " " + str(book) + ", sentence " +
                              str(count_sentences) + ", word " +
                              str(count_words) + ": " + describe + " " + context)
                        if w in index:
                            index[w] += 1
                        else:
                            index[w] = 1
    for w in sorted(list(index.keys())):
        print(str(index[w]) + " " + w)
def iter_docs(docs_dir, rm_ascii=False):
    """Stream files in a dir (TLG, TEI, etc.) doc-by-doc."""
    file_names = os.listdir(docs_dir)
    for file_name in file_names:
        file_path = os.path.join(docs_dir, file_name)
        with open(file_path) as file_open:
            file_read = file_open.read()
        tokens = tokenize(file_read, rm_ascii=rm_ascii)
        tokens = [cltk_normalize(token) for token in tokens]
        # ignore very short docs
        # TODO: get file length distribution to better know what is short in TLG
        if len(tokens) < DOC_MIN:
            continue
        yield file_name, tokens
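# A possible driver for `iter_docs`, assuming the module-level `os` import
# used above, a hypothetical directory of plain-text files, and that
# `tokenize` and `DOC_MIN` are defined as in the config block further below:
docs_path = os.path.expanduser('~/cltk_data/user_data/plaintext')  # hypothetical
for file_name, tokens in iter_docs(docs_path, rm_ascii=True):
    print(file_name, len(tokens))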
def lemmatizeList(self, lines):
    from cltk.corpus.utils.formatter import cltk_normalize
    tagger = POSTag('greek')
    lemmatizer = LemmaReplacer('greek')
    # Normalization can help with certain texts (the CLTK docs recommend it).
    lines = cltk_normalize(lines)
    lines = lemmatizer.lemmatize(lines)
    # Remove stopwords and lowercase the remaining words.
    lines = [w.lower() for w in lines if w not in STOPS_LIST]
    return ' '.join(lines)
def predict_from_file(path, model, use_sequential_decoding, align, step_len):
    """Runs prediction using the model on the texts located in the file given in path."""
    max_seq_len = model.processor.max_seq_len - 2
    with open(path, "r") as fp:
        texts = fp.read().splitlines()
    # prepare texts
    texts = clean_texts(texts, CHARS_TO_REMOVE, CHARS_TO_REPLACE)
    texts = [cltk_normalize(replace_square_brackets(t)) for t in texts]
    texts = [t.replace(" ", "_") for t in texts]
    results = []
    # break up long texts into overlapping windows
    for t in texts:
        sequences = []
        if len(t) >= max_seq_len:
            if not (step_len and step_len < max_seq_len):
                step_len = round(max_seq_len / 2)
            for i in range(0, len(t), step_len):
                seq = t[i:i + max_seq_len]
                sequences.append(seq)
        else:
            sequences.append(t)
        sequences = convert_masking(sequences)
        dicts = sentences_to_dicts(sequences)
        if use_sequential_decoding:
            result = model.predict_sequentially(dicts=dicts)
        else:
            result = model.predict(dicts=dicts)
        results.append(result)
    # output results
    for result in results:
        nb_of_masks = 0  # needed for proper alignment
        for i, res in enumerate(result):
            predicted_text = res["predictions"]["text_with_preds"].replace("_", " ")
            masked_text = res["predictions"]["masked_text"].replace("_", " ")
            if align:
                if not step_len:
                    step_len = round(max_seq_len / 2)
                # An approximate alignment: shift each line by step_len plus
                # 2 * the number of masks in the overlapping portion of the
                # previous prediction (to account for the square brackets
                # added around each prediction).
                print(" " * (step_len * i + 2 * nb_of_masks) + predicted_text)
                nb_of_masks += len(re.findall(r"#+", masked_text[:step_len]))
            else:
                print(res["predictions"]["text_with_preds"].replace("_", " "))
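# The windowing logic above, isolated: split a string into windows of
# max_seq_len characters that overlap by half a window (by default), so
# every character is seen by the model in at least one context. A
# stdlib-only sketch with toy numbers; the function name is hypothetical:
def split_overlapping(t, max_seq_len, step_len=None):
    if not (step_len and step_len < max_seq_len):
        step_len = round(max_seq_len / 2)
    return [t[i:i + max_seq_len] for i in range(0, len(t), step_len)]

print(split_overlapping("abcdefghij", max_seq_len=4))
# -> ['abcd', 'cdef', 'efgh', 'ghij', 'ij']
# Note the trailing short window, matching the original loop's behavior.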
def normalize(self):
    """Fixes problems with differences in Greek accent encoding.

    Certain Greek accents have more than one possible encoding. Uses CLTK's
    built-in normalizer to correct the character encoding differences and
    ensure that accents are encoded the same way.

    Returns:
        :obj:`self.__class__` New instance with altered text

    Example:
        >>> text = AncientGreekText('ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι')
        >>> print(text.normalize())
        ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι
    """  # noqa
    from cltk.corpus.utils.formatter import cltk_normalize
    return self.__class__(
        text=cltk_normalize(str(self.data)),
        options=self.options
    )
def clean_tokens(tokens, chars_to_remove, chars_to_replace):
    """Cleans a list of tokens."""
    cleaned_tokens = []
    for t in tokens:
        if t:
            # skip tokens in which Latin (ASCII) characters appear
            if not re.search(r"\w+", t, re.ASCII):
                # skip tokens containing digits
                if not re.search(r"\d+", t):
                    # normalize
                    t = cltk_normalize(t)
                    t = t.strip("\t\r\n")
                    # remove unwanted chars
                    t = remove_unwanted_chars(t, chars_to_remove)
                    t = replace_chars(t, chars_to_replace)
                    # remove any inter-word hyphens or en-dashes
                    t = re.sub(r"([^\s])(-|–)", r"\1", t)
                    # collapse all whitespace runs to single spaces
                    t = re.sub(r"\s+", " ", t)
                    cleaned_tokens.append(t)
    return cleaned_tokens
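# The two guard regexes in `clean_tokens`, shown in isolation: with
# re.ASCII, \w matches only ASCII letters/digits/underscore, so the first
# search flags tokens containing any Latin characters, while the plain \d
# (no re.ASCII) also catches non-ASCII digits. Stdlib-only sketch:
import re

for token in ["λόγος", "λόγοςa", "βιβλίο2"]:
    has_latin = bool(re.search(r"\w+", token, re.ASCII))
    has_digit = bool(re.search(r"\d+", token))
    print(token, has_latin, has_digit)
# -> λόγος False False / λόγοςa True False / βιβλίο2 True True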
corpus_importer.import_corpus('greek_models_cltk')
corpus_importer2 = CorpusImporter('greek')
corpus_importer2.import_corpus('greek_text_perseus')

philippians_reader = get_corpus_reader(corpus_name="greek_text_perseus",
                                       language="greek")
philippians_reader._fileids = [
    'new-testament__letter-to-the-philippians__grc.json'
]

sentences = list(philippians_reader.sents())
sentence = cltk_normalize(sentences[0])

lemmatizer = LemmaReplacer('greek')
word_list = lemmatizer.lemmatize(sentence)

tagger = POSTag('greek')
parts_of_speech = tagger.tag_ngram_123_backoff(sentence)

# LemmaReplacer is not a great lemmatizer
standard_list = lemmatizer.lemmatize(list(philippians_reader.words()),
                                     return_raw=True)

lemmatizer2 = BackoffGreekLemmatizer()  # this one seems better
backoff_list = lemmatizer2.lemmatize(list(philippians_reader.words()))
def lemmatize(s):
    # returns a list of lemmas
    return lemmatizer.lemmatize(cltk_normalize(s))
#!/bin/python3
from cltk.stem.lemma import LemmaReplacer
from cltk.corpus.utils.formatter import cltk_normalize

lemmatizer = LemmaReplacer('greek')

text = """
μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος
οὐλομένην, ἣ μυρί' ̓Αχαιοῖς ἄλγε' ἔθηκε,
πολλὰς δ' ἰφθίμους ψυχὰς ̓́Αϊδι προί̈αψεν
ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν
οἰωνοῖσί τε πᾶσι, Διὸς δ' ἐτελείετο βουλή,
ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε
Ατρεί̈δης τε ἄναξ ἀνδρῶν καὶ δῖος ̓Αχιλλεύς.
"""

print(lemmatizer.lemmatize("Μῆνιν ἄειδε, θεά"))
# The next line doesn't work without the normalization.
print(lemmatizer.lemmatize(cltk_normalize("μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος")))
# configs for all notebooks
working_dir = os.path.expanduser('~/cltk_data/user_data/lda_1kgreek/')
PREPROCESS_DEACCENT = False
TOK_MIN = 3    # rm words shorter than
TOK_MAX = 20   # rm words longer than
DOC_MIN = 50   # drop docs shorter than
remove_ascii = True
no_below = 20
no_above = 0.1

STOPS_LIST_GRK = [simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)[0]
                  for stop in STOPS_LIST_GRK
                  if len(simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)) > 0]
# NB: this reassignment replaces the processed list built just above.
STOPS_LIST_GRK = ['τῆϲ', 'τοῖϲ', 'εἰϲ', 'πρὸϲ', 'τοὺϲ']
STOPS_LIST_GRK += ["τηϲ", "τοιϲ", "εϲτι", "προϲ", "ειϲ", "ταϲ", "ωϲ",
                   "τουϲ", "ξυν", 'ξὺν', 'πρε', 'ἀλλ']  # useful for after rm accents
STOPS_LIST = [cltk_normalize(stop) for stop in STOPS_LIST_GRK]

ascii_str = string.ascii_letters + string.punctuation + string.digits


def mk_working_dir(fp):
    """Make dir if not exists."""
    user_dir = os.path.expanduser(fp)
    try:
        os.makedirs(user_dir)
    except FileExistsError:
        pass


def tokenize(text, rm_ascii=False):
    """Tokenize and rm stopwords. The Gensim `simple_preprocess` will work fine
def normalize(self):
    return self.__class__(cltk_normalize(str(self.data)), self.metadata)