Example #1
 def process_document(self, doc):
     """Collect cleaned, normalized sentences from a document whose 'text' field may be nested."""
     cleaned_sents = []
     for paragraph in doc['text'].values():
         if not isinstance(paragraph, str):
             # nested section: iterate over its sub-values below
             paragraph = paragraph.values()
         else:
             paragraph = self.sent_tokenizer.tokenize(paragraph)
         for sent in paragraph:
             if isinstance(sent, dict):
                 for subsent in sent.values():
                     tokenized = self.sent_tokenizer.tokenize(subsent)
                     for token in tokenized:
                         cleaned = tlg_plaintext_cleanup(
                             token, rm_punctuation=True, rm_periods=True)
                         sentence = cltk_normalize(cleaned)
                         # keep only sentences longer than five words
                         if len(self.word_tokenizer.tokenize(sentence)) > 5:
                             cleaned_sents.append(sentence)
             else:
                 tokenized = self.sent_tokenizer.tokenize(sent)
                 for token in tokenized:
                     cleaned = tlg_plaintext_cleanup(token,
                                                     rm_punctuation=True,
                                                     rm_periods=True)
                     sentence = cltk_normalize(cleaned)
                     if len(self.word_tokenizer.tokenize(sentence)) > 5:
                         cleaned_sents.append(sentence)
     return cleaned_sents
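The cleanup-and-normalize step buried in the nested loops above can be exercised on its own. A minimal sketch of that inner step, assuming the old CLTK API these examples import from cltk.corpus.utils.formatter (the sample line is arbitrary):

from cltk.corpus.utils.formatter import cltk_normalize, tlg_plaintext_cleanup

raw = "μῆνιν ἄειδε, θεά, Πηληϊάδεω Ἀχιλῆος οὐλομένην."
# strip punctuation TLG-style, then normalize the accent encoding to a single form
cleaned = tlg_plaintext_cleanup(raw, rm_punctuation=True, rm_periods=True)
print(cltk_normalize(cleaned))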
Example #2
 def test_cltk_normalize_compatible(self):
     """Test Normalizing Text with compatibility True"""
     s1 = "café"
     s2 = "cafe\u0301"
     normalized_text = cltk_normalize(s1, compatibility=True)
     target = normalize("NFKC", s2)
     self.assertEqual(normalized_text, target)
Example #3
 def test_cltk_normalize_noncompatible(self):
     """Test Normalizing Text with compatibility False"""
     s1 = 'café'
     s2 = 'cafe\u0301'
     normalized_text = cltk_normalize(s1, compatibility=False)
     target = normalize('NFC', s2)
     self.assertEqual(normalized_text, target)
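The two tests above pin down what the compatibility flag means: cltk_normalize applies Unicode NFKC when compatibility=True and NFC when compatibility=False. The same equivalence can be shown with the standard library alone (assuming, as the tests do, that normalize is unicodedata.normalize):

from unicodedata import normalize

composed = "café"          # 'é' as a single code point
decomposed = "cafe\u0301"  # 'e' followed by a combining acute accent

assert composed != decomposed                     # different code point sequences
assert normalize("NFC", decomposed) == composed   # canonical composition unifies them
assert normalize("NFKC", decomposed) == composed  # NFKC also folds compatibility characters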
Example #5
def main():
    if len(sys.argv) < 2:
        print(
            "Please supply an inflected word on the command line. Example: search_by_lemma.py κύνεσσιν\n"
        )
        sys.exit()
    infl = sys.argv[1]
    lem = lemmatize(infl)[0]  # lemmatized
    print("searching for " + lem + " <- " + infl)
    index = {}
    # build the corpus reader once; only its file list changes per book
    reader = get_corpus_reader(corpus_name='greek_text_tesserae',
                               language='greek')
    for work in ["iliad", "odyssey"]:
        for book in range(1, 24 + 1):  # books 1 to 24
            filename = 'texts/homer.' + work + '.part.' + str(book) + '.tess'
            #print(filename)
            reader._fileids = [filename]
            sentences = list(reader.sents([filename]))
            sentences = [cltk_normalize(s) for s in sentences]
            count_sentences = 0
            for s in sentences:
                count_sentences = count_sentences + 1
                no_punct = re.sub(
                    r"[,;:\.']", '', s
                )  # remove punctuation, which the lemmatizer treats as independent words
                words = re.split(r"\s+", no_punct)
                count_words = 0
                for word in lemmatize(no_punct):
                    count_words = count_words + 1
                    if lem == word:
                        i = count_words - 1
                        w = words[i]
                        context = " ".join(
                            words[max(i - 3, 0):min(i + 4,
                                                    len(words) - 1)])
                        #context = re.sub(re.compile("("+w+")"),r"__\1__",context) # ... surround with __ __
                        pos_tagged = tagger.tag_tnt(no_punct)
                        # ... tag words in sentence with parts of speech, https://github.com/cltk/tutorials/blob/master/8%20Part-of-speech%20tagging.ipynb
                        # for descriptions of what the POS tags mean, see https://linguistics.stackexchange.com/questions/12803/what-do-the-labels-mean-in-this-latin-pos-tagging
                        describe = w
                        for t in pos_tagged:
                            if t[0] == w:
                                describe = t[0] + " " + pos_tag_to_description(
                                    t[1])
                                break
                        print(work + " " + str(book) + ", sentence " +
                              str(count_sentences) + ", word " +
                              str(count_words) + ": " + describe + "    " +
                              context)
                        if w in index:
                            index[w] += 1
                        else:
                            index[w] = 1
                #sys.exit()
    for w in sorted(list(index.keys())):
        print(str(index[w]) + " " + w)
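The inner lookup above relies on POSTag('greek').tag_tnt returning (token, tag) pairs, which is why the loop compares t[0] with the surface form and passes t[1] to the script's pos_tag_to_description helper. A minimal sketch of that call in isolation, assuming the old CLTK API used in these examples and the greek_models_cltk models already imported:

from cltk.tag.pos import POSTag
from cltk.corpus.utils.formatter import cltk_normalize

tagger = POSTag('greek')
for token, tag in tagger.tag_tnt(cltk_normalize("μῆνιν ἄειδε θεὰ")):
    # each tag is a positional code (part of speech, person, number, tense, ...)
    print(token, tag)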
Example #6
def iter_docs(docs_dir, rm_ascii=False):
    """Stream files in a dir (TLG, TEI, etc.) doc-by-doc."""
    file_names = os.listdir(docs_dir)
    for file_name in file_names:
        file_path = os.path.join(docs_dir, file_name)
        with open(file_path) as file_open:
            file_read = file_open.read()
        tokens = tokenize(file_read, rm_ascii=rm_ascii)
        tokens = [cltk_normalize(token) for token in tokens]
        # ignore very short docs
        # todo: get file length distribution to better know what is short in TLG
        if len(tokens) < DOC_MIN:
            continue
        yield file_name, tokens
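A short usage sketch for this generator; the directory path is hypothetical, and tokenize, DOC_MIN and remove_ascii come from the module configuration shown in Examples #15 and #16:

import os

docs_dir = os.path.expanduser('~/cltk_data/user_data/tlg_plaintext')  # hypothetical corpus directory
for file_name, tokens in iter_docs(docs_dir, rm_ascii=remove_ascii):
    print(file_name, len(tokens))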
Example #7
    def lemmatizeList(self, lines):
        from cltk.corpus.utils.formatter import cltk_normalize

        tagger = POSTag('greek')  # instantiated but not used in this method

        lemmatizer = LemmaReplacer('greek')

        # normalization can help with certain texts (the CLTK docs recommend it)
        lines = cltk_normalize(lines)

        lines = lemmatizer.lemmatize(lines)

        # remove stopwords and lowercase all words
        lines = [w.lower() for w in lines if w not in STOPS_LIST]
        # lemmWords = removeNumbers(lemmWords)

        return ' '.join(lines)
Example #9
def predict_from_file(path, model, use_sequential_decoding, align, step_len):
    """Runs prediction using the model on the texts located in the file given in path."""
    max_seq_len = model.processor.max_seq_len - 2
    with open(path, "r") as fp:
        texts = fp.read().splitlines()
    # prepare texts
    texts = clean_texts(texts, CHARS_TO_REMOVE, CHARS_TO_REPLACE)
    texts = [cltk_normalize(replace_square_brackets(t)) for t in texts]
    texts = [t.replace(" ", "_") for t in texts]
    results = []
    # break up long texts
    for t in texts:
        sequences = []
        if len(t) >= max_seq_len:
            if not (step_len and step_len < max_seq_len):
                step_len = round(max_seq_len / 2)
            # for i in range(0, len(t) - step_len, step_len):
            for i in range(0, len(t), step_len):
                seq = t[i : i + max_seq_len]
                sequences.append(seq)
        else:
            sequences.append(t)
        sequences = convert_masking(sequences)
        dicts = sentences_to_dicts(sequences)
        if use_sequential_decoding:
            result = model.predict_sequentially(dicts=dicts)
        else:
            result = model.predict(dicts=dicts)
        results.append(result)
    # output results
    for result in results:
        nb_of_masks = 0  # needed for proper alignment
        for i, res in enumerate(result):
            predicted_text = res["predictions"]["text_with_preds"].replace("_", " ")
            masked_text = res["predictions"]["masked_text"].replace("_", " ")
            if align:
                if not step_len:
                    step_len = round(max_seq_len / 2)
                # an approximate alignment is calculated by shifting each line by step_len
                # plus 2 * the number of masks in the overlapping portion of the previous
                # prediction (to account for the square brackets added around each prediction)
                print(" " * (step_len * i + (2 * nb_of_masks)) + predicted_text)
                nb_of_masks += len(re.findall(r"#+", masked_text[:step_len]))
            else:
                print(predicted_text)
Example #10
    def normalize(self):
        """Fixes problems with differences in greek accent encoding.

        Certain Greek accents have more than one possible encoding. Uses cltk's
        built-in normalizer to correct the character encoding differences and
        ensure that accents are encoded the same way.

        Returns:
            :obj:`self.__class__` New instance with altered text

        Example:
            >>> text = AncientGreekText('ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι')
            >>> print(text.normalize())
            ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι
        """ # noqa
        from cltk.corpus.utils.formatter import cltk_normalize
        return self.__class__(
            text=cltk_normalize(str(self.data)),
            options=self.options
        )
Example #11
def clean_tokens(tokens, chars_to_remove, chars_to_replace):
    """Cleans a list of tokens."""
    cleaned_tokens = []
    for t in tokens:
        if t:
            # skip tokens containing Latin (ASCII) word characters
            if not re.search(r"\w+", t, re.ASCII):
                # remove tokens containing digits
                if not re.search(r"\d+", t):
                    # normalize
                    t = cltk_normalize(t)
                    t = t.strip("\t\r\n")
                    # remove unwanted chars
                    t = remove_unwanted_chars(t, chars_to_remove)
                    t = replace_chars(t, chars_to_replace)
                    # remove any inter-word hyphens or en-dashes
                    t = re.sub(r"([^\s])(-|–)", r"\1", t)
                    # convert some other forms of whitespace to normal spaces
                    t = re.sub(r"\s+", " ", t)
                    # remove repeated whitespace
                    t = re.sub(r"\s{2,}", " ", t)
                    cleaned_tokens.append(t)
    return cleaned_tokens
Example #12
# imports assumed from the old CLTK (0.1.x) API used throughout these examples
from cltk.corpus.utils.importer import CorpusImporter
from cltk.corpus.readers import get_corpus_reader
from cltk.corpus.utils.formatter import cltk_normalize
from cltk.stem.lemma import LemmaReplacer
from cltk.lemmatize.greek.backoff import BackoffGreekLemmatizer
from cltk.tag.pos import POSTag

corpus_importer = CorpusImporter('greek')
corpus_importer.import_corpus('greek_models_cltk')

corpus_importer2 = CorpusImporter('greek')
corpus_importer2.import_corpus('greek_text_perseus')

philippians_reader = get_corpus_reader(corpus_name="greek_text_perseus",
                                       language="greek")

philippians_reader._fileids = [
    'new-testament__letter-to-the-philippians__grc.json'
]

# print(list(philippians_reader.sents()))

sentences = list(philippians_reader.sents())
sentence = cltk_normalize(sentences[0])
lemmatizer = LemmaReplacer('greek')
word_list = lemmatizer.lemmatize(sentence)

tagger = POSTag('greek')

parts_of_speech = tagger.tag_ngram_123_backoff(sentence)

# This is not a great lemmatizer
standard_list = lemmatizer.lemmatize(list(philippians_reader.words()),
                                     return_raw=True)

lemmatizer2 = BackoffGreekLemmatizer()

# this one seems better
backoff_list = lemmatizer2.lemmatize(list(philippians_reader.words()))
Example #13
def lemmatize(s):
    # returns a list of lemmata
    return lemmatizer.lemmatize(cltk_normalize(s))
Example #14
#!/usr/bin/env python3

from cltk.stem.lemma import LemmaReplacer
from cltk.corpus.utils.formatter import cltk_normalize

lemmatizer = LemmaReplacer('greek')

text = """
μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος
οὐλομένην, ἣ μυρί' ̓Αχαιοῖς ἄλγε' ἔθηκε,
πολλὰς δ' ἰφθίμους ψυχὰς ̓́Αϊδι προί̈αψεν
ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν
οἰωνοῖσί τε πᾶσι, Διὸς δ' ἐτελείετο βουλή,
ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε
Ατρεί̈δης τε ἄναξ ἀνδρῶν καὶ δῖος ̓Αχιλλεύς.
"""

#print(lemmatizer.lemmatize(text))
print(lemmatizer.lemmatize("Μῆνιν ἄειδε, θεά"))
print(
    lemmatizer.lemmatize(
        cltk_normalize("μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος")))
# ... doesn't work without the normalization
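The closing comment is worth spelling out. The Iliad lines above carry their accents and breathings as separate combining code points (visible, for instance, in προί̈αψεν and the detached breathing before ̓Αχιλῆος), while cltk_normalize recomposes them into single precomposed characters, which is presumably what the lemmatizer's lookup expects. A minimal illustration using only the standard library:

from unicodedata import normalize

decomposed = "θεα\u0300"      # alpha followed by a combining grave accent
composed = normalize("NFC", decomposed)

print(decomposed == composed)            # False: different code point sequences
print(len(decomposed), len(composed))    # 4 vs 3: NFC merges the accent into the letter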
Example #15
DOC_MIN = 50  # drop docs shorter than
remove_ascii = True
no_below = 20
no_above = 0.1

STOPS_LIST_GRK = [
    simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)[0]
    for stop in STOPS_LIST_GRK
    if len(simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)) > 0
]
STOPS_LIST_GRK = ['τῆϲ', 'τοῖϲ', 'εἰϲ', 'πρὸϲ', 'τοὺϲ']
STOPS_LIST_GRK += [
    "τηϲ", "τοιϲ", "εϲτι", "προϲ", "ειϲ", "ταϲ", "ωϲ", "τουϲ", "ξυν", 'ξὺν',
    'πρε', 'ἀλλ'
]  # useful for after rm accents
STOPS_LIST = [cltk_normalize(stop) for stop in STOPS_LIST_GRK]

ascii_str = string.ascii_letters + string.punctuation + string.digits


def mk_working_dir(fp):
    """Make dir if not exists."""
    user_dir = os.path.expanduser(fp)
    try:
        os.makedirs(user_dir)
    except FileExistsError:
        pass


def tokenize(text, rm_ascii=False):
    """Tokenize and rm stopwords. The Gensim `simple_preprocess` will work fine
Example #16
# configs for all notebooks
working_dir = os.path.expanduser('~/cltk_data/user_data/lda_1kgreek/')
PREPROCESS_DEACCENT = False
TOK_MIN = 3  # rm words shorter than
TOK_MAX = 20  # rm words longer than
DOC_MIN = 50  # drop docs shorter than
remove_ascii = True
no_below = 20
no_above = 0.1


STOPS_LIST_GRK = [simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)[0] for stop in STOPS_LIST_GRK if len(simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)) > 0]
STOPS_LIST_GRK = ['τῆϲ', 'τοῖϲ', 'εἰϲ', 'πρὸϲ', 'τοὺϲ']
STOPS_LIST_GRK += ["τηϲ", "τοιϲ", "εϲτι", "προϲ", "ειϲ", "ταϲ", "ωϲ", "τουϲ", "ξυν", 'ξὺν', 'πρε', 'ἀλλ']  # useful for after rm accents
STOPS_LIST = [cltk_normalize(stop) for stop in STOPS_LIST_GRK]

ascii_str = string.ascii_letters + string.punctuation + string.digits


def mk_working_dir(fp):
    """Make dir if not exists."""
    user_dir = os.path.expanduser(fp)
    try:
        os.makedirs(user_dir)
    except FileExistsError:
        pass


def tokenize(text, rm_ascii=False):
    """Tokenize and rm stopwords. The Gensim `simple_preprocess` will work fine
Example #17
 def normalize(self):
     return self.__class__(cltk_normalize(str(self.data)), self.metadata)