Example #1
 def process_document(self, doc):
     """Collect cleaned, normalized sentences from a document whose 'text' field may be nested."""
     cleaned_sents = []
     for paragraph in doc['text'].values():
         if not isinstance(paragraph, str):
             # nested section: iterate over its sub-values below
             paragraph = paragraph.values()
         else:
             paragraph = self.sent_tokenizer.tokenize(paragraph)
         for sent in paragraph:
             if isinstance(sent, dict):
                 for subsent in sent.values():
                     tokenized = self.sent_tokenizer.tokenize(subsent)
                     for token in tokenized:
                         cleaned = tlg_plaintext_cleanup(
                             token, rm_punctuation=True, rm_periods=True)
                         sentence = cltk_normalize(cleaned)
                         # keep only sentences longer than five words
                         if len(self.word_tokenizer.tokenize(sentence)) > 5:
                             cleaned_sents.append(sentence)
             else:
                 tokenized = self.sent_tokenizer.tokenize(sent)
                 for token in tokenized:
                     cleaned = tlg_plaintext_cleanup(token,
                                                     rm_punctuation=True,
                                                     rm_periods=True)
                     sentence = cltk_normalize(cleaned)
                     if len(self.word_tokenizer.tokenize(sentence)) > 5:
                         cleaned_sents.append(sentence)
     return cleaned_sents
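The cleanup-and-normalize step buried in the nested loops above can be exercised on its own. A minimal sketch of that inner step, assuming the old CLTK API these examples import from cltk.corpus.utils.formatter (the sample line is arbitrary):

from cltk.corpus.utils.formatter import cltk_normalize, tlg_plaintext_cleanup

raw = "μῆνιν ἄειδε, θεά, Πηληϊάδεω Ἀχιλῆος οὐλομένην."
# strip punctuation TLG-style, then normalize the accent encoding to a single form
cleaned = tlg_plaintext_cleanup(raw, rm_punctuation=True, rm_periods=True)
print(cltk_normalize(cleaned))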
Example #2
 def test_cltk_normalize_compatible(self):
     """Test Normalizing Text with compatibility True"""
     s1 = "café"
     s2 = "cafe\u0301"
     normalized_text = cltk_normalize(s1, compatibility=True)
     target = normalize("NFKC", s2)
     self.assertEqual(normalized_text, target)
Example #3
 def test_cltk_normalize_noncompatible(self):
     """Test Normalizing Text with compatibility False"""
     s1 = 'café'
     s2 = 'cafe\u0301'
     normalized_text = cltk_normalize(s1, compatibility=False)
     target = normalize('NFC', s2)
     self.assertEqual(normalized_text, target)
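The two tests above pin down what the compatibility flag means: cltk_normalize applies Unicode NFKC when compatibility=True and NFC when compatibility=False. The same equivalence can be shown with the standard library alone (assuming, as the tests do, that normalize is unicodedata.normalize):

from unicodedata import normalize

composed = "café"          # 'é' as a single code point
decomposed = "cafe\u0301"  # 'e' followed by a combining acute accent

assert composed != decomposed                     # different code point sequences
assert normalize("NFC", decomposed) == composed   # canonical composition unifies them
assert normalize("NFKC", decomposed) == composed  # NFKC also folds compatibility characters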
Example #5
def main():
    if len(sys.argv) < 2:
        print(
            "Please supply an inflected word on the command line. Example: search_by_lemma.py κύνεσσιν\n"
        )
        sys.exit()
    infl = sys.argv[1]
    lem = lemmatize(infl)[0]  # lemmatized
    print("searching for " + lem + " <- " + infl)
    index = {}
    # build the corpus reader once; only its file list changes per book
    reader = get_corpus_reader(corpus_name='greek_text_tesserae',
                               language='greek')
    for work in ["iliad", "odyssey"]:
        for book in range(1, 24 + 1):  # books 1 to 24
            filename = 'texts/homer.' + work + '.part.' + str(book) + '.tess'
            #print(filename)
            reader._fileids = [filename]
            sentences = list(reader.sents([filename]))
            sentences = [cltk_normalize(s) for s in sentences]
            count_sentences = 0
            for s in sentences:
                count_sentences = count_sentences + 1
                no_punct = re.sub(
                    r"[,;:\.']", '', s
                )  # remove punctuation, which the lemmatizer treats as independent words
                words = re.split(r"\s+", no_punct)
                count_words = 0
                for word in lemmatize(no_punct):
                    count_words = count_words + 1
                    if lem == word:
                        i = count_words - 1
                        w = words[i]
                        context = " ".join(
                            words[max(i - 3, 0):min(i + 4,
                                                    len(words) - 1)])
                        #context = re.sub(re.compile("("+w+")"),r"__\1__",context) # ... surround with __ __
                        pos_tagged = tagger.tag_tnt(no_punct)
                        # ... tag words in sentence with parts of speech, https://github.com/cltk/tutorials/blob/master/8%20Part-of-speech%20tagging.ipynb
                        # for descriptions of what the POS tags mean, see https://linguistics.stackexchange.com/questions/12803/what-do-the-labels-mean-in-this-latin-pos-tagging
                        describe = w
                        for t in pos_tagged:
                            if t[0] == w:
                                describe = t[0] + " " + pos_tag_to_description(
                                    t[1])
                                break
                        print(work + " " + str(book) + ", sentence " +
                              str(count_sentences) + ", word " +
                              str(count_words) + ": " + describe + "    " +
                              context)
                        if w in index:
                            index[w] += 1
                        else:
                            index[w] = 1
                #sys.exit()
    for w in sorted(list(index.keys())):
        print(str(index[w]) + " " + w)
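The inner lookup above relies on POSTag('greek').tag_tnt returning (token, tag) pairs, which is why the loop compares t[0] with the surface form and passes t[1] to the script's pos_tag_to_description helper. A minimal sketch of that call in isolation, assuming the old CLTK API used in these examples and the greek_models_cltk models already imported:

from cltk.tag.pos import POSTag
from cltk.corpus.utils.formatter import cltk_normalize

tagger = POSTag('greek')
for token, tag in tagger.tag_tnt(cltk_normalize("μῆνιν ἄειδε θεὰ")):
    # each tag is a positional code (part of speech, person, number, tense, ...)
    print(token, tag)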
Example #6
def iter_docs(docs_dir, rm_ascii=False):
    """Stream files in a dir (TLG, TEI, etc.) doc-by-doc."""
    file_names = os.listdir(docs_dir)
    for file_name in file_names:
        file_path = os.path.join(docs_dir, file_name)
        with open(file_path) as file_open:
            file_read = file_open.read()
        tokens = tokenize(file_read, rm_ascii=rm_ascii)
        tokens = [cltk_normalize(token) for token in tokens]
        # ignore very short docs
        # todo: get file length distribution to better know what is short in TLG
        if len(tokens) < DOC_MIN:
            continue
        yield file_name, tokens
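A short usage sketch for this generator; the directory path is hypothetical, and tokenize, DOC_MIN and remove_ascii come from the module configuration shown in Examples #15 and #16:

import os

docs_dir = os.path.expanduser('~/cltk_data/user_data/tlg_plaintext')  # hypothetical corpus directory
for file_name, tokens in iter_docs(docs_dir, rm_ascii=remove_ascii):
    print(file_name, len(tokens))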
Example #7
    def lemmatizeList(self, lines):
        from cltk.corpus.utils.formatter import cltk_normalize

        tagger = POSTag('greek')  # instantiated but not used in this method

        lemmatizer = LemmaReplacer('greek')

        # normalization can help with certain texts (the CLTK docs recommend it)
        lines = cltk_normalize(lines)

        lines = lemmatizer.lemmatize(lines)

        # remove stopwords and lowercase all words
        lines = [w.lower() for w in lines if w not in STOPS_LIST]
        # lemmWords = removeNumbers(lemmWords)

        return ' '.join(lines)
Example #9
def predict_from_file(path, model, use_sequential_decoding, align, step_len):
    """Runs prediction using the model on the texts located in the file given in path."""
    max_seq_len = model.processor.max_seq_len - 2
    with open(path, "r") as fp:
        texts = fp.read().splitlines()
    # prepare texts
    texts = clean_texts(texts, CHARS_TO_REMOVE, CHARS_TO_REPLACE)
    texts = [cltk_normalize(replace_square_brackets(t)) for t in texts]
    texts = [t.replace(" ", "_") for t in texts]
    results = []
    # break up long texts
    for t in texts:
        sequences = []
        if len(t) >= max_seq_len:
            if not (step_len and step_len < max_seq_len):
                step_len = round(max_seq_len / 2)
            # for i in range(0, len(t) - step_len, step_len):
            for i in range(0, len(t), step_len):
                seq = t[i : i + max_seq_len]
                sequences.append(seq)
        else:
            sequences.append(t)
        sequences = convert_masking(sequences)
        dicts = sentences_to_dicts(sequences)
        if use_sequential_decoding:
            result = model.predict_sequentially(dicts=dicts)
        else:
            result = model.predict(dicts=dicts)
        results.append(result)
    # output results
    for result in results:
        nb_of_masks = 0  # needed for proper alignment
        for i, res in enumerate(result):
            predicted_text = res["predictions"]["text_with_preds"].replace("_", " ")
            masked_text = res["predictions"]["masked_text"].replace("_", " ")
            if align:
                if not step_len:
                    step_len = round(max_seq_len / 2)
                # an approximate alignment is calculated by shifting each line by step_len
                # plus 2 * the number of masks in the overlapping portion of the previous
                # prediction (to account for the square brackets added around each prediction)
                print(" " * (step_len * i + (2 * nb_of_masks)) + predicted_text)
                nb_of_masks += len(re.findall(r"#+", masked_text[:step_len]))
            else:
                print(predicted_text)
Example #10
    def normalize(self):
        """Fixes problems with differences in greek accent encoding.

        Certain Greek accents have more than one possible encoding. Uses cltk's
        built-in normalizer to correct the character encoding differences and
        ensure that accents are encoded the same way.

        Returns:
            :obj:`self.__class__` New instance with altered text

        Example:
            >>> text = AncientGreekText('ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι')
            >>> print(text.normalize())
            ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι
        """ # noqa
        from cltk.corpus.utils.formatter import cltk_normalize
        return self.__class__(
            text=cltk_normalize(str(self.data)),
            options=self.options
        )
Example #11
def clean_tokens(tokens, chars_to_remove, chars_to_replace):
    """Cleans a list of tokens."""
    cleaned_tokens = []
    for t in tokens:
        if t:
            # skip tokens containing Latin (ASCII) word characters
            if not re.search(r"\w+", t, re.ASCII):
                # remove tokens containing digits
                if not re.search(r"\d+", t):
                    # normalize
                    t = cltk_normalize(t)
                    t = t.strip("\t\r\n")
                    # remove unwanted chars
                    t = remove_unwanted_chars(t, chars_to_remove)
                    t = replace_chars(t, chars_to_replace)
                    # remove any inter-word hyphens or en-dashes
                    t = re.sub(r"([^\s])(-|–)", r"\1", t)
                    # convert some other forms of whitespace to normal spaces
                    t = re.sub(r"\s+", " ", t)
                    # remove repeated whitespace
                    t = re.sub(r"\s{2,}", " ", t)
                    cleaned_tokens.append(t)
    return cleaned_tokens
Example #12
# imports assumed from the old CLTK (0.1.x) API used throughout these examples
from cltk.corpus.utils.importer import CorpusImporter
from cltk.corpus.readers import get_corpus_reader
from cltk.corpus.utils.formatter import cltk_normalize
from cltk.stem.lemma import LemmaReplacer
from cltk.lemmatize.greek.backoff import BackoffGreekLemmatizer
from cltk.tag.pos import POSTag

corpus_importer = CorpusImporter('greek')
corpus_importer.import_corpus('greek_models_cltk')

corpus_importer2 = CorpusImporter('greek')
corpus_importer2.import_corpus('greek_text_perseus')

philippians_reader = get_corpus_reader(corpus_name="greek_text_perseus",
                                       language="greek")

philippians_reader._fileids = [
    'new-testament__letter-to-the-philippians__grc.json'
]

# print(list(philippians_reader.sents()))

sentences = list(philippians_reader.sents())
sentence = cltk_normalize(sentences[0])
lemmatizer = LemmaReplacer('greek')
word_list = lemmatizer.lemmatize(sentence)

tagger = POSTag('greek')

parts_of_speech = tagger.tag_ngram_123_backoff(sentence)

# This is not a great lemmatizer
standard_list = lemmatizer.lemmatize(list(philippians_reader.words()),
                                     return_raw=True)

lemmatizer2 = BackoffGreekLemmatizer()

# this one seems better
backoff_list = lemmatizer2.lemmatize(list(philippians_reader.words()))
Example #13
def lemmatize(s):
    # returns a list of lemmata
    return lemmatizer.lemmatize(cltk_normalize(s))
Example #14
#!/usr/bin/env python3

from cltk.stem.lemma import LemmaReplacer
from cltk.corpus.utils.formatter import cltk_normalize

lemmatizer = LemmaReplacer('greek')

text = """
μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος
οὐλομένην, ἣ μυρί' ̓Αχαιοῖς ἄλγε' ἔθηκε,
πολλὰς δ' ἰφθίμους ψυχὰς ̓́Αϊδι προί̈αψεν
ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν
οἰωνοῖσί τε πᾶσι, Διὸς δ' ἐτελείετο βουλή,
ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε
Ατρεί̈δης τε ἄναξ ἀνδρῶν καὶ δῖος ̓Αχιλλεύς.
"""

#print(lemmatizer.lemmatize(text))
print(lemmatizer.lemmatize("Μῆνιν ἄειδε, θεά"))
print(
    lemmatizer.lemmatize(
        cltk_normalize("μῆνιν ἄειδε θεὰ Πηληϊάδεω ̓Αχιλῆος")))
# ... doesn't work without the normalization
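The closing comment is worth spelling out. The Iliad lines above carry their accents and breathings as separate combining code points (visible, for instance, in προί̈αψεν and the detached breathing before ̓Αχιλῆος), while cltk_normalize recomposes them into single precomposed characters, which is presumably what the lemmatizer's lookup expects. A minimal illustration using only the standard library:

from unicodedata import normalize

decomposed = "θεα\u0300"      # alpha followed by a combining grave accent
composed = normalize("NFC", decomposed)

print(decomposed == composed)            # False: different code point sequences
print(len(decomposed), len(composed))    # 4 vs 3: NFC merges the accent into the letter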
Example #15
DOC_MIN = 50  # drop docs shorter than
remove_ascii = True
no_below = 20
no_above = 0.1

STOPS_LIST_GRK = [
    simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)[0]
    for stop in STOPS_LIST_GRK
    if len(simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)) > 0
]
STOPS_LIST_GRK = ['τῆϲ', 'τοῖϲ', 'εἰϲ', 'πρὸϲ', 'τοὺϲ']
STOPS_LIST_GRK += [
    "τηϲ", "τοιϲ", "εϲτι", "προϲ", "ειϲ", "ταϲ", "ωϲ", "τουϲ", "ξυν", 'ξὺν',
    'πρε', 'ἀλλ'
]  # useful for after rm accents
STOPS_LIST = [cltk_normalize(stop) for stop in STOPS_LIST_GRK]

ascii_str = string.ascii_letters + string.punctuation + string.digits


def mk_working_dir(fp):
    """Make dir if not exists."""
    user_dir = os.path.expanduser(fp)
    try:
        os.makedirs(user_dir)
    except FileExistsError:
        pass


def tokenize(text, rm_ascii=False):
    """Tokenize and rm stopwords. The Gensim `simple_preprocess` will work fine
Example #16
# configs for all notebooks
working_dir = os.path.expanduser('~/cltk_data/user_data/lda_1kgreek/')
PREPROCESS_DEACCENT = False
TOK_MIN = 3  # rm words shorter than
TOK_MAX = 20  # rm words longer than
DOC_MIN = 50  # drop docs shorter than
remove_ascii = True
no_below = 20
no_above = 0.1


STOPS_LIST_GRK = [simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)[0] for stop in STOPS_LIST_GRK if len(simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)) > 0]
STOPS_LIST_GRK = ['τῆϲ', 'τοῖϲ', 'εἰϲ', 'πρὸϲ', 'τοὺϲ']
STOPS_LIST_GRK += ["τηϲ", "τοιϲ", "εϲτι", "προϲ", "ειϲ", "ταϲ", "ωϲ", "τουϲ", "ξυν", 'ξὺν', 'πρε', 'ἀλλ']  # useful for after rm accents
STOPS_LIST = [cltk_normalize(stop) for stop in STOPS_LIST_GRK]

ascii_str = string.ascii_letters + string.punctuation + string.digits


def mk_working_dir(fp):
    """Make dir if not exists."""
    user_dir = os.path.expanduser(fp)
    try:
        os.makedirs(user_dir)
    except FileExistsError:
        pass


def tokenize(text, rm_ascii=False):
    """Tokenize and rm stopwords. The Gensim `simple_preprocess` will work fine
Example #17
 def normalize(self):
     return self.__class__(cltk_normalize(str(self.data)), self.metadata)