Example #1
0
 def test_greek_betacode_to_unicode(self):
     """Check Beta Code to Unicode conversion on several sample inputs.

     Note (from the original author): assertEqual appears to not be
     correctly comparing certain characters (``ά`` and ``ί``, at least).
     """
     replacer = Replacer()
     # Each entry pairs a Beta Code input with the Unicode text expected
     # back from ``Replacer.beta_code``.
     cases = (
         # Generic test
         (r"""O(/PWS OU)=N MH\ TAU)TO\ """, 'ὅπως οὖν μὴ ταὐτὸ '),
         # Iota with diaeresis
         (r"""*XALDAI+KH\N""", 'Χαλδαϊκὴν'),
         # Upsilon with diaeresis
         (r"""PROU+POTETAGME/NWN""", 'προϋποτεταγμένων'),
         # Lowercase input
         (r"""proi+sxome/nwn""", 'προϊσχομένων'),
     )
     for beta_text, expected in cases:
         self.assertEqual(replacer.beta_code(beta_text), expected)
Example #2
0
 def test_greek_betacode_to_unicode(self):
     """Check that a Beta Code sample converts to the expected Unicode.

     Note (from the original author): assertEqual appears to not be
     correctly comparing certain characters (``ά`` and ``ί``, at least).
     """
     source_beta = r"""O(/PWS OU)=N MH\ TAU)TO\ """
     converted = Replacer().beta_code(source_beta)
     self.assertEqual(converted, 'ὅπως οὖν μὴ ταὐτὸ ')
Example #3
0
 def test_greek_betacode_to_unicode(self):
     """Check that a Beta Code sample converts to the expected Unicode.

     Note (from the original author): assertEqual appears to not be
     correctly comparing certain characters (``ά`` and ``ί``, at least).
     """
     source_beta = r"""O(/PWS OU)=N MH\ TAU)TO\ """
     converted = Replacer().beta_code(source_beta)
     self.assertEqual(converted, 'ὅπως οὖν μὴ ταὐτὸ ')
Example #4
0
def get_tags():
    """Build a ``form/tag`` training file from the Perseus Greek treebank.

    Reads ``greek_treebank_perseus/agdt-1.7.xml``, converts every word's
    Beta Code ``form`` attribute to lower-case Unicode Greek, and pairs it
    with the part of the word's ``relation`` attribute that precedes the
    first underscore. Words of a sentence are joined with spaces, sentences
    are separated by a blank line, and the result is written to
    ``penn_pos_training_set_reduce.pos``.

    Returns:
        None. The only result is the file written to disk.

    Raises:
        KeyError: if a ``word`` element lacks a ``form`` or ``relation``
            attribute.
    """
    replacer = Replacer()
    # Quote-like characters and brackets removed from every converted form.
    unwanted = ("'", '᾽', '’', '[', ']')

    entire_treebank = 'greek_treebank_perseus/agdt-1.7.xml'
    with open(entire_treebank, 'r') as f:
        root = etree.fromstring(f.read())

    sentences_list = []
    for sentence in root.findall('sentence'):
        tagged_words = []
        for element in sentence.findall('word'):
            attrs = element.attrib
            # The Beta Code converter is fed upper-case input.
            form = replacer.beta_code(attrs['form'].upper())
            # Word-final sigma takes its final form; endswith() is also
            # safe when the converted form is empty.
            if form.endswith('σ'):
                form = form[:-1] + 'ς'
            form = form.lower()
            form = ''.join(ch for ch in form if ch not in unwanted)

            # Keep only the part of the relation label before the first '_'.
            tag = attrs['relation'].split('_')[0]
            tagged_words.append('/'.join([form, tag]))
        sentences_list.append(' '.join(tagged_words))

    treebank_training_set = '\n\n'.join(sentences_list)

    # The payload is Unicode Greek: write it as UTF-8 explicitly instead of
    # relying on the platform's default encoding, which may be unable to
    # represent these characters.
    with open('penn_pos_training_set_reduce.pos', 'w', encoding='utf-8') as f:
        f.write(treebank_training_set)
Example #5
0
def get_tags(path):
    """Return ``form/postag/lemma`` training text built from a treebank file.

    Args:
        path: Location of the treebank XML file. Its root must contain a
            ``body`` element that holds the ``sentence`` elements.

    Returns:
        A string in which sentences are separated by a blank line and each
        sentence is a space-separated run of ``form/postag/lemma`` items.
        Words whose cleaned form is empty are left out.

    Raises:
        IndexError: if the document has no ``body`` element.
        KeyError: if a ``word`` element has no ``form`` attribute.
    """
    replacer = Replacer()
    with open(path, 'r') as f:
        root = etree.fromstring(f.read())
    body = root.findall('body')[0]

    # Characters removed from every cleaned form.
    dropped = (' ', "'", '?', '’', '[', ']')

    sentences_list = []
    for sentence in body.findall('sentence'):
        sentence_list = []
        for element in sentence.findall('word'):
            word = element.attrib
            form = replacer.beta_code(word['form'].upper())
            # NOTE(review): the Latin 's' / '?' literals here (and the '?'
            # in ``dropped``) look like a lossy re-encoding of Greek
            # characters; kept as found - confirm against the upstream file.
            if form.endswith('s'):
                form = form[:-1] + '?'
            form = clean(basify(form.lower()))
            form = ''.join(ch for ch in form if ch not in dropped)
            if not form:
                continue

            # Look the annotations up independently with explicit
            # fallbacks. Previously a missing attribute assigned an unused
            # ``postag`` variable, so the join below either raised
            # NameError or silently reused the previous word's values.
            postag1 = word.get('postag', 'x--------')
            lemma = word.get('lemma')
            # NOTE(review): '_' is a placeholder chosen for a missing
            # lemma; adjust if consumers expect a different marker.
            postag2 = clean(basify(lemma)) if lemma is not None else '_'

            sentence_list.append('/'.join([form, postag1, postag2]))
        sentences_list.append(' '.join(sentence_list))

    return '\n\n'.join(sentences_list)
Example #6
0
def beta2uni(text_beta):
    """Convert Beta Code text to Unicode through CLTK's ``Replacer``.

    The input is first mapped through ``LATIN_UPPER_TRANS`` and upper-cased
    before being handed to ``Replacer.beta_code``.

    Returns the converted text, or ``None`` (after printing an installation
    hint) when ``CLTK_NOT_FOUND`` is set.
    """
    if not CLTK_NOT_FOUND:
        upper_beta = text_beta.translate(LATIN_UPPER_TRANS).upper()
        return Replacer().beta_code(upper_beta)

    print(
        'CLTK is not found in this environment. In order to use the beta2uni converter,',
        'install this package with `pip install cltk` or `pip install dh-utils[betacode]`'
    )
    return None
Example #7
0
 def test_greek_betacode_to_unicode(self):
     """Check Beta Code to Unicode conversion on several sample inputs.

     Note (from the original author): assertEqual appears to not be
     correctly comparing certain characters (``ά`` and ``ί``, at least).
     """
     replacer = Replacer()
     # Each entry pairs a Beta Code input with the Unicode text expected
     # back from ``Replacer.beta_code``.
     cases = (
         # Generic test
         (r"""O(/PWS OU)=N MH\ TAU)TO\ """, 'ὅπως οὖν μὴ ταὐτὸ '),
         # Iota with diaeresis
         (r"""*XALDAI+KH\N""", 'Χαλδαϊκὴν'),
         # Upsilon with diaeresis
         (r"""PROU+POTETAGME/NWN""", 'προϋποτεταγμένων'),
     )
     for beta_text, expected in cases:
         self.assertEqual(replacer.beta_code(beta_text), expected)
Example #8
0
def get_tags():
    """Write a ``form/postag`` training file built from one treebank text.

    Reads the hard-coded Perseus treebank XML file, converts each word's
    ``form`` attribute with the Beta Code converter, lower-cases it, strips
    accents with ``basify`` and removes a small set of punctuation
    characters. Each remaining form is paired with the word's ``postag``
    attribute (``'x--------'`` when the attribute is absent). Sentences are
    separated by a blank line and written to ``greek_training_set_2.pos``.

    Returns:
        None. The only result is the file written to disk.

    Raises:
        IndexError: if the document has no ``body`` element.
        KeyError: if a ``word`` element has no ``form`` attribute.
    """
    replacer = Replacer()
    entire_treebank = '/home/q078011/cltk_data/greek/text/perseus_treebank_dev/v2.1/Greek/texts/tlg0003.tlg001.perseus-grc1.1.tb.xml'
    with open(entire_treebank, 'r') as f:
        root = etree.fromstring(f.read())
    body = root.findall('body')[0]

    # Characters removed from every cleaned form.
    dropped = (' ', "'", '?', '’', '[', ']')

    sentences_list = []
    for sentence in body.findall('sentence'):
        sentence_list = []
        for element in sentence.findall('word'):
            word = element.attrib
            form = replacer.beta_code(word['form'].upper())
            # NOTE(review): the Latin 's' / '?' literals here (and the '?'
            # in ``dropped``) look like a lossy re-encoding of Greek
            # characters; kept as found - confirm against the upstream file.
            if form.endswith('s'):
                form = form[:-1] + '?'
            form = basify(form.lower())
            form = ''.join(ch for ch in form if ch not in dropped)
            if not form:
                continue
            # A plain lookup with a default replaces the former bare
            # ``except``, which would also have hidden unrelated errors.
            postag = word.get('postag', 'x--------')
            sentence_list.append('/'.join([form, postag]))
        sentences_list.append(' '.join(sentence_list))

    treebank_training_set = '\n\n'.join(sentences_list)

    # The payload is Unicode Greek: write it as UTF-8 explicitly instead of
    # relying on the platform's default encoding.
    with open('greek_training_set_2.pos', 'w', encoding='utf-8') as f:
        f.write(treebank_training_set)
Example #9
0
 def compile_tlg_txt(self):
     """Reads original Beta Code files and converts to Unicode files.

     For every entry of ``tlg_index`` the matching ``<name>.TXT`` file
     under ``<orig_files_dir>/tlg`` is read as Latin-1, stripped of
     non-ASCII characters, converted from Beta Code to Unicode and written
     as ``<name>.txt`` (UTF-8) under ``<compiled_files_dir>/tlg``. A file
     that cannot be read or written is logged and skipped; the remaining
     files are still processed.

     The author index is (re)built before the loop and the meta and
     author/works indexes are built after it.
     """
     logging.info('Starting TLG corpus compilation into files.')
     compiled_files_dir_tlg = os.path.join(self.compiled_files_dir, 'tlg')
     if not os.path.isdir(compiled_files_dir_tlg):
         os.mkdir(compiled_files_dir_tlg)
     self.make_tlg_index_file_author()
     self.read_tlg_index_file_author()

     # Loop-invariant values, computed once.
     orig_files_dir_tlg = os.path.join(self.orig_files_dir, 'tlg')
     replacer = Replacer()

     # NOTE(review): ``tlg_index`` is not defined in this method; presumably
     # read_tlg_index_file_author() publishes it as a module global - confirm.
     for file_name in tlg_index:
         abbrev = tlg_index[file_name]
         files_path = os.path.join(orig_files_dir_tlg, file_name + '.TXT')
         try:
             with open(files_path, 'rb') as index_opened:
                 txt_read = index_opened.read().decode('latin-1')
         except IOError:
             logging.error('Failed to open TLG file %s of author %s',
                           file_name, abbrev)
             continue

         new_uni = replacer.beta_code(remove_non_ascii(txt_read))
         file_path = os.path.join(compiled_files_dir_tlg, file_name + '.txt')
         try:
             # The converted text is Unicode Greek; write it as UTF-8 rather
             # than in the platform's default encoding.
             with open(file_path, 'w', encoding='utf-8') as new_file:
                 new_file.write(new_uni)
         except IOError:
             logging.error('Failed to write to new file %s of '
                           'author %s', file_name, abbrev)
         else:
             # Report success only when the write actually succeeded.
             logging.info('Finished TLG corpus compilation to %s',
                          file_path)
     self.make_tlg_meta_index()
     self.make_tlg_index_auth_works()
Example #10
0
from cltk.corpus.greek.beta_to_unicode import Replacer
from lxml import etree
from greek_accentuation.characters import *

from greek_accentuation.characters import strip_accents
from transliterate import translit
from cltk.corpus.greek.beta_to_unicode import Replacer

# Module-level Beta Code converter; g_translit() calls r.beta_code().
r = Replacer()


def g_translit(string):
    """Transliterate ``string`` with ``translit(string, "el")``.

    When the input ends in a Latin ``s``, the last character of the
    transliteration is replaced by whatever the module-level converter
    returns for ``r.beta_code('s')``.

    Args:
        string: Text to transliterate. An empty string is passed straight
            through to ``translit`` without further processing.

    Returns:
        The transliterated text.
    """
    tr = translit(string, "el")
    # endswith() is safe on an empty string, whereas string[-1] raised
    # IndexError.
    if string.endswith("s"):
        tr = tr[:-1] + r.beta_code('s')
    return tr


def basify(string):
    """Return ``string`` with ``strip_accents`` applied to each character."""
    return "".join(map(strip_accents, string))


def get_tags():
    # NOTE(review): only the opening statements of this function are
    # visible here; what is done with ``body`` is not shown.
    r = Replacer()  # local converter; shadows the module-level ``r``
    # Hard-coded, machine-specific location of a single treebank file.
    entire_treebank = '/home/q078011/cltk_data/greek/text/perseus_treebank_dev/v2.1/Greek/texts/tlg0003.tlg001.perseus-grc1.1.tb.xml'
    with open(entire_treebank, 'r') as f:
        xml_string = f.read()
    root = etree.fromstring(xml_string)
    # First ``body`` child of the root; raises IndexError when there is none.
    body = root.findall('body')[0]
Example #11
0
from cltk.corpus.greek.beta_to_unicode import Replacer
import re

# Beta Code converter instance (not used in the lines shown here).
r = Replacer()

# Module-level working variables. Of these, only ``postag``, ``stemtype``
# and ``prim`` are assigned in the part of the loop visible here.
fulls = ''
labels = ''
label = ''
workw = ''
lem = ''
raws = ''
postag = ''
stemtype = ''
keys = ''
prim = {}
# NOTE(review): opened without a context manager; no close() is visible in
# this excerpt.
f = open("homer-lmorph2.txt", "r")
for l in f:
    print("starts", l)  # echo every input line as it is read
    # Drop trailing whitespace, including the line terminator.
    l = re.sub('\s+$', '', l)
    l = re.sub('\\+/', '/+', l)  # o)i/+omai rather than o)i+/omai
    # Same reordering for a backslash: '+\' becomes '\+'.
    l = re.sub('\\+\\\\', '\\+', l)  # o)i\+omai rather than o)i+\omai
    # Replace the straight apostrophe after "u" with a typographic one.
    l = re.sub('u\'', 'u’', l)
    # Fields are tab-separated; the first field is trimmed on the right.
    fds = l.split("\t")
    fds[0] = re.sub("\s+$", "", fds[0])
    # First field without a space on a line mentioning "indeclform": take
    # the tag from the second field and, when present, the stem type from
    # the fifth.
    if (not re.search(" ", fds[0]) and re.search("indeclform", l)):
        postag = fds[1]
        if (len(fds) > 4):
            stemtype = fds[4]
    # First field containing a space: split it into its space-separated
    # parts and remove the underscore after "w" in the second part.
    if (re.search(" ", fds[0])):
        prim = fds[0].split(" ")
        prim[1] = re.sub("w_", "w", prim[1])
Example #12
0
def beta2uni(text_beta):
    """Convert Beta Code text to Unicode via ``Replacer.beta_code``.

    ASCII lower-case letters in ``text_beta`` are mapped to upper case
    before conversion; every other character is passed through unchanged.
    """
    ascii_upper_table = str.maketrans(string.ascii_lowercase,
                                      string.ascii_uppercase)
    upper_beta = text_beta.translate(ascii_upper_table)
    return Replacer().beta_code(upper_beta)
Example #13
0
from xml.etree import ElementTree as ET

import pickle
import collections
import pandas as pd
import numpy as np
from difflib import *

from greek_accentuation.characters import strip_accents
from transliterate import translit
from cltk.corpus.greek.beta_to_unicode import Replacer
from model.clean import clean
from Levenshtein import distance

#### Input list of words ####
# Beta Code converter used for every headword.
r = Replacer()
tree = ET.parse(
    '/home/q078011/cltk_data/greek/text/greek_lexica_perseus/greek_english_lexicon.xml'
)
root = tree.getroot()
# Every ``entryFree`` element of the lexicon, in document order.
li = list(root.iter('entryFree'))


def wrd(i):
    """Return the cleaned, accent-stripped first ``orth`` text of entry ``i``."""
    orth_texts = [x.text for x in li[i].iter('orth')]
    return clean(basify(r.beta_code(orth_texts[0])))


# Collect the headwords; entries whose processing raises AttributeError
# are skipped.
w = []
for i in range(len(li)):
    try:
        w.append(wrd(i))
    except AttributeError:
        pass