Esempio n. 1
0
 def test_greek_betacode_to_unicode(self):
     """Check ``Replacer.beta_code`` against known Unicode Greek targets.

     Every Beta Code sample below is converted and compared with its
     expected Unicode string, in order; the first mismatch fails the test.

     Note (from the original author): assertEqual appears to not be
     correctly comparing certain characters (``ά`` and ``ί``, at least).
     """
     converter = Replacer()
     samples = (
         # Generic test
         (r"""O(/PWS OU)=N MH\ TAU)TO\ """, 'ὅπως οὖν μὴ ταὐτὸ '),
         # Test for iota and diaeresis
         (r"""*XALDAI+KH\N""", 'Χαλδαϊκὴν'),
         # Test for upsilon and diaeresis
         (r"""PROU+POTETAGME/NWN""", 'προϋποτεταγμένων'),
         # Test for lowercase
         (r"""proi+sxome/nwn""", 'προϊσχομένων'),
     )
     for beta_text, expected in samples:
         self.assertEqual(converter.beta_code(beta_text), expected)
Esempio n. 2
0
 def test_greek_betacode_to_unicode(self):
     """Check that one Beta Code phrase converts to the expected Unicode.

     Note (from the original author): assertEqual appears to not be
     correctly comparing certain characters (``ά`` and ``ί``, at least).
     """
     expected = 'ὅπως οὖν μὴ ταὐτὸ '
     converted = Replacer().beta_code(r"""O(/PWS OU)=N MH\ TAU)TO\ """)
     self.assertEqual(converted, expected)
Esempio n. 3
0
def beta2uni(text_beta):
    """Convert Beta Code text to Unicode via cltk's ``Replacer.beta_code``.

    The input is run through ``str.translate`` with ``LATIN_UPPER_TRANS``
    and then upper-cased before being handed to the converter.

    Returns the converted text, or ``None`` (after printing an installation
    hint) when ``CLTK_NOT_FOUND`` is truthy.
    """
    if not CLTK_NOT_FOUND:
        upper_beta = text_beta.translate(LATIN_UPPER_TRANS).upper()
        return Replacer().beta_code(upper_beta)

    # The converter is unavailable: tell the user how to install it.
    print(
        'CLTK is not found in this environment. In order to use the beta2uni converter,',
        'install this package with `pip install cltk` or `pip install dh-utils[betacode]`'
    )
    return None
Esempio n. 4
0
def get_tags(treebank_path='greek_treebank_perseus/agdt-1.7.xml',
             output_path='penn_pos_training_set_reduce.pos'):
    """Write a ``form/tag`` training file built from a treebank XML file.

    For every ``word`` element inside each top-level ``sentence`` element,
    the ``form`` attribute is upper-cased, converted with
    ``Replacer.beta_code``, given a final sigma where it ends in ``σ``,
    lower-cased and stripped of quote/bracket characters.  The tag is the
    part of the ``relation`` attribute before the first underscore.

    Words are joined by spaces, sentences by a blank line.

    Args:
        treebank_path: XML file to read. Defaults to the path that was
            previously hard-coded.
        output_path: File to write. Defaults to the name that was
            previously hard-coded.

    Raises:
        KeyError: If a ``word`` element lacks ``form`` or ``relation``.
    """
    replacer = Replacer()
    # Explicit UTF-8 so reading/writing Greek text does not depend on the
    # platform's default locale encoding.
    with open(treebank_path, 'r', encoding='utf-8') as f:
        xml_string = f.read()
    root = etree.fromstring(xml_string)

    # Characters removed from every form ("nasty single quotes" and brackets).
    unwanted = {"'", '᾽', '’', '[', ']'}

    sentences_list = []
    for sentence in root.findall('sentence'):
        sentence_list = []
        # Output layout loosely follows http://ilk.uvt.nl/conll/
        for element in sentence.findall('word'):
            word = element.attrib
            # Upper case is what the Beta Code converter is fed here.
            form = replacer.beta_code(word['form'].upper())
            if form.endswith('σ'):  # convert final sigmas
                form = form[:-1] + 'ς'
            form = form.lower()
            form = ''.join(char for char in form if char not in unwanted)

            # Used as the coarse-grained tag: the ``relation`` attribute
            # truncated at its first underscore.
            cpostag = word['relation'].split('_')[0]
            sentence_list.append('/'.join([form, cpostag]))
        sentences_list.append(' '.join(sentence_list))

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(sentences_list))
Esempio n. 5
0
def get_tags(path):
    """Return ``form/postag/lemma`` training text built from a treebank file.

    Reads the XML file at ``path``, takes the ``sentence`` elements under
    the first ``body`` element and, for each ``word``, normalises its
    ``form`` (upper-case, ``Replacer.beta_code``, lower-case, ``basify``,
    ``clean``, then removal of quote/bracket characters).  Words whose
    normalised form is empty are skipped.

    Args:
        path: Path of the treebank XML file.

    Returns:
        One line per sentence with space-separated ``form/postag/lemma``
        items; sentences are separated by a blank line.

    Raises:
        IndexError: If the document has no ``body`` element.
        KeyError: If a ``word`` element has no ``form`` attribute.
    """
    replacer = Replacer()
    # Explicit UTF-8 so Greek text is decoded the same on every platform.
    with open(path, 'r', encoding='utf-8') as f:
        xml_string = f.read()
    root = etree.fromstring(xml_string)
    body = root.findall('body')[0]

    # NOTE(review): the 's' / '?' literals below look like mis-encoded
    # Greek characters (possibly 'σ', 'ς' and '᾽') — confirm against the
    # original file. They are kept byte-for-byte to preserve behaviour.
    unwanted = {' ', "'", '?', '’', '[', ']'}

    sentences_list = []
    for sentence in body.findall('sentence'):
        sentence_list = []
        for element in sentence.findall('word'):
            word = element.attrib
            form = replacer.beta_code(word['form'].upper())
            if form.endswith('s'):
                form = form[:-1] + '?'
            form = clean(basify(form.lower()))
            form = ''.join(char for char in form if char not in unwanted)
            if not form:
                continue

            # Each attribute gets its own fallback. Previously a missing
            # attribute set an unused ``postag`` variable, so the join below
            # raised NameError or silently reused the previous word's values.
            postag = word.get('postag', 'x--------')
            lemma = word.get('lemma')
            # '_' is a placeholder chosen here for a missing lemma.
            lemma_tag = clean(basify(lemma)) if lemma is not None else '_'

            sentence_list.append('/'.join([form, postag, lemma_tag]))
        sentences_list.append(' '.join(sentence_list))
    return '\n\n'.join(sentences_list)
Esempio n. 6
0
def get_tags(treebank_path='/home/q078011/cltk_data/greek/text/perseus_treebank_dev/v2.1/Greek/texts/tlg0003.tlg001.perseus-grc1.1.tb.xml',
             output_path='greek_training_set_2.pos'):
    """Write a ``form/postag`` training file built from a treebank XML file.

    Takes the ``sentence`` elements under the first ``body`` element and,
    for each ``word``, normalises its ``form`` (upper-case,
    ``Replacer.beta_code``, lower-case, ``basify``, then removal of
    quote/bracket characters).  Words whose normalised form is empty are
    skipped.  A word without a ``postag`` attribute gets ``'x--------'``.

    Words are joined by spaces, sentences by a blank line.

    Args:
        treebank_path: XML file to read. Defaults to the path that was
            previously hard-coded.
        output_path: File to write. Defaults to the name that was
            previously hard-coded.

    Raises:
        IndexError: If the document has no ``body`` element.
        KeyError: If a ``word`` element has no ``form`` attribute.
    """
    replacer = Replacer()
    # Explicit UTF-8 so Greek text round-trips regardless of locale.
    with open(treebank_path, 'r', encoding='utf-8') as f:
        xml_string = f.read()
    root = etree.fromstring(xml_string)
    body = root.findall('body')[0]

    # NOTE(review): the 's' / '?' literals below look like mis-encoded
    # Greek characters (possibly 'σ', 'ς' and '᾽') — confirm against the
    # original file. They are kept byte-for-byte to preserve behaviour.
    unwanted = {' ', "'", '?', '’', '[', ']'}

    sentences_list = []
    for sentence in body.findall('sentence'):
        sentence_list = []
        for element in sentence.findall('word'):
            word = element.attrib
            form = replacer.beta_code(word['form'].upper())
            if form.endswith('s'):
                form = form[:-1] + '?'
            form = basify(form.lower())
            form = ''.join(char for char in form if char not in unwanted)
            if not form:
                continue
            # Only a missing attribute triggers the fallback; the earlier
            # bare ``except`` would also have hidden unrelated errors.
            postag = word.get('postag', 'x--------')
            sentence_list.append('/'.join([form, postag]))
        sentences_list.append(' '.join(sentence_list))

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(sentences_list))
Esempio n. 7
0
from cltk.corpus.greek.beta_to_unicode import Replacer
from lxml import etree
from greek_accentuation.characters import *

from greek_accentuation.characters import strip_accents
from transliterate import translit
from cltk.corpus.greek.beta_to_unicode import Replacer

# Module-level Beta Code converter; g_translit() calls r.beta_code().
r = Replacer()


def g_translit(string):
    """Transliterate ``string`` with ``translit(string, "el")``.

    When the input ends in a Latin ``s``, the last character of the
    transliteration is replaced by ``r.beta_code('s')`` (presumably to get
    the desired sigma form at word end — confirm against the converter).

    An empty ``string`` is handled without raising ``IndexError``.
    """
    tr = translit(string, "el")
    # ``endswith`` is safe on empty input, unlike indexing ``string[-1]``.
    if string.endswith("s"):
        tr = tr[:-1] + r.beta_code('s')
    return tr


def basify(string):
    """Return ``string`` with ``strip_accents`` applied to each character."""
    return "".join(map(strip_accents, string))


def get_tags():
    """Load the treebank XML file and locate its first ``body`` element.

    NOTE(review): this definition appears to be truncated; as visible here
    it parses the file and returns ``None`` without using ``body``.
    """
    # Local converter; shadows the module-level ``r`` inside this function.
    r = Replacer()
    entire_treebank = '/home/q078011/cltk_data/greek/text/perseus_treebank_dev/v2.1/Greek/texts/tlg0003.tlg001.perseus-grc1.1.tb.xml'
    with open(entire_treebank, 'r') as f:
        xml_string = f.read()
    root = etree.fromstring(xml_string)
    # Raises IndexError if the document has no ``body`` element.
    body = root.findall('body')[0]
Esempio n. 8
0
def beta2uni(text_beta):
    """Convert Beta Code text to Unicode via ``Replacer.beta_code``.

    ASCII lowercase letters are mapped to uppercase before conversion;
    all other characters are passed through unchanged.
    """
    ascii_lower_to_upper = str.maketrans(
        string.ascii_lowercase, string.ascii_uppercase)
    return Replacer().beta_code(text_beta.translate(ascii_lower_to_upper))