Ejemplo n.º 1
0
def proieltbs(treebank):
    """Print each active form of περισσευω found in a PROIEL treebank.

    For every matching verb, report the work's author and title, the
    verb's subject (if expressed), and any ἐν prepositional phrase that
    depends on it.  'ellipsed' marks dependents that never appear.
    """
    froot = treebank.getroot()
    author = 'unknown'
    title = 'unknown'
    for source in froot:
        for division in source:
            # <title>/<author> elements sit alongside the sentence divisions.
            if division.tag == 'title':
                title = division.text
            if division.tag == 'author':
                author = division.text
            for sentence in division:
                alltokesinsent = sentence.findall(".*[@form]")
                for token in alltokesinsent:
                    # Defaults used when no dependent supplies a value.
                    subject, en, prepobj = 'ellipsed', 'ellipsed', 'ellipsed'
                    if deaccent(token.get('lemma')) != 'περισσευω':
                        continue
                    if token.get('morphology')[4] != 'a':
                        continue
                    verbid = token.get('id')
                    for word in alltokesinsent:
                        if word.get('head-id') != verbid:
                            continue
                        if word.get('relation') == 'sub':
                            subject = word.get('form')
                        if word.get('lemma') == 'ἐν':
                            en = 'ἐν'
                            enid = word.get('id')
                            for preobj in alltokesinsent:
                                if preobj.get('head-id') == enid:
                                    prepobj = preobj.get('form')
                    print(author, ":", title, subject, token.get('form'),
                          en, prepobj)
    return
Ejemplo n.º 2
0
def perseustbs(treebank):
    """Print each active form of περισσευω found in a Perseus treebank.

    For every matching verb, report the work's author and title, the
    verb's subject (if expressed), and any ἐν prepositional phrase that
    depends on it.  'ellipsed' marks dependents that never appear.
    """
    froot = treebank.getroot()
    author = froot.find(".//author")
    author = author.text
    title = froot.find(".//title")
    title = title.text
    for body in froot:
        for sentence in body:
            mainverb = 'ellipsed'
            alltokesinsent = sentence.findall(".*[@form]")
            for verb in alltokesinsent:
                subject = 'ellipsed'
                en = 'ellipsed'
                prepobj = 'ellipsed'
                # postag[5] is voice in the Perseus tagset; 'a' = active.
                if deaccent(verb.get('lemma')) == 'περισσευω' and verb.get(
                        'postag')[5] == 'a':
                    verbid = verb.get('id')
                    for word in alltokesinsent:
                        if word.get('head') == verbid:
                            if word.get('relation') == 'sub':
                                subject = word.get('form')
                            if word.get('lemma') == 'ἐν':
                                en = 'ἐν'
                                enid = word.get('id')
                                for preobj in alltokesinsent:
                                    # BUG FIX: Perseus words link via 'head'
                                    # (as checked above), not PROIEL's
                                    # 'head-id'; the old 'head-id' lookup
                                    # never matched, so prepobj always
                                    # stayed 'ellipsed'.
                                    if preobj.get('head') == enid:
                                        prepobj = preobj.get('form')
                    print(author, ":", title, subject, verb.get('form'), en,
                          prepobj)
    return
Ejemplo n.º 3
0
def perseuscount(froot, i, j, inffile, fn):
    """Prints every instance of this articular infinitive construction for Perseus treebanks."""
    idtoheadid = {}
    inflist = []
    idtoform = {}

    # Pass 1: index every <word> by id -> head / form, and collect the
    # ids of all infinitives.
    for body in froot:
        for sentence in body:
            for word in sentence:
                if word.tag != 'word':
                    continue
                wid = word.get('id')
                idtoheadid[wid] = word.get('head')
                # postag[4] is mood in the Perseus tagset; 'n' = infinitive.
                if word.get('postag')[4] == 'n':
                    inflist.append(wid)
                idtoform[wid] = word.get('form')

    # Pass 2: find articles (lemma ο, relation ATR) whose head is an
    # infinitive, then report every OBJ dependent of that infinitive.
    for body in froot:
        for sentence in body:
            for word in sentence:
                if word.tag != 'word':
                    continue
                if (deaccent(word.get('lemma')) == 'ο'
                        and word.get('head') in inflist
                        and word.get('relation') == 'ATR'):
                    infinitiveid = word.get('head')
                    for infobj in sentence:
                        if infobj.tag != 'word':
                            continue
                        if (infobj.get('head') == infinitiveid
                                and infobj.get('relation') == 'OBJ'):
                            print(sentence.get('subdoc'), word.get('form'),
                                  idtoform[infinitiveid], infobj.get('form'))
                            inffile.writelines([fn, '\n',
                                                sentence.get('subdoc')])
                            # Article after its object -> flag and count.
                            if int(word.get('id')) > int(infobj.get('id')):
                                print('^^Backwards^^')
                                j += 1
                            i += 1
    return i, j, inffile
Ejemplo n.º 4
0
def perseustbs(treebank, wordtype):
    """Returns a list of two Counters filled with article stats for the given treebank and wordform."""
    wordcounter = Counter()
    idtoworddict = {}
    artheadid = {}
    artwordcounter = Counter()
    froot = treebank.getroot()

    # Pass 1: tally every noun (postag starts with 'n') and remember the
    # requested attribute under a "sentenceID-wordID" composite key.
    for body in froot:
        for sentence in body:
            sid = str(sentence.get('id'))
            for word in sentence:
                if str(word.get('postag'))[0] != 'n':
                    continue
                idtoworddict[sid + '-' + str(word.get('id'))] = word.get(wordtype)
                wordcounter[deaccent(word.get(wordtype))] += 1

    # Pass 2: map each article (lemma ὁ) to its head word's composite id.
    for body in froot:
        for sentence in body:
            sid = str(sentence.get('id'))
            for word in sentence:
                if word.get('lemma') == 'ὁ':
                    artid = sid + '-' + str(word.get('id'))
                    artheadid[artid] = sid + '-' + str(word.get('head'))

    # Pass 3: tally the nouns that actually carry an article.
    for headnounid in artheadid.values():
        if headnounid in idtoworddict:
            artwordcounter[deaccent(idtoworddict[headnounid])] += 1

    return [wordcounter, artwordcounter]
Ejemplo n.º 5
0
def proieltbs(treebank, perarticledict, perpronoundict, totarticlenumber,
              allforms):
    """Creates lists in ML format for each article."""
    froot = treebank.getroot()
    for source in froot:
        for division in source:
            for sentence in division:
                tokens = sentence.findall(".*[@form]")
                # Walk every token, emitting one feature list per article.
                for position, token in enumerate(tokens):
                    if token.get('lemma') != 'ὁ':
                        continue
                    artform = deaccent(token.get('form'))
                    if artform not in allforms:
                        allforms.append(artform)
                    jewish = 'yes' if source.get('jewish') == 'yes' else 'no'
                    mlformatlist = [jewish]
                    # Feature: the (deaccented) form of the next token,
                    # or 'OOR' when the article ends the sentence.
                    try:
                        nextform = deaccent(tokens[position + 1].get('form'))
                        mlformatlist.append(nextform)
                        if nextform not in allforms and nextform != '':
                            allforms.append(nextform)
                    except IndexError:
                        mlformatlist.append('OOR')
                    # Label 0 = true article (PoS 'S-'), 1 = pronoun use.
                    if token.get('part-of-speech') == 'S-':
                        mlformatlist.append(0)
                        perarticledict[totarticlenumber] = mlformatlist
                    else:
                        mlformatlist.append(1)
                        perpronoundict[totarticlenumber] = mlformatlist
                    totarticlenumber += 1
    return [perarticledict, perpronoundict, totarticlenumber, allforms]
Ejemplo n.º 6
0
def perseustbs(treebank, perarticledict, perpronoundict, totarticlenumber,
               allforms):
    """Create lists in ML format for each article (Perseus treebanks).

    Mirrors the PROIEL version: for every token with lemma ὁ, build
    [jewish-flag, next-form, label] and file it under a running article
    number.  Label 0 = article proper (postag starts 'l'), 1 = pronoun use.
    """
    froot = treebank.getroot()
    for body in froot:
        for sentence in body:
            allwordsinsent = sentence.findall(".*[@form]")
            # Loops through every word.
            for word in allwordsinsent:
                # Creates all the values that will go into a single element.
                if word.get('lemma') == 'ὁ':
                    articlenumber = allwordsinsent.index(word)
                    artform = deaccent(word.get('form'))
                    if artform not in allforms:
                        allforms.append(artform)
                    if body.get('jewish') == 'yes':
                        jewish = 'yes'
                    else:
                        jewish = 'no'
                    mlformatlist = [jewish]
                    nextwordid = articlenumber + 1
                    try:
                        form = deaccent(allwordsinsent[nextwordid].get('form'))
                        mlformatlist.append(form)
                        # Consistency fix: skip empty forms, as the PROIEL
                        # twin of this function already does.
                        if form not in allforms and not form == '':
                            allforms.append(form)
                    except IndexError:
                        mlformatlist.append('OOR')
                    if word.get('postag')[0] == 'l':
                        mlformatlist.append(0)
                        perarticledict[totarticlenumber] = mlformatlist
                    else:
                        mlformatlist.append(1)
                        perpronoundict[totarticlenumber] = mlformatlist
                    # BUG FIX: a stray duplicate assignment here also copied
                    # every pronoun entry into perarticledict, corrupting
                    # the article data set; it has been removed.
                    totarticlenumber += 1

    returnlist = [perarticledict, perpronoundict, totarticlenumber, allforms]
    return returnlist
Ejemplo n.º 7
0
def perseuslist(treebank, wordtype, firstwordlist):
    """Find every word of the chosen morphology which appears
    and add it to a firstwordlist if it's not already part of that list."""
    froot = treebank.getroot()
    # Select every noun (postag starts with 'n') in document order.
    nouns = (
        word
        for body in froot
        for sentence in body
        for word in sentence
        if str(word.get('postag'))[0] == 'n'
    )
    for noun in nouns:
        bare = deaccent(str(noun.get(wordtype)))
        if bare not in firstwordlist:
            firstwordlist.append(bare)
    return firstwordlist
Ejemplo n.º 8
0
def proiellist(treebank, wordtype, firstwordlist):
    """Find every word of the chosen part of speech which appears
    and add it to a firstwordlist if it's not already part of that list."""
    froot = treebank.getroot()
    for source in froot:
        for division in source:
            for sentence in division:
                for token in sentence:
                    # Ne = proper noun, Nb = common noun in PROIEL tagging.
                    if token.get('part-of-speech') not in ('Ne', 'Nb'):
                        continue
                    bare = deaccent(str(token.get(wordtype)))
                    if bare not in firstwordlist:
                        firstwordlist.append(bare)
    return firstwordlist
Ejemplo n.º 9
0
def proieltbs(treebank, wordtype):
    """Returns a list of two Counters filled with article stats for the given treebank and wordform."""
    wordcounter = Counter()
    idtoworddict = {}
    artheadid = {}
    artwordcounter = Counter()
    froot = treebank.getroot()

    def _every_token(root):
        # Flatten the source/division/sentence nesting into one stream.
        for source in root:
            for division in source:
                for sentence in division:
                    yield from sentence

    # Pass 1: tally every noun and remember each noun id's requested form.
    for token in _every_token(froot):
        if token.get('part-of-speech') in ('Ne', 'Nb'):
            idtoworddict[token.get('id')] = token.get(wordtype)
            wordcounter[deaccent(token.get(wordtype))] += 1

    # Pass 2: map article ids (lemma ὁ) to the ids of their head nouns.
    for token in _every_token(froot):
        if token.get('lemma') == 'ὁ':
            artheadid[token.get('id')] = token.get('head-id')

    # Pass 3: tally the nouns that actually carry an article.
    for headnounid in artheadid.values():
        if headnounid in idtoworddict:
            artwordcounter[deaccent(idtoworddict[headnounid])] += 1

    return [wordcounter, artwordcounter]
Ejemplo n.º 10
0
def perseustbs(treebank, artcount, auxcount, procount):
    """Tally article (lemma ο) usage in one Perseus treebank.

    Increments the running totals passed in and prints, for this file
    alone, the fraction of articles used pronominally (relation != 'ATR').

    :param treebank: parsed ElementTree of one Perseus treebank file
    :param artcount: running total of articles across files
    :param auxcount: running total of attributive (true article) uses
    :param procount: running total of pronominal uses
    :return: the three updated running totals
    """
    froot = treebank.getroot()
    fartcount = 0
    fauxcount = 0
    fprocount = 0
    for body in froot:
        for sentence in body:
            alltokesinsent = sentence.findall(".*[@form]")
            for word in alltokesinsent:
                if deaccent(word.get('lemma')) == 'ο':
                    artcount += 1
                    fartcount += 1
                    if word.get('relation') == 'ATR':
                        auxcount += 1
                        fauxcount += 1
                    else:
                        procount += 1
                        fprocount += 1
    # BUG FIX: a file with no articles used to raise ZeroDivisionError here.
    if fartcount:
        print('Percent Pronoun', fprocount / fartcount)
    else:
        print('Percent Pronoun', 'n/a (no articles found)')

    return artcount, auxcount, procount
Ejemplo n.º 11
0
def proieltbs(treebank, artcount, auxcount, procount):
    """Tally article (lemma ο) usage in one PROIEL treebank.

    Increments the running totals passed in and prints, for this file
    alone, the fraction of articles used pronominally (relation != 'aux').

    :param treebank: parsed ElementTree of one PROIEL treebank file
    :param artcount: running total of articles across files
    :param auxcount: running total of attributive (true article) uses
    :param procount: running total of pronominal uses
    :return: the three updated running totals
    """
    froot = treebank.getroot()
    fartcount = 0
    fauxcount = 0
    fprocount = 0
    for source in froot:
        for division in source:
            for sentence in division:
                alltokesinsent = sentence.findall(".*[@form]")
                for token in alltokesinsent:
                    if deaccent(token.get('lemma')) == 'ο':
                        artcount += 1
                        fartcount += 1
                        if token.get('relation') == 'aux':
                            auxcount += 1
                            fauxcount += 1
                        else:
                            procount += 1
                            fprocount += 1

    # BUG FIX: a file with no articles used to raise ZeroDivisionError here.
    if fartcount:
        print('Percent Pronoun', fprocount / fartcount)
    else:
        print('Percent Pronoun', 'n/a (no articles found)')
    return artcount, auxcount, procount
Ejemplo n.º 12
0
def proielcount(froot, i, j, inffile, fn):
    """Prints every instance of this articular infinitive construction for PROIEL treebanks."""
    idtoheadid = {}
    inflist = []
    idtoform = {}

    def _real_tokens(sentence):
        # Concrete <token> elements only; empty (ellipsed) tokens skipped.
        for token in sentence:
            if token.tag == 'token' and token.get('empty-token-sort') is None:
                yield token

    # Pass 1: index every token id -> head / form, and collect the ids
    # of all infinitives.
    for source in froot:
        for division in source:
            for sentence in division:
                for token in _real_tokens(sentence):
                    tid = token.get('id')
                    idtoheadid[tid] = token.get('head-id')
                    # morphology[3] is mood in PROIEL; 'n' = infinitive.
                    if token.get('morphology')[3] == 'n':
                        inflist.append(tid)
                    idtoform[tid] = token.get('form')

    # Pass 2: articles (lemma ο, relation aux) headed by an infinitive;
    # report every obj dependent of that infinitive.
    for source in froot:
        for division in source:
            for sentence in division:
                if sentence.tag != 'sentence':
                    continue
                for token in _real_tokens(sentence):
                    if deaccent(token.get('lemma')) != 'ο':
                        continue
                    if token.get('head-id') not in inflist:
                        continue
                    if token.get('relation') != 'aux':
                        continue
                    infinitiveid = token.get('head-id')
                    for infobj in _real_tokens(sentence):
                        if (infobj.get('relation') == 'obj'
                                and infobj.get('head-id') == infinitiveid):
                            print(token.get('citation-part'),
                                  token.get('form'),
                                  idtoform[infinitiveid],
                                  infobj.get('form'))
                            inffile.writelines([fn, token.get('citation-part')])
                            # Article after its object -> flag and count.
                            if int(token.get('id')) > int(infobj.get('id')):
                                print('^^Backwards!^^')
                                j += 1
                            i += 1
    return i, j, inffile
Ejemplo n.º 13
0
def perseustbs(treebank, perarticledict, totarticlenumber, alllemmas, allpos,
               allletters, answersdict):
    """Build an ML feature list for every article in a Perseus treebank.

    For each token with lemma ὁ the feature list holds: the jewish flag,
    the article's own morphology letters, then lemma + postag letters for
    the previous word and the next four words ('ellipsed' padding when the
    sentence runs out), and finally the answer class derived from how far
    away the article's head word sits.

    :param answersdict: maps head-word offsets in [-1, 4] to class labels;
        offsets outside that window map to class 5.
    :return: [perarticledict, totarticlenumber, alllemmas, allpos, allletters]
    """
    froot = treebank.getroot()
    for body in froot:
        for sentence in body:
            allwordsinsent = sentence.findall(".*[@form]")
            # Loops through every word.
            for word in allwordsinsent:
                # Collect vocabularies of lemmas and postag letters.
                if not deaccent(word.get('lemma')) in alllemmas:
                    alllemmas.append(deaccent(word.get('lemma')))
                for letter in word.get('postag'):
                    if letter not in allletters:
                        allletters.append(letter)
                # Creates all the values that will go into a single element.
                if word.get('lemma') == 'ὁ':
                    morph = word.get('postag')[1:]
                    articlenumber = allwordsinsent.index(word)
                    if body.get('jewish') == 'yes':
                        jewish = 'yes'
                    else:
                        jewish = 'no'
                    mlformatlist = [jewish]
                    for letter in morph:
                        mlformatlist.append(letter)
                    headwordplace = int(word.get('head')) - int(word.get('id'))
                    if headwordplace == 0:
                        # Head pointing at itself: flag the sentence id.
                        print(sentence.get('id'))
                    # Previous word's features.
                    nextwordid = articlenumber - 1
                    # BUG FIX: a Python index of -1 silently wraps to the
                    # LAST word of the sentence, so sentence-initial
                    # articles used to pick up bogus context; the old
                    # except IndexError never fired.  Pad explicitly.
                    if nextwordid >= 0:
                        lemma = deaccent(
                            allwordsinsent[nextwordid].get('lemma'))
                        morph = allwordsinsent[nextwordid].get('postag')
                        mlformatlist.append(lemma)
                        for letter in morph:
                            mlformatlist.append(letter)
                    else:
                        mlformatlist.extend(['ellipsed'] * 10)
                    # Next four words' features.
                    i = 1
                    while i < 5:
                        nextwordid = articlenumber + i
                        try:
                            lemma = deaccent(
                                allwordsinsent[nextwordid].get('lemma'))
                            morph = allwordsinsent[nextwordid].get('postag')
                            mlformatlist.append(lemma)
                            for letter in morph:
                                mlformatlist.append(letter)
                        except IndexError:
                            mlformatlist.extend(['ellipsed'] * 10)
                        i += 1
                    if headwordplace < -1 or headwordplace > 4:
                        fanswer = 5
                    else:
                        fanswer = answersdict[headwordplace]
                    mlformatlist.append(fanswer)
                    perarticledict[totarticlenumber] = mlformatlist
                    totarticlenumber += 1

    returnlist = [
        perarticledict, totarticlenumber, alllemmas, allpos, allletters
    ]
    return returnlist
Ejemplo n.º 14
0
def proieltbs(treebank, perarticledict, totarticlenumber, alllemmas, allpos,
              allletters, answersdict, posdict):
    """Creates lists in ML format for each article.

    For each token with lemma ὁ the feature list holds: the jewish flag,
    the article's morphology letters, then lemma + PoS letter + morphology
    for the previous token and the next four tokens ('ellipsed' padding
    when the sentence runs out), and finally the answer class derived from
    the head word's offset from the article.

    :param answersdict: maps head-word offsets in [-1, 4] to class labels;
        offsets outside that window map to class 5.
    :param posdict: maps PROIEL part-of-speech tags to single letters.
    :return: [perarticledict, totarticlenumber, alllemmas, allpos, allletters]
    """
    froot = treebank.getroot()
    for source in froot:
        for division in source:
            for sentence in division:
                alltokesinsent = sentence.findall(".*[@form]")
                # Loops through every word.
                for token in alltokesinsent:
                    # Collect vocabularies of lemmas, PoS and morphology letters.
                    posletter = posdict[token.get('part-of-speech')]
                    if not deaccent(token.get('lemma')) in alllemmas:
                        alllemmas.append(deaccent(token.get('lemma')))
                    if posletter not in allpos:
                        allpos.append(posletter)
                    for letter in token.get('morphology'):
                        if letter not in allletters:
                            allletters.append(letter)
                    # Creates all the values that will go into a single element.
                    if token.get('lemma') == 'ὁ':
                        morph = token.get('morphology')[:8]
                        articlenumber = alltokesinsent.index(token)
                        if source.get('jewish') == 'yes':
                            jewish = 'yes'
                        else:
                            jewish = 'no'
                        mlformatlist = [jewish]
                        for letter in morph:
                            mlformatlist.append(letter)
                        headwordplace = int(token.get('head-id')) - int(
                            token.get('id'))
                        # Previous token's features.
                        nextwordid = articlenumber - 1
                        # BUG FIX: a Python index of -1 silently wraps to
                        # the LAST token of the sentence, so
                        # sentence-initial articles used to pick up bogus
                        # context; the old except IndexError never fired.
                        if nextwordid >= 0:
                            lemma = deaccent(
                                alltokesinsent[nextwordid].get('lemma'))
                            morph = alltokesinsent[nextwordid].get(
                                'morphology')[:8]
                            pos = posdict[alltokesinsent[nextwordid].get(
                                'part-of-speech')]
                            mlformatlist.extend([lemma, pos])
                            for letter in morph:
                                mlformatlist.append(letter)
                        else:
                            mlformatlist.extend(['ellipsed'] * 10)
                        # Next four tokens' features.
                        i = 1
                        while i < 5:
                            nextwordid = articlenumber + i
                            try:
                                lemma = deaccent(
                                    alltokesinsent[nextwordid].get('lemma'))
                                morph = alltokesinsent[nextwordid].get(
                                    'morphology')[:8]
                                pos = posdict[alltokesinsent[nextwordid].get(
                                    'part-of-speech')]
                                mlformatlist.extend([lemma, pos])
                                for letter in morph:
                                    mlformatlist.append(letter)
                            except IndexError:
                                mlformatlist.extend(['ellipsed'] * 10)
                            i += 1
                        if headwordplace < -1 or headwordplace > 4:
                            fanswer = 5
                        else:
                            fanswer = answersdict[headwordplace]
                        mlformatlist.append(fanswer)
                        perarticledict[totarticlenumber] = mlformatlist
                        totarticlenumber += 1

    returnlist = [
        perarticledict, totarticlenumber, alllemmas, allpos, allletters
    ]
    return returnlist
Ejemplo n.º 15
0
# Bigram-frequency script (setup section): concatenate all text files in a
# folder, deaccent, and prepare counters for a bigram scan.
import pandas as pd
from collections import Counter
import math
import os
import glob
from utility import deaccent

theText = ''
path = '/home/chris/PycharmProjects/learn/Texts/'
for filename in glob.glob(os.path.join(path, '*.txt')):
    # NOTE(review): file handle is never closed; consider a `with` block.
    newText = open(filename).read()
    theText = theText + newText
# Combines every text file in the Text folder into a single
# string "theText".

plainText = deaccent(theText)
wordList = plainText.lower().split()
wordCounter = Counter()
bigramList = []
bigramCounter = Counter()
bigramDic = {}

for word in wordList:
    wordCounter[word] += 1
# Adds every unique word in wordList to a counter
# object with corresponding frequency.

# Cursor and bounds for the sliding scan over wordList that follows.
i = 0
# presumably the minimum frequency/length threshold for the scan below —
# TODO confirm against the (truncated) loop body.
minGram = 12
listLength = len(wordList)
while i < listLength - 1:
# Batch-normalise Perseus treebank XML: strip accents and lower-case every
# <word>'s 'form' and 'lemma' attribute, then write the tree back out.
from utility import deaccent
import os
import xml.etree.cElementTree as Et

# go to correct directory, by default, place the Perseus folder in the working folder
homeFolder = os.getcwd()
perseusFolder = os.path.join(os.getcwd(), '1.0 Original')
indir = os.listdir(perseusFolder)

# iterate through files in directory
for file in indir:
    os.chdir(perseusFolder)
    print(file)
    # parse the XML
    tree = Et.parse(file)

    # for each file, iterate through all words, deacent
    for logos in tree.iter('word'):
        accentedWord = logos.get('form')
        unaccentedWord = deaccent(accentedWord).lower()
        logos.set('form', unaccentedWord)
        accentedLemma = logos.get('lemma')
        unaccentedLemma = deaccent(accentedLemma).lower()
        logos.set('lemma', unaccentedLemma)

    # NOTE(review): after chdir back to homeFolder, the bare filename writes
    # the result into the working folder (not '1.0 Original') — presumably
    # intentional, so originals stay untouched; confirm.
    os.chdir(homeFolder)
    tree.write(file, encoding='UTF-8')