def proieltbs(treebank):
    """Print author, title, subject, verb form and any ἐν-phrase for every
    active form of περισσευω found in a PROIEL treebank."""
    root = treebank.getroot()
    author = 'unknown'
    title = 'unknown'
    for source in root:
        for division in source:
            # Header elements carry the work's metadata.
            if division.tag == 'title':
                title = division.text
            if division.tag == 'author':
                author = division.text
            for sentence in division:
                tokens = sentence.findall(".*[@form]")
                for verb in tokens:
                    subject = en = prepobj = 'ellipsed'
                    # morphology[4] == 'a' — presumably active voice; TODO confirm
                    is_target = (deaccent(verb.get('lemma')) == 'περισσευω'
                                 and verb.get('morphology')[4] == 'a')
                    if not is_target:
                        continue
                    verb_id = verb.get('id')
                    for dep in tokens:
                        if dep.get('head-id') != verb_id:
                            continue
                        if dep.get('relation') == 'sub':
                            subject = dep.get('form')
                        if dep.get('lemma') == 'ἐν':
                            en = 'ἐν'
                            en_id = dep.get('id')
                            # Anything headed by the preposition is its object.
                            for obj in tokens:
                                if obj.get('head-id') == en_id:
                                    prepobj = obj.get('form')
                    print(author, ":", title, subject, verb.get('form'), en, prepobj)
    return
def perseustbs(treebank):
    """Print author, title, subject, verb form and any ἐν-phrase for every
    active form of περισσευω found in a Perseus treebank.

    Bug fixed: Perseus word elements point to their governor through the
    'head' attribute (as used elsewhere in this function), not PROIEL's
    'head-id'; the original read 'head-id' when resolving the object of ἐν,
    so prepobj was always 'ellipsed'.  Also dropped the unused per-sentence
    'mainverb' local.
    """
    froot = treebank.getroot()
    author = froot.find(".//author").text
    title = froot.find(".//title").text
    for body in froot:
        for sentence in body:
            alltokesinsent = sentence.findall(".*[@form]")
            for verb in alltokesinsent:
                subject = 'ellipsed'
                en = 'ellipsed'
                prepobj = 'ellipsed'
                # postag[5] == 'a' — presumably active voice; TODO confirm
                if deaccent(verb.get('lemma')) == 'περισσευω' and verb.get(
                        'postag')[5] == 'a':
                    verbid = verb.get('id')
                    for word in alltokesinsent:
                        if word.get('head') == verbid:
                            if word.get('relation') == 'sub':
                                subject = word.get('form')
                            if word.get('lemma') == 'ἐν':
                                en = 'ἐν'
                                enid = word.get('id')
                                for preobj in alltokesinsent:
                                    # BUG FIX: was preobj.get('head-id')
                                    if preobj.get('head') == enid:
                                        prepobj = preobj.get('form')
                    print(author, ":", title, subject, verb.get('form'), en, prepobj)
    return
def perseuscount(froot, i, j, inffile, fn):
    """Prints every instance of this articular infinitive construction for
    Perseus treebanks.

    Pass 1 indexes every word's form and collects the ids of infinitives
    (postag position 5 == 'n'); pass 2 finds articles (lemma ὁ, relation ATR)
    whose head is an infinitive and prints that infinitive's object.

    Improvements: removed the idtoheadid dict, which was built but never
    read, and store infinitive ids in a set for O(1) membership tests.

    Returns the updated running counters (i = constructions found,
    j = article-after-object orderings) and the open report file.
    """
    inf_ids = set()
    idtoform = {}
    for body in froot:
        for sentence in body:
            for word in sentence:
                if word.tag == 'word':
                    # Collect every id of an infinitive.
                    if word.get('postag')[4] == 'n':
                        inf_ids.add(word.get('id'))
                    # Create a dictionary idtoform{ID:form}
                    idtoform[word.get('id')] = word.get('form')
    for body in froot:
        for sentence in body:
            for word in sentence:
                if word.tag == 'word':
                    if deaccent(word.get('lemma')) == 'ο' and word.get('head') in inf_ids and \
                            word.get('relation') == 'ATR':
                        infinitiveid = word.get('head')
                        for infobj in sentence:
                            if infobj.tag == 'word':
                                if infobj.get('head') == infinitiveid and infobj.get('relation') == 'OBJ':
                                    print(sentence.get('subdoc'), word.get('form'),
                                          idtoform[infinitiveid], infobj.get('form'))
                                    inffile.writelines([fn, '\n', sentence.get('subdoc')])
                                    # Article written after its infinitive's object.
                                    if int(word.get('id')) > int(infobj.get('id')):
                                        print('^^Backwards^^')
                                        j += 1
                                    i += 1
    return i, j, inffile
def perseustbs(treebank, wordtype):
    """Returns a list of two Counters filled with article stats for the given
    treebank and wordform."""
    all_nouns = Counter()        # deaccented noun -> total occurrences
    articular_nouns = Counter()  # deaccented noun -> occurrences with an article
    id_to_word = {}              # "sentID-wordID" -> accented wordform
    article_heads = {}           # article "sentID-wordID" -> head "sentID-wordID"
    root = treebank.getroot()
    # Pass 1: index every noun by its sentence-qualified id and count it.
    for body in root:
        for sentence in body:
            for word in sentence:
                if str(word.get('postag'))[0] == 'n':
                    key = str(sentence.get('id')) + '-' + str(word.get('id'))
                    id_to_word[key] = word.get(wordtype)
                    all_nouns[deaccent(word.get(wordtype))] += 1
    # Pass 2: map each article (lemma ὁ) to the id of the word it modifies.
    for body in root:
        for sentence in body:
            for word in sentence:
                if word.get('lemma') == 'ὁ':
                    art_key = str(sentence.get('id')) + '-' + str(word.get('id'))
                    head_key = str(sentence.get('id')) + '-' + str(word.get('head'))
                    article_heads[art_key] = head_key
    # Pass 3: each article pointing at an indexed noun counts one articular use.
    for head_key in article_heads.values():
        if head_key in id_to_word:
            articular_nouns[deaccent(id_to_word[head_key])] += 1
    return [all_nouns, articular_nouns]
def proieltbs(treebank, perarticledict, perpronoundict, totarticlenumber, allforms):
    """Creates lists in ML format for each article."""
    root = treebank.getroot()
    for source in root:
        # First feature: whether the source is flagged as a Jewish text.
        jewish = 'yes' if source.get('jewish') == 'yes' else 'no'
        for division in source:
            for sentence in division:
                tokens = sentence.findall(".*[@form]")
                for position, token in enumerate(tokens):
                    if token.get('lemma') != 'ὁ':
                        continue
                    artform = deaccent(token.get('form'))
                    if artform not in allforms:
                        allforms.append(artform)
                    features = [jewish]
                    # Second feature: deaccented form of the next word, or
                    # 'OOR' when the article is sentence-final.
                    try:
                        following = deaccent(tokens[position + 1].get('form'))
                        features.append(following)
                        if following != '' and following not in allforms:
                            allforms.append(following)
                    except IndexError:
                        features.append('OOR')
                    # Label 0 for POS 'S-' (filed as article), 1 otherwise
                    # (filed as pronoun).
                    if token.get('part-of-speech') == 'S-':
                        features.append(0)
                        perarticledict[totarticlenumber] = features
                    else:
                        features.append(1)
                        perpronoundict[totarticlenumber] = features
                    totarticlenumber += 1
    return [perarticledict, perpronoundict, totarticlenumber, allforms]
def perseustbs(treebank, perarticledict, perpronoundict, totarticlenumber, allforms):
    """Creates lists in ML format for each article (Perseus treebanks).

    Mirrors the PROIEL version: each lemma-ὁ token yields a feature list
    [jewish flag, next-word form or 'OOR', label] filed under a unique
    running number in perarticledict (label 0, postag 'l') or
    perpronoundict (label 1, anything else).

    Bug fixed: the pronoun branch also wrote its list into perarticledict,
    double-filing every pronoun; the PROIEL twin keeps the dicts disjoint.

    Returns [perarticledict, perpronoundict, totarticlenumber, allforms].
    """
    froot = treebank.getroot()
    for body in froot:
        for sentence in body:
            allwordsinsent = sentence.findall(".*[@form]")
            # Loops through every word.
            for word in allwordsinsent:
                if word.get('lemma') == 'ὁ':
                    articlenumber = allwordsinsent.index(word)
                    artform = deaccent(word.get('form'))
                    if artform not in allforms:
                        allforms.append(artform)
                    jewish = 'yes' if body.get('jewish') == 'yes' else 'no'
                    mlformatlist = [jewish]
                    nextwordid = articlenumber + 1
                    try:
                        form = deaccent(allwordsinsent[nextwordid].get('form'))
                        mlformatlist.append(form)
                        if form not in allforms:
                            allforms.append(form)
                    except IndexError:
                        # Article is the last word of the sentence.
                        mlformatlist.append('OOR')
                    if word.get('postag')[0] == 'l':
                        mlformatlist.append(0)
                        perarticledict[totarticlenumber] = mlformatlist
                    else:
                        mlformatlist.append(1)
                        # BUG FIX: no longer also stored in perarticledict.
                        perpronoundict[totarticlenumber] = mlformatlist
                    totarticlenumber += 1
    return [perarticledict, perpronoundict, totarticlenumber, allforms]
def perseuslist(treebank, wordtype, firstwordlist):
    """Find every word of the chosen morphology which appears and add it to a
    firstwordlist if it's not already part of that list."""
    for body in treebank.getroot():
        for sentence in body:
            for word in sentence:
                # postag[0] == 'n' selects nouns.
                if str(word.get('postag'))[0] == 'n':
                    bare = deaccent(str(word.get(wordtype)))
                    if bare not in firstwordlist:
                        firstwordlist.append(bare)
    return firstwordlist
def proiellist(treebank, wordtype, firstwordlist):
    """Find every word of the chosen part of speech which appears and add it
    to a firstwordlist if it's not already part of that list."""
    for source in treebank.getroot():
        for division in source:
            for sentence in division:
                for token in sentence:
                    # PROIEL noun POS tags: 'Ne' and 'Nb'.
                    if token.get('part-of-speech') in ('Ne', 'Nb'):
                        bare = deaccent(str(token.get(wordtype)))
                        if bare not in firstwordlist:
                            firstwordlist.append(bare)
    return firstwordlist
def proieltbs(treebank, wordtype):
    """Returns a list of two Counters filled with article stats for the given
    treebank and wordform."""
    noun_counts = Counter()       # deaccented noun -> total occurrences
    articular_counts = Counter()  # deaccented noun -> occurrences with an article
    id_to_word = {}               # token id -> accented wordform
    article_heads = {}            # article id -> head token id
    root = treebank.getroot()
    # Pass 1: record every noun token (PROIEL POS 'Ne' or 'Nb') by id.
    for source in root:
        for division in source:
            for sentence in division:
                for token in sentence:
                    if token.get('part-of-speech') in ('Ne', 'Nb'):
                        id_to_word[token.get('id')] = token.get(wordtype)
                        noun_counts[deaccent(token.get(wordtype))] += 1
    # Pass 2: map each article (lemma ὁ) to the id of its head.
    for source in root:
        for division in source:
            for sentence in division:
                for token in sentence:
                    if token.get('lemma') == 'ὁ':
                        article_heads[token.get('id')] = token.get('head-id')
    # Pass 3: a noun gains one articular occurrence per article heading it.
    for head_id in article_heads.values():
        if head_id in id_to_word:
            articular_counts[deaccent(id_to_word[head_id])] += 1
    return [noun_counts, articular_counts]
def perseustbs(treebank, artcount, auxcount, procount):
    """Tally lemma-ὁ tokens in a Perseus treebank, split into attributive
    (relation ATR) and pronoun uses, and print this file's pronoun ratio.

    Robustness fix: the original divided by the per-file article count
    unconditionally, raising ZeroDivisionError on any file containing no
    ὁ tokens; the print is now skipped in that case.

    Returns the updated running totals (artcount, auxcount, procount).
    """
    froot = treebank.getroot()
    # Per-file tallies, used only for the ratio printed below.
    fartcount = 0
    fauxcount = 0
    fprocount = 0
    for body in froot:
        for sentence in body:
            alltokesinsent = sentence.findall(".*[@form]")
            for word in alltokesinsent:
                if deaccent(word.get('lemma')) == 'ο':
                    artcount += 1
                    fartcount += 1
                    if word.get('relation') == 'ATR':
                        auxcount += 1
                        fauxcount += 1
                    else:
                        procount += 1
                        fprocount += 1
    if fartcount:  # BUG FIX: avoid ZeroDivisionError on article-free files
        print('Percent Pronoun', fprocount / fartcount)
    return artcount, auxcount, procount
def proieltbs(treebank, artcount, auxcount, procount):
    """Tally lemma-ὁ tokens in a PROIEL treebank, split into auxiliary
    (relation aux) and pronoun uses, and print this file's pronoun ratio.

    Robustness fix: the original divided by the per-file article count
    unconditionally, raising ZeroDivisionError on any file containing no
    ὁ tokens; the print is now skipped in that case.

    Returns the updated running totals (artcount, auxcount, procount).
    """
    froot = treebank.getroot()
    # Per-file tallies, used only for the ratio printed below.
    fartcount = 0
    fauxcount = 0
    fprocount = 0
    for source in froot:
        for division in source:
            for sentence in division:
                alltokesinsent = sentence.findall(".*[@form]")
                for token in alltokesinsent:
                    if deaccent(token.get('lemma')) == 'ο':
                        artcount += 1
                        fartcount += 1
                        if token.get('relation') == 'aux':
                            auxcount += 1
                            fauxcount += 1
                        else:
                            procount += 1
                            fprocount += 1
    if fartcount:  # BUG FIX: avoid ZeroDivisionError on article-free files
        print('Percent Pronoun', fprocount / fartcount)
    return artcount, auxcount, procount
def proielcount(froot, i, j, inffile, fn):
    """Prints every instance of this articular infinitive construction for
    PROIEL treebanks."""

    def is_real(node):
        # Genuine tokens only: skip non-token elements and empty tokens.
        return node.tag == 'token' and node.get('empty-token-sort') is None

    id_to_head = {}
    infinitive_ids = []
    id_to_form = {}
    # Pass 1: index ids, heads and forms; collect infinitive ids
    # (morphology position 4 == 'n').
    for source in froot:
        for division in source:
            for sentence in division:
                for token in sentence:
                    if is_real(token):
                        id_to_head[token.get('id')] = token.get('head-id')
                        if token.get('morphology')[3] == 'n':
                            infinitive_ids.append(token.get('id'))
                        id_to_form[token.get('id')] = token.get('form')
    # NOTE(review): id_to_head is populated but never read — candidate for removal.
    # Pass 2: articles (lemma ὁ, relation aux) headed by an infinitive,
    # paired with that infinitive's obj dependent.
    for source in froot:
        for division in source:
            for sentence in division:
                if sentence.tag != 'sentence':
                    continue
                for article in sentence:
                    if not is_real(article):
                        continue
                    if (deaccent(article.get('lemma')) == 'ο'
                            and article.get('head-id') in infinitive_ids
                            and article.get('relation') == 'aux'):
                        inf_id = article.get('head-id')
                        for dependent in sentence:
                            if is_real(dependent) and \
                                    dependent.get('relation') == 'obj' and \
                                    dependent.get('head-id') == inf_id:
                                print(article.get('citation-part'), article.get('form'),
                                      id_to_form[inf_id], dependent.get('form'))
                                inffile.writelines([fn, article.get('citation-part')])
                                if int(article.get('id')) > int(dependent.get('id')):
                                    print('^^Backwards!^^')
                                    j += 1
                                i += 1
    return i, j, inffile
def perseustbs(treebank, perarticledict, totarticlenumber, alllemmas, allpos,
               allletters, answersdict):
    """Build an ML feature list for every article (lemma ὁ) in a Perseus
    treebank.

    Per article the features are: jewish flag, the article's own morphology
    letters (postag minus its POS letter), then lemma + postag letters for
    the preceding word and the four following words ('ellipsed' * 10 padding
    when out of range), and finally the answer class derived from the
    head-word offset via answersdict (5 when outside [-1, 4]).

    Bug fixed: for a sentence-initial article the "preceding word" index was
    -1, which Python silently resolves to the LAST word of the sentence
    instead of raising the IndexError the original relied on; out-of-range
    is now detected on both sides and padded with 'ellipsed'.

    Returns [perarticledict, totarticlenumber, alllemmas, allpos, allletters].
    """
    froot = treebank.getroot()
    for body in froot:
        for sentence in body:
            allwordsinsent = sentence.findall(".*[@form]")
            for word in allwordsinsent:
                # Maintain the global vocabularies of lemmas and postag letters.
                if deaccent(word.get('lemma')) not in alllemmas:
                    alllemmas.append(deaccent(word.get('lemma')))
                for letter in word.get('postag'):
                    if letter not in allletters:
                        allletters.append(letter)
                if word.get('lemma') != 'ὁ':
                    continue
                morph = word.get('postag')[1:]
                articlenumber = allwordsinsent.index(word)
                jewish = 'yes' if body.get('jewish') == 'yes' else 'no'
                mlformatlist = [jewish]
                mlformatlist.extend(morph)
                headwordplace = int(word.get('head')) - int(word.get('id'))
                if headwordplace == 0:
                    # Flag sentences where the article's head offset is zero.
                    print(sentence.get('id'))
                # Context window: one word before, four words after.
                for offset in (-1, 1, 2, 3, 4):
                    neighbour = articlenumber + offset
                    # BUG FIX: bounds-check instead of except IndexError so
                    # that index -1 no longer wraps to the sentence's last word.
                    if 0 <= neighbour < len(allwordsinsent):
                        near = allwordsinsent[neighbour]
                        mlformatlist.append(deaccent(near.get('lemma')))
                        mlformatlist.extend(near.get('postag'))
                    else:
                        mlformatlist.extend(['ellipsed'] * 10)
                if headwordplace < -1 or headwordplace > 4:
                    fanswer = 5
                else:
                    fanswer = answersdict[headwordplace]
                mlformatlist.append(fanswer)
                perarticledict[totarticlenumber] = mlformatlist
                totarticlenumber += 1
    return [perarticledict, totarticlenumber, alllemmas, allpos, allletters]
def proieltbs(treebank, perarticledict, totarticlenumber, alllemmas, allpos,
              allletters, answersdict, posdict):
    """Creates lists in ML format for each article (PROIEL treebanks).

    Per article the features are: jewish flag, the article's first eight
    morphology letters, then lemma + POS letter + first eight morphology
    letters for the preceding word and the four following words
    ('ellipsed' * 10 padding when out of range), and finally the answer
    class derived from the head-word offset via answersdict (5 when outside
    [-1, 4]).

    Bug fixed: for a sentence-initial article the "preceding word" index was
    -1, which Python silently resolves to the LAST token of the sentence
    instead of raising the IndexError the original relied on; out-of-range
    is now detected on both sides and padded with 'ellipsed'.

    Returns [perarticledict, totarticlenumber, alllemmas, allpos, allletters].
    """
    froot = treebank.getroot()
    for source in froot:
        for division in source:
            for sentence in division:
                alltokesinsent = sentence.findall(".*[@form]")
                for token in alltokesinsent:
                    # Maintain the global vocabularies of lemmas, POS letters
                    # and morphology letters.
                    posletter = posdict[token.get('part-of-speech')]
                    if deaccent(token.get('lemma')) not in alllemmas:
                        alllemmas.append(deaccent(token.get('lemma')))
                    if posletter not in allpos:
                        allpos.append(posletter)
                    for letter in token.get('morphology'):
                        if letter not in allletters:
                            allletters.append(letter)
                    if token.get('lemma') != 'ὁ':
                        continue
                    morph = token.get('morphology')[:8]
                    articlenumber = alltokesinsent.index(token)
                    jewish = 'yes' if source.get('jewish') == 'yes' else 'no'
                    mlformatlist = [jewish]
                    mlformatlist.extend(morph)
                    headwordplace = int(token.get('head-id')) - int(token.get('id'))
                    # Context window: one token before, four tokens after.
                    for offset in (-1, 1, 2, 3, 4):
                        neighbour = articlenumber + offset
                        # BUG FIX: bounds-check instead of except IndexError so
                        # that index -1 no longer wraps to the last token.
                        if 0 <= neighbour < len(alltokesinsent):
                            near = alltokesinsent[neighbour]
                            mlformatlist.extend([
                                deaccent(near.get('lemma')),
                                posdict[near.get('part-of-speech')],
                            ])
                            mlformatlist.extend(near.get('morphology')[:8])
                        else:
                            mlformatlist.extend(['ellipsed'] * 10)
                    if headwordplace < -1 or headwordplace > 4:
                        fanswer = 5
                    else:
                        fanswer = answersdict[headwordplace]
                    mlformatlist.append(fanswer)
                    perarticledict[totarticlenumber] = mlformatlist
                    totarticlenumber += 1
    return [perarticledict, totarticlenumber, alllemmas, allpos, allletters]
import pandas as pd from collections import Counter import math import os import glob from utility import deaccent theText = '' path = '/home/chris/PycharmProjects/learn/Texts/' for filename in glob.glob(os.path.join(path, '*.txt')): newText = open(filename).read() theText = theText + newText # Combines every text file in the Text folder into a single # string "theText". plainText = deaccent(theText) wordList = plainText.lower().split() wordCounter = Counter() bigramList = [] bigramCounter = Counter() bigramDic = {} for word in wordList: wordCounter[word] += 1 # Adds every unique word in wordList to a counter # object with corresponding frequency. i = 0 minGram = 12 listLength = len(wordList) while i < listLength - 1:
# Script: strip accents from every word form and lemma in a folder of
# Perseus treebank XML files and rewrite each file in place.
from utility import deaccent
import os
import xml.etree.cElementTree as Et

# go to correct directory, by default, place the Perseus folder in the working folder
homeFolder = os.getcwd()
perseusFolder = os.path.join(os.getcwd(), '1.0 Original')
indir = os.listdir(perseusFolder)
# iterate through files in directory
# NOTE(review): os.listdir returns every entry, not just XML files — a
# non-XML file in the folder would make Et.parse fail; confirm the folder
# only ever holds treebank XML.
for file in indir:
    os.chdir(perseusFolder)  # parse relative to the Perseus folder
    print(file)              # progress indicator
    # parse the XML
    tree = Et.parse(file)
    # for each file, iterate through all words, deacent
    for logos in tree.iter('word'):
        # Lowercase and deaccent the surface form in place.
        accentedWord = logos.get('form')
        unaccentedWord = deaccent(accentedWord).lower()
        logos.set('form', unaccentedWord)
        # Same treatment for the lemma.
        # NOTE(review): assumes every <word> carries both 'form' and 'lemma';
        # a missing attribute would pass None to deaccent — verify.
        accentedLemma = logos.get('lemma')
        unaccentedLemma = deaccent(accentedLemma).lower()
        logos.set('lemma', unaccentedLemma)
    os.chdir(homeFolder)  # write the modified tree back from the home folder
    tree.write(file, encoding='UTF-8')