def get_glinfobasic(file, startpage=499, stoppage=712): """Returns an infolist containing multiple sublists. The first sublist contains the headers for an info-table. Subsequent lists contain, respectively, for a set page range: Epistle, Page No., Folio, Gloss no. and tag-free Gloss Text for every gloss in Wb.""" curepist = "Unknown" infolist = [["Epistle", "Page", "Folio", "Gloss No.", "Gloss Text"]] for page in range(startpage, stoppage + 1): thispage = page pagetext = get_pages(file, thispage, thispage) epfunc = get_tagtext(pagetext, "H2") if epfunc: curepist = epfunc[0] glosslist = order_glosslist( clear_tags("\n\n".join(get_section(pagetext, "SG")))) foliolist = [] for folinfo in get_fol( order_glosses( clear_tags( "\n\n".join( get_section(get_pages(file, thispage, thispage), "SG")), "fol"))): folio = folinfo[1] foliotext = folinfo[0] foliolist.append([folio, foliotext]) for gloss in glosslist: thisglosslist = [curepist, thispage] glossfound = False for foltextlist in foliolist: if gloss in foltextlist[1]: thisfolio = foltextlist[0] thisglosslist.append(thisfolio) glossfound = True if not glossfound: glossstub = gloss[:11] for foltextlist in foliolist: if glossstub in foltextlist[1]: thisfolio = foltextlist[0] thisglosslist.append(thisfolio) glossfound = True if not glossfound: thisglosslist.append("No Folio Information Found") glossnopat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ') glosspatitir = glossnopat.finditer(gloss) for i in glosspatitir: thisglosslist.extend([ (i.group())[:-2], gloss[gloss.find(i.group()) + len(i.group()):] ]) infolist.append(thisglosslist) return infolist
def list_numbered_glosses(file, startpage, stoppage): """Lists glosses by their folio ID and gloss number""" glist = [] for p in range(startpage, stoppage + 1): fcont = get_fol( order_glosses( clear_tags( "\n\n".join(get_section(get_pages(file, p, p), "SG")), "fol"))) for g in order_glosslist("\n\n".join( get_section(get_pages(file, p, p), "SG"))): for fol in fcont: raw_gloss = clear_tags(g) if clear_tags(g) in fol[0]: numpat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ') numpatitir = numpat.finditer(raw_gloss) for i in numpatitir: if i.group() in raw_gloss: glist.append( [fol[1][3:] + i.group()[:-1], cleangloss(g)]) return glist
def get_glinfo(file, startpage=499, stoppage=712): """Returns an infolist containing multiple sublists. The first sublist contains the headers for an info-table. Subsequent lists contain, respectively, for a set page range: Epistle, Page No., Folio, Gloss no. and Gloss Text (with [GLat][/GLat] tags converted to html italics tags) for every gloss in Wb.""" curepist = "Unknown" infolist = [[ "Epistle", "Page", "Folio", "Gloss No.", "Gloss Full-Tags", "Gloss Text", "Gloss Footnotes", "Relevant Footnotes", "Adrian's Notes", "Gloss Translation" ]] pagestrans = get_transpagesinfo(file, startpage, stoppage) for page in range(startpage, stoppage + 1): thispage = page pagetext = get_pages(file, thispage, thispage) # Gets all page Footnotes for the first time (for the gloss) footnotelist = order_footlist(file, page) # Gets all notes supplied by me (for the gloss) notelist = order_newlist(file, page) newnotelist = list() notefol = False if notelist != ['']: for notenum, note in enumerate(notelist): noteidpat = re.compile(r'\[/?f\. \d{1,2}[a-d]\]') noteiditer = noteidpat.findall(note) if noteiditer: for folinfo in noteiditer: note = "".join(note.split(folinfo)) folinfo = "".join(i for i in noteiditer[0] if i not in ["[", "]", "/"]) if notenum == 0: notefol = folinfo elif notefol != folinfo: notefol = folinfo notenumpat = re.compile(r'^\d{1,2}[a-z]?\. ') notenumiter = notenumpat.findall(note) if not notenumiter: raise RuntimeError( f"Personal note found without link to gloss number.\nNote: {note}" ) elif len(notenumiter ) > 1 or notenumiter[0] != note[:len(notenumiter[0])]: raise RuntimeError( f"Multiple possible gloss numbers found for personal note.\nNote: {note}" ) elif notenumiter[0] == note[:len(notenumiter[0])]: glossnum = notenumiter[0][:-2] note = note[len(notenumiter[0]):].strip() newnotelist.append([notefol, glossnum, note]) # Checks for a new epistle on the current page. epfunc = get_tagtext(pagetext, "H2") if epfunc: curepist = epfunc[0] # Identifies individual glosses on the current page, and adds them to a gloss-list. glosslist = order_glosslist( clear_spectags("\n\n".join(get_section(pagetext, "SG")), "fol")) foliolist = [] # Creates a list of folios and related gloss text for the current page. for folinfo in get_fol( order_glosses("\n\n".join( get_section(get_pages(file, thispage, thispage), "SG")))): folio = folinfo[1] foliotext = folinfo[0] foliolist.append([folio, foliotext]) # Creates a list with the current epistle name and page, # Checks for each gloss on the current page which folio it is in, # Adds folio information to the list. for gloss in glosslist: thisglosslist = [curepist, thispage] glossfound = False for foltextlist in foliolist: if gloss in foltextlist[1]: thisfolio = foltextlist[0] thisglosslist.append(thisfolio) glossfound = True if not glossfound: glossstub = gloss[:11] for foltextlist in foliolist: if glossstub in foltextlist[1]: thisfolio = foltextlist[0] thisglosslist.append(thisfolio) glossfound = True if not glossfound: thisglosslist.append("No Folio Information Found") # Identifies gloss numbers and removes them from the gloss text. glossnopat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ') glosspatitir = glossnopat.finditer(gloss) for i in glosspatitir: # Adds gloss number to list. thisglosslist.append(i.group()[:-2]) # Identifies foundational gloss including all markup tags. glossfulltags = gloss[gloss.find(i.group()) + len(i.group()):] # Creates a display copy of the gloss text, replacing Latin tags with html emphasis tags. glosstext = glossfulltags if "[GLat]" in glosstext: glosstextlist = glosstext.split("[GLat]") glosstext = "<em>".join(glosstextlist) if "[/GLat]" in glosstext: glosstextlist = glosstext.split("[/GLat]") glosstext = "</em>".join(glosstextlist) # Creates 2 copies of display gloss text, one primary, one retaining footnotes in superscript tags. basegloss = clear_tags(glosstext) footnotesgloss = glosstext[:] footnotepat = re.compile(r'\[/?[a-z]\]') fnpatitir = footnotepat.finditer(footnotesgloss) fnlist = [] for j in fnpatitir: fnlist.append(j.group()) if not fnlist: fnstring = "" thisglosslist.extend([ glossfulltags, basegloss, clear_tags(footnotesgloss), fnstring ]) if fnlist: for fntag in fnlist: if "[/" in fntag: endtag = fntag begintag = "".join(endtag.split("/")) footnotesgloss = "".join( footnotesgloss.split(begintag)) tagplace = footnotesgloss.find(endtag) footnotesgloss = footnotesgloss[:tagplace] + "<sup>" +\ footnotesgloss[tagplace + 2: tagplace + 3] + "</sup>" +\ footnotesgloss[tagplace + 4:] if begintag in fnlist: del fnlist[fnlist.index(begintag)] glossfnlist = [] for footnote in fnlist: fnletter = footnote[-2] # Collects footnotes relevant to this gloss and adds them to a list. for fnote in footnotelist: fnoteid = fnote[:1] if fnletter == fnoteid: glossfnlist.append( clear_tags(fnote[:1] + ":" + fnote[1:])) fnsuperscript = "<sup>" + fnletter + "</sup>" footnotesgloss = fnsuperscript.join( footnotesgloss.split(footnote)) thisglosslist.extend([ glossfulltags, basegloss, clear_tags(footnotesgloss), glossfnlist ]) if newnotelist: for note in newnotelist: folinfo = note[0] glossnum = note[1] notetext = note[2] if folinfo == thisglosslist[ 2] and glossnum == thisglosslist[3]: if len(thisglosslist) == 8: thisglosslist.extend([notetext]) elif len(thisglosslist) == 9: if thisglosslist[8] == "": thisglosslist[8] = notetext else: anstring = "" if len(thisglosslist) == 8: thisglosslist.extend([anstring]) elif not newnotelist: anstring = "" thisglosslist.extend([anstring]) infolist.append(thisglosslist) # add translations to the end of the info-lists where they are available for infoset in infolist[ 1:]: # exclude the first info-set containing the titles glossid = infoset[3] curpagetrans = pagestrans[0] curtransid = curpagetrans[0] curtrans = curpagetrans[1] # deal with the conjoined gloss on TPH p. 500 (1b10 + 1b11) # split the gloss id into the two numbers, use these to identify the two translations # conjoin the two translations and append them to the info-set for the conjoined gloss ids if ", " in glossid: glossidlist = glossid.split(", ") splittranslations = [] for newid in glossidlist: if newid == curtransid: splittranslations.append(curtrans) del pagestrans[0] curpagetrans = pagestrans[0] curtransid = curpagetrans[0] curtrans = curpagetrans[1] joinedtrans = " i.e. ".join(splittranslations) if "[GLat]" in joinedtrans: transtextlist = joinedtrans.split("[GLat]") joinedtrans = "<em>".join(transtextlist) if "[/GLat]" in joinedtrans: transtextlist = joinedtrans.split("[/GLat]") joinedtrans = "</em>".join(transtextlist) infoset.append(joinedtrans) else: if glossid == curtransid: if "[GLat]" in curtrans: transtextlist = curtrans.split("[GLat]") curtrans = "<em>".join(transtextlist) if "[/GLat]" in curtrans: transtextlist = curtrans.split("[/GLat]") curtrans = "</em>".join(transtextlist) infoset.append(curtrans) del pagestrans[0] # deal with page 587 where glosses 27, 28, and 29 share the one translation, numbered '27 – 29.'. elif " – " in curtransid: curtransidlist = curtransid.split(" – ") curtransidrange = [ int(curtransidlist[0]), int(curtransidlist[1]) ] idstart = curtransidrange[0] idstop = curtransidrange[1] curtransidlist = [] for i in range(idstart, idstop + 1): curtransidlist.append(str(i)) if glossid in curtransidlist: if "[GLat]" in curtrans: transtextlist = curtrans.split("[GLat]") curtrans = "<em>".join(transtextlist) if "[/GLat]" in curtrans: transtextlist = curtrans.split("[/GLat]") curtrans = "</em>".join(transtextlist) infoset.append(curtrans) if glossid == curtransidlist[-1]: del pagestrans[0] # if no translation is given in TPH else: infoset.append("No translation available.") # Gets all page Footnotes for the second time (for the translation) curpage = None for infoset in infolist[ 1:]: # exclude the first info-set containing the titles thistransfns = [] # ensures page footnotes are only generated once per page, and not for every gloss if curpage: if curpage != infoset[1]: curpage = infoset[1] footnotelist = order_footlist(file, curpage) elif not curpage: curpage = infoset[1] footnotelist = order_footlist(file, curpage) trans = infoset[9] # finds which translations have footnotes, looks for the associated footnote i the list generated above if "<sup>" in trans: superscriptpat = re.compile(r'<sup>\w</sup>') superscriptpatitir = superscriptpat.finditer(trans) for i in superscriptpatitir: fnid = i.group()[5] for footnote in footnotelist: if footnote[0] == fnid: # if the footnote is found and not already in the footnote list for the gloss it is added if infoset[7]: if clear_tags(footnote[:1] + ":" + footnote[1:]) not in infoset[7]: thistransfns.append( clear_tags(footnote[:1] + ":" + footnote[1:])) else: thistransfns.append( clear_tags(footnote[:1] + ":" + footnote[1:])) # all footnotes found for the gloss are combined # if there are translation footnotes if thistransfns: # if there are no gloss footnotes to add them to if not infoset[7]: infoset[7] = thistransfns # if there are gloss footnotes to add them to elif infoset[7]: for i in thistransfns: infoset[7].append(i) return infolist
def splitglosses(file): """Splits the glosses from a gloss-hand file into a gloss list""" filetext = openhandlists(file) glosslist = order_glosslist(filetext) return glosslist
def scribe_split(glossfile, startpage=499, stoppage=712): """Takes the text of the glosses, identifies page number, gloss text and footnotes, Separates the three scribal hands first by identifying prima manus footnotes throughout whole text then by breaking the remaining glosses of f.32d from f.33a""" # get a list of pages and page numbers from the file, and isolate the irish gloss text ... and footnotes pagesinfolist = get_pageinfo(glossfile, startpage, stoppage) pagesdir = [] for page in pagesinfolist: pageno = page[0] irish = get_section(get_pages(glossfile, pageno, pageno), "SG") irish = irish[0] pagedir = [pageno, irish] pagesdir.append(pagedir) # get the individual glosses per page, check if they have a 'prima manus' footnote, if so, put in PM list allglosses = ['All Glosses'] primanlist = ['Prima Manus'] handiilist = ['Hand Two'] handiiilist = ['Hand Three'] # glosscount = 0 # pmcount = 0 # htwocount = 0 # hthreecount = 0 # adds all glosses to a single list for page in pagesdir: glosslist = order_glosslist(page[1]) for curgloss in glosslist: allglosses.append(curgloss) # glosscount += 1 # adds prima manus glosses to a proma manus list for page in pagesdir: glosslist = order_glosslist(page[1]) footnotes = order_footlist(glossfile, page[0]) for curgloss in glosslist: # find footnote markers in each individual gloss glossfnpat = re.compile(r'\[[a-z]\]') glossfnitir = glossfnpat.finditer(curgloss) for i in glossfnitir: let = i.group() let = let[1:-1] for fn in footnotes: # Find footnote associated with gloss, then if it indicates a prima manu add gloss to prima list if fn[0] == let: if "prima" in fn: if curgloss not in primanlist: primanlist.append(curgloss) # pmcount += 1 # adds remaining glosses to separate lists for hands 2 and 3 handtwo = True for page in pagesdir: glosslist = order_glosslist(page[1]) for curgloss in glosslist: # iterate through the remaining glosses, remove prima glosses, divide rest into hand 2 or hand 3 list if "[f. 33a]" in curgloss: handtwo = False if handtwo: if curgloss not in primanlist: handiilist.append(curgloss) # htwocount += 1 else: if curgloss not in primanlist: handiiilist.append(curgloss) # hthreecount += 1 handlists = [allglosses, primanlist, handiilist, handiiilist] # print("Full Count: %d\nH1: %d\nH2: %d\nH3: %d" % (glosscount, pmcount, htwocount, hthreecount)) return handlists
def get_latpageinfo(file, page): """returns a list of gloss-lists for a specified page of TPH each gloss-list contains a gloss[0], the Latin verse[1], the lemma[2], and the lemma position[3]""" latininfolist = [] latlines = order_latlist("\n\n".join( get_section(get_pages(file, page, page), "Lat"))) eachgloss = order_glosslist( clear_tags("\n\n".join(get_section(get_pages(file, page, page), "SG")))) glosses = order_glosses( clear_tags("\n\n".join(get_section(get_pages(file, page, page), "SG")))) numpat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ') glossitir = numpat.finditer(glosses) glossnums = [] for i in glossitir: # Gets gloss numbers from the Irish text, converts them to match tags in the Latin text, adds them to a list. glossnum = i.group() glossnum = glossnum[:-2] if ", " in glossnum: glossnum = "–".join(glossnum.split(", ")) glossnums.append("[" + glossnum + "]") latpergloss = [] lemmata = [] positions = [] usednums = [] backlist = [] # Creates a reversed version of latlines to be searched instead on pages where there are duplicate glossnos. # This prevents two glosses with the same number interacting with each other's Latin lines. for line in latlines: backlist.append(line) backlist.reverse() # Checks for expected gloss numbers in the latin text and, if found, adds the latin line and lemma to lists. for num in glossnums: if num not in usednums: # If this is the first instance of this glossno on this page. usednums.append(num) found = False while not found: for line in latlines: if num in line: latpergloss.append(line) linetext = line numpos = line.find(num) linetext = linetext[:numpos] lemma = linetext[linetext.rfind(" ") + 1:] if "[" in lemma: lemma = clear_tags(lemma) lemmata.append(lemma) notagtext = clear_tags(linetext, ["let"]) remnumpat = re.compile( r'(\[NV\]|((Rom\. )?([IVX]{1,4}\. )?(\d{1,2}[a-z]?, )?(\d{1,2}[a-z]?\. )?))' ) thisglossremnum = "" # has to be put here for pages which begin mid-Latin-line for remnum in remnumpat.finditer(notagtext): if remnum.group() != "": thisglossremnum = remnum.group() remlen = len(thisglossremnum) notagtext = notagtext[remlen:] fnpat = re.compile(r'\[/?[a-d]\]') fnpatitir = fnpat.finditer(notagtext) fns = [] for fn in fnpatitir: fns.append(fn.group()) if fns: for marker in fns: if "[/" in marker: notagtext = "".join( notagtext.split(marker[0] + marker[-2:])) supscr = "</em><sup>{}</sup><em>".format( marker[2:3]) notagtext = supscr.join( notagtext.split(marker)) for marker in fns: if marker in notagtext: supscr = "</em><sup>{}</sup><em>".format( marker[1:2]) notagtext = supscr.join( notagtext.split(marker)) lempos = notagtext.rfind(lemma) positions.append(lempos) found = True break elif num in usednums: # If this is not the first instance of this glossno on this page. found = False while not found: for line in backlist: if num in line: latpergloss.append(line) linetext = line numpos = line.find(num) linetext = linetext[:numpos] lemma = linetext[linetext.rfind(" ") + 1:] if "[" in lemma: lemma = clear_tags(lemma) lemmata.append(lemma) notagtext = clear_tags(linetext, ["let"]) remnumpat = re.compile( r'(\[NV\]|((Rom\. )?([IVX]{1,4}\. )?(\d{1,2}[a-z]?, )?(\d{1,2}[a-z]?\. )?))' ) thisglossremnum = "" # has to be put here for pages which begin mid-Latin-line for remnum in remnumpat.finditer(notagtext): if remnum.group() != "": thisglossremnum = remnum.group() remlen = len(thisglossremnum) notagtext = notagtext[remlen:] fnpat = re.compile(r'\[/?[a-d]\]') fnpatitir = fnpat.finditer(notagtext) fns = [] for fn in fnpatitir: fns.append(fn.group()) if fns: for marker in fns: if "[/" in marker: notagtext = "".join( notagtext.split(marker[0] + marker[-2:])) supscr = "</em><sup>{}</sup><em>".format( marker[2:3]) notagtext = supscr.join( notagtext.split(marker)) for marker in fns: if marker in notagtext: supscr = "</em><sup>{}</sup><em>".format( marker[1:2]) notagtext = supscr.join( notagtext.split(marker)) lempos = notagtext.rfind(lemma) positions.append(lempos) found = True break for i in range(len(glossnums)): # Compiles a list of the gloss, the Latin line, and the lemma for the gloss within the Latin line. thislatperglos = latpergloss[i] fnpat = re.compile(r'\[/?[a-d]\]') fnpatitir = fnpat.finditer(thislatperglos) fns = [] for fn in fnpatitir: fns.append(fn.group()) if fns: for marker in fns: if "[/" in marker: thislatperglos = "".join( thislatperglos.split(marker[0] + marker[-2:])) supscr = "</em><sup>{}</sup><em>".format(marker[2:3]) thislatperglos = supscr.join(thislatperglos.split(marker)) for marker in fns: if marker in thislatperglos: supscr = "</em><sup>{}</sup><em>".format(marker[1:2]) thislatperglos = supscr.join(thislatperglos.split(marker)) latininfolist.append([ eachgloss[i], clear_tags(thislatperglos, ["NV"]), clear_tags(lemmata[i]), positions[i] ]) return latininfolist