Python get_tagtext Examples

Programming Language: Python

Namespace/Package Name: GetTagText

Method/Function: get_tagtext

Examples at hotexamples.com: 4

Python get_tagtext - 4 examples found. These are the top-rated real-world Python examples of GetTagText.get_tagtext, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def rep_mstags(text, footnotes):
    """Replace footnoted spans of a gloss or Latin line with manuscript replacements.

    Every closing footnote tag of the form ``[/x]`` (a single lowercase letter)
    found in *text* names a footnote letter. The text wrapped by that tag pair is
    looked up with ``get_tagtext``. If a footnote starting with ``"x "`` contains
    a ``[/Rep]`` tag, each wrapped span ``[x]...[/x]`` is replaced in the text by
    ``[Rep]<first replacement>[/Rep]``.

    Args:
        text: The gloss or Latin line to process.
        footnotes: Iterable of footnote strings; a footnote is matched to a tag
            when its first two characters are the tag letter plus a space.

    Returns:
        The text with all applicable replacements made. Text without any
        ``[/x]`` closing tag is returned unchanged.
    """
    textstring = text
    # Each tagged span is stored together with its own footnote letter. Keeping
    # explicit pairs (instead of two parallel lists joined by list.index) keeps
    # the letter and the span aligned even when get_tagtext yields zero or
    # several spans for a tag, or when the same letter occurs more than once.
    # NOTE(review): assumes get_tagtext returns a list of strings - confirm.
    taggedspans = []
    for tagfound in re.finditer(r'\[/[a-z]\]', textstring):
        closetag = tagfound.group()
        tag = closetag[2:-1]
        opentag = "[" + tag + "]"
        for taggedtext in get_tagtext(textstring, tag):
            span = (tag, opentag + taggedtext + closetag)
            # A repeated letter would otherwise queue the same span again.
            if span not in taggedspans:
                taggedspans.append(span)
    for tag, replacetext in taggedspans:
        prefix = tag + " "
        for footnote in footnotes:
            if footnote[:2] != prefix or "[/Rep]" not in footnote:
                continue
            replacementlist = get_tagtext(footnote, "Rep")
            # A footnote with a closing [/Rep] but no extractable replacement
            # text is skipped rather than raising IndexError.
            if not replacementlist:
                continue
            repstring = "[Rep]" + replacementlist[0] + "[/Rep]"
            textstring = textstring.replace(replacetext, repstring)
    return textstring

Example #2

Show file

def get_section(fulltext, section="No Section"):
    """Return the parts of *fulltext* selected by a section name.

    For a recognised section name ("Lat", "SG", "Eng", "FN" or "AN") the result
    is whatever ``get_tagtext(fulltext, section)`` returns for that tag.

    Any other value is handled as "No Section": each occurrence of a recognised
    section (from its opening tag through its closing tag) is cut out, and the
    text left between the cuts is returned as a list of stripped strings. If
    joining those strings with single newlines produces a run of four newlines,
    the joined text is re-split on that run; the same is then done for a run of
    three newlines.
    """
    known_sections = ["Lat", "SG", "Eng", "FN", "AN"]
    if section in known_sections:
        return get_tagtext(fulltext, section)

    marker = "***SPLITHERE***"
    remainder = fulltext
    for name in known_sections:
        opener = f"[{name}]"
        closer = f"[/{name}]"
        # One cut per opening tag that is present before any cutting for this
        # section starts; each cut runs from the first opener to the end of
        # the first closer.
        for _ in range(remainder.count(opener)):
            cut_from = remainder.find(opener)
            cut_to = remainder.find(closer) + len(closer)
            remainder = remainder[:cut_from] + marker + remainder[cut_to:]

    pieces = [piece.strip() for piece in remainder.split(marker)]
    # Adjacent cuts leave empty pieces behind; they show up as newline runs
    # once the pieces are joined, and are collapsed by re-splitting on the run.
    for gap in ("\n\n\n\n", "\n\n\n"):
        joined = "\n".join(pieces)
        if gap in joined:
            pieces = joined.split(gap)
    return pieces

Example #3

Show file

def get_glinfobasic(file, startpage=499, stoppage=712):
    """Build a basic info-table of glosses for an inclusive range of pages.

    The first sublist of the result holds the headers ("Epistle", "Page",
    "Folio", "Gloss No.", "Gloss Text"). One further sublist is added per gloss
    returned by order_glosslist for each page: the current epistle, the page
    number, the folio(s) whose text contains the gloss, and then, for every
    gloss-number match found in the gloss, the number and the text after it.

    Args:
        file: Passed through unchanged to get_pages.
        startpage: First page to process (inclusive).
        stoppage: Last page to process (inclusive).

    Returns:
        The list of header and gloss rows; only the header row when the page
        range is empty.
    """
    gloss_number = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')
    missing_folio = "No Folio Information Found"
    epistle = "Unknown"
    rows = [["Epistle", "Page", "Folio", "Gloss No.", "Gloss Text"]]
    for page in range(startpage, stoppage + 1):
        page_text = get_pages(file, page, page)
        # An H2 heading on the page switches the epistle for this and all
        # following pages.
        headings = get_tagtext(page_text, "H2")
        if headings:
            epistle = headings[0]
        glosses = order_glosslist(
            clear_tags("\n\n".join(get_section(page_text, "SG"))))
        # The page is fetched a second time for the folio lookup rather than
        # reusing page_text.
        refetched = get_pages(file, page, page)
        folio_source = clear_tags(
            "\n\n".join(get_section(refetched, "SG")), "fol")
        folios = [(info[1], info[0])
                  for info in get_fol(order_glosses(folio_source))]
        for gloss in glosses:
            # Every folio whose text contains the whole gloss is recorded.
            folio_hits = [folio for folio, folio_text in folios
                          if gloss in folio_text]
            if not folio_hits:
                # Fall back to the first 11 characters of the gloss.
                stub = gloss[:11]
                folio_hits = [folio for folio, folio_text in folios
                              if stub in folio_text]
            if not folio_hits:
                folio_hits = [missing_folio]
            row = [epistle, page, *folio_hits]
            for found in gloss_number.finditer(gloss):
                label = found.group()
                # Drop the trailing ". " from the number; the gloss text is
                # whatever follows the first occurrence of the matched label.
                row.append(label[:-2])
                row.append(gloss[gloss.find(label) + len(label):])
            rows.append(row)
    return rows

Example #4

Show file

def get_glinfo(file, startpage=499, stoppage=712):
    """Build the full gloss info-table for an inclusive range of pages.

    The first sublist of the returned list holds the column headers; every later
    sublist describes one gloss. Columns, by index:
        0 Epistle, 1 Page, 2 Folio, 3 Gloss No.,
        4 Gloss Full-Tags (gloss text after the gloss number, all markup kept),
        5 Gloss Text (with [GLat][/GLat] tags converted to html italics tags,
          passed through clear_tags),
        6 Gloss Footnotes (as 5, but footnote tags turned into <sup> tags first),
        7 Relevant Footnotes (list of footnote strings, or "" when none),
        8 Adrian's Notes, 9 Gloss Translation.

    The function works in three passes: per-page gloss rows (columns 0-8),
    translations (column 9), then footnotes referenced from the translations
    (merged into column 7).

    Args:
        file: Passed through unchanged to get_pages, order_footlist,
            order_newlist and get_transpagesinfo.
        startpage: First page to process (inclusive).
        stoppage: Last page to process (inclusive).

    Returns:
        The info-list described above.

    Raises:
        RuntimeError: If a personal note does not start with a gloss number.
    """
    curepist = "Unknown"
    infolist = [[
        "Epistle", "Page", "Folio", "Gloss No.", "Gloss Full-Tags",
        "Gloss Text", "Gloss Footnotes", "Relevant Footnotes",
        "Adrian's Notes", "Gloss Translation"
    ]]
    # Translations for the whole page range; consumed from the front, queue-style, in the second pass.
    pagestrans = get_transpagesinfo(file, startpage, stoppage)
    for page in range(startpage, stoppage + 1):
        thispage = page
        pagetext = get_pages(file, thispage, thispage)
        # Gets all page Footnotes for the first time (for the gloss)
        footnotelist = order_footlist(file, page)
        # Gets all notes supplied by me (for the gloss)
        notelist = order_newlist(file, page)
        # Collects [folio id, gloss number, note text] triples for this page.
        newnotelist = list()
        # Folio id of the most recent note that carried a folio marker; False until one is seen on this page.
        notefol = False
        if notelist != ['']:
            for notenum, note in enumerate(notelist):
                # Folio markers look like "[f. 1a]" or "[/f. 1a]".
                noteidpat = re.compile(r'\[/?f\. \d{1,2}[a-d]\]')
                noteiditer = noteidpat.findall(note)
                if noteiditer:
                    # Strip every folio marker out of the note text.
                    for folinfo in noteiditer:
                        note = "".join(note.split(folinfo))
                    # Reduce the first marker to its bare folio id by dropping "[", "]" and "/".
                    folinfo = "".join(i for i in noteiditer[0]
                                      if i not in ["[", "]", "/"])
                    # Both branches assign the same value: notefol ends up as the latest folio id.
                    if notenum == 0:
                        notefol = folinfo
                    elif notefol != folinfo:
                        notefol = folinfo
                # A note must begin with its gloss number, e.g. "12. " or "12a. ".
                notenumpat = re.compile(r'^\d{1,2}[a-z]?\. ')
                notenumiter = notenumpat.findall(note)
                if not notenumiter:
                    raise RuntimeError(
                        f"Personal note found without link to gloss number.\nNote: {note}"
                    )
                # NOTE(review): the pattern is anchored with ^ and compiled without MULTILINE, so this
                # branch looks unreachable (at most one match, always at the start) - confirm.
                elif len(notenumiter
                         ) > 1 or notenumiter[0] != note[:len(notenumiter[0])]:
                    raise RuntimeError(
                        f"Multiple possible gloss numbers found for personal note.\nNote: {note}"
                    )
                elif notenumiter[0] == note[:len(notenumiter[0])]:
                    # Drop the trailing ". " to get the gloss number, and cut the number off the note text.
                    glossnum = notenumiter[0][:-2]
                    note = note[len(notenumiter[0]):].strip()
                newnotelist.append([notefol, glossnum, note])
        # Checks for a new epistle on the current page.
        epfunc = get_tagtext(pagetext, "H2")
        if epfunc:
            curepist = epfunc[0]
        # Identifies individual glosses on the current page, and adds them to a gloss-list.
        glosslist = order_glosslist(
            clear_spectags("\n\n".join(get_section(pagetext, "SG")), "fol"))
        foliolist = []
        # Creates a list of folios and related gloss text for the current page.
        # The page is fetched again here rather than reusing pagetext.
        for folinfo in get_fol(
                order_glosses("\n\n".join(
                    get_section(get_pages(file, thispage, thispage), "SG")))):
            folio = folinfo[1]
            foliotext = folinfo[0]
            foliolist.append([folio, foliotext])
        # Creates a list with the current epistle name and page,
        # Checks for each gloss on the current page which folio it is in,
        # Adds folio information to the list.
        for gloss in glosslist:
            thisglosslist = [curepist, thispage]
            glossfound = False
            # First attempt: the whole gloss must occur in a folio's text.
            # NOTE(review): there is no break, so every matching folio is appended; the fixed
            # column indexes used below assume exactly one match.
            for foltextlist in foliolist:
                if gloss in foltextlist[1]:
                    thisfolio = foltextlist[0]
                    thisglosslist.append(thisfolio)
                    glossfound = True
            # Second attempt: match on the first 11 characters of the gloss only.
            if not glossfound:
                glossstub = gloss[:11]
                for foltextlist in foliolist:
                    if glossstub in foltextlist[1]:
                        thisfolio = foltextlist[0]
                        thisglosslist.append(thisfolio)
                        glossfound = True
            if not glossfound:
                thisglosslist.append("No Folio Information Found")
            # Identifies gloss numbers and removes them from the gloss text.
            # The pattern accepts an optional leading "N, " before the "N. " number (conjoined glosses).
            # NOTE(review): columns 3-8 are only added inside this loop, once per match; a gloss with
            # no match keeps a 3-item row, which the later passes index beyond - confirm inputs.
            glossnopat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')
            glosspatitir = glossnopat.finditer(gloss)
            for i in glosspatitir:
                # Adds gloss number to list.
                thisglosslist.append(i.group()[:-2])
                # Identifies foundational gloss including all markup tags.
                glossfulltags = gloss[gloss.find(i.group()) + len(i.group()):]
                # Creates a display copy of the gloss text, replacing Latin tags with html emphasis tags.
                glosstext = glossfulltags
                if "[GLat]" in glosstext:
                    glosstextlist = glosstext.split("[GLat]")
                    glosstext = "<em>".join(glosstextlist)
                if "[/GLat]" in glosstext:
                    glosstextlist = glosstext.split("[/GLat]")
                    glosstext = "</em>".join(glosstextlist)
                # Creates 2 copies of display gloss text, one primary, one retaining footnotes in superscript tags.
                basegloss = clear_tags(glosstext)
                footnotesgloss = glosstext[:]
                # Footnote tags are single lowercase letters in brackets, opening "[x]" or closing "[/x]".
                footnotepat = re.compile(r'\[/?[a-z]\]')
                fnpatitir = footnotepat.finditer(footnotesgloss)
                fnlist = []
                for j in fnpatitir:
                    fnlist.append(j.group())
                # No footnote tags: columns 4-7 are added with an empty string as the footnote column.
                if not fnlist:
                    fnstring = ""
                    thisglosslist.extend([
                        glossfulltags, basegloss,
                        clear_tags(footnotesgloss), fnstring
                    ])
                if fnlist:
                    # Handles open/close pairs: the opening tag is removed from the text and the
                    # closing tag is replaced by the footnote letter in superscript.
                    # NOTE(review): fnlist is modified (del) while it is being iterated here.
                    for fntag in fnlist:
                        if "[/" in fntag:
                            endtag = fntag
                            begintag = "".join(endtag.split("/"))
                            footnotesgloss = "".join(
                                footnotesgloss.split(begintag))
                            tagplace = footnotesgloss.find(endtag)
                            # In "[/x]" the letter sits at offset 2 and the tag ends at offset 4.
                            footnotesgloss = footnotesgloss[:tagplace] + "<sup>" +\
                                             footnotesgloss[tagplace + 2: tagplace + 3] + "</sup>" +\
                                             footnotesgloss[tagplace + 4:]
                            if begintag in fnlist:
                                del fnlist[fnlist.index(begintag)]
                    glossfnlist = []
                    for footnote in fnlist:
                        # The letter is the second-to-last character of "[x]" or "[/x]".
                        fnletter = footnote[-2]
                        # Collects footnotes relevant to this gloss and adds them to a list.
                        for fnote in footnotelist:
                            # A page footnote is matched on its first character.
                            fnoteid = fnote[:1]
                            if fnletter == fnoteid:
                                glossfnlist.append(
                                    clear_tags(fnote[:1] + ":" + fnote[1:]))
                        # Any tag still present in the text is replaced by the superscript letter.
                        fnsuperscript = "<sup>" + fnletter + "</sup>"
                        footnotesgloss = fnsuperscript.join(
                            footnotesgloss.split(footnote))
                    thisglosslist.extend([
                        glossfulltags, basegloss,
                        clear_tags(footnotesgloss), glossfnlist
                    ])
                # Adds the personal-notes column (index 8). A row length of 8 means columns 0-7 are
                # present and the notes column is still missing (assuming one folio and one gloss number).
                if newnotelist:
                    for note in newnotelist:
                        folinfo = note[0]
                        glossnum = note[1]
                        notetext = note[2]
                        if folinfo == thisglosslist[
                                2] and glossnum == thisglosslist[3]:
                            if len(thisglosslist) == 8:
                                thisglosslist.extend([notetext])
                            elif len(thisglosslist) == 9:
                                # Overwrites a placeholder left by an earlier, non-matching note.
                                if thisglosslist[8] == "":
                                    thisglosslist[8] = notetext
                        else:
                            # Non-matching note: make sure the column exists, with an empty placeholder.
                            anstring = ""
                            if len(thisglosslist) == 8:
                                thisglosslist.extend([anstring])
                elif not newnotelist:
                    anstring = ""
                    thisglosslist.extend([anstring])
            infolist.append(thisglosslist)
    # add translations to the end of the info-lists where they are available
    for infoset in infolist[
            1:]:  # exclude the first info-set containing the titles
        glossid = infoset[3]
        # The next unused translation; index 0 is its id, index 1 its text.
        # NOTE(review): pagestrans[0] is read for every row, so this raises IndexError once
        # pagestrans is empty (if it is a list) - confirm it always outlasts the glosses.
        curpagetrans = pagestrans[0]
        curtransid = curpagetrans[0]
        curtrans = curpagetrans[1]
        # deal with the conjoined gloss on TPH p. 500 (1b10 + 1b11)
        # split the gloss id into the two numbers, use these to identify the two translations
        # conjoin the two translations and append them to the info-set for the conjoined gloss ids
        if ", " in glossid:
            glossidlist = glossid.split(", ")
            splittranslations = []
            for newid in glossidlist:
                if newid == curtransid:
                    splittranslations.append(curtrans)
                    # Consume the translation just used and move on to the next one.
                    del pagestrans[0]
                    curpagetrans = pagestrans[0]
                    curtransid = curpagetrans[0]
                    curtrans = curpagetrans[1]
            joinedtrans = " i.e. ".join(splittranslations)
            if "[GLat]" in joinedtrans:
                transtextlist = joinedtrans.split("[GLat]")
                joinedtrans = "<em>".join(transtextlist)
            if "[/GLat]" in joinedtrans:
                transtextlist = joinedtrans.split("[/GLat]")
                joinedtrans = "</em>".join(transtextlist)
            infoset.append(joinedtrans)
        else:
            # Plain case: the gloss id equals the id of the next translation.
            if glossid == curtransid:
                if "[GLat]" in curtrans:
                    transtextlist = curtrans.split("[GLat]")
                    curtrans = "<em>".join(transtextlist)
                if "[/GLat]" in curtrans:
                    transtextlist = curtrans.split("[/GLat]")
                    curtrans = "</em>".join(transtextlist)
                infoset.append(curtrans)
                del pagestrans[0]
            # deal with page 587 where glosses 27, 28, and 29 share the one translation, numbered '27 – 29.'.
            elif " – " in curtransid:
                curtransidlist = curtransid.split(" – ")
                curtransidrange = [
                    int(curtransidlist[0]),
                    int(curtransidlist[1])
                ]
                idstart = curtransidrange[0]
                idstop = curtransidrange[1]
                # Expands the range into every gloss id it covers, as strings.
                curtransidlist = []
                for i in range(idstart, idstop + 1):
                    curtransidlist.append(str(i))
                # NOTE(review): when glossid is outside the range nothing is appended, so the row has
                # no translation column and infoset[9] in the next pass would fail - confirm.
                if glossid in curtransidlist:
                    if "[GLat]" in curtrans:
                        transtextlist = curtrans.split("[GLat]")
                        curtrans = "<em>".join(transtextlist)
                    if "[/GLat]" in curtrans:
                        transtextlist = curtrans.split("[/GLat]")
                        curtrans = "</em>".join(transtextlist)
                    infoset.append(curtrans)
                # The shared translation is only consumed once the last gloss of the range has used it.
                if glossid == curtransidlist[-1]:
                    del pagestrans[0]
            # if no translation is given in TPH
            else:
                infoset.append("No translation available.")
    # Gets all page Footnotes for the second time (for the translation)
    curpage = None
    for infoset in infolist[
            1:]:  # exclude the first info-set containing the titles
        thistransfns = []
        # ensures page footnotes are only generated once per page, and not for every gloss
        if curpage:
            if curpage != infoset[1]:
                curpage = infoset[1]
                footnotelist = order_footlist(file, curpage)
        elif not curpage:
            curpage = infoset[1]
            footnotelist = order_footlist(file, curpage)
        trans = infoset[9]
        # finds which translations have footnotes, looks for the associated footnote in the list generated above
        if "<sup>" in trans:
            superscriptpat = re.compile(r'<sup>\w</sup>')
            superscriptpatitir = superscriptpat.finditer(trans)
            for i in superscriptpatitir:
                # "<sup>" is five characters long, so index 5 is the footnote letter.
                fnid = i.group()[5]
                for footnote in footnotelist:
                    if footnote[0] == fnid:
                        # if the footnote is found and not already in the footnote list for the gloss it is added
                        if infoset[7]:
                            if clear_tags(footnote[:1] + ":" +
                                          footnote[1:]) not in infoset[7]:
                                thistransfns.append(
                                    clear_tags(footnote[:1] + ":" +
                                               footnote[1:]))
                        else:
                            thistransfns.append(
                                clear_tags(footnote[:1] + ":" + footnote[1:]))
        # all footnotes found for the gloss are combined
        # if there are translation footnotes
        if thistransfns:
            # if there are no gloss footnotes to add them to
            if not infoset[7]:
                infoset[7] = thistransfns
            # if there are gloss footnotes to add them to
            elif infoset[7]:
                for i in thistransfns:
                    infoset[7].append(i)
    return infolist