# Esempio n. 1
# 0
def sampleworkcitation(authorid: str, workid: str) -> JSON_STR:
    """
    called by loadsamplecitation() in autocomplete.js

    we are using the manual input style on the web page
    so we need some hint on how to do things: check the end line for a sample citation

    "In Timarchum (w001)" yields...

    127.0.0.1 - - [04/Apr/2021 13:48:53] "GET /get/json/samplecitation/gr0026/001 HTTP/1.1" 200 -
    /get/json/samplecitation
        {"firstline": "1.1", "lastline": "196.7"}

    :param authorid: author id, e.g. 'gr0026' (depuncted before use)
    :param workid: work number, e.g. '001' (depuncted before use)
    :return: JSON string with 'firstline' and 'lastline' keys
    """
    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()

    returnvals = dict()
    returnvals['firstline'] = str()
    returnvals['lastline'] = str()

    authorid = depunct(authorid)
    workid = depunct(workid)

    try:
        ao = authordict[authorid]
        wo = workdict[authorid + 'w' + workid]
    except KeyError:
        returnvals['firstline'] = 'no such author/work combination'
        # BUGFIX: the early exit used to skip connectioncleanup() and so
        # leaked a database connection on every bad author/work request
        dbconnection.connectioncleanup()
        return json.dumps(returnvals)

    # the top level is excluded so that title lines ('t') are not offered
    toplevel = wo.availablelevels - 1
    firstlineindex = returnfirstorlastlinenumber(wo.universalid,
                                                 dbcursor,
                                                 disallowt=True,
                                                 disallowlevel=toplevel)
    flo = dblineintolineobject(
        grabonelinefromwork(authorid, firstlineindex, dbcursor))

    lastlineidx = returnfirstorlastlinenumber(wo.universalid,
                                              dbcursor,
                                              findlastline=True)
    llo = dblineintolineobject(
        grabonelinefromwork(authorid, lastlineidx, dbcursor))

    returnvals['firstline'] = flo.prolixlocus()
    returnvals['lastline'] = llo.prolixlocus()

    results = json.dumps(returnvals)

    dbconnection.connectioncleanup()

    return results
def lookoutsideoftheline(linenumber: int, numberofextrawords: int, workid: str,
                         searchobject: SearchObject, cursor) -> str:
    """
    grab a line and add the N words at the tail and head of the previous and next lines
    this will let you search for phrases that fall along a line break "και δη | και"

    if you wanted to look for 'ἀείδων Ϲπάρτηϲ'
    you need this individual line:
        2.1.374  δεξιτερὴν γὰρ ἀνέϲχε μετάρϲιον, ὡϲ πρὶν ἀείδων
    to turn extend out to:
        ὑφαίνων δεξιτερὴν γὰρ ἀνέϲχε μετάρϲιον ὡϲ πρὶν ἀείδων ϲπάρτηϲ

    :param linenumber: db index of the line at the center of the window
    :param numberofextrawords: how many words to borrow from each neighbour
    :param workid: work id whose first six characters name the author db table
    :param searchobject: the active SearchObject
    :param cursor: active database cursor
    :return: the padded text, wrapped in single spaces
    """
    whitespace = ' '
    workdbname = workid[0:6]

    query = 'SELECT {wltmp} FROM {db} WHERE index BETWEEN %s AND %s ORDER BY index ASC'.format(
        wltmp=worklinetemplate, db=workdbname)
    cursor.execute(query, (linenumber - 1, linenumber + 1))
    fetched = cursor.fetchall()

    lines = [dblineintolineobject(row) for row in fetched]

    # a line at the start/end of a work has no -1/+1 neighbour:
    # pad the window with blank lines so the loop below stays simple
    if len(lines) == 2:
        if lines[0].index == linenumber:
            lines.insert(0, makeablankline(workdbname, linenumber - 1))
        else:
            lines.append(makeablankline(workdbname, linenumber + 1))
    if len(lines) == 1:
        lines.insert(0, makeablankline(workdbname, linenumber - 1))
        lines.append(makeablankline(workdbname, linenumber + 1))

    text = list()
    for ln in lines:
        words = ln.wordlist(searchobject.usewordlist)
        if ln.index == linenumber - 1:
            # tail of the previous line
            text = words[-numberofextrawords:]
        elif ln.index == linenumber:
            text += words
        elif ln.index == linenumber + 1:
            # head of the next line
            text += words[:numberofextrawords]

    aggregate = whitespace.join(text)
    aggregate = re.sub(r'\s\s', whitespace, aggregate)

    return ' {a} '.format(a=aggregate)
    def iteratethroughsearchlist(self):
        """
        this is the simple core of the whole thing; the rest is about feeding it properly

        pull items off the search list and run the configured search function
        against each until the list is exhausted or the hit cap is reached

        if you do not pickle the lineobjects here and now you will need to generate line objects at the other end
            foundlineobjects = [dblineintolineobject(item) for item in founddblineobjects]

        you will also need to use lo.decompose() in phrasesearching.py to feed the findslist

        :return: None — foundlineobjects is a ListProxy; read
            self.foundlineobjects for the results instead
        """

        # the slot in the parameter list where the per-item value gets swapped in
        insertposition = self.searchfunctionparameters.index('parametertoswap')
        # keep going while there is work left and we are under the hit cap
        while self.emptytest and self.activepoll.gethits() <= self.so.cap:
            srchfunct = self.searchfunction
            nextitem = self.getnextfnc()
            if self.so.session['onehit']:
                # simplelemma chunk might have already searched and found in an author
                if self.so.lemma or self.so.proximatelemma:
                    # nextitem looks like '(chunk, item)'
                    if nextitem[1] in self.authorsamongthefinds():
                        # onehit: this author already has a find, so skip it
                        srchfunct = None

            if nextitem and srchfunct:
                params = self.parameterswapper(nextitem, insertposition)
                foundlines = srchfunct(*tuple(params))
                lineobjects = [dblineintolineobject(f) for f in foundlines]
                self.addnewfindstolistoffinds(lineobjects)
                self.updatepollfinds(lineobjects)
                self.updatepollremaining()
            elif not srchfunct:
                # item deliberately skipped (onehit shortcut above): keep looping
                pass
            else:
                # listofplacestosearch has been exhausted
                break

        self.listcleanup()

        if self.needconnectioncleanup:
            self.dbconnection.connectioncleanup()

        # empty return because foundlineobjects is a ListProxy:
        # ask for self.foundlineobjects as the search result instead
        # print('{i} finished'.format(i=self.workerid))
        return
def textsegmentfindstartandstop(authorobject, workobject, passageaslist,
                                cursor) -> dict:
    """
    find the first and last lines of a work segment

    let's say you looked for 'book 2' of something that has 'book, chapter, line':
    that means that you want everything that has the same level2 value as the
    first line of the selection

    NB: reverses passageaslist in place

    :param authorobject: author whose work is being sliced
    :param workobject: the work to slice
    :param passageaslist: citation components, lowest level first
    :param cursor: active database cursor
    :return: {'startline': int, 'endline': int}
    """

    p = tuple(passageaslist)
    lookforline = finddblinefromincompletelocus(workobject, p, cursor)
    # assuming that lookforline['code'] == 'success'
    # lookforline['code'] is (allegedly) only relevant to the Perseus lookup problem where a bad locus can be sent
    foundline = lookforline['line']
    line = grabonelinefromwork(authorobject.universalid, foundline, cursor)
    lo = dblineintolineobject(line)

    # build a where clause that matches everything sharing the selected levels
    passageaslist.reverse()
    atloc = '|'.join(passageaslist)
    selection = '{uid}_AT_{line}'.format(uid=workobject.universalid,
                                         line=atloc)

    w = atsignwhereclauses(selection, '=',
                           {authorobject.universalid: authorobject})
    d = [workobject.universalid]
    qw = str()
    # each clause is a (sql_fragment, value) pair; iterate directly
    # instead of the old 'for i in range(0, len(w))' index loop
    for clause in w:
        qw += 'AND (' + clause[0] + ') '
        d.append(clause[1])

    query = 'SELECT index FROM {au} WHERE wkuniversalid=%s {whr} ORDER BY index DESC LIMIT 1'.format(
        au=authorobject.universalid, whr=qw)
    data = tuple(d)

    cursor.execute(query, data)
    found = cursor.fetchone()

    startandstop = dict()
    startandstop['startline'] = lo.index
    startandstop['endline'] = found[0]

    return startandstop
def grableadingandlagging(hitline: dbWorkLine,
                          searchobject: SearchObject,
                          cursor,
                          override=None) -> dict:
    """
    take a dbline and grab the N words in front of it and after it

    it would be a good idea to have an autocommit connection here?

    override was added so that the rewritten so of precomposedphraseandproximitysearch() can set 'seeking' as it
    wishes

    :param hitline: the dbWorkLine that contained the hit
    :param searchobject: the active SearchObject
    :param cursor: active database cursor
    :param override: optional regex string used in place of the searchobject's term
    :return: {'lag': str, 'lead': str}
    """

    so = searchobject
    # look out for off-by-one errors
    distance = so.distance + 1

    if override:
        seeking = override
    elif so.lemma:
        seeking = wordlistintoregex(so.lemma.formlist)
        so.usewordlist = 'polytonic'
    else:
        seeking = so.termone

    # expanded searchzone because "seeking" might be a multi-line phrase
    # (renamed from 'prev'/'next' so as not to shadow the builtin next())
    prevline = grabonelinefromwork(hitline.authorid, hitline.index - 1, cursor)
    nextline = grabonelinefromwork(hitline.authorid, hitline.index + 1, cursor)
    prevline = dbWorkLine(*prevline)
    nextline = dbWorkLine(*nextline)

    searchzone = ' '.join([
        getattr(prevline, so.usewordlist),
        getattr(hitline, so.usewordlist),
        getattr(nextline, so.usewordlist)
    ])

    match = re.search(r'{s}'.format(s=seeking), searchzone)
    # but what if you just found 'paucitate' inside of 'paucitatem'?
    # you will have 'm' left over and this will throw off your distance-in-words count
    past = None
    upto = None
    lagging = list()
    leading = list()
    ucount = 0
    pcount = 0

    try:
        past = searchzone[match.end():].strip()
    except AttributeError:
        # AttributeError: 'NoneType' object has no attribute 'end'
        pass

    try:
        upto = searchzone[:match.start()].strip()
    except AttributeError:
        pass

    if upto:
        # split once; the old code ran the identical comprehension twice
        lagging = [x for x in upto.split(' ') if x]
        ucount = len(lagging)

    if past:
        leading = [x for x in past.split(' ') if x]
        pcount = len(leading)

    atline = hitline.index

    # walk backwards through preceding lines until enough context words accrue
    while ucount < distance + 1:
        atline -= 1
        try:
            previous = dblineintolineobject(
                grabonelinefromwork(hitline.authorid, atline, cursor))
        except TypeError:
            # 'NoneType' object is not subscriptable
            previous = makeablankline(hitline.authorid, -1)
            ucount = 999
        lagging = previous.wordlist(so.usewordlist) + lagging
        ucount += previous.wordcount()
    lagging = lagging[-1 * (distance - 1):]
    lagging = ' '.join(lagging)

    atline = hitline.index
    # and forwards through the following lines
    while pcount < distance + 1:
        atline += 1
        try:
            subsequent = dblineintolineobject(
                grabonelinefromwork(hitline.authorid, atline, cursor))
        except TypeError:
            # 'NoneType' object is not subscriptable
            subsequent = makeablankline(hitline.authorid, -1)
            pcount = 999
        leading += subsequent.wordlist(so.usewordlist)
        pcount += subsequent.wordcount()
    leading = leading[:distance - 1]
    leading = ' '.join(leading)

    returndict = {'lag': lagging, 'lead': leading}

    return returndict
def textmaker(author: str,
              work=None,
              passage=None,
              endpoint=None,
              citationdelimiter='|') -> JSON_STR:
    """
    build a text suitable for display

        "GET /textof/lt0474/024/20/30"

    :param author: author id, e.g. 'lt0474'
    :param work: work number, e.g. '024'
    :param passage: start citation (delimited by citationdelimiter)
    :param endpoint: optional end citation for a span
    :param citationdelimiter: separator between citation levels
    :return: JSON string with authorname/title/structure/worksegment/texthtml
    """

    probeforsessionvariables()

    dbconnection = ConnectionObject('autocommit')
    dbcursor = dbconnection.cursor()

    linesevery = hipparchia.config['SHOWLINENUMBERSEVERY']

    po = TextmakerInputParsingObject(author, work, passage, endpoint,
                                     citationdelimiter)

    ao = po.authorobject
    wo = po.workobject

    segmenttext = str()

    # consolewarning('po.passageaslist: {p}'.format(p=po.passageaslist))

    if ao and wo:
        # we have both an author and a work, maybe we also have a subset of the work
        if endpoint:
            # explicit span: find both ends independently
            firstlinenumber = finddblinefromincompletelocus(
                wo, po.passageaslist, dbcursor)
            lastlinenumber = finddblinefromincompletelocus(wo,
                                                           po.endpointlist,
                                                           dbcursor,
                                                           findlastline=True)
            if firstlinenumber['code'] == 'success' and lastlinenumber[
                    'code'] == 'success':
                startline = firstlinenumber['line']
                endline = lastlinenumber['line']
                startlnobj = dblineintolineobject(
                    grabonelinefromwork(ao.universalid, startline, dbcursor))
                stoplnobj = dblineintolineobject(
                    grabonelinefromwork(ao.universalid, endline, dbcursor))
            else:
                # fall back to a trivial two-line span rather than failing
                msg = '"buildtexttospan/" could not find first and last: {a}w{b} - {c} TO {d}'
                consolewarning(
                    msg.format(a=author, b=work, c=passage, d=endpoint))
                startlnobj = makeablankline(work, 0)
                stoplnobj = makeablankline(work, 1)
                startline = 0
                endline = 1
            segmenttext = 'from {a} to {b}'.format(a=startlnobj.shortlocus(),
                                                   b=stoplnobj.shortlocus())
        elif not po.passageaslist:
            # whole work
            startline = wo.starts
            endline = wo.ends
        else:
            # partial citation: derive the span from the citation structure
            startandstop = textsegmentfindstartandstop(ao, wo,
                                                       po.passageaslist,
                                                       dbcursor)
            startline = startandstop['startline']
            endline = startandstop['endline']
        texthtml = buildtext(wo.universalid, startline, endline, linesevery,
                             dbcursor)
    else:
        texthtml = str()

    if hipparchia.config['INSISTUPONSTANDARDANGLEBRACKETS']:
        texthtml = gtltsubstitutes(texthtml)

    if not segmenttext:
        segmenttext = '.'.join(po.passageaslist)

    if not ao or not wo:
        # placeholder objects so the result dict below can always be built
        ao = makeanemptyauthor('gr0000')
        wo = makeanemptywork('gr0000w000')

    results = dict()
    results['authorname'] = avoidsmallvariants(ao.shortname)
    results['title'] = avoidsmallvariants(wo.title)
    results['structure'] = avoidsmallvariants(wo.citation())
    results['worksegment'] = segmenttext
    results['texthtml'] = texthtml

    results = json.dumps(results)

    dbconnection.connectioncleanup()

    return results
def buildindexto(searchid: str,
                 author: str,
                 work=None,
                 passage=None,
                 endpoint=None,
                 citationdelimiter='|',
                 justvocab=False) -> JSON_STR:
    """
    build a complete index to a an author, work, or segment of a work

    :param searchid: poll id so the client can watch progress
    :param author: author id
    :param work: optional work number
    :param passage: optional start citation
    :param endpoint: optional end citation for a span
    :param citationdelimiter: separator between citation levels
    :param justvocab: if True, return the cdict of line ranges instead of JSON
        (NB: this early return bypasses the JSON_STR return type)
    :return: JSON string describing the index (or cdict when justvocab)
    """

    probeforsessionvariables()

    pollid = validatepollid(searchid)

    starttime = time.time()

    progresspolldict[pollid] = ProgressPoll(pollid)
    progresspolldict[pollid].activate()

    dbconnection = ConnectionObject('autocommit')
    dbcursor = dbconnection.cursor()

    po = IndexmakerInputParsingObject(author, work, passage, endpoint,
                                      citationdelimiter)

    ao = po.authorobject
    wo = po.workobject
    psg = po.passageaslist
    stop = po.endpointlist

    if not work:
        wo = makeanemptywork('gr0000w000')

    # bool
    useheadwords = session['headwordindexing']

    allworks = list()
    output = list()
    # cdict maps work universalid -> (firstline, lastline) to index
    cdict = dict()
    segmenttext = str()
    valid = True

    if ao and work and psg and stop:
        # a span within one work
        start = psg
        firstlinenumber = finddblinefromincompletelocus(wo, start, dbcursor)
        lastlinenumber = finddblinefromincompletelocus(wo,
                                                       stop,
                                                       dbcursor,
                                                       findlastline=True)
        if firstlinenumber['code'] == 'success' and lastlinenumber[
                'code'] == 'success':
            cdict = {
                wo.universalid:
                (firstlinenumber['line'], lastlinenumber['line'])
            }
            startln = dblineintolineobject(
                grabonelinefromwork(ao.universalid, firstlinenumber['line'],
                                    dbcursor))
            stopln = dblineintolineobject(
                grabonelinefromwork(ao.universalid, lastlinenumber['line'],
                                    dbcursor))
        else:
            msg = '"indexspan/" could not find first and last: {a}w{b} - {c} TO {d}'
            consolewarning(msg.format(a=author, b=work, c=passage, d=endpoint))
            startln = makeablankline(work, 0)
            stopln = makeablankline(work, 1)
            valid = False
        segmenttext = 'from {a} to {b}'.format(a=startln.shortlocus(),
                                               b=stopln.shortlocus())
    elif ao and work and psg:
        # subsection of a work of an author
        progresspolldict[pollid].statusis(
            'Preparing a partial index to {t}'.format(t=wo.title))
        startandstop = textsegmentfindstartandstop(ao, wo, psg, dbcursor)
        startline = startandstop['startline']
        endline = startandstop['endline']
        cdict = {wo.universalid: (startline, endline)}
    elif ao and work:
        # one work
        progresspolldict[pollid].statusis(
            'Preparing an index to {t}'.format(t=wo.title))
        startline = wo.starts
        endline = wo.ends
        cdict = {wo.universalid: (startline, endline)}
    elif ao:
        # whole author
        allworks = [
            '{w}  ⇒ {t}'.format(w=w.universalid[6:10], t=w.title)
            for w in ao.listofworks
        ]
        allworks.sort()
        progresspolldict[pollid].statusis(
            'Preparing an index to the works of {a}'.format(a=ao.shortname))
        for wkid in ao.listworkids():
            cdict[wkid] = (workdict[wkid].starts, workdict[wkid].ends)
    else:
        # we do not have a valid selection
        valid = False
        output = ['invalid input']

    if not stop:
        segmenttext = '.'.join(psg)

    if valid and justvocab:
        # vocab callers only want the line ranges; skip the HTML build
        dbconnection.connectioncleanup()
        del progresspolldict[pollid]
        return cdict

    if valid:
        output = buildindextowork(cdict, progresspolldict[pollid],
                                  useheadwords, dbcursor)

    # get ready to send stuff to the page
    count = len(output)

    try:
        # thousands separators for the word count; fall back to a bare str
        locale.setlocale(locale.LC_ALL, 'en_US')
        count = locale.format_string('%d', count, grouping=True)
    except locale.Error:
        count = str(count)

    progresspolldict[pollid].statusis('Preparing the index HTML')
    indexhtml = wordindextohtmltable(output, useheadwords)

    buildtime = time.time() - starttime
    buildtime = round(buildtime, 2)
    progresspolldict[pollid].deactivate()

    if not ao:
        ao = makeanemptyauthor('gr0000')

    results = dict()
    results['authorname'] = avoidsmallvariants(ao.shortname)
    results['title'] = avoidsmallvariants(wo.title)
    results['structure'] = avoidsmallvariants(wo.citation())
    results['worksegment'] = segmenttext
    results['elapsed'] = buildtime
    results['wordsfound'] = count
    results['indexhtml'] = indexhtml
    results['keytoworks'] = allworks
    results['newjs'] = supplementalindexjs()
    results = json.dumps(results)

    dbconnection.connectioncleanup()
    del progresspolldict[pollid]

    return results
# Esempio n. 8
# 0
def sessionselectionsinfo(authordict: dict, workdict: dict) -> dict:
    """
    build the selections html either for a or b:
        #selectionstable + #selectioninfocell
        #selectionstable + #exclusioninfocell
    there are seven headings to populate
        [a] author classes
        [b] work genres
        [c] author location
        [d] work provenance
        [e] author selections
        [f] work selections
        [g] passage selections

    id numbers need to be attached to the selections so that they can be double-clicked so as to delete them

    :param authordict: map of author id -> author object
    :param workdict: map of work universalid -> work object
    :return: dict with 'selections', 'exclusions', 'numberofselections', 'jstuples'
    """

    returndict = dict()
    thejs = list()

    tit = 'title="Double-click to remove this item"'

    try:
        # it is possible to hit this function before the session has been set, so...
        session['auselections']
    except KeyError:
        probeforsessionvariables()

    sessionsearchlist = session['auselections'] + session['agnselections'] + session['wkgnselections'] + \
                        session['psgselections'] + session['wkselections'] + session['alocselections'] + \
                        session['wlocselections']

    for selectionorexclusion in ['selections', 'exclusions']:
        thehtml = list()
        # if there are no explicit selections, then
        if not sessionsearchlist and selectionorexclusion == 'selections':
            thehtml.append('<span class="picklabel">Authors</span><br />')
            thehtml.append('[All in active corpora less exclusions]<br />')

        if selectionorexclusion == 'exclusions' and not sessionsearchlist and session['spuria'] == 'Y' and \
          not session['wkgnexclusions'] and not session['agnexclusions'] and not session['auexclusions']:
            thehtml.append('<span class="picklabel">Authors</span><br />')
            thehtml.append('[No exclusions]<br />')

        # [a] author classes
        v = 'agn'
        var = v + selectionorexclusion
        if session[var]:
            thehtml.append(
                '<span class="picklabel">Author categories</span><br />')
            htmlandjs = selectionlinehtmlandjs(v, selectionorexclusion,
                                               session)
            thehtml += htmlandjs['html']
            thejs += htmlandjs['js']

        # [b] work genres
        v = 'wkgn'
        var = v + selectionorexclusion
        if session[var]:
            thehtml.append('<span class="picklabel">Work genres</span><br />')
            htmlandjs = selectionlinehtmlandjs(v, selectionorexclusion,
                                               session)
            thehtml += htmlandjs['html']
            thejs += htmlandjs['js']

        # [c] author location
        v = 'aloc'
        var = v + selectionorexclusion
        if session[var]:
            thehtml.append(
                '<span class="picklabel">Author location</span><br />')
            htmlandjs = selectionlinehtmlandjs(v, selectionorexclusion,
                                               session)
            thehtml += htmlandjs['html']
            thejs += htmlandjs['js']

        # [d] work provenance
        v = 'wloc'
        var = v + selectionorexclusion
        if session[var]:
            thehtml.append(
                '<span class="picklabel">Work provenance</span><br />')
            htmlandjs = selectionlinehtmlandjs(v, selectionorexclusion,
                                               session)
            thehtml += htmlandjs['html']
            thejs += htmlandjs['js']

        # [e] authors
        v = 'au'
        var = v + selectionorexclusion
        if session[var]:
            thehtml.append('<span class="picklabel">Authors</span><br />')
            localval = -1
            for s in session[var]:
                localval += 1
                ao = authordict[s]
                thehtml.append(
                    '<span class="{v}{soe} selection" id="{var}_0{lv}" {tit}>{s}</span>'
                    '<br />'.format(v=v,
                                    soe=selectionorexclusion,
                                    var=var,
                                    lv=localval,
                                    s=ao.akaname,
                                    tit=tit))
                thejs.append((var, localval))

        # [f] works
        v = 'wk'
        var = v + selectionorexclusion
        if session[var] and selectionorexclusion == 'exclusions' and session[
                'spuria'] == 'N':
            thehtml.append('<span class="picklabel">Works</span><br />')
            thehtml.append('[All non-selected spurious works]<br />')

        if session[var]:
            thehtml.append('<span class="picklabel">Works</span><br />')
            if selectionorexclusion == 'exclusions' and session[
                    'spuria'] == 'N':
                thehtml.append('[Non-selected spurious works]<br />')
            localval = -1
            for s in session[var]:
                localval += 1
                # the first six characters of a work id name its author
                uid = s[:6]
                ao = authordict[uid]
                wk = workdict[s]
                thehtml.append(
                    '<span class="{v}{soe} selection" id="{var}_0{lv}" {tit}>{au}, '
                    '<span class="pickedwork">{wk}</span></span>'
                    '<br />'.format(v=v,
                                    var=var,
                                    soe=selectionorexclusion,
                                    lv=localval,
                                    au=ao.akaname,
                                    tit=tit,
                                    wk=wk.title))
                thejs.append((var, localval))

        # [g] passages
        v = 'psg'
        var = v + selectionorexclusion
        if session[var]:
            psgtemplate = '<span class="{v}{soe} selection" id="{var}_0{lv}" {tit}>{au}, <span class="pickedwork">{wk}</span>&nbsp; <span class="pickedsubsection">{loc}</span></span><br />'
            spantemplate = 'from {a} to {b}'
            thehtml.append('<span class="picklabel">Passages</span><br />')
            localval = -1
            for s in session[var]:
                localval += 1
                uid = s[:6]
                ao = authordict[uid]
                loc = str()
                # watch out for heterogenous passage selection formats; only _AT_ and _FROM_ exist ATM
                # session[psgselections] = ['lt0474w005_FROM_4501_TO_11915', 'lt2806w002_AT_3|4|5']
                # NOTE(review): if a selection matched neither format, 'wk'
                # below would be unbound/stale — presumably impossible ATM
                if '_AT_' in s:
                    locus = s.split('_AT_')[1].split('|')
                    locus.reverse()
                    citationtuple = tuple(locus)
                    for w in ao.listofworks:
                        if w.universalid == s[0:10]:
                            wk = w
                    loc = prolixlocus(wk, citationtuple)
                elif '_FROM_' in s:
                    dbconnection = ConnectionObject()
                    dbcursor = dbconnection.cursor()
                    wk = workdict[s[0:10]]
                    locus = s.split('_FROM_')[1]
                    start = locus.split('_TO_')[0]
                    stop = locus.split('_TO_')[1]
                    startln = dblineintolineobject(
                        grabonelinefromwork(uid, start, dbcursor))
                    stopln = dblineintolineobject(
                        grabonelinefromwork(uid, stop, dbcursor))
                    dbconnection.connectioncleanup()
                    # print('_FROM_', start, stop, startln.uncleanlocustuple(), stopln.uncleanlocustuple())
                    loc = spantemplate.format(a=startln.prolixlocus(),
                                              b=stopln.prolixlocus())

                thehtml.append(
                    psgtemplate.format(v=v,
                                       var=var,
                                       soe=selectionorexclusion,
                                       lv=localval,
                                       au=ao.akaname,
                                       wk=wk.title,
                                       loc=loc,
                                       tit=tit))
                thejs.append((var, localval))

        returndict[selectionorexclusion] = '\n'.join(thehtml)

    # total count across all selection and exclusion categories
    scount = len(session['auselections'] + session['wkselections'] +
                 session['agnselections'] + session['wkgnselections'] +
                 session['psgselections'] + session['alocselections'] +
                 session['wlocselections'])
    scount += len(session['auexclusions'] + session['wkexclusions'] +
                  session['agnexclusions'] + session['wkgnexclusions'] +
                  session['psgexclusions'] + session['alocexclusions'] +
                  session['wlocexclusions'])

    returndict['numberofselections'] = -1
    if scount > 0:
        returndict['numberofselections'] = scount

    returndict['jstuples'] = thejs

    return returndict
def findvalidlevelvalues(workobject: dbOpus, partialcitationtuple: tuple,
                         cursor) -> LowandHighInfo:
    """
    tell me some of a citation and i can tell you what is a valid choice at the next step
    i expect the lowest level to be stored at position 0 in the tuple
    note that you should not send me a full citation because i will look at lowestlevel-1

    sample input:
        workid = lt0474w015
        workstructure = {0: 'line', 1: 'section'}
        partialcitationtuple = ('13',)
    out:
        (Cicero, Pro Sulla 13)

    note that this is mildly costly as a function call when you convert all of the results to lineobjects
    OOP is a lot easier; but you pay a price

    this also means that lowering the init costs of dbworklines is a good idea

    :param workobject: the dbOpus being probed
    :param partialcitationtuple: the citation values known so far, lowest level first
    :param cursor: active database cursor
    :return: a LowandHighInfo with the low, high, and full range of valid values
    """

    workid = workobject.universalid
    workstructure = workobject.structure

    partialcitation = list(partialcitationtuple)
    availablelevels = len(workstructure)

    atlevel = availablelevels - len(partialcitation)
    # cheat in the case where you want to find the top by sending a 'noncitation': 'top'
    # e.g.: /getstructure/gr0003w001/firstline
    if partialcitationtuple[0] == 'firstline':
        atlevel = availablelevels
    if atlevel < 1:
        # i am confused; threatening to probe for level "-1"
        # a selection at level00 will do this to me
        #   /getstructure/gr0003w001/3|36|5|3
        # this needs to be made uncontroversial:
        #   /getstructure/gr0003w001/3|36|5
        # and so: massage the data
        atlevel = 1
        try:
            partialcitation.pop()
        except IndexError:
            atlevel = availablelevels

    audb = workid[0:6]
    # the lineobject attribute name for the level we are probing
    lvl = 'l' + str(atlevel - 1)

    # select level_00_value from gr0565w001 where level_03_value='3' AND level_02_value='2' AND level_01_value='1' AND level_00_value NOT IN ('t') ORDER BY index ASC;
    # select level_01_value from gr0565w001 where level_03_value='2' AND level_02_value='1' AND level_01_value NOT IN ('t') ORDER BY index ASC;
    # (an unused 'lvl=' keyword formerly passed to format() has been dropped)
    query = 'SELECT {wltmp} FROM {db} WHERE ( wkuniversalid=%s ) AND '.format(
        wltmp=worklinetemplate, db=audb)
    datalist = [workid]
    for level in range(availablelevels - 1, atlevel - 1, -1):
        query += ' level_0{lvl}_value=%s AND '.format(lvl=level)
        datalist.append(partialcitationtuple[availablelevels - level - 1])
    query += 'level_0{lvl}_value NOT IN (%s) ORDER BY index'.format(
        lvl=atlevel - 1)
    # 't' lines are title lines and are never valid citation values
    datalist.append('t')
    data = tuple(datalist)

    cursor.execute(query, data)

    results = cursor.fetchall()
    if results:
        lines = [dblineintolineobject(r) for r in results]
    else:
        lines = None

    if not lines:
        # sentinel answer: '-9999' low value flags "nothing found"
        lowandhighobject = LowandHighInfo(availablelevels, atlevel - 1,
                                          workstructure[atlevel - 1], '-9999',
                                          '', [''])
        return lowandhighobject

    low = getattr(lines[0], lvl)
    high = getattr(lines[-1], lvl)
    rng = [getattr(ln, lvl) for ln in lines]
    # drop dupes while keeping first-seen (index) order
    # BUGFIX: the old code dumped the values into a set and then listified
    # the set, which does not preserve insertion order despite the comment
    # claiming otherwise (the later sort masked the problem)
    seen = set()
    deduped = list()
    for r in rng:
        if r not in seen:
            seen.add(r)
            deduped.append(r)
    rng = deduped

    try:
        # numeric sort when possible so that '10' follows '9', not '1'
        rng = [int(r) for r in rng]
        rng = sorted(rng)
        rng = [str(r) for r in rng]
    except ValueError:
        rng = sorted(rng)

    lowandhighobject = LowandHighInfo(availablelevels, atlevel - 1,
                                      workstructure[atlevel - 1], low, high,
                                      rng)

    return lowandhighobject
# Esempio n. 10
# 0
def buildtext(work: str, firstline: int, lastline: int, linesevery: int,
              cursor) -> str:
    """
	make a readable/printable version of a work: an HTML <table> in which each
	row holds one db line, with a citation inserted every 'linesevery' lines

	:param work: universalid of the work, e.g. 'gr0012w001' (first six chars name the author table)
	:param firstline: db index of the first line to display
	:param lastline: db index of the last line to display
	:param linesevery: print a line number in the citation column every N lines
	:param cursor: active db cursor
	:return: the assembled html as a string
	"""

    workobject = workdict[work]

    # the author table is named by the first six characters of the work id
    auid = work[0:6]

    qtemplate = """
	SELECT {wltmp} FROM {a} WHERE (index >= %s and index <= %s) ORDER BY index ASC
	"""

    query = qtemplate.format(wltmp=worklinetemplate, a=auid)
    data = (firstline, lastline)
    cursor.execute(query, data)
    results = cursor.fetchall()

    output = ['<table>\n']

    # consecutive lines can get numbered twice
    # 660	       ἤν τιϲ ὀφείλων ἐξαρνῆται. Πρ. πόθεν οὖν ἐδάνειϲ’ ὁ
    # 660	           δανείϲαϲ,
    avoiddoubletap = False

    # the per-row html skeleton: citation column, text column, notes column
    linetemplate = determinelinetemplate()

    # pull these outside the "line in results" loop lest you compile the regex 12000x over 1000 lines
    bracketfinder = {
        'square': {
            'ocreg': re.compile(r'\[(.*?)(\]|$)'),
            'coreg': re.compile(r'(^|\[)(.*?)\]'),
            'class': 'editorialmarker_squarebrackets',
            'o': '[',
            'c': ']'
        },
        'round': {
            'ocreg': re.compile(r'\((.*?)(\)|$)'),
            'coreg': re.compile(r'(^|\()(.*?)\)'),
            'class': 'editorialmarker_roundbrackets',
            'o': '(',
            'c': ')'
        },
        'angled': {
            'ocreg': re.compile(r'⟨(.*?)(⟩|$)'),
            'coreg': re.compile(r'(^|⟨)(.*?)⟩'),
            'class': 'editorialmarker_angledbrackets',
            'o': '⟨',
            'c': '⟩'
        },
        'curly': {
            'ocreg': re.compile(r'\{(.*?)(\}|$)'),
            'coreg': re.compile(r'(^|\{)(.*?)\}'),
            'class': 'editorialmarker_curlybrackets',
            'o': '{',
            'c': '}'
        }
    }

    # a bracket opened on one line but not closed on it means the editorial
    # condition continues onto the following line(s)
    openfinder = {
        'square': {
            'regex':
            re.compile(r'\[[^\]]{0,}$'),
            'exceptions':
            [re.compile(r'\[(ϲτρ|ἀντ)\. .\.'),
             re.compile(r'\[ἐπῳδόϲ')]
        },
        'round': {
            'regex': re.compile(r'\([^\)]{0,}$')
        },
        'angled': {
            'regex': re.compile(r'⟨[^⟩]{0,}$')
        },
        'curly': {
            'regex': re.compile(r'\{[^\}]{0,}$')
        },
    }

    closefinder = {
        'square': {
            'c': re.compile(r'\]')
        },
        'round': {
            'c': re.compile(r'\)')
        },
        'angled': {
            'c': re.compile(r'⟩')
        },
        'curly': {
            'c': re.compile(r'\}')
        },
    }

    if results:
        previousline = dblineintolineobject(results[0])
        brackettypes = findactivebrackethighlighting()
        # per-bracket-type flag: is an editorial bracket still open
        # when we arrive at the current line?
        editorialcontinuation = {
            'square': False,
            'round': False,
            'curly': False,
            'angled': False
        }

        lines = [dblineintolineobject(line) for line in results]
        lines = paragraphformatting(lines)  # polish up the HTML of the lines
        for thisline in lines:
            if workobject.isnotliterary(
            ) and thisline.index == workobject.starts:
                # line.index == workobject.starts added as a check because
                # otherwise you will re-see date info in the middle of some documents
                # it gets reasserted with a CD block reinitialization
                metadata = checkfordocumentmetadata(thisline, workobject)
                if metadata:
                    output.append(metadata)

            if brackettypes:
                # mark up the bracketed spans and then update the open/closed
                # state so the next line knows what is still continuing
                columnb = thisline.markeditorialinsersions(
                    editorialcontinuation, bracketfinder=bracketfinder)
                editorialcontinuation = {
                    t: setcontinuationvalue(thisline,
                                            previousline,
                                            editorialcontinuation[t],
                                            t,
                                            openfinder=openfinder,
                                            closefinder=closefinder)
                    for t in brackettypes
                }
            else:
                columnb = thisline.markedup

            # only show a citation when the structural level changes
            if thisline.samelevelas(previousline) is not True:
                columna = thisline.shortlocus()
            else:
                columna = str()
            try:
                linenumber = int(thisline.l0)
            except ValueError:
                # 973b is not your friend
                linenumber = 0
            if linenumber % linesevery == 0 and not avoiddoubletap:
                columna = thisline.locus()
                avoiddoubletap = True
            else:
                avoiddoubletap = False

            notes = '; '.join(thisline.insetannotations())

            if columna and session['simpletextoutput']:
                columna = '({a})'.format(a=columna)

            linehtml = linetemplate.format(ca=columna, cb=columnb, cc=notes)

            output.append(linehtml)

            previousline = thisline

    output.append('</table>\n')

    html = '\n'.join(output)

    return html
Esempio n. 11
0
def buildbrowseroutputobject(authorobject: dbAuthor, workobject: dbOpus,
                             locusindexvalue: int,
                             dbcursor) -> BrowserOutputObject:
    """

	this function does a lot of work via a number of subfunctions

	lots of refactoring required if you change anything...

	note that the amount of context and the numbering frequency are NOT
	parameters: they are read from session['browsercontext'] and from
	hipparchia.config['SHOWLINENUMBERSEVERY'] below

	:param authorobject: the dbAuthor who owns the work
	:param workobject: the dbOpus being browsed
	:param locusindexvalue: db index of the line the browser is centered on
	:param dbcursor: active db cursor
	:return: a BrowserOutputObject whose .browserhtml is ready to display
	"""

    thiswork = workobject.universalid
    linesofcontext = int(session['browsercontext'])
    numbersevery = hipparchia.config['SHOWLINENUMBERSEVERY']

    # [a] acquire the lines we need to display
    surroundinglines = simplecontextgrabber(workobject.authorid,
                                            locusindexvalue, linesofcontext,
                                            dbcursor)
    lines = [dblineintolineobject(l) for l in surroundinglines]
    # drop context lines that spilled over into a neighboring work
    lines = [l for l in lines if l.wkuinversalid == thiswork]

    # default to the first line lest locusindexvalue not be in the window
    focusline = lines[0]
    for line in lines:
        if line.index == locusindexvalue:
            focusline = line

    passage = BrowserPassageObject(authorobject, workobject, locusindexvalue)
    passage.focusline = focusline
    passage.biblio = formatpublicationinfo(workobject.publication_info)
    passage.citation = locusintocitation(workobject, focusline)

    previousline = lines[0]
    brackettypes = findactivebrackethighlighting()
    # per-bracket-type flag: is an editorial bracket still open at this line?
    continuationdict = {
        'square': False,
        'round': False,
        'curly': False,
        'angled': False
    }

    # in db-debug mode every line gets prefixed with its db id
    lineprefix = str()
    if session['debugdb']:
        lineprefix = '<smallcode>{id}&nbsp;&nbsp;&nbsp;</smallcode>&nbsp;'

    # [b] format the lines and insert them into the BrowserPassageObject
    # [b1] check to see if this line is part of a larger formatting block: really only servius?

    lines = paragraphformatting(lines)

    # [b2]
    for line in lines:
        if workobject.isnotliterary() and line.index == workobject.starts:
            # line.index == workobject.starts added as a check because
            # otherwise you will re-see date info in the middle of some documents:
            # it gets reasserted with a CD block reinitialization
            metadata = checkfordocumentmetadata(line, workobject)
            if metadata:
                passage.browsedlines.append(metadata)

        if session['debughtml']:
            columnb = line.showlinehtml()
        else:
            columnb = insertparserids(line, continuationdict)

        if brackettypes:
            continuationdict = {
                t: setcontinuationvalue(line, previousline,
                                        continuationdict[t], t)
                for t in brackettypes
            }

        if line.index == focusline.index:
            # highlight the citationtuple line
            columna = line.locus()
            columnb = '<span class="focusline">{c}</span>'.format(c=columnb)
        else:
            try:
                linenumber = int(line.l0)
            except ValueError:
                # 973b is not your friend
                linenumber = 0
            if line.samelevelas(previousline) is not True:
                columna = line.shortlocus()
            elif linenumber % numbersevery == 0:
                columna = line.locus()
            else:
                # do not insert a line number or special formatting
                columna = str()

        prefix = lineprefix.format(id=line.getlineurl())
        columnb = prefix + columnb

        notes = '; '.join(line.insetannotations())

        if columna and session['simpletextoutput']:
            columna = '({a})'.format(a=columna)

        linehtml = passage.linetemplate.format(l=columnb, n=notes, c=columna)

        passage.browsedlines.append(linehtml)
        previousline = line

    # [c] build the output
    outputobject = BrowserOutputObject(authorobject, workobject,
                                       locusindexvalue)

    outputobject.browserhtml = passage.generatepassagehtml()

    return outputobject
def subqueryphrasesearch(workerid, foundlineobjects: ListProxy,
                         searchphrase: str, listofplacestosearch: ListProxy,
                         searchobject: SearchObject,
                         dbconnection) -> ListProxy:
    """

    foundlineobjects, searchingfor, searchlist, commitcount, whereclauseinfo, activepoll

    use subquery syntax to grab multi-line windows of text for phrase searching

    line ends and line beginning issues can be overcome this way, but then you have plenty of
    bookkeeping to do to get the proper results focussed on the right line

    tablestosearch:
        ['lt0400', 'lt0022', ...]

    a search inside of Ar., Eth. Eud.:

        SELECT secondpass.index, secondpass.accented_line
                FROM (SELECT firstpass.index, firstpass.linebundle, firstpass.accented_line FROM
                    (SELECT index, accented_line,
                        concat(accented_line, ' ', lead(accented_line) OVER (ORDER BY index ASC)) as linebundle
                        FROM gr0086 WHERE ( (index BETWEEN 15982 AND 18745) ) ) firstpass
                    ) secondpass
                WHERE secondpass.linebundle ~ %s  LIMIT 200

    a search in x., hell and x., mem less book 3 of hell and book 2 of mem:
        SELECT secondpass.index, secondpass.accented_line
                FROM (SELECT firstpass.index, firstpass.linebundle, firstpass.accented_line FROM
                    (SELECT index, accented_line,
                        concat(accented_line, ' ', lead(accented_line) OVER (ORDER BY index ASC)) as linebundle
                        FROM gr0032 WHERE ( (index BETWEEN 1 AND 7918) OR (index BETWEEN 7919 AND 11999) ) AND ( (index NOT BETWEEN 1846 AND 2856) AND (index NOT BETWEEN 8845 AND 9864) ) ) firstpass
                    ) secondpass
                WHERE secondpass.linebundle ~ %s  LIMIT 200

    :param workerid: id number of this worker process
    :param foundlineobjects: shared list that accumulates the hits
    :param searchphrase: the phrase sought
    :param listofplacestosearch: shared list of author tables still to search
    :param searchobject: the active SearchObject
    :param dbconnection: db connection for this worker
    :return: foundlineobjects (the shared ListProxy)
    """
    # print('subqueryphrasesearch()')
    so = searchobject
    activepoll = so.poll

    # build incomplete sfo that will handle everything other than iteratethroughsearchlist()
    sfo = returnsearchfncobject(workerid, foundlineobjects,
                                listofplacestosearch, so, dbconnection, None)

    querytemplate = """
		SELECT secondpass.index, secondpass.{co} FROM 
			(SELECT firstpass.index, firstpass.linebundle, firstpass.{co} FROM
					(SELECT index, {co}, concat({co}, ' ', lead({co}) OVER (ORDER BY index ASC)) AS linebundle
						FROM {db} {whr} ) firstpass
			) secondpass
		WHERE secondpass.linebundle ~ %s {lim}"""

    wheretemplate = """
	WHERE EXISTS
		(SELECT 1 FROM {tbl}_includelist_{a} incl WHERE incl.includeindex = {tbl}.index)
	"""

    # substringsearch() needs ability to CREATE TEMPORARY TABLE
    sfo.dbconnection.setreadonly(False)
    dbcursor = sfo.dbconnection.cursor()

    qcomb = QueryCombinator(searchphrase)
    # the last item is the full phrase:  ('one two three four five', '')
    combinations = qcomb.combinations()
    combinations.pop()
    # lines start/end
    sp = re.sub(r'^\s', r'(^|\\s)', searchphrase)
    sp = re.sub(r'\s$', r'(\\s|$)', sp)
    # on the reasoning behind the following substitution see 'DEBUGGING notes: SQL oddities' above
    # sp = re.sub(r' ', r'\\s', sp)

    if not so.onehit:
        lim = ' LIMIT ' + str(so.cap)
    else:
        # the windowing problem means that '1' might be something that gets discarded
        lim = ' LIMIT 5'

    if so.redissearchlist:
        listofplacestosearch = True

    while listofplacestosearch and activepoll.gethits() <= so.cap:
        # sfo.getnextfnc() also takes care of the commitcount
        authortable = sfo.getnextfnc()
        sfo.updatepollremaining()

        if authortable:
            whr = str()
            r = so.indexrestrictions[authortable]
            if r['type'] == 'between':
                indexwedwhere = buildbetweenwhereextension(authortable, so)
                if indexwedwhere != '':
                    # indexwedwhere will come back with an extraneous ' AND'
                    indexwedwhere = indexwedwhere[:-4]
                    whr = 'WHERE {iw}'.format(iw=indexwedwhere)
            elif r['type'] == 'temptable':
                avoidcollisions = assignuniquename()
                q = r['where']['tempquery']
                q = re.sub('_includelist',
                           '_includelist_{a}'.format(a=avoidcollisions), q)
                dbcursor.execute(q)
                whr = wheretemplate.format(tbl=authortable, a=avoidcollisions)

            query = querytemplate.format(db=authortable,
                                         co=so.usecolumn,
                                         whr=whr,
                                         lim=lim)
            data = (sp, )
            # print('subqueryphrasesearch() find indices() q,d:\n\t',query, data)
            dbcursor.execute(query, data)
            indices = [i[0] for i in dbcursor.fetchall()]
            # this will yield a bunch of windows: you need to find the centers; see 'while...' below

            locallineobjects = list()
            if indices:
                for i in indices:
                    query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(
                        wtmpl=worklinetemplate, tb=authortable)
                    data = (i, )
                    # print('subqueryphrasesearch() iterate through indices() q,d:\n\t', query, data)
                    dbcursor.execute(query, data)
                    locallineobjects.append(
                        dblineintolineobject(dbcursor.fetchone()))

            locallineobjects.reverse()
            # debugging
            # for l in locallineobjects:
            #	print(l.universalid, l.locus(), getattr(l,so.usewordlist))

            gotmyonehit = False
            while locallineobjects and activepoll.gethits(
            ) <= so.cap and not gotmyonehit:
                # windows of indices come back: e.g., three lines that look like they match when only one matches [3131, 3132, 3133]
                # figure out which line is really the line with the goods
                # it is not nearly so simple as picking the 2nd element in any run of 3: no always runs of 3 + matches in
                # subsequent lines means that you really should check your work carefully; this is not an especially costly
                # operation relative to the whole search and esp. relative to the speed gains of using a subquery search
                lineobject = locallineobjects.pop()
                if re.search(sp, getattr(lineobject, so.usewordlist)):
                    sfo.addnewfindstolistoffinds([lineobject])
                    activepoll.addhits(1)
                    if so.onehit:
                        gotmyonehit = True
                else:
                    try:
                        nextline = locallineobjects[0]
                    except IndexError:
                        nextline = makeablankline('gr0000w000', -1)

                    if lineobject.wkuinversalid != nextline.wkuinversalid or lineobject.index != (
                            nextline.index - 1):
                        # you grabbed the next line on the pile (e.g., index = 9999), not the actual next line (e.g., index = 101)
                        # usually you won't get a hit by grabbing the next db line, but sometimes you do...
                        query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(
                            wtmpl=worklinetemplate, tb=authortable)
                        data = (lineobject.index + 1, )
                        # print('subqueryphrasesearch() "while locallineobjects..." loop q,d:\n\t', query, data)
                        dbcursor.execute(query, data)
                        # the fetched row can be None at the end of a table;
                        # was a bare 'except:', which would also swallow
                        # KeyboardInterrupt/SystemExit
                        try:
                            nextline = dblineintolineobject(
                                dbcursor.fetchone())
                        except Exception:
                            nextline = makeablankline('gr0000w000', -1)

                    for c in combinations:
                        tail = c[0] + '$'
                        head = '^' + c[1]
                        # debugging
                        # print('re',getattr(lo,so.usewordlist),tail, head, getattr(next,so.usewordlist))

                        t = False
                        h = False
                        try:
                            t = re.search(tail,
                                          getattr(lineobject, so.usewordlist))
                        except re.error:
                            pass
                        try:
                            h = re.search(head,
                                          getattr(nextline, so.usewordlist))
                        except re.error:
                            pass

                        if t and h:
                            sfo.addnewfindstolistoffinds([lineobject])
                            activepoll.addhits(1)
                            if so.onehit:
                                gotmyonehit = True
        else:
            # redis will return None for authortable if the set is now empty
            listofplacestosearch = None

    sfo.listcleanup()

    if sfo.needconnectioncleanup:
        sfo.dbconnection.connectioncleanup()

    return foundlineobjects
def withinxwords(workdbname: str, searchobject: SearchObject,
                 dbconnection) -> List[dbWorkLine]:
    """

    after finding term one, keep only the hits where term two does (or, for a
    'not near' search, does not) occur within n words of it

    getting to term two:
        find term one and slice it out of its line
        then build forwards and backwards within the requisite range
        then see if you get a match in that range

    if looking for 'paucitate' near 'imperator' you will find:
        'romani paucitate seruorum gloriatos itane tandem ne'
    this will become:
        'romani' + 'seruorum gloriatos itane tandem ne'

    :param workdbname: name of the author table to search
    :param searchobject: the active SearchObject
    :param dbconnection: db connection for this worker
    :return: the db rows that survive the proximity check
    """

    so = searchobject
    cursor = dbconnection.cursor()
    dbconnection.setautocommit()

    # substringsearch() normally caps its results at session['maxresults'];
    # "Roman" near "Aetol" will get 3786 hits in Livy, but only maxresults would
    # come back for checking and the Aetolians are likely not among them, so
    # raise the ceiling to see every candidate
    generoustemplimit = 9999

    if so.lemma:
        batchsize = hipparchia.config['LEMMACHUNKSIZE']
        allforms = so.lemma.formlist
        batches = [
            allforms[j:j + batchsize]
            for j in range(0, len(allforms), batchsize)
        ]

        candidates = list()
        for searchregex in (wordlistintoregex(b) for b in batches):
            candidates.extend(
                substringsearch(searchregex, workdbname, so, cursor,
                                generoustemplimit))
        so.usewordlist = 'polytonic'
    else:
        candidates = list(
            substringsearch(so.termone, workdbname, so, cursor,
                            generoustemplimit))

    confirmed = list()
    for candidate in candidates:
        candidateline = dblineintolineobject(candidate)

        surroundings = grableadingandlagging(candidateline, so, cursor)
        # print(candidateline.universalid, so.termtwo, '\n\t[lag] ', surroundings['lag'], '\n\t[lead]', surroundings['lead'])

        termtwofound = (re.search(so.termtwo, surroundings['lead'])
                        or re.search(so.termtwo, surroundings['lag']))

        if so.near and termtwofound:
            confirmed.append(candidate)
        elif not so.near and not termtwofound:
            confirmed.append(candidate)

    return confirmed
def lemmatizedwithinxlines(searchobject: SearchObject, hitlist: List[tuple],
                           dbcursor):
    """

    the alternate way of doing withinxlines: ask regex to do the heavy lifting

    for every hit on term one, grab every line within session['proximity'] of
    it, bundle the words on those lines into one string, and then check that
    string for term two

    NOTE(review): upstream marked this "BROKEN ATM" in 1.7.x; the per-author
    line collection below used to be overwritten on each pass of the loop
    instead of merged, so only the last author's context lines survived —
    fixed here, but confirm results against simplewithinxlines()

    nasty edge case 'fire' near 'burn' in Homer:

    simplewithinxlines()
      Sought all 5 known forms of »πῦρ« within 1 lines of all 359 known forms of »καίω«
      Searched 3 texts and found 24 passages (621.25s)

    lemmatizedwithinxlines()
       Sought all 5 known forms of »πῦρ« within 1 lines of all 359 known forms of »καίω«
       Searched 3 texts and found 24 passages (2.82s)

    note that this function is often slightly slower than simplewithinxlines(), but it does seem to be able
    to avoid the catastrophe

    lemmatized vs non-lemmatized is probably the key difference when it comes to speed

    :param searchobject: the active SearchObject
    :param hitlist: db rows for the lines on which term one was found
    :param dbcursor: active db cursor
    :return: decomposed line objects for the hits that pass the proximity test
    """

    so = searchobject

    columconverter = {
        'marked_up_line': 'markedup',
        'accented_line': 'polytonic',
        'stripped_line': 'stripped'
    }
    col = columconverter[so.usecolumn]

    prox = int(so.session['proximity'])

    # note that at the moment we arrive here with a one-work per worker policy
    # that is all of the hits will come from the same table
    # this means extra/useless sifting below, but perhaps it is safer to be wasteful now lest we break later

    fullmatches = set()  # set to avoid duplicate hits

    hitlinelist = [dblineintolineobject(h) for h in hitlist]

    # map author table -> set of '{workid}_ln_{index}' keys we will need
    linesintheauthors = dict()
    for hl in hitlinelist:
        wkid = hl.universalid
        # prox = 2 and index = 100 yields [98, 99, 100, 101, 102]
        environs = set(range(hl.index - prox, hl.index + prox + 1))
        environs = ['{w}_ln_{x}'.format(w=wkid, x=e) for e in environs]
        linesintheauthors.setdefault(wkid[0:6], set()).update(environs)

    # now grab all of the lines you might need
    # BUG FIX: this used to assign (i.e. overwrite) linecollection on every
    # pass, so with more than one author only the final author's lines were
    # retained; merge each author's lines instead
    linecollection = dict()
    for author, neededlines in linesintheauthors.items():
        if neededlines:
            grabbed = grablistoflines(author, list(neededlines), dbcursor)
            linecollection.update({
                '{w}_ln_{x}'.format(w=gl.wkuinversalid, x=gl.index): gl
                for gl in grabbed
            })

    # then associate all of the surrounding words with those lines
    wordbundles = dict()
    for hl in hitlinelist:
        wkid = hl.universalid
        environs = set(range(hl.index - prox, hl.index + prox + 1))
        mylines = list()
        for e in environs:
            try:
                mylines.append(linecollection['{w}_ln_{x}'.format(w=wkid,
                                                                  x=e)])
            except KeyError:
                # you went out of bounds and tried to grab something that is not really there
                # KeyError: 'lt1515w001_ln_1175'
                # line 1175 is actually the first line of lt1515w002...
                pass

        mywords = [getattr(ml, col) for ml in mylines]
        mywords = flattenlistoflists([w.split(' ') for w in mywords])
        wordbundles[hl] = ' '.join(mywords)

    # then see if we have any hits...
    for provisionalhitline in wordbundles:
        if len(fullmatches) > so.cap:
            break
        if so.near and re.search(so.termtwo,
                                 wordbundles[provisionalhitline]):
            fullmatches.add(provisionalhitline)
        elif not so.near and not re.search(
                so.termtwo, wordbundles[provisionalhitline]):
            fullmatches.add(provisionalhitline)

    fullmatches = [m.decompose() for m in fullmatches]

    return fullmatches
Esempio n. 15
0
def precomposedsqlsubqueryphrasesearch(so: SearchObject) -> List[dbWorkLine]:
    """

    use subquery syntax to grab multi-line windows of text for phrase searching

    line ends and line beginning issues can be overcome this way, but then you have plenty of
    bookkeeping to do to get the proper results focussed on the right line

    these searches take linear time: same basic time for any given scope regardless of the query

    :param so: the active SearchObject
    :return: the list of dbWorkLine objects that really contain the phrase
    """

    # rebuild the searchsqldict but this time pass through rewritequerystringforsubqueryphrasesearching()
    so.searchsqldict = searchlistintosqldict(so,
                                             so.phrase,
                                             subqueryphrasesearch=True)

    # debugmessage('precomposedsqlsubqueryphrasesearch() so.searchsqldict: {d}'.format(d=so.searchsqldict))

    # the windowed collection of lines; you will need to work to find the centers
    # windowing will increase the number of hits: 2+ lines per actual find
    initialhitlines = generatepreliminaryhitlist(so, recap=so.cap * 3)

    m = 'Generating final list of hits by searching among the {h} preliminary hits'
    so.poll.statusis(m.format(h=so.poll.gethits()))
    so.poll.sethits(0)

    # anchor the phrase to line starts/ends
    sp = re.sub(r'^\s', r'(^|\\s)', so.phrase)
    sp = re.sub(r'\s$', r'(\\s|$)', sp)

    combinations = QueryCombinator(so.phrase)
    # the last item is the full phrase and it will have already been searched:  ('one two three four five', '')
    combinations = combinations.combinations()
    combinations.pop()

    listoffinds = list()

    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()

    setofhits = set()

    while initialhitlines:
        # windows of indices come back: e.g., three lines that look like they match when only one matches [3131, 3132, 3133]
        # figure out which line is really the line with the goods
        # it is not nearly so simple as picking the 2nd element in any run of 3: no always runs of 3 + matches in
        # subsequent lines means that you really should check your work carefully; this is not an especially costly
        # operation relative to the whole search and esp. relative to the speed gains of using a subquery search
        lineobject = initialhitlines.pop()
        if not so.onehit or lineobject.authorid not in setofhits:
            if re.search(sp, getattr(lineobject, so.usewordlist)):
                listoffinds.append(lineobject)
                so.poll.addhits(1)
                setofhits.add(lineobject.authorid)
            else:
                try:
                    nextline = initialhitlines[0]
                except IndexError:
                    nextline = makeablankline('gr0000w000', -1)

                if lineobject.wkuinversalid != nextline.wkuinversalid or lineobject.index != (
                        nextline.index - 1):
                    # you grabbed the next line on the pile (e.g., index = 9999), not the actual next line (e.g., index = 101)
                    # usually you won't get a hit by grabbing the next db line, but sometimes you do...
                    query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(
                        wtmpl=worklinetemplate, tb=lineobject.authorid)
                    data = (lineobject.index + 1, )
                    dbcursor.execute(query, data)
                    # fetchone() can return None at the end of a table; this
                    # was a bare 'except:', which would also swallow
                    # KeyboardInterrupt/SystemExit
                    try:
                        nextline = dblineintolineobject(dbcursor.fetchone())
                    except Exception:
                        nextline = makeablankline('gr0000w000', -1)

                for c in combinations:
                    tail = c[0] + '$'
                    head = '^' + c[1]

                    t = False
                    h = False
                    try:
                        t = re.search(tail, getattr(lineobject,
                                                    so.usewordlist))
                    except re.error:
                        pass
                    try:
                        h = re.search(head, getattr(nextline, so.usewordlist))
                    except re.error:
                        pass

                    if t and h:
                        listoffinds.append(lineobject)
                        so.poll.addhits(1)
                        setofhits.add(lineobject.authorid)

    dbconnection.connectioncleanup()
    return listoffinds
def workonprecomposedsqlsearch(workerid: int, foundlineobjects: ListProxy, listofplacestosearch: ListProxy,
                               searchobject: SearchObject, dbconnection) -> ListProxy:
    """

    worker loop: drain listofplacestosearch, run precomposedsqlsearcher() on
    each item, and push the resulting line objects onto the shared results list

    listofplacestosearch elements are dicts and the whole looks like:

        [{'temptable': '', 'query': 'SELECT ...', 'data': ('ὕβριν',)},
        {'temptable': '', 'query': 'SELECT ...', 'data': ('ὕβριν',)} ...]

    this is supposed to give you one query per hipparchiaDB table unless you are lemmatizing

    """

    if not dbconnection:
        dbconnection = ConnectionObject()

    so = searchobject
    poll = so.poll
    dbconnection.setreadonly(False)
    cursor = dbconnection.cursor()
    iterations = 0

    while listofplacestosearch and poll.gethits() <= so.cap:
        # if workerid == 0:
        #     print('remain:', len(listofplacestosearch))
        iterations += 1
        dbconnection.checkneedtocommit(iterations)

        try:
            querydict = listofplacestosearch.pop(0)
            # consolewarning("workonprecomposedsqlsearch() querydict:\n\t{q}".format(q=querydict))
        except IndexError:
            # another worker drained the shared list between our check and our pop
            querydict = None
            listofplacestosearch = None

        if querydict:
            foundrows = precomposedsqlsearcher(querydict, cursor)
            newlineobjects = [dblineintolineobject(r) for r in foundrows]
            foundlineobjects.extend(newlineobjects)

            if newlineobjects:
                poll.addhits(len(newlineobjects))
        else:
            listofplacestosearch = None

        try:
            poll.remain(len(listofplacestosearch))
        except TypeError:
            # listofplacestosearch has just been set to None
            pass

    if not icanpickleconnections():
        dbconnection.connectioncleanup()

    return foundlineobjects