def headwordsearch(searchid, headform) -> JSON_STR:
    """
    search for a dictionary headword

    you are sent here from the morphology tables; this is a cut-down
    executesearch() in which the only thing being sought is a lemma

    :param searchid: id supplied by the client; normalized by validatepollid()
    :param headform: the headword; cleaned by cleaninitialquery() and then
        looked up in lemmatadict
    :return: whatever executesearch() returns for the SearchObject built here
    """

    probeforsessionvariables()

    cleanedheadword = cleaninitialquery(headform)

    # an unknown headword is not an error: the search is just run without a lemma
    try:
        lemma = lemmatadict[cleanedheadword]
    except KeyError:
        lemma = None

    pollid = validatepollid(searchid)

    # positional order: pollid, seeking, proximate, lemma, proximatelemma, session
    # nb: the three text slots are all empty strings
    searchobject = SearchObject(pollid, str(), str(), lemma, str(), session)

    return executesearch(pollid, searchobject)
def checkforactivesearch(searchid, trialnumber=0) -> JSON_STR:
    """
    test the activity of a poll so that you do not conjure up a pile of key
    errors by using wscheckpoll() prematurely

    the poll is looked for a limited number of times with a short sleep between
    the attempts

    note that uWSGI does not look like it will ever be able to work with the
    polling: poll[ts].getactivity() will never return anything because the
    processing and threading of uWSGI means that the poll is not going to be
    available to the instance; redis, vel sim. could fix this, but that is a lot
    of trouble to go to

    at a minimum you can count on uWSGI giving you a KeyError when you ask for
    poll[ts]

    :param searchid: id supplied by the client; normalized by validatepollid()
    :param trialnumber: the number of attempts already made
    :return: json: the configured polling port if the poll is active;
        'cannot_find_the_poll' once the attempts run out; or whatever
        externalwsgipolling() returns when the redis/external-WSGI setup is on
    """

    maxtrials = 4
    attempt = trialnumber

    while True:
        attempt += 1

        pollid = validatepollid(searchid)
        pollport = hipparchia.config['PROGRESSPOLLDEFAULTPORT']

        if attempt >= maxtrials:
            # note that very short searches can trigger this: rare word in a small author, etc.
            return json.dumps('cannot_find_the_poll')

        checkforlivewebsocket()

        viaredis = hipparchia.config['EXTERNALWSGI'] and hipparchia.config['POLLCONNECTIONTYPE'] == 'redis'
        if viaredis:
            return externalwsgipolling(pollid)

        try:
            if progresspolldict[pollid].getactivity():
                return json.dumps(pollport)
        except KeyError:
            # the poll has not been registered (yet): wait a bit longer
            pause = .20
        else:
            # the poll exists but is not active
            # should seldom make it here; but super-short requests will:
            # 'confirm' on a vector search that will abort, e.g.
            pause = .1

        time.sleep(pause)
def singlewordsearch(searchid, searchterm) -> JSON_STR:
    """
    search for a single, exact term

    you are sent here from the morphology tables; this is a cut-down
    executesearch()

    WINDOWS ONLY ERROR: this function will trigger a recursion error

    the situation looks a lot like case #3 @ https://bugs.python.org/issue9592
    but that is supposed to be a closed bug

    cf the complaints at
    https://forums.fast.ai/t/recursion-error-fastai-v1-0-27-windows-10/30673/10

    "multiprocessing/popen_spawn_win32.py" is the culprit?

    the current 'solution' is to send things to executesearch() instead:
    "if osname == 'nt'"; that test is inside morphologychartjs(). this is a
    potential source of future brittleness to the extent that one wants to
    explore refactoring executesearch()

    :param searchid: id supplied by the client; normalized by validatepollid()
    :param searchterm: the word to look for; cleaned by cleaninitialquery()
    :return: whatever executesearch() returns for the SearchObject built here
    """

    probeforsessionvariables()

    pollid = validatepollid(searchid)

    # the cleaned term is wrapped in a single space on either side
    cleanedterm = cleaninitialquery(searchterm)
    seeking = ' {} '.format(cleanedterm)

    # positional order: pollid, seeking, proximate, lemma, proximatelemma, session
    searchobject = SearchObject(pollid, seeking, str(), None, None, session)

    return executesearch(pollid, searchobject)
def dispatchvectorsearch(vectortype: str, searchid: str, one=None, two=None, three=None) -> JSON_STR:
    """
    dispatcher for "/vectors/..." requests

    builds a one-lemma or three-lemma search object for the requested kind of
    vector query, attaches a fresh ProgressPoll to it, and hands it to either
    externalvectors() or pythonvectors() depending on the configuration

    :param vectortype: the kind of query; must be a key of knownfunctions below
    :param searchid: id supplied by the client; normalized by validatepollid()
    :param one: first term; passed through depunct()
    :param two: second term (analogies only); passed through depunct()
    :param three: third term (analogies only); passed through depunct()
    :return: the output of the vector back end; a null SearchOutputObject result
        if semantic vectors are disabled; a json 'not enabled' message if this
        kind of query is unknown or switched off
    """

    if not hipparchia.config['SEMANTICVECTORSENABLED']:
        so = SearchObject(str(), str(), str(), str(), str(), session)
        oo = SearchOutputObject(so)
        target = 'searchsummary'
        message = '[semantic vectors have not been enabled]'
        return oo.generatenulloutput(itemname=target, itemval=message)

    pollid = validatepollid(searchid)
    one = depunct(one)
    two = depunct(two)
    three = depunct(three)

    # 'bso': the arguments for the search object builder
    simple = [pollid, one]
    triple = [pollid, one, two, three]

    # 'pref': the config switch that must be on for this kind of query
    knownfunctions = {
        'nearestneighborsquery': {'bso': simple, 'pref': 'CONCEPTMAPPINGENABLED'},
        'analogies': {'bso': triple, 'pref': 'VECTORANALOGIESENABLED'},
        'topicmodel': {'bso': simple, 'pref': 'TOPICMODELINGENABLED'},
        'vectortestfunction': {'bso': simple, 'pref': 'TESTINGVECTORBUTTONENABLED'},
        'unused': {'fnc': lambda: str(), 'bso': None, 'pref': None},
    }

    # a vectortype that is not in the table used to raise a KeyError here;
    # it is now answered the same way as a query type that is switched off
    requested = knownfunctions.get(vectortype)

    if requested is None or not requested['pref'] or not hipparchia.config[requested['pref']]:
        return json.dumps('this type of search has not been enabled')

    bso = requested['bso']

    # every enabled entry in the table carries either the 4-item or the 2-item argument list
    if len(bso) == 4:
        so = buildtriplelemmasearchobject(*bso)
    else:
        so = buildsinglelemmasearchobject(*bso)

    so.vectorquerytype = vectortype

    progresspolldict[pollid] = ProgressPoll(pollid)
    so.poll = progresspolldict[pollid]
    so.poll.activate()
    so.poll.statusis('Preparing to vectorize')

    if hipparchia.config['EXTERNALVECTORHELPER']:
        j = externalvectors(so)
    else:
        j = pythonvectors(so)

    if hipparchia.config['JSONDEBUGMODE']:
        print('/vectors/{f}\n\t{j}'.format(f=vectortype, j=j))

    # NOTE(review): this only drops the attribute on the search object; the
    # progresspolldict entry is not removed in this function - presumably the
    # vector back ends take care of that; confirm
    try:
        del so.poll
    except AttributeError:
        pass

    return j
def buildindexto(searchid: str, author: str, work=None, passage=None, endpoint=None, citationdelimiter='|', justvocab=False) -> JSON_STR:
    """
    build a complete index to a an author, work, or segment of a work

    the selection is one of (checked in this order):
        author + work + passage + endpoint: a span inside a work
        author + work + passage: a subsection of a work
        author + work: one whole work
        author: every work of the author
    anything else is reported as 'invalid input'

    the database connection is released and the progress poll is removed from
    progresspolldict no matter how the function exits: a poll that is left
    behind is never seen as finished by the websocket poller

    :param searchid: id supplied by the client; normalized by validatepollid()
    :param author: author id; parsed by IndexmakerInputParsingObject
    :param work: work id or None
    :param passage: start of the passage or None
    :param endpoint: end of the passage or None
    :param citationdelimiter: separator used inside passage and endpoint
    :param justvocab: if True and the selection is valid, return the raw dict
        of {workid: (firstline, lastline)} and skip building the index
    :return: json with the keys authorname, title, structure, worksegment,
        elapsed, wordsfound, indexhtml, keytoworks, newjs; or the raw cdict
        (not json) in the justvocab case
    """

    probeforsessionvariables()

    pollid = validatepollid(searchid)

    starttime = time.time()

    progresspolldict[pollid] = ProgressPoll(pollid)
    dbconnection = None

    try:
        progresspolldict[pollid].activate()

        dbconnection = ConnectionObject('autocommit')
        dbcursor = dbconnection.cursor()

        po = IndexmakerInputParsingObject(author, work, passage, endpoint, citationdelimiter)

        ao = po.authorobject
        wo = po.workobject
        psg = po.passageaslist
        stop = po.endpointlist

        if not work:
            wo = makeanemptywork('gr0000w000')

        # bool
        useheadwords = session['headwordindexing']

        allworks = list()
        output = list()
        cdict = dict()
        segmenttext = str()
        valid = True

        if ao and work and psg and stop:
            # a span with an explicit start and end
            start = psg
            firstlinenumber = finddblinefromincompletelocus(wo, start, dbcursor)
            lastlinenumber = finddblinefromincompletelocus(wo, stop, dbcursor, findlastline=True)
            if firstlinenumber['code'] == 'success' and lastlinenumber['code'] == 'success':
                cdict = {wo.universalid: (firstlinenumber['line'], lastlinenumber['line'])}
                startln = dblineintolineobject(grabonelinefromwork(ao.universalid, firstlinenumber['line'], dbcursor))
                stopln = dblineintolineobject(grabonelinefromwork(ao.universalid, lastlinenumber['line'], dbcursor))
            else:
                msg = '"indexspan/" could not find first and last: {a}w{b} - {c} TO {d}'
                consolewarning(msg.format(a=author, b=work, c=passage, d=endpoint))
                startln = makeablankline(work, 0)
                stopln = makeablankline(work, 1)
                valid = False
            segmenttext = 'from {a} to {b}'.format(a=startln.shortlocus(), b=stopln.shortlocus())
        elif ao and work and psg:
            # subsection of a work of an author
            progresspolldict[pollid].statusis('Preparing a partial index to {t}'.format(t=wo.title))
            startandstop = textsegmentfindstartandstop(ao, wo, psg, dbcursor)
            startline = startandstop['startline']
            endline = startandstop['endline']
            cdict = {wo.universalid: (startline, endline)}
        elif ao and work:
            # one work
            progresspolldict[pollid].statusis('Preparing an index to {t}'.format(t=wo.title))
            startline = wo.starts
            endline = wo.ends
            cdict = {wo.universalid: (startline, endline)}
        elif ao:
            # whole author
            allworks = ['{w} ⇒ {t}'.format(w=w.universalid[6:10], t=w.title) for w in ao.listofworks]
            allworks.sort()
            progresspolldict[pollid].statusis('Preparing an index to the works of {a}'.format(a=ao.shortname))
            for wkid in ao.listworkids():
                cdict[wkid] = (workdict[wkid].starts, workdict[wkid].ends)
        else:
            # we do not have a valid selection
            valid = False
            output = ['invalid input']

        # without an endpoint the segment is described by the passage alone
        # NOTE(review): assumes po.passageaslist is always a list of strings,
        # even when no passage was supplied - confirm
        if not stop:
            segmenttext = '.'.join(psg)

        if valid and justvocab:
            # the caller only wants the line ranges; cleanup happens in 'finally'
            return cdict

        if valid:
            output = buildindextowork(cdict, progresspolldict[pollid], useheadwords, dbcursor)

        # get ready to send stuff to the page
        count = len(output)
        # NOTE(review): setlocale() changes the locale of the whole process and
        # is documented as not thread-safe; left as it was
        try:
            locale.setlocale(locale.LC_ALL, 'en_US')
            count = locale.format_string('%d', count, grouping=True)
        except locale.Error:
            count = str(count)

        progresspolldict[pollid].statusis('Preparing the index HTML')
        indexhtml = wordindextohtmltable(output, useheadwords)

        buildtime = time.time() - starttime
        buildtime = round(buildtime, 2)
        progresspolldict[pollid].deactivate()

        if not ao:
            ao = makeanemptyauthor('gr0000')

        results = dict()
        results['authorname'] = avoidsmallvariants(ao.shortname)
        results['title'] = avoidsmallvariants(wo.title)
        results['structure'] = avoidsmallvariants(wo.citation())
        results['worksegment'] = segmenttext
        results['elapsed'] = buildtime
        results['wordsfound'] = count
        results['indexhtml'] = indexhtml
        results['keytoworks'] = allworks
        results['newjs'] = supplementalindexjs()

        return json.dumps(results)
    finally:
        # same order as before (connection first, then the poll), but now also
        # on the error paths
        try:
            if dbconnection is not None:
                dbconnection.connectioncleanup()
        finally:
            try:
                del progresspolldict[pollid]
            except KeyError:
                pass
async def wscheckpoll(websocket, path):
    """
    a poll checker started by startwspolling(): the client sends the name of a
    poll and this will output the status of the poll continuously while the
    poll remains active

    each message is a json object; the keys set below are
        ID, Poolofwork, Remaining, Hitcount, Statusmessage, Launchtime,
        Notes (not always present) and Active

    the loop ends when
        the poll has been deleted from progresspolldict (one last message with
            'Active': 'inactive' is sent)
        a poll getter raises a TypeError (the poll is gone)
        the client has closed the connection: there is nobody left to send to

    :param websocket: the connection to the client
    :param path: not used here
    :return: None
    """

    try:
        pollid = await websocket.recv()
    except websockets.exceptions.ConnectionClosed:
        # you reloaded the page
        return

    # comes to us with quotes: "eb91fb11" --> eb91fb11
    pollid = re.sub(r'"', str(), pollid)
    pollid = validatepollid(pollid)

    while True:
        progress = dict()
        try:
            # the poll is looked up afresh each time so that its deletion is noticed at once
            active = progresspolldict[pollid].getactivity()
            progress['ID'] = pollid
            progress['Poolofwork'] = progresspolldict[pollid].worktotal()
            progress['Remaining'] = progresspolldict[pollid].getremaining()
            progress['Hitcount'] = progresspolldict[pollid].gethits()
            progress['Statusmessage'] = progresspolldict[pollid].getstatus()
            progress['Launchtime'] = progresspolldict[pollid].getlaunchtime()
            # NOTE(review): the 'else' is read as belonging to the config test:
            # notes are blanked when long-request messages are suppressed - confirm
            if not hipparchia.config['SUPPRESSLONGREQUESTMESSAGE']:
                notes = progresspolldict[pollid].getnotes()
                if notes:
                    progress['Notes'] = notes
            else:
                progress['Notes'] = str()
        except KeyError:
            # the poll key is deleted from progresspolldict when the query ends; you will always end up here
            progress['Active'] = 'inactive'
            try:
                await websocket.send(json.dumps(progress))
            except websockets.exceptions.ConnectionClosed:
                # you reloaded the page in the middle of a search and both the poll and the socket vanished
                pass
            break
        except TypeError:
            # TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType
            # the poll is gone...
            break

        await asyncio.sleep(.4)

        # something changed amid backend updates and json.dumps() started choking on getactivity():
        # 'active' can be a <Synchronized wrapper for c_byte(1)> instead of a 'bool'
        # <class 'multiprocessing.sharedctypes.Synchronized'>
        try:
            progress['Active'] = active.value
        except AttributeError:
            # AttributeError: 'str' (or 'int' or 'bool') object has no attribute 'value'
            progress['Active'] = active

        try:
            await websocket.send(json.dumps(progress))
        except websockets.exceptions.ConnectionClosed:
            # you reloaded the page in the middle of a search: every later send
            # would fail the same way, so stop instead of polling for a listener
            # that is gone (forever, if the poll is never deleted)
            return
        except TypeError as e:
            # "Object of type Synchronized is not JSON serializable"
            # macOS and indexmaker combo is a problem; macOS is the real problem?
            consolewarning('websocket non-fatal error: "{e}"'.format(e=e), color='yellow', isbold=False)

    return
def executesearch(searchid: str, so=None, req=request) -> JSON_STR:
    """
    the interface to all of the other search functions

    tell me what you are looking for and i'll try to find it

    the results are returned in a json bundle that will be used to update the
    html on the page

    note that cosdistbysentence vector queries also flow through here: they
    need a hitdict

    overview:
        buildsearchobject() and then start modifying elements of the SearchObject

        build a search list via compilesearchlist()
            modify search list via flagexclusions()
            modify search list via calculatewholeauthorsearches()
        build search list restrictions via indexrestrictions()

        search via searchdispatcher()

        format results via buildresultobjects()

    the progress poll is removed from progresspolldict no matter how the
    function exits: a poll that is left behind is never seen as finished by the
    websocket poller

    :param searchid: id supplied by the client; normalized by validatepollid()
    :param so: a ready-made SearchObject (singlewordsearch() and
        headwordsearch() send one); if absent it is built from req and session
    :param req: the request handed to buildsearchobject()
    :return: json of output.generateoutput(); or the output of
        findabsolutevectorsfromhits() for a 'cosdistbylineorword' query
    """

    pollid = validatepollid(searchid)

    if not so:
        # there is a so if singlewordsearch() sent you here
        probeforsessionvariables()
        so = buildsearchobject(pollid, req, session)

    frozensession = so.session

    progresspolldict[pollid] = ProgressPoll(pollid)

    try:
        so.poll = progresspolldict[pollid]
        so.poll.activate()
        so.poll.statusis('Preparing to search')

        nosearch = True
        output = SearchOutputObject(so)

        allcorpora = ['greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus', 'christiancorpus']
        activecorpora = [c for c in allcorpora if frozensession[c]]

        if (len(so.seeking) > 0 or so.lemma or frozensession['tensorflowgraph'] or frozensession['topicmodel']) and activecorpora:
            so.poll.statusis('Compiling the list of works to search')
            so.searchlist = compilesearchlist(listmapper, frozensession)

        if so.searchlist:
            # do this before updatesearchlistandsearchobject() which collapses items and cuts your total
            workssearched = len(so.searchlist)

            # calculatewholeauthorsearches() + configurewhereclausedata()
            so = updatesearchlistandsearchobject(so)

            nosearch = False
            skg = None
            prx = None

            isgreek = re.compile('[α-ωϲἀἁἂἃἄἅἆἇᾀᾁᾂᾃᾄᾅᾆᾇᾲᾳᾴᾶᾷᾰᾱὰάἐἑἒἓἔἕὲέἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗὀὁὂὃὄὅόὸὐὑὒὓὔὕὖὗϋῠῡῢΰῦῧύὺᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇἤἢἥἣὴήἠἡἦἧὠὡὢὣὤὥὦὧᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷώὼ]')

            if so.lemmaone:
                # NOTE(review): tests so.lemmaone but reads so.lemma - presumably the same object; confirm
                so.termone = wordlistintoregex(so.lemma.formlist)
                skg = so.termone
                if re.search(isgreek, skg):
                    # 'v' is a problem because the lemmata list is going to send 'u'
                    # but the greek lemmata are accented
                    so.usecolumn = 'accented_line'

            if so.lemmatwo:
                so.termtwo = wordlistintoregex(so.lemmatwo.formlist)
                prx = so.termtwo
                if re.search(isgreek, prx):
                    so.usecolumn = 'accented_line'

            so.setsearchtype()
            thesearch = so.generatesearchdescription()
            htmlsearch = so.generatehtmlsearchdescription()

            # now that the SearchObject is built, do the search...
            hits = precomposedsqlsearch(so)
            so.poll.statusis('Putting the results in context')

            # hits is List[dbWorkLine]
            hitdict = sortresultslist(hits, so, authordict, workdict)

            if so.vectorquerytype == 'cosdistbylineorword':
                # take these hits and head on over to the vector worker
                # the poll is removed in 'finally'
                return findabsolutevectorsfromhits(so, hitdict, workssearched)

            resultlist = buildresultobjects(hitdict, authordict, workdict, so)

            so.poll.statusis('Converting results to HTML')

            sandp = rewriteskgandprx(skg, prx, htmlsearch, so)
            skg = sandp['skg']
            prx = sandp['prx']
            htmlsearch = sandp['html']

            for r in resultlist:
                r.lineobjects = flagsearchterms(r, skg, prx, so)

            if so.context > 0:
                findshtml = htmlifysearchfinds(resultlist, so)
            else:
                findshtml = nocontexthtmlifysearchfinds(resultlist)

            if hipparchia.config['INSISTUPONSTANDARDANGLEBRACKETS']:
                findshtml = gtltsubstitutes(findshtml)

            findsjs = insertbrowserclickjs('browser')

            resultcount = len(resultlist)

            # the cap was reached unless there are fewer results than the cap
            hitmax = not (resultcount < so.cap)

            output.title = thesearch
            output.found = findshtml
            output.js = findsjs
            output.setresultcount(resultcount, 'passages')
            output.setscope(workssearched)
            output.searchtime = so.getelapsedtime()
            output.thesearch = thesearch
            output.htmlsearch = htmlsearch
            output.hitmax = hitmax

        if nosearch:
            # tell the user why nothing was searched
            if not activecorpora:
                output.reasons.append('there are no active databases')
            if len(so.seeking) == 0:
                output.reasons.append('there is no search term')
            if len(so.seeking) > 0 and len(so.searchlist) == 0:
                output.reasons.append('zero works match the search criteria')

            output.title = '(empty query)'
            output.setresultcount(0, 'passages')
            output.explainemptysearch()

        so.poll.deactivate()

        return json.dumps(output.generateoutput())
    finally:
        # also reached when something above raised: do not leave the poll behind
        try:
            del progresspolldict[pollid]
        except KeyError:
            pass
def reverselexiconsearch(searchid, searchterm) -> JSON_STR:
    """
    attempt to find all of the greek/latin dictionary entries that might go
    with the english search term

    'ape' will drive this crazy; what is needed is a lookup for only the senses

    this can be built into the dictionary

    the output is a summary of the matching headwords (most frequent first)
    followed by the dictionary entries themselves

    the database connection is released and the progress poll is removed from
    progresspolldict no matter how the function exits

    :param searchid: id supplied by the client; normalized by validatepollid()
    :param searchterm: the english term; truncated to MAXIMUMLEXICALLENGTH and
        passed through depunct()
    :return: json with the keys 'newhtml' and 'newjs'
    """

    searchterm = searchterm[:hipparchia.config['MAXIMUMLEXICALLENGTH']]

    pollid = validatepollid(searchid)
    progresspolldict[pollid] = ProgressPoll(pollid)

    try:
        activepoll = progresspolldict[pollid]
        activepoll.activate()
        activepoll.statusis('Searching lexical entries for "{t}"'.format(t=searchterm))

        probeforsessionvariables()

        returndict = dict()
        returnarray = list()

        seeking = depunct(searchterm)

        # (dictionary to search, translation label); only the dictionary is used here
        if justlatin():
            searchunder = [('latin', 'hi')]
        elif justtlg():
            searchunder = [('greek', 'tr')]
        else:
            searchunder = [('greek', 'tr'), ('latin', 'hi')]

        limit = hipparchia.config['CAPONDICTIONARYFINDS']

        entriestuples = list()
        for usedict, _ in searchunder:
            # first see if your term is mentioned at all
            wordobjects = reversedictionarylookup(seeking, usedict, limit)
            entriestuples += [(w.entry, w.id) for w in wordobjects]

        # NOTE(review): this compares the combined total of all dictionaries
        # with the cap and only on exact equality; left as it was - confirm
        # that this is the intended test
        if len(entriestuples) == limit:
            returnarray.append('[stopped searching after {lim} finds]\n<br>\n'.format(lim=limit))

        # one count lookup per distinct entry; the counts are used both for the
        # sort and for the summary (they used to be fetched twice)
        ranked = list()
        for term, idval in set(entriestuples):
            hwcountobject = querytotalwordcounts(term)
            if hwcountobject:
                ranked.append((hwcountobject.t, term, idval))
            else:
                # nothing is known about this word: it counts as zero hits
                ranked.append((0, term, idval))
        ranked.sort(reverse=True)

        # now we retrieve and format the entries
        if ranked:
            # summary of entry values first
            summarytemplate = """ <span class="sensesum">({n}) <a class="nounderline" href="#{w}_{wdid}">{w}</a> <span class="small">({t:,})</span> </span> """

            # 'ranked' is already ordered by total hits, highest first
            summary = [summarytemplate.format(n=n, w=term, wdid=idval, t=totalhits)
                       for n, (totalhits, term, idval) in enumerate(ranked, start=1)]
            returnarray.append('\n<br />\n'.join(summary))

            # then the entries proper
            dbconnection = ConnectionObject()
            try:
                dbconnection.setautocommit()
                dbcursor = dbconnection.cursor()

                wordobjects = [probedictionary(setdictionarylanguage(term) + '_dictionary', 'entry_name', term, '=',
                                               dbcursor=dbcursor, trialnumber=0)
                               for _, term, _ in ranked]
                wordobjects = flattenlistoflists(wordobjects)
            finally:
                # this connection used to be left open
                dbconnection.connectioncleanup()

            outputobjects = [lexicalOutputObject(w) for w in wordobjects]

            # entries are only numbered if there is more than one of them
            usecounter = len(outputobjects) > 1

            for count, oo in enumerate(outputobjects, start=1):
                if usecounter:
                    entry = oo.generatelexicaloutput(countervalue=count)
                else:
                    entry = oo.generatelexicaloutput()
                returnarray.append(entry)
        else:
            returnarray.append('<br />[nothing found under "{skg}"]'.format(skg=seeking))

        returndict['newhtml'] = '\n'.join(returnarray)
        returndict['newjs'] = '\n'.join([dictionaryentryjs(), insertlexicalbrowserjs()])

        return json.dumps(returndict)
    finally:
        # also reached when something above raised: do not leave the poll behind
        try:
            del progresspolldict[pollid]
        except KeyError:
            pass