Example #1
0
def headwordsearch(searchid, headform) -> JSON_STR:
    """
    Run a restricted version of executesearch(): a dictionary headword search.

    You get sent here via the morphology tables.

    :param searchid: id string used to register the progress poll
    :param headform: the raw headword to look up
    :return: the json string produced by executesearch()
    """

    probeforsessionvariables()

    pollid = validatepollid(searchid)
    cleanedform = cleaninitialquery(headform)

    # an unknown headword simply yields no lemma
    lemma = lemmatadict.get(cleanedform, None)

    # a headword search has no literal search terms: only the lemma matters
    seeking = str()
    proximate = str()
    proximatelemma = str()

    so = SearchObject(pollid, seeking, proximate, lemma, proximatelemma,
                      session)

    return executesearch(pollid, so)
Example #2
0
def checkforactivesearch(searchid, trialnumber=0) -> JSON_STR:
    """
    Test the activity of a poll so you don't start conjuring a bunch of key
    errors if you use wscheckpoll() prematurely.

    note that uWSGI does not look like it will ever be able to work with the
    polling: poll[ts].getactivity() will never return anything because the
    processing and threading of uWSGI means that the poll is not going to be
    available to the instance; redis, vel. sim could fix this, but that's a
    lot of trouble to go to

    at a minimum you can count on uWSGI giving you a KeyError when you ask
    for poll[ts]

    :param searchid: id string naming the poll to probe
    :param trialnumber: how many probes have already happened
    :return: json string: either the poll port or 'cannot_find_the_poll'
    """

    maxtrials = 4
    pollid = validatepollid(searchid)
    pollport = hipparchia.config['PROGRESSPOLLDEFAULTPORT']

    trialnumber += 1
    while trialnumber < maxtrials:
        checkforlivewebsocket()

        usesredis = (hipparchia.config['EXTERNALWSGI']
                     and hipparchia.config['POLLCONNECTIONTYPE'] == 'redis')
        if usesredis:
            return externalwsgipolling(pollid)

        try:
            if progresspolldict[pollid].getactivity():
                return json.dumps(pollport)
            # poll exists but is not active yet; super-short requests will
            # land here: 'confirm' on a vector search that will abort, e.g.
            time.sleep(.1)
        except KeyError:
            # the poll has not been registered yet: wait and try again
            time.sleep(.20)

        trialnumber += 1

    # note that very short searches can trigger this: rare word in a small author, etc.
    return json.dumps('cannot_find_the_poll')
Example #3
0
def singlewordsearch(searchid, searchterm) -> JSON_STR:
    """
    Run a restricted version of executesearch(): one single, exact term.

    You get sent here via the morphology tables.

    WINDOWS ONLY ERROR: this function will trigger a recursion error

    the situation looks a lot like case #3 @ https://bugs.python.org/issue9592

    but that is supposed to be a closed bug

    cf the complaints at https://forums.fast.ai/t/recursion-error-fastai-v1-0-27-windows-10/30673/10

    "multiprocessing\\popen_spawn_win32.py" is the culprit?

    the current 'solution' is to send things to executesearch() instead "if osname == 'nt'"
    this test is inside morphologychartjs(); this is a potential source of future brittleness
    to the extent that one wants to explore refactoring executesearch()

    :param searchid: id string used to register the progress poll
    :param searchterm: the exact word to find
    :return: the json string produced by executesearch()
    """

    probeforsessionvariables()

    pollid = validatepollid(searchid)
    cleanedterm = cleaninitialquery(searchterm)
    # pad with spaces so that only whole-word matches are found
    seeking = ' {s} '.format(s=cleanedterm)

    # an exact-term search carries no proximate term and no lemmata
    proximate = str()
    lemma = None
    proximatelemma = None

    so = SearchObject(pollid, seeking, proximate, lemma, proximatelemma,
                      session)

    return executesearch(pollid, so)
Example #4
0
def dispatchvectorsearch(vectortype: str,
                         searchid: str,
                         one=None,
                         two=None,
                         three=None) -> JSON_STR:
    """
    Dispatcher for "/vectors/..." requests.

    Pick the right SearchObject builder for the requested vector search type,
    run the vectorizer, and hand back its json output. Each search type is
    gated behind its own configuration flag.

    :param vectortype: which vector search to run; a key of knownfunctions
    :param searchid: id string used to register the progress poll
    :param one: first search term
    :param two: second search term (triple searches only)
    :param three: third search term (triple searches only)
    :return: json string from the vector backend (or a null/disabled notice)
    """

    if not hipparchia.config['SEMANTICVECTORSENABLED']:
        dummyso = SearchObject(str(), str(), str(), str(), str(), session)
        nullresult = SearchOutputObject(dummyso)
        return nullresult.generatenulloutput(
            itemname='searchsummary',
            itemval='[semantic vectors have not been enabled]')

    pollid = validatepollid(searchid)
    one = depunct(one)
    two = depunct(two)
    three = depunct(three)

    # argument bundles: single-term vs three-term searches
    simple = [pollid, one]
    triple = [pollid, one, two, three]

    # per-type argument bundle ('bso') and gating configuration flag ('pref')
    knownfunctions = {
        'nearestneighborsquery': {'bso': simple, 'pref': 'CONCEPTMAPPINGENABLED'},
        'analogies': {'bso': triple, 'pref': 'VECTORANALOGIESENABLED'},
        'topicmodel': {'bso': simple, 'pref': 'TOPICMODELINGENABLED'},
        'vectortestfunction': {'bso': simple, 'pref': 'TESTINGVECTORBUTTONENABLED'},
        'unused': {'fnc': lambda: str(), 'bso': None, 'pref': None},
    }

    gatingpreference = knownfunctions[vectortype]['pref']
    if not gatingpreference or not hipparchia.config[gatingpreference]:
        return json.dumps('this type of search has not been enabled')

    bso = knownfunctions[vectortype]['bso']

    if len(bso) == 4:
        so = buildtriplelemmasearchobject(*bso)
    elif len(bso) == 2:
        so = buildsinglelemmasearchobject(*bso)
    else:
        so = None

    so.vectorquerytype = vectortype

    # register and activate a progress poll for this search
    activepoll = ProgressPoll(pollid)
    progresspolldict[pollid] = activepoll
    so.poll = activepoll
    so.poll.activate()
    so.poll.statusis('Preparing to vectorize')

    if hipparchia.config['EXTERNALVECTORHELPER']:
        j = externalvectors(so)
    else:
        j = pythonvectors(so)

    if hipparchia.config['JSONDEBUGMODE']:
        print('/vectors/{f}\n\t{j}'.format(f=vectortype, j=j))

    # the poll has served its purpose; drop it from the SearchObject
    try:
        del so.poll
    except AttributeError:
        pass

    return j
def buildindexto(searchid: str,
                 author: str,
                 work=None,
                 passage=None,
                 endpoint=None,
                 citationdelimiter='|',
                 justvocab=False) -> JSON_STR:
    """
	build a complete index to an author, work, or segment of a work

	:param searchid: id under which to register the ProgressPoll
	:param author: author id; if no work is given, all of the author's works are indexed
	:param work: work number within the author
	:param passage: first citation of a partial-work index
	:param endpoint: last citation of a partial-work index
	:param citationdelimiter: character separating the levels of a citation
	:param justvocab: return only the {workid: (firstline, lastline)} dict
	:return: json string of the index results; NOTE(review): when justvocab is
		True this returns cdict (a plain dict), not a JSON_STR — confirm callers
		of the justvocab path expect that
	"""

    probeforsessionvariables()

    pollid = validatepollid(searchid)

    starttime = time.time()

    # register a poll so the client can track the (potentially slow) build
    progresspolldict[pollid] = ProgressPoll(pollid)
    progresspolldict[pollid].activate()

    dbconnection = ConnectionObject('autocommit')
    dbcursor = dbconnection.cursor()

    # parse the raw citation parameters into author/work/passage objects
    po = IndexmakerInputParsingObject(author, work, passage, endpoint,
                                      citationdelimiter)

    ao = po.authorobject
    wo = po.workobject
    psg = po.passageaslist
    stop = po.endpointlist

    if not work:
        # whole-author request: substitute a placeholder work object
        wo = makeanemptywork('gr0000w000')

    # bool
    useheadwords = session['headwordindexing']

    allworks = list()
    output = list()
    cdict = dict()
    segmenttext = str()
    valid = True

    if ao and work and psg and stop:
        # an explicit span inside one work: locate its first and last db lines
        start = psg
        firstlinenumber = finddblinefromincompletelocus(wo, start, dbcursor)
        lastlinenumber = finddblinefromincompletelocus(wo,
                                                       stop,
                                                       dbcursor,
                                                       findlastline=True)
        if firstlinenumber['code'] == 'success' and lastlinenumber[
                'code'] == 'success':
            cdict = {
                wo.universalid:
                (firstlinenumber['line'], lastlinenumber['line'])
            }
            startln = dblineintolineobject(
                grabonelinefromwork(ao.universalid, firstlinenumber['line'],
                                    dbcursor))
            stopln = dblineintolineobject(
                grabonelinefromwork(ao.universalid, lastlinenumber['line'],
                                    dbcursor))
        else:
            # could not resolve the span: warn, fall back to blank lines, and
            # mark the request invalid so no index is built
            msg = '"indexspan/" could not find first and last: {a}w{b} - {c} TO {d}'
            consolewarning(msg.format(a=author, b=work, c=passage, d=endpoint))
            startln = makeablankline(work, 0)
            stopln = makeablankline(work, 1)
            valid = False
        segmenttext = 'from {a} to {b}'.format(a=startln.shortlocus(),
                                               b=stopln.shortlocus())
    elif ao and work and psg:
        # subsection of a work of an author
        progresspolldict[pollid].statusis(
            'Preparing a partial index to {t}'.format(t=wo.title))
        startandstop = textsegmentfindstartandstop(ao, wo, psg, dbcursor)
        startline = startandstop['startline']
        endline = startandstop['endline']
        cdict = {wo.universalid: (startline, endline)}
    elif ao and work:
        # one work
        progresspolldict[pollid].statusis(
            'Preparing an index to {t}'.format(t=wo.title))
        startline = wo.starts
        endline = wo.ends
        cdict = {wo.universalid: (startline, endline)}
    elif ao:
        # whole author: index every work; also build a key of work ids to titles
        allworks = [
            '{w}  ⇒ {t}'.format(w=w.universalid[6:10], t=w.title)
            for w in ao.listofworks
        ]
        allworks.sort()
        progresspolldict[pollid].statusis(
            'Preparing an index to the works of {a}'.format(a=ao.shortname))
        for wkid in ao.listworkids():
            cdict[wkid] = (workdict[wkid].starts, workdict[wkid].ends)
    else:
        # we do not have a valid selection
        valid = False
        output = ['invalid input']

    if not stop:
        # no endpoint: describe the segment by the start citation alone
        # (psg is presumably always a list here — TODO confirm for the
        # invalid-selection path)
        segmenttext = '.'.join(psg)

    if valid and justvocab:
        # vocabulary builders need only the line ranges, not the html
        dbconnection.connectioncleanup()
        del progresspolldict[pollid]
        return cdict

    if valid:
        output = buildindextowork(cdict, progresspolldict[pollid],
                                  useheadwords, dbcursor)

    # get ready to send stuff to the page
    count = len(output)

    try:
        # pretty-print the count with thousands separators; fall back to a
        # plain string if the locale is unavailable on this system
        locale.setlocale(locale.LC_ALL, 'en_US')
        count = locale.format_string('%d', count, grouping=True)
    except locale.Error:
        count = str(count)

    progresspolldict[pollid].statusis('Preparing the index HTML')
    indexhtml = wordindextohtmltable(output, useheadwords)

    buildtime = time.time() - starttime
    buildtime = round(buildtime, 2)
    progresspolldict[pollid].deactivate()

    if not ao:
        # guarantee a displayable author even for invalid selections
        ao = makeanemptyauthor('gr0000')

    results = dict()
    results['authorname'] = avoidsmallvariants(ao.shortname)
    results['title'] = avoidsmallvariants(wo.title)
    results['structure'] = avoidsmallvariants(wo.citation())
    results['worksegment'] = segmenttext
    results['elapsed'] = buildtime
    results['wordsfound'] = count
    results['indexhtml'] = indexhtml
    results['keytoworks'] = allworks
    results['newjs'] = supplementalindexjs()
    results = json.dumps(results)

    dbconnection.connectioncleanup()
    del progresspolldict[pollid]

    return results
async def wscheckpoll(websocket, path):
	"""

	a poll checker started by startwspolling(): the client sends the name of a poll and this will output
	the status of the poll continuously while the poll remains active

	example:
		progress {'active': 1, 'total': 20, 'remaining': 20, 'hits': 48, 'message': 'Putting the results in context', 'elapsed': 14.0, 'extrainfo': '<span class="small"></span>'}

	:param websocket: the open websocket connection to the client
	:param path: unused, but required by the websockets handler signature
	:return:
	"""

	try:
		pollid = await websocket.recv()
	except websockets.exceptions.ConnectionClosed:
		# you reloaded the page
		return

	# comes to us with quotes: "eb91fb11" --> eb91fb11
	pollid = re.sub(r'"', str(), pollid)
	pollid = validatepollid(pollid)

	# stream status updates until the poll vanishes (i.e. the search ends)
	while True:
		progress = dict()
		try:
			active = progresspolldict[pollid].getactivity()
			progress['ID'] = pollid
			progress['Poolofwork'] = progresspolldict[pollid].worktotal()
			progress['Remaining'] = progresspolldict[pollid].getremaining()
			progress['Hitcount'] = progresspolldict[pollid].gethits()
			progress['Statusmessage'] = progresspolldict[pollid].getstatus()
			progress['Launchtime'] = progresspolldict[pollid].getlaunchtime()
			if not hipparchia.config['SUPPRESSLONGREQUESTMESSAGE']:
				if progresspolldict[pollid].getnotes():
					progress['Notes'] = progresspolldict[pollid].getnotes()
			else:
				progress['Notes'] = str()
		except KeyError:
			# the poll key is deleted from progresspolldict when the query ends; you will always end up here
			progress['Active'] = 'inactive'
			try:
				await websocket.send(json.dumps(progress))
			except websockets.exceptions.ConnectionClosed:
				# you reloaded the page in the middle of a search and both the poll and the socket vanished
				pass
			break
		except TypeError:
			# TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType
			# the poll is gone...
			break

		# throttle the update rate so the client is not flooded
		await asyncio.sleep(.4)
		# print(progress)
		# print('progress %', ((progress['Poolofwork'] - progress['Remaining']) / progress['Poolofwork']) * 100)

		try:
			# something changed amid backend updates and json.dumps() started choking on progresspolldict[pollid].getactivity()
			# active is (now) a <Synchronized wrapper for c_byte(1)>; that was the unexpected change: it was 'bool'
			# <class 'multiprocessing.sharedctypes.Synchronized'>
			progress['Active'] = active.value
		except AttributeError:
			# AttributeError: 'str' (or 'int' or 'bool') object has no attribute 'value'
			progress['Active'] = active

		try:
			await websocket.send(json.dumps(progress))
		except websockets.exceptions.ConnectionClosed:
			# websockets.exceptions.ConnectionClosed because you reloaded the page in the middle of a search
			pass
		except TypeError as e:
			# "Object of type Synchronized is not JSON serializable"
			# macOS and indexmaker combo is a problem; macOS is the real problem?
			consolewarning('websocket non-fatal error: "{e}"'.format(e=e), color='yellow', isbold=False)
			pass
	return
Example #7
0
def executesearch(searchid: str, so=None, req=request) -> JSON_STR:
    """

	the interface to all of the other search functions

	tell me what you are looking for and i'll try to find it

	the results are returned in a json bundle that will be used to update the html on the page

	note that cosdistbysentence vector queries also flow through here: they need a hitdict

	overview:
		buildsearchobject() and then start modifying elements of the SearchObject

		build a search list via compilesearchlist()
			modify search list via flagexclusions()
			modify search list via calculatewholeauthorsearches()
		build search list restrictions via indexrestrictions()

		search via searchdispatcher()

		format results via buildresultobjects()

	:param searchid: id string used to register the progress poll
	:param so: an optional pre-built SearchObject (set by singlewordsearch() etc.)
	:param req: the flask request; used only when no SearchObject was supplied
	:return: a json string bundling the search results and their metadata
	"""

    pollid = validatepollid(searchid)

    if not so:
        # there is a so if singlewordsearch() sent you here
        probeforsessionvariables()
        so = buildsearchobject(pollid, req, session)

    # the SearchObject carries its own copy of the session — presumably
    # 'frozen' so that mid-search settings changes cannot interfere;
    # confirm against SearchObject's constructor
    frozensession = so.session

    # register and activate a poll for wscheckpoll()/checkforactivesearch()
    progresspolldict[pollid] = ProgressPoll(pollid)
    so.poll = progresspolldict[pollid]

    so.poll.activate()
    so.poll.statusis('Preparing to search')

    nosearch = True
    output = SearchOutputObject(so)

    allcorpora = [
        'greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus',
        'christiancorpus'
    ]
    # only the corpora the user has toggled on in the session
    activecorpora = [c for c in allcorpora if frozensession[c]]

    # a search is viable only if there is something to seek (a term, a lemma,
    # or a vector query) AND at least one corpus to seek it in
    if (len(so.seeking) > 0 or so.lemma or frozensession['tensorflowgraph']
            or frozensession['topicmodel']) and activecorpora:
        so.poll.statusis('Compiling the list of works to search')
        so.searchlist = compilesearchlist(listmapper, frozensession)

    if so.searchlist:
        # do this before updatesearchlistandsearchobject() which collapses items and cuts your total
        workssearched = len(so.searchlist)

        # calculatewholeauthorsearches() + configurewhereclausedata()
        so = updatesearchlistandsearchobject(so)

        nosearch = False
        skg = None
        prx = None

        # matches any lowercase greek character (with or without accents and
        # breathings); used to pick the db column to search against
        isgreek = re.compile(
            '[α-ωϲἀἁἂἃἄἅἆἇᾀᾁᾂᾃᾄᾅᾆᾇᾲᾳᾴᾶᾷᾰᾱὰάἐἑἒἓἔἕὲέἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗὀὁὂὃὄὅόὸὐὑὒὓὔὕὖὗϋῠῡῢΰῦῧύὺᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇἤἢἥἣὴήἠἡἦἧὠὡὢὣὤὥὦὧᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷώὼ]'
        )

        if so.lemmaone:
            # NOTE(review): the condition tests so.lemmaone but reads
            # so.lemma.formlist — confirm these refer to the same lemma
            so.termone = wordlistintoregex(so.lemma.formlist)
            skg = so.termone
            if re.search(isgreek, skg):
                # 'v' is a problem because the lemmata list is going to send 'u'
                # but the greek lemmata are accented
                so.usecolumn = 'accented_line'

        if so.lemmatwo:
            so.termtwo = wordlistintoregex(so.lemmatwo.formlist)
            prx = so.termtwo
            if re.search(isgreek, prx):
                so.usecolumn = 'accented_line'

        so.setsearchtype()
        thesearch = so.generatesearchdescription()
        htmlsearch = so.generatehtmlsearchdescription()

        # now that the SearchObject is built, do the search...
        hits = precomposedsqlsearch(so)
        so.poll.statusis('Putting the results in context')

        # hits is List[dbWorkLine]
        hitdict = sortresultslist(hits, so, authordict, workdict)

        if so.vectorquerytype == 'cosdistbylineorword':
            # print('executesearch(): h - cosdistbylineorword')
            # take these hits and head on over to the vector worker
            output = findabsolutevectorsfromhits(so, hitdict, workssearched)
            del progresspolldict[pollid]
            return output

        resultlist = buildresultobjects(hitdict, authordict, workdict, so)

        so.poll.statusis('Converting results to HTML')

        # rewrite the search terms for display/highlighting purposes
        sandp = rewriteskgandprx(skg, prx, htmlsearch, so)
        skg = sandp['skg']
        prx = sandp['prx']
        htmlsearch = sandp['html']

        # highlight the matched terms inside each result's lines
        for r in resultlist:
            r.lineobjects = flagsearchterms(r, skg, prx, so)

        if so.context > 0:
            findshtml = htmlifysearchfinds(resultlist, so)
        else:
            findshtml = nocontexthtmlifysearchfinds(resultlist)

        if hipparchia.config['INSISTUPONSTANDARDANGLEBRACKETS']:
            findshtml = gtltsubstitutes(findshtml)

        findsjs = insertbrowserclickjs('browser')

        resultcount = len(resultlist)

        # hitting the cap means the result set was truncated
        if resultcount < so.cap:
            hitmax = False
        else:
            hitmax = True

        output.title = thesearch
        output.found = findshtml
        output.js = findsjs
        output.setresultcount(resultcount, 'passages')
        output.setscope(workssearched)
        output.searchtime = so.getelapsedtime()
        output.thesearch = thesearch
        output.htmlsearch = htmlsearch
        output.hitmax = hitmax

    if nosearch:
        # explain to the user why nothing was searched
        if not activecorpora:
            output.reasons.append('there are no active databases')
        if len(so.seeking) == 0:
            output.reasons.append('there is no search term')
        if len(so.seeking) > 0 and len(so.searchlist) == 0:
            output.reasons.append('zero works match the search criteria')

        output.title = '(empty query)'
        output.setresultcount(0, 'passages')
        output.explainemptysearch()

    so.poll.deactivate()
    jsonoutput = json.dumps(output.generateoutput())

    del progresspolldict[pollid]

    return jsonoutput
Example #8
0
def reverselexiconsearch(searchid, searchterm) -> JSON_STR:
	"""
	attempt to find all of the greek/latin dictionary entries that might go with the english search term

	'ape' will drive this crazy; what is needed is a lookup for only the senses

	this can be built into the dictionary

	:param searchid: id string used to register the progress poll
	:param searchterm: the english word to hunt for in the dictionary entries
	:return: json string: {'newhtml': ..., 'newjs': ...}
	"""

	searchterm = searchterm[:hipparchia.config['MAXIMUMLEXICALLENGTH']]
	pollid = validatepollid(searchid)
	progresspolldict[pollid] = ProgressPoll(pollid)
	activepoll = progresspolldict[pollid]
	activepoll.activate()
	activepoll.statusis('Searching lexical entries for "{t}"'.format(t=searchterm))

	probeforsessionvariables()

	returndict = dict()
	returnarray = list()

	seeking = depunct(searchterm)

	# pick the dictionaries (and their translation labels) to consult
	if justlatin():
		searchunder = [('latin', 'hi')]
	elif justtlg():
		searchunder = [('greek', 'tr')]
	else:
		searchunder = [('greek', 'tr'), ('latin', 'hi')]

	limit = hipparchia.config['CAPONDICTIONARYFINDS']

	entriestuples = list()
	for usedict, _translationlabel in searchunder:
		# first see if your term is mentioned at all
		wordobjects = reversedictionarylookup(seeking, usedict, limit)
		entriestuples += [(w.entry, w.id) for w in wordobjects]

	if len(entriestuples) == limit:
		returnarray.append('[stopped searching after {lim} finds]\n<br>\n'.format(lim=limit))

	entriestuples = list(set(entriestuples))

	# sort the entries by corpus frequency (descending); entries lacking a
	# count object sort as 0
	unsortedentries = [(querytotalwordcounts(e[0]), e[0], e[1]) for e in entriestuples]
	entries = list()
	for hwcountobject, term, idval in unsortedentries:
		if hwcountobject:
			entries.append((hwcountobject.t, term, idval))
		else:
			entries.append((0, term, idval))
	entries = sorted(entries, reverse=True)
	entriestuples = [(e[1], e[2]) for e in entries]

	# now we retrieve and format the entries
	if entriestuples:
		# summary of entry values first; reuse the counts computed above
		# instead of re-querying the database once per entry
		summary = list()
		for count, (totalhits, term, idval) in enumerate(entries, start=1):
			summary.append((count, term, idval, totalhits))

		summarytemplate = """
		<span class="sensesum">({n})&nbsp;
			<a class="nounderline" href="#{w}_{wdid}">{w}</a>&nbsp;
			<span class="small">({t:,})</span>
		</span>
		"""

		summary = sorted(summary, key=lambda x: x[3], reverse=True)
		summary = [summarytemplate.format(n=e[0], w=e[1], wdid=e[2], t=e[3]) for e in summary]
		returnarray.append('\n<br />\n'.join(summary))

		# then the entries proper
		dbconnection = ConnectionObject()
		dbconnection.setautocommit()
		dbcursor = dbconnection.cursor()

		wordobjects = [probedictionary(setdictionarylanguage(e[0]) + '_dictionary', 'entry_name', e[0], '=', dbcursor=dbcursor, trialnumber=0) for e in entriestuples]
		wordobjects = flattenlistoflists(wordobjects)
		outputobjects = [lexicalOutputObject(w) for w in wordobjects]

		# only number the entries when there is more than one of them
		usecounter = len(outputobjects) > 1

		for count, oo in enumerate(outputobjects, start=1):
			if usecounter:
				entry = oo.generatelexicaloutput(countervalue=count)
			else:
				entry = oo.generatelexicaloutput()
			returnarray.append(entry)

		# release the connection back to the pool, as the sibling routes do
		dbconnection.connectioncleanup()
	else:
		returnarray.append('<br />[nothing found under "{skg}"]'.format(skg=seeking))

	returndict['newhtml'] = '\n'.join(returnarray)
	returndict['newjs'] = '\n'.join([dictionaryentryjs(), insertlexicalbrowserjs()])

	jsondict = json.dumps(returndict)

	del progresspolldict[pollid]

	return jsondict