def generatepreliminaryhitlist(so: SearchObject, recap=hipparchia.config['INTERMEDIATESEARCHCAP']) -> List[dbWorkLine]:
    """

    grab the hits for part one of a two-part search

    INTERMEDIATESEARCHCAP is interesting... you can test via
        "Sought »α« within 1 lines of »ι«"
    400k or so seems to be the practical worst case: if you search for "α" in all of the databases you will get
    392275 lines back as your intermediate result. That is a huge % of the total possible collection of lines,
    but you can pull it in about 5s, so there is no real reason to worry about the cap if using the grabber.

    """

    actualcap = so.cap
    so.cap = recap

    so.poll.statusis('Searching for "{x}"'.format(x=so.termone))
    if so.searchtype == 'phraseandproximity':
        so.poll.statusis('Searching for "{x}"'.format(x=so.phrase))

    if so.lemmaone:
        so.poll.statusis('Searching for all forms of "{x}"'.format(x=so.lemmaone.dictionaryentry))

    hitlines = basicprecomposedsqlsearcher(so)
    so.cap = actualcap

    return hitlines
def checkneedtoabort(so: SearchObject) -> str:
    """

    can/should we even do this?

    """

    if so.iamarobot:
        return str()

    abortjson = str()
    abort = lambda x: emptyvectoroutput(so, x)
    activecorpora = so.getactivecorpora()
    so.poll.statusis('Compiling the list of works to search')
    so.searchlist = compilesearchlist(listmapper, so.session)

    # so.seeking should only be set via a fallback when session['baggingmethod'] == 'unlemmatized'
    if (so.lemmaone or so.tovectorize or so.seeking) and activecorpora:
        pass
    elif not activecorpora:
        abortjson = abort(['no active corpora'])
    elif not so.searchlist:
        abortjson = abort(['empty list of places to look'])
    elif so.vectorquerytype == 'topicmodel':
        # we don't have and don't need a lemmaone, etc.
        pass
    elif so.vectorquerytype == 'analogies':
        if not so.lemmaone or not so.lemmatwo or not so.lemmathree:
            abortjson = abort(['did not have three lemmata'])
    else:
        # note that some vector queries do not need a term; fix this later...
        abortjson = abort(['there was no search term'])

    maxwords = hipparchia.config['MAXVECTORSPACE']
    wordstotal = 0
    for work in so.searchlist:
        work = work[:10]
        try:
            wordstotal += workdict[work].wordcount
        except TypeError:
            # TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'
            pass

    if wordstotal > maxwords:
        m = 'the vector scope max exceeded: {a} > {b} '
        abortjson = abort([m.format(a=locale.format_string('%d', wordstotal, grouping=True),
                                    b=locale.format_string('%d', maxwords, grouping=True))])

    return abortjson
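# an illustrative sketch (not called anywhere) of the scope check inside checkneedtoabort():
# searchlist entries may carry passage suffixes, so the first ten characters are taken as the
# work id proper; a work with no recorded count contributes nothing. the ids, suffix format,
# and counts below are fabricated for the example.
def exampleofscopecheck():
    fakeworkcounts = {'gr0001w001': 50000, 'gr0002w001': None}
    searchlist = ['gr0001w001_AT_3', 'gr0002w001']
    wordstotal = 0
    for work in searchlist:
        work = work[:10]
        try:
            wordstotal += fakeworkcounts[work]
        except TypeError:
            # unparsable/missing count: skip it, just as checkneedtoabort() does
            pass
    return wordstotal  # 50000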
def buildtriplelemmasearchobject(pollid, one, two, three) -> SearchObject:
    """

    build a search object w/ three lemmata

    """

    seeking = str()
    proximate = str()

    if session['baggingmethod'] != 'unlemmatized':
        try:
            termone = lemmatadict[one]
            termtwo = lemmatadict[two]
            termthree = lemmatadict[three]
        except KeyError:
            termone = None
            termtwo = None
            termthree = None
        so = SearchObject(pollid, seeking, proximate, termone, termtwo, session)
        so.lemmathree = termthree
    else:
        so = SearchObject(pollid, one, two, True, True, session)
        so.lemmathree = True
        so.termthree = so.searchtermcleanup(three)

    return so
def headwordsearch(searchid, headform) -> JSON_STR: """ you get sent here via the morphology tables this is a restricted version of executesearch(): a dictionary headword :param searchid: :param headform: :return: """ probeforsessionvariables() inputlemma = cleaninitialquery(headform) try: lemma = lemmatadict[inputlemma] except KeyError: lemma = None pollid = validatepollid(searchid) seeking = str() proximate = str() proximatelemma = str() so = SearchObject(pollid, seeking, proximate, lemma, proximatelemma, session) jsonoutput = executesearch(pollid, so) return jsonoutput
def basicprecomposedsqlsearcher(so: SearchObject, themanager=None) -> List[dbWorkLine]:
    """

    give me sql and I will search

    this function just picks a pathway: use the golang module or do things in-house?

    """

    so.searchsqldict = insertuniqunames(so.searchsqldict)

    if not themanager:
        usesharedlibrary = hipparchia.config['EXTERNALGRABBER']

        if not usesharedlibrary:
            debugmessage('searching via python')
            themanager = precomposedsqlsearchmanager
        else:
            # debugmessage('searching via external helper code')
            themanager = precomposedexternalsearcher

    hits = themanager(so)

    return hits
def updatesearchlistandsearchobject(so: SearchObject) -> SearchObject: """ you have a searchlist; now tell the searchobject more about it... this has been peeled off so that golangvectors() can call it too """ # mark works that have passage exclusions associated with them: # gr0001x001 instead of gr0001w001 if you are skipping part of w001 so.searchlist = flagexclusions(so.searchlist, so.session) so.poll.statusis('Calculating full authors to search') so.searchlist = calculatewholeauthorsearches(so.searchlist, authordict) so.usedcorpora = so.wholecorporasearched() so.poll.statusis('Configuring the search restrictions') so.indexrestrictions = configurewhereclausedata(so.searchlist, workdict, so) return so
def buildsinglelemmasearchobject(pollid: str, one: str) -> SearchObject: """ build a search object w/ one lemma """ try: lemma = lemmatadict[one] except KeyError: lemma = None seeking = str() proximate = str() proximatelemma = str() so = SearchObject(pollid, seeking, proximate, lemma, proximatelemma, session) if so.session['baggingmethod'] == 'unlemmatized': so.seeking = so.searchtermcleanup(one) return so
def precomposedexternalsearcher(so: SearchObject) -> List[dbWorkLine]:
    """

    you are using golang to do the search

    [1] send the searchdict to redis as a list of json.dumps(items) (keyed to the searchid)
    [2] send the external fnc the searchid, cap value, worker #, psql login info, redis login info
    [3] wait for the function to (a) gather; (b) search; (c) store
    [4] pull the results back from redis via the searchid

    NB: redis makes sense because the activity poll is going to have to be done via redis anyway...

    the searched items are stored under the redis key 'searchid_results'
    json.loads() will leave you with a dictionary of k/v pairs that can be turned into a dbWorkLine

    """

    warning = 'attempted to search via external helper but {x} is not available; using precomposedsqlsearchmanager() instead'

    if not gosearch and not haveexternalhelper(getexternalhelperpath()):
        x = 'the external module'
        if not haveexternalhelper(getexternalhelperpath()):
            x = hipparchia.config['EXTERNALBINARYNAME']
        consolewarning(warning.format(x=x), color='red')
        return precomposedsqlsearchmanager(so)

    if not canuseredis:
        consolewarning(warning.format(x='redis'), color='red')
        return precomposedsqlsearchmanager(so)

    rc = establishredisconnection()

    so.searchsqldict = rewritesqlsearchdictforexternalhelper(so)
    # debugmessage('storing search at "{r}"'.format(r=so.searchid))

    for s in so.searchsqldict:
        rc.sadd(so.searchid, json.dumps(so.searchsqldict[s]))

    if not hipparchia.config['GRABBERCALLEDVIACLI']:
        resultrediskey = helpersharedlibrarysearcher(so)
    else:
        resultrediskey = helperclibinarysearcher(so)

    redisresults = redisfetch(resultrediskey)
    hits = [redishitintodbworkline(r) for r in redisresults]

    return hits
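# a minimal sketch, assuming a reachable local redis server and the redis-py client, of the
# store-then-drain handoff described in the docstring above: one json blob per table is parked
# under the searchid and whoever holds that key can drain the set again. 'examplesearchid' and
# the fake query dict are invented for illustration; this helper is not part of the search flow.
def exampleofredishandoff():
    import json
    import redis
    rc = redis.Redis()
    examplesearchid = 'example_0001'
    fakesearchsqldict = {'gr0001': {'query': 'SELECT 1', 'data': ['%α%']}}
    # [1] store one json blob per table under the searchid...
    for tablename in fakesearchsqldict:
        rc.sadd(examplesearchid, json.dumps(fakesearchsqldict[tablename]))
    # [4] ...then pop the items back out on the other side
    found = list()
    while rc.scard(examplesearchid):
        found.append(json.loads(rc.spop(examplesearchid)))
    return found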
def singlewordsearch(searchid, searchterm) -> JSON_STR: """ you get sent here via the morphology tables this is a restricted version of executesearch(): single, exact term WINDOWS ONLY ERROR: this function will trigger a recursion error the situation looks a lot like case #3 @ https://bugs.python.org/issue9592 but that is supposed to be a closed bug cf the complaints at https://forums.fast.ai/t/recursion-error-fastai-v1-0-27-windows-10/30673/10 "multiprocessing\popen_spawn_win32.py" is the culprit? the current 'solution' is to send things to executesearch() instead "if osname == 'nt'" this test is inside morphologychartjs(); this is a potential source of future brittleness to the extent that one wants to explore refactoring executesearch() :param searchid: :param searchterm: :return: """ probeforsessionvariables() pollid = validatepollid(searchid) searchterm = cleaninitialquery(searchterm) seeking = ' {s} '.format(s=searchterm) proximate = str() lemma = None proximatelemma = None so = SearchObject(pollid, seeking, proximate, lemma, proximatelemma, session) jsonoutput = executesearch(pollid, so) return jsonoutput
def rebuildsearchobjectviasearchorder(so: SearchObject) -> SearchObject:
    """

    rewrite the searchobject so that you look for the less common things first

    """

    if so.lemmaone and so.lemmatwo:
        hwone = querytotalwordcounts(so.lemmaone.dictionaryentry)
        hwtwo = querytotalwordcounts(so.lemmatwo.dictionaryentry)
        # from server.hipparchiaobjects.wordcountobjects import dbWordCountObject
        # print('{a}: {b}, {c}: {d}'.format(a=so.lemmaone.dictionaryentry, b=hwone.t, c=so.lemmatwo.dictionaryentry, d=hwtwo.t))
        if hwtwo.t < hwone.t:
            so.lemmaone, so.lemmatwo = so.lemmatwo, so.lemmaone
    elif so.lemma or so.proximatelemma:
        pass
    elif (so.accented or re.search(r'^[a-z]', so.termone)) and so.near:
        # note the parentheses: the terms may only be swapped in an "is near" search
        # choose the necessarily faster option
        unmodifiedskg = massagesearchtermsforwhitespace(so.seeking)
        unmodifiedprx = so.proximate
        leastcommon = findleastcommonterm(unmodifiedskg + ' ' + unmodifiedprx, so.accented)
        if leastcommon != unmodifiedskg:
            so.termone, so.termtwo = so.termtwo, so.termone
    elif len(so.termtwo) > len(so.termone) and so.near:
        # look for the longest word first since that is probably the quicker route
        # but you can't swap searchingfor and proximate this way in an 'is not near' search without yielding the wrong focus
        so.termone, so.termtwo = so.termtwo, so.termone

    return so
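# an illustrative sketch of the reordering idea in rebuildsearchobjectviasearchorder():
# given total counts for two terms, hunt for the rarer one first so that the second pass
# only has to scan a small number of candidate lines. the counts here are fabricated and
# stand in for what querytotalwordcounts() would report.
def exampleoftermreordering(termone='φεύγω', termtwo='ὕβριϲ'):
    fakecounts = {'φεύγω': 230, 'ὕβριϲ': 13}
    if fakecounts[termtwo] < fakecounts[termone]:
        termone, termtwo = termtwo, termone
    return termone, termtwo  # ('ὕβριϲ', 'φεύγω'): the rare term now leads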
def preparesoforsecondsqldict(so: SearchObject, initialhitlines: List[dbWorkLine], usebetweensyntax=True) -> SearchObject:
    """

    after finding initialhitlines sqlwithinxlinessearch() will run a second query

    it needs a new sqldict

    note that "usebetweensyntax=False" will break precomposedphraseandproximitysearch()

    """

    so.indexrestrictions = dict()
    authorsandlines = dict()

    if not usebetweensyntax:
        # consolewarning('sqlwithinxlinessearch(): temptable')
        # time trials...
        # Sought all 13 known forms of »ὕβριϲ« within 4 lines of all 230 known forms of »φεύγω«
        # Searched 7,873 texts and found 9 passages (11.87s)
        # Searched between 400 B.C.E. and 350 B.C.E.
        # Sought all 230 known forms of »φεύγω« within 4 lines of all 16 known forms of »κρίϲιϲ«
        # Searched 7,873 texts and found 12 passages (14.64s)
        # Searched between 400 B.C.E. and 350 B.C.E.
        for hl in initialhitlines:
            linestosearch = list(range(hl.index - so.distance, hl.index + so.distance + 1))
            try:
                authorsandlines[hl.authorid].extend(linestosearch)
            except KeyError:
                authorsandlines[hl.authorid] = linestosearch

        so.searchlist = list(authorsandlines.keys())

        for a in authorsandlines:
            so.indexrestrictions[a] = dict()
            so.indexrestrictions[a]['type'] = 'temptable'
            so.indexrestrictions[a]['where'] = wholeworktemptablecontents(a, set(authorsandlines[a]))
            # print("so.indexrestrictions[a]['where']", so.indexrestrictions[a]['where'])
    else:
        # Sought all 13 known forms of »ὕβριϲ« within 4 lines of all 230 known forms of »φεύγω«
        # Searched 7,873 texts and found 9 passages (9.35s)
        # Searched between 400 B.C.E. and 350 B.C.E.
        # Sought all 230 known forms of »φεύγω« within 4 lines of all 16 known forms of »κρίϲιϲ«
        # Searched 7,873 texts and found 12 passages (11.35s)
        # Searched between 400 B.C.E. and 350 B.C.E.
        # consolewarning('sqlwithinxlinessearch(): between')
        for hl in initialhitlines:
            boundaries = (hl.index - so.distance, hl.index + so.distance)
            try:
                authorsandlines[hl.authorid].append(boundaries)
            except KeyError:
                authorsandlines[hl.authorid] = [boundaries]

        so.searchlist = list(authorsandlines.keys())

        for a in authorsandlines:
            so.indexrestrictions[a] = dict()
            so.indexrestrictions[a]['where'] = dict()
            so.indexrestrictions[a]['type'] = 'between'
            so.indexrestrictions[a]['where']['listofboundaries'] = authorsandlines[a]
            so.indexrestrictions[a]['where']['listofomissions'] = list()

    return so
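# an illustrative sketch of the 'between' restriction shape that preparesoforsecondsqldict()
# emits per author: each initial hit becomes an (index - distance, index + distance) window.
# the hit indices and distance are fabricated; the real 'temptable' variant instead stores
# the output of wholeworktemptablecontents() under 'where'.
def exampleofbetweenrestriction(distance=4):
    hitindices = [100, 250]
    listofboundaries = [(i - distance, i + distance) for i in hitindices]
    return {'type': 'between',
            'where': {'listofboundaries': listofboundaries, 'listofomissions': list()}}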
def dispatchvectorsearch(vectortype: str, searchid: str, one=None, two=None, three=None) -> JSON_STR:
    """

    dispatcher for "/vectors/..." requests

    """

    if not hipparchia.config['SEMANTICVECTORSENABLED']:
        so = SearchObject(str(), str(), str(), str(), str(), session)
        oo = SearchOutputObject(so)
        target = 'searchsummary'
        message = '[semantic vectors have not been enabled]'
        return oo.generatenulloutput(itemname=target, itemval=message)

    pollid = validatepollid(searchid)
    one = depunct(one)
    two = depunct(two)
    three = depunct(three)

    simple = [pollid, one]
    triple = [pollid, one, two, three]

    knownfunctions = {
        'nearestneighborsquery': {'bso': simple, 'pref': 'CONCEPTMAPPINGENABLED'},
        'analogies': {'bso': triple, 'pref': 'VECTORANALOGIESENABLED'},
        'topicmodel': {'bso': simple, 'pref': 'TOPICMODELINGENABLED'},
        'vectortestfunction': {'bso': simple, 'pref': 'TESTINGVECTORBUTTONENABLED'},
        'unused': {'bso': None, 'pref': None},
    }

    if vectortype not in knownfunctions:
        # e.g., a mistyped URL: treat it like a disabled search rather than raising a KeyError
        vectortype = 'unused'

    if not knownfunctions[vectortype]['pref'] or not hipparchia.config[knownfunctions[vectortype]['pref']]:
        return json.dumps('this type of search has not been enabled')

    bso = knownfunctions[vectortype]['bso']

    so = None
    if len(bso) == 4:
        so = buildtriplelemmasearchobject(*bso)
    if len(bso) == 2:
        so = buildsinglelemmasearchobject(*bso)

    so.vectorquerytype = vectortype

    progresspolldict[pollid] = ProgressPoll(pollid)
    so.poll = progresspolldict[pollid]
    so.poll.activate()
    so.poll.statusis('Preparing to vectorize')

    if hipparchia.config['EXTERNALVECTORHELPER']:
        j = externalvectors(so)
    else:
        j = pythonvectors(so)

    if hipparchia.config['JSONDEBUGMODE']:
        print('/vectors/{f}\n\t{j}'.format(f=vectortype, j=j))

    try:
        del so.poll
    except AttributeError:
        pass

    return j
def precomposedsqlwithinxlinessearch(so: SearchObject) -> List[dbWorkLine]:
    """

    after finding x, look for y within n lines of x

    people who send phrases to both halves and/or a lot of regex will not always get what they want

    note that this implementation is significantly slower than the standard withinxlines() + simplewithinxlines()

    """

    initialhitlines = generatepreliminaryhitlist(so)

    # we are going to need a new searchsqldict w/ a new temptable
    # sq = { table1: {query: q, data: d, temptable: t},
    #        table2: {query: q, data: d, temptable: t}, ...
    # this means refeeding searchlistintosqldict() and priming it for a 'temptable' search
    # the temptable follows the paradigm of wholeworktemptablecontents()
    # r {'type': 'temptable', 'where': {'tempquery': '\n\tCREATE TEMPORARY TABLE in0f08_includelist AS \n\t\tSELECT values \n\t\t\tAS includeindex FROM unnest(ARRAY[768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,763,764,765,766,767]) values\n\t'}}

    so = preparesoforsecondsqldict(so, initialhitlines)

    so.searchsqldict = searchlistintosqldict(so, so.termtwo)
    if so.lemmatwo:
        so.lemmaone = so.lemmatwo
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    m = 'Now searching among the {n} initial finds for {l}"{x}"'
    so.poll.statusis(m.format(n=len(initialhitlines), x=so.termtwo, l=str()))
    if so.lemmaone:
        so.poll.statusis(m.format(n=len(initialhitlines), x=so.lemmaone.dictionaryentry, l="all forms of "))

    so.poll.sethits(0)
    newhitlines = basicprecomposedsqlsearcher(so)

    # newhitlines will contain, e.g., in0001w0ig_493 and in0001w0ig_492, i.e., 2 lines that are part of the same 'hit'
    # so we can't use newhitlines directly but have to check it against the initial hits
    # that's fine since "not near" would push us in this direction in any case

    initialhitlinedict = {hl.uniqueid: hl for hl in initialhitlines}
    newhitlineids = set()
    for nhl in newhitlines:
        indices = list(range(nhl.index - so.distance, nhl.index + so.distance + 1))
        ids = ['{a}_{b}'.format(a=nhl.wkuinversalid, b=i) for i in indices]
        newhitlineids.update(ids)

    finalhitlines = list()
    if so.near:
        # "is near"
        finalhitlines = [initialhitlinedict[hl] for hl in initialhitlinedict if hl in newhitlineids]
    elif not so.near:
        # "is not near"
        finalhitlines = [initialhitlinedict[hl] for hl in initialhitlinedict if hl not in newhitlineids]

    return finalhitlines
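# an illustrative sketch of the id-window check above: every second-pass hit projects a window
# of '{work}_{index}' ids, and the first-pass hits are then kept (or, for "is not near",
# dropped) by set membership. the ids, payloads, and distance below are fabricated.
def exampleofwindowmatching(distance=1, near=True):
    initialhits = {'gr0001w001_100': 'lineobject-a', 'gr0001w001_500': 'lineobject-b'}
    secondpassindices = [('gr0001w001', 101)]
    windowids = set()
    for work, idx in secondpassindices:
        windowids.update('{a}_{b}'.format(a=work, b=i) for i in range(idx - distance, idx + distance + 1))
    if near:
        return [initialhits[h] for h in initialhits if h in windowids]  # ['lineobject-a']
    return [initialhits[h] for h in initialhits if h not in windowids]  # ['lineobject-b']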
def buildsearchobject(searchid: str, therequest: request, thesession: session) -> SearchObject:
    """

    generic searchobject builder

    :param searchid:
    :param therequest:
    :param thesession:
    :return:
    """

    whitespace = ' '

    if not searchid:
        searchid = str(int(time.time()))

    probeforsessionvariables()

    # a search can take 30s or more and the user might alter the session while the search is running
    # by toggling onehit, etc.; that can be a problem, so freeze the values now and rely on this copy
    # instead of some moving target
    frozensession = thesession.copy()

    # need to sanitize input at least a bit: remove digits and punctuation
    # the dispatcher will do searchtermcharactersubstitutions() and massagesearchtermsforwhitespace() to take
    # care of lunate sigma, etc.

    seeking = cleaninitialquery(therequest.args.get('skg', ''))
    proximate = cleaninitialquery(therequest.args.get('prx', ''))
    inputlemma = cleaninitialquery(therequest.args.get('lem', ''))
    inputproximatelemma = cleaninitialquery(therequest.args.get('plm', ''))

    try:
        lemma = lemmatadict[inputlemma]
    except KeyError:
        lemma = None

    # print('lo forms', lemma.formlist)
    try:
        proximatelemma = lemmatadict[inputproximatelemma]
    except KeyError:
        proximatelemma = None

    replacebeta = False

    if hipparchia.config['UNIVERSALASSUMESBETACODE'] and re.search('[a-zA-Z]', seeking):
        # why the 'and' condition:
        # sending unicode 'οὐθενὸϲ' to the betacode function will result in 0 hits
        # this is something that could/should be debugged within that function,
        # but in practice it is silly to allow hybrid betacode/unicode; this only
        # makes the life of a person who wants unicode+regex w/ a betacode option more difficult
        replacebeta = True

    if hipparchia.config['TLGASSUMESBETACODE']:
        if justtlg() and (re.search('[a-zA-Z]', seeking) or re.search('[a-zA-Z]', proximate)) \
                and not re.search(minimumgreek, seeking) and not re.search(minimumgreek, proximate):
            replacebeta = True

    if replacebeta:
        seeking = seeking.upper()
        seeking = replacegreekbetacode(seeking)
        seeking = seeking.lower()
        proximate = proximate.upper()
        proximate = replacegreekbetacode(proximate)
        proximate = proximate.lower()

    if seeking == whitespace:
        seeking = str()

    if proximate == whitespace:
        proximate = str()

    so = SearchObject(searchid, seeking, proximate, lemma, proximatelemma, frozensession)

    return so
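# an illustrative sketch of why buildsearchobject() freezes the session: a copy taken before
# the search starts is immune to toggles the user makes while the search is still running.
# the keys and values here are fabricated.
def exampleofsessionfreezing():
    livesession = {'onehit': False}
    frozensession = livesession.copy()
    livesession['onehit'] = True  # the user flips a switch mid-search...
    return frozensession['onehit']  # ...but the frozen copy still says False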
def precomposedsqlphrasesearch(so: SearchObject) -> List[dbWorkLine]:
    """

    you are searching for a relatively rare word: we will keep things simple-ish

    note that the second half of this is not MP, but searches already only take 6s, so clean code probably wins here

    FIXME: can't find the phrases in here...:
        κατεϲκεύαϲεν τὸ ἐνϲόριον FAILS
        ϲεν τὸ ἐνϲόριον το SUCCEEDS

        ch0005w001/2749
        1 Ῥουφεῖνα Ἰουδαία ἀρχι-
        2 ϲυνάγωγοϲ κατεϲκεύα-
        3 ϲεν τὸ ἐνϲόριον τοῖϲ ἀπε- ( match: ἀπελευθέροιϲ )
        4 λευθέροιϲ καὶ θρέμ(μ)αϲιν
        5 μηδενὸϲ ἄλ(λ)ου ἐξουϲίαν ἔ-

    actually, this is a BUILDER problem AND a SERVER problem:

    BUILDER: 2749 does not have κατεϲκεύαϲεν in it

    hipparchiaDB=# select index, accented_line, hyphenated_words from ch0005 where index between 2746 and 2752;
     index |           accented_line           | hyphenated_words
    -------+-----------------------------------+------------------
      2748 | ῥουφεῖνα ἰουδαία ἀρχιϲυνάγωγοϲ    | ἀρχιϲυνάγωγοϲ
      2749 | κατεϲκεύα-                        |
      2750 | ϲεν τὸ ἐνϲόριον τοῖϲ ἀπελευθέροιϲ | ἀπελευθέροιϲ
      2751 | καὶ θρέμμαϲιν                     |
      2752 | μηδενὸϲ ἄλλου ἐξουϲίαν ἔχοντοϲ    | ἔχοντοϲ
    (5 rows)

    SERVER: ἀπελευθέροιϲ καὶ θρέμμαϲιν is missed by precomposedsqlphrasesearch() but it is found by
    precomposedsqlsubqueryphrasesearch()

    maybe it is time to nuke precomposedsqlphrasesearch() after all...

    NB: the dynamic workonphrasesearch() CAN find 'ἀπελευθέροιϲ καὶ θρέμμαϲιν'

    """

    debugmessage('executing a precomposedsqlphrasesearch()')

    so.termone = so.leastcommon
    searchphrase = so.phrase
    phraselen = len(searchphrase.split(' '))

    initialhitlines = generatepreliminaryhitlist(so)

    m = 'Now searching among the {h} initial hits for the full phrase "{p}"'
    so.poll.statusis(m.format(h=so.poll.gethits(), p=so.originalseeking))
    so.poll.sethits(0)

    fullmatches = list()

    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()
    commitcount = 0
    while initialhitlines and len(fullmatches) < so.cap:
        commitcount += 1
        if commitcount == hipparchia.config['MPCOMMITCOUNT']:
            dbconnection.commit()
            commitcount = 0
        hit = initialhitlines.pop()
        wordset = lookoutsideoftheline(hit.index, phraselen - 1, hit.authorid, so, dbcursor)
        if not so.accented:
            wordset = re.sub(r'[.?!;:,·’]', str(), wordset)
        else:
            # the difference is in the apostrophe: δ vs δ’
            wordset = re.sub(r'[.?!;:,·]', str(), wordset)
        if so.near and re.search(searchphrase, wordset):
            fullmatches.append(hit)
            so.poll.addhits(1)
        elif not so.near and re.search(searchphrase, wordset) is None:
            fullmatches.append(hit)
            so.poll.addhits(1)

    dbconnection.connectioncleanup()

    return fullmatches
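# an illustrative sketch of the punctuation handling above: for accented text the apostrophe
# is kept so that elision (δ’) survives, otherwise it is stripped along with everything else.
# the sample string is fabricated.
def exampleofpunctuationstripping(wordset='ὅδε δ’ εἶπεν·', accented=True):
    if not accented:
        return re.sub(r'[.?!;:,·’]', str(), wordset)
    return re.sub(r'[.?!;:,·]', str(), wordset)  # 'ὅδε δ’ εἶπεν': the elision mark survives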
def ldatopicsgenerateoutput(ldavishtmlandjs: str, searchobject: SearchObject):
    """

    pyLDAvis.prepared_data_to_html() outputs something that is almost pure JS and looks like this:

        <link rel="stylesheet" type="text/css" href="https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css">
        <div id="ldavis_el7428760626948328485476648"></div>
        <script type="text/javascript">

        var ldavis_el7428760626948328485476648_data = {"mdsDat": ...

        }
        </script>

    instance = {
        'maxfeatures': 2000,
        'components': 15,  # topics
        'maxfreq': .75,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': 5,  # word must be found >n times
        'iterations': 12,
        'mustbelongerthan': 3
    }

    :param ldavishtmlandjs:
    :param searchobject:
    :return:
    """

    so = searchobject
    activepoll = so.poll
    output = SearchOutputObject(so)

    workssearched = len(so.searchlist)

    vv = searchobject.vectorvalues
    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    lines = ldavishtmlandjs.split('\n')
    lines = [re.sub(r'\t', str(), l) for l in lines if l]

    lines.reverse()

    thisline = str()
    html = list()

    while not re.search(r'<script type="text/javascript">', thisline):
        html.append(thisline)
        try:
            thisline = lines.pop()
        except IndexError:
            # oops, we never found the script...
            thisline = '<script type="text/javascript">'

    # we cut '<script>'; now drop '</script>'
    lines.reverse()
    js = lines[:-1]

    findshtml = '\n'.join(html)
    findsjs = '\n'.join(js)

    ldacssurl = r'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css'
    ldacsslocal = '/css/ldavis.css'
    findshtml = re.sub(ldacssurl, ldacsslocal, findshtml)

    # brittle: ldavis might change its URLs between versions, etc.
    # should probably make this conditional upon the presence of the file locally...
    ldajsurl = r'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.js'
    ldajslocal = '/static/jsforldavis.js'
    findsjs = re.sub(ldajsurl, ldajslocal, findsjs)

    # this next will break the reloaded figure: hm...
    # d3jsurl = r'https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min'
    # d3jslocal = '/static/jsd3'
    # findsjs = re.sub(d3jsurl, d3jslocal, findsjs)
    #
    # print('findsjs',findsjs)

    who = str()
    where = '{n} authors'.format(n=searchobject.numberofauthorssearched())

    if searchobject.numberofauthorssearched() == 1:
        a = authordict[searchobject.searchlist[0][:6]]
        who = a.akaname
        where = who

    if workssearched == 1:
        try:
            w = workdict[searchobject.searchlist[0]]
            w = w.title
        except KeyError:
            w = str()
        where = '{a}, <worktitle>{w}</worktitle>'.format(a=who, w=w)

    output.title = 'Latent Dirichlet Allocation'
    output.found = findshtml
    output.js = findsjs

    output.setscope(workssearched)
    output.sortby = 'weight'
    output.thesearch = 'thesearch'.format(skg='')
    output.resultcount = 'the following topics'
    output.htmlsearch = '{n} topics in {w}'.format(n=settings['components'], w=where)
    output.searchtime = so.getelapsedtime()
    activepoll.deactivate()

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput
def pythonvectors(so: SearchObject) -> JSON_STR:
    """

    this is the matching function to golangvectors()

    [0] test to see what will happen:
        [a] scope problems? [jump away if so...]
        [b] already a model on file? ... [jump down to #5 if so]
    [1] generate a searchlist
    [2] do a searchlistintosqldict()
    [3] acquire and bag the words
        [a] grab db lines that are relevant to the search
        [b] turn them into a unified text block
        [c] do some preliminary cleanups
        [d] break the text into sentences and assemble []SentenceWithLocus (NB: these are "unlemmatized bags of words")
        [e] figure out all of the words used in the passage
        [f] find all of the parsing info relative to these words
        [g] figure out which headwords to associate with the collection of words
        [h] build the lemmatized bags of words ('unlemmatized' can skip [f] and [g]...)
    [4] hand the bags over to Word2Vec(), etc.
    [5] run queries against the model and return the JSON results

    """

    # debugmessage('pythonvectors()')
    assert so.vectorquerytype in ['analogies', 'nearestneighborsquery', 'topicmodel']

    # [0] is this really going to happen?
    so.poll.statusis('Checking for valid search')

    # [i] do we bail out before even getting started?
    # note that this can / will return independently and break here
    abortjson = checkneedtoabort(so)
    if abortjson:
        del so.poll
        return abortjson

    # [ii] do we actually have a model stored already?
    so.poll.statusis('Checking for stored search')

    # calculatewholeauthorsearches() + configurewhereclausedata()
    so = updatesearchlistandsearchobject(so)
    so.setsearchlistthumbprint()

    so.poll.allworkis(-1)  # this turns off the % completed notice in the JS
    so.poll.sethits(0)

    themodel = checkforstoredvector(so)

    if not themodel:
        # [1] generate a searchlist: use executesearch() as the template
        so.usecolumn = 'marked_up_line'
        so.cap = 199999999

        # [2] do a searchlistintosqldict() [this is killing lda...]
        so.searchsqldict = searchlistintosqldict(so, str(), vectors=True)

        bagsofwords = acquireandbagthewords(so)

        # [4] hand the bags over to Word2Vec(), etc.
        so.poll.statusis('Building the model')
        if so.vectorquerytype == 'nearestneighborsquery':
            themodel = buildgensimmodel(so, bagsofwords)
        elif so.vectorquerytype == 'analogies':
            # the same gensim model can serve both analogies and neighbors
            themodel = buildgensimmodel(so, bagsofwords)
        elif so.vectorquerytype == 'topicmodel':
            stops = list(mostcommonwordsviaheadwords())
            bagsofsentences = [' '.join(b) for b in bagsofwords]
            bagsofsentences = [removestopwords(s, stops) for s in bagsofsentences]
            themodel = buildsklearnselectedworks(so, bagsofsentences)
        else:
            pass
    elif so.iamarobot:
        # there is a model and the bot is attempting to build something that has already been built
        return '<!-- MODEL EXISTS -->'

    # so we have a model one way or the other by now...
    # [5] run queries against the model
    if so.iamarobot:
        return '<!-- MODEL BUILT -->'

    if so.vectorquerytype == 'nearestneighborsquery':
        jsonoutput = generatenearestneighbordata(None, len(so.searchlist), so, themodel)
    elif so.vectorquerytype == 'analogies':
        jsonoutput = gensimgenerateanalogies(themodel, so)
    elif so.vectorquerytype == 'topicmodel':
        jsonoutput = ldatopicsgenerateoutput(themodel, so)
    else:
        jsonoutput = json.dumps('python cannot execute {s} queries'.format(s=so.vectorquerytype))

    return jsonoutput
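# a minimal sketch of the stopword scrub that the topicmodel branch above applies to its bags;
# 'fakeremovestopwords' approximates what removestopwords() presumably does and is not the
# project function itself.
def fakeremovestopwords(sentence, stops):
    return ' '.join(w for w in sentence.split() if w not in stops)

# e.g., fakeremovestopwords('καὶ λέγει καὶ ἀκούει', {'καὶ'}) == 'λέγει ἀκούει'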
def precomposedsqlsubqueryphrasesearch(so: SearchObject) -> List[dbWorkLine]:
    """

    use subquery syntax to grab multi-line windows of text for phrase searching

    line ends and line beginning issues can be overcome this way, but then you have plenty of
    bookkeeping to do to get the proper results focused on the right line

    these searches take linear time: same basic time for any given scope regardless of the query

    """

    # rebuild the searchsqldict but this time pass through rewritequerystringforsubqueryphrasesearching()
    so.searchsqldict = searchlistintosqldict(so, so.phrase, subqueryphrasesearch=True)
    # debugmessage('precomposedsqlsubqueryphrasesearch() so.searchsqldict: {d}'.format(d=so.searchsqldict))

    # the windowed collection of lines; you will need to work to find the centers
    # windowing will increase the number of hits: 2+ lines per actual find
    initialhitlines = generatepreliminaryhitlist(so, recap=so.cap * 3)

    m = 'Generating final list of hits by searching among the {h} preliminary hits'
    so.poll.statusis(m.format(h=so.poll.gethits()))
    so.poll.sethits(0)

    sp = re.sub(r'^\s', r'(^|\\s)', so.phrase)
    sp = re.sub(r'\s$', r'(\\s|$)', sp)

    combinations = QueryCombinator(so.phrase)
    # the last item is the full phrase and it will have already been searched: ('one two three four five', '')
    combinations = combinations.combinations()
    combinations.pop()

    listoffinds = list()

    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()

    setofhits = set()

    while initialhitlines:
        # windows of indices come back: e.g., three lines that look like they match when only one matches [3131, 3132, 3133]
        # figure out which line is really the line with the goods
        # it is not nearly so simple as picking the 2nd element in any run of 3: not always runs of 3 + matches in
        # subsequent lines means that you really should check your work carefully; this is not an especially costly
        # operation relative to the whole search and esp. relative to the speed gains of using a subquery search
        lineobject = initialhitlines.pop()
        if not so.onehit or lineobject.authorid not in setofhits:
            if re.search(sp, getattr(lineobject, so.usewordlist)):
                listoffinds.append(lineobject)
                so.poll.addhits(1)
                setofhits.add(lineobject.authorid)
            else:
                try:
                    nextline = initialhitlines[0]
                except IndexError:
                    nextline = makeablankline('gr0000w000', -1)
                if lineobject.wkuinversalid != nextline.wkuinversalid or lineobject.index != (nextline.index - 1):
                    # you grabbed the next line on the pile (e.g., index = 9999), not the actual next line (e.g., index = 101)
                    # usually you won't get a hit by grabbing the next db line, but sometimes you do...
                    query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(wtmpl=worklinetemplate, tb=lineobject.authorid)
                    data = (lineobject.index + 1,)
                    dbcursor.execute(query, data)
                    try:
                        nextline = dblineintolineobject(dbcursor.fetchone())
                    except Exception:
                        nextline = makeablankline('gr0000w000', -1)
                for c in combinations:
                    tail = c[0] + '$'
                    head = '^' + c[1]
                    t = False
                    h = False
                    try:
                        t = re.search(tail, getattr(lineobject, so.usewordlist))
                    except re.error:
                        pass
                    try:
                        h = re.search(head, getattr(nextline, so.usewordlist))
                    except re.error:
                        pass
                    if t and h:
                        listoffinds.append(lineobject)
                        so.poll.addhits(1)
                        setofhits.add(lineobject.authorid)
                        # one match is enough; avoid appending the same line twice
                        break

    dbconnection.connectioncleanup()

    return listoffinds
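# an illustrative sketch of the line-boundary test above: split the phrase at every word break
# and see whether the current line ends with the first part while the next line starts with the
# remainder. this approximates what QueryCombinator presumably yields; the sample lines are
# fabricated (cf. the 'non solum' corner case documented below).
def exampleofheadtailmatching(phrase='non solum', lineend='ut non', nextlinestart='solum in'):
    words = phrase.split()
    combinations = [(' '.join(words[:i]), ' '.join(words[i:])) for i in range(1, len(words))]
    for tail, head in combinations:
        if re.search(tail + '$', lineend) and re.search('^' + head, nextlinestart):
            return True  # the phrase spans the line break
    return False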
def precomposedphraseandproximitysearch(so: SearchObject) -> List[dbWorkLine]: """ do a precomposedsqlsubqueryphrasesearch() and then search inside the results for part two... corner case tester: two line-enders: non solum + temporum dignitatem [12] Caesar, De Bello Gallico: book 7, chapter 54, section 4, line 2 7.54.3.3 multatos agris, omnibus ereptis sociis, imposito stipendio, 7.54.4.1 obsidibus summa cum contumelia extortis, et quam in 7.54.4.2 fortunam quamque in amplitudinem deduxisset, ut non 7.54.4.3 solum in pristinum statum redissent, sed omnium tem- 7.54.4.4 porum dignitatem et gratiam antecessisse viderentur. corner case tester: two distant line-enders: temporum dignitatem + obsides Galliae ut non solum in pristinum statum redissent, sed omnium tem- 7.54.4.3 porum dignitatem et gratiam antecessisse viderentur. his datis mandatis eos ab se dimisit. Noviodunum erat oppidum Haeduorum ad ripas 7.55.1.1 Ligeris opportuno loco positum. huc Caesar omnes ob- 7.55.2.1 sides Galliae, frumentum, pecuniam publicam, suorum the old code will trick you by pretending it is doing a valid search even though it is not really set up to handle this situation and was not supposed to promise that it could do phrase+ [it's the phrase-spanning-two-lines bit that yields the problem since you do "lemma+" but have no handler for the multi-line issue] 0.0.0-1.8.1 Sought all 19 known forms of »χώρα« within 1 lines of »μεγάλην δύναμιν« Searched 3,182 works and found 1 passage (0.77s) Searched between 850 B.C.E. and 300 B.C.E. Sorted by name [1] Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 5, line 47 3c,688,F.5.45 τόπουϲ. (3) γενόμενον δ’ ἀποϲτάτην καὶ πείϲαντα τὸ ϲύμπαν ἔθνοϲ ἀντέχεϲθαι 3c,688,F.5.46 τῆϲ ἐλευθερίαϲ, αἱρεθῆναι ϲτρατηγὸν διὰ τὴν ἀνδρείαν. ἔπειτα πυνθανόμενον 3c,688,F.5.47 ἀθροιζομένην ἐπ’ αὐτὸν μεγάλην δύναμιν, καθοπλίϲαι τοὺϲ Καδουϲίουϲ παν- 3c,688,F.5.48 δημεί, καὶ καταϲτρατοπεδεῦϲαι πρὸϲ ταῖϲ εἰϲ τὴν χώραν εἰϲβολαῖϲ, ἔχοντα 3c,688,F.5.49 τοὺϲ ϲύμπανταϲ οὐκ ἐλάττουϲ εἴκοϲι μυριάδων. (4) τοῦ δὲ βαϲιλέωϲ Ἀρταίου 1.8.2+ Sought all 19 known forms of »χώρα« within 1 lines of »μεγάλην δύναμιν« Searched 2,346 works and found 2 passages (2.2s) Searched between 850 B.C.E. and 300 B.C.E. Sorted by name [1] Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 5, line 47 3c,688,F.5.45 τόπουϲ. (3) γενόμενον δ’ ἀποϲτάτην καὶ πείϲαντα τὸ ϲύμπαν ἔθνοϲ ἀντέχεϲθαι 3c,688,F.5.46 τῆϲ ἐλευθερίαϲ, αἱρεθῆναι ϲτρατηγὸν διὰ τὴν ἀνδρείαν. ἔπειτα πυνθανόμενον 3c,688,F.5.47 ἀθροιζομένην ἐπ’ αὐτὸν μεγάλην δύναμιν, καθοπλίϲαι τοὺϲ Καδουϲίουϲ παν- 3c,688,F.5.48 δημεί, καὶ καταϲτρατοπεδεῦϲαι πρὸϲ ταῖϲ εἰϲ τὴν χώραν εἰϲβολαῖϲ, ἔχοντα 3c,688,F.5.49 τοὺϲ ϲύμπανταϲ οὐκ ἐλάττουϲ εἴκοϲι μυριάδων. (4) τοῦ δὲ βαϲιλέωϲ Ἀρταίου [2] Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 14, line 54 3c,688,F.14.52 (40) καὶ ἐλυπήθη λύπην ϲφοδρὰν Μεγάβυζοϲ, καὶ ἐπένθηϲε, καὶ ἠιτήϲατο 3c,688,F.14.53 ἐπὶ Ϲυρίαν τὴν ἑαυτοῦ χώραν ἀπιέναι. ἐνταῦθα λάθραι καὶ τοὺϲ ἄλλουϲ τῶν 3c,688,F.14.54 Ἑλλήνων προέπεμπε. καὶ ἀπήιει, καὶ ἀπέϲτη βαϲιλέωϲ, καὶ ἀθροίζει μεγάλην 3c,688,F.14.55 δύναμιν ἄχρι πεντεκαίδεκα μυριάδων χωρὶϲ τῶν ἱππέων [καὶ τῶν πεζῶν]. 
    3c,688,F.14.56 καὶ πέμπεται Οὔϲιριϲ κατ’ αὐτοῦ ϲὺν ⟨κ⟩ μυριάϲι, καὶ ϲυνάπτεται πόλεμοϲ, καὶ

    """

    #
    # initially do "within x lines"
    #

    phrasefinder = re.compile(r'[^\s]\s[^\s]')

    if re.search(phrasefinder, so.seeking) and re.search(phrasefinder, so.proximate):
        secondsearch = precomposedsqlsubqueryphrasesearch
    elif not re.search(phrasefinder, so.seeking) and re.search(phrasefinder, so.proximate):
        so.swapseekingandproxmate()
        so.swaplemmaoneandtwo()
        secondsearch = basicprecomposedsqlsearcher
    else:
        secondsearch = basicprecomposedsqlsearcher

    c = so.cap
    ps = so.proximate
    so.proximate = str()
    pl = so.lemmatwo
    so.lemmatwo = str()
    so.phrase = so.seeking
    firstterm = so.phrase

    so.cap = hipparchia.config['INTERMEDIATESEARCHCAP']

    initialhitlines = precomposedsqlsubqueryphrasesearch(so)

    so.seeking = ps
    so.lemmaone = pl
    so.setsearchtype()
    so.cap = c

    if secondsearch == precomposedsqlsubqueryphrasesearch:
        so.phrase = ps
    else:
        so.phrase = str()

    so = preparesoforsecondsqldict(so, initialhitlines)
    so.searchsqldict = searchlistintosqldict(so, so.seeking)
    if so.lemmaone:
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    so.poll.sethits(0)

    newhitlines = secondsearch(so)

    initialhitlinedict = {hl.uniqueid: hl for hl in initialhitlines}
    newhitlineids = set()
    for nhl in newhitlines:
        indices = list(range(nhl.index - so.distance, nhl.index + so.distance + 1))
        ids = ['{a}_{b}'.format(a=nhl.wkuinversalid, b=i) for i in indices]
        newhitlineids.update(ids)

    maybefinalhitlines = list()
    if so.near:
        # "is near"
        maybefinalhitlines = [initialhitlinedict[hl] for hl in initialhitlinedict if hl in newhitlineids]
    elif not so.near:
        # "is not near"
        maybefinalhitlines = [initialhitlinedict[hl] for hl in initialhitlinedict if hl not in newhitlineids]

    #
    # if necessary, do "within x words": those hits will always be a subset of the "within x lines" hits
    #

    if so.lemmaone:
        secondterm = wordlistintoregex(so.lemmaone.formlist)
    else:
        secondterm = so.seeking

    if so.scope == 'words':
        finalhitlines = paredowntowithinxwords(so, firstterm, secondterm, maybefinalhitlines)
    else:
        finalhitlines = maybefinalhitlines

    # to humor rewriteskgandprx()
    # but that formatting doesn't 100% work yet...
    so.termone = firstterm
    so.termtwo = secondterm
    so.lemmatwo = so.lemmaone

    return finalhitlines
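# an illustrative sketch of the phrase detector used at the top of
# precomposedphraseandproximitysearch(): any non-space, space, non-space run means
# "more than one word", i.e., a phrase. the sample terms echo the docstring above.
def exampleofphrasedetection(seeking='μεγάλην δύναμιν'):
    phrasefinder = re.compile(r'[^\s]\s[^\s]')
    return bool(re.search(phrasefinder, seeking))  # True; 'χώρα' would give False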
def precomposedsqlsearch(so: SearchObject) -> List[dbWorkLine]:
    """

    flow control for searching governed by so.searchtype

    speed notes: the speed of these searches is consonant with that of the old search code; usu. <1s difference

    sqlphrasesearch() was eliminated in order to keep the code base more streamlined

    """

    assert so.searchtype in ['simple', 'simplelemma', 'proximity', 'phrase', 'phraseandproximity'], \
        'unknown searchtype sent to precomposedsqlsearch()'

    so.poll.statusis('Executing a {t} search...'.format(t=so.searchtype))

    so.searchsqldict = searchlistintosqldict(so, so.termone)
    if so.lemmaone:
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    searchfnc = lambda x: list()

    if so.searchtype in ['simple', 'simplelemma']:
        searchfnc = basicprecomposedsqlsearcher
    elif so.searchtype == 'proximity':
        # search for the least common terms first: swap termone and termtwo if need be
        so = rebuildsearchobjectviasearchorder(so)
        if so.scope == 'lines':
            # this will hit precomposedsqlsearchmanager() 2x
            searchfnc = precomposedsqlwithinxlinessearch
        else:
            searchfnc = precomposedsqlwithinxwords
    elif so.searchtype == 'phrase':
        so.phrase = so.termone
        # so.leastcommon = findleastcommonterm(so.termone, so.accented)
        searchfnc = precomposedsqlsubqueryphrasesearch
    elif so.searchtype == 'phraseandproximity':
        so.phrase = so.termone
        searchfnc = precomposedphraseandproximitysearch
    else:
        # should be hard to reach this because of the "assert" above
        consolewarning('precomposedsqlsearch() does not support {t} searching'.format(t=so.searchtype), color='red')

    hitlist = searchfnc(so)

    if so.onehit:
        # you might still have two hits from the same author; purge the doubles
        # use the unique keys property of a dict() to do it
        uniqueauthors = {h.authorid: h for h in hitlist}
        hitlist = [uniqueauthors[a] for a in uniqueauthors]

    hitlist = hitlist[:so.cap]

    return hitlist
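# an illustrative sketch of the dict-based dedupe at the end of precomposedsqlsearch():
# keying hits by author id keeps exactly one hit per author (the last one seen).
# the FakeHit class and ids are fabricated stand-ins for dbWorkLine objects.
def exampleofonehitdeduping():
    class FakeHit:
        def __init__(self, authorid):
            self.authorid = authorid
    hitlist = [FakeHit('gr0001'), FakeHit('gr0001'), FakeHit('gr0002')]
    uniqueauthors = {h.authorid: h for h in hitlist}
    return [uniqueauthors[a] for a in uniqueauthors]  # two hits: one per author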
def buildfakesearchobject(qtype='nearestneighborsquery') -> SearchObject:
    """

    do what it takes to build a hollow searchobject

    :return:
    """

    frozensession = dict()

    frozensession['vdim'] = hipparchia.config['VECTORDIMENSIONS']
    frozensession['vwindow'] = hipparchia.config['VECTORWINDOW']
    frozensession['viterat'] = hipparchia.config['VECTORTRAININGITERATIONS']
    frozensession['vminpres'] = hipparchia.config['VECTORMINIMALPRESENCE']
    frozensession['vdsamp'] = hipparchia.config['VECTORDOWNSAMPLE']
    frozensession['vcutloc'] = hipparchia.config['VECTORDISTANCECUTOFFLOCAL']
    frozensession['vcutneighb'] = hipparchia.config['VECTORDISTANCECUTOFFNEARESTNEIGHBOR']
    frozensession['vcutlem'] = hipparchia.config['VECTORDISTANCECUTOFFLEMMAPAIR']
    frozensession['vnncap'] = hipparchia.config['NEARESTNEIGHBORSCAP']
    frozensession['vsentperdoc'] = hipparchia.config['SENTENCESPERDOCUMENT']
    frozensession['ldamaxfeatures'] = hipparchia.config['LDAMAXFEATURES']
    frozensession['ldacomponents'] = hipparchia.config['LDACOMPONENTS']
    frozensession['ldamaxfreq'] = hipparchia.config['LDAMAXFREQ']
    frozensession['ldaminfreq'] = hipparchia.config['LDAMINFREQ']
    frozensession['ldaiterations'] = hipparchia.config['LDAITERATIONS']
    frozensession['ldamustbelongerthan'] = hipparchia.config['LDAMUSTBELONGERTHAN']
    frozensession['baggingmethod'] = hipparchia.config['DEFAULTBAGGINGMETHOD']

    blanks = ['searchscope', 'nearornot']
    for b in blanks:
        frozensession[b] = None

    zeroes = ['proximity', 'maxresults', 'linesofcontext']
    for z in zeroes:
        frozensession[z] = 0

    trueorfalse = ['onehit', 'icandodates', 'nearestneighborsquery', 'searchinsidemarkup']
    for x in trueorfalse:
        frozensession[x] = False

    emptylists = ['agnexclusions', 'agnselections', 'alocexclusions', 'alocselections', 'analogyfinder',
                  'auexclusions', 'auselections', 'psgexclusions', 'psgselections', 'wkexclusions',
                  'wkgnexclusions', 'wkgnselections', 'wkselections', 'wlocexclusions', 'wlocselections']
    for e in emptylists:
        frozensession[e] = list()

    for c in ['christiancorpus', 'latincorpus', 'greekcorpus', 'inscriptioncorpus']:
        frozensession[c] = True

    frozensession['latestdate'] = 1500
    frozensession['earliestdate'] = -850

    so = SearchObject('vectorbot', str(), str(), None, None, frozensession)

    # parsevectorsentences() needs the following:
    so.vectorquerytype = qtype
    so.usecolumn = 'marked_up_line'
    so.sortorder = 'shortname'
    so.iamarobot = True

    return so
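# a hedged usage sketch: the 'vectorbot' pathway presumably builds a hollow searchobject and
# feeds it to pythonvectors(); with iamarobot set, the return value is a placeholder comment
# ('<!-- MODEL EXISTS -->' or '<!-- MODEL BUILT -->') rather than search JSON. this helper is
# illustrative only and not wired into any scheduler.
def exampleofvectorbotusage():
    so = buildfakesearchobject('topicmodel')
    return pythonvectors(so)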