def perseusidmismatch(badworkdbnumber: str, cursor) -> str: """ exception handling: Perseus says you can look something up in gr0006w16, but there is no such thing; go through the work list, pick the 16th, and hope for the best. more common is asking for w001 when really 002 or 003 is the first valid work number :param badworkdbnumber: :param cursor: :return: """ newworkid = '[na]' # print('ick: perseus wants',badworkdbnumber,'but this does not exist') while newworkid == '[na]': query = 'SELECT universalid FROM works WHERE universalid LIKE %s ORDER BY universalid ASC' data = (badworkdbnumber[0:6]+'%',) try: cursor.execute(query, data) works = cursor.fetchall() try: oldnumber = int(badworkdbnumber[8:10]) newworkid = works[oldnumber][0] except IndexError: newworkid = returnfirstwork(badworkdbnumber[0:6], cursor) except psycopg2.DatabaseError as e: consolewarning('perseusidmismatch() - could not execute', query) consolewarning('Error:', e) newworkid = returnfirstwork(badworkdbnumber[0:6], cursor) return newworkid
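# Illustrative sketch (not project code) of how perseusidmismatch() decomposes a bad Perseus id
# like 'gr0006w016': slice out the author prefix and the claimed work number, then hope that row
# exists. The 'works' list and the helper name pickfallbackworkid are fabricated stand-ins for
# the db rows and the real function.
def pickfallbackworkid(badworkdbnumber: str, works: list) -> str:
    authorprefix = badworkdbnumber[0:6]         # e.g. 'gr0006'
    claimednumber = int(badworkdbnumber[8:10])  # e.g. 16
    try:
        return works[claimednumber]
    except IndexError:
        # mirrors the fallback to returnfirstwork(): just take the first work for this author
        return works[0]

sampleworks = ['gr0006w{:03d}'.format(n) for n in range(1, 20)]
print(pickfallbackworkid('gr0006w016', sampleworks))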
def _findpartofspeech(self): tenses = [ 'pres', 'aor', 'fut', 'perf', 'imperf', 'plup', 'futperf', 'part' ] genders = ['masc', 'fem', 'neut', 'masc/neut', 'masc/fem'] if self.analyssiscomponents[0] in tenses: pos = 'conjugated' try: self.analysis = ConjugatedFormAnalysis( self.word, self.language, self.dialects, self.analyssiscomponents) except IndexError: # bad original data; too few items in analyssiscomponents warnstring = 'cannot parse {w}: analysis list is too short: {a}'.format( w=self.word, a=self.analyssiscomponents) consolewarning(warnstring, 'yellow') self.analysis = None elif self.analyssiscomponents[0] in genders: pos = 'declined' try: self.analysis = DeclinedFormAnalysis(self.word, self.language, self.dialects, self.analyssiscomponents) except AssertionError: self.analysis = None else: pos = 'notimplem' self.analysis = None return pos
def setthreadcount(startup=False) -> int: """ used to set worker count on multithreaded functions return either the manual config value or determine it algorithmically :return: """ if not hipparchia.config['AUTOCONFIGWORKERS']: workers = hipparchia.config['WORKERS'] else: workers = int(cpu_count() / 2) + 1 if workers < 1: workers = 1 if workers > cpu_count() and startup: consolewarning( '\nWARNING: thread count exceeds total available number of threads: {a} > {b}' .format(a=workers, b=cpu_count())) consolewarning( 'consider editing "WORKERS" and/or "AUTOCONFIGWORKERS" in "HipparchiaServer/server/settings/performancesettings.py"' ) return workers
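# Quick sketch of the AUTOCONFIGWORKERS formula used above; purely illustrative, no config involved.
from multiprocessing import cpu_count

def autoworkers(cores: int) -> int:
    # int(cores / 2) + 1, floored at 1: e.g. 1 core -> 1, 4 cores -> 3, 8 cores -> 5
    return max(int(cores / 2) + 1, 1)

print({c: autoworkers(c) for c in (1, 2, 4, 8, 16)})
print('on this machine:', autoworkers(cpu_count()))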
def determinevectorworkpile(tempcap=False) -> List[tuple]: """ probe the db for potential vectorization targets :return: """ if not tempcap: cap = hipparchia.config['MAXVECTORSPACE'] else: # real number is just over 93596456 cap = 94000000 if multiprocessing.current_process().name == 'MainProcess': consolewarning( 'the vectorbot is active and searching for items that need to be vectorized', color='green') consolewarning('bagging method has been set to: {b}'.format( b=hipparchia.config['DEFAULTBAGGINGMETHOD'])) authors = [(authordict[a].universalid, authordict[a].countwordsinworks()) for a in authordict] authorsbylength = sorted(authors, key=lambda x: x[1]) # note that we are turning these into one-item lists: genre lists, etc are multi-author lists authortuples = [([a[0]], a[1]) for a in authorsbylength] # print('authortuples[-10:]', authortuples[-10:]) # [(['gr2042'], 1145288), (['gr4013'], 1177392), (['lt0474'], 1207760), (['gr2018'], 1271700), (['gr4089'], 1343587), (['gr4015'], 1422513), (['gr4083'], 1765800), (['gr4090'], 2202504), (['gr0057'], 2594166), (['gr2062'], 4182615)] activelists = [l for l in listmapper if len(listmapper[l]['a']) > 0] corpustuples = list() for item in activelists: corpustuples.append(([ authordict[a].universalid for a in authordict if authordict[a].universalid[:2] == item ], sum([ authordict[a].countwordsinworks() for a in authordict if authordict[a].universalid[:2] == item ]))) # gk 75233496 # lt 7548164 # db 4276074 # in 5485166 # ch 1053646 # you can do the same exercise for genres and time slices # gr up to 300BCE: 13,518,315 workpile = authortuples + corpustuples workpile = [w for w in workpile if w[1] < cap] # test: just caesar # workpile = [(['lt0448'], 999)] return workpile
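# Illustrative sketch (fabricated cap, counts taken from the comments above) of the workpile shape
# that determinevectorworkpile() builds: one-item author lists plus multi-author corpus lists,
# each paired with a word count, then filtered against the cap.
sampleauthortuples = [(['lt0474'], 1207760), (['gr0057'], 2594166), (['gr2062'], 4182615)]
samplecorpustuples = [(['gr0057', 'gr2062'], 6776781), (['lt0474'], 1207760)]
cap = 4000000

workpile = sampleauthortuples + samplecorpustuples
workpile = [w for w in workpile if w[1] < cap]
print(workpile)  # anything at or over the cap (gr2062 and the combined list) drops out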
def startwspolling(theport=None): """ you need a websocket poll server; pick between python and golang as a delivery medium """ if not theport: theport = hipparchia.config['PROGRESSPOLLDEFAULTPORT'] if not gosearch: debugmessage('websockets are to be provided via the python socket server') startpythonwspolling(theport) if hipparchia.config['EXTERNALWEBSOCKETS']: debugmessage('websockets are to be provided via the helper app socket server') helperappwebsocketserver(theport) return if not hipparchia.config['GRABBERCALLEDVIACLI']: debugmessage('websockets are to be provided via the helper app socket server') helperappwebsocketserver(theport) return startpythonwspolling(theport) # actually this function never returns consolewarning(failurestring.format(f='startwspolling()'), color='red') return
def clearselections(category, index=-1) -> JSON_STR: """ a selection gets thrown into the trash :return: """ selectiontypes = ['auselections', 'wkselections', 'psgselections', 'agnselections', 'wkgnselections', 'alocselections', 'wlocselections', 'auexclusions', 'wkexclusions', 'psgexclusions', 'agnexclusions', 'wkgnexclusions', 'alocexclusions', 'wlocexclusions'] if category not in selectiontypes: category = None try: index = int(index) except ValueError: index = -1 if category and index > -1: try: session[category].pop(index) except IndexError: consolewarning('\tclearselections() IndexError when popping {c}[{i}]'.format(c=category, i=str(index)), color='red') pass except KeyError: consolewarning('\tclearselections() KeyError when popping {c}[{i}]'.format(c=category, i=str(index)), color='red') pass session.modified = True return getcurrentselections()
def checkcompatability(): if hipparchia.config['EXTERNALGRABBER']: bin = hipparchia.config['EXTERNALBINARYNAME'] p = getexternalhelperpath(bin) if 'Rust' in bin: vmin = RUSTHELPERMIN pre = '--' else: vmin = GOHELPERMIN pre = '-' commandandarguments = [p, '{p}v'.format(p=pre)] version = subprocess.run(commandandarguments, capture_output=True) version = version.stdout # b'Hipparchia Golang Helper CLI Debugging Interface (v.1.3.2)\n' vfinder = re.compile(r'\(v\.((\d+)\.(\d+)\.(\d+))\)') # extra brackets let you find the components, but you only need v[1]: "1.3.2" # note how "1.3.2b", etc. are doomed to fail... v = re.search(vfinder, str(version)) try: minversion = LooseVersion(vmin) except ValueError: minversion = None consolewarning( 'checkcompatability() failed to parse minimum version string "{v}"' .format(v=vmin)) try: binversion = LooseVersion(v[1]) except ValueError: binversion = None consolewarning( 'checkcompatability() failed to parse version info string "{v}"' .format(v=version)) if not binversion or not minversion: return if binversion >= minversion: debugmessage( 'checkcompatability() says that {b} {x} >= {y}'.format(b=bin, x=v[1], y=vmin)) pass else: w = '{b} is out of date. You have {x}. You need {y}. Some/Many functions are likely to fail.' consolewarning(w.format(b=bin, x=v[1], y=vmin), color='red') w = 'You should either upgrade {b} or disable the helper in "settings/helpersettings.py"' consolewarning(w.format(b=bin)) consolewarning('I am now forcibly disabling the grabber...') hipparchia.config['EXTERNALGRABBER'] = False return
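# Sketch of the version parsing above, run against the sample output quoted in the comment;
# note that a suffixed version like '1.3.2b' will not match this pattern, as the comment warns.
import re
from distutils.version import LooseVersion

sampleoutput = b'Hipparchia Golang Helper CLI Debugging Interface (v.1.3.2)\n'
vfinder = re.compile(r'\(v\.((\d+)\.(\d+)\.(\d+))\)')
v = re.search(vfinder, str(sampleoutput))
print(v[1])        # '1.3.2'
print(v.groups())  # ('1.3.2', '1', '3', '2')
print(LooseVersion(v[1]) >= LooseVersion('1.2.3'))  # True: the comparison that decides compatibility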
def startupprint(message: str, color='white', isbold=False, colorcoded=False, baremessage=True): # consolewarning(message: str, color='yellow', isbold=False, colorcoded=True, baremessage=False) if current_process().name == 'MainProcess': consolewarning(message, color, isbold, colorcoded, baremessage) return
def fetchvectorgraph(imagename) -> bytes: """ grab a graph from the image table so that you can subsequently display it in the browser note that images get deleted after use also note that we hand the data to the db and then immediately grab it out of the db because of constraints imposed by the way flask works :param imagename: :return: """ if hipparchia.config['RETAINFIGURES']: deletewhendone = False else: deletewhendone = True dbconnection = ConnectionObject(ctype='rw') dbconnection.setautocommit() cursor = dbconnection.cursor() q = 'SELECT imagedata FROM public.storedvectorimages WHERE imagename=%s' d = (imagename,) cursor.execute(q, d) imagedata = cursor.fetchone() # need to convert to bytes, otherwise: # AttributeError: 'memoryview' object has no attribute 'read' try: imagedata = bytes(imagedata[0]) except TypeError: # TypeError: 'NoneType' object is not subscriptable # how did this happen... # if you right click and download a graph in Firefox it will try to pull via the URL # but that figure is almost certainly gone unless you are a debugger retaining figures... imagedata = b'' consolewarning('fetchvectorgraph() failed to fetch image {i}'.format(i=imagename)) # print('fetched {n} from vector image table'.format(n=randomid)) # now we should delete the image because we are done with it if deletewhendone: q = 'DELETE FROM public.storedvectorimages WHERE imagename=%s' d = (imagename,) cursor.execute(q, d) dbconnection.connectioncleanup() return imagedata
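# Minimal sketch of the memoryview-to-bytes issue noted above: psycopg2 hands bytea columns back
# as memoryview objects, which flask cannot stream directly, hence the bytes() conversion.
# The stored value here is a fabricated stand-in for cursor.fetchone()[0].
stored = memoryview(b'\x89PNG\r\n\x1a\n...')
try:
    imagedata = bytes(stored)
except TypeError:
    imagedata = b''
print(type(stored).__name__, '->', type(imagedata).__name__)  # memoryview -> bytes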
def precomposedexternalsearcher(so: SearchObject) -> List[dbWorkLine]: """ you are using golang to do the search [1] send the searchdict to redis as a list of json.dumps(items) (keyed to the searchid) [2] send the external fnc the searchid, cap value, worker #, psql login info, redis login info [3] wait for the function to (a) gather; (b) search; (c) store [4] pull the results back from redis via the searchid NB: redis makes sense because the activity poll is going to have to be done via redis anyway... the searched items are stored under the redis key 'searchid_results' json.loads() will leave you with a dictionary of k/v pairs that can be turned into a dbWorkLine """ warning = 'attempted to search via external helper but {x} is not available; using precomposedsqlsearchmanager() instead' if not gosearch and not haveexternalhelper(getexternalhelperpath()): x = 'the external module' if not haveexternalhelper(getexternalhelperpath()): x = hipparchia.config['EXTERNALBINARYNAME'] consolewarning(warning.format(x=x), color='red') return precomposedsqlsearchmanager(so) if not canuseredis: consolewarning(warning.format(x='redis'), color='red') return precomposedsqlsearchmanager(so) rc = establishredisconnection() so.searchsqldict = rewritesqlsearchdictforexternalhelper(so) # debugmessage('storing search at "{r}"'.format(r=so.searchid)) for s in so.searchsqldict: rc.sadd(so.searchid, json.dumps(so.searchsqldict[s])) # if 1 > 0: # consolewarning('precomposedgolangsearcher() merely stored the search in redis and did not execute it') # return list() if not hipparchia.config['GRABBERCALLEDVIACLI']: resultrediskey = helpersharedlibrarysearcher(so) else: resultrediskey = helperclibinarysearcher(so) redisresults = redisfetch(resultrediskey) hits = [redishitintodbworkline(r) for r in redisresults] return hits
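# Hedged sketch (not project code) of step [1] above: each per-table query dict is serialized with
# json.dumps() and parked in redis under the search id. Assumes a local redis server; the search id
# and the query dict are fabricated, and redisfetch()/redishitintodbworkline() are not reproduced here.
import json
import redis

rc = redis.Redis()  # same role as establishredisconnection(), minus the configured login details
searchid = 'demo-search-id'
querydict = {'temptable': '', 'query': 'SELECT * FROM gr0059 WHERE accented_line ~* %s LIMIT 200', 'data': ['δηλοῖ']}

rc.sadd(searchid, json.dumps(querydict))   # [1] store the search under its id
print(json.loads(rc.spop(searchid)))       # what the external helper would pop and execute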
def getbaseform(self): if not hipparchia.config['SUPPRESSWARNINGS']: warn = True else: warn = False if self.amgreek(): return self._getgreekbaseform() elif self.amlatin(): return self._getlatinbaseform() else: if warn: consolewarning( 'MorphPossibilityObject failed to determine its own language: {e}' .format(e=self.entry)) return None
def createvectorstable(): """ zap and reconstitute the storedvectors table :return: """ consolewarning('resetting the stored vectors table', color='green') dbconnection = ConnectionObject(ctype='rw') dbcursor = dbconnection.cursor() query = """ DROP TABLE IF EXISTS public.storedvectors; CREATE TABLE public.storedvectors ( ts timestamp without time zone, thumbprint character varying(32) COLLATE pg_catalog."default", uidlist character varying(32) COLLATE pg_catalog."default", vectortype character varying(24) COLLATE pg_catalog."default", baggingmethod character varying(24) COLLATE pg_catalog."default", calculatedvectorspace bytea ) WITH ( OIDS = FALSE ) TABLESPACE pg_default; ALTER TABLE public.storedvectors OWNER to hippa_wr; GRANT SELECT ON TABLE public.storedvectors TO {reader}; GRANT ALL ON TABLE public.storedvectors TO {writer}; """ query = query.format(reader=hipparchia.config['DBUSER'], writer=hipparchia.config['DBWRITEUSER']) dbcursor.execute(query) dbconnection.connectioncleanup() return
def checkneedtocommit(self, commitcountervalue): # commitcountervalue is an MPCounter? try: v = commitcountervalue.value except AttributeError: v = commitcountervalue if v % self.commitcount == 0: try: getattr(self.dbconnection, 'commit')() except psycopg2.DatabaseError: # psycopg2.DatabaseError: error with status PGRES_TUPLES_OK and no message from the libpq # will return often-but-not-always '2' as the status: i.e., STATUS_IN_TRANSACTION consolewarning( '{c} failed its commit()'.format(c=self.uniquename), color='red') status = self.dbconnection.get_transaction_status() consolewarning('\tConnectionObject {me} status is {s}'.format( me=self.uniquename, s=status)) return
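# Tiny sketch of the periodic-commit logic above: commit only when the shared counter value hits a
# multiple of commitcount. The MPCounter and psycopg2 details are left out; FakeConnection and the
# chosen commitcount are illustrative only.
class FakeConnection:
    def commit(self):
        print('commit()')

commitcount = 250
dbconnection = FakeConnection()
for v in range(1, 1001):
    if v % commitcount == 0:
        dbconnection.commit()  # fires at 250, 500, 750, 1000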
def loadusersdict(knownusersandpasswords=None): """ return the userobjects we know about note that this is effectively empty: no dict of users is being passed ATM anyone with ambitions re. a collection of users should insert them via securitysettings.py KNOWNUSERSDICT = {'user1': 'pass1', 'user2': 'pass2'} elaborate user and authentication schemes are a non-priority (as is encryption...) :return: """ userlist = list() if not knownusersandpasswords and hipparchia.config['KNOWNUSERSDICT']: knownusersandpasswords = hipparchia.config['KNOWNUSERSDICT'] userlist = [ PassUser(k, knownusersandpasswords[k]) for k in knownusersandpasswords ] if hipparchia.config['SETADEFAULTUSER']: thepass = hipparchia.config['DEFAULTREMOTEPASS'] if thepass == 'yourremoteuserpassheretrytomakeitstrongplease': thepass = assignuniquename() consolewarning( 'DEFAULTREMOTEPASS cannot be left as "yourremoteuserpassheretrytomakeitstrongplease"' ) consolewarning( 'temporary one-time password is "{p}"'.format(p=thepass)) defaultuser = PassUser(hipparchia.config['DEFAULTREMOTEUSER'], thepass) userlist.append(defaultuser) # anonymoususer = PassUser('Anonymous', 'NoPassword') # userlist.append(anonymoususer) usersdict = {u.username: u for u in userlist} return usersdict
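# Illustrative sketch of the KNOWNUSERSDICT flow above, with a stand-in for PassUser: only the
# constructor shape and the .username attribute actually used above are assumed here.
class FakePassUser:
    def __init__(self, username, password):
        self.username = username
        self.password = password

knownusersandpasswords = {'user1': 'pass1', 'user2': 'pass2'}
userlist = [FakePassUser(k, knownusersandpasswords[k]) for k in knownusersandpasswords]
usersdict = {u.username: u for u in userlist}
print(sorted(usersdict))  # ['user1', 'user2']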
def createstoredimagestable(): """ zap and reconstitute the storedimages table :return: """ consolewarning('resetting the stored images table', color='green') dbconnection = ConnectionObject(ctype='rw') dbcursor = dbconnection.cursor() query = """ DROP TABLE IF EXISTS public.storedvectorimages; CREATE TABLE public.storedvectorimages ( imagename character varying(12), imagedata bytea ) WITH ( OIDS = FALSE ) TABLESPACE pg_default; ALTER TABLE public.storedvectorimages OWNER to hippa_wr; GRANT SELECT ON TABLE public.storedvectorimages TO {reader}; GRANT ALL ON TABLE public.storedvectorimages TO {writer}; """ query = query.format(reader=hipparchia.config['DBUSER'], writer=hipparchia.config['DBWRITEUSER']) dbcursor.execute(query) dbconnection.connectioncleanup() return
def genericexternalcliexecution(theprogram: str, formatterfunction, so: SearchObject) -> str: """ call a golang cli helper and report the result key. this basically sets you up for either GOLANGCLIBINARYNAME or GOLANGVECTORBINARYNAME, and you will need the relevant formatgolangXXXarguments() too note that the last line of the output of the binary is super-important: it needs to be the result key """ resultrediskey = str() command = getexternalhelperpath(theprogram) commandandarguments = formatterfunction(command, so) try: result = subprocess.run(commandandarguments, capture_output=True) except FileNotFoundError: consolewarning( 'cannot find the golang executable "{x}"'.format(x=command), color='red') return resultrediskey if result.returncode == 0: stdo = result.stdout.decode('UTF-8') outputlist = stdo.split('\n') for o in outputlist: debugmessage(o) resultrediskey = [o for o in outputlist if o] resultrediskey = resultrediskey[-1] # but this looks like: 'results sent to redis as ff128837_results' # so you need a little more work resultrediskey = resultrediskey.split()[-1] else: o = str(result.stdout) e = str(result.stderr) e = re.sub(r'\\n', '\n', e) e = re.sub(r'\\t', '\t', e) consolewarning('{c} returned an error'.format( c=hipparchia.config['EXTERNALBINARYNAME']), color='red') consolewarning('{e}'.format(e=o), color='cyan') consolewarning('{e}'.format(e=e), color='yellow') # debugmessage(repr(result)) return resultrediskey
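# Sketch of the output parsing above: the last non-empty line of the helper's stdout carries the
# redis key, e.g. 'results sent to redis as ff128837_results' (sample value from the comment above).
# The stdout text here is fabricated.
samplestdout = 'gathering lines...\nsearching...\nresults sent to redis as ff128837_results\n'
outputlist = samplestdout.split('\n')
lastline = [o for o in outputlist if o][-1]
resultrediskey = lastline.split()[-1]
print(resultrediskey)  # 'ff128837_results'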
def __init__(self, autocommit='defaultisno', readonlyconnection=True, ctype='ro'): super().__init__(autocommit, readonlyconnection) assert ctype in ['ro', 'rw'], 'connection type must be either "ro" or "rw"' if ctype != 'rw': u = GenericConnectionObject.dbuser p = GenericConnectionObject.dbpass else: u = GenericConnectionObject.dbwriteuser p = GenericConnectionObject.dbwritepass self.readonlyconnection = False try: self.dbconnection = psycopg2.connect( user=u, host=GenericConnectionObject.dbhost, port=GenericConnectionObject.dbport, database=GenericConnectionObject.dbname, password=p) except psycopg2.OperationalError as operror: thefailure = operror.args[0] unknown = 'no pg_hba.conf entry for' if unknown in thefailure: thefailure = 'username and password problem for "DBWRITEUSER": check "securitysettings.py"' consolewarning( GenericConnectionObject.postgresproblem.format(e=thefailure), color='red') sys.exit(0) if self.autocommit == 'autocommit': self.setautocommit() self.setreadonly(self.readonlyconnection) self.curs = getattr(self.dbconnection, 'cursor')() self.thisisafallback = False
def startpythonwspolling(theport): """ launch a websocket poll server tricky because loop.run_forever() will run forever: requires threading the poll is more or less eternal: the library was coded that way, and it is kind of irritating multiple servers on multiple ports are possible, but not yet implemented: a multi-client model is not a priority :param theport: :return: """ try: theport = int(theport) except ValueError: theport = hipparchia.config['PROGRESSPOLLDEFAULTPORT'] theip = hipparchia.config['MYEXTERNALIPADDRESS'] # because we are not in the main thread we cannot ask for the default loop loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) wspolling = websockets.serve(wscheckpoll, theip, port=theport, loop=loop) consolewarning('opening websocket at {p}'.format(p=theport), color='cyan', isbold=False) try: loop.run_until_complete(wspolling) except OSError: consolewarning('websocket could not be launched: cannot get access to {i}:{p}'.format(p=theport, i=theip), color='red') pass try: loop.run_forever() finally: loop.run_until_complete(loop.shutdown_asyncgens()) loop.close() # actually this function never returns consolewarning(failurestring.format(f='startpythonwspolling()'), color='red') return
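# Hedged sketch of a client for the poll server above, using the websockets package directly.
# Not project code: the port number and host are placeholders (use PROGRESSPOLLDEFAULTPORT in
# practice), and the 'send the quoted poll id, read back a json status' shape follows wscheckpoll()
# further down; 'eb91fb11' is the sample id from that docstring.
import asyncio
import json
import websockets

async def pollonce(pollid: str, port: int) -> dict:
    async with websockets.connect('ws://localhost:{p}/'.format(p=port)) as ws:
        await ws.send('"{p}"'.format(p=pollid))  # the server strips the surrounding quotes
        return json.loads(await ws.recv())

# requires a running poll server:
# print(asyncio.get_event_loop().run_until_complete(pollonce('eb91fb11', 5010)))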
if hipparchia.config['POLLCONNECTIONTYPE'] != 'redis' and not hipparchia.config[ 'EXTERNALWEBSOCKETS']: class ProgressPoll(SharedMemoryProgressPoll): pass else: try: import redis c = establishredisconnection() c.ping() canuseredis = True del c except ImportError: canuseredis = False except redis.exceptions.ConnectionError: canuseredis = False if canuseredis: debugmessage('RedisProgressPoll selected') class ProgressPoll(RedisProgressPoll): pass else: consolewarning( 'configuration asked for a RedisProgressPoll, but you cannot connect to redis' ) class ProgressPoll(SharedMemoryProgressPoll): pass
(see LICENSE in the top level directory of the distribution) """ from server import hipparchia from server.formatting.miscformatting import consolewarning try: from flask_wtf import FlaskForm from werkzeug.security import generate_password_hash, check_password_hash from wtforms import StringField from wtforms.validators import DataRequired except ModuleNotFoundError: if hipparchia.config['LIMITACCESSTOLOGGEDINUSERS']: hipparchia.config['LIMITACCESSTOLOGGEDINUSERS'] = False consolewarning( 'flask_wtf and/or wtforms not found: ~/hipparchia_venv/bin/pip install flask_wtf', color='red') consolewarning('forcibly setting LIMITACCESSTOLOGGEDINUSERS to False', color='red') FlaskForm = None generate_password_hash = None check_password_hash = None StringField = None DataRequired = None class PassUser(object): """ log people into hipparchia
def textmaker(author: str, work=None, passage=None, endpoint=None, citationdelimiter='|') -> JSON_STR: """ build a text suitable for display "GET /textof/lt0474/024/20/30" :return: """ probeforsessionvariables() dbconnection = ConnectionObject('autocommit') dbcursor = dbconnection.cursor() linesevery = hipparchia.config['SHOWLINENUMBERSEVERY'] po = TextmakerInputParsingObject(author, work, passage, endpoint, citationdelimiter) ao = po.authorobject wo = po.workobject segmenttext = str() # consolewarning('po.passageaslist: {p}'.format(p=po.passageaslist)) if ao and wo: # we have both an author and a work, maybe we also have a subset of the work if endpoint: firstlinenumber = finddblinefromincompletelocus( wo, po.passageaslist, dbcursor) lastlinenumber = finddblinefromincompletelocus(wo, po.endpointlist, dbcursor, findlastline=True) if firstlinenumber['code'] == 'success' and lastlinenumber[ 'code'] == 'success': startline = firstlinenumber['line'] endline = lastlinenumber['line'] startlnobj = dblineintolineobject( grabonelinefromwork(ao.universalid, startline, dbcursor)) stoplnobj = dblineintolineobject( grabonelinefromwork(ao.universalid, endline, dbcursor)) else: msg = '"buildtexttospan/" could not find first and last: {a}w{b} - {c} TO {d}' consolewarning( msg.format(a=author, b=work, c=passage, d=endpoint)) startlnobj = makeablankline(work, 0) stoplnobj = makeablankline(work, 1) startline = 0 endline = 1 segmenttext = 'from {a} to {b}'.format(a=startlnobj.shortlocus(), b=stoplnobj.shortlocus()) elif not po.passageaslist: # whole work startline = wo.starts endline = wo.ends else: startandstop = textsegmentfindstartandstop(ao, wo, po.passageaslist, dbcursor) startline = startandstop['startline'] endline = startandstop['endline'] texthtml = buildtext(wo.universalid, startline, endline, linesevery, dbcursor) else: texthtml = str() if hipparchia.config['INSISTUPONSTANDARDANGLEBRACKETS']: texthtml = gtltsubstitutes(texthtml) if not segmenttext: segmenttext = '.'.join(po.passageaslist) if not ao or not wo: ao = makeanemptyauthor('gr0000') wo = makeanemptywork('gr0000w000') results = dict() results['authorname'] = avoidsmallvariants(ao.shortname) results['title'] = avoidsmallvariants(wo.title) results['structure'] = avoidsmallvariants(wo.citation()) results['worksegment'] = segmenttext results['texthtml'] = texthtml results = json.dumps(results) dbconnection.connectioncleanup() return results
def buildindexto(searchid: str, author: str, work=None, passage=None, endpoint=None, citationdelimiter='|', justvocab=False) -> JSON_STR: """ build a complete index to a an author, work, or segment of a work :return: """ probeforsessionvariables() pollid = validatepollid(searchid) starttime = time.time() progresspolldict[pollid] = ProgressPoll(pollid) progresspolldict[pollid].activate() dbconnection = ConnectionObject('autocommit') dbcursor = dbconnection.cursor() po = IndexmakerInputParsingObject(author, work, passage, endpoint, citationdelimiter) ao = po.authorobject wo = po.workobject psg = po.passageaslist stop = po.endpointlist if not work: wo = makeanemptywork('gr0000w000') # bool useheadwords = session['headwordindexing'] allworks = list() output = list() cdict = dict() segmenttext = str() valid = True if ao and work and psg and stop: start = psg firstlinenumber = finddblinefromincompletelocus(wo, start, dbcursor) lastlinenumber = finddblinefromincompletelocus(wo, stop, dbcursor, findlastline=True) if firstlinenumber['code'] == 'success' and lastlinenumber[ 'code'] == 'success': cdict = { wo.universalid: (firstlinenumber['line'], lastlinenumber['line']) } startln = dblineintolineobject( grabonelinefromwork(ao.universalid, firstlinenumber['line'], dbcursor)) stopln = dblineintolineobject( grabonelinefromwork(ao.universalid, lastlinenumber['line'], dbcursor)) else: msg = '"indexspan/" could not find first and last: {a}w{b} - {c} TO {d}' consolewarning(msg.format(a=author, b=work, c=passage, d=endpoint)) startln = makeablankline(work, 0) stopln = makeablankline(work, 1) valid = False segmenttext = 'from {a} to {b}'.format(a=startln.shortlocus(), b=stopln.shortlocus()) elif ao and work and psg: # subsection of a work of an author progresspolldict[pollid].statusis( 'Preparing a partial index to {t}'.format(t=wo.title)) startandstop = textsegmentfindstartandstop(ao, wo, psg, dbcursor) startline = startandstop['startline'] endline = startandstop['endline'] cdict = {wo.universalid: (startline, endline)} elif ao and work: # one work progresspolldict[pollid].statusis( 'Preparing an index to {t}'.format(t=wo.title)) startline = wo.starts endline = wo.ends cdict = {wo.universalid: (startline, endline)} elif ao: # whole author allworks = [ '{w} ⇒ {t}'.format(w=w.universalid[6:10], t=w.title) for w in ao.listofworks ] allworks.sort() progresspolldict[pollid].statusis( 'Preparing an index to the works of {a}'.format(a=ao.shortname)) for wkid in ao.listworkids(): cdict[wkid] = (workdict[wkid].starts, workdict[wkid].ends) else: # we do not have a valid selection valid = False output = ['invalid input'] if not stop: segmenttext = '.'.join(psg) if valid and justvocab: dbconnection.connectioncleanup() del progresspolldict[pollid] return cdict if valid: output = buildindextowork(cdict, progresspolldict[pollid], useheadwords, dbcursor) # get ready to send stuff to the page count = len(output) try: locale.setlocale(locale.LC_ALL, 'en_US') count = locale.format_string('%d', count, grouping=True) except locale.Error: count = str(count) progresspolldict[pollid].statusis('Preparing the index HTML') indexhtml = wordindextohtmltable(output, useheadwords) buildtime = time.time() - starttime buildtime = round(buildtime, 2) progresspolldict[pollid].deactivate() if not ao: ao = makeanemptyauthor('gr0000') results = dict() results['authorname'] = avoidsmallvariants(ao.shortname) results['title'] = avoidsmallvariants(wo.title) results['structure'] = avoidsmallvariants(wo.citation()) results['worksegment'] = 
segmenttext results['elapsed'] = buildtime results['wordsfound'] = count results['indexhtml'] = indexhtml results['keytoworks'] = allworks results['newjs'] = supplementalindexjs() results = json.dumps(results) dbconnection.connectioncleanup() del progresspolldict[pollid] return results
def _getgreekbaseform(self) -> str: """ the tricky bit: some are quite easy: 'ἐπώνυμοϲ' others are compounds with a ', ' separation there is a HUGE PROBLEM in the original data here: [a] 'ὑπό, ἐκ-ἀράω²': what comes before the comma is a prefix to the verb [b] 'ἠχούϲαϲ, ἠχέω': what comes before the comma is an observed form of the verb when you .split() what do you have at wordandform[0]? you have to look at the full db entry for the word: the number of items in prefixrefs corresponds to the number of prefix checks you will need to make to recompose the verb :return: """ if not hipparchia.config['SUPPRESSWARNINGS']: warn = True else: warn = False # need an aspiration check; incl εκ -⟩ εξ baseform = str() segments = self.entry.split(', ') if len(segments) == 1 and '-' not in segments[-1]: # [a] the simplest case where what you see is what you should seek: 'ἐπώνυμοϲ' baseform = segments[-1] elif len(segments ) == 2 and '-' not in segments[-1] and self.prefixcount == 0: # [b] a compound case, but it does not involve prefixes just morphology: 'ἠχούϲαϲ, ἠχέω' baseform = segments[-1] elif len(segments) == 1 and '-' in segments[-1]: # [c] the simplest version of a prefix: ἐκ-ϲύρω baseform = gkattemptelision(segments[-1]) elif len(segments ) == 2 and '-' in segments[-1] and self.prefixcount == 1: # [d] more info, but we do not need it: ἐκϲύρωμεν, ἐκ-ϲύρω baseform = gkattemptelision(segments[-1]) elif len( segments) > 1 and '-' in segments[-1] and self.prefixcount > 1: # [e] all bets are off: ὑπό,κατά,ἐκ-λάω # print('segments',segments) for i in range(self.prefixcount - 2, -1, -1): baseform = gkattemptelision(segments[-1]) try: baseform = segments[i] + '-' + baseform except IndexError: if warn: consolewarning( 'abandoning efforts to parse {e}'.format( e=self.entry)) baseform = segments[-1] else: if warn: consolewarning( 'MorphPossibilityObject.getbaseform() is confused: {e} - {s}' .format(e=self.entry, s=segments)) # not sure this ever happens with the greek data baseform = re.sub(r'^\s', str(), baseform) return baseform
def verbtabletemplate(mood: str, voice: str, dialect='attic', duals=True, lang='greek') -> str: """ Smythe §383ff cells look like: <td class="morphcell">_attic_subj_pass_pl_2nd_pres_</td> :return: """ if not session['morphduals']: duals = False mytenses = dict() if lang == 'greek': try: mytenses = findmygreektenses(mood, voice) except AssertionError: # hipparchiaDB=# select * from greek_morphology where possible_dictionary_forms like '%ἀπαλλάϲϲω%' and possible_dictionary_forms like '%2nd%' and possible_dictionary_forms like '%imperat%' and possible_dictionary_forms like '%attic%'; # you will see: <possibility_4>ἀπαλλάϲϲω<xref_value>11723310</xref_value><xref_kind>9</xref_kind><transl>set free</transl><analysis>aor imperat 2nd dual</analysis></possibility_4> # contrast: <possibility_6>ἀπαλλάϲϲω<xref_value>11723310</xref_value><xref_kind>9</xref_kind><transl>set free</transl><analysis>aor ind pass 2nd dual (homeric ionic)</analysis></possibility_6> consolewarning( 'invalid parser data: cannot build a table where mood = "{m}" and voice = "{v}"' .format(m=mood, v=voice)) mytenses = dict() if lang == 'latin': mytenses = findmylatintenses(mood, voice) tabletemplate = """ <table class="verbanalysis"> <tbody> {header} {rows} </tbody> </table> <hr class="styled"> """ headerrowtemplate = """ <tr align="center"> <td rowspan="1" colspan="{s}" class="dialectlabel">{dialect}<br> </td> </tr> <tr align="center"> <td rowspan="1" colspan="{s}" class="voicelabel">{voice}<br> </td> </tr> <tr align="center"> <td rowspan="1" colspan="{s}" class="moodlabel">{mood}<br> </td> {tenseheader} </tr>""" tensestemplate = """<tr> <td class="tenselabel"> </td> {alltenses} </tr>""" blank = """ <tr><td> </td>{columns}</tr> """ blankrow = blank.format(columns=''.join( ['<td> </td>' for k in sorted(mytenses.keys()) if mytenses[k]])) tensecell = '<td class="tensecell">{t}<br></td>' tenserows = [ tensecell.format(t=mytenses[k]) for k in sorted(mytenses.keys()) if mytenses[k] ] tenserows = '\n\t\t'.join(tenserows) tenseheader = tensestemplate.format(alltenses=tenserows) fullheader = headerrowtemplate.format(dialect=dialect, voice=voice, mood=mood, tenseheader=tenseheader, s=len(mytenses)) allrows = list() # cell arrangement: left to right and top to bottom is vmnpt # i.e., voice, mood, number, person, tense # we are doing the npt part here # the lists of numbers, persons, tenses, etc is set to match the parser abbreviations cases = list() if lang == 'latin': cases = ['nom', 'gen', 'dat', 'acc', 'abl', 'voc'] if lang == 'greek': cases = ['nom', 'gen', 'dat', 'acc', 'voc'] genders = ['masc', 'fem', 'neut'] if duals: numbers = ['sg', 'dual', 'pl'] else: numbers = ['sg', 'pl'] persons = ['1st', '2nd', '3rd'] tensedict = { 1: 'pres', 2: 'imperf', 3: 'fut', 4: 'aor', 5: 'perf', 6: 'plup', 7: 'futperf' } tenses = [tensedict[k] for k in sorted(mytenses.keys()) if mytenses[k]] morphrowtemplate = """ <tr class="morphrow"> {allcells} </tr> """ morphlabelcell = '<td class="morphlabelcell">{ml}</td>' morphcell = '<td class="morphcell">{mo}</td>' regextemplate = '_{d}_{m}_{v}_{n}_{p}_{t}_' pcpltemplate = '_{d}_{m}_{v}_{n}_{t}_{g}_{c}_' # note that we cant do infinitives and participles yet if mood != 'part' and mood != 'inf': for n in numbers: for p in persons: if p == '1st' and n == 'dual': pass else: allcellsinrow = list() ml = '{n} {p}'.format(n=n, p=p) allcellsinrow.append(morphlabelcell.format(ml=ml)) for t in tenses: mo = regextemplate.format(d=dialect, m=mood, v=voice, n=n, p=p, t=t) allcellsinrow.append(morphcell.format(mo=mo)) 
thisrow = '\n\t\t'.join(allcellsinrow) allrows.append(morphrowtemplate.format(allcells=thisrow)) elif mood == 'part': for n in numbers: for g in genders: for c in cases: allcellsinrow = list() ml = '{g} {n} {c}'.format(n=n, c=c, g=g) allcellsinrow.append(morphlabelcell.format(ml=ml)) for t in tenses: mo = pcpltemplate.format(d=dialect, m=mood, v=voice, n=n, c=c, t=t, g=g) allcellsinrow.append(morphcell.format(mo=mo)) thisrow = '\n\t\t'.join(allcellsinrow) allrows.append(morphrowtemplate.format(allcells=thisrow)) if session['morphemptyrows']: allrows.append(blankrow) elif mood == 'inf': allcellsinrow = list() allcellsinrow.append(morphlabelcell.format(ml='infinitive')) for t in tenses: mo = regextemplate.format(d=dialect, m=mood, v=voice, n=None, p=None, t=t) allcellsinrow.append(morphcell.format(mo=mo)) thisrow = '\n\t\t'.join(allcellsinrow) allrows.append(morphrowtemplate.format(allcells=thisrow)) rows = '\n'.join(allrows) thetablehtml = tabletemplate.format(header=fullheader, rows=rows) return thetablehtml
def finddblinefromlocus(workobject: dbOpus, citationtuple: tuple, dbcursor, findlastline=False) -> int: """ citationtuple ('9','109','8') to focus on line 9, section 109, book 8 finddblinefromlocus(h, 1, ('130', '24')) ---> 15033 findlastline lets you grab the end of a segment: needed for the "endpoint" selection code :param workobject: :param citationtuple: :param dbcursor: :return: """ workid = workobject.universalid lmap = { 0: 'level_00_value', 1: 'level_01_value', 2: 'level_02_value', 3: 'level_03_value', 4: 'level_04_value', 5: 'level_05_value' } workdb = workid[0:6] if workid[0:2] in ['in', 'dp', 'ch']: wklvs = 2 else: wklvs = workobject.availablelevels if wklvs != len(citationtuple): consolewarning( 'mismatch between shape of work and browsing request: impossible citation of {w}.' .format(w=workid)) print(str(wklvs), ' levels vs', list(citationtuple)) print('safe to ignore if you requested the first line of a work') # step one: find the index number of the passage query = 'SELECT index FROM {w} WHERE ( wkuniversalid=%s ) AND '.format( w=workdb) lq = list() for level in range(0, len(citationtuple)): lq.append('{l}=%s'.format(l=lmap[level])) if not findlastline: query = query + ' AND '.join(lq) + ' ORDER BY index ASC' else: query = query + ' AND '.join(lq) + ' ORDER BY index DESC' # if the last selection box was empty you are sent '_0' instead of a real value # (because the first line of lvl05 is not necc. '1') # so we need to kill off 'level_00_value=%s AND ', etc # example: ('-1', '256', 'beta') [here the 1st line is actually '10t', btw] citation = list(citationtuple) if citation[0] == '_0': query = re.sub(r'level_00_value=%s AND ', '', query) citation = citation[1:] if not citation: indexvalue = workdict[workid].starts return indexvalue data = tuple([workid] + citation) try: dbcursor.execute(query, data) found = dbcursor.fetchone() indexvalue = found[0] except TypeError: # TypeError: 'NoneType' object is not subscriptable indexvalue = returnfirstorlastlinenumber(workdb, dbcursor) # print('finddblinefromlocus() - indexvalue:', indexvalue) return indexvalue
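# Sketch of how the locus query above is assembled for citationtuple ('9', '109', '8'):
# one level_NN_value clause per citation level, ordered ASC for the first matching line
# (or DESC when findlastline is wanted). The table name and work id are fabricated.
lmap = {0: 'level_00_value', 1: 'level_01_value', 2: 'level_02_value'}
citationtuple = ('9', '109', '8')
workdb = 'gr0012'

query = 'SELECT index FROM {w} WHERE ( wkuniversalid=%s ) AND '.format(w=workdb)
lq = ['{l}=%s'.format(l=lmap[level]) for level in range(len(citationtuple))]
query = query + ' AND '.join(lq) + ' ORDER BY index ASC'
data = tuple(['gr0012w001'] + list(citationtuple))
print(query)
print(data)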
def resetpool(): # dangerous to do this while anything interesting is going on # currently checking to see if it needs cleaning at the head of searchdispatcher consolewarning('emptying out PooledConnectionObject._pools()') PooledConnectionObject._pools = dict() PooledConnectionObject.poolneedscleaning = False
except ImportError: from multiprocessing import current_process if current_process().name == 'MainProcess': print('gensim not available') Word2Vec = None try: from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer from sklearn.linear_model import SGDClassifier from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD except ImportError: if current_process().name == 'MainProcess': consolewarning('sklearn is unavailable', color='black') CountVectorizer = None TfidfTransformer = None TfidfVectorizer = None SGDClassifier = None GridSearchCV = None Pipeline = None NMF = None LatentDirichletAllocation = None TruncatedSVD = None try: # will hurl out a bunch of DeprecationWarning messages at the moment... # lib/python3.6/re.py:191: DeprecationWarning: bad escape \s import pyLDAvis import pyLDAvis.sklearn as ldavis except ImportError: if current_process().name == 'MainProcess': consolewarning('pyLDAvis is unavailable', color='black') pyLDAvis = None
async def wscheckpoll(websocket, path): """ a poll checker started by startwspolling(): the client sends the name of a poll and this will output the status of the poll continuously while the poll remains active example: progress {'active': 1, 'total': 20, 'remaining': 20, 'hits': 48, 'message': 'Putting the results in context', 'elapsed': 14.0, 'extrainfo': '<span class="small"></span>'} :param websocket: :param path: :return: """ try: pollid = await websocket.recv() except websockets.exceptions.ConnectionClosed: # you reloaded the page return # comes to us with quotes: "eb91fb11" --> eb91fb11 pollid = re.sub(r'"', str(), pollid) pollid = validatepollid(pollid) while True: progress = dict() try: active = progresspolldict[pollid].getactivity() progress['ID'] = pollid progress['Poolofwork'] = progresspolldict[pollid].worktotal() progress['Remaining'] = progresspolldict[pollid].getremaining() progress['Hitcount'] = progresspolldict[pollid].gethits() progress['Statusmessage'] = progresspolldict[pollid].getstatus() progress['Launchtime'] = progresspolldict[pollid].getlaunchtime() if not hipparchia.config['SUPPRESSLONGREQUESTMESSAGE']: if progresspolldict[pollid].getnotes(): progress['Notes'] = progresspolldict[pollid].getnotes() else: progress['Notes'] = str() except KeyError: # the poll key is deleted from progresspolldict when the query ends; you will always end up here progress['Active'] = 'inactive' try: await websocket.send(json.dumps(progress)) except websockets.exceptions.ConnectionClosed: # you reloaded the page in the middle of a search and both the poll and the socket vanished pass break except TypeError: # TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType # the poll is gone... break await asyncio.sleep(.4) # print(progress) # print('progress %', ((progress['Poolofwork'] - progress['Remaining']) / progress['Poolofwork']) * 100) try: # something changed amid backend updates and json.dumps() started choking on progresspolldict[pollid].getactivity() # active is (now) a <Synchronized wrapper for c_byte(1)>; that was the unexpected change: it was 'bool' # <class 'multiprocessing.sharedctypes.Synchronized'> progress['Active'] = active.value except AttributeError: # AttributeError: 'str' (or 'int' or 'bool') object has no attribute 'value' progress['Active'] = active try: await websocket.send(json.dumps(progress)) except websockets.exceptions.ConnectionClosed: # websockets.exceptions.ConnectionClosed because you reloaded the page in the middle of a search pass except TypeError as e: # "Object of type Synchronized is not JSON serializable" # macOS and indexmaker combo is a problem; macOS is the real problem? consolewarning('websocket non-fatal error: "{e}"'.format(e=e), color='yellow', isbold=False) pass return
def searchlistintosqldict(searchobject: SearchObject, seeking: str, subqueryphrasesearch=False, vectors=False) -> dict: """ take a searchobject grab its searchlist and its exceptionlist and convert them into a collection of sql queries the old strategy would generate the queries as needed and on the fly: this version is slower can costs more memory by definition; it generates all possible queries and it holds them in memory; nevertheless the speed cost should be negligible relative to the total cost of a search; the memory cost can only get interesting if you have lots of users; but here too the overload problem should come from too much postgres and not too much prep in any case these lists of queries can be handed off to a simple MP-aware helper binary that can dodge MP forking in python; this binary can be in rust or go or ... { table1: {query: q, data: d, temptable: t}, table2: {query: q, data: d, temptable: t}, ... } note that a temptable is seldom used but something like searching inside a date range in an inscriptional corpus will trigger the need for one example: δηλοῖ in Aristotle + 3 works of Plato { 'gr0086': { 'temptable': '', 'query': 'SELECT wkuniversalid, index, level_05_value, level_04_value, level_03_value, level_02_value, level_01_value, level_00_value, marked_up_line, accented_line, stripped_line, hyphenated_words, annotations FROM gr0086 WHERE ( accented_line ~* %s ) LIMIT 200', 'data': ('δηλοῖ',) }, 'gr0059': { 'temptable': '', 'query': 'SELECT wkuniversalid, index, level_05_value, level_04_value, level_03_value, level_02_value, level_01_value, level_00_value, marked_up_line, accented_line, stripped_line, hyphenated_words, annotations FROM gr0059 WHERE ( (index BETWEEN 40842 AND 52799) OR (index BETWEEN 2172 AND 4884) OR (index BETWEEN 1 AND 677) ) AND ( accented_line ~* %s ) LIMIT 200', 'data': ('δηλοῖ',) } } 'ch0814': {'temptable': '\n\tCREATE TEMPORARY TABLE ch0814_includelist_UNIQUENAME AS \n\t\tSELECT values \n\t\t\tAS includeindex FROM unnest(ARRAY[11380,11381,11382,11383,11384,11385,11386,11387,11388]) values\n\t', 'query': 'SELECT wkuniversalid, index, level_05_value, level_04_value, level_03_value, level_02_value, level_01_value, level_00_value, marked_up_line, accented_line, stripped_line, hyphenated_words, annotations FROM ch0814 WHERE \n EXISTS\n (SELECT 1 FROM ch0814_includelist_UNIQUENAME incl WHERE incl.includeindex = ch0814.index\n ', 'data': ('',)} a bit fiddly because more than one class of query is constructed here: vanilla, subquery, vector... 
""" returndict = dict() so = searchobject searchlist = so.indexrestrictions.keys() # templimits are used by proximity searching but so.cap should have been temporarily swapped out lim = str(so.cap) if so.onehit: mylimit = ' ORDER BY index ASC LIMIT 1' else: mylimit = ' ORDER BY index ASC LIMIT {lim}'.format(lim=lim) mysyntax = '~*' # print(so.indexrestrictions) for authortable in searchlist: r = so.indexrestrictions[authortable] whereextensions = str() returndict[authortable] = dict() returndict[authortable]['temptable'] = str() if r['type'] == 'between': whereextensions = buildbetweenwhereextension(authortable, so) if not subqueryphrasesearch and not vectors: whr = 'WHERE {xtn} ( {c} {sy} %s )'.format(c=so.usecolumn, sy=mysyntax, xtn=whereextensions) else: # whereextensions will come back with an extraneous ' AND' whereextensions = whereextensions[:-4] whr = 'WHERE {xtn}'.format(xtn=whereextensions) elif r['type'] == 'unrestricted': if not subqueryphrasesearch and not vectors: whr = 'WHERE {xtn} ( {c} {sy} %s )'.format(c=so.usecolumn, sy=mysyntax, xtn=whereextensions) else: whr = str() elif r['type'] == 'temptable': # how to construct the table... # note that the temp table name can't be assigned yet because you can get collisions via lemmatization # since that will give you more than one query per author table: gr1001_0, gr1001_1, ... q = r['where']['tempquery'] q = re.sub('_includelist', '_includelist_UNIQUENAME', q) returndict[authortable]['temptable'] = q # how to SELECT inside the table... wtempate = """ EXISTS (SELECT 1 FROM {tbl}_includelist_UNIQUENAME incl WHERE incl.includeindex = {tbl}.index """ whereextensions = wtempate.format(tbl=authortable) if not vectors: whr = 'WHERE {xtn} AND {au}.{col} {sy} %s)'.format(au=authortable, col=so.usecolumn, sy=mysyntax, xtn=whereextensions) else: whr = 'WHERE {xtn} )'.format(xtn=whereextensions) else: # should never see this consolewarning('error in substringsearch(): unknown whereclause type', r['type']) whr = 'WHERE ( {c} {sy} %s )'.format(c=so.usecolumn, sy=mysyntax) if not subqueryphrasesearch and not vectors: qtemplate = 'SELECT {wtmpl} FROM {db} {whr} {lm}' q = qtemplate.format(wtmpl=worklinetemplate, db=authortable, whr=whr, lm=mylimit) elif vectors: q = 'SELECT {wtmpl} FROM {db} {whr}'.format(wtmpl=worklinetemplate, db=authortable, whr=whr) else: if r['type'] == 'temptable': ttstripper = True else: ttstripper = False q = rewritequerystringforsubqueryphrasesearching(authortable, whr, ttstripper, so) d = (seeking,) returndict[authortable]['query'] = q returndict[authortable]['data'] = d # consolewarning("{a}:\nq\t{q}\nd\t{d}\nt\t{t}".format(a=authortable, q=q, d=d, t=returndict[authortable]['temptable']), color="cyan") return returndict
def __init__(self, autocommit='defaultisno', readonlyconnection=True, ctype='ro'): super().__init__(autocommit, readonlyconnection) self.cytpe = ctype if not PooledConnectionObject._pools: # initialize the borg # note that poolsize is implicitly a claim about how many concurrent users you imagine having poolsize = setthreadcount() + 3 # three known pool types; simple should be faster as you are avoiding locking pooltype = connectionpool.SimpleConnectionPool # pooltype = connectionpool.ThreadedConnectionPool # pooltype = connectionpool.PersistentConnectionPool # [A] 'ro' pool kwds = { 'user': GenericConnectionObject.dbuser, 'host': GenericConnectionObject.dbhost, 'port': GenericConnectionObject.dbport, 'database': GenericConnectionObject.dbname, 'password': GenericConnectionObject.dbpass } try: readonlypool = pooltype(poolsize, poolsize * 2, **kwds) except psycopg2.OperationalError as operror: thefailure = operror.args[0] noconnection = 'could not connect to server' badpass = '******' if noconnection in thefailure: e = GenericConnectionObject.noserverproblem.format( h=GenericConnectionObject.dbhost, p=GenericConnectionObject.dbport) consolewarning( GenericConnectionObject.postgresproblem.format(e=e)) if sys.platform == 'darwin': consolewarning(GenericConnectionObject.darwinproblem) if badpass in thefailure: e = GenericConnectionObject.badpassproblem.format( h=GenericConnectionObject.dbhost, p=GenericConnectionObject.dbport) consolewarning( GenericConnectionObject.postgresproblem.format(e=e)) sys.exit(0) # [B] 'rw' pool: only used by the vector graphing functions # and these are always going to be single-threaded littlepool = max(int(setthreadcount() / 2), 2) kwds['user'] = GenericConnectionObject.dbwriteuser kwds['password'] = GenericConnectionObject.dbwritepass # this can be smaller because only vectors do rw and the vectorbot is not allowed in the pool # but you also need to be free to leave rw unset try: readandwritepool = pooltype(littlepool, littlepool, **kwds) except psycopg2.OperationalError: readandwritepool = None PooledConnectionObject._pools['ro'] = readonlypool PooledConnectionObject._pools['rw'] = readandwritepool assert self.cytpe in ['ro', 'rw' ], 'connection type must be either "ro" or "rw"' self.pool = PooledConnectionObject._pools[self.cytpe] if self.cytpe == 'rw': self.readonlyconnection = False if threading.current_thread().name == 'vectorbot': # the vectobot lives in a thread and it will exhaust the pool self.simpleconnectionfallback() else: try: self.dbconnection = self.pool.getconn(key=self.uniquename) except psycopg2.pool.PoolError: # the pool is exhausted: try a basic connection instead # but in the long run should probably make a bigger pool/debug something # at the moment the only way to hit this error is via some sort of platform bug that yields a hung search # that is, something like a ryzen c-state aborted search damages the pool in the long run... consolewarning( 'PoolError: emergency fallback to SimpleConnectionObject()' ) self.simpleconnectionfallback() PooledConnectionObject.poolneedscleaning = True if self.autocommit == 'autocommit': self.setautocommit() self.setreadonly(self.readonlyconnection) self.curs = getattr(self.dbconnection, 'cursor')()
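# Minimal sketch of the pooling pattern above using psycopg2's SimpleConnectionPool directly;
# the connection parameters are placeholders. getconn()/putconn() accept a key, which is how the
# class above ties a pooled connection to its uniquename.
from psycopg2 import pool

readonlypool = pool.SimpleConnectionPool(4, 8, user='hippa_rd', password='...',
                                         host='127.0.0.1', port=5432, database='hipparchiaDB')
conn = readonlypool.getconn(key='some-unique-name')
try:
    curs = conn.cursor()
    curs.execute('SELECT 1')
    print(curs.fetchone())
finally:
    readonlypool.putconn(conn, key='some-unique-name')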