def querytotalwordcounts(word: str, dbcursor=None) -> dbHeadwordObject:
	"""
	use the dictionary_headword_wordcounts table

	[a] take a dictionary entry: ἄκρατοϲ
	[b] look it up

	return a countobject

	:param word: the headword to look up
	:param dbcursor: optional cursor; if absent a connection is created and cleaned up locally
	:return: a dbHeadwordObject, or None if the word (or the wordcounts table) is missing
	"""
	dbconnection = None
	if not dbcursor:
		dbconnection = ConnectionObject()
		dbconnection.setautocommit()
		dbcursor = dbconnection.cursor()

	table = 'dictionary_headword_wordcounts'
	qtemplate = """
	SELECT
		entry_name , total_count, gr_count, lt_count, dp_count, in_count, ch_count,
			frequency_classification, early_occurrences, middle_occurrences ,late_occurrences,
		acta, agric, alchem, anthol, apocalyp, apocryph, apol, astrol, astron, biogr, bucol, caten, chronogr,
			comic, comm, concil, coq, dialog, docu, doxogr, eccl, eleg, encom, epic, epigr, epist, evangel,
			exeget, fab, geogr, gnom, gramm, hagiogr, hexametr, hist, homilet, hymn, hypoth, iamb, ignotum,
			invectiv, inscr, jurisprud, lexicogr, liturg, lyr, magica, math, mech, med, metrolog, mim, mus,
			myth, narrfict, nathist, onir, orac, orat, paradox, parod, paroem, perieg, phil, physiognom,
			poem, polyhist, prophet, pseudepigr, rhet, satura, satyr, schol, tact, test, theol, trag
	FROM {tbl} WHERE entry_name=%s
	"""
	q = qtemplate.format(tbl=table)
	d = (word,)
	try:
		dbcursor.execute(q, d)
		hw = dbcursor.fetchone()
	except psycopg2.ProgrammingError:
		# psycopg2.ProgrammingError: relation "dictionary_headword_wordcounts" does not exist
		# you have not installed the wordcounts (yet)
		hw = None

	try:
		hwcountobject = dbHeadwordObject(*hw)
	except TypeError:
		# BUGFIX: was a bare 'except'; 'hw' is None when nothing was found, and
		# dbHeadwordObject(*None) raises TypeError — that is the only expected failure
		# print('failed to initialize dbHeadwordObject for', word)
		hwcountobject = None

	if dbconnection:
		dbconnection.connectioncleanup()

	return hwcountobject
def fetchvectorgraph(imagename) -> bytes:
	"""
	grab a graph in the image table so that you can subsequently display it in the browser

	note that images get deleted after use

	also note that we hand the data to the db and then immediately grab it out of the db
	because of constraints imposed by the way flask works

	:param imagename: the key under which the image was stored
	:return: the image as bytes; b'' if it could not be fetched
	"""
	# IDIOM: a single boolean expression instead of a four-line if/else
	deletewhendone = not hipparchia.config['RETAINFIGURES']

	dbconnection = ConnectionObject(ctype='rw')
	dbconnection.setautocommit()
	cursor = dbconnection.cursor()

	q = 'SELECT imagedata FROM public.storedvectorimages WHERE imagename=%s'
	d = (imagename,)
	cursor.execute(q, d)
	imagedata = cursor.fetchone()

	# need to convert to bytes, otherwise:
	# AttributeError: 'memoryview' object has no attribute 'read'
	try:
		imagedata = bytes(imagedata[0])
	except TypeError:
		# TypeError: 'NoneType' object is not subscriptable
		# how did this happen...
		# if you right click and download a graph in Firefox it will try to pull via the URL
		# but that figure is almost certainly gone unless you are a debugger retaining figures...
		imagedata = b''
		consolewarning('fetchvectorgraph() failed to fetch image {i}'.format(i=imagename))

	# now we should delete the image because we are done with it
	if deletewhendone:
		q = 'DELETE FROM public.storedvectorimages WHERE imagename=%s'
		d = (imagename,)
		cursor.execute(q, d)

	dbconnection.connectioncleanup()

	return imagedata
def reversedictionarylookup(seeking: str, usedict: str, limit=None) -> List:
	"""
	find an (approximate) entry in a dictionary

	note the syntax: ~ (a regex match against the 'translations' column)

	return a list of wordobjects

	:param seeking: the (english) term to hunt for inside 'translations'
	:param usedict: 'greek' or 'latin'
	:param limit: optional cap on the number of rows returned
	:return: a list of dbGreekWord / dbLatinWord objects
	"""
	cleanpoolifneeded()
	dbconnection = ConnectionObject()
	dbconnection.setautocommit()
	dbcursor = dbconnection.cursor()

	# BUGFIX: the assertion message used to name the wrong function
	assert usedict in ['greek', 'latin'], 'reversedictionarylookup() needs usedict to be "greek" or "latin"'

	objecttemplate = None

	fields = 'entry_name, metrical_entry, id_number, pos, translations, entry_body, {extra}'

	# the two dictionaries differ only in their final column
	if usedict == 'greek':
		objecttemplate = dbGreekWord
		fields = fields.format(extra='unaccented_entry')
	elif usedict == 'latin':
		objecttemplate = dbLatinWord
		fields = fields.format(extra='entry_key')

	if limit:
		qstring = 'SELECT {f} FROM {d}_dictionary WHERE translations ~ %s LIMIT {lim}'
	else:
		qstring = 'SELECT {f} FROM {d}_dictionary WHERE translations ~ %s'

	query = qstring.format(f=fields, d=usedict, lim=limit)
	# IDIOM: '{s}'.format(s=seeking) was an identity transform; pass the string directly
	data = (seeking,)
	dbcursor.execute(query, data)
	matches = dbcursor.fetchall()

	wordobjects = [objecttemplate(*m) for m in matches]

	dbconnection.connectioncleanup()

	return wordobjects
def rankheadwordsbyprevalence(listofheadwords: list) -> dict:
	"""
	map each headword to its total prevalence count

	headwords with no entry in the counts table are assigned 0 so that no
	input word goes missing from the result

	:param listofheadwords: headwords to look up
	:return: {headword: total_count, ...}
	"""
	# print('rankheadwordsbyprevalence() listofheadwords', listofheadwords)

	dbconnection = ConnectionObject(readonlyconnection=False)
	dbconnection.setautocommit()
	dbcursor = dbconnection.cursor()
	rnd = assignuniquename(6)

	tqtemplate = """
	CREATE TEMPORARY TABLE temporary_headwordlist_{rnd} AS
		SELECT headwords AS hw FROM unnest(ARRAY[{allwords}]) headwords
	"""

	qtemplate = """
	SELECT entry_name, total_count FROM {db} WHERE EXISTS
		(SELECT 1 FROM temporary_headwordlist_{rnd} temptable WHERE temptable.hw = {db}.entry_name)
	"""

	tempquery = tqtemplate.format(rnd=rnd, allwords=list(listofheadwords))
	dbcursor.execute(tempquery)

	query = qtemplate.format(rnd=rnd, db='dictionary_headword_wordcounts')
	dbcursor.execute(query)
	results = resultiterator(dbcursor)

	ranked = {r[0]: r[1] for r in results}

	# BUGFIX: this connection used to leak; the temporary table vanishes with the cleanup
	dbconnection.connectioncleanup()

	# you have a problem: you just tossed a bunch of headwords that did not have good prevalence data
	# discovered when Ϲωκράτηϲ went missing from Plato
	r = set(ranked.keys())
	h = set(listofheadwords)
	delta = h - r

	nullranked = {d: 0 for d in delta}

	ranked = {**ranked, **nullranked}

	return ranked
def storevectorgraph(figureasbytes):
	"""
	stash a graph in the image table so that the browser can display it later

	the round-trip through the db is forced on us by the way flask serves content;
	note that images get deleted after use

	:param figureasbytes: the rendered figure
	:return: the random id under which the figure was filed
	"""
	dbconnection = ConnectionObject(ctype='rw')
	dbconnection.setautocommit()
	dbcursor = dbconnection.cursor()

	# avoid psycopg2.DataError: value too long for type character varying(12)
	storedname = assignuniquename(12)

	q = """
	INSERT INTO public.storedvectorimages
		(imagename, imagedata)
		VALUES (%s, %s)
	"""
	d = (storedname, figureasbytes)

	try:
		dbcursor.execute(q, d)
	except psycopg2.ProgrammingError:
		# psycopg2.ProgrammingError: relation "public.storedvectorimages" does not exist
		# build the table on demand and retry
		createstoredimagestable()
		dbcursor.execute(q, d)

	# print('stored {n} in vector image table'.format(n=storedname))

	dbconnection.connectioncleanup()

	return storedname
def grablemmataobjectfor(db, dbcursor=None, word=None, xref=None, allowsuperscripts=False):
	"""
	send a word, return a lemmaobject

	hipparchiaDB=# select * from greek_lemmata limit 0;
	 dictionary_entry | xref_number | derivative_forms
	------------------+-------------+------------------

	EITHER 'word' should be set OR 'xref' should be set: not both

	at the moment we only use 'word' in both calls to this function:
		hipparchiaobjects/lexicaloutputobjects.py
		hipparchiaobjects/morphanalysisobjects.py

	'allowsuperscripts' because sometimes you are supposed to search under δέω² and sometimes you are not...

	:param db: the lemmata table to consult
	:param dbcursor: optional cursor; if absent a connection is created and cleaned up locally
	:param word: value for 'dictionary_entry'
	:param xref: value for 'xref_number'
	:param allowsuperscripts: keep the ¹²³... markers on 'word' if True
	:return: a dbLemmaObject (a placeholder object on failure)
	"""
	dbconnection = None
	if not dbcursor:
		dbconnection = ConnectionObject()
		dbconnection.setautocommit()
		dbcursor = dbconnection.cursor()

	field = str()
	data = None

	if xref:
		field = 'xref_number'
		data = xref

	# note: 'word' wins if both were (wrongly) supplied
	if word:
		field = 'dictionary_entry'
		data = word
		if not allowsuperscripts:
			data = re.sub(r'[¹²³⁴⁵⁶⁷⁸⁹]', '', data)

	if not session['available'][db]:
		# BUGFIX: this early return used to leak the locally created connection
		if dbconnection:
			dbconnection.connectioncleanup()
		lo = dbLemmaObject('[parsing is impossible: lemmata data was not installed]', -1, '')
		return lo

	if not data:
		# BUGFIX: this early return used to leak the locally created connection
		if dbconnection:
			dbconnection.connectioncleanup()
		lo = dbLemmaObject('[programming error: no word or xref set in grablemmataobjectfor()]', -1, '')
		return lo

	q = 'SELECT * FROM {db} WHERE {f}=%s'.format(db=db, f=field)
	d = (data,)

	dbcursor.execute(q, d)
	lem = dbcursor.fetchone()

	try:
		lemmaobject = dbLemmaObject(*lem)
	except TypeError:
		# 'NoneType' object is not subscriptable
		lemmaobject = dbLemmaObject('[entry not found]', -1, '')

	if dbconnection:
		dbconnection.connectioncleanup()

	return lemmaobject
def probedictionary(usedictionary: str, usecolumn: str, seeking: str, syntax: str, dbcursor=None, trialnumber=0) -> List:
	"""
	this will make several stabs at finding a word in the dictionary

	we need to do this because sometimes a find in the morphology dictionary does not point to
	something you can find in the dictionary of meanings

	sample values:
		dictionary:	'greek_dictionary'
		usecolumn: 'entry_name'
		seeking: 'προχοΐδιον'
		syntax: '=' or 'LIKE'

	still unimplemented:
		τήθη vs τηθή; the parser has the latter, the dictionary expects the former (but knows of the latter)

	recursion: each failed attempt calls back into this function with trialnumber incremented,
	so at most 'maxtrials' probes are made before giving up

	:param dbcursor: optional cursor; if absent a connection is created and cleaned up locally
	:param usedictionary: table name, e.g. 'greek_dictionary' or 'latin_dictionary'
	:param usecolumn: column to match against, e.g. 'entry_name'
	:param seeking: the term to look for
	:param syntax: SQL comparison operator: '=', 'LIKE', or '~'
	:param trialnumber: recursion depth marker; callers pass 0
	:return: a list of wordobjects, or None if nothing was ever found
	"""
	# print('seeking/trial',seeking,trialnumber)

	dbconnection = None
	if not dbcursor:
		dbconnection = ConnectionObject()
		dbconnection.setautocommit()
		dbcursor = dbconnection.cursor()

	maxtrials = 8
	trialnumber += 1

	# diaresis variants: an accented or unaccented iota following another vowel
	accenteddiaresis = re.compile(r'αί|εί|οί|υί|ηί|ωί')
	unaccenteddiaresis = re.compile(r'αι|ει|οι|υι|ηι|ωι')

	# nothingfound = convertdictionaryfindintoobject('nothing', 'nodict')

	# the two dictionaries differ only in their final column
	if usedictionary == 'latin_dictionary':
		extracolumn = 'entry_key'
	else:
		extracolumn = 'unaccented_entry'

	qtemplate = """SELECT entry_name, metrical_entry, id_number, pos, translations, entry_body, {ec}
			FROM {d} WHERE {col} {sy} %s ORDER BY id_number ASC"""
	query = qtemplate.format(ec=extracolumn, d=usedictionary, col=usecolumn, sy=syntax)
	data = (seeking, )
	# print('searchdictionary()',query,'\n\t',data)

	try:
		dbcursor.execute(query, data)
		found = dbcursor.fetchall()
	except psycopg2.DataError:
		# thrown by dbcursor.execute()
		# invalid regular expression: parentheses () not balanced
		# ό)μβροϲ is a (bogus) headword; how many others are there?
		found = list()

	# we might be at trial 2+ and so we need to strip the supplement we used at trial #1
	if trialnumber > 2:
		seeking = re.sub(r'\[¹²³⁴⁵⁶⁷⁸⁹\]', '', seeking)
		seeking = re.sub(r'\^', '', seeking)

	foundobjects = None

	if len(found) > 0:
		# success: convert the raw rows into wordobjects and stop probing
		foundobjects = [convertdictionaryfindintowordobject(f, usedictionary, dbcursor) for f in found]
	elif trialnumber == 1:
		# failure...
		# the word is probably there, we have just been given the wrong search term; try some other solutions
		# [1] first guess: there were multiple possible entries, not just one
		newword = re.sub(r'[¹²³⁴⁵⁶⁷⁸⁹]', '', seeking.lower())
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '=', dbcursor, trialnumber)
	elif trialnumber == 2:
		# grab any/all variants: ⁰¹²³⁴⁵⁶⁷⁸⁹
		newword = '^{sk}[¹²³⁴⁵⁶⁷⁸⁹]'.format(sk=seeking)
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '~', dbcursor, trialnumber)
	# elif trialnumber < maxtrials and '-' in seeking:
	# 	newword = attemptelision(seeking)
	# 	foundobject = searchdictionary(cursor, dictionary, usecolumn, newword, '=', trialnumber)
	elif trialnumber < maxtrials and seeking[-1] == 'ω':
		# NOTE(review): seeking[-1] will raise IndexError if seeking is ever empty — confirm callers never pass ''
		# ὑποϲυναλείφομαι is in the dictionary, but greek_lemmata says to look for ὑπό-ϲυναλείφω
		newword = seeking[:-1] + 'ομαι'
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '=', dbcursor, trialnumber)
	elif trialnumber < maxtrials and re.search(r'ομαι$', seeking):
		# χαρίζω is in the dictionary, but greek_lemmata says to look for χαρίζομαι
		newword = seeking[:-4] + 'ω'
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '=', dbcursor, trialnumber)
	elif trialnumber < maxtrials and re.search(accenteddiaresis, seeking):
		# false positives very easy here, but we are getting desperate and have nothing to lose
		# rewrite the first accented vowel pair with an explicit diaresis: e.g. αί -> αΐ
		diaresis = re.search(accenteddiaresis, seeking)
		head = seeking[:diaresis.start()]
		tail = seeking[diaresis.end():]
		vowels = diaresis.group(0)
		vowels = vowels[0] + 'ΐ'
		newword = head + vowels + tail
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '=', dbcursor, trialnumber)
	elif trialnumber < maxtrials and re.search(unaccenteddiaresis, seeking):
		# same trick for the unaccented pairs: e.g. αι -> αϊ
		diaresis = re.search(unaccenteddiaresis, seeking)
		head = seeking[:diaresis.start()]
		tail = seeking[diaresis.end():]
		vowels = diaresis.group(0)
		vowels = vowels[0] + 'ϊ'
		newword = head + vowels + tail
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '=', dbcursor, trialnumber)
	elif trialnumber < maxtrials:
		# τήθη vs τηθή; the parser has the latter, the dictionary expects the former (but knows of the latter)
		# last gasp: strip the accents and regex-match; force the next call to be the final one
		trialnumber = maxtrials - 1
		newword = re.sub(r'\[¹²³⁴⁵⁶⁷⁸⁹\]', '', seeking)
		newword = stripaccents(newword)
		newword = universalregexequivalent(newword)
		# strip '(' and ')'
		newword = '^{wd}$'.format(wd=newword[1:-1])
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '~', dbcursor, trialnumber)

	if dbconnection:
		dbconnection.connectioncleanup()

	return foundobjects
def findentrybyid(usedict: str, entryid: str) -> dbDictionaryEntry:
	"""
	find by id number

	hipparchiaDB=# select * from greek_dictionary limit 0;
	 entry_name | metrical_entry | unaccented_entry | id_number | pos | translations | entry_body
	------------+----------------+------------------+-----------+-----+--------------+------------
	(0 rows)

	hipparchiaDB=# select * from latin_dictionary limit 0;
	 entry_name | metrical_entry | id_number | entry_key | pos | translations | entry_body
	------------+----------------+-----------+-----------+-----+--------------+------------
	(0 rows)

	:param usedict: 'greek' or 'latin'
	:param entryid: the id_number to fetch (may arrive as '13493.0' from older databases)
	:return: a wordobject, or None if there was no match
	"""
	cleanpoolifneeded()
	dbconnection = ConnectionObject()
	dbconnection.setautocommit()
	dbcursor = dbconnection.cursor()

	# BUGFIX: the assertion message used to name the wrong function
	assert usedict in ['greek', 'latin'], 'findentrybyid() needs usedict to be "greek" or "latin"'

	if usedict == 'latin':
		extracolumn = 'entry_key'
	else:
		extracolumn = 'unaccented_entry'

	qtemplate = """SELECT entry_name, metrical_entry, id_number, pos, translations, entry_body, {ec}
			FROM {d}_dictionary WHERE id_number = %s"""
	query = qtemplate.format(ec=extracolumn, d=usedict)
	data = (entryid,)

	try:
		dbcursor.execute(query, data)
	except psycopg2.DataError:
		# BUGFIX: was a bare 'except'; the documented failure is a DataError:
		# older database: int vs float on this column
		# psycopg2.errors.InvalidTextRepresentation: invalid input syntax for integer: "13493.0"
		eidconverted = str(int(float(entryid)))
		data = (eidconverted,)
		dbcursor.execute(query, data)

	match = dbcursor.fetchone()

	if match:
		wordobject = convertdictionaryfindintowordobject(match, '{d}_dictionary'.format(d=usedict), dbcursor)
	else:
		wordobject = None

	dbconnection.connectioncleanup()

	return wordobject
def bulkenvironsfetcher(table: str, searchresultlist: list, context: int) -> list:
	"""
	given a list of SearchResult objects, populate the lineobjects of each SearchResult with their contexts

	:param table: the author table to pull the lines from
	:param searchresultlist: SearchResult objects to be filled in (mutated in place)
	:param context: total number of context lines around each hit
	:return: the same searchresultlist, with r.lineobjects populated
	"""
	dbconnection = ConnectionObject(readonlyconnection=False)
	dbconnection.setautocommit()
	cursor = dbconnection.cursor()

	tosearch = deque()

	# collect every line index we will need: context/2 lines either side of each hit
	# DEADCODE FIX: 'reversemap'/'rmap'/'resultnumber' were built here but never read; removed
	for r in searchresultlist:
		focusline = r.getindex()
		environs = list(range(int(focusline - (context / 2)), int(focusline + (context / 2)) + 1))
		tosearch.extend(environs)
		r.lineobjects = list()

	tosearch = [str(x) for x in tosearch]

	tqtemplate = """
	CREATE TEMPORARY TABLE {au}_includelist_{ac} AS
		SELECT values AS includeindex FROM unnest(ARRAY[{lines}]) values
	"""

	# avoidcollisions instead of DROP TABLE IF EXISTS; the table disappears when the connection is cleaned up
	avoidcollisions = assignuniquename()

	tempquery = tqtemplate.format(au=table, ac=avoidcollisions, lines=','.join(tosearch))
	cursor.execute(tempquery)

	qtemplate = """
	SELECT {wtmpl} FROM {au} WHERE EXISTS
		(SELECT 1 FROM {au}_includelist_{ac} incl WHERE incl.includeindex = {au}.index)
	"""

	query = qtemplate.format(wtmpl=worklinetemplate, au=table, ac=avoidcollisions)
	cursor.execute(query)
	results = resultiterator(cursor)

	lines = [dblineintolineobject(r) for r in results]
	indexedlines = {l.index: l for l in lines}

	# hand each result its own slice of the fetched lines
	for r in searchresultlist:
		environs = list(range(int(r.getindex() - (context / 2)), int(r.getindex() + (context / 2)) + 1))
		for e in environs:
			try:
				r.lineobjects.append(indexedlines[e])
			except KeyError:
				# you requested a line that was outside of the scope of the table
				# so there was no result and the key will not match a find
				pass

	dbconnection.connectioncleanup()

	return searchresultlist
def reverselexiconsearch(searchid, searchterm) -> JSON_STR:
	"""
	attempt to find all of the greek/latin dictionary entries that might go with the english search term

	'ape' will drive this crazy; what is needed is a lookup for only the senses

	this can be built into the dictionary

	:param searchid: id used to register a ProgressPoll for this search
	:param searchterm: the english term to hunt for in the dictionaries
	:return: a JSON string with 'newhtml' and 'newjs' keys
	"""
	searchterm = searchterm[:hipparchia.config['MAXIMUMLEXICALLENGTH']]
	pollid = validatepollid(searchid)
	progresspolldict[pollid] = ProgressPoll(pollid)
	activepoll = progresspolldict[pollid]
	activepoll.activate()
	activepoll.statusis('Searching lexical entries for "{t}"'.format(t=searchterm))

	probeforsessionvariables()

	returndict = dict()
	returnarray = list()

	seeking = depunct(searchterm)

	# which dictionaries to consult depends on the active corpora
	if justlatin():
		searchunder = [('latin', 'hi')]
	elif justtlg():
		searchunder = [('greek', 'tr')]
	else:
		searchunder = [('greek', 'tr'), ('latin', 'hi')]

	limit = hipparchia.config['CAPONDICTIONARYFINDS']

	entriestuples = list()
	for s in searchunder:
		# DEADCODE FIX: the second tuple member ('hi'/'tr') was assigned to an unused local; dropped
		usedict = s[0]
		# first see if your term is mentioned at all
		wordobjects = reversedictionarylookup(seeking, usedict, limit)
		entriestuples += [(w.entry, w.id) for w in wordobjects]

	if len(entriestuples) == limit:
		returnarray.append('[stopped searching after {lim} finds]\n<br>\n'.format(lim=limit))

	entriestuples = list(set(entriestuples))

	# sort the entries by their total prevalence, most common first
	unsortedentries = [(querytotalwordcounts(e[0]), e[0], e[1]) for e in entriestuples]
	entries = list()
	for e in unsortedentries:
		hwcountobject = e[0]
		term = e[1]
		idval = e[2]
		if hwcountobject:
			entries.append((hwcountobject.t, term, idval))
		else:
			entries.append((0, term, idval))
	entries = sorted(entries, reverse=True)
	entriestuples = [(e[1], e[2]) for e in entries]

	# now we retrieve and format the entries
	if entriestuples:
		# summary of entry values first
		countobjectdict = {e: querytotalwordcounts(e[0]) for e in entriestuples}
		summary = list()
		count = 0
		for c in countobjectdict.keys():
			count += 1
			try:
				totalhits = countobjectdict[c].t
			except AttributeError:
				# BUGFIX: was a bare 'except'; querytotalwordcounts() yields None on a miss,
				# and None.t raises AttributeError — that is the only expected failure
				totalhits = 0
			# c[0]: the word; c[1]: the id
			summary.append((count, c[0], c[1], totalhits))

		summarytemplate = """
		<span class="sensesum">({n})
			<a class="nounderline" href="#{w}_{wdid}">{w}</a>
			<span class="small">({t:,})</span>
		</span>
		"""

		summary = sorted(summary, key=lambda x: x[3], reverse=True)
		summary = [summarytemplate.format(n=e[0], w=e[1], wdid=e[2], t=e[3]) for e in summary]
		returnarray.append('\n<br />\n'.join(summary))

		# then the entries proper
		dbconnection = ConnectionObject()
		dbconnection.setautocommit()
		dbcursor = dbconnection.cursor()

		wordobjects = [probedictionary(setdictionarylanguage(e[0]) + '_dictionary', 'entry_name', e[0], '=', dbcursor=dbcursor, trialnumber=0) for e in entriestuples]
		wordobjects = flattenlistoflists(wordobjects)
		outputobjects = [lexicalOutputObject(w) for w in wordobjects]

		# BUGFIX: this connection used to leak; release it now that the queries are done
		dbconnection.connectioncleanup()

		if len(outputobjects) > 1:
			usecounter = True
		else:
			usecounter = False

		count = 0
		for oo in outputobjects:
			count += 1
			if usecounter:
				entry = oo.generatelexicaloutput(countervalue=count)
			else:
				entry = oo.generatelexicaloutput()
			returnarray.append(entry)
	else:
		returnarray.append('<br />[nothing found under "{skg}"]'.format(skg=seeking))

	returndict['newhtml'] = '\n'.join(returnarray)
	returndict['newjs'] = '\n'.join([dictionaryentryjs(), insertlexicalbrowserjs()])

	jsondict = json.dumps(returndict)

	del progresspolldict[pollid]

	return jsondict
def buildwinnertakesallbagsofwords(morphdict, sentences) -> deque:
	"""
	turn a list of sentences into a list of list of headwords

	here we figure out which headword is the dominant homonym and then we just use that term

		esse ===> sum
		esse =/=> edo

	assuming that it is faster to do this 2x so you can do a temp table query rather than iterate into DB
	not tested/profiled, though...

	:param morphdict: word -> collection of candidate headwords
	:param sentences: iterable of iterables of words
	:return: deque of deques of winning headwords (empty sentences are dropped)
	"""

	# PART ONE: figure out who the "winners" are going to be

	bagsofwords = buildflatbagsofwords(morphdict, sentences)
	allheadwords = {w for bag in bagsofwords for w in bag}

	dbconnection = ConnectionObject(readonlyconnection=False)
	dbconnection.setautocommit()
	dbcursor = dbconnection.cursor()

	rnd = assignuniquename(6)

	tqtemplate = """
	CREATE TEMPORARY TABLE temporary_headwordlist_{rnd} AS
		SELECT headwords AS hw FROM unnest(ARRAY[{allwords}]) headwords
	"""

	qtemplate = """
	SELECT entry_name, total_count FROM {db} WHERE EXISTS
		(SELECT 1 FROM temporary_headwordlist_{rnd} temptable WHERE temptable.hw = {db}.entry_name)
	"""

	tempquery = tqtemplate.format(rnd=rnd, allwords=list(allheadwords))
	dbcursor.execute(tempquery)

	query = qtemplate.format(rnd=rnd, db='dictionary_headword_wordcounts')
	dbcursor.execute(query)
	results = resultiterator(dbcursor)

	# TYPO FIX: was 'randkedheadwords'
	rankedheadwords = {r[0]: r[1] for r in results}

	# BUGFIX: this connection used to leak; the temporary table vanishes with the cleanup
	dbconnection.connectioncleanup()

	# PART TWO: let the winners take all

	bagsofwords = deque()
	for s in sentences:
		lemmatized = deque()
		for word in s:
			try:
				# [('x', 4), ('y', 5), ('z', 1)]
				possibilities = sorted([(item, rankedheadwords[item]) for item in morphdict[word]], key=lambda x: x[1])
				# first item of last tuple is the winner
				lemmatized.append(possibilities[-1][0])
			except KeyError:
				# the word has no parse data / the headword has no prevalence data: drop it
				pass
		if lemmatized:
			bagsofwords.append(lemmatized)

	return bagsofwords