def loadallauthorsasobjects() -> dict:
    """

    return a dict of all possible author objects

    :return:
    """

    print('loading all authors...', end='')

    dbconnection = ConnectionObject()
    cursor = dbconnection.cursor()

    q = 'SELECT * FROM authors'

    cursor.execute(q)
    results = resultiterator(cursor)

    authorsdict = {r[0]: dbAuthor(*r) for r in results}

    print('\t', len(authorsdict), 'authors loaded', end='')

    dbconnection.connectioncleanup()

    return authorsdict

def loadallworksasobjects() -> dict:
    """

    return a dict of all possible work objects

    :return:
    """

    print('loading all works... ', end='')

    dbconnection = ConnectionObject()
    cursor = dbconnection.cursor()

    q = """
    SELECT universalid, title, language, publication_info,
        levellabels_00, levellabels_01, levellabels_02, levellabels_03, levellabels_04, levellabels_05,
        workgenre, transmission, worktype, provenance, recorded_date, converted_date, wordcount,
        firstline, lastline, authentic
    FROM works
    """

    cursor.execute(q)
    results = resultiterator(cursor)

    worksdict = {r[0]: dbOpus(*r) for r in results}

    print('\t', len(worksdict), 'works loaded', end='')

    dbconnection.connectioncleanup()

    return worksdict

def grabbundlesoflines(worksandboundaries: dict, cursor) -> list:
    """
    grab and return lots of lines

    this is very generic

    typical uses are
        one work + a line range (which may or may not be the whole work): {'work1': (start, stop)}
        multiple (whole) works: {'work1': (start, stop), 'work2': (start, stop), ...}

    but you could one day use this to mix-and-match:
        a complete index of Thuc + Hdt 3 + all Epic...
        that is, you could use compileauthorandworklist() to feed this function
        the resulting concordances would be massive

    :param worksandboundaries:
    :param cursor:
    :return:
    """

    lineobjects = deque()

    for w in worksandboundaries:
        db = w[0:6]
        query = 'SELECT {wtmpl} FROM {db} WHERE (index >= %s AND index <= %s)'.format(wtmpl=worklinetemplate, db=db)
        data = (worksandboundaries[w][0], worksandboundaries[w][1])
        cursor.execute(query, data)
        lines = resultiterator(cursor)

        thiswork = [dblineintolineobject(l) for l in lines]
        lineobjects.extend(thiswork)

    return list(lineobjects)

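# A minimal usage sketch for grabbundlesoflines(), not part of the original workflow: the
# work id 'gr0032w001' and the line range are illustrative placeholders; the first six
# characters of the dict key are what pick the author table.
def example_grabbundlesoflines_usage() -> list:
    """

    fetch every line between two indices of a single (hypothetical) work

    """

    dbconnection = ConnectionObject()
    cursor = dbconnection.cursor()
    worksandboundaries = {'gr0032w001': (1, 500)}
    lineobjects = grabbundlesoflines(worksandboundaries, cursor)
    dbconnection.connectioncleanup()

    return lineobjects
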
def bulkfindwordcounts(listofwords: List[str]) -> List[dbWordCountObject]:
    """

    note that the lists of words should all start with the same letter since
    the wordcount tables are letter-keyed

    hipparchiaDB=# CREATE TEMP TABLE bulkcounter_51807f8bbe08 AS
        SELECT values AS entriestocheck FROM unnest(ARRAY['κατακλειούϲηϲ', 'κατακλῇϲαι', 'κατακλεῖϲαι']) values;

    hipparchiaDB=# SELECT * FROM wordcounts_κ WHERE EXISTS
        (SELECT 1 FROM bulkcounter_51807f8bbe08 tocheck WHERE tocheck.entriestocheck = wordcounts_κ.entry_name);

      entry_name   | total_count | gr_count | lt_count | dp_count | in_count | ch_count
    ---------------+-------------+----------+----------+----------+----------+----------
     κατακλεῖϲαι   |          31 |       30 |        0 |        0 |        1 |        0
     κατακλειούϲηϲ |           3 |        3 |        0 |        0 |        0 |        0
     κατακλῇϲαι    |           1 |        1 |        0 |        0 |        0 |        0
    (3 rows)

    :param listofwords:
    :return:
    """

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbcursor = dbconnection.cursor()

    try:
        firstletteroffirstword = stripaccents(listofwords[0][0])
    except IndexError:
        return list()

    if firstletteroffirstword not in 'abcdefghijklmnopqrstuvwxyzαβψδεφγηιξκλμνοπρϲτυωχθζ':
        firstletteroffirstword = '0'

    tqtemplate = """
    CREATE TEMP TABLE bulkcounter_{rnd} AS
        SELECT values AS entriestocheck FROM unnest(ARRAY[%s]) values
    """

    uniquename = assignuniquename(12)
    tempquery = tqtemplate.format(rnd=uniquename)
    data = (listofwords,)
    dbcursor.execute(tempquery, data)

    qtemplate = """
    SELECT * FROM wordcounts_{x} WHERE EXISTS
        (SELECT 1 FROM bulkcounter_{rnd} tocheck WHERE tocheck.entriestocheck = wordcounts_{x}.entry_name)
    """

    query = qtemplate.format(rnd=uniquename, x=firstletteroffirstword)

    try:
        dbcursor.execute(query)
        results = resultiterator(dbcursor)
    except psycopg2.ProgrammingError:
        # if you do not have the wordcounts installed: 'ProgrammingError: relation "wordcounts_a" does not exist'
        results = list()

    wordcountobjects = [dbWordCountObject(*r) for r in results]

    dbconnection.connectioncleanup()

    return wordcountobjects

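# A quick sketch of how bulkfindwordcounts() might be called; the three forms are the ones
# from the sample psql session in the docstring above and all share the first letter κ,
# as the letter-keyed wordcount tables require.
def example_bulkfindwordcounts_usage() -> None:
    """

    count a handful of forms that all begin with the same letter

    """

    words = ['κατακλειούϲηϲ', 'κατακλῇϲαι', 'κατακλεῖϲαι']
    wordcountobjects = bulkfindwordcounts(words)
    print(len(wordcountobjects), 'wordcount objects returned')
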
def precomposedsqlsearcher(querydict, dbcursor) -> Generator:
    """

    as per substringsearchintosqldict():
        sq = { table1: {query: q, data: d, temptable: t},
               table2: {query: q, data: d, temptable: t},
               ... }

    this function is only sent the dict at sq[tableN]

    """

    t = querydict['temptable']
    q = querydict['query']
    d = (querydict['data'],)

    if t:
        unique = assignuniquename()
        t = re.sub('UNIQUENAME', unique, t)
        q = re.sub('UNIQUENAME', unique, q)
        dbcursor.execute(t)

    found = list()

    # debugmessage('precomposedsqlsearcher() querydict = {q}'.format(q=querydict))
    # debugmessage('precomposedsqlsearcher() q:\n\t{q}\nd:\n\t{d}'.format(q=q, d=d))

    warnings = {
        1: 'DataError; cannot search for »{d}«\n\tcheck for unbalanced parentheses and/or bad regex',
        2: 'psycopg2.InternalError; did not execute query="{q}" and data="{d}"',
        3: 'precomposedsqlsearcher() DatabaseError for {c} @ {p}',
        4: 'precomposedsqlsearcher() IndexError: malformed query/data combination; empty results returned'
    }

    try:
        dbcursor.execute(q, d)
        found = resultiterator(dbcursor)
    except psycopg2.DataError:
        # e.g., invalid regular expression: parentheses () not balanced
        consolewarning(warnings[1].format(d=d[0]), color='red')
    except psycopg2.InternalError:
        # current transaction is aborted, commands ignored until end of transaction block
        consolewarning(warnings[2].format(q=q, d=d), color='red')
    except psycopg2.DatabaseError:
        # psycopg2.DatabaseError: error with status PGRES_TUPLES_OK and no message from the libpq
        # added to track PooledConnection threading issues
        # will see: 'DatabaseError for <cursor object at 0x136bab520; closed: 0> @ Process-4'
        consolewarning(warnings[3].format(c=dbcursor, p=multiprocessing.current_process().name), color='red')
        consolewarning('\tq, d: {q}, {d}'.format(q=q, d=d))
    except IndexError:
        found = list()
        consolewarning(warnings[4], color='red')
        consolewarning('\tq = {q}'.format(q=q), color='red')
        consolewarning('\td = {d}\n==========='.format(d=d), color='yellow')

    return found

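# A hedged sketch of the dict shape precomposedsqlsearcher() expects; the table, column,
# and search term are placeholders borrowed from the substringsearch() samples elsewhere
# in this file, and no temp table is involved, so 'temptable' is left empty.
def example_precomposedsqlsearcher_usage(dbcursor) -> list:
    """

    run one precomposed query against one (hypothetical) author table

    """

    querydict = {
        'temptable': str(),
        'query': 'SELECT * FROM gr0059 WHERE stripped_line ~* %s LIMIT 200',
        'data': 'βαλλ'
    }

    found = precomposedsqlsearcher(querydict, dbcursor)

    return list(found)
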
def bulklexicalgrab(listofwords: List[str], tabletouse: str, targetcolumn: str, language: str) -> list:
    """

    grab a bunch of lex/morph entries by using a temp table

    e.g.,
        lexicalresults = bulklexicalgrab(listofwords, 'dictionary', 'entry_name', language)
        results = bulklexicalgrab(listofwords, 'morphology', 'observed_form', language)

    :param listofwords:
    :param tabletouse:
    :param targetcolumn:
    :param language:
    :return:
    """

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbcursor = dbconnection.cursor()

    tqtemplate = """
    CREATE TEMP TABLE bulklex_{rnd} AS
        SELECT values AS entriestocheck FROM unnest(ARRAY[%s]) values
    """

    uniquename = assignuniquename(12)
    tempquery = tqtemplate.format(rnd=uniquename)
    data = (listofwords,)
    dbcursor.execute(tempquery, data)

    qtemplate = """
    SELECT * FROM {lg}_{thetable} WHERE EXISTS
        (SELECT 1 FROM bulklex_{rnd} tocheck WHERE tocheck.entriestocheck = {lg}_{thetable}.{target})
    """

    query = qtemplate.format(rnd=uniquename, thetable=tabletouse, target=targetcolumn, lg=language)

    try:
        dbcursor.execute(query)
        results = resultiterator(dbcursor)
    except psycopg2.ProgrammingError:
        # if you do not have the relevant tables installed: 'ProgrammingError: relation does not exist'
        results = list()

    dbconnection.connectioncleanup()

    return results

def rankheadwordsbyprevalence(listofheadwords: list) -> dict:
    """

    rank a list of headwords by their prevalence in the headword counts;
    headwords without prevalence data are kept and given a count of 0

    """

    # print('rankheadwordsbyprevalence() listofheadwords', listofheadwords)

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbconnection.setautocommit()
    dbcursor = dbconnection.cursor()
    rnd = assignuniquename(6)

    tqtemplate = """
    CREATE TEMPORARY TABLE temporary_headwordlist_{rnd} AS
        SELECT headwords AS hw FROM unnest(ARRAY[{allwords}]) headwords
    """

    qtemplate = """
    SELECT entry_name, total_count FROM {db} WHERE EXISTS
        (SELECT 1 FROM temporary_headwordlist_{rnd} temptable WHERE temptable.hw = {db}.entry_name)
    """

    tempquery = tqtemplate.format(rnd=rnd, allwords=list(listofheadwords))
    dbcursor.execute(tempquery)
    # https://www.psycopg.org/docs/extras.html#psycopg2.extras.execute_values
    # third parameter is

    query = qtemplate.format(rnd=rnd, db='dictionary_headword_wordcounts')
    dbcursor.execute(query)
    results = resultiterator(dbcursor)

    ranked = {r[0]: r[1] for r in results}

    # you have a problem: you just tossed a bunch of headwords that did not have good prevalence data
    # discovered when Ϲωκράτηϲ went missing from Plato

    r = set(ranked.keys())
    h = set(listofheadwords)
    delta = h - r

    nullranked = {d: 0 for d in delta}

    ranked = {**ranked, **nullranked}

    dbconnection.connectioncleanup()

    return ranked

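# A small usage sketch for rankheadwordsbyprevalence(); the headwords are placeholders
# pulled from examples mentioned elsewhere in this file; note that headwords without
# prevalence data come back with a count of 0 rather than being dropped.
def example_rankheadwordsbyprevalence_usage() -> dict:
    """

    rank a few headwords, most common first

    """

    headwords = ['λύω', 'Ϲωκράτηϲ', 'sum']
    ranked = rankheadwordsbyprevalence(headwords)

    return dict(sorted(ranked.items(), key=lambda kv: kv[1], reverse=True))
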
def loadlemmataasobjects() -> dict:
    """

    return a dict of all possible lemmataobjects

    hipparchiaDB=# select * from greek_lemmata limit 1;
     dictionary_entry | xref_number |    derivative_forms
    ------------------+-------------+------------------------
     ζῳοτροφία        |    49550639 | {ζῳοτροφίᾳ,ζῳοτροφίαϲ}

    :return:
    """

    print('loading all lemmata...', end=str())

    dbconnection = ConnectionObject()
    cursor = dbconnection.cursor()

    q = """
    SELECT dictionary_entry, xref_number, derivative_forms FROM {lang}_lemmata
    """

    lemmatadict = dict()
    languages = {1: 'greek', 2: 'latin'}

    for key in languages:
        cursor.execute(q.format(lang=languages[key]))
        results = resultiterator(cursor)
        lemmatadict = {**{r[0]: dbLemmaObject(*r) for r in results}, **lemmatadict}

    print('\t', len(lemmatadict), 'lemmata loaded', end=str())
    # print('lemmatadict["molestus"]', lemmatadict['molestus'].formlist)
    # print('lemmatadict["Mausoleus"]', lemmatadict['Mausoleus'].formlist)
    # print('lemmatadict["λύω"]', lemmatadict['λύω'].formlist)
    # print('lemmatadict["Δημοϲθένηϲ"]', lemmatadict['Δημοϲθένηϲ'].formlist)

    dbconnection.connectioncleanup()

    return lemmatadict

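# A hedged sketch of how the three loaders above might be combined at startup; the
# variable names are placeholders, but the pattern of holding all three dicts in memory
# is what the loaders imply.
def example_loadeverything_usage() -> tuple:
    """

    build the in-memory author, work, and lemmata dicts in one pass

    """

    authorsdict = loadallauthorsasobjects()
    worksdict = loadallworksasobjects()
    lemmatadict = loadlemmataasobjects()

    return authorsdict, worksdict, lemmatadict
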
def bulklinegrabber(table: str, column: str, criterion: str, setofcriteria, cursor) -> dict:
    """

    snarf up a huge number of lines

    :param table:
    :param column:
    :param criterion:
    :param setofcriteria:
    :param cursor:
    :return:
    """

    qtemplate = 'SELECT {cri}, {col} FROM {t} WHERE {cri} = ANY(%s)'
    q = qtemplate.format(col=column, t=table, cri=criterion)
    d = (list(setofcriteria),)

    cursor.execute(q, d)
    lines = resultiterator(cursor)

    contents = {'{t}@{i}'.format(t=table, i=l[0]): l[1] for l in lines}

    return contents

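# A minimal sketch for bulklinegrabber(); the table name and the line indices are
# placeholders; the returned dict is keyed as 'tablename@index'.
def example_bulklinegrabber_usage(cursor) -> dict:
    """

    pull the accented text of a handful of lines by index

    """

    return bulklinegrabber('gr0032', 'accented_line', 'index', {1, 2, 3}, cursor)
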
def bulkenvironsfetcher(table: str, searchresultlist: list, context: int) -> list:
    """

    given a list of SearchResult objects, populate the lineobjects of each SearchResult with their contexts

    :param table:
    :param searchresultlist:
    :param context:
    :return:
    """

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbconnection.setautocommit()
    cursor = dbconnection.cursor()

    tosearch = deque()
    reversemap = dict()

    for r in searchresultlist:
        resultnumber = r.hitnumber
        focusline = r.getindex()
        environs = list(range(int(focusline - (context / 2)), int(focusline + (context / 2)) + 1))
        tosearch.extend(environs)
        rmap = {e: resultnumber for e in environs}
        reversemap.update(rmap)
        r.lineobjects = list()

    tosearch = [str(x) for x in tosearch]

    tqtemplate = """
    CREATE TEMPORARY TABLE {au}_includelist_{ac} AS
        SELECT values AS includeindex FROM unnest(ARRAY[{lines}]) values
    """

    # avoidcollisions instead of DROP TABLE IF EXISTS; the table disappears when the connection is cleaned up
    avoidcollisions = assignuniquename()

    tempquery = tqtemplate.format(au=table, ac=avoidcollisions, lines=','.join(tosearch))
    cursor.execute(tempquery)

    qtemplate = """
    SELECT {wtmpl} FROM {au} WHERE EXISTS
        (SELECT 1 FROM {au}_includelist_{ac} incl WHERE incl.includeindex = {au}.index)
    """

    query = qtemplate.format(wtmpl=worklinetemplate, au=table, ac=avoidcollisions)
    cursor.execute(query)
    results = resultiterator(cursor)

    lines = [dblineintolineobject(r) for r in results]
    indexedlines = {l.index: l for l in lines}

    for r in searchresultlist:
        environs = list(range(int(r.getindex() - (context / 2)), int(r.getindex() + (context / 2)) + 1))
        for e in environs:
            try:
                r.lineobjects.append(indexedlines[e])
            except KeyError:
                # you requested a line that was outside of the scope of the table
                # so there was no result and the key will not match a find
                pass

    dbconnection.connectioncleanup()

    return searchresultlist

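# A hedged sketch for bulkenvironsfetcher(); it assumes a list of SearchResult objects
# (with hitnumber, getindex(), and lineobjects, as used above) and an even context size;
# the table name is a placeholder.
def example_bulkenvironsfetcher_usage(searchresultlist: list) -> list:
    """

    attach two lines of context on either side of every hit in a result list

    """

    return bulkenvironsfetcher('gr0032', searchresultlist, context=4)
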
def substringsearch(seeking: str, authortable: str, searchobject: SearchObject, cursor, templimit=None) -> Generator:
    """

    actually one of the most basic search types: look for a string/substring

    the whereclause is built conditionally:

    sample 'unrestricted':
        SELECT * FROM gr0059 WHERE ( stripped_line ~* %s ) LIMIT 200 ('βαλλ',)
        [i.e., SELECT * FROM gr0059 WHERE ( stripped_line ~* 'βαλλ') LIMIT 200;]
    sample 'between':
        SELECT * FROM gr0032 WHERE (index BETWEEN 1846 AND 2856) AND (index NOT BETWEEN 1846 AND 2061) AND ( stripped_line ~* %s ) LIMIT 200 ('βαλλ',)
    sample 'temptable':
        [create the temptable]
        SELECT * FROM in1204 WHERE EXISTS (SELECT 1 FROM in1204_includelist incl WHERE incl.includeindex = in1204.index AND in1204.accented_line ~* %s) LIMIT 200 ('τούτου',)

    :param seeking:
    :param authortable:
    :param searchobject:
    :param cursor:
    :param templimit:
    :return:
    """

    so = searchobject

    if templimit:
        lim = str(templimit)
    else:
        lim = str(so.cap)

    if so.onehit:
        mylimit = ' LIMIT 1'
    else:
        mylimit = ' LIMIT {lim}'.format(lim=lim)

    mysyntax = '~*'
    found = list()

    r = so.indexrestrictions[authortable]
    whereextensions = str()

    if r['type'] == 'temptable':
        # make the table
        q = r['where']['tempquery']
        avoidcollisions = assignuniquename()
        q = re.sub('_includelist', '_includelist_{a}'.format(a=avoidcollisions), q)
        cursor.execute(q)
        # now you can work with it
        wtemplate = """
        EXISTS
            (SELECT 1 FROM {tbl}_includelist_{a} incl WHERE incl.includeindex = {tbl}.index
        """
        whereextensions = wtemplate.format(a=avoidcollisions, tbl=authortable)
        whr = 'WHERE {xtn} AND {au}.{col} {sy} %s)'.format(au=authortable, col=so.usecolumn, sy=mysyntax, xtn=whereextensions)
    elif r['type'] == 'between':
        whereextensions = buildbetweenwhereextension(authortable, so)
        whr = 'WHERE {xtn} ( {c} {sy} %s )'.format(c=so.usecolumn, sy=mysyntax, xtn=whereextensions)
    elif r['type'] == 'unrestricted':
        whr = 'WHERE {xtn} ( {c} {sy} %s )'.format(c=so.usecolumn, sy=mysyntax, xtn=whereextensions)
    else:
        # should never see this
        consolewarning('error in substringsearch(): unknown whereclause type', r['type'])
        whr = 'WHERE ( {c} {sy} %s )'.format(c=so.usecolumn, sy=mysyntax)

    qtemplate = 'SELECT {wtmpl} FROM {db} {whr} {lm}'
    q = qtemplate.format(wtmpl=worklinetemplate, db=authortable, whr=whr, lm=mylimit)
    d = (seeking,)

    # print('q/d\nq:\t{q}\nd:\t{d}\n'.format(q=q, d=d))

    try:
        cursor.execute(q, d)
        found = resultiterator(cursor)
    except psycopg2.DataError:
        # e.g., invalid regular expression: parentheses () not balanced
        consolewarning('DataError; cannot search for »{d}«\n\tcheck for unbalanced parentheses and/or bad regex'.format(d=d[0]), color='red')
    except psycopg2.InternalError:
        # current transaction is aborted, commands ignored until end of transaction block
        consolewarning('psycopg2.InternalError; did not execute query="{q}" and data="{d}"'.format(q=q, d=d), color='red')
    except psycopg2.DatabaseError:
        # psycopg2.DatabaseError: error with status PGRES_TUPLES_OK and no message from the libpq
        # added to track PooledConnection threading issues
        # will see: 'DatabaseError for <cursor object at 0x136bab520; closed: 0> @ Process-4'
        consolewarning('DatabaseError for {c} @ {p}'.format(c=cursor, p=multiprocessing.current_process().name), color='red')
        consolewarning('\tq, d', q, d)

    return found

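# A usage sketch for substringsearch(); the table and the search term come from the
# 'unrestricted' sample in the docstring above, but building the SearchObject itself
# happens elsewhere, so one is simply passed in here.
def example_substringsearch_usage(so: SearchObject, cursor) -> list:
    """

    look for 'βαλλ' in one author table, capping the hits

    """

    hits = substringsearch('βαλλ', 'gr0059', so, cursor, templimit=200)

    return [dblineintolineobject(h) for h in hits]
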
def buildwinnertakesallbagsofwords(morphdict, sentences) -> deque:
    """

    turn a list of sentences into a list of lists of headwords

    here we figure out which headword is the dominant homonym and then we just use that term

        esse ===> sum
        esse =/=> edo

    assuming that it is faster to do this in two passes so that you can use a temp table query
    rather than iterate into the DB

    not tested/profiled, though...

    :param morphdict:
    :param sentences:
    :return:
    """

    # PART ONE: figure out who the "winners" are going to be

    bagsofwords = buildflatbagsofwords(morphdict, sentences)
    allheadwords = {w for bag in bagsofwords for w in bag}

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbconnection.setautocommit()
    dbcursor = dbconnection.cursor()
    rnd = assignuniquename(6)

    tqtemplate = """
    CREATE TEMPORARY TABLE temporary_headwordlist_{rnd} AS
        SELECT headwords AS hw FROM unnest(ARRAY[{allwords}]) headwords
    """

    qtemplate = """
    SELECT entry_name, total_count FROM {db} WHERE EXISTS
        (SELECT 1 FROM temporary_headwordlist_{rnd} temptable WHERE temptable.hw = {db}.entry_name)
    """

    tempquery = tqtemplate.format(rnd=rnd, allwords=list(allheadwords))
    dbcursor.execute(tempquery)
    # https://www.psycopg.org/docs/extras.html#psycopg2.extras.execute_values
    # third parameter is

    query = qtemplate.format(rnd=rnd, db='dictionary_headword_wordcounts')
    dbcursor.execute(query)
    results = resultiterator(dbcursor)

    rankedheadwords = {r[0]: r[1] for r in results}

    dbconnection.connectioncleanup()

    # PART TWO: let the winners take all

    bagsofwords = deque()
    for s in sentences:
        lemmatized = deque()
        for word in s:
            try:
                # possibilities will look like [('x', 4), ('y', 5), ('z', 1)]
                possibilities = sorted([(item, rankedheadwords[item]) for item in morphdict[word]], key=lambda x: x[1])
                # the first item of the last tuple is the winner
                lemmatized.append(possibilities[-1][0])
            except KeyError:
                pass
        if lemmatized:
            bagsofwords.append(lemmatized)

    return bagsofwords

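# A tiny, hedged sketch of the inputs buildwinnertakesallbagsofwords() expects: morphdict
# maps an observed form to its possible headwords and sentences are lists of observed
# forms; the values are invented placeholders built around the esse/sum/edo example above.
def example_buildwinnertakesallbagsofwords_usage() -> deque:
    """

    'esse' should resolve to whichever of 'sum' and 'edo' is more prevalent

    """

    morphdict = {'esse': {'sum', 'edo'}, 'est': {'sum', 'edo'}}
    sentences = [['esse', 'est'], ['est']]

    return buildwinnertakesallbagsofwords(morphdict, sentences)
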