def bulkfindwordcounts(listofwords: List[str]) -> List[dbWordCountObject]:
	"""
	grab the wordcount rows for a whole list of words in one shot by staging
	the words in a temp table

	note that the lists of words should all start with the same letter since
	the wordcount tables are letter-keyed

	hipparchiaDB=# CREATE TEMP TABLE bulkcounter_51807f8bbe08 AS SELECT values AS entriestocheck FROM unnest(ARRAY['κατακλειούϲηϲ', 'κατακλῇϲαι', 'κατακλεῖϲαι']) values;

	hipparchiaDB=# SELECT * FROM wordcounts_κ WHERE EXISTS (SELECT 1 FROM bulkcounter_51807f8bbe08 tocheck WHERE tocheck.entriestocheck = wordcounts_κ.entry_name);
	  entry_name   | total_count | gr_count | lt_count | dp_count | in_count | ch_count
	---------------+-------------+----------+----------+----------+----------+----------
	 κατακλεῖϲαι   |          31 |       30 |        0 |        0 |        1 |        0
	 κατακλειούϲηϲ |           3 |        3 |        0 |        0 |        0 |        0
	 κατακλῇϲαι    |           1 |        1 |        0 |        0 |        0 |        0
	(3 rows)

	:param listofwords: words to look up; they should share a first letter
	:return: a list of dbWordCountObject
	"""

	# nothing to look up: bail out before opening a connection
	# (the original opened the connection first and leaked it on this path)
	if not listofwords:
		return list()

	dbconnection = ConnectionObject(readonlyconnection=False)
	dbcursor = dbconnection.cursor()

	try:
		firstletteroffirstword = stripaccents(listofwords[0][0])
	except IndexError:
		# the first "word" was an empty string: release the connection before bailing
		dbconnection.connectioncleanup()
		return list()

	if firstletteroffirstword not in 'abcdefghijklmnopqrstuvwxyzαβψδεφγηιξκλμνοπρϲτυωχθζ':
		firstletteroffirstword = '0'

	tqtemplate = """
	CREATE TEMP TABLE bulkcounter_{rnd} AS
		SELECT values AS entriestocheck FROM unnest(ARRAY[%s]) values
	"""

	uniquename = assignuniquename(12)
	tempquery = tqtemplate.format(rnd=uniquename)
	data = (listofwords, )
	dbcursor.execute(tempquery, data)

	qtemplate = """
	SELECT * FROM wordcounts_{x} WHERE EXISTS
		(SELECT 1 FROM bulkcounter_{rnd} tocheck WHERE tocheck.entriestocheck = wordcounts_{x}.entry_name)
	"""

	query = qtemplate.format(rnd=uniquename, x=firstletteroffirstword)

	try:
		dbcursor.execute(query)
		results = resultiterator(dbcursor)
	except psycopg2.ProgrammingError:
		# if you do not have the wordcounts installed: 'ProgrammingError: relations "wordcounts_a" does not exist
		results = list()

	# consume the iterator before the cursor goes away with the connection
	wordcountobjects = [dbWordCountObject(*r) for r in results]

	dbconnection.connectioncleanup()

	return wordcountobjects
def precomposedsqlsearcher(querydict, dbcursor) -> Generator:
	"""
	execute one "prerolled" query bundle and hand back an iterator over the finds

	as per substringsearchintosqldict():
		sq = { table1: {query: q, data: d, temptable: t},
			table2: {query: q, data: d, temptable: t},
			... }

	only sent the dict at sq[tableN]

	:param querydict: one {query: ..., data: ..., temptable: ...} bundle
	:param dbcursor: an active database cursor
	:return: the result iterator (or an empty list on failure)
	"""

	t = querydict['temptable']
	q = querydict['query']
	d = (querydict['data'],)

	if t:
		# a temp table is wanted: give it a collision-proof name wherever it appears
		unique = assignuniquename()
		t = re.sub('UNIQUENAME', unique, t)
		q = re.sub('UNIQUENAME', unique, q)
		dbcursor.execute(t)

	found = list()

	# debugmessage('precomposedsqlsearcher() querydict = {q}'.format(q=querydict))
	# debugmessage('precomposedsqlsearcher() q:\n\t{q}\nd:\n\t{d}'.format(q=q, d=d))

	warnings = {
		1: 'DataError; cannot search for »{d}«\n\tcheck for unbalanced parentheses and/or bad regex',
		2: 'psycopg2.InternalError; did not execute query="{q}" and data="{d}"',
		3: 'precomposedsqlsearcher() DatabaseError for {c} @ {p}',
		4: 'precomposedsqlsearcher() IndexError: malformed query/data combination; empty results returned'
	}

	try:
		dbcursor.execute(q, d)
		found = resultiterator(dbcursor)
	except psycopg2.DataError:
		# e.g., invalid regular expression: parentheses () not balanced
		consolewarning(warnings[1].format(d=d[0]), color='red')
	except psycopg2.InternalError:
		# current transaction is aborted, commands ignored until end of transaction block
		consolewarning(warnings[2].format(q=q, d=d), color='red')
	except psycopg2.DatabaseError:
		# psycopg2.DatabaseError: error with status PGRES_TUPLES_OK and no message from the libpq
		# added to track PooledConnection threading issues
		# will see: 'DatabaseError for <cursor object at 0x136bab520; closed: 0> @ Process-4'
		consolewarning(warnings[3].format(c=dbcursor, p=multiprocessing.current_process().name), color='red')
		# bugfix: was '.format(q=q, d=q)', so the query was reported twice and the data never shown
		consolewarning('\tq, d: {q}, {d}'.format(q=q, d=d))
	except IndexError:
		found = list()
		consolewarning(warnings[4], color='red')
		# bugfix: both messages below formatted with 'd=q', misreporting the data
		consolewarning("\tq = {q}".format(q=q), color='red')
		consolewarning("\td = :{d}\n===========".format(d=d), color='yellow')

	return found
def __init__(self, autocommit, readonlyconnection):
	"""
	stash the connection settings; the live connection and cursor are
	attached later, once the actual connection is made
	"""
	# note that only autocommit='autocommit' will make a difference
	self.autocommit = autocommit
	self.readonlyconnection = readonlyconnection
	# the unique name is the key for getconn() and putconn() (unneeded if
	# using PersistentConnectionPool); it is also handy when debugging
	self.uniquename = assignuniquename()
	self.commitcount = hipparchia.config['MPCOMMITCOUNT']
	# these two get filled out when the real connection is established
	self.dbconnection = None
	self.curs = None
def bulklexicalgrab(listofwords: List[str], tabletouse: str, targetcolumn: str, language: str) -> list:
	"""
	grab a bunch of lex/morph entries by using a temp table

	e.g.,
		lexicalresults = bulklexicalgrab(listofwords, 'dictionary', 'entry_name', language)
		results = bulklexicalgrab(listofwords, 'morphology', 'observed_form', language)

	:param listofwords: the entries/forms you are looking for
	:param tabletouse: e.g. 'dictionary' or 'morphology'
	:param targetcolumn: the column matched against the word list
	:param language: used as the table-name prefix (presumably 'greek'/'latin')
	:return: a list of the matching rows
	"""

	dbconnection = ConnectionObject(readonlyconnection=False)
	dbcursor = dbconnection.cursor()

	tqtemplate = """
	CREATE TEMP TABLE bulklex_{rnd} AS
		SELECT values AS entriestocheck FROM unnest(ARRAY[%s]) values
	"""

	uniquename = assignuniquename(12)
	tempquery = tqtemplate.format(rnd=uniquename)
	data = (listofwords, )
	dbcursor.execute(tempquery, data)

	qtemplate = """
	SELECT * FROM {lg}_{thetable} WHERE EXISTS
		(SELECT 1 FROM bulklex_{rnd} tocheck WHERE tocheck.entriestocheck = {lg}_{thetable}.{target})
	"""

	query = qtemplate.format(rnd=uniquename, thetable=tabletouse, target=targetcolumn, lg=language)

	try:
		dbcursor.execute(query)
		# bugfix: materialize before connectioncleanup(); the original returned the raw
		# resultiterator() after tearing the connection down, unlike bulkfindwordcounts(),
		# which consumes the iterator first; this also matches the '-> list' annotation
		results = list(resultiterator(dbcursor))
	except psycopg2.ProgrammingError:
		# 'ProgrammingError: relation "greek_dictionary" does not exist', vel sim.
		results = list()

	dbconnection.connectioncleanup()

	return results
def insertuniqunames(sqldict: dict) -> dict:
	"""
	swap out the "UNIQUENAME" placeholders in the prerolled queries

	each entry that wants a temp table gets its own freshly-generated name
	substituted into both its query and its temptable statement

	:param sqldict: {table: {query: q, data: d, temptable: t}, ...}
	:return: the same dict, with placeholders replaced in place
	"""

	for tablename in sqldict:
		entry = sqldict[tablename]
		if not entry['temptable']:
			continue
		freshname = assignuniquename()
		entry['query'] = re.sub(r'UNIQUENAME', freshname, entry['query'])
		entry['temptable'] = re.sub(r'UNIQUENAME', freshname, entry['temptable'])

	return sqldict
def rankheadwordsbyprevalence(listofheadwords: list) -> dict:
	"""
	map each headword to its total_count in dictionary_headword_wordcounts

	headwords with no prevalence data are kept with a count of 0 so that
	no headword silently disappears from the results

	:param listofheadwords:
	:return: {headword: count, ...}
	"""

	# print('rankheadwordsbyprevalence() listofheadwords', listofheadwords)

	# nothing to rank; also avoids formatting an empty python list into the
	# ARRAY[...] literal below, which would yield ill-formed SQL
	if not listofheadwords:
		return dict()

	dbconnection = ConnectionObject(readonlyconnection=False)
	dbconnection.setautocommit()
	dbcursor = dbconnection.cursor()
	rnd = assignuniquename(6)

	tqtemplate = """
	CREATE TEMPORARY TABLE temporary_headwordlist_{rnd} AS
		SELECT headwords AS hw FROM unnest(ARRAY[{allwords}]) headwords
	"""

	qtemplate = """
	SELECT entry_name, total_count FROM {db} WHERE EXISTS
		(SELECT 1 FROM temporary_headwordlist_{rnd} temptable WHERE temptable.hw = {db}.entry_name)
	"""

	tempquery = tqtemplate.format(rnd=rnd, allwords=list(listofheadwords))
	dbcursor.execute(tempquery)
	# https://www.psycopg.org/docs/extras.html#psycopg2.extras.execute_values

	query = qtemplate.format(rnd=rnd, db='dictionary_headword_wordcounts')
	dbcursor.execute(query)
	results = resultiterator(dbcursor)

	ranked = {r[0]: r[1] for r in results}

	# bugfix: the connection was previously never cleaned up (leak)
	dbconnection.connectioncleanup()

	# you have a problem: you just tossed a bunch of headwords that did not have good prevalence data
	# discovered when Ϲωκράτηϲ went missing from Plato
	r = set(ranked.keys())
	h = set(listofheadwords)
	delta = h - r

	nullranked = {d: 0 for d in delta}

	ranked = {**ranked, **nullranked}

	return ranked
def loadusersdict(knownusersandpasswords=None):
	"""
	return the userobjects we know about

	note that this is effectively empty: no dict of users is being passed ATM

	anyone with ambitions re. a collection of users should insert them via securitysettings.py:
		KNOWNUSERSDICT = {'user1': 'pass1', 'user2': 'pass2'}

	elaborate user and authentication schemes are a non-priority (as is encryption...)

	:param knownusersandpasswords: {username: password, ...}; falls back to the config value
	:return: {username: PassUser, ...}
	"""

	userlist = list()

	if not knownusersandpasswords and hipparchia.config['KNOWNUSERSDICT']:
		knownusersandpasswords = hipparchia.config['KNOWNUSERSDICT']

	# bugfix: guard against None/empty; the original iterated 'knownusersandpasswords'
	# unconditionally and raised TypeError when nothing was passed and the config was empty
	if knownusersandpasswords:
		userlist = [PassUser(k, knownusersandpasswords[k]) for k in knownusersandpasswords]

	if hipparchia.config['SETADEFAULTUSER']:
		thepass = hipparchia.config['DEFAULTREMOTEPASS']
		if thepass == 'yourremoteuserpassheretrytomakeitstrongplease':
			# refuse the shipped placeholder password; hand out a throwaway one instead
			thepass = assignuniquename()
			consolewarning('DEFAULTREMOTEPASS cannot be left as "yourremoteuserpassheretrytomakeitstrongplease"')
			consolewarning('temporary one-time password is "{p}"'.format(p=thepass))
		defaultuser = PassUser(hipparchia.config['DEFAULTREMOTEUSER'], thepass)
		userlist.append(defaultuser)

	# anonymoususer = PassUser('Anonymous', 'NoPassword')
	# userlist.append(anonymoususer)

	usersdict = {u.username: u for u in userlist}

	return usersdict
def storevectorgraph(figureasbytes):
	"""
	park a rendered graph in the image table so the browser can display it later

	note that images get deleted after use

	also note that we hand the data to the db and then immediately grab it back
	out of the db because of constraints imposed by the way flask works

	:param figureasbytes: the image data
	:return: the id the image was stored under
	"""

	dbconnection = ConnectionObject(ctype='rw')
	dbconnection.setautocommit()
	dbcursor = dbconnection.cursor()

	# a 12-char name: avoid psycopg2.DataError: value too long for type character varying(12)
	storageid = assignuniquename(12)

	insertstatement = """
	INSERT INTO public.storedvectorimages
		(imagename, imagedata)
		VALUES (%s, %s)
	"""
	bundle = (storageid, figureasbytes)

	try:
		dbcursor.execute(insertstatement, bundle)
	except psycopg2.ProgrammingError:
		# psycopg2.ProgrammingError: relation "public.storedvectorimages" does not exist
		# first run: build the table, then retry
		createstoredimagestable()
		dbcursor.execute(insertstatement, bundle)

	# print('stored {n} in vector image table'.format(n=storageid))

	dbconnection.connectioncleanup()

	return storageid
def bulkenvironsfetcher(table: str, searchresultlist: list, context: int) -> list:
	"""
	given a list of SearchResult objects, populate the lineobjects of each
	SearchResult with their contexts (the lines surrounding the focus line)

	:param table: the author table to fetch from
	:param searchresultlist: the SearchResult objects to fill out
	:param context: total number of context lines per result
	:return: the same searchresultlist, with lineobjects populated
	"""

	# nothing to do; also avoids generating 'ARRAY[]' below, which is ill-typed SQL
	if not searchresultlist:
		return searchresultlist

	dbconnection = ConnectionObject(readonlyconnection=False)
	dbconnection.setautocommit()
	cursor = dbconnection.cursor()

	# compute each result's window of line indices exactly once and reuse it below
	# (the original recomputed every window a second time and also built an
	# unused 'reversemap' of line index -> hitnumber: dead work, removed)
	environsforresult = list()
	for r in searchresultlist:
		focusline = r.getindex()
		environs = list(range(int(focusline - (context / 2)), int(focusline + (context / 2)) + 1))
		environsforresult.append((r, environs))
		r.lineobjects = list()

	tosearch = [str(e) for _, environs in environsforresult for e in environs]

	tqtemplate = """
	CREATE TEMPORARY TABLE {au}_includelist_{ac} AS
		SELECT values AS includeindex FROM unnest(ARRAY[{lines}]) values
	"""

	# avoidcollisions instead of DROP TABLE IF EXISTS; the table disappears when the connection is cleaned up
	avoidcollisions = assignuniquename()

	tempquery = tqtemplate.format(au=table, ac=avoidcollisions, lines=','.join(tosearch))
	cursor.execute(tempquery)

	qtemplate = """
	SELECT {wtmpl} FROM {au} WHERE EXISTS
		(SELECT 1 FROM {au}_includelist_{ac} incl WHERE incl.includeindex = {au}.index)
	"""

	query = qtemplate.format(wtmpl=worklinetemplate, au=table, ac=avoidcollisions)
	cursor.execute(query)
	results = resultiterator(cursor)

	lines = [dblineintolineobject(r) for r in results]
	indexedlines = {l.index: l for l in lines}

	for r, environs in environsforresult:
		for e in environs:
			try:
				r.lineobjects.append(indexedlines[e])
			except KeyError:
				# you requested a line that was outside of the scope of the table
				# so there was no result and the key will not match a find
				pass

	dbconnection.connectioncleanup()

	return searchresultlist
def subqueryphrasesearch(workerid, foundlineobjects: ListProxy, searchphrase: str, listofplacestosearch: ListProxy, searchobject: SearchObject, dbconnection) -> ListProxy:
	"""
	phrase-search a set of author tables by pulling multi-line "bundles" via a subquery

	foundlineobjects, searchingfor, searchlist, commitcount, whereclauseinfo, activepoll

	use subquery syntax to grab multi-line windows of text for phrase searching

	line ends and line beginning issues can be overcome this way, but then you have plenty of
	bookkeeping to do to to get the proper results focussed on the right line

	tablestosearch: ['lt0400', 'lt0022', ...]

	a search inside of Ar., Eth. Eud.:

		SELECT secondpass.index, secondpass.accented_line
				FROM (SELECT firstpass.index, firstpass.linebundle, firstpass.accented_line FROM
						(SELECT index, accented_line,
							concat(accented_line, ' ', lead(accented_line) OVER (ORDER BY index ASC)) as linebundle
							FROM gr0086 WHERE ( (index BETWEEN 15982 AND 18745) ) ) firstpass
					) secondpass
				WHERE secondpass.linebundle ~ %s  LIMIT 200

	a search in x., hell and x., mem less book 3 of hell and book 2 of mem:

		SELECT secondpass.index, secondpass.accented_line
				FROM (SELECT firstpass.index, firstpass.linebundle, firstpass.accented_line FROM
						(SELECT index, accented_line,
							concat(accented_line, ' ', lead(accented_line) OVER (ORDER BY index ASC)) as linebundle
							FROM gr0032 WHERE ( (index BETWEEN 1 AND 7918) OR (index BETWEEN 7919 AND 11999) ) AND
							( (index NOT BETWEEN 1846 AND 2856) AND (index NOT BETWEEN 8845 AND 9864) ) ) firstpass
					) secondpass
				WHERE secondpass.linebundle ~ %s  LIMIT 200

	:return: foundlineobjects (the shared ListProxy, appended to via the sfo)
	"""
	# print('subqueryphrasesearch()')
	so = searchobject
	activepoll = so.poll

	# build incomplete sfo that will handle everything other than iteratethroughsearchlist()
	sfo = returnsearchfncobject(workerid, foundlineobjects, listofplacestosearch, so, dbconnection, None)

	# each line is paired with its successor via lead() so a phrase can straddle a line break
	querytemplate = """
		SELECT secondpass.index, secondpass.{co} FROM
			(SELECT firstpass.index, firstpass.linebundle, firstpass.{co} FROM
				(SELECT index, {co}, concat({co}, ' ', lead({co}) OVER (ORDER BY index ASC)) AS linebundle
					FROM {db} {whr} ) firstpass
			) secondpass
		WHERE secondpass.linebundle ~ %s {lim}"""

	wheretempate = """
	WHERE EXISTS
		(SELECT 1 FROM {tbl}_includelist_{a} incl WHERE incl.includeindex = {tbl}.index)
	"""

	# substringsearch() needs ability to CREATE TEMPORARY TABLE
	sfo.dbconnection.setreadonly(False)
	dbcursor = sfo.dbconnection.cursor()

	qcomb = QueryCombinator(searchphrase)
	# the last item is the full phrase: ('one two three four five', '')
	combinations = qcomb.combinations()
	combinations.pop()

	# lines start/end: turn leading/trailing spaces into start/end-of-line alternations
	sp = re.sub(r'^\s', r'(^|\\s)', searchphrase)
	sp = re.sub(r'\s$', r'(\\s|$)', sp)
	# on the reasoning behind the following substitution see 'DEBUGGING notes: SQL oddities' above
	# sp = re.sub(r' ', r'\\s', sp)

	if not so.onehit:
		lim = ' LIMIT ' + str(so.cap)
	else:
		# the windowing problem means that '1' might be something that gets discarded
		lim = ' LIMIT 5'

	if so.redissearchlist:
		# NOTE(review): with redis the worklist is drained via sfo.getnextfnc();
		# 'True' just keeps the while-loop alive until redis hands back None
		listofplacestosearch = True

	while listofplacestosearch and activepoll.gethits() <= so.cap:
		# sfo.getnextfnc() also takes care of the commitcount
		authortable = sfo.getnextfnc()
		sfo.updatepollremaining()

		if authortable:
			whr = str()
			r = so.indexrestrictions[authortable]
			if r['type'] == 'between':
				indexwedwhere = buildbetweenwhereextension(authortable, so)
				if indexwedwhere != '':
					# indexwedwhere will come back with an extraneous ' AND'
					indexwedwhere = indexwedwhere[:-4]
					whr = 'WHERE {iw}'.format(iw=indexwedwhere)
			elif r['type'] == 'temptable':
				# build the include-list temp table under a collision-proof name
				avoidcollisions = assignuniquename()
				q = r['where']['tempquery']
				q = re.sub('_includelist', '_includelist_{a}'.format(a=avoidcollisions), q)
				dbcursor.execute(q)
				whr = wheretempate.format(tbl=authortable, a=avoidcollisions)

			query = querytemplate.format(db=authortable, co=so.usecolumn, whr=whr, lim=lim)
			data = (sp, )
			# print('subqueryphrasesearch() find indices() q,d:\n\t', query, data)
			dbcursor.execute(query, data)
			indices = [i[0] for i in dbcursor.fetchall()]
			# this will yield a bunch of windows: you need to find the centers; see 'while...' below

			locallineobjects = list()
			if indices:
				for i in indices:
					query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(wtmpl=worklinetemplate, tb=authortable)
					data = (i, )
					# print('subqueryphrasesearch() iterate through indices() q,d:\n\t', query, data)
					dbcursor.execute(query, data)
					locallineobjects.append(dblineintolineobject(dbcursor.fetchone()))

			# reversed so .pop() walks the lines in ascending index order
			locallineobjects.reverse()
			# debugging
			# for l in locallineobjects:
			# 	print(l.universalid, l.locus(), getattr(l,so.usewordlist))

			gotmyonehit = False
			while locallineobjects and activepoll.gethits() <= so.cap and not gotmyonehit:
				# windows of indices come back: e.g., three lines that look like they match when only one matches [3131, 3132, 3133]
				# figure out which line is really the line with the goods
				# it is not nearly so simple as picking the 2nd element in any run of 3: no always runs of 3 + matches in
				# subsequent lines means that you really should check your work carefully; this is not an especially costly
				# operation relative to the whole search and esp. relative to the speed gains of using a subquery search
				lineobject = locallineobjects.pop()
				if re.search(sp, getattr(lineobject, so.usewordlist)):
					# whole phrase sits on this one line
					sfo.addnewfindstolistoffinds([lineobject])
					activepoll.addhits(1)
					if so.onehit:
						gotmyonehit = True
				else:
					# the phrase may straddle this line and the next: find the real next line
					try:
						nextline = locallineobjects[0]
					except IndexError:
						nextline = makeablankline('gr0000w000', -1)

					if lineobject.wkuinversalid != nextline.wkuinversalid or lineobject.index != (nextline.index - 1):
						# you grabbed the next line on the pile (e.g., index = 9999), not the actual next line (e.g., index = 101)
						# usually you won't get a hit by grabbing the next db line, but sometimes you do...
						query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(wtmpl=worklinetemplate, tb=authortable)
						data = (lineobject.index + 1, )
						# print('subqueryphrasesearch() "while locallineobjects..." loop q,d:\n\t', query, data)
						dbcursor.execute(query, data)
						try:
							nextline = dblineintolineobject(dbcursor.fetchone())
						except:
							nextline = makeablankline('gr0000w000', -1)

					# check every tail/head split of the phrase across the line boundary
					for c in combinations:
						tail = c[0] + '$'
						head = '^' + c[1]
						# debugging
						# print('re',getattr(lo,so.usewordlist),tail, head, getattr(next,so.usewordlist))
						t = False
						h = False
						try:
							t = re.search(tail, getattr(lineobject, so.usewordlist))
						except re.error:
							pass
						try:
							h = re.search(head, getattr(nextline, so.usewordlist))
						except re.error:
							pass

						if t and h:
							sfo.addnewfindstolistoffinds([lineobject])
							activepoll.addhits(1)
							if so.onehit:
								gotmyonehit = True
		else:
			# redis will return None for authortable if the set is now empty
			listofplacestosearch = None

	sfo.listcleanup()

	if sfo.needconnectioncleanup:
		sfo.dbconnection.connectioncleanup()

	return foundlineobjects
def substringsearch(seeking: str, authortable: str, searchobject: SearchObject, cursor, templimit=None) -> Generator:
	"""
	actually one of the most basic search types: look for a string/substring

	the whereclause is built conditionally:

	sample 'unrestricted':
		SELECT * FROM gr0059 WHERE  ( stripped_line ~* %s )  LIMIT 200 ('βαλλ',)
		[i.e, SELECT * FROM gr0059 WHERE ( stripped_line ~* 'βαλλ') LIMIT 200;]
	sample 'between':
		SELECT * FROM gr0032 WHERE (index BETWEEN 1846 AND 2856) AND (index NOT BETWEEN 1846 AND 2061) AND ( stripped_line ~* %s )  LIMIT 200 ('βαλλ',)
	sample 'temptable':
		[create the temptable]
		SELECT * FROM in1204 WHERE EXISTS (SELECT 1 FROM in1204_includelist incl WHERE incl.includeindex = in1204.index AND in1204.accented_line ~* %s)  LIMIT 200 ('τούτου',)

	:param seeking: the regex/substring to look for
	:param authortable: the table to search
	:param searchobject:
	:param cursor: an active database cursor
	:param templimit: optional override of the searchobject's result cap
	:return: a result iterator (empty list if the query failed)
	"""

	so = searchobject

	if templimit:
		lim = str(templimit)
	else:
		lim = str(so.cap)

	if so.onehit:
		mylimit = ' LIMIT 1'
	else:
		mylimit = ' LIMIT {lim}'.format(lim=lim)

	mysyntax = '~*'
	found = list()

	r = so.indexrestrictions[authortable]
	whereextensions = str()

	if r['type'] == 'temptable':
		# make the table
		q = r['where']['tempquery']
		# collision-proof name: several workers may build include-lists for the same table
		avoidcollisions = assignuniquename()
		q = re.sub('_includelist', '_includelist_{a}'.format(a=avoidcollisions), q)
		cursor.execute(q)
		# now you can work with it
		# NOTE(review): the parens here are deliberately unbalanced; 'whr' below
		# supplies the closing ')' after appending the column match
		wtempate = """
		EXISTS
			(SELECT 1 FROM {tbl}_includelist_{a} incl
			WHERE incl.includeindex = {tbl}.index
		"""
		whereextensions = wtempate.format(a=avoidcollisions, tbl=authortable)
		whr = 'WHERE {xtn} AND {au}.{col} {sy} %s)'.format(au=authortable, col=so.usecolumn, sy=mysyntax, xtn=whereextensions)
	elif r['type'] == 'between':
		whereextensions = buildbetweenwhereextension(authortable, so)
		whr = 'WHERE {xtn} ( {c} {sy} %s )'.format(c=so.usecolumn, sy=mysyntax, xtn=whereextensions)
	elif r['type'] == 'unrestricted':
		# whereextensions is still the empty string here
		whr = 'WHERE {xtn} ( {c} {sy} %s )'.format(c=so.usecolumn, sy=mysyntax, xtn=whereextensions)
	else:
		# should never see this
		consolewarning('error in substringsearch(): unknown whereclause type', r['type'])
		whr = 'WHERE ( {c} {sy} %s )'.format(c=so.usecolumn, sy=mysyntax)

	qtemplate = 'SELECT {wtmpl} FROM {db} {whr} {lm}'
	q = qtemplate.format(wtmpl=worklinetemplate, db=authortable, whr=whr, lm=mylimit)
	d = (seeking, )

	# print('q/d\nq:\t{q}\nd:\t{d}\n'.format(q=q, d=d))

	try:
		cursor.execute(q, d)
		found = resultiterator(cursor)
	except psycopg2.DataError:
		# e.g., invalid regular expression: parentheses () not balanced
		consolewarning('DataError; cannot search for »{d}«\n\tcheck for unbalanced parentheses and/or bad regex'.format(d=d[0]), color='red')
	except psycopg2.InternalError:
		# current transaction is aborted, commands ignored until end of transaction block
		consolewarning('psycopg2.InternalError; did not execute query="{q}" and data="{d}'.format(q=q, d=d), color='red')
	except psycopg2.DatabaseError:
		# psycopg2.DatabaseError: error with status PGRES_TUPLES_OK and no message from the libpq
		# added to track PooledConnection threading issues
		# will see: 'DatabaseError for <cursor object at 0x136bab520; closed: 0> @ Process-4'
		consolewarning('DatabaseError for {c} @ {p}'.format(c=cursor, p=multiprocessing.current_process().name), color='red')
		consolewarning('\tq, d', q, d)

	return found
def buildwinnertakesallbagsofwords(morphdict, sentences) -> deque:
	"""
	turn a list of sentences into a list of list of headwords

	here we figure out which headword is the dominant homonym
	then we just use that term

		esse ===> sum
		esse =/=> edo

	assuming that it is faster to do this 2x so you can do a temp table query rather than iterate into DB
	not tested/profiled, though...

	:param morphdict: {observed form: {possible headwords}} — presumably; confirm against callers
	:param sentences: iterable of tokenized sentences
	:return: a deque of per-sentence headword deques
	"""

	# PART ONE: figure out who the "winners" are going to be

	bagsofwords = buildflatbagsofwords(morphdict, sentences)

	allheadwords = {w for bag in bagsofwords for w in bag}

	dbconnection = ConnectionObject(readonlyconnection=False)
	dbconnection.setautocommit()
	dbcursor = dbconnection.cursor()

	rnd = assignuniquename(6)

	tqtemplate = """
	CREATE TEMPORARY TABLE temporary_headwordlist_{rnd} AS
		SELECT headwords AS hw FROM unnest(ARRAY[{allwords}]) headwords
	"""

	qtemplate = """
	SELECT entry_name, total_count FROM {db} WHERE EXISTS
		(SELECT 1 FROM temporary_headwordlist_{rnd} temptable WHERE temptable.hw = {db}.entry_name)
	"""

	tempquery = tqtemplate.format(rnd=rnd, allwords=list(allheadwords))
	dbcursor.execute(tempquery)
	# https://www.psycopg.org/docs/extras.html#psycopg2.extras.execute_values

	query = qtemplate.format(rnd=rnd, db='dictionary_headword_wordcounts')
	dbcursor.execute(query)
	results = resultiterator(dbcursor)

	# was 'randkedheadwords': typo in the local name fixed
	rankedheadwords = {r[0]: r[1] for r in results}

	# bugfix: the connection was previously never cleaned up (leak)
	dbconnection.connectioncleanup()

	# PART TWO: let the winners take all

	bagsofwords = deque()
	for s in sentences:
		lemmatized = deque()
		for word in s:
			try:
				# e.g. [('x', 4), ('y', 5), ('z', 1)]: sort by count so the last tuple wins
				possibilities = sorted([(item, rankedheadwords[item]) for item in morphdict[word]], key=lambda x: x[1])
				# first item of last tuple is the winner
				lemmatized.append(possibilities[-1][0])
			except KeyError:
				# word unknown to morphdict, or a candidate headword had no ranking
				pass
		if lemmatized:
			bagsofwords.append(lemmatized)

	return bagsofwords