Example #1
def bulkfindwordcounts(listofwords: List[str]) -> List[dbWordCountObject]:
    """

	note that the words in the list should all start with the same letter since the wordcount tables are letter-keyed

	hipparchiaDB=# CREATE TEMP TABLE bulkcounter_51807f8bbe08 AS SELECT values AS  entriestocheck FROM unnest(ARRAY['κατακλειούϲηϲ', 'κατακλῇϲαι', 'κατακλεῖϲαι']) values;

	hipparchiaDB=# SELECT * FROM wordcounts_κ WHERE EXISTS (SELECT 1 FROM bulkcounter_51807f8bbe08 tocheck WHERE tocheck.entriestocheck = wordcounts_κ.entry_name);
	  entry_name   | total_count | gr_count | lt_count | dp_count | in_count | ch_count
	---------------+-------------+----------+----------+----------+----------+----------
	 κατακλεῖϲαι   |          31 |       30 |        0 |        0 |        1 |        0
	 κατακλειούϲηϲ |           3 |        3 |        0 |        0 |        0 |        0
	 κατακλῇϲαι    |           1 |        1 |        0 |        0 |        0 |        0
	(3 rows)

	:param listofwords:
	:return:
	"""

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbcursor = dbconnection.cursor()

    try:
        firstletteroffirstword = stripaccents(listofwords[0][0])
    except IndexError:
        return list()

    if firstletteroffirstword not in 'abcdefghijklmnopqrstuvwxyzαβψδεφγηιξκλμνοπρϲτυωχθζ':
        firstletteroffirstword = '0'

    tqtemplate = """
	CREATE TEMP TABLE bulkcounter_{rnd} AS
		SELECT values AS 
			entriestocheck FROM unnest(ARRAY[%s]) values
	"""

    uniquename = assignuniquename(12)
    tempquery = tqtemplate.format(rnd=uniquename)
    data = (listofwords, )
    dbcursor.execute(tempquery, data)

    qtemplate = """
	SELECT * FROM wordcounts_{x} WHERE EXISTS 
		(SELECT 1 FROM bulkcounter_{rnd} tocheck WHERE tocheck.entriestocheck = wordcounts_{x}.entry_name)
	"""

    query = qtemplate.format(rnd=uniquename, x=firstletteroffirstword)
    try:
        dbcursor.execute(query)
        results = resultiterator(dbcursor)
    except psycopg2.ProgrammingError:
        # if you do not have the wordcounts installed: 'ProgrammingError: relation "wordcounts_a" does not exist'
        results = list()

    wordcountobjects = [dbWordCountObject(*r) for r in results]

    dbconnection.connectioncleanup()

    return wordcountobjects
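For context, a minimal standalone sketch of the temp-table lookup pattern that bulkfindwordcounts() wraps, written against plain psycopg2; the DSN is a placeholder and the sketch uses fetchall() instead of the project's resultiterator().

import psycopg2

words = ['κατακλειούϲηϲ', 'κατακλῇϲαι', 'κατακλεῖϲαι']

conn = psycopg2.connect('dbname=hipparchiaDB')  # hypothetical DSN
cur = conn.cursor()

# psycopg2 adapts the python list to a postgres array, so unnest() can expand it
cur.execute('CREATE TEMP TABLE bulkcounter_demo AS SELECT unnest(%s) AS entriestocheck', (words,))
cur.execute('SELECT * FROM wordcounts_κ WHERE EXISTS '
            '(SELECT 1 FROM bulkcounter_demo t WHERE t.entriestocheck = wordcounts_κ.entry_name)')
rows = cur.fetchall()
conn.close()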
Example #2
def precomposedsqlsearcher(querydict, dbcursor) -> Generator:
    """

    as per substringsearchintosqldict():
        sq = { table1: {query: q, data: d, temptable: t},
        table2: {query: q, data: d, temptable: t},
        ... }

    this function is only sent the dict at sq[tableN]

    """

    t = querydict['temptable']
    q = querydict['query']
    d = (querydict['data'],)

    if t:
        unique = assignuniquename()
        t = re.sub('UNIQUENAME', unique, t)
        q = re.sub('UNIQUENAME', unique, q)
        dbcursor.execute(t)

    found = list()

    # debugmessage('precomposedsqlsearcher() querydict = {q}'.format(q=querydict))
    # debugmessage('precomposedsqlsearcher() q:\n\t{q}\nd:\n\t{d}'.format(q=q, d=d))

    warnings = {
        1: 'DataError; cannot search for »{d}«\n\tcheck for unbalanced parentheses and/or bad regex',
        2: 'psycopg2.InternalError; did not execute query="{q}" and data="{d}"',
        3: 'precomposedsqlsearcher() DatabaseError for {c} @ {p}',
        4: 'precomposedsqlsearcher() IndexError: malformed query/data combination; empty results returned'
    }

    try:
        dbcursor.execute(q, d)
        found = resultiterator(dbcursor)
    except psycopg2.DataError:
        # e.g., invalid regular expression: parentheses () not balanced
        consolewarning(warnings[1].format(d=d[0]), color='red')
    except psycopg2.InternalError:
        # current transaction is aborted, commands ignored until end of transaction block
        consolewarning(warnings[2].format(q=q, d=d), color='red')
    except psycopg2.DatabaseError:
        # psycopg2.DatabaseError: error with status PGRES_TUPLES_OK and no message from the libpq
        # added to track PooledConnection threading issues
        # will see: 'DatabaseError for <cursor object at 0x136bab520; closed: 0> @ Process-4'
        consolewarning(warnings[3].format(c=dbcursor, p=multiprocessing.current_process().name), color='red')
        consolewarning('\tq, d: {q}, {d}'.format(q=q, d=d))
    except IndexError:
        found = list()
        consolewarning(warnings[4], color='red')
        consolewarning("\tq = {q}".format(q=q), color='red')
        consolewarning("\td = {d}\n===========".format(d=d), color='yellow')

    return found
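For reference, a hypothetical querydict of the shape precomposedsqlsearcher() expects, i.e. one value from the sq dict sketched in the docstring; the table name, query text, and search term are illustrative only.

querydict = {
    'temptable': str(),  # or a 'CREATE TEMP TABLE ..._UNIQUENAME ...' statement
    'query': 'SELECT * FROM gr0059 WHERE stripped_line ~* %s LIMIT 200',
    'data': 'βαλλ',
}

# found = precomposedsqlsearcher(querydict, dbcursor)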
Example #3
def __init__(self, autocommit, readonlyconnection):
    # note that only autocommit='autocommit' will make a difference
    self.autocommit = autocommit
    self.readonlyconnection = readonlyconnection
    self.commitcount = hipparchia.config['MPCOMMITCOUNT']
    # used for the key for getconn() and putconn(); but unneeded if PersistentConnectionPool
    # also useful to have on hand for debugging
    self.uniquename = assignuniquename()
    # the next two must get filled out when the actual connection is made
    self.dbconnection = None
    self.curs = None
Example #4
def bulklexicalgrab(listofwords: List[str], tabletouse: str, targetcolumn: str,
                    language: str) -> list:
    """

	grab a bunch of lex/morph entries by using a temp table

	e.g.,
		lexicalresults = bulklexicalgrab(listofwords, 'dictionary', 'entry_name', language)
		results = bulklexicalgrab(listofwords, 'morphology', 'observed_form', language)

	:param listofwords:
	:param tabletouse:
	:return:
	"""

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbcursor = dbconnection.cursor()

    tqtemplate = """
	CREATE TEMP TABLE bulklex_{rnd} AS
		SELECT values AS 
			entriestocheck FROM unnest(ARRAY[%s]) values
	"""

    uniquename = assignuniquename(12)
    tempquery = tqtemplate.format(rnd=uniquename)
    data = (listofwords, )
    dbcursor.execute(tempquery, data)

    qtemplate = """
	SELECT * FROM {lg}_{thetable} WHERE EXISTS 
		(SELECT 1 FROM bulklex_{rnd} tocheck WHERE tocheck.entriestocheck = {lg}_{thetable}.{target})
	"""

    query = qtemplate.format(rnd=uniquename,
                             thetable=tabletouse,
                             target=targetcolumn,
                             lg=language)

    try:
        dbcursor.execute(query)
        results = resultiterator(dbcursor)
    except psycopg2.ProgrammingError:
        # if you do not have the wordcounts installed: 'ProgrammingError: relation "wordcounts_a" does not exist'
        results = list()

    dbconnection.connectioncleanup()

    return results
Example #5
def insertuniqunames(sqldict: dict) -> dict:
    """

	swap out "_UNIQUENAME" in the prerolled queries

	"""

    for item in sqldict:
        if sqldict[item]['temptable']:
            u = assignuniquename()
            sqldict[item]['query'] = re.sub(r'UNIQUENAME', u,
                                            sqldict[item]['query'])
            sqldict[item]['temptable'] = re.sub(r'UNIQUENAME', u,
                                                sqldict[item]['temptable'])

    return sqldict
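A hypothetical before/after for insertuniqunames(); the table and query strings are illustrative, and the substituted suffix is whatever assignuniquename() happens to return.

sqldict = {
    'gr0032': {
        'temptable': 'CREATE TEMP TABLE gr0032_includelist_UNIQUENAME AS ...',
        'query': 'SELECT * FROM gr0032 WHERE EXISTS (SELECT 1 FROM gr0032_includelist_UNIQUENAME incl ...)',
        'data': 'βαλλ',
    }
}

sqldict = insertuniqunames(sqldict)
# both strings now carry the same random suffix in place of 'UNIQUENAME'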
Example #6
def rankheadwordsbyprevalence(listofheadwords: list) -> dict:
    """

	"""

    # print('rankheadwordsbyprevalence() listofheadwords', listofheadwords)

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbconnection.setautocommit()
    dbcursor = dbconnection.cursor()
    rnd = assignuniquename(6)

    tqtemplate = """
	CREATE TEMPORARY TABLE temporary_headwordlist_{rnd} AS
		SELECT headwords AS hw FROM unnest(ARRAY[{allwords}]) headwords
	"""

    qtemplate = """
	SELECT entry_name, total_count FROM {db} 
		WHERE EXISTS 
			(SELECT 1 FROM temporary_headwordlist_{rnd} temptable WHERE temptable.hw = {db}.entry_name)
	"""

    tempquery = tqtemplate.format(rnd=rnd, allwords=list(listofheadwords))
    dbcursor.execute(tempquery)
    # https://www.psycopg.org/docs/extras.html#psycopg2.extras.execute_values
    # third parameter is

    query = qtemplate.format(rnd=rnd, db='dictionary_headword_wordcounts')
    dbcursor.execute(query)
    results = resultiterator(dbcursor)

    ranked = {r[0]: r[1] for r in results}

    # you have a problem: you just tossed a bunch of headwords that did not have good prevalence data
    # discovered when Ϲωκράτηϲ went missing from Plato

    r = set(ranked.keys())
    h = set(listofheadwords)
    delta = h - r

    nullranked = {d: 0 for d in delta}

    ranked = {**ranked, **nullranked}

    return ranked
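The closing merge in miniature: headwords with no prevalence data come back at rank 0 instead of vanishing (the Ϲωκράτηϲ problem flagged in the comments). The count below is invented for the illustration.

ranked = {'λόγοϲ': 312}                      # invented count
listofheadwords = ['λόγοϲ', 'Ϲωκράτηϲ']

delta = set(listofheadwords) - set(ranked)
nullranked = {d: 0 for d in delta}
ranked = {**ranked, **nullranked}

assert ranked == {'λόγοϲ': 312, 'Ϲωκράτηϲ': 0}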
Example #7
def loadusersdict(knownusersandpasswords=None):
    """

	return the userobjects we know about

	note that this is effectively empty: no dict of users is being passed ATM

	anyone with ambitions re. a collection of users should insert them via securitysettings.py

		KNOWNUSERSDICT = {'user1': 'pass1', 'user2': 'pass2'}

	elaborate user and authentication schemes are a non-priority (as is encryption...)

	:return:
	"""

    userlist = list()

    if not knownusersandpasswords and hipparchia.config['KNOWNUSERSDICT']:
        knownusersandpasswords = hipparchia.config['KNOWNUSERSDICT']
        userlist = [
            PassUser(k, knownusersandpasswords[k])
            for k in knownusersandpasswords
        ]

    if hipparchia.config['SETADEFAULTUSER']:
        thepass = hipparchia.config['DEFAULTREMOTEPASS']
        if thepass == 'yourremoteuserpassheretrytomakeitstrongplease':
            thepass = assignuniquename()
            consolewarning(
                'DEFAULTREMOTEPASS cannot be left as "yourremoteuserpassheretrytomakeitstrongplease"'
            )
            consolewarning(
                'temporary one-time password is "{p}"'.format(p=thepass))
        defaultuser = PassUser(hipparchia.config['DEFAULTREMOTEUSER'], thepass)
        userlist.append(defaultuser)

    # anonymoususer = PassUser('Anonymous', 'NoPassword')
    # userlist.append(anonymoususer)

    usersdict = {u.username: u for u in userlist}

    return usersdict
Example #8
def storevectorgraph(figureasbytes):
	"""

	store a graph in the image table so that you can subsequently display it in the browser

	note that images get deleted after use

	also note that we hand the data to the db and then immediately grab it out of the db because of
	constraints imposed by the way flask works

	:param figureasbytes:
	:return:
	"""

	dbconnection = ConnectionObject(ctype='rw')
	dbconnection.setautocommit()
	cursor = dbconnection.cursor()

	# avoid psycopg2.DataError: value too long for type character varying(12)
	randomid = assignuniquename(12)

	q = """
	INSERT INTO public.storedvectorimages 
		(imagename, imagedata)
		VALUES (%s, %s)
	"""

	d = (randomid, figureasbytes)
	try:
		cursor.execute(q, d)
	except psycopg2.ProgrammingError:
		# psycopg2.ProgrammingError: relation "public.storedvectorimages" does not exist
		createstoredimagestable()
		cursor.execute(q, d)

	# print('stored {n} in vector image table'.format(n=randomid))

	dbconnection.connectioncleanup()

	return randomid
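A minimal standalone sketch of the same INSERT with plain psycopg2, assuming the storedvectorimages table already exists; the DSN, image name, and byte payload are placeholders.

import psycopg2

conn = psycopg2.connect('dbname=hipparchiaDB')  # hypothetical DSN
cur = conn.cursor()

q = 'INSERT INTO public.storedvectorimages (imagename, imagedata) VALUES (%s, %s)'
# psycopg2.Binary() wraps raw bytes for a bytea column; the name stays within varchar(12)
d = ('demo12chars1', psycopg2.Binary(b'\x89PNG placeholder bytes'))
cur.execute(q, d)

conn.commit()
conn.close()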
Example #9
def bulkenvironsfetcher(table: str, searchresultlist: list,
                        context: int) -> list:
    """

	given a list of SearchResult objects, populate the lineobjects of each SearchResult with their contexts

	:param table:
	:param searchresultlist:
	:param context:
	:return:
	"""

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbconnection.setautocommit()
    cursor = dbconnection.cursor()

    tosearch = deque()
    reversemap = dict()

    for r in searchresultlist:
        resultnumber = r.hitnumber
        focusline = r.getindex()
        environs = list(
            range(int(focusline - (context / 2)),
                  int(focusline + (context / 2)) + 1))
        tosearch.extend(environs)
        rmap = {e: resultnumber for e in environs}
        reversemap.update(rmap)
        r.lineobjects = list()

    tosearch = [str(x) for x in tosearch]

    tqtemplate = """
	CREATE TEMPORARY TABLE {au}_includelist_{ac} AS
		SELECT values AS 
			includeindex FROM unnest(ARRAY[{lines}]) values
	"""

    # avoidcollisions instead of DROP TABLE IF EXISTS; the table disappears when the connection is cleaned up
    avoidcollisions = assignuniquename()

    tempquery = tqtemplate.format(au=table,
                                  ac=avoidcollisions,
                                  lines=','.join(tosearch))
    cursor.execute(tempquery)

    qtemplate = """
	SELECT {wtmpl} FROM {au} WHERE EXISTS 
		(SELECT 1 FROM {au}_includelist_{ac} incl WHERE incl.includeindex = {au}.index)
	"""

    query = qtemplate.format(wtmpl=worklinetemplate,
                             au=table,
                             ac=avoidcollisions)
    cursor.execute(query)
    results = resultiterator(cursor)

    lines = [dblineintolineobject(r) for r in results]
    indexedlines = {l.index: l for l in lines}

    for r in searchresultlist:
        environs = list(
            range(int(r.getindex() - (context / 2)),
                  int(r.getindex() + (context / 2)) + 1))
        for e in environs:
            try:
                r.lineobjects.append(indexedlines[e])
            except KeyError:
                # you requested a line that was outside of the scope of the table
                # so there was no result and the key will not match a find
                pass

    dbconnection.connectioncleanup()

    return searchresultlist
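The window arithmetic in miniature: each focus line is expanded into a window of context-many neighbouring indices, split evenly before and after the hit.

focusline, context = 1000, 4
environs = list(range(int(focusline - (context / 2)), int(focusline + (context / 2)) + 1))
assert environs == [998, 999, 1000, 1001, 1002]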
Example #10
def subqueryphrasesearch(workerid, foundlineobjects: ListProxy,
                         searchphrase: str, listofplacestosearch: ListProxy,
                         searchobject: SearchObject,
                         dbconnection) -> ListProxy:
    """

    foundlineobjects, searchingfor, searchlist, commitcount, whereclauseinfo, activepoll

    use subquery syntax to grab multi-line windows of text for phrase searching

    line-end and line-beginning issues can be overcome this way, but then you have plenty of
    bookkeeping to do to get the proper results focused on the right line

    tablestosearch:
        ['lt0400', 'lt0022', ...]

    a search inside of Ar., Eth. Eud.:

        SELECT secondpass.index, secondpass.accented_line
                FROM (SELECT firstpass.index, firstpass.linebundle, firstpass.accented_line FROM
                    (SELECT index, accented_line,
                        concat(accented_line, ' ', lead(accented_line) OVER (ORDER BY index ASC)) as linebundle
                        FROM gr0086 WHERE ( (index BETWEEN 15982 AND 18745) ) ) firstpass
                    ) secondpass
                WHERE secondpass.linebundle ~ %s  LIMIT 200

    a search in x., hell and x., mem less book 3 of hell and book 2 of mem:
        SELECT secondpass.index, secondpass.accented_line
                FROM (SELECT firstpass.index, firstpass.linebundle, firstpass.accented_line FROM
                    (SELECT index, accented_line,
                        concat(accented_line, ' ', lead(accented_line) OVER (ORDER BY index ASC)) as linebundle
                        FROM gr0032 WHERE ( (index BETWEEN 1 AND 7918) OR (index BETWEEN 7919 AND 11999) ) AND ( (index NOT BETWEEN 1846 AND 2856) AND (index NOT BETWEEN 8845 AND 9864) ) ) firstpass
                    ) secondpass
                WHERE secondpass.linebundle ~ %s  LIMIT 200

    :return:
    """
    # print('subqueryphrasesearch()')
    so = searchobject
    activepoll = so.poll

    # build incomplete sfo that will handle everything other than iteratethroughsearchlist()
    sfo = returnsearchfncobject(workerid, foundlineobjects,
                                listofplacestosearch, so, dbconnection, None)

    querytemplate = """
		SELECT secondpass.index, secondpass.{co} FROM 
			(SELECT firstpass.index, firstpass.linebundle, firstpass.{co} FROM
					(SELECT index, {co}, concat({co}, ' ', lead({co}) OVER (ORDER BY index ASC)) AS linebundle
						FROM {db} {whr} ) firstpass
			) secondpass
		WHERE secondpass.linebundle ~ %s {lim}"""

    wheretemplate = """
	WHERE EXISTS
		(SELECT 1 FROM {tbl}_includelist_{a} incl WHERE incl.includeindex = {tbl}.index)
	"""

    # substringsearch() needs ability to CREATE TEMPORARY TABLE
    sfo.dbconnection.setreadonly(False)
    dbcursor = sfo.dbconnection.cursor()

    qcomb = QueryCombinator(searchphrase)
    # the last item is the full phrase:  ('one two three four five', '')
    combinations = qcomb.combinations()
    combinations.pop()
    # lines start/end
    sp = re.sub(r'^\s', r'(^|\\s)', searchphrase)
    sp = re.sub(r'\s$', r'(\\s|$)', sp)
    # on the reasoning behind the following substitution see 'DEBUGGING notes: SQL oddities' above
    # sp = re.sub(r' ', r'\\s', sp)

    if not so.onehit:
        lim = ' LIMIT ' + str(so.cap)
    else:
        # the windowing problem means that '1' might be something that gets discarded
        lim = ' LIMIT 5'

    if so.redissearchlist:
        listofplacestosearch = True

    while listofplacestosearch and activepoll.gethits() <= so.cap:
        # sfo.getnextfnc() also takes care of the commitcount
        authortable = sfo.getnextfnc()
        sfo.updatepollremaining()

        if authortable:
            whr = str()
            r = so.indexrestrictions[authortable]
            if r['type'] == 'between':
                indexwedwhere = buildbetweenwhereextension(authortable, so)
                if indexwedwhere != '':
                    # indexwedwhere will come back with an extraneous ' AND'
                    indexwedwhere = indexwedwhere[:-4]
                    whr = 'WHERE {iw}'.format(iw=indexwedwhere)
            elif r['type'] == 'temptable':
                avoidcollisions = assignuniquename()
                q = r['where']['tempquery']
                q = re.sub('_includelist',
                           '_includelist_{a}'.format(a=avoidcollisions), q)
                dbcursor.execute(q)
                whr = wheretemplate.format(tbl=authortable, a=avoidcollisions)

            query = querytemplate.format(db=authortable,
                                         co=so.usecolumn,
                                         whr=whr,
                                         lim=lim)
            data = (sp, )
            # print('subqueryphrasesearch() find indices() q,d:\n\t',query, data)
            dbcursor.execute(query, data)
            indices = [i[0] for i in dbcursor.fetchall()]
            # this will yield a bunch of windows: you need to find the centers; see 'while...' below

            locallineobjects = list()
            if indices:
                for i in indices:
                    query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(
                        wtmpl=worklinetemplate, tb=authortable)
                    data = (i, )
                    # print('subqueryphrasesearch() iterate through indices() q,d:\n\t', query, data)
                    dbcursor.execute(query, data)
                    locallineobjects.append(
                        dblineintolineobject(dbcursor.fetchone()))

            locallineobjects.reverse()
            # debugging
            # for l in locallineobjects:
            #	print(l.universalid, l.locus(), getattr(l,so.usewordlist))

            gotmyonehit = False
            while locallineobjects and activepoll.gethits(
            ) <= so.cap and not gotmyonehit:
                # windows of indices come back: e.g., three lines that look like they match when only one matches [3131, 3132, 3133]
                # figure out which line is really the line with the goods
                # it is not nearly so simple as picking the 2nd element in any run of 3: the runs are not always 3 long, and matches in
                # subsequent lines mean that you really should check your work carefully; this is not an especially costly
                # operation relative to the whole search and esp. relative to the speed gains of using a subquery search
                lineobject = locallineobjects.pop()
                if re.search(sp, getattr(lineobject, so.usewordlist)):
                    sfo.addnewfindstolistoffinds([lineobject])
                    activepoll.addhits(1)
                    if so.onehit:
                        gotmyonehit = True
                else:
                    try:
                        nextline = locallineobjects[0]
                    except IndexError:
                        nextline = makeablankline('gr0000w000', -1)

                    if lineobject.wkuinversalid != nextline.wkuinversalid or lineobject.index != (
                            nextline.index - 1):
                        # you grabbed the next line on the pile (e.g., index = 9999), not the actual next line (e.g., index = 101)
                        # usually you won't get a hit by grabbing the next db line, but sometimes you do...
                        query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(
                            wtmpl=worklinetemplate, tb=authortable)
                        data = (lineobject.index + 1, )
                        # print('subqueryphrasesearch() "while locallineobjects..." loop q,d:\n\t', query, data)
                        dbcursor.execute(query, data)
                        try:
                            nextline = dblineintolineobject(
                                dbcursor.fetchone())
                        except:
                            nextline = makeablankline('gr0000w000', -1)

                    for c in combinations:
                        tail = c[0] + '$'
                        head = '^' + c[1]
                        # debugging
                        # print('re',getattr(lo,so.usewordlist),tail, head, getattr(next,so.usewordlist))

                        t = False
                        h = False
                        try:
                            t = re.search(tail,
                                          getattr(lineobject, so.usewordlist))
                        except re.error:
                            pass
                        try:
                            h = re.search(head,
                                          getattr(nextline, so.usewordlist))
                        except re.error:
                            pass

                        if t and h:
                            sfo.addnewfindstolistoffinds([lineobject])
                            activepoll.addhits(1)
                            if so.onehit:
                                gotmyonehit = True
        else:
            # redis will return None for authortable if the set is now empty
            listofplacestosearch = None

    sfo.listcleanup()

    if sfo.needconnectioncleanup:
        sfo.dbconnection.connectioncleanup()

    return foundlineobjects
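The line-boundary rewrite from the top of subqueryphrasesearch() in miniature: a leading or trailing space in the search phrase is widened so the match can also sit at a line start or end inside the bundled text; the phrase is illustrative.

import re

searchphrase = ' dolor sit '
sp = re.sub(r'^\s', r'(^|\\s)', searchphrase)
sp = re.sub(r'\s$', r'(\\s|$)', sp)

assert sp == r'(^|\s)dolor sit(\s|$)'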
Example #11
def substringsearch(seeking: str,
                    authortable: str,
                    searchobject: SearchObject,
                    cursor,
                    templimit=None) -> Generator:
    """

    actually one of the most basic search types: look for a string/substring

    the whereclause is built conditionally:

    sample 'unrestricted':
        SELECT * FROM gr0059 WHERE  ( stripped_line ~* %s )  LIMIT 200 ('βαλλ',)
        [i.e, SELECT * FROM gr0059 WHERE  ( stripped_line ~* 'βαλλ') LIMIT 200;]
    sample 'between':
        SELECT * FROM gr0032 WHERE (index BETWEEN 1846 AND 2856) AND (index NOT BETWEEN 1846 AND 2061) AND ( stripped_line ~* %s )  LIMIT 200 ('βαλλ',)
    sample 'temptable':
        [create the temptable]
        SELECT * FROM in1204 WHERE EXISTS (SELECT 1 FROM in1204_includelist incl WHERE incl.includeindex = in1204.index AND in1204.accented_line ~* %s)  LIMIT 200 ('τούτου',)

    :param seeking:
    :param authortable:
    :param searchobject:
    :param cursor:
    :param templimit:
    :return:
    """

    so = searchobject

    if templimit:
        lim = str(templimit)
    else:
        lim = str(so.cap)

    if so.onehit:
        mylimit = ' LIMIT 1'
    else:
        mylimit = ' LIMIT {lim}'.format(lim=lim)

    mysyntax = '~*'
    found = list()

    r = so.indexrestrictions[authortable]
    whereextensions = str()

    if r['type'] == 'temptable':
        # make the table
        q = r['where']['tempquery']
        avoidcollisions = assignuniquename()
        q = re.sub('_includelist',
                   '_includelist_{a}'.format(a=avoidcollisions), q)
        cursor.execute(q)
        # now you can work with it
        wtemplate = """
		EXISTS
			(SELECT 1 FROM {tbl}_includelist_{a} incl WHERE incl.includeindex = {tbl}.index
		"""
        whereextensions = wtemplate.format(a=avoidcollisions, tbl=authortable)
        whr = 'WHERE {xtn} AND {au}.{col} {sy} %s)'.format(au=authortable,
                                                           col=so.usecolumn,
                                                           sy=mysyntax,
                                                           xtn=whereextensions)
    elif r['type'] == 'between':
        whereextensions = buildbetweenwhereextension(authortable, so)
        whr = 'WHERE {xtn} ( {c} {sy} %s )'.format(c=so.usecolumn,
                                                   sy=mysyntax,
                                                   xtn=whereextensions)
    elif r['type'] == 'unrestricted':
        whr = 'WHERE {xtn} ( {c} {sy} %s )'.format(c=so.usecolumn,
                                                   sy=mysyntax,
                                                   xtn=whereextensions)
    else:
        # should never see this
        consolewarning('error in substringsearch(): unknown whereclause type',
                       r['type'])
        whr = 'WHERE ( {c} {sy} %s )'.format(c=so.usecolumn, sy=mysyntax)

    qtemplate = 'SELECT {wtmpl} FROM {db} {whr} {lm}'
    q = qtemplate.format(wtmpl=worklinetemplate,
                         db=authortable,
                         whr=whr,
                         lm=mylimit)
    d = (seeking, )

    # print('q/d\nq:\t{q}\nd:\t{d}\n'.format(q=q, d=d))

    try:
        cursor.execute(q, d)
        found = resultiterator(cursor)
    except psycopg2.DataError:
        # e.g., invalid regular expression: parentheses () not balanced
        consolewarning(
            'DataError; cannot search for »{d}«\n\tcheck for unbalanced parentheses and/or bad regex'
            .format(d=d[0]),
            color='red')
    except psycopg2.InternalError:
        # current transaction is aborted, commands ignored until end of transaction block
        consolewarning(
            'psycopg2.InternalError; did not execute query="{q}" and data="{d}"'
            .format(q=q, d=d),
            color='red')
    except psycopg2.DatabaseError:
        # psycopg2.DatabaseError: error with status PGRES_TUPLES_OK and no message from the libpq
        # added to track PooledConnection threading issues
        # will see: 'DatabaseError for <cursor object at 0x136bab520; closed: 0> @ Process-4'
        consolewarning('DatabaseError for {c} @ {p}'.format(
            c=cursor, p=multiprocessing.current_process().name),
                       color='red')
        consolewarning('\tq, d: {q}, {d}'.format(q=q, d=d))

    return found
Example #12
def buildwinnertakesallbagsofwords(morphdict, sentences) -> deque:
    """

	turn a list of sentences into a list of list of headwords

	here we figure out which headword is the dominant homonym

	then we just use that term

		esse ===> sum
		esse =/=> edo

	assuming that it is faster to do this twice so you can use a single temp-table query rather than iterate queries against the DB

	not tested/profiled, though...

	:param morphdict:
	:param sentences:
	:return:
	"""

    # PART ONE: figure out who the "winners" are going to be

    bagsofwords = buildflatbagsofwords(morphdict, sentences)

    allheadwords = {w for bag in bagsofwords for w in bag}

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbconnection.setautocommit()
    dbcursor = dbconnection.cursor()

    rnd = assignuniquename(6)

    tqtemplate = """
	CREATE TEMPORARY TABLE temporary_headwordlist_{rnd} AS
		SELECT headwords AS hw FROM unnest(ARRAY[{allwords}]) headwords
	"""

    qtemplate = """
	SELECT entry_name, total_count FROM {db} 
		WHERE EXISTS 
			(SELECT 1 FROM temporary_headwordlist_{rnd} temptable WHERE temptable.hw = {db}.entry_name)
	"""

    tempquery = tqtemplate.format(rnd=rnd, allwords=list(allheadwords))
    dbcursor.execute(tempquery)
    # https://www.psycopg.org/docs/extras.html#psycopg2.extras.execute_values
    # third parameter is

    query = qtemplate.format(rnd=rnd, db='dictionary_headword_wordcounts')
    dbcursor.execute(query)
    results = resultiterator(dbcursor)

    rankedheadwords = {r[0]: r[1] for r in results}

    # PART TWO: let the winners take all

    bagsofwords = deque()
    for s in sentences:
        lemmatized = deque()
        for word in s:
            # [('x', 4), ('y', 5), ('z', 1)]
            try:
                possibilities = sorted([(item, rankedheadwords[item])
                                        for item in morphdict[word]],
                                       key=lambda x: x[1])
                # first item of last tuple is the winner
                lemmatized.append(possibilities[-1][0])
            except KeyError:
                pass
        if lemmatized:
            bagsofwords.append(lemmatized)

    return bagsofwords
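The winner-takes-all step in miniature: among the candidate headwords for a form, the one with the highest prevalence count wins, so esse resolves to sum rather than edo; the counts are invented for the illustration.

morphdict = {'esse': {'sum', 'edo'}}
rankedheadwords = {'sum': 19000, 'edo': 600}   # invented counts

possibilities = sorted([(item, rankedheadwords[item]) for item in morphdict['esse']], key=lambda x: x[1])
winner = possibilities[-1][0]

assert winner == 'sum'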