Ejemplo n.º 1
0
def loadallauthorsasobjects() -> dict:
    """

	fetch every row of the authors table and wrap each one in a dbAuthor

	:return: {universalid: dbAuthor, ...}
	"""

    print('loading all authors...', end='')

    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()

    dbcursor.execute('SELECT * FROM authors')
    found = resultiterator(dbcursor)

    authorsdict = dict()
    for f in found:
        # column 0 is the universalid: it keys the dict
        authorsdict[f[0]] = dbAuthor(*f)

    print('\t', len(authorsdict), 'authors loaded', end='')

    dbconnection.connectioncleanup()

    return authorsdict
Ejemplo n.º 2
0
def loadallworksasobjects() -> dict:
    """

	fetch every row of the works table and wrap each one in a dbOpus

	:return: {universalid: dbOpus, ...}
	"""

    print('loading all works...  ', end='')

    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()

    q = """
	SELECT universalid, title, language, publication_info, levellabels_00, levellabels_01, levellabels_02,
		levellabels_03, levellabels_04, levellabels_05, workgenre, transmission, worktype, provenance, 
		recorded_date, converted_date, wordcount, firstline, lastline, authentic FROM works
	"""

    dbcursor.execute(q)
    found = resultiterator(dbcursor)

    worksdict = dict()
    for f in found:
        # column 0 is the universalid: it keys the dict
        worksdict[f[0]] = dbOpus(*f)

    print('\t', len(worksdict), 'works loaded', end='')

    dbconnection.connectioncleanup()

    return worksdict
Ejemplo n.º 3
0
def grabbundlesoflines(worksandboundaries: dict, cursor) -> list:
    """
	fetch the lines for every entry in 'worksandboundaries' and convert
	each row into a line object

	very generic: the dict can describe one work plus a line range, many
	whole works, or any mix of works and ranges (e.g., something fed from
	compileauthorandworklist()); big requests will yield big results

	:param worksandboundaries: {workid: (firstindex, lastindex), ...}
	:param cursor: an active database cursor
	:return: a list of line objects
	"""

    qtemplate = 'SELECT {wtmpl} FROM {db} WHERE (index >= %s AND index <= %s)'

    gathered = deque()

    for wkid, boundaries in worksandboundaries.items():
        # the first six characters of a work id name the author table
        authortable = wkid[0:6]
        q = qtemplate.format(wtmpl=worklinetemplate, db=authortable)
        d = (boundaries[0], boundaries[1])
        cursor.execute(q, d)
        found = resultiterator(cursor)
        gathered.extend(dblineintolineobject(f) for f in found)

    return list(gathered)
Ejemplo n.º 4
0
def bulkfindwordcounts(listofwords: List[str]) -> List[dbWordCountObject]:
    """

	fetch the wordcount rows for a whole batch of words in one pass by
	loading the batch into a temp table and joining against it

	note that the lists of words should all start with the same letter since the wordcount tables are letter-keyed

	hipparchiaDB=# CREATE TEMP TABLE bulkcounter_51807f8bbe08 AS SELECT values AS  entriestocheck FROM unnest(ARRAY['κατακλειούϲηϲ', 'κατακλῇϲαι', 'κατακλεῖϲαι']) values;

	hipparchiaDB=# SELECT * FROM wordcounts_κ WHERE EXISTS (SELECT 1 FROM bulkcounter_51807f8bbe08 tocheck WHERE tocheck.entriestocheck = wordcounts_κ.entry_name);
	  entry_name   | total_count | gr_count | lt_count | dp_count | in_count | ch_count
	---------------+-------------+----------+----------+----------+----------+----------
	 κατακλεῖϲαι   |          31 |       30 |        0 |        0 |        1 |        0
	 κατακλειούϲηϲ |           3 |        3 |        0 |        0 |        0 |        0
	 κατακλῇϲαι    |           1 |        1 |        0 |        0 |        0 |        0
	(3 rows)

	:param listofwords: the words to look up; they should share a first letter
	:return: a list of dbWordCountObject, one per word found
	"""

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbcursor = dbconnection.cursor()

    try:
        firstletteroffirstword = stripaccents(listofwords[0][0])
    except IndexError:
        # empty word list (or an empty first word): nothing to look up
        # BUGFIX: release the connection before bailing out; the previous
        # version returned here without cleanup and leaked the connection
        dbconnection.connectioncleanup()
        return list()

    # words that do not start with a plain greek/latin letter live in the catch-all '0' table
    if firstletteroffirstword not in 'abcdefghijklmnopqrstuvwxyzαβψδεφγηιξκλμνοπρϲτυωχθζ':
        firstletteroffirstword = '0'

    tqtemplate = """
	CREATE TEMP TABLE bulkcounter_{rnd} AS
		SELECT values AS 
			entriestocheck FROM unnest(ARRAY[%s]) values
	"""

    # a per-call unique suffix keeps concurrent searches from colliding
    uniquename = assignuniquename(12)
    tempquery = tqtemplate.format(rnd=uniquename)
    # the word list itself is passed as bound data, not interpolated into the SQL
    data = (listofwords, )
    dbcursor.execute(tempquery, data)

    qtemplate = """
	SELECT * FROM wordcounts_{x} WHERE EXISTS 
		(SELECT 1 FROM bulkcounter_{rnd} tocheck WHERE tocheck.entriestocheck = wordcounts_{x}.entry_name)
	"""

    query = qtemplate.format(rnd=uniquename, x=firstletteroffirstword)
    try:
        dbcursor.execute(query)
        results = resultiterator(dbcursor)
    except psycopg2.ProgrammingError:
        # if you do not have the wordcounts installed: 'ProgrammingError: relations "wordcounts_a" does not exist
        results = list()

    # materialize before cleanup: the cursor dies with the connection
    wordcountobjects = [dbWordCountObject(*r) for r in results]

    dbconnection.connectioncleanup()

    return wordcountobjects
def precomposedsqlsearcher(querydict, dbcursor) -> Generator:
    """

    execute one pre-built query dict and hand back whatever rows it finds

    as per substringsearchintosqldict():
        sq = { table1: {query: q, data: d, temptable: t},
        table2: {query: q, data: d, temptable: t},
        ... }

    only sent the dict at sq[tableN]

    :param querydict: {'query': ..., 'data': ..., 'temptable': ...}
    :param dbcursor: an active database cursor
    :return: the found rows (resultiterator output); an empty list on failure
    """

    t = querydict['temptable']
    q = querydict['query']
    d = (querydict['data'],)

    if t:
        # temp-table searches need a per-call unique name so that
        # concurrent searches cannot collide on the table
        unique = assignuniquename()
        t = re.sub('UNIQUENAME', unique, t)
        q = re.sub('UNIQUENAME', unique, q)
        dbcursor.execute(t)

    found = list()

    # debugmessage('precomposedsqlsearcher() querydict = {q}'.format(q=querydict))
    # debugmessage('precomposedsqlsearcher() q:\n\t{q}\nd:\n\t{d}'.format(q=q, d=d))

    warnings = {
        1: 'DataError; cannot search for »{d}«\n\tcheck for unbalanced parentheses and/or bad regex',
        2: 'psycopg2.InternalError; did not execute query="{q}" and data="{d}"',
        3: 'precomposedsqlsearcher() DatabaseError for {c} @ {p}',
        4: 'precomposedsqlsearcher() IndexError: malformed query/data combination; empty results returned'
    }

    try:
        dbcursor.execute(q, d)
        found = resultiterator(dbcursor)
    except psycopg2.DataError:
        # e.g., invalid regular expression: parentheses () not balanced
        consolewarning(warnings[1].format(d=d[0]), color='red')
    except psycopg2.InternalError:
        # current transaction is aborted, commands ignored until end of transaction block
        consolewarning(warnings[2].format(q=q, d=d), color='red')
    except psycopg2.DatabaseError:
        # psycopg2.DatabaseError: error with status PGRES_TUPLES_OK and no message from the libpq
        # added to track PooledConnection threading issues
        # will see: 'DatabaseError for <cursor object at 0x136bab520; closed: 0> @ Process-4'
        consolewarning(warnings[3].format(c=dbcursor, p=multiprocessing.current_process().name), color='red')
        # BUGFIX: was format(q=q, d=q), which printed the query twice and never the data
        consolewarning('\tq, d: {q}, {d}'.format(q=q, d=d))
    except IndexError:
        found = list()
        consolewarning(warnings[4], color='red')
        # BUGFIX: both lines below also passed 'd=q'; show the actual data
        consolewarning("\tq = {q}".format(q=q), color='red')
        consolewarning("\td = :{d}\n===========".format(d=d), color='yellow')

    return found
Ejemplo n.º 6
0
def bulklexicalgrab(listofwords: List[str], tabletouse: str, targetcolumn: str,
                    language: str) -> list:
    """

	grab a bunch of lex/morph entries by using a temp table

	e.g.,
		lexicalresults = bulklexicalgrab(listofwords, 'dictionary', 'entry_name', language)
		results = bulklexicalgrab(listofwords, 'morphology', 'observed_form', language)

	:param listofwords: the entries/forms to fetch
	:param tabletouse: suffix of the real table name ('dictionary' or 'morphology')
	:param targetcolumn: the column the words are matched against
	:param language: prefix of the real table name (e.g., 'greek' or 'latin')
	:return: the matching rows, as a list
	"""

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbcursor = dbconnection.cursor()

    tqtemplate = """
	CREATE TEMP TABLE bulklex_{rnd} AS
		SELECT values AS 
			entriestocheck FROM unnest(ARRAY[%s]) values
	"""

    # a per-call unique suffix keeps concurrent lookups from colliding
    uniquename = assignuniquename(12)
    tempquery = tqtemplate.format(rnd=uniquename)
    # the word list itself is passed as bound data, not interpolated into the SQL
    data = (listofwords, )
    dbcursor.execute(tempquery, data)

    qtemplate = """
	SELECT * FROM {lg}_{thetable} WHERE EXISTS 
		(SELECT 1 FROM bulklex_{rnd} tocheck WHERE tocheck.entriestocheck = {lg}_{thetable}.{target})
	"""

    query = qtemplate.format(rnd=uniquename,
                             thetable=tabletouse,
                             target=targetcolumn,
                             lg=language)

    try:
        dbcursor.execute(query)
        # BUGFIX: materialize before connectioncleanup() closes the cursor;
        # a lazy iterator would be useless to the caller (and '-> list' promises a list)
        results = list(resultiterator(dbcursor))
    except psycopg2.ProgrammingError:
        # if you do not have the wordcounts installed: 'ProgrammingError: relations "wordcounts_a" does not exist
        results = list()

    dbconnection.connectioncleanup()

    return results
Ejemplo n.º 7
0
def rankheadwordsbyprevalence(listofheadwords: list) -> dict:
    """

	map every headword to its total corpus count; headwords for which no
	count data exists are kept and assigned a count of zero

	:param listofheadwords: headwords to rank
	:return: {headword: total_count, ...}
	"""

    # print('rankheadwordsbyprevalence() listofheadwords', listofheadwords)

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbconnection.setautocommit()
    dbcursor = dbconnection.cursor()
    rnd = assignuniquename(6)

    tqtemplate = """
	CREATE TEMPORARY TABLE temporary_headwordlist_{rnd} AS
		SELECT headwords AS hw FROM unnest(ARRAY[{allwords}]) headwords
	"""

    qtemplate = """
	SELECT entry_name, total_count FROM {db} 
		WHERE EXISTS 
			(SELECT 1 FROM temporary_headwordlist_{rnd} temptable WHERE temptable.hw = {db}.entry_name)
	"""

    tempquery = tqtemplate.format(rnd=rnd, allwords=list(listofheadwords))
    dbcursor.execute(tempquery)
    # https://www.psycopg.org/docs/extras.html#psycopg2.extras.execute_values
    # third parameter is

    query = qtemplate.format(rnd=rnd, db='dictionary_headword_wordcounts')
    dbcursor.execute(query)
    results = resultiterator(dbcursor)

    ranked = {r[0]: r[1] for r in results}

    # BUGFIX: release the connection; the previous version never cleaned up
    # (the results are fully materialized above, so this is safe)
    dbconnection.connectioncleanup()

    # you have a problem: you just tossed a bunch of headwords that did not have good prevalence data
    # discovered when Ϲωκράτηϲ went missing from Plato

    r = set(ranked.keys())
    h = set(listofheadwords)
    delta = h - r

    nullranked = {d: 0 for d in delta}

    ranked = {**ranked, **nullranked}

    return ranked
Ejemplo n.º 8
0
def loadlemmataasobjects() -> dict:
    """

	return a dict of all possible lemmataobjects

	hipparchiaDB=# select * from greek_lemmata limit 1;
	 dictionary_entry | xref_number |    derivative_forms
	------------------+-------------+------------------------
	 ζῳοτροφία        |    49550639 | {ζῳοτροφίᾳ,ζῳοτροφίαϲ}

	:return: {dictionary_entry: dbLemmaObject, ...}
	"""

    print('loading all lemmata...', end=str())
    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()

    q = """
	SELECT dictionary_entry, xref_number, derivative_forms FROM {lang}_lemmata
	"""

    lemmatadict = dict()

    # greek first, then latin; on a key collision the entry loaded first wins
    for lang in ('greek', 'latin'):
        dbcursor.execute(q.format(lang=lang))
        found = resultiterator(dbcursor)
        fresh = {f[0]: dbLemmaObject(*f) for f in found}
        fresh.update(lemmatadict)
        lemmatadict = fresh

    print('\t', len(lemmatadict), 'lemmata loaded', end=str())
    # print('lemmatadict["molestus"]', lemmatadict['molestus'].formlist)
    # print('lemmatadict["Mausoleus"]', lemmatadict['Mausoleus'].formlist)
    # print('lemmatadict["λύω"]', lemmatadict['λύω'].formlist)
    # print('lemmatadict["Δημοϲθένηϲ"]', lemmatadict['Δημοϲθένηϲ'].formlist)

    dbconnection.connectioncleanup()

    return lemmatadict
Ejemplo n.º 9
0
def bulklinegrabber(table: str, column: str, criterion: str, setofcriteria,
                    cursor) -> dict:
    """

	snarf up a huge number of lines in a single query

	fetch one column for every row whose 'criterion' value appears in
	'setofcriteria'; the results are keyed as 'table@criterionvalue'

	:param table: the table to pull from
	:param column: the column whose values are wanted
	:param criterion: the column to filter on
	:param setofcriteria: the acceptable values of 'criterion'
	:param cursor: an active database cursor
	:return: {'table@criterionvalue': columnvalue, ...}
	"""

    query = 'SELECT {cri}, {col} FROM {t} WHERE {cri} = ANY(%s)'.format(col=column, t=table, cri=criterion)
    data = (list(setofcriteria), )

    cursor.execute(query, data)
    found = resultiterator(cursor)

    contents = dict()
    for f in found:
        contents['{t}@{i}'.format(t=table, i=f[0])] = f[1]

    return contents
Ejemplo n.º 10
0
def bulkenvironsfetcher(table: str, searchresultlist: list,
                        context: int) -> list:
    """

	given a list of SearchResult objects, populate the lineobjects of each SearchResult with their contexts

	all of the needed line indices are loaded into one temp table so that
	the surrounding lines can be fetched in a single query instead of one
	query per hit

	:param table: the author table the lines come from
	:param searchresultlist: SearchResult objects whose lineobjects will be filled in
	:param context: total number of context lines around each hit
	:return: the same SearchResult objects, with lineobjects populated
	"""

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbconnection.setautocommit()
    cursor = dbconnection.cursor()

    tosearch = deque()

    # collect every line index we will need
    # (the dead reversemap/rmap bookkeeping of the previous version was removed:
    # it was built but never read)
    for r in searchresultlist:
        focusline = r.getindex()
        environs = list(
            range(int(focusline - (context / 2)),
                  int(focusline + (context / 2)) + 1))
        tosearch.extend(environs)
        r.lineobjects = list()

    tosearch = [str(x) for x in tosearch]

    tqtemplate = """
	CREATE TEMPORARY TABLE {au}_includelist_{ac} AS
		SELECT values AS 
			includeindex FROM unnest(ARRAY[{lines}]) values
	"""

    # avoidcollisions instead of DROP TABLE IF EXISTS; the table disappears when the connection is cleaned up
    avoidcollisions = assignuniquename()

    tempquery = tqtemplate.format(au=table,
                                  ac=avoidcollisions,
                                  lines=','.join(tosearch))
    cursor.execute(tempquery)

    qtemplate = """
	SELECT {wtmpl} FROM {au} WHERE EXISTS 
		(SELECT 1 FROM {au}_includelist_{ac} incl WHERE incl.includeindex = {au}.index)
	"""

    query = qtemplate.format(wtmpl=worklinetemplate,
                             au=table,
                             ac=avoidcollisions)
    cursor.execute(query)
    results = resultiterator(cursor)

    lines = [dblineintolineobject(r) for r in results]
    # line objects keyed by their index so each hit can grab its neighborhood
    indexedlines = {l.index: l for l in lines}

    for r in searchresultlist:
        environs = list(
            range(int(r.getindex() - (context / 2)),
                  int(r.getindex() + (context / 2)) + 1))
        for e in environs:
            try:
                r.lineobjects.append(indexedlines[e])
            except KeyError:
                # you requested a line that was outside of the scope of the table
                # so there was no result and the key will not match a find
                pass

    dbconnection.connectioncleanup()

    return searchresultlist
Ejemplo n.º 11
0
def substringsearch(seeking: str,
                    authortable: str,
                    searchobject: SearchObject,
                    cursor,
                    templimit=None) -> Generator:
    """

    actually one of the most basic search types: look for a string/substring

    the whereclause is built conditionally:

    sample 'unrestricted':
        SELECT * FROM gr0059 WHERE  ( stripped_line ~* %s )  LIMIT 200 ('βαλλ',)
        [i.e, SELECT * FROM gr0059 WHERE  ( stripped_line ~* 'βαλλ') LIMIT 200;]
    sample 'between':
        SELECT * FROM gr0032 WHERE (index BETWEEN 1846 AND 2856) AND (index NOT BETWEEN 1846 AND 2061) AND ( stripped_line ~* %s )  LIMIT 200 ('βαλλ',)
    sample 'temptable':
        [create the temptable]
        SELECT * FROM in1204 WHERE EXISTS (SELECT 1 FROM in1204_includelist incl WHERE incl.includeindex = in1204.index AND in1204.accented_line ~* %s)  LIMIT 200 ('τούτου',)

    :param seeking:
    :param authortable:
    :param searchobject:
    :param cursor:
    :param templimit:
    :return:
    """

    so = searchobject

    # a caller-supplied templimit overrides the searchobject's own cap
    if templimit:
        lim = str(templimit)
    else:
        lim = str(so.cap)

    if so.onehit:
        # only one matching line per author table is wanted
        mylimit = ' LIMIT 1'
    else:
        mylimit = ' LIMIT {lim}'.format(lim=lim)

    # '~*' is the case-insensitive regex match operator
    mysyntax = '~*'
    found = list()

    # how the whereclause gets built depends on the restriction type for this table
    r = so.indexrestrictions[authortable]
    whereextensions = str()

    if r['type'] == 'temptable':
        # make the table
        q = r['where']['tempquery']
        # rename the include-list table per call so concurrent searches cannot collide
        avoidcollisions = assignuniquename()
        q = re.sub('_includelist',
                   '_includelist_{a}'.format(a=avoidcollisions), q)
        cursor.execute(q)
        # now you can work with it
        # NB: the fragment deliberately leaves its parenthesis open; the
        # 'whr' format below closes it after adding the match condition
        wtempate = """
		EXISTS
			(SELECT 1 FROM {tbl}_includelist_{a} incl WHERE incl.includeindex = {tbl}.index
		"""
        whereextensions = wtempate.format(a=avoidcollisions, tbl=authortable)
        whr = 'WHERE {xtn} AND {au}.{col} {sy} %s)'.format(au=authortable,
                                                           col=so.usecolumn,
                                                           sy=mysyntax,
                                                           xtn=whereextensions)
    elif r['type'] == 'between':
        # the index-range conditions come ready-made from the helper
        whereextensions = buildbetweenwhereextension(authortable, so)
        whr = 'WHERE {xtn} ( {c} {sy} %s )'.format(c=so.usecolumn,
                                                   sy=mysyntax,
                                                   xtn=whereextensions)
    elif r['type'] == 'unrestricted':
        # whereextensions is still the empty string here
        whr = 'WHERE {xtn} ( {c} {sy} %s )'.format(c=so.usecolumn,
                                                   sy=mysyntax,
                                                   xtn=whereextensions)
    else:
        # should never see this
        consolewarning('error in substringsearch(): unknown whereclause type',
                       r['type'])
        whr = 'WHERE ( {c} {sy} %s )'.format(c=so.usecolumn, sy=mysyntax)

    qtemplate = 'SELECT {wtmpl} FROM {db} {whr} {lm}'
    q = qtemplate.format(wtmpl=worklinetemplate,
                         db=authortable,
                         whr=whr,
                         lm=mylimit)
    # the search term is bound data; it is never interpolated into the SQL
    d = (seeking, )

    # print('q/d\nq:\t{q}\nd:\t{d}\n'.format(q=q, d=d))

    try:
        cursor.execute(q, d)
        found = resultiterator(cursor)
    except psycopg2.DataError:
        # e.g., invalid regular expression: parentheses () not balanced
        consolewarning(
            'DataError; cannot search for »{d}«\n\tcheck for unbalanced parentheses and/or bad regex'
            .format(d=d[0]),
            color='red')
    except psycopg2.InternalError:
        # current transaction is aborted, commands ignored until end of transaction block
        consolewarning(
            'psycopg2.InternalError; did not execute query="{q}" and data="{d}'
            .format(q=q, d=d),
            color='red')
    except psycopg2.DatabaseError:
        # psycopg2.DatabaseError: error with status PGRES_TUPLES_OK and no message from the libpq
        # added to track PooledConnection threading issues
        # will see: 'DatabaseError for <cursor object at 0x136bab520; closed: 0> @ Process-4'
        consolewarning('DatabaseError for {c} @ {p}'.format(
            c=cursor, p=multiprocessing.current_process().name),
                       color='red')
        consolewarning('\tq, d', q, d)

    return found
Ejemplo n.º 12
0
def buildwinnertakesallbagsofwords(morphdict, sentences) -> deque:
    """

	turn a list of sentences into a list of list of headwords

	here we figure out which headword is the dominant homonym

	then we just use that term

		esse ===> sum
		esse =/=> edo

	assuming that it is faster to do this 2x so you can do a temp table query rather than iterate into DB

	not tested/profiled, though...

	:param morphdict: {observedform: {headword1, headword2, ...}, ...} — TODO confirm shape against callers
	:param sentences: iterable of sentences, each a sequence of words
	:return: a deque of bags; each bag is a deque of winning headwords
	"""

    # PART ONE: figure out who the "winners" are going to be

    bagsofwords = buildflatbagsofwords(morphdict, sentences)

    allheadwords = {w for bag in bagsofwords for w in bag}

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbconnection.setautocommit()
    dbcursor = dbconnection.cursor()

    rnd = assignuniquename(6)

    tqtemplate = """
	CREATE TEMPORARY TABLE temporary_headwordlist_{rnd} AS
		SELECT headwords AS hw FROM unnest(ARRAY[{allwords}]) headwords
	"""

    qtemplate = """
	SELECT entry_name, total_count FROM {db} 
		WHERE EXISTS 
			(SELECT 1 FROM temporary_headwordlist_{rnd} temptable WHERE temptable.hw = {db}.entry_name)
	"""

    tempquery = tqtemplate.format(rnd=rnd, allwords=list(allheadwords))
    dbcursor.execute(tempquery)
    # https://www.psycopg.org/docs/extras.html#psycopg2.extras.execute_values
    # third parameter is

    query = qtemplate.format(rnd=rnd, db='dictionary_headword_wordcounts')
    dbcursor.execute(query)
    results = resultiterator(dbcursor)

    # typo fixed: was 'randkedheadwords'
    rankedheadwords = {r[0]: r[1] for r in results}

    # BUGFIX: release the connection; the previous version never cleaned up
    # (rankedheadwords is fully materialized above, so this is safe)
    dbconnection.connectioncleanup()

    # PART TWO: let the winners take all

    bagsofwords = deque()
    for s in sentences:
        lemmatized = deque()
        for word in s:
            # [('x', 4), ('y', 5), ('z', 1)]
            try:
                possibilities = sorted([(item, rankedheadwords[item])
                                        for item in morphdict[word]],
                                       key=lambda x: x[1])
                # first item of last tuple is the winner
                lemmatized.append(possibilities[-1][0])
            except KeyError:
                # the word is not in morphdict or its headword has no count data: skip it
                pass
        if lemmatized:
            bagsofwords.append(lemmatized)

    return bagsofwords