Example #1
def generatepreliminaryhitlist(
        so: SearchObject,
        recap=hipparchia.config['INTERMEDIATESEARCHCAP']) -> List[dbWorkLine]:
    """

    grab the hits for part one of a two part search

    INTERMEDIATESEARCHCAP caps the number of lines this first pass can return; the cap is interesting...

    you can test via "Sought »α« within 1 lines of »ι«"

    400k or so seems to be the practical worst case: if you search for "α" in all of the databases you will get 392275
    lines back as your intermediate result. You just grabbed a huge % of the total possible collection of lines.

    you can pull this in about 5s, so there is really no reason to worry about the cap if using the grabber

    """

    actualcap = so.cap
    so.cap = recap

    so.poll.statusis('Searching for "{x}"'.format(x=so.termone))
    if so.searchtype == 'phraseandproximity':
        so.poll.statusis('Searching for "{x}"'.format(x=so.phrase))

    if so.lemmaone:
        so.poll.statusis('Searching for all forms of "{x}"'.format(
            x=so.lemmaone.dictionaryentry))

    hitlines = basicprecomposedsqlsearcher(so)
    so.cap = actualcap

    return hitlines
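
The actualcap/so.cap shuffle above is a plain save-and-restore. A minimal sketch of the same pattern as a context manager; this is not from the Hipparchia codebase, and temporarycap is an illustrative name:

from contextlib import contextmanager

@contextmanager
def temporarycap(searchobject, newcap):
    """save .cap, swap in a temporary value, and restore the original on exit"""
    originalcap = searchobject.cap
    searchobject.cap = newcap
    try:
        yield searchobject
    finally:
        searchobject.cap = originalcap

# usage sketch:
#   with temporarycap(so, recap):
#       hitlines = basicprecomposedsqlsearcher(so)
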
Example #2
def checkneedtoabort(so: SearchObject) -> str:
    """

    can/should we even do this?

    """

    if so.iamarobot:
        return str()

    abortjson = str()
    abort = lambda x: emptyvectoroutput(so, x)
    activecorpora = so.getactivecorpora()
    so.poll.statusis('Compiling the list of works to search')
    so.searchlist = compilesearchlist(listmapper, so.session)

    # so.seeking should only be set via a fallback when session['baggingmethod'] == 'unlemmatized'
    if (so.lemmaone or so.tovectorize or so.seeking) and activecorpora:
        pass
    elif not activecorpora:
        abortjson = abort(['no active corpora'])
    elif not so.searchlist:
        abortjson = abort(['empty list of places to look'])
    elif so.vectorquerytype == 'topicmodel':
        # we don't have and don't need a lemmaone, etc.
        pass
    elif so.vectorquerytype == 'analogies':
        if not so.lemmaone or not so.lemmatwo or not so.lemmathree:
            abortjson = abort(['did not have three lemmata'])
    else:
        # note that some vector queries do not need a term; fix this later...
        abortjson = abort(['there was no search term'])

    maxwords = hipparchia.config['MAXVECTORSPACE']
    wordstotal = 0
    for work in so.searchlist:
        work = work[:10]
        try:
            wordstotal += workdict[work].wordcount
        except TypeError:
            # TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'
            pass

    if wordstotal > maxwords:
        m = 'the vector scope max exceeded: {a} > {b} '
        abortjson = abort([
            m.format(a=locale.format_string('%d', wordstotal, grouping=True),
                     b=locale.format_string('%d', maxwords, grouping=True))
        ])

    return abortjson
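
The wordstotal loop above treats a missing wordcount (None) as zero via try/except TypeError. A compact equivalent, assuming workdict maps truncated work ids to objects with a wordcount attribute:

def totalwordcount(searchlist, workdict):
    """sum wordcounts across the searchlist, counting a None wordcount as 0"""
    total = 0
    for work in searchlist:
        entry = workdict.get(work[:10])
        count = getattr(entry, 'wordcount', None)
        total += count or 0
    return total
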
Example #3
def buildtriplelemmasearchobject(pollid, one, two, three) -> SearchObject:
    """

	build a search object w/ three lemmata

	"""

    seeking = str()
    proximate = str()

    if session['baggingmethod'] != 'unlemmatized':
        try:
            termone = lemmatadict[one]
            termtwo = lemmatadict[two]
            termthree = lemmatadict[three]
        except KeyError:
            termone = None
            termtwo = None
            termthree = None

        so = SearchObject(pollid, seeking, proximate, termone, termtwo,
                          session)
        so.lemmathree = termthree
    else:
        so = SearchObject(pollid, one, two, True, True, session)
        so.lemmathree = True
        so.termthree = so.searchtermcleanup(three)

    return so
Example #4
def headwordsearch(searchid, headform) -> JSON_STR:
    """

	you get sent here via the morphology tables

	this is a restricted version of executesearch(): a dictionary headword

	:param searchid:
	:param headform:
	:return:
	"""

    probeforsessionvariables()
    inputlemma = cleaninitialquery(headform)

    try:
        lemma = lemmatadict[inputlemma]
    except KeyError:
        lemma = None

    pollid = validatepollid(searchid)
    seeking = str()
    proximate = str()

    proximatelemma = str()

    so = SearchObject(pollid, seeking, proximate, lemma, proximatelemma,
                      session)

    jsonoutput = executesearch(pollid, so)

    return jsonoutput
Example #5
def basicprecomposedsqlsearcher(so: SearchObject,
                                themanager=None) -> List[dbWorkLine]:
    """

    give me sql and I will search

    this function just picks a pathway: use the golang module or do things in house?

    """

    so.searchsqldict = insertuniqunames(so.searchsqldict)

    if not themanager:
        usesharedlibrary = hipparchia.config['EXTERNALGRABBER']

        if not usesharedlibrary:
            debugmessage('searching via python')
            themanager = precomposedsqlsearchmanager
        else:
            # debugmessage('searching via external helper code')
            themanager = precomposedexternalsearcher

    hits = themanager(so)

    return hits
Example #6
def updatesearchlistandsearchobject(so: SearchObject) -> SearchObject:
    """

	you have a searchlist; now tell the searchobject more about it...

	this has been peeled off so that golangvectors() can call it too

	"""

    # mark works that have passage exclusions associated with them:
    # gr0001x001 instead of gr0001w001 if you are skipping part of w001
    so.searchlist = flagexclusions(so.searchlist, so.session)

    so.poll.statusis('Calculating full authors to search')
    so.searchlist = calculatewholeauthorsearches(so.searchlist, authordict)
    so.usedcorpora = so.wholecorporasearched()
    so.poll.statusis('Configuring the search restrictions')
    so.indexrestrictions = configurewhereclausedata(so.searchlist, workdict,
                                                    so)

    return so
Example #7
def buildsinglelemmasearchobject(pollid: str, one: str) -> SearchObject:
    """

	build a search object w/ one lemma

	"""
    try:
        lemma = lemmatadict[one]
    except KeyError:
        lemma = None

    seeking = str()
    proximate = str()
    proximatelemma = str()
    so = SearchObject(pollid, seeking, proximate, lemma, proximatelemma,
                      session)

    if so.session['baggingmethod'] == 'unlemmatized':
        so.seeking = so.searchtermcleanup(one)

    return so
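
The try/except KeyError lookup recurs in buildsinglelemmasearchobject(), buildtriplelemmasearchobject(), and headwordsearch(); factored out it is just:

def lookuplemma(lemmatadict, entry):
    """return the lemma object for a headword, or None if it is not in the dictionary"""
    # equivalent to lemmatadict.get(entry), but explicit about the miss case
    try:
        return lemmatadict[entry]
    except KeyError:
        return None
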
Example #8
def precomposedexternalsearcher(so: SearchObject) -> List[dbWorkLine]:
    """

    you are using golang to do the search

    [1] send the searchdict to redis as a list of json.dumps(items) (keyed to the searchid)
    [2] send the external fnc the searchid, cap value, worker #, psql login info, redis login info
    [3] wait for the function to (a) gather; (b) search; (c) store
    [4] pull the results back from redis via the searchid
    NB: redis makes sense because the activity poll is going to have to be done via redis anyway...

    the searched items are stored under the redis key 'searchid_results'
    json.loads() will leave you with a dictionary of k/v pairs that can be turned into a dbWorkLine

    """

    warning = 'attempted to search via external helper, but {x} is not available; using precomposedsqlsearchmanager() instead'

    if not gosearch and not haveexternalhelper(getexternalhelperpath()):
        x = 'the external module'
        if not haveexternalhelper(getexternalhelperpath()):
            x = hipparchia.config['EXTERNALBINARYNAME']
        consolewarning(warning.format(x=x), color='red')
        return precomposedsqlsearchmanager(so)

    if not canuseredis:
        consolewarning(warning.format(x='redis'), color='red')
        return precomposedsqlsearchmanager(so)

    rc = establishredisconnection()

    so.searchsqldict = rewritesqlsearchdictforexternalhelper(so)
    # debugmessage('storing search at "{r}"'.format(r=so.searchid))

    for s in so.searchsqldict:
        rc.sadd(so.searchid, json.dumps(so.searchsqldict[s]))

    # if 1 > 0:
    #     consolewarning('precomposedgolangsearcher() merely stored the search in redis and did not execute it')
    #     return list()

    if not hipparchia.config['GRABBERCALLEDVIACLI']:
        resultrediskey = helpersharedlibrarysearcher(so)
    else:
        resultrediskey = helperclibinarysearcher(so)

    redisresults = redisfetch(resultrediskey)

    hits = [redishitintodbworkline(r) for r in redisresults]

    return hits
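
A stripped-down sketch of the redis hand-off that the docstring's steps [1]-[4] describe; it assumes the redis package, a reachable redis instance, and the documented 'searchid_results' key convention, and runhelper is a hypothetical stand-in for launching the external binary:

import json

import redis

def redisroundtrip(searchid, searchsqldict, runhelper):
    """store the queries, let the helper search, then fetch the finds"""
    rc = redis.Redis()
    # [1] store each table's query bundle under the searchid
    for table in searchsqldict:
        rc.sadd(searchid, json.dumps(searchsqldict[table]))
    # [2] + [3] the helper gathers, searches, and stores its finds
    runhelper(searchid)
    # [4] pull the results back: each member is a JSON dict describing one line
    resultkey = '{s}_results'.format(s=searchid)
    return [json.loads(r) for r in rc.smembers(resultkey)]
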
Example #9
def singlewordsearch(searchid, searchterm) -> JSON_STR:
    """

	you get sent here via the morphology tables

	this is a restricted version of executesearch(): single, exact term

	WINDOWS ONLY ERROR: this function will trigger a recursion error

	the situation looks a lot like case #3 @ https://bugs.python.org/issue9592

	but that is supposed to be a closed bug

	cf the complaints at https://forums.fast.ai/t/recursion-error-fastai-v1-0-27-windows-10/30673/10

	"multiprocessing\popen_spawn_win32.py" is the culprit?

	the current 'solution' is to send things to executesearch() instead "if osname == 'nt'"
	this test is inside morphologychartjs(); this is a potential source of future brittleness
	to the extent that one wants to explore refactoring executesearch()

	:param searchid:
	:param searchterm:
	:return:
	"""

    probeforsessionvariables()

    pollid = validatepollid(searchid)
    searchterm = cleaninitialquery(searchterm)
    seeking = ' {s} '.format(s=searchterm)
    proximate = str()
    lemma = None
    proximatelemma = None

    so = SearchObject(pollid, seeking, proximate, lemma, proximatelemma,
                      session)

    jsonoutput = executesearch(pollid, so)

    return jsonoutput
Example #10
def rebuildsearchobjectviasearchorder(so: SearchObject) -> SearchObject:
    """

	rewrite the searchobject so that you look for the less common things first

	"""

    if so.lemmaone and so.lemmatwo:
        hwone = querytotalwordcounts(so.lemmaone.dictionaryentry)
        hwtwo = querytotalwordcounts(so.lemmatwo.dictionaryentry)
        # from server.hipparchiaobjects.wordcountobjects import dbWordCountObject
        # print('{a}: {b}, {c}: {d}'.format(a=so.lemmaone.dictionaryentry, b=hwone.t, c=so.lemmatwo.dictionaryentry, d=hwtwo.t))
        if hwtwo.t < hwone.t:
            tmp = so.lemmaone
            so.lemmaone = so.lemmatwo
            so.lemmatwo = tmp
    elif so.lemma or so.proximatelemma:
        pass
    elif so.accented or re.search(r'^[a-z]', so.termone) and so.near:
        # choose the necessarily faster option
        unmodifiedskg = massagesearchtermsforwhitespace(so.seeking)
        unmodifiedprx = so.proximate
        leastcommon = findleastcommonterm(unmodifiedskg + ' ' + unmodifiedprx,
                                          so.accented)
        if leastcommon != unmodifiedskg:
            tmp = so.termone
            so.termone = so.termtwo
            so.termtwo = tmp
    elif len(so.termtwo) > len(so.termone) and so.near:
        # look for the longest word first since that is probably the quicker route
        # but you can't swap searchingfor and proximate this way in a 'is not near' search without yielding the wrong focus
        tmp = so.termone
        so.termone = so.termtwo
        so.termtwo = tmp

    return so
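
The swap logic above just orders the two lookups rarest-first so the intermediate hit list stays small. The core decision in isolation; countsof is a hypothetical stand-in for querytotalwordcounts():

def ordertermsbyrarity(termone, termtwo, countsof):
    """return (rarer, commoner) so the cheaper search runs first"""
    if countsof(termtwo) < countsof(termone):
        return termtwo, termone
    return termone, termtwo

# e.g., with corpus counts {'a': 392275, 'b': 1200}:
print(ordertermsbyrarity('a', 'b', {'a': 392275, 'b': 1200}.get))  # ('b', 'a')
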
Example #11
def perparesoforsecondsqldict(so: SearchObject, initialhitlines: List[dbWorkLine], usebetweensyntax=True) -> SearchObject:
    """

    after finding initialhitlines sqlwithinxlinessearch() will run a second query

    it needs a new sqldict

    note that "usebetweensyntax=False" will break precomposedphraseandproximitysearch()

    """

    so.indexrestrictions = dict()
    authorsandlines = dict()

    if not usebetweensyntax:
        # consolewarning('sqlwithinxlinessearch(): temptable')
        # time trials...
        # Sought all 13 known forms of »ὕβριϲ« within 4 lines of all 230 known forms of »φεύγω«
        # Searched 7,873 texts and found 9 passages (11.87s)
        # Searched between 400 B.C.E. and 350 B.C.E.

        # Sought all 230 known forms of »φεύγω« within 4 lines of all 16 known forms of »κρίϲιϲ«
        # Searched 7,873 texts and found 12 passages (14.64s)
        # Searched between 400 B.C.E. and 350 B.C.E.

        for hl in initialhitlines:
            linestosearch = list(range(hl.index - so.distance, hl.index + so.distance + 1))
            try:
                authorsandlines[hl.authorid].extend(linestosearch)
            except KeyError:
                authorsandlines[hl.authorid] = linestosearch

        so.searchlist = list(authorsandlines.keys())

        for a in authorsandlines:
            so.indexrestrictions[a] = dict()
            so.indexrestrictions[a]['type'] = 'temptable'
            so.indexrestrictions[a]['where'] = wholeworktemptablecontents(a, set(authorsandlines[a]))
            # print("so.indexrestrictions[a]['where']", so.indexrestrictions[a]['where'])
    else:
        # Sought all 13 known forms of »ὕβριϲ« within 4 lines of all 230 known forms of »φεύγω«
        # Searched 7,873 texts and found 9 passages (9.35s)
        # Searched between 400 B.C.E. and 350 B.C.E.

        # Sought all 230 known forms of »φεύγω« within 4 lines of all 16 known forms of »κρίϲιϲ«
        # Searched 7,873 texts and found 12 passages (11.35s)
        # Searched between 400 B.C.E. and 350 B.C.E.

        # consolewarning('sqlwithinxlinessearch(): between')
        for hl in initialhitlines:
            boundaries = (hl.index - so.distance, hl.index + so.distance)
            try:
                authorsandlines[hl.authorid].append(boundaries)
            except KeyError:
                authorsandlines[hl.authorid] = [boundaries]
        so.searchlist = list(authorsandlines.keys())

        for a in authorsandlines:
            so.indexrestrictions[a] = dict()
            so.indexrestrictions[a]['where'] = dict()
            so.indexrestrictions[a]['type'] = 'between'
            so.indexrestrictions[a]['where']['listofboundaries'] = authorsandlines[a]
            so.indexrestrictions[a]['where']['listofomissions'] = list()

    return so
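
For reference, the 'between' branch above would leave an indexrestrictions entry shaped like this for a single hit at index 100 with so.distance == 2 (the author id is illustrative):

indexrestrictions = {
    'gr0001': {
        'type': 'between',
        'where': {
            'listofboundaries': [(98, 102)],  # (hl.index - distance, hl.index + distance)
            'listofomissions': [],
        },
    },
}
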
Example #12
def dispatchvectorsearch(vectortype: str,
                         searchid: str,
                         one=None,
                         two=None,
                         three=None) -> JSON_STR:
    """

	dispatcher for "/vectors/..." requests

	"""

    if not hipparchia.config['SEMANTICVECTORSENABLED']:
        so = SearchObject(str(), str(), str(), str(), str(), session)
        oo = SearchOutputObject(so)
        target = 'searchsummary'
        message = '[semantic vectors have not been enabled]'
        return oo.generatenulloutput(itemname=target, itemval=message)

    pollid = validatepollid(searchid)
    one = depunct(one)
    two = depunct(two)
    three = depunct(three)

    simple = [pollid, one]
    triple = [pollid, one, two, three]

    knownfunctions = {
        'nearestneighborsquery': {
            'bso': simple,
            'pref': 'CONCEPTMAPPINGENABLED'
        },
        'analogies': {
            'bso': triple,
            'pref': 'VECTORANALOGIESENABLED'
        },
        'topicmodel': {
            'bso': simple,
            'pref': 'TOPICMODELINGENABLED'
        },
        'vectortestfunction': {
            'bso': simple,
            'pref': 'TESTINGVECTORBUTTONENABLED'
        },
        'unused': {
            'fnc': lambda: str(),
            'bso': None,
            'pref': None
        },
    }

    if not knownfunctions[vectortype]['pref'] or not hipparchia.config[
            knownfunctions[vectortype]['pref']]:
        return json.dumps('this type of search has not been enabled')

    bso = knownfunctions[vectortype]['bso']

    so = None

    if len(bso) == 4:
        so = buildtriplelemmasearchobject(*bso)
    elif len(bso) == 2:
        so = buildsinglelemmasearchobject(*bso)

    so.vectorquerytype = vectortype

    progresspolldict[pollid] = ProgressPoll(pollid)
    so.poll = progresspolldict[pollid]
    so.poll.activate()
    so.poll.statusis('Preparing to vectorize')

    if hipparchia.config['EXTERNALVECTORHELPER']:
        j = externalvectors(so)
    else:
        j = pythonvectors(so)

    if hipparchia.config['JSONDEBUGMODE']:
        print('/vectors/{f}\n\t{j}'.format(f=vectortype, j=j))

    try:
        del so.poll
    except AttributeError:
        pass

    return j
Example #13
def precomposedsqlwithinxlinessearch(so: SearchObject) -> List[dbWorkLine]:
    """

    after finding x, look for y within n lines of x

    people who send phrases to both halves and/or a lot of regex will not always get what they want

    note that this implementation is significantly slower than the standard withinxlines() + simplewithinxlines()

    """

    initialhitlines = generatepreliminaryhitlist(so)

    # we are going to need a new searchsqldict w/ a new temptable
    # sq = { table1: {query: q, data: d, temptable: t},
    #         table2: {query: q, data: d, temptable: t}, ...

    # this means refeeding searchlistintosqldict() and priming it for a 'temptable' search
    # the temptable follows the paradigm of wholeworktemptablecontents()
    # r {'type': 'temptable', 'where': {'tempquery': '\n\tCREATE TEMPORARY TABLE in0f08_includelist AS \n\t\tSELECT values \n\t\t\tAS includeindex FROM unnest(ARRAY[768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,763,764,765,766,767]) values\n\t'}}

    so = perparesoforsecondsqldict(so, initialhitlines)

    so.searchsqldict = searchlistintosqldict(so, so.termtwo)
    if so.lemmatwo:
        so.lemmaone = so.lemmatwo
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    m = 'Now searching among the {n} initial finds for {l}"{x}"'
    so.poll.statusis(m.format(n=len(initialhitlines), x=so.termtwo, l=str()))
    if so.lemmaone:
        so.poll.statusis(
            m.format(n=len(initialhitlines),
                     x=so.lemmaone.dictionaryentry,
                     l="all forms of "))

    so.poll.sethits(0)
    newhitlines = basicprecomposedsqlsearcher(so)

    # newhitlines will contain, e.g., in0001w0ig_493 and in0001w0ig_492, i.e., 2 lines that are part of the same 'hit'
    # so we can't use newhitlines directly but have to check it against the initial hits
    # that's fine since "not near" would push us in this direction in any case

    initialhitlinedict = {hl.uniqueid: hl for hl in initialhitlines}
    newhitlineids = set()
    for nhl in newhitlines:
        indices = list(
            range(nhl.index - so.distance, nhl.index + so.distance + 1))
        ids = ['{a}_{b}'.format(a=nhl.wkuinversalid, b=i) for i in indices]
        newhitlineids.update(ids)

    finalhitlines = list()
    if so.near:
        # "is near"
        finalhitlines = [
            initialhitlinedict[hl] for hl in initialhitlinedict
            if hl in newhitlineids
        ]
    elif not so.near:
        # "is not near"
        finalhitlines = [
            initialhitlinedict[hl] for hl in initialhitlinedict
            if hl not in newhitlineids
        ]

    return finalhitlines
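
The post-processing above reduces to one idea: expand every second-pass hit into a window of 'workid_index' tokens, then keep (for "is near") or drop (for "is not near") the initial hits whose uniqueid lands in a window. A self-contained sketch with (workid, index) pairs standing in for dbWorkLine objects:

def filterbyproximity(initialhits, newhits, distance, near):
    """initialhits/newhits are (workid, index) pairs; returns the surviving initial hits"""
    windowids = set()
    for workid, index in newhits:
        for i in range(index - distance, index + distance + 1):
            windowids.add('{a}_{b}'.format(a=workid, b=i))
    return [(w, i) for (w, i) in initialhits
            if (('{a}_{b}'.format(a=w, b=i) in windowids) == near)]

hits = filterbyproximity([('in0001', 490), ('in0001', 600)], [('in0001', 492)], 2, True)
print(hits)  # [('in0001', 490)]: 490 is within 2 lines of 492; 600 is not
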
Example #14
def buildsearchobject(searchid: str, therequest: request,
                      thesession: session) -> SearchObject:
    """

	generic searchobject builder

	:param searchid:
	:param therequest:
	:param thesession:
	:return:
	"""

    whitespace = ' '

    if not searchid:
        searchid = str(int(time.time()))

    probeforsessionvariables()

    # a search can take 30s or more, and the user might alter the session while the search is running
    # by toggling onehit, etc.; that can be a problem, so freeze the values now and rely on them
    # instead of some moving target
    frozensession = thesession.copy()

    # need to sanitize input at least a bit: remove digits and punctuation
    # dispatcher will do searchtermcharactersubstitutions() and massagesearchtermsforwhitespace() to take
    # care of lunate sigma, etc.

    seeking = cleaninitialquery(therequest.args.get('skg', ''))
    proximate = cleaninitialquery(therequest.args.get('prx', ''))
    inputlemma = cleaninitialquery(therequest.args.get('lem', ''))
    inputproximatelemma = cleaninitialquery(therequest.args.get('plm', ''))

    try:
        lemma = lemmatadict[inputlemma]
    except KeyError:
        lemma = None

    # print('lo forms', lemma.formlist)

    try:
        proximatelemma = lemmatadict[inputproximatelemma]
    except KeyError:
        proximatelemma = None

    replacebeta = False

    if hipparchia.config['UNIVERSALASSUMESBETACODE'] and re.search(
            '[a-zA-Z]', seeking):
        # why the 'and' condition:
        #   sending unicode 'οὐθενὸϲ' to the betacode function will result in 0 hits
        #   this is something that could/should be debugged within that function,
        #   but in practice it is silly to allow hybrid betacode/unicode; allowing it only
        #   makes life more difficult for a person who wants unicode+regex w/ a betacode option
        replacebeta = True

    if hipparchia.config['TLGASSUMESBETACODE']:
        if justtlg() and (re.search('[a-zA-Z]', seeking) or re.search(
                '[a-zA-Z]', proximate)) and not re.search(
                    minimumgreek, seeking) and not re.search(
                        minimumgreek, proximate):
            replacebeta = True

    if replacebeta:
        seeking = seeking.upper()
        seeking = replacegreekbetacode(seeking)
        seeking = seeking.lower()
        proximate = proximate.upper()
        proximate = replacegreekbetacode(proximate)
        proximate = proximate.lower()

    if seeking == whitespace:
        seeking = str()

    if proximate == whitespace:
        proximate = str()

    so = SearchObject(searchid, seeking, proximate, lemma, proximatelemma,
                      frozensession)

    return so
Example #15
def precomposedsqlphrasesearch(so: SearchObject) -> List[dbWorkLine]:
    """

    you are searching for a relatively rare word: we will keep things simple-ish

    note that the second half of this is not MP, but searches already take only ~6s, so clean code probably wins here

    FIXME:

    can't find the phrases in here...:

        κατεϲκεύαϲεν τὸ ἐνϲόριον FAILS
        ϲεν τὸ ἐνϲόριον το SUCCEEDS

    ch0005w001/2749

    1 Ῥουφεῖνα Ἰουδαία ἀρχι-
    2 ϲυνάγωγοϲ κατεϲκεύα-
    3 ϲεν τὸ ἐνϲόριον τοῖϲ ἀπε-     ( match: ἀπελευθέροιϲ )
    4 λευθέροιϲ καὶ θρέμ(μ)αϲιν
    5 μηδενὸϲ ἄλ(λ)ου ἐξουϲίαν ἔ-

    actually, this is a BUILDER problem AND a SERVER problem:

    BUILDER:

    2749 does not have κατεϲκεύαϲεν in it

    hipparchiaDB=# select index, accented_line, hyphenated_words  from ch0005 where index between 2746 and 2752;
     index |           accented_line           | hyphenated_words
    -------+-----------------------------------+------------------
      2748 | ῥουφεῖνα ἰουδαία ἀρχιϲυνάγωγοϲ    | ἀρχιϲυνάγωγοϲ
      2749 | κατεϲκεύα-                        |
      2750 | ϲεν τὸ ἐνϲόριον τοῖϲ ἀπελευθέροιϲ | ἀπελευθέροιϲ
      2751 | καὶ θρέμμαϲιν                     |
      2752 | μηδενὸϲ ἄλλου ἐξουϲίαν ἔχοντοϲ    | ἔχοντοϲ
    (5 rows)

    SERVER: ἀπελευθέροιϲ καὶ θρέμμαϲιν is missed by precomposedsqlphrasesearch()
    but it is found by precomposedsqlsubqueryphrasesearch()

    maybe it is time to nuke precomposedsqlphrasesearch() after all...

    NB: the dynamic workonphrasesearch() CAN find 'ἀπελευθέροιϲ καὶ θρέμμαϲιν'

    """
    debugmessage('executing a precomposedsqlphrasesearch()')

    so.termone = so.leastcommon
    searchphrase = so.phrase
    phraselen = len(searchphrase.split(' '))

    initialhitlines = generatepreliminaryhitlist(so)

    m = 'Now searching among the {h} initial hits for the full phrase "{p}"'
    so.poll.statusis(m.format(h=so.poll.gethits(), p=so.originalseeking))
    so.poll.sethits(0)

    fullmatches = list()

    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()
    commitcount = 0
    while initialhitlines and len(fullmatches) <= so.cap:
        commitcount += 1
        if commitcount == hipparchia.config['MPCOMMITCOUNT']:
            dbconnection.commit()
            commitcount = 0

        hit = initialhitlines.pop()

        wordset = lookoutsideoftheline(hit.index, phraselen - 1, hit.authorid, so, dbcursor)

        if not so.accented:
            wordset = re.sub(r'[.?!;:,·’]', str(), wordset)
        else:
            # the difference is in the apostrophe: δ vs δ’
            wordset = re.sub(r'[.?!;:,·]', str(), wordset)

        if so.near and re.search(searchphrase, wordset):
            fullmatches.append(hit)
            so.poll.addhits(1)
        elif not so.near and re.search(searchphrase, wordset) is None:
            fullmatches.append(hit)
            so.poll.addhits(1)

    dbconnection.connectioncleanup()

    return fullmatches
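
The FIXME above boils down to hyphenation: 'κατεϲκεύαϲεν' never occurs on a single stored line because the builder kept it split as 'κατεϲκεύα-' / 'ϲεν ...'. A tiny demonstration using the rows quoted in the docstring:

import re

lines = [
    'ῥουφεῖνα ἰουδαία ἀρχιϲυνάγωγοϲ',
    'κατεϲκεύα-',
    'ϲεν τὸ ἐνϲόριον τοῖϲ ἀπελευθέροιϲ',
]
window = ' '.join(lines)

print(bool(re.search('κατεϲκεύαϲεν τὸ ἐνϲόριον', window)))  # False: the hyphen splits the word
print(bool(re.search('ϲεν τὸ ἐνϲόριον', window)))           # True: no hyphen in the way
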
Example #16
def ldatopicsgenerateoutput(ldavishtmlandjs: str, searchobject: SearchObject):
    """

	pyLDAvis.prepared_data_to_html() outputs something that is almost pure JS and looks like this:

		<link rel="stylesheet" type="text/css" href="https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css">


		<div id="ldavis_el7428760626948328485476648"></div>
		<script type="text/javascript">

		var ldavis_el7428760626948328485476648_data = {"mdsDat": ...

		}
		</script>


	instance = {
		'maxfeatures': 2000,
		'components': 15,  # topics
		'maxfreq': .75,  # fewer than n% of sentences should have this word (i.e., purge common words)
		'minfreq': 5,  # word must be found >n times
		'iterations': 12,
		'mustbelongerthan': 3
	}

	:param ldavishtmlandjs:
	:param workssearched:
	:param settings:
	:param searchobject:
	:return:
	"""

    so = searchobject
    activepoll = so.poll
    output = SearchOutputObject(so)

    workssearched = len(so.searchlist)

    vv = searchobject.vectorvalues
    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    lines = ldavishtmlandjs.split('\n')
    lines = [re.sub(r'\t', str(), l) for l in lines if l]

    lines.reverse()

    thisline = str()
    html = list()

    while not re.search(r'<script type="text/javascript">', thisline):
        html.append(thisline)
        try:
            thisline = lines.pop()
        except IndexError:
            # oops, we never found the script...
            thisline = '<script type="text/javascript">'

    # we cut '<script>'; now drop '</script>'
    lines.reverse()
    js = lines[:-1]

    findshtml = '\n'.join(html)
    findsjs = '\n'.join(js)

    ldacssurl = r'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css'
    ldacsslocal = '/css/ldavis.css'
    findshtml = re.sub(ldacssurl, ldacsslocal, findshtml)

    # brittle: ldavis might change its URLs between versions, etc.
    # should probably make this conditional upon the presence of the file locally...
    ldajsurl = r'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.js'
    ldajslocal = '/static/jsforldavis.js'
    findsjs = re.sub(ldajsurl, ldajslocal, findsjs)

    # this next will break the reloaded figure: hm...
    # d3jsurl = r'https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min'
    # d3jslocal = '/static/jsd3'
    # findsjs = re.sub(d3jsurl, d3jslocal, findsjs)
    #
    # print('findsjs',findsjs)

    who = str()
    where = '{n} authors'.format(n=searchobject.numberofauthorssearched())

    if searchobject.numberofauthorssearched() == 1:
        a = authordict[searchobject.searchlist[0][:6]]
        who = a.akaname
        where = who

    if workssearched == 1:
        try:
            w = workdict[searchobject.searchlist[0]]
            w = w.title
        except KeyError:
            w = str()
        where = '{a}, <worktitle>{w}</worktitle>'.format(a=who, w=w)

    output.title = 'Latent Dirichlet Allocation'
    output.found = findshtml
    output.js = findsjs

    output.setscope(workssearched)
    output.sortby = 'weight'
    output.thesearch = 'thesearch'.format(skg='')
    output.resultcount = 'the following topics'
    output.htmlsearch = '{n} topics in {w}'.format(n=settings['components'],
                                                   w=where)
    output.searchtime = so.getelapsedtime()
    activepoll.deactivate()

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput
Example #17
def pythonvectors(so: SearchObject) -> JSON_STR:
    """

    this is the matching function to golangvectors()

    [0] test to see what will happen:
        [a] scope problems? [jump away if so...]
        [b] already a model on file? ... [jump down to #5 if so]
    [1] generate a searchlist
    [2] do a searchlistintosqldict()
    [3] acquire and bag the words
        [a] grab db lines that are relevant to the search
        [b] turn them into a unified text block
        [c] do some preliminary cleanups
        [d] break the text into sentences and assemble []SentenceWithLocus (NB: these are "unlemmatized bags of words")
        [e] figure out all of the words used in the passage
        [f] find all of the parsing info relative to these words
        [g] figure out which headwords to associate with the collection of words
        [h] build the lemmatized bags of words ('unlemmatized' can skip [f] and [g]...)
    [4] hand the bags over to Word2Vec(), etc. [*]
    [5] run queries against the model and return the JSON results
    """

    # debugmessage('pythonvectors()')
    assert so.vectorquerytype in [
        'analogies', 'nearestneighborsquery', 'topicmodel'
    ]

    # [0] is this really going to happen?
    so.poll.statusis('Checking for valid search')
    # [i] do we bail out before even getting started?
    # note that this can / will return independently and break here
    abortjson = checkneedtoabort(so)
    if abortjson:
        del so.poll
        return abortjson

    # [ii] do we actually have a model stored already?
    so.poll.statusis('Checking for stored search')
    # calculatewholeauthorsearches() + configurewhereclausedata()
    so = updatesearchlistandsearchobject(so)
    so.setsearchlistthumbprint()
    so.poll.allworkis(-1)  # this turns off the % completed notice in the JS
    so.poll.sethits(0)

    themodel = checkforstoredvector(so)

    if not themodel:
        # [1] generate a searchlist: use executesearch() as the template

        so.usecolumn = 'marked_up_line'
        so.cap = 199999999

        # [2] do a searchlistintosqldict() [this is killing lda...]
        so.searchsqldict = searchlistintosqldict(so, str(), vectors=True)

        # [3] acquire and bag the words
        bagsofwords = acquireandbagthewords(so)

        # [4] hand the bags over to Word2Vec(), etc.
        so.poll.statusis('Building the model')
        if so.vectorquerytype == 'nearestneighborsquery':
            themodel = buildgensimmodel(so, bagsofwords)
        elif so.vectorquerytype == 'analogies':
            # the same gensim model can serve both analogies and neighbors
            themodel = buildgensimmodel(so, bagsofwords)
        elif so.vectorquerytype == 'topicmodel':
            stops = list(mostcommonwordsviaheadwords())
            bagsofsentences = [' '.join(b) for b in bagsofwords]
            bagsofsentences = [
                removestopwords(s, stops) for s in bagsofsentences
            ]
            themodel = buildsklearnselectedworks(so, bagsofsentences)
        else:
            pass
    elif so.iamarobot:
        # there is a model, and the bot is attempting to build something that has already been built
        return '<!-- MODEL EXISTS -->'

    # so we have a model one way or the other by now...
    # [5] run queries against the model
    if so.iamarobot:
        return '<!-- MODEL BUILT -->'

    if so.vectorquerytype == 'nearestneighborsquery':
        jsonoutput = generatenearestneighbordata(None, len(so.searchlist), so,
                                                 themodel)
    elif so.vectorquerytype == 'analogies':
        jsonoutput = gensimgenerateanalogies(themodel, so)
    elif so.vectorquerytype == 'topicmodel':
        # def ldatopicsgenerateoutput(ldavishtmlandjs: str, workssearched: int, settings: dict, searchobject: SearchObject):
        jsonoutput = ldatopicsgenerateoutput(themodel, so)
    else:
        jsonoutput = json.dumps(
            'golang cannot execute {s} queries'.format(s=so.vectorquerytype))

    return jsonoutput
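
For the topicmodel branch, each bag is joined into one sentence string and stopwords are stripped before sklearn sees it. A minimal stand-in for removestopwords() (the real signature may differ):

def removestopwordssketch(sentence, stops):
    """drop any word that appears in the stops collection"""
    return ' '.join(w for w in sentence.split() if w not in stops)

bagsofwords = [['λόγοϲ', 'καί', 'ἔργον'], ['καί', 'θεόϲ']]
bagsofsentences = [' '.join(b) for b in bagsofwords]
print([removestopwordssketch(s, {'καί'}) for s in bagsofsentences])
# ['λόγοϲ ἔργον', 'θεόϲ']
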
Example #18
def precomposedsqlsubqueryphrasesearch(so: SearchObject) -> List[dbWorkLine]:
    """

    use subquery syntax to grab multi-line windows of text for phrase searching

    line-end and line-beginning issues can be overcome this way, but then you have plenty of
    bookkeeping to do to get the proper results focussed on the right line

    these searches take linear time: same basic time for any given scope regardless of the query

    """

    # rebuild the searchsqldict but this time pass through rewritequerystringforsubqueryphrasesearching()
    so.searchsqldict = searchlistintosqldict(so,
                                             so.phrase,
                                             subqueryphrasesearch=True)

    # debugmessage('precomposedsqlsubqueryphrasesearch() so.searchsqldict: {d}'.format(d=so.searchsqldict))

    # the windowed collection of lines; you will need to work to find the centers
    # windowing will increase the number of hits: 2+ lines per actual find
    initialhitlines = generatepreliminaryhitlist(so, recap=so.cap * 3)

    m = 'Generating final list of hits by searching among the {h} preliminary hits'
    so.poll.statusis(m.format(h=so.poll.gethits()))
    so.poll.sethits(0)

    sp = re.sub(r'^\s', r'(^|\\s)', so.phrase)
    sp = re.sub(r'\s$', r'(\\s|$)', sp)

    combinations = QueryCombinator(so.phrase)
    # the last item is the full phrase and it will have already been searched:  ('one two three four five', '')
    combinations = combinations.combinations()
    combinations.pop()

    listoffinds = list()

    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()

    setofhits = set()

    while initialhitlines:
        # windows of indices come back: e.g., three lines that look like they match when only one matches [3131, 3132, 3133]
        # figure out which line is really the line with the goods
        # it is not nearly so simple as picking the 2nd element in any run of 3: there are not always runs of 3, and
        # matches in subsequent lines mean that you really should check your work carefully; this is not an especially costly
        # operation relative to the whole search and esp. relative to the speed gains of using a subquery search
        lineobject = initialhitlines.pop()
        if not so.onehit or lineobject.authorid not in setofhits:
            if re.search(sp, getattr(lineobject, so.usewordlist)):
                listoffinds.append(lineobject)
                so.poll.addhits(1)
                setofhits.add(lineobject.authorid)
            else:
                try:
                    nextline = initialhitlines[0]
                except IndexError:
                    nextline = makeablankline('gr0000w000', -1)

                if lineobject.wkuinversalid != nextline.wkuinversalid or lineobject.index != (
                        nextline.index - 1):
                    # you grabbed the next line on the pile (e.g., index = 9999), not the actual next line (e.g., index = 101)
                    # usually you won't get a hit by grabbing the next db line, but sometimes you do...
                    query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(
                        wtmpl=worklinetemplate, tb=lineobject.authorid)
                    data = (lineobject.index + 1, )
                    dbcursor.execute(query, data)
                    try:
                        nextline = dblineintolineobject(dbcursor.fetchone())
                    except Exception:
                        # fetchone() probably came up empty: there was no next line to grab
                        nextline = makeablankline('gr0000w000', -1)

                for c in combinations:
                    tail = c[0] + '$'
                    head = '^' + c[1]

                    t = False
                    h = False
                    try:
                        t = re.search(tail, getattr(lineobject,
                                                    so.usewordlist))
                    except re.error:
                        pass
                    try:
                        h = re.search(head, getattr(nextline, so.usewordlist))
                    except re.error:
                        pass

                    if t and h:
                        listoffinds.append(lineobject)
                        so.poll.addhits(1)
                        setofhits.add(lineobject.authorid)

    dbconnection.connectioncleanup()
    return listoffinds
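
The combination loop above asks one question: does some (tail, head) split of the phrase end this line and begin the next? Written out by hand with QueryCombinator-style splits; the sample lines come from the Caesar passage quoted in the next example:

import re

def spanslinebreak(phrase, thisline, nextline):
    """True if some split of the phrase ends thisline and begins nextline"""
    words = phrase.split()
    # 'one two three' -> [('one', 'two three'), ('one two', 'three')]
    splits = [(' '.join(words[:i]), ' '.join(words[i:])) for i in range(1, len(words))]
    for tail, head in splits:
        if re.search(tail + '$', thisline) and re.search('^' + head, nextline):
            return True
    return False

print(spanslinebreak('non solum', 'ut non', 'solum in pristinum statum redissent'))  # True
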
Example #19
def precomposedphraseandproximitysearch(so: SearchObject) -> List[dbWorkLine]:
    """

    do a precomposedsqlsubqueryphrasesearch() and then search inside the results for part two...

    corner case tester: two line-enders: non solum + temporum dignitatem

    [12]   Caesar, De Bello Gallico: book 7, chapter 54, section 4, line 2

    7.54.3.3 multatos agris, omnibus ereptis sociis, imposito stipendio,
    7.54.4.1 obsidibus summa cum contumelia extortis, et quam in
    7.54.4.2 fortunam quamque in amplitudinem deduxisset, ut non
    7.54.4.3 solum in pristinum statum redissent, sed omnium tem-
    7.54.4.4 porum dignitatem et gratiam antecessisse viderentur.


    corner case tester: two distant line-enders: temporum dignitatem + obsides Galliae

    ut non
    solum in pristinum statum redissent, sed omnium tem- 	7.54.4.3
    porum dignitatem et gratiam antecessisse viderentur.
    his datis mandatis eos ab se dimisit.
          Noviodunum erat oppidum Haeduorum ad ripas 	7.55.1.1
    Ligeris opportuno loco positum. huc Caesar omnes ob- 	7.55.2.1
    sides Galliae, frumentum, pecuniam publicam, suorum

    the old code will trick you by pretending it is doing a valid search even though it is not really set up
    to handle this situation and was not supposed to promise that it could do phrase+
    [it's the phrase-spanning-two-lines bit that yields the problem since you do "lemma+" but have no handler for
    the multi-line issue]

    0.0.0-1.8.1

    Sought all 19 known forms of »χώρα« within 1 lines of »μεγάλην δύναμιν«
    Searched 3,182 works and found 1 passage (0.77s)
    Searched between 850 B.C.E. and 300 B.C.E.
    Sorted by name
    [1]   Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 5, line 47

    3c,688,F.5.45 τόπουϲ. (3) γενόμενον δ’ ἀποϲτάτην καὶ πείϲαντα τὸ ϲύμπαν ἔθνοϲ ἀντέχεϲθαι
    3c,688,F.5.46 τῆϲ ἐλευθερίαϲ, αἱρεθῆναι ϲτρατηγὸν διὰ τὴν ἀνδρείαν. ἔπειτα πυνθανόμενον
    3c,688,F.5.47 ἀθροιζομένην ἐπ’ αὐτὸν μεγάλην δύναμιν, καθοπλίϲαι τοὺϲ Καδουϲίουϲ παν-
    3c,688,F.5.48 δημεί, καὶ καταϲτρατοπεδεῦϲαι πρὸϲ ταῖϲ εἰϲ τὴν χώραν εἰϲβολαῖϲ, ἔχοντα
    3c,688,F.5.49 τοὺϲ ϲύμπανταϲ οὐκ ἐλάττουϲ εἴκοϲι μυριάδων. (4) τοῦ δὲ βαϲιλέωϲ Ἀρταίου

    1.8.2+

    Sought all 19 known forms of »χώρα« within 1 lines of »μεγάλην δύναμιν«
    Searched 2,346 works and found 2 passages (2.2s)
    Searched between 850 B.C.E. and 300 B.C.E.
    Sorted by name
    [1]   Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 5, line 47

    3c,688,F.5.45 τόπουϲ. (3) γενόμενον δ’ ἀποϲτάτην καὶ πείϲαντα τὸ ϲύμπαν ἔθνοϲ ἀντέχεϲθαι
    3c,688,F.5.46 τῆϲ ἐλευθερίαϲ, αἱρεθῆναι ϲτρατηγὸν διὰ τὴν ἀνδρείαν. ἔπειτα πυνθανόμενον
    3c,688,F.5.47 ἀθροιζομένην ἐπ’ αὐτὸν μεγάλην δύναμιν, καθοπλίϲαι τοὺϲ Καδουϲίουϲ παν-
    3c,688,F.5.48 δημεί, καὶ καταϲτρατοπεδεῦϲαι πρὸϲ ταῖϲ εἰϲ τὴν χώραν εἰϲβολαῖϲ, ἔχοντα
    3c,688,F.5.49 τοὺϲ ϲύμπανταϲ οὐκ ἐλάττουϲ εἴκοϲι μυριάδων. (4) τοῦ δὲ βαϲιλέωϲ Ἀρταίου
    [2]   Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 14, line 54

    3c,688,F.14.52    (40) καὶ ἐλυπήθη λύπην ϲφοδρὰν Μεγάβυζοϲ, καὶ ἐπένθηϲε, καὶ ἠιτήϲατο
    3c,688,F.14.53 ἐπὶ Ϲυρίαν τὴν ἑαυτοῦ χώραν ἀπιέναι. ἐνταῦθα λάθραι καὶ τοὺϲ ἄλλουϲ τῶν
    3c,688,F.14.54 Ἑλλήνων προέπεμπε. καὶ ἀπήιει, καὶ ἀπέϲτη βαϲιλέωϲ, καὶ ἀθροίζει μεγάλην
    3c,688,F.14.55 δύναμιν ἄχρι πεντεκαίδεκα μυριάδων χωρὶϲ τῶν ἱππέων [καὶ τῶν πεζῶν].
    3c,688,F.14.56 καὶ πέμπεται Οὔϲιριϲ κατ’ αὐτοῦ ϲὺν ⟨κ⟩ μυριάϲι, καὶ ϲυνάπτεται πόλεμοϲ, καὶ

    """

    #
    # initially do "within x lines"
    #

    phrasefinder = re.compile(r'[^\s]\s[^\s]')
    if re.search(phrasefinder, so.seeking) and re.search(
            phrasefinder, so.proximate):
        secondsearch = precomposedsqlsubqueryphrasesearch
    elif not re.search(phrasefinder, so.seeking) and re.search(
            phrasefinder, so.proximate):
        so.swapseekingandproxmate()
        so.swaplemmaoneandtwo()
        secondsearch = basicprecomposedsqlsearcher
    else:
        secondsearch = basicprecomposedsqlsearcher

    c = so.cap
    ps = so.proximate
    so.proximate = str()
    pl = so.lemmatwo
    so.lemmatwo = str()
    so.phrase = so.seeking
    firstterm = so.phrase

    so.cap = hipparchia.config['INTERMEDIATESEARCHCAP']

    initialhitlines = precomposedsqlsubqueryphrasesearch(so)

    so.seeking = ps
    so.lemmaone = pl
    so.setsearchtype()
    so.cap = c

    if secondsearch == precomposedsqlsubqueryphrasesearch:
        so.phrase = ps
    else:
        so.phrase = str()

    so = perparesoforsecondsqldict(so, initialhitlines)
    so.searchsqldict = searchlistintosqldict(so, so.seeking)

    if so.lemmaone:
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    so.poll.sethits(0)

    newhitlines = secondsearch(so)

    initialhitlinedict = {hl.uniqueid: hl for hl in initialhitlines}
    newhitlineids = set()

    for nhl in newhitlines:
        indices = list(
            range(nhl.index - so.distance, nhl.index + so.distance + 1))
        ids = ['{a}_{b}'.format(a=nhl.wkuinversalid, b=i) for i in indices]
        newhitlineids.update(ids)

    maybefinalhitlines = list()
    if so.near:
        # "is near"
        maybefinalhitlines = [
            initialhitlinedict[hl] for hl in initialhitlinedict
            if hl in newhitlineids
        ]
    elif not so.near:
        # "is not near"
        maybefinalhitlines = [
            initialhitlinedict[hl] for hl in initialhitlinedict
            if hl not in newhitlineids
        ]

    #
    # if necessary, do "within x words": its hits will always be a subset of the "within x lines" set
    #

    if so.lemmaone:
        secondterm = wordlistintoregex(so.lemmaone.formlist)
    else:
        secondterm = so.seeking

    if so.scope == 'words':
        finalhitlines = paredowntowithinxwords(so, firstterm, secondterm,
                                               maybefinalhitlines)
    else:
        finalhitlines = maybefinalhitlines

    # to humor rewriteskgandprx()
    # but that formatting doesn't 100% work yet...

    so.termone = firstterm
    so.termtwo = secondterm
    so.lemmatwo = so.lemmaone

    return finalhitlines
Example #20
def precomposedsqlsearch(so: SearchObject) -> List[dbWorkLine]:
    """

    flow control for searching governed by so.searchtype

    speed notes: the speed of these searches is consonant with that of the old search code; usu. <1s difference

    sqlphrasesearch() was eliminated in order to keep the code base more streamlined

    """

    assert so.searchtype in [
        'simple', 'simplelemma', 'proximity', 'phrase', 'phraseandproximity'
    ], 'unknown searchtype sent to rawsqlsearches()'

    so.poll.statusis('Executing a {t} search...'.format(t=so.searchtype))

    so.searchsqldict = searchlistintosqldict(so, so.termone)
    if so.lemmaone:
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    searchfnc = lambda x: list()

    if so.searchtype in ['simple', 'simplelemma']:
        searchfnc = basicprecomposedsqlsearcher
    elif so.searchtype == 'proximity':
        # search for the least common terms first: swap termone and termtwo if need be
        so = rebuildsearchobjectviasearchorder(so)
        if so.scope == 'lines':
            # this will hit rawsqlsearchmanager() 2x
            searchfnc = precomposedsqlwithinxlinessearch
        else:
            searchfnc = precomposedsqlwithinxwords
    elif so.searchtype == 'phrase':
        so.phrase = so.termone
        # so.leastcommon = findleastcommonterm(so.termone, so.accented)
        searchfnc = precomposedsqlsubqueryphrasesearch
    elif so.searchtype == 'phraseandproximity':
        so.phrase = so.termone
        searchfnc = precomposedphraseandproximitysearch
    else:
        # should be hard to reach this because of "assert" above
        consolewarning(
            'rawsqlsearches() does not support {t} searching'.format(
                t=so.searchtype),
            color='red')

    so.searchsqldict = searchlistintosqldict(so, so.termone)
    if so.lemmaone:
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    hitlist = searchfnc(so)

    if so.onehit:
        # you might still have two hits from the same author; purge the doubles
        # use unique keys property of a dict() to do it
        uniqueauthors = {h.authorid: h for h in hitlist}
        hitlist = [uniqueauthors[a] for a in uniqueauthors]

    hitlist = hitlist[:so.cap]

    return hitlist
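
The onehit purge above relies on dict keys being unique: {h.authorid: h for h in hitlist} keeps the last hit per author. A setdefault variant keeps the first instead, which matters when the list is already sorted; this is an alternative sketch, not the codebase's method:

def onehitperauthor(hitlist):
    """keep only the first hit per author; hits are (authorid, index) pairs here"""
    seen = dict()
    for hit in hitlist:
        seen.setdefault(hit[0], hit)
    return list(seen.values())

print(onehitperauthor([('gr0001', 5), ('gr0001', 9), ('gr0002', 3)]))
# [('gr0001', 5), ('gr0002', 3)]
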
Example #21
def buildfakesearchobject(qtype='nearestneighborsquery') -> SearchObject:
    """

	do what it takes to build a hollow searchobject

	:return:
	"""

    frozensession = dict()

    frozensession['vdim'] = hipparchia.config['VECTORDIMENSIONS']
    frozensession['vwindow'] = hipparchia.config['VECTORWINDOW']
    frozensession['viterat'] = hipparchia.config['VECTORTRAININGITERATIONS']
    frozensession['vminpres'] = hipparchia.config['VECTORMINIMALPRESENCE']
    frozensession['vdsamp'] = hipparchia.config['VECTORDOWNSAMPLE']
    frozensession['vcutloc'] = hipparchia.config['VECTORDISTANCECUTOFFLOCAL']
    frozensession['vcutneighb'] = hipparchia.config[
        'VECTORDISTANCECUTOFFNEARESTNEIGHBOR']
    frozensession['vcutlem'] = hipparchia.config[
        'VECTORDISTANCECUTOFFLEMMAPAIR']
    frozensession['vnncap'] = hipparchia.config['NEARESTNEIGHBORSCAP']
    frozensession['vsentperdoc'] = hipparchia.config['SENTENCESPERDOCUMENT']
    frozensession['ldamaxfeatures'] = hipparchia.config['LDAMAXFEATURES']
    frozensession['ldacomponents'] = hipparchia.config['LDACOMPONENTS']
    frozensession['ldamaxfreq'] = hipparchia.config['LDAMAXFREQ']
    frozensession['ldaminfreq'] = hipparchia.config['LDAMINFREQ']
    frozensession['ldaiterations'] = hipparchia.config['LDAITERATIONS']
    frozensession['ldamustbelongerthan'] = hipparchia.config[
        'LDAMUSTBELONGERTHAN']
    frozensession['baggingmethod'] = hipparchia.config['DEFAULTBAGGINGMETHOD']

    blanks = ['searchscope', 'nearornot', 'onehit']
    for b in blanks:
        frozensession[b] = None

    nulls = ['psgselections', 'psgexclusions']
    for n in nulls:
        frozensession[n] = list()

    zeroes = ['proximity', 'maxresults', 'linesofcontext']
    for z in zeroes:
        frozensession[z] = 0

    trueorfalse = [
        'onehit', 'icandodates', 'nearestneighborsquery', 'searchinsidemarkup'
    ]
    for x in trueorfalse:
        frozensession[x] = False

    for x in [
            'agnexclusions', 'agnselections', 'alocexclusions',
            'alocselections', 'analogyfinder', 'auexclusions', 'auselections'
    ]:
        frozensession[x] = list()

    for s in [
            'wkexclusions', 'wkgnexclusions', 'wkgnselections', 'wkselections',
            'wlocexclusions', 'wlocselections'
    ]:
        frozensession[s] = list()

    for p in ['psgexclusions', 'psgselections']:
        frozensession[p] = list()

    for c in [
            'christiancorpus', 'latincorpus', 'greekcorpus',
            'inscriptioncorpus'
    ]:
        frozensession[c] = True

    frozensession['latestdate'] = 1500
    frozensession['earliestdate'] = -850

    so = SearchObject('vectorbot', str(), str(), None, None, frozensession)

    # parsevectorsentences() needs the following:
    so.vectorquerytype = qtype
    so.usecolumn = 'marked_up_line'
    so.sortorder = 'shortname'
    so.iamarobot = True

    return so