def pythonvectors(so: SearchObject) -> JSON_STR:
    """

    this is the matching function to golangvectors()

    [0] test to see what will happen:
        [a] scope problems? [jump away if so...]
        [b] already a model on file? ... [jump down to #5 if so]
    [1] generate a searchlist
    [2] do a searchlistintosqldict()
    [3] acquire and bag the words
        [a] grab db lines that are relevant to the search
        [b] turn them into a unified text block
        [c] do some preliminary cleanups
        [d] break the text into sentences and assemble []SentenceWithLocus (NB: these are "unlemmatized bags of words")
        [e] figure out all of the words used in the passage
        [f] find all of the parsing info relative to these words
        [g] figure out which headwords to associate with the collection of words
        [h] build the lemmatized bags of words ('unlemmatized' can skip [f] and [g]...)
    [4] hand the bags over to Word2Vec(), etc. [*]
    [5] run queries against the model and return the JSON results
    """

    # debugmessage('pythonvectors()')
    assert so.vectorquerytype in [
        'analogies', 'nearestneighborsquery', 'topicmodel'
    ]

    # [0] is this really going to happen?
    so.poll.statusis('Checking for valid search')
    # [i] do we bail out before even getting started?
    # note that checkneedtoabort() hands back an abort message on its own; if it does, we break off here
    abortjson = checkneedtoabort(so)
    if abortjson:
        del so.poll
        return abortjson

    # [ii] do we actually have a model stored already?
    so.poll.statusis('Checking for stored search')
    # calculatewholeauthorsearches() + configurewhereclausedata()
    so = updatesearchlistandsearchobject(so)
    so.setsearchlistthumbprint()
    so.poll.allworkis(-1)  # this turns off the % completed notice in the JS
    so.poll.sethits(0)

    themodel = checkforstoredvector(so)

    if not themodel:
        # [1] generate a searchlist: use executesearch() as the template

        so.usecolumn = 'marked_up_line'
        so.cap = 199999999

        # [2] do a searchlistintosqldict() [this is killing lda...]
        so.searchsqldict = searchlistintosqldict(so, str(), vectors=True)

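        # [3] acquire and bag the words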
        bagsofwords = acquireandbagthewords(so)

        # [4] hand the bags over to Word2Vec(), etc.
        so.poll.statusis('Building the model')
        if so.vectorquerytype in ('nearestneighborsquery', 'analogies'):
            # the same gensim model serves both nearest neighbors and analogies
            themodel = buildgensimmodel(so, bagsofwords)
        elif so.vectorquerytype == 'topicmodel':
            stops = list(mostcommonwordsviaheadwords())
            bagsofsentences = [' '.join(b) for b in bagsofwords]
            bagsofsentences = [
                removestopwords(s, stops) for s in bagsofsentences
            ]
            themodel = buildsklearnselectedworks(so, bagsofsentences)
        else:
            pass
    elif so.iamarobot:
        # there is already a model on file and the bot is attempting to build something that has already been built
        return '<!-- MODEL EXISTS -->'

    # so we have a model one way or the other by now...
    # [5] run queries against the model
    if so.iamarobot:
        return '<!-- MODEL BUILT -->'

    if so.vectorquerytype == 'nearestneighborsquery':
        jsonoutput = generatenearestneighbordata(None, len(so.searchlist), so,
                                                 themodel)
    elif so.vectorquerytype == 'analogies':
        jsonoutput = gensimgenerateanalogies(themodel, so)
    elif so.vectorquerytype == 'topicmodel':
        # def ldatopicsgenerateoutput(ldavishtmlandjs: str, workssearched: int, settings: dict, searchobject: SearchObject):
        jsonoutput = ldatopicsgenerateoutput(themodel, so)
    else:
        jsonoutput = json.dumps(
            'python cannot execute {s} queries'.format(s=so.vectorquerytype))

    return jsonoutput
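
# A minimal, self-contained sketch (not the Hipparchia implementation) of the
# preprocessing that the 'topicmodel' branch above performs before handing the
# material to buildsklearnselectedworks(): each bag of words is joined into a
# sentence string and the most common words are stripped out. The sample data
# and the stripstopwords() helper are illustrative assumptions.

def stripstopwords(sentence: str, stops: set) -> str:
    """drop every word that appears in the stopword set"""
    return ' '.join(w for w in sentence.split() if w not in stops)

samplebags = [['arma', 'virumque', 'cano'], ['troiae', 'qui', 'primus', 'ab', 'oris']]
samplestops = {'qui', 'ab'}

samplesentences = [' '.join(b) for b in samplebags]
samplesentences = [stripstopwords(s, samplestops) for s in samplesentences]
# samplesentences is now ['arma virumque cano', 'troiae primus oris']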
Example #2
def buildfakesearchobject(qtype='nearestneighborsquery') -> SearchObject:
    """

    do what it takes to build a hollow searchobject

    :return:
    """

    frozensession = dict()

    frozensession['vdim'] = hipparchia.config['VECTORDIMENSIONS']
    frozensession['vwindow'] = hipparchia.config['VECTORWINDOW']
    frozensession['viterat'] = hipparchia.config['VECTORTRAININGITERATIONS']
    frozensession['vminpres'] = hipparchia.config['VECTORMINIMALPRESENCE']
    frozensession['vdsamp'] = hipparchia.config['VECTORDOWNSAMPLE']
    frozensession['vcutloc'] = hipparchia.config['VECTORDISTANCECUTOFFLOCAL']
    frozensession['vcutneighb'] = hipparchia.config[
        'VECTORDISTANCECUTOFFNEARESTNEIGHBOR']
    frozensession['vcutlem'] = hipparchia.config[
        'VECTORDISTANCECUTOFFLEMMAPAIR']
    frozensession['vnncap'] = hipparchia.config['NEARESTNEIGHBORSCAP']
    frozensession['vsentperdoc'] = hipparchia.config['SENTENCESPERDOCUMENT']
    frozensession['ldamaxfeatures'] = hipparchia.config['LDAMAXFEATURES']
    frozensession['ldacomponents'] = hipparchia.config['LDACOMPONENTS']
    frozensession['ldamaxfreq'] = hipparchia.config['LDAMAXFREQ']
    frozensession['ldaminfreq'] = hipparchia.config['LDAMINFREQ']
    frozensession['ldaiterations'] = hipparchia.config['LDAITERATIONS']
    frozensession['ldamustbelongerthan'] = hipparchia.config[
        'LDAMUSTBELONGERTHAN']
    frozensession['baggingmethod'] = hipparchia.config['DEFAULTBAGGINGMETHOD']

    blanks = ['searchscope', 'nearornot', 'onehit']
    for b in blanks:
        frozensession[b] = None

    nulls = ['psgselections', 'psgexclusions']
    for n in nulls:
        frozensession[n] = list()

    zeroes = ['proximity', 'maxresults', 'linesofcontext']
    for z in zeroes:
        frozensession[z] = 0

    trueorfalse = [
        'onehit', 'icandodates', 'nearestneighborsquery', 'searchinsidemarkup'
    ]
    for x in trueorfalse:
        frozensession[x] = False

    for x in [
            'agnexclusions', 'agnselections', 'alocexclusions',
            'alocselections', 'analogyfinder', 'auexclusions', 'auselections'
    ]:
        frozensession[x] = list()

    for s in [
            'wkexclusions', 'wkgnexclusions', 'wkgnselections', 'wkselections',
            'wlocexclusions', 'wlocselections'
    ]:
        frozensession[s] = list()

    for p in ['psgexclusions', 'psgselections']:
        frozensession[p] = list()

    for c in [
            'christiancorpus', 'latincorpus', 'greekcorpus',
            'inscriptioncorpus'
    ]:
        frozensession[c] = True

    frozensession['latestdate'] = 1500
    frozensession['earliestdate'] = -850

    so = SearchObject('vectorbot', str(), str(), None, None, frozensession)

    # parsevectorsentences() needs the following:
    so.vectorquerytype = qtype
    so.usecolumn = 'marked_up_line'
    so.sortorder = 'shortname'
    so.iamarobot = True

    return so
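
# A hedged usage sketch combining the two functions above: how a vectorbot-style
# prebuild pass might warm the model cache for a single author. The searchlist
# assignment and the _QuietPoll stub are assumptions for illustration; the real
# bot attaches a proper progress poll and builds its searchlist elsewhere.

class _QuietPoll:
    """stand-in progress poll that silently swallows every status update"""
    def statusis(self, *args, **kwargs):
        pass
    def allworkis(self, *args, **kwargs):
        pass
    def sethits(self, *args, **kwargs):
        pass

def prebuildonemodel(authorid: str) -> str:
    so = buildfakesearchobject(qtype='nearestneighborsquery')
    so.searchlist = [authorid]    # assumed: one whole author per pass
    so.poll = _QuietPoll()
    # because so.iamarobot is True, pythonvectors() short-circuits and returns
    # '<!-- MODEL EXISTS -->' or '<!-- MODEL BUILT -->' instead of JSON
    return pythonvectors(so)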