Example #1
0
def executesearch(searchid: str, so=None, req=request) -> JSON_STR:
    """
    The interface to all of the other search functions.

    Tell me what you are looking for and I'll try to find it. The results
    are returned in a json bundle that will be used to update the html on
    the page.

    Note that cosdistbysentence vector queries also flow through here: they
    need a hitdict.

    Overview:
        buildsearchobject() and then start modifying elements of the SearchObject

        build a search list via compilesearchlist()
            modify search list via flagexclusions()
            modify search list via calculatewholeauthorsearches()
        build search list restrictions via indexrestrictions()

        search via searchdispatcher()

        format results via buildresultobjects()

    :param searchid: client-supplied id for the progress poll; it is run
        through validatepollid() before use
    :param so: an optional pre-built SearchObject (singlewordsearch() sends one)
    :param req: the flask request (NOTE(review): using 'request' as a default
        argument appears to rely on it being flask's context-local proxy —
        confirm)
    :return: a json string generated from a SearchOutputObject
    """

    pollid = validatepollid(searchid)

    if not so:
        # there is a so if singlewordsearch() sent you here
        probeforsessionvariables()
        so = buildsearchobject(pollid, req, session)

    # snapshot of the session settings this search will run against
    frozensession = so.session

    # register a progress poll so the client can track the search
    progresspolldict[pollid] = ProgressPoll(pollid)
    so.poll = progresspolldict[pollid]

    so.poll.activate()
    so.poll.statusis('Preparing to search')

    # assume an empty search until a non-empty search list is built
    nosearch = True
    output = SearchOutputObject(so)

    allcorpora = [
        'greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus',
        'christiancorpus'
    ]
    activecorpora = [c for c in allcorpora if frozensession[c]]

    # only compile a search list if there is something to seek and at least
    # one active corpus to seek it in
    if (len(so.seeking) > 0 or so.lemma or frozensession['tensorflowgraph']
            or frozensession['topicmodel']) and activecorpora:
        so.poll.statusis('Compiling the list of works to search')
        so.searchlist = compilesearchlist(listmapper, frozensession)

    if so.searchlist:
        # do this before updatesearchlistandsearchobject() which collapses items and cuts your total
        workssearched = len(so.searchlist)

        # calculatewholeauthorsearches() + configurewhereclausedata()
        so = updatesearchlistandsearchobject(so)

        nosearch = False
        skg = None
        prx = None

        # matches any (accented) greek character; used below to decide
        # whether to search the accented db column
        isgreek = re.compile(
            '[α-ωϲἀἁἂἃἄἅἆἇᾀᾁᾂᾃᾄᾅᾆᾇᾲᾳᾴᾶᾷᾰᾱὰάἐἑἒἓἔἕὲέἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗὀὁὂὃὄὅόὸὐὑὒὓὔὕὖὗϋῠῡῢΰῦῧύὺᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇἤἢἥἣὴήἠἡἦἧὠὡὢὣὤὥὦὧᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷώὼ]'
        )

        if so.lemmaone:
            # NOTE(review): the guard checks so.lemmaone but the formlist is
            # read from so.lemma — confirm these name the same lemma object
            so.termone = wordlistintoregex(so.lemma.formlist)
            skg = so.termone
            if re.search(isgreek, skg):
                # 'v' is a problem because the lemmata list is going to send 'u'
                # but the greek lemmata are accented
                so.usecolumn = 'accented_line'

        if so.lemmatwo:
            so.termtwo = wordlistintoregex(so.lemmatwo.formlist)
            prx = so.termtwo
            if re.search(isgreek, prx):
                so.usecolumn = 'accented_line'

        so.setsearchtype()
        thesearch = so.generatesearchdescription()
        htmlsearch = so.generatehtmlsearchdescription()

        # now that the SearchObject is built, do the search...
        hits = precomposedsqlsearch(so)
        so.poll.statusis('Putting the results in context')

        # hits is List[dbWorkLine]
        hitdict = sortresultslist(hits, so, authordict, workdict)

        if so.vectorquerytype == 'cosdistbylineorword':
            # print('executesearch(): h - cosdistbylineorword')
            # take these hits and head on over to the vector worker
            # NOTE(review): this path returns without so.poll.deactivate() —
            # confirm the vector worker shuts the poll down itself
            output = findabsolutevectorsfromhits(so, hitdict, workssearched)
            del progresspolldict[pollid]
            return output

        resultlist = buildresultobjects(hitdict, authordict, workdict, so)

        so.poll.statusis('Converting results to HTML')

        # rewrite the highlight terms so the html flagging below matches
        sandp = rewriteskgandprx(skg, prx, htmlsearch, so)
        skg = sandp['skg']
        prx = sandp['prx']
        htmlsearch = sandp['html']

        for r in resultlist:
            r.lineobjects = flagsearchterms(r, skg, prx, so)

        if so.context > 0:
            findshtml = htmlifysearchfinds(resultlist, so)
        else:
            findshtml = nocontexthtmlifysearchfinds(resultlist)

        if hipparchia.config['INSISTUPONSTANDARDANGLEBRACKETS']:
            findshtml = gtltsubstitutes(findshtml)

        findsjs = insertbrowserclickjs('browser')

        resultcount = len(resultlist)

        # flag whether the hit cap was reached so the page can say so
        if resultcount < so.cap:
            hitmax = False
        else:
            hitmax = True

        output.title = thesearch
        output.found = findshtml
        output.js = findsjs
        output.setresultcount(resultcount, 'passages')
        output.setscope(workssearched)
        output.searchtime = so.getelapsedtime()
        output.thesearch = thesearch
        output.htmlsearch = htmlsearch
        output.hitmax = hitmax

    if nosearch:
        # explain to the user why nothing was searched
        if not activecorpora:
            output.reasons.append('there are no active databases')
        if len(so.seeking) == 0:
            output.reasons.append('there is no search term')
        if len(so.seeking) > 0 and len(so.searchlist) == 0:
            output.reasons.append('zero works match the search criteria')

        output.title = '(empty query)'
        output.setresultcount(0, 'passages')
        output.explainemptysearch()

    so.poll.deactivate()
    jsonoutput = json.dumps(output.generateoutput())

    del progresspolldict[pollid]

    return jsonoutput
Example #2
0
def nearestneighborgenerateoutput(findshtml: str, mostsimilar: list,
                                  imagename: str, workssearched: int,
                                  searchobject: SearchObject) -> str:
    """
    Package the nearest-neighbor vector results as a json bundle.

    :param findshtml: the html presentation of the neighboring terms
    :param mostsimilar: the (term, score) pairs that were graphed
    :param imagename: the name of the generated graph image
    :param workssearched: how many works were searched
    :param searchobject: the governing SearchObject
    :return: a json string generated from a SearchOutputObject
    """

    so = searchobject
    poll = so.poll

    output = SearchOutputObject(so)
    output.image = imagename

    # was this a lemmatized headword search or a plain string search?
    waslemmatized = True
    try:
        headword = so.lemma.dictionaryentry
    except AttributeError:
        # AttributeError: 'NoneType' object has no attribute 'dictionaryentry'
        waslemmatized = False
        headword = so.seeking

    try:
        proximate = so.proximatelemma.dictionaryentry
    except AttributeError:
        # proximatelemma is None
        proximate = None

    if waslemmatized:
        prefix = 'all forms of '
        ht = 'all {n} known forms of <span class="sought">»{skg}«</span>'.format(
            n=len(so.lemma.formlist), skg=headword)
    else:
        prefix = str()
        ht = '<span class="sought">»{skg}«</span>'.format(skg=headword)

    # 'pr' is handed to format() but the template has no matching field;
    # str.format() silently ignores unused keyword arguments
    output.title = 'Neighbors for {es}»{skg}«'.format(skg=headword,
                                                      pr=proximate,
                                                      es=prefix)
    output.found = findshtml
    output.js = generatevectorjs()

    try:
        output.setresultcount(len(mostsimilar), 'proximate terms to graph')
    except TypeError:
        # mostsimilar had no len()
        pass

    output.setscope(workssearched)
    output.thesearch = '{es}»{skg}«'.format(skg=headword, es=prefix)
    output.htmlsearch = ht
    output.sortby = 'proximity'
    output.image = imagename
    output.searchtime = so.getelapsedtime()

    bundle = json.dumps(output.generateoutput())

    poll.deactivate()
    if isinstance(poll, RedisProgressPoll):
        poll.deleteredispoll()

    return bundle
Example #3
0
def generateabsolutevectorsoutput(listsofwords: list, workssearched: list,
                                  searchobject, vtype: str):
    """
    Build a rudimentary vector space from the matching passages and report
    the terms with the smallest cosine distance to the search term.

    :param listsofwords: lists of words drawn from the matching passages
    :param workssearched: the works searched (used for the scope note)
    :param vtype: what kind of unit was searched ('passages', 'sentences', ...)
    :param searchobject: the governing SearchObject
    :return: a json string generated from a SearchOutputObject
    """
    so = searchobject
    vv = so.vectorvalues
    activepoll = so.poll

    # find all words in use
    allwords = findsetofallwords(listsofwords)

    # find all possible forms of all the words we used
    # consider subtracting some set like: rarewordsthatpretendtobecommon = {}
    activepoll.statusis('Finding headwords')
    morphdict = getrequiredmorphobjects(allwords, furtherdeabbreviate=True)
    morphdict = convertmophdicttodict(morphdict)

    # find all possible headwords of all of the forms in use
    # note that we will not know what we did not know: count unparsed words too and deliver that as info at the end?
    allheadwords = dict()
    for m in morphdict.keys():
        for h in morphdict[m]:
            allheadwords[h] = m

    if so.lemma:
        # set to none for now
        subtractterm = None
    else:
        subtractterm = so.seeking

    activepoll.statusis('Building vectors')
    vectorspace = buildrudimentaryvectorspace(allheadwords,
                                              morphdict,
                                              listsofwords,
                                              subtractterm=subtractterm)

    if so.lemma:
        focus = so.lemma.dictionaryentry
    else:
        focus = so.seeking

    activepoll.statusis('Calculating cosine distances')
    cosinevalues = caclulatecosinevalues(focus, vectorspace,
                                         allheadwords.keys())

    # convert distances to similarities, then apply the cutoff and drop the
    # 'None' items; anything under 'falseidentity' is treated as the term
    # matching itself
    threshold = 1.0 - vv.localcutoffdistance
    falseidentity = .02
    cosinevalues = {
        c: 1 - cosinevalues[c]
        for c in cosinevalues
        if cosinevalues[c] and falseidentity < cosinevalues[c] < threshold
    }
    mostsimilar = sorted(cosinevalues.items(), key=lambda t: t[1], reverse=True)

    findshtml = formatnnmatches(mostsimilar, vv)

    # next we look for the interrelationships of the words that are above the threshold
    activepoll.statusis('Calculating metacosine distances')
    imagename = graphbliteraldistancematches(focus, mostsimilar, so)

    findsjs = generatevectorjs()

    output = SearchOutputObject(so)

    output.title = 'Cosine distances to »{skg}«'.format(skg=focus)
    output.found = findshtml
    output.js = findsjs

    if not so.session['cosdistbylineorword']:
        space = 'related terms in {s} {t}'.format(s=len(listsofwords), t=vtype)
    else:
        dist = so.session['proximity']
        scale = {'words': 'word', 'lines': 'line'}
        plural = 's' if int(dist) > 1 else str()
        space = 'related terms within {a} {b}{s}'.format(
            a=dist, b=scale[so.session['searchscope']], s=plural)

    # report the number of neighbors found, bounded above by the cap
    # (was max(vv.neighborscap, len(cosinevalues)), which inflated small
    # result sets to the cap value instead of limiting large ones)
    found = min(vv.neighborscap, len(cosinevalues))
    output.setresultcount(found, space)
    output.setscope(workssearched)

    if so.lemma:
        xtra = 'all forms of '
    else:
        xtra = str()

    output.thesearch = '{x}»{skg}«'.format(x=xtra, skg=focus)
    output.htmlsearch = '{x}<span class="sought">»{skg}«</span>'.format(
        x=xtra, skg=focus)

    output.sortby = 'distance with a cutoff of {c}'.format(
        c=vv.localcutoffdistance)
    output.image = imagename
    output.searchtime = so.getelapsedtime()

    activepoll.deactivate()

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput