Beispiel #1
0
def lsiformatoutput(findshtml: str, workssearched: int, matches: list,
                    searchobject: SearchObject) -> str:
    """

	package the results of an LSI sentence-similarity search as a JSON bundle

	should use OutputObject() instead

	:param findshtml: pre-rendered HTML for the matching sentences
	:param workssearched: number of works that were scanned
	:param matches: the sentences that cleared the similarity cutoff
	:param searchobject: the governing SearchObject
	:return: a JSON string built from the output object
	"""

    poll = searchobject.poll
    out = SearchOutputObject(searchobject)

    # titles first: 'thesearch' deliberately mirrors the title string
    title = 'Sentences that are reminiscent of »{skg}«'.format(skg=searchobject.seeking)
    out.title = title
    out.thesearch = title
    out.htmlsearch = 'sentences that are reminiscent of <span class="sought">»{skg}«</span>'.format(
        skg=searchobject.seeking)
    out.resultcount = '{n} sentences above the cutoff'.format(n=len(matches))

    # then the payload and the bookkeeping
    out.found = findshtml
    out.js = insertbrowserclickjs('browser')
    out.setscope(workssearched)
    out.searchtime = searchobject.getelapsedtime()

    poll.deactivate()

    return json.dumps(out.generateoutput())
Beispiel #2
0
def ldatopicsgenerateoutput(ldavishtmlandjs: str, searchobject: SearchObject) -> str:
    """

	split pyLDAvis' combined HTML+JS dump into separate HTML and JS payloads,
	point its CDN asset URLs at local copies, and package everything as the
	standard JSON search-results bundle

	pyLDAvis.prepared_data_to_html() outputs something that is almost pure JS and looks like this:

		<link rel="stylesheet" type="text/css" href="https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css">


		<div id="ldavis_el7428760626948328485476648"></div>
		<script type="text/javascript">

		var ldavis_el7428760626948328485476648_data = {"mdsDat": ...

		}
		</script>

	the settings dict assembled below has this shape:

	instance = {
		'maxfeatures': 2000,
		'components': 15,  # topics
		'maxfreq': .75,  # fewer than n% of sentences should have this word (i.e., purge common words)
		'minfreq': 5,  # word must be found >n times
		'iterations': 12,
		'mustbelongerthan': 3
	}

	:param ldavishtmlandjs: the raw output of pyLDAvis.prepared_data_to_html()
	:param searchobject: the governing SearchObject
	:return: a JSON string built from the output object
	"""

    so = searchobject
    activepoll = so.poll
    output = SearchOutputObject(so)

    workssearched = len(so.searchlist)

    # pull the LDA tuning knobs off the searchobject's vector settings;
    # only 'components' is consumed below (for the result description)
    vv = searchobject.vectorvalues
    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.
        ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    # normalize the dump: drop tabs and blank lines
    lines = ldavishtmlandjs.split('\n')
    lines = [re.sub(r'\t', str(), l) for l in lines if l]

    # reverse so that pop() consumes lines from the original front
    lines.reverse()

    thisline = str()
    html = list()

    # everything up to the opening <script> tag is HTML; note that the
    # empty seed value of 'thisline' is appended first, so findshtml
    # starts with one blank line
    while not re.search(r'<script type="text/javascript">', thisline):
        html.append(thisline)
        try:
            thisline = lines.pop()
        except IndexError:
            # oops, we never found the script...
            thisline = '<script type="text/javascript">'

    # we cut '<script>'; now drop '</script>'
    # (what remains in 'lines' is the JS body, still in reversed order)
    lines.reverse()
    js = lines[:-1]

    findshtml = '\n'.join(html)
    findsjs = '\n'.join(js)

    # swap the CDN stylesheet for the locally served copy
    ldacssurl = r'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css'
    ldacsslocal = '/css/ldavis.css'
    findshtml = re.sub(ldacssurl, ldacsslocal, findshtml)

    # brittle: ldavis might change its URLs between versions, etc.
    # should probably make this conditional upon the presence of the file locally...
    ldajsurl = r'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.js'
    ldajslocal = '/static/jsforldavis.js'
    findsjs = re.sub(ldajsurl, ldajslocal, findsjs)

    # this next will break the reloaded figure: hm...
    # d3jsurl = r'https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min'
    # d3jslocal = '/static/jsd3'
    # findsjs = re.sub(d3jsurl, d3jslocal, findsjs)
    #
    # print('findsjs',findsjs)

    # build the "who/where" attribution: default is an author count,
    # narrowed to a single author (and then a single work) when possible
    who = str()
    where = '{n} authors'.format(n=searchobject.numberofauthorssearched())

    if searchobject.numberofauthorssearched() == 1:
        # first 6 chars of a searchlist entry are the author id — TODO confirm
        a = authordict[searchobject.searchlist[0][:6]]
        who = a.akaname
        where = who

    if workssearched == 1:
        try:
            w = workdict[searchobject.searchlist[0]]
            w = w.title
        except KeyError:
            w = str()
        where = '{a}, <worktitle>{w}</worktitle>'.format(a=who, w=w)

    output.title = 'Latent Dirichlet Allocation'
    output.found = findshtml
    output.js = findsjs

    output.setscope(workssearched)
    output.sortby = 'weight'
    # NOTE(review): '.format(skg='')' is a no-op here — the literal
    # 'thesearch' has no placeholder; this looks like a leftover. confirm intent
    output.thesearch = 'thesearch'.format(skg='')
    output.resultcount = 'the following topics'
    output.htmlsearch = '{n} topics in {w}'.format(n=settings['components'],
                                                   w=where)
    output.searchtime = so.getelapsedtime()
    activepoll.deactivate()

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput
Beispiel #3
0
def executesearch(searchid: str, so=None, req=request) -> JSON_STR:
    """

	the interface to all of the other search functions

	tell me what you are looking for and i'll try to find it

	the results are returned in a json bundle that will be used to update the html on the page

	note that cosdistbysentence vector queries also flow through here: they need a hitdict

	overview:
		buildsearchobject() and then start modifying elements of the SearchObject

		build a search list via compilesearchlist()
			modify search list via flagexclusions()
			modify search list via calculatewholeauthorsearches()
		build search list restrictions via indexrestrictions()

		search via searchdispatcher()

		format results via buildresultobjects()

	:param searchid: the client-supplied id for this search's progress poll
	:param so: an optional pre-built SearchObject (singlewordsearch() passes one)
	:param req: the flask request (defaulted so callers can inject a fake)
	:return: a JSON string built from the output object
	"""

    # presumably sanitizes/validates the client-supplied id — TODO confirm
    pollid = validatepollid(searchid)

    if not so:
        # there is a so if singlewordsearch() sent you here
        probeforsessionvariables()
        so = buildsearchobject(pollid, req, session)

    # snapshot of the session settings used for the rest of this search
    frozensession = so.session

    # register a progress poll so the client can watch the search advance
    progresspolldict[pollid] = ProgressPoll(pollid)
    so.poll = progresspolldict[pollid]

    so.poll.activate()
    so.poll.statusis('Preparing to search')

    # 'nosearch' stays True unless a non-empty searchlist gets built below
    nosearch = True
    output = SearchOutputObject(so)

    allcorpora = [
        'greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus',
        'christiancorpus'
    ]
    activecorpora = [c for c in allcorpora if frozensession[c]]

    # only compile a searchlist if there is something to look for
    # (a term, a lemma, or a vector-graph request) AND somewhere to look
    if (len(so.seeking) > 0 or so.lemma or frozensession['tensorflowgraph']
            or frozensession['topicmodel']) and activecorpora:
        so.poll.statusis('Compiling the list of works to search')
        so.searchlist = compilesearchlist(listmapper, frozensession)

    if so.searchlist:
        # do this before updatesearchlistandsearchobject() which collapses items and cuts your total
        workssearched = len(so.searchlist)

        # calculatewholeauthorsearches() + configurewhereclausedata()
        so = updatesearchlistandsearchobject(so)

        nosearch = False
        skg = None
        prx = None

        # one character class covering the polytonic greek range: used to
        # decide whether to search the accented text column
        isgreek = re.compile(
            '[α-ωϲἀἁἂἃἄἅἆἇᾀᾁᾂᾃᾄᾅᾆᾇᾲᾳᾴᾶᾷᾰᾱὰάἐἑἒἓἔἕὲέἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗὀὁὂὃὄὅόὸὐὑὒὓὔὕὖὗϋῠῡῢΰῦῧύὺᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇἤἢἥἣὴήἠἡἦἧὠὡὢὣὤὥὦὧᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷώὼ]'
        )

        # NOTE(review): the guard checks so.lemmaone but the body reads
        # so.lemma.formlist (cf. the symmetric lemmatwo branch below) —
        # confirm the two attributes always coincide here
        if so.lemmaone:
            so.termone = wordlistintoregex(so.lemma.formlist)
            skg = so.termone
            if re.search(isgreek, skg):
                # 'v' is a problem because the lemmata list is going to send 'u'
                # but the greek lemmata are accented
                so.usecolumn = 'accented_line'

        if so.lemmatwo:
            so.termtwo = wordlistintoregex(so.lemmatwo.formlist)
            prx = so.termtwo
            if re.search(isgreek, prx):
                so.usecolumn = 'accented_line'

        so.setsearchtype()
        thesearch = so.generatesearchdescription()
        htmlsearch = so.generatehtmlsearchdescription()

        # now that the SearchObject is built, do the search...
        hits = precomposedsqlsearch(so)
        so.poll.statusis('Putting the results in context')

        # hits is List[dbWorkLine]
        hitdict = sortresultslist(hits, so, authordict, workdict)

        if so.vectorquerytype == 'cosdistbylineorword':
            # print('executesearch(): h - cosdistbylineorword')
            # take these hits and head on over to the vector worker
            # (that worker produces its own finished JSON bundle)
            output = findabsolutevectorsfromhits(so, hitdict, workssearched)
            del progresspolldict[pollid]
            return output

        resultlist = buildresultobjects(hitdict, authordict, workdict, so)

        so.poll.statusis('Converting results to HTML')

        # rewrite the search terms so they can be highlighted in the html
        sandp = rewriteskgandprx(skg, prx, htmlsearch, so)
        skg = sandp['skg']
        prx = sandp['prx']
        htmlsearch = sandp['html']

        for r in resultlist:
            r.lineobjects = flagsearchterms(r, skg, prx, so)

        # with-context and without-context results get different formatting
        if so.context > 0:
            findshtml = htmlifysearchfinds(resultlist, so)
        else:
            findshtml = nocontexthtmlifysearchfinds(resultlist)

        if hipparchia.config['INSISTUPONSTANDARDANGLEBRACKETS']:
            findshtml = gtltsubstitutes(findshtml)

        findsjs = insertbrowserclickjs('browser')

        resultcount = len(resultlist)

        # flag whether the result set was truncated at the search cap
        if resultcount < so.cap:
            hitmax = False
        else:
            hitmax = True

        output.title = thesearch
        output.found = findshtml
        output.js = findsjs
        output.setresultcount(resultcount, 'passages')
        output.setscope(workssearched)
        output.searchtime = so.getelapsedtime()
        output.thesearch = thesearch
        output.htmlsearch = htmlsearch
        output.hitmax = hitmax

    if nosearch:
        # explain to the user why nothing was searched
        if not activecorpora:
            output.reasons.append('there are no active databases')
        if len(so.seeking) == 0:
            output.reasons.append('there is no search term')
        if len(so.seeking) > 0 and len(so.searchlist) == 0:
            output.reasons.append('zero works match the search criteria')

        output.title = '(empty query)'
        output.setresultcount(0, 'passages')
        output.explainemptysearch()

    so.poll.deactivate()
    jsonoutput = json.dumps(output.generateoutput())

    # the poll is finished: drop it from the registry
    del progresspolldict[pollid]

    return jsonoutput
Beispiel #4
0
def nearestneighborgenerateoutput(findshtml: str, mostsimilar: list,
                                  imagename: str, workssearched: int,
                                  searchobject: SearchObject) -> str:
    """

	package the results of a nearest-neighbor vector search as a JSON bundle

	fixes relative to the previous version: the duplicate
	'output.image = imagename' assignment, the dead 'pr' lookup whose only
	use was an ignored .format() keyword, and the pointless 'del activepoll'
	have all been removed; behavior is otherwise unchanged

	:param findshtml: pre-rendered HTML for the neighbor list
	:param mostsimilar: (word, similarity) pairs above the cutoff; may be None
	:param imagename: filename of the generated neighbor graph
	:param workssearched: number of works that were scanned
	:param searchobject: the governing SearchObject
	:return: a JSON string built from the output object
	"""

    vectorsearchwaslemmatized = True

    so = searchobject
    activepoll = so.poll
    output = SearchOutputObject(so)
    output.image = imagename

    findsjs = generatevectorjs()

    try:
        lm = so.lemma.dictionaryentry
    except AttributeError:
        # so.lemma is None: the search ran on the raw seeking string
        vectorsearchwaslemmatized = False
        lm = so.seeking

    if vectorsearchwaslemmatized:
        extrastringone = 'all forms of '
        ht = 'all {n} known forms of <span class="sought">»{skg}«</span>'.format(
            n=len(so.lemma.formlist), skg=lm)
    else:
        extrastringone = str()
        ht = '<span class="sought">»{skg}«</span>'.format(skg=lm)

    output.title = 'Neighbors for {es}»{skg}«'.format(skg=lm,
                                                      es=extrastringone)
    output.found = findshtml
    output.js = findsjs

    try:
        output.setresultcount(len(mostsimilar), 'proximate terms to graph')
    except TypeError:
        # mostsimilar was None: keep the output object's default count
        pass

    output.setscope(workssearched)
    output.thesearch = '{es}»{skg}«'.format(skg=lm, es=extrastringone)
    output.htmlsearch = ht
    output.sortby = 'proximity'
    output.searchtime = so.getelapsedtime()

    jsonoutput = json.dumps(output.generateoutput())
    activepoll.deactivate()

    if isinstance(activepoll, RedisProgressPoll):
        # redis-backed polls hold server-side state that must be purged
        activepoll.deleteredispoll()

    return jsonoutput
Beispiel #5
0
def generateabsolutevectorsoutput(listsofwords: list, workssearched: list,
                                  searchobject, vtype: str):
    """

	build a rudimentary vector space from the passages searched, rank every
	headword by cosine distance from the focus term, graph the neighborhood,
	and package the results as the standard JSON search-results bundle

	:param listsofwords: the passages to vectorize, as lists of words
	:param workssearched: scope indicator passed to output.setscope()
		(NOTE(review): annotated as list, but sibling functions pass an int
		count here — confirm the annotation)
	:param searchobject: the governing SearchObject
	:param vtype: label for the unit searched (e.g. 'sentences'); used in
		the result-count string
	:return: a JSON string built from the output object
	"""
    so = searchobject
    vv = so.vectorvalues
    activepoll = so.poll

    # find all words in use
    allwords = findsetofallwords(listsofwords)
    # print('allwords', allwords)

    # find all possible forms of all the words we used
    # consider subtracting some set like: rarewordsthatpretendtobecommon = {}
    activepoll.statusis('Finding headwords')
    morphdict = getrequiredmorphobjects(allwords, furtherdeabbreviate=True)
    morphdict = convertmophdicttodict(morphdict)

    # find all possible headwords of all of the forms in use
    # note that we will not know what we did not know: count unparsed words too and deliver that as info at the end?
    allheadwords = dict()
    for m in morphdict.keys():
        for h in morphdict[m]:
            allheadwords[h] = m

    # when searching by lemma, nothing is subtracted from the space;
    # otherwise the literal search term itself is removed
    if so.lemma:
        # set to none for now
        subtractterm = None
    else:
        subtractterm = so.seeking

    activepoll.statusis('Building vectors')
    vectorspace = buildrudimentaryvectorspace(allheadwords,
                                              morphdict,
                                              listsofwords,
                                              subtractterm=subtractterm)

    # for k in vectorspace.keys():
    # 	print(k, vectorspace[k])

    # the focus word the distances are measured against
    if so.lemma:
        focus = so.lemma.dictionaryentry
    else:
        focus = so.seeking

    activepoll.statusis('Calculating cosine distances')
    cosinevalues = caclulatecosinevalues(focus, vectorspace,
                                         allheadwords.keys())
    # cosinevalues = vectorcosinedispatching(focus, vectorspace, allheadwords.keys())
    # print('generatevectoroutput cosinevalues', cosinevalues)

    # apply the threshold and drop the 'None' items
    # values are flipped (1 - d) so that bigger means more similar;
    # 'falseidentity' filters out items so close they are effectively
    # the focus word itself
    threshold = 1.0 - vv.localcutoffdistance
    falseidentity = .02
    cosinevalues = {
        c: 1 - cosinevalues[c]
        for c in cosinevalues
        if cosinevalues[c] and falseidentity < cosinevalues[c] < threshold
    }
    mostsimilar = [(c, cosinevalues[c]) for c in cosinevalues]
    mostsimilar = sorted(mostsimilar, key=lambda t: t[1], reverse=True)

    findshtml = formatnnmatches(mostsimilar, vv)

    # next we look for the interrelationships of the words that are above the threshold
    activepoll.statusis('Calculating metacosine distances')
    imagename = graphbliteraldistancematches(focus, mostsimilar, so)

    findsjs = generatevectorjs()

    output = SearchOutputObject(so)

    output.title = 'Cosine distances to »{skg}«'.format(skg=focus)
    output.found = findshtml
    output.js = findsjs

    # describe the scope of the comparison for the result-count string
    if not so.session['cosdistbylineorword']:
        space = 'related terms in {s} {t}'.format(s=len(listsofwords), t=vtype)
    else:
        dist = so.session['proximity']
        scale = {'words': 'word', 'lines': 'line'}
        if int(dist) > 1:
            plural = 's'
        else:
            plural = str()
        space = 'related terms within {a} {b}{s}'.format(
            a=dist, b=scale[so.session['searchscope']], s=plural)

    # NOTE(review): max() reports at least 'neighborscap' even when fewer
    # values survived the threshold; a cap is usually min() — confirm intent
    found = max(vv.neighborscap, len(cosinevalues))
    output.setresultcount(found, space)
    output.setscope(workssearched)

    if so.lemma:
        xtra = 'all forms of '
    else:
        xtra = str()

    output.thesearch = '{x}»{skg}«'.format(x=xtra, skg=focus)
    output.htmlsearch = '{x}<span class="sought">»{skg}«</span>'.format(
        x=xtra, skg=focus)

    output.sortby = 'distance with a cutoff of {c}'.format(
        c=vv.localcutoffdistance)
    output.image = imagename
    output.searchtime = so.getelapsedtime()

    activepoll.deactivate()

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput