Esempio n. 1
0
def threejsgraphofvectors(sentencetuples, workssearched, so, vectorspace):
    """
    Build the JSON payload for a three.js 3d graph of a vector space.

    The unused parameters exist so that the shape of this function's inputs
    can match other parallel graphing functions and a dispatcher can call
    any of them interchangeably.

    see https://github.com/tobydoig/3dword2vec

    :param sentencetuples: unused; present for signature parity
    :param workssearched: unused; present for signature parity
    :param so: the governing SearchObject
    :param vectorspace: the vector model to reduce to three dimensions
    :return: a JSON string for the frontend
    """

    output = SearchOutputObject(so)

    graphdata = reducetothreedimensions(so, vectorspace)

    output.htmlsearch = str()
    # fix: dropped a dead 'output.found = str()' that was immediately overwritten
    # NB: 'trheedimensionalhtml' is the (misspelled) name of the project helper
    output.found = trheedimensionalhtml()
    output.js = threedimensionaljs(graphdata)

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput
Esempio n. 2
0
def dispatchvectorsearch(vectortype: str,
                         searchid: str,
                         one=None,
                         two=None,
                         three=None) -> JSON_STR:
    """
    dispatcher for "/vectors/..." requests

    looks up the requested vector search type, checks that its preference
    flag is enabled, builds the appropriate SearchObject, and hands it to
    either the external or the python vectorizer

    :param vectortype: which flavor of vector search to run
    :param searchid: the poll id supplied by the client
    :param one: first search term (optional)
    :param two: second search term (optional)
    :param three: third search term (optional)
    :return: a JSON string
    """

    if not hipparchia.config['SEMANTICVECTORSENABLED']:
        so = SearchObject(str(), str(), str(), str(), str(), session)
        oo = SearchOutputObject(so)
        return oo.generatenulloutput(itemname='searchsummary',
                                     itemval='[semantic vectors have not been enabled]')

    pollid = validatepollid(searchid)
    one = depunct(one)
    two = depunct(two)
    three = depunct(three)

    simple = [pollid, one]
    triple = [pollid, one, two, three]

    knownfunctions = {
        'nearestneighborsquery': {
            'bso': simple,
            'pref': 'CONCEPTMAPPINGENABLED'
        },
        'analogies': {
            'bso': triple,
            'pref': 'VECTORANALOGIESENABLED'
        },
        'topicmodel': {
            'bso': simple,
            'pref': 'TOPICMODELINGENABLED'
        },
        'vectortestfunction': {
            'bso': simple,
            'pref': 'TESTINGVECTORBUTTONENABLED'
        },
        'unused': {
            'fnc': lambda: str(),
            'bso': None,
            'pref': None
        },
    }

    # bail out unless this search type has an enabled preference flag
    thisfunction = knownfunctions[vectortype]
    pref = thisfunction['pref']
    if not pref or not hipparchia.config[pref]:
        return json.dumps('this type of search has not been enabled')

    bso = thisfunction['bso']

    so = None
    if len(bso) == 4:
        so = buildtriplelemmasearchobject(*bso)
    elif len(bso) == 2:
        so = buildsinglelemmasearchobject(*bso)

    so.vectorquerytype = vectortype

    activepoll = ProgressPoll(pollid)
    progresspolldict[pollid] = activepoll
    so.poll = activepoll
    so.poll.activate()
    so.poll.statusis('Preparing to vectorize')

    if hipparchia.config['EXTERNALVECTORHELPER']:
        j = externalvectors(so)
    else:
        j = pythonvectors(so)

    if hipparchia.config['JSONDEBUGMODE']:
        print('/vectors/{f}\n\t{j}'.format(f=vectortype, j=j))

    # the poll may already have been torn down elsewhere
    try:
        del so.poll
    except AttributeError:
        pass

    return j
Esempio n. 3
0
def ldatopicsgenerateoutput(ldavishtmlandjs: str, searchobject: SearchObject):
    """
    Split the combined html+js emitted by pyLDAvis and package it as a
    SearchOutputObject JSON bundle for the frontend.

    pyLDAvis.prepared_data_to_html() outputs something that is almost pure JS and looks like this:

        <link rel="stylesheet" type="text/css" href="https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css">


        <div id="ldavis_el7428760626948328485476648"></div>
        <script type="text/javascript">

        var ldavis_el7428760626948328485476648_data = {"mdsDat": ...

        }
        </script>


    instance = {
        'maxfeatures': 2000,
        'components': 15,  # topics
        'maxfreq': .75,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': 5,  # word must be found >n times
        'iterations': 12,
        'mustbelongerthan': 3
    }

    :param ldavishtmlandjs: the combined html + js string emitted by pyLDAvis
    :param searchobject: the governing SearchObject (supplies poll, vectorvalues, searchlist)
    :return: a JSON string for the frontend
    """

    so = searchobject
    activepoll = so.poll
    output = SearchOutputObject(so)

    workssearched = len(so.searchlist)

    # pull the LDA tuning knobs off the searchobject's vectorvalues
    vv = searchobject.vectorvalues
    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.
        ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    lines = ldavishtmlandjs.split('\n')
    lines = [re.sub(r'\t', str(), l) for l in lines if l]

    # reverse so that pop() walks the document top-to-bottom
    lines.reverse()

    thisline = str()
    html = list()

    # everything before the opening <script> tag is the html portion
    while not re.search(r'<script type="text/javascript">', thisline):
        html.append(thisline)
        try:
            thisline = lines.pop()
        except IndexError:
            # oops, we never found the script...
            thisline = '<script type="text/javascript">'

    # we cut '<script>'; now drop '</script>'
    # what remains in 'lines' is still reversed; restore document order first
    lines.reverse()
    js = lines[:-1]

    findshtml = '\n'.join(html)
    findsjs = '\n'.join(js)

    # serve the stylesheet locally instead of from the CDN
    ldacssurl = r'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css'
    ldacsslocal = '/css/ldavis.css'
    findshtml = re.sub(ldacssurl, ldacsslocal, findshtml)

    # brittle: ldavis might change its URLs between versions, etc.
    # should probably make this conditional upon the presence of the file locally...
    ldajsurl = r'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.js'
    ldajslocal = '/static/jsforldavis.js'
    findsjs = re.sub(ldajsurl, ldajslocal, findsjs)

    # this next will break the reloaded figure: hm...
    # d3jsurl = r'https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min'
    # d3jslocal = '/static/jsd3'
    # findsjs = re.sub(d3jsurl, d3jslocal, findsjs)
    #
    # print('findsjs',findsjs)

    # build the human-readable "where we searched" string
    who = str()
    where = '{n} authors'.format(n=searchobject.numberofauthorssearched())

    if searchobject.numberofauthorssearched() == 1:
        a = authordict[searchobject.searchlist[0][:6]]
        who = a.akaname
        where = who

    if workssearched == 1:
        try:
            w = workdict[searchobject.searchlist[0]]
            w = w.title
        except KeyError:
            w = str()
        where = '{a}, <worktitle>{w}</worktitle>'.format(a=who, w=w)

    output.title = 'Latent Dirichlet Allocation'
    output.found = findshtml
    output.js = findsjs

    output.setscope(workssearched)
    output.sortby = 'weight'
    # NOTE(review): 'thesearch'.format(skg='') has no {skg} placeholder and just
    # yields the literal 'thesearch' — looks like a template string went missing; confirm
    output.thesearch = 'thesearch'.format(skg='')
    output.resultcount = 'the following topics'
    output.htmlsearch = '{n} topics in {w}'.format(n=settings['components'],
                                                   w=where)
    output.searchtime = so.getelapsedtime()
    activepoll.deactivate()

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput
Esempio n. 4
0
def lsiformatoutput(findshtml: str, workssearched: int, matches: list,
                    searchobject: SearchObject) -> str:
    """
    Package LSI search results as a JSON bundle for the frontend.

    should use OutputObject() instead

    :param findshtml: the pre-rendered html for the finds
    :param workssearched: how many works were searched
    :param matches: the sentences that cleared the cutoff
    :param searchobject: the governing SearchObject
    :return: a JSON string
    """

    so = searchobject
    output = SearchOutputObject(so)

    output.found = findshtml
    output.js = insertbrowserclickjs('browser')
    output.setscope(workssearched)

    thetitle = 'Sentences that are reminiscent of »{skg}«'.format(skg=so.seeking)
    output.title = thetitle
    output.thesearch = thetitle
    output.htmlsearch = 'sentences that are reminiscent of <span class="sought">»{skg}«</span>'.format(
        skg=so.seeking)
    output.resultcount = '{n} sentences above the cutoff'.format(
        n=len(matches))
    output.searchtime = so.getelapsedtime()

    so.poll.deactivate()

    return json.dumps(output.generateoutput())
Esempio n. 5
0
def analogiesgenerateoutput(searchobject, findstuples: list) -> JSON_STR:
    """
    Format the results of a vector analogy search ("A : B :: C : ???")
    as an html table inside a JSON bundle.

    findstuples: [(word1, value1), (word2, value2), ...]

    @htmlcommentdecorator here will actually kill the json...

    :param searchobject: the governing SearchObject
    :param findstuples: list of (word, similarity-value) tuples
    :return: a JSON string for the frontend
    """

    so = searchobject
    output = SearchOutputObject(so)

    # lemmatized searches report dictionary headwords; unlemmatized ones report
    # the raw terms the user typed
    if so.session['baggingmethod'] != 'unlemmatized':
        a = so.lemmaone.dictionaryentry
        b = so.lemmatwo.dictionaryentry
        c = so.lemmathree.dictionaryentry
    else:
        a = so.seeking
        b = so.proximate
        c = so.termthree

    # bugfix: the template previously ended with '<table>' (which opens a
    # second table) instead of the closing '</table>'
    tabletemplate = """
	<table class="vectortable outline">
	{thdr}
	{rows}
	</table>
	"""

    thdrtemplate = """
	<tr>
		<th>{a}</th>
		<th>{b}</th>
		<th>{c}</th>
	</tr>
	"""

    meth = searchobject.session['baggingmethod']

    thdr = thdrtemplate.format(a='Bagging method:', b=meth, c=str())

    rowtemplate = """
	<tr>
		<td>{wrd}</td>
		<td></td>
		<td>{val}</td>
	</tr>
	"""

    therows = [rowtemplate.format(wrd=t[0], val=t[1]) for t in findstuples]
    therows = '\n'.join(therows)

    thetable = tabletemplate.format(thdr=thdr, rows=therows)
    output.found = thetable

    activepoll = so.poll
    output.title = '{a} : {b} :: {c} : ???'.format(a=a, b=b, c=c)

    output.searchtime = so.getelapsedtime()
    activepoll.deactivate()
    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput
Esempio n. 6
0
def nearestneighborgenerateoutput(findshtml: str, mostsimilar: list,
                                  imagename: str, workssearched: int,
                                  searchobject: SearchObject) -> str:
    """
    Package nearest-neighbor vector results as a JSON bundle.

    :param findshtml: the pre-rendered html for the finds
    :param mostsimilar: list of (word, similarity) tuples; may be None
    :param imagename: name of the stored graph image to display
    :param workssearched: how many works were searched
    :param searchobject: the governing SearchObject
    :return: a JSON string for the frontend
    """

    vectorsearchwaslemmatized = True

    so = searchobject
    activepoll = so.poll
    output = SearchOutputObject(so)
    output.image = imagename

    findsjs = generatevectorjs()

    try:
        lm = so.lemma.dictionaryentry
    except AttributeError:
        # AttributeError: 'NoneType' object has no attribute 'dictionaryentry'
        # i.e., the search term was a plain word, not a lemma
        vectorsearchwaslemmatized = False
        lm = so.seeking

    # fix: dropped the unused 'pr = so.proximatelemma...' lookup and the
    # 'pr=pr' kwarg below: the title template has no {pr} placeholder

    if vectorsearchwaslemmatized:
        extrastringone = 'all forms of '
        ht = 'all {n} known forms of <span class="sought">»{skg}«</span>'.format(
            n=len(so.lemma.formlist), skg=lm)
    else:
        extrastringone = str()
        ht = '<span class="sought">»{skg}«</span>'.format(skg=lm)

    output.title = 'Neighbors for {es}»{skg}«'.format(skg=lm,
                                                      es=extrastringone)
    output.found = findshtml
    output.js = findsjs

    try:
        output.setresultcount(len(mostsimilar), 'proximate terms to graph')
    except TypeError:
        # mostsimilar was None: nothing to count
        pass

    output.setscope(workssearched)
    output.thesearch = '{es}»{skg}«'.format(skg=lm, es=extrastringone)
    output.htmlsearch = ht
    output.sortby = 'proximity'
    # fix: dropped a redundant second 'output.image = imagename' (set above)
    output.searchtime = so.getelapsedtime()

    jsonoutput = json.dumps(output.generateoutput())
    activepoll.deactivate()

    # redis-backed polls need explicit cleanup
    if isinstance(activepoll, RedisProgressPoll):
        activepoll.deleteredispoll()

    del activepoll
    return jsonoutput
Esempio n. 7
0
def executesearch(searchid: str, so=None, req=request) -> JSON_STR:
    """
    the interface to all of the other search functions

    tell me what you are looking for and i'll try to find it

    the results are returned in a json bundle that will be used to update the html on the page

    note that cosdistbysentence vector queries also flow through here: they need a hitdict

    overview:
        buildsearchobject() and then start modifying elements of the SearchObject

        build a search list via compilesearchlist()
            modify search list via flagexclusions()
            modify search list via calculatewholeauthorsearches()
        build search list restrictions via indexrestrictions()

        search via searchdispatcher()

        format results via buildresultobjects()

    :param searchid: the poll id supplied by the client
    :param so: an optional pre-built SearchObject (supplied by singlewordsearch())
    :param req: the flask request (defaults to the active request)
    :return: a JSON string for the frontend
    """

    pollid = validatepollid(searchid)

    if not so:
        # there is a so if singlewordsearch() sent you here
        probeforsessionvariables()
        so = buildsearchobject(pollid, req, session)

    frozensession = so.session

    progresspolldict[pollid] = ProgressPoll(pollid)
    so.poll = progresspolldict[pollid]

    so.poll.activate()
    so.poll.statusis('Preparing to search')

    nosearch = True
    output = SearchOutputObject(so)

    allcorpora = [
        'greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus',
        'christiancorpus'
    ]
    activecorpora = [c for c in allcorpora if frozensession[c]]

    # only build a searchlist if there is something to look for and somewhere to look
    if (len(so.seeking) > 0 or so.lemma or frozensession['tensorflowgraph']
            or frozensession['topicmodel']) and activecorpora:
        so.poll.statusis('Compiling the list of works to search')
        so.searchlist = compilesearchlist(listmapper, frozensession)

    if so.searchlist:
        # do this before updatesearchlistandsearchobject() which collapses items and cuts your total
        workssearched = len(so.searchlist)

        # calculatewholeauthorsearches() + configurewhereclausedata()
        so = updatesearchlistandsearchobject(so)

        nosearch = False
        skg = None
        prx = None

        # any character in this class marks the term as greek rather than latin
        isgreek = re.compile(
            '[α-ωϲἀἁἂἃἄἅἆἇᾀᾁᾂᾃᾄᾅᾆᾇᾲᾳᾴᾶᾷᾰᾱὰάἐἑἒἓἔἕὲέἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗὀὁὂὃὄὅόὸὐὑὒὓὔὕὖὗϋῠῡῢΰῦῧύὺᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇἤἢἥἣὴήἠἡἦἧὠὡὢὣὤὥὦὧᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷώὼ]'
        )

        # NOTE(review): the guard tests so.lemmaone but the body reads
        # so.lemma.formlist — presumably 'lemma' and 'lemmaone' are kept in
        # sync by the SearchObject; confirm, else this can raise on None
        if so.lemmaone:
            so.termone = wordlistintoregex(so.lemma.formlist)
            skg = so.termone
            if re.search(isgreek, skg):
                # 'v' is a problem because the lemmata list is going to send 'u'
                # but the greek lemmata are accented
                so.usecolumn = 'accented_line'

        if so.lemmatwo:
            so.termtwo = wordlistintoregex(so.lemmatwo.formlist)
            prx = so.termtwo
            if re.search(isgreek, prx):
                so.usecolumn = 'accented_line'

        so.setsearchtype()
        thesearch = so.generatesearchdescription()
        htmlsearch = so.generatehtmlsearchdescription()

        # now that the SearchObject is built, do the search...
        hits = precomposedsqlsearch(so)
        so.poll.statusis('Putting the results in context')

        # hits is List[dbWorkLine]
        hitdict = sortresultslist(hits, so, authordict, workdict)

        if so.vectorquerytype == 'cosdistbylineorword':
            # print('executesearch(): h - cosdistbylineorword')
            # take these hits and head on over to the vector worker
            output = findabsolutevectorsfromhits(so, hitdict, workssearched)
            del progresspolldict[pollid]
            return output

        resultlist = buildresultobjects(hitdict, authordict, workdict, so)

        so.poll.statusis('Converting results to HTML')

        # rewrite the search terms for display before highlighting them
        sandp = rewriteskgandprx(skg, prx, htmlsearch, so)
        skg = sandp['skg']
        prx = sandp['prx']
        htmlsearch = sandp['html']

        for r in resultlist:
            r.lineobjects = flagsearchterms(r, skg, prx, so)

        if so.context > 0:
            findshtml = htmlifysearchfinds(resultlist, so)
        else:
            findshtml = nocontexthtmlifysearchfinds(resultlist)

        if hipparchia.config['INSISTUPONSTANDARDANGLEBRACKETS']:
            findshtml = gtltsubstitutes(findshtml)

        findsjs = insertbrowserclickjs('browser')

        resultcount = len(resultlist)

        # hitting the cap means there may be more results than we are showing
        if resultcount < so.cap:
            hitmax = False
        else:
            hitmax = True

        output.title = thesearch
        output.found = findshtml
        output.js = findsjs
        output.setresultcount(resultcount, 'passages')
        output.setscope(workssearched)
        output.searchtime = so.getelapsedtime()
        output.thesearch = thesearch
        output.htmlsearch = htmlsearch
        output.hitmax = hitmax

    if nosearch:
        # explain to the user why nothing was searched
        if not activecorpora:
            output.reasons.append('there are no active databases')
        if len(so.seeking) == 0:
            output.reasons.append('there is no search term')
        if len(so.seeking) > 0 and len(so.searchlist) == 0:
            output.reasons.append('zero works match the search criteria')

        output.title = '(empty query)'
        output.setresultcount(0, 'passages')
        output.explainemptysearch()

    so.poll.deactivate()
    jsonoutput = json.dumps(output.generateoutput())

    del progresspolldict[pollid]

    return jsonoutput
Esempio n. 8
0
def generateabsolutevectorsoutput(listsofwords: list, workssearched: list,
                                  searchobject, vtype: str):
    """
    Build a rudimentary vector space from the search results, find the
    terms nearest to the focus word by cosine distance, graph them, and
    return the whole thing as a JSON bundle.

    :param listsofwords: the sentences/lines to vectorize
    :param workssearched: how many works were searched
    :param searchobject: the governing SearchObject
    :param vtype: label for the units searched ('sentences', 'lines', ...)
    :return: a JSON string for the frontend
    """
    so = searchobject
    vv = so.vectorvalues
    activepoll = so.poll

    # find all words in use
    allwords = findsetofallwords(listsofwords)
    # print('allwords', allwords)

    # find all possible forms of all the words we used
    # consider subtracting some set like: rarewordsthatpretendtobecommon = {}
    activepoll.statusis('Finding headwords')
    morphdict = getrequiredmorphobjects(allwords, furtherdeabbreviate=True)
    morphdict = convertmophdicttodict(morphdict)

    # find all possible headwords of all of the forms in use
    # note that we will not know what we did not know: count unparsed words too and deliver that as info at the end?
    allheadwords = dict()
    for m in morphdict.keys():
        for h in morphdict[m]:
            allheadwords[h] = m

    if so.lemma:
        # set to none for now
        subtractterm = None
    else:
        subtractterm = so.seeking

    activepoll.statusis('Building vectors')
    vectorspace = buildrudimentaryvectorspace(allheadwords,
                                              morphdict,
                                              listsofwords,
                                              subtractterm=subtractterm)

    # for k in vectorspace.keys():
    # 	print(k, vectorspace[k])

    if so.lemma:
        focus = so.lemma.dictionaryentry
    else:
        focus = so.seeking

    activepoll.statusis('Calculating cosine distances')
    cosinevalues = caclulatecosinevalues(focus, vectorspace,
                                         allheadwords.keys())
    # cosinevalues = vectorcosinedispatching(focus, vectorspace, allheadwords.keys())
    # print('generatevectoroutput cosinevalues', cosinevalues)

    # apply the threshold and drop the 'None' items
    threshold = 1.0 - vv.localcutoffdistance
    falseidentity = .02
    cosinevalues = {
        c: 1 - cosinevalues[c]
        for c in cosinevalues
        if cosinevalues[c] and falseidentity < cosinevalues[c] < threshold
    }
    mostsimilar = [(c, cosinevalues[c]) for c in cosinevalues]
    mostsimilar = sorted(mostsimilar, key=lambda t: t[1], reverse=True)

    findshtml = formatnnmatches(mostsimilar, vv)

    # next we look for the interrelationships of the words that are above the threshold
    activepoll.statusis('Calculating metacosine distances')
    imagename = graphbliteraldistancematches(focus, mostsimilar, so)

    findsjs = generatevectorjs()

    output = SearchOutputObject(so)

    output.title = 'Cosine distances to »{skg}«'.format(skg=focus)
    output.found = findshtml
    output.js = findsjs

    if not so.session['cosdistbylineorword']:
        space = 'related terms in {s} {t}'.format(s=len(listsofwords), t=vtype)
    else:
        dist = so.session['proximity']
        scale = {'words': 'word', 'lines': 'line'}
        if int(dist) > 1:
            plural = 's'
        else:
            plural = str()
        space = 'related terms within {a} {b}{s}'.format(
            a=dist, b=scale[so.session['searchscope']], s=plural)

    # bugfix: this was max(), which reported at least the cap even when far
    # fewer neighbors were actually found; the cap is an upper bound
    found = min(vv.neighborscap, len(cosinevalues))
    output.setresultcount(found, space)
    output.setscope(workssearched)

    if so.lemma:
        xtra = 'all forms of '
    else:
        xtra = str()

    output.thesearch = '{x}»{skg}«'.format(x=xtra, skg=focus)
    output.htmlsearch = '{x}<span class="sought">»{skg}«</span>'.format(
        x=xtra, skg=focus)

    output.sortby = 'distance with a cutoff of {c}'.format(
        c=vv.localcutoffdistance)
    output.image = imagename
    output.searchtime = so.getelapsedtime()

    activepoll.deactivate()

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput
Esempio n. 9
0
def tsnegraphofvectors(sentencetuples, workssearched, so, vectorspace):
    """
    Reduce a vector space to two dimensions, scatter-plot it with a random
    sample of labels, store the image, and return a JSON bundle pointing at it.

    lifted from https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

    unused parameters so that the shape of this function's inputs can match
    other parallel functions

    :param sentencetuples: unused; present for signature parity
    :param workssearched: unused; present for signature parity
    :param so: the governing SearchObject
    :param vectorspace: the vector model to reduce and plot
    :return: a JSON string for the frontend
    """

    plotdict = reducetotwodimensions(vectorspace)
    xvalues = plotdict['xvalues']
    yvalues = plotdict['yvalues']
    labels = plotdict['labels']

    # random.seed(0)

    plt.figure(figsize=(12, 12))
    # https://jonasjacek.github.io/colors/
    plt.scatter(xvalues, yvalues, color='#c6c6c6')

    # Label randomly subsampled 25 data points
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (xvalues[i], yvalues[i]))

    # render into an in-memory buffer rather than a file
    graphobject = BytesIO()
    plt.savefig(graphobject)
    plt.clf()
    plt.close()

    graphobject = graphobject.getvalue()

    imagename = storevectorgraph(graphobject)

    # print('http://localhost:5000/getstoredfigure/{i}'.format(i=imagename))

    output = SearchOutputObject(so)
    output.image = imagename

    findsjs = generatevectorjs()

    htmltemplate = """
	<p id="imagearea"></p>
	"""

    output.htmlsearch = str()
    # fix: dropped a dead 'output.found = str()' that was immediately overwritten
    output.found = htmltemplate
    output.js = findsjs

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput