Exemple #1
0
def basicprecomposedsqlsearcher(so: SearchObject,
                                themanager=None) -> List[dbWorkLine]:
    """

    give me sql and I will search

    this function just picks a pathway: use the golang module or do things in house?

    """

    so.searchsqldict = insertuniqunames(so.searchsqldict)

    if not themanager:
        usesharedlibrary = hipparchia.config['EXTERNALGRABBER']

        if not usesharedlibrary:
            debugmessage('searching via python')
            themanager = precomposedsqlsearchmanager
        else:
            # debugmessage('searching via external helper code')
            themanager = precomposedexternalsearcher

    hits = themanager(so)

    return hits
def precomposedexternalsearcher(so: SearchObject) -> List[dbWorkLine]:
    """

    you are using golang to do the search

    [1] send the searchdict to redis as a list of json.dumps(items) (keyed to the searchid)
    [2] send the external fnc the searchid, cap value, worker #, psql login info, redis login info
    [3] wait for the function to (a) gather; (b) search; (c) store
    [4] pull the results back from redis via the searchid
    NB: redis makes sense because the activity poll is going to have to be done via redis anyway...

    the searched items are stored under the redis key 'searchid_results'
    json.loads() will leave you with a dictionary of k/v pairs that can be turned into a dbWorkLine

    """

    warning = 'attempted to search via external helper but {x} is not available using precomposedsqlsearchmanager() instead'

    if not gosearch and not haveexternalhelper(getexternalhelperpath()):
        x = 'the external module'
        if not haveexternalhelper(getexternalhelperpath()):
            x = hipparchia.config['EXTERNALBINARYNAME']
        consolewarning(warning.format(x=x), color='red')
        return precomposedsqlsearchmanager(so)

    if not canuseredis:
        consolewarning(warning.format(x='redis'), color='red')
        return precomposedsqlsearchmanager(so)

    rc = establishredisconnection()

    so.searchsqldict = rewritesqlsearchdictforexternalhelper(so)
    # debugmessage('storing search at "{r}"'.format(r=so.searchid))

    for s in so.searchsqldict:
        rc.sadd(so.searchid, json.dumps(so.searchsqldict[s]))

    # if 1 > 0:
    #     consolewarning('precomposedgolangsearcher() merely stored the search in redis and did not execute it')
    #     return list()

    if not hipparchia.config['GRABBERCALLEDVIACLI']:
        resultrediskey = helpersharedlibrarysearcher(so)
    else:
        resultrediskey = helperclibinarysearcher(so)

    redisresults = redisfetch(resultrediskey)

    hits = [redishitintodbworkline(r) for r in redisresults]

    return hits
def pythonvectors(so: SearchObject) -> JSON_STR:
    """

    this is the matching function to golangvectors()

    [0] test to see what will happen:
        [a] scope problems? [jump away if so...]
        [b] already a model on file? ... [jump down to #5 if so]
    [1] generate a searchlist
    [2] do a searchlistintosqldict()
    [3] acquire and bag the words
        [a] grab db lines that are relevant to the search
        [b] turn them into a unified text block
        [c] do some preliminary cleanups
        [d] break the text into sentences and assemble []SentenceWithLocus (NB: these are "unlemmatized bags of words")
        [e] figure out all of the words used in the passage
        [f] find all of the parsing info relative to these words
        [g] figure out which headwords to associate with the collection of words
        [h] build the lemmatized bags of words ('unlemmatized' can skip [f] and [g]...)
    [4] hand the bags over to Word2Vec(), etc. [*]
    [5] run queries against the model and return the JSON results
    """

    # debugmessage('pythonvectors()')
    assert so.vectorquerytype in [
        'analogies', 'nearestneighborsquery', 'topicmodel'
    ]

    # [0] is this really going to happen?
    so.poll.statusis('Checking for valid search')
    # [i] do we bail out before even getting started?
    # note that this can / will return independently and break here
    abortjson = checkneedtoabort(so)
    if abortjson:
        del so.poll
        return abortjson

    # [ii] do we actually have a model stored already?
    so.poll.statusis('Checking for stored search')
    # calculatewholeauthorsearches() + configurewhereclausedata()
    so = updatesearchlistandsearchobject(so)
    so.setsearchlistthumbprint()
    so.poll.allworkis(-1)  # this turns off the % completed notice in the JS
    so.poll.sethits(0)

    themodel = checkforstoredvector(so)

    if not themodel:
        # [1] generate a searchlist: use executesearch() as the template

        so.usecolumn = 'marked_up_line'
        so.cap = 199999999

        # [2] do a searchlistintosqldict() [this is killing lda...]
        so.searchsqldict = searchlistintosqldict(so, str(), vectors=True)

        bagsofwords = acquireandbagthewords(so)

        # [4] hand the bags over to Word2Vec(), etc.
        so.poll.statusis('Building the model')
        if so.vectorquerytype == 'nearestneighborsquery':
            themodel = buildgensimmodel(so, bagsofwords)
        elif so.vectorquerytype == 'analogies':
            # the same gensim model can serve both analogies and neighbors
            themodel = buildgensimmodel(so, bagsofwords)
        elif so.vectorquerytype == 'topicmodel':
            stops = list(mostcommonwordsviaheadwords())
            bagsofsentences = [' '.join(b) for b in bagsofwords]
            bagsofsentences = [
                removestopwords(s, stops) for s in bagsofsentences
            ]
            themodel = buildsklearnselectedworks(so, bagsofsentences)
        else:
            pass
    elif so.iamarobot:
        # there is a model and the bot is attempting to build something that has already been build
        return '<!-- MODEL EXISTS -->'

    # so we have a model one way or the other by now...
    # [5] run queries against the model
    if so.iamarobot:
        return '<!-- MODEL BUILT -->'

    if so.vectorquerytype == 'nearestneighborsquery':
        jsonoutput = generatenearestneighbordata(None, len(so.searchlist), so,
                                                 themodel)
    elif so.vectorquerytype == 'analogies':
        jsonoutput = gensimgenerateanalogies(themodel, so)
    elif so.vectorquerytype == 'topicmodel':
        # def ldatopicsgenerateoutput(ldavishtmlandjs: str, workssearched: int, settings: dict, searchobject: SearchObject):
        jsonoutput = ldatopicsgenerateoutput(themodel, so)
    else:
        jsonoutput = json.dumps(
            'golang cannot execute {s} queries'.format(s=so.vectorquerytype))

    return jsonoutput
Exemple #4
0
def precomposedsqlsearch(so: SearchObject) -> List[dbWorkLine]:
    """

    flow control for searching governed by so.searchtype

    speed notes: the speed of these searches is consonant with that of the old search code; usu. <1s difference

    sqlphrasesearch() was eliminated in order to keep the code base more streamlined

    """

    assert so.searchtype in [
        'simple', 'simplelemma', 'proximity', 'phrase', 'phraseandproximity'
    ], 'unknown searchtype sent to rawsqlsearches()'

    so.poll.statusis('Executing a {t} search...'.format(t=so.searchtype))

    so.searchsqldict = searchlistintosqldict(so, so.termone)
    if so.lemmaone:
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    searchfnc = lambda x: list()

    if so.searchtype in ['simple', 'simplelemma']:
        searchfnc = basicprecomposedsqlsearcher
    elif so.searchtype == 'proximity':
        # search for the least common terms first: swap termone and termtwo if need be
        so = rebuildsearchobjectviasearchorder(so)
        if so.scope == 'lines':
            # this will hit rawdsqlsearchmanager() 2x
            searchfnc = precomposedsqlwithinxlinessearch
        else:
            searchfnc = precomposedsqlwithinxwords
    elif so.searchtype == 'phrase':
        so.phrase = so.termone
        # so.leastcommon = findleastcommonterm(so.termone, so.accented)
        searchfnc = precomposedsqlsubqueryphrasesearch
    elif so.searchtype == 'phraseandproximity':
        so.phrase = so.termone
        searchfnc = precomposedphraseandproximitysearch
    else:
        # should be hard to reach this because of "assert" above
        consolewarning(
            'rawsqlsearches() does not support {t} searching'.format(
                t=so.searchtype),
            color='red')

    so.searchsqldict = searchlistintosqldict(so, so.termone)
    if so.lemmaone:
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    hitlist = searchfnc(so)

    if so.onehit:
        # you might still have two hits from the same author; purge the doubles
        # use unique keys property of a dict() to do it
        uniqueauthors = {h.authorid: h for h in hitlist}
        hitlist = [uniqueauthors[a] for a in uniqueauthors]

    hitlist = hitlist[:so.cap]

    return hitlist
Exemple #5
0
def precomposedphraseandproximitysearch(so: SearchObject) -> List[dbWorkLine]:
    """

    do a precomposedsqlsubqueryphrasesearch() and then search inside the results for part two...

    corner case tester: two line-enders: non solum + temporum dignitatem

    [12]   Caesar, De Bello Gallico: book 7, chapter 54, section 4, line 2

    7.54.3.3 multatos agris, omnibus ereptis sociis, imposito stipendio,
    7.54.4.1 obsidibus summa cum contumelia extortis, et quam in
    7.54.4.2 fortunam quamque in amplitudinem deduxisset, ut non
    7.54.4.3 solum in pristinum statum redissent, sed omnium tem-
    7.54.4.4 porum dignitatem et gratiam antecessisse viderentur.


    corner case tester: two distant line-enders: temporum dignitatem + obsides Galliae

    ut non
    solum in pristinum statum redissent, sed omnium tem- 	7.54.4.3
    porum dignitatem et gratiam antecessisse viderentur.
    his datis mandatis eos ab se dimisit.
          Noviodunum erat oppidum Haeduorum ad ripas 	7.55.1.1
    Ligeris opportuno loco positum. huc Caesar omnes ob- 	7.55.2.1
    sides Galliae, frumentum, pecuniam publicam, suorum

    the old code will trick you by pretending it is doing a valid search even though it is not really set up
    to handle this situation and was not supposed to promise that it could do phrase+
    [it's the phrase-spanning-two-lines bit that yields the problem since you do "lemma+" but have no handler for
    the multi-line issue]

    0.0.0-1.8.1

    Sought all 19 known forms of »χώρα« within 1 lines of »μεγάλην δύναμιν«
    Searched 3,182 works and found 1 passage (0.77s)
    Searched between 850 B.C.E. and 300 B.C.E.
    Sorted by name
    [1]   Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 5, line 47

    3c,688,F.5.45 τόπουϲ. (3) γενόμενον δ’ ἀποϲτάτην καὶ πείϲαντα τὸ ϲύμπαν ἔθνοϲ ἀντέχεϲθαι
    3c,688,F.5.46 τῆϲ ἐλευθερίαϲ, αἱρεθῆναι ϲτρατηγὸν διὰ τὴν ἀνδρείαν. ἔπειτα πυνθανόμενον
    3c,688,F.5.47 ἀθροιζομένην ἐπ’ αὐτὸν μεγάλην δύναμιν, καθοπλίϲαι τοὺϲ Καδουϲίουϲ παν-
    3c,688,F.5.48 δημεί, καὶ καταϲτρατοπεδεῦϲαι πρὸϲ ταῖϲ εἰϲ τὴν χώραν εἰϲβολαῖϲ, ἔχοντα
    3c,688,F.5.49 τοὺϲ ϲύμπανταϲ οὐκ ἐλάττουϲ εἴκοϲι μυριάδων. (4) τοῦ δὲ βαϲιλέωϲ Ἀρταίου

    1.8.2+

    Sought all 19 known forms of »χώρα« within 1 lines of »μεγάλην δύναμιν«
    Searched 2,346 works and found 2 passages (2.2s)
    Searched between 850 B.C.E. and 300 B.C.E.
    Sorted by name
    [1]   Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 5, line 47

    3c,688,F.5.45 τόπουϲ. (3) γενόμενον δ’ ἀποϲτάτην καὶ πείϲαντα τὸ ϲύμπαν ἔθνοϲ ἀντέχεϲθαι
    3c,688,F.5.46 τῆϲ ἐλευθερίαϲ, αἱρεθῆναι ϲτρατηγὸν διὰ τὴν ἀνδρείαν. ἔπειτα πυνθανόμενον
    3c,688,F.5.47 ἀθροιζομένην ἐπ’ αὐτὸν μεγάλην δύναμιν, καθοπλίϲαι τοὺϲ Καδουϲίουϲ παν-
    3c,688,F.5.48 δημεί, καὶ καταϲτρατοπεδεῦϲαι πρὸϲ ταῖϲ εἰϲ τὴν χώραν εἰϲβολαῖϲ, ἔχοντα
    3c,688,F.5.49 τοὺϲ ϲύμπανταϲ οὐκ ἐλάττουϲ εἴκοϲι μυριάδων. (4) τοῦ δὲ βαϲιλέωϲ Ἀρταίου
    [2]   Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 14, line 54

    3c,688,F.14.52    (40) καὶ ἐλυπήθη λύπην ϲφοδρὰν Μεγάβυζοϲ, καὶ ἐπένθηϲε, καὶ ἠιτήϲατο
    3c,688,F.14.53 ἐπὶ Ϲυρίαν τὴν ἑαυτοῦ χώραν ἀπιέναι. ἐνταῦθα λάθραι καὶ τοὺϲ ἄλλουϲ τῶν
    3c,688,F.14.54 Ἑλλήνων προέπεμπε. καὶ ἀπήιει, καὶ ἀπέϲτη βαϲιλέωϲ, καὶ ἀθροίζει μεγάλην
    3c,688,F.14.55 δύναμιν ἄχρι πεντεκαίδεκα μυριάδων χωρὶϲ τῶν ἱππέων [καὶ τῶν πεζῶν].
    3c,688,F.14.56 καὶ πέμπεται Οὔϲιριϲ κατ’ αὐτοῦ ϲὺν ⟨κ⟩ μυριάϲι, καὶ ϲυνάπτεται πόλεμοϲ, καὶ

    """

    #
    # initially do "within x lines"
    #

    phrasefinder = re.compile(r'[^\s]\s[^\s]')
    if re.search(phrasefinder, so.seeking) and re.search(
            phrasefinder, so.proximate):
        secondsearch = precomposedsqlsubqueryphrasesearch
    elif not re.search(phrasefinder, so.seeking) and re.search(
            phrasefinder, so.proximate):
        so.swapseekingandproxmate()
        so.swaplemmaoneandtwo()
        secondsearch = basicprecomposedsqlsearcher
    else:
        secondsearch = basicprecomposedsqlsearcher

    c = so.cap
    ps = so.proximate
    so.proximate = str()
    pl = so.lemmatwo
    so.lemmatwo = str()
    so.phrase = so.seeking
    firstterm = so.phrase

    so.cap = hipparchia.config['INTERMEDIATESEARCHCAP']

    initialhitlines = precomposedsqlsubqueryphrasesearch(so)

    so.seeking = ps
    so.lemmaone = pl
    so.setsearchtype()
    so.cap = c

    if secondsearch == precomposedsqlsubqueryphrasesearch:
        so.phrase = ps
    else:
        so.phrase = str()

    so = perparesoforsecondsqldict(so, initialhitlines)
    so.searchsqldict = searchlistintosqldict(so, so.seeking)

    if so.lemmaone:
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    so.poll.sethits(0)

    newhitlines = secondsearch(so)

    initialhitlinedict = {hl.uniqueid: hl for hl in initialhitlines}
    newhitlineids = set()

    for nhl in newhitlines:
        indices = list(
            range(nhl.index - so.distance, nhl.index + so.distance + 1))
        ids = ['{a}_{b}'.format(a=nhl.wkuinversalid, b=i) for i in indices]
        newhitlineids.update(ids)

    maybefinalhitines = list()
    if so.near:
        # "is near"
        maybefinalhitines = [
            initialhitlinedict[hl] for hl in initialhitlinedict
            if hl in newhitlineids
        ]
    elif not so.near:
        # "is not near"
        maybefinalhitines = [
            initialhitlinedict[hl] for hl in initialhitlinedict
            if hl not in newhitlineids
        ]

    #
    # if neccessary, do "within x words" as x lines hits will always be a subset of the first set
    #

    if so.lemmaone:
        secondterm = wordlistintoregex(so.lemmaone.formlist)
    else:
        secondterm = so.seeking

    if so.scope == 'words':
        finalhitlines = paredowntowithinxwords(so, firstterm, secondterm,
                                               maybefinalhitines)
    else:
        finalhitlines = maybefinalhitines

    # to humor rewriteskgandprx()
    # but that formatting doesn't 100% work yet...

    so.termone = firstterm
    so.termtwo = secondterm
    so.lemmatwo = so.lemmaone

    return finalhitlines
Exemple #6
0
def precomposedsqlsubqueryphrasesearch(so: SearchObject) -> List[dbWorkLine]:
    """

    use subquery syntax to grab multi-line windows of text for phrase searching

    line ends and line beginning issues can be overcome this way, but then you have plenty of
    bookkeeping to do to to get the proper results focussed on the right line

    these searches take linear time: same basic time for any given scope regardless of the query

    """

    # rebuild the searchsqldict but this time pass through rewritequerystringforsubqueryphrasesearching()
    so.searchsqldict = searchlistintosqldict(so,
                                             so.phrase,
                                             subqueryphrasesearch=True)

    # debugmessage('precomposedsqlsubqueryphrasesearch() so.searchsqldict: {d}'.format(d=so.searchsqldict))

    # the windowed collection of lines; you will need to work to find the centers
    # windowing will increase the number of hits: 2+ lines per actual find
    initialhitlines = generatepreliminaryhitlist(so, recap=so.cap * 3)

    m = 'Generating final list of hits by searching among the {h} preliminary hits'
    so.poll.statusis(m.format(h=so.poll.gethits()))
    so.poll.sethits(0)

    sp = re.sub(r'^\s', r'(^|\\s)', so.phrase)
    sp = re.sub(r'\s$', r'(\\s|$)', sp)

    combinations = QueryCombinator(so.phrase)
    # the last item is the full phrase and it will have already been searched:  ('one two three four five', '')
    combinations = combinations.combinations()
    combinations.pop()

    listoffinds = list()

    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()

    setofhits = set()

    while initialhitlines:
        # windows of indices come back: e.g., three lines that look like they match when only one matches [3131, 3132, 3133]
        # figure out which line is really the line with the goods
        # it is not nearly so simple as picking the 2nd element in any run of 3: no always runs of 3 + matches in
        # subsequent lines means that you really should check your work carefully; this is not an especially costly
        # operation relative to the whole search and esp. relative to the speed gains of using a subquery search
        lineobject = initialhitlines.pop()
        if not so.onehit or lineobject.authorid not in setofhits:
            if re.search(sp, getattr(lineobject, so.usewordlist)):
                listoffinds.append(lineobject)
                so.poll.addhits(1)
                setofhits.add(lineobject.authorid)
            else:
                try:
                    nextline = initialhitlines[0]
                except IndexError:
                    nextline = makeablankline('gr0000w000', -1)

                if lineobject.wkuinversalid != nextline.wkuinversalid or lineobject.index != (
                        nextline.index - 1):
                    # you grabbed the next line on the pile (e.g., index = 9999), not the actual next line (e.g., index = 101)
                    # usually you won't get a hit by grabbing the next db line, but sometimes you do...
                    query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(
                        wtmpl=worklinetemplate, tb=lineobject.authorid)
                    data = (lineobject.index + 1, )
                    dbcursor.execute(query, data)
                    try:
                        nextline = dblineintolineobject(dbcursor.fetchone())
                    except:
                        nextline = makeablankline('gr0000w000', -1)

                for c in combinations:
                    tail = c[0] + '$'
                    head = '^' + c[1]

                    t = False
                    h = False
                    try:
                        t = re.search(tail, getattr(lineobject,
                                                    so.usewordlist))
                    except re.error:
                        pass
                    try:
                        h = re.search(head, getattr(nextline, so.usewordlist))
                    except re.error:
                        pass

                    if t and h:
                        listoffinds.append(lineobject)
                        so.poll.addhits(1)
                        setofhits.add(lineobject.authorid)

    dbconnection.connectioncleanup()
    return listoffinds
Exemple #7
0
def precomposedsqlwithinxlinessearch(so: SearchObject) -> List[dbWorkLine]:
    """

    after finding x, look for y within n lines of x

    people who send phrases to both halves and/or a lot of regex will not always get what they want

    note that this implementations is significantly slower than the standard withinxlines() + simplewithinxlines()

    """

    initialhitlines = generatepreliminaryhitlist(so)

    # we are going to need a new searchsqldict w/ a new temptable
    # sq = { table1: {query: q, data: d, temptable: t},
    #         table2: {query: q, data: d, temptable: t}, ...

    # this means refeeding searchlistintosqldict() and priming it for a 'temptable' search
    # the temptable follows the paradigm of wholeworktemptablecontents()
    # r {'type': 'temptable', 'where': {'tempquery': '\n\tCREATE TEMPORARY TABLE in0f08_includelist AS \n\t\tSELECT values \n\t\t\tAS includeindex FROM unnest(ARRAY[768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,763,764,765,766,767]) values\n\t'}}

    so = perparesoforsecondsqldict(so, initialhitlines)

    so.searchsqldict = searchlistintosqldict(so, so.termtwo)
    if so.lemmatwo:
        so.lemmaone = so.lemmatwo
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    m = 'Now searching among the {n} initial finds for {l}"{x}"'
    so.poll.statusis(m.format(n=len(initialhitlines), x=so.termtwo, l=str()))
    if so.lemmaone:
        so.poll.statusis(
            m.format(n=len(initialhitlines),
                     x=so.lemmaone.dictionaryentry,
                     l="all forms of "))

    so.poll.sethits(0)
    newhitlines = basicprecomposedsqlsearcher(so)

    # newhitlines will contain, e.g., in0001w0ig_493 and in0001w0ig_492, i.e., 2 lines that are part of the same 'hit'
    # so we need can't use newhitlines directly but have to check it against the initial hits
    # that's fine since "not near" would push us in this direction in any case

    initialhitlinedict = {hl.uniqueid: hl for hl in initialhitlines}
    newhitlineids = set()
    for nhl in newhitlines:
        indices = list(
            range(nhl.index - so.distance, nhl.index + so.distance + 1))
        ids = ['{a}_{b}'.format(a=nhl.wkuinversalid, b=i) for i in indices]
        newhitlineids.update(ids)

    finalhitlines = list()
    if so.near:
        # "is near"
        finalhitlines = [
            initialhitlinedict[hl] for hl in initialhitlinedict
            if hl in newhitlineids
        ]
    elif not so.near:
        # "is not near"
        finalhitlines = [
            initialhitlinedict[hl] for hl in initialhitlinedict
            if hl not in newhitlineids
        ]

    return finalhitlines