def breaktextsintosentences(foundsentences: ListProxy, searchlist: ListProxy, so: SearchObject, dbconnection: ConnectionObject) -> List[tuple]:
    """
    chunk texts into sentences and collect the sentences that contain the search term

    findsentences() does both the chunking and the matching; a sample result element:

        ('line/gr0014w001/1', 'ἀντὶ πολλῶν ἄν ὦ ἄνδρεϲ ἀθηναῖοι χρημάτων ὑμᾶϲ ἑλέϲθαι νομίζω εἰ φανερὸν γένοιτο τὸ μέλλον ϲυνοίϲειν τῇ πόλει περὶ ὧν νυνὶ ϲκοπεῖτε')

    :param foundsentences:
    :param searchlist:
    :param so:
    :param dbconnection:
    :return:
    """
    progresspoll = so.poll
    cursor = dbconnection.cursor()
    passes = 0
    while searchlist:
        passes += 1
        # a sibling worker can drain the shared list between the loop test and the pop
        try:
            table = searchlist.pop()
        except IndexError:
            table = None
        if table:
            foundsentences.extend(findsentences(table, so, cursor))
        dbconnection.checkneedtocommit(passes)
        # the poll may be absent/unready; skip the progress update in that case
        try:
            progresspoll.remain(len(searchlist))
        except TypeError:
            pass
    return foundsentences
def mpmorphology(terms: list, furtherdeabbreviate: bool, dictofmorphobjects, dbconnection: ConnectionObject) -> dict:
    """
    fill a dict with morphology objects, one entry per term

    :param terms:
    :param furtherdeabbreviate:
    :param dictofmorphobjects:
    :param dbconnection:
    :return:
    """
    if not dbconnection:
        dbconnection = ConnectionObject()
    cursor = dbconnection.cursor()
    iteration = 0
    while terms:
        iteration += 1
        dbconnection.checkneedtocommit(iteration)
        # a sibling worker can empty the shared list after the loop test
        try:
            headword = terms.pop()
        except IndexError:
            headword = None
        if headword:
            morphobject = lookformorphologymatches(headword, cursor, furtherdeabbreviate=furtherdeabbreviate)
            # record misses too: callers can then tell 'looked up, no match' from 'never looked up'
            dictofmorphobjects[headword] = morphobject if morphobject else None
    if not icanpickleconnections():
        dbconnection.connectioncleanup()
    return dictofmorphobjects
class GenericSearchFunctionObject(object):
    """
    a class to hold repeated code for the searches

    the chief difference between most search types is the number and names of the parameters passed
    to self.searchfunction

    one also has the option of storing the searchlist either in shared memory or in a redis set

    the retrieval, parsing, and ending checks for the two structures are different

    handling the matrix of possibilities for search_type and list_type combinations produces the
    somewhat tangled set of options below
    """

    def __init__(self, workerid, foundlineobjects: ListProxy, listofplacestosearch, searchobject: SearchObject, dbconnection, searchfunction):
        self.workerid = workerid
        self.commitcount = 0
        if dbconnection:
            self.dbconnection = dbconnection
            self.needconnectioncleanup = False
        else:
            # you are running Windows and can't pickle your connections
            self.dbconnection = ConnectionObject()
            self.needconnectioncleanup = True
        self.dbcursor = self.dbconnection.cursor()
        self.so = searchobject
        self.foundlineobjects = foundlineobjects
        self.listofplacestosearch = listofplacestosearch
        self.searchfunction = searchfunction
        # subclass/caller sets this to the parameter template containing 'parametertoswap'
        self.searchfunctionparameters = None
        self.activepoll = self.so.poll
        self.parameterswapper = self.simpleparamswapper
        self.emptytest = self.listofplacestosearch
        try:
            # shared-memory list: items come straight off the ListProxy
            self.getnetxitem = self.listofplacestosearch.pop
        except AttributeError:
            # this should get implemented momentarily after this GenericObject has been initialized
            self.getnetxitem = NotImplementedError
        self.remainder = self.listofplacestosearch
        self.emptyerror = IndexError
        self.remaindererror = TypeError

    def authorsamongthefinds(self) -> set:
        """return the set of authorids among the lines found so far"""
        authorset = {f.authorid for f in self.foundlineobjects}
        return authorset

    def getnextfnc(self):
        """commit if needed, then pop the next search location; None once the list is exhausted"""
        self.commitcount += 1
        self.dbconnection.checkneedtocommit(self.commitcount)
        try:
            nextsearchlocation = self.getnetxitem(0)
        except self.emptyerror:
            nextsearchlocation = None
        return nextsearchlocation

    def getremain(self):
        return len(self.remainder)

    def listcleanup(self):
        # hook for subclasses whose list storage needs teardown; no-op here
        pass

    def addnewfindstolistoffinds(self, newfinds: list):
        self.foundlineobjects.extend(newfinds)

    def updatepollremaining(self):
        try:
            self.activepoll.remain(self.getremain())
        except self.remaindererror:
            # self.remainder may not support len() — presumably the redis-backed case; warn instead
            self.activepoll.setnotes('Number remaining unavailable: % complete will be inaccurate')

    def updatepollfinds(self, lines: list):
        if lines:
            numberoffinds = len(lines)
            self.activepoll.addhits(numberoffinds)
        return

    def simpleparamswapper(self, texttoinsert: str, insertposition: int) -> list:
        """
        the various searchfunctions have different interfaces

        this lets you get the right collection of paramaters into the various functions

        :param texttoinsert:
        :param insertposition:
        :return:
        """
        parameters = self.searchfunctionparameters
        parameters[insertposition] = texttoinsert
        return parameters

    def tupleparamswapper(self, tupletoinsert: tuple, insertposition: int) -> list:
        """
        somewhat brittle, but...

        this handles the non-standard case of a tuple that needs swapping instead of an individual
        name (i.e., it works with the lemmatized search)

        :param tupletoinsert:
        :param insertposition:
        :return:
        """
        if self.so.redissearchlist:
            # items that came out of a redis list arrive pickled
            tupletoinsert = pickle.loads(tupletoinsert)
        parameters = self.searchfunctionparameters
        head = parameters[:insertposition]
        tail = parameters[insertposition + 1:]
        newparams = head + list(tupletoinsert) + tail
        return newparams

    def iteratethroughsearchlist(self):
        """
        this is the simple core of the whole thing; the rest is about feeding it properly

        if you do not pickle the lineobjects here and now you will need to generate line objects
        at the other end

            foundlineobjects = [dblineintolineobject(item) for item in founddblineobjects]

        you will also need to use lo.decompose() in phrasesearching.py to feed the findslist

        :return:
        """
        insertposition = self.searchfunctionparameters.index('parametertoswap')
        while self.emptytest and self.activepoll.gethits() <= self.so.cap:
            srchfunct = self.searchfunction
            nextitem = self.getnextfnc()
            if self.so.session['onehit']:
                # simplelemma chunk might have already searched and found in an author
                if self.so.lemma or self.so.proximatelemma:
                    # nextitem looks like '(chunk, item)'
                    # BUGFIX: nextitem is None once the list is exhausted; guard before
                    # subscripting so we fall through to the exhaustion branch below
                    # instead of raising TypeError
                    if nextitem and nextitem[1] in self.authorsamongthefinds():
                        srchfunct = None
            if nextitem and srchfunct:
                params = self.parameterswapper(nextitem, insertposition)
                foundlines = srchfunct(*tuple(params))
                lineobjects = [dblineintolineobject(f) for f in foundlines]
                self.addnewfindstolistoffinds(lineobjects)
                self.updatepollfinds(lineobjects)
                self.updatepollremaining()
            elif not srchfunct:
                # onehit search already found something in this author; skip the chunk
                pass
            else:
                # listofplacestosearch has been exhausted
                break
        self.listcleanup()
        if self.needconnectioncleanup:
            self.dbconnection.connectioncleanup()
        # empty return because foundlineobjects is a ListProxy:
        # ask for self.foundlineobjects as the search result instead
        return
def workonprecomposedsqlsearch(workerid: int, foundlineobjects: ListProxy, listofplacestosearch: ListProxy, searchobject: SearchObject, dbconnection) -> ListProxy:
    """
    walk listofplacestosearch, run precomposedsqlsearcher() on each element, and collect the hits

    each element of listofplacestosearch is a query dict; the whole looks like:

        [{'temptable': '', 'query': 'SELECT ...', 'data': ('ὕβριν',)},
         {'temptable': '', 'query': 'SELECT ...', 'data': ('ὕβριν',)} ...]

    this is supposed to give you one query per hipparchiaDB table unless you are lemmatizing

    :param workerid:
    :param foundlineobjects:
    :param listofplacestosearch:
    :param searchobject:
    :param dbconnection:
    :return:
    """
    if not dbconnection:
        dbconnection = ConnectionObject()
    so = searchobject
    progresspoll = so.poll
    dbconnection.setreadonly(False)
    cursor = dbconnection.cursor()
    passes = 0
    fetchnext = listofplacestosearch.pop
    exhausted = IndexError
    nolength = TypeError
    while listofplacestosearch and progresspoll.gethits() <= so.cap:
        passes += 1
        dbconnection.checkneedtocommit(passes)
        try:
            querydict = fetchnext(0)
        except exhausted:
            # shared list drained (possibly by a sibling worker): signal loop exit
            querydict = None
            listofplacestosearch = None
        if querydict:
            hits = precomposedsqlsearcher(querydict, cursor)
            newlineobjects = [dblineintolineobject(h) for h in hits]
            foundlineobjects.extend(newlineobjects)
            if newlineobjects:
                progresspoll.addhits(len(newlineobjects))
        else:
            listofplacestosearch = None
        # listofplacestosearch can be None by now; len() then raises and we skip the update
        try:
            progresspoll.remain(len(listofplacestosearch))
        except nolength:
            pass
    if not icanpickleconnections():
        dbconnection.connectioncleanup()
    return foundlineobjects