def precomposedsqlsearchmanager(so: SearchObject) -> List[dbWorkLine]:
    """

    quick and dirty dispatcher: fan a precomposed SQL search out over
    multiprocessing workers and collect the line objects they find

    note that you need so.searchsqldict to be properly configured before you get here:
    it maps table keys to the query dicts that workonprecomposedsqlsearch() consumes

    :param so: the SearchObject carrying the queries and the progress poll
    :return: a list of dbWorkLine objects found by the workers

    """

    activepoll = so.poll

    workers = setthreadcount()

    manager = Manager()
    foundlineobjects = manager.list()

    # one query dict per author table; the keys themselves are not needed
    searchsqlbyauthor = manager.list(list(so.searchsqldict.values()))

    activepoll.allworkis(len(searchsqlbyauthor))
    activepoll.remain(len(searchsqlbyauthor))
    activepoll.sethits(0)

    argumentuple = [foundlineobjects, searchsqlbyauthor, so]

    if icanpickleconnections():
        # each worker needs its own connection or there will be threading trouble
        oneconnectionperworker = {i: ConnectionObject() for i in range(workers)}
    else:
        # each worker will grab its own connection once it starts
        oneconnectionperworker = {i: None for i in range(workers)}

    argumentswithconnections = [tuple([i] + list(argumentuple) + [oneconnectionperworker[i]]) for i in range(workers)]

    if platform.system() == 'Windows':
        # windows hates multiprocessing; but in practice windows should never be coming here: HipparchiaGoDBHelper...
        return workonprecomposedsqlsearch(*argumentswithconnections[0])

    jobs = [Process(target=workonprecomposedsqlsearch, args=argumentswithconnections[i]) for i in range(workers)]

    for j in jobs:
        j.start()
    for j in jobs:
        j.join()

    # the ListProxy needs to turn into a real list
    foundlineobjects = list(foundlineobjects)

    # BUGFIX: when connections are unpicklable the dict holds None values and
    # calling connectioncleanup() on them would raise AttributeError;
    # guard as getrequiredmorphobjects() does
    if oneconnectionperworker[0]:
        for c in oneconnectionperworker:
            oneconnectionperworker[c].connectioncleanup()

    return foundlineobjects
def getrequiredmorphobjects(setofterms: set,
                            furtherdeabbreviate=False) -> dict:
    """

    take a set of terms and find the morphology objects associated with them
    by fanning mpmorphology() out over multiprocessing workers

    :param setofterms: the words whose morphology objects you want
    :param furtherdeabbreviate: passed through to the workers
    :return: a dict mapping each term to its morphology object (or None)
    """

    workers = setthreadcount()

    if icanpickleconnections():
        oneconnectionperworker = {w: ConnectionObject() for w in range(workers)}
    else:
        oneconnectionperworker = {w: None for w in range(workers)}

    if platform.system() == 'Windows':
        # windows hates multiprocessing; but in practice windows should never be coming here: HipparchiaGoDBHelper...
        return mpmorphology(list(setofterms), furtherdeabbreviate, dict(),
                            oneconnectionperworker[0])

    manager = Manager()
    terms = manager.list(list(setofterms))
    morphobjects = manager.dict()

    jobs = list()
    for w in range(workers):
        workerargs = (terms, furtherdeabbreviate, morphobjects, oneconnectionperworker[w])
        jobs.append(Process(target=mpmorphology, args=workerargs))

    for job in jobs:
        job.start()
    for job in jobs:
        job.join()

    # None values mean the workers opened (and cleaned up) their own connections
    if oneconnectionperworker[0]:
        for w in oneconnectionperworker:
            oneconnectionperworker[w].connectioncleanup()

    return morphobjects
def mpmorphology(terms: list, furtherdeabbreviate: bool, dictofmorphobjects,
                 dbconnection: ConnectionObject) -> dict:
    """

    worker: drain a (shared) list of terms and build a dict of morphology objects

    :param terms: terms still awaiting lookup; may be a shared ListProxy
    :param furtherdeabbreviate: forwarded to lookformorphologymatches()
    :param dictofmorphobjects: term -> morphology object (or None); may be a shared DictProxy
    :param dbconnection: a ConnectionObject, or None if connections cannot be pickled
    :return: dictofmorphobjects
    """

    if not dbconnection:
        # we were not handed a connection: open our own for the duration
        dbconnection = ConnectionObject()

    dbcursor = dbconnection.cursor()

    loopcount = 0
    while terms:
        loopcount += 1
        dbconnection.checkneedtocommit(loopcount)
        try:
            candidate = terms.pop()
        except IndexError:
            # another worker drained the shared list between the check and the pop
            candidate = None

        if candidate:
            match = lookformorphologymatches(
                candidate, dbcursor, furtherdeabbreviate=furtherdeabbreviate)
            # record misses as None so callers can distinguish "looked up" from "never seen"
            dictofmorphobjects[candidate] = match if match else None

    if not icanpickleconnections():
        # this was our own private connection: close it down
        dbconnection.connectioncleanup()

    return dictofmorphobjects
# NOTE(review): "Beispiel #4" / "0" here is a scraping artifact (an example-site
# banner fused into the source), not code; commented out so the file parses.
def dynamicsqlsearchdispatcher(searchobject: SearchObject) -> List[dbWorkLine]:
    """

    assign the search to multiprocessing workers
        searchobject:
            <server.hipparchiaclasses.SearchObject object at 0x1102c15f8>
        activepoll:
            <server.hipparchiaclasses.ProgressPoll object at 0x1102c15f8>

    picks a target function and argument tuple based on so.searchtype
    ('simple', 'simplelemma', 'phrase', 'proximity'), fans the work out over
    setthreadcount() processes, and collects the hits either from a shared
    Manager list or from redis

    :param searchobject:
    :return: a list of dbWorkLine objects
    """

    # clean out the pool if necessary before starting
    # this seems like the safest time for a reset of the pool: otherwise you could have workers working
    # but if you have a multi-user environment AND pool problems this code might make things worse
    cleanpoolifneeded()

    so = searchobject
    activepoll = so.poll

    # recompose 'searchingfor' (if it exists)
    # note that 'proximate' does not need as many checks
    if so.seeking:
        searchingfor = massagesearchtermsforwhitespace(so.seeking)
    else:
        searchingfor = str()

    # lunate sigmas / UV / JI issues
    # NOTE(review): 'unomdifiedskg' is a typo for 'unmodifiedskg'; left as-is since it is referenced below
    unomdifiedskg = searchingfor
    unmodifiedprx = so.proximate

    # NOTE(review): 'the the' is a typo in this user-facing status string; fixing it would change runtime output
    activepoll.statusis('Loading the the dispatcher...')

    # of long-term interest is the new shared_memory module; using it will break the 3.6-3.7 installations
    # https://docs.python.org/3.8/library/multiprocessing.shared_memory.html#module-multiprocessing.shared_memory

    manager = Manager()
    founddblineobjects = manager.list()

    workers = setthreadcount()

    # when redis holds both the work list and the results, the workers read/write there
    # instead of through Manager proxies
    if so.redisresultlist and so.redissearchlist:
        listofplacestosearch = None
        buildredissearchlist(list(so.indexrestrictions.keys()), so.searchid)
    else:
        listofplacestosearch = manager.list(so.indexrestrictions.keys())

    activepoll.allworkis(len(so.searchlist))
    activepoll.remain(len(so.indexrestrictions.keys()))
    activepoll.sethits(0)

    # be careful about getting mp aware args into the function

    targetfunction = None
    argumentuple = None

    if so.searchtype == 'simple':
        activepoll.statusis('Executing a simple word search...')
        targetfunction = workonsimplesearch
        argumentuple = (founddblineobjects, listofplacestosearch, so)
    elif so.searchtype == 'simplelemma':
        activepoll.statusis(
            'Executing a lemmatized word search for the {n} known forms of {w}...'
            .format(n=len(so.lemma.formlist), w=so.lemma.dictionaryentry))
        # don't search for every form at once (100+?)
        # instead build a list of tuples: [(ORed_regex_forms_part_01, authortable1), ...]
        chunksize = hipparchia.config['LEMMACHUNKSIZE']
        terms = so.lemma.formlist
        chunked = [
            terms[i:i + chunksize] for i in range(0, len(terms), chunksize)
        ]
        chunked = [wordlistintoregex(c) for c in chunked]
        searchtuples = manager.list()
        masterlist = so.indexrestrictions.keys()
        # cross-product: every regex chunk gets paired with every searchable table
        for c in chunked:
            for item in masterlist:
                searchtuples.append((c, item))
        activepoll.allworkis(len(searchtuples))
        if so.usequeue:
            searchtuples = loadsearchqueue([t for t in searchtuples], workers)
        if so.redissearchlist:
            # redis stores bytes: pickle each tuple before loading the list
            ptuples = [pickle.dumps(s) for s in searchtuples]
            buildredissearchlist(ptuples, so.searchid)
        targetfunction = workonsimplelemmasearch
        argumentuple = (founddblineobjects, searchtuples, so)
    elif so.searchtype == 'phrase':
        activepoll.statusis('Executing a phrase search.')
        so.leastcommon = findleastcommonterm(so.termone, so.accented)
        lccount = findleastcommontermcount(so.termone, so.accented)

        # print('least common word in phrase:', lccount, ':', so.leastcommon, so.termone)
        # longestterm = max([len(t) for t in so.termone.split(' ') if t])
        # need to figure out when it will be faster to go to subqueryphrasesearch() and when not to
        # logic + trial and error
        #   e.g., any phrase involving λιποταξίου (e.g., γράψομαι λιποταξίου) can be very fast because that form appears 36x:
        #   you can find it in 1s but if you go through subqueryphrasesearch() you will spend about 17s per full TLG search
        # lccount = -1 if you are unaccented
        #   'if 0 < lccount < 500 or longestterm > 5' got burned badly with 'ἐξ ἀρχῆϲ πρῶτον'
        #   'or (lccount == -1 and longestterm > 6)' would take 1m to find διαφοραϲ ιδεαν via workonphrasesearch()
        #   but the same can be found in 16.45s via subqueryphrasesearch()
        # it looks like unaccented searches are very regularly faster via subqueryphrasesearch()
        #   when is this not true? being wrong about sqs() means spending an extra 10s; being wrong about phs() means an extra 40s...
        if 0 < lccount < 500:
            # print('workonphrasesearch()', searchingfor)
            targetfunction = workonphrasesearch
            argumentuple = (founddblineobjects, listofplacestosearch, so)
        else:
            # print('subqueryphrasesearch()', searchingfor)
            targetfunction = subqueryphrasesearch
            argumentuple = (founddblineobjects, so.termone,
                            listofplacestosearch, so)
    elif so.searchtype == 'proximity':
        activepoll.statusis('Executing a proximity search...')
        if so.lemma or so.proximatelemma:
            pass
        # NOTE(review): 'and' binds tighter than 'or', so this reads as
        # so.accented or (re.search(...) and so.near) — confirm that grouping is intended
        elif so.accented or re.search(r'^[a-z]', so.termone) and so.near:
            # choose the necessarily faster option
            leastcommon = findleastcommonterm(
                unomdifiedskg + ' ' + unmodifiedprx, so.accented)
            if leastcommon != unomdifiedskg:
                tmp = so.termone
                so.termone = so.termtwo
                so.termtwo = tmp
        elif len(so.termtwo) > len(so.termone) and so.near:
            # look for the longest word first since that is probably the quicker route
            # but you can't swap searchingfor and proximate this way in a 'is not near' search without yielding the wrong focus
            tmp = so.termone
            so.termone = so.termtwo
            so.termtwo = tmp
        targetfunction = workonproximitysearch
        argumentuple = (founddblineobjects, listofplacestosearch, so)
    else:
        # impossible, but...
        # NOTE(review): if this branch ever ran, oneconnectionperworker would be
        # an empty dict and oneconnectionperworker[0] below would raise KeyError
        workers = 0

    # non-parallel multiprocessing implementation across platforms: widows can't pickle a connection;
    # everyone else needs to pickle the connection
    if icanpickleconnections():
        # you need to give each job its own connection if you use a connection pool
        # otherwise there will be problems with threading
        # note that we are not yet taking care of connection types: 'autocommit', etc
        oneconnectionperworker = {
            i: ConnectionObject()
            for i in range(workers)
        }
    else:
        # will grab a connection later once inside of 'sfo'
        oneconnectionperworker = {i: None for i in range(workers)}

    # note that the following (when fully implemented...) does not produce speedups
    # operedisconnectionperworker = {i: establishredisconnection() for i in range(workers)}

    argumentswithconnections = [
        tuple([i] + list(argumentuple) + [oneconnectionperworker[i]])
        for i in range(workers)
    ]
    jobs = [
        Process(target=targetfunction, args=argumentswithconnections[i])
        for i in range(workers)
    ]

    for j in jobs:
        j.start()

    for j in jobs:
        j.join()

    if so.redisresultlist:
        foundlineobjects = loadredisresults(so.searchid)
    else:
        # foundlineobjects = [dblineintolineobject(item) for item in founddblineobjects]
        foundlineobjects = list(founddblineobjects)

    # None values mean the workers grabbed (and cleaned up) their own connections
    if oneconnectionperworker[0]:
        for c in oneconnectionperworker:
            oneconnectionperworker[c].connectioncleanup()

    return foundlineobjects
def workonprecomposedsqlsearch(workerid: int, foundlineobjects: ListProxy, listofplacestosearch: ListProxy,
                               searchobject: SearchObject, dbconnection) -> ListProxy:
    """

    worker for precomposedsqlsearchmanager(): iterate through listofplacestosearch,
    execute precomposedsqlsearcher() on each item, and gather the results

    listofplacestosearch elements are dicts and the whole looks like:

        [{'temptable': '', 'query': 'SELECT ...', 'data': ('ὕβριν',)},
        {'temptable': '', 'query': 'SELECT ...', 'data': ('ὕβριν',)} ...]

    this is supposed to give you one query per hipparchiaDB table unless you are lemmatizing

    """

    if not dbconnection:
        # platforms that cannot pickle a connection pass None: open our own
        dbconnection = ConnectionObject()

    so = searchobject
    activepoll = so.poll
    dbconnection.setreadonly(False)
    dbcursor = dbconnection.cursor()

    iterations = 0
    # keep popping work off the front of the shared list until it is drained
    # or the global hit cap has been reached
    while listofplacestosearch and activepoll.gethits() <= so.cap:
        iterations += 1
        dbconnection.checkneedtocommit(iterations)

        try:
            querydict = listofplacestosearch.pop(0)
        except IndexError:
            # another worker emptied the shared list before we could pop
            querydict = None
            listofplacestosearch = None

        if not querydict:
            listofplacestosearch = None
        else:
            foundlines = precomposedsqlsearcher(querydict, dbcursor)
            newlineobjects = [dblineintolineobject(f) for f in foundlines]
            foundlineobjects.extend(newlineobjects)
            if newlineobjects:
                activepoll.addhits(len(newlineobjects))

        try:
            activepoll.remain(len(listofplacestosearch))
        except TypeError:
            # listofplacestosearch was set to None above: nothing left to report
            pass

    if not icanpickleconnections():
        # this was our own private connection: close it down
        dbconnection.connectioncleanup()

    return foundlineobjects