def paredowntowithinxwords(so: SearchObject, firstterm: str, secondterm: str, hitlines: List[dbWorkLine]) -> List[dbWorkLine]: """ pare down hitlines finds to within words finds """ so.poll.sethits(0) dbconnection = ConnectionObject() dbcursor = dbconnection.cursor() fullmatches = list() commitcount = 0 while hitlines and len(fullmatches) < so.cap: commitcount += 1 if commitcount == hipparchia.config['MPCOMMITCOUNT']: dbconnection.commit() commitcount = 0 hit = hitlines.pop() leadandlag = grableadingandlagging(hit, so, dbcursor, firstterm) # debugmessage('leadandlag for {h}: {l}'.format(h=hit.uniqueid, l=leadandlag)) lagging = leadandlag['lag'] leading = leadandlag['lead'] if so.near and (re.search(secondterm, leading) or re.search(secondterm, lagging)): fullmatches.append(hit) so.poll.addhits(1) elif not so.near and not re.search( secondterm, leading) and not re.search(secondterm, lagging): fullmatches.append(hit) so.poll.addhits(1) dbconnection.connectioncleanup() return fullmatches
def precomposedsqlphrasesearch(so: SearchObject) -> List[dbWorkLine]: """ you are searching for a relatively rare word: we will keep things simple-ish note that the second half of this is not MP: but searches already only take 6s; so clean code probably wins here FIXME: can't find the phrases in here...: κατεϲκεύαϲεν τὸ ἐνϲόριον FAILS ϲεν τὸ ἐνϲόριον το SUCCEEDS ch0005w001/2749 1 Ῥουφεῖνα Ἰουδαία ἀρχι- 2 ϲυνάγωγοϲ κατεϲκεύα- 3 ϲεν τὸ ἐνϲόριον τοῖϲ ἀπε- ( match: ἀπελευθέροιϲ ) 4 λευθέροιϲ καὶ θρέμ(μ)αϲιν 5 μηδενὸϲ ἄλ(λ)ου ἐξουϲίαν ἔ- actually, this is a BUILDER problem AND a SERVER problem: BUILDER: 2749 does not have κατεϲκεύαϲεν in it hipparchiaDB=# select index, accented_line, hyphenated_words from ch0005 where index between 2746 and 2752; index | accented_line | hyphenated_words -------+-----------------------------------+------------------ 2748 | ῥουφεῖνα ἰουδαία ἀρχιϲυνάγωγοϲ | ἀρχιϲυνάγωγοϲ 2749 | κατεϲκεύα- | 2750 | ϲεν τὸ ἐνϲόριον τοῖϲ ἀπελευθέροιϲ | ἀπελευθέροιϲ 2751 | καὶ θρέμμαϲιν | 2752 | μηδενὸϲ ἄλλου ἐξουϲίαν ἔχοντοϲ | ἔχοντοϲ (5 rows) SERVER: ἀπελευθέροιϲ καὶ θρέμμαϲιν is missed by precomposedsqlphrasesearch() but it is found by precomposedsqlsubqueryphrasesearch() maybe it is time to nuke precomposedsqlphrasesearch() after all... NB: the dynamic workonphrasesearch() CAN find 'ἀπελευθέροιϲ καὶ θρέμμαϲιν' """ debugmessage('executing a precomposedsqlphrasesearch()') so.termone = so.leastcommon searchphrase = so.phrase phraselen = len(searchphrase.split(' ')) initialhitlines = generatepreliminaryhitlist(so) m = 'Now searching among the {h} initial hits for the full phrase "{p}"' so.poll.statusis(m.format(h=so.poll.gethits(), p=so.originalseeking)) so.poll.sethits(0) fullmatches = list() dbconnection = ConnectionObject() dbcursor = dbconnection.cursor() commitcount = 0 while initialhitlines and len(fullmatches) <= so.cap: commitcount += 1 if commitcount == hipparchia.config['MPCOMMITCOUNT']: dbconnection.commit() commitcount = 0 hit = initialhitlines.pop() wordset = lookoutsideoftheline(hit.index, phraselen - 1, hit.authorid, so, dbcursor) if not so.accented: wordset = re.sub(r'[.?!;:,·’]', str(), wordset) else: # the difference is in the apostrophe: δ vs δ’ wordset = re.sub(r'[.?!;:,·]', str(), wordset) if so.near and re.search(searchphrase, wordset): fullmatches.append(hit) so.poll.addhits(1) elif not so.near and re.search(searchphrase, wordset) is None: fullmatches.append(hit) so.poll.addhits(1) dbconnection.connectioncleanup() return fullmatches