def search(self, q, stemmed, syn):
    """Run a multifield query against the on-disk Whoosh index.

    Parameters:
        q: raw query string (tokens separated by single spaces).
        stemmed: when truthy, merge in German- and French-stemmed results.
        syn: when truthy, expand every token into an OR-group of its
            German synonyms before parsing.

    Returns:
        A Whoosh Results object (possibly upgraded/extended with the
        stemmed searches).
    """
    path = os.path.dirname(__file__)
    ix = open_dir(os.path.join(path, "index"))
    parser = MultifieldParser(["title", "content"], ix.schema)
    searcher = ix.searcher()
    # Custom operator syntax: & = AND, \| = OR, ! = ANDNOT.
    cp = qparser.OperatorsPlugin(And="&", Or=r"\|", AndNot="!")
    parser.replace_plugin(cp)
    if syn:
        # Rebuild the query token-by-token instead of str.replace():
        # the old replace-based loop could clobber text inside an
        # already-expanded synonym group (or inside a longer word).
        expanded = []
        for w in q.split(" "):
            synonyms = self.getGermanSynonyms(w)
            expanded.append("(" + r"\|".join([w] + list(synonyms)) + ")")
        q = " ".join(expanded)
    searchQuery = parser.parse(q)
    allresults = searcher.search(searchQuery, limit=None)
    if stemmed:
        # Stemmed hits are merged so a stemmed match upgrades the score
        # of an existing hit instead of duplicating it.
        results = self.searchGermanStemmed(q, ix)
        allresults.upgrade_and_extend(results)
        results = self.searchFrenchStemmed(q, ix)
        allresults.upgrade_and_extend(results)
    print(searchQuery)
    return allresults
def lookup(self, term, fuzzy=False, limit=None):
    """Look a term up across all chip fields, retrying fuzzily on a miss.

    Parameters:
        term: search string; normalized to lowercase with surrounding
            whitespace stripped.
        fuzzy: use FuzzyTerm matching instead of exact terms.
        limit: max hits to return; falls back to self.RESULTS_LIMIT.

    Returns:
        A Whoosh Results object, or an empty list when the search
        raised IndexError.
    """
    term = term.strip().lower()
    # Falsy limit (None/0) falls back to the class default.
    if not limit:
        limit = self.RESULTS_LIMIT
    fields = (
        'indice', 'indice_game', 'name', 'name_jp', 'game',
        'version', 'classification', 'element', 'code', 'size',
        'damage_min', 'damage_max', 'recovery', 'rarity'
    )
    if fuzzy:
        parser = MultifieldParser(
            fields, schema=self.index.schema, termclass=FuzzyTerm
        )
    else:
        parser = MultifieldParser(fields, schema=self.index.schema)
    # Custom operator syntax: & AND, \| OR, &! ANDNOT, &~ ANDMAYBE, \- NOT.
    operators = OperatorsPlugin(
        And="&", Or=r"\|", AndNot="&!", AndMaybe="&~", Not=r"\-"
    )
    parser.replace_plugin(operators)
    query = parser.parse(term)
    results = []
    try:
        searcher = self.index.searcher()
        results = searcher.search(query, limit=limit)
        if not results and not fuzzy:
            # No exact hits — retry once with fuzzy matching.
            return self.lookup(term, fuzzy=True, limit=self.FUZZY_LIMIT)
    except IndexError:
        # Deliberate best-effort: an empty list is returned on failure.
        pass
    return results
def searchFrenchStemmed(self, q, ix):
    """Search the index with a French-stemmed version of query *q*.

    Parameters:
        q: raw query string.
        ix: an open Whoosh index whose schema contains the stemmed fields.

    Returns:
        A Whoosh Results object for the stemmed query (unlimited).
    """
    stemmer = snowballstemmer.stemmer('french')
    parser = MultifieldParser(
        ["title", "titleStemmed", "contentStemmed", "content"], ix.schema
    )
    # str(), not the Python-2-only unicode() builtin, which raises
    # NameError on Python 3 (the original code used unicode(q)).
    queryStemmedList = stemmer.stemWords(str(q).split())
    q = "".join(w + " " for w in queryStemmedList)
    searcher = ix.searcher()
    # Same custom operator syntax as search(): & AND, \| OR, ! ANDNOT.
    cp = qparser.OperatorsPlugin(And="&", Or=r"\|", AndNot="!")
    parser.replace_plugin(cp)
    searchQuery = parser.parse(q)
    print(searchQuery)
    results = searcher.search(searchQuery, limit=None)
    return results
def ranked_search(query):
    """Search the report index with AND, OR, and ANDMAYBE variants of
    *query* and return the merged, de-duplicated ranked results.

    Parameters:
        query: raw query string from the user.

    Returns:
        dict with keys:
            'results': list of (Report, score) tuples, duplicates removed.
            'hits': per-document matched-term stats from the OR search.
    """
    ix = open_dir("indexdir")
    # NOTE(review): the original also created ix.writer() here without
    # ever using it — that acquires the index write lock and leaks it,
    # so it has been removed.
    with ix.searcher(weighting=scoring.BM25F(0.75, 1.2)) as searcher:
        qp = MultifieldParser(
            ["description", "full_description", "file_text",
             "reporter", "date"],
            ix.schema
        )
        # Custom operators: & AND, \| OR, &~ ANDMAYBE; NOT disabled.
        cp = OperatorsPlugin(And="&", Or=r"\|", AndMaybe="&~", Not=None)
        qp.replace_plugin(cp)
        qAND = qp.parse(query, normalize=True)
        qOR = qp.parse(queryOR(query), normalize=True)
        qAM = qp.parse(queryAndMaybe(query), normalize=True)
        # Each search returns a Results object (a list of hits).
        qAND_docs = searcher.search(qAND, limit=None, terms=True)
        qOR_docs = searcher.search(qOR, limit=None, terms=True)
        qAM_docs = searcher.search(qAM, limit=None, terms=True)
        # Tweet-sized highlight fragments: 160 chars max, 20 chars of
        # surrounding context (default was 20 chars max).
        for docs in (qAND_docs, qOR_docs, qAM_docs):
            docs.fragmenter.maxchars = 160
            docs.fragmenter.surround = 20
        # Matched-term stats come from the OR results only, which is
        # enough to simplify display.
        hits = get_stats(qOR_docs)
        results = []
        # Bug fix: the original first loop ranged over len(qAND_docs)
        # but indexed qOR_docs[i] — wrong documents, and a potential
        # IndexError whenever the AND set outgrew the OR set. Iterate
        # each result set directly instead.
        for docs in (qAND_docs, qOR_docs, qAM_docs):
            for hit in docs:
                report = Report.objects.get(pk=int(hit.fields()['report_id']))
                results.append((report, hit.score))
        # Scoring order is preserved; duplicates across the three
        # result sets are collapsed here.
        results = removeDuplicates(results)
        results_and_info = {
            'results': results,
            'hits': hits,
        }
        return results_and_info