def filterPlaylists(playlist_pickle, index_dir, filterpickle):
    with open(playlist_pickle, 'r') as f:
        playlists = pickle.load(f)
    N = len(playlists)
    filtered_playlists = []
    index = whoosh.index.open_dir(index_dir)
    with index.searcher() as searcher:
        searcher.set_caching_policy(save=False)
        qa = whoosh.qparser.SimpleParser('artist', index.schema)
        qt = whoosh.qparser.SimpleParser('title', index.schema)
        for x in [qa, qt]:
            x.remove_plugin_class(whoosh.qparser.PlusMinusPlugin)
        for (i, P) in enumerate(playlists):
            filtered_playlists.append(filterThisPlaylist(P, searcher, qa, qt))
            if i % 10 == 0:
                print '%5d/%5d' % (i, N)
    with open(filterpickle, 'w') as f:
        pickle.dump(filtered_playlists, f)
    print '%d playlists' % len(filtered_playlists)
    pass
def __init__(self, model_class, primary, index):
    self.model_class = model_class
    self.primary = primary
    self.index = index
    self.searcher = index.searcher()
    fields = set(index.schema._fields.keys()) - set([self.primary])
    self.parser = MultifieldParser(list(fields), index.schema)
def search_tracks(index, title=None, artist=None, num_results=None):
    '''Search an MSD track index'''
    if artist:
        q_artist = whoosh.qparser.QueryParser('artist_name', index.schema).parse(artist)
    if title:
        q_title = whoosh.qparser.QueryParser('title', index.schema).parse(title)
    # Merge the queries
    if title and artist:
        q = whoosh.query.And([q_artist, q_title])
    elif title:
        q = q_title
    elif artist:
        q = q_artist
    else:
        raise ValueError('Invalid query')
    with index.searcher() as search:
        return [dict(item) for item in search.search(q, limit=num_results)]
    # No results
    return []
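# A brief, hedged usage sketch for search_tracks() above. The 'msd_index'
# directory name is a hypothetical placeholder; any index whose schema has
# 'artist_name' and 'title' fields would do.
import whoosh.index

msd_index = whoosh.index.open_dir('msd_index')
for hit in search_tracks(msd_index, title='Under Pressure', artist='Queen', num_results=5):
    print hit['artist_name'], '-', hit['title']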
def search_snippet():
    query = request.args.get('query', None)
    page = int(request.args.get('page', 1))
    if query:
        qp = MultifieldParser(["title", "content", "tag"], schema=index.schema)
        q = qp.parse(query)
    else:
        q = Every()
    response = {"results": [], "total": 0}
    with index.searcher() as searcher:
        results = searcher.search_page(q, page, pagelen=config.SEARCH_PAGINATION, sortedby="title")
        for snippet in results:
            response["results"].append({'id': snippet['id'], 'title': snippet['title']})
        response["total_snippets"] = len(results)
        response["total_pages"] = (len(results) - 1) / config.SEARCH_PAGINATION + 1
        response["current_page"] = page
        return json.dumps(response)
    return json.dumps(response)
def search(cls, search_string, values_of='', group=whoosh.qparser.OrGroup,
           match_substrings=True, limit=None):
    """Searches the fields for given search_string.

    Returns the found records if 'values_of' is left empty, else the values
    of the given columns.

    :param search_string: The string to search for.
    :param values_of: If given, the method will not return the whole records,
                      but only values of given column. Defaults to returning
                      whole records.
    :param group: The whoosh group to use for searching. Defaults to
                  :class:`whoosh.qparser.OrGroup` which searches for all
                  words in all columns.
    :param match_substrings: ``True`` if you want to match substrings,
                             ``False`` otherwise.
    :param limit: The number of the top records to be returned. Defaults to
                  ``None`` and returns all records.
    """
    index = Whooshee.get_or_create_index(_get_app(cls), cls)
    prepped_string = cls.prep_search_string(search_string, match_substrings)
    with index.searcher() as searcher:
        parser = whoosh.qparser.MultifieldParser(cls.schema.names(), index.schema, group=group)
        query = parser.parse(prepped_string)
        results = searcher.search(query, limit=limit)
        if values_of:
            return [x[values_of] for x in results]
        return results
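# A minimal usage sketch for the flask-whooshee search() classmethod above.
# The 'Entry' model and its 'title' column are assumptions for illustration,
# not part of the original snippet.
titles = Entry.search('whoosh index', values_of='title', limit=10)   # just the title values
records = Entry.search('whoosh index')                               # whole records, OR-grouped by default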
def get_tags_for_filepath(self, filepath):
    index = self.open_index()
    with index.searcher() as searcher:
        result = searcher.document(filepath=filepath)
        if result:
            return result['tags']
    return None
def result_iter():
    with fasteners.InterProcessLock(index_lock_path):
        with index.searcher() as searcher:
            yield from searcher.search(
                query,
                limit=None if args.limit < 1 else args.limit,
                sortedby='timestamp')
def kmeans(self, query, index):
    # creating document collection
    # the collection is like ((doc1: (term1: 4),(term2: 5)),(doc2(...)),...)
    doc_collection = dict()
    with index.searcher() as s:
        # Returns a generator of the document numbers for all documents
        for i in s.document_numbers():
            doc_item = list()
            v = s.vector(i, "content")
            for term_item in v.items_as("frequency"):
                doc_item.append(term_item)
            doc_collection[i] = doc_item

    #for i in doc_collection.items():
        #print i

    # Creating document collection only with the query terms' frequency
    doc_query_collection = dict()

    # Parsing the query
    # Returns a set of all terms in this query tree.
    for q_item in query.all_terms(phrases=True):
        #first_elts = [x[1] for x in doc_collection.items()]
        for doc in doc_collection:
            print doc
        #for doc in doc_collection.items():
            ## get the value of the key as q_item,
            ## if it does not exist, return 0
            #if q_item in doc[0]:
                #print q_item
                ##tf = doc.values()[1]
                ##doc_query_collection[doc] = tf

    for i in doc_query_collection.items():
        print i
def __parse_execute(artifact, schema, index, include_filepath):
    """
    Execute the search for the given commit

    :param artifact: the (file name, artifact) tuple to search for
    :param schema: the search schema to use
    :param index: the search index to use
    :param include_filepath: indicator whether to take the 'file name' part of the artifact into account
    :return: a match list of tuples (file name, artifact, message ID)
    """
    log.devinfo("Searching for artifact ({}, {})...".format(artifact[0], artifact[1]))
    result = []
    with index.searcher() as searcher:
        # initialize query parser
        query_parser = QueryParser("content", schema=schema)
        # construct query
        if include_filepath:
            my_query = query_parser.parse(artifact[0] + " AND " + artifact[1])
        else:
            my_query = query_parser.parse(artifact[1])
        # search!
        query_result = searcher.search(my_query, terms=True)
        # construct result from query answer
        for r in query_result:
            result_tuple = (artifact[0], artifact[1], r["messageID"])
            result.append(result_tuple)
    return result
def search(self, trans, search_term, page, page_size, boosts): """ Perform the search on the given search_term :param search_term: unicode encoded string with the search term(s) :returns results: dictionary containing number of hits, hits themselves and matched terms for each """ tool_index_dir = os.path.join(trans.app.config.whoosh_index_dir, 'tools') index_exists = whoosh.index.exists_in(tool_index_dir) if index_exists: index = whoosh.index.open_dir(tool_index_dir) try: # Some literature about BM25F: # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf # http://en.wikipedia.org/wiki/Okapi_BM25 # __Basically__ the higher number the bigger weight. tool_weighting = scoring.BM25F(field_B={ 'name_B' : boosts.tool_name_boost, 'description_B' : boosts.tool_description_boost, 'help_B' : boosts.tool_help_boost, 'repo_owner_username_B' : boosts.tool_repo_owner_username_boost}) searcher = index.searcher(weighting=tool_weighting) parser = MultifieldParser([ 'name', 'description', 'help', 'repo_owner_username'], schema=schema) user_query = parser.parse('*' + search_term + '*') try: hits = searcher.search_page(user_query, page, pagelen=page_size, terms=True) except ValueError: raise ObjectNotFound('The requested page does not exist.') log.debug('searching tools for: #' + str(search_term)) log.debug('total hits: ' + str(len(hits))) log.debug('scored hits: ' + str(hits.scored_length())) results = {} results['total_results'] = str(len(hits)) results['page'] = str(page) results['page_size'] = str(page_size) results['hits'] = [] for hit in hits: hit_dict = {} hit_dict['id'] = hit.get('id') hit_dict['repo_owner_username'] = hit.get('repo_owner_username') hit_dict['repo_name'] = hit.get('repo_name') hit_dict['name'] = hit.get('name') hit_dict['description'] = hit.get('description') matched_terms = {k: unicodify(v) for k, v in hit.matched_terms()} results['hits'].append({'tool': hit_dict, 'matched_terms': matched_terms, 'score': hit.score}) return results finally: searcher.close() else: raise exceptions.InternalServerError('The search index file is missing.')
def get_searcher(self):
    """
    Returns a reference to the index searcher, creating it if necessary.
    """
    if self.index_searcher:
        return self.index_searcher
    index = self.get_index()
    self.index_searcher = index.searcher()
    return self.index_searcher
def __init__(self, mapping, primary, index, session=None):
    self.mapping = mapping
    self.primary = primary
    self.index = index
    self.session = session
    self.searcher = index.searcher()
    fields = set(index.schema._fields.keys()) - set([self.primary])
    self.parser = MultifieldParser(list(fields), index.schema)
def search_db(storage, schema):
    index = storage.open_index(schema=schema)
    searcher = index.searcher()
    parser = QueryParser(content_field_name, schema=schema)
    parsed_query = parser.parse('2020')
    raw_results = searcher.search(parsed_query)
    for hit in raw_results:
        print hit.highlights(content_field_name)
def search_documents(self, query, field="tags"):
    index = self.open_index()
    qp = QueryParser(field, schema=index.schema)
    q = qp.parse(unicode(query))
    with index.searcher() as searcher:
        results = searcher.search(q, limit=None)
        new_results = [hit.fields() for hit in results]
    return new_results
def search( self, trans, search_term, page, page_size, boosts ): """ Perform the search on the given search_term :param search_term: unicode encoded string with the search term(s) :returns results: dictionary containing number of hits, hits themselves and matched terms for each """ tool_index_dir = os.path.join( trans.app.config.whoosh_index_dir, 'tools' ) index_exists = whoosh.index.exists_in( tool_index_dir ) if index_exists: index = whoosh.index.open_dir( tool_index_dir ) try: # Some literature about BM25F: # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf # http://en.wikipedia.org/wiki/Okapi_BM25 # __Basically__ the higher number the bigger weight. tool_weighting = scoring.BM25F( field_B={ 'name_B' : boosts.tool_name_boost, 'description_B' : boosts.tool_description_boost, 'help_B' : boosts.tool_help_boost, 'repo_owner_username_B' : boosts.tool_repo_owner_username_boost } ) searcher = index.searcher( weighting=tool_weighting ) parser = MultifieldParser( [ 'name', 'description', 'help', 'repo_owner_username' ], schema=tool_schema ) user_query = parser.parse( '*' + search_term + '*' ) try: hits = searcher.search_page( user_query, page, pagelen=page_size, terms=True ) except ValueError: raise ObjectNotFound( 'The requested page does not exist.' ) log.debug( 'searching tools for: #' + str( search_term ) ) log.debug( 'total hits: ' + str( len( hits ) ) ) log.debug( 'scored hits: ' + str( hits.scored_length() ) ) results = {} results[ 'total_results'] = str( len( hits ) ) results[ 'page'] = str( page ) results[ 'page_size'] = str( page_size ) results[ 'hits' ] = [] for hit in hits: hit_dict = {} hit_dict[ 'id' ] = hit.get( 'id' ) hit_dict[ 'repo_owner_username' ] = hit.get( 'repo_owner_username' ) hit_dict[ 'repo_name' ] = hit.get( 'repo_name' ) hit_dict[ 'name' ] = hit.get( 'name' ) hit_dict[ 'description' ] = hit.get( 'description' ) results[ 'hits' ].append( {'tool': hit_dict, 'matched_terms': hit.matched_terms(), 'score': hit.score } ) return results finally: searcher.close() else: raise exceptions.InternalServerError( 'The search index file is missing.' )
def search_page(query, indexname=None, page=1, pagelen=20, fields=None, schema=None):
    index = Index.get_index(indexname=indexname, schema=schema)
    q = Index.build_query(index, query, indexname=indexname, fields=fields)
    return index.searcher().search_page(q, page, pagelen=pagelen)
def query(self, query_dict={}, group_by=[]):
    """
    Perform a query against an index.

    query_dict:
        {
            '<field name>': ['<value>', '<value>', ...],
            '<field name>': '<value>',
            ....
        }

    For example:
        {
            'platforms': ['Microsoft Windows NT', 'Microsoft Windows 2000'],
            'products': 'mozilla',
            'contributors': 'Jonathan Baker'
        }
    """
    # update the index
    self.update()

    # construct query by looping through schema fields and adding terms
    query_fields = []
    for field in self.get_fieldnames():
        if field in query_dict and query_dict[field]:
            # get field values as list
            values = query_dict[field]
            if isinstance(values, str):
                values = [values]
            # get a whoosh.query.Term for each value
            field_values = []
            for value in values:
                field_values.append(whoosh.query.Term(field, self.whoosh_escape(value)))
            # OR field values together and add to query_fields list
            query_fields.append(whoosh.query.Or(field_values))

    if query_fields:
        # create query by ANDing query_fields together
        query = query_fields[0] if len(query_fields) == 1 else whoosh.query.And(query_fields)
        #this.message('debug','parsed whoosh query:\n\t{0}'.format(repr(query)))
    else:
        query = whoosh.query.Every()

    # assemble query args
    query_kwargs = {'scored': False, 'sortedby': False, 'terms': False}
    if group_by:
        query_kwargs['groupedby'] = group_by
        query_kwargs['maptype'] = whoosh.sorting.Count

    # run query against index
    index = self.get_index()
    with index.searcher() as index_searcher:
        results = index_searcher.search(query, **query_kwargs)
        if group_by:
            return results.groups().copy()
        else:
            return [result.fields() for result in results]
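# A hedged usage sketch for query() above, reusing the example from its
# docstring; 'idx' stands in for an instance of the surrounding index class
# and is an assumption, not part of the snippet.
hits = idx.query({
    'platforms': ['Microsoft Windows NT', 'Microsoft Windows 2000'],
    'products': 'mozilla',
    'contributors': 'Jonathan Baker',
})
platform_counts = idx.query({'products': 'mozilla'}, group_by=['platforms'])  # facet counts per platform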
def query(query_string):  # pragma: no cover
    """Searches the index and yields matching item_ids.
    """
    with index.searcher() as searcher:
        q = parser.parse(query_string)
        results = searcher.search(q)
        for hit in results:
            yield hit.fields()
def search_artists(index, artist=None, num_results=None):
    '''Search an MSD track index'''
    q = whoosh.qparser.QueryParser('artist_name', index.schema).parse(artist)
    with index.searcher() as search:
        return [dict(item) for item in search.search(q, limit=num_results)]
    # No results
    return []
def search_artists(index, name, num_results=None):
    if isinstance(name, str):
        name = unicode(name, errors='ignore')
    q = whoosh.qparser.QueryParser('artist_name', index.schema).parse(name)
    with index.searcher() as search:
        return [(item.score, dict(item)) for item in search.search(q, limit=num_results)]
    return None
def key_terms(storage, schema):
    index = storage.open_index(schema=schema)
    ixreader = index.reader()
    searcher = index.searcher()
    docnums = []
    KEY_LEN = 500
    DOC_LEN = 1000
    for id in xrange(DOC_LEN):
        docnums.append(id)
    #for id in ixreader.all_doc_ids():
    #    print id,
    terms = {}
    i = 0
    for term, score in searcher.key_terms(docnums, content_field_name, KEY_LEN):
        terms[term] = i
        i += 1
    print 'key_terms finished'
    ar = np.zeros((len(docnums), KEY_LEN))
    for i in xrange(DOC_LEN):
        term_weights = ixreader.vector_as("weight", i, content_field_name)
        all_weight = 0
        n = 0
        for term, weight in term_weights:
            if term in terms:
                ar[i][terms[term]] = weight
            all_weight += weight
            n += 1
        # normalize the row by the document's total term weight
        for j in xrange(KEY_LEN):
            ar[i][j] = ar[i][j] / all_weight
    u, s, v = lin.svd(ar, full_matrices=False)
    data = u[:, 0:100]
    print 'svd finished'
    k = KMeans(init='k-means++', n_init=10)
    k.fit(data)
    #centroids = k.cluster_centers_
    labels = k.labels_
    print 'kmeans finished'
    #af = AffinityPropagation(affinity="euclidean").fit(data)
    #cluster_centers_indices = af.cluster_centers_indices_
    #labels = af.labels_
    doc_arr = np.array(range(DOC_LEN))
    for i in range(np.max(labels)):
        print 'group:', (i + 1)
        for doc_num in doc_arr[labels == i]:
            print ixreader.stored_fields(doc_num).get('id'), ixreader.stored_fields(doc_num).get('title').split('|')[0] + '/',
        print '\n'
def search_index_no_page(index, query, index_name, limits=None, filter=None):
    result = []
    try:
        search_field = {"call": ["name"]}
        searcher = index.searcher()
        mparser = MultifieldParser(search_field[index_name], schema=index.schema)
        q = mparser.parse(query)
        result = searcher.search(q, filter=filter, limit=limits)
    except Exception, e:
        LOG.exception(e)
        result = False
    return result
def test_tamilprefix(self):
    f = open('/home/nanditha/projects/tamilthedal/trunk/src/encyclopedia/utilities/pyunitwildtext')
    cont = f.readline()
    text = cont.split(':')
    index = open_dir(settings.INDEX_PATH)
    wildtext = unicode(str(text[0]), 'utf-8') + u'*'
    qp = query.Wildcard("content", wildtext)
    srch = index.searcher()
    res = srch.search(qp)
    self.assertNotEqual(len(res), 0)
    print len(res), 'results'
def search_for_class(self, query, cls, limit=50, filter=None):
    index = self.indexes[cls.__name__]
    searcher = index.searcher()
    fields = set(index.schema._fields.keys()) - set(["uid"])
    parser = MultifieldParser(list(fields), index.schema)
    facets = sorting.Facets()
    facets.add_field("language")
    facets.add_field("mime_type")
    facets.add_field("creator")
    facets.add_field("owner")
    results = searcher.search(parser.parse(query), groupedby=facets, limit=limit, filter=filter)
    return results
def query(self, query_string: str) -> List[papis.document.Document]:
    self.logger.debug('Query string %s' % query_string)
    index = self.get_index()
    qp = whoosh.qparser.MultifieldParser(['title', 'author', 'tags'], schema=self.get_schema())
    qp.add_plugin(whoosh.qparser.FuzzyTermPlugin())
    query = qp.parse(query_string)
    with index.searcher() as searcher:
        results = searcher.search(query, limit=None)
        self.logger.debug(results)
        documents = [
            papis.document.from_folder(r.get(Database.get_id_key()))
            for r in results]
    return documents
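# Because the parser above adds whoosh.qparser.FuzzyTermPlugin, query strings
# may use the trailing-tilde fuzzy syntax ('term~', or 'term~2' for a maximum
# edit distance of 2). A hedged example; the 'db' instance is an assumption,
# not part of the original snippet.
docs = db.query('author:knuthh~2 tags:algorithms')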
def key_all():
    index = storage.open_index(schema=schema)
    searcher = index.searcher()
    reader = searcher.reader()
    cnt = 0
    filename = 'idf.txt'
    accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+", re.UNICODE)
    file = codecs.open(filename, "w", "utf-8")
    file.write('%s\n' % reader.doc_count_all())
    for term in reader.field_terms('content'):
    #for term in reader.most_frequent_terms('content', 100):
        if not accepted_chars.match(term):
            continue
        term_info = reader.term_info('content', term)
        file.write('%s %d %d\n' % (term, term_info.doc_frequency(), term_info.max_weight()))
    file.close()
def search(query, page=1, per_page=20):
    with index.searcher() as s:
        qp = qparser.MultifieldParser(['title', 'content'], index.schema)
        q = qp.parse(unicode(query))
        try:
            result_page = s.search_page(q, page, pagelen=per_page)
        except ValueError:
            if page == 1:
                return SearchResultPage(None, page)
            return None
        results = result_page.results
        results.highlighter.fragmenter.maxchars = 512
        results.highlighter.fragmenter.surround = 40
        results.highlighter.formatter = highlight.HtmlFormatter(
            'em', classname='search-match', termclass='search-term',
            between=u'<span class=ellipsis> … </span>')
        return SearchResultPage(result_page, page)
def filterPlaylists(playlist_dir, index_dir, filterpickle):
    filtered_playlists = []
    index = whoosh.index.open_dir(index_dir)
    with index.searcher() as searcher:
        qa = whoosh.qparser.QueryParser('artist', index.schema)
        qt = whoosh.qparser.QueryParser('title', index.schema)
        for (i, P) in enumerate(playlistGenerator(playlist_dir)):
            filtered_playlists.append(filterThisPlaylist(P, searcher, qa, qt))
            print '%6d' % i
    with open(filterpickle, 'w') as f:
        pickle.dump(filtered_playlists, f)
    print '%d playlists' % len(filtered_playlists)
    pass
def get_results(_agency, _year, _content_type, user_query):
    global index
    with index.searcher() as searcher:
        agency_in = _agency
        pub_year_in = _year
        content_type_in = _content_type
        user_filter = build_filters(agency_in=agency_in,
                                    content_type_in=content_type_in,
                                    pub_year_in=pub_year_in)
        query = qp.parse(user_query)
        results = searcher.search(query, filter=user_filter)
        new_list = []
        for r in results:
            new_list.append(dict(r))
    return new_list
def search_snippet():
    query = request.args.get('query', None)
    page = int(request.args.get('page', 1))
    if query:
        qp = MultifieldParser(["title", "content", "tag"], schema=index.schema)
        q = qp.parse(query)
    else:
        q = Every()
    response = {"results": [], "total": 0}
    with index.searcher() as searcher:
        results = searcher.search_page(q, page, pagelen=config.SEARCH_PAGINATION, sortedby="title")
        for snippet in results:
            response["results"].append({'id': snippet['id'], 'title': snippet['title']})
        response["total"] = len(results)
        return json.dumps(response)
    return json.dumps(response)
def search_db():
    index = storage.open_index(schema=schema)
    searcher = index.searcher()
    parser = QueryParser(index_fieldname, schema=schema)
    parsed_query = parser.parse('%s:%s' % ('id', qq_id))
    raw_results = searcher.search(parsed_query)
    _, names = get_content('qq7')
    corpus_filename = 'name_qq7'
    terms = {}
    corpus_file = codecs.open(corpus_filename, "r", "utf-8")
    for line in corpus_file:
        tokens = line.split(" ")
        term = tokens[0].strip()
        frequency = int(tokens[1].strip())
        terms[term] = frequency
    n_terms = {}
    corpus_filename = 'dict.txt'
    corpus_file = codecs.open(corpus_filename, "r", "utf-8")
    for line in corpus_file:
        tokens = line.split(" ")
        term = tokens[0].strip()
        if len(tokens) >= 3 and tokens[2].find('n') >= 0:
            n_terms[term] = tokens[2]
        else:
            n_terms[term] = ''
    keys = []
    for keyword, score in raw_results.key_terms(index_fieldname, docs=1000, numterms=240):
        if keyword in names:
            continue
        if terms.has_key(keyword):
            if not n_terms.has_key(keyword):
                keys.append(keyword)
            elif n_terms.has_key(keyword) and n_terms[keyword].find('n') >= 0:
                keys.append(keyword)
        #keys.append(keyword)
    print ', '.join(keys)
def search(query, indexname=None, fields=None, limit=10, discriminators=None,
           exclude_discriminators=None, schema=None):
    index = Index.get_index(indexname=indexname, schema=schema)
    if exclude_discriminators and discriminators:
        q = And([
            Index.build_query(index, query, indexname=indexname, fields=fields),
            Not(Or([Term("discriminator", discriminator)
                    for discriminator in discriminators]))
        ])
    elif discriminators:
        q = And([
            Index.build_query(index, query, indexname=indexname, fields=fields),
            Or([Term("discriminator", discriminator)
                for discriminator in discriminators])
        ])
    else:
        q = Index.build_query(index, query, indexname=indexname, fields=fields)
    return index.searcher().search(q, limit=limit)
def search(qstring, index_folder="index_fullname"):
    index = whoosh.index.open_dir(os.path.join(dirname, index_folder))
    schema = index.schema
    qp = MultifieldParser(["fullname"], schema=schema)
    q = qp.parse(unicode(qstring))
    with index.searcher() as searcher:
        searchResult = searcher.search(q, limit=20)
        # result = {r["fullname"] for r in searchResult}
        ids = [{"rank": index_rank, "id": r["id"]}
               for index_rank, r in enumerate(searchResult)]
        # ids = {r for r in searchResult}
        corrector = searcher.corrector("fullname")
        suggestions = []
        if len(ids) == 0:
            suggestions = corrector.suggest(qstring, limit=6)
            # suggestionResults = {s["fullname"] for suggest in suggestions for s in searcher.search(qp.parse(unicode(suggest)), limit=5)}
            # result = result.union(suggestionResults)
            # ids_suggestion = [s["id"] for suggest in suggestions for s in searcher.search(qp.parse(unicode(suggest)), limit=5)]
            # ids_suggestion = {s for suggest in suggestions for s in searcher.search(qp.parse(unicode(suggest)), limit=5)}
            # ids = ids+ids_suggestion
    return {"ids": list(ids), "suggestions": suggestions}
def search( self, trans, search_term, page, page_size, boosts ): """ Perform the search on the given search_term :param search_term: unicode encoded string with the search term(s) :param boosts: namedtuple containing custom boosts for searchfields, see api/repositories.py :returns results: dictionary containing number of hits, hits themselves and matched terms for each """ whoosh_index_dir = trans.app.config.whoosh_index_dir index_exists = whoosh.index.exists_in( whoosh_index_dir ) if index_exists: index = whoosh.index.open_dir( whoosh_index_dir ) try: # Some literature about BM25F: # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf # http://en.wikipedia.org/wiki/Okapi_BM25 # __Basically__ the higher number the bigger weight. repo_weighting = RepoWeighting( field_B={ 'name_B' : boosts.repo_name_boost, 'description_B' : boosts.repo_description_boost, 'long_description_B' : boosts.repo_long_description_boost, 'homepage_url_B' : boosts.repo_homepage_url_boost, 'remote_repository_url_B' : boosts.repo_remote_repository_url_boost, 'repo_owner_username' : boosts.repo_owner_username_boost } ) searcher = index.searcher( weighting=repo_weighting ) parser = MultifieldParser( [ 'name', 'description', 'long_description', 'homepage_url', 'remote_repository_url', 'repo_owner_username' ], schema=schema ) user_query = parser.parse( '*' + search_term + '*' ) try: hits = searcher.search_page( user_query, page, pagelen=page_size, terms=True ) except ValueError: raise ObjectNotFound( 'The requested page does not exist.' ) log.debug( 'searching for: #' + str( search_term ) ) log.debug( 'total hits: ' + str( len( hits ) ) ) log.debug( 'scored hits: ' + str( hits.scored_length() ) ) results = {} results[ 'total_results'] = str( len( hits ) ) results[ 'page'] = str( page ) results[ 'page_size'] = str( page_size ) results[ 'hits' ] = [] for hit in hits: hit_dict = {} hit_dict[ 'id' ] = trans.security.encode_id( hit.get( 'id' ) ) hit_dict[ 'repo_owner_username' ] = hit.get( 'repo_owner_username' ) hit_dict[ 'name' ] = hit.get( 'name' ) hit_dict[ 'long_description' ] = hit.get( 'long_description' ) hit_dict[ 'remote_repository_url' ] = hit.get( 'remote_repository_url' ) hit_dict[ 'homepage_url' ] = hit.get( 'homepage_url' ) hit_dict[ 'description' ] = hit.get( 'description' ) hit_dict[ 'last_updated' ] = hit.get( 'last_updated' ) hit_dict[ 'full_last_updated' ] = hit.get( 'full_last_updated' ) hit_dict[ 'approved' ] = hit.get( 'approved' ) hit_dict[ 'times_downloaded' ] = hit.get( 'times_downloaded' ) results[ 'hits' ].append( {'repository': hit_dict, 'matched_terms': hit.matched_terms(), 'score': hit.score } ) return results finally: searcher.close() else: raise exceptions.InternalServerError( 'The search index file is missing.' )
    searcher - whoosh searcher (create with index.searcher() then close it yourself)
    schema - whoosh schema (index.schema)
    artist - Artist name
    title - Song name
    threshold - Score threshold for a match

    Output:
        best_match - best match for the search, or None if no match
    '''
    arparser = whoosh.qparser.QueryParser('artist', schema)
    tiparser = whoosh.qparser.QueryParser('title', schema)
    q = whoosh.query.And([arparser.parse(unicode(artist, encoding='utf-8')),
                          tiparser.parse(unicode(title, encoding='utf-8'))])
    results = searcher.search(q)
    result = None
    if len(results) > 0:
        r = results[0]
        if r.score > threshold:
            result = [r['track_id'], r['artist'], r['title']]
    return result

# <codecell>

if __name__ == '__main__':
    import os
    if not os.path.exists('Whoosh Indices/cal10k_index/'):
        create_index('Whoosh Indices/cal10k_index/',
                     get_cal10k_list('File Lists/EchoNestTrackIDs.tab'))
    index = get_whoosh_index('Whoosh Indices/cal10k_index/')
    print search(index.searcher(), index.schema, 'queen', 'under pressure')
#!/usr/bin/env python

import whoosh, whoosh.index, whoosh.qparser
import sys
import pprint

if __name__ == '__main__':
    index = whoosh.index.open_dir(sys.argv[1])
    with index.searcher() as search:
        q = whoosh.qparser.MultifieldParser(['title', 'artist'], index.schema).parse(unicode(' '.join(sys.argv[2:])))
        results = search.search(q, terms=True)
        for r in results:
            pprint.pprint(r)
            pprint.pprint(r.matched_terms())
            print '---'
# qry_flaclist_index.py
#
# Query the index built by mk_flaclist_index.py
# 2014-03-14 Dan Ellis [email protected]

# Querying
import whoosh, whoosh.index, whoosh.qparser

indexdir = 'WCDindexdir'
index = whoosh.index.open_dir(indexdir)
search = index.searcher()
arparser = whoosh.qparser.QueryParser('artist', index.schema)
alparser = whoosh.qparser.QueryParser('album', index.schema)
tiparser = whoosh.qparser.QueryParser('title', index.schema)

# One example query
#artist = u'Darrell Scott'
#album = u'Transatlantic Sessions - Series 3: Volume One'
#title = u'Shattered Cross'
#
#qry = whoosh.query.And([arparser.parse(artist), alparser.parse(album), tiparser.parse(title)])
#results = search.search(qry)
#
#if len(results) == 0:
#    # drop the album
#    qry = whoosh.query.And([arparser.parse(artist), tiparser.parse(title)])
#    results = search.search(qry)
#
#import pprint
#for r in results:
#    pprint.pprint(r)
def search(self, trans, search_term, page, **kwd): """ Perform the search on the given search_term :param search_term: unicode encoded string with the search term(s) :returns results: dictionary containing number of hits, hits themselves and matched terms for each """ if search_ready: toolshed_whoosh_index_dir = trans.app.config.toolshed_whoosh_index_dir index_exists = whoosh.index.exists_in(toolshed_whoosh_index_dir) if index_exists: index = whoosh.index.open_dir(toolshed_whoosh_index_dir) try: # Some literature about BM25F: # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf # http://en.wikipedia.org/wiki/Okapi_BM25 # __Basically__ the higher number the bigger weight. repo_weighting = RepoWeighting( field_B={ 'name_B': 0.9, 'description_B': 0.6, 'long_description_B': 0.5, 'homepage_url_B': 0.3, 'remote_repository_url_B': 0.2, 'repo_owner_username': 0.3 }) searcher = index.searcher(weighting=repo_weighting) parser = MultifieldParser([ 'name', 'description', 'long_description', 'homepage_url', 'remote_repository_url', 'repo_owner_username' ], schema=schema) user_query = parser.parse('*' + search_term + '*') hits = searcher.search_page(user_query, page, pagelen=10, terms=True) log.debug('searching for: #' + str(search_term)) log.debug('total hits: ' + str(len(hits))) log.debug('scored hits: ' + str(hits.scored_length())) results = {} results['total_results'] = str(len(hits)) results['hits'] = [] for hit in hits: hit_dict = {} hit_dict['id'] = trans.security.encode_id( hit.get('id')) hit_dict['repo_owner_username'] = hit.get( 'repo_owner_username') hit_dict['name'] = hit.get('name') hit_dict['long_description'] = hit.get( 'long_description') hit_dict['remote_repository_url'] = hit.get( 'remote_repository_url') hit_dict['homepage_url'] = hit.get('homepage_url') hit_dict['description'] = hit.get('description') hit_dict['last_updated'] = hit.get('last_updated') hit_dict['full_last_updated'] = hit.get( 'full_last_updated') hit_dict['approved'] = hit.get('approved') hit_dict['times_downloaded'] = hit.get( 'times_downloaded') results['hits'].append({ 'repository': hit_dict, 'matched_terms': hit.matched_terms(), 'score': hit.score }) return results finally: searcher.close() else: raise exceptions.InternalServerError( 'The search index file is missing.') else: raise exceptions.InternalServerError( 'Could not initialize search.')
def __init__(self, primary, index):
    self.primary_key_name = primary
    self._index = index
    self.searcher = index.searcher()
    self._all_fields = list(set(index.schema._fields.keys()) - set([self.primary_key_name]))
def search(self, trans, search_term, page, page_size, boosts): """ Perform the search on the given search_term :param search_term: unicode encoded string with the search term(s) :param boosts: namedtuple containing custom boosts for searchfields, see api/repositories.py :returns results: dictionary containing number of hits, hits themselves and matched terms for each """ whoosh_index_dir = trans.app.config.whoosh_index_dir index_exists = whoosh.index.exists_in(whoosh_index_dir) if index_exists: index = whoosh.index.open_dir(whoosh_index_dir) try: # Some literature about BM25F: # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf # http://en.wikipedia.org/wiki/Okapi_BM25 # __Basically__ the higher number the bigger weight. repo_weighting = RepoWeighting( field_B={ "name_B": boosts.repo_name_boost, "description_B": boosts.repo_description_boost, "long_description_B": boosts.repo_long_description_boost, "homepage_url_B": boosts.repo_homepage_url_boost, "remote_repository_url_B": boosts.repo_remote_repository_url_boost, "repo_owner_username": boosts.repo_owner_username_boost, } ) searcher = index.searcher(weighting=repo_weighting) parser = MultifieldParser( [ "name", "description", "long_description", "homepage_url", "remote_repository_url", "repo_owner_username", ], schema=schema, ) user_query = parser.parse("*" + search_term + "*") try: hits = searcher.search_page(user_query, page, pagelen=page_size, terms=True) except ValueError: raise ObjectNotFound("The requested page does not exist.") log.debug("searching for: #" + str(search_term)) log.debug("total hits: " + str(len(hits))) log.debug("scored hits: " + str(hits.scored_length())) results = {} results["total_results"] = str(len(hits)) results["page"] = str(page) results["page_size"] = str(page_size) results["hits"] = [] for hit in hits: hit_dict = {} hit_dict["id"] = trans.security.encode_id(hit.get("id")) hit_dict["repo_owner_username"] = hit.get("repo_owner_username") hit_dict["name"] = hit.get("name") hit_dict["long_description"] = hit.get("long_description") hit_dict["remote_repository_url"] = hit.get("remote_repository_url") hit_dict["homepage_url"] = hit.get("homepage_url") hit_dict["description"] = hit.get("description") hit_dict["last_updated"] = hit.get("last_updated") hit_dict["full_last_updated"] = hit.get("full_last_updated") hit_dict["approved"] = hit.get("approved") hit_dict["times_downloaded"] = hit.get("times_downloaded") results["hits"].append( {"repository": hit_dict, "matched_terms": hit.matched_terms(), "score": hit.score} ) return results finally: searcher.close() else: raise exceptions.InternalServerError("The search index file is missing.")
import whoosh.index as index
from whoosh import columns, fields, index, sorting
from whoosh.qparser import QueryParser

# ix = index.open_dir("./")
# facet = sorting.FieldFacet("id", reverse=True)
# searcher = ix.searcher()
#
# searchwords = "新西兰"
# qp = QueryParser("gtitle", schema=ix.schema)
# q = qp.parse(searchwords)
# results = searcher.search(q, sortedby=facet)
# for each in results:
#     print(each)

from whoosh.qparser import QueryParser
from whoosh.index import open_dir
from whoosh.sorting import FieldFacet

new_list = []
index = open_dir("./index/", indexname='goods')  # open the prebuilt index
with index.searcher() as searcher:
    parser = QueryParser("gtitle", index.schema)  # the field to search, e.g. "phone_name"
    myquery = parser.parse("鸭蛋")
    facet = FieldFacet("id", reverse=True)  # sort the search results by id, descending
    results = searcher.search(myquery, limit=None, sortedby=facet)  # limit caps the number of hits; default is 10, see the official docs linked at the start of the post
    for result1 in results:
        print(dict(result1))
        new_list.append(dict(result1))
def itersearchindex(index_or_dirname, query, limit, pagenum, pagelen, indexname,
                    docnum_field, score_field, fieldboosts, search_kwargs):
    import whoosh.index
    import whoosh.query
    import whoosh.qparser

    if not search_kwargs:
        search_kwargs = dict()

    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname, indexname=indexname, readonly=True)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r' % index_or_dirname)

    try:
        # figure out header
        hdr = tuple()
        if docnum_field is not None:
            hdr += (docnum_field,)
        if score_field is not None:
            hdr += (score_field,)
        stored_names = tuple(index.schema.stored_names())
        hdr += stored_names
        yield hdr

        # parse the query
        if isinstance(query, string_types):
            # search all fields by default
            parser = whoosh.qparser.MultifieldParser(index.schema.names(),
                                                     index.schema,
                                                     fieldboosts=fieldboosts)
            query = parser.parse(query)
        elif isinstance(query, whoosh.query.Query):
            pass
        else:
            raise ArgumentError('expected string or whoosh.query.Query, found %r' % query)

        # make a function to turn docs into tuples
        astuple = operator.itemgetter(*index.schema.stored_names())

        with index.searcher() as searcher:
            if limit is not None:
                results = searcher.search(query, limit=limit, **search_kwargs)
            else:
                results = searcher.search_page(query, pagenum, pagelen=pagelen, **search_kwargs)

            if docnum_field is None and score_field is None:
                for doc in results:
                    yield astuple(doc)
            else:
                for (docnum, score), doc in izip(results.items(), results):
                    row = tuple()
                    if docnum_field is not None:
                        row += (docnum,)
                    if score_field is not None:
                        row += (score,)
                    row += astuple(doc)
                    yield row

    except:
        raise
    finally:
        if needs_closing:
            # close the index if we're the ones who opened it
            index.close()


# TODO guess schema
def search(self, q, index='default', fields=None, Models=(), object_types=(), prefix=True, facet_by_type=None, **search_args): """Interface to search indexes. :param q: unparsed search string. :param index: name of index to use for search. :param fields: optionnal mapping of field names -> boost factor? :param Models: list of Model classes to limit search on. :param object_types: same as `Models`, but directly the model string. :param prefix: enable or disable search by prefix :param facet_by_type: if set, returns a dict of object_type: results with a max of `limit` matches for each type. :param search_args: any valid parameter for :meth:`whoosh.searching.Search.search`. This includes `limit`, `groupedby` and `sortedby` """ index = self.app_state.indexes[index] if not fields: fields = self.default_search_fields valid_fields = { f for f in index.schema.names(check_names=fields) if prefix or not f.endswith('_prefix') } for invalid in set(fields) - valid_fields: del fields[invalid] parser = DisMaxParser(fields, index.schema) query = parser.parse(q) filters = search_args.setdefault('filter', None) filters = [filters] if filters is not None else [] del search_args['filter'] if not hasattr(g, 'is_manager') or not g.is_manager: # security access filter user = current_user roles = {indexable_role(user)} if not user.is_anonymous: roles.add(indexable_role(Anonymous)) roles.add(indexable_role(Authenticated)) roles |= {indexable_role(r) for r in security.get_roles(user)} filter_q = wq.Or( [wq.Term('allowed_roles_and_users', role) for role in roles], ) filters.append(filter_q) object_types = set(object_types) for m in Models: object_type = m.entity_type if not object_type: continue object_types.add(object_type) if object_types: object_types &= self.app_state.indexed_fqcn else: # ensure we don't show content types previously indexed but not yet # cleaned from index object_types = self.app_state.indexed_fqcn # limit object_type filter_q = wq.Or([wq.Term('object_type', t) for t in object_types]) filters.append(filter_q) for func in self.app_state.search_filter_funcs: filter_q = func() if filter_q is not None: filters.append(filter_q) if filters: filter_q = wq.And(filters) if len(filters) > 1 else filters[0] # search_args['filter'] = filter_q query = filter_q & query if facet_by_type: if not object_types: object_types = [t[0] for t in self.searchable_object_types()] # limit number of documents to score, per object type collapse_limit = 5 search_args['groupedby'] = 'object_type' search_args['collapse'] = 'object_type' search_args['collapse_limit'] = collapse_limit search_args['limit'] = (search_args['collapse_limit'] * max( len(object_types), 1, )) with index.searcher(closereader=False) as searcher: # 'closereader' is needed, else results cannot by used outside 'with' # statement results = searcher.search(query, **search_args) if facet_by_type: positions = { doc_id: pos for pos, doc_id in enumerate(i[1] for i in results.top_n) } sr = results results = {} for typename, doc_ids in sr.groups('object_type').items(): results[typename] = [ sr[positions[oid]] for oid in doc_ids[:collapse_limit] ] return results
def search(self, trans, search_term, page, page_size, boosts): """ Perform the search on the given search_term :param search_term: unicode encoded string with the search term(s) :param boosts: namedtuple containing custom boosts for searchfields, see api/repositories.py :param page_size: integer defining a length of one page :param page: integer with the number of page requested :returns results: dictionary containing hits themselves and the hits summary """ log.debug('raw search query: #' + str(search_term)) lower_search_term = search_term.lower() allow_query, search_term_without_filters = self._parse_reserved_filters( lower_search_term) log.debug('term without filters: #' + str(search_term_without_filters)) whoosh_index_dir = trans.app.config.whoosh_index_dir index_exists = whoosh.index.exists_in(whoosh_index_dir) if index_exists: index = whoosh.index.open_dir(whoosh_index_dir) try: # Some literature about BM25F: # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf # http://en.wikipedia.org/wiki/Okapi_BM25 # __Basically__ the higher number the bigger weight. repo_weighting = RepoWeighting( field_B={ 'name_B': boosts.repo_name_boost, 'description_B': boosts.repo_description_boost, 'long_description_B': boosts.repo_long_description_boost, 'homepage_url_B': boosts.repo_homepage_url_boost, 'remote_repository_url_B': boosts.repo_remote_repository_url_boost, 'repo_owner_username_B': boosts.repo_owner_username_boost, 'categories_B': boosts.categories_boost }) searcher = index.searcher(weighting=repo_weighting) parser = MultifieldParser([ 'name', 'description', 'long_description', 'homepage_url', 'remote_repository_url', 'repo_owner_username', 'categories' ], schema=schema) # If user query has just filters prevent wildcard search. if len(search_term_without_filters) < 1: user_query = Every('name') sortedby = 'name' else: user_query = parser.parse('*' + search_term_without_filters + '*') sortedby = '' try: hits = searcher.search_page(user_query, page, pagelen=page_size, filter=allow_query, terms=True, sortedby=sortedby) log.debug('total hits: ' + str(len(hits))) log.debug('scored hits: ' + str(hits.scored_length())) except ValueError: raise ObjectNotFound('The requested page does not exist.') results = {} results['total_results'] = str(len(hits)) results['page'] = str(page) results['page_size'] = str(page_size) results['hits'] = [] for hit in hits: log.debug('matched terms: ' + str(hit.matched_terms())) hit_dict = {} hit_dict['id'] = trans.security.encode_id(hit.get('id')) hit_dict['repo_owner_username'] = hit.get( 'repo_owner_username') hit_dict['name'] = hit.get('name') hit_dict['long_description'] = hit.get('long_description') hit_dict['remote_repository_url'] = hit.get( 'remote_repository_url') hit_dict['homepage_url'] = hit.get('homepage_url') hit_dict['description'] = hit.get('description') hit_dict['last_updated'] = hit.get('last_updated') hit_dict['full_last_updated'] = hit.get( 'full_last_updated') hit_dict['repo_lineage'] = hit.get('repo_lineage') hit_dict['categories'] = hit.get('categories') hit_dict['approved'] = hit.get('approved') hit_dict['times_downloaded'] = hit.get('times_downloaded') results['hits'].append({ 'repository': hit_dict, 'score': hit.score }) return results finally: searcher.close() else: raise exceptions.InternalServerError( 'The search index file is missing.')
def get_snippet_by_id(snippet_id):
    return index.searcher().document(id=snippet_id)