def test_fieldboost():
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=0, a=u("alfa bravo charlie"), b=u("echo foxtrot india"))
    w.add_document(id=1, a=u("delta bravo charlie"), b=u("alfa alfa alfa"))
    w.add_document(id=2, a=u("alfa alfa alfa"), b=u("echo foxtrot india"))
    w.add_document(id=3, a=u("alfa sierra romeo"), b=u("alfa tango echo"))
    w.add_document(id=4, a=u("bravo charlie delta"), b=u("alfa foxtrot india"))
    w.add_document(id=5, a=u("alfa alfa echo"), b=u("tango tango tango"))
    w.add_document(id=6, a=u("alfa bravo echo"), b=u("alfa alfa tango"))
    w.commit()

    def field_booster(fieldname, factor=2.0):
        "Returns a function which will boost the given field in a query tree"
        def booster_fn(obj):
            if obj.is_leaf() and obj.field() == fieldname:
                obj = copy.deepcopy(obj)
                obj.boost *= factor
                return obj
            else:
                return obj
        return booster_fn

    with ix.searcher() as s:
        q = Or([Term("a", u("alfa")), Term("b", u("alfa"))])
        q = q.accept(field_booster("a", 100.0))
        assert_equal(text_type(q), text_type("(a:alfa^100.0 OR b:alfa)"))
        r = s.search(q)
        assert_equal([hit["id"] for hit in r], [2, 5, 6, 3, 0, 1, 4])
def test_regular_or():
    ix = get_index()
    with ix.searcher() as s:
        oq = Or([Term("text", "bravo"), Term("text", "alfa")])
        m = oq.matcher(s)
        while m.is_active():
            orig = s.stored_fields(m.id())["text"]
            for span in m.spans():
                v = orig[span.start]
                assert v == "bravo" or v == "alfa"
            m.next()
def matcher(self, searcher, context=None):
    fieldname = self.fieldname
    constantscore = self.constantscore

    reader = searcher.reader()
    qs = [Term(fieldname, word) for word in self._btexts(reader)]
    if not qs:
        return matching.NullMatcher()

    if len(qs) == 1:
        # If there's only one term, just use it
        q = qs[0]
    elif constantscore or len(qs) > self.TOO_MANY_CLAUSES:
        # If there's so many clauses that an Or search would take forever,
        # trade memory for time and just find all the matching docs and
        # serve them as one ListMatcher
        fmt = searcher.schema[fieldname].format
        doc_to_values = defaultdict(list)
        doc_to_weights = defaultdict(float)
        for q in qs:
            m = q.matcher(searcher)
            while m.is_active():
                docnum = m.id()
                doc_to_values[docnum].append(m.value())
                if not constantscore:
                    doc_to_weights[docnum] += m.weight()
                m.next()

        docnums = sorted(doc_to_values.keys())
        # This is a list of lists of value strings -- ListMatcher will
        # actually do the work of combining multiple values if the user
        # asks for them
        values = [doc_to_values[docnum] for docnum in docnums]

        kwargs = {"values": values, "format": fmt}
        if constantscore:
            kwargs["all_weights"] = self.boost
        else:
            kwargs["weights"] = [doc_to_weights[docnum] for docnum in docnums]

        # return matching.ListMatcher(docnums, term=term, **kwargs)
        return matching.ListMatcher(docnums, **kwargs)
    else:
        # The default case: Or the terms together
        from whoosh.query import Or
        q = Or(qs)

    m = q.matcher(searcher, context)
    # m = matching.SingleTermMatcher(m, term)
    return m
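# A minimal reading of the expansion above (hypothetical field and terms): when the
# number of expanded terms is small, this multiterm query behaves like a plain Or of
# Term queries, so
#
#   q = Or([Term("name", "alfa"), Term("name", "alpha")])
#   m = q.matcher(searcher)
#
# is what the default branch builds; the ListMatcher branch only kicks in for
# constant-score queries or when len(qs) exceeds TOO_MANY_CLAUSES.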
def parse(self, input, normalize=True):
    reqs, opts, nots, phrase = self._sort(self._split(input))
    make_clause = self.make_clause
    make_filter_clause = self.make_filter_clause

    reqs = [make_clause(text) for text in reqs]
    opts = [make_clause(text) for text in opts]
    nots = [make_filter_clause(text) for text in nots]

    pctmatch = int((len(reqs) + len(opts)) * self.minpercent) - len(reqs)
    minmatch = max(pctmatch, self.minmatch - len(reqs), 0)

    q = Or(opts, minmatch=minmatch)
    if reqs:
        q = AndMaybe(And(reqs), q)
    if nots:
        q = AndNot(q, Or(nots))

    if normalize:
        q = q.normalize()
    return q
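# A worked example of the minmatch arithmetic above, assuming (hypothetically)
# minpercent=0.5, a configured minmatch of 1, two required clauses and four
# optional clauses:
minpercent, configured_minmatch = 0.5, 1                    # hypothetical parser settings
n_reqs, n_opts = 2, 4                                       # hypothetical clause counts
pctmatch = int((n_reqs + n_opts) * minpercent) - n_reqs     # = 1
minmatch = max(pctmatch, configured_minmatch - n_reqs, 0)   # = 1
# so the optional clauses become Or(opts, minmatch=1): a document must match at least
# one optional term before AndMaybe/AndNot wrap the required and negated parts.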
def search_whoosh_index_headline(query, paths):
    if not paths:
        return []
    ix = get_whoosh_index()
    parser = MultifieldParser(['content', 'title', 'abstract'], ix.schema)
    q = parser.parse(query)
    allow_q = Or([Term('path', path) for path in paths])
    res = []

    with ix.searcher() as searcher:
        results = searcher.search(q, filter=allow_q, limit=len(paths), terms=True)
        for hit in results:
            res.append({
                # 'title': hit['title'],
                'short_url': hit['path'],
                'highlights': u' [...] '.join(filter(None, [
                    hit.highlights("title", top=5),
                    hit.highlights("abstract", top=5),
                    hit.highlights("content", top=5)
                ]))
            })
    return res
def validate_name(meta, itemid):
    """
    Check whether the names are valid.

    Will just return if they are valid, will raise a NameNotValidError if not.
    """
    names = meta.get(NAME)
    current_namespace = meta.get(NAMESPACE)
    if current_namespace is None:
        raise NameNotValidError(L_("No namespace field in the meta."))
    namespaces = [namespace.rstrip('/') for namespace, _ in app.cfg.namespace_mapping]

    if len(names) != len(set(names)):
        msg = L_("The names in the name list must be unique.")
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)

    # Item names must not start with '@' or '+': '@something' denotes a field,
    # whereas '+something' denotes a view.
    invalid_names = [name for name in names if name.startswith(('@', '+'))]
    if invalid_names:
        msg = L_("Item names (%(invalid_names)s) must not start with '@' or '+'",
                 invalid_names=", ".join(invalid_names))
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)

    namespaces = namespaces + NAMESPACES_IDENTIFIER  # Also don't allow item names to match identifier namespaces.
    # Item names must not match existing namespaces.
    invalid_names = [name for name in names if name.split('/', 1)[0] in namespaces]
    if invalid_names:
        msg = L_("Item names (%(invalid_names)s) must not match with existing namespaces.",
                 invalid_names=", ".join(invalid_names))
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)

    query = And([Or([Term(NAME, name) for name in names]), Term(NAMESPACE, current_namespace)])
    # There should be no existing item with the same name.
    if itemid is not None:
        # search for items except the current item.
        query = And([query, Not(Term(ITEMID, itemid))])
    with flaskg.storage.indexer.ix[LATEST_REVS].searcher() as searcher:
        results = searcher.search(query)
        duplicate_names = {name for result in results for name in result[NAME] if name in names}
        if duplicate_names:
            msg = L_("Item(s) named %(duplicate_names)s already exist.",
                     duplicate_names=", ".join(duplicate_names))
            flash(msg, "error")  # duplicate message at top of form
            raise NameNotValidError(msg)
def get_subitem_revs(self):
    """
    Create a list of subitems of this item.

    Subitems are in the form of storage Revisions.
    """
    query = And([Term(WIKINAME, app.cfg.interwikiname), Term(NAMESPACE, self.fqname.namespace)])
    # trick: an item of empty name can be considered as "virtual root item"
    # that has all wiki items as sub items
    if self.names:
        query = And([query, Or([Prefix(NAME_EXACT, prefix) for prefix in self.subitem_prefixes])])
    revs = flaskg.storage.search(query, sortedby=NAME_EXACT, limit=None)
    return revs
def search(self, q_str: str, in_chats: Optional[List[int]], page_len: int,
           page_num: int = 1) -> SearchResult:
    q = self.query_parser.parse(q_str)
    with self.ix.searcher() as searcher:
        q_filter = in_chats and Or([Term('chat_id', str(chat_id)) for chat_id in in_chats])
        result_page = searcher.search_page(q, page_num, page_len,
                                           filter=q_filter,
                                           sortedby='post_time',
                                           reverse=True)
        hits = [
            SearchHit(IndexMsg(**msg), self.highlighter.highlight_hit(msg, 'content'))
            for msg in result_page
        ]
        return SearchResult(hits, result_page.is_last_page(), result_page.total)
def get_subscribers(**meta):
    """ Get all users that are subscribed to the item

    :param meta: key/value pairs from item metadata - itemid, name, namespace, tags keys
    :return: a set of Subscriber objects
    """
    itemid = meta.get(ITEMID)
    name = meta.get(NAME)
    namespace = meta.get(NAMESPACE)
    fqname = CompositeName(namespace, ITEMID, itemid)
    tags = meta.get(TAGS)
    terms = []
    if itemid is not None:
        terms.extend([Term(SUBSCRIPTION_IDS, u"{0}:{1}".format(ITEMID, itemid))])
    if namespace is not None:
        if name is not None:
            terms.extend(Term(SUBSCRIPTION_IDS, u"{0}:{1}:{2}".format(NAME, namespace, name_))
                         for name_ in name)
        if tags is not None:
            terms.extend(Term(SUBSCRIPTION_IDS, u"{0}:{1}:{2}".format(TAGS, namespace, tag))
                         for tag in tags)
    query = Or(terms)
    with flaskg.storage.indexer.ix[LATEST_REVS].searcher() as searcher:
        result_iterators = [searcher.search(query, limit=None), ]
        subscription_patterns = searcher.lexicon(SUBSCRIPTION_PATTERNS)
        patterns = get_matched_subscription_patterns(subscription_patterns, **meta)
        result_iterators.extend(searcher.documents(subscription_patterns=pattern)
                                for pattern in patterns)
        subscribers = set()
        for user in chain.from_iterable(result_iterators):
            email = user.get(EMAIL)
            if email:
                from moin.user import User
                u = User(uid=user.get(ITEMID))
                if u.may.read(fqname):
                    locale = user.get(LOCALE, DEFAULT_LOCALE)
                    subscribers.add(Subscriber(user[ITEMID], user[NAME][0], email, locale))
    return subscribers
def more_like(self, pk, source, top=5):
    """Find similar units."""
    index = self.get_source_index()
    with index.searcher() as searcher:
        # Extract key terms
        kts = searcher.key_terms_from_text('source', source, numterms=10, normalize=False)
        # Create an Or query from the key terms
        query = Or([Term('source', word, boost=weight) for word, weight in kts])

        # Grab fulltext results
        results = [(h['pk'], h.score) for h in searcher.search(query, limit=top)]
        if not results:
            return []

        # Normalize scores to 0-100
        max_score = max([h[1] for h in results])
        scores = {h[0]: h[1] * 100 / max_score for h in results}

        # Filter results with score above 50 and not current unit
        return [h[0] for h in results if scores[h[0]] > 50 and h[0] != pk]
def more_like(self, pk, source, top=5):
    """Find similar units."""
    index = self.get_source_index()
    with index.searcher() as searcher:
        # Extract key terms
        kts = searcher.key_terms_from_text(
            'source', source, numterms=10, normalize=False
        )
        # Create an Or query from the key terms
        query = Or([Term('source', word, boost=weight) for word, weight in kts])
        LOGGER.debug('more like query: %r', query)

        # Grab fulltext results
        results = [(h['pk'], h.score) for h in searcher.search(query, limit=top)]
        LOGGER.debug('found %d matches', len(results))
        if not results:
            return []

        # Filter bad results
        threshold = max((h[1] for h in results)) / 2
        results = [h[0] for h in results if h[1] > threshold]
        LOGGER.debug('filter %d matches over threshold %d', len(results), threshold)

        return results
def __call__(self):
    command = self.request.matchdict['command']
    params = self.request.params

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    if command == 'namesearch':
        search_index = self.request.registry.settings['whoosh_index']
        query_parser = self.request.registry.settings['query_parser']
        with search_index.searcher() as searcher:
            query = query_parser.parse(params['term'])
            # allowable = Or([Term(u'item_type', u'species'), Term(u'item_type', u'climate')])
            # allowable = Or([Term(u'item_type', u'species')])
            allowable = Or([
                Term(u'item_type', u'species'),
                Term(u'item_type', u'refugia'),
                Term(u'item_type', u'aoc'),
                Term(u'item_type', u'richness')
            ])
            results = searcher.search(query, filter=allowable)
            matches = {}
            for result in results:
                matches[result['nice_name']] = {
                    "type": result['item_type'],
                    "path": result['item_path'],
                    "mapId": result['item_id']
                }
            json_content = json.dumps(matches)
            return Response(body=json_content, content_type='application/json')

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    if command == 'mapsearch':
        es = self.request.registry.settings['search_conn']
        allowable = ['species', 'refugia', 'aoc', 'richness']
        query = {
            "query": {
                "bool": {
                    "must": {
                        "match": {
                            "nice_name": {
                                "query": params['term'],
                                "operator": "and"
                            }
                        }
                    },
                    "filter": {
                        "terms": {
                            "item_type": allowable
                        }
                    }
                }
            },
            "from": 0,
            "size": 15
        }
        results = es.search(index='wallace', doc_type='map', body=query)
        matches = {}
        for result in results['hits']['hits']:
            doc = result['_source']
            matches[doc['nice_name']] = {
                "type": doc['item_type'],
                "path": doc['item_path'],
                "mapId": doc['item_id']
            }
        json_content = json.dumps(matches)
        return Response(body=json_content, content_type='application/json')

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    elif command == 'preplayer':
        gs_user = self.request.registry.settings['climas.gs_user']
        gs_pass = self.request.registry.settings['climas.gs_pass']

        # ==== what's the map they want?
        map_type = params['info[type]']
        map_path = params['info[path]']
        map_id = params['info[mapId]']
        map_projection = params['proj']

        if map_type == 'species':
            path_to_map_tif = ''.join([
                "file:///rdsi/wallace2/W2_website/species/",
                map_path,
                "/summaries_temperature/",
                map_projection,
                ".tif"
            ])
        else:
            # all the summary map types have the same path
            path_to_map_tif = ''.join([
                "file:///rdsi/wallace2/W2_website/",
                map_path,
                "/",
                map_projection,
                ".tif"
            ])
        coverage_name = '@'.join([map_type, map_id.replace(' ', '_'), map_projection])

        # TODO: remove this debug
        print(path_to_map_tif)

        # ==== insert that map into geoserver
        # todo: put this into a timeout somehow
        poke = requests.put(
            "http://wallace-maps.hpc.jcu.edu.au/geoserver/rest/workspaces/wallace/coveragestores/"
            + coverage_name + "/external.geotiff",
            data=path_to_map_tif,
            auth=(gs_user, gs_pass))
        poke = requests.post(
            "http://wallace-maps.hpc.jcu.edu.au/geoserver/rest/workspaces/wallace/coveragestores/"
            + coverage_name + "/coverages",
            data="<coverage><name>" + coverage_name + "</name><nativeName>"
                 + map_projection + "</nativeName></coverage>",
            auth=(gs_user, gs_pass),
            headers={'Content-type': 'text/xml'})

        # ==== return the WMS url for that layer
        if poke.ok or 'already exists' in poke.text:
            result = {
                "ok": True,
                "mapUrl": u"http://wallace-maps.hpc.jcu.edu.au/geoserver/wallace/wms",
                "layerName": u"wallace:" + coverage_name
            }
            json_content = json.dumps(result)
            return Response(body=json_content, content_type='application/json')

        json_content = json.dumps({
            "ok": False,
            "status_code": poke.status_code,
            "status_reason": poke.reason,
            "result": poke.text
        })
        # if we haven't returned yet, our layer poke didn't work
        return Response(status_code=500, body=json_content, content_type='application/json')
def __recs_query(self, positive_rated_document_list: list, rated_document_list: list,
                 scores: list, recs_number: int, items_directory: str,
                 candidate_list: list) -> pd.DataFrame:
    """
    Builds a query using the contents that the user liked. The terms relative to the contents
    that the user liked are boosted by the rating he/she gave. A filter clause is added to the
    query to consider only candidate items.

    Args:
        positive_rated_document_list (list): list of contents that the user liked
        rated_document_list (list): list of all the contents that the user rated
        scores (list): ratings given by the user
        recs_number (int): how many items must be recommended. Only the number can be
            specified, not a specific item for which to compute the prediction
        items_directory (str): directory where the items are stored
        candidate_list (list): list of the items that can be recommended; if None,
            all unrated items will be used

    Returns:
        score_frame (pd.DataFrame): dataFrame containing the recommendations for the user
    """
    ix = open_dir(items_directory)
    with ix.searcher(weighting=scoring.TF_IDF if self.__classic_similarity else scoring.BM25F) as searcher:

        # Initializes user_docs which is a dictionary that has the document as key and
        # another dictionary as value. The dictionary value has the name of the field as key
        # and its contents as value. By doing so we obtain the data of the fields while
        # also storing information regarding the field and the document where it was
        field_list = None
        user_docs = {}
        for doc in positive_rated_document_list:
            user_docs[doc] = dict()
            field_list = searcher.stored_fields(doc)
            for field_name in field_list:
                if field_name == 'content_id':
                    continue
                user_docs[doc][field_name] = field_list[field_name]

        logger.info("Building query")

        # For each field of each document one string (containing the name of the field and
        # the data in it) is created and added to the query.
        # Also each part of the query that refers to a document is boosted by the score
        # given by the user to said document
        string_query = "("
        for doc, score in zip(user_docs.keys(), scores):
            string_query += "("
            for field_name in field_list:
                if field_name == 'content_id':
                    continue
                word_list = user_docs[doc][field_name].split()
                string_query += field_name + ":("
                for term in word_list:
                    string_query += term + " "
                string_query += ") "
            string_query += ")^" + str(score) + " "
        string_query += ") "

        # The requirement of retrieved documents to be in a candidate list (if passed) is added
        # by building a query for the content id of said documents.
        # Also the query containing all the content ids for the documents that the user rated
        # is created.
        # Both these queries will be used by the index searcher
        candidate_query_list = None
        rated_query_list = []

        for document in rated_document_list:
            rated_query_list.append(Term("content_id", document))
        rated_query_list = Or(rated_query_list)

        if candidate_list is not None:
            candidate_query_list = []
            for candidate in candidate_list:
                candidate_query_list.append(Term("content_id", candidate))
            candidate_query_list = Or(candidate_query_list)

        # The filter and mask arguments of the index searcher are used respectively
        # to find only candidate documents and to ignore documents rated by the user
        schema = ix.schema
        query = QueryParser("content_id", schema=schema, group=qparser.OrGroup).parse(string_query)
        score_docs = searcher.search(query, limit=recs_number,
                                     filter=candidate_query_list, mask=rated_query_list)

        logger.info("Building score frame to return")

        # Builds the recommendation frame. Items in the candidate list or rated by the user
        # were already filtered previously by the index searcher
        columns = ['to_id', 'rating']
        score_frame = pd.DataFrame(columns=columns)
        for result in score_docs:
            item_id = result["content_id"]
            score_frame = pd.concat([
                score_frame,
                pd.DataFrame.from_records([(item_id, result.score)], columns=columns)])

    return score_frame
def search_doc(directory, word, doc_types, num_page=1, num_by_page=10, show_num_results=True):
    """
    * -------------{Function}---------------
    * Returns a list of docs that contain a given set of words
    * -------------{returns}----------------
    * {set} query results
    * -------------{params}-----------------
    * : directory -> path of the index
    * : word -> words to query
    * : doc_types -> type of doc to search
    * : num_page -> number of pages to search
    * : show_num_results -> number of results to return
    """
    index_schema = load_index(directory)
    doctypes_schema = load_doctypes_schema(directory)

    # Retrieves the fields to search from the doctypes schema
    fields_to_search = []
    for doc_type in doc_types:
        doc_type = doc_type.lower()
        try:
            schema = doctypes_schema[doc_type]
            fields_to_search = fields_to_search + schema
        except:
            logger.warning("Schema not found for {doc_type}".format(doc_type=doc_type))

    # By default we search "content" (for BC) and "tags"
    fields = ["content", "tags"] + fields_to_search
    logger.info("search will be performed on fields {fields}".format(fields=fields))

    # Creates the query parser
    # MultifieldParser allows search on multiple fields
    # We use a custom FuzzyTerm class to set the Levenshtein distance to 2
    parser = MultifieldParser(fields, schema=doctypes_schema, termclass=CustomFuzzyTerm)
    query = parser.parse(word)

    # Creates a filter on the doctype field
    doctype_filter_matcher = []
    for doc_type in doc_types:
        term = FuzzyTerm("doc_type", doc_type.lower(), 1.0, 2)
        doctype_filter_matcher.append(term)
    doc_type_filter = Or(doctype_filter_matcher)

    # Processes the search (request the index, whoosh magic)
    with index_schema.searcher() as searcher:
        results = searcher.search_page(query, num_page, pagelen=num_by_page, filter=doc_type_filter)
        results_id = [result["doc_id"] for result in results]
        logger.info("Results: {results_id}".format(results_id=results_id))

        # Ensures BC if the number of results is not requested
        if show_num_results:
            return {"ids": results_id, "num_results": len(results)}
        else:
            return {"ids": results_id}
def search(self, query):
    with self.index.searcher() as searcher:
        terms = [FuzzyTerm("content", word, maxdist=2) for word in query]
        search_query = Or(terms)
        results = searcher.search(search_query)
        return [result["filename"] for result in results]
def test_wildcard():
    _run_query(Or([Wildcard('value', u('*red*')), Wildcard('name', u('*yellow*'))]),
               [u("A"), u("C"), u("D"), u("E")])
    # Missing
    _run_query(Wildcard('value', 'glonk*'), [])
def query(self, string_query: str, results_number: int, mask_list: list = None,
          candidate_list: list = None, classic_similarity: bool = True) -> dict:
    """
    Uses a search index to query the index in order to retrieve specific contents using a
    query expressed in string form

    Args:
        string_query (str): query expressed as a string
        results_number (int): number of results the searcher will return for the query
        mask_list (list): list of content_ids of items to ignore in the search process
        candidate_list (list): list of content_ids of items to consider in the search process,
            if it is not None only items in the list will be considered
        classic_similarity (bool): if True, classic tf idf is used for scoring, otherwise BM25F is used

    Returns:
        results (dict): the final results dictionary containing the results found from the search
            index for the query. The dictionary will be in the following form:

                {content_id: {"item": item_dictionary, "score": item_score}, ...}

            content_id is the content_id for the corresponding item.
            item_dictionary is the dictionary of the item containing the fields as keys and the
            contents as values, so it will be in the following form:

                {"Plot": "this is the plot", "Genre": "this is the Genre"}

            The item_dictionary will not contain the content_id since it is already defined and
            used as key of the external dictionary.
            item_score is the score given to the item for the query by the index searcher.
    """
    ix = open_dir(self.directory)
    with ix.searcher(weighting=TF_IDF if classic_similarity else BM25F) as searcher:

        candidate_query_list = None
        mask_query_list = None

        # the mask list contains the content_id for the items to ignore in the searching process
        # from the mask list a mask query is created and it will be used by the searcher
        if mask_list is not None:
            mask_query_list = []
            for document in mask_list:
                mask_query_list.append(Term("content_id", document))
            mask_query_list = Or(mask_query_list)

        # the candidate list contains the content_id for the items to consider in the searching process
        # from the candidate list a candidate query is created and it will be used by the searcher
        if candidate_list is not None:
            candidate_query_list = []
            for candidate in candidate_list:
                candidate_query_list.append(Term("content_id", candidate))
            candidate_query_list = Or(candidate_query_list)

        schema = ix.schema
        parser = QueryParser("content_id", schema=schema, group=OrGroup)
        # regular expression to match the possible field styles
        # examples: "content_id" or "Genre#2" or "Genre#2#custom_id"
        parser.add_plugin(FieldsPlugin(r'(?P<text>[\w-]+(\#[\w-]+(\#[\w-]+)?)?|[*]):'))
        query = parser.parse(string_query)
        score_docs = searcher.search(query, limit=results_number,
                                     filter=candidate_query_list, mask=mask_query_list)

        # creation of the results dictionary. This phase is necessary because the Hit objects
        # returned by the searcher as results need the reader inside the search index in order
        # to return information, so it would be impossible to access a field or the score of
        # the item from outside this method. Because of that, this dictionary containing the
        # most important infos is created
        results = {}
        for hit in score_docs:
            hit_dict = dict(hit)
            content_id = hit_dict.pop("content_id")
            results[content_id] = {}
            results[content_id]["item"] = hit_dict
            results[content_id]["score"] = hit.score

    return results
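# A hedged usage sketch for the query() method above; the wrapper class name, index
# directory and content ids are hypothetical:
#
#   si = SearchIndex("/tmp/items_index")          # hypothetical object exposing query()
#   results = si.query(
#       string_query="Plot:(space adventure)",
#       results_number=5,
#       mask_list=["item_1", "item_2"],           # already-seen items to exclude
#       candidate_list=["item_3", "item_4"],      # restrict hits to these content_ids
#   )
#   # results -> {"item_3": {"item": {...}, "score": ...}, ...}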
def advanced_query(parameters, page=0, n=10):
    """
    :param dict text_parameters: a dictionary of field-to-query pairs specifying text-based queries.
        this is good for fields like "rules_text", "name", "flavor_text", etc.
    :param dict range_parameters: a dictionary of field to range pairs.
        this is good for fields like power, toughness, cmc, etc.
    :param dict point_parameters: a dictionary of field to value parameters. Every card in the
        return set must have an exact match to every value in the dict. For example, if
        point_parameters is {'cmc': 5} then for every card in the set card.cmc == 5 must
        evaluate to true.

        .. warning:: using this parameter will cause the query system to filter through
            whoosh results, slowing down computation.

    :param int page: the 'page' of results to return
    :param int n: the number of results per page.
    :return: Exact class TBD, will provide way to iterate over the page's worth of results.
    """
    import whoosh.fields
    from whoosh.query import And, Or

    schema = get_whoosh_index().schema

    # fix `page` and `n` (they may be string versions of ints)
    page = int(page)
    n = int(n)

    # After talking with Ben it sounds like we can do something to the effect
    # of taking multiple sub queries and perform unions and intersections on their results.
    # This is going to be the best way to get the desired results.

    # to start: build a list of all the query objects we'll be searching.
    query_objs = []
    for field, target in parameters.items():
        # Coerce potential numeric point queries to whoosh syntax.
        if isinstance(target, float):
            target = int(target + 0.5)
        if isinstance(target, int):
            target = str(target)
            # target = f"{{{target-1} TO {target+1}}}"
            # target = target.replace("[ TO", "[TO").replace("TO ]", "TO]")

        # Coerce range queries to whoosh syntax, assume they're inclusive bounds.
        if isinstance(target, (list, tuple)):
            if len(target) != 2:
                raise ValueError(f"Unable to treat parameter as range query! ({target})")
            target = f"[{target[0] if target[0] != -1 else ''} TO {target[1] if target[1] != -1 else ''}]"
            # whoosh has issues if there's an open ended range with a space separating TO from the bracket:
            target = target.replace("[ TO", "[TO").replace("TO ]", "TO]")

        # the comma-separated KEYWORD fields have been giving us some issue
        # it seems that whoosh is a bit bi-polar when it comes to commas in these fields.
        # so we'll add two subqueries, one with a comma and one without.
        if field in schema and isinstance(schema[field], whoosh.fields.KEYWORD):
            # add the extra query object:
            subqueries = [QueryParser(field, schema).parse(target.lower() + ','),
                          QueryParser(field, schema).parse(target.lower())]
            query_objs.append(Or(subqueries))
        else:
            # again, lower capitalization on everything
            query_objs.append(QueryParser(field, schema).parse(target.lower()))

    if not len(query_objs):
        return []

    # now build a nice big compound query:
    query = And(query_objs)

    with get_whoosh_index().searcher() as searcher:
        # run that query and return the appropriate results page.
        try:
            results = searcher.search_page(query, page + 1, n)
        except Exception:
            print(repr(query))
            raise
        return [x['data_obj'] for x in results]
def test_not():
    _run_query(Or([Term("value", u("red")), Term("name", u("yellow")),
                   Not(Term("name", u("quick")))]),
               [u("A"), u("E")])
def oq():
    return Or([Term("a", u("a")), Term("a", u("b"))])
def test_query_copy_hash():
    def do(q1, q2):
        q1a = copy.deepcopy(q1)
        assert_equal(q1, q1a)
        assert_equal(hash(q1), hash(q1a))
        assert_not_equal(q1, q2)

    do(Term("a", u("b"), boost=1.1), Term("a", u("b"), boost=1.5))
    do(And([Term("a", u("b")), Term("c", u("d"))], boost=1.1),
       And([Term("a", u("b")), Term("c", u("d"))], boost=1.5))
    do(Or([Term("a", u("b"), boost=1.1), Term("c", u("d"))]),
       Or([Term("a", u("b"), boost=1.8), Term("c", u("d"))], boost=1.5))
    do(DisjunctionMax([Term("a", u("b"), boost=1.8), Term("c", u("d"))]),
       DisjunctionMax([Term("a", u("b"), boost=1.1), Term("c", u("d"))], boost=1.5))
    do(Not(Term("a", u("b"), boost=1.1)), Not(Term("a", u("b"), boost=1.5)))
    do(Prefix("a", u("b"), boost=1.1), Prefix("a", u("b"), boost=1.5))
    do(Wildcard("a", u("b*x?"), boost=1.1), Wildcard("a", u("b*x?"), boost=1.5))
    do(FuzzyTerm("a", u("b"), constantscore=True),
       FuzzyTerm("a", u("b"), constantscore=False))
    do(FuzzyTerm("a", u("b"), boost=1.1), FuzzyTerm("a", u("b"), boost=1.5))
    do(TermRange("a", u("b"), u("c")), TermRange("a", u("b"), u("d")))
    do(TermRange("a", None, u("c")), TermRange("a", None, None))
    do(TermRange("a", u("b"), u("c"), boost=1.1),
       TermRange("a", u("b"), u("c"), boost=1.5))
    do(TermRange("a", u("b"), u("c"), constantscore=True),
       TermRange("a", u("b"), u("c"), constantscore=False))
    do(NumericRange("a", 1, 5), NumericRange("a", 1, 6))
    do(NumericRange("a", None, 5), NumericRange("a", None, None))
    do(NumericRange("a", 3, 6, boost=1.1), NumericRange("a", 3, 6, boost=1.5))
    do(NumericRange("a", 3, 6, constantscore=True),
       NumericRange("a", 3, 6, constantscore=False))
    # do(DateRange)
    do(Variations("a", u("render")), Variations("a", u("renders")))
    do(Variations("a", u("render"), boost=1.1),
       Variations("a", u("renders"), boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")]), Phrase("a", [u("b"), u("c"), u("e")]))
    do(Phrase("a", [u("b"), u("c"), u("d")], boost=1.1),
       Phrase("a", [u("b"), u("c"), u("d")], boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")], slop=1),
       Phrase("a", [u("b"), u("c"), u("d")], slop=2))
    # do(Ordered)
    do(Every(), Every("a"))
    do(Every("a"), Every("b"))
    do(Every("a", boost=1.1), Every("a", boost=1.5))
    do(NullQuery, Term("a", u("b")))
    do(ConstantScoreQuery(Term("a", u("b"))), ConstantScoreQuery(Term("a", u("c"))))
    do(ConstantScoreQuery(Term("a", u("b")), score=2.0),
       ConstantScoreQuery(Term("a", u("c")), score=2.1))
    do(Require(Term("a", u("b")), Term("c", u("d"))),
       Require(Term("a", u("b"), boost=1.1), Term("c", u("d"))))
    # do(Require)
    # do(AndMaybe)
    # do(AndNot)
    # do(Otherwise)
    do(SpanFirst(Term("a", u("b")), limit=1), SpanFirst(Term("a", u("b")), limit=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d"))),
       SpanNear(Term("a", u("b")), Term("c", u("e"))))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), slop=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), slop=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=True),
       SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=False))
    do(SpanNot(Term("a", u("b")), Term("a", u("c"))),
       SpanNot(Term("a", u("b")), Term("a", u("d"))))
    do(SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("d"))]),
       SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("e"))]))
    do(SpanContains(Term("a", u("b")), Term("a", u("c"))),
       SpanContains(Term("a", u("b")), Term("a", u("d"))))
# Query 3: USA JPN
parser = QueryParser("strong_hashtags", index.schema)
query = parser.parse("USA JPN")
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

# Query 4: RTs about USA
parser = QueryParser("tweet_text", index.schema)
query = parser.parse("RT USA")
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

# Query 5: score
query = Or([Term("strong_hashtags", "FIFAWWC"), Term("tweet_text", "score")])
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

#############################################################################
"""
Upload Whoosh index directory to S3 bucket
"""
# version to upload to Github will have AWS keys removed
AWS_KEY = ''
AWS_SECRET = ''

from boto.s3.connection import S3Connection

conn = S3Connection(AWS_KEY, AWS_SECRET)
bucket = conn.create_bucket('w205_assignment4')
def do_search(event):
    print("Algoritmo di ranking:" + str(app._model.get()))
    og = qparser.OrGroup.factory(0.9)
    if not app.txtEntry.get():
        return False
    query = app.txtEntry.get()
    regex = r'((?:article|incollection|inproceedings|phdthesis|mastersthesis|publication|venue)(?:.(?:(?:author|title|year|publisher)))?:\s?(?:\*{0,1}\w*\*{0,1})?(?:\".+?\")?)'
    # Find all field-specific searches and collect them in a list
    queries = re.findall(regex, query)
    spec_query = len(queries)
    # Remove the searches found above from the query, leaving only phrase searches and single words
    query = re.sub(regex, ' ', query).strip()
    if len(query) > 0:
        # Find all phrase searches (text enclosed in double quotes) and add them to the list
        queries = queries + re.findall(r'".+?"', query)
        # Remove the phrase searches from the query; only single words remain now
        query = re.sub(r'".+?"', '', query)
        # Remove the extra spaces left over from the substitutions above
        query = re.sub(r'\s+', ' ', query).strip()
        # Split the query on spaces to get the single words and add them to the list
        queries = queries + query.split(' ')
    try:
        queries.remove('')
    except Exception:
        pass

    queries_for_publi = list()
    queries_for_venue = list()
    ven_query = " AND (tag:book OR tag:proceedings)"
    tag_query = " AND (tag:article OR tag:incollection OR tag:phdthesis OR tag:mastersthesis OR tag:inproceedings)"

    start_time = datetime.datetime.now()
    for split in queries[:spec_query]:
        split = split.split(':')
        dotted = split[0].split('.')
        tag = dotted[0]
        if len(dotted) > 1:
            field = dotted[1]
        else:
            field = 0
        if tag in publi:
            tag_query = " AND (tag:" + tag + ")"
        else:
            tag_query = " AND (tag:article OR tag:incollection OR tag:phdthesis OR tag:mastersthesis OR tag:inproceedings)"
        q = qparser.QueryParser(field, schema=schema_publi, group=og)
        if tag in (publi + ['publication']) and field in ['author', 'title', 'year']:
            queries_for_publi.append(q.parse(split[1] + tag_query))
        elif tag == 'venue' and field in ['title', 'publisher']:
            queries_for_venue.append(q.parse(split[1] + ven_query))
        elif not field:
            if tag == 'venue':
                qv = MultifieldParser(terms_venue, schema=schema_venue)
                queries_for_venue.append(qv.parse(split[1] + ven_query))
            else:
                qp = MultifieldParser(terms_publi, schema=schema_publi)
                queries_for_publi.append(qp.parse(split[1] + tag_query))

    if queries[spec_query:]:  # If non-field-specific terms remain
        qv = MultifieldParser(terms_venue, schema=schema_venue)
        qp = MultifieldParser(terms_publi, schema=schema_publi)
        for split in queries[spec_query:]:
            queries_for_venue.append(qv.parse(split + ven_query))
            queries_for_publi.append(qp.parse(split + tag_query))

    app.venue_results = [y for y in app.get_vx_searcher().search(Or([x for x in queries_for_venue]), limit=None)]
    app.publi_results = [y for y in app.get_px_searcher().search(Or([x for x in queries_for_publi]), limit=None)]
    print("SEARCHING: " + str(datetime.datetime.now() - start_time))

    start_time = datetime.datetime.now()
    app.len_publi = len(app.publi_results)
    app.len_venue = len(app.venue_results)
    print("PUBLICATION LEN:" + str(app.len_publi))
    print("VENUE LEN:" + str(app.len_venue))

    if app.venue_results and app.publi_results:
        app.results, app.indice_t = threshold_2(list(), app.publi_results, app.venue_results,
                                                app.len_publi, app.len_venue)
        print("THRESHOLD: " + str(datetime.datetime.now() - start_time))

    app.listNodes.delete(0, 'end')
    app.listSelection.delete(0, 'end')

    if not app.publi_results or not app.venue_results:
        app.counter = 0
        app.results = app.publi_results + app.venue_results
        for hit in app.results[:10]:
            if len(hit.get("title")) > 50:
                app.listNodes.insert(END, hit.get("title")[0:50] + "...")
            else:
                app.listNodes.insert(END, hit.get("title"))
        app.listNodes.bind('<<ListboxSelect>>', onselect)
    else:
        for hit in app.results:
            title_len = len(hit.pub.get("title"))
            if not title_len:
                if len(hit.venue.get("title")) > 50:
                    app.listNodes.insert(END, hit.venue.get("title")[:50] + "...")
                else:
                    app.listNodes.insert(END, hit.venue.get("title"))
            elif title_len > 50:
                app.listNodes.insert(END, hit.pub.get("title")[0:50] + "...")
            else:
                app.listNodes.insert(END, hit.pub.get("title"))
        app.listNodes.bind('<<ListboxSelect>>', onselect_for_dict)

    print("PUTTING ON LIST: " + str(datetime.datetime.now() - start_time))
def perform_search(self, sentence):
    with self._searcher() as s:
        tokens = sentence.split()
        tokens = [token for token in tokens if token != REPLACED]
        print('tokens=', tokens)

        exact_and_match = And([Term(TEXT_FIELD, t) for t in tokens], boost=.5)
        exact_or_match = Or([Term(TEXT_FIELD, t) for t in tokens], boost=.5, scale=0.9)
        # Added variability of maxdist based on word length
        fuzzy_or_match = Or([FuzzyTerm(TEXT_FIELD, t, prefixlength=1,
                                       maxdist=1 if len(t) < 8 else 2)
                             for t in tokens if len(t) >= 4],
                            boost=.2, scale=0.9)
        if len(tokens) > 1:
            # add bigrams if there are any
            bigrams = ['_'.join(b) for b in find_ngrams(tokens, 2)]
            bigram_fuzzy_or_match = Or([FuzzyTerm(BIGRAMS_FIELD, b, prefixlength=3,
                                                  maxdist=2 if len(b) < 8 else 3)
                                        for b in bigrams],
                                       scale=0.9)
        else:
            bigram_fuzzy_or_match = None

        non_brand_or_match = Or([Term(NONBRAND_TEXT_FIELD, t) for t in tokens])

        # q = exact_and_match \
        #     | exact_or_match \
        #     | fuzzy_or_match
        # my_match = Or([Term(f, token) for token in tokens], boost=1)
        # q = my_match
        #
        # q = Or([FuzzyTerm(f, token, prefixlength=2) for token in tokens if len(token) >= 3],
        #        boost=1.0, scale=0.9)

        q = exact_and_match | exact_or_match | fuzzy_or_match | non_brand_or_match
        if bigram_fuzzy_or_match:
            q = q | bigram_fuzzy_or_match

        print(q)
        search_results = self.get_search_results(self._index, s, q)

        for x in search_results:
            print(x, x.score)

        if search_results:
            score, text, matched = search_results[0].items()
            return text, list(set(matched))
        else:
            return None, None
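# Note on the composition in perform_search above: whoosh query objects overload
# the | operator, so
#
#   q = exact_and_match | exact_or_match | fuzzy_or_match | non_brand_or_match
#
# builds the same kind of union as wrapping the subqueries in Or([...]). This is a
# reading of the snippet above, not an extra requirement it imposes.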
def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
           fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
           narrow_queries=None, spelling_query=None, within=None, dwithin=None,
           distance_point=None, models=None, limit_to_registered_models=None,
           result_class=None, **kwargs):
    if not self.setup_complete:
        self.setup()

    # A zero length query should return no results.
    if len(query_string) == 0:
        return {
            'results': [],
            'hits': 0,
        }

    query_string = force_text(query_string)

    # A one-character query (non-wildcard) gets nabbed by a stopwords
    # filter and should yield zero results.
    if len(query_string) <= 1 and query_string != u'*':
        return {
            'results': [],
            'hits': 0,
        }

    reverse = False

    if sort_by is not None:
        # Determine if we need to reverse the results and if Whoosh can
        # handle what it's being asked to sort by. Reversing is an
        # all-or-nothing action, unfortunately.
        sort_by_list = []
        reverse_counter = 0

        for order_by in sort_by:
            if order_by.startswith('-'):
                reverse_counter += 1

        if reverse_counter and reverse_counter != len(sort_by):
            raise SearchBackendError("Whoosh requires all order_by fields"
                                     " to use the same sort direction")

        for order_by in sort_by:
            if order_by.startswith('-'):
                sort_by_list.append(order_by[1:])

                if len(sort_by_list) == 1:
                    reverse = True
            else:
                sort_by_list.append(order_by)

                if len(sort_by_list) == 1:
                    reverse = False

        sort_by = sort_by_list[0]

    if facets is not None:
        warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

    if date_facets is not None:
        warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

    if query_facets is not None:
        warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

    narrowed_results = None
    self.index = self.index.refresh()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(get_model_ct(model) for model in models)
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    narrow_searcher = None

    if narrow_queries is not None:
        # Potentially expensive? I don't see another way to do it in Whoosh...
        narrow_searcher = self.index.searcher()

        for nq in narrow_queries:
            recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                             limit=None)

            if len(recent_narrowed_results) <= 0:
                return {
                    'results': [],
                    'hits': 0,
                }

            if narrowed_results:
                narrowed_results.filter(recent_narrowed_results)
            else:
                narrowed_results = recent_narrowed_results

    self.index = self.index.refresh()

    if self.index.doc_count():
        parsed_query = self.parser.parse(query_string)

        if len(model_choices) > 0:
            narrow_model = [Term(DJANGO_CT, rm) for rm in model_choices]
            parsed_query = And([Or(narrow_model), parsed_query])

        searcher = self.index.searcher()

        # In the event of an invalid/stopworded query, recover gracefully.
        if parsed_query is None:
            return {
                'results': [],
                'hits': 0,
            }

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        collapse_field = kwargs.get("collapse")
        collapse_limit = kwargs.get("collapse_limit")

        search_kwargs = {
            'pagelen': page_length,
            'sortedby': sort_by,
            'reverse': reverse,
        }

        if collapse_field is not None:
            search_kwargs['collapse'] = FieldFacet(collapse_field)
            search_kwargs['collapse_limit'] = 1

            if kwargs.get("collapse_order") is not None:
                order = kwargs.get("collapse_order")
                collapse_order = FieldFacet(order.replace('-', ''),
                                            reverse=order.find('-') > -1)
                search_kwargs['collapse_order'] = collapse_order

        # Handle the case where the results have been narrowed.
        if narrowed_results is not None:
            search_kwargs['filter'] = narrowed_results

        try:
            raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        grouped_results = None

        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        if collapse_field is not None and collapse_limit > 1:
            search_kwargs = {
                'sortedby': collapse_order,
            }
            grouped_results = []

            for result in raw_page:
                query = And([Term(collapse_field, result[collapse_field]), parsed_query])
                results = searcher.search(query, limit=collapse_limit, **search_kwargs)
                grouped_results.append(results)

        results = self._process_results(raw_page, result_class=result_class,
                                        collapse_field=collapse_field,
                                        grouped_results=grouped_results)
        searcher.close()

        if hasattr(narrow_searcher, 'close'):
            narrow_searcher.close()

        return results
    else:
        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)
        else:
            spelling_suggestion = None

        return {
            'results': [],
            'hits': 0,
            'spelling_suggestion': spelling_suggestion,
        }
def search(request):
    if request.method == 'POST':
        form = Search_Form(request.POST)
        if form.is_valid():
            if not aux_check_index():
                aux_reset_all()
            key = form.cleaned_data['key_word'].lower()
            type = form.cleaned_data['type']
            ix = open_dir(dirindex)
            with ix.searcher() as searcher:
                words = key.strip().split()
                terms_classified = []
                for word in words:
                    terms = []
                    for desc in ['descripcionECI', 'descripcionMM', 'descripcionFC']:
                        terms.append(Term(desc, word))
                    terms_classified.append(terms)

                subqueries = []
                for t in terms_classified:
                    if type == 'N3':
                        subqueries.append(And(t))
                    else:
                        subqueries.append(Or(t))

                query = subqueries[0]
                if len(subqueries) > 1:
                    if type == 'N1':
                        query = Or(subqueries)
                    else:
                        query = And(subqueries)

                results = searcher.search(query)
                title = "Resultados para: "
                mostrar = True
                if len(results) == 0:
                    title = "No hay resultados para: "
                    mostrar = False

                eci = []
                mm = []
                fc = []
                for r in results:
                    eci.append(Historico_ECI.objects.filter(producto_id=r['ean']).order_by("-fecha")[0])
                    mm.append(Historico_MM.objects.filter(producto_id=r['ean']).order_by("-fecha")[0])
                    fc.append(Historico_FC.objects.filter(producto_id=r['ean']).order_by("-fecha")[0])

                return render(request, 'search.html',
                              {"eci": eci, "mm": mm, 'fc': fc,
                               "title": title + key, "mostrar": mostrar})
    else:
        form = Search_Form()
    return render(request, 'search.html', {'form': form})
def nq(level):
    if level == 0:
        return oq()
    else:
        return Or([nq(level - 1), nq(level - 1), nq(level - 1)])
def make_filter_clause(self, text):
    return Or([self.make_basic_clause(fieldname, text)
               for fieldname in self.fieldboosts.iterkeys()])
def test_replace():
    q = And([Or([Term("a", "b"), Term("b", "c")], boost=1.2),
             Variations("a", "b", boost=2.0)])
    q = q.replace("a", "b", "BB")
    assert_equal(q, And([Or([Term("a", "BB"), Term("b", "c")], boost=1.2),
                         Variations("a", "BB", boost=2.0)]))
def getAnnotations(self, commentText):
    tmpRes = {}
    if commentText == '':
        return []
    procCommText, origIndx = compressStr(commentText, lower=True)
    termArr = procCommText.split()
    # There might be repeating entries
    orgNames = set()

    for qs in range(0, len(termArr), self.queryStride):
        qe = min(qs + self.querySpan, len(termArr))
        q = []
        for i in range(qs, qe - 1):
            if not termArr[i] in STOP_WORDS:
                bigram = And([Term(self.fieldName, termArr[i]),
                              Term(self.fieldName, termArr[i + 1])])
                q.append(bigram)
        # print('@@', ' '.join(termArr[qs:qe]))
        # print('Query: ', q)
        res = self.searcher.search(Or(q), limit=self.topK)
        # print('Found %d results' % len(res))
        for k in range(len(res)):
            if k >= self.topK:
                break
            orgName = res[k][self.fieldName]
            orgNames.add(orgName)

    for orgName in orgNames:
        start = 0
        while start < len(procCommText):
            indx = procCommText.find(orgName, start)
            # print('###', orgName, start, indx)
            if indx == -1:
                break
            assert (indx + len(orgName) <= len(origIndx))
            start = indx + len(orgName)
            # To be a valid match
            startChar = origIndx[indx]
            endChar = origIndx[indx + len(orgName) - 1] + 1
            # TODO additional conditions for spaces!!
            if startChar >= 0 and endChar >= 0:
                if startChar in tmpRes:
                    tmpRes[startChar] = max(tmpRes[startChar], endChar)
                else:
                    tmpRes[startChar] = endChar

    resAnnot = []
    for startChar in tmpRes:
        endChar = tmpRes[startChar]
        resAnnot.append(Annotation(startChar, endChar, 'OrgDict'))

    return resAnnot
def search(self, collector, query_str1=None, query_str2=None, itemtypes=(), highlight=False):
    # rejects '*' and '?'
    if query_str1:
        for kw in (s.strip() for s in query_str1.split()):
            if not kw.replace("*", "").replace("?", "").strip():
                return []

    wildcard = (query_str1 and any(c in query_str1 for c in "*?"))

    parser = self._parser_wild if wildcard else self._parser
    asf_parser = self._asf_parser

    with self._index.searcher() as searcher:
        andlist = []
        try:
            if query_str1:
                andlist.append(parser.parse(query_str1))
            if query_str2:
                andlist.append(asf_parser.parse(query_str2))
        except:
            return []

        if itemtypes:
            if len(itemtypes) > 1:
                andlist.append(Or([Term('itemtype', t) for t in itemtypes]))
            else:
                andlist.append(Term('itemtype', itemtypes[0]))

        query = And(andlist)

        searcher.search_with_collector(query, collector)
        hits = collector.results()

        if highlight:
            hits.fragmenter = WholeFragmenter()
            hits.formatter = HtmlFormatter(tagname='span', classname='s_match',
                                           termclass='s_term')

        if wildcard and query_str1:
            pat = query_str1.replace("-", "").replace(" ", "")
            wildmatch = re.compile(fnmatch.translate(pat))

        # Construct a result list
        results = []
        for hit in hits:
            if collector.aborted:
                return []
            (label, path, prio, sortkey) = hit['data']

            if wildcard and query_str1:
                if not wildmatch.match(sortkey):
                    continue

            if highlight:
                if query_str1:
                    text = hit.highlights('content')
                else:
                    text = hit['content']
            else:
                text = None

            results.append((label, path, sortkey, prio, text))

        sortkey_prio_getter = itemgetter(2, 3)
        results.sort(key=sortkey_prio_getter)

        # Return
        return results
def restrict_query(self, request):
    return Or([And([Term('public', 't'), Term('searchable', 't')]),
               Term('users', request.user.username)] +
              [Term('groups', group.name) for group in request.user.groups.all()])
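# For a hypothetical user "alice" in groups "staff" and "editors", restrict_query
# above evaluates to roughly:
#
#   Or([And([Term('public', 't'), Term('searchable', 't')]),
#       Term('users', 'alice'),
#       Term('groups', 'staff'),
#       Term('groups', 'editors')])
#
# i.e. a document is visible if it is public and searchable, owned by the user, or
# shared with one of the user's groups (example values are hypothetical).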