Example #1
def test_fieldboost():
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=0, a=u("alfa bravo charlie"), b=u("echo foxtrot india"))
    w.add_document(id=1, a=u("delta bravo charlie"), b=u("alfa alfa alfa"))
    w.add_document(id=2, a=u("alfa alfa alfa"), b=u("echo foxtrot india"))
    w.add_document(id=3, a=u("alfa sierra romeo"), b=u("alfa tango echo"))
    w.add_document(id=4, a=u("bravo charlie delta"), b=u("alfa foxtrot india"))
    w.add_document(id=5, a=u("alfa alfa echo"), b=u("tango tango tango"))
    w.add_document(id=6, a=u("alfa bravo echo"), b=u("alfa alfa tango"))
    w.commit()

    def field_booster(fieldname, factor=2.0):
        "Returns a function which will boost the given field in a query tree"
        def booster_fn(obj):
            if obj.is_leaf() and obj.field() == fieldname:
                obj = copy.deepcopy(obj)
                obj.boost *= factor
                return obj
            else:
                return obj
        return booster_fn

    with ix.searcher() as s:
        q = Or([Term("a", u("alfa")), Term("b", u("alfa"))])
        q = q.accept(field_booster("a", 100.0))
        assert_equal(text_type(q), text_type("(a:alfa^100.0 OR b:alfa)"))
        r = s.search(q)
        assert_equal([hit["id"] for hit in r], [2, 5, 6, 3, 0, 1, 4])
Example #2
def test_regular_or():
    ix = get_index()
    with ix.searcher() as s:
        oq = Or([Term("text", "bravo"), Term("text", "alfa")])
        m = oq.matcher(s)
        while m.is_active():
            orig = s.stored_fields(m.id())["text"]
            for span in m.spans():
                v = orig[span.start]
                assert v == "bravo" or v == "alfa"
            m.next()
Example #3
    def matcher(self, searcher, context=None):
        fieldname = self.fieldname
        constantscore = self.constantscore

        reader = searcher.reader()
        qs = [Term(fieldname, word) for word in self._btexts(reader)]
        if not qs:
            return matching.NullMatcher()

        if len(qs) == 1:
            # If there's only one term, just use it
            q = qs[0]
        elif constantscore or len(qs) > self.TOO_MANY_CLAUSES:
            # If there are so many clauses that an Or search would take forever,
            # trade memory for time and just find all the matching docs and
            # serve them as one ListMatcher
            fmt = searcher.schema[fieldname].format
            doc_to_values = defaultdict(list)
            doc_to_weights = defaultdict(float)
            for q in qs:
                m = q.matcher(searcher)
                while m.is_active():
                    docnum = m.id()
                    doc_to_values[docnum].append(m.value())
                    if not constantscore:
                        doc_to_weights[docnum] += m.weight()
                    m.next()

            docnums = sorted(doc_to_values.keys())
            # This is a list of lists of value strings -- ListMatcher will
            # actually do the work of combining multiple values if the user
            # asks for them
            values = [doc_to_values[docnum] for docnum in docnums]

            kwargs = {"values": values, "format": fmt}
            if constantscore:
                kwargs["all_weights"] = self.boost
            else:
                kwargs["weights"] = [doc_to_weights[docnum]
                                     for docnum in docnums]

            #return matching.ListMatcher(docnums, term=term, **kwargs)
            return matching.ListMatcher(docnums, **kwargs)
        else:
            # The default case: Or the terms together
            from whoosh.query import Or
            q = Or(qs)

        m = q.matcher(searcher, context)
        #m = matching.SingleTermMatcher(m, term)
        return m
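
The method above is essentially the expansion step behind Whoosh's multiterm queries (Prefix, Wildcard, FuzzyTerm, ...): enumerate the matching terms from the reader, then match them together. A minimal sketch of that behavior from the outside (schema and sample documents are assumptions):

from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.query import Prefix

schema = Schema(a=TEXT(stored=True))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(a=u"alfa bravo")
w.add_document(a=u"alfalfa charlie")
w.commit()

with ix.searcher() as s:
    # Prefix expands to the indexed terms starting with "alf" ("alfa", "alfalfa")
    print([hit["a"] for hit in s.search(Prefix("a", u"alf"))])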
Example #4
    def parse(self, input, normalize=True):
        reqs, opts, nots, phrase = self._sort(self._split(input))
        make_clause = self.make_clause
        make_filter_clause = self.make_filter_clause

        reqs = [make_clause(text) for text in reqs]
        opts = [make_clause(text) for text in opts]
        nots = [make_filter_clause(text) for text in nots]

        pctmatch = int((len(reqs) + len(opts)) * self.minpercent) - len(reqs)
        minmatch = max(pctmatch, self.minmatch - len(reqs), 0)

        q = Or(opts, minmatch=minmatch)
        if reqs: q = AndMaybe(And(reqs), q)
        if nots: q = AndNot(q, Or(nots))

        if normalize:
            q = q.normalize()
        return q
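
The minmatch arithmetic is easiest to see with concrete numbers (hypothetical clause counts):

# 1 required clause, 4 optional clauses, minpercent=0.6, configured minmatch=0:
#   pctmatch = int((1 + 4) * 0.6) - 1 = 2
#   minmatch = max(2, 0 - 1, 0)       = 2
# i.e. the Or over the optional clauses only contributes once at least 2 of
# its 4 subqueries match.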
Example #5
def search_whoosh_index_headline(query, paths):
  if not paths:
    return []
  ix = get_whoosh_index()
  parser = MultifieldParser(['content', 'title', 'abstract'], ix.schema)
  q = parser.parse(query)

  allow_q = Or([Term('path', path) for path in paths])

  res = []

  with ix.searcher() as searcher:
    results = searcher.search(q, filter=allow_q, limit=len(paths), terms=True)
    for hit in results:
      res.append({
        # 'title': hit['title'],
        'short_url': hit['path'],
        'highlights': u' [...] '.join(filter(None, [
          hit.highlights("title", top=5),
          hit.highlights("abstract", top=5),
          hit.highlights("content", top=5),
        ]))
      })

  return res
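
Because the search passes terms=True, each hit also records which query terms matched it; a small aside (inside the same searcher block; output shape illustrative):

# for hit in results:
#   print(hit.matched_terms())  # e.g. [('content', b'...'), ('title', b'...')]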
Example #6
def validate_name(meta, itemid):
    """
    Check whether the names are valid.
    Will just return, if they are valid, will raise a NameNotValidError if not.
    """
    names = meta.get(NAME)
    current_namespace = meta.get(NAMESPACE)
    if current_namespace is None:
        raise NameNotValidError(L_("No namespace field in the meta."))
    namespaces = [namespace.rstrip('/') for namespace, _ in app.cfg.namespace_mapping]

    if len(names) != len(set(names)):
        msg = L_("The names in the name list must be unique.")
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)
    # Item names must not start with '@' or '+'; '@something' denotes a field, whereas '+something' denotes a view.
    invalid_names = [name for name in names if name.startswith(('@', '+'))]
    if invalid_names:
        msg = L_("Item names (%(invalid_names)s) must not start with '@' or '+'", invalid_names=", ".join(invalid_names))
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)

    namespaces = namespaces + NAMESPACES_IDENTIFIER  # Also don't allow item names to match identifier namespaces.
    # Item names must not match with existing namespaces.
    invalid_names = [name for name in names if name.split('/', 1)[0] in namespaces]
    if invalid_names:
        msg = L_("Item names (%(invalid_names)s) must not match with existing namespaces.", invalid_names=", ".join(invalid_names))
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)
    query = And([Or([Term(NAME, name) for name in names]), Term(NAMESPACE, current_namespace)])
    # There should be no existing item with the same name.
    if itemid is not None:
        query = And([query, Not(Term(ITEMID, itemid))])  # search for items except the current item.
    with flaskg.storage.indexer.ix[LATEST_REVS].searcher() as searcher:
        results = searcher.search(query)
        duplicate_names = {name for result in results for name in result[NAME] if name in names}
        if duplicate_names:
            msg = L_("Item(s) named %(duplicate_names)s already exist.", duplicate_names=", ".join(duplicate_names))
            flash(msg, "error")  # duplicate message at top of form
            raise NameNotValidError(msg)
Example #7
    def get_subitem_revs(self):
        """
        Create a list of subitems of this item.

        Subitems are in the form of storage Revisions.
        """
        query = And([
            Term(WIKINAME, app.cfg.interwikiname),
            Term(NAMESPACE, self.fqname.namespace)
        ])
        # trick: an item with an empty name can be considered a "virtual root item"
        # that has all wiki items as subitems
        if self.names:
            query = And([
                query,
                Or([
                    Prefix(NAME_EXACT, prefix)
                    for prefix in self.subitem_prefixes
                ])
            ])
        revs = flaskg.storage.search(query, sortedby=NAME_EXACT, limit=None)
        return revs
Example #8
    def search(self,
               q_str: str,
               in_chats: Optional[List[int]],
               page_len: int,
               page_num: int = 1) -> SearchResult:
        q = self.query_parser.parse(q_str)
        with self.ix.searcher() as searcher:
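            # Note: when in_chats is falsy (e.g. None), the `and` below
            # short-circuits and q_filter stays falsy instead of becoming an Or query.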
            q_filter = in_chats and Or(
                [Term('chat_id', str(chat_id)) for chat_id in in_chats])
            result_page = searcher.search_page(q,
                                               page_num,
                                               page_len,
                                               filter=q_filter,
                                               sortedby='post_time',
                                               reverse=True)

            hits = [
                SearchHit(IndexMsg(**msg),
                          self.highlighter.highlight_hit(msg, 'content'))
                for msg in result_page
            ]
            return SearchResult(hits, result_page.is_last_page(),
                                result_page.total)
Example #9
def get_subscribers(**meta):
    """ Get all users that are subscribed to the item

    :param meta: key/value pairs from item metadata - itemid, name, namespace, tags keys
    :return: a set of Subscriber objects
    """
    itemid = meta.get(ITEMID)
    name = meta.get(NAME)
    namespace = meta.get(NAMESPACE)
    fqname = CompositeName(namespace, ITEMID, itemid)
    tags = meta.get(TAGS)
    terms = []
    if itemid is not None:
        terms.extend([Term(SUBSCRIPTION_IDS, u"{0}:{1}".format(ITEMID, itemid))])
    if namespace is not None:
        if name is not None:
            terms.extend(Term(SUBSCRIPTION_IDS, u"{0}:{1}:{2}".format(NAME, namespace, name_))
                         for name_ in name)
        if tags is not None:
            terms.extend(Term(SUBSCRIPTION_IDS, u"{0}:{1}:{2}".format(TAGS, namespace, tag))
                         for tag in tags)
    query = Or(terms)
    with flaskg.storage.indexer.ix[LATEST_REVS].searcher() as searcher:
        result_iterators = [searcher.search(query, limit=None), ]
        subscription_patterns = searcher.lexicon(SUBSCRIPTION_PATTERNS)
        patterns = get_matched_subscription_patterns(subscription_patterns, **meta)
        result_iterators.extend(searcher.documents(subscription_patterns=pattern) for pattern in patterns)
        subscribers = set()
        for user in chain.from_iterable(result_iterators):
            email = user.get(EMAIL)
            if email:
                from moin.user import User
                u = User(uid=user.get(ITEMID))
                if u.may.read(fqname):
                    locale = user.get(LOCALE, DEFAULT_LOCALE)
                    subscribers.add(Subscriber(user[ITEMID], user[NAME][0], email, locale))
    return subscribers
Example #10
    def more_like(self, pk, source, top=5):
        """Find similar units."""
        index = self.get_source_index()
        with index.searcher() as searcher:
            # Extract key terms
            kts = searcher.key_terms_from_text('source',
                                               source,
                                               numterms=10,
                                               normalize=False)
            # Create an Or query from the key terms
            query = Or(
                [Term('source', word, boost=weight) for word, weight in kts])

            # Grab fulltext results
            results = [(h['pk'], h.score)
                       for h in searcher.search(query, limit=top)]
            if not results:
                return []
            # Normalize scores to 0-100
            max_score = max([h[1] for h in results])
            scores = {h[0]: h[1] * 100 / max_score for h in results}

            # Filter results with score above 50 and not current unit
            return [h[0] for h in results if scores[h[0]] > 50 and h[0] != pk]
Example #11
    def more_like(self, pk, source, top=5):
        """Find similar units."""
        index = self.get_source_index()
        with index.searcher() as searcher:
            # Extract key terms
            kts = searcher.key_terms_from_text(
                'source', source, numterms=10, normalize=False
            )
            # Create an Or query from the key terms
            query = Or([Term('source', word, boost=weight) for word, weight in kts])
            LOGGER.debug('more like query: %r', query)

            # Grab fulltext results
            results = [(h['pk'], h.score) for h in searcher.search(query, limit=top)]
            LOGGER.debug('found %d matches', len(results))
            if not results:
                return []

            # Filter bad results
            threshold = max((h[1] for h in results)) / 2
            results = [h[0] for h in results if h[1] > threshold]
            LOGGER.debug('filter %d matches over threshold %d', len(results), threshold)

            return results
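
key_terms_from_text() scores terms from the given text against the index's statistics; a minimal self-contained sketch (schema, field name, and sample documents are assumptions):

from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.query import Or, Term

schema = Schema(source=TEXT(stored=True))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(source=u"hello world")
w.add_document(source=u"hello whoosh world")
w.commit()

with ix.searcher() as s:
    kts = s.key_terms_from_text('source', u"hello world", numterms=5)
    q = Or([Term('source', word, boost=weight) for word, weight in kts])
    print([hit['source'] for hit in s.search(q)])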
Example #12
    def parse(self, input, normalize=True):
        reqs, opts, nots, phrase = self._sort(self._split(input))
        make_clause = self.make_clause
        make_filter_clause = self.make_filter_clause

        reqs = [make_clause(text) for text in reqs]
        opts = [make_clause(text) for text in opts]
        nots = [make_filter_clause(text) for text in nots]

        pctmatch = int((len(reqs) + len(opts)) * self.minpercent) - len(reqs)
        minmatch = max(pctmatch, self.minmatch - len(reqs), 0)

        q = Or(opts, minmatch=minmatch)
        if reqs: q = AndMaybe(And(reqs), q)
        if nots: q = AndNot(q, Or(nots))

        if normalize:
            q = q.normalize()
        return q
Example #13
    def __call__(self):

        command = self.request.matchdict['command']
        params = self.request.params

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        if command == 'namesearch':

            search_index = self.request.registry.settings['whoosh_index']
            query_parser = self.request.registry.settings['query_parser']

            with search_index.searcher() as searcher:

                query = query_parser.parse(params['term'])

                # allowable = Or([Term(u'item_type', u'species'), Term(u'item_type', u'climate')])
                # allowable = Or([Term(u'item_type', u'species')])
                allowable = Or([
                    Term(u'item_type', u'species'),
                    Term(u'item_type', u'refugia'),
                    Term(u'item_type', u'aoc'),
                    Term(u'item_type', u'richness')
                ])

                results = searcher.search(query, filter=allowable)

                matches = {}

                for result in results:
                    matches[result['nice_name']] = {
                        "type": result['item_type'],
                        "path": result['item_path'],
                        "mapId": result['item_id']
                    }

            json_content = json.dumps(matches)
            return Response(body=json_content, content_type='application/json')
        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        if command == 'mapsearch':

            es = self.request.registry.settings['search_conn']

            allowable = ['species', 'refugia', 'aoc', 'richness']
            query = {
                "query": {
                    "bool": {
                        "must": {
                            "match": {
                                "nice_name": {
                                    "query": params['term'],
                                    "operator": "and"
                                }
                            }
                        },
                        "filter": {
                            "terms": {
                                "item_type": allowable
                            }
                        }
                    }
                },
                "from": 0,
                "size": 15
            }

            results = es.search(index='wallace', doc_type='map', body=query)

            matches = {}
            for result in results['hits']['hits']:
                doc = result['_source']
                matches[doc['nice_name']] = {
                    "type": doc['item_type'],
                    "path": doc['item_path'],
                    "mapId": doc['item_id']
                }

            json_content = json.dumps(matches)
            return Response(body=json_content, content_type='application/json')

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        elif command == 'preplayer':

            gs_user = self.request.registry.settings['climas.gs_user']
            gs_pass = self.request.registry.settings['climas.gs_pass']

            # ==== what's the map they want?

            map_type = params['info[type]']
            map_path = params['info[path]']
            map_id = params['info[mapId]']
            map_projection = params['proj']

            if map_type == 'species':
                path_to_map_tif = ''.join([
                    "file:///rdsi/wallace2/W2_website/species/", map_path,
                    "/summaries_temperature/", map_projection, ".tif"
                ])

            else:
                # all the summary map types share the same path
                path_to_map_tif = ''.join([
                    "file:///rdsi/wallace2/W2_website/", map_path, "/",
                    map_projection, ".tif"
                ])

            coverage_name = '@'.join(
                [map_type, map_id.replace(' ', '_'), map_projection])

            # TODO: remove this debug
            print(path_to_map_tif)

            # ==== insert that map into geoserver

            # todo: put this into a timeout somehow
            poke = requests.put(
                "http://wallace-maps.hpc.jcu.edu.au/geoserver/rest/workspaces/wallace/coveragestores/"
                + coverage_name + "/external.geotiff",
                data=path_to_map_tif,
                auth=(gs_user, gs_pass))
            poke = requests.post(
                "http://wallace-maps.hpc.jcu.edu.au/geoserver/rest/workspaces/wallace/coveragestores/"
                + coverage_name + "/coverages",
                data="<coverage><name>" + coverage_name +
                "</name><nativeName>" + map_projection +
                "</nativeName></coverage>",
                auth=(gs_user, gs_pass),
                headers={'Content-type': 'text/xml'})

            # ==== return the WMS url for that layer

            if (poke.ok or 'already exists' in poke.text):
                result = {
                    "ok": True,
                    "mapUrl":
                    u"http://wallace-maps.hpc.jcu.edu.au/geoserver/wallace/wms",
                    "layerName": u"wallace:" + coverage_name
                }

                json_content = json.dumps(result)
                return Response(body=json_content,
                                content_type='application/json')

            json_content = json.dumps({
                "ok": False,
                "status_code": poke.status_code,
                "status_reason": poke.reason,
                "result": poke.text
            })

            # if we haven't returned yet, our layer poke didn't work
            return Response(status_code=500,
                            body=json_content,
                            content_type='application/json')
Example #14
    def __recs_query(self, positive_rated_document_list: list, rated_document_list: list,
                     scores: list, recs_number: int, items_directory: str, candidate_list: list) -> pd.DataFrame:
        """
        Builds a query using the contents that the user liked. The terms from the contents that
        the user liked are boosted by the rating he/she gave. A filter clause is added to the query to
        consider only candidate items
        Args:
            positive_rated_document_list (list): list of contents that the user liked
            rated_document_list (list): list of all the contents that the user rated
            scores (list): ratings given by the user
            recs_number (int): how many items must be recommended. Only the number can be specified, not
                a specific item for which to compute the prediction
            items_directory (str): directory where the items are stored
            candidate_list (list): list of the items that can be recommended, if None
                all unrated items will be used

        Returns:
            score_frame (pd.DataFrame): dataFrame containing the recommendations for the user
        """

        ix = open_dir(items_directory)
        with ix.searcher(weighting=scoring.TF_IDF if self.__classic_similarity else scoring.BM25F) as searcher:

            # Initializes user_docs, a dictionary that has the document as key and
            # another dictionary as value. The inner dictionary has the field name as key
            # and its contents as value. By doing so we obtain the data of the fields while
            # also keeping track of the field and the document where it was found
            field_list = None
            user_docs = {}
            for doc in positive_rated_document_list:
                user_docs[doc] = dict()
                field_list = searcher.stored_fields(doc)
                for field_name in field_list:
                    if field_name == 'content_id':
                        continue
                    user_docs[doc][field_name] = field_list[field_name]

            logger.info("Building query")

            # For each field of each document one string (containing the name of the field and the data in it)
            # is created and added to the query.
            # Also each part of the query that refers to a document
            # is boosted by the score given by the user to said document
            string_query = "("
            for doc, score in zip(user_docs.keys(), scores):
                string_query += "("
                for field_name in field_list:
                    if field_name == 'content_id':
                        continue
                    word_list = user_docs[doc][field_name].split()
                    string_query += field_name + ":("
                    for term in word_list:
                        string_query += term + " "
                    string_query += ") "
                string_query += ")^" + str(score) + " "
            string_query += ") "

            # The requirement of retrieved documents to be in a candidate list (if passed) is added
            # by building a query for the content id of said documents.
            # Also the query containing all the content ids for the documents that the user rated
            # is created.
            # Both these queries will be used by the index searcher
            candidate_query_list = None
            rated_query_list = []

            for document in rated_document_list:
                rated_query_list.append(Term("content_id", document))
            rated_query_list = Or(rated_query_list)

            if candidate_list is not None:
                candidate_query_list = []
                for candidate in candidate_list:
                    candidate_query_list.append(Term("content_id", candidate))
                candidate_query_list = Or(candidate_query_list)

            # The filter and mask arguments of the index searcher are used respectively
            # to find only candidate documents and to ignore documents rated by the user
            schema = ix.schema
            query = QueryParser("content_id", schema=schema, group=qparser.OrGroup).parse(string_query)
            score_docs = searcher.search(query, limit=recs_number, filter=candidate_query_list, mask=rated_query_list)

            logger.info("Building score frame to return")

            # Builds the recommendation frame. Items in the candidate list or rated by the user
            # were already filtered previously by the index searcher
            columns = ['to_id', 'rating']
            score_frame = pd.DataFrame(columns=columns)
            for result in score_docs:
                item_id = result["content_id"]

                score_frame = pd.concat([
                    score_frame, pd.DataFrame.from_records([(item_id, result.score)], columns=columns)])

        return score_frame
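
For reference, the string query assembled above has roughly this shape for two rated documents (field names, terms, and scores are illustrative):

# "( (Plot:(space opera ) Genre:(sci-fi ) )^4.5 (Plot:(dull plot ) Genre:(drama ) )^1.0 ) "
# The OrGroup parser turns each parenthesised, ^-boosted group into a clause
# weighted by the user's rating for that document.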
Example #15
def search_doc(directory,
               word,
               doc_types,
               num_page=1,
               num_by_page=10,
               show_num_results=True):
    """
    * -------------{Function}---------------
    * Returns a list of docs that contains a given set of words that matches a g
    * -------------{returns}----------------
    * {set} query results . . . 
    * -------------{params}-----------------
    * : directory -> path of the index
    * : word -> words to query
    * : doc_types -> type of doc to search
    * : num_page -> number of pages to search
    * : show_num_results -> number of results to return
    """
    index_schema = load_index(directory)
    doctypes_schema = load_doctypes_schema(directory)

    # Retrieves the fields to search from the doctypes schema
    fields_to_search = []
    for doc_type in doc_types:
        doc_type = doc_type.lower()
        try:
            schema = doctypes_schema[doc_type]
            fields_to_search = fields_to_search + schema
        except KeyError:
            logger.warning(
                "Schema not found for {doc_type}".format(doc_type=doc_type))

    # By default we search "content" (for BC) and "tags"
    fields = ["content", "tags"] + fields_to_search
    logger.info(
        "search will be performed on fields {fields}".format(fields=fields))

    # Creates the query parser
    # MultifieldParser allows search on multiple fields
    # We use a custom FuzzyTerm class to set the Levenshtein distance to 2
    parser = MultifieldParser(fields,
                              schema=doctypes_schema,
                              termclass=CustomFuzzyTerm)
    query = parser.parse(word)

    # Creates a filter on the doctype field
    doctype_filter_matcher = []
    for doc_type in doc_types:
        term = FuzzyTerm("doc_type", doc_type.lower(), 1.0, 2)
        doctype_filter_matcher.append(term)

    doc_type_filter = Or(doctype_filter_matcher)

    # Processes the search (queries the index)
    with index_schema.searcher() as searcher:
        results = searcher.search_page(query,
                                       num_page,
                                       pagelen=num_by_page,
                                       filter=doc_type_filter)
        results_id = [result["doc_id"] for result in results]
        logger.info("Results: {results_id}".format(results_id=results_id))

        # Ensures BC if the number of results is not requested
        if show_num_results:
            return {"ids": results_id, "num_results": len(results)}
        else:
            return {"ids": results_id}
Example #16
    def search(self, query):
        with self.index.searcher() as searcher:
            terms = [FuzzyTerm("content", word, maxdist=2) for word in query]
            search_query = Or(terms)
            results = searcher.search(search_query)
            return [result["filename"] for result in results]
Example #17
def test_wildcard():
    _run_query(Or([Wildcard('value', u('*red*')), Wildcard('name', u('*yellow*'))]),
                    [u("A"), u("C"), u("D"), u("E")])
    # Missing
    _run_query(Wildcard('value', 'glonk*'), [])
Example #18
    def query(self,
              string_query: str,
              results_number: int,
              mask_list: list = None,
              candidate_list: list = None,
              classic_similarity: bool = True) -> dict:
        """
        Uses a search index to query the index in order to retrieve specific contents using a query expressed in string
        form

        Args:
            string_query (str): query expressed as a string
            results_number (int): number of results the searcher will return for the query
            mask_list (list): list of content_ids of items to ignore in the search process
            candidate_list (list): list of content_ids of items to consider in the search process,
                if it is not None only items in the list will be considered
            classic_similarity (bool): if True, classic tf idf is used for scoring, otherwise BM25F is used

        Returns:
            results (dict): the final results dictionary containing the results found from the search index for the
                query. The dictionary will be in the following form:

                    {content_id: {"item": item_dictionary, "score": item_score}, ...}

                content_id is the content_id for the corresponding item
                item_dictionary is the dictionary of the item containing the fields as keys and the contents as values.
                So it will be in the following form: {"Plot": "this is the plot", "Genre": "this is the Genre"}
                The item_dictionary will not contain the content_id since it is already defined and used as key of the
                external dictionary
                items_score is the score given to the item for the query by the index searcher
        """
        ix = open_dir(self.directory)
        with ix.searcher(
                weighting=TF_IDF if classic_similarity else BM25F) as searcher:
            candidate_query_list = None
            mask_query_list = None

            # the mask list contains the content_id for the items to ignore in the searching process
            # from the mask list a mask query is created and it will be used by the searcher
            if mask_list is not None:
                mask_query_list = []
                for document in mask_list:
                    mask_query_list.append(Term("content_id", document))
                mask_query_list = Or(mask_query_list)

            # the candidate list contains the content_id for the items to consider in the searching process
            # from the candidate list a candidate query is created and it will be used by the searcher
            if candidate_list is not None:
                candidate_query_list = []
                for candidate in candidate_list:
                    candidate_query_list.append(Term("content_id", candidate))
                candidate_query_list = Or(candidate_query_list)

            schema = ix.schema
            parser = QueryParser("content_id", schema=schema, group=OrGroup)
            # regular expression to match the possible field styles
            # examples: "content_id" or "Genre#2" or "Genre#2#custom_id"
            parser.add_plugin(
                FieldsPlugin(r'(?P<text>[\w-]+(\#[\w-]+(\#[\w-]+)?)?|[*]):'))
            query = parser.parse(string_query)
            score_docs = \
                searcher.search(query, limit=results_number, filter=candidate_query_list, mask=mask_query_list)

            # Build the results dictionary. This step is necessary because the Hit objects returned
            # by the searcher need the reader inside the search index in order to return information,
            # so it would be impossible to access a field or the score of an item from outside this
            # method; hence this dictionary containing the most important info is created
            results = {}
            for hit in score_docs:
                hit_dict = dict(hit)
                content_id = hit_dict.pop("content_id")
                results[content_id] = {}
                results[content_id]["item"] = hit_dict
                results[content_id]["score"] = hit.score
            return results
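
A hypothetical call, since the surrounding class and its constructor are not shown in this example:

# results = search_index.query("Plot:(space adventure)", results_number=10,
#                              mask_list=["tt0000001"], candidate_list=None,
#                              classic_similarity=True)
# for content_id, data in results.items():
#     print(content_id, data["score"])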
Example #19
def advanced_query(parameters, page = 0, n = 10):
    """
    :param dict text_parameters: a dictionary of field-to-query pairs specifying text-based queries.
        this is good for fields like "rules_text", "name", "flavor_text", etc.
    :param dict range_parameters: a dictionary of field to range pairs.
        this is good for fields like power, toughness, cmc, etc.
    :param dict point_parameters: a dictionary of field to value parameters. Every card in the return
        set must have an exact match to every value in the dict.
        For example, if point_parameters is {'cmc': 5} then for every card in the set card.cmc == 5 must evaluate to true.
        .. warning::
            using this parameter will cause the query system to filter through whoosh results, slowing down computation.
    :param int page: the 'page' of results to return
    :param int n: the number of results per page.
    :return: Exact class TBD, will provide way to iterate over the page's worth of results.
    """
    import whoosh.fields
    from whoosh.query import And, Or
    schema = get_whoosh_index().schema

    # fix `page` and `n` (they may be string versions of ints)
    page = int(page)
    n = int(n)

    # After talking with Ben it sounds like we can do something to the effect
    # of taking multiple sub queries and perform unions and intersections on their
    # results
    # This is going to be the best way to get the desired results.

    # to start: build a list of all the query objects we'll be searching.
    query_objs = []
    for field, target in parameters.items():
        # Coerce potential numeric point queries to whoosh syntax.
        if isinstance(target, float):
            target = int(target+0.5)
        if isinstance(target, int):
            target = str(target)
            #target = f"{{{target-1} TO {target+1}}}"
            #target = target.replace("[ TO", "[TO").replace("TO ]", "TO]")

        # Coerce range queries to whoosh syntax, assume they're inclusive bounds.
        if isinstance(target, (list, tuple)):
            if len(target) != 2:
                raise ValueError(f"Unable to treat parameter as range query! ({target})")
            target = f"[{target[0] if target[0] != -1 else ''} TO {target[1] if target[1] != -1 else ''}]"

            # whoosh has issues if there's an open ended range with a space separating TO from the bracket:
            target = target.replace("[ TO", "[TO").replace("TO ]", "TO]")

        # the comma-separated KEYWORD fields have been giving us some issues:
        # whoosh seems inconsistent about commas in these fields,
        # so we'll add two subqueries, one with a comma and one without.
        if field in schema and isinstance(schema[field], whoosh.fields.KEYWORD):
            # add the extra query object:
            subqueries = [QueryParser(field, schema).parse(target.lower()+','),
                          QueryParser(field, schema).parse(target.lower())]
            query_objs.append(Or(subqueries))

        else:
            query_objs.append(QueryParser(field, schema).parse(target.lower())) # again, lower capitalization on everything

    if not len(query_objs):
        return []

    # now build a nice big compound query:
    query = And(query_objs)
    with get_whoosh_index().searcher() as searcher:
        # run that query and return the appropriate results page.
        try:
            results = searcher.search_page(query, page+1, n)
        except Exception:
            print(repr(query))
            raise

        return [x['data_obj'] for x in results]
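
To make the coercion above concrete (hypothetical parameter values): parameters={'cmc': (2, 5)} is turned into the whoosh range string "[2 TO 5]", while an open upper bound such as (2, -1) becomes "[2 TO]" after the whitespace fix-up.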
Example #20
def test_not():
    _run_query(Or([Term("value", u("red")), Term("name", u("yellow")), Not(Term("name", u("quick")))]),
                    [u("A"), u("E")])
Example #21
def oq():
    return Or([Term("a", u("a")), Term("a", u("b"))])
Example #22
def test_query_copy_hash():
    def do(q1, q2):
        q1a = copy.deepcopy(q1)
        assert_equal(q1, q1a)
        assert_equal(hash(q1), hash(q1a))
        assert_not_equal(q1, q2)

    do(Term("a", u("b"), boost=1.1), Term("a", u("b"), boost=1.5))
    do(And([Term("a", u("b")), Term("c", u("d"))], boost=1.1),
       And([Term("a", u("b")), Term("c", u("d"))], boost=1.5))
    do(Or([Term("a", u("b"), boost=1.1), Term("c", u("d"))]),
       Or([Term("a", u("b"), boost=1.8), Term("c", u("d"))], boost=1.5))
    do(DisjunctionMax([Term("a", u("b"), boost=1.8), Term("c", u("d"))]),
       DisjunctionMax([Term("a", u("b"), boost=1.1), Term("c", u("d"))],
                      boost=1.5))
    do(Not(Term("a", u("b"), boost=1.1)), Not(Term("a", u("b"), boost=1.5)))
    do(Prefix("a", u("b"), boost=1.1), Prefix("a", u("b"), boost=1.5))
    do(Wildcard("a", u("b*x?"), boost=1.1), Wildcard("a", u("b*x?"),
                                                     boost=1.5))
    do(FuzzyTerm("a", u("b"), constantscore=True),
       FuzzyTerm("a", u("b"), constantscore=False))
    do(FuzzyTerm("a", u("b"), boost=1.1), FuzzyTerm("a", u("b"), boost=1.5))
    do(TermRange("a", u("b"), u("c")), TermRange("a", u("b"), u("d")))
    do(TermRange("a", None, u("c")), TermRange("a", None, None))
    do(TermRange("a", u("b"), u("c"), boost=1.1),
       TermRange("a", u("b"), u("c"), boost=1.5))
    do(TermRange("a", u("b"), u("c"), constantscore=True),
       TermRange("a", u("b"), u("c"), constantscore=False))
    do(NumericRange("a", 1, 5), NumericRange("a", 1, 6))
    do(NumericRange("a", None, 5), NumericRange("a", None, None))
    do(NumericRange("a", 3, 6, boost=1.1), NumericRange("a", 3, 6, boost=1.5))
    do(NumericRange("a", 3, 6, constantscore=True),
       NumericRange("a", 3, 6, constantscore=False))
    # do(DateRange)
    do(Variations("a", u("render")), Variations("a", u("renders")))
    do(Variations("a", u("render"), boost=1.1),
       Variations("a", u("renders"), boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")]),
       Phrase("a", [u("b"), u("c"), u("e")]))
    do(Phrase("a", [u("b"), u("c"), u("d")], boost=1.1),
       Phrase("a", [u("b"), u("c"), u("d")], boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")], slop=1),
       Phrase("a", [u("b"), u("c"), u("d")], slop=2))
    # do(Ordered)
    do(Every(), Every("a"))
    do(Every("a"), Every("b"))
    do(Every("a", boost=1.1), Every("a", boost=1.5))
    do(NullQuery, Term("a", u("b")))
    do(ConstantScoreQuery(Term("a", u("b"))),
       ConstantScoreQuery(Term("a", u("c"))))
    do(ConstantScoreQuery(Term("a", u("b")), score=2.0),
       ConstantScoreQuery(Term("a", u("c")), score=2.1))
    do(Require(Term("a", u("b")), Term("c", u("d"))),
       Require(Term("a", u("b"), boost=1.1), Term("c", u("d"))))
    # do(Require)
    # do(AndMaybe)
    # do(AndNot)
    # do(Otherwise)

    do(SpanFirst(Term("a", u("b")), limit=1), SpanFirst(Term("a", u("b")),
                                                        limit=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d"))),
       SpanNear(Term("a", u("b")), Term("c", u("e"))))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), slop=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), slop=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=True),
       SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=False))
    do(SpanNot(Term("a", u("b")), Term("a", u("c"))),
       SpanNot(Term("a", u("b")), Term("a", u("d"))))
    do(SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("d"))]),
       SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("e"))]))
    do(SpanContains(Term("a", u("b")), Term("a", u("c"))),
       SpanContains(Term("a", u("b")), Term("a", u("d"))))
Example #23
# Query 3: USA JPN 
parser = QueryParser("strong_hashtags", index.schema)
query = parser.parse("USA JPN")
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

# Query 4: RTs about USA
parser = QueryParser("tweet_text", index.schema)
query = parser.parse("RT USA")
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

# Query 5: score 
query = Or([Term("strong_hastags","FIFAWWC"),Term("tweet_text","score")])
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])


#############################################################################
"""            Upload Whoosh index directory to S3 bucket                 """

# version to upload to Github will have AWS keys removed
AWS_KEY = ''
AWS_SECRET = ''

from boto.s3.connection import S3Connection
conn = S3Connection(AWS_KEY, AWS_SECRET)    
bucket = conn.create_bucket('w205_assignment4')
Example #24
def do_search(event):
    print("Algoritmo di ranking:" + str(app._model.get()))
    og = qparser.OrGroup.factory(0.9)
    if not app.txtEntry.get():
        return False
    query = app.txtEntry.get()
    regex = r'((?:article|incollection|inproceedings|phdthesis|mastersthesis|publication|venue)(?:.(?:(?:author|title|year|publisher)))?:\s?(?:\*{0,1}\w*\*{0,1})?(?:\".+?\")?)'
    # Find all field-specific searches and collect them in a list
    queries = re.findall(regex, query)
    spec_query = len(queries)
    # Strip the searches found above from the query; only phrase searches
    # and single words remain
    query = re.sub(regex, ' ', query).strip()
    if len(query) > 0:
        # Find all phrase searches (the ones between double quotes) and add
        # them to the list built above
        queries = queries + re.findall(r'".+?"', query)
        # Strip the phrase searches from the query; only single words remain
        query = re.sub(r'".+?"', '', query)
        # Remove the redundant whitespace left over from the substitutions above
        query = re.sub(r'\s+', ' ', query).strip()
        # Split the query on spaces to get the single words and add them to the list
        queries = queries + query.split(' ')
    try:
        queries.remove('')
    except Exception:
        pass
    queries_for_publi = list()
    queries_for_venue = list()
    ven_query = " AND (tag:book OR tag:proceedings)"
    tag_query = " AND (tag:article OR tag:incollection OR tag:phdthesis OR tag:mastersthesis OR tag:inproceedings)"
    start_time = datetime.datetime.now()
    for split in queries[:spec_query]:
        split = split.split(':')
        dotted = split[0].split('.')
        tag = dotted[0]
        if len(dotted) > 1:
            field = dotted[1]
        else:
            field = 0
        if tag in publi:
            tag_query = " AND (tag:" + tag + ")"
        else:
            tag_query = " AND (tag:article OR tag:incollection OR tag:phdthesis OR tag:mastersthesis OR tag:inproceedings)"
        q = qparser.QueryParser(field, schema=schema_publi, group=og)
        if tag in (publi + ['publication']) and field in ['author', 'title', 'year']:
            queries_for_publi.append(q.parse(split[1] + tag_query))
        elif tag == 'venue' and field in ['title', 'publisher']:
            queries_for_venue.append(q.parse(split[1] + ven_query))
        elif not field:
            if tag == 'venue':
                qv = MultifieldParser(terms_venue, schema=schema_venue)
                queries_for_venue.append(qv.parse(split[1] + ven_query))
            else:
                qp = MultifieldParser(terms_publi, schema=schema_publi)
                queries_for_publi.append(qp.parse(split[1] + tag_query))
    if queries[spec_query:]:  # If any non-field-specific terms remain
        qv = MultifieldParser(terms_venue, schema=schema_venue)
        qp = MultifieldParser(terms_publi, schema=schema_publi)
        for split in queries[spec_query:]:
            queries_for_venue.append(qv.parse(split + ven_query))
            queries_for_publi.append(qp.parse(split + tag_query))

    app.venue_results = [y for y in app.get_vx_searcher().search(Or([x for x in queries_for_venue]), limit=None)]
    app.publi_results = [y for y in app.get_px_searcher().search(Or([x for x in queries_for_publi]), limit=None)]

    print("SEARCHING: " + str(datetime.datetime.now() - start_time))
    start_time = datetime.datetime.now()
    app.len_publi = len(app.publi_results)
    app.len_venue = len(app.venue_results)
    print("PUBLICATION LEN:" + str(app.len_publi))
    print("VENUE LEN:" + str(app.len_venue))
    if app.venue_results and app.publi_results:
        app.results, app.indice_t = threshold_2(list(), app.publi_results, app.venue_results, app.len_publi,
                                                app.len_venue)
    print("THRESHOLD: " + str(datetime.datetime.now() - start_time))
    app.listNodes.delete(0, 'end')
    app.listSelection.delete(0, 'end')

    if not app.publi_results or not app.venue_results:
        app.counter = 0
        app.results = app.publi_results + app.venue_results
        for hit in app.results[:10]:
            if len(hit.get("title")) > 50:
                app.listNodes.insert(END, hit.get("title")[0:50] + "...")
            else:
                app.listNodes.insert(END, hit.get("title"))

        app.listNodes.bind('<<ListboxSelect>>', onselect)
    else:
        for hit in app.results:
            l = len(hit.pub.get("title"))
            if not l:
                if len(hit.venue.get("title")) > 50:
                    app.listNodes.insert(END, hit.venue.get("title")[:50] + "...")
                else:
                    app.listNodes.insert(END, hit.venue.get("title"))
            elif l > 50:
                app.listNodes.insert(END, hit.pub.get("title")[0:50] + "...")
            else:
                app.listNodes.insert(END, hit.pub.get("title"))
        app.listNodes.bind('<<ListboxSelect>>', onselect_for_dict)

    print("PUTTING ON LIST: " + str(datetime.datetime.now() - start_time))
Example #25
    def perform_search(self, sentence):
        with self._searcher() as s:
            tokens = sentence.split()
            tokens = [token for token in tokens if token != REPLACED]
            print('tokens=', tokens)
            exact_and_match = And([Term(TEXT_FIELD, t) for t in tokens],
                                  boost=.5)
            exact_or_match = Or([Term(TEXT_FIELD, t) for t in tokens],
                                boost=.5,
                                scale=0.9)
            # Added variability of maxdist based on word length
            fuzzy_or_match = Or([
                FuzzyTerm(TEXT_FIELD,
                          t,
                          prefixlength=1,
                          maxdist=1 if len(t) < 8 else 2)
                for t in tokens if len(t) >= 4
            ],
                                boost=.2,
                                scale=0.9)
            if len(tokens) > 1:
                # add bigrams if there are any
                bigrams = ['_'.join(b) for b in find_ngrams(tokens, 2)]
                bigram_fuzzy_or_match = Or([
                    FuzzyTerm(BIGRAMS_FIELD,
                              b,
                              prefixlength=3,
                              maxdist=2 if len(b) < 8 else 3) for b in bigrams
                ],
                                           scale=0.9)
            else:
                bigram_fuzzy_or_match = None

            non_brand_or_match = Or(
                [Term(NONBRAND_TEXT_FIELD, t) for t in tokens])

            # q = exact_and_match \
            # | exact_or_match \
            # | fuzzy_or_match

            # my_match = Or([Term(f, token) for token in tokens], boost=1)
            # q = my_match

            #
            # q = Or([FuzzyTerm(f, token, prefixlength=2) for token in tokens if len(token) >= 3], boost=1.0,
            #                    scale=0.9)

            q = exact_and_match | exact_or_match | fuzzy_or_match | non_brand_or_match

            if bigram_fuzzy_or_match:
                q = q | bigram_fuzzy_or_match

            print(q)
            search_results = self.get_search_results(self._index, s, q)

            for x in search_results:
                print(x, x.score)

            if search_results:
                score, text, matched = search_results[0].items()
                return text, list(set(matched))
            else:
                return None, None
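
An aside on the scale=0.9 arguments above: Or's scale option enables Whoosh's score scaling, which favors documents matching more of the OR'd subqueries over documents matching fewer.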
Example #26
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, spelling_query=None, within=None,
               dwithin=None, distance_point=None, models=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }
        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list[0]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                                 limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            parsed_query = self.parser.parse(query_string)
            if len(model_choices) > 0:
                narrow_model = [Term(DJANGO_CT, rm) for rm in model_choices]
                parsed_query = And([Or(narrow_model), parsed_query])

            searcher = self.index.searcher()

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            collapse_field = kwargs.get("collapse")
            collapse_limit = kwargs.get("collapse_limit")

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse
            }

            if collapse_field is not None:
                search_kwargs['collapse'] = FieldFacet(collapse_field)
                search_kwargs['collapse_limit'] = 1

                if kwargs.get("collapse_order") is not None:
                    order = kwargs.get("collapse_order")
                    collapse_order = FieldFacet(order.replace('-', ''), reverse=order.find('-') > -1)
                    search_kwargs['collapse_order'] = collapse_order

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            grouped_results = None
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }
            if collapse_field is not None and collapse_limit > 1:
                search_kwargs = {
                    'sortedby': collapse_order
                }
                grouped_results = []
                for result in raw_page:
                    query = And([Term(collapse_field, result[collapse_field]), parsed_query])
                    results = searcher.search(query, limit=collapse_limit, **search_kwargs)

                    grouped_results.append(results)

            results = self._process_results(raw_page, result_class=result_class, collapse_field=collapse_field, grouped_results=grouped_results)
            searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

            return results
        else:
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }
Example #27
def search(request):
    if request.method == 'POST':
        form = Search_Form(request.POST)
        if form.is_valid():
            if not aux_check_index():
                aux_reset_all()
            key = form.cleaned_data['key_word'].lower()
            type = form.cleaned_data['type']
            ix = open_dir(dirindex)
            with ix.searcher() as searcher:
                words = key.strip().split()
                terms_classified = []
                for word in words:
                    terms = []
                    for desc in [
                            'descripcionECI', 'descripcionMM', 'descripcionFC'
                    ]:
                        terms.append(Term(desc, word))
                    terms_classified.append(terms)
                subqueries = []
                for t in terms_classified:
                    if search_type == 'N3':
                        subqueries.append(And(t))
                    else:
                        subqueries.append(Or(t))
                query = subqueries[0]
                if len(subqueries) > 1:
                    if search_type == 'N1':
                        query = Or(subqueries)
                    else:
                        query = And(subqueries)
                results = searcher.search(query)
                title = "Resultados para: "
                mostrar = True
                if len(results) == 0:
                    title = "No hay resultados para: "
                    mostrar = False
                eci = []
                mm = []
                fc = []
                # For each hit, fetch the newest historical record from each
                # source table.
                for r in results:
                    eci.append(
                        Historico_ECI.objects.filter(
                            producto_id=r['ean']).order_by("-fecha")[0])
                    mm.append(
                        Historico_MM.objects.filter(
                            producto_id=r['ean']).order_by("-fecha")[0])
                    fc.append(
                        Historico_FC.objects.filter(
                            producto_id=r['ean']).order_by("-fecha")[0])
                return render(
                    request, 'search.html', {
                        "eci": eci,
                        "mm": mm,
                        'fc': fc,
                        "title": title + key,
                        "mostrar": mostrar
                    })
    else:
        form = Search_Form()
    return render(request, 'search.html', {'form': form})
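
For reference, a sketch of the query shapes the view above builds for a two-word key such as "agua mineral" (the sample words are hypothetical; the field names are the ones used in the view):

from whoosh.query import And, Or, Term

fields = ['descripcionECI', 'descripcionMM', 'descripcionFC']
w1 = [Term(f, 'agua') for f in fields]
w2 = [Term(f, 'mineral') for f in fields]

q_n1 = Or([Or(w1), Or(w2)])     # N1: any word, in any field
q_n2 = And([Or(w1), Or(w2)])    # N2 (default): every word, each in any field
q_n3 = And([And(w1), And(w2)])  # N3: every word present in every field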
Example No. 28
0
def nq(level):
    if level == 0:
        return oq()
    else:
        return Or([nq(level - 1), nq(level - 1), nq(level - 1)])
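
Each recursion level triples the clause count, so nq(level) builds a balanced Or tree over 3**level leaf queries. A hypothetical oq() makes the shape concrete (the real oq() is defined alongside nq() in the original source):

def oq():
    # Hypothetical leaf factory standing in for the original oq().
    return Or([Term("text", "alfa"), Term("text", "bravo")])

q = nq(2)  # an Or of 3 Ors, each over 3 oq() results: 9 leaves in total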
Example No. 29
0
    def make_filter_clause(self, text):
        # dict.iterkeys() is Python 2-only; iterating the dict directly
        # works on both Python 2 and 3.
        return Or([
            self.make_basic_clause(fieldname, text)
            for fieldname in self.fieldboosts
        ])
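
Assuming fieldboosts maps field names to boost factors and make_basic_clause builds a per-field term query (both names come from the excerpt; the values below are hypothetical), the clause expands roughly like this:

# With self.fieldboosts = {"title": 2.0, "body": 1.0} and
# make_basic_clause(f, t) returning Term(f, t):
expanded = Or([Term("title", "foo"), Term("body", "foo")])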
Example No. 30
0
def test_replace():
    q = And([Or([Term("a", "b"), Term("b", "c")], boost=1.2),
             Variations("a", "b", boost=2.0)])
    q = q.replace("a", "b", "BB")
    assert_equal(q, And([Or([Term("a", "BB"), Term("b", "c")], boost=1.2),
                         Variations("a", "BB", boost=2.0)]))
Example No. 31
0
    def getAnnotations(self, commentText):
        tmpRes = {}

        if commentText == '':
            return []

        procCommText, origIndx = compressStr(commentText, lower=True)

        termArr = procCommText.split()

        # Use a set: the same organization may be matched several times.
        orgNames = set()

        for qs in range(0, len(termArr), self.queryStride):
            qe = min(qs + self.querySpan, len(termArr))

            q = []
            for i in range(qs, qe - 1):
                if termArr[i] not in STOP_WORDS:
                    bigram = And([
                        Term(self.fieldName, termArr[i]),
                        Term(self.fieldName, termArr[i + 1])
                    ])
                    q.append(bigram)

            #print('@@', ' '.join(termArr[qs:qe]))
            #print('Query: ', q)

            res = self.searcher.search(Or(q), limit=self.topK)

            #print('Found %d results' % len(res))

            for k in range(len(res)):
                if k >= self.topK:
                    break
                orgName = res[k][self.fieldName]
                orgNames.add(orgName)

        for orgName in orgNames:
            start = 0
            while start < len(procCommText):
                indx = procCommText.find(orgName, start)
                #print('###', orgName, start, indx)
                if indx == -1:
                    break
                assert indx + len(orgName) <= len(origIndx)
                start = indx + len(orgName)
                # Map the match back to character offsets in the original,
                # uncompressed comment text.
                startChar = origIndx[indx]
                endChar = origIndx[indx + len(orgName) - 1] + 1
                # TODO: additional conditions for spaces!!
                if startChar >= 0 and endChar >= 0:
                    if startChar in tmpRes:
                        tmpRes[startChar] = max(tmpRes[startChar], endChar)
                    else:
                        tmpRes[startChar] = endChar

        resAnnot = []

        for startChar, endChar in tmpRes.items():
            resAnnot.append(Annotation(startChar, endChar, 'OrgDict'))

        return resAnnot
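
getAnnotations relies on compressStr returning the normalized text together with a map from each compressed position back to the originating offset in the raw comment. The real implementation lives elsewhere; this is a minimal sketch of that contract, which only lowercases and collapses whitespace:

def compressStr(text, lower=False):
    # Return (squashed, origIndx) where origIndx[i] is the offset in `text`
    # of the character that produced squashed[i].
    out, origIndx = [], []
    for i, ch in enumerate(text):
        if ch.isspace():
            # Collapse runs of whitespace into a single space.
            if out and out[-1] != ' ':
                out.append(' ')
                origIndx.append(i)
        else:
            out.append(ch.lower() if lower else ch)
            origIndx.append(i)
    return ''.join(out), origIndx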
Example No. 32
0
    def search(self,
               collector,
               query_str1=None,
               query_str2=None,
               itemtypes=(),
               highlight=False):

        # Reject queries in which any keyword consists solely of wildcards.
        if query_str1:
            for kw in (s.strip() for s in query_str1.split()):
                if not kw.replace("*", "").replace("?", "").strip():
                    return []

        wildcard = (query_str1 and any(c in query_str1 for c in "*?"))

        parser = self._parser_wild if wildcard else self._parser
        asf_parser = self._asf_parser

        with self._index.searcher() as searcher:
            andlist = []
            try:
                if query_str1:
                    andlist.append(parser.parse(query_str1))
                if query_str2:
                    andlist.append(asf_parser.parse(query_str2))
            except Exception:  # malformed query string
                return []

            if itemtypes:
                if len(itemtypes) > 1:
                    andlist.append(Or([Term('itemtype', t)
                                       for t in itemtypes]))
                else:
                    andlist.append(Term('itemtype', itemtypes[0]))

            query = And(andlist)

            searcher.search_with_collector(query, collector)
            hits = collector.results()

            if highlight:
                hits.fragmenter = WholeFragmenter()
                hits.formatter = HtmlFormatter(tagname='span',
                                               classname='s_match',
                                               termclass='s_term')

            if wildcard and query_str1:
                pat = query_str1.replace("-", "").replace(" ", "")
                wildmatch = re.compile(fnmatch.translate(pat))

            # Construct a result list
            results = []
            for hit in hits:
                if collector.aborted:
                    return []
                (label, path, prio, sortkey) = hit['data']

                if wildcard and query_str1:
                    if not wildmatch.match(sortkey):
                        continue

                if highlight:
                    if query_str1:
                        text = hit.highlights('content')
                    else:
                        text = hit['content']
                else:
                    text = None

                results.append((label, path, sortkey, prio, text))

            # Order by sort key, then by priority.
            results.sort(key=itemgetter(2, 3))

            return results
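
A hedged driver for the method above. A stock Whoosh collector has no aborted attribute, so this sketch bolts one on; finder stands in for an instance of the (unnamed) class the method belongs to:

from whoosh.collectors import TopCollector

collector = TopCollector(limit=50)
collector.aborted = False  # checked per hit inside search()
rows = finder.search(collector, query_str1="whoosh", itemtypes=("page",))
for label, path, sortkey, prio, text in rows:
    print(label, path)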
Example No. 33
0
    def restrict_query(self, request):
        return Or([
            And([Term('public', 't'), Term('searchable', 't')]),
            Term('users', request.user.username)
        ] + [Term('groups', group.name) for group in request.user.groups.all()])
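
A sketch of how a permission clause like this is typically applied: intersect it with the user's parsed query so only readable documents can match (backend, searcher, and user_query are assumed names):

from whoosh.query import And

# Only documents that are public and searchable, owned by the user, or
# shared with one of the user's groups can ever appear in the results.
allowed = backend.restrict_query(request)
results = searcher.search(And([user_query, allowed]), limit=20)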