Example #1
def test_span_near():
    ix = get_index()
    with ix.searcher() as s:

        def test(q):
            m = q.matcher(s)
            while m.is_active():
                yield s.stored_fields(m.id())["text"], m.spans()
                m.next()

        for orig, sps in test(
                spans.SpanNear(Term("text", "alfa"),
                               Term("text", "bravo"),
                               ordered=True)):
            assert_equal(orig[sps[0].start], "alfa")
            assert_equal(orig[sps[0].end], "bravo")

        for orig, sps in test(
                spans.SpanNear(Term("text", "alfa"),
                               Term("text", "bravo"),
                               ordered=False)):
            first = orig[sps[0].start]
            second = orig[sps[0].end]
            assert ((first == "alfa" and second == "bravo")
                    or (first == "bravo" and second == "alfa"))

        for orig, sps in test(
                spans.SpanNear(Term("text", "bravo"),
                               Term("text", "bravo"),
                               ordered=True)):
            text = " ".join(orig)
            assert text.find("bravo bravo") > -1

        q = spans.SpanNear(
            spans.SpanNear(Term("text", "alfa"), Term("text", "charlie")),
            Term("text", "echo"))
        for orig, sps in test(q):
            text = " ".join(orig)
            assert text.find("alfa charlie echo") > -1

        q = spans.SpanNear(Or([Term("text", "alfa"),
                               Term("text", "charlie")]),
                           Term("text", "echo"),
                           ordered=True)
        for orig, sps in test(q):
            text = " ".join(orig)
            assert text.find("alfa echo") > -1 or text.find(
                "charlie echo") > -1
Example #2
    def matcher(self, searcher, weighting=None):
        fieldname = self.fieldname
        constantscore = self.constantscore
        reader = searcher.reader()
        qs = [Term(fieldname, word) for word in self._words(reader)]
        if not qs:
            return matching.NullMatcher()

        if len(qs) == 1:
            # If there's only one term, just use it
            q = qs[0]
        elif constantscore or len(qs) > self.TOO_MANY_CLAUSES:
            # If there are so many clauses that an Or search would take
            # forever, trade memory for time: find all the matching docs and
            # serve them up as one or more ListMatchers
            fmt = searcher.schema[fieldname].format
            doc_to_values = defaultdict(list)
            doc_to_weights = defaultdict(float)
            for q in qs:
                m = q.matcher(searcher)
                while m.is_active():
                    docnum = m.id()
                    doc_to_values[docnum].append(m.value())
                    if not constantscore:
                        doc_to_weights[docnum] += m.weight()
                    m.next()

            docnums = sorted(doc_to_values.keys())
            # This is a list of lists of value strings -- ListMatcher will
            # actually do the work of combining multiple values if the user
            # asks for them
            values = [doc_to_values[docnum] for docnum in docnums]

            kwargs = {"values": values, "format": fmt}
            if constantscore:
                kwargs["all_weights"] = self.boost
            else:
                kwargs["weights"] = [
                    doc_to_weights[docnum] for docnum in docnums
                ]

            return matching.ListMatcher(docnums, **kwargs)
        else:
            # The default case: Or the terms together
            from whoosh.query import Or
            q = Or(qs)

        return q.matcher(searcher, weighting=weighting)
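
This matcher() belongs to Whoosh's multi-term query machinery (Prefix, Wildcard, and similar queries expand to many Terms at search time). A hedged sketch of the two code paths from the caller's side; the index and field name are assumptions:

from whoosh.query import Prefix

# Scored path: with few expansions the query is rewritten to Or of Terms.
q_scored = Prefix("text", u"al", constantscore=False)
# Constant-score path: all matching docs are collected up front into a
# ListMatcher, trading memory for time (also taken past TOO_MANY_CLAUSES
# expansions).
q_const = Prefix("text", u"al", constantscore=True)

with ix.searcher() as s:  # `ix` is an existing index with a "text" field
    for hit in s.search(q_scored):
        print(hit)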
Example #3
def test_simplify():
    s = fields.Schema(k=fields.ID, v=fields.TEXT)
    ix = RamStorage().create_index(s)

    w = ix.writer()
    w.add_document(k=u("1"), v=u("aardvark apple allan alfa bear bee"))
    w.add_document(k=u("2"), v=u("brie glue geewhiz goop julia"))
    w.commit()

    r = ix.reader()
    q1 = And([Prefix("v", "b", boost=2.0), Term("v", "juliet")])
    q2 = And([Or([Term('v', u('bear'), boost=2.0),
                  Term('v', u('bee'), boost=2.0),
                  Term('v', u('brie'), boost=2.0)]),
              Term('v', 'juliet')])
    assert_equal(q1.simplify(r), q2)
Example #4
def get_subscribers(**meta):
    """ Get all users that are subscribed to the item

    :param meta: key/value pairs from item metadata - itemid, name, namespace, tags keys
    :return: a set of Subscriber objects
    """
    itemid = meta.get(ITEMID)
    name = meta.get(NAME)
    namespace = meta.get(NAMESPACE)
    fqname = CompositeName(namespace, ITEMID, itemid)
    tags = meta.get(TAGS)
    terms = []
    if itemid is not None:
        terms.append(Term(SUBSCRIPTION_IDS, u"{0}:{1}".format(ITEMID, itemid)))
    if namespace is not None:
        if name is not None:
            terms.extend(
                Term(SUBSCRIPTION_IDS, u"{0}:{1}:{2}".format(
                    NAME, namespace, name_)) for name_ in name)
        if tags is not None:
            terms.extend(
                Term(SUBSCRIPTION_IDS, u"{0}:{1}:{2}".format(
                    TAGS, namespace, tag)) for tag in tags)
    query = Or(terms)
    with flaskg.storage.indexer.ix[LATEST_REVS].searcher() as searcher:
        result_iterators = [
            searcher.search(query, limit=None),
        ]
        subscription_patterns = searcher.lexicon(SUBSCRIPTION_PATTERNS)
        patterns = get_matched_subscription_patterns(subscription_patterns,
                                                     **meta)
        result_iterators.extend(
            searcher.documents(subscription_patterns=pattern)
            for pattern in patterns)
        subscribers = set()
        for user in chain.from_iterable(result_iterators):
            email = user.get(EMAIL)
            if email:
                from MoinMoin.user import User
                u = User(uid=user.get(ITEMID))
                if u.may.read(fqname):
                    locale = user.get(LOCALE, DEFAULT_LOCALE)
                    subscribers.add(
                        Subscriber(user[ITEMID], user[NAME][0], email, locale))
    return subscribers
Example #5
 def search_relation(search_terms, item, key):
     """
     Search for a relation and store the result in the given item (at key).
     """
     results = searcher.search(
         q=Or(search_terms),
         limit=None,
         filter=And(default_terms) if default_terms else None,
         **search_args,
     )
     if results:
         item[key] = [_get_fields(hit, return_fields) for hit in results]
     else:
         item[key] = []  # Replace the original value of the field with the results.
Example #6
    def sear_paper(self):
        if not self.key_words:  # covers both None and an empty list
            return '0'
        q = [Term("textdata", k) for k in self.key_words]
        index = open_dir(self.path_index)  # e.g. open_dir('paper-index')
        searcher = index.searcher()
        results = searcher.search(Or(q))
        results.fragmenter.maxchars = 30000
        results.fragmenter.surround = 150
        print('Number of hits:', len(results))
        hf = HtmlFormatter(tagname="span", classname="match", termclass="term")
        results.formatter = hf  # assign via the public attribute

        hl_results = [hit.highlights("textdata") for hit in results]
        #for hit in results:
        #    print(hit.highlights("textdata"))
        return hl_results
Example #7
    def simplify(self, ixreader):
        if self.fieldname not in ixreader.schema:
            return qcore.NullQuery()
        field = ixreader.schema[self.fieldname]

        existing = []
        for btext in sorted(set(self._btexts(ixreader))):
            text = field.from_bytes(btext)
            existing.append(Term(self.fieldname, text, boost=self.boost))

        if len(existing) == 1:
            return existing[0]
        elif existing:
            from whoosh.query import Or
            return Or(existing)
        else:
            return qcore.NullQuery
Example #8
def search(ix, query, project_ids=None, limit=10):
    query_parser = QueryParser("name", schema=ix.schema)
    whoosh_query = query_parser.parse(query)
    is_project_filter = bool(project_ids)  # avoids a mutable default argument
    ids = []
    with ix.searcher() as searcher:
        if is_project_filter:
            project_id_terms = Or(
                [Term("project_id", project_id) for project_id in project_ids])
            results = searcher.search(whoosh_query,
                                      filter=project_id_terms,
                                      limit=limit)
        else:
            results = searcher.search(whoosh_query, limit=limit)
        for result in results:
            ids.append(result["id"])
    return ids
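
A hedged usage sketch for the helper above; it assumes an index whose schema has a name TEXT field plus stored id and project_id fields, and the values shown are placeholders:

# Hypothetical call, for illustration only.
ids = search(ix, "annual report", project_ids=["p1", "p2"], limit=5)
print(ids)  # stored "id" values of the best matches within projects p1/p2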
Example #9
def filter_by_player(jugador):
    ix = open_dir("Index_news")
    print(jugador)
    with ix.searcher() as buscador_noticias:
        list_aux = []
        for nom_divididio in jugador.split():
            list_aux.append(
                Or([
                    Term("titulo",
                         str(nom_divididio).lower()),
                    Term("desc",
                         str(nom_divididio).lower())
                ]))
        q = And(list_aux)
        results_whoosh = buscador_noticias.search(q, limit=None)
        results = []
        for result_whoosh in results_whoosh:
            results.append(result_whoosh.fields())
        return results
Example #10
def search_whoosh_index_headline(query, paths):
  if not paths:
    return []
  ix = get_whoosh_index()
  parser = MultifieldParser(['content', 'title', 'abstract'], ix.schema)
  q = parser.parse(query)

  allow_q = Or([Term('path', path) for path in paths])

  res = []

  with ix.searcher() as searcher:
    results = searcher.search(q, filter=allow_q, limit=len(paths), terms=True)
    for hit in results:
      res.append({
        # 'title': hit['title'],
        'short_url': hit['path'],
        'highlights': u' [...] '.join(filter(None, [
          hit.highlights("title", top=5),
          hit.highlights("abstract", top=5),
          hit.highlights("content", top=5),
        ]))
      })

  return res
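
A hedged usage sketch; the query string and paths are made up, and get_whoosh_index() is assumed to return an index with path, title, abstract, and content fields:

# Restrict the headline search to two known documents (hypothetical paths).
for hit in search_whoosh_index_headline(u"climate change", [u"doc/a", u"doc/b"]):
  print(hit['short_url'], hit['highlights'])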
Example #11
def whoosh_search(user_id, query_terms):
    ret = {}
    user_artists_profile = get_user_artists_profile(user_id)
    # q = qp.parse(
    #     "(" + query + " ANDMAYBE ((" + ") OR (".join([
    #         (" artist_id:" + str(artist_id) + "^" + str(1 + artist_score)) for (
    #             artist_id, artist_score) in user_artists_profile]) + ")))")
    # see https://pythonhosted.org/Whoosh/api/query.html#whoosh.query.AndMaybe
    q = AndMaybe(
        And([Term('title', qt) for qt in query_terms.split(' ')]),
        Or([
            Term('artist_id', artist_id, boost=artist_score)
            for (artist_id, artist_score) in user_artists_profile
        ]))
    with ix.searcher() as searcher:
        results = searcher.search(q, limit=10)
        ret = {
            'items': [hit.fields() for hit in results],
            'runtime': results.runtime
        }
    return ret
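
AndMaybe(required, optional) matches exactly the documents matched by its first argument; the second argument only contributes to the score. A minimal standalone sketch with made-up field names and values:

from whoosh.query import And, AndMaybe, Or, Term

# Required: every query word must appear in the title.
required = And([Term('title', u'blue'), Term('title', u'train')])
# Optional: liked artists only boost the ranking, they never add matches.
preferred = Or([Term('artist_id', u'42', boost=3.0)])
q = AndMaybe(required, preferred)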
Example #12
def validate_name(meta, itemid):
    """
    Check whether the names are valid.
    Will just return, if they are valid, will raise a NameNotValidError if not.
    """
    names = meta.get(NAME)
    current_namespace = meta.get(NAMESPACE)
    if current_namespace is None:
        raise NameNotValidError(L_("No namespace field in the meta."))
    namespaces = [namespace.rstrip('/') for namespace, _ in app.cfg.namespace_mapping]

    if len(names) != len(set(names)):
        msg = L_("The names in the name list must be unique.")
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)
    # Item names must not start with '@' or '+'; '@something' denotes a field, whereas '+something' denotes a view.
    invalid_names = [name for name in names if name.startswith(('@', '+'))]
    if invalid_names:
        msg = L_("Item names (%(invalid_names)s) must not start with '@' or '+'", invalid_names=", ".join(invalid_names))
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)

    namespaces = namespaces + NAMESPACES_IDENTIFIER  # Also don't allow item names to match identifier namespaces.
    # Item names must not match with existing namespaces.
    invalid_names = [name for name in names if name.split('/', 1)[0] in namespaces]
    if invalid_names:
        msg = L_("Item names (%(invalid_names)s) must not match with existing namespaces.", invalid_names=", ".join(invalid_names))
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)
    query = And([Or([Term(NAME, name) for name in names]), Term(NAMESPACE, current_namespace)])
    # No item with the same name should already exist.
    if itemid is not None:
        query = And([query, Not(Term(ITEMID, itemid))])  # search for items except the current item.
    with flaskg.storage.indexer.ix[LATEST_REVS].searcher() as searcher:
        results = searcher.search(query)
        duplicate_names = {name for result in results for name in result[NAME] if name in names}
        if duplicate_names:
            msg = L_("Item(s) named %(duplicate_names)s already exist.", duplicate_names=", ".join(duplicate_names))
            flash(msg, "error")  # duplicate message at top of form
            raise NameNotValidError(msg)
Example #13
    def get_subitem_revs(self):
        """
        Create a list of subitems of this item.

        Subitems are in the form of storage Revisions.
        """
        query = And([
            Term(WIKINAME, app.cfg.interwikiname),
            Term(NAMESPACE, self.fqname.namespace)
        ])
        # trick: an item with an empty name can be considered a "virtual root
        # item" that has all wiki items as subitems
        if self.names:
            query = And([
                query,
                Or([
                    Prefix(NAME_EXACT, prefix)
                    for prefix in self.subitem_prefixes
                ])
            ])
        revs = flaskg.storage.search(query, sortedby=NAME_EXACT, limit=None)
        return revs
Example #14
    def search(self,
               q_str: str,
               in_chats: Optional[List[int]],
               page_len: int,
               page_num: int = 1) -> SearchResult:
        q = self.query_parser.parse(q_str)
        with self.ix.searcher() as searcher:
            # No filter when in_chats is None or empty; an empty list must not
            # be passed to search_page as a filter
            q_filter = (Or([Term('chat_id', str(chat_id)) for chat_id in in_chats])
                        if in_chats else None)
            result_page = searcher.search_page(q,
                                               page_num,
                                               page_len,
                                               filter=q_filter,
                                               sortedby='post_time',
                                               reverse=True)

            hits = [
                SearchHit(IndexMsg(**msg),
                          self.highlighter.highlight_hit(msg, 'content'))
                for msg in result_page
            ]
            return SearchResult(hits, result_page.is_last_page(),
                                result_page.total)
Example #15
    def more_like(self, pk, source, top=5):
        """Find similar units."""
        index = self.get_source_index()
        with index.searcher() as searcher:
            # Extract key terms
            kts = searcher.key_terms_from_text(
                'source', source, numterms=10, normalize=False
            )
            # Create an Or query from the key terms
            query = Or([Term('source', word, boost=weight) for word, weight in kts])
            LOGGER.debug('more like query: %r', query)

            # Grab fulltext results
            results = [(h['pk'], h.score) for h in searcher.search(query, limit=top)]
            LOGGER.debug('found %d matches', len(results))
            if not results:
                return []

            # Filter bad results
            threshold = max((h[1] for h in results)) / 2
            results = [h[0] for h in results if h[1] > threshold]
            LOGGER.debug('%d matches above threshold %s', len(results), threshold)

            return results
Example #16
    def more_like(self, pk, source, top=5):
        """Find similar units."""
        index = self.get_source_index()
        with index.searcher() as searcher:
            # Extract key terms
            kts = searcher.key_terms_from_text('source',
                                               source,
                                               numterms=10,
                                               normalize=False)
            # Create an Or query from the key terms
            query = Or(
                [Term('source', word, boost=weight) for word, weight in kts])

            # Grab fulltext results
            results = [(h['pk'], h.score)
                       for h in searcher.search(query, limit=top)]
            if not results:
                return []
            # Normalize scores to 0-100
            max_score = max([h[1] for h in results])
            scores = {h[0]: h[1] * 100 / max_score for h in results}

            # Filter results with score above 50 and not current unit
            return [h[0] for h in results if scores[h[0]] > 50 and h[0] != pk]
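
A hedged usage sketch for either more_like() variant above; the pk and source text are placeholders, and `backend` is assumed to be an instance of the surrounding class. The second variant's design normalizes scores to a 0-100 scale and keeps only hits above 50, i.e. at least half as good as the best hit, while also dropping the unit itself:

# Hypothetical call, for illustration only.
similar_pks = backend.more_like(pk=123, source=u'Hello, world!', top=5)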
Example #17
def test_query_copy_hash():
    def do(q1, q2):
        q1a = copy.deepcopy(q1)
        assert_equal(q1, q1a)
        assert_equal(hash(q1), hash(q1a))
        assert_not_equal(q1, q2)

    do(Term("a", u("b"), boost=1.1), Term("a", u("b"), boost=1.5))
    do(And([Term("a", u("b")), Term("c", u("d"))], boost=1.1),
       And([Term("a", u("b")), Term("c", u("d"))], boost=1.5))
    do(Or([Term("a", u("b"), boost=1.1), Term("c", u("d"))]),
       Or([Term("a", u("b"), boost=1.8), Term("c", u("d"))], boost=1.5))
    do(DisjunctionMax([Term("a", u("b"), boost=1.8), Term("c", u("d"))]),
       DisjunctionMax([Term("a", u("b"), boost=1.1), Term("c", u("d"))],
                      boost=1.5))
    do(Not(Term("a", u("b"), boost=1.1)), Not(Term("a", u("b"), boost=1.5)))
    do(Prefix("a", u("b"), boost=1.1), Prefix("a", u("b"), boost=1.5))
    do(Wildcard("a", u("b*x?"), boost=1.1), Wildcard("a", u("b*x?"),
                                                     boost=1.5))
    do(FuzzyTerm("a", u("b"), constantscore=True),
       FuzzyTerm("a", u("b"), constantscore=False))
    do(FuzzyTerm("a", u("b"), boost=1.1), FuzzyTerm("a", u("b"), boost=1.5))
    do(TermRange("a", u("b"), u("c")), TermRange("a", u("b"), u("d")))
    do(TermRange("a", None, u("c")), TermRange("a", None, None))
    do(TermRange("a", u("b"), u("c"), boost=1.1),
       TermRange("a", u("b"), u("c"), boost=1.5))
    do(TermRange("a", u("b"), u("c"), constantscore=True),
       TermRange("a", u("b"), u("c"), constantscore=False))
    do(NumericRange("a", 1, 5), NumericRange("a", 1, 6))
    do(NumericRange("a", None, 5), NumericRange("a", None, None))
    do(NumericRange("a", 3, 6, boost=1.1), NumericRange("a", 3, 6, boost=1.5))
    do(NumericRange("a", 3, 6, constantscore=True),
       NumericRange("a", 3, 6, constantscore=False))
    # do(DateRange)
    do(Variations("a", u("render")), Variations("a", u("renders")))
    do(Variations("a", u("render"), boost=1.1),
       Variations("a", u("renders"), boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")]),
       Phrase("a", [u("b"), u("c"), u("e")]))
    do(Phrase("a", [u("b"), u("c"), u("d")], boost=1.1),
       Phrase("a", [u("b"), u("c"), u("d")], boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")], slop=1),
       Phrase("a", [u("b"), u("c"), u("d")], slop=2))
    # do(Ordered)
    do(Every(), Every("a"))
    do(Every("a"), Every("b"))
    do(Every("a", boost=1.1), Every("a", boost=1.5))
    do(NullQuery, Term("a", u("b")))
    do(ConstantScoreQuery(Term("a", u("b"))),
       ConstantScoreQuery(Term("a", u("c"))))
    do(ConstantScoreQuery(Term("a", u("b")), score=2.0),
       ConstantScoreQuery(Term("a", u("c")), score=2.1))
    do(Require(Term("a", u("b")), Term("c", u("d"))),
       Require(Term("a", u("b"), boost=1.1), Term("c", u("d"))))
    # do(Require)
    # do(AndMaybe)
    # do(AndNot)
    # do(Otherwise)

    do(SpanFirst(Term("a", u("b")), limit=1), SpanFirst(Term("a", u("b")),
                                                        limit=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d"))),
       SpanNear(Term("a", u("b")), Term("c", u("e"))))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), slop=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), slop=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=True),
       SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=False))
    do(SpanNot(Term("a", u("b")), Term("a", u("c"))),
       SpanNot(Term("a", u("b")), Term("a", u("d"))))
    do(SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("d"))]),
       SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("e"))]))
    do(SpanContains(Term("a", u("b")), Term("a", u("c"))),
       SpanContains(Term("a", u("b")), Term("a", u("d"))))
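
This test relies on Whoosh query objects implementing structural __eq__ and __hash__, so deep copies compare equal and queries can be used in sets and as dict keys. A minimal sketch:

import copy
from whoosh.query import Or, Term

q = Or([Term('a', u'b'), Term('c', u'd')], boost=1.5)
assert copy.deepcopy(q) == q                  # equal structure, equal query
assert hash(copy.deepcopy(q)) == hash(q)      # and equal hash
assert q != Or([Term('a', u'b')], boost=1.5)  # different structure differs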
Example #18
    def perform_search(self, sentence):
        with self._searcher() as s:
            tokens = sentence.split()
            tokens = [token for token in tokens if token != REPLACED]
            print('tokens=', tokens)
            exact_and_match = And([Term(TEXT_FIELD, t) for t in tokens],
                                  boost=.5)
            exact_or_match = Or([Term(TEXT_FIELD, t) for t in tokens],
                                boost=.5,
                                scale=0.9)
            # Added variability of maxdist based on word length
            fuzzy_or_match = Or([
                FuzzyTerm(TEXT_FIELD,
                          t,
                          prefixlength=1,
                          maxdist=1 if len(t) < 8 else 2)
                for t in tokens if len(t) >= 4
            ],
                                boost=.2,
                                scale=0.9)
            if len(tokens) > 1:
                # add bigrams if there are any
                bigrams = ['_'.join(b) for b in find_ngrams(tokens, 2)]
                bigram_fuzzy_or_match = Or([
                    FuzzyTerm(BIGRAMS_FIELD,
                              b,
                              prefixlength=3,
                              maxdist=2 if len(b) < 8 else 3) for b in bigrams
                ],
                                           scale=0.9)
            else:
                bigram_fuzzy_or_match = None

            non_brand_or_match = Or(
                [Term(NONBRAND_TEXT_FIELD, t) for t in tokens])

            # q = exact_and_match \
            # | exact_or_match \
            # | fuzzy_or_match

            # my_match = Or([Term(f, token) for token in tokens], boost=1)
            # q = my_match

            #
            # q = Or([FuzzyTerm(f, token, prefixlength=2) for token in tokens if len(token) >= 3], boost=1.0,
            #                    scale=0.9)

            q = exact_and_match | exact_or_match | fuzzy_or_match | non_brand_or_match

            if bigram_fuzzy_or_match:
                q = q | bigram_fuzzy_or_match

            print(q)
            search_results = self.get_search_results(self._index, s, q)

            for x in search_results:
                print(x, x.score)

            if search_results:
                score, text, matched = search_results[0].items()
                return text, list(set(matched))
            else:
                return None, None
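
The maxdist scaling above (edit distance 1 for words shorter than 8 characters, 2 otherwise) generalizes into a small helper. A sketch with an assumed field name:

from whoosh.query import FuzzyTerm, Or

def fuzzy_or(field, tokens, boost=0.2):
    # Skip very short tokens; longer words tolerate more edits.
    return Or([FuzzyTerm(field, t, prefixlength=1,
                         maxdist=1 if len(t) < 8 else 2)
               for t in tokens if len(t) >= 4], boost=boost)

q = fuzzy_or('text', ['chocolate', 'bar'])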
Example #19
def search(request):
    if request.method == 'POST':
        form = Search_Form(request.POST)
        if form.is_valid():
            if not aux_check_index():
                aux_reset_all()
            key = form.cleaned_data['key_word'].lower()
            type = form.cleaned_data['type']
            ix = open_dir(dirindex)
            with ix.searcher() as searcher:
                words = key.strip().split()
                terms_classified = []
                for word in words:
                    terms = []
                    for desc in [
                            'descripcionECI', 'descripcionMM', 'descripcionFC'
                    ]:
                        terms.append(Term(desc, word))
                    terms_classified.append(terms)
                subqueries = []
                for t in terms_classified:
                    if type == 'N3':
                        subqueries.append(And(t))
                    else:
                        subqueries.append(Or(t))
                query = subqueries[0]
                if len(subqueries) > 1:
                    if type == 'N1':
                        query = Or(subqueries)
                    else:
                        query = And(subqueries)
                results = searcher.search(query)
                title = "Resultados para: "
                mostrar = True
                if len(results) == 0:
                    title = "No hay resultados para: "
                    mostrar = False
                eci = []
                mm = []
                fc = []
                for r in results:
                    eci.append(
                        Historico_ECI.objects.filter(
                            producto_id=r['ean']).order_by("-fecha")[0])
                    mm.append(
                        Historico_MM.objects.filter(
                            producto_id=r['ean']).order_by("-fecha")[0])
                    fc.append(
                        Historico_FC.objects.filter(
                            producto_id=r['ean']).order_by("-fecha")[0])
                return render(
                    request, 'search.html', {
                        "eci": eci,
                        "mm": mm,
                        'fc': fc,
                        "title": title + key,
                        "mostrar": mostrar
                    })
    else:
        form = Search_Form()
    return render(request, 'search.html', {'form': form})
Example #20
    def __call__(self):

        command = self.request.matchdict['command']
        params = self.request.params

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        if command == 'namesearch':

            search_index = self.request.registry.settings['whoosh_index']
            query_parser = self.request.registry.settings['query_parser']

            with search_index.searcher() as searcher:

                query = query_parser.parse(params['term'])

                # allowable = Or([Term(u'item_type', u'species'), Term(u'item_type', u'climate')])
                # allowable = Or([Term(u'item_type', u'species')])
                allowable = Or([
                    Term(u'item_type', u'species'),
                    Term(u'item_type', u'refugia'),
                    Term(u'item_type', u'aoc'),
                    Term(u'item_type', u'richness')
                ])

                results = searcher.search(query, filter=allowable)

                matches = {}

                for result in results:
                    matches[result['nice_name']] = {
                        "type": result['item_type'],
                        "path": result['item_path'],
                        "mapId": result['item_id']
                    }

            json_content = json.dumps(matches)
            return Response(body=json_content, content_type='application/json')
        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        if command == 'mapsearch':

            es = self.request.registry.settings['search_conn']

            allowable = ['species', 'refugia', 'aoc', 'richness']
            query = {
                "query": {
                    "bool": {
                        "must": {
                            "match": {
                                "nice_name": {
                                    "query": params['term'],
                                    "operator": "and"
                                }
                            }
                        },
                        "filter": {
                            "terms": {
                                "item_type": allowable
                            }
                        }
                    }
                },
                "from": 0,
                "size": 15
            }

            results = es.search(index='wallace', doc_type='map', body=query)

            matches = {}
            for result in results['hits']['hits']:
                doc = result['_source']
                matches[doc['nice_name']] = {
                    "type": doc['item_type'],
                    "path": doc['item_path'],
                    "mapId": doc['item_id']
                }

            json_content = json.dumps(matches)
            return Response(body=json_content, content_type='application/json')

        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        elif command == 'preplayer':

            gs_user = self.request.registry.settings['climas.gs_user']
            gs_pass = self.request.registry.settings['climas.gs_pass']

            # ==== what's the map they want?

            map_type = params['info[type]']
            map_path = params['info[path]']
            map_id = params['info[mapId]']
            map_projection = params['proj']

            if map_type == 'species':
                path_to_map_tif = ''.join([
                    "file:///rdsi/wallace2/W2_website/species/", map_path,
                    "/summaries_temperature/", map_projection, ".tif"
                ])

            else:
                # all the summary map types share the same path
                path_to_map_tif = ''.join([
                    "file:///rdsi/wallace2/W2_website/", map_path, "/",
                    map_projection, ".tif"
                ])

            coverage_name = '@'.join(
                [map_type, map_id.replace(' ', '_'), map_projection])

            # TODO: remove this debug
            print(path_to_map_tif)

            # ==== insert that map into geoserver

            # todo: put this into a timeout somehow
            poke = requests.put(
                "http://wallace-maps.hpc.jcu.edu.au/geoserver/rest/workspaces/wallace/coveragestores/"
                + coverage_name + "/external.geotiff",
                data=path_to_map_tif,
                auth=(gs_user, gs_pass))
            poke = requests.post(
                "http://wallace-maps.hpc.jcu.edu.au/geoserver/rest/workspaces/wallace/coveragestores/"
                + coverage_name + "/coverages",
                data="<coverage><name>" + coverage_name +
                "</name><nativeName>" + map_projection +
                "</nativeName></coverage>",
                auth=(gs_user, gs_pass),
                headers={'Content-type': 'text/xml'})

            # ==== return the WMS url for that layer

            if (poke.ok or 'already exists' in poke.text):
                result = {
                    "ok": True,
                    "mapUrl":
                    u"http://wallace-maps.hpc.jcu.edu.au/geoserver/wallace/wms",
                    "layerName": u"wallace:" + coverage_name
                }

                json_content = json.dumps(result)
                return Response(body=json_content,
                                content_type='application/json')

            json_content = json.dumps({
                "ok": False,
                "status_code": poke.status_code,
                "status_reason": poke.reason,
                "result": poke.text
            })

            # if we haven't returned yet, our layer poke didn't work
            return Response(status_code=500,
                            body=json_content,
                            content_type='application/json')
Example #21
def search_doc(directory,
               word,
               doc_types,
               num_page=1,
               num_by_page=10,
               show_num_results=True):
    """
    * -------------{Function}---------------
    * Returns a list of docs that contains a given set of words that matches a g
    * -------------{returns}----------------
    * {set} query results . . . 
    * -------------{params}-----------------
    * : directory -> path of the index
    * : word -> words to query
    * : doc_types -> type of doc to search
    * : num_page -> number of pages to search
    * : show_num_results -> number of results to return
    """
    index_schema = load_index(directory)
    doctypes_schema = load_doctypes_schema(directory)

    # Retrieves the fields to search from the doctypes schema
    fields_to_search = []
    for doc_type in doc_types:
        doc_type = doc_type.lower()
        try:
            schema = doctypes_schema[doc_type]
            fields_to_search = fields_to_search + schema
        except KeyError:
            logger.warning(
                "Schema not found for {doc_type}".format(doc_type=doc_type))

    # By default we search "content" (for BC) and "tags"
    fields = ["content", "tags"] + fields_to_search
    logger.info(
        "search will be performed on fields {fields}".format(fields=fields))

    # Create the query parser.
    # MultifieldParser allows searching on multiple fields.
    # A custom FuzzyTerm class sets the Levenshtein distance to 2.
    parser = MultifieldParser(fields,
                              schema=doctypes_schema,
                              termclass=CustomFuzzyTerm)
    query = parser.parse(word)

    # Creates a filter on the doctype field
    doctype_filter_matcher = []
    for doc_type in doc_types:
        term = FuzzyTerm("doc_type", doc_type.lower(), 1.0, 2)  # boost=1.0, maxdist=2
        doctype_filter_matcher.append(term)

    doc_type_filter = Or(doctype_filter_matcher)

    # Process the search (query the index; Whoosh does the heavy lifting)
    with index_schema.searcher() as searcher:
        results = searcher.search_page(query,
                                       num_page,
                                       pagelen=num_by_page,
                                       filter=doc_type_filter)
        results_id = [result["doc_id"] for result in results]
        logger.info("Results: {results_id}".format(results_id=results_id))

        # Ensures BC if the number of results is not requested
        if show_num_results:
            return {"ids": results_id, "num_results": len(results)}
        else:
            return {"ids": results_id}
Example #22
 def oq():
     return Or([Term("a", u("a")), Term("a", u("b"))])
Example #23
 def restrict_query(self, request):
     return Or([
         And([Term('public', 't'), Term('searchable', 't')]),
         Term('users', request.user.username)
     ] + [Term('groups', group.name) for group in request.user.groups.all()])
Example #24
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, spelling_query=None, within=None,
               dwithin=None, distance_point=None, models=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }
        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list[0]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                                 limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            parsed_query = self.parser.parse(query_string)
            if len(model_choices) > 0:
                narrow_model = [Term(DJANGO_CT, rm) for rm in model_choices]
                parsed_query = And([Or(narrow_model), parsed_query])

            searcher = self.index.searcher()

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            collapse_field = kwargs.get("collapse")
            collapse_limit = kwargs.get("collapse_limit")

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse
            }

            if collapse_field is not None:
                search_kwargs['collapse'] = FieldFacet(collapse_field)
                search_kwargs['collapse_limit'] = 1

                if kwargs.get("collapse_order") is not None:
                    order = kwargs.get("collapse_order")
                    collapse_order = FieldFacet(order.replace('-', ''), reverse=order.find('-') > -1)
                    search_kwargs['collapse_order'] = collapse_order

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            grouped_results = None
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }
            if collapse_field is not None and collapse_limit and collapse_limit > 1:
                search_kwargs = {
                    'sortedby': collapse_order
                }
                grouped_results = []
                for result in raw_page:
                    query = And([Term(collapse_field, result[collapse_field]), parsed_query])
                    results = searcher.search(query, limit=collapse_limit, **search_kwargs)

                    grouped_results.append(results)

            results = self._process_results(raw_page, result_class=result_class, collapse_field=collapse_field, grouped_results=grouped_results)
            searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

            return results
        else:
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }
Example #25
 def nq(level):
     if level == 0:
         return oq()
     else:
         return Or([nq(level - 1), nq(level - 1), nq(level - 1)])
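
Together with oq() from Example #22, nq(level) builds an Or tree with 3^level branches. Whoosh's normalize() flattens nested same-class compound queries, which is presumably what such fixtures exercise. A sketch, assuming both helpers are in scope:

q = nq(3)             # deeply nested Or tree
flat = q.normalize()  # nested Ors collapse into a single level
print(repr(flat))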
Example #26
def test_not():
    _run_query(Or([Term("value", u("red")), Term("name", u("yellow")),
                   Not(Term("name", u("quick")))]),
               [u("A"), u("E")])
Example #27
def test_replace():
    q = And([Or([Term("a", "b"), Term("b", "c")], boost=1.2),
             Variations("a", "b", boost=2.0)])
    q = q.replace("a", "b", "BB")
    assert_equal(q, And([Or([Term("a", "BB"), Term("b", "c")], boost=1.2),
                         Variations("a", "BB", boost=2.0)]))
Example #28
def test_wildcard():
    _run_query(Or([Wildcard('value', u('*red*')),
                   Wildcard('name', u('*yellow*'))]),
               [u("A"), u("C"), u("D"), u("E")])
    # Missing
    _run_query(Wildcard('value', 'glonk*'), [])
Example #29
    def search(self,
               collector,
               query_str1=None,
               query_str2=None,
               itemtypes=(),
               highlight=False):

        # reject queries where a keyword consists solely of '*' and '?'
        if query_str1:
            for kw in (s.strip() for s in query_str1.split()):
                if not kw.replace("*", "").replace("?", "").strip():
                    return []

        wildcard = (query_str1 and any(c in query_str1 for c in "*?"))

        parser = self._parser_wild if wildcard else self._parser
        asf_parser = self._asf_parser

        with self._index.searcher() as searcher:
            andlist = []
            try:
                if query_str1:
                    andlist.append(parser.parse(query_str1))
                if query_str2:
                    andlist.append(asf_parser.parse(query_str2))
            except Exception:  # unparsable query: return no results
                return []

            if itemtypes:
                if len(itemtypes) > 1:
                    andlist.append(Or([Term('itemtype', t)
                                       for t in itemtypes]))
                else:
                    andlist.append(Term('itemtype', itemtypes[0]))

            query = And(andlist)

            searcher.search_with_collector(query, collector)
            hits = collector.results()

            if highlight:
                hits.fragmenter = WholeFragmenter()
                hits.formatter = HtmlFormatter(tagname='span',
                                               classname='s_match',
                                               termclass='s_term')

            if wildcard and query_str1:
                pat = query_str1.replace("-", "").replace(" ", "")
                wildmatch = re.compile(fnmatch.translate(pat))

            # Construct a result list
            results = []
            for hit in hits:
                if collector.aborted:
                    return []
                (label, path, prio, sortkey) = hit['data']

                if wildcard and query_str1:
                    if not wildmatch.match(sortkey):
                        continue

                if highlight:
                    if query_str1:
                        text = hit.highlights('content')
                    else:
                        text = hit['content']
                else:
                    text = None

                results.append((label, path, sortkey, prio, text))

            sortkey_prio_getter = itemgetter(2, 3)
            results.sort(key=sortkey_prio_getter)

            # Return
            return results
Example #30
    def query(self,
              string_query: str,
              results_number: int,
              mask_list: list = None,
              candidate_list: list = None,
              classic_similarity: bool = True) -> dict:
        """
        Queries the search index to retrieve specific contents, using a query
        expressed in string form

        Args:
            string_query (str): query expressed as a string
            results_number (int): number of results the searcher will return for the query
            mask_list (list): list of content_ids of items to ignore in the search process
            candidate_list (list): list of content_ids of items to consider in the search process,
                if it is not None only items in the list will be considered
            classic_similarity (bool): if True, classic tf idf is used for scoring, otherwise BM25F is used

        Returns:
            results (dict): the final results dictionary containing the results found from the search index for the
                query. The dictionary will be in the following form:

                    {content_id: {"item": item_dictionary, "score": item_score}, ...}

                content_id is the content_id for the corresponding item
                item_dictionary is the dictionary of the item containing the fields as keys and the contents as values.
                So it will be in the following form: {"Plot": "this is the plot", "Genre": "this is the Genre"}
                The item_dictionary will not contain the content_id since it is already defined and used as key of the
                external dictionary
                items_score is the score given to the item for the query by the index searcher
        """
        ix = open_dir(self.directory)
        with ix.searcher(
                weighting=TF_IDF if classic_similarity else BM25F) as searcher:
            candidate_query_list = None
            mask_query_list = None

            # the mask list contains the content_id for the items to ignore in the searching process
            # from the mask list a mask query is created and it will be used by the searcher
            if mask_list is not None:
                mask_query_list = []
                for document in mask_list:
                    mask_query_list.append(Term("content_id", document))
                mask_query_list = Or(mask_query_list)

            # the candidate list contains the content_id for the items to consider in the searching process
            # from the candidate list a candidate query is created and it will be used by the searcher
            if candidate_list is not None:
                candidate_query_list = []
                for candidate in candidate_list:
                    candidate_query_list.append(Term("content_id", candidate))
                candidate_query_list = Or(candidate_query_list)

            schema = ix.schema
            parser = QueryParser("content_id", schema=schema, group=OrGroup)
            # regular expression to match the possible field styles
            # examples: "content_id" or "Genre#2" or "Genre#2#custom_id"
            parser.add_plugin(
                FieldsPlugin(r'(?P<text>[\w-]+(\#[\w-]+(\#[\w-]+)?)?|[*]):'))
            query = parser.parse(string_query)
            score_docs = \
                searcher.search(query, limit=results_number, filter=candidate_query_list, mask=mask_query_list)

            # Build the results dictionary. This step is necessary because the Hit
            # objects returned by the searcher need the reader inside the search
            # index to return information, so fields and scores would be
            # inaccessible outside this method; the dictionary captures them instead.
            results = {}
            for hit in score_docs:
                hit_dict = dict(hit)
                content_id = hit_dict.pop("content_id")
                results[content_id] = {}
                results[content_id]["item"] = hit_dict
                results[content_id]["score"] = hit.score
            return results
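
A hedged usage sketch; `index_interface` stands for an instance of the class above, and the query string and content ids are placeholders in the style of the docstring:

# Hypothetical call: top 5 hits for a plot query, ignoring two known items.
res = index_interface.query('Plot: space adventure', results_number=5,
                            mask_list=['tt0001', 'tt0002'])
for content_id, data in res.items():
    print(content_id, data['score'])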