Beispiel #1
0
 def __unicode__(self):
     """Render the query as ``field:text~`` plus optional suffixes.

     The maximum distance is appended when it is greater than 1, and the
     boost (as ``^<float>``) when it differs from 1.0.
     """
     pieces = [u("%s:%s") % (self.fieldname, self.text), u("~")]
     if self.maxdist > 1:
         pieces.append(u("%d") % self.maxdist)
     if self.boost != 1.0:
         pieces.append(u("^%f") % self.boost)
     return u("").join(pieces)
Beispiel #2
0
 def __unicode__(self):
     """Render as ``DisMax(<sorted subqueries>)`` with an optional ``~tiebreak``."""
     rendered = sorted(text_type(s) for s in self.subqueries)
     body = u("DisMax(") + " ".join(rendered) + u(")")
     if not self.tiebreak:
         return body
     # A truthy tiebreak is shown after a tilde.
     return body + u("~") + text_type(self.tiebreak)
Beispiel #3
0
 def __unicode__(self):
     """Render the subqueries joined by ``self.JOINT`` inside parentheses.

     When ``self.minmatch`` is truthy it is appended as ``>minmatch``.
     """
     inner = (self.JOINT).join(map(text_type, self.subqueries))
     out = u("(") + inner + u(")")
     if self.minmatch:
         out = out + u(">%s") % self.minmatch
     return out
Beispiel #4
0
    def __unicode__(self):
        """Render the term as ``field:text`` with an optional ``^boost``.

        Byte-string text is decoded as ASCII; if that fails, its ``repr`` is
        shown instead.
        """
        label = self.text
        if isinstance(label, bytes_type):
            try:
                label = label.decode("ascii")
            except UnicodeDecodeError:
                # Non-ASCII bytes: fall back to a printable representation.
                label = repr(label)

        rendered = u("%s:%s") % (self.fieldname, label)
        suffix = u("^") + text_type(self.boost) if self.boost != 1 else u("")
        return rendered + suffix
Beispiel #5
0
 def __unicode__(self):
     """Render the range as ``field:[start TO end]``.

     Curly braces replace the square brackets on an exclusive side, and a
     ``None`` endpoint is shown as an empty string.
     """
     opener, closer = "[", "]"
     if self.startexcl:
         opener = "{"
     if self.endexcl:
         closer = "}"

     lower = self.start if self.start is not None else ""
     upper = self.end if self.end is not None else ""

     template = u("%s:%s%s TO %s%s")
     return template % (self.fieldname, opener, lower, upper, closer)
Beispiel #6
0
class WildcardPlugin(whoosh.qparser.plugins.TaggingPlugin):
    """Tagging plugin for ``*`` and question-mark wildcard characters.

    The tagging expression matches a single wildcard character. The
    ``do_wildcards`` filter then merges each wildcard node with the text
    nodes directly next to it.
    """

    # \u055E = Armenian question mark
    # \u061F = Arabic question mark
    # \u1367 = Ethiopic question mark
    qmarks = u("?\u055E\u061F\u1367")
    expr = "(?P<text>[*%s])" % qmarks

    def filters(self, parser):
        """Return the filter list: ``do_wildcards`` with priority 50."""
        # Run early, but definitely before multifield plugin
        return [(self.do_wildcards, 50)]

    def do_wildcards(self, parser, group):
        """Merge wildcard nodes with adjacent text and simplify prefixes.

        First pass: each wildcard node absorbs the text node directly after
        it and the text node directly before it; nested
        ``CylleneusGroupNode`` children are processed recursively.

        Second pass: a wildcard node longer than one character, containing
        no question mark, and whose first ``*`` is its last character is
        replaced by a ``PrefixPlugin.PrefixNode`` with the same character
        offsets.

        :param parser: The parser, passed through to recursive calls.
        :param group: The group node to rewrite in place.
        :returns: The same ``group`` object.
        """
        idx = 0
        while idx < len(group):
            current = group[idx]

            if not isinstance(current, self.WildcardNode):
                if isinstance(current, CylleneusGroupNode):
                    self.do_wildcards(parser, current)
                idx += 1
                continue

            # Absorb the following text node, if there is one.
            if idx < len(group) - 1 and group[idx + 1].is_text():
                following = group.pop(idx + 1)
                current.text += following.text

            # Absorb the preceding text node. Popping it shifts the wildcard
            # node down to idx - 1, so idx already points at the next
            # unprocessed node and must not be advanced in that case.
            if idx > 0 and group[idx - 1].is_text():
                preceding = group.pop(idx - 1)
                current.text = preceding.text + current.text
            else:
                idx += 1

        for idx in xrange(len(group)):
            current = group[idx]
            if not isinstance(current, self.WildcardNode):
                continue

            pattern = current.text
            if len(pattern) <= 1:
                continue
            if any(qm in pattern for qm in self.qmarks):
                continue
            if pattern.find("*") != len(pattern) - 1:
                continue

            # Only a single trailing star remains: use a prefix node.
            replacement = PrefixPlugin.PrefixNode(pattern[:-1])
            replacement.startchar = current.startchar
            replacement.endchar = current.endchar
            group[idx] = replacement
        return group

    class WildcardNode(whoosh.qparser.syntax.TextNode):
        """Syntax node holding wildcard text."""

        # Note that this node inherits tokenize = False from TextNode,
        # so the text in this node will not be analyzed... just passed
        # straight to the query

        qclass = cylleneus.engine.query.terms.Wildcard

        def r(self):
            """Return a short debugging representation of the node."""
            return "Wild %r" % self.text

    nodetype = WildcardNode
Beispiel #7
0
    def normalize(self):
        """Return a simplified query equivalent to this range.

        * An open start (``""`` or ``None``) together with an open end
          (``u"\\uffff"`` or ``None``) becomes an ``Every`` query.
        * Equal endpoints become ``qcore.NullQuery`` when either side is
          exclusive, otherwise a single ``terms.Term``.
        * Anything else is returned as a new ``TermRange`` built from the
          same attributes.
        """
        if self.start in ("", None) and self.end in (u("\uffff"), None):
            from whoosh.query import Every

            return Every(self.fieldname, boost=self.boost)

        if self.start == self.end:
            if self.startexcl or self.endexcl:
                return qcore.NullQuery
            return terms.Term(self.fieldname, self.start, boost=self.boost)

        return TermRange(
            self.fieldname,
            self.start,
            self.end,
            self.startexcl,
            self.endexcl,
            boost=self.boost,
        )
Beispiel #8
0
    def iter_all_terms(self, ixreader, phrases=True):
        """Yield ``(fieldname, text)`` pairs for the terms of this query.

        The texts come from ``self._btexts(ixreader)``; each one is passed
        through ``u()`` and paired with ``self.fieldname``.

        :param ixreader: The reader handed to ``self._btexts``.
        :param phrases: Not used by this implementation.
        """

        for term_bytes in self._btexts(ixreader):
            pair = (self.fieldname, u(term_bytes))
            yield pair
Beispiel #9
0
 def __unicode__(self):
     """Render the wrapped query prefixed with ``NOT ``."""
     prefix = u("NOT ")
     return prefix + text_type(self.query)
Beispiel #10
0
 def __unicode__(self):
     """Render as ``field:<text>`` (text wrapped in angle brackets)."""
     template = u("%s:<%s>")
     return template % (self.fieldname, self.text)
Beispiel #11
0
 def __unicode__(self):
     """Render the subqueries joined by ``self.JOINT`` inside parentheses."""
     joined = self.JOINT.join(map(text_type, self.subqueries))
     return u("(") + joined + u(")")
Beispiel #12
0
    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=False,
        removestops=True,
        start_pos=0,
        start_char=0,
        tokenize=True,
        mode="",
        **kwargs
    ):
        """
        Split ``value`` into tokens using ``self.charmap``.

        Every character is looked up in ``self.charmap`` by code point. A
        truthy result is appended to the text of the current token; a falsy
        result ends the current token (if it is non-empty) and the character
        itself is dropped.

        A single token object is created and yielded repeatedly with its
        attributes overwritten, so a consumer that needs to keep a token's
        values must copy them before advancing the generator.

        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param keeporiginal: Whether to also store the token text in the
            token's ``original`` attribute.
        :param removestops: Passed through to the token constructor.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized. If False, the
            whole value is yielded as one token without consulting the
            character map.
        :param mode: Passed through to the token constructor.
        """

        # Wrap value in a tuple so a tuple argument cannot break the message
        # formatting itself.
        assert isinstance(value, text_type), "%r is not unicode" % (value,)

        t = CylleneusToken(
            positions, chars, removestops=removestops, mode=mode, **kwargs
        )
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            text = u("")
            charmap = self.charmap
            pos = start_pos
            # Both offsets include start_char, so they are token offsets and
            # must not be used as indexes into ``value``.
            startchar = currentchar = start_char
            for char in value:
                tchar = charmap[ord(char)]
                if tchar:
                    text += tchar
                else:
                    # Separator character: emit the pending token, if any.
                    if currentchar > startchar:
                        t.text = text
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = t.text
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            t.startchar = startchar
                            t.endchar = currentchar
                        yield t
                    startchar = currentchar + 1
                    text = u("")

                currentchar += 1

            # Emit the trailing token. Use the accumulated, charmap-translated
            # text, exactly like the tokens emitted inside the loop; slicing
            # ``value`` would skip the translation and pick the wrong span
            # whenever start_char is non-zero.
            if currentchar > startchar:
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = startchar
                    t.endchar = currentchar
                yield t
Beispiel #13
0
 def __unicode__(self):
     """Render as ``field:*``."""
     template = u("%s:*")
     return template % self.fieldname
Beispiel #14
0
 def __unicode__(self):
     """Return the fixed display label for the null query."""
     label = u("<_NullQuery>")
     return label