Example #1
def test_apply():
    def visit(q):
        if isinstance(q, (Term, Variations, FuzzyTerm)):
            q.text = q.text.upper()
            return q
        return q.apply(visit)

    before = And([Not(Term("a", u("b"))), Variations("a", u("c")),
                  Not(FuzzyTerm("a", u("d")))])
    after = visit(before)
    assert_equal(after, And([Not(Term("a", u("B"))), Variations("a", u("C")),
                             Not(FuzzyTerm("a", u("D")))]))

    def term2var(q):
        if isinstance(q, Term):
            return Variations(q.fieldname, q.text)
        else:
            return q.apply(term2var)

    q = And([Term("f", "alfa"), Or([Term("f", "bravo"),
                                    Not(Term("f", "charlie"))])])
    q = term2var(q)
    assert_equal(q, And([Variations('f', 'alfa'),
                         Or([Variations('f', 'bravo'),
                             Not(Variations('f', 'charlie'))])]))
Example #2
def queryCategoryGenerator(busqueda):
    # OR together fuzzy matches on the "categoria" field, allowing
    # up to two edits per term.
    trozos = busqueda
    query = None
    for p in trozos:
        if query is None:
            query = FuzzyTerm("categoria", p, maxdist=2)
        else:
            query = query | FuzzyTerm("categoria", p, maxdist=2)

    return query
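A minimal usage sketch for the generator above, assuming a Whoosh index with a "categoria" field already exists in a hypothetical "indexdir" directory; the category values are illustrative:

from whoosh.index import open_dir

# Hypothetical index directory and category values.
ix = open_dir("indexdir")
query = queryCategoryGenerator(["deportes", "cultura"])
with ix.searcher() as searcher:
    for hit in searcher.search(query, limit=10):
        print(hit.fields())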
Example #3
def querySearchGenerator(busqueda):
    # Split the search string into words and OR together fuzzy matches
    # on "titulo" and "descripcion"; the allowed edit distance grows
    # with the length of each word.
    trozos = busqueda.split(" ")
    query = None
    for p in trozos:
        maxdist = int(len(p) / 4)
        fuzzy = (FuzzyTerm("titulo", p, maxdist=maxdist)
                 | FuzzyTerm("descripcion", p, maxdist=maxdist))
        if query is None:
            query = fuzzy
        else:
            query = query | fuzzy
    return query
Example #4
def busqueda_noticia(request):
    form = NoticiaBusquedaForm()
    noticias = None
    if request.method == 'POST':
        form = NoticiaBusquedaForm(request.POST)
        if form.is_valid():
            noticias = Noticia.objects.all()
            keywords = form.cleaned_data['keywords']

            ix = open_dir(dirindex)
            with ix.searcher() as searcher:

                temas = keywords.split()

                for x in temas:
                    query = FuzzyTerm('titulo', x)
                    # If this doesn't work well, do it with Term instead
                    noticias = noticias & searcher.search(query)

    else:
        form = NoticiaBusquedaForm()

    return render(request, 'busqueda_noticias.html', {
        'form': form,
        'noticias': noticias
    })
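The "&" between a Django QuerySet and the Whoosh results in the loop above is exactly what the original comment is unsure about. A hedged alternative for that loop, assuming each indexed document stores the Noticia primary key in a stored "id" field (an assumption, not part of the original view), is to collect matching ids and filter the QuerySet once:

with ix.searcher() as searcher:
    # Sketch only: "id" is assumed to be a stored field holding
    # the Noticia primary key.
    ids = set()
    for x in keywords.split():
        query = FuzzyTerm('titulo', x)
        ids.update(hit['id'] for hit in searcher.search(query))
    noticias = Noticia.objects.filter(pk__in=ids)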
Example #5
    def search_doc(self,
                   word,
                   docTypes,
                   numPage=1,
                   numByPage=10,
                   showNumResults=False):
        """
        Return a list of docs that contain the given word and match the
        given type.
        """

        indexSchema = IndexSchema()

        # Retrieves the fields to search from the doctypes schema
        fieldsToSearch = []
        for docType in docTypes:
            docType = docType.lower()
            try:
                schema = indexSchema.doctypesSchema[docType]
                fieldsToSearch = fieldsToSearch + schema
            except KeyError:
                logger.warning("Schema not found for %s" % docType)

        # By default we search "content" (for BC) and "tags"
        fields = ['content', 'tags'] + fieldsToSearch
        logger.info("Search will be performed on fields %s" % fields)

        # Creates the query parser.
        # MultifieldParser allows search on multiple fields.
        # We use a custom FuzzyTerm class to set the Levenshtein distance to 2
        parser = MultifieldParser(fields,
                                  schema=indexSchema.schema,
                                  termclass=CustomFuzzyTerm)
        query = parser.parse(word)

        # Creates a filter on the doctype field
        doctypeFilterMatcher = []
        for docType in docTypes:
            term = FuzzyTerm("docType", unicode(docType.lower()), 1.0, 2)
            doctypeFilterMatcher.append(term)

        docTypeFilter = Or(doctypeFilterMatcher)

        # Processes the search (request the index, Whoosh magic)
        with indexSchema.index.searcher() as searcher:
            results = searcher.search_page(query,
                                           numPage,
                                           pagelen=numByPage,
                                           filter=docTypeFilter)

            resultsID = [result["docId"] for result in results]
            logger.info("Results: %s" % resultsID)

            # Ensures BC if the number of results is not requested
            if showNumResults:
                return {'ids': resultsID, 'numResults': len(results)}
            else:
                return {'ids': resultsID}
Example #6
def test_fuzzyterm():
    schema = fields.Schema(id=fields.STORED, f=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, f=u("alfa bravo charlie delta"))
    w.add_document(id=2, f=u("bravo charlie delta echo"))
    w.add_document(id=3, f=u("charlie delta echo foxtrot"))
    w.add_document(id=4, f=u("delta echo foxtrot golf"))
    w.commit()

    with ix.searcher() as s:
        q = FuzzyTerm("f", "brave")
        assert_equal([d["id"] for d in s.search(q)], [1, 2])
Example #7
def queryCategoryGenerator(busqueda):
    # Expand each category string on spaces or slashes (the original string
    # is kept as well), then OR together fuzzy matches on "categoria".
    trozos = []

    for b in busqueda:
        t = []

        if " " in b:
            t = b.split(" ")
            trozos = trozos + t
        elif "/" in b:
            t = b.split("/")
            trozos = trozos + t

        trozos.append(b)

    query = None

    for p in trozos:
        if query is None:
            query = FuzzyTerm("categoria", p, maxdist=2)
        else:
            query = query | FuzzyTerm("categoria", p, maxdist=2)
    return query
Example #8
def test_query_copy_hash():
    def do(q1, q2):
        q1a = copy.deepcopy(q1)
        assert_equal(q1, q1a)
        assert_equal(hash(q1), hash(q1a))
        assert_not_equal(q1, q2)

    do(Term("a", u("b"), boost=1.1), Term("a", u("b"), boost=1.5))
    do(And([Term("a", u("b")), Term("c", u("d"))], boost=1.1),
       And([Term("a", u("b")), Term("c", u("d"))], boost=1.5))
    do(Or([Term("a", u("b"), boost=1.1), Term("c", u("d"))]),
       Or([Term("a", u("b"), boost=1.8), Term("c", u("d"))], boost=1.5))
    do(DisjunctionMax([Term("a", u("b"), boost=1.8), Term("c", u("d"))]),
       DisjunctionMax([Term("a", u("b"), boost=1.1), Term("c", u("d"))],
                      boost=1.5))
    do(Not(Term("a", u("b"), boost=1.1)), Not(Term("a", u("b"), boost=1.5)))
    do(Prefix("a", u("b"), boost=1.1), Prefix("a", u("b"), boost=1.5))
    do(Wildcard("a", u("b*x?"), boost=1.1), Wildcard("a", u("b*x?"),
                                                     boost=1.5))
    do(FuzzyTerm("a", u("b"), constantscore=True),
       FuzzyTerm("a", u("b"), constantscore=False))
    do(FuzzyTerm("a", u("b"), boost=1.1), FuzzyTerm("a", u("b"), boost=1.5))
    do(TermRange("a", u("b"), u("c")), TermRange("a", u("b"), u("d")))
    do(TermRange("a", None, u("c")), TermRange("a", None, None))
    do(TermRange("a", u("b"), u("c"), boost=1.1),
       TermRange("a", u("b"), u("c"), boost=1.5))
    do(TermRange("a", u("b"), u("c"), constantscore=True),
       TermRange("a", u("b"), u("c"), constantscore=False))
    do(NumericRange("a", 1, 5), NumericRange("a", 1, 6))
    do(NumericRange("a", None, 5), NumericRange("a", None, None))
    do(NumericRange("a", 3, 6, boost=1.1), NumericRange("a", 3, 6, boost=1.5))
    do(NumericRange("a", 3, 6, constantscore=True),
       NumericRange("a", 3, 6, constantscore=False))
    # do(DateRange)
    do(Variations("a", u("render")), Variations("a", u("renders")))
    do(Variations("a", u("render"), boost=1.1),
       Variations("a", u("renders"), boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")]),
       Phrase("a", [u("b"), u("c"), u("e")]))
    do(Phrase("a", [u("b"), u("c"), u("d")], boost=1.1),
       Phrase("a", [u("b"), u("c"), u("d")], boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")], slop=1),
       Phrase("a", [u("b"), u("c"), u("d")], slop=2))
    # do(Ordered)
    do(Every(), Every("a"))
    do(Every("a"), Every("b"))
    do(Every("a", boost=1.1), Every("a", boost=1.5))
    do(NullQuery, Term("a", u("b")))
    do(ConstantScoreQuery(Term("a", u("b"))),
       ConstantScoreQuery(Term("a", u("c"))))
    do(ConstantScoreQuery(Term("a", u("b")), score=2.0),
       ConstantScoreQuery(Term("a", u("c")), score=2.1))
    do(Require(Term("a", u("b")), Term("c", u("d"))),
       Require(Term("a", u("b"), boost=1.1), Term("c", u("d"))))
    # do(Require)
    # do(AndMaybe)
    # do(AndNot)
    # do(Otherwise)

    do(SpanFirst(Term("a", u("b")), limit=1), SpanFirst(Term("a", u("b")),
                                                        limit=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d"))),
       SpanNear(Term("a", u("b")), Term("c", u("e"))))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), slop=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), slop=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=True),
       SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=False))
    do(SpanNot(Term("a", u("b")), Term("a", u("c"))),
       SpanNot(Term("a", u("b")), Term("a", u("d"))))
    do(SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("d"))]),
       SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("e"))]))
    do(SpanContains(Term("a", u("b")), Term("a", u("c"))),
       SpanContains(Term("a", u("b")), Term("a", u("d"))))
Example #9
    def __init__(self, fieldname, text, boost=1.0, maxdist=1):
        FuzzyTerm.__init__(self, fieldname, text, 1.0, 2)
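This fragment is presumably the body of a CustomFuzzyTerm subclass like the one referenced in Example #5. A self-contained sketch of how such a class can be plugged into a parser via the termclass argument; the schema and field names here are illustrative, not from the original project:

from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import MultifieldParser
from whoosh.query import FuzzyTerm

class CustomFuzzyTerm(FuzzyTerm):
    # Ignores the passed boost/maxdist and fixes the edit distance at 2.
    def __init__(self, fieldname, text, boost=1.0, maxdist=1):
        FuzzyTerm.__init__(self, fieldname, text, 1.0, 2)

# Illustrative schema; "content" and "tags" mirror the fields in Example #5.
schema = Schema(docId=ID(stored=True), content=TEXT, tags=TEXT)
parser = MultifieldParser(["content", "tags"], schema=schema,
                          termclass=CustomFuzzyTerm)
query = parser.parse(u"fuzzy serch")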
Example #10
def search_doc(directory,
               word,
               doc_types,
               num_page=1,
               num_by_page=10,
               show_num_results=True):
    """
    * -------------{Function}---------------
    * Returns a list of docs that contain a given set of words and match a given type
    * -------------{returns}----------------
    * {set} query results . . . 
    * -------------{params}-----------------
    * : directory -> path of the index
    * : word -> words to query
    * : doc_types -> type of doc to search
    * : num_page -> page number to return
    * : num_by_page -> number of results per page
    * : show_num_results -> whether to also return the number of results
    """
    index_schema = load_index(directory)
    doctypes_schema = load_doctypes_schema(directory)

    # Retrieves the fields to search from the doctypes schema
    fields_to_search = []
    for doc_type in doc_types:
        doc_type = doc_type.lower()
        try:
            schema = doctypes_schema[doc_type]
            fields_to_search = fields_to_search + schema
        except KeyError:
            logger.warning(
                "Schema not found for {doc_type}".format(doc_type=doc_type))

    # By default we search "content" (for BC) and "tags"
    fields = ["content", "tags"] + fields_to_search
    logger.info(
        "search will be performed on fields {fields}".format(fields=fields))

    # Creates the query parser
    # MultifieldParser allows search on multiple fields
    # We use a custom FuzzyTerm class to set the Levenshtein distance to 2
    parser = MultifieldParser(fields,
                              schema=doctypes_schema,
                              termclass=CustomFuzzyTerm)
    query = parser.parse(word)

    # Creates a filter on the doctype field
    doctype_filter_matcher = []
    for doc_type in doc_types:
        term = FuzzyTerm("doc_type", doc_type.lower(), 1.0, 2)
        doctype_filter_matcher.append(term)

    doc_type_filter = Or(doctype_filter_matcher)

    # Processes the search (request the index, Whoosh magic)
    with index_schema.searcher() as searcher:
        results = searcher.search_page(query,
                                       num_page,
                                       pagelen=num_by_page,
                                       filter=doc_type_filter)
        results_id = [result["doc_id"] for result in results]
        logger.info("Results: {results_id}".format(results_id=results_id))

        # Ensures BC if the number of results is not requested
        if show_num_results:
            return {"ids": results_id, "num_results": len(results)}
        else:
            return {"ids": results_id}
Example #11
    def __init__(self, fieldname, text, boost=1.0, maxdist=1):
        FuzzyTerm.__init__(self, fieldname, text, 1.0, 2)
Example #12
'''
Created on Oct 27, 2014

@author: Cassie
'''
from whoosh import index
from whoosh.qparser import QueryParser
from whoosh.query import And, Term, Not, FuzzyTerm, Phrase

ix = index.open_dir("index")

q1 = And([
    Term("city_text", u"greek"),
    Term("city_text", u"roman"),
    Not(Term("city_text", u"persian"))
])
q2 = FuzzyTerm("city_text", u"shakespeare")
q3 = Phrase("city_text", [u"located", u"below", u"sea", u"level"], slop=10)

with ix.searcher() as s:
    results = s.search(q2, limit=None)
    for a in results:
        print(a['city_name'])
Example #13
def fuzzy_term(q, dist, field):
    if len(q) <= 3:
        return Term(field, q)
    return FuzzyTerm(field, q, maxdist=dist, prefixlength=1)
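A hedged usage sketch for the helper above: one fuzzy (or exact, for words of three letters or fewer) term per token, OR-ed into a single query. The "content" field name and the wrapper function are assumptions:

from whoosh.query import Or, Term, FuzzyTerm

def fuzzy_query(text, dist=1, field="content"):
    # Build one term per whitespace-separated token via fuzzy_term() above.
    return Or([fuzzy_term(token, dist, field) for token in text.split()])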
Example #14
    def perform_search(self, sentence):
        with self._searcher() as s:
            tokens = sentence.split()
            tokens = [token for token in tokens if token != REPLACED]
            print('tokens=', tokens)
            exact_and_match = And([Term(TEXT_FIELD, t) for t in tokens],
                                  boost=.5)
            exact_or_match = Or([Term(TEXT_FIELD, t) for t in tokens],
                                boost=.5,
                                scale=0.9)
            # Added variability of maxdist based on word length
            fuzzy_or_match = Or([
                FuzzyTerm(TEXT_FIELD,
                          t,
                          prefixlength=1,
                          maxdist=1 if len(t) < 8 else 2)
                for t in tokens if len(t) >= 4
            ],
                                boost=.2,
                                scale=0.9)
            if len(tokens) > 1:
                # add bigrams if there are any
                bigrams = ['_'.join(b) for b in find_ngrams(tokens, 2)]
                bigram_fuzzy_or_match = Or([
                    FuzzyTerm(BIGRAMS_FIELD,
                              b,
                              prefixlength=3,
                              maxdist=2 if len(b) < 8 else 3) for b in bigrams
                ],
                                           scale=0.9)
            else:
                bigram_fuzzy_or_match = None

            non_brand_or_match = Or(
                [Term(NONBRAND_TEXT_FIELD, t) for t in tokens])

            # q = exact_and_match \
            # | exact_or_match \
            # | fuzzy_or_match

            # my_match = Or([Term(f, token) for token in tokens], boost=1)
            # q = my_match

            #
            # q = Or([FuzzyTerm(f, token, prefixlength=2) for token in tokens if len(token) >= 3], boost=1.0,
            #                    scale=0.9)

            q = exact_and_match | exact_or_match | fuzzy_or_match | non_brand_or_match

            if bigram_fuzzy_or_match:
                q = q | bigram_fuzzy_or_match

            print(q)
            search_results = self.get_search_results(self._index, s, q)

            for x in search_results:
                print(x, x.score)

            if search_results:
                score, text, matched = search_results[0].items()
                return text, list(set(matched))
            else:
                return None, None
Example #15
def fuzzy_term(q: str, dist: int, field: str) -> Term:
    if len(q) <= 3:
        return Term(field, q)
    return FuzzyTerm(field, q, maxdist=dist, prefixlength=1)
Example #16
    def search(self, query):
        with self.index.searcher() as searcher:
            terms = [FuzzyTerm("content", word, maxdist=2) for word in query]
            search_query = Or(terms)
            results = searcher.search(search_query)
            return [result["filename"] for result in results]
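For context, a sketch of how the index behind self.index might be built; the schema, directory name, and sample document are assumptions, with only the "filename" and "content" field names taken from the method above:

import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

# The owning class is assumed to keep the opened index on self.index and to
# receive the query as an iterable of words, e.g. text.split().
os.makedirs("indexdir", exist_ok=True)
schema = Schema(filename=ID(stored=True), content=TEXT)
ix = create_in("indexdir", schema)
writer = ix.writer()
writer.add_document(filename=u"notes.txt", content=u"whoosh fuzzy matching example")
writer.commit()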