Example #1
    def search(self, query: str, frequencies=False):
        """
        Get the indices of the documents matching the query

        :param query: The whoosh query string
        :param frequencies: If true, return pairs of (doc_i, frequency) rather than only doc_i
        :return: list of the stored ``doc_i`` values of the matching documents
                 (as ``(doc_i, frequency)`` pairs if frequencies is True)
        """
        with self.index.searcher(weighting=scoring.Frequency) as searcher:
            if not frequencies:
                parsed = QueryParser("text", self.index.schema).parse(query)
                hits = searcher.search(parsed, limit=None, scored=False, sortedby=None)
                return [hit['doc_i'] for hit in hits]

            # For some reason, searcher.search counts all individual occurrences of the
            # terms in a phrase ("term1 term2") once the phrase occurs at least once.
            # So for frequencies we walk the matchers ourselves and count spans.
            # (There may be a setting that fixes this with searcher.search directly.)
            totals = defaultdict(float)
            parser = QueryParser("text", self.index.schema)
            reader = searcher.reader()

            for subquery in divide_query(query):
                matcher = parser.parse(subquery).matcher(searcher)
                while matcher.is_active():
                    docnum = reader.stored_fields(matcher.id())['doc_i']
                    bd = boostdict(matcher)
                    for span in matcher.spans():
                        # Spans without an explicit boost count as one occurrence.
                        totals[docnum] += bd[span] if span in bd else 1
                    # Move on to the next matching document; without this the
                    # matcher stays on the same document and the loop never ends.
                    matcher.next()
            return list(totals.items())
def searchTime(dir, query, lim):
    """
    Search the "content" field, scoring hits by an equal blend of BM25F and TF-IDF.

    :param dir: directory of the whoosh index to open
    :param query: query string; byte strings are decoded as UTF-8
    :param lim: maximum number of hits (passed to ``searcher.search`` as ``limit``)
    :return: list with one entry per hit, in the order whoosh returned them
    """
    index = open_dir(dir)

    class TimeScorer(scoring.BaseScorer):
        """Averages the scores of a TF-IDF scorer and a BM25F scorer."""

        def __init__(self, idfScorer, bm25Scorer):
            self.idfScorer = idfScorer
            self.bm25Scorer = bm25Scorer

        def score(self, matcher):
            return self.bm25Scorer.score(matcher) * 0.5 + self.idfScorer.score(matcher) * 0.5

    class TimeWeight(scoring.WeightingModel):
        """Weighting model that builds a TimeScorer for each scored term."""

        def scorer(self, searcher, fieldname, text, qf=1):
            bm25Scorer = BM25F().scorer(searcher, fieldname, text, qf)
            tfidfScorer = TF_IDF().scorer(searcher, fieldname, text, qf)
            return TimeScorer(tfidfScorer, bm25Scorer)

    # Already-decoded text is used as is; only raw bytes need decoding.
    if isinstance(query, bytes):
        query = query.decode("UTF-8")

    # The searcher takes the weighting *model*; TimeScorer itself cannot be
    # constructed without its two component scorers.
    with index.searcher(weighting=TimeWeight()) as searcher:
        parsed = QueryParser("content", index.schema, group=OrGroup).parse(query)
        results = searcher.search(parsed, limit=lim)
        # NOTE(review): the loop body was missing from the reviewed source. The
        # stored fields of every hit are collected here (copied out before the
        # searcher closes) -- confirm which value callers actually expect.
        return [hit.fields() for hit in results]
def searchPageRank(dir, query, lim, rank):
    """
    Search the "content" field, scoring each posting as weight * idf * rank.

    :param dir: directory of the whoosh index to open
    :param query: query string; byte strings are decoded as UTF-8
    :param lim: maximum number of hits (passed to ``searcher.search`` as ``limit``)
    :param rank: mapping from ``str(document number + 1)`` to a rank value;
                 documents without an entry get rank 0 (and therefore score 0)
    :return: list with one entry per hit, in the order whoosh returned them
    """
    index = open_dir(dir)

    class PageRankScorer(scoring.BaseScorer):
        """Scales the term weight by the term's idf and the document's rank."""

        def __init__(self, idf):
            self.idf = idf

        def score(self, matcher):
            # NOTE(review): the key is derived from the matcher's document number;
            # presumably rank is keyed by 1-based document numbers -- confirm.
            doc = str(matcher.id() + 1)
            return matcher.weight() * self.idf * rank.get(doc, 0)

    class pageRankWeight(scoring.WeightingModel):
        """Weighting model that builds a PageRankScorer for each scored term."""

        def scorer(self, searcher, fieldname, text, qf=1):
            # IDF is a global statistic, so get it from the top-level searcher
            parent = searcher.get_parent()  # Returns self if no parent
            return PageRankScorer(parent.idf(fieldname, text))

    # Already-decoded text is used as is; only raw bytes need decoding.
    if isinstance(query, bytes):
        query = query.decode("UTF-8")

    with index.searcher(weighting=pageRankWeight()) as searcher:
        parsed = QueryParser("content", index.schema, group=OrGroup).parse(query)
        results = searcher.search(parsed, limit=lim)
        # NOTE(review): the loop body was missing from the reviewed source. The
        # stored fields of every hit are collected here (copied out before the
        # searcher closes) -- confirm which value callers actually expect.
        return [hit.fields() for hit in results]
Example #4
    def listar(event):
        """
        Refill the ``lista`` widget with the results of the search typed in ``entrada``.

        The kind of search is selected by ``pattern`` (taken from the enclosing scope):
        "texto" searches subject and body, "spam" searches subjects for any of the
        entered words, "fecha" lists mails dated after the entered YYYYMMDD date.

        :param event: the triggering UI event (unused)
        """
        lista.delete(0, END)
        ixc = open_dir(dirindex1)
        ixa = open_dir(dirindex2)

        if pattern == "texto":
            # One searcher on the second index serves every hit instead of
            # reopening it inside the loop.
            with ixc.searcher() as searcher, ixa.searcher() as namesearch:
                # NOTE(review): the arguments of both parser calls were cut off in
                # the reviewed source; searching for the entry text and looking the
                # sender up by e-mail is inferred from the other branches -- confirm.
                query = MultifieldParser(["asunto", "cuerpo"], ixc.schema).parse(
                    unicode(entrada.get().strip()))
                email_parser = QueryParser('email', ixa.schema)
                for r in searcher.search(query):
                    lista.insert(END, r['remitente'])
                    agenda = namesearch.search(email_parser.parse(r['remitente']))
                    for name in agenda:
                        lista.insert(END, name['nombre'])

                    lista.insert(END, '')

        elif pattern == "spam":
            with ixc.searcher() as searcher:
                # split() collapses repeated whitespace, so no empty OR operands
                # are produced.
                asuntos = " OR ".join(entrada.get().split())
                # NOTE(review): the tail of this call was cut off in the reviewed
                # source; parsing ``asuntos`` against the subject field is inferred.
                query = QueryParser("asunto", ixc.schema).parse(unicode(asuntos))
                for r in searcher.search(query):
                    lista.insert(END, r['fichero'])
                    lista.insert(END, '')

        elif pattern == "fecha":
            with ixc.searcher() as searcher:
                # Parse the entered date and keep only the date part (no time).
                biggerthan = datetime.datetime.strptime(
                    entrada.get().strip(), "%Y%m%d").date()
                # "{<date> to]" is a range that excludes the date itself and has
                # no upper bound.
                query = QueryParser("fecha", ixc.schema).parse(
                    unicode("{" + str(biggerthan) + " to]"))
                for r in searcher.search(query):
                    lista.insert(END, r['remitente'])
                    lista.insert(END, r['destinatario'])
                    lista.insert(END, r['asunto'])
                    lista.insert(END, '')
Example #5
def buscar(pattern, texto):
    """
    Search the index for ``texto`` and return the matching ``Anime`` objects.

    :param pattern: name of the index field to search in
    :param texto: query text
    :return: list of ``Anime`` instances looked up by the ``titulo`` of each hit
    """
    ix = open_dir(dirindex)
    res = []
    with ix.searcher() as searcher:
        query = QueryParser(pattern, ix.schema).parse(unicode(texto))
        for r in searcher.search(query):
            # Keep every fetched object; previously it was looked up and then
            # dropped, so the function always returned an empty list.
            res.append(Anime.objects.get(titulo=r['titulo']))
    return res
def searchCOS(dir, query, lim):
    """
    Search the "content" field using whoosh's TF-IDF weighting.

    :param dir: directory of the whoosh index to open
    :param query: query string; byte strings are decoded as UTF-8
    :param lim: maximum number of hits (passed to ``searcher.search`` as ``limit``)
    :return: list with one entry per hit, in the order whoosh returned them
    """
    index = open_dir(dir)

    # Already-decoded text is used as is; only raw bytes need decoding.
    if isinstance(query, bytes):
        query = query.decode("UTF-8")

    with index.searcher(weighting=scoring.TF_IDF()) as searcher:
        parsed = QueryParser("content", index.schema, group=OrGroup).parse(query)
        results = searcher.search(parsed, limit=lim)
        # NOTE(review): the loop body was missing from the reviewed source. The
        # stored fields of every hit are collected here (copied out before the
        # searcher closes) -- confirm which value callers actually expect.
        return [hit.fields() for hit in results]
Example #7
def main(argv):
    """
    Interactive console search over the index configured in ``index_path``.

    Repeatedly prompts for a query (an empty query matches every document),
    prints the hits page by page and stops when the prompt is interrupted.

    :param argv: command line arguments (unused)
    """
    page_len = 20

    config = load_config()
    ix = open_dir(config['index_path'])

    # Scoped searcher so it is closed again once the count is printed.
    with ix.searcher() as searcher:
        print("Doc count=%d" % searcher.doc_count())

    while True:
        try:
            querystring = raw_input("find something? >")
        except (KeyboardInterrupt, EOFError):
            # Ctrl-C or end of input at the query prompt ends the session.
            break

        querystring = querystring.strip()
        if querystring == "":
            q = query.Every()
        else:
            q = QueryParser("content", ix.schema).parse(querystring)

        with ix.searcher() as searcher:
            results = searcher.search_page(q, 1, pagelen=page_len)
            if len(results) == 0:
                print("No result")
                continue

            print("Found %d results" % len(results))
            page = 1
            while True:
                # Number hits continuously across pages, starting at 1.
                first = (page - 1) * page_len + 1
                for number, hit in enumerate(results, first):
                    print("%d >> %s" % (number, hit))
                inp = raw_input(
                    "Page %d/%d, (Enter: next page|q: quit) ? >" %
                    (page, results.pagecount))
                if inp.strip() == 'q' or page >= results.pagecount:
                    break
                page += 1
                results = searcher.search_page(q, page, pagelen=page_len)
Example #8
    def get_context(self, query: str, window: int = 30):
        """
        Get the words in the context (n-word window) of all locations of the string

        :param query: search query
        :param window: window size (in words)
        :return: a generator of (doc_i, tokens) pairs, where tokens is the list of
                 tokens within ``window`` positions of any match in that document
        """
        def get_window_tokens(tokens, spans):
            # ``position`` remembers the last token already yielded, so windows of
            # nearby spans are merged instead of repeating tokens. Starting at -1
            # also clamps the first window to the start of the document.
            position = -1
            for span in spans:
                first = max(position + 1, span.start - window)
                last = min(len(tokens), span.end + window + 1)
                for position in range(first, last):
                    yield tokens[position]

        query = QueryParser("text", self.index.schema).parse(query)
        with self.index.searcher() as searcher:
            reader = searcher.reader()
            matcher = query.matcher(searcher)
            while matcher.is_active():
                docnum = reader.stored_fields(matcher.id())['doc_i']
                yield docnum, list(get_window_tokens(self.tokens[docnum], matcher.spans()))
                # Move on to the next matching document; without this the same
                # document would be yielded forever.
                matcher.next()
def searchL2R(dir, query, lim, rank, w):
    """
    Search the "content" field with a weighted mix of BM25F, TF-IDF and a per-document rank.

    The score of a posting is ``bm25 * w[0] + tfidf * w[1] + rank * w[2]``.

    :param dir: directory of the whoosh index to open
    :param query: query string; byte strings are decoded as UTF-8
    :param lim: maximum number of hits (passed to ``searcher.search`` as ``limit``)
    :param rank: mapping from ``str`` of a document's stored "id" field to its rank
                 value; documents without an entry get rank 0
    :param w: sequence of three weights (BM25F, TF-IDF, rank)
    :return: list with one entry per hit, in the order whoosh returned them
    """
    index = open_dir(dir)

    class L2RScorer(scoring.BaseScorer):
        """Combines BM25F, TF-IDF and the external rank of the matched document."""

        def __init__(self, searcher, idfScorer, bm25Scorer):
            # Stored fields are read through the searcher this scorer was built
            # for, so the matcher's document numbers and the lookup agree.
            self.searcher = searcher
            self.idfScorer = idfScorer
            self.bm25Scorer = bm25Scorer

        def score(self, matcher):
            doc = str(self.searcher.stored_fields(matcher.id())["id"])
            r = rank.get(doc, 0)
            return (self.bm25Scorer.score(matcher) * w[0]
                    + self.idfScorer.score(matcher) * w[1]
                    + r * w[2])

    class L2RWeight(scoring.WeightingModel):
        """Weighting model that builds an L2RScorer for each scored term."""

        def scorer(self, searcher, fieldname, text, qf=1):
            bm25Scorer = BM25F().scorer(searcher, fieldname, text, qf)
            tfidfScorer = TF_IDF().scorer(searcher, fieldname, text, qf)
            return L2RScorer(searcher, tfidfScorer, bm25Scorer)

    # Already-decoded text is used as is; only raw bytes need decoding.
    if isinstance(query, bytes):
        query = query.decode("UTF-8")

    with index.searcher(weighting=L2RWeight()) as searcher:
        parsed = QueryParser("content", index.schema, group=OrGroup).parse(query)
        results = searcher.search(parsed, limit=lim)
        # NOTE(review): the loop body was missing from the reviewed source. The
        # stored fields of every hit are collected here (copied out before the
        # searcher closes) -- confirm which value callers actually expect.
        return [hit.fields() for hit in results]
Example #10
def _journal_filter(user_log, search_term):
    """
    Filters sqlalchemy user_log based on search_term with whoosh Query language

    A single term, prefix, wildcard or date range adds one filter; an AND query
    adds one filter per sub-term; an OR query adds one filter that OR-s its
    sub-terms. An empty search term or any other query shape leaves ``user_log``
    unchanged.

    :param user_log: query object to narrow down (must provide ``filter``)
    :param search_term: search expression in whoosh query language
    :return: the (possibly filtered) ``user_log``
    """
    log.debug('Initial search term: %r', search_term)
    qry = None
    if search_term:
        qp = QueryParser('repository', schema=JOURNAL_SCHEMA)
        qry = qp.parse(unicode(search_term))
        log.debug('Filtering using parsed query %r', qry)

    def wildcard_handler(col, wc_term):
        # Translate a leading-"*" wildcard into a case-insensitive SQL condition.
        if wc_term.startswith('*') and not wc_term.endswith('*'):
            # postfix == endswith
            wc_term = remove_prefix(wc_term, prefix='*')
            return func.lower(col).endswith(func.lower(wc_term))
        elif wc_term.startswith('*') and wc_term.endswith('*'):
            # wildcard == ilike
            wc_term = remove_prefix(wc_term, prefix='*')
            wc_term = remove_suffix(wc_term, suffix='*')
            return func.lower(col).contains(func.lower(wc_term))
        # NOTE(review): any other wildcard shape is not translated and yields None.
        return None

    # Search field name -> UserLog attribute; unlisted fields map to the
    # attribute of the same name.
    column_names = {
        'repository': 'repository_name',
        'ip': 'user_ip',
        'date': 'action_date',
        'username': 'username',
    }

    def get_filterion(field, val, term):
        # Build the SQL condition for one query term.
        field = getattr(UserLog, column_names.get(field, field))
        log.debug('filter field: %s val=>%s', field, val)

        # sql filtering
        if isinstance(term, query.Wildcard):
            return wildcard_handler(field, val)
        elif isinstance(term, query.Prefix):
            return func.lower(field).startswith(func.lower(val))
        elif isinstance(term, query.DateRange):
            return and_(field >= val[0], field <= val[1])
        return func.lower(field) == func.lower(val)

    def term_filter(term):
        # Date ranges carry a (start, end) pair; every other term carries its text.
        if isinstance(term, query.DateRange):
            val = [term.startdate, term.enddate]
        else:
            val = term.text
        return get_filterion(term.fieldname, val, term)

    if isinstance(qry, query.And):
        # Successive filter() calls combine the sub-terms with AND.
        for term in qry:
            user_log = user_log.filter(term_filter(term))
    elif isinstance(qry, (query.Term, query.Prefix, query.Wildcard,
                          query.DateRange)):
        user_log = user_log.filter(term_filter(qry))
    elif isinstance(qry, query.Or):
        user_log = user_log.filter(or_(*[term_filter(term) for term in qry]))

    return user_log
Example #11
        description = post["d"]
        title = post["t"]
        l = post["l"]

        # Text that entity names are matched against: title plus description.
        dpt = title + ". " + description
        t = time.strftime(post["p"])

        print(title)

        # Maps the name of every entity recognised in the post to its url.
        tags = {}

        # A single searcher and parser serve all words of the post.
        with entities_index.searcher() as searcher:
            name_parser = QueryParser("name", entities_index.schema)
            for word in set(dpt.split()):
                # NOTE(review): the tail of the parser call was cut off in the
                # reviewed source; querying the "name" field for the word itself
                # is inferred from the loop -- confirm.
                results = searcher.search(name_parser.parse(word), limit=100)
                for e in results:
                    name = e["name"]
                    url = e["url"]
                    opt = e["opt"]

                    if word == name and len(opt) == 0:
                        # Exact single-word entity without a qualifier.
                        tags[name] = url
                    elif name not in tags and name in dpt and (
                            len(opt) == 0 or opt in dpt):
                        # Entity name occurs in the text and its qualifier, if
                        # any, occurs in the text as well.
                        tags[name] = url

        print(tags)