コード例 #1
0
ファイル: __init__.py プロジェクト: infinitylabs/Django
def get_index():
    """Open the "search" Whoosh index, building it first if it cannot be read.

    Returns:
        The index opened from the ``settings.WHOOSH_INDEX`` storage directory.

    An ``IOError`` on the first attempt triggers ``create_index()`` followed
    by exactly one retry; a failure of the retry propagates to the caller.
    """
    def _open_search_index():
        return FileStorage(settings.WHOOSH_INDEX).open_index(indexname="search")

    try:
        return _open_search_index()
    except IOError:
        # The index may be missing (or some other I/O error occurred):
        # build it and try once more.
        create_index()
        return _open_search_index()
コード例 #2
0
ファイル: models.py プロジェクト: dugo/The-Church-of-Horrors
def update_index(sender, instance, created, **kwargs):
    """Mirror a saved model instance into the default Whoosh index.

    Has the signature of a Django ``post_save`` receiver. Indexing is
    best-effort: failures are swallowed so that they never reach the code
    that saved the instance.

    Args:
        sender: Class that sent the signal (unused).
        instance: Saved object; its ``title``, ``content``, ``tags``,
            ``author`` and ``pk`` attributes are read.
        created: True when the instance was just created, False on update.
        **kwargs: Remaining signal arguments (unused).
    """
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index()

    try:
        writer = ix.writer()
    except Exception:
        # No writer available (for instance the index is locked): skip.
        return

    tag_names = []
    for tag in instance.tags.all():
        try:
            tag_names.append(unicode(tag.name))
        except Exception:
            # A tag whose name cannot be converted to text is left out.
            pass
    tags = u','.join(tag_names)

    try:
        author = instance.author
        fields = dict(
            title=instance.title,
            content=instance.content,
            tags=tags,
            author=author.get_profile().name + u"\n" + author.username,
            id=unicode(instance.pk),
        )
        # New rows are added; existing ones replace their previous document.
        index_document = writer.add_document if created else writer.update_document
        index_document(**fields)
        writer.commit()
    except Exception:
        # Still best-effort, but release the writer so the index lock is
        # not left held after a failed add/update/commit.
        try:
            writer.cancel()
        except Exception:
            pass
コード例 #3
0
ファイル: __init__.py プロジェクト: Bouska/memopol2
def update_index(sender, instance, created, **kwargs):
    """Index a saved object in the 'memopol' Whoosh index.

    Has the signature of a Django ``post_save`` receiver.

    Args:
        sender: Class that sent the signal (unused).
        instance: Saved object. Its ``get_absolute_url()`` provides the
            document URL and its ``content`` attribute (called first when it
            is callable) the indexed text; without such an attribute the
            text form of the instance is indexed instead.
        created: True to add a new document, False to update an existing one.
        **kwargs: Remaining signal arguments (unused).

    Nothing is indexed when the ``SKIP_SEARCH_INDEX`` environment variable
    holds a non-zero integer, or when the URL cannot be resolved.
    """
    skip_requested = int(os.environ.get('SKIP_SEARCH_INDEX', '0'))
    if skip_requested:
        return

    try:
        url = unicode(instance.get_absolute_url())
    except Exception:
        log.critical('Cant resolve url. Content %r not indexed' % instance)
        return

    content = getattr(instance, 'content', None)
    if content is None:
        content = unicode(instance)
    elif callable(content):
        content = content()

    index = FileStorage(settings.WHOOSH_INDEX).open_index(indexname='memopol')
    writer = index.writer()
    # Both writer methods take the same fields; only the method differs.
    index_document = writer.add_document if created else writer.update_document
    index_document(title=unicode(instance), content=content,
                   type=unicode(instance.__class__.__name__.lower()),
                   url=url)
    writer.commit()
コード例 #4
0
def init_index(index=".index"):
    """Create a fresh Whoosh index in directory *index* and return it opened.

    Args:
        index: Path of the index directory; it is created with ``os.mkdir``
            when it does not exist yet.

    Returns:
        The index as returned by ``storage.open_index()`` after creation.
    """
    if not os.path.exists(index):
        os.mkdir(index)
    storage = FileStorage(index)
    schema = Schema(
        name=TEXT(stored=True),
        ext=KEYWORD,
        title=TEXT(stored=True),
        content=TEXT,
        path=ID(stored=True),
        tags=KEYWORD,
    )
    storage.create_index(schema)
    # Re-open through the storage, exactly as the index is handed to callers.
    return storage.open_index()
コード例 #5
0
def get_index(index=".index"):
    """Open the Whoosh index stored in directory *index*.

    Args:
        index: Path of the index directory.

    Returns:
        The opened index. When the directory does not exist, a
        human-readable error message string is returned instead (legacy
        behaviour kept for existing callers, who must check the result).
    """
    if not os.path.exists(index):
        return "there is no index with this name %s!! use indexer to build the index" % index
    storage = FileStorage(index)
    ix = storage.open_index()
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("the index has %d docs" % ix.doc_count_all())
    return ix
コード例 #6
0
 def handle_noargs(self, **options):
     """Print the stored fields of every document in the 'SPELL' index.

     The index is read from the hard-coded storage directory
     ``/dev/shm/whoosh/``; ``options`` is ignored.
     """
     spell_index = FileStorage('/dev/shm/whoosh/').open_index('SPELL')

     with spell_index.reader() as reader:
         for doc_number in reader.all_doc_ids():
             print(reader.stored_fields(doc_number))
コード例 #7
0
ファイル: indexer.py プロジェクト: sfirmery/gasoline
    def _open_indexes(self):
        """Open the 'MAIN' index under ./index, creating it when absent.

        The directory ``index`` (relative to the working directory) is
        created if needed. The opened index is stored on ``self.ix``.
        """
        index_dir = "index"
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        storage = FileStorage(index_dir)

        # Initialise the index on first use, then always (re)open it.
        main_exists = storage.index_exists(indexname='MAIN')
        if not main_exists:
            self.ix = storage.create_index(IndexerSchema, indexname='MAIN')
        self.ix = storage.open_index(indexname='MAIN')
コード例 #8
0
ファイル: graphanalyze.py プロジェクト: staffanm/ferenda
    def eval_get_ranked_set_baseline(self, basefile):
        """Build a baseline ranked result set for each article.

        Key terms per article are read from the analyzed keyterms ``.n3``
        file, OR-ed into a query on the ``content`` field, and run against
        the Whoosh index under ``<datadir>/ecj/index`` with BM25F scoring.

        Args:
            basefile: Passed to ``self._articles`` to select the articles.

        Returns:
            dict mapping each article to a list of
            ``http://lagen.nu/ext/celex/<basefile>`` URIs in ranked order
            (an empty list when the query matched nothing).

        Raises:
            KeyError: If an article from ``self._articles`` has no saved
                key terms.
        """
        # Step 1: Read the saved keyterms for a subset of articles
        # (created by analyze_baseline_queries)
        g = Graph()
        g.parse(self.generic_path("keyterms", "analyzed", ".n3"), format="n3")

        articles = {}
        for (s, p, o) in g:
            articles.setdefault(str(s), []).append(str(o))

        # Step 2: Open the large whoosh index containing the text of
        # all cases. Then, create a query for each article based on
        # the keyterms.
        connector = query.Or
        indexdir = os.path.sep.join([self.config.datadir, 'ecj', 'index'])
        storage = FileStorage(indexdir)
        idx = storage.open_index()
        searcher = idx.searcher(weighting=scoring.BM25F())

        res = {}
        try:
            for article in self._articles(basefile):
                terms = articles[article]
                q = query.And([
                    connector([query.Term("content", x) for x in terms])
                ])
                results = searcher.search(q, limit=None)

                rankedset = []
                for resultidx, result in enumerate(results):
                    reslbl = "%s (%s)" % (
                        result['basefile'], results.score(resultidx))
                    rankedset.append([result['basefile'], reslbl])

                # A query may match nothing; do not index into an empty list.
                top_result = rankedset[0][0] if rankedset else None
                self.log.info("Created baseline ranked set for %s: Top result %s (of %s)" %
                              (article.split("/")[-1], top_result, len(rankedset)))

                # return just a list of URIs, no scoring information. But the
                # full URI isnt available in the whoosh db, so we recreate it.
                res[article] = ["http://lagen.nu/ext/celex/%s" % x[0]
                                for x in rankedset]
        finally:
            # Release the searcher's file handles even when a query fails.
            searcher.close()

        return res
コード例 #9
0
ファイル: models.py プロジェクト: Heit/rarog
def update_index(sender, instance, created, **kwargs):
    """Write a saved object to the "rarog" Whoosh index.

    Has the signature of a Django ``post_save`` receiver.

    Args:
        sender: Class that sent the signal (unused).
        instance: Saved object; its text form becomes the title, and its
            ``body_html`` and ``get_absolute_url()`` are indexed as well.
        created: True to add a new document, False to update an existing one.
        **kwargs: Remaining signal arguments (unused).
    """
    index = FileStorage(settings.WHOOSH_INDEX).open_index(indexname="rarog")
    writer = index.writer()
    # Same fields either way; only the writer method differs.
    store_document = writer.add_document if created else writer.update_document
    store_document(title=unicode(instance),
                   body_html=instance.body_html,
                   url=unicode(instance.get_absolute_url()))
    writer.commit()
コード例 #10
0
def searchIndex():
    '''
    searchIndex()
    Interactively locates the LM index and runs a song or lyrics search on it.
    INPUTS: (none) -- the directory and search type are read from the console
    OUTPUTS: (none) -- the search itself is delegated to searchForSong /
             searchForLyrics; entering q/quit returns to the caller early
    '''
    # Navigate to the directory that contains the LM index
    while True:
        print('The current directory is ' + os.getcwd())
        ques = 'Is the LM index (directory) in the current directory? [y/n]\t'
        c = raw_input(ques).lower()
        if c == 'y' or c == 'yes':
            break
        elif c == 'n' or c == 'no':
            while True:
                # The path keeps its case: directory names are case-sensitive
                # on most platforms.
                target = raw_input('Where is it?\t')
                try:
                    os.chdir(target)
                    break
                # OSError also covers WindowsError, which only exists on Windows
                except OSError:
                    print('Sorry, I couldn\'t navigate to that directory')
            break
        elif c == 'q' or c == 'quit':
            print('\tReturning to the Main Menu')
            return
        else:
            print('I\'m sorry, I don\'t understand what you mean. Try again.')

    # Open the index. The working directory is now the chosen one, so build
    # the path from it; re-using a relative answer after chdir would point
    # one level too deep.
    idxDir = os.path.join(os.getcwd(), 'LM_Storage')
    storage = FileStorage(idxDir)
    idx = storage.open_index(indexname = 'LM')

    # Determine what the user wants to search for
    while True:
        ques = 'What would you like to search? song/artist [s], lyrics [L]\t'
        c = raw_input(ques).lower()
        if c == 's' or c == 'song/artist' or c == 'song':
            searchForSong(idx)
            break
        elif c == 'l' or c == 'lyrics':
            searchForLyrics(idx)
            break
        elif c == 'q' or c == 'quit':
            print('\tReturning to the Main Menu')
            return
        else:
            print('I\'m sorry, I don\'t understand what you mean. Try again.')
コード例 #11
0
ファイル: searchservice.py プロジェクト: yonatano/Plink
def search_does_exist(query):
    """Tell whether *query* matches at least one title in the "wiki" index.

    Args:
        query: Query string, parsed against the ``title`` field.

    Returns:
        True when the search (limited to one hit) found something.
    """
    wiki_index = FileStorage("indexdir").open_index(indexname="wiki")

    from whoosh.qparser import QueryParser
    title_parser = QueryParser("title", wiki_index.schema)
    with wiki_index.searcher() as searcher:
        parsed = title_parser.parse(query)
        hits = searcher.search(parsed, limit=1)
        return len(hits) > 0
コード例 #12
0
ファイル: models.py プロジェクト: dugo/The-Church-of-Horrors
 def search(self,q):
     """Full-text search returning the published objects that match *q*.

     ``+`` is rewritten to `` AND `` and `` -`` to `` NOT `` before the
     query is parsed against the content, title, tags and author fields.

     Args:
         q: Raw query string.

     Returns:
         ``self.objects`` filtered to the ids of the hits and to
         ``published=True``.
     """
     from whoosh.filedb.filestore import FileStorage
     from whoosh.qparser import MultifieldParser
     storage = FileStorage(settings.WHOOSH_INDEX)
     ix = storage.open_index()
     q = q.replace('+', ' AND ').replace(' -', ' NOT ')
     parser = MultifieldParser(["content","title","tags","author"], schema=ix.schema)
     qry = parser.parse(q)
     # Collect the ids while the searcher is open, then release it; the
     # filter below only needs the plain id list.
     with ix.searcher() as searcher:
         matching_ids = [hit.fields()['id'] for hit in searcher.search(qry)]

     return self.objects.filter(id__in=matching_ids).filter(published=True)
コード例 #13
0
def run_search(query):
    """Run *query* against the 'MAIN' index and walk through the hits.

    The index location comes from ``HAYSTACK_CONNECTIONS['default']['PATH']``.
    A line is formatted for every hit from its ``id`` and ``title`` but then
    discarded; the function returns None.

    Args:
        query: Query string, parsed against the ``text`` field.
    """
    from settings import HAYSTACK_CONNECTIONS
    index_path = HAYSTACK_CONNECTIONS['default']['PATH']
    main_index = FileStorage(index_path).open_index('MAIN')

    with main_index.searcher() as searcher:
        from whoosh.qparser import QueryParser
        parsed = QueryParser("text", schema=main_index.schema).parse(query)
        for position, hit in enumerate(searcher.search(parsed)):
            # Built only for its field lookups; the text itself is unused.
            _line = "%d: (%s) %s" % (position, hit['id'], hit['title'])
コード例 #14
0
def build_clean_index():
    """Add every StudentInstitute and FacultyInstitute object to the index.

    Each object is handed to ``adddoc`` with the shared writer. The writer
    is committed and the index closed in all cases, including when adding
    fails part-way through.
    """
    ix = FileStorage(settings.WHOOSH_INDEX).open_index()
    writer = ix.writer()
    try:
        mlogger.debug("building index from scratch.....................")
        mlogger.debug("adding objects...................")

        # Students first, then faculty.
        for model in (StudentInstitute, FacultyInstitute):
            for record in model.objects.all():
                adddoc(record, writer, True)
    finally:
        writer.commit()
        ix.close()
コード例 #15
0
ファイル: views.py プロジェクト: Heit/rarog
def search(request):
    """Run the ``q`` query-string parameter against the "rarog" index.

    ``+`` is rewritten to `` AND `` and `` -`` to `` NOT `` before parsing
    against the ``title`` and ``body_html`` fields.

    Args:
        request: Request object whose ``GET`` mapping is read.

    Returns:
        A ``(query, hits)`` tuple: the (rewritten) query string, or None
        when no ``q`` was given, and the search results, or an empty list
        when nothing was searched or the query could not be parsed.
    """
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index(indexname="rarog")
    hits = []
    query = request.GET.get('q', None)
    if query is not None and query != u"":
        query = query.replace('+', ' AND ').replace(' -', ' NOT ')
        parser = MultifieldParser(['title','body_html'], schema=ix.schema)
        try:
            qry = parser.parse(query)
        except Exception:
            # An unparsable query yields no hits rather than an error page;
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            qry = None
        if qry is not None:
            # NOTE(review): the searcher is left open, presumably because the
            # returned hits still read from it - confirm before closing it.
            searcher = ix.searcher()
            hits = searcher.search(qry)
    return query, hits
コード例 #16
0
ファイル: searchservice.py プロジェクト: yonatano/Plink
def add_documents_to_index(index_name, documents):
    """Add the titles of *documents* to the index named *index_name*.

    Progress is printed as a whole-number percentage before each document.
    Only the "wiki" and "movie" indexes receive documents; for any other
    name the writer is committed without additions.

    Args:
        index_name: Name of the index inside the "indexdir" storage.
        documents: Sized sequence of objects with a ``title`` attribute.

    Raises:
        Whatever adding a document raises; the writer is cancelled first so
        the index lock is released.
    """
    storage = FileStorage("indexdir")
    ix = storage.open_index(indexname=index_name)

    writer = ix.writer()
    total = len(documents)
    indexes_titles = index_name in ("wiki", "movie")

    try:
        for i, document in enumerate(documents):
            # Multiply before dividing: i / total truncates to 0 under
            # Python 2 integer division, which printed "0%" every time.
            print("{}%".format(i * 100 // total))

            if indexes_titles:
                writer.add_document(title=u"{}".format(sanitize_text(document.title)))
    except Exception:
        writer.cancel()
        raise

    writer.commit()
コード例 #17
0
ファイル: searchservice.py プロジェクト: yonatano/Plink
def search(query):
    """Return the best title match for *query* from the "wiki" index.

    Args:
        query: Query string, parsed against the ``title`` field.

    Returns:
        List with the string form of each hit; the search is limited to one
        hit, so the list has at most one element.
    """
    wiki_index = FileStorage("indexdir").open_index(indexname="wiki")

    from whoosh.qparser import QueryParser
    title_parser = QueryParser("title", wiki_index.schema)
    with wiki_index.searcher() as searcher:
        parsed = title_parser.parse(query)
        hits = searcher.search(parsed, limit=1)
        return ["{}".format(hit) for hit in hits]
コード例 #18
0
ファイル: search.py プロジェクト: sii/siptrackd
 def _setup(self, storage_directory):
     """Build the schema and open or create the matching index.

     Args:
         storage_directory: Directory for a file-backed index. A falsy
             value selects an in-memory ``RamStorage`` index instead.

     Returns:
         ``(schema, ix)`` tuple.

     When the directory already exists its index is opened and
     ``self._using_existing_index`` is set to True; otherwise the directory
     is created and a new index is built in it.
     """
     schema = fields.Schema(
         oid=fields.ID(stored=True, unique=True),
         name=fields.ID())
     # Any other field name is accepted as free text.
     schema.add('*', fields.TEXT, glob=True)

     if not storage_directory:
         return (schema, RamStorage().create_index(schema))

     if os.path.exists(storage_directory):
         self._using_existing_index = True
         ix = FileStorage(storage_directory).open_index()
     else:
         os.mkdir(storage_directory)
         ix = FileStorage(storage_directory).create_index(schema)
     return (schema, ix)
コード例 #19
0
ファイル: views.py プロジェクト: olethanh/memopol2
def search(request, template_name='search.html'):
    """
    Simple search view, which accepts search queries via url, like google.
    Use something like ?q=this+is+the+search+term

    Reads ``q``, ``types`` and ``limit`` from the query string, validates
    ``q``/``types`` through ``SearchForm`` and searches the 'memopol' Whoosh
    index on the ``content`` field.

    NOTE(review): the visible code stops after closing the index; ``hits``
    and ``template_name`` are never used and nothing is returned, so the
    rendering part of this view appears to be missing here - confirm.
    """
    hits = []
    data = dict(
            q = request.GET.get('q', ''),
            types = request.GET.getlist('types') or [],
        )
    form = SearchForm(data)
    # Called for its side effect of populating form.cleaned_data; the
    # validity result itself is ignored.
    form.is_valid()
    query = form.cleaned_data['q']
    types = form.cleaned_data['types']
    # A missing or zero limit means "no limit" (None).
    limit = int(request.GET.get('limit', 0)) or None
    if query not in (None, u"", u"*"):
        storage = FileStorage(settings.WHOOSH_INDEX)
        ix = storage.open_index(indexname='memopol')
        # Whoosh doesn't understand '+' or '-' but we can replace
        # them with 'AND' and 'NOT'.
        query = query.replace('+', ' AND ').replace(' -', ' NOT ')
        # Restrict by type only when a strict subset of the choices is selected.
        if types and len(types) != len(types_choices):
            query = 'type:(%s) %s' % (' OR '.join(types), query)
        parser = QueryParser("content", schema=ix.schema)
        try:
            qry = parser.parse(query)
        except:
            # don't show the user weird errors only because we don't
            # understand the query.
            # parser.parse("") would return None
            qry = None
        if qry is not None:
            searcher = ix.searcher()
            try:
                hits = searcher.search(qry)
            except Exception, e:
                # A failed search is logged and leaves hits empty.
                log.critical('Error while searching %s' % qry)
                log.exception(e)
            else:
                if limit:
                    hits = hits[:limit]
        ix.close()
コード例 #20
0
ファイル: index.py プロジェクト: jerem/Whoosh
def open_dir(dirname, indexname=None, mapped=True):
    """Open an existing index stored in a directory.

    Convenience wrapper that builds the FileStorage object for you.

    :param dirname: the path string of the directory containing the index.
    :param indexname: the name of the index to open; only needed when
        several indexes live in the same storage. Defaults to the module's
        default index name.
    :param mapped: passed to FileStorage as its ``mapped`` argument.
    :returns: the index returned by the storage's ``open_index``.
    """
    name = _DEF_INDEX_NAME if indexname is None else indexname

    from whoosh.filedb.filestore import FileStorage
    return FileStorage(dirname, mapped=mapped).open_index(name)
コード例 #21
0
ファイル: index.py プロジェクト: BenSchwab/portfolio
def open_dir(dirname, indexname=None, readonly=False):
    """Open an existing index stored in a directory.

    Convenience wrapper that builds the FileStorage object for you.

    :param dirname: the path string of the directory containing the index.
    :param indexname: the name of the index to open; only needed when
        several indexes live in the same storage. Defaults to the module's
        default index name.
    :param readonly: passed to FileStorage as its ``readonly`` argument.
    """
    name = _DEF_INDEX_NAME if indexname is None else indexname

    from whoosh.filedb.filestore import FileStorage
    return FileStorage(dirname, readonly=readonly).open_index(name)
コード例 #22
0
def newIndex():
    '''
    newIndex()
    Creates the 'FAR' index in ./FAR_Storage and returns it opened
    INPUTS: (none)
    OUTPUTS: idx -- the opened index
    '''
    print('\tCreating a new Index in the current directory')
    # Schema: a stored text Name plus stored, comma-separated Ingr keywords
    schm = Schema(Name=TEXT(stored=True), Ingr=KEYWORD(stored=True, commas=True))
    # The index lives in a directory called FAR_Storage
    # See Whoosh documentation for more information
    idxDir = 'FAR_Storage'
    if not os.path.exists(idxDir):
        os.mkdir(idxDir)
    storage = FileStorage(idxDir)
    storage.create_index(schm, indexname='FAR')
    return storage.open_index(indexname = 'FAR')
コード例 #23
0
def generateSchedule():
    '''
    generateSchedule()
    Interactively locates the FAR index and prints every stored recipe.
    INPUTS: (none) -- the directory is read from the console
    OUTPUTS: (none) -- recipes are printed; entering q/quit returns early
    '''
    while True:
        print('The current directory is ' + os.getcwd())
        ques = 'Is the FAR index (directory) in the current directory? [y/n]\t'
        c = raw_input(ques).lower()
        if c == 'y' or c == 'yes':
            break
        elif c == 'n' or c == 'no':
            while True:
                # The path keeps its case: directory names are case-sensitive
                # on most platforms.
                target = raw_input('Where is it?\t')
                try:
                    os.chdir(target)
                    break
                # OSError also covers WindowsError, which only exists on Windows
                except OSError:
                    print('Sorry, I couldn\'t navigate to that directory')
            break
        elif c == 'q' or c == 'quit':
            print('\tReturning to the Main Menu')
            return
        else:
            print('I\'m sorry, I don\'t understand what you mean. Try again.')

    # Open the index
    print('\nThe Food \'Roo is gathering your recipes. Please wait paitiently ...')

    # The working directory is now the chosen one, so build the path from
    # it; re-using a relative answer after chdir would point one level too deep.
    idxDir = os.path.join(os.getcwd(), 'FAR_Storage')
    storage = FileStorage(idxDir)
    idx = storage.open_index(indexname = 'FAR')

    # documents() is a one-shot generator: materialise it once so it can be
    # both counted and iterated (counting it used to leave the loop empty).
    with idx.searcher() as searcher:
        recipes = list(searcher.documents())
    print('Yay! We\'ve got a whole pouch full of recipies')
    print('You\'ve got ' + str(len(recipes)) + ' recipies in Food \'Roo\'s pouch\n\n')

    for item in recipes:
        print('I can get in here')

        print(item)
        print(item['Name'])
コード例 #24
0
    def handle_noargs(self, **options):
        """Run one fixed query twice against the 'MAIN' index and print the hits.

        The first run uses the default ordering, the second sorts by
        ``-created``. Every hit is printed as ``<position>: (<id>) <title>``.
        The index location comes from
        ``HAYSTACK_CONNECTIONS['default']['PATH']``; ``options`` is ignored.
        """
        from settings import HAYSTACK_CONNECTIONS
        index_path = HAYSTACK_CONNECTIONS['default']['PATH']
        ix = FileStorage(index_path).open_index('MAIN')

        query_text = u"((title:whee OR text:whee OR notes:whee OR program:whee OR job_title:whee) AND document_type:5 AND (programs:1 OR programs:2))"

        with ix.searcher() as s:
            from whoosh.qparser import QueryParser
            qp = QueryParser("content", schema=ix.schema)

            # Same query each time; only the search options differ.
            for search_options in ({}, {'sortedby': '-created'}):
                q = qp.parse(query_text)
                results = s.search(q, **search_options)
                for position, hit in enumerate(results):
                    print("%d: (%s) %s" % (position, hit['id'], hit['title']))
コード例 #25
0
ファイル: utils.py プロジェクト: braddockcg/internet-in-a-box
def whoosh_open_dir_32_or_64(dirname, indexname=None, readonly=False):
    """Open an index in a directory, modelled on whoosh.index.open_dir.

    Memory mapping is switched off when ``is32bit()`` reports a 32-bit
    machine, to avoid address space exhaustion on large indices.

    :param dirname: the path string of the directory containing the index.
    :param indexname: the name of the index to open; only needed when
        several indexes live in the same storage. Defaults to whoosh's
        default index name.
    :param readonly: passed to FileStorage as its ``readonly`` argument.
    """

    from whoosh.filedb.filestore import FileStorage
    from whoosh.index import _DEF_INDEX_NAME

    mmap_allowed = not is32bit()
    name = _DEF_INDEX_NAME if indexname is None else indexname

    return FileStorage(
        dirname,
        readonly=readonly,
        supports_mmap=mmap_allowed,
    ).open_index(name)
コード例 #26
0
class TinaIndex():
    """
    Opens the whoosh index stored in a directory, creating it when empty
    """

    def __init__( self, indexdir ):
        """
        Open the index in *indexdir*, or create it with a TinaSchema.

        An EmptyIndexError is answered by creating the directory (if needed)
        and a new index; a LockError is logged and re-raised as a new
        LockError wrapping the original.
        """
        self.writer = self.reader = self.searcher = None
        self.indexdir = indexdir
        self.storage = FileStorage(self.indexdir)
        self.index = None
        try:
            self.index = self.storage.open_index()
        except EmptyIndexError:
            _logger.warning( "No existing index at %s : "%self.indexdir)
            self.schema = TinaSchema()
            directory_missing = not os.path.exists(self.indexdir)
            if directory_missing:
                os.mkdir(self.indexdir)
            self.index = self.storage.create_index(self.schema)
        except LockError as le:
            _logger.error("index LockError %s : "%self.indexdir)
            raise LockError(le)
コード例 #27
0
ファイル: makeIndex.py プロジェクト: Ginchung/LuXun-Search
# Script section: (re)create the index under ix_path, then read and clean
# every article text. The part that writes documents is not visible here.
ix_path = 'indexdir/'
ix_name = 'luxun_index_name'

# Create the index directory on first run, then create an empty index in it.
if not os.path.exists(ix_path):
    os.mkdir(ix_path)
ix = create_in(ix_path, schema, indexname=ix_name)
ix.close()

from whoosh.filedb.filestore import FileStorage
import re

# Replace pattern: every character in this class is stripped from the text
rpp = re.compile('[\u3000※·\n\r〔〕 ]')

storage = FileStorage(ix_path)  # ix_path is the index directory
ix = storage.open_index(indexname=ix_name)
# Add the documents to be indexed, following the schema definition
# Note: strings must be unicode
with ix.writer() as w:
    for article in articles:
        with open(article, 'r', encoding='utf-8') as f:
            arr = f.read()
        # Split on the unwanted characters and glue the pieces back together.
        subs = re.split(rpp, arr)
        art = ''.join(subs)
        length = len(art)
        # NOTE(review): rpp has already removed every '〔' and '〕', and the
        # marker is searched for (unreversed) in the reversed text, so this
        # lookup appears to always return -1 and the branch below never
        # runs - confirm the intent.
        ind = art[::-1].find('〔1〕')
        if ind > 0:
            art = art[:length - ind - 1]
            # Split on single digits and drop the piece before the first one.
            art = re.split('[\d]', art)
            art = ''.join(art[1:])
        name_arr = article.split('/')
コード例 #28
0
ファイル: views.py プロジェクト: chitholian/Flower-Detection
def _empty_search_result(query_type):
    """Return the response skeleton shared by both search types."""
    return {
        'detections': 0,
        'locations': [],
        'scores': [],
        'categories': [],
        'similarImages': [],
        'taxanomy': [],
        'benefits': [],
        'extra': [],
        'queryType': query_type,
    }


def _attach_flower_info(result):
    """Fill the taxonomy, benefits, extra and image fields for the found categories."""
    (result['taxanomy'], result['benefits'], result['extra'],
     result['similarImages']) = get_flower_info_and_images(result['categories'])


def _search_by_keywords(keywords):
    """Look *keywords* up in the flower index and return the result dict."""
    result = _empty_search_result('keywords')

    schema = Schema(title=TEXT(stored=True),
                    cat_id=ID(stored=True),
                    content=TEXT)
    storage = FileStorage(f'{BASE_DIR}/TheApp/flowers/indexdir')
    ix = storage.open_index(schema=schema)
    with ix.searcher() as searcher:
        query = QueryParser('content', ix.schema).parse(keywords)
        for hit in searcher.search(query):
            if 'cat_id' in hit:
                result['categories'].append(int(hit['cat_id']))

    # Several hits may share a category; count each category once.
    result['detections'] = len(set(result['categories']))
    _attach_flower_info(result)
    return result


def _search_by_image(upload):
    """Run the TFLite detection model on *upload* and return the result dict."""
    # Run inference on the uploaded image.
    image = Image.open(upload)
    image = image.resize((300, 300)).convert('RGB')
    image = np.asarray(image, dtype=np.uint8)

    # Load TFLite model and allocate tensors.
    interpreter = tf.lite.Interpreter(
        model_path=f"{BASE_DIR}/TheApp/flowers/ML-Models/model.tflite")
    interpreter.allocate_tensors()

    # Get input and output tensors.
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    input_data = np.reshape(image, input_details[0]['shape'])
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()

    locations = interpreter.get_tensor(output_details[0]['index'])
    categories = interpreter.get_tensor(output_details[1]['index'])
    scores = interpreter.get_tensor(output_details[2]['index'])
    detections = interpreter.get_tensor(output_details[3]['index'])

    result = _empty_search_result('image')
    for d in range(int(detections[0])):
        score = scores[0][d]
        # Detections scoring below 0.25 are dropped.
        if score >= 0.25:
            result['detections'] += 1
            result['locations'].append(locations[0][d])
            result['scores'].append(score)
            result['categories'].append(categories[0][d])

    _attach_flower_info(result)
    return result


def search(request):
    """Search flowers by keywords or by an uploaded image.

    Only POST is accepted. ``searchType`` selects the mode: ``keywords``
    (needs a non-empty ``keywords`` field) or ``image`` (needs an ``image``
    file upload).

    Args:
        request: Request object; ``method``, ``POST`` and ``FILES`` are read.

    Returns:
        A JsonResponse: the result dict on success, ``405`` for other HTTP
        methods, and ``400`` when ``searchType`` is missing or unknown or
        the required field/file is absent.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'Method not allowed'}, status=405)

    # .get() so that an absent field reaches the 400 answers below instead
    # of raising a key error.
    search_type = request.POST.get('searchType')

    if search_type == 'keywords':
        keywords = request.POST.get('keywords')
        if not keywords:
            return JsonResponse({'error': 'Bad request'}, status=400)
        return JsonResponse(_search_by_keywords(keywords), safe=True)

    if search_type == 'image':
        upload = request.FILES.get('image')
        if not upload:
            return JsonResponse({'error': 'Bad request'}, status=400)
        return JsonResponse(_search_by_image(upload),
                            encoder=NumpyJSONEncoder, safe=True)

    return JsonResponse({'error': 'Bad request'}, status=400)
コード例 #29
0
# Storage rooted at the index directory
storage = FileStorage(indexdir)

# Document schema: unique stored id, stored title and content, keyword tags
schema = Schema(
    id=ID(stored=True, unique=True),
    titulo=TEXT(stored=True),
    conteudo=TEXT(stored=True),
    tags=KEYWORD,
)

# Open the "docs2" index when it already exists in indexdir, otherwise
# create it from the schema above
usages_exists = index.exists_in(indexdir, indexname="docs2")
ix = (storage.open_index(indexname="docs2")
      if usages_exists is True
      else storage.create_index(schema, indexname="docs2"))

def carregarTxt():
    """Read the lines of every ``docs/*.txt`` file, opened as UTF-8.

    NOTE(review): only the reading part is visible here; ``writer``, ``num``
    and ``linhas`` are not used yet, so the indexing part presumably
    follows - confirm.
    """
    # Go through all the txt files in the docs folder
    writer = ix.writer()

    for num,arq in enumerate(glob.glob("docs/*.txt")):
        # Open the file
        try:
            f = codecs.open(arq, "r", "UTF-8")
        except Exception:
            print 'Falha na leitura do arquivo: ', arq
        # NOTE(review): after a failed open, execution still reaches the next
        # line, where f is either undefined or still the previous file - a
        # `continue` may have been intended; confirm.
        linhas = f.readlines()
コード例 #30
0
ファイル: backend.py プロジェクト: tjwalch/wagtail-whoosh
class WhooshSearchBackend(BaseSearchBackend):
    """Search backend that keeps its data in a file-based Whoosh index."""

    query_compiler_class = WhooshSearchQueryCompiler
    results_class = WhooshSearchResults
    rebuilder_class = WhooshSearchRebuilder

    def __init__(self, params):
        """Read the backend settings, then set up and refresh the index.

        ``params`` supplies ``PATH`` (index directory) and optionally
        ``POST_LIMIT`` (default 128 MiB).
        """
        super().__init__(params)
        self.params = params

        self.setup_complete = False
        self.use_file_storage = True
        self.post_limit = params.get("POST_LIMIT", 128 * 1024 * 1024)
        self.path = params.get("PATH")

        self.setup()
        self.refresh_index(optimize=False)

    def setup(self):
        """Prepare storage, schema, query parser and index.

        The index directory is created when missing. An index is created
        for a new directory, or when opening reports an empty index;
        otherwise the existing index is opened.

        Raises:
            IOError: If the index directory is not writable.
        """
        directory_created = False

        if self.use_file_storage:
            # Make sure the index directory is there and usable.
            if not os.path.exists(self.path):
                os.makedirs(self.path)
                directory_created = True

            if not os.access(self.path, os.W_OK):
                raise IOError(
                    "The path to your Whoosh index '%s' is not writable for the current user/group."
                    % self.path)

            self.storage = FileStorage(self.path)

        self.schema = self.build_schema()
        self.content_field_name = "text"

        self.parser = QueryParser(self.content_field_name, schema=self.schema)
        self.parser.add_plugins([FuzzyTermPlugin])

        if directory_created is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # The directory exists but holds no index yet.
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self):
        """Return the schema: stored id, django_ct, django_id and text fields."""
        return Schema(
            id=WHOOSH_ID(stored=True, unique=True),
            django_ct=WHOOSH_ID(stored=True),
            django_id=WHOOSH_ID(stored=True),
            text=TEXT(stored=True),
        )

    def get_config(self):
        """Return the ``SEARCH_CONFIG`` entry of the backend params, if any."""
        return self.params.get('SEARCH_CONFIG')

    def get_index_for_model(self, model, db_alias=None):
        """Return an ``Index`` for *model* bound to this backend."""
        return Index(self, model, db_alias)

    def get_index_for_object(self, obj):
        """Return the index for the model and database of *obj*."""
        return self.get_index_for_model(obj._meta.model, obj._state.db)

    def reset_index(self):
        """Wipe the index and set everything up again."""
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage:
            if os.path.exists(self.path):
                shutil.rmtree(self.path)
        else:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def add_type(self, model):
        """Do nothing; no per-model preparation is needed."""
        pass

    def refresh_index(self, optimize=True):
        """Refresh the index (running setup first if needed), optionally optimizing it."""
        if self.setup_complete:
            self.index = self.index.refresh()
        else:
            self.setup()
        if optimize:
            # optimize is a locking operation, shouldn't be called unless recreating the index
            self.index.optimize()

    def add(self, obj):
        """Add a single object to its index."""
        self.get_index_for_object(obj).add_item(obj)

    def add_bulk(self, model, obj_list):
        """Add *obj_list* through the index of its first element; no-op when empty."""
        if not obj_list:
            return
        self.get_index_for_object(obj_list[0]).add_items(model, obj_list)

    def delete(self, obj):
        """Remove a single object from its index."""
        self.get_index_for_object(obj).delete_item(obj)
コード例 #31
0
class RedisWhooshStore(SAMLStoreBase
                       ):  # TODO: This needs a gc mechanism for keys (uuids)
    def json_dict(self, name):
        """Return an ``LRUProxyDict`` over a write-back ``JSONDict``.

        The underlying dict is keyed ``'<store name>_<name>'`` on this store's
        redis connection; the proxy is capped at ``config.cache_size`` entries.
        """
        backing = JSONDict(key='{}_{}'.format(self._name, name),
                           redis=self._redis,
                           writeback=True)
        return LRUProxyDict(backing, maxsize=config.cache_size)

    def xml_dict(self, name):
        """Return an ``LRUProxyDict`` over a write-back ``XMLDict``.

        The underlying dict is keyed ``'<store name>_<name>'`` on this store's
        redis connection; the proxy is capped at ``config.cache_size`` entries.
        """
        backing = XMLDict(key='{}_{}'.format(self._name, name),
                          redis=self._redis,
                          writeback=True)
        return LRUProxyDict(backing, maxsize=config.cache_size)

    def __init__(self, *args, **kwargs):
        """Create the store and open (or build) its Whoosh index.

        Recognised keyword arguments:

        * ``directory`` -- base directory of the index (default ``'.whoosh'``)
        * ``clear`` -- when true, remove the index directory and the redis
          keys of this store (default ``config.store_clear``)
        * ``name`` -- store name, used as index sub-directory and redis key
          prefix (default ``config.store_name``)
        * ``redis`` -- an existing redis connection; one is created with
          ``redis()`` only when none is supplied
        """
        self._dir = kwargs.pop('directory', '.whoosh')
        clear = bool(kwargs.pop('clear', config.store_clear))
        self._name = kwargs.pop('name', config.store_name)
        # Build the default connection lazily so that callers who pass their
        # own connection do not trigger a second one.
        self._redis = kwargs.pop('redis', None)
        if self._redis is None:
            self._redis = redis()
        # On a first run the directory does not exist yet; rmtree would raise.
        if clear and os.path.isdir(self._dir):
            shutil.rmtree(self._dir)
        now = datetime.now()
        self._last_index_time = now
        self._last_modified = now
        self._setup()
        if clear:
            self.reset()

    def _setup(self):
        """(Re)create everything that is not pickled: redis, schema, dicts, index.

        Called from ``__init__`` and ``__setstate__``. If the index cannot be
        opened, the storage and a new index are created and the store is
        reindexed.
        """
        self._redis = getattr(self, '_redis', None)
        if not self._redis:
            # XXX test cases won't get correctly unpickled because of this
            self._redis = redis()
        self.schema = Schema(content=NGRAMWORDS(stored=False))
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        self.schema.add('sha1', ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self.objects = self.xml_dict('objects')
        self.parts = self.json_dict('parts')
        self.storage = FileStorage(os.path.join(self._dir, self._name))
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except Exception as ex:
            # Catch Exception rather than BaseException so that
            # KeyboardInterrupt / SystemExit are not turned into a reindex.
            log.warning(ex)
            self.storage.create()
            self.index = self.storage.create_index(self.schema)
            self._reindex()

    def __getstate__(self):
        """Return the picklable state: directory, name and the two timestamps."""
        persisted = ('_dir', '_name', '_last_index_time', '_last_modified')
        return {attr: getattr(self, attr) for attr in persisted}

    def __setstate__(self, state):
        """Restore the pickled attributes, then rebuild the rest via ``_setup``."""
        vars(self).update(state)
        self._setup()

    def __call__(self, *args, **kwargs):
        """Forward to the base class and schedule a reindex job.

        Nothing happens unless both the ``watched`` and ``scheduler`` keyword
        arguments are supplied (not ``None``).
        """
        watched = kwargs.pop('watched', None)
        scheduler = kwargs.pop('scheduler', None)
        if watched is None or scheduler is None:
            return

        super(RedisWhooshStore, self).__call__(watched=watched,
                                               scheduler=scheduler)
        log.debug("indexing using {}".format(scheduler))
        # The job is added unconditionally; it is not gated on
        # self._last_modified > self._last_index_time.
        scheduler.add_job(
            RedisWhooshStore._reindex,
            args=[self],
            max_instances=1,
            coalesce=True,
            misfire_grace_time=2 * config.update_frequency,
        )

    def _reindex(self):
        """Rebuild the Whoosh index from the objects currently in the store.

        Objects that are not listed in the ``items`` of any part are removed
        from the store first; every remaining object is then added to a
        writer whose merge type is set to ``CLEAR``. The work is done while
        holding the index's ``"reindex"`` lock.
        """
        log.debug("indexing the store...")
        self._last_index_time = datetime.now()
        refs = set([b2u(s) for s in self.objects.keys()])
        # Collect all referenced ids in one pass instead of scanning every
        # part's item list once per ref.
        referenced = set()
        for part in self.parts.values():
            referenced.update(part['items'])
        seen = refs & referenced

        ix = self.storage.open_index()
        lock = ix.lock("reindex")
        try:
            log.debug("waiting for index lock")
            lock.acquire(True)
            log.debug("got index lock")
            with ix.writer() as writer:
                for ref in refs - seen:
                    log.debug("removing unseen ref {}".format(ref))
                    del self.objects[ref]
                    # An unseen ref is not listed by any part, so it may well
                    # have no part entry of its own; only delete what exists.
                    if ref in self.parts:
                        del self.parts[ref]

                log.debug("updating index")
                for e in self.objects.values():
                    info = self._index_prep(entity_simple_info(e))
                    ref = object_id(e)
                    writer.add_document(object_id=ref, **info)

                writer.mergetype = CLEAR
        finally:
            try:
                log.debug("releasing index lock")
                lock.release()
            except ThreadError:
                # Best effort: releasing may fail if the lock was never
                # acquired.
                pass

    def dump(self):
        """Print every indexed document that has an ``object_id`` (debug aid)."""
        from whoosh.query import Every

        ix = self.storage.open_index()
        with ix.searcher() as searcher:
            # Search through the managed searcher; opening a second one here
            # would leave it unclosed.
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        """Turn an entity info dict into keyword arguments for ``add_document``.

        :param info: mapping describing one entity; must contain
            ``'entity_id'``. An optional ``'entity_attributes'`` mapping is
            flattened into the top level.
        :return: dict with ``content`` (the space-joined free-text fields),
            ``sha1`` and one lower-cased entry for every key that maps to a
            schema field. List/tuple values are joined with spaces.
        """
        # Work on a copy so the caller's dict is left untouched.
        info = dict(info)
        if 'entity_attributes' in info:
            info.update(info.pop('entity_attributes'))

        text_fields = ('service_name', 'title', 'domain', 'keywords', 'scopes')
        content = " ".join(
            value for value in (info.get(name, '') for name in text_fields)
            if value is not None)

        res = dict()
        res['content'] = content.strip()
        # The schema does not change while we loop; look its names up once.
        schema_names = set(self.schema.names())
        for a, v in info.items():
            k = ATTRS_INV[a] if a in ATTRS_INV else a
            if k not in schema_names:
                continue
            if isinstance(v, (list, tuple)):
                res[k] = " ".join([vv.lower() for vv in v])
            elif isinstance(v, six.string_types):
                res[k] = v.lower()
        res['sha1'] = hash_id(info['entity_id'], prefix=False)
        return res

    def update(self, t, tid=None, etag=None, lazy=True):
        """Store the entity or entity collection found at the root of ``t``.

        * ``EntityDescriptor``: stored (with a one-item part) only when an
          ``etag`` is given and differs from the recorded one.
        * ``EntitiesDescriptor``: ``tid`` defaults to the ``Name`` attribute
          and ``etag`` to a sha256 digest of the tree; every contained entity
          is stored when the etag differs from the recorded one.

        With ``lazy`` false the index is rebuilt immediately afterwards.
        """
        relt = root(t)
        assert relt is not None

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            ref = object_id(relt)
            known = self.parts[ref] if ref in self.parts else None
            if etag is not None and (known is None
                                     or known.get('etag', None) != etag):
                self.parts[ref] = dict(id=relt.get('entityID'),
                                       etag=etag,
                                       count=1,
                                       items=[ref])
                self.objects[ref] = relt
                self._last_modified = datetime.now()
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            if etag is None:
                etag = hex_digest(dumptree(t, pretty_print=False), 'sha256')
            known = self.parts[tid] if tid in self.parts else None
            if known is None or known.get('etag', None) != etag:
                members = set()
                for entity in iter_entities(t):
                    member_ref = object_id(entity)
                    members.add(member_ref)
                    self.objects[member_ref] = entity
                self.parts[tid] = dict(id=tid,
                                       count=len(members),
                                       etag=etag,
                                       items=list(members))
                self._last_modified = datetime.now()

        if lazy:
            return
        self._reindex()

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def collections(self):
        """Return the keys of ``self.parts``, each passed through ``b2u``."""
        part_keys = self.parts.keys()
        return list(map(b2u, part_keys))

    def reset(self):
        """Delete this store's ``parts`` and ``objects`` keys from redis."""
        for k in ('{}_{}'.format(self._name, 'parts'),
                  '{}_{}'.format(self._name, 'objects')):
            # Delete the key being iterated; the loop must not delete both
            # keys on every pass.
            self._redis.delete(k)

    def size(self, a=None, v=None):
        """Count objects in the store.

        With no attribute ``a`` this is the number of stored objects; with
        only ``a`` it is the number of values of that attribute; with both
        ``a`` and ``v`` it is the number of results of looking up ``a=v``.
        """
        if a is None:
            return len(self.objects.keys())
        if v is None:
            return len(self.attribute(a))
        return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        """Yield ``b2u(ATTRS[n])`` for each indexed field name ``n`` found in ``ATTRS``."""
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for field_name in reader.indexed_field_names():
                if field_name not in ATTRS:
                    continue
                yield b2u(ATTRS[field_name])

    def attributes(self):
        """Return the values produced by ``_attributes`` as a list, passed through ``b2u``."""
        names = list(self._attributes())
        return b2u(names)

    def attribute(self, a):
        """Return the indexed terms of attribute ``a``.

        ``a`` is translated through ``ATTRS_INV`` to a field name; an
        attribute that is not in ``ATTRS_INV`` gives an empty list.
        """
        if a not in ATTRS_INV:
            return []

        field_name = ATTRS_INV[a]
        ix = self.storage.open_index()
        with ix.searcher() as searcher:
            terms = list(searcher.lexicon(field_name))
            return b2u(terms)

    def _prep_key(self, key):
        """Translate a lookup key into Whoosh query syntax.

        ``+`` becomes ``AND``, ``-`` becomes ``AND NOT``, every key of
        ``ATTRS_INV`` is replaced by its mapped name, and ``name=value`` /
        ``{name}value`` pairs are rewritten as ``name:value``.
        """
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        key = key.replace('-', ' AND NOT ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        # Raw strings: "\S" in a normal literal is an invalid escape sequence
        # and triggers a warning on current Python versions.
        key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
        key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
        key = key.strip()

        return key

    def _entities(self):
        """Return the distinct stored objects referenced by the items of any part."""
        candidates = (self.objects.get(ref, None)
                      for part in self.parts.values()
                      for ref in part['items'])
        found = {entity for entity in candidates if entity is not None}
        return b2u(list(found))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def lookup(self, key):
        """Resolve ``key`` to a list of stored objects.

        Tried in order: ``None`` / ``'entities'`` (everything), an object
        key, a part key (expanded recursively through its items) and finally
        a Whoosh query with ``object_id`` as the default field.
        """
        if key is None or key == 'entities':
            return self._entities()

        raw_key = six.b(key)
        if raw_key in self.objects:
            return [self.objects.get(raw_key)]

        if raw_key in self.parts:
            part = self.parts.get(raw_key)
            return [entity
                    for item in part['items']
                    for entity in self.lookup(item)]

        parser = QueryParser("object_id", schema=self.schema)
        query = parser.parse(self._prep_key(key))
        with self.index.searcher() as searcher:
            hits = searcher.search(query, limit=None)
            candidates = [self.objects.get(hit['object_id'], None)
                          for hit in hits]

        matches = {entity for entity in candidates if entity is not None}
        return b2u(list(matches))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def search(self, query=None, path=None, entity_filter=None, related=None):
        """Search the ``content`` and ``domain`` fields.

        ``entity_filter``, when given, is AND-ed onto ``query``. Each matching
        object that is still in the store is returned as ``discojson`` output.
        ``path`` and ``related`` are accepted but not used here.
        """
        if entity_filter:
            query = "{!s} AND {!s}".format(query, entity_filter)
        parser = MultifieldParser(['content', 'domain'], schema=self.schema)
        parsed = parser.parse(self._prep_key(query))

        with self.index.searcher() as searcher:
            results = searcher.search(parsed, limit=None)
            log.debug(results)
            refs = {result['object_id'] for result in results}

        entities = (self.objects.get(ref, None) for ref in refs)
        return [discojson(entity) for entity in entities if entity is not None]
コード例 #32
0
class SearchBackend(BaseSearchBackend):
    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )
    
    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )
    
    def __init__(self, site=None):
        """Read the Whoosh-related settings; the index itself is opened in ``setup``.

        Raises ``ImproperlyConfigured`` when file storage is selected but
        ``HAYSTACK_WHOOSH_PATH`` is not set.
        """
        super(SearchBackend, self).__init__(site)
        self.setup_complete = False
        storage_kind = getattr(settings, 'HAYSTACK_WHOOSH_STORAGE', 'file')
        self.use_file_storage = storage_kind == 'file'
        self.post_limit = getattr(settings, 'HAYSTACK_WHOOSH_POST_LIMIT', 128 * 1024 * 1024)

        if not self.use_file_storage:
            return
        if not hasattr(settings, 'HAYSTACK_WHOOSH_PATH'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_WHOOSH_PATH in your settings.')
    
    def setup(self):
        """
        Open (or create) the storage, schema, parser and index on first use.

        Raises ``IOError`` when the file-storage path is not writable.
        """
        path_created = False

        if self.use_file_storage:
            whoosh_path = settings.HAYSTACK_WHOOSH_PATH

            # Make sure the index is there.
            if not os.path.exists(whoosh_path):
                os.makedirs(whoosh_path)
                path_created = True

            if not os.access(whoosh_path, os.W_OK):
                raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % whoosh_path)

            self.storage = FileStorage(whoosh_path)
        else:
            # A single RAM store is kept on LOCALS and created on demand.
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if path_created:
            # A directory we just made cannot contain an index yet.
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True
    
    def build_schema(self, fields):
        """Translate search fields into a Whoosh schema.

        :param fields: mapping of field name to field object.
        :return: ``(content_field_name, Schema)``; the content field name is
            the index field name of the field flagged ``document``.
        :raises SearchBackendError: when ``fields`` adds nothing to the
            built-in ``id`` / ``django_ct`` / ``django_id`` fields.
        """
        schema_fields = {
            'id': ID(stored=True, unique=True),
            'django_ct': ID(stored=True),
            'django_id': ID(stored=True),
        }
        # Remember how many keys are hard-coded so we can (possibly) fail
        # slightly more gracefully below.
        builtin_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            target = field_class.index_fieldname
            kind = field_class.field_type

            if field_class.is_multivalued:
                if field_class.indexed is False:
                    whoosh_field = IDLIST(stored=True)
                else:
                    whoosh_field = KEYWORD(stored=True, commas=True, scorable=True)
            elif kind in ['date', 'datetime']:
                whoosh_field = DATETIME(stored=field_class.stored)
            elif kind == 'integer':
                whoosh_field = NUMERIC(stored=field_class.stored, type=int)
            elif kind == 'float':
                whoosh_field = NUMERIC(stored=field_class.stored, type=float)
            elif kind == 'boolean':
                whoosh_field = BOOLEAN(stored=field_class.stored)
            else:
                whoosh_field = TEXT(stored=True, analyzer=StemmingAnalyzer())

            schema_fields[target] = whoosh_field

            if field_class.document is True:
                content_field_name = target

        # Fail more gracefully than relying on the backend to die if no
        # fields are found.
        if len(schema_fields) <= builtin_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))
    
    def update(self, index, iterable, commit=True):
        """Add or replace the documents for every object in ``iterable``.

        :param index: search index object whose ``full_prepare(obj)`` yields
            the document dict for an object.
        :param iterable: objects to index; any iterable works, including
            generators.
        :param commit: accepted for interface compatibility. The writer is
            always committed, as we run into locking issues otherwise.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        # Create the writer on the first object only, so an empty iterable
        # does not leave an uncommitted writer behind.
        writer = None

        for obj in iterable:
            if writer is None:
                writer = AsyncWriter(self.index)

            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            writer.update_document(**doc)

        # "Did we write anything" is tracked via the writer rather than
        # len(iterable), which fails for generators after the fact.
        if writer is not None:
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
    
    def remove(self, obj_or_string, commit=True):
        """Delete the document whose ``id`` matches ``get_identifier(obj_or_string)``.

        ``commit`` is accepted but not used here.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        id_query = self.parser.parse(u'id:"%s"' % get_identifier(obj_or_string))
        self.index.delete_by_query(q=id_query)
    
    def clear(self, models=[], commit=True):
        """Remove documents from the index.

        With no ``models`` the whole index is deleted and recreated; otherwise
        only documents whose ``django_ct`` matches one of the given models are
        removed. ``commit`` is accepted but not used here.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if not models:
            self.delete_index()
            return

        clauses = [
            u"django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name)
            for model in models
        ]
        ct_query = self.parser.parse(u" OR ".join(clauses))
        self.index.delete_by_query(q=ct_query)
    
    def delete_index(self):
        """Throw away the whole index and set it up again.

        Per the Whoosh mailing list, deleting the index files is much more
        efficient than removing every document, so file storage is removed
        from disk; RAM storage is emptied with ``clean()``.
        """
        if not self.use_file_storage:
            self.storage.clean()
        elif os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            shutil.rmtree(settings.HAYSTACK_WHOOSH_PATH)

        # Recreate everything.
        self.setup()
        
    def optimize(self):
        """Refresh the index handle and call its ``optimize()``."""
        if not self.setup_complete:
            self.setup()

        refreshed = self.index.refresh()
        self.index = refreshed
        refreshed.optimize()
    
    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, spelling_query=None,
               limit_to_registered_models=None, **kwargs):
        """Run ``query_string`` against the index and return a result dict.

        :param query_string: the query; empty and one-character (non ``*``)
            queries return no results.
        :param sort_by: optional sequence of field names, ``-`` prefixed for
            descending. Only the first field is used for sorting.
        :param start_offset: offset of the first wanted result.
        :param end_offset: offset just past the last wanted result.
        :param highlight: add highlighted content to each result.
        :param facets, date_facets, query_facets: not supported; a warning is
            issued when any of them is given.
        :param narrow_queries: optional set of extra queries used to filter
            the results. Note that the registered-models restriction is added
            to this set in place.
        :param spelling_query: alternative text for spelling suggestions.
        :param limit_to_registered_models: restrict results to registered
            models; defaults to ``HAYSTACK_LIMIT_TO_REGISTERED_MODELS``.
        :return: dict with ``results`` and ``hits`` and, depending on the
            path taken, ``facets`` and ``spelling_suggestion``.
        :raises SearchBackendError: for more than one reverse-ordered field.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_unicode(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if len(sort_by) > 1 and reverse_counter > 1:
                raise SearchBackendError("Whoosh does not handle more than one field and any field being ordered in reverse.")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            # An empty sort_by sequence means "no sorting", not an IndexError.
            sort_by = sort_by_list[0] if sort_by_list else None

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if limit_to_registered_models:
            # Using narrow queries, limit the results to only models registered
            # with the current site.
            if narrow_queries is None:
                narrow_queries = set()

            registered_models = self.build_registered_models_list()

            if len(registered_models) > 0:
                narrow_queries.add('django_ct:(%s)' % ' OR '.join(registered_models))

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_unicode(nq)))

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            # Prevent against Whoosh throwing an error. Requires an end_offset
            # greater than 0.
            if end_offset is not None and end_offset <= 0:
                end_offset = 1

            raw_results = searcher.search(parsed_query, limit=end_offset, sortedby=sort_by, reverse=reverse)

            # Handle the case where the results have been narrowed.
            if narrowed_results:
                raw_results.filter(narrowed_results)

            # Determine the page.
            page_num = 0

            if end_offset is None:
                end_offset = 1000000

            if start_offset is None:
                start_offset = 0

            page_length = end_offset - start_offset

            if page_length and page_length > 0:
                # Floor division keeps the page number an integer even when
                # "/" means true division.
                page_num = start_offset // page_length

            # Increment because Whoosh uses 1-based page numbers.
            page_num += 1

            try:
                raw_page = ResultsPage(raw_results, page_num, page_length)
            except ValueError:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            return self._process_results(raw_page, highlight=highlight, query_string=query_string, spelling_query=spelling_query)
        else:
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }
    
    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None,
                       limit_to_registered_models=None, **kwargs):
        """Warn that More Like This is unsupported and return an empty result dict."""
        warnings.warn("Whoosh does not handle More Like This.", Warning, stacklevel=2)
        empty_result = {'results': [], 'hits': 0}
        return empty_result
    
    def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
        """Convert a page of raw Whoosh hits into ``SearchResult`` objects.

        :param raw_page: page of results; iterated, and queried with
            ``score(offset)`` and ``len()``.
        :param highlight: when true, add a ``highlighted`` entry for the
            content field to every result.
        :param query_string: the original query, used for highlighting and
            spelling suggestions.
        :param spelling_query: alternative text for the spelling suggestion.
        :return: dict with ``results``, ``hits``, ``facets`` (always empty)
            and ``spelling_suggestion``. Hits whose model is unknown or not
            indexed are dropped and subtracted from ``hits``.
        """
        from haystack import site
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        facets = {}
        spelling_suggestion = None
        indexed_models = site.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result['django_ct'].split('.')
            model = get_model(app_label, model_name)

            if not (model and model in indexed_models):
                hits -= 1
                continue

            additional_fields = {}
            # Look the search index up once per hit, not once per field.
            search_index = site.get_index(model)

            for key, value in raw_result.items():
                string_key = str(key)

                if string_key in search_index.fields and hasattr(search_index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if isinstance(search_index.fields[string_key], MultiValueField):
                        # Compare the length by value; "is 0" tests identity.
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = search_index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del additional_fields['django_ct']
            del additional_fields['django_id']

            if highlight:
                from whoosh import analysis
                # Imported under an alias so the ``highlight`` flag is not
                # rebound to the function.
                from whoosh.highlight import highlight as whoosh_highlight, ContextFragmenter, UppercaseFormatter
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace('*', '') for term in query_string.split()]

                additional_fields['highlighted'] = {
                    self.content_field_name: [whoosh_highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                }

            result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
            results.append(result)

        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
    
    def create_spelling_suggestion(self, query_string):
        """Suggest a corrected spelling for ``query_string``.

        Reserved words and characters are stripped, then the top suggestion
        (if any) for each remaining word is joined with spaces. Returns
        ``None`` for an empty query.
        """
        checker = SpellChecker(self.storage)
        cleaned = force_unicode(query_string)

        if not query_string:
            return None

        # Clean the string.
        for reserved_word in self.RESERVED_WORDS:
            cleaned = cleaned.replace(reserved_word, '')

        for reserved_char in self.RESERVED_CHARACTERS:
            cleaned = cleaned.replace(reserved_char, '')

        # Break it down, keeping the best suggestion for each word.
        best_guesses = []

        for word in cleaned.split():
            candidates = checker.suggest(word, number=1)

            if len(candidates) > 0:
                best_guesses.append(candidates[0])

        return ' '.join(best_guesses)
    
    def _from_python(self, value):
        """
        Prepare a Python value for storage in Whoosh.

        Objects with ``strftime`` but no ``hour`` become midnight datetimes,
        lists/tuples become comma-joined unicode, booleans and numbers pass
        through untouched, and everything else is forced to unicode.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if hasattr(value, 'hour'):
                return value
            return datetime(value.year, value.month, value.day, 0, 0, 0)

        if isinstance(value, (bool, int, long, float)):
            # Leave it alone.
            return value

        if isinstance(value, (list, tuple)):
            return u','.join([force_unicode(item) for item in value])

        return force_unicode(value)
    
    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        ``'true'`` / ``'false'`` become booleans, strings matching
        ``DATETIME_REGEX`` become datetimes, and values that parse as JSON
        into a built-in container or number are returned parsed. Anything
        else is returned unchanged.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, basestring):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(converted_value, (list, tuple, set, dict, int, float, long, complex)):
                return converted_value
        except Exception:
            # If parsing fails or we don't trust the result, continue on.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            pass

        return value
コード例 #33
0
class SearchIndex:
    """Full-text index of files, stored in a Whoosh ``FileStorage``.

    Every indexed document has two stored fields: ``fileid`` (the identifier
    of the file) and ``content`` (the text extracted from it).
    """

    def __init__(self, index_folder):
        """Open the index kept in ``index_folder``, creating it when missing."""
        self.storage = FileStorage(index_folder)

        if not self.storage.index_exists():
            schema = Schema(fileid=ID(stored=True), content=TEXT(stored=True))
            self.storage.create_index(schema)
        self.index = self.storage.open_index()

    def size(self):
        """Return the document count reported by the index."""
        return self.index.doc_count()

    def list_files(self):
        """Yield the ``fileid`` of every document in the index."""
        with self.index.reader() as reader:
            for _docnum, doc in reader.iter_docs():
                yield doc['fileid']

    def index_content(self, fileid, path_to_file, mimetype):
        """Index one file.
        """
        with self.index.writer() as writer:
            self._index_content(fileid, path_to_file, mimetype, writer)

    def delete(self, fileid):
        """Delete one file.

        Does nothing when ``fileid`` is not present in the index.
        """
        with self.index.searcher() as searcher:
            docnum = searcher.document_number(fileid=fileid)
        if docnum is None:
            # No matching document: there is nothing to delete, and handing
            # ``None`` to the writer as a document number would fail.
            return
        with self.index.writer() as writer:
            writer.delete_document(docnum)

    def _index_content(self, fileid, path_to_file, mimetype, writer):
        """Index one file.

        The text comes from the extractor registered for ``mimetype`` in
        ``EXTRACTORS``; when there is none, a placeholder message is indexed
        instead so the file still appears in the index.
        """
        if mimetype not in EXTRACTORS:
            content = "Missing extractor for {}".format(mimetype)
        else:
            with open(path_to_file, 'rb') as f:
                document_bytes = f.read()
            magic = detect_from_content(document_bytes)

            content = EXTRACTORS[mimetype](path_to_file, document_bytes, magic)
        writer.add_document(fileid=fileid, content=content)

    def refresh(self, all_files, file_repo):
        """Extract the text from all the files in the repository, purging existing repo.

        ``all_files`` is an iterable of mappings with ``id``, ``number`` and
        ``mimetype`` keys; ``file_repo`` resolves a file number to a path.
        """
        with self.index.reader() as reader:
            docids = list(reader.all_doc_ids())
        # Deletions and re-additions share one writer, so they are committed
        # together when the ``with`` block exits.
        with self.index.writer() as writer:
            for docid in docids:
                writer.delete_document(docid)

            for node in all_files:
                self._index_content(
                    node['id'],
                    file_repo.get_absolute_path_to_file(node['number']),
                    node['mimetype'], writer)

    def similarto(self, fileid, top=20, numterms=50):
        """Return the ``fileid`` of up to ``top`` documents similar to ``fileid``.

        Returns an empty list when ``fileid`` is not present in the index.
        """
        with self.index.searcher() as searcher:
            docnum = searcher.document_number(fileid=fileid)
            if docnum is None:
                # Unknown file: nothing to compare against.
                return []
            results = searcher.more_like(docnum,
                                         'content',
                                         top=top,
                                         numterms=numterms,
                                         normalize=False)
            # Materialize while the searcher is still open.
            return [hit['fileid'] for hit in results]

    def search(self, query_str, limit=20):
        """Return the ``fileid`` of up to ``limit`` documents matching ``query_str``.

        ``query_str`` is parsed against the ``content`` field.
        """
        qp = QueryParser('content', schema=self.index.schema)
        query = qp.parse(query_str)

        with self.index.searcher() as searcher:
            results = searcher.search(query, limit=limit)
            # Materialize while the searcher is still open.
            return [hit['fileid'] for hit in results]

    def text(self, fileid):
        """Return the stored text of ``fileid``, or a message if it is not indexed."""
        with self.index.searcher() as searcher:
            doc = searcher.document(fileid=fileid)
            if doc:
                return doc['content']
            else:
                return "File {} not in index!".format(fileid)
コード例 #34
0
class WhooshSearchBackend(BaseSearchBackend):
    """
    Search backend that stores and queries documents in a Whoosh index.

    The index lives on disk or in RAM depending on the ``STORAGE``
    connection option read in ``__init__``.
    """

    # Word reserved by Whoosh for special use.
    # These are stripped from the query in ``create_spelling_suggestion``.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    # These are stripped from the query in ``create_spelling_suggestion``.
    RESERVED_CHARACTERS = (
        "\\",
        "+",
        "-",
        "&&",
        "||",
        "!",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        "^",
        '"',
        "~",
        "*",
        "?",
        ":",
        ".",
    )

    def __init__(self, connection_alias, **connection_options):
        """
        Reads the Whoosh-specific connection options.

        Recognised options are ``PATH`` (the index directory, required for
        file storage), ``STORAGE`` (``"file"`` by default; any other value
        selects the in-memory store) and ``POST_LIMIT`` (defaults to
        128 * 1024 * 1024).

        Raises ``ImproperlyConfigured`` when file storage is selected without
        a ``PATH``.
        """
        super().__init__(connection_alias, **connection_options)
        self.setup_complete = False
        # ``connection_options`` is a plain dict of keyword arguments, so the
        # option must be read with ``.get()``. ``getattr()`` looks for an
        # attribute on the dict and would always yield the default.
        self.post_limit = connection_options.get("POST_LIMIT",
                                                 128 * 1024 * 1024)
        self.path = connection_options.get("PATH")
        self.use_file_storage = (
            connection_options.get("STORAGE", "file") == "file")

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'."
                % connection_alias)

        self.log = logging.getLogger("haystack")

    def setup(self):
        """
        Defers loading until needed.

        Prepares the storage, the schema, the query parser and the index, then
        flags the backend as set up. Raises ``IOError`` when the file storage
        path is not writable.
        """
        from haystack import connections

        created_directory = False

        if self.use_file_storage:
            # Make sure the index is there.
            if not os.path.exists(self.path):
                os.makedirs(self.path)
                created_directory = True

            if not os.access(self.path, os.W_OK):
                raise IOError(
                    "The path to your Whoosh index '%s' is not writable for the current user/group."
                    % self.path)

            self.storage = FileStorage(self.path)
        else:
            # The in-memory store is created once and then shared via LOCALS.
            if getattr(LOCALS, "RAM_STORE", None) is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        unified_index = connections[self.connection_alias].get_unified_index()
        self.content_field_name, self.schema = self.build_schema(
            unified_index.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)
        self.parser.add_plugins([FuzzyTermPlugin])

        if created_directory:
            # A directory made just now cannot contain an index yet.
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Builds the Whoosh schema for the given search fields.

        ``fields`` maps names to search field objects. Returns a
        ``(content_field_name, Schema)`` tuple in which ``content_field_name``
        is the index fieldname of the field flagged as the document field
        (``""`` when there is none).

        Raises ``SearchBackendError`` when ``fields`` contributes nothing
        beyond the built-in keys.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Remember how many keys Haystack hard-codes so an empty field set can
        # be reported with a clear error below.
        builtin_key_count = len(schema_fields)
        content_field_name = ""

        for field_class in fields.values():
            name = field_class.index_fieldname

            if field_class.is_multivalued:
                if field_class.indexed is False:
                    whoosh_field = IDLIST(stored=True,
                                          field_boost=field_class.boost)
                else:
                    whoosh_field = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost,
                    )
            elif field_class.field_type in ["date", "datetime"]:
                whoosh_field = DATETIME(stored=field_class.stored,
                                        sortable=True)
            elif field_class.field_type == "integer":
                whoosh_field = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost,
                )
            elif field_class.field_type == "float":
                whoosh_field = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost,
                )
            elif field_class.field_type == "boolean":
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                whoosh_field = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == "ngram":
                whoosh_field = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost,
                )
            elif field_class.field_type == "edge_ngram":
                whoosh_field = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at="start",
                    stored=field_class.stored,
                    field_boost=field_class.boost,
                )
            else:
                # Everything else is indexed as stemmed, sortable text.
                whoosh_field = TEXT(
                    stored=True,
                    analyzer=field_class.analyzer or StemmingAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True,
                )

            schema_fields[name] = whoosh_field

            if field_class.document is True:
                content_field_name = name
                schema_fields[name].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= builtin_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Adds or updates the documents for the objects in ``iterable``.

        ``index`` supplies ``full_prepare(obj)``, which builds the document
        for each object; objects for which it raises ``SkipDocument`` are
        skipped. ``iterable`` may be any iterable, including a generator.

        ``commit`` is accepted for interface compatibility but ignored: the
        writer is always committed when at least one object was received.

        Errors from the writer are re-raised unless ``silently_fail`` is set,
        in which case they are logged and indexing continues.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)
        # Tracked during iteration instead of calling ``len(iterable)``
        # afterwards, which fails for generators and other unsized iterables.
        received_objects = False

        for obj in iterable:
            received_objects = True

            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug("Indexing for object `%s` skipped", obj)
                continue

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if "boost" in doc:
                del doc["boost"]

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    "%s while preparing object for update" %
                    e.__class__.__name__,
                    exc_info=True,
                    extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)
                        }
                    },
                )

        if received_objects:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
            if writer.ident is not None:
                writer.join()

    def remove(self, obj_or_string, commit=True):
        """
        Deletes the document identified by ``obj_or_string`` from the index.

        ``commit`` is accepted but not used here. Failures are re-raised
        unless ``silently_fail`` is set, in which case they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            id_query = self.parser.parse('%s:"%s"' % (ID, whoosh_id))
            self.index.delete_by_query(q=id_query)
        except Exception as e:
            if self.silently_fail:
                self.log.error(
                    "Failed to remove document '%s' from Whoosh: %s",
                    whoosh_id,
                    e,
                    exc_info=True,
                )
            else:
                raise

    def clear(self, models=None, commit=True):
        """
        Clears the index, either entirely or for the given models only.

        With ``models=None`` the whole index is deleted and recreated;
        otherwise ``models`` must be a list or tuple and only documents of
        those models are removed. ``commit`` is accepted but not used here.
        Failures are re-raised unless ``silently_fail`` is set, in which case
        they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        clear_everything = models is None

        if not clear_everything:
            assert isinstance(models, (list, tuple))

        try:
            if clear_everything:
                self.delete_index()
            else:
                models_to_delete = []

                for model in models:
                    ct_query = "%s:%s" % (DJANGO_CT, get_model_ct(model))
                    models_to_delete.append(ct_query)

                combined_query = " OR ".join(models_to_delete)
                self.index.delete_by_query(q=self.parser.parse(combined_query))
        except Exception as e:
            if not self.silently_fail:
                raise

            if clear_everything:
                self.log.error("Failed to clear Whoosh index: %s",
                               e,
                               exc_info=True)
            else:
                self.log.error(
                    "Failed to clear Whoosh index of models '%s': %s",
                    ",".join(models_to_delete),
                    e,
                    exc_info=True,
                )

    def delete_index(self):
        """
        Wipes the whole index and rebuilds an empty one via ``setup()``.

        Per the Whoosh mailing list, if wiping out everything from the index,
        it's much more efficient to simply delete the index files.
        """
        if not self.use_file_storage:
            self.storage.clean()
        elif os.path.exists(self.path):
            shutil.rmtree(self.path)

        # Recreate everything.
        self.setup()

    def optimize(self):
        """Refreshes the index and optimizes it, running ``setup()`` first if needed."""
        if not self.setup_complete:
            self.setup()

        refreshed = self.index.refresh()
        self.index = refreshed
        refreshed.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        """
        Translates a slice into the page number and page length for Whoosh.

        Returns ``(page_num, page_length)``, where ``page_num`` is 1-based and
        ``page_length`` is ``end_offset - start_offset`` after defaults have
        been applied (``None`` offsets become 0 and 1000000 respectively).
        """
        if end_offset is None:
            end_offset = 1000000
        elif end_offset <= 0:
            # Prevent against Whoosh throwing an error. Requires an end_offset
            # greater than 0.
            end_offset = 1

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        # Whoosh uses 1-based page numbers, hence the "+ 1" on both paths.
        if page_length > 0:
            page_num = int(start_offset / page_length) + 1
        else:
            page_num = 1

        return page_num, page_length

    @log_query
    def search(self,
               query_string,
               sort_by=None,
               start_offset=0,
               end_offset=None,
               fields="",
               highlight=False,
               facets=None,
               date_facets=None,
               query_facets=None,
               narrow_queries=None,
               spelling_query=None,
               within=None,
               dwithin=None,
               distance_point=None,
               models=None,
               limit_to_registered_models=None,
               result_class=None,
               **kwargs):
        """
        Runs ``query_string`` against the index and returns a result dict.

        The returned dict always has ``"results"`` and ``"hits"``; the full
        path through ``_process_results`` also adds ``"facets"`` and
        ``"spelling_suggestion"``.

        * Empty queries and one-character queries other than ``"*"`` return
          no results without touching the index.
        * ``sort_by`` is a list of field names, each optionally prefixed with
          ``"-"``. Mixing directions raises ``SearchBackendError``.
        * ``facets`` become field facets and ``date_facets`` become date range
          facets; ``query_facets`` only triggers a warning.
        * ``narrow_queries`` and the model restriction (``models`` or the
          registered models) are run as separate searches whose results
          filter the main query.
        * ``fields``, ``within``, ``dwithin``, ``distance_point`` and
          ``**kwargs`` are accepted but not used in this method.

        NOTE(review): several early ``return`` statements below leave
        ``narrow_searcher`` and/or ``searcher`` unclosed; only the normal
        result path closes them.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {"results": [], "hits": 0}

        query_string = force_str(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != "*":
            return {"results": [], "hits": 0}

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith("-"):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            # Strip the "-" prefixes; the first field decides ``reverse``
            # (all fields share one direction after the check above).
            for order_by in sort_by:
                if order_by.startswith("-"):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list

        # ``group_by`` collects the facet objects handed to Whoosh;
        # ``facet_types`` records which output bucket each facet belongs to.
        group_by = []
        facet_types = {}
        if facets is not None:
            group_by += [
                FieldFacet(facet, allow_overlap=True, maptype=Count)
                for facet in facets
            ]
            facet_types.update({facet: "fields" for facet in facets})

        if date_facets is not None:

            def _fixup_datetime(dt):
                # Widen plain dates to datetimes; reject anything else.
                if isinstance(dt, datetime):
                    return dt
                if isinstance(dt, date):
                    return datetime(dt.year, dt.month, dt.day)
                raise ValueError

            for key, value in date_facets.items():
                start = _fixup_datetime(value["start_date"])
                end = _fixup_datetime(value["end_date"])
                gap_by = value["gap_by"]
                gap_amount = value.get("gap_amount", 1)
                # The keyword name is the plural of ``gap_by`` (gap_by + "s").
                gap = RelativeDelta(**{"%ss" % gap_by: gap_amount})
                group_by.append(
                    DateRangeFacet(key, start, end, gap, maptype=Count))
                facet_types[key] = "dates"

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.",
                          Warning,
                          stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            # NOTE(review): when the caller passed a set, it is mutated here.
            narrow_queries.add(" OR ".join(
                ["%s:%s" % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_str(nq)), limit=None)

                # One empty narrow query means the intersection is empty.
                if len(recent_narrowed_results) <= 0:
                    return {"results": [], "hits": 0}

                if narrowed_results is not None:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {"results": [], "hits": 0}

            page_num, page_length = self.calculate_page(
                start_offset, end_offset)

            search_kwargs = {
                "pagelen": page_length,
                "sortedby": sort_by,
                "reverse": reverse,
                "groupedby": group_by,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs["filter"] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num,
                                                **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {"results": [], "hits": 0, "spelling_suggestion": None}

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {"results": [], "hits": 0, "spelling_suggestion": None}

            # The page is converted before the searchers are closed below.
            results = self._process_results(
                raw_page,
                highlight=highlight,
                query_string=query_string,
                spelling_query=spelling_query,
                result_class=result_class,
                facet_types=facet_types,
            )
            searcher.close()

            if hasattr(narrow_searcher, "close"):
                narrow_searcher.close()

            return results
        else:
            # Empty index: there is nothing to search, but a spelling
            # suggestion is still produced when enabled.
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(
                        spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(
                        query_string)
            else:
                spelling_suggestion = None

            return {
                "results": [],
                "hits": 0,
                "spelling_suggestion": spelling_suggestion,
            }

    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        """
        Finds documents similar to the one indexed for ``model_instance``.

        Similarity is computed on the content field. The candidates are
        restricted to ``models`` (or the registered models) and, when given
        and not ``"*"``, to ``additional_query_string``.

        Returns the dict built by ``_process_results``, or a dict with empty
        ``"results"`` and zero ``"hits"`` on the early-exit paths.
        ``**kwargs`` is accepted but not used in this method.

        NOTE(review): the early ``return`` statements below leave
        ``narrow_searcher`` and/or ``searcher`` unclosed; only the normal
        result path closes them.
        """
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            # ``narrow_queries`` is always a set here, so this check never fires.
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(" OR ".join(
                ["%s:%s" % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != "*":
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        # Always true in this method (see above); an empty set simply makes
        # the loop below a no-op.
        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_str(nq)), limit=None)

                # One empty narrow query means the intersection is empty.
                if len(recent_narrowed_results) <= 0:
                    return {"results": [], "hits": 0}

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        # Placeholder used when the index is empty or the instance is not in it.
        raw_results = EmptyResults()

        searcher = None
        if self.index.doc_count():
            # Look up the indexed document for the instance by its identifier.
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                # NOTE(review): ``end_offset`` may still be None here; confirm
                # how Whoosh treats ``top=None``.
                raw_results = results[0].more_like_this(field_name,
                                                        top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, "filter"):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {"results": [], "hits": 0, "spelling_suggestion": None}

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {"results": [], "hits": 0, "spelling_suggestion": None}

        # The page is converted before the searchers are closed below.
        results = self._process_results(raw_page, result_class=result_class)

        if searcher:
            searcher.close()

        if hasattr(narrow_searcher, "close"):
            narrow_searcher.close()

        return results

    def _process_results(
        self,
        raw_page,
        highlight=False,
        query_string="",
        spelling_query=None,
        result_class=None,
        facet_types=None,
    ):
        """
        Converts a page of raw Whoosh hits into the backend's result dict.

        Returns a dict with ``"results"`` (a list of ``result_class``
        instances, ``SearchResult`` by default), ``"hits"``, ``"facets"`` and
        ``"spelling_suggestion"``.

        ``"hits"`` starts at ``len(raw_page)`` and is decremented for every
        hit whose model is not among the indexed models. ``facet_types`` maps
        facet field names to the ``"fields"`` / ``"dates"`` bucket their
        counts go in. With ``highlight`` set, each result gets a
        ``highlighted`` entry for the content field.
        """
        from haystack import connections

        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        facets = {}

        if facet_types:
            facets = {
                "fields": {},
                "dates": {},
                "queries": {},
            }
            for facet_fieldname in raw_page.results.facet_names():
                group = raw_page.results.groups(facet_fieldname)
                facet_type = facet_types[facet_fieldname]

                # Extract None item for later processing, if present.
                none_item = group.pop(None, None)

                # Order by descending count, then ascending key. The None key
                # was removed above and is re-inserted by count below.
                lst = facets[facet_type][facet_fieldname] = sorted(
                    group.items(), key=(lambda itm: (-itm[1], itm[0])))

                if none_item is not None:
                    # Inject None item back into the results.
                    none_entry = (None, none_item)
                    if not lst or lst[-1][1] >= none_item:
                        lst.append(none_entry)
                    else:
                        # Insert before the first entry with a smaller count.
                        for i, value in enumerate(lst):
                            if value[1] < none_item:
                                lst.insert(i, none_entry)
                                break

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            # The stored content type has the form "<app_label>.<model_name>".
            app_label, model_name = raw_result[DJANGO_CT].split(".")
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], "convert"):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ",")
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                # These two are passed to ``result_class`` positionally below.
                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter("em")
                    # Highlight terms are the analyzed tokens of the query.
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name),
                        terms,
                        sa,
                        ContextFragmenter(),
                        formatter,
                    )
                    additional_fields["highlighted"] = {
                        self.content_field_name: [whoosh_result]
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                # Hit belongs to a model that is not indexed: drop it.
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            "results": results,
            "hits": hits,
            "facets": facets,
            "spelling_suggestion": spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """
        Builds a spelling suggestion for ``query_string``.

        Reserved words and characters are stripped from the query, then the
        top suggestion for each remaining word (from the content field's
        corrector) is collected.

        Returns ``None`` for an empty query, otherwise the suggestions joined
        with spaces (an empty string when no word has a suggestion).
        """
        if not query_string:
            # Nothing to correct, so don't open an index reader at all.
            return None

        cleaned_query = force_str(query_string)

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, "")

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, "")

        suggested_words = []
        reader = self.index.reader()

        try:
            corrector = reader.corrector(self.content_field_name)

            # Break it down.
            for word in cleaned_query.split():
                suggestions = corrector.suggest(word, limit=1)

                if len(suggestions) > 0:
                    suggested_words.append(suggestions[0])
        finally:
            # The reader is only needed while the corrector is in use.
            reader.close()

        return " ".join(suggested_words)

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, "strftime"):
            if not hasattr(value, "hour"):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = "true"
            else:
                value = "false"
        elif isinstance(value, (list, tuple)):
            value = ",".join([force_str(v) for v in value])
        elif isinstance(value, (int, float)):
            # Leave it alone.
            pass
        else:
            value = force_str(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == "true":
            return True
        elif value == "false":
            return False

        if value and isinstance(value, str):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(
                    date_values["year"],
                    date_values["month"],
                    date_values["day"],
                    date_values["hour"],
                    date_values["minute"],
                    date_values["second"],
                )

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(
                    converted_value,
                (list, tuple, set, dict, int, float, complex),
            ):
                return converted_value
        except Exception:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass

        return value
コード例 #35
0
ファイル: indexer.py プロジェクト: celi3t/HangryGoat
def index_dict(dictionary):
    """Pass one dictionary to ``add_to_index`` for the index in ``index``.

    The Whoosh index is opened from the ``index`` directory and the
    dictionary is handed over wrapped in a one-element list.
    """
    ix = FileStorage("index").open_index()
    add_to_index(ix, [dictionary])
コード例 #36
0
class WhooshSearchBackend(BaseSearchBackend):
    """
    Haystack search backend backed by a Whoosh index.

    The index is kept in a directory on disk by default. When the
    connection's ``STORAGE`` option is anything other than ``'file'``, a
    ``RamStorage`` cached on ``LOCALS.RAM_STORE`` is used instead.
    """
    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        """
        Read the connection options; the index itself is opened lazily by
        ``setup()``.

        Options read here are ``POST_LIMIT`` (default 128 MiB), ``PATH``
        and ``STORAGE`` (default ``'file'``).

        Raises ``ImproperlyConfigured`` when file storage is selected and
        no ``PATH`` is given.
        """
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # ``connection_options`` is a dict, so the option must be read with
        # .get(); getattr() on the dict always produced the default.
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

    def setup(self):
        """
        Defers loading until needed.

        Creates the index directory when it is missing, builds the schema
        from the unified index's search fields and opens the Whoosh index,
        creating it when the directory is new or the index is empty.

        Raises ``IOError`` when the index path is not writable.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            # Reuse the RAM store cached on LOCALS so the index survives
            # repeated setup() calls.
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # The directory exists but holds no index yet.
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Build the Whoosh schema for the given Haystack search fields.

        ``fields`` maps field names to field objects. Returns a tuple
        ``(content_field_name, schema)`` where ``content_field_name`` is
        the index name of the field flagged ``document=True`` (``''`` when
        there is none).

        Raises ``SearchBackendError`` when no field beyond the built-in
        identifier fields ends up in the schema.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_class in fields.values():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Add or replace the documents for the objects in ``iterable``.

        ``index`` is the search index whose ``full_prepare()`` turns each
        object into a document dict. ``commit`` is accepted but not
        consulted: the writer is always committed when ``iterable`` is
        non-empty.

        An error raised while writing a document is re-raised unless
        ``self.silently_fail`` is set, in which case it is logged and the
        loop continues.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Whoosh: %s", e)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
コード例 #37
0
    app.logger.setLevel(logging.INFO)
    app.logger.info('microblog startup')

# Optional Whoosh full-text search: open the index stored under WHOOSH_BASE,
# creating the directory and a fresh index when the directory is missing.
enable_search = WHOOSH_ENABLED
if enable_search:
    search_is_new = not os.path.exists(WHOOSH_BASE)
    if search_is_new:
        os.mkdir(WHOOSH_BASE)
    search_storage = FileStorage(WHOOSH_BASE)
    if search_is_new:
        # A directory that was just created cannot hold an index yet.
        schema = Schema(id=ID(stored=True), body=TEXT())
        search_ix = search_storage.create_index(schema)
    else:
        search_ix = search_storage.open_index()


class CustomJSONEncoder(JSONEncoder):
    """JSON encoder that can serialize speaklater lazy translation strings.

    Needed when translated (lazy) texts are flashed, since the stock
    encoder does not know how to serialize them.
    """

    def default(self, obj):
        """Return the text of a lazy string; defer everything else to the base class."""
        from speaklater import is_lazy_string

        if not is_lazy_string(obj):
            return super(CustomJSONEncoder, self).default(obj)

        # Pick the text type of the running interpreter.
        try:
            text_type = unicode  # python 2
        except NameError:
            text_type = str  # python 3
        return text_type(obj)

コード例 #38
0
class WhooshSearchBackend(BaseSearchBackend):
    """
    Haystack search backend backed by a Whoosh index.

    The index is kept in a directory on disk by default. When the
    connection's ``STORAGE`` option is anything other than ``'file'``, a
    ``RamStorage`` cached on ``LOCALS.RAM_STORE`` is used instead.
    """
    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        """
        Read the connection options; the index itself is opened lazily by
        ``setup()``.

        Options read here are ``POST_LIMIT`` (default 128 MiB), ``PATH``
        and ``STORAGE`` (default ``'file'``).

        Raises ``ImproperlyConfigured`` when file storage is selected and
        no ``PATH`` is given.
        """
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # ``connection_options`` is a dict, so the option must be read with
        # .get(); getattr() on the dict always produced the default.
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

    def setup(self):
        """
        Defers loading until needed.

        Creates the index directory when it is missing, builds the schema
        from the unified index's search fields and opens the Whoosh index,
        creating it when the directory is new or the index is empty.

        Raises ``IOError`` when the index path is not writable.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            # Reuse the RAM store cached on LOCALS so the index survives
            # repeated setup() calls.
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # The directory exists but holds no index yet.
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Build the Whoosh schema for the given Haystack search fields.

        ``fields`` maps field names to field objects. Returns a tuple
        ``(content_field_name, schema)`` where ``content_field_name`` is
        the index name of the field flagged ``document=True`` (``''`` when
        there is none).

        Raises ``SearchBackendError`` when no field beyond the built-in
        identifier fields ends up in the schema.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_class in fields.values():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Add or replace the documents for the objects in ``iterable``.

        ``index`` is the search index whose ``full_prepare()`` turns each
        object into a document dict. ``commit`` is accepted but not
        consulted: the writer is always committed when ``iterable`` is
        non-empty.

        An error raised while writing a document is re-raised unless
        ``self.silently_fail`` is set, in which case it is logged and the
        loop continues.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Whoosh: %s", e)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
コード例 #39
0
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.qparser import MultifieldParser
import whoosh.qparser as qparser
import chinese
import os, glob, codecs, sys

# Every text field is stored and tokenized with the same Chinese analyzer.
analyzer = chinese.ChineseAnalyzer()
schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer),
    sub_title=TEXT(stored=True, analyzer=analyzer),
    author=TEXT(stored=True, analyzer=analyzer),
    content=TEXT(stored=True, analyzer=analyzer),
)

# Open the existing index kept in the "indexdir" directory.
storage = FileStorage("indexdir")
ix = storage.open_index()
writer = ix.writer()

# Command line: <query string> <mode> [<distance>]
_string, _mode = sys.argv[1], sys.argv[2]
normal = _mode == "normal"

# The third argument is only read when the mode is not "normal".
_distance = 0 if normal else int(sys.argv[3])

with ix.searcher() as searcher:
    # Parse across all four fields, with phrase parsing switched off.
    parser = MultifieldParser(
        ["title", "sub_title", "author", "content"], schema=ix.schema)
    parser.remove_plugin_class(qparser.PhrasePlugin)
コード例 #40
0
ix = index.create_in("indexdir", schema)
打开一个已经存在于某个目录中的索引，使用 index.open_dir()
[python] view plain copy
import whoosh.index as index

ix = index.open_dir("indexdir")
这些是便利方法:
[python] view plain copy
from whoosh.filedb.filestore import FileStorage
storage = FileStorage("indexdir")

# Create an index
ix = storage.create_index(schema)

# Open an existing index
storage.open_index()
你和index对象一起创建的schema对象是可序列化的并且和index一起存储
你可以在同一个目录下面使用多个索引,用关键字参数分开
[python] view plain copy
# Using the convenience functions
ix = index.create_in("indexdir", schema=schema, indexname="usages")
ix = index.open_dir("indexdir", indexname="usages")

# Using the Storage object
ix = storage.create_index(schema, indexname="usages")
ix = storage.open_index(indexname="usages")

Clearing the index

在一个目录上调用 index.create_in() 函数可以清除该目录中已经存在的索引的内容
可以用函数 index.exists_in() 来检测指定目录中是否有一个有效的索引
コード例 #41
0
def src(indexdir,
        quer,
        dsc=True,
        flds="2",
        lim=100,
        w="bm",
        opt=None,
        lo="o",
        resdir="",
        run=""):
    """
    Run a batch of queries against a Whoosh index and report the results.

    The queries are read from the file ``quer`` and searched in the index
    stored in ``indexdir``, using objects and functions of the whoosh
    module.

    Parameters
    ----------
    indexdir : string
        Path of the directory containing the index to search.

    quer : string
        Path of the file containing the queries to run.

    dsc : bool
        Selects which field of each query is used: the description when
        true (the default), the title otherwise.

    flds : string
        Number of document fields to search; must be "1", "2" or "3".

    lim : int
        Maximum number of documents retrieved per query; must be positive.

    w : string
        Weighting scheme; must be "bm" (BM25) or "tf" (TF_IDF). See
        whoosh.scoring for details.

    lo : string
        Logical operator used to group the query words: "o" for OR or
        "a" for AND.

    opt : list
        Optional pair of numeric values passed to BM25F as its two
        parameters. Ignored when empty or ``None``.

    resdir : string
        Path of the directory where the results file is written. When it
        is empty (the default) or does not exist, the results are printed
        instead.

    run : string
        Optional string that becomes part of the run tag and of the
        results file name.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``lo``, ``w`` or ``flds`` has an unsupported value.

    Notes
    -----
    This function was written to search an index of the ohsumed
    collection and is not guaranteed to work with others. In particular
    the query file must be laid out like the experimental query file of
    that collection, and the indexed documents should have the fields
    'identifier', 'title', 'abstract' and 'terms'.
    """
    # ------------------------------------------------------------------ #
    # Logical operator used to group the words of each query.
    if lo == "o":
        lgroup = qparser.OrGroup
    elif lo == "a":
        lgroup = qparser.AndGroup
    else:
        raise ValueError('lo must be "o" or "a", got %r' % (lo,))

    # ------------------------------------------------------------------ #
    # Weighting scheme.
    if w == "tf":
        score = scoring.TF_IDF()
    elif w == "bm":
        # opt, when given, holds the two tuned BM25 parameters.
        score = scoring.BM25F(opt[0], opt[1]) if opt else scoring.BM25F()
    else:
        raise ValueError('w must be "bm" or "tf", got %r' % (w,))

    # ------------------------------------------------------------------ #
    # Document fields to search and the parser class that goes with them.
    if flds == "1":
        campi = "title"
        parser = qp
    elif flds == "2":
        campi = ["title", "abstract"]
        parser = mp
    elif flds == "3":
        campi = ["title", "abstract", "terms"]
        parser = mp
    else:
        raise ValueError('flds must be "1", "2" or "3", got %r' % (flds,))

    ix = FileStorage(indexdir).open_index()

    # Run tag built from every parameter that can be changed.
    tag = (run + "_BATCH_DESC" + str(dsc)[0] + "_" + flds + "C_GRP" +
           lo.upper() + "_" + w.upper() + "_" + str(lim) + "RES")

    # ------------------------------------------------------------------ #
    # Read and parse the query file; it is closed even if reading fails.
    with open(quer, 'r') as infile:
        dom = parseString(infile.read())

    title = gettagdata(dom, 'title')  # use the title field of the queries
    if dsc:
        title = gettagdata(dom, 'desc')  # use the desc field of the queries
    num = gettagdata(dom, 'num')

    # ------------------------------------------------------------------ #
    # Open the results file when a usable results directory was given.
    resfile = None
    if resdir and os.path.exists(resdir):
        # If more parameters are varied, prefer `tag` over this name to
        # avoid overwriting earlier results.
        resname = run + "_" + flds + "C" + ".treceval"
        resfile = open(os.path.join(resdir, resname), 'w')
        print("File dei risultati " + resname)
    elif resdir:
        print("%s does not exist" % resdir)

    try:
        query_parser = parser(campi, ix.schema, group=lgroup)

        # One searcher serves every query and is closed on exit.
        with ix.searcher(weighting=score) as searcher:
            for qid in num:
                query = query_parser.parse(title[int(qid) - 1])
                # Re-parse the corrected query text so that words with a
                # wrong letter are fixed before searching.
                new_query = query_parser.parse(expq_cor(ix, query))
                results = searcher.search(new_query, limit=lim)

                if not results:
                    print("non ha trovato risultati")
                elif resfile is None:
                    # Print the results to the console.
                    res(results, qid, lim, tag)
                else:
                    # Write the results to the file.
                    print("sta stampando i risultati della query " + qid +
                          " su file")
                    res(results, qid, lim, tag, resfile)
    finally:
        # resfile only exists when a results directory was usable.
        if resfile is not None:
            resfile.close()

    return None
コード例 #42
0
class WhooshSearchBackend(BaseSearchBackend):
    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\',
        '+',
        '-',
        '&&',
        '||',
        '!',
        '(',
        ')',
        '{',
        '}',
        '[',
        ']',
        '^',
        '"',
        '~',
        '*',
        '?',
        ':',
        '.',
    )

    def __init__(self, connection_alias, **connection_options):
        """
        Read the connection options; the index itself is opened lazily by
        ``setup()``.

        Options read here are ``POST_LIMIT`` (default 128 MiB), ``PATH``
        and ``STORAGE`` (default ``'file'``).

        Raises ``ImproperlyConfigured`` when file storage is selected and
        no ``PATH`` is given.
        """
        super(WhooshSearchBackend, self).__init__(connection_alias,
                                                  **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # ``connection_options`` is a dict, so the option must be read with
        # .get(); getattr() on the dict always produced the default.
        self.post_limit = connection_options.get('POST_LIMIT',
                                                 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'."
                % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Open the Whoosh index on first use, creating it when necessary.

        For file storage the index directory is created if it is missing
        and must be writable, otherwise ``IOError`` is raised. The schema
        and query parser are rebuilt from the unified index's search
        fields on every call.
        """
        from haystack import connections

        on_disk = self.use_file_storage
        created_dir = False

        # Make sure the index directory is there.
        if on_disk and not os.path.exists(self.path):
            os.makedirs(self.path)
            created_dir = True

        if on_disk and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path)

        if on_disk:
            self.storage = FileStorage(self.path)
        else:
            # Reuse the RAM store cached on LOCALS, creating it once.
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        unified_index = connections[self.connection_alias].get_unified_index()
        self.content_field_name, self.schema = self.build_schema(
            unified_index.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if created_dir:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # The storage exists but holds no index yet.
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Translate Haystack search fields into a Whoosh schema.

        ``fields`` maps field names to field objects. Returns the tuple
        ``(content_field_name, schema)``; ``content_field_name`` is the
        index name of the field flagged ``document=True`` (``''`` when
        there is none), and that field also gets ``spelling`` enabled.

        Raises ``SearchBackendError`` when nothing beyond the built-in
        identifier fields ends up in the schema.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Remember how many keys Haystack hard-codes so an otherwise empty
        # schema can be reported with a clear error below.
        builtin_count = len(schema_fields)
        content_field_name = ''

        for field in fields.values():
            name = field.index_fieldname

            if field.is_multivalued:
                if field.indexed is False:
                    whoosh_field = IDLIST(stored=True, field_boost=field.boost)
                else:
                    whoosh_field = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field.boost)
            elif field.field_type in ('date', 'datetime'):
                whoosh_field = DATETIME(stored=field.stored, sortable=True)
            elif field.field_type == 'integer':
                whoosh_field = NUMERIC(
                    stored=field.stored, numtype=int, field_boost=field.boost)
            elif field.field_type == 'float':
                whoosh_field = NUMERIC(
                    stored=field.stored, numtype=float, field_boost=field.boost)
            elif field.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                whoosh_field = BOOLEAN(stored=field.stored)
            elif field.field_type == 'ngram':
                whoosh_field = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field.stored,
                    field_boost=field.boost)
            elif field.field_type == 'edge_ngram':
                whoosh_field = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field.stored,
                    field_boost=field.boost)
            else:
                whoosh_field = TEXT(
                    stored=True,
                    analyzer=StemmingAnalyzer(),
                    field_boost=field.boost,
                    sortable=True)

            schema_fields[name] = whoosh_field

            if field.document is True:
                content_field_name = name
                whoosh_field.spelling = True

        # Fail with a clear message instead of letting the backend die
        # later when no fields were found.
        if len(schema_fields) <= builtin_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Add or replace the documents for the objects in ``iterable``.

        ``index`` is the search index whose ``full_prepare()`` turns each
        object into a document dict. ``commit`` is accepted but not
        consulted: the writer is always committed when ``iterable`` is
        non-empty.

        A failure while writing a document is re-raised unless
        ``self.silently_fail`` is set; in that case it is logged and a
        fresh writer is used for the remaining objects.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Whoosh insists on unicode values, so convert every value of
            # the prepared document in place.
            for key, raw in list(doc.items()):
                doc[key] = self._from_python(raw)

            # Document boosts aren't supported in Whoosh 2.5.0+.
            doc.pop('boost', None)

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # Log only the object's identifier, never the object itself,
                # so building the log message cannot trigger encoding
                # errors of its own.
                log_extra = {
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                }
                self.log.error(u"%s while preparing object for update" %
                               e.__class__.__name__,
                               exc_info=True,
                               extra=log_extra)

                # Start over with a new writer so the failed attempt does
                # not cause a 'start_doc' error on the next document.
                writer = AsyncWriter(self.index)

        if len(iterable):
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

    def remove(self, obj_or_string, commit=True):
        """
        Delete the document identified by ``obj_or_string`` from the index.

        The identifier comes from ``get_identifier`` and the document is
        removed with a delete-by-query on the ``ID`` field. ``commit`` is
        accepted but not used. Errors are re-raised unless
        ``self.silently_fail`` is set, in which case they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            id_query = self.parser.parse(u'%s:"%s"' % (ID, whoosh_id))
            self.index.delete_by_query(q=id_query)
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Whoosh: %s",
                           whoosh_id, e)

    def clear(self, models=None, commit=True):
        """
        Remove documents from the index.

        With no ``models`` (``None`` or an empty sequence) the whole index
        is wiped via ``delete_index()``. Otherwise only the documents whose
        ``DJANGO_CT`` field matches one of the given model classes are
        deleted. ``commit`` is accepted but not used.

        Errors are re-raised unless ``self.silently_fail`` is set, in which
        case they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        try:
            if not models:
                self.delete_index()
            else:
                # One "<ct field>:<app_label>.<module_name>" term per model,
                # OR-ed together into a single delete query.
                models_to_delete = [
                    u"%s:%s.%s" % (DJANGO_CT, model._meta.app_label,
                                   model._meta.module_name)
                    for model in models
                ]

                self.index.delete_by_query(
                    q=self.parser.parse(u" OR ".join(models_to_delete)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to clear documents from Whoosh: %s", e)

    def delete_index(self):
        """
        Wipe the whole index and set it up again from scratch.

        Per the Whoosh mailing list, deleting the index files is much more
        efficient than deleting every document, so file storage removes
        the index directory (when it exists) and RAM storage is cleaned.
        """
        if not self.use_file_storage:
            self.storage.clean()
        elif os.path.exists(self.path):
            shutil.rmtree(self.path)

        # Recreate everything.
        self.setup()

    def optimize(self):
        """Refresh the index and ask Whoosh to optimize it, setting up first if needed."""
        if not self.setup_complete:
            self.setup()

        refreshed = self.index.refresh()
        self.index = refreshed
        refreshed.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        """
        Convert a start/end offset pair into a page number and page length.

        A missing ``end_offset`` is treated as 1000000 and a non-positive
        one as 1, because Whoosh needs an end offset greater than 0; a
        missing ``start_offset`` is treated as 0.

        Returns ``(page_num, page_length)`` where ``page_num`` is 1-based,
        as Whoosh expects, and ``page_length`` is
        ``end_offset - start_offset``.
        """
        if end_offset is None:
            end_offset = 1000000
        elif end_offset <= 0:
            # Guard against Whoosh raising on a non-positive end offset.
            end_offset = 1

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length > 0:
            # +1 because Whoosh uses 1-based page numbers.
            page_num = int(start_offset / page_length) + 1
        else:
            page_num = 1

        return page_num, page_length

    @log_query
    def search(self,
               query_string,
               sort_by=None,
               start_offset=0,
               end_offset=None,
               fields='',
               highlight=False,
               facets=None,
               date_facets=None,
               query_facets=None,
               narrow_queries=None,
               spelling_query=None,
               within=None,
               dwithin=None,
               distance_point=None,
               models=None,
               limit_to_registered_models=None,
               result_class=None,
               **kwargs):
        """
        Run ``query_string`` against the Whoosh index.

        ``sort_by`` is a sequence of field names; a leading ``-`` requests
        descending order, and all entries must share one direction. Only the
        first field is passed on to Whoosh.

        ``narrow_queries`` is a set of extra query strings whose results are
        intersected and used as a filter; the caller's set is not modified.
        ``models`` / ``limit_to_registered_models`` add a content-type
        restriction to those narrowing queries.

        ``facets``, ``date_facets`` and ``query_facets`` only trigger a
        warning. ``fields``, ``within``, ``dwithin``, ``distance_point`` and
        extra keyword arguments are accepted but not read by this method.

        Returns a dict with at least ``results`` and ``hits``. Every searcher
        opened here is closed before returning, including on the early-exit
        and error paths.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            reverse_counter = sum(
                1 for order_by in sort_by if order_by.startswith('-'))

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            sort_by_list = [
                order_by[1:] if order_by.startswith('-') else order_by
                for order_by in sort_by
            ]

            # Past the check above either every field is reversed or none is.
            reverse = reverse_counter > 0
            # An empty sort_by means "no sorting" rather than an IndexError.
            sort_by = sort_by_list[0] if sort_by_list else None

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.",
                          Warning,
                          stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.",
                          Warning,
                          stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.",
                          Warning,
                          stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted([
                '%s.%s' % (model._meta.app_label, model._meta.module_name)
                for model in models
            ])
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            # Work on a copy so the set passed in by the caller is untouched.
            if narrow_queries is None:
                narrow_queries = set()
            else:
                narrow_queries = set(narrow_queries)

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None
        searcher = None

        try:
            if narrow_queries is not None:
                # Potentially expensive? I don't see another way to do it in
                # Whoosh...
                narrow_searcher = self.index.searcher()

                for nq in narrow_queries:
                    recent_narrowed_results = narrow_searcher.search(
                        self.parser.parse(force_text(nq)), limit=None)

                    # One empty narrowing query empties the intersection.
                    if len(recent_narrowed_results) <= 0:
                        return {
                            'results': [],
                            'hits': 0,
                        }

                    if narrowed_results:
                        narrowed_results.filter(recent_narrowed_results)
                    else:
                        narrowed_results = recent_narrowed_results

            self.index = self.index.refresh()

            if not self.index.doc_count():
                # Empty index: nothing to search, but a spelling suggestion
                # may still be requested.
                if self.include_spelling:
                    if spelling_query:
                        spelling_suggestion = self.create_spelling_suggestion(
                            spelling_query)
                    else:
                        spelling_suggestion = self.create_spelling_suggestion(
                            query_string)
                else:
                    spelling_suggestion = None

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': spelling_suggestion,
                }

            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(
                start_offset, end_offset)

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num,
                                                **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Results must be materialised while the searchers are still open.
            return self._process_results(raw_page,
                                         highlight=highlight,
                                         query_string=query_string,
                                         spelling_query=spelling_query,
                                         result_class=result_class)
        finally:
            if searcher is not None:
                searcher.close()

            if narrow_searcher is not None:
                narrow_searcher.close()

    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        """
        Return documents similar to the indexed copy of ``model_instance``.

        The instance is looked up in the index by its identifier, and Whoosh's
        ``more_like_this`` is run on the content field of the first hit.
        ``models`` / ``limit_to_registered_models`` and
        ``additional_query_string`` (unless it is ``'*'``) become narrowing
        queries whose intersection filters the similar documents.

        Returns the dict built by ``_process_results``, or an empty result
        dict when a narrowing query matches nothing or the requested page is
        out of range. Extra keyword arguments are accepted but not read.

        An empty index yields an empty result. Every searcher opened here is
        closed before returning.
        """
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted([
                '%s.%s' % (model._meta.app_label, model._meta.module_name)
                for model in models
            ])
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None
        searcher = None

        try:
            if narrow_queries:
                # Potentially expensive? I don't see another way to do it in
                # Whoosh...
                narrow_searcher = self.index.searcher()

                for nq in narrow_queries:
                    recent_narrowed_results = narrow_searcher.search(
                        self.parser.parse(force_text(nq)), limit=None)

                    # One empty narrowing query empties the intersection.
                    if len(recent_narrowed_results) <= 0:
                        return {
                            'results': [],
                            'hits': 0,
                        }

                    if narrowed_results:
                        narrowed_results.filter(recent_narrowed_results)
                    else:
                        narrowed_results = recent_narrowed_results

            page_num, page_length = self.calculate_page(start_offset,
                                                        end_offset)

            self.index = self.index.refresh()
            raw_results = EmptyResults()

            if self.index.doc_count():
                query = "%s:%s" % (ID, get_identifier(model_instance))
                searcher = self.index.searcher()
                parsed_query = self.parser.parse(query)
                results = searcher.search(parsed_query)

                if len(results):
                    raw_results = results[0].more_like_this(field_name,
                                                            top=end_offset)

                # Handle the case where the results have been narrowed.
                if narrowed_results is not None and hasattr(
                        raw_results, 'filter'):
                    raw_results.filter(narrowed_results)

            try:
                raw_page = ResultsPage(raw_results, page_num, page_length)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Results must be materialised while the searchers are still open.
            return self._process_results(raw_page, result_class=result_class)
        finally:
            # `searcher` only exists when the index held documents.
            if searcher is not None:
                searcher.close()

            if narrow_searcher is not None:
                narrow_searcher.close()

    def _process_results(self,
                         raw_page,
                         highlight=False,
                         query_string='',
                         spelling_query=None,
                         result_class=None):
        """
        Turn a Whoosh results page into the result dict returned to callers.

        Each hit whose model is among the indexed models becomes a
        ``result_class`` instance (``SearchResult`` by default). Hits for any
        other model are dropped and subtracted from ``hits``.

        Returns a dict with ``results``, ``hits``, ``facets`` (always empty
        here) and ``spelling_suggestion`` (``None`` unless
        ``self.include_spelling`` is set).
        """
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            model = get_model(app_label, model_name)

            if not (model and model in indexed_models):
                hits -= 1
                continue

            additional_fields = {}
            # The index is the same for every field of this hit, so it is
            # looked up once per document.
            index = unified_index.get_index(model)

            for key, value in raw_result.items():
                string_key = str(key)

                if string_key in index.fields and hasattr(
                        index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if index.fields[string_key].is_multivalued:
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del (additional_fields[DJANGO_CT])
            del (additional_fields[DJANGO_ID])

            if highlight:
                from whoosh import analysis
                # Imported under an alias so the `highlight` flag is not
                # rebound to the function.
                from whoosh.highlight import (ContextFragmenter,
                                              UppercaseFormatter)
                from whoosh.highlight import highlight as whoosh_highlight
                sa = analysis.StemmingAnalyzer()
                terms = [
                    term.replace('*', '') for term in query_string.split()
                ]

                additional_fields['highlighted'] = {
                    self.content_field_name: [
                        whoosh_highlight(
                            additional_fields.get(self.content_field_name),
                            terms, sa, ContextFragmenter(terms),
                            UppercaseFormatter())
                    ],
                }

            result = result_class(app_label, model_name,
                                  raw_result[DJANGO_ID], score,
                                  **additional_fields)
            results.append(result)

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """
        Build a corrected version of ``query_string`` from the content field.

        Reserved words and characters are stripped, then each remaining word
        is replaced by the corrector's top suggestion; words without a
        suggestion are dropped. Returns ``None`` for an empty query, otherwise
        the suggested words joined by spaces (possibly an empty string).
        """
        # Nothing to correct: bail out before opening an index reader.
        if not query_string:
            return None

        cleaned_query = force_text(query_string)

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        reader = self.index.reader()

        try:
            corrector = reader.corrector(self.content_field_name)

            for word in query_words:
                suggestions = corrector.suggest(word, limit=1)

                if len(suggestions) > 0:
                    suggested_words.append(suggestions[0])
        finally:
            # Release the reader even if the corrector raises.
            reader.close()

        return ' '.join(suggested_words)

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_text(v) for v in value])
        elif isinstance(value, (six.integer_types, float)):
            # Leave it alone.
            pass
        else:
            value = force_text(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'],
                                date_values['day'], date_values['hour'],
                                date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(
                    converted_value,
                (list, tuple, set, dict, six.integer_types, float, complex)):
                return converted_value
        except:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass

        return value
コード例 #43
0
    #
    # full text search
    #

    try:
        if not os.path.exists(FULLTEXT_INDEX_PATH):
            os.makedirs(FULLTEXT_INDEX_PATH, exist_ok=True)

        idx_storage = FileStorage(FULLTEXT_INDEX_PATH)

        if not index.exists_in(FULLTEXT_INDEX_PATH):
            print('creating new whoosh index')
            schema = getFTschema()
            idx = index.create_in(FULLTEXT_INDEX_PATH, schema)
        else:
            idx = idx_storage.open_index()

        idx_writer = idx.writer()

        for post in posts:
            keywords = " ".join(post.get_categories()) + ' ' + " ".join(
                post.get_keywords()) + ' ' + " ".join(post.get_tags())
            # print(keywords)
            idx_writer.update_document(
                path=post.get_url(),
                content=post.get_raw_md().lower(),
                title=post.get_title().lower(),
                keywords=keywords,
            )

        # TODO: pàgines a indexar? per exemple CKA
コード例 #44
0
class SearchBackend(BaseSearchBackend):
    """Search backend that keeps its index in Whoosh file storage."""

    # Word reserved by Whoosh for special use.
    # These are stripped from a query before spelling suggestions are looked up.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        "\\",
        "+",
        "-",
        "&&",
        "||",
        "!",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        "^",
        '"',
        "~",
        "*",
        "?",
        ":",
        ".",
    )

    def __init__(self, site=None):
        """
        Initialise the backend and check that a Whoosh path is configured.

        Raises ``ImproperlyConfigured`` when the settings object has no
        ``HAYSTACK_WHOOSH_PATH`` attribute.
        """
        super(SearchBackend, self).__init__(site)
        # The index is only opened later, by setup().
        self.setup_complete = False

        if hasattr(settings, "HAYSTACK_WHOOSH_PATH"):
            return

        raise ImproperlyConfigured("You must specify a HAYSTACK_WHOOSH_PATH in your settings.")

    def setup(self):
        """
        Open (or create) the index on first use.

        Creates the index directory when it is missing, checks that it is
        writable, builds the schema and query parser from the site's search
        fields, and finally opens or creates the index itself.
        """
        index_path = settings.HAYSTACK_WHOOSH_PATH

        # Make sure the index directory is there.
        new_index = not os.path.exists(index_path)

        if new_index:
            os.makedirs(index_path)

        if not os.access(index_path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % index_path
            )

        self.storage = FileStorage(index_path)
        self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index:
            self.index = index.create_in(index_path, self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # The directory exists but holds no index yet.
                self.index = index.create_in(index_path, self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Map search fields onto a Whoosh schema.

        Returns a ``(content_field_name, Schema)`` tuple, where
        ``content_field_name`` is the name of the field flagged as the
        document field (empty string when none is flagged). Raises
        ``SearchBackendError`` when ``fields`` contributes nothing beyond the
        built-in keys.
        """
        schema_fields = {
            "id": ID(stored=True, unique=True),
            "django_ct": ID(stored=True),
            "django_id": ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ""

        for field_name, field_class in fields.items():
            if isinstance(field_class, MultiValueField):
                whoosh_field = KEYWORD(stored=True, commas=True)
            elif isinstance(field_class, (DateField, DateTimeField, IntegerField, FloatField, BooleanField)):
                if field_class.indexed is False:
                    whoosh_field = STORED
                else:
                    whoosh_field = ID(stored=True)
            else:
                whoosh_field = TEXT(stored=True, analyzer=StemmingAnalyzer())

            schema_fields[field_name] = whoosh_field

            if field_class.document is True:
                content_field_name = field_name

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Add or refresh the documents for every object in ``iterable``.

        ``index`` supplies ``prepare(obj)``, which returns the document dict
        for one object. ``iterable`` may be any iterable, including a
        generator. ``commit`` is accepted but not read: the writer is always
        committed when at least one document was written.

        The writer is cancelled when nothing was written or when preparing or
        writing a document fails, so it is never left open.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = self.index.writer()
        written = 0

        try:
            for obj in iterable:
                doc = index.prepare(obj)

                # Really make sure it's unicode, because Whoosh won't have it any
                # other way.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                writer.update_document(**doc)
                written += 1
        except Exception:
            # Discard the partial batch and release the writer before
            # propagating the error.
            writer.cancel()
            raise

        if not written:
            # Nothing to commit; release the writer instead of leaving it open.
            writer.cancel()
            return

        # For now, commit no matter what, as we run into locking issues otherwise.
        writer.commit()

        # If spelling support is desired, add to the dictionary.
        if getattr(settings, "HAYSTACK_INCLUDE_SPELLING", False) is True:
            sp = SpellChecker(self.storage)
            sp.add_field(self.index, self.content_field_name)

    def remove(self, obj_or_string, commit=True):
        """
        Delete the document identified by ``obj_or_string`` from the index.

        ``commit`` is accepted but not read; the index is always committed.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        id_query = u'id:"%s"' % get_identifier(obj_or_string)
        parsed_query = self.parser.parse(id_query)
        self.index.delete_by_query(q=parsed_query)

        # For now, commit no matter what, as we run into locking issues otherwise.
        self.index.commit()

    def clear(self, models=None, commit=True):
        """
        Remove documents from the index.

        When ``models`` is empty or omitted the whole index is wiped and
        rebuilt through ``delete_index()``. Otherwise only the documents whose
        ``django_ct`` matches one of the given model classes are deleted.

        ``commit`` is accepted but not read; the index is always committed.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if not models:
            self.delete_index()
        else:
            # One "django_ct:<app_label>.<module_name>" term per model, OR-ed
            # together into a single delete query.
            models_to_delete = [
                u"django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name)
                for model in models
            ]

            self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))

        # For now, commit no matter what, as we run into locking issues otherwise.
        self.index.commit()

    def delete_index(self):
        """
        Throw away the whole index and build a fresh, empty one.

        Per the Whoosh mailing list, wiping everything is much cheaper when
        the index files are simply deleted.
        """
        index_path = settings.HAYSTACK_WHOOSH_PATH

        if os.path.exists(index_path):
            shutil.rmtree(index_path)

        # Recreate everything.
        self.setup()

    def optimize(self):
        """Refresh the index handle and ask Whoosh to optimize it."""
        if not self.setup_complete:
            self.setup()

        refreshed_index = self.index.refresh()
        self.index = refreshed_index
        refreshed_index.optimize()

    @log_query
    def search(
        self,
        query_string,
        sort_by=None,
        start_offset=0,
        end_offset=None,
        fields="",
        highlight=False,
        facets=None,
        date_facets=None,
        query_facets=None,
        narrow_queries=None,
        spelling_query=None,
        limit_to_registered_models=True,
        **kwargs
    ):
        """
        Run ``query_string`` against the Whoosh index.

        ``sort_by`` is a sequence of field names; a leading ``-`` on the first
        one requests descending order, and only that first field is passed on
        to Whoosh.

        ``narrow_queries`` is a set of extra query strings whose results are
        intersected and used as a filter; the caller's set is not modified.
        If any narrowing query matches nothing, the result is empty.

        ``facets``, ``date_facets`` and ``query_facets`` only trigger a
        warning. ``fields`` and extra keyword arguments are accepted but not
        read by this method.

        Returns a dict with at least ``results`` and ``hits``. Every searcher
        opened here is closed before returning.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {"results": [], "hits": 0}

        query_string = force_unicode(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u"*":
            return {"results": [], "hits": 0}

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith("-"):
                    reverse_counter += 1

            # NOTE(review): the message suggests that *any* reversed field in
            # a multi-field sort should be rejected, yet a single reversed
            # field passes this check. Left unchanged; confirm the intent.
            if len(sort_by) > 1 and reverse_counter > 1:
                raise SearchBackendError(
                    "Whoosh does not handle more than one field and any field being ordered in reverse."
                )

            for order_by in sort_by:
                if order_by.startswith("-"):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            # An empty sort_by means "no sorting" rather than an IndexError.
            sort_by = sort_by_list[0] if sort_by_list else None

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models:
            # Using narrow queries, limit the results to only models registered
            # with the current site. Work on a copy so the set passed in by
            # the caller is untouched.
            if narrow_queries is None:
                narrow_queries = set()
            else:
                narrow_queries = set(narrow_queries)

            registered_models = self.build_registered_models_list()

            if len(registered_models) > 0:
                narrow_queries.add("django_ct:(%s)" % " OR ".join(registered_models))

        narrow_searcher = None
        searcher = None

        try:
            if narrow_queries is not None:
                # Potentially expensive? I don't see another way to do it in Whoosh...
                narrow_searcher = self.index.searcher()

                for nq in narrow_queries:
                    recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_unicode(nq)))

                    # One empty narrowing query empties the intersection.
                    # Treating it as "no narrowing" would return unfiltered
                    # results.
                    if len(recent_narrowed_results) <= 0:
                        return {"results": [], "hits": 0}

                    if narrowed_results is not None:
                        narrowed_results.filter(recent_narrowed_results)
                    else:
                        narrowed_results = recent_narrowed_results

            self.index = self.index.refresh()

            if self.index.doc_count():
                searcher = self.index.searcher()
                parsed_query = self.parser.parse(query_string)

                # In the event of an invalid/stopworded query, recover gracefully.
                if parsed_query is None:
                    return {"results": [], "hits": 0}

                raw_results = searcher.search(parsed_query, sortedby=sort_by, reverse=reverse)

                # Handle the case where the results have been narrowed.
                if narrowed_results is not None:
                    raw_results.filter(narrowed_results)

                # Results must be materialised while the searchers are still open.
                return self._process_results(
                    raw_results,
                    start_offset,
                    end_offset,
                    highlight=highlight,
                    query_string=query_string,
                    spelling_query=spelling_query,
                )

            if getattr(settings, "HAYSTACK_INCLUDE_SPELLING", False):
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {"results": [], "hits": 0, "spelling_suggestion": spelling_suggestion}
        finally:
            # hasattr guards against searcher objects without close() and
            # against the None placeholders.
            for open_searcher in (searcher, narrow_searcher):
                if hasattr(open_searcher, "close"):
                    open_searcher.close()

    def more_like_this(self, model_instance, additional_query_string=None):
        """Warn that More Like This is unsupported and return an empty result."""
        empty_response = {"results": [], "hits": 0}
        warnings.warn("Whoosh does not handle More Like This.", Warning, stacklevel=2)
        return empty_response

    def _process_results(
        self, raw_results, start_offset, end_offset, highlight=False, query_string="", spelling_query=None
    ):
        """
        Turn raw Whoosh results into the result dict returned to callers.

        ``raw_results`` is sliced to ``[start_offset:end_offset]``. Each hit
        whose model is among the site's indexed models becomes a
        ``SearchResult``; hits for any other model are dropped and subtracted
        from ``hits``.

        Returns a dict with ``results``, ``hits``, ``facets`` (always empty
        here) and ``spelling_suggestion`` (``None`` unless the
        ``HAYSTACK_INCLUDE_SPELLING`` setting is truthy).
        """
        from haystack import site

        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_results)
        raw_results = raw_results[start_offset:end_offset]

        facets = {}
        spelling_suggestion = None
        indexed_models = site.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_results):
            raw_result = dict(raw_result)
            app_label, model_name = raw_result["django_ct"].split(".")
            model = get_model(app_label, model_name)

            if not (model and model in indexed_models):
                hits -= 1
                continue

            additional_fields = {}
            # The index is the same for every field of this hit, so it is
            # looked up once per document.
            index = site.get_index(model)

            for key, value in raw_result.items():
                string_key = str(key)

                if string_key in index.fields and hasattr(index.fields[string_key], "convert"):
                    additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del (additional_fields["django_ct"])
            del (additional_fields["django_id"])

            if highlight:
                from whoosh import analysis
                # Imported under an alias so the `highlight` flag is not
                # rebound to the function.
                from whoosh.highlight import ContextFragmenter, UppercaseFormatter
                from whoosh.highlight import highlight as whoosh_highlight

                sa = analysis.StemmingAnalyzer()
                terms = [term.replace("*", "") for term in query_string.split()]

                # DRL_FIXME: Highlighting doesn't seem to work properly in testing.
                additional_fields["highlighted"] = {
                    self.content_field_name: [
                        whoosh_highlight(
                            additional_fields.get(self.content_field_name),
                            terms,
                            sa,
                            ContextFragmenter(terms),
                            UppercaseFormatter(),
                        )
                    ]
                }

            # Requires Whoosh 0.1.20+.
            if hasattr(raw_results, "score"):
                score = raw_results.score(doc_offset)
            else:
                score = None

            if score is None:
                score = 0

            result = SearchResult(app_label, model_name, raw_result["django_id"], score, **additional_fields)
            results.append(result)

        if getattr(settings, "HAYSTACK_INCLUDE_SPELLING", False):
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {"results": results, "hits": hits, "facets": facets, "spelling_suggestion": spelling_suggestion}

    def create_spelling_suggestion(self, query_string):
        """
        Builds a spelling suggestion for ``query_string``.

        Reserved words and reserved characters are removed from the query,
        the remainder is split on whitespace and the first suggestion offered
        by the spell checker for each word (if any) is kept. The kept words
        are returned joined by single spaces; ``None`` is returned for an
        empty query.
        """
        checker = SpellChecker(self.storage)
        stripped = force_unicode(query_string)

        if not query_string:
            return None

        # Clean the string: words first, then single characters.
        for reserved_word in self.RESERVED_WORDS:
            stripped = stripped.replace(reserved_word, "")

        for reserved_char in self.RESERVED_CHARACTERS:
            stripped = stripped.replace(reserved_char, "")

        # Break it down and keep the best candidate for every word.
        picks = []

        for word in stripped.split():
            candidates = checker.suggest(word, number=1)

            if len(candidates) > 0:
                picks.append(candidates[0])

        return " ".join(picks)

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.
        
        Code courtesy of pysolr.
        """
        if hasattr(value, "strftime"):
            if hasattr(value, "hour"):
                value = force_unicode(value.strftime("%Y-%m-%dT%H:%M:%S"))
            else:
                value = force_unicode(value.strftime("%Y-%m-%dT00:00:00"))
        elif isinstance(value, bool):
            if value:
                value = u"true"
            else:
                value = u"false"
        else:
            value = force_unicode(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.
        
        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == "true":
            return True
        elif value == "false":
            return False

        if value:
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(
                    date_values["year"],
                    date_values["month"],
                    date_values["day"],
                    date_values["hour"],
                    date_values["minute"],
                    date_values["second"],
                )

        try:
            # This is slightly gross but it's hard to tell otherwise what the
            # string's original type might have been. Be careful who you trust.
            converted_value = eval(value)

            # Try to handle most built-in types.
            if isinstance(converted_value, (list, tuple, set, dict, int, float, long, complex)):
                return converted_value
        except:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass

        return value
コード例 #45
0
ファイル: indexer.py プロジェクト: celi3t/HangryGoat
def get_index():
    """Open and return the default Whoosh index stored under ``index``."""
    return FileStorage("index").open_index()
コード例 #46
0
def whoosh_open_idx(idx_path, schema, indexname="content"):
    """Open the index named ``indexname`` under ``idx_path`` with ``schema``."""
    return FileStorage(idx_path).open_index(schema=schema, indexname=indexname)
コード例 #47
0
def openIndex():
    """Open and return the default Whoosh index stored under ``indexdir``."""
    return FileStorage("indexdir").open_index()
コード例 #48
0
class WhooshSearchBackend(BaseSearchBackend):
    """
    Haystack search backend backed by a Whoosh index.

    The index is kept on disk under the ``PATH`` connection option, or in the
    shared ``LOCALS.RAM_STORE`` when the ``STORAGE`` option is anything other
    than ``'file'``. Plain text fields are analysed with ``ChineseAnalyzer``.
    Loading of the index is deferred until the first operation that needs it.
    """

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        """
        Reads the connection options; the index itself is opened in ``setup``.

        Raises ``ImproperlyConfigured`` when file storage is used without a
        ``PATH`` option.
        """
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # ``connection_options`` is a dict, so the option has to be read with
        # ``.get``; ``getattr`` never finds the key and always gave the default.
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.

        Creates the index directory if necessary, picks the storage, builds
        the schema and query parser and finally opens (or creates) the index.
        Raises ``IOError`` when the index path is not writable.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # The directory exists but holds no index yet.
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Builds the Whoosh schema for the given Haystack search fields.

        ``fields`` maps field names to Haystack field objects. Returns a tuple
        ``(content_field_name, schema)`` where ``content_field_name`` is the
        index name of the field flagged as ``document`` (empty string when
        there is none). Raises ``SearchBackendError`` when ``fields``
        contributes no schema fields.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Adds or updates the documents for every object in ``iterable``.

        ``index`` is the Haystack search index used to prepare each object.
        ``commit`` is accepted for interface compatibility but not consulted:
        the writer is committed whenever ``iterable`` is non-empty. Errors for
        single documents are re-raised unless ``silently_fail`` is set, in
        which case they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

    def remove(self, obj_or_string, commit=True):
        """
        Deletes the document identified by ``obj_or_string`` from the index.

        ``commit`` is accepted for interface compatibility but not consulted.
        Failures are re-raised unless ``silently_fail`` is set, in which case
        they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' % (ID, whoosh_id)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Whoosh: %s", whoosh_id, e)

    def clear(self, models=None, commit=True):
        """
        Removes documents from the index.

        With no ``models`` (``None`` or empty) the whole index is deleted and
        recreated; otherwise only documents of the given models are deleted.
        ``commit`` is accepted for interface compatibility but not consulted.
        Failures are re-raised unless ``silently_fail`` is set, in which case
        they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        try:
            if not models:
                self.delete_index()
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append(u"%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to clear documents from Whoosh: %s", e)

    def delete_index(self):
        """Wipes the stored index and runs ``setup`` again to recreate it."""
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(self.path):
            shutil.rmtree(self.path)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        """Refreshes the index and asks Whoosh to optimize it."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        """
        Translates a ``[start_offset, end_offset)`` slice into Whoosh paging.

        Returns a tuple ``(page_num, page_length)`` where ``page_num`` is
        1-based. A missing ``end_offset`` is treated as 1000000 and a missing
        ``start_offset`` as 0.
        """
        # Prevent against Whoosh throwing an error. Requires an end_offset
        # greater than 0.
        if end_offset is not None and end_offset <= 0:
            end_offset = 1

        # Determine the page.
        page_num = 0

        if end_offset is None:
            end_offset = 1000000

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length and page_length > 0:
            page_num = int(start_offset / page_length)

        # Increment because Whoosh uses 1-based page numbers.
        page_num += 1
        return page_num, page_length

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, spelling_query=None, within=None,
               dwithin=None, distance_point=None, models=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        """
        Runs ``query_string`` against the index and returns a result dict.

        The returned dict always has ``results`` and ``hits``; depending on the
        path taken it also carries ``facets`` and ``spelling_suggestion``.
        Faceting arguments only trigger a warning. All ``sort_by`` entries
        must share one direction, otherwise ``SearchBackendError`` is raised;
        only the first entry is passed on to Whoosh.

        Every searcher opened here is closed before returning, including on
        early returns and when an exception propagates.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list[0]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None
        searcher = None

        # The try/finally guarantees both searchers are released on every
        # exit path, not only after a successful search.
        try:
            if narrow_queries is not None:
                # Potentially expensive? I don't see another way to do it in Whoosh...
                narrow_searcher = self.index.searcher()

                for nq in narrow_queries:
                    recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                                     limit=None)

                    if len(recent_narrowed_results) <= 0:
                        return {
                            'results': [],
                            'hits': 0,
                        }

                    if narrowed_results:
                        narrowed_results.filter(recent_narrowed_results)
                    else:
                        narrowed_results = recent_narrowed_results

            self.index = self.index.refresh()

            if not self.index.doc_count():
                # Empty index: nothing to search, but a suggestion may still
                # be requested.
                if self.include_spelling:
                    if spelling_query:
                        spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                    else:
                        spelling_suggestion = self.create_spelling_suggestion(query_string)
                else:
                    spelling_suggestion = None

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': spelling_suggestion,
                }

            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(
                    parsed_query,
                    page_num,
                    **search_kwargs
                )
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            return self._process_results(raw_page, highlight=highlight, query_string=query_string, spelling_query=spelling_query, result_class=result_class)
        finally:
            if searcher is not None:
                searcher.close()

            if narrow_searcher is not None:
                narrow_searcher.close()

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        """
        Returns documents similar to the one indexed for ``model_instance``.

        The similarity is computed by Whoosh on the content field. Results are
        narrowed to the selected models and, when given (and not ``'*'``), to
        ``additional_query_string``. The returned dict has the same shape as
        the one from ``search``.

        Every searcher opened here is closed before returning, including on
        early returns and when an exception propagates.
        """
        if not self.setup_complete:
            self.setup()

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None
        searcher = None

        # The try/finally guarantees both searchers are released on every
        # exit path; ``searcher`` stays None when the index is empty.
        try:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                                 limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            self.index = self.index.refresh()
            raw_results = EmptyResults()

            if self.index.doc_count():
                query = "%s:%s" % (ID, get_identifier(model_instance))
                searcher = self.index.searcher()
                parsed_query = self.parser.parse(query)
                results = searcher.search(parsed_query)

                if len(results):
                    raw_results = results[0].more_like_this(field_name, top=end_offset)

                # Handle the case where the results have been narrowed.
                if narrowed_results is not None and hasattr(raw_results, 'filter'):
                    raw_results.filter(narrowed_results)

            try:
                raw_page = ResultsPage(raw_results, page_num, page_length)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            return self._process_results(raw_page, result_class=result_class)
        finally:
            if searcher is not None:
                searcher.close()

            if narrow_searcher is not None:
                narrow_searcher.close()

    def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None, result_class=None):
        """
        Converts a Whoosh results page into the dictionary returned to Haystack.

        Each hit becomes a ``result_class`` instance (``SearchResult`` by
        default). Hits whose model cannot be resolved or is not part of the
        unified index are skipped and subtracted from the hit count. Returns a
        dict with ``results``, ``hits``, ``facets`` (always empty) and
        ``spelling_suggestion``.
        """
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                # The search index only depends on the model, so look it up
                # once per hit instead of once per stored field.
                index = unified_index.get_index(model)

                for key, value in raw_result.items():
                    string_key = str(key)

                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(',')
                        else:
                            additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                # These two are passed to the result class positionally instead.
                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter('em')
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name),
                        terms,
                        sa,
                        ContextFragmenter(),
                        formatter
                    )
                    additional_fields['highlighted'] = {
                        self.content_field_name: [whoosh_result],
                    }

                result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """
        Builds a spelling suggestion for ``query_string``.

        Reserved words and characters are removed from the query, the rest is
        split on whitespace and the first correction offered for each word
        (if any) is kept. Returns the kept words joined by single spaces, or
        ``None`` for an empty query.
        """
        # Nothing to correct: return before an index reader is opened.
        if not query_string:
            return None

        cleaned_query = force_text(query_string)

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        reader = self.index.reader()

        try:
            corrector = reader.corrector(self.content_field_name)

            for word in query_words:
                suggestions = corrector.suggest(word, limit=1)

                if len(suggestions) > 0:
                    suggested_words.append(suggestions[0])
        finally:
            # The reader was opened here, so it is released here as well.
            reader.close()

        return ' '.join(suggested_words)

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Date-like objects without an ``hour`` attribute are promoted to a
        midnight ``datetime``; other date-like objects, integers and floats
        are passed through untouched. Booleans become ``'true'``/``'false'``,
        lists and tuples are joined with commas and anything else goes
        through ``force_text``.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_text(v) for v in value])
        elif isinstance(value, (six.integer_types, float)):
            # Leave it alone.
            pass
        else:
            value = force_text(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        ``'true'``/``'false'`` become booleans and strings matching
        ``DATETIME_REGEX`` become ``datetime`` objects. Anything else is
        tried as JSON; the decoded object is returned when it is a container
        or a number, otherwise the original value is returned unchanged.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(converted_value, (list, tuple, set, dict, six.integer_types, float, complex)):
                return converted_value
        except Exception:
            # Best effort: if decoding fails (not JSON, not a string, ...),
            # continue on. Unlike a bare ``except``, this no longer swallows
            # KeyboardInterrupt or SystemExit.
            pass

        return value
コード例 #49
0
# 方法一 使用FileStorage对象
from whoosh.filedb.filestore import FileStorage
storage = FileStorage('index')  # idx_path 为索引路径
idx1 = storage.open_index(indexname='idx1')

from whoosh import index
# 方法二 使用open_dir函数
from whoosh.index import open_dir
idx2 = open_dir('index', indexname='idx2')  # indexname 为索引名
print(index.exists_in('index', indexname='idx2'))
pass

from whoosh.qparser import QueryParser, MultifieldParser, OrGroup, FieldsPlugin

og = OrGroup.factory(0.9)

qp = QueryParser("content", schema=idx1.schema)  # group=OrGroup
qp.remove_plugin_class(FieldsPlugin)
q = qp.parse("reset")
print(q)
# mqp = MultifieldParser(["title", "content"], schema=idx1.schema)
# mq = mqp.parse(u"many only")
#
# from whoosh.query import *
# myquery = And([Term("title", u"third"), q])
# # myquery = Term("title", u"ird")
# print(myquery)
searcher = idx1.searcher()
r = (searcher.search(q=q, limit=None))
print(len(r))
for hit in r: