Example #1
0
def build_index(sa_session, whoosh_index_dir, path_to_repositories):
    """
    Build the search indexes. One for repositories and another for tools within.

    :param sa_session: database session handed to ``get_repos``.
    :param whoosh_index_dir: directory that receives the repository index; the
        tool index is written to its ``tools`` subdirectory.
    :param path_to_repositories: passed through to ``get_repos``.
    """
    tool_index_dir = os.path.join(whoosh_index_dir, 'tools')
    # makedirs creates whoosh_index_dir on the way. Attempting the creation and
    # tolerating an already existing directory removes the check-then-create
    # race of testing os.path.exists() first.
    try:
        os.makedirs(tool_index_dir)
    except OSError:
        if not os.path.isdir(tool_index_dir):
            raise

    repo_index_storage = FileStorage(whoosh_index_dir)
    tool_index_storage = FileStorage(tool_index_dir)

    repo_index = repo_index_storage.create_index(repo_schema)
    tool_index = tool_index_storage.create_index(tool_schema)

    repo_index_writer = repo_index.writer()
    tool_index_writer = tool_index.writer()

    repos_indexed = 0
    tools_indexed = 0

    for repo in get_repos(sa_session, path_to_repositories):
        repo_id = repo.get('id')
        repo_name = unicodify(repo.get('name'))
        repo_owner_username = unicodify(repo.get('repo_owner_username'))

        repo_index_writer.add_document(
            id=repo_id,
            name=repo_name,
            description=unicodify(repo.get('description')),
            long_description=unicodify(repo.get('long_description')),
            homepage_url=unicodify(repo.get('homepage_url')),
            remote_repository_url=unicodify(repo.get('remote_repository_url')),
            repo_owner_username=repo_owner_username,
            times_downloaded=repo.get('times_downloaded'),
            approved=repo.get('approved'),
            last_updated=repo.get('last_updated'),
            full_last_updated=repo.get('full_last_updated'))
        #  Tools get their own index. A repository without a 'tools_list'
        #  entry is treated as having no tools.
        for tool in repo.get('tools_list') or ():
            tool_index_writer.add_document(
                id=unicodify(tool.get('id')),
                name=unicodify(tool.get('name')),
                version=unicodify(tool.get('version')),
                description=unicodify(tool.get('description')),
                help=unicodify(tool.get('help')),
                repo_owner_username=repo_owner_username,
                repo_name=repo_name,
                repo_id=repo_id)
            tools_indexed += 1
            print(tools_indexed, 'tools (', tool.get('id'), ')')

        repos_indexed += 1
        print(repos_indexed, 'repos (', repo_id, ')')

    tool_index_writer.commit()
    repo_index_writer.commit()

    print("TOTAL repos indexed: ", repos_indexed)
    print("TOTAL tools indexed: ", tools_indexed)
def build_index(sa_session, whoosh_index_dir, path_to_repositories):
    """
    Create the repository index in ``whoosh_index_dir`` and the tool index in
    its ``tools`` subdirectory, filling both from ``get_repos``.
    """
    # Document fields that go through unicodify() versus those stored as-is.
    repo_text_keys = ('name', 'description', 'long_description', 'homepage_url',
                      'remote_repository_url', 'repo_owner_username')
    repo_raw_keys = ('id', 'times_downloaded', 'approved', 'last_updated',
                     'full_last_updated')
    tool_text_keys = ('id', 'name', 'version', 'description', 'help')

    #  Rare race condition exists here and below
    tool_index_dir = os.path.join(whoosh_index_dir, 'tools')
    for directory in (whoosh_index_dir, tool_index_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    repo_storage, tool_storage = FileStorage(whoosh_index_dir), FileStorage(tool_index_dir)
    repo_ix = repo_storage.create_index(repo_schema)
    tool_ix = tool_storage.create_index(tool_schema)
    repo_writer = repo_ix.writer()
    tool_writer = tool_ix.writer()

    repo_count = 0
    tool_count = 0
    for repo in get_repos(sa_session, path_to_repositories):
        repo_doc = dict((key, repo.get(key)) for key in repo_raw_keys)
        repo_doc.update((key, unicodify(repo.get(key))) for key in repo_text_keys)
        repo_writer.add_document(**repo_doc)

        #  Each tool becomes a document of its own in the tool index
        for tool in repo.get('tools_list'):
            tool_doc = dict((key, unicodify(tool.get(key))) for key in tool_text_keys)
            tool_doc['repo_owner_username'] = unicodify(repo.get('repo_owner_username'))
            tool_doc['repo_name'] = unicodify(repo.get('name'))
            tool_doc['repo_id'] = repo.get('id')
            tool_writer.add_document(**tool_doc)
            tool_count += 1
            print(tool_count, 'tools (', tool.get('id'), ')')

        repo_count += 1
        print(repo_count, 'repos (', repo.get('id'), ')')

    tool_writer.commit()
    repo_writer.commit()

    print("TOTAL repos indexed: ", repo_count)
    print("TOTAL tools indexed: ", tool_count)
Example #3
0
 def _get_index(self, language=None):
     """Return the index stored in ``self._index_dir``.

     An existing index is opened; otherwise a new one is created with the
     schema that ``self._get_schema(language)`` returns.
     """
     store = FileStorage(self._index_dir).create()
     if not store.index_exists():
         return store.create_index(self._get_schema(language))
     return store.open_index()
Example #4
0
def build_index():
    """building the index from scratch"""
    print "building index.."

    index_dir = PYTHON_SEARCH_DIR
    if TEST_COLLECTION:
        index_dir = PYTHON_SEARCH_DIR_TEST
        #CR_DOCS_DB.drop()
        #CR_DOCS_DB.ensure_index("code_id", unique=True)
    if os.path.exists(index_dir):
        shutil.rmtree(index_dir)
    os.mkdir(index_dir)
    schema = get_schema()
    storage = FileStorage(index_dir)
    ix = storage.create_index(schema)
    w = ix.writer()
    print "finding posts.."
    posts_with_code = POSTS_DB.find({"answers.Body": {"$regex": "/.*<code>.*/"}}, timeout=False)
    print "adding files.."
    q = 0
    for i, question in enumerate(posts_with_code):
        q += add_doc(w, question)
        if i % 1000 == 0 and not i == 0:
            print "commit number:", str(i / 1000), "with", q, "codes"
            w.commit()
            w = ix.writer()

    w.commit()
    posts_with_code.close()
    print "the index was built!"
    return ix
def build_index(sa_session, toolshed_whoosh_index_dir):
    storage = FileStorage(toolshed_whoosh_index_dir)
    index = storage.create_index(schema)
    writer = index.writer()

    def to_unicode(a_basestr):
        if type(a_basestr) is str:
            return unicode(a_basestr, 'utf-8')
        else:
            return a_basestr

    repos_indexed = 0
    for (id, name, description, long_description, homepage_url,
         remote_repository_url, repo_owner_username, times_downloaded,
         approved, last_updated, full_last_updated) in get_repos(sa_session):

        writer.add_document(
            id=id,
            name=to_unicode(name),
            description=to_unicode(description),
            long_description=to_unicode(long_description),
            homepage_url=to_unicode(homepage_url),
            remote_repository_url=to_unicode(remote_repository_url),
            repo_owner_username=to_unicode(repo_owner_username),
            times_downloaded=times_downloaded,
            approved=approved,
            last_updated=last_updated,
            full_last_updated=full_last_updated)
        repos_indexed += 1
    writer.commit()
    print "Number of repos indexed: ", repos_indexed
Example #6
0
    def __init__(self, path: Path):
        """Open the index stored in *path*, creating it when none exists yet.

        :param path: directory that holds (or will hold) the index files.
        """
        # Create the directory up front and let the storage decide whether an
        # index is present: a directory that exists but holds no index gets a
        # fresh index instead of failing inside open_index().
        path.mkdir(exist_ok=True, parents=True)
        storage = FileStorage(fspath(path))

        if storage.index_exists():
            self.ix = storage.open_index()
        else:
            self.ix = storage.create_index(self.schema)
Example #7
0
def create_index(sender=None, **kwargs):
    """Create the file-based whoosh index named "search" in the directory
    given by settings.WHOOSH_INDEX, so make sure that setting is defined.

    Nothing is done when that directory already exists.
    """
    if os.path.exists(settings.WHOOSH_INDEX):
        return
    os.mkdir(settings.WHOOSH_INDEX)
    FileStorage(settings.WHOOSH_INDEX).create_index(schema=WHOOSH_SCHEMA,
                                                    indexname="search")
def init_index(index=".index"):
	"""Create an index with the fixed file schema in directory *index* and return it.

	The directory is created when missing. ``create_index`` is called on every
	invocation, after which the index is re-opened from the storage.
	"""
	if not os.path.exists(index):
		os.mkdir(index)
	store = FileStorage(index)
	file_schema = Schema(
		name=TEXT(stored=True),
		ext=KEYWORD,
		title=TEXT(stored=True),
		content=TEXT,
		path=ID(stored=True),
		tags=KEYWORD)
	store.create_index(file_schema)
	return store.open_index()
Example #9
0
 def _setup(self, storage_directory):
     """Build the schema and obtain an index for it.

     With a falsy ``storage_directory`` the index lives in a ``RamStorage``.
     Otherwise an existing directory is opened (and
     ``self._using_existing_index`` is set), while a missing one is created
     and receives a new index.

     Returns the tuple ``(schema, index)``.
     """
     schema = fields.Schema(
         oid=fields.ID(stored=True, unique=True),
         name=fields.ID())
     schema.add('*', fields.TEXT, glob=True)

     if not storage_directory:
         return (schema, RamStorage().create_index(schema))

     if os.path.exists(storage_directory):
         self._using_existing_index = True
         return (schema, FileStorage(storage_directory).open_index())

     os.mkdir(storage_directory)
     return (schema, FileStorage(storage_directory).create_index(schema))
Example #10
0
def get_index(index, schema, refresh=False):
    """Return the index in directory *index*, located next to this module.

    The index is opened when one exists and ``refresh`` is false; otherwise
    the storage directory is created and a new index is built from *schema*.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    index_dir = os.path.join(module_dir, index)
    storage = FileStorage(index_dir)
    if not exists_in(index_dir) or refresh:
        return FileStorage(index_dir).create().create_index(schema)
    return storage.open_index()
Example #11
0
def get_myindex(indexdir='indexdir', filestore=False):
    """Return the index stored in *indexdir*, creating it when none exists.

    :param indexdir: directory that holds the index; created when missing.
    :param filestore: when true, go through a ``FileStorage`` object instead
        of the ``index`` module's convenience functions.
    :returns: the opened or newly created index.
    """
    schema = get_schema()
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)

    # Decide on the presence of an actual index rather than of the directory,
    # so both code paths open what is there and only create what is missing.
    index_present = index.exists_in(indexdir)

    if filestore:
        storage = FileStorage(indexdir)
        if index_present:
            return storage.open_index()
        return storage.create_index(schema)

    if index_present:
        return index.open_dir(indexdir)
    return index.create_in(indexdir, schema)
Example #12
0
def get_index(name, schema, path, clean=False):
    """Return the index called *name* from the storage directory *path*.

    The directory is created when missing. Unless ``clean`` is set, an existing
    index is opened; when there is none (or ``clean`` is set) a new index is
    created from *schema*.
    """
    if not os.path.exists(path):
        os.makedirs(path)

    store = FileStorage(path)

    if not clean:
        try:
            return store.open_index(indexname=name)
        except EmptyIndexError:
            # nothing to open yet - fall through and create it
            pass
    return store.create_index(schema, indexname=name)
def init_index(index=".index"):
    """Set up an index in directory *index* and hand it back.

    A missing directory is created first. The schema is fixed here, and the
    index is created and then re-opened through the same storage object.
    """
    target_dir = index
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    schema = Schema(name=TEXT(stored=True), ext=KEYWORD,
                    title=TEXT(stored=True), content=TEXT,
                    path=ID(stored=True), tags=KEYWORD)

    store = FileStorage(target_dir)
    store.create_index(schema)
    return store.open_index()
Example #14
0
def indexLibraries(db, index_field="name", index_folder="index_fullname"):
    """Index every document of the ``libraries`` collection.

    The index is created in *index_folder* next to this module. Each library
    contributes its *index_field* value as ``fullname`` and its ``_id`` as
    ``id``.
    """
    print("whoosh Indexing")
    library_schema = Schema(
        fullname=TEXT(analyzer=StemmingAnalyzer(), spelling=True),
        id=TEXT(stored=True))
    here = os.path.dirname(os.path.abspath(__file__))
    storage = FileStorage(os.path.join(here, index_folder)).create()
    library_index = storage.create_index(library_schema)
    libraries = db.AQLQuery("FOR library in libraries RETURN library",
                            rawResults=True,
                            batchSize=10000)

    index_writer = library_index.writer()
    for library in libraries:
        index_writer.update_document(fullname=library[index_field],
                                     id=library["_id"])
    index_writer.commit()
Example #15
0
def create_index(request):
    """Add every ``Goods`` row (id and name) to the search index.

    :param request: the incoming request; not inspected here.
    :returns: an ``HttpResponse`` carrying a completion message.
    """
    index_dir = 'D:/Chihuo/shopping_mall/index'
    analyzer = ChineseAnalyzer()
    schema = Schema(ids=TEXT(stored=True, analyzer=analyzer),
                    name=TEXT(stored=True, analyzer=analyzer))
    # Make sure the directory is there before the storage is pointed at it.
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    file_storage = FileStorage(index_dir)
    # Ask the storage whether an index is present: the directory can exist
    # without one, in which case open_index() would fail.
    if file_storage.index_exists():
        ix = file_storage.open_index()
    else:
        ix = file_storage.create_index(schema)
    writer = ix.writer()
    for goods in Goods.objects.all():
        writer.add_document(ids=str(goods.id), name=goods.name)
        print(goods.name)
    writer.commit()
    return HttpResponse('创建完成')
Example #16
0
    def BuiltIndex(self):
        """Index the rows of every table returned by ``self.Gettabs()``.

        The index named 'Hello' in ./whoosh_index is opened, or created when
        it does not exist yet. Every selected column is stored as text. The
        module-level ``connection`` is closed when the method finishes, and the
        number of indexed rows is printed.
        """
        analyzer = ChineseAnalyzer()
        # define schema
        schema = Schema(title=TEXT(sortable=True),
                        zb_url=TEXT(sortable=True),
                        ctime=TEXT(sortable=True),
                        deadline=TEXT(sortable=True),
                        bsdeadline=TEXT(sortable=True),
                        dbtb=TEXT(sortable=True),
                        content=TEXT(sortable=True, analyzer=analyzer),
                        lettercard=TEXT(sortable=True, analyzer=analyzer))
        dirname = './whoosh_index'
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        storage = FileStorage(dirname)
        # Check for the index itself: the directory may exist without it, and
        # open_index() would then fail.
        if storage.index_exists(indexname='Hello'):
            ix = storage.open_index(indexname='Hello')
        else:
            ix = storage.create_index(schema, indexname='Hello')

        writer = ix.writer()

        # columns copied from each row into the document, all as strings
        columns = ("title", "zb_url", "ctime", "deadline", "bsdeadline",
                   "dbtb", "content", "lettercard")

        # fetch rows from DB
        num = 0
        try:
            with connection.cursor() as cursor:
                for tbname in self.Gettabs():
                    sql = '''SELECT `title`, `zb_url`, `ctime`, `deadline`, `bsdeadline`,`dbtb`, `content`, `lettercard` FROM {}'''.format(
                        tbname)
                    cursor.execute(sql)
                    # write the rows into indexes
                    for row in cursor.fetchall():
                        writer.add_document(
                            **{column: str(row[column]) for column in columns})
                        num += 1
                writer.commit()
        finally:
            connection.close()
        print("%d docs indexed!" % num)
Example #17
0
def create_in(dirname, schema, indexname=None):
    """Convenience function to create an index in a directory. Takes care of
    creating a FileStorage object for you.

    :param dirname: the path string of the directory in which to create the index.
    :param schema: a :class:`whoosh.fields.Schema` object describing the index's fields.
    :param indexname: the name of the index to create; you only need to specify this if
        you are creating multiple indexes within the same storage object. A falsy
        value selects the default index name.
    :returns: :class:`Index`
    """

    from whoosh.filedb.filestore import FileStorage

    return FileStorage(dirname).create_index(schema, indexname or _DEF_INDEX_NAME)
Example #18
0
def create_in(dirname, schema, indexname=None):
    """Create an index in the directory *dirname* and return it.

    The FileStorage object for the directory is set up here, so callers only
    deal with the path.

    :param dirname: the path string of the directory in which to create the index.
    :param schema: a :class:`whoosh.fields.Schema` object describing the index's fields.
    :param indexname: the name of the index to create; only needed when several
        indexes share the same storage. The default name is used when omitted.
    :returns: :class:`Index`
    """

    from whoosh.filedb.filestore import FileStorage

    name = indexname if indexname else _DEF_INDEX_NAME
    store = FileStorage(dirname)
    return store.create_index(schema, name)
Example #19
0
def create_index(request):
    """Add every ``Good_type`` row (name, note and id) to the search index.

    :param request: the incoming request; not inspected here.
    :returns: an ``HttpResponse`` carrying a completion message.
    """
    index_dir = './index'
    analyzer = ChineseAnalyzer()
    schema = Schema(name=TEXT(stored=True, analyzer=analyzer),
                    datail=TEXT(stored=True, analyzer=analyzer),
                    ids=TEXT(stored=True))
    # Make sure the directory is there before the storage is pointed at it.
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    file_storage = FileStorage(index_dir)
    # Ask the storage whether an index is present: the directory can exist
    # without one, in which case open_index() would fail.
    if file_storage.index_exists():
        ix = file_storage.open_index()
    else:
        ix = file_storage.create_index(schema)
    writer = ix.writer()
    for goods in Good_type.objects.all():
        writer.add_document(name=goods.name,
                            datail=goods.note,
                            ids=str(goods.id))
    writer.commit()
    return HttpResponse('索引创建完成')
Example #20
0
def create_in(dirname, schema, indexname=None, byteorder=None):
    """Convenience function to create an index in a directory. Takes care of
    creating a FileStorage object for you.

    :param dirname: the path string of the directory in which to create the index.
    :param schema: a :class:`whoosh.fields.Schema` object describing the index's fields.
    :param indexname: the name of the index to create; you only need to specify this if
        you are creating multiple indexes within the same storage object. A falsy
        value selects the default index name.
    :param byteorder: the byte order to use when writing numeric values to disk: 'big',
        'little', or None. If None (the default), Whoosh uses the native platform order.
    :returns: :class:`Index`
    """

    from whoosh.filedb.filestore import FileStorage

    store = FileStorage(dirname, byteorder=byteorder)
    return store.create_index(schema, indexname or _DEF_INDEX_NAME)
Example #21
0
    def __init__(self,
                 index_dir="whoosh_index",
                 schema_type="",
                 schema_name="default_schema"):
        """Open the index in ``index_dir/schema_name``, creating it if needed.

        :param index_dir: parent directory of all indexes.
        :param schema_type: "dialogs" or "embedding"; selects the schema used
            when a new index has to be created.
        :param schema_name: name of the subdirectory holding this index.
        :raises ValueError: if a new index has to be created but
            ``schema_type`` does not name a known schema.
        """
        self.index_dir = index_dir
        self.schema_type = schema_type
        self.schema_name = schema_name
        self.schema_dir = self.index_dir + "/" + self.schema_name
        self.search_limit = 100

        analyzer = analysis.StandardAnalyzer(stoplist=frozenset([]))
        # create schema
        if self.schema_type == "dialogs":
            self.schema = Schema(dialog=TEXT(analyzer=analyzer, stored=True),
                                 lang=ID(stored=True),
                                 turn=NUMERIC(stored=True),
                                 vector=STORED)
        elif self.schema_type == "embedding":
            self.schema = Schema(key=ID(stored=True), vector=STORED)
        else:
            # No schema is defined for this type. An existing index can still
            # be opened; creating a new one is rejected below.
            self.schema = None

        # makedirs creates index_dir on the way; an already existing directory
        # is tolerated so that concurrent start-ups do not trip over each other.
        try:
            os.makedirs(self.schema_dir)
        except OSError:
            if not os.path.isdir(self.schema_dir):
                raise

        # create / load index
        storage = FileStorage(self.schema_dir)
        if storage.index_exists():
            print('index exists, loading.')
            self.ix = storage.open_index()
        else:
            if self.schema is None:
                raise ValueError(
                    "cannot create a new index: unknown schema_type %r "
                    "(expected 'dialogs' or 'embedding')" % (self.schema_type,))
            print('index doesn\'t exists, creating.')
            self.ix = storage.create_index(self.schema)

        self.writer = None
Example #22
0
def build_index( sa_session, whoosh_index_dir ):
    storage = FileStorage( whoosh_index_dir )
    index = storage.create_index( schema )
    writer = index.writer()
    def to_unicode( a_basestr ):
        if type( a_basestr ) is str:
            return unicode( a_basestr, 'utf-8' )
        else:
            return a_basestr
    lddas_indexed = 0
    for id, name, info, dbkey, message in get_lddas( sa_session ):
        writer.add_document( id=id,
                             name=to_unicode( name ),
                             info=to_unicode( info ),
                             dbkey=to_unicode( dbkey ),
                             message=to_unicode( message ) )
        lddas_indexed += 1
    writer.commit()
    print "Number of active library datasets indexed: ", lddas_indexed
Example #23
0
def newIndex():
    '''
    newIndex()
    Creates the index/schema for the Whoosh module
    INPUTS: (none)
    OUTPUTS: idx -- index 
    '''
    print '\tCreating a new Index in the current directory'
    # Create an index to store the artist/title and lyrics
    schm = Schema(Name=TEXT(stored=True), Ingr=KEYWORD(stored=True, commas=True))
    # Create a directory called FAR_Storage; will contain the index
    # See Whoosh documentation for more information
    if not os.path.exists('FAR_Storage'):
        os.mkdir('FAR_Storage')
    idxDir ='FAR_Storage'
    storage = FileStorage(idxDir)
    idx = storage.create_index(schm, indexname='FAR')
    idx = storage.open_index(indexname = 'FAR')
    return idx
Example #24
0
def create_index(index_dir, data_dir):
    """Rebuild the full-text index of all Markdown files below *data_dir*.

    :param index_dir: directory holding the index. An index already stored
        there is removed first; a missing directory is created.
    :param data_dir: root directory searched recursively for files ending in
        ``.md`` (``.git`` directories are not entered).
    :returns: ``True`` once the index has been committed.
    :raises PermissionError: if the index directory cannot be (re-)created.
    """
    schema = Schema(path=ID(stored=True, unique=True),
                    content=TEXT(stored=True))

    index_present = whoosh.index.exists_in(index_dir)
    try:
        if index_present:
            shutil.rmtree(index_dir)
        os.makedirs(index_dir, exist_ok=True)
    except OSError as err:
        raise PermissionError(
            _("Das Index-Verzeichnis konnte nicht erstellt werden")) from err

    storage_obj = FileStorage(index_dir)
    idx = storage_obj.create_index(schema)

    writer = idx.writer()

    # Iterate over all files ending in .md
    for (path, dirs, files) in os.walk(data_dir):
        # Remove the git-Folder so that os.walk does not descend into it
        if '.git' in dirs:
            dirs.remove('.git')

        rel_dir = os.path.relpath(path, data_dir)

        for article in files:
            if not article.endswith('.md'):
                continue

            # normpath drops the "./" prefix of top-level files without
            # touching directory names that themselves start or end with
            # "." or "/" (which str.strip('./') would mangle).
            article_path = os.path.normpath(os.path.join(rel_dir, article))

            try:
                # Get file content
                with codecs.open(os.path.join(path, article), "r",
                                 "utf-8") as f:
                    content = f.read()
            except (OSError, UnicodeError):
                # Unreadable or not valid UTF-8: leave this file out.
                continue

            writer.add_document(path=article_path, content=content)

    writer.commit()

    return True
Example #25
0
def test_storage_creation():
    """FileStorage.create() makes the directory and destroy() removes it again."""
    import tempfile, uuid
    from whoosh import fields
    from whoosh.filedb.filestore import FileStorage

    # a uniquely named directory that must not exist beforehand
    dirpath = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    assert not os.path.exists(dirpath)

    st = FileStorage(dirpath)
    st.create()
    assert os.path.exists(dirpath)

    ix = st.create_index(fields.Schema(text=fields.TEXT))
    with ix.writer() as w:
        for phrase in (u("alfa bravo"), u("bracho charlie")):
            w.add_document(text=phrase)

    st.destroy()
    assert not os.path.exists(dirpath)
def build_index(sa_session, whoosh_index_dir):
    """Create an index in ``whoosh_index_dir`` with one document per row
    yielded by ``get_lddas(sa_session)`` and print the number indexed.

    :param sa_session: database session handed to ``get_lddas``.
    :param whoosh_index_dir: directory in which the index is created.
    """
    storage = FileStorage(whoosh_index_dir)
    index = storage.create_index(schema)
    writer = index.writer()

    def to_unicode(a_basestr):
        # Decode byte strings only. Everything else (values that are already
        # text, but also e.g. None) is passed through untouched, because
        # text_type(value, 'utf-8') raises TypeError for non-bytes input.
        if isinstance(a_basestr, bytes):
            return text_type(a_basestr, 'utf-8')
        return a_basestr

    lddas_indexed = 0
    for id, name, info, dbkey, message in get_lddas(sa_session):
        writer.add_document(id=id,
                            name=to_unicode(name),
                            info=to_unicode(info),
                            dbkey=to_unicode(dbkey),
                            message=to_unicode(message))
        lddas_indexed += 1
    writer.commit()
    print("Number of active library datasets indexed: ", lddas_indexed)
Example #27
0
def test_storage_creation():
    """Check the create/destroy life cycle of a FileStorage directory."""
    import tempfile, uuid
    from whoosh import fields
    from whoosh.filedb.filestore import FileStorage

    text_schema = fields.Schema(text=fields.TEXT)
    unique_name = str(uuid.uuid4())
    storage_dir = os.path.join(tempfile.gettempdir(), unique_name)
    assert not os.path.exists(storage_dir)

    storage = FileStorage(storage_dir)
    storage.create()
    assert os.path.exists(storage_dir)

    # write two documents so that the storage actually holds index files
    index = storage.create_index(text_schema)
    with index.writer() as writer:
        writer.add_document(text=u("alfa bravo"))
        writer.add_document(text=u("bracho charlie"))

    storage.destroy()
    assert not os.path.exists(storage_dir)
Example #28
0
def build_index( sa_session, toolshed_whoosh_index_dir ):
    storage = FileStorage( toolshed_whoosh_index_dir )
    index = storage.create_index( schema )
    writer = index.writer()
    def to_unicode( a_basestr ):
        if type( a_basestr ) is str:
            return unicode( a_basestr, 'utf-8' )
        else:
            return a_basestr

    repos_indexed = 0
    for ( id,
            name, 
            description, 
            long_description,
            homepage_url,
            remote_repository_url,
            repo_owner_username,
            times_downloaded,
            approved,
            last_updated,
            full_last_updated ) in get_repos( sa_session ):

        writer.add_document( id = id,
                             name = to_unicode( name ),
                             description = to_unicode( description ), 
                             long_description = to_unicode( long_description ), 
                             homepage_url = to_unicode( homepage_url ), 
                             remote_repository_url = to_unicode( remote_repository_url ), 
                             repo_owner_username = to_unicode( repo_owner_username ),
                             times_downloaded = times_downloaded,
                             approved = approved,
                             last_updated = last_updated,
                             full_last_updated = full_last_updated )
        repos_indexed += 1
    writer.commit()
    print "Number of repos indexed: ", repos_indexed
Example #29
0
	def index(self):
		if self.empty:
			if not os.path.exists(self.folder):
				os.makedirs(self.folder)
			st = FileStorage(self.folder)
			ix = st.create_index(self.schema)
			w = ix.writer()
			w.add_document(name = u"beuha")
			pipe = file.ID3Filter()
			#[TODO] using itunes info for artwork?
			cpt = 0
			for track in pipe(ItunesParser(self.path)):
				if track['album'] != None : 
					album = track['album'].encode('ascii', 'ignore')
				else:
					album = ""
				#print track['artwork'], "[%s]" % album, track['name'].encode('ascii', 'ignore')
				if cpt % 20 == 0:
					print "\n%i " %cpt,
				print '#',
				#print track['album'], track['name']
				w.add_document(
					trackId = track['trackId'], name=track['name']
					,artist=track['artist'], album=track['album'],
					genre=track['genre'], location=track['location'],
					artwork=boolean(track['artwork']),
					trackNumber=track['trackNumber'], bitRate=track['bitRate']
				)
				#if cpt % 100 == 1:
				#	w.commit()
				cpt += 1
			print "\n\n%i tracks indexed" % cpt
			w.commit()
			ix.optimize()
			ix.close()
		else :
			print "already indexed"
Example #30
0
class TinaIndex():
    """
    Open or Create a whoosh index stored in a directory
    """

    def __init__( self, indexdir ):
        """
        Open the index in ``indexdir``. When the storage holds no index yet,
        the directory is created if needed and a new index is built from a
        ``TinaSchema``. A ``LockError`` is logged and raised again.
        """
        self.writer = self.reader = self.searcher = None
        self.indexdir = indexdir
        self.storage = FileStorage(self.indexdir)
        self.index = None
        try:
            self.index = self.storage.open_index()
        except EmptyIndexError:
            _logger.warning( "No existing index at %s : "%self.indexdir)
            self.schema = TinaSchema()
            if not os.path.exists(self.indexdir):
                os.mkdir(self.indexdir)
            self.index = self.storage.create_index(self.schema)
        except LockError as le:
            _logger.error("index LockError %s : "%self.indexdir)
            raise LockError(le)
Example #31
0
 def make_index(self, dirname, schema, ixname):
     """Create the index ``ixname`` with *schema* in directory *dirname*
     (created when missing) and return it."""
     if not exists(dirname):
         mkdir(dirname)
     return FileStorage(dirname).create_index(schema, indexname = ixname)
Example #32
0
# Create the index directory on first use, then create an index in it
# with the given schema.
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")

ix = index.create_in("indexdir", schema)
打开一个已经存在于某个目录的索引,使用index.open_dir()
[python] view plain copy
import whoosh.index as index

ix = index.open_dir("indexdir")
这些是便利方法:
[python] view plain copy
from whoosh.filedb.filestore import FileStorage
storage = FileStorage("indexdir")

# Create an index
ix = storage.create_index(schema)

# Open an existing index
storage.open_index()
你和index对象一起创建的schema对象是可序列化的并且和index一起存储
你可以在同一个目录下面使用多个索引,用关键字参数分开
[python] view plain copy
# Using the convenience functions
ix = index.create_in("indexdir", schema=schema, indexname="usages")
ix = index.open_dir("indexdir", indexname="usages")

# Using the Storage object
ix = storage.create_index(schema, indexname="usages")
ix = storage.open_index(indexname="usages")

Clearing the index
Example #33
0
class WhooshSearchBackend(BaseSearchBackend):
    """
    Haystack search backend that keeps its index in Whoosh.

    The index lives either in a directory on disk (``FileStorage``) or in a
    process-wide ``RamStorage``; it is opened or created lazily by
    :meth:`setup`.
    """

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\',
        '+',
        '-',
        '&&',
        '||',
        '!',
        '(',
        ')',
        '{',
        '}',
        '[',
        ']',
        '^',
        '"',
        '~',
        '*',
        '?',
        ':',
        '.',
    )

    def __init__(self, connection_alias, **connection_options):
        """
        Read the connection options; the index itself is not touched here.

        Recognised options: ``PATH`` (index directory, mandatory for file
        storage), ``STORAGE`` (anything other than ``'file'`` selects RAM
        storage) and ``POST_LIMIT``.

        Raises ``ImproperlyConfigured`` when file storage is selected but no
        ``PATH`` is given.
        """
        super(WhooshSearchBackend, self).__init__(connection_alias,
                                                  **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # connection_options is a dict, so the value must be read with
        # .get(); getattr() would never find the key and always fall back
        # to the default.
        self.post_limit = connection_options.get('POST_LIMIT',
                                                 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'."
                % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.

        Creates the index directory when it is missing, selects the storage,
        builds the schema and query parser, then opens the index (creating
        it when the directory is new or the index is empty).

        Raises ``IOError`` when the index directory is not writable.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            # The RAM store is shared through LOCALS and created on first use.
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[
            self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Translate Haystack search fields into a Whoosh schema.

        ``fields`` maps field names to field objects. Returns a tuple
        ``(content_field_name, Schema)`` where ``content_field_name`` is the
        index name of the last field flagged ``document`` (``''`` if none).

        Raises ``SearchBackendError`` when no field is added on top of the
        three built-in ones.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        # Only the field objects are needed; the dict keys are not used.
        for field_class in fields.values():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=StemmingAnalyzer(),
                    field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Add or update the documents prepared from ``iterable``.

        ``index`` prepares each object via ``full_prepare``. ``commit`` is
        accepted for interface compatibility but the writer is always
        committed when ``iterable`` is non-empty. ``iterable`` must support
        ``len()``.

        Errors raised while writing a document are re-raised unless
        ``self.silently_fail`` is set, in which case they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message.
                # Exception instances have no __name__; the name lives on
                # the class.
                self.log.error(u"%s while preparing object for update" %
                               e.__class__.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
Example #34
0
def create_index(sender=None, **kwargs):
    """
    Create the Whoosh index directory and an index inside it.

    Does nothing when ``settings.WHOOSH_INDEX`` already exists. ``sender``
    and ``kwargs`` are accepted but not used.
    """
    index_dir = settings.WHOOSH_INDEX
    if os.path.exists(index_dir):
        return

    os.mkdir(index_dir)
    FileStorage(index_dir).create_index(schema=WHOOSH_SCHEMA)
Example #35
0
class Library(object):
    """
    Whoosh-backed lookup of ``Chip`` records.

    The index is opened (or created) on construction and can optionally be
    rebuilt from the database session.
    """

    RESULTS_LIMIT = 700
    FUZZY_LIMIT = 5
    SUGGESTIONS_LIMIT = 5

    def __init__(self, dbsession, **settings):
        """
        Initializes Whoosh by setting up and loading indexes for lookup.

        Settings read: ``whoosh.store`` (index directory, defaults to
        ``<config_path>/whoosh-data``), ``whoosh.indexname`` (defaults to
        ``'chips'``) and ``whoosh.rebuild`` (defaults to ``'false'``).
        """
        self._dbsession = dbsession
        self.schema = ChipSchema()
        self.directory = settings.get(
            'whoosh.store',
            os.path.join(settings['config_path'], 'whoosh-data')
        )
        self.indexname = settings.get(
            'whoosh.indexname',
            'chips'
        )
        self.rebuild = asbool(settings.get('whoosh.rebuild', 'false'))
        self.storage = FileStorage(self.directory)

        # A single setindex() call is enough: it already wipes the directory
        # when a rebuild was requested. Calling it repeatedly only deleted
        # and recreated the same empty index again.
        self.setindex()

        if self.rebuild:
            self.buildindex()

    def setindex(self):
        """
        Open the index, creating directory and index when needed.

        When ``self.rebuild`` is set, an existing index directory is removed
        first so the index is always created from scratch.
        """
        if self.rebuild and os.path.exists(self.directory):
            shutil.rmtree(self.directory)

        if not os.path.exists(self.directory):
            os.mkdir(self.directory)

        # After a rebuild wipe the directory is empty, so an existing index
        # can only be found here when no rebuild was requested.
        if whoosh.index.exists_in(
            self.directory,
            indexname=self.indexname
        ):
            self.index = self.storage.open_index(indexname=self.indexname)
        else:
            self.index = self.storage.create_index(
                self.schema,
                indexname=self.indexname
            )

    def buildindex(self):
        """
        Index every ``Chip`` returned by the database session.

        The commit uses ``writing.CLEAR`` as its merge type.
        """
        q = self._dbsession.query(Chip).all()
        writer = self.index.writer()
        for chip in q:
            # chip.version may lack a ``name`` attribute; index an empty
            # version string in that case.
            try:
                version = chip.version.name
            except AttributeError:
                version = ''

            writer.add_document(
                id=str(chip.id),
                indice=str(chip.indice),
                indice_game=str(chip.indice_game),
                name=chip.name.lower(),
                name_jp=chip.name_jp,
                name_display=chip.name,
                game=chip.game.name.lower(),
                game_enum=chip.game,
                version=version,
                version_enum=chip.version,
                classification=chip.classification.name,
                classification_enum=chip.classification,
                element=chip.element.name,
                element_enum=chip.element,
                description=chip.description,
                code=','.join(chip.codes_iter()).lower(),
                size=str(chip.size),
                damage_min=str(chip.damage_min),
                damage_max=str(chip.damage_max),
                recovery=str(chip.recovery),
                rarity=str(chip.rarity)
            )
        writer.commit(writing.CLEAR)

    def lookup(self, term, fuzzy=False, limit=None):
        """
        Search the index for ``term`` across the searchable chip fields.

        ``term`` is stripped and lower-cased before parsing. When a
        non-fuzzy search yields nothing, the lookup is retried once as a
        fuzzy search limited to ``FUZZY_LIMIT`` hits. A falsy ``limit``
        means ``RESULTS_LIMIT``.

        Returns the Whoosh results object, or an empty list when the search
        raised ``IndexError``.
        """
        term = term.strip()
        term = term.lower()

        limit = limit or self.RESULTS_LIMIT

        fields = (
            'indice',
            'indice_game',
            'name',
            'name_jp',
            'game',
            'version',
            'classification',
            'element',
            'code',
            'size',
            'damage_min',
            'damage_max',
            'recovery',
            'rarity'
        )
        if fuzzy:
            parser = MultifieldParser(
                fields,
                schema=self.index.schema,
                termclass=FuzzyTerm
            )
        else:
            parser = MultifieldParser(fields, schema=self.index.schema)

        # Replace the default boolean operators with symbolic ones.
        operators = OperatorsPlugin(
            And="&",
            Or="\\|",
            AndNot="&!",
            AndMaybe="&~",
            Not="\\-"
        )
        parser.replace_plugin(operators)
        query = parser.parse(term)
        results = []
        try:
            # The searcher is intentionally left open: the returned results
            # object is read by the caller after this method returns.
            searcher = self.index.searcher()
            results = searcher.search(query, limit=limit)

            if not results and not fuzzy:
                # Try a Fuzzy Search.
                return self.lookup(term, fuzzy=True, limit=self.FUZZY_LIMIT)
        except IndexError:
            # Best effort: fall through and return what we have.
            pass

        return results
Example #36
0
def build_whoosh_index_cron():
    """
    Build the full-text search indexes.

    Indexes active sites into the "site" index and recent articles into the
    "article" index, then deletes articles whose ``uindex`` falls in the
    range up to one week before the current timestamp. Items that fail to
    load or index are skipped (indexing failures are logged).

    Always returns True.
    """
    from web.utils import whoosh_site_schema, whoosh_article_schema
    from whoosh.filedb.filestore import FileStorage
    from whoosh.qparser import QueryParser

    idx_dir = settings.WHOOSH_IDX_DIR
    first_boot = False

    # A missing directory means nothing has been indexed yet.
    if not os.path.exists(idx_dir):
        os.makedirs(idx_dir)
        first_boot = True

    storage = FileStorage(idx_dir)

    # Index the sites
    if first_boot:
        idx = storage.create_index(whoosh_site_schema, indexname="site")
    else:
        idx = storage.open_index(indexname="site", schema=whoosh_site_schema)

    idx_writer = idx.writer()

    for site_id in get_active_sites():
        # Skip sites that are already indexed
        if is_indexed('site', site_id) and not first_boot:
            continue

        # Best effort: skip sites that cannot be loaded. Catch Exception
        # rather than everything so KeyboardInterrupt/SystemExit propagate.
        try:
            site = Site.objects.get(pk=site_id, status='active')
        except Exception:
            continue

        cname = split_cn_words(site.cname, join=True)
        author = site.author or ''
        brief = split_cn_words(site.brief, join=True)

        logger.info(f"源分词结果:`{site_id}`{cname}`{brief}")

        try:
            idx_writer.add_document(id=site_id,
                                    cname=cname,
                                    author=author,
                                    brief=brief)
            set_indexed('site', site_id)
        except Exception:
            logger.warning(f"源索引失败:`{site_id}")
    idx_writer.commit()

    # Index the articles
    if first_boot:
        idx = storage.create_index(whoosh_article_schema, indexname="article")
    else:
        idx = storage.open_index(indexname="article",
                                 schema=whoosh_article_schema)

    idx_writer = idx.writer()

    for uindex in get_recent_articles():
        # Skip articles that are already indexed
        if is_indexed('article', uindex) and not first_boot:
            continue

        try:
            article = Article.objects.get(uindex=uindex, status='active')
        except Exception:
            continue

        content = get_content(uindex, article.site_id)

        if content:
            title = split_cn_words(article.title, join=True)
            author = article.author or ''

            # Index the plain text of the HTML content, not the markup.
            content_soup = BeautifulSoup(content, 'html.parser')
            content = split_cn_words(content_soup.get_text(),
                                     join=True,
                                     limit=20)

            logger.info(f"文章分词结果:`{uindex}`{title}")

            try:
                idx_writer.add_document(uindex=uindex,
                                        title=title,
                                        author=author,
                                        content=content)
                set_indexed('article', uindex)
            except Exception:
                logger.warning(f"文章索引失败:`{uindex}")
    idx_writer.commit()

    # Remove expired articles
    idx = storage.open_index(indexname="article", schema=whoosh_article_schema)
    idx_writer = idx.writer()

    # Upper bound: one week ago, with current_ts() presumably in
    # milliseconds (the offset is 7 * 86400 * 1000) -- TODO confirm.
    lastweek_ts = str(current_ts() - 7 * 86400 * 1000)
    query = QueryParser("uindex",
                        idx.schema).parse('uindex:{to %s]' % lastweek_ts)

    with idx.searcher() as searcher:
        idx_writer.delete_by_query(query, searcher)
        idx_writer.commit()

    return True
class WhooshSearchBackend(BaseSearchBackend):
    """
    Haystack search backend that keeps its index in Whoosh.

    The index is stored on disk (``FileStorage``) or in a shared
    ``RamStorage`` and is opened lazily by ``setup()``.
    """

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        """
        Read the connection options; the index itself is not touched here.

        Recognised options: ``PATH`` (index directory, mandatory for file
        storage), ``STORAGE`` (anything other than ``'file'`` selects RAM
        storage) and ``POST_LIMIT``.

        Raises ``ImproperlyConfigured`` when file storage is selected but no
        ``PATH`` is given.
        """
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # connection_options is a dict, so the value must be read with
        # .get(); getattr() would never find the key and always fall back
        # to the default.
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Prepare storage, schema, parser and index on first use.

        For file storage the index directory is created when missing and
        must be writable (``IOError`` otherwise). For RAM storage the shared
        ``LOCALS.RAM_STORE`` is created on demand. The index is created when
        the directory is new or the existing index is empty, otherwise it is
        opened.
        """
        global LOCALS
        from haystack import connections

        directory_created = False

        if self.use_file_storage:
            # Make sure the index is there.
            if not os.path.exists(self.path):
                os.makedirs(self.path)
                directory_created = True

            if not os.access(self.path, os.W_OK):
                raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

            self.storage = FileStorage(self.path)
        else:
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        unified_index = connections[self.connection_alias].get_unified_index()
        self.content_field_name, self.schema = self.build_schema(unified_index.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        # A freshly created directory cannot contain an index; otherwise try
        # to open one and fall back to creating it when it is empty.
        needs_creation = directory_created
        if not needs_creation:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                needs_creation = True

        if needs_creation:
            self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Translate Haystack search fields into a Whoosh schema.

        ``fields`` maps field names to field objects. Returns a tuple
        ``(content_field_name, Schema)``; ``content_field_name`` is the index
        name of the last field flagged ``document`` (``''`` if none), and
        that field additionally gets ``spelling = True``.

        Raises ``SearchBackendError`` when no field is added on top of the
        three built-in ones.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Number of keys hard-coded by Haystack, used below to detect that
        # no real field was added.
        builtin_count = len(schema_fields)
        content_field_name = ''

        for _name, field in fields.items():
            target = field.index_fieldname

            if field.is_multivalued:
                if field.indexed is False:
                    whoosh_field = IDLIST(stored=True, field_boost=field.boost)
                else:
                    whoosh_field = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field.boost)
            elif field.field_type in ['date', 'datetime']:
                whoosh_field = DATETIME(stored=field.stored, sortable=True)
            elif field.field_type == 'integer':
                whoosh_field = NUMERIC(stored=field.stored, numtype=int, field_boost=field.boost)
            elif field.field_type == 'float':
                whoosh_field = NUMERIC(stored=field.stored, numtype=float, field_boost=field.boost)
            elif field.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                whoosh_field = BOOLEAN(stored=field.stored)
            elif field.field_type == 'ngram':
                whoosh_field = NGRAM(minsize=3, maxsize=15, stored=field.stored, field_boost=field.boost)
            elif field.field_type == 'edge_ngram':
                whoosh_field = NGRAMWORDS(minsize=2, maxsize=15, at='start', stored=field.stored, field_boost=field.boost)
            else:
                whoosh_field = TEXT(stored=True, analyzer=ChineseAnalyzer(), field_boost=field.boost, sortable=True)

            schema_fields[target] = whoosh_field

            if field.document is True:
                content_field_name = field.index_fieldname
                schema_fields[field.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no
        # fields are found.
        if len(schema_fields) <= builtin_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Add or update the documents prepared from ``iterable``.

        ``index`` prepares each object via ``full_prepare``. ``commit`` is
        accepted but not consulted: the writer is committed whenever
        ``iterable`` is non-empty. ``iterable`` must support ``len()``.

        Errors raised while writing a document are re-raised unless
        ``self.silently_fail`` is set, in which case they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Whoosh only accepts unicode values, so convert every value.
            for key in doc:
                converted = self._from_python(doc[key])
                doc[key] = converted

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as exc:
                if not self.silently_fail:
                    raise

                # Log only the object identifier, never the object itself,
                # so that building the log message cannot trigger encoding
                # errors of its own.
                log_extra = {
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                }
                self.log.error(u"%s while preparing object for update" % exc.__class__.__name__, exc_info=True, extra=log_extra)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

    def remove(self, obj_or_string, commit=True):
        """
        Delete the document identified by ``obj_or_string`` from the index.

        ``commit`` is accepted but not used. Failures are re-raised unless
        ``self.silently_fail`` is set, in which case they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            # Match the document on its unique identifier field.
            id_query = self.parser.parse(u'%s:"%s"' % (ID, whoosh_id))
            self.index.delete_by_query(q=id_query)
        except Exception as exc:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Whoosh: %s", whoosh_id, exc)

    def clear(self, models=[], commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        try:
            if not models:
                self.delete_index()
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append(u"%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to clear documents from Whoosh: %s", e)

    def delete_index(self):
        """
        Wipe the whole index and set it up again from scratch.

        For file storage the index directory is removed when it exists; for
        RAM storage the store is cleaned.
        """
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage:
            if os.path.exists(self.path):
                shutil.rmtree(self.path)
        else:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        """
        Convert a start/end offset pair into a page number and page length.

        A missing ``end_offset`` is treated as 1000000 and a missing
        ``start_offset`` as 0; an ``end_offset`` of zero or less is raised
        to 1. Returns ``(page_num, page_length)`` with a 1-based page number.
        """
        if end_offset is None:
            end_offset = 1000000
        elif end_offset <= 0:
            # Prevent against Whoosh throwing an error. Requires an
            # end_offset greater than 0.
            end_offset = 1

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length and page_length > 0:
            zero_based_page = int(start_offset / page_length)
        else:
            zero_based_page = 0

        # Whoosh uses 1-based page numbers.
        return zero_based_page + 1, page_length

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, spelling_query=None, within=None,
               dwithin=None, distance_point=None, models=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        """
        Run ``query_string`` against the Whoosh index and return the results.

        Returns a dict. The early-exit paths return ``{'results': [],
        'hits': 0}`` (some also carry ``'spelling_suggestion'``); the normal
        path returns whatever ``self._process_results`` produces.

        ``sort_by`` is a list of field names, optionally prefixed with ``-``;
        all entries must share the same direction, otherwise
        ``SearchBackendError`` is raised. ``facets``, ``date_facets`` and
        ``query_facets`` only trigger a warning. ``narrow_queries`` and the
        model restriction are applied as a filter on the main query.
        ``fields``, ``within``, ``dwithin``, ``distance_point`` and
        ``kwargs`` are not referenced in this method body.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            # Only the first sort field is handed to Whoosh.
            # NOTE(review): an empty sort_by list raises IndexError here.
            sort_by = sort_by_list[0]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            # The model restriction is expressed as one more narrow query.
            narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                                 limit=None)

                # NOTE(review): this early return (and the ones further
                # down) leaves narrow_searcher open.
                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                # Intersect the results of successive narrow queries.
                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                   narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(
                    parsed_query,
                    page_num,
                    **search_kwargs
                )
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            results = self._process_results(raw_page, highlight=highlight, query_string=query_string, spelling_query=spelling_query, result_class=result_class)
            # Searchers are closed only on this successful path.
            searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

            return results
        else:
            # Empty index: no hits, but still offer a spelling suggestion
            # when spelling support is enabled.
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        """
        Return documents similar to ``model_instance``.

        The candidate set can be narrowed by model (``models`` or, when
        ``limit_to_registered_models`` is true, the models returned by
        ``build_models_list()``) and by ``additional_query_string``.

        Returns the dictionary built by ``_process_results``, or an empty
        result dictionary when a narrowing query matches nothing or the
        requested page does not exist. ``ValueError`` raised while paginating
        is re-raised unless ``silently_fail`` is set.
        """
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        # Both searchers are tracked here so the ``finally`` clause can close
        # whichever ones were actually opened, on every exit path.
        narrow_searcher = None
        searcher = None

        try:
            if narrow_queries:
                # Potentially expensive? I don't see another way to do it in Whoosh...
                narrow_searcher = self.index.searcher()

                for nq in narrow_queries:
                    recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                                     limit=None)

                    if len(recent_narrowed_results) <= 0:
                        return {
                            'results': [],
                            'hits': 0,
                        }

                    if narrowed_results:
                        narrowed_results.filter(recent_narrowed_results)
                    else:
                        narrowed_results = recent_narrowed_results

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            self.index = self.index.refresh()
            raw_results = EmptyResults()

            if self.index.doc_count():
                query = "%s:%s" % (ID, get_identifier(model_instance))
                searcher = self.index.searcher()
                parsed_query = self.parser.parse(query)
                results = searcher.search(parsed_query)

                if len(results):
                    raw_results = results[0].more_like_this(field_name, top=end_offset)

                # Handle the case where the results have been narrowed.
                if narrowed_results is not None and hasattr(raw_results, 'filter'):
                    raw_results.filter(narrowed_results)

            try:
                raw_page = ResultsPage(raw_results, page_num, page_length)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            return self._process_results(raw_page, result_class=result_class)
        finally:
            # ``searcher`` stays None when the index holds no documents.
            if searcher is not None:
                searcher.close()

            if narrow_searcher is not None:
                narrow_searcher.close()

    def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None, result_class=None):
        """
        Convert a page of raw Whoosh hits into the backend result dictionary.

        Hits whose model cannot be resolved or is not in the unified index are
        dropped and subtracted from the hit count. When ``highlight`` is true a
        highlighted excerpt of the content field is attached to each result.

        Returns a dict with the keys ``results``, ``hits``, ``facets`` and
        ``spelling_suggestion``.
        """
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                # The search index depends only on the model, so look it up
                # once per hit rather than once per stored field.
                index_fields = unified_index.get_index(model).fields

                for key, value in raw_result.items():
                    string_key = str(key)

                    if string_key in index_fields and hasattr(index_fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index_fields[string_key].is_multivalued:
                            # Covers both None and an empty stored value.
                            if not value:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(',')
                        else:
                            additional_fields[string_key] = index_fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                # These two are passed to the result class positionally below.
                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter('em')
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name),
                        terms,
                        sa,
                        ContextFragmenter(),
                        formatter
                    )
                    additional_fields['highlighted'] = {
                        self.content_field_name: [whoosh_result],
                    }

                result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """
        Build a spelling suggestion for ``query_string``.

        Reserved words and characters are stripped from the query, each
        remaining word is replaced by the corrector's top suggestion (words
        without a suggestion are dropped), and the suggestions are joined with
        single spaces. Returns ``None`` for an empty query.
        """
        # Nothing to correct: bail out before opening an index reader.
        if not query_string:
            return None

        cleaned_query = force_text(query_string)

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        reader = self.index.reader()

        try:
            corrector = reader.corrector(self.content_field_name)

            for word in query_words:
                suggestions = corrector.suggest(word, limit=1)

                if len(suggestions) > 0:
                    suggested_words.append(suggestions[0])
        finally:
            # Release the reader even if the corrector raises.
            reader.close()

        return ' '.join(suggested_words)

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_text(v) for v in value])
        elif isinstance(value, (six.integer_types, float)):
            # Leave it alone.
            pass
        else:
            value = force_text(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(converted_value, (list, tuple, set, dict, six.integer_types, float, complex)):
                return converted_value
        except:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass

        return value
Example #38
0
def build_index(sa_session, whoosh_index_dir, path_to_repositories,
                hgweb_config_dir):
    """
    Build the search indexes. One for repositories and another for tools within.

    If ``whoosh_index_dir`` does not exist yet, the indexes are written there
    directly. Otherwise they are built in temporary directories and copied
    over the existing index once both commits have succeeded, after which the
    temporary directories are removed.
    """
    #  Rare race condition exists here and below
    tool_index_dir = os.path.join(whoosh_index_dir, 'tools')
    use_tmp_dirs = os.path.exists(whoosh_index_dir)
    if use_tmp_dirs:
        # Index exists, prevent in-place index regeneration
        work_repo_dir = tempfile.mkdtemp(prefix="tmp-whoosh-repo")
        work_tool_dir = tempfile.mkdtemp(prefix="tmp-whoosh-tool")
    else:
        os.makedirs(whoosh_index_dir)
        os.makedirs(tool_index_dir)
        work_repo_dir = whoosh_index_dir
        work_tool_dir = tool_index_dir

    repo_index_storage = FileStorage(work_repo_dir)
    tool_index_storage = FileStorage(work_tool_dir)

    repo_index = repo_index_storage.create_index(repo_schema)
    tool_index = tool_index_storage.create_index(tool_schema)

    repo_index_writer = repo_index.writer()
    tool_index_writer = tool_index.writer()

    repos_indexed = 0
    tools_indexed = 0

    for repo in get_repos(sa_session, path_to_repositories, hgweb_config_dir):

        repo_index_writer.add_document(
            id=repo.get('id'),
            name=unicodify(repo.get('name')),
            description=unicodify(repo.get('description')),
            long_description=unicodify(repo.get('long_description')),
            homepage_url=unicodify(repo.get('homepage_url')),
            remote_repository_url=unicodify(repo.get('remote_repository_url')),
            repo_owner_username=unicodify(repo.get('repo_owner_username')),
            categories=unicodify(repo.get('categories')),
            times_downloaded=repo.get('times_downloaded'),
            approved=repo.get('approved'),
            last_updated=repo.get('last_updated'),
            full_last_updated=repo.get('full_last_updated'),
            repo_lineage=unicodify(repo.get('repo_lineage')))
        #  Tools get their own index
        for tool in repo.get('tools_list'):
            tool_index_writer.add_document(
                id=unicodify(tool.get('id')),
                name=unicodify(tool.get('name')),
                version=unicodify(tool.get('version')),
                description=unicodify(tool.get('description')),
                help=unicodify(tool.get('help')),
                repo_owner_username=unicodify(repo.get('repo_owner_username')),
                repo_name=unicodify(repo.get('name')),
                repo_id=repo.get('id'))
            tools_indexed += 1
            print(tools_indexed, 'tools (', tool.get('id'), ')')

        repos_indexed += 1
        print(repos_indexed, 'repos (', repo.get('id'), ')')

    tool_index_writer.commit()
    repo_index_writer.commit()

    print("TOTAL repos indexed: ", repos_indexed)
    print("TOTAL tools indexed: ", tools_indexed)

    # Copy the built indexes if we were working in a tmp folder
    if use_tmp_dirs:
        shutil.rmtree(whoosh_index_dir)
        os.makedirs(whoosh_index_dir)
        os.makedirs(tool_index_dir)
        copy_tree(work_repo_dir, whoosh_index_dir)
        copy_tree(work_tool_dir, tool_index_dir)
        # Remove both scratch directories, not just the repository one.
        shutil.rmtree(work_repo_dir)
        shutil.rmtree(work_tool_dir)
Example #39
0
 def create_indexes(cls):
     """Create the index named "songs" in file storage rooted at ``index_dir``."""
     FileStorage(index_dir).create_index(SongIndexSchema, indexname="songs")
Example #40
0
    stream_handler = logging.StreamHandler()
    app.logger.addHandler(stream_handler)
    app.logger.setLevel(logging.INFO)
    app.logger.info('microblog startup')

enable_search = WHOOSH_ENABLED
if enable_search:
    # A missing base directory is taken to mean no index was created yet.
    search_is_new = not os.path.exists(WHOOSH_BASE)
    if search_is_new:
        os.mkdir(WHOOSH_BASE)
    search_storage = FileStorage(WHOOSH_BASE)
    if search_is_new:
        schema = Schema(id=ID(stored=True), body=TEXT())
        search_ix = search_storage.create_index(schema)
    else:
        search_ix = search_storage.open_index()

class CustomJSONEncoder(JSONEncoder):
    """JSON encoder that also serializes lazy translation strings.

    This is necessary when flashing translated texts."""

    def default(self, obj):
        """Return the text of a lazy string; defer to the base encoder otherwise."""
        from speaklater import is_lazy_string

        if not is_lazy_string(obj):
            return super(CustomJSONEncoder, self).default(obj)

        try:
            text = unicode(obj)  # python 2
        except NameError:
            text = str(obj)  # python 3
        return text
Example #41
0
def create_index(sender=None, **kwargs):
    """Create the 'memopol' index directory and index if the directory is missing.

    Nothing is done when ``settings.WHOOSH_INDEX`` already exists.
    """
    index_path = settings.WHOOSH_INDEX
    if os.path.exists(index_path):
        return

    os.mkdir(settings.WHOOSH_INDEX)
    FileStorage(settings.WHOOSH_INDEX).create_index(WHOOSH_SCHEMA, indexname='memopol')
Example #42
0
class RedisWhooshStore(SAMLStoreBase
                       ):  # TODO: This needs a gc mechanism for keys (uuids)
    def json_dict(self, name):
        """Wrap a ``JSONDict`` keyed ``<store name>_<name>`` in an ``LRUProxyDict``."""
        redis_key = '{}_{}'.format(self._name, name)
        backing = JSONDict(key=redis_key, redis=self._redis, writeback=True)
        return LRUProxyDict(backing, maxsize=config.cache_size)

    def xml_dict(self, name):
        """Wrap an ``XMLDict`` keyed ``<store name>_<name>`` in an ``LRUProxyDict``."""
        redis_key = '{}_{}'.format(self._name, name)
        backing = XMLDict(key=redis_key, redis=self._redis, writeback=True)
        return LRUProxyDict(backing, maxsize=config.cache_size)

    def __init__(self, *args, **kwargs):
        """
        Set up the store.

        Recognised keyword arguments: ``directory`` (index directory, default
        ``'.whoosh'``), ``clear`` (wipe the index directory and the redis
        keys, default ``config.store_clear``), ``name`` (default
        ``config.store_name``) and ``redis`` (a connection object; one is
        created with ``redis()`` when omitted).
        """
        self._dir = kwargs.pop('directory', '.whoosh')
        clear = bool(kwargs.pop('clear', config.store_clear))
        self._name = kwargs.pop('name', config.store_name)
        # Only build a default connection when the caller did not pass one;
        # a ``pop`` default would be evaluated unconditionally.
        self._redis = kwargs.pop('redis', None)
        if self._redis is None:
            self._redis = redis()
        # The directory may not exist yet (e.g. first run with clear=True).
        if clear and os.path.isdir(self._dir):
            shutil.rmtree(self._dir)
        now = datetime.now()
        self._last_index_time = now
        self._last_modified = now
        self._setup()
        if clear:
            self.reset()

    def _setup(self):
        """
        (Re)create the redis connection, schema, backing dicts and index.

        Called from ``__init__`` and ``__setstate__``. If the existing index
        cannot be opened, the storage and index are created and the store is
        reindexed.
        """
        self._redis = getattr(self, '_redis', None)
        if not self._redis:
            self._redis = redis(
            )  # XXX test cases won't get correctly unpicked because of this
        self.schema = Schema(content=NGRAMWORDS(stored=False))
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        self.schema.add('sha1', ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self.objects = self.xml_dict('objects')
        self.parts = self.json_dict('parts')
        self.storage = FileStorage(os.path.join(self._dir, self._name))
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except Exception as ex:
            # Exception rather than BaseException: an interrupt or interpreter
            # exit must not be mistaken for a missing index and trigger a
            # rebuild.
            log.warn(ex)
            self.storage.create()
            self.index = self.storage.create_index(self.schema)
            self._reindex()

    def __getstate__(self):
        """Return the pickled state: directory, name and the two timestamps."""
        persisted = ('_dir', '_name', '_last_index_time', '_last_modified')
        return {attr: getattr(self, attr) for attr in persisted}

    def __setstate__(self, state):
        """Restore the attributes in ``state``, then rebuild the rest via ``_setup()``."""
        self.__dict__.update(state)
        # The redis connection, schema, dicts and index are not part of the
        # pickled state, so they are recreated here.
        self._setup()

    def __call__(self, *args, **kwargs):
        """Delegate to the parent and schedule a reindex job.

        Does nothing unless both ``watched`` and ``scheduler`` keyword
        arguments are supplied.
        """
        watched = kwargs.pop('watched', None)
        scheduler = kwargs.pop('scheduler', None)
        if watched is None or scheduler is None:
            return

        super(RedisWhooshStore, self).__call__(watched=watched,
                                               scheduler=scheduler)
        log.debug("indexing using {}".format(scheduler))
        job_options = dict(
            args=[self],
            max_instances=1,
            coalesce=True,
            misfire_grace_time=2 * config.update_frequency,
        )
        scheduler.add_job(RedisWhooshStore._reindex, **job_options)

    def _reindex(self):
        """
        Rebuild the Whoosh index from the objects held in the store.

        Objects that no part lists among its ``items`` are deleted from
        ``self.objects`` and ``self.parts``; every remaining object is then
        added to the index. The work is done while holding the index lock
        named "reindex".
        """
        log.debug("indexing the store...")
        self._last_index_time = datetime.now()
        # Collect every object reference that at least one part still lists.
        seen = set()
        refs = set([b2u(s) for s in self.objects.keys()])
        parts = self.parts.values()
        for ref in refs:
            for part in parts:
                if ref in part['items']:
                    seen.add(ref)

        ix = self.storage.open_index()
        lock = ix.lock("reindex")
        try:
            log.debug("waiting for index lock")
            # NOTE(review): True is presumably the "blocking" flag - confirm
            # against the lock implementation.
            lock.acquire(True)
            log.debug("got index lock")
            with ix.writer() as writer:
                # Drop objects that no part refers to any more.
                for ref in refs:
                    if ref not in seen:
                        log.debug("removing unseen ref {}".format(ref))
                        del self.objects[ref]
                        del self.parts[ref]

                log.debug("updating index")
                for e in self.objects.values():
                    info = self._index_prep(entity_simple_info(e))
                    ref = object_id(e)
                    writer.add_document(object_id=ref, **info)

                # NOTE(review): presumably whoosh.writing.CLEAR, so the commit
                # on leaving the ``with`` block replaces the previous index
                # contents - confirm.
                writer.mergetype = CLEAR
        finally:
            try:
                log.debug("releasing index lock")
                lock.release()
            except ThreadError as ex:
                # Raised when the lock was never acquired; nothing to release.
                pass

    def dump(self):
        """Print the hits returned by an ``Every('object_id')`` query on the index."""
        from whoosh.query import Every
        ix = self.storage.open_index()
        with ix.searcher() as searcher:
            # Search with the managed searcher so it is closed on exit,
            # instead of opening a second one that is never closed.
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        """
        Turn an entity info dict into the keyword arguments for ``add_document``.

        ``entity_attributes`` (if present) is flattened into ``info`` in
        place. The result holds a ``content`` string joined from the
        descriptive fields, lower-cased values for every key that maps to a
        schema field, and a ``sha1`` value derived from ``info['entity_id']``.
        """
        res = dict()
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v

        content = " ".join(
            filter(lambda x: x is not None, [
                info.get(x, '') for x in ('service_name', 'title', 'domain',
                                          'keywords', 'scopes')
            ]))
        res['content'] = content.strip()
        # The schema does not change inside this loop; look the names up once.
        schema_names = self.schema.names()
        for a, v in info.items():
            k = a
            if a in ATTRS_INV:
                k = ATTRS_INV[a]

            if k in schema_names:
                # isinstance rather than an exact type() match, so subclasses
                # of str/list/tuple are indexed instead of silently dropped.
                if isinstance(v, (list, tuple)):
                    res[k] = " ".join([vv.lower() for vv in v])
                elif isinstance(v, six.string_types):
                    res[k] = v.lower()
        res['sha1'] = hash_id(info['entity_id'], prefix=False)
        return res

    def update(self, t, tid=None, etag=None, lazy=True):
        """
        Store the entity or entities in ``t`` and record them as a part.

        A single EntityDescriptor is stored only when ``etag`` is given and
        differs from the recorded one. An EntitiesDescriptor is stored under
        ``tid`` (default: its ``Name``) when its etag (default: a sha256 of
        the serialized tree) differs from the recorded one. Reindexes
        immediately unless ``lazy`` is true.
        """
        relt = root(t)
        assert (relt is not None)

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            ref = object_id(relt)
            known = self.parts[ref] if ref in self.parts else None
            if etag is not None and (known is None
                                     or known.get('etag', None) != etag):
                self.parts[ref] = {
                    'id': relt.get('entityID'),
                    'etag': etag,
                    'count': 1,
                    'items': [ref]
                }
                self.objects[ref] = relt
                self._last_modified = datetime.now()
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            if etag is None:
                etag = hex_digest(dumptree(t, pretty_print=False), 'sha256')
            known = self.parts[tid] if tid in self.parts else None
            if known is None or known.get('etag', None) != etag:
                members = set()
                for entity in iter_entities(t):
                    member_ref = object_id(entity)
                    members.add(member_ref)
                    self.objects[member_ref] = entity
                self.parts[tid] = {
                    'id': tid,
                    'count': len(members),
                    'etag': etag,
                    'items': list(members)
                }
                self._last_modified = datetime.now()

        if lazy:
            return
        self._reindex()

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def collections(self):
        """Return the keys of the parts mapping, each passed through ``b2u``."""
        names = []
        for part_key in self.parts.keys():
            names.append(b2u(part_key))
        return names

    def reset(self):
        """Delete the redis keys backing the ``parts`` and ``objects`` dicts."""
        for k in ('{}_{}'.format(self._name, 'parts'),
                  '{}_{}'.format(self._name, 'objects')):
            # Delete each key once, using the loop variable.
            self._redis.delete(k)

    def size(self, a=None, v=None):
        """Count stored objects, values of attribute ``a``, or matches of ``a=v``."""
        if a is None:
            return len(self.objects.keys())
        if v is None:
            return len(self.attribute(a))
        return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        """Yield ``b2u(ATTRS[name])`` for each indexed field name found in ``ATTRS``."""
        with self.storage.open_index().reader() as reader:
            for field_name in reader.indexed_field_names():
                if field_name not in ATTRS:
                    continue
                yield b2u(ATTRS[field_name])

    def attributes(self):
        """Return the values yielded by ``_attributes()`` as a list, passed through ``b2u``."""
        return b2u(list(self._attributes()))

    def attribute(self, a):
        """Return the indexed terms of the field ``a`` maps to, or ``[]`` if ``a`` is unknown."""
        if a not in ATTRS_INV:
            return []

        field_name = ATTRS_INV[a]
        with self.storage.open_index().searcher() as searcher:
            terms = list(searcher.lexicon(field_name))
            return b2u(terms)

    def _prep_key(self, key):
        """
        Rewrite a lookup key into a query string for the Whoosh query parser.

        ``+`` becomes ``AND``, ``-`` becomes ``AND NOT``, every key of
        ``ATTRS_INV`` occurring in the text is replaced by its mapped value,
        and ``name=value`` / ``{name}value`` pairs become ``name:value``.
        """
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        key = key.replace('-', ' AND NOT ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        # Raw strings: ``\S`` is not a valid escape in a normal string literal.
        key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
        key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
        key = key.strip()

        return key

    def _entities(self):
        """Return every stored object referenced by some part, without duplicates."""
        candidates = (
            self.objects.get(ref, None)
            for part in self.parts.values()
            for ref in part['items']
        )
        found = set(entity for entity in candidates if entity is not None)
        return b2u(list(found))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def lookup(self, key):
        """
        Resolve ``key`` to a list of stored objects.

        ``None`` or ``'entities'`` returns everything; a key naming an object
        returns that object; a key naming a part returns the lookups of its
        items; anything else is run as an index query against ``object_id``.
        """
        if key is None or key == 'entities':
            return self._entities()

        bkey = six.b(key)
        if bkey in self.objects:
            return [self.objects.get(bkey)]

        if bkey in self.parts:
            members = []
            for item in self.parts.get(bkey)['items']:
                members.extend(self.lookup(item))
            return members

        prepared = self._prep_key(key)
        parsed = QueryParser("object_id", schema=self.schema).parse(prepared)
        matched = set()
        with self.index.searcher() as searcher:
            for hit in searcher.search(parsed, limit=None):
                entity = self.objects.get(hit['object_id'], None)
                if entity is None:
                    continue
                matched.add(entity)

        return b2u(list(matched))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def search(self, query=None, path=None, entity_filter=None, related=None):
        """
        Search the ``content`` and ``domain`` fields and return discojson entries.

        ``entity_filter``, when given, is ANDed onto ``query``. ``path`` and
        ``related`` are accepted but not used here.
        """
        if entity_filter:
            query = "{!s} AND {!s}".format(query, entity_filter)
        prepared = self._prep_key(query)
        parsed = MultifieldParser(['content', 'domain'], schema=self.schema).parse(prepared)
        refs = set()
        with self.index.searcher() as searcher:
            results = searcher.search(parsed, limit=None)
            log.debug(results)
            for hit in results:
                refs.add(hit['object_id'])

        found = list()
        for ref in refs:
            entity = self.objects.get(ref, None)
            if entity is None:
                continue
            found.append(discojson(entity))
        return found
class WhooshSearchBackend(BaseSearchBackend):
    """Search backend that keeps its index in Whoosh file or RAM storage."""

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        "\\",
        "+",
        "-",
        "&&",
        "||",
        "!",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        "^",
        '"',
        "~",
        "*",
        "?",
        ":",
        ".",
    )

    def __init__(self, connection_alias, **connection_options):
        """
        Read the connection options; index loading is deferred to ``setup()``.

        Options used: ``POST_LIMIT`` (default 128 MiB), ``PATH`` and
        ``STORAGE`` (``"file"`` unless set otherwise).

        Raises ``ImproperlyConfigured`` when file storage is selected without
        a ``PATH``.
        """
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # connection_options is a dict: getattr() would never find the key
        # and always fall back to the default.
        self.post_limit = connection_options.get("POST_LIMIT", 128 * 1024 * 1024)
        self.path = connection_options.get("PATH")

        if connection_options.get("STORAGE", "file") != "file":
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'." % connection_alias
            )

        self.log = logging.getLogger("haystack")

    def setup(self):
        """
        Open (or create) the storage, schema, parser and index on first use.

        Raises ``IOError`` when the file-storage path is not writable.
        """
        from haystack import connections

        created_dir = False

        if self.use_file_storage:
            # Make sure the index is there.
            if not os.path.exists(self.path):
                os.makedirs(self.path)
                created_dir = True

            if not os.access(self.path, os.W_OK):
                raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

            self.storage = FileStorage(self.path)
        else:
            # A single RAM store is kept on LOCALS and reused.
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        unified_index = connections[self.connection_alias].get_unified_index()
        self.content_field_name, self.schema = self.build_schema(unified_index.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if created_dir:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Map search fields to a Whoosh schema.

        Returns a ``(content_field_name, Schema)`` tuple, where the content
        field name is the index field name of the field flagged ``document``.
        Raises ``SearchBackendError`` when ``fields`` adds nothing beyond the
        built-in ID fields.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Remember how many keys are hard-coded so an empty field set can be
        # reported below instead of failing inside the backend.
        builtin_count = len(schema_fields)
        content_field_name = ""

        for field_class in fields.values():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    whoosh_field = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    whoosh_field = KEYWORD(
                        stored=True, commas=True, scorable=True, field_boost=field_class.boost
                    )
            else:
                kind = field_class.field_type

                if kind in ("date", "datetime"):
                    whoosh_field = DATETIME(stored=field_class.stored)
                elif kind == "integer":
                    whoosh_field = NUMERIC(
                        stored=field_class.stored, type=int, field_boost=field_class.boost
                    )
                elif kind == "float":
                    whoosh_field = NUMERIC(
                        stored=field_class.stored, type=float, field_boost=field_class.boost
                    )
                elif kind == "boolean":
                    # Field boost isn't supported on BOOLEAN as of 1.8.2.
                    whoosh_field = BOOLEAN(stored=field_class.stored)
                elif kind == "ngram":
                    whoosh_field = NGRAM(
                        minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost
                    )
                elif kind == "edge_ngram":
                    whoosh_field = NGRAMWORDS(
                        minsize=2, maxsize=15, at="start", stored=field_class.stored, field_boost=field_class.boost
                    )
                else:
                    whoosh_field = TEXT(
                        stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost
                    )

            schema_fields[field_class.index_fieldname] = whoosh_field

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= builtin_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Add or replace the documents for every object in ``iterable``.

        Each object is prepared with ``index.full_prepare`` and written with
        ``update_document``. A failure on one object is logged and skipped
        when ``silently_fail`` is set, and re-raised otherwise. The writer is
        committed whenever ``iterable`` is non-empty; ``commit`` is accepted
        but not consulted.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    # Exception instances have no __name__; use the class name.
                    u"%s while preparing object for update" % e.__class__.__name__,
                    exc_info=True,
                    extra={"data": {"index": index, "object": get_identifier(obj)}},
                )

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
Example #44
0
class SearchMigrationTest(TestCase):
    """Verify that search indexes can be obtained for several schema layouts."""

    def setUp(self):
        """Swap weblate's search storage for one in a new temporary directory."""
        self.path = tempfile.mkdtemp()
        self.backup = weblate.trans.search.STORAGE
        self.storage = FileStorage(self.path)
        weblate.trans.search.STORAGE = self.storage
        self.storage.create()

    def tearDown(self):
        """Delete the temporary directory (if still present) and restore storage."""
        temp_dir = self.path
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        weblate.trans.search.STORAGE = self.backup

    def do_test(self, source, target):
        """
        Optionally pre-create the indexes, then check both can be obtained.

        ``source`` / ``target`` are schemas used to create the 'source' and
        'target-cs' indexes beforehand; ``None`` skips that creation.
        """
        preexisting = ((source, 'source'), (target, 'target-cs'))
        for schema, index_name in preexisting:
            if schema is not None:
                self.storage.create_index(schema, index_name)

        source_index = weblate.trans.search.get_source_index()
        self.assertIsNotNone(source_index)

        target_index = weblate.trans.search.get_target_index('cs')
        self.assertIsNotNone(target_index)

    def test_nonexisting(self):
        # No index has been created beforehand.
        self.do_test(None, None)

    def test_nonexisting_dir(self):
        # Even the storage directory is gone.
        shutil.rmtree(self.path)
        self.do_test(None, None)

    def test_current(self):
        # Indexes already exist with the schemas weblate currently defines.
        self.do_test(
            weblate.trans.search.SourceSchema,
            weblate.trans.search.TargetSchema,
        )

    def test_2_4(self):
        # Schemas with checksum/source/context/location and
        # checksum/target/comment fields.
        old_source = Schema(
            checksum=ID(stored=True, unique=True),
            source=TEXT(),
            context=TEXT(),
            location=TEXT()
        )
        old_target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
            comment=TEXT(),
        )
        self.do_test(old_source, old_target)

    def test_2_1(self):
        # Schemas with checksum/source/context and checksum/target fields.
        old_source = Schema(
            checksum=ID(stored=True, unique=True),
            source=TEXT(),
            context=TEXT(),
        )
        old_target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
        )
        self.do_test(old_source, old_target)
Example #45
0
class SearchMigrationTest(TestCase, TempDirMixin):
    """Verify that indexes of several schema layouts can be written and searched."""

    def setUp(self):
        """Swap weblate's search storage for one in a new temporary directory."""
        self.create_temp()
        self.backup = weblate.trans.search.STORAGE
        self.storage = FileStorage(self.tempdir)
        weblate.trans.search.STORAGE = self.storage
        self.storage.create()

    def tearDown(self):
        """Remove the temporary directory and restore the original storage."""
        self.remove_temp()
        weblate.trans.search.STORAGE = self.backup

    def do_test(self, source, target):
        """
        Optionally pre-create the indexes, then write one document to each
        and check that every searched field finds it.

        ``source`` / ``target`` are schemas used to create the 'source' and
        'target-cs' indexes beforehand; ``None`` skips that creation.
        """
        preexisting = ((source, 'source'), (target, 'target-cs'))
        for schema, index_name in preexisting:
            if schema is not None:
                self.storage.create_index(schema, index_name)

        source_index = weblate.trans.search.get_source_index()
        self.assertIsNotNone(source_index)
        target_index = weblate.trans.search.get_target_index('cs')
        self.assertIsNotNone(target_index)

        # One document in the source index ...
        source_writer = source_index.writer()
        source_writer.update_document(
            pk=1, source="source", context="context", location="location"
        )
        source_writer.commit()

        # ... and one in the target index.
        target_writer = target_index.writer()
        target_writer.update_document(pk=1, target="target", comment="comment")
        target_writer.commit()

        for field in ('source', 'context', 'location', 'target'):
            found = fulltext_search(field, ['cs'], {field: True})
            self.assertEqual(found, set([1]))

    def test_nonexisting(self):
        # No index has been created beforehand.
        self.do_test(None, None)

    def test_nonexisting_dir(self):
        # Even the storage directory is gone; tempdir is reset to None after
        # the manual removal.
        shutil.rmtree(self.tempdir)
        self.tempdir = None
        self.do_test(None, None)

    def test_current(self):
        # Indexes already exist with the schemas weblate currently defines.
        self.do_test(
            weblate.trans.search.SourceSchema,
            weblate.trans.search.TargetSchema,
        )

    def test_2_4(self):
        # Schemas with checksum/source/context/location and
        # checksum/target/comment fields.
        old_source = Schema(
            checksum=ID(stored=True, unique=True),
            source=TEXT(),
            context=TEXT(),
            location=TEXT()
        )
        old_target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
            comment=TEXT(),
        )
        self.do_test(old_source, old_target)

    def test_2_1(self):
        # Schemas with checksum/source/context and checksum/target fields.
        old_source = Schema(
            checksum=ID(stored=True, unique=True),
            source=TEXT(),
            context=TEXT(),
        )
        old_target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
        )
        self.do_test(old_source, old_target)
Example #46
0
 def create_indexes(cls):
     """Create the "songs" index, using SongIndexSchema, in the index_dir storage."""
     FileStorage(index_dir).create_index(SongIndexSchema, indexname="songs")
Example #47
0
def create_index():
    """Create the "rarog" index under settings.WHOOSH_INDEX, making the directory if needed."""
    index_path = settings.WHOOSH_INDEX
    if not os.path.exists(index_path):
        os.makedirs(index_path)
    FileStorage(index_path).create_index(schema=WHOOSH_SCHEMA, indexname="rarog")
Example #48
0
class SearchMigrationTest(TestCase, TempDirMixin):
    """Exercise writing to and searching indexes built from several schema layouts."""

    def setUp(self):
        """Install a FileStorage rooted in a new temporary directory."""
        self.create_temp()
        self.backup = weblate.trans.search.STORAGE
        self.storage = FileStorage(self.tempdir)
        weblate.trans.search.STORAGE = self.storage
        self.storage.create()

    def tearDown(self):
        """Drop the temporary directory and put the saved storage back."""
        self.remove_temp()
        weblate.trans.search.STORAGE = self.backup

    def do_test(self, source, target):
        """
        Run the shared scenario for one pair of pre-existing schemas.

        When ``source`` / ``target`` is not ``None`` the matching index
        ('source' / 'target-cs') is created with it first. Afterwards one
        document is written to each index and each searched field must
        return exactly that document.
        """
        if source is not None:
            self.storage.create_index(source, 'source')
        if target is not None:
            self.storage.create_index(target, 'target-cs')

        src_index = weblate.trans.search.get_source_index()
        self.assertIsNotNone(src_index)
        tgt_index = weblate.trans.search.get_target_index('cs')
        self.assertIsNotNone(tgt_index)

        src_document = {
            'pk': 1,
            'source': "source",
            'context': "context",
            'location': "location",
        }
        src_writer = src_index.writer()
        src_writer.update_document(**src_document)
        src_writer.commit()

        tgt_document = {'pk': 1, 'target': "target", 'comment': "comment"}
        tgt_writer = tgt_index.writer()
        tgt_writer.update_document(**tgt_document)
        tgt_writer.commit()

        searched_fields = ('source', 'context', 'location', 'target')
        for name in searched_fields:
            matches = fulltext_search(name, ['cs'], {name: True})
            self.assertEqual(matches, set([1]))

    def test_nonexisting(self):
        # Nothing is created up front.
        self.do_test(None, None)

    def test_nonexisting_dir(self):
        # The storage directory itself is removed before running.
        shutil.rmtree(self.tempdir)
        self.tempdir = None
        self.do_test(None, None)

    def test_current(self):
        # Pre-create both indexes with weblate's current schemas.
        current_source = weblate.trans.search.SourceSchema
        current_target = weblate.trans.search.TargetSchema
        self.do_test(current_source, current_target)

    def test_2_4(self):
        # Source schema with a location field, target schema with a comment field.
        legacy_source = Schema(
            checksum=ID(stored=True, unique=True),
            source=TEXT(),
            context=TEXT(),
            location=TEXT(),
        )
        legacy_target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
            comment=TEXT(),
        )
        self.do_test(legacy_source, legacy_target)

    def test_2_1(self):
        # Source schema without location, target schema without comment.
        legacy_source = Schema(
            checksum=ID(stored=True, unique=True),
            source=TEXT(),
            context=TEXT(),
        )
        legacy_target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
        )
        self.do_test(legacy_source, legacy_target)
Example #49
0
import os

import whoosh
# Import the submodules explicitly instead of relying on them being loaded
# as a side effect of importing whoosh.filedb.filestore.
import whoosh.fields
import whoosh.index
from whoosh.filedb.filestore import FileStorage

# Fields stored for each indexed commit.
schema_commit = whoosh.fields.Schema(
    repository_id=whoosh.fields.ID(stored=True),
    commit_id=whoosh.fields.ID(stored=True),
    author=whoosh.fields.TEXT(stored=True),
    date=whoosh.fields.DATETIME,
    message=whoosh.fields.ID(stored=True),
)

indexdir = "indexdir"
# Single source of truth for the index name, so the existence check and the
# open/create calls cannot disagree.
indexname = "usages"

# Make sure the directory exists before any storage object touches it.
if not os.path.exists(indexdir):
    os.mkdir(indexdir)

storage = FileStorage(indexdir)

# Check for the index we actually use; without indexname the check looks for
# the default index and would re-create an existing "usages" index.
exists = whoosh.index.exists_in(indexdir, indexname=indexname)

if exists:
    ix = storage.open_index(indexname=indexname)
else:
    ix = storage.create_index(schema_commit, indexname=indexname)
Example #50
0
class WhooshSearchBackend(BaseSearchBackend):
    """
    Search backend that keeps its index in Whoosh.

    The index lives either in a directory (``PATH`` connection option, the
    default) or in a shared in-memory ``RamStorage`` when the ``STORAGE``
    option is anything other than ``'file'``. Index setup is deferred until
    first use (see ``setup``).
    """
    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        """
        Read the connection options; no index is opened here.

        :param connection_alias: name of the connection, used in error
            messages and to look up the unified index in ``setup``.
        :param connection_options: options dict; ``POST_LIMIT``, ``PATH`` and
            ``STORAGE`` are read here.
        :raises ImproperlyConfigured: if file storage is selected and no
            ``PATH`` is given.
        """
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # connection_options is a dict: getattr() would never find the key and
        # always return the default, so read it with .get().
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

        # update() reports failures through self.log; provide a logger unless
        # the base class has already set one.
        if not hasattr(self, 'log'):
            import logging
            self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.

        Creates the index directory when missing, selects the storage, builds
        the schema and query parser, and opens (or creates) the index.

        :raises IOError: if the index path is not writable.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            # The RAM store is shared through LOCALS and created on first use.
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # The directory exists but holds no index yet.
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Translate search fields into a Whoosh schema.

        :param fields: mapping of field name to field object; each field's
            ``index_fieldname`` becomes the schema key.
        :returns: ``(content_field_name, Schema)`` where ``content_field_name``
            is the ``index_fieldname`` of the last field flagged as
            ``document`` (``''`` if there is none).
        :raises SearchBackendError: if ``fields`` adds nothing beyond the
            built-in keys.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Prepare every object in ``iterable`` and write it to the Whoosh index.

        :param index: search index object; its ``full_prepare(obj)`` builds
            the document dict for each object.
        :param iterable: sized collection of objects to index.
        :param commit: accepted for interface compatibility; the writer is
            committed whenever ``iterable`` is non-empty, regardless of it.
        :raises Exception: whatever ``update_document`` raises, unless
            ``self.silently_fail`` is set, in which case the error is logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Whoosh: %s", e)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
# Create the "indexdir" directory if it does not exist yet.
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")

# Create an index in that directory using `schema`.
ix = index.create_in("indexdir", schema)
打开一个已经存在于某个目录中的索引，使用 index.open_dir()：
[python] view plain copy
import whoosh.index as index

# Open the index stored in the "indexdir" directory.
ix = index.open_dir("indexdir")
上面这些是便利函数，等价于下面使用 Storage 对象的写法：
[python] view plain copy
from whoosh.filedb.filestore import FileStorage
# Storage object for the "indexdir" directory.
storage = FileStorage("indexdir")

# Create an index
ix = storage.create_index(schema)

# Open an existing index
storage.open_index()
创建 index 时传入的 schema 对象会被序列化，并与 index 一起存储。
你可以在同一个目录下使用多个索引，通过 indexname 关键字参数加以区分：
[python] view plain copy
# Both variants below create and then open an index named "usages".

# Using the convenience functions
ix = index.create_in("indexdir", schema=schema, indexname="usages")
ix = index.open_dir("indexdir", indexname="usages")

# Using the Storage object
ix = storage.create_index(schema, indexname="usages")
ix = storage.open_index(indexname="usages")

Clearing the index
Example #52
0
class SearchBackend(BaseSearchBackend):
    """
    Search backend that keeps its index in Whoosh.

    Configuration is read from Django ``settings``: ``HAYSTACK_WHOOSH_PATH``
    (index directory), ``HAYSTACK_WHOOSH_STORAGE`` (anything other than
    ``'file'`` selects a shared in-memory store),
    ``HAYSTACK_WHOOSH_POST_LIMIT``, ``HAYSTACK_INCLUDE_SPELLING`` and
    ``HAYSTACK_LIMIT_TO_REGISTERED_MODELS``. Index setup is deferred until
    first use (see ``setup``).
    """
    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, site=None):
        """
        Read the storage settings; no index is opened here.

        :raises ImproperlyConfigured: if file storage is selected and
            ``HAYSTACK_WHOOSH_PATH`` is not defined in settings.
        """
        super(SearchBackend, self).__init__(site)
        self.setup_complete = False
        self.use_file_storage = True
        self.post_limit = getattr(settings, 'HAYSTACK_WHOOSH_POST_LIMIT', 128 * 1024 * 1024)

        if getattr(settings, 'HAYSTACK_WHOOSH_STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not hasattr(settings, 'HAYSTACK_WHOOSH_PATH'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_WHOOSH_PATH in your settings.')

    def setup(self):
        """
        Defers loading until needed.

        Creates the index directory when missing, selects the storage, builds
        the schema and query parser, and opens (or creates) the index.

        :raises IOError: if the index path is not writable.
        """
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
            new_index = True

        if self.use_file_storage and not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % settings.HAYSTACK_WHOOSH_PATH)

        if self.use_file_storage:
            self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
        else:
            global LOCALS

            # The RAM store is shared through LOCALS and created on first use.
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # The directory exists but holds no index yet.
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Translate search fields into a Whoosh schema.

        :param fields: mapping of field name to field object; each field's
            ``index_fieldname`` becomes the schema key.
        :returns: ``(content_field_name, Schema)`` where ``content_field_name``
            is the ``index_fieldname`` of the last field flagged as
            ``document`` (``''`` if there is none).
        :raises SearchBackendError: if ``fields`` adds nothing beyond the
            built-in keys.
        """
        schema_fields = {
            'id': ID(stored=True, unique=True),
            'django_ct': ID(stored=True),
            'django_id': ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float)
            elif field_class.field_type == 'boolean':
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer())

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Prepare every object in ``iterable`` and write it to the Whoosh index.

        :param index: search index object; its ``full_prepare(obj)`` builds
            the document dict for each object.
        :param iterable: sized collection of objects to index.
        :param commit: accepted for interface compatibility; the writer is
            committed whenever ``iterable`` is non-empty, regardless of it.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            writer.update_document(**doc)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)

    def remove(self, obj_or_string, commit=True):
        """
        Delete the document whose ``id`` matches ``get_identifier(obj_or_string)``.

        ``commit`` is accepted for interface compatibility and not used.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)
        self.index.delete_by_query(q=self.parser.parse(u'id:"%s"' % whoosh_id))

    def clear(self, models=None, commit=True):
        """
        Remove documents from the index.

        :param models: iterable of model classes whose documents should be
            removed; when empty or ``None`` the whole index is deleted and
            recreated.
        :param commit: accepted for interface compatibility and not used.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if not models:
            self.delete_index()
        else:
            models_to_delete = []

            for model in models:
                models_to_delete.append(u"django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name))

            self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))

    def delete_index(self):
        """Wipe the stored index (directory or RAM store) and run ``setup`` again."""
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            shutil.rmtree(settings.HAYSTACK_WHOOSH_PATH)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        """Refresh the index and call its ``optimize()``."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, spelling_query=None,
               limit_to_registered_models=None, **kwargs):
        """
        Run ``query_string`` against the index and return one page of results.

        :param query_string: query text; empty and one-character
            (non-``*``) queries return no results.
        :param sort_by: optional list of field names, ``-`` prefix meaning
            reverse order; only the first entry is passed on to Whoosh.
        :param start_offset: index of the first wanted result.
        :param end_offset: index one past the last wanted result; ``None``
            means no limit.
        :param highlight: when true, results carry a ``highlighted`` entry.
        :param facets, date_facets, query_facets: not supported; a warning is
            issued when given.
        :param narrow_queries: optional set of query strings used to filter
            the results.
        :param spelling_query: text used for the spelling suggestion instead
            of ``query_string``.
        :param limit_to_registered_models: restrict results to registered
            models; defaults to ``HAYSTACK_LIMIT_TO_REGISTERED_MODELS``.
        :returns: dict with ``results`` and ``hits`` and, on most paths,
            ``spelling_suggestion`` (plus ``facets`` when results were
            processed).
        :raises SearchBackendError: if more than one field is given and more
            than one of them is reversed.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_unicode(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if len(sort_by) > 1 and reverse_counter > 1:
                raise SearchBackendError("Whoosh does not handle more than one field and any field being ordered in reverse.")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            # Only the first field is used; an empty list means "no sorting"
            # rather than an IndexError.
            sort_by = sort_by_list[0] if sort_by_list else None

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if limit_to_registered_models:
            # Using narrow queries, limit the results to only models registered
            # with the current site.
            if narrow_queries is None:
                narrow_queries = set()

            registered_models = self.build_registered_models_list()

            if len(registered_models) > 0:
                narrow_queries.add('django_ct:(%s)' % ' OR '.join(registered_models))

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_unicode(nq)))

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            # Prevent against Whoosh throwing an error. Requires an end_offset
            # greater than 0.
            if end_offset is not None and end_offset <= 0:
                end_offset = 1

            raw_results = searcher.search(parsed_query, limit=end_offset, sortedby=sort_by, reverse=reverse)

            # Handle the case where the results have been narrowed.
            if narrowed_results:
                raw_results.filter(narrowed_results)

            # Determine the page.
            page_num = 0

            if end_offset is None:
                end_offset = 1000000

            if start_offset is None:
                start_offset = 0

            page_length = end_offset - start_offset

            if page_length and page_length > 0:
                # Floor division keeps the page number an integer regardless
                # of how "/" is defined for ints.
                page_num = start_offset // page_length

            # Increment because Whoosh uses 1-based page numbers.
            page_num += 1

            try:
                raw_page = ResultsPage(raw_results, page_num, page_length)
            except ValueError:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            return self._process_results(raw_page, highlight=highlight, query_string=query_string, spelling_query=spelling_query)
        else:
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None,
                       limit_to_registered_models=None, **kwargs):
        """Unsupported: issue a warning and return an empty result set."""
        warnings.warn("Whoosh does not handle More Like This.", Warning, stacklevel=2)
        return {
            'results': [],
            'hits': 0,
        }

    def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
        """
        Convert a page of raw Whoosh hits into ``SearchResult`` objects.

        Hits whose model cannot be loaded or is not indexed by the site are
        dropped and subtracted from the hit count.

        :returns: dict with ``results``, ``hits``, ``facets`` (always empty)
            and ``spelling_suggestion``.
        """
        from haystack import site
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        facets = {}
        spelling_suggestion = None
        indexed_models = site.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result['django_ct'].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                # The index depends only on the model, so look it up once per
                # hit instead of once per stored field.
                model_index = site.get_index(model)

                for key, value in raw_result.items():
                    string_key = str(key)

                    if string_key in model_index.fields and hasattr(model_index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if isinstance(model_index.fields[string_key], MultiValueField):
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(',')
                        else:
                            additional_fields[string_key] = model_index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del(additional_fields['django_ct'])
                del(additional_fields['django_id'])

                if highlight:
                    from whoosh import analysis
                    # Imported under another name so the ``highlight``
                    # argument is not overwritten.
                    from whoosh.highlight import highlight as whoosh_highlight, ContextFragmenter, UppercaseFormatter
                    sa = analysis.StemmingAnalyzer()
                    terms = [term.replace('*', '') for term in query_string.split()]

                    additional_fields['highlighted'] = {
                        self.content_field_name: [whoosh_highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                    }

                result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """
        Build a suggested query from the spell checker.

        Reserved words and characters are stripped from the query, then each
        remaining word is replaced by its top suggestion; words without a
        suggestion are left out.

        :returns: the suggested words joined by spaces, or ``None`` when
            ``query_string`` is empty.
        """
        # Nothing to suggest for an empty query; skip building the checker.
        if not query_string:
            return None

        sp = SpellChecker(self.storage)
        cleaned_query = force_unicode(query_string)

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = sp.suggest(word, number=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        return ' '.join(suggested_words)

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Dates without a time part become midnight datetimes, lists/tuples
        become comma-joined unicode, booleans and numbers pass through
        unchanged, and anything else is coerced with ``force_unicode``.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, (bool, int, long, float)):
            # Leave it alone.
            pass
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_unicode(v) for v in value])
        else:
            value = force_unicode(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, basestring):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(converted_value, (list, tuple, set, dict, int, float, long, complex)):
                return converted_value
        except Exception:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on. KeyboardInterrupt/SystemExit are deliberately not
            # swallowed.
            pass

        return value
Example #53
0
def setup_index():
    """
    Create the file storage under ``data_dir('memory')`` and return a new
    index built from ``TMSchema``.
    """
    memory_storage = FileStorage(data_dir('memory'))
    memory_storage.create()
    schema = TMSchema()
    return memory_storage.create_index(schema)
Example #54
0
class WhooshSearchBackend(BaseSearchBackend):
    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\',
        '+',
        '-',
        '&&',
        '||',
        '!',
        '(',
        ')',
        '{',
        '}',
        '[',
        ']',
        '^',
        '"',
        '~',
        '*',
        '?',
        ':',
        '.',
    )

    def __init__(self, connection_alias, **connection_options):
        """
        Configure the backend from the connection's settings.

        Options read from ``connection_options``:

        * ``POST_LIMIT`` - stored as ``self.post_limit`` (default 128 MiB).
        * ``PATH`` - directory of the on-disk index.
        * ``STORAGE`` - ``'file'`` (the default) selects file storage; any
          other value disables it.

        Raises ``ImproperlyConfigured`` when file storage is selected but no
        ``PATH`` is given.
        """
        super(WhooshSearchBackend, self).__init__(connection_alias,
                                                  **connection_options)
        self.setup_complete = False
        # ``connection_options`` is a plain dict of keyword arguments, so the
        # setting must be looked up by key. ``getattr`` on the dict never
        # found 'POST_LIMIT' and always produced the default.
        self.post_limit = connection_options.get('POST_LIMIT',
                                                 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')
        self.use_file_storage = (
            connection_options.get('STORAGE', 'file') == 'file')

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'."
                % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.

        Prepares the storage (the on-disk directory, or a RAM store kept on
        ``LOCALS``), builds the schema and query parser from the unified
        index's search fields, then opens or creates the Whoosh index.

        Raises ``IOError`` when file storage is used and the index path is
        not writable.
        """
        from haystack import connections

        global LOCALS
        directory_created = False

        if self.use_file_storage:
            # Make sure the index is there.
            if not os.path.exists(self.path):
                os.makedirs(self.path)
                directory_created = True

            if not os.access(self.path, os.W_OK):
                raise IOError(
                    "The path to your Whoosh index '%s' is not writable for the current user/group."
                    % self.path)

            self.storage = FileStorage(self.path)
        else:
            # The RAM store is created once and reused by later setups.
            if getattr(LOCALS, 'RAM_STORE', None) is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        unified_index = connections[self.connection_alias].get_unified_index()
        self.content_field_name, self.schema = self.build_schema(
            unified_index.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if directory_created:
            # A directory we just made cannot contain an index yet.
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """
        Translate Haystack search fields into a Whoosh schema.

        ``fields`` maps field names to field objects. Returns a tuple of
        ``(content_field_name, Schema)``, where ``content_field_name`` is the
        index field name of the field flagged as the document (or ``''`` if
        none is).

        Raises ``SearchBackendError`` when ``fields`` contributes nothing
        beyond the hard-coded Haystack keys.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Remember how many keys are hard-coded into Haystack so we can
        # (possibly) fail slightly more gracefully later.
        builtin_key_count = len(schema_fields)
        content_field_name = ''

        for field in fields.values():
            if field.is_multivalued:
                if field.indexed is False:
                    whoosh_field = IDLIST(stored=True,
                                          field_boost=field.boost)
                else:
                    whoosh_field = KEYWORD(stored=True,
                                           commas=True,
                                           scorable=True,
                                           field_boost=field.boost)
            elif field.field_type in ['date', 'datetime']:
                whoosh_field = DATETIME(stored=field.stored, sortable=True)
            elif field.field_type == 'integer':
                whoosh_field = NUMERIC(stored=field.stored,
                                       numtype=int,
                                       field_boost=field.boost)
            elif field.field_type == 'float':
                whoosh_field = NUMERIC(stored=field.stored,
                                       numtype=float,
                                       field_boost=field.boost)
            elif field.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                whoosh_field = BOOLEAN(stored=field.stored)
            elif field.field_type == 'ngram':
                whoosh_field = NGRAM(minsize=3,
                                     maxsize=15,
                                     stored=field.stored,
                                     field_boost=field.boost)
            elif field.field_type == 'edge_ngram':
                whoosh_field = NGRAMWORDS(minsize=2,
                                          maxsize=15,
                                          at='start',
                                          stored=field.stored,
                                          field_boost=field.boost)
            else:
                # Everything else is indexed as text with the Chinese analyzer.
                whoosh_field = TEXT(stored=True,
                                    analyzer=ChineseAnalyzer(),
                                    field_boost=field.boost,
                                    sortable=True)

            schema_fields[field.index_fieldname] = whoosh_field

            if field.document is True:
                content_field_name = field.index_fieldname
                schema_fields[field.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= builtin_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """
        Prepare every object in ``iterable`` with ``index`` and write it to
        the Whoosh index.

        ``iterable`` may be any iterable, including a generator. Objects
        whose preparation raises ``SkipDocument`` are logged and skipped.
        ``commit`` is accepted for interface compatibility, but the writer is
        committed whenever at least one object was received.

        Errors from writing a document are re-raised unless
        ``self.silently_fail`` is set, in which case they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)
        # Track this during iteration instead of calling ``len(iterable)``
        # afterwards: generators have no length, and the TypeError used to
        # surface only after all documents had been prepared.
        received_objects = False

        for obj in iterable:
            received_objects = True

            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
                continue

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" %
                               e.__class__.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })

        if received_objects:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

    def remove(self, obj_or_string, commit=True):
        """
        Delete the document identified by ``obj_or_string`` from the index.

        ``commit`` is accepted but not used by this method. Failures are
        re-raised unless ``self.silently_fail`` is set, in which case they
        are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            id_query = u'%s:"%s"' % (ID, whoosh_id)
            self.index.delete_by_query(q=self.parser.parse(id_query))
        except Exception as e:
            if self.silently_fail:
                self.log.error("Failed to remove document '%s' from Whoosh: %s",
                               whoosh_id,
                               e,
                               exc_info=True)
            else:
                raise

    def clear(self, models=None, commit=True):
        """
        Remove documents from the index.

        With ``models`` left as ``None`` the whole index is deleted and
        rebuilt; otherwise ``models`` must be a list or tuple and only
        documents of those models are deleted. ``commit`` is accepted but not
        used by this method.

        Failures are re-raised unless ``self.silently_fail`` is set, in which
        case they are logged.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is not None:
                models_to_delete = []

                for model in models:
                    ct_clause = u"%s:%s" % (DJANGO_CT, get_model_ct(model))
                    models_to_delete.append(ct_clause)

                combined_query = u" OR ".join(models_to_delete)
                self.index.delete_by_query(
                    q=self.parser.parse(combined_query))
            else:
                self.delete_index()
        except Exception as e:
            if not self.silently_fail:
                raise

            if models is None:
                self.log.error("Failed to clear Whoosh index: %s",
                               e,
                               exc_info=True)
            else:
                self.log.error(
                    "Failed to clear Whoosh index of models '%s': %s",
                    ','.join(models_to_delete),
                    e,
                    exc_info=True)

    def delete_index(self):
        """
        Wipe the whole index and set it up again from scratch.
        """
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if not self.use_file_storage:
            self.storage.clean()
        elif os.path.exists(self.path):
            shutil.rmtree(self.path)

        # Recreate everything.
        self.setup()

    def optimize(self):
        """
        Refresh the index and ask Whoosh to optimize it, running setup first
        if that has not happened yet.
        """
        if not self.setup_complete:
            self.setup()

        refreshed_index = self.index.refresh()
        self.index = refreshed_index
        self.index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        """
        Convert a ``start_offset``/``end_offset`` slice into Whoosh paging.

        Returns ``(page_num, page_length)`` where ``page_num`` is 1-based.
        A missing ``start_offset`` counts as 0 and a missing ``end_offset``
        as 1000000.
        """
        if start_offset is None:
            start_offset = 0

        if end_offset is None:
            end_offset = 1000000
        elif end_offset <= 0:
            # Prevent against Whoosh throwing an error. Requires an end_offset
            # greater than 0.
            end_offset = 1

        page_length = end_offset - start_offset

        # Whoosh uses 1-based page numbers, hence the "+ 1"; a non-positive
        # page length always maps to the first page.
        if page_length > 0:
            page_num = int(start_offset / page_length) + 1
        else:
            page_num = 1

        return page_num, page_length

    @log_query
    def search(self,
               query_string,
               sort_by=None,
               start_offset=0,
               end_offset=None,
               fields='',
               highlight=False,
               facets=None,
               date_facets=None,
               query_facets=None,
               narrow_queries=None,
               spelling_query=None,
               within=None,
               dwithin=None,
               distance_point=None,
               models=None,
               limit_to_registered_models=None,
               result_class=None,
               **kwargs):
        """
        Run ``query_string`` against the Whoosh index.

        Returns a dict that always contains ``results`` and ``hits``; the
        ``facets`` and ``spelling_suggestion`` keys are present on some
        paths only. Empty and one-character (non ``*``) queries return no
        results without touching the index.

        ``sort_by`` is a sequence of field names, each optionally prefixed
        with ``-`` for descending order; all of them must use the same
        direction. ``narrow_queries`` is not modified. Faceting arguments
        only trigger a warning. ``fields``, ``within``, ``dwithin``,
        ``distance_point`` and ``**kwargs`` are accepted but not used here.

        Raises ``SearchBackendError`` for mixed sort directions. A
        ``ValueError`` from Whoosh's paging is re-raised unless
        ``self.silently_fail`` is set.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            descending = [order_by.startswith('-') for order_by in sort_by]

            if any(descending) and not all(descending):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            reverse = any(descending)
            sort_by = [
                order_by[1:] if is_descending else order_by
                for order_by, is_descending in zip(sort_by, descending)
            ]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.",
                          Warning,
                          stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.",
                          Warning,
                          stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.",
                          Warning,
                          stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            # Work on a copy so the set owned by the caller is left untouched.
            if narrow_queries is None:
                narrow_queries = set()
            else:
                narrow_queries = set(narrow_queries)

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None
        searcher = None

        # Both searchers must stay open until the results have been fully
        # processed, and must be closed on every exit path (early returns and
        # exceptions included), hence the try/finally.
        try:
            if narrow_queries is not None:
                # Potentially expensive? I don't see another way to do it in Whoosh...
                narrow_searcher = self.index.searcher()

                for nq in narrow_queries:
                    recent_narrowed_results = narrow_searcher.search(
                        self.parser.parse(force_text(nq)), limit=None)

                    if len(recent_narrowed_results) <= 0:
                        return {
                            'results': [],
                            'hits': 0,
                        }

                    if narrowed_results:
                        narrowed_results.filter(recent_narrowed_results)
                    else:
                        narrowed_results = recent_narrowed_results

            self.index = self.index.refresh()

            if self.index.doc_count():
                searcher = self.index.searcher()
                parsed_query = self.parser.parse(query_string)

                # In the event of an invalid/stopworded query, recover gracefully.
                if parsed_query is None:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                page_num, page_length = self.calculate_page(
                    start_offset, end_offset)

                search_kwargs = {
                    'pagelen': page_length,
                    'sortedby': sort_by,
                    'reverse': reverse,
                }

                # Handle the case where the results have been narrowed.
                if narrowed_results is not None:
                    search_kwargs['filter'] = narrowed_results

                try:
                    raw_page = searcher.search_page(parsed_query, page_num,
                                                    **search_kwargs)
                except ValueError:
                    if not self.silently_fail:
                        raise

                    return {
                        'results': [],
                        'hits': 0,
                        'spelling_suggestion': None,
                    }

                # Because as of Whoosh 2.5.1, it will return the wrong page of
                # results if you request something too high. :(
                if raw_page.pagenum < page_num:
                    return {
                        'results': [],
                        'hits': 0,
                        'spelling_suggestion': None,
                    }

                return self._process_results(raw_page,
                                             highlight=highlight,
                                             query_string=query_string,
                                             spelling_query=spelling_query,
                                             result_class=result_class)

            # The index holds no documents: only a spelling suggestion can
            # be offered.
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(
                        spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(
                        query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }
        finally:
            if searcher is not None:
                searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        """
        Find documents similar to the indexed document of ``model_instance``.

        Similarity is computed on the content field. Results are narrowed to
        ``models`` (or to the registered models) and, when given and not
        ``'*'``, to ``additional_query_string``. ``**kwargs`` is accepted but
        not used here.

        Returns a dict that always contains ``results`` and ``hits``; the
        ``facets`` and ``spelling_suggestion`` keys are present on some
        paths only. A ``ValueError`` raised while paging is re-raised unless
        ``self.silently_fail`` is set.
        """
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None
        searcher = None

        # Both searchers must stay open until the results have been fully
        # processed, and must be closed on every exit path (early returns and
        # exceptions included), hence the try/finally.
        try:
            if narrow_queries:
                # Potentially expensive? I don't see another way to do it in Whoosh...
                narrow_searcher = self.index.searcher()

                for nq in narrow_queries:
                    recent_narrowed_results = narrow_searcher.search(
                        self.parser.parse(force_text(nq)), limit=None)

                    if len(recent_narrowed_results) <= 0:
                        return {
                            'results': [],
                            'hits': 0,
                        }

                    if narrowed_results:
                        narrowed_results.filter(recent_narrowed_results)
                    else:
                        narrowed_results = recent_narrowed_results

            page_num, page_length = self.calculate_page(start_offset,
                                                        end_offset)

            self.index = self.index.refresh()
            raw_results = EmptyResults()

            if self.index.doc_count():
                query = "%s:%s" % (ID, get_identifier(model_instance))
                searcher = self.index.searcher()
                parsed_query = self.parser.parse(query)
                results = searcher.search(parsed_query)

                if len(results):
                    raw_results = results[0].more_like_this(field_name,
                                                            top=end_offset)

                # Handle the case where the results have been narrowed.
                if narrowed_results is not None and hasattr(
                        raw_results, 'filter'):
                    raw_results.filter(narrowed_results)

            try:
                raw_page = ResultsPage(raw_results, page_num, page_length)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            return self._process_results(raw_page, result_class=result_class)
        finally:
            if searcher is not None:
                searcher.close()

            if narrow_searcher is not None:
                narrow_searcher.close()

    def _process_results(self,
                         raw_page,
                         highlight=False,
                         query_string='',
                         spelling_query=None,
                         result_class=None):
        """
        Turn a page of raw Whoosh hits into Haystack result objects.

        Hits whose model cannot be resolved or is not indexed are dropped and
        subtracted from the hit count. Stored values are converted with the
        matching index field's ``convert`` when it has one, and with
        ``_to_python`` otherwise. With ``highlight`` set, a ``highlighted``
        entry for the content field is added to each result.

        Returns a dict with ``results``, ``hits``, ``facets`` (always empty)
        and ``spelling_suggestion`` (``None`` unless ``include_spelling`` is
        enabled). ``result_class`` defaults to ``SearchResult``.
        """
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if not (model and model in indexed_models):
                hits -= 1
                continue

            # The search index is the same for every stored field of this
            # hit, so look it up once. Named ``model_index`` so it does not
            # shadow the ``index`` module used elsewhere in this class.
            model_index = unified_index.get_index(model)

            for key, value in raw_result.items():
                string_key = str(key)

                if string_key in model_index.fields and hasattr(
                        model_index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if model_index.fields[string_key].is_multivalued:
                        # Compare by value: ``len(value) is 0`` only worked
                        # through CPython's small-int caching.
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = model_index.fields[
                            string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            # These two are passed to the result positionally below.
            del (additional_fields[DJANGO_CT])
            del (additional_fields[DJANGO_ID])

            if highlight:
                sa = StemmingAnalyzer()
                formatter = WhooshHtmlFormatter('em')
                terms = [token.text for token in sa(query_string)]

                whoosh_result = whoosh_highlight(
                    additional_fields.get(self.content_field_name), terms,
                    sa, ContextFragmenter(), formatter)
                additional_fields['highlighted'] = {
                    self.content_field_name: [whoosh_result],
                }

            result = result_class(app_label, model_name,
                                  raw_result[DJANGO_ID], score,
                                  **additional_fields)
            results.append(result)

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """
        Build a spelling suggestion for ``query_string``.

        Reserved words and characters are stripped from the query, then each
        remaining word is replaced by the corrector's top suggestion for the
        content field; words without a suggestion are dropped. Returns the
        suggested words joined by spaces, or ``None`` when ``query_string``
        is empty.
        """
        # Nothing to correct: bail out before opening an index reader.
        if not query_string:
            return None

        cleaned_query = force_text(query_string)

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        reader = self.index.reader()

        try:
            corrector = reader.corrector(self.content_field_name)

            for word in query_words:
                suggestions = corrector.suggest(word, limit=1)

                if len(suggestions) > 0:
                    suggested_words.append(suggestions[0])
        finally:
            # The reader holds index files open; always release it.
            reader.close()

        return ' '.join(suggested_words)

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_text(v) for v in value])
        elif isinstance(value, (six.integer_types, float)):
            # Leave it alone.
            pass
        else:
            value = force_text(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'],
                                date_values['day'], date_values['hour'],
                                date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(
                    converted_value,
                (list, tuple, set, dict, six.integer_types, float, complex)):
                return converted_value
        except:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass

        return value
Example #55
0
def create_index(**kwargs):
    """
    Create the search index directory and an empty index inside it.

    Does nothing when ``SEARCH_INDEX`` already exists. Keyword arguments are
    accepted but not used by the visible body.
    """
    # Only a missing directory triggers creation; an existing one is left
    # untouched.
    if not os.path.exists(SEARCH_INDEX):
        os.mkdir(SEARCH_INDEX)
        storage = FileStorage(SEARCH_INDEX)
        storage.create_index(SEARCH_SCHEMA)