Example #1
def update_index(sender, instance, created, **kwargs):
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index()

    try:
        writer = ix.writer()
    except:
        return
    
    tags = []
    for t in instance.tags.all():
        try:
            tags.append(unicode(t.name))
        except:
            pass
        
    tags = u','.join(tags)

    try:
        if created:
            writer.add_document(title=instance.title, content=instance.content,
                                tags=tags,
                                author=instance.author.get_profile().name + u"\n" + instance.author.username,
                                id=unicode(instance.pk))
            writer.commit()
        else:
            writer.update_document(title=instance.title, content=instance.content,
                                   tags=tags,
                                   author=instance.author.get_profile().name + u"\n" + instance.author.username,
                                   id=unicode(instance.pk))
            writer.commit()
    except:
        pass
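The handler above follows Django's post_save signal signature, so it is normally registered against a model. A minimal wiring sketch, assuming a hypothetical Post model (not part of the original example):

from django.db.models.signals import post_save
from myapp.models import Post  # assumed model, for illustration only

# Re-index a Post every time it is saved.
post_save.connect(update_index, sender=Post)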
Example #2
def update_index(sender, instance, created, **kwargs):
    if int(os.environ.get('SKIP_SEARCH_INDEX', '0')):
        return
    try:
        url = unicode(instance.get_absolute_url())
    except Exception:
        log.critical("Can't resolve url. Content %r not indexed" % instance)
        return

    content = getattr(instance, 'content', None)
    if content is None:
        content = unicode(instance)
    elif callable(content):
        content = content()

    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index(indexname='memopol')
    writer = ix.writer()
    if created:
        writer.add_document(title=unicode(instance), content=content,
                            type=unicode(instance.__class__.__name__.lower()),
                            url=url)
        writer.commit()
    else:
        writer.update_document(title=unicode(instance), content=content,
                               type=unicode(instance.__class__.__name__.lower()),
                               url=url)
        writer.commit()
Example #3
 def __init__(self, path, masterkey=None):
     FileStorage.__init__(self, path, supports_mmap=False)
     self.masterkey = masterkey[:32]
     self.signkey = masterkey[32:]
     self._tmp_storage = self.temp_storage
     self.length_cache = {}
     self._open_files = {}
Example #4
    def test_threaded_filelock(self):
        self.make_dir("testindex")
        st = FileStorage("testindex")
        lock1 = st.lock("testlock")
        result = []

        # The thread function tries to acquire the lock and
        # then quits
        def fn():
            lock2 = st.lock("testlock")
            gotit = try_for(lock2.acquire, 1.0, 0.1)
            if gotit:
                result.append(True)
                lock2.release()

        t = threading.Thread(target=fn)

        # Acquire the lock in this thread
        lock1.acquire()
        # Start the other thread trying to acquire the lock
        t.start()
        # Wait for a bit
        time.sleep(0.15)
        # Release the lock
        lock1.release()
        # Wait for the other thread to finish
        t.join()
        # If the other thread got the lock, it should have
        # appended something to the "results" list.
        self.assertEqual(len(result), 1)

        self.clean_file("testindex/testlock")
        self.destroy_dir("testindex")
Example #5
def create_index(sender=None, **kwargs):
    """Creates a File based whoosh index, location used is
    settings.WHOOSH_INDEX so make sure that is set"""
    if not os.path.exists(settings.WHOOSH_INDEX):
        os.mkdir(settings.WHOOSH_INDEX)
        storage = FileStorage(settings.WHOOSH_INDEX)
        ix = storage.create_index(schema=WHOOSH_SCHEMA,
                                  indexname="search")
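create_index() relies on a module-level WHOOSH_SCHEMA. A plausible definition, assuming hypothetical field names (the original project's schema is not shown):

from whoosh.fields import Schema, TEXT, ID, KEYWORD

# Hypothetical schema; the field names are assumptions, not from the original code.
WHOOSH_SCHEMA = Schema(title=TEXT(stored=True),
                       content=TEXT,
                       tags=KEYWORD(commas=True),
                       id=ID(stored=True, unique=True))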
Example #6
def init_index(index=".index"):
    if not os.path.exists(index):
        os.mkdir(index)
    storage = FileStorage(index)
    schema = Schema(name=TEXT(stored=True), ext=KEYWORD, title=TEXT(stored=True),
                    content=TEXT, path=ID(stored=True), tags=KEYWORD)
    storage.create_index(schema)
    ix = storage.open_index()
    return ix
Example #7
def get_index():
    try:
        storage = FileStorage(settings.WHOOSH_INDEX)
        return storage.open_index(indexname="search")
    except IOError:
        # No index? other error?
        create_index()
        storage = FileStorage(settings.WHOOSH_INDEX)
        return storage.open_index(indexname="search")
Example #8
def build_index(sa_session, whoosh_index_dir, path_to_repositories):
    """
    Build the search indexes. One for repositories and another for tools within.
    """
    #  Rare race condition exists here and below
    if not os.path.exists(whoosh_index_dir):
        os.makedirs(whoosh_index_dir)
    tool_index_dir = os.path.join(whoosh_index_dir, 'tools')
    if not os.path.exists(tool_index_dir):
        os.makedirs(tool_index_dir)

    repo_index_storage = FileStorage(whoosh_index_dir)
    tool_index_storage = FileStorage(tool_index_dir)

    repo_index = repo_index_storage.create_index(repo_schema)
    tool_index = tool_index_storage.create_index(tool_schema)

    repo_index_writer = repo_index.writer()
    tool_index_writer = tool_index.writer()

    repos_indexed = 0
    tools_indexed = 0

    for repo in get_repos(sa_session, path_to_repositories):

        repo_index_writer.add_document(id=repo.get('id'),
                             name=unicodify(repo.get('name')),
                             description=unicodify(repo.get('description')),
                             long_description=unicodify(repo.get('long_description')),
                             homepage_url=unicodify(repo.get('homepage_url')),
                             remote_repository_url=unicodify(repo.get('remote_repository_url')),
                             repo_owner_username=unicodify(repo.get('repo_owner_username')),
                             times_downloaded=repo.get('times_downloaded'),
                             approved=repo.get('approved'),
                             last_updated=repo.get('last_updated'),
                             full_last_updated=repo.get('full_last_updated'))
        #  Tools get their own index
        for tool in repo.get('tools_list'):
            tool_index_writer.add_document(id=unicodify(tool.get('id')),
                                           name=unicodify(tool.get('name')),
                                           version=unicodify(tool.get('version')),
                                           description=unicodify(tool.get('description')),
                                           help=unicodify(tool.get('help')),
                                           repo_owner_username=unicodify(repo.get('repo_owner_username')),
                                           repo_name=unicodify(repo.get('name')),
                                           repo_id=repo.get('id'))
            tools_indexed += 1
            print(tools_indexed, 'tools (', tool.get('id'), ')')

        repos_indexed += 1
        print(repos_indexed, 'repos (', repo.get('id'), ')')

    tool_index_writer.commit()
    repo_index_writer.commit()

    print("TOTAL repos indexed: ", repos_indexed)
    print("TOTAL tools indexed: ", tools_indexed)
Example #9
def get_index(index=".index"):
    if not os.path.exists(index):
        return "there is no index with this name %s!! use indexer to build the index" % index
    storage = FileStorage(index)
    ix = storage.open_index()
    print "the index has %d docs" % ix.doc_count_all()
    return ix
Example #10
 def handle_noargs(self, **options):
     # from settings import HAYSTACK_CONNECTIONS
     # storage = FileStorage(HAYSTACK_CONNECTIONS['default']['PATH'])
     storage = FileStorage('/dev/shm/whoosh/')
     
     ix = storage.open_index('SPELL')
     
     with ix.reader() as r:
         for id in r.all_doc_ids():
             print r.stored_fields(id)
Example #11
    def _open_indexes(self):
        """open storage and open indexes"""
        if not os.path.exists("index"):
            os.mkdir("index")
        storage = FileStorage("index")

        # open or initialise index
        if not storage.index_exists(indexname='MAIN'):
            self.ix = storage.\
                create_index(IndexerSchema, indexname='MAIN')
        self.ix = storage.open_index(indexname='MAIN')
Example #12
    def eval_get_ranked_set_baseline(self, basefile):
        # Step 1: Read the saved keyterms for a subset of articles
        # (created by analyze_baseline_queries)
        g = Graph()
        g.parse(self.generic_path("keyterms", "analyzed", ".n3"), format="n3")

        articles = {}
        for (s, p, o) in g:
            if not str(s) in articles:
                articles[str(s)] = []
            articles[str(s)].append(str(o))

        # Step 2: Open the large whoosh index containing the text of
        # all cases. Then, create a query for each article based on
        # the keyterms.
        connector = query.Or
        indexdir = os.path.sep.join([self.config.datadir, 'ecj', 'index'])
        storage = FileStorage(indexdir)
        idx = storage.open_index()
        searcher = idx.searcher(weighting=scoring.BM25F())

        res = {}

        # for article in sorted(articles.keys()):
        for article in self._articles(basefile):
            terms = articles[article]
            rankedset = []
            #parser = qparser.QueryParser("content", idx.schema)
            #q = parser.parse(connector.join(terms))
            q = query.And([
                # query.Term("articles", article),
                connector([query.Term("content", x) for x in terms])
            ])
            # print q
            # self.log.debug("Article %s: %s", article, " or ".join(terms))
            results = searcher.search(q, limit=None)
            resultidx = 0
            # self.log.info("Keyterms for result: %r" % results.key_terms("content", docs=10, numterms=10))
            for result in results:
                reslbl = "%s (%s)" % (
                    result['basefile'], results.score(resultidx))
                rankedset.append([result['basefile'], reslbl])
                # self.log.debug(u"\t%s: %2.2d" % (result['title'], results.score(resultidx)))
                resultidx += 1
            self.log.info("Created baseline ranked set for %s: Top result %s (of %s)" %
                          (article.split("/")[-1], rankedset[0][0], len(rankedset)))

            # return just a list of URIs, no scoring information. But the
            # full URI isn't available in the whoosh db, so we recreate it.
            res[article] = ["http://lagen.nu/ext/celex/%s" % x[
                0] for x in rankedset]

        return res
Example #13
def update_index(sender, instance, created, **kwargs):
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index(indexname="rarog")
    writer = ix.writer()
    if created:
        writer.add_document(title=unicode(instance), body_html=instance.body_html,
                                    url=unicode(instance.get_absolute_url()))
        writer.commit()
    else:
        writer.update_document(title=unicode(instance), body_html=instance.body_html,
                                    url=unicode(instance.get_absolute_url()))
        writer.commit()
Example #14
def searchIndex():
    '''
    searchIndex()
    Performs the requested search through the index/schema.
    INPUTS: none -- the index location and search type are requested interactively
    OUTPUTS: none -- results are handled by searchForSong/searchForLyrics
    '''
    # Navigate to the LM index directory
    c = ''
    while True:
        print 'The current directory is ' + os.getcwd()
        ques = 'Is the LM index (directory) in the current directory? [y/n]\t'
        c = raw_input(ques).lower()
        if c == 'y' or c == 'yes':
            idxDir = os.getcwd()
            break
        elif c == 'n' or c == 'no':
            while True:
                idxDir = raw_input('Where is it?\t').lower()
                try:
                    os.chdir(idxDir)
                    break
                except WindowsError:
                    print 'Sorry, I couldn\'t navigate to that directory'
            break
        elif c == 'q' or c == 'quit':
            print '\tReturning to the Main Menu'
            return
        else:
            print 'I\'m sorry, I don\'t understand what you mean. Try again.'

    # Open the index
    idxDir = idxDir + '/LM_Storage'
    storage = FileStorage(idxDir)
    idx = storage.open_index(indexname = 'LM')
    
    # Determine what the user wants to search for 
    c = ''
    while True:
        ques = 'What would you like to search? song/artist [s], lyrics [L]\t'
        c = raw_input(ques).lower()
        if c == 's' or c == 'song/artist' or c == 'song':
            searchForSong(idx)
            break
        elif c == 'l' or c == 'lyrics':
            searchForLyrics(idx)
            break
        elif c == 'q' or c == 'quit':
            print '\tReturning to the Main Menu'
            return 
        else:
            print 'I\'m sorry, I don\'t understand what you mean. Try again.'
Example #15
    def __init__(self, word_file, graph_file):
        dirname = os.path.dirname(graph_file)
        st = FileStorage(dirname)
        f = st.open_file(graph_file)
        gr = fst.GraphReader(f)
        self.graph = gr

        self.dict = {}
        with codecs.open(word_file,'r','utf-8') as file:
            for line in file:
                tokens = line.split(" ")
                if len(tokens) >= 2:
                    self.dict[tokens[0].strip()] = int(tokens[1].strip())
Example #16
def search_does_exist(query):
    #query = unicode(query, 'utf-8')
    #query = unidecode(query)

    storage = FileStorage("indexdir")
    ix = storage.open_index(indexname="wiki")

    from whoosh.qparser import QueryParser
    with ix.searcher() as searcher:
      query = QueryParser("title", ix.schema).parse(query)
      whoosh_results = searcher.search(query, limit=1)

      return len(whoosh_results) > 0
Example #17
def get_myindex(indexdir='indexdir', filestore=False):
    schema = get_schema()
    if not filestore:
        if not os.path.exists(indexdir):
            os.mkdir(indexdir)
            ix = index.create_in(indexdir, schema)
        ix = index.open_dir(indexdir)
    else:
        storage = FileStorage(indexdir)
        # TODO: when indexdir already exists, this branch still calls
        #       create_index; it should open the existing index instead,
        #       as the branch above does (see the sketch after this function).
        ix = storage.create_index(schema)
    return ix
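One way to resolve that TODO is to check for an existing index before creating one. A sketch using whoosh.index.exists_in; this is not part of the original function:

import os
from whoosh import index

def get_myindex_fixed(indexdir='indexdir'):
    # get_schema() is assumed to exist, as in the original example.
    schema = get_schema()
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)
    # Reuse the index if one is already there, otherwise create it.
    if index.exists_in(indexdir):
        return index.open_dir(indexdir)
    return index.create_in(indexdir, schema)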
Example #18
 def test_hash(self):
     self.make_dir("testindex")
     st = FileStorage("testindex")
     hwf = st.create_file("test.hsh")
     hw = FileHashWriter(hwf)
     hw.add("foo", "bar")
     hw.add("glonk", "baz")
     hw.close()
     
     hrf = st.open_file("test.hsh")
     hr = FileHashReader(hrf)
     self.assertEqual(hr.get("foo"), "bar")
     self.assertEqual(hr.get("baz"), None)
     hr.close()
Example #19
 def search(self,q):
     from whoosh.filedb.filestore import FileStorage
     from whoosh.qparser import MultifieldParser
     storage = FileStorage(settings.WHOOSH_INDEX)
     ix = storage.open_index()
     q = q.replace('+', ' AND ').replace(' -', ' NOT ')
     parser = MultifieldParser(["content","title","tags","author"], schema=ix.schema)
     qry = parser.parse(q)
     searcher = ix.searcher()
     hits = searcher.search(qry)

     return self.objects.filter(id__in=[h.fields()['id'] for h in hits]).filter(published=True)
Example #20
    def test_filelock_simple(self):
        self.make_dir("testindex")
        st = FileStorage("testindex")
        lock1 = st.lock("testlock")
        lock2 = st.lock("testlock")

        self.assertTrue(lock1.acquire())
        self.assertFalse(lock2.acquire())
        lock1.release()
        self.assertTrue(lock2.acquire())
        self.assertFalse(lock1.acquire())
        lock2.release()

        self.clean_file("testindex/testlock")
        self.destroy_dir("testindex")
Example #21
def run_search(query):
    from settings import HAYSTACK_CONNECTIONS
    storage = FileStorage(HAYSTACK_CONNECTIONS['default']['PATH'])
    # storage = FileStorage('/dev/shm/whoosh/')
    
    ix = storage.open_index('MAIN')
    
    with ix.searcher() as s:
        from whoosh.qparser import QueryParser
        qp = QueryParser("text", schema=ix.schema)

        q = qp.parse(query)
        results = s.search(q)
        for i, r in enumerate(results):
            result = "%d: (%s) %s" % (i, r['id'], r['title']) # ignored
Example #22
def build_clean_index():
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index()
    writer = ix.writer()
    try:
        mlogger.debug("building index from scratch.....................")        
        mlogger.debug("adding objects...................")
        
        for si in StudentInstitute.objects.all():
            adddoc(si, writer, True)

        for fi in FacultyInstitute.objects.all():
            adddoc(fi, writer, True)
    finally:            
        writer.commit()
        ix.close()
Example #23
def add_documents_to_index(index_name, documents):
    storage = FileStorage("indexdir")
    ix = storage.open_index(indexname=index_name)

    writer = ix.writer()

    for i, document in enumerate(documents):

        print "{}%".format(100 * i / len(documents))

        if index_name == "wiki":
            writer.add_document(title=u"{}".format(sanitize_text(document.title)))
        if index_name == "movie":
            writer.add_document(title=u"{}".format(sanitize_text(document.title)))

    writer.commit()
Example #24
    def setup(self):
        """
        Defers loading until needed.
        """
        new_index = False

        # Make sure the index is there.
        if not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
            new_index = True

        if not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % settings.HAYSTACK_WHOOSH_PATH
            )

        self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
        self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)

        self.setup_complete = True
Example #25
def search(request):
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index(indexname="rarog")
    hits = []
    query = request.GET.get('q', None)
    if query is not None and query != u"":
        query = query.replace('+', ' AND ').replace(' -', ' NOT ')
        parser = MultifieldParser(['title','body_html'], schema=ix.schema)
        try:
            qry = parser.parse(query)
        except:
            qry = None
        if qry is not None:
            searcher = ix.searcher()
            hits = searcher.search(qry)
    return query, hits
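The view above leaves the searcher open after the request. Whoosh searchers also work as context managers, so a variant that releases the underlying files could look like this sketch (not part of the original view; the helper name is illustrative):

def collect_hits(ix, qry):
    # Copy the stored fields out before the searcher (and its files) is closed.
    with ix.searcher() as searcher:
        return [hit.fields() for hit in searcher.search(qry)]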
Example #26
class Index:
	def __init__(self, path='~/Music/iTunes/iTunes Music Library.xml', folder='~/Library/Application Support/Share my tunes'):
		self.path = os.path.expanduser(path)
		self.schema = Schema(
			trackId = ID(stored=True),
			name=TEXT(stored=True),
			artist=TEXT(stored=True),
			album=TEXT(stored=True),
			genre=KEYWORD(stored=True),
			location=STORED,
			trackNumber=STORED,
			bitRate=ID(stored=True),
			artwork=KEYWORD(stored=True)
			)
		self.parser = MultifieldParser(["name", "album", "artist"], schema = self.schema)
		self.folder = "%s/index" % os.path.expanduser(folder)
		self.empty = not whoosh.index.exists_in(self.folder)
		self.ix = None
	def index(self):
		if self.empty:
			if not os.path.exists(self.folder):
				os.makedirs(self.folder)
			st = FileStorage(self.folder)
			ix = st.create_index(self.schema)
			w = ix.writer()
			w.add_document(name = u"beuha")
			pipe = file.ID3Filter()
			#[TODO] using itunes info for artwork?
			cpt = 0
			for track in pipe(ItunesParser(self.path)):
				if track['album'] != None : 
					album = track['album'].encode('ascii', 'ignore')
				else:
					album = ""
				#print track['artwork'], "[%s]" % album, track['name'].encode('ascii', 'ignore')
				if cpt % 20 == 0:
					print "\n%i " %cpt,
				print '#',
				#print track['album'], track['name']
				w.add_document(
					trackId = track['trackId'], name=track['name']
					,artist=track['artist'], album=track['album'],
					genre=track['genre'], location=track['location'],
					artwork=bool(track['artwork']),
					trackNumber=track['trackNumber'], bitRate=track['bitRate']
				)
				#if cpt % 100 == 1:
				#	w.commit()
				cpt += 1
			print "\n\n%i tracks indexed" % cpt
			w.commit()
			ix.optimize()
			ix.close()
		else :
			print "already indexed"
	def query(self, query):
		if self.ix == None:
			self.ix = FileStorage(self.folder).open_index()
		q = self.parser.parse(query)
		return self.ix.searcher().search(q, sortedby=("album", "name"), limit=None)
Example #27
def create_in(dirname, schema, indexname=None):
    """Convenience function to create an index in a directory. Takes care of creating
    a FileStorage object for you.
    
    :param dirname: the path string of the directory in which to create the index.
    :param schema: a :class:`whoosh.fields.Schema` object describing the index's fields.
    :param indexname: the name of the index to create; you only need to specify this if
        you are creating multiple indexes within the same storage object.
    :returns: :class:`Index`
    """
    
    if not indexname:
        indexname = _DEF_INDEX_NAME
    
    from whoosh.filedb.filestore import FileStorage
    storage = FileStorage(dirname)
    return storage.create_index(schema, indexname)
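Typical use of this convenience function; the directory name and schema fields below are illustrative assumptions:

from whoosh.fields import Schema, TEXT, ID

schema = Schema(title=TEXT(stored=True), content=TEXT, path=ID(stored=True))
ix = create_in("indexdir", schema)                          # default index name
blog_ix = create_in("indexdir", schema, indexname="blog")   # named index in the same storage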
Example #28
class SearchMigrationTest(TestCase, TempDirMixin):
    """Search index migration testing"""
    def setUp(self):
        self.create_temp()
        self.storage = FileStorage(self.tempdir)
        self.storage.create()

    def tearDown(self):
        self.remove_temp()

    def do_test(self):
        fulltext = Fulltext()
        fulltext.storage = self.storage

        sindex = fulltext.get_source_index()
        self.assertIsNotNone(sindex)
        tindex = fulltext.get_target_index('cs')
        self.assertIsNotNone(tindex)
        writer = sindex.writer()
        writer.update_document(
            pk=1,
            source="source",
            context="context",
            location="location",
        )
        writer.commit()
        writer = tindex.writer()
        writer.update_document(
            pk=1,
            target="target",
            comment="comment"
        )
        writer.commit()
        for item in ('source', 'context', 'location', 'target'):
            self.assertEqual(
                fulltext.search(item, ['cs'], {item: True}),
                set([1])
            )

    def test_nonexisting(self):
        self.do_test()

    def test_nonexisting_dir(self):
        shutil.rmtree(self.tempdir)
        self.tempdir = None
        self.do_test()
Example #29
def search(query):
    #query = unicode(query, 'utf-8')

    storage = FileStorage("indexdir")
    ix = storage.open_index(indexname="wiki")

    from whoosh.qparser import QueryParser
    with ix.searcher() as searcher:
        query = QueryParser("title", ix.schema).parse(query)
        whoosh_results = searcher.search(query, limit=1)

        results = []

        for w in whoosh_results:
            results.append("{}".format(w))

        return results
Example #30
def open_dir(dirname, indexname=None, mapped=True):
    """Convenience function for opening an index in a directory. Takes care of creating
    a FileStorage object for you. dirname is the path string of the directory
    containing the index. indexname is the name of the index to open; you only need to
    specify this if you have multiple indexes within the same storage object.
    
    :param dirname: the path string of the directory containing the index.
    :param indexname: the name of the index to open; you only need to specify this if
        you have multiple indexes within the same storage object.
    :param mapped: whether to use memory mapping to speed up disk reading.
    :returns: :class:`Index`
    """
    
    if indexname is None:
        indexname = _DEF_INDEX_NAME
    
    from whoosh.filedb.filestore import FileStorage
    storage = FileStorage(dirname, mapped=mapped)
    return storage.open_index(indexname)
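The matching read path; the field name and query text below are illustrative assumptions:

from whoosh.qparser import QueryParser

ix = open_dir("indexdir")
with ix.searcher() as searcher:
    q = QueryParser("content", ix.schema).parse(u"whoosh")
    for hit in searcher.search(q, limit=10):
        print(hit)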
Example #31
 def __init__(self, path, masterkey=None):
     self.masterkey = masterkey[:32]
     self.signkey = masterkey[32:]
     self._tmp_storage = self.temp_storage
     self.length_cache = {}
     FileStorage.__init__(self, path, supports_mmap=False)
Example #32
from whoosh.filedb.filestore import FileStorage
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.qparser import MultifieldParser
import whoosh.qparser as qparser
import chinese
import os, glob, codecs, sys

analyzer = chinese.ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True, analyzer=analyzer), 
	sub_title=TEXT(stored=True, analyzer=analyzer),
	author=TEXT(stored=True, analyzer=analyzer), 
	content=TEXT(stored=True, analyzer=analyzer))

storage = FileStorage("indexdir")
ix = storage.open_index()
writer = ix.writer()

_string = sys.argv[1]
_mode = sys.argv[2]
normal = (_mode == "normal")

_distance = 0
if(normal is False):
	_distance = int(sys.argv[3])

with ix.searcher() as searcher:
	# og = qparser.OrGroup.factory(0.9)
	parser = MultifieldParser(["title", "sub_title", "author", "content"], schema=ix.schema)
	# parser = qparser.QueryParser("content", ix.schema)
Example #33
class SearchBackend(BaseSearchBackend):
    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )
    
    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )
    
    def __init__(self, site=None):
        super(SearchBackend, self).__init__(site)
        self.setup_complete = False
        self.use_file_storage = True
        self.post_limit = getattr(settings, 'HAYSTACK_WHOOSH_POST_LIMIT', 128 * 1024 * 1024)
        
        if getattr(settings, 'HAYSTACK_WHOOSH_STORAGE', 'file') != 'file':
            self.use_file_storage = False
        
        if self.use_file_storage and not hasattr(settings, 'HAYSTACK_WHOOSH_PATH'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_WHOOSH_PATH in your settings.')
    
    def setup(self):
        """
        Defers loading until needed.
        """
        new_index = False
        
        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
            new_index = True
        
        if self.use_file_storage and not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % settings.HAYSTACK_WHOOSH_PATH)
        
        if self.use_file_storage:
            self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
        else:
            global LOCALS
            
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()
            
            self.storage = LOCALS.RAM_STORE
        
        self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)
        
        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)
        
        self.setup_complete = True
    
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''
        
        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost)
            
            if field_class.document is True:
                content_field_name = field_class.index_fieldname
        
        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")
        
        return (content_field_name, Schema(**schema_fields))
    
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()
        
        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)
        
        for obj in iterable:
            doc = index.full_prepare(obj)
            
            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])
            
            writer.update_document(**doc)
        
        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
            
            # If spelling support is desired, add to the dictionary.
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
    
    def remove(self, obj_or_string, commit=True):
        if not self.setup_complete:
            self.setup()
        
        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)
        self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' % (ID, whoosh_id)))
    
    def clear(self, models=[], commit=True):
        if not self.setup_complete:
            self.setup()
        
        self.index = self.index.refresh()
        
        if not models:
            self.delete_index()
        else:
            models_to_delete = []
            
            for model in models:
                models_to_delete.append(u"%s:%s.%s" % (DJANGO_CT, model._meta.app_label, model._meta.module_name))
            
            self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))
    
    def delete_index(self):
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            shutil.rmtree(settings.HAYSTACK_WHOOSH_PATH)
        elif not self.use_file_storage:
            self.storage.clean()
        
        # Recreate everything.
        self.setup()
        
    def optimize(self):
        if not self.setup_complete:
            self.setup()
        
        self.index = self.index.refresh()
        self.index.optimize()
    
    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, spelling_query=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        if not self.setup_complete:
            self.setup()
        
        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }
        
        query_string = force_unicode(query_string)
        
        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }
        
        reverse = False
        
        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0
            
            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1
            
            if len(sort_by) > 1 and reverse_counter > 1:
                raise SearchBackendError("Whoosh does not handle more than one field and any field being ordered in reverse.")
            
            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])
                    
                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)
                    
                    if len(sort_by_list) == 1:
                        reverse = False
                
            sort_by = sort_by_list[0]
        
        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)
        
        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)
        
        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)
        
        narrowed_results = None
        self.index = self.index.refresh()
        
        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)
        
        if limit_to_registered_models:
            # Using narrow queries, limit the results to only models registered
            # with the current site.
            if narrow_queries is None:
                narrow_queries = set()
            
            registered_models = self.build_registered_models_list()
            
            if len(registered_models) > 0:
                narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in registered_models]))
        
        narrow_searcher = None
        
        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()
            
            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_unicode(nq)))
                
                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results
        
        self.index = self.index.refresh()
        
        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)
            
            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }
            
            # Prevent against Whoosh throwing an error. Requires an end_offset
            # greater than 0.
            if end_offset is not None and end_offset <= 0:
                end_offset = 1
            
            raw_results = searcher.search(parsed_query, limit=end_offset, sortedby=sort_by, reverse=reverse)
            
            # Handle the case where the results have been narrowed.
            if narrowed_results:
                raw_results.filter(narrowed_results)
            
            # Determine the page.
            page_num = 0
            
            if end_offset is None:
                end_offset = 1000000
            
            if start_offset is None:
                start_offset = 0
            
            page_length = end_offset - start_offset
            
            if page_length and page_length > 0:
                page_num = start_offset / page_length
            
            # Increment because Whoosh uses 1-based page numbers.
            page_num += 1
            
            try:
                raw_page = ResultsPage(raw_results, page_num, page_length)
            except ValueError:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }
            
            results = self._process_results(raw_page, highlight=highlight, query_string=query_string, spelling_query=spelling_query, result_class=result_class)
            searcher.close()
            
            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()
            
            return results
        else:
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None
            
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }
    
    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        warnings.warn("Whoosh does not handle More Like This.", Warning, stacklevel=2)
        return {
            'results': [],
            'hits': 0,
        }
    
    def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None, result_class=None):
        if not self.site:
            from haystack import site
        else:
            site = self.site
        
        results = []
        
        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)
        
        if result_class is None:
            result_class = SearchResult
        
        facets = {}
        spelling_suggestion = None
        indexed_models = site.get_indexed_models()
        
        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)
            
            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = site.get_index(model)
                    string_key = str(key)
                    
                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(',')
                        else:
                            additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)
                
                del(additional_fields[DJANGO_CT])
                del(additional_fields[DJANGO_ID])
                
                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                    sa = analysis.StemmingAnalyzer()
                    terms = [term.replace('*', '') for term in query_string.split()]
                    
                    additional_fields['highlighted'] = {
                        self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                    }
                
                result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, searchsite=self.site, **additional_fields)
                results.append(result)
            else:
                hits -= 1
        
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)
        
        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
    
    def create_spelling_suggestion(self, query_string):
        spelling_suggestion = None
        sp = SpellChecker(self.storage)
        cleaned_query = force_unicode(query_string)
        
        if not query_string:
            return spelling_suggestion
        
        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')
        
        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')
        
        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []
        
        for word in query_words:
            suggestions = sp.suggest(word, number=1)
            
            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])
        
        spelling_suggestion = ' '.join(suggested_words)
        return spelling_suggestion
    
    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.
        
        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_unicode(v) for v in value])
        elif isinstance(value, (int, long, float)):
            # Leave it alone.
            pass
        else:
            value = force_unicode(value)
        return value
    
    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.
        
        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False
        
        if value and isinstance(value, basestring):
            possible_datetime = DATETIME_REGEX.search(value)
            
            if possible_datetime:
                date_values = possible_datetime.groupdict()
            
                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)
            
                return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second'])
        
        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)
            
            # Try to handle most built-in types.
            if isinstance(converted_value, (list, tuple, set, dict, int, float, long, complex)):
                return converted_value
        except:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass
        
        return value
Example #34
    app.logger.info('microblog startup')

if os.environ.get('HEROKU') is not None:
    import logging
    stream_handler = logging.StreamHandler()
    app.logger.addHandler(stream_handler)
    app.logger.setLevel(logging.INFO)
    app.logger.info('microblog startup')

enable_search = WHOOSH_ENABLED
if enable_search:
    search_is_new = False
    if not os.path.exists(WHOOSH_BASE):
        os.mkdir(WHOOSH_BASE)
        search_is_new = True
    search_storage = FileStorage(WHOOSH_BASE)
    search_ix = None
    if search_is_new:
        schema = Schema(id=ID(stored=True), body=TEXT())
        search_ix = search_storage.create_index(schema)
    else:
        search_ix = search_storage.open_index()


class CustomJSONEncoder(JSONEncoder):
    """This class adds support for lazy translation texts to Flask's
    JSON encoder. This is necessary when flashing translated texts."""
    def default(self, obj):
        from speaklater import is_lazy_string
        if is_lazy_string(obj):
            try:
Example #35
class WhooshSearchBackend(BaseSearchBackend):
    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )
    
    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )
    
    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        self.post_limit = getattr(connection_options, 'POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')
        
        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False
        
        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)
    
    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False
        
        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True
        
        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)
        
        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS
            
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()
            
            self.storage = LOCALS.RAM_STORE
        
        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)
        
        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)
        
        self.setup_complete = True
    
    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''
        
        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost)
            
            if field_class.document is True:
                content_field_name = field_class.index_fieldname
        
        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")
        
        return (content_field_name, Schema(**schema_fields))
    
    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()
        
        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)
        
        for obj in iterable:
            doc = index.full_prepare(obj)
            
            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])
            
            try:
                writer.update_document(**doc)
            except Exception, e:
                if not self.silently_fail:
                    raise
                
                self.log.error("Failed to add documents to Whoosh: %s", e)
        
        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
            
            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
Example #36
# the stop words from previous runs have already been removed
json_stop_words = open("../Indicizzazione/stopWords_clinico.json", "r")
json_string = ""
for line in json_stop_words:
    json_string = json_string + line

datastore = json.loads(json_string)
#print datastore

campo = "identifier"
fields = ["title", "abstract", "terms"]

if not os.path.exists(sys.argv[1]):  # check whether the index is missing
    print sys.argv[1], "does not exist"  # exit if it does not exist
else:  # otherwise proceed
    fst = FileStorage(sys.argv[1])  # grab the storage handle and
    ix = fst.open_index()  # open the corresponding index
    #--- open the query file ---#
    infile = open(sys.argv[2], 'r')
    #--- read the file
    text = infile.read()
    #--- build the DOM of the queries
    dom = parseString(text)
    #--- extract the query data
    #title = gettagdata(dom,'title')
    num = gettagdata(dom, 'num')
    #desc  = gettagdata(dom,'desc')
    #for x in range(len(title)-1):
    #    title[x]+=" "+desc[x]
    title = gettagdata(dom, 'desc')
    title = [
Example #37
def in_site_search(request):
    """
    In-site search
    """
    user = get_login_user(request)
    keyword = request.POST.get('keyword', '').strip()
    scope = request.POST.get('scope', 'all')

    logger.warning(f"Search keyword: `{keyword}")
    keyword = split_cn_words(keyword, join=True)
    logger.info(f"Converted keyword: `{keyword}")

    if scope not in ('all', 'feed', 'article'):
        return HttpResponseForbidden('Param Error')

    if not keyword:
        return HttpResponseNotFound("Empty Keyword")

    storage = FileStorage(settings.WHOOSH_IDX_DIR)
    rel_sites, rel_articles = None, None

    # Search for related feeds
    if scope in ('feed', 'all'):
        idx = storage.open_index(indexname="site", schema=whoosh_site_schema)
        qp = MultifieldParser(['cname', 'author', 'brief'],
                              schema=whoosh_site_schema)
        query = qp.parse(keyword)
        sites = []

        with idx.searcher() as s:
            results = s.search(query, limit=50)

            for ret in results:
                sites.append(ret['id'])

        rel_sites = Site.objects.filter(status='active',
                                        pk__in=sites).order_by('-star')
    elif scope == 'article':
        # Search for related articles
        idx = storage.open_index(indexname="article",
                                 schema=whoosh_article_schema)
        qp = MultifieldParser(['title', 'author', 'content'],
                              schema=whoosh_article_schema)
        query = qp.parse(keyword)
        articles = []

        with idx.searcher() as s:
            old_mask = TermRange("uindex", None,
                                 str(current_ts() - 7 * 86400 * 1000))
            results = s.search(query, mask=old_mask, limit=50)

            for ret in results:
                articles.append(ret['uindex'])
        rel_articles = Article.objects.filter(is_recent=True,
                                              status='active',
                                              uindex__in=articles).iterator()

    # user subscriptions
    user_sub_feeds = []
    if user:
        user_sub_feeds = get_user_subscribe_feeds(user.oauth_id,
                                                  user_level=user.level)

    context = dict()
    context['user'] = user
    context['user_sub_feeds'] = user_sub_feeds
    context['rel_sites'] = rel_sites
    context['rel_articles'] = rel_articles
    context['keyword'] = keyword

    if scope == 'all':
        return render(request, 'search/search.html', context=context)
    elif scope == 'feed':
        return render(request, 'search/search_feeds.html', context=context)
    elif scope == 'article':
        return render(request, 'search/search_articles.html', context=context)
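The view above relies on two Whoosh schemas, whoosh_site_schema and whoosh_article_schema, that are defined elsewhere in the project. A minimal sketch of what they might look like, inferred only from the field names the parsers and result lookups use (the real field types may differ):

from whoosh.fields import Schema, ID, TEXT

# Hypothetical reconstruction; only the field names are taken from the view above.
whoosh_site_schema = Schema(
    id=ID(stored=True, unique=True),   # read back as ret['id']
    cname=TEXT(),
    author=TEXT(),
    brief=TEXT(),
)

whoosh_article_schema = Schema(
    uindex=ID(stored=True, unique=True),  # filtered with TermRange('uindex', ...)
    title=TEXT(),
    author=TEXT(),
    content=TEXT(),
)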
Exemple #38
0
def setup_index():
    storage = FileStorage(data_dir('memory'))
    storage.create()
    return storage.create_index(TMSchema())
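setup_index() only creates the storage directory and an empty index. A minimal sketch of how a later process might reopen it and add one entry, assuming TMSchema defines the same fields used by the TranslationMemory class in the next example:

from whoosh.filedb.filestore import FileStorage

ix = FileStorage(data_dir('memory')).open_index()
writer = ix.writer()
writer.add_document(
    source_language='en',
    target_language='cs',
    source='Hello world',
    target='Ahoj svete',
    origin='manual',
)
writer.commit()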
Exemple #39
0
class TranslationMemory(object):
    def __init__(self):
        self.index = FileStorage(data_dir('memory')).open_index()
        self.parser = qparser.QueryParser(
            'source',
            schema=self.index.schema,
            group=qparser.OrGroup.factory(0.9),
            termclass=query.FuzzyTerm,
        )
        self.searcher = None
        self.comparer = Comparer()

    def __del__(self):
        self.close()

    def open_searcher(self):
        if self.searcher is None:
            self.searcher = self.index.searcher()

    def doc_count(self):
        self.open_searcher()
        return self.searcher.doc_count()

    def close(self):
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None

    @contextlib.contextmanager
    def writer(self):
        writer = self.index.writer()
        try:
            yield writer
        finally:
            writer.commit()

    def get_language_code(self, code, langmap):
        language = Language.objects.auto_get_or_create(code)
        if langmap and language.code in langmap:
            language = Language.objects.auto_get_or_create(
                langmap[language.code])
        return language.code

    def import_tmx(self, fileobj, langmap=None):
        origin = force_text(os.path.basename(fileobj.name))
        storage = tmxfile.parsefile(fileobj)
        header = next(storage.document.getroot().iterchildren(
            storage.namespaced("header")))
        source_language_code = header.get('srclang')
        source_language = self.get_language_code(source_language_code, langmap)

        languages = {}
        with self.writer() as writer:
            for unit in storage.units:
                # Parse translations (translate-toolkit does not care about
                # languages here, it just picks first and second XML elements)
                translations = {}
                for node in unit.getlanguageNodes():
                    lang, text = get_node_data(unit, node)
                    translations[lang] = text
                    if lang not in languages:
                        languages[lang] = self.get_language_code(lang, langmap)

                try:
                    source = translations.pop(source_language_code)
                except KeyError:
                    # Skip if source language is not present
                    continue

                for lang, text in translations.items():
                    writer.add_document(
                        source_language=source_language,
                        target_language=languages[lang],
                        source=source,
                        target=text,
                        origin=origin,
                        category=CATEGORY_FILE,
                    )

    def lookup(self, source_language, target_language, text):
        langfilter = query.And([
            query.Term('source_language', source_language),
            query.Term('target_language', target_language),
        ])
        self.open_searcher()
        text_query = self.parser.parse(text)
        matches = self.searcher.search(text_query,
                                       filter=langfilter,
                                       limit=20000)

        for match in matches:
            similarity = self.comparer.similarity(text, match['source'])
            if similarity < 30:
                continue
            yield (match['source'], match['target'], similarity,
                   match['origin'])

    def delete(self, origin):
        """Delete entries by origin."""
        with self.writer() as writer:
            return writer.delete_by_term('origin', origin)

    def empty(self):
        """Recreates translation memory."""
        self.index = setup_index()
        self.searcher = None

    def get_origins(self):
        self.open_searcher()
        return [force_text(x) for x in self.searcher.lexicon('origin')]
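A hypothetical use of the class above: look up fuzzy matches for an English string when translating into Czech. lookup() yields (source, target, similarity, origin) tuples for matches scoring at least 30.

memory = TranslationMemory()
for source, target, similarity, origin in memory.lookup('en', 'cs', 'Hello world'):
    print('{0}%: {1!r} -> {2!r} (from {3})'.format(similarity, source, target, origin))
memory.close()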
Exemple #40
0
from whoosh.filedb.filestore import FileStorage
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.qparser import MultifieldParser
import whoosh.qparser as qparser
import chinese
import os, glob, codecs

analyzer = chinese.ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True),
                sub_title=TEXT(stored=True),
                author=TEXT(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))

storage = FileStorage("indexdir")
ix = storage.create_index(schema)
writer = ix.writer()

# add index

allFile = []
os.chdir('source')
allDir = glob.glob('*')

for path in allDir:
    os.chdir(path)
    allFile = glob.glob('*.txt')
    for everyFile in allFile:
        if (everyFile[0] == 's'):
            print(everyFile)
Exemple #41
0
def index_document(indice: str, data: dict):
    store = FileStorage(indice)
    ix = store.open_index()
    current_app.logger.debug('Writing {} to {}'.format(data, indice))
    with ix.writer() as writer:
        writer.update_document(**data)
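A hypothetical call to index_document() above. Because the function uses writer.update_document(), the index schema is assumed to contain a unique field (here 'id') so re-indexing replaces the old entry instead of duplicating it; it also relies on Flask's current_app, so it must run inside an application context.

index_document('/var/lib/myapp/index', {
    'id': '42',
    'title': 'Whoosh storage notes',
    'content': 'FileStorage keeps each index in a directory on disk.',
})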
Exemple #42
0
 def __enter__(self):
     dirpath = TempDir.__enter__(self)
     store = FileStorage(dirpath)
     self.onexit = lambda: store.close()
     return store
Exemple #43
0
 def __enter__(self):
     dirpath = TempDir.__enter__(self)
     self.store = FileStorage(dirpath, debug=self._debug)
     return self.store
def check_db_matches():
    """created in order to build the importance of data graph.. probably should be changed in order to be reused"""
    FIRST_RUN = False
    #ALL_FILE = "all_queries_big"
    #DB_FILE = "all_dbs_big"
    ALL_FILE = "all_queries"
    DB_FILE = "all_dbs"
    START_FROM = "number"
    ALL_NUM = "all_num_from_new"
    ALL_NUM = "all_num_from_4_5_full_17"

    ALL_FIXED_q = "all_fixed_queries" + str(17)
    ALL_FIXED_dbs = "all_fixed_dbs" + str(17)
    biggest = 20
    max_db_size = 20
    all_queries = {}
    db = [{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {},
          {}, {}, {}]
    found = [0] * biggest
    ret_val = []
    if FIRST_RUN:
        #raw_input("are you sure you want to rewrite the db?!")
        storage_main = FileStorage(INDEX_DIR_CODE)
        ix_main = storage_main.open_index()
        try:
            """
            with open(START_FROM, "rb") as file_h:
                (curr_db, count, db_sizes) = pickle.load(file_h)
            with open(ALL_FIXED_q, "rb") as file_h:
                all_queries = pickle.load(file_h)
            with open(ALL_FIXED_dbs, "rb") as file_h:
                db = pickle.load(file_h)
            print len(all_queries.keys())
            print "Real size", [len(e.keys()) for e in db]
            print "left", db_sizes
            print curr_db, count
            """
            with open(START_FROM, "rb") as file_h:
                (curr_db, count, db_sizes) = pickle.load(file_h)
            print "read", curr_db, count
            with open(ALL_FILE + str(curr_db - 1), "rb") as file_h:
                all_queries = pickle.load(file_h)
            with open(DB_FILE + str(curr_db - 1), "rb") as file_h:
                db = pickle.load(file_h)
            print "Real size", [len(e.keys()) for e in db]
        except:
            curr_db = 0
            count = 0
            db_sizes = [2**i for i in range(1, biggest + 1)]
        new_count = 0
        print "start reading posts"
        q_db = POSTS_DB.find({}, timeout=False)
        print "done reading posts"
        print "start with", curr_db
        for question in q_db:
            if curr_db == max_db_size:
                print "break"
                break
            new_count += 1
            if new_count < count:
                continue
            if db_sizes[curr_db] % 1000 == 0:
                print "BUILD:", curr_db, "I'm Alive, more", db_sizes[
                    curr_db], "togo!"
            snips = get_possible_snippets(question['Id'])
            if snips is None or len(snips) == 0:
                continue
            (db[curr_db])[question['Id']] = snips[0]
            db_sizes = db_sizes[:curr_db] + [e - 1 for e in db_sizes[curr_db:]]
            if db_sizes[curr_db] == 0:
                t = time.time()
                print "find matches for", curr_db, "size is", len(
                    db[curr_db].keys())
                for place, key in enumerate(db[curr_db].keys()):
                    if place % 1000 == 0:
                        print "FIND: I'm Alive", place
                    code = db[curr_db][key][0]
                    res_dict, tokens, q_scores = fast_from_code_to_question(
                        code, ix_main)
                    if all_queries.get(key, None) is None:
                        all_queries[key] = (tokens, res_dict)
                curr_db += 1
                try:
                    print "saved", time.time() - t
                    with open(ALL_FILE + str(curr_db), "wb") as file_h:
                        pickle.dump(all_queries, file_h)
                    with open(DB_FILE + str(curr_db), "wb") as file_h:
                        pickle.dump(db, file_h)
                    with open(START_FROM, "wb") as file_h:
                        pickle.dump((curr_db, new_count, db_sizes), file_h)
                except:
                    print "to much to write"
                print "start", 2**(curr_db + 1)
        q_db.close()
        num = 0
    else:
        print "reading files.."
        t = time.time()
        """with open(ALL_FILE+str(max_db_size), "rb") as file_h:
            all_queries = pickle.load(file_h)
        with open(DB_FILE+str(max_db_size), "rb") as file_h:
            db = pickle.load(file_h)"""
        with open(ALL_FIXED_q, "rb") as file_h:
            all_queries = pickle.load(file_h)
        with open(ALL_FIXED_dbs, "rb") as file_h:
            db = pickle.load(file_h)
        print "done reading", time.time() - t
        print[len(e.keys()) for e in db]

        try:
            with open(ALL_NUM, "rb") as file_h:
                num, found = pickle.load(file_h)
            print "read", num, found
        except:
            num = 0

    curr_num = 0
    print num, len(all_queries.keys())
    for query in all_queries.keys():
        curr_num += 1
        if curr_num < num:
            continue
        if curr_num % 1000 == 0:
            print "MATCHES: I'M Alive!", curr_num, query

        matches = get_matches(query, all_queries[query])
        flag_f = False
        for match in matches:
            if flag_f:
                break
            for i in range(len(db)):
                if match in db[i].keys() and query in db[i].keys():
                    found[i] += 1
                    flag_f = True
                    break

    if curr_num - 1 > num:
        with open(ALL_NUM, "wb") as file_h:
            pickle.dump((curr_num, found), file_h)
    print found
    """
    #saved in _n
    small_db = [0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 8] # 3/5
    small_db = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4] # 4/5
    for i, val in enumerate(small_db):
        try:
            found[i] += val
        except:
            print "shorter db"

    print found"""
    for i in range(len(found) - 1):
        found[i + 1] += found[i]
    print(found)
    for place, i in enumerate([2**i for i in range(1, max_db_size + 1)]):
        ret_val.append(float(found[place]) / i * 100)
    print ret_val
Exemple #45
0
 def setUp(self):
     self.create_temp()
     self.storage = FileStorage(self.tempdir)
     self.storage.create()
Exemple #46
0
class SearchMigrationTest(TestCase, TempDirMixin):
    """Search index migration testing"""
    def setUp(self):
        self.create_temp()
        self.backup = weblate.trans.search.STORAGE
        self.storage = FileStorage(self.tempdir)
        weblate.trans.search.STORAGE = self.storage
        self.storage.create()

    def tearDown(self):
        self.remove_temp()
        weblate.trans.search.STORAGE = self.backup

    def do_test(self, source, target):
        if source is not None:
            self.storage.create_index(source, 'source')
        if target is not None:
            self.storage.create_index(target, 'target-cs')

        sindex = weblate.trans.search.get_source_index()
        self.assertIsNotNone(sindex)
        tindex = weblate.trans.search.get_target_index('cs')
        self.assertIsNotNone(tindex)
        writer = sindex.writer()
        writer.update_document(
            pk=1,
            source="source",
            context="context",
            location="location",
        )
        writer.commit()
        writer = tindex.writer()
        writer.update_document(pk=1, target="target", comment="comment")
        writer.commit()
        for item in ('source', 'context', 'location', 'target'):
            self.assertEqual(fulltext_search(item, ['cs'], {item: True}),
                             set([1]))

    def test_nonexisting(self):
        self.do_test(None, None)

    def test_nonexisting_dir(self):
        shutil.rmtree(self.tempdir)
        self.tempdir = None
        self.do_test(None, None)

    def test_current(self):
        source = weblate.trans.search.SourceSchema
        target = weblate.trans.search.TargetSchema
        self.do_test(source, target)

    def test_2_4(self):
        source = Schema(checksum=ID(stored=True, unique=True),
                        source=TEXT(),
                        context=TEXT(),
                        location=TEXT())
        target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
            comment=TEXT(),
        )
        self.do_test(source, target)

    def test_2_1(self):
        source = Schema(
            checksum=ID(stored=True, unique=True),
            source=TEXT(),
            context=TEXT(),
        )
        target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
        )
        self.do_test(source, target)
 def __init__(self):
     self.schema = Schema(note_id=NUMERIC(stored=True, unique=True), notebook_id=NUMERIC(stored=True), title=TEXT(stored=True, analyzer=ChineseAnalyzer()), snippet=TEXT(analyzer=ChineseAnalyzer()))
     try:
         self.index = FileStorage(config.get("PATH", "notes_index_dir")).open_index()
     except:
         self.index = FileStorage(config.get("PATH", "notes_index_dir")).create_index(self.schema)
Exemple #48
0
 def setUp(self):
     self.create_temp()
     self.backup = weblate.trans.search.STORAGE
     self.storage = FileStorage(self.tempdir)
     weblate.trans.search.STORAGE = self.storage
     self.storage.create()
Exemple #49
0
def media_rebuild():
    print datetime.datetime.now()
    print 'media_rebuild'
    media_db = mysql_new.BaseDB(config.MYSQL_DEFINE_MEDIA)
    schema = Schema(movieid=ID(stored=True, unique=True),
                    title=TEXT(stored=True,
                               analyzer=analyzer_zhongwen,
                               field_boost=2.0),
                    pinyin_title=TEXT(stored=True,
                                      analyzer=analyzer_pinyin,
                                      field_boost=2.0),
                    director=KEYWORD(stored=True),
                    year=NUMERIC(stored=True, sortable=True),
                    score=NUMERIC(stored=True, sortable=True),
                    area=KEYWORD(stored=True),
                    description=TEXT(stored=True, field_boost=1.5),
                    pinyin_description=TEXT(stored=True, field_boost=1.0),
                    actor=KEYWORD(stored=True, field_boost=1.0),
                    pinyin_actor=TEXT(stored=True, field_boost=1.0),
                    genres=KEYWORD(stored=True, field_boost=1.0),
                    pinyin_genres=TEXT(stored=True, field_boost=1.0),
                    type=NUMERIC(stored=True),
                    source=NUMERIC(stored=True))
    SQL = '''SELECT `movieid`, `title`, `type`, `actor`, `genres`, `director`, `douban_score`, `introduction` as description, `year` FROM `media_info` WHERE `status`=1 AND type in ('movie', 'tv', 'teleplay', 'anime')
          '''
    res = media_db.query(SQL, ())
    if not res:
        return
    for info in res:
        if info.get('type') == 'movie':
            info['type'] = 1
        elif info.get('type') == 'teleplay':
            info['type'] = 2
        elif info.get('type') == 'tv':
            info['type'] = 3
        elif info.get('type') == 'anime':
            info['type'] = 4
        else:
            continue
    index_path = os.path.join(config.index_root_dir, 'media')
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    #ix = create_in(index_path, schema=schema)
    storage = FileStorage(index_path)
    ix = storage.open_index()
    writer = ix.writer()
    for info in res:
        pinyin_title = ' '.join(lazy_pinyin(info.get('title').decode('utf8')))
        pinyin_description = ' '.join(
            lazy_pinyin(info.get('description').decode('utf8')))
        pinyin_actor = ''.join(info.get('actor', '').strip().split('/'))
        pinyin_actor = ' '.join(lazy_pinyin(pinyin_actor.decode('utf8')))
        pinyin_genres = ''.join(info.get('genres', '').strip().split('/'))
        pinyin_genres = ' '.join(lazy_pinyin(pinyin_genres.decode('utf8')))
        actor = ';'.join(info.get('actor', '').strip().split('/'))
        area = ';'.join(info.get('area', '').strip().split('/'))
        director = ';'.join(info.get('director', '').strip().split('/'))
        genres = ';'.join(info.get('genres', '').strip().split('/'))

        writer.add_document(movieid=info.get('movieid').decode('utf8'),
                            title=info.get('title').decode('utf8'),
                            pinyin_title=pinyin_title,
                            type=info.get('type'),
                            actor=actor.decode('utf8'),
                            pinyin_actor=pinyin_actor,
                            genres=genres.decode('utf8'),
                            pinyin_genres=pinyin_genres,
                            director=director.decode('utf8'),
                            score=info.get('douban_score'),
                            description=info.get('description').decode('utf8'),
                            pinyin_description=pinyin_description,
                            area=area.decode('utf8'),
                            year=info.get('year'))
    writer.commit(mergetype=writing.CLEAR)
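The final commit(mergetype=writing.CLEAR) discards the existing segments, so after the commit the index contains only the documents added in this writer session; that is what makes this function a full rebuild. A minimal standalone sketch of the same pattern, with a hypothetical path and schema:

from whoosh import writing
from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import FileStorage

storage = FileStorage('/tmp/media_demo')   # hypothetical location
storage.create()
ix = storage.create_index(Schema(movieid=ID(stored=True, unique=True),
                                 title=TEXT(stored=True)))
writer = ix.writer()
writer.add_document(movieid=u'1', title=u'Example movie')
writer.commit(mergetype=writing.CLEAR)     # old segments are dropped on commit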
Exemple #50
0
class WhooshSearchBackend(BaseSearchBackend):
    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\',
        '+',
        '-',
        '&&',
        '||',
        '!',
        '(',
        ')',
        '{',
        '}',
        '[',
        ']',
        '^',
        '"',
        '~',
        '*',
        '?',
        ':',
        '.',
    )

    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias,
                                                  **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        self.post_limit = getattr(connection_options, 'POST_LIMIT',
                                  128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'."
                % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            if getattr(LOCALS, 'RAM_STORE', None) is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[
            self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:

                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            else:
                # Really make sure it's unicode, because Whoosh won't have it any
                # other way.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                # Document boosts aren't supported in Whoosh 2.5.0+.
                if 'boost' in doc:
                    del doc['boost']

                try:
                    writer.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    # We'll log the object identifier but won't include the actual object
                    # to avoid the possibility of that generating encoding errors while
                    # processing the log message:
                    self.log.error(u"%s while preparing object for update" %
                                   e.__class__.__name__,
                                   exc_info=True,
                                   extra={
                                       "data": {
                                           "index": index,
                                           "object": get_identifier(obj)
                                       }
                                   })

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

    def remove(self, obj_or_string, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' %
                                                           (ID, whoosh_id)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Whoosh: %s",
                           whoosh_id,
                           e,
                           exc_info=True)

    def clear(self, models=None, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                self.delete_index()
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append(u"%s:%s" %
                                            (DJANGO_CT, get_model_ct(model)))

                self.index.delete_by_query(
                    q=self.parser.parse(u" OR ".join(models_to_delete)))
        except Exception as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error(
                    "Failed to clear Whoosh index of models '%s': %s",
                    ','.join(models_to_delete),
                    e,
                    exc_info=True)
            else:
                self.log.error("Failed to clear Whoosh index: %s",
                               e,
                               exc_info=True)

    def delete_index(self):
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(self.path):
            shutil.rmtree(self.path)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        # Prevent against Whoosh throwing an error. Requires an end_offset
        # greater than 0.
        if end_offset is not None and end_offset <= 0:
            end_offset = 1

        # Determine the page.
        page_num = 0

        if end_offset is None:
            end_offset = 1000000

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length and page_length > 0:
            page_num = int(start_offset / page_length)

        # Increment because Whoosh uses 1-based page numbers.
        page_num += 1
        return page_num, page_length

    @log_query
    def search(self,
               query_string,
               sort_by=None,
               start_offset=0,
               end_offset=None,
               fields='',
               highlight=False,
               facets=None,
               date_facets=None,
               query_facets=None,
               narrow_queries=None,
               spelling_query=None,
               within=None,
               dwithin=None,
               distance_point=None,
               models=None,
               limit_to_registered_models=None,
               result_class=None,
               **kwargs):
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.",
                          Warning,
                          stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.",
                          Warning,
                          stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.",
                          Warning,
                          stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(
                start_offset, end_offset)

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num,
                                                **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            results = self._process_results(raw_page,
                                            highlight=highlight,
                                            query_string=query_string,
                                            spelling_query=spelling_query,
                                            result_class=result_class)
            searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

            return results
        else:
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(
                        spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(
                        query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        searcher = None
        if self.index.doc_count():
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name,
                                                        top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, 'filter'):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        results = self._process_results(raw_page, result_class=result_class)

        if searcher:
            searcher.close()

        if hasattr(narrow_searcher, 'close'):
            narrow_searcher.close()

        return results

    def _process_results(self,
                         raw_page,
                         highlight=False,
                         query_string='',
                         spelling_query=None,
                         result_class=None):
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ',')
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter('em')
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name), terms,
                        sa, ContextFragmenter(), formatter)
                    additional_fields['highlighted'] = {
                        self.content_field_name: [whoosh_result],
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        spelling_suggestion = None
        reader = self.index.reader()
        corrector = reader.corrector(self.content_field_name)
        cleaned_query = force_text(query_string)

        if not query_string:
            return spelling_suggestion

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = corrector.suggest(word, limit=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        spelling_suggestion = ' '.join(suggested_words)
        return spelling_suggestion

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_text(v) for v in value])
        elif isinstance(value, (six.integer_types, float)):
            # Leave it alone.
            pass
        else:
            value = force_text(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'],
                                date_values['day'], date_values['hour'],
                                date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(
                    converted_value,
                (list, tuple, set, dict, six.integer_types, float, complex)):
                return converted_value
        except:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass

        return value
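To make calculate_page() above concrete: Haystack passes absolute result offsets, while Whoosh wants a 1-based page number plus a page length, so the method divides the start offset by the slice length and adds one. A small worked illustration (not part of the backend):

start_offset, end_offset = 20, 40
page_length = end_offset - start_offset          # 20 results per page
page_num = int(start_offset / page_length) + 1   # 20 // 20 = 1, +1 -> page 2
assert (page_num, page_length) == (2, 20)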
Exemple #51
0
 def setUp(self):
     self.path = tempfile.mkdtemp()
     self.backup = weblate.trans.search.STORAGE
     self.storage = FileStorage(self.path)
     weblate.trans.search.STORAGE = self.storage
     self.storage.create()
Exemple #52
0
from whoosh.fields import SchemaClass, TEXT, NUMERIC
from whoosh.filedb.filestore import FileStorage
from whoosh.writing import AsyncWriter, BufferedWriter
from whoosh import qparser

from django.conf import settings
from django.dispatch import receiver
from django.db.models.signals import post_migrate
from django.db.utils import IntegrityError
from django.utils.encoding import force_text
from django.db import transaction

from weblate.lang.models import Language
from weblate.trans.data import data_dir

STORAGE = FileStorage(data_dir('whoosh'))


class TargetSchema(SchemaClass):
    '''
    Fulltext index schema for target strings.
    '''
    pk = NUMERIC(stored=True, unique=True)
    target = TEXT()
    comment = TEXT()


class SourceSchema(SchemaClass):
    '''
    Fulltext index schema for source and context strings.
    '''
Exemple #53
0
from bson.objectid import ObjectId

# Set up the index: title and content are indexed as text, and tags as keywords.
# Only titles and ids are stored in the index.
ana = StemmingAnalyzer()
schema = Schema(fullname=TEXT(analyzer=ana, spelling=True, stored=True))

# # Create index dir if it does not exists.
# if not os.path.exists("index"):
#     os.mkdir("index")
#
# # Initialize index
# index = create_in("index", schema)

st = FileStorage("index_fullname").create()
index = st.create_index(schema)

# Initiate db connection
# connection = Connection('localhost', 27017)
# db = connection["cozy-home"]
# posts = db.posts
conn = Connection(username="******", password="******")
db = conn["example"]
aql_getLibraries = "FOR library in libraries RETURN library"

posts = db.AQLQuery(aql_getLibraries, rawResults=True, batchSize=10000)
# print(len(posts))
# Fill index with posts from DB
writer = index.writer()
for post in posts:
Exemple #54
0
                category=TEXT(stored=True, analyzer=analyzer),
                owner=TEXT(stored=True))

# Add the documents to be indexed according to the schema definition
# Note: strings must be unicode

# Store the schema information under the 'indexdir' directory
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
indexdir = BASE_DIR + '/indexdir/'

if arg.output[0] != '':
    indexdir = arg.output[0]

if not os.path.exists(indexdir):
    os.mkdir(indexdir)
    storage = FileStorage(indexdir)
    ix = create_in(indexdir, schema)
else:
    ix = index.open_dir(indexdir)
    # ix = create_in(indexdir, schema)
    print("open")

# write index
writer = ix.writer()

for k, v in to_update.items():

    writer.update_document(
        guid="github_%s" % v['repo_url'],
        source_type='g0v-repos',
        title=v['repo_name'],
Exemple #55
0
 def storage(self):
     return FileStorage(data_dir(self.LOCATION))
    def ranked_set_baseline(self,basefile):
        # Helper from http://effbot.org/zone/element-lib.htm
        def flatten(elem, include_tail=0):
            text = elem.text or ""
            for e in elem:
                text += flatten(e, 1)
                if include_tail and elem.tail: text += elem.tail
            return text
        # step 1: Create a temporary whoosh index in order to find out
        # the most significant words for each article

        ana = analysis.StandardAnalyzer()
        # ana = analysis.StemmingAnalyzer()
        vectorformat = formats.Frequency(ana)
        schema = fields.Schema(article=fields.ID(unique=True),
                               title=fields.TEXT(stored=True),
                               content=fields.TEXT(analyzer=ana,
                                                   vector=vectorformat))

        st = RamStorage()
        tmpidx = st.create_index(schema)
        w = tmpidx.writer()

        XHT_NS = "{http://www.w3.org/1999/xhtml}"
        tree = ET.parse(self.parsed_path(basefile))
        els = tree.findall("//"+XHT_NS+"div")
        articles = []
        for el in els:
            if 'typeof' in el.attrib and el.attrib['typeof'] == "eurlex:Article":
                text = Util.normalizeSpace(flatten(el))
                article = unicode(el.attrib['id'][1:])
                articles.append(article)
                w.update_document(article=article,title="Article "+ article,content=text)

        w.commit()
        self.log.info("Indexed %d articles" % len(articles))

        # Step 2: Open the large whoosh index containing the text of
        # all cases. Then, for each article, use the 20 most distinctive terms
        # (filtering away numbers) to create a query against that index

        # things to vary:
        # * numterms
        # * connector (AND or OR)
        # * scoring (weighting=scoring.Cosine())
        numterms = 5
        connector = " AND "
        indexdir = os.path.sep.join([self.config['datadir'],'ecj','index'])
        storage = FileStorage(indexdir)
        idx = storage.open_index()
        searcher = idx.searcher(weighting=scoring.BM25F())

        tempsearch = tmpidx.searcher()

        rankedset = {}
        
        for article in articles:
            rankedset[article] = []
            r = tempsearch.search(query.Term("article",article))
            terms = [t[0] for t in r.key_terms("content", numterms=numterms+1) if not t[0].isdigit()][:numterms]
            print "Article %s:%r" % (article, terms)
            parser = qparser.QueryParser("content")
            q = parser.parse(connector.join(terms))
            results = searcher.search(q, limit=10)
            resultidx = 0
            for result in results:
                reslbl = "%s (%s)"%(result['title'],results.score(resultidx))
                rankedset[article].append([result['basefile'],reslbl])
                print u"\t%s (%s)" % (result['title'], results.score(resultidx))
                resultidx += 1

        return rankedset
Exemple #57
0
class Indexer(RaftNode):
    def __init__(self,
                 host='localhost',
                 port=7070,
                 seed_addr=None,
                 conf=SyncObjConf(),
                 data_dir='/tmp/cockatrice/index',
                 grpc_port=5050,
                 grpc_max_workers=10,
                 http_port=8080,
                 logger=getLogger(),
                 http_logger=getLogger(),
                 metrics_registry=CollectorRegistry()):

        self.__host = host
        self.__port = port
        self.__seed_addr = seed_addr
        self.__conf = conf
        self.__data_dir = data_dir
        self.__grpc_port = grpc_port
        self.__grpc_max_workers = grpc_max_workers
        self.__http_port = http_port
        self.__logger = logger
        self.__http_logger = http_logger
        self.__metrics_registry = metrics_registry

        # metrics
        self.__metrics_core_documents = Gauge(
            '{0}_indexer_index_documents'.format(NAME),
            'The number of documents.', [
                'index_name',
            ],
            registry=self.__metrics_registry)
        self.__metrics_requests_total = Counter(
            '{0}_indexer_requests_total'.format(NAME),
            'The number of requests.', ['func'],
            registry=self.__metrics_registry)
        self.__metrics_requests_duration_seconds = Histogram(
            '{0}_indexer_requests_duration_seconds'.format(NAME),
            'The invocation duration in seconds.', ['func'],
            registry=self.__metrics_registry)

        self.__self_addr = '{0}:{1}'.format(self.__host, self.__port)
        self.__peer_addrs = [] if self.__seed_addr is None else get_peers(
            bind_addr=self.__seed_addr, timeout=10)
        self.__other_addrs = [
            peer_addr for peer_addr in self.__peer_addrs
            if peer_addr != self.__self_addr
        ]
        self.__conf.serializer = self.__serialize
        self.__conf.deserializer = self.__deserialize
        self.__conf.validate()

        self.__indices = {}
        self.__index_configs = {}
        self.__writers = {}
        self.__auto_commit_timers = {}

        self.__lock = RLock()

        # create data dir
        os.makedirs(self.__data_dir, exist_ok=True)
        self.__file_storage = FileStorage(self.__data_dir,
                                          supports_mmap=True,
                                          readonly=False,
                                          debug=False)
        self.__ram_storage = RamStorage()

        # if seed addr specified and self node does not exist in the cluster, add self node to the cluster
        if self.__seed_addr is not None and self.__self_addr not in self.__peer_addrs:
            Thread(target=add_node,
                   kwargs={
                       'node_name': self.__self_addr,
                       'bind_addr': self.__seed_addr,
                       'timeout': 10
                   }).start()

        # copy snapshot from the leader node
        if self.__seed_addr is not None:
            try:
                metadata = get_metadata(bind_addr=get_leader(
                    bind_addr=self.__seed_addr, timeout=10),
                                        timeout=10)
                response = requests.get('http://{0}/snapshot'.format(
                    metadata['http_addr']))
                if response.status_code == HTTPStatus.OK:
                    with open(self.__conf.fullDumpFile, 'wb') as f:
                        f.write(response.content)
            except Exception as ex:
                self.__logger.error('failed to copy snapshot: {0}'.format(ex))

        # start node
        metadata = {
            'grpc_addr': '{0}:{1}'.format(self.__host, self.__grpc_port),
            'http_addr': '{0}:{1}'.format(self.__host, self.__http_port)
        }
        self.__logger.info('starting raft state machine')
        super(Indexer, self).__init__(self.__self_addr,
                                      self.__peer_addrs,
                                      conf=self.__conf,
                                      metadata=metadata)
        self.__logger.info('raft state machine has started')

        if os.path.exists(self.__conf.fullDumpFile):
            self.__logger.debug('snapshot exists: {0}'.format(
                self.__conf.fullDumpFile))

        while not self.isReady():
            # recovering data
            self.__logger.debug('waiting for cluster ready')
            self.__logger.debug(self.getStatus())
            time.sleep(1)
        self.__logger.info('cluster ready')
        self.__logger.debug(self.getStatus())

        # open existing indices on startup
        for index_name in self.get_index_names():
            self.__open_index(index_name, index_config=None)

        # record index metrics timer
        self.metrics_timer = Timer(10, self.__record_index_metrics)
        self.metrics_timer.start()

        # start gRPC
        self.__grpc_server = grpc.server(
            futures.ThreadPoolExecutor(max_workers=self.__grpc_max_workers))
        add_IndexServicer_to_server(
            IndexGRPCServicer(self,
                              logger=self.__logger,
                              metrics_registry=self.__metrics_registry),
            self.__grpc_server)
        self.__grpc_server.add_insecure_port('{0}:{1}'.format(
            self.__host, self.__grpc_port))
        self.__grpc_server.start()
        self.__logger.info('gRPC server has started')

        # start HTTP server
        self.__http_servicer = IndexHTTPServicer(self, self.__logger,
                                                 self.__http_logger,
                                                 self.__metrics_registry)
        self.__http_server = HTTPServer(self.__host, self.__http_port,
                                        self.__http_servicer)
        self.__http_server.start()
        self.__logger.info('HTTP server has started')

        self.__logger.info('indexer has started')

    def stop(self):
        # stop HTTP server
        self.__http_server.stop()
        self.__logger.info('HTTP server has stopped')

        # stop gRPC server
        self.__grpc_server.stop(grace=0.0)
        self.__logger.info('gRPC server has stopped')

        self.metrics_timer.cancel()

        # close indices
        for index_name in list(self.__indices.keys()):
            self.__close_index(index_name)

        self.destroy()

        self.__logger.info('index core has stopped')

    def __record_index_metrics(self):
        for index_name in list(self.__indices.keys()):
            try:
                self.__metrics_core_documents.labels(
                    index_name=index_name).set(self.get_doc_count(index_name))
            except Exception as ex:
                self.__logger.error(ex)

    def __record_metrics(self, start_time, func_name):
        self.__metrics_requests_total.labels(func=func_name).inc()

        self.__metrics_requests_duration_seconds.labels(
            func=func_name).observe(time.time() - start_time)

    # def __serialize_indices(self, filename):
    #     with self.__lock:
    #         try:
    #             self.__logger.info('starting serialize indices')
    #
    #         except Exception as ex:
    #             self.__logger.error('failed to create snapshot: {0}'.format(ex))
    #         finally:
    #             self.__logger.info('serialize indices has finished')

    # def __serialize_raft_data(self, filename, raft_data):
    #     with self.__lock:
    #         pass

    # index serializer
    def __serialize(self, filename, raft_data):
        with self.__lock:
            try:
                self.__logger.debug('serializer has started')

                # store the index files and raft logs to the snapshot file
                with zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED) as f:
                    for index_name in self.get_index_names():
                        self.__commit_index(index_name)

                        # with self.__get_writer(index_name).writelock:
                        # with self.__indices[index_name].lock('WRITELOCK'):
                        # index files
                        for index_filename in self.get_index_files(index_name):
                            if self.__index_configs.get(
                                    index_name).get_storage_type() == "ram":
                                with self.__ram_storage.open_file(
                                        index_filename) as r:
                                    f.writestr(index_filename, r.read())
                            else:
                                f.write(
                                    os.path.join(self.__file_storage.folder,
                                                 index_filename),
                                    index_filename)
                            self.__logger.debug('{0} has been stored in {1}'.format(
                                index_filename, filename))

                        # index config file
                        f.write(
                            os.path.join(
                                self.__file_storage.folder,
                                self.get_index_config_file(index_name)),
                            self.get_index_config_file(index_name))
                        self.__logger.debug('{0} has been stored in {1}'.format(
                            self.get_index_config_file(index_name), filename))

                    # store the raft data
                    f.writestr(RAFT_DATA_FILE, pickle.dumps(raft_data))
                    self.__logger.debug(
                        '{0} has been stored'.format(RAFT_DATA_FILE))
                self.__logger.debug('snapshot has been created')
            except Exception as ex:
                self.__logger.error(
                    'failed to create snapshot: {0}'.format(ex))
            finally:
                self.__logger.debug('serializer has stopped')

    # index deserializer
    def __deserialize(self, filename):
        with self.__lock:
            try:
                self.__logger.debug('deserializer has started')

                with zipfile.ZipFile(filename, 'r') as zf:
                    # get file names in snapshot file
                    filenames = list(zf.namelist())

                    # get index names in snapshot file
                    index_names = []
                    pattern_toc = re.compile(r'^_(.+)_\d+\.toc$')
                    for f in filenames:
                        match = pattern_toc.search(f)
                        if match and match.group(1) not in index_names:
                            index_names.append(match.group(1))

                    for index_name in index_names:
                        # extract the index config first
                        zf.extract(self.get_index_config_file(index_name),
                                   path=self.__file_storage.folder)
                        index_config = pickle.loads(
                            zf.read(self.get_index_config_file(index_name)))

                        # get index files
                        pattern_toc = re.compile(r'^_{0}_(\d+)\..+$'.format(
                            index_name))  # ex) _myindex_0.toc
                        pattern_seg = re.compile(
                            r'^{0}_([a-z0-9]+)\..+$'.format(index_name)
                        )  # ex) myindex_zseabukc2nbpvh0u.seg
                        pattern_lock = re.compile(r'^{0}_WRITELOCK$'.format(
                            index_name))  # ex) myindex_WRITELOCK
                        index_files = []
                        for file_name in filenames:
                            if re.match(pattern_toc, file_name):
                                index_files.append(file_name)
                            elif re.match(pattern_seg, file_name):
                                index_files.append(file_name)
                            elif re.match(pattern_lock, file_name):
                                index_files.append(file_name)

                        # extract the index files
                        for index_file in index_files:
                            if index_config.get_storage_type() == 'ram':
                                with self.__ram_storage.create_file(
                                        index_file) as r:
                                    r.write(zf.read(index_file))
                            else:
                                zf.extract(index_file,
                                           path=self.__file_storage.folder)

                            self.__logger.debug(
                                '{0} has been restored from {1}'.format(
                                    index_file, filename))

                        self.__logger.debug(
                            '{0} has been restored'.format(index_name))

                    # extract the raft data
                    raft_data = pickle.loads(zf.read(RAFT_DATA_FILE))
                    self.__logger.debug(
                        '{0} has been restored'.format(RAFT_DATA_FILE))
                    return raft_data
            except Exception as ex:
                self.__logger.error(
                    'failed to restore indices: {0}'.format(ex))
            finally:
                self.__logger.debug('deserializer has stopped')

    def is_healthy(self):
        return self.isHealthy()

    def is_alive(self):
        return self.isAlive()

    def is_ready(self):
        return self.isReady()

    def get_addr(self):
        return self.__self_addr

    def get_index_files(self, index_name):
        index_files = []

        pattern_toc = re.compile(
            r'^_{0}_(\d+)\..+$'.format(index_name))  # ex) _myindex_0.toc
        pattern_seg = re.compile(r'^{0}_([a-z0-9]+)\..+$'.format(
            index_name))  # ex) myindex_zseabukc2nbpvh0u.seg
        pattern_lock = re.compile(
            r'^{0}_WRITELOCK$'.format(index_name))  # ex) myindex_WRITELOCK

        if self.__index_configs.get(index_name).get_storage_type() == "ram":
            storage = self.__ram_storage
        else:
            storage = self.__file_storage

        for file_name in list(storage.list()):
            if re.match(pattern_toc, file_name):
                index_files.append(file_name)
            elif re.match(pattern_seg, file_name):
                index_files.append(file_name)
            elif re.match(pattern_lock, file_name):
                index_files.append(file_name)

        return index_files

    @staticmethod
    def get_index_config_file(index_name):
        return '{0}_CONFIG'.format(index_name)

    def get_index_names(self):
        index_names = []

        pattern_toc = re.compile(r'^_(.+)_\d+\.toc$')

        for filename in list(self.__file_storage.list()):
            match = pattern_toc.search(filename)
            if match and match.group(1) not in index_names:
                index_names.append(match.group(1))
        for filename in list(self.__ram_storage.list()):
            match = pattern_toc.search(filename)
            if match and match.group(1) not in index_names:
                index_names.append(match.group(1))

        return index_names

    def is_index_exist(self, index_name):
        return self.__file_storage.index_exists(
            indexname=index_name) or self.__ram_storage.index_exists(
                indexname=index_name)

    def is_index_open(self, index_name):
        return index_name in self.__indices

    @replicated
    def open_index(self, index_name, index_config=None):
        return self.__open_index(index_name, index_config=index_config)

    def __open_index(self, index_name, index_config=None):
        start_time = time.time()

        index = None

        try:
            # open the index
            index = self.__indices.get(index_name)
            if index is None:
                self.__logger.debug('opening {0}'.format(index_name))

                if index_config is None:
                    # set saved index config
                    with open(
                            os.path.join(
                                self.__file_storage.folder,
                                self.get_index_config_file(index_name)),
                            'rb') as f:
                        self.__index_configs[index_name] = pickle.loads(
                            f.read())
                else:
                    # set given index config
                    self.__index_configs[index_name] = index_config

                if self.__index_configs[index_name].get_storage_type(
                ) == 'ram':
                    index = self.__ram_storage.open_index(
                        indexname=index_name,
                        schema=self.__index_configs[index_name].get_schema())
                else:
                    index = self.__file_storage.open_index(
                        indexname=index_name,
                        schema=self.__index_configs[index_name].get_schema())
                self.__indices[index_name] = index

                self.__logger.info('{0} has opened'.format(index_name))

                # open the index writer
                self.__open_writer(index_name)
        except Exception as ex:
            self.__logger.error('failed to open {0}: {1}'.format(
                index_name, ex))
        finally:
            self.__record_metrics(start_time, 'open_index')

        return index

    @replicated
    def close_index(self, index_name):
        return self.__close_index(index_name)

    def __close_index(self, index_name):
        start_time = time.time()

        index = None

        try:
            # close the index writer
            self.__close_writer(index_name)

            # close the index
            index = self.__indices.pop(index_name)
            if index is not None:
                self.__logger.debug('closing {0}'.format(index_name))
                index.close()
                self.__logger.info('{0} has closed'.format(index_name))
        except Exception as ex:
            self.__logger.error('failed to close {0}: {1}'.format(
                index_name, ex))
        finally:
            self.__record_metrics(start_time, 'close_index')

        return index

    @replicated
    def create_index(self, index_name, index_config):
        return self.__create_index(index_name, index_config)

    def __create_index(self, index_name, index_config):
        if self.is_index_exist(index_name):
            # open the index
            return self.__open_index(index_name, index_config=index_config)

        start_time = time.time()

        index = None

        with self.__lock:
            try:
                self.__logger.debug('creating {0}'.format(index_name))

                # set index config
                self.__index_configs[index_name] = index_config

                self.__logger.debug(
                    self.__index_configs[index_name].get_storage_type())

                # create the index
                if self.__index_configs[index_name].get_storage_type(
                ) == 'ram':
                    index = self.__ram_storage.create_index(
                        self.__index_configs[index_name].get_schema(),
                        indexname=index_name)
                else:
                    index = self.__file_storage.create_index(
                        self.__index_configs[index_name].get_schema(),
                        indexname=index_name)
                self.__indices[index_name] = index
                self.__logger.info('{0} has been created'.format(index_name))

                # save the index config
                with open(
                        os.path.join(self.__file_storage.folder,
                                     self.get_index_config_file(index_name)),
                        'wb') as f:
                    f.write(pickle.dumps(index_config))

                # open the index writer
                self.__open_writer(index_name)
            except Exception as ex:
                self.__logger.error('failed to create {0}: {1}'.format(
                    index_name, ex))
            finally:
                self.__record_metrics(start_time, 'create_index')

        return index

    @replicated
    def delete_index(self, index_name):
        return self.__delete_index(index_name)

    def __delete_index(self, index_name):
        # close index
        index = self.__close_index(index_name)

        start_time = time.time()

        with self.__lock:
            try:
                self.__logger.debug('deleting {0}'.format(index_name))

                # delete index files
                for filename in self.get_index_files(index_name):
                    self.__file_storage.delete_file(filename)
                    self.__logger.debug('{0} was deleted'.format(filename))

                self.__logger.info('{0} has been deleted'.format(index_name))

                # delete the index config
                self.__index_configs.pop(index_name, None)
                os.remove(
                    os.path.join(self.__file_storage.folder,
                                 self.get_index_config_file(index_name)))
            except Exception as ex:
                self.__logger.error('failed to delete {0}: {1}'.format(
                    index_name, ex))
            finally:
                self.__record_metrics(start_time, 'delete_index')

        return index

    def get_index(self, index_name):
        return self.__get_index(index_name)

    def __get_index(self, index_name):
        start_time = time.time()

        try:
            index = self.__indices.get(index_name)
        except Exception as ex:
            raise ex
        finally:
            self.__record_metrics(start_time, 'get_index')

        return index

    def __start_auto_commit_timer(self, index_name, period):
        timer = self.__auto_commit_timers.get(index_name, None)
        if timer is None:
            self.__auto_commit_timers[index_name] = threading.Timer(
                period,
                self.__auto_commit_index,
                kwargs={
                    'index_name': index_name,
                    'period': period
                })
            self.__auto_commit_timers[index_name].start()
            self.__logger.debug(
                'auto commit timer for {0} was started'.format(index_name))

    def __stop_auto_commit_timer(self, index_name):
        timer = self.__auto_commit_timers.pop(index_name, None)
        if timer is not None:
            timer.cancel()
            self.__logger.debug(
                'auto commit timer for {0} was stopped'.format(index_name))

    def __auto_commit_index(self, index_name, period):
        self.__stop_auto_commit_timer(index_name)
        self.__commit_index(index_name)
        self.__start_auto_commit_timer(index_name, period=period)

    def __open_writer(self, index_name):
        writer = None

        try:
            writer = self.__writers.get(index_name, None)
            if writer is None or writer.is_closed:
                self.__logger.debug(
                    'opening writer for {0}'.format(index_name))
                writer = self.__indices.get(index_name).writer()
                self.__writers[index_name] = writer
                self.__logger.debug(
                    'writer for {0} has opened'.format(index_name))

                self.__start_auto_commit_timer(
                    index_name,
                    period=self.__index_configs.get(
                        index_name).get_writer_auto_commit_period())
        except Exception as ex:
            self.__logger.error('failed to open writer for {0}: {1}'.format(
                index_name, ex))

        return writer

    def __close_writer(self, index_name):
        writer = None

        try:
            self.__stop_auto_commit_timer(index_name)

            # close the index
            writer = self.__writers.pop(index_name, None)
            if writer is not None:
                self.__logger.debug(
                    'closing writer for {0}'.format(index_name))
                writer.commit()
                self.__logger.debug(
                    'writer for {0} has closed'.format(index_name))
        except Exception as ex:
            self.__logger.error('failed to close writer for {0}: {1}'.format(
                index_name, ex))

        return writer

    def __get_writer(self, index_name):
        return self.__writers.get(index_name, None)

    def __get_searcher(self, index_name, weighting=None):
        try:
            if weighting is None:
                searcher = self.__indices.get(index_name).searcher()
            else:
                searcher = self.__indices.get(index_name).searcher(
                    weighting=weighting)
        except Exception as ex:
            raise ex

        return searcher

    @replicated
    def commit_index(self, index_name):
        return self.__commit_index(index_name)

    def __commit_index(self, index_name):
        start_time = time.time()

        success = False

        with self.__lock:
            try:
                self.__logger.debug('committing {0}'.format(index_name))

                self.__get_writer(index_name).commit()
                self.__open_writer(index_name)  # reopen writer

                self.__logger.info('{0} has been committed'.format(index_name))

                success = True
            except Exception as ex:
                self.__logger.error('failed to commit index {0}: {1}'.format(
                    index_name, ex))
            finally:
                self.__record_metrics(start_time, 'commit_index')

        return success

    @replicated
    def rollback_index(self, index_name):
        return self.__rollback_index(index_name)

    def __rollback_index(self, index_name):
        start_time = time.time()

        success = False

        with self.__lock:
            try:
                self.__logger.debug('rolling back {0}'.format(index_name))

                self.__get_writer(index_name).cancel()
                self.__open_writer(index_name)  # reopen writer

                self.__logger.info('{0} has rolled back'.format(index_name))

                success = True
            except Exception as ex:
                self.__logger.error('failed to rollback index {0}: {1}'.format(
                    index_name, ex))
            finally:
                self.__record_metrics(start_time, 'rollback_index')

        return success

    @replicated
    def optimize_index(self, index_name):
        return self.__optimize_index(index_name)

    def __optimize_index(self, index_name):
        start_time = time.time()

        success = False

        with self.__lock:
            try:
                self.__logger.debug('optimizing {0}'.format(index_name))

                self.__get_writer(index_name).commit(optimize=True,
                                                     merge=False)
                self.__open_writer(index_name)  # reopen writer

                self.__logger.info('{0} has been optimized'.format(index_name))

                success = True
            except Exception as ex:
                self.__logger.error('failed to optimize {0}: {1}'.format(
                    index_name, ex))
            finally:
                self.__record_metrics(start_time, 'optimize_index')

        return success

    def get_doc_count(self, index_name):
        try:
            cnt = self.__indices.get(index_name).doc_count()
        except Exception as ex:
            raise ex

        return cnt

    def get_schema(self, index_name):
        try:
            schema = self.__indices.get(index_name).schema
        except Exception as ex:
            raise ex

        return schema

    @replicated
    def put_document(self, index_name, doc_id, fields):
        return self.__put_document(index_name, doc_id, fields)

    def __put_document(self, index_name, doc_id, fields):
        doc = copy.deepcopy(fields)
        doc[self.__index_configs.get(index_name).get_doc_id_field()] = doc_id

        return self.__put_documents(index_name, [doc])

    @replicated
    def put_documents(self, index_name, docs):
        return self.__put_documents(index_name, docs)

    def __put_documents(self, index_name, docs):
        start_time = time.time()

        with self.__lock:
            try:
                self.__logger.debug(
                    'putting documents to {0}'.format(index_name))

                # count = self.__get_writer(index_name).update_documents(docs)

                count = 0
                for doc in docs:
                    self.__get_writer(index_name).update_document(**doc)
                    count += 1

                self.__logger.info('{0} documents have been put into {1}'.format(
                    count, index_name))
            except Exception as ex:
                self.__logger.error(
                    'failed to put documents to {0}: {1}'.format(
                        index_name, ex))
                count = -1
            finally:
                self.__record_metrics(start_time, 'put_documents')

        return count

    def get_document(self, index_name, doc_id):
        try:
            results_page = self.search_documents(
                index_name,
                doc_id,
                self.__index_configs.get(index_name).get_doc_id_field(),
                1,
                page_len=1)
            if results_page.total > 0:
                self.__logger.debug('{0} was got from {1}'.format(
                    doc_id, index_name))
            else:
                self.__logger.debug('{0} did not exist in {1}'.format(
                    doc_id, index_name))
        except Exception as ex:
            raise ex

        return results_page

    @replicated
    def delete_document(self, index_name, doc_id):
        return self.__delete_document(index_name, doc_id)

    def __delete_document(self, index_name, doc_id):
        return self.__delete_documents(index_name, [doc_id])

    @replicated
    def delete_documents(self, index_name, doc_ids):
        return self.__delete_documents(index_name, doc_ids)

    def __delete_documents(self, index_name, doc_ids):
        start_time = time.time()

        with self.__lock:
            try:
                self.__logger.debug(
                    'deleting documents from {0}'.format(index_name))

                # count = self.__get_writer(index_name).delete_documents(doc_ids, doc_id_field=self.__index_configs.get(
                #     index_name).get_doc_id_field())

                count = 0
                for doc_id in doc_ids:
                    count += self.__get_writer(index_name).delete_by_term(
                        self.__index_configs.get(
                            index_name).get_doc_id_field(), doc_id)

                self.__logger.info('{0} documents have been deleted from {1}'.format(
                    count, index_name))
            except Exception as ex:
                self.__logger.error(
                    'failed to delete documents from {0}: {1}'.format(
                        index_name, ex))
                count = -1
            finally:
                self.__record_metrics(start_time, 'delete_documents')

        return count

    def search_documents(self,
                         index_name,
                         query,
                         search_field,
                         page_num,
                         page_len=10,
                         weighting=None,
                         **kwargs):
        start_time = time.time()

        try:
            searcher = self.__get_searcher(index_name, weighting=weighting)
            query_parser = QueryParser(search_field,
                                       self.get_schema(index_name))
            query_obj = query_parser.parse(query)
            results_page = searcher.search_page(query_obj,
                                                page_num,
                                                pagelen=page_len,
                                                **kwargs)
            self.__logger.info('{0} documents were found in {1}'.format(
                results_page.total, index_name))
        except Exception as ex:
            raise ex
        finally:
            self.__record_metrics(start_time, 'search_documents')

        return results_page

    @replicated
    def create_snapshot(self):
        self.__create_snapshot()

    def __create_snapshot(self):
        self.forceLogCompaction()

    def get_snapshot_file_name(self):
        return self.__conf.fullDumpFile

    def is_snapshot_exist(self):
        return os.path.exists(self.get_snapshot_file_name())

    def open_snapshot_file(self):
        with self.__lock:
            try:
                file = open(self.get_snapshot_file_name(), mode='rb')
            except Exception as ex:
                raise ex

        return file
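The serializer and deserializer above pack every Whoosh index file (the _<name>_N.toc table of contents, the <name>_<id>.seg segments and the <name>_WRITELOCK file) plus the Raft data into one zip snapshot, and unpack them again on recovery. The sketch below isolates just the storage round-trip for a file-backed index, without the Raft machinery; the directory and index names are placeholders.

# Hedged sketch of the snapshot round-trip used by __serialize/__deserialize,
# reduced to plain Whoosh storage. "snapshot.zip" and "myindex" are placeholders.
import os
import zipfile
from whoosh.filedb.filestore import FileStorage

def snapshot_index(storage_dir, snapshot_path):
    # Write every file managed by the storage into a compressed archive.
    storage = FileStorage(storage_dir)
    with zipfile.ZipFile(snapshot_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for file_name in storage.list():
            zf.write(os.path.join(storage.folder, file_name), file_name)

def restore_index(snapshot_path, storage_dir, index_name='myindex'):
    # Extract the archive into a fresh directory and reopen the index from it.
    os.makedirs(storage_dir, exist_ok=True)
    with zipfile.ZipFile(snapshot_path, 'r') as zf:
        zf.extractall(path=storage_dir)
    return FileStorage(storage_dir).open_index(indexname=index_name)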
parser.add_argument("-limit", type=int, default=1000)
parser.add_argument("-binary", action="store_true")
parser.add_argument("-add_bm25", action="store_true")
args = parser.parse_args()

from singletons import PREPROCESS, SEARCHER

run_id = args.model
if args.model == "clusvm":
    clusvm = util.load(f"clusvm.pkl")
elif "sv" in args.model:
    svm = util.load(f"{args.svm_file}.pkl")
elif args.model == "adarank":
    alpha = np.load("ada.npy")
else:
    ix = FileStorage("data/msmarcoidx").open_index()
    if args.model == "bm25":
        SEARCHER = ix.searcher()
    qp = QueryParser("body", schema=ix.schema)


def predict(inp):
    qid, query = inp
    ret = []

    if args.model == "okapi" or args.model == "bm25":
        results = SEARCHER.search(qp.parse(query), limit=args.limit)
        for rank, hit in enumerate(results):
            ret.append([qid, hit["docid"], rank + 1, results.score(rank), run_id])

    elif args.model == "clusvm":
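This fragment dispatches each query to whichever ranking model was selected on the command line; the plain BM25 path parses the query against the "body" field and collects (qid, docid, rank, score, run_id) rows. A self-contained version of that path, with the index location and field names carried over from the snippet as assumptions, might look like this:

# Hedged sketch of the BM25 branch: rank documents for a single query and
# return run-file style rows. The index path and field names are assumptions.
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import QueryParser

ix = FileStorage("data/msmarcoidx").open_index()
qp = QueryParser("body", schema=ix.schema)

def rank_query(qid, query_text, limit=1000, run_id="bm25"):
    rows = []
    with ix.searcher() as searcher:
        results = searcher.search(qp.parse(query_text), limit=limit)
        for rank, hit in enumerate(results):
            rows.append((qid, hit["docid"], rank + 1, results.score(rank), run_id))
    return rows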
Exemple #59
0
        
        >>> thesaurus.synonyms("hail")
        ['acclaim', 'come', 'herald']
        """

        word = word.lower()
        if self.searcher:
            return self.searcher.document(word=word)["syns"]
        else:
            return synonyms(self.w2n, self.n2w, word)


if __name__ == "__main__":
    from time import clock
    from whoosh.filedb.filestore import FileStorage
    st = FileStorage("c:/testindex")

    #    t = clock()
    #    th = Thesaurus.from_filename("c:/wordnet/wn_s.pl")
    #    print clock() - t
    #
    #    t = clock()
    #    th.to_storage(st)
    #    print clock() - t
    #
    #    t = clock()
    #    print th.synonyms("light")
    #    print clock() - t

    t = clock()
    th = Thesaurus.from_storage(st)
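Exemple #59 times loading a WordNet thesaurus from a Whoosh index built earlier (the commented-out lines show the build step). A small end-to-end sketch of that flow follows; the file paths are placeholders, and time.perf_counter stands in for time.clock, which no longer exists in current Python.

# Hedged sketch: build a WordNet thesaurus index once, then query it.
# "wn_s.pl" and "thesaurus_index" are placeholder paths.
import os
from time import perf_counter
from whoosh.filedb.filestore import FileStorage
from whoosh.lang.wordnet import Thesaurus

os.makedirs("thesaurus_index", exist_ok=True)
st = FileStorage("thesaurus_index")

t = perf_counter()
th = Thesaurus.from_filename("wn_s.pl")  # parse the WordNet prolog source
th.to_storage(st)                        # persist it as a Whoosh index
print("built in", perf_counter() - t, "seconds")

t = perf_counter()
th = Thesaurus.from_storage(st)          # fast path: reopen from the index
print(th.synonyms("hail"), perf_counter() - t)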
Exemple #60
0
    def __init__(self,
                 host='localhost',
                 port=7070,
                 seed_addr=None,
                 conf=SyncObjConf(),
                 data_dir='/tmp/cockatrice/index',
                 grpc_port=5050,
                 grpc_max_workers=10,
                 http_port=8080,
                 logger=getLogger(),
                 http_logger=getLogger(),
                 metrics_registry=CollectorRegistry()):

        self.__host = host
        self.__port = port
        self.__seed_addr = seed_addr
        self.__conf = conf
        self.__data_dir = data_dir
        self.__grpc_port = grpc_port
        self.__grpc_max_workers = grpc_max_workers
        self.__http_port = http_port
        self.__logger = logger
        self.__http_logger = http_logger
        self.__metrics_registry = metrics_registry

        # metrics
        self.__metrics_core_documents = Gauge(
            '{0}_indexer_index_documents'.format(NAME),
            'The number of documents.', [
                'index_name',
            ],
            registry=self.__metrics_registry)
        self.__metrics_requests_total = Counter(
            '{0}_indexer_requests_total'.format(NAME),
            'The number of requests.', ['func'],
            registry=self.__metrics_registry)
        self.__metrics_requests_duration_seconds = Histogram(
            '{0}_indexer_requests_duration_seconds'.format(NAME),
            'The invocation duration in seconds.', ['func'],
            registry=self.__metrics_registry)

        self.__self_addr = '{0}:{1}'.format(self.__host, self.__port)
        self.__peer_addrs = [] if self.__seed_addr is None else get_peers(
            bind_addr=self.__seed_addr, timeout=10)
        self.__other_addrs = [
            peer_addr for peer_addr in self.__peer_addrs
            if peer_addr != self.__self_addr
        ]
        self.__conf.serializer = self.__serialize
        self.__conf.deserializer = self.__deserialize
        self.__conf.validate()

        self.__indices = {}
        self.__index_configs = {}
        self.__writers = {}
        self.__auto_commit_timers = {}

        self.__lock = RLock()

        # create data dir
        os.makedirs(self.__data_dir, exist_ok=True)
        self.__file_storage = FileStorage(self.__data_dir,
                                          supports_mmap=True,
                                          readonly=False,
                                          debug=False)
        self.__ram_storage = RamStorage()

        # if seed addr specified and self node does not exist in the cluster, add self node to the cluster
        if self.__seed_addr is not None and self.__self_addr not in self.__peer_addrs:
            Thread(target=add_node,
                   kwargs={
                       'node_name': self.__self_addr,
                       'bind_addr': self.__seed_addr,
                       'timeout': 10
                   }).start()

        # copy snapshot from the leader node
        if self.__seed_addr is not None:
            try:
                metadata = get_metadata(bind_addr=get_leader(
                    bind_addr=self.__seed_addr, timeout=10),
                                        timeout=10)
                response = requests.get('http://{0}/snapshot'.format(
                    metadata['http_addr']))
                if response.status_code == HTTPStatus.OK:
                    with open(self.__conf.fullDumpFile, 'wb') as f:
                        f.write(response.content)
            except Exception as ex:
                self.__logger.error('failed to copy snapshot: {0}'.format(ex))

        # start node
        metadata = {
            'grpc_addr': '{0}:{1}'.format(self.__host, self.__grpc_port),
            'http_addr': '{0}:{1}'.format(self.__host, self.__http_port)
        }
        self.__logger.info('starting raft state machine')
        super(Indexer, self).__init__(self.__self_addr,
                                      self.__peer_addrs,
                                      conf=self.__conf,
                                      metadata=metadata)
        self.__logger.info('raft state machine has started')

        if os.path.exists(self.__conf.fullDumpFile):
            self.__logger.debug('snapshot exists: {0}'.format(
                self.__conf.fullDumpFile))

        while not self.isReady():
            # recovering data
            self.__logger.debug('waiting for cluster ready')
            self.__logger.debug(self.getStatus())
            time.sleep(1)
        self.__logger.info('cluster ready')
        self.__logger.debug(self.getStatus())

        # open existing indices on startup
        for index_name in self.get_index_names():
            self.__open_index(index_name, index_config=None)

        # record index metrics timer
        self.metrics_timer = Timer(10, self.__record_index_metrics)
        self.metrics_timer.start()

        # start gRPC
        self.__grpc_server = grpc.server(
            futures.ThreadPoolExecutor(max_workers=self.__grpc_max_workers))
        add_IndexServicer_to_server(
            IndexGRPCServicer(self,
                              logger=self.__logger,
                              metrics_registry=self.__metrics_registry),
            self.__grpc_server)
        self.__grpc_server.add_insecure_port('{0}:{1}'.format(
            self.__host, self.__grpc_port))
        self.__grpc_server.start()
        self.__logger.info('gRPC server has started')

        # start HTTP server
        self.__http_servicer = IndexHTTPServicer(self, self.__logger,
                                                 self.__http_logger,
                                                 self.__metrics_registry)
        self.__http_server = HTTPServer(self.__host, self.__http_port,
                                        self.__http_servicer)
        self.__http_server.start()
        self.__logger.info('HTTP server has started')

        self.__logger.info('indexer has started')
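Both constructors above open a writer per index and arm a periodic auto-commit timer for it (see __open_writer and __start_auto_commit_timer in Exemple #57). The standalone sketch below shows that pattern in isolation: a threading.Timer that commits a Whoosh writer, reopens it, and re-arms itself; the in-memory index and schema are placeholders.

# Hedged sketch of the auto-commit pattern: commit the writer on a timer,
# reopen it, and schedule the next commit. The schema is a placeholder.
import threading
from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import RamStorage

ix = RamStorage().create_index(Schema(id=ID(stored=True, unique=True), body=TEXT))
writer = ix.writer()

def auto_commit(period=10.0):
    global writer
    writer.commit()       # flush any buffered documents to the index
    writer = ix.writer()  # a committed writer cannot be reused, so reopen it
    timer = threading.Timer(period, auto_commit, kwargs={'period': period})
    timer.daemon = True
    timer.start()

auto_commit(period=10.0)
writer.update_document(id=u'1', body=u'hello world')  # committed on the next tick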