def update_index(sender, instance, created, **kwargs):
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index()
    try:
        writer = ix.writer()
    except:
        return
    tags = []
    for t in instance.tags.all():
        try:
            tags.append(unicode(t.name))
        except:
            pass
    tags = u','.join(tags)
    try:
        if created:
            writer.add_document(title=instance.title,
                                content=instance.content,
                                tags=tags,
                                author=instance.author.get_profile().name + u"\n" + instance.author.username,
                                id=unicode(instance.pk))
            writer.commit()
        else:
            writer.update_document(title=instance.title,
                                   content=instance.content,
                                   tags=tags,
                                   author=instance.author.get_profile().name + u"\n" + instance.author.username,
                                   id=unicode(instance.pk))
            writer.commit()
    except:
        pass
def update_index(sender, instance, created, **kwargs):
    if int(os.environ.get('SKIP_SEARCH_INDEX', '0')):
        return
    try:
        url = unicode(instance.get_absolute_url())
    except Exception:
        log.critical('Cant resolve url. Content %r not indexed' % instance)
        return
    content = getattr(instance, 'content', None)
    if content is None:
        content = unicode(instance)
    elif callable(content):
        content = content()
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index(indexname='memopol')
    writer = ix.writer()
    if created:
        writer.add_document(title=unicode(instance), content=content,
                            type=unicode(instance.__class__.__name__.lower()),
                            url=url)
        writer.commit()
    else:
        writer.update_document(title=unicode(instance), content=content,
                               type=unicode(instance.__class__.__name__.lower()),
                               url=url)
        writer.commit()
def __init__(self, path, masterkey=None):
    FileStorage.__init__(self, path, supports_mmap=False)
    self.masterkey = masterkey[:32]
    self.signkey = masterkey[32:]
    self._tmp_storage = self.temp_storage
    self.length_cache = {}
    self._open_files = {}
def test_threaded_filelock(self):
    self.make_dir("testindex")
    st = FileStorage("testindex")
    lock1 = st.lock("testlock")
    result = []

    # The thread function tries to acquire the lock and then quits
    def fn():
        lock2 = st.lock("testlock")
        gotit = try_for(lock2.acquire, 1.0, 0.1)
        if gotit:
            result.append(True)
            lock2.release()

    t = threading.Thread(target=fn)
    # Acquire the lock in this thread
    lock1.acquire()
    # Start the other thread trying to acquire the lock
    t.start()
    # Wait for a bit
    time.sleep(0.15)
    # Release the lock
    lock1.release()
    # Wait for the other thread to finish
    t.join()
    # If the other thread got the lock, it should have
    # appended something to the "results" list.
    self.assertEqual(len(result), 1)

    self.clean_file("testindex/testlock")
    self.destroy_dir("testindex")
def create_index(sender=None, **kwargs):
    """Creates a file-based Whoosh index. The location used is
    settings.WHOOSH_INDEX, so make sure that is set."""
    if not os.path.exists(settings.WHOOSH_INDEX):
        os.mkdir(settings.WHOOSH_INDEX)
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.create_index(schema=WHOOSH_SCHEMA, indexname="search")
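# A minimal wiring sketch (not from any of the original projects) showing how
# handlers like create_index()/update_index() above are typically hooked up in
# Django. The Post model, its import path and the WHOOSH_SCHEMA field set are
# assumptions chosen to match the add_document() calls in the snippets above.
from django.db.models.signals import post_save
from whoosh.fields import Schema, TEXT, KEYWORD, ID

# Assumed schema, compatible with title/content/tags/author/id documents.
WHOOSH_SCHEMA = Schema(title=TEXT(stored=True),
                       content=TEXT,
                       tags=KEYWORD(commas=True),
                       author=TEXT(stored=True),
                       id=ID(stored=True, unique=True))

from myblog.models import Post               # hypothetical app and model

create_index()                                # make sure the index exists at startup
post_save.connect(update_index, sender=Post)  # keep it in sync on every save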
def init_index(index=".index"):
    indexZ = index
    if not os.path.exists(indexZ):
        os.mkdir(indexZ)
        # os.rmdir(index)
    storage = FileStorage(indexZ)
    schema = Schema(name=TEXT(stored=True), ext=KEYWORD, title=TEXT(stored=True),
                    content=TEXT, path=ID(stored=True), tags=KEYWORD)
    ix = storage.create_index(schema)
    ix = storage.open_index()
    return ix
def get_index():
    try:
        storage = FileStorage(settings.WHOOSH_INDEX)
        return storage.open_index(indexname="search")
    except IOError:
        # No index? other error?
        create_index()
        storage = FileStorage(settings.WHOOSH_INDEX)
        return storage.open_index(indexname="search")
def build_index(sa_session, whoosh_index_dir, path_to_repositories):
    """
    Build the search indexes. One for repositories and another for tools within.
    """
    # Rare race condition exists here and below
    if not os.path.exists(whoosh_index_dir):
        os.makedirs(whoosh_index_dir)
    tool_index_dir = os.path.join(whoosh_index_dir, 'tools')
    if not os.path.exists(tool_index_dir):
        os.makedirs(tool_index_dir)

    repo_index_storage = FileStorage(whoosh_index_dir)
    tool_index_storage = FileStorage(tool_index_dir)
    repo_index = repo_index_storage.create_index(repo_schema)
    tool_index = tool_index_storage.create_index(tool_schema)
    repo_index_writer = repo_index.writer()
    tool_index_writer = tool_index.writer()
    repos_indexed = 0
    tools_indexed = 0

    for repo in get_repos(sa_session, path_to_repositories):
        repo_index_writer.add_document(
            id=repo.get('id'),
            name=unicodify(repo.get('name')),
            description=unicodify(repo.get('description')),
            long_description=unicodify(repo.get('long_description')),
            homepage_url=unicodify(repo.get('homepage_url')),
            remote_repository_url=unicodify(repo.get('remote_repository_url')),
            repo_owner_username=unicodify(repo.get('repo_owner_username')),
            times_downloaded=repo.get('times_downloaded'),
            approved=repo.get('approved'),
            last_updated=repo.get('last_updated'),
            full_last_updated=repo.get('full_last_updated'))
        # Tools get their own index
        for tool in repo.get('tools_list'):
            tool_index_writer.add_document(
                id=unicodify(tool.get('id')),
                name=unicodify(tool.get('name')),
                version=unicodify(tool.get('version')),
                description=unicodify(tool.get('description')),
                help=unicodify(tool.get('help')),
                repo_owner_username=unicodify(repo.get('repo_owner_username')),
                repo_name=unicodify(repo.get('name')),
                repo_id=repo.get('id'))
            tools_indexed += 1
            print(tools_indexed, 'tools (', tool.get('id'), ')')
        repos_indexed += 1
        print(repos_indexed, 'repos (', repo.get('id'), ')')

    tool_index_writer.commit()
    repo_index_writer.commit()
    print("TOTAL repos indexed: ", repos_indexed)
    print("TOTAL tools indexed: ", tools_indexed)
def get_index(index=".index"):
    indexZ = index
    if not os.path.exists(indexZ):
        return "there is no index with this name %s!! use indexer to build the index" % index
        sys.exit()
    storage = FileStorage(indexZ)
    ix = storage.open_index()
    print "the index has %d docs" % ix.doc_count_all()
    return ix
def handle_noargs(self, **options):
    # from settings import HAYSTACK_CONNECTIONS
    # storage = FileStorage(HAYSTACK_CONNECTIONS['default']['PATH'])
    storage = FileStorage('/dev/shm/whoosh/')
    ix = storage.open_index('SPELL')
    with ix.reader() as r:
        for id in r.all_doc_ids():
            print r.stored_fields(id)
def _open_indexes(self):
    """open storage and open indexes"""
    if not os.path.exists("index"):
        os.mkdir("index")
    storage = FileStorage("index")
    # open or initialise index
    if not storage.index_exists(indexname='MAIN'):
        self.ix = storage.create_index(IndexerSchema, indexname='MAIN')
    self.ix = storage.open_index(indexname='MAIN')
def eval_get_ranked_set_baseline(self, basefile):
    # Step 1: Read the saved keyterms for a subset of articles
    # (created by analyze_baseline_queries)
    g = Graph()
    g.parse(self.generic_path("keyterms", "analyzed", ".n3"), format="n3")
    articles = {}
    for (s, p, o) in g:
        if not str(s) in articles:
            articles[str(s)] = []
        articles[str(s)].append(str(o))

    # Step 2: Open the large whoosh index containing the text of
    # all cases. Then, create a query for each article based on
    # the keyterms.
    connector = query.Or
    indexdir = os.path.sep.join([self.config.datadir, 'ecj', 'index'])
    storage = FileStorage(indexdir)
    idx = storage.open_index()
    searcher = idx.searcher(weighting=scoring.BM25F())

    res = {}
    # for article in sorted(articles.keys()):
    for article in self._articles(basefile):
        terms = articles[article]
        rankedset = []
        # parser = qparser.QueryParser("content", idx.schema)
        # q = parser.parse(connector.join(terms))
        q = query.And([
            # query.Term("articles", article),
            connector([query.Term("content", x) for x in terms])
        ])
        # print q
        # self.log.debug("Article %s: %s", article, " or ".join(terms))
        results = searcher.search(q, limit=None)
        resultidx = 0
        # self.log.info("Keyterms for result: %r" % results.key_terms("content", docs=10, numterms=10))
        for result in results:
            reslbl = "%s (%s)" % (result['basefile'], results.score(resultidx))
            rankedset.append([result['basefile'], reslbl])
            # self.log.debug(u"\t%s: %2.2d" % (result['title'], results.score(resultidx)))
            resultidx += 1
        self.log.info("Created baseline ranked set for %s: Top result %s (of %s)" %
                      (article.split("/")[-1], rankedset[0][0], len(rankedset)))
        # return just a list of URIs, no scoring information. But the
        # full URI isnt available in the whoosh db, so we recreate it.
        res[article] = ["http://lagen.nu/ext/celex/%s" % x[0] for x in rankedset]

    return res
def update_index(sender, instance, created, **kwargs):
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index(indexname="rarog")
    writer = ix.writer()
    if created:
        writer.add_document(title=unicode(instance),
                            body_html=instance.body_html,
                            url=unicode(instance.get_absolute_url()))
        writer.commit()
    else:
        writer.update_document(title=unicode(instance),
                               body_html=instance.body_html,
                               url=unicode(instance.get_absolute_url()))
        writer.commit()
def searchIndex():
    '''
    searchIndex()
    Performs the requested search through the index/schema
    INPUTS: idx -- desired index to search
    OUTPUTS: results -- results of the search
    '''
    # Navigate to the LM index directory
    c = ''
    while True:
        print 'The current directory is ' + os.getcwd()
        ques = 'Is the LM index (directory) in the current directory? [y/n]\t'
        c = raw_input(ques).lower()
        if c == 'y' or c == 'yes':
            idxDir = os.getcwd()
            break
        elif c == 'n' or c == 'no':
            while True:
                idxDir = raw_input('Where is it?\t').lower()
                try:
                    os.chdir(idxDir)
                    break
                except WindowsError:
                    print 'Sorry, I couldn\'t navigate to that directory'
            break
        elif c == 'q' or c == 'quit':
            print '\tReturning to the Main Menu'
            return
        else:
            print 'I\'m sorry, I don\'t understand what you mean. Try again.'

    # Open the index
    idxDir = idxDir + '/LM_Storage'
    storage = FileStorage(idxDir)
    idx = storage.open_index(indexname='LM')

    # Determine what the user wants to search for
    c = ''
    while True:
        ques = 'What would you like to search? song/artist [s], lyrics [L]\t'
        c = raw_input(ques).lower()
        if c == 's' or c == 'song/artist' or c == 'song':
            searchForSong(idx)
            break
        elif c == 'l' or c == 'lyrics':
            searchForLyrics(idx)
            break
        elif c == 'q' or c == 'quit':
            print '\tReturning to the Main Menu'
            return
        else:
            print 'I\'m sorry, I don\'t understand what you mean. Try again.'
def __init__(self, word_file, graph_file):
    dirname = os.path.dirname(graph_file)
    st = FileStorage(dirname)
    f = st.open_file(graph_file)
    gr = fst.GraphReader(f)
    self.graph = gr
    self.dict = {}
    with codecs.open(word_file, 'r', 'utf-8') as file:
        for line in file:
            tokens = line.split(" ")
            if len(tokens) >= 2:
                self.dict[tokens[0].strip()] = int(tokens[1].strip())
def search_does_exist(query):
    # query = unicode(query, 'utf-8')
    # query = unidecode(query)
    storage = FileStorage("indexdir")
    ix = storage.open_index(indexname="wiki")
    from whoosh.qparser import QueryParser
    with ix.searcher() as searcher:
        query = QueryParser("title", ix.schema).parse(query)
        whoosh_results = searcher.search(query, limit=1)
        return len(whoosh_results) > 0
def get_myindex(indexdir='indexdir', filestore=False):
    schema = get_schema()
    if not filestore:
        if not os.path.exists(indexdir):
            os.mkdir(indexdir)
            ix = index.create_in(indexdir, schema)
        ix = index.open_dir(indexdir)
    else:
        storage = FileStorage(indexdir)
        # TODO: When the indexdir already exists, this branch still calls
        # create_index; it should use open_dir as in the branch above.
        ix = storage.create_index(schema)
    return ix
def test_hash(self):
    self.make_dir("testindex")
    st = FileStorage("testindex")
    hwf = st.create_file("test.hsh")
    hw = FileHashWriter(hwf)
    hw.add("foo", "bar")
    hw.add("glonk", "baz")
    hw.close()

    hrf = st.open_file("test.hsh")
    hr = FileHashReader(hrf)
    self.assertEqual(hr.get("foo"), "bar")
    self.assertEqual(hr.get("baz"), None)
    hr.close()
def search(self, q):
    from whoosh.filedb.filestore import FileStorage
    from whoosh.qparser import MultifieldParser
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index()
    q = q.replace('+', ' AND ').replace(' -', ' NOT ')
    parser = MultifieldParser(["content", "title", "tags", "author"], schema=ix.schema)
    qry = parser.parse(q)
    searcher = ix.searcher()
    hits = searcher.search(qry)
    return self.objects.filter(id__in=[h.fields()['id'] for h in hits]).filter(published=True)
def test_filelock_simple(self):
    self.make_dir("testindex")
    st = FileStorage("testindex")
    lock1 = st.lock("testlock")
    lock2 = st.lock("testlock")

    self.assertTrue(lock1.acquire())
    self.assertFalse(lock2.acquire())
    lock1.release()
    self.assertTrue(lock2.acquire())
    self.assertFalse(lock1.acquire())
    lock2.release()

    self.clean_file("testindex/testlock")
    self.destroy_dir("testindex")
def run_search(query):
    from settings import HAYSTACK_CONNECTIONS
    storage = FileStorage(HAYSTACK_CONNECTIONS['default']['PATH'])
    # storage = FileStorage('/dev/shm/whoosh/')
    ix = storage.open_index('MAIN')
    with ix.searcher() as s:
        from whoosh.qparser import QueryParser
        qp = QueryParser("text", schema=ix.schema)
        q = qp.parse(query)
        results = s.search(q)
        for i, r in enumerate(results):
            result = "%d: (%s) %s" % (i, r['id'], r['title'])  # ignored
def build_clean_index():
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index()
    writer = ix.writer()
    try:
        mlogger.debug("building index from scratch.....................")
        mlogger.debug("adding objects...................")
        for si in StudentInstitute.objects.all():
            adddoc(si, writer, True)
        for fi in FacultyInstitute.objects.all():
            adddoc(fi, writer, True)
    finally:
        writer.commit()
        ix.close()
def add_documents_to_index(index_name, documents):
    storage = FileStorage("indexdir")
    ix = storage.open_index(indexname=index_name)
    writer = ix.writer()
    for i, document in enumerate(documents):
        # float() avoids the integer-division progress bug (always printed 0%)
        print "{}%".format(float(i) / len(documents) * 100)
        if index_name == "wiki":
            writer.add_document(title=u"{}".format(sanitize_text(document.title)))
        if index_name == "movie":
            writer.add_document(title=u"{}".format(sanitize_text(document.title)))
    writer.commit()
def setup(self):
    """
    Defers loading until needed.
    """
    new_index = False

    # Make sure the index is there.
    if not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
        os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
        new_index = True

    if not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
        raise IOError(
            "The path to your Whoosh index '%s' is not writable for the current user/group."
            % settings.HAYSTACK_WHOOSH_PATH
        )

    self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
    self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
    self.parser = QueryParser(self.content_field_name, schema=self.schema)

    if new_index is True:
        self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
    else:
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except index.EmptyIndexError:
            self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)

    self.setup_complete = True
def search(request):
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index(indexname="rarog")
    hits = []
    query = request.GET.get('q', None)
    if query is not None and query != u"":
        query = query.replace('+', ' AND ').replace(' -', ' NOT ')
        parser = MultifieldParser(['title', 'body_html'], schema=ix.schema)
        try:
            qry = parser.parse(query)
        except:
            qry = None
        if qry is not None:
            searcher = ix.searcher()
            hits = searcher.search(qry)
    return query, hits
class Index:
    def __init__(self, path='~/Music/iTunes/iTunes Music Library.xml',
                 folder='~/Library/Application Support/Share my tunes'):
        self.path = os.path.expanduser(path)
        self.schema = Schema(
            trackId=ID(stored=True),
            name=TEXT(stored=True),
            artist=TEXT(stored=True),
            album=TEXT(stored=True),
            genre=KEYWORD(stored=True),
            location=STORED,
            trackNumber=STORED,
            bitRate=ID(stored=True),
            artwork=KEYWORD(stored=True)
        )
        self.parser = MultifieldParser(["name", "album", "artist"], schema=self.schema)
        self.folder = "%s/index" % os.path.expanduser(folder)
        self.empty = not whoosh.index.exists_in(self.folder)
        self.ix = None

    def index(self):
        if self.empty:
            if not os.path.exists(self.folder):
                os.makedirs(self.folder)
            st = FileStorage(self.folder)
            ix = st.create_index(self.schema)
            w = ix.writer()
            w.add_document(name=u"beuha")
            pipe = file.ID3Filter()
            # [TODO] using itunes info for artwork?
            cpt = 0
            for track in pipe(ItunesParser(self.path)):
                if track['album'] != None:
                    album = track['album'].encode('ascii', 'ignore')
                else:
                    album = ""
                # print track['artwork'], "[%s]" % album, track['name'].encode('ascii', 'ignore')
                if cpt % 20 == 0:
                    print "\n%i " % cpt,
                print '#',
                # print track['album'], track['name']
                w.add_document(
                    trackId=track['trackId'],
                    name=track['name'],
                    artist=track['artist'],
                    album=track['album'],
                    genre=track['genre'],
                    location=track['location'],
                    artwork=boolean(track['artwork']),
                    trackNumber=track['trackNumber'],
                    bitRate=track['bitRate']
                )
                # if cpt % 100 == 1:
                #     w.commit()
                cpt += 1
            print "\n\n%i tracks indexed" % cpt
            w.commit()
            ix.optimize()
            ix.close()
        else:
            print "already indexed"

    def query(self, query):
        if self.ix == None:
            self.ix = FileStorage(self.folder).open_index()
        q = self.parser.parse(query)
        return self.ix.searcher().search(q, sortedby=("album", "name"), limit=None)
def create_in(dirname, schema, indexname=None):
    """Convenience function to create an index in a directory. Takes care of
    creating a FileStorage object for you.

    :param dirname: the path string of the directory in which to create the
        index.
    :param schema: a :class:`whoosh.fields.Schema` object describing the
        index's fields.
    :param indexname: the name of the index to create; you only need to
        specify this if you are creating multiple indexes within the same
        storage object.
    :returns: :class:`Index`
    """

    if not indexname:
        indexname = _DEF_INDEX_NAME

    from whoosh.filedb.filestore import FileStorage
    storage = FileStorage(dirname)
    return storage.create_index(schema, indexname)
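# A short usage sketch of the create_in() convenience function documented
# above; the "indexdir" directory name and the schema fields are illustrative,
# not taken from any particular project.
import os
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

schema = Schema(title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT)
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")          # create_in() expects the directory to exist
ix = create_in("indexdir", schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a", content=u"hello whoosh")
writer.commit()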
class SearchMigrationTest(TestCase, TempDirMixin):
    """Search index migration testing"""
    def setUp(self):
        self.create_temp()
        self.storage = FileStorage(self.tempdir)
        self.storage.create()

    def tearDown(self):
        self.remove_temp()

    def do_test(self):
        fulltext = Fulltext()
        fulltext.storage = self.storage

        sindex = fulltext.get_source_index()
        self.assertIsNotNone(sindex)
        tindex = fulltext.get_target_index('cs')
        self.assertIsNotNone(tindex)
        writer = sindex.writer()
        writer.update_document(
            pk=1,
            source="source",
            context="context",
            location="location",
        )
        writer.commit()
        writer = tindex.writer()
        writer.update_document(
            pk=1,
            target="target",
            comment="comment"
        )
        writer.commit()
        for item in ('source', 'context', 'location', 'target'):
            self.assertEqual(
                fulltext.search(item, ['cs'], {item: True}),
                set([1])
            )

    def test_nonexisting(self):
        self.do_test()

    def test_nonexisting_dir(self):
        shutil.rmtree(self.tempdir)
        self.tempdir = None
        self.do_test()
def search(query):
    # query = unicode(query, 'utf-8')
    storage = FileStorage("indexdir")
    ix = storage.open_index(indexname="wiki")
    from whoosh.qparser import QueryParser
    with ix.searcher() as searcher:
        query = QueryParser("title", ix.schema).parse(query)
        whoosh_results = searcher.search(query, limit=1)
        results = []
        for w in whoosh_results:
            results.append("{}".format(w))
        return results
def open_dir(dirname, indexname=None, mapped=True):
    """Convenience function for opening an index in a directory. Takes care of
    creating a FileStorage object for you. dirname is the path of the
    directory containing the index. indexname is the name of the index to
    open; you only need to specify this if you have multiple indexes within
    the same storage object.

    :param dirname: the path string of the directory containing the index.
    :param indexname: the name of the index to open; you only need to specify
        this if you have multiple indexes within the same storage object.
    :param mapped: whether to use memory mapping to speed up disk reading.
    :returns: :class:`Index`
    """
    if indexname is None:
        indexname = _DEF_INDEX_NAME

    from whoosh.filedb.filestore import FileStorage
    storage = FileStorage(dirname, mapped=mapped)
    return storage.open_index(indexname)
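# Complementing the create_in() sketch earlier, a minimal example of reopening
# and querying that illustrative "indexdir" index with open_dir(); the field
# names are the same assumed ones, not from any of the original projects.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("indexdir")
with ix.searcher() as searcher:
    q = QueryParser("content", ix.schema).parse(u"hello")
    for hit in searcher.search(q, limit=10):
        print(hit["title"])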
def __init__(self, path, masterkey=None):
    self.masterkey = masterkey[:32]
    self.signkey = masterkey[32:]
    self._tmp_storage = self.temp_storage
    self.length_cache = {}
    FileStorage.__init__(self, path, supports_mmap=False)
from whoosh.filedb.filestore import FileStorage
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.qparser import MultifieldParser
import whoosh.qparser as qparser
import chinese
import os, glob, codecs, sys

analyzer = chinese.ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                sub_title=TEXT(stored=True, analyzer=analyzer),
                author=TEXT(stored=True, analyzer=analyzer),
                content=TEXT(stored=True, analyzer=analyzer))
storage = FileStorage("indexdir")
ix = storage.open_index()
writer = ix.writer()

_string = sys.argv[1]
_mode = sys.argv[2]
normal = (_mode == "normal")
_distance = 0
if normal is False:
    _distance = int(sys.argv[3])

with ix.searcher() as searcher:
    # og = qparser.OrGroup.factory(0.9)
    parser = MultifieldParser(["title", "sub_title", "author", "content"], schema=ix.schema)
    # parser = qparser.QueryParser("content", ix.schema)
class SearchBackend(BaseSearchBackend): # Word reserved by Whoosh for special use. RESERVED_WORDS = ( 'AND', 'NOT', 'OR', 'TO', ) # Characters reserved by Whoosh for special use. # The '\\' must come first, so as not to overwrite the other slash replacements. RESERVED_CHARACTERS = ( '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '.', ) def __init__(self, site=None): super(SearchBackend, self).__init__(site) self.setup_complete = False self.use_file_storage = True self.post_limit = getattr(settings, 'HAYSTACK_WHOOSH_POST_LIMIT', 128 * 1024 * 1024) if getattr(settings, 'HAYSTACK_WHOOSH_STORAGE', 'file') != 'file': self.use_file_storage = False if self.use_file_storage and not hasattr(settings, 'HAYSTACK_WHOOSH_PATH'): raise ImproperlyConfigured('You must specify a HAYSTACK_WHOOSH_PATH in your settings.') def setup(self): """ Defers loading until needed. """ new_index = False # Make sure the index is there. if self.use_file_storage and not os.path.exists(settings.HAYSTACK_WHOOSH_PATH): os.makedirs(settings.HAYSTACK_WHOOSH_PATH) new_index = True if self.use_file_storage and not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK): raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % settings.HAYSTACK_WHOOSH_PATH) if self.use_file_storage: self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH) else: global LOCALS if LOCALS.RAM_STORE is None: LOCALS.RAM_STORE = RamStorage() self.storage = LOCALS.RAM_STORE self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields()) self.parser = QueryParser(self.content_field_name, schema=self.schema) if new_index is True: self.index = self.storage.create_index(self.schema) else: try: self.index = self.storage.open_index(schema=self.schema) except index.EmptyIndexError: self.index = self.storage.create_index(self.schema) self.setup_complete = True def build_schema(self, fields): schema_fields = { ID: WHOOSH_ID(stored=True, unique=True), DJANGO_CT: WHOOSH_ID(stored=True), DJANGO_ID: WHOOSH_ID(stored=True), } # Grab the number of keys that are hard-coded into Haystack. # We'll use this to (possibly) fail slightly more gracefully later. initial_key_count = len(schema_fields) content_field_name = '' for field_name, field_class in fields.items(): if field_class.is_multivalued: if field_class.indexed is False: schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost) else: schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost) elif field_class.field_type in ['date', 'datetime']: schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored) elif field_class.field_type == 'integer': schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost) elif field_class.field_type == 'float': schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost) elif field_class.field_type == 'boolean': # Field boost isn't supported on BOOLEAN as of 1.8.2. 
schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored) elif field_class.field_type == 'ngram': schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost) elif field_class.field_type == 'edge_ngram': schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, stored=field_class.stored, field_boost=field_class.boost) else: schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost) if field_class.document is True: content_field_name = field_class.index_fieldname # Fail more gracefully than relying on the backend to die if no fields # are found. if len(schema_fields) <= initial_key_count: raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.") return (content_field_name, Schema(**schema_fields)) def update(self, index, iterable, commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() writer = AsyncWriter(self.index) for obj in iterable: doc = index.full_prepare(obj) # Really make sure it's unicode, because Whoosh won't have it any # other way. for key in doc: doc[key] = self._from_python(doc[key]) writer.update_document(**doc) if len(iterable) > 0: # For now, commit no matter what, as we run into locking issues otherwise. writer.commit() # If spelling support is desired, add to the dictionary. if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True: sp = SpellChecker(self.storage) sp.add_field(self.index, self.content_field_name) def remove(self, obj_or_string, commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() whoosh_id = get_identifier(obj_or_string) self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' % (ID, whoosh_id))) def clear(self, models=[], commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() if not models: self.delete_index() else: models_to_delete = [] for model in models: models_to_delete.append(u"%s:%s.%s" % (DJANGO_CT, model._meta.app_label, model._meta.module_name)) self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete))) def delete_index(self): # Per the Whoosh mailing list, if wiping out everything from the index, # it's much more efficient to simply delete the index files. if self.use_file_storage and os.path.exists(settings.HAYSTACK_WHOOSH_PATH): shutil.rmtree(settings.HAYSTACK_WHOOSH_PATH) elif not self.use_file_storage: self.storage.clean() # Recreate everything. self.setup() def optimize(self): if not self.setup_complete: self.setup() self.index = self.index.refresh() self.index.optimize() @log_query def search(self, query_string, sort_by=None, start_offset=0, end_offset=None, fields='', highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None, spelling_query=None, limit_to_registered_models=None, result_class=None, **kwargs): if not self.setup_complete: self.setup() # A zero length query should return no results. if len(query_string) == 0: return { 'results': [], 'hits': 0, } query_string = force_unicode(query_string) # A one-character query (non-wildcard) gets nabbed by a stopwords # filter and should yield zero results. if len(query_string) <= 1 and query_string != u'*': return { 'results': [], 'hits': 0, } reverse = False if sort_by is not None: # Determine if we need to reverse the results and if Whoosh can # handle what it's being asked to sort by. 
Reversing is an # all-or-nothing action, unfortunately. sort_by_list = [] reverse_counter = 0 for order_by in sort_by: if order_by.startswith('-'): reverse_counter += 1 if len(sort_by) > 1 and reverse_counter > 1: raise SearchBackendError("Whoosh does not handle more than one field and any field being ordered in reverse.") for order_by in sort_by: if order_by.startswith('-'): sort_by_list.append(order_by[1:]) if len(sort_by_list) == 1: reverse = True else: sort_by_list.append(order_by) if len(sort_by_list) == 1: reverse = False sort_by = sort_by_list[0] if facets is not None: warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2) if date_facets is not None: warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2) if query_facets is not None: warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2) narrowed_results = None self.index = self.index.refresh() if limit_to_registered_models is None: limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) if limit_to_registered_models: # Using narrow queries, limit the results to only models registered # with the current site. if narrow_queries is None: narrow_queries = set() registered_models = self.build_registered_models_list() if len(registered_models) > 0: narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in registered_models])) narrow_searcher = None if narrow_queries is not None: # Potentially expensive? I don't see another way to do it in Whoosh... narrow_searcher = self.index.searcher() for nq in narrow_queries: recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_unicode(nq))) if narrowed_results: narrowed_results.filter(recent_narrowed_results) else: narrowed_results = recent_narrowed_results self.index = self.index.refresh() if self.index.doc_count(): searcher = self.index.searcher() parsed_query = self.parser.parse(query_string) # In the event of an invalid/stopworded query, recover gracefully. if parsed_query is None: return { 'results': [], 'hits': 0, } # Prevent against Whoosh throwing an error. Requires an end_offset # greater than 0. if not end_offset is None and end_offset <= 0: end_offset = 1 raw_results = searcher.search(parsed_query, limit=end_offset, sortedby=sort_by, reverse=reverse) # Handle the case where the results have been narrowed. if narrowed_results: raw_results.filter(narrowed_results) # Determine the page. page_num = 0 if end_offset is None: end_offset = 1000000 if start_offset is None: start_offset = 0 page_length = end_offset - start_offset if page_length and page_length > 0: page_num = start_offset / page_length # Increment because Whoosh uses 1-based page numbers. 
page_num += 1 try: raw_page = ResultsPage(raw_results, page_num, page_length) except ValueError: return { 'results': [], 'hits': 0, 'spelling_suggestion': None, } results = self._process_results(raw_page, highlight=highlight, query_string=query_string, spelling_query=spelling_query, result_class=result_class) searcher.close() if hasattr(narrow_searcher, 'close'): narrow_searcher.close() return results else: if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False): if spelling_query: spelling_suggestion = self.create_spelling_suggestion(spelling_query) else: spelling_suggestion = self.create_spelling_suggestion(query_string) else: spelling_suggestion = None return { 'results': [], 'hits': 0, 'spelling_suggestion': spelling_suggestion, } def more_like_this(self, model_instance, additional_query_string=None, start_offset=0, end_offset=None, limit_to_registered_models=None, result_class=None, **kwargs): warnings.warn("Whoosh does not handle More Like This.", Warning, stacklevel=2) return { 'results': [], 'hits': 0, } def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None, result_class=None): if not self.site: from haystack import site else: site = self.site results = [] # It's important to grab the hits first before slicing. Otherwise, this # can cause pagination failures. hits = len(raw_page) if result_class is None: result_class = SearchResult facets = {} spelling_suggestion = None indexed_models = site.get_indexed_models() for doc_offset, raw_result in enumerate(raw_page): score = raw_page.score(doc_offset) or 0 app_label, model_name = raw_result[DJANGO_CT].split('.') additional_fields = {} model = get_model(app_label, model_name) if model and model in indexed_models: for key, value in raw_result.items(): index = site.get_index(model) string_key = str(key) if string_key in index.fields and hasattr(index.fields[string_key], 'convert'): # Special-cased due to the nature of KEYWORD fields. if index.fields[string_key].is_multivalued: if value is None or len(value) is 0: additional_fields[string_key] = [] else: additional_fields[string_key] = value.split(',') else: additional_fields[string_key] = index.fields[string_key].convert(value) else: additional_fields[string_key] = self._to_python(value) del(additional_fields[DJANGO_CT]) del(additional_fields[DJANGO_ID]) if highlight: from whoosh import analysis from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter sa = analysis.StemmingAnalyzer() terms = [term.replace('*', '') for term in query_string.split()] additional_fields['highlighted'] = { self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())], } result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, searchsite=self.site, **additional_fields) results.append(result) else: hits -= 1 if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False): if spelling_query: spelling_suggestion = self.create_spelling_suggestion(spelling_query) else: spelling_suggestion = self.create_spelling_suggestion(query_string) return { 'results': results, 'hits': hits, 'facets': facets, 'spelling_suggestion': spelling_suggestion, } def create_spelling_suggestion(self, query_string): spelling_suggestion = None sp = SpellChecker(self.storage) cleaned_query = force_unicode(query_string) if not query_string: return spelling_suggestion # Clean the string. 
for rev_word in self.RESERVED_WORDS: cleaned_query = cleaned_query.replace(rev_word, '') for rev_char in self.RESERVED_CHARACTERS: cleaned_query = cleaned_query.replace(rev_char, '') # Break it down. query_words = cleaned_query.split() suggested_words = [] for word in query_words: suggestions = sp.suggest(word, number=1) if len(suggestions) > 0: suggested_words.append(suggestions[0]) spelling_suggestion = ' '.join(suggested_words) return spelling_suggestion def _from_python(self, value): """ Converts Python values to a string for Whoosh. Code courtesy of pysolr. """ if hasattr(value, 'strftime'): if not hasattr(value, 'hour'): value = datetime(value.year, value.month, value.day, 0, 0, 0) elif isinstance(value, bool): if value: value = 'true' else: value = 'false' elif isinstance(value, (list, tuple)): value = u','.join([force_unicode(v) for v in value]) elif isinstance(value, (int, long, float)): # Leave it alone. pass else: value = force_unicode(value) return value def _to_python(self, value): """ Converts values from Whoosh to native Python values. A port of the same method in pysolr, as they deal with data the same way. """ if value == 'true': return True elif value == 'false': return False if value and isinstance(value, basestring): possible_datetime = DATETIME_REGEX.search(value) if possible_datetime: date_values = possible_datetime.groupdict() for dk, dv in date_values.items(): date_values[dk] = int(dv) return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second']) try: # Attempt to use json to load the values. converted_value = json.loads(value) # Try to handle most built-in types. if isinstance(converted_value, (list, tuple, set, dict, int, float, long, complex)): return converted_value except: # If it fails (SyntaxError or its ilk) or we don't trust it, # continue on. pass return value
app.logger.info('microblog startup')

if os.environ.get('HEROKU') is not None:
    import logging
    stream_handler = logging.StreamHandler()
    app.logger.addHandler(stream_handler)
    app.logger.setLevel(logging.INFO)
    app.logger.info('microblog startup')

enable_search = WHOOSH_ENABLED
if enable_search:
    search_is_new = False
    if not os.path.exists(WHOOSH_BASE):
        os.mkdir(WHOOSH_BASE)
        search_is_new = True
    search_storage = FileStorage(WHOOSH_BASE)
    search_ix = None
    if search_is_new:
        schema = Schema(id=ID(stored=True), body=TEXT())
        search_ix = search_storage.create_index(schema)
    else:
        search_ix = search_storage.open_index()


class CustomJSONEncoder(JSONEncoder):
    """This class adds support for lazy translation texts to Flask's
    JSON encoder. This is necessary when flashing translated texts."""
    def default(self, obj):
        from speaklater import is_lazy_string
        if is_lazy_string(obj):
            try:
class WhooshSearchBackend(BaseSearchBackend): # Word reserved by Whoosh for special use. RESERVED_WORDS = ( 'AND', 'NOT', 'OR', 'TO', ) # Characters reserved by Whoosh for special use. # The '\\' must come first, so as not to overwrite the other slash replacements. RESERVED_CHARACTERS = ( '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '.', ) def __init__(self, connection_alias, **connection_options): super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options) self.setup_complete = False self.use_file_storage = True self.post_limit = getattr(connection_options, 'POST_LIMIT', 128 * 1024 * 1024) self.path = connection_options.get('PATH') if connection_options.get('STORAGE', 'file') != 'file': self.use_file_storage = False if self.use_file_storage and not self.path: raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias) def setup(self): """ Defers loading until needed. """ from haystack import connections new_index = False # Make sure the index is there. if self.use_file_storage and not os.path.exists(self.path): os.makedirs(self.path) new_index = True if self.use_file_storage and not os.access(self.path, os.W_OK): raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path) if self.use_file_storage: self.storage = FileStorage(self.path) else: global LOCALS if LOCALS.RAM_STORE is None: LOCALS.RAM_STORE = RamStorage() self.storage = LOCALS.RAM_STORE self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields()) self.parser = QueryParser(self.content_field_name, schema=self.schema) if new_index is True: self.index = self.storage.create_index(self.schema) else: try: self.index = self.storage.open_index(schema=self.schema) except index.EmptyIndexError: self.index = self.storage.create_index(self.schema) self.setup_complete = True def build_schema(self, fields): schema_fields = { ID: WHOOSH_ID(stored=True, unique=True), DJANGO_CT: WHOOSH_ID(stored=True), DJANGO_ID: WHOOSH_ID(stored=True), } # Grab the number of keys that are hard-coded into Haystack. # We'll use this to (possibly) fail slightly more gracefully later. initial_key_count = len(schema_fields) content_field_name = '' for field_name, field_class in fields.items(): if field_class.is_multivalued: if field_class.indexed is False: schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost) else: schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost) elif field_class.field_type in ['date', 'datetime']: schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored) elif field_class.field_type == 'integer': schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost) elif field_class.field_type == 'float': schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost) elif field_class.field_type == 'boolean': # Field boost isn't supported on BOOLEAN as of 1.8.2. 
schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored) elif field_class.field_type == 'ngram': schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost) elif field_class.field_type == 'edge_ngram': schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, stored=field_class.stored, field_boost=field_class.boost) else: schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost) if field_class.document is True: content_field_name = field_class.index_fieldname # Fail more gracefully than relying on the backend to die if no fields # are found. if len(schema_fields) <= initial_key_count: raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.") return (content_field_name, Schema(**schema_fields)) def update(self, index, iterable, commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() writer = AsyncWriter(self.index) for obj in iterable: doc = index.full_prepare(obj) # Really make sure it's unicode, because Whoosh won't have it any # other way. for key in doc: doc[key] = self._from_python(doc[key]) try: writer.update_document(**doc) except Exception, e: if not self.silently_fail: raise self.log.error("Failed to add documents to Whoosh: %s", e) if len(iterable) > 0: # For now, commit no matter what, as we run into locking issues otherwise. writer.commit() # If spelling support is desired, add to the dictionary. if self.include_spelling is True: sp = SpellChecker(self.storage) sp.add_field(self.index, self.content_field_name)
# the stopwords from previous runs have already been removed
json_stop_words = open("../Indicizzazione/stopWords_clinico.json", "r")
json_string = ""
for line in json_stop_words:
    json_string = json_string + line
datastore = json.loads(json_string)
# print datastore

campo = "identifier"
fields = ["title", "abstract", "terms"]

if not os.path.exists(sys.argv[1]):       # check whether the index is missing
    print sys.argv[1], "does not exist"   # bail out if it does not exist
else:                                     # otherwise proceed
    fst = FileStorage(sys.argv[1])        # grab the storage handle and
    ix = fst.open_index()                 # open the corresponding index

    #--- open the query file ---#
    infile = open(sys.argv[2], 'r')
    #--- read the file
    text = infile.read()
    #--- build the DOM of the queries
    dom = parseString(text)
    #--- extract the query data
    #title = gettagdata(dom,'title')
    num = gettagdata(dom, 'num')
    #desc = gettagdata(dom,'desc')
    #for x in range(len(title)-1):
    #    title[x] += " " + desc[x]
    title = gettagdata(dom, 'desc')
    title = [
def in_site_search(request):
    """
    Site-wide search
    """
    user = get_login_user(request)
    keyword = request.POST.get('keyword', '').strip()
    scope = request.POST.get('scope', 'all')
    logger.warning(f"Search keyword: `{keyword}")
    keyword = split_cn_words(keyword, join=True)
    logger.info(f"Converted keyword: `{keyword}")
    if scope not in ('all', 'feed', 'article'):
        return HttpResponseForbidden('Param Error')
    if not keyword:
        return HttpResponseNotFound("Empty Keyword")
    storage = FileStorage(settings.WHOOSH_IDX_DIR)
    rel_sites, rel_articles = None, None
    # look up related feeds
    if scope in ('feed', 'all'):
        idx = storage.open_index(indexname="site", schema=whoosh_site_schema)
        qp = MultifieldParser(['cname', 'author', 'brief'], schema=whoosh_site_schema)
        query = qp.parse(keyword)
        sites = []
        with idx.searcher() as s:
            results = s.search(query, limit=50)
            for ret in results:
                sites.append(ret['id'])
        rel_sites = Site.objects.filter(status='active', pk__in=sites).order_by('-star')
    elif scope == 'article':
        # look up related articles
        idx = storage.open_index(indexname="article", schema=whoosh_article_schema)
        qp = MultifieldParser(['title', 'author', 'content'], schema=whoosh_article_schema)
        query = qp.parse(keyword)
        articles = []
        with idx.searcher() as s:
            old_mask = TermRange("uindex", None, str(current_ts() - 7 * 86400 * 1000))
            results = s.search(query, mask=old_mask, limit=50)
            for ret in results:
                articles.append(ret['uindex'])
        rel_articles = Article.objects.filter(is_recent=True, status='active',
                                              uindex__in=articles).iterator()
    # the user's subscriptions
    user_sub_feeds = []
    if user:
        user_sub_feeds = get_user_subscribe_feeds(user.oauth_id, user_level=user.level)
    context = dict()
    context['user'] = user
    context['user_sub_feeds'] = user_sub_feeds
    context['rel_sites'] = rel_sites
    context['rel_articles'] = rel_articles
    context['keyword'] = keyword
    if scope == 'all':
        return render(request, 'search/search.html', context=context)
    elif scope == 'feed':
        return render(request, 'search/search_feeds.html', context=context)
    elif scope == 'article':
        return render(request, 'search/search_articles.html', context=context)
def setup_index():
    storage = FileStorage(data_dir('memory'))
    storage.create()
    return storage.create_index(TMSchema())
class TranslationMemory(object): def __init__(self): self.index = FileStorage(data_dir('memory')).open_index() self.parser = qparser.QueryParser( 'source', schema=self.index.schema, group=qparser.OrGroup.factory(0.9), termclass=query.FuzzyTerm, ) self.searcher = None self.comparer = Comparer() def __del__(self): self.close() def open_searcher(self): if self.searcher is None: self.searcher = self.index.searcher() def doc_count(self): self.open_searcher() return self.searcher.doc_count() def close(self): if self.searcher is not None: self.searcher.close() self.searcher = None @contextlib.contextmanager def writer(self): writer = self.index.writer() try: yield writer finally: writer.commit() def get_language_code(self, code, langmap): language = Language.objects.auto_get_or_create(code) if langmap and language.code in langmap: language = Language.objects.auto_get_or_create( langmap[language.code]) return language.code def import_tmx(self, fileobj, langmap=None): origin = force_text(os.path.basename(fileobj.name)) storage = tmxfile.parsefile(fileobj) header = next(storage.document.getroot().iterchildren( storage.namespaced("header"))) source_language_code = header.get('srclang') source_language = self.get_language_code(source_language_code, langmap) languages = {} with self.writer() as writer: for unit in storage.units: # Parse translations (translate-toolkit does not care about # languages here, it just picks first and second XML elements) translations = {} for node in unit.getlanguageNodes(): lang, text = get_node_data(unit, node) translations[lang] = text if lang not in languages: languages[lang] = self.get_language_code(lang, langmap) try: source = translations.pop(source_language_code) except KeyError: # Skip if source language is not present continue for lang, text in translations.items(): writer.add_document( source_language=source_language, target_language=languages[lang], source=source, target=text, origin=origin, category=CATEGORY_FILE, ) def lookup(self, source_language, target_language, text): langfilter = query.And([ query.Term('source_language', source_language), query.Term('target_language', target_language), ]) self.open_searcher() text_query = self.parser.parse(text) matches = self.searcher.search(text_query, filter=langfilter, limit=20000) for match in matches: similarity = self.comparer.similarity(text, match['source']) if similarity < 30: continue yield (match['source'], match['target'], similarity, match['origin']) def delete(self, origin): """Delete entries by origin.""" with self.writer() as writer: return writer.delete_by_term('origin', origin) def empty(self): """Recreates translation memory.""" self.index = setup_index() self.searcher = None def get_origins(self): self.open_searcher() return [force_text(x) for x in self.searcher.lexicon('origin')]
from whoosh.filedb.filestore import FileStorage
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.qparser import MultifieldParser
import whoosh.qparser as qparser
import chinese
import os, glob, codecs

analyzer = chinese.ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True),
                sub_title=TEXT(stored=True),
                author=TEXT(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
storage = FileStorage("indexdir")
ix = storage.create_index(schema)
writer = ix.writer()

# add index
allFile = []
os.chdir('source')
allDir = glob.glob('*')
for path in allDir:
    os.chdir(path)
    allFile = glob.glob('*.txt')
    for everyFile in allFile:
        if everyFile[0] == 's':
            print(everyFile)
def index_document(indice: str, data: dict):
    store = FileStorage(indice)
    ix = store.open_index()
    current_app.logger.debug('Writing {} to {}'.format(data, indice))
    with ix.writer() as writer:
        writer.update_document(**data)
def __enter__(self):
    dirpath = TempDir.__enter__(self)
    store = FileStorage(dirpath)
    self.onexit = lambda: store.close()
    return store
def __enter__(self):
    dirpath = TempDir.__enter__(self)
    self.store = FileStorage(dirpath, debug=self._debug)
    return self.store
def check_db_matches(): """created in order to build the importance of data graph.. probably should be changed in order to be reused""" FIRST_RUN = False #ALL_FILE = "all_queries_big" #DB_FILE = "all_dbs_big" ALL_FILE = "all_queries" DB_FILE = "all_dbs" START_FROM = "number" ALL_NUM = "all_num_from_new" ALL_NUM = "all_num_from_4_5_full_17" ALL_FIXED_q = "all_fixed_queries" + str(17) ALL_FIXED_dbs = "all_fixed_dbs" + str(17) biggest = 20 max_db_size = 20 all_queries = {} db = [{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}] found = [0] * biggest ret_val = [] if FIRST_RUN: #raw_input("are you sure you want to rewrite the db?!") storage_main = FileStorage(INDEX_DIR_CODE) ix_main = storage_main.open_index() try: """ with open(START_FROM, "rb") as file_h: (curr_db, count, db_sizes) = pickle.load(file_h) with open(ALL_FIXED_q, "rb") as file_h: all_queries = pickle.load(file_h) with open(ALL_FIXED_dbs, "rb") as file_h: db = pickle.load(file_h) print len(all_queries.keys()) print "Real size", [len(e.keys()) for e in db] print "left", db_sizes print curr_db, count """ with open(START_FROM, "rb") as file_h: (curr_db, count, db_sizes) = pickle.load(file_h) print "read", curr_db, count with open(ALL_FILE + str(curr_db - 1), "rb") as file_h: all_queries = pickle.load(file_h) with open(DB_FILE + str(curr_db - 1), "rb") as file_h: db = pickle.load(file_h) print "Real size", [len(e.keys()) for e in db] except: curr_db = 0 count = 0 db_sizes = [2**i for i in range(1, biggest + 1)] new_count = 0 print "start reading posts" q_db = POSTS_DB.find({}, timeout=False) print "done reading posts" print "start with", curr_db for question in q_db: if curr_db == max_db_size: print "break" break new_count += 1 if new_count < count: continue if db_sizes[curr_db] % 1000 == 0: print "BUILD:", curr_db, "I'm Alive, more", db_sizes[ curr_db], "togo!" snips = get_possible_snippets(question['Id']) if snips is None or len(snips) == 0: continue (db[curr_db])[question['Id']] = snips[0] db_sizes = db_sizes[:curr_db] + [e - 1 for e in db_sizes[curr_db:]] if db_sizes[curr_db] == 0: t = time.time() print "find matches for", curr_db, "size is", len( db[curr_db].keys()) for place, key in enumerate(db[curr_db].keys()): if place % 1000 == 0: print "FIND: I'm Alive", place code = db[curr_db][key][0] res_dict, tokens, q_scores = fast_from_code_to_question( code, ix_main) if all_queries.get(key, None) is None: all_queries[key] = (tokens, res_dict) curr_db += 1 try: print "saved", time.time() - t with open(ALL_FILE + str(curr_db), "wb") as file_h: pickle.dump(all_queries, file_h) with open(DB_FILE + str(curr_db), "wb") as file_h: pickle.dump(db, file_h) with open(START_FROM, "wb") as file_h: pickle.dump((curr_db, new_count, db_sizes), file_h) except: print "to much to write" print "start", 2**(curr_db + 1) q_db.close() num = 0 else: print "reading files.." 
t = time.time() """with open(ALL_FILE+str(max_db_size), "rb") as file_h: all_queries = pickle.load(file_h) with open(DB_FILE+str(max_db_size), "rb") as file_h: db = pickle.load(file_h)""" with open(ALL_FIXED_q, "rb") as file_h: all_queries = pickle.load(file_h) with open(ALL_FIXED_dbs, "rb") as file_h: db = pickle.load(file_h) print "done reading", time.time() - t print[len(e.keys()) for e in db] try: with open(ALL_NUM, "rb") as file_h: num, found = pickle.load(file_h) print "read", num, found except: num = 0 curr_num = 0 print num, len(all_queries.keys()) for query in all_queries.keys(): curr_num += 1 if curr_num < num: continue if curr_num % 1000 == 0: print "MATCHES: I'M Alive!", curr_num, query matches = get_matches(query, all_queries[query]) flag_f = False for match in matches: if flag_f: break for i in range(len(db)): if match in db[i].keys() and query in db[i].keys(): found[i] += 1 flag_f = True break if curr_num - 1 > num: with open(ALL_NUM, "wb") as file_h: pickle.dump((curr_num, found), file_h) print found """ #saved in _n small_db = [0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 8] # 3/5 small_db = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4] # 4/5 for i, val in enumerate(small_db): try: found[i] += val except: print "shorter db" print found""" for i in range(len(found) - 1): found[i + 1] += found[i] print(found) for place, i in enumerate([2**i for i in range(1, max_db_size + 1)]): ret_val.append(float(found[place]) / i * 100) print ret_val
def setUp(self):
    self.create_temp()
    self.storage = FileStorage(self.tempdir)
    self.storage.create()
class SearchMigrationTest(TestCase, TempDirMixin):
    """Search index migration testing"""
    def setUp(self):
        self.create_temp()
        self.backup = weblate.trans.search.STORAGE
        self.storage = FileStorage(self.tempdir)
        weblate.trans.search.STORAGE = self.storage
        self.storage.create()

    def tearDown(self):
        self.remove_temp()
        weblate.trans.search.STORAGE = self.backup

    def do_test(self, source, target):
        if source is not None:
            self.storage.create_index(source, 'source')
        if target is not None:
            self.storage.create_index(target, 'target-cs')

        sindex = weblate.trans.search.get_source_index()
        self.assertIsNotNone(sindex)
        tindex = weblate.trans.search.get_target_index('cs')
        self.assertIsNotNone(tindex)
        writer = sindex.writer()
        writer.update_document(
            pk=1,
            source="source",
            context="context",
            location="location",
        )
        writer.commit()
        writer = tindex.writer()
        writer.update_document(pk=1, target="target", comment="comment")
        writer.commit()
        for item in ('source', 'context', 'location', 'target'):
            self.assertEqual(fulltext_search(item, ['cs'], {item: True}),
                             set([1]))

    def test_nonexisting(self):
        self.do_test(None, None)

    def test_nonexisting_dir(self):
        shutil.rmtree(self.tempdir)
        self.tempdir = None
        self.do_test(None, None)

    def test_current(self):
        source = weblate.trans.search.SourceSchema
        target = weblate.trans.search.TargetSchema
        self.do_test(source, target)

    def test_2_4(self):
        source = Schema(checksum=ID(stored=True, unique=True),
                        source=TEXT(),
                        context=TEXT(),
                        location=TEXT())
        target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
            comment=TEXT(),
        )
        self.do_test(source, target)

    def test_2_1(self):
        source = Schema(
            checksum=ID(stored=True, unique=True),
            source=TEXT(),
            context=TEXT(),
        )
        target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
        )
        self.do_test(source, target)
def __init__(self):
    self.schema = Schema(note_id=NUMERIC(stored=True, unique=True),
                         notebook_id=NUMERIC(stored=True),
                         title=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                         snippet=TEXT(analyzer=ChineseAnalyzer()))
    try:
        self.index = FileStorage(config.get("PATH", "notes_index_dir")).open_index()
    except:
        self.index = FileStorage(config.get("PATH", "notes_index_dir")).create_index(self.schema)
def setUp(self):
    self.create_temp()
    self.backup = weblate.trans.search.STORAGE
    self.storage = FileStorage(self.tempdir)
    weblate.trans.search.STORAGE = self.storage
    self.storage.create()
def media_rebuild(): print datetime.datetime.now() print 'media_rebuild' media_db = mysql_new.BaseDB(config.MYSQL_DEFINE_MEDIA) schema = Schema(movieid=ID(stored=True, unique=True), title=TEXT(stored=True, analyzer=analyzer_zhongwen, field_boost=2.0), pinyin_title=TEXT(stored=True, analyzer=analyzer_pinyin, field_boost=2.0), director=KEYWORD(stored=True), year=NUMERIC(stored=True, sortable=True), score=NUMERIC(stored=True, sortable=True), area=KEYWORD(stored=True), description=TEXT(stored=True, field_boost=1.5), pinyin_description=TEXT(stored=True, field_boost=1.0), actor=KEYWORD(stored=True, field_boost=1.0), pinyin_actor=TEXT(stored=True, field_boost=1.0), genres=KEYWORD(stored=True, field_boost=1.0), pinyin_genres=TEXT(stored=True, field_boost=1.0), type=NUMERIC(stored=True), source=NUMERIC(stored=True)) SQL = '''SELECT `movieid`, `title`, `type`, `actor`, `genres`, `director`, `douban_score`, `introduction` as description, `year` FROM `media_info` WHERE `status`=1 AND type in ('movie', 'tv', 'teleplay', 'anime') ''' res = media_db.query(SQL, ()) if not res: return for info in res: if info.get('type') == 'movie': info['type'] = 1 elif info.get('type') == 'teleplay': info['type'] = 2 elif info.get('type') == 'tv': info['type'] = 3 elif info.get('type') == 'anime': info['type'] = 4 else: continue index_path = os.path.join(config.index_root_dir, 'media') if not os.path.exists(index_path): os.mkdir(index_path) #ix = create_in(index_path, schema=schema) storage = FileStorage(index_path) ix = storage.open_index() writer = ix.writer() for info in res: pinyin_title = ' '.join(lazy_pinyin(info.get('title').decode('utf8'))) pinyin_description = ' '.join( lazy_pinyin(info.get('description').decode('utf8'))) pinyin_actor = ''.join(info.get('actor', '').strip().split('/')) pinyin_actor = ' '.join(lazy_pinyin(pinyin_actor.decode('utf8'))) pinyin_genres = ''.join(info.get('genres', '').strip().split('/')) pinyin_genres = ' '.join(lazy_pinyin(pinyin_genres.decode('utf8'))) actor = ';'.join(info.get('actor', '').strip().split('/')) area = ';'.join(info.get('area', '').strip().split('/')) director = ';'.join(info.get('area', '').strip().split('/')) genres = ';'.join(info.get('genres', '').strip().split('/')) writer.add_document(movieid=info.get('movieid').decode('utf8'), title=info.get('title').decode('utf8'), pinyin_title=pinyin_title, type=info.get('type'), actor=actor.decode('utf8'), pinyin_actor=pinyin_actor, genres=genres.decode('utf8'), pinyin_genres=pinyin_genres, director=director.decode('utf8'), score=info.get('douban_score'), description=info.get('description').decode('utf8'), pinyin_description=pinyin_description, area=area.decode('utf8'), year=info.get('year')) writer.commit(mergetype=writing.CLEAR)
class WhooshSearchBackend(BaseSearchBackend): # Word reserved by Whoosh for special use. RESERVED_WORDS = ( 'AND', 'NOT', 'OR', 'TO', ) # Characters reserved by Whoosh for special use. # The '\\' must come first, so as not to overwrite the other slash replacements. RESERVED_CHARACTERS = ( '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '.', ) def __init__(self, connection_alias, **connection_options): super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options) self.setup_complete = False self.use_file_storage = True self.post_limit = getattr(connection_options, 'POST_LIMIT', 128 * 1024 * 1024) self.path = connection_options.get('PATH') if connection_options.get('STORAGE', 'file') != 'file': self.use_file_storage = False if self.use_file_storage and not self.path: raise ImproperlyConfigured( "You must specify a 'PATH' in your settings for connection '%s'." % connection_alias) self.log = logging.getLogger('haystack') def setup(self): """ Defers loading until needed. """ from haystack import connections new_index = False # Make sure the index is there. if self.use_file_storage and not os.path.exists(self.path): os.makedirs(self.path) new_index = True if self.use_file_storage and not os.access(self.path, os.W_OK): raise IOError( "The path to your Whoosh index '%s' is not writable for the current user/group." % self.path) if self.use_file_storage: self.storage = FileStorage(self.path) else: global LOCALS if getattr(LOCALS, 'RAM_STORE', None) is None: LOCALS.RAM_STORE = RamStorage() self.storage = LOCALS.RAM_STORE self.content_field_name, self.schema = self.build_schema(connections[ self.connection_alias].get_unified_index().all_searchfields()) self.parser = QueryParser(self.content_field_name, schema=self.schema) if new_index is True: self.index = self.storage.create_index(self.schema) else: try: self.index = self.storage.open_index(schema=self.schema) except index.EmptyIndexError: self.index = self.storage.create_index(self.schema) self.setup_complete = True def build_schema(self, fields): schema_fields = { ID: WHOOSH_ID(stored=True, unique=True), DJANGO_CT: WHOOSH_ID(stored=True), DJANGO_ID: WHOOSH_ID(stored=True), } # Grab the number of keys that are hard-coded into Haystack. # We'll use this to (possibly) fail slightly more gracefully later. initial_key_count = len(schema_fields) content_field_name = '' for field_name, field_class in fields.items(): if field_class.is_multivalued: if field_class.indexed is False: schema_fields[field_class.index_fieldname] = IDLIST( stored=True, field_boost=field_class.boost) else: schema_fields[field_class.index_fieldname] = KEYWORD( stored=True, commas=True, scorable=True, field_boost=field_class.boost) elif field_class.field_type in ['date', 'datetime']: schema_fields[field_class.index_fieldname] = DATETIME( stored=field_class.stored, sortable=True) elif field_class.field_type == 'integer': schema_fields[field_class.index_fieldname] = NUMERIC( stored=field_class.stored, numtype=int, field_boost=field_class.boost) elif field_class.field_type == 'float': schema_fields[field_class.index_fieldname] = NUMERIC( stored=field_class.stored, numtype=float, field_boost=field_class.boost) elif field_class.field_type == 'boolean': # Field boost isn't supported on BOOLEAN as of 1.8.2. 
schema_fields[field_class.index_fieldname] = BOOLEAN( stored=field_class.stored) elif field_class.field_type == 'ngram': schema_fields[field_class.index_fieldname] = NGRAM( minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost) elif field_class.field_type == 'edge_ngram': schema_fields[field_class.index_fieldname] = NGRAMWORDS( minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost) else: schema_fields[field_class.index_fieldname] = TEXT( stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True) if field_class.document is True: content_field_name = field_class.index_fieldname schema_fields[field_class.index_fieldname].spelling = True # Fail more gracefully than relying on the backend to die if no fields # are found. if len(schema_fields) <= initial_key_count: raise SearchBackendError( "No fields were found in any search_indexes. Please correct this before attempting to search." ) return (content_field_name, Schema(**schema_fields)) def update(self, index, iterable, commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() writer = AsyncWriter(self.index) for obj in iterable: try: doc = index.full_prepare(obj) except SkipDocument: self.log.debug(u"Indexing for object `%s` skipped", obj) else: # Really make sure it's unicode, because Whoosh won't have it any # other way. for key in doc: doc[key] = self._from_python(doc[key]) # Document boosts aren't supported in Whoosh 2.5.0+. if 'boost' in doc: del doc['boost'] try: writer.update_document(**doc) except Exception as e: if not self.silently_fail: raise # We'll log the object identifier but won't include the actual object # to avoid the possibility of that generating encoding errors while # processing the log message: self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={ "data": { "index": index, "object": get_identifier(obj) } }) if len(iterable) > 0: # For now, commit no matter what, as we run into locking issues otherwise. writer.commit() def remove(self, obj_or_string, commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() whoosh_id = get_identifier(obj_or_string) try: self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' % (ID, whoosh_id))) except Exception as e: if not self.silently_fail: raise self.log.error("Failed to remove document '%s' from Whoosh: %s", whoosh_id, e, exc_info=True) def clear(self, models=None, commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() if models is not None: assert isinstance(models, (list, tuple)) try: if models is None: self.delete_index() else: models_to_delete = [] for model in models: models_to_delete.append(u"%s:%s" % (DJANGO_CT, get_model_ct(model))) self.index.delete_by_query( q=self.parser.parse(u" OR ".join(models_to_delete))) except Exception as e: if not self.silently_fail: raise if models is not None: self.log.error( "Failed to clear Whoosh index of models '%s': %s", ','.join(models_to_delete), e, exc_info=True) else: self.log.error("Failed to clear Whoosh index: %s", e, exc_info=True) def delete_index(self): # Per the Whoosh mailing list, if wiping out everything from the index, # it's much more efficient to simply delete the index files. if self.use_file_storage and os.path.exists(self.path): shutil.rmtree(self.path) elif not self.use_file_storage: self.storage.clean() # Recreate everything. 
self.setup() def optimize(self): if not self.setup_complete: self.setup() self.index = self.index.refresh() self.index.optimize() def calculate_page(self, start_offset=0, end_offset=None): # Prevent against Whoosh throwing an error. Requires an end_offset # greater than 0. if end_offset is not None and end_offset <= 0: end_offset = 1 # Determine the page. page_num = 0 if end_offset is None: end_offset = 1000000 if start_offset is None: start_offset = 0 page_length = end_offset - start_offset if page_length and page_length > 0: page_num = int(start_offset / page_length) # Increment because Whoosh uses 1-based page numbers. page_num += 1 return page_num, page_length @log_query def search(self, query_string, sort_by=None, start_offset=0, end_offset=None, fields='', highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None, spelling_query=None, within=None, dwithin=None, distance_point=None, models=None, limit_to_registered_models=None, result_class=None, **kwargs): if not self.setup_complete: self.setup() # A zero length query should return no results. if len(query_string) == 0: return { 'results': [], 'hits': 0, } query_string = force_text(query_string) # A one-character query (non-wildcard) gets nabbed by a stopwords # filter and should yield zero results. if len(query_string) <= 1 and query_string != u'*': return { 'results': [], 'hits': 0, } reverse = False if sort_by is not None: # Determine if we need to reverse the results and if Whoosh can # handle what it's being asked to sort by. Reversing is an # all-or-nothing action, unfortunately. sort_by_list = [] reverse_counter = 0 for order_by in sort_by: if order_by.startswith('-'): reverse_counter += 1 if reverse_counter and reverse_counter != len(sort_by): raise SearchBackendError("Whoosh requires all order_by fields" " to use the same sort direction") for order_by in sort_by: if order_by.startswith('-'): sort_by_list.append(order_by[1:]) if len(sort_by_list) == 1: reverse = True else: sort_by_list.append(order_by) if len(sort_by_list) == 1: reverse = False sort_by = sort_by_list if facets is not None: warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2) if date_facets is not None: warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2) if query_facets is not None: warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2) narrowed_results = None self.index = self.index.refresh() if limit_to_registered_models is None: limit_to_registered_models = getattr( settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) if models and len(models): model_choices = sorted(get_model_ct(model) for model in models) elif limit_to_registered_models: # Using narrow queries, limit the results to only models handled # with the current routers. model_choices = self.build_models_list() else: model_choices = [] if len(model_choices) > 0: if narrow_queries is None: narrow_queries = set() narrow_queries.add(' OR '.join( ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices])) narrow_searcher = None if narrow_queries is not None: # Potentially expensive? I don't see another way to do it in Whoosh... 
narrow_searcher = self.index.searcher() for nq in narrow_queries: recent_narrowed_results = narrow_searcher.search( self.parser.parse(force_text(nq)), limit=None) if len(recent_narrowed_results) <= 0: return { 'results': [], 'hits': 0, } if narrowed_results: narrowed_results.filter(recent_narrowed_results) else: narrowed_results = recent_narrowed_results self.index = self.index.refresh() if self.index.doc_count(): searcher = self.index.searcher() parsed_query = self.parser.parse(query_string) # In the event of an invalid/stopworded query, recover gracefully. if parsed_query is None: return { 'results': [], 'hits': 0, } page_num, page_length = self.calculate_page( start_offset, end_offset) search_kwargs = { 'pagelen': page_length, 'sortedby': sort_by, 'reverse': reverse, } # Handle the case where the results have been narrowed. if narrowed_results is not None: search_kwargs['filter'] = narrowed_results try: raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs) except ValueError: if not self.silently_fail: raise return { 'results': [], 'hits': 0, 'spelling_suggestion': None, } # Because as of Whoosh 2.5.1, it will return the wrong page of # results if you request something too high. :( if raw_page.pagenum < page_num: return { 'results': [], 'hits': 0, 'spelling_suggestion': None, } results = self._process_results(raw_page, highlight=highlight, query_string=query_string, spelling_query=spelling_query, result_class=result_class) searcher.close() if hasattr(narrow_searcher, 'close'): narrow_searcher.close() return results else: if self.include_spelling: if spelling_query: spelling_suggestion = self.create_spelling_suggestion( spelling_query) else: spelling_suggestion = self.create_spelling_suggestion( query_string) else: spelling_suggestion = None return { 'results': [], 'hits': 0, 'spelling_suggestion': spelling_suggestion, } def more_like_this(self, model_instance, additional_query_string=None, start_offset=0, end_offset=None, models=None, limit_to_registered_models=None, result_class=None, **kwargs): if not self.setup_complete: self.setup() field_name = self.content_field_name narrow_queries = set() narrowed_results = None self.index = self.index.refresh() if limit_to_registered_models is None: limit_to_registered_models = getattr( settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) if models and len(models): model_choices = sorted(get_model_ct(model) for model in models) elif limit_to_registered_models: # Using narrow queries, limit the results to only models handled # with the current routers. model_choices = self.build_models_list() else: model_choices = [] if len(model_choices) > 0: if narrow_queries is None: narrow_queries = set() narrow_queries.add(' OR '.join( ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices])) if additional_query_string and additional_query_string != '*': narrow_queries.add(additional_query_string) narrow_searcher = None if narrow_queries is not None: # Potentially expensive? I don't see another way to do it in Whoosh... 
narrow_searcher = self.index.searcher() for nq in narrow_queries: recent_narrowed_results = narrow_searcher.search( self.parser.parse(force_text(nq)), limit=None) if len(recent_narrowed_results) <= 0: return { 'results': [], 'hits': 0, } if narrowed_results: narrowed_results.filter(recent_narrowed_results) else: narrowed_results = recent_narrowed_results page_num, page_length = self.calculate_page(start_offset, end_offset) self.index = self.index.refresh() raw_results = EmptyResults() searcher = None if self.index.doc_count(): query = "%s:%s" % (ID, get_identifier(model_instance)) searcher = self.index.searcher() parsed_query = self.parser.parse(query) results = searcher.search(parsed_query) if len(results): raw_results = results[0].more_like_this(field_name, top=end_offset) # Handle the case where the results have been narrowed. if narrowed_results is not None and hasattr(raw_results, 'filter'): raw_results.filter(narrowed_results) try: raw_page = ResultsPage(raw_results, page_num, page_length) except ValueError: if not self.silently_fail: raise return { 'results': [], 'hits': 0, 'spelling_suggestion': None, } # Because as of Whoosh 2.5.1, it will return the wrong page of # results if you request something too high. :( if raw_page.pagenum < page_num: return { 'results': [], 'hits': 0, 'spelling_suggestion': None, } results = self._process_results(raw_page, result_class=result_class) if searcher: searcher.close() if hasattr(narrow_searcher, 'close'): narrow_searcher.close() return results def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None, result_class=None): from haystack import connections results = [] # It's important to grab the hits first before slicing. Otherwise, this # can cause pagination failures. hits = len(raw_page) if result_class is None: result_class = SearchResult facets = {} spelling_suggestion = None unified_index = connections[self.connection_alias].get_unified_index() indexed_models = unified_index.get_indexed_models() for doc_offset, raw_result in enumerate(raw_page): score = raw_page.score(doc_offset) or 0 app_label, model_name = raw_result[DJANGO_CT].split('.') additional_fields = {} model = haystack_get_model(app_label, model_name) if model and model in indexed_models: for key, value in raw_result.items(): index = unified_index.get_index(model) string_key = str(key) if string_key in index.fields and hasattr( index.fields[string_key], 'convert'): # Special-cased due to the nature of KEYWORD fields. 
if index.fields[string_key].is_multivalued: if value is None or len(value) is 0: additional_fields[string_key] = [] else: additional_fields[string_key] = value.split( ',') else: additional_fields[string_key] = index.fields[ string_key].convert(value) else: additional_fields[string_key] = self._to_python(value) del (additional_fields[DJANGO_CT]) del (additional_fields[DJANGO_ID]) if highlight: sa = StemmingAnalyzer() formatter = WhooshHtmlFormatter('em') terms = [token.text for token in sa(query_string)] whoosh_result = whoosh_highlight( additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(), formatter) additional_fields['highlighted'] = { self.content_field_name: [whoosh_result], } result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields) results.append(result) else: hits -= 1 if self.include_spelling: if spelling_query: spelling_suggestion = self.create_spelling_suggestion( spelling_query) else: spelling_suggestion = self.create_spelling_suggestion( query_string) return { 'results': results, 'hits': hits, 'facets': facets, 'spelling_suggestion': spelling_suggestion, } def create_spelling_suggestion(self, query_string): spelling_suggestion = None reader = self.index.reader() corrector = reader.corrector(self.content_field_name) cleaned_query = force_text(query_string) if not query_string: return spelling_suggestion # Clean the string. for rev_word in self.RESERVED_WORDS: cleaned_query = cleaned_query.replace(rev_word, '') for rev_char in self.RESERVED_CHARACTERS: cleaned_query = cleaned_query.replace(rev_char, '') # Break it down. query_words = cleaned_query.split() suggested_words = [] for word in query_words: suggestions = corrector.suggest(word, limit=1) if len(suggestions) > 0: suggested_words.append(suggestions[0]) spelling_suggestion = ' '.join(suggested_words) return spelling_suggestion def _from_python(self, value): """ Converts Python values to a string for Whoosh. Code courtesy of pysolr. """ if hasattr(value, 'strftime'): if not hasattr(value, 'hour'): value = datetime(value.year, value.month, value.day, 0, 0, 0) elif isinstance(value, bool): if value: value = 'true' else: value = 'false' elif isinstance(value, (list, tuple)): value = u','.join([force_text(v) for v in value]) elif isinstance(value, (six.integer_types, float)): # Leave it alone. pass else: value = force_text(value) return value def _to_python(self, value): """ Converts values from Whoosh to native Python values. A port of the same method in pysolr, as they deal with data the same way. """ if value == 'true': return True elif value == 'false': return False if value and isinstance(value, six.string_types): possible_datetime = DATETIME_REGEX.search(value) if possible_datetime: date_values = possible_datetime.groupdict() for dk, dv in date_values.items(): date_values[dk] = int(dv) return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second']) try: # Attempt to use json to load the values. converted_value = json.loads(value) # Try to handle most built-in types. if isinstance( converted_value, (list, tuple, set, dict, six.integer_types, float, complex)): return converted_value except: # If it fails (SyntaxError or its ilk) or we don't trust it, # continue on. pass return value
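A stripped-down sketch of the storage selection that setup() performs above, switching between an on-disk FileStorage and an in-memory RamStorage; the use_ram flag and the open-or-create helper are illustrative, not part of the Haystack API.

import os
from whoosh.filedb.filestore import FileStorage, RamStorage
from whoosh.index import EmptyIndexError

def get_or_create_index(schema, path=None, use_ram=False):
    if use_ram:
        storage = RamStorage()
    else:
        if not os.path.exists(path):
            os.makedirs(path)
        storage = FileStorage(path)
    try:
        # Passing the schema mirrors how the backend above opens its index.
        return storage.open_index(schema=schema)
    except EmptyIndexError:
        # Nothing has been indexed in this storage yet, so build a fresh index.
        return storage.create_index(schema)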
def setUp(self): self.path = tempfile.mkdtemp() self.backup = weblate.trans.search.STORAGE self.storage = FileStorage(self.path) weblate.trans.search.STORAGE = self.storage self.storage.create()
from whoosh.fields import SchemaClass, TEXT, NUMERIC
from whoosh.filedb.filestore import FileStorage
from whoosh.writing import AsyncWriter, BufferedWriter
from whoosh import qparser

from django.conf import settings
from django.dispatch import receiver
from django.db.models.signals import post_migrate
from django.db.utils import IntegrityError
from django.utils.encoding import force_text
from django.db import transaction

from weblate.lang.models import Language
from weblate.trans.data import data_dir

STORAGE = FileStorage(data_dir('whoosh'))


class TargetSchema(SchemaClass):
    '''
    Fulltext index schema for target strings.
    '''
    pk = NUMERIC(stored=True, unique=True)
    target = TEXT()
    comment = TEXT()


class SourceSchema(SchemaClass):
    '''
    Fulltext index schema for source and context strings.
    '''
    pk = NUMERIC(stored=True, unique=True)
    source = TEXT()
    context = TEXT()
    location = TEXT()
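The module above imports AsyncWriter; a minimal sketch of how an index update could use it so that a locked index does not make the write fail. The update_source_unit helper and its arguments are hypothetical, only the AsyncWriter usage is the point.

from whoosh.writing import AsyncWriter

def update_source_unit(index, pk, source, context, location):
    # If another process holds the write lock, AsyncWriter queues the
    # operations and commits them from a background thread once the lock frees.
    writer = AsyncWriter(index)
    writer.update_document(pk=pk, source=source, context=context, location=location)
    writer.commit()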
from bson.objectid import ObjectId

# Index schema: index user full names with a stemming analyzer and store the
# full name so it can be returned with search results.
ana = StemmingAnalyzer()
schema = Schema(fullname=TEXT(analyzer=ana, spelling=True, stored=True))

# # Create index dir if it does not exist.
# if not os.path.exists("index"):
#     os.mkdir("index")
#
# # Initialize index
# index = create_in("index", schema)

st = FileStorage("index_fullname").create()
index = st.create_index(schema)

# Initiate db connection
# connection = Connection('localhost', 27017)
# db = connection["cozy-home"]
# posts = db.posts
conn = Connection(username="******", password="******")
db = conn["example"]
aql_getLibraries = "FOR library in libraries RETURN library"
posts = db.AQLQuery(aql_getLibraries, rawResults=True, batchSize=10000)
# print(len(posts))

# Fill index with posts from DB
writer = index.writer()
for post in posts:
                category=TEXT(stored=True, analyzer=analyzer),
                owner=TEXT(stored=True))

# Add the documents to be indexed according to the schema definition.
# Note: string values must be unicode.
# The schema information is stored under the 'indexdir' directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
indexdir = BASE_DIR + '/indexdir/'
if arg.output[0] != '':
    indexdir = arg.output[0]
if not os.path.exists(indexdir):
    os.mkdir(indexdir)
    storage = FileStorage(indexdir)
    ix = create_in(indexdir, schema)
else:
    ix = index.open_dir(indexdir)
    # ix = create_in(indexdir, schema)
    print("open")

# write index
writer = ix.writer()
for k, v in to_update.items():
    writer.update_document(
        guid="github_%s" % v['repo_url'],
        source_type='g0v-repos',
        title=v['repo_name'],
def storage(self): return FileStorage(data_dir(self.LOCATION))
def ranked_set_baseline(self,basefile): # Helper from http://effbot.org/zone/element-lib.htm def flatten(elem, include_tail=0): text = elem.text or "" for e in elem: text += flatten(e, 1) if include_tail and elem.tail: text += elem.tail return text # step 1: Create a temporary whoosh index in order to find out # the most significant words for each article ana = analysis.StandardAnalyzer() # ana = analysis.StemmingAnalyzer() vectorformat = formats.Frequency(ana) schema = fields.Schema(article=fields.ID(unique=True), title=fields.TEXT(stored=True), content=fields.TEXT(analyzer=ana, vector=vectorformat)) st = RamStorage() tmpidx = st.create_index(schema) w = tmpidx.writer() XHT_NS = "{http://www.w3.org/1999/xhtml}" tree = ET.parse(self.parsed_path(basefile)) els = tree.findall("//"+XHT_NS+"div") articles = [] for el in els: if 'typeof' in el.attrib and el.attrib['typeof'] == "eurlex:Article": text = Util.normalizeSpace(flatten(el)) article = unicode(el.attrib['id'][1:]) articles.append(article) w.update_document(article=article,title="Article "+ article,content=text) w.commit() self.log.info("Indexed %d articles" % len(articles)) # Step 2: Open the large whoosh index containing the text of # all cases. Then, for each article, use the 20 most distinctive terms # (filtering away numbers) to create a query against that index # things to vary: # * numterms # * connector (AND or OR) # * scoring (weighting=scoring.Cosine()) numterms = 5 connector = " AND " indexdir = os.path.sep.join([self.config['datadir'],'ecj','index']) storage = FileStorage(indexdir) idx = storage.open_index() searcher = idx.searcher(weighting=scoring.BM25F()) tempsearch = tmpidx.searcher() rankedset = {} for article in articles: rankedset[article] = [] r = tempsearch.search(query.Term("article",article)) terms = [t[0] for t in r.key_terms("content", numterms=numterms+1) if not t[0].isdigit()][:numterms] print "Article %s:%r" % (article, terms) parser = qparser.QueryParser("content") q = parser.parse(connector.join(terms)) results = searcher.search(q, limit=10) resultidx = 0 for result in results: reslbl = "%s (%s)"%(result['title'],results.score(resultidx)) rankedset[article].append([result['basefile'],reslbl]) print u"\t%s (%s)" % (result['title'], results.score(resultidx)) resultidx += 1 return rankedset
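A reduced sketch of the temporary-index trick used above: build a throwaway RamStorage index with term vectors, then ask Whoosh for the key terms of a matching document. The two sample documents are made up, and vector=True stands in for the older formats.Frequency(analyzer) style used in the snippet.

from whoosh import analysis, fields
from whoosh.filedb.filestore import RamStorage
from whoosh.query import Term

ana = analysis.StandardAnalyzer()
schema = fields.Schema(
    article=fields.ID(unique=True, stored=True),
    content=fields.TEXT(analyzer=ana, vector=True),  # vectors are needed for key_terms()
)
ix = RamStorage().create_index(schema)

writer = ix.writer()
writer.update_document(article=u"1", content=u"customs duties on imports between member states")
writer.update_document(article=u"2", content=u"free movement of workers within the community")
writer.commit()

with ix.searcher() as searcher:
    results = searcher.search(Term("article", u"1"))
    print([term for term, score in results.key_terms("content", numterms=5)])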
class Indexer(RaftNode): def __init__(self, host='localhost', port=7070, seed_addr=None, conf=SyncObjConf(), data_dir='/tmp/cockatrice/index', grpc_port=5050, grpc_max_workers=10, http_port=8080, logger=getLogger(), http_logger=getLogger(), metrics_registry=CollectorRegistry()): self.__host = host self.__port = port self.__seed_addr = seed_addr self.__conf = conf self.__data_dir = data_dir self.__grpc_port = grpc_port self.__grpc_max_workers = grpc_max_workers self.__http_port = http_port self.__logger = logger self.__http_logger = http_logger self.__metrics_registry = metrics_registry # metrics self.__metrics_core_documents = Gauge( '{0}_indexer_index_documents'.format(NAME), 'The number of documents.', [ 'index_name', ], registry=self.__metrics_registry) self.__metrics_requests_total = Counter( '{0}_indexer_requests_total'.format(NAME), 'The number of requests.', ['func'], registry=self.__metrics_registry) self.__metrics_requests_duration_seconds = Histogram( '{0}_indexer_requests_duration_seconds'.format(NAME), 'The invocation duration in seconds.', ['func'], registry=self.__metrics_registry) self.__self_addr = '{0}:{1}'.format(self.__host, self.__port) self.__peer_addrs = [] if self.__seed_addr is None else get_peers( bind_addr=self.__seed_addr, timeout=10) self.__other_addrs = [ peer_addr for peer_addr in self.__peer_addrs if peer_addr != self.__self_addr ] self.__conf.serializer = self.__serialize self.__conf.deserializer = self.__deserialize self.__conf.validate() self.__indices = {} self.__index_configs = {} self.__writers = {} self.__auto_commit_timers = {} self.__lock = RLock() # create data dir os.makedirs(self.__data_dir, exist_ok=True) self.__file_storage = FileStorage(self.__data_dir, supports_mmap=True, readonly=False, debug=False) self.__ram_storage = RamStorage() # if seed addr specified and self node does not exist in the cluster, add self node to the cluster if self.__seed_addr is not None and self.__self_addr not in self.__peer_addrs: Thread(target=add_node, kwargs={ 'node_name': self.__self_addr, 'bind_addr': self.__seed_addr, 'timeout': 10 }).start() # copy snapshot from the leader node if self.__seed_addr is not None: try: metadata = get_metadata(bind_addr=get_leader( bind_addr=self.__seed_addr, timeout=10), timeout=10) response = requests.get('http://{0}/snapshot'.format( metadata['http_addr'])) if response.status_code == HTTPStatus.OK: with open(self.__conf.fullDumpFile, 'wb') as f: f.write(response.content) except Exception as ex: self.__logger.error('failed to copy snapshot: {0}'.format(ex)) # start node metadata = { 'grpc_addr': '{0}:{1}'.format(self.__host, self.__grpc_port), 'http_addr': '{0}:{1}'.format(self.__host, self.__http_port) } self.__logger.info('starting raft state machine') super(Indexer, self).__init__(self.__self_addr, self.__peer_addrs, conf=self.__conf, metadata=metadata) self.__logger.info('raft state machine has started') if os.path.exists(self.__conf.fullDumpFile): self.__logger.debug('snapshot exists: {0}'.format( self.__conf.fullDumpFile)) else: pass while not self.isReady(): # recovering data self.__logger.debug('waiting for cluster ready') self.__logger.debug(self.getStatus()) time.sleep(1) self.__logger.info('cluster ready') self.__logger.debug(self.getStatus()) # open existing indices on startup for index_name in self.get_index_names(): self.__open_index(index_name, index_config=None) # record index metrics timer self.metrics_timer = Timer(10, self.__record_index_metrics) self.metrics_timer.start() # start gRPC self.__grpc_server = 
grpc.server( futures.ThreadPoolExecutor(max_workers=self.__grpc_max_workers)) add_IndexServicer_to_server( IndexGRPCServicer(self, logger=self.__logger, metrics_registry=self.__metrics_registry), self.__grpc_server) self.__grpc_server.add_insecure_port('{0}:{1}'.format( self.__host, self.__grpc_port)) self.__grpc_server.start() self.__logger.info('gRPC server has started') # start HTTP server self.__http_servicer = IndexHTTPServicer(self, self.__logger, self.__http_logger, self.__metrics_registry) self.__http_server = HTTPServer(self.__host, self.__http_port, self.__http_servicer) self.__http_server.start() self.__logger.info('HTTP server has started') self.__logger.info('indexer has started') def stop(self): # stop HTTP server self.__http_server.stop() self.__logger.info('HTTP server has stopped') # stop gRPC server self.__grpc_server.stop(grace=0.0) self.__logger.info('gRPC server has stopped') self.metrics_timer.cancel() # close indices for index_name in list(self.__indices.keys()): self.__close_index(index_name) self.destroy() self.__logger.info('index core has stopped') def __record_index_metrics(self): for index_name in list(self.__indices.keys()): try: self.__metrics_core_documents.labels( index_name=index_name).set(self.get_doc_count(index_name)) except Exception as ex: self.__logger.error(ex) def __record_metrics(self, start_time, func_name): self.__metrics_requests_total.labels(func=func_name).inc() self.__metrics_requests_duration_seconds.labels( func=func_name).observe(time.time() - start_time) # def __serialize_indices(self, filename): # with self.__lock: # try: # self.__logger.info('starting serialize indices') # # except Exception as ex: # self.__logger.error('failed to create snapshot: {0}'.format(ex)) # finally: # self.__logger.info('serialize indices has finished') # def __serialize_raft_data(self, filename, raft_data): # with self.__lock: # pass # index serializer def __serialize(self, filename, raft_data): with self.__lock: try: self.__logger.debug('serializer has started') # store the index files and raft logs to the snapshot file with zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED) as f: for index_name in self.get_index_names(): self.__commit_index(index_name) # with self.__get_writer(index_name).writelock: # with self.__indices[index_name].lock('WRITELOCK'): # index files for index_filename in self.get_index_files(index_name): if self.__index_configs.get( index_name).get_storage_type() == "ram": with self.__ram_storage.open_file( index_filename) as r: f.writestr(index_filename, r.read()) else: f.write( os.path.join(self.__file_storage.folder, index_filename), index_filename) self.__logger.debug('{0} has stored in {1}'.format( index_filename, filename)) # index config file f.write( os.path.join( self.__file_storage.folder, self.get_index_config_file(index_name)), self.get_index_config_file(index_name)) self.__logger.debug('{0} has stored in {1}'.format( self.get_index_config_file(index_name), filename)) # store the raft data f.writestr(RAFT_DATA_FILE, pickle.dumps(raft_data)) self.__logger.debug( '{0} has restored'.format(RAFT_DATA_FILE)) self.__logger.debug('snapshot has created') except Exception as ex: self.__logger.error( 'failed to create snapshot: {0}'.format(ex)) finally: self.__logger.debug('serializer has stopped') # index deserializer def __deserialize(self, filename): with self.__lock: try: self.__logger.debug('deserializer has started') with zipfile.ZipFile(filename, 'r') as zf: # get file names in snapshot file filenames = list(zf.namelist()) # get 
index names in snapshot file index_names = [] pattern_toc = re.compile(r'^_(.+)_\d+\.toc$') for f in filenames: match = pattern_toc.search(f) if match and match.group(1) not in index_names: index_names.append(match.group(1)) for index_name in index_names: # extract the index config first zf.extract(self.get_index_config_file(index_name), path=self.__file_storage.folder) index_config = pickle.loads( zf.read(self.get_index_config_file(index_name))) # get index files pattern_toc = re.compile(r'^_{0}_(\d+)\..+$'.format( index_name)) # ex) _myindex_0.toc pattern_seg = re.compile( r'^{0}_([a-z0-9]+)\..+$'.format(index_name) ) # ex) myindex_zseabukc2nbpvh0u.seg pattern_lock = re.compile(r'^{0}_WRITELOCK$'.format( index_name)) # ex) myindex_WRITELOCK index_files = [] for file_name in filenames: if re.match(pattern_toc, file_name): index_files.append(file_name) elif re.match(pattern_seg, file_name): index_files.append(file_name) elif re.match(pattern_lock, file_name): index_files.append(file_name) # extract the index files for index_file in index_files: if index_config.get_storage_type() == 'ram': with self.__ram_storage.create_file( index_file) as r: r.write(zf.read(index_file)) else: zf.extract(index_file, path=self.__file_storage.folder) self.__logger.debug( '{0} has restored from {1}'.format( index_file, filename)) self.__logger.debug( '{0} has restored'.format(index_name)) # extract the raft data raft_data = pickle.loads(zf.read(RAFT_DATA_FILE)) self.__logger.debug( '{0} has restored'.format(RAFT_DATA_FILE)) return raft_data except Exception as ex: self.__logger.error( 'failed to restore indices: {0}'.format(ex)) finally: self.__logger.debug('deserializer has stopped') def is_healthy(self): return self.isHealthy() def is_alive(self): return self.isAlive() def is_ready(self): return self.isReady() def get_addr(self): return self.__self_addr def get_index_files(self, index_name): index_files = [] pattern_toc = re.compile( r'^_{0}_(\d+)\..+$'.format(index_name)) # ex) _myindex_0.toc pattern_seg = re.compile(r'^{0}_([a-z0-9]+)\..+$'.format( index_name)) # ex) myindex_zseabukc2nbpvh0u.seg pattern_lock = re.compile( r'^{0}_WRITELOCK$'.format(index_name)) # ex) myindex_WRITELOCK if self.__index_configs.get(index_name).get_storage_type() == "ram": storage = self.__ram_storage else: storage = self.__file_storage for file_name in list(storage.list()): if re.match(pattern_toc, file_name): index_files.append(file_name) elif re.match(pattern_seg, file_name): index_files.append(file_name) elif re.match(pattern_lock, file_name): index_files.append(file_name) return index_files @staticmethod def get_index_config_file(index_name): return '{0}_CONFIG'.format(index_name) def get_index_names(self): index_names = [] pattern_toc = re.compile(r'^_(.+)_\d+\.toc$') for filename in list(self.__file_storage.list()): match = pattern_toc.search(filename) if match and match.group(1) not in index_names: index_names.append(match.group(1)) for filename in list(self.__ram_storage.list()): match = pattern_toc.search(filename) if match and match.group(1) not in index_names: index_names.append(match.group(1)) return index_names def is_index_exist(self, index_name): return self.__file_storage.index_exists( indexname=index_name) or self.__ram_storage.index_exists( indexname=index_name) def is_index_open(self, index_name): return index_name in self.__indices @replicated def open_index(self, index_name, index_config=None): return self.__open_index(index_name, index_config=index_config) def __open_index(self, index_name, 
index_config=None): start_time = time.time() index = None try: # open the index index = self.__indices.get(index_name) if index is None: self.__logger.debug('opening {0}'.format(index_name)) if index_config is None: # set saved index config with open( os.path.join( self.__file_storage.folder, self.get_index_config_file(index_name)), 'rb') as f: self.__index_configs[index_name] = pickle.loads( f.read()) else: # set given index config self.__index_configs[index_name] = index_config if self.__index_configs[index_name].get_storage_type( ) == 'ram': index = self.__ram_storage.open_index( indexname=index_name, schema=self.__index_configs[index_name].get_schema()) else: index = self.__file_storage.open_index( indexname=index_name, schema=self.__index_configs[index_name].get_schema()) self.__indices[index_name] = index self.__logger.info('{0} has opened'.format(index_name)) # open the index writer self.__open_writer(index_name) except Exception as ex: self.__logger.error('failed to open {0}: {1}'.format( index_name, ex)) finally: self.__record_metrics(start_time, 'open_index') return index @replicated def close_index(self, index_name): return self.__close_index(index_name) def __close_index(self, index_name): start_time = time.time() index = None try: # close the index writer self.__close_writer(index_name) # close the index index = self.__indices.pop(index_name) if index is not None: self.__logger.debug('closing {0}'.format(index_name)) index.close() self.__logger.info('{0} has closed'.format(index_name)) except Exception as ex: self.__logger.error('failed to close {0}: {1}'.format( index_name, ex)) finally: self.__record_metrics(start_time, 'close_index') return index @replicated def create_index(self, index_name, index_config): return self.__create_index(index_name, index_config) def __create_index(self, index_name, index_config): if self.is_index_exist(index_name): # open the index return self.__open_index(index_name, index_config=index_config) start_time = time.time() index = None with self.__lock: try: self.__logger.debug('creating {0}'.format(index_name)) # set index config self.__index_configs[index_name] = index_config self.__logger.debug( self.__index_configs[index_name].get_storage_type()) # create the index if self.__index_configs[index_name].get_storage_type( ) == 'ram': index = self.__ram_storage.create_index( self.__index_configs[index_name].get_schema(), indexname=index_name) else: index = self.__file_storage.create_index( self.__index_configs[index_name].get_schema(), indexname=index_name) self.__indices[index_name] = index self.__logger.info('{0} has created'.format(index_name)) # save the index config with open( os.path.join(self.__file_storage.folder, self.get_index_config_file(index_name)), 'wb') as f: f.write(pickle.dumps(index_config)) # open the index writer self.__open_writer(index_name) except Exception as ex: self.__logger.error('failed to create {0}: {1}'.format( index_name, ex)) finally: self.__record_metrics(start_time, 'create_index') return index @replicated def delete_index(self, index_name): return self.__delete_index(index_name) def __delete_index(self, index_name): # close index index = self.__close_index(index_name) start_time = time.time() with self.__lock: try: self.__logger.debug('deleting {0}'.format(index_name)) # delete index files for filename in self.get_index_files(index_name): self.__file_storage.delete_file(filename) self.__logger.debug('{0} was deleted'.format(filename)) self.__logger.info('{0} has deleted'.format(index_name)) # delete the index 
config self.__index_configs.pop(index_name, None) os.remove( os.path.join(self.__file_storage.folder, self.get_index_config_file(index_name))) except Exception as ex: self.__logger.error('failed to delete {0}: {1}'.format( index_name, ex)) finally: self.__record_metrics(start_time, 'delete_index') return index def get_index(self, index_name): return self.__get_index(index_name) def __get_index(self, index_name): start_time = time.time() try: index = self.__indices.get(index_name) except Exception as ex: raise ex finally: self.__record_metrics(start_time, 'get_index') return index def __start_auto_commit_timer(self, index_name, period): timer = self.__auto_commit_timers.get(index_name, None) if timer is None: self.__auto_commit_timers[index_name] = threading.Timer( period, self.__auto_commit_index, kwargs={ 'index_name': index_name, 'period': period }) self.__auto_commit_timers[index_name].start() self.__logger.debug( 'auto commit timer for {0} were started'.format(index_name)) def __stop_auto_commit_timer(self, index_name): timer = self.__auto_commit_timers.pop(index_name, None) if timer is not None: timer.cancel() self.__logger.debug( 'auto commit timer for {0} were stopped'.format(index_name)) def __auto_commit_index(self, index_name, period): self.__stop_auto_commit_timer(index_name) self.__commit_index(index_name) self.__start_auto_commit_timer(index_name, period=period) def __open_writer(self, index_name): writer = None try: writer = self.__writers.get(index_name, None) if writer is None or writer.is_closed: self.__logger.debug( 'opening writer for {0}'.format(index_name)) writer = self.__indices.get(index_name).writer() self.__writers[index_name] = writer self.__logger.debug( 'writer for {0} has opened'.format(index_name)) self.__start_auto_commit_timer( index_name, period=self.__index_configs.get( index_name).get_writer_auto_commit_period()) except Exception as ex: self.__logger.error('failed to open writer for {0}: {1}'.format( index_name, ex)) return writer def __close_writer(self, index_name): writer = None try: self.__stop_auto_commit_timer(index_name) # close the index writer = self.__writers.pop(index_name, None) if writer is not None: self.__logger.debug( 'closing writer for {0}'.format(index_name)) writer.commit() self.__logger.debug( 'writer for {0} has closed'.format(index_name)) except Exception as ex: self.__logger.error('failed to close writer for {0}: {1}'.format( index_name, ex)) return writer def __get_writer(self, index_name): return self.__writers.get(index_name, None) def __get_searcher(self, index_name, weighting=None): try: if weighting is None: searcher = self.__indices.get(index_name).searcher() else: searcher = self.__indices.get(index_name).searcher( weighting=weighting) except Exception as ex: raise ex return searcher @replicated def commit_index(self, index_name): return self.__commit_index(index_name) def __commit_index(self, index_name): start_time = time.time() success = False with self.__lock: try: self.__logger.debug('committing {0}'.format(index_name)) self.__get_writer(index_name).commit() self.__open_writer(index_name) # reopen writer self.__logger.info('{0} has committed'.format(index_name)) success = True except Exception as ex: self.__logger.error('failed to commit index {0}: {1}'.format( index_name, ex)) finally: self.__record_metrics(start_time, 'commit_index') return success @replicated def rollback_index(self, index_name): return self.__rollback_index(index_name) def __rollback_index(self, index_name): start_time = time.time() success = 
False with self.__lock: try: self.__logger.debug('rolling back {0}'.format(index_name)) self.__get_writer(index_name).cancel() self.__open_writer(index_name) # reopen writer self.__logger.info('{0} has rolled back'.format(index_name)) success = True except Exception as ex: self.__logger.error('failed to rollback index {0}: {1}'.format( index_name, ex)) finally: self.__record_metrics(start_time, 'rollback_index') return success @replicated def optimize_index(self, index_name): return self.__optimize_index(index_name) def __optimize_index(self, index_name): start_time = time.time() success = False with self.__lock: try: self.__logger.debug('optimizing {0}'.format(index_name)) self.__get_writer(index_name).commit(optimize=True, merge=False) self.__open_writer(index_name) # reopen writer self.__logger.info('{0} has optimized'.format(index_name)) success = True except Exception as ex: self.__logger.error('failed to optimize {0}: {1}'.format( index_name, ex)) finally: self.__record_metrics(start_time, 'optimize_index') return success def get_doc_count(self, index_name): try: cnt = self.__indices.get(index_name).doc_count() except Exception as ex: raise ex return cnt def get_schema(self, index_name): try: schema = self.__indices.get(index_name).schema except Exception as ex: raise ex return schema @replicated def put_document(self, index_name, doc_id, fields): return self.__put_document(index_name, doc_id, fields) def __put_document(self, index_name, doc_id, fields): doc = copy.deepcopy(fields) doc[self.__index_configs.get(index_name).get_doc_id_field()] = doc_id return self.__put_documents(index_name, [doc]) @replicated def put_documents(self, index_name, docs): return self.__put_documents(index_name, docs) def __put_documents(self, index_name, docs): start_time = time.time() with self.__lock: try: self.__logger.debug( 'putting documents to {0}'.format(index_name)) # count = self.__get_writer(index_name).update_documents(docs) count = 0 for doc in docs: self.__get_writer(index_name).update_document(**doc) count += 1 self.__logger.info('{0} documents has put to {1}'.format( count, index_name)) except Exception as ex: self.__logger.error( 'failed to put documents to {0}: {1}'.format( index_name, ex)) count = -1 finally: self.__record_metrics(start_time, 'put_documents') return count def get_document(self, index_name, doc_id): try: results_page = self.search_documents( index_name, doc_id, self.__index_configs.get(index_name).get_doc_id_field(), 1, page_len=1) if results_page.total > 0: self.__logger.debug('{0} was got from {1}'.format( doc_id, index_name)) else: self.__logger.debug('{0} did not exist in {1}'.format( doc_id, index_name)) except Exception as ex: raise ex return results_page @replicated def delete_document(self, index_name, doc_id): return self.__delete_document(index_name, doc_id) def __delete_document(self, index_name, doc_id): return self.__delete_documents(index_name, [doc_id]) @replicated def delete_documents(self, index_name, doc_ids): return self.__delete_documents(index_name, doc_ids) def __delete_documents(self, index_name, doc_ids): start_time = time.time() with self.__lock: try: self.__logger.debug( 'deleting documents from {0}'.format(index_name)) # count = self.__get_writer(index_name).delete_documents(doc_ids, doc_id_field=self.__index_configs.get( # index_name).get_doc_id_field()) count = 0 for doc_id in doc_ids: count += self.__get_writer(index_name).delete_by_term( self.__index_configs.get( index_name).get_doc_id_field(), doc_id) self.__logger.info('{0} documents has 
deleted from {1}'.format( count, index_name)) except Exception as ex: self.__logger.error( 'failed to delete documents in bulk to {0}: {1}'.format( index_name, ex)) count = -1 finally: self.__record_metrics(start_time, 'delete_documents') return count def search_documents(self, index_name, query, search_field, page_num, page_len=10, weighting=None, **kwargs): start_time = time.time() try: searcher = self.__get_searcher(index_name, weighting=weighting) query_parser = QueryParser(search_field, self.get_schema(index_name)) query_obj = query_parser.parse(query) results_page = searcher.search_page(query_obj, page_num, pagelen=page_len, **kwargs) self.__logger.info('{0} documents ware searched from {1}'.format( results_page.total, index_name)) except Exception as ex: raise ex finally: self.__record_metrics(start_time, 'search_documents') return results_page @replicated def create_snapshot(self): self.__create_snapshot() def __create_snapshot(self): self.forceLogCompaction() def get_snapshot_file_name(self): return self.__conf.fullDumpFile def is_snapshot_exist(self): return os.path.exists(self.get_snapshot_file_name()) def open_snapshot_file(self): with self.__lock: try: file = open(self.get_snapshot_file_name(), mode='rb') except Exception as ex: raise ex return file
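A compact sketch of the core Whoosh calls wrapped by the Indexer above: create a named index inside a FileStorage, replace documents through a writer keyed on a unique id, and page through results with search_page(). The path, index name, and schema are placeholders.

from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import FileStorage
from whoosh.qparser import QueryParser

storage = FileStorage('/tmp/cockatrice-example').create()
schema = Schema(id=ID(unique=True, stored=True), text=TEXT(stored=True))
ix = storage.create_index(schema, indexname='myindex')

writer = ix.writer()
for doc_id, body in [(u'1', u'first document'), (u'2', u'second document')]:
    # update_document() deletes any previous document with the same unique id.
    writer.update_document(id=doc_id, text=body)
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser('text', ix.schema).parse(u'document')
    page = searcher.search_page(query, 1, pagelen=10)
    print(page.total, [hit['id'] for hit in page])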
parser.add_argument("-limit", type=int, default=1000) parser.add_argument("-binary", action="store_true") parser.add_argument("-add_bm25", action="store_true") args = parser.parse_args() from singletons import PREPROCESS, SEARCHER run_id = args.model if args.model == "clusvm": clusvm = util.load(f"clusvm.pkl") elif "sv" in args.model: svm = util.load(f"{args.svm_file}.pkl") elif args.model == "adarank": alpha = np.load("ada.npy") else: ix = FileStorage("data/msmarcoidx").open_index() if args.model == "bm25": SEARCHER = ix.searcher() qp = QueryParser("body", schema=ix.schema) def predict(inp): qid, query = inp ret = [] if args.model == "okapi" or args.model == "bm25": results = SEARCHER.search(qp.parse(query), limit=args.limit) for rank, hit in enumerate(results): ret.append([qid, hit["docid"], rank + 1, results.score(rank), run_id]) elif args.model == "clusvm":
>>> thesaurus.synonyms("hail") ['acclaim', 'come', 'herald'] """ word = word.lower() if self.searcher: return self.searcher.document(word=word)["syns"] else: return synonyms(self.w2n, self.n2w, word) if __name__ == "__main__": from time import clock from whoosh.filedb.filestore import FileStorage st = FileStorage("c:/testindex") # t = clock() # th = Thesaurus.from_filename("c:/wordnet/wn_s.pl") # print clock() - t # # t = clock() # th.to_storage(st) # print clock() - t # # t = clock() # print th.synonyms("light") # print clock() - t t = clock() th = Thesaurus.from_storage(st)