def handle(self, *args, **options):
    # Optionally rebuild indices from scratch
    if options['clean']:
        create_source_index()
        for lang in Language.objects.have_translation():
            create_target_index(lang=lang.code)

    # Open writer
    source_writer = BufferedWriter(get_source_index())
    target_writers = {}

    try:
        # Process all units
        for unit in self.iterate_units(*args, **options):
            lang = unit.translation.language.code
            # Lazy open writer
            if lang not in target_writers:
                target_writers[lang] = BufferedWriter(
                    get_target_index(lang)
                )
            # Update target index
            if unit.translation:
                update_target_unit_index(target_writers[lang], unit)
            # Update source index
            update_source_unit_index(source_writer, unit)
    finally:
        # Close all writers
        source_writer.close()
        for lang in target_writers:
            target_writers[lang].close()


def update_index(units, source_units=None):
    '''
    Updates fulltext index for given set of units.
    '''
    languages = Language.objects.all()

    # Default to same set for both updates
    if source_units is None:
        source_units = units

    # Update source index
    index = get_source_index()
    writer = BufferedWriter(index)
    try:
        for unit in source_units.iterator():
            update_source_unit_index(writer, unit)
    finally:
        writer.close()

    # Update per language indices
    for lang in languages:
        index = get_target_index(lang.code)
        writer = BufferedWriter(index)
        try:
            language_units = units.filter(translation__language=lang).exclude(
                target='')
            for unit in language_units.iterator():
                update_target_unit_index(writer, unit)
        finally:
            writer.close()


def update_index(units):
    """Update fulltext index for given set of units."""
    languages = Language.objects.have_translation()

    # Update source index
    if units.exists():
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for unit in units.iterator():
                update_source_unit_index(writer, unit)
        finally:
            writer.close()

    # Update per language indices
    for lang in languages:
        language_units = units.filter(translation__language=lang).exclude(
            target='')
        if language_units.exists():
            index = get_target_index(lang.code)
            writer = BufferedWriter(index)
            try:
                for unit in language_units.iterator():
                    update_target_unit_index(writer, unit)
            finally:
                writer.close()


def main():
    queue = HotQueue(main_config.INDEX_QUEUE,
                     host=main_config.REDIS_HOST,
                     port=main_config.REDIS_PORT)
    index = get_index(main_config.WHOOSH_INDEX_DIR)
    writer = BufferedWriter(index, limit=10)
    try:
        for doc_id in queue.consume():
            print "looking at {}".format(doc_id)
            doc = Document.query.get(doc_id)
            if doc:
                write_doc(doc, writer)
            else:
                print "no doc with doc_id {}".format(doc_id)
    finally:
        writer.close()


def source_writer(self, buffered=True):
    '''
    Returns source index writer (by default buffered).
    '''
    if not buffered:
        return self.source().writer()
    if self._source_writer is None:
        self._source_writer = BufferedWriter(self.source())
    return self._source_writer


def target_writer(self, lang, buffered=True):
    '''
    Returns target index writer (by default buffered) for given language.
    '''
    if not buffered:
        return self.target(lang).writer()
    if lang not in self._target_writer:
        self._target_writer[lang] = BufferedWriter(self.target(lang))
    return self._target_writer[lang]


def update_index(self, units):
    """Update fulltext index for given set of units."""
    # Update source index
    index = self.get_source_index()
    with BufferedWriter(index) as writer:
        for unit in units:
            self.update_source_unit_index(writer, unit)

    languages = set([unit['language'] for unit in units])

    # Update per language indices
    for language in languages:
        index = self.get_target_index(language)
        with BufferedWriter(index) as writer:
            for unit in units:
                if unit['language'] != language:
                    continue
                self.update_target_unit_index(writer, unit)


def update_index(self, units):
    """Update fulltext index for given set of units."""
    languages = Language.objects.have_translation()

    # Update source index
    if units.exists():
        index = self.get_source_index()
        with BufferedWriter(index) as writer:
            for unit in units.iterator():
                self.update_source_unit_index(writer, unit)

    # Update per language indices
    for lang in languages:
        language_units = units.filter(translation__language=lang).exclude(
            target='')
        if language_units.exists():
            index = self.get_target_index(lang.code)
            with BufferedWriter(index) as writer:
                for unit in language_units.iterator():
                    self.update_target_unit_index(writer, unit)


def update_index(units, source_units=None):
    '''
    Updates fulltext index for given set of units.
    '''
    languages = Language.objects.have_translation()

    # Default to same set for both updates
    if source_units is None:
        source_units = units

    # Update source index
    index = get_source_index()
    writer = BufferedWriter(index)
    try:
        for unit in source_units.iterator():
            update_source_unit_index(writer, unit)
    finally:
        writer.close()

    # Update per language indices
    for lang in languages:
        index = get_target_index(lang.code)
        writer = BufferedWriter(index)
        try:
            language_units = units.filter(
                translation__language=lang
            ).exclude(
                target=''
            )
            for unit in language_units.iterator():
                update_target_unit_index(writer, unit)
        finally:
            writer.close()


def update_index(units):
    """Update fulltext index for given set of units."""
    languages = Language.objects.have_translation()

    # Update source index
    if units.exists():
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for unit in units.iterator():
                update_source_unit_index(writer, unit)
        finally:
            writer.close()

    # Update per language indices
    for lang in languages:
        language_units = units.filter(
            translation__language=lang
        ).exclude(
            target=''
        )
        if language_units.exists():
            index = get_target_index(lang.code)
            writer = BufferedWriter(index)
            try:
                for unit in language_units.iterator():
                    update_target_unit_index(writer, unit)
            finally:
                writer.close()


def __call__(self, pipeline):
    self._writer = BufferedWriter(self._indexer, period=10, limit=1000)
    try:
        self.flush_thread.start()
        for event in pipeline:
            self.count += 1
            self._writer.add_document(source=unicode(event["source"]),
                                      name=unicode(event["index"]),
                                      raw=unicode(event["_raw"]),
                                      time=int(event.time),
                                      hash=unicode(event.hash))
    finally:
        self.flush()


def delete_search_units(source_units, languages):
    '''
    Delete fulltext index for given set of units.
    '''
    # Update source index
    if source_units:
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for pk in source_units:
                writer.delete_by_term('pk', pk)
        finally:
            writer.close()

    for lang, units in languages.items():
        index = get_target_index(lang)
        writer = BufferedWriter(index)
        try:
            for pk in units:
                writer.delete_by_term('pk', pk)
        finally:
            writer.close()


def daemon(self):
    """
    Daemon function.

    Continuously watches self.directory for changes and adds them to the
    index.
    """
    ix = self.get_ix()
    writer = BufferedWriter(ix, limit=100)
    event_handler = IndexWriterEventHandler(writer,
                                            self,
                                            self.all,
                                            self.exclude,
                                            self.include,
                                            )
    observer = Observer()
    observer.schedule(event_handler, path=self.directory, recursive=True)
    observer.start()
    # observer.should_keep_running()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
        writer.commit()
        ix.close()
    observer.join()


def bootstrap_index(dirname=None, indexname=None):
    """
    Create the spam index and add one placeholder post to it.
    """
    if dirname and indexname:
        ix = search.init_index(dirname=dirname,
                               indexname=indexname,
                               schema=spam_schema())
    else:
        ix = init_spam_index()

    writer = BufferedWriter(ix)

    # Write text to index.
    index_writer(writer=writer, title="Placeholder",
                 content_length=0, is_spam=True,
                 content='CONTENT', uid=STARTER_UID)

    writer.commit()
    writer.close()
    return ix


def test_20000_buffered():
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.close()
        print("Write buffered:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize buffered:", now() - t)


class IndexWhoosh(Index):
    """Implements the whoosh engine as indexer."""

    query_hash = QueryParser("hash", schema=SCHEMA)

    def create(self):
        self._indexer = create_in(self.path, SCHEMA)
        self._index = self._indexer

    def open(self, ro=False):
        if os.path.isdir(self.path):
            self._index = open_dir(self.path)
            self._indexer = self._index
        else:
            os.mkdir(self.path)
            self.create()
        self._searcher = self._index.searcher()
        self._opened = True

    def set_metadata(self, name, value):
        with open(os.path.join(self.path, "metadata-%s" % name), "w") as f:
            f.write(pickle.dumps(value))

    def get_metadata(self, name, default=None):
        try:
            with open(os.path.join(self.path, "metadata-%s" % name)) as f:
                return pickle.loads(f.read())
        except:
            return default

    def _timer(self):
        while True:
            self.flush()
            sleep(self.flush_time)

    def __init__(self, path, size=None, rows=None, flush_time=10,
                 *args, **kwargs):
        self._opened = False
        self._index = None
        self._writer = None
        self.path = os.path.join(DEFAULT_INDEX_PATH, path)
        self.flush_time = flush_time
        self.flush_thread = threading.Thread(target=self._timer)
        self.open()
        self.count = 0

    def flush(self):
        if getattr(self, "callback_flush", None):
            self.callback_flush(self)
        if self._writer is not None:
            self._writer.commit()
        self.count = 0

    def is_indexed(self, hash):
        return self._searcher.search(
            self.query_hash.parse(unicode(hash))).estimated_length() > 0

    def __call__(self, pipeline):
        self._writer = BufferedWriter(self._indexer, period=10, limit=1000)
        try:
            self.flush_thread.start()
            for event in pipeline:
                self.count += 1
                self._writer.add_document(source=unicode(event["source"]),
                                          name=unicode(event["index"]),
                                          raw=unicode(event["_raw"]),
                                          time=int(event.time),
                                          hash=unicode(event.hash))
        finally:
            self.flush()

    def search(self, expr, limit=10000):
        with self._index.searcher() as searcher:
            query = QueryParser("raw", self._index.schema)
            query.add_plugin(FieldsPlugin())
            query.add_plugin(RangePlugin())
            query.add_plugin(GtLtPlugin())
            query.add_plugin(WildcardPlugin())
            query = query.parse(expr)
            for x in searcher.search(query, limit=limit):
                yield x

    def __iter__(self):
        for x in self.search(u"*", None):
            yield x["raw"]


class Index(object):
    '''
    Class to manage index readers and writers.
    '''
    _source = None
    _target = {}
    _source_writer = None
    _target_writer = {}

    def source(self):
        '''
        Returns source index.
        '''
        if self._source is None:
            try:
                self._source = open_dir(
                    appsettings.WHOOSH_INDEX,
                    indexname='source'
                )
            except whoosh.index.EmptyIndexError:
                self._source = create_source_index()
            except IOError:
                # eg. path does not exist
                self._source = create_source_index()
        return self._source

    def target(self, lang):
        '''
        Returns target index for given language.
        '''
        if lang not in self._target:
            try:
                self._target[lang] = open_dir(
                    appsettings.WHOOSH_INDEX,
                    indexname='target-%s' % lang
                )
            except whoosh.index.EmptyIndexError:
                self._target[lang] = create_target_index(lang)
        return self._target[lang]

    def source_writer(self, buffered=True):
        '''
        Returns source index writer (by default buffered).
        '''
        if not buffered:
            return self.source().writer()
        if self._source_writer is None:
            self._source_writer = BufferedWriter(self.source())
        return self._source_writer

    def target_writer(self, lang, buffered=True):
        '''
        Returns target index writer (by default buffered) for given language.
        '''
        if not buffered:
            return self.target(lang).writer()
        if lang not in self._target_writer:
            self._target_writer[lang] = BufferedWriter(self.target(lang))
        return self._target_writer[lang]

    def source_searcher(self, buffered=True):
        '''
        Returns source index searcher (on buffered writer).
        '''
        if not buffered:
            return self.source().searcher()
        return self.source_writer(buffered).searcher()

    def target_searcher(self, lang, buffered=True):
        '''
        Returns target index searcher (on buffered writer) for given language.
        '''
        if not buffered:
            return self.target(lang).searcher()
        return self.target_writer(lang, buffered).searcher()

    def commit(self):
        '''
        Commits pending changes.
        '''
        self._source_writer.commit()
        for lang in self._target_writer:
            self._target_writer[lang].commit()


def test_classify(threshold=None, niter=100, limitmb=1024, size=100,
                  verbosity=0):
    if threshold is None:
        threshold = settings.SPAM_THRESHOLD

    # Add posts to test spam index, then
    spam = Post.objects.filter(Q(spam=Post.SPAM) | Q(status=Post.DELETED))

    # Get the valid posts and shuffle.
    ham = Post.objects.valid_posts(author__profile__score__lte=0,
                                   type__in=[Post.ANSWER, Post.COMMENT])

    # Get list of id's for both
    spam = list(spam.values_list("id", flat=True))
    ham = list(ham.values_list("id", flat=True))

    # tp = Identify spam correctly.
    # tn = Identify valid post correctly.
    # fn = Missed to identify a spam.
    # fp = Mis-identified valid post as spam.
    tp, tn, fn, fp = 0, 0, 0, 0
    seen_ham, seen_spam = 0, 0

    elapsed, progress = util.timer_func()

    for i in range(niter):
        # Remove previous index
        if os.path.exists(TRAIN_DIR):
            shutil.rmtree(TRAIN_DIR)

        ix = search.init_index(
            dirname=TRAIN_DIR,
            indexname=f"train_{util.get_uuid(8)}_{settings.SPAM_INDEX_NAME}",
            schema=spam_schema())
        writer = BufferedWriter(ix, limit=int((niter / 2) + 1),
                                writerargs=dict(limitmb=limitmb,
                                                multisegment=True))

        index_writer(writer=writer, title="Placeholder",
                     content_length=0, is_spam=True,
                     content='CONTENT', uid=STARTER_UID)

        # Take one spam post out of training set.
        one_out = one_out_train(spam=spam, writer=writer, size=size, ham=ham)

        writer.commit()
        writer.close()

        post_score = compute_score(post=one_out, ix=ix)
        predicted_spam = post_score >= threshold

        is_spam = one_out.is_spam or one_out.is_deleted
        is_ham = not is_spam

        seen_spam += 1 if is_spam else 0
        seen_ham += 1 if is_ham else 0

        detail(is_spam=is_spam, predict=predicted_spam, post=one_out,
               verb=verbosity, post_score=post_score)

        if predicted_spam:
            tp += 1 if is_spam else 0
            fp += 1 if is_ham else 0
        else:
            fn += 1 if is_spam else 0
            tn += 1 if is_ham else 0

        progress(i, step=5, msg=f"iterations. tp={tp} fp={fp} tn={tn} fn={fn}")

    train_spam = sizer(spam, size)
    train_ham = sizer(ham, size)

    print(f"... {train_ham + train_spam}\tSize of index ( per iteration )")
    print(f"... \t{train_spam}\tSPAM")
    print(f"... \t{train_ham}\tHAM")
    print(f"\n... {niter}\tNumber of iterations")

    report(nham=seen_ham, nspam=seen_spam, tn=tn, tp=tp, fp=fp, fn=fn)

    return


    key = f'{eid}:{locale}:tags'
    for tag in tags['values']:
        storage.lpush(key, tag)


if __name__ == '__main__':
    print('-' * 30)
    print('Muzeeglot data ingestion')
    print('-' * 30)
    if exists(configuration.INGESTION_LOCK):
        print('WARN: ingestion lock detected, pass')
    else:
        print('INFO: evaluate tags corpus')
        tags_corpus = get_tags_corpus()
        print('INFO: create search index')
        if not exists(configuration.INDEX):
            makedirs(configuration.INDEX)
        schema = Schema(ngram=NGRAMWORDS(), name=STORED(), eid=STORED())
        index = create_in(configuration.INDEX, schema)
        writer = BufferedWriter(index, period=60, limit=200)
        ingest_languages(writer)
        ingest_tags(tags_corpus)
        ingest_entities(tags_corpus, writer)
        print('INFO: optimize and close index')
        writer.close()
        index.optimize()
        index.close()
        print('INFO: write ingestion lock')
        with open(configuration.INGESTION_LOCK, 'w') as stream:
            stream.write('ingested')


def update(self, note):
    writer = BufferedWriter(self.index, period=10, limit=10)
    writer.update_document(note_id=note.id,
                           notebook_id=note.notebook_id,
                           title=note.title,
                           snippet=note.snippet)
    writer.close()


    md = pd['metadata']
    directories = [x['path'] for x in md['contents'] if x['is_dir']]
    files = [x for x in md['contents'] if not x['is_dir']]
    for f in files:
        file_path = f['path']
        dir_part, ext = os.path.splitext(file_path.lower())
        if ext in ignore_extensions:
            print 'Ignoring file', file_path
            continue
        index_file(f, file_path, os.path.basename(file_path))
    for d in directories:
        index_path(d)


writer = BufferedWriter(ix)


def index_file(file_md, f, title):
    if not fnmatch.fnmatch(f, '*.*'):
        return
    indexed = False
    last_modified = None
    modified = None
    indexed_data = ds.get_document(id, f)
    if indexed_data:
        last_modified = indexed_data.get('modified')
    try:
        modified = parse_date(file_md['modified'])
        stale = (not last_modified) or (
            (modified - last_modified).total_seconds() > 0
        )


def delete(self, note_id):
    writer = BufferedWriter(self.index)
    writer.delete_by_term('note_id', note_id)
    writer.close()


def delete_search_units(source_units, languages):
    '''
    Delete fulltext index for given set of units.
    '''
    # Update source index
    if source_units:
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for pk in source_units:
                writer.delete_by_term('pk', pk)
        finally:
            writer.close()

    for lang, units in languages.items():
        if units:
            index = get_target_index(lang)
            writer = BufferedWriter(index)
            try:
                for pk in units:
                    writer.delete_by_term('pk', pk)
            finally:
                writer.close()


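# A minimal, self-contained sketch of the BufferedWriter pattern shared by the
# snippets above, using only the public Whoosh API. The schema, field names and
# sample documents are illustrative assumptions, not taken from any of the
# projects quoted above.
import tempfile

from whoosh import index
from whoosh.fields import ID, TEXT, Schema
from whoosh.qparser import QueryParser
from whoosh.writing import BufferedWriter

schema = Schema(pk=ID(stored=True, unique=True), text=TEXT(stored=True))
ix = index.create_in(tempfile.mkdtemp(), schema)

# BufferedWriter batches changes in memory and commits them every `limit`
# documents or `period` seconds, so documents can be added one at a time
# without paying a commit per call.
writer = BufferedWriter(ix, period=30, limit=50)
try:
    writer.add_document(pk=u"1", text=u"hello buffered world")
    writer.update_document(pk=u"1", text=u"hello again")  # replaces via unique pk
    writer.delete_by_term("pk", u"2")                     # no-op if absent

    # Searching through the writer also sees buffered, not-yet-committed docs.
    with writer.searcher() as searcher:
        hits = searcher.search(QueryParser("text", schema).parse(u"hello"))
        print([hit["pk"] for hit in hits])
finally:
    # close() commits any remaining buffered documents and releases the writer.
    writer.close()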