def update_index(units, source_units=None):
    '''
    Updates fulltext index for given set of units.
    '''
    languages = Language.objects.have_translation()

    # Default to same set for both updates
    if source_units is None:
        source_units = units

    # Update source index
    index = get_source_index()
    writer = BufferedWriter(index)
    try:
        for unit in source_units.iterator():
            update_source_unit_index(writer, unit)
    finally:
        writer.close()

    # Update per language indices
    for lang in languages:
        index = get_target_index(lang.code)
        writer = BufferedWriter(index)
        try:
            language_units = units.filter(
                translation__language=lang
            ).exclude(
                target=''
            )
            for unit in language_units.iterator():
                update_target_unit_index(writer, unit)
        finally:
            writer.close()
def update_index(units, source_units=None):
    '''
    Updates fulltext index for given set of units.
    '''
    languages = Language.objects.have_translation()

    # Default to same set for both updates
    if source_units is None:
        source_units = units

    # Update source index
    if source_units.exists():
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for unit in source_units.iterator():
                update_source_unit_index(writer, unit)
        finally:
            writer.close()

    # Update per language indices
    for lang in languages:
        language_units = units.filter(
            translation__language=lang
        ).exclude(
            target=''
        )
        if language_units.exists():
            index = get_target_index(lang.code)
            writer = BufferedWriter(index)
            try:
                for unit in language_units.iterator():
                    update_target_unit_index(writer, unit)
            finally:
                writer.close()
def update_index(units):
    """Update fulltext index for given set of units."""
    languages = Language.objects.have_translation()

    # Update source index
    if units.exists():
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for unit in units.iterator():
                update_source_unit_index(writer, unit)
        finally:
            writer.close()

    # Update per language indices
    for lang in languages:
        language_units = units.filter(
            translation__language=lang
        ).exclude(
            target=''
        )
        if language_units.exists():
            index = get_target_index(lang.code)
            writer = BufferedWriter(index)
            try:
                for unit in language_units.iterator():
                    update_target_unit_index(writer, unit)
            finally:
                writer.close()
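# The update_index() variants above all follow the same core pattern.
# A minimal standalone sketch of that pattern (the index directory and
# field names here are assumptions for illustration, not taken from the
# project): BufferedWriter batches documents in memory and flushes them
# once `limit` documents accumulate or `period` seconds elapse, so the
# close() in a finally block is what commits the last partial batch.
from whoosh.index import open_dir
from whoosh.writing import BufferedWriter

ix = open_dir('fulltext-index')  # hypothetical existing index directory
writer = BufferedWriter(ix, period=120, limit=1000)
try:
    # update_document() replaces any document with the same unique key
    # (assuming 'pk' is declared unique in the schema).
    writer.update_document(pk=u'42', source=u'Hello, world!')
finally:
    writer.close()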
def handle(self, *args, **options):
    # Optionally rebuild indices from scratch
    if options['clean']:
        create_source_index()
        for lang in Language.objects.have_translation():
            create_target_index(lang=lang.code)

    # Open writers
    source_writer = BufferedWriter(get_source_index())
    target_writers = {}

    try:
        # Process all units
        for unit in self.iterate_units(*args, **options):
            lang = unit.translation.language.code

            # Lazily open per-language writer
            if lang not in target_writers:
                target_writers[lang] = BufferedWriter(
                    get_target_index(lang)
                )

            # Update target index
            if unit.translation:
                update_target_unit_index(target_writers[lang], unit)

            # Update source index
            update_source_unit_index(source_writer, unit)
    finally:
        # Close all writers
        source_writer.close()
        for lang in target_writers:
            target_writers[lang].close()
def main():
    queue = HotQueue(main_config.INDEX_QUEUE,
                     host=main_config.REDIS_HOST,
                     port=main_config.REDIS_PORT)
    index = get_index(main_config.WHOOSH_INDEX_DIR)
    writer = BufferedWriter(index, limit=10)
    try:
        # Block on the Redis queue and index each document as it arrives.
        for doc_id in queue.consume():
            print("looking at {}".format(doc_id))
            doc = Document.query.get(doc_id)
            if doc:
                write_doc(doc, writer)
            else:
                print("no doc with doc_id {}".format(doc_id))
    finally:
        writer.close()
def bootstrap_index(dirname=None, indexname=None):
    """Create spam index and add one placeholder post."""
    if dirname and indexname:
        ix = search.init_index(dirname=dirname, indexname=indexname,
                               schema=spam_schema())
    else:
        ix = init_spam_index()

    writer = BufferedWriter(ix)

    # Write text to index.
    index_writer(writer=writer, title="Placeholder",
                 content_length=0, is_spam=True,
                 content='CONTENT', uid=STARTER_UID)

    writer.commit()
    writer.close()

    return ix
def test_20000_buffered():
    # xrange, text_type and u come from whoosh.compat.
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.close()
        print("Write buffered:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize buffered:", now() - t)
def delete_search_units(source_units, languages):
    '''
    Delete fulltext index for given set of units.
    '''
    # Delete from source index
    if source_units:
        index = get_source_index()
        writer = BufferedWriter(index)
        try:
            for pk in source_units:
                writer.delete_by_term('pk', pk)
        finally:
            writer.close()

    # Delete from per-language indices
    for lang, units in languages.items():
        index = get_target_index(lang)
        writer = BufferedWriter(index)
        try:
            for pk in units:
                writer.delete_by_term('pk', pk)
        finally:
            writer.close()
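# For delete_by_term('pk', pk) above to match anything, the schema
# presumably declares 'pk' as an indexed unique ID field. A hypothetical
# schema consistent with those deletes (field names are assumptions,
# not the project's actual schema definition):
from whoosh.fields import Schema, ID, TEXT

def make_target_schema():
    return Schema(
        pk=ID(stored=True, unique=True),  # unique key used by delete_by_term
        target=TEXT(),                    # indexed translation text
    )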
def test_classify(threshold=None, niter=100, limitmb=1024, size=100, verbosity=0):
    if threshold is None:
        threshold = settings.SPAM_THRESHOLD

    # Get spam posts: anything flagged as spam or deleted.
    spam = Post.objects.filter(Q(spam=Post.SPAM) | Q(status=Post.DELETED))

    # Get the valid posts and shuffle.
    ham = Post.objects.valid_posts(author__profile__score__lte=0,
                                   type__in=[Post.ANSWER, Post.COMMENT])

    # Get list of ids for both
    spam = list(spam.values_list("id", flat=True))
    ham = list(ham.values_list("id", flat=True))

    # tp = Identified spam correctly.
    # tn = Identified valid post correctly.
    # fn = Missed a spam post.
    # fp = Misidentified a valid post as spam.
    tp, tn, fn, fp = 0, 0, 0, 0
    seen_ham, seen_spam = 0, 0

    elapsed, progress = util.timer_func()

    for i in range(niter):
        # Remove previous index
        if os.path.exists(TRAIN_DIR):
            shutil.rmtree(TRAIN_DIR)

        ix = search.init_index(
            dirname=TRAIN_DIR,
            indexname=f"train_{util.get_uuid(8)}_{settings.SPAM_INDEX_NAME}",
            schema=spam_schema())

        writer = BufferedWriter(ix, limit=int((niter / 2) + 1),
                                writerargs=dict(limitmb=limitmb,
                                                multisegment=True))

        index_writer(writer=writer, title="Placeholder",
                     content_length=0, is_spam=True,
                     content='CONTENT', uid=STARTER_UID)

        # Take one spam post out of the training set.
        one_out = one_out_train(spam=spam, writer=writer, size=size, ham=ham)

        writer.commit()
        writer.close()

        post_score = compute_score(post=one_out, ix=ix)

        predicted_spam = post_score >= threshold
        is_spam = one_out.is_spam or one_out.is_deleted
        is_ham = not is_spam

        seen_spam += 1 if is_spam else 0
        seen_ham += 1 if is_ham else 0

        detail(is_spam=is_spam, predict=predicted_spam, post=one_out,
               verb=verbosity, post_score=post_score)

        if predicted_spam:
            tp += 1 if is_spam else 0
            fp += 1 if is_ham else 0
        else:
            fn += 1 if is_spam else 0
            tn += 1 if is_ham else 0

        progress(i, step=5, msg=f"iterations. tp={tp} fp={fp} tn={tn} fn={fn}")

    train_spam = sizer(spam, size)
    train_ham = sizer(ham, size)

    print(f"... {train_ham + train_spam}\tSize of index ( per iteration )")
    print(f"... \t{train_spam}\tSPAM")
    print(f"... \t{train_ham}\tHAM")
    print(f"\n... {niter}\tNumber of iterations")

    report(nham=seen_ham, nspam=seen_spam, tn=tn, tp=tp, fp=fp, fn=fn)
    return
            if not type(cd) is unicode:
                cd = reader.unicodify(cd)
            writer.update_document(title=unicode(title), path=unicode(f),
                                   filename=unicode(title), content=cd)
            print 'commit'
            print 'closed'
            indexed = True
            ds.save_document(id, f, indexed=indexed, modified=modified,
                             metadata=file_md)
        except Exception, e:
            print 'skipping', e
            return ds.save_document(id, f, indexed=indexed,
                                    modified=modified, error=e.message)

    index_path('/')
    writer.close()
    print 'committing'
    ds.update_indexed_time(id)


def _index_users():
    processes = {}
    while True:
        linked_users = database.DataStore().linked_users()
        for user in linked_users:
            indexed_time = user.get('indexed_time')
            if indexed_time and (
                    (datetime.datetime.utcnow() - indexed_time).total_seconds()
                    < INDEX_INTERVAL):
                continue
            if user['_id'] in processes:
                continue
def delete(self, note_id):
    writer = BufferedWriter(self.index)
    writer.delete_by_term('note_id', note_id)
    writer.close()
def update(self, note):
    writer = BufferedWriter(self.index, period=10, limit=10)
    writer.update_document(note_id=note.id,
                           notebook_id=note.notebook_id,
                           title=note.title,
                           snippet=note.snippet)
    writer.close()
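# A hypothetical companion method, not part of the original class,
# sketching how the index maintained by delete()/update() above might
# be searched; it assumes 'note_id' is a stored field and 'title' is a
# TEXT field in self.index's schema.
from whoosh.qparser import QueryParser

def search(self, text):
    with self.index.searcher() as searcher:
        query = QueryParser('title', schema=self.index.schema).parse(text)
        return [hit['note_id'] for hit in searcher.search(query)]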
        key = f'{eid}:{locale}:tags'
        for tag in tags['values']:
            storage.lpush(key, tag)


if __name__ == '__main__':
    print('-' * 30)
    print('Muzeeglot data ingestion')
    print('-' * 30)
    if exists(configuration.INGESTION_LOCK):
        print('WARN: ingestion lock detected, pass')
    else:
        print('INFO: evaluate tags corpus')
        tags_corpus = get_tags_corpus()
        print('INFO: create search index')
        if not exists(configuration.INDEX):
            makedirs(configuration.INDEX)
        schema = Schema(ngram=NGRAMWORDS(), name=STORED(), eid=STORED())
        index = create_in(configuration.INDEX, schema)
        writer = BufferedWriter(index, period=60, limit=200)
        ingest_languages(writer)
        ingest_tags(tags_corpus)
        ingest_entities(tags_corpus, writer)
        print('INFO: optimize and close index')
        writer.close()
        index.optimize()
        index.close()
        print('INFO: write ingestion lock')
        with open(configuration.INGESTION_LOCK, 'w') as stream:
            stream.write('ingested')
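# A hedged sketch of querying the ngram index built above; the index
# location and field names come from the ingestion script, but the
# suggest() helper itself is hypothetical. NGRAMWORDS indexes lowercase
# n-grams (2-4 characters by default), so a Term query on a short
# fragment is enough for autocomplete-style lookups.
from whoosh.index import open_dir
from whoosh.query import Term

def suggest(fragment, limit=10):
    ix = open_dir(configuration.INDEX)
    with ix.searcher() as searcher:
        results = searcher.search(Term('ngram', fragment.lower()), limit=limit)
        # 'name' and 'eid' are STORED() in the schema above.
        return [(hit['name'], hit['eid']) for hit in results]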