def handle(self, *args, **options):
    """Remove from the Solr sound index every document with no valid sound in the database.

    All document ids are read from the Solr index at ``settings.SOLR_URL`` in
    slices of ``SLICE_SIZE``. Each id is then looked up in the database, and
    the document is deleted from the index unless a ``Sound`` with that id
    exists whose ``moderation_state`` and ``processing_state`` are both
    ``"OK"``. Progress and a final summary are written to stdout.

    ``args`` and ``options`` are accepted but not read by this method.
    """
    LIMIT = None  # Optional cap on the number of ids to check (falsy means no cap)
    SLICE_SIZE = 500  # Number of ids requested from Solr per query

    solr = Solr(url=settings.SOLR_URL)
    query = SolrQuery()
    query.set_dismax_query("")  # Empty query: match ALL sounds

    def fetch_slice(start):
        """Return ``(ids, num_found)`` for the slice of documents starting at ``start``."""
        print("Retrieving ids from %i to %i" % (start, start + SLICE_SIZE))
        query.set_query_options(field_list=["id"], rows=SLICE_SIZE, start=start)
        results = SolrResponseInterpreter(solr.select(unicode(query)))
        return list_of_dicts_to_list_of_ids(results.docs), results.num_found

    # The first slice also reports how many documents the index holds in total
    solr_sound_ids = []
    first_ids, total_num_documents = fetch_slice(0)
    solr_sound_ids.extend(first_ids)

    if LIMIT:
        number_of_documents = min(LIMIT, total_num_documents)
    else:
        number_of_documents = total_num_documents

    # Remaining slices
    for start in range(SLICE_SIZE, number_of_documents, SLICE_SIZE):
        slice_ids, _ = fetch_slice(start)
        solr_sound_ids.extend(slice_ids)

    # Drop duplicates and process ids in ascending order
    solr_sound_ids = sorted(set(solr_sound_ids))
    if LIMIT:
        solr_sound_ids = solr_sound_ids[:LIMIT]
    num_ids = len(solr_sound_ids)
    print("%i document ids retrieved" % num_ids)

    n_deleted = 0
    print("")
    # Count from 1 so the progress line ends at "N of N" instead of "N-1 of N"
    for count, sound_id in enumerate(solr_sound_ids, 1):
        sys.stdout.write("\rChecking doc %i of %i" % (count, num_ids))
        sys.stdout.flush()
        if Sound.objects.filter(id=sound_id, moderation_state="OK", processing_state="OK").exists():
            continue
        # Sound does not exist in the db or is not properly moderated and processed
        print("\n\t - Deleting sound with id %i from solr index" % sound_id)
        solr.delete_by_id(sound_id)
        n_deleted += 1

    print("\n\nDONE! %i sounds deleted from solr index (it may take some minutes to actually see the changes in the page)" % n_deleted)
def handle(self, *args, **options):
    """Remove from the Solr forum index every document with no valid post in the database.

    All document ids are read from the Solr index at
    ``settings.SOLR_FORUM_URL`` in slices of ``SLICE_SIZE``. Each id is then
    looked up in the database, and the document is deleted from the index
    unless a ``Post`` with that id exists whose ``moderation_state`` is
    ``"OK"``. Progress and a final summary are sent to ``console_logger``.

    ``args`` and ``options`` are accepted but not read by this method.
    """
    LIMIT = None  # Optional cap on the number of ids to check (falsy means no cap)
    SLICE_SIZE = 500  # Number of ids requested from Solr per query

    solr = Solr(url=settings.SOLR_FORUM_URL)
    query = SolrQuery()
    query.set_dismax_query("")  # Empty query: match ALL forum documents

    def fetch_slice(start):
        """Return ``(ids, num_found)`` for the slice of documents starting at ``start``."""
        # The logged range is inclusive for every slice, including the first one
        console_logger.info("Retrieving ids from %i to %i" % (start, start + SLICE_SIZE - 1))
        query.set_query_options(field_list=["id"], rows=SLICE_SIZE, start=start)
        results = SolrResponseInterpreter(solr.select(unicode(query)))
        return list_of_dicts_to_list_of_ids(results.docs), results.num_found

    # The first slice also reports how many documents the index holds in total
    solr_post_ids = []
    first_ids, total_num_documents = fetch_slice(0)
    solr_post_ids.extend(first_ids)

    number_of_documents = min(LIMIT, total_num_documents) if LIMIT else total_num_documents

    # Remaining slices
    for start in range(SLICE_SIZE, number_of_documents, SLICE_SIZE):
        slice_ids, _ = fetch_slice(start)
        solr_post_ids.extend(slice_ids)

    # Drop duplicates and process ids in ascending order
    solr_post_ids = sorted(set(solr_post_ids))
    if LIMIT:
        solr_post_ids = solr_post_ids[:LIMIT]
    num_ids = len(solr_post_ids)
    console_logger.info("%i document ids retrieved" % num_ids)

    n_deleted = 0
    console_logger.info("")
    for count, post_id in enumerate(solr_post_ids):
        # Report progress once every 100 documents
        if count % 100 == 0:
            console_logger.info("\rChecking docs %i/%i" % (count, num_ids))
        if Post.objects.filter(id=post_id, moderation_state="OK").exists():
            continue
        # Post does not exist in the db or is not properly moderated
        console_logger.info("\n\t - Deleting forum with id %i from solr index" % post_id)
        solr.delete_by_id(post_id)
        n_deleted += 1

    console_logger.info("\n\nDONE! %i forums deleted from solr index (it may take some minutes to actually see "
                        "the changes in the page)" % n_deleted)
def handle(self, *args, **options):
    """Purge stale documents from the Solr sound index.

    Document ids are collected from the Solr index at ``settings.SOLR_URL``,
    ``SLICE_SIZE`` ids per request. A document is deleted from the index when
    the database has no ``Sound`` with that id whose ``moderation_state`` and
    ``processing_state`` are both ``"OK"``. Progress and the number of deleted
    documents are written to stdout.

    ``args`` and ``options`` are accepted but not read by this method.
    """
    LIMIT = None  # Optional cap on the number of ids to check (falsy means no cap)
    SLICE_SIZE = 500  # Number of ids requested from Solr per query

    solr = Solr(url=settings.SOLR_URL)
    query = SolrQuery()
    query.set_dismax_query("")  # Empty query: match ALL sounds

    def retrieve_ids(offset):
        """Query one slice beginning at ``offset``; return ``(ids, num_found)``."""
        print("Retrieving ids from %i to %i" % (offset, offset + SLICE_SIZE))
        query.set_query_options(field_list=["id"], rows=SLICE_SIZE, start=offset)
        response = SolrResponseInterpreter(solr.select(unicode(query)))
        return list_of_dicts_to_list_of_ids(response.docs), response.num_found

    # The first request also gives the total number of documents in the index
    solr_sound_ids = []
    page_ids, total_num_documents = retrieve_ids(0)
    solr_sound_ids.extend(page_ids)

    number_of_documents = total_num_documents
    if LIMIT:
        number_of_documents = min(LIMIT, total_num_documents)

    # Iterate over the remaining pages (slices)
    for offset in range(SLICE_SIZE, number_of_documents, SLICE_SIZE):
        page_ids, _ = retrieve_ids(offset)
        solr_sound_ids.extend(page_ids)

    # Unique ids in ascending order
    solr_sound_ids = sorted(set(solr_sound_ids))
    if LIMIT:
        solr_sound_ids = solr_sound_ids[:LIMIT]
    total_ids = len(solr_sound_ids)
    print("%i document ids retrieved" % total_ids)

    n_deleted = 0
    print("")
    # 1-based position so that the last progress line reads "N of N"
    for position, sound_id in enumerate(solr_sound_ids, 1):
        sys.stdout.write("\rChecking doc %i of %i" % (position, total_ids))
        sys.stdout.flush()
        sound_is_valid = Sound.objects.filter(
            id=sound_id, moderation_state="OK", processing_state="OK").exists()
        if not sound_is_valid:
            # Sound does not exist in the db or is not properly moderated and processed
            print("\n\t - Deleting sound with id %i from solr index" % sound_id)
            solr.delete_by_id(sound_id)
            n_deleted += 1

    print("\n\nDONE! %i sounds deleted from solr index (it may take some minutes to actually see the changes in the page)" % n_deleted)
def handle(self, *args, **options):
    """Report and reconcile differences between the database and the Solr and Gaia indexes.

    The ids held by Solr (``get_all_sound_ids_from_solr``) and by Gaia
    (``Similarity.get_all_sound_ids``) are compared with the ids of the
    database sounds that should be there:

    * Solr is compared with sounds whose ``processing_state`` and
      ``moderation_state`` are ``'OK'``.
    * Gaia is compared with sounds that additionally have ``analysis_state``
      equal to ``'OK'``.

    Counts are always printed. Unless ``options['no-changes']`` is truthy,
    sounds missing from Solr get ``is_index_dirty`` set to ``True``, sounds
    missing from Gaia get their similarity state set to ``'PE'``, and ids that
    are in an index but not in the corresponding database set are deleted
    from that index.
    """
    solr = Solr(settings.SOLR_URL)

    # Get all solr ids. The message has no trailing newline, so flush it to
    # make it visible before the retrieval starts.
    sys.stdout.write("Getting solr ids... ")
    sys.stdout.flush()
    solr_ids = get_all_sound_ids_from_solr()
    print("done!")

    # Get all gaia ids
    sys.stdout.write("Getting gaia ids... ")
    sys.stdout.flush()
    gaia_ids = Similarity.get_all_sound_ids()
    print("done!")

    print("Getting freesound db data...")
    # Get all moderated and processed sound ids (ids only, no model instances)
    fs_mp = list(
        Sound.objects.filter(processing_state='OK', moderation_state='OK')
        .order_by('id').values_list('id', flat=True))
    # Get all moderated, processed and analysed sound ids
    fs_mpa = list(
        Sound.objects.filter(processing_state='OK', moderation_state='OK', analysis_state='OK')
        .order_by('id').values_list('id', flat=True))
    print("done!")

    print("\nNumber of sounds per index:\n--------------------------")
    print("Solr index\t\t%i" % len(solr_ids))
    print("Gaia index\t\t%i" % len(gaia_ids))
    print("Freesound\t\t%i (moderated and processed)" % len(fs_mp))
    print("Freesound\t\t%i (moderated, processed and analyzed)" % len(fs_mpa))

    print("\n\n***************\nSOLR INDEX\n***************\n")

    # Plain set differences, sorted so that ids are processed in a stable order
    solr_id_set = set(solr_ids)
    fs_mp_set = set(fs_mp)
    in_solr_not_in_fs = sorted(solr_id_set - fs_mp_set)
    in_fs_not_in_solr = sorted(fs_mp_set - solr_id_set)
    print("Sounds in solr but not in fs:\t%i" % len(in_solr_not_in_fs))
    print("Sounds in fs but not in solr:\t%i" % len(in_fs_not_in_solr))

    if not options['no-changes']:
        # Mark fs sounds to go processing
        if in_fs_not_in_solr:
            print("Changing is_index_dirty_state of sounds that require it")
            N = len(in_fs_not_in_solr)
            for count, sid in enumerate(in_fs_not_in_solr, 1):
                sys.stdout.write('\r\tChanging state of sound %i of %i         ' % (count, N))
                sys.stdout.flush()
                sound = Sound.objects.get(id=sid)
                sound.set_single_field('is_index_dirty', True)

        # Delete sounds from solr that are not in the db
        if in_solr_not_in_fs:
            print("\nDeleting sounds that should not be in solr")
            N = len(in_solr_not_in_fs)
            for count, sid in enumerate(in_solr_not_in_fs, 1):
                sys.stdout.write('\r\tDeleting sound %i of %i         ' % (count, N))
                sys.stdout.flush()
                solr.delete_by_id(sid)

    print("\n***************\nGAIA INDEX\n***************\n")

    gaia_id_set = set(gaia_ids)
    fs_mpa_set = set(fs_mpa)
    in_gaia_not_in_fs = sorted(gaia_id_set - fs_mpa_set)
    in_fs_not_in_gaia = sorted(fs_mpa_set - gaia_id_set)
    print("Sounds in gaia but not in fs:\t%i" % len(in_gaia_not_in_fs))
    print("Sounds in fs but not in gaia:\t%i  (only considering sounds correctly analyzed)" % len(in_fs_not_in_gaia))

    if not options['no-changes']:
        # Mark fs sounds to go processing
        if in_fs_not_in_gaia:
            print("Changing similarity_state of sounds that require it")
            N = len(in_fs_not_in_gaia)
            for count, sid in enumerate(in_fs_not_in_gaia, 1):
                sys.stdout.write('\r\tChanging state of sound %i of %i         ' % (count, N))
                sys.stdout.flush()
                sound = Sound.objects.get(id=sid)
                sound.set_similarity_state('PE')

        # Delete sounds from gaia that are not in the db
        if in_gaia_not_in_fs:
            print("\nDeleting sounds that should not be in gaia")
            N = len(in_gaia_not_in_fs)
            for count, sid in enumerate(in_gaia_not_in_fs, 1):
                sys.stdout.write('\r\tDeleting sound %i of %i         ' % (count, N))
                sys.stdout.flush()
                Similarity.delete(sid)
def handle(self, *args, **options):
    """Check the Solr and Gaia indexes against the database and fix the mismatches.

    Ids are obtained from Solr (``get_all_sound_ids_from_solr``), from Gaia
    (``Similarity.get_all_sound_ids``) and from the database. Solr is compared
    with the sounds whose ``processing_state`` and ``moderation_state`` are
    ``'OK'``; Gaia is compared with the subset of those sounds whose
    ``analysis_state`` is also ``'OK'``.

    A summary of the counts is always printed. When ``options['no-changes']``
    is falsy the mismatches are also corrected:

    * sounds missing from Solr get ``is_index_dirty`` set to ``True``;
    * sounds missing from Gaia get their similarity state set to ``'PE'``;
    * ids present in an index but absent from the matching database set are
      deleted from that index.
    """
    solr = Solr(settings.SOLR_URL)

    # Get all solr ids. Flush explicitly: the message has no newline and would
    # otherwise not necessarily show up before the retrieval begins.
    sys.stdout.write("Getting solr ids... ")
    sys.stdout.flush()
    solr_ids = get_all_sound_ids_from_solr()
    print("done!")

    # Get all gaia ids
    sys.stdout.write("Getting gaia ids... ")
    sys.stdout.flush()
    gaia_ids = Similarity.get_all_sound_ids()
    print("done!")

    print("Getting freesound db data...")
    # Ids of all moderated and processed sounds (fetched as plain values)
    moderated_processed = Sound.objects.filter(processing_state='OK', moderation_state='OK')
    fs_mp = list(moderated_processed.order_by('id').values_list('id', flat=True))
    # Ids of all moderated, processed and analysed sounds
    moderated_processed_analysed = Sound.objects.filter(
        processing_state='OK', moderation_state='OK', analysis_state='OK')
    fs_mpa = list(moderated_processed_analysed.order_by('id').values_list('id', flat=True))
    print("done!")

    print("\nNumber of sounds per index:\n--------------------------")
    print("Solr index\t\t%i" % len(solr_ids))
    print("Gaia index\t\t%i" % len(gaia_ids))
    print("Freesound\t\t%i (moderated and processed)" % len(fs_mp))
    print("Freesound\t\t%i (moderated, processed and analyzed)" % len(fs_mpa))

    apply_changes = not options['no-changes']

    print("\n\n***************\nSOLR INDEX\n***************\n")

    # Set differences in both directions, sorted for a stable processing order
    solr_id_set, fs_mp_set = set(solr_ids), set(fs_mp)
    in_solr_not_in_fs = sorted(solr_id_set - fs_mp_set)
    in_fs_not_in_solr = sorted(fs_mp_set - solr_id_set)
    print("Sounds in solr but not in fs:\t%i" % len(in_solr_not_in_fs))
    print("Sounds in fs but not in solr:\t%i" % len(in_fs_not_in_solr))

    if apply_changes:
        # Mark fs sounds to go processing
        if in_fs_not_in_solr:
            print("Changing is_index_dirty_state of sounds that require it")
            N = len(in_fs_not_in_solr)
            for count, sid in enumerate(in_fs_not_in_solr, 1):
                sys.stdout.write('\r\tChanging state of sound %i of %i         ' % (count, N))
                sys.stdout.flush()
                sound = Sound.objects.get(id=sid)
                sound.set_single_field('is_index_dirty', True)

        # Delete sounds from solr that are not in the db
        if in_solr_not_in_fs:
            print("\nDeleting sounds that should not be in solr")
            N = len(in_solr_not_in_fs)
            for count, sid in enumerate(in_solr_not_in_fs, 1):
                sys.stdout.write('\r\tDeleting sound %i of %i         ' % (count, N))
                sys.stdout.flush()
                solr.delete_by_id(sid)

    print("\n***************\nGAIA INDEX\n***************\n")

    gaia_id_set, fs_mpa_set = set(gaia_ids), set(fs_mpa)
    in_gaia_not_in_fs = sorted(gaia_id_set - fs_mpa_set)
    in_fs_not_in_gaia = sorted(fs_mpa_set - gaia_id_set)
    print("Sounds in gaia but not in fs:\t%i" % len(in_gaia_not_in_fs))
    print("Sounds in fs but not in gaia:\t%i  (only considering sounds correctly analyzed)" % len(in_fs_not_in_gaia))

    if apply_changes:
        # Mark fs sounds to go processing
        if in_fs_not_in_gaia:
            print("Changing similarity_state of sounds that require it")
            N = len(in_fs_not_in_gaia)
            for count, sid in enumerate(in_fs_not_in_gaia, 1):
                sys.stdout.write('\r\tChanging state of sound %i of %i         ' % (count, N))
                sys.stdout.flush()
                sound = Sound.objects.get(id=sid)
                sound.set_similarity_state('PE')

        # Delete sounds from gaia that are not in the db
        if in_gaia_not_in_fs:
            # This section removes entries from gaia, so the message names gaia (not solr)
            print("\nDeleting sounds that should not be in gaia")
            N = len(in_gaia_not_in_fs)
            for count, sid in enumerate(in_gaia_not_in_fs, 1):
                sys.stdout.write('\r\tDeleting sound %i of %i         ' % (count, N))
                sys.stdout.flush()
                Similarity.delete(sid)