def handle(self, *args, **options):

        LIMIT = None
        SLICE_SIZE = 500
        solr_sound_ids = []
        solr = Solr(url=settings.SOLR_URL)
        query = SolrQuery()
        query.set_dismax_query("")  # Query to get ALL sounds

        print "Retrieving ids from %i to %i" % (0, SLICE_SIZE)
        query.set_query_options(field_list=["id"], rows=SLICE_SIZE, start=0)
        results = SolrResponseInterpreter(solr.select(unicode(query)))
        solr_sound_ids += list_of_dicts_to_list_of_ids(results.docs)
        total_num_documents = results.num_found

        # Start iterating over other pages (slices)
        if LIMIT:
            number_of_documents = min(LIMIT, total_num_documents)
        else:
            number_of_documents = total_num_documents

        for i in range(SLICE_SIZE, number_of_documents, SLICE_SIZE):
            print "Retrieving ids from %i to %i" % (i, i + SLICE_SIZE)
            query.set_query_options(field_list=["id"],
                                    rows=SLICE_SIZE,
                                    start=i)
            results = SolrResponseInterpreter(solr.select(unicode(query)))
            solr_sound_ids += list_of_dicts_to_list_of_ids(results.docs)

        solr_sound_ids = sorted(list(set(solr_sound_ids)))
        if LIMIT:
            solr_sound_ids = solr_sound_ids[0:LIMIT]
        print "%i document ids retrieved" % len(solr_sound_ids)
        n_deleted = 0
        print ""
        for count, id in enumerate(solr_sound_ids):
            sys.stdout.write("\rChecking doc %i of %i" %
                             (count, len(solr_sound_ids)))
            sys.stdout.flush()

            if Sound.objects.filter(id=id,
                                    moderation_state="OK",
                                    processing_state="OK").exists():
                pass
            else:
                # Sound does not exist in the Db or is not properly moderated and processed
                print "\n\t - Deleting sound with id %i from solr index" % id
                solr.delete_by_id(id)
                n_deleted += 1

        print "\n\nDONE! %i sounds deleted from solr index (it may take some minutes to actually see the changes in the page)" % n_deleted
    def handle(self, *args, **options):
        """Delete from the Solr forum index every id without a moderated DB post.

        All document ids are read from the index at ``settings.SOLR_FORUM_URL``
        in pages of ``SLICE_SIZE``. For each id, ``solr.delete_by_id`` is
        called unless a ``Post`` row exists with that id and
        ``moderation_state="OK"``. Progress is reported through
        ``console_logger``.

        ``*args`` and ``**options`` are accepted but not used.
        """
        LIMIT = None  # Optional cap on the number of ids to check (None = no cap)
        SLICE_SIZE = 500  # Number of ids requested from Solr per page
        solr = Solr(url=settings.SOLR_FORUM_URL)
        query = SolrQuery()
        query.set_dismax_query("")  # Query to get ALL forums

        def fetch_slice(start):
            # Single place that logs and requests a page, so that every page
            # (including the first) reports the same inclusive id range.
            console_logger.info("Retrieving ids from %i to %i" % (start, start + SLICE_SIZE - 1))
            query.set_query_options(field_list=["id"], rows=SLICE_SIZE, start=start)
            return SolrResponseInterpreter(solr.select(unicode(query)))

        solr_post_ids = []
        results = fetch_slice(0)
        solr_post_ids += list_of_dicts_to_list_of_ids(results.docs)
        total_num_documents = results.num_found

        # Start iterating over other pages (slices)
        if LIMIT:
            number_of_documents = min(LIMIT, total_num_documents)
        else:
            number_of_documents = total_num_documents

        for start in range(SLICE_SIZE, number_of_documents, SLICE_SIZE):
            results = fetch_slice(start)
            solr_post_ids += list_of_dicts_to_list_of_ids(results.docs)

        # De-duplicate and order the ids before applying the optional cap.
        solr_post_ids = sorted(set(solr_post_ids))
        if LIMIT:
            solr_post_ids = solr_post_ids[:LIMIT]
        n_ids = len(solr_post_ids)
        console_logger.info("%i document ids retrieved" % n_ids)
        n_deleted = 0
        console_logger.info("")
        for count, post_id in enumerate(solr_post_ids):
            # Only report every 100th document to keep the log short.
            if count % 100 == 0:
                console_logger.info("\rChecking docs %i/%i" % (count, n_ids))

            if Post.objects.filter(id=post_id, moderation_state="OK").exists():
                continue

            # Post does not exist in the Db or its moderation_state is not "OK"
            console_logger.info("\n\t - Deleting forum with id %i from solr index" % post_id)
            solr.delete_by_id(post_id)
            n_deleted += 1

        console_logger.info("\n\nDONE! %i forums deleted from solr index (it may take some minutes to actually see "
                            "the changes in the page)" % n_deleted)
    def handle(self, *args, **options):

        LIMIT = None
        SLICE_SIZE = 500
        solr_sound_ids = []
        solr = Solr(url=settings.SOLR_URL)
        query = SolrQuery()
        query.set_dismax_query("") # Query to get ALL sounds

        print "Retrieving ids from %i to %i"%(0,SLICE_SIZE)
        query.set_query_options(field_list=["id"], rows = SLICE_SIZE, start = 0)
        results = SolrResponseInterpreter(solr.select(unicode(query)))
        solr_sound_ids += list_of_dicts_to_list_of_ids(results.docs)
        total_num_documents = results.num_found

        # Start iterating over other pages (slices)
        if LIMIT:
            number_of_documents = min(LIMIT,total_num_documents)
        else:
            number_of_documents = total_num_documents

        for i in range(SLICE_SIZE,number_of_documents,SLICE_SIZE):
            print "Retrieving ids from %i to %i"%(i,i+SLICE_SIZE)
            query.set_query_options(field_list=["id"], rows = SLICE_SIZE, start = i)
            results = SolrResponseInterpreter(solr.select(unicode(query)))
            solr_sound_ids += list_of_dicts_to_list_of_ids(results.docs)

        solr_sound_ids = sorted(list(set(solr_sound_ids)))
        if LIMIT:
            solr_sound_ids = solr_sound_ids[0:LIMIT]
        print "%i document ids retrieved"%len(solr_sound_ids)
        n_deleted = 0
        print ""
        for count,id in enumerate(solr_sound_ids):
            sys.stdout.write("\rChecking doc %i of %i"%(count,len(solr_sound_ids)))
            sys.stdout.flush()

            if Sound.objects.filter(id=id,moderation_state="OK",processing_state="OK").exists():
                pass
            else:
                # Sound does not exist in the Db or is not properly moderated and processed
                print "\n\t - Deleting sound with id %i from solr index"%id
                solr.delete_by_id(id)
                n_deleted += 1

        print "\n\nDONE! %i sounds deleted from solr index (it may take some minutes to actually see the changes in the page)"%n_deleted
    def handle(self,  *args, **options):

        # init
        solr = Solr(settings.SOLR_URL)

        # Get all solr ids
        print "Getting solr ids...",
        solr_ids = get_all_sound_ids_from_solr()
        print "done!"

        # Get ell gaia ids
        print "Getting gaia ids...",
        gaia_ids = Similarity.get_all_sound_ids()
        print "done!"

        print "Getting freesound db data..."
        # Get all moderated and processed sound ids
        queryset = Sound.objects.filter(processing_state='OK', moderation_state='OK').order_by('id').only("id")
        fs_mp = [sound.id for sound in queryset]
        # Get ell moderated, processed and analysed sounds
        queryset = Sound.objects.filter(processing_state='OK', moderation_state='OK', analysis_state='OK').order_by('id').only("id")
        fs_mpa = [sound.id for sound in queryset]
        print "done!"

        print "\nNumber of sounds per index:\n--------------------------"
        print "Solr index\t\t%i" % len(solr_ids)
        print "Gaia index\t\t%i" % len(gaia_ids)
        print "Freesound\t\t%i  (moderated and processed)" % len(fs_mp)
        print "Freesound\t\t%i  (moderated, processed and analyzed)" % len(fs_mpa)

        print "\n\n***************\nSOLR INDEX\n***************\n"
        in_solr_not_in_fs = list(set(solr_ids).intersection(set(set(solr_ids).difference(fs_mp))))
        in_fs_not_in_solr = list(set(fs_mp).intersection(set(set(fs_mp).difference(solr_ids))))
        print "Sounds in solr but not in fs:\t%i" % len(in_solr_not_in_fs)
        print "Sounds in fs but not in solr:\t%i" % len(in_fs_not_in_solr)

        if not options['no-changes']:
            # Mark fs sounds to go processing
            if in_fs_not_in_solr:
                print "Changing is_index_dirty_state of sounds that require it"
                N = len(in_fs_not_in_solr)
                for count, sid in enumerate(in_fs_not_in_solr):
                    sys.stdout.write('\r\tChanging state of sound sound %i of %i         ' % (count+1, N))
                    sys.stdout.flush()
                    sound = Sound.objects.get(id=sid)
                    sound.set_single_field('is_index_dirty', True)

            # Delete sounds from solr that are not in the db
            if in_solr_not_in_fs:
                print "\nDeleting sounds that should not be in solr"
                N = len(in_solr_not_in_fs)
                for count, sid in enumerate(in_solr_not_in_fs):
                    sys.stdout.write('\r\tDeleting sound %i of %i         ' % (count+1, N))
                    sys.stdout.flush()
                    solr.delete_by_id(sid)

        print "\n***************\nGAIA INDEX\n***************\n"
        in_gaia_not_in_fs = list(set(gaia_ids).intersection(set(set(gaia_ids).difference(fs_mpa))))
        in_fs_not_in_gaia = list(set(fs_mpa).intersection(set(set(fs_mpa).difference(gaia_ids))))
        print "Sounds in gaia but not in fs:\t%i" % len(in_gaia_not_in_fs)
        print "Sounds in fs but not in gaia:\t%i  (only considering sounds correctly analyzed)" % len(in_fs_not_in_gaia)
        #Similarity.save()

        if not options['no-changes']:
            # Mark fs sounds to go processing
            if in_fs_not_in_gaia:
                print "Changing similarity_state of sounds that require it"
                N = len(in_fs_not_in_gaia)
                for count, sid in enumerate(in_fs_not_in_gaia):
                    sys.stdout.write('\r\tChanging state of sound %i of %i         ' % (count+1, N))
                    sys.stdout.flush()
                    sound = Sound.objects.get(id=sid)
                    sound.set_similarity_state('PE')

            # Delete sounds from gaia that are not in the db
            if in_gaia_not_in_fs:
                print "\nDeleting sounds that should not be in gaia"
                N = len(in_gaia_not_in_fs)
                for count, sid in enumerate(in_gaia_not_in_fs):
                    sys.stdout.write('\r\tDeleting sound %i of %i         ' % (count+1, N))
                    sys.stdout.flush()
                    Similarity.delete(sid)
Exemple #5
0
    def handle(self,  *args, **options):

        # init
        solr = Solr(settings.SOLR_URL)

        # Get all solr ids
        print "Getting solr ids...",
        solr_ids = get_all_sound_ids_from_solr()
        print "done!"

        # Get ell gaia ids
        print "Getting gaia ids...",
        gaia_ids = Similarity.get_all_sound_ids()
        print "done!"

        print "Getting freesound db data..."
        # Get all moderated and processed sound ids
        queryset = Sound.objects.filter(processing_state='OK', moderation_state='OK').order_by('id').only("id")
        fs_mp = [sound.id for sound in queryset]
        # Get ell moderated, processed and analysed sounds
        queryset = Sound.objects.filter(processing_state='OK', moderation_state='OK', analysis_state='OK').order_by('id').only("id")
        fs_mpa = [sound.id for sound in queryset]
        print "done!"

        print "\nNumber of sounds per index:\n--------------------------"
        print "Solr index\t\t%i" % len(solr_ids)
        print "Gaia index\t\t%i" % len(gaia_ids)
        print "Freesound\t\t%i  (moderated and processed)" % len(fs_mp)
        print "Freesound\t\t%i  (moderated, processed and analyzed)" % len(fs_mpa)

        print "\n\n***************\nSOLR INDEX\n***************\n"
        in_solr_not_in_fs = list(set(solr_ids).intersection(set(set(solr_ids).difference(fs_mp))))
        in_fs_not_in_solr = list(set(fs_mp).intersection(set(set(fs_mp).difference(solr_ids))))
        print "Sounds in solr but not in fs:\t%i" % len(in_solr_not_in_fs)
        print "Sounds in fs but not in solr:\t%i" % len(in_fs_not_in_solr)

        if not options['no-changes']:
            # Mark fs sounds to go processing
            if in_fs_not_in_solr:
                print "Changing is_index_dirty_state of sounds that require it"
                N = len(in_fs_not_in_solr)
                for count, sid in enumerate(in_fs_not_in_solr):
                    sys.stdout.write('\r\tChanging state of sound sound %i of %i         ' % (count+1, N))
                    sys.stdout.flush()
                    sound = Sound.objects.get(id=sid)
                    sound.set_single_field('is_index_dirty', True)

            # Delete sounds from solr that are not in the db
            if in_solr_not_in_fs:
                print "\nDeleting sounds that should not be in solr"
                N = len(in_solr_not_in_fs)
                for count, sid in enumerate(in_solr_not_in_fs):
                    sys.stdout.write('\r\tDeleting sound %i of %i         ' % (count+1, N))
                    sys.stdout.flush()
                    solr.delete_by_id(sid)

        print "\n***************\nGAIA INDEX\n***************\n"
        in_gaia_not_in_fs = list(set(gaia_ids).intersection(set(set(gaia_ids).difference(fs_mpa))))
        in_fs_not_in_gaia = list(set(fs_mpa).intersection(set(set(fs_mpa).difference(gaia_ids))))
        print "Sounds in gaia but not in fs:\t%i" % len(in_gaia_not_in_fs)
        print "Sounds in fs but not in gaia:\t%i  (only considering sounds correctly analyzed)" % len(in_fs_not_in_gaia)
        #Similarity.save()

        if not options['no-changes']:
            # Mark fs sounds to go processing
            if in_fs_not_in_gaia:
                print "Changing similarity_state of sounds that require it"
                N = len(in_fs_not_in_gaia)
                for count, sid in enumerate(in_fs_not_in_gaia):
                    sys.stdout.write('\r\tChanging state of sound %i of %i         ' % (count+1, N))
                    sys.stdout.flush()
                    sound = Sound.objects.get(id=sid)
                    sound.set_similarity_state('PE')

            # Delete sounds from gaia that are not in the db
            if in_gaia_not_in_fs:
                print "\nDeleting sounds that should not be in solr"
                N = len(in_gaia_not_in_fs)
                for count, sid in enumerate(in_gaia_not_in_fs):
                    sys.stdout.write('\r\tDeleting sound %i of %i         ' % (count+1, N))
                    sys.stdout.flush()
                    Similarity.delete(sid)