def pruneSolr_factory(job_package): # set new task_name, for the worker below job_package['custom_task_name'] = 'pruneSolr_worker' # get solr results obj solr_total = solr_handle.search(q='*:*', fl='id').total_results # set estimated tasks print "Antipcating",solr_total,"tasks...." redisHandles.r_job_handle.set("job_%s_est_count" % (job_package['job_num']), solr_total) # iterate through solr objects # variables start = 0 rows = 100 step = 1 while start < solr_total: # perform search solr_result = solr_handle.search(q='*:*', fl='id', rows=rows, start=start) # iterate for doc in solr_result.documents: doc_id = doc['id'] print "pruneSolr checking %s" % (doc_id) job_package['doc_id'] = doc_id # fire task via custom_loop_taskWrapper result = actions.actions.custom_loop_taskWrapper.apply_async(kwargs={'job_package':job_package}, queue=job_package['username'] ) task_id = result.id # Set handle in Redis redisHandles.r_job_handle.set("%s" % (task_id), "FIRED,%s" % (doc_id)) # update incrementer for total assigned jobs.jobUpdateAssignedCount(job_package['job_num']) # bump step step += 1 # bump start start += rows
def pruneSolr_worker(job_package): print "pruning Solr of objects not found in Fedora" # variables count = 0 pruned = [] start = 0 rows = 100 # get solr results obj solr_total = solr_handle.search(q='*:*', fl='id').total_results # iterate through while start < solr_total: # perform search solr_result = solr_handle.search(q='*:*', fl='id', rows=rows, start=start) # iterate for doc in solr_result.documents: doc_id = doc['id'] print "pruneSolr checking %s, %i / %i" % (doc_id, count, solr_total) if not fedora_handle.get_object(doc_id).exists: print "Did not find object in Fedora, pruning from Solr..." pruned.append(doc_id) solr_handle.delete_by_key(doc_id) # bump counter count+=1 # bump start start += rows # return JSON report return json.dumps(pruned)
def index():
    """Render the object-index form with a freshly built collection dropdown.

    Queries Solr for every object carrying a Collection content model and
    uses the results to populate the form's collection_object choices,
    prefixed with an "All Collections" catch-all entry.

    Returns:
        Rendered "createObjectIndex_index.html" template.
    """
    # bind the search form to the submitted values (if any)
    form = solrSearch(request.form)

    # pull every collection object so the dropdown can be populated
    results = solr_handle.search(q="rels_hasContentModel:*Collection", fl=["id","dc_title"], rows=1000)

    # catch-all entry first, then one (pid, title) pair per collection
    choices = [("","All Collections")]
    for doc in results.documents:
        pid = doc['id'].encode('ascii','ignore')
        title = doc['dc_title'][0].encode('ascii','ignore')
        choices.append((pid, title))
    form.collection_object.choices = choices

    return render_template("createObjectIndex_index.html",form=form)
def __init__(self,id):
    """Load the current Solr document for *id*, if one exists.

    Queries Solr for the document whose id matches, and hydrates
    self.doc / self.version / self.exists accordingly.  When no document
    is found, an empty SolrFields stub is created with only the id set.

    Args:
        id: object PID, used verbatim as the Solr document id.
            (Name shadows the builtin but is kept for caller compatibility.)
    """
    self.id = id
    # escape the PID's colon for use inside a Solr query string
    # (explicit double backslash: "\:" relied on an invalid escape sequence)
    self.escaped_id = self.id.replace(":","\\:")

    # get stateful, current Solr doc
    query_params = {
        "q":'id:%s' % (self.escaped_id),
        "rows":1
    }
    response = solr_handle.search(**query_params)

    if len(response.documents) > 0:
        self.doc = self.SolrFields(**response.documents[0])
        # store version separately, then strip it from the working doc so
        # it is not re-submitted as a field
        self.version = self.doc._version_
        del self.doc._version_
        # finally, set exists to True
        self.exists = True
    else:
        self.doc = self.SolrFields()
        self.doc.id = self.id  # automatically set ID as PID
        self.exists = False
def createObjectIndex_worker(collection_PID_suffix): print "Operating on:",collection_PID_suffix # build query query = { "q" : "*%s" % (collection_PID_suffix), "rows" : 100, "start" : 0, "sort" : "id asc" } # get collection length total_results = solr_handle.search(**query).total_results print "Iterating through %s objects..." % (total_results) total_iterations = total_results / query['rows'] if total_results % query['rows'] > 0: total_iterations += 1 print "Total iterations:",total_iterations # iterate through objects cursor = 0 # large iterations for iteration in range(0,total_iterations): # perform new query query['start'] = iteration * query['rows'] print query results = solr_handle.search(**query) # for each in smaller query for doc in results.documents: PID = doc['id'] print "%s gets index: %s" % (PID, cursor) # retrieve COLLINDEX JSON, edit current collection index, resubmit obj_ohandle = fedora_handle.get_object(PID) DS_handle = obj_ohandle.getDatastreamObject("COLLINDEX") COLLINDEX_JSON = DS_handle.content # change values collection_key = "wayne:"+collection_PID_suffix if DS_handle.exists == True: collection_index_dict = json.loads(COLLINDEX_JSON) collection_index_dict[collection_key] = {"index":cursor} else: collection_index_dict = { collection_key : {"index":cursor} } # write new content DS_handle = eulfedora.models.DatastreamObject(obj_ohandle, "COLLINDEX", "COLLINDEX", control_group="M") # construct DS object DS_handle.mimetype = "application/json" # content DS_handle.content = json.dumps(collection_index_dict) # save constructed object result = DS_handle.save() print "Result for %s: %s" % (PID, result) # bump counter cursor += 1 return "Finis."