def goThroughCandidateDB(self):
    """Classify recent candidate events and sync the classified collection.

    Fetches every candidate document whose ``created_time`` is at or after
    "now minus two hours", scores it with ``self.clf`` and then updates the
    classified-event collection:

    * score > 0.5: the event is passed to ``addEvent`` on the classified
      collection, whether or not an event with the same ID is already there;
    * otherwise, if an event with the same ID is already there, it is
      removed with ``deleteEventByID``;
    * otherwise nothing is written.

    Returns nothing. Besides the collection updates, it emits one
    warning-level log line per candidate and prints one line for every
    add or delete it performs.
    """
    candidates = EventInterface(self.candidate_db, self.candidate_collection)
    classified = EventInterface(self.classified_event_db,
                                self.classified_event_collection)

    # Consider only the past 2 hours for merge.
    # NOTE(review): the bound is sent as a string, so 'created_time' is
    # presumably stored as a string timestamp -- confirm against the writer.
    window_seconds = 60 * 60 * 2
    low_bound = str(int(getCurrentStampUTC()) - window_seconds)
    condition = {'created_time': {'$gte': low_bound}}

    recent_documents = candidates.getAllDocuments(condition=condition)
    for cnt, document in enumerate(recent_documents):
        # Lazy logging arguments: the message is only formatted if emitted.
        # The counter is 0-based, as in the original message.
        logging.warning("Classifying %d-th candidate event...", cnt)

        event = Event(document)
        region = Region(event.getRegion())
        # The corpus is looked up per region key of the event.
        corpus = self.all_corpus[region.getKey()]
        features = BaseFeatureProduction(event, corpus).extractFeatures()
        prob = self.clf.classify(features)

        already_classified = (
            classified.getEventByID(event.getID()) is not None)

        # print('...') with a single argument prints the same text under
        # both the Python 2 print statement and the print function.
        if prob > 0.5:
            if already_classified:
                # presumably addEvent merges into the stored event -- verify
                print('already in front end collection, merge it')
            else:
                print('new events find in collection but not in front end , add it')
            classified.addEvent(event)
        elif already_classified:
            # A previously accepted event no longer passes the threshold.
            print('after merge it becomes none event, delete it')
            classified.deleteEventByID(event.getID())