def prefetch_from_db(self):
    """Bulk-load all DB state needed later into caches on ``self``.

    For performance reasons, we don't want to do database queries for
    every page.  Pull relevant information from the DB that we will
    need later and store it as dicts and sets.
    """
    logger.info('querying db for %s', self.log_description)
    # Caches filled in by the prefetch_* helpers below.
    self.label_ids_by_name = {}
    self.text_detector_weights = {}
    self.label_lookup = {}
    self.prefetch_keywords()
    self.prefetch_generic_labels()
    self.prefetch_labels()
    # page id -> remote id for every page in this batch
    self.remote_ids = dict(
        session.query(WebPage.id, WebPage.remote_id).filter(
            WebPage.id.in_(self.page_ids)))
    # NOTE(review): fetches ALL rotating-content pages, not just those
    # in self.page_ids — presumably the table is small; confirm.
    self.rotating_content_pages = [
        row.remote_id
        for row in session.query(RotatingContentPage.remote_id)
    ]
    self.prefetch_active_videos()
    self.prefetch_crawl_status()
    self.prefetch_last_detection_times()
    self.prefetch_page_contents()
    # Ordering matters: prefetch_wt_clf_targets populates
    # self.clf_target_ids, which prefetch_vdrs and prefetch_tdrs read;
    # prefetch_vdrs also reads self.active_videos set above.
    self.prefetch_wt_clf_targets()
    self.prefetch_vdrs()
    self.prefetch_wplrs()
    self.prefetch_tdrs()
    self.prefetch_domain_results()
    self.prefetch_admin_labels()
    logger.info('done querying db for %s', self.log_description)
def update_status(cls):
    """Ingest new data from MTurk and write it to the database.

    For every unfinished training job, counts outstanding HITs and
    unprocessed URLs.  URLs that Dynamo reports as missing or 'Failed'
    will never complete, so they are marked processed.  A job with
    nothing outstanding is marked finished.  Per-job stats are printed
    as a side effect; changes are flushed at the end.
    """
    for job in cls.query.filter(cls.finished == False):
        num_hits_left = session.query(BoxHit).filter_by(
            training_job_id=job.id, outstanding=True).count()
        urls_left = session.query(VideoTrainingURL).filter_by(
            training_job_id=job.id, processed=False)
        dynamo = DynamoIngestionStatusClient()
        num_urls_left = 0
        for url in urls_left:
            dynamo_url = dynamo.get(url.url)
            if dynamo_url is None or dynamo_url['status'] == 'Failed':
                # will never be processed, so ignore for our purposes
                url.processed = True
            else:
                num_urls_left += 1
        if num_hits_left + num_urls_left == 0:
            job.finished = True
            print '*** Job ID: %s is complete ***' % str(job.id)

        print '------------- Stats for Job ID: %s -------------' % str(
            job.id)
        print 'Total URLs : %i' % VideoTrainingURL.query.filter_by(
            training_job_id=job.id).count()
        print 'Total HITs : %i' % BoxHit.query.filter_by(
            training_job_id=job.id).count()
        if not job.finished:
            print 'unprocessed URLs: %i' % num_urls_left
            print 'outstanding HITs: %i\n' % num_hits_left
    # persist the processed/finished flags set above
    session.flush()
def results_to_qa(cls, min_date):
    """
    Find (max_hits_per_detector) box_detector_results since min_date
    for Box-QA-enabled Detectors.

    Returns:
        dict mapping clf_target -> list of box ids to send to QA.
    """
    bdr, vdr = BoxDetectorResult, VideoDetectorResult
    results = defaultdict(list)
    for clf_target in cls.enabled_clf_targets():
        max_boxes = clf_target.box_qa_count
        # Py2 integer division: number of videos needed to reach
        # max_boxes at BOXES_TO_QA_PER_VIDEO boxes each.
        max_videos = max_boxes / BOXES_TO_QA_PER_VIDEO
        if isinstance(clf_target.clf, FaceRecognizeClassifier):
            # Face recognizers: take the most recently detected videos,
            # then QA up to BOXES_TO_QA_PER_VIDEO boxes from each.
            query = Video.query.join(Video.video_detector_results).filter(
                vdr.timestamp > min_date,
                vdr.clf_target_id == clf_target.id)
            query = query.order_by(vdr.timestamp.desc()).limit(max_videos)
            for video in query:
                # keep only boxes whose frame actually exists on S3
                box_ids = [
                    b.id for b in video.boxes
                    if b.timestamp in video.s3_timestamps()
                ]
                if box_ids:
                    query = session.query(bdr.box_id).filter(
                        bdr.box_id.in_(box_ids),
                        bdr.clf_target_id == clf_target.id)
                    # removing already qa'd boxes
                    query = query.outerjoin(
                        MTurkBox,
                        and_(MTurkBox.box_id == bdr.box_id,
                             MTurkBox.label_id ==
                             clf_target.target_label_id))
                    query = query.filter(MTurkBox.box_id == None)
                    query = query.order_by(
                        bdr.box_id).limit(BOXES_TO_QA_PER_VIDEO)
                    results[clf_target] += [bid for (bid,) in query]
        else:
            bdr = BoxDetectorResult
            base_query = session.query(Box.id, Box.timestamp, Video)
            base_query = base_query.filter(Box.video_id == Video.id)
            base_query = base_query.join(
                (bdr, Box.id == bdr.box_id)).filter(
                    bdr.timestamp > min_date,
                    bdr.clf_target_id == clf_target.id)
            # removing already qa'd boxes
            base_query = base_query.outerjoin(
                MTurkBox,
                and_(MTurkBox.box_id == Box.id,
                     MTurkBox.label_id == clf_target.target_label_id))
            base_query = base_query.filter(MTurkBox.box_id == None)
            base_query = base_query.order_by(bdr.box_id)
            bids = []
            for bid, ts, v in base_query:
                # keep only boxes whose frame exists on S3, up to
                # max_boxes in total for this target
                if ts in v.s3_timestamps():
                    bids.append(bid)
                    if len(bids) >= max_boxes:
                        break
            results[clf_target] += bids
    return results
def _get_training_data(cls):
    """Return (detector_id, page_id) pairs used to train each text
    detector, restricted to the detector's latest version (its
    ``updated_at`` timestamp)."""
    tp = TrainingPage
    # Latest version for every detector that has training pages.
    id_version_pairs = [
        (detector_id, AbstractTextDetector.get(detector_id).updated_at)
        for (detector_id,) in
        session.query(tp.detector_id).distinct(tp.detector_id)
    ]
    if not id_version_pairs:
        return []
    # Keep only training pages recorded against those latest versions.
    query = session.query(tp.detector_id, tp.page_id).filter(
        tuple_(tp.detector_id, tp.detector_version).in_(id_version_pairs))
    return query.all()
def results_to_qa(cls, min_date):
    """
    Find (max_hits_per_detector) text detector results since (min_date)
    for QA-enabled detectors.

    Returns:
        list of (clf_target, page_id, clf.updated_at) tuples.
    """
    tdr, wpi = TextDetectorResult, WebPageInventory
    base_query = session.query(tdr).join(
        (WebPage, tdr.page_id == WebPage.id))
    base_query = base_query.filter(tdr.timestamp >= min_date)
    # outer join to inventory so we can prioritize high-count pages
    base_query = base_query.outerjoin((wpi, wpi.page_id == tdr.page_id))
    results = []
    for clf_target in cls.enabled_clf_targets():
        # only pages whose text detection ran after the detector's
        # current version was deployed
        query = base_query.filter(
            tdr.clf_target_id == clf_target.id,
            WebPage.text_detection_update > clf_target.clf.updated_at)
        # anti-join: skip pages already QAed for this target's label
        query = query.outerjoin(
            PageHit,
            and_(PageHit.page_id == tdr.page_id,
                 PageHit.label_id == clf_target.target_label_id)).\
            filter(PageHit.hit_id == None)
        query = query.order_by(wpi.count.desc())
        query = query.limit(clf_target.screenshot_count)
        for inst in query:
            results.append((clf_target, inst.page_id,
                            clf_target.clf.updated_at))
    return results
def generate_QA_numbers(cls, start_date, end_date):
    """Generate the QA report for all detectors.

    For each classifier target, counts MTurk box responses in the
    [start_date, end_date) window: how many came back True, how many
    got any answer, and how many ended without consensus (NULL result).
    Boxes used for training the detector and outstanding hits are
    excluded.

    Returns:
        list of (clf_target_id, trues, total, conflicts) rows ordered
        by label name.
    """
    # counts rows where expr is true — nullif() turns 0/false into NULL
    # which count() then skips
    count_bools = lambda expr: func.count(func.nullif(expr, 0))
    # Total MTurk responses that were True
    trues_right = count_bools(MTurkBox.result == True)
    # Total Responses we got from MTurk (True or False)
    results_total = count_bools(MTurkBox.result != None)
    # Total responses where there was no consensus
    conflict_total = count_bools(MTurkBox.result == None)
    cols = [ClassifierTarget.id, trues_right, results_total,
            conflict_total]
    query = session.query(*cols)
    query = query.join(ClassifierTarget.target_label)
    query = query.join(ClassifierTarget.mturk_box_detector_results)
    query = query.filter(cls.timestamp >= start_date,
                         cls.timestamp < end_date)
    query = query.join(cls.mturk_box)
    # anti-join: exclude boxes that were used to train this detector
    query = query.outerjoin(
        TrainingBox,
        (ClassifierTarget.clf_id == TrainingBox.detector_id) &
        (MTurkBox.box_id == TrainingBox.box_id))
    query = query.filter(TrainingBox.box_id == None)
    query = query.join(MTurkBox.hit)
    query = query.filter_by(outstanding=False)
    query = query.group_by(cls.clf_target_id).order_by(Label.name)
    return query.all()
def results_to_qa(cls, min_date):
    """
    Find video detector results since (min_date) for QA-enabled
    detectors.

    Returns:
        list of (video_id, clf_target) pairs, highest-inventory videos
        first, at most collage_count per target.
    """
    vdr, wpi = VideoDetectorResult, WebPageInventory
    base_query = session.query(
        vdr.video_id).filter(vdr.timestamp >= min_date)
    base_query = base_query.join((wpi, wpi.video_id == vdr.video_id))
    # rank videos by total inventory count (popularity)
    base_query = base_query.group_by(
        vdr.video_id).order_by(func.sum(wpi.count).desc())
    results = []
    for clf_target in cls.enabled_clf_targets():
        max_hits = clf_target.collage_count
        # Note:
        # Need to use clf.id and not clf_id since
        # clf_id will be int_id for clf whereas
        # for VDRs we use the uuid of the clf.
        query = base_query.filter(vdr.clf_target_id == clf_target.id)
        # anti-join: skip videos already QAed for this target's label
        query = query.outerjoin(
            VideoHit,
            and_(VideoHit.video_id == vdr.video_id,
                 VideoHit.label_id == clf_target.target_label_id)).\
            filter(VideoHit.hit_id == None)
        results.extend((row[0], clf_target)
                       for row in query.limit(max_hits))
    return results
def submit_golden_hits(n_hits, n_lookback):
    """Submit golden hits.

    Fetches the N_LOOKBACK hits most recently selected for golden
    submission and submits N_HITS of them, cycling through them as
    necessary, and prioritizing those that have been submitted as
    golden the least number of times.

    Args:
        n_hits: Number of golden hits submissions.
        n_lookback: Number of distinct hits used for submission.

    Raises:
        AssertionError: No candidate golden hits
    """
    query = session.query(GoldenHitCandidate.hit_id)
    assert query.count() > 0, "No candidate golden hits"
    # the n_lookback most recent candidates ...
    query = query.order_by(
        GoldenHitCandidate.created_at.desc()).limit(n_lookback)
    # ... wrapped via from_self() so the aggregation below runs over
    # the limited set rather than the whole table
    query = query.from_self()
    query = query.outerjoin(GoldenHit,
                            GoldenHitCandidate.hit_id == GoldenHit.hit_id)
    query = query.group_by(GoldenHitCandidate.hit_id)
    # least-often-submitted candidates first
    query = query.order_by(func.count(
        GoldenHit.hit_id).asc()).limit(n_hits)
    # cycle() allows n_hits to exceed the number of distinct candidates
    hit_ids = islice(cycle([hit_id for (hit_id, ) in query]), n_hits)
    for hit in map(get_hit_from_hit_id, hit_ids):
        ghid = MechanicalTurkEvaluator.create_duplicate_hit(hit)
        # record the duplicate so future runs deprioritize this hit
        GoldenHit(golden_hit_id=ghid, hit_id=hit.hit_id)
    session.flush()
def prefetch_crawl_status(self):
    """Populate self.video_crawl_complete: page_id -> True when the
    page has a recorded last_crawled_video timestamp (defaults False)."""
    self.video_crawl_complete = defaultdict(bool)
    rows = session.query(WebPage.id, WebPage.last_crawled_video).filter(
        WebPage.id.in_(self.page_ids))
    for pid, last_crawled in rows:
        if last_crawled is not None:
            self.video_crawl_complete[pid] = True
def prefetch_page_contents(self):
    """Populate self.page_contents: page_id -> (processed_title,
    processed body text)."""
    title_rows = session.query(
        WebPage.id, WebPage.processed_title).filter(
            WebPage.id.in_(self.page_ids))
    body_by_page = get_page_processed_text_dict(self.page_ids,
                                                silent=True)
    self.page_contents = {
        pid: (title, body_by_page[pid])
        for pid, title in title_rows
    }
def get_result(self, golden_hit_id):
    """Return the MTurkBox result for a golden box hit.

    The composite id encodes '<golden_hit_id>_<box_id>'.  We follow
    GoldenHit -> BoxHit -> MTurkBox to locate the stored result.
    """
    ghid, box_id = golden_hit_id.split('_')
    query = session.query(MTurkBox.result)
    query = query.join(MTurkBox.hit)
    query = query.join(GoldenHit, GoldenHit.hit_id == BoxHit.hit_id)
    query = query.filter(GoldenHit.golden_hit_id == ghid,
                         MTurkBox.box_id == box_id)
    return query.scalar()
def prefetch_wplrs(self):
    """Populate self.wplr_lookup: page_id -> set of label ids that have
    a WebPageLabelResult for that page."""
    self.wplr_lookup = defaultdict(set)
    wplr = WebPageLabelResult
    query = session.query(wplr.page_id, wplr.label_id).filter(
        wplr.page_id.in_(self.page_ids))
    if not self.all_labels:
        # restrict to the target labels plus all their descendants
        relevant_ids = Label.all_descendant_ids(self.target_label_ids)
        query = query.filter(wplr.label_id.in_(relevant_ids))
    for pid, lid in query:
        self.wplr_lookup[pid].add(lid)
def _get_job_status(self):
    """Build a human-readable status report for this training job:
    outstanding HITs, unprocessed URLs, and faces obtained so far."""
    total_hits = session.query(BoxHit).filter_by(
        training_job_id=self.id).count()
    outstanding_hits = session.query(BoxHit).filter_by(
        training_job_id=self.id, outstanding=True).count()
    unprocessed_urls = session.query(VideoTrainingURL).filter_by(
        job=self, processed=False).count()
    faces_obtained = MTurkBox.query.filter_by(
        label=self.evaluator.target_label, result=True).count()
    report_lines = [
        '------------- Stats for Job ID: %s -------------' % str(self.id),
        'Job for Label : %s' % self.label.name,
        'Total URLs : %d' % self.num_urls,
        'Total HITs : %d' % total_hits,
        'unprocessed URLS : %d' % unprocessed_urls,
        'outstanding Hits : %d' % outstanding_hits,
        'Job Finish Status : %s' % self.finished,
        'Faces Obtained : %d' % faces_obtained,
    ]
    return '\n'.join(report_lines) + '\n'
def get_result(self, golden_hit_id):
    """Return the MTurkImage result for a golden image hit.

    The composite id encodes '<golden_hit_id>_<video_id>_<timestamp>'.
    We follow GoldenHit -> ImageHit -> MTurkImage to locate the image
    with the given video id and timestamp.
    """
    ghid, video_id, timestamp = golden_hit_id.split('_')
    query = session.query(MTurkImage.result)
    query = query.join(MTurkImage.hit)
    query = query.join(GoldenHit, GoldenHit.hit_id == ImageHit.hit_id)
    query = query.filter(GoldenHit.golden_hit_id == ghid,
                         MTurkImage.timestamp == timestamp,
                         MTurkImage.video_id == video_id)
    return query.scalar()
def _generate_QA_numbers_query(cls, start_date, end_date, hit_type,
                               exclude_table):
    """Build the per-label QA aggregation query for one hit type.

    Counts, per label: true results, answered results (non-NULL), and
    no-consensus results (NULL), over hits in [start_date, end_date],
    excluding outstanding hits and hits present in exclude_table.
    """
    as_int_sum = lambda expr: cast(func.sum(expr), Integer)
    true_positives = case([(hit_type.result == True, 1)], else_=0)
    total = case([(hit_type.result != None, 1)], else_=0)
    conflicts = case([(hit_type.result == None, 1)], else_=0)
    query = session.query(hit_type.label_id,
                          as_int_sum(true_positives),
                          as_int_sum(total),
                          as_int_sum(conflicts))
    # anti-join: drop hits that appear in exclude_table
    query = query.outerjoin(exclude_table,
                            exclude_table.hit_id == hit_type.hit_id)
    query = query.filter(exclude_table.hit_id == None)
    query = query.filter(
        hit_type.timestamp.between(start_date, end_date))
    query = query.filter(hit_type.outstanding == False)
    return query.group_by(hit_type.label_id)
def prefetch_tdrs(self):
    """Grab all Text Detector Results for our pages.

    Populates self.tdr_lookup: page_id -> set of clf_target_ids that
    fired on that page.
    """
    self.tdr_lookup = defaultdict(set)
    if not self.clf_target_ids:
        return
    tdr = TextDetectorResult
    rows = session.query(tdr.page_id, tdr.clf_target_id).filter(
        tdr.page_id.in_(self.page_ids),
        tdr.clf_target_id.in_(self.clf_target_ids))
    for pid, ct_id in rows:
        self.tdr_lookup[pid].add(ct_id)
def results_to_qa(cls, min_date):
    """Find image_detector_results since min_date for Image-QA-enabled
    Clf Targets.

    Returns:
        dict mapping clf_target -> set of (video_id, timestamp) images
        to QA.
    """
    idr, ti = ImageDetectorResult, TrainingImage
    results = defaultdict(set)
    for clf_target in cls.enabled_clf_targets():
        # Image count ends up being the number of videos from which we
        # query. So, the actual number of images might be more
        max_videos = clf_target.image_qa_count
        video_query = session.query(idr.video_id.distinct())
        video_query = video_query.filter(
            idr.clf_target_id == clf_target.id,
            idr.timestamp > min_date)
        # Images used for training this detector must never be QAed.
        # This set is per-detector, so fetch it once per clf_target
        # instead of re-querying inside the per-video loop (bug: the
        # original issued this identical query for every video).
        training_images = set(
            (vid_id, ts) for vid_id, ts in
            session.query(ti.video_id, ti.timestamp).filter_by(
                detector_id=clf_target.clf.id))
        myvids = set()
        for (vid,) in video_query:
            if len(myvids) >= max_videos:
                break
            s3_images = Video.get(vid).s3_timestamps()
            if not s3_images:
                continue
            # detections for this video whose frames exist on S3
            image_query = session.query(idr.video_id, idr.time).filter(
                idr.video_id == vid,
                idr.clf_target_id == clf_target.id,
                idr.time.in_(s3_images))
            for vid_id, ts in image_query:
                if (vid_id, ts) in training_images:
                    continue
                # skip images already sent to MTurk for this label
                already_qad = MTurkImage.query.filter_by(
                    video_id=vid_id, timestamp=ts,
                    label_id=clf_target.target_label_id).first()
                if not already_qad:
                    results[clf_target].add((vid_id, ts))
                    myvids.add(vid_id)
    return results
def prefetch_awplrs(self):
    """Grab all admin web page label results for our pages.

    Populates self.awplr_lookup: page_id -> list of (label_id, result).
    """
    self.awplr_lookup = defaultdict(list)
    awplr = AdminWebPageLabelResult
    query = session.query(awplr.page_id, awplr.label_id, awplr.result)
    query = query.filter(awplr.page_id.in_(self.page_ids))
    if not self.all_labels:
        query = query.filter(awplr.label_id.in_(self.target_label_ids))
    for pid, lid, result in query:
        self.awplr_lookup[pid].append((lid, result))
def _generate_QA_numbers(cls, cls_to_qa_prop, cls_to_qa, join_name,
                         target_result, start_date, end_date):
    """Aggregate golden-QA accuracy numbers in [start_date, end_date).

    Args:
        cls_to_qa_prop: column to select in the report rows.
        cls_to_qa: entity whose .id is used for grouping/ordering.
        join_name: relationship from cls to the entity being QAed.
        target_result: the worker-result column compared against
            cls.expected_result.
        start_date, end_date: half-open timestamp window on cls.

    Returns:
        list of (prop, trues_right, trues_total, conflict_total) rows.
    """
    # counts rows where expr is true (nullif drops 0/false)
    count_bools = lambda expr: func.count(func.nullif(expr, 0))
    # worker answered True where the golden expected True
    trues_right = count_bools(
        and_(target_result == True, cls.expected_result == True))
    # expected-True goldens that received any worker answer
    trues_total = count_bools(
        and_(cls.expected_result == True, target_result != None))
    # rows with no worker consensus (NULL result)
    conflict_total = count_bools(target_result == None)
    query = session.query(
        cls_to_qa_prop, trues_right, trues_total, conflict_total)
    query = query.filter(
        cls.timestamp >= start_date, cls.timestamp < end_date)
    # completed hits only
    query = query.join(join_name).join(
        cls.hit).filter_by(outstanding=False)
    return query.group_by(cls_to_qa.id).order_by(cls_to_qa.id).all()
def prefetch_labels(self):
    """Populate self.base_label_ids, self.descendant_label_ids and
    self.label_decision_thresholds."""
    self.base_label_ids = set()
    self.label_decision_thresholds = {}
    if not self.all_labels:
        # use only labels that can actually produce results
        # i.e. have at least one weighted_keyword, weighted_label,
        # weighted_detector or weighted_text_detector
        query = session.query(Label.id.distinct()).filter(
            Label.id.in_(self.target_label_ids))
        query = query.outerjoin(Label.weighted_keywords)
        query = query.outerjoin(Label.weighted_labels)
        query = query.outerjoin(Label.weighted_clf_targets)
        query = query.filter((WeightedKeyword.keyword_id != None) |
                             (WeightedLabel.child_id != None) |
                             (WeightedClfTarget.clf_target_id != None))
        self.base_label_ids.update(row[0] for row in query)
    else:
        # all-labels mode: start from labels having keywords or clf
        # targets, then pull in all of their ancestors too
        query = session.query(Label.id.distinct())
        query = query.outerjoin(Label.weighted_keywords)
        query = query.outerjoin(Label.weighted_clf_targets)
        query = query.filter((WeightedKeyword.keyword_id != None) |
                             (WeightedClfTarget.clf_target_id != None))
        label_ids = [row[0] for row in query]
        if label_ids:
            self.base_label_ids.update(Label.all_ancestor_ids(label_ids))
    # descendants of the targets, excluding the targets themselves
    self.descendant_label_ids = (
        Label.all_descendant_ids(self.target_label_ids) -
        self.target_label_ids)
    # fetch all label thresholds, since it is cheap
    query = session.query(Label.id, Label.decision_threshold)
    for label_id, thresh in query:
        self.label_decision_thresholds[label_id] = thresh
def preroll_results_to_qa_for_label(cls, label_id, page_ids_to_ignore):
    """Pick random inventory pages that have at least one VideoOnPage
    row and a WebPageLabelResult for label_id, up to the label's
    screenshot_count, skipping page_ids_to_ignore."""
    label = Label.get(label_id)
    wpi = WebPageInventory
    query = session.query(wpi.page_id.distinct())
    # keep only pages that have at least one VideoOnPage row
    query = query.outerjoin(VideoOnPage,
                            VideoOnPage.page_id == wpi.page_id)
    query = query.filter(VideoOnPage.page_id != None)
    if page_ids_to_ignore:
        query = query.filter(~wpi.page_id.in_(page_ids_to_ignore))
    query = query.join(WebPageLabelResult,
                       WebPageLabelResult.page_id == wpi.page_id)
    query = query.filter(WebPageLabelResult.label_id == label.id)
    query = query.order_by(func.rand()).limit(label.screenshot_count)
    return [pid for (pid,) in query]
def prefetch_last_detection_times(self):
    """Populate self.last_detections (from active videos) and
    self.last_text_detections (from WebPage.text_detection_update);
    both default to None."""
    self.last_detections = defaultdict(lambda: None)
    self.last_text_detections = defaultdict(lambda: None)
    for page_id in self.page_ids:
        detections = [video['last_detection']
                      for video in self.active_videos[page_id]]
        # Use the oldest detection time; if there are no videos or any
        # video is undetected (None), the page has no usable time.
        if detections and None not in detections:
            self.last_detections[page_id] = min(detections)
        else:
            self.last_detections[page_id] = None
    rows = session.query(WebPage.id, WebPage.text_detection_update)
    rows = rows.filter(WebPage.id.in_(self.page_ids))
    self.last_text_detections.update(rows)
def prefetch_vdrs(self):
    """Populate self.vdr_lookup: video_id -> set of clf_target_ids with
    a VideoDetectorResult, for all videos active on our pages."""
    self.vdr_lookup = defaultdict(set)
    # every video id appearing among the pages' active videos
    active_video_ids = set(
        video['video_id']
        for videos in self.active_videos.values()
        for video in videos)
    if not (self.clf_target_ids and active_video_ids):
        return
    vdr = VideoDetectorResult
    rows = session.query(vdr.video_id, vdr.clf_target_id).filter(
        vdr.video_id.in_(active_video_ids),
        vdr.clf_target_id.in_(self.clf_target_ids))
    for video_id, ct_id in rows:
        self.vdr_lookup[video_id].add(ct_id)
def prefetch_keywords(self):
    """Populate self.keyword_matcher and self.weighted_keywords
    (label_id -> list of (keyword_id, title_weight, body_weight))."""
    from affine.detection.nlp.keywords.keyword_matching import \
        PageKeywordMatcher
    self.keyword_matcher = PageKeywordMatcher()
    self.weighted_keywords = defaultdict(list)
    # eager-load the keyword relation to avoid a query per row
    query = session.query(WeightedKeyword).options(
        joinedload(WeightedKeyword.keyword))
    if not self.all_labels:
        query = query.filter(
            WeightedKeyword.label_id.in_(self.target_label_ids))
    for weighted_kw in query:
        keyword = weighted_kw.keyword
        self.weighted_keywords[weighted_kw.label_id].append(
            (keyword.id, weighted_kw.title_weight,
             weighted_kw.body_weight))
        self.keyword_matcher.add_keyword(keyword.id, keyword.text)
def process_hit(self, hit_id, assignments):
    """Turn raw MTurk assignments for one box hit into result tuples.

    Each assignment maps 'box_<id>' keys to worker clicks.  A box is
    True when at least match_threshold workers clicked it, False when
    at least match_threshold did not, else None (no consensus).  Boxes
    nobody clicked are unanimously False.

    Returns:
        list of (hit_id, box_id, target_label_id, result) tuples.
    """
    processed_results = []
    logger.info('processing box hit %s' % hit_id)
    # get a list of the box ids for this hit from the first assignment in
    # the assignments
    box_ids = assignments[0]['box_ids'][0].split('_')
    # combine all the assignments' results into a single list
    all_clicked_boxes = reduce(lambda x, y: list(set(x) | set(y.keys())),
                               assignments)
    # keep only keys matching the box pattern, stripped of the prefix
    all_clicked_boxes = [
        k.replace('box_', '') for k in all_clicked_boxes
        if self.box_pat.search(k)
    ]
    # guaranteed 3/3 if nobody clicked
    false_box_ids = set(box_ids) - set(all_clicked_boxes)
    for box_id in false_box_ids:
        processed_results.append(
            (hit_id, int(box_id), self.target_label_id, False))
    # tally per-box click counts across all assignments
    true_results = defaultdict(int)
    for assignment in assignments:
        for box_id in all_clicked_boxes:
            munged = "box_%s" % str(box_id)
            true_results[box_id] += 1 if munged in assignment else 0
    true_box_ids = set()
    for box_id, num_true in true_results.iteritems():
        num_false = self.max_assignments - num_true
        if num_true >= self.match_threshold:
            result = True
            true_box_ids.add(box_id)
        elif num_false >= self.match_threshold:
            result = False
        else:
            # no consensus among workers
            result = None
        processed_results.append(
            (hit_id, int(box_id), self.target_label_id, result))
    # update video results if true face boxes are found
    if true_box_ids:
        query = session.query(Box.video_id.distinct()).filter(
            Box.id.in_(true_box_ids),
            or_(Box.box_type == 'Face', Box.box_type == 'Logo'))
        for (vid, ) in query:
            self.save_video_result(vid, True)
    return processed_results
def prefetch_avlrs(self):
    """Grab all admin video label results for our pages.

    Populates self.avlr_lookup: page_id -> list of (label_id, result),
    considering only active, non-preroll videos on the pages.
    """
    self.avlr_lookup = defaultdict(list)
    avlr = AdminVideoLabelResult
    query = session.query(VideoOnPage.page_id, avlr.label_id,
                          avlr.result)
    query = query.join((avlr, VideoOnPage.video_id == avlr.video_id))
    query = query.filter(VideoOnPage.active == True,
                         VideoOnPage.is_preroll == False,
                         VideoOnPage.page_id.in_(self.page_ids))
    if not self.all_labels:
        query = query.filter(avlr.label_id.in_(self.target_label_ids))
    for page_id, label_id, result in query:
        self.avlr_lookup[page_id].append((label_id, result))
def register_prev_qa(cls):
    """Link existing PageHits to current detector versions.

    For every enabled clf target, finds pages that have both a
    TextDetectorResult from the current detector version and a PageHit
    for the target's label, and creates a cls row tying them together —
    unless one already exists for this (target, version, page).
    """
    tdr, ph, wp = TextDetectorResult, PageHit, WebPage
    base_query = session.query(
        tdr.page_id, tdr.clf_target_id, ph.hit_id).\
        filter(ph.page_id == tdr.page_id)
    base_query = base_query.join(wp, tdr.page_id == wp.id)
    for clf_target in cls.enabled_clf_targets():
        # pages whose text detection ran after the current detector
        # version, with a page hit for the target's label
        query = base_query.\
            filter(tdr.clf_target_id == clf_target.id,
                   wp.text_detection_update > clf_target.clf.updated_at,
                   ph.label_id == clf_target.target_label_id)
        # anti-join: skip pages already registered for this version
        query = query.\
            outerjoin(cls, and_(cls.clf_target_id == clf_target.id,
                                cls.detector_version ==
                                clf_target.clf.updated_at,
                                cls.page_id == tdr.page_id))
        for i in query.filter(cls.page_id == None):
            cls(page_id=i.page_id,
                detector_version=clf_target.clf.updated_at,
                clf_target_id=clf_target.id,
                hit_id=i.hit_id)
    # persist the newly created rows
    session.flush()
def prefetch_wt_clf_targets(self):
    """Populate self.wt_clf_target_lookup (label_id -> list of
    (clf_target_id, weight)) and self.clf_target_ids, restricted to
    classifiers that are currently enabled."""
    self.wt_clf_target_lookup = defaultdict(list)
    self.clf_target_ids = set()
    wct = WeightedClfTarget
    query = session.query(wct.label_id, wct.clf_target_id, wct.weight)
    if not self.all_labels:
        query = query.filter(wct.label_id.in_(self.target_label_ids))
    # only targets whose classifier is enabled
    query = query.join(wct.clf_target).join(
        ClassifierTarget.clf).filter(
            AbstractClassifier.enabled_since != None)
    for label_id, clf_target_id, weight in query:
        self.wt_clf_target_lookup[label_id].append(
            (clf_target_id, weight))
        self.clf_target_ids.add(clf_target_id)
def non_preroll_results_to_qa_for_label(cls, label_id, page_ids_to_ignore):
    """Pick random inventory pages with NO VideoOnPage rows and a
    WebPageLabelResult for label_id, up to the label's
    non_preroll_qa_count, skipping page_ids_to_ignore."""
    label = Label.get(label_id)
    # Set end date such that no pages ingested same day are QAed to
    # allow for all stages of ingestion to complete
    end_date = datetime.utcnow() - timedelta(days=1)
    wpi = WebPageInventory
    query = session.query(wpi.page_id.distinct())
    query = query.join(WebPage, wpi.page_id == WebPage.id)
    query = query.filter(WebPage.last_crawled_video <= end_date)
    # anti-join: keep only pages with no VideoOnPage rows at all
    query = query.outerjoin(VideoOnPage,
                            VideoOnPage.page_id == wpi.page_id)
    query = query.filter(VideoOnPage.page_id == None)
    if page_ids_to_ignore:
        query = query.filter(~wpi.page_id.in_(page_ids_to_ignore))
    query = query.join(WebPageLabelResult,
                       WebPageLabelResult.page_id == wpi.page_id)
    query = query.filter(WebPageLabelResult.label_id == label.id)
    query = query.order_by(func.rand()).limit(label.non_preroll_qa_count)
    return [pid for (pid,) in query]
def results_to_qa_for_label(cls, label_id):
    """Pick up to collage_count random (video, page) pairs labeled with
    label_id from inventory, skipping ignored videos/pages.

    Returns:
        list of (label_id, video_id, page_id, True) tuples.
    """
    logger.info("Gathering results for label_id : %s", label_id)
    ignore_video_ids = cls.get_ignore_video_ids(label_id)
    ignore_page_ids = cls.get_ignore_page_ids(label_id)
    wpi = WebPageInventory
    query = session.query(wpi.video_id, wpi.page_id)
    query = query.join(WebPageLabelResult,
                       WebPageLabelResult.page_id == wpi.page_id)
    query = query.distinct(wpi.video_id).filter(
        WebPageLabelResult.label_id == label_id)
    if ignore_video_ids:
        query = query.filter(~wpi.video_id.in_(ignore_video_ids))
    if ignore_page_ids:
        query = query.filter(~wpi.page_id.in_(ignore_page_ids))
    query = query.filter(wpi.video_id != 0).group_by(wpi.video_id)
    vid_page_ids = query.order_by(func.rand()).limit(
        Label.get(label_id).collage_count).all()
    return [(label_id, video_id, page_id, True)
            for video_id, page_id in vid_page_ids]