    def prefetch_from_db(self):
        """Prefetch everything we need from the database.

        For performance reasons we don't want to issue queries for every
        page, so pull the relevant information up front and store it as
        dicts and sets.
        """
        logger.info('querying db for %s', self.log_description)
        self.label_ids_by_name = {}
        self.text_detector_weights = {}
        self.label_lookup = {}

        self.prefetch_keywords()
        self.prefetch_generic_labels()
        self.prefetch_labels()

        self.remote_ids = dict(
            session.query(WebPage.id, WebPage.remote_id).filter(
                WebPage.id.in_(self.page_ids)))
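        # remote ids of every rotating-content page, fetched in one query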
        self.rotating_content_pages = [
            row.remote_id
            for row in session.query(RotatingContentPage.remote_id)
        ]

        self.prefetch_active_videos()
        self.prefetch_crawl_status()
        self.prefetch_last_detection_times()
        self.prefetch_page_contents()
        self.prefetch_wt_clf_targets()
        self.prefetch_vdrs()
        self.prefetch_wplrs()
        self.prefetch_tdrs()
        self.prefetch_domain_results()
        self.prefetch_admin_labels()
        logger.info('done querying db for %s', self.log_description)
    @classmethod
    def update_status(cls):
        """Ingest new data from MTurk and write it to the database."""
        dynamo = DynamoIngestionStatusClient()
        for job in cls.query.filter(cls.finished == False):
            num_hits_left = session.query(BoxHit).filter_by(
                training_job_id=job.id, outstanding=True).count()
            urls_left = session.query(VideoTrainingURL).filter_by(
                training_job_id=job.id, processed=False)
            num_urls_left = 0
            for url in urls_left:
                dynamo_url = dynamo.get(url.url)
                if dynamo_url is None or dynamo_url['status'] == 'Failed':
                    # will never be processed, so ignore for our purposes
                    url.processed = True
                else:
                    num_urls_left += 1
            if num_hits_left + num_urls_left == 0:
                job.finished = True
                print '*** Job ID: %s is complete ***' % str(job.id)

            print '------------- Stats for Job ID: %s -------------' % str(
                job.id)
            print 'Total URLs      : %i' % VideoTrainingURL.query.filter_by(
                training_job_id=job.id).count()
            print 'Total HITs      : %i' % BoxHit.query.filter_by(
                training_job_id=job.id).count()
            if not job.finished:
                print 'unprocessed URLs: %i' % num_urls_left
                print 'outstanding HITs: %i\n' % num_hits_left
        session.flush()
    @classmethod
    def results_to_qa(cls, min_date):
        """
        Find up to box_qa_count box detector results since min_date
        for Box-QA-enabled classifier targets.
        """
        bdr, vdr = BoxDetectorResult, VideoDetectorResult
        results = defaultdict(list)
        for clf_target in cls.enabled_clf_targets():
            max_boxes = clf_target.box_qa_count
            max_videos = max_boxes / BOXES_TO_QA_PER_VIDEO
            if isinstance(clf_target.clf, FaceRecognizeClassifier):
                query = Video.query.join(Video.video_detector_results).filter(
                    vdr.timestamp > min_date,
                    vdr.clf_target_id == clf_target.id)
                query = query.order_by(vdr.timestamp.desc()).limit(max_videos)
                for video in query:
                    box_ids = [
                        b.id for b in video.boxes
                        if b.timestamp in video.s3_timestamps()
                    ]
                    if box_ids:
                        # use a separate name so we don't clobber the video
                        # query we are still iterating over
                        box_query = session.query(bdr.box_id).filter(
                            bdr.box_id.in_(box_ids),
                            bdr.clf_target_id == clf_target.id)
                        # remove boxes that have already been QA'd
                        box_query = box_query.outerjoin(
                            MTurkBox,
                            and_(MTurkBox.box_id == bdr.box_id,
                                 MTurkBox.label_id == clf_target.target_label_id))
                        box_query = box_query.filter(MTurkBox.box_id == None)
                        box_query = box_query.order_by(
                            bdr.box_id).limit(BOXES_TO_QA_PER_VIDEO)
                        results[clf_target] += [bid for (bid,) in box_query]
            else:
                base_query = session.query(Box.id, Box.timestamp, Video)
                base_query = base_query.filter(Box.video_id == Video.id)
                base_query = base_query.join(
                    (bdr, Box.id == bdr.box_id)).filter(
                    bdr.timestamp > min_date,
                    bdr.clf_target_id == clf_target.id)
                # remove boxes that have already been QA'd
                base_query = base_query.outerjoin(
                    MTurkBox,
                    and_(MTurkBox.box_id == Box.id,
                         MTurkBox.label_id == clf_target.target_label_id))
                base_query = base_query.filter(MTurkBox.box_id == None)
                base_query = base_query.order_by(bdr.box_id)
                bids = []
                for bid, ts, v in base_query:
                    if ts in v.s3_timestamps():
                        bids.append(bid)
                    if len(bids) >= max_boxes:
                        break
                results[clf_target] += bids
        return results
    @classmethod
    def _get_training_data(cls):
        """Return (detector_id, page_id) pairs for pages used to train the
        latest version of each detector."""
        tp = TrainingPage
        # Get latest versions for all detector-ids
        detector_id_versions = []
        for (dtc_id,) in session.query(tp.detector_id).distinct(tp.detector_id):
            latest_version = AbstractTextDetector.get(dtc_id).updated_at
            detector_id_versions.append((dtc_id, latest_version))

        if not detector_id_versions:
            return []

        # Get training page-ids only for the latest versions
        tr_dtc_page_ids = session.query(tp.detector_id, tp.page_id).filter(
            tuple_(tp.detector_id, tp.detector_version).in_(detector_id_versions))
        return tr_dtc_page_ids.all()
    @classmethod
    def results_to_qa(cls, min_date):
        """
        Find up to clf_target.screenshot_count text detector results since
        min_date for QA-enabled classifier targets.
        """
        tdr, wpi = TextDetectorResult, WebPageInventory
        base_query = session.query(tdr).join(
            (WebPage, tdr.page_id == WebPage.id))
        base_query = base_query.filter(tdr.timestamp >= min_date)
        base_query = base_query.outerjoin((wpi, wpi.page_id == tdr.page_id))
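        # the inventory join is only used to order results by wpi.count below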

        results = []
        for clf_target in cls.enabled_clf_targets():
            query = base_query.filter(
                tdr.clf_target_id == clf_target.id,
                WebPage.text_detection_update > clf_target.clf.updated_at)
            query = query.outerjoin(
                PageHit, and_(PageHit.page_id == tdr.page_id,
                              PageHit.label_id == clf_target.target_label_id))
            query = query.filter(PageHit.hit_id == None)
            query = query.order_by(wpi.count.desc())
            query = query.limit(clf_target.screenshot_count)
            for inst in query:
                results.append((clf_target, inst.page_id,
                                clf_target.clf.updated_at))

        return results
    @classmethod
    def generate_QA_numbers(cls, start_date, end_date):
        """Generate the QA report for all detectors."""
        count_bools = lambda expr: func.count(func.nullif(expr, 0))
        # Total MTurk responses that were True
        trues_right = count_bools(MTurkBox.result == True)
        # Total Responses we got from MTurk (True or False)
        results_total = count_bools(MTurkBox.result != None)
        # Total responses where there was no consensus
        conflict_total = count_bools(MTurkBox.result == None)

        cols = [ClassifierTarget.id,
                trues_right,
                results_total,
                conflict_total]
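        # one row per classifier target:
        # (target id, # true, # answered, # no-consensus)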

        query = session.query(*cols)
        query = query.join(ClassifierTarget.target_label)
        query = query.join(ClassifierTarget.mturk_box_detector_results)

        query = query.filter(cls.timestamp >= start_date,
                             cls.timestamp < end_date)
        query = query.join(cls.mturk_box)
        query = query.outerjoin(
            TrainingBox,
            (ClassifierTarget.clf_id == TrainingBox.detector_id) &
            (MTurkBox.box_id == TrainingBox.box_id))
        query = query.filter(TrainingBox.box_id == None)
        query = query.join(MTurkBox.hit)
        query = query.filter_by(outstanding=False)
        query = query.group_by(cls.clf_target_id).order_by(Label.name)
        return query.all()
    @classmethod
    def results_to_qa(cls, min_date):
        """
        Find video detector results since min_date
        for QA-enabled classifier targets.
        """
        vdr, wpi = VideoDetectorResult, WebPageInventory
        base_query = session.query(
            vdr.video_id).filter(vdr.timestamp >= min_date)
        base_query = base_query.join((wpi, wpi.video_id == vdr.video_id))
        base_query = base_query.group_by(
            vdr.video_id).order_by(func.sum(wpi.count).desc())
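        # busiest videos first: ordered by total inventory count across pages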

        results = []
        for clf_target in cls.enabled_clf_targets():
            max_hits = clf_target.collage_count
            # Note: we need to use clf.id and not clf_id, since clf_id is the
            # int id of the clf, whereas VDRs use the uuid of the clf.
            query = base_query.filter(vdr.clf_target_id == clf_target.id)
            query = query.outerjoin(
                VideoHit, and_(VideoHit.video_id == vdr.video_id,
                               VideoHit.label_id == clf_target.target_label_id))
            query = query.filter(VideoHit.hit_id == None)
            results.extend((row[0], clf_target)
                           for row in query.limit(max_hits))
        return results
    def submit_golden_hits(n_hits, n_lookback):
        """Submit golden hits.

        Fetches the N_LOOKBACK hits most recently selected for golden submission
        and submits N_HITS of them, cycling through them as necessary, and
        prioritizing those that have been submitted as golden the least number of
        times.

        Args:
            n_hits: Number of golden hits submissions.
            n_lookback: Number of distinct hits used for submission.

        Raises:
            AssertionError: No candidate golden hits
        """
        query = session.query(GoldenHitCandidate.hit_id)
        assert query.count() > 0, "No candidate golden hits"
        query = query.order_by(
            GoldenHitCandidate.created_at.desc()).limit(n_lookback)
        query = query.from_self()
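        # from_self() freezes the LIMITed candidate set in a subquery so the
        # join, grouping and second limit below apply to that set as a whole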
        query = query.outerjoin(GoldenHit,
                                GoldenHitCandidate.hit_id == GoldenHit.hit_id)
        query = query.group_by(GoldenHitCandidate.hit_id)
        query = query.order_by(func.count(
            GoldenHit.hit_id).asc()).limit(n_hits)
        hit_ids = islice(cycle([hit_id for (hit_id, ) in query]), n_hits)
        for hit in map(get_hit_from_hit_id, hit_ids):
            ghid = MechanicalTurkEvaluator.create_duplicate_hit(hit)
            GoldenHit(golden_hit_id=ghid, hit_id=hit.hit_id)
        session.flush()
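
    # Hypothetical invocation (argument values are illustrative only):
    #   submit_golden_hits(n_hits=100, n_lookback=500)
    # submits 100 golden HITs drawn from the 500 most recently selected
    # candidates, cycling through and favoring the least-resubmitted ones.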
    def prefetch_crawl_status(self):
        self.video_crawl_complete = defaultdict(bool)
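        # pages default to False; flip to True once last_crawled_video is set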

        q = session.query(WebPage.id, WebPage.last_crawled_video)
        q = q.filter(WebPage.id.in_(self.page_ids))

        for page_id, video_ts in q:
            if video_ts is not None:
                self.video_crawl_complete[page_id] = True
    def prefetch_page_contents(self):
        """Populate self.page_contents with (processed_title, processed_text)."""
        self.page_contents = {}
        query = session.query(WebPage.id, WebPage.processed_title).filter(
            WebPage.id.in_(self.page_ids))
        processed_text_dict = get_page_processed_text_dict(self.page_ids,
                                                           silent=True)
        for page_id, processed_title in query:
            self.page_contents[page_id] = (processed_title,
                                           processed_text_dict[page_id])
    def get_result(self, golden_hit_id):
        # We need the result of the MTurkBox corresponding to golden_hit_id:
        # the MTurkBox with box_id whose BoxHit's hit_id has a GoldenHit
        # with this golden_hit_id.
        golden_hit_id, box_id = golden_hit_id.split('_')
        result = session.query(MTurkBox.result).\
            join(MTurkBox.hit).\
            join(GoldenHit, GoldenHit.hit_id == BoxHit.hit_id).\
            filter(GoldenHit.golden_hit_id == golden_hit_id,
                   MTurkBox.box_id == box_id).scalar()
        return result
    def prefetch_wplrs(self):
        self.wplr_lookup = defaultdict(set)
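        # page_id -> set of label ids with a web page label result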
        query = session.query(WebPageLabelResult.page_id,
                              WebPageLabelResult.label_id)
        query = query.filter(WebPageLabelResult.page_id.in_(self.page_ids))

        if not self.all_labels:
            label_ids = Label.all_descendant_ids(self.target_label_ids)
            query = query.filter(WebPageLabelResult.label_id.in_(label_ids))

        for page_id, label_id in query:
            self.wplr_lookup[page_id].add(label_id)
    def _get_job_status(self):
        """Get status of outstanding HITs and unprocessed URLs for a training job."""
        total_hits = session.query(BoxHit).filter_by(
            training_job_id=self.id).count()
        num_hits_left = session.query(BoxHit).filter_by(
            training_job_id=self.id, outstanding=True).count()
        total_urls = self.num_urls
        num_urls_left = session.query(VideoTrainingURL).filter_by(
            job=self, processed=False).count()
        faces_obtained = MTurkBox.query.filter_by(
            label=self.evaluator.target_label, result=True).count()
        return '\n'.join([
            '------------- Stats for Job ID: %s -------------' % str(self.id),
            'Job for Label        : %s' % self.label.name,
            'Total URLs           : %d' % total_urls,
            'Total HITs           : %d' % total_hits,
            'Unprocessed URLs     : %d' % num_urls_left,
            'Outstanding HITs     : %d' % num_hits_left,
            'Job Finish Status    : %s' % self.finished,
            'Faces Obtained       : %d' % faces_obtained,
        ]) + '\n'
    def get_result(self, golden_hit_id):
        golden_hit_id, video_id, timestamp = golden_hit_id.split('_')
        # We need the result of the MTurkImage with this video_id and
        # timestamp belonging to the ImageHit whose hit_id the GoldenHit
        # with golden_hit_id points at.
        result = session.query(MTurkImage.result).\
            join(MTurkImage.hit).\
            join(GoldenHit, GoldenHit.hit_id == ImageHit.hit_id).\
            filter(GoldenHit.golden_hit_id == golden_hit_id,
                   MTurkImage.timestamp == timestamp,
                   MTurkImage.video_id == video_id).scalar()
        return result
    @classmethod
    def _generate_QA_numbers_query(cls, start_date, end_date, hit_type,
                                   exclude_table):
        count_bools = lambda expr: cast(func.sum(expr), Integer)
        true_positives = case([(hit_type.result == True, 1)], else_=0)
        total = case([(hit_type.result != None, 1)], else_=0)
        conflicts = case([(hit_type.result == None, 1)], else_=0)
        query = session.query(hit_type.label_id, count_bools(true_positives),
                              count_bools(total), count_bools(conflicts))
        query = query.outerjoin(exclude_table,
                                exclude_table.hit_id == hit_type.hit_id)
        query = query.filter(exclude_table.hit_id == None)
        query = query.filter(hit_type.timestamp.between(start_date, end_date))
        query = query.filter(hit_type.outstanding == False)
        query = query.group_by(hit_type.label_id)
        return query
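
    # Hypothetical usage (assumes the mapped hit class exposes .result,
    # .hit_id, .label_id, .timestamp and .outstanding, and that the exclude
    # table has a .hit_id column):
    #   query = cls._generate_QA_numbers_query(
    #       start_date, end_date, VideoHit, GoldenHit)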
    def prefetch_tdrs(self):
        """Grab all Text Detector Results for our pages"""
        # lookup containing text detector results for pages
        self.tdr_lookup = defaultdict(set)

        if self.clf_target_ids:
            tdr = TextDetectorResult
            query = session.query(tdr.page_id, tdr.clf_target_id)
            query = query.filter(tdr.page_id.in_(self.page_ids))
            query = query.filter(tdr.clf_target_id.in_(self.clf_target_ids))

            for page_id, clf_target_id in query:
                self.tdr_lookup[page_id].add(clf_target_id)
    @classmethod
    def results_to_qa(cls, min_date):
        """Find image detector results since min_date for Image-QA-enabled
        classifier targets."""
        idr, ti = ImageDetectorResult, TrainingImage
        results = defaultdict(set)

        for clf_target in cls.enabled_clf_targets():
            # image_qa_count caps the number of videos we sample from, so
            # the actual number of images can be larger
            max_videos = clf_target.image_qa_count
            query = session.query(idr.video_id.distinct())
            query = query.filter(idr.clf_target_id == clf_target.id,
                                 idr.timestamp > min_date)
            myvids = set()

            for (vid,) in query:
                if len(myvids) >= max_videos:
                    break
                s3_images = Video.get(vid).s3_timestamps()
                if s3_images:
                    # use a separate name so we don't clobber the video id
                    # query we are still iterating over
                    img_query = session.query(idr.video_id, idr.time).filter(
                        idr.video_id == vid,
                        idr.clf_target_id == clf_target.id,
                        idr.time.in_(s3_images))
                    # filter out images used to train the detector
                    # (a set for fast membership tests)
                    training_images = set(
                        session.query(ti.video_id, ti.timestamp)
                        .filter_by(detector_id=clf_target.clf.id).all())
                    for vid_id, ts in img_query:
                        if (vid_id, ts) not in training_images:
                            mtb = MTurkImage.query.filter_by(
                                video_id=vid_id, timestamp=ts,
                                label_id=clf_target.target_label_id).first()
                            if not mtb:
                                results[clf_target].add((vid_id, ts))
                                myvids.add(vid_id)

        return results
    def prefetch_awplrs(self):
        """Grab all admin web page label results for our pages"""
        self.awplr_lookup = defaultdict(list)
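        # page_id -> list of (label_id, result) pairs set by admins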

        query = session.query(AdminWebPageLabelResult.page_id,
                              AdminWebPageLabelResult.label_id,
                              AdminWebPageLabelResult.result)
        query = query.filter(AdminWebPageLabelResult.page_id.in_(
            self.page_ids))
        if not self.all_labels:
            query = query.filter(
                AdminWebPageLabelResult.label_id.in_(self.target_label_ids))

        for page_id, label_id, result in query:
            self.awplr_lookup[page_id].append((label_id, result))
    @classmethod
    def _generate_QA_numbers(cls, cls_to_qa_prop, cls_to_qa, join_name,
                             target_result, start_date, end_date):
        count_bools = lambda expr: func.count(func.nullif(expr, 0))
        trues_right = count_bools(
            and_(target_result == True, cls.expected_result == True))
        trues_total = count_bools(
            and_(cls.expected_result == True, target_result != None))
        conflict_total = count_bools(target_result == None)
        query = session.query(
            cls_to_qa_prop, trues_right, trues_total, conflict_total)
        query = query.filter(
            cls.timestamp >= start_date, cls.timestamp < end_date)
        query = query.join(join_name).join(
            cls.hit).filter_by(outstanding=False)
        return query.group_by(cls_to_qa.id).order_by(cls_to_qa.id).all()
    def prefetch_labels(self):
        """Populate self.base_label_ids"""
        self.base_label_ids = set()
        self.label_decision_thresholds = {}

        if not self.all_labels:
            # use only labels that can actually produce results,
            # i.e. have at least one weighted keyword, weighted label,
            # or weighted classifier target
            query = session.query(Label.id.distinct()).filter(
                Label.id.in_(self.target_label_ids))
            query = query.outerjoin(Label.weighted_keywords)
            query = query.outerjoin(Label.weighted_labels)
            query = query.outerjoin(Label.weighted_clf_targets)
            query = query.filter((WeightedKeyword.keyword_id != None)
                                 | (WeightedLabel.child_id != None)
                                 | (WeightedClfTarget.clf_target_id != None))
            self.base_label_ids.update(row[0] for row in query)
        else:
            query = session.query(Label.id.distinct())
            query = query.outerjoin(Label.weighted_keywords)
            query = query.outerjoin(Label.weighted_clf_targets)
            query = query.filter((WeightedKeyword.keyword_id != None)
                                 | (WeightedClfTarget.clf_target_id != None))
            label_ids = [row[0] for row in query]
            if label_ids:
                self.base_label_ids.update(Label.all_ancestor_ids(label_ids))

        self.descendant_label_ids = (
            Label.all_descendant_ids(self.target_label_ids) -
            self.target_label_ids)

        # fetch all label thresholds, since it is cheap
        query = session.query(Label.id, Label.decision_threshold)
        for label_id, thresh in query:
            self.label_decision_thresholds[label_id] = thresh
    @classmethod
    def preroll_results_to_qa_for_label(cls, label_id, page_ids_to_ignore):
        label_to_qa = Label.get(label_id)

        query = session.query(WebPageInventory.page_id.distinct())
        # outerjoin + NOT NULL filter keeps only pages that have videos
        query = query.outerjoin(
            VideoOnPage, VideoOnPage.page_id == WebPageInventory.page_id)
        query = query.filter(VideoOnPage.page_id != None)
        if page_ids_to_ignore:
            query = query.filter(
                ~WebPageInventory.page_id.in_(page_ids_to_ignore))
        query = query.join(
            WebPageLabelResult,
            WebPageLabelResult.page_id == WebPageInventory.page_id)
        query = query.filter(WebPageLabelResult.label_id == label_to_qa.id)
        query = query.order_by(func.rand())
        query = query.limit(label_to_qa.screenshot_count)

        return [page_id for (page_id,) in query]
    def prefetch_last_detection_times(self):
        self.last_detections = defaultdict(lambda: None)
        self.last_text_detections = defaultdict(lambda: None)

        for page_id in self.page_ids:
            vcr_videos = self.active_videos[page_id]

            last_detections = [vv['last_detection'] for vv in vcr_videos]
            # Oldest detection across the page's videos; None if there are
            # no videos or any video lacks a detection time.
            if last_detections == [] or None in last_detections:
                last_detection = None
            else:
                last_detection = min(last_detections)
            self.last_detections[page_id] = last_detection

        query = session.query(WebPage.id, WebPage.text_detection_update)
        query = query.filter(WebPage.id.in_(self.page_ids))
        self.last_text_detections.update(query)
    def prefetch_vdrs(self):
        # lookup containing true detectors for videos
        self.vdr_lookup = defaultdict(set)

        # gather all active videos
        video_ids = set()
        for video_list in self.active_videos.values():
            for vcr_video in video_list:
                video_ids.add(vcr_video['video_id'])

        # get all video detector results
        if self.clf_target_ids and video_ids:
            vdr = VideoDetectorResult
            query = session.query(vdr.video_id, vdr.clf_target_id)
            query = query.filter(vdr.video_id.in_(video_ids))
            query = query.filter(vdr.clf_target_id.in_(self.clf_target_ids))
            for video_id, clf_target_id in query:
                self.vdr_lookup[video_id].add(clf_target_id)
    def prefetch_keywords(self):
        """Populate self.keywords"""
        from affine.detection.nlp.keywords.keyword_matching import \
            PageKeywordMatcher

        self.keyword_matcher = PageKeywordMatcher()
        self.weighted_keywords = defaultdict(list)
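        # label_id -> [(keyword_id, title_weight, body_weight), ...]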

        query = session.query(WeightedKeyword)
        query = query.options(joinedload(WeightedKeyword.keyword))
        if not self.all_labels:
            query = query.filter(
                WeightedKeyword.label_id.in_(self.target_label_ids))
        for wk in query:
            kw = wk.keyword
            self.weighted_keywords[wk.label_id].append(
                (kw.id, wk.title_weight, wk.body_weight))
            self.keyword_matcher.add_keyword(kw.id, kw.text)
    def process_hit(self, hit_id, assignments):
        processed_results = []
        logger.info('processing box hit %s', hit_id)
        # get a list of the box ids for this hit from the first assignment in
        # the assignments
        box_ids = assignments[0]['box_ids'][0].split('_')
        # combine all the assignments' results into a single list
        all_clicked_boxes = reduce(lambda x, y: list(set(x) | set(y.keys())),
                                   assignments)
        all_clicked_boxes = [
            k.replace('box_', '') for k in all_clicked_boxes
            if self.box_pat.search(k)
        ]

        # boxes nobody clicked are unanimous (e.g. 3/3) False results
        false_box_ids = set(box_ids) - set(all_clicked_boxes)
        for box_id in false_box_ids:
            processed_results.append(
                (hit_id, int(box_id), self.target_label_id, False))
        true_results = defaultdict(int)
        for assignment in assignments:
            for box_id in all_clicked_boxes:
                munged = "box_%s" % str(box_id)
                true_results[box_id] += 1 if munged in assignment else 0
        true_box_ids = set()
        for box_id, num_true in true_results.iteritems():
            num_false = self.max_assignments - num_true
            if num_true >= self.match_threshold:
                result = True
                true_box_ids.add(box_id)
            elif num_false >= self.match_threshold:
                result = False
            else:
                result = None
            processed_results.append(
                (hit_id, int(box_id), self.target_label_id, result))
        # update video results if true face boxes are found
        if true_box_ids:
            query = session.query(Box.video_id.distinct()).filter(
                Box.id.in_(true_box_ids),
                or_(Box.box_type == 'Face', Box.box_type == 'Logo'))
            for (vid, ) in query:
                self.save_video_result(vid, True)
        return processed_results
    def prefetch_avlrs(self):
        """Grab all admin video label results for our pages"""
        self.avlr_lookup = defaultdict(list)

        query = session.query(VideoOnPage.page_id,
                              AdminVideoLabelResult.label_id,
                              AdminVideoLabelResult.result)
        query = query.join(
            (AdminVideoLabelResult,
             VideoOnPage.video_id == AdminVideoLabelResult.video_id))
        query = query.filter(VideoOnPage.active == True,
                             VideoOnPage.is_preroll == False,
                             VideoOnPage.page_id.in_(self.page_ids))
        if not self.all_labels:
            query = query.filter(
                AdminVideoLabelResult.label_id.in_(self.target_label_ids))

        for page_id, label_id, result in query:
            self.avlr_lookup[page_id].append((label_id, result))
    @classmethod
    def register_prev_qa(cls):
        tdr, ph, wp = TextDetectorResult, PageHit, WebPage
        base_query = session.query(
            tdr.page_id, tdr.clf_target_id, ph.hit_id).\
            filter(ph.page_id == tdr.page_id)
        base_query = base_query.join(wp, tdr.page_id == wp.id)
        for clf_target in cls.enabled_clf_targets():
            query = base_query.filter(
                tdr.clf_target_id == clf_target.id,
                wp.text_detection_update > clf_target.clf.updated_at,
                ph.label_id == clf_target.target_label_id)
            query = query.outerjoin(
                cls, and_(cls.clf_target_id == clf_target.id,
                          cls.detector_version == clf_target.clf.updated_at,
                          cls.page_id == tdr.page_id))
            for i in query.filter(cls.page_id == None):
                cls(page_id=i.page_id,
                    detector_version=clf_target.clf.updated_at,
                    clf_target_id=clf_target.id, hit_id=i.hit_id)
        session.flush()
    def prefetch_wt_clf_targets(self):
        self.wt_clf_target_lookup = defaultdict(list)
        self.clf_target_ids = set()
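        # label_id -> [(clf_target_id, weight), ...], enabled classifiers only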

        cols = [
            WeightedClfTarget.label_id, WeightedClfTarget.clf_target_id,
            WeightedClfTarget.weight
        ]
        query = session.query(*cols)
        if not self.all_labels:
            query = query.filter(
                WeightedClfTarget.label_id.in_(self.target_label_ids))
        query = query.join(WeightedClfTarget.clf_target)
        query = query.join(ClassifierTarget.clf)
        query = query.filter(AbstractClassifier.enabled_since != None)

        for label_id, clf_target_id, weight in query:
            self.wt_clf_target_lookup[label_id].append((clf_target_id, weight))
            self.clf_target_ids.add(clf_target_id)
    @classmethod
    def non_preroll_results_to_qa_for_label(cls, label_id, page_ids_to_ignore):
        label_to_qa = Label.get(label_id)
        # Set the end date so that no pages ingested today get QA'd, giving
        # all stages of ingestion time to complete.
        end_date = datetime.utcnow() - timedelta(days=1)

        query = session.query(WebPageInventory.page_id.distinct())
        query = query.join(WebPage, WebPageInventory.page_id == WebPage.id)
        query = query.filter(WebPage.last_crawled_video <= end_date)
        query = query.outerjoin(
            VideoOnPage, VideoOnPage.page_id == WebPageInventory.page_id)
        query = query.filter(VideoOnPage.page_id == None)
        if page_ids_to_ignore:
            query = query.filter(
                ~WebPageInventory.page_id.in_(page_ids_to_ignore))
        query = query.join(
            WebPageLabelResult,
            WebPageLabelResult.page_id == WebPageInventory.page_id)
        query = query.filter(WebPageLabelResult.label_id == label_to_qa.id)
        query = query.order_by(func.rand())
        query = query.limit(label_to_qa.non_preroll_qa_count)

        return [page_id for (page_id,) in query]
    @classmethod
    def results_to_qa_for_label(cls, label_id):
        logger.info("Gathering results for label_id : %s", label_id)
        label_results = []

        ignore_video_ids = cls.get_ignore_video_ids(label_id)
        ignore_page_ids = cls.get_ignore_page_ids(label_id)
        query = session.query(WebPageInventory.video_id,
                              WebPageInventory.page_id)
        query = query.join(
            WebPageLabelResult,
            WebPageLabelResult.page_id == WebPageInventory.page_id)
        query = query.distinct(WebPageInventory.video_id).filter(
            WebPageLabelResult.label_id == label_id)
        if ignore_video_ids:
            query = query.filter(
                ~WebPageInventory.video_id.in_(ignore_video_ids))
        if ignore_page_ids:
            query = query.filter(
                ~WebPageInventory.page_id.in_(ignore_page_ids))

        query = query.filter(WebPageInventory.video_id != 0).group_by(
            WebPageInventory.video_id)
        vid_page_ids = query.order_by(func.rand()).limit(
            Label.get(label_id).collage_count).all()

        for video_id, page_id in vid_page_ids:
            label_results.append((label_id, video_id, page_id, True))

        return label_results