コード例 #1
0
    def create_training_urls(self):
        """Gather training URLs for this job and enqueue the untracked ones.

        The requested ``num_urls`` is split between YouTube and Dailymotion
        according to ``youtube_proportion`` and ``dailymotion_proportion``
        (both read as percentages); 5min receives the remainder. Every URL
        the ingestion-status store does not know yet is recorded there with
        status "Queued" and written to the download queue. A
        ``VideoTrainingURL`` is created for every URL, and ``num_urls`` is
        overwritten with the number of distinct URLs that were found.
        """
        requested = self.num_urls
        # Multiply before dividing: ``proportion / 100 * n`` truncates to 0
        # for integer proportions under Python 2 and, with floats, can land
        # just below the intended whole number (29 / 100 * 100 -> 28.99...).
        num_youtube_urls = int(self.youtube_proportion * requested / 100)
        num_dailymotion_urls = int(
            self.dailymotion_proportion * requested / 100)
        num_5min_urls = requested - num_youtube_urls - num_dailymotion_urls

        urls = set()
        urls.update(
            VideoUrls.get_youtube_urls(self.search_kw, num_youtube_urls))
        urls.update(VideoUrls.get_5min_urls(self.search_kw, num_5min_urls))
        urls.update(
            VideoUrls.get_dailymotion_urls(self.search_kw,
                                           num_dailymotion_urls))

        dynamo = DynamoIngestionStatusClient()
        download_queue = sqs.get_queue(config.sqs_download_queue_name())
        for url in map(parse_url, urls):
            # Only URLs unknown to the status store are queued for download.
            if dynamo.get(url) is None:
                item_to_enqueue = {
                    "url": url,
                    "status": "Queued",
                    "download_stage": "Text"
                }
                dynamo.put(item_to_enqueue)
                sqs.write_to_queue(download_queue, item_to_enqueue)
            VideoTrainingURL(job=self, url=url)

        self.num_urls = len(urls)
        session.flush()

        print('Ingested %s urls to the download queue' % len(urls))
コード例 #2
0
    def submit_hits(cls, days_since_today=30):
        """Post MTurk HITs for recently judged images that have none yet.

        Args:
            days_since_today: Size of the look-back window in days; the
                resulting UTC cut-off is passed to ``results_to_qa``.

        Returns:
            A ``(ImageHit, num_hits_submitted)`` tuple: the hit class and
            the number of HITs created by this call.
        """
        cutoff = datetime.utcnow() - timedelta(days=days_since_today)
        submitted = 0
        for clf_target, images in cls.results_to_qa(cutoff).iteritems():
            label_id = clf_target.target_label_id
            evaluator = ClickableImageEvaluator.query.filter_by(
                target_label_id=label_id).one()

            # Only images without an MTurkImage row still need a HIT.
            pending = []
            for video_id, timestamp in images:
                existing = MTurkImage.query.filter_by(
                    video_id=video_id, timestamp=timestamp,
                    label_id=label_id).first()
                if existing:
                    continue
                pending.append([video_id, timestamp])

            batch_size = cls.NUM_IMAGES_PER_HIT
            for start in xrange(0, len(pending), batch_size):
                batch = pending[start:start + cls.NUM_IMAGES_PER_HIT]
                image_hit = ImageHit(
                    hit_id=evaluator.create_hit(image_ids=batch))
                submitted += 1
                # Flush before reading image_hit.id below (presumably the
                # id is only assigned once the row has been flushed).
                session.flush()

                for video_id, timestamp in batch:
                    mturk_image = MTurkImage(
                        video_id=video_id, timestamp=timestamp,
                        image_hit_id=image_hit.id, label_id=label_id)
                    cls(clf_target_id=clf_target.id, mturk_image=mturk_image)
                session.flush()
        return ImageHit, submitted
コード例 #3
0
 def get_or_create(cls, target_label):
     """Return the evaluator for ``target_label``, creating it if absent.

     A newly created evaluator takes its name from ``target_label.name``;
     the session is flushed only when a new evaluator was created.
     """
     found = cls.query.filter_by(target_label=target_label).first()
     if not found:
         found = cls(name=target_label.name, target_label=target_label)
         session.flush()
     return found
コード例 #4
0
 def get_or_create(cls, remote_id):
     """Return the instance stored for ``remote_id``, creating it if needed.

     ``remote_id`` is first passed through ``parse_url``; the lookup uses the
     SHA-1 hex digest of that value. The session is flushed whether or not a
     new instance had to be created.
     """
     parsed_id = parse_url(remote_id)
     digest = sha1(parsed_id).hexdigest()
     page = cls.query.filter_by(remote_id_sha1=digest).first()
     if not page:
         page = cls(remote_id=parsed_id, remote_id_sha1=digest)
     session.flush()
     return page
コード例 #5
0
    def submit_hits(cls, days=30):
        """Send recent video detector results to MTurk for QA.

        Args:
            days: Size of the look-back window in days, counted back from
                the local current time.

        Returns:
            A ``(VideoHit, num_hits_submitted)`` tuple: the hit class and
            the number of HITs created by this call.
        """
        cutoff = datetime.now() - timedelta(days)
        evaluator_cache = {}
        submitted = 0
        for video_id, clf_target in cls.results_to_qa(cutoff):
            target_id = clf_target.id
            # One evaluator lookup per classifier target.
            if target_id not in evaluator_cache:
                lookup = VideoCollageEvaluator.query.filter_by(
                    target_label_id=clf_target.target_label_id)
                evaluator_cache[target_id] = lookup.one()
            evaluator = evaluator_cache[target_id]
            label_id = evaluator.target_label_id

            video_hit = VideoHit.query.filter_by(
                video_id=video_id, label_id=label_id).first()
            if not video_hit:
                # No HIT for this video/label pair yet: create one.
                video_hit = VideoHit(
                    hit_id=evaluator.create_hit(video_id=video_id),
                    label_id=label_id,
                    video_id=video_id)
                submitted += 1
                session.flush()

            already_recorded = cls.query.filter_by(
                video_id=video_id, clf_target_id=target_id).count()
            if not already_recorded:
                cls(video_id=video_id, clf_target_id=target_id,
                    hit_id=video_hit.hit_id, expected_result=True)

        session.flush()
        return VideoHit, submitted
コード例 #6
0
    def update_status(cls):
        """Refresh the completion state of unfinished jobs and print stats.

        For each unfinished job, URLs that the ingestion-status store does
        not know or reports as 'Failed' are marked processed. The job is
        marked finished when it has no outstanding ``BoxHit`` and no
        remaining unprocessed URL. Per-job totals are printed either way.
        """
        for job in cls.query.filter(cls.finished == False):
            job_id = job.id
            outstanding_hits = session.query(BoxHit).filter_by(
                training_job_id=job_id, outstanding=True).count()
            pending_urls = session.query(VideoTrainingURL).filter_by(
                training_job_id=job_id, processed=False)
            dynamo = DynamoIngestionStatusClient()
            remaining_urls = 0
            for training_url in pending_urls:
                status_item = dynamo.get(training_url.url)
                if (status_item is not None and
                        status_item['status'] != 'Failed'):
                    remaining_urls += 1
                else:
                    # will never be processed, so ignore for our purposes
                    training_url.processed = True
            if outstanding_hits + remaining_urls == 0:
                job.finished = True
                print('*** Job ID: %s is complete ***' % str(job_id))

            print('------------- Stats for Job ID: %s -------------' % str(
                job_id))
            total_urls = VideoTrainingURL.query.filter_by(
                training_job_id=job_id).count()
            print('Total URLs      : %i' % total_urls)
            total_hits = BoxHit.query.filter_by(
                training_job_id=job_id).count()
            print('Total HITs      : %i' % total_hits)
            if not job.finished:
                print('unprocessed URLs: %i' % remaining_urls)
                print('outstanding HITs: %i\n' % outstanding_hits)
        session.flush()
コード例 #7
0
    def submit_golden_hits(n_hits, n_lookback):
        """Submit golden hits.

        Takes the N_LOOKBACK most recently created golden hit candidates,
        orders them by how many times each has already been submitted as a
        golden hit (fewest first), and submits N_HITS duplicates, cycling
        through the selected candidates when there are fewer than N_HITS.

        Args:
            n_hits: Number of golden hits submissions.
            n_lookback: Number of distinct hits used for submission.

        Raises:
            AssertionError: No candidate golden hits
        """
        candidates = session.query(GoldenHitCandidate.hit_id)
        assert candidates.count() > 0, "No candidate golden hits"

        # Narrow to the newest candidates first; from_self() then lets the
        # join and grouping below operate on that limited selection.
        newest = candidates.order_by(
            GoldenHitCandidate.created_at.desc()).limit(n_lookback)
        newest = newest.from_self()
        least_submitted = newest.outerjoin(
            GoldenHit, GoldenHitCandidate.hit_id == GoldenHit.hit_id
        ).group_by(
            GoldenHitCandidate.hit_id
        ).order_by(
            func.count(GoldenHit.hit_id).asc()
        ).limit(n_hits)

        selected_ids = []
        for (candidate_id, ) in least_submitted:
            selected_ids.append(candidate_id)

        # Resolve every hit up front, then create the duplicates.
        hits = [get_hit_from_hit_id(hit_id)
                for hit_id in islice(cycle(selected_ids), n_hits)]
        for hit in hits:
            golden_id = MechanicalTurkEvaluator.create_duplicate_hit(hit)
            GoldenHit(golden_hit_id=golden_id, hit_id=hit.hit_id)
        session.flush()
コード例 #8
0
 def enable_qa(cls, label, collage_count, **kw):
     """Turn on QA for ``label`` and configure its collage evaluator.

     Args:
         label: Label to enable; ``qa_enabled`` is set to True on it.
         collage_count: Value stored in ``label.collage_count``.
         **kw: Attribute values applied to the label's
             ``VideoCollageEvaluator`` (fetched or created).
     """
     evaluator = VideoCollageEvaluator.get_or_create(label)
     label.qa_enabled = True
     label.collage_count = collage_count
     for attr_name in kw:
         setattr(evaluator, attr_name, kw[attr_name])
     session.flush()
コード例 #9
0
 def enable_qa(cls, clf_target, qa_count, **kw):
     """Enable this class's kind of QA for a classifier target.

     Args:
         clf_target: ``ClassifierTarget`` to enable. The attribute named by
             ``cls.QA_TYPE`` is set to True on it and the attribute named by
             ``cls.QA_COUNT_TYPE`` is set to ``qa_count``.
         qa_count: Value stored under ``cls.QA_COUNT_TYPE``.
         **kw: Attribute values applied to the evaluator obtained from
             ``cls.EVALUATOR_CLS.get_or_create``.

     Raises:
         AssertionError: ``clf_target`` is not a ``ClassifierTarget``.
     """
     assert isinstance(clf_target, ClassifierTarget), (
         "Can only enable ClassifierTargets, got %s" % clf_target)
     evaluator = cls.EVALUATOR_CLS.get_or_create(clf_target.target_label)
     for attr_name, attr_value in kw.items():
         setattr(evaluator, attr_name, attr_value)
     for flag_name, flag_value in ((cls.QA_TYPE, True),
                                   (cls.QA_COUNT_TYPE, qa_count)):
         setattr(clf_target, flag_name, flag_value)
     session.flush()
コード例 #10
0
 def enable_qa(cls, label, screenshot_count, non_preroll_qa_count, **kw):
     """Turn on page QA for ``label`` and configure its text evaluator.

     Args:
         label: Label to enable; ``page_qa_enabled`` is set to True on it.
         screenshot_count: Value stored in ``label.screenshot_count``.
         non_preroll_qa_count: Value stored in
             ``label.non_preroll_qa_count``.
         **kw: Attribute values applied to the label's
             ``WebPageTextEvaluator`` (fetched or created).
     """
     evaluator = WebPageTextEvaluator.get_or_create(label)
     label.page_qa_enabled = True
     label.screenshot_count = screenshot_count
     label.non_preroll_qa_count = non_preroll_qa_count
     for option in kw:
         setattr(evaluator, option, kw[option])
     session.flush()
コード例 #11
0
 def update_mturk_results(cls, mt_results):
     """Store ingested MTurk results on the matching image rows.

     Args:
         mt_results: Iterable of ``(hit_id, video_id, timestamp, label_id,
             result)`` tuples. Rows are looked up by video id, timestamp
             and label id; ``hit_id`` is only used in the log message.

     A tuple with no matching row is logged as a warning and skipped.
     Matching rows get ``result`` stored and their hit marked as no longer
     outstanding. The session is flushed once at the end.
     """
     for hit_id, video_id, timestamp, label_id, result in mt_results:
         mturk_image = cls.query.filter_by(
             video_id=video_id, timestamp=timestamp,
             label_id=label_id).first()
         if mturk_image is None:
             # Lazy %-args, as the sibling update_mturk_results methods do.
             logger.warning(
                 "MTurkImage not found for video_id:%s, timestamp:%s, "
                 "ImageHit.hit_id:%s", video_id, timestamp, hit_id)
             continue
         mturk_image.result = result
         mturk_image.hit.outstanding = False
     session.flush()
コード例 #12
0
 def update_mturk_results(cls, mt_results):
     """Store ingested MTurk results on the matching box rows.

     Args:
         mt_results: Iterable of ``(hit_id, box_id, label_id, result)``
             tuples.

     A tuple with no matching row (by box id and label id) is logged and
     skipped. Otherwise the result is stored and every ``BoxHit`` with that
     ``hit_id`` is marked as no longer outstanding.
     """
     not_found = ('MTurkBox not found for box_id:%s, label_id:%s '
                  'and BoxHit.hit_id:%s')
     for hit_id, box_id, label_id, result in mt_results:
         mturk_box = cls.query.filter_by(
             box_id=box_id, label_id=label_id).first()
         if mturk_box:
             mturk_box.result = result
             # Query-level UPDATE issued without synchronizing the session.
             matching_hits = BoxHit.query.filter_by(hit_id=hit_id)
             matching_hits.update({"outstanding": False},
                                  synchronize_session=False)
             continue
         logger.warn(not_found % (box_id, label_id, hit_id))
     session.flush()
コード例 #13
0
 def update_mturk_results(cls, mt_results):
     """Take ingested results from MTurk and update results on the DB.

     Args:
         mt_results: Iterable of ``(hit_id, video_id, label_id, result)``
             tuples. Only ``hit_id`` and ``result`` are used here.

     A ``hit_id`` with no ``VideoHit`` is logged as a warning and skipped.
     Otherwise the hit's result is stored and it is marked as no longer
     outstanding. The session is flushed once at the end.
     """
     for hit_id, video_id, label_id, result in mt_results:
         video_hit = VideoHit.query.filter_by(hit_id=hit_id).first()
         if video_hit is None:
             logger.warning("Hit not found %s", hit_id)
         else:
             video_hit.result = result
             video_hit.outstanding = False
     session.flush()
コード例 #14
0
 def update_mturk_results(cls, mt_results):
     """Copy ingested MTurk results onto the matching ``PageHit`` rows.

     Args:
         mt_results: Iterable of ``(hit_id, page_id, label_id, result)``
             tuples. Only ``hit_id`` and ``result`` are used here.

     A ``hit_id`` with no ``PageHit`` is logged and skipped; the session is
     flushed once at the end.
     """
     for hit_id, page_id, label_id, result in mt_results:
         page_hit = PageHit.query.filter_by(hit_id=hit_id).first()
         if page_hit is not None:
             page_hit.result = result
             page_hit.outstanding = False
             continue
         logger.warn("Hit not found %s", hit_id)
     session.flush()
コード例 #15
0
    def create_hit(self, **kwargs):
        """Submit a task to MTurk and return the id of the created HIT.

        ``kwargs`` are forwarded to ``format_data``; its result is rendered
        with ``generate_html`` and submitted via ``MTurkUtils.submit_hit``
        using this evaluator's settings. When the evaluator type is
        ``'page_text'``, a submission failing with ``MTurkRequestError`` or
        ``UnicodeEncodeError`` is retried once with ``process_title=True``.

        Returns:
            The value returned by ``MTurkUtils.submit_hit``.

        Raises:
            Exception: Whatever the submission raised. The failure is logged
                first and, unless its traceback mentions
                ``AWS.MechanicalTurk.InsufficientFunds``, also recorded as
                an ``MTurkHitFailure``.
        """
        # NOTE(review): presumably reward_amt is stored in cents and MTurk
        # expects dollars -- confirm.
        reward_amt = self.reward_amt / 100.0

        def submit(**format_kwargs):
            # Single definition of the render-and-submit step, so the first
            # attempt and the retry cannot drift apart.
            template_data = self.format_data(**format_kwargs)
            hit_html = self.generate_html(**template_data)
            return MTurkUtils.submit_hit(
                hit_html,
                self.title,
                self.description,
                self.keywords,
                self.approval_delay,
                reward_amt,
                self.duration,
                self.lifetime,
                self.max_assignments,
                require_adult=self.require_adult,
                min_percent_approved=self.min_percent_approved,
                min_hits_approved=self.min_hits_approved,
                require_us=True)

        try:
            try:
                hit_id = submit(**kwargs)
            except (MTurkRequestError, UnicodeEncodeError):
                if self.evaluator_type != 'page_text':
                    raise
                hit_id = submit(process_title=True, **kwargs)
        except Exception:
            logger.info('HIT creation failed for %s' % kwargs)
            tb = traceback.format_exc() + '\n input kwargs: %s' % kwargs
            if 'AWS.MechanicalTurk.InsufficientFunds' not in tb:
                MTurkHitFailure(hit_id='Invalid HIT', message=tb)
                session.flush()
            raise

        logger.info('created %s' % hit_id)
        return hit_id
コード例 #16
0
 def update_on_demand_job_status(self, mt_results):
     """Record MTurk results on the matching ``MTurkOnDemandJob`` rows.

     Args:
         mt_results: Iterable of ``(hit_id, job_id, resource_id, label_id,
             result)`` tuples. Rows are looked up by resource id, job id
             and hit id.

     A tuple with no matching row is logged and skipped; the session is
     flushed once at the end.
     """
     # Imported here rather than at module level (presumably to avoid a
     # circular import -- confirm).
     from affine.model.mturk import MTurkOnDemandJob
     not_found = ("MTurkOnDemandJob not found for hit_id:%s, "
                  "job_id:%s, thumbnail:%s")
     for hit_id, job_id, resource_id, label_id, result in mt_results:
         on_demand_job = MTurkOnDemandJob.query.filter_by(
             resource_id=resource_id, job_id=job_id, hit_id=hit_id).first()
         if on_demand_job:
             on_demand_job.result = result
             on_demand_job.outstanding = False
         else:
             logger.warn(not_found % (hit_id, job_id, resource_id))
     session.flush()
コード例 #17
0
 def set_values(self, **kwargs):
     """Call ``func`` with relaxed settings when this is a mock evaluator.

     For a mock evaluator the qualification and threshold attributes are
     overwritten, the session is flushed and the 'affine.s3.bucket' config
     value is switched to 'affine' for the duration of the call. The
     original bucket value is always restored afterwards.

     Returns:
         Whatever ``func(self, **kwargs)`` returns.
     """
     original_bucket = config.get('affine.s3.bucket')
     try:
         if self.mock_evaluator:
             mock_settings = (
                 ('min_percent_approved', 0),
                 ('max_assignments', 1),
                 ('min_hits_approved', 0),
                 ('match_threshold', 1),
                 ('require_adult', False),
             )
             for attr_name, attr_value in mock_settings:
                 setattr(self, attr_name, attr_value)
             session.flush()
             config.set('affine.s3.bucket', 'affine')
         return func(self, **kwargs)
     finally:
         config.set('affine.s3.bucket', original_bucket)
コード例 #18
0
    def submit_hits(cls):
        """Submit one MTurk HIT per result returned by ``results_to_qa``.

        Each result's label id selects the ``WebPageTextEvaluator`` used to
        create the HIT (looked up once per label). A ``PageHit`` is created
        for every HIT and the session is flushed after each one.

        Returns:
            A ``(PageHit, num_hits_submitted)`` tuple: the hit class and
            the number of HITs created by this call.
        """
        evaluator_by_label = {}
        pending_results = cls.results_to_qa()
        submitted = 0
        for label_id, wp_id, expected_result in pending_results:
            if label_id not in evaluator_by_label:
                lookup = WebPageTextEvaluator.query.filter_by(
                    target_label_id=label_id)
                evaluator_by_label[label_id] = lookup.one()
            evaluator = evaluator_by_label[label_id]

            page_hit = PageHit(
                hit_id=evaluator.create_hit(page_id=wp_id),
                label_id=label_id,
                page_id=wp_id)
            submitted += 1
            session.flush()
        return PageHit, submitted
コード例 #19
0
 def register_prev_qa(cls):
     """Register existing page HITs against current classifier versions.

     For every enabled classifier target, selects text detector results
     whose page already has a ``PageHit`` for the target's label and whose
     page ``text_detection_update`` is later than the classifier's
     ``updated_at``. For each such result that has no row of this class for
     that target, page and classifier version, a row is created that
     reuses the existing hit id.
     """
     detected_with_hit = session.query(
         TextDetectorResult.page_id,
         TextDetectorResult.clf_target_id,
         PageHit.hit_id
     ).filter(
         PageHit.page_id == TextDetectorResult.page_id
     ).join(
         WebPage, TextDetectorResult.page_id == WebPage.id
     )
     for clf_target in cls.enabled_clf_targets():
         target_id = clf_target.id
         detector_version = clf_target.clf.updated_at
         candidates = detected_with_hit.filter(
             TextDetectorResult.clf_target_id == target_id,
             WebPage.text_detection_update > detector_version,
             PageHit.label_id == clf_target.target_label_id)
         already_registered = and_(
             cls.clf_target_id == target_id,
             cls.detector_version == detector_version,
             cls.page_id == TextDetectorResult.page_id)
         # Outer join + NULL check keeps only results not yet registered.
         unregistered = candidates.outerjoin(
             cls, already_registered).filter(cls.page_id == None)
         for row in unregistered:
             cls(page_id=row.page_id, detector_version=detector_version,
                 clf_target_id=target_id, hit_id=row.hit_id)
     session.flush()
コード例 #20
0
    def create_with_evaluator_and_training_urls(cls, label_id, num_urls,
                                                search_kw, **evaluator_kwargs):
        """Create a training job, its evaluator if missing, and its URLs.

        Args:
            label_id: Label the job and evaluator are for.
            num_urls: Number of URLs stored on the job.
            search_kw: Search keyword stored on the job.
            **evaluator_kwargs: Passed to ``ClickableBoxEvaluator`` only
                when no evaluator exists yet for ``label_id``.

        Returns:
            The new ``TrainingJob``, after ``create_training_urls`` has
            been called on it.
        """
        evaluator = ClickableBoxEvaluator.query.filter_by(
            target_label_id=label_id).first()
        if not evaluator:
            label_name = Label.get(label_id).name
            evaluator = ClickableBoxEvaluator(
                name="Training Evaluator for %s" % label_name,
                target_label_id=label_id,
                **evaluator_kwargs)

        job = TrainingJob(label_id=label_id, evaluator=evaluator,
                          num_urls=num_urls, search_kw=search_kw)
        session.flush()
        job.create_training_urls()
        return job
コード例 #21
0
 def submit_hits(cls, days_since_today=30):
     """Post MTurk HITs for recently judged boxes that have none yet.

     Boxes that already have an ``MTurkBox`` for the label are linked to it
     directly; the remaining boxes are grouped into HITs of
     ``NUM_BOXES_PER_HIT``.

     Args:
         days_since_today: Size of the look-back window in days, counted
             back from the local current time.

     Returns:
         A ``(BoxHit, num_hits_submitted)`` tuple: the hit class and the
         number of HITs created by this call.
     """
     cutoff = datetime.now() - timedelta(days=days_since_today)
     submitted = 0
     for clf_target, box_ids in cls.results_to_qa(cutoff).iteritems():
         label_id = clf_target.target_label_id
         evaluator = ClickableBoxEvaluator.query.filter_by(
             target_label_id=label_id).one()

         pending = []
         for box_id in box_ids:
             existing = MTurkBox.query.filter_by(
                 label_id=label_id, box_id=box_id).first()
             if not existing:
                 pending.append(box_id)
                 continue
             cls(box_id=box_id, clf_target_id=clf_target.id,
                 mturk_box=existing)

         batch_size = cls.NUM_BOXES_PER_HIT
         for start in xrange(0, len(pending), batch_size):
             # Each BoxHit carries at most NUM_BOXES_PER_HIT boxes.
             batch = pending[start:start + cls.NUM_BOXES_PER_HIT]
             hit_id = evaluator.create_hit(box_ids=batch)
             session.flush()
             box_hit = BoxHit(hit_id=hit_id)
             submitted += 1
             # Flush before reading box_hit.id below (presumably the id is
             # only assigned once the row has been flushed).
             session.flush()
             for box_id in batch:
                 mturk_box = MTurkBox(box_id=box_id, box_hit_id=box_hit.id,
                                      label_id=label_id)
                 cls(box_id=box_id, clf_target_id=clf_target.id,
                     mturk_box=mturk_box)
         session.flush()
     return BoxHit, submitted
コード例 #22
0
 def submit_hits(cls, days=30):
     """Send recent web page results to MTurk for QA.

     ``register_prev_qa`` is run first. Then, for each result to QA, the
     existing ``PageHit`` for the page and label is reused or a new HIT is
     created, and a row of this class is added for it.

     Args:
         days: Size of the look-back window in days, counted back from the
             local current time.

     Returns:
         A ``(PageHit, num_hits_submitted)`` tuple: the hit class and the
         number of HITs created by this call.
     """
     cutoff = datetime.now() - timedelta(days)
     evaluator_cache = {}
     cls.register_prev_qa()
     submitted = 0
     for clf_target, page_id, clf_updated_at in cls.results_to_qa(cutoff):
         target_id = clf_target.id
         # One evaluator lookup per classifier target.
         if target_id not in evaluator_cache:
             lookup = WebPageTextEvaluator.query.filter_by(
                 target_label_id=clf_target.target_label_id)
             evaluator_cache[target_id] = lookup.one()
         evaluator = evaluator_cache[target_id]
         label_id = evaluator.target_label_id

         page_hit = PageHit.query.filter_by(
             page_id=page_id, label_id=label_id).first()
         if not page_hit:
             page_hit = PageHit(
                 hit_id=evaluator.create_hit(page_id=page_id),
                 label_id=label_id,
                 page_id=page_id)
             submitted += 1
         cls(page_id=page_id, detector_version=clf_updated_at,
             clf_target_id=target_id, hit_id=page_hit.hit_id)
         session.flush()
     return PageHit, submitted
コード例 #23
0
    def submit_hits(self):
        """Submit face-box HITs for this job's unprocessed training URLs.

        For every unprocessed ``VideoTrainingURL`` of this job whose web
        page exists, the longest active video is taken and the ids of its
        face boxes whose timestamp is among the video's S3 timestamps are
        collected; the URL is then marked processed. URLs without a web
        page are left untouched. The collected box ids are de-duplicated,
        sorted and submitted in HITs of ``NUM_BOXES_PER_HIT``; an
        ``MTurkBox`` is created for each box that has none for this label.

        Returns:
            A ``(BoxHit, num_hits_submitted)`` tuple: the hit class and
            the number of HITs created by this call.
        """
        box_ids = set()
        pending_urls = session.query(VideoTrainingURL).filter_by(
            training_job_id=self.id, processed=False)
        for url in pending_urls:
            wpage = WebPage.by_url(url.url)
            if wpage is None:
                continue
            videos = list(wpage.active_videos)
            if videos:
                # max() returns the first longest video, as the previous
                # stable descending sort did.
                video = max(videos, key=lambda v: v.length)
                images_in_s3 = video.s3_timestamps()
                box_ids.update(
                    box.id for box in video.face_boxes
                    if box.timestamp in images_in_s3)
            # NOTE(review): an earlier comment said this should only be set
            # once the video's face version is updated, but the code has
            # always set it for every URL with a web page -- confirm.
            url.processed = True

        # De-duplicate and sort once, after all URLs have been visited.
        boxes = sorted(box_ids)
        num_hits_submitted = 0
        for i in xrange(0, len(boxes), self.NUM_BOXES_PER_HIT):
            boxes_per_hit = boxes[i:i + self.NUM_BOXES_PER_HIT]
            hit_id = self.evaluator.create_hit(box_ids=boxes_per_hit)
            session.flush()
            box_hit = BoxHit(hit_id=hit_id, training_job_id=self.id)
            num_hits_submitted += 1
            for box_id in boxes_per_hit:
                if not MTurkBox.query.filter_by(
                        box_id=box_id, label_id=self.label_id).count():
                    MTurkBox(box_id=box_id, hit=box_hit,
                             label_id=self.label_id)
        session.flush()
        return BoxHit, num_hits_submitted
コード例 #24
0
 def get_or_create(cls, worker_id):
     """Return the ``MTurkWorker`` with ``worker_id``, creating it if absent.

     The session is flushed only when a new worker was created.
     """
     worker = MTurkWorker.query.filter_by(worker_id=worker_id).scalar()
     if worker is not None:
         return worker
     worker = MTurkWorker(worker_id=worker_id)
     session.flush()
     return worker
コード例 #25
0
 def get_or_create(cls, domain, **query_args):
     """Return the entry matching the lower-cased domain, creating it if absent.

     Args:
         domain: Domain name; lower-cased before it is used.
         **query_args: Further column values used both for the lookup and
             for constructing a new entry.

     The session is flushed whether or not a new entry was created.
     """
     criteria = dict(query_args, domain=domain.lower())
     entry = cls.query.filter_by(**criteria).first()
     if not entry:
         entry = cls(**criteria)
     session.flush()
     return entry
コード例 #26
0
 def block(self, reason=BLOCK_REASON):
     """Block this worker on MTurk and record when the block started.

     Args:
         reason: Passed to ``MTurkUtils.block_worker`` with the worker id.
     """
     worker_id = self.worker_id
     MTurkUtils.block_worker(worker_id, reason)
     # Timestamp is taken in UTC, after the MTurk call has returned.
     blocked_at = datetime.utcnow()
     self.blocked_since = blocked_at
     session.flush()
コード例 #27
0
 def unblock(self, reason=""):
     """Unblock this worker on MTurk and clear its block timestamp.

     Args:
         reason: Passed to ``MTurkUtils.unblock_worker`` with the worker id.
     """
     worker_id = self.worker_id
     MTurkUtils.unblock_worker(worker_id, reason)
     # Cleared only after the MTurk call has returned.
     self.blocked_since = None
     session.flush()