예제 #1
0
    def __init__(self,
                 label_id,
                 evaluator_type=ClickableImageEvaluator,
                 **kwargs):
        """Set up an MTurk image job for a label, creating its evaluator if needed.

        Args:
            label_id: id of the label this job targets.
            evaluator_type: evaluator class to use; must be a member of
                EVALUATOR_TYPES_SUPPORTED.
            **kwargs: extra keyword arguments forwarded to ``evaluator_type``
                only when a new evaluator has to be created.
        """
        super(MTurkImageJob, self).__init__()

        supported = EVALUATOR_TYPES_SUPPORTED
        assert evaluator_type in supported, (
            "evaluator_type has to be in %s" % str(supported))

        self.label_id = label_id
        self.finished = False
        self.evaluator_type = evaluator_type
        self.hit = evaluator_type.hit_type

        # Reuse an evaluator already bound to this label, otherwise make one.
        matching = evaluator_type.query.filter_by(target_label_id=label_id)
        if matching.count():
            evaluator = matching.one()
        else:
            label_name = Label.get(label_id).name
            evaluator = evaluator_type(
                name="MTurk Image Evaluator for %s" % label_name,
                target_label_id=label_id,
                **kwargs)
            session.flush()

        # Sandbox hosts (stage and dev) get the approved-hits minimum
        # lowered to zero.
        hostname = config.get("mturk_hostname")
        if 'sandbox' in hostname:
            evaluator.min_hits_approved = 0
            session.flush()
    def prefetch_wplrs(self):
        """Load existing web page label results into ``self.wplr_lookup``.

        ``self.wplr_lookup`` maps page_id -> set of label_ids for the pages in
        ``self.page_ids``. Unless ``self.all_labels`` is set, only labels
        returned by ``Label.all_descendant_ids(self.target_label_ids)`` are
        included.
        """
        lookup = self.wplr_lookup = defaultdict(set)

        page_col = WebPageLabelResult.page_id
        label_col = WebPageLabelResult.label_id
        rows = session.query(page_col, label_col).filter(
            page_col.in_(self.page_ids))

        if not self.all_labels:
            wanted_label_ids = Label.all_descendant_ids(self.target_label_ids)
            rows = rows.filter(label_col.in_(wanted_label_ids))

        for page_id, label_id in rows:
            lookup[page_id].add(label_id)
    def prefetch_labels(self):
        """Populate the label id sets and thresholds used for scoring.

        Sets ``self.base_label_ids``, ``self.descendant_label_ids`` and
        ``self.label_decision_thresholds``.
        """
        self.base_label_ids = set()
        self.label_decision_thresholds = {}

        # NOTE: ``!= None`` is deliberate; SQLAlchemy turns it into IS NOT NULL.
        if self.all_labels:
            # Every label with a weighted keyword or weighted clf-target,
            # expanded through Label.all_ancestor_ids.
            query = (session.query(Label.id.distinct())
                     .outerjoin(Label.weighted_keywords)
                     .outerjoin(Label.weighted_clf_targets)
                     .filter((WeightedKeyword.keyword_id != None)
                             | (WeightedClfTarget.clf_target_id != None)))
            seed_ids = [label_id for (label_id,) in query]
            if seed_ids:
                self.base_label_ids.update(Label.all_ancestor_ids(seed_ids))
        else:
            # Restrict to target labels that can actually produce results,
            # i.e. have at least one weighted keyword, weighted label or
            # weighted clf-target.
            query = (session.query(Label.id.distinct())
                     .filter(Label.id.in_(self.target_label_ids))
                     .outerjoin(Label.weighted_keywords)
                     .outerjoin(Label.weighted_labels)
                     .outerjoin(Label.weighted_clf_targets)
                     .filter((WeightedKeyword.keyword_id != None)
                             | (WeightedLabel.child_id != None)
                             | (WeightedClfTarget.clf_target_id != None)))
            self.base_label_ids.update(label_id for (label_id,) in query)

        all_descendants = Label.all_descendant_ids(self.target_label_ids)
        self.descendant_label_ids = all_descendants - self.target_label_ids

        # Thresholds are cheap to fetch, so load them for every label.
        threshold_rows = session.query(Label.id, Label.decision_threshold)
        self.label_decision_thresholds.update(
            (label_id, threshold) for label_id, threshold in threshold_rows)
예제 #4
0
    def preroll_results_to_qa_for_label(cls, label_id, page_ids_to_ignore):
        """Pick random inventory pages that have videos and carry the label.

        Args:
            label_id: id of the label being QAed.
            page_ids_to_ignore: page ids to exclude; skipped when empty.

        Returns:
            A list of at most ``label.screenshot_count`` distinct page ids.
        """
        label = Label.get(label_id)
        inventory_page_id = WebPageInventory.page_id

        # Outer join plus the NOT NULL filter keeps only pages with a
        # VideoOnPage row.
        query = (session.query(inventory_page_id.distinct())
                 .outerjoin(VideoOnPage,
                            VideoOnPage.page_id == inventory_page_id)
                 .filter(VideoOnPage.page_id != None))
        if page_ids_to_ignore:
            query = query.filter(~inventory_page_id.in_(page_ids_to_ignore))

        query = (query.join(WebPageLabelResult,
                            WebPageLabelResult.page_id == inventory_page_id)
                 .filter(WebPageLabelResult.label_id == label.id)
                 .order_by(func.rand())
                 .limit(label.screenshot_count))

        return [row[0] for row in query]
예제 #5
0
    def generate_hash(cls, label_id, hash_lookup=None):
        """Generates the hash tag for the given label by iterating
        its weighted labels, weighted keywords, weighted detectors,
        domain name detectors and weighted text_detectors recursively.

        Args:
            label_id: id of the label to hash.
            hash_lookup: optional dict mapping label_id to an already
                computed result. It is filled in place and shared with
                every recursive call, so each label is computed once.

        Returns:
            A tuple ``(hash_tag, latest_detector, latest_text_detector)``.
            ``hash_tag`` is the SHA-1 hex digest of the concatenated
            configuration string. The other two values are combined with
            ``cls._newer_timestamp`` over this label and its children
            (presumably timestamps; confirm against ``_get_detector_hash``).

        Raises:
            AssertionError: if no label with ``label_id`` exists.
        """
        # Compare against None explicitly. ``hash_lookup or {}`` would throw
        # away an empty dict, and the memo is still empty when we recurse into
        # the first child, so every recursive call got a private dict and
        # nothing was ever shared.
        if hash_lookup is None:
            hash_lookup = {}

        if label_id not in hash_lookup:
            label = Label.get(label_id)
            assert label is not None, "label %s Does not exist" % label_id

            hash_str = ""
            latest_detector = None
            latest_text_detector = None

            if label.name == 'Rotating Content':
                # This label is hashed only from the highest
                # RotatingContentPage id.
                max_id = session.query(func.max(RotatingContentPage.id)).scalar()
                hash_str = str(max_id)
            else:
                # Child labels, in child_id order, each with its weight.
                query = session.query(WeightedLabel.child_id, WeightedLabel.weight)
                query = query.filter_by(parent_id=label_id).order_by(WeightedLabel.child_id)
                for child_id, weight in query:
                    child_hash, child_latest_detector, child_latest_text_detector = \
                        cls.generate_hash(child_id, hash_lookup=hash_lookup)
                    hash_str += "%s%s" % (child_hash, weight)
                    latest_detector = cls._newer_timestamp(
                        latest_detector, child_latest_detector)
                    latest_text_detector = cls._newer_timestamp(
                        latest_text_detector, child_latest_text_detector)

                # Weighted keywords, in keyword id order.
                query = session.query(Keyword.text, WeightedKeyword.body_weight, WeightedKeyword.title_weight)
                query = query.filter(Keyword.id == WeightedKeyword.keyword_id,
                                     WeightedKeyword.label_id == label_id).order_by(Keyword.id)
                for text, bw, tw in query:
                    hash_str += "%s%s%s" % (text.encode('utf-8'), bw, tw)

                # Detector contribution for this label.
                det_hash_str, latest_det_ts, latest_text_det_ts = cls._get_detector_hash(label_id)
                hash_str += det_hash_str
                latest_detector = cls._newer_timestamp(
                    latest_detector, latest_det_ts)
                latest_text_detector = cls._newer_timestamp(
                    latest_text_detector, latest_text_det_ts)

                # Domain name detectors, in detector id order.
                query = session.query(DomainNameDetector.domain_name, DomainNameDetector.weight)
                query = query.filter_by(target_label_id=label_id).order_by(DomainNameDetector.id)
                for domain_name, wt in query:
                    hash_str += "%s%s" % (domain_name, wt)

                hash_str += "%s" % label.decision_threshold

            hash_tag = hashlib.sha1(hash_str).hexdigest()
            hash_lookup[label_id] = hash_tag, latest_detector, latest_text_detector
        return hash_lookup[label_id]
예제 #6
0
 def __init__(self, **kwargs):
     """Create the evaluator and fill in default HIT texts from the label.

     ``question``, ``title`` and ``description`` are set from the target
     label's name only when they are still None after the parent
     constructor ran. ``keywords`` falls back to ``default_keywords``.
     """
     super(ClickableImageEvaluator, self).__init__(**kwargs)

     label = self.target_label
     if label is None:
         target_id = self.target_label_id
         if target_id is not None:
             label = Label.get(target_id)

     if label is not None:
         text_defaults = (
             ('question',
              "Click on the images whose content is related to %s. "),
             ('title', 'Clickable Image Tagging (%s)'),
             ('description',
              'You will be shown a series of images and asked to click the ones whose content is related to %s '),
         )
         for field, template in text_defaults:
             if getattr(self, field) is None:
                 setattr(self, field, template % label.name)

     if self.keywords is None:
         self.keywords = self.default_keywords
예제 #7
0
 def __init__(self, **kwargs):
     """Create the evaluator and fill in default HIT texts from the label.

     ``question``, ``title`` and ``description`` are set from the target
     label's name only when they are still None after the parent
     constructor ran. ``keywords`` falls back to ``default_keywords``.
     """
     super(ClickableBoxEvaluator, self).__init__(**kwargs)

     label = self.target_label
     if label is None:
         target_id = self.target_label_id
         if target_id is not None:
             label = Label.get(target_id)

     if label is not None:
         text_defaults = (
             ('question',
              "Click on the images where %s's face is contained by the red box."),
             ('title', 'Clickable Image Tagging (%s)'),
             ('description',
              'You will be shown a series of images and asked to click the ones that have %s enclosed in a red box'),
         )
         for field, template in text_defaults:
             if getattr(self, field) is None:
                 setattr(self, field, template % label.name)

     if self.keywords is None:
         self.keywords = self.default_keywords
    def calculate_admin_labels(self, page_id):
        """Merge the admin-supplied label results for a page.

        Results are layered so that later sources override earlier ones:
        super-detector video results first, then admin video label results,
        then admin web page label results. Every label returned by
        ``Label.all_descendant_ids`` for the falsy labels is forced to False.

        Returns:
            dict mapping label_id -> result.
        """
        video_overrides = self.avlr_lookup[page_id]
        page_overrides = self.awplr_lookup[page_id]
        super_detector_results = self.svdr_lookup[page_id]

        merged = dict(super_detector_results)
        for overrides in (video_overrides, page_overrides):
            merged.update(overrides)

        rejected_ids = set()
        for label_id, result in merged.items():
            if not result:
                rejected_ids.add(label_id)

        # A rejected label also rejects its descendants.
        for label_id in Label.all_descendant_ids(rejected_ids):
            merged[label_id] = False

        return merged
예제 #9
0
    def __init__(self, **kwargs):
        """Create the evaluator and fill in default HIT texts from the label.

        ``question``, ``title`` and ``description`` are set from the target
        label's name only when still None after the parent constructor ran.
        Whenever a label is resolved, ``require_adult`` is set to True.
        ``keywords`` falls back to ``default_keywords``.
        """
        super(WebPageTextEvaluator, self).__init__(**kwargs)

        label = self.target_label
        if label is None:
            target_id = self.target_label_id
            if target_id is not None:
                label = Label.get(target_id)

        if label is not None:
            text_defaults = (
                ('question', 'Does this web page contain %s content?'),
                ('title', 'Web Page Categorization (%s)'),
                ('description',
                 'You will be shown a screen shot of a web page and asked '
                 'whether the web page contains %s content'),
            )
            for field, template in text_defaults:
                if getattr(self, field) is None:
                    setattr(self, field, template % label.name)
            self.require_adult = True

        if self.keywords is None:
            self.keywords = self.default_keywords
예제 #10
0
    def create_with_evaluator_and_training_urls(cls, label_id, num_urls,
                                                search_kw, **evaluator_kwargs):
        """Create a TrainingJob for a label and generate its training URLs.

        Reuses the first ClickableBoxEvaluator that targets ``label_id``, or
        builds a new one (passing ``evaluator_kwargs``) when none exists.

        Returns:
            The newly created TrainingJob.
        """
        evaluator = ClickableBoxEvaluator.query.filter_by(
            target_label_id=label_id).first()
        if not evaluator:
            label_name = Label.get(label_id).name
            evaluator = ClickableBoxEvaluator(
                name="Training Evaluator for %s" % label_name,
                target_label_id=label_id,
                **evaluator_kwargs)

        job = TrainingJob(label_id=label_id,
                          evaluator=evaluator,
                          num_urls=num_urls,
                          search_kw=search_kw)
        # Flush before generating the training URLs.
        session.flush()
        job.create_training_urls()
        return job
예제 #11
0
    def non_preroll_results_to_qa_for_label(cls, label_id, page_ids_to_ignore):
        """Pick random inventory pages without videos that carry the label.

        Args:
            label_id: id of the label being QAed.
            page_ids_to_ignore: page ids to exclude; skipped when empty.

        Returns:
            A list of at most ``label.non_preroll_qa_count`` distinct page ids.
        """
        label = Label.get(label_id)
        # Cut off one day back so that pages ingested the same day are not
        # QAed, leaving time for all stages of ingestion to complete.
        cutoff = datetime.utcnow() - timedelta(days=1)
        inventory_page_id = WebPageInventory.page_id

        # Outer join plus the IS NULL filter keeps only pages that have no
        # VideoOnPage row.
        query = (session.query(inventory_page_id.distinct())
                 .join(WebPage, inventory_page_id == WebPage.id)
                 .filter(WebPage.last_crawled_video <= cutoff)
                 .outerjoin(VideoOnPage,
                            VideoOnPage.page_id == inventory_page_id)
                 .filter(VideoOnPage.page_id == None))
        if page_ids_to_ignore:
            query = query.filter(~inventory_page_id.in_(page_ids_to_ignore))

        query = (query.join(WebPageLabelResult,
                            WebPageLabelResult.page_id == inventory_page_id)
                 .filter(WebPageLabelResult.label_id == label.id)
                 .order_by(func.rand())
                 .limit(label.non_preroll_qa_count))

        return [row[0] for row in query]
예제 #12
0
    def results_to_qa_for_label(cls, label_id):
        """Pick random (video, page) pairs carrying the label for QA.

        Videos and pages returned by ``get_ignore_video_ids`` and
        ``get_ignore_page_ids`` are excluded, as are rows with video_id 0.

        Returns:
            A list of ``(label_id, video_id, page_id, True)`` tuples, at most
            ``Label.get(label_id).collage_count`` long.
        """
        logger.info("Gathering results for label_id : %s", label_id)

        skipped_video_ids = cls.get_ignore_video_ids(label_id)
        skipped_page_ids = cls.get_ignore_page_ids(label_id)

        video_col = WebPageInventory.video_id
        page_col = WebPageInventory.page_id

        query = session.query(video_col, page_col)
        query = query.join(WebPageLabelResult,
                           WebPageLabelResult.page_id == page_col)
        query = query.distinct(video_col)
        query = query.filter(WebPageLabelResult.label_id == label_id)
        if skipped_video_ids:
            query = query.filter(~video_col.in_(skipped_video_ids))
        if skipped_page_ids:
            query = query.filter(~page_col.in_(skipped_page_ids))
        query = query.filter(video_col != 0)
        query = query.group_by(video_col)

        max_results = Label.get(label_id).collage_count
        rows = query.order_by(func.rand()).limit(max_results).all()

        return [(label_id, video_id, page_id, True)
                for video_id, page_id in rows]
예제 #13
0
    def __init__(self, **kwargs):
        """Create the evaluator and fill in default HIT texts from the label.

        ``question``, ``title`` and ``description`` are set from the target
        label's name only when still None after the parent constructor ran.
        Whenever a label is resolved, ``require_adult`` is set to True.
        ``keywords`` falls back to ``default_keywords``.
        """
        super(VideoCollageEvaluator, self).__init__(**kwargs)

        label = self.target_label
        if label is None:
            target_id = self.target_label_id
            if target_id is not None:
                label = Label.get(target_id)

        if label is not None:
            text_defaults = (
                ('question', 'Does this video contain %s content?'),
                ('title', 'Image Categorization (%s)'),
                ('description',
                 'You will be shown a series of images from a single video and asked '
                 'whether the video contains %s content'),
            )
            for field, template in text_defaults:
                if getattr(self, field) is None:
                    setattr(self, field, template % label.name)
            # An adult video was once sent out as a HIT for an unrelated
            # label, so every video collage HIT is marked as requiring adult
            # to forestall disciplinary action from Amazon until a better
            # solution exists.
            self.require_adult = True

        if self.keywords is None:
            self.keywords = self.default_keywords
 def label_id_by_name(self, name):
     """Return the id of the label called ``name``, caching the lookup.

     Results are stored in ``self.label_ids_by_name`` so that
     ``Label.by_name`` is called at most once per name.
     """
     cache = self.label_ids_by_name
     if name in cache:
         return cache[name]
     cache[name] = Label.by_name(name).id
     return cache[name]
    def calculate_labels(self, page_id):
        """Determine the current correct label results for a page
        based on its active videos and keywords on the page.

        Labels that already have an admin result (see
        ``calculate_admin_labels``) are not scored. Every other candidate
        label gets a score summed from matching weighted keywords, weighted
        clf-targets and the domain lookup. The scores, admin results and
        decision thresholds are passed to ``calculate_sublabels``, whose
        returned set of label ids is unioned with the page's info labels.

        Args:
            page_id: id of the page to evaluate. The prefetched lookups on
                ``self`` are expected to be populated already (presumably by
                the ``prefetch_*`` methods; confirm against the caller).

        Returns:
            A tuple ``(labels_to_add, labels_to_delete)`` of label id sets:
            labels calculated as true but not yet stored, and stored labels
            that are no longer calculated as true.
        """
        admin_results = self.calculate_admin_labels(page_id)

        # gathering all label_ids that we will calculate scores
        # and pass to calculate_sublabels
        label_ids = self.base_label_ids.copy()
        if self.all_labels:
            # in all-labels mode, also include the ancestors of every label
            # that has an admin result
            admin_label_ids = set(admin_results.keys())
            label_ids.update(Label.all_ancestor_ids(admin_label_ids))

        kw_title_matches, kw_body_matches = self.calculate_keywords(page_id)
        # Calculate label scores based on matching keywords, true clf-targets and domains
        label_scores = dict()

        for label_id in label_ids:
            # labels with an admin result are not scored at all
            if label_id not in admin_results:
                score = 0
                for kw_id, title_weight, body_weight in self.weighted_keywords[
                        label_id]:
                    # a title match with a non-zero title weight wins over a
                    # body match; a keyword never contributes both weights
                    if kw_id in kw_title_matches and title_weight != 0:
                        score += title_weight
                    elif kw_id in kw_body_matches:
                        score += body_weight

                # adding weights for all weighted clf-targets
                for clf_target_id, weight in self.wt_clf_target_lookup[
                        label_id]:
                    if clf_target_id in self.tdr_lookup[page_id]:
                        score += weight
                    # VDRs are considered for all non preroll videos of the page
                    # (the weight is added once per matching active video)
                    for video in self.active_videos[page_id]:
                        video_id = video['video_id']
                        if clf_target_id in self.vdr_lookup[video_id]:
                            score += weight

                # adding weights from domains
                score += self.domain_lookup[(page_id, label_id)]

                label_scores[label_id] = score

        # Existing Web Page Label Results
        existing_true_label_ids = self.wplr_lookup[page_id]

        # existing calculated_true_label_ids are just descendant existing_true_label_ids
        true_label_ids = existing_true_label_ids & self.descendant_label_ids

        calculated_true_label_ids = self.calculate_sublabels(
            label_ids, label_scores, true_label_ids, admin_results,
            self.label_decision_thresholds)

        # append all the info labels to True Labels
        info_labels = self.calculate_info_labels(page_id)
        calculated_true_label_ids |= info_labels

        # here we need to remove existing_true_label_ids that are already in the DB
        # as they do not need to be re-inserted into mysql
        labels_to_add = calculated_true_label_ids - existing_true_label_ids

        if self.all_labels:
            # every stored label that is no longer calculated as true goes
            labels_to_delete = existing_true_label_ids - calculated_true_label_ids
        else:
            # only labels this run was responsible for may be deleted
            labels_expected_to_have_results = label_ids | self.target_label_ids
            labels_to_delete = (
                labels_expected_to_have_results -
                calculated_true_label_ids) & existing_true_label_ids

        return labels_to_add, labels_to_delete
예제 #16
0
 def __unicode__(self):
     """Return a unicode summary with the label name and finished flag."""
     label_name = Label.get(self.label_id).name
     status = str(self.finished)
     return u'<MTurkImageJob label: %s, Completion status: %s>' % (
         label_name, status)