def __init__(self, label_id, evaluator_type=ClickableImageEvaluator, **kwargs):
    """Set up an MTurk image job for a label, reusing or creating its evaluator.

    Args:
        label_id: id of the label this job is for; also used as the
            evaluator's ``target_label_id``.
        evaluator_type: evaluator class to use; must be a member of
            EVALUATOR_TYPES_SUPPORTED.
        **kwargs: forwarded to ``evaluator_type`` only when no evaluator
            exists yet for ``label_id`` and a new one is created.

    Raises:
        AssertionError: if ``evaluator_type`` is not supported.
    """
    super(MTurkImageJob, self).__init__()
    # Raised explicitly instead of via ``assert`` so the check still runs
    # under ``python -O``; the exception type seen by callers is unchanged.
    if evaluator_type not in EVALUATOR_TYPES_SUPPORTED:
        raise AssertionError(
            "evaluator_type has to be in %s" % str(EVALUATOR_TYPES_SUPPORTED))

    self.label_id = label_id
    self.finished = False
    self.evaluator_type = evaluator_type
    self.hit = evaluator_type.hit_type

    # Build the lookup once and reuse it for both the existence check and
    # the fetch. ``one()`` is kept so duplicate evaluators still raise.
    existing = evaluator_type.query.filter_by(target_label_id=label_id)
    if existing.count():
        evaluator = existing.one()
    else:
        evaluator_name = "MTurk Image Evaluator for %s" % Label.get(
            label_id).name
        evaluator = evaluator_type(name=evaluator_name,
                                   target_label_id=label_id,
                                   **kwargs)
        session.flush()

    # only for stage and dev
    # A missing hostname setting is treated as "not sandbox" instead of
    # failing with a TypeError on ``in None``.
    hostname = config.get("mturk_hostname") or ""
    if 'sandbox' in hostname:
        evaluator.min_hits_approved = 0
    # NOTE(review): the placement of the two flushes was reconstructed from
    # a whitespace-collapsed source -- confirm the final flush is unconditional.
    session.flush()
def prefetch_wplrs(self):
    """Fill ``self.wplr_lookup`` with the label ids stored per page.

    Reads (page_id, label_id) pairs from WebPageLabelResult for the pages in
    ``self.page_ids``. Unless ``self.all_labels`` is set, only labels returned
    by ``Label.all_descendant_ids(self.target_label_ids)`` are included.
    The lookup maps page_id -> set of label ids.
    """
    self.wplr_lookup = defaultdict(set)

    page_col = WebPageLabelResult.page_id
    label_col = WebPageLabelResult.label_id
    rows = session.query(page_col, label_col).filter(
        page_col.in_(self.page_ids))

    if not self.all_labels:
        wanted_label_ids = Label.all_descendant_ids(self.target_label_ids)
        rows = rows.filter(label_col.in_(wanted_label_ids))

    lookup = self.wplr_lookup
    for page_id, label_id in rows:
        lookup[page_id].add(label_id)
def prefetch_labels(self):
    """Populate the label sets and thresholds used when scoring pages.

    Sets three attributes:

    * ``self.base_label_ids`` -- labels that can produce a result on their own.
    * ``self.descendant_label_ids`` -- descendants of the target labels,
      excluding the target labels themselves.
    * ``self.label_decision_thresholds`` -- Label.id -> decision_threshold
      for every label.
    """
    self.base_label_ids = set()
    self.label_decision_thresholds = {}

    if self.all_labels:
        # Every label with at least one weighted keyword or weighted
        # clf-target, plus all ancestors of those labels.
        producing = session.query(Label.id.distinct())
        producing = producing.outerjoin(Label.weighted_keywords)
        producing = producing.outerjoin(Label.weighted_clf_targets)
        producing = producing.filter(
            (WeightedKeyword.keyword_id != None) |
            (WeightedClfTarget.clf_target_id != None))
        label_ids = [label_id for (label_id,) in producing]
        if label_ids:
            self.base_label_ids.update(Label.all_ancestor_ids(label_ids))
    else:
        # Restrict to target labels that can actually produce results, i.e.
        # have at least one weighted keyword, weighted label or weighted
        # clf-target.
        producing = session.query(Label.id.distinct()).filter(
            Label.id.in_(self.target_label_ids))
        producing = producing.outerjoin(Label.weighted_keywords)
        producing = producing.outerjoin(Label.weighted_labels)
        producing = producing.outerjoin(Label.weighted_clf_targets)
        producing = producing.filter(
            (WeightedKeyword.keyword_id != None) |
            (WeightedLabel.child_id != None) |
            (WeightedClfTarget.clf_target_id != None))
        for (label_id,) in producing:
            self.base_label_ids.add(label_id)

    all_descendants = Label.all_descendant_ids(self.target_label_ids)
    self.descendant_label_ids = all_descendants - self.target_label_ids

    # Thresholds are fetched for every label because the query is cheap.
    thresholds = session.query(Label.id, Label.decision_threshold)
    self.label_decision_thresholds.update(
        (label_id, thresh) for label_id, thresh in thresholds)
def preroll_results_to_qa_for_label(cls, label_id, page_ids_to_ignore):
    """Pick a random sample of pages with video that carry the given label.

    Candidates are distinct inventory page ids that have at least one
    VideoOnPage row and a WebPageLabelResult for the label. Pages listed in
    ``page_ids_to_ignore`` are skipped. At most ``label.screenshot_count``
    page ids are returned, in random order.
    """
    label_to_qa = Label.get(label_id)
    inventory_page_id = WebPageInventory.page_id

    candidates = (
        session.query(inventory_page_id.distinct())
        .outerjoin(VideoOnPage, VideoOnPage.page_id == inventory_page_id)
        # Outer join plus NOT NULL keeps only pages that do have a video.
        .filter(VideoOnPage.page_id != None)
    )
    if page_ids_to_ignore:
        candidates = candidates.filter(
            ~inventory_page_id.in_(page_ids_to_ignore))

    candidates = (
        candidates
        .join(WebPageLabelResult,
              WebPageLabelResult.page_id == inventory_page_id)
        .filter(WebPageLabelResult.label_id == label_to_qa.id)
        .order_by(func.rand())
        .limit(label_to_qa.screenshot_count)
    )
    return [row[0] for row in candidates]
def generate_hash(cls, label_id, hash_lookup=None):
    """Generate the hash tag for a label from everything that defines it.

    The hash covers, recursively, the label's weighted labels, weighted
    keywords, detectors (via ``cls._get_detector_hash``), domain name
    detectors and decision threshold. The 'Rotating Content' label is hashed
    from the highest RotatingContentPage id instead.

    Args:
        label_id: id of the label to hash.
        hash_lookup: optional dict used as a memo (label_id -> result). It is
            filled in place, so one dict can be shared across several calls.

    Returns:
        Tuple ``(hash_tag, latest_detector, latest_text_detector)``: the
        SHA-1 hex digest plus the newest detector and text-detector
        timestamps, as chosen by ``cls._newer_timestamp``, across the label
        and its children.

    Raises:
        AssertionError: if no label exists for ``label_id``.
    """
    # ``hash_lookup or {}`` swapped an *empty* dict for a fresh one, so the
    # memo was never shared with recursive calls (or with a caller passing
    # ``{}``) and shared sub-labels were re-hashed every time.
    if hash_lookup is None:
        hash_lookup = {}

    if label_id not in hash_lookup:
        label = Label.get(label_id)
        assert label is not None, "label %s Does not exist" % label_id
        hash_str = ""
        latest_detector = None
        latest_text_detector = None

        if label.name == 'Rotating Content':
            max_id = session.query(func.max(RotatingContentPage.id)).scalar()
            hash_str = str(max_id)
        else:
            # Child labels, in a stable order, each contribute hash + weight.
            query = session.query(WeightedLabel.child_id, WeightedLabel.weight)
            query = query.filter_by(parent_id=label_id).order_by(
                WeightedLabel.child_id)
            for child_id, weight in query:
                (child_hash, child_latest_detector,
                 child_latest_text_detector) = cls.generate_hash(
                    child_id, hash_lookup=hash_lookup)
                hash_str += "%s%s" % (child_hash, weight)
                latest_detector = cls._newer_timestamp(
                    latest_detector, child_latest_detector)
                latest_text_detector = cls._newer_timestamp(
                    latest_text_detector, child_latest_text_detector)

            # Weighted keywords: text plus body and title weights.
            query = session.query(Keyword.text, WeightedKeyword.body_weight,
                                  WeightedKeyword.title_weight)
            query = query.filter(
                Keyword.id == WeightedKeyword.keyword_id,
                WeightedKeyword.label_id == label_id).order_by(Keyword.id)
            for text, bw, tw in query:
                hash_str += "%s%s%s" % (text.encode('utf-8'), bw, tw)

            # Detectors attached directly to this label.
            det_hash_str, latest_det_ts, latest_text_det_ts = \
                cls._get_detector_hash(label_id)
            hash_str += det_hash_str
            latest_detector = cls._newer_timestamp(
                latest_detector, latest_det_ts)
            latest_text_detector = cls._newer_timestamp(
                latest_text_detector, latest_text_det_ts)

            # Domain name detectors targeting this label.
            query = session.query(DomainNameDetector.domain_name,
                                  DomainNameDetector.weight)
            query = query.filter_by(target_label_id=label_id).order_by(
                DomainNameDetector.id)
            for domain_name, wt in query:
                hash_str += "%s%s" % (domain_name, wt)

            # NOTE(review): nesting was reconstructed from a
            # whitespace-collapsed source; the threshold is assumed to belong
            # to the non-rotating branch only -- confirm.
            hash_str += "%s" % label.decision_threshold

        hash_tag = hashlib.sha1(hash_str).hexdigest()
        hash_lookup[label_id] = hash_tag, latest_detector, latest_text_detector
    return hash_lookup[label_id]
def __init__(self, **kwargs):
    """Create the evaluator and fill in any text fields left unset.

    When a target label is available (directly or via ``target_label_id``),
    a missing question, title or description is generated from the label
    name. Missing keywords fall back to ``self.default_keywords``.
    """
    super(ClickableImageEvaluator, self).__init__(**kwargs)

    target = self.target_label
    if target is None:
        target_id = self.target_label_id
        if target_id is not None:
            target = Label.get(target_id)

    if target is not None:
        text_defaults = (
            ('question',
             "Click on the images whose content is related to %s. "),
            ('title', 'Clickable Image Tagging (%s)'),
            ('description',
             'You will be shown a series of images and asked to click the ones whose content is related to %s '),
        )
        for field, template in text_defaults:
            if getattr(self, field) is None:
                setattr(self, field, template % target.name)

    # NOTE(review): assumed to apply even without a target label (nesting
    # reconstructed from a whitespace-collapsed source) -- confirm.
    if self.keywords is None:
        self.keywords = self.default_keywords
def __init__(self, **kwargs):
    """Create the evaluator and fill in any text fields left unset.

    When a target label is available (directly or via ``target_label_id``),
    a missing question, title or description is generated from the label
    name. Missing keywords fall back to ``self.default_keywords``.
    """
    super(ClickableBoxEvaluator, self).__init__(**kwargs)

    target = self.target_label
    if target is None:
        target_id = self.target_label_id
        if target_id is not None:
            target = Label.get(target_id)

    if target is not None:
        text_defaults = (
            ('question',
             "Click on the images where %s's face is contained by the red box."),
            ('title', 'Clickable Image Tagging (%s)'),
            ('description',
             'You will be shown a series of images and asked to click the ones that have %s enclosed in a red box'),
        )
        for field, template in text_defaults:
            if getattr(self, field) is None:
                setattr(self, field, template % target.name)

    # NOTE(review): assumed to apply even without a target label (nesting
    # reconstructed from a whitespace-collapsed source) -- confirm.
    if self.keywords is None:
        self.keywords = self.default_keywords
def calculate_admin_labels(self, page_id):
    """Merge the admin-level label verdicts for a page into one dict.

    Three prefetched lookups are combined, with later sources overriding
    earlier ones for the same label id:

    1. super-detector video results (``self.svdr_lookup``)
    2. admin video label results (``self.avlr_lookup``)
    3. admin web page label results (``self.awplr_lookup``)

    Every descendant of a label whose merged verdict is falsy is forced to
    False as well.

    Args:
        page_id: key into the three lookups.

    Returns:
        dict mapping label id -> verdict.
    """
    # Admin video label results
    avlrs = self.avlr_lookup[page_id]
    # Admin web page label results
    awplrs = self.awplr_lookup[page_id]
    # Super-detector video results
    svdrs = self.svdr_lookup[page_id]

    admin_results = dict(svdrs)
    admin_results.update(avlrs)
    admin_results.update(awplrs)

    false_admin_labels = {
        label_id for label_id, result in admin_results.items() if not result}
    # This runs once per page, so skip the descendant lookup when there is
    # nothing to propagate.
    if false_admin_labels:
        for descendant_id in Label.all_descendant_ids(false_admin_labels):
            admin_results[descendant_id] = False
    return admin_results
def __init__(self, **kwargs):
    """Create the evaluator and fill in any text fields left unset.

    When a target label is available (directly or via ``target_label_id``),
    a missing question, title or description is generated from the label
    name. ``require_adult`` is set to True, and missing keywords fall back
    to ``self.default_keywords``.
    """
    super(WebPageTextEvaluator, self).__init__(**kwargs)

    target = self.target_label
    if target is None:
        target_id = self.target_label_id
        if target_id is not None:
            target = Label.get(target_id)

    if target is not None:
        text_defaults = (
            ('question', 'Does this web page contain %s content?'),
            ('title', 'Web Page Categorization (%s)'),
            ('description',
             'You will be shown a screen shot of a web page and asked '
             'whether the web page contains %s content'),
        )
        for field, template in text_defaults:
            if getattr(self, field) is None:
                setattr(self, field, template % target.name)

    # NOTE(review): the two statements below are assumed to run even without
    # a target label (nesting reconstructed from a whitespace-collapsed
    # source) -- confirm.
    self.require_adult = True
    if self.keywords is None:
        self.keywords = self.default_keywords
def create_with_evaluator_and_training_urls(cls, label_id, num_urls, search_kw, **evaluator_kwargs):
    """Create a TrainingJob for a label together with its training urls.

    Reuses the first ClickableBoxEvaluator that targets ``label_id``. If none
    exists, one is created with ``evaluator_kwargs``. The session is flushed
    before ``create_training_urls()`` is called on the new job.

    Returns:
        The newly created TrainingJob.
    """
    box_evaluator = ClickableBoxEvaluator.query.filter_by(
        target_label_id=label_id).first()
    if not box_evaluator:
        box_evaluator = ClickableBoxEvaluator(
            name="Training Evaluator for %s" % Label.get(label_id).name,
            target_label_id=label_id,
            **evaluator_kwargs)

    training_job = TrainingJob(
        label_id=label_id,
        evaluator=box_evaluator,
        num_urls=num_urls,
        search_kw=search_kw,
    )
    session.flush()
    training_job.create_training_urls()
    return training_job
def non_preroll_results_to_qa_for_label(cls, label_id, page_ids_to_ignore):
    """Pick a random sample of pages without video that carry the given label.

    Candidates are distinct inventory page ids with no VideoOnPage row, a
    ``last_crawled_video`` at least one day old, and a WebPageLabelResult for
    the label. Pages listed in ``page_ids_to_ignore`` are skipped. At most
    ``label.non_preroll_qa_count`` page ids are returned, in random order.
    """
    label_to_qa = Label.get(label_id)
    # Pages ingested within the last day are left out so that every stage of
    # ingestion has had time to complete before QA.
    end_date = datetime.utcnow() - timedelta(days=1)
    inventory_page_id = WebPageInventory.page_id

    candidates = (
        session.query(inventory_page_id.distinct())
        .join(WebPage, inventory_page_id == WebPage.id)
        .filter(WebPage.last_crawled_video <= end_date)
        .outerjoin(VideoOnPage, VideoOnPage.page_id == inventory_page_id)
        # Outer join plus IS NULL keeps only pages that have no video.
        .filter(VideoOnPage.page_id == None)
    )
    if page_ids_to_ignore:
        candidates = candidates.filter(
            ~inventory_page_id.in_(page_ids_to_ignore))

    candidates = (
        candidates
        .join(WebPageLabelResult,
              WebPageLabelResult.page_id == inventory_page_id)
        .filter(WebPageLabelResult.label_id == label_to_qa.id)
        .order_by(func.rand())
        .limit(label_to_qa.non_preroll_qa_count)
    )
    return [row[0] for row in candidates]
def results_to_qa_for_label(cls, label_id):
    """Pick a random sample of (video, page) results to QA for a label.

    Videos and pages returned by ``cls.get_ignore_video_ids`` and
    ``cls.get_ignore_page_ids`` are excluded, as are rows with video id 0.
    Rows are grouped by video id and at most ``label.collage_count`` of them
    are returned.

    Returns:
        List of ``(label_id, video_id, page_id, True)`` tuples.
    """
    logger.info("Gathering results for label_id : %s", label_id)
    ignore_video_ids = cls.get_ignore_video_ids(label_id)
    ignore_page_ids = cls.get_ignore_page_ids(label_id)

    video_col = WebPageInventory.video_id
    page_col = WebPageInventory.page_id

    candidates = (
        session.query(video_col, page_col)
        .join(WebPageLabelResult, WebPageLabelResult.page_id == page_col)
        .distinct(video_col)
        .filter(WebPageLabelResult.label_id == label_id)
    )
    if ignore_video_ids:
        candidates = candidates.filter(~video_col.in_(ignore_video_ids))
    if ignore_page_ids:
        candidates = candidates.filter(~page_col.in_(ignore_page_ids))
    candidates = candidates.filter(video_col != 0).group_by(video_col)

    sample_size = Label.get(label_id).collage_count
    sampled = candidates.order_by(func.rand()).limit(sample_size).all()
    return [(label_id, video_id, page_id, True)
            for video_id, page_id in sampled]
def __init__(self, **kwargs):
    """Create the evaluator and fill in any text fields left unset.

    When a target label is available (directly or via ``target_label_id``),
    a missing question, title or description is generated from the label
    name. ``require_adult`` is set to True, and missing keywords fall back
    to ``self.default_keywords``.
    """
    super(VideoCollageEvaluator, self).__init__(**kwargs)

    target = self.target_label
    if target is None:
        target_id = self.target_label_id
        if target_id is not None:
            target = Label.get(target_id)

    if target is not None:
        text_defaults = (
            ('question', 'Does this video contain %s content?'),
            ('title', 'Image Categorization (%s)'),
            ('description',
             'You will be shown a series of images from a single video and asked '
             'whether the video contains %s content'),
        )
        for field, template in text_defaults:
            if getattr(self, field) is None:
                setattr(self, field, template % target.name)

    # An adult video was once sent out as a HIT for an unrelated label, so
    # every video collage HIT is marked as requiring adult to forestall
    # disciplinary action from Amazon, until a better solution is found.
    # NOTE(review): the two statements below are assumed to run even without
    # a target label (nesting reconstructed from a whitespace-collapsed
    # source) -- confirm.
    self.require_adult = True
    if self.keywords is None:
        self.keywords = self.default_keywords
def label_id_by_name(self, name):
    """Return the id of the label called ``name``, caching the answer.

    The first request for a name resolves it through ``Label.by_name`` and
    stores the id in ``self.label_ids_by_name``. Later requests are served
    from that dict.
    """
    cache = self.label_ids_by_name
    if name in cache:
        return cache[name]
    cache[name] = Label.by_name(name).id
    return cache[name]
def calculate_labels(self, page_id):
    """Determine the current correct label results for a page based on
    its active videos and keywords on the page.

    Steps:
      1. Collect admin verdicts via ``calculate_admin_labels``; labels with
         an admin verdict are not scored.
      2. Score every remaining candidate label from matching keywords,
         clf-target hits (page-level and per active video) and the
         per-(page, label) domain weight.
      3. Pass the scores to ``calculate_sublabels`` and add the info labels
         from ``calculate_info_labels``.
      4. Diff the outcome against the label results already stored for the
         page (``self.wplr_lookup``).

    Results are returned as a tuple ``(labels_to_add, labels_to_delete)``:
    the set of new label IDs to add and the set of old ones to remove.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against version control.
    """
    admin_results = self.calculate_admin_labels(page_id)

    # gathering all label_ids that we will calculate scores
    # and pass to calculate_sublabels
    label_ids = self.base_label_ids.copy()
    if self.all_labels:
        # ancestors of admin-decided labels are added as candidates too
        admin_label_ids = set(admin_results.keys())
        label_ids.update(Label.all_ancestor_ids(admin_label_ids))

    kw_title_matches, kw_body_matches = self.calculate_keywords(page_id)

    # Calculate label scores based on matching keywords, true clf-targets and domains
    label_scores = dict()
    for label_id in label_ids:
        # labels with an admin verdict get no score entry at all
        if label_id not in admin_results:
            score = 0
            for kw_id, title_weight, body_weight in self.weighted_keywords[
                    label_id]:
                # a title match counts only when its weight is non-zero;
                # otherwise a body match is used
                if kw_id in kw_title_matches and title_weight != 0:
                    score += title_weight
                elif kw_id in kw_body_matches:
                    score += body_weight
            # adding weights for all weighted clf-targets
            for clf_target_id, weight in self.wt_clf_target_lookup[
                    label_id]:
                if clf_target_id in self.tdr_lookup[page_id]:
                    score += weight
                # VDRs are considered for all non preroll videos of the page
                # (the weight is added once per matching video)
                for video in self.active_videos[page_id]:
                    video_id = video['video_id']
                    if clf_target_id in self.vdr_lookup[video_id]:
                        score += weight
            # adding weights from domains
            score += self.domain_lookup[(page_id, label_id)]
            label_scores[label_id] = score

    # Existing Web Page Label Results
    existing_true_label_ids = self.wplr_lookup[page_id]
    # existing calculated_true_label_ids are just descendant existing_true_label_ids
    true_label_ids = existing_true_label_ids & self.descendant_label_ids
    calculated_true_label_ids = self.calculate_sublabels(
        label_ids, label_scores, true_label_ids, admin_results,
        self.label_decision_thresholds)

    # append all the info labels to True Labels
    info_labels = self.calculate_info_labels(page_id)
    calculated_true_label_ids |= info_labels

    # here we need to remove existing_true_label_ids that are already in the DB
    # as they do not need to be re-inserted into mysql
    labels_to_add = calculated_true_label_ids - existing_true_label_ids
    if self.all_labels:
        labels_to_delete = existing_true_label_ids - calculated_true_label_ids
    else:
        # only labels this run was responsible for may be deleted
        labels_expected_to_have_results = label_ids | self.target_label_ids
        labels_to_delete = (
            labels_expected_to_have_results - calculated_true_label_ids) & existing_true_label_ids
    return labels_to_add, labels_to_delete
def __unicode__(self):
    """Return a short description with the label name and completion flag."""
    label_name = Label.get(self.label_id).name
    status = str(self.finished)
    return u'<MTurkImageJob label: %s, Completion status: %s>' % (
        label_name, status)