def save_boxes(l_id, video_id, boxes):
    """Store text boxes for a video and log a detector result for each.

    Args:
        l_id: target Label id
        video_id: Video id
        boxes: dict with key: timestamp, value: list of rectangles; each
            rectangle is unpacked as (height, width, y, x)

    Returns:
        List with one entry per rectangle, holding the value returned by
        Box.get_or_create (documented by the original author as box ids)

    Raises/Assertions:
        Asserts if the video id does not exist
        Asserts if the label id does not exist
    """
    video = Video.get(video_id)
    label = Label.get(l_id)
    assert video, "Video %s does not exist" % video_id
    assert label, "Label %s does not exist" % l_id

    # The text-detect classifier target attached to this label.
    target_query = ClassifierTarget.query.filter_by(target_label_id=label.id)
    clf_target = target_query.join(TextDetectClassifier).one()

    saved_ids = []
    for timestamp, rectangles in boxes.items():
        for height, width, top, left in rectangles:
            saved = Box.get_or_create(x=left, y=top, width=width,
                                      height=height, video=video,
                                      timestamp=timestamp, box_type='Text')
            saved_ids.append(saved)
            BoxDetectorResult.log_result(saved, clf_target.id)
    return saved_ids
def run_training_pipeline(config, pos_file, neg_file, precision_thrsh,
                          recall_thrsh, det_name, target_label_id,
                          video_threshold):
    """Train, evaluate and inject a new spatial scene detector.

    Args:
        config: Path to config file.
        pos_file: Path to image file specifying positive images.
        neg_file: Path to image file specifying negative images.
        precision_thrsh: Cross-validation precision that must be achieved.
        recall_thrsh: Cross-validation recall that must be achieved.
        det_name: Name of detector to be created.
        target_label_id: Detector's target label id.
        video_threshold: Detector's video_threshold (the # images the
            detector must fire on to create a VDR)

    Raises:
        AssertionError: Target label doesn't exist.
    """
    logger.info("Running training pipeline...")

    positives = read_image_file(pos_file)
    negatives = read_image_file(neg_file)
    # Samples and their labels are kept aligned: positives first.
    samples = positives + negatives
    sample_labels = ([POS_LABEL] * len(positives) +
                     [NEG_LABEL] * len(negatives))

    classifier = SpatialSceneClassifier(config)
    label = Label.get(target_label_id)
    assert label
    evaluate_and_inject(classifier, samples, sample_labels, precision_thrsh,
                        recall_thrsh, det_name, label, video_threshold)
def inject_detector(self, detector_name, label_id, pred_thresh=None):
    """Create the detector row and upload its model files.

    Creates the detector in the table TextDetectClassifier and uploads the
    tar file with all the model files to s3.

    Args:
        detector_name: string with detector name (it has to be unique
            because this detector can't be replaced)
        label_id: int, target label id of the detector
        pred_thresh: float threshold for word detection

    Returns:
        clf: the created classifier object (instance of
            self.classifier_cls)

    Raises/Assertions:
        Asserts if label_id does not correspond to any existing label and
        if a classifier named detector_name already exists in the db
    """
    l = Label.get(label_id)
    # %s with a 1-tuple: a None/string id must produce this AssertionError,
    # not a TypeError raised while formatting the message with %d.
    assert l, 'Label id %s does not correspond to any Label!' % (label_id,)
    clf = self.classifier_cls.by_name(detector_name)
    assert not clf, '%s with name %s already exists!'\
        % (self.classifier_cls.__name__, detector_name)
    clf = self.classifier_cls(name=detector_name, pred_thresh=pred_thresh)
    session.flush()
    clf.add_targets([l])
    self.tar_and_upload(clf)
    logger.info('%s detector injected %s' %
                (self.classifier_cls.__name__, clf))
    return clf
def inject(self, clf_name, target_label_id_photo, target_label_id_sshow):
    """Inject model to db.

    Params:
        clf_name: Name of classifier created.
        target_label_id_photo: Classifier's target label for photo videos
        target_label_id_sshow: target label for slideshow videos

    Returns:
        Classifier.

    Assertions:
        Asserts if either label id has no Label, or if a
        StaticVideoClassifier named clf_name already exists.
    """
    targets = []
    # Photo label first, slideshow label second; each is checked as soon as
    # it is fetched.
    for label_id in (target_label_id_photo, target_label_id_sshow):
        label = Label.get(label_id)
        assert label, 'Label id %s does not correspond to any Label!' % \
            (label_id)
        targets.append(label)

    already_there = StaticVideoClassifier.by_name(clf_name)
    assert not already_there, \
        'StaticVideoClassifier with name %s already exists!' % clf_name

    classifier = StaticVideoClassifier(name=clf_name)
    session.flush()
    classifier.add_targets(targets)
    self.tar_and_upload(classifier)
    return classifier
def inject_model(self):
    """Create a UrlModel named after the validated config and upload it.

    The config must name an existing target label (asserted), although the
    label itself is not attached to the model here.
    """
    config = validate_config(self.model_path(CFG_FILE), CFG_SPEC)
    name = config['model_name']
    target_label = Label.by_name(config['target_label_name'])
    assert target_label
    model = UrlModel.create(name=name)
    self.tar_and_upload(model)
    logger.info('URL model injected %s' % model)
def check_args(args):
    """Assert that the parsed command-line arguments are usable.

    Checks, in order: the three input paths exist, no SpatialSceneDetector
    with the requested name exists, the target label exists, the video
    threshold is positive and both thresholds lie in [0, 1].
    """
    # getattr keeps attribute access lazy, one path at a time.
    for path_attr in ('pos_file', 'neg_file', 'config'):
        assert os.path.exists(getattr(args, path_attr))
    assert not SpatialSceneDetector.by_name(args.det_name)
    assert Label.get(args.target_label_id)
    assert args.video_threshold > 0
    for fraction_attr in ('precision_thrsh', 'recall_thrsh'):
        assert 0 <= getattr(args, fraction_attr) <= 1
def inject_detector(self, detector_name, list_label_ids, true_vid_list=None):
    """Create the detector, upload its model files and record training videos.

    Creates the detector in the Cnn classifier table, uploads the tar file
    with all the model files to s3, and saves the video_ids used as positive
    training data.

    Args:
        detector_name: string with detector name (it has to be unique
            because this detector can't be replaced)
        list_label_ids: iterable of target label ids of the detector; ids
            with no matching Label are silently skipped and duplicates are
            collapsed
        true_vid_list: list of ints with ids of the videos used as positive
            training data (optional)

    Returns:
        det: the created detector object (instance of self.detector_cls)

    Raises/Assertions:
        Asserts if none of the ids in list_label_ids corresponds to an
        existing label and if the detector_name already exists in the db
    """
    # Look every id up exactly once (the lookup hits the DB).
    candidates = (Label.get(label_id) for label_id in list_label_ids)
    target_label_list = list({label for label in candidates if label})
    assert len(target_label_list),\
        "Target label list needs at least one Label that exists in the DB"
    det = self.detector_cls.by_name(detector_name)
    assert not det, 'Cnn Classifier with name %s already exists!'\
        % detector_name
    det = self.detector_cls(name=detector_name)
    session.flush()
    det.add_targets(target_label_list)
    self.tar_and_upload(det)
    logger.info('CnnClassifier detector injected %s' % det)
    if true_vid_list:
        save_training_videos(det.id, true_vid_list)
    return det
def main():
    """Parse the command line, check the label id and run the NER trainer."""
    positional_args = (
        ('label_id', 'Label id'),
        ('true_pid_file', 'File with one True page id per line'),
        ('false_pid_file', 'File with one False page id per line'),
        ('op_dir', 'Directory to put trained models in'),
    )
    parser = argparse.ArgumentParser()
    for arg_name, arg_help in positional_args:
        parser.add_argument(arg_name, help=arg_help)
    args = parser.parse_args()

    assert Label.get(args.label_id), 'Invalid label id'
    NerTrainer(args.label_id).run_pipeline(args)
def process_page(page):
    """Runs langid's language detection on webpage text.

    Replaces the page's previous LanguageDetector results with the newly
    detected language and returns the matching Label.

    Assertions:
        Asserts if no Label exists with the detected language name.
    """
    logger.info("Detecting language for page: %d" % page.id)
    detected_name = LanguageDetector.detect_language(page.title_and_text)
    label = Label.by_name(detected_name)
    assert label is not None, "Label %s does not exist" % detected_name

    detector = LanguageDetector.query.one()
    # Drop the stale results first, then store the fresh one.
    LanguageDetector.delete_detector_results(page, [detector.id])
    detector.save_result(page.id, label.id)
    return label
def inject(self, clf_name, target_label_ids, training_box_ids):
    """Create a FaceRecognizeClassifier with its targets and training boxes.

    'target_label_ids' and 'training_box_ids' should be sets.

    Returns:
        The created classifier, after its model files were uploaded.

    Assertions:
        Asserts if any label id or any box id does not resolve.
    """
    labels = [Label.get(label_id) for label_id in target_label_ids]
    assert all(labels), "Bad target label id"
    boxes = [Box.get(box_id) for box_id in training_box_ids]
    assert all(boxes), "Bad training box id"

    classifier = FaceRecognizeClassifier.create(name=clf_name)
    classifier.add_targets(labels)
    # One TrainingBox per box id; they are persisted by the flush below.
    for training_box_id in training_box_ids:
        TrainingBox(detector_id=classifier.id, box_id=training_box_id)
    session.flush()
    self.tar_and_upload(classifier)
    return classifier
def validate_config_file(cls, config_file):
    """Load config_file against cls.CFG_SPEC and sanity-check its values.

    Returns:
        The validated ConfigObj.

    Raises:
        Exception: the file does not validate against the spec.
        AssertionError: the freebase domain/types or the label id are
            not recognised.
    """
    spec_lines = cls.CFG_SPEC.split('\n')
    config_obj = ConfigObj(config_file, configspec=spec_lines)
    outcome = config_obj.validate(Validator(), copy=True,
                                  preserve_errors=True)
    # validate() yields the literal True only on full success; anything
    # else (False or a dict of per-key results) is a failure.
    if outcome is not True:
        raise Exception('Config file validation failed: %s' % outcome)

    ff = FreebaseFilter()
    assert ff.is_domain(config_obj['domain']), \
        "Invalid freebase domain: %s" % config_obj['domain']
    for type_key in ('fb_person', 'fb_org'):
        assert ff.is_type(config_obj[type_key]), \
            "Invalid freebase type: %s" % config_obj[type_key]
    assert Label.get(config_obj['label_id']), \
        "Invalid label id: %s" % config_obj['label_id']
    return config_obj
def load_training_config(configfile_name):
    """Load the part of the config file relative to training of this
    classifier.

    Args:
        configfile_name: path of the config file, parsed with
            TRAINCNNCLF_CFG_SPEC

    Returns:
        The configured training folders, or an empty list when get_config
        returns a falsy config.

    Assertions:
        Asserts if any configured target label id does not exist in the DB
    """
    training_folders = []
    cfg_obj = get_config(
        configfile_name, spec=TRAINCNNCLF_CFG_SPEC.split('\n'))
    if cfg_obj:
        train_params = cfg_obj['train_params']
        training_folders = train_params['training_folders']
        # Label ids sit at the odd positions of the flat list.
        label_ids = train_params['target_label_ids'][1::2]
        for l_id in label_ids:
            # %s rather than %d: ids read from a config file may be strings,
            # and %d would raise TypeError instead of this AssertionError.
            assert Label.get(l_id), \
                " Target label id %s does not exist in the DB" % (l_id,)
    return training_folders
def inject(self, clf_name, target_label_ids):
    '''
    Injects classifier and classifier targets to DB.

    Params:
        clf_name: Classifier's name.
        target_label_ids: The target label ids classifier's 'predict' gives

    Returns:
        Classifier.

    Assertions:
        Asserts if any target label id does not resolve to a Label.
    '''
    # Validate the labels BEFORE creating the classifier, so a bad id does
    # not leave a half-configured classifier behind.
    target_labels = [Label.get(l_id) for l_id in target_label_ids]
    assert all(target_labels), "Bad target label id"
    clf = self._clf_cls.create(name=clf_name)
    clf.add_targets(target_labels)
    self.tar_and_upload(clf)
    return clf
def inject_detector(self, detector_name, label_id, true_vid_list,
                    confidence_th=None, acceptance_th=None):
    """Create the detector, upload its model files and record training videos.

    Creates the detector in the table VideoMotionColorDetector, uploads the
    tar file with all the model files to s3, and saves the video_ids used as
    positive training data.

    Args:
        detector_name: string with detector name (it has to be unique
            because this detector can't be replaced)
        label_id: int, target label id of the detector
        true_vid_list: list of ints that correspond to the ids of the
            videos used as positive training data
        confidence_th: optional value stored on det.confidence_th
        acceptance_th: optional value stored on det.acceptance_th

    Returns:
        det: the created VideoMotionColorDetector object

    Raises/Assertions:
        Asserts if the label_id does not correspond to any existing label
        and if the detector_name already exists in the db
    """
    l = Label.get(label_id)
    # Identity test for None; %s keeps the message printable for any id
    # (with %d a None/string id raised TypeError instead of asserting).
    assert l is not None, \
        'Label id %s does not correspond to any Label!' % (label_id,)
    det = VideoMotionColorDetector.by_name(detector_name)
    assert not det, 'VideoMotionColorDetector with name %s already exists!'\
        % detector_name
    det = VideoMotionColorDetector(name=detector_name)
    # NOTE(review): thresholds are applied only when BOTH are truthy, so a
    # threshold of 0 or a single supplied threshold is ignored - confirm
    # this is intended before relaxing it.
    if confidence_th and acceptance_th:
        det.confidence_th = confidence_th
        det.acceptance_th = acceptance_th
    session.flush()
    det.add_targets([l])
    self.tar_and_upload(det)
    logger.info('VideoMotionColorDetector detector injected %s' % det)
    save_training_videos(det.id, true_vid_list)
    return det
def inject_detector(self, detector_name, label_id, replace_old, true_pid_file):
    """Inject (or refresh) a NerDetector and record its training pages.

    Args:
        detector_name: name of the NerDetector
        label_id: id of the detector's target label
        replace_old: when truthy a detector with this name must already
            exist; otherwise none may exist and a new one is created
        true_pid_file: passed through to save_training_pages

    Returns:
        None

    Assertions:
        Asserts if the label does not exist, or if the existence of the
        detector contradicts replace_old.
    """
    l = Label.get(label_id)
    assert l is not None, "Label with id %s does not exist" % label_id
    det = NerDetector.by_name(detector_name)
    if replace_old:
        assert det, 'NerDetector with name %s does not exist!'\
            % detector_name
    else:
        assert not det, 'NerDetector with name %s already exists!'\
            % detector_name
        # create the new detector
        # NOTE(review): creation, flush and add_targets are assumed to
        # belong to this branch only (an existing detector is reused when
        # replacing) - confirm against the canonical indentation.
        det = NerDetector(name=detector_name)
        session.flush()
        det.add_targets([l])
    self.tar_and_upload(det)
    # Stamp the upload time; the same value is handed to
    # save_training_pages below.
    det.updated_at = datetime.utcnow()
    session.flush()
    logger.info('NER detector injected %s' % det)
    save_training_pages(det.id, det.updated_at, true_pid_file)
def mturk_submission_only(super_config, all_urls, hit_type=ImageHit):
    """Submit MTurk hits for the media behind the injected urls.

    Reads the video info that corresponds to the injected urls and submits
    the required hits to mturk.

    Args:
        super_config: config object with submission params. It is expected
            to at least contain:
            super_config['model_to_db_injection']['target_label']
            super_config['mturk_submission_params']['mturk_question']
        all_urls: list of strings with urls
        hit_type: Type of MTurk hit (Only ImageHit and VideoHit are
            supported)

    Returns:
        job: MTurkImageJob object created, or None when there was no data
            to submit
        hit_type: type of submitted hits (the input hit_type when nothing
            was submitted)
        num_hits_submitted: int, number of submitted hits (0 when nothing
            was submitted)

    Assertions:
        Asserts if hit_type is neither ImageHit nor VideoHit.
    """
    assert hit_type in (ImageHit, VideoHit), \
        "Only ImageHit or VideoHit are valid types for hit_type param"
    label_name = super_config['model_to_db_injection']['target_label']
    question = super_config['mturk_submission_params']['mturk_question']
    evaluator_type = HIT_TYPE_TO_EVALUATOR_TYPE[hit_type]
    if hit_type == ImageHit:
        hit_data = get_images(all_urls)
    else:
        hit_data = get_vids(all_urls)

    job = None
    # Defined up front: without it the return below raised
    # UnboundLocalError whenever hit_data was empty.
    num_hits_submitted = 0
    if len(hit_data) > 0:
        label = Label.get_or_create(label_name)
        job = MTurkImageJob(label.id, question=question,
                            evaluator_type=evaluator_type)
        hit_type, num_hits_submitted = job.submit_hits(hit_data)
    return job, hit_type, num_hits_submitted
def discover_scenes(config_file, folder, label_id):
    """Run scene discovery and pickle the resulting clusterings.

    Nothing is returned: the SceneDiscovery object is pickled under
    <folder>/pickles once after descriptor computation and once per
    (distance, linkage method) combination.

    Args:
        config_file: configuration file where all the descriptor and video
            parameters are defined
        folder: a directory for discovery's use; 'pickles', 'images' and
            'features' sub-folders are created inside it when missing
        label_id: id of the label to retrieve videos for (optional); when
            falsy, videos are ingested from youtube instead

    Raises:
        ValueError: if the label id is not present in the DB
    """
    logger.info("starting scene discovery process")
    discover = SceneDiscovery(config_file, folder)
    pickle_folder = os.path.join(folder, 'pickles')
    image_folder = os.path.join(folder, 'images')
    feature_folder = os.path.join(folder, 'features')
    for fold in [pickle_folder, image_folder, feature_folder]:
        if not os.path.exists(fold):
            os.makedirs(fold)
    if label_id:
        if not Label.get(label_id):
            raise ValueError("Label id not found")
        discover.get_videos_from_inventory(label_id)
    else:
        # NOTE(review): both calls are assumed to belong to the no-label
        # branch - confirm against the canonical indentation.
        discover.ingest_videos_youtube()
        discover.get_video_from_urls()
    discover.download_data(image_folder)
    discover.compute_descriptors(feature_folder)
    # Snapshot before clustering.
    discover_file = os.path.join(pickle_folder, 'scenediscovery.pickle')
    SceneDiscovery.save_to_file(discover, discover_file)
    distances = ['intersection', 'chisqr']
    methods = ['single', 'ward', 'complete']
    # One pickle per distance/method pair, named <dist>_<meth>_scenediscovery.
    for dist in distances:
        for meth in methods:
            discover.cluster_scenes(dist, meth)
            discover_file = os.path.join(
                pickle_folder,
                dist + '_' + meth + '_' + 'scenediscovery.pickle')
            SceneDiscovery.save_to_file(discover, discover_file)
            logger.info("saved clusters in %s " % discover_file)
    logger.info("Finished running scene discovery")
def inject_classifier(self, replace_old):
    """Inject (or refresh) the UrlClassifier described by the model config.

    The classifier name and target label name are read from the validated
    config file at self.model_path(CFG_FILE).

    Args:
        replace_old: when truthy a classifier with the configured name must
            already exist; otherwise none may exist and one is created

    Returns:
        None

    Assertions:
        Asserts if the target label does not exist, or if the existence of
        the classifier contradicts replace_old.
    """
    # TODO: This seems like it could be generalized for all classifiers
    cfg_obj = validate_config(self.model_path(CFG_FILE), CFG_SPEC)
    clf_name = cfg_obj['classifier_name']
    label = Label.by_name(cfg_obj['target_label_name'])
    assert label
    clf = UrlClassifier.by_name(clf_name)
    if replace_old:
        assert clf, 'UrlClassifier with name %s does not exist!'\
            % clf_name
    else:
        assert not clf, 'UrlClassifier with name %s already exists!'\
            % clf_name
        # create the new classifier
        # NOTE(review): creation is assumed to happen only in this branch
        # (the existing classifier is reused when replacing) - confirm
        # against the canonical indentation.
        clf = UrlClassifier.create(name=clf_name)
    # note that a failure while running the script does not roll back
    # previously inserted models
    self.tar_and_upload(clf)
    clf.updated_at = datetime.utcnow()
    session.flush()
    clf.add_targets([label])
    logger.info('URL classifier injected %s' % clf)
def recognize_judge_video(clf_dir, video_id, imagedir):
    """Run face recognition on every face box of a video.

    Each face box is cropped to a temporary jpg and sent to the data
    processor asynchronously; the answers are then collected and aggregated
    per box and per video.

    Args:
        clf_dir: classifier directory from which the model name is derived
        video_id: id of the video whose face boxes are processed
        imagedir: directory holding the video's frames

    Returns:
        The ResultAggregator's result_dict.

    Assertions:
        Asserts if the video does not exist, or if a predicted label id
        above the confidence threshold has no Label.
    """
    model_name = FaceRecognizeClassifierInjector.get_model_name(clf_dir)
    dp_client = DataProcessorClient(model_name)
    results = {}
    ra = ResultAggregator()
    votes = defaultdict(int)
    video = Video.get(video_id)
    assert video, "Video %s does not exist" % video_id

    for box in video.face_boxes:
        path = time_to_image_path(imagedir, box.timestamp)
        fd, cropped_path = mkstemp(suffix='.jpg')
        os.close(fd)
        try:
            rect = get_rect_to_recognize(box)
            crop_image(path, cropped_path, *rect)
            [bin_data] = convert_files_to_bin([cropped_path])
            # 'async' is a reserved word from Python 3.7 on; passing it via
            # a dict is the same call and parses on every interpreter.
            result = dp_client.predict(bin_data, box.width, box.height,
                                       **{'async': True})
            results[box.id] = result
        finally:
            # The crop is only needed until the request has been built.
            os.remove(cropped_path)

    for box_id, result in results.items():
        label_id, conf, parts = result.wait(timeout=FACE_CELERY_TIMEOUT)
        if conf is None:
            # No confidence means nothing to record; also avoids comparing
            # None with the threshold.
            continue
        ra.add_face_info(box_id, conf, parts)
        if conf > FACE_MIN_CONFIDENCE and label_id is not None:
            assert Label.get(label_id)
            ra.add_box_result(box_id, label_id)
            votes[label_id] += 1

    # A label is reported for the whole video once enough boxes voted for it.
    for label_id, occur in votes.items():
        if occur >= MIN_OCCURENCE_FOR_VIDEO:
            ra.add_video_result(label_id)
    return ra.result_dict
def run_get_training_data(target_label_id, npy_training_info_file=None,
                          old_detector_id=None, max_num_pos_videos=500,
                          ratio_neg_pos=5, excluded_labels_file=None):
    """Build a (video id, label) training array for a target label.

    Appends the given training data (if any) with new available video ids
    from MTurk hits. If no npy file is provided, positives are taken from
    TrainingVideo (when old_detector_id is given) and from MTurk true
    positives; negatives come from MTurk false positives and, if more are
    needed, from videos of other labels.

    Args:
        target_label_id: label id that we want to get training data for
        npy_training_info_file: file with already labeled video ids (same
            format as the output of this function. column 0: video ids;
            column 1: label). Only used when the file exists
        old_detector_id: detector id from a previous version of a similar
            detector; all video_id entries in TrainingVideo with this
            detector_id are added as positive training data. Ignored when
            the npy file is used
        max_num_pos_videos: upper limit on the number of positive video ids
            requested from MTurk results
        ratio_neg_pos: desired ratio between negative and positive data
            (e.g., ratio 10 means #neg = 10 * #pos); extra negatives are
            requested only while fewer than that are available
        excluded_labels_file: npy file with a list of ids from labels that
            we dont want to use as negative training data (e.g., specific
            types of videogames for the videogame classifier)

    Returns:
        numpy array with:
        - first column containing video ids to be used for training
        - second column containing the corresponding label:
            0 - target_label according to MTurk results
            1 - any other label
        Positive rows come first, negative rows after them.

    Raises/Assertions:
        asserts if label_id given as target label does not exist in the DB
    """
    assert Label.get(target_label_id), "Target label does not exist"
    video_id_label_list = []
    positive_vids = []
    negative_vids = []
    new_positive_vids_from_tp = []
    new_negative_vids_from_fp = []
    if npy_training_info_file and os.path.exists(npy_training_info_file):
        # Previously labeled data: column 0 = video id, column 1 = label.
        list_vids_label = np.load(npy_training_info_file)
        all_vids = list_vids_label[:, 0]
        all_labels = list_vids_label[:, 1]
        positive_vids = list(all_vids[all_labels == 0])
        negative_vids = list(all_vids[all_labels > 0])
    elif old_detector_id:
        res = TrainingVideo.query.filter_by(detector_id=old_detector_id)
        positive_vids = [t.video_id for t in res]
    if max_num_pos_videos > len(positive_vids):
        # Top up the positives with MTurk true positives.
        new_positive_vids_from_tp = gtd.get_list_of_videoids(
            target_label_id=target_label_id, target_result=True,
            excluded_label_list=[],
            maxNumVideos=max_num_pos_videos - len(positive_vids))
    # NOTE(review): the false-positive fetch is assumed to run
    # unconditionally (not only when positives are topped up) - confirm
    # against the canonical indentation.
    new_negative_vids_from_fp = gtd.get_list_of_videoids(
        target_label_id=target_label_id, target_result=False,
        excluded_label_list=[], maxNumVideos=max_num_pos_videos)
    positive_vids = np.unique(positive_vids + new_positive_vids_from_tp)
    logger.info('Number of positive ids: %d' % len(positive_vids))
    negative_vids = np.unique(negative_vids + new_negative_vids_from_fp)
    exclusion_label_ids = []
    if len(negative_vids) < ratio_neg_pos * len(positive_vids):
        # Not enough negatives yet: draw videos from other labels, never
        # from the target label or the explicitly excluded ones.
        if excluded_labels_file:
            exclusion_label_ids = list(np.load(excluded_labels_file))
        random_negative_vids = gtd.get_list_of_videoids(
            target_label_id=None,
            excluded_label_list=[target_label_id] + exclusion_label_ids,
            maxNumVideos=len(positive_vids) * ratio_neg_pos -
            len(negative_vids))
        negative_vids = np.unique(
            np.append(negative_vids, random_negative_vids))
    logger.info('Number of negative ids: %d' % len(negative_vids))
    # Assemble the two-column result: ids, then 0 (positive) / 1 (negative).
    video_id_label_list = np.zeros(
        (len(positive_vids) + len(negative_vids), 2))
    video_id_label_list[:, 0] = np.append(positive_vids, negative_vids)
    zeros_arr = np.zeros((len(positive_vids), 1))
    ones_arr = np.ones((len(negative_vids), 1))
    video_id_label_list[:, 1] = np.append(zeros_arr, ones_arr)
    return video_id_label_list
def is_ignored(label_id):
    """Tell whether a label id should be left out of the statistics.

    A label is ignored when its id is listed in LABELID_IGNORE_LIST or when
    the label's label_type is 'flip'.
    """
    if label_id in LABELID_IGNORE_LIST:
        return True
    # Only labels not explicitly listed need the lookup.
    return Label.get(label_id).label_type == 'flip'
def get_list_of_videoids(target_label_id=None, target_result=True,
                         excluded_label_list=None,
                         start_date=datetime(2013, 1, 1), maxNumVideos=100):
    """Return video ids taken from MTurk hits.

    With a target_label_id, the ids belong to hits done for that label:
    True Positives if target_result is True, False Positives if it is
    False. Without a target_label_id, the ids are True Positives of any
    label except those in excluded_label_list.

    Note: maxNumVideos is an upper bound, we may get less video ids, either
    because they do not exist or because there were a lot of duplicated ids
    in the query result.

    Args:
        target_label_id: label id that we want to get, or None to get
            videos of any label (see excluded_label_list)
        target_result: target result we want to get from MTurk (either
            True: True Positives, or False: False Positives). Only used
            together with target_label_id
        excluded_label_list: label ids to leave out when target_label_id
            is None (e.g., if we want negative training for soccer, this
            list will have soccer and sport label ids). Ignored when
            target_label_id is given. None (the default) means no
            exclusions
        start_date: datetime object. Only hits with a timestamp on or after
            this date are considered
        maxNumVideos: maximum number of video ids that we want to get

    Returns:
        list of video ids matching the input parameters requirements
        (unique, in ascending order)

    Raises/Assertions:
        Asserts if start_date is not a datetime or lies in the future, and
        if target_label_id is given but matches no Label
    """
    if excluded_label_list is None:
        excluded_label_list = []
    assert (isinstance(start_date, datetime)
            and start_date <= datetime.today()), \
        "start_date should be a datetime object and can't be in the future"
    # Single lookup, reused for the check and the log message.
    target_label = Label.get(target_label_id) if target_label_id else None
    assert (not target_label_id or target_label), \
        "target_label_id should correspond to a Label id in the DB or be None"

    if target_label_id:
        logger.info("obtaining %d '%s positive' video ids for label %s" %
                    (maxNumVideos, str(target_result), target_label.name))
        queryVideos = VideoHit.query.\
            filter(VideoHit.result == target_result,
                   VideoHit.label_id == target_label_id,
                   VideoHit.timestamp >= start_date).\
            limit(maxNumVideos * EXTRA_RATIO)
    else:
        logger.info(
            'obtaining %d recent video ids from any label except label id in %s'
            % (maxNumVideos, str(excluded_label_list)))
        # "== True" builds a SQL expression here; do not replace it with
        # an identity test.
        queryVideos = VideoHit.query.\
            filter(VideoHit.result == True,
                   VideoHit.timestamp >= start_date)
        if len(excluded_label_list) > 0:
            # NOTE(review): no LIMIT is applied on this path, so the whole
            # filtered result is loaded - confirm this is intended.
            queryVideos = queryVideos.filter(
                ~VideoHit.label_id.in_(excluded_label_list))
        else:
            queryVideos = queryVideos.limit(maxNumVideos * EXTRA_RATIO)

    # Hits can repeat a video; keep unique ids and cap the amount.
    video_ids = np.unique([v.video_id for v in queryVideos])
    return list(video_ids[0:maxNumVideos])
def find_labels_statistics(list_vids):
    """Print label statistics for the pages that host the given videos.

    Prints:
        -Most frequent label
        -Most frequent QAed label
        -Labels for video results sorted based on their frequency
        -Labels for QAed video results sorted based on their frequency

    Args:
        list_vids: List of video ids

    Returns:
        A tuple of two lists of (label id, count) rows, each sorted by
        ascending count: the first from webpage label results, the second
        from video hits whose result is True (QAed). Ignored labels are
        left out of both.
    """
    LABELID_IGNORE_LIST = [
        2853, 2854, 3077, 3078, 3079, 3080, 5639, 7840, 7972, 1054, 3076
    ]

    def is_ignored(label_id):
        if label_id in LABELID_IGNORE_LIST:
            return True
        return Label.get(label_id).label_type == 'flip'

    def clean_up_list(label_ids_list):
        return [row for row in label_ids_list if not is_ignored(row[0])]

    def count_of(row):
        return row[1]

    pages_q = VideoOnPage.query.filter(VideoOnPage.video_id.in_(list_vids))
    page_ids = list(set([p.page_id for p in pages_q]))

    label_ids_from_wpr = []
    label_ids_from_qawpr = []
    if page_ids:
        wpr_rows = session.query(
            WebPageLabelResult.label_id,
            func.count(WebPageLabelResult.label_id)).\
            filter(WebPageLabelResult.page_id.in_(page_ids)).\
            group_by(WebPageLabelResult.label_id).all()
        label_ids_from_wpr = clean_up_list(wpr_rows)

        qa_rows = session.\
            query(VideoHit.label_id, func.count(VideoHit.label_id)).\
            filter(VideoHit.page_id.in_(page_ids)).\
            filter(VideoHit.result == True).\
            group_by(VideoHit.label_id).all()
        label_ids_from_qawpr = clean_up_list(qa_rows)

    # Ascending by count, so the most frequent label is the last row.
    label_ids_from_wpr.sort(key=count_of)
    label_ids_from_qawpr.sort(key=count_of)

    max_label = 'None'
    if label_ids_from_wpr:
        max_label = Label.get(label_ids_from_wpr[-1][0])
    max_label_qa = 'None'
    if label_ids_from_qawpr:
        max_label_qa = Label.get(label_ids_from_qawpr[-1][0])

    print('Most frequent label = %s' % max_label)
    print('Most frequent QAed label = %s' % max_label_qa)

    print('Labels for video results sorted based on their frequency:')
    label_list = [Label.get(row[0]).name for row in label_ids_from_wpr]
    print('\n'.join(label_list[::-1]))

    print('labels for QAed video results sorted based on their frequency:')
    label_list_qa = [Label.get(row[0]).name for row in label_ids_from_qawpr]
    print('\n'.join(label_list_qa[::-1]))

    return (label_ids_from_wpr, label_ids_from_qawpr)