Ejemplo n.º 1
0
def save_boxes(l_id, video_id, boxes):
    """Store text boxes for a video and log a detector result for each one.

    Args:
        l_id: id of the target Label
        video_id: id of the Video the boxes belong to
        boxes: dict mapping a timestamp to a list of rectangles; each
            rectangle is unpacked as (height, width, y, x)

    Returns:
        List holding the value Box.get_or_create returned for every
        rectangle (treated as the box id by this function)

    Raises/Assertions:
        Asserts if the video id does not exist
        Asserts if the label id does not exist
    """
    video = Video.get(video_id)
    label = Label.get(l_id)
    assert video, "Video %s does not exist" % video_id
    assert label, "Label %s does not exist" % l_id
    # Classifier target of the text detector that points at this label.
    targets_for_label = ClassifierTarget.query.filter_by(
        target_label_id=label.id)
    target = targets_for_label.join(TextDetectClassifier).one()
    created_ids = []
    for timestamp, rectangles in boxes.items():
        for height, width, top, left in rectangles:
            new_id = Box.get_or_create(x=left,
                                       y=top,
                                       width=width,
                                       height=height,
                                       video=video,
                                       timestamp=timestamp,
                                       box_type='Text')
            created_ids.append(new_id)
            BoxDetectorResult.log_result(new_id, target.id)
    return created_ids
def run_training_pipeline(config, pos_file, neg_file, precision_thrsh,
                          recall_thrsh, det_name, target_label_id,
                          video_threshold):
    """Build the training set for a spatial scene classifier and pass it,
    together with the thresholds, to evaluate_and_inject.

    Args:
        config: Path to config file.
        pos_file: Path to image file specifying positive images.
        neg_file: Path to image file specifying negative images.
        precision_thrsh: Cross-validation precision that must be achieved.
        recall_thrsh: Cross-validation recall that must be achieved.
        det_name: Name of detector to be created.
        target_label_id: Detector's target label id.
        video_threshold: Detector's video_threshold (the # images the detector
                         must fire on to create a VDR)

    Raises:
        AssertionError: Target label doesn't exist.
    """
    logger.info("Running training pipeline...")
    positives = read_image_file(pos_file)
    negatives = read_image_file(neg_file)
    all_images = positives + negatives
    # One label per image, in the same order as all_images.
    all_labels = [POS_LABEL] * len(positives)
    all_labels += [NEG_LABEL] * len(negatives)
    classifier = SpatialSceneClassifier(config)
    label = Label.get(target_label_id)
    assert label
    evaluate_and_inject(classifier, all_images, all_labels, precision_thrsh,
                        recall_thrsh, det_name, label, video_threshold)
    def inject_detector(self, detector_name, label_id, pred_thresh=None):
        """Create a detector of type self.classifier_cls, attach its target
        label and upload its model files via self.tar_and_upload.

        Args:
            detector_name: string with detector name
                (it has to be unique because this detector can't be replaced)
            label_id: int, target label id of the detector
            pred_thresh: set float threshold for word detection

        Returns:
            clf: the created classifier object

        Raise/Assertions:
            This function asserts if the label_id does not correspond to any
            existing label and if the detector_name already exists in the db
        """
        label = Label.get(label_id)
        # %s (not %d) so that a missing label reported for a non-integer id
        # (e.g. None or a string) still raises the AssertionError instead of
        # a TypeError from the message formatting.
        assert label, \
            'Label id %s does not correspond to any Label!' % (label_id,)
        clf = self.classifier_cls.by_name(detector_name)
        assert not clf, '%s with name %s already exists!'\
            % (self.classifier_cls.__name__, detector_name)

        clf = self.classifier_cls(name=detector_name, pred_thresh=pred_thresh)
        session.flush()
        clf.add_targets([label])
        self.tar_and_upload(clf)
        logger.info('%s detector injected %s' %
                    (self.classifier_cls.__name__, clf))
        return clf
Ejemplo n.º 4
0
    def inject(self, clf_name, target_label_id_photo, target_label_id_sshow):
        '''
        Create a StaticVideoClassifier with two target labels and upload
        its model files.

        Params:
            clf_name: Name of classifier created.
            target_label_id_photo: Classifier's target label for photo videos
            target_label_id_sshow: target label for slideshow videos

        Returns:
            The newly created classifier.

        Assertions:
            Asserts if either label id has no Label, or if a
            StaticVideoClassifier named clf_name already exists.
        '''
        labels = []
        for label_id in (target_label_id_photo, target_label_id_sshow):
            label = Label.get(label_id)
            assert label, \
                'Label id %s does not correspond to any Label!' % label_id
            labels.append(label)

        existing = StaticVideoClassifier.by_name(clf_name)
        assert not existing, \
            'StaticVideoClassifier with name %s already exists!' % clf_name

        new_clf = StaticVideoClassifier(name=clf_name)
        session.flush()

        new_clf.add_targets(labels)
        self.tar_and_upload(new_clf)
        return new_clf
Ejemplo n.º 5
0
 def inject_model(self):
     """Create a UrlModel named after the validated config and upload it.

     Reads 'model_name' and 'target_label_name' from the config file at
     self.model_path(CFG_FILE). Asserts that the target label exists.
     """
     cfg_path = self.model_path(CFG_FILE)
     cfg_obj = validate_config(cfg_path, CFG_SPEC)
     model_name = cfg_obj['model_name']
     target_label_name = cfg_obj['target_label_name']
     # The label is only checked for existence here; this method does not
     # attach it to the model.
     assert Label.by_name(target_label_name)
     model = UrlModel.create(name=model_name)
     self.tar_and_upload(model)
     logger.info('URL model injected %s' % model)
def check_args(args):
    """Assert that the parsed command-line arguments are usable.

    Checks, in order: the three input paths exist, no SpatialSceneDetector
    is already registered under args.det_name, the target label exists,
    the video threshold is positive, and both precision and recall
    thresholds lie in [0, 1]. The first failing check raises AssertionError.
    """
    for path_attr in ('pos_file', 'neg_file', 'config'):
        assert os.path.exists(getattr(args, path_attr))
    assert not SpatialSceneDetector.by_name(args.det_name)
    assert Label.get(args.target_label_id)
    assert args.video_threshold > 0
    for ratio_attr in ('precision_thrsh', 'recall_thrsh'):
        assert 0 <= getattr(args, ratio_attr) <= 1
Ejemplo n.º 7
0
    def inject_detector(self,
                        detector_name,
                        list_label_ids,
                        true_vid_list=None):
        """
        Create a detector of type self.detector_cls, attach its target
        labels, upload its model files via self.tar_and_upload and, when
        given, save the video ids used as positive training data.

        Args:
            detector_name: string with detector name
                (it has to be unique because this detector can't be replaced)
            list_label_ids: iterable of target label ids; ids without a
                matching Label are skipped and duplicates are dropped
            true_vid_list: list of ints with ids of the videos used
                as positive training data (optional)
        Returns:
            det: the created detector object

        Raise/Assertions:
            This function asserts if none of the label ids corresponds to an
            existing label and if the detector_name already exists in the db
        """
        # Look every id up exactly once and keep first-seen order, so the
        # resulting target list is deterministic.
        target_label_list = []
        for label_id in list_label_ids:
            label = Label.get(label_id)
            if label and label not in target_label_list:
                target_label_list.append(label)
        assert target_label_list,\
            "Target label list needs at least one Label that exists in the DB"

        det = self.detector_cls.by_name(detector_name)
        assert not det, 'Cnn Classifier with name %s already exists!'\
            % detector_name

        det = self.detector_cls(name=detector_name)
        session.flush()
        det.add_targets(target_label_list)
        self.tar_and_upload(det)
        logger.info('CnnClassifier detector injected %s' % det)

        if true_vid_list:
            save_training_videos(det.id, true_vid_list)

        return det
Ejemplo n.º 8
0
def main():
    """Parse the command line and run the NER training pipeline.

    Expects four positional arguments: label_id, true_pid_file,
    false_pid_file and op_dir. Asserts that the label id resolves to a Label.
    """
    cli = argparse.ArgumentParser()
    positional_args = (
        ('label_id', 'Label id'),
        ('true_pid_file', 'File with one True page id per line'),
        ('false_pid_file', 'File with one False page id per line'),
        ('op_dir', 'Directory to put trained models in'),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()
    assert Label.get(options.label_id), 'Invalid label id'
    NerTrainer(options.label_id).run_pipeline(options)
def process_page(page):
    """Detect the language of a page's title and text and save it as a
    LanguageDetector result.

    Args:
        page: object exposing `id` and `title_and_text`

    Returns:
        The Label whose name matches the detected language.

    Assertions:
        Asserts if no Label exists for the detected language name.
    """
    logger.info("Detecting language for page: %d" % page.id)
    detected_name = LanguageDetector.detect_language(page.title_and_text)
    label = Label.by_name(detected_name)
    assert label is not None, "Label %s does not exist" % detected_name

    detector = LanguageDetector.query.one()
    # Remove this detector's earlier results for the page before saving
    # the new one.
    LanguageDetector.delete_detector_results(page, [detector.id])
    detector.save_result(page.id, label.id)

    return label
Ejemplo n.º 10
0
 def inject(self, clf_name, target_label_ids, training_box_ids):
     """Create a FaceRecognizeClassifier, attach its targets and training
     boxes, then upload its model files.

     'target_label_ids' and 'training_box_ids' should be sets.

     Returns:
         The created classifier.

     Assertions:
         Asserts if any label id or any box id does not resolve.
     """
     labels = [Label.get(label_id) for label_id in target_label_ids]
     assert all(labels), "Bad target label id"
     # Boxes are fetched only to validate the ids; the ids themselves are
     # what gets stored below.
     boxes = [Box.get(candidate_id) for candidate_id in training_box_ids]
     assert all(boxes), "Bad training box id"
     classifier = FaceRecognizeClassifier.create(name=clf_name)
     classifier.add_targets(labels)
     for training_id in training_box_ids:
         TrainingBox(detector_id=classifier.id, box_id=training_id)
     session.flush()
     self.tar_and_upload(classifier)
     return classifier
Ejemplo n.º 11
0
 def validate_config_file(cls, config_file):
     """Load config_file against cls.CFG_SPEC and check its domain values.

     Returns:
         The validated ConfigObj.

     Raises:
         Exception: the ConfigObj validation did not return True.
         AssertionError: the freebase domain/types or the label id are
             not recognised.
     """
     spec_lines = cls.CFG_SPEC.split('\n')
     config_obj = ConfigObj(config_file, configspec=spec_lines)
     outcome = config_obj.validate(Validator(), copy=True,
                                   preserve_errors=True)
     # Compared against True explicitly: anything else is reported as a
     # validation failure, even if it is truthy.
     if outcome != True:
         raise Exception('Config file validation failed: %s' % outcome)
     fb_filter = FreebaseFilter()
     checks = (
         (fb_filter.is_domain, 'domain', "Invalid freebase domain: %s"),
         (fb_filter.is_type, 'fb_person', "Invalid freebase type: %s"),
         (fb_filter.is_type, 'fb_org', "Invalid freebase type: %s"),
         (Label.get, 'label_id', "Invalid label id: %s"),
     )
     for is_valid, key, message in checks:
         assert is_valid(config_obj[key]), message % config_obj[key]
     return config_obj
def load_training_config(configfile_name):
    """
    Load the part of the config file relative to training of this classifier

    Args:
        configfile_name: path handed to get_config together with
            TRAINCNNCLF_CFG_SPEC

    Returns:
        The 'training_folders' entry of the 'train_params' section, or an
        empty list when get_config returns nothing.

    Assertions:
        Asserts if any target label id listed in the config has no Label.
    """
    cfg_obj = get_config(
        configfile_name, spec=TRAINCNNCLF_CFG_SPEC.split('\n'))
    if not cfg_obj:
        return []

    train_params = cfg_obj['train_params']
    training_folders = train_params['training_folders']
    # Only every second entry (odd positions) of 'target_label_ids' is
    # treated as a label id.
    for l_id in train_params['target_label_ids'][1::2]:
        # %s (not %d): ids read from a config file may be strings, and %d
        # would then raise TypeError instead of the AssertionError.
        assert Label.get(l_id), \
            " Target label id %s does not exist in the DB" % (l_id,)

    return training_folders
Ejemplo n.º 13
0
    def inject(self, clf_name, target_label_ids):
        '''
        Injects classifier and classifier targets to DB.

        Params:
            clf_name: Classifier's name.
            target_label_ids: The target label ids classifier's 'predict' gives

        Returns:
            Classifier.

        Assertions:
            Asserts if any target label id does not resolve to a Label.
        '''
        # Resolve and validate the labels first, so that a bad id aborts
        # before create() is ever called for the classifier.
        target_labels = [Label.get(l_id) for l_id in target_label_ids]
        assert all(target_labels), "Bad target label id"
        clf = self._clf_cls.create(name=clf_name)
        clf.add_targets(target_labels)
        self.tar_and_upload(clf)
        return clf
    def inject_detector(self,
                        detector_name,
                        label_id,
                        true_vid_list,
                        confidence_th=None,
                        acceptance_th=None):
        """
        Create a VideoMotionColorDetector, attach its target label, upload
        its model files via self.tar_and_upload and save the video ids used
        as positive training data.

        Args:
            detector_name: string with detector name
                (it has to be unique because this detector can't be replaced)
            label_id: int, target label id of the detector
            true_vid_list: list of ints that correspond to the ids of the videos
                used as positive training data
            confidence_th: optional value stored on det.confidence_th
            acceptance_th: optional value stored on det.acceptance_th
                (the two thresholds are only applied when both are given)
        Returns:
            det: the created VideoMotionColorDetector object

        Raise/Assertions:
            This function asserts if the label_id does not correspond to any
            existing label and if the detector_name already exists in the db
        """
        label = Label.get(label_id)
        # Identity check against None, and %s instead of %d so that a
        # non-integer id still produces the AssertionError rather than a
        # TypeError while building the message.
        assert label is not None, \
            'Label id %s does not correspond to any Label!' % (label_id,)

        det = VideoMotionColorDetector.by_name(detector_name)
        assert not det, 'VideoMotionColorDetector with name %s already exists!'\
            % detector_name

        det = VideoMotionColorDetector(name=detector_name)
        # NOTE(review): truthiness test means a threshold of 0 is treated
        # like "not given" -- confirm whether 0 is a legitimate value.
        if confidence_th and acceptance_th:
            det.confidence_th = confidence_th
            det.acceptance_th = acceptance_th
        session.flush()
        det.add_targets([label])
        self.tar_and_upload(det)

        logger.info('VideoMotionColorDetector detector injected %s' % det)
        save_training_videos(det.id, true_vid_list)

        return det
Ejemplo n.º 15
0
    def inject_detector(self, detector_name, label_id, replace_old,
                        true_pid_file):
        """Create or refresh a NerDetector, upload its model files and save
        its training pages.

        Args:
            detector_name: name of the detector
            label_id: id of the target label (must exist)
            replace_old: when true, a detector with this name must already
                exist and is reused; otherwise it must not exist and a new
                one is created with the label as its target
            true_pid_file: passed through to save_training_pages

        Assertions:
            Asserts if the label is missing, or if the detector's existence
            does not match replace_old.
        """
        label = Label.get(label_id)
        assert label is not None, "Label with id %s does not exist" % label_id
        detector = NerDetector.by_name(detector_name)
        if not replace_old:
            assert not detector, 'NerDetector with name %s already exists!'\
                % detector_name
            # create the new detector
            detector = NerDetector(name=detector_name)
            session.flush()
            detector.add_targets([label])
        else:
            assert detector, 'NerDetector with name %s does not exist!'\
                % detector_name

        self.tar_and_upload(detector)
        detector.updated_at = datetime.utcnow()
        session.flush()
        logger.info('NER detector injected %s' % detector)
        save_training_pages(detector.id, detector.updated_at, true_pid_file)
Ejemplo n.º 16
0
def mturk_submission_only(super_config, all_urls, hit_type=ImageHit):
    """
    Read from the video info that corresponds to the injected urls
    and submit the required hits to mturk

    Args:
        super_config: config object with submission params.
            It is expected to at least contain:
            super_config['model_to_db_injection']['target_label']
            super_config['mturk_submission_params']['mturk_question']
        all_urls: list of strings with urls
        hit_type: Type of MTurk hit (Only ImageHit and VideoHit are supported)

    Returns:
        job: MTurkImageJob object created, or None when there was no data
            to submit
        hit_type: type of submitted hits as reported by job.submit_hits
            (the hit_type argument when nothing was submitted)
        num_hits_submitted: int, number of submitted hits (0 when nothing
            was submitted)

    Assertions:
        Asserts if hit_type is neither ImageHit nor VideoHit.
    """
    assert (hit_type == ImageHit or hit_type == VideoHit), \
        "Only ImageHit or VideoHit are valid types for hit_type param"

    label_name = super_config['model_to_db_injection']['target_label']
    question = super_config['mturk_submission_params']['mturk_question']

    evaluator_type = HIT_TYPE_TO_EVALUATOR_TYPE[hit_type]
    if hit_type == ImageHit:
        hit_data = get_images(all_urls)
    else:
        hit_data = get_vids(all_urls)

    job = None
    # Defined up front so the return below is valid even when there is
    # nothing to submit (previously an UnboundLocalError).
    num_hits_submitted = 0
    if len(hit_data) > 0:
        label = Label.get_or_create(label_name)
        job = MTurkImageJob(label.id,
                            question=question,
                            evaluator_type=evaluator_type)
        hit_type, num_hits_submitted = job.submit_hits(hit_data)

    return job, hit_type, num_hits_submitted
def discover_scenes(config_file, folder, label_id):
    """Run scene discovery and pickle the result for several clusterings.

    Args:
        config_file: configuration file where all the descriptor and video
            parameters are defined
        folder: working directory; 'pickles', 'images' and 'features'
            sub-directories are created inside it when missing
        label_id: id of the label to retrieve videos for; when falsy,
            videos are ingested through the discovery object's
            youtube/url methods instead

    Raises:
        ValueError: label_id is given but is not present in the DB
    """
    logger.info("starting scene discovery process")
    discover = SceneDiscovery(config_file, folder)
    pickle_folder, image_folder, feature_folder = [
        os.path.join(folder, sub_dir)
        for sub_dir in ('pickles', 'images', 'features')
    ]
    for needed_dir in (pickle_folder, image_folder, feature_folder):
        if not os.path.exists(needed_dir):
            os.makedirs(needed_dir)

    if not label_id:
        discover.ingest_videos_youtube()
        discover.get_video_from_urls()
    else:
        if not Label.get(label_id):
            raise ValueError("Label id not found")
        discover.get_videos_from_inventory(label_id)

    discover.download_data(image_folder)
    discover.compute_descriptors(feature_folder)
    # Snapshot taken before any clustering is run.
    snapshot_path = os.path.join(pickle_folder, 'scenediscovery.pickle')
    SceneDiscovery.save_to_file(discover, snapshot_path)

    # One extra snapshot per (distance, linkage method) combination.
    for dist in ('intersection', 'chisqr'):
        for meth in ('single', 'ward', 'complete'):
            discover.cluster_scenes(dist, meth)
            snapshot_path = os.path.join(
                pickle_folder,
                '_'.join((dist, meth, 'scenediscovery.pickle')))
            SceneDiscovery.save_to_file(discover, snapshot_path)
            logger.info("saved clusters in %s " % snapshot_path)
    logger.info("Finished running scene discovery")
Ejemplo n.º 18
0
 def inject_classifier(self, replace_old):
     """Create or refresh the UrlClassifier described by the model config.

     Reads 'classifier_name' and 'target_label_name' from the config at
     self.model_path(CFG_FILE), uploads the model files and attaches the
     target label.

     Args:
         replace_old: when true, the classifier must already exist and is
             reused; otherwise it must not exist and is created

     Assertions:
         Asserts if the target label is missing, or if the classifier's
         existence does not match replace_old.
     """
     # TODO: This seems like it could be generalized for all classifiers
     cfg_path = self.model_path(CFG_FILE)
     cfg_obj = validate_config(cfg_path, CFG_SPEC)
     clf_name = cfg_obj['classifier_name']
     label = Label.by_name(cfg_obj['target_label_name'])
     assert label
     classifier = UrlClassifier.by_name(clf_name)
     if not replace_old:
         assert not classifier, 'UrlClassifier with name %s already exists!'\
             % clf_name
         # create the new classifier
         classifier = UrlClassifier.create(name=clf_name)
     else:
         assert classifier, 'UrlClassifier with name %s does not exist!'\
             % clf_name
     # note that failures above while running the script does not roll back
     # previously inserted models
     self.tar_and_upload(classifier)
     classifier.updated_at = datetime.utcnow()
     session.flush()
     classifier.add_targets([label])
     logger.info('URL classifier injected %s' % classifier)
Ejemplo n.º 19
0
def recognize_judge_video(clf_dir, video_id, imagedir):
    model_name = FaceRecognizeClassifierInjector.get_model_name(clf_dir)
    dp_client = DataProcessorClient(model_name)

    results = {}
    ra = ResultAggregator()
    votes = defaultdict(int)
    video = Video.get(video_id)
    assert video
    for box in video.face_boxes:
        path = time_to_image_path(imagedir, box.timestamp)
        fd, cropped_path = mkstemp(suffix='.jpg')
        os.close(fd)
        try:
            rect = get_rect_to_recognize(box)
            crop_image(path, cropped_path, *rect)
            [bin_data] = convert_files_to_bin([cropped_path])
            result = dp_client.predict(bin_data,
                                       box.width,
                                       box.height,
                                       async=True)
            results[box.id] = result
        finally:
            os.remove(cropped_path)
    for box_id, result in results.iteritems():
        label_id, conf, parts = result.wait(timeout=FACE_CELERY_TIMEOUT)
        if conf is not None:
            ra.add_face_info(box_id, conf, parts)
            if conf > FACE_MIN_CONFIDENCE and label_id is not None:
                assert Label.get(label_id)
                ra.add_box_result(box_id, label_id)
                votes[label_id] += 1
    for label_id, occur in votes.iteritems():
        if occur >= MIN_OCCURENCE_FOR_VIDEO:
            ra.add_video_result(label_id)
    return ra.result_dict
Ejemplo n.º 20
0
def run_get_training_data(target_label_id,
                          npy_training_info_file=None,
                          old_detector_id=None,
                          max_num_pos_videos=500,
                          ratio_neg_pos=5,
                          excluded_labels_file=None):
    """
    Build a (video id, label) training table for a target label.

    Existing training data (if any) is extended with video ids obtained
    through gtd.get_list_of_videoids. If no usable npy file is provided,
    positives are taken from TrainingVideo rows of old_detector_id (when
    given).

    Args:

        target_label_id: label id that we want to get training data for

        npy_training_info_file: file with already labeled video ids
            (same format as this function's output. column 0: video ids;
            column 1: label). Ignored when the path does not exist.

        old_detector_id: detector id from a previous version of a similar
            detector; every video_id in TrainingVideo with this detector_id
            is added as positive training data. Only used when no npy file
            was loaded.

        max_num_pos_videos: upper limit used when requesting additional
            positive video ids (and when requesting false positives)

        ratio_neg_pos: wanted ratio between negative and positive data
            (e.g., ratio 10 means #neg = 10 * #pos); random negatives are
            requested when fewer negatives than that are available

        excluded_labels_file: npy file with a list of ids from labels
            that we dont want to use as negative training data
            (e.g., specific types of videogames for the videogame classifier)

    Returns:
        numpy array with:
        - first column containing video ids to be used for training
        - second column containing the corresponding label:
            0 - target label (positive), 1 - any other label (negative)

    Raises/Assertions:
        asserts if label_id given as target label does not exist in the DB
    """
    assert Label.get(target_label_id), "Target label does not exist"

    known_pos = []
    known_neg = []
    if npy_training_info_file and os.path.exists(npy_training_info_file):
        labelled = np.load(npy_training_info_file)
        vid_column = labelled[:, 0]
        label_column = labelled[:, 1]
        known_pos = list(vid_column[label_column == 0])
        known_neg = list(vid_column[label_column > 0])
    elif old_detector_id:
        training_rows = TrainingVideo.query.filter_by(
            detector_id=old_detector_id)
        known_pos = [row.video_id for row in training_rows]

    # Top up the positives only when we are still below the limit.
    fresh_pos = []
    if max_num_pos_videos > len(known_pos):
        fresh_pos = gtd.get_list_of_videoids(
            target_label_id=target_label_id,
            target_result=True,
            excluded_label_list=[],
            maxNumVideos=max_num_pos_videos - len(known_pos))

    # False positives for the target label are always requested.
    fresh_neg = gtd.get_list_of_videoids(
        target_label_id=target_label_id,
        target_result=False,
        excluded_label_list=[],
        maxNumVideos=max_num_pos_videos)

    positive_vids = np.unique(known_pos + fresh_pos)
    logger.info('Number of positive ids: %d' % len(positive_vids))

    negative_vids = np.unique(known_neg + fresh_neg)
    wanted_negatives = ratio_neg_pos * len(positive_vids)
    if len(negative_vids) < wanted_negatives:
        exclusion_label_ids = []
        if excluded_labels_file:
            exclusion_label_ids = list(np.load(excluded_labels_file))
        # Fill the gap with videos of any label outside the exclusion list.
        random_negative_vids = gtd.get_list_of_videoids(
            target_label_id=None,
            excluded_label_list=[target_label_id] + exclusion_label_ids,
            maxNumVideos=wanted_negatives - len(negative_vids))
        negative_vids = np.unique(
            np.append(negative_vids, random_negative_vids))
    logger.info('Number of negative ids: %d' % len(negative_vids))

    num_pos = len(positive_vids)
    num_neg = len(negative_vids)
    video_id_label_list = np.zeros((num_pos + num_neg, 2))
    video_id_label_list[:, 0] = np.append(positive_vids, negative_vids)
    # Column 1 starts as zeros (positives); mark the trailing rows as
    # negatives.
    video_id_label_list[num_pos:, 1] = 1

    return video_id_label_list
 def is_ignored(label_id):
     """Return True when the label id is in LABELID_IGNORE_LIST or its
     Label has label_type 'flip'; False otherwise."""
     if label_id in LABELID_IGNORE_LIST:
         return True
     # The label is only looked up when the id is not already ignored.
     return Label.get(label_id).label_type == 'flip'
Ejemplo n.º 22
0
def get_list_of_videoids(target_label_id=None,
                         target_result=True,
                         excluded_label_list=[],
                         start_date=datetime(2013, 1, 1),
                         maxNumVideos=100):
    """
    Return video ids taken from VideoHit rows (MTurk hits).

    With a target_label_id, rows for that label whose result equals
    target_result are used (True: true positives, False: false positives).
    Without one, rows with a True result for any label outside
    excluded_label_list are used.

    Note: maxNumVideos is an upper bound; fewer ids may be returned, either
    because they do not exist or because the query result contained many
    duplicated ids.

    Args:

        target_label_id: label id that we want to get, or None for
            "any label except the excluded ones"

        target_result: target result we want to get from MTurk
            (either True: True Positives, or False: False Positives).
            Only used when target_label_id is given.

        excluded_label_list: label ids to leave out when target_label_id
            is None (e.g., if we want negative training for soccer,
            excluded_label_list will have soccer and sport labels ids).
            The list is only read, never modified.

        start_date: datetime object; only hits with a timestamp on or
            after this date are considered

        maxNumVideos: maximum number of video ids that we want to get

    Returns:
        list of at most maxNumVideos unique video ids

    Raises/Assertions:
        asserts if start_date is not a datetime or lies in the future
        asserts if target_label_id is given but has no Label in the DB
    """
    assert (type(start_date) == datetime and start_date <= datetime.today()), \
        "start_date should be a datetime object and can't be in the future"

    assert (not target_label_id or Label.get(target_label_id)), \
        "target_label_id should correspond to a Label id in the DB or be None"

    if not target_label_id:
        logger.info(
            'obtaining %d recent video ids from any label except label id in %s'
            % (maxNumVideos, str(excluded_label_list)))

        # "== True" is deliberate: it builds a query expression.
        hits = VideoHit.query.filter(VideoHit.result == True,
                                     VideoHit.timestamp >= start_date)

        # The row limit is only applied when nothing is excluded.
        if len(excluded_label_list) > 0:
            hits = hits.filter(~VideoHit.label_id.in_(excluded_label_list))
        else:
            hits = hits.limit(maxNumVideos * EXTRA_RATIO)
    else:
        label_name = Label.get(target_label_id).name
        logger.info("obtaining %d '%s positive' video ids for label %s" %
                    (maxNumVideos, str(target_result), label_name))

        matching = VideoHit.query.filter(
            VideoHit.result == target_result,
            VideoHit.label_id == target_label_id,
            VideoHit.timestamp >= start_date)
        hits = matching.limit(maxNumVideos * EXTRA_RATIO)

    # Deduplicate, then cut down to the requested maximum.
    unique_ids = np.unique([hit.video_id for hit in hits])
    return list(unique_ids[0:maxNumVideos])
def find_labels_statistics(list_vids):
    """
    Gets a list of videos and prints the following statistics about it:
    -Most frequent label
    -Most frequent QAed label
    -Labels for video results sorted based on their frequency:
    -Labels for QAed video results sorted based on their frequency
    Args:
        List of video ids
    Returns:
        A tuple of webpage label ids and webpage label ids that have been QAed
    """
    pages_q = VideoOnPage.query.filter(VideoOnPage.video_id.in_(list_vids))
    page_ids = [p.page_id for p in pages_q]
    page_ids = list(set(page_ids))
    LABELID_IGNORE_LIST = [
        2853, 2854, 3077, 3078, 3079, 3080, 5639, 7840, 7972, 1054, 3076
    ]

    def is_ignored(label_id):
        if label_id in LABELID_IGNORE_LIST or Label.get(
                label_id).label_type == 'flip':
            return True
        return False

    def clean_up_list(label_ids_list):
        return filter(lambda item: not is_ignored(item[0]), label_ids_list)

    label_ids_from_wpr = []
    if page_ids:
        label_ids_from_wpr = session.query(WebPageLabelResult.label_id,
                                     func.count(WebPageLabelResult.label_id)).\
            filter(WebPageLabelResult.page_id.in_(page_ids)).group_by(
                WebPageLabelResult.label_id).all()
    label_ids_from_wpr = clean_up_list(label_ids_from_wpr)

    label_ids_from_qawpr = []
    if page_ids:
        label_ids_from_qawpr = session.\
            query(VideoHit.label_id,
                  func.count(VideoHit.label_id)).\
            filter(VideoHit.page_id.in_(page_ids)).\
            filter(VideoHit.result == True).group_by(
                VideoHit.label_id).all()

    label_ids_from_qawpr = clean_up_list(label_ids_from_qawpr)

    label_ids_from_wpr.sort(key=lambda item: item[1])
    label_ids_from_qawpr.sort(key=lambda item: item[1])

    max_label = 'None'
    if label_ids_from_wpr:
        max_label = Label.get(label_ids_from_wpr[-1][0])
    max_label_qa = 'None'
    if label_ids_from_qawpr:
        max_label_qa = Label.get(label_ids_from_qawpr[-1][0])
    print 'Most frequent label = %s' % max_label
    print 'Most frequent QAed label = %s' % max_label_qa

    print 'Labels for video results sorted based on their frequency:'
    label_list = [Label.get(items[0]).name for items in label_ids_from_wpr]
    label_list.reverse()
    print '\n'.join(label_list)
    print 'labels for QAed video results sorted based on their frequency:'
    label_list_qa = [
        Label.get(items[0]).name for items in label_ids_from_qawpr
    ]
    label_list_qa.reverse()
    print '\n'.join(label_list_qa)
    return (label_ids_from_wpr, label_ids_from_qawpr)