# Module-level imports these snippets rely on.  The project-local helpers
# _lf, file_parse, fetch_clusters_from_hdfs and fetch_assignments_from_hdfs
# are assumed to be defined elsewhere in the same module; a sketch of _lf is
# given after run_compute_kernels below.
import glob
import json
import os
import pickle
import tempfile
import zlib

import hadoopy
import imfeat
import picarus
import viderator


def run_compute_kernels(hdfs_input,
                        hdfs_output,
                        local_labels_x,
                        local_labels_y,
                        cols_per_chunk=1000,
                        rows_per_chunk=100000,
                        **kw):
    if local_labels_y is None or local_labels_x is None:
        raise ValueError('local_labels_* must not be None!')
    cmdenvs = [
        'LOCAL_LABELS_FN_Y=%s' % os.path.basename(local_labels_y),
        'ROWS_PER_CHUNK=%d' % rows_per_chunk,
        'COLS_PER_CHUNK=%d' % cols_per_chunk
    ]
    files = [local_labels_y]
    cmdenvs.append('LOCAL_LABELS_FN_X=%s' % os.path.basename(local_labels_x))
    files.append(local_labels_x)
    picarus._launch_frozen(
        hdfs_input,
        hdfs_output,
        _lf('compute_kernels.py'),
        cmdenvs=cmdenvs,
        partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
        jobconfs_default=[
            'mapred.task.timeout=6000000',
            'mapred.text.key.partitioner.options=-k1,2'
        ],
        files=files,
        **kw)
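# Minimal sketch of the project-local _lf() helper used throughout these
# snippets (an assumption about its behavior, not the project's actual code):
# it resolves a bundled script or data file to an absolute path next to this
# module.  file_parse.load/dump are likewise assumed to (de)serialize Python
# objects to/from a local file chosen by its extension.
def _lf(fn):
    # Return the absolute path of `fn` relative to this module's directory.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), fn)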
def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output, classifier_name, classifier_extra, local_labels, classifier, **kw):
    """
    TODO Finish docstring
    Args:
        hdfs_output: Path to hdfs temporary output or None if execution should be performed locally using hadoopy.launch_local.
    """
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    if hdfs_output is None:
        j = hadoopy.launch_local(hdfs_input_pos, None, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in j['output']), [])
        j = hadoopy.launch_local(hdfs_input_neg, None, _lf('collect_keys.py'))
        neg_keys = sum((x[1] for x in j['output']), [])
    else:
        hdfs_output_pos = hdfs_output + '/pos'
        hdfs_output_neg = hdfs_output + '/neg'
        picarus._launch_frozen(hdfs_input_pos, hdfs_output_pos, _lf('collect_keys.py'))
        picarus._launch_frozen(hdfs_input_neg, hdfs_output_neg, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
        neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])

    labels[classifier_name] = {'labels': {'1': pos_keys, '-1': neg_keys},
                               'classifier': classifier,
                               'classifier_extra': classifier_extra}
    file_parse.dump(labels, local_labels)
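# Hypothetical usage sketch (the paths, classifier name and 'svmlinear'
# classifier choice below are made-up illustrations, not values from the
# project): collect positive/negative keys into the local labels file, then
# train on precomputed features.  The labels file written above maps
# classifier_name -> {'labels': {'1': [...], '-1': [...]},
#                     'classifier': ..., 'classifier_extra': ...}.
def example_label_and_train():
    run_classifier_labels('hdfs://host/data/faces_pos',
                          'hdfs://host/data/faces_neg',
                          'hdfs://host/tmp/label_keys',
                          'face', '', 'labels.js', 'svmlinear')
    run_train_classifier('hdfs://host/data/features',
                         'hdfs://host/out/classifiers', 'labels.js')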
def run_image_clean(hdfs_input, hdfs_output, max_side=None, filter_side=None, **kw):
    cmdenvs = {}
    if max_side is not None:
        cmdenvs['MAX_SIDE'] = max_side
    if filter_side is not None:
        cmdenvs['FILTER_SIDE'] = filter_side
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('image_clean.py'),
                           cmdenvs=cmdenvs, **kw)
def make_thumbnails(hdfs_input, hdfs_output, thumb_size, image_type, **kw):
    script = 'make_thumbnails.py'
    picarus._launch_frozen(
        hdfs_input,
        hdfs_output,
        _lf(script),
        cmdenvs=['THUMB_SIZE=%d' % thumb_size,
                 'IMAGE_TYPE=%s' % image_type])
def run_face_finder(hdfs_input, hdfs_output, image_length, boxes, image_hashes=None, **kw):
    cmdenvs = ['IMAGE_LENGTH=%d' % image_length]
    if boxes:
        cmdenvs.append('OUTPUT_BOXES=True')
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('face_finder.py'), reducer=None,
                           cmdenvs=cmdenvs,
                           files=[_lf('data/haarcascade_frontalface_default.xml')],
                           image_hashes=image_hashes)
def run_video_features(hdfs_input, hdfs_output, **kw):
    fp = viderator.freeze_ffmpeg()
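    # freeze_ffmpeg() is used as a context manager: __enter__() yields the
    # path of a frozen ffmpeg build to ship with the job, and passing the
    # manager as dummy_arg keeps a reference alive (presumably so its
    # temporary files are not cleaned up before the job launches).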
    picarus._launch_frozen(hdfs_input, hdfs_output + '/features', _lf('video_combined_features.py'),
                           cmdenvs=[],
                           jobconfs=['mapred.child.java.opts=-Xmx512M',
                                     'mapred.task.timeout=12000000',
                                     'mapred.map.max.attempts=10'],
                           files=[fp.__enter__(), _lf('data/haarcascade_frontalface_default.xml')],
                           dummy_arg=fp)
def run_train_classifier(hdfs_input, hdfs_output, local_labels, **kw):
    import classipy
    # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    files.append(local_labels)
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('train_classifier.py'),
                           files=files,
                           cmdenvs=['LOCAL_LABELS_FN=%s' % os.path.basename(local_labels)],
                           jobconfs_default=['mapred.task.timeout=6000000'],
                           **kw)
def run_image_feature_point(hdfs_input, hdfs_output, feature, image_length=None, image_height=None, image_width=None, **kw):
    if image_length:
        image_height = image_width = image_length
    if image_height is None or image_width is None:
        raise ValueError('Please specify image_height/image_width or image_length')
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('feature_point_compute.py'),
                           cmdenvs=['IMAGE_HEIGHT=%d' % image_height,
                                    'IMAGE_WIDTH=%d' % image_width,
                                    'FEATURE=%s' % feature],
                           files=[_lf('data/eigenfaces_lfw_cropped.pkl')] + glob.glob(imfeat.__path__[0] + "/_object_bank/data/*"))
def run_predict_classifier(hdfs_input, hdfs_classifier_input, hdfs_output, classes=None, image_hashes=None, **kw):
    import classipy
    # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
    file_parse.dump([x for x in hadoopy.readtb(hdfs_classifier_input)
                     if classes is None or x[0] in classes], fp.name)
    files.append(fp.name)
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('predict_classifier.py'),
                           files=files, reducer=None,
                           cmdenvs=['CLASSIFIERS_FN=%s' % os.path.basename(fp.name)],
                           image_hashes=image_hashes,
                           dummy_arg=fp)
def run_video_keyframe(hdfs_input, hdfs_output, frame_skip=1, min_interval=5, max_interval=float('inf'), max_time=float('inf'), keyframer='uniform', **kw):
    fp = viderator.freeze_ffmpeg()
    picarus._launch_frozen(hdfs_input, hdfs_output + '/keyframe', _lf('video_keyframe.py'),
                           cmdenvs=['MIN_INTERVAL=%f' % min_interval,
                                    'MAX_INTERVAL=%f' % max_interval,
                                    'FRAME_SKIP=%d' % frame_skip,
                                    'KEYFRAMER=%s' % keyframer,
                                    'MAX_TIME=%f' % max_time],
                           jobconfs=['mapred.child.java.opts=-Xmx768M',
                                     'mapred.task.timeout=12000000',
                                     'mapred.map.max.attempts=10'],
                           files=[fp.__enter__()],
                           dummy_arg=fp)
def run_predict_windows(hdfs_input, hdfs_classifier_input, feature, hdfs_output, image_height, image_width, **kw):
    import classipy
    # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
    file_parse.dump(list(hadoopy.readtb(hdfs_classifier_input)), fp.name)
    files.append(fp.name)
    files.append(_lf('data/haarcascade_frontalface_default.xml'))
    cmdenvs = ['CLASSIFIERS_FN=%s' % os.path.basename(fp.name)]
    cmdenvs += ['IMAGE_HEIGHT=%d' % image_height,
                'IMAGE_WIDTH=%d' % image_width,
                'FEATURE=%s' % feature]
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('predict_windows.py'),
                           cmdenvs=cmdenvs,
                           files=files,
                           dummy_arg=fp)
def run_join_predictions(hdfs_predictions_input, hdfs_input, hdfs_output, local_image_output, **kw):
    inputs = [hdfs_predictions_input]
    if isinstance(hdfs_input, list):
        inputs += hdfs_input
    else:
        inputs.append(hdfs_input)
    picarus._launch_frozen(inputs, hdfs_output, _lf('join_predictions.py'))
    if local_image_output:
        for image_hash, (classifier_preds, image_data) in hadoopy.readtb(hdfs_output):
            for classifier, preds in classifier_preds.items():
                for conf, label in preds:
                    path = '%s/%s/label_%d/%8.8f-%s.jpg' % (local_image_output, classifier, label, conf, image_hash)
                    try:
                        os.makedirs(os.path.dirname(path))
                    except OSError:
                        pass
                    # Raw JPEG bytes, so write in binary mode
                    with open(path, 'wb') as fp:
                        fp.write(image_data)
def run_image_feature(hdfs_input, hdfs_output, feature, files=(), **kw):
    files = list(files)
    if isinstance(feature, dict):
        feature = zlib.compress(json.dumps(feature), 9)
    feature_fp = tempfile.NamedTemporaryFile()
    feature_fp.write(feature)
    feature_fp.flush()
    # This allows for replacing the default models
    cur_files = set([os.path.basename(x) for x in files])
    for x in [_lf('data/hog_8_2_clusters.pkl'), _lf('data/eigenfaces_lfw_cropped.pkl')] + glob.glob(imfeat.__path__[0] + "/_object_bank/data/*"):
        if os.path.basename(x) not in cur_files:
            files.append(x)
            cur_files.add(os.path.basename(x))
    files.append(feature_fp.name)
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('feature_compute.py'),
                           cmdenvs=['FEATURE=%s' % os.path.basename(feature_fp.name)],
                           files=files,
                           dummy_arg=feature_fp, **kw)
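# Sketch of the (de)serialization used by run_image_feature() above: a dict
# feature spec is JSON-encoded and zlib-compressed before being shipped via
# the FEATURE cmdenv, so the mapper is expected to reverse the transformation.
# The spec keys below are illustrative placeholders only.
def example_feature_spec_roundtrip():
    spec = {'name': 'example_feature', 'kwargs': {}}
    blob = zlib.compress(json.dumps(spec), 9)
    assert json.loads(zlib.decompress(blob)) == spec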
def run_video_grep_frames(hdfs_input, hdfs_output, feature, max_frames_per_video=None, max_outputs_per_video=None, output_frame=True, **kw):
    fp = viderator.freeze_ffmpeg()
    feature_fp = tempfile.NamedTemporaryFile(suffix='.pkl')
    pickle.dump(feature, feature_fp, -1)
    feature_fp.flush()
    cmdenvs = ['FEATURE_FN=%s' % os.path.basename(feature_fp.name)]
    if max_frames_per_video is not None:
        cmdenvs.append('MAX_FRAMES_PER_VIDEO=%d' % (max_frames_per_video))
    if max_outputs_per_video is not None:
        cmdenvs.append('MAX_OUTPUTS_PER_VIDEO=%d' % (max_outputs_per_video))
    cmdenvs.append('OUTPUT_FRAME=%d' % int(output_frame))
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('video_grep_frames.py'),
                           cmdenvs=cmdenvs,
                           jobconfs=['mapred.child.java.opts=-Xmx512M',
                                     'mapred.task.timeout=12000000',
                                     'mapred.map.max.attempts=10'],
                           files=[fp.__enter__(), feature_fp.name],
                           dummy_arg=(fp, feature_fp))
def run_video_predicate_frames(hdfs_input, hdfs_output, features, max_frames_per_video=None, **kw):
    fp = viderator.freeze_ffmpeg()
    features_fp = tempfile.NamedTemporaryFile(suffix='.pkl')
    pickle.dump(features, features_fp, -1)
    features_fp.flush()
    cmdenvs = ['FEATURES_FN=%s' % os.path.basename(features_fp.name)]
    if max_frames_per_video is not None:
        cmdenvs.append('MAX_FRAMES_PER_VIDEO=%d' % (max_frames_per_video))
    picarus._launch_frozen(hdfs_input, hdfs_output + '/predicate_frames', _lf('video_predicate_frames.py'),
                           cmdenvs=cmdenvs,
                           jobconfs=['mapred.child.java.opts=-Xmx768M',
                                     'mapred.skip.attempts.to.start.skipping=2',
                                     'mapred.skip.map.max.skip.records=1',
                                     'mapred.skip.mode.enabled=true',
                                      'mapred.skip.reduce.auto.incr.proc.count=false',
                                      'mapred.skip.map.auto.incr.proc.count=false',
                                     'mapred.task.timeout=12000000',
                                     'mapred.map.max.attempts=10'],
                           files=[fp.__enter__(), features_fp.name],
                           dummy_arg=(fp, features_fp))
def run_kmeans(hdfs_input, hdfs_prev_clusters, hdfs_image_data, hdfs_output, num_clusters,
               num_iters, num_samples, metric='l2sqr', local_json_output=None, image_hashes=None, **kw):
    for cur_iter_num in range(num_iters):
        clusters_fp = fetch_clusters_from_hdfs(hdfs_prev_clusters)
        clusters_fn = os.path.basename(clusters_fp.name)
        cur_output = '%s/clust%.6d' % (hdfs_output, cur_iter_num)
        picarus._launch_frozen(hdfs_input, cur_output, _lf('kmeans.py'),
                               cmdenvs=['CLUSTERS_FN=%s' % clusters_fn],
                               files=[clusters_fp.name],
                               num_reducers=max(1, num_clusters / 2),
                               dummy_arg=clusters_fp)
        hdfs_prev_clusters = cur_output
    print('Clusters[%s]' % hdfs_prev_clusters)
    # Compute K-Means assignment/samples
    clusters_fp = fetch_clusters_from_hdfs(hdfs_prev_clusters)
    clusters_fn = os.path.basename(clusters_fp.name)
    cur_output = '%s/partition' % hdfs_output
    picarus._launch_frozen([hdfs_input, hdfs_image_data], cur_output, _lf('kmeans_partition.py'),
                           cmdenvs=['CLUSTERS_FN=%s' % clusters_fn],
                           files=[clusters_fp.name],
                           num_reducers=max(1, num_clusters / 2),
                           image_hashes=image_hashes,
                           dummy_arg=clusters_fp)
    cur_output = '%s/assign' % hdfs_output
    picarus._launch_frozen(hdfs_input, cur_output, _lf('kmeans_assign.py'),
                           cmdenvs=['CLUSTERS_FN=%s' % clusters_fn,
                                    'NUM_SAMPLES=%d' % num_samples,
                                    'mapred.text.key.partitioner.options=-k1'],
                           files=[clusters_fp.name],
                           num_reducers=max(1, num_clusters / 2),
                           partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
                           dummy_arg=clusters_fp)
    print('Assignment[%s]' % cur_output)
    # Filter the samples
    assignments_fp = fetch_assignments_from_hdfs(cur_output)
    assignments_fn = os.path.basename(assignments_fp.name)
    cur_output = '%s/samples' % hdfs_output
    picarus._launch_frozen(hdfs_image_data, cur_output, _lf('filter_samples.py'),
                           cmdenvs=['ASSIGNMENTS_FN=%s' % assignments_fn],
                           files=[assignments_fp.name],
                           reducer=None,
                           dummy_arg=assignments_fp)
    print('Samples[%s]' % cur_output)
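# Hypothetical driver sketch (paths and parameter values are made up): seed
# the initial clusters by random sampling, then run the iterative k-means
# jobs above.  Treating run_sample() output as a valid hdfs_prev_clusters
# input is our assumption, not something documented here.
def example_kmeans_pipeline():
    run_sample('hdfs://host/data/features', 'hdfs://host/out/clusters_init',
               num_clusters=256)
    run_kmeans('hdfs://host/data/features', 'hdfs://host/out/clusters_init',
               'hdfs://host/data/image_data', 'hdfs://host/out/kmeans',
               num_clusters=256, num_iters=5, num_samples=10)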
def run_join_predictions_by_class(hdfs_input, hdfs_output, **kw):
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('join_predictions_by_class.py'))
def run_assemble_kernels(hdfs_input, hdfs_output, **kw):
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('assemble_kernels.py'),
                           jobconfs_default=['mapred.task.timeout=6000000'],
                           **kw)
def run_whiten(hdfs_input, hdfs_output, image_hashes=None, **kw):
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('whiten.py'),
                           image_hashes=image_hashes)
def run_multiple_kernel_combine(hdfs_input, hdfs_output, **kw):
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('multiple_kernel_combine.py'),
                           jobconfs_default=['mapred.task.timeout=6000000'],
                           **kw)
def run_sample(hdfs_input, hdfs_output, num_clusters, **kw):
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('random_sample.py'),
                           cmdenvs=['SAMPLE_SIZE=%d' % num_clusters])