Example 1
def run_face_finder(hdfs_input, hdfs_output, image_length, boxes, **kw):
    cmdenvs = ['IMAGE_LENGTH=%d' % image_length]
    if boxes:
        cmdenvs.append('OUTPUT_BOXES=True')
    hadoopy.launch_frozen(hdfs_input, hdfs_output, 'face_finder.py', reducer=False,
                          cmdenvs=cmdenvs,
                          files=['haarcascade_frontalface_default.xml'])
def main():
#    in_path = '/tmp/bwhite/input/1266413011.32-003-fn-data'
#    out_path = '/tmp/bwhite/output/tp/face/run-%f' % time.time()
    in_path = '/user/brandyn/flickr/voc07/flickr_photo_conv4-1297714349.074514'
    out_path = '/user/brandyn/tp/facefinder/run-%f' % time.time()
    hadoopy.launch_frozen(in_path, out_path, 'face_finder.py', reducer=False,
                          files=['haarcascade_frontalface_default.xml'])
Example 3
def output_exemplars(hdfs_input,
                     hdfs_output,
                     num=2,
                     output_type='box',
                     output_path='exemplars'):
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        for (image_id, box,
             score), _ in hadoopy.readtb(hdfs_output + 'exemplars-%d' % num):
            image_box_fns.setdefault(image_id, []).append(
                (box, 'exemplar-%.5d-%s-%s.png' % (score, image_id, box)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-tr',
                          hdfs_output + 'exemplars-%d-clip' % num,
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs=['TYPE=%s' % output_type])
    try:
        shutil.rmtree(output_path)
    except OSError:
        pass
    os.makedirs(output_path)
    for x, y in hadoopy.readtb(hdfs_output + 'exemplars-%d-clip' % num):
        open(output_path + '/%s' % (x, ), 'w').write(y)
Example 4
def random_sample(hdfs_input, m, n=None, p=.01, hdfs_temp_dir=None):
    """Return an iterator of m kv pairs selected uniformly from the input

    Finds an alpha such that X = np.sum(np.random.rand(n) < alpha) satisfies X >= m with probability (1 - p).
    If more than m kv pairs are returned from Hadoop, the extras are ignored.  The resulting kv pairs
    are a uniform random sample of the input.

    Args:
        m: Desired number of samples (you will get this many as long as n >= m with probability (1-p))
        n: Number of total values (default None uses count_kvs to compute this)
        p: Failure probability (default .01 means there is 1 failure out of 100 runs)

    Yields:
        Sample k/v pairs
    """
    if n is None:
        n = count_kvs(hdfs_input)
    alpha = _random_sample_alpha(n, m, p=p)
    num_outputs = 0
    with hadoopy_helper.hdfs_temp(hdfs_temp_dir=hdfs_temp_dir) as hdfs_output:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, _lf('random_sample.py'),
                              cmdenvs={'ALPHA': alpha})
        for kv in hadoopy.readtb(hdfs_output):
            if num_outputs >= m:
                return
            yield kv
            num_outputs += 1
    if num_outputs < m:
        logging.warn('random_sampler: num_outputs[%d] when m[%d].  To prevent this, call with a smaller value of p (currently [%f]).' % (num_outputs, m, p))
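The docstring above leaves the threshold computation to a helper. A minimal sketch of what _random_sample_alpha could look like, assuming scipy is available (the actual hadoopy_helper implementation may differ): each input pair is kept with probability alpha, so the number of kept pairs is Binomial(n, alpha), and alpha is grown until the chance of keeping fewer than m pairs drops to at most p.

from scipy.stats import binom

def _random_sample_alpha(n, m, p=.01):
    # Start at the bare minimum keep probability m / n and grow it until
    # P(Binomial(n, alpha) < m) <= p, i.e. the job yields at least m
    # samples with probability 1 - p.
    alpha = float(m) / n
    while alpha < 1. and binom.cdf(m - 1, n, alpha) > p:
        alpha *= 1.1
    return min(alpha, 1.)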
Example 5
 def compute_db_hadoop(self, hdfs_path):
     import json
     si = picarus.api.SearchIndex()
     si.name = '%s.%s' % (self.__class__.__module__, self.__class__.__name__)
     si.feature = json.dumps(self.feature_dict)  # TODO: What to do with the pkl file?
     with hadoopy_helper.hdfs_temp() as hdfs_output:
         picarus.vision.run_image_clean(hdfs_path, hdfs_output + '/clean', max_side=self.max_side)
         # Compute features (map)
         picarus.vision.run_image_feature(hdfs_output + '/clean', hdfs_output + '/feature', self.feature_dict, files=self.required_files)
         # Random sample features for hashes (map) and train hasher (reduce)
         hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/hasher', _lf('train_hasher.py'), cmdenvs={'KV_PROB': 1.,
                                                                                                                   'HASH_BITS': 128})
         hasher = hadoopy.readtb(hdfs_output + '/hasher').next()[1]
         si.hash = pickle.dumps(hasher, -1)
         si.hash_format = si.PICKLE
         # Compute features hashes (map) and build database (reduce)
         open('hasher.pkl', 'w').write(si.hash)
         hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/db', _lf('build_db.py'), files=['hasher.pkl'])
         metadata, hashes = hadoopy.readtb(hdfs_output + '/db').next()
         self.metadata = metadata
         si.metadata.extend(metadata.tolist())
         self.index = image_search.LinearHashDB().store_hashes(hashes, np.arange(len(metadata), dtype=np.uint64))
         si.index = pickle.dumps(self.index, -1)
         si.index_format = si.PICKLE
         open('index.pb', 'w').write(si.SerializeToString())
Example 6
def cluster(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '1-v',
                          hdfs_output + 'val_pred_pos',
                          'predict_spatial_pyramid_fine.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl'],
                          remove_output=True,
                          files=['exemplars.pkl'],
                          num_reducers=1)
Example 7
def run_train_classifier(hdfs_input, hdfs_output, local_labels, **kw):
    import classipy
    # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    files.append(local_labels)
    hadoopy.launch_frozen(hdfs_input, hdfs_output, 'train_classifier.py',
                          files=files,
                          cmdenvs=['LOCAL_LABELS_FN=%s' % os.path.basename(local_labels)])
Example 8
 def _run_face(self, fn, **kw):
     in_path = self.data_path + fn
     out_path = "%sout-%s-%f" % (self.data_path, fn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path, out_path, "face_finder.py", files=["haarcascade_frontalface_default.xml"], **kw)
     for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
         with open(self.out_path + "img%.8d.jpg" % num, "w") as fp:
             fp.write(image_data)
Example 9
 def _run_face(self, fn, out_path, **kw):
     in_path = self.data_path + fn
     hdfs_out_path = '%sout-%s-%f' % (self.data_path, fn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py', files=['haarcascade_frontalface_default.xml'], **kw)
     for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
         with open(out_path + 'img%.8d.png' % num, 'w') as fp:
             fp.write(image_data)
Example 10
 def _run_face(self, fn):
     in_path = self.data_path + fn
     out_path = self.data_path + 'out-' + fn
     cmd = 'hadoop fs -put %s %s' % (fn, in_path)
     subprocess.check_call(cmd.split())
     hadoopy.launch_frozen(in_path, out_path, 'face_finder.py', reducer=False, files=['haarcascade_frontalface_default.xml'])
     for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
         with open(self.out_path + 'img%.8d.jpg' % num, 'w') as fp:
             fp.write(image_data)
Example 11
def run_thresh_predictions(hdfs_predictions_input, hdfs_input, hdfs_output, class_name, class_thresh, output_class, **kw):
    inputs = [hdfs_predictions_input]
    if isinstance(hdfs_input, list):
        inputs += hdfs_input
    else:
        inputs.append(hdfs_input)
    hadoopy.launch_frozen(inputs, hdfs_output, 'thresh_predictions.py',
                          cmdenvs=['CLASSIFIER_NAME=%s' % class_name,
                                   'CLASSIFIER_THRESH=%f' % class_thresh,
                                   'OUTPUT_CLASS=%d' % output_class])
Example 12
 def _run(self, fn):
     in_path = self.data_path + fn
     out_path = self.data_path + 'out-' + fn
     cmd = 'hadoop fs -put %s %s' % (fn,  in_path)
     subprocess.check_call(cmd.split())
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     hadoopy.launch_frozen(in_path, out_path, 'wc.py', jobconfs='mapred.min.split.size=100000000')
     wc = dict(hadoopy.cat(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Example 13
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-qrr%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    jobconfs = []

    # determine the split size
    if 'split_size' in args:
        splitsize = args['split_size']
        jobconfs.append('mapreduce.input.fileinputformat.split.minsize=' +
                        str(splitsize))

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput
            mapper = 'org.apache.hadoop.mapred.lib.IdentityMapper'
        else:
            mapper = True  # use the command line mapper

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            hadoopy.launch_frozen(input,
                                  curoutput,
                                  __file__,
                                  mapper=mapper,
                                  cmdenvs=gopts.cmdenv(),
                                  num_reducers=int(step),
                                  jobconfs=jobconfs)
Example 14
def gen_data(num_clusters, num_points, num_dims):
    hadoopy.launch_frozen(in_name='/tmp/bwhite/input/synth_clusters/dummy',
                          out_name='/tmp/bwhite/input/synth_clusters/%d-%d-%d' % (num_clusters, num_points, num_dims),
                          script_path='generate_data.py',
                          remove_dir=True,
                          cmdenvs=['NUM_CLUSTERS=%d' % (num_clusters),
                                   'NUM_POINTS=%d' % (num_points),
                                   'NUM_DIMS=%d' % (num_dims)],
                          #reducer=None,
                          jobconfs='mapred.reduce.tasks=30',
                          frozen_path='frozen')
Example 15
def _run_haystack(fn, script_name):
    cur_time = time.time()
    hdfs_base_path = "hadoopy-test-data/%f/" % cur_time
    print("Storing HDFS temp files and output in [%s]" % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + "out-" + os.path.basename(fn)
    hadoopy.put(fn, in_path)
    print("Launching job [%s]" % script_name)
    hadoopy.launch_frozen(in_path, out_path, script_name, files=[data_path + "target.jpg"])
    print("Storing local output in [%s]" % local_out)
    for num, (image_name, image_data) in enumerate(hadoopy.readtb(out_path)):
        open("%s%s-img%.8d-%s.jpg" % (local_out, script_name, num, image_name), "w").write(image_data)
Example 16
def launch_frozen(in_name,
                  out_name,
                  script_path,
                  hbase_in=True,
                  hbase_out=False,
                  columns=(),
                  start_row=None,
                  stop_row=None,
                  single_value=None,
                  **kw):
    _launch_args(hbase_in, hbase_out, columns, start_row, stop_row,
                 single_value, kw)
    hadoopy.launch_frozen(in_name, out_name, script_path, **kw)
Example 17
def run_predict_classifier(hdfs_input, hdfs_classifier_input, hdfs_output, **kw):
    import classipy
    # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
    print('------------------------BEFORE READTB')
    file_parse.dump(list(hadoopy.readtb(hdfs_classifier_input)), fp.name)
    print('------------------------AFTER  READTB [%s, %s]' % (fp.name, os.path.exists(fp.name)))
    files.append(fp.name)
    hadoopy.launch_frozen(hdfs_input, hdfs_output, 'predict_classifier.py',
                          files=files, reducer=None,
                          cmdenvs=['CLASSIFIERS_FN=%s' % os.path.basename(fp.name)],
                          dummy_arg=fp)
Example 18
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args

    mat = args.get('mat', None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")

    input = mat
    matname, matext = os.path.splitext(mat)

    gopts.getintkey('blocksize', 3)
    schedule = gopts.getstrkey('reduce_schedule', '1')

    # clear the output
    output = args.get('output', '%s-normal%s' % (matname, matext))
    if hadoopy.exists(output):
        print "Removing %s" % (output)
        hadoopy.rm(output)

    outputnamefunc = lambda x: output + "_iter%i" % (x)
    steps = schedule.split(',')

    for i, step in enumerate(steps):
        if i > 0:
            input = curoutput

        if i + 1 == len(steps):
            curoutput = output
        else:
            curoutput = output + "_iter%i" % (i + 1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)

        gopts.setkey('iter', i)

        if launch:
            if i > 0:
                mapper = "org.apache.hadoop.mapred.lib.IdentityMapper"
                hadoopy.launch_frozen(input,
                                      curoutput,
                                      __file__,
                                      mapper=mapper,
                                      cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
            else:
                hadoopy.launch_frozen(input,
                                      curoutput,
                                      __file__,
                                      cmdenvs=gopts.cmdenv(),
                                      num_reducers=int(step))
Example 19
def output_exemplars(hdfs_input, hdfs_output, num=2, output_type='box', output_path='exemplars'):
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        for (image_id, box, score), _ in hadoopy.readtb(hdfs_output + 'exemplars-%d' % num):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%s-%s.png' % (score, image_id, box)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-tr', hdfs_output + 'exemplars-%d-clip' % num, 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs=['TYPE=%s' % output_type])
    try:
        shutil.rmtree(output_path)
    except OSError:
        pass
    os.makedirs(output_path)
    for x, y in hadoopy.readtb(hdfs_output + 'exemplars-%d-clip' % num):
        open(output_path + '/%s' % (x,), 'w').write(y)
Example 20
def run_video_keyframe(hdfs_input, hdfs_output, min_resolution, max_resolution, ffmpeg, **kw):
    if not ffmpeg:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, 'video_keyframe.py',
                              reducer=None,
                              cmdenvs=['MIN_RESOLUTION=%d' % min_resolution,
                                       'MAX_RESOLUTION=%f' % max_resolution])
    else:
        fp = vidfeat.freeze_ffmpeg()
        hadoopy.launch_frozen(hdfs_input, hdfs_output, 'video_keyframe.py',
                              reducer=None,
                              cmdenvs=['MIN_RESOLUTION=%d' % min_resolution,
                                       'MAX_RESOLUTION=%f' % max_resolution],
                              files=fp.__enter__(),
                              dummy_arg=fp)
Example 21
 def _run_face(self, fn, out_path, **kw):
     bfn = os.path.basename(fn)
     in_path = self.data_path + bfn
     hdfs_out_path = '%sout-%s-%f' % (self.data_path, bfn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path,
                           hdfs_out_path,
                           'face_finder.py',
                           files=['haarcascade_frontalface_default.xml'],
                           **kw)
     for num, ((image_name, box),
               image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
         with open(out_path + 'img%.8d.png' % num, 'w') as fp:
             fp.write(image_data)
Example 22
def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output, classifier_name, classifier_extra, local_labels, classifier, **kw):
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    hdfs_output_pos = hdfs_output + '/pos'
    hdfs_output_neg = hdfs_output + '/neg'
    hadoopy.launch_frozen(hdfs_input_pos, hdfs_output_pos, 'collect_keys.py')
    hadoopy.launch_frozen(hdfs_input_neg, hdfs_output_neg, 'collect_keys.py')
    pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
    neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])
    labels[classifier_name] = {'labels': {'1': pos_keys, '-1': neg_keys},
                               'classifier': classifier,
                               'classifier_extra': classifier_extra}
    file_parse.dump(labels, local_labels)
Example 23
def main():
    dense_path = 'exemplarbank/output/1341790878.92/pos'
    image_path = 'exemplarbank/data/sun_labelme_person/1-tr'
    image_box_fns = {}
    id_box_features = dict(hash_features(dense_path))
    print id_box_features.items()[0]
    for (image_id, box), feature in id_box_features.items():
        image_box_fns.setdefault(image_id, []).append((box, (image_id, box)))
    with open('image_box_fns.pkl', 'w') as fp:
        pickle.dump(image_box_fns, fp, -1)
    with hadoopy_helper.hdfs_temp() as hdfs_output:
        hadoopy.launch_frozen(image_path, hdfs_output, 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True,
                              cmdenvs=['TYPE=feature'])
        id_box_features2 = dict(hash_features(hdfs_output))
        with open('compare.pkl', 'w') as fp:
            pickle.dump((id_box_features, id_box_features2), fp, -1)
Example 24
def _run_haystack(fn, script_name):
    cur_time = time.time()
    hdfs_base_path = 'hadoopy-test-data/%f/' % cur_time
    print('Storing HDFS temp files and output in [%s]' % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + 'out-' + os.path.basename(fn)
    hadoopy.put(fn, in_path)
    print('Launching job [%s]' % script_name)
    hadoopy.launch_frozen(in_path,
                          out_path,
                          script_name,
                          files=[data_path + 'target.jpg'])
    print('Storing local output in [%s]' % local_out)
    for num, (image_name, image_data) in enumerate(hadoopy.readtb(out_path)):
        open('%s%s-img%.8d-%s.jpg' % (local_out, script_name, num, image_name),
             'w').write(image_data)
Example 25
def starter(args, launch=True):
    """ The function that calls hadoopy.launch_frozen """
    gopts.args = args
    
    mat = args.get('mat',None)
    if mat is None:
        raise NameError("'mat' option not specified on the command line")
        
    input = mat
    matname,matext = os.path.splitext(mat)
    
    gopts.getintkey('blocksize',3)
    schedule = gopts.getstrkey('reduce_schedule','1')

    # clear the output
    output = args.get('output','%s-normal%s'%(matname,matext))
    if hadoopy.exists(output):
        print "Removing %s"%(output)
        hadoopy.rm(output)
    
    outputnamefunc = lambda x: output+"_iter%i"%(x)
    steps = schedule.split(',')
        
    for i,step in enumerate(steps):
        if i>0:
            input = curoutput
            
        if i+1==len(steps):
            curoutput = output
        else:
            curoutput = output+"_iter%i"%(i+1)
            if hadoopy.exists(curoutput):
                hadoopy.rm(curoutput)
            
        gopts.setkey('iter',i)
            
        if launch:
            if i>0:
                mapper="org.apache.hadoop.mapred.lib.IdentityMapper"
                hadoopy.launch_frozen(input, curoutput, __file__, 
                    mapper=mapper,
                    cmdenvs=gopts.cmdenv(), num_reducers=int(step))
            else:
                hadoopy.launch_frozen(input, curoutput, __file__, 
                    cmdenvs=gopts.cmdenv(), num_reducers=int(step))
Example 26
def run_join_predictions(hdfs_predictions_input, hdfs_input, hdfs_output, local_image_output, **kw):
    inputs = [hdfs_predictions_input]
    if isinstance(hdfs_input, list):
        inputs += hdfs_input
    else:
        inputs.append(hdfs_input)
    hadoopy.launch_frozen(inputs, hdfs_output, 'join_predictions.py')
    if local_image_output:
        for image_hash, (classifier_preds, image_data) in hadoopy.readtb(hdfs_output):
            for classifier, preds in classifier_preds.items():
                for conf, label in preds:
                    path = '%s/%s/label_%d/%8.8f-%s.jpg' % (local_image_output, classifier, label, conf, image_hash)
                    try:
                        os.makedirs(os.path.dirname(path))
                    except OSError:
                        pass
                    with open(path, 'w') as fp:
                        fp.write(image_data)
Example 27
def _run_face(fn):
    cur_time = time.time()
    hdfs_base_path = 'hadoopy-test-data/%f/' % cur_time
    print('Storing HDFS temp files and output in [%s]' % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + 'out-' + os.path.basename(fn)
    cmd = 'hadoop fs -put %s %s' % (fn, in_path)
    subprocess.check_call(cmd.split())
    hadoopy.launch_frozen(in_path, out_path, 'face_finder.py', files=[data_path + 'haarcascade_frontalface_default.xml'])
    local_out = 'out-%f' % cur_time
    try:
        os.makedirs(local_out)
    except OSError:
        pass
    print('Storing local output in [%s]' % local_out)
    for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
        image = np.asarray(Image.open(StringIO.StringIO(image_data)))
        for (x, y, w, h), n in faces:
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 3)
        cv2.imwrite('%s/img%.8d.jpg' % (local_out, num), image[:, :, ::-1].copy())
Example 29
def main():
    dense_path = 'exemplarbank/output/1341790878.92/pos'
    image_path = 'exemplarbank/data/sun_labelme_person/1-tr'
    image_box_fns = {}
    id_box_features = dict(hash_features(dense_path))
    print id_box_features.items()[0]
    for (image_id, box), feature in id_box_features.items():
        image_box_fns.setdefault(image_id, []).append((box, (image_id, box)))
    with open('image_box_fns.pkl', 'w') as fp:
        pickle.dump(image_box_fns, fp, -1)
    with hadoopy_helper.hdfs_temp() as hdfs_output:
        hadoopy.launch_frozen(image_path,
                              hdfs_output,
                              'clip_boxes.py',
                              files=['image_box_fns.pkl'],
                              remove_output=True,
                              cmdenvs=['TYPE=feature'])
        id_box_features2 = dict(hash_features(hdfs_output))
        with open('compare.pkl', 'w') as fp:
            pickle.dump((id_box_features, id_box_features2), fp, -1)
Example 30
 def compute_db_hadoop(self, hdfs_path):
     import json
     si = picarus.api.SearchIndex()
     si.name = '%s.%s' % (self.__class__.__module__,
                          self.__class__.__name__)
     si.feature = json.dumps(
         self.feature_dict)  # TODO: What to do with the pkl file?
     with hadoopy_helper.hdfs_temp() as hdfs_output:
         picarus.vision.run_image_clean(hdfs_path,
                                        hdfs_output + '/clean',
                                        max_side=self.max_side)
         # Compute features (map)
         picarus.vision.run_image_feature(hdfs_output + '/clean',
                                          hdfs_output + '/feature',
                                          self.feature_dict,
                                          files=self.required_files)
         # Random sample features for hashes (map) and train hasher (reduce)
         hadoopy.launch_frozen(hdfs_output + '/feature',
                               hdfs_output + '/hasher',
                               _lf('train_hasher.py'),
                               cmdenvs={
                                   'KV_PROB': 1.,
                                   'HASH_BITS': 128
                               })
         hasher = hadoopy.readtb(hdfs_output + '/hasher').next()[1]
         si.hash = pickle.dumps(hasher, -1)
         si.hash_format = si.PICKLE
         # Compute features hashes (map) and build database (reduce)
         open('hasher.pkl', 'w').write(si.hash)
         hadoopy.launch_frozen(hdfs_output + '/feature',
                               hdfs_output + '/db',
                               _lf('build_db.py'),
                               files=['hasher.pkl'])
         metadata, hashes = hadoopy.readtb(hdfs_output + '/db').next()
         self.metadata = metadata
         si.metadata.extend(metadata.tolist())
         self.index = image_search.LinearHashDB().store_hashes(
             hashes, np.arange(len(metadata), dtype=np.uint64))
         si.index = pickle.dumps(self.index, -1)
         si.index_format = si.PICKLE
         open('index.pb', 'w').write(si.SerializeToString())
Example 31
def initial_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'neg', 'compute_exemplar_features.py', remove_output=True)
    hadoopy.launch_frozen(hdfs_input + '1-tr', hdfs_output + 'pos', 'compute_exemplar_features.py', remove_output=True)
    # Compute desired probability
    num_val = 5000
    num_neg_train = 5000
    toggle_launch()
    if 0:
        neg_samples = list(hadoopy_helper.jobs.random_sample(hdfs_output + 'neg', num_val + num_neg_train))
        neg_samples = [x[1] for x in neg_samples]
        with open('neg_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[num_val:]), fp, -1)
        with open('neg_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[:num_val]), fp, -1)
        del neg_samples
        gc.collect()
        pos_samples = list(hadoopy_helper.jobs.random_sample(hdfs_output + 'pos', num_val / 2))  # Twice as many neg as positive
        pos_samples = [x[1] for x in pos_samples]
        with open('pos_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(pos_samples), fp, -1)
        del pos_samples
    gc.collect()
    cmdenvs = {'NEG_FEATS': 'neg_feats.pkl',
               'POS_VAL_FEATS': 'pos_val_feats.pkl',
               'NEG_VAL_FEATS': 'neg_val_feats.pkl'}
    files = cmdenvs.values()
    cmdenvs['SAMPLE_SIZE'] = 1000
    hadoopy.launch_frozen(hdfs_output + 'pos', hdfs_output + 'exemplars-0', 'uniform_selection.py',
                          cmdenvs=cmdenvs, remove_output=True, files=files)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-0'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example 32
def calibrate(hdfs_input, hdfs_output):
    # Predict on pos/neg sets
    hadoopy.launch_frozen(hdfs_input + '1-v',
                          hdfs_output + 'val_pos',
                          'image_predict.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=16'],
                          remove_output=True,
                          num_reducers=10,
                          files=['exemplars.pkl'])
    hadoopy.launch_frozen(hdfs_input + '0-v',
                          hdfs_output + 'val_neg',
                          'image_predict.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=1'],
                          remove_output=True,
                          num_reducers=10,
                          files=['exemplars.pkl'])
    # Calibrate threshold using pos/neg validation set #1
    hadoopy.launch_frozen([
        hdfs_output + 'val_neg', hdfs_output + 'val_pos',
        hdfs_output + 'exemplars-1'
    ],
                          hdfs_output + 'exemplars-2',
                          'calibrate_thresholds.py',
                          num_reducers=50,
                          remove_output=True)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-2'),
                          key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example 33
def run_kmeans(hdfs_input, hdfs_prev_clusters, hdfs_image_data, hdfs_output, num_clusters,
               num_iters, num_samples, metric, local_json_output=None, **kw):
    frozen_tar_path = None
    for cur_iter_num in range(num_iters):
        clusters_fp = fetch_clusters_from_hdfs(hdfs_prev_clusters)
        clusters_fn = os.path.basename(clusters_fp.name)
        cur_output = '%s/clust%.6d' % (hdfs_output, cur_iter_num)
        frozen_tar_path = hadoopy.launch_frozen(hdfs_input, cur_output, 'kmeans.py',
                                                cmdenvs=['CLUSTERS_FN=%s' % clusters_fn],
                                                files=[clusters_fp.name],
                                                num_reducers=max(1, num_clusters / 2),
                                                frozen_tar_path=frozen_tar_path,
                                                dummy_arg=clusters_fp)['frozen_tar_path']
        hdfs_prev_clusters = cur_output
    print('Clusters[%s]' % hdfs_prev_clusters)
    # Compute K-Means assignment/samples
    # TODO Do full assignment, then sample
    clusters_fp = fetch_clusters_from_hdfs(hdfs_prev_clusters)
    clusters_fn = os.path.basename(clusters_fp.name)
    cur_output = '%s/assign' % hdfs_output
    hadoopy.launch_frozen(hdfs_input, cur_output, 'kmeans_assign.py',
                          cmdenvs=['CLUSTERS_FN=%s' % clusters_fn,
                                   'NUM_SAMPLES=%d' % num_samples,
                                   'mapred.text.key.partitioner.options=-k1'],
                          files=[clusters_fp.name],
                          num_reducers=max(1, num_clusters / 2),
                          partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
                          dummy_arg=clusters_fp)
    print('Assignment[%s]' % cur_output)
    # Filter the samples
    assignments_fp = fetch_assignments_from_hdfs(cur_output)
    assignments_fn = os.path.basename(assignments_fp.name)
    cur_output = '%s/samples' % hdfs_output
    hadoopy.launch_frozen(hdfs_image_data, cur_output, 'filter_samples.py',
                          cmdenvs=['ASSIGNMENTS_FN=%s' % os.path.basename(assignments_fn)],
                          files=[assignments_fp.name],
                          reducer=None,
                          dummy_arg=assignments_fp)
    print('Samples[%s]' % cur_output)
Example 34
def run_face_ranker(hdfs_input, hdfs_output,
                    feature_pkl, exemplar_fn):
    """
    Runs the face_ranker.py hadoopy script.  The output consists of
    the distance of each image to an exemplar as key, and the
    input tuple of (key, imagedata) as value.
    Inputs:
    - hdfs_input: path to hdfs input: (key, imagedata) pairs
    - hdfs_output: path to the hdfs output tuples: (dist, (key, imagedata))
      where dist is the distance in Eigenfaces feature space to the exemplar
      image
    - feature_pkl: pickle file containing a trained Eigenfaces feature
    - exemplar_fn: filename of the exemplar image
    """
    fp = tempfile.NamedTemporaryFile()
    _compute_exemplar_feature(exemplar_fn, feature_pkl, fp)
    fp.flush()
    hadoopy.launch_frozen(hdfs_input, hdfs_output,
                          'face_ranker.py',
                          cmdenvs=['EXEMPLAR_FN=%s' % os.path.basename(fp.name),
                                   'FEATURE_FN=%s' % os.path.basename(feature_pkl)],
                          files=[feature_pkl, fp.name])
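Because the distance is the output key, the nearest faces can be read back with hadoopy.readtb and sorted. A hedged usage sketch (the HDFS paths, pickle file, and exemplar image below are placeholders, not taken from the original project):

import hadoopy

run_face_ranker('faces/input', 'faces/ranked', 'eigenfaces.pkl', 'exemplar.jpg')
# Keys are distances, so sorting gives the closest matches first.
for dist, (key, image_data) in sorted(hadoopy.readtb('faces/ranked'))[:10]:
    with open('match-%.8f-%s.jpg' % (dist, key), 'w') as fp:
        fp.write(image_data)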
Example 35
def main(input_path, output_path, num_clusters, cluster_path, num_reducers):
    def inc_path():
        global iter_cnt
        iter_cnt +=1
        return '%s/%d' % (output_path, iter_cnt)
    def prev_path():
        return '%s/%d' % (output_path, iter_cnt)
    consolidate_clusters(cluster_path, 'clusters.pkl')
    if 1:
        hadoopy.launch_frozen(in_name=input_path,
                              out_name=inc_path(),
                              script_path='kmeans_cluster_single.py',
                              reducer=None,
                              cmdenvs=['CLUSTERS_PKL=%s' % ('clusters.pkl'),
                                       'NN_MODULE=nn_l2sqr_c'],
                              #combiner=True,
                              files=['nn_l2sqr_c.py','clusters.pkl'],
                              shared_libs=SHARED_LIBS,
                              modules=['vitrieve_algorithms', 'nn_l2sqr_c',],
                              remove_dir=True,
                              jobconfs=['mapred.min.split.size=999999999999',
                                        'mapred.reduce.tasks=%d' % (num_reducers)])
Example 36
def calibrate(hdfs_input, hdfs_output):
    # Predict on pos/neg sets
    hadoopy.launch_frozen(hdfs_input + '1-v', hdfs_output + 'val_pos', 'image_predict.py', cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=16'], remove_output=True, num_reducers=10, files=['exemplars.pkl'])
    hadoopy.launch_frozen(hdfs_input + '0-v', hdfs_output + 'val_neg', 'image_predict.py', cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=1'], remove_output=True, num_reducers=10, files=['exemplars.pkl'])
    # Calibrate threshold using pos/neg validation set #1
    hadoopy.launch_frozen([hdfs_output + 'val_neg', hdfs_output + 'val_pos', hdfs_output + 'exemplars-1'], hdfs_output + 'exemplars-2', 'calibrate_thresholds.py', num_reducers=50, remove_output=True)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-2'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example 37
def flickr_images(tags,
                  images_per_tag,
                  hdfs_output,
                  num_files=20,
                  max_iters=1,
                  max_pages=1,
                  output_meta=False,
                  api_key=None,
                  api_secret=None,
                  remove_output=False):
    tags = list(tags)
    if api_key is None or api_secret is None:
        api_key = os.environ['FLICKR_API_KEY']
        api_secret = os.environ['FLICKR_API_SECRET']
    tags_per_chunk = max(len(tags) / num_files, 1)
    if remove_output and hadoopy.exists(hdfs_output):
        print('Removing output dir[%s]' % hdfs_output)
        hadoopy.rmr(hdfs_output)
    cmdenvs = {
        'FLICKR_API_KEY': api_key,
        'FLICKR_API_SECRET': api_secret,
        'MAX_ITERS': str(max_iters),
        'MAX_PAGES': str(max_pages)
    }
    for chunk_num, chunk_tags in enumerate(_chunks(tags, tags_per_chunk)):
        hadoopy.writetb(hdfs_output + '/tags/%d' % chunk_num,
                        [(images_per_tag, tag) for tag in chunk_tags])
    hadoopy.launch_frozen(hdfs_output + '/tags',
                          hdfs_output + '/metadata',
                          _lf('flickr_bulk.py'),
                          cmdenvs=cmdenvs,
                          num_reducers=num_files)
    output_type = 'meta' if output_meta else 'image'
    hadoopy.launch_frozen(hdfs_output + '/metadata',
                          hdfs_output + '/image_metadata',
                          _lf('file_downloader.py'),
                          cmdenvs={'OUTPUT_TYPE': output_type})
Example 38
def compute_database(flickr_data):
    r = 'image_search/%f/' % time.time()
    f_path = r + 'features/'
    m_path = r + 'median/'
    h_path = r + 'hashes/'
    j_path = r + 'hash_metadata/'
    hadoopy.launch_frozen(flickr_data, f_path, 'build_features.py')
    hadoopy.launch_frozen(f_path, m_path, 'calc_median_feature.py')
    median = np.array([x for _, x in sorted(hadoopy.readtb(m_path))])
    pickle.dump(median, open('median.pkl', 'w'), -1)
    hadoopy.launch_frozen(f_path, h_path, 'compute_hashes.py', files=['median.pkl'])
    hadoopy.launch_frozen([h_path, flickr_data], j_path, 'join.py',
                          num_reducers=10)
    hashes, metadatas = zip(*[x[1] for x in hadoopy.readtb(j_path)])
    hashes = np.array([x.ravel() for x in hashes])
    with open('database.pkl', 'w') as fp:
        pickle.dump((hashes, metadatas, median), fp, -1)
Example 39
def run_face_ranker(hdfs_input, hdfs_output, feature_pkl, exemplar_fn):
    """
    Runs the face_ranker.py hadoopy script.  The output consists of
    the distance of each image to an exemplar as key, and the
    input tuple of (key, imagedata) as value.
    Inputs:
    - hdfs_input: path to hdfs input: (key, imagedata) pairs
    - hdfs_output: path to the hdfs output tuples: (dist, (key, imagedata))
      where dist is the distance in Eigenfaces feature space to the exemplar
      image
    - feature_pkl: pickle file containing a trained Eigenfaces feature
    - exemplar_fn: filename of the exemplar image
    """
    fp = tempfile.NamedTemporaryFile()
    _compute_exemplar_feature(exemplar_fn, feature_pkl, fp)
    fp.flush()
    hadoopy.launch_frozen(hdfs_input,
                          hdfs_output,
                          'face_ranker.py',
                          cmdenvs=[
                              'EXEMPLAR_FN=%s' % os.path.basename(fp.name),
                              'FEATURE_FN=%s' % os.path.basename(feature_pkl)
                          ],
                          files=[feature_pkl, fp.name])
Example 40
def exemplar_boxes(hdfs_input, hdfs_output):
    exemplar_name = 'ad813d130f4803e948124823a67cdd7b-[0.0, 0.16326530612244897, 0.3448275862068966, 0.5714285714285714]'
    st = time.time()
    exemplar_out = hadoopy.abspath(hdfs_output + 'exemplar_boxes/%s' % st) + '/'
    for kv in hadoopy.readtb(hdfs_output + 'exemplars-2'):
        (image_id, box, score), _ = kv
        if exemplar_name == '%s-%s' % (image_id, box):
            print('Found it')
            with open('exemplars-patch.pkl', 'w') as fp:
                pickle.dump([kv], fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-v', exemplar_out + 'val_pos', 'hard_predictions.py', cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100', 'OUTPUT_FORMAT=score_image_box'], files=['exemplars-patch.pkl'],
                          num_reducers=10)
    hadoopy.launch_frozen(hdfs_input + '0-v', exemplar_out + 'val_neg', 'hard_predictions.py', cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100', 'OUTPUT_FORMAT=score_image_box'], files=['exemplars-patch.pkl'],
                          num_reducers=10)
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        pos_boxes = [(score, image_id, box, 1) for score, image_id, box in sorted(hadoopy.readtb(exemplar_out + 'val_pos').next()[1])]
        neg_boxes = [(score, image_id, box, 0) for score, image_id, box in sorted(hadoopy.readtb(exemplar_out + 'val_neg').next()[1])]
        for num, (score, image_id, box, pol) in enumerate(sorted(pos_boxes + neg_boxes, reverse=True)):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%d-%f.png' % (num, pol, score)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes_cropped', 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs={'TYPE': 'image'})
    out_dir = 'exemplars_similar_cropped/'
    try:
        shutil.rmtree('exemplars_similar_cropped')
    except OSError:
        pass
    print('Outputting cropped')
    os.makedirs(out_dir)
    print(exemplar_out + 'boxes_cropped')
    for x, y in hadoopy.readtb(exemplar_out + 'boxes_cropped'):
        open(out_dir + x, 'w').write(y)

    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes', 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs={'TYPE': 'box'})
    out_dir = 'exemplars_similar/'
    try:
        shutil.rmtree('exemplars_similar')
    except OSError:
        pass
    print('Outputting boxes')
    os.makedirs(out_dir)
    for x, y in hadoopy.readtb(exemplar_out + 'boxes'):
        open(out_dir + x, 'w').write(y)
Example 41
def initial_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr',
                          hdfs_output + 'neg',
                          'compute_exemplar_features.py',
                          remove_output=True)
    hadoopy.launch_frozen(hdfs_input + '1-tr',
                          hdfs_output + 'pos',
                          'compute_exemplar_features.py',
                          remove_output=True)
    # Compute desired probability
    num_val = 5000
    num_neg_train = 5000
    toggle_launch()
    if 0:
        neg_samples = list(
            hadoopy_helper.jobs.random_sample(hdfs_output + 'neg',
                                              num_val + num_neg_train))
        neg_samples = [x[1] for x in neg_samples]
        with open('neg_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[num_val:]), fp, -1)
        with open('neg_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[:num_val]), fp, -1)
        del neg_samples
        gc.collect()
        pos_samples = list(
            hadoopy_helper.jobs.random_sample(
                hdfs_output + 'pos',
                num_val / 2))  # Twice as many neg as positive
        pos_samples = [x[1] for x in pos_samples]
        with open('pos_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(pos_samples), fp, -1)
        del pos_samples
    gc.collect()
    cmdenvs = {
        'NEG_FEATS': 'neg_feats.pkl',
        'POS_VAL_FEATS': 'pos_val_feats.pkl',
        'NEG_VAL_FEATS': 'neg_val_feats.pkl'
    }
    files = cmdenvs.values()
    cmdenvs['SAMPLE_SIZE'] = 1000
    hadoopy.launch_frozen(hdfs_output + 'pos',
                          hdfs_output + 'exemplars-0',
                          'uniform_selection.py',
                          cmdenvs=cmdenvs,
                          remove_output=True,
                          files=files)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-0'),
                          key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example 42
def _launch_frozen(in_path, out_path, script_path, jobconfs_default=(), *args, **kw):
    import hadoopy
    import os
    kw = dict(kw)  # Make a copy as we will be mutating it
    kw['frozen_tar_path'] = _freeze_script(script_path)['frozen_tar_path']
    if 'reducer' not in kw and 'num_reducers' not in kw:
        kw['num_reducers'] = 1
    if 'jobconfs' in kw:
        kw['jobconfs'] = kw['jobconfs'] + GLOBAL_JOBCONFS
    else:
        kw['jobconfs'] = GLOBAL_JOBCONFS
    if 'jobconfs' not in kw:
        kw['jobconfs'] = []
    if jobconfs_default:
        jobconfs_dict = dict(x.split('=', 1) for x in kw['jobconfs'])
        jobconfs_default_dict = dict(x.split('=', 1) for x in jobconfs_default)
        for jobconf_name, jobconf_value in jobconfs_default_dict.items():
            if jobconf_name not in jobconfs_dict:
                jobconfs_dict[jobconf_name] = jobconf_value
        kw['jobconfs'] = ['%s=%s' % x for x in jobconfs_dict.items()]
    if 'image_hashes' in kw and kw['image_hashes'] is not None:
        import tempfile
        fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
        file_parse.dump(kw['image_hashes'], fp.name)
        try:
            kw['files'].append(fp.name)
        except KeyError:
            kw['files'] = [fp.name]
        try:
            kw['cmdenvs'].append('PICARUS_VALID_IMAGE_HASHES=%s' % os.path.basename(fp.name))
        except KeyError:
            kw['cmdenvs'] = ['PICARUS_VALID_IMAGE_HASHES=%s' % os.path.basename(fp.name)]
        kw['_internal_dummy_arg'] = fp  # Keep the object alive
        del kw['image_hashes']
        
    return hadoopy.launch_frozen(in_path, out_path, script_path, *args, **kw)
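The jobconfs_default handling above only fills in settings the caller did not specify. A hedged usage sketch (GLOBAL_JOBCONFS and _freeze_script are assumed to be defined elsewhere in the same module, and the paths are placeholders):

# The caller's 'mapred.output.compress=false' is kept, while the split-size
# default is added because the caller never set it.
_launch_frozen('in_path', 'out_path', 'job.py',
               jobconfs_default=('mapred.output.compress=true',
                                 'mapred.min.split.size=100000000'),
               jobconfs=['mapred.output.compress=false'])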
Example 43
def hard_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr',
                          hdfs_output + 'hard_neg',
                          'hard_predictions.py',
                          cmdenvs=[
                              'EXEMPLARS=exemplars.pkl', 'MAX_HARD=100',
                              'OUTPUT_FORMAT=score_image_box'
                          ],
                          num_reducers=10,
                          files=['exemplars.pkl'],
                          remove_output=True)

    def _inner():
        with open('image_box_fns.pkl', 'w') as fp:
            image_box_fns = {}
            for (image_id, box,
                 score), negs in hadoopy.readtb(hdfs_output + 'hard_neg'):
                for score2, image_id2, box2 in negs:
                    image_box_fns.setdefault(image_id2, []).append(
                        (box2, [image_id, box, score]))
            pickle.dump(image_box_fns, fp, -1)
        del image_box_fns
        gc.collect()

    _inner()
    hadoopy.launch_frozen(hdfs_input + '0-tr',
                          hdfs_output + 'hard_neg_clip',
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs=['TYPE=feature'])
    hadoopy.launch_frozen(
        [hdfs_output + 'pos_sample', hdfs_output + 'hard_neg_clip'],
        hdfs_output + 'exemplars-1',
        'train_exemplars_hard.py',
        cmdenvs=['NEG_FEATS=neg_feats.pkl', 'MAX_HARD=200'],
        files=['neg_feats.pkl'],
        remove_output=True,
        num_reducers=10)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-1'),
                          key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example 44
def hard_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'hard_neg', 'hard_predictions.py', cmdenvs=['EXEMPLARS=exemplars.pkl',
                                                                                                         'MAX_HARD=100',
                                                                                                         'OUTPUT_FORMAT=score_image_box'],
                          num_reducers=10, files=['exemplars.pkl'], remove_output=True)

    def _inner():
        with open('image_box_fns.pkl', 'w') as fp:
            image_box_fns = {}
            for (image_id, box, score), negs in hadoopy.readtb(hdfs_output + 'hard_neg'):
                for score2, image_id2, box2 in negs:
                    image_box_fns.setdefault(image_id2, []).append((box2, [image_id, box, score]))
            pickle.dump(image_box_fns, fp, -1)
        del image_box_fns
        gc.collect()
    _inner()
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'hard_neg_clip', 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs=['TYPE=feature'])
    hadoopy.launch_frozen([hdfs_output + 'pos_sample',
                           hdfs_output + 'hard_neg_clip'], hdfs_output + 'exemplars-1', 'train_exemplars_hard.py',
                          cmdenvs=['NEG_FEATS=neg_feats.pkl', 'MAX_HARD=200'], files=['neg_feats.pkl'],
                          remove_output=True, num_reducers=10)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-1'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example 45
import hadoopy
import random
for i in range(5):
    prefix = str(random.random())
    print(prefix)
    hadoopy.launch_frozen('/tmp/bwhite/input/pets2006.video_frame_data.tb',
                          '/tmp/bwhite/output/pets2006.video_frame_data.b/' + prefix,
                          'bgsub.py',
                          partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
                          jobconfs=['mapred.text.key.partitioner.options=-k1,1',
                                    #'mapred.reduce.tasks=500',
                                    'mapred.min.split.size=999999999999',
                                    'mapred.reduce.tasks=1',
                                    'mapred.output.compress=true',
                                    'mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec'],
                          shared_libs=['libbgsub_fast.so'],
                          frozen_path='frozen') 
Example 46
import hadoopy
import time

# Setup paths
data_path = 'hadoopy-test-data/%f/' % time.time()
input_path = data_path + 'wc-input'
output_path = data_path + 'wc-output'

# Write data to HDFS in the form of (term #, term)
input_data = enumerate(
    'Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industrys standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.'
    .split())
hadoopy.writetb(input_path, input_data)

# Launch the job
hadoopy.launch_frozen(input_path, output_path, 'wc.py')

# Read the output and verify a few expected counts
word_counts = dict(hadoopy.readtb(output_path))
for probe_word, expected_count in [('the', 6), ('Lorem', 4), ('of', 4)]:
    print('word_counts[%s] = %d' % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]
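The snippet launches 'wc.py' without showing it; a word-count script in hadoopy's mapper/reducer style would look roughly like the following sketch (not necessarily the wc.py used above):

import hadoopy

def mapper(key, word):
    # Input pairs are (term #, term); emit each term with a count of 1.
    yield word, 1

def reducer(word, counts):
    yield word, sum(counts)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)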
Example 47
 def test_cluster_info(self):
     hadoopy.writetb(self.data_path + 'cluster_info_input', [(0, 0)])
     hadoopy.launch_frozen(self.data_path + 'cluster_info_input',
                           self.data_path + 'cluster_info',
                           'cluster_info.py')
     pprint.pprint(dict(hadoopy.readtb(self.data_path + 'cluster_info')))
Example 48
def launch_frozen(in_name, out_name, script_path, hbase_in=True, hbase_out=False, columns=(), start_row=None, stop_row=None, single_value=None, **kw):
    _launch_args(hbase_in, hbase_out, columns, start_row, stop_row, single_value, kw)
    hadoopy.launch_frozen(in_name, out_name, script_path, **kw)
Example 49
import time

# Setup paths
data_path = 'hadoopy-test-data/%f/' % time.time()
input_path = data_path + 'input'
output_path_a = data_path + 'output_a'
output_path_b = data_path + 'output_b'
output_path_c = data_path + 'output_c'
output_path_d = data_path + 'output_d'

# Write data to HDFS in the form of (term #, term)
input_data = [(1, 5), ('dsfs', {
    'a': 3
}), ([1, 2], 'sdflk')]  # Diverse KV input
hadoopy.writetb(input_path, input_data)

# Launch the jobs
hadoopy.launch_frozen(input_path, output_path_a, 'identity.py')
hadoopy.launch_frozen(input_path, output_path_b, 'identity.py')
hadoopy.launch_frozen(output_path_b, output_path_c, 'identity.py')
hadoopy.launch_frozen(
    [input_path, output_path_a, output_path_b, output_path_c], output_path_d,
    'identity.py')

# Read the first KV pair
print('KV Input[%s]' % str(hadoopy.readtb(input_path).next()))
print('KV Output a[%s]' % str(hadoopy.readtb(output_path_a).next()))
print('KV Output b[%s]' % str(hadoopy.readtb(output_path_b).next()))
print('KV Output c[%s]' % str(hadoopy.readtb(output_path_c).next()))
print('KV Output d[%s]' % str(hadoopy.readtb(output_path_d).next()))
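'identity.py' is not shown here; since each stage re-emits its input unchanged, a minimal version would be the following sketch (an assumption, not the original script):

import hadoopy

def mapper(key, value):
    # Pass every key/value pair through untouched.
    yield key, value

if __name__ == '__main__':
    hadoopy.run(mapper)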
Example 50
from hadoopy import launch_frozen

input_path = 'hdfs://laserson-1.ent.cloudera.com/ngrams'
output_path = 'hdfs://laserson-1.ent.cloudera.com/output-hadoopy-frozen'

launch_frozen(
    input_path,
    output_path,
    'ngrams.py',
    use_seqoutput=False,
    num_reducers=10,
    hstreaming=
    '/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar'
)
Example 51
def exemplar_boxes(hdfs_input, hdfs_output):
    exemplar_name = 'ad813d130f4803e948124823a67cdd7b-[0.0, 0.16326530612244897, 0.3448275862068966, 0.5714285714285714]'
    st = time.time()
    exemplar_out = hadoopy.abspath(hdfs_output +
                                   'exemplar_boxes/%s' % st) + '/'
    for kv in hadoopy.readtb(hdfs_output + 'exemplars-2'):
        (image_id, box, score), _ = kv
        if exemplar_name == '%s-%s' % (image_id, box):
            print('Found it')
            with open('exemplars-patch.pkl', 'w') as fp:
                pickle.dump([kv], fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-v',
                          exemplar_out + 'val_pos',
                          'hard_predictions.py',
                          cmdenvs=[
                              'EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100',
                              'OUTPUT_FORMAT=score_image_box'
                          ],
                          files=['exemplars-patch.pkl'],
                          num_reducers=10)
    hadoopy.launch_frozen(hdfs_input + '0-v',
                          exemplar_out + 'val_neg',
                          'hard_predictions.py',
                          cmdenvs=[
                              'EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100',
                              'OUTPUT_FORMAT=score_image_box'
                          ],
                          files=['exemplars-patch.pkl'],
                          num_reducers=10)
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        pos_boxes = [(score, image_id, box, 1)
                     for score, image_id, box in sorted(
                         hadoopy.readtb(exemplar_out + 'val_pos').next()[1])]
        neg_boxes = [(score, image_id, box, 0)
                     for score, image_id, box in sorted(
                         hadoopy.readtb(exemplar_out + 'val_neg').next()[1])]
        for num, (score, image_id, box,
                  pol) in enumerate(sorted(pos_boxes + neg_boxes,
                                           reverse=True)):
            image_box_fns.setdefault(image_id, []).append(
                (box, 'exemplar-%.5d-%d-%f.png' % (num, pol, score)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'],
                          exemplar_out + 'boxes_cropped',
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs={'TYPE': 'image'})
    out_dir = 'exemplars_similar_cropped/'
    try:
        shutil.rmtree('exemplars_similar_cropped')
    except OSError:
        pass
    print('Outputting cropped')
    os.makedirs(out_dir)
    print(exemplar_out + 'boxes_cropped')
    for x, y in hadoopy.readtb(exemplar_out + 'boxes_cropped'):
        open(out_dir + x, 'w').write(y)

    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'],
                          exemplar_out + 'boxes',
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs={'TYPE': 'box'})
    out_dir = 'exemplars_similar/'
    try:
        shutil.rmtree('exemplars_similar')
    except OSError:
        pass
    print('Outputting boxes')
    os.makedirs(out_dir)
    for x, y in hadoopy.readtb(exemplar_out + 'boxes'):
        open(out_dir + x, 'w').write(y)
Example 52
#hadoopy.launch_local(r + 'out/flickr_metadata', r + 'out/flickr_images', 'file_downloader.py',
#                     worker_queue_maxsize=10)  # , max_input=100
print('Downloaded images')
import glob
import os
import shutil
import cv2
import hadoopy
for fn in glob.glob('*.JPG'):
    img = cv2.imread(fn)
    img = cv2.resize(img, (int(img.shape[1] / 2.5), int(img.shape[0] / 2.5)))
    try:
        os.remove('target.jpg')
    except OSError:
        pass
    cv2.imwrite('target.jpg', img)
    hadoopy.launch_frozen([
        'flickr_data_picarus/run-1343747418.029870/out/down',
        'flickr_data_picarus/run-1343712226.822338/out/flickr_images'
    ],
                          r + 'tiles',
                          'picnic_job.py',
                          files=['target.jpg'],
                          remove_output=True)
    base = os.path.basename(fn) + '_tiles/'
    try:
        os.makedirs(base)
    except OSError:
        pass
    for k, v in hadoopy.readtb(r + 'tiles'):
        with open(base + k, 'w') as fp:
            fp.write(v)