Example 1
def output_exemplars(hdfs_input,
                     hdfs_output,
                     num=2,
                     output_type='box',
                     output_path='exemplars'):
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        for (image_id, box,
             score), _ in hadoopy.readtb(hdfs_output + 'exemplars-%d' % num):
            image_box_fns.setdefault(image_id, []).append(
                (box, 'exemplar-%.5d-%s-%s.png' % (score, image_id, box)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-tr',
                          hdfs_output + 'exemplars-%d-clip' % num,
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs=['TYPE=%s' % output_type])
    try:
        shutil.rmtree(output_path)
    except OSError:
        pass
    os.makedirs(output_path)
    for x, y in hadoopy.readtb(hdfs_output + 'exemplars-%d-clip' % num):
        open(output_path + '/%s' % (x, ), 'w').write(y)
def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output, classifier_name, classifier_extra, local_labels, classifier, **kw):
    """
    TODO Finish docstring
    Args:
        hdfs_output: Path to hdfs temporary output or None if execution should be performed locally using hadoopy.launch_local.
    """
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    if hdfs_output is None:
        j = hadoopy.launch_local(hdfs_input_pos, None, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in j['output']), [])
        j = hadoopy.launch_local(hdfs_input_neg, None, _lf('collect_keys.py'))
        neg_keys = sum((x[1] for x in j['output']), [])
    else:
        hdfs_output_pos = hdfs_output + '/pos'
        hdfs_output_neg = hdfs_output + '/neg'
        picarus._launch_frozen(hdfs_input_pos, hdfs_output_pos, _lf('collect_keys.py'))
        picarus._launch_frozen(hdfs_input_neg, hdfs_output_neg, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
        neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])

    labels[classifier_name] = {'labels': {'1': pos_keys, '-1': neg_keys},
                               'classifier': classifier,
                               'classifier_extra': classifier_extra}
    file_parse.dump(labels, local_labels)
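The docstring above notes that hdfs_output switches the execution mode. A minimal usage sketch follows, assuming placeholder paths and classifier settings (none of these names come from the original code):

# Hypothetical usage sketch: hdfs_output=None collects keys with hadoopy.launch_local,
# while an HDFS path collects them with two frozen Hadoop jobs.
run_classifier_labels('/user/example/feat_pos', '/user/example/feat_neg',
                      None, 'indoor_outdoor', {}, 'labels.js', 'svmlinear')
run_classifier_labels('/user/example/feat_pos', '/user/example/feat_neg',
                      '/user/example/tmp_labels', 'indoor_outdoor', {}, 'labels.js', 'svmlinear')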
Example 3
 def compute_db_hadoop(self, hdfs_path):
     import json
     si = picarus.api.SearchIndex()
     si.name = '%s.%s' % (self.__class__.__module__, self.__class__.__name__)
     si.feature = json.dumps(self.feature_dict)  # TODO: What to do with the pkl file?
     with hadoopy_helper.hdfs_temp() as hdfs_output:
         picarus.vision.run_image_clean(hdfs_path, hdfs_output + '/clean', max_side=self.max_side)
         # Compute features (map)
         picarus.vision.run_image_feature(hdfs_output + '/clean', hdfs_output + '/feature', self.feature_dict, files=self.required_files)
         # Random sample features for hashes (map) and train hasher (reduce)
         hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/hasher', _lf('train_hasher.py'), cmdenvs={'KV_PROB': 1.,
                                                                                                                   'HASH_BITS': 128})
         hasher = hadoopy.readtb(hdfs_output + '/hasher').next()[1]
         si.hash = pickle.dumps(hasher, -1)
         si.hash_format = si.PICKLE
         # Compute features hashes (map) and build database (reduce)
         open('hasher.pkl', 'w').write(si.hash)
         hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/db', _lf('build_db.py'), files=['hasher.pkl'])
         metadata, hashes = hadoopy.readtb(hdfs_output + '/db').next()
         self.metadata = metadata
         si.metadata.extend(metadata.tolist())
         self.index = image_search.LinearHashDB().store_hashes(hashes, np.arange(len(metadata), dtype=np.uint64))
         si.index = pickle.dumps(self.index, -1)
         si.index_format = si.PICKLE
         open('index.pb', 'w').write(si.SerializeToString())
def report_clusters(hdfs_input, category, make_faces, **kw):
    """
    NOTE: This transfers much more image data than is necessary! Really this operation
    should be done directly on hdfs
    """
    def make_face_image(facestr):
        name, ext = os.path.splitext(facestr)
        m = re.match(r'(\w+)-face-x0(\d+)-y0(\d+)-x1(\d+)-y1(\d+)', name)
        hash, l, t, r, b = m.groups()
        l, t, r, b = map(int, (l, t, r, b))
        return {
            'hash': hash,
            'categories': ['faces'],
            'faces': [{'boundingbox': ((l, t), (r, b))}],
            'video': [],
        }

    # Collect all the clusters as a set of lists
    clusters = {}
    cluster_samples = {}

    def update(cluster_index, image_name, clusters):
        cluster = clusters.setdefault(cluster_index, [])
        if make_faces:
            face_image = make_face_image(image_name)
            cluster.append(face_image)
        else:
            cluster.append({
                'hash': image_name,
                'categories': [category],
                'faces': [],
                'video': [],
                })

    for cluster_index, (image_name, _)  in hadoopy.readtb(hdfs_input + '/partition'):
        update(cluster_index, image_name, clusters)

    for cluster_index, (image_name, _)  in hadoopy.readtb(hdfs_input + '/samples'):
        update(cluster_index, image_name, cluster_samples)

    # Gather each cluster
    clusters = [{
        # Sample images uniformly
        'sample_images': samples,
        'all_images': images,
        'size': len(images),
        'children': [],
        'std': 0.0,
        'position': [0.0, 0.0],
        } for ((_, images), (_, samples)) in zip(sorted(clusters.items()),
                                                 sorted(cluster_samples.items()))]

    report = {category: clusters}
    return report
Example 5
def output_exemplars(hdfs_input, hdfs_output, num=2, output_type='box', output_path='exemplars'):
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        for (image_id, box, score), _ in hadoopy.readtb(hdfs_output + 'exemplars-%d' % num):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%s-%s.png' % (score, image_id, box)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-tr', hdfs_output + 'exemplars-%d-clip' % num, 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs=['TYPE=%s' % output_type])
    try:
        shutil.rmtree(output_path)
    except OSError:
        pass
    os.makedirs(output_path)
    for x, y in hadoopy.readtb(hdfs_output + 'exemplars-%d-clip' % num):
        open(output_path + '/%s' % (x,), 'w').write(y)
Example 6
def run_hac(hdfs_input, **kw):
    import scipy as sp
    import scipy.cluster
    import scipy.spatial.distance
    x = np.array([x[1] for x in hadoopy.readtb(hdfs_input)])
    y = sp.spatial.distance.pdist(x)
    return sp.cluster.hierarchy.linkage(y)
Example 7
def main2():
    exemplar_name = 'e05c099586f744a6d9e70b334e79da08-[0.5217391304347826, 0.0, 0.8695652173913043, 0.9523809523809523]'
    path = 'exemplarbank/output/1341790878.92/val_pred_pos_kern2'
    exemplars = pickle.load(open('exemplars.pkl'))
    exemplar_path = 'exemplars'
    exemplar_ids = {}
    exemplar_num = None
    for exemplar_num, ((image_id, box, _), _) in enumerate(exemplars):
        if exemplar_name == '%s-%s' % (image_id, box):
            break
    for y, x in enumerate(exemplars):
        x = x[0][0]
        exemplar_ids.setdefault(x, []).append(y)
    try:
        shutil.rmtree('hik_pairs_specific')
    except OSError:
        pass
    os.makedirs('hik_pairs_specific')
    pq = LeakyPriorityQueue(100)
    for (kernel, row_num), columns in hadoopy.readtb(path):
        if kernel != 'hik' or row_num != exemplar_num:
            continue
        print(row_num)
        # Blacklist all exemplars from the same image
        columns[exemplar_ids[exemplars[row_num][0][0]]] = -np.inf
        for column_num, val in enumerate(columns[:row_num]):
            pq.add(-val, (row_num, column_num))
    for num, (score, (row_num, max_col)) in enumerate(pq.items_sorted()):
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[row_num][0]),
                    'hik_pairs_specific/%.5d-a-%f.png' % (num, -score))
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[max_col][0]),
                    'hik_pairs_specific/%.5d-b-%f.png' % (num, -score))
Example 8
def main():
    path = 'exemplarbank/output/1341790878.92/val_pred_pos_kern2'
    exemplars = pickle.load(open('exemplars.pkl'))
    exemplar_path = 'exemplars'
    exemplar_ids = {}
    for y, x in enumerate(exemplars):
        x = x[0][0]
        exemplar_ids.setdefault(x, []).append(y)
    try:
        shutil.rmtree('hik_pairs')
    except OSError:
        pass
    os.makedirs('hik_pairs')
    pq = LeakyPriorityQueue(100)
    for (kernel, row_num), columns in hadoopy.readtb(path):
        if kernel != 'hik':
            continue
        print(row_num)
        # Blacklist all exemplars from the same image
        columns[exemplar_ids[exemplars[row_num][0][0]]] = -np.inf
        for column_num, val in enumerate(columns[:row_num]):
            pq.add(-val, (row_num, column_num))
    for num, (score, (row_num, max_col)) in enumerate(pq.items_sorted()):
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[row_num][0]),
                    'hik_pairs/%.5d-a-%f.png' % (num, -score))
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[max_col][0]),
                    'hik_pairs/%.5d-b-%f.png' % (num, -score))
Example 9
 def test_err(self):
     nonsense_path = 'sdfskjdfksjdkfjskdfksjdfksdkfjskdjfksjdk'
     self.assertFalse(hadoopy.exists(nonsense_path))
     self.assertEquals(
         hadoopy.abspath(nonsense_path).rsplit('/')[-1], nonsense_path)
     self.assertRaises(IOError, hadoopy.ls, nonsense_path)
     self.assertRaises(IOError, hadoopy.readtb(nonsense_path).next)
def save_display_images(path_hdfs, path_local, min_count,
                        max_count, key_to_path=None):
    """
    Saves images min_count through max_count obtained by calling
    hadoopy.readtb(path_hdfs).  Each item in the sequence is assumed
    to be of the form (key, (imagedata, boxes)).  The boxes are
    drawn on each image before it is saved to the local path.
    If key_to_path is provided, which maps a key to a path, the image
    corresponding to that key will be saved in key_to_path[key].
    """
    if key_to_path is None:
        key_to_path = {}
    count = 0
    for k, (i, bs) in hadoopy.readtb(path_hdfs):
        if count >= min_count:
            if k in key_to_path:
                path = key_to_path[k]
            else:
                path = path_local
            filename = '%s/%s.jpg' % (path, k)
            im = imfeat.convert_image(Image.open(StringIO.StringIO(i)),
                                      [('opencv', 'bgr', 8)])
            print(k)
            for b in bs:
                cv.Rectangle(im, (b[0], b[1]), (b[2], b[3]),
                             cv.CV_RGB(255, 0, 0), 3)
            cv.SaveImage(filename, im)
        # update count and break loop if necessary
        # TODO(Vlad): can we slice notation on a list of generators?
        count += 1
        if count > max_count:
            break
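As a quick way to exercise save_display_images, the following sketch (hypothetical local image and HDFS path) writes one (key, (imagedata, boxes)) record with hadoopy.writetb and renders it locally:

# Minimal sketch with placeholder paths: build a tiny SequenceFile in the expected
# (key, (imagedata, boxes)) form and draw the single box onto the saved image.
import hadoopy

image_data = open('test.jpg', 'rb').read()               # placeholder image
kvs = [('test_key', (image_data, [(10, 10, 80, 80)]))]   # one box as (x0, y0, x1, y1)
hadoopy.writetb('/user/example/display_input', kvs)
save_display_images('/user/example/display_input', '.', 0, 10)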
Example 11
def main():
    exemplars = sorted(pickle.load(open('exemplars.pkl')), key=lambda x: x[0][2], reverse=True)[:100]
    with open('exemplars_best.pkl', 'w') as fp:
        pickle.dump(exemplars, fp, -1)
    hdfs_output = 'exemplarbank/output/%s/' % '1341790878.92'
    #hadoopy.launch_frozen('/user/brandyn/aladdin_results/keyframe/9/keyframe', hdfs_output + 'frame_pred', 'predict_video_frame.py', cmdenvs=['EXEMPLARS=exemplars_best.pkl', 'CELL_SKIP=1'], remove_output=True, files=['exemplars_best.pkl'])
    local_out = 'frame_preds/'
    try:
        shutil.rmtree(local_out)
    except OSError:
        pass
    os.makedirs(local_out)
    for num, (data, (pyramid, num_boxes)) in enumerate(hadoopy.readtb(hdfs_output + 'frame_pred')):
        # Decode the frame and compute the normalized score; both are needed
        # whether or not the pyramid has any detections.
        f = imfeat.image_fromstring(data['frame'])
        p = np.sum(pyramid / float(num_boxes))
        if np.sum(pyramid):
            pyramid_prob = np.sqrt(pyramid / float(np.max(pyramid)))
            pyramid_prob_frame = cv2.resize(pyramid_prob, (f.shape[1], f.shape[0]))
            pyramid_prob_frame_color = COLORS[(pyramid_prob_frame * 255).astype(np.int), :]
            alpha = .5
            beta = alpha * pyramid_prob_frame
            beta = beta.reshape((beta.shape[0], beta.shape[1], 1))
        else:
            beta = 0.
            pyramid_prob_frame_color = 0.
        f = ((1 - beta) * f + beta * pyramid_prob_frame_color).astype(np.uint8)
        print(p)
        open(local_out + '%f-%d.jpg' % (p, num), 'w').write(imfeat.image_tostring(f, 'jpg'))
Example 12
 def __init__(self, input_path, output_path, temp_path):
     self.input_path = input_path
     self.output_path = output_path
     self.temp_path = temp_path
     self.vect_1 = {}
     for k, v in hadoopy.readtb(self.input_path):
         self.vect_1[k] = v               
Example 13
def run_video_frame_classification(train_dir):
    try:
        neg_dir = train_dir + '/0'
        pos_dir = train_dir + '/1'
        while 1:
            # Train using initial pos/neg
            c = vidfeat.SyntheticFrameFeature().train(vidfeat.load_label_frames(train_dir))
            # Predict on dataset
            hdfs_input = random.sample(hadoopy.ls('/user/brandyn/aladdin/mp4_devt/'), 96)
            start_time = '%f' % time.time()
            hdfs_output = '/user/brandyn/aladdin_results/video_grep/%s' % start_time
            picarus.vision.run_video_grep_frames(hdfs_input, hdfs_output, c)
            unsorted_dir = tempfile.mkdtemp()
            try:
                for _, y in hadoopy.readtb(hdfs_output):
                    open('%s/%s.jpg' % (unsorted_dir, hashlib.sha1(y).hexdigest()), 'w').write(y)
                # Present results to user and add to list
                try:
                    cmd = 'python -m interactive_learning.image_selector %s %s %s --port 8083' % (unsorted_dir, pos_dir, neg_dir)
                    print(cmd)
                    subprocess.call(cmd.split())
                except OSError:
                    pass
            finally:
                shutil.rmtree(unsorted_dir)
    finally:
        #shutil.rmtree(temp_root)
        pass
Example 14
def calibrate(hdfs_input, hdfs_output):
    # Predict on pos/neg sets
    hadoopy.launch_frozen(hdfs_input + '1-v',
                          hdfs_output + 'val_pos',
                          'image_predict.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=16'],
                          remove_output=True,
                          num_reducers=10,
                          files=['exemplars.pkl'])
    hadoopy.launch_frozen(hdfs_input + '0-v',
                          hdfs_output + 'val_neg',
                          'image_predict.py',
                          cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=1'],
                          remove_output=True,
                          num_reducers=10,
                          files=['exemplars.pkl'])
    # Calibrate threshold using pos/neg validation set #1
    hadoopy.launch_frozen([
        hdfs_output + 'val_neg', hdfs_output + 'val_pos',
        hdfs_output + 'exemplars-1'
    ],
                          hdfs_output + 'exemplars-2',
                          'calibrate_thresholds.py',
                          num_reducers=50,
                          remove_output=True)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-2'),
                          key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example 15
 def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEquals(len(hadoopy.ls(in_path)), 1)
     self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     launcher(in_path, out_path, 'wc.py', jobconfs=['mapred.min.split.size=100000000',
                                                    'mapreduce.task.userlog.limit.kb=1000'])
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Example 16
def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output, classifier_name, classifier_extra, local_labels, classifier, **kw):
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    hdfs_output_pos = hdfs_output + '/pos'
    hdfs_output_neg = hdfs_output + '/neg'
    hadoopy.launch_frozen(hdfs_input_pos, hdfs_output_pos, 'collect_keys.py')
    hadoopy.launch_frozen(hdfs_input_neg, hdfs_output_neg, 'collect_keys.py')
    pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
    neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])
    labels[classifier_name] = {'labels': {'1': pos_keys, '-1': neg_keys},
                               'classifier': classifier,
                               'classifier_extra': classifier_extra}
    file_parse.dump(labels, local_labels)
def dump_local(hdfs_input, local_output, extension='', **kw):
    """Read data from hdfs and store the contents as hash.ext

    Args:
        hdfs_input: HDFS input path in either 'kv' or 'record' format
        local_output: Local directory output path
        extension: Use this file extension if none available (kv format or
            record with missing extension) (default '')
    """
    try:
        os.makedirs(local_output)
    except OSError:
        pass
    for k, v in hadoopy.readtb(hdfs_input):
        if not isinstance(k, str):
            raise ValueError("Key must be a string. If you are reading data in 'record' form use the 'records' file and not the directory it is in.")
        if isinstance(v, dict):  # record
            try:
                extension = '.' + v['extension'] if v['extension'] else extension
            except KeyError:
                pass
            _record_to_file(v, os.path.join(local_output, k + extension))
        else:
            out_path = os.path.join(local_output, k + ('.' + extension if extension else ''))
            with open(out_path, 'wb') as fp:
                fp.write(v)
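A short usage sketch for both supported input layouts (the paths are placeholders); the 'records' file name follows the error message above:

# Hypothetical usage sketch of dump_local.
dump_local('/user/example/images_kv', 'out_kv', extension='jpg')   # kv: raw bytes stored as <key>.jpg
dump_local('/user/example/records/records', 'out_records')         # record: extension taken from each record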
Example 18
def random_sample(hdfs_input, m, n=None, p=.01, hdfs_temp_dir=None):
    """Return an iterator of m kv pairs selected uniformly from the input

    Finds an alpha such that X = np.sum(np.random.rand(n) < alpha) satisfies X >= m with probability (1 - p).
    If more kv pairs are returned from Hadoop, then they are ignored.  The resulting kv pairs
    are uniformly random from the input.

    Args:
        m: Desired number of samples (you will get this many as long as n >= m with probability (1-p))
        n: Number of total values (default None uses count_kvs to compute this)
        p: Failure probability (default .01 means there is 1 failure out of 100 runs)

    Yields:
        Sample k/v pairs
    """
    if n is None:
        n = count_kvs(hdfs_input)
    alpha = _random_sample_alpha(n, m, p=p)
    num_outputs = 0
    with hadoopy_helper.hdfs_temp(hdfs_temp_dir=hdfs_temp_dir) as hdfs_output:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, _lf('random_sample.py'),
                              cmdenvs={'ALPHA': alpha})
        for kv in hadoopy.readtb(hdfs_output):
            if num_outputs >= m:
                return
            yield kv
            num_outputs += 1
    if num_outputs < m:
        logging.warn('random_sampler: num_outputs[%d] when m[%d].  To prevent this, call with a smaller value of p (currently [%f]).' % (num_outputs, m, p))
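For instance (hypothetical input path), roughly 100 uniformly random pairs can be drawn without knowing the input size in advance:

# Hypothetical usage sketch: n is computed via count_kvs when omitted and at most
# m key/value pairs are yielded from the temporary Hadoop output.
for key, value in random_sample('/user/example/features', 100):
    print(key)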
Example 19
def main2():
    exemplar_name = 'e05c099586f744a6d9e70b334e79da08-[0.5217391304347826, 0.0, 0.8695652173913043, 0.9523809523809523]'
    path = 'exemplarbank/output/1341790878.92/val_pred_pos_kern2'
    exemplars = pickle.load(open('exemplars.pkl'))
    exemplar_path = 'exemplars'
    exemplar_ids = {}
    exemplar_num = None
    for exemplar_num, ((image_id, box, _), _) in enumerate(exemplars):
        if exemplar_name == '%s-%s' % (image_id, box):
            break
    for y, x in enumerate(exemplars):
        x = x[0][0]
        exemplar_ids.setdefault(x, []).append(y)
    try:
        shutil.rmtree('hik_pairs_specific')
    except OSError:
        pass
    os.makedirs('hik_pairs_specific')
    pq = LeakyPriorityQueue(100)
    for (kernel, row_num), columns in hadoopy.readtb(path):
        if kernel != 'hik' or row_num != exemplar_num:
            continue
        print(row_num)
        # Blacklist all exemplars from the same image
        columns[exemplar_ids[exemplars[row_num][0][0]]] = -np.inf
        for column_num, val in enumerate(columns[:row_num]):
            pq.add(-val, (row_num, column_num))
    for num, (score, (row_num, max_col)) in enumerate(pq.items_sorted()):
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[row_num][0]), 'hik_pairs_specific/%.5d-a-%f.png' % (num, -score))
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[max_col][0]), 'hik_pairs_specific/%.5d-b-%f.png' % (num, -score))
Example 20
def main():
    path = 'exemplarbank/output/1341790878.92/val_pred_pos_kern2'
    exemplars = pickle.load(open('exemplars.pkl'))
    exemplar_path = 'exemplars'
    exemplar_ids = {}
    for y, x in enumerate(exemplars):
        x = x[0][0]
        exemplar_ids.setdefault(x, []).append(y)
    try:
        shutil.rmtree('hik_pairs')
    except OSError:
        pass
    os.makedirs('hik_pairs')
    pq = LeakyPriorityQueue(100)
    for (kernel, row_num), columns in hadoopy.readtb(path):
        if kernel != 'hik':
            continue
        print(row_num)
        # Blacklist all exemplars from the same image
        columns[exemplar_ids[exemplars[row_num][0][0]]] = -np.inf
        for column_num, val in enumerate(columns[:row_num]):
            pq.add(-val, (row_num, column_num))
    for num, (score, (row_num, max_col)) in enumerate(pq.items_sorted()):
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[row_num][0]), 'hik_pairs/%.5d-a-%f.png' % (num, -score))
        shutil.copy(_find_exemplar_fn(exemplar_path, exemplars[max_col][0]), 'hik_pairs/%.5d-b-%f.png' % (num, -score))
Example 21
def initial_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr', hdfs_output + 'neg', 'compute_exemplar_features.py', remove_output=True)
    hadoopy.launch_frozen(hdfs_input + '1-tr', hdfs_output + 'pos', 'compute_exemplar_features.py', remove_output=True)
    # Compute desired probability
    num_val = 5000
    num_neg_train = 5000
    toggle_launch()
    if 0:
        neg_samples = list(hadoopy_helper.jobs.random_sample(hdfs_output + 'neg', num_val + num_neg_train))
        neg_samples = [x[1] for x in neg_samples]
        with open('neg_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[num_val:]), fp, -1)
        with open('neg_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[:num_val]), fp, -1)
        del neg_samples
        gc.collect()
        pos_samples = list(hadoopy_helper.jobs.random_sample(hdfs_output + 'pos', num_val / 2))  # Twice as many neg as positive
        pos_samples = [x[1] for x in pos_samples]
        with open('pos_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(pos_samples), fp, -1)
        del pos_samples
    gc.collect()
    cmdenvs = {'NEG_FEATS': 'neg_feats.pkl',
               'POS_VAL_FEATS': 'pos_val_feats.pkl',
               'NEG_VAL_FEATS': 'neg_val_feats.pkl'}
    files = cmdenvs.values()
    cmdenvs['SAMPLE_SIZE'] = 1000
    hadoopy.launch_frozen(hdfs_output + 'pos', hdfs_output + 'exemplars-0', 'uniform_selection.py',
                          cmdenvs=cmdenvs, remove_output=True, files=files)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-0'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example 22
def report_categories(hdfs_join_predictions_input, local_output, image_limit, local_thumb_output, **kw):
    # Output a cluster for each category
    # FIXME This is hardcoded for indoor_outdoor, it will have to change when
    # there are multiple classifiers (indoor, outdoor, photos, documents, etc)
    hashes = {-1: [], 1: []}
    totals = {-1: 0, 1: 0}

    # First pass: find images for each category
    for image_hash, (classifier_preds, image_data) in hadoopy.readtb(hdfs_join_predictions_input):
        for classifier, preds in classifier_preds.items():
            posname, negname = classifier.split('_')
            for conf, label in preds:
                totals[label] += 1
                if len(hashes[label]) < image_limit:
                    heapq.heappush(hashes[label], (conf, image_hash))
                else:
                    heapq.heappushpop(hashes[label], (conf, image_hash))

    print negname, len(hashes[-1]), totals[-1]
    print posname, len(hashes[1]), totals[1]

    categories = {}
    categories[posname] = report_output.make_random_clusters([h for _, h in hashes[1]], posname)
    categories[negname] = report_output.make_random_clusters([h for _, h in hashes[-1]], negname)

    try:
        os.makedirs(os.path.dirname(local_output))
    except OSError:
        pass
    file_parse.dump(categories, local_output)

    # Second pass: make image thumbnails
    if local_thumb_output:
        try:
            os.makedirs(local_thumb_output)
        except OSError:
            pass
        hashset = set([h for _, h in hashes[-1] + hashes[1]])
        for image_hash, (classifier_preds, image_data) in hadoopy.readtb(hdfs_join_predictions_input):
            if image_hash in hashset:
                s = StringIO.StringIO()
                s.write(image_data)
                s.seek(0)
                frame = Image.open(s)
                frame.thumbnail((100,100))
                path = '%s/%s.jpg' % (local_thumb_output, image_hash)
                frame.save(path)
Example 23
def report_clusters(hdfs_input, local_json_output, sample, category, make_faces, **kw):
    """
    NOTE: This transfers much more image data than is necessary! Really this operation
    should be done directly on hdfs
    """
    def make_face_image(facestr):
        name, ext = os.path.splitext(facestr)
        m = re.match(r'(\w+)-face-x0(\d+)-y0(\d+)-x1(\d+)-y1(\d+)', name)
        print name
        try:
            hash, l, t, r, b = m.groups()
            l,t,r,b = map(int, (l,t,r,b))
            return {
                'hash': hash,
                'categories': ['faces'],
                'faces': [{'boundingbox': ((l,t),(r,b))}],
                'video': [],
                }
        except AttributeError:  # re.match returned None for an unexpected name
            return {}

    # Collect all the clusters as a set of lists
    clusters = {}
    count = 0
    for cluster_index, (image_name, _)  in hadoopy.readtb(hdfs_input):
        count += 1
        if count % 100 == 0: print count
        cluster = clusters.setdefault(cluster_index, [])
        if make_faces:
            face_image = make_face_image(image_name)
            cluster.append(face_image)
        else:
            cluster.append({
                'hash': image_name,
                'categories': [category],
                'faces': [],
                'video': [],
                })

    # Gather each cluster
    print len(clusters), 'clusters'
    clusters = [{
        # Sample images uniformly
        'sample_images': random.sample(image_set, min(len(image_set), sample)),
        'all_images': image_set,
        'size': len(image_set),
        'children': [],
        'std': 0.0,
        'position': [0.0, 0.0],
        } for image_set in clusters.values()]

    try:
        os.makedirs(os.path.dirname(local_json_output))
    except OSError:
        pass
    report = {category: clusters}
    file_parse.dump(report, local_json_output)
Example 24
def compute_database(flickr_data):
    r = 'image_search/%f/' % time.time()
    f_path = r + 'features/'
    m_path = r + 'median/'
    h_path = r + 'hashes/'
    j_path = r + 'hash_metadata/'
    hadoopy.launch_frozen(flickr_data, f_path, 'build_features.py')
    hadoopy.launch_frozen(f_path, m_path, 'calc_median_feature.py')
    median = np.array([x for _, x in sorted(hadoopy.readtb(m_path))])
    pickle.dump(median, open('median.pkl', 'w'), -1)
    hadoopy.launch_frozen(f_path, h_path, 'compute_hashes.py', files=['median.pkl'])
    hadoopy.launch_frozen([h_path, flickr_data], j_path, 'join.py',
                          num_reducers=10)
    hashes, metadatas = zip(*[x[1] for x in hadoopy.readtb(j_path)])
    hashes = np.array([x.ravel() for x in hashes])
    with open('database.pkl', 'w') as fp:
        pickle.dump((hashes, metadatas, median), fp, -1)
Example 25
def read_hdfs_as_generator(path, read_all_at_once=False):
    """Reads a path at HDFS and returns it line by line as a generator
       
       Args:
           path (strng): HDFS path
           read_all_at_once
        
        Returns: strings (lines of the file) 
    """
    if read_all_at_once:
        lines = [i for i in hadoopy.readtb(path)]
        for i in lines:
            yield i

    else:
        for i in hadoopy.readtb(path):
            yield i
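Both modes yield the same records; a small sketch with a placeholder path:

# Hypothetical sketch: read_all_at_once=True only changes when records are pulled
# from HDFS, not what is yielded.
for key, value in read_hdfs_as_generator('/user/example/seqfile', read_all_at_once=True):
    print(key)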
def main():
    word_counts = dict(hadoopy.readtb(hdfs_index))
    for word in word_counts:
        batch = table_index.batch()
        for url in word_counts[word]:
            tfidf = word_counts[word][url]
            batch.put(word.encode('utf-8'),
                      {"wiki:" + url: str(tfidf).encode('utf-8')})
        batch.send()
Example 27
def extractInfo(file_name):
    line_raw = dict(hadoopy.readtb(file_name))
    line_raw = pd.DataFrame(line_raw.values(),
                            columns=[
                                'DATE', 'TIME', 'LINE', 'BUS_NUM',
                                'X_COORDINATE', 'Y_COORDINATE'
                            ])
    line = data_extraction.getCoord(line_raw)
    return line
Example 28
 def _run_face(self, fn, out_path, **kw):
     in_path = self.data_path + fn
     hdfs_out_path = '%sout-%s-%f' % (self.data_path, fn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path, hdfs_out_path, 'face_finder.py', files=['haarcascade_frontalface_default.xml'], **kw)
     for num, ((image_name, box), image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
         with open(out_path + 'img%.8d.png' % num, 'w') as fp:
             fp.write(image_data)
Example 29
 def _inner():
     with open('image_box_fns.pkl', 'w') as fp:
         image_box_fns = {}
         for (image_id, box, score), negs in hadoopy.readtb(hdfs_output + 'hard_neg'):
             for score2, image_id2, box2 in negs:
                 image_box_fns.setdefault(image_id2, []).append((box2, [image_id, box, score]))
         pickle.dump(image_box_fns, fp, -1)
     del image_box_fns
     gc.collect()
def main():
    word_counts = dict(hadoopy.readtb(hdfs_index))
    for word in word_counts:
        batch = table_index.batch()
        for url in word_counts[word]:
            tfidf = word_counts[word][url]
            batch.put(word.encode('utf-8'),
                      {"wiki:" + url: str(tfidf).encode('utf-8')})
        batch.send()
Example 31
 def _run_face(self, fn, **kw):
     in_path = self.data_path + fn
     out_path = "%sout-%s-%f" % (self.data_path, fn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path, out_path, "face_finder.py", files=["haarcascade_frontalface_default.xml"], **kw)
     for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
         with open(self.out_path + "img%.8d.jpg" % num, "w") as fp:
             fp.write(image_data)
Example 32
def calibrate(hdfs_input, hdfs_output):
    # Predict on pos/neg sets
    hadoopy.launch_frozen(hdfs_input + '1-v', hdfs_output + 'val_pos', 'image_predict.py', cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=16'], remove_output=True, num_reducers=10, files=['exemplars.pkl'])
    hadoopy.launch_frozen(hdfs_input + '0-v', hdfs_output + 'val_neg', 'image_predict.py', cmdenvs=['EXEMPLARS=exemplars.pkl', 'CELL_SKIP=1'], remove_output=True, num_reducers=10, files=['exemplars.pkl'])
    # Calibrate threshold using pos/neg validation set #1
    hadoopy.launch_frozen([hdfs_output + 'val_neg', hdfs_output + 'val_pos', hdfs_output + 'exemplars-1'], hdfs_output + 'exemplars-2', 'calibrate_thresholds.py', num_reducers=50, remove_output=True)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-2'), key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example 33
 def _run_face(self, fn):
     in_path = self.data_path + fn
     out_path = self.data_path + 'out-' + fn
     cmd = 'hadoop fs -put %s %s' % (fn, in_path)
     subprocess.check_call(cmd.split())
     hadoopy.launch_frozen(in_path, out_path, 'face_finder.py', reducer=False, files=['haarcascade_frontalface_default.xml'])
     for num, (image_name, (image_data, faces)) in enumerate(hadoopy.readtb(out_path)):
         with open(self.out_path + 'img%.8d.jpg' % num, 'w') as fp:
             fp.write(image_data)
def test_tb(path):
    """
    This function tests the sequence file at 'path' (on hdfs) by
    reading the images from it.
    """
    # test that we can read each file using _load_cv_image
    for (key, val) in hadoopy.readtb(path):
        print(key)
        i = imfeat.convert_image(Image.open(StringIO.StringIO(val)),
                                [('opencv', 'gray', 8)])
def report_video_keyframe(hdfs_input, **kw):
    videos = {}
    for (kind, hash), v in hadoopy.readtb(hdfs_input):
        if kind == 'video':
            videos[hash] = v
    if not len(videos):
        # Sanity check
        print "No videos returned by readtb(%s). This is probably the wrong keyframe path" % hdfs_input
    report = {'videos': videos}
    return report
Example 36
def exemplar_boxes(hdfs_input, hdfs_output):
    exemplar_name = 'ad813d130f4803e948124823a67cdd7b-[0.0, 0.16326530612244897, 0.3448275862068966, 0.5714285714285714]'
    st = time.time()
    exemplar_out = hadoopy.abspath(hdfs_output + 'exemplar_boxes/%s' % st) + '/'
    for kv in hadoopy.readtb(hdfs_output + 'exemplars-2'):
        (image_id, box, score), _ = kv
        if exemplar_name == '%s-%s' % (image_id, box):
            print('Found it')
            with open('exemplars-patch.pkl', 'w') as fp:
                pickle.dump([kv], fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-v', exemplar_out + 'val_pos', 'hard_predictions.py', cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100', 'OUTPUT_FORMAT=score_image_box'], files=['exemplars-patch.pkl'],
                          num_reducers=10)
    hadoopy.launch_frozen(hdfs_input + '0-v', exemplar_out + 'val_neg', 'hard_predictions.py', cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100', 'OUTPUT_FORMAT=score_image_box'], files=['exemplars-patch.pkl'],
                          num_reducers=10)
    with open('image_box_fns.pkl', 'w') as fp:
        image_box_fns = {}
        pos_boxes = [(score, image_id, box, 1) for score, image_id, box in sorted(hadoopy.readtb(exemplar_out + 'val_pos').next()[1])]
        neg_boxes = [(score, image_id, box, 0) for score, image_id, box in sorted(hadoopy.readtb(exemplar_out + 'val_neg').next()[1])]
        for num, (score, image_id, box, pol) in enumerate(sorted(pos_boxes + neg_boxes, reverse=True)):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%d-%f.png' % (num, pol, score)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes_cropped', 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs={'TYPE': 'image'})
    out_dir = 'exemplars_similar_cropped/'
    try:
        shutil.rmtree('exemplars_similar_cropped')
    except OSError:
        pass
    print('Outputting cropped')
    os.makedirs(out_dir)
    print(exemplar_out + 'boxes_cropped')
    for x, y in hadoopy.readtb(exemplar_out + 'boxes_cropped'):
        open(out_dir + x, 'w').write(y)

    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes', 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs={'TYPE': 'box'})
    out_dir = 'exemplars_similar/'
    try:
        shutil.rmtree('exemplars_similar')
    except OSError:
        pass
    print('Outputting boxes')
    os.makedirs(out_dir)
    for x, y in hadoopy.readtb(exemplar_out + 'boxes'):
        open(out_dir + x, 'w').write(y)
def test_tb(path):
    """
    This function tests the sequence file at 'path' (on hdfs) by
    reading the images from it.
    """
    # test that we can read each file using _load_cv_image
    for (key, val) in hadoopy.readtb(path):
        print(key)
        i = imfeat.convert_image(Image.open(StringIO.StringIO(val)),
                                 [('opencv', 'gray', 8)])
Example 38
def report_video_keyframe(hdfs_input, **kw):
    videos = {}
    for (kind, hash), v in hadoopy.readtb(hdfs_input):
        if kind == 'video':
            videos[hash] = v
    if not len(videos):
        # Sanity check
        print "No videos returned by readtb(%s). This is probably the wrong keyframe path" % hdfs_input
    report = {'videos': videos}
    return report
Example 39
 def _run_wc(self,
             orig_fn,
             script_name='wc.py',
             launcher=hadoopy.launch_frozen,
             **kw):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEquals(len(hadoopy.ls(in_path)), 1)
     #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     if not isinstance(launcher, str):
         launcher(in_path,
                  out_path + '_list_jobconfs',
                  script_name,
                  jobconfs=[
                      'mapred.min.split.size=100000000',
                      'mapreduce.task.userlog.limit.kb=1000'
                  ],
                  **kw)
         launcher(in_path,
                  out_path,
                  script_name,
                  jobconfs={
                      'mapred.min.split.size': '100000000',
                      'mapreduce.task.userlog.limit.kb': '1000'
                  },
                  **kw)
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == 'launch_frozen_cmd':
         cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (
             script_name, in_path, out_path)
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Example 40
 def _run_hdfs(self, orig_fn):
     fn = "%f-%s" % (time.time(), orig_fn)
     file_path = "%s/%s" % (self.data_path, fn)
     hadoopy.put(orig_fn, file_path)
     cat_output = [_ for _ in hadoopy.readtb(file_path)]
     line = (331, "Title: Alice's Adventures in Wonderland")
     self.assertTrue(line in cat_output)
     ls_output = hadoopy.ls(self.data_path)
     self.assertTrue([x for x in ls_output if x.rsplit("/", 1)[-1] == fn])
     ls_output = hadoopy.ls(file_path)
     self.assertTrue(ls_output[0].rsplit("/", 1)[-1] == fn)
Example 41
 def _run_hdfs(self, orig_fn):
     fn = '%f-%s' % (time.time(), orig_fn)
     file_path = '%s/%s' % (self.data_path, fn)
     hadoopy.put(orig_fn, file_path)
     cat_output = [_ for _ in hadoopy.readtb(file_path)]
     line = (331, 'Title: Alice\'s Adventures in Wonderland')
     self.assertTrue(line in cat_output)
     ls_output = hadoopy.ls(self.data_path)
     self.assertTrue([x for x in ls_output if x.rsplit('/', 1)[-1] == fn])
     ls_output = hadoopy.ls(file_path)
     self.assertTrue(ls_output[0].rsplit('/', 1)[-1] == fn)
Example 42
 def _inner():
     with open('image_box_fns.pkl', 'w') as fp:
         image_box_fns = {}
         for (image_id, box,
              score), negs in hadoopy.readtb(hdfs_output + 'hard_neg'):
             for score2, image_id2, box2 in negs:
                 image_box_fns.setdefault(image_id2, []).append(
                     (box2, [image_id, box, score]))
         pickle.dump(image_box_fns, fp, -1)
     del image_box_fns
     gc.collect()
Example 43
 def compute_db_hadoop(self, hdfs_path):
     import json
     si = picarus.api.SearchIndex()
     si.name = '%s.%s' % (self.__class__.__module__,
                          self.__class__.__name__)
     si.feature = json.dumps(
         self.feature_dict)  # TODO: What to do with the pkl file?
     with hadoopy_helper.hdfs_temp() as hdfs_output:
         picarus.vision.run_image_clean(hdfs_path,
                                        hdfs_output + '/clean',
                                        max_side=self.max_side)
         # Compute features (map)
         picarus.vision.run_image_feature(hdfs_output + '/clean',
                                          hdfs_output + '/feature',
                                          self.feature_dict,
                                          files=self.required_files)
         # Random sample features for hashes (map) and train hasher (reduce)
         hadoopy.launch_frozen(hdfs_output + '/feature',
                               hdfs_output + '/hasher',
                               _lf('train_hasher.py'),
                               cmdenvs={
                                   'KV_PROB': 1.,
                                   'HASH_BITS': 128
                               })
         hasher = hadoopy.readtb(hdfs_output + '/hasher').next()[1]
         si.hash = pickle.dumps(hasher, -1)
         si.hash_format = si.PICKLE
         # Compute features hashes (map) and build database (reduce)
         open('hasher.pkl', 'w').write(si.hash)
         hadoopy.launch_frozen(hdfs_output + '/feature',
                               hdfs_output + '/db',
                               _lf('build_db.py'),
                               files=['hasher.pkl'])
         metadata, hashes = hadoopy.readtb(hdfs_output + '/db').next()
         self.metadata = metadata
         si.metadata.extend(metadata.tolist())
         self.index = image_search.LinearHashDB().store_hashes(
             hashes, np.arange(len(metadata), dtype=np.uint64))
         si.index = pickle.dumps(self.index, -1)
         si.index_format = si.PICKLE
         open('index.pb', 'w').write(si.SerializeToString())
def make_training_set(path_hdfs, pos_disp_dir, neg_disp_dir,
                      pos_dir, neg_dir, pos_file, neg_file, max_count):
    """
    Makes a training set by downloading the original images (w/o overlaid
    boxes) corresponding to the positives and negatives from the display
    directories.  The file lists used as input by opencv_createsamples and
    by opencv_haartraining are also created.
    """
    key_to_path = {}
    for (d1, d2) in [(pos_disp_dir, pos_dir), (neg_disp_dir, neg_dir)]:
        if not os.path.exists(d2):
            os.makedirs(d2)
        key_to_path.update([(os.path.splitext(
            os.path.basename(f))[0], d2) for f in glob.glob('%s/*' % d1)])
    # save the bounding boxes in a pickle file in each directory
    boxes = {pos_dir : {}, neg_dir : {}}
    # the following two files will contain the list of positive/negative
    # images for training the OpenCV face detector
    pos_fp = open(pos_file, 'w')
    neg_fp = open(neg_file, 'w')
    count = 0
    for k, (i, bs) in hadoopy.readtb(path_hdfs):
        try:
            path = key_to_path[k]
            # save the face bounding boxes for this image
            boxes[path][k] = bs
            # save the original image
            filename = '%s/%s.jpg' % (path, k)
            print(filename)
            with open(filename, 'wb') as f:
                f.write(i)
            # update the positive/negative training lists
            if path == pos_dir:
                pos_fp.write('%s %i' % (filename, len(bs)))
                for b in bs:
                    pos_fp.write(' %i %i %i %i' % (
                        b[0], b[1], b[2] - b[0] + 1, b[3] - b[1] + 1))
                pos_fp.write('\n')
            else:
                neg_fp.write('%s\n' % filename)
        except KeyError:
            pass
        # update count and break loop if necessary
        # TODO(Vlad): can we slice notation on a list of generators?
        count += 1
        if count > max_count:
            break
    pos_fp.close()
    neg_fp.close()
    # save the bounding boxes in a pickle file
    for (path, bs) in boxes.items():
        with open('%s/boxes.pkl' % path, 'wb') as f:
            pickle.dump(bs, f)
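For reference, the two list files written above follow the OpenCV training conventions; an illustrative excerpt (file names and numbers are invented) looks like this:

# pos_file: "<image> <num_boxes> <x> <y> <width> <height> ..." per line, e.g.
#   pos/3fa2b1.jpg 2 30 40 66 82 120 60 34 34
# neg_file: one image path per line, e.g.
#   neg/77c9d0.jpg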
Example 45
def initial_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr',
                          hdfs_output + 'neg',
                          'compute_exemplar_features.py',
                          remove_output=True)
    hadoopy.launch_frozen(hdfs_input + '1-tr',
                          hdfs_output + 'pos',
                          'compute_exemplar_features.py',
                          remove_output=True)
    # Compute desired probability
    num_val = 5000
    num_neg_train = 5000
    toggle_launch()
    if 0:
        neg_samples = list(
            hadoopy_helper.jobs.random_sample(hdfs_output + 'neg',
                                              num_val + num_neg_train))
        neg_samples = [x[1] for x in neg_samples]
        with open('neg_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[num_val:]), fp, -1)
        with open('neg_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(neg_samples[:num_val]), fp, -1)
        del neg_samples
        gc.collect()
        pos_samples = list(
            hadoopy_helper.jobs.random_sample(
                hdfs_output + 'pos',
                num_val / 2))  # Twice as many neg as positive
        pos_samples = [x[1] for x in pos_samples]
        with open('pos_val_feats.pkl', 'w') as fp:
            pickle.dump(np.array(pos_samples), fp, -1)
        del pos_samples
    gc.collect()
    cmdenvs = {
        'NEG_FEATS': 'neg_feats.pkl',
        'POS_VAL_FEATS': 'pos_val_feats.pkl',
        'NEG_VAL_FEATS': 'neg_val_feats.pkl'
    }
    files = cmdenvs.values()
    cmdenvs['SAMPLE_SIZE'] = 1000
    hadoopy.launch_frozen(hdfs_output + 'pos',
                          hdfs_output + 'exemplars-0',
                          'uniform_selection.py',
                          cmdenvs=cmdenvs,
                          remove_output=True,
                          files=files)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-0'),
                          key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)
Example 46
def fetch_assignments_from_hdfs(hdfs_input):
    """Fetch remote assignments and store locally

    Args:
        hdfs_input: HDFS input path

    Returns:
        NamedTemporaryFile holding the assignment data
    """
    assignments_fp = tempfile.NamedTemporaryFile()
    assignments = list(hadoopy.readtb(hdfs_input))
    pickle.dump(assignments, assignments_fp, -1)
    assignments_fp.seek(0)
    return assignments_fp
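A brief usage sketch (hypothetical path); the file is rewound by the function, so it can be unpickled directly:

# Hypothetical usage sketch: recover the pickled list of (key, value) assignments.
import pickle

fp = fetch_assignments_from_hdfs('/user/example/assignments')
assignments = pickle.load(fp)
fp.close()  # the NamedTemporaryFile is deleted on close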
def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output,
                          classifier_name, classifier_extra, local_labels,
                          classifier, **kw):
    """
    TODO Finish docstring
    Args:
        hdfs_output: Path to hdfs temporary output or None if execution should be performed locally using hadoopy.launch_local.
    """
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    if hdfs_output is None:
        j = hadoopy.launch_local(hdfs_input_pos, None, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in j['output']), [])
        j = hadoopy.launch_local(hdfs_input_neg, None, _lf('collect_keys.py'))
        neg_keys = sum((x[1] for x in j['output']), [])
    else:
        hdfs_output_pos = hdfs_output + '/pos'
        hdfs_output_neg = hdfs_output + '/neg'
        picarus._launch_frozen(hdfs_input_pos, hdfs_output_pos,
                               _lf('collect_keys.py'))
        picarus._launch_frozen(hdfs_input_neg, hdfs_output_neg,
                               _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
        neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])

    labels[classifier_name] = {
        'labels': {
            '1': pos_keys,
            '-1': neg_keys
        },
        'classifier': classifier,
        'classifier_extra': classifier_extra
    }
    file_parse.dump(labels, local_labels)
Example 48
 def _run_face(self, fn, out_path, **kw):
     bfn = os.path.basename(fn)
     in_path = self.data_path + bfn
     hdfs_out_path = '%sout-%s-%f' % (self.data_path, bfn, time.time())
     if not hadoopy.exists(in_path):
         hadoopy.put(fn, in_path)
     hadoopy.launch_frozen(in_path,
                           hdfs_out_path,
                           'face_finder.py',
                           files=['haarcascade_frontalface_default.xml'],
                           **kw)
     for num, ((image_name, box),
               image_data) in enumerate(hadoopy.readtb(hdfs_out_path)):
         with open(out_path + 'img%.8d.png' % num, 'w') as fp:
             fp.write(image_data)
Example 49
def readHDFS(path):
    data_raw = dict(hadoopy.readtb(path))
    coordinate = []
    for row in data_raw.itervalues():
        if 'Xrec' in row:
            coordinate.append(row)
    if not coordinate:
        return pd.DataFrame()
    length = len(sorted(coordinate, key=len, reverse=True)[0])
    coordinate_list = [x.encode('UTF8').split(';') for x in coordinate]
    gps_data = np.array([xi + [None] * (length - len(xi)) for xi in coordinate_list])
    gps_data = pd.DataFrame(gps_data)
    gps_data = gps_data.iloc[:, [0, 12, 2, 7, 14, 16]]
    gps_data.columns = ['Date', 'Time', 'Line', 'Bus_num',
                        'X_coordinate', 'Y_coordinate']
    return gps_data
def run_predict_windows(hdfs_input, hdfs_classifier_input, feature, hdfs_output, image_height, image_width, **kw):
    import classipy
    # NOTE: Adds necessary files
    files = glob.glob(classipy.__path__[0] + "/lib/*")
    fp = tempfile.NamedTemporaryFile(suffix='.pkl.gz')
    file_parse.dump(list(hadoopy.readtb(hdfs_classifier_input)), fp.name)
    files.append(fp.name)
    files.append(_lf('data/haarcascade_frontalface_default.xml'))
    cmdenvs = ['CLASSIFIERS_FN=%s' % os.path.basename(fp.name)]
    cmdenvs += ['IMAGE_HEIGHT=%d' % image_height,
                'IMAGE_WIDTH=%d' % image_width,
                'FEATURE=%s' % feature]
    picarus._launch_frozen(hdfs_input, hdfs_output, _lf('predict_windows.py'),
                           cmdenvs=cmdenvs,
                           files=files,
                           dummy_arg=fp)
Example 51
def _run_haystack(fn, script_name):
    cur_time = time.time()
    hdfs_base_path = 'hadoopy-test-data/%f/' % cur_time
    print('Storing HDFS temp files and output in [%s]' % hdfs_base_path)
    in_path = hdfs_base_path + os.path.basename(fn)
    out_path = hdfs_base_path + 'out-' + os.path.basename(fn)
    hadoopy.put(fn, in_path)
    print('Launching job [%s]' % script_name)
    hadoopy.launch_frozen(in_path,
                          out_path,
                          script_name,
                          files=[data_path + 'target.jpg'])
    print('Storing local output in [%s]' % local_out)
    for num, (image_name, image_data) in enumerate(hadoopy.readtb(out_path)):
        open('%s%s-img%.8d-%s.jpg' % (local_out, script_name, num, image_name),
             'w').write(image_data)
Example 52
def report_thumbnails(hdfs_input, local_thumb_output, **kw):
    """Collect thumbnails of all images in hdfs://${hdfs_input}
    """
    counter = 0
    for image_hash, image_data in hadoopy.readtb(hdfs_input):
        path = '%s/%s/%s/%s.jpg' % (local_thumb_output, image_hash[:2],
                                    image_hash[2:4], image_hash)
        try:
            os.makedirs(os.path.dirname(path))
        except OSError:
            pass

        with open(path, 'w') as f:
            f.write(image_data)
        counter += 1
    if not counter:
        print 'There were no images in readtb(%s). This is probably not a thumbnail path' % hdfs_input
Example 53
def main():
    path = 'exemplarbank/output/1341790878.92/val_pred_pos'
    pyramid, num_boxes = hadoopy.readtb(path).next()[1]
    try:
        shutil.rmtree('priors')
    except OSError:
        pass
    os.makedirs('priors')
    exemplars = pickle.load(open('exemplars.pkl'))
    for exemplar_num in range(pyramid.shape[0]):
        print(exemplar_num)
        p = pyramid[exemplar_num, :, :] / float(
            np.max(pyramid[exemplar_num, :, :]))
        p = (p * 255).astype(np.uint8)
        print(p)
        cv2.imwrite(
            'priors/%.5d-%.5d.png' %
            (exemplars[exemplar_num][0][2], exemplar_num), p)
Example 54
def fetch_clusters_from_hdfs(hdfs_input):
    """Fetch remote clusters and store locally

    Clusters are sorted to allow comparing between iterations

    Args:
        hdfs_input: HDFS input path

    Returns:
        NamedTemporaryFile holding the cluster data
    """
    clusters_fp = tempfile.NamedTemporaryFile()
    clusters = [v.tolist() for k, v in hadoopy.readtb(hdfs_input)]
    clusters.sort()
    clusters = np.ascontiguousarray(clusters, dtype=np.float64)
    pickle.dump(clusters, clusters_fp, -1)
    clusters_fp.seek(0)
    return clusters_fp
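Similarly, a usage sketch for the cluster variant (hypothetical path); the unpickled value is the sorted, contiguous float64 array built above:

# Hypothetical usage sketch: reload the cluster centers for comparison between iterations.
import pickle

fp = fetch_clusters_from_hdfs('/user/example/clusters')
clusters = pickle.load(fp)  # np.ndarray with dtype float64, one row per cluster
fp.close()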
Example 55
def main():
    exemplar_feats = list(
        hadoopy.readtb('exemplarbank/output/1341790878.92/pos_sample'))
    feats = np.vstack([x[1] for x in exemplar_feats])
    print(feats.shape)
    try:
        shutil.rmtree('clusters')
    except OSError:
        pass
    os.makedirs('clusters')
    for exemplar_num, cluster_num in enumerate(
            sp.cluster.vq.kmeans2(feats, 20, minit='points')[1]):
        fn = _find_exemplar_fn('exemplars', exemplar_feats[exemplar_num][0])
        cluster_path = 'clusters/%d/' % cluster_num
        try:
            os.makedirs(cluster_path)
        except OSError:
            pass
        shutil.copy(fn, cluster_path)
Example 56
def hard_train(hdfs_input, hdfs_output):
    hadoopy.launch_frozen(hdfs_input + '0-tr',
                          hdfs_output + 'hard_neg',
                          'hard_predictions.py',
                          cmdenvs=[
                              'EXEMPLARS=exemplars.pkl', 'MAX_HARD=100',
                              'OUTPUT_FORMAT=score_image_box'
                          ],
                          num_reducers=10,
                          files=['exemplars.pkl'],
                          remove_output=True)

    def _inner():
        with open('image_box_fns.pkl', 'w') as fp:
            image_box_fns = {}
            for (image_id, box,
                 score), negs in hadoopy.readtb(hdfs_output + 'hard_neg'):
                for score2, image_id2, box2 in negs:
                    image_box_fns.setdefault(image_id2, []).append(
                        (box2, [image_id, box, score]))
            pickle.dump(image_box_fns, fp, -1)
        del image_box_fns
        gc.collect()

    _inner()
    hadoopy.launch_frozen(hdfs_input + '0-tr',
                          hdfs_output + 'hard_neg_clip',
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs=['TYPE=feature'])
    hadoopy.launch_frozen(
        [hdfs_output + 'pos_sample', hdfs_output + 'hard_neg_clip'],
        hdfs_output + 'exemplars-1',
        'train_exemplars_hard.py',
        cmdenvs=['NEG_FEATS=neg_feats.pkl', 'MAX_HARD=200'],
        files=['neg_feats.pkl'],
        remove_output=True,
        num_reducers=10)
    exemplar_out = sorted(hadoopy.readtb(hdfs_output + 'exemplars-1'),
                          key=lambda x: x[0])
    with open('exemplars.pkl', 'w') as fp:
        pickle.dump(exemplar_out, fp, -1)