Example #1
 def compute_db_hadoop(self, hdfs_path):
     import json
     si = picarus.api.SearchIndex()
     si.name = '%s.%s' % (self.__class__.__module__, self.__class__.__name__)
     si.feature = json.dumps(self.feature_dict)  # TODO: What to do with the pkl file?
     with hadoopy_helper.hdfs_temp() as hdfs_output:
         picarus.vision.run_image_clean(hdfs_path, hdfs_output + '/clean', max_side=self.max_side)
         # Compute features (map)
         picarus.vision.run_image_feature(hdfs_output + '/clean', hdfs_output + '/feature', self.feature_dict, files=self.required_files)
         # Random sample features for hashes (map) and train hasher (reduce)
         hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/hasher', _lf('train_hasher.py'),
                               cmdenvs={'KV_PROB': 1., 'HASH_BITS': 128})
         hasher = hadoopy.readtb(hdfs_output + '/hasher').next()[1]
         si.hash = pickle.dumps(hasher, -1)
         si.hash_format = si.PICKLE
         # Compute features hashes (map) and build database (reduce)
         with open('hasher.pkl', 'wb') as fp:  # the pickled hasher is binary data
             fp.write(si.hash)
         hadoopy.launch_frozen(hdfs_output + '/feature', hdfs_output + '/db', _lf('build_db.py'), files=['hasher.pkl'])
         metadata, hashes = hadoopy.readtb(hdfs_output + '/db').next()
         self.metadata = metadata
         si.metadata.extend(metadata.tolist())
         self.index = image_search.LinearHashDB().store_hashes(hashes, np.arange(len(metadata), dtype=np.uint64))
         si.index = pickle.dumps(self.index, -1)
         si.index_format = si.PICKLE
         with open('index.pb', 'wb') as fp:  # serialized protobuf is binary data
             fp.write(si.SerializeToString())
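A possible round-trip for the index.pb file written above, assuming picarus.api.SearchIndex is an ordinary protobuf message (SerializeToString is used in the method) and that its hash and index fields hold the pickles stored there; this is a sketch, not part of the original class.

import pickle
import picarus.api

si = picarus.api.SearchIndex()
with open('index.pb', 'rb') as fp:
    si.ParseFromString(fp.read())
hasher = pickle.loads(si.hash)
index = pickle.loads(si.index)
print(si.name)
print(len(si.metadata))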
Example #2
def random_sample(hdfs_input, m, n=None, p=.01, hdfs_temp_dir=None):
    """Return an iterator of m kv pairs selected uniformly from the input

    Finds an alpha such that X = np.sum(np.random.rand(n) < alpha) satisfies X >= m with probability (1 - p).
    If more kv pairs are returned from Hadoop, then they are ignored.  The resulting kv pairs
    are uniformly random from the input.

    Args:
        m: Desired number of samples (you will get this many as long as n >= m with probability (1-p))
        n: Number of total values (default None uses count_kvs to compute this)
        p: Failure probability (default .01 means there is 1 failure out of 100 runs)

    Yields:
        Sample k/v pairs
    """
    if n is None:
        n = count_kvs(hdfs_input)
    alpha = _random_sample_alpha(n, m, p=p)
    num_outputs = 0
    with hadoopy_helper.hdfs_temp(hdfs_temp_dir=hdfs_temp_dir) as hdfs_output:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, _lf('random_sample.py'),
                              cmdenvs={'ALPHA': alpha})
        for kv in hadoopy.readtb(hdfs_output):
            if num_outputs >= m:
                return
            yield kv
            num_outputs += 1
    if num_outputs < m:
        logging.warning('random_sample: only %d outputs were produced but m=%d were requested.  To prevent this, call with a smaller value of p (currently %f).' % (num_outputs, m, p))
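_random_sample_alpha is referenced above but not shown. The sketch below is one plausible way to pick such an alpha under the docstring's model, using a normal approximation to the Binomial(n, alpha) count and a bisection search; the function names and iteration counts are illustrative assumptions, not the library's implementation.

import math

def _normal_ppf(q):
    # Inverse standard normal CDF by bisection (the CDF is written via math.erf).
    lo, hi = -10.0, 10.0
    for _ in range(100):
        mid = (lo + hi) / 2.0
        if 0.5 * (1.0 + math.erf(mid / math.sqrt(2.0))) < q:
            lo = mid
        else:
            hi = mid
    return (lo + hi) / 2.0

def _random_sample_alpha_sketch(n, m, p=.01):
    # Smallest alpha such that X ~ Binomial(n, alpha) satisfies X >= m with
    # probability (1 - p): require n*alpha - z*sqrt(n*alpha*(1 - alpha)) >= m,
    # where z is the (1 - p) quantile of the standard normal.
    z = _normal_ppf(1.0 - p)
    lo, hi = float(m) / n, 1.0
    for _ in range(100):
        alpha = (lo + hi) / 2.0
        if n * alpha - z * math.sqrt(n * alpha * (1.0 - alpha)) >= m:
            hi = alpha
        else:
            lo = alpha
    return hi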
Example #3
def main():
    dense_path = 'exemplarbank/output/1341790878.92/pos'
    image_path = 'exemplarbank/data/sun_labelme_person/1-tr'
    image_box_fns = {}
    id_box_features = dict(hash_features(dense_path))
    print id_box_features.items()[0]
    for (image_id, box), feature in id_box_features.items():
        image_box_fns.setdefault(image_id, []).append((box, (image_id, box)))
    with open('image_box_fns.pkl', 'wb') as fp:
        pickle.dump(image_box_fns, fp, -1)
    with hadoopy_helper.hdfs_temp() as hdfs_output:
        hadoopy.launch_frozen(image_path, hdfs_output, 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True,
                              cmdenvs=['TYPE=feature'])
        id_box_features2 = dict(hash_features(hdfs_output))
        with open('compare.pkl', 'wb') as fp:
            pickle.dump((id_box_features, id_box_features2), fp, -1)
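A possible follow-up check on the compare.pkl file written by main() above, assuming the stored feature values are array-like (np.array_equal also accepts plain lists and scalars); it reports key and value differences between the local and Hadoop passes.

import pickle
import numpy as np

with open('compare.pkl', 'rb') as fp:
    id_box_features, id_box_features2 = pickle.load(fp)

only_local = set(id_box_features) - set(id_box_features2)
only_hadoop = set(id_box_features2) - set(id_box_features)
print('keys only in the local pass: %d' % len(only_local))
print('keys only in the Hadoop pass: %d' % len(only_hadoop))
mismatches = [k for k in set(id_box_features) & set(id_box_features2)
              if not np.array_equal(id_box_features[k], id_box_features2[k])]
print('feature mismatches: %d' % len(mismatches))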
Example #6
def launch_map_update(nodes, job_id, redis_host, jobconfs=None):
    jobconfs_base = {'mapred.map.tasks.speculative.execution': 'false',
                     'mapred.reduce.tasks.speculative.execution': 'false',
                     'mapred.task.timeout': '0'}
    if jobconfs:
        jobconfs_base.update(jobconfs)
    with hadoopy_helper.hdfs_temp() as input_path:
        cmdenvs = {'job_id': job_id,
                   'hadoopy_rt_redis': redis_host}
        for node in nodes:
            print(node)
            v = {'script_name': os.path.basename(node['script_path']),
                 'script_data': open(node['script_path']).read()}
            if 'cmdenvs' in node and node['cmdenvs'] is not None:
                v['cmdenvs'] = node['cmdenvs']
            if 'files' in node and node['files'] is not None:
                v['files'] = dict((os.path.basename(f), open(f).read()) for f in node['files'])
            if 'outputs' in node and node['outputs']:
                v['outputs'] = node['outputs']
            hadoopy.writetb('%s/input/%s' % (input_path, node['name']), [(node['name'], v)])
        hadoopy.launch(input_path + '/input', input_path + '/output_path_empty', _lf('hadoopy_rt_job.py'), cmdenvs=cmdenvs,
                       jobconfs=jobconfs_base)
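The node format is only implied by the keys launch_map_update reads, so the call below is a hypothetical illustration: the script paths, cmdenv values, Redis host, and jobconf are made up, and integer names are used to match the per-node writetb layout above.

nodes = [{'name': 0,
          'script_path': 'workers/parse.py',
          'cmdenvs': {'WINDOW': '5'},
          'outputs': [1]},
         {'name': 1,
          'script_path': 'workers/aggregate.py',
          'files': ['model.pkl']}]
launch_map_update(nodes, job_id='job0', redis_host='localhost',
                  jobconfs={'mapred.reduce.tasks': '4'})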
Example #7
def count_kvs(hdfs_input, hdfs_temp_dir=None):
    with hadoopy_helper.hdfs_temp(hdfs_temp_dir=hdfs_temp_dir) as hdfs_output:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, _lf('count.py'), num_reducers=1)
        return sum(x for _, x in hadoopy.readtb(hdfs_output))
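_lf('count.py') is not included in this listing. The sketch below is one mapper/reducer pair consistent with how count_kvs consumes the output (it sums the values of every emitted pair, and num_reducers=1 keeps the totals on a single reducer); treat it as an assumption about the script rather than its actual contents.

import hadoopy

def mapper(key, value):
    # One count per input kv pair, all under the same key.
    yield 'count', 1

def reducer(key, values):
    # With a single reducer this emits one ('count', total) pair.
    yield key, sum(values)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)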
Example #8
def unique_keys(hdfs_input, hdfs_temp_dir=None):
    with hadoopy_helper.hdfs_temp(hdfs_temp_dir=hdfs_temp_dir) as hdfs_output:
        hadoopy.launch_frozen(hdfs_input, hdfs_output, _lf('unique_keys.py'))
        for x in hadoopy.readtb(hdfs_output):
            yield x[0]
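Similarly, _lf('unique_keys.py') is not shown; the sketch below is a guess at its shape that matches how unique_keys() reads the output (one record per distinct key, with the key in x[0]).

import hadoopy

def mapper(key, value):
    # Drop the value; the shuffle groups duplicate keys together.
    yield key, True

def reducer(key, values):
    # Emit each distinct key exactly once.
    yield key, True

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)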