def _read_files(fns, prev_hashes, hdfs_output, output_format, max_record_size):
    """Read files, de-duplicated by content hash, yielding them for output.

    Args:
        fns: Iterator of file names.
        prev_hashes: Set of hashes (they will be skipped), this is used to make
            the data unique.  Mutated in place as new hashes are seen.
        hdfs_output: HDFS output root; oversized files are put under
            '<hdfs_output>/_blobs'.
        output_format: 'kv' yields (hash, data); 'record' yields (hash, dict).
        max_record_size: If not None (and output_format == 'record'), files
            larger than this many bytes are uploaded to HDFS instead of
            being inlined in the record.

    Yields:
        Tuple of (data_hash, data) where data_hash is a sha1 hash
    """
    for fn in fns:
        sha1_hash = _sha1(fn)
        if sha1_hash in prev_hashes:
            continue  # already emitted this content; skip duplicates
        prev_hashes.add(sha1_hash)
        # os.path.getsize is the idiomatic form of os.stat(fn)[6] (st_size)
        if (output_format == 'record' and max_record_size is not None
                and max_record_size < os.path.getsize(fn)):
            # Too large to inline: put the file into the remote location
            hdfs_path = hadoopy.abspath('%s/_blobs/%s_%s' % (hdfs_output, sha1_hash, os.path.basename(fn)))
            data = ''
            hadoopy.put(fn, hdfs_path)
        else:
            hdfs_path = ''
            # Use a context manager so the handle is closed promptly
            # instead of leaking until GC
            with open(fn) as fp:
                data = fp.read()
        if output_format == 'kv':
            yield sha1_hash, data
        elif output_format == 'record':
            out = {'sha1': sha1_hash, 'full_path': fn,
                   'extension': os.path.splitext(fn)[1][1:]}
            if data:
                out['data'] = data
            if hdfs_path:
                out['hdfs_path'] = hdfs_path
            yield sha1_hash, out
# Example #2
def hdfs_temp(hdfs_temp_dir=None):
    """Yield a unique HDFS temporary path and remove it when done.

    Args:
        hdfs_temp_dir: Base directory for the temporary path; defaults to
            HDFS_TEMP_DIR when None.

    Yields:
        A unique HDFS path under hdfs_temp_dir.
    """
    if hdfs_temp_dir is None:
        hdfs_temp_dir = HDFS_TEMP_DIR
    temp_path = hadoopy.abspath('%s/%f-%f' % (hdfs_temp_dir, time.time(), random.random()))
    try:
        yield temp_path
    finally:
        # Clean up even if the consumer raised while using the temp path;
        # without try/finally the rmr would be silently skipped.
        if hadoopy.exists(temp_path):
            hadoopy.rmr(temp_path)
# Example #3
 def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
     """Run the word-count job on orig_fn and verify HDFS state and output.

     Args:
         orig_fn: Local file to upload and count.
         launcher: hadoopy launcher callable (launch_frozen or launch_local).

     Raises:
         ValueError: If launcher is not a recognized launcher.
     """
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12)
     self.assertEqual(len(hadoopy.ls(in_path)), 1)
     self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     launcher(in_path, out_path, 'wc.py', jobconfs=['mapred.min.split.size=100000000',
                                                    'mapreduce.task.userlog.limit.kb=1000'])
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
# Example #4
 def test_err(self):
     """Operations on a nonexistent HDFS path fail in the expected ways."""
     nonsense_path = 'sdfskjdfksjdkfjskdfksjdfksdkfjskdjfksjdk'
     self.assertFalse(hadoopy.exists(nonsense_path))
     # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12)
     self.assertEqual(
         hadoopy.abspath(nonsense_path).rsplit('/')[-1], nonsense_path)
     self.assertRaises(IOError, hadoopy.ls, nonsense_path)
     # Use the builtin next(); the .next method is Python 2 only and
     # raises AttributeError on Python 3 before the assertion even runs.
     self.assertRaises(IOError, next, hadoopy.readtb(nonsense_path))
# Example #5
 def _run_wc(self, orig_fn, script_name="wc.py", launcher=hadoopy.launch_frozen, **kw):
     """Run the word-count job on orig_fn and verify HDFS state and output.

     Args:
         orig_fn: Local file to upload and count.
         script_name: Job script to launch (default "wc.py").
         launcher: hadoopy launcher callable, or the string
             "launch_frozen_cmd" to exercise the command-line entry point.
         **kw: Extra keyword arguments forwarded to the launcher.

     Raises:
         ValueError: If launcher is not a recognized launcher.
     """
     fn = "out-%f-%s" % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + ".out"
     print(os.path.abspath("."))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12)
     self.assertEqual(len(hadoopy.ls(in_path)), 1)
     self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     if not isinstance(launcher, str):
         launcher(
             in_path,
             out_path,
             script_name,
             jobconfs=["mapred.min.split.size=100000000", "mapreduce.task.userlog.limit.kb=1000"],
             **kw
         )
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == "launch_frozen_cmd":
         cmd = (
             'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"'
             % (script_name, in_path, out_path)
         )
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError("Launcher not recognized")
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc["the"], 1664)
     self.assertEqual(wc["Alice"], 221)
# Example #6
def exemplar_boxes(hdfs_input, hdfs_output):
    """Find one hard-coded exemplar, compute its hard predictions on the
    validation splits, and dump the matched boxes/cropped images locally.

    Args:
        hdfs_input: HDFS input root holding the '1-v'/'0-v' validation sets.
        hdfs_output: HDFS output root holding 'exemplars-2' and receiving
            the per-run 'exemplar_boxes/<timestamp>' results.
    """
    exemplar_name = 'ad813d130f4803e948124823a67cdd7b-[0.0, 0.16326530612244897, 0.3448275862068966, 0.5714285714285714]'
    st = time.time()
    exemplar_out = hadoopy.abspath(hdfs_output + 'exemplar_boxes/%s' % st) + '/'
    for kv in hadoopy.readtb(hdfs_output + 'exemplars-2'):
        (image_id, box, score), _ = kv
        if exemplar_name == '%s-%s' % (image_id, box):
            print('Found it')
            # pickle requires a binary-mode file on Python 3
            with open('exemplars-patch.pkl', 'wb') as fp:
                pickle.dump([kv], fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-v', exemplar_out + 'val_pos', 'hard_predictions.py', cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100', 'OUTPUT_FORMAT=score_image_box'], files=['exemplars-patch.pkl'],
                          num_reducers=10)
    hadoopy.launch_frozen(hdfs_input + '0-v', exemplar_out + 'val_neg', 'hard_predictions.py', cmdenvs=['EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100', 'OUTPUT_FORMAT=score_image_box'], files=['exemplars-patch.pkl'],
                          num_reducers=10)
    with open('image_box_fns.pkl', 'wb') as fp:
        image_box_fns = {}
        # next(...) replaces the Python-2-only .next() generator method
        pos_boxes = [(score, image_id, box, 1) for score, image_id, box in sorted(next(hadoopy.readtb(exemplar_out + 'val_pos'))[1])]
        neg_boxes = [(score, image_id, box, 0) for score, image_id, box in sorted(next(hadoopy.readtb(exemplar_out + 'val_neg'))[1])]
        for num, (score, image_id, box, pol) in enumerate(sorted(pos_boxes + neg_boxes, reverse=True)):
            image_box_fns.setdefault(image_id, []).append((box, 'exemplar-%.5d-%d-%f.png' % (num, pol, score)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes_cropped', 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs={'TYPE': 'image'})
    out_dir = 'exemplars_similar_cropped/'
    try:
        shutil.rmtree('exemplars_similar_cropped')
    except OSError:
        pass  # best-effort: directory may not exist yet
    print('Outputting cropped')
    os.makedirs(out_dir)
    print(exemplar_out + 'boxes_cropped')
    for x, y in hadoopy.readtb(exemplar_out + 'boxes_cropped'):
        # Close handles promptly instead of leaking them per iteration
        # NOTE(review): y looks like binary image data — confirm whether 'wb' is needed
        with open(out_dir + x, 'w') as out_fp:
            out_fp.write(y)

    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'], exemplar_out + 'boxes', 'clip_boxes.py', files=['image_box_fns.pkl'], remove_output=True, cmdenvs={'TYPE': 'box'})
    out_dir = 'exemplars_similar/'
    try:
        shutil.rmtree('exemplars_similar')
    except OSError:
        pass  # best-effort: directory may not exist yet
    print('Outputting boxes')
    os.makedirs(out_dir)
    for x, y in hadoopy.readtb(exemplar_out + 'boxes'):
        with open(out_dir + x, 'w') as out_fp:
            out_fp.write(y)
# Example #7
def exemplar_boxes(hdfs_input, hdfs_output):
    """Find one hard-coded exemplar, compute its hard predictions on the
    validation splits, and dump the matched boxes/cropped images locally.

    Args:
        hdfs_input: HDFS input root holding the '1-v'/'0-v' validation sets.
        hdfs_output: HDFS output root holding 'exemplars-2' and receiving
            the per-run 'exemplar_boxes/<timestamp>' results.
    """
    exemplar_name = 'ad813d130f4803e948124823a67cdd7b-[0.0, 0.16326530612244897, 0.3448275862068966, 0.5714285714285714]'
    st = time.time()
    exemplar_out = hadoopy.abspath(hdfs_output +
                                   'exemplar_boxes/%s' % st) + '/'
    for kv in hadoopy.readtb(hdfs_output + 'exemplars-2'):
        (image_id, box, score), _ = kv
        if exemplar_name == '%s-%s' % (image_id, box):
            print('Found it')
            # pickle requires a binary-mode file on Python 3
            with open('exemplars-patch.pkl', 'wb') as fp:
                pickle.dump([kv], fp, -1)
    hadoopy.launch_frozen(hdfs_input + '1-v',
                          exemplar_out + 'val_pos',
                          'hard_predictions.py',
                          cmdenvs=[
                              'EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100',
                              'OUTPUT_FORMAT=score_image_box'
                          ],
                          files=['exemplars-patch.pkl'],
                          num_reducers=10)
    hadoopy.launch_frozen(hdfs_input + '0-v',
                          exemplar_out + 'val_neg',
                          'hard_predictions.py',
                          cmdenvs=[
                              'EXEMPLARS=exemplars-patch.pkl', 'MAX_HARD=100',
                              'OUTPUT_FORMAT=score_image_box'
                          ],
                          files=['exemplars-patch.pkl'],
                          num_reducers=10)
    with open('image_box_fns.pkl', 'wb') as fp:
        image_box_fns = {}
        # next(...) replaces the Python-2-only .next() generator method
        pos_boxes = [(score, image_id, box, 1)
                     for score, image_id, box in sorted(
                         next(hadoopy.readtb(exemplar_out + 'val_pos'))[1])]
        neg_boxes = [(score, image_id, box, 0)
                     for score, image_id, box in sorted(
                         next(hadoopy.readtb(exemplar_out + 'val_neg'))[1])]
        for num, (score, image_id, box,
                  pol) in enumerate(sorted(pos_boxes + neg_boxes,
                                           reverse=True)):
            image_box_fns.setdefault(image_id, []).append(
                (box, 'exemplar-%.5d-%d-%f.png' % (num, pol, score)))
        pickle.dump(image_box_fns, fp, -1)
    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'],
                          exemplar_out + 'boxes_cropped',
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs={'TYPE': 'image'})
    out_dir = 'exemplars_similar_cropped/'
    try:
        shutil.rmtree('exemplars_similar_cropped')
    except OSError:
        pass  # best-effort: directory may not exist yet
    print('Outputting cropped')
    os.makedirs(out_dir)
    print(exemplar_out + 'boxes_cropped')
    for x, y in hadoopy.readtb(exemplar_out + 'boxes_cropped'):
        # Close handles promptly instead of leaking them per iteration
        # NOTE(review): y looks like binary image data — confirm whether 'wb' is needed
        with open(out_dir + x, 'w') as out_fp:
            out_fp.write(y)

    hadoopy.launch_frozen([hdfs_input + '1-v', hdfs_input + '0-v'],
                          exemplar_out + 'boxes',
                          'clip_boxes.py',
                          files=['image_box_fns.pkl'],
                          remove_output=True,
                          cmdenvs={'TYPE': 'box'})
    out_dir = 'exemplars_similar/'
    try:
        shutil.rmtree('exemplars_similar')
    except OSError:
        pass  # best-effort: directory may not exist yet
    print('Outputting boxes')
    os.makedirs(out_dir)
    for x, y in hadoopy.readtb(exemplar_out + 'boxes'):
        with open(out_dir + x, 'w') as out_fp:
            out_fp.write(y)
# Example #8
 def test_err(self):
     """Operations on a nonexistent HDFS path fail in the expected ways."""
     nonsense_path = "sdfskjdfksjdkfjskdfksjdfksdkfjskdjfksjdk"
     self.assertFalse(hadoopy.exists(nonsense_path))
     # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12)
     self.assertEqual(hadoopy.abspath(nonsense_path).rsplit("/")[-1], nonsense_path)
     self.assertRaises(IOError, hadoopy.ls, nonsense_path)
     # Use the builtin next(); the .next method is Python 2 only and
     # raises AttributeError on Python 3 before the assertion even runs.
     self.assertRaises(IOError, next, hadoopy.readtb(nonsense_path))
# Example #9
def canonicalize_path(path):
    """Return the absolute (canonical) HDFS form of *path*."""
    import hadoopy
    resolved = hadoopy.abspath(path)
    return resolved