Example #1
 def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEqual(len(hadoopy.ls(in_path)), 1)
     self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     launcher(in_path, out_path, 'wc.py', jobconfs=['mapred.min.split.size=100000000',
                                                    'mapreduce.task.userlog.limit.kb=1000'])
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
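The wc.py script that this test launches is not shown on this page. As a hedged sketch, a minimal hadoopy word-count job built on the hadoopy.run entry point could look like the following (the mapper/reducer bodies are assumptions modeled on hadoopy's word-count demo, not the project's actual script):

import hadoopy

def mapper(key, value):
    # value is one line of text; emit a count of 1 for each word
    for word in value.split():
        yield word, 1

def reducer(key, values):
    # values iterates over the counts emitted for one word
    yield key, sum(values)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)

Whether the real script also wires up the launch_frozen command-line mode exercised in Example #7 is not shown here.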
Example #2
 def _run_hdfs(self, orig_fn):
     fn = "%f-%s" % (time.time(), orig_fn)
     file_path = "%s/%s" % (self.data_path, fn)
     hadoopy.put(orig_fn, file_path)
     cat_output = list(hadoopy.readtb(file_path))
     line = (331, "Title: Alice's Adventures in Wonderland")
     self.assertTrue(line in cat_output)
     ls_output = hadoopy.ls(self.data_path)
     self.assertTrue([x for x in ls_output if x.rsplit("/", 1)[-1] == fn])
     ls_output = hadoopy.ls(file_path)
     self.assertTrue(ls_output[0].rsplit("/", 1)[-1] == fn)
Example #4
def generate_digests(pinput, digest_command):
    """
    generate digests from the input
    :param pinput (string) the input path
    :param digest_command (string) tells if should execute the -text or -cat command
    :return a dict with pairs <file, digest>
    """
    digests = {}

    try:
        files = hadoopy.ls(pinput)
    except IOError:
        return digests

    if digest_command not in ("-cat", "-text"):
        raise ValueError("digest_command must be '-cat' or '-text'")

    for afile in files:
        # Stream the file through the hadoop CLI and hash its contents
        stdout, stderr = subprocess.Popen([get_hadoop_path() + "/bin/hadoop", "fs", digest_command, afile],
                                          stdout=subprocess.PIPE).communicate()

        m = hashlib.sha256()
        m.update(stdout)

        digests[afile] = m.hexdigest()

    return digests
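A usage sketch for the helper above, with hypothetical HDFS paths (it assumes get_hadoop_path resolves and the cluster is reachable, as the function itself does): comparing two directory trees by content digest.

src = generate_digests("/user/alice/input", "-cat")       # hypothetical path
dst = generate_digests("/user/alice/input_copy", "-cat")  # hypothetical path

# The file paths differ between the trees, so compare digest values only
if src and sorted(src.values()) == sorted(dst.values()):
    print("contents match")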
Example #5
def run_video_frame_classification(train_dir):
    try:
        neg_dir = train_dir + '/0'
        pos_dir = train_dir + '/1'
        while True:
            # Train using initial pos/neg
            c = vidfeat.SyntheticFrameFeature().train(vidfeat.load_label_frames(train_dir))
            # Predict on dataset
            hdfs_input = random.sample(hadoopy.ls('/user/brandyn/aladdin/mp4_devt/'), 96)
            start_time = '%f' % time.time()
            hdfs_output = '/user/brandyn/aladdin_results/video_grep/%s' % start_time
            picarus.vision.run_video_grep_frames(hdfs_input, hdfs_output, c)
            unsorted_dir = tempfile.mkdtemp()
            try:
                for _, y in hadoopy.readtb(hdfs_output):
                    # Write the frame bytes in binary mode and close the handle
                    with open('%s/%s.jpg' % (unsorted_dir, hashlib.sha1(y).hexdigest()), 'wb') as fp:
                        fp.write(y)
                # Present results to user and add to list
                try:
                    cmd = 'python -m interactive_learning.image_selector %s %s %s --port 8083' % (unsorted_dir, pos_dir, neg_dir)
                    print(cmd)
                    subprocess.call(cmd.split())
                except OSError:
                    pass
            finally:
                shutil.rmtree(unsorted_dir)
    finally:
        #shutil.rmtree(temp_root)
        pass
Example #6
def ls(pinput):
    """ list hdfs files """

    try:
        files = hadoopy.ls(pinput)
    except IOError:
        files = []

    return files
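A short usage sketch with hypothetical paths; unlike a bare hadoopy.ls call, this wrapper degrades to an empty list instead of raising IOError when the path cannot be listed.

for path in ls('/user/alice/output'):  # hypothetical path
    print(path)

print(ls('/does/not/exist'))  # prints [] rather than raising IOError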
Example #7
 def _run_wc(self, orig_fn, script_name="wc.py", launcher=hadoopy.launch_frozen, **kw):
     fn = "out-%f-%s" % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + ".out"
     print(os.path.abspath("."))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEqual(len(hadoopy.ls(in_path)), 1)
     self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     if not isinstance(launcher, str):
         launcher(
             in_path,
             out_path,
             script_name,
             jobconfs=["mapred.min.split.size=100000000", "mapreduce.task.userlog.limit.kb=1000"],
             **kw
         )
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == "launch_frozen_cmd":
         cmd = (
             'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"'
             % (script_name, in_path, out_path)
         )
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError("Launcher not recognized")
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc["the"], 1664)
     self.assertEqual(wc["Alice"], 221)
Example #8
 def _run_wc(self,
             orig_fn,
             script_name='wc.py',
             launcher=hadoopy.launch_frozen,
             **kw):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEqual(len(hadoopy.ls(in_path)), 1)
     #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     if not isinstance(launcher, str):
         launcher(in_path,
                  out_path + '_list_jobconfs',
                  script_name,
                  jobconfs=[
                      'mapred.min.split.size=100000000',
                      'mapreduce.task.userlog.limit.kb=1000'
                  ],
                  **kw)
         launcher(in_path,
                  out_path,
                  script_name,
                  jobconfs={
                      'mapred.min.split.size': '100000000',
                      'mapreduce.task.userlog.limit.kb': '1000'
                  },
                  **kw)
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == 'launch_frozen_cmd':
         cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (
             script_name, in_path, out_path)
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Example #9
def get_total_size(path, _format="%b"):
    """ get the total size of the path """

    size = 0
    if hadoopy.isdir(path):
        files = hadoopy.ls(path)
        for afile in files:
            size += int(hadoopy.stat(afile, _format))
    else:
        size = int(hadoopy.stat(path, _format))
    return size
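A usage sketch with a hypothetical path; since the default "%b" stat format reports sizes in bytes, the summed result can be converted directly.

total_bytes = get_total_size('/user/alice/logs')  # hypothetical path
print('%.1f MiB' % (total_bytes / float(1 << 20)))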
Example #10
 def list_directory(self, path=None):
     prefix = ''
     if path:
         prefix = '%s/' % path
     local_path, hdfs_path = self._init_path(path)
     exists = False
     try:
         for d in hadoopy.ls(hdfs_path):
             exists = True
             yield os.path.relpath(d, self._local_path)
     except Exception:
         pass
     if not exists:
         raise exceptions.FileNotFoundError('%s is not there' % path)
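A hedged usage sketch: store stands in for whatever object defines this method (its class and _init_path are not shown on this page), and the path is hypothetical. Because list_directory is a generator, the FileNotFoundError only surfaces once iteration begins.

try:
    for rel_path in store.list_directory('images'):  # store is hypothetical
        print(rel_path)
except exceptions.FileNotFoundError:
    print('no such directory')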
Example #11
def abspath(path):
    """Return the absolute path to a file and canonicalize it

    Path is returned without a trailing slash and without redundant slashes.
    Caches the user's home directory.

    :param path: A string for the path.  This should not have any wildcards.
    :returns: Absolute path to the file
    :raises IOError: If unsuccessful
    """
    global _USER_HOME_DIR
    # FIXME(brandyn): User's home directory must exist
    if path[0] == '/':
        return os.path.abspath(path)
    if _USER_HOME_DIR is None:
        try:
            _USER_HOME_DIR = hadoopy.ls('.')[0].rsplit('/', 1)[0]
        except IOError as e:
            if not exists('.'):
                raise IOError("Home directory doesn't exist")
            raise e
    # Resolve the relative path against the cached home directory
    return os.path.abspath('/'.join([_USER_HOME_DIR, path]))
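A small sketch of the two branches, with hypothetical paths: absolute inputs are only canonicalized, while relative inputs are resolved against the cached HDFS home directory.

print(abspath('/user/alice//data/'))  # redundant and trailing slashes removed: /user/alice/data
print(abspath('data/part-00000'))     # e.g. /user/alice/data/part-00000, depending on the home dir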
Example #12
def hdfs_to_vidjs():
    runs = list(hadoopy.ls('output/video_keyframes'))
    run = runs[-1]
    print(run)
    for (kind, hash), v in hadoopy.readtb(run):
        if kind == 'frame':
            frame = Image.open(StringIO.StringIO(v))
            frame.save('vid_t/%s.jpg' % hash)
        if kind == 'video':
            with open('videojs/%s.js' % hash, 'w') as f:
                json.dump(v, f)
        if kind == 'scores':
            try:
                pylab.figure(1)
                pylab.clf()
                pylab.plot(v)
                pylab.savefig('scores/%s.png' % hash)
            except Exception as e:
                print(e)
Example #14
def readtb(paths, ignore_logs=True):
    """Read typedbytes sequence files on HDFS (with optional compression).

    By default, ignores files whose names start with an underscore '_', as they
    are log files.  This allows you to cat a directory that may contain a variety
    of outputs from hadoop (e.g., _SUCCESS, _logs).  This works on directories
    and files.

    Args:
        paths: HDFS path (str) or paths (iterator)
        ignore_logs: If True, ignore all files whose names start with an
            underscore.  Defaults to True.

    Returns:
        An iterator of key, value pairs.

    Raises:
        IOError: An error occurred listing the directory (e.g., not available).
    """
    hstreaming = hadoopy._runner._find_hstreaming()
    if isinstance(paths, str):
        paths = [paths]
    for root_path in paths:
        all_paths = hadoopy.ls(root_path)
        if ignore_logs:
            # Ignore any files that start with an underscore
            keep_file = lambda x: os.path.basename(x)[0] != "_"
            all_paths = filter(keep_file, all_paths)
        for cur_path in all_paths:
            cmd = "hadoop jar %s dumptb %s" % (hstreaming, cur_path)
            read_fd, write_fd = os.pipe()
            write_fp = os.fdopen(write_fd, "w")
            p = hadoopy._hdfs._hadoop_fs_command(cmd, stdout=write_fp)
            write_fp.close()
            with hadoopy.TypedBytesFile(read_fd=read_fd) as tb_fp:
                for kv in tb_fp:
                    yield kv
            p.wait()
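A usage sketch with a hypothetical job-output directory; readtb streams (key, value) pairs lazily, and the underscore filter means _SUCCESS and _logs entries are skipped without extra handling.

for key, value in hadoopy.readtb('/user/alice/wc-output'):  # hypothetical path
    print(key, value)  # e.g. ('the', 1664) for the word-count tests above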
Example #16
def doSample(jarfile, inputs, output, k):
    for item in inputs:
        if item[-1] == "/":
            name = (item[:-1]).split('/')[-1]
        else:
            name = item.split('/')[-1]
        print "item", item 
        #tmp_dir = tmp_path + name + "/"
        if hadoopy.exists(item):
            continue
        hadoopy.mkdir(item)
        #tmp_inputs.append(tmp_dir)
        real_input = data_dir + name + "/"
        for f in hadoopy.ls(real_input):
            if not hadoopy.isdir(f):
                #ff = tmp_dir + f.split('/')[-1]
                if k > 0:
                    poolSample(f, item, k)
                else:
                    commonSample(f, item, ratio)
    '''if not hadoopy.exists(output):
        hadoopy.mkdir(output)
    if hadoopy.isdir(output):
        output = output[:-1]
    if output[-1] == '/':
        output = output[:-1]
    name = output.split('/')[-1]
    tmp_output = tmp_path + name + "/"'''
    #if not hpath.exists(tmp_output):
    #    hdfs.mkdir(tmp_output)
    codegen.executeJar(jarfile, inputs, output)
    #jobid = job.getJobIDFromLog(tmp_log_dir)
    job_para = job.getJobPara()
    '''for item in tmp_inputs:
        os.system("hadoop fs -rmr " + item)
    os.system("hadoop fs -rmr " + tmp_output)'''
    return job_para
Example #17
def _get_home_dir_old():
    # NOTE(brandyn): Not compatible with CDH4's ls
    return hadoopy.ls('.')[0].rsplit('/', 1)[0]
Example #19
 def test_ls(self):
     ls_output = hadoopy.ls(self.data_path)
     self.assertTrue(ls_output[0].endswith(self.file_path))
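Taken together, the examples above rely on a small contract, sketched here with a hypothetical path: hadoopy.ls returns a list of absolute HDFS paths (asserted in Example #1, with the CDH4 caveat noted in Example #8) and raises IOError when the path cannot be listed (which is what Examples #4, #6 and #11 catch).

import hadoopy

try:
    for path in hadoopy.ls('/user/alice/data'):  # hypothetical path
        print(path)  # absolute HDFS paths
except IOError:
    print('path is missing or HDFS is unreachable')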