Example 1
def writetb(path, kvs, java_mem_mb=256):
    """Write typedbytes sequence file to HDFS given an iterator of KeyValue pairs

    :param path: HDFS path (string)
    :param kvs: Iterator of (key, value)
    :param java_mem_mb: Integer of java heap size in MB (default 256)
    :raises: IOError: An error occurred while saving the data.
    """
    read_fd, write_fd = os.pipe()
    read_fp = os.fdopen(read_fd, 'r')
    hstreaming = _find_hstreaming()
    cmd = 'hadoop jar %s loadtb %s' % (hstreaming, path)
    p = _hadoop_fs_command(cmd, stdin=read_fp, java_mem_mb=java_mem_mb)
    read_fp.close()
    with hadoopy.TypedBytesFile(write_fd=write_fd) as tb_fp:
        for kv in kvs:
            if p.poll() is not None:
                raise IOError(
                    'writetb: Hadoop process quit while we were sending it data.  Hadoop output below...\nstdout\n%s\nstderr\n%s'
                    % p.communicate())
            tb_fp.write(kv)
        tb_fp.flush()
    p.wait()
    if p.returncode != 0:
        raise IOError(
            'writetb: Hadoop process returned [%d]. Hadoop output below...\nstderr\n%s'
            % (p.returncode, p.stderr.read()))
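
A minimal usage sketch for the function above (hypothetical HDFS path and data), assuming it is exposed as hadoopy.writetb as in the hadoopy library:

import hadoopy

# Any iterator of (key, value) pairs works, including a generator, since the pairs
# are streamed into the Hadoop 'loadtb' process as they are produced.
kvs = ((str(i), {'square': i * i}) for i in range(100))
hadoopy.writetb('/user/example/squares.tb', kvs, java_mem_mb=512)  # hypothetical path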
Example 2
def writetb(path, kvs, java_mem_mb=256):
    """Write typedbytes sequence file to HDFS given an iterator of KeyValue pairs

    :param path: HDFS path (string)
    :param kvs: Iterator of (key, value)
    :param java_mem_mb: Integer of java heap size in MB (default 256)
    :raises: IOError: An error occurred while saving the data.
    """
    read_fd, write_fd = os.pipe()
    read_fp = os.fdopen(read_fd, "r")
    hstreaming = _find_hstreaming()
    cmd = "hadoop jar %s loadtb %s" % (hstreaming, path)
    p = _hadoop_fs_command(cmd, stdin=read_fp, java_mem_mb=java_mem_mb)
    read_fp.close()
    with hadoopy.TypedBytesFile(write_fd=write_fd) as tb_fp:
        for kv in kvs:
            if p.poll() is not None:
                raise IOError(
                    "writetb: Hadoop process quit while we were sending it data.  Hadoop output below...\nstdout\n%s\nstderr\n%s"
                    % p.communicate()
                )
            tb_fp.write(kv)
        tb_fp.flush()
    p.wait()
    if p.returncode != 0:
        raise IOError(
            "writetb: Hadoop process returned [%d]. Hadoop output below...\nstderr\n%s"
            % (p.returncode, p.stderr.read())
        )
Example 3
def cat(path, procs=10):
    """Read typedbytes sequence files on HDFS (with optional compression).

    Args:
        path: A string (potentially with wildcards).
        procs: Number of processes to use.

    Returns:
        An iterator of key, value pairs.

    Raises:
        IOError: An error occurred listing the directory (e.g., not available).
    """
    max_files = 100
    hstreaming = _find_hstreaming()
    all_paths = ls(path)
    p = multiprocessing.Pool(min((procs, max_files, len(all_paths))))
    while all_paths:
        paths = all_paths[:max_files]
        del all_paths[:max_files]
        fps = [tempfile.NamedTemporaryFile() for x in paths]
        p.map(_hdfs_cat_tb, [(path, hstreaming, fp.name)
                             for path, fp in zip(paths, fps)])
        for y in fps:
            for x in hadoopy.TypedBytesFile(y.name, 'r'):
                yield x
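
A possible way to call cat, assuming it is importable from the module shown (the wildcard path is illustrative):

# cat returns an iterator of (key, value) pairs, reading up to `procs` files in parallel.
for key, value in cat('/user/example/output/part-*', procs=5):
    print(key, value)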
Example 4
def cat(path, ignore_logs=True, procs=10):
    """Read typedbytes sequence files on HDFS (with optional compression).

    By default, ignores files whose names start with an underscore '_', as they
    are log files.  This allows you to cat a directory that may contain a variety
    of outputs from hadoop (e.g., _SUCCESS, _logs).

    Args:
        path: A string (potentially with wildcards).
        procs: Number of processes to use.
        ignore_logs: If True, ignore all files whose names start with an
            underscore.  Defaults to True.

    Returns:
        An iterator of key, value pairs.

    Raises:
        IOError: An error occurred listing the directory (e.g., not available).
    """
    max_files = 100
    hstreaming = _find_hstreaming()
    all_paths = ls(path)
    if ignore_logs:
        # Ignore any files that start with an underscore
        keep_file = lambda x: os.path.basename(x)[0] != '_'
        all_paths = filter(keep_file, all_paths)
    p = multiprocessing.Pool(min((procs, max_files, len(all_paths))))
    while all_paths:
        paths = all_paths[:max_files]
        del all_paths[:max_files]
        fps = [tempfile.NamedTemporaryFile() for x in paths]
        p.map(_hdfs_cat_tb, [(path, hstreaming, fp.name)
                             for path, fp in zip(paths, fps)])
        for y in fps:
            for x in hadoopy.TypedBytesFile(y.name, 'r'):
                yield x
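
A sketch of the default behavior on a Hadoop job output directory (hypothetical path): with ignore_logs left at True, files such as _SUCCESS and the _logs directory are skipped and only the part-* outputs are read.

for key, value in cat('/user/example/job_output'):
    print(key, value)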
Example 5
def readtb(paths, num_procs=10, java_mem_mb=256, ignore_logs=True):
    """Read typedbytes sequence files on HDFS (with optional compression).

    By default, ignores files whose names start with an underscore '_', as they
    are log files.  This allows you to cat a directory that may contain a variety
    of outputs from hadoop (e.g., _SUCCESS, _logs).  This works on directories and
    files.  The KV pairs may be interleaved between files
    (they are read in parallel).

    :param paths: HDFS path (str) or paths (iterator)
    :param num_procs: Number of reading procs to open (default 10)
    :param java_mem_mb: Integer of java heap size in MB (default 256)
    :param ignore_logs: If True, ignore all files whose names start with an underscore.  Defaults to True.
    :returns: An iterator of key, value pairs.
    :raises: IOError: An error occurred reading the directory (e.g., not available).
    """
    import select
    hstreaming = _find_hstreaming()
    if isinstance(paths, (str, unicode)):
        paths = [paths]
    read_fds = set()
    procs = {}
    tb_fps = {}

    def _open_tb(cur_path):
        cmd = 'hadoop jar %s dumptb %s' % (hstreaming, cur_path)
        read_fd, write_fd = os.pipe()
        write_fp = os.fdopen(write_fd, 'w')
        p = _hadoop_fs_command(cmd, stdout=write_fp, java_mem_mb=java_mem_mb)
        write_fp.close()
        read_fds.add(read_fd)
        procs[read_fd] = p
        tb_fps[read_fd] = hadoopy.TypedBytesFile(read_fd=read_fd)

    def _path_gen():
        for root_path in paths:
            try:
                all_paths = ls(root_path)
            except IOError:
                raise IOError("No such file or directory: '%s'" % root_path)
            if ignore_logs:
                # Ignore any files that start with an underscore
                keep_file = lambda x: os.path.basename(x)[0] != '_'
                all_paths = filter(keep_file, all_paths)
            for cur_path in all_paths:
                yield _open_tb(cur_path)

    try:
        path_gen = _path_gen()
        for x in range(num_procs):
            try:
                path_gen.next()
            except (AttributeError, StopIteration):
                path_gen = None
        while read_fds:
            cur_fds = select.select(read_fds, [], [])[0]
            for read_fd in cur_fds:
                p = procs[read_fd]
                tp_fp = tb_fps[read_fd]
                try:
                    yield tp_fp.next()
                except StopIteration:
                    p.wait()
                    del procs[read_fd]
                    del tb_fps[read_fd]
                    del p
                    os.close(read_fd)
                    read_fds.remove(read_fd)
                    try:
                        path_gen.next()
                    except (AttributeError, StopIteration):
                        path_gen = None
    finally:
        # Cleanup outstanding procs
        for p in procs.values():
            p.kill()
            p.wait()
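
A minimal sketch of reading the data back, assuming the function is exposed as hadoopy.readtb (paths are hypothetical). Since files are read in parallel, pairs from different files may be interleaved.

import hadoopy

# A single path string or an iterable of paths is accepted.
paths = ['/user/example/squares.tb', '/user/example/job_output']
total = 0
for key, value in hadoopy.readtb(paths, num_procs=4):
    total += 1
print('read %d key/value pairs' % total)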
Example 6
def readtb(paths, num_procs=10, java_mem_mb=256, ignore_logs=True):
    """Read typedbytes sequence files on HDFS (with optional compression).

    By default, ignores files whose names start with an underscore '_', as they
    are log files.  This allows you to cat a directory that may contain a variety
    of outputs from hadoop (e.g., _SUCCESS, _logs).  This works on directories and
    files.  The KV pairs may be interleaved between files
    (they are read in parallel).

    :param paths: HDFS path (str) or paths (iterator)
    :param num_procs: Number of reading procs to open (default 10)
    :param java_mem_mb: Integer of java heap size in MB (default 256)
    :param ignore_logs: If True, ignore all files whose names start with an underscore.  Defaults to True.
    :returns: An iterator of key, value pairs.
    :raises: IOError: An error occurred reading the directory (e.g., not available).
    """
    import select

    hstreaming = _find_hstreaming()
    if isinstance(paths, (str, unicode)):
        paths = [paths]
    read_fds = set()
    procs = {}
    tb_fps = {}

    def _open_tb(cur_path):
        cmd = "hadoop jar %s dumptb %s" % (hstreaming, cur_path)
        read_fd, write_fd = os.pipe()
        write_fp = os.fdopen(write_fd, "w")
        p = _hadoop_fs_command(cmd, stdout=write_fp, java_mem_mb=java_mem_mb)
        write_fp.close()
        read_fds.add(read_fd)
        procs[read_fd] = p
        tb_fps[read_fd] = hadoopy.TypedBytesFile(read_fd=read_fd)

    def _path_gen():
        for root_path in paths:
            try:
                all_paths = ls(root_path)
            except IOError:
                raise IOError("No such file or directory: '%s'" % root_path)
            if ignore_logs:
                # Ignore any files that start with an underscore
                keep_file = lambda x: os.path.basename(x)[0] != "_"
                all_paths = filter(keep_file, all_paths)
            for cur_path in all_paths:
                yield _open_tb(cur_path)

    try:
        path_gen = _path_gen()
        for x in range(num_procs):
            try:
                path_gen.next()
            except (AttributeError, StopIteration):
                path_gen = None
        while read_fds:
            cur_fds = select.select(read_fds, [], [])[0]
            for read_fd in cur_fds:
                p = procs[read_fd]
                tp_fp = tb_fps[read_fd]
                try:
                    yield tp_fp.next()
                except StopIteration:
                    p.wait()
                    del procs[read_fd]
                    del tb_fps[read_fd]
                    del p
                    os.close(read_fd)
                    read_fds.remove(read_fd)
                    try:
                        path_gen.next()
                    except (AttributeError, StopIteration):
                        path_gen = None
    finally:
        # Cleanup outstanding procs
        for p in procs.values():
            p.kill()
            p.wait()
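
To tie the two functions together, a hedged round-trip sketch (hypothetical path and data, assuming hadoopy.writetb and hadoopy.readtb behave as in the snippets above):

import hadoopy

path = '/user/example/roundtrip.tb'  # hypothetical HDFS path
hadoopy.writetb(path, [('a', 1), ('b', 2)])
# Read the pairs back; within a single file the order in which they were written is preserved.
print(dict(hadoopy.readtb(path)))  # expected: {'a': 1, 'b': 2}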