def write_tb(path, fold=None, data_path='data'):
    """
    Copy all images in the specified fddb folds and put them on hdfs.
    If folds=None (default), then all images from all folds are copied.
    If the fddb dataset does not already exist in {data_path}/fddb,
    then that directory is created and the fddb is downloaded there.
    """
    fddb_path = '%s/fddb' % data_path
    # download fddb, if necessary
    if not os.path.isdir(fddb_path):
        download_data(data_path)
    if fold is None:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-??.txt'
    else:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-%02i.txt' % fold
    names = []
    for fn in glob.glob(folds_glob):
        with open(fn, 'r') as fp:
            names.extend([
                '%s/%s.jpg' % (fddb_path, l)
                for l in fp.read().strip().split('\n')
            ])
    # print message about filenames that do not exist
    for n in names:
        if not os.path.exists(n):
            print('"%s" does not exist!' % n)
    # remove those filenames from the list
    names = filter(os.path.exists, names)
    # write the images to tb files
    hadoopy.writetb(path, [(n, open(n, 'rb').read()) for n in names])
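
A hedged usage sketch (the HDFS path is illustrative, not from the original source): push one FDDB fold onto HDFS with write_tb and stream the images back with hadoopy.readtb.

# Assumed usage -- 'fddb_images' is an arbitrary HDFS path chosen for this example
write_tb('fddb_images', fold=1)
for name, jpeg_bytes in hadoopy.readtb('fddb_images'):
    print('%s: %d bytes' % (name, len(jpeg_bytes)))
    break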
Example #3
def write_tb(path, fold=None):
    fddb_path = '/home/morariu/downloads/fddb'
    if fold is None:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-??.txt'
    else:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-%02i.txt' % fold

    if hadoopy.exists(path):
        # do nothing if the file already exists
        pass
    else:
        # otherwise, find all images in the fddb folds and put them on HDFS
        names = []
        for fn in glob.glob(folds_glob):
            with open(fn, 'r') as fp:
                names.extend(['%s/%s.jpg' % (fddb_path, l) 
                              for l in fp.read().strip().split('\n')])
        # print message about filenames that do not exist
        for n in names:
            if not os.path.exists(n):
                print('"%s" does not exist!' % n)
        # remove those filenames from the list
        names = filter(os.path.exists, names)
        # write the images to tb files
        hadoopy.writetb(path, [(n, open(n, 'rb').read()) for n in names])
Example #4
def insert_vector_into_hdfs(hdfs_path, iterator):
    # Delete the file if it already exists
    if hadoopy.exists(hdfs_path):
        hadoopy.rmr("-skipTrash %s" % hdfs_path)
    # Write to HDFS.  If the NameNode is in safe mode, run
    # `hadoop dfsadmin -safemode leave` first to avoid the error
    # "Cannot create file/user/edge_list.tb. Name node is in safe mode."
    hadoopy.writetb(hdfs_path, iterator)
Example #5
def insert_data_into_hdfs():
    # Delete the file if it already exists
    if hadoopy.exists(tb_path):
        hadoopy.rmr("-skipTrash %s" % tb_path)
    # Write to HDFS.  If the NameNode is in safe mode, run
    # `hadoop dfsadmin -safemode leave` first to avoid the error
    # "Cannot create file/user/edge_list.tb. Name node is in safe mode."
    hadoopy.writetb(tb_path, get_kv_from_file(data_file_path))
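
get_kv_from_file is not shown in this example; a plausible stand-in (an assumption, not the original helper) reads a tab-separated file and yields one (key, value) pair per line.

def get_kv_from_file(path):
    # Assumed format: one "key<TAB>value" record per line
    with open(path) as fp:
        for line in fp:
            key, _, value = line.rstrip('\n').partition('\t')
            yield key, value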
Example #6
def custom_initialization():
    host = 'localhost'
    connection = happybase.Connection(host)
    wiki_table = connection.table('wiki')
    hdfs_path = 'wiki_index.tb'
    hadoopy.rmr("-skipTrash %s" % hdfs_path)  # Remove the existing file (cleanup)
    hadoopy.writetb(hdfs_path, wiki_table.scan(limit=1000))  # Write the wiki table into HDFS
def getGpsData(source, destination):
    gps_data = readHDFS(source)
    if not gps_data.empty:
        gps_data = uniformTimeFormat(gps_data)
        data_trans = gps_data.T.to_dict('list')
        tuples = [item for item in data_trans.iteritems()]
        hadoopy.writetb(destination, tuples)
Example #8
def write_texton_hadoop(dataset, classes):
    """Writes (image_name, image_label_points)

    image_name: A string
    image_label_points: List of (image, [(label, points), ...]) where points is Nx2 (y, x)
    """
    if not isinstance(classes, dict):
        classes = dict((y, x) for x, y in enumerate(classes))
    sample_points = 15000
    samples_per_class = {}

    def make_data():
        for image_num, (masks, image) in enumerate(dataset.segmentation_boxes()):
            ratio, image = resize(image)
            if image.shape[0] < radius * 2 + 1 or image.shape[1] < radius * 2 + 1:
                continue
            image = make_masks(image)
            image_size = float(image.shape[0] * image.shape[1])
            print(image.shape)
            print(image_num)
            label_points = []
            for class_name, mask in masks.items():
                mask = cv2.resize(mask.astype(np.uint8), (image.shape[1], image.shape[0]), interpolation=cv2.INTER_NEAREST)
                assert mask.shape == image.shape[:2]
                try:
                    class_num = classes[class_name]
                except KeyError:
                    continue
                ijs = np.dstack(mask.nonzero())[0]
                orig_ijs = ijs
                ijs = np.ascontiguousarray(random.sample(ijs, min(sample_points, len(ijs))))
                if not len(ijs):
                    continue
                print('Image[%d][%s][%d][%d][%f] has ijs available' % (image_num, class_name, len(ijs), len(orig_ijs), len(orig_ijs) / image_size))
                try:
                    samples_per_class[class_name] += len(ijs)
                except KeyError:
                    samples_per_class[class_name] = len(ijs)
                label_points.append((class_num, np.array(ijs, dtype=np.int32)))  # * ratio
            if not label_points:
                print('Image[%d] has no points available' % image_num)
                continue
            print(samples_per_class)
            yield str(image_num), (image, label_points)
    hdfs_file_cnt = 0
    hdfs_buf = []
    start_time = time.time()
    for x in make_data():
        print('spatial_queries/input/%s/%f/%d.tb.seq' % (dataset._name, start_time, hdfs_file_cnt))
        hdfs_buf.append(x)
        if len(hdfs_buf) >= 100:
            try:
                hadoopy.writetb('spatial_queries/input/%s/%f/%d.tb.seq' % (dataset._name, start_time, hdfs_file_cnt), hdfs_buf)
            except IOError as e:
                print('Got IOError, skipping')
                print(e)
            hdfs_file_cnt += 1
            hdfs_buf = []
Example #9
def setup_data(local_inputs, hdfs_input, images_per_file=2):
    cnt = 0
    out = []
    for x in load_data_iter(local_inputs):
        out.append(x)
        if len(out) > images_per_file:
            hadoopy.writetb(hdfs_input + '/%d' % cnt, out)
            cnt += 1
            out = []
    if out:
        hadoopy.writetb(hdfs_input + '/%d' % cnt, out)
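
load_data_iter is not defined in this snippet; a minimal stand-in (assumed, not the original) could yield (file name, raw bytes) pairs for each local input path.

import os

def load_data_iter(local_inputs):
    # local_inputs is assumed to be an iterable of local file paths
    for path in local_inputs:
        if os.path.isfile(path):
            yield os.path.basename(path), open(path, 'rb').read()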
Example #11
def test_readtb_writetb(self):
    working_path = "%s/readtb_writetb/" % (self.data_path)
    self.assertFalse(hadoopy.exists(working_path))
    self.assertFalse(hadoopy.isdir(working_path))
    self.assertFalse(hadoopy.isempty(working_path))
    for x in range(10):
        fn = "%s/%.5d" % (working_path, x)
        print(fn)
        data = [("1", 1), (1.3, np.array([1, 2, 3])), (True, {"1": 3})]
        hadoopy.writetb(fn, data)
    self.assertFalse(hadoopy.isdir(fn))
    self.assertFalse(hadoopy.isempty(fn))
    self.assertTrue(hadoopy.isdir(working_path))
    self.assertTrue(hadoopy.isempty(working_path))  # isempty returns true on directories
    self.assertEqual(self._readtb(readtb, working_path), self._readtb(hadoopy.readtb, working_path))
Example #12
def latency_test(launcher):
    output_path = '_hadoopy_bench/%f' % time.time()
    v = 'blah'

    kv = (v, {'client_time': time.time(),
              'value_len': len(v),
              'count': 0})
    hadoopy.writetb(output_path + '/input', [kv])
    launcher(output_path + '/input', output_path + '/output', 'time_job.py')
    v = hadoopy.readtb(output_path + '/output').next()[1]
    v['server_time'] = time.time()
    t0 = v['worker_time'] - v['client_time']
    t1 = v['server_time'] - v['worker_time']
    t2 = v['server_time'] - v['client_time']
    print((t0, t1, t2))
    hadoopy.rmr(output_path)
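
time_job.py is not listed with these benchmarks; a hypothetical sketch of a job that would satisfy them stamps each value with the worker's clock and bumps the counter.

# time_job.py (assumed sketch, not the original script)
import time
import hadoopy

def mapper(key, value):
    # value is the metadata dict written by the benchmark driver
    value['worker_time'] = time.time()
    value['count'] += 1
    yield key, value

if __name__ == '__main__':
    hadoopy.run(mapper)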
Example #13
def throughput_test(launcher):
    output_path = '_hadoopy_bench/%f' % time.time()
    v = 'blah'
    kv = (v, {'client_time': time.time(),
              'value_len': len(v),
              'count': 0})
    num_files = 3
    num_kvs = 10000000
    hadoopy.writetb(output_path + '/input/0', (kv for x in xrange(num_kvs)))
    for x in range(1, num_files):
        hadoopy.cp(output_path + '/input/0', output_path + '/input/%d' % x)
    hadoopy.freeze_script('time_job.py')  # Factor out Pyinstaller time
    st = time.time()
    launcher(output_path + '/input', output_path + '/output', 'time_job.py')
    print((num_kvs * num_files) / (time.time() - st))
    hadoopy.rmr(output_path)
Example #15
def run_local_kmeans(hdfs_input, hdfs_output, num_clusters, *args, **kw):
    import multiprocessing
    import Queue
    import time
    q = multiprocessing.Queue()
    data = np.asfarray([y for x, y in hadoopy.readtb(hdfs_input)])
    p = multiprocessing.Process(target=_run_local_kmeans, args=(data, q, num_clusters))
    p.start()
    while 1:
        try:
            clusters = q.get(timeout=10)
            break
        except Queue.Empty:
            print('Queue get failed')
            time.sleep(0)
    p.join()
    hadoopy.writetb(hdfs_output, enumerate(clusters))
    print('Done Writing to HDFS')
    print('Done joining')
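
_run_local_kmeans is not shown above; one possible stand-in (an assumption, using SciPy rather than anything confirmed by the source) clusters the rows and pushes the centroids onto the queue.

import numpy as np
from scipy.cluster.vq import kmeans

def _run_local_kmeans(data, q, num_clusters):
    # kmeans returns (codebook, distortion); only the centroids are needed here
    clusters, _ = kmeans(np.asfarray(data), num_clusters)
    q.put(clusters)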
def load_local(local_input, hdfs_output, output_format='kv', max_record_size=None, max_kv_per_file=None, **kw):
    """Read data, de-duplicate, and put on HDFS in the specified format

    Args:
        local_input: Local directory path
        hdfs_output: HDFS output path
        output_format: One of 'kv' or 'record'.  If 'kv' then output sequence
            files of the form (sha1_hash, binary_file_data).  If 'record'
            then output sequence files of the form (sha1_hash, metadata)
            where metadata has keys
            sha1: Sha1 hash
            extension: File extension without a period (blah.avi -> avi,
                blah.foo.avi -> avi, blah -> '')
            full_path: Local file path
            hdfs_path: HDFS path of the file (if any), the data should be the
                binary contents of the file stored at this location on HDFS.
            data: Binary file contents

            where only one of data or hdfs_path has to exist.
        max_record_size: If using 'record' and the filesize (in bytes) is larger
            than this, then store the contents of the file in a directory called
            '_blobs' inside output path with the name as the sha1 hash prefixed
            to the original file name (example, hdfs_output/blobs/sha1hash_origname).
            If None then there is no limit to the record size (default is None).
        max_kv_per_file: If not None then only put this number of kv pairs in each
            sequence file (default None).
    """
    fns = sorted([os.path.join(local_input, x) for x in os.listdir(local_input)])
    if output_format not in ('kv', 'record'):
        raise ValueError('Unsupported output_format [%s]' % output_format)
    out = []
    out_cnt = 0
    for x in _read_files(fns, set(), hdfs_output, output_format, max_record_size):
        out.append(x)
        if max_kv_per_file is not None and max_kv_per_file < len(out):
            hadoopy.writetb(hdfs_output + '/part-%.5d' % out_cnt, out)
            out_cnt += 1
            out = []
    if out:
        hadoopy.writetb(hdfs_output + '/part-%.5d' % out_cnt, out)
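
A hedged usage sketch of load_local (the paths and limits are illustrative only): mirror a local directory into HDFS as 'record' sequence files with at most 100 key/value pairs per part file.

load_local('/tmp/images', 'image_dataset', output_format='record',
           max_record_size=10 * 1024 * 1024, max_kv_per_file=100)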
Example #17
def launch_map_update(nodes, job_id, redis_host, jobconfs=None):
    jobconfs_base = {'mapred.map.tasks.speculative.execution': 'false',
                     'mapred.reduce.tasks.speculative.execution': 'false',
                     'mapred.task.timeout': '0'}
    if jobconfs:
        jobconfs_base.update(jobconfs)
    with hadoopy_helper.hdfs_temp() as input_path:
        for node in nodes:
            print(node)
            v = {'script_name': os.path.basename(node['script_path']),
                 'script_data': open(node['script_path']).read()}
            if 'cmdenvs' in node and node['cmdenvs'] is not None:
                v['cmdenvs'] = node['cmdenvs']
            if 'files' in node and node['files'] is not None:
                v['files'] = dict((os.path.basename(f), open(f).read()) for f in node['files'])
            cmdenvs = {'job_id': job_id,
                       'hadoopy_rt_redis': redis_host}
            if 'outputs' in node and node['outputs']:
                v['outputs'] = node['outputs']
            hadoopy.writetb('%s/input/%d' % (input_path, node['name']), [(node['name'], v)])
        hadoopy.launch(input_path + '/input', input_path + '/output_path_empty', _lf('hadoopy_rt_job.py'), cmdenvs=cmdenvs,
                       jobconfs=jobconfs_base)
Example #18
def flickr_images(tags,
                  images_per_tag,
                  hdfs_output,
                  num_files=20,
                  max_iters=1,
                  max_pages=1,
                  output_meta=False,
                  api_key=None,
                  api_secret=None,
                  remove_output=False):
    tags = list(tags)
    if api_key is None or api_secret is None:
        api_key = os.environ['FLICKR_API_KEY']
        api_secret = os.environ['FLICKR_API_SECRET']
    tags_per_chunk = max(len(tags) / num_files, 1)
    if remove_output and hadoopy.exists(hdfs_output):
        print('Removing output dir[%s]' % hdfs_output)
        hadoopy.rmr(hdfs_output)
    cmdenvs = {
        'FLICKR_API_KEY': api_key,
        'FLICKR_API_SECRET': api_secret,
        'MAX_ITERS': str(max_iters),
        'MAX_PAGES': str(max_pages)
    }
    for chunk_num, chunk_tags in enumerate(_chunks(tags, tags_per_chunk)):
        hadoopy.writetb(hdfs_output + '/tags/%d' % chunk_num,
                        [(images_per_tag, tag) for tag in chunk_tags])
    hadoopy.launch_frozen(hdfs_output + '/tags',
                          hdfs_output + '/metadata',
                          _lf('flickr_bulk.py'),
                          cmdenvs=cmdenvs,
                          num_reducers=num_files)
    output_type = 'meta' if output_meta else 'image'
    hadoopy.launch_frozen(hdfs_output + '/metadata',
                          hdfs_output + '/image_metadata',
                          _lf('file_downloader.py'),
                          cmdenvs={'OUTPUT_TYPE': output_type})
Example #19
def copy(source_path, destination_path):
    hadoopy.writetb(destination_path, read_tb(source_path))
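
read_tb is not defined here; a minimal version (assumed) would simply stream the key/value pairs out of the source path with hadoopy.readtb.

def read_tb(source_path):
    for key, value in hadoopy.readtb(source_path):
        yield key, value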
Example #20
import hadoopy
import pickle
import json
import sys

print "Loading pickle file..."
sys.stdout.flush()
data=pickle.load(open('guns_america.pickle','rb'))
print "Done."
sys.stdout.flush()

print "Writing readme...",
sys.stdout.flush()
hadoopy.writetb('/data/weapons/gunsamerica/README.txt', [('README','author: svebor karaman\[email protected]\n\n\
	This folder contains ads crawled by JPL from the gunsamerica.com website. \
	The goal is to use this data to train image classifiers using automatic extractions\
	as labels.\n\n\
	The file is composed of key value pairs where the key is a string like \'com/gunsamerica/www/C378FA4FB824B323B706222F826938AE660D7A1E322F501441F203BCB01239F5\' \
	extracted from the JPL imagecat id (discarding the first part \'file:/data2/USCWeaponsStatsGathering/nutch/full_dump/\') \
	and the value is a JSON with fields \'raw_html\',  \'dlimagecat_url\' and \'original_doc\'.')])
print "Done."
sys.stdout.flush()

print "Start writing whole seq file."
sys.stdout.flush()
hadoopy.writetb('/data/weapons/gunsamerica/gunsamerica.seq', [(key,json.dumps(data[key])) for key in data.keys()])
print "Done writing seq file."
sys.stdout.flush()
def _flush(out, part_num):
    hadoopy.writetb('%s/part-%.5d' % (path, part_num), out, **kw)
    return [], part_num + 1
Example #22
def test_cluster_info(self):
    hadoopy.writetb(self.data_path + 'cluster_info_input', [(0, 0)])
    hadoopy.launch_frozen(self.data_path + 'cluster_info_input',
                          self.data_path + 'cluster_info',
                          'cluster_info.py')
    pprint.pprint(dict(hadoopy.readtb(self.data_path + 'cluster_info')))
Example #23
def launch_local(in_name,
                 out_name,
                 script_path,
                 poll=None,
                 max_input=None,
                 files=(),
                 cmdenvs=(),
                 pipe=True,
                 python_cmd='python',
                 remove_tempdir=True,
                 identity_mapper=False,
                 num_reducers=None,
                 **kw):
    """A simple local emulation of hadoop

    This doesn't run hadoop and it doesn't support many advanced features, it
    is intended for simple debugging.  The input/output uses HDFS if an
    HDFS path is given. This allows for small tasks to be run locally
    (primarily while debugging). A temporary working directory is used and
    removed.

    Supported features:

    * Environment variables
    * Map-only tasks
    * Combiner
    * Files
    * Pipe (see below)
    * Display of stdout/stderr
    * Iterator of KV pairs as input or output (bypassing HDFS)

    :param in_name: Input path (string or list of strings) or Iterator of (key, value).  If it is an iterator then no input is taken from HDFS.
    :param out_name: Output path or None.  If None then output is not placed on HDFS, it is available through the 'output' key of the return value.
    :param script_path: Path to the script (e.g., script.py)
    :param poll: If not None, then a kv pair is only read from the input when poll() returns True.
    :param max_input: Maximum number of Mapper inputs, None (default) then unlimited.
    :param files: Extra files (other than the script) (iterator).  NOTE: Hadoop copies the files into working directory
    :param cmdenvs: Extra cmdenv parameters (iterator)
    :param pipe: If true (default) then call user code through a pipe to isolate it and stop bugs when printing to stdout.  See project docs.
    :param python_cmd: The python command to use. The default is "python".  Can be used to override the system default python, e.g. python_cmd = "python2.6"
    :param remove_tempdir: If True (default), then rmtree the temporary dir, else print its location.  Useful if you need to see temporary files or how input files are copied.
    :param identity_mapper: If True, use an identity mapper, regardless of what is in the script.
    :param num_reducers: If 0, don't run the reducer even if one exists, else obey what is in the script.
    :rtype: Dictionary with some of the following entries (depending on options)
    :returns: freeze_cmds: Freeze command(s) ran
    :returns: frozen_tar_path: HDFS path to frozen file
    :returns: hadoop_cmds: Hadoopy command(s) ran
    :returns: process: subprocess.Popen object
    :returns: output: Iterator of (key, value) pairs
    :raises: subprocess.CalledProcessError: Hadoop error.
    :raises: OSError: Hadoop streaming not found.
    :raises: TypeError: Input types are not correct.
    :raises: ValueError: Script not found
    """
    if isinstance(files, (str, unicode)) or isinstance(
            cmdenvs,
        (str, unicode)) or ('cmdenvs' in kw
                            and isinstance(kw['cmdenvs'], (str, unicode))):
        raise TypeError(
            'files and cmdenvs must be iterators of strings and not strings!')
    logging.info('Local[%s]' % script_path)
    script_info = hadoopy._runner._parse_info(script_path, python_cmd)
    if isinstance(in_name, (str, unicode)) or (in_name and isinstance(
            in_name, (list, tuple)) and isinstance(in_name[0],
                                                   (str, unicode))):
        in_kvs = hadoopy.readtb(in_name)
    else:
        in_kvs = in_name
    if 'reduce' in script_info['tasks'] and num_reducers != 0:
        if identity_mapper:
            kvs = in_kvs
        else:
            kvs = list(
                LocalTask(script_path, 'map', files, max_input, pipe,
                          python_cmd,
                          remove_tempdir).run_task(in_kvs, cmdenvs, poll))
        if 'combine' in script_info['tasks']:
            kvs = hadoopy.Test.sort_kv(kvs)
            kvs = list(
                LocalTask(script_path, 'combine', files, max_input, pipe,
                          python_cmd, remove_tempdir).run_task(kvs, cmdenvs))
        kvs = hadoopy.Test.sort_kv(kvs)
        kvs = LocalTask(script_path, 'reduce', files, max_input, pipe,
                        python_cmd, remove_tempdir).run_task(kvs, cmdenvs)
    else:
        if identity_mapper:
            kvs = in_kvs
        else:
            kvs = LocalTask(script_path, 'map', files, max_input, pipe,
                            python_cmd,
                            remove_tempdir).run_task(in_kvs, cmdenvs, poll)
    out = {}
    if out_name is not None:
        hadoopy.writetb(out_name, kvs)
        out['output'] = hadoopy.readtb(out_name)
    else:
        out['output'] = kvs
    return out
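
A hedged usage sketch of launch_local: feed an in-memory iterator of (key, value) pairs through a job script and collect the output without touching HDFS.  'score_job.py' is a placeholder name, not a script from these examples.

in_kvs = [('a', 1), ('b', 2)]
result = launch_local(in_kvs, None, 'score_job.py')
for key, value in result['output']:
    print((key, value))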
def main():
    local_path = '/home/hdusr/erika/data'
    hadoopy.writetb(hdfs_path, read_local_dir(local_path))
            prev_hashes.add(data_hash)
            yield data_hash, data


# Write videos
videos = ['youtube_action_dataset']  # 'videos',
for video_name in videos:
    picarus.io.load_local(os.path.join(local_root, video_name), '%s/video_record_%s' % (hdfs_root, video_name), output_format='record', max_record_size=723074, max_kv_per_file=5)

# Write unlabeled data (used for evaluation)
unlabeled = []  #
for unlabeled_name in unlabeled:
    fns = glob.glob('%s/%s/*' % (local_root, unlabeled_name))
    random.shuffle(fns)
    prev_hashes = set()
    hadoopy.writetb('%s/unlabeled_%s' % (hdfs_root, unlabeled_name), read_files(fns, prev_hashes))
    print('Unlabeled:[%s] Num[%d]' % (unlabeled_name, len(prev_hashes)))

quit()
# Write train/test
data_pairs = [('detected_faces', 'detected_nonfaces'), ('photos', 'nonphotos'), ('indoors', 'outdoors'), ('pr0n', 'nonpr0n'), ('objects', 'nonobjects')]
for pos_name, neg_name in data_pairs:
    pos_fns = glob.glob('%s/%s/*' % (local_root, pos_name))
    neg_fns = glob.glob('%s/%s/*' % (local_root, neg_name))
    random.shuffle(pos_fns)
    random.shuffle(neg_fns)
    num_train = int(min(len(neg_fns), len(pos_fns)) * pct_train)
    prev_hashes = set()
    # Pos
    hadoopy.writetb('%s/test_%s' % (hdfs_root, pos_name), read_files(pos_fns[num_train:], prev_hashes))
    print(len(prev_hashes))
    for i,v in enumerate(vect):
        yield str(i).encode('utf-8'),v

N = 64375

diff=1.

r0 = np.ones(N).astype(np.float)/N

if hadoopy.exists(input_path):
    hadoopy.rmr("-skipTrash %s"%input_path)
os.system('hdfs dfs -cp '+edge_path+' '+input_path)
    
if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s"%output_path)
hadoopy.writetb(output_path,read_vector(r0))

if hadoopy.exists(temp_path):
    hadoopy.rmr("-skipTrash %s"%temp_path)

iteration = 0
while diff>0.01:
    if hadoopy.exists(temp_path):
        hadoopy.rmr("-skipTrash %s"%temp_path)
    hadoopy.launch(input_path,temp_path,'PageRank.py',files=[])
    
    generator_vector = hadoopy.readtb(output_path)
    rk = {}
    for score in generator_vector:
        url = score[0]
        r = score[1]
def main():
    hadoopy.writetb(hdfs_path,read_hbase(table_wiki))
    if os.path.isfile(local_path):
        print "deleting "+local_path
        os.remove(local_path)
    os.system('hadoop fs -copyToLocal ' + hdfs_path + ' ' + local_path)
Example #31
import hadoopy
import os
import logging


input_path = '/data/corpus_data'
output_path = '/data/output'
local_path = '/app/opencorpora'

# Utilities
def read_local_dir(local_path):
  for fn in os.listdir(local_path):
    path = os.path.join(local_path, fn)
    if os.path.isfile(path):
      yield path, open(path).read()

# Cleanup and write input data
if hadoopy.exists(input_path):
  hadoopy.rmr(input_path)
if hadoopy.exists(output_path):
  hadoopy.rmr(output_path)
hadoopy.writetb(input_path, read_local_dir(local_path))

# Launch the job
hadoopy.launch_frozen(input_path, output_path, 'wc.py')

# Read the results
word_counts = dict(hadoopy.readtb(output_path))
for w3, tpl in word_counts.items():
  if tpl[1] > 4:
    print tpl[0][0], tpl[0][1], tpl[0][2], tpl[1], tpl[2], tpl[3]
Example #33
import hadoopy
import time

# Setup paths
data_path = 'hadoopy-test-data/%f/' % time.time()
input_path = data_path + 'wc-input'
output_path = data_path + 'wc-output'

# Write data to HDFS in the form of (term #, term)
input_data = enumerate(
    'Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industrys standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.'
    .split())
hadoopy.writetb(input_path, input_data)

# Launch the job
hadoopy.launch_frozen(input_path, output_path, 'wc.py')

# Read the results and check a few expected counts
word_counts = dict(hadoopy.readtb(output_path))
for probe_word, expected_count in [('the', 6), ('Lorem', 4), ('of', 4)]:
    print('word_counts[%s] = %d' % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]
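
wc.py itself is not listed; a minimal word-count job consistent with this driver (a sketch, not the original script) could look like the following.

# wc.py (assumed sketch)
import hadoopy

def mapper(key, value):
    # key: term number, value: the term itself
    yield value, 1

def reducer(key, values):
    yield key, sum(values)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)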
def main():
    with open('edge_list.txt') as f:
        hadoopy.writetb(tb_path,read_edge_wiki(f))
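
read_edge_wiki is not shown; a plausible stand-in (an assumption about the edge-list format) yields one (source, destination) pair per whitespace-separated line.

def read_edge_wiki(fp):
    for line in fp:
        parts = line.split()
        if len(parts) >= 2:
            yield parts[0], parts[1]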
Example #38
            yield str(image_num), (image, label_points)
    hdfs_file_cnt = 0
    hdfs_buf = []
    start_time = time.time()
    for x in make_data():
        print('spatial_queries/input/%s/%f/%d.tb.seq' % (dataset._name, start_time, hdfs_file_cnt))
        hdfs_buf.append(x)
        if len(hdfs_buf) >= 100:
            try:
                hadoopy.writetb('spatial_queries/input/%s/%f/%d.tb.seq' % (dataset._name, start_time, hdfs_file_cnt), hdfs_buf)
            except IOError as e:
                print('Got IOError, skipping')
                print(e)
            hdfs_file_cnt += 1
            hdfs_buf = []
    if hdfs_buf:
        hadoopy.writetb('spatial_queries/input/%s/%f/%d.tb.seq' % (dataset._name, start_time, hdfs_file_cnt), hdfs_buf)
    print('NumClasses[%d]' % len(classes))
    print('Classes: %r' % classes)

if __name__ == '__main__':
    dataset = vision_data.MSRC()
    classes = msrc_classes
    if 1:
        from data_sources import data_source_from_uri
        from sun397_dataset import SUN397
        uri = 'hbase://localhost:9090/images?image=data:image_320&gt=feat:masks_gt'
        dataset = SUN397(data_source_from_uri(uri))
        classes = json.load(open('classes.js'))
    write_texton_hadoop(dataset, classes)
def main():
    local_path = './file/'
    hadoopy.writetb(hdfs_path, read_local_dir(local_path))
Example #40
hbase_table = 'wiki'
hdfs_path = 'wiki.tb'

host = 'localhost'
connection = happybase.Connection(host)
wiki_table = connection.table(hbase_table)


def get_url_content_for_hdfs():
    for url, content in wiki_table.scan():
        v = content['cf:content'].encode('utf-8')
        yield url, v

if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s" % hdfs_path)  # Remove the existing file (cleanup)

hadoopy.writetb(hdfs_path, get_url_content_for_hdfs())  # Write the wiki table into HDFS

# Test OK (ATIH 2/12/2015)
url_content_dict = dict(hadoopy.readtb(hdfs_path))
for k, v in url_content_dict.iteritems():
    print 'k = ', k
    print 'v = ', v
    break

for k, v in hadoopy.readtb(hdfs_path):
    print 'k = ', k.encode('utf-8')
    print 'v = ', v.encode('utf-8')
    break
Example #41
try:
    import hadoopy_flow
except ImportError:
    raise ImportError('You need hadoopy_flow from https://github.com/bwhite/hadoopy_flow')
import hadoopy
import time

# Setup paths
data_path = 'hadoopy-test-data/%f/' % time.time()
input_path = data_path + 'input'
output_path_a = data_path + 'output_a'
output_path_b = data_path + 'output_b'
output_path_c = data_path + 'output_c'
output_path_d = data_path + 'output_d'

# Write data to HDFS in the form of (term #, term)
input_data = [(1, 5), ('dsfs', {'a': 3}), ([1, 2], 'sdflk')]  # Diverse KV input
hadoopy.writetb(input_path, input_data)

# Launch the jobs
hadoopy.launch_frozen(input_path, output_path_a, 'identity.py')
hadoopy.launch_frozen(input_path, output_path_b, 'identity.py')
hadoopy.launch_frozen(output_path_b, output_path_c, 'identity.py')
hadoopy.launch_frozen([input_path, output_path_a, output_path_b, output_path_c], output_path_d, 'identity.py')

# Read the first KV pair
print('KV Input[%s]' % str(hadoopy.readtb(input_path).next()))
print('KV Output a[%s]' % str(hadoopy.readtb(output_path_a).next()))
print('KV Output b[%s]' % str(hadoopy.readtb(output_path_b).next()))
print('KV Output c[%s]' % str(hadoopy.readtb(output_path_c).next()))
print('KV Output d[%s]' % str(hadoopy.readtb(output_path_d).next()))
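
identity.py is not listed; a minimal identity job for this demo (assumed sketch) just re-emits every key/value pair.

# identity.py (assumed sketch)
import hadoopy

def mapper(key, value):
    yield key, value

if __name__ == '__main__':
    hadoopy.run(mapper)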
Example #42
"""Make fake video frames

Usage:
python video_frame_generator.py <hdfs_out_path>
"""
import hadoopy
import random
import string
import sys


def rand_string(l):
    return ''.join(random.choice(string.letters) for i in xrange(l))

# Format: ('image_hash', {'image_data': 'longbinarydata', 'frame_num': 5, 'source_video': 'long_hash'})
def generate():
    def _inner():
        image_data = rand_string(100)
        frame_num = random.randint(0, 2**10)
        source_video = rand_string(32)
        return rand_string(32), locals()
    for x in range(50):
        yield _inner()

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print(__doc__)
        sys.exit(1)
    hadoopy.writetb(sys.argv[1], generate())
#input_path="hdfs://localhost:9000/alice.txt"
input_hdfs_path="hdfs://localhost:9000/user/user/simplewikiFromHbase"
output_hdfs_path='hdfs://localhost:9000/user/user/indexwikiFromSpark'

words_stop = [line.rstrip('\n') for line in open('../stop_words.txt')]
words_stop.append('')

sc=SparkContext()

lines = sc.sequenceFile(input_hdfs_path).map(lambda (x,y):(x[5:].decode('utf-8'),y[5:].decode('utf-8')))

splitText = lines.map(lambda (url,text):(url,[stem(word.group().lower()) for word in re.finditer(r"\w+",text,re.UNICODE) if word.group().lower() not in words_stop]))

tf = splitText.map(lambda (url,splittedText):(url,{word:1.0*splittedText.count(word)/len(splittedText) for word in splittedText}))

tfWordAsKey = tf.flatMap(lambda (url,tf):[(word,[(url,tf[word])]) for word in tf]).reduceByKey(lambda a,b:a+b)

tfidf = tfWordAsKey.map(lambda (word,tfList):(word,[(url,tf*np.log10(27474.0/len(tfList))) for (url,tf) in tfList]))

NwordsMax = 200000
def read_rdd(rdd):
    for key,data in rdd.takeSample(True,NwordsMax):
        yield key,data

if hadoopy.exists(output_hdfs_path):
    hadoopy.rmr("-skipTrash %s"%output_hdfs_path)

hadoopy.writetb(output_hdfs_path,read_rdd(tfidf))

