def write_tb(path, fold=None, data_path='data'):
    """Copy all images in the specified fddb folds and put them on hdfs.

    If fold=None (default), then all images from all folds are copied.
    If the fddb dataset does not already exist in {data_path}/fddb, then
    that directory is created and the fddb is downloaded there.
    """
    fddb_path = '%s/fddb' % data_path
    # download fddb, if necessary
    if not os.path.isdir(fddb_path):
        download_data(data_path)
    if fold is None:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-??.txt'
    else:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-%02i.txt' % fold
    names = []
    for fn in glob.glob(folds_glob):
        with open(fn, 'r') as fp:
            names.extend(['%s/%s.jpg' % (fddb_path, l)
                          for l in fp.read().strip().split('\n')])
    # print a message about filenames that do not exist
    for n in names:
        if not os.path.exists(n):
            print('"%s" does not exist!' % n)
    # remove those filenames from the list
    names = filter(os.path.exists, names)
    # write the images to tb files
    hadoopy.writetb(path, [(n, open(n, 'rb').read()) for n in names])
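For orientation, a minimal sketch of how the helper above might be invoked and spot-checked; the HDFS path 'fddb_images' is just an example, and it assumes a working Hadoop setup plus the download_data helper referenced in the function:

# Hypothetical usage of write_tb above; 'fddb_images' is an example HDFS path.
write_tb('fddb_images', fold=1, data_path='data')

# Spot-check: stream back the first (filename, jpeg_bytes) pair with hadoopy.readtb.
fn, jpeg_bytes = hadoopy.readtb('fddb_images').next()
print('%s: %d bytes' % (fn, len(jpeg_bytes)))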
def write_tb(path, fold=None):
    fddb_path = '/home/morariu/downloads/fddb'
    if fold is None:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-??.txt'
    else:
        folds_glob = fddb_path + '/FDDB-folds/FDDB-fold-%02i.txt' % fold
    if hadoopy.exists(path):
        # do nothing if the file already exists
        pass
    else:
        # otherwise, find all images in the fddb folds and put them on hdfs
        names = []
        for fn in glob.glob(folds_glob):
            with open(fn, 'r') as fp:
                names.extend(['%s/%s.jpg' % (fddb_path, l)
                              for l in fp.read().strip().split('\n')])
        # print message about filenames that do not exist
        for n in names:
            if not os.path.exists(n):
                print('"%s" does not exist!' % n)
        # remove those filenames from the list
        names = filter(os.path.exists, names)
        # write the images to tb files
        hadoopy.writetb(path, [(n, open(n, 'rb').read()) for n in names])
def insert_vector_into_hdfs(hdfs_path, iterator):
    # Delete the file if it already exists
    if hadoopy.exists(hdfs_path):
        hadoopy.rmr("-skipTrash %s" % hdfs_path)
    # Write to HDFS.  If the namenode reports "Cannot create file/user/edge_list.tb.
    # Name node is in safe mode.", run: hadoop dfsadmin -safemode leave
    hadoopy.writetb(hdfs_path, iterator)
def insert_data_into_hdfs():
    # Delete the file if it already exists
    if hadoopy.exists(tb_path):
        hadoopy.rmr("-skipTrash %s" % tb_path)
    # Write to HDFS.  If the namenode reports "Cannot create file/user/edge_list.tb.
    # Name node is in safe mode.", run: hadoop dfsadmin -safemode leave
    hadoopy.writetb(tb_path, get_kv_from_file(data_file_path))
def custom_initialization():
    host = 'localhost'
    connection = happybase.Connection(host)
    wiki_table = connection.table('wiki')
    hdfs_path = 'wiki_index.tb'
    hadoopy.rmr("-skipTrash %s" % (hdfs_path))  # Remove the file (cleanup)
    hadoopy.writetb(hdfs_path, wiki_table.scan(limit=1000))  # Write the wiki table into HDFS
def getGpsData(source, destination):
    gps_data = readHDFS(source)
    if not gps_data.empty:
        gps_data = uniformTimeFormat(gps_data)
        data_trans = gps_data.T.to_dict('list')
        tuples = [item for item in data_trans.iteritems()]
        hadoopy.writetb(destination, tuples)
def write_texton_hadoop(dataset, classes):
    """Writes (image_name, image_label_points)

    image_name: A string
    image_label_points: List of (image, [(label, points), ...]) where points is Nx2 (y, x)
    """
    if not isinstance(classes, dict):
        classes = dict((y, x) for x, y in enumerate(classes))
    sample_points = 15000
    samples_per_class = {}

    def make_data():
        for image_num, (masks, image) in enumerate(dataset.segmentation_boxes()):
            ratio, image = resize(image)
            if image.shape[0] < radius * 2 + 1 or image.shape[1] < radius * 2 + 1:
                continue
            image = make_masks(image)
            image_size = float(image.shape[0] * image.shape[1])
            print(image.shape)
            print(image_num)
            label_points = []
            for class_name, mask in masks.items():
                mask = cv2.resize(mask.astype(np.uint8), (image.shape[1], image.shape[0]),
                                  interpolation=cv2.INTER_NEAREST)
                assert mask.shape == image.shape[:2]
                try:
                    class_num = classes[class_name]
                except KeyError:
                    continue
                ijs = np.dstack(mask.nonzero())[0]
                orig_ijs = ijs
                ijs = np.ascontiguousarray(random.sample(ijs, min(sample_points, len(ijs))))
                if not len(ijs):
                    continue
                print('Image[%d][%s][%d][%d][%f] has ijs available' % (image_num, class_name, len(ijs),
                                                                       len(orig_ijs), len(orig_ijs) / image_size))
                try:
                    samples_per_class[class_name] += len(ijs)
                except KeyError:
                    samples_per_class[class_name] = len(ijs)
                label_points.append((class_num, np.array(ijs, dtype=np.int32)))  # * ratio
            if not label_points:
                print('Image[%d] has no points available' % image_num)
                continue
            print(samples_per_class)
            yield str(image_num), (image, label_points)
    hdfs_file_cnt = 0
    hdfs_buf = []
    start_time = time.time()
    for x in make_data():
        print('spatial_queries/input/%s/%f/%d.tb.seq' % (dataset._name, start_time, hdfs_file_cnt))
        hdfs_buf.append(x)
        if len(hdfs_buf) >= 100:
            try:
                hadoopy.writetb('spatial_queries/input/%s/%f/%d.tb.seq' % (dataset._name, start_time, hdfs_file_cnt),
                                hdfs_buf)
            except IOError, e:
                print('Got IOError, skipping')
                print(e)
            hdfs_file_cnt += 1
            hdfs_buf = []
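The docstring above pins down the key/value layout of the sequence files being written; a hypothetical sketch of reading one record back is shown here, where the path is illustrative only since the real one depends on dataset._name and the start_time timestamp:

# Illustrative read-back of one record written by write_texton_hadoop.
image_name, (image, label_points) = hadoopy.readtb('spatial_queries/input/msrc/1234.0/0.tb.seq').next()
for class_num, points in label_points:
    print('class %d: %d sampled (y, x) points' % (class_num, len(points)))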
def setup_data(local_inputs, hdfs_input, images_per_file=2):
    cnt = 0
    out = []
    for x in load_data_iter(local_inputs):
        out.append(x)
        if len(out) > images_per_file:
            hadoopy.writetb(hdfs_input + '/%d' % cnt, out)
            cnt += 1
            out = []
    if out:
        hadoopy.writetb(hdfs_input + '/%d' % cnt, out)
def test_readtb_writetb(self):
    working_path = "%s/readtb_writetb/" % (self.data_path)
    self.assertFalse(hadoopy.exists(working_path))
    self.assertFalse(hadoopy.isdir(working_path))
    self.assertFalse(hadoopy.isempty(working_path))
    for x in range(10):
        fn = "%s/%.5d" % (working_path, x)
        print(fn)
        data = [("1", 1), (1.3, np.array([1, 2, 3])), (True, {"1": 3})]
        hadoopy.writetb(fn, data)
        self.assertFalse(hadoopy.isdir(fn))
        self.assertFalse(hadoopy.isempty(fn))
    self.assertTrue(hadoopy.isdir(working_path))
    self.assertTrue(hadoopy.isempty(working_path))  # isempty returns true on directories
    self.assertEqual(self._readtb(readtb, working_path),
                     self._readtb(hadoopy.readtb, working_path))
def latency_test(launcher):
    output_path = '_hadoopy_bench/%f' % time.time()
    v = 'blah'
    kv = (v, {'client_time': time.time(), 'value_len': len(v), 'count': 0})
    hadoopy.writetb(output_path + '/input', [kv])
    launcher(output_path + '/input', output_path + '/output', 'time_job.py')
    v = hadoopy.readtb(output_path + '/output').next()[1]
    v['server_time'] = time.time()
    t0 = v['worker_time'] - v['client_time']
    t1 = v['server_time'] - v['worker_time']
    t2 = v['server_time'] - v['client_time']
    print((t0, t1, t2))
    hadoopy.rmr(output_path)
def throughput_test(launcher):
    output_path = '_hadoopy_bench/%f' % time.time()
    v = 'blah'
    kv = (v, {'client_time': time.time(), 'value_len': len(v), 'count': 0})
    num_files = 3
    num_kvs = 10000000
    hadoopy.writetb(output_path + '/input/0', (kv for x in xrange(num_kvs)))
    for x in range(1, num_files):
        hadoopy.cp(output_path + '/input/0', output_path + '/input/%d' % x)
    hadoopy.freeze_script('time_job.py')  # Factor out Pyinstaller time
    st = time.time()
    launcher(output_path + '/input', output_path + '/output', 'time_job.py')
    print((num_kvs * num_files) / (time.time() - st))
    hadoopy.rmr(output_path)
def run_local_kmeans(hdfs_input, hdfs_output, num_clusters, *args, **kw):
    import multiprocessing
    import Queue
    import time
    q = multiprocessing.Queue()
    data = np.asfarray([y for x, y in hadoopy.readtb(hdfs_input)])
    p = multiprocessing.Process(target=_run_local_kmeans, args=(data, q, num_clusters))
    p.start()
    while 1:
        try:
            clusters = q.get(timeout=10)
            break
        except Queue.Empty:
            print('Queue get failed')
            time.sleep(0)
    p.join()
    hadoopy.writetb(hdfs_output, enumerate(clusters))
    print('Done Writing to HDFS')
    print('Done joining')
def load_local(local_input, hdfs_output, output_format='kv', max_record_size=None, max_kv_per_file=None, **kw):
    """Read data, de-duplicate, and put on HDFS in the specified format

    Args:
        local_input: Local directory path
        hdfs_output: HDFS output path
        output_format: One of 'kv' or 'record'.  If 'kv' then output sequence files
            of the form (sha1_hash, binary_file_data).  If 'record' then output
            sequence files of the form (sha1_hash, metadata) where metadata has keys
            sha1: Sha1 hash
            extension: File extension without a period (blah.avi -> avi, blah.foo.avi -> avi, blah -> '')
            full_path: Local file path
            hdfs_path: HDFS path of the file (if any), the data should be the binary
                contents of the file stored at this location on HDFS.
            data: Binary file contents
            where only one of data or hdfs_path has to exist.
        max_record_size: If using 'record' and the filesize (in bytes) is larger than
            this, then store the contents of the file in a directory called '_blobs'
            inside the output path with the name as the sha1 hash prefixed to the
            original file name (example, hdfs_output/blobs/sha1hash_origname).  If
            None then there is no limit to the record size (default is None).
        max_kv_per_file: If not None then only put this number of kv pairs in each
            sequence file (default None).
    """
    fns = sorted([os.path.join(local_input, x) for x in os.listdir(local_input)])
    if output_format not in ('kv', 'record'):
        raise ValueError('Unsupported output_format [%s]' % output_format)
    out = []
    out_cnt = 0
    for x in _read_files(fns, set(), hdfs_output, output_format, max_record_size):
        out.append(x)
        if max_kv_per_file is not None and max_kv_per_file < len(out):
            hadoopy.writetb(hdfs_output + '/part-%.5d' % out_cnt, out)
            out_cnt += 1
            out = []
    if out:
        hadoopy.writetb(hdfs_output + '/part-%.5d' % out_cnt, out)
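A rough sketch of how the two output formats described in the docstring might be consumed afterwards; the paths are placeholders and the metadata field names follow the docstring above:

# 'kv' format: values are the raw file bytes keyed by sha1 hash.
for sha1_hash, file_data in hadoopy.readtb('hypothetical/output_kv'):
    print('%s: %d bytes' % (sha1_hash, len(file_data)))
    break

# 'record' format: values are metadata dicts; the bytes live either inline in
# 'data' or at an HDFS location given by 'hdfs_path', per the docstring.
for sha1_hash, metadata in hadoopy.readtb('hypothetical/output_record'):
    print('%s ext=%s inline=%s' % (sha1_hash, metadata['extension'], 'data' in metadata))
    break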
def launch_map_update(nodes, job_id, redis_host, jobconfs=None):
    jobconfs_base = {'mapred.map.tasks.speculative.execution': 'false',
                     'mapred.reduce.tasks.speculative.execution': 'false',
                     'mapred.task.timeout': '0'}
    if jobconfs:
        jobconfs_base.update(jobconfs)
    with hadoopy_helper.hdfs_temp() as input_path:
        for node in nodes:
            print(node)
            v = {'script_name': os.path.basename(node['script_path']),
                 'script_data': open(node['script_path']).read()}
            if 'cmdenvs' in node and node['cmdenvs'] is not None:
                v['cmdenvs'] = node['cmdenvs']
            if 'files' in node and node['files'] is not None:
                v['files'] = dict((os.path.basename(f), open(f).read()) for f in node['files'])
            cmdenvs = {'job_id': job_id, 'hadoopy_rt_redis': redis_host}
            if 'outputs' in node and node['outputs']:
                v['outputs'] = node['outputs']
            hadoopy.writetb('%s/input/%d' % (input_path, node['name']), [(node['name'], v)])
        hadoopy.launch(input_path + '/input', input_path + '/output_path_empty', _lf('hadoopy_rt_job.py'),
                       cmdenvs=cmdenvs, jobconfs=jobconfs_base)
def flickr_images(tags, images_per_tag, hdfs_output, num_files=20, max_iters=1, max_pages=1,
                  output_meta=False, api_key=None, api_secret=None, remove_output=False):
    tags = list(tags)
    if api_key is None or api_secret is None:
        api_key = os.environ['FLICKR_API_KEY']
        api_secret = os.environ['FLICKR_API_SECRET']
    tags_per_chunk = max(len(tags) / num_files, 1)
    if remove_output and hadoopy.exists(hdfs_output):
        print('Removing output dir[%s]' % hdfs_output)
        hadoopy.rmr(hdfs_output)
    cmdenvs = {'FLICKR_API_KEY': api_key,
               'FLICKR_API_SECRET': api_secret,
               'MAX_ITERS': str(max_iters),
               'MAX_PAGES': str(max_pages)}
    for chunk_num, chunk_tags in enumerate(_chunks(tags, tags_per_chunk)):
        hadoopy.writetb(hdfs_output + '/tags/%d' % chunk_num, [(images_per_tag, tag) for tag in chunk_tags])
    hadoopy.launch_frozen(hdfs_output + '/tags', hdfs_output + '/metadata', _lf('flickr_bulk.py'),
                          cmdenvs=cmdenvs, num_reducers=num_files)
    output_type = 'meta' if output_meta else 'image'
    hadoopy.launch_frozen(hdfs_output + '/metadata', hdfs_output + '/image_metadata',
                          _lf('file_downloader.py'), cmdenvs={'OUTPUT_TYPE': output_type})
def copy(source_path, destination_path):
    hadoopy.writetb(destination_path, read_tb(source_path))
import hadoopy
import pickle
import json
import sys

print "Loading pickle file..."
sys.stdout.flush()
data = pickle.load(open('guns_america.pickle', 'rb'))
print "Done."
sys.stdout.flush()

print "Writing readme...",
sys.stdout.flush()
hadoopy.writetb('/data/weapons/gunsamerica/README.txt', [('README',
    'author: svebor karaman\[email protected]\n\n\
This folder contains ads crawled by JPL from the gunsamerica.com website. \
The goal is to use this data to train image classifiers using automatic extractions\
as labels.\n\n\
The file is composed of key value pairs where the key is a string like \'com/gunsamerica/www/C378FA4FB824B323B706222F826938AE660D7A1E322F501441F203BCB01239F5\' \
extracted from the JPL imagecat id (discarding the first part \'file:/data2/USCWeaponsStatsGathering/nutch/full_dump/\') \
and the value is a JSON with fields \'raw_html\', \'dlimagecat_url\' and \'original_doc\'.')])
print "Done."
sys.stdout.flush()

print "Start writing whole seq file."
sys.stdout.flush()
hadoopy.writetb('/data/weapons/gunsamerica/gunsamerica.seq',
                [(key, json.dumps(data[key])) for key in data.keys()])
print "Done writing seq file."
sys.stdout.flush()
def _flush(out, part_num):
    hadoopy.writetb('%s/part-%.5d' % (path, part_num), out, **kw)
    return [], part_num + 1
def test_cluster_info(self):
    hadoopy.writetb(self.data_path + 'cluster_info_input', [(0, 0)])
    hadoopy.launch_frozen(self.data_path + 'cluster_info_input', self.data_path + 'cluster_info',
                          'cluster_info.py')
    pprint.pprint(dict(hadoopy.readtb(self.data_path + 'cluster_info')))
def launch_local(in_name, out_name, script_path, poll=None, max_input=None, files=(), cmdenvs=(),
                 pipe=True, python_cmd='python', remove_tempdir=True, identity_mapper=False,
                 num_reducers=None, **kw):
    """A simple local emulation of hadoop

    This doesn't run hadoop and it doesn't support many advanced features, it
    is intended for simple debugging.  The input/output uses HDFS if an HDFS
    path is given.  This allows for small tasks to be run locally (primarily
    while debugging).  A temporary working directory is used and removed.

    Support
    * Environmental variables
    * Map-only tasks
    * Combiner
    * Files
    * Pipe (see below)
    * Display of stdout/stderr
    * Iterator of KV pairs as input or output (bypassing HDFS)

    :param in_name: Input path (string or list of strings) or Iterator of (key, value).  If it is an iterator then no input is taken from HDFS.
    :param out_name: Output path or None.  If None then output is not placed on HDFS, it is available through the 'output' key of the return value.
    :param script_path: Path to the script (e.g., script.py)
    :param poll: If not None, then only attempt to get a kv pair from kvs if when called, poll returns True.
    :param max_input: Maximum number of Mapper inputs, None (default) then unlimited.
    :param files: Extra files (other than the script) (iterator).  NOTE: Hadoop copies the files into working directory
    :param cmdenvs: Extra cmdenv parameters (iterator)
    :param pipe: If true (default) then call user code through a pipe to isolate it and stop bugs when printing to stdout.  See project docs.
    :param python_cmd: The python command to use.  The default is "python".  Can be used to override the system default python, e.g. python_cmd = "python2.6"
    :param remove_tempdir: If True (default), then rmtree the temporary dir, else print its location.  Useful if you need to see temporary files or how input files are copied.
    :param identity_mapper: If True, use an identity mapper, regardless of what is in the script.
    :param num_reducers: If 0, don't run the reducer even if one exists, else obey what is in the script.
    :rtype: Dictionary with some of the following entries (depending on options)
    :returns: freeze_cmds: Freeze command(s) ran
    :returns: frozen_tar_path: HDFS path to frozen file
    :returns: hadoop_cmds: Hadoopy command(s) ran
    :returns: process: subprocess.Popen object
    :returns: output: Iterator of (key, value) pairs
    :raises: subprocess.CalledProcessError: Hadoop error.
    :raises: OSError: Hadoop streaming not found.
    :raises: TypeError: Input types are not correct.
    :raises: ValueError: Script not found
    """
    if isinstance(files, (str, unicode)) or isinstance(cmdenvs, (str, unicode)) or ('cmdenvs' in kw and isinstance(kw['cmdenvs'], (str, unicode))):
        raise TypeError('files and cmdenvs must be iterators of strings and not strings!')
    logging.info('Local[%s]' % script_path)
    script_info = hadoopy._runner._parse_info(script_path, python_cmd)
    if isinstance(in_name, (str, unicode)) or (in_name and isinstance(in_name, (list, tuple)) and isinstance(in_name[0], (str, unicode))):
        in_kvs = hadoopy.readtb(in_name)
    else:
        in_kvs = in_name
    if 'reduce' in script_info['tasks'] and num_reducers != 0:
        if identity_mapper:
            kvs = in_kvs
        else:
            kvs = list(LocalTask(script_path, 'map', files, max_input, pipe,
                                 python_cmd, remove_tempdir).run_task(in_kvs, cmdenvs, poll))
        if 'combine' in script_info['tasks']:
            kvs = hadoopy.Test.sort_kv(kvs)
            kvs = list(LocalTask(script_path, 'combine', files, max_input, pipe,
                                 python_cmd, remove_tempdir).run_task(kvs, cmdenvs))
        kvs = hadoopy.Test.sort_kv(kvs)
        kvs = LocalTask(script_path, 'reduce', files, max_input, pipe,
                        python_cmd, remove_tempdir).run_task(kvs, cmdenvs)
    else:
        if identity_mapper:
            kvs = in_kvs
        else:
            kvs = LocalTask(script_path, 'map', files, max_input, pipe,
                            python_cmd, remove_tempdir).run_task(in_kvs, cmdenvs, poll)
    out = {}
    if out_name is not None:
        hadoopy.writetb(out_name, kvs)
        out['output'] = hadoopy.readtb(out_name)
    else:
        out['output'] = kvs
    return out
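Per the docstring, in_name may be a plain iterator of (key, value) pairs and out_name may be None, in which case nothing touches HDFS and the results come back through the 'output' key; a minimal sketch, where the 'wc.py' script name is an assumption mirroring the word-count examples below:

# Run a script locally against in-memory key/value pairs, bypassing HDFS entirely.
in_kvs = enumerate(['a b', 'b c', 'a'])
result = launch_local(in_kvs, None, 'wc.py')
for word, count in result['output']:
    print('%s: %s' % (word, count))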
def main():
    local_path = '/home/hdusr/erika/data'
    hadoopy.writetb(hdfs_path, read_local_dir(local_path))
        prev_hashes.add(data_hash)
        yield data_hash, data

# Write videos
videos = ['youtube_action_dataset']  # 'videos',
for video_name in videos:
    picarus.io.load_local(os.path.join(local_root, video_name),
                          '%s/video_record_%s' % (hdfs_root, video_name),
                          output_format='record', max_record_size=723074, max_kv_per_file=5)

# Write unlabeled data (used for evaluation)
unlabeled = []  #
for unlabeled_name in unlabeled:
    fns = glob.glob('%s/%s/*' % (local_root, unlabeled_name))
    random.shuffle(fns)
    prev_hashes = set()
    hadoopy.writetb('%s/unlabeled_%s' % (hdfs_root, unlabeled_name), read_files(fns, prev_hashes))
    print('Unlabeled:[%s] Num[%d]' % (unlabeled_name, len(prev_hashes)))
quit()

# Write train/test
data_pairs = [('detected_faces', 'detected_nonfaces'), ('photos', 'nonphotos'),
              ('indoors', 'outdoors'), ('pr0n', 'nonpr0n'), ('objects', 'nonobjects')]
for pos_name, neg_name in data_pairs:
    pos_fns = glob.glob('%s/%s/*' % (local_root, pos_name))
    neg_fns = glob.glob('%s/%s/*' % (local_root, neg_name))
    random.shuffle(pos_fns)
    random.shuffle(neg_fns)
    num_train = int(min(len(neg_fns), len(pos_fns)) * pct_train)
    prev_hashes = set()
    # Pos
    hadoopy.writetb('%s/test_%s' % (hdfs_root, pos_name), read_files(pos_fns[num_train:], prev_hashes))
    print(len(prev_hashes))
    for i, v in enumerate(vect):
        yield str(i).encode('utf-8'), v

N = 64375
diff = 1.
r0 = np.ones(N).astype(np.float) / N

if hadoopy.exists(input_path):
    hadoopy.rmr("-skipTrash %s" % input_path)
os.system('hdfs dfs -cp ' + edge_path + ' ' + input_path)

if hadoopy.exists(output_path):
    hadoopy.rmr("-skipTrash %s" % output_path)
hadoopy.writetb(output_path, read_vector(r0))

if hadoopy.exists(temp_path):
    hadoopy.rmr("-skipTrash %s" % temp_path)

iteration = 0
while diff > 0.01:
    if hadoopy.exists(temp_path):
        hadoopy.rmr("-skipTrash %s" % temp_path)
    hadoopy.launch(input_path, temp_path, 'PageRank.py', files=[])
    generator_vector = hadoopy.readtb(output_path)
    rk = {}
    for score in generator_vector:
        url = score[0]
        r = score[1]
def main():
    hadoopy.writetb(hdfs_path, read_hbase(table_wiki))
    if os.path.isfile(local_path):
        print "deleting " + local_path
        os.remove(local_path)
    os.system('hadoop fs -copyToLocal ' + hdfs_path + ' ' + local_path)
import hadoopy
import os
import logging

input_path = '/data/corpus_data'
output_path = '/data/output'
local_path = '/app/opencorpora'


# Utilities
def read_local_dir(local_path):
    for fn in os.listdir(local_path):
        path = os.path.join(local_path, fn)
        if os.path.isfile(path):
            yield path, open(path).read()

# Cleanup and write input data
if hadoopy.exists(input_path):
    hadoopy.rmr(input_path)
if hadoopy.exists(output_path):
    hadoopy.rmr(output_path)
hadoopy.writetb(input_path, read_local_dir(local_path))

# Launch the job
hadoopy.launch_frozen(input_path, output_path, 'wc.py')

# Read the output counts and print the frequent entries
word_counts = dict(hadoopy.readtb(output_path))
for w3, tpl in word_counts.items():
    if tpl[1] > 4:
        print tpl[0][0], tpl[0][1], tpl[0][2], tpl[1], tpl[2], tpl[3]
        print('spatial_queries/input/%s/%f/%d.tb.seq' % (dataset._name, start_time, hdfs_file_cnt))
        hdfs_buf.append(x)
        if len(hdfs_buf) >= 100:
            try:
                hadoopy.writetb('spatial_queries/input/%s/%f/%d.tb.seq' % (dataset._name, start_time, hdfs_file_cnt),
                                hdfs_buf)
            except IOError, e:
                print('Got IOError, skipping')
                print(e)
            hdfs_file_cnt += 1
            hdfs_buf = []
    if hdfs_buf:
        hadoopy.writetb('spatial_queries/input/%s/%f/%d.tb.seq' % (dataset._name, start_time, hdfs_file_cnt),
                        hdfs_buf)
    print('NumClasses[%d]' % len(classes))
    print('Classes: %r' % classes)


if __name__ == '__main__':
    dataset = vision_data.MSRC()
    classes = msrc_classes
    if 1:
        from data_sources import data_source_from_uri
        from sun397_dataset import SUN397
        uri = 'hbase://localhost:9090/images?image=data:image_320>=feat:masks_gt'
        dataset = SUN397(data_source_from_uri(uri))
        classes = json.load(open('classes.js'))
    write_texton_hadoop(dataset, classes)
import hadoopy
import time

# Setup paths
data_path = 'hadoopy-test-data/%f/' % time.time()
input_path = data_path + 'wc-input'
output_path = data_path + 'wc-output'

# Write data to HDFS in the form of (term #, term)
input_data = enumerate('Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industrys standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.'.split())
hadoopy.writetb(input_path, input_data)

# Launch the job
hadoopy.launch_frozen(input_path, output_path, 'wc.py')

# Read the output word counts and check a few probe words
word_counts = dict(hadoopy.readtb(output_path))
for probe_word, expected_count in [('the', 6), ('Lorem', 4), ('of', 4)]:
    print('word_counts[%s] = %d' % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]
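The example above launches a 'wc.py' script that is not included in this listing; a minimal sketch of such a script, using hadoopy's standard mapper/reducer entry point, might look like the following (a plausible reconstruction, not the exact script used above):

"""wc.py: word count over (term #, term) input."""
import hadoopy


def mapper(key, value):
    # key: input key (ignored), value: a chunk of text
    for word in value.split():
        yield word, 1


def reducer(word, counts):
    # counts: iterator of partial counts for this word
    yield word, sum(counts)

if __name__ == '__main__':
    hadoopy.run(mapper, reducer)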
def main():
    with open('edge_list.txt') as f:
        hadoopy.writetb(tb_path, read_edge_wiki(f))
def main():
    local_path = './file/'
    hadoopy.writetb(hdfs_path, read_local_dir(local_path))
hbase_table = 'wiki'
hdfs_path = 'wiki.tb'
host = 'localhost'
connection = happybase.Connection(host)
wiki_table = connection.table(hbase_table)


def get_url_content_for_hdfs():
    for url, content in wiki_table.scan():
        v = content['cf:content'].encode('utf-8')
        yield url, v

if hadoopy.exists(hdfs_path):
    hadoopy.rmr("-skipTrash %s" % (hdfs_path))  # Remove the file (cleanup)

hadoopy.writetb(hdfs_path, get_url_content_for_hdfs())  # Write the wiki table into HDFS

# Test OK (ATIH 2/12/2015)
url_content_dict = dict(hadoopy.readtb(hdfs_path))
for k, v in url_content_dict.iteritems():
    print 'k = ', k
    print 'v = ', v
    break

for k, v in hadoopy.readtb(hdfs_path):
    print 'k = ', k.encode('utf-8')
    print 'v = ', v.encode('utf-8')
    break
try:
    import hadoopy_flow
except ImportError:
    raise ImportError('You need hadoopy_flow from https://github.com/bwhite/hadoopy_flow')
import hadoopy
import time

# Setup paths
data_path = 'hadoopy-test-data/%f/' % time.time()
input_path = data_path + 'input'
output_path_a = data_path + 'output_a'
output_path_b = data_path + 'output_b'
output_path_c = data_path + 'output_c'
output_path_d = data_path + 'output_d'

# Write diverse KV data to HDFS
input_data = [(1, 5), ('dsfs', {'a': 3}), ([1, 2], 'sdflk')]  # Diverse KV input
hadoopy.writetb(input_path, input_data)

# Launch the jobs
hadoopy.launch_frozen(input_path, output_path_a, 'identity.py')
hadoopy.launch_frozen(input_path, output_path_b, 'identity.py')
hadoopy.launch_frozen(output_path_b, output_path_c, 'identity.py')
hadoopy.launch_frozen([input_path, output_path_a, output_path_b, output_path_c], output_path_d, 'identity.py')

# Read the first KV pair from each output
print('KV Input[%s]' % str(hadoopy.readtb(input_path).next()))
print('KV Output a[%s]' % str(hadoopy.readtb(output_path_a).next()))
print('KV Output b[%s]' % str(hadoopy.readtb(output_path_b).next()))
print('KV Output c[%s]' % str(hadoopy.readtb(output_path_c).next()))
print('KV Output d[%s]' % str(hadoopy.readtb(output_path_d).next()))
"""Make fake video frames Usage: python video_frame_generator.py <hdfs_out_path> """ import hadoopy import random import string import sys def rand_string(l): return ''.join(random.choice(string.letters) for i in xrange(l)) # Format: ('image_hash', {'image_data': 'longbinarydata', 'frame_num': 5, 'source_video': 'long_hash'}) def generate(): def _inner(): image_data = rand_string(100) frame_num = random.randint(0, 2**10) source_video = rand_string(32) return rand_string(32), locals() for x in range(50): yield _inner() if __name__ == '__main__': if len(sys.argv) != 2: print(__doc__) sys.exit(1) hadoopy.writetb(sys.argv[1], generate())
#input_path="hdfs://localhost:9000/alice.txt" input_hdfs_path="hdfs://localhost:9000/user/user/simplewikiFromHbase" output_hdfs_path='hdfs://localhost:9000/user/user/indexwikiFromSpark' words_stop = [line.rstrip('\n') for line in open('../stop_words.txt')] words_stop.append('') sc=SparkContext() lines = sc.sequenceFile(input_hdfs_path).map(lambda (x,y):(x[5:].decode('utf-8'),y[5:].decode('utf-8'))) splitText = lines.map(lambda (url,text):(url,[stem(word.group().lower()) for word in re.finditer(r"\w+",text,re.UNICODE) if word.group().lower() not in words_stop])) tf = splitText.map(lambda (url,splittedText):(url,{word:1.0*splittedText.count(word)/len(splittedText) for word in splittedText})) tfWordAsKey = tf.flatMap(lambda (url,tf):[(word,[(url,tf[word])]) for word in tf]).reduceByKey(lambda a,b:a+b) tfidf = tfWordAsKey.map(lambda (word,tfList):(word,[(url,tf*np.log10(27474.0/len(tfList))) for (url,tf) in tfList])) NwordsMax = 200000 def read_rdd(rdd): for key,data in rdd.takeSample(True,NwordsMax): yield key,data if hadoopy.exists(output_hdfs_path): hadoopy.rmr("-skipTrash %s"%output_hdfs_path) hadoopy.writetb(output_hdfs_path,read_rdd(tfidf))