def run_classifier_labels(hdfs_input_pos, hdfs_input_neg, hdfs_output, classifier_name,
                          classifier_extra, local_labels, classifier, **kw):
    """
    TODO Finish docstring

    Args:
        hdfs_output: Path to hdfs temporary output or None if execution should be
            performed locally using hadoopy.launch_local.
    """
    labels = {}
    try:
        labels = file_parse.load(local_labels)
    except IOError:
        pass
    if hdfs_output is None:
        j = hadoopy.launch_local(hdfs_input_pos, None, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in j['output']), [])
        j = hadoopy.launch_local(hdfs_input_neg, None, _lf('collect_keys.py'))
        neg_keys = sum((x[1] for x in j['output']), [])
    else:
        hdfs_output_pos = hdfs_output + '/pos'
        hdfs_output_neg = hdfs_output + '/neg'
        picarus._launch_frozen(hdfs_input_pos, hdfs_output_pos, _lf('collect_keys.py'))
        picarus._launch_frozen(hdfs_input_neg, hdfs_output_neg, _lf('collect_keys.py'))
        pos_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_pos)), [])
        neg_keys = sum((x[1] for x in hadoopy.readtb(hdfs_output_neg)), [])
    labels[classifier_name] = {'labels': {'1': pos_keys, '-1': neg_keys},
                               'classifier': classifier,
                               'classifier_extra': classifier_extra}
    file_parse.dump(labels, local_labels)
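A hypothetical call might look like the sketch below; the HDFS paths, classifier name, and label file are made-up placeholders, and the helpers the function relies on (file_parse, picarus, _lf) come from its surrounding module.

# Hypothetical usage sketch -- every path and name here is an illustrative placeholder.
# With hdfs_output=None the key collection runs locally via hadoopy.launch_local;
# with an HDFS path it runs on the cluster via picarus._launch_frozen instead.
run_classifier_labels('/user/me/features/pos',   # hdfs_input_pos
                      '/user/me/features/neg',   # hdfs_input_neg
                      None,                      # hdfs_output: None => run locally
                      'example_classifier',      # classifier_name
                      {},                        # classifier_extra
                      'labels.js',               # local_labels read/written via file_parse
                      'svm')                     # classifier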
def test_local(self):
    out_path = '%s/local_test/%f' % (self.data_path, time.time())
    hadoopy.put('wc-input-alice.tb', out_path + '/wc-input-alice.tb')
    hadoopy.launch_local(out_path + '/wc-input-alice.tb', out_path + '/out', 'local.py',
                         max_input=1000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])  # Just bring this along to test the files
    hadoopy.launch_local(((1000 * 'a', 10000000 * 'b') for x in range(100)), None, 'local.py',
                         max_input=10000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])
def test_name(self):
    kv_sizes = [(1, 1), (1024 ** 2, 1024 ** 2), (50 * 1024 ** 2, 50 * 1024 ** 2)]
    a = hadoopy.launch_local((x for x in kv_sizes for y in range(20)), None, 'size_job.py')
    kvs = list(((len(x), len(y)) for x, y in a['output']))
    print(len(kvs))
    a = hadoopy.launch_local(kv_sizes, None, 'size_job.py')
    kvs = list(a['output'])
    print(len(kvs))
    a = hadoopy.launch_local(kvs, None, 'null_job.py')
    kvs = list(a['output'])
    print(len(kvs))
def launch(in_name, out_name, script_path, **kw):
    # If local kv cache doesn't exist, then copy the correct number of values there
    try:
        kw['files'] = list(kw['files']) + list(files)
    except KeyError:
        kw['files'] = files
    return hadoopy.launch_local(_local_iter(in_name, max_input), None, script_path, **kw)['output']
def test_local(self): out_path = "%s/local_test/%f" % (self.data_path, time.time()) hadoopy.put("wc-input-alice.tb", out_path + "/wc-input-alice.tb") hadoopy.launch_local( out_path + "/wc-input-alice.tb", out_path + "/out", "local.py", max_input=1000, cmdenvs=["TEST_ENV=10"], files=["wc-input-alice.tb"], ) # Just bring this along to test the files hadoopy.launch_local( ((1000 * "a", 10000000 * "b") for x in range(100)), None, "local.py", max_input=10000, cmdenvs=["TEST_ENV=10"], files=["wc-input-alice.tb"], )
def test_local(self):
    out_path = '%s/local_test/%f' % (self.data_path, time.time())
    hadoopy.mkdir(out_path)
    hadoopy.put('wc-input-alice.tb', out_path + '/wc-input-alice.tb')
    hadoopy.launch_local(out_path + '/wc-input-alice.tb', out_path + '/out_list_cmdenvs', 'local.py',
                         max_input=1000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])  # Just bring this along to test the files
    hadoopy.launch_local(out_path + '/wc-input-alice.tb', out_path + '/out', 'local.py',
                         max_input=1000,
                         cmdenvs={'TEST_ENV': '10'},
                         files=['wc-input-alice.tb'])  # Just bring this along to test the files
    hadoopy.launch_local(((1000 * 'a', 10000000 * 'b') for x in range(100)), None, 'local.py',
                         max_input=10000,
                         cmdenvs=['TEST_ENV=10'],
                         files=['wc-input-alice.tb'])
def launch_zmq(flow_controller, script_path, cleanup_func=None, outputs=None, **kw):
    def _kvs():
        while True:
            yield flow_controller.recv()
    kvs = hadoopy.launch_local(_kvs(), None, script_path, poll=flow_controller.poll, **kw)['output']
    if outputs is None:
        for k, v in kvs:
            # k is the node number, v is a k/v tuple
            flow_controller.send(k, v)
    else:
        for kv in kvs:
            for s in outputs:
                flow_controller.send(s, kv)
# and then run.  This is the most common way to use Hadoopy as it avoids having to install
# anything on the cluster, including Python, dependencies, and your code.
# 3. launch_local: This is intended for unit tests, debugging, education, and very small jobs.
#    It emulates the behavior of launch/launch_frozen as closely as possible but on the local
#    machine.  Read its docstring for compatibility notes and details.
#
# The first argument is the input; for launch_local it can be an HDFS path or an iterator
# of (key, value) pairs.  The second argument is the output; it can be an HDFS path or None
# if the output shouldn't be written to HDFS (as in this case).  The third argument is the
# script path.  The return value of launch_local is a dictionary (see its docstring), and we
# want 'output', which is an iterator of the output (key, value) pairs.
#
# By default Hadoopy talks to Hadoop Streaming using a simple serialization format called
# TypedBytes.  The alternative is line-oriented records like key0<tab>value0<newline>key1<tab>value1<newline>,
# which are 1.) less efficient and 2.) more annoying to work with, as everything has to be a
# string and that string can't contain <tab> or <newline> characters.
#
# Note that the (key, value) pairs can be of any serializable Python type when using the
# TypedBytes interface (recommended and default); they will be presented to your program in
# the same form they are provided.  All base types are serialized very efficiently, with a
# fallback to Pickle for types not supported by TypedBytes.  If this is confusing, just know
# that you can input/output anything you can pickle and Hadoopy does it in a fast way.
output_kvs = hadoopy.launch_local(get_lines(input_path), None, 'wc.py')['output']

# Analyze the output.  The output is an iterator of (word, count) pairs where word is a string
# and count is an integer.
word_counts = dict(output_kvs)
for probe_word, expected_count in [('the', 1664), ('Alice', 221), ('tree', 3)]:
    print('word_counts[%s] = %d' % (probe_word, word_counts[probe_word]))
    assert expected_count == word_counts[probe_word]
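The wc.py job script referenced above is not shown in this snippet. As a rough sketch of what such a word-count job typically looks like in hadoopy (a mapper and reducer wired up through hadoopy.run), it might be something like:

# Minimal word-count job sketch (a hypothetical stand-in for the wc.py referenced above).
import hadoopy


def mapper(key, value):
    # value is expected to be one line of text; the key is ignored
    for word in value.split():
        yield word, 1


def reducer(word, counts):
    # counts is an iterator over the 1s the mapper emitted for this word
    yield word, sum(counts)


if __name__ == '__main__':
    hadoopy.run(mapper, reducer)

Because launch_local speaks the same TypedBytes protocol as the cluster launchers, the same script can later be handed to launch or launch_frozen unchanged.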
    return s


##############################################################################
# Power iteration: keep re-running the PageRank job until the eigenvector
# changes by less than 0.01 between consecutive iterations.

if hadoopy.exists(temp_vector_path):
    hadoopy.rmr("-skipTrash %s" % temp_vector_path)
copy(eigen_vector_tb_path, temp_vector_path)
while diff > 0.01:
    eigen_vector_before = load_eigen_vector(temp_vector_path)
    if hadoopy.exists(temp_vector_path):
        hadoopy.rmr("-skipTrash %s" % temp_vector_path)
    hadoopy.launch_local(data_tb_path, temp_vector_path, 'PageRank.py')
    eigen_vector_after = load_eigen_vector(temp_vector_path)
    if hadoopy.exists(eigen_vector_tb_path):
        hadoopy.rmr("-skipTrash %s" % eigen_vector_tb_path)
    copy(temp_vector_path, eigen_vector_tb_path)
    diff = calcul_delta(eigen_vector_before, eigen_vector_after)
    print(diff)