def setUp(self):
    super(SetupTestCase, self).setUp()

    os.mkdir(os.path.join(self.tmp_dir, "foo"))

    self.foo_py = os.path.join(self.tmp_dir, "foo", "foo.py")

    # if our job can import foo, getsize will return 2x as many bytes
    with open(self.foo_py, "w") as foo_py:
        foo_py.write(
            "import os.path\n"
            "from os.path import getsize as _real_getsize\n"
            "os.path.getsize = lambda p: _real_getsize(p) * 2")

    self.foo_sh = os.path.join(self.tmp_dir, "foo", "foo.sh")

    with open(self.foo_sh, "w") as foo_sh:
        foo_sh.write("#!/bin/sh\n"
                     "touch foo.sh-made-this\n")

    os.chmod(self.foo_sh, stat.S_IRWXU)

    self.foo_tar_gz = os.path.join(self.tmp_dir, "foo.tar.gz")
    tar_and_gzip(os.path.join(self.tmp_dir, "foo"), self.foo_tar_gz)

    self.foo_py_size = os.path.getsize(self.foo_py)
    self.foo_sh_size = os.path.getsize(self.foo_sh)
    self.foo_tar_gz_size = os.path.getsize(self.foo_tar_gz)
def setUp(self):
    super(SetupTestCase, self).setUp()

    os.mkdir(os.path.join(self.tmp_dir, 'foo'))

    self.foo_py = os.path.join(self.tmp_dir, 'foo', 'foo.py')

    # if our job can import foo, getsize will return 2x as many bytes
    with open(self.foo_py, 'w') as foo_py:
        foo_py.write('import os.path\n'
                     'from os.path import getsize as _real_getsize\n'
                     'os.path.getsize = lambda p: _real_getsize(p) * 2')

    self.foo_sh = os.path.join(self.tmp_dir, 'foo', 'foo.sh')

    with open(self.foo_sh, 'w') as foo_sh:
        foo_sh.write('#!/bin/sh\n'
                     'touch foo.sh-made-this\n')

    os.chmod(self.foo_sh, stat.S_IRWXU)

    self.foo_tar_gz = os.path.join(self.tmp_dir, 'foo.tar.gz')
    tar_and_gzip(os.path.join(self.tmp_dir, 'foo'), self.foo_tar_gz)

    self.foo_py_size = os.path.getsize(self.foo_py)
    self.foo_sh_size = os.path.getsize(self.foo_sh)
    self.foo_tar_gz_size = os.path.getsize(self.foo_tar_gz)
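# A minimal sanity-check sketch (not part of the original test cases above) for the
# foo.tar.gz fixture that setUp() builds with tar_and_gzip(). Only the stdlib tarfile
# module is used; the exact member names depend on tar_and_gzip's archive-name
# convention, so the expected result noted below is an assumption.
import tarfile

def check_foo_fixture(foo_tar_gz_path):
    """Return the sorted member names of the fixture tarball.

    For a tarball built from the foo/ directory above, this should list
    foo.py and foo.sh (stored relative to foo/).
    """
    tar = tarfile.open(foo_tar_gz_path, 'r:gz')
    try:
        return sorted(member.name for member in tar.getmembers())
    finally:
        tar.close()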
def _create_mrjob_tar_gz(self):
    """Make a tarball of the mrjob library, without .pyc or .pyo files,
    and return its path. This will also set self._mrjob_tar_gz_path

    It's safe to call this method multiple times (we'll only create
    the tarball once.)
    """
    if self._mrjob_tar_gz_path is None:
        # find mrjob library
        import mrjob

        if not os.path.basename(mrjob.__file__).startswith('__init__.'):
            raise Exception(
                "Bad path for mrjob library: %s; can't bootstrap mrjob",
                mrjob.__file__)

        mrjob_dir = os.path.dirname(mrjob.__file__) or '.'

        tar_gz_path = os.path.join(
            self._get_local_tmp_dir(), 'mrjob.tar.gz')

        def filter_path(path):
            filename = os.path.basename(path)
            return not(file_ext(filename).lower() in ('.pyc', '.pyo') or
                       # filter out emacs backup files
                       filename.endswith('~') or
                       # filter out emacs lock files
                       filename.startswith('.#') or
                       # filter out MacFuse resource forks
                       filename.startswith('._'))

        tar_and_gzip(mrjob_dir, tar_gz_path, filter=filter_path)

        self._mrjob_tar_gz_path = tar_gz_path

    return self._mrjob_tar_gz_path
def test_extract_dir_for_tar(self):
    join = os.path.join

    tar_and_gzip(dir=join(self.tmp_dir, 'a'),
                 out_path=join(self.tmp_dir, 'not_a.tar.gz'),
                 prefix='b')

    assert_equal(extract_dir_for_tar(join(self.tmp_dir, 'not_a.tar.gz')),
                 'b')
def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters, X, gmm_list):
    """
    Computes the BIC score for all pairs by using MapReduce and returns
    the pair with the best score.
    """
    print "Map-Reduce execution"

    # iter_gmm_list = map(lambda(gidx, didx): gmm_list[gidx], iteration_bic_list)
    # pickle.dump(iter_gmm_list, open('iter_gmm_list', 'w'))
    # os.chmod("iter_gmm_list", S_IRUSR | S_IWUSR | S_IXUSR | \
    #          S_IRGRP | S_IXGRP | \
    #          S_IROTH | S_IXOTH)
    from subprocess import call
    call(["mkdir", "-p", "gmm"])
    for i in range(0, len(iteration_bic_list)):
        gidx, didx = iteration_bic_list[i]
        pickle.dump(gmm_list[gidx], open('gmm/' + str(i), 'w'))
        # make the dumped model readable by the MapReduce workers
        # (the original chmod'ed "iter_gmm_list", a leftover from the
        # commented-out code above)
        os.chmod('gmm/' + str(i), S_IRUSR | S_IWUSR | S_IXUSR |
                 S_IRGRP | S_IXGRP |
                 S_IROTH | S_IXOTH)

    import mrjob.util as util
    util.tar_and_gzip('gmm', 'gmm.tgz')

    input = []
    l = len(iteration_bic_list)
    for gmm1idx in range(l):
        for gmm2idx in range(gmm1idx + 1, l):
            gidx1, didx1 = iteration_bic_list[gmm1idx]
            gidx2, didx2 = iteration_bic_list[gmm2idx]
            an_item = protocol().write((gmm1idx, gmm2idx), (didx1, didx2, em_iters))
            input.append(an_item + "\n")

    mr_args = ['-v', '-r', 'hadoop', '--input-protocol', 'pickle',
               '--output-protocol', 'pickle', '--protocol', 'pickle']
    job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input)
    runner = job.make_runner()
    runner.run()
    kv_pairs = map(job.parse_output_line, runner.stream_output())
    assert len(kv_pairs) == 1
    merged_tuple_indices, best_score = kv_pairs[0][1]

    # Re-merge the GMM pair with the highest score *here*, otherwise the next
    # segment_majority_vote will crash (issue with data ownership). If we don't
    # find a different workaround, we could simplify the mapper and the reducer
    # further: instead of shipping GMM pairs and merged GMMs from the mappers to
    # the reducer, we could ship just indices and scores.
    # However, this re-merging is serialized...
    ind1, ind2 = merged_tuple_indices
    gidx1, idx1 = iteration_bic_list[ind1]
    gidx2, idx2 = iteration_bic_list[ind2]
    d1 = tools.get_data_from_indices(X, idx1)
    d2 = tools.get_data_from_indices(X, idx2)
    data = np.concatenate((d1, d2))
    g1 = gmm_list[gidx1]
    g2 = gmm_list[gidx2]
    new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
    return new_gmm, (g1, g2), merged_tuple_indices, best_score
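# Hypothetical sketch (NOT the project's actual AllPairsBicScoreMRJob) of a job shaped
# the way the driver above expects: pickled (gmm1idx, gmm2idx) keys with
# (didx1, didx2, em_iters) values in, and exactly one (index_pair, best_score) record
# out. The scoring helper is a placeholder, and this assumes an mrjob version where
# protocols are set as class attributes rather than via --*-protocol flags.
from mrjob.job import MRJob
from mrjob.protocol import PickleProtocol


class AllPairsBicScoreSketch(MRJob):

    INPUT_PROTOCOL = PickleProtocol
    INTERNAL_PROTOCOL = PickleProtocol
    OUTPUT_PROTOCOL = PickleProtocol

    def mapper(self, pair_indices, payload):
        didx1, didx2, em_iters = payload
        # placeholder: the real mapper would load the two GMMs shipped in gmm.tgz,
        # merge them, and score the merged model with BIC
        score = self.score_pair(pair_indices, didx1, didx2, em_iters)
        # funnel everything to one reducer key so only the best pair survives
        yield 'best', (pair_indices, score)

    def reducer(self, _, scored_pairs):
        # one key -> one output line, matching `assert len(kv_pairs) == 1` above
        yield 'best', max(scored_pairs, key=lambda pair_and_score: pair_and_score[1])

    def score_pair(self, pair_indices, didx1, didx2, em_iters):
        # stub so the sketch runs; substitute a real BIC computation here
        return 0.0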
def test_tar_and_gzip(self):
    join = os.path.join

    # tar it up, and put it in subdirectory (b/)
    tar_and_gzip(dir=join(self.tmp_dir, 'a'),
                 out_path=join(self.tmp_dir, 'a.tar.gz'),
                 filter=lambda path: not path.endswith('z'),
                 prefix='b')

    # untar it into b/
    t = tarfile.open(join(self.tmp_dir, 'a.tar.gz'), 'r:gz')
    t.extractall(self.tmp_dir)
    t.close()

    self.ensure_expected_results(excluded_files=['baz'])
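# A rough stdlib-only approximation (not mrjob's actual implementation) of the
# tar_and_gzip(dir, out_path, filter=..., prefix=...) behavior the tests above rely on:
# walk `dir`, keep only the paths accepted by `filter`, and store them under `prefix/`
# inside a gzipped tarball. Parameter names mirror the calls above; whether the real
# filter sees absolute or relative paths is an assumption here.
import os
import tarfile

def tar_and_gzip_sketch(dir, out_path, filter=lambda path: True, prefix=''):
    tar = tarfile.open(out_path, 'w:gz')
    try:
        for dirpath, dirnames, filenames in os.walk(dir):
            for filename in filenames:
                path = os.path.join(dirpath, filename)
                if filter(path):
                    rel_path = os.path.relpath(path, dir)
                    tar.add(path, arcname=os.path.join(prefix, rel_path))
    finally:
        tar.close()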
def _create_mrjob_tar_gz(self):
    """Make a tarball of the mrjob library, without .pyc or .pyo files,

    This will also set ``self._mrjob_tar_gz_path`` and return it.

    Typically called from :py:meth:`_create_setup_wrapper_script`.

    It's safe to call this method multiple times (we'll only create
    the tarball once.)
    """
    if not self._mrjob_tar_gz_path:
        # find mrjob library
        import mrjob

        if not os.path.basename(mrjob.__file__).startswith('__init__.'):
            raise Exception(
                "Bad path for mrjob library: %s; can't bootstrap mrjob",
                mrjob.__file__)

        mrjob_dir = os.path.dirname(mrjob.__file__) or '.'

        tar_gz_path = os.path.join(self._get_local_tmp_dir(), 'mrjob.tar.gz')

        def filter_path(path):
            filename = os.path.basename(path)
            return not (filename.lower().endswith('.pyc') or
                        filename.lower().endswith('.pyo') or
                        # filter out emacs backup files
                        filename.endswith('~') or
                        # filter out emacs lock files
                        filename.startswith('.#') or
                        # filter out MacFuse resource forks
                        filename.startswith('._'))

        log.debug('archiving %s -> %s as %s' % (
            mrjob_dir, tar_gz_path, os.path.join('mrjob', '')))
        tar_and_gzip(mrjob_dir, tar_gz_path, filter=filter_path, prefix='mrjob')

        self._mrjob_tar_gz_path = tar_gz_path

    return self._mrjob_tar_gz_path
def test_master_bootstrap_script_is_valid_python(self):
    # create a fake src tarball
    with open(os.path.join(self.tmp_dir, 'foo.py'), 'w'):
        pass
    yelpy_tar_gz_path = os.path.join(self.tmp_dir, 'yelpy.tar.gz')
    tar_and_gzip(self.tmp_dir, yelpy_tar_gz_path, prefix='yelpy')

    # do everything
    runner = EMRJobRunner(conf_path=False,
                          bootstrap_cmds=['echo "Hi!"', 'true', 'ls'],
                          bootstrap_files=['/tmp/quz'],
                          bootstrap_mrjob=True,
                          bootstrap_python_packages=[yelpy_tar_gz_path],
                          bootstrap_scripts=['speedups.sh', '/tmp/s.sh'])
    script_path = os.path.join(self.tmp_dir, 'b.py')

    runner._create_master_bootstrap_script(dest=script_path)

    assert os.path.exists(script_path)
    py_compile.compile(script_path)
def _create_mrjob_tar_gz(self):
    """Make a tarball of the mrjob library, without .pyc or .pyo files,

    This will also set ``self._mrjob_tar_gz_path`` and return it.

    Typically called from :py:meth:`_create_setup_wrapper_script`.

    It's safe to call this method multiple times (we'll only create
    the tarball once.)
    """
    if not self._mrjob_tar_gz_path:
        # find mrjob library
        import mrjob

        if not os.path.basename(mrjob.__file__).startswith('__init__.'):
            raise Exception(
                "Bad path for mrjob library: %s; can't bootstrap mrjob",
                mrjob.__file__)

        mrjob_dir = os.path.dirname(mrjob.__file__) or '.'

        tar_gz_path = os.path.join(self._get_local_tmp_dir(), 'mrjob.tar.gz')

        def filter_path(path):
            filename = os.path.basename(path)
            return not(filename.lower().endswith('.pyc') or
                       filename.lower().endswith('.pyo') or
                       # filter out emacs backup files
                       filename.endswith('~') or
                       # filter out emacs lock files
                       filename.startswith('.#') or
                       # filter out MacFuse resource forks
                       filename.startswith('._'))

        log.debug('archiving %s -> %s as %s' % (
            mrjob_dir, tar_gz_path, os.path.join('mrjob', '')))
        tar_and_gzip(
            mrjob_dir, tar_gz_path, filter=filter_path, prefix='mrjob')

        self._mrjob_tar_gz_path = tar_gz_path

    return self._mrjob_tar_gz_path
def _create_mrjob_tar_gz(self):
    """Make a tarball of the mrjob library, without .pyc or .pyo files,
    and return its path. This will also set self._mrjob_tar_gz_path

    It's safe to call this method multiple times (we'll only create
    the tarball once.)
    """
    if self._mrjob_tar_gz_path is None:
        # find mrjob library
        import mrjob

        if not os.path.basename(mrjob.__file__).startswith("__init__."):
            raise Exception(
                "Bad path for mrjob library: %s; can't bootstrap mrjob",
                mrjob.__file__)

        mrjob_dir = os.path.dirname(mrjob.__file__) or "."

        tar_gz_path = os.path.join(self._get_local_tmp_dir(), "mrjob.tar.gz")

        def filter_path(path):
            filename = os.path.basename(path)
            return not (
                file_ext(filename).lower() in (".pyc", ".pyo") or
                # filter out emacs backup files
                filename.endswith("~") or
                # filter out emacs lock files
                filename.startswith(".#") or
                # filter out MacFuse resource forks
                filename.startswith("._")
            )

        log.debug("archiving %s -> %s as %s" %
                  (mrjob_dir, tar_gz_path, os.path.join("mrjob", "")))
        tar_and_gzip(mrjob_dir, tar_gz_path, filter=filter_path, prefix="mrjob")

        self._mrjob_tar_gz_path = tar_gz_path

    return self._mrjob_tar_gz_path
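# Hedged illustration (paths and helper name are hypothetical) of why the tarball
# above is built with prefix="mrjob": unpacking it yields an importable mrjob/
# package directory, so the unpack location only needs to be on sys.path for
# `import mrjob` to work on the remote node.
import sys
import tarfile

def unpack_and_import_mrjob(mrjob_tar_gz_path, dest_dir):
    tar = tarfile.open(mrjob_tar_gz_path, "r:gz")
    try:
        tar.extractall(dest_dir)   # creates dest_dir/mrjob/__init__.py, ...
    finally:
        tar.close()
    sys.path.insert(0, dest_dir)   # now `import mrjob` resolves to the unpacked copy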
def test_extract_dir_for_tar(self):
    join = os.path.join

    tar_and_gzip(dir=join(self.tmp_dir, "a"),
                 out_path=join(self.tmp_dir, "not_a.tar.gz"),
                 prefix="b")

    self.assertEqual(extract_dir_for_tar(join(self.tmp_dir, "not_a.tar.gz")),
                     "b")
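# Sketch (an assumption based on the tests above, not mrjob's actual code) of what
# extract_dir_for_tar() is expected to return: the single top-level directory the
# tarball would extract into. Only gzip-compressed archives are handled here.
import tarfile

def extract_dir_for_tar_sketch(archive_path):
    tar = tarfile.open(archive_path, "r:gz")
    try:
        first_member = tar.getmembers()[0]
    finally:
        tar.close()
    # e.g. a member named "b/foo.py" (or "b") yields "b"
    return first_member.name.split("/")[0]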