Example 1
    def setUp(self):
        super(SetupTestCase, self).setUp()

        os.mkdir(os.path.join(self.tmp_dir, "foo"))

        self.foo_py = os.path.join(self.tmp_dir, "foo", "foo.py")

        # if our job can import foo, getsize will return 2x as many bytes
        with open(self.foo_py, "w") as foo_py:
            foo_py.write(
                "import os.path\n"
                "from os.path import getsize as _real_getsize\n"
                "os.path.getsize = lambda p: _real_getsize(p) * 2"
            )

        self.foo_sh = os.path.join(self.tmp_dir, "foo", "foo.sh")

        with open(self.foo_sh, "w") as foo_sh:
            foo_sh.write("#!/bin/sh\n" "touch foo.sh-made-this\n")
        os.chmod(self.foo_sh, stat.S_IRWXU)

        self.foo_tar_gz = os.path.join(self.tmp_dir, "foo.tar.gz")
        tar_and_gzip(os.path.join(self.tmp_dir, "foo"), self.foo_tar_gz)

        self.foo_py_size = os.path.getsize(self.foo_py)
        self.foo_sh_size = os.path.getsize(self.foo_sh)
        self.foo_tar_gz_size = os.path.getsize(self.foo_tar_gz)
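
The setUp above builds a foo/ directory containing foo.py and an executable foo.sh, archives it with tar_and_gzip, and records the file sizes for later assertions. As a quick illustration (the archive path below is a stand-in, not part of the test), the resulting .tar.gz can be inspected with the standard tarfile module:

    import tarfile

    # Stand-in path; in the test above this would be self.foo_tar_gz.
    archive_path = "/tmp/foo.tar.gz"

    with tarfile.open(archive_path, "r:gz") as tar:
        # List each archived member with its size.
        for member in tar.getmembers():
            print(member.name, member.size)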
Example 2
    def setUp(self):
        super(SetupTestCase, self).setUp()

        os.mkdir(os.path.join(self.tmp_dir, 'foo'))

        self.foo_py = os.path.join(self.tmp_dir, 'foo', 'foo.py')

        # if our job can import foo, getsize will return 2x as many bytes
        with open(self.foo_py, 'w') as foo_py:
            foo_py.write('import os.path\n'
                         'from os.path import getsize as _real_getsize\n'
                         'os.path.getsize = lambda p: _real_getsize(p) * 2')

        self.foo_sh = os.path.join(self.tmp_dir, 'foo', 'foo.sh')

        with open(self.foo_sh, 'w') as foo_sh:
            foo_sh.write('#!/bin/sh\n'
                         'touch foo.sh-made-this\n')
        os.chmod(self.foo_sh, stat.S_IRWXU)

        self.foo_tar_gz = os.path.join(self.tmp_dir, 'foo.tar.gz')
        tar_and_gzip(os.path.join(self.tmp_dir, 'foo'), self.foo_tar_gz)

        self.foo_py_size = os.path.getsize(self.foo_py)
        self.foo_sh_size = os.path.getsize(self.foo_sh)
        self.foo_tar_gz_size = os.path.getsize(self.foo_tar_gz)
Example 3
    def setUp(self):
        super(SetupTestCase, self).setUp()

        os.mkdir(os.path.join(self.tmp_dir, 'foo'))

        self.foo_py = os.path.join(self.tmp_dir, 'foo', 'foo.py')

        # if our job can import foo, getsize will return 2x as many bytes
        with open(self.foo_py, 'w') as foo_py:
            foo_py.write('import os.path\n'
                         'from os.path import getsize as _real_getsize\n'
                         'os.path.getsize = lambda p: _real_getsize(p) * 2')

        self.foo_sh = os.path.join(self.tmp_dir, 'foo', 'foo.sh')

        with open(self.foo_sh, 'w') as foo_sh:
            foo_sh.write('#!/bin/sh\n' 'touch foo.sh-made-this\n')
        os.chmod(self.foo_sh, stat.S_IRWXU)

        self.foo_tar_gz = os.path.join(self.tmp_dir, 'foo.tar.gz')
        tar_and_gzip(os.path.join(self.tmp_dir, 'foo'), self.foo_tar_gz)

        self.foo_py_size = os.path.getsize(self.foo_py)
        self.foo_sh_size = os.path.getsize(self.foo_sh)
        self.foo_tar_gz_size = os.path.getsize(self.foo_tar_gz)
Example 4
    def _create_mrjob_tar_gz(self):
        """Make a tarball of the mrjob library, without .pyc or .pyo files,
        and return its path. This will also set self._mrjob_tar_gz_path.

        It's safe to call this method multiple times (we'll only create
        the tarball once.)
        """
        if self._mrjob_tar_gz_path is None:
            # find mrjob library
            import mrjob

            if not os.path.basename(mrjob.__file__).startswith('__init__.'):
                raise Exception(
                    "Bad path for mrjob library: %s; can't bootstrap mrjob" %
                    mrjob.__file__)

            mrjob_dir = os.path.dirname(mrjob.__file__) or '.'

            tar_gz_path = os.path.join(
                self._get_local_tmp_dir(), 'mrjob.tar.gz')

            def filter_path(path):
                filename = os.path.basename(path)
                return not (
                    file_ext(filename).lower() in ('.pyc', '.pyo') or
                    # filter out emacs backup files
                    filename.endswith('~') or
                    # filter out emacs lock files
                    filename.startswith('.#') or
                    # filter out MacFuse resource forks
                    filename.startswith('._'))

            tar_and_gzip(mrjob_dir, tar_gz_path, filter=filter_path)
            self._mrjob_tar_gz_path = tar_gz_path

        return self._mrjob_tar_gz_path
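
The filter_path predicate above drops compiled Python files, emacs backup and lock files, and MacFuse resource forks before the library is archived. A small self-contained sketch of those exclusion rules applied to made-up filenames (illustrative only, not the runner's code):

    def should_include(filename):
        # Mirror the exclusions used by filter_path above.
        return not (filename.lower().endswith(('.pyc', '.pyo')) or
                    filename.endswith('~') or      # emacs backup files
                    filename.startswith('.#') or   # emacs lock files
                    filename.startswith('._'))     # MacFuse resource forks

    assert should_include('conf.py')
    assert not should_include('conf.pyc')
    assert not should_include('conf.py~')
    assert not should_include('.#conf.py')
    assert not should_include('._conf.py')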
Example 5
    def test_extract_dir_for_tar(self):
        join = os.path.join
        tar_and_gzip(dir=join(self.tmp_dir, 'a'),
                     out_path=join(self.tmp_dir, 'not_a.tar.gz'),
                     prefix='b')

        assert_equal(extract_dir_for_tar(join(self.tmp_dir, 'not_a.tar.gz')),
                     'b')
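
extract_dir_for_tar reports the directory the archive will extract into, here the 'b' prefix passed to tar_and_gzip. A rough sketch of that idea, assuming it simply looks at the first path component shared by the archive's members (an illustration, not mrjob's implementation):

    import tarfile

    def top_level_dir(tar_gz_path):
        # Return the single first path component shared by all members,
        # or None if the members do not share one.
        with tarfile.open(tar_gz_path, 'r:gz') as tar:
            first_components = set(
                member.name.split('/')[0] for member in tar.getmembers())
        if len(first_components) == 1:
            return first_components.pop()
        return None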
Example 6
    def test_extract_dir_for_tar(self):
        join = os.path.join
        tar_and_gzip(dir=join(self.tmp_dir, 'a'),
                     out_path=join(self.tmp_dir, 'not_a.tar.gz'),
                     prefix='b')

        assert_equal(extract_dir_for_tar(join(self.tmp_dir, 'not_a.tar.gz')),
                     'b')
Example 7
    def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters, X, gmm_list):
        """
        Computes the BIC score for all pairs by using MapReduce and returns
        the pair with the best score
        """
        
        print "Map-Reduce execution"
#        iter_gmm_list = map(lambda(gidx, didx): gmm_list[gidx], iteration_bic_list)
#        pickle.dump(iter_gmm_list, open('iter_gmm_list', 'w'))
#        os.chmod("iter_gmm_list", S_IRUSR | S_IWUSR | S_IXUSR | \
#                                 S_IRGRP | S_IXGRP |           \
#                                 S_IROTH | S_IXOTH             )
        
        from subprocess import call
        call(["mkdir", "-p", "gmm"])
        for i in range(len(iteration_bic_list)):
            gidx, didx = iteration_bic_list[i]
            gmm_path = 'gmm/' + str(i)
            # write each GMM to its own file and make it world-readable/executable
            with open(gmm_path, 'w') as gmm_file:
                pickle.dump(gmm_list[gidx], gmm_file)
            os.chmod(gmm_path, S_IRUSR | S_IWUSR | S_IXUSR |
                               S_IRGRP | S_IXGRP |
                               S_IROTH | S_IXOTH)
        import mrjob.util as util
        util.tar_and_gzip('gmm', 'gmm.tgz') 
        
        input = []
        l = len(iteration_bic_list)
        for gmm1idx in range(l):
            for gmm2idx in range(gmm1idx + 1, l):
                gidx1, didx1 = iteration_bic_list[gmm1idx]
                gidx2, didx2 = iteration_bic_list[gmm2idx]
                an_item = protocol().write((gmm1idx, gmm2idx), (didx1, didx2, em_iters))
                input.append(an_item + "\n")

        mr_args = ['-v', '-r', 'hadoop',
                   '--input-protocol', 'pickle',
                   '--output-protocol', 'pickle',
                   '--protocol', 'pickle']
        job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input)
        runner = job.make_runner()
        runner.run()
        kv_pairs = map(job.parse_output_line, runner.stream_output())
        assert len(kv_pairs) == 1
        merged_tuple_indices, best_score = kv_pairs[0][1]
    
        # Re-merge the GMM pair with the highest score *here*; otherwise the next
        # segment_majority_vote will crash (an issue with data ownership). If we don't
        # find a different workaround, we could simplify the mapper and the reducer
        # further: instead of moving the GMM pairs and merged GMMs from the mappers
        # to the reducer, we could move just indices and scores.
        # However, this re-merging is serialized...
        ind1, ind2 = merged_tuple_indices
        gidx1, idx1 = iteration_bic_list[ind1]
        gidx2, idx2 = iteration_bic_list[ind2]
        d1 = tools.get_data_from_indices(X, idx1)
        d2 = tools.get_data_from_indices(X, idx2)
        data = np.concatenate((d1,d2))
        g1 = gmm_list[gidx1]
        g2 = gmm_list[gidx2]
        new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
            
        return new_gmm, (g1, g2), merged_tuple_indices, best_score
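
For reference, the nested loops above enumerate every unordered index pair (gmm1idx < gmm2idx) whose BIC score the MapReduce job computes. The same enumeration, sketched with itertools (not the project's code):

    from itertools import combinations

    n = 4  # stand-in for len(iteration_bic_list)
    # Every unordered pair (i, j) with i < j, as produced by the nested loops.
    pairs = list(combinations(range(n), 2))
    print(pairs)  # [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]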
Example 8
    def test_tar_and_gzip(self):
        join = os.path.join

        # tar it up, and put it in subdirectory (b/)
        tar_and_gzip(dir=join(self.tmp_dir, 'a'),
                     out_path=join(self.tmp_dir, 'a.tar.gz'),
                     filter=lambda path: not path.endswith('z'),
                     prefix='b')

        # untar it into b/
        t = tarfile.open(join(self.tmp_dir, 'a.tar.gz'), 'r:gz')
        t.extractall(self.tmp_dir)
        t.close()

        self.ensure_expected_results(excluded_files=['baz'])
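
The test archives tmp_dir/a under the prefix b/ while filtering out any path that ends in 'z', then unpacks the result and checks what survived. A small sketch (paths are stand-ins) of verifying both the prefix and the filter directly on the archive:

    import os
    import tarfile

    tmp_dir = '/tmp/example'  # stand-in for self.tmp_dir
    with tarfile.open(os.path.join(tmp_dir, 'a.tar.gz'), 'r:gz') as t:
        names = t.getnames()

    # Every member should live under the b/ prefix...
    assert all(name == 'b' or name.startswith('b/') for name in names)
    # ...and paths ending in 'z' (e.g. 'baz') should have been filtered out.
    assert not any(name.endswith('z') for name in names)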
Example 9
    def test_tar_and_gzip(self):
        join = os.path.join

        # tar it up, and put it in subdirectory (b/)
        tar_and_gzip(dir=join(self.tmp_dir, 'a'),
                     out_path=join(self.tmp_dir, 'a.tar.gz'),
                     filter=lambda path: not path.endswith('z'),
                     prefix='b')

        # untar it into b/
        t = tarfile.open(join(self.tmp_dir, 'a.tar.gz'), 'r:gz')
        t.extractall(self.tmp_dir)
        t.close()

        self.ensure_expected_results(excluded_files=['baz'])
Example 10
    def _create_mrjob_tar_gz(self):
        """Make a tarball of the mrjob library, without .pyc or .pyo files,
        This will also set ``self._mrjob_tar_gz_path`` and return it.

        Typically called from
        :py:meth:`_create_setup_wrapper_script`.

        It's safe to call this method multiple times (we'll only create
        the tarball once.)
        """
        if not self._mrjob_tar_gz_path:
            # find mrjob library
            import mrjob

            if not os.path.basename(mrjob.__file__).startswith('__init__.'):
                raise Exception(
                    "Bad path for mrjob library: %s; can't bootstrap mrjob" %
                    mrjob.__file__)

            mrjob_dir = os.path.dirname(mrjob.__file__) or '.'

            tar_gz_path = os.path.join(self._get_local_tmp_dir(),
                                       'mrjob.tar.gz')

            def filter_path(path):
                filename = os.path.basename(path)
                return not (filename.lower().endswith('.pyc') or
                            filename.lower().endswith('.pyo') or
                            # filter out emacs backup files
                            filename.endswith('~') or
                            # filter out emacs lock files
                            filename.startswith('.#') or
                            # filter out MacFuse resource forks
                            filename.startswith('._'))

            log.debug('archiving %s -> %s as %s' %
                      (mrjob_dir, tar_gz_path, os.path.join('mrjob', '')))
            tar_and_gzip(mrjob_dir,
                         tar_gz_path,
                         filter=filter_path,
                         prefix='mrjob')

            self._mrjob_tar_gz_path = tar_gz_path

        return self._mrjob_tar_gz_path
Example 11
    def test_master_bootstrap_script_is_valid_python(self):
        # create a fake src tarball
        with open(os.path.join(self.tmp_dir, 'foo.py'), 'w'):
            pass
        yelpy_tar_gz_path = os.path.join(self.tmp_dir, 'yelpy.tar.gz')
        tar_and_gzip(self.tmp_dir, yelpy_tar_gz_path, prefix='yelpy')

        # do everything
        runner = EMRJobRunner(conf_path=False,
                              bootstrap_cmds=['echo "Hi!"', 'true', 'ls'],
                              bootstrap_files=['/tmp/quz'],
                              bootstrap_mrjob=True,
                              bootstrap_python_packages=[yelpy_tar_gz_path],
                              bootstrap_scripts=['speedups.sh', '/tmp/s.sh'])
        script_path = os.path.join(self.tmp_dir, 'b.py')
        runner._create_master_bootstrap_script(dest=script_path)

        assert os.path.exists(script_path)
        py_compile.compile(script_path)
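
The final assertion relies on py_compile: byte-compiling the generated bootstrap script fails if it is not valid Python. A tiny standalone sketch of that check (the temporary file here is illustrative):

    import py_compile
    import tempfile

    # Write a trivially valid script and byte-compile it; with doraise=True,
    # py_compile.compile() raises PyCompileError on a syntax error.
    with tempfile.NamedTemporaryFile('w', suffix='.py', delete=False) as f:
        f.write('print("Hi!")\n')
        path = f.name

    py_compile.compile(path, doraise=True)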
Example 12
    def _create_mrjob_tar_gz(self):
        """Make a tarball of the mrjob library, without .pyc or .pyo files,
        This will also set ``self._mrjob_tar_gz_path`` and return it.

        Typically called from
        :py:meth:`_create_setup_wrapper_script`.

        It's safe to call this method multiple times (we'll only create
        the tarball once.)
        """
        if not self._mrjob_tar_gz_path:
            # find mrjob library
            import mrjob

            if not os.path.basename(mrjob.__file__).startswith('__init__.'):
                raise Exception(
                    "Bad path for mrjob library: %s; can't bootstrap mrjob" %
                    mrjob.__file__)

            mrjob_dir = os.path.dirname(mrjob.__file__) or '.'

            tar_gz_path = os.path.join(self._get_local_tmp_dir(),
                                       'mrjob.tar.gz')

            def filter_path(path):
                filename = os.path.basename(path)
                return not(filename.lower().endswith('.pyc') or
                           filename.lower().endswith('.pyo') or
                           # filter out emacs backup files
                           filename.endswith('~') or
                           # filter out emacs lock files
                           filename.startswith('.#') or
                           # filter out MacFuse resource forks
                           filename.startswith('._'))

            log.debug('archiving %s -> %s as %s' % (
                mrjob_dir, tar_gz_path, os.path.join('mrjob', '')))
            tar_and_gzip(
                mrjob_dir, tar_gz_path, filter=filter_path, prefix='mrjob')

            self._mrjob_tar_gz_path = tar_gz_path

        return self._mrjob_tar_gz_path
Example 13
File: runner.py Project: pjob/mrjob
    def _create_mrjob_tar_gz(self):
        """Make a tarball of the mrjob library, without .pyc or .pyo files,
        and return its path. This will also set self._mrjob_tar_gz_path.

        It's safe to call this method multiple times (we'll only create
        the tarball once.)
        """
        if self._mrjob_tar_gz_path is None:
            # find mrjob library
            import mrjob

            if not os.path.basename(mrjob.__file__).startswith("__init__."):
                raise Exception("Bad path for mrjob library: %s; can't bootstrap mrjob", mrjob.__file__)

            mrjob_dir = os.path.dirname(mrjob.__file__) or "."

            tar_gz_path = os.path.join(self._get_local_tmp_dir(), "mrjob.tar.gz")

            def filter_path(path):
                filename = os.path.basename(path)
                return not (
                    file_ext(filename).lower() in (".pyc", ".pyo")
                    or
                    # filter out emacs backup files
                    filename.endswith("~")
                    or
                    # filter out emacs lock files
                    filename.startswith(".#")
                    or
                    # filter out MacFuse resource forks
                    filename.startswith("._")
                )

            log.debug("archiving %s -> %s as %s" % (mrjob_dir, tar_gz_path, os.path.join("mrjob", "")))
            tar_and_gzip(mrjob_dir, tar_gz_path, filter=filter_path, prefix="mrjob")
            self._mrjob_tar_gz_path = tar_gz_path

        return self._mrjob_tar_gz_path
Example 14
    def test_extract_dir_for_tar(self):
        join = os.path.join
        tar_and_gzip(dir=join(self.tmp_dir, "a"), out_path=join(self.tmp_dir, "not_a.tar.gz"), prefix="b")

        self.assertEqual(extract_dir_for_tar(join(self.tmp_dir, "not_a.tar.gz")), "b")
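
Across these examples, tar_and_gzip is called with a source directory, an output path, and optional filter and prefix keyword arguments. A composite sketch of that usage (the paths and the filter below are illustrative):

    from mrjob.util import tar_and_gzip

    # Archive src_dir into out.tar.gz, skipping compiled files and placing
    # all members under the 'pkg/' prefix, as the tests and runners above do.
    tar_and_gzip(dir='/tmp/src_dir',
                 out_path='/tmp/out.tar.gz',
                 filter=lambda path: not path.endswith('.pyc'),
                 prefix='pkg')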