Example #1
0
File: hadoop.py Project: Yelp/mrjob
    def _find_hadoop_bin(self):
        """Look for the hadoop binary in any plausible place. If all
        else fails, return ``['hadoop']``.

        Candidate directories are tried in this order:

        1. ``bin`` under ``$HADOOP_PREFIX``, ``$HADOOP_HOME`` and
           ``$HADOOP_INSTALL``
        2. ``$HADOOP_INSTALL/hadoop/bin``
        3. ``$PATH`` (signalled to ``which()`` by a path of ``None``)
        4. ``bin`` under every ``$HADOOP_*_HOME``, sorted by variable name

        Environment variables that are unset or empty are ignored.

        Returns a one-element list containing whatever ``which()`` found
        first, or ``['hadoop']`` if nothing was found.
        """
        def yield_paths():
            for name in 'HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL':
                path = os.environ.get(name)
                if path:
                    yield os.path.join(path, 'bin')

            # They use $HADOOP_INSTALL/hadoop/bin here:
            # https://wiki.apache.org/hadoop/GettingStartedWithHadoop
            hadoop_install = os.environ.get('HADOOP_INSTALL')
            if hadoop_install:
                yield os.path.join(hadoop_install, 'hadoop', 'bin')

            yield None  # use $PATH

            # Maybe it's in $HADOOP_MAPRED_HOME? $HADOOP_YARN_HOME? Don't give
            # up. Don't worry about duplicates; they're de-duplicated below
            for name, path in sorted(os.environ.items()):
                # Skip empty values: os.path.join('', 'bin') is the relative
                # directory 'bin', which would make us search relative to
                # the current working directory.
                if (path and name.startswith('HADOOP_') and
                        name.endswith('_HOME')):
                    yield os.path.join(path, 'bin')

        for path in unique(yield_paths()):
            log.info('Looking for hadoop binary in %s...' % (path or '$PATH'))

            hadoop_bin = which('hadoop', path=path)

            if hadoop_bin:
                log.info('Found hadoop binary: %s' % hadoop_bin)
                return [hadoop_bin]

        # The loop only exits early via return, so reaching this point means
        # no candidate directory had the binary.
        log.info("Falling back to 'hadoop'")
        return ['hadoop']
Example #2
0
    def _find_spark_submit_bin(self):
        """Search for the spark-submit binary and return it as a
        one-element argument list, or ``['spark-submit']`` if the
        search comes up empty.
        """
        # TODO: this is very similar to _find_hadoop_bin() (in fs)
        for bin_dir in unique(self._spark_submit_bin_dirs()):
            # a falsy directory means "let which() search $PATH"
            where = bin_dir or '$PATH'
            log.info('Looking for spark-submit binary in %s...' % where)

            found = which('spark-submit', path=bin_dir)
            if not found:
                continue

            log.info('Found spark-submit binary: %s' % found)
            return [found]

        # nothing matched in any candidate directory
        log.info("Falling back to 'spark-submit'")
        return ['spark-submit']
Example #3
0
File: bin.py Project: Affirm/mrjob
    def _find_spark_submit_bin(self):
        """Try to locate the spark-submit binary.

        Returns a list of arguments: the located binary as the only
        element, or ``['spark-submit']`` when no candidate directory
        contains it.

        Subclasses that already know where spark-submit lives (e.g. on
        cloud services) can override this method.
        """
        for candidate_dir in unique(self._spark_submit_bin_dirs()):
            # a falsy directory means "let which() search $PATH"
            shown_dir = candidate_dir or '$PATH'
            log.info('Looking for spark-submit binary in %s...' % shown_dir)

            located = which('spark-submit', path=candidate_dir)
            if not located:
                continue

            log.info('Found spark-submit binary: %s' % located)
            return [located]

        # every candidate was tried without success
        log.info("Falling back to 'spark-submit'")
        return ['spark-submit']
Example #4
0
def _hadoop_prefix_from_bin(hadoop_bin):
    """Infer the hadoop home directory from the path of the hadoop binary.

    The home is taken to be the parent of the (symlink-resolved)
    directory containing the binary. Returns ``None`` if a bare binary
    name can't be resolved with ``which()``, or if the inferred home is
    listed in ``_BAD_HADOOP_HOMES`` (parents of default path directories
    such as ``/``, ``/usr``, or ``/usr/local``).
    """
    # only bare names need a lookup; anything containing a slash
    # (including relative paths) is used as given
    is_bare_name = '/' not in hadoop_bin
    resolved_bin = which(hadoop_bin) if is_bare_name else hadoop_bin
    if not resolved_bin:
        return None

    # step up one level from the directory holding the binary
    bin_dir = posixpath.realpath(posixpath.dirname(resolved_bin))
    hadoop_home = posixpath.abspath(posixpath.join(bin_dir, '..'))

    return None if hadoop_home in _BAD_HADOOP_HOMES else hadoop_home
Example #5
0
def _hadoop_prefix_from_bin(hadoop_bin):
    """Work out the implied hadoop home for a given hadoop binary.

    Returns the parent of the binary's real directory, or ``None`` when
    we can't tell: either the unqualified name isn't found by
    ``which()``, or the result is one of ``_BAD_HADOOP_HOMES`` (we don't
    want the parent of default path directories, e.g. ``/``, ``/usr``,
    or ``/usr/local``).
    """
    binary = hadoop_bin

    # an unqualified name has to be looked up first (relative paths
    # with a slash in them are fine as they are)
    if '/' not in binary:
        binary = which(binary)
        if not binary:
            return None

    containing_dir = posixpath.dirname(binary)
    real_dir = posixpath.realpath(containing_dir)
    parent = posixpath.abspath(posixpath.join(real_dir, '..'))

    if parent not in _BAD_HADOOP_HOMES:
        return parent

    return None
Example #6
0
    def _find_hadoop_bin(self):
        """Look for the hadoop binary in any plausible place. If all
        else fails, return ``['hadoop']``.

        Candidate directories are tried in this order:

        1. ``bin`` under ``self._hadoop_home``, if set
        2. ``bin`` under ``$HADOOP_PREFIX``, ``$HADOOP_HOME`` and
           ``$HADOOP_INSTALL``
        3. ``$HADOOP_INSTALL/hadoop/bin``
        4. ``$PATH`` (signalled to ``which()`` by a path of ``None``)
        5. ``bin`` under every ``$HADOOP_*_HOME``, sorted by variable name

        Environment variables that are unset or empty are ignored.

        Returns a one-element list containing whatever ``which()`` found
        first, or ``['hadoop']`` if nothing was found.
        """
        def yield_paths():
            if self._hadoop_home:
                yield os.path.join(self._hadoop_home, 'bin')

            for name in 'HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL':
                path = os.environ.get(name)
                if path:
                    yield os.path.join(path, 'bin')

            # They use $HADOOP_INSTALL/hadoop/bin here:
            # https://wiki.apache.org/hadoop/GettingStartedWithHadoop
            hadoop_install = os.environ.get('HADOOP_INSTALL')
            if hadoop_install:
                yield os.path.join(hadoop_install, 'hadoop', 'bin')

            yield None  # use $PATH

            # Maybe it's in $HADOOP_MAPRED_HOME? $HADOOP_YARN_HOME? Don't give
            # up. Don't worry about duplicates; they're de-duplicated below
            for name, path in sorted(os.environ.items()):
                # Skip empty values: os.path.join('', 'bin') is the relative
                # directory 'bin', which would make us search relative to
                # the current working directory.
                if (path and name.startswith('HADOOP_') and
                        name.endswith('_HOME')):
                    yield os.path.join(path, 'bin')

        for path in unique(yield_paths()):
            log.info('Looking for hadoop binary in %s...' % (path or '$PATH'))

            hadoop_bin = which('hadoop', path=path)

            if hadoop_bin:
                log.info('Found hadoop binary: %s' % hadoop_bin)
                return [hadoop_bin]

        # The loop only exits early via return, so reaching this point means
        # no candidate directory had the binary.
        log.info("Falling back to 'hadoop'")
        return ['hadoop']
Example #7
0
 def test_no_path(self):
     """which() returns None when the environment is completely empty."""
     # with the environment cleared there is no $PATH at all; which()
     # has to shield find_executable() from that on Python 2
     with patch.dict(os.environ, clear=True):
         found = which('shekondar')
         self.assertEqual(found, None)
Example #8
0
 def test_not_found(self):
     """which() returns None for a name that isn't in the given path."""
     found = which('shekondar-the-fearsome', self.tmp_dir)
     self.assertEqual(found, None)
Example #9
0
 def test_path_from_environment(self):
     """which() searches $PATH when no explicit path is passed."""
     with patch.dict(os.environ, PATH=self.tmp_dir):
         found = which('shekondar')
         self.assertEqual(found, self.shekondar_path)
Example #10
0
 def test_explicit_path(self):
     """which() searches the directory passed via ``path=``."""
     found = which('shekondar', path=self.tmp_dir)
     self.assertEqual(found, self.shekondar_path)
Example #11
0
 def test_no_path(self):
     """Looking up a binary with an empty environment yields None."""
     # clearing the environment removes $PATH; which() must protect
     # find_executable() from that on Python 2
     with patch.dict(os.environ, clear=True):
         result = which('shekondar')
         self.assertEqual(result, None)
Example #12
0
 def test_not_found(self):
     """A binary missing from the given path resolves to None."""
     result = which('shekondar-the-fearsome', self.tmp_dir)
     self.assertEqual(result, None)
Example #13
0
 def test_path_from_environment(self):
     """With no explicit path, the binary is located through $PATH."""
     with patch.dict(os.environ, PATH=self.tmp_dir):
         result = which('shekondar')
         self.assertEqual(result, self.shekondar_path)
Example #14
0
 def test_explicit_path(self):
     """The binary is located in a directory given via ``path=``."""
     result = which('shekondar', path=self.tmp_dir)
     self.assertEqual(result, self.shekondar_path)
Example #15
0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest import skipIf

from mrjob.examples.mr_words_containing_u_freq_count import \
     MRWordsContainingUFreqCount
from mrjob.util import which

from tests.job import run_job
from tests.sandbox import BasicTestCase


@skipIf(not which('grep'), 'grep command not in path')
class MRWordsContainingUFreqCountTestCase(BasicTestCase):
    def test_empty(self):
        self.assertEqual(run_job(MRWordsContainingUFreqCount(['-r', 'local'])),
                         {})

    def test_the_wheels_on_the_bus(self):
        RAW_INPUT = b"""
        The wheels on the bus go round and round,
        round and round, round and round
        The wheels on the bus go round and round,
        all through the town.
        """

        EXPECTED_OUTPUT = {
            u'bus': 2,