def _find_hadoop_bin(self):
    """Look for the hadoop binary in any plausible place. If all
    else fails, return ``['hadoop']``.
    """
    def yield_paths():
        for name in 'HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL':
            path = os.environ.get(name)
            if path:
                yield os.path.join(path, 'bin')

        # They use $HADOOP_INSTALL/hadoop/bin here:
        # https://wiki.apache.org/hadoop/GettingStartedWithHadoop
        if os.environ.get('HADOOP_INSTALL'):
            yield os.path.join(
                os.environ['HADOOP_INSTALL'], 'hadoop', 'bin')

        yield None  # use $PATH

        # Maybe it's in $HADOOP_MAPRED_HOME? $HADOOP_YARN_HOME? Don't
        # give up. Don't worry about duplicates; they're de-duplicated
        # below
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield os.path.join(path, 'bin')

    for path in unique(yield_paths()):
        log.info('Looking for hadoop binary in %s...' % (path or '$PATH'))

        hadoop_bin = which('hadoop', path=path)

        if hadoop_bin:
            log.info('Found hadoop binary: %s' % hadoop_bin)
            return [hadoop_bin]
    else:
        # for/else: the loop never breaks, so this runs whenever no
        # candidate path yielded a binary
        log.info("Falling back to 'hadoop'")
        return ['hadoop']
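# unique() above comes from mrjob.util; the "de-duplicated below" comment
# implies an order-preserving de-duplicator. A minimal sketch of that
# behavior (not necessarily the library's exact implementation):
def unique(items):
    """Yield items in first-seen order, skipping any already seen."""
    seen = set()
    for item in items:
        if item not in seen:
            seen.add(item)
            yield item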
def _find_spark_submit_bin(self):
    # TODO: this is very similar to _find_hadoop_bin() (in fs)
    for path in unique(self._spark_submit_bin_dirs()):
        log.info('Looking for spark-submit binary in %s...' % (
            path or '$PATH'))

        spark_submit_bin = which('spark-submit', path=path)

        if spark_submit_bin:
            log.info('Found spark-submit binary: %s' % spark_submit_bin)
            return [spark_submit_bin]
    else:
        log.info("Falling back to 'spark-submit'")
        return ['spark-submit']
def _find_spark_submit_bin(self):
    """Attempt to find the spark binary. Returns a list of arguments.
    Defaults to ``['spark-submit']``.

    Re-define this in your subclass if you already know where
    to find spark-submit (e.g. on cloud services).
    """
    for path in unique(self._spark_submit_bin_dirs()):
        log.info('Looking for spark-submit binary in %s...' % (
            path or '$PATH'))

        spark_submit_bin = which('spark-submit', path=path)

        if spark_submit_bin:
            log.info('Found spark-submit binary: %s' % spark_submit_bin)
            return [spark_submit_bin]
    else:
        log.info("Falling back to 'spark-submit'")
        return ['spark-submit']
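# Both versions of _find_spark_submit_bin() iterate over
# self._spark_submit_bin_dirs(), which isn't shown here. A plausible
# sketch, mirroring the $HADOOP_*-style search above (the specific
# directories below are assumptions, not the library's actual list):
def _spark_submit_bin_dirs(self):
    # $SPARK_HOME takes precedence if it's set
    spark_home = os.environ.get('SPARK_HOME')
    if spark_home:
        yield os.path.join(spark_home, 'bin')

    yield None  # fall back to searching $PATH

    # a common system-wide install location (illustrative only);
    # duplicates are harmless since the caller wraps this in unique()
    yield '/usr/lib/spark/bin'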
def _hadoop_prefix_from_bin(hadoop_bin):
    """Given a path to the hadoop binary, return the path of the
    implied hadoop home, or None if we don't know.

    Don't return the parent directory of directories in the default
    path (not ``/``, ``/usr``, or ``/usr/local``).
    """
    # resolve unqualified binary name (relative paths are okay)
    if '/' not in hadoop_bin:
        hadoop_bin = which(hadoop_bin)
        if not hadoop_bin:
            return None

    # use parent of hadoop_bin's directory
    hadoop_home = posixpath.abspath(posixpath.join(
        posixpath.realpath(posixpath.dirname(hadoop_bin)), '..'))

    if hadoop_home in _BAD_HADOOP_HOMES:
        return None

    return hadoop_home
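# The docstring above pins down the directories _hadoop_prefix_from_bin()
# refuses to treat as a hadoop home, so the constant it checks presumably
# looks like this:
_BAD_HADOOP_HOMES = ['/', '/usr', '/usr/local']

# For example (assuming no symlinks), /opt/hadoop/bin/hadoop implies a
# home of /opt/hadoop, while /usr/bin/hadoop would imply /usr, which is
# rejected:
#
#   _hadoop_prefix_from_bin('/opt/hadoop/bin/hadoop')  # -> '/opt/hadoop'
#   _hadoop_prefix_from_bin('/usr/bin/hadoop')         # -> None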
def _find_hadoop_bin(self):
    """Look for the hadoop binary in any plausible place. If all
    else fails, return ``['hadoop']``.
    """
    def yield_paths():
        if self._hadoop_home:
            yield os.path.join(self._hadoop_home, 'bin')

        for name in 'HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL':
            path = os.environ.get(name)
            if path:
                yield os.path.join(path, 'bin')

        # They use $HADOOP_INSTALL/hadoop/bin here:
        # https://wiki.apache.org/hadoop/GettingStartedWithHadoop
        if os.environ.get('HADOOP_INSTALL'):
            yield os.path.join(
                os.environ['HADOOP_INSTALL'], 'hadoop', 'bin')

        yield None  # use $PATH

        # Maybe it's in $HADOOP_MAPRED_HOME? $HADOOP_YARN_HOME? Don't
        # give up. Don't worry about duplicates; they're de-duplicated
        # below
        for name, path in sorted(os.environ.items()):
            if name.startswith('HADOOP_') and name.endswith('_HOME'):
                yield os.path.join(path, 'bin')

    for path in unique(yield_paths()):
        log.info('Looking for hadoop binary in %s...' % (path or '$PATH'))

        hadoop_bin = which('hadoop', path=path)

        if hadoop_bin:
            log.info('Found hadoop binary: %s' % hadoop_bin)
            return [hadoop_bin]
    else:
        # for/else: the loop never breaks, so this runs whenever no
        # candidate path yielded a binary
        log.info("Falling back to 'hadoop'")
        return ['hadoop']
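# which() comes from mrjob.util. The test_no_path() test below suggests
# it wraps distutils.spawn.find_executable() and guards against a missing
# $PATH on Python 2. A minimal sketch under that assumption:
from distutils.spawn import find_executable


def which(cmd, path=None):
    """Return the full path to *cmd*, or None if it can't be found.

    Searches the directories in *path* (a $PATH-style string) if given,
    otherwise $PATH itself.
    """
    try:
        return find_executable(cmd, path=path)
    except KeyError:
        # on Python 2, find_executable() reads os.environ['PATH'] when
        # path is None, which raises KeyError if $PATH isn't set
        return None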
def test_no_path(self):
    with patch.dict(os.environ, clear=True):
        # make sure we protect find_executable() from missing $PATH
        # on Python 2.
        self.assertEqual(which('shekondar'), None)
def test_not_found(self):
    self.assertEqual(which('shekondar-the-fearsome', self.tmp_dir), None)
def test_path_from_environment(self):
    with patch.dict(os.environ, PATH=self.tmp_dir):
        self.assertEqual(which('shekondar'), self.shekondar_path)
def test_explicit_path(self):
    self.assertEqual(which('shekondar', path=self.tmp_dir),
                     self.shekondar_path)
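# The tests above use self.tmp_dir and self.shekondar_path without their
# setUp() being shown; a plausible sketch ('shekondar' is just the dummy
# executable name the tests look for, and the class name in super() is
# assumed):
def setUp(self):
    import os
    import shutil
    import tempfile

    super(WhichTestCase, self).setUp()  # assumed class name

    # a scratch dir that's cleaned up after each test
    self.tmp_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, self.tmp_dir)

    # an executable dummy script for which() to find
    self.shekondar_path = os.path.join(self.tmp_dir, 'shekondar')
    with open(self.shekondar_path, 'w') as f:
        f.write('#!/bin/sh\n')
    os.chmod(self.shekondar_path, 0o755)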
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest import skipIf

from mrjob.examples.mr_words_containing_u_freq_count import \
    MRWordsContainingUFreqCount
from mrjob.util import which

from tests.job import run_job
from tests.sandbox import BasicTestCase


@skipIf(not which('grep'), 'grep command not in path')
class MRWordsContainingUFreqCountTestCase(BasicTestCase):

    def test_empty(self):
        self.assertEqual(
            run_job(MRWordsContainingUFreqCount(['-r', 'local'])), {})

    def test_the_wheels_on_the_bus(self):
        RAW_INPUT = b"""
        The wheels on the bus go round and round,
        round and round, round and round
        The wheels on the bus go round and round,
        all through the town.
        """

        EXPECTED_OUTPUT = {
            u'bus': 2,