Example #1
0
def _syslog_to_stderr_path(path):
    """Map a syslog path/uri to the path of the matching stderr log.

    The stderr log keeps whatever extension the syslog has, so a gzipped
    syslog (/path/to/syslog.gz) maps to a gzipped stderr
    (/path/to/stderr.gz).
    """
    dirname, basename = posixpath.split(path)
    return posixpath.join(dirname, 'stderr' + file_ext(basename))
Example #2
0
def _syslog_to_stderr_path(path):
    """Return the path/uri of the stderr log that sits beside *path*.

    The stderr log is assumed to share the syslog's extension; e.g. a
    gzipped syslog (/path/to/syslog.gz) implies /path/to/stderr.gz.
    """
    parent, name = posixpath.split(path)
    ext = file_ext(name)
    return posixpath.join(parent, 'stderr' + ext)
Example #3
0
 def filter_path(path):
     """Return False for files we never want to pick up: compiled
     Python files, emacs droppings, and MacFuse resource forks."""
     name = os.path.basename(path)
     # compiled Python files
     if file_ext(name).lower() in ('.pyc', '.pyo'):
         return False
     # emacs backup files
     if name.endswith('~'):
         return False
     # emacs lock files
     if name.startswith('.#'):
         return False
     # MacFuse resource forks
     if name.startswith('._'):
         return False
     return True
Example #4
0
 def filter_path(path):
     """Filter out compiled Python files, editor droppings, and
     MacFuse resource forks."""
     name = os.path.basename(path)
     is_junk = (
         file_ext(name).lower() in ('.pyc', '.pyo') or
         name.endswith('~') or      # emacs backup files
         name.startswith('.#') or   # emacs lock files
         name.startswith('._'))     # MacFuse resource forks
     return not is_junk
Example #5
0
    def _unarchive_file(self, path, dest):
        """Unpack the archive at *path* into the directory *dest*.

        :raises ValueError: if the archive's extension isn't one we
                            know how to unarchive.
        """
        path = os.path.abspath(path)

        # pick the unarchive command based on the file's extension
        args = HOW_TO_UNARCHIVE.get(file_ext(path))
        if not args:
            raise ValueError("Don't know how to unarchive %s" % path)

        log.debug('unarchiving %s -> %s' % (path, dest))
        self.mkdir(dest)
        check_call(args + [path], cwd=dest)
Example #6
0
def name_uniquely(path,
                  names_taken=(),
                  proposed_name=None,
                  unhide=False,
                  strip_ext=False,
                  suffix=''):
    """Come up with a unique name for *path*.

    :param names_taken: a dictionary or set of names not to use.
    :param proposed_name: name to use if it is not taken. If this is not set,
                          we propose a name based on the filename.
    :param unhide: make sure final name doesn't start with periods or
                   underscores
    :param strip_ext: if we propose a name, it shouldn't have a file extension
    :param suffix: if set to a string, add this to the end of any filename
                    we propose. Should include the ``.``.

    If the proposed name is taken, we add a number to the end of the
    filename, keeping the extension the same. For example:

    >>> name_uniquely('foo.txt', {'foo.txt'})
    'foo-1.txt'
    >>> name_uniquely('bar.tar.gz', {'bar'}, strip_ext=True)
    'bar-1'
    """
    filename = proposed_name or os.path.basename(path.rstrip('/' + os.sep))
    ext = file_ext(filename)
    # `or None` keeps the whole name when there's no extension
    # (filename[:-0] would be '')
    prefix = filename[:-len(ext) or None]

    if strip_ext and not proposed_name:
        ext = ''

    if suffix and not proposed_name:
        ext += suffix

    if unhide:
        # strip any mix of leading periods and underscores; the previous
        # lstrip('.').lstrip('_') missed interleaved prefixes like
        # '_._foo' (it would leave '._foo', still a hidden name)
        prefix = prefix.lstrip('._')

    # is our proposed name taken?
    name = prefix + ext
    if prefix and name not in names_taken:
        return name

    # add 1, 2, etc. to the name until it's not taken
    for i in itertools.count(1):
        if prefix:
            name = '%s-%d%s' % (prefix, i, ext)
        else:
            # if no prefix is left (due to empty filename or unhiding)
            # just use numbers; don't start filenames with '-'
            name = '%d%s' % (i, ext)
        if name not in names_taken:
            return name
Example #7
0
 def test_file_ext(self):
     """file_ext() keeps everything from the first real dot onward."""
     cases = [
         ('foo.zip', '.zip'),
         ('foo.Z', '.Z'),
         ('foo.tar.gz', '.tar.gz'),
         ('README', ''),
         ('README,v', ''),
         ('README.txt,v', '.txt,v'),
     ]
     for filename, expected in cases:
         self.assertEqual(file_ext(filename), expected)
Example #8
0
 def test_file_ext(self):
     """Extensions include every dotted suffix; no dot means no ext."""
     # names with no extension at all
     self.assertEqual(file_ext('README'), '')
     self.assertEqual(file_ext('README,v'), '')
     # multi-part and unusual extensions are kept whole
     self.assertEqual(file_ext('README.txt,v'), '.txt,v')
     self.assertEqual(file_ext('foo.zip'), '.zip')
     self.assertEqual(file_ext('foo.Z'), '.Z')
     self.assertEqual(file_ext('foo.tar.gz'), '.tar.gz')
Example #9
0
 def test_file_ext(self):
     """Check file_ext() on archives, versioned files, and bare names."""
     expected = {
         "foo.zip": ".zip",
         "foo.Z": ".Z",
         "foo.tar.gz": ".tar.gz",
         "README": "",
         "README,v": "",
         "README.txt,v": ".txt,v",
     }
     for name, ext in expected.items():
         self.assertEqual(file_ext(name), ext)
Example #10
0
 def filter_path(path):
     """Return True unless *path* is a compiled Python file, an emacs
     backup/lock file, or a MacFuse resource fork."""
     name = os.path.basename(path)
     # compiled Python files
     if file_ext(name).lower() in (".pyc", ".pyo"):
         return False
     # emacs backup files end with ~; emacs lock files start with .#;
     # MacFuse resource forks start with ._
     if name.endswith("~") or name.startswith((".#", "._")):
         return False
     return True
def parse_doc_filename(input_uri):
    """Parse a filename like ``some_id-cat1-cat2-not_cat3.txt`` into
    ``dict(id='some_id', cats=dict(cat1=True, cat2=True, cat3=False))``
    """
    # get filename without extension. The `or None` matters: when there
    # is no extension, len(ext) is 0 and name_with_ext[:-0] would be ''
    # (silently dropping the whole name) instead of the full name
    name_with_ext = posixpath.basename(input_uri)
    ext = file_ext(name_with_ext)
    name = name_with_ext[:-len(ext) or None]

    parts = name.split('-')

    # first part is the document ID; the rest are category flags
    doc_id = parts[0]
    cats = {}

    for part in parts[1:]:
        if part.startswith('not_'):
            # a 'not_' prefix marks the category as explicitly False
            cats[part[4:]] = False
        else:
            cats[part] = True

    return dict(id=doc_id, cats=cats)
Example #12
0
    def _master_bootstrap_script_content(self, bootstrap):
        """Return a list containing the lines of the master bootstrap script.
        (without trailing newlines)

        :param bootstrap: a list of commands, where each command is a list
                          of tokens; a token is either a raw string of
                          shell script, or a path dict that is resolved to
                          a name via ``self._bootstrap_dir_mgr.name()``.

        Raises KeyError if an archive to download has an extension not
        listed in ``_EXT_TO_UNARCHIVE_CMD``.
        """
        out = []

        # shebang, precommands
        out.extend(self._start_of_sh_script())
        out.append('')

        # for example, create a tmp dir and cd to it
        if self._bootstrap_pre_commands():
            out.extend(self._bootstrap_pre_commands())
            out.append('')

        # store $PWD
        out.append('# store $PWD')
        out.append('__mrjob_PWD=$PWD')
        out.append('')

        # special case for PWD being in /, which happens on Dataproc
        # (really we should cd to tmp or something)
        out.append('if [ $__mrjob_PWD = "/" ]; then')
        out.append('  __mrjob_PWD=""')
        out.append('fi')
        out.append('')

        # run commands in a block so we can redirect stdout to stderr
        # (e.g. to catch errors from compileall). See #370
        out.append('{')

        # download files
        out.append('  # download files and mark them executable')

        # shell command used to copy an uploaded file to the local FS
        cp_to_local = self._cp_to_local_cmd()

        # TODO: why bother with $__mrjob_PWD here, since we're already in it?
        for name, path in sorted(
                self._bootstrap_dir_mgr.name_to_path('file').items()):
            uri = self._upload_mgr.uri(path)
            out.append('  %s %s $__mrjob_PWD/%s' %
                       (cp_to_local, pipes.quote(uri), pipes.quote(name)))
            # imitate Hadoop Distributed Cache (see #1602)
            out.append('  chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name))
            out.append('')

        # download and unarchive archives
        archive_names_and_paths = sorted(
            self._bootstrap_dir_mgr.name_to_path('archive').items())
        if archive_names_and_paths:
            # make tmp dir if needed
            out.append('  # download and unpack archives')
            out.append('  __mrjob_TMP=$(mktemp -d)')
            out.append('')

            for name, path in archive_names_and_paths:
                uri = self._upload_mgr.uri(path)
                # the archive's extension picks the unarchive command below
                ext = file_ext(basename(path))

                # copy file to tmp dir
                quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote(name)

                out.append(
                    '  %s %s %s' %
                    (cp_to_local, pipes.quote(uri), quoted_archive_path))

                # unarchive file
                if ext not in _EXT_TO_UNARCHIVE_CMD:
                    raise KeyError('unknown archive file extension: %s' % path)
                unarchive_cmd = _EXT_TO_UNARCHIVE_CMD[ext]

                # unarchive_cmd is a %-format template with 'file' and
                # 'dir' placeholders
                out.append('  ' + unarchive_cmd %
                           dict(file=quoted_archive_path,
                                dir='$__mrjob_PWD/' + pipes.quote(name)))

                # imitate Hadoop Distributed Cache (see #1602)
                out.append('  chmod u+rx -R $__mrjob_PWD/%s' %
                           pipes.quote(name))

                out.append('')

        # run bootstrap commands
        out.append('  # bootstrap commands')
        for cmd in bootstrap:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = '  '
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._bootstrap_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token
            out.append(line)

        out.append('} 1>&2')  # stdout -> stderr for ease of error log parsing

        return out
Example #13
0
    def _master_bootstrap_script_content(self, bootstrap):
        """Return a list containing the lines of the master bootstrap script.
        (without trailing newlines)

        :param bootstrap: a list of commands, where each command is a list
                          of tokens; a token is either a raw string of
                          shell script, or a path dict that is resolved to
                          a name via ``self._bootstrap_dir_mgr.name()``.

        Raises KeyError if an archive to download has an extension not
        listed in ``_EXT_TO_UNARCHIVE_CMD``.
        """
        out = []

        # shebang, precommands
        out.extend(self._start_of_sh_script())
        out.append('')

        # store $PWD
        out.append('# store $PWD')
        out.append('__mrjob_PWD=$PWD')
        out.append('')

        # special case for PWD being in /, which happens on Dataproc
        # (really we should cd to tmp or something)
        out.append('if [ $__mrjob_PWD = "/" ]; then')
        out.append('  __mrjob_PWD=""')
        out.append('fi')
        out.append('')

        # run commands in a block so we can redirect stdout to stderr
        # (e.g. to catch errors from compileall). See #370
        out.append('{')

        # download files
        out.append('  # download files and mark them executable')

        # shell command used to copy an uploaded file to the local FS
        cp_to_local = self._cp_to_local_cmd()

        # TODO: why bother with $__mrjob_PWD here, since we're already in it?
        for name, path in sorted(
                self._bootstrap_dir_mgr.name_to_path('file').items()):
            uri = self._upload_mgr.uri(path)
            out.append('')
            out.append('  %s %s $__mrjob_PWD/%s' %
                       (cp_to_local, pipes.quote(uri), pipes.quote(name)))
            # imitate Hadoop Distributed Cache (see #1602)
            out.append('  chmod u+rx $__mrjob_PWD/%s' % pipes.quote(name))
        out.append('')

        # download and unarchive archives
        archive_names_and_paths = sorted(
            self._bootstrap_dir_mgr.name_to_path('archive').items())
        if archive_names_and_paths:
            # make tmp dir if needed
            out.append('  # download and unpack archives')
            out.append('  __mrjob_TMP=$(mktemp -d)')
            out.append('')

            for name, path in archive_names_and_paths:
                uri = self._upload_mgr.uri(path)
                # the archive's extension picks the unarchive command below
                ext = file_ext(basename(path))

                # copy file to tmp dir
                quoted_archive_path = '$__mrjob_TMP/%s' % pipes.quote(name)

                out.append('  %s %s %s' % (
                    cp_to_local, pipes.quote(uri), quoted_archive_path))

                # unarchive file
                if ext not in _EXT_TO_UNARCHIVE_CMD:
                    raise KeyError('unknown archive file extension: %s' % path)
                unarchive_cmd = _EXT_TO_UNARCHIVE_CMD[ext]

                # unarchive_cmd is a %-format template with 'file' and
                # 'dir' placeholders
                out.append('  ' + unarchive_cmd % dict(
                    file=quoted_archive_path,
                    dir='$__mrjob_PWD/' + pipes.quote(name)))

                # imitate Hadoop Distributed Cache (see #1602)
                out.append(
                    '  chmod u+rx -R $__mrjob_PWD/%s' % pipes.quote(name))

                out.append('')

        # run bootstrap commands
        out.append('  # bootstrap commands')
        for cmd in bootstrap:
            # reconstruct the command line, substituting $__mrjob_PWD/<name>
            # for path dicts
            line = '  '
            for token in cmd:
                if isinstance(token, dict):
                    # it's a path dictionary
                    line += '$__mrjob_PWD/'
                    line += pipes.quote(self._bootstrap_dir_mgr.name(**token))
                else:
                    # it's raw script
                    line += token
            out.append(line)

        out.append('} 1>&2')  # stdout -> stderr for ease of error log parsing

        return out
Example #14
0
 def ignore_initial_dots(self):
     """Leading dots don't count toward the extension.

     NOTE(review): this method's name lacks the ``test_`` prefix, so
     unittest's default collector will not run it — presumably it should
     be ``test_ignore_initial_dots``; confirm and rename.
     """
     self.assertEqual(file_ext('.emacs'), '')
     self.assertEqual(file_ext('.mrjob.conf'), '.conf')
     self.assertEqual(file_ext('...dots.txt'), '.txt')