def _create_formatted_totals_row(self, active_rows):
    totals_row = AttrDict({
        'tasks_finished': 0,
        'tasks_total': 0,
        'bytes_downloaded': 0,
        'bytes_total': 0,
        'download_rate': 0,
        'data': defaultdict(lambda: 0, {'time_left': -1})
    })
    for row in active_rows:
        if self.options.jobs:
            totals_row['tasks_finished'] += row.tasks_finished
            totals_row['tasks_total'] += row.tasks_total
        totals_row['bytes_downloaded'] += row.bytes_downloaded or 0
        totals_row['bytes_total'] += row.bytes_total or 0
        totals_row['download_rate'] += row.download_rate or 0
        totals_row.data['time_left'] = max(row.data.get('time_left', -1),
                                           totals_row.data['time_left'])

    formatted_totals_row = defaultdict(lambda: '')
    first_col = self.KEY_FN.keys()[0]
    formatted_totals_row[first_col] = 'Total'
    if self.options.jobs:
        formatted_totals_row.update(self._format_tasks_col(totals_row))
    formatted_totals_row.update(self._make_progress(totals_row, 50))
    return formatted_totals_row
def validate_spec(spec):
    spec = AttrDict.from_dict(get_spec_validator()(spec))

    # post validation steps go here
    assert 'file_id_column' in spec.options
    if spec.options.file_id_column is not None:
        file_id_column = spec['options']['file_id_column']
        if 'columns' not in spec['options']:
            raise V.Invalid(
                'options.columns must be specified if file_id_column is provided',
                path=['options', 'columns'])
        else:
            if file_id_column in spec['options']['columns']:
                raise V.Invalid(
                    'options.columns can not contain the file_id_column, it will be filled in by MemSQL-Loader',
                    path=['options', 'columns'])

    if spec.options.script is not None:
        try:
            shlex.split(spec.options.script)
        except ValueError as e:
            raise V.Invalid('options.script is invalid: %s' % str(e),
                            path=['options', 'script'])

    return spec
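# The options.script check above leans on shlex.split() raising ValueError for
# malformed shell commands, which validate_spec then surfaces as V.Invalid. A
# minimal, stand-alone illustration of that behaviour (the command strings below
# are made-up examples, not part of the loader):
import shlex

shlex.split("gunzip -c")                  # well-formed: ['gunzip', '-c']
try:
    shlex.split("awk '{print $1")         # unterminated quote
except ValueError as e:
    print("rejected: %s" % e)             # e.g. "No closing quotation"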
class Downloader(threading.Thread):
    def __init__(self):
        super(Downloader, self).__init__()
        self.logger = log.get_logger('downloader')

        self._error = None
        self._tb = None
        self._should_exit = False

        self._last_size = -1
        self._last_download_time = 0

    def terminate(self):
        self._should_exit = True

    @property
    def error(self):
        return self._error

    @property
    def traceback(self):
        return self._tb

    def load(self, job, task, fifo):
        self.job = job
        self.task = task
        self.fifo = fifo
        self.key = None
        self.script_proc = None
        self.decompress_obj = None
        self.pycurl_callback_exception = None

        if task.data['scheme'] == 's3':
            self.is_anonymous = job.spec.source.aws_access_key is None or job.spec.source.aws_secret_key is None
            if self.is_anonymous:
                s3_conn = S3Connection(anon=True)
            else:
                s3_conn = S3Connection(job.spec.source.aws_access_key,
                                       job.spec.source.aws_secret_key)
            bucket = s3_conn.get_bucket(task.data['bucket'])
            try:
                self.key = bucket.get_key(task.data['key_name'])
            except S3ResponseError as e:
                raise WorkerException(
                    "Received %s %s accessing `%s`, aborting"
                    % (e.status, e.reason, task.data['key_name']))
        elif task.data['scheme'] == 'hdfs':
            fname = task.data['key_name']
            client = PyWebHdfsClient(job.spec.source.hdfs_host,
                                     job.spec.source.webhdfs_port,
                                     user_name=job.spec.source.hdfs_user)
            try:
                filesize = client.get_file_dir_status(fname)['FileStatus']['length']
            except pywebhdfs.errors.FileNotFound:
                raise WorkerException("File '%s' does not exist on HDFS" % fname)
            self.key = AttrDict({'name': fname, 'size': filesize})
        elif task.data['scheme'] == 'file':
            globber = glob2.Globber()
            fname = globber._normalize_string(task.data['key_name'])
            if not os.path.exists(fname):
                raise WorkerException("File '%s' does not exist on this filesystem" % fname)
            elif not os.path.isfile(fname):
                raise WorkerException("File '%s' exists, but is not a file" % fname)
            self.key = AttrDict({'name': fname, 'size': os.path.getsize(fname)})
        else:
            raise WorkerException('Unsupported job with paths: %s'
                                  % [str(p) for p in self.job.paths])

        if self.key is None:
            raise WorkerException('Failed to find key associated with task ID %s' % task.task_id)

        self.metrics = DownloadMetrics(self.key.size)

    def run(self):
        try:
            try:
                # This is at the top so that any exceptions that occur will
                # emit a KILL QUERY due to fifo.open().
                # If we are piping through a script, the fifo should block
                # because the downloader is polling the script's stdin instead
                # of the fifo.
                blocking = self.job.spec.options.script is not None
                with self.fifo.open(blocking=blocking) as target_file:
                    # allocate a URL for the target file
                    if self.task.data['scheme'] == 's3':
                        if self.is_anonymous:
                            key_url = 'http://%(bucket)s.s3.amazonaws.com/%(path)s' % {
                                'bucket': self.key.bucket.name,
                                'path': self.key.name.encode('utf-8')
                            }
                        else:
                            key_url = self.key.generate_url(expires_in=3600)
                    elif self.task.data['scheme'] == 'hdfs':
                        host = self.job.spec.source.hdfs_host
                        port = self.job.spec.source.webhdfs_port
                        hdfs_user = self.job.spec.source.hdfs_user
                        key_name = self.key.name
                        key_url = webhdfs.get_webhdfs_url(
                            host, port, hdfs_user, 'OPEN', key_name)
                    elif self.task.data['scheme'] == 'file':
                        key_url = 'file://%(path)s' % {'path': self.key.name}
                    else:
                        assert False, 'Unsupported job with paths: %s' % [
                            str(p) for p in self.job.paths]

                    self._curl = curl = pycurl.Curl()
                    curl.setopt(pycurl.URL, key_url)
                    curl.setopt(pycurl.NOPROGRESS, 0)
                    curl.setopt(pycurl.PROGRESSFUNCTION, self._progress)
                    curl.setopt(pycurl.SSL_VERIFYPEER, 0)
                    curl.setopt(pycurl.SSL_VERIFYHOST, 0)
                    curl.setopt(pycurl.CONNECTTIMEOUT, 30)

                    if self.job.spec.options.script is not None:
                        self.script_proc = subprocess.Popen(
                            ["/bin/bash", "-c", self.job.spec.options.script],
                            stdout=target_file.fileno(),
                            stdin=subprocess.PIPE)

                        # Check that the script hasn't errored before downloading.
                        # NOTE: we wait here so that we can check if a script exits
                        # prematurely; if this is the case, we fail the job without
                        # requeueing.
                        time.sleep(1)
                        if self.script_proc.poll() is not None:
                            self.logger.error(
                                'Script `%s` exited prematurely with return code %d'
                                % (self.job.spec.options.script, self.script_proc.returncode))
                            raise WorkerException(
                                'Script `%s` exited prematurely with return code %d'
                                % (self.job.spec.options.script, self.script_proc.returncode))

                        # If we're piping data into a script and this file is
                        # a gzipped file, we'll decompress the data ourselves
                        # before piping it into the script.
                        if self.task.data['key_name'].endswith('.gz'):
                            # Setting the window bits during decompression to
                            # zlib.MAX_WBITS | 32 tells the zlib library to
                            # automatically detect gzip headers.
                            self.decompress_obj = zlib.decompressobj(zlib.MAX_WBITS | 32)

                        curl.setopt(pycurl.WRITEFUNCTION,
                                    self._write_to_fifo(self.script_proc.stdin))
                    else:
                        curl.setopt(pycurl.WRITEFUNCTION,
                                    self._write_to_fifo(target_file))

                    if self.task.data['scheme'] == 'hdfs':
                        curl.setopt(pycurl.FOLLOWLOCATION, True)

                    self.logger.info('Starting download')
                    with self.task.protect():
                        self.task.start_step('download')

                    try:
                        curl.perform()

                        status_code = curl.getinfo(pycurl.HTTP_CODE)
                        # Catch HTTP client errors, e.g. 404:
                        if status_code >= 400 and status_code < 500:
                            raise WorkerException('HTTP status code %s for file %s'
                                                  % (status_code, self.key.name))

                        # If we're piping data through a script, catch timeouts and return codes
                        if self.script_proc is not None:
                            self.script_proc.stdin.close()
                            for i in range(SCRIPT_EXIT_TIMEOUT):
                                if self.script_proc.poll() is not None:
                                    break
                                time.sleep(1)
                            else:
                                self.logger.error('Script `%s` failed to exit...killing'
                                                  % self.job.spec.options.script)
                                self.script_proc.kill()
                                raise WorkerException(
                                    'Script `%s` failed to exit after %d seconds'
                                    % (self.job.spec.options.script, SCRIPT_EXIT_TIMEOUT))

                            if self.script_proc.returncode != 0:
                                self.logger.error(
                                    'Script `%s` exited with return code %d'
                                    % (self.job.spec.options.script, self.script_proc.returncode))
                                raise WorkerException(
                                    'Script `%s` exited with return code %d'
                                    % (self.job.spec.options.script, self.script_proc.returncode))
                    finally:
                        with self.task.protect():
                            self.task.stop_step('download')

                        if self.script_proc is not None and self.script_proc.returncode is None:
                            try:
                                self.script_proc.kill()
                            except OSError as e:
                                self.logger.warn("Failed to kill script `%s`: %s"
                                                 % (self.job.spec.options.script, str(e)))
            except pycurl.error as e:
                errno = e.args[0]
                if errno in (pycurl.E_WRITE_ERROR, pycurl.E_ABORTED_BY_CALLBACK):
                    if self.pycurl_callback_exception is not None:
                        raise self.pycurl_callback_exception
                    elif self._should_exit:
                        self.logger.warn('Download failed...requeueing')
                        # Caught by the outer `except Exception as e`
                        raise RequeueTask()
                # Caught by the outer `except pycurl.error as e`
                raise
        except pycurl.error as e:
            errno = e.args[0]
            self._set_error(ConnectionException(
                'libcurl error #%d. Lookup error here: '
                'http://curl.haxx.se/libcurl/c/libcurl-errors.html' % errno))
        except IOError as e:
            # This is raised sometimes instead of a pycurl error
            self._set_error(ConnectionException('IOError: %s (%d)' % (e.args[1], e.args[0])))
        except Exception as e:
            self._set_error(e)
        except KeyboardInterrupt:
            pass
        finally:
            self.logger.info('Finished downloading')

    def _set_error(self, err):
        self._error = err
        self._tb = sys.exc_info()[2]
        self.logger.debug("Downloader failed: %s." % (err), exc_info=True)

    def _progress(self, dltotal, dlnow, ultotal, ulnow):
        self.metrics.accumulate_bytes(dlnow)
        if self._should_exit or time.time() > self.metrics.last_change + DOWNLOAD_TIMEOUT:
            return 1

    def _write_to_fifo(self, target_file):
        def _write_to_fifo_helper(data):
            to_write = data
            try:
                if self.decompress_obj is not None:
                    to_write = self.decompress_obj.decompress(to_write)

                while len(to_write) > 0:
                    # First step is to wait until we can write to the FIFO.
                    #
                    # Wait for half of the download timeout for the FIFO to become open
                    # for writing. While we're doing this, ping the download metrics
                    # so that the worker doesn't assume this download has hung.
                    is_writable = False
                    while not is_writable:
                        self.metrics.ping()
                        timeout = DOWNLOAD_TIMEOUT / 2
                        _, writable_objects, _ = select.select(
                            [], [target_file], [], timeout)
                        is_writable = bool(writable_objects)

                    # Then, we write as much as we can within this opportunity to write
                    written_bytes = os.write(target_file.fileno(), to_write)
                    assert written_bytes >= 0, "Expect os.write() to return non-negative numbers"
                    to_write = to_write[written_bytes:]
            except zlib.error as e:
                self.terminate()
                # pycurl will just raise pycurl.error if this function
                # raises an exception, so we also need to set the exception
                # on the Downloader object so that we can check it and
                # re-raise it above.
                self.pycurl_callback_exception = WorkerException(
                    'Could not decompress data: %s' % str(e))
                raise self.pycurl_callback_exception
            except OSError as e:
                if self.script_proc is not None and self.script_proc.poll() is not None:
                    self.terminate()
                    self.pycurl_callback_exception = WorkerException(
                        'Script `%s` exited during download with return code %d'
                        % (self.job.spec.options.script, self.script_proc.returncode))
                    raise self.pycurl_callback_exception
                else:
                    raise

        return _write_to_fifo_helper
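# The .gz handling in run() relies on zlib auto-detecting gzip headers when the
# window-bits value is ORed with 32, so each downloaded chunk can be inflated
# before it is piped to the script. A small self-contained sketch of that
# streaming pattern, using synthetic data rather than a real download:
import zlib

# Build a gzip-wrapped payload (wbits = MAX_WBITS | 16 writes a gzip header).
compressor = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS | 16)
gzipped = compressor.compress(b"hello world\n" * 1000) + compressor.flush()

# MAX_WBITS | 32 lets the decompressor accept either zlib or gzip streams.
decompress_obj = zlib.decompressobj(zlib.MAX_WBITS | 32)
output = b""
for start in range(0, len(gzipped), 4096):        # feed the stream chunk by chunk
    output += decompress_obj.decompress(gzipped[start:start + 4096])
output += decompress_obj.flush()

assert output == b"hello world\n" * 1000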
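# The write loop in _write_to_fifo_helper pairs select() with os.write() so that
# a slow reader on the other end of the FIFO never wedges the download thread,
# and partial writes are retried with the remaining bytes. A rough, self-contained
# sketch of the same pattern, using an ordinary non-blocking pipe in place of the
# named FIFO (the names, sizes, and timeout here are illustrative only):
import fcntl
import os
import select

read_fd, write_fd = os.pipe()
flags = fcntl.fcntl(write_fd, fcntl.F_GETFL)
fcntl.fcntl(write_fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)

to_write = b"x" * 200000                       # more than one pipe buffer's worth
while to_write:
    # Wait up to 5 seconds for the pipe to accept more data; the loader pings its
    # download metrics at this point so the worker does not assume a hang.
    _, writable, _ = select.select([], [write_fd], [], 5)
    if not writable:
        continue
    written = os.write(write_fd, to_write)     # may be a partial write
    to_write = to_write[written:]
    os.read(read_fd, 65536)                    # demo only: drain the reader side

os.close(write_fd)
os.close(read_fd)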