Example #1
    def run(self):
        try:
            try:
                # This is at the top so that any exceptions that occur will
                # emit a KILL QUERY due to fifo.open()

                # if we are piping through a script, the fifo should block
                # because the downloader is polling the script's stdin instead
                # of the fifo
                blocking = self.job.spec.options.script is not None
                with self.fifo.open(blocking=blocking) as target_file:
                    # allocate a URL for the target file
                    if self.task.data['scheme'] == 's3':
                        if self.is_anonymous:
                            key_url = 'http://%(bucket)s.s3.amazonaws.com/%(path)s' % {
                                'bucket': self.key.bucket.name,
                                'path': self.key.name.encode('utf-8')
                            }
                        else:
                            key_url = self.key.generate_url(expires_in=3600)
                    elif self.task.data['scheme'] == 'hdfs':
                        host = self.job.spec.source.hdfs_host
                        port = self.job.spec.source.webhdfs_port
                        hdfs_user = self.job.spec.source.hdfs_user
                        key_name = self.key.name
                        key_url = webhdfs.get_webhdfs_url(
                            host, port, hdfs_user, 'OPEN', key_name)
                    elif self.task.data['scheme'] == 'file':
                        key_url = 'file://%(path)s' % {'path': self.key.name}
                    else:
                        assert False, 'Unsupported job with paths: %s' % [str(p) for p in self.job.paths]

                    self._curl = curl = pycurl.Curl()
                    curl.setopt(pycurl.URL, key_url)
                    curl.setopt(pycurl.NOPROGRESS, 0)
                    curl.setopt(pycurl.PROGRESSFUNCTION, self._progress)
                    curl.setopt(pycurl.SSL_VERIFYPEER, 0)
                    curl.setopt(pycurl.SSL_VERIFYHOST, 0)
                    curl.setopt(pycurl.CONNECTTIMEOUT, 30)

                    if self.job.spec.options.script is not None:
                        self.script_proc = subprocess.Popen(
                            ["/bin/bash", "-c", self.job.spec.options.script],
                            stdout=target_file.fileno(),
                            stdin=subprocess.PIPE)

                        # check that the script hasn't errored before downloading
                        # NOTE: we wait here so that we can detect a script that
                        # exits prematurely; if it does, we fail the job without requeueing
                        time.sleep(1)
                        if self.script_proc.poll() is not None:
                            self.logger.error('Script `%s` exited prematurely with return code %d' % (self.job.spec.options.script, self.script_proc.returncode))
                            raise WorkerException('Script `%s` exited prematurely with return code %d' % (self.job.spec.options.script, self.script_proc.returncode))

                        # If we're piping data into a script and this file is
                        # a gzipped file, we'll decompress the data ourselves
                        # before piping it into the script.
                        if self.task.data['key_name'].endswith('.gz'):
                            # Setting the window bits to zlib.MAX_WBITS | 32
                            # tells the zlib library to automatically detect
                            # gzip headers during decompression.
                            self.decompress_obj = zlib.decompressobj(zlib.MAX_WBITS | 32)

                        curl.setopt(pycurl.WRITEFUNCTION, self._write_to_fifo(self.script_proc.stdin))
                    else:
                        curl.setopt(pycurl.WRITEFUNCTION, self._write_to_fifo(target_file))

                    if self.task.data['scheme'] == 'hdfs':
                        curl.setopt(pycurl.FOLLOWLOCATION, True)

                    self.logger.info('Starting download')
                    with self.task.protect():
                        self.task.start_step('download')

                    try:
                        curl.perform()
                        status_code = curl.getinfo(pycurl.HTTP_CODE)
                        # Catch HTTP client errors, e.g. 404:
                        if 400 <= status_code < 500:
                            raise WorkerException('HTTP status code %s for file %s' % (status_code, self.key.name))

                        # If we're piping data through a script, check for timeouts and nonzero return codes
                        if self.script_proc is not None:
                            self.script_proc.stdin.close()
                            for i in range(SCRIPT_EXIT_TIMEOUT):
                                if self.script_proc.poll() is not None:
                                    break

                                time.sleep(1)
                            else:
                                self.logger.error('Script `%s` failed to exit...killing' % self.job.spec.options.script)
                                self.script_proc.kill()
                                raise WorkerException('Script `%s` failed to exit after %d seconds' % (self.job.spec.options.script, SCRIPT_EXIT_TIMEOUT))

                            if self.script_proc.returncode != 0:
                                self.logger.error('Script `%s` exited with return code %d' % (self.job.spec.options.script, self.script_proc.returncode))
                                raise WorkerException('Script `%s` exited with return code %d' % (self.job.spec.options.script, self.script_proc.returncode))
                    finally:
                        with self.task.protect():
                            self.task.stop_step('download')

                            if self.script_proc is not None and self.script_proc.returncode is None:
                                try:
                                    self.script_proc.kill()
                                except OSError as e:
                                    self.logger.warn("Failed to kill script `%s`: %s" % (self.job.spec.options.script, str(e)))
            except pycurl.error as e:
                errno = e.args[0]
                if errno in (pycurl.E_WRITE_ERROR, pycurl.E_ABORTED_BY_CALLBACK):
                    if self.pycurl_callback_exception is not None:
                        raise self.pycurl_callback_exception
                    elif self._should_exit:
                        self.logger.warn('Download failed...requeueing')
                        # Caught by the outer `except Exception as e`
                        raise RequeueTask()

                # Caught by the outer `except pycurl.error as e`
                raise
        except pycurl.error as e:
            errno = e.args[0]
            self._set_error(ConnectionException('libcurl error #%d. Look up the error here: http://curl.haxx.se/libcurl/c/libcurl-errors.html' % errno))
        except IOError as e:
            # This is raised sometimes instead of a pycurl error
            self._set_error(ConnectionException('IOError: %s (%d)' % (e.args[1], e.args[0])))
        except Exception as e:
            self._set_error(e)
        except KeyboardInterrupt:
            pass
        finally:
            self.logger.info('Finished downloading')
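
The gzip branch above relies on zlib's header auto-detection: passing zlib.MAX_WBITS | 32 as the window-bits argument lets a single decompressobj transparently consume gzip-wrapped input. A minimal standalone sketch of that behavior, independent of the downloader class (gzip.compress here only produces test input):

    import gzip
    import zlib

    payload = b'hello fifo'
    gzipped = gzip.compress(payload)

    # MAX_WBITS | 32 tells zlib to detect and skip the gzip header itself
    decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
    restored = decompressor.decompress(gzipped) + decompressor.flush()
    assert restored == payload
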
Example #2
    def run(self):
        try:
            try:
                # This is at the top so that any exceptions that occur will
                # emit a KILL QUERY due to fifo.open()

                # if we are piping through a script, the fifo should block
                # because the downloader is polling the script's stdin instead
                # of the fifo
                blocking = self.job.spec.options.script is not None
                with self.fifo.open(blocking=blocking) as target_file:
                    # allocate a URL for the target file
                    if self.task.data['scheme'] == 's3':
                        if self.is_anonymous:
                            key_url = 'http://%(bucket)s.s3.amazonaws.com/%(path)s' % {
                                'bucket': self.key.bucket.name,
                                'path': self.key.name.encode('utf-8')
                            }
                        else:
                            key_url = self.key.generate_url(expires_in=3600)
                    elif self.task.data['scheme'] == 'hdfs':
                        host = self.job.spec.source.hdfs_host
                        port = self.job.spec.source.webhdfs_port
                        hdfs_user = self.job.spec.source.hdfs_user
                        key_name = self.key.name
                        key_url = webhdfs.get_webhdfs_url(
                            host, port, hdfs_user, 'OPEN', key_name)
                    elif self.task.data['scheme'] == 'file':
                        key_url = 'file://%(path)s' % {'path': self.key.name}
                    else:
                        assert False, 'Unsupported job with paths: %s' % [
                            str(p) for p in self.job.paths
                        ]

                    self._curl = curl = pycurl.Curl()
                    curl.setopt(pycurl.URL, key_url)
                    curl.setopt(pycurl.NOPROGRESS, 0)
                    curl.setopt(pycurl.PROGRESSFUNCTION, self._progress)
                    curl.setopt(pycurl.SSL_VERIFYPEER, 0)
                    curl.setopt(pycurl.SSL_VERIFYHOST, 0)
                    curl.setopt(pycurl.CONNECTTIMEOUT, 30)

                    if self.job.spec.options.script is not None:
                        self.script_proc = subprocess.Popen(
                            ["/bin/bash", "-c", self.job.spec.options.script],
                            stdout=target_file.fileno(),
                            stdin=subprocess.PIPE)

                        # check that the script hasn't errored before downloading
                        # NOTE: we wait here so that we can detect a script that
                        # exits prematurely; if it does, we fail the job without requeueing
                        time.sleep(1)
                        if self.script_proc.poll() is not None:
                            self.logger.error(
                                'Script `%s` exited prematurely with return code %d'
                                % (self.job.spec.options.script,
                                   self.script_proc.returncode))
                            raise WorkerException(
                                'Script `%s` exited prematurely with return code %d'
                                % (self.job.spec.options.script,
                                   self.script_proc.returncode))

                        # If we're piping data into a script and this file is
                        # a gzipped file, we'll decompress the data ourselves
                        # before piping it into the script.
                        if self.task.data['key_name'].endswith('.gz'):
                            # Setting the window bits to zlib.MAX_WBITS | 32
                            # tells the zlib library to automatically detect
                            # gzip headers during decompression.
                            self.decompress_obj = zlib.decompressobj(
                                zlib.MAX_WBITS | 32)

                        curl.setopt(
                            pycurl.WRITEFUNCTION,
                            self._write_to_fifo(self.script_proc.stdin))
                    else:
                        curl.setopt(pycurl.WRITEFUNCTION,
                                    self._write_to_fifo(target_file))

                    if self.task.data['scheme'] == 'hdfs':
                        curl.setopt(pycurl.FOLLOWLOCATION, True)

                    self.logger.info('Starting download')
                    with self.task.protect():
                        self.task.start_step('download')

                    try:
                        curl.perform()
                        status_code = curl.getinfo(pycurl.HTTP_CODE)
                        # Catch HTTP client errors, e.g. 404:
                        if 400 <= status_code < 500:
                            raise WorkerException(
                                'HTTP status code %s for file %s' %
                                (status_code, self.key.name))

                        # If we're piping data through a script, check for timeouts and nonzero return codes
                        if self.script_proc is not None:
                            self.script_proc.stdin.close()
                            for i in range(SCRIPT_EXIT_TIMEOUT):
                                if self.script_proc.poll() is not None:
                                    break

                                time.sleep(1)
                            else:
                                self.logger.error(
                                    'Script `%s` failed to exit...killing' %
                                    self.job.spec.options.script)
                                self.script_proc.kill()
                                raise WorkerException(
                                    'Script `%s` failed to exit after %d seconds'
                                    % (self.job.spec.options.script,
                                       SCRIPT_EXIT_TIMEOUT))

                            if self.script_proc.returncode != 0:
                                self.logger.error(
                                    'Script `%s` exited with return code %d' %
                                    (self.job.spec.options.script,
                                     self.script_proc.returncode))
                                raise WorkerException(
                                    'Script `%s` exited with return code %d' %
                                    (self.job.spec.options.script,
                                     self.script_proc.returncode))
                    finally:
                        with self.task.protect():
                            self.task.stop_step('download')

                            if self.script_proc is not None and self.script_proc.returncode is None:
                                try:
                                    self.script_proc.kill()
                                except OSError as e:
                                    self.logger.warn(
                                        "Failed to kill script `%s`: %s" %
                                        (self.job.spec.options.script, str(e)))
            except pycurl.error as e:
                errno = e.args[0]
                if errno in (pycurl.E_WRITE_ERROR,
                             pycurl.E_ABORTED_BY_CALLBACK):
                    if self.pycurl_callback_exception is not None:
                        raise self.pycurl_callback_exception
                    elif self._should_exit:
                        self.logger.warn('Download failed...requeueing')
                        # Caught by the outer `except Exception as e`
                        raise RequeueTask()

                # Caught by the outer `except pycurl.error as e`
                raise
        except pycurl.error as e:
            errno = e.args[0]
            self._set_error(
                ConnectionException(
                    'libcurl error #%d. Look up the error here: http://curl.haxx.se/libcurl/c/libcurl-errors.html'
                    % errno))
        except IOError as e:
            # This is raised sometimes instead of a pycurl error
            self._set_error(
                ConnectionException('IOError: %s (%d)' %
                                    (e.args[1], e.args[0])))
        except Exception as e:
            self._set_error(e)
        except KeyboardInterrupt:
            pass
        finally:
            self.logger.info('Finished downloading')
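
Both versions of run() drive the transfer through pycurl, registering a WRITEFUNCTION callback that receives each downloaded chunk (here produced by _write_to_fifo). A stripped-down sketch of that callback pattern, with a placeholder URL and none of the FIFO or script plumbing:

    import pycurl

    chunks = []

    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, 'http://example.com/')    # placeholder URL
    curl.setopt(pycurl.WRITEFUNCTION, chunks.append)  # called once per chunk of body data
    curl.setopt(pycurl.FOLLOWLOCATION, True)
    curl.setopt(pycurl.CONNECTTIMEOUT, 30)
    curl.perform()

    print('HTTP status:', curl.getinfo(pycurl.HTTP_CODE))
    print('bytes received:', sum(len(c) for c in chunks))
    curl.close()
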
Example #3
    def validate_path_conditions(self, path):
        if path.scheme == 's3':
            is_anonymous = self.job.spec.source.aws_access_key is None or self.job.spec.source.aws_secret_key is None
            if is_anonymous:
                self.logger.debug('Either access key or secret key was not specified, connecting to S3 as anonymous')
                self.s3_conn = S3Connection(anon=True)
            else:
                self.logger.debug('Connecting to S3')
                self.s3_conn = S3Connection(self.job.spec.source.aws_access_key, self.job.spec.source.aws_secret_key)

            try:
                if not cli_utils.RE_VALIDATE_BUCKET_NAME.match(path.bucket):
                    raise cli_utils.AWSBucketNameInvalid("Bucket name is not valid")

                self.s3_conn.get_bucket(path.bucket)
            except S3ResponseError as e:
                if e.status == 403:
                    self.logger.error('Invalid credentials for this bucket, aborting')
                elif e.status == 404:
                    self.logger.error('Bucket not found, aborting')
                else:
                    self.logger.error("Accessing S3 bucket resulted in %s %s, aborting" % (e.status, e.reason))
                sys.exit(1)
            except cli_utils.AWSBucketNameInvalid as e:
                self.logger.error(e.message)
                sys.exit(1)
        elif path.scheme == 'hdfs':
            if self.job.spec.source.webhdfs_port is None:
                self.logger.error('source.webhdfs_port must be defined for HDFS jobs')
                sys.exit(1)

            if not self.options.skip_checks:
                try:
                    # We try getting a content summary of the home directory
                    # for HDFS to make sure that we can connect to the WebHDFS
                    # server.
                    curl = pycurl.Curl()

                    url = webhdfs.get_webhdfs_url(
                        self.job.spec.source.hdfs_host,
                        self.job.spec.source.webhdfs_port,
                        self.job.spec.source.hdfs_user, 'GETCONTENTSUMMARY', '')

                    curl.setopt(pycurl.URL, url)

                    def _check_hdfs_response(data):
                        if 'It looks like you are making an HTTP request to a Hadoop IPC port' in data:
                            self.logger.error(
                                'You have provided an IPC port instead of the '
                                'WebHDFS port for the webhdfs-port argument.')

                    curl.setopt(pycurl.WRITEFUNCTION, _check_hdfs_response)
                    curl.perform()
                    status_code = curl.getinfo(pycurl.HTTP_CODE)
                    if status_code != httplib.OK:
                        self.logger.error('HTTP status code %s when testing WebHDFS connection' % status_code)
                        self.logger.error('Make sure your HDFS server is running with WebHDFS enabled, and that you can access the data at %s' % url)
                        sys.exit(1)
                except pycurl.error as e:
                    errno = e.args[0]
                    self.logger.error('libcurl error %s when testing WebHDFS connection' % errno)
                    self.logger.error('Make sure your HDFS server is running with WebHDFS enabled, and that you can access the data at %s' % url)
                    sys.exit(1)
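
The webhdfs.get_webhdfs_url helper is not shown in these examples; assuming it follows the documented WebHDFS REST layout, the URLs used for the OPEN and GETCONTENTSUMMARY operations above would look roughly like this sketch (build_webhdfs_url and its exact formatting are illustrative, not the project's actual helper):

    # Hypothetical sketch of the URL shape returned by webhdfs.get_webhdfs_url,
    # following the standard WebHDFS REST layout:
    #   http://<host>:<port>/webhdfs/v1/<path>?op=<OPERATION>&user.name=<user>
    def build_webhdfs_url(host, port, user, operation, path):
        url = 'http://%s:%s/webhdfs/v1/%s?op=%s' % (host, port, path.lstrip('/'), operation)
        if user:
            url += '&user.name=%s' % user
        return url

    # e.g. http://namenode:50070/webhdfs/v1/?op=GETCONTENTSUMMARY&user.name=hdfs
    print(build_webhdfs_url('namenode', 50070, 'hdfs', 'GETCONTENTSUMMARY', ''))
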
Example #4
    def validate_path_conditions(self, path):
        if path.scheme == 's3':
            #is_anonymous = self.job.spec.source.aws_access_key is None or self.job.spec.source.aws_secret_key is None
            #if is_anonymous:
            #    self.logger.debug('Either access key or secret key was not specified, connecting to S3 as anonymous')
            #    self.s3_conn = S3Connection(anon=True)
            #else:
            #    self.logger.debug('Connecting to S3')
            #    self.s3_conn = S3Connection(self.job.spec.source.aws_access_key, self.job.spec.source.aws_secret_key)
            self.s3_conn = boto3.resource('s3')

            try:
                if not cli_utils.RE_VALIDATE_BUCKET_NAME.match(path.bucket):
                    raise cli_utils.AWSBucketNameInvalid(
                        "Bucket name is not valid")

                self.s3_conn.Bucket(path.bucket)
            except S3ResponseError as e:
                if e.status == 403:
                    self.logger.error(
                        'Invalid credentials for this bucket, aborting')
                elif e.status == 404:
                    self.logger.error('Bucket not found, aborting')
                else:
                    self.logger.error(
                        "Accessing S3 bucket resulted in %s %s, aborting" %
                        (e.status, e.reason))
                sys.exit(1)
            except cli_utils.AWSBucketNameInvalid as e:
                self.logger.error(e.message)
                sys.exit(1)
        elif path.scheme == 'hdfs':
            if self.job.spec.source.webhdfs_port is None:
                self.logger.error(
                    'source.webhdfs_port must be defined for HDFS jobs')
                sys.exit(1)

            if not self.options.skip_checks:
                try:
                    # We try getting a content summary of the home directory
                    # for HDFS to make sure that we can connect to the WebHDFS
                    # server.
                    curl = pycurl.Curl()

                    url = webhdfs.get_webhdfs_url(
                        self.job.spec.source.hdfs_host,
                        self.job.spec.source.webhdfs_port,
                        self.job.spec.source.hdfs_user, 'GETCONTENTSUMMARY',
                        '')

                    curl.setopt(pycurl.URL, url)

                    def _check_hdfs_response(data):
                        if 'It looks like you are making an HTTP request to a Hadoop IPC port' in data:
                            self.logger.error(
                                'You have provided an IPC port instead of the '
                                'WebHDFS port for the webhdfs-port argument.')

                    curl.setopt(pycurl.WRITEFUNCTION, _check_hdfs_response)
                    curl.perform()
                    status_code = curl.getinfo(pycurl.HTTP_CODE)
                    if status_code != httplib.OK:
                        self.logger.error(
                            'HTTP status code %s when testing WebHDFS connection'
                            % status_code)
                        self.logger.error(
                            'Make sure your HDFS server is running with WebHDFS enabled, and that you can access the data at %s'
                            % url)
                        sys.exit(1)
                except pycurl.error as e:
                    errno = e.args[0]
                    self.logger.error(
                        'libcurl error %s when testing WebHDFS connection' %
                        errno)
                    self.logger.error(
                        'Make sure your HDFS server is running with WebHDFS enabled, and that you can access the data at %s'
                        % url)
                    sys.exit(1)
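
Note that in the boto3 variant above, s3_conn.Bucket(path.bucket) only constructs a resource object and makes no API call, and S3ResponseError is a boto2 exception that boto3 never raises, so the 403/404 branches would not fire. A sketch of an explicit access check using boto3's head_bucket and botocore's ClientError (the standalone function and its logger parameter are illustrative, not part of the original class):

    import sys

    import boto3
    from botocore.exceptions import ClientError

    def check_bucket_access(logger, bucket_name):
        s3 = boto3.resource('s3')
        try:
            # head_bucket raises ClientError carrying the HTTP status (403, 404, ...)
            s3.meta.client.head_bucket(Bucket=bucket_name)
        except ClientError as e:
            status = e.response['ResponseMetadata']['HTTPStatusCode']
            if status == 403:
                logger.error('Invalid credentials for this bucket, aborting')
            elif status == 404:
                logger.error('Bucket not found, aborting')
            else:
                logger.error('Accessing S3 bucket resulted in %s, aborting' % status)
            sys.exit(1)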