def run(self):
    try:
        try:
            # This is at the top so that any exceptions that occur will
            # emit a KILL QUERY due to fifo.open().
            # If we are piping through a script, the fifo should block
            # because the downloader is polling the script's stdin instead
            # of the fifo.
            blocking = self.job.spec.options.script is not None
            with self.fifo.open(blocking=blocking) as target_file:
                # allocate a URL for the target file
                if self.task.data['scheme'] == 's3':
                    if self.is_anonymous:
                        key_url = 'http://%(bucket)s.s3.amazonaws.com/%(path)s' % {
                            'bucket': self.key.bucket.name,
                            'path': self.key.name.encode('utf-8')
                        }
                    else:
                        key_url = self.key.generate_url(expires_in=3600)
                elif self.task.data['scheme'] == 'hdfs':
                    host = self.job.spec.source.hdfs_host
                    port = self.job.spec.source.webhdfs_port
                    hdfs_user = self.job.spec.source.hdfs_user
                    key_name = self.key.name
                    key_url = webhdfs.get_webhdfs_url(
                        host, port, hdfs_user, 'OPEN', key_name)
                elif self.task.data['scheme'] == 'file':
                    key_url = 'file://%(path)s' % {'path': self.key.name}
                else:
                    assert False, 'Unsupported job with paths: %s' % [
                        str(p) for p in self.job.paths]

                self._curl = curl = pycurl.Curl()
                curl.setopt(pycurl.URL, key_url)
                curl.setopt(pycurl.NOPROGRESS, 0)
                curl.setopt(pycurl.PROGRESSFUNCTION, self._progress)
                curl.setopt(pycurl.SSL_VERIFYPEER, 0)
                curl.setopt(pycurl.SSL_VERIFYHOST, 0)
                curl.setopt(pycurl.CONNECTTIMEOUT, 30)

                if self.job.spec.options.script is not None:
                    self.script_proc = subprocess.Popen(
                        ["/bin/bash", "-c", self.job.spec.options.script],
                        stdout=target_file.fileno(),
                        stdin=subprocess.PIPE)

                    # Check that the script hasn't errored before downloading.
                    # NOTE: we wait here so that we can detect a script that
                    # exits prematurely; if it does, we fail the job without
                    # requeueing.
                    time.sleep(1)
                    if self.script_proc.poll() is not None:
                        self.logger.error(
                            'Script `%s` exited prematurely with return code %d'
                            % (self.job.spec.options.script, self.script_proc.returncode))
                        raise WorkerException(
                            'Script `%s` exited prematurely with return code %d'
                            % (self.job.spec.options.script, self.script_proc.returncode))

                    # If we're piping data into a script and this file is
                    # a gzipped file, we'll decompress the data ourselves
                    # before piping it into the script.
                    if self.task.data['key_name'].endswith('.gz'):
                        # Setting the window bits to zlib.MAX_WBITS | 32 tells
                        # the zlib library to automatically detect gzip headers.
                        self.decompress_obj = zlib.decompressobj(zlib.MAX_WBITS | 32)

                    curl.setopt(pycurl.WRITEFUNCTION,
                                self._write_to_fifo(self.script_proc.stdin))
                else:
                    curl.setopt(pycurl.WRITEFUNCTION,
                                self._write_to_fifo(target_file))

                if self.task.data['scheme'] == 'hdfs':
                    curl.setopt(pycurl.FOLLOWLOCATION, True)

                self.logger.info('Starting download')

                with self.task.protect():
                    self.task.start_step('download')

                try:
                    curl.perform()
                    status_code = curl.getinfo(pycurl.HTTP_CODE)
                    # Catch HTTP client errors, e.g. 404:
                    if status_code >= 400 and status_code < 500:
                        raise WorkerException('HTTP status code %s for file %s'
                                              % (status_code, self.key.name))

                    # If we're piping data through a script, catch timeouts
                    # and return codes.
                    if self.script_proc is not None:
                        self.script_proc.stdin.close()
                        for i in range(SCRIPT_EXIT_TIMEOUT):
                            if self.script_proc.poll() is not None:
                                break
                            time.sleep(1)
                        else:
                            self.logger.error(
                                'Script `%s` failed to exit...killing'
                                % self.job.spec.options.script)
                            self.script_proc.kill()
                            raise WorkerException(
                                'Script `%s` failed to exit after %d seconds'
                                % (self.job.spec.options.script, SCRIPT_EXIT_TIMEOUT))

                        if self.script_proc.returncode != 0:
                            self.logger.error(
                                'Script `%s` exited with return code %d'
                                % (self.job.spec.options.script, self.script_proc.returncode))
                            raise WorkerException(
                                'Script `%s` exited with return code %d'
                                % (self.job.spec.options.script, self.script_proc.returncode))
                finally:
                    with self.task.protect():
                        self.task.stop_step('download')

                    if self.script_proc is not None and self.script_proc.returncode is None:
                        try:
                            self.script_proc.kill()
                        except OSError as e:
                            self.logger.warn("Failed to kill script `%s`: %s"
                                             % (self.job.spec.options.script, str(e)))
        except pycurl.error as e:
            errno = e.args[0]
            if errno in (pycurl.E_WRITE_ERROR, pycurl.E_ABORTED_BY_CALLBACK):
                if self.pycurl_callback_exception is not None:
                    raise self.pycurl_callback_exception
                elif self._should_exit:
                    self.logger.warn('Download failed...requeueing')
                    # Caught by the outer `except Exception as e`
                    raise RequeueTask()
            # Caught by the outer `except pycurl.error as e`
            raise
    except pycurl.error as e:
        errno = e.args[0]
        self._set_error(ConnectionException(
            'libcurl error #%d. Lookup error here: http://curl.haxx.se/libcurl/c/libcurl-errors.html' % errno))
    except IOError as e:
        # This is raised sometimes instead of a pycurl error
        self._set_error(ConnectionException('IOError: %s (%d)' % (e.args[1], e.args[0])))
    except Exception as e:
        self._set_error(e)
    except KeyboardInterrupt:
        pass
    finally:
        self.logger.info('Finished downloading')
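The `.gz` branch in `run()` relies on zlib's ability to detect gzip headers automatically when the window bits are set to `zlib.MAX_WBITS | 32`. A minimal, standalone sketch of that streaming decompression pattern (the file name, chunk size, and `handle` callback are illustrative, not taken from this project):

import zlib

def decompress_stream(chunks):
    # zlib.MAX_WBITS | 32 asks zlib to auto-detect the gzip header, so the
    # same code path handles both raw zlib and gzip streams.
    decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
    for chunk in chunks:
        out = decompressor.decompress(chunk)
        if out:
            yield out
    # flush() returns whatever is still buffered inside the decompressor.
    tail = decompressor.flush()
    if tail:
        yield tail

# Example usage, reading a gzipped file in 64 KB chunks (hypothetical path):
# with open('example.csv.gz', 'rb') as f:
#     for block in decompress_stream(iter(lambda: f.read(65536), b'')):
#         handle(block)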
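The error handling in `run()` singles out `pycurl.E_WRITE_ERROR` and `pycurl.E_ABORTED_BY_CALLBACK`, the codes libcurl reports when a write or progress callback signals failure. A minimal sketch of that callback pattern, assuming a simple holder object with a cancellation flag and a stored exception; the names below are illustrative stand-ins, not the project's `_write_to_fifo`/`_progress` implementations:

import pycurl

class DownloadCallbacks(object):
    def __init__(self, target_file):
        self.target_file = target_file
        self.should_exit = False        # set externally to cancel the download
        self.callback_exception = None  # stashed for re-raising after perform()

    def progress(self, download_total, downloaded, upload_total, uploaded):
        # Returning a non-zero value from the progress callback makes libcurl
        # abort the transfer with E_ABORTED_BY_CALLBACK.
        return 1 if self.should_exit else 0

    def write(self, data):
        try:
            self.target_file.write(data)
        except Exception as e:
            # Rather than letting the exception escape the callback, remember
            # it and return -1 so libcurl fails with E_WRITE_ERROR; the caller
            # re-raises the stored exception below.
            self.callback_exception = e
            return -1

def download(url, target_file):
    cb = DownloadCallbacks(target_file)
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.NOPROGRESS, 0)
    curl.setopt(pycurl.PROGRESSFUNCTION, cb.progress)
    curl.setopt(pycurl.WRITEFUNCTION, cb.write)
    try:
        curl.perform()
    except pycurl.error as e:
        if e.args[0] in (pycurl.E_WRITE_ERROR, pycurl.E_ABORTED_BY_CALLBACK):
            if cb.callback_exception is not None:
                raise cb.callback_exception
        raise
    finally:
        curl.close()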
def validate_path_conditions(self, path):
    if path.scheme == 's3':
        is_anonymous = (self.job.spec.source.aws_access_key is None
                        or self.job.spec.source.aws_secret_key is None)
        if is_anonymous:
            self.logger.debug('Either access key or secret key was not specified, connecting to S3 as anonymous')
            self.s3_conn = S3Connection(anon=True)
        else:
            self.logger.debug('Connecting to S3')
            self.s3_conn = S3Connection(
                self.job.spec.source.aws_access_key,
                self.job.spec.source.aws_secret_key)
        try:
            if not cli_utils.RE_VALIDATE_BUCKET_NAME.match(path.bucket):
                raise cli_utils.AWSBucketNameInvalid("Bucket name is not valid")
            self.s3_conn.get_bucket(path.bucket)
        except S3ResponseError as e:
            if e.status == 403:
                self.logger.error('Invalid credentials for this bucket, aborting')
            elif e.status == 404:
                self.logger.error('Bucket not found, aborting')
            else:
                self.logger.error(
                    "Accessing S3 bucket resulted in %s %s, aborting"
                    % (e.status, e.reason))
            sys.exit(1)
        except cli_utils.AWSBucketNameInvalid as e:
            self.logger.error(e.message)
            sys.exit(1)
    elif path.scheme == 'hdfs':
        if self.job.spec.source.webhdfs_port is None:
            self.logger.error('source.webhdfs_port must be defined for HDFS jobs')
            sys.exit(1)
        if not self.options.skip_checks:
            try:
                # We try getting a content summary of the home directory
                # for HDFS to make sure that we can connect to the WebHDFS
                # server.
                curl = pycurl.Curl()
                url = webhdfs.get_webhdfs_url(
                    self.job.spec.source.hdfs_host,
                    self.job.spec.source.webhdfs_port,
                    self.job.spec.source.hdfs_user,
                    'GETCONTENTSUMMARY', '')
                curl.setopt(pycurl.URL, url)

                def _check_hdfs_response(data):
                    if 'It looks like you are making an HTTP request to a Hadoop IPC port' in data:
                        self.logger.error(
                            'You have provided an IPC port instead of the '
                            'WebHDFS port for the webhdfs-port argument.')

                curl.setopt(pycurl.WRITEFUNCTION, _check_hdfs_response)
                curl.perform()
                status_code = curl.getinfo(pycurl.HTTP_CODE)
                if status_code != httplib.OK:
                    self.logger.error('HTTP status code %s when testing WebHDFS connection' % status_code)
                    self.logger.error('Make sure your HDFS server is running and WebHDFS is enabled and ensure that you can access the data at %s' % url)
                    sys.exit(1)
            except pycurl.error as e:
                errno = e.args[0]
                self.logger.error('libcurl error %s when testing WebHDFS connection' % errno)
                self.logger.error('Make sure your HDFS server is running and WebHDFS is enabled and ensure that you can access the data at %s' % url)
                sys.exit(1)
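Both `run()` and `validate_path_conditions()` build their HDFS URLs with `webhdfs.get_webhdfs_url`. For reference, WebHDFS REST URLs follow the documented form `http://<host>:<port>/webhdfs/v1/<path>?op=<OPERATION>&user.name=<user>`; the helper below is an illustrative stand-in with the same argument order, not the project's actual implementation:

def build_webhdfs_url(host, port, user, operation, path):
    # Illustrative stand-in for webhdfs.get_webhdfs_url (see the project's
    # `webhdfs` module for the real helper).
    url = 'http://%s:%s/webhdfs/v1/%s?op=%s' % (host, port, path.lstrip('/'), operation)
    if user:
        url += '&user.name=%s' % user
    return url

# build_webhdfs_url('namenode.example.com', 50070, 'hdfs', 'OPEN', '/data/file.csv')
# -> 'http://namenode.example.com:50070/webhdfs/v1/data/file.csv?op=OPEN&user.name=hdfs'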
def validate_path_conditions(self, path):
    if path.scheme == 's3':
        # The boto2 S3Connection setup (including the anonymous-access branch)
        # has been replaced with boto3; credentials are now resolved through
        # boto3's standard credential chain.
        self.s3_conn = boto3.resource('s3')
        try:
            if not cli_utils.RE_VALIDATE_BUCKET_NAME.match(path.bucket):
                raise cli_utils.AWSBucketNameInvalid("Bucket name is not valid")
            # Bucket() alone makes no API call, so issue a HEAD request to
            # verify that the bucket exists and that we can access it
            # (requires botocore.exceptions to be imported at module level).
            self.s3_conn.meta.client.head_bucket(Bucket=path.bucket)
        except botocore.exceptions.ClientError as e:
            status = e.response['ResponseMetadata']['HTTPStatusCode']
            if status == 403:
                self.logger.error('Invalid credentials for this bucket, aborting')
            elif status == 404:
                self.logger.error('Bucket not found, aborting')
            else:
                self.logger.error(
                    "Accessing S3 bucket resulted in %s %s, aborting"
                    % (status, e.response['Error'].get('Message', '')))
            sys.exit(1)
        except cli_utils.AWSBucketNameInvalid as e:
            self.logger.error(e.message)
            sys.exit(1)
    elif path.scheme == 'hdfs':
        if self.job.spec.source.webhdfs_port is None:
            self.logger.error('source.webhdfs_port must be defined for HDFS jobs')
            sys.exit(1)
        if not self.options.skip_checks:
            try:
                # We try getting a content summary of the home directory
                # for HDFS to make sure that we can connect to the WebHDFS
                # server.
                curl = pycurl.Curl()
                url = webhdfs.get_webhdfs_url(
                    self.job.spec.source.hdfs_host,
                    self.job.spec.source.webhdfs_port,
                    self.job.spec.source.hdfs_user,
                    'GETCONTENTSUMMARY', '')
                curl.setopt(pycurl.URL, url)

                def _check_hdfs_response(data):
                    if 'It looks like you are making an HTTP request to a Hadoop IPC port' in data:
                        self.logger.error(
                            'You have provided an IPC port instead of the '
                            'WebHDFS port for the webhdfs-port argument.')

                curl.setopt(pycurl.WRITEFUNCTION, _check_hdfs_response)
                curl.perform()
                status_code = curl.getinfo(pycurl.HTTP_CODE)
                if status_code != httplib.OK:
                    self.logger.error('HTTP status code %s when testing WebHDFS connection' % status_code)
                    self.logger.error('Make sure your HDFS server is running and WebHDFS is enabled and ensure that you can access the data at %s' % url)
                    sys.exit(1)
            except pycurl.error as e:
                errno = e.args[0]
                self.logger.error('libcurl error %s when testing WebHDFS connection' % errno)
                self.logger.error('Make sure your HDFS server is running and WebHDFS is enabled and ensure that you can access the data at %s' % url)
                sys.exit(1)
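The boto3 version above drops the anonymous-access branch that the boto2 code supported. If unauthenticated buckets still need to work, boto3 can send unsigned requests via botocore's UNSIGNED signature; a minimal sketch mirroring the old credential check (the function name and fallback behaviour are assumptions, not project code):

import boto3
import botocore
from botocore.client import Config

def make_s3_resource(aws_access_key=None, aws_secret_key=None):
    # Mirror the old boto2 behaviour: fall back to anonymous (unsigned)
    # requests when either credential is missing.
    if aws_access_key is None or aws_secret_key is None:
        return boto3.resource('s3', config=Config(signature_version=botocore.UNSIGNED))
    return boto3.resource('s3',
                          aws_access_key_id=aws_access_key,
                          aws_secret_access_key=aws_secret_key)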