def remote_copy(request_id, src_file_name, registration_id):
    '''
    Remotely copies a source file to a remote machine
    '''
    task_info = {Constants.TASK_ID: remote_copy.request.id,
                 Constants.CELERY_TASK_ID: remote_copy.request.id,
                 Constants.REQUEST_GUID: request_id}
    try:
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.COPYING})
        http_file_upload(src_file_name, registration_id)
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.COPIED})
    except RemoteCopyError as e:
        log.error("Exception happened in remote copy. " + str(e))
        insert_extract_stats(task_info,
                             {Constants.STATUS: ExtractStatus.FAILED,
                              Constants.INFO: 'remote copy has failed: ' + str(e)})
        try:
            # This looks odd, but it works around a celery issue: the exc option
            # alone is not honored by retry(), so raise and catch the exception
            # here to give retry() an active exception context.
            raise ExtractionError(str(e))
        except ExtractionError as exc:
            # this could be caused by a network hiccup
            raise remote_copy.retry(args=[request_id, src_file_name, registration_id], exc=exc)
    except Exception as e:
        raise ExtractionError(str(e))
def archive_with_encryption(request_id, recipients, archive_file_name, directory):
    '''
    given a directory, archive and encrypt everything in this directory to the file name specified
    '''
    retryable = False
    exception_thrown = False
    try:
        task_info = {Constants.TASK_ID: archive_with_encryption.request.id,
                     Constants.CELERY_TASK_ID: archive_with_encryption.request.id,
                     Constants.REQUEST_GUID: request_id}
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.ARCHIVING})
        gpg_binary_file = get_setting(Config.BINARYFILE)
        homedir = get_setting(Config.HOMEDIR)
        keyserver = get_setting(Config.KEYSERVER)
        encrypted_archive_files(directory, recipients, archive_file_name,
                                homedir=homedir, keyserver=keyserver, gpgbinary=gpg_binary_file)
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.ARCHIVED})
    except GPGPublicKeyException as e:
        # recoverable exception
        retryable = True
        exception_thrown = True
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.FAILED,
                                         Constants.INFO: str(e)})
    except Exception as e:
        # unrecoverable exception
        exception_thrown = True
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.FAILED,
                                         Constants.INFO: str(e)})
    if exception_thrown:
        if retryable:
            try:
                # This looks odd, but it works around a celery issue: the exc option
                # alone is not honored by retry(), so raise and catch the exception
                # here to give retry() an active exception context.
                raise ExtractionError()
            except ExtractionError as exc:
                raise archive_with_encryption.retry(args=[request_id, recipients, archive_file_name, directory],
                                                    exc=exc)
        else:
            raise ExtractionError()
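# Illustrative only: encrypted_archive_files is project code that is not defined in
# this section. Assuming the gpgbinary/homedir settings point at a python-gnupg
# setup, the sketch below shows one way an "archive, then encrypt to the recipients'
# public keys" step could look; the helper name is hypothetical and keyserver
# handling is omitted.
def _archive_and_encrypt_sketch(directory, recipients, archive_file_name, homedir, gpgbinary):
    import io
    import os
    import zipfile
    import gnupg
    # Build an in-memory zip of everything under the directory.
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, _dirs, files in os.walk(directory):
            for name in files:
                full_path = os.path.join(root, name)
                zf.write(full_path, os.path.relpath(full_path, directory))
    # Encrypt the archive to the recipients' public keys and write it to the target file.
    gpg = gnupg.GPG(gnupghome=homedir, gpgbinary=gpgbinary)
    result = gpg.encrypt(buf.getvalue(), recipients, output=archive_file_name, always_trust=True)
    if not result.ok:
        # the real helper raises GPGPublicKeyException on key errors; a plain
        # exception stands in for it in this sketch
        raise RuntimeError('gpg encryption failed: ' + str(result.status))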
def copy_to_sftp_lz(request_id, src_file_name, tenant, gatekeeper, sftp_info, timeout=1800):
    '''
    Remotely copies a source file to a remote machine's sftp landing zone
    '''
    task_info = {Constants.TASK_ID: copy_to_sftp_lz.request.id,
                 Constants.CELERY_TASK_ID: copy_to_sftp_lz.request.id,
                 Constants.REQUEST_GUID: request_id}
    try:
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.COPYING})
        edextract.utils.file_remote_copy.copy(src_file_name, sftp_info[0], tenant, gatekeeper,
                                              sftp_info[1], sftp_info[2], timeout=timeout)
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.COPIED})
    except RemoteCopyError as e:
        log.error("Exception happened in remote copy to sftp lz. " + str(e))
        insert_extract_stats(task_info,
                             {Constants.STATUS: ExtractStatus.FAILED,
                              Constants.INFO: 'remote copy to sftp lz has failed: ' + str(e)})
        try:
            # This looks odd, but it works around a celery issue: the exc option
            # alone is not honored by retry(), so raise and catch the exception
            # here to give retry() an active exception context.
            raise ExtractionError(str(e))
        except ExtractionError as exc:
            # this could be caused by a network hiccup
            raise copy_to_sftp_lz.retry(args=[request_id, src_file_name, tenant, gatekeeper, sftp_info],
                                        kwargs={'timeout': timeout},
                                        exc=exc)
    except Exception as e:
        raise ExtractionError(str(e))
def prepare_path(request_id, paths):
    '''
    Given a list of directory paths, create each one if it doesn't exist
    '''
    task_info = {Constants.TASK_ID: prepare_path.request.id,
                 Constants.CELERY_TASK_ID: prepare_path.request.id,
                 Constants.REQUEST_GUID: request_id}
    try:
        for path in paths:
            file_utils.prepare_path(path)
    except Exception as e:
        # An error thrown from prepare_path is unrecoverable; do not retry the
        # celery task, it would just waste time.
        log.error(e)
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.FAILED,
                                         Constants.INFO: str(e)})
        raise ExtractionError()
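# Illustrative only: file_utils.prepare_path is project code that is not defined in
# this section. Assuming it simply ensures that the directory exists (as its use
# above suggests), a minimal standalone equivalent looks like the sketch below; the
# helper name is hypothetical.
def _ensure_directory_sketch(path):
    import os
    # Create the directory, including any missing parents, if it does not exist yet.
    os.makedirs(path, exist_ok=True)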
def archive(request_id, archive_file_name, directory):
    '''
    given a directory, archive everything in this directory to a file name specified
    '''
    try:
        task_info = {Constants.TASK_ID: archive.request.id,
                     Constants.CELERY_TASK_ID: archive.request.id,
                     Constants.REQUEST_GUID: request_id}
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.ARCHIVING})
        archive_files(directory, archive_file_name)
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.ARCHIVED})
    except Exception as e:
        # unrecoverable exception
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.FAILED,
                                         Constants.INFO: str(e)})
        raise ExtractionError()
def generate_item_or_raw_extract_file(tenant, request_id, task):
    """
    Generates an item level/raw extract file given task arguments.

    @param tenant: Tenant name
    @param request_id: Extract request ID
    @param task: Calling task
    """
    task_id = task[TaskConstants.TASK_TASK_ID]
    extract_type = task[TaskConstants.EXTRACTION_DATA_TYPE]
    log.info('execute {task_name} for task {task_id}, extract type {extract_type}'.format(
        task_name=generate_item_or_raw_extract_file.name, task_id=task_id, extract_type=extract_type))
    output_dirs = task[TaskConstants.DIRECTORY_TO_ARCHIVE]
    if not isinstance(output_dirs, list):
        output_dirs = [output_dirs]
    output_files = task[TaskConstants.TASK_FILE_NAME]
    if not isinstance(output_files, list):
        output_files = [output_files]
    task_info = {Constants.TASK_ID: task_id,
                 Constants.CELERY_TASK_ID: generate_item_or_raw_extract_file.request.id,
                 Constants.REQUEST_GUID: request_id}
    retryable = False
    exception_thrown = False
    output_file = None
    try:
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.EXTRACTING})
        if tenant is None:
            insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.FAILED_NO_TENANT})
        else:
            if extract_type == ExtractionDataType.QUERY_ITEMS_CSV:
                for output_file in output_files:
                    if not os.path.isdir(os.path.dirname(output_file)):
                        raise FileNotFoundError(os.path.dirname(output_file) + " doesn't exist")
            elif extract_type == ExtractionDataType.QUERY_RAW_XML:
                for output_dir in output_dirs:
                    if not os.path.isdir(output_dir):
                        raise FileNotFoundError(output_dir + " doesn't exist")
            # For an item level extract the output path is a list of one or more files;
            # for a raw extract it is a list of one or more directories in which to
            # place all of the matching xml files.
            if extract_type == ExtractionDataType.QUERY_ITEMS_CSV:
                output_paths = output_files
            else:
                output_paths = output_dirs
            # Extract data to file
            extract_func = get_extract_func(extract_type)
            extract_func(tenant, output_paths, task_info, task)
    except FileNotFoundError as e:
        # An error from the path check is unrecoverable; do not retry the celery
        # task, it would just waste time.
        if output_file is not None and os.path.isfile(output_file):
            # the partially written file should be deleted on error
            os.unlink(output_file)
        log.error(e)
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.FAILED,
                                         Constants.INFO: str(e)})
        exception_thrown = True
        retryable = False
    except Exception as e:
        if output_file is not None and os.path.isfile(output_file):
            # the partially written file should be deleted on error
            os.unlink(output_file)
        log.error(e)
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.FAILED,
                                         Constants.INFO: str(e)})
        exception_thrown = True
        retryable = True
    if exception_thrown:
        if retryable:
            try:
                # This looks odd, but it works around a celery issue: the exc option
                # alone is not honored by retry(), so raise and catch the exception
                # here to give retry() an active exception context.
                raise ExtractionError()
            except ExtractionError as exc:
                raise generate_item_or_raw_extract_file.retry(args=[tenant, request_id, task], exc=exc)
        else:
            raise ExtractionError()
def generate_extract_file(tenant, request_id, task):
    """
    Generates an extract file given task arguments.

    @param tenant: Tenant name
    @param request_id: Extract request ID
    @param task: Calling task
    """
    task_id = task[TaskConstants.TASK_TASK_ID]
    extract_type = task[TaskConstants.EXTRACTION_DATA_TYPE]
    log.info('execute {task_name} for task {task_id}, extract type {extract_type}'.format(
        task_name=generate_extract_file.name, task_id=task_id, extract_type=extract_type))
    output_file = task[TaskConstants.TASK_FILE_NAME]
    task_info = {Constants.TASK_ID: task_id,
                 Constants.CELERY_TASK_ID: generate_extract_file.request.id,
                 Constants.REQUEST_GUID: request_id}
    retryable = False
    exception_thrown = False
    try:
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.EXTRACTING})
        if tenant is None:
            insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.FAILED_NO_TENANT})
        else:
            if not os.path.isdir(os.path.dirname(output_file)):
                raise FileNotFoundError(os.path.dirname(output_file) + " doesn't exist")
            # Extract data to file.
            extract_func = get_extract_func(extract_type)
            extract_func(tenant, output_file, task_info, task)
    except FileNotFoundError as e:
        # An error from the path check is unrecoverable; do not retry the celery
        # task, it would just waste time.
        if os.path.isfile(output_file):
            # the partially written file should be deleted on error
            os.unlink(output_file)
        log.error(e)
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.FAILED,
                                         Constants.INFO: str(e)})
        exception_thrown = True
        retryable = False
    except Exception as e:
        if os.path.isfile(output_file):
            # the partially written file should be deleted on error
            os.unlink(output_file)
        log.error(e)
        insert_extract_stats(task_info, {Constants.STATUS: ExtractStatus.FAILED,
                                         Constants.INFO: str(e)})
        exception_thrown = True
        retryable = True
    if exception_thrown:
        if retryable:
            try:
                # This looks odd, but it works around a celery issue: the exc option
                # alone is not honored by retry(), so raise and catch the exception
                # here to give retry() an active exception context.
                raise ExtractionError()
            except ExtractionError as exc:
                raise generate_extract_file.retry(args=[tenant, request_id, task], exc=exc)
        else:
            raise ExtractionError()
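# Illustrative only: a minimal sketch of how the tasks above might be composed into a
# single extract workflow. It assumes these functions are registered as celery tasks
# (their use of .request.id and .retry() implies a task decorator that lies outside
# this section); the wrapper name and the exact argument wiring are hypothetical.
def _run_extract_workflow_sketch(tenant, request_id, task, directory, archive_file_name, registration_id):
    from celery import chain
    # Prepare the output directory, extract the data, archive it, then copy the
    # archive to the remote machine. .si() builds immutable signatures, so each
    # step receives only the arguments listed here and ignores the previous result.
    workflow = chain(prepare_path.si(request_id, [directory]),
                     generate_extract_file.si(tenant, request_id, task),
                     archive.si(request_id, archive_file_name, directory),
                     remote_copy.si(request_id, archive_file_name, registration_id))
    return workflow.apply_async()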