def test_ingest_article_zip_starter_(self, fake_boto_conn, fake_logger):
    fake_boto_conn.return_value = FakeBotoConnection()
    self.stater_ingest_article_zip.start(
        settings=settings_mock, run=run_example,
        info=S3NotificationInfo.from_dict(test_data.ingest_article_zip_data))
def start(self, ENV="dev", info=None, run=None):
    # TODO : much of this is common to many starters and could probably be streamlined

    # Specify run environment settings
    settings = settingsLib.get_settings(ENV)

    # Log
    identity = "starter_%s" % int(random.random() * 1000)
    log_file = "starter.log"
    # logFile = None
    logger = log.logger(log_file, settings.setLevel, identity)

    filename = info.file_name
    if filename is None:
        logger.error("Did not get a filename")
        return

    # Simple connect
    conn = boto.swf.layer1.Layer1(settings.aws_access_key_id,
                                  settings.aws_secret_access_key)

    # Start a workflow execution
    workflow_id = "PublishPerfectArticle_%s" % filename.replace('/', '_') + str(int(random.random() * 1000))
    workflow_name = "PublishPerfectArticle"
    workflow_version = "1"
    child_policy = None
    execution_start_to_close_timeout = str(60 * 30)
    workflow_input = S3NotificationInfo.to_dict(info)
    workflow_input['run'] = run
    workflow_input = json.dumps(workflow_input, default=lambda ob: ob.__dict__)

    try:
        response = conn.start_workflow_execution(
            settings.domain, workflow_id, workflow_name, workflow_version,
            settings.default_task_list, child_policy,
            execution_start_to_close_timeout, workflow_input)

        logger.info('got response: \n%s' % json.dumps(response, sort_keys=True, indent=4))

    except boto.swf.exceptions.SWFWorkflowExecutionAlreadyStartedError:
        # There is already a running workflow with that ID, cannot start another
        message = ('SWFWorkflowExecutionAlreadyStartedError: '
                   'There is already a running workflow with ID %s' % workflow_id)
        logger.info(message)
def do_activity(self, data=None):
    """
    Do the work
    """
    self.expanded_bucket_name = (self.settings.publishing_buckets_prefix +
                                 self.settings.expanded_bucket)

    info = S3NotificationInfo.from_dict(data)

    session = Session(self.settings)
    version = session.get_value(self.get_workflowId(), 'version')
    article_id = session.get_value(self.get_workflowId(), 'article_id')
    article_version_id = article_id + '.' + version
    run = session.get_value(self.get_workflowId(), 'run')

    self.emit_monitor_event(self.settings, article_id, version, run,
                            "Apply Version Number", "start",
                            "Starting applying version number to files for " + article_id)
    try:
        if self.logger:
            self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))

        # Do not rename files if a version number is in the file_name
        m = re.search(ur'-v([0-9]*?)[\.|-]', info.file_name)
        if m is not None:
            # Nothing to do
            pass
        elif m is None and version is not None:
            expanded_folder_name = session.get_value(self.get_workflowId(), 'expanded_folder')
            bucket_folder_name = expanded_folder_name.replace(os.sep, '/')
            self.rename_article_s3_objects(bucket_folder_name, version)

        self.emit_monitor_event(self.settings, article_id, version, run,
                                "Apply Version Number", "end",
                                "Finished applying version number to article " + article_id +
                                " for version " + version + " run " + str(run))

    except Exception as e:
        self.logger.exception("Exception when applying version number to article")
        self.emit_monitor_event(self.settings, article_id, version, run,
                                "Apply Version Number", "error",
                                "Error in applying version number to files for " + article_id +
                                " message:" + e.message)

    return True
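# A minimal, hypothetical sketch of the renaming step invoked above via
# rename_article_s3_objects(): insert "-v<version>" before the file extension when the
# name does not already carry a version token. The helper name and the exact renaming
# convention are assumptions for illustration, not the project's confirmed scheme.
import re

def new_filename_with_version(filename, version):
    # e.g. "elife-00353-fig1.tif" -> "elife-00353-fig1-v1.tif" (assumed convention)
    if re.search(r'-v[0-9]+[.-]', filename):
        return filename  # already versioned, leave untouched
    root, dot, extension = filename.rpartition('.')
    if not dot:
        return filename + '-v' + str(version)
    return root + '-v' + str(version) + dot + extension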
def start(self, settings, info=None, run=None):
    # Log
    logger = helper.get_starter_logger(settings.setLevel,
                                       helper.get_starter_identity(self.const_name))

    if not hasattr(info, 'file_name') or info.file_name is None:
        raise NullRequiredDataException("filename is Null / Did not get a filename.")

    input = S3NotificationInfo.to_dict(info)
    input['run'] = run
    input['version_lookup_function'] = "article_highest_version"
    input['force'] = True

    workflow_id, \
        workflow_name, \
        workflow_version, \
        child_policy, \
        execution_start_to_close_timeout, \
        workflow_input = helper.set_workflow_information(self.const_name, "1", None, input,
                                                         info.file_name.replace('/', '_'))

    # Simple connect
    conn = boto.swf.layer1.Layer1(settings.aws_access_key_id,
                                  settings.aws_secret_access_key)
    try:
        response = conn.start_workflow_execution(
            settings.domain, workflow_id, workflow_name, workflow_version,
            settings.default_task_list, child_policy,
            execution_start_to_close_timeout, workflow_input)

        logger.info('got response: \n%s' % json.dumps(response, sort_keys=True, indent=4))

    except NullRequiredDataException as e:
        logger.exception(e.message)
        raise

    except boto.swf.exceptions.SWFWorkflowExecutionAlreadyStartedError:
        # There is already a running workflow with that ID, cannot start another
        message = ('SWFWorkflowExecutionAlreadyStartedError: '
                   'There is already a running workflow with ID %s' % workflow_id)
        logger.info(message)
def start(self, settings, run, info):
    # Log
    logger = helper.get_starter_logger(settings.setLevel,
                                       helper.get_starter_identity(self.const_name))

    if not hasattr(info, 'file_name') or info.file_name is None:
        raise NullRequiredDataException("filename is Null. Did not get a filename.")

    input = S3NotificationInfo.to_dict(info)
    input['run'] = run
    input['version_lookup_function'] = "article_next_version"

    workflow_id, \
        workflow_name, \
        workflow_version, \
        child_policy, \
        execution_start_to_close_timeout, \
        workflow_input = helper.set_workflow_information(self.const_name, "1", None, input,
                                                         info.file_name.replace('/', '_'),
                                                         start_to_close_timeout=str(60 * 60 * 5))

    # Simple connect
    conn = boto.swf.layer1.Layer1(settings.aws_access_key_id,
                                  settings.aws_secret_access_key)
    try:
        response = conn.start_workflow_execution(
            settings.domain, workflow_id, workflow_name, workflow_version,
            settings.default_task_list, child_policy,
            execution_start_to_close_timeout, workflow_input)

        logger.info('got response: \n%s' % json.dumps(response, sort_keys=True, indent=4))

    except NullRequiredDataException as e:
        logger.exception(e.message)
        raise

    except boto.swf.exceptions.SWFWorkflowExecutionAlreadyStartedError:
        # There is already a running workflow with that ID, cannot start another
        message = ('SWFWorkflowExecutionAlreadyStartedError: '
                   'There is already a running workflow with ID %s' % workflow_id)
        logger.info(message)
def do_activity(self, data=None):
    try:
        info = S3NotificationInfo.from_dict(data)
        filename = info.file_name[info.file_name.rfind('/') + 1:]

        session = Session(self.settings)
        session.store_value(data['run'], 'filename_last_element', filename)

        article_structure = ArticleInfo(filename)
        if article_structure.article_id is None:
            self.logger.error("Name '%s' did not match expected pattern for article id" % filename)
            raise RuntimeError("article_structure.article_id is None. File pattern problem.")

        version = self.get_version(self.settings, article_structure, data['version_lookup_function'])
        session.store_value(data['run'], 'version', version)

        article_id = article_structure.article_id

        self.emit_monitor_event(self.settings, article_id, version, data['run'],
                                self.pretty_name, "start",
                                " ".join(("Version Lookup for article", article_id,
                                          "version:", version)))

        self.set_monitor_property(self.settings, article_id, "article-id", article_id, "text")
        self.set_monitor_property(self.settings, article_id, "publication-status",
                                  "publication in progress", "text", version=version)

        self.emit_monitor_event(self.settings, article_structure.article_id, version, data['run'],
                                self.pretty_name, "end",
                                " ".join(("Finished Version Lookup for article",
                                          article_structure.article_id, "version:", version)))

        return activity.activity.ACTIVITY_SUCCESS

    except Exception as e:
        self.logger.exception("Exception when trying to Lookup Version. Error: " + str(e))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE
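# The starters above pass 'version_lookup_function' through the workflow data as a string
# ("article_highest_version" or "article_next_version"). A minimal sketch of one way such a
# name could be dispatched; the `lookup` module and its function signatures are illustrative
# assumptions, not the project's confirmed API.
import lookup  # hypothetical module exposing article_highest_version / article_next_version

def resolve_version(settings, article_id, lookup_function_name):
    # Resolve the named helper on the assumed lookup module and return the version as a string
    lookup_function = getattr(lookup, lookup_function_name)
    return str(lookup_function(article_id, settings))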
parser.add_option("-e", "--env", default="dev", action="store", type="string",
                  dest="env", help="set the environment to run, either dev or live")
parser.add_option("-f", "--filename", default=None, action="store", type="string",
                  dest="filename", help="specify the filename of the article to process")
(options, args) = parser.parse_args()
if options.env:
    ENV = options.env
if options.filename:
    filename = options.filename

import settings as settingsLib
settings = settingsLib.get_settings(ENV)

o = starter_NewS3File()

info = S3NotificationInfo("S3Event", "", "xxawsxx-drop-bucket", "elife-kitchen-sink.xml",
                          "3f53f5c808dd58973cd93a368be739b4", "1")
o.start(settings=settings, info=info)
def process_data_publishperfectarticle(workflow_name, workflow_data):
    data = {
        'info': S3NotificationInfo.from_dict(workflow_data),
        'run': str(uuid.uuid4())
    }
    return data
def process_data_ingestarticlezip(workflow_name, workflow_data):
    data = {
        'info': S3NotificationInfo.from_dict(workflow_data),
        'run': str(uuid.uuid4())
    }
    return data
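# The queue worker below sends {'workflow_name': ..., 'workflow_data': info.to_dict()} to the
# workflow starter queue. A minimal sketch, assuming the starter process resolves one of the
# process_data_* helpers above by lowercased workflow name before invoking the starter; the
# exact lookup mechanism is an assumption made for illustration.
def build_starter_data(workflow_name, workflow_data):
    helper_name = "process_data_" + workflow_name.lower()  # e.g. process_data_ingestarticlezip
    helper = globals().get(helper_name)
    if helper is None:
        # No special handling registered for this workflow; pass the raw data through
        return workflow_data
    return helper(workflow_name, workflow_data)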
def do_activity(self, data=None):
    """
    Do the work
    """
    run = data['run']

    if self.logger:
        self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))

    info = S3NotificationInfo.from_dict(data)
    storage_context = StorageContext(self.settings)
    session = Session(self.settings)

    filename_last_element = session.get_value(run, 'filename_last_element')
    # zip name contains version information for previously archived zip files
    article_structure = ArticleInfo(filename_last_element)
    article_id = article_structure.article_id
    session.store_value(run, 'article_id', article_id)
    session.store_value(run, 'file_name', info.file_name)

    if self.logger:
        self.logger.info("Expanding file %s" % info.file_name)

    version = session.get_value(run, 'version')
    status = article_structure.status
    if status is None or (status != 'vor' and status != 'poa'):
        self.logger.error("Name '%s' did not match expected pattern for status" % filename_last_element)
        return activity.activity.ACTIVITY_PERMANENT_FAILURE  # status could not be determined, exit workflow.

    article_version_id = article_id + '.' + version
    session.store_value(run, 'article_version_id', article_version_id)
    session.store_value(run, 'run', run)
    session.store_value(run, 'status', status)
    self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start",
                            "Starting expansion of article " + article_id)

    try:
        # download zip to temp folder
        tmp = self.get_tmp_dir()
        local_zip_file = self.open_file_from_tmp_dir(filename_last_element, mode='wb')
        storage_resource_origin = (self.settings.storage_provider + "://" +
                                   info.bucket_name + "/" + info.file_name)
        storage_context.get_resource_to_file(storage_resource_origin, local_zip_file)
        local_zip_file.close()

        # extract zip contents
        folder_name = path.join(article_version_id, run)
        content_folder = path.join(tmp, folder_name)
        makedirs(content_folder)
        with ZipFile(path.join(tmp, filename_last_element)) as zf:
            zf.extractall(content_folder)

        upload_filenames = []
        for f in listdir(content_folder):
            if isfile(join(content_folder, f)) and f[0] != '.' and not f[0] == '_':
                upload_filenames.append(f)

        self.check_filenames(upload_filenames)

        bucket_folder_name = article_version_id + '/' + run
        for filename in upload_filenames:
            source_path = path.join(content_folder, filename)
            dest_path = bucket_folder_name + '/' + filename
            storage_resource_dest = (self.settings.storage_provider + "://" +
                                     self.settings.publishing_buckets_prefix +
                                     self.settings.expanded_bucket + "/" + dest_path)
            storage_context.set_resource_from_filename(storage_resource_dest, source_path)

        self.clean_tmp_dir()

        session.store_value(run, 'expanded_folder', bucket_folder_name)
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end",
                                "Finished expansion of article " + article_id +
                                " for version " + version + " run " + str(run) +
                                " into " + bucket_folder_name)

    except Exception as e:
        self.logger.exception("Exception when expanding article")
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error",
                                "Error expanding article " + article_id + " message:" + e.message)
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

    return True
def work(ENV, flag):
    # Specify run environment settings
    settings = settings_lib.get_settings(ENV)

    # Log
    identity = "queue_worker_%s" % os.getpid()
    log_file = "queue_worker.log"
    # logFile = None
    logger = log.logger(log_file, settings.setLevel, identity)

    # Simple connect
    conn = boto.sqs.connect_to_region(settings.sqs_region,
                                      aws_access_key_id=settings.aws_access_key_id,
                                      aws_secret_access_key=settings.aws_secret_access_key)
    queue = conn.get_queue(settings.S3_monitor_queue)
    queue.set_message_class(S3SQSMessage)

    rules = load_rules()
    application = newrelic.agent.application()

    # Poll for an activity task indefinitely
    if queue is not None:
        while flag.green():

            logger.info('reading message')
            queue_message = queue.read(30)
            # TODO : check for more-than-once delivery
            # ( Dynamo conditional write? http://tinyurl.com/of3tmop )

            if queue_message is None:
                logger.info('no messages available')
            else:
                with newrelic.agent.BackgroundTask(application,
                                                   name=queue_message.notification_type,
                                                   group='queue_worker.py'):
                    logger.info('got message id: %s' % queue_message.id)
                    if queue_message.notification_type == 'S3Event':
                        info = S3NotificationInfo.from_S3SQSMessage(queue_message)
                        logger.info("S3NotificationInfo: %s", info.to_dict())
                        workflow_name = get_starter_name(rules, info)
                        if workflow_name is None:
                            logger.info("Could not handle file %s in bucket %s" %
                                        (info.file_name, info.bucket_name))
                            return False

                        # build message
                        message = {
                            'workflow_name': workflow_name,
                            'workflow_data': info.to_dict()
                        }

                        # send workflow initiation message
                        out_queue = conn.get_queue(settings.workflow_starter_queue)
                        m = Message()
                        m.set_body(json.dumps(message))
                        out_queue.write(m)

                        # cancel incoming message
                        logger.info("cancelling message")
                        queue.delete_message(queue_message)
                        logger.info("message cancelled")
                    else:
                        # TODO : log
                        pass
            time.sleep(10)

        logger.info("graceful shutdown")

    else:
        logger.error('error obtaining queue')
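# A minimal, hypothetical sketch of the rule lookup used above: load_rules() yields a list of
# rules and get_starter_name() picks the workflow whose bucket and filename pattern match the
# incoming notification. The rules file name, rule keys and regex-based matching are
# illustrative assumptions, not the project's confirmed format.
import re
import yaml

def load_rules(rules_file="rules.yaml"):  # hypothetical file name
    with open(rules_file) as f:
        return yaml.safe_load(f)

def get_starter_name(rules, info):
    for rule in rules:  # each rule assumed to carry 'bucket', 'file_pattern' and 'starter'
        if rule['bucket'] == info.bucket_name and re.match(rule['file_pattern'], info.file_name):
            return rule['starter']
    return None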
def do_activity(self, data=None):
    """
    Do the work
    """
    if self.logger:
        self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))

    info = S3NotificationInfo.from_dict(data)

    # set up required connections
    conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
    source_bucket = conn.get_bucket(info.bucket_name)
    dest_bucket = conn.get_bucket(self.settings.publishing_buckets_prefix +
                                  self.settings.expanded_bucket)

    session = Session(self.settings)

    article_id_match = re.match(ur'elife-(.*?)-', info.file_name)
    article_id = article_id_match.group(1)
    session.store_value(self.get_workflowId(), 'article_id', article_id)

    if self.logger:
        self.logger.info("Expanding file %s" % info.file_name)

    # extract any doi, version and updated date information from the filename
    version = None
    # zip name contains version information for previously archived zip files
    m = re.search(ur'-v([0-9]*?)[\.|-]', info.file_name)
    if m is not None:
        version = m.group(1)
    if version is None:
        version = self.get_next_version(article_id)
    if version == '-1':
        return False  # version could not be determined, exit workflow. Can't emit event as no version.

    status = None
    sm = re.search(ur'.*?-.*?-(.*?)-', info.file_name)
    if sm is not None:
        status = sm.group(1)
    if status is None:
        return False  # status could not be determined, exit workflow. Can't emit event as no version.

    run = str(uuid.uuid4())

    # store version for other activities in this workflow execution
    session.store_value(self.get_workflowId(), 'version', version)

    # TODO : extract and store updated date if supplied
    article_version_id = article_id + '.' + version
    session.store_value(self.get_workflowId(), 'article_version_id', article_version_id)
    session.store_value(self.get_workflowId(), 'run', run)
    session.store_value(self.get_workflowId(), 'status', status)
    self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start",
                            "Starting expansion of article " + article_id)
    self.set_monitor_property(self.settings, article_id, "article_id", article_id, "text")

    try:
        # download zip to temp folder
        tmp = self.get_tmp_dir()
        key = Key(source_bucket)
        key.key = info.file_name
        local_zip_file = self.open_file_from_tmp_dir(info.file_name, mode='wb')
        key.get_contents_to_file(local_zip_file)
        local_zip_file.close()

        bucket_folder_name = article_version_id + '/' + run
        folder_name = path.join(article_version_id, run)

        # extract zip contents
        content_folder = path.join(tmp, folder_name)
        makedirs(content_folder)
        with ZipFile(path.join(tmp, info.file_name)) as zf:
            zf.extractall(content_folder)

        # TODO : rename files (versions!)
        # TODO : edit xml and rename references

        upload_filenames = []
        for f in listdir(content_folder):
            if isfile(join(content_folder, f)) and f[0] != '.' and not f[0] == '_':
                upload_filenames.append(f)

        for filename in upload_filenames:
            source_path = path.join(content_folder, filename)
            dest_path = bucket_folder_name + '/' + filename
            k = Key(dest_bucket)
            k.key = dest_path
            k.set_contents_from_filename(source_path)

        session.store_value(self.get_workflowId(), 'expanded_folder', bucket_folder_name)
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end",
                                "Finished expansion of article " + article_id +
                                " for version " + version + " run " + str(run) +
                                " into " + bucket_folder_name)

    except Exception as e:
        self.logger.exception("Exception when expanding article")
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error",
                                "Error expanding article " + article_id + " message:" + e.message)
        return False

    return True