def test_ingest_article_zip_starter_(self, fake_boto_conn, fake_logger):
    """Smoke-test the ingest-article-zip starter.

    No assertions are made: the test passes as long as ``start`` returns
    without raising. ``fake_logger`` is accepted but not used here.
    """
    # Presumably ``fake_boto_conn`` is a patched mock, so calling it
    # hands back the fake connection.
    fake_boto_conn.return_value = FakeBotoConnection()

    start = self.stater_ingest_article_zip.start
    notification = S3NotificationInfo.from_dict(test_data.ingest_article_zip_data)
    start(settings=settings_mock, run=run_example, info=notification)
def do_activity(self, data=None):
    """
    Apply the session's version number to the expanded article files.

    Reads ``version``, ``article_id``, ``run`` and (when renaming)
    ``expanded_folder`` from the session, keyed by the workflow id. Files
    are renamed only when the notified file name carries no ``-v<digits>``
    marker and a version is available.

    :param data: notification dict, parsed with
        ``S3NotificationInfo.from_dict`` and logged as JSON.
    :returns: ``True`` whenever the method does not raise. A failure inside
        the rename step is logged and reported as an "error" monitor event,
        and ``True`` is still returned (NOTE(review): confirm callers really
        want success reported after an error).
    """
    self.expanded_bucket_name = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket

    info = S3NotificationInfo.from_dict(data)

    session = Session(self.settings)
    version = session.get_value(self.get_workflowId(), 'version')
    article_id = session.get_value(self.get_workflowId(), 'article_id')
    # Not used below, but kept on purpose: the concatenation raises before
    # any monitor event is emitted if either session value is not a string.
    article_version_id = article_id + '.' + version
    run = session.get_value(self.get_workflowId(), 'run')

    self.emit_monitor_event(self.settings, article_id, version, run, "Apply Version Number", "start",
                            "Starting applying version number to files for " + article_id)

    try:
        if self.logger:
            self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))

        # Do not rename files if a version number is in the file_name
        m = re.search(r'-v([0-9]*?)[\.|-]', info.file_name)

        if m is None and version is not None:
            expanded_folder_name = session.get_value(self.get_workflowId(), 'expanded_folder')
            # Bucket keys always use '/', whatever the local path separator is.
            bucket_folder_name = expanded_folder_name.replace(os.sep, '/')
            self.rename_article_s3_objects(bucket_folder_name, version)

        self.emit_monitor_event(self.settings, article_id, version, run, "Apply Version Number", "end",
                                "Finished applying version number to article " + article_id +
                                " for version " + version + " run " + str(run))

    except Exception as e:
        self.logger.exception("Exception when applying version number to article")
        # Report the error under this activity's own name (it was previously
        # mislabelled "Convert JATS"), and use str(e) because ``e.message``
        # is deprecated and does not exist on Python 3.
        self.emit_monitor_event(self.settings, article_id, version, run, "Apply Version Number", "error",
                                "Error in applying version number to files for " + article_id +
                                " message:" + str(e))

    return True
def do_activity(self, data=None):
    """
    Look up the version for an incoming article file and record it in the session.

    The last path element of the notified file name and the looked-up
    version are stored in the session under ``data['run']``. "start" and
    "end" monitor events and the "article-id" / "publication-status"
    monitor properties are emitted along the way.

    :param data: dict passed to ``S3NotificationInfo.from_dict``; also read
        for the ``'run'`` and ``'version_lookup_function'`` keys.
    :returns: ``activity.activity.ACTIVITY_SUCCESS`` when every step
        completes, ``activity.activity.ACTIVITY_PERMANENT_FAILURE`` if any
        step raises (including a file name that yields no article id).
    """
    try:
        info = S3NotificationInfo.from_dict(data)
        # Keep only what follows the last '/' (the whole name if there is none).
        filename = info.file_name.rsplit('/', 1)[-1]

        session = Session(self.settings)
        run = data['run']
        session.store_value(run, 'filename_last_element', filename)

        article_structure = ArticleInfo(filename)
        article_id = article_structure.article_id
        if article_id is None:
            self.logger.error("Name '%s' did not match expected pattern for article id" % filename)
            raise RuntimeError("article_structure.article_id is None. File pattern problem.")

        version = self.get_version(self.settings, article_structure, data['version_lookup_function'])
        session.store_value(run, 'version', version)

        start_message = " ".join(("Version Lookup for article", article_id, "version:", version))
        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "start", start_message)

        self.set_monitor_property(self.settings, article_id, "article-id", article_id, "text")
        self.set_monitor_property(self.settings, article_id, "publication-status",
                                  "publication in progress", "text", version=version)

        end_message = " ".join(("Finished Version Lookup for article", article_id, "version:", version))
        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "end", end_message)
    except Exception as e:
        self.logger.exception("Exception when trying to Lookup Version. Error: " + str(e))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

    return activity.activity.ACTIVITY_SUCCESS
def process_data_publishperfectarticle(workflow_name, workflow_data):
    """Wrap ``workflow_data`` as an ``info``/``run`` payload.

    :param workflow_name: accepted but not used.
    :param workflow_data: dict handed to ``S3NotificationInfo.from_dict``.
    :returns: dict with ``'info'`` (the parsed notification) and ``'run'``
        (a freshly generated UUID4 string).
    """
    notification = S3NotificationInfo.from_dict(workflow_data)
    run_id = str(uuid.uuid4())
    return {'info': notification, 'run': run_id}
def process_data_ingestarticlezip(workflow_name, workflow_data):
    """Wrap ``workflow_data`` as an ``info``/``run`` payload.

    :param workflow_name: accepted but not used.
    :param workflow_data: dict handed to ``S3NotificationInfo.from_dict``.
    :returns: dict with ``'info'`` (the parsed notification) and ``'run'``
        (a freshly generated UUID4 string).
    """
    return dict(
        info=S3NotificationInfo.from_dict(workflow_data),
        run=str(uuid.uuid4()),
    )
def do_activity(self, data=None):
    """
    Expand an article zip into the expanded bucket.

    Downloads the notified zip to the temp directory, extracts it, and
    uploads every regular top-level file whose name does not start with
    ``.`` or ``_`` to ``<article_id>.<version>/<run>/`` in the expanded
    bucket. ``article_id``, ``file_name``, ``article_version_id``, ``run``,
    ``status`` and ``expanded_folder`` are stored in the session under
    ``run``.

    :param data: dict providing the ``'run'`` key; also parsed with
        ``S3NotificationInfo.from_dict``.
    :returns: ``True`` on success;
        ``activity.activity.ACTIVITY_PERMANENT_FAILURE`` when the status is
        not ``'vor'`` or ``'poa'`` or when the expansion raises.
    """
    run = data['run']

    if self.logger:
        self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
    info = S3NotificationInfo.from_dict(data)

    storage_context = StorageContext(self.settings)
    session = Session(self.settings)

    filename_last_element = session.get_value(run, 'filename_last_element')
    # zip name contains version information for previously archived zip files
    article_structure = ArticleInfo(filename_last_element)
    article_id = article_structure.article_id
    session.store_value(run, 'article_id', article_id)
    session.store_value(run, 'file_name', info.file_name)

    if self.logger:
        self.logger.info("Expanding file %s" % info.file_name)

    version = session.get_value(run, 'version')

    status = article_structure.status
    if status not in ('vor', 'poa'):
        self.logger.error("Name '%s' did not match expected pattern for status" % filename_last_element)
        # status could not be determined, exit workflow.
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

    article_version_id = article_id + '.' + version
    session.store_value(run, 'article_version_id', article_version_id)
    session.store_value(run, 'run', run)
    session.store_value(run, 'status', status)
    self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start",
                            "Starting expansion of article " + article_id)

    try:
        # download zip to temp folder
        tmp = self.get_tmp_dir()
        local_zip_file = self.open_file_from_tmp_dir(filename_last_element, mode='wb')
        try:
            storage_resource_origin = (self.settings.storage_provider + "://" +
                                       info.bucket_name + "/" + info.file_name)
            storage_context.get_resource_to_file(storage_resource_origin, local_zip_file)
        finally:
            # Always release the handle, even if the download fails.
            local_zip_file.close()

        # extract zip contents
        folder_name = path.join(article_version_id, run)
        content_folder = path.join(tmp, folder_name)
        makedirs(content_folder)
        with ZipFile(path.join(tmp, filename_last_element)) as zf:
            zf.extractall(content_folder)

        # Only regular files are uploaded; names starting with '.' or '_' are skipped.
        upload_filenames = [
            f for f in listdir(content_folder)
            if isfile(join(content_folder, f)) and not f.startswith(('.', '_'))
        ]

        bucket_folder_name = article_version_id + '/' + run
        for filename in upload_filenames:
            source_path = path.join(content_folder, filename)
            dest_path = bucket_folder_name + '/' + filename
            storage_resource_dest = (self.settings.storage_provider + "://" +
                                     self.settings.publishing_buckets_prefix +
                                     self.settings.expanded_bucket + "/" + dest_path)
            storage_context.set_resource_from_filename(storage_resource_dest, source_path)

        self.clean_tmp_dir()

        session.store_value(run, 'expanded_folder', bucket_folder_name)
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end",
                                "Finished expansion of article " + article_id +
                                " for version " + version + " run " + str(run) +
                                " into " + bucket_folder_name)
    except Exception as e:
        self.logger.exception("Exception when expanding article")
        # str(e) instead of the deprecated ``e.message`` (absent on Python 3).
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error",
                                "Error expanding article " + article_id +
                                " message:" + str(e))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

    return True
def test_ingest_article_zip_starter_(self, fake_boto_conn, fake_logger):
    """Run the ingest-article-zip starter with a fake boto connection.

    There are no assertions; the test succeeds if ``start`` does not raise.
    ``fake_logger`` is accepted but left untouched.
    """
    # Presumably a patched mock: calling it yields the fake connection.
    fake_boto_conn.return_value = FakeBotoConnection()

    start_kwargs = {
        'settings': settings_mock,
        'run': run_example,
        'info': S3NotificationInfo.from_dict(test_data.ingest_article_zip_data),
    }
    self.stater_ingest_article_zip.start(**start_kwargs)
def do_activity(self, data=None):
    """
    Expand an article zip from its source S3 bucket into the expanded bucket.

    The article id, version and status are parsed from the notified file
    name; when the name carries no ``-v<digits>`` marker the version comes
    from ``self.get_next_version``. The zip is downloaded to the temp
    directory, extracted, and every regular top-level file whose name does
    not start with ``.`` or ``_`` is uploaded to
    ``<article_id>.<version>/<run>/``. ``article_id``, ``version``,
    ``article_version_id``, ``run``, ``status`` and ``expanded_folder`` are
    stored in the session under the workflow id.

    :param data: notification dict, parsed with
        ``S3NotificationInfo.from_dict``.
    :returns: ``True`` on success; ``False`` when the article id, version or
        status cannot be determined, or when the expansion raises.
    """
    if self.logger:
        self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
    info = S3NotificationInfo.from_dict(data)

    # set up required connections
    conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
    source_bucket = conn.get_bucket(info.bucket_name)
    dest_bucket = conn.get_bucket(self.settings.publishing_buckets_prefix + self.settings.expanded_bucket)
    session = Session(self.settings)

    article_id_match = re.match(r'elife-(.*?)-', info.file_name)
    if article_id_match is None:
        # Previously this fell through to ``.group(1)`` on None and raised
        # AttributeError; exit the workflow like the other undeterminable cases.
        if self.logger:
            self.logger.error("Name '%s' did not match expected pattern for article id" % info.file_name)
        return False
    article_id = article_id_match.group(1)
    session.store_value(self.get_workflowId(), 'article_id', article_id)

    if self.logger:
        self.logger.info("Expanding file %s" % info.file_name)

    # extract any doi, version and updated date information from the filename
    version = None
    # zip name contains version information for previously archived zip files
    m = re.search(r'-v([0-9]*?)[\.|-]', info.file_name)
    if m is not None:
        version = m.group(1)
    if version is None:
        version = self.get_next_version(article_id)
    if version == '-1':
        return False  # version could not be determined, exit workflow. Can't emit event as no version.

    sm = re.search(r'.*?-.*?-(.*?)-', info.file_name)
    # Bind ``status`` on both paths: it used to be left undefined when the
    # pattern did not match, so the check below raised UnboundLocalError.
    status = sm.group(1) if sm is not None else None
    if status is None:
        return False  # status could not be determined, exit workflow.

    run = str(uuid.uuid4())

    # store version for other activities in this workflow execution
    session.store_value(self.get_workflowId(), 'version', version)

    # TODO : extract and store updated date if supplied

    article_version_id = article_id + '.' + version
    session.store_value(self.get_workflowId(), 'article_version_id', article_version_id)
    session.store_value(self.get_workflowId(), 'run', run)
    session.store_value(self.get_workflowId(), 'status', status)
    self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start",
                            "Starting expansion of article " + article_id)
    self.set_monitor_property(self.settings, article_id, "article_id", article_id, "text")

    try:
        # download zip to temp folder
        tmp = self.get_tmp_dir()
        key = Key(source_bucket)
        key.key = info.file_name
        local_zip_file = self.open_file_from_tmp_dir(info.file_name, mode='wb')
        try:
            key.get_contents_to_file(local_zip_file)
        finally:
            # Always release the handle, even if the download fails.
            local_zip_file.close()

        bucket_folder_name = article_version_id + '/' + run
        folder_name = path.join(article_version_id, run)

        # extract zip contents
        content_folder = path.join(tmp, folder_name)
        makedirs(content_folder)
        with ZipFile(path.join(tmp, info.file_name)) as zf:
            zf.extractall(content_folder)

        # TODO : rename files (versions!)

        # TODO : edit xml and rename references

        # Only regular files are uploaded; names starting with '.' or '_' are skipped.
        upload_filenames = [
            f for f in listdir(content_folder)
            if isfile(join(content_folder, f)) and not f.startswith(('.', '_'))
        ]

        for filename in upload_filenames:
            source_path = path.join(content_folder, filename)
            dest_path = bucket_folder_name + '/' + filename
            k = Key(dest_bucket)
            k.key = dest_path
            k.set_contents_from_filename(source_path)

        session.store_value(self.get_workflowId(), 'expanded_folder', bucket_folder_name)
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end",
                                "Finished expansion of article " + article_id +
                                " for version " + version + " run " + str(run) +
                                " into " + bucket_folder_name)
    except Exception as e:
        self.logger.exception("Exception when expanding article")
        # str(e) instead of the deprecated ``e.message`` (absent on Python 3).
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error",
                                "Error expanding article " + article_id +
                                " message:" + str(e))
        return False

    return True
def process_data_ingestarticlezip(workflow_name, workflow_data):
    """Build the ``info``/``run`` payload from ``workflow_data``.

    :param workflow_name: accepted but not used.
    :param workflow_data: dict handed to ``S3NotificationInfo.from_dict``.
    :returns: dict with ``'info'`` (the parsed notification) and ``'run'``
        (a freshly generated UUID4 string).
    """
    payload = {}
    payload['info'] = S3NotificationInfo.from_dict(workflow_data)
    payload['run'] = str(uuid.uuid4())
    return payload
def process_data_publishperfectarticle(workflow_name, workflow_data):
    """Build the ``info``/``run`` payload from ``workflow_data``.

    :param workflow_name: accepted but not used.
    :param workflow_data: dict handed to ``S3NotificationInfo.from_dict``.
    :returns: dict with ``'info'`` (the parsed notification) and ``'run'``
        (a freshly generated UUID4 string).
    """
    info = S3NotificationInfo.from_dict(workflow_data)
    return {'info': info, 'run': str(uuid.uuid4())}
def do_activity(self, data=None):
    """
    Expand an article zip into the expanded bucket after checking its file names.

    Downloads the notified zip to the temp directory, extracts it, passes
    the list of regular top-level files (names not starting with ``.`` or
    ``_``) to ``self.check_filenames``, and uploads them to
    ``<article_id>.<version>/<run>/`` in the expanded bucket.
    ``article_id``, ``file_name``, ``article_version_id``, ``run``,
    ``status`` and ``expanded_folder`` are stored in the session under
    ``run``.

    :param data: dict providing the ``'run'`` key; also parsed with
        ``S3NotificationInfo.from_dict``.
    :returns: ``True`` on success;
        ``activity.activity.ACTIVITY_PERMANENT_FAILURE`` when the status is
        not ``'vor'`` or ``'poa'`` or when the expansion raises.
    """
    run = data['run']

    if self.logger:
        self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
    info = S3NotificationInfo.from_dict(data)

    storage_context = StorageContext(self.settings)
    session = Session(self.settings)

    filename_last_element = session.get_value(run, 'filename_last_element')
    # zip name contains version information for previously archived zip files
    article_structure = ArticleInfo(filename_last_element)
    article_id = article_structure.article_id
    session.store_value(run, 'article_id', article_id)
    session.store_value(run, 'file_name', info.file_name)

    if self.logger:
        self.logger.info("Expanding file %s" % info.file_name)

    version = session.get_value(run, 'version')

    status = article_structure.status
    if status not in ('vor', 'poa'):
        self.logger.error("Name '%s' did not match expected pattern for status" % filename_last_element)
        # status could not be determined, exit workflow.
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

    article_version_id = article_id + '.' + version
    session.store_value(run, 'article_version_id', article_version_id)
    session.store_value(run, 'run', run)
    session.store_value(run, 'status', status)
    self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start",
                            "Starting expansion of article " + article_id)

    try:
        # download zip to temp folder
        tmp = self.get_tmp_dir()
        local_zip_file = self.open_file_from_tmp_dir(filename_last_element, mode='wb')
        try:
            storage_resource_origin = (self.settings.storage_provider + "://" +
                                       info.bucket_name + "/" + info.file_name)
            storage_context.get_resource_to_file(storage_resource_origin, local_zip_file)
        finally:
            # Always release the handle, even if the download fails.
            local_zip_file.close()

        # extract zip contents
        folder_name = path.join(article_version_id, run)
        content_folder = path.join(tmp, folder_name)
        makedirs(content_folder)
        with ZipFile(path.join(tmp, filename_last_element)) as zf:
            zf.extractall(content_folder)

        # Only regular files are uploaded; names starting with '.' or '_' are skipped.
        upload_filenames = [
            f for f in listdir(content_folder)
            if isfile(join(content_folder, f)) and not f.startswith(('.', '_'))
        ]
        # Runs before anything is uploaded; an exception raised here is
        # handled by the except clause below.
        self.check_filenames(upload_filenames)

        bucket_folder_name = article_version_id + '/' + run
        for filename in upload_filenames:
            source_path = path.join(content_folder, filename)
            dest_path = bucket_folder_name + '/' + filename
            storage_resource_dest = (self.settings.storage_provider + "://" +
                                     self.settings.publishing_buckets_prefix +
                                     self.settings.expanded_bucket + "/" + dest_path)
            storage_context.set_resource_from_filename(storage_resource_dest, source_path)

        self.clean_tmp_dir()

        session.store_value(run, 'expanded_folder', bucket_folder_name)
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end",
                                "Finished expansion of article " + article_id +
                                " for version " + version + " run " + str(run) +
                                " into " + bucket_folder_name)
    except Exception as e:
        self.logger.exception("Exception when expanding article")
        # str(e) instead of the deprecated ``e.message`` (absent on Python 3).
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error",
                                "Error expanding article " + article_id +
                                " message:" + str(e))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

    return True