# NOTE: the snippets below are do_activity() methods collected from several
# workflow activity classes. The shared imports they rely on are listed
# here; the project-local module paths (provider.*, S3utility.*,
# jats_scraper, and the ConvertJATS activity class used further down) are a
# best guess inferred from how the names are used, not confirmed against
# the original source tree.
import base64
import datetime
import json
import os
import re
import time
from os import listdir, makedirs, path
from os.path import isfile, join
from zipfile import ZipFile

import boto.sqs
import requests
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from boto.sqs.message import Message

import activity
from jats_scraper import jats_scraper
from provider import article_structure, glencoe_check, image_conversion, lax_provider
from provider.article_structure import ArticleInfo
from provider.execution_context import Session
from provider.storage_provider import StorageContext
from S3utility.s3_notification_info import S3NotificationInfo


def do_activity(self, data=None):
    try:
        session = Session(self.settings)
        version = session.get_value(data['run'], 'version')
        filename = session.get_value(data['run'], 'filename_last_element')
        article_structure = ArticleInfo(filename)
        version_date, error = self.get_version(self.settings, article_structure,
                                               article_structure.article_id, version)
        if error is not None:
            self.logger.error(error)
            self.emit_monitor_event(self.settings, article_structure.article_id, version,
                                    data['run'], self.pretty_name, "error",
                                    " ".join(("Error looking up version for article",
                                              article_structure.article_id,
                                              "message:", error)))
            return activity.activity.ACTIVITY_PERMANENT_FAILURE
        self.emit_monitor_event(self.settings, article_structure.article_id, version,
                                data['run'], self.pretty_name, "end",
                                " ".join(("Finished Version Lookup for article",
                                          article_structure.article_id,
                                          "version:", version)))
        session.store_value(data['run'], 'update_date', version_date)
        return activity.activity.ACTIVITY_SUCCESS
    except Exception as e:
        # NOTE: article_structure and version may be unbound here if the
        # failure happened before they were assigned
        self.logger.exception("Exception when trying to look up next version")
        self.emit_monitor_event(self.settings, article_structure.article_id, version,
                                data['run'], self.pretty_name, "error",
                                " ".join(("Error looking up version for article",
                                          article_structure.article_id,
                                          "message:", str(e))))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

def do_activity(self, data=None):
    if self.logger:
        self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
    try:
        if 'standalone' in data and data['standalone']:
            article_id = data['article_id']
            poa = data['standalone_is_poa']
            (start_msg, end_msg, result) = self.get_events(article_id, poa,
                                                           version=None, run=None)
            self.logger.info(end_msg[6])
            return result

        run = data['run']
        session = Session(self.settings)
        article_id = session.get_value(run, 'article_id')
        version = session.get_value(run, 'version')
        file_name = session.get_value(run, 'file_name')
        poa = "poa" in file_name
        (start_msg, end_msg, success) = self.get_events(article_id, poa, version, run)
        self.emit_monitor_event(*start_msg)
        self.emit_monitor_event(*end_msg)
        return success
    except Exception as e:
        self.logger.exception("Error starting Copy Glencoe Still Images activity")
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

def do_activity(self, data=None):
    session = Session(self.settings)
    version = session.get_value(self.get_workflowId(), 'version')
    article_id = session.get_value(self.get_workflowId(), 'article_id')
    run = session.get_value(self.get_workflowId(), 'run')
    self.emit_monitor_event(self.settings, article_id, version, run,
                            "Set Publication Status", "start",
                            "Starting setting of publish status for " + article_id)
    try:
        conn = S3Connection(self.settings.aws_access_key_id,
                            self.settings.aws_secret_access_key)
        eif_filename = session.get_value(self.get_workflowId(), 'eif_filename')
        data = self.get_eif(conn, eif_filename)
        publication_status = self.get_publication_status(data)
        data['publish'] = publication_status
        self.update_bucket(conn, data, eif_filename)
        self.emit_monitor_event(self.settings, article_id, version, run,
                                "Set Publication Status", "end",
                                "Ending setting of publish status for " + article_id)
    except Exception as e:
        self.logger.exception("Exception when setting publication status for " + article_id)
        self.emit_monitor_event(self.settings, article_id, version, run,
                                "Set Publication Status", "error",
                                "Error setting publication status for article " + article_id +
                                " message:" + e.message)
        return False
    return True

def do_activity(self, data=None):
    try:
        if self.logger:
            self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
        run = data['run']
        session = Session(self.settings)
        article_id = session.get_value(run, 'article_id')
        version = session.get_value(run, 'version')
    except Exception as e:
        self.logger.exception(str(e))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

    try:
        storage_context = StorageContext(self.settings)
        bucket = self.settings.publishing_buckets_prefix + self.settings.ppp_cdn_bucket
        images_resource = "".join((self.settings.storage_provider, "://", bucket,
                                   "/", article_id))
        files_in_bucket = storage_context.list_resources(images_resource)
        original_figures = article_structure.get_figures_for_iiif(files_in_bucket)
        iiif_path_for_article = self.settings.iiif_resolver.replace('{article_id}', article_id)
        results = self.retrieve_endpoints_check(original_figures, iiif_path_for_article)
        bad_images = list(filter(lambda x: x[0] == False, results))
        if len(bad_images) > 0:
            # report endpoints that did not work
            self.emit_monitor_event(self.settings, article_id, version, run,
                                    self.pretty_name, "error",
                                    "Some images are not available through the IIIF endpoint: " +
                                    str(bad_images))
            return activity.activity.ACTIVITY_PERMANENT_FAILURE
        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "end",
                                "Finished Verification. All endpoints work. Article: " +
                                article_id)
        return activity.activity.ACTIVITY_SUCCESS
    except Exception as e:
        self.logger.exception(str(e))
        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "error",
                                "An error occurred when checking IIIF endpoint. Article " +
                                article_id + '; message: ' + str(e))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

def do_activity(self, data):
    session = Session(self.settings)
    eif_location = session.get_value(data["run"], "eif_location")
    self.emit_monitor_event(self.settings, data["article_id"], data["version"],
                            data["run"], self.pretty_name, "start",
                            "Starting to set EIF to publish")
    try:
        if not isinstance(eif_location, basestring):
            self.logger.error(self.pretty_name + " error. eif_location must be a string")
            raise Exception("eif_location not available")
        storage_context = StorageContext(self.settings)
        eif_origin = "".join((self.settings.storage_provider, "://",
                              self.settings.publishing_buckets_prefix + self.settings.eif_bucket,
                              "/", eif_location))
    except Exception as e:
        self.emit_monitor_event(self.settings, data["article_id"], data["version"],
                                data["run"], self.pretty_name, "error", e.message)
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

    success, error = self.set_eif_to_publish(storage_context, eif_origin)
    if success:
        self.emit_monitor_event(self.settings, data["article_id"], data["version"],
                                data["run"], self.pretty_name, "end",
                                "Finished setting EIF to publish")
        return activity.activity.ACTIVITY_SUCCESS

    self.logger.error(error)
    self.emit_monitor_event(self.settings, data["article_id"], data["version"],
                            data["run"], self.pretty_name, "error", error)
    return activity.activity.ACTIVITY_PERMANENT_FAILURE

def do_activity(self, data=None):
    if self.logger:
        self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
    run = data['run']
    session = Session(self.settings)
    version = session.get_value(run, 'version')
    article_id = session.get_value(run, 'article_id')
    self.emit_monitor_event(self.settings, article_id, version, run,
                            self.pretty_name, "start",
                            "Starting conversion of images to JPG for article " + article_id)
    try:
        expanded_folder_name = session.get_value(run, 'expanded_folder')
        expanded_folder_bucket = (self.settings.publishing_buckets_prefix +
                                  self.settings.expanded_bucket)
        storage_provider = self.settings.storage_provider + "://"
        orig_resource = storage_provider + expanded_folder_bucket + "/" + expanded_folder_name
        storage_context = StorageContext(self.settings)
        files_in_bucket = storage_context.list_resources(orig_resource)
        figures = filter(article_structure.article_figure, files_in_bucket)
        # "download" is not an IIIF asset but is currently kept for
        # compatibility; it may become obsolete in future formats
        formats = {"Original": {"sources": "tif", "format": "jpg", "download": "yes"}}
        for file_name in figures:
            figure_resource = orig_resource + "/" + file_name
            file_path = self.get_tmp_dir() + os.sep + file_name
            file_pointer = storage_context.get_resource_to_file_pointer(figure_resource,
                                                                        file_path)
            cdn_bucket_name = (self.settings.publishing_buckets_prefix +
                               self.settings.ppp_cdn_bucket)
            cdn_resource_path = storage_provider + cdn_bucket_name + "/" + article_id + "/"
            publish_locations = [cdn_resource_path]
            image_conversion.generate_images(self.settings, formats, file_pointer,
                                             article_structure.ArticleInfo(file_name),
                                             publish_locations, self.logger)
        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "end",
                                "Finished converting images for " + article_id + ": " +
                                str(len(figures)) + " images processed")
        return activity.activity.ACTIVITY_SUCCESS
    except Exception as e:
        self.logger.exception("An error occurred during " + self.pretty_name)
        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "error",
                                "Error converting images to JPG for article " + article_id +
                                " message:" + e.message)
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

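# A minimal sketch of the `formats` mapping consumed by
# image_conversion.generate_images() above, inferred from the single
# "Original" entry this activity uses; the key meanings are assumptions,
# and any additional entries would be hypothetical.
EXAMPLE_FORMATS = {
    "Original": {
        "sources": "tif",   # source extension to convert from
        "format": "jpg",    # output format to generate
        "download": "yes",  # also publish a "-download" variant (not an IIIF asset)
    },
}
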
def do_activity(self, data=None): """ Do the work """ if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) self.expanded_bucket_name = (self.settings.publishing_buckets_prefix + self.settings.expanded_bucket) self.crossref_bucket_name = (self.settings.publishing_buckets_prefix + self.settings.poa_packaging_bucket) session = Session(self.settings) run = data['run'] version = session.get_value(run, 'version') article_id = session.get_value(run, 'article_id') expanded_folder_name = session.get_value(run, 'expanded_folder') conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) bucket = conn.get_bucket(self.expanded_bucket_name) self.emit_monitor_event(self.settings, article_id, version, run, "Schedule Crossref", "start", "Starting scheduling of crossref deposit for " + article_id) try: (xml_key, xml_filename) = ConvertJATS.get_article_xml_key(bucket, expanded_folder_name) # Rename the XML file to match what is used already new_key_name = self.new_crossref_xml_name( prefix=self.crossref_outbox_folder, journal='elife', article_id=str(article_id).zfill(5)) self.copy_article_xml_to_crossref_outbox( new_key_name=new_key_name, source_bucket_name=self.expanded_bucket_name, old_key_name=xml_key.name) self.emit_monitor_event(self.settings, article_id, version, run, "Schedule Crossref", "end", "Finished scheduling of crossref deposit " + article_id + " for version " + version + " run " + str(run)) except Exception as e: self.logger.exception("Exception when scheduling crossref") self.emit_monitor_event(self.settings, article_id, version, run, "Schedule Crossref", "error", "Error scheduling crossref " + article_id + " message:" + e.message) return False return True
def do_activity(self, data=None): """ Do the work """ if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) run = data['run'] session = Session(self.settings) article_id = session.get_value(run, 'article_id') version = session.get_value(run, 'version') self.emit_monitor_event(self.settings, article_id, version, run, self.pretty_name, "start", "Starting Glencoe video check for " + article_id) try: expanded_folder = session.get_value(run, 'expanded_folder') if expanded_folder is None: raise RuntimeError("No session value for expanded folder") expanded_bucket = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket self.logger.info("expanded_bucket: " + expanded_bucket) xml_filename = lax_provider.get_xml_file_name(self.settings, expanded_folder, expanded_bucket, version) if xml_filename is None: raise RuntimeError("No xml_filename found.") xml_origin = "".join((self.settings.storage_provider, "://", expanded_bucket, "/", expanded_folder + '/' + xml_filename)) storage_context = StorageContext(self.settings) xml_content = storage_context.get_resource_as_string(xml_origin) if glencoe_check.has_videos(xml_content): glencoe_check.validate_sources(glencoe_check.metadata(glencoe_check.check_msid(article_id), self.settings)) self.emit_monitor_event(self.settings, article_id, version, run, self.pretty_name, "end", "Finished Verification. Glencoe is available. Article: " + article_id) return True self.emit_monitor_event(self.settings, article_id, version, run, self.pretty_name, "end", "Finished Verification. No Glencoe media tags found in xml. " "Article: " + article_id) return True except AssertionError as err: self.logger.info(err) self.emit_monitor_event(self.settings, article_id, version, run, self.pretty_name, "error", "Glencoe video is not available for article " + article_id + '; message: ' + str(err)) time.sleep(60) return activity.activity.ACTIVITY_TEMPORARY_FAILURE except Exception as e: self.logger.exception(str(e)) self.emit_monitor_event(self.settings, article_id, version, run, self.pretty_name, "error", "An error occurred when checking for Glencoe video. Article " + article_id + '; message: ' + str(e)) return activity.activity.ACTIVITY_PERMANENT_FAILURE
def do_activity(self, data=None):
    try:
        self.expanded_bucket_name = (self.settings.publishing_buckets_prefix +
                                     self.settings.expanded_bucket)
        run = data['run']
        session = Session(self.settings)
        version = session.get_value(run, 'version')
        article_id = session.get_value(run, 'article_id')
        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "start",
                                "Starting applying version number to files for " + article_id)
    except Exception as e:
        self.logger.exception(str(e))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

    try:
        if self.logger:
            self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
        if version is None:
            self.emit_monitor_event(self.settings, article_id, version, run,
                                    self.pretty_name, "error",
                                    "Error in applying version number to files for " +
                                    article_id + " message: No version available")
            return activity.activity.ACTIVITY_PERMANENT_FAILURE
        expanded_folder_name = session.get_value(run, 'expanded_folder')
        bucket_folder_name = expanded_folder_name.replace(os.sep, '/')
        self.rename_article_s3_objects(bucket_folder_name, version)
        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "end",
                                "Finished applying version number to article " + article_id +
                                " for version " + version + " run " + str(run))
    except Exception as e:
        self.logger.exception(str(e))
        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "error",
                                "Error in applying version number to files for " + article_id +
                                " message:" + str(e.message))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE
    return activity.activity.ACTIVITY_SUCCESS

def do_activity(self, data=None): """ Do the work """ if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) run = data['run'] session = Session(self.settings) version = session.get_value(run, 'version') article_id = session.get_value(run, 'article_id') self.emit_monitor_event( self.settings, article_id, version, run, "Resize Images", "start", "Starting submission resize of images for article " + article_id) try: expanded_folder_name = session.get_value(run, 'expanded_folder') cdn_path = article_id if self.logger: self.logger.info("Converting images for folder %s" % expanded_folder_name) # get information on files in the expanded article bucket for notified zip file bucket_folder_name = expanded_folder_name bucket, file_infos = self.get_file_infos(bucket_folder_name) image_count = 0 for file_info in file_infos: image_count += 1 key = bucket.get_key(file_info.key) # see : http://stackoverflow.com/questions/9954521/s3-boto-list-keys-sometimes-returns-directory-key if not key.name.endswith("/"): # process each key in the folder self.process_key(key, cdn_path) self.emit_monitor_event( self.settings, article_id, version, run, "Resize Images", "end", "Finished converting images for " + article_id + ": " + str(image_count) + " images processed ") self.clean_tmp_dir() except Exception as e: self.logger.exception("Exception when resizing images") self.emit_monitor_event( self.settings, article_id, version, run, "Resize Images", "error", "Error resizing images for article" + article_id + " message:" + e.message) return activity.activity.ACTIVITY_PERMANENT_FAILURE return activity.activity.ACTIVITY_SUCCESS
def do_activity(self, data=None):
    run = data['run']
    session = Session(self.settings)
    version = session.get_value(run, 'version')
    article_id = session.get_value(run, 'article_id')
    self.emit_monitor_event(self.settings, article_id, version, run,
                            self.pretty_name, "start",
                            "Depositing Ingest assets for " + article_id)
    try:
        expanded_folder_name = session.get_value(run, 'expanded_folder')
        expanded_folder_bucket = (self.settings.publishing_buckets_prefix +
                                  self.settings.expanded_bucket)
        cdn_bucket_name = self.settings.publishing_buckets_prefix + self.settings.ppp_cdn_bucket
        storage_context = StorageContext(self.settings)
        storage_provider = self.settings.storage_provider + "://"
        orig_resource = storage_provider + expanded_folder_bucket + "/" + expanded_folder_name
        files_in_bucket = storage_context.list_resources(orig_resource)
        pre_ingest_assets = article_structure.pre_ingest_assets(files_in_bucket)
        for file_name in pre_ingest_assets:
            orig_resource = (storage_provider + expanded_folder_bucket + "/" +
                             expanded_folder_name + "/" + file_name)
            dest_resource = (storage_provider + cdn_bucket_name + "/" +
                             article_id + "/" + file_name)
            storage_context.copy_resource(orig_resource, dest_resource)
            if self.logger:
                self.logger.info("Uploaded file %s to %s" % (file_name, cdn_bucket_name))
        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "end",
                                "Deposited Ingest assets for article " + article_id)
        return activity.activity.ACTIVITY_SUCCESS
    except Exception as e:
        self.logger.exception("Exception when Depositing Ingest assets")
        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "error",
                                "Error depositing Ingest assets for article " + article_id +
                                " message:" + e.message)
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

def do_activity(self, data=None): """ Do the work """ if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) run = data['run'] session = Session(self.settings) version = session.get_value(run, 'version') article_id = session.get_value(run, 'article_id') self.emit_monitor_event(self.settings, article_id, version, run, "Resize Images", "start", "Starting submission resize of images for article " + article_id) try: expanded_folder_name = session.get_value(run, 'expanded_folder') cdn_path = article_id if self.logger: self.logger.info("Converting images for folder %s" % expanded_folder_name) # get information on files in the expanded article bucket for notified zip file bucket_folder_name = expanded_folder_name bucket, file_infos = self.get_file_infos(bucket_folder_name) image_count = 0 for file_info in file_infos: image_count += 1 key = bucket.get_key(file_info.key) # see : http://stackoverflow.com/questions/9954521/s3-boto-list-keys-sometimes-returns-directory-key if not key.name.endswith("/"): # process each key in the folder self.process_key(key, cdn_path) self.emit_monitor_event(self.settings, article_id, version, run, "Resize Images", "end", "Finished converting images for " + article_id + str(image_count) + " images processed ") self.clean_tmp_dir() except Exception as e: self.logger.exception("Exception when resizing images") self.emit_monitor_event(self.settings, article_id, version, run, "Resize Images", "error", "Error resizing images for article" + article_id + " message:" + e.message) return False return True
def do_activity(self, data=None): """ Do the work """ session = Session(self.settings) version = session.get_value(self.get_workflowId(), 'version') article_id = session.get_value(self.get_workflowId(), 'article_id') run = session.get_value(self.get_workflowId(), 'run') self.emit_monitor_event(self.settings, article_id, version, run, "Deposit assets", "start", "Depositing assets for " + article_id) try: conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) expanded_folder_name = session.get_value(self.get_workflowId(), 'expanded_folder') expanded_folder_bucket = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket expanded_bucket = conn.get_bucket(expanded_folder_bucket) cdn_bucket_name = self.settings.publishing_buckets_prefix + self.settings.ppp_cdn_bucket no_download_extensions = [x.strip() for x in self.settings.no_download_extensions.split(',')] keys = self.get_keys(expanded_bucket, expanded_folder_name) for key in keys: (file_key, file_name) = key file_key.copy(cdn_bucket_name, article_id + "/" + file_name) if self.logger: self.logger.info("Uploaded key %s to %s" % (file_name, cdn_bucket_name)) file_name_no_extension, extension = file_name.rsplit('.', 1) if extension not in no_download_extensions: download_metadata = file_key.metadata download_metadata['Content-Disposition'] = str( "Content-Disposition: attachment; filename=" + file_name + ";") file_key.copy(cdn_bucket_name, article_id + "/" + file_name_no_extension + "-download." + extension, metadata=download_metadata) self.emit_monitor_event(self.settings, article_id, version, run, "Deposit assets", "end", "Deposited assets for article " + article_id) except Exception as e: self.logger.exception("Exception when Depositing assets") self.emit_monitor_event(self.settings, article_id, version, run, "Deposit assets", "error", "Error depositing assets for article " + article_id + " message:" + e.message) return False return True
def do_activity(self, data=None): """ Do the work """ self.expanded_bucket_name = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket info = S3NotificationInfo.from_dict(data) session = Session(self.settings) version = session.get_value(self.get_workflowId(), 'version') article_id = session.get_value(self.get_workflowId(), 'article_id') article_version_id = article_id + '.' + version run = session.get_value(self.get_workflowId(), 'run') self.emit_monitor_event(self.settings, article_id, version, run, "Apply Version Number", "start", "Starting applying version number to files for " + article_id) try: if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) # Do not rename files if a version number is in the file_name m = re.search(ur'-v([0-9]*?)[\.|-]', info.file_name) if m is not None: # Nothing to do pass elif m is None and version is not None: expanded_folder_name = session.get_value(self.get_workflowId(), 'expanded_folder') bucket_folder_name = expanded_folder_name.replace(os.sep, '/') self.rename_article_s3_objects(bucket_folder_name, version) self.emit_monitor_event(self.settings, article_id, version, run, "Apply Version Number", "end", "Finished applying version number to article " + article_id + " for version " + version + " run " + str(run)) except Exception as e: self.logger.exception("Exception when applying version number to article") self.emit_monitor_event(self.settings, article_id, version, run, "Convert JATS", "error", "Error in applying version number to files for " + article_id + " message:" + e.message) return True
def get_workflow_data(self, data):
    if "publication_data" in data:
        publication_data = json.loads(base64.decodestring(data['publication_data']))
        workflow_data = publication_data['workflow_data']
        return workflow_data
    # This block was added because, during silent corrections, there is no
    # opportunity to get the data from the previous workflow (PreparePostEIF
    # sets the data, and in silent corrections it is part of the same
    # workflow). Currently data can only be mutated and passed between
    # workflows, not between activities; improving this needs research on
    # AWS SWF. The block also works outside silent corrections, where it
    # simply overrides the setting with the same data.
    session = Session(self.settings)
    eif_location = session.get_value(data['run'], 'eif_location')
    if eif_location is not None:
        data['eif_location'] = eif_location
    return data

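# A minimal sketch of the round trip that get_workflow_data() decodes,
# mirroring how the Post EIF activity below stores its follow-on message
# with base64.encodestring(json.dumps(message)) under "_publication-data";
# the field values here are illustrative only.
example_message = {
    'workflow_name': 'PostPerfectPublication',
    'workflow_data': {'article_id': '00353', 'version': '1', 'run': 'example-run-id'},
}
encoded = base64.encodestring(json.dumps(example_message))
decoded = json.loads(base64.decodestring(encoded))
assert decoded['workflow_data']['article_id'] == '00353'
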
def do_activity(self, data=None):
    if self.logger:
        self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
    run = data["run"]
    session = Session(self.settings)
    data['version'] = session.get_value(run, 'version')
    data['article_id'] = session.get_value(run, 'article_id')
    data['status'] = session.get_value(run, 'status')
    data['expanded_folder'] = session.get_value(run, 'expanded_folder')
    data['update_date'] = session.get_value(run, 'update_date')
    queue_connection_settings = {
        "sqs_region": self.settings.sqs_region,
        "aws_access_key_id": self.settings.aws_access_key_id,
        "aws_secret_access_key": self.settings.aws_secret_access_key}
    (message, queue, start_event, end_event, end_event_details, exception) = \
        self.get_message_queue(data, self.settings.consider_Lax_elife_2_0)
    self.emit_monitor_event(*start_event)
    if end_event == "error":
        self.logger.exception("Exception when Preparing Ingest for Lax. Details: %s", exception)
        return activity.activity.ACTIVITY_PERMANENT_FAILURE
    self.write_message(queue_connection_settings, queue, message)
    self.emit_monitor_event(*end_event_details)
    return activity.activity.ACTIVITY_SUCCESS

def do_activity(self, data=None):
    try:
        info = S3NotificationInfo.from_dict(data)
        filename = info.file_name[info.file_name.rfind('/') + 1:]
        session = Session(self.settings)
        session.store_value(data['run'], 'filename_last_element', filename)
        article_structure = ArticleInfo(filename)
        if article_structure.article_id is None:
            self.logger.error("Name '%s' did not match expected pattern for article id" %
                              filename)
            raise RuntimeError("article_structure.article_id is None. File pattern problem.")
        version = self.get_version(self.settings, article_structure,
                                   data['version_lookup_function'])
        session.store_value(data['run'], 'version', version)
        article_id = article_structure.article_id
        self.emit_monitor_event(self.settings, article_id, version, data['run'],
                                self.pretty_name, "start",
                                " ".join(("Version Lookup for article", article_id,
                                          "version:", version)))
        self.set_monitor_property(self.settings, article_id, "article-id",
                                  article_id, "text")
        self.set_monitor_property(self.settings, article_id, "publication-status",
                                  "publication in progress", "text", version=version)
        self.emit_monitor_event(self.settings, article_structure.article_id, version,
                                data['run'], self.pretty_name, "end",
                                " ".join(("Finished Version Lookup for article",
                                          article_structure.article_id,
                                          "version:", version)))
        return activity.activity.ACTIVITY_SUCCESS
    except Exception as e:
        self.logger.exception("Exception when trying to Lookup Version. Error: " + str(e))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE

def do_activity(self, data=None): """ Do the work """ session = Session(self.settings) version = session.get_value(self.get_workflowId(), 'version') article_id = session.get_value(self.get_workflowId(), 'article_id') run = session.get_value(self.get_workflowId(), 'run') self.emit_monitor_event(self.settings, article_id, version, run, "Deposit assets", "start", "Depositing assets for " + article_id) try: conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) expanded_folder_name = session.get_value(self.get_workflowId(), 'expanded_folder') expanded_folder_bucket = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket expanded_bucket = conn.get_bucket(expanded_folder_bucket) cdn_bucket_name = self.settings.publishing_buckets_prefix + self.settings.ppp_cdn_bucket keys = self.get_keys(expanded_bucket, expanded_folder_name) for key in keys: (file_key, file_name) = key file_key.copy(cdn_bucket_name, article_id + "/" + file_name) if self.logger: self.logger.info("Uploaded key %s to %s" % (file_name, cdn_bucket_name)) except Exception as e: self.logger.exception("Exception when Depositing assets") self.emit_monitor_event(self.settings, article_id, version, run, "Deposit assets", "error", "Error depositing assets for article " + article_id + " message:" + e.message) return False return True
def do_activity(self, data=None): """ Do the work """ if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) session = Session(self.settings) version = session.get_value(self.get_workflowId(), 'version') article_id = session.get_value(self.get_workflowId(), 'article_id') run = session.get_value(self.get_workflowId(), 'run') self.emit_monitor_event(self.settings, article_id, version, run, "Post EIF", "start", "Starting submission of article EIF " + article_id) try: eif_filename = session.get_value(self.get_workflowId(), 'eif_filename') eif_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket if self.logger: self.logger.info("Posting file %s" % eif_filename) conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) bucket = conn.get_bucket(eif_bucket) key = Key(bucket) key.key = eif_filename json_output = key.get_contents_as_string() destination = self.settings.drupal_EIF_endpoint headers = {'content-type': 'application/json'} r = requests.post(destination, data=json_output, headers=headers) self.logger.info("POST response was %s" % str(r.status_code)) self.emit_monitor_event(self.settings, article_id, version, run, "Post EIF", "start", "Finish submission of article " + article_id + " for version " + str(version) + " run " + str(run) + " the response status " "was " + str(r.status_code)) if r.status_code == 200: # TODO : article path will at some point be available in the respose article_path = session.get_value(self.get_workflowId(), 'article_path') self.set_monitor_property(self.settings, article_id, 'path', article_path, 'text') published = r.json().get('publish') if published == "1": self.set_monitor_property(self.settings, article_id, 'publication_status', 'published', "text") # initiate post-publication workflow # assemble data expanded_folder = session.get_value(self.get_workflowId(), 'expanded_folder') status = session.get_value(self.get_workflowId(), 'status') update_date = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') # TODO: need to replace line above with this once its in the session # update_date = session.get_value(self.get_workflowId(), 'update_date') sqs_conn = boto.sqs.connect_to_region(self.settings.sqs_region, aws_access_key_id=self.settings.aws_access_key_id, aws_secret_access_key=self.settings.aws_secret_access_key) follow_on_data = { 'article_id': article_id, 'version': version, 'expanded_folder': expanded_folder, 'update_date': update_date, 'run': run, 'status': status, 'eif_location': eif_filename, } message = { 'workflow_name': 'PostPerfectPublication', 'workflow_data': follow_on_data } out_queue = sqs_conn.get_queue(self.settings.workflow_starter_queue) m = Message() m.set_body(json.dumps(message)) out_queue.write(m) else: self.set_monitor_property(self.settings, article_id, 'publication_status', 'ready', "text") else: self.emit_monitor_event(self.settings, article_id, version, run, "Post EIF", "error", "Website ingest returned an error code: " + str(r.status_code)) return False self.emit_monitor_event(self.settings, article_id, version, run, "Post EIF", "end", "Finished submitting EIF for article " + article_id + " status was " + str(r.status_code)) self.emit_monitor_event(self.settings, article_id, version, run, "Post EIF", "start", "Finish submission of article " + article_id + " for version " + str(version) + " run " + str(run) + " the response status ") except Exception as e: self.logger.exception("Exception when submitting article EIF") 
self.emit_monitor_event(self.settings, article_id, version, run, "Post EIF", "error", "Error submitting EIF For article" + article_id + " message:" + str(e.message)) return False return True
def do_activity(self, data=None): """ Do the work """ run = data['run'] if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) info = S3NotificationInfo.from_dict(data) storage_context = StorageContext(self.settings) session = Session(self.settings) filename_last_element = session.get_value(run, 'filename_last_element') # zip name contains version information for previously archived zip files article_structure = ArticleInfo(filename_last_element) article_id = article_structure.article_id session.store_value(run, 'article_id', article_id) session.store_value(run, 'file_name', info.file_name) if self.logger: self.logger.info("Expanding file %s" % info.file_name) version = session.get_value(run, 'version') status = article_structure.status if status is None or (status != 'vor' and status != 'poa'): self.logger.error("Name '%s' did not match expected pattern for status" % filename_last_element) return activity.activity.ACTIVITY_PERMANENT_FAILURE # status could not be determined, exit workflow. article_version_id = article_id + '.' + version session.store_value(run, 'article_version_id', article_version_id) session.store_value(run, 'run', run) session.store_value(run, 'status', status) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start", "Starting expansion of article " + article_id) try: # download zip to temp folder tmp = self.get_tmp_dir() local_zip_file = self.open_file_from_tmp_dir(filename_last_element, mode='wb') storage_resource_origin = self.settings.storage_provider + "://" + info.bucket_name + "/" + info.file_name storage_context.get_resource_to_file(storage_resource_origin, local_zip_file) local_zip_file.close() # extract zip contents folder_name = path.join(article_version_id, run) content_folder = path.join(tmp, folder_name) makedirs(content_folder) with ZipFile(path.join(tmp, filename_last_element)) as zf: zf.extractall(content_folder) upload_filenames = [] for f in listdir(content_folder): if isfile(join(content_folder, f)) and f[0] != '.' and not f[0] == '_': upload_filenames.append(f) bucket_folder_name = article_version_id + '/' + run for filename in upload_filenames: source_path = path.join(content_folder, filename) dest_path = bucket_folder_name + '/' + filename storage_resource_dest = self.settings.storage_provider + "://" + self.settings.publishing_buckets_prefix + \ self.settings.expanded_bucket + "/" + dest_path storage_context.set_resource_from_filename(storage_resource_dest, source_path) self.clean_tmp_dir() session.store_value(run, 'expanded_folder', bucket_folder_name) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end", "Finished expansion of article " + article_id + " for version " + version + " run " + str(run) + " into " + bucket_folder_name) except Exception as e: self.logger.exception("Exception when expanding article") self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error", "Error expanding article " + article_id + " message:" + e.message) return activity.activity.ACTIVITY_PERMANENT_FAILURE return True
def do_activity(self, data=None): """ Do the work """ run = data['run'] session = Session(self.settings) version = session.get_value(run, 'version') article_id = session.get_value(run, 'article_id') self.emit_monitor_event(self.settings, article_id, version, run, self.pretty_name, "start", "Depositing assets for " + article_id) try: expanded_folder_name = session.get_value(run, 'expanded_folder') expanded_folder_bucket = (self.settings.publishing_buckets_prefix + self.settings.expanded_bucket) storage_context = StorageContext(self.settings) storage_provider = self.settings.storage_provider + "://" orig_resource = storage_provider + expanded_folder_bucket + "/" + expanded_folder_name files_in_bucket = storage_context.list_resources(orig_resource) # filter figures that have already been copied (see DepositIngestAssets activity) pre_ingest_assets = article_structure.pre_ingest_assets(files_in_bucket) other_assets = filter(lambda asset: asset not in pre_ingest_assets, files_in_bucket) # assets bucket cdn_bucket_name = self.settings.publishing_buckets_prefix + self.settings.ppp_cdn_bucket no_download_extensions = self.get_no_download_extensions(self.settings.no_download_extensions) for file_name in other_assets: orig_resource = storage_provider + expanded_folder_bucket + "/" + expanded_folder_name + "/" dest_resource = storage_provider + cdn_bucket_name + "/" + article_id + "/" storage_context.copy_resource(orig_resource + file_name, dest_resource + file_name) if self.logger: self.logger.info("Uploaded file %s to %s" % (file_name, cdn_bucket_name)) file_name_no_extension, extension = file_name.rsplit('.', 1) if extension not in no_download_extensions: content_type = self.content_type_from_file_name(file_name) dict_metadata = {'Content-Disposition': str("Content-Disposition: attachment; filename=" + file_name + ";"), 'Content-Type': content_type} file_download = file_name_no_extension + "-download." + extension # file is copied with additional metadata storage_context.copy_resource(orig_resource + file_name, dest_resource + file_download, additional_dict_metadata=dict_metadata) self.emit_monitor_event(self.settings, article_id, version, run, self.pretty_name, "end", "Deposited assets for article " + article_id) except Exception as e: self.logger.exception("Exception when Depositing assets") self.emit_monitor_event(self.settings, article_id, version, run, self.pretty_name, "error", "Error depositing assets for article " + article_id + " message:" + e.message) return activity.activity.ACTIVITY_PERMANENT_FAILURE return activity.activity.ACTIVITY_SUCCESS
def do_activity(self, data=None): """ Do the work """ if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) session = Session(self.settings) version = session.get_value(self.get_workflowId(), 'version') article_id = session.get_value(self.get_workflowId(), 'article_id') run = session.get_value(self.get_workflowId(), 'run') self.emit_monitor_event(self.settings, article_id, version, run, "Post EIF", "start", "Starting submission of article EIF " + article_id) try: eif_filename = session.get_value(self.get_workflowId(), 'eif_filename') eif_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket if self.logger: self.logger.info("Posting file %s" % eif_filename) conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) bucket = conn.get_bucket(eif_bucket) key = Key(bucket) key.key = eif_filename json_output = key.get_contents_as_string() destination = self.settings.drupal_EIF_endpoint headers = {'content-type': 'application/json'} auth = None if self.settings.drupal_update_user and self.settings.drupal_update_user != '': auth = requests.auth.HTTPBasicAuth(self.settings.drupal_update_user, self.settings.drupal_update_pass) r = requests.post(destination, data=json_output, headers=headers, auth=auth) self.logger.info("POST response was %s" % str(r.status_code)) self.emit_monitor_event(self.settings, article_id, version, run, "Post EIF", "start", "Finish submission of article " + article_id + " for version " + str(version) + " run " + str(run) + " the response status " "was " + str(r.status_code)) # TODO: this is temp if r.status_code == 200: #if True: # TODO : article path will at some point be available in the respose article_path = session.get_value(self.get_workflowId(), 'article_path') self.set_monitor_property(self.settings, article_id, 'path', article_path, 'text', version=version) published = r.json().get('publish') # TODO: this is temp #published = False # assemble data to start post-publication workflow expanded_folder = session.get_value(self.get_workflowId(), 'expanded_folder') status = session.get_value(self.get_workflowId(), 'status') try: update_date = session.get_value(self.get_workflowId(), 'update_date') except: # Default update_date = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') follow_on_data = { 'article_id': article_id, 'version': version, 'expanded_folder': expanded_folder, 'update_date': update_date, 'run': run, 'status': status, 'eif_location': eif_filename, } message = { 'workflow_name': 'PostPerfectPublication', 'workflow_data': follow_on_data } if published is True: self.set_monitor_property(self.settings, article_id, 'publication-status', 'published', "text", version=version) # initiate post-publication workflow now sqs_conn = boto.sqs.connect_to_region(self.settings.sqs_region, aws_access_key_id=self.settings.aws_access_key_id, aws_secret_access_key=self.settings.aws_secret_access_key) out_queue = sqs_conn.get_queue(self.settings.workflow_starter_queue) m = Message() m.set_body(json.dumps(message)) out_queue.write(m) else: encoded_message = base64.encodestring(json.dumps(message)) # store message in dashboard for later self.set_monitor_property(self.settings, article_id, "_publication-data", encoded_message, "text", version=version) self.set_monitor_property(self.settings, article_id, "publication-status", "ready to publish", "text", version=version) else: self.emit_monitor_event(self.settings, article_id, version, run, "Post EIF", "error", "Website ingest returned an 
error code: " + str(r.status_code)) self.logger.error("Body:" + r.text) return False self.emit_monitor_event(self.settings, article_id, version, run, "Post EIF", "end", "Finished submitting EIF for article " + article_id + " status was " + str(r.status_code)) except Exception as e: self.logger.exception("Exception when submitting article EIF") self.emit_monitor_event(self.settings, article_id, version, run, "Post EIF", "error", "Error submitting EIF For article" + article_id + " message:" + str(e.message)) return False return True
def do_activity(self, data=None): """ Do the work """ session = Session(self.settings) version = session.get_value(self.get_workflowId(), 'version') article_id = session.get_value(self.get_workflowId(), 'article_id') article_version_id = article_id + '.' + version run = session.get_value(self.get_workflowId(), 'run') self.emit_monitor_event(self.settings, article_id, version, run, "Convert JATS", "start", "Starting conversion of article xml to EIF for " + article_id) try: if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) expanded_folder_name = session.get_value(self.get_workflowId(), 'expanded_folder') expanded_folder_bucket = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket print expanded_folder_name conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) bucket = conn.get_bucket(expanded_folder_bucket) bucket_folder_name = expanded_folder_name (xml_key, xml_filename) = self.get_article_xml_key(bucket, bucket_folder_name) if xml_key is None: self.logger.error("Article XML path not found") return False if self.logger: self.logger.info("Converting file %s" % xml_filename) xml = xml_key.get_contents_as_string() if self.logger: self.logger.info("Downloaded contents of file %s" % xml_filename) json_output = jats_scraper.scrape(xml, article_version=version) # Add update date if it is in the session update_date = None try: update_date = session.get_value(self.get_workflowId(), 'update_date') except: update_date = None if update_date: json_output = self.add_update_date_to_json(json_output, update_date, xml_filename) if self.logger: self.logger.info("Scraped file %s" % xml_filename) output_folder = article_version_id + '/' + run output_name = xml_filename.replace('.xml', '.json') output_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket output_path = output_folder + '/' + output_name destination = conn.get_bucket(output_bucket) destination_key = Key(destination) output_key = output_path destination_key.key = output_key destination_key.set_contents_from_string(json_output) if self.logger: self.logger.info("Uploaded key %s to %s" % (output_path, output_bucket)) self.set_dashboard_properties(json_output, article_id, version) session.store_value(self.get_workflowId(), "eif_filename", output_key) eif_object = json.loads(json_output) session.store_value(self.get_workflowId(), 'article_path', eif_object.get('path')) self.emit_monitor_event(self.settings, article_id, version, run, "Convert JATS", "end", "XML converted to EIF for article " + article_id + " to " + output_key) except Exception as e: self.logger.exception("Exception when converting article XML to EIF") self.emit_monitor_event(self.settings, article_id, version, run, "Convert JATS", "error", "Error in conversion of article xml to EIF for " + article_id + " message:" + e.message) return False return True
def do_activity(self, data=None): """ Do the work """ run = data['run'] if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) info = S3NotificationInfo.from_dict(data) storage_context = StorageContext(self.settings) session = Session(self.settings) filename_last_element = session.get_value(run, 'filename_last_element') # zip name contains version information for previously archived zip files article_structure = ArticleInfo(filename_last_element) article_id = article_structure.article_id session.store_value(run, 'article_id', article_id) session.store_value(run, 'file_name', info.file_name) if self.logger: self.logger.info("Expanding file %s" % info.file_name) version = session.get_value(run, 'version') status = article_structure.status if status is None or (status != 'vor' and status != 'poa'): self.logger.error("Name '%s' did not match expected pattern for status" % filename_last_element) return activity.activity.ACTIVITY_PERMANENT_FAILURE # status could not be determined, exit workflow. article_version_id = article_id + '.' + version session.store_value(run, 'article_version_id', article_version_id) session.store_value(run, 'run', run) session.store_value(run, 'status', status) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start", "Starting expansion of article " + article_id) try: # download zip to temp folder tmp = self.get_tmp_dir() local_zip_file = self.open_file_from_tmp_dir(filename_last_element, mode='wb') storage_resource_origin = self.settings.storage_provider + "://" + info.bucket_name + "/" + info.file_name storage_context.get_resource_to_file(storage_resource_origin, local_zip_file) local_zip_file.close() # extract zip contents folder_name = path.join(article_version_id, run) content_folder = path.join(tmp, folder_name) makedirs(content_folder) with ZipFile(path.join(tmp, filename_last_element)) as zf: zf.extractall(content_folder) upload_filenames = [] for f in listdir(content_folder): if isfile(join(content_folder, f)) and f[0] != '.' and not f[0] == '_': upload_filenames.append(f) self.check_filenames(upload_filenames) bucket_folder_name = article_version_id + '/' + run for filename in upload_filenames: source_path = path.join(content_folder, filename) dest_path = bucket_folder_name + '/' + filename storage_resource_dest = self.settings.storage_provider + "://" + self.settings.publishing_buckets_prefix + \ self.settings.expanded_bucket + "/" + dest_path storage_context.set_resource_from_filename(storage_resource_dest, source_path) self.clean_tmp_dir() session.store_value(run, 'expanded_folder', bucket_folder_name) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end", "Finished expansion of article " + article_id + " for version " + version + " run " + str(run) + " into " + bucket_folder_name) except Exception as e: self.logger.exception("Exception when expanding article") self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error", "Error expanding article " + article_id + " message:" + e.message) return activity.activity.ACTIVITY_PERMANENT_FAILURE return True
def do_activity(self, data=None): """ Do the work """ run = data['run'] session = Session(self.settings) version = session.get_value(run, 'version') article_id = session.get_value(run, 'article_id') article_version_id = article_id + '.' + version self.emit_monitor_event( self.settings, article_id, version, run, "Convert JATS", "start", "Starting conversion of article xml to EIF for " + article_id) try: if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) expanded_folder_name = session.get_value(run, 'expanded_folder') expanded_folder_bucket = (self.settings.publishing_buckets_prefix + self.settings.expanded_bucket) conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) bucket = conn.get_bucket(expanded_folder_bucket) bucket_folder_name = expanded_folder_name (xml_key, xml_filename) = self.get_article_xml_key(bucket, bucket_folder_name) if xml_key is None: self.logger.error("Article XML path not found") return False if self.logger: self.logger.info("Converting file %s" % xml_filename) xml = xml_key.get_contents_as_string() if self.logger: self.logger.info("Downloaded contents of file %s" % xml_filename) json_output = jats_scraper.scrape(xml, article_version=version) # Add update date if it is in the session update_date = session.get_value(run, 'update_date') if update_date: json_output = self.add_update_date_to_json( json_output, update_date, xml_filename) if self.logger: self.logger.info("Scraped file %s" % xml_filename) output_folder = article_version_id + '/' + run output_name = xml_filename.replace('.xml', '.json') output_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket output_path = output_folder + '/' + output_name destination = conn.get_bucket(output_bucket) destination_key = Key(destination) output_key = output_path destination_key.key = output_key destination_key.set_contents_from_string(json_output) if self.logger: self.logger.info("Uploaded key %s to %s" % (output_path, output_bucket)) self.set_dashboard_properties(json_output, article_id, version) session.store_value(run, "eif_location", output_key) eif_object = json.loads(json_output) session.store_value(run, 'article_path', eif_object.get('path')) self.emit_monitor_event( self.settings, article_id, version, run, "Convert JATS", "end", "XML converted to EIF for article " + article_id + " to " + output_key) except Exception as e: self.logger.exception( "Exception when converting article XML to EIF") self.emit_monitor_event( self.settings, article_id, version, run, "Convert JATS", "error", "Error in conversion of article xml to EIF for " + article_id + " message:" + e.message) return False return True
def do_activity(self, data=None): """ Do the work """ if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) run = data['run'] session = Session(self.settings) version = session.get_value(run, 'version') article_id = session.get_value(run, 'article_id') self.emit_monitor_event( self.settings, article_id, version, run, self.pretty_name, "start", "Starting preparation of article for EIF " + article_id) try: eif_location = session.get_value(run, 'eif_location') eif_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket article_path = session.get_value(run, 'article_path') self.set_monitor_property(self.settings, article_id, 'path', article_path, 'text', version=version) expanded_folder = session.get_value(run, 'expanded_folder') status = session.get_value(run, 'status') update_date = session.get_value(run, 'update_date') carry_over_data = { 'eif_location': eif_location, 'eif_bucket': eif_bucket, 'passthrough': { 'article_id': article_id, 'version': version, 'run': run, 'article_path': article_path, 'expanded_folder': expanded_folder, 'status': status, 'update_date': update_date, } } message = carry_over_data sqs_conn = boto.sqs.connect_to_region( self.settings.sqs_region, aws_access_key_id=self.settings.aws_access_key_id, aws_secret_access_key=self.settings.aws_secret_access_key) out_queue = sqs_conn.get_queue(self.settings.website_ingest_queue) m = Message() m.set_body(json.dumps(message)) out_queue.write(m) ######### except Exception as e: self.logger.exception("Exception when Preparing for PostEIF") self.emit_monitor_event( self.settings, article_id, version, run, self.pretty_name, "error", "Error submitting EIF For article" + article_id + " message:" + str(e.message)) return False self.emit_monitor_event( self.settings, article_id, version, run, self.pretty_name, "end", "Finished preparation of article for EIF " + article_id) return True
def do_activity(self, data=None): """ Do the work """ run = data["run"] session = Session(self.settings) version = session.get_value(run, "version") article_id = session.get_value(run, "article_id") self.emit_monitor_event( self.settings, article_id, version, run, "Deposit assets", "start", "Depositing assets for " + article_id ) try: conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) expanded_folder_name = session.get_value(run, "expanded_folder") expanded_folder_bucket = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket expanded_bucket = conn.get_bucket(expanded_folder_bucket) cdn_bucket_name = self.settings.publishing_buckets_prefix + self.settings.ppp_cdn_bucket no_download_extensions = self.get_no_download_extensions(self.settings.no_download_extensions) storage_context = StorageContext(self.settings) storage_provider = self.settings.storage_provider + "://" published_bucket_path = ( self.settings.publishing_buckets_prefix + self.settings.published_bucket + "/articles" ) keys = self.get_keys(expanded_bucket, expanded_folder_name) for key in keys: (file_key, file_name) = key # file_key.copy(cdn_bucket_name, article_id + "/" + file_name) orig_resource = storage_provider + expanded_folder_bucket + "/" + expanded_folder_name + "/" + file_name dest_resource = storage_provider + cdn_bucket_name + "/" + article_id + "/" + file_name additional_dest_resource = storage_provider + published_bucket_path + "/" + article_id + "/" + file_name storage_context.copy_resource(orig_resource, dest_resource) storage_context.copy_resource(orig_resource, additional_dest_resource) if self.logger: self.logger.info("Uploaded key %s to %s" % (file_name, cdn_bucket_name)) file_name_no_extension, extension = file_name.rsplit(".", 1) if extension not in no_download_extensions: content_type = self.content_type_from_file_name(file_name) dict_metadata = { "Content-Disposition": str("Content-Disposition: attachment; filename=" + file_name + ";"), "Content-Type": content_type, } file_download = file_name_no_extension + "-download." + extension orig_resource_download = dest_resource dest_resource_download = storage_provider + cdn_bucket_name + "/" + article_id + "/" + file_download additional_dest_resource_download = ( storage_provider + published_bucket_path + "/" + article_id + "/" + file_download ) # file is copied with additional metadata storage_context.copy_resource( orig_resource_download, dest_resource_download, additional_dict_metadata=dict_metadata ) # additional metadata is already set in origin resource so it will be copied accross by default storage_context.copy_resource(dest_resource_download, additional_dest_resource_download) self.emit_monitor_event( self.settings, article_id, version, run, "Deposit assets", "end", "Deposited assets for article " + article_id, ) except Exception as e: self.logger.exception("Exception when Depositing assets") self.emit_monitor_event( self.settings, article_id, version, run, "Deposit assets", "error", "Error depositing assets for article " + article_id + " message:" + e.message, ) return False return True
def do_activity(self, data=None): """ Do the work """ if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) info = S3NotificationInfo.from_dict(data) # set up required connections conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) source_bucket = conn.get_bucket(info.bucket_name) dest_bucket = conn.get_bucket(self.settings.publishing_buckets_prefix + self.settings.expanded_bucket) session = Session(self.settings) article_id_match = re.match(ur'elife-(.*?)-', info.file_name) article_id = article_id_match.group(1) session.store_value(self.get_workflowId(), 'article_id', article_id) if self.logger: self.logger.info("Expanding file %s" % info.file_name) # extract any doi, version and updated date information from the filename version = None # zip name contains version information for previously archived zip files m = re.search(ur'-v([0-9]*?)[\.|-]', info.file_name) if m is not None: version = m.group(1) if version is None: version = self.get_next_version(article_id) if version == '-1': return False # version could not be determined, exit workflow. Can't emit event as no version. sm = re.search(ur'.*?-.*?-(.*?)-', info.file_name) if sm is not None: status = sm.group(1) if status is None: return False # version could not be determined, exit workflow. Can't emit event as no version. run = str(uuid.uuid4()) # store version for other activities in this workflow execution session.store_value(self.get_workflowId(), 'version', version) # TODO : extract and store updated date if supplied article_version_id = article_id + '.' + version session.store_value(self.get_workflowId(), 'article_version_id', article_version_id) session.store_value(self.get_workflowId(), 'run', run) session.store_value(self.get_workflowId(), 'status', status) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start", "Starting expansion of article " + article_id) self.set_monitor_property(self.settings, article_id, "article_id", article_id, "text") try: # download zip to temp folder tmp = self.get_tmp_dir() key = Key(source_bucket) key.key = info.file_name local_zip_file = self.open_file_from_tmp_dir(info.file_name, mode='wb') key.get_contents_to_file(local_zip_file) local_zip_file.close() bucket_folder_name = article_version_id + '/' + run folder_name = path.join(article_version_id, run) # extract zip contents content_folder = path.join(tmp, folder_name) makedirs(content_folder) with ZipFile(path.join(tmp, info.file_name)) as zf: zf.extractall(content_folder) # TODO : rename files (versions!) # TODO : edit xml and rename references upload_filenames = [] for f in listdir(content_folder): if isfile(join(content_folder, f)) and f[0] != '.' 
and not f[0] == '_': upload_filenames.append(f) for filename in upload_filenames: source_path = path.join(content_folder, filename) dest_path = bucket_folder_name + '/' + filename k = Key(dest_bucket) k.key = dest_path k.set_contents_from_filename(source_path) session.store_value(self.get_workflowId(), 'expanded_folder', bucket_folder_name) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end", "Finished expansion of article " + article_id + " for version " + version + " run " + str(run) + " into " + bucket_folder_name) except Exception as e: self.logger.exception("Exception when expanding article") self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error", "Error expanding article " + article_id + " message:" + e.message) return False return True
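# Note: to make the filename parsing above concrete, this is how the three
# regular expressions behave on a hypothetical archived zip name (the
# filename itself is illustrative only):
#
#   file_name = 'elife-00353-vor-v1-20121213000000.zip'
#   re.match(ur'elife-(.*?)-', file_name).group(1)      # '00353' (article_id)
#   re.search(ur'-v([0-9]*?)[.-]', file_name).group(1)  # '1'     (version)
#   re.search(ur'.*?-.*?-(.*?)-', file_name).group(1)   # 'vor'   (status)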
def do_activity(self, data=None): """ Do the work """ session = Session(self.settings) version = session.get_value(self.get_workflowId(), "version") article_id = session.get_value(self.get_workflowId(), "article_id") article_version_id = article_id + "." + version run = session.get_value(self.get_workflowId(), "run") self.emit_monitor_event( self.settings, article_id, version, run, "Convert JATS", "start", "Starting conversion of article xml to EIF for " + article_id, ) try: if self.logger: self.logger.info("data: %s" % json.dumps(data, sort_keys=True, indent=4)) expanded_folder_name = session.get_value(self.get_workflowId(), "expanded_folder") expanded_folder_bucket = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket print expanded_folder_name conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) bucket = conn.get_bucket(expanded_folder_bucket) bucket_folder_name = expanded_folder_name (xml_key, xml_filename) = self.get_article_xml_key(bucket, bucket_folder_name) if xml_key is None: self.logger.error("Article XML path not found") return False if self.logger: self.logger.info("Converting file %s" % xml_filename) xml = xml_key.get_contents_as_string() if self.logger: self.logger.info("Downloaded contents of file %s" % xml_filename) json_output = jats_scraper.scrape(xml) if self.logger: self.logger.info("Scraped file %s" % xml_filename) output_folder = article_version_id + "/" + run output_name = xml_filename.replace(".xml", ".json") output_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket output_path = output_folder + "/" + output_name destination = conn.get_bucket(output_bucket) destination_key = Key(destination) output_key = output_path destination_key.key = output_key destination_key.set_contents_from_string(json_output) if self.logger: self.logger.info("Uploaded key %s to %s" % (output_path, output_bucket)) session.store_value(self.get_workflowId(), "eif_filename", output_key) eif_object = json.loads(json_output) session.store_value(self.get_workflowId(), "article_path", eif_object.get("path")) self.emit_monitor_event( self.settings, article_id, version, run, "Post EIF", "success", "XML converted to EIF for article " + article_id + " to " + output_key, ) except Exception as e: self.logger.exception("Exception when converting article XML to EIF") self.emit_monitor_event( self.settings, article_id, version, run, "Convert JATS", "error", "Error in conversion of article xml to EIF for " + article_id + " message:" + e.message, ) return False return True
def do_activity(self, data=None): """ Do the work """ if self.logger: self.logger.info("data: %s" % json.dumps(data, sort_keys=True, indent=4)) run = data["run"] session = Session(self.settings) version = session.get_value(run, "version") article_id = session.get_value(run, "article_id") self.emit_monitor_event( self.settings, article_id, version, run, self.pretty_name, "start", "Starting preparation of article for EIF " + article_id, ) try: eif_location = session.get_value(run, "eif_location") eif_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket article_path = session.get_value(run, "article_path") self.set_monitor_property(self.settings, article_id, "path", article_path, "text", version=version) expanded_folder = session.get_value(run, "expanded_folder") status = session.get_value(run, "status") update_date = session.get_value(run, "update_date") carry_over_data = { "eif_location": eif_location, "eif_bucket": eif_bucket, "passthrough": { "article_id": article_id, "version": version, "run": run, "article_path": article_path, "expanded_folder": expanded_folder, "status": status, "update_date": update_date, }, } message = carry_over_data sqs_conn = boto.sqs.connect_to_region( self.settings.sqs_region, aws_access_key_id=self.settings.aws_access_key_id, aws_secret_access_key=self.settings.aws_secret_access_key, ) out_queue = sqs_conn.get_queue(self.settings.website_ingest_queue) m = Message() m.set_body(json.dumps(message)) out_queue.write(m) ######### except Exception as e: self.logger.exception("Exception when Preparing for PostEIF") self.emit_monitor_event( self.settings, article_id, version, run, self.pretty_name, "error", "Error submitting EIF For article" + article_id + " message:" + str(e.message), ) return False self.emit_monitor_event( self.settings, article_id, version, run, self.pretty_name, "end", "Finished preparation of article for EIF " + article_id, ) return True