def __init__(self, settings, logger, conn=None, token=None, activity_task=None):
    activity.activity.__init__(self, settings, logger, conn, token, activity_task)

    self.name = "SendQueuedEmail"
    self.version = "1"
    self.default_task_heartbeat_timeout = 30
    self.default_task_schedule_to_close_timeout = 60 * 5
    self.default_task_schedule_to_start_timeout = 30
    self.default_task_start_to_close_timeout = 60 * 5
    self.description = "Send email in the email queue."

    # Data provider
    self.db = dblib.SimpleDB(settings)

    # Default limit of emails per activity
    self.limit = 100

    # Default rate limit
    self.rate_limit_per_sec = 10

    # S3 bucket where email body content is stored
    self.email_body_bucket = settings.bot_bucket

def start(self, ENV = "dev"): # Specify run environment settings settings = settingsLib.get_settings(ENV) ping_marker_id = "cron_NewS3Suppl" # Log logFile = "starter.log" logger = log.logger(logFile, settings.setLevel, ping_marker_id) # Data provider db = dblib.SimpleDB(settings) db.connect() # SWF meta data provider swfmeta = swfmetalib.SWFMeta(settings) swfmeta.connect() # Default, if cron never run before last_startTimestamp = 0 # Get the last time this cron was run last_startTimestamp = swfmeta.get_last_completed_workflow_execution_startTimestamp(workflow_id = ping_marker_id) # Start a ping workflow as a marker self.start_ping_marker(ping_marker_id, ENV) # Check for S3 Suppl files that were updated since the last run date_format = "%Y-%m-%dT%H:%M:%S.000Z" # Quick hack - subtract 30 minutes to not ignore the top of the hour # the time between S3Monitor running and this cron starter last_startTimestamp_minus_30 = last_startTimestamp - (60*30) if(last_startTimestamp_minus_30 < 0): last_startTimestamp_minus_30 = 0 time_tuple = time.gmtime(last_startTimestamp_minus_30) last_startDate = time.strftime(date_format, time_tuple) logger.info('last run %s' % (last_startDate)) S3_item_list = db.elife_get_article_S3_file_items(file_data_type = "suppl", latest = True, last_updated_since = last_startDate) logger.info('Suppl files updated since %s: %s' % (last_startDate, str(len(S3_item_list)))) if(len(S3_item_list) <= 0): # No new SVG pass else: # Found new SVG files # Start a PublishSVG starter try: starter_name = "starter_PublishSuppl" self.import_starter_module(starter_name, logger) s = self.get_starter_module(starter_name, logger) s.start(ENV = ENV, last_updated_since = last_startDate) except: logger.info('Error: %s starting %s' % (ping_marker_id, starter_name)) logger.exception('')
def __init__(self, settings=None, tmp_dir=None):
    self.settings = settings
    self.tmp_dir = tmp_dir

    # Default tmp_dir if not specified
    self.tmp_dir_default = "article_provider"

    # SimpleDB connection for looking up S3 keys
    self.db = None
    if self.settings is not None:
        # Data provider
        self.db = dblib.SimpleDB(settings)

    # S3 connection
    self.s3_conn = None

    # Default S3 bucket name
    self.bucket_name = None
    if self.settings is not None:
        self.bucket_name = self.settings.bucket

    # Some defaults
    self.related_insight_article = None
    self.was_ever_poa = None
    self.is_poa = None

    # Store the list of DOI ids that were ever PoA
    self.was_poa_doi_ids = None
    self.doi_ids = None
    self.article_bucket_published_dates = None

    # URL prefix used when checking whether an article has been published
    self.lookup_url_prefix = "http://elifesciences.org/lookup/doi/10.7554/eLife."

def get_docs_from_SimpleDB(self, ENV = "dev", last_updated_since = None, doi_id = None): """ Get the array of docs from the SimpleDB provider """ docs = [] # Specify run environment settings settings = settingsLib.get_settings(ENV) db = dblib.SimpleDB(settings) db.connect() if(last_updated_since is not None): xml_item_list = db.elife_get_article_S3_file_items(file_data_type = "xml", latest = True, last_updated_since = last_updated_since) elif(doi_id is not None): xml_item_list = db.elife_get_article_S3_file_items(file_data_type = "xml", latest = True, doi_id = doi_id) else: # Get all xml_item_list = db.elife_get_article_S3_file_items(file_data_type = "xml", latest = True) for x in xml_item_list: tmp = {} elife_id = str(x['name']).split("/")[0] document = 'https://s3.amazonaws.com/' + x['item_name'] tmp['elife_id'] = elife_id tmp['document'] = document docs.append(tmp) return docs
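# Illustrative sketch only (not part of the original code): how a caller might
# consume the list returned by get_docs_from_SimpleDB() above. Each item is a
# dict with 'elife_id' and 'document' keys, as built in the loop above; the
# starter variable name and the date value below are hypothetical.
docs = starter.get_docs_from_SimpleDB(ENV="dev", last_updated_since="2015-01-01T00:00:00.000Z")
for doc in docs:
    print("%s -> %s" % (doc['elife_id'], doc['document']))
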
def __init__(self, settings, logger, conn=None, token=None, activity_task=None):
    activity.activity.__init__(self, settings, logger, conn, token, activity_task)

    self.name = "DepositCrossref"
    self.version = "1"
    self.default_task_heartbeat_timeout = 30
    self.default_task_schedule_to_close_timeout = 60 * 30
    self.default_task_schedule_to_start_timeout = 30
    self.default_task_start_to_close_timeout = 60 * 15
    self.description = (
        "Download article XML from crossref outbox, " +
        "generate crossref XML, and deposit with crossref.")

    # Directory where POA library is stored
    self.poa_lib_dir_name = "elife-poa-xml-generation"

    # Where we specify the library to be imported
    self.elife_poa_lib = None

    # Import the libraries we will need
    self.import_imports()

    # Create output directories
    self.create_activity_directories()

    self.date_stamp = self.set_datestamp()

    # Data provider where email body is saved
    self.db = dblib.SimpleDB(settings)

    # Instantiate a new article object to provide some helper functions
    self.article = articlelib.article(self.settings, self.get_tmp_dir())

    # Bucket for outgoing files
    self.publish_bucket = settings.poa_packaging_bucket
    self.outbox_folder = "crossref/outbox/"
    self.published_folder = "crossref/published/"

    # Track the success of some steps
    self.activity_status = None
    self.generate_status = None
    self.approve_status = None
    self.outbox_status = None
    self.publish_status = None

    # HTTP requests status
    self.http_request_status_code = []
    self.http_request_status_text = []

    self.outbox_s3_key_names = None

    # Track XML files selected for the crossref deposit
    self.article_published_file_names = []
    self.article_not_published_file_names = []

def start(self, settings):
    ping_marker_id = "cron_NewS3POA"

    # Log
    logFile = "starter.log"
    logger = log.logger(logFile, settings.setLevel, ping_marker_id)

    # Data provider
    db = dblib.SimpleDB(settings)
    db.connect()

    # SWF meta data provider
    swfmeta = swfmetalib.SWFMeta(settings)
    swfmeta.connect()

    last_startTimestamp = swfmeta.get_last_completed_workflow_execution_startTimestamp(
        workflow_id=ping_marker_id)

    # Start a ping workflow as a marker
    self.start_ping_marker(ping_marker_id, settings)

    # Check for S3 XML files that were updated since the last run
    date_format = "%Y-%m-%dT%H:%M:%S.000Z"

    # Quick hack - subtract 15 minutes,
    # the time between S3Monitor running and this cron starter
    last_startTimestamp_minus_15 = last_startTimestamp - (60 * 15)
    time_tuple = time.gmtime(last_startTimestamp_minus_15)
    last_startDate = time.strftime(date_format, time_tuple)

    logger.info('last run %s' % last_startDate)

    xml_item_list = db.elife_get_POA_delivery_S3_file_items(
        last_updated_since=last_startDate)

    logger.info('POA files updated since %s: %s' % (last_startDate, str(len(xml_item_list))))

    if len(xml_item_list) <= 0:
        # No new XML
        pass
    else:
        # Found new XML files, start a PackagePOA starter
        try:
            starter_name = "starter_PackagePOA"
            self.import_starter_module(starter_name, logger)
            s = self.get_starter_module(starter_name, logger)
            s.start(settings=settings, last_updated_since=last_startDate)
        except Exception:
            logger.info('Error: %s starting %s' % (ping_marker_id, starter_name))
            logger.exception('')

def import_simpledb_provider_module(step):
    imported = None
    try:
        import provider.simpleDB as dblib
        world.db = dblib.SimpleDB(world.settings)
        imported = True
    except Exception:
        imported = False
    assert imported is True, \
        "SimpleDB module was not imported"

def start(self, ENV="dev"): # Specify run environment settings settings = settingsLib.get_settings(ENV) ping_marker_id = "cron_FiveMinute" # Log logFile = "starter.log" logger = log.logger(logFile, settings.setLevel, ping_marker_id) # Data provider db = dblib.SimpleDB(settings) db.connect() # SWF meta data provider swfmeta = swfmetalib.SWFMeta(settings) swfmeta.connect() last_startTimestamp = swfmeta.get_last_completed_workflow_execution_startTimestamp( workflow_id=ping_marker_id) # Start a ping workflow as a marker self.start_ping_marker(ping_marker_id, ENV) # Check for S3 XML files that were updated since the last run date_format = "%Y-%m-%dT%H:%M:%S.000Z" # Date conversion time_tuple = time.gmtime(last_startTimestamp) last_startDate = time.strftime(date_format, time_tuple) logger.info('last run %s %s' % (ping_marker_id, last_startDate)) # A conditional start for SendQueuedEmail # Only start a workflow if there are emails in the queue ready to send item_list = db.elife_get_email_queue_items( query_type="count", date_scheduled_before=last_startDate) try: if (int(item_list[0]["Count"]) > 0): # More than one email in the queue, start a workflow try: starter_name = "starter_SendQueuedEmail" self.import_starter_module(starter_name, logger) s = self.get_starter_module(starter_name, logger) s.start(ENV=ENV) except: logger.info('Error: %s starting %s' % (ping_marker_id, starter_name)) logger.exception('') except: # Some error logger.info('Exception encountered starting %s: %s' % (ping_marker_id, last_startDate))
def __init__(self, settings, logger, conn=None, token=None, activity_task=None):
    activity.activity.__init__(self, settings, logger, conn, token, activity_task)

    self.name = "PackagePOA"
    self.version = "1"
    self.default_task_heartbeat_timeout = 30
    self.default_task_schedule_to_close_timeout = 60 * 30
    self.default_task_schedule_to_start_timeout = 30
    self.default_task_start_to_close_timeout = 60 * 15
    self.description = "Process POA zip file input, repackage, and save to S3."

    # Directory where POA library is stored
    self.poa_lib_dir_name = "elife-poa-xml-generation"

    # Where we specify the library to be imported
    self.elife_poa_lib = None

    # Import the libraries we will need
    self.import_imports()

    # Create output directories
    self.create_activity_directories()

    # Create an EJP provider to access S3 bucket holding CSV files
    self.ejp = ejplib.EJP(settings, self.get_tmp_dir())

    # Data provider where email body is saved
    self.db = dblib.SimpleDB(settings)

    # Bucket for outgoing files
    self.publish_bucket = settings.poa_packaging_bucket
    self.outbox_folder = "outbox/"

    # Some values to set later
    self.document = None
    self.poa_zip_filename = None
    self.doi = None

    # Capture errors from generating XML
    self.error_count = None
    self.error_messages = None

    # Track the success of some steps
    self.activity_status = None
    self.approve_status = None
    self.process_status = None
    self.generate_xml_status = None
    self.pdf_decap_status = None

def __init__(self, settings, logger, conn=None, token=None, activity_task=None):
    activity.activity.__init__(self, settings, logger, conn, token, activity_task)

    self.name = "PublishFinalPOA"
    self.version = "1"
    self.default_task_heartbeat_timeout = 60 * 30
    self.default_task_schedule_to_close_timeout = 60 * 30
    self.default_task_schedule_to_start_timeout = 30
    self.default_task_start_to_close_timeout = 60 * 15
    self.description = (
        "Download POA files from a bucket, zip each article separately, " +
        "and upload to final bucket.")

    # Local directory settings
    self.TMP_DIR = self.get_tmp_dir() + os.sep + "tmp_dir"
    self.INPUT_DIR = self.get_tmp_dir() + os.sep + "input_dir"
    self.OUTPUT_DIR = self.get_tmp_dir() + os.sep + "output_dir"
    self.JUNK_DIR = self.get_tmp_dir() + os.sep + "junk_dir"
    self.DONE_DIR = self.get_tmp_dir() + os.sep + "done_dir"

    # Bucket for outgoing files
    self.input_bucket = settings.poa_packaging_bucket
    self.outbox_folder = "outbox/"
    self.published_folder_prefix = "published/"
    self.published_folder_name = None

    self.publish_bucket = settings.publishing_buckets_prefix + settings.production_bucket

    # Track the success of some steps
    self.activity_status = None
    self.approve_status = None
    self.publish_status = None

    # More file status tracking for reporting in email
    self.outbox_s3_key_names = []
    self.malformed_ds_file_names = []
    self.empty_ds_file_names = []
    self.unmatched_ds_file_names = []

    # Data provider where email body is saved
    self.db = dblib.SimpleDB(settings)

def __init__(self, settings, logger, conn=None, token=None, activity_task=None):
    activity.activity.__init__(self, settings, logger, conn, token, activity_task)

    self.name = "LensCDNInvalidation"
    self.version = "1"
    self.default_task_heartbeat_timeout = 30
    self.default_task_schedule_to_close_timeout = 60 * 5
    self.default_task_schedule_to_start_timeout = 30
    self.default_task_start_to_close_timeout = 60 * 5
    self.description = ("Create an invalidation request for the eLife Lens documents "
                        "in the Cloudfront CDN.")

    # Data provider
    self.db = dblib.SimpleDB(settings)

def __init__(self, settings, logger, conn=None, token=None, activity_task=None):
    activity.activity.__init__(self, settings, logger, conn, token, activity_task)

    self.name = "S3Monitor"
    self.version = "1.1"
    self.default_task_heartbeat_timeout = 30
    self.default_task_schedule_to_close_timeout = 60 * 15
    self.default_task_schedule_to_start_timeout = 30
    self.default_task_start_to_close_timeout = 60 * 20
    self.description = ("S3Monitor activity: poll S3 bucket and save object metadata "
                        "into SimpleDB.")

    # Data provider
    self.db = dblib.SimpleDB(settings)

def __init__(self, settings, logger, conn=None, token=None, activity_task=None):
    activity.activity.__init__(self, settings, logger, conn, token, activity_task)

    self.name = "PMCDeposit"
    self.version = "1"
    self.default_task_heartbeat_timeout = 30
    self.default_task_schedule_to_close_timeout = 60 * 30
    self.default_task_schedule_to_start_timeout = 30
    self.default_task_start_to_close_timeout = 60 * 15
    self.description = ("Download a single article zip file, repackage it, " +
                        "send to PMC and notify them.")

    # Local directory settings
    self.TMP_DIR = self.get_tmp_dir() + os.sep + "tmp_dir"
    self.INPUT_DIR = self.get_tmp_dir() + os.sep + "input_dir"
    self.JUNK_DIR = self.get_tmp_dir() + os.sep + "junk_dir"
    self.ZIP_DIR = self.get_tmp_dir() + os.sep + "zip_dir"
    self.EPS_DIR = self.get_tmp_dir() + os.sep + "eps_dir"
    self.TIF_DIR = self.get_tmp_dir() + os.sep + "tif_dir"
    self.OUTPUT_DIR = self.get_tmp_dir() + os.sep + "output_dir"

    # Data provider where email body is saved
    self.db = dblib.SimpleDB(settings)

    # Bucket settings
    self.input_bucket = None
    self.input_bucket_default = (settings.publishing_buckets_prefix +
                                 settings.archive_bucket)

    self.publish_bucket = settings.poa_packaging_bucket
    self.published_folder = "pmc/published"
    self.published_zip_folder = "pmc/zip"

    # journal
    self.journal = 'elife'

    # Outgoing FTP settings are set later
    self.FTP_URI = None
    self.FTP_USERNAME = None
    self.FTP_PASSWORD = None
    self.FTP_CWD = None
    self.FTP_SUBDIR = []

def __init__(self, settings, logger, conn=None, token=None, activity_task=None):
    activity.activity.__init__(self, settings, logger, conn, token, activity_task)

    self.name = "LensXMLFilesList"
    self.version = "1"
    self.default_task_heartbeat_timeout = 30
    self.default_task_schedule_to_close_timeout = 60 * 5
    self.default_task_schedule_to_start_timeout = 30
    self.default_task_start_to_close_timeout = 60 * 5
    self.description = ("Create the eLife Lens xml list file for cache warming, "
                        "and then save those to the S3 CDN bucket.")

    # Data provider
    self.db = dblib.SimpleDB(settings)

    # Create the filesystem provider
    self.fs = fslib.Filesystem(self.get_tmp_dir())

def __init__(self, settings, logger, conn=None, token=None, activity_task=None):
    activity.activity.__init__(self, settings, logger, conn, token, activity_task)

    self.name = "AdminEmailHistory"
    self.version = "1"
    self.default_task_heartbeat_timeout = 30
    self.default_task_schedule_to_close_timeout = 60 * 5
    self.default_task_schedule_to_start_timeout = 30
    self.default_task_start_to_close_timeout = 60 * 5
    self.description = "Email administrators a workflow history status message."

    # Data provider
    self.db = dblib.SimpleDB(settings)

    # Default time period, in seconds
    self.time_period = 60 * 60 * 4

def get_docs_from_SimpleDB(self, settings, last_updated_since=None):
    """
    Get the array of docs from the SimpleDB provider
    """
    docs = []

    db = dblib.SimpleDB(settings)
    db.connect()

    if last_updated_since is not None:
        xml_item_list = db.elife_get_POA_delivery_S3_file_items(
            last_updated_since=last_updated_since)
    else:
        # Get all - not implemented for now to avoid mistakes running too many workflows
        xml_item_list = []

    for x in xml_item_list:
        tmp = {}
        name = x['name']
        tmp['document'] = name
        docs.append(tmp)

    return docs

def __init__(self, settings, logger, conn=None, token=None, activity_task=None):
    activity.activity.__init__(self, settings, logger, conn, token, activity_task)

    self.name = "PublicationEmail"
    self.version = "1"
    self.default_task_heartbeat_timeout = 30
    self.default_task_schedule_to_close_timeout = 60 * 5
    self.default_task_schedule_to_start_timeout = 30
    self.default_task_start_to_close_timeout = 60 * 5
    self.description = "Queue emails to notify of a new article publication."

    # Data provider
    self.db = dblib.SimpleDB(settings)

    # Templates provider
    self.templates = templatelib.Templates(settings, self.get_tmp_dir())

    # EJP data provider
    self.ejp = ejplib.EJP(settings, self.get_tmp_dir())

    # Bucket for outgoing files
    self.publish_bucket = settings.poa_packaging_bucket
    self.outbox_folder = "publication_email/outbox/"
    self.published_folder = "publication_email/published/"

    # Track the success of some steps
    self.activity_status = None

    # Track XML files selected for publication
    self.article_xml_filenames = []
    self.xml_file_to_doi_map = {}
    self.articles = []
    self.related_articles = []
    self.articles_approved = []
    self.articles_approved_prepared = []
    self.insight_articles_to_remove_from_outbox = []
    self.articles_do_not_remove_from_outbox = []

    # Default is do not send duplicate emails
    self.allow_duplicates = False

    # Article types for which not to send emails
    self.article_types_do_not_send = ['editorial', 'correction']

    # Email types, for sending previews of each template
    self.email_types = [
        'author_publication_email_POA',
        'author_publication_email_VOR_after_POA',
        'author_publication_email_VOR_no_POA',
        'author_publication_email_Insight_to_VOR',
        'author_publication_email_Feature',
    ]

    self.date_stamp = self.set_datestamp()

    self.admin_email_content = ''

def do_activity(self, data=None):
    """
    Activity, do the work
    """
    if self.logger:
        self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))

    # Data passed to this activity
    elife_id = data["data"]["elife_id"]
    workflow = data["data"]["workflow"]

    # Create output directories
    self.create_activity_directories()

    # Data provider
    self.db = dblib.SimpleDB(self.settings)
    # Connect to DB
    self.db_conn = self.db.connect()

    # Download the S3 objects
    self.download_files_from_s3(elife_id, workflow)

    # Set FTP settings
    self.set_ftp_settings(elife_id, workflow)

    # FTP to endpoint
    try:
        file_type = "/*.zip"

        if workflow == 'HEFCE':
            zipfiles = glob.glob(self.get_tmp_dir() + os.sep +
                                 self.FTP_TO_SOMEWHERE_DIR + file_type)
            # self.ftp_to_endpoint(zipfiles, self.FTP_SUBDIR, passive=True)
            # SFTP now
            sub_dir = "{:05d}".format(int(elife_id))
            self.sftp_to_endpoint(zipfiles, sub_dir)

        if workflow in ('Cengage', 'Scopus', 'WoS', 'GoOA'):
            zipfiles = glob.glob(self.get_tmp_dir() + os.sep +
                                 self.FTP_TO_SOMEWHERE_DIR + file_type)
            self.ftp_to_endpoint(zipfiles, passive=True)

    except Exception:
        # Something went wrong, fail
        if self.logger:
            self.logger.exception(
                'exception in FTPArticle, data: %s' %
                json.dumps(data, sort_keys=True, indent=4))
        result = False
        self.clean_tmp_dir()
        return result

    # Return the activity result, True or False
    result = True
    self.clean_tmp_dir()
    return result

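# Illustrative sketch only (not part of the original code): the shape of the
# activity data that do_activity() above reads, based on the keys it accesses.
# The values shown here are example placeholders.
example_data = {
    "data": {
        "elife_id": "00353",
        "workflow": "HEFCE",  # one of 'HEFCE', 'Cengage', 'Scopus', 'WoS', 'GoOA'
    }
}
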
def __init__(self, settings, logger, conn=None, token=None, activity_task=None):
    activity.activity.__init__(self, settings, logger, conn, token, activity_task)

    self.name = "PubRouterDeposit"
    self.version = "1"
    self.default_task_heartbeat_timeout = 30
    self.default_task_schedule_to_close_timeout = 60 * 30
    self.default_task_schedule_to_start_timeout = 30
    self.default_task_start_to_close_timeout = 60 * 15
    self.description = ("Download article XML from pub_router outbox, " +
                        "approve each for publication, and deposit files via FTP to pub router.")

    self.date_stamp = self.set_datestamp()

    # Data provider where email body is saved
    self.db = dblib.SimpleDB(settings)

    # Instantiate a new article object to provide some helper functions
    self.article = articlelib.article(self.settings, self.get_tmp_dir())

    # Bucket for outgoing files
    self.publish_bucket = settings.poa_packaging_bucket
    self.outbox_folder = None
    self.published_folder = None

    # Bucket settings for source files of FTPArticle workflows
    self.pmc_zip_bucket = settings.poa_packaging_bucket
    self.pmc_zip_folder = "pmc/zip/"

    # Bucket settings for source files of PMCDeposit workflows
    self.archive_bucket = self.settings.publishing_buckets_prefix + self.settings.archive_bucket

    # Track the success of some steps
    self.activity_status = None
    self.ftp_status = None
    self.outbox_status = None
    self.publish_status = None

    self.outbox_s3_key_names = None

    # Type of FTPArticle workflow to start, will be specified in data
    self.workflow = None

    # Track XML files selected
    self.article_xml_filenames = []
    self.xml_file_to_doi_map = {}
    self.articles = []
    # self.article_published_file_names = []
    # self.article_not_published_file_names = []

    self.admin_email_content = ''

    # journal
    self.journal = 'elife'
