def send_report(self, jobs):
    """
    Send weekly report of new translation jobs to lead translator.

    Pass:
        Sequence of tuples of values
    """

    report = self.create_report(jobs)
    self.logger.debug("report\n%s", report)
    if self.recip:
        recips = [self.recip]
    else:
        group = "Spanish Translation Leads"
        if self.test:
            group = "Test Translation Queue Recips"
        recips = Job.get_group_email_addresses(group)
    if recips:
        subject = "[%s] %s" % (cdr.Tier().name, self.title)
        opts = dict(subject=subject, body=report, subtype="html")
        message = cdr.EmailMessage(self.SENDER, recips, **opts)
        message.send()
        self.logger.info("sent %s", subject)
        self.logger.info("recips: %s", ", ".join(recips))
    else:
        self.logger.error("no email recipients for %s", group)
def sendErrorMessage(msg):
    # We want to send an email so that the query doesn't silently fail
    # ----------------------------------------------------------------
    args = cdr.Tier().name, "*** Error: Program CheckHotfixRemove failed!"
    subject = "[%s] %s" % args
    recips = cdr.getEmailList("Developers Notification")
    mailHeader = """\
From: %s
To: %s
Subject: %s
""" % (STR_FROM, ", ".join(recips), subject)
    mailHeader += "Content-type: text/html; charset=utf-8\n"

    mailBody = "<b>Error running HotfixRemove.py</b><br>"
    mailBody += "Most likely %s<br>" % msg
    mailBody += "See log file for details."

    # Add a separator line + body
    # ---------------------------
    message = mailHeader + "\n" + mailBody

    server = smtplib.SMTP(SMTP_RELAY)
    server.sendmail(STR_FROM, recips, message.encode('utf-8'))
    server.quit()
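
# A minimal sketch (not part of the original module) of the same
# notification built with the standard library's email.message API
# instead of hand-assembled headers, which break if a subject or
# recipient list ever contains a newline. STR_FROM and SMTP_RELAY are
# the constants used above; the function name is ours for illustration.
from email.message import EmailMessage

def sendErrorMessageModern(msg):
    subject = "[%s] %s" % (cdr.Tier().name,
                           "*** Error: Program CheckHotfixRemove failed!")
    recips = cdr.getEmailList("Developers Notification")
    body = ("<b>Error running HotfixRemove.py</b><br>"
            "Most likely %s<br>"
            "See log file for details." % msg)
    message = EmailMessage()
    message["From"] = STR_FROM
    message["To"] = ", ".join(recips)
    message["Subject"] = subject
    message.set_content(body, subtype="html", charset="utf-8")
    with smtplib.SMTP(SMTP_RELAY) as server:
        server.send_message(message)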
def send_report(self, control):
    report = self.create_report(control)
    control.logger.debug("report\n%s", report)
    group = None
    if self.recip:
        recips = [self.recip]
    elif control.test:
        group = "Test Translation Queue Recips"
        recips = Job.get_group_email_addresses(group)
    else:
        recips = [self.email]
    if recips:
        subject = "[%s] %s" % (cdr.Tier().name, control.title)
        opts = dict(subject=subject, body=report, subtype="html")
        message = cdr.EmailMessage(self.SENDER, recips, **opts)
        message.send()
        control.logger.info("sent %s", subject)
        control.logger.info("recips: %s", ", ".join(recips))
    else:
        # group is None unless the test branch chose the recipients.
        control.logger.error("no email recipients for %s", group)
def send_report(self, report):
    """
    Email the report to the right recipient list.

    report    Serialized HTML document for the report.
    """

    if self.recip:
        recips = [self.recip]
    else:
        if self.test:
            group = "Test Publishing Notification"
        else:
            group = "Licensee Report Notification"
        recips = Job.get_group_email_addresses(group)
    title = "PDQ Distribution Partner List"
    subject = "[%s] %s" % (cdr.Tier().name, title)
    opts = dict(subject=subject, body=report, subtype="html")
    message = cdr.EmailMessage(self.SENDER, recips, **opts)
    message.send()
    self.logger.info("sent %s", subject)
    self.logger.info("recips: %s", ", ".join(recips))
class MailerJob:
    """
    Base class for mailer job processing. Cannot be used directly.
    Public methods include:

        run()
            Top-level method to invoke job processing.

        log(message)
            Appends progress or error information to logfile.

        fillQueue()
            Overridden by derived classes to define processing
            appropriate to each mailer type. See documentation for
            this method below.

        addToQueue(job)
            Called by the derived classes' implementations of
            fillQueue() to add a PrintJob object to the print queue
            for the mailer job.

        addMailerTrackingDoc(doc, recipient, mailerType)
            Invoked by the derived classes' implementations of
            fillQueue() to insert a Mailer document into the CDR.

        formatAddress(addr)
            Formats address information into a block of printable
            text lines and returns the block.

        getCipsContactAddress(id)
            Returns an Address object for the ContactDetail
            information in a Person document identified as the CIPS
            contact address.

        getOrganizationAddress(id)
            Returns an Address object for the ContactDetail
            information in an Organization document. Handles cases
            of directly referenced CIPSContactPerson, and generic
            Administrators.

        makeIndex()
            Builds an index list of recipients from the dictionary
            of Recipient objects and sorts it by country and
            postalCode.

        getId()
            Returns the ID for the publishing job.

        getSubset()
            Returns the string identifying the specific type of
            publishing job.

        getCursor()
            Returns object for executing database queries.

        getSession()
            Returns key for current CDR session.

        getDocIds()
            Returns the tuple of document IDs found in the
            pub_proc_doc table.

        getRecipients()
            Returns the dictionary containing the Recipient objects
            associated with this job. Populated by the derived
            classes during the process of filling the print queue.
            For jobs which use a single address for all packages
            sent to a given person, the Person document ID is used
            as the dictionary key. For jobs in which different
            addresses can be used for the same person, the keys
            used for the dictionary are the fragment links which
            identify a person and a specific address, so the same
            person can appear more than once if multiple addresses
            are used.

        getDocuments()
            Returns the dictionary containing the Document objects
            for the documents which will be mailed out for this
            job. Populated by the derived classes during the
            process of filling the print queue.

        getParm(name)
            Returns a possibly empty tuple of values stored in the
            pub_proc_parm table for this job. Filled by the base
            class.

        getDeadline()
            Returns a string in the form YYYY-MM-DD for the
            deadline by which the mailer must be responded to.
            Can be overridden by the derived classes as
            appropriate.

        getJobTime()
            Returns a string in the form YYYY-MM-DDTHH:MM:SS
            representing the date/time the job processing began.

        commit()
            Commits the current open database transaction.
    """

    #------------------------------------------------------------------
    # Class-level values.
    #------------------------------------------------------------------
    __TIER = cdr.Tier()
    __CDR_EMAIL = "PDQ Operator <*****@*****.**>"
    __SMTP_RELAY = "MAILFWD.NIH.GOV"
    __LOGFILE = _LOGFILE
    __DEF_PRINTER = "\\\\CIPSFS1\\HP8100"
    __INCLUDE_PATH = f"{cdr.WORK_DRIVE}:/cdr/Mailers/include"
    __ERR_PATTERN = re.compile("<Err>(.*)</Err>")

    #------------------------------------------------------------------
    # Constructor for base class.
    #------------------------------------------------------------------
    def __init__(self, jobId, batchPrinting=1):
        """
        Parameters:
            jobId - Integer for publication job number.
""" self.__id = jobId self.__nMailers = 0 self.__docIds = [] self.__recipients = {} self.__index = [] self.__documents = {} self.__parms = {} self.__printer = MailerJob.__DEF_PRINTER self.__batchPrinting = batchPrinting self.__letterLink = "" self.__cursor = None self.__email = None #------------------------------------------------------------------ # Public access methods. #------------------------------------------------------------------ def getId(self): return self.__id def getCursor(self): return self.__cursor def getSubset(self): return self.__subset def getSession(self): return self.__session def getDeadline(self): return self.__deadline def getDocIds(self): return self.__docIds def getRecipients(self): return self.__recipients def getIndex(self): return self.__index def getDocuments(self): return self.__documents def getJobTime(self): return self.__now def getCount(self): return self.__nMailers def getMailerIncludePath(self): return self.__INCLUDE_PATH def bumpCount(self): self.__nMailers += 1 def printDirect(self): self.__batchPrinting = 0 def addToQueue(self, job): self.__queue.append(job) def commit(self): self.__conn.commit() def getParm(self, name): v = self.__parms.get(name) return v and tuple(v) or () #------------------------------------------------------------------ # Driver for mailer job processing. #------------------------------------------------------------------ def run(self): """ Invokes the processing for a CDR mailer job. Catches and logs all exceptions. Returns 0 for success and 1 for failure. """ try: self.log("******** starting mailer job ********") self.__loadSettings() self.log("~~Finished __loadSettings") self.__mailerCleanup() self.log("~~Finished __mailerCleanup") self.__createQueue() self.log("~~Finished __createQueue") self.fillQueue() self.__printQueue(self.__batchPrinting) self.createRtfMailers() self.__cleanup("Success", "Processed %d mailers" % self.__nMailers) self.log("******** finished mailer job ********") return 0 except: (eType, eValue) = sys.exc_info()[:2] errMessage = eValue or eType self.log("ERROR: %s" % errMessage, tback=1) self.__packageFailureFiles() self.__cleanup("Failure", errMessage) return 1 #------------------------------------------------------------------ # Append message to logfile. #------------------------------------------------------------------ def log(self, message, tback=None): """ Appends progress or error information to a log file. Each entry is stamped with the current date and time and the number of the current publication job. No return value. No exceptions raised. """ try: msg = "Job %d: %s" % (self.__id, message) if tback: _LOGGER.exception(msg) else: _LOGGER.info(msg) except: pass #------------------------------------------------------------------ # Placeholder for method to populate the print queue for the job. #------------------------------------------------------------------ def fillQueue(self): """ The primary responsibility of the classes derived from MailerJob is to provide a definition of this method. This method must populate the object's print queue by invoking addToQueue() with instances of the PrintJob class, defined below. Each PrintJob object must represent a file which is ready to be written directly to the printer (for example, PostScript, or plain text, but not RTF files or Microsoft Word documents). 

        Furthermore, for each copy of each document added to the
        print queue, the implementation of fillQueue() must invoke
        the addMailerTrackingDoc() method to add a new document to
        the repository for tracking the responses to the mailer. The
        mailerType argument passed to that method must be a string
        which matches one of the valid values for MailerType
        enumerated in the schema for Mailer documents.

        The files created for the queue should be written to the
        current working directory (as should any intermediate working
        files), and the filenames provided to the constructor for the
        PrintJob objects should not include any path information.
        """
        raise Exception("fillQueue() must be defined by derived class")

    #------------------------------------------------------------------
    # Placeholder for method to create rtf mailers (if any).
    #------------------------------------------------------------------
    def createRtfMailers(self):
        """
        Do nothing if the derived class does not override this method.
        """
        pass

    #------------------------------------------------------------------
    # Generate a document for tracking a mailer.
    #------------------------------------------------------------------
    def addMailerTrackingDoc(self, doc, recipient, mailerType,
                             remailerFor=None, protOrgId=None,
                             email=None):
        """
        Parameters:
            doc         - Object of type Document, defined below.
            recipient   - Object of type Recipient, defined below.
            mailerType  - String containing a value matching the
                          list of valid values for MailerType
                          enumerated in the schema for Mailer docs.
            remailerFor - Optional integer for the document ID of an
                          earlier mailer that was sent out and never
                          responded to, and for which this is a
                          followup remailer.
            protOrgId   - String or integer form of CDR ID for a
                          protocol's lead organization (the one to
                          which this mailer is being sent); used to
                          distinguish between Status and Participant
                          mailers for the same protocol in the same
                          job.
            email       - Address used for electronic mailer.
        Return value:
            Integer ID for the newly inserted Mailer document.
        """
        if remailerFor:
            remailerFor = "\n   <RemailerFor cdr:ref='%s'/>" % \
                          cdr.normalize(remailerFor)
        else:
            remailerFor = ""
        if protOrgId:
            protOrg = "\n   <ProtocolOrg cdr:ref='%s'/>" % \
                      cdr.normalize(protOrgId)
        else:
            protOrg = ""
        if email:
            mode = "Web-based"
            address = """\
   <MailerAddress>
    <Email>%s</Email>
   </MailerAddress>""" % email
        else:
            mode = "Mail"
            address = recipient.getAddress().getXml()
        recipId = "CDR%010d" % recipient.getId()
        docId = "CDR%010d" % doc.getId()
        xml = """\
<CdrDoc Type="Mailer">
 <CdrDocCtl>
  <DocTitle>Mailer for document %s sent to %s</DocTitle>
 </CdrDocCtl>
 <CdrDocXml><![CDATA[
  <Mailer xmlns:cdr="cips.nci.nih.gov/cdr">
   <Type Mode='%s'>%s</Type>%s
   <JobId>%d</JobId>
   <Recipient cdr:ref="%s">%s</Recipient>%s
%s
   <Document cdr:ref="%s">%s</Document>
   <Sent>%s</Sent>
   <Deadline>%s</Deadline>
  </Mailer>]]>
 </CdrDocXml>
</CdrDoc>
""" % (docId, recipId, mode, mailerType, remailerFor, self.__id,
       recipId, recipient.getName(), protOrg, address, docId,
       doc.getTitle(), self.__now, self.getDeadline())
        rsp = cdr.addDoc(self.__session, doc=xml.encode('utf-8'),
                         checkIn="Y", ver="Y", val='Y')
        match = self.__ERR_PATTERN.search(rsp)
        if match:
            err = match.group(1)
            raise Exception("failure adding tracking document "
                            "for %s: %s" % (docId, err))
        self.__nMailers += 1
        digits = re.sub(r"[^\d]", "", rsp)
        return int(digits)

    #------------------------------------------------------------------
    # Convert Unicode string to Latin-1 character set.
    #------------------------------------------------------------------
    def encodeLatin1(self, unicodeString):
        return unicodeString.encode('latin-1')

    #------------------------------------------------------------------
    # Retrieve the CIPS contact Address for a mailer recipient.
    #------------------------------------------------------------------
    def getCipsContactAddress(self, id, withPersonTitle=TITLE_OMITTED):
        """
        Constructs and returns a new Address object for the document.

        Parameters:
            id              - Integer ID for CDR Person document.
            withPersonTitle - For Address constructor.
        Return value:
            Returns an Address object for the CIPS contact.
        """
        # Make string version of id
        docId = cdr.normalize(id)

        # Find fragment ID for CIPS contact location in Person doc
        rows = cdr.getQueryTermValueForId(
            '/Person/PersonLocations/CIPSContact', id, self.__conn)
        if not rows:
            raise Exception("no CIPSContact for %s" % docId)
        fragId = rows[0]

        # Filter to create AddressElement XML
        filters = ["name:Person Address Fragment With Name"]
        result = cdr.filterDoc(self.__session, filters, docId,
                               parm=(('fragId', fragId), ))

        # Expecting tuple of xml fragment, messages. Single string is error.
        if isinstance(result, (str, bytes)):
            raise Exception("failure extracting contact address "
                            "for %s: %s" % (docId, result))
        return Address(result[0], withPersonTitle)

    #------------------------------------------------------------------
    # Retrieve the contact address for a board member.
    #------------------------------------------------------------------
    def getBoardMemberAddress(self, personId, memberId):

        # Find fragment ID for CIPS contact location in BoardMemberInfo doc
        path = '/PDQBoardMemberInfo/BoardMemberContact/PersonContactID'
        rows = cdr.getQueryTermValueForId(path, memberId, self.__conn)

        # Filter to create AddressElement XML
        if rows:
            docId = cdr.normalize(personId)
            parms = (('fragId', rows[0]), )
            filters = ["name:Person Address Fragment With Name"]
        else:
            docId = cdr.normalize(memberId)
            parms = ()
            filters = ["name:Board Member Address Fragment With Name"]
        result = cdr.filterDoc(self.__session, filters, docId, parm=parms)
        if isinstance(result, (str, bytes)):
            raise Exception("failure extracting contact address "
                            "for %s: %s" % (docId, result))
        return Address(result[0])

    #------------------------------------------------------------------
    # Retrieve the CIPS contact Address object for an Organization.
    #------------------------------------------------------------------
    def getOrganizationAddress(self, id):
        """
        Parameters:
            id - Integer ID of the organization document.
        Return value:
            Address object for organization.
""" # Default 'name' of the recipient nameStr = 'Administrator' # See if we have a CIPS contact person whose real name we can use rows = cdr.getQueryTermValueForId( '/Organization/OrganizationDetails/CIPSContactPerson/@cdr:ref', id, self.__conn) if rows: # Construct and Address object for person, to get real name # Fatal error if we can't find one personAddr = self.getCipsContactAddress(rows[0]) nameStr = personAddr.getAddressee() # Find the fragment id in the Organization doc for # the address we need to send to # Filter the organization to construct an address rows = cdr.getQueryTermValueForId( '/Organization/OrganizationLocations/CIPSContact', id, self.__conn) if not rows: raise Exception("No CIPSContact element found for " "Organization %d" % id) filters = ["name:Organization Address Fragment"] parms = (("fragId", rows[0]), ) result = cdr.filterDoc(self.__session, filters, id, parm=parms) # Expecting tuple of xml fragment, messages. Single string is error. if type(result) == type(""): raise Exception("failure extracting contact address " "for %d: %s" % (id, result)) # Construct an address from returned XML orgAddr = Address(result[0]) # Add or replace the name string with the one we constructed above orgAddr.setAddressee(nameStr) return orgAddr #------------------------------------------------------------------ # Generate an index of the mailers in order of country + postal code. #------------------------------------------------------------------ def makeIndex(self): self.__index = [] recipients = self.getRecipients() for recipKey in recipients: recip = recipients[recipKey] address = recip.getAddress() country = address.getCountry() postalCode = address.getPostalCode() for doc in recip.getDocs(): self.__index.append((country, postalCode, recip, doc)) self.__index.sort() #------------------------------------------------------------------ # Create a formatted address block from an Address object. #------------------------------------------------------------------ def formatAddress(self, addr): return addr.format().getBlock() #------------------------------------------------------------------ # Create the directory for RTF mailers. Callback used by the # derived class where appropriate. #------------------------------------------------------------------ def initRtfMailers(self): # Does the output directory already exist try: os.chdir(self.__rtfMailerDir) except: # Doesn't exist, try to create it try: os.makedirs(self.__rtfMailerDir) except: self.log("Unable to create rtf mailer directory", tback=1) raise Exception("failure creating rtf mailer directory %s" % self.__rtfMailerDir) try: os.chdir(self.__rtfMailerDir) except: self.log("Unable to change to rtf mailer directory", tback=1) raise Exception("failure setting working directory to %s" % self.__rtfMailerDir) # Specify the hostname based on the environment we're in # ------------------------------------------------------ args = cdr.APPC, "GetBoardMemberLetters.py", self.__id url = "https://{}/cgi-bin/cdr/{}?job={:d}".format(*args) self.__letterLink = """ You can retrieve the letters at: %s """ % url #------------------------------------------------------------------ # Clear out orphaned mailer tracking documents (from failed jobs). 
    #------------------------------------------------------------------
    def __mailerCleanup(self):
        if os.getenv('SKIP_MAILER_CLEANUP'):
            return  # for faster testing
        try:
            results = cdr.mailerCleanup(self.__session)
            if results[0]:
                self.log("%d tracking document(s) marked as deleted"
                         % len(results[0]))
            for err in results[1]:
                self.log("__mailerCleanup: %s" % err)
        except:
            self.log("mailerCleanup failure", 1)

    #------------------------------------------------------------------
    # Prepare initial settings for job.
    #------------------------------------------------------------------
    def __loadSettings(self):
        self.__getDates()
        self.__getDbConnection()
        self.__getCdrSession()
        self.__loadDbInfo()

    #------------------------------------------------------------------
    # Calculate needed dates (now and 60 days from now).
    #------------------------------------------------------------------
    def __getDates(self):
        now = time.localtime(time.time())
        # Overflowing the day-of-month field is safe here: mktime()
        # normalizes the out-of-range value into a real calendar date.
        deadline = (now[0], now[1], now[2] + 60, 0, 0, 0, 0, 0, -1)
        deadline = time.localtime(time.mktime(deadline))
        self.__now = time.strftime("%Y-%m-%dT%H:%M:%S", now)
        self.__deadline = time.strftime("%Y-%m-%d", deadline)

    #------------------------------------------------------------------
    # Log into the CDR server.
    #------------------------------------------------------------------
    def __getCdrSession(self):
        rsp = str(cdr.login("cdrmailers", cdr.getpw("cdrmailers")))
        match = self.__ERR_PATTERN.search(rsp)
        if match:
            raise Exception("CDR login failure: %s" % match.group(1))
        self.__session = rsp

    #------------------------------------------------------------------
    # Log into the CDR database.
    #------------------------------------------------------------------
    def __getDbConnection(self):
        try:
            self.__conn = db.connect(user="******")
            self.__cursor = self.__conn.cursor()
        except Exception as e:
            raise Exception(f"database connection failure: {e}")

    #------------------------------------------------------------------
    # Load the settings for this job from the database.
    #------------------------------------------------------------------
    def __loadDbInfo(self):
        self.__getPubProcRow()
        self.__getPubProcDocRows()
        self.__getPubProcParmRows()

    #------------------------------------------------------------------
    # Load the row which matches this job from the pub_proc table.
    #------------------------------------------------------------------
    def __getPubProcRow(self):
        try:
            self.__cursor.execute("""\
                SELECT output_dir, email, pub_subset
                  FROM pub_proc
                 WHERE id = ?""", (self.__id, ))
            row = self.__cursor.fetchone()
            if not row:
                raise Exception("unable to find job %d" % self.__id)
            (self.__outputDir, self.__email, self.__subset) = row
            self.__rtfMailerDir = self.__outputDir + "-r"
        except Exception as e:
            raise Exception(f"database error retrieving pub_proc row: {e}")

    #------------------------------------------------------------------
    # Load the list of document IDs and other descriptive information
    # for each document to be mailed by this job.
    #------------------------------------------------------------------
    def __getPubProcDocRows(self):
        try:
            # Find id, version, title, document type name
            # for each document previously selected for mailing
            self.__cursor.execute("""\
                SELECT pub.doc_id, pub.doc_version, doc.title, type.name
                  FROM pub_proc_doc pub
                  JOIN document doc
                    ON pub.doc_id = doc.id
                  JOIN doc_type type
                    ON doc.doc_type = type.id
                 WHERE pub_proc = ?""", (self.__id, ))
            docDescriptorList = self.__cursor.fetchall()

            # Can't continue if there aren't any
            if not docDescriptorList:
                raise Exception("no documents found for job %d"
                                % self.__id)

            # Build a list of pure docIds (used by some software)
            # and of fuller information
            for row in docDescriptorList:

                # Append the id to plain list of ids
                self.__docIds.append(row[0])

                # Create a document object and add it to list of objects
                self.__documents[row[0]] = \
                    Document(row[0], row[2], row[3], row[1])

            # Convert the id list to a faster tuple
            # [Not sure why Bob did this]
            self.__docIds = tuple(self.__docIds)
        except Exception as e:
            raise Exception(
                f"database error retrieving pub_proc_doc rows: {e}")

    #------------------------------------------------------------------
    # Load the parameters stored in the pub_proc_parm table for this job.
    #------------------------------------------------------------------
    def __getPubProcParmRows(self):
        try:
            self.__cursor.execute("""\
                SELECT parm_name, parm_value
                  FROM pub_proc_parm
                 WHERE pub_proc = ?
                 ORDER BY id""", (self.__id, ))
            rows = self.__cursor.fetchall()
            if rows:
                for row in rows:
                    if row[0] not in self.__parms:
                        self.__parms[row[0]] = []
                    self.__parms[row[0]].append(row[1])
                    if row[0] == "Printer":
                        self.__printer = row[1]
        except Exception as e:
            raise Exception(f"database error retrieving job parms: {e}")

    #------------------------------------------------------------------
    # Create and populate the print queue.
    # Also creates and changes to the output directory.
    #------------------------------------------------------------------
    def __createQueue(self):
        self.__queue = []

        # Does the output directory already exist?
        try:
            os.chdir(self.__outputDir)
        except:
            # Doesn't exist, try to create it
            try:
                os.makedirs(self.__outputDir)
            except:
                self.log("Unable to create working directory", tback=1)
                raise Exception("failure creating working directory %s"
                                % self.__outputDir)
            try:
                os.chdir(self.__outputDir)
            except:
                self.log("Unable to change to working directory", tback=1)
                raise Exception("failure setting working directory to %s"
                                % self.__outputDir)

    #------------------------------------------------------------------
    # Print the jobs in the queue.
    #------------------------------------------------------------------
    def __printQueue(self, batchPrint=1):

        # If no mailers at this point, we're just doing electronic mailers.
        if not self.__nMailers:
            for file in os.listdir("."):
                os.unlink("./%s" % file)
            os.chdir("..")
            os.rmdir(self.__outputDir)

            # Nothing to print.
            return

        if batchPrint:
            outputFile = open("PrintJob.cmd", "w")
            outputFile.write("@echo off\n")
            outputFile.write("if %1. == . goto usage\n")
            outputFile.write("if %1. == howmany. goto showcount\n")
            outputFile.write("if %2. == . goto L1\n")
            for i in range(len(self.__queue)):
                outputFile.write("if %%2. == %d. goto L%d\n"
                                 % (i + 1, i + 1))
            outputFile.write("goto usage\n")
        else:
            outputFile = self.__printer
        i = 1
        for job in self.__queue:
            job.Print(outputFile, self.log, batchPrint, i)
            i += 1
        if batchPrint:
            outputFile.write("goto done\n")
            outputFile.write(":usage\n")
            outputFile.write("echo usage: PrintJob path-to-printer "
                             "[first [last]]\n")
            outputFile.write("echo or: PrintJob howmany\n")
            outputFile.write("echo (to show how many files the script "
                             "has without printing anything)\n")
            outputFile.write("echo e.g.: PrintJob \\\\CIPSFS1\\HP8100\n")
            outputFile.write("echo or: PrintJob \\\\CIPSFS1\\HP8100 "
                             "201 400\n")
            outputFile.write("echo (to print the second 200 files)\n")
            outputFile.write(":showcount\n")
            outputFile.write("echo this script contains %d files\n"
                             % len(self.__queue))
            outputFile.write(":done\n")
            outputFile.close()
        self.__packageFiles()

    #------------------------------------------------------------------
    # Create archive packages for the job's files.
    # Assumption: the current working directory is the job's output
    # directory. We switch to the parent of that directory. This
    # side effect should have no undesirable consequences, because
    # this is the last thing we do for the job.
    # Note: all files with the extensions '.xml', '.tex', '.log',
    # '.aux', and '.dvi' are packaged in a separate compressed tar
    # archive for intermediate files. Everything else goes into a
    # second tar archive, used to print the actual mailer documents.
    # Make sure nothing needed by this second archive gets a filename
    # extension used for the intermediate file archive.
    #------------------------------------------------------------------
    def __packageFiles(self):
        self.log("~~In packageFiles")
        workExt = ('xml', 'tex', 'log', 'aux', 'dvi', 'toc')
        dir = "Job%d" % self.getId()
        workName = "SupportFilesForJob%d.tar.bz2" % self.getId()
        printName = "PrintFilesForJob%d.tar.bz2" % self.getId()
        os.chdir("..")
        if not os.path.isdir(dir):
            raise Exception("INTERNAL ERROR: cannot find directory %s"
                            % dir)
        try:
            workFile = tarfile.open(workName, 'w:bz2')
            for ext in workExt:
                for file in glob.glob('%s/*.%s' % (dir, ext)):
                    workFile.add(file)
            workFile.close()
            for ext in workExt:
                for file in glob.glob('%s/*.%s' % (dir, ext)):
                    os.unlink(file)
        except:
            raise Exception("failure packing working files for job")
        try:
            printFile = tarfile.open(printName, 'w:bz2')
            for file in os.listdir(dir):
                printFile.add("%s/%s" % (dir, file))
            printFile.close()
            for file in os.listdir(dir):
                os.unlink("%s/%s" % (dir, file))
        except:
            raise Exception("failure creating print job package")
        os.rmdir(dir)

    #------------------------------------------------------------------
    # Create single archive package for a failed job's files.
    #------------------------------------------------------------------
    def __packageFailureFiles(self):
        self.log("~~In packageFailureFiles")
        dir = "Job%d" % self.getId()
        name = "FailedJob%d.tar.bz2" % self.getId()
        try:
            os.chdir(self.__outputDir)
            os.chdir("..")
        except:
            return
        if not os.path.isdir(dir):
            self.log("Cannot find directory %s" % dir)
            return
        try:
            archive = tarfile.open(name, 'w:bz2')
            for fName in glob.glob('%s/*' % dir):
                archive.add(fName)
            archive.close()
            for fName in glob.glob('%s/*' % dir):
                os.unlink(fName)
        except Exception as e:
            self.log("failure packing files for failed job: %s" % str(e))
            return
        os.rmdir(dir)

    #------------------------------------------------------------------
    # Clean up.
    #------------------------------------------------------------------
    def __cleanup(self, status, message):
        self.log("~~In cleanup")
        if self.__rtfMailerDir:
            os.chdir(self.__rtfMailerDir)
            command = f"{cdr.BASEDIR}/bin/fix-permissions.cmd"
            command = command.replace("/", "\\")
            process = cdr.run_command(command, merge_output=True)
            if process.returncode:
                args = self.__rtfMailerDir, process.stdout
                _LOGGER.error("fixing %s permissions: %s", *args)
            else:
                self.log(f"fixed permissions for {self.__rtfMailerDir}")
        try:
            self.__updateStatus(status, message)
            self.__sendMail()
            if self.__session:
                cdr.logout(self.__session)
        except:
            self.log("__cleanup failed, status was '%s'" % status,
                     tback=1)

    #------------------------------------------------------------------
    # Update the pub_proc table's status.
    #------------------------------------------------------------------
    def __updateStatus(self, status, message=None):
        self.log("~~In update status, status=%s" % status)
        message = message and str(message)
        try:
            if message:
                self.log("  (message: %s)" % message)
            self.__cursor.execute("""\
                UPDATE pub_proc
                   SET status = ?,
                       messages = ?,
                       completed = GETDATE()
                 WHERE id = ?""", (status, message, self.__id))
            self.__conn.commit()
        except:
            self.log("__updateStatus failed, status was '%s'" % status,
                     tback=1)

    #------------------------------------------------------------------
    # Inform the user that the job has completed.
    #------------------------------------------------------------------
    def __sendMail(self):
        try:
            if self.__email:
                self.log("Sending mail to %s" % self.__email)
                sender = MailerJob.__CDR_EMAIL
                subject = "[%s] CDR Mailer Job Status" % self.__TIER.name

                # Specify the hostname based on the environment we're in
                # ------------------------------------------------------
                args = cdr.APPC, "PubStatus.py", self.__id
                url = "https://{}/cgi-bin/cdr/{}?id={:d}".format(*args)
                message = """\
Job %d has completed. You can view a status report for this job at:

    %s
%s
Please do not reply to this message.
""" % (self.__id, url, self.__letterLink)
                opts = dict(subject=subject, body=message)
                cdr.EmailMessage(sender, [self.__email], **opts).send()
        except:
            self.log("failure sending email to %s: %s"
                     % (self.__email, cdr.exceptionInfo()))
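
# A minimal sketch (not from the original code) of how a derived class
# might satisfy the fillQueue() contract documented in the base class:
# one print-ready file per copy, one tracking document per copy, and
# filenames with no path information. The PrintJob constructor
# signature, the mailer type string, and the file contents are
# assumptions for illustration only.
class ExampleMailerJob(MailerJob):
    def fillQueue(self):
        for recipient in self.getRecipients().values():
            for doc in recipient.getDocs():
                # Write the print-ready file to the current working
                # directory, per the fillQueue() documentation.
                filename = "Doc%d.ps" % doc.getId()
                with open(filename, "w") as fp:
                    fp.write(self.formatAddress(recipient.getAddress()))
                # Must match a MailerType value from the Mailer schema
                # (placeholder shown here).
                self.addMailerTrackingDoc(doc, recipient,
                                          "Example mailer type")
                # PrintJob's constructor signature is assumed.
                self.addToQueue(PrintJob(filename))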
</body>
</html>
"""

# In Testmode we don't want to send the notification to the world
# ---------------------------------------------------------------
# Email constants
# ---------------
SMTP_RELAY = "MAILFWD.NIH.GOV"
strFrom = "PDQ Operator <*****@*****.**>"

if testMode:
    strTo = cdr.getEmailList('Test Publishing Notification')
else:
    strTo = cdr.getEmailList('Licensee Report Notification')

args = cdr.Tier().name, 'PDQ Distribution Partner List'
subject = "[%s] %s" % args

mailHeader = """\
From: %s
To: %s
Subject: %s
""" % (strFrom, ", ".join(strTo), subject)

mailHeader += "Content-type: text/html; charset=iso-8859-1\n"

# Add a separator line + body
# ---------------------------
message = mailHeader + "\n" + mailBody
#print message
class Control:
    """
    This is the class that does the real work. It is separated out
    so that we can provide a way to run this task from the command
    line.

    Class constants:

        TITLES          Map of report key to distinguishing part of
                        report title.
        DEFAULT_START   Fall back on this for beginning of date range
                        for report.
        DEFAULT_END     Fall back on this for end of date range.
        REPORTS         Full set of reports to be run by default
                        (in order).
        SENDER          First argument to cdr.EmailMessage constructor.
        CHARSET         Used in HTML page.
        TSTYLE          CSS formatting rules for table elements.
        TO_STRING_OPTS  Options used for serializing HTML report object.
        CG              DNS name for this tier's Cancer.gov host.
        B               HTML builder module imported at Control class
                        scope.
        HTML            HTML module imported at Control class scope.

    Instance properties:

        reports         Reports to be run in sequence specified.
        mode            Required report mode ("test" or "live").
        skip_email      If true, don't send report to recipients;
                        just save it.
        start           Beginning of date range for selecting
                        documents for report.
        end             End of date range for selecting documents
                        for report.
        test            Convenience Boolean reflecting whether mode
                        is 'test'.
        logger          Object for recording log information about
                        the report.
        cursor          Object for submitting queries to the database.
    """

    import lxml.html.builder as B
    import lxml.html as HTML

    TITLES = {
        "trials": "Trials",
        "english": "New/Changed English Summaries",
        "spanish": "New/Changed Spanish Summaries",
    }
    REPORTS = ["english", "spanish", "trials"]
    SENDER = "PDQ Operator <*****@*****.**>"
    CHARSET = "utf-8"
    TSTYLE = (
        "width: 80%",
        "border: 1px solid #999",
        "border-collapse: collapse",
        "margin-top: 30px",
    )
    TSTYLE = "; ".join(TSTYLE)
    TO_STRING_OPTS = {
        "pretty_print": True,
        "encoding": CHARSET,
        "doctype": "<!DOCTYPE html>"
    }
    TIER = cdr.Tier()
    CG = TIER.hosts["CG"]

    def __init__(self, options, logger):
        """
        Validate the settings:

        reports
            "english", "spanish", and/or "trials"; defaults to all
            three

        mode
            must be "test" or "live" (required); test mode restricts
            recipient list for report

        skip-email
            optional Boolean, defaults to False; if True, don't email
            the report to anyone

        log-level
            "info", "debug", or "error"; defaults to "info"

        start
            overrides the default start of the date range (a week ago)

        end
            overrides the default end of the date range (yesterday)

        recip
            optional email address for testing so we don't spam others

        timeout
            how many seconds we'll wait for a connection or a query
        """

        self.TODAY = datetime.date.today()
        self.DEFAULT_END = self.TODAY - datetime.timedelta(1)
        self.DEFAULT_START = self.TODAY - datetime.timedelta(7)
        self.logger = logger
        self.logger.info("====================================")
        self.reports = options.get("reports") or self.REPORTS
        self.mode = options["mode"]
        self.skip_email = options.get("skip-email", False)
        self.start = options.get("start") or str(self.DEFAULT_START)
        self.end = options.get("end") or str(self.DEFAULT_END)
        self.test = self.mode == "test"
        self.recip = options.get("recip")
        timeout = int(options.get("timeout", 300))
        self.cursor = db.connect(user="******", timeout=timeout).cursor()
        if self.skip_email:
            self.logger.info("skipping email of reports")

    def run(self):
        "Run each of the reports we've been asked to create."
        for key in self.reports:
            try:
                self.do_report(key)
            except Exception as e:
                self.logger.exception("do_report(%s): %s", key, e)
        self.logger.info("%s job completed", self.mode)

    def do_report(self, key):
        """
        Create, save, and (optionally) send out a single report.

        key       Identifies which report we should process.
                  See Control.REPORTS for expected values.
""" title_args = (self.TITLES[key], self.start, self.end) self.title = "GovDelivery %s Report (%s to %s)" % title_args self.key = key report = self.create_report() self.logger.debug("report\n%s", report) if not self.skip_email: self.send_report(report) self.save_report(report) self.logger.info(self.title) def create_report(self): """ Create an HTML document for one of this job's reports. The report on new trials deals with all of the new trials as a single result set, so we can hand off the generation of the report to the single TrialSet instance. The reports on summaries are broken down to show lots of subsets of the documents in separate tables, so we handle the logic here, instantiating as many SummarySet objects as we need (by calling the summary_table() method below). """ if self.key == "trials": return TrialSet(self).report() style = "font-size: .9em; font-style: italic; font-family: Arial" body = self.B.BODY( self.B.H3(self.title, style="color: navy; font-family: Arial;"), self.B.P("Report date: %s" % datetime.date.today(), style=style)) for audience in ("Health professionals", "Patients"): body.append(self.summary_table("Summary", True, audience)) body.append(self.summary_table("Summary", False, audience)) if self.key == "english": body.append(self.summary_table("DrugInformationSummary", True)) body.append(self.summary_table("DrugInformationSummary", False)) return self.serialize(self.B.HTML(self.html_head(), body)) def summary_table(self, doc_type, new, audience=None): """ Create a SummarySet instance to generate the table for a slice of the documents in the report. doc_type Either "Summary" or "DrugInformationSummary." new If true, find documents first published in the date range. Otherwise, find documents whose DateLastModified value falls withing this range. audience Either "Health professionals" or "Patients" (only used for summaries). """ args = {"doc_type": doc_type, "new": new, "audience": audience} if doc_type == "Summary": args["language"] = self.key.capitalize() return SummarySet(self, **args).table() def save_report(self, report): """ Write the generated report to the cdr/reports directory. report Serialized HTML document for the report. """ now = datetime.datetime.now().isoformat() stamp = now.split(".")[0].replace(":", "").replace("-", "") test = self.test and ".test" or "" name = "gd-%s-%s%s.html" % (self.key, stamp, test) path = "%s/reports/%s" % (cdr.BASEDIR, name) fp = open(path, "wb") fp.write(report) fp.close() self.logger.info("created %s", path) def html_head(self): "Common code to create the top part of the generated report." return self.B.HEAD( self.B.META(charset=self.CHARSET), self.B.TITLE(self.title), ) def send_report(self, report): """ Email the report to the right recipient list. report Serialized HTML document for the report. 
""" if self.recip: recips = [self.recip] else: if self.test: group = "Test Publishing Notification" else: group = { "spanish": "GovDelivery ES Docs Notification", "english": "GovDelivery EN Docs Notification", "trials": "GovDelivery Trials Notification" }.get(self.key) recips = Job.get_group_email_addresses(group) if recips: subject = "[%s] %s" % (self.TIER.name, self.title) opts = dict(subject=subject, body=report, subtype="html") message = cdr.EmailMessage(self.SENDER, recips, **opts) message.send() self.logger.info("sent %s", subject) self.logger.info("recips: %s", ", ".join(recips)) else: self.logger.error("no email recipients for %s", group) @classmethod def th(cls, label, **styles): """ Helper method to generate a table column header. label Display string for the column header styles Optional style tweaks. See merge_styles() method. """ default_styles = { "font-family": "Arial", "border": "1px solid #999", "margin": "auto", "padding": "2px", } style = cls.merge_styles(default_styles, **styles) return cls.B.TH(label, style=style) @classmethod def td(cls, data, url=None, **styles): """ Helper method to generate a table data cell. data Data string to be displayed in the cell styles Optional style tweaks. See merge_styles() method. """ default_styles = { "font-family": "Arial", "border": "1px solid #999", "vertical-align": "top", "padding": "2px", "margin": "auto" } style = cls.merge_styles(default_styles, **styles) if url: return cls.B.TD(cls.B.A(data, href=url), style=style) return cls.B.TD(data, style=style) @classmethod def li(cls, text, url=None): """ Helper method for creating a list item element. text Display string for the list item. url Optional URL, causing the text to be wrapped in a link element. """ if url: return cls.B.LI(cls.B.A(text, href=url, style="font-family: Arial")) return cls.B.LI(text, style="font-family: Arial") @classmethod def serialize(cls, html): """ Create a properly encoded string for the report. html Tree object created using lxml HTML builder. """ return cls.HTML.tostring(html, **cls.TO_STRING_OPTS) @staticmethod def merge_styles(defaults, **styles): """ Allow the default styles for an element to be overridden. defaults Dictionary of style settings for a given element. styles Dictionary of additional or replacement style settings. If passed as separate arguments the setting names with hyphens will have to have been given with underscores instead of hyphens. We restore the names which CSS expects. """ d = dict(defaults, **styles) s = ["%s:%s" % (k.replace("_", "-"), v) for k, v in d.items()] return ";".join(s)
# Open Log file and enter start message
# -------------------------------------
LOGFILE = 'PubEmail.log'
LOGGER = cdr.Logging.get_logger("PubEmail")
LOGGER.info('PubEmail Notification - Started')
LOGGER.info('Arguments: %s', sys.argv)

# Retrieve the Email addresses from the specified group
# -----------------------------------------------------
emailDL = sorted(cdr.getEmailList('Operator Publishing Notification'))
emailDev = sorted(cdr.getEmailList("Developers Notification"))

# Set the variables and send the message
# --------------------------------------
sender = "*****@*****.**"
subject = "[%s] %s" % (cdr.Tier().name, sys.argv[1])
message = """\
Automated Publishing Email Notification:

%s""" % sys.argv[2]

try:
    # Somebody needs to get the message if the group is empty
    if not len(emailDL):
        emailDL = emailDev
        subject = '*** DL Missing *** %s' % subject
    opts = dict(subject=subject, body=message)
    cdr.EmailMessage(sender, emailDL, **opts).send()
except:
    LOGGER.exception('*** Failure sending email message')
</html>
"""
else:
    raise NoNewDocumentsError('NoNewDocumentsError')

# In Testmode we don't want to send the notification to the world
# ---------------------------------------------------------------
# Email constants
# ---------------
if testMode:
    strTo = cdr.getEmailList('Test Publishing Notification')
else:
    strTo = cdr.getEmailList('Hotfix Remove Notification')
    #strTo.append(u'*****@*****.**')

args = cdr.Tier().name, "Document Candidates to be removed from Cancer.gov"
subject = "[%s] %s" % args

mailHeader = """\
From: %s
To: %s
Subject: %s
""" % (STR_FROM, ', '.join(strTo), subject)

mailHeader += "Content-type: text/html; charset=utf-8\n"

# Add a separator line + body
# ---------------------------
message = mailHeader + "\n" + mailBody
#print message
class Control:
    """
    Wrap the processing logic in a single namespace.

    Properties:

        cms_only      - set of document IDs for summaries we don't
                        send to partners
        job_id        - ID of the most recent licensee job
        job_ids       - IDs of all of the licensee jobs, sorted
                        chronologically
        job_path      - location of the filtered CDR documents
        logger        - object for writing to the log file
        newsums       - nested dictionary of checksums for docs we're
                        transferring
        oldsums       - nested dictionary of checksums for docs
                        already transferred
        prev_job_id   - ID of the job we transferred the last time
        prev_job_path - location of documents transferred by the
                        last run
        opts          - command-line argument values controlling
                        runtime options
        types         - sequence of exported document types
        week          - YYYYWW string representing the ISO week
    """

    CHECKSUMS = "CHECKSUMS"
    LICENSEE_DOCS = "d:/cdr/Output/LicenseeDocs"
    PUB_SHADOW = "d:/cdr/sftp_shadow"
    PUB_SHADOW_FULL = "{}/full".format(PUB_SHADOW)
    LANGUAGES = dict(English="en", Spanish="es")
    MEDIA_CATALOG = "media_catalog.txt"
    LOGGER = cdr.Logging.get_logger("sftp-export")
    SSH = ("d:/cygwin/bin/ssh.exe -i d:/etc/cdroperator_rsa "
           "-o LogLevel=error "
           "-o StrictHostKeyChecking=no")
    TIER = cdr.Tier()
    HOST = TIER.hosts["SFTP"]
    USER = "******"
    PATH = "/sftp/sftphome/cdrstaging/pdq-{}".format(TIER.name.lower())

    def __init__(self):
        """Log what we're about to do."""
        self.logger.info(47 * "*")
        self.logger.info("sftp-export-data.py - Started")
        self.logger.info(47 * "*")
        self.logger.info("Processing %s", self.job_path)
        self.logger.info("week %s", self.week)
        self.logger.info("path is %s", os.environ.get("PATH"))

    def run(self):
        """
        Execute the top-level processing for the script, performing
        four tasks:

            1. Create catalog files for what we're publishing.
            2. Create compressed archives for the files.
            3. Stage the files to be synced.
            4. Populate the public sFTP data share.
        """
        start = datetime.now()
        os.chdir(self.job_path)

        # Optionally skip the first three steps if so requested.
        if not self.opts.push_only:

            # 1. Create the catalog and auxiliary files.
            if not self.opts.skip_catalogs:
                self.create_catalogs()

            # 2. Create tar files.
            self.create_archives()

            # 3. Copy files and move tar files to shadow location.
            self.copy_files()

        # 4. Sync the staging area to the sFTP server.
        if not self.opts.create_only:
            self.push_files()
            self.fix_permissions()

        elapsed = (datetime.now() - start).total_seconds()
        self.logger.info("")    # Blank line to format log output
        self.logger.info("completed in %f seconds", elapsed)
        self.logger.info(47 * "*")
        self.logger.info("sftp-export-data.py - Finished")
        self.logger.info(47 * "*")

    def create_catalogs(self):
        """
        Compare what we published last week with what we're
        publishing now.

        Creates a summary file listing counts for added, removed, and
        changed files of each document type. Also, for each document
        type for which any such differences have occurred since the
        previous week, creates a manifest, listing each document,
        with the action behind the difference.
""" for path in glob("{}/*.{}".format(self.job_path, self.week)): self.logger.debug("removing %r", path) os.remove(path) changes = [] self.logger.info("Processing doctype directories:") for doctype in self.types: self.logger.info("...processing %s", doctype) oldsums = self.oldsums.get(doctype, {}) newsums = self.newsums.get(doctype, {}) olddocs = set(oldsums) newdocs = set(newsums) added = newdocs - olddocs dropped = olddocs - newdocs kept = olddocs & newdocs docs = [] for name in added: docs.append((self.extract_id(name), name, "added")) for name in dropped: docs.append((self.extract_id(name), name, "dropped")) changed = 0 for name in kept: if oldsums[name] != newsums[name]: changed += 1 docs.append((self.extract_id(name), name, "modified")) if docs: path = "{}/{}.{}".format(self.job_path, doctype, self.week) with open(path, "w") as fp: for doc_id, name, action in sorted(docs): fp.write("{}:{}\n".format(name, action)) self.logger.debug("%d line(s) in %s", len(docs), path) prefix = "{}.{}".format(doctype, self.week) changes.append("{}:added:{:d}\n".format(prefix, len(added))) changes.append("{}:removed:{:d}\n".format(prefix, len(dropped))) changes.append("{}:modified:{:d}\n".format(prefix, changed)) if doctype == "Summary": self.catalog_summaries(newdocs) with open("{}/{}.changes".format(self.job_path, self.week), "w") as fp: for change in sorted(changes): fp.write(change) self.logger.info("catalogs created") def catalog_summaries(self, filenames): """ Create Summary.en and Summary.es catalog files. Each file lists the file names (one per line) of each of the summary documents in the language of the catalog file. """ summaries = dict(en=[], es=[]) for filename in filenames: language = self.get_summary_language(filename) summaries[language].append((self.extract_id(filename), filename)) for language in summaries: args = self.job_path, language with open("{}/Summary.{}".format(*args), "w") as fp: for doc_id, filename in sorted(summaries[language]): fp.write("{}\n".format(filename)) self.logger.info("cataloged summaries by language") def get_summary_language(self, filename): """ Parse the summary document to determine its language. Pass: filename - string naming the file to examine """ path = "{}/Summary/{}".format(self.job_path, filename) root = etree.parse(path).getroot() language = cdr.get_text(root.find("SummaryMetaData/SummaryLanguage")) return self.LANGUAGES[language] def create_archives(self): """ Create compressed archives for the published files. A tar file is created for each of the document types. In addition, a complete tar file is created containing all of the document types, as well as the catalogs created above. """ self.logger.info("") # Blank line to format log output self.logger.info("Creating full.tar.gz") os.chdir(self.job_path) with tarfile.open("full.tar.gz", "w:gz") as tar: for name in self.types: self.logger.info("...adding %s", name) tar.add(name) catalog_name = "{}.{}".format(name, self.week) if os.path.exists(catalog_name): tar.add(catalog_name) if name == "Summary": tar.add("Summary.en") tar.add("Summary.es") if os.path.exists(self.MEDIA_CATALOG): tar.add(self.MEDIA_CATALOG) tar.add("{}.changes".format(self.week)) self.logger.info("") self.logger.info("Creating doctype tar files:") for name in self.types: tarname = "{}.tar.gz".format(name) with tarfile.open(tarname, "w:gz") as tar: self.logger.info("...creating %s.tar.gz", name) tar.add(name) def copy_files(self): """ Populate the local sFTP shadow directory. 

        Copy the individual document files and catalogs to the shadow
        directory, and move the compressed archive files to that
        location.
        """
        self.logger.info("")    # Blank line to format log output
        self.logger.info("Copying files to shadow location")
        os.chdir(self.job_path)
        destination = "{}/full.tar.gz".format(self.PUB_SHADOW)
        try:
            os.remove(destination)
        except:
            print("Can't remove {}".format(destination))
        shutil.move("full.tar.gz", destination)
        full = self.PUB_SHADOW_FULL
        shutil.rmtree(full, ignore_errors=True)
        os.mkdir(full)
        shutil.copy("{}.changes".format(self.week), full)
        if os.path.exists(self.MEDIA_CATALOG):
            shutil.copy(self.MEDIA_CATALOG, full)
        for name in self.types:
            destination = "{}/{}".format(full, name)
            self.logger.info("...copying %s", name)
            shutil.copytree(name, destination)
            args = "{}.tar.gz".format(name), "{}.tar.gz".format(destination)
            shutil.move(*args)
            catalog_name = "{}.{}".format(name, self.week)
            if os.path.exists(catalog_name):
                shutil.copy(catalog_name, full)
            if name == "Summary":
                shutil.copy("Summary.en", full)
                shutil.copy("Summary.es", full)

    def push_files(self):
        """
        Update the sFTP server with the content of the shadow
        directory.

        Use `rsync` to get the individual document files and catalogs
        from the shadow directory to the sFTP server.
        """
        args = self.SSH, self.USER, self.HOST, self.PATH
        command = 'rsync --delete -rae "{}" full* {}@{}:{}'.format(*args)
        self.logger.info("")    # Blank line to format log output
        self.logger.info("ssh host: %s", self.HOST)
        self.logger.debug("ssh user: %s", self.USER)
        self.logger.info("rsync command: %s", command)
        os.chdir(self.PUB_SHADOW)
        result = cdr.run_command(command)
        self.logger.info("")    # Blank line to format log output
        self.logger.info("*** run_command output")
        self.logger.info(result.stdout)
        if result.stderr:
            self.logger.info("*** Error:")
            self.logger.info(result.stderr)
            self.logger.info("finished syncing files with errors!!!")
        else:
            self.logger.info("finished syncing files on FTP server")
        os.chdir(self.job_path)

    def load_checksums(self, persist=True, prune_cms_only=False):
        """
        Get the checksums for the CDR documents in the job tree.

        Assumes we have already made the current working directory
        the top-level directory for the job.

        If the checksums have already been calculated (as will
        typically be the case for the previous week's files), just
        load them from the file where they have been persisted.

        Pass:
            persist - if True (the default), save the calculated
                      checksums to save us from having to calculate
                      the sums for this job's files in a subsequent
                      run
            prune_cms_only - if True, drop documents we don't send
                             to the partners

        Return:
            nested dictionary of checksums, top level indexed by
            document type name, inner dictionaries indexed by file
            name, with values of hex strings for SHA256 digest hashes
        """
        checksums = {}
        if os.path.exists(self.CHECKSUMS):
            with open(self.CHECKSUMS) as fp:
                for line in fp:
                    checksum, path = line.strip().split(None, 1)
                    directory, filename = path.split("/")
                    if directory not in checksums:
                        checksums[directory] = {}
                    checksums[directory][filename] = checksum
        else:
            for directory in self.types:
                sums = checksums[directory] = {}
                for path in glob("{}/CDR*".format(directory)):
                    filename = os.path.split(path)[-1]
                    if prune_cms_only:
                        id = self.extract_id(filename)
                        if id in self.cms_only:
                            os.remove(path)
                            continue
                    sums[filename] = self.checksum(path)
                opts = len(sums), directory
                self.logger.debug("calculated %d checksums for %s files",
                                  *opts)
            if persist:
                with open(self.CHECKSUMS, "w") as fp:
                    for directory in sorted(checksums):
                        sums = checksums[directory]
                        for filename in sorted(sums, key=self.extract_id):
                            checksum = sums[filename]
                            path = "{}/{}".format(directory, filename)
                            fp.write("{} {}\n".format(checksum, path))
        return checksums

    def fix_permissions(self):
        """Make it possible for the data partners to retrieve the files."""
        args = self.SSH, self.USER, self.HOST, self.PATH
        command = '{} {}@{} "chmod -R 755 {}/full*"'.format(*args)
        self.logger.info("chmod command: %s", command)
        result = cdr.run_command(command)
        if result.stderr:
            self.logger.info("*** Error:")
            self.logger.info(result.stderr)
            self.logger.info("finished fixing permissions with errors!!!")
        else:
            self.logger.info("finished fixing permissions on FTP server")

    @property
    def cms_only(self):
        """List of summary documents we don't give to the data partners."""
        if not hasattr(self, "_cms_only"):
            query = db.Query("query_term_pub", "doc_id")
            query.where("path = '/Summary/@SVPC'")
            query.where("value = 'Yes'")
            rows = query.execute().fetchall()
            self._cms_only = {row.doc_id for row in rows}
        return self._cms_only

    @property
    def job_id(self):
        """
        Get the overridden or calculated ID of the last licensee job.

        Use the job ID passed as a specific option if available; else
        use the default of the last job ID found in the LicenseeDocs
        directory.
        """
        if not hasattr(self, "_job_id"):
            if self.opts.job:
                self._job_id = self.opts.job
            else:
                self._job_id = self.job_ids[-1]
        return self._job_id

    @property
    def job_ids(self):
        """
        Collect all of the job IDs in the LicenseeDocs directory.

        If we don't find at least one job, there's nothing to
        transfer, so we'll bail.
""" if not hasattr(self, "_job_ids"): os.chdir(self.LICENSEE_DOCS) job_ids = set() for name in glob("Job*"): match = re.match(r"^Job(\d+)$", name) if match: job_ids.add(int(match.group(1))) if not job_ids: self.logger.info("*** Error: No PDQ partner data found") exit(1) self._job_ids = sorted(job_ids) os.chdir(self.job_path) return self._job_ids @property def job_path(self): """Get the location of the documents to be transferred.""" if not hasattr(self, "_job_path"): args = self.LICENSEE_DOCS, self.job_id self._job_path = "{}/Job{:d}".format(*args) return self._job_path @property def logger(self): """Adjust the logging level as requested.""" if not hasattr(self, "_logger"): self._logger = Control.LOGGER self._logger.setLevel(self.opts.level.upper()) return self._logger @property def newsums(self): """Get checksums for the documents we are about to transfer.""" if not hasattr(self, "_newsums"): os.chdir(self.job_path) self._newsums = self.load_checksums(prune_cms_only=True) self.logger.info("loaded new checksums from %s", self.job_path) return self._newsums @property def oldsums(self): """ Get checksums for the documents we transferred last time. Get them from the previous job directory if available. Otherwise get them from the shadow SFTP directory. """ if not hasattr(self, "_oldsums"): directory = self.prev_job_path or self.PUB_SHADOW_FULL os.chdir(directory) self._oldsums = self.load_checksums() os.chdir(self.job_path) self.logger.info("loaded old checksums from %s", directory) return self._oldsums @property def opts(self): """Collect the command-line arguments.""" if not hasattr(self, "_opts"): parser = ArgumentParser() parser.add_argument("--job", type=int, help="enter job-id to process, default: last") parser.add_argument("--level", default="info", help="specify log level " "(debug, warn, [info], error)") parser.add_argument("--push-only", action="store_true", help="copy the latest existing data set") parser.add_argument("--create-only", action="store_true", help="create a new data set but do not copy") parser.add_argument("--skip-catalogs", action="store_true", help="skip creating auxilliary files") parser.add_argument("--week", help="use at your own risk") self._opts = parser.parse_args() return self._opts @property def prev_job_id(self): """ Get the ID of the job whose documents we transferred the last time. Return None if there are no jobs found older than the one we are transferring. 
""" if not hasattr(self, "_prev_job_id"): if len(self.job_ids) > 1: self._prev_job_id = self.job_ids[-2] else: self._prev_job_id = None return self._prev_job_id @property def prev_job_path(self): """Get the location of documents we transferred the last time.""" if not hasattr(self, "_prev_job_path"): if self.prev_job_id is None: self._prev_job_path = None else: args = self.LICENSEE_DOCS, self.prev_job_id self._prev_job_path = "{}/Job{:d}".format(*args) return self._prev_job_path @property def types(self): """Get the names of the document types we're transferring.""" if not hasattr(self, "_types"): os.chdir(self.job_path) types = [name for name in os.listdir(".") if os.path.isdir(name)] self._types = types return self._types @property def week(self): """Get the YYYYWW string for the job's ISO week.""" if not hasattr(self, "_week"): self._week = self.opts.week if not self._week: query = db.Query("pub_proc", "started") query.where(query.Condition("id", self.job_id)) started = query.execute().fetchone().started year, week, dow = started.isocalendar() self._week = "{:04d}{:02d}".format(year, week) return self._week @staticmethod def checksum(path): """ Create a checksum for the bytes of a file. Use of checksums instead of loading the old and new files into memory and comparing them speeds up the catalog generation from 5 1/2 minutes to 1 1/2 minutes on the CDR DEV server. This also avoids problems with aborted runs as reported in OCECDR-4348. We're using SHA256 instead of SHA-1 because Linus is planning to switch `git` from SHA-1 hashes to SHA256 hashes, and if SHA-1 isn't good enough for avoiding collisions in his view, then who are we to second-guess his judgment? Pass: path - string for relative path of file to checksum Return: Hex representation for SHA256 hash of file contents """ hasher = sha256() with open(path, "rb") as fp: for block in iter(partial(fp.read, 4096), b""): hasher.update(block) return hasher.hexdigest() @staticmethod def extract_id(name): """ Get the CDR document ID from the file name. Note: We do this so we can sort the names correctly. Pass: name - string for the file's name Return: integer extracted from the name """ root, ext = os.path.splitext(name) return int(root[3:])
# ---------------------------------------------------------------------
# Created: 2007-04-03        Volker Englisch
# *********************************************************************
import sys, re, cdr, os, shutil, time, getopt
from cdrapi import db

# Setting the host variable to submit the link for the error report
# ------------------------------------------------------------------
host = cdr.APPC
url = 'https://%s' % host

# Setting directory and file names
# --------------------------------
PUBPATH = os.path.join('d:\\cdr', 'publishing')
# PUBPATH = os.path.join('d:\\home', 'venglisch', 'cdr', 'publishing')

TIER = cdr.Tier().name
MAX_RETRIES = 10
RETRY_MULTIPLIER = 5.0
wait = 60    # number of seconds to wait between status checks

# The performance of the publishing job has greatly improved, allowing
# us to cancel a running job much sooner if it fails to finish.
# Optionally overridden below once we know the publishing subset.
# --------------------------------------------------------------------
if cdr.isProdHost():
    waitTotal = 10800    # 3.0 hours
elif cdr.isDevHost():
    waitTotal = 10800    # 3.0 hours
else:
    waitTotal = 14400    # 4.0 hours
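
# A minimal sketch (not from the original script) of how the constants
# above plausibly combine: poll the job's status every `wait` seconds
# and give up once `waitTotal` seconds have elapsed. The
# check_job_status() helper is hypothetical, shown only to illustrate
# the role of the two timing constants.
def wait_for_job(job_id):
    waited = 0
    while waited < waitTotal:
        status = check_job_status(job_id)   # hypothetical helper
        if status in ('Success', 'Failure'):
            return status
        time.sleep(wait)
        waited += wait
    raise Exception('Job %d did not finish within %d seconds'
                    % (job_id, waitTotal))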