def trial_def(self):
    """Parsed schema for the listing trial records (cached after first use)."""
    if hasattr(self, "_trial_def"):
        return self._trial_def
    serialized = getControlValue(self.CONTROL_GROUP, self.TRIAL_DEF)
    self._trial_def = loads(serialized)
    return self._trial_def
def info_def(self):
    """Parsed schema for the listing info records (cached after first use)."""
    if hasattr(self, "_info_def"):
        return self._info_def
    serialized = getControlValue(self.CONTROL_GROUP, self.INFO_DEF)
    self._info_def = loads(serialized)
    return self._info_def
def indexdef(self):
    """Schema for our index.

    INDEXDEF can be the name of a control value or a JSON
    serialization of the index mappings. If the former, the
    serialization will be fetched from the ctl table of the CDR
    database, which will have been populated for this row in the
    table from "dictionary--{name}.json" in the Database/Loader
    directory of the `cdr-server` git repository.

    Raises:
        Exception if no schema is provided or the schema cannot
        be loaded from any of the three supported sources.
    """
    if not hasattr(self, "_indexdef"):
        if not hasattr(self, "INDEXDEF"):
            raise Exception("no schema provided")
        try:
            # First choice: INDEXDEF names a control value.
            indexdef = getControlValue("dictionary", self.INDEXDEF)
            if indexdef:
                self._indexdef = loads(indexdef)
                return self._indexdef
            # Second choice: INDEXDEF is the path of a JSON file.
            with open(self.INDEXDEF) as fp:
                self._indexdef = load(fp)
        except FileNotFoundError:
            # Last resort: INDEXDEF is itself a JSON serialization.
            try:
                self._indexdef = loads(self.INDEXDEF)
            except Exception:
                # Was a bare except with an unused local assignment;
                # narrowed so SystemExit/KeyboardInterrupt are not
                # swallowed, and the dead variable is gone.
                self.logger.exception("Loading schema from string")
                raise Exception("can't load index schema")
        except Exception:
            self.logger.exception("Loading schema from %s", self.INDEXDEF)
            raise Exception(f"can't load schema from {self.INDEXDEF}")
    return self._indexdef
def tokens(self):
    """Strings which we don't alter when we normalize display names."""
    if not hasattr(self, "_tokens"):
        raw = getControlValue(self.CONTROL_GROUP, self.TOKENS)
        # One stripped token per line of the control value.
        self._tokens = {line.strip() for line in raw.splitlines()}
    return self._tokens
def load_notification_message(self, test=False):
    """
    Assemble the body for the message to be sent to the data partners.

    The top portion of the message is pulled from the ctl table, and
    the rest contains statistical information about what changed since
    last week's publishing job.
    """
    prefix = "test" if test else "data-partner"
    message = cdr.getControlValue("Publishing", f"{prefix}-notification")
    return f"{message}\n{self.format_stats()}\n"
def labels(self):
    """Sequence of dictionaries built from the label control value.

    Each line of the control value carries three pipe-delimited
    fields: pretty URL name, ID string, and label.
    """
    if not hasattr(self, "_labels"):
        self._labels = []
        labels = getControlValue(self.CONTROL_GROUP, self.LABELS)
        for line in labels.splitlines():
            # `id_string` avoids shadowing the id() builtin, and each
            # value is stripped exactly once (the original stripped
            # two of the values a second time, redundantly).
            url, id_string, label = [v.strip() for v in line.strip().split("|")]
            self._labels.append(dict(
                pretty_url_name=url,
                id_string=id_string,
                label=label,
            ))
    return self._labels
def overrides(self):
    """Hand-crafted labels and pretty URLs.

    Returns:
        dictionary of Override objects indexed by code

    Raises:
        Exception if a URL or a code appears in more than one
        override, or if an override contains an empty code.
    """
    if not hasattr(self, "_overrides"):
        overrides = getControlValue(self.CONTROL_GROUP, self.OVERRIDES)
        self._overrides = {}
        urls = set()
        for line in overrides.splitlines():
            override = Override(line)
            if override.url in urls:
                message = f"URL {override.url} in multiple overrides"
                raise Exception(message)
            # BUG FIX: the original never recorded the URL, so the
            # duplicate-URL check above could never fire.
            urls.add(override.url)
            for code in override.codes:
                if not code:
                    raise Exception(f"empty code in {line}")
                if code in self._overrides:
                    message = f"code {code} in multiple overrides"
                    raise Exception(message)
                self._overrides[code] = override
    return self._overrides
def warn(self):
    """
    Send an email message saying that the account will be deactivated
    soon. Also, send a separate message immediately to the operators,
    containing the warning message we just sent to the data partner.

    N.B.: If we're resuming after a partially-failed run, it is
    possible that multiple warning messages will be sent to the same
    partner, because (unlike normal notification) there is no
    mechanism for recording when the last warning was sent.
    """
    self.logger.info("warning %s of pending expiration", self.display)
    when = self.job.date_and_week()
    subject = f"Warning notice: NCI PDQ Test Account for {self.org}, {when}"
    template = cdr.getControlValue("Publishing", "test-partner-warning")
    expiring = str(self.expiring)[:10]
    message = template.replace("@@EXPIRING@@", expiring)
    self.send(subject, message)
    self.notify_ops(subject, message)
def disable(self):
    """
    Disable the account, and send a notice of expiration.

    Also, add a line to the summary report to be sent to the
    operators, and send a separate message immediately to the
    operators, containing the termination message we just sent
    to the data partner.

    N.B.: If processing fails between the step to send the expiration
    notice and the actual expiration itself, the partner will get a
    second notice when we correct the cause of the failure and resume
    processing. Very unlikely to happen.
    """
    self.logger.info("disabling %s", self.display)
    self.report(f"Disabled test account for {self.display}")
    when = self.job.date_and_week()
    subject = f"Expiration notice: NCI PDQ Test Account for {self.org}, {when}"
    message = cdr.getControlValue("Publishing", "test-partner-disabled")
    self.send(subject, message)
    self.expire()
    self.notify_ops(subject, message)
class Report(Job):
    """
    Task for generating a spreadsheet showing which PDQ partners
    have connected to the SFTP server to retrieve data.

    Attributes:
        logger - object for recording what we do
        resend - if True send a previously saved report
        noemail - if True don't email the report
        recips - sequence of recipient email addresses
        month - period for which activity is reported
        log_path - location of the log which we parse
        report_path - location to which report file is written
    """

    LOGNAME = "pdq-access-report"
    SENDER = "NCI PDQ Operator <*****@*****.**>"
    SUBJECT = "SFTP Log - PDQ Distribution Partner Access Report (%s)"
    MAX_TRIES = 5
    DELAY = 5
    WIDTHS = 15, 50, 40, 10, 10, 10
    LABELS = "Login", "Partner", "Path", "Session", "Date", "Time"
    # Login accounts known not to represent PDQ data partners.
    NON_PARTNERS = cdr.getControlValue("PDQ", "non-partners", "")
    NON_PARTNERS = set(NON_PARTNERS.split(","))
    SUPPORTED_PARAMETERS = {"month", "noemail", "recips", "resend"}

    def run(self):
        """
        Generate and/or send the report.
        """
        self.logger.info("Report started")
        for name in self.opts:
            self.logger.info("Option %s=%r", name, self.opts[name])
        if not self.resend:
            self.make_report(self.requests)
        if not self.noemail:
            self.send_report()

    @property
    def log_path(self):
        """Location of the log to be parsed."""
        if not hasattr(self, "_log_path"):
            self._log_path = self.month.log_path()
        return self._log_path

    @property
    def month(self):
        """Period for which activity is to be reported."""
        if not hasattr(self, "_month"):
            self._month = self.Month(self.opts.get("month"))
        return self._month

    @property
    def noemail(self):
        """If True we skip sending the report."""
        if not hasattr(self, "_noemail"):
            self._noemail = True if self.opts.get("noemail") else False
        return self._noemail

    @property
    def orgs(self):
        """
        Fetch the information about the organizations with which we
        partner.

        Returns:
            dictionary of Org objects indexed by sftp login ID
        """
        if hasattr(self, "_orgs"):
            return self._orgs
        url = "https://cdr.cancer.gov/cgi-bin/cdr/get-pdq-partners.py?p=CDR"
        self.logger.info("fetching partners from %r", url)

        class Org:
            """Values parsed from one org_id node of the partner feed."""
            def __init__(self, node):
                self.oid = int(node.get("oid"))
                self.name = cdr.get_text(node.find("org_name"))
                self.status = cdr.get_text(node.find("org_status"))
                self.uid = cdr.get_text(node.find("ftp_userid"))
                self.terminated = cdr.get_text(node.find("terminated"))

        root = etree.fromstring(requests.get(url).content)
        self._orgs = {}
        for node in root.findall("org_id"):
            org = Org(node)
            # Only orgs with an sftp login ID are useful as keys.
            if org.uid is not None:
                self._orgs[org.uid] = org
        return self._orgs

    @property
    def recips(self):
        """
        Figure out who we should send the report to.
        """
        if not hasattr(self, "_recips"):
            recips = self.opts.get("recips")
            if recips:
                self._recips = [r.strip() for r in recips.split(",")]
            else:
                self._recips = ["*****@*****.**"]
        return self._recips

    @property
    def report_path(self):
        """Location to which the report file is written."""
        if not hasattr(self, "_report_path"):
            self._report_path = self.month.report_path()
        return self._report_path

    @property
    def requests(self):
        """Partner requests extracted from the log file.

        Make sure we have the latest log files (using rsync), and then
        walk through each line in the log file for this report. We're
        interested in two types of lines (records): session opening
        lines, from which we build our dictionary of login IDs mapped
        by session IDs; and file opening lines, from which we parse
        our request objects. We skip over requests made using login
        accounts which are known not to represent PDQ data partners
        (CBIIT accounts, developer accounts, testing accounts, etc.).
        The session IDs appear in fields which look like this example:
        sshd[9223]: ... which is why we use the expression [5:-2] to
        extract them.
        """
        if hasattr(self, "_requests"):
            return self._requests

        class Request:
            def __init__(self, line, sids, orgs):
                """
                Extract the fields from the sftp activity log.

                Note that the second field holds the digit(s) for
                the date the request was received. In order to ensure
                that the value has a uniform width (for possible
                sorting purposes), we stick a zero in front of the
                value and use the substring starting two characters
                from the end (hence the -2 in the tokens[1][-2:]
                expression).

                Passed:
                    line - record from the sftp log, fields separated
                           by spaces
                    sids - dictionary of sftp login IDs indexed by
                           session ID
                    orgs - dictionary of partner org names indexed by
                           login ID
                """
                tokens = line.split()
                # Some log formats prepend a numeric field; drop it.
                if tokens[0].isdigit():
                    tokens = tokens[1:]
                self.date = "%s-%s" % (tokens[0], ("0" + tokens[1])[-2:])
                self.time = tokens[2]
                self.path = tokens[6][1:-1].replace("/pdq/full/", "")
                self.sid = int(tokens[4][5:-2])
                self.user = sids.get(self.sid, "")
                if self.user and self.user in orgs:
                    self.org = orgs[self.user].name or ""
                else:
                    self.org = ""

        self._requests = {}
        sids = {}
        count = 0
        self.logger.info("parsing %r", self.log_path)
        self.__sync_logs()
        with gzip.open(self.log_path) as fp:
            for line in fp.readlines():
                line = str(line, "utf-8")
                if "]: open " in line:
                    request = Request(line, sids, self.orgs)
                    if request.user in self.NON_PARTNERS:
                        continue
                    if request.user not in self._requests:
                        self._requests[request.user] = []
                    self._requests[request.user].append(request)
                    count += 1
                elif "session opened for local user" in line:
                    tokens = line.split()
                    if tokens[0].isdigit():
                        tokens = tokens[1:]
                    sid = int(tokens[4][5:-2])
                    user = tokens[10]
                    sids[sid] = user
        args = count, len(self._requests)
        self.logger.info("fetched %d requests from %d partners", *args)
        return self._requests

    @property
    def resend(self):
        """If True we send a previously saved report."""
        if not hasattr(self, "_resend"):
            self._resend = True if self.opts.get("resend") else False
        return self._resend

    @property
    def tier(self):
        """Run time settings."""
        if not hasattr(self, "_tier"):
            self._tier = Tier()
        return self._tier

    def __sync_logs(self):
        """
        Top up our local copies of the pdq logs from the sftp server.

        We're ignoring some expected errors, having to do with
        cygwin's difficulty in dealing with bizarre Windows file
        permissions configuration settings. If we really fail to
        bring down a needed log file successfully, we'll find out
        when we try to read it.
        """
        etc = self.tier.etc
        rsa = f"{etc}/cdroperator_rsa"
        ssh = f"ssh -i {rsa} -o LogLevel=error -o StrictHostKeyChecking=no"
        usr = "******"
        dns = "cancerinfo.nci.nih.gov"
        src = "%s@%s:/sftp/sftphome/cdrstaging/logs/*" % (usr, dns)
        cmd = "rsync -e \"%s\" %s ." % (ssh, src)
        fix = r"%s:\cdr\bin\fix-permissions.cmd ." % cdr.WORK_DRIVE
        cwd = os.getcwd()
        os.chdir(self.Month.LOGDIR)
        self.logger.info(cmd)
        cdr.run_command(cmd)
        if cdr.WORK_DRIVE:
            self.logger.info(fix)
            cdr.run_command(fix)
        os.chdir(cwd)

    def make_report(self, requests):
        """
        Generate and save a report of files fetched by the PDQ
        partners.

        Passed:
            requests - dictionary of Request lists indexed by login ID
        """
        book = openpyxl.Workbook()
        sheet = book.active
        sheet.title = "Requests"
        bold = openpyxl.styles.Font(size=12, bold=True)
        center = openpyxl.styles.Alignment(horizontal="center")
        # Keep the title and column headers visible while scrolling.
        sheet.freeze_panes = "A6"
        sheet["A1"] = str(datetime.date.today())
        sheet["A1"].font = bold
        sheet["A3"] = "Downloads for %s" % self.month
        sheet["A3"].font = bold
        for i, width in enumerate(self.WIDTHS):
            col = chr(ord("A") + i)
            cell = "%s5" % col
            sheet.column_dimensions[col].width = width
            sheet[cell] = self.LABELS[i]
            sheet[cell].font = bold
            sheet[cell].alignment = center
        row = 6
        for user in sorted(requests):
            for r in requests[user]:
                sheet.cell(row=row, column=1, value=r.user)
                sheet.cell(row=row, column=2, value=r.org)
                sheet.cell(row=row, column=3, value=r.path)
                sheet.cell(row=row, column=4, value=r.sid)
                sheet.cell(row=row, column=5, value=r.date).alignment = center
                sheet.cell(row=row, column=6, value=r.time).alignment = center
                row += 1
        book.save(self.report_path)
        self.logger.info("wrote %r", self.report_path)

    def send_report(self):
        """
        Send the report as an attachment to an email message.
        """
        label = str(self.month)
        book = cdr.EmailAttachment(filepath=self.report_path)
        subject = self.SUBJECT % label
        body = (
            "Attached is the monthly PDQ Distribution Partner report listing "
            "all documents downloaded from the SFTP server for %s.\n" % label,
            "The report is based on the log file provided at",
            " %s\n" % self.log_path,
            "Please save the attached report to the network directory",
            " L:\\OCPL\\_CROSS\\CDR\\Reports\\FTP Stats",
            "so the Clinical Trials team can access the information as needed.",
            "",
            "For questions or comments please reply to this email message.")
        body = "\n".join(body)
        recips = self.recips
        opts = dict(subject=subject, body=body, attachments=[book])
        message = cdr.EmailMessage(self.SENDER, recips, **opts)
        message.send()
        self.logger.info("sent report to %s", ", ".join(recips))

    class Month:
        """
        Period covered by the report.

        Attributes:
            year - integer for the year of the report's data
            month - integer for the month of the report's data
        """

        LOGDIR = cdr.BASEDIR + "/sftp_log"
        REPORTS = cdr.BASEDIR + "/reports"
        FILEBASE = "PDQPartnerDownloads"

        def __init__(self, yyyymm=None):
            """
            Extract the month and year from the YYYYMM string passed,
            if provided; otherwise get the month and year for the
            previous month.
            """
            if yyyymm:
                match = re.match(r"(\d\d\d\d)(\d\d)", yyyymm)
                if not match:
                    # BUG FIX: the original called self.logger.error()
                    # with an undefined name `month`, but Month has no
                    # logger attribute, so a malformed value would have
                    # raised NameError instead of the intended exception.
                    raise Exception("invalid parameter")
                self.year = int(match.group(1))
                self.month = int(match.group(2))
                self.start = datetime.date(self.year, self.month, 1)
            else:
                today = datetime.date.today()
                self.start = today - relativedelta(months=1, day=1)
                self.month = self.start.month
                self.year = self.start.year

        def log_path(self):
            """
            Construct the path for the location of the log to be parsed.
            """
            report_date = self.start + relativedelta(months=1)
            stamp = report_date.strftime("%Y%m%d")
            return "%s/pdq.log-%s.gz" % (self.LOGDIR, stamp)

        def report_path(self):
            """
            Construct the path for the location of the report to be
            generated.
            """
            ym = self.start.strftime("%Y-%m")
            return "%s/%s_%s.xlsx" % (self.REPORTS, self.FILEBASE, ym)

        def __str__(self):
            """
            Display the month in spelled-out English.

            This is invoked by the make_report() method to construct
            the report title. Python effectively calls str(arg) when
            interpolating arguments for '%s' placeholders in the
            string templates, and the built-in str() function uses
            the object's __str__() method if it has one.
            """
            return self.start.strftime("%B %Y")
def letters_json(self):
    """Letter type information usable by client-side scripting."""
    if hasattr(self, "_letters_json"):
        return self._letters_json
    self._letters_json = getControlValue("Mailers", self.LETTERS)
    return self._letters_json
def run(self):
    """Create the sitemap document and store it on Akamai's servers.

    We maintain a dictionary of glossary names so we only have to
    fetch and parse a GlossaryTerm document once, instead of once for
    the English name and a second time for the Spanish name. The
    selection process is driven by a CSV file, stored in the ctl
    table, with CDR ID, key, and language on each line. Key is one of
    term, genetic, or drug. Language is English or Spanish. See the
    Jira ticket (URL at the top of this file) for more details.
    """

    # Load the entries file.
    tier = self.tier
    self.logger.info("loading sitemap info from %s", tier)
    entries = getControlValue("dictionary", "sitemap-entries", tier=tier)

    # Prepare the database query used to fetch a CDR XML document.
    # The placeholder condition value (0) is replaced per entry via
    # cursor.execute() below.
    cursor = db.connect(user="******", tier=tier).cursor()
    query = db.Query("pub_proc_cg c", "t.name", "c.xml")
    query.join("document d", "d.id = c.id")
    query.join("doc_type t", "t.id = d.doc_type")
    query.where(query.Condition("c.id", 0))
    query = str(query)

    # Walk through each entry to determine whether we have a URL.
    glossary_names = dict()
    urls = dict()
    for line in entries.splitlines():
        line = line.strip()
        id, key, lang = line.split(",")
        id = int(id)
        cursor.execute(query, (id,))
        row = cursor.fetchone()
        if not row:
            # The document is not in the publishing cache; skip it.
            args = lang, key, id
            self.logger.warning("%s %s entry CDR%s not published", *args)
        else:
            if id in glossary_names:
                # Already parsed this GlossaryTerm for the other language.
                node = glossary_names[id].get(lang)
            else:
                root = etree.fromstring(row.xml.encode("utf-8"))
                if key == "drug":
                    # Drug entries come from Term documents; prefer the
                    # NCIT name, falling back to the preferred name.
                    if row.name != "Term":
                        raise Exception(f"CDR{id} has doctype {row.name}")
                    node = root.find("CGovInfo/NCITName")
                    if node is None or not node.text:
                        node = root.find("PreferredName")
                else:
                    # Glossary entry: cache both language names so a
                    # second entry for this ID needs no re-parse.
                    glossary_names[id] = dict()
                    node = root.find("TermName")
                    glossary_names[id]["English"] = node
                    node = root.find("SpanishTermName")
                    glossary_names[id]["Spanish"] = node
                    node = glossary_names[id].get(lang)
            url = None
            if node is not None and node.text:
                url = DictionaryAPILoader.Doc.Node.clean_pretty_url(node)
            if not url:
                # Fall back on the CDR ID when no pretty URL is available.
                args = lang, key, id
                message = "%s %s entry CDR%s has no URL; using CDR ID"
                self.logger.warning(message, *args)
                url = id
            pattern = self.PATTERNS[key][lang]
            url = f"https://www.cancer.gov/{pattern}/def/{url}"
            if url not in urls:
                urls[url] = []
            urls[url].append((lang, key, id))

    # Build the sitemap document, logging and skipping duplicate URLs.
    urlset = etree.Element(f"{self.NS}urlset", nsmap=self.NSMAP)
    for url in sorted(urls):
        if len(urls[url]) > 1:
            self.logger.warning("duplicate URL %r", url)
            for args in urls[url]:
                self.logger.warning("... used by %s %s entry CDR%s", *args)
        else:
            node = etree.SubElement(urlset, f"{self.NS}url")
            etree.SubElement(node, f"{self.NS}loc").text = url
            etree.SubElement(node, f"{self.NS}priority").text = "0.5"
            etree.SubElement(node, f"{self.NS}changefreq").text = "weekly"
    xml = etree.tostring(urlset, pretty_print=True, encoding="utf-8")
    if self.opts.get("dump"):
        # Dump mode: write the document to standard output instead of
        # saving and uploading it.
        print(xml.decode("utf-8"))
    else:
        try:
            # Save the document locally before uploading it.
            stamp = datetime.now().strftime("%Y%m%d%H%M%S")
            tempname = f"d:/tmp/sitemap-{stamp}.xml"
            with open(tempname, "wb") as fp:
                fp.write(xml)
        except Exception as e:
            # NOTE(review): `e` is unused; logger.exception() already
            # records the active exception.
            self.logger.exception("saving sitemap")
            raise
        try:
            # Push the document to its destination over sftp.
            with self.client.open_sftp() as sftp:
                sftp.put(tempname, self.DESTINATION)
                self.logger.info("sent %s to %s", tempname, self.hostname)
                if not self.opts.get("keep"):
                    unlink(tempname)
                else:
                    self.logger.info("preserving %s", tempname)
        except Exception as e:
            # NOTE(review): `e` is unused here as well.
            self.logger.exception("sending %s", tempname)
            raise
# Instantiate the Log class # --------------------------------------------------------------------- LOGGER = cdr.Logging.get_logger("PubJob", console=True) LOGGER.info("SubmitPubJob - Started") LOGGER.info('Arguments: %s', sys.argv) parseArgs(sys.argv) # Based on the command line parameter passed we are submitting a # interim publishing job or a full export # --------------------------------------------------------------- if fullMode: pubSubset = 'Export' else: pubSubset = 'Interim-Export' override = cdr.getControlValue("Publishing", f"{pubSubset}-wait-seconds") try: override = int(override) waitTotal = override except Exception: pass try: # Before we start we need to check if a publishing job is already # underway. It could be in the process of publishing or pushing. # We do not allow two jobs of the same job type to run simultanously. # Also, if a publishing job ran but the push job failed the # initiated push job would fail with a message 'Push job pending'. # --------------------------------------------------------------- LOGGER.info("Checking job queue ...")