Example #1
    def trial_def(self):
        """Schema for the listing trial records."""

        if not hasattr(self, "_trial_def"):
            trial_def = getControlValue(self.CONTROL_GROUP, self.TRIAL_DEF)
            self._trial_def = loads(trial_def)
        return self._trial_def
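
Examples 1, 2, and 4 share the same lazy-caching accessor idiom: the control value is fetched and parsed only on first access, then cached in a private attribute. Below is a minimal standalone sketch of the idiom, assuming the accessor is wrapped with @property (the decorator is not captured in these excerpts) and using a stand-in for getControlValue:

from json import loads


def getControlValue(group, name):
    """Stand-in for the real CDR control-value lookup (assumption)."""
    return '{"mappings": {}}'


class Loader:
    CONTROL_GROUP = "dictionary"
    TRIAL_DEF = "trial-def"

    @property
    def trial_def(self):
        """Schema for the listing trial records, parsed once and cached."""
        if not hasattr(self, "_trial_def"):
            value = getControlValue(self.CONTROL_GROUP, self.TRIAL_DEF)
            self._trial_def = loads(value)
        return self._trial_def


print(Loader().trial_def)   # {'mappings': {}}
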
Example #2
    def info_def(self):
        """Schema for the listing info records."""

        if not hasattr(self, "_info_def"):
            info_def = getControlValue(self.CONTROL_GROUP, self.INFO_DEF)
            self._info_def = loads(info_def)
        return self._info_def
Example #3
    def indexdef(self):
        """Schema for our index.

        INDEXDEF can be the name of a control value, the path of a file
        containing a JSON serialization of the index mappings, or that
        serialization itself. If the first, the serialization will be
        fetched from the ctl table of the CDR database, which will have
        been populated for this row in the table from "dictionary--{name}.json"
        in the Database/Loader directory of the `cdr-server` git repository.
        """

        if not hasattr(self, "_indexdef"):
            if not hasattr(self, "INDEXDEF"):
                raise Exception("no schema provided")
            try:
                indexdef = getControlValue("dictionary", self.INDEXDEF)
                if indexdef:
                    self._indexdef = loads(indexdef)
                    return self._indexdef
                with open(self.INDEXDEF) as fp:
                    self._indexdef = load(fp)
            except FileNotFoundError:
                try:
                    self._indexdef = loads(self.INDEXDEF)
                except Exception:
                    self.logger.exception("Loading schema from string")
                    raise Exception("can't load index schema")
            except Exception:
                self.logger.exception("Loading schema from %s", self.INDEXDEF)
                raise Exception(f"can't load schema from {self.INDEXDEF}")
        return self._indexdef
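
As the property above shows, INDEXDEF is tried in order as a control-value name, a file path, and finally an inline JSON string. The class names, control name, and path below are invented purely to illustrate those three forms:

# Invented class attributes illustrating the three forms indexdef() accepts.
class GlossaryLoader:
    INDEXDEF = "glossary"                                       # name of a control value

class DrugFileLoader:
    INDEXDEF = "d:/cdr/Database/Loader/dictionary--drugs.json"  # path to a JSON file

class InlineLoader:
    INDEXDEF = '{"mappings": {"properties": {}}}'               # inline JSON string
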
Example #4
    def tokens(self):
        """Strings which we don't alter when we normalize display names."""

        if not hasattr(self, "_tokens"):
            self._tokens = set()
            tokens = getControlValue(self.CONTROL_GROUP, self.TOKENS)
            for line in tokens.splitlines():
                self._tokens.add(line.strip())
        return self._tokens
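
The TOKENS control value is expected to hold one protected string per line; a small worked example with invented content:

# Invented content for the TOKENS control value (one protected string per line).
tokens_value = "NCI\nPDQ\nHIV/AIDS"

protected = set()
for line in tokens_value.splitlines():
    protected.add(line.strip())

print(protected)   # {'NCI', 'PDQ', 'HIV/AIDS'} (set order may vary)
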
Example #5
        def load_notification_message(self, test=False):
            """
            Assemble the body for the message to be sent to the data partners.

            The top portion of the message is pulled from the ctl table,
            and the rest contains statistical information about what
            changed since last week's publishing job.
            """

            name = "%s-notification" % ("test" if test else "data-partner")
            message = cdr.getControlValue("Publishing", name)
            return "%s\n%s\n" % (message, self.format_stats())
Example #6
    def labels(self):
        """Map tuples to dictionaries."""

        if not hasattr(self, "_labels"):
            self._labels = []
            labels = getControlValue(self.CONTROL_GROUP, self.LABELS)
            for line in labels.splitlines():
                values = line.strip().split("|")
                url, id, label = [value.strip() for value in values]
                self._labels.append(dict(
                    pretty_url_name=url,
                    id_string=id,
                    label=label,
                ))
        return self._labels
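
Each line of the LABELS control value carries three pipe-delimited fields (pretty URL name, ID string, label); a tiny worked example with invented data:

# Invented line from the LABELS control value.
line = "breast-cancer | C4872 | Breast Cancer"

url, id_string, label = [value.strip() for value in line.strip().split("|")]
record = dict(pretty_url_name=url, id_string=id_string, label=label)
print(record)
# {'pretty_url_name': 'breast-cancer', 'id_string': 'C4872', 'label': 'Breast Cancer'}
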
Example #7
    def overrides(self):
        """Hand-crafted labels and pretty URLs."""

        if not hasattr(self, "_overrides"):
            overrides = getControlValue(self.CONTROL_GROUP, self.OVERRIDES)
            self._overrides = {}
            urls = {}
            for line in overrides.splitlines():
                override = Override(line)
                if override.url in urls:
                    message = f"URL {override.url} in multiple overrides"
                    raise Exception(message)
                urls[override.url] = override  # remember the URL so duplicates are caught
                for code in override.codes:
                    if not code:
                        raise Exception(f"empty code in {line}")
                    if code in self._overrides:
                        message = f"code {code} in multiple overrides"
                        raise Exception(message)
                    self._overrides[code] = override
        return self._overrides
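
The Override class is defined elsewhere and its exact line format is not shown in this excerpt. Purely to illustrate the two attributes the loop above relies on (url and codes), here is one hypothetical shape it could take, assuming a pipe-delimited line whose first field is the pretty URL and whose remaining fields are codes:

class Override:
    """Hypothetical sketch only; the real parsing logic may differ."""

    def __init__(self, line):
        fields = [field.strip() for field in line.split("|")]
        self.url = fields[0]       # pretty URL for the override
        self.codes = fields[1:]    # concept codes mapped to that URL
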
Example #8
    def warn(self):
        """
        Send an email message saying that the account will be deactivated soon.

        Also, send a separate message immediately to the operators, containing
        the warning message we just sent to the data partner.

        N.B.: If we're resuming after a partially-failed run, it is possible
        that multiple warning messages will be sent to the same partner,
        because (unlike normal notification) there is no mechanism for
        recording when the last warning was sent.
        """

        self.logger.info("warning %s of pending expiration", self.display)
        subject = "Warning notice: NCI PDQ Test Account for %s" % self.org
        subject = "%s, %s" % (subject, self.job.date_and_week())
        template = cdr.getControlValue("Publishing", "test-partner-warning")
        message = template.replace("@@EXPIRING@@", str(self.expiring)[:10])
        self.send(subject, message)
        self.notify_ops(subject, message)
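
The [:10] slice keeps only the date portion of the expiration timestamp when it is dropped into the template; for example, with an invented template and date:

import datetime

# Invented template text and expiration value, for illustration only.
template = "Your test account will be deactivated on @@EXPIRING@@."
expiring = datetime.datetime(2024, 5, 1, 12, 30)

message = template.replace("@@EXPIRING@@", str(expiring)[:10])
print(message)   # Your test account will be deactivated on 2024-05-01.
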
Example #9
    def disable(self):
        """
        Disable the account, and send a notice of expiration.

        Also, add a line to the summary report to be sent to the operators,
        and send a separate message immediately to the operators, containing
        the termination message we just sent to the data partner.

        N.B.: If processing fails between the step to send the expiration
        notice and the actual expiration itself, the partner will get a
        second notice when we correct the cause of the failure and resume
        processing. Very unlikely to happen.
        """

        self.logger.info("disabling %s", self.display)
        self.report("Disabled test account for %s" % self.display)
        subject = "Expiration notice: NCI PDQ Test Account for %s" % self.org
        subject = "%s, %s" % (subject, self.job.date_and_week())
        message = cdr.getControlValue("Publishing", "test-partner-disabled")
        self.send(subject, message)
        self.expire()
        self.notify_ops(subject, message)
Example #10
class Report(Job):
    """
    Task for generating a spreadsheet showing which PDQ partners
    have connected to the SFTP server to retrieve data.

    Attributes:
        logger - object for recording what we do
        resend - if True send a previously saved report
        noemail - if True don't email the report
        recips - sequence of recipient email addresses
        month - period for which activity is reported
        log_path - location of the log which we parse
        report_path - location to which report file is written
    """

    LOGNAME = "pdq-access-report"
    SENDER = "NCI PDQ Operator <*****@*****.**>"
    SUBJECT = "SFTP Log - PDQ Distribution Partner Access Report (%s)"
    MAX_TRIES = 5
    DELAY = 5
    WIDTHS = 15, 50, 40, 10, 10, 10
    LABELS = "Login", "Partner", "Path", "Session", "Date", "Time"
    NON_PARTNERS = cdr.getControlValue("PDQ", "non-partners", "")
    NON_PARTNERS = set(NON_PARTNERS.split(","))
    SUPPORTED_PARAMETERS = {"month", "noemail", "recips", "resend"}

    def run(self):
        """
        Generate and/or send the report.
        """

        self.logger.info("Report started")
        for name in self.opts:
            self.logger.info("Option %s=%r", name, self.opts[name])
        if not self.resend:
            self.make_report(self.requests)
        if not self.noemail:
            self.send_report()

    @property
    def log_path(self):
        """Location of the log to be parsed."""

        if not hasattr(self, "_log_path"):
            self._log_path = self.month.log_path()
        return self._log_path

    @property
    def month(self):
        """Period for which activity is to be reported."""

        if not hasattr(self, "_month"):
            self._month = self.Month(self.opts.get("month"))
        return self._month

    @property
    def noemail(self):
        """If True we skip sending the report."""

        if not hasattr(self, "_noemail"):
            self._noemail = True if self.opts.get("noemail") else False
        return self._noemail

    @property
    def orgs(self):
        """
        Fetch the information about the organizations with which we partner.
        """

        if hasattr(self, "_orgs"):
            return self._orgs
        url = "https://cdr.cancer.gov/cgi-bin/cdr/get-pdq-partners.py?p=CDR"
        self.logger.info("fetching partners from %r", url)

        class Org:
            def __init__(self, node):
                self.oid = int(node.get("oid"))
                self.name = cdr.get_text(node.find("org_name"))
                self.status = cdr.get_text(node.find("org_status"))
                self.uid = cdr.get_text(node.find("ftp_userid"))
                self.terminated = cdr.get_text(node.find("terminated"))

        root = etree.fromstring(requests.get(url).content)
        self._orgs = {}
        for node in root.findall("org_id"):
            org = Org(node)
            if org.uid is not None:
                self._orgs[org.uid] = org
        return self._orgs

    @property
    def recips(self):
        """
        Figure out who we should send the report to.
        """

        if not hasattr(self, "_recips"):
            recips = self.opts.get("recips")
            if recips:
                self._recips = [r.strip() for r in recips.split(",")]
            else:
                self._recips = ["*****@*****.**"]
        return self._recips

    @property
    def report_path(self):
        """Location of the log to be parsed."""

        if not hasattr(self, "_report_path"):
            self._report_path = self.month.report_path()
        return self._report_path

    @property
    def requests(self):
        """Partner requests extracted from the log file.

        Make sure we have the latest log files (using rsync),
        and then walk through each line in the log file for this
        report. We're interested in two types of lines (records):
        session opening lines, from which we build our dictionary
        of login IDs mapped by session IDs; and file opening lines,
        from which we parse our request objects. We skip over requests
        made using login accounts which are known not to represent
        PDQ data partners (CBIIT accounts, developer accounts, testing
        accounts, etc.).

        The session IDs appear in fields which look like this example:

            sshd[9223]:

        ... which is why we use the expression [5:-2] to extract them.
        """

        if hasattr(self, "_requests"):
            return self._requests

        class Request:
            def __init__(self, line, sids, orgs):
                """
                Extract the fields from the sftp activity log.

                Note that the second field holds the digit(s) for
                the date the request was received. In order to
                ensure that the value has a uniform width (for
                possible sorting purposes), we stick a zero in
                front of the value and use the substring starting
                two characters from the end (hence the -2 in the
                tokens[1][-2:] expression).

                Passed:
                  line - record from the sftp log, fields separated by spaces
                  sids - dictionary of sftp login IDs indexed by session ID
                  orgs - dictionary of partner org names indexed by login ID
                """

                tokens = line.split()
                if tokens[0].isdigit():
                    tokens = tokens[1:]
                self.date = "%s-%s" % (tokens[0], ("0" + tokens[1])[-2:])
                self.time = tokens[2]
                self.path = tokens[6][1:-1].replace("/pdq/full/", "")
                self.sid = int(tokens[4][5:-2])
                self.user = sids.get(self.sid, "")
                if self.user and self.user in orgs:
                    self.org = orgs[self.user].name or ""
                else:
                    self.org = ""

        self._requests = {}
        sids = {}
        count = 0
        self.logger.info("parsing %r", self.log_path)
        self.__sync_logs()
        with gzip.open(self.log_path) as fp:
            for line in fp.readlines():
                line = str(line, "utf-8")
                if "]: open " in line:
                    request = Request(line, sids, self.orgs)
                    if request.user in self.NON_PARTNERS:
                        continue
                    if request.user not in self._requests:
                        self._requests[request.user] = []
                    self._requests[request.user].append(request)
                    count += 1
                elif "session opened for local user" in line:
                    tokens = line.split()
                    if tokens[0].isdigit():
                        tokens = tokens[1:]
                    sid = int(tokens[4][5:-2])
                    user = tokens[10]
                    sids[sid] = user
        args = count, len(self._requests)
        self.logger.info("fetched %d requests from %d partners", *args)
        return self._requests

    @property
    def resend(self):
        """If True we send a previously saved report."""

        if not hasattr(self, "_resend"):
            self._resend = True if self.opts.get("resend") else False
        return self._resend

    @property
    def tier(self):
        """Run time settings."""

        if not hasattr(self, "_tier"):
            self._tier = Tier()
        return self._tier

    def __sync_logs(self):
        """
        Top up our local copies of the pdq logs from the sftp server.
        We're ignoring some expected errors, having to do with cygwin's
        difficulty in dealing with bizarre Windows file permissions
        configuration settings. If we really fail to bring down a needed
        log file successfully, we'll find out when we try to read it.
        """

        etc = self.tier.etc
        rsa = f"{etc}/cdroperator_rsa"
        ssh = f"ssh -i {rsa} -o LogLevel=error -o StrictHostKeyChecking=no"
        usr = "******"
        dns = "cancerinfo.nci.nih.gov"
        src = "%s@%s:/sftp/sftphome/cdrstaging/logs/*" % (usr, dns)
        cmd = "rsync -e \"%s\" %s ." % (ssh, src)
        fix = r"%s:\cdr\bin\fix-permissions.cmd ." % cdr.WORK_DRIVE
        cwd = os.getcwd()
        os.chdir(self.Month.LOGDIR)
        self.logger.info(cmd)
        cdr.run_command(cmd)
        if cdr.WORK_DRIVE:
            self.logger.info(fix)
            cdr.run_command(fix)
        os.chdir(cwd)

    def make_report(self, requests):
        """
        Generate and save a report of files fetched by the PDQ partners.
        """

        book = openpyxl.Workbook()
        sheet = book.active
        sheet.title = "Requests"
        bold = openpyxl.styles.Font(size=12, bold=True)
        center = openpyxl.styles.Alignment(horizontal="center")
        sheet.freeze_panes = "A6"
        sheet["A1"] = str(datetime.date.today())
        sheet["A1"].font = bold
        sheet["A3"] = "Downloads for %s" % self.month
        sheet["A3"].font = bold
        for i, width in enumerate(self.WIDTHS):
            col = chr(ord("A") + i)
            cell = "%s5" % col
            sheet.column_dimensions[col].width = width
            sheet[cell] = self.LABELS[i]
            sheet[cell].font = bold
            sheet[cell].alignment = center
        row = 6
        for user in sorted(requests):
            for r in requests[user]:
                sheet.cell(row=row, column=1, value=r.user)
                sheet.cell(row=row, column=2, value=r.org)
                sheet.cell(row=row, column=3, value=r.path)
                sheet.cell(row=row, column=4, value=r.sid)
                sheet.cell(row=row, column=5, value=r.date).alignment = center
                sheet.cell(row=row, column=6, value=r.time).alignment = center
                row += 1
        book.save(self.report_path)
        self.logger.info("wrote %r", self.report_path)

    def send_report(self):
        """
        Send the report as an attachment to an email message.
        """

        label = str(self.month)
        book = cdr.EmailAttachment(filepath=self.report_path)
        subject = self.SUBJECT % label
        body = (
            "Attached is the monthly PDQ Distribution Partner report listing "
            "all documents downloaded from the SFTP server for %s.\n" % label,
            "The report is based on the log file provided at",
            "         %s\n" % self.log_path,
            "Please save the attached report to the network directory",
            "         L:\\OCPL\\_CROSS\\CDR\\Reports\\FTP Stats",
            "so the Clinical Trials team can access the information as needed.",
            "",
            "For questions or comments please reply to this email message.")
        body = "\n".join(body)
        recips = self.recips
        opts = dict(subject=subject, body=body, attachments=[book])
        message = cdr.EmailMessage(self.SENDER, recips, **opts)
        message.send()
        self.logger.info("sent report to %s", ", ".join(recips))

    class Month:
        """
        Period covered by the report.

        Attributes:
            year - integer for the year of the report's data
            month - integer for the month of the report's data
        """

        LOGDIR = cdr.BASEDIR + "/sftp_log"
        REPORTS = cdr.BASEDIR + "/reports"
        FILEBASE = "PDQPartnerDownloads"

        def __init__(self, yyyymm=None):
            """
            Extract the month and year from the YYYYMM string passed,
            if provided; otherwise get the month and year for the
            previous month.
            """

            if yyyymm:
                match = re.match(r"(\d\d\d\d)(\d\d)", yyyymm)
                if not match:
                    self.logger.error("malformed month %r", month)
                    raise Exception("invalid parameter")
                self.year = int(match.group(1))
                self.month = int(match.group(2))
                self.start = datetime.date(self.year, self.month, 1)
            else:
                today = datetime.date.today()
                self.start = today - relativedelta(months=1, day=1)
                self.month = self.start.month
                self.year = self.start.year

        def log_path(self):
            """
            Construct the path for the location of the log to be parsed.
            """

            report_date = self.start + relativedelta(months=1)
            stamp = report_date.strftime("%Y%m%d")
            return "%s/pdq.log-%s.gz" % (self.LOGDIR, stamp)

        def report_path(self):
            """
            Construct the path for the location of the report to be generated.
            """

            ym = self.start.strftime("%Y-%m")
            return "%s/%s_%s.xlsx" % (self.REPORTS, self.FILEBASE, ym)

        def __str__(self):
            """
            Display the month in spelled-out English.

            This is invoked by the make_report() method to construct
            the report title. Python effectively calls str(arg) when
            interpolating arguments for '%s' placeholders in the string
            templates, and the built-in str() function uses the object's
            __str__() method if it has one.
            """

            return self.start.strftime("%B %Y")
Example #11
    def letters_json(self):
        """Letter type information usable by client-side scripting."""

        if not hasattr(self, "_letters_json"):
            self._letters_json = getControlValue("Mailers", self.LETTERS)
        return self._letters_json
Example #12
    def run(self):
        """Create the sitemap document and store it on Akamai's servers.

        We maintain a dictionary of glossary names so we only have to
        fetch and parse a GlossaryTerm document once, instead of once
        for the English name and a second time for the Spanish name.

        The selection process is driven by a CSV file, stored in the
        ctl table, with CDR ID, key, and language on each line. Key is
        one of term, genetic, or drug. Language is English or Spanish.
        See the Jira ticket (URL at the top of this file) for more details.
        """

        # Load the entries file.
        tier = self.tier
        self.logger.info("loading sitemap info from %s", tier)
        entries = getControlValue("dictionary", "sitemap-entries", tier=tier)

        # Prepare the database query used to fetch a CDR XML document.
        cursor = db.connect(user="******", tier=tier).cursor()
        query = db.Query("pub_proc_cg c", "t.name", "c.xml")
        query.join("document d", "d.id = c.id")
        query.join("doc_type t", "t.id = d.doc_type")
        query.where(query.Condition("c.id", 0))
        query = str(query)

        # Walk through each entry to determine whether we have a URL.
        glossary_names = dict()
        urls = dict()
        for line in entries.splitlines():
            line = line.strip()
            id, key, lang = line.split(",")
            id = int(id)
            cursor.execute(query, (id,))
            row = cursor.fetchone()
            if not row:
                args = lang, key, id
                self.logger.warning("%s %s entry CDR%s not published", *args)
            else:
                if id in glossary_names:
                    node = glossary_names[id].get(lang)
                else:
                    root = etree.fromstring(row.xml.encode("utf-8"))
                    if key == "drug":
                        if row.name != "Term":
                            raise Exception(f"CDR{id} has doctype {row.name}")
                        node = root.find("CGovInfo/NCITName")
                        if node is None or not node.text:
                            node = root.find("PreferredName")
                    else:
                        glossary_names[id] = dict()
                        node = root.find("TermName")
                        glossary_names[id]["English"] = node
                        node = root.find("SpanishTermName")
                        glossary_names[id]["Spanish"] = node
                        node = glossary_names[id].get(lang)
                url = None
                if node is not None and node.text:
                    url = DictionaryAPILoader.Doc.Node.clean_pretty_url(node)
                if not url:
                    args = lang, key, id
                    message = "%s %s entry CDR%s has no URL; using CDR ID"
                    self.logger.warning(message, *args)
                    url = id
                pattern = self.PATTERNS[key][lang]
                url = f"https://www.cancer.gov/{pattern}/def/{url}"
                if url not in urls:
                    urls[url] = []
                urls[url].append((lang, key, id))

        # Build the sitemap document, logging and skipping duplicate URLs.
        urlset = etree.Element(f"{self.NS}urlset", nsmap=self.NSMAP)
        for url in sorted(urls):
            if len(urls[url]) > 1:
                self.logger.warning("duplicate URL %r", url)
                for args in urls[url]:
                    self.logger.warning("... used by %s %s entry CDR%s", *args)
            else:
                node = etree.SubElement(urlset, f"{self.NS}url")
                etree.SubElement(node, f"{self.NS}loc").text = url
                etree.SubElement(node, f"{self.NS}priority").text = "0.5"
                etree.SubElement(node, f"{self.NS}changefreq").text = "weekly"
        xml = etree.tostring(urlset, pretty_print=True, encoding="utf-8")
        if self.opts.get("dump"):
            print(xml.decode("utf-8"))
        else:
            try:
                stamp = datetime.now().strftime("%Y%m%d%H%M%S")
                tempname = f"d:/tmp/sitemap-{stamp}.xml"
                with open(tempname, "wb") as fp:
                    fp.write(xml)
            except Exception:
                self.logger.exception("saving sitemap")
                raise
            try:
                with self.client.open_sftp() as sftp:
                    sftp.put(tempname, self.DESTINATION)
                self.logger.info("sent %s to %s", tempname, self.hostname)
                if not self.opts.get("keep"):
                    unlink(tempname)
                else:
                    self.logger.info("preserving %s", tempname)
            except Exception:
                self.logger.exception("sending %s", tempname)
                raise
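
Each line of the sitemap-entries control value carries a CDR ID, a key (term, genetic, or drug), and a language (English or Spanish); an invented sample showing how the lines are split:

# Invented sample of sitemap-entries content; real IDs will differ.
entries = "45693,term,English\n45693,term,Spanish\n798765,drug,English"

for line in entries.splitlines():
    id, key, lang = line.strip().split(",")
    print(int(id), key, lang)
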
Example #13
# Instantiate the Log class
# ---------------------------------------------------------------------
LOGGER = cdr.Logging.get_logger("PubJob", console=True)
LOGGER.info("SubmitPubJob - Started")
LOGGER.info('Arguments: %s', sys.argv)

parseArgs(sys.argv)

# Based on the command line parameter passed we are submitting an
# interim publishing job or a full export
# ---------------------------------------------------------------
if fullMode:
    pubSubset = 'Export'
else:
    pubSubset = 'Interim-Export'
override = cdr.getControlValue("Publishing", f"{pubSubset}-wait-seconds")
try:
    override = int(override)
    waitTotal = override
except Exception:
    pass

try:
    # Before we start we need to check if a publishing job is already
    # underway.  It could be in the process of publishing or pushing.
    # We do not allow two jobs of the same job type to run simultaneously.
    # Also, if a publishing job ran but the push job failed the
    # initiated push job would fail with a message 'Push job pending'.
    # ---------------------------------------------------------------
    LOGGER.info("Checking job queue ...")