Example #1
    def produce_report(self):
        """Produce a report of the batch jobs."""
        s3_prefix = 'reading_results/%s/logs/%s/' % (self.basename,
                                                     self._job_queue)
        logger.info("Producing batch report for %s, from prefix %s." %
                    (self.basename, s3_prefix))
        s3 = boto3.client('s3')
        file_tree = get_s3_file_tree(s3, bucket_name, s3_prefix)
        logger.info("Found %d relevant files." % len(file_tree))
        stat_files = {
            'git_info.txt': (self._handle_git_info, self._report_git_info),
            'timing.txt': (self._handle_timing, self._report_timing),
            'raw_tuples.pkl': (None, None),
            'hist_data.pkl': (self._handle_hist_data, self._report_hist_data),
            'sum_data.pkl': (self._handle_sum_data, self._report_sum_data)
        }
        stat_aggs = {}
        for stat_file, (handle_stats, report_stats) in stat_files.items():
            logger.info("Aggregating %s..." % stat_file)
            # Prep the data storage.
            my_agg = {}

            # Get a list of the relevant files (one per job).
            file_paths = file_tree.get_paths(stat_file)
            logger.info("Found %d files for %s." %
                        (len(file_paths), stat_file))

            # Aggregate the data from all the jobs for each file type.
            for sub_path, file_entry in file_paths:
                s3_key = file_entry['key']
                ref = sub_path[0]
                file = s3.get_object(Bucket=bucket_name, Key=s3_key)
                file_bytes = file['Body'].read()
                if handle_stats is not None:
                    handle_stats(ref, my_agg, file_bytes)

            if report_stats is not None and len(my_agg):
                report_stats(my_agg)

            stat_aggs[stat_file] = my_agg

        for end_type, jobs in self.run_record.items():
            self.reporter.add_text('Jobs %s: %d' % (end_type, len(jobs)),
                                   section='Totals')

        s3_prefix = 'reading_results/%s/' % self.basename
        fname = self.reporter.make_report()
        with open(fname, 'rb') as f:
            s3.put_object(Bucket=bucket_name,
                          Key=s3_prefix + fname,
                          Body=f.read())
        s3.put_object(Bucket=bucket_name,
                      Key=s3_prefix + 'stat_aggregates_%s.pkl' % self.time_tag,
                      Body=pickle.dumps(stat_aggs))
        return file_tree, stat_aggs
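The handler and reporter callables paired in stat_files are methods defined elsewhere on the same class; they are invoked as handle_stats(ref, my_agg, file_bytes) and report_stats(my_agg). A minimal sketch of what one such pair could look like, assuming (purely for illustration) that each sum_data.pkl unpickles to a dict of counts and reusing the 'Totals' report section; ref is accepted but unused here:

    def _handle_sum_data(self, ref, agg, file_bytes):
        """Accumulate one job's sum_data.pkl counts into the aggregate."""
        # Assumption: each sum_data.pkl unpickles to a dict of counts.
        job_sums = pickle.loads(file_bytes)
        for label, count in job_sums.items():
            agg[label] = agg.get(label, 0) + count

    def _report_sum_data(self, agg):
        """Write the aggregated counts into the report."""
        for label, total in sorted(agg.items()):
            self.reporter.add_text('%s: %d' % (label, total),
                                   section='Totals')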
Example #2
def sort_s3_files_by_last_mod(bucket,
                              prefix,
                              time_delta=None,
                              extension=None,
                              unsigned=True,
                              reverse=False,
                              w_dt=False):
    """Return a list of s3 object keys sorted by their LastModified date on S3

    Parameters
    ----------
    bucket : str
        s3 bucket to look for keys in
    prefix : str
        The prefix to use for the s3 keys
    time_delta : Optional[datetime.timedelta]
        If used, specifies how far back to look for files on S3.
        Default: None.
    extension : Optional[str]
        If used, limit keys to those with the matching file extension.
        Default: None.
    unsigned : bool
        If True, use unsigned s3 client. Default: True.
    reverse : bool
        Reverse the sort order of the returned s3 files. Default: False.
    w_dt : bool
        If True, return a list of (key, datetime.datetime) tuples, where the
        datetime is the key's LastModified date. Default: False.

    Returns
    -------
    list
        A list of s3 keys. If w_dt is True, each item is a tuple of
        (key, datetime.datetime) of the LastModified date.
    """
    if time_delta is None:
        time_delta = timedelta()  # zero timedelta
    s3 = get_s3_client(unsigned)
    n_hours_ago = datetime.utcnow() - time_delta
    file_tree = get_s3_file_tree(s3,
                                 bucket,
                                 prefix,
                                 date_cutoff=n_hours_ago,
                                 with_dt=True)
    key_list = sorted(list(file_tree.get_leaves()),
                      key=lambda t: t[1],
                      reverse=reverse)
    if extension:
        return [
            t if w_dt else t[0] for t in key_list if t[0].endswith(extension)
        ]
    else:
        return key_list if w_dt else [t[0] for t in key_list]
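A typical call might look like the following; the bucket and prefix here are placeholders, not values taken from the original project:

from datetime import timedelta

# Keys modified within roughly the last week, returned with their
# LastModified timestamps because w_dt=True.
keys = sort_s3_files_by_last_mod('my-reading-bucket',
                                 'reading_results/run_1/logs/',
                                 time_delta=timedelta(days=7),
                                 extension='.txt',
                                 w_dt=True)
for key, last_mod in keys:
    print(last_mod.isoformat(), key)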
Example #3
def get_latest_pa_stmt_dump():
    s3_cli = get_s3_client(False)
    # Get file key
    dump_name = 'full_pa_stmts.pkl'
    file_tree = get_s3_file_tree(s3=s3_cli,
                                 bucket=DUMPS_BUCKET,
                                 prefix=DUMPS_PREFIX,
                                 with_dt=True)
    # Get all keys for dump_name
    keys = [key for key in file_tree.gets('key') if key[0].endswith(dump_name)]
    keys.sort(key=itemgetter(1))  # Sorts ascending by datetime

    return load_pickle_from_s3(s3_cli, keys[-1][0], DUMPS_BUCKET)
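The sort-then-take-last pattern above assumes gets('key') yields (key, LastModified) tuples when the tree was built with with_dt=True. The same idea in isolation, with made-up entries standing in for the S3 listing:

from datetime import datetime
from operator import itemgetter

# Illustrative stand-in for file_tree.gets('key') output.
entries = [
    ('prefix/2021-01-05/full_pa_stmts.pkl', datetime(2021, 1, 5)),
    ('prefix/2021-03-10/full_pa_stmts.pkl', datetime(2021, 3, 10)),
    ('prefix/2020-12-01/full_pa_stmts.pkl', datetime(2020, 12, 1)),
]
entries.sort(key=itemgetter(1))   # oldest first
latest_key = entries[-1][0]       # key of the newest dump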
Example #4
def get_latest_sif_s3(get_mesh_ids: bool = False) \
        -> Union[Tuple[Any, str], Tuple[Tuple[Any, str], Tuple[Any, str]]]:
    necc_files = [mngr.name for mngr in dumpers]
    if get_mesh_ids:
        necc_files.append(StatementHashMeshId.name)
    s3 = get_s3_client(unsigned=False)
    tree = get_s3_file_tree(s3,
                            bucket=DUMPS_BUCKET,
                            prefix=DUMPS_PREFIX,
                            with_dt=True)
    # Find all pickles and jsons
    keys = [
        key for key in tree.gets('key') if key[0].endswith(('.pkl', '.json'))
    ]
    # Sort newest first
    keys.sort(key=lambda t: t[1], reverse=True)
    # Keep only the keys that match one of the needed files
    keys_in_latest_dir = \
        [k[0] for k in keys if any(nfl in k[0] for nfl in necc_files)]
    # Map key to resource
    necc_keys = {}
    for n in necc_files:
        for k in keys_in_latest_dir:
            # check name then alt name
            if n in k:
                # Save and continue to next file in necc_files
                necc_keys[n] = k
                break
    logger.info(f'Latest files: {", ".join(necc_keys.values())}')
    df = load_pickle_from_s3(s3, key=necc_keys[Sif.name], bucket=DUMPS_BUCKET)
    sif_date = _get_date_from_s3_key(necc_keys[Sif.name])
    if get_mesh_ids:
        mid = load_pickle_from_s3(s3,
                                  key=necc_keys[StatementHashMeshId.name],
                                  bucket=DUMPS_BUCKET)
        meshids_date = _get_date_from_s3_key(
            necc_keys[StatementHashMeshId.name])
        return (df, sif_date), (mid, meshids_date)

    return df, sif_date
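Callers unpack the return value according to get_mesh_ids; both call patterns, following the type annotation above:

# Sif dump only.
sif_df, sif_date = get_latest_sif_s3()

# Sif dump plus the statement-hash/MeSH ID mapping.
(sif_df, sif_date), (mesh_ids, mesh_date) = \
    get_latest_sif_s3(get_mesh_ids=True)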
Example #5
def get_logs_from_s3(folder=None, cached=True, past_days=None):
    """Download logs from S3 and save into a local folder

    Parameters
    ----------
    folder : Optional[str]
        The directory in which to put the downloaded logs. Default: None.
    cached : bool
        If True, skip archives whose logs have already been extracted
        locally. Default: True.
    past_days : int|datetime.datetime
        Either an integer number of days into the past, or a
        datetime.datetime giving the earliest date, for which to download
        logs. If nothing is specified (default), all logs are downloaded.
        Default: None.

    Returns
    -------
    dir_set : set
        A set containing the dir paths of all the requested logs
    """
    s3 = get_s3_client(unsigned=False)
    days_ago = None
    if past_days:
        if isinstance(past_days, datetime):
            days_ago = past_days
        elif isinstance(past_days, int):
            days_ago = (datetime.utcnow()
                        - timedelta(days=past_days)).replace(tzinfo=timezone.utc)
    tree = get_s3_file_tree(s3, 'cwc-hms', 'bob_ec2_logs', days_ago)
    keys = tree.gets('key')
    # Here we only keep the tar.gz archives, which contain the facilitator
    # logs, plus any .json files (user data, if present) and loose .log files
    logger.info('Total number of objects: %d ' % len(keys))
    logger.info('Total number of images found: %d' %
                len([k for k in keys if 'image' in k]))
    keys = [key for key in keys if key.startswith('bob_ec2_logs/')
            and key.endswith(('.tar.gz', '.json', '.log'))]
    logger.info('Number of archives: %d' % len(keys))

    # File names look like:
    #   <image_id>_<container_hash>_<container_name>_<resource_name>.<ext>
    fname_patt = re.compile(
        r'([\w:-]+?)_(\w+?)_(\w+?_\w+?)_(.*)\.(tar\.gz|json|log)'
    )
    dir_set = set()
    for key in tqdm.tqdm(keys):
        fname = os.path.basename(key)
        m = fname_patt.match(fname)
        if m is None:
            logger.warning("File name %s failed to match %s. Skipping..."
                           % (fname, fname_patt))
            continue
        image_id, cont_hash, cont_name, resource_name, suffix = m.groups()
        head_dir_path = '%s_%s_%s' % (image_id.replace(':', '-'), cont_name,
                                      cont_hash)
        dir_set.add(head_dir_path)
        if folder:
            head_dir_path = os.path.join(folder, head_dir_path)
        if not os.path.exists(head_dir_path):
            os.makedirs(head_dir_path, exist_ok=True)
        if resource_name == 'bioagent_images':
            outpath = head_dir_path
        else:
            outpath = os.path.join(head_dir_path, 'log.txt')
            if cached and os.path.exists(outpath) and\
                    not key.endswith(('.json', '.log')):
                continue
        tgz_file_name = key.split('/')[-1]
        tgz_file = os.path.join(head_dir_path, tgz_file_name)
        res = s3.get_object(Bucket='cwc-hms', Key=key)
        # byte_stream = BytesIO(res['Body'].read())
        byte_stream = res['Body'].read()
        with open(tgz_file, 'wb') as tf:
            tf.write(byte_stream)
        # Re-open file
        if tgz_file.endswith(('.json', '.log')):
            continue
        with open(tgz_file, 'rb') as file_byte_stream:
            with tarfile.open(None, 'r', fileobj=file_byte_stream) as tarf:
                if resource_name == 'bioagent_images':
                    tarf.extractall(outpath)
                else:
                    outpaths = tarf.getnames()
                    facls = [n for n in outpaths if
                             n.endswith('facilitator.log')]
                    if not facls:
                        logger.info('No facilitator.log found for %s' % key)
                        continue
                    facl = facls[0]
                    efo = tarf.extractfile(facl)
                    log_txt = efo.read().decode('utf-8')
                    with open(outpath, 'w') as fh:
                        fh.write(log_txt)
    return dir_set
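A typical invocation; the local folder name here is chosen only for illustration:

# Fetch the last two weeks of logs into ./bob_logs, reusing any
# log.txt files that were already extracted on a previous run.
log_dirs = get_logs_from_s3(folder='bob_logs', cached=True, past_days=14)
print('%d log directories available' % len(log_dirs))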