Beispiel #1
0
def manifest_cmd(verbose, sfl_file, evt_dir):
    """
    Compares files in SFL-FILE with files in EVT-DIR.

    If EVT-DIR begins with 's3://s3-bucket-name' then files will be located in
    S3. To configure credentials for S3 access use the 'aws' command-line tool
    from the 'awscli' Python package.

    It's normal for about one file per day to be missing from the SFL file or
    EVT day of year folder, especially around midnight.

    To read from STDIN use '-' for SFL_FILE. Prints a file list diff to STDOUT.
    """
    if evt_dir.startswith("s3://"):
        # "s3://bucket/folder" splits into ["s3:", "", "bucket", "folder"].
        # Fewer than four parts means no folder was given after the bucket.
        try:
            _, _, bucket, evt_dir = evt_dir.split("/", 3)
        except ValueError as e:
            raise click.ClickException("could not parse bucket and folder from S3 EVT-DIR") from e
        cloud = clouds.AWS([("s3-bucket", bucket)])
        try:
            files = cloud.get_files(evt_dir)
        except botocore.exceptions.NoCredentialsError:
            print('Please configure aws first:', file=sys.stderr)
            print('  $ pip install awscli', file=sys.stderr)
            print('  then', file=sys.stderr)
            print('  $ aws configure', file=sys.stderr)
            raise click.Abort()
        found_evt_files = seaflowfile.sorted_files(seaflowfile.keep_evt_files(files))
    else:
        found_evt_files = seaflowfile.find_evt_files(evt_dir)

    df = sfl.read_file(sfl_file)
    # SFL entries are compared by file_id, found files by path_file_id.
    sfl_set = {seaflowfile.SeaFlowFile(f).file_id for f in df['file']}
    found_set = {seaflowfile.SeaFlowFile(f).path_file_id for f in found_evt_files}
    common = sfl_set & found_set

    print('%d EVT files in SFL file %s' % (len(sfl_set), sfl_file.name))
    # For S3 input evt_dir was rebound above, so this prints the folder part
    # only (without the "s3://bucket/" prefix).
    print('%d EVT files in directory %s' % (len(found_set), evt_dir))
    print('%d EVT files in common' % len(common))
    # The detailed listing is only useful when the two sets actually differ.
    if verbose and sfl_set != found_set:
        print('')
        print('EVT files in SFL but not found:')
        print(os.linesep.join(sorted(sfl_set - found_set)))
        print('')
        print('EVT files found but not in SFL:')
        print(os.linesep.join(sorted(found_set - sfl_set)))
        print('')
Beispiel #2
0
def validate_evt_cmd(no_header, no_summary, verbose, files):
    """
    Examines EVT/OPP files.

    Directories among the file arguments are searched recursively for EVT/OPP
    files, and every file found is examined. Without verbose output only files
    that fail validation are listed. Prints to STDOUT.
    """
    if not files:
        return

    passed = failed = 0
    # Header is emitted lazily, right before the first row that gets printed
    need_header = not no_header

    # Directory arguments are expanded to individual file paths
    for path in expand_file_list(files):
        # Placeholders reported when the file can't be parsed or read
        kind, fid, nrows = '-', '-', 0

        try:
            sff = seaflowfile.SeaFlowFile(path)
            fid = sff.file_id
            if sff.is_opp:
                kind = 'opp'
                frame = fileio.read_opp_labview(path)
            else:
                kind = 'evt'
                frame = fileio.read_evt_labview(path)
            nrows = len(frame.index)
        except errors.FileError as e:
            status = str(e)
            failed += 1
        else:
            status = "OK"
            passed += 1

        # Quiet mode lists failures only
        if not verbose and status == 'OK':
            continue

        if need_header:
            print('\t'.join(['path', 'file_id', 'type', 'status', 'events']))
            need_header = False
        print('\t'.join([path, fid, kind, status, str(nrows)]))

    if not no_summary:
        print("%d/%d files passed validation" % (passed, failed + passed))
Beispiel #3
0
def count_evt_cmd(no_header, evt_files):
    """
    Reports event counts in EVT files.

    For speed, only a portion at the beginning of the file is read to get the
    event count. If any of EVT-FILES are directories all EVT/OPP files within
    those directories will be recursively found and examined. Files which can't
    be read with a valid EVT/OPP file name and file header will be reported
    with a count of 0.

    Unlike the "evt validate" command, this command does not attempt validation
    of the EVT/OPP file beyond reading the first 4 byte row count header.
    Because of this, there may be files where "evt validate" reports 0 rows
    while this tool reports > 0 rows.

    Outputs tab-delimited text to STDOUT.
    """
    if not evt_files:
        return

    # Directory arguments are expanded to individual file paths
    paths = expand_file_list(evt_files)

    # Paths whose names are recognized as either EVT or OPP files
    recognized = set(seaflowfile.keep_evt_files(paths))
    recognized.update(seaflowfile.keep_evt_files(paths, opp=True))

    # Header is emitted lazily, right before the first row
    need_header = not no_header

    for path in paths:
        # Placeholders reported for unrecognized or unreadable files
        kind, fid, nrows = '-', '-', 0

        if path in recognized:
            sff = seaflowfile.SeaFlowFile(path)
            fid = sff.file_id
            try:
                kind = 'opp' if sff.is_opp else 'evt'
                nrows = fileio.read_labview_row_count(path)
            except errors.FileError:
                # Unreadable header: keep the default count of 0
                pass

        if need_header:
            print('\t'.join(['path', 'file_id', 'type', 'events']))
            need_header = False
        print('\t'.join([path, fid, kind, str(nrows)]))
def dayofyear_cmd(verbose, files):
    """
    Gets calculated day of year dir from filename timestamp.

    File paths must be new-style datestamped paths. Only the filename part of
    each path is used; the rest of the path is ignored. The filename may
    include a '.gz' extension. Outputs to STDOUT.
    """
    rows = []
    for path in files:
        try:
            parsed = seaflowfile.SeaFlowFile(path)
        except errors.FileError as e:
            # Report the unparseable path on STDERR and move on to the next one
            click.echo("%s %s" % (path, e), err=True)
            continue
        if verbose:
            fields = [path, parsed.filename, parsed.dayofyear]
        else:
            fields = [parsed.dayofyear]
        rows.append(fields)
    # Emit nothing at all (not even a blank line) when no path could be parsed
    if rows:
        click.echo("\n".join("\t".join(fields) for fields in rows))
Beispiel #5
0
def sfl_fix_event_rate_cmd(sfl_file, events_file):
    """
    Calculates true event rates.

    EVENTS-FILE should be a TSV file with EVT path/file ID in first
    column and event count in last column, or a popcycle SQLite3 database file
    with a '.db' extension. A version of SFL_FILE with updated event rates will
    be printed to STDOUT. In cases where the file duration value is < 0 or NA
    the event rate will be NA.
    """
    df = sfl.read_file(sfl_file)
    df = sfl.fix(df)

    # events_file may arrive either as a path string or as an already opened
    # file object. Normalize to a (path, handle) pair so that both the '.db'
    # extension check and the TSV read work for either kind of argument.
    if isinstance(events_file, str):
        events_path, events_fh = events_file, None
    else:
        events_path, events_fh = getattr(events_file, "name", ""), events_file

    # Event counts should be a dict of { file: event_count }
    if isinstance(events_path, str) and events_path.endswith(".db"):
        event_counts = db.get_event_counts(events_path)
    else:
        if events_fh is None:
            with open(events_path, encoding="utf-8") as fh:
                raw_lines = fh.readlines()
        else:
            raw_lines = events_fh.readlines()
        # Blank lines (e.g. a trailing newline at end of file) carry no data
        rows = [x.rstrip().split('\t') for x in raw_lines if x.strip()]
        event_counts = {seaflowfile.SeaFlowFile(x[0]).file_id: int(x[-1]) for x in rows}

    df = sfl.fix_event_rate(df, event_counts)
    sfl.save_to_file(df, sys.stdout)
Beispiel #6
0
def local_filter_evt_cmd(evt_dir, s3_flag, dbpath, limit, opp_dir,
                         process_count, resolution):
    """Filter EVT data locally.

    EVT files are taken from --evt_dir when given, otherwise from S3 (--s3)
    using the cruise name stored in the database. Only EVT files that are also
    listed in the database SFL table are filtered.
    """
    # Validate args
    if not evt_dir and not s3_flag:
        raise click.UsageError('One of --evt_dir or --s3 must be provided')

    # Find cruise in db
    try:
        cruise = db.get_cruise(dbpath)
    except errors.SeaFlowpyError as e:
        raise click.ClickException(str(e)) from e

    # Find filter parameters in db. Won't use them yet but better to check
    # upfront
    try:
        _filter_params = db.get_latest_filter(dbpath)
    except errors.SeaFlowpyError as e:
        raise click.ClickException(str(e)) from e

    # Capture run parameters and information
    v = {
        'evt_dir': evt_dir,
        's3': s3_flag,
        'limit': limit,
        'db': dbpath,
        'opp_dir': opp_dir,
        'process_count': process_count,
        'resolution': resolution,
        'version': pkg_resources.get_distribution("seaflowpy").version,
        'cruise': cruise
    }
    # Remove undefined parameters
    v = {k: val for k, val in v.items() if val is not None}

    # Print run parameters
    print('Run parameters and information:')
    print(json.dumps(v, indent=2))
    print('')

    # Get list of files in sfl table.
    try:
        sfl_df = db.get_sfl_table(dbpath)
    except errors.SeaFlowpyError as e:
        raise click.ClickException(str(e)) from e
    sfl_files = sfl_df["file"].tolist()

    # Find EVT files
    print('Getting lists of files to filter')
    if evt_dir:
        evt_files = seaflowfile.sorted_files(
            seaflowfile.find_evt_files(evt_dir))
    elif s3_flag:
        # Make sure configuration for s3 is ready to go
        config = conf.get_aws_config(s3_only=True)
        cloud = clouds.AWS(config.items("aws"))
        # Make sure try to access S3 up front to setup AWS credentials before
        # launching child processes.
        try:
            evt_files = cloud.get_files(cruise)
            evt_files = seaflowfile.keep_evt_files(
                evt_files)  # Only keep EVT files
        except botocore.exceptions.NoCredentialsError:
            # The 'aws configure' tool is provided by the 'awscli' package,
            # which is the name the other commands in this file point to.
            print('Please configure aws first:', file=sys.stderr)
            print('  $ conda install awscli', file=sys.stderr)
            print('  or', file=sys.stderr)
            print('  $ pip install awscli', file=sys.stderr)
            print('  then', file=sys.stderr)
            print('  $ aws configure', file=sys.stderr)
            raise click.Abort()

    # Check for duplicates, exit with message if any exist
    uniques = {seaflowfile.SeaFlowFile(f).file_id for f in evt_files}
    if len(uniques) < len(evt_files):
        raise click.ClickException('Duplicate EVT file(s) detected')

    # Find intersection of SFL files and EVT files
    files = seaflowfile.filtered_file_list(evt_files, sfl_files)
    print('sfl={} evt={} intersection={}'.format(len(sfl_files),
                                                 len(evt_files), len(files)))

    # Restrict length of file list with --limit. A limit of None or <= 0
    # means "no limit".
    if (limit is not None) and (limit > 0):
        files = files[:limit]

    # Filter
    try:
        filterevt.filter_evt_files(files,
                                   dbpath,
                                   opp_dir,
                                   s3=s3_flag,
                                   worker_count=process_count,
                                   every=resolution)
    except errors.SeaFlowpyError as e:
        raise click.ClickException(str(e)) from e
Beispiel #7
0
def remote_filter_evt_cmd(dryrun, executable, instance_count, no_cleanup,
                          output_dir, process_count, ramdisk_size,
                          instance_type, dbs):
    """Filter EVT data on remote servers.

    SQLite3 db files must contain filter parameters and cruise name
    """
    # Overall flow:
    #   1. Print run parameters, load AWS/SSH configuration.
    #   2. For every db file, list the cruise's EVT files in cloud storage and
    #      intersect them with the db's SFL table.
    #   3. Start instances (or create dummy hosts for a dry run) and assign
    #      cruises to hosts.
    #   4. Unless this is a dry run, provision each host and run the filter.
    #   5. Always disconnect, optionally clean up cloud resources, and delete
    #      a downloaded executable.
    #
    # Returns 1 when the SFL table can't be read from a db file, 0 otherwise,
    # including when an exception was caught and printed by the broad handler
    # near the end of the function.
    print("Started at {}{}".format(datetime.datetime.utcnow().isoformat(),
                                   os.linesep))

    # Print defined parameters and information
    v = {
        'dbs': dbs,
        'executable': executable,
        'output_dir': output_dir,
        'dryrun': dryrun,
        'instance_count': instance_count,
        'no_cleanup': no_cleanup,
        'process_count': process_count,
        'instance_type': instance_type,
        'ramdisk_size': ramdisk_size,
        'version': pkg_resources.get_distribution("seaflowpy").version
    }
    to_delete = [k for k in v if v[k] is None]
    for k in to_delete:
        v.pop(k, None)  # Remove undefined parameters
    print('Run parameters and information:')
    print(json.dumps(v, indent=2))
    print('')

    # Make sure configuration for aws and ssh is ready to go
    config = conf.get_aws_config()
    conf.get_ssh_config(config)
    cloud = clouds.AWS(config.items('aws'))

    # If local executable is not given download latest from github
    remove_executable = False
    if not executable:
        remove_executable = True  # mark this file for deletion at exit
        executable = download_latest_linux()

    # Configure fabric
    env.connection_attempts = 6
    # Tell fabric the SSH user name and key file location
    env.user = config.get('ssh', 'ssh-user')
    env.key_filename = os.path.expanduser(
        config.get('ssh', 'ssh-private-key-file'))

    try:
        # Nothing at all happens inside the try block when no db files are
        # given; only the finally clause below runs.
        if len(dbs) > 0:
            print('Getting lists of files for each cruise')
            # Maps cruise name -> list of EVT files that are also in the SFL
            # table of that cruise's db.
            cruise_files = {}
            for dbfile in dbs:
                # Make sure file exists
                if not os.path.exists(dbfile):
                    raise click.ClickException(
                        'DB file {} does not exist'.format(dbfile))
                # Make sure db has filter parameters filled in
                try:
                    filter_table = db.get_latest_filter(dbfile)
                except errors.SeaFlowpyError as e:
                    raise click.ClickException(
                        'No filter parameters found in database file {}'.
                        format(dbfile))
                # Exactly 3 filter entries are required.
                # NOTE(review): the meaning of the 3 entries is not visible
                # here - confirm against db.get_latest_filter.
                if len(filter_table) != 3:
                    raise click.ClickException(
                        'Unusual filter parameters found in database file {}'.
                        format(dbfile))
                # Get cruise name DB
                try:
                    c = db.get_cruise(dbfile)
                except errors.SeaFlowpyError as e:
                    raise click.ClickException(
                        'Could not retrieve cruise name from DB. {}'.format(e))
                # The cruise name is what is passed to cloud.get_files() to
                # list this cruise's files.
                try:
                    evt_files = seaflowfile.sorted_files(
                        seaflowfile.keep_evt_files(cloud.get_files(c)))
                except botocore.exceptions.NoCredentialsError as e:
                    print('Please configure aws first:', file=sys.stderr)
                    print('  $ pip install awscli', file=sys.stderr)
                    print('  then', file=sys.stderr)
                    print('  $ aws configure', file=sys.stderr)
                    raise click.Abort()

                # Check for duplicates, exit with message if any exist
                uniques = {
                    seaflowfile.SeaFlowFile(f).file_id
                    for f in evt_files
                }
                if len(uniques) < len(evt_files):
                    raise click.ClickException(
                        'Duplicate EVT file(s) detected')

                # Filter cruise files by SFL entries
                try:
                    sfl_df = db.get_sfl_table(dbfile)
                except errors.SeaFlowpyError as e:
                    # Unlike the other db errors above this one returns an
                    # error code instead of raising. The finally clause still
                    # runs before the function returns.
                    print(
                        'Error retrieving SFL file list from DB: {}'.format(e))
                    return 1
                sfl_files = sfl_df["file"].tolist()
                # Find intersection of SFL files and EVT files
                cruise_files[c] = seaflowfile.filtered_file_list(
                    evt_files, sfl_files)
                print('{:<20} sfl={} evt={} intersection={}'.format(
                    c, len(sfl_files), len(evt_files), len(cruise_files[c])))
            print('')

            if dryrun:
                # Create dummy host list
                print('Creating {} dummy hosts'.format(instance_count))
                env.hosts = [
                    'dummy{}'.format(i) for i in range(instance_count)
                ]
            else:
                print('Starting {} instances'.format(instance_count))
                result = cloud.start(count=instance_count,
                                     instance_type=instance_type)
                for iid, iip in zip(result['InstanceIds'],
                                    result['publicips']):
                    print('  InstanceId = {}, IP = {}'.format(iid, iip))
                # Appends to whatever env.hosts already holds rather than
                # replacing it (the dry-run branch above replaces it).
                env.hosts.extend(result['publicips'])
            print('')

            # Fairly divide cruises into hosts based on number of files
            print('Assigning cruises to {} hosts'.format(len(env.hosts)))
            host_assignments = assign_keys_to_hosts(env.hosts, cruise_files)
            # Each assignment entry is indexed as (name, count): c[0] is
            # printed as the label and c[1] is summed as the per-host total.
            for h in host_assignments:
                htotal = sum([c[1] for c in host_assignments[h]])
                print('{:<20} {}'.format(h, htotal))
                for c in host_assignments[h]:
                    print('  {:<18} {}'.format(c[0], c[1]))
            print('')

            if dryrun:
                # A dry run stops after reporting the planned assignments; no
                # remote command is executed.
                print('Dry run complete')
                print('')
                return 0

            print('Waiting for hosts to come up with SSH')
            execute(wait_for_up)

            print('Creating initial ramdisk')
            with hide('output'):
                execute(create_ramdisk, ramdisk_size)

            print('Transfer AWS credentials')
            with hide('output'):
                execute(rsync_put, ['~/.aws/'], '.aws')

            print('Transfer seaflowpy configuration')
            with hide('output'):
                execute(rsync_put, ['~/.seaflowpy/'], '.seaflowpy')

            print('Transfer initial databases')
            execute(mkdir, REMOTE_DB_DIR)  # create db dir on each host
            with hide('output'):
                execute(rsync_put, dbs, REMOTE_DB_DIR)

            print('Install system dependencies')
            execute(install_system_dependencies)

            print('Upload seaflowpy executable')
            execute(upload_seaflowpy, executable)

            # Host list in env.hosts should be populated now and all machines up
            print('Filter data')
            execute(filter_cruise, host_assignments, output_dir, process_count)
    except Exception as e:
        # NOTE(review): this broad handler also catches any exception raised
        # on purpose inside the try block that derives from Exception
        # (presumably including the click exceptions above - confirm). Such
        # errors are printed to STDOUT and the function then returns 0.
        print(f'Error: {e}')
    finally:
        disconnect_all()  # always disconnect SSH connections
        if not no_cleanup:
            cloud.cleanup()  # clean up in case of any unhandled exceptions
        # Clean up seaflowpy executable we downloaded
        if remove_executable:
            try:
                os.remove(executable)
            except OSError as e:
                print(
                    'Error: could not delete temporary seaflowpy executable: {} - {}'
                    .format(executable, e.strerror),
                    file=sys.stderr)

        print('Finished at {}'.format(datetime.datetime.utcnow().isoformat()))

    # Reached on normal completion and also after the broad handler above has
    # printed an error.
    return 0