def main(cli_args=None):
    """Main function to implement command-line interface"""
    if cli_args is None:
        cli_args = sys.argv[1:]
    args = parse_args(cli_args)

    # Print defined parameters
    v = dict(vars(args))
    to_delete = [k for k in v if v[k] is None]
    for k in to_delete:
        v.pop(k, None)  # Remove undefined parameters
    v["version"] = pkg_resources.get_distribution("seaflowpy").version
    print "Defined parameters:"
    print json.dumps(v, indent=2)
    print ""

    # Find EVT files
    if args.evt_dir:
        files = evt.find_evt_files(args.evt_dir)
    elif args.s3:
        # Make sure configuration for S3 is ready to go
        config = conf.get_aws_config(s3_only=True)
        cloud = clouds.AWS(config.items("aws"))
        # Try to access S3 up front to set up AWS credentials before
        # launching child processes.
        try:
            files = cloud.get_files(args.cruise)
            files = evt.parse_file_list(files)  # Only keep EVT files
        except botocore.exceptions.NoCredentialsError:
            print "Please configure aws first:"
            print " $ conda install awscli"
            print " or"
            print " $ pip install awscli"
            print " then"
            print " $ aws configure"
            sys.exit(1)

    # Restrict length of file list with --limit
    if args.limit is not None and args.limit > 0:
        files = files[:args.limit]

    filter_keys = ["notch1", "notch2", "width", "offset", "origin"]
    filter_options = dict((k, getattr(args, k)) for k in filter_keys)

    # Filter
    if args.twopass:
        filterer = filterevt.two_pass_filter
    else:
        filterer = filterevt.filter_evt_files
    filterer(files, args.cruise, filter_options, args.db, args.opp_dir,
             s3=args.s3, process_count=args.process_count,
             every=args.resolution)

    # Index
    if args.db:
        db.ensure_indexes(args.db)
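
# parse_args is called above but not defined in this section. The sketch
# below is a minimal illustration of what it might look like: the option
# strings, types, and defaults are assumptions; only the destination names
# (evt_dir, s3, cruise, limit, twopass, db, opp_dir, process_count,
# resolution, notch1, notch2, width, offset, origin) come from the code
# above.
def parse_args_sketch(args):
    import argparse
    p = argparse.ArgumentParser(description="Filter EVT files (sketch)")
    p.add_argument("--evt_dir", help="Local directory of EVT files")
    p.add_argument("--s3", action="store_true", help="Read EVT data from S3")
    p.add_argument("--cruise", required=True, help="Cruise name")
    p.add_argument("--limit", type=int, help="Only process the first N files")
    p.add_argument("--twopass", action="store_true",
                   help="Use two-pass filtering")
    p.add_argument("--db", help="SQLite3 database path")
    p.add_argument("--opp_dir", help="Output directory for binary OPP files")
    p.add_argument("--process_count", type=int, default=1,
                   help="Number of worker processes")
    p.add_argument("--resolution", type=float, default=10.0,
                   help="Percent progress output resolution")
    for opt in ("notch1", "notch2", "width", "offset", "origin"):
        p.add_argument("--" + opt, type=float, help="Filter parameter")
    return p.parse_args(args)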
def filter_evt_files(files, cruise, filter_options, dbpath, opp_dir, s3=False,
                     process_count=1, every=10.0):
    """Filter a list of EVT files.

    Arguments:
        files - paths to EVT files to filter
        cruise - cruise name
        filter_options - Dictionary of filter params
            (notch1, notch2, width, offset, origin)
        dbpath - SQLite3 db path
        opp_dir - Directory for output binary OPP files

    Keyword arguments:
        s3 - Get EVT data from S3
        process_count - number of worker processes to use
        every - Percent progress output resolution
    """
    o = {
        "file": None,  # fill in later
        "cruise": cruise,
        "process_count": process_count,
        "filter_options": filter_options,
        "every": every,
        "s3": s3,
        "cloud_config_items": None,
        "dbpath": dbpath,
        "opp_dir": opp_dir,
        "filter_id": None  # fill in later
    }

    if dbpath:
        dbdir = os.path.dirname(dbpath)
        if dbdir and not os.path.isdir(dbdir):
            util.mkdir_p(dbdir)
        db.ensure_tables(dbpath)
        o["filter_id"] = db.save_filter_params(dbpath, filter_options)

    if s3:
        config = conf.get_aws_config(s3_only=True)
        o["cloud_config_items"] = config.items("aws")

    if process_count > 1:
        # Create a pool of N worker processes
        pool = Pool(process_count)

        def mapper(worker, task_list):
            return pool.imap_unordered(worker, task_list)
    else:
        def mapper(worker, task_list):
            return imap(worker, task_list)

    evt_count = 0
    evt_signal_count = 0
    opp_count = 0
    files_ok = 0

    # Construct worker inputs, one options dict per file
    inputs = []
    for f in files:
        inputs.append(copy.copy(o))
        inputs[-1]["file"] = f

    print ""
    print "Filtering %i EVT files. Progress every %i%% (approximately)" % \
        (len(files), every)

    t0 = time.time()

    last = 0  # Last progress milestone in increments of every
    evt_count_block = 0  # EVT particles in this block (between milestones)
    evt_signal_count_block = 0  # EVT noise filtered particles in this block
    opp_count_block = 0  # OPP particles in this block

    # Filter particles in parallel with process pool
    for i, res in enumerate(mapper(do_work, inputs)):
        evt_count_block += res["evt_count"]
        evt_signal_count_block += res["evt_signal_count"]
        opp_count_block += res["opp_count"]
        files_ok += 1 if res["ok"] else 0

        # Print progress periodically
        perc = float(i + 1) / len(files) * 100  # Percent completed
        # Round down to closest every% (e.g. with every=10, perc 23.7 -> 20)
        milestone = int(perc / every) * every
        if milestone > last:
            now = time.time()
            evt_count += evt_count_block
            evt_signal_count += evt_signal_count_block
            opp_count += opp_count_block
            ratio_signal_block = zerodiv(opp_count_block,
                                         evt_signal_count_block)
            ratio_block = zerodiv(opp_count_block, evt_count_block)
            msg = "File: %i/%i (%.02f%%)" % (i + 1, len(files), perc)
            msg += " Particles this block: %i / %i (%i) %.04f (%.04f) elapsed: %.2fs" % \
                (opp_count_block, evt_signal_count_block, evt_count_block,
                 ratio_signal_block, ratio_block, now - t0)
            print msg
            sys.stdout.flush()
            last = milestone
            evt_count_block = 0
            evt_signal_count_block = 0
            opp_count_block = 0

    # If any particle count data is left, add it to totals
    if evt_count_block > 0:
        evt_count += evt_count_block
        evt_signal_count += evt_signal_count_block
        opp_count += opp_count_block

    opp_evt_signal_ratio = zerodiv(opp_count, evt_signal_count)
    opp_evt_ratio = zerodiv(opp_count, evt_count)

    t1 = time.time()
    delta = t1 - t0
    evtrate = zerodiv(evt_count, delta)
    evtsignalrate = zerodiv(evt_signal_count, delta)
    opprate = zerodiv(opp_count, delta)

    print ""
    print "Input EVT files = %i" % len(files)
    print "Parsed EVT files = %i" % files_ok
    print "EVT particles = %s (%.2f p/s)" % (evt_count, evtrate)
    print "EVT noise filtered particles = %s (%.2f p/s)" % \
        (evt_signal_count, evtsignalrate)
    print "OPP particles = %s (%.2f p/s)" % (opp_count, opprate)
    print "OPP/EVT ratio = %.04f (%.04f)" % (opp_evt_signal_ratio,
                                             opp_evt_ratio)
    print "Filtering completed in %.2f seconds" % (delta,)
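
# zerodiv and do_work are used above but not defined in this section. Two
# minimal sketches follow. zerodiv is assumed to be a zero-safe division
# helper that returns 0.0 instead of raising ZeroDivisionError:
def zerodiv_sketch(x, y):
    """Divide x by y, returning 0.0 when y is 0."""
    try:
        return float(x) / y
    except ZeroDivisionError:
        return 0.0


# do_work is the per-file worker run by the process pool. From how its
# results are consumed above, each call must return a dict with at least the
# keys below. This skeleton only documents that contract; the actual
# filtering is elided:
def do_work_sketch(options):
    """Filter one EVT file described by the options dict; return counts."""
    # ... open options["file"], apply options["filter_options"],
    #     write OPP output to options["opp_dir"] ...
    return {
        "ok": True,             # parse/filter succeeded
        "evt_count": 0,         # all EVT particles in the file
        "evt_signal_count": 0,  # EVT particles left after noise filtering
        "opp_count": 0,         # focused (OPP) particles
    }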
def main(cli_args=None):
    """Main function to implement command-line interface"""
    if cli_args is None:
        cli_args = sys.argv[1:]
    args = parse_args(cli_args)

    print "Started at {}".format(datetime.datetime.utcnow().isoformat())

    # Print defined parameters
    v = dict(vars(args))
    to_delete = [k for k in v if v[k] is None]
    for k in to_delete:
        v.pop(k, None)  # Remove undefined parameters
    v["version"] = pkg_resources.get_distribution("seaflowpy").version
    print "Defined parameters:"
    print json.dumps(v, indent=2)
    print ""

    # Make sure configuration for aws and ssh is ready to go
    config = conf.get_aws_config()
    conf.get_ssh_config(config)
    cloud = clouds.AWS(config.items("aws"))

    # Configure fabric
    env.connection_attempts = 6
    # Tell fabric the SSH user name and key file location
    env.user = config.get("ssh", "user")
    env.key_filename = os.path.expanduser(config.get("ssh", "ssh-key-file"))

    try:
        print "Getting lists of files for each cruise"
        cruise_files = {}
        # Handle case where cruises are listed in a file
        if len(args.cruises) == 1 and os.path.isfile(args.cruises[0]):
            with open(args.cruises[0]) as fh:
                args.cruises = fh.read().split()
        try:
            for c in args.cruises:
                cruise_files[c] = cloud.get_files(c)
                print "{:<20} {}".format(c, len(cruise_files[c]))
            print ""
        except botocore.exceptions.NoCredentialsError:
            print "Please configure aws first:"
            print " $ conda install awscli"
            print " or"
            print " $ pip install awscli"
            print " then"
            print " $ aws configure"
            sys.exit(1)

        if args.dryrun:
            # Create dummy host list
            print "Creating {} dummy hosts".format(args.instance_count)
            env.hosts = ["dummy{}".format(i)
                         for i in range(args.instance_count)]
        else:
            print "Starting {} instances".format(args.instance_count)
            result = cloud.start(
                count=args.instance_count,
                instance_type=args.instance_type
            )
            for iid, iip in zip(result["InstanceIds"], result["publicips"]):
                print " InstanceId = {}, IP = {}".format(iid, iip)
            env.hosts.extend(result["publicips"])
        print ""

        # Fairly divide cruises among hosts based on number of files
        print "Assigning cruises to {} hosts".format(len(env.hosts))
        host_assignments = assign_keys_to_hosts(env.hosts, cruise_files)
        for h in host_assignments:
            htotal = sum([c[1] for c in host_assignments[h]])
            print "{:<20} {}".format(h, htotal)
            for c in host_assignments[h]:
                print " {:<18} {}".format(c[0], c[1])
        print ""

        if args.dryrun:
            print "Dry run complete"
            print ""
            return

        print "Waiting for hosts to come up with SSH"
        execute(wait_for_up)

        print "Transfer AWS credentials"
        with hide("output"):
            execute(rsync_put, "~/.aws/", ".aws")

        print "Transfer seaflowpy configuration"
        with hide("output"):
            execute(rsync_put, "~/.seaflowpy/", ".seaflowpy")

        print "Install seaflowpy"
        execute(pull_seaflowpy)

        # Host list in env.hosts should be populated now and all machines up
        print "Filter data"
        execute(filter_cruise, host_assignments, args.output_dir,
                args.process_count)
    finally:
        disconnect_all()  # always disconnect SSH connections
        if not args.nocleanup:
            cloud.cleanup()  # clean up in case of any unhandled exceptions

    print "Finished at {}".format(datetime.datetime.utcnow().isoformat())
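
# assign_keys_to_hosts is called above but not defined in this section. From
# the comment ("Fairly divide cruises among hosts based on number of files")
# and how the result is consumed, it must return {host: [(cruise, count),
# ...]}. A plausible greedy sketch, not the package's actual implementation:
# take cruises largest-first and always hand the next one to the currently
# least-loaded host.
def assign_keys_to_hosts_sketch(hosts, cruise_files):
    """Balance cruises across hosts by total file count (greedy sketch)."""
    assignments = dict((h, []) for h in hosts)
    loads = dict((h, 0) for h in hosts)
    by_size = sorted(cruise_files.items(), key=lambda kv: len(kv[1]),
                     reverse=True)
    for cruise, files in by_size:
        host = min(loads, key=loads.get)  # least-loaded host so far
        assignments[host].append((cruise, len(files)))
        loads[host] += len(files)
    return assignments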