import cloud          # picloud client library (provides cloud.map / cloud.iresult)
import progressbar

import cloud_helpers  # project-local S3 helpers used throughout these functions


def run(bucket,
        key_pattern = None,
        map_hdf = None,
        map_dataframe = None,
        init = None,
        combine = None,
        post_process = None,
        accept_none_as_result = True,
        label = None,
        _type = 'f2'):
    """
    Runs query functions over collections of HDF files on S3.

    You *must* provide:
      - either the bucket and key_pattern (e.g. "capk-fxcm" and "*2012*"),
        or a concatenation of the two separated by a slash ("capk-fxcm/*2012*")
      - either map_hdf (a function which takes HDF objects and returns a result)
        or map_dataframe (which takes pandas.DataFrames and returns a result)

    If you are running a query which returns a result, you must also provide:
      - init : the initial value of the computation
      - combine : a function which takes the accumulated value so far and the
        result of a single mapper and combines them into a new accumulated result

    Additional arguments:
      - accept_none_as_result : don't warn when mappers return None
      - label : label shown on picloud's job manager
      - _type : which picloud instance type to use
        (one of 'f2', 'm1', 'c1', 'c2')
    """
    if key_pattern is None:
        bucket, key_pattern = cloud_helpers.parse_bucket_and_pattern(bucket)
    if len(key_pattern) == 0:
        key_pattern = '*'
    key_names = cloud_helpers.get_matching_key_names(bucket, key_pattern)

    if combine:
        assert init is not None

    if init and hasattr(init, '__call__'):
        acc = init()
    else:
        # if we can't call init, assume it's either None or an initial value
        acc = init

    if map_hdf is not None:
        assert map_dataframe is None
        do_work = wrap_hdf_mapper(map_hdf)
    else:
        assert map_dataframe is not None
        assert map_hdf is None
        do_work = wrap_dataframe_mapper(map_dataframe)

    if label is None:
        label = "Mapping %s over %s/%s" % \
            (map_hdf if map_hdf else map_dataframe, bucket, key_pattern)

    acc = launch_jobs(bucket, key_names, do_work, combine, acc,
                      label, _type, accept_none_as_result)

    if post_process:
        return post_process(acc)
    else:
        return acc
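# Example usage: a minimal sketch of driving `run` end-to-end. `count_rows`
# and `example_total_row_count` are hypothetical names written for this
# illustration; the bucket/pattern string reuses the "capk-fxcm/*2012*"
# example from the docstring above.
def count_rows(df):
    # each mapper receives one pandas.DataFrame and returns a per-file result
    return len(df)

def example_total_row_count():
    # sum row counts across all matching HDF files; `combine` folds each
    # mapper's result into the running total, starting from init = 0
    return run("capk-fxcm/*2012*",
               map_dataframe = count_rows,
               init = 0,
               combine = lambda acc, n: acc + n)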
def collect_keys_and_launch(training_pattern,
                            testing_pattern,
                            start_hour = 3,
                            end_hour = 7,
                            num_features = 1,
                            future_offset = 600,
                            profile = False):
    # if files were local the buckets would be None; for now we require both
    # training and testing HDFs to live in the same S3 bucket
    if training_pattern.startswith('s3'):
        training_bucket, training_pattern = \
            cloud_helpers.parse_bucket_and_pattern(training_pattern)
        if len(training_pattern) == 0:
            training_pattern = '*'
        training_names = cloud_helpers.get_matching_key_names(
            training_bucket, training_pattern)
    else:
        assert False, \
            "Local training files not supported, expected an S3 path: %s" % \
            training_pattern

    if testing_pattern.startswith('s3'):
        testing_bucket, testing_pattern = \
            cloud_helpers.parse_bucket_and_pattern(testing_pattern)
        if len(testing_pattern) == 0:
            testing_pattern = '*'
        testing_names = cloud_helpers.get_matching_key_names(
            testing_bucket, testing_pattern)
    else:
        assert False, \
            "Local testing files not supported, expected an S3 path: %s" % \
            testing_pattern

    assert len(training_names) > 0
    assert len(testing_names) > 0
    assert training_bucket == testing_bucket, \
        "Expected training bucket %s to be same as testing bucket %s" % \
        (training_bucket, testing_bucket)

    return launch_jobs(training_bucket, training_names, testing_names,
                       raw_features = None,
                       start_hour = start_hour,
                       end_hour = end_hour,
                       num_features = num_features,
                       future_offset = future_offset,
                       profile = profile)
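# Example usage: a sketch only. It assumes parse_bucket_and_pattern can strip
# an "s3://" prefix (the code above merely checks startswith('s3')); the
# bucket name, year globs, and `example_train_on_2011_test_on_2012` name are
# placeholders for this illustration.
def example_train_on_2011_test_on_2012():
    return collect_keys_and_launch("s3://capk-fxcm/*2011*",
                                   "s3://capk-fxcm/*2012*",
                                   start_hour = 3,
                                   end_hour = 7,
                                   num_features = 1,
                                   future_offset = 600)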
def process_s3_files(input_bucket_name,
                     key_glob = '*',
                     output_bucket_name_1ms = None,
                     output_bucket_name_100ms = None,
                     overwrite = False,
                     use_cloud = True):
    if output_bucket_name_1ms is None:
        output_bucket_name_1ms = input_bucket_name + "-hdf-1ms"
    if output_bucket_name_100ms is None:
        output_bucket_name_100ms = input_bucket_name + "-hdf"
    matching_keys = cloud_helpers.get_matching_key_names(input_bucket_name, key_glob)
    s3_cxn = cloud_helpers.get_s3_cxn()
    # create output buckets if they don't already exist;
    # it's better to do this before launching remote computations
    s3_cxn.create_bucket(output_bucket_name_1ms)
    s3_cxn.create_bucket(output_bucket_name_100ms)

    if use_cloud:
        print "Launching %d jobs" % len(matching_keys)

        def do_work(key_name):
            return process_s3_file(input_bucket_name,
                                   key_name,
                                   output_bucket_name_1ms,
                                   output_bucket_name_100ms,
                                   overwrite)

        label = "Generate HDF files for %s/%s" % (input_bucket_name, key_glob)
        jids = cloud.map(do_work, matching_keys, _type = 'f2', _label = label)
        progress = progressbar.ProgressBar(len(jids)).start()
        n_finished = 0
        for _ in cloud.iresult(jids):
            n_finished += 1
            progress.update(n_finished)
        progress.finish()
    else:
        print "Running locally..."
        print "%d keys match the pattern '%s'" % (len(matching_keys), key_glob)
        for key in matching_keys:
            # pass overwrite here too, so local runs honor the flag
            process_s3_file(input_bucket_name, key,
                            output_bucket_name_1ms, output_bucket_name_100ms,
                            overwrite)
    print "Done!"
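# Example usage: a sketch; "capk-fxcm" stands in for a real input bucket of
# raw tick files, and `example_generate_hdf_buckets` is a name invented for
# this illustration. With the defaults above, output lands in buckets named
# "capk-fxcm-hdf-1ms" and "capk-fxcm-hdf".
def example_generate_hdf_buckets():
    # fan the conversion out over picloud, one job per matching key
    process_s3_files("capk-fxcm", key_glob = "*2012*", use_cloud = True)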