Example #1
def run(bucket, key_pattern = None, 
  map_hdf = None, map_dataframe = None,
  init = None, combine = None,  post_process = None, 
  accept_none_as_result = True, label = None, _type = 'f2'):
  
  """
  Runs query functions over collections of HDF files on S3. 
  You *must* provide: 
    - either the bucket and key_pattern (e.g. "capk-fxcm" and "*2012*"),
      or a concatenation of the two separated by a slash ("capk-fxcm/*2012*")
    - either map_hdf (a function which takes HDF objects and returns a result)
      or map_dataframe (which takes pandas.DataFrames and returns a result)
  If you are running a query which returns a result, you must provide:
    - init : the initial value of the computation
    - combine : a function which takes the value thus-far accumulated and 
      the result of a single mapper and combines them into a new accumulated
      result
  
  Additional arguments:
     - accept_none_as_result : don't warn when a mapper returns None
     - label : label shown on picloud's job manager
     - _type : which picloud instance type to use (one of 'f2', 'm1', 'c1', 'c2')
  """
     
  if key_pattern is None:
    bucket, key_pattern = cloud_helpers.parse_bucket_and_pattern(bucket)
  if len(key_pattern) == 0:
    key_pattern = '*'
  key_names = cloud_helpers.get_matching_key_names(bucket, key_pattern)

  if combine: 
    assert init is not None, "Expected an initial value when combine is given"

  if init and hasattr(init, '__call__'):
    acc = init()
  else:
    # if we can't call init assume it's either None or some value
    acc = init
  
  if map_hdf is not None:
    assert map_dataframe is None
    do_work = wrap_hdf_mapper(map_hdf)
  else:
    assert map_dataframe is not None
    do_work = wrap_dataframe_mapper(map_dataframe)
  
  if label is None:
    label = "Mapping %s over %s/%s" % \
      ( (map_hdf if map_hdf else map_dataframe), bucket, key_pattern)
  
  acc = launch_jobs(bucket, key_names, do_work, combine, acc,  
    label, _type, accept_none_as_result)    
 
  if post_process: return post_process(acc)
  else: return acc
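A minimal usage sketch (not from the source): the bucket/pattern string, the mapper, and the combine function below are illustrative, but they follow the calling convention described in the docstring.

def total_rows(df):
  # mapper: receives one pandas.DataFrame per HDF file, returns a per-file count
  return len(df)

row_count = run("capk-fxcm/*2012*",       # bucket and key pattern in one string
  map_dataframe = total_rows,
  init = 0,                               # starting value of the accumulator
  combine = lambda total, n: total + n,   # fold each mapper result into the total
  label = "count rows in 2012 HDFs")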
Example #2
def collect_keys_and_launch(training_pattern, testing_pattern, 
    start_hour = 3, end_hour = 7, num_features = 1, future_offset = 600,
    profile = False):
  # both patterns must point at S3 keys in the same bucket;
  # local files are not supported by the branches below
  
  if training_pattern.startswith('s3'):
    training_bucket, training_pattern = \
      cloud_helpers.parse_bucket_and_pattern(training_pattern)
    if len(training_pattern) == 0:
      training_pattern = '*'
    training_names = cloud_helpers.get_matching_key_names(training_bucket, training_pattern)
  else:
    assert False, "Local training files are not supported: %s" % training_pattern
  
  if testing_pattern.startswith('s3'):
    testing_bucket, testing_pattern = \
      cloud_helpers.parse_bucket_and_pattern(testing_pattern)
    if len(testing_pattern) == 0:
      testing_pattern = '*'
    testing_names = cloud_helpers.get_matching_key_names(testing_bucket, testing_pattern)
  else:
    assert False, "Local testing files are not supported: %s" % testing_pattern
  assert len(training_names) > 0
  assert len(testing_names) > 0
  assert training_bucket == testing_bucket, \
    "Expected training bucket %s to be same as testing bucket %s" % (training_bucket, testing_bucket)
 
  return launch_jobs(training_bucket, training_names, testing_names, 
    raw_features = None, 
    start_hour = start_hour, 
    end_hour = end_hour, 
    num_features = num_features, 
    future_offset = future_offset, 
    profile = profile)
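A hedged call sketch: the s3 URIs are illustrative and assume cloud_helpers.parse_bucket_and_pattern accepts this form; only S3 paths work here, since the local branches fail immediately.

result = collect_keys_and_launch(
  "s3://capk-fxcm/*2011*",   # training HDFs (illustrative bucket and pattern)
  "s3://capk-fxcm/*2012*",   # testing HDFs, which must live in the same bucket
  start_hour = 3,
  end_hour = 7,
  num_features = 1,
  future_offset = 600)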
Example #3
def process_s3_files(
    input_bucket_name,
    key_glob="*",
    output_bucket_name_1ms=None,
    output_bucket_name_100ms=None,
    overwrite=False,
    use_cloud=True,
):

    if output_bucket_name_1ms is None:
        output_bucket_name_1ms = input_bucket_name + "-hdf-1ms"

    if output_bucket_name_100ms is None:
        output_bucket_name_100ms = input_bucket_name + "-hdf"

    matching_keys = cloud_helpers.get_matching_key_names(input_bucket_name, key_glob)

    s3_cxn = cloud_helpers.get_s3_cxn()
    # create output buckets if they don't already exist
    # it's better to do this before launching remote computations
    s3_cxn.create_bucket(output_bucket_name_1ms)
    s3_cxn.create_bucket(output_bucket_name_100ms)

    if use_cloud:
        print "Launching %d jobs" % len(matching_keys)

        def do_work(key_name):
            return process_s3_file(
                input_bucket_name, key_name, output_bucket_name_1ms, output_bucket_name_100ms, overwrite
            )

        label = "Generate HDF files for %s/%s" % (input_bucket_name, key_glob)
        jids = cloud.map(do_work, matching_keys, _type="f2", _label=label)

        progress = progressbar.ProgressBar(len(jids)).start()
        n_finished = 0
        for _ in cloud.iresult(jids):
            n_finished += 1
            progress.update(n_finished)
        progress.finish()
    else:
        print "Running locally..."
        print "%d keys match the pattern '%s'" % (len(matching_keys), key_glob)
        for key in matching_keys:
            process_s3_file(input_bucket_name, key, output_bucket_name_1ms, output_bucket_name_100ms, overwrite)
    print "Done!"