Example 1
def run(bucket, key_pattern = None, 
  map_hdf = None, map_dataframe = None,
  init = None, combine = None,  post_process = None, 
  accept_none_as_result = True, label = None, _type = 'f2'):
  
  """
  Runs query functions over collections of HDF files on S3. 
  You *must* provide: 
    - either the bucket and key_pattern (ie: "capk-fxcm" and "*2012*")
      or, a concatenation of the two separated by a slash ("capk-fxcm/*2012*")
    - either map_hdf (a function which takes HDF objects and returns a result)
      or map_dataframe (which takes pandas.DataFrames and returns a result)
  If you are running a query which returns a result you must provide:
    - init : the initial value of the computation
    - combine : a function which takes the value thus-far accumulated and 
      the result of a single mapper and combines them into a new accumulated
      result
  
  Additional arguments:
     - accept_none_as_result : Don't warn me if my mappers are returning None
     - label : label shown on picloud's job manager
     - _type : which picloud instance type to use (one of 'f2', 'm1', 'c1', 'c2')
  """
     
  if key_pattern is None:
    bucket, key_pattern = cloud_helpers.parse_bucket_and_pattern(bucket)
  if len(key_pattern) == 0:
    key_pattern = '*'
  key_names = cloud_helpers.get_matching_key_names(bucket, key_pattern)

  if combine:
    assert init is not None, "Expected an initial value when combine is given"

  if callable(init):
    # init is a factory: call it to build the initial accumulator
    acc = init()
  else:
    # otherwise init is either None or the initial value itself
    acc = init
  
  if map_hdf is not None:
    assert map_dataframe is None, "Expected only one of map_hdf or map_dataframe"
    do_work = wrap_hdf_mapper(map_hdf)
  else:
    assert map_dataframe is not None, "Expected either map_hdf or map_dataframe"
    do_work = wrap_dataframe_mapper(map_dataframe)
  
  if label is None:
    label = "Mapping %s over %s/%s" % \
      ( (map_hdf if map_hdf else map_dataframe), bucket, key_pattern)
  
  acc = launch_jobs(bucket, key_names, do_work, combine, acc,  
    label, _type, accept_none_as_result)    
 
  if post_process:
    return post_process(acc)
  else:
    return acc
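
A minimal usage sketch (not part of the original code): it assumes run and its helpers are importable from this module, and reuses the bucket/pattern shown in the docstring; count_rows and the lambda combiner are hypothetical.

def count_rows(df):
  # map_dataframe mapper: number of rows in one file's DataFrame
  return len(df)

total_rows = run("capk-fxcm/*2012*",
  map_dataframe = count_rows,
  init = 0,
  combine = lambda acc, n: acc + n,
  label = "Count rows in 2012 HDFs")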
Example 2
def collect_keys_and_launch(training_pattern, testing_pattern, 
    start_hour = 3, end_hour = 7, num_features = 1, future_offset = 600,
    profile = False):
  # Patterns starting with 's3' are parsed into a bucket and a key pattern;
  # local files are not currently supported (see the assertions below).
  # Training and testing HDFs are expected to live in the same S3 bucket.
  
  if training_pattern.startswith('s3'):
    training_bucket, training_pattern = \
      cloud_helpers.parse_bucket_and_pattern(training_pattern)
    if len(training_pattern) == 0:
      training_pattern = '*'
    training_names = cloud_helpers.get_matching_key_names(training_bucket, training_pattern)
  else:
    assert False, "Local training files not supported: %s" % training_pattern
  
  if testing_pattern.startswith('s3'):
    testing_bucket, testing_pattern = \
      cloud_helpers.parse_bucket_and_pattern(testing_pattern)
    if len(testing_pattern) == 0:
      testing_pattern = '*'
    testing_names = cloud_helpers.get_matching_key_names(testing_bucket, testing_pattern)
  else:
    assert False, "Local testing files not supported: %s" % testing_pattern
  assert len(training_names) > 0
  assert len(testing_names) > 0
  assert training_bucket == testing_bucket, \
    "Expected training bucket %s to be same as testing bucket %s" % (training_bucket, testing_bucket)
 
  return launch_jobs(training_bucket, training_names, testing_names, 
    raw_features = None, 
    start_hour = start_hour, 
    end_hour = end_hour, 
    num_features = num_features, 
    future_offset = future_offset, 
    profile = profile)
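
A hypothetical invocation (illustrative only): the bucket and date patterns are made up, but both must point at the same S3 bucket, as the assertions above require.

jobs = collect_keys_and_launch(
  "s3://capk-fxcm/*2011*",   # training keys
  "s3://capk-fxcm/*2012*",   # testing keys
  start_hour = 3,
  end_hour = 7,
  num_features = 1,
  future_offset = 600,
  profile = False)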