Example 1
import os
import sys

# extractor, hdf, output_filenames, and process_local_file are project-local
# helpers assumed to be defined or imported elsewhere in this module.


def process_local_dir(input_path, output_dir=None, max_books=None, heap_profile=False, overwrite=False):
    if not os.path.exists(input_path):
        print("Specified path does not exist:", input_path)
        sys.exit(1)

    # Accept either a directory of CSVs or a single CSV file
    if os.path.isdir(input_path):
        files = os.listdir(input_path)
        basedir = input_path
    else:
        basedir = os.path.split(input_path)[0]
        files = [os.path.basename(input_path)]

    count = 0
    for filename in files:
        if filename.endswith((".csv", ".csv.gz")):
            count += 1
            input_filename = os.path.join(basedir, filename)
            print("----")
            print("Processing #", count, ":", input_filename)

            dest_filename_1ms, dest_filename_100ms = output_filenames(input_filename, output_dir)

            # Skip inputs whose 1ms and 100ms HDFs already exist with the same feature set
            feature_names = extractor.feature_names()
            if (
                not overwrite
                and hdf.complete_hdf_exists(dest_filename_1ms, feature_names)
                and hdf.complete_hdf_exists(dest_filename_100ms, feature_names)
            ):
                print("Skipping %s, found data files %s" % (input_filename, [dest_filename_1ms, dest_filename_100ms]))
            else:
                process_local_file(input_filename, dest_filename_1ms, dest_filename_100ms, max_books, heap_profile)
        else:
            print("Unknown suffix for", filename)
Example 2
def process_s3_file(
    input_bucket_name, input_key_name, output_bucket_name_1ms=None, output_bucket_name_100ms=None, overwrite=False
):
    input_filename = cloud_helpers.download_file_from_s3(input_bucket_name, input_key_name)
    dest_1ms, dest_100ms = output_filenames(input_filename, cloud_helpers.scratch_dir())

    out_key_name_1ms = os.path.split(dest_1ms)[1]
    out_key_name_100ms = os.path.split(dest_100ms)[1]

    # Default output buckets are derived from the input bucket name
    if output_bucket_name_1ms is None:
        output_bucket_name_1ms = input_bucket_name + "-hdf-1ms"

    if output_bucket_name_100ms is None:
        output_bucket_name_100ms = input_bucket_name + "-hdf"

    feature_names = extractor.feature_names()

    # Skip inputs whose HDFs were already uploaded with the same feature set
    if (
        not overwrite
        and cloud_helpers.hdf_already_on_s3(output_bucket_name_1ms, out_key_name_1ms, feature_names)
        and cloud_helpers.hdf_already_on_s3(output_bucket_name_100ms, out_key_name_100ms, feature_names)
    ):
        print("HDFs on S3 have the same features, so skipping this input...")
        return
    else:
        print("HDFs either not on S3 or have different features...")

    # In some weird situation we might have a local copy of the HDF already
    # finished but it just might not have been uploaded yet
    if (
        not overwrite
        and hdf.complete_hdf_exists(dest_1ms, feature_names)
        and hdf.complete_hdf_exists(dest_100ms, feature_names)
    ):
        print("Found finished HDFs on local storage...")
        header = hdf.header_from_hdf_filename(dest_1ms)
    else:
        print("Running feature generator...")
        header = process_local_file(input_filename, dest_1ms, dest_100ms)

    print("Header:", header)
    print("Uploading 1ms feature file", dest_1ms, "to", output_bucket_name_1ms, "/", out_key_name_1ms)
    cloud_helpers.upload_file_to_s3(dest_1ms, output_bucket_name_1ms, out_key_name_1ms, header)
    print("Uploading 100ms feature file", dest_100ms, "to", output_bucket_name_100ms, "/", out_key_name_100ms)
    cloud_helpers.upload_file_to_s3(dest_100ms, output_bucket_name_100ms, out_key_name_100ms, header)
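
A usage sketch for process_s3_file; the bucket and key names below are placeholders, not values from the project. It illustrates the default-bucket rule above: with no explicit output buckets, results go to "<input_bucket>-hdf-1ms" and "<input_bucket>-hdf".

# Hypothetical invocation; "tick-data" and the key name are placeholders.
# Outputs default to the "tick-data-hdf-1ms" and "tick-data-hdf" buckets.
process_s3_file("tick-data", "2013-01-15/EURUSD.csv.gz")

# Explicit output buckets, forcing regeneration even if HDFs already exist:
process_s3_file(
    "tick-data",
    "2013-01-15/EURUSD.csv.gz",
    output_bucket_name_1ms="my-features-1ms",
    output_bucket_name_100ms="my-features-100ms",
    overwrite=True,
)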