import datetime
import os
import sys

# Project-local modules used below (assumed importable from this package).
# output_filenames is assumed to be defined elsewhere in this module.
import cloud_helpers
import extractor
import hdf


def process_local_file(input_filename, dest_1ms, dest_100ms,
                       max_books=None, heap_profile=False):
    """Extract features from one raw file and write the 1ms and 100ms
    frames to HDF. Returns the extracted header."""
    print "Start time:", datetime.datetime.now()
    frames_1ms, frames_100ms, header = extractor.run(
        input_filename, max_books=max_books, heap_profile=heap_profile)
    if dest_1ms:
        hdf.dict_to_hdf(frames_1ms, dest_1ms, header, extractor.feature_names())
    if dest_100ms:
        hdf.dict_to_hdf(frames_100ms, dest_100ms, header, extractor.feature_names())
    return header
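# Usage sketch (hypothetical filenames; in normal use the destination paths
# are derived via output_filenames, as in process_local_dir below):
#
#   header = process_local_file("AAPL_2012-01-03.csv.gz",
#                               "AAPL_2012-01-03_1ms.hdf",
#                               "AAPL_2012-01-03_100ms.hdf")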
def process_local_dir(input_path, output_dir=None, max_books=None,
                      heap_profile=False, overwrite=False):
    """Process every .csv/.csv.gz file under input_path (or a single file),
    skipping inputs whose complete HDF outputs already exist unless
    overwrite is set."""
    if not os.path.exists(input_path):
        print "Specified path does not exist:", input_path
        sys.exit(1)
    if os.path.isdir(input_path):
        files = os.listdir(input_path)
        basedir = input_path
    else:
        basedir = os.path.split(input_path)[0]
        files = [os.path.basename(input_path)]
    count = 0
    for filename in files:
        if filename.endswith(".csv") or filename.endswith(".csv.gz"):
            count += 1
            input_filename = os.path.join(basedir, filename)
            print "----"
            print "Processing #", count, ":", input_filename
            dest_filename_1ms, dest_filename_100ms = \
                output_filenames(input_filename, output_dir)
            feature_names = extractor.feature_names()
            if (not overwrite and
                    hdf.complete_hdf_exists(dest_filename_1ms, feature_names) and
                    hdf.complete_hdf_exists(dest_filename_100ms, feature_names)):
                print "Skipping %s, found data files %s" % \
                    (input_filename, [dest_filename_1ms, dest_filename_100ms])
            else:
                process_local_file(input_filename, dest_filename_1ms,
                                   dest_filename_100ms, max_books, heap_profile)
        else:
            print "Unknown suffix for", filename
def process_s3_file(input_bucket_name, input_key_name,
                    output_bucket_name_1ms=None, output_bucket_name_100ms=None,
                    overwrite=False):
    """Download one raw file from S3, generate HDF feature files (reusing any
    complete copies already on S3 or local disk), and upload the results."""
    input_filename = cloud_helpers.download_file_from_s3(input_bucket_name, input_key_name)
    dest_1ms, dest_100ms = output_filenames(input_filename, cloud_helpers.scratch_dir())
    out_key_name_1ms = os.path.split(dest_1ms)[1]
    out_key_name_100ms = os.path.split(dest_100ms)[1]
    if output_bucket_name_1ms is None:
        output_bucket_name_1ms = input_bucket_name + "-hdf-1ms"
    if output_bucket_name_100ms is None:
        output_bucket_name_100ms = input_bucket_name + "-hdf"
    feature_names = extractor.feature_names()
    if (not overwrite and
            cloud_helpers.hdf_already_on_s3(output_bucket_name_1ms, out_key_name_1ms, feature_names) and
            cloud_helpers.hdf_already_on_s3(output_bucket_name_100ms, out_key_name_100ms, feature_names)):
        print "HDFs on S3 have the same features, so skipping this input..."
        return
    else:
        print "HDFs either not on S3 or have different features..."
    # We might already have a finished local copy of the HDF that simply
    # hasn't been uploaded yet
    if (not overwrite and
            hdf.complete_hdf_exists(dest_1ms, feature_names) and
            hdf.complete_hdf_exists(dest_100ms, feature_names)):
        print "Found finished HDFs on local storage..."
        header = hdf.header_from_hdf_filename(dest_1ms)
    else:
        print "Running feature generator..."
        header = process_local_file(input_filename, dest_1ms, dest_100ms)
    print "Header:", header
    print "Uploading 1ms feature file", dest_1ms, "to", output_bucket_name_1ms, "/", out_key_name_1ms
    cloud_helpers.upload_file_to_s3(dest_1ms, output_bucket_name_1ms, out_key_name_1ms, header)
    print "Uploading 100ms feature file", dest_100ms, "to", output_bucket_name_100ms, "/", out_key_name_100ms
    cloud_helpers.upload_file_to_s3(dest_100ms, output_bucket_name_100ms, out_key_name_100ms, header)
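# Usage sketch (hypothetical bucket and key names): with the default output
# buckets, results land in "raw-books-hdf-1ms" and "raw-books-hdf":
#
#   process_s3_file("raw-books", "AAPL_2012-01-03.csv.gz")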
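# Minimal command-line driver: a sketch, not part of the original module. It
# only uses the functions defined above and dispatches on an "s3://" prefix;
# the flag names here are illustrative, not an established interface.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description="Extract order-book features to 1ms/100ms HDF files")
    parser.add_argument("input", help="local file or directory, or s3://bucket/key")
    parser.add_argument("--output-dir", default=None)
    parser.add_argument("--max-books", type=int, default=None)
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()
    if args.input.startswith("s3://"):
        # Split "s3://bucket/key" into its bucket and key components
        bucket, _, key = args.input[len("s3://"):].partition("/")
        process_s3_file(bucket, key, overwrite=args.overwrite)
    else:
        process_local_dir(args.input, output_dir=args.output_dir,
                          max_books=args.max_books, overwrite=args.overwrite)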