def test_write_filename(self):
    test_file = os.path.join(self.get_test_dir(), "test.log")
    self.storage.write_filename("foo", '{"bar":"baz"}', test_file)
    test_file_md5, test_file_size = fileutil.md5file(test_file)
    self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")

    # Now test writing an object
    test_file = os.path.join(self.get_test_dir(), "test2.log")
    self.storage.write_filename("foo", {"bar": "baz"}, test_file)
    test_file_md5, test_file_size = fileutil.md5file(test_file)
    self.assertEqual(test_file_md5, "0ea91df239ea79ed2ebab34b46d455fc")
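# Note: these tests rely on fileutil.md5file(path) returning a
# (hex_md5, byte_size) tuple. A minimal sketch of an equivalent helper is
# shown below; the name md5file_sketch and the chunk size are assumptions
# for illustration, not the project's actual implementation.
import hashlib


def md5file_sketch(filename, chunk_size=1024 * 1024):
    """Return (hex md5 digest, file size in bytes) for the given file."""
    digest = hashlib.md5()
    size = 0
    with open(filename, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest(), size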
def export_batch(self, data_dir, conn, bucket, files):
    print self.label, "Uploading", ",".join(files)
    if self.dry_run:
        return 0

    # Time the s3funnel call:
    start = datetime.now()
    result = subprocess.call(self.s3f_cmd + files, cwd=data_dir)
    sec = timer.delta_sec(start)

    total_size = 0
    if result == 0:
        # Success! Verify each file's checksum, then truncate it.
        for f in files:
            # Verify checksum and track cumulative size so we can figure out MB/s
            full_filename = os.path.join(data_dir, f)
            md5, size = fileutil.md5file(full_filename)
            total_size += size
            # f is the key name - it does not include the full path to the data dir.
            key = bucket.get_key(f)
            # Strip quotes from md5
            remote_md5 = key.etag[1:-1]
            if md5 != remote_md5:
                # TODO: add it to a "failed" queue.
                print "ERROR: %s failed checksum verification: Local=%s, Remote=%s" % (f, md5, remote_md5)
                self.bad_records += 1
                result = -1
            # TODO: else add it to a "succeeded" queue and remove it locally.
    else:
        print "Failed to upload one or more files in the current batch. Error code was", result

    total_mb = float(total_size) / 1024.0 / 1024.0
    print "Transferred %.2fMB in %.2fs (%.2fMB/s)" % (total_mb, sec, total_mb / sec)
    return result
def test_write(self):
    dims = ["r1", "a1", "c1", "v1", "b1", "20130102"]
    test_dir = self.get_test_dir()
    test_file = self.schema.get_filename(test_dir, dims)
    self.assertEqual(test_file, test_dir + "/r1/a1/c1/v1/b1.20130102.v1.log")
    self.storage.write("foo", '{"bar":"baz"}', dims)
    md5, size = fileutil.md5file(test_file)
    self.assertEqual(md5, "0ea91df239ea79ed2ebab34b46d455fc")
def fetch_s3_files(incoming_files, fetch_cwd, bucket, aws_key, aws_secret_key):
    result = 0
    if len(incoming_files) > 0:
        if not os.path.isdir(fetch_cwd):
            os.makedirs(fetch_cwd)
        files = []
        for f in incoming_files:
            full_filename = os.path.join(fetch_cwd, f)
            if os.path.isfile(full_filename):
                md5, size = fileutil.md5file(full_filename)
                # f is the key name - it does not include the full path to the data dir.
                key = bucket.get_key(f)
                # Strip quotes from md5
                remote_md5 = key.etag[1:-1]
                if md5 != remote_md5:
                    files.append(f)
                else:
                    print "Already downloaded", f
            else:
                files.append(f)

        fetch_cmd = [S3FUNNEL_PATH]
        fetch_cmd.append(bucket.name)
        fetch_cmd.append("get")
        fetch_cmd.append("-a")
        fetch_cmd.append(aws_key)
        fetch_cmd.append("-s")
        fetch_cmd.append(aws_secret_key)
        fetch_cmd.append("-t")
        fetch_cmd.append("8")

        # Fetch in batches of 8 files at a time
        while len(files) > 0:
            current_files = files[0:8]
            files = files[8:]
            start = datetime.now()
            result = subprocess.call(fetch_cmd + current_files, cwd=fetch_cwd)
            duration_sec = timer.delta_sec(start)
            # TODO: verify MD5s
            downloaded_bytes = sum([os.path.getsize(os.path.join(fetch_cwd, f)) for f in current_files])
            downloaded_mb = downloaded_bytes / 1024.0 / 1024.0
            print "Downloaded %.2fMB in %.2fs (%.2fMB/s)" % (downloaded_mb, duration_sec, downloaded_mb / duration_sec)
            if result != 0:
                break
    return result
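# A hedged usage sketch of fetch_s3_files. The bucket name, key name, working
# directory, and credentials below are placeholders for illustration, not
# values from this project.
from boto.s3.connection import S3Connection


def example_fetch():
    conn = S3Connection("AWS_KEY", "AWS_SECRET_KEY")
    bucket = conn.get_bucket("example-telemetry-bucket")
    result = fetch_s3_files(["r1/a1/c1/v1/b1.20130102.v1.log"],
                            "/tmp/telemetry_work", bucket,
                            "AWS_KEY", "AWS_SECRET_KEY")
    if result != 0:
        print "One or more downloads failed with code", result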
def main():
    args = get_args()
    logging.basicConfig()
    logger = logging.getLogger(__name__)
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.WARNING)

    if not os.path.exists(args.work_dir):
        os.makedirs(args.work_dir)

    logger.info("Sanitizing FirefoxOS data from {} and moving it to {}".format(
        args.source_bucket, args.dest_bucket))
    logger.debug("Connecting to S3...")
    conn = S3Connection(args.aws_key, args.aws_secret_key)
    source_bucket = conn.get_bucket(args.source_bucket)
    dest_bucket = conn.get_bucket(args.dest_bucket)
    compress_cmd = [StorageLayout.COMPRESS_PATH] + StorageLayout.COMPRESSION_ARGS

    prefix = args.prefix
    last_key = ''
    done = False
    total_count = 0
    total_bytes = 0
    start_time = datetime.now()
    dupe_map = {}
    while not done:
        try:
            for k in source_bucket.list(prefix=prefix, marker=last_key):
                if k.name.endswith('/'):
                    logger.debug("Skipping directory '{}'".format(k.name))
                    continue
                if skip_by_date(k.name, args.min_date, logger):
                    logger.debug("Skipping file older than {}: {}".format(args.min_date, k.name))
                    continue
                total_count += 1
                total_bytes += k.size
                last_key = k.name
                if total_count % 100 == 0:
                    logger.info("Looked at {} total records in {} seconds. Last key was {}".format(
                        total_count, timer.delta_sec(start_time), last_key))

                logger.debug("Fetching {} from source bucket".format(k.name))
                full_source_filename = os.path.join(args.work_dir, "__source", k.name)
                full_dest_filename = os.path.join(args.work_dir, "__dest", k.name)

                # Ensure that the necessary local dirs exist:
                for f in [full_source_filename, full_dest_filename]:
                    dirname = os.path.dirname(f)
                    if dirname != '' and not os.path.exists(dirname):
                        os.makedirs(dirname)

                logger.debug("Getting '{}' to '{}'".format(k.name, full_source_filename))
                k.get_contents_to_filename(full_source_filename)

                logger.info("Removing pingIDs...")
                tmp_out_file = full_dest_filename + ".tmp"
                out_handle = open(tmp_out_file, "w")
                logger.debug("Uncompressing...")
                if full_source_filename.endswith(StorageLayout.COMPRESSED_SUFFIX):
                    decompress_cmd = [StorageLayout.COMPRESS_PATH] + StorageLayout.DECOMPRESSION_ARGS
                    raw_handle = open(full_source_filename, "rb")
                    # Popen the decompressing version of StorageLayout.COMPRESS_PATH
                    p_decompress = Popen(decompress_cmd, bufsize=65536, stdin=raw_handle,
                                         stdout=PIPE, stderr=sys.stderr)
                    handle = p_decompress.stdout
                else:
                    handle = open(full_source_filename, "r")
                    raw_handle = None

                logger.debug("Generating new pingIDs...")
                for line in handle:
                    # Lines are of the form <key><tab><json payload><newline>.
                    # Split on tab character to get the pieces.
                    key, payload = line.split(u"\t", 1)
                    # Replace key with a fresh UUID:
                    if key in dupe_map:
                        logger.info("Already saw key {}, skipping any more occurrences".format(key))
                    else:
                        new_key = str(uuid4())
                        dupe_map[key] = new_key
                        out_handle.write(u"%s\t%s" % (new_key, payload))
                handle.close()
                out_handle.close()
                if raw_handle:
                    raw_handle.close()

                sql_update = None
                empty_result = False
                if os.stat(tmp_out_file).st_size > 0:
                    logger.debug("Compressing new file...")
                    f_comp = open(full_dest_filename, "wb")
                    f_raw = open(tmp_out_file, "r", 1)
                    p_compress = Popen(compress_cmd, bufsize=65536, stdin=f_raw,
                                       stdout=f_comp, stderr=sys.stderr)
                    p_compress.communicate()
                    f_raw.close()
                    f_comp.close()
                    local_md5, size = fu.md5file(full_dest_filename)
                    sql_update = ("UPDATE published_files SET "
                                  "file_md5 = '{0}', "
                                  "file_size = {1}, "
                                  "bucket_name = '{2}' "
                                  "WHERE file_name = '{3}';".format(local_md5, size, dest_bucket.name, k.name))
                else:
                    # Don't upload empty files.
                    empty_result = True
                    sql_update = "DELETE FROM published_files WHERE file_name = '{0}';".format(k.name)
                    logger.debug("File was empty, skipping: {}".format(tmp_out_file))

                logger.info("Removing temp output file: {}".format(tmp_out_file))
                os.remove(tmp_out_file)

                if not empty_result and should_run(args.dry_run, logger, "Uploading to dest bucket"):
                    dest_key = dest_bucket.new_key(k.name)
                    dest_key.set_contents_from_filename(full_dest_filename)
                    # Compare the md5 to be sure it succeeded.
                    dest_md5 = dest_key.etag[1:-1]
                    local_md5, size = fu.md5file(full_dest_filename)
                    if dest_md5 != local_md5:
                        raise Exception("Failed to upload {}".format(full_dest_filename))

                if should_run(args.dry_run, logger, "Removing input file: {}".format(full_source_filename)):
                    os.remove(full_source_filename)
                if not empty_result and should_run(args.dry_run, logger,
                                                   "Removing output file: {}".format(full_dest_filename)):
                    os.remove(full_dest_filename)

                if empty_result or args.source_bucket != args.dest_bucket:
                    if should_run(args.dry_run, logger, "Deleting from source bucket"):
                        k.delete()
                else:
                    logger.info("Not deleting source: either non-empty or same bucket: {}".format(k.name))

                if sql_update is None:
                    logger.error("Missing sql_update :(")
                else:
                    logger.info(sql_update)
                if should_run(args.dry_run, logger, "Notifying coordinator"):
                    # TODO
                    logger.debug("Should be actually notifying coordinator")
            done = True
        except socket.error, e:
            logger.error("Error listing keys: {}".format(e))
            logger.error(traceback.format_exc())
            logger.info("Continuing from last seen key: {}".format(last_key))
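# The sanitizer above guards each side effect with should_run(dry_run, logger,
# message). A minimal sketch consistent with how it is called here; the exact
# log wording is an assumption, not the project's actual implementation.
def should_run(dry_run, logger, message):
    """Log the intended action and return True only when not in dry-run mode."""
    if dry_run:
        logger.info("Dry run: skipping '{}'".format(message))
        return False
    logger.info(message)
    return True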
"allowed_values": ["c1", "c2", "c3"] }, { "field_name": "appVersion", "allowed_values": "*" }, { "field_name": "appBuildID", "allowed_values": "*" }, { "field_name": "submission_date", "allowed_values": { "min": "20130101", "max": "20131231" } } ] } try: schema = TelemetrySchema(schema_spec) storage = StorageLayout(schema, test_dir, 10000) test_file_1 = os.path.join(test_dir, "test.log") storage.write_filename("foo", '{"bar": "baz"}', test_file_1) test_file_1_md5, test_file_1_size = fileutil.md5file(test_file_1) assert test_file_1_md5 == "206dd2d33a04802c31d2c74f10cc472b" assert storage.clean_newlines("ab\n\ncd\r\n") == "ab cd " finally: shutil.rmtree(test_dir)