def _MakeConfigPackage(self, package_path):
  """Produces the config package.

  Args:
    package_path: Path for the output file.

  Returns:
    Number of configs generated.

  Raises:
    BatchSimClientError: The final simulation time is unset or is too large,
      or running with keep_h5_logs and use_local_worker but local_h5_logs_dir
      is unset.
  """
  self._packager.OpenPackage(package_path)
  num_configs = 0
  for i, config in enumerate(self._GenerateConfigs()):
    if 'sim_time' not in config['sim']:
      raise BatchSimClientError('Sim time is unset.')
    elif config['sim']['sim_time'] > FLAGS.max_sim_time:
      raise BatchSimClientError(
          'Sim time (%s) exceeds the value of --max_sim_time (%s).' %
          (config['sim']['sim_time'], FLAGS.max_sim_time))

    if FLAGS.use_local_worker:
      config['output_file_path'] = os.path.abspath(
          os.path.join(self._local_output_dir, '%d.json' % i))
    else:
      # TODO: It's weird having this one case in which we use the gs://makani/
      # prefix for a Cloud Storage path. It's probably better to include it
      # universally - knowing whether a path is meant to be remote or local is
      # useful debugging information.
      config['output_file_path'] = ('gs://makani/%s/%d.json' %
                                    (self._gcs_output_dir, i))

    config['h5_keep_sparse_log_only'] = (FLAGS.keep_sparse_h5_logs and
                                         not FLAGS.keep_h5_logs)
    config['h5_log_file_path'] = ''
    if FLAGS.keep_h5_logs or FLAGS.keep_sparse_h5_logs:
      if FLAGS.use_local_worker:
        if FLAGS.local_h5_logs_dir:
          config['h5_log_file_path'] = os.path.join(
              FLAGS.local_h5_logs_dir, '%d.h5' % i)
        else:
          raise BatchSimClientError('local_h5_logs_dir is unset.')
      else:
        config['h5_log_file_path'] = gcloud_util.GcsPath(
            'gs://makani/', self._gcs_h5_log_dir, '%d.h5' % i)

    self._packager.AddString(
        json.dumps(config, indent=2, separators=(',', ': ')),
        'gce_config/%d.json' % i)
    num_configs += 1

  self._packager.ClosePackage()
  return num_configs
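# Illustrative sketch (not part of the client): the fields that
# _MakeConfigPackage sets or checks in each packaged 'gce_config/<i>.json'
# entry, here assuming a remote (non-local) worker with --keep_h5_logs set.
# The Cloud Storage directory names below are hypothetical, and the rest of
# the simulation config produced by _GenerateConfigs is omitted.
_example_worker_config = {
    'sim': {'sim_time': 600.0},  # Must be set and <= --max_sim_time.
    'output_file_path': 'gs://makani/batch_sim/output/0.json',  # Hypothetical.
    'h5_keep_sparse_log_only': False,
    'h5_log_file_path': 'gs://makani/batch_sim/h5_logs/0.h5',  # Hypothetical.
}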
def _GetRemoteFilename(local_dir, filename, dest_dir, rename_template=None):
  """Get the name of the remote file to upload to.

  Args:
    local_dir: The local directory where the file resides.
    filename: The name of the file.
    dest_dir: The remote directory to upload to.
    rename_template: The template used to rename the file at the destination
      (default: None, which preserves the original filename).

  Returns:
    Full path to the cloud file to upload to.
  """
  renamed_filename = _RenameFile(filename, rename_template, local_dir)
  return gcloud_util.GcsPath(dest_dir, renamed_filename)
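# Illustrative note (not part of the module): with rename_template=None the
# original filename is preserved, so a call like the one below (paths are
# hypothetical) simply returns GcsPath(dest_dir, 'w7-2013.h5').
#
#   _GetRemoteFilename('/data/logs/w7', 'w7-2013.h5',
#                      'gs://makani/sandbox/logs/w7/logs')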
def Upload(self, preserve_local, clean_uploaded):
  """Attempt to upload files according to the configuration.

  Args:
    preserve_local: True if local files should not be removed after
      uploading.
    clean_uploaded: True if local files that were previously uploaded should
      be removed.

  Raises:
    SynchronizerError: If an issue was found with the configuration.

  Returns:
    A list of tuples, each of the form (local_filename, uploaded_filename).
  """
  uploaded_files = []
  for system in self._config['systems']:
    if FLAGS.systems and system not in FLAGS.systems:
      continue
    for collection in self._config['collections']:
      if FLAGS.collections and collection['name'] not in FLAGS.collections:
        continue
      path_string = os.path.join(
          self._config['local_basedir'], system, collection['local_path'])
      local_path_template = string.Template(path_string)
      try:
        local_path = local_path_template.substitute(os.environ)
      except KeyError as e:
        logging.error('Local path %s expects a missing environment '
                      'variable: %s', path_string, e.message)
        continue
      if not os.path.isdir(local_path):
        logging.debug('Skipped nonexistent local directory "%s".', local_path)
        continue
      else:
        logging.info('Uploading local directory "%s" for collection "%s".',
                     local_path, collection['name'])
      for source in collection['sources']:
        # Upload logs from one local directory to the cloud.
        dest_name = source.get('dest_name', None)
        dest_path = gcloud_util.GcsPath(self._config['remote_basedir'],
                                        system, collection['remote_path'])
        src_dir_pattern = source.get('src_dir_pattern', None)
        src_pattern = source.get('src_pattern', None)
        existing_dest_paths, gs_api = auto_upload.PrepareToUpload(
            local_path, dest_path)
        if src_dir_pattern:
          regex_file = re.compile(src_pattern) if src_pattern else None
          for directory, dirpath in auto_upload.IterDirectories(
              local_path, src_dir_pattern):
            auto_upload.TryUploadDirectory(
                directory, dirpath, dest_path, regex_file, dest_name, gs_api,
                preserve_local, True, clean_uploaded, uploaded_files)
        elif src_pattern:
          for filename, dirpath in auto_upload.IterFiles(
              local_path, src_pattern):
            result = auto_upload.TryUploadFile(
                filename, dirpath, dest_path, existing_dest_paths, dest_name,
                gs_api, preserve_local, True, clean_uploaded)
            if result:
              uploaded_files.append(result)
        else:
          raise SynchronizerError('A source requires at least a src_pattern '
                                  'or src_dir_pattern.')
  return uploaded_files
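# A minimal sketch of the configuration dict that Upload() walks; every value
# below is hypothetical. Path segments may reference environment variables
# that are expanded via string.Template, and each source must define
# 'src_pattern' and/or 'src_dir_pattern' (an optional 'dest_name' rename
# template may also be given per source).
_example_config = {
    'local_basedir': '$HOME/makani_logs',  # '$HOME' comes from os.environ.
    'remote_basedir': 'gs://makani/sandbox/logs',
    'systems': ['M600A', 'w7'],
    'collections': [{
        'name': 'logs',
        'local_path': 'logs',
        'remote_path': 'logs',
        'sources': [
            {'src_pattern': r'.*\.h5'},       # Upload matching files.
            {'src_dir_pattern': r'logs\d+'},  # Upload matching directories.
        ],
    }],
}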
def TryUploadDirectory(directory, parent_relpath, dest_dir, source_file_regex,
                       rename_template, gs_api, preserve_local, check_timing,
                       clean_uploaded, uploaded_files):
  """Attempt to upload a directory.

  Args:
    directory: The name of the directory.
    parent_relpath: The local directory where the directory resides.
    dest_dir: The remote directory to upload to.
    source_file_regex: The precompiled regular expression to test whether a
      file should be uploaded. If None, all files are uploaded. The regex is
      matched against the subpath within `directory`.
    rename_template: The template used to rename the file at the destination.
      If None, the original file name is preserved.
    gs_api: The gsutil.GsutilApi object.
    preserve_local: If True, the source files will remain after uploading.
    check_timing: If True, the upload will begin only if preconditions are
      met.
    clean_uploaded: True if a local log should be removed if the scan finds
      it is already uploaded.
    uploaded_files: A list of tuples, each of the form
      (local_filename, uploaded_filename).

  Raises:
    BadTimeToUploadError: Raised if it is not the right time to upload.
  """
  base_relpath = os.path.join(parent_relpath, directory)
  renamed_directory = _RenameFile(directory, rename_template, parent_relpath)
  full_cloud_path = gcloud_util.GcsPath(dest_dir, renamed_directory)
  # Test whether any file already exists with this prefix.
  # TODO: Could be made more efficient if there were an "Exists" call.
  is_new_path = not bool(gs_api.List(full_cloud_path))

  # Upload all files (except symbolic links) within the directory.
  # Do not rename any files within the directory.
  rename_template = None
  for sub_directory, sub_directories, files in os.walk(base_relpath):
    rel_path = os.path.relpath(sub_directory, base_relpath)
    if rel_path == '.':
      sub_cloud_directory = full_cloud_path
    else:
      sub_cloud_directory = gcloud_util.GcsPath(full_cloud_path, rel_path)

    if is_new_path:
      existing_dest_paths = set()
    else:
      try:
        existing_dest_paths = set(gs_api.List(sub_cloud_directory))
      except httplib2.ServerNotFoundError:
        # The internet connection has become unavailable.
        return

    # Files in upper-level directories are uploaded first; we assume files in
    # subdirectories take lower priority / are less interesting. Within one
    # directory, files tagged with larger timestamps are uploaded first.
    files.sort(reverse=True)
    for filename in files:
      file_path = os.path.join(sub_directory, filename)
      rel_path = os.path.relpath(file_path, base_relpath)
      if source_file_regex and not source_file_regex.match(rel_path):
        continue
      try:
        result = TryUploadFile(filename, sub_directory, sub_cloud_directory,
                               existing_dest_paths, rename_template, gs_api,
                               preserve_local, check_timing, clean_uploaded)
      except BadTimeToUploadError:
        return
      else:
        if result:
          uploaded_files.append(result)
    # Traverse directories with larger timestamps first.
    sub_directories.sort(reverse=True)
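# A minimal usage sketch with hypothetical paths and patterns: upload the
# local directory '/data/logs/logs1', keeping only files whose subpath ends
# in '.h5', without renaming the directory, preserving local copies, and
# skipping the timing preconditions.
import re

from makani.lib.log_synchronizer import auto_upload
from makani.lib.python import gsutil

uploaded_pairs = []
auto_upload.TryUploadDirectory(
    directory='logs1',
    parent_relpath='/data/logs',
    dest_dir='gs://makani/sandbox/logs/w7/logs',
    source_file_regex=re.compile(r'.*\.h5$'),
    rename_template=None,  # Preserve the original directory name.
    gs_api=gsutil.GsutilApi(),
    preserve_local=True,
    check_timing=False,
    clean_uploaded=False,
    uploaded_files=uploaded_pairs)
# uploaded_pairs now holds (local_filename, uploaded_filename) tuples.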
def AssertNoRemoteFiles():
  gs_api = gsutil.GsutilApi()
  filenames = gs_api.List(
      gcloud_util.GcsPath(CLOUD_BASE_DIR, CLOUD_LOG_PATH))
  self.assertEqual(filenames, [])
def testUploadGoodTiming(self):

  def AssertRemoteFiles():
    gs_api = gsutil.GsutilApi()
    filenames = gs_api.List(
        gcloud_util.GcsPath(CLOUD_BASE_DIR, CLOUD_LOG_PATH))
    prefix = os.path.join(CLOUD_BASE_DIR, CLOUD_LOG_PATH)
    self.assertEqual(
        set(filenames),
        {
            os.path.join(prefix, 'w7/logs/logs1-folder/subfolder/dummy.json'),
            os.path.join(
                prefix, 'w7/logs/logs1-folder/another_subfolder/dummy.json'),
            os.path.join(
                prefix, 'w7/logs/logs1-folder/one_more_subfolder/dummy.json'),
            os.path.join(prefix, 'w7/logs/w7-2013.h5'),
            os.path.join(prefix, 'M600A/logs/m600.h5'),
            os.path.join(prefix, 'M600A/logs/dummy.json'),
            os.path.join(prefix, 'w7/logs/logs1-folder/dummy.json'),
            os.path.join(prefix, 'w7/logs/logs1-folder/dummy.txt'),
        })

  def RemoveRemoteFiles():
    gs_api = gsutil.GsutilApi()
    filenames = gs_api.List(
        gcloud_util.GcsPath(CLOUD_BASE_DIR, CLOUD_LOG_PATH))
    for filename in filenames:
      gs_api.DeleteFile(filename)
    self.assertFalse(
        gs_api.List(gcloud_util.GcsPath(CLOUD_BASE_DIR, CLOUD_LOG_PATH)))

  test_config = self._LoadTestConfig()
  synchronizer = self._CreateLogSynchronizerFromJson(test_config)
  upload_expected = [
      # 1. Uploading *.h5 files from the first source.
      (os.path.join(self._test_dir, 'logs/M600A/m600.h5'),
       gcloud_util.GcsPath(CLOUD_BASE_DIR, CLOUD_LOG_PATH,
                           'M600A/logs/m600.h5')),
      (os.path.join(self._test_dir, 'logs/w7/w7-2013.h5'),
       gcloud_util.GcsPath(CLOUD_BASE_DIR, CLOUD_LOG_PATH,
                           'w7/logs/w7-2013.h5')),
      # 2. Uploading the `folder` directory from the second source.
      # The files are uploaded before subdirectories.
      # Files are uploaded in reverse alphabetical order.
      (os.path.join(self._test_dir, 'logs/w7/logs1/folder/dummy.txt'),
       gcloud_util.GcsPath(CLOUD_BASE_DIR, CLOUD_LOG_PATH,
                           'w7/logs/logs1-folder/dummy.txt')),
      (os.path.join(self._test_dir, 'logs/w7/logs1/folder/dummy.json'),
       gcloud_util.GcsPath(CLOUD_BASE_DIR, CLOUD_LOG_PATH,
                           'w7/logs/logs1-folder/dummy.json')),
      # Subdirectories are uploaded in reverse alphabetical order.
      (os.path.join(self._test_dir,
                    'logs/w7/logs1/folder/subfolder/dummy.json'),
       gcloud_util.GcsPath(CLOUD_BASE_DIR, CLOUD_LOG_PATH,
                           'w7/logs/logs1-folder/subfolder/dummy.json')),
      (os.path.join(self._test_dir,
                    'logs/w7/logs1/folder/one_more_subfolder/dummy.json'),
       gcloud_util.GcsPath(
           CLOUD_BASE_DIR, CLOUD_LOG_PATH,
           'w7/logs/logs1-folder/one_more_subfolder/dummy.json')),
      (os.path.join(self._test_dir,
                    'logs/w7/logs1/folder/another_subfolder/dummy.json'),
       gcloud_util.GcsPath(
           CLOUD_BASE_DIR, CLOUD_LOG_PATH,
           'w7/logs/logs1-folder/another_subfolder/dummy.json')),
      # 3. Uploading dummy.json from the third source.
      (os.path.join(self._test_dir, 'logs/M600A/dummy.json'),
       gcloud_util.GcsPath(CLOUD_BASE_DIR, CLOUD_LOG_PATH,
                           'M600A/logs/dummy.json')),
  ]
  all_files_to_upload = set(pair[0] for pair in upload_expected)

  with PatchCloudFakes(self._test_dir) as cloud_fake:
    with PatchEnvFakes(has_internet=True, is_idle=True):
      uploaded = synchronizer.Upload(preserve_local=True,
                                     clean_uploaded=False)
      uploaded.sort(key=lambda x: x[0])
      upload_expected.sort(key=lambda x: x[0])
      self.assertEqual(uploaded, upload_expected)
      AssertRemoteFiles()

      # Uploading again should result in no updates.
      uploaded = synchronizer.Upload(preserve_local=True,
                                     clean_uploaded=False)
      self.assertEqual(uploaded, [])
      AssertRemoteFiles()

      # Make a snapshot of files in the faked file system.
      local_files_copy = copy.copy(cloud_fake.GetLocalFiles())

      # If we remove remote files and retry, they should be uploaded again.
      RemoveRemoteFiles()
      uploaded = synchronizer.Upload(preserve_local=False,
                                     clean_uploaded=False)
      uploaded.sort(key=lambda x: x[0])
      upload_expected.sort(key=lambda x: x[0])
      self.assertEqual(uploaded, upload_expected)
      # However, local files should be deleted if preserve_local is False.
      self.assertFalse(all_files_to_upload & set(cloud_fake.GetLocalFiles()))

      # Now restore the faked file system as if no local files were deleted.
      cloud_fake.SetLocalFiles(local_files_copy)
      # Upload again. No files should be uploaded because they are already
      # in the cloud, but local files should be deleted.
      uploaded = synchronizer.Upload(preserve_local=True, clean_uploaded=True)
      self.assertEqual(uploaded, [])
      self.assertFalse(all_files_to_upload & set(cloud_fake.GetLocalFiles()))

      # Make a snapshot of files in the faked cloud system.
      cloud_files_copy = copy.copy(cloud_fake.GetCloudFiles())

  with PatchCloudFakes(self._test_dir, right_checksum=False) as cloud_fake:
    with PatchEnvFakes(has_internet=True, is_idle=True):
      # Restore the cloud file system as if the files were already uploaded.
      cloud_fake.SetCloudFiles(cloud_files_copy)
      AssertRemoteFiles()
      # No upload should succeed, due to the wrong checksum.
      uploaded = synchronizer.Upload(preserve_local=False,
                                     clean_uploaded=True)
      self.assertEqual(uploaded, [])
      AssertRemoteFiles()
      # All files should be preserved locally due to the mismatched checksum.
      self.assertEqual(
          all_files_to_upload & set(cloud_fake.GetLocalFiles()),
          all_files_to_upload)
import copy
import os
import string
import tempfile
import unittest

from makani.lib.log_synchronizer import auto_upload
from makani.lib.log_synchronizer import synchronizer as logsync
from makani.lib.python import gsutil
from makani.lib.python import gsutil_fakes
from makani.lib.python.batch_sim import gcloud_fakes
from makani.lib.python.batch_sim import gcloud_util
import mock

# The cloud folder that serves all log collections.
CLOUD_BUCKET = 'gs://makani'
CLOUD_ROOT_PATH = 'sandbox/logs'
CLOUD_BASE_DIR = gcloud_util.GcsPath(CLOUD_BUCKET, CLOUD_ROOT_PATH)

# The cloud folder to upload the collection of logs to during the test.
CLOUD_LOG_PATH = 'cloud/subfolder/'


class PatchCloudFakes(object):
  """Patches to fake cloud storage needed for a log synchronizer unit test.

  Exclusively for use as a context manager.
  """

  def __init__(self, test_dir, right_checksum=True):
    """Creates the Patcher.

    Args:
      test_dir: The local directory containing the testing files.
      right_checksum: If False, the faked GsutilApi returns the wrong
        checksum.