def download_model_from_gcs(local_model_directory, fuzzer_name):
  """Pull the model from the GCS bucket into the given model directory."""
  # The ML model is stored in the corpus bucket.
  gcs_corpus_bucket = environment.get_value('CORPUS_BUCKET')
  if not gcs_corpus_bucket:
    logs.log('Corpus bucket is not set. Skip generation.')
    return False

  # Get the cloud storage path,
  # e.g. gs://clusterfuzz-corpus/rnn/libpng_read_fuzzer.
  gcs_model_directory = 'gs://%s/%s/%s' % (
      gcs_corpus_bucket, constants.RNN_MODEL_NAME, fuzzer_name)

  logs.log('GCS model directory for fuzzer %s is %s.' %
           (fuzzer_name, gcs_model_directory))

  # The RNN model consists of three files.
  meta_filename = constants.RNN_MODEL_NAME + constants.MODEL_META_SUFFIX
  data_filename = constants.RNN_MODEL_NAME + constants.MODEL_DATA_SUFFIX
  index_filename = constants.RNN_MODEL_NAME + constants.MODEL_INDEX_SUFFIX

  # Cloud file paths.
  gcs_meta_path = '%s/%s' % (gcs_model_directory, meta_filename)
  gcs_data_path = '%s/%s' % (gcs_model_directory, data_filename)
  gcs_index_path = '%s/%s' % (gcs_model_directory, index_filename)

  # Check that all three model files exist.
  if not (storage.exists(gcs_meta_path) and storage.exists(gcs_data_path) and
          storage.exists(gcs_index_path)):
    logs.log('ML RNN model for fuzzer %s does not exist. Skip generation.' %
             fuzzer_name)
    return False

  # Local file paths.
  local_meta_path = os.path.join(local_model_directory, meta_filename)
  local_data_path = os.path.join(local_model_directory, data_filename)
  local_index_path = os.path.join(local_model_directory, index_filename)

  # Download the model files.
  result = (storage.copy_file_from(gcs_meta_path, local_meta_path) and
            storage.copy_file_from(gcs_data_path, local_data_path) and
            storage.copy_file_from(gcs_index_path, local_index_path))
  if not result:
    logs.log('Failed to download the RNN model for fuzzer %s. '
             'Skip generation.' % fuzzer_name)
    return False

  return True
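# Usage sketch (illustrative, not part of the original module): stage the RNN
# model into a scratch directory before input generation. The
# 'FUZZ_INPUTS_DISK' environment variable, the subdirectory name and the
# helper name below are assumptions made for this example.
def _example_stage_rnn_model(fuzzer_name):
  """Illustrative only: download the RNN model and return its local path."""
  model_directory = os.path.join(
      environment.get_value('FUZZ_INPUTS_DISK'), 'rnn_model')
  shell.create_directory(model_directory)
  if not download_model_from_gcs(model_directory, fuzzer_name):
    return None  # Model is missing or the download failed; skip generation.
  return model_directory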
def _cross_pollinate_other_fuzzer_corpuses(self):
  """Add other fuzzer corpuses to shared corpus path for cross-pollination."""
  corpus_backup_date = utils.utcnow().date() - datetime.timedelta(
      days=data_types.CORPUS_BACKUP_PUBLIC_LOOKBACK_DAYS)

  for cross_pollinate_fuzzer in self.cross_pollinate_fuzzers:
    project_qualified_name = (
        cross_pollinate_fuzzer.fuzz_target.project_qualified_name())
    backup_bucket_name = cross_pollinate_fuzzer.backup_bucket_name
    corpus_engine_name = cross_pollinate_fuzzer.corpus_engine_name

    corpus_backup_url = corpus_manager.gcs_url_for_backup_file(
        backup_bucket_name, corpus_engine_name, project_qualified_name,
        corpus_backup_date)
    corpus_backup_local_filename = '%s-%s' % (
        project_qualified_name, os.path.basename(corpus_backup_url))
    corpus_backup_local_path = os.path.join(self.shared_corpus_path,
                                            corpus_backup_local_filename)

    if not storage.exists(corpus_backup_url, ignore_errors=True):
      # This can happen when a new fuzz target has just been checked in, or
      # when we missed capturing a backup for a particular day (for OSS-Fuzz,
      # this results in a 403 instead of a 404, since that GCS path belongs
      # to another project). Just log a warning for debugging purposes.
      logs.log_warn('Corpus backup does not exist, ignoring: %s.' %
                    corpus_backup_url)
      continue

    if not storage.copy_file_from(corpus_backup_url,
                                  corpus_backup_local_path):
      continue

    corpus_backup_output_directory = os.path.join(self.shared_corpus_path,
                                                  project_qualified_name)
    shell.create_directory(corpus_backup_output_directory)
    result = archive.unpack(corpus_backup_local_path,
                            corpus_backup_output_directory)
    shell.remove_file(corpus_backup_local_path)

    if result:
      logs.log('Corpus backup url %s successfully unpacked into shared '
               'corpus.' % corpus_backup_url)
    else:
      logs.log_error('Failed to unpack corpus backup from url %s.' %
                     corpus_backup_url)
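# Hedged sketch (not part of the original code): the method above only assumes
# that each entry in self.cross_pollinate_fuzzers has a fuzz_target exposing
# project_qualified_name(), plus backup_bucket_name and corpus_engine_name
# attributes. A namedtuple like the one below would satisfy that contract; the
# exact field layout here is an assumption for illustration, and the import
# would normally live at the top of the module.
import collections

CrossPollinateFuzzer = collections.namedtuple(
    'CrossPollinateFuzzer',
    ['fuzz_target', 'backup_bucket_name', 'corpus_engine_name'])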
def download_recommended_dictionary_from_gcs(self, local_dict_path):
  """Download recommended dictionary from GCS to the given location.

  Args:
    local_dict_path: Path to a dictionary file on disk.

  Returns:
    A boolean indicating whether downloading succeeded or not.
  """
  # When the fuzz target is initially created, or when it has no new coverage
  # or dictionary recommendations, we won't have a recommended dictionary in
  # GCS.
  if not storage.exists(self.gcs_path):
    return False

  if storage.copy_file_from(self.gcs_path, local_dict_path):
    return True

  logs.log('Downloading %s failed.' % self.gcs_path)
  return False
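# Usage sketch (illustrative, not part of the original code): fetch the
# recommended dictionary into a working directory before a fuzzing session.
# The manager parameter, helper name and file name are assumptions for the
# example.
def _example_fetch_recommended_dictionary(dictionary_manager, work_directory):
  """Illustrative only: returns a local dictionary path, or None."""
  local_dict_path = os.path.join(work_directory, 'recommended.dict')
  if not dictionary_manager.download_recommended_dictionary_from_gcs(
      local_dict_path):
    return None  # No recommendations yet, or the download failed.
  return local_dict_path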
def update_recommended_dictionary(self, new_dictionary):
  """Update the recommended dictionary in GCS with new dictionary elements.

  Args:
    new_dictionary: A set of dictionary elements to be added into the
        dictionary.

  Returns:
    The number of new elements actually added to the dictionary stored in
    GCS.
  """
  if environment.is_lib():
    return 0

  # If the dictionary does not already exist, then directly update it.
  if not storage.exists(self.gcs_path):
    storage.write_data('\n'.join(new_dictionary).encode('utf-8'),
                       self.gcs_path)
    return len(new_dictionary)

  # Read the current version of the dictionary.
  old_dictionary_data = storage.read_data(self.gcs_path).decode('utf-8')

  # Use a compare-and-swap-like approach to avoid race conditions and to
  # avoid needing a separate job that merges multiple recommended
  # dictionaries.
  succeeded = False
  while not succeeded:
    # If old_dictionary_data is None, there is no dictionary in GCS yet,
    # i.e. it's empty. Otherwise, parse and use it.
    old_dictionary = set()
    if old_dictionary_data:
      old_dictionary = set(old_dictionary_data.splitlines())

    # Merge the two dictionaries.
    new_dictionary |= old_dictionary
    if new_dictionary == old_dictionary:
      # The "new" elements are already present in GCS, bail out.
      return 0

    succeeded, old_dictionary_data = self._compare_and_swap_gcs_dictionary(
        old_dictionary_data, '\n'.join(new_dictionary))

  return len(new_dictionary) - len(old_dictionary)
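# Hedged sketch (the real helper is defined elsewhere): the contract assumed
# above is "write only if the remote content still matches what we last read;
# otherwise report failure and return the fresh content so the caller can
# retry the merge". The read-compare-write below illustrates that contract but
# is not truly atomic; the name and body are assumptions for illustration
# only.
def _compare_and_swap_gcs_dictionary_sketch(self, old_content, new_content):
  """Illustrative only: returns (succeeded, current_content)."""
  current_content = storage.read_data(self.gcs_path).decode('utf-8')
  if current_content != old_content:
    # Another writer updated the dictionary; let the caller retry against
    # the fresh content.
    return False, current_content

  storage.write_data(new_content.encode('utf-8'), self.gcs_path)
  return True, new_content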