def _wait_for_operation(response, project, zone):
  """Wait for the given operation to complete."""
  if 'status' in response and response['status'] == 'DONE':
    return

  if 'kind' not in response or response['kind'] != 'compute#operation':
    logs.log_error('Compute api response not an operation.')
    return

  api = _get_api()
  operation = response['name']
  start_time = datetime.datetime.utcnow()

  while not dates.time_has_expired(start_time, seconds=OPERATION_TIMEOUT):
    operation_func = api.zoneOperations().get(
        operation=operation, project=project, zone=zone)
    response = _execute_api_call_with_retries(operation_func)

    if 'status' not in response:
      logs.log_error('Invalid compute engine operation %s.' % str(operation))
      return

    if response['status'] == 'DONE':
      return

    time.sleep(POLL_INTERVAL)

  logs.log_error('Compute engine operation %s timed out.' % str(operation))
def check_public_testcase(self, blob_info, testcase):
  """Check public testcase."""
  if blob_info.key() != testcase.minimized_keys:
    return False

  if not testcase.bug_information:
    return False

  issue_tracker = issue_tracker_utils.get_issue_tracker_for_testcase(
      testcase)
  issue = issue_tracker.get_issue(testcase.bug_information)
  if not issue:
    return False

  # If the issue is explicitly marked as view restricted to committers only
  # (OSS-Fuzz only), then don't allow public download.
  if 'restrict-view-commit' in issue.labels:
    return False

  # For OSS-Fuzz, delay the disclosure of the reproducer by 30 days.
  # If the deadline was already exceeded, the reproducer was already made
  # public, so exclude that case.
  if (utils.is_oss_fuzz() and 'deadline-exceeded' not in issue.labels and
      issue.closed_time and not dates.time_has_expired(
          issue.closed_time, days=_OSS_FUZZ_REPRODUCER_DELAY)):
    return False

  return True
def wait_until_good_state():
  """Check battery and make sure it is charged beyond the minimum level and
  within the temperature threshold."""
  # Battery checks are not applicable on cuttlefish (GCE) or automotive
  # devices.
  if environment.is_android_cuttlefish() or settings.is_automotive():
    return

  # Make sure device is online.
  adb.wait_for_device()

  # Skip battery check if done recently.
  last_battery_check_time = persistent_cache.get_value(
      LAST_BATTERY_CHECK_TIME_KEY,
      constructor=datetime.datetime.utcfromtimestamp)
  if last_battery_check_time and not dates.time_has_expired(
      last_battery_check_time, seconds=BATTERY_CHECK_INTERVAL):
    return

  # Initialize variables.
  battery_level_threshold = environment.get_value(
      'LOW_BATTERY_LEVEL_THRESHOLD', LOW_BATTERY_LEVEL_THRESHOLD)
  battery_temperature_threshold = environment.get_value(
      'MAX_BATTERY_TEMPERATURE_THRESHOLD', MAX_BATTERY_TEMPERATURE_THRESHOLD)
  device_restarted = False

  while True:
    battery_information = get_battery_level_and_temperature()
    if battery_information is None:
      logs.log_error('Failed to get battery information, skipping check.')
      return

    battery_level = battery_information['level']
    battery_temperature = battery_information['temperature']
    logs.log('Battery information: level (%d%%), temperature (%.1f celsius).' %
             (battery_level, battery_temperature))
    if (battery_level >= battery_level_threshold and
        battery_temperature <= battery_temperature_threshold):
      persistent_cache.set_value(LAST_BATTERY_CHECK_TIME_KEY, time.time())
      return

    logs.log('Battery in bad state, putting device in sleep mode.')

    if not device_restarted:
      adb.reboot()
      device_restarted = True

    # Change thresholds to expected levels (only if they were below minimum
    # thresholds).
    if battery_level < battery_level_threshold:
      battery_level_threshold = EXPECTED_BATTERY_LEVEL
    if battery_temperature > battery_temperature_threshold:
      battery_temperature_threshold = EXPECTED_BATTERY_TEMPERATURE

    # Stopping shell should help with shutting off a lot of services that would
    # otherwise use up the battery. However, we need to turn it back on to get
    # battery status information.
    adb.stop_shell()
    time.sleep(BATTERY_CHARGE_INTERVAL)
    adb.start_shell()
def update_tests_if_needed():
  """Updates layout tests every day."""
  data_directory = environment.get_value('FUZZ_DATA')
  error_occurred = False
  expected_task_duration = 60 * 60  # 1 hour.
  retry_limit = environment.get_value('FAIL_RETRIES')
  temp_archive = os.path.join(data_directory, 'temp.zip')
  tests_url = environment.get_value('WEB_TESTS_URL')

  # Check if we have a valid tests url.
  if not tests_url:
    return

  # Layout test updates are usually disabled to speed up local testing.
  if environment.get_value('LOCAL_DEVELOPMENT'):
    return

  # |UPDATE_WEB_TESTS| env variable can be used to control our update behavior.
  if not environment.get_value('UPDATE_WEB_TESTS'):
    return

  last_modified_time = persistent_cache.get_value(
      TESTS_LAST_UPDATE_KEY, constructor=datetime.datetime.utcfromtimestamp)
  if (last_modified_time is not None and not dates.time_has_expired(
      last_modified_time, days=TESTS_UPDATE_INTERVAL_DAYS)):
    return

  logs.log('Updating layout tests.')
  tasks.track_task_start(
      tasks.Task('update_tests', '', ''), expected_task_duration)

  # Download and unpack the tests archive.
  for _ in range(retry_limit):
    try:
      shell.remove_directory(data_directory, recreate=True)
      storage.copy_file_from(tests_url, temp_archive)
      archive.unpack(temp_archive, data_directory, trusted=True)
      shell.remove_file(temp_archive)
      error_occurred = False
      break
    except:
      logs.log_error(
          'Could not retrieve and unpack layout tests archive. Retrying.')
      error_occurred = True

  if not error_occurred:
    persistent_cache.set_value(
        TESTS_LAST_UPDATE_KEY, time.time(), persist_across_reboots=True)

  tasks.track_task_end()
def beat(previous_state, log_filename):
  """Run a cycle of heartbeat checks to ensure bot is running."""
  # Handle the case where the run_bot.py script is stuck. If so, kill its
  # process.
  task_end_time = tasks.get_task_end_time()
  if psutil and task_end_time and dates.time_has_expired(
      task_end_time, seconds=tasks.TASK_COMPLETION_BUFFER):
    # Get absolute path to |run_bot| script. We use this to identify unique
    # instances of the bot running on a particular host.
    startup_scripts_directory = environment.get_startup_scripts_directory()
    bot_file_path = os.path.join(startup_scripts_directory, 'run_bot')

    for process in psutil.process_iter():
      try:
        command_line = ' '.join(process.cmdline())
      except (psutil.AccessDenied, psutil.NoSuchProcess, OSError):
        continue

      # Find the process running the main bot script.
      if bot_file_path not in command_line:
        continue

      process_id = process.pid
      logs.log(
          'Killing stale bot (pid %d) which seems to be stuck.' % process_id)
      try:
        process_handler.terminate_root_and_child_processes(process_id)
      except Exception:
        logs.log_error('Failed to terminate stale bot processes.')

    # Minor cleanup to avoid disk space issues on bot restart.
    process_handler.terminate_stale_application_instances()
    shell.clear_temp_directory()
    shell.clear_testcase_directories()

    # The stale processes should be killed by now. Delete the stale task.
    tasks.track_task_end()

  # Figure out when the log file was last modified.
  try:
    current_state = str(os.path.getmtime(log_filename))
  except Exception:
    current_state = None

  # Only update the heartbeat if the log file was modified.
  if current_state and current_state != previous_state:
    # Try updating the heartbeat. If an error occurs, just
    # wait and return None.
    if not data_handler.update_heartbeat():
      return None
    # Heartbeat is successfully updated.

  return current_state
def add_test_accounts_if_needed():
  """Add test account to work with GmsCore, etc."""
  last_test_account_check_time = persistent_cache.get_value(
      constants.LAST_TEST_ACCOUNT_CHECK_KEY,
      constructor=datetime.datetime.utcfromtimestamp)
  needs_test_account_update = (
      last_test_account_check_time is None or dates.time_has_expired(
          last_test_account_check_time,
          seconds=ADD_TEST_ACCOUNT_CHECK_INTERVAL))
  if not needs_test_account_update:
    return

  config = db_config.get()
  if not config:
    return

  test_account_email = config.test_account_email
  test_account_password = config.test_account_password
  if not test_account_email or not test_account_password:
    return

  adb.run_as_root()
  wifi.configure(force_enable=True)

  if not app.is_installed(ADD_TEST_ACCOUNT_PKG_NAME):
    logs.log('Installing helper apk for adding test account.')
    android_directory = environment.get_platform_resources_directory()
    add_test_account_apk_path = os.path.join(android_directory,
                                             ADD_TEST_ACCOUNT_APK_NAME)
    app.install(add_test_account_apk_path)

  logs.log('Trying to add test account.')
  output = adb.run_shell_command(
      'am instrument -e account %s -e password %s -w %s' %
      (test_account_email, test_account_password,
       ADD_TEST_ACCOUNT_CALL_PATH),
      timeout=ADD_TEST_ACCOUNT_TIMEOUT)
  if not output or test_account_email not in output:
    logs.log('Failed to add test account, probably due to wifi issues.')
    return

  logs.log('Test account added successfully.')
  persistent_cache.set_value(constants.LAST_TEST_ACCOUNT_CHECK_KEY,
                             time.time())
def _is_data_bundle_up_to_date(data_bundle, data_bundle_directory):
  """Return true if the data bundle is up to date, false otherwise."""
  sync_file_path = _get_data_bundle_sync_file_path(data_bundle_directory)

  if environment.is_trusted_host() and data_bundle.sync_to_worker:
    from clusterfuzz._internal.bot.untrusted_runner import file_host
    worker_sync_file_path = file_host.rebase_to_worker_root(sync_file_path)
    shell.remove_file(sync_file_path)
    file_host.copy_file_from_worker(worker_sync_file_path, sync_file_path)

  if not os.path.exists(sync_file_path):
    return False

  last_sync_time = datetime.datetime.utcfromtimestamp(
      utils.read_data_from_file(sync_file_path))

  # Check if we recently synced.
  if not dates.time_has_expired(
      last_sync_time, seconds=_DATA_BUNDLE_SYNC_INTERVAL_IN_SECONDS):
    return True

  # For search index data bundles, we don't sync them from the bucket.
  # Instead, we rely on the fuzzer to generate testcases periodically.
  if _is_search_index_data_bundle(data_bundle.name):
    return False

  # Check when the bucket URL was last updated. If there are no new updates,
  # there is no need to update the directory.
  bucket_url = data_handler.get_data_bundle_bucket_url(data_bundle.name)
  last_updated_time = storage.last_updated(bucket_url)
  if last_updated_time and last_sync_time > last_updated_time:
    logs.log('Data bundle %s has no new content from last sync.' %
             data_bundle.name)
    return True

  return False
def get(self):
  """Handle a get request."""
  try:
    grouper.group_testcases()
  except:
    logs.log_error('Error occurred while grouping test cases.')
    return

  # Free up memory after group task run.
  utils.python_gc()

  # Get a list of jobs excluded from bug filing.
  excluded_jobs = _get_excluded_jobs()

  # Get a list of all jobs. This is used to filter testcases whose jobs have
  # been removed.
  all_jobs = data_handler.get_all_job_type_names()

  for testcase_id in data_handler.get_open_testcase_id_iterator():
    try:
      testcase = data_handler.get_testcase_by_id(testcase_id)
    except errors.InvalidTestcaseError:
      # Already deleted.
      continue

    # Skip if testcase's job is removed.
    if testcase.job_type not in all_jobs:
      continue

    # Skip if testcase's job is in exclusions list.
    if testcase.job_type in excluded_jobs:
      continue

    # Skip if we are running progression task at this time.
    if testcase.get_metadata('progression_pending'):
      continue

    # If the testcase has a bug filed already, no triage is needed.
    if _is_bug_filed(testcase):
      continue

    # Check if the crash is important, i.e. it is either a reproducible crash
    # or an unreproducible crash happening frequently.
    if not _is_crash_important(testcase):
      continue

    # Require that all tasks like minimization, regression testing, etc. have
    # finished.
    if not data_handler.critical_tasks_completed(testcase):
      continue

    # For testcases that are not part of a group, wait an additional time to
    # make sure they are grouped.
    # The grouper runs prior to this step in the same cron, but there is a
    # window of time where new testcases can come in after the grouper starts.
    # This delay needs to be longer than the maximum time the grouper can take
    # to account for that.
    # FIXME: In future, grouping might be dependent on regression range, so we
    # would have to add an additional wait time.
    if not testcase.group_id and not dates.time_has_expired(
        testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
      continue

    # If this project does not have an associated issue tracker, we cannot
    # file this crash anywhere.
    issue_tracker = issue_tracker_utils.get_issue_tracker_for_testcase(
        testcase)
    if not issue_tracker:
      issue_filer.notify_issue_update(testcase, 'new')
      continue

    # If there are similar issues to this test case already filed or recently
    # closed, skip filing a duplicate bug.
    if _check_and_update_similar_bug(testcase, issue_tracker):
      continue

    # Clean up old triage messages that would not be applicable now.
    testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)

    # File the bug first and then create filed bug metadata.
    try:
      issue_filer.file_issue(testcase, issue_tracker)
    except Exception as e:
      logs.log_error('Failed to file issue for testcase %d.' % testcase_id)
      _add_triage_message(
          testcase, f'Failed to file issue due to exception: {str(e)}')
      continue

    _create_filed_bug_metadata(testcase)
    issue_filer.notify_issue_update(testcase, 'new')

    logs.log('Filed new issue %s for testcase %d.' %
             (testcase.bug_information, testcase_id))
def _check_and_update_similar_bug(testcase, issue_tracker):
  """Get list of similar open issues and ones that were recently closed."""
  # Get similar testcases from the same group.
  similar_testcases_from_group = []
  if testcase.group_id:
    group_query = data_types.Testcase.query(
        data_types.Testcase.group_id == testcase.group_id)
    similar_testcases_from_group = ndb_utils.get_all_from_query(
        group_query, batch_size=data_types.TESTCASE_ENTITY_QUERY_LIMIT // 2)

  # Get testcases with the same crash params. These might not be in a group
  # if they were just fixed.
  same_crash_params_query = data_types.Testcase.query(
      data_types.Testcase.crash_type == testcase.crash_type,
      data_types.Testcase.crash_state == testcase.crash_state,
      data_types.Testcase.security_flag == testcase.security_flag,
      data_types.Testcase.project_name == testcase.project_name,
      data_types.Testcase.status == 'Processed')
  similar_testcases_from_query = ndb_utils.get_all_from_query(
      same_crash_params_query,
      batch_size=data_types.TESTCASE_ENTITY_QUERY_LIMIT // 2)

  for similar_testcase in itertools.chain(similar_testcases_from_group,
                                          similar_testcases_from_query):
    # Exclude ourself from comparison.
    if similar_testcase.key.id() == testcase.key.id():
      continue

    # Exclude similar testcases without bug information.
    if not similar_testcase.bug_information:
      continue

    # Get the issue object given its ID.
    issue = issue_tracker.get_issue(similar_testcase.bug_information)
    if not issue:
      continue

    # If the reproducible issue is not verified yet, the bug is still valid
    # and might be caused by non-availability of the latest builds. In that
    # case, don't file a new bug yet.
    if similar_testcase.open and not similar_testcase.one_time_crasher_flag:
      return True

    # If the issue is still open, no need to file a duplicate bug.
    if issue.is_open:
      return True

    # If the issue indicates that this crash needs to be ignored, no need to
    # file another one.
    policy = issue_tracker_policy.get(issue_tracker.project)
    ignore_label = policy.label('ignore')
    if ignore_label in issue.labels:
      _add_triage_message(
          testcase,
          ('Skipping filing a bug since similar testcase ({testcase_id}) in '
           'issue ({issue_id}) is blacklisted with {ignore_label} label.'
          ).format(
              testcase_id=similar_testcase.key.id(),
              issue_id=issue.id,
              ignore_label=ignore_label))
      return True

    # If the issue was recently closed, wait a certain time period to make
    # sure our fix verification has completed.
    if (issue.closed_time and not dates.time_has_expired(
        issue.closed_time, hours=data_types.MIN_ELAPSED_TIME_SINCE_FIXED)):
      _add_triage_message(
          testcase,
          ('Delaying filing a bug since similar testcase '
           '({testcase_id}) in issue ({issue_id}) was just fixed.').format(
               testcase_id=similar_testcase.key.id(), issue_id=issue.id))
      return True

  return False
def flash_to_latest_build_if_needed():
  """Flash the device to the latest build if needed, wiping user data and
  resetting the device to its original factory state."""
  if environment.get_value('LOCAL_DEVELOPMENT'):
    # Don't reimage local development devices.
    return

  run_timeout = environment.get_value('RUN_TIMEOUT')
  if run_timeout:
    # If we have a run timeout, then we are already scheduled to bail out and
    # will probably get re-imaged, e.g. using frameworks like Tradefed.
    return

  # Check if a flash is needed based on last recorded flash time.
  last_flash_time = persistent_cache.get_value(
      constants.LAST_FLASH_TIME_KEY,
      constructor=datetime.datetime.utcfromtimestamp)
  needs_flash = last_flash_time is None or dates.time_has_expired(
      last_flash_time, seconds=FLASH_INTERVAL)
  if not needs_flash:
    return

  is_google_device = settings.is_google_device()
  if is_google_device is None:
    logs.log_error('Unable to query device. Reimaging failed.')
    adb.bad_state_reached()
  elif not is_google_device:
    # We can't reimage these, skip.
    logs.log('Non-Google device found, skipping reimage.')
    return

  # Check if both |BUILD_BRANCH| and |BUILD_TARGET| environment variables
  # are set. If not, we don't have enough data for reimaging and hence
  # we bail out.
  branch = environment.get_value('BUILD_BRANCH')
  target = environment.get_value('BUILD_TARGET')
  if not target:
    # We default to userdebug configuration.
    build_params = settings.get_build_parameters()
    if build_params:
      target = build_params.get('target') + '-userdebug'

      # Cache target in environment. This is also useful for cases when
      # device is bricked and we don't have this information available.
      environment.set_value('BUILD_TARGET', target)

  if not branch or not target:
    logs.log_warn(
        'BUILD_BRANCH and BUILD_TARGET are not set, skipping reimage.')
    return

  image_directory = environment.get_value('IMAGES_DIR')
  build_info = fetch_artifact.get_latest_artifact_info(branch, target)
  if not build_info:
    logs.log_error('Unable to fetch information on latest build artifact for '
                   'branch %s and target %s.' % (branch, target))
    return

  if environment.is_android_cuttlefish():
    download_latest_build(build_info, FLASH_CUTTLEFISH_REGEXES,
                          image_directory)
    adb.recreate_cuttlefish_device()
    adb.connect_to_cuttlefish_device()
  else:
    download_latest_build(build_info, FLASH_IMAGE_REGEXES, image_directory)

    # We do one device flash at a time on one host, otherwise we run into
    # failures and the device being stuck in a bad state.
    flash_lock_key_name = 'flash:%s' % socket.gethostname()
    if not locks.acquire_lock(flash_lock_key_name, by_zone=True):
      logs.log_error('Failed to acquire lock for reimaging, exiting.')
      return

    logs.log('Reimaging started.')
    logs.log('Rebooting into bootloader mode.')
    for _ in range(FLASH_RETRIES):
      adb.run_as_root()
      adb.run_command(['reboot-bootloader'])
      time.sleep(FLASH_REBOOT_BOOTLOADER_WAIT)
      adb.run_fastboot_command(['oem', 'off-mode-charge', '0'])
      adb.run_fastboot_command(['-w', 'reboot-bootloader'])

      for partition, partition_image_filename in FLASH_IMAGE_FILES:
        partition_image_file_path = os.path.join(image_directory,
                                                 partition_image_filename)
        adb.run_fastboot_command(
            ['flash', partition, partition_image_file_path])
        if partition in ['bootloader', 'radio']:
          adb.run_fastboot_command(['reboot-bootloader'])

      # Disable ramdump to avoid capturing ramdumps during kernel crashes.
      # This causes device lockup of several minutes during boot, and we do
      # not intend to analyze the ramdumps ourselves.
      adb.run_fastboot_command(['oem', 'ramdump', 'disable'])

      adb.run_fastboot_command('reboot')
      time.sleep(FLASH_REBOOT_WAIT)

      if adb.get_device_state() == 'device':
        break
      logs.log_error('Reimaging failed, retrying.')

    locks.release_lock(flash_lock_key_name, by_zone=True)

  if adb.get_device_state() != 'device':
    logs.log_error('Unable to find device. Reimaging failed.')
    adb.bad_state_reached()

  logs.log('Reimaging finished.')

  # Reset all of our persistent keys after wipe.
  persistent_cache.delete_value(constants.BUILD_PROP_MD5_KEY)
  persistent_cache.delete_value(constants.LAST_TEST_ACCOUNT_CHECK_KEY)
  persistent_cache.set_value(constants.LAST_FLASH_BUILD_KEY, build_info)
  persistent_cache.set_value(constants.LAST_FLASH_TIME_KEY, time.time())
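

# The functions above all gate periodic work on dates.time_has_expired,
# typically by persisting a last-run timestamp (via persistent_cache) and
# skipping the task until a configured interval has elapsed. Below is a
# minimal, self-contained sketch of that pattern. The helper's real
# implementation is not shown in this section, so the version here is an
# assumption inferred from the call sites (keyword intervals in days, hours,
# minutes, or seconds; returns True once the interval has elapsed). The
# in-memory _cache dict and check_if_due helper are hypothetical stand-ins
# for illustration only.
import datetime


def time_has_expired(timestamp, days=0, hours=0, minutes=0, seconds=0):
  """Assumed behavior: True if |timestamp| (a UTC datetime) is older than
  the given interval."""
  interval = datetime.timedelta(
      days=days, hours=hours, minutes=minutes, seconds=seconds)
  return datetime.datetime.utcnow() - timestamp > interval


# Hypothetical in-memory stand-in for persistent_cache.
_cache = {}


def check_if_due(key, interval_seconds):
  """Return True (and record the time) if the periodic task should run now."""
  last_run = _cache.get(key)
  if last_run is not None and not time_has_expired(
      last_run, seconds=interval_seconds):
    return False

  _cache[key] = datetime.datetime.utcnow()
  return True


if __name__ == '__main__':
  print(check_if_due('battery_check', 30 * 60))  # True on the first call.
  print(check_if_due('battery_check', 30 * 60))  # False until 30 minutes pass.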