def record_cross_pollination_stats(pruner_stats, pollinator_stats,
                                   project_qualified_name, sources, tag,
                                   initial_corpus_size,
                                   minimized_corpus_size_units, method):
  """Log stats about cross pollination in BigQuery."""
  # TODO(mpherman): Find a way to collect these stats for OSS Fuzz.
  if environment.is_untrusted_worker():
    return

  # BigQuery is not available in local development. This check is necessary
  # because the untrusted runner is in a separate process and can't be easily
  # mocked.
  if environment.get_value('LOCAL_DEVELOPMENT') or environment.get_value(
      'PY_UNITTESTS'):
    return

  if not pruner_stats or not pollinator_stats:
    return

  bigquery_row = {
      'project_qualified_name': project_qualified_name,
      'method': method,
      'sources': sources,
      'tags': tag if tag else '',
      'initial_corpus_size': initial_corpus_size,
      'corpus_size': minimized_corpus_size_units,
      'initial_edge_coverage': pruner_stats['edge_coverage'],
      'edge_coverage': pollinator_stats['edge_coverage'],
      'initial_feature_coverage': pruner_stats['feature_coverage'],
      'feature_coverage': pollinator_stats['feature_coverage']
  }

  client = big_query.Client(
      dataset_id='main', table_id='cross_pollination_statistics')
  client.insert([big_query.Insert(row=bigquery_row, insert_id=None)])

def _store_probabilities_in_bigquery(data):
  """Update a bigquery table containing the daily updated probability
  distribution over strategies."""
  bigquery_data = []

  # TODO(mukundv): Update once we choose a temperature parameter for final
  # implementation.
  for row in data:
    bigquery_row = {
        'strategy_name': row['strategy'],
        'probability_high_temperature': row['bandit_weight_high_temperature'],
        'probability_low_temperature': row['bandit_weight_low_temperature'],
        'probability_medium_temperature':
            row['bandit_weight_medium_temperature'],
        'run_count': row['run_count']
    }
    bigquery_data.append(big_query.Insert(row=bigquery_row, insert_id=None))

  client = big_query.Client(
      dataset_id='main', table_id='fuzz_strategy_experiments')
  client.insert(bigquery_data)

def get_last_crash_time(testcase):
  """Return the timestamp of the last crash with the same crash params as the
  given testcase."""
  client = big_query.Client()

  # json.dumps produces quoted, escaped SQL string literals for the values.
  where_clause = ('crash_type = {crash_type} AND '
                  'crash_state = {crash_state} AND '
                  'security_flag = {security_flag} AND '
                  'project = {project}').format(
                      crash_type=json.dumps(testcase.crash_type),
                      crash_state=json.dumps(testcase.crash_state),
                      security_flag=json.dumps(testcase.security_flag),
                      project=json.dumps(testcase.project_name),
                  )

  sql = """
SELECT hour
FROM main.crash_stats
WHERE {where_clause}
ORDER BY hour DESC
LIMIT 1
""".format(where_clause=where_clause)

  result = client.query(query=sql)
  if result and result.rows:
    return get_datetime(result.rows[0]['hour'])

  return None

def _query_multi_armed_bandit_probabilities():
  """Run BANDIT_PROBABILITY_QUERY (defined above) against BigQuery and return
  the resulting rows, ordered by strategy."""
  client = big_query.Client()
  return client.query(query=BANDIT_PROBABILITY_QUERY).rows

def _get_performance_features(fuzzer_name, job_type, datetime_start,
                              datetime_end):
  """Get raw performance features stored in BigQuery."""
  query_fields = [
      fuzzer_stats.QueryField(fuzzer_stats.TestcaseQuery.ALIAS, column, None)
      for column in constants.QUERY_COLUMNS
  ]

  # TODO(mmoroz): the query should be possible for datetime objects as well.
  query = fuzzer_stats.TestcaseQuery(
      fuzzer_name=fuzzer_name,
      job_types=[job_type],
      query_fields=query_fields,
      group_by=fuzzer_stats.QueryGroupBy.GROUP_BY_NONE,
      date_start=datetime_start.date(),
      date_end=datetime_end.date())

  client = big_query.Client()
  try:
    result = client.query(query=query.build())
  except Exception as e:
    logging.error('Exception during BigQuery request: %s\n', str(e))
    raise helpers.EarlyExitException('Internal error.', 500)

  if not result.rows:
    raise helpers.EarlyExitException('No stats.', 404)

  return result

def _record_cross_pollination_stats(stats):
  """Log stats about cross pollination in BigQuery."""
  # If no stats were gathered due to a timeout or lack of corpus, return.
  if not stats:
    return

  # BigQuery is not available in local development. This check is necessary
  # because the untrusted runner is in a separate process and can't be easily
  # mocked.
  if environment.get_value('LOCAL_DEVELOPMENT') or environment.get_value(
      'PY_UNITTESTS'):
    return

  bigquery_row = {
      'project_qualified_name': stats.project_qualified_name,
      'method': stats.method,
      'sources': stats.sources,
      'tags': stats.tag if stats.tag else '',
      'initial_corpus_size': stats.initial_corpus_size,
      'corpus_size': stats.minimized_corpus_size_units,
      'initial_edge_coverage': stats.initial_edge_coverage,
      'edge_coverage': stats.edge_coverage,
      'initial_feature_coverage': stats.initial_feature_coverage,
      'feature_coverage': stats.feature_coverage
  }

  client = big_query.Client(
      dataset_id='main', table_id='cross_pollination_statistics')
  client.insert([big_query.Insert(row=bigquery_row, insert_id=None)])

def test_multiple_page_with_limit(self):
  """Test multiple pages with a limit."""
  self.query.execute.return_value = self._make_resp('tok', True, 2)
  self.get_query_results.execute.side_effect = [
      self._make_resp('tok2', True, 2, 3),
      self._make_resp('tok3', True, 2, 3),
      self._make_resp('tok4', True, 2, 3),
      self._make_resp(None, True, 2, 1)
  ]

  client = big_query.Client()
  result = client.query(
      'sql', timeout=10, max_results=100, offset=50, limit=10)

  self.assertEqual([{'t': 1}] * 10, result.rows)
  self.assertEqual(2, result.total_count)

  self.query.execute.assert_called_once_with()
  self.jobs.query.assert_called_once_with(
      projectId='project',
      body={
          'query': 'sql',
          'timeoutMs': 60000,
          'useLegacySql': False,
          'maxResults': 0
      })

  # Each page returns 3 rows (the last returns 1), so the requested
  # maxResults shrinks by the rows collected so far: 10 -> 7 -> 4 -> 1.
  self.jobs.getQueryResults.assert_has_calls([
      mock.call(
          projectId='project',
          jobId='job',
          startIndex=50,
          pageToken=None,
          timeoutMs=60000,
          maxResults=10),
      mock.call().execute(),
      mock.call(
          projectId='project',
          jobId='job',
          startIndex=0,
          pageToken='tok2',
          timeoutMs=60000,
          maxResults=7),
      mock.call().execute(),
      mock.call(
          projectId='project',
          jobId='job',
          startIndex=0,
          pageToken='tok3',
          timeoutMs=60000,
          maxResults=4),
      mock.call().execute(),
      mock.call(
          projectId='project',
          jobId='job',
          startIndex=0,
          pageToken='tok4',
          timeoutMs=60000,
          maxResults=1),
      mock.call().execute()
  ])

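# A minimal sketch (not the real client code) of the pagination the test
# above exercises, assuming what the mocked calls suggest: the first
# getQueryResults call addresses rows by startIndex=offset, later pages
# follow pageToken with startIndex=0, and maxResults shrinks by the rows
# collected so far until the limit is reached. `fetch_page` is a
# hypothetical callable standing in for jobs.getQueryResults(...).execute().
def _paginate(fetch_page, offset, limit):
  """Collect up to `limit` rows, paging with tokens after the first page."""
  rows = []
  page_token = None
  start_index = offset
  while len(rows) < limit:
    response = fetch_page(
        startIndex=start_index,
        pageToken=page_token,
        maxResults=limit - len(rows))
    rows.extend(response.get('rows', []))
    page_token = response.get('pageToken')
    if not page_token:
      break
    start_index = 0  # Subsequent pages are addressed by token, not index.
  return rows[:limit]
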
def get(self):
  """Process all fuzz targets and update FuzzTargetJob weights."""
  client = big_query.Client()

  update_target_weights_for_engine(client, 'libFuzzer',
                                   LIBFUZZER_SPECIFICATIONS)
  update_target_weights_for_engine(client, 'afl', AFL_SPECIFICATIONS)

  store_current_weights_in_bigquery()

def get(params, query, offset, limit):
  """Get the data from BigQuery."""
  sql = SQL.format(
      table_id='%ss' % params['type'],
      where_clause=query.get_where_clause(),
      prefix=params['type'],
      offset=offset,
      limit=limit)

  client = big_query.Client()
  result = client.query(query=sql, offset=offset, limit=limit)
  return result.rows, result.total_count

def test_not_complete(self):
  """Test jobComplete=false for the first few requests."""
  self.query.execute.return_value = self._make_resp(None, False, 4)
  self.get_query_results.execute.side_effect = [
      self._make_resp(None, False, 4),
      self._make_resp('tok', True, 8),
      self._make_resp(None, True, 8)
  ]

  client = big_query.Client()
  result = client.query('sql', timeout=10, max_results=100, offset=50)

  self.assertEqual([{'t': 1}, {'t': 1}], result.rows)
  self.assertEqual(8, result.total_count)

  self.query.execute.assert_called_once_with()
  self.jobs.query.assert_called_once_with(
      projectId='project',
      body={
          'query': 'sql',
          'timeoutMs': 60000,
          'useLegacySql': False,
          'maxResults': 0
      })

  # The first getQueryResults response has jobComplete=false, so the same
  # call is retried before pagination continues with the returned token.
  self.jobs.getQueryResults.assert_has_calls([
      mock.call(
          projectId='project',
          jobId='job',
          timeoutMs=60000,
          maxResults=100,
          pageToken=None,
          startIndex=50),
      mock.call().execute(),
      mock.call(
          projectId='project',
          jobId='job',
          timeoutMs=60000,
          maxResults=100,
          pageToken=None,
          startIndex=50),
      mock.call().execute(),
      mock.call(
          projectId='project',
          jobId='job',
          pageToken='tok',
          timeoutMs=60000,
          maxResults=100,
          startIndex=0),
      mock.call().execute(),
  ])

def store_current_weights_in_bigquery():
  """Update a bigquery table containing the daily stats."""
  rows = []
  target_jobs = ndb_utils.get_all_from_model(data_types.FuzzTargetJob)

  for target_job in target_jobs:
    row = {
        'fuzzer': target_job.fuzz_target_name,
        'job': target_job.job,
        'weight': target_job.weight
    }
    rows.append(big_query.Insert(row=row, insert_id=None))

  client = big_query.Client(dataset_id='main', table_id='fuzzer_weights')
  client.insert(rows)

def get_start_hour():
  """Get the start hour from the first crash."""
  client = big_query.Client()

  sql = """
SELECT min(CAST(FLOOR(UNIX_SECONDS(created_at) / 3600) AS INT64)) as min_hour
FROM main.crashes
"""

  result = client.query(query=sql)
  if result and result.rows:
    return result.rows[0]['min_hour']

  return 0

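# The crash tables index time as whole hours since the Unix epoch, as the
# FLOOR(UNIX_SECONDS(created_at) / 3600) expression above shows. A minimal
# sketch of converting between that hour index and a UTC datetime; these
# helpers are illustrative and not necessarily the `get_datetime` used
# elsewhere in this section.
import calendar
import datetime


def _hour_index_to_datetime(hour_index):
  """Hour index (hours since the Unix epoch) -> UTC datetime."""
  return datetime.datetime.utcfromtimestamp(hour_index * 3600)


def _datetime_to_hour_index(dt):
  """UTC datetime -> hour index, mirroring FLOOR(UNIX_SECONDS(ts) / 3600)."""
  return calendar.timegm(dt.utctimetuple()) // 3600
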
def _do_bigquery_query(query):
  """Return results from BigQuery."""
  logs.log(query)
  client = big_query.Client()

  try:
    results = client.raw_query(query, max_results=10000)
  except HttpError as e:
    raise helpers.EarlyExitException(str(e), 500)

  if 'rows' not in results:
    raise helpers.EarlyExitException('No stats.', 404)

  return results

def test_init(self):
  """Test __init__."""
  client = big_query.Client(dataset_id='data', table_id='table')
  self.assertEqual('built', client.client)
  self.assertEqual('project', client.project_id)
  self.assertEqual('data', client.dataset_id)
  self.assertEqual('table', client.table_id)

  # `self.mock.build.assert_called_once_with('bigquery', 'v2')` doesn't work
  # because `discovery.build` is decorated with @positional. Therefore, we
  # need the below.
  args, _ = self.mock.build.call_args
  self.assertEqual(('bigquery', 'v2'), args)

def _store_probabilities_in_bigquery(data):
  """Update a bigquery table containing the daily updated probability
  distribution over strategies."""
  bigquery_data = []

  for row in data:
    bigquery_row = {
        'strategy_name': row['strategy'],
        'probability': row['bandit_weight'],
        'run_count': row['run_count']
    }
    bigquery_data.append(big_query.Insert(row=bigquery_row, insert_id=None))

  client = big_query.Client(
      dataset_id='main', table_id='fuzz_strategy_probability')
  client.insert(bigquery_data)

def _query_multi_armed_bandit_probabilities():
  """Run BANDIT_PROBABILITY_QUERY (defined above) against BigQuery and return
  the resulting rows, ordered by strategy."""
  client = big_query.Client()
  formatted_query = BANDIT_PROBABILITY_QUERY.format(
      high_temperature_query=BANDIT_PROBABILITY_SUBQUERY.format(
          temperature_type='high',
          temperature_value=HIGH_TEMPERATURE_PARAMETER),
      low_temperature_query=BANDIT_PROBABILITY_SUBQUERY.format(
          temperature_type='low',
          temperature_value=LOW_TEMPERATURE_PARAMETER),
      medium_temperature_query=BANDIT_PROBABILITY_SUBQUERY.format(
          temperature_type='medium',
          temperature_value=MEDIUM_TEMPERATURE_PARAMETER))
  return client.query(query=formatted_query).rows

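# The temperature parameters above control how sharply the bandit weights
# concentrate on the best-performing strategies. A minimal sketch of the
# usual softmax-with-temperature weighting, under the assumption (not
# confirmed by the query text itself) that BANDIT_PROBABILITY_SUBQUERY
# computes something equivalent over per-strategy performance scores: low
# temperatures favor exploitation, high temperatures favor exploration.
import math


def _softmax_with_temperature(scores, temperature):
  """Return weights proportional to exp(score / temperature)."""
  max_score = max(scores)  # Subtract the max for numerical stability.
  exps = [math.exp((s - max_score) / temperature) for s in scores]
  total = sum(exps)
  return [e / total for e in exps]
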
def test_insert(self):
  """Test calling insertAll API."""
  underlying = mock.MagicMock()
  tabledata = mock.MagicMock()
  insert_all = mock.MagicMock()

  underlying.tabledata.return_value = tabledata
  tabledata.insertAll.return_value = insert_all
  insert_all.execute.return_value = {'test': 1}

  self.mock.get_api_client.return_value = underlying

  client = big_query.Client(dataset_id='data', table_id='table')
  self.assertDictEqual({'test': 1},
                       client.insert([
                           big_query.Insert({'a': 1}, 'prefix:0'),
                           big_query.Insert({'b': 2}, 'prefix:1')
                       ]))

  tabledata.insertAll.assert_called_once_with(
      projectId='project',
      datasetId='data',
      tableId='table',
      body={
          'kind': 'bigquery#tableDataInsertAllRequest',
          'rows': [{
              'json': {'a': 1},
              'insertId': 'prefix:0'
          }, {
              'json': {'b': 2},
              'insertId': 'prefix:1'
          }]
      })

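# Note on the insertId values above: BigQuery's streaming insert API uses
# insertId for best-effort deduplication, so rows re-sent with the same id
# within a short window may be dropped as duplicates. Callers that don't
# need deduplication (e.g. the daily weight and probability uploads in this
# section) pass insert_id=None.
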
def test_timeout(self):
  """Test timeout."""
  self.query.execute.return_value = self._make_resp(None, False, 3)

  self.count = 0

  def get_query_results(**unused_kwargs):
    self.count += 1
    if self.count >= 3:
      self.mock.time.return_value = 100

    return self._make_resp('tok%d' % self.count, False, 3)

  self.get_query_results.execute.side_effect = get_query_results

  client = big_query.Client()
  with self.assertRaises(Exception) as cm:
    client.query('sql', timeout=10, max_results=100, offset=50)

  self.assertEqual("Timeout: the query doesn't finish within 10 seconds.",
                   cm.exception.message)

  self.query.execute.assert_called_once_with()
  self.jobs.query.assert_called_once_with(
      projectId='project',
      body={
          'query': 'sql',
          'timeoutMs': 60000,
          'useLegacySql': False,
          'maxResults': 0
      })
  self.jobs.getQueryResults.assert_has_calls([
      mock.call(
          projectId='project',
          jobId='job',
          timeoutMs=60000,
          maxResults=100,
          pageToken=None,
          startIndex=50),
      mock.call().execute(),
  ] * 3)

def test_one_page(self):
  """Test one page."""
  self.query.execute.return_value = self._make_resp(None, True, 1)
  self.get_query_results.execute.return_value = self._make_resp(None, True, 1)

  client = big_query.Client()
  result = client.query('sql', timeout=10, max_results=100)

  self.assertEqual([{'t': 1}], result.rows)
  self.assertEqual(1, result.total_count)

  self.query.execute.assert_called_once_with()
  self.get_query_results.execute.assert_called_once_with()
  self.jobs.query.assert_called_once_with(
      projectId='project',
      body={
          'query': 'sql',
          'timeoutMs': 60000,
          'useLegacySql': False,
          'maxResults': 0
      })

def _query_multi_armed_bandit_probabilities(engine):
  """Run BANDIT_PROBABILITY_QUERY_FORMAT (defined above) against BigQuery for
  the given engine and return the resulting rows, ordered by strategy."""
  strategy_names_list = [
      strategy_entry.name for strategy_entry in engine.query_strategy_list
  ]
  strategies_subquery = '\n'.join(
      STRATEGY_SUBQUERY_FORMAT.format(strategy_name=strategy_name)
      for strategy_name in strategy_names_list)
  client = big_query.Client()
  strategies = ','.join(
      'strategy_' + strategy_name for strategy_name in strategy_names_list)
  formatted_query = BANDIT_PROBABILITY_QUERY_FORMAT.format(
      performance_metric=engine.performance_metric,
      temperature_value=TEMPERATURE_PARAMETER,
      strategies=strategies,
      strategies_subquery=strategies_subquery,
      engine=engine.name)
  return client.query(query=formatted_query).rows

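# For illustration, with hypothetical strategy names 'corpus_subset' and
# 'value_profile' in engine.query_strategy_list, the code above expands to:
#   strategies = 'strategy_corpus_subset,strategy_value_profile'
#   strategies_subquery = STRATEGY_SUBQUERY_FORMAT rendered once per
#   strategy, joined by newlines.
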
def test_query(self):
  """Test calling query API."""
  underlying = mock.MagicMock()
  jobs = mock.MagicMock()
  query = mock.MagicMock()

  underlying.jobs.return_value = jobs
  jobs.query.return_value = query
  query.execute.return_value = {'test': 1}

  self.mock.get_api_client.return_value = underlying

  client = big_query.Client()
  self.assertDictEqual({'test': 1}, client.raw_query('sql', max_results=100))

  jobs.query.assert_called_once_with(
      projectId='project',
      body={
          'query': 'sql',
          'timeoutMs': 60000,
          'maxResults': 100,
          'useLegacySql': False
      })

def build(end_hour):
  """Build crash stats for the end hour."""
  logging.info('Started building crash stats for %s.',
               crash_stats.get_datetime(end_hour))

  job_id = JOB_ID_TEMPLATE.format(unique_number=int(time.time()))
  client = big_query.Client()
  make_request(client, job_id, end_hour)

  # Poll the job status every 10 seconds until it completes or we hit the
  # timeout.
  start_time = time.time()
  while (time.time() - start_time) < TIMEOUT:
    time.sleep(10)

    result = client.get_job(job_id)
    logging.info('Checking %s', json.dumps(result))

    if result['status']['state'] == 'DONE':
      if result['status'].get('errors'):
        raise Exception(json.dumps(result))
      return

  raise Exception('Building crash stats exceeded %d seconds.' % TIMEOUT)

def _store_probabilities_in_bigquery(engine, data):
  """Update a bigquery table containing the daily updated probability
  distribution over strategies."""
  bigquery_data = []

  # TODO(mukundv): Update once we choose a temperature parameter for final
  # implementation.
  for row in data:
    bigquery_row = {
        'strategy_name': row['strategy'],
        'probability': row['bandit_weight'],
        'engine': engine.name
    }
    bigquery_data.append(big_query.Insert(row=bigquery_row, insert_id=None))

  if bigquery_data:
    client = big_query.Client(
        dataset_id='main', table_id='fuzz_strategy_probability')
    client.insert(bigquery_data)
  else:
    logs.log('No fuzz strategy distribution data was found to upload to '
             'BigQuery.')

def get(end, days, block, group_by, where_clause, group_having_clause,
        sort_by, offset, limit):
  """Query from BigQuery given the params."""
  if where_clause:
    where_clause = '(%s) AND ' % where_clause

  start = end - (days * 24) + 1
  where_clause += '(hour BETWEEN %d AND %d) AND ' % (start, end)
  where_clause += ('(_PARTITIONTIME BETWEEN TIMESTAMP_TRUNC("%s", DAY) '
                   'AND TIMESTAMP_TRUNC("%s", DAY))' %
                   (get_datetime(start).strftime('%Y-%m-%d'),
                    get_datetime(end).strftime('%Y-%m-%d')))

  time_span = 1 if block == 'hour' else 24
  remainder = get_remainder_for_index(end, time_span)

  if group_having_clause:
    group_having_clause = 'HAVING ' + group_having_clause

  sql = SQL.format(
      time_span=time_span,
      remainder=remainder,
      group_by=group_by,
      where_clause=where_clause,
      group_having_clause=group_having_clause,
      sort_by=sort_by)

  client = big_query.Client()
  result = client.query(query=sql, offset=offset, limit=limit)

  items = []
  for row in result.rows:
    avg_crash_time_in_ms = row['sum_crash_time_in_ms'] // row['total_count']

    for group in row['groups']:
      for index in group['indices']:
        index['hour'] = convert_index_to_hour(index['index'], time_span,
                                              remainder)

    items.append({
        'projectName': row['project'],
        'crashType': row['crash_type'],
        'crashState': row['crash_state'],
        'isSecurity': row['security_flag'],
        'isReproducible': row['is_reproducible'],
        'isNew': row['is_new'],
        'totalCount': row['total_count'],
        'crashTime': {
            'min': row['min_crash_time_in_ms'],
            'max': row['max_crash_time_in_ms'],
            'avg': avg_crash_time_in_ms,
            # Population standard deviation via the shortcut
            # Var(X) = E[X^2] - E[X]^2, computed with integer division.
            'std': math.sqrt((row['sum_square_crash_time_in_ms'] //
                              row['total_count']) -
                             (avg_crash_time_in_ms * avg_crash_time_in_ms))
        },
        'groups': row['groups'],
        'days': days,
        'block': block,
        'end': end + 1  # Convert to UI's end.
    })

  return result.total_count, items

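# Quick sanity check of the shortcut used for 'std' above: for values
# x_1..x_n, E[X^2] - E[X]^2 is the population variance. A minimal sketch
# with float arithmetic; the production code uses integer division, which
# introduces up to a millisecond of rounding.
import math


def _population_std(values):
  """Population standard deviation via sqrt(E[X^2] - E[X]^2)."""
  n = len(values)
  mean = sum(values) / n
  mean_of_squares = sum(v * v for v in values) / n
  return math.sqrt(mean_of_squares - mean * mean)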