def GetTryBuilders(ci_builders):
  """Gets the set of try builders to query.

  A try builder is of interest if it mirrors a builder in |ci_builders|.

  Args:
    ci_builders: An iterable of strings, each element being the name of a
        Chromium CI builder that results will be/were queried from.

  Returns:
    A set of strings, each element being the name of a Chromium try builder to
    query results from.
  """
  logging.info('Getting try builders')
  mirrored_builders = set()
  no_output_builders = set()

  pool = multiprocessing_utils.GetProcessPool()
  results = pool.map(_GetMirroredBuildersForCiBuilder, ci_builders)

  for (builders, found_mirror) in results:
    if found_mirror:
      mirrored_builders |= builders
    else:
      no_output_builders |= builders

  if no_output_builders:
    raise RuntimeError(
        'Did not get Buildbucket output for the following builders. They may '
        'need to be added to the FAKE_TRY_BUILDERS or NON_CHROMIUM_BUILDERS '
        'mappings.\n%s' % '\n'.join(no_output_builders))

  logging.debug('Got %d try builders: %s', len(mirrored_builders),
                mirrored_builders)
  return mirrored_builders
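# Hypothetical usage sketch (the builder names below are illustrative only and
# do not come from any real configuration): the function takes an iterable of
# CI builder names and returns the set of try builders that mirror them.
#
#   ci_builders = ['ci-builder-a', 'ci-builder-b']
#   try_builders = GetTryBuilders(ci_builders)
#   logging.info('Will query %d try builders', len(try_builders))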
  def _FillExpectationMapForBuilders(self, expectation_map, builders,
                                     builder_type):
    """Fills |expectation_map| with results from |builders|.

    Args:
      expectation_map: A data_types.TestExpectationMap. Will be modified
          in-place.
      builders: A list of strings containing the names of builders to query.
      builder_type: A string containing the type of builder to query, either
          "ci" or "try".

    Returns:
      A dict containing any results that were retrieved that did not have a
      matching expectation in |expectation_map| in the following format:
      {
        |builder_type|:|builder_name| (str): [
          result1 (data_types.Result),
          result2 (data_types.Result),
          ...
        ],
      }
    """
    assert isinstance(expectation_map, data_types.TestExpectationMap)

    # Spin up a separate process for each query/add step. This is wasteful in
    # the sense that we'll have a bunch of idle processes once faster steps
    # start finishing, but ensures that we start slow queries early and avoids
    # the overhead of passing large amounts of data between processes. See
    # crbug.com/1182459 for more information on performance considerations.
    process_pool = multiprocessing_utils.GetProcessPool(nodes=len(builders))

    args = [(b, builder_type, expectation_map) for b in builders]
    results = process_pool.map(self._QueryAddCombined, args)

    tmp_expectation_map = data_types.TestExpectationMap()
    all_unmatched_results = {}

    for (unmatched_results, prefixed_builder_name, merge_map) in results:
      expectations.MergeExpectationMaps(tmp_expectation_map, merge_map,
                                        expectation_map)
      if unmatched_results:
        all_unmatched_results[prefixed_builder_name] = unmatched_results

    expectation_map.clear()
    expectation_map.update(tmp_expectation_map)

    return all_unmatched_results
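# This method only relies on a contract for self._QueryAddCombined, whose
# implementation lives elsewhere. Based on how |args| is built and |results|
# is unpacked above, the assumed shape is roughly the following sketch, with
# the actual query/merge logic elided:
#
#   def _QueryAddCombined(self, inputs):
#     builder, builder_type, expectation_map = inputs
#     # ... query |builder| and add its results to a copy of the map ...
#     prefixed_builder_name = '%s:%s' % (builder_type, builder)
#     return unmatched_results, prefixed_builder_name, merge_map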
def _FillExpectationMapForBuilders(expectation_map, builders, builder_type,
                                   suite, project, num_samples,
                                   large_query_mode):
  """Fills |expectation_map| with results from |builders|.

  Args:
    expectation_map: A dict in the format returned by
        expectations.CreateTestExpectationMap(). Will be modified in-place.
    builders: A list of strings containing the names of builders to query.
    builder_type: A string containing the type of builder to query, either
        "ci" or "try".
    suite: A string containing the name of the suite that is being queried
        for.
    project: A string containing the billing project to use for BigQuery.
    num_samples: An integer containing the number of builds to pull results
        from.
    large_query_mode: A boolean indicating whether large query mode should be
        used. In this mode, an initial, smaller query is made and its results
        are used to perform additional filtering on a second, larger query in
        BigQuery. This works around hitting a hard memory limit when running
        the ORDER BY clause.

  Returns:
    A dict containing any results that were retrieved that did not have a
    matching expectation in |expectation_map| in the following format:
    {
      |builder_type|:|builder_name| (str): [
        result1 (data_types.Result),
        result2 (data_types.Result),
        ...
      ],
    }
  """
  all_unmatched_results = {}

  # We use two separate pools since each is better suited to a different task.
  # Adding retrieved results to the expectation map is computationally
  # intensive, so properly parallelizing it results in a large speedup.
  # Python's default interpreter does not support true multithreading, and the
  # multiprocessing module throws a fit when using custom data types due to
  # pickling, so use pathos' ProcessPool for this, which is like
  # multiprocessing but handles all the pickling automatically.
  #
  # However, ProcessPool appears to add a non-trivial amount of overhead when
  # passing data back and forth, so use a thread pool for triggering BigQuery
  # queries. Each query is already started in its own subprocess, so the lack
  # of multithreading is not an issue. multiprocessing.pool.ThreadPool() is
  # not officially documented, but comes up frequently when looking for
  # information on Python thread pools and is used in other places in the
  # Chromium code base.
  #
  # Using two pools also allows us to start processing data while queries are
  # still running, since the latter spends most of its time waiting for the
  # query to complete.
  #
  # Since the ThreadPool is going to be idle most of the time, we can use many
  # more threads than we have logical cores.
  thread_count = 4 * multiprocessing.cpu_count()
  query_pool = multiprocessing.pool.ThreadPool(thread_count)
  result_pool = multiprocessing_utils.GetProcessPool()

  running_queries = set()
  running_adds = set()
  running_adds_lock = threading.Lock()

  def pass_query_result_to_add(result):
    bn, r = result
    arg = (expectation_map, builder_type, bn, r)
    running_adds_lock.acquire()
    running_adds.add(result_pool.apipe(_AddResultToMapMultiprocessing, arg))
    running_adds_lock.release()

  for b in builders:
    arg = (b, builder_type, suite, project, num_samples, large_query_mode)
    running_queries.add(
        query_pool.apply_async(QueryBuilder,
                               arg,
                               callback=pass_query_result_to_add))

  # We check the AsyncResult objects here because the provided callback only
  # gets called on success, and exceptions are not raised until the result is
  # retrieved. This can be removed whenever this is switched to Python 3, as
  # apply_async has an error_callback parameter there.
  while True:
    completed_queries = set()
    for rq in running_queries:
      if rq.ready():
        completed_queries.add(rq)
        rq.get()
    running_queries -= completed_queries
    if not len(running_queries):
      break
    time.sleep(ASYNC_RESULT_SLEEP_DURATION)

  # At this point, no more AsyncResults should be getting added to
  # |running_adds|, so we don't need to bother with the lock.
  add_results = []
  while True:
    completed_adds = set()
    for ra in running_adds:
      if ra.ready():
        completed_adds.add(ra)
        add_results.append(ra.get())
    running_adds -= completed_adds
    if not len(running_adds):
      break
    time.sleep(ASYNC_RESULT_SLEEP_DURATION)

  tmp_expectation_map = {}
  for (unmatched_results, prefixed_builder_name, merge_map) in add_results:
    _MergeExpectationMaps(tmp_expectation_map, merge_map, expectation_map)
    if unmatched_results:
      all_unmatched_results[prefixed_builder_name] = unmatched_results

  expectation_map.clear()
  expectation_map.update(tmp_expectation_map)

  return all_unmatched_results
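# The first polling loop above exists mainly because apply_async's callback
# only fires on success. Under Python 3, the error_callback parameter the
# comment alludes to could surface query failures without calling get() on
# every AsyncResult; a hedged sketch of that alternative (the handler name is
# made up for illustration):
#
#   def _OnQueryError(exc):
#     logging.error('Query failed: %s', exc)
#
#   query_pool.apply_async(QueryBuilder,
#                          arg,
#                          callback=pass_query_result_to_add,
#                          error_callback=_OnQueryError)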