def test_claim_objects(jm, sample_data):
    """``claim_objects`` claims & returns unclaimed rows up to a limit."""
    blobs = []
    blob_lookup = set()
    for job in sample_data.job_data[:3]:
        blobs.append(job)
        blob_lookup.add(json.dumps(job))

    jm.store_job_data(blobs)

    rows1 = jm.claim_objects(2)

    # a separate worker with a separate connection
    from treeherder.model.derived.jobs import JobsModel
    jm2 = JobsModel(jm.project)

    rows2 = jm2.claim_objects(2)

    loading_rows = jm.get_dhub(jm.CT_OBJECTSTORE).execute(
        proc="objectstore_test.counts.loading")[0]["loading_count"]

    jm.disconnect()
    jm2.disconnect()

    assert len(rows1) == 2

    # second worker asked for two rows but only got the one that was left
    assert len(rows2) == 1

    # all three blobs were fetched by one of the workers
    for r in rows1 + rows2:
        assert r['json_blob'] in blob_lookup

    # the blobs are all marked as "loading" in the database
    assert loading_rows == 3
def unstructured_bugs(self):
    """
    Get bugs that match this line in the Bug Suggestions artifact for this job.
    """
    components = self._serialized_components()
    if not components:
        return []

    # Importing this at the top level causes circular import misery
    from treeherder.model.derived import JobsModel, ArtifactsModel

    with JobsModel(self.repository.name) as jm, \
            ArtifactsModel(self.repository.name) as am:
        job_id = jm.get_job_ids_by_guid([self.job_guid])[self.job_guid]["id"]
        bug_suggestions = am.filter_bug_suggestions(
            am.bug_suggestions(job_id))

    rv = []
    ids_seen = set()
    for item in bug_suggestions:
        if all(component in item["search"] for component in components):
            for suggestion in itertools.chain(item["bugs"]["open_recent"],
                                              item["bugs"]["all_others"]):
                if suggestion["id"] not in ids_seen:
                    ids_seen.add(suggestion["id"])
                    rv.append(suggestion)

    return rv
def publish_job_action(project, action, job_id, requester):
    """
    Generic task to issue pulse notifications when job actions occur
    (retrigger/cancel).

    :param project str: The name of the project this action was requested for.
    :param action str: The type of action performed (retrigger/cancel/etc.)
    :param job_id str: The job id the action was requested for.
    :param requester str: The email address associated with the request.
    """
    publisher = pulse_connection.get_publisher()
    if not publisher:
        return

    from treeherder.model.derived.jobs import JobsModel
    with JobsModel(project) as jm:
        job = jm.get_job(job_id)[0]
        refdata = jm.get_job_reference_data(job['signature'])

        publisher.job_action(
            version=1,
            build_system_type=refdata['build_system_type'],
            project=project,
            action=action,
            job_guid=job['job_guid'],
            # Job id is included for convenience as you need it in some cases
            # instead of job_guid...
            job_id=job['id'],
            requester=requester
        )
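A minimal usage sketch for the task above, calling it directly with the parameters its docstring describes. The project name, job id and requester address are illustrative placeholders, not values taken from this code, and the sketch assumes the function is already importable in scope.

# Hypothetical, direct invocation; all argument values are illustrative only.
publish_job_action(
    project="mozilla-inbound",
    action="retrigger",
    job_id="12345",
    requester="sheriff@example.com",
)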
def save(self, *args, **kwargs):
    super(BugJobMap, self).save(*args, **kwargs)

    # FIXME: using the JobsModel here is pretty horrible -- remove
    # when we move jobs table to central db
    from treeherder.model.derived.jobs import JobsModel
    from treeherder.etl.tasks import submit_elasticsearch_doc

    with JobsModel(self.job.repository.name) as jm:
        if settings.ORANGEFACTOR_HAWK_KEY:
            ds_job = jm.get_job(self.job.project_specific_id)[0]
            if ds_job["state"] == "completed":
                # Submit bug associations to Elasticsearch using an async
                # task.
                submit_elasticsearch_doc.apply_async(
                    args=[
                        self.job.repository.name,
                        self.job.project_specific_id,
                        self.bug_id,
                        int(time.mktime(self.created.timetuple())),
                        self.who
                    ],
                    routing_key='classification_mirroring'
                )

        # if we have a user, then update the autoclassification relations
        if self.user:
            jm.update_autoclassification_bug(self.job.project_specific_id,
                                             self.bug_id)
def jm(request, test_repository, jobs_ds):
    """ Give a test access to a JobsModel instance. """
    model = JobsModel(jobs_ds.project)

    # patch in additional test-only procs on the datasources
    add_test_procs_file(
        model.get_dhub(),
        model.get_datasource().key,
        "jobs_test.json",
    )

    def fin():
        model.disconnect()
    request.addfinalizer(fin)

    return model
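A short sketch of how a test might consume this fixture, modelled on test_claim_objects above; it assumes the same sample_data fixture is available and only checks that stored rows can be claimed back.

def test_store_and_claim_sketch(jm, sample_data):
    # Illustrative only: store two sample jobs, then claim them back
    # through the JobsModel instance the fixture provides.
    jm.store_job_data(sample_data.job_data[:2])
    rows = jm.claim_objects(2)
    assert len(rows) == 2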
def process_job_list(self, all_jobs_list):
    if not isinstance(all_jobs_list, list):
        all_jobs_list = [all_jobs_list]

    validated_jobs = self._get_validated_jobs_by_project(all_jobs_list)

    for project, job_list in validated_jobs.items():
        newrelic.agent.add_custom_parameter("project", project)
        with JobsModel(project) as jobs_model:
            storeable_job_list = []
            for pulse_job in job_list:
                if pulse_job["state"] != "unscheduled":
                    try:
                        self.clean_revision(pulse_job, jobs_model)
                        storeable_job_list.append(
                            self.transform(pulse_job)
                        )
                    except AttributeError:
                        logger.warn("Skipping job due to bad attribute",
                                    exc_info=1)

            try:
                jobs_model.store_job_data(storeable_job_list)
            except DatasetNotFoundError:
                logger.warn(
                    "Job with unsupported project: {}".format(project))
def publish_job_action(project, action, job_id, requester):
    """
    Generic task to issue pulse notifications when job actions occur
    (retrigger/cancel).

    :param project str: The name of the project this action was requested for.
    :param action str: The type of action performed (retrigger/cancel/etc.)
    :param job_id str: The job id the action was requested for.
    :param requester str: The email address associated with the request.
    """
    newrelic.agent.add_custom_parameter("project", project)
    newrelic.agent.add_custom_parameter("action", action)
    newrelic.agent.add_custom_parameter("job_id", job_id)
    newrelic.agent.add_custom_parameter("requester", requester)

    publisher = pulse_connection.get_publisher()
    if not publisher:
        return

    from treeherder.model.derived.jobs import JobsModel
    with JobsModel(project) as jm:
        job = jm.get_job(job_id)[0]

        publisher.job_action(
            version=1,
            build_system_type=ReferenceDataSignatures.objects.values_list(
                'build_system_type', flat=True).get(
                    signature=job['signature']),
            project=project,
            action=action,
            job_guid=job['job_guid'],
            # Job id is included for convenience as you need it in some cases
            # instead of job_guid...
            job_id=job['id'],
            requester=requester
        )
def populate_performance_series(project, series_type, series_data):
    from treeherder.model.derived.jobs import JobsModel
    with JobsModel(project) as jm:
        for t_range in settings.TREEHERDER_PERF_SERIES_TIME_RANGES:
            for signature in series_data:
                jm.store_performance_series(
                    t_range['seconds'],
                    series_type,
                    signature,
                    series_data[signature]
                )
def _lock_unlock():
    with JobsModel(test_project) as jm2:
        jm2.get_dhub().execute(
            proc='generic.locks.get_lock',
            placeholders=[FakePerfData.get_fake_lock_string()])
        time.sleep(1)
        jm2.get_dhub().execute(
            proc='generic.locks.release_lock',
            placeholders=[FakePerfData.get_fake_lock_string()])
def _calculate_etag(view_instance, view_method, request, args, kwargs):
    project, interval = (kwargs.get('project'),
                         request.QUERY_PARAMS.get('interval'))
    if project and interval:
        return cache.get(JobsModel.get_performance_series_cache_key(
            project, interval, hash=True))

    return None
def calculate_durations(sample_window_seconds=21600, debug=False):
    from treeherder.model.derived.jobs import JobsModel

    projects = Repository.objects.filter(
        active_status='active').values_list('name', flat=True)

    for project in projects:
        with JobsModel(project) as jm:
            jm.calculate_durations(sample_window_seconds, debug)
def _calculate_etag(view_instance, view_method, request, args, kwargs):
    project, interval = (kwargs.get('project'),
                         request.QUERY_PARAMS.get('interval'))
    if project and interval:
        return cache.get(
            JobsModel.get_performance_series_cache_key(project, interval,
                                                       hash=True))

    return None
def _update_failure_classification(self):
    # update the job classification
    from treeherder.model.derived.jobs import JobsModel
    with JobsModel(self.job.repository.name) as jm:
        jm.update_last_job_classification(self.job.project_specific_id)

        # if a manually filed job, update the autoclassification information
        if self.user:
            if self.failure_classification.name in [
                    "intermittent", "intermittent needs filing"]:
                failure_line = jm.get_manual_classification_line(
                    self.job.project_specific_id)
                if failure_line:
                    failure_line.update_autoclassification()
def _add_series(server_params, project, time_intervals, signature_hash,
                signature_props, mysql_debug, verbose):
    with JobsModel(project) as jm:
        jm.DEBUG = mysql_debug
        if verbose:
            print(signature_hash)
        jm.set_series_signature(signature_hash, signature_props)
        for time_interval in time_intervals:
            pc = PerfherderClient(protocol=server_params.scheme,
                                  host=server_params.netloc)
            series = pc.get_performance_series(
                project, signature_hash, time_interval=time_interval)
            jm.store_performance_series(
                time_interval, 'talos_data', str(signature_hash), series)
def store_jobs(job_collections, chunk_size):
    errors = []
    for repository, jobs in job_collections.iteritems():
        with JobsModel(repository) as jm:
            for collection in jobs.get_chunks(chunk_size=chunk_size):
                try:
                    collection.validate()
                    jm.store_job_data(collection.get_collection_data())
                except Exception:
                    newrelic.agent.record_exception()
                    errors.append({
                        "project": repository,
                        "collection": "job",
                        "message": traceback.format_exc()
                    })

    if errors:
        raise common.CollectionNotStoredException(errors)
def publish_resultset(project, ids):
    # If we don't have a publisher (because of missing configs), then we can't
    # publish any pulse messages. This is okay: local installs etc. don't need
    # to publish on pulse, and requiring a pulse user would add more overhead
    # to an already large development setup process.
    publisher = pulse_connection.get_publisher()
    if not publisher:
        return

    from treeherder.model.derived.jobs import JobsModel
    with JobsModel(project) as jm:
        # Publish messages with new result-sets
        for entry in jm.get_result_set_list_by_ids(ids):
            repository = jm.refdata_model.get_repository_info(
                entry['repository_id'])
            if repository is None:
                return
            entry['repository_url'] = repository['url']

            # Don't expose these properties; they appear to be internal,
            # though without documentation that is only a best guess.
            del entry['revisions']      # Not really internal, but too big
            del entry['repository_id']

            # Set required properties
            entry['version'] = 1
            entry['project'] = project
            # Property revision_hash should already be there; it appears to
            # be the result-set identifier...

            # publish the data to pulse
            publisher.new_result_set(**entry)

        # Basically, it's unclear in what context this runs, and kombu could
        # not be made to confirm-publish (with or without pyamqp, etc.), so
        # we're stuck with this ugly hack: just close the connection, so that
        # if the process context is destroyed the messages will at least
        # still get published... assuming nothing goes wrong, because we're
        # not using confirm channels for publishing.
        publisher.connection.release()
def jm(request):
    """ Give a test access to a JobsModel instance. """
    from django.conf import settings
    from treeherder.model.derived.jobs import JobsModel
    model = JobsModel.create(settings.DATABASES["default"]["TEST_NAME"])

    # patch in additional test-only procs on the datasources
    add_test_procs_file(
        model.get_dhub(),
        model.get_datasource().key,
        "jobs_test.json",
    )

    def fin():
        model.disconnect()
    request.addfinalizer(fin)

    return model
def process_objects(limit=None, project=None):
    """
    Process a number of objects from the objectstore
    and load them to the jobs store
    """
    from treeherder.model.derived.jobs import JobsModel

    # default limit to 100
    limit = limit or 100

    if project:
        projects_to_process = [project]
    else:
        projects_to_process = Datasource.objects.values_list(
            'project', flat=True).distinct()

    for project in projects_to_process:
        with JobsModel(project) as jm:
            jm.process_objects(limit)
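A brief usage sketch, assuming the function above is called directly rather than scheduled; the project name is an illustrative placeholder.

# Process up to 50 pending objectstore rows for one (illustrative) project...
process_objects(limit=50, project="mozilla-central")

# ...or sweep every registered datasource with the default limit of 100.
process_objects()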
def process_job_list(self, all_jobs_list):
    if not isinstance(all_jobs_list, list):
        all_jobs_list = [all_jobs_list]

    validated_jobs = self._get_validated_jobs_by_project(all_jobs_list)

    for project, job_list in validated_jobs.items():
        with JobsModel(project) as jobs_model:
            storeable_job_list = []
            for pulse_job in job_list:
                if pulse_job["state"] != "unscheduled":
                    try:
                        storeable_job_list.append(
                            self.transform(pulse_job)
                        )
                    except AttributeError:
                        logger.warn("Skipping job due to bad attribute",
                                    exc_info=1)

            jobs_model.store_job_data(storeable_job_list)
def process(self, message_body, exchange):
    try:
        transformer = self.get_transformer_class(exchange)(message_body)
        repo = Repository.objects.get(url=transformer.repo_url,
                                      branch=transformer.branch,
                                      active_status="active")
        transformed_data = transformer.transform(repo.name)
        with JobsModel(repo.name) as jobs_model:
            logger.info("Storing resultset for {} {} {}".format(
                repo.name, transformer.repo_url, transformer.branch))
            jobs_model.store_result_set_data([transformed_data])
    except ObjectDoesNotExist:
        newrelic.agent.record_custom_event("skip_unknown_repository",
                                           message_body["details"])
        logger.warn("Skipping unsupported repo: {} {}".format(
            transformer.repo_url, transformer.branch))
    except Exception as ex:
        newrelic.agent.record_exception(exc=ex)
        logger.exception("Error transforming resultset", exc_info=ex)
def _rewrite_data(self, project, mysql_debug):
    signature_mapping = {}
    extra_subtest_signatures = {}

    with JobsModel(project) as jm:
        jm.DEBUG = mysql_debug
        summary = jm.get_performance_series_summary(
            max(PerformanceTimeInterval.all_valid_time_intervals()))

        # first pass: rewrite non-summary tests
        for (signature_hash, signature_properties) in summary.iteritems():
            if self._signature_needs_rewriting(signature_properties,
                                               signature_hash) and \
                    'subtest_signatures' not in signature_properties:
                new_hash = self._rewrite_series(jm, signature_hash,
                                                signature_properties,
                                                None, None)
                signature_mapping[signature_hash] = new_hash
            elif (not signature_properties.get('subtest_signatures') and
                  signature_properties.get('test') not in
                  Command.COUNTER_TESTS):
                # in case this script got interrupted, keep track of
                # subtest signatures which have already been converted
                suitekey = self._get_suitekey(signature_properties)
                if extra_subtest_signatures.get(suitekey):
                    extra_subtest_signatures[suitekey].append(signature_hash)
                else:
                    extra_subtest_signatures[suitekey] = [signature_hash]

        # second pass: rewrite summary tests
        for (signature_hash, signature_properties) in summary.iteritems():
            if (self._signature_needs_rewriting(signature_properties,
                                                signature_hash) and
                    signature_properties.get('subtest_signatures')):
                self._rewrite_series(jm, signature_hash,
                                     signature_properties,
                                     signature_mapping,
                                     extra_subtest_signatures)
def process_job_list(self, all_jobs_list):
    if not isinstance(all_jobs_list, list):
        all_jobs_list = [all_jobs_list]

    validated_jobs = self._get_validated_jobs_by_project(all_jobs_list)

    for project, job_list in validated_jobs.items():
        with JobsModel(project) as jobs_model:
            # todo: Continue using short revisions until Bug 1199364
            rs_lookup = jobs_model.get_revision_resultset_lookup(
                [x["origin"]["revision"][:12] for x in job_list])

            storeable_job_list = []
            for pulse_job in job_list:
                if pulse_job["state"] != "unscheduled":
                    try:
                        storeable_job_list.append(
                            self.transform(pulse_job, rs_lookup)
                        )
                    except AttributeError:
                        logger.warn("Skipping job due to bad attribute",
                                    exc_info=1)

            jobs_model.store_job_data(storeable_job_list)
def run(self, source_url, repository, changeset=None):
    # get the last object seen from cache. this will
    # reduce the number of pushes processed every time
    last_push_id = cache.get("{0}:last_push_id".format(repository))

    if not changeset and last_push_id:
        startid_url = "{}&startID={}".format(source_url, last_push_id)
        logger.info(
            "Extracted last push for '%s', '%s', from cache, "
            "attempting to get changes only from that point at: %s" %
            (repository, last_push_id, startid_url))
        # Use the cached ``last_push_id`` value (saved from the last time
        # this API was called) for this repo. Use that value as the
        # ``startID`` to get all new pushes from that point forward.
        extracted_content = self.extract(startid_url)

        if extracted_content['lastpushid'] < last_push_id:
            # Push IDs from Mercurial are incremental. If we cached a value
            # from one call to this API, and a subsequent call told us that
            # the ``lastpushid`` is LOWER than the one we have cached, then
            # the Mercurial IDs were reset.
            # In this circumstance, we can't rely on the cached id, so must
            # throw it out and get the latest 10 pushes.
            logger.warning(
                ("Got a ``lastpushid`` value of {} lower than "
                 "the cached value of {} due to Mercurial repo reset. "
                 "Getting latest changes for '{}' instead").format(
                     extracted_content['lastpushid'], last_push_id,
                     repository))
            cache.delete("{0}:last_push_id".format(repository))
            extracted_content = self.extract(source_url)
    else:
        if changeset:
            logger.info("Getting all pushes for '%s' corresponding to "
                        "changeset '%s'" % (repository, changeset))
            extracted_content = self.extract(source_url + "&changeset=" +
                                             changeset)
        else:
            logger.warning("Unable to get last push from cache for '%s', "
                           "getting all pushes" % repository)
            extracted_content = self.extract(source_url)

    # ``pushes`` could be empty if there are no new ones since we last
    # fetched
    pushes = extracted_content['pushes']
    if not pushes:
        return None

    last_push_id = max(map(lambda x: int(x), pushes.keys()))
    last_push = pushes[str(last_push_id)]
    top_revision = last_push["changesets"][-1]["node"]

    # TODO: further remove the use of client types here
    transformed = self.transform(pushes, repository)

    errors = []
    with JobsModel(repository) as jm:
        for collection in transformed[repository].get_chunks(chunk_size=1):
            try:
                collection.validate()
                jm.store_result_set_data(collection.get_collection_data())
            except Exception:
                newrelic.agent.record_exception()
                errors.append({
                    "project": repository,
                    "collection": "result_set",
                    "message": traceback.format_exc()
                })

    if errors:
        raise CollectionNotStoredException(errors)

    if not changeset:
        # only cache the last push if we're not fetching a specific
        # changeset
        cache.set("{0}:last_push_id".format(repository), last_push_id)

    return top_revision