def load_jobs_from_script_file(self, file, job_center):
    with open(file) as f:
        for l in f:
            if 'sudo docker' in l or '#!/bin/bash' in l:
                continue
            self.add(l.strip())
    log.debug('len(job_ids) =', self.total_jobs)

def find_email_subscriber(self, email: str, error_if_not_found: bool = True) -> Response:
    log.debug('Trying to find email subscriber with email {}.'.format(email))
    return self._get(DatabaseAPI._email_subscriber_email_endpoint(email), error_if_not_found)

def init_queues_for_threads(self, threads_num, package_mode=False):
    num_of_items_per_thread = int(self.get_num_remaining_items(package_mode) / threads_num)
    self.thread_workloads = []
    q = Queue()
    if package_mode:
        for r in self.repos:
            for bp in self.repos[r].buildpairs:
                for jp in bp.jobpairs:
                    if not jp.reproduced.value:
                        q.put(jp)
                        if q.qsize() >= num_of_items_per_thread:
                            self.thread_workloads.append(q)
                            q = Queue()
    else:
        for r in self.repos:
            for bp in self.repos[r].buildpairs:
                for jp in bp.jobpairs:
                    for j in jp.jobs:
                        if not j.reproduced.value and not j.skip.value:
                            q.put(j)
                            if q.qsize() >= num_of_items_per_thread:
                                self.thread_workloads.append(q)
                                q = Queue()
    log.info('Finished initializing queues for all threads.')
    for i in range(len(self.thread_workloads)):
        log.debug('tid =', i, ', qsize =', self.thread_workloads[i].qsize())

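# A minimal, self-contained sketch of the workload-splitting idea above, using plain
# integers in place of job objects. The helper name `split_into_queues` is hypothetical;
# unlike the method above, it explicitly keeps any leftover items in a final queue.
from queue import Queue

def split_into_queues(items, threads_num):
    per_thread = max(1, len(items) // threads_num)
    workloads, q = [], Queue()
    for item in items:
        q.put(item)
        if q.qsize() >= per_thread:
            workloads.append(q)
            q = Queue()
    if q.qsize():
        workloads.append(q)
    return workloads

# Example: splitting 10 items across 3 threads yields queues of sizes 3, 3, 3, and 1.
# [w.qsize() for w in split_into_queues(list(range(10)), 3)] -> [3, 3, 3, 1]
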
def _get_java_analyzer(self, primary_language, lines, folds, job_id, confirmed_analyzer, trigger_sha, repo):
    if confirmed_analyzer is None:
        confirmed_analyzer = self.get_build_system(lines, job_id, trigger_sha, repo)
    if confirmed_analyzer is not None:
        if confirmed_analyzer == 'maven':
            self.build_system['maven'] += 1
            log.debug('Using maven Analyzer')
            return JavaMavenAnalyzer(primary_language, folds, job_id)
        elif confirmed_analyzer == 'gradle':
            self.build_system['gradle'] += 1
            log.debug('Using gradle Analyzer')
            return JavaGradleAnalyzer(primary_language, folds, job_id)
        elif confirmed_analyzer == 'ant':
            self.build_system['ant'] += 1
            log.debug('Using ant Analyzer')
            return JavaAntAnalyzer(primary_language, folds, job_id)
        elif confirmed_analyzer == 'play':
            self.build_system['play'] += 1
            log.debug('Using other Analyzer')
            return JavaOtherAnalyzer(primary_language, folds, job_id, confirmed_analyzer)
    else:
        self.build_system['NA'] += 1
        log.debug('Using other Analyzer')
        return JavaOtherAnalyzer(primary_language, folds, job_id, 'NA')

def get_tag_list(self, repo):
    log.debug('Getting tag list for {}.'.format(repo))
    tags = {}
    for item in self.get_tags(repo):
        tags[item['name']] = 0
    log.debug('Done getting tag list of', len(tags), 'tags.')
    return tags

def _load_jobs_from_pairs_for_repo(self, input_file):
    """
    Read the input file, which should contain mined pairs from the database.
    """
    try:
        buildpairs = read_json(input_file)
    except json.JSONDecodeError:
        log.error('Error reading input file {} in PairCenter. Exiting.'.format(input_file))
        raise
    for bp in buildpairs:
        # For debugging: uncomment these lines when we only want to reproduce non-PR pairs.
        # if bp['pr_num'] == -1:
        #     continue
        repo = bp['repo']
        if repo not in self.repos:
            self.repos[repo] = Repo(repo)
            self.uninitialized_repos.put(repo)
        self._append_buildpair_and_jobpair_to_repo(repo, bp)
    self._init_names()
    self.set_skip_of_job_pairs()
    self._init_queue_of_repos()
    # Calculate buildpair and job counts after loading from the file.
    self._calc_num_total_buildpairs()
    self._calc_num_total_jobpairs()
    self._calc_num_total_jobs()
    log.debug('pair_center.total_buildpairs =', self.total_buildpairs,
              'pair_center.total_jobpairs =', self.total_jobpairs,
              'pair_center.total_jobs =', self.total_jobs)

def debug_when(branch, b, repo, utils):
    # DEBUG: for when matching by committer date yields more results than the HTML approach.
    commits_with_same_datetime = [
        commit['sha'] for commit in branch.github_commits
        if commit['commit']['committer']['date'] == b.committed_at
    ]
    # AssignTriggerCommits.find_trigger_commit_by_matching_datetime(repo, utils, commits_with_same_datetime, b,
    #                                                               checked_commits)
    if not b.trigger_commit and commits_with_same_datetime:
        b.trigger_commit = 'mock'
        print('why got matching committer date but did not find trigger from html?',
              len(commits_with_same_datetime), commits_with_same_datetime[0])
        for c in commits_with_same_datetime:
            if utils.github.is_commit_associated_with_build(repo, c, b.build_id):
                log.warning('This commit is confirmed to be the trigger, but it did not show up in the HTML.')
                AssignTriggerCommits.print_helpful_links_for_debugging(repo, c, branch, b)
                utils.github.get_pr_commits_by_html(repo, str(branch.pr_num), branch)
                for commit in branch.html_commits:
                    if str(b.build_id) == branch.html_commits[commit]:
                        log.debug('Tried again and found the commit in HTML this time.')
                        break

def _upsert(self, endpoint: Endpoint, entity, singular_entity_name: str = 'entity') -> Response:
    if entity is None:
        raise TypeError
    if not isinstance(endpoint, Endpoint):
        raise TypeError
    if not endpoint:
        raise ValueError
    if not isinstance(singular_entity_name, str):
        raise TypeError
    if not singular_entity_name:
        raise ValueError
    log.debug('Trying to upsert {}.'.format(singular_entity_name))
    # First, check if the entity exists. If it does, then pass its etag.
    get_resp = self._get(endpoint, error_if_not_found=False)
    etag = get_resp.json()['_etag'] if get_resp.ok else None
    resp = self._put(endpoint, entity, etag)
    if resp.status_code == 422:
        log.error('The', singular_entity_name, 'was not upserted because it failed validation.')
        log.error(pprint.pformat(entity))
        log.error(resp.content)
    return resp

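# A minimal sketch of the etag-based upsert pattern that _upsert relies on, written
# directly against `requests` for illustration. The endpoint URL and the assumption
# that the server exposes an Eve-style '_etag' field honored via an If-Match header
# are mine, not taken from the DatabaseAPI internals.
import requests

def upsert_with_etag(url, entity, token=None):
    headers = {'Authorization': token} if token else {}
    # Fetch the current document (if any) so its _etag can be supplied on the PUT.
    get_resp = requests.get(url, headers=headers)
    if get_resp.ok:
        headers['If-Match'] = get_resp.json()['_etag']
    # The PUT either creates the document or replaces the version the etag matched.
    return requests.put(url, json=entity, headers=headers)
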
def _bulk_insert(self,
                 endpoint: Endpoint,
                 entities: List,
                 plural_entity_name: str = 'entities') -> Generator[Response, None, None]:
    if not isinstance(entities, list):
        raise TypeError('The {} to bulk insert must be provided as a list.'.format(plural_entity_name))
    if not all(e for e in entities):
        raise ValueError('All {} must be non-None.'.format(plural_entity_name))
    if not isinstance(endpoint, Endpoint):
        raise TypeError
    if not endpoint:
        raise ValueError
    if not isinstance(plural_entity_name, str):
        raise TypeError
    if not plural_entity_name:
        raise ValueError
    log.debug('Trying to bulk insert {} {}.'.format(len(entities), plural_entity_name))
    # Insert the entities in chunks to avoid a 413 Request Entity Too Large error.
    for chunk in DatabaseAPI._chunks(entities, 100):
        resp = self._post(endpoint, chunk)
        if resp.status_code == 422:
            log.error('The', plural_entity_name, 'were not inserted because they failed validation.')
            log.error(pprint.pformat(chunk))
            log.error(resp.content)
        yield resp

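# A plausible sketch of the DatabaseAPI._chunks helper used above (its real
# implementation is not shown here): yield successive fixed-size slices of a list so
# each POST stays under the request-size limit.
def _chunks(items, chunk_size):
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]

# Example: list(_chunks(list(range(250)), 100)) yields slices of sizes 100, 100, and 50.
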
def find_mined_build_pair(self, object_id: str, error_if_not_found: bool = True) -> Response:
    log.debug('Trying to find mined build pairs for ObjectId {}.'.format(object_id))
    return self._get(DatabaseAPI._mined_build_pair_object_id_endpoint(object_id), error_if_not_found)

def print_helpful_links_for_debugging(repo, commit, branch, b):
    log.debug('PR:', branch.pr_num, 'base branch =', branch.base_branch)
    log.debug('https://api.github.com/repos/' + repo + '/commits/' + commit + '/status')
    log.debug('https://api.travis-ci.org/builds/' + str(b.build_id))
    log.debug('https://api.github.com/repos/' + repo + '/pulls/' + str(branch.pr_num) + '/commits')
    log.debug('https://github.com/' + repo + '/pull/' + str(branch.pr_num) + '/commits')

def filter_non_exact_images(job_id, log_path):
    """
    Check if all jobs in this pair (from both the failed and passed build) used images that are available. If an
    image is found to match the job pair, then it gets added to the job pair.

    This function assumes the language specified in the Travis configuration does not change between the failed
    and passed builds.

    Returns a 4-tuple of filter counts. The tuple members represent the following:
    1. The number of pairs filtered due to the original log not being found
    2. The number of pairs filtered due to an error reading the original log
    3. The number of pairs filtered due to no image provision timestamp in the original log
    4. The number of pairs filtered due to usage of a non-exact Docker image
    """
    log.debug('To detect non-exact pairs, we first extract the used images from the original logs.')
    travis_images = read_json(os.path.join(BASE_DIR, "travis_images.json"))
    provisioned_strs = []
    for language in travis_images:
        provisioned_strs += travis_images[language].values()
    dockerhub_images = read_json(os.path.join(BASE_DIR, "dockerhub_images.json"))
    no_original_log = 0
    error_reading_original_log = 0
    image = None
    orig_log_path = os.path.join(log_path, str(job_id) + ".txt")
    if not download_log(job_id, orig_log_path):
        no_original_log += 1
    # Try to find the image by timestamp. If found, add it to the job pair.
    try:
        chooser = ExactImageChooserByTime(orig_log_path, travis_images, language)
        orig_log_image_provision_timestamp = chooser.find_image_datetime_from_log()
        image = chooser.get_image_tag()
        if image is not None:
            return image
    except OSError:
        # The original log file was not found.
        error_reading_original_log += 1
    # Try to find the image by tag. If found, add it to the job pair.
    if not image:
        chooser = ExactImageChooserByTag(orig_log_path)
        image = chooser.get_image_tag()
        if image is not None:
            return image
    # Try to find the image by GCE commit SHA. If found, add it to the job pair.
    if not image:
        chooser = ExactImageChooserByCommitSHA(orig_log_path, dockerhub_images)
        image = chooser.get_image_tag()
        if image is not None:
            return image

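# A small sketch of the fallback chain above: try each image "finder" in order and stop
# at the first one that produces a result. The lambdas and the sample tag are purely
# illustrative stand-ins for the ExactImageChooser* classes.
def first_non_none(finders):
    for find in finders:
        result = find()
        if result is not None:
            return result
    return None

# Example: the first finder yields nothing, so the second finder's tag is returned.
# first_non_none([lambda: None, lambda: 'travisci/ci-garnet:packer-1512502276-986baf0'])
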
def post_run(self):
    """
    Called when all jobs are done reproducing.
    """
    elapsed = time.time() - self.start_time
    log.debug('total elapsed =', elapsed)
    # If jobs were reproduced during this run, print the average processing time.
    if self.newly_reproduced.value:
        avg_job_time_0 = elapsed / self.newly_reproduced.value
        avg_job_time_1 = self.job_time_acc / self.newly_reproduced.value
        log.debug('avg_job_time_0 =', avg_job_time_0, 'avg_job_time_1 =', avg_job_time_1)

def build_image(self, path, dockerfile, full_image_name):
    image = None
    try:
        image = self.client.images.build(path=path, dockerfile=dockerfile, tag=full_image_name)
    except docker.errors.BuildError as e:
        log.debug(e)
        raise ReproduceError('Encountered a build error while building a Docker image: {}'.format(e))
    except docker.errors.APIError as e:
        raise ReproduceError('Encountered a Docker API error while building a Docker image: {}'.format(e))
    except KeyboardInterrupt:
        log.error('Caught a KeyboardInterrupt while building a Docker image.')
    return image

def modify_build_sh(repo: str, build_sh_path: str):
    """
    Travis builds are sometimes not reproducible when run using the build.sh script ("the build script") generated
    by travis-build. Thus, to increase the number of reproducible builds, we modify the build script. This function
    applies the necessary modifications to the build script located at `build_sh_path` and writes the modified file
    to that same location.

    This set of modifications was developed over time as we observed more build reproducibility issues that were
    likely caused by using the build script generated by travis-build. There's a file in Google Drive that explains
    the modifications applied to the build script.

    A previous version of this function included several additional build script modifications that have since been
    removed because travis-build was patched to generate build scripts that need fewer modifications. The BugSwarm
    pipeline utilizes the patched version of travis-build to generate build scripts.

    :param repo: A GitHub repository slug.
    :param build_sh_path: The path to the unmodified build script (generated by travis-build).
    """
    if not isinstance(repo, str):
        raise TypeError
    if not repo:
        raise ValueError
    if not isinstance(build_sh_path, str):
        raise TypeError
    if not build_sh_path:
        raise ValueError

    log.debug('Modifying build script at {}.'.format(build_sh_path))

    # Read and modify the original build script.
    lines = []
    with open(build_sh_path) as f:
        skip = False
        for line in f:
            if 'start git.checkout' in line:
                skip = True
            elif 'travis_fold end git.checkout' in line:
                skip = False
                print(r'travis_cmd cd\ ' + repo + ' --assert --echo\n')
                lines.append(r'travis_cmd cd\ ' + repo + ' --assert --echo\n')
            else:
                if not skip:
                    lines.append(line)

    # Overwrite the original build script with the modified build script.
    Utils.remove_file(build_sh_path)
    with open(build_sh_path, 'w') as f2:
        for l in lines:
            f2.write(l)

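# A self-contained sketch of the fold-skipping edit above: drop everything between the
# git.checkout fold markers and substitute a single `cd` command. The marker strings
# mirror the ones matched above; the sample script lines are made up for illustration.
def drop_checkout_fold(script_lines, repo):
    kept, skip = [], False
    for line in script_lines:
        if 'start git.checkout' in line:
            skip = True
        elif 'travis_fold end git.checkout' in line:
            skip = False
            kept.append('travis_cmd cd\\ ' + repo + ' --assert --echo\n')
        elif not skip:
            kept.append(line)
    return kept

# Example: a fold containing `git clone` collapses to the single cd command.
sample = [
    'travis_fold start git.checkout\n',
    'git clone --depth=50 https://github.com/owner/project.git\n',
    'travis_fold end git.checkout\n',
    'echo build starts here\n',
]
# drop_checkout_fold(sample, 'owner/project')
# -> ['travis_cmd cd\\ owner/project --assert --echo\n', 'echo build starts here\n']
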
def _post_analyze(self, run):
    """
    Called after analyzing each run. Prints statistics such as how many pairs matched and the time elapsed, and
    then visualizes the match history after this run.
    """
    log.info('Done analyzing run {}.'.format(run))
    self._visualize_match_history()
    log.info('{} reproduced logs analyzed and {} errors in run {}.'
             .format(self.reproduced_logs_analyzed, self.error_count, run))
    # Print a blank line to separate each run.
    log.info()
    mmm = self.utils.construct_mmm_count(self.pair_center)
    aaa = self.utils.construct_aaa_count(self.pair_center)
    log.debug('Match types in run {}: m1-m2-m3: {} a1-a2-a3: {}.'.format(run, mmm, aaa))

def find_artifact(self, image_tag: str, error_if_not_found: bool = True) -> Response:
    """
    Get artifact data based on an image tag.

    e.g. find_artifact('Abjad-abjad-289716771')

    :param image_tag: The image tag identifying the artifact to find.
    :param error_if_not_found: Whether to error if the image tag is not found. Defaults to True.
    :return: The response object.
    """
    log.debug('Trying to find artifact with image_tag {}.'.format(image_tag))
    return self._get(DatabaseAPI._artifact_image_tag_endpoint(image_tag), error_if_not_found)

def check_is_bad_log(self, job, file_path=None):
    file_path = file_path or self.get_log_path(job)
    first = True
    if os.path.isfile(file_path):
        try:
            with open(file_path) as f:
                for l in f:
                    if first:
                        if '"docker logs" requires exactly 1 argument.' in l:
                            return True
                        if 'port is already allocated.' in l:
                            return True
        except UnicodeDecodeError:
            log.debug('UnicodeDecodeError while check_is_bad_log')
    return False

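# A minimal sketch of the same "known bad log" check with the marker strings pulled out
# into a list; the function and file names here are hypothetical. Only the first line of
# the file is inspected, mirroring the `first` flag above.
BAD_LOG_MARKERS = [
    '"docker logs" requires exactly 1 argument.',
    'port is already allocated.',
]

def first_line_is_bad(file_path):
    try:
        with open(file_path) as f:
            first_line = f.readline()
    except (OSError, UnicodeDecodeError):
        return False
    return any(marker in first_line for marker in BAD_LOG_MARKERS)

# Example: first_line_is_bad('reproduced_logs/12345.txt')
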
def process_item(self, job, tid):
    """
    Called for each item to be run. First, we check whether this item can be skipped.

    :param job: A Job object.
    :param tid: The thread ID.
    """
    self.items_processed.value += 1
    job.reproduced.value = 1
    if self.utils.check_if_log_exist_in_task(job):
        log.debug('Log already exists in task.')
        self.already_reproduced.value += 1
    else:
        self.newly_reproduced.value += 1
        self._reproduce_job(job, tid)

def _get_item(self, name, subitem=''):
    user = '******'
    if '/' in name:
        user, name = name.split('/', 1)
    resp = self._get(
        os.path.join(self.api_url('repositories/{0}/{1}'.format(user, name)), subitem + '?page_size=10000'))
    code = resp.status_code
    if code == 200:
        j = resp.json()
        return 0, j
    elif code == 404:
        log.debug('Item does not exist on Docker Hub.')
        return 0, None
        # raise ValueError('{0} repository does not exist'.format(name))
    else:
        log.debug('Error when getting item in Docker Hub wrapper.')
        return 1, None

def process(self, data: Any, context: dict) -> Dict[str, List[FailPassPair]]:
    log.info('Aligning job pairs.')
    branches = data
    total_jobpairs = 0
    for branch_id, branch_obj in branches.items():
        for buildpair in branch_obj.pairs:
            for failed_job in buildpair.failed_build.jobs:
                for passed_job in buildpair.passed_build.jobs:
                    # If the failed job and the passed job have the same config, then they are 'aligned.'
                    # We also want to filter out pass-pass job pairs. Since some jobs in the failed build may have
                    # passed, we make sure that the failed job did not pass by checking its 'result' value.
                    if failed_job.config == passed_job.config and failed_job.result != 0:
                        buildpair.jobpairs.append(JobPair(failed_job, passed_job))
                        total_jobpairs += 1
    log.debug('Aligned', total_jobpairs, 'job pairs.')
    return data

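# A small, self-contained illustration of the alignment rule above using plain dicts in
# place of Job objects: jobs pair up when their configs match and the job from the
# failed build actually failed (result != 0). The sample configs are illustrative.
def align_job_pairs(failed_jobs, passed_jobs):
    return [
        (f, p)
        for f in failed_jobs
        for p in passed_jobs
        if f['config'] == p['config'] and f['result'] != 0
    ]

# Example: only the jdk8 jobs align; the jdk11 job in the failed build passed, so it is
# filtered out as a pass-pass pair.
# failed = [{'config': 'jdk8', 'result': 1}, {'config': 'jdk11', 'result': 0}]
# passed = [{'config': 'jdk8', 'result': 0}, {'config': 'jdk11', 'result': 0}]
# align_job_pairs(failed, passed) -> one (jdk8, jdk8) pair
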
def get_commit_travis_status_info(self, repo, commit) -> Optional[List]:
    """
    Returns the Travis status objects associated with a commit's combined status. Returns None if no statuses
    exist for `commit`.
    """
    _, result = self.github_wrapper.get(
        'https://api.github.com/repos/{}/commits/{}/status'.format(repo, commit))
    if result is None:
        return None
    statuses = result.get('statuses')
    if not statuses:
        log.debug('GitHub returned no statuses for commit', commit)
        return None
    travis_status = [x for x in statuses if self.is_commit_travis_status(x)]
    return travis_status

def copy_and_reset_repo(job, utils):
    log.info('Copying and resetting the repository.')
    # Copy the repository from the stored project repositories to the workspace repository directory by untar-ing
    # the storage repository tar file into the workspace directory.
    with tarfile.open(utils.get_project_storage_repo_tar_path(job), 'w') as tar:
        tar.add(utils.get_repo_storage_dir(job), arcname=job.repo)
    repo_tar_obj = tarfile.TarFile(name=utils.get_project_storage_repo_tar_path(job))
    utils.clean_workspace_job_dir(job)
    repo_tar_obj.extractall(utils.get_workspace_sha_dir(job))
    # git reset the workspace repository.
    repo = git.Repo(utils.get_reproducing_repo_dir(job))
    if job.is_pr:
        repo.git.reset('--hard', job.base_sha)
        repo.git.merge(job.sha)
    else:
        log.debug('Resetting repository to', job.sha, 'for job id', str(job.job_id) + '.')
        repo.git.reset('--hard', job.sha)

def process(self, data: Any, context: dict) -> Dict[str, List[Branch]]:
    repo = context['repo']
    utils = context['utils']
    log.info('Grouping builds and jobs by branch.')
    # Mapping from branch ID to Branch object.
    branches = {}
    for job in data:
        if job['event_type'] == 'pull_request':
            if job['compare_at'] and '/pull/' in job['compare_at']:
                pr_num = int(job['compare_at'].split('/pull/')[1])
                branch_name = utils.github.get_head_branch_for_pr(repo, str(pr_num))
            else:
                log.debug('job_id =', job['job_id'], 'compare_at =', job['compare_at'])
                log.error('Job was triggered from a pull request, but cannot get pr_num from compare_at.')
                raise StepException
        else:
            branch_name = job['branch']
            # Sentinel pr_num to indicate a non-PR branch.
            pr_num = -1
        # If Travis returns a null branch name for the job or if we cannot get a head branch name for a pull
        # request job, then ignore the job. See the method documentation for GitHub.get_head_branch_for_pr for
        # more information on the latter case.
        if not branch_name:
            log.info('Ignoring job {} (in build {}) since it is missing a head branch name.'
                     .format(job['job_id'], job['build_id']))
            continue
        GroupJobsByBranch._append_job_to_branch(branches, branch_name, job, pr_num)
    for branch_id, branch_obj in branches.items():
        branch_obj.sort_builds()
    for branch_id, branch_obj in branches.items():
        for build in branch_obj.builds:
            build.update_status()
    return branches

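# A tiny sketch of the compare_at parsing above: extract the pull request number from a
# GitHub compare URL. The sample URL is illustrative.
def pr_num_from_compare_at(compare_at):
    if compare_at and '/pull/' in compare_at:
        return int(compare_at.split('/pull/')[1])
    return None

# Example: pr_num_from_compare_at('https://github.com/owner/project/pull/42') -> 42
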
def _get_original_result(analyzer, utils, job_id, trigger_sha, repo):
    original_log_path = utils.get_orig_log_path(job_id)
    # If the original log does not exist in the expected location, try to download it to that location. If the log
    # cannot be downloaded, return error.
    if not os.path.isfile(original_log_path):
        log.debug('Original log not found at {}.'.format(original_log_path))
        log.info('Download original log.')
        if not download_log(job_id, original_log_path):
            log.info('Could not download original log.')
            return None, original_log_path
    original_result = analyzer.analyze_single_log(original_log_path, job_id, trigger_sha=trigger_sha, repo=repo)
    if original_result.get('not_in_supported_language') is True:
        raise ReproduceError(
            'Original log was not generated from a job in a supported programming language. '
            'The primary language was "{}."'.format(original_result['primary_language']))
    return original_result, original_log_path

def _insert(self, endpoint: Endpoint, entity, singular_entity_name: str = 'entity') -> Response:
    if entity is None:
        raise TypeError
    if not isinstance(endpoint, Endpoint):
        raise TypeError
    if not endpoint:
        raise ValueError
    if not isinstance(singular_entity_name, str):
        raise TypeError
    if not singular_entity_name:
        raise ValueError
    log.debug('Trying to insert {}.'.format(singular_entity_name))
    resp = self._post(endpoint, entity)
    if resp.status_code == 422:
        log.error('The', singular_entity_name, 'was not inserted because it failed validation.')
        log.error(pprint.pformat(entity))
        log.error(resp.content)
    return resp

def _create_session(self):
    """
    When the quota is exceeded for a token, the program switches to another token and attempts to continue. If the
    quota is exceeded for all tokens, the program waits for the token with the lowest wait time.
    """
    min_wait_time = 9999
    chosen_token = None
    updated_token = copy.deepcopy(self._tokens)
    for t in self._tokens:
        self._session = cachecontrol.CacheControl(requests.Session())
        self._session.headers['Authorization'] = 'token %s' % t
        has_wait, wait_time = self._exceeded_api_quota()
        if not has_wait:
            chosen_token = t
            min_wait_time = 0
            # If a token is chosen, move it to the end.
            updated_token.append(t)
            del updated_token[updated_token.index(t)]
            break
        if wait_time < min_wait_time:
            min_wait_time = wait_time
            chosen_token = t
            # If a token is chosen, move it to the end.
            updated_token.append(t)
            del updated_token[updated_token.index(t)]
    self._tokens = updated_token
    if not chosen_token:
        raise RuntimeError('Unexpected state: No GitHub token chosen in github.py.')
    log.debug('Chose token {}.'.format(chosen_token))
    if min_wait_time:
        # Sleep until the quota is reset. See https://developer.github.com/v3/#rate-limiting for more information.
        log.warning('Sleeping until the GitHub API quota is reset in', min_wait_time / 60, 'minutes.')
        time.sleep(min_wait_time)
    self._session = cachecontrol.CacheControl(requests.Session())
    self._session.headers['Authorization'] = 'token %s' % chosen_token

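# A simplified, self-contained sketch of the token-rotation policy above: prefer any
# token with remaining quota; otherwise pick the token whose quota resets soonest and
# report how long to wait. `quota_status` stands in for _exceeded_api_quota().
def choose_token(tokens, quota_status):
    chosen, min_wait = None, float('inf')
    for token in tokens:
        exceeded, wait_seconds = quota_status(token)
        if not exceeded:
            return token, 0
        if wait_seconds < min_wait:
            chosen, min_wait = token, wait_seconds
    return chosen, min_wait

# Example: both tokens are exhausted, but the second resets in 30 seconds versus 120,
# so it is chosen with a 30-second wait.
# choose_token(['tok_a', 'tok_b'], {'tok_a': (True, 120), 'tok_b': (True, 30)}.get)
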
def process(self, data: Dict[str, Branch], context: dict) -> Optional[Any]:
    repo = context['repo']
    utils = context['utils']
    branches = data
    # Get the merge state of each pull request.
    log.info('Getting merge state for all pull requests.')
    start_time = time.time()
    pr_list_json_file = utils.get_pr_list_json_file(repo)
    pr_dict = {}
    if os.path.isfile(pr_list_json_file):
        try:
            pr_dict = read_json(pr_list_json_file)
        except ValueError:
            os.remove(pr_list_json_file)
            raise StepException
    else:
        pr_entities = utils.github.list_pull_requests(repo)
        for pr_entity in pr_entities:
            pr_dict[str(pr_entity['number'])] = pr_entity
        write_json(pr_list_json_file, pr_dict)
    for branch_id, branch_obj in branches.items():
        if branch_obj.pr_num != -1:  # Whether the branch is a pull request branch.
            if str(branch_obj.pr_num) in pr_dict:
                branch_obj.merged_at = pr_dict[str(branch_obj.pr_num)]['merged_at']
                branch_obj.base_branch = pr_dict[str(branch_obj.pr_num)]['base']['ref']
                branch_obj.pr_info = pr_dict[str(branch_obj.pr_num)]
    log.debug('Got merge state for all pull requests in', time.time() - start_time, 'seconds.')
    return branches

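# A minimal sketch of the cache-then-fetch pattern above: reuse a JSON file on disk when
# it exists, otherwise build the data with a fetch function and persist it. The names
# below are illustrative, not the pipeline's own helpers.
import json
import os

def load_or_fetch(cache_path, fetch):
    if os.path.isfile(cache_path):
        with open(cache_path) as f:
            return json.load(f)
    data = fetch()
    with open(cache_path, 'w') as f:
        json.dump(data, f)
    return data

# Example: load_or_fetch('owner-project-prs.json', lambda: {'42': {'merged_at': None}})
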
def find_trigger_commit_by_matching_datetime(repo, utils, commits_with_same_datetime, b, checked_commits):
    if len(commits_with_same_datetime) == 1:
        b.trigger_commit = commits_with_same_datetime[0]
        log.debug('found trigger by matching datetime')
    elif len(commits_with_same_datetime) > 1:
        log.debug('Multiple commits with same datetime.')
        log.debug(commits_with_same_datetime)
        for commit in commits_with_same_datetime:
            checked_commits.append(commit)
            trigger_commit = utils.github.get_build_trigger(repo, commit, b.build_id)
            if trigger_commit:
                b.trigger_commit = trigger_commit
                log.debug('found trigger by matching build id')
                break

def assign_base_commit(repo, utils, shas, b, branches, branch):
    parent_shas = utils.get_parents_of_commit(repo, b.trigger_commit, branch.base_branch)
    parent_shas = parent_shas.split('\n')
    for sha in parent_shas:
        if b.trigger_commit not in shas:
            log.error('Trigger commit not in SHAs!')
            raise StepException
        if sha not in shas:
            log.error('Parent commit not in SHAs!')
            raise StepException
        if shas[sha] >= shas[b.trigger_commit]:
            continue
        branch_containing_sha = AssignBaseCommits.is_sha_in_pr(branches)
        if branch_containing_sha:
            if branch_containing_sha.merged_at:
                if utils.convert_api_date_to_datetime(branch_containing_sha.merged_at) >= shas[b.trigger_commit]:
                    log.debug(sha, 'excluded because PR merged later', branch_containing_sha.branch_name)
                    continue
            else:
                log.debug(sha, 'excluded because PR not merged', branch_containing_sha.branch_name)
                continue
        # Before assigning the base SHA, check if it is on the branch from which the pull request is branched.
        result = utils.get_branch_of_sha(repo, sha)
        if not result:
            log.debug(sha, 'excluded because no branch name.')
            continue
        if len(result) > 2:
            branch_name = result[2:]
            if branch_name != branch.base_branch:
                log.debug(sha, 'excluded because base branch name does not match.')
                continue
        return sha