def test_parse_round_trip_with_binary_files_in_diff(self):
    """A parsed diff with binary files survives a str() round trip."""
    sample_path = os.path.join(self.samples_dir, 'samples/sample8.diff')
    with open(sample_path, 'r') as fp:
        original = PatchSet(fp)
    # Re-parse the rendered patch and compare with the first parse
    rebuilt = PatchSet(str(original))
    self.assertEqual(original, rebuilt)
def test_patchset_string_input(self):
    """Parsing a pre-read unicode string matches parsing the stream."""
    with codecs.open(self.sample_file, 'r', encoding='utf-8') as fp:
        content = fp.read()
    from_string = PatchSet(content)
    with codecs.open(self.sample_file, 'r', encoding='utf-8') as fp:
        from_stream = PatchSet(fp)
    self.assertEqual(from_string, from_stream)
def __get_patch(self):
    """Parse self.__diff into a PatchSet and return added + modified files."""
    try:
        parsed = PatchSet(self.__diff)
    except UnidiffParseError:
        # Some diff producers omit the git header; synthesize a fake one
        # so unidiff can still parse the hunks.
        header = 'diff --git a/{file} b/{file}\n--- a/{file}\n+++ b/{file}\n{diff}'
        parsed = PatchSet(header.format(file=self.__file, diff=self.__diff))
    return parsed.added_files + parsed.modified_files
def test_patchset_compare(self):
    """Equal sources compare equal; a different sample compares unequal."""
    with codecs.open(self.sample_file, 'r', encoding='utf-8') as fp:
        first = PatchSet(fp)
    with codecs.open(self.sample_file, 'r', encoding='utf-8') as fp:
        second = PatchSet(fp)
    other_path = os.path.join(self.samples_dir, 'samples/sample3.diff')
    with open(other_path, 'rb') as fp:
        third = PatchSet(fp, encoding='utf-8')
    self.assertEqual(first, second)
    self.assertNotEqual(first, third)
def curl_diffs(diff_path):
    """Download and parse the diff for one pull request.

    Uses the authenticated GitHub compare API when a token is configured,
    otherwise falls back to the raw ``1-n_url``. Returns a list of
    change-set dicts (one per changed hunk), or ``[]`` on parse/decode
    errors.
    """
    changes_sets = []
    # Build the request once; both branches are parsed identically below.
    if "Token" in config["GitHub"]:
        token = config["GitHub"]["Token"]
        diff_url = ("https://api.github.com/repos/" + owner + "/" + repo +
                    "/compare/" + diff_path["first_commit_sha"] + "..." +
                    diff_path["merge_commit_sha"])
        source = Request(diff_url)
        source.add_header("Authorization", "token %s" % token)
        source.add_header("Accept", "application/vnd.github.v3.diff")
    else:
        source = diff_path["1-n_url"]
    try:
        url_diff = urlopen(source)
        diffs = PatchSet(url_diff, encoding="utf-8")
    except (UnicodeDecodeError, errors.UnidiffParseError) as exc:
        # BUG FIX: the original always printed "UnicodeDecodeError:" even
        # when the failure was a UnidiffParseError; report the real type.
        print(type(exc).__name__ + ":" + str(diff_path))
        return []

    filtered_diffs = [
        x for x in diffs
        if x.is_modified_file and
        any(x.path.endswith(y) for y in lang_extentions[lang])
    ]
    files_num = len(filtered_diffs)
    for i, diff in enumerate(filtered_diffs):
        sys.stdout.write("\r%s pulls %d / %d files" %
                         (diff_path["number"], i + 1, files_num))
        for hunk in diff:
            source_text = "".join(x.value for x in hunk if x.is_removed)
            target_text = "".join(x.value for x in hunk if x.is_added)
            if source_text == target_text:
                # Whitespace-only / reordered hunks carry no change set.
                continue
            out_metricses = {
                "number": int(diff_path["number"]),
                "commit_len": int(diff_path["commit_len"]),
                "created_at": diff_path["created_at"],
                "merged_at": diff_path["merged_at"],
                "merged_by": diff_path["merged_by"],
                "1-n_url": diff_path["1-n_url"],
                "file_path": diff.path,
                "changes_set": TN.make_change_set(source_text, target_text)
            }
            if out_metricses["changes_set"] == -1:
                continue
            changes_sets.append(out_metricses)
    return changes_sets
def get_patch(api, urn, pr_num, raw=False):
    """get the formatted or not patch file for a pr"""
    endpoint = "/{urn}/pull/{pr}.patch".format(urn=urn, pr=pr_num)
    payload = api("get", endpoint)
    # Hand back the raw text when requested, otherwise a parsed PatchSet.
    return payload if raw else PatchSet(payload)
def __init__(self, data_path):
    """Load per-commit diff (.diff) and metadata (.json) files.

    Pairs files by SHA; warns and keeps only the intersection when the
    two directories disagree. Populates ``self.diff`` (PatchSet per sha)
    and ``self.metadata`` (parsed JSON per sha).
    """
    json_path = path.join(data_path, "json")
    diffs_path = path.join(data_path, "diff")
    diff_files = listdir(diffs_path)
    json_files = listdir(json_path)
    shas_diff = [f.replace('.diff', '') for f in diff_files]
    shas_json = [f.replace('.json', '') for f in json_files]
    if not set(shas_diff) == set(shas_json):
        warnings.warn("There were missing files")
        self.shas = list(set(shas_diff) & set(shas_json))
    else:
        self.shas = shas_json
    self.diff = {}
    self.metadata = {}
    for sha in self.shas:
        diff_filepath = path.join(diffs_path, sha + '.diff')
        try:
            # BUG FIX: the file was opened in text mode and .decode() was
            # called on a str, which raises AttributeError on Python 3
            # (and the broad except silently skipped every sha). Read raw
            # bytes and decode explicitly instead.
            with open(diff_filepath, 'rb') as diff_file:
                diff = diff_file.read().decode('utf-8')
            diff_data = PatchSet(diff.splitlines())
            json_filepath = path.join(json_path, sha + '.json')
            with open(json_filepath, 'r') as json_file:
                json_data = json.load(json_file)
            self.diff[sha] = diff_data
            self.metadata[sha] = json_data
        except Exception as e:
            # Best-effort load: a bad pair is reported but not fatal.
            warnings.warn("Problem in sha " + sha)
def test_mypy_lint_a_py(app, pr_context):
    diff = """diff --git a/a.py b/a.py
new file mode 100644
index 0000000..87604af
--- /dev/null
+++ b/a.py
@@ -0,0 +1,5 @@
+def p() -> None:
+    print('hello')
+
+
+a = p()
"""

    linter_spec = Specification()
    linter_spec.linters.append(ObjectDict(name='mypy', pattern=None))
    processor = LintProcessor(pr_context, linter_spec,
                              os.path.join(FIXTURES_PATH, 'mypy'))
    changes = PatchSet(diff.split('\n'))
    with mock.patch.object(processor, 'load_changes') as mocked_changes, \
            mock.patch.object(processor, 'update_build_status') as mocked_status, \
            mock.patch.object(processor, '_report') as mocked_report:
        mocked_changes.return_value = changes
        mocked_status.return_value = None
        mocked_report.return_value = (1, 2)
        processor.problems.set_changes(changes)
        processor.process()
        assert mocked_changes.called

    # mypy flags the bad assignment on the last added line
    assert len(processor.problems) == 1
    first = processor.problems[0]
    assert first.filename == 'a.py'
    assert first.line == 5
def sort_ig_updates():
    """Sort IG hashes for every data file touched by the last commit.

    Non-IG files are left unchanged. Commit dates are also added for any
    photos that do not have them yet.
    """
    repo = git.Repo(".")
    previous = repo.commit("HEAD~1")
    current = repo.commit("HEAD")
    raw_diff = repo.git.diff(previous, current,
                             ignore_blank_lines=True,
                             ignore_space_at_eol=True)
    # Record entity/photo commit dates first: this step can change the
    # URIs used for tracking in the commits.
    update_entity_commit_dates(previous.hexsha)
    update_photo_commit_dates(previous.hexsha)
    # Walk the patch and re-sort any updated data file.
    for change in PatchSet(raw_diff):
        name = change.path
        if name.startswith("links"):
            # Links files are not sorted
            continue
        if ".txt" not in name:
            # Only .txt data files matter
            continue
        elif change.added > 0:
            sort_ig_hashes(name)
def test_yamllint_a_yml(app, pr_context):
    diff = """diff --git a/a.yml b/a.yml
new file mode 100644
index 0000000..1eccee8
--- /dev/null
+++ b/a.yml
@@ -0,0 +1,3 @@
+---
+a: 1
+a: 2
"""

    linter_spec = Specification()
    linter_spec.linters.append(ObjectDict(name='yamllint', pattern=None))
    processor = LintProcessor(pr_context, linter_spec,
                              os.path.join(FIXTURES_PATH, 'yamllint'))
    changes = PatchSet(diff.split('\n'))
    with mock.patch.object(processor, 'load_changes') as mocked_changes, \
            mock.patch.object(processor, 'update_build_status') as mocked_status, \
            mock.patch.object(processor, '_report') as mocked_report:
        mocked_changes.return_value = changes
        mocked_status.return_value = None
        mocked_report.return_value = (1, 2)
        processor.problems.set_changes(changes)
        processor.process()
        assert mocked_changes.called

    # yamllint reports the duplicated key on line 3
    assert len(processor.problems) == 1
    first = processor.problems[0]
    assert first.filename == 'a.yml'
    assert first.line == 3
def check_diff(self):
    """Validate the PR diff: reject orphans, empty diffs and edits to
    pre-existing files; otherwise return the parsed diff payload."""
    diff_text = requests.get(self.pr_info['diff_url'], auth=API_AUTH).text
    if diff_text == "Sorry, this diff is unavailable.":
        self.add_invalid(
            'Your PR looks like an ORPHAN (you deleted the fork). This cannot be automatically checked. Please close this PR and create a new one without removing the fork.'
        )
        return
    patch = PatchSet(diff_text)
    file_count = diff_text.count("diff --git")
    if file_count < 1:
        self.add_invalid(
            'Less than one file has been added/removed/modified.')
        return
    if any(changed.is_modified_file for changed in patch):
        self.add_attention(
            'This PR modifies one or more pre-existing files.')
        return
    new_file = self.parse_diff(diff_text.split("\n"))
    return {'lines': new_file, 'diff': patch, 'diff_file': diff_text}
def test_no_changed_files_ignore(app, caplog):
    diff = """diff --git a/removed_file b/removed_file
deleted file mode 100644
index 1f38447..0000000
--- a/removed_file
+++ /dev/null
@@ -1,3 +0,0 @@
-This content shouldn't be here.
-
-This file will be removed.
"""

    # A diff that only deletes files yields no lintable changes.
    ctx = TestContext('deepanalyzer/badwolf', None, 'pullrequest', 'message',
                      {'commit': {'hash': '000000'}},
                      {'commit': {'hash': '111111'}},
                      pr_id=1)
    linter_spec = Specification()
    linter_spec.linters.append(ObjectDict(name='flake8', pattern=None))
    processor = LintProcessor(ctx, linter_spec, '/tmp')
    changes = PatchSet(diff.split('\n'))
    with mock.patch.object(processor, 'load_changes') as mocked_changes:
        mocked_changes.return_value = changes
        processor.process()
        assert mocked_changes.called

    assert 'No changed files found' in caplog.text()
def __init__(self):
    """Snapshot repo state and prime the tracking structures, then run."""
    self.current_time = int(time.time())
    self.repo = git.Repo(".")
    self.prior_commit = self._starting_commit(COMMIT_AGE)
    self.current_commit = self.repo.commit("HEAD")
    self.diff_raw = self.repo.git.diff(self.prior_commit,
                                       self.current_commit,
                                       ignore_blank_lines=True,
                                       ignore_space_at_eol=True)
    self.patch = PatchSet(self.diff_raw)
    self.locator_to_photo = {}
    self.entity_to_commit_date = {}
    # Per-category "already processed" markers
    self.seen = {
        "media": {},
        "panda": {},
        "photos": {},
        "wild": {},
        "zoo": {},
    }
    # Accumulators for the update report
    self.updates = {
        "authors": [],
        "author_count": 0,
        "entities": [],
        "pandas": [],
        "panda_count": 0,
        "photos": [],
        "zoos": [],
        "zoo_count": 0,
    }
    self.create_updates()
def run(self):
    """Compute diff info between HEAD and the configured ref and attach it
    as build data.

    Two remote git commands are run: ``merge-base`` to find the common
    ancestor, then ``diff`` against it. Either command failing short-circuits
    with that command's result code.
    """
    # Step 1: find the merge-base between HEAD and the comparison ref.
    command = ['git', 'merge-base', 'HEAD', self._compare_to_ref]
    cmd = yield self.makeRemoteShellCommand(
        command=command, stdioLogName='stdio-merge-base', collectStdout=True)
    yield self.runCommand(cmd)
    log = yield self.getLog("stdio-merge-base")
    log.finish()
    if cmd.results() != results.SUCCESS:
        return cmd.results()

    commit = cmd.stdout.strip()
    self.setProperty('diffinfo-merge-base-commit', commit, 'GitDiffInfo')

    # Step 2: diff merge-base..HEAD. The observer captures stdout so the
    # patch text can be parsed below; -U0 drops context lines, --no-prefix
    # drops the a/ b/ path prefixes.
    self.addLogObserver('stdio-diff', self._observer)
    command = ['git', 'diff', '--no-prefix', '-U0', commit, 'HEAD']
    cmd = yield self.makeRemoteShellCommand(command=command, stdioLogName='stdio-diff')
    yield self.runCommand(cmd)
    if cmd.results() != results.SUCCESS:
        return cmd.results()

    from unidiff import PatchSet
    # metadata_only skips full hunk parsing; only file/line info is needed.
    patchset = PatchSet(self._observer.getStdout(), metadata_only=True)
    data = json.dumps(self._convert_patchset(patchset)).encode('utf-8')
    yield self.setBuildData(self._data_name, data, 'GitDiffInfo')
    return cmd.results()
def parse(diff):
    """Split a unified diff into (added, removed, changed) line-number sets.

    Only the first patched file in the diff is examined. Line numbers
    present in both the added and removed sets are reclassified as
    "changed" and removed from the other two sets.
    """
    added = Set()
    removed = Set()
    diff_stream = StringIO.StringIO(diff)
    try:
        patch = PatchSet(diff_stream)
        if len(patch) > 0:
            for hunk in patch[0]:
                offset = 0
                for line in hunk:
                    if line.is_added:
                        added.add(line.target_line_no)
                        offset += 1
                    elif line.is_removed:
                        # Shift removed source lines by the net insertions
                        # seen so far in this hunk.
                        removed.add(line.source_line_no + offset)
                        offset -= 1
    finally:
        # BUG FIX: the original had `diff_stream.close` (no call), so the
        # stream was never actually closed.
        diff_stream.close()
    # Changed lines are in both the added and removed sets
    changed = Set(added.intersection(removed))
    added.difference_update(changed)
    removed.difference_update(changed)
    return (added, removed, changed)
def diff_to_added_and_removed_lines(diff_text):
    """Partition every +/- diff line into added/removed entry dicts."""
    added_lines = []
    removed_lines = []
    for patched_file in PatchSet(diff_text):
        file = filepath(patched_file)
        for hunk in patched_file:
            for line in hunk:
                leading_whitespace, trim_text = \
                    split_to_leading_whitespace_and_trim_text(line.value.rstrip('\n'))
                if line.is_added:
                    bucket = added_lines
                    line_no = line.target_line_no
                elif line.is_removed:
                    bucket = removed_lines
                    line_no = line.source_line_no
                else:
                    # Context lines are not reported
                    continue
                bucket.append({
                    'file': file,
                    'line_no': line_no,
                    'trim_text': trim_text,
                    'leading_whitespaces': leading_whitespace,
                })
    return {
        'added_lines': added_lines,
        'removed_lines': removed_lines,
    }
def test_flake8_lint_a_py(app, pr_context):
    diff = """diff --git a/a.py b/a.py
new file mode 100644
index 0000000..fdeea15
--- /dev/null
+++ b/a.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
+
+
+def add(a, b):
+    return a+ b
"""

    linter_spec = Specification()
    linter_spec.linters.append(ObjectDict(name='flake8', pattern=None))
    processor = LintProcessor(pr_context, linter_spec,
                              os.path.join(FIXTURES_PATH, 'flake8'))
    changes = PatchSet(diff.split('\n'))
    with mock.patch.object(processor, 'load_changes') as mocked_changes, \
            mock.patch.object(processor, 'update_build_status') as mocked_status, \
            mock.patch.object(processor, '_report') as mocked_report:
        mocked_changes.return_value = changes
        mocked_status.return_value = None
        mocked_report.return_value = (1, 2)
        processor.problems.set_changes(changes)
        processor.process()
        assert mocked_changes.called

    # flake8 flags the missing space around '+' on line 6
    assert len(processor.problems) == 1
    first = processor.problems[0]
    assert first.filename == 'a.py'
    assert first.line == 6
def test_hadolint_lint_a_dockerfile(app, pr_context):
    diff = """diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..cd19857
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,3 @@
+FROM ubuntu:16.04
+
+RUN apt-get update && apt-get install file
"""

    linter_spec = Specification()
    linter_spec.linters.append(ObjectDict(name='hadolint', pattern=None))
    processor = LintProcessor(pr_context, linter_spec,
                              os.path.join(FIXTURES_PATH, 'hadolint'))
    changes = PatchSet(diff.split('\n'))
    with mock.patch.object(processor, 'load_changes') as mocked_changes, \
            mock.patch.object(processor, 'update_build_status') as mocked_status, \
            mock.patch.object(processor, '_report') as mocked_report:
        mocked_changes.return_value = changes
        mocked_status.return_value = None
        mocked_report.return_value = (1, 2)
        processor.problems.set_changes(changes)
        processor.process()
        assert mocked_changes.called

    # hadolint raises four issues; the first points at the RUN on line 3
    assert len(processor.problems) == 4
    first = processor.problems[0]
    assert first.filename == 'Dockerfile'
    assert first.line == 3
def _ParsePatchPerFile(self, files):
    """Reshape Github per-file patches into {filename: status/sha/deltas}."""
    patched_files = {}
    for patched_file in files:
        entry = {
            "status": patched_file.status,
            "sha": patched_file.sha,
            "deltas": [],
        }
        patched_files[patched_file.filename] = entry
        # unidiff needs a file header; Github's `patch` field omits it.
        patch_str = io.StringIO()
        patch_str.write("--- a\n+++ b\n")
        if patched_file.patch is not None:
            patch_str.write(patched_file.patch)
        patch_str.seek(0)
        logging.debug(f"Parsing diff\n{patch_str.getvalue()}")
        patch = PatchSet(patch_str, encoding=None)
        for hunk in patch[0]:
            for line in hunk:
                if line.is_context:
                    continue
                # Store the raw Line attributes for each +/- delta.
                entry["deltas"].append(vars(line))
    return patched_files
def test_parse_diff_with_new_and_modified_binary_files(self):
    """Parse git diff file with newly added and modified binaries files."""
    sample_path = os.path.join(self.samples_dir, 'samples/sample8.diff')
    with open(sample_path, 'r') as diff_file:
        res = PatchSet(diff_file)
    # the patch touches three binary files
    self.assertEqual(len(res), 3)
    # first file is added
    self.assertTrue(res[0].is_added_file)
    self.assertFalse(res[0].is_modified_file)
    self.assertFalse(res[0].is_removed_file)
    self.assertTrue(res[0].is_binary_file)
    # second file is modified (the original comment said "added")
    self.assertTrue(res[1].is_modified_file)
    self.assertFalse(res[1].is_added_file)
    self.assertFalse(res[1].is_removed_file)
    self.assertTrue(res[1].is_binary_file)
    # third file is removed
    self.assertTrue(res[2].is_removed_file)
    self.assertFalse(res[2].is_modified_file)
    self.assertFalse(res[2].is_added_file)
    self.assertTrue(res[2].is_binary_file)
def parse_unidiff(diff):
    """Collect target line numbers of added/modified lines in a diff.

    ``diff`` has the usual unified-diff shape, e.g.::

        Index: hello/test.txt
        ===================================================================
        --- hello/test.txt  (revision 6)
        +++ hello/test.txt  (revision 7)
        @@ -1,4 +1,3 @@
        ...

    :param diff: the diff result string
    :return: list of changed target line numbers. Deletions are ignored;
        only additions and modifications matter here.
    """
    if diff is None or len(diff) == 0:
        return []
    change = []
    # Usually there is only one patched file, but handle any number.
    for patched_file in PatchSet(diff.split('\n'), encoding='utf8'):
        for hunk in patched_file:
            change.extend(
                line.target_line_no
                for line in hunk
                if line.target_line_no is not None and not line.is_context
            )
    return change
def extract_lines(cls, data, query=None, normalizer=None):
    """Return a generator of normalized diff lines matching ``query``.

    Defaults: ``query`` keeps added/removed lines, ``normalizer`` is
    ``cls.normalize``. Files and hunks are pre-filtered by the class's
    extension/hunk predicates.
    """
    if query is None:
        query = cls.is_added_or_removed
    if normalizer is None:
        normalizer = cls.normalize
    # Parse eagerly; only the line traversal below is lazy.
    patch = PatchSet(data)
    return (
        normalizer(line)
        for file in patch
        if cls.has_target_extension(file)
        for hunk in file
        if cls.is_target_hunk(hunk)
        for line in hunk
        if query(line)
    )
def test_bandit_lint_a_py(app, pr_context):
    diff = """diff --git a/a.py b/a.py
new file mode 100644
index 0000000..719cd56
--- /dev/null
+++ b/a.py
@@ -0,0 +1,4 @@
+try:
+    a = 1
+except Exception:
+    pass
"""

    linter_spec = Specification()
    linter_spec.linters.append(ObjectDict(name='bandit'))
    processor = LintProcessor(pr_context, linter_spec,
                              os.path.join(FIXTURES_PATH, 'bandit'))
    changes = PatchSet(diff.split('\n'))
    with mock.patch.object(processor, 'load_changes') as mocked_changes, \
            mock.patch.object(processor, 'update_build_status') as mocked_status, \
            mock.patch.object(processor, '_report') as mocked_report:
        mocked_changes.return_value = changes
        mocked_status.return_value = None
        mocked_report.return_value = (1, 2)
        processor.problems.set_changes(changes)
        processor.process()
        assert mocked_changes.called

    # bandit warns (not errors) about try/except/pass on line 3
    assert len(processor.problems) == 1
    first = processor.problems[0]
    assert first.filename == 'a.py'
    assert first.line == 3
    assert not first.is_error
def test_rstlint_a_rst(app, pr_context):
    diff = """diff --git a/a.rst b/a.rst
new file mode 100644
index 0000000..4e46cf9
--- /dev/null
+++ b/a.rst
@@ -0,0 +1,2 @@
+Hello World
+====
"""

    linter_spec = Specification()
    linter_spec.linters.append(ObjectDict(name='rstlint'))
    processor = LintProcessor(pr_context, linter_spec,
                              os.path.join(FIXTURES_PATH, 'rstlint'))
    changes = PatchSet(diff.split('\n'))
    with mock.patch.object(processor, 'load_changes') as mocked_changes, \
            mock.patch.object(processor, 'update_build_status') as mocked_status, \
            mock.patch.object(processor, '_report') as mocked_report:
        mocked_changes.return_value = changes
        mocked_status.return_value = None
        mocked_report.return_value = (1, 2)
        processor.problems.set_changes(changes)
        processor.process()
        assert mocked_changes.called

    # rstlint flags the under-length title underline on line 2
    assert len(processor.problems) == 1
    first = processor.problems[0]
    assert first.filename == 'a.rst'
    assert first.line == 2
def __get_modified_lines(self):
    """Return target line numbers of all added/modified files in the diff."""
    try:
        patch = PatchSet(self.__diff)
    except UnidiffParseError:
        # Some producers omit the diff header; prepend a synthetic one.
        patch = PatchSet(
            "diff --git a/{} b/{}\n--- a/{}\n+++ b/{}\n{}".format(
                self.__file, self.__file, self.__file, self.__file,
                self.__diff))
    collected = []
    for changed_file in patch.added_files + patch.modified_files:
        for hunk in changed_file:
            collected += hunk.target_lines()
    return [line.target_line_no for line in collected]
def from_repo(cls, repo: Repo, mutant_id: int = None):
    """Build a Mutant from the repo's current uncommitted change.

    Parses the working-tree diff (first file, first hunk) to capture the
    changed line, its replacement, and the containing method.
    """
    diff = repo.git.diff(repo.head, None, '--unified=0')
    patchset = PatchSet(diff)
    modified_file_path = patchset[0].target_file[
        2:]  # Remove "b/" from the path
    changed_sourcecode_line = patchset[0][0].source_start
    previous_line = ''
    current_line = ''
    for line in patchset[0][0]:
        if line.is_added:
            current_line = str(line)[2:]
        if line.is_removed:
            # BUG FIX: removed lines previously overwrote current_line,
            # so previous_line always stayed '' — they belong to the
            # pre-mutation text.
            previous_line = str(line)[2:]
    mutant = Mutant(mutant_id=mutant_id,
                    modified_file_path=modified_file_path,
                    line_number_changed=changed_sourcecode_line,
                    previous_line=previous_line,
                    current_line=current_line,
                    repo_path=repo.working_dir)
    mutant.modified_method = SemanticMutantAnalysis(mutant).method_name()
    # A wider-context diff is needed for the regex-based context analysis.
    regex_diff = repo.git.diff(repo.head, None, '--unified=7')
    mutant.context_analysis(regex_diff)
    return mutant
def test_get_patch_lines_with_renames():
    with open("sample_with_renames.diff") as diff_file:
        patch = PatchSet(diff_file.read())
    lines = get_patch_lines(patch)
    # Renamed files must still be tracked under their new path
    assert len(lines) == 3
    assert "files/en-us/web/api/transitionevent/propertyname/index.html" in lines
def test_stylelint_lint_a_scss(app, pr_context):
    diff = """diff --git a/a.scss b/a.scss
new file mode 100644
index 0000000..e545209
--- /dev/null
+++ b/a.scss
@@ -0,0 +1 @@
+a[id="foo"] { content: "x"; }
"""

    linter_spec = Specification()
    linter_spec.linters.append(ObjectDict(name='stylelint', pattern=None))
    processor = LintProcessor(pr_context, linter_spec,
                              os.path.join(FIXTURES_PATH, 'stylelint'))
    changes = PatchSet(diff.split('\n'))
    with mock.patch.object(processor, 'load_changes') as mocked_changes, \
            mock.patch.object(processor, 'update_build_status') as mocked_status, \
            mock.patch.object(processor, '_report') as mocked_report:
        mocked_changes.return_value = changes
        mocked_status.return_value = None
        mocked_report.return_value = (1, 2)
        processor.problems.set_changes(changes)
        processor.process()
        assert mocked_changes.called

    # stylelint reports two problems on the single-line scss file
    assert len(processor.problems) == 2
    first = processor.problems[0]
    assert first.filename == 'a.scss'
def insert(cls, pull_request_id: str, diff: str):
    """Store a diff row keyed by (pull_request_id, sha256 of the diff).

    Skips insertion when an identical diff is already stored, and marks
    every other diff of this pull request as not-most-recent.
    """
    digest = hashlib.sha256(diff.encode('utf-8')).hexdigest()
    with session_scope() as session:
        try:
            # Probe for an identical (pr, hash) row; .one() raises when absent.
            (session.query(Diffs).filter(
                and_(Diffs.pull_request_id == pull_request_id,
                     Diffs.diff_hash == digest)).one())
        except NoResultFound:
            patch = PatchSet(diff)
            record = Diffs()
            record.pull_request_id = pull_request_id
            record.diff_hash = digest
            record.diff = diff
            record.added_lines = patch.added
            record.removed_lines = patch.removed
            record.added_files = len(patch.added_files)
            record.modified_files = len(patch.modified_files)
            record.removed_files = len(patch.removed_files)
            session.add(record)
        # NOTE(review): demote every other diff of this PR; assumed to run
        # whether or not a new row was inserted — confirm against original
        # indentation.
        (session.query(Diffs).filter(
            and_(Diffs.diff_hash != digest,
                 Diffs.pull_request_id == pull_request_id)).update(
                     {Diffs.is_most_recent: False}))
def flake8_scan_file(commit_sha, owner, repo, parent_sha=None):
    """Runs flake8 scan on all changed files and returns array of warnings"""
    # Default the comparison base to the commit's parent.
    if parent_sha is None:
        parent_sha = get_commit_parent(commit_sha, owner, repo)
    diff_url = GIT_COMPARE_URL.format(base=parent_sha,
                                      head=commit_sha,
                                      owner=owner,
                                      repo=repo,
                                      host=host_api)
    # Fetch the compare payload twice: once as JSON (file list / metadata),
    # once as a raw unified diff (for line-level change detection).
    diff_info = get(diff_url, auth=auth).json()
    diff_content = get(diff_url, auth=auth, headers={
        "Accept": "application/vnd.github.v3.diff"
    }).content.decode('utf8')
    patch_set = PatchSet(diff_content)
    comments_per_file = {}
    for file in diff_info['files']:
        # Download the full file content and stage it in a scratch file so
        # flake8 can scan it from disk.
        content = get(file['contents_url'], auth=auth).json()
        file_content = get(content['download_url']).content
        with open("flake8_tmp_file.py", 'wb') as test_file:
            test_file.write(file_content)
        style_guide = flake8.get_style_guide(ignore=['E24', 'W503'])
        style_guide.input_file('./flake8_tmp_file.py', )
        # NOTE(review): reaches into flake8's private internals to read the
        # raw results; fragile across flake8 versions — confirm on upgrade.
        results = style_guide._application.file_checker_manager.checkers[
            0].results
        comments_per_line = {}
        for code, line_n, offset, text, src in results:
            # Keep only warnings on lines this diff actually changed.
            if changed_in_diff(get_file_by_name(patch_set, file['filename']),
                              line_n):
                comments = comments_per_line.get(line_n, [])
                comments.append((file['filename'], line_n, offset, code, text))
                comments_per_line[line_n] = comments
        comments_per_file[file['filename']] = comments_per_line
    return comments_per_file