def test_generate_removed_file(fake_hg_repo, tmpdir):
    """Check that a file removed in Mercurial is removed in the git mirror.

    A file is added in one revision and removed in the next. The mirror must
    carry one commit per revision, must not contain the file afterwards, and
    its HEAD commit must show the deletion.
    """
    hg, local = fake_hg_repo
    mirror_path = os.path.join(tmpdir.strpath, "repo")
    file_name = "file.cpp"

    add_file(
        hg,
        local,
        file_name,
        """#include <iostream> /* main */ int main() { return 0; }""",
    )
    added_rev = commit(hg)

    remove_file(hg, local, file_name)
    removed_rev = commit(hg)

    # The file must already be gone from the Mercurial working copy.
    assert not os.path.exists(os.path.join(local, file_name))

    options = {
        "rev_start": 0,
        "rev_end": "tip",
        "limit": None,
        "tokenize": True,
        "remove_comments": True,
    }
    generator.generate(local, mirror_path, **options)

    mirror = pygit2.Repository(mirror_path)
    walk_order = pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_REVERSE
    history = list(mirror.walk(mirror.head.target, walk_order))

    expected_first = f"""Commit A file.cpp UltraBlame original commit: {added_rev}"""
    expected_second = f"""Commit R file.cpp UltraBlame original commit: {removed_rev}"""
    assert history[0].message == expected_first
    assert history[1].message == expected_second

    assert not os.path.exists(os.path.join(mirror_path, file_name))

    shown = subprocess.run(
        ["git", "show", "HEAD"], cwd=mirror_path, capture_output=True, check=True
    )
    deletion_marker = b"diff --git a/file.cpp b/file.cpp\ndeleted file mode 100644"
    assert deletion_marker in shown.stdout
def test_generate_tokenized_and_comments_removed_unusupported_extension(
    fake_hg_repo, tmpdir
):
    """Check generation for a file whose extension is not a known language.

    Both tokenization and comment removal are requested; the expected output
    is tokenized and still contains the comment tokens.
    """
    hg, local = fake_hg_repo
    mirror_path = os.path.join(tmpdir.strpath, "repo")
    file_name = "file.surely_unsupported"

    add_file(
        hg,
        local,
        file_name,
        """#include <iostream> /* main */ int main() { return 0; }""",
    )
    first_rev = commit(hg)

    options = {
        "rev_start": 0,
        "rev_end": "tip",
        "limit": None,
        "tokenize": True,
        "remove_comments": True,
    }
    generator.generate(local, mirror_path, **options)

    mirror = pygit2.Repository(mirror_path)
    walk_order = pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_REVERSE
    history = list(mirror.walk(mirror.head.target, walk_order))

    expected_message = f"""Commit A file.surely_unsupported UltraBlame original commit: {first_rev}"""
    assert history[0].message == expected_message

    with open(os.path.join(mirror_path, file_name), "r") as handle:
        generated = handle.read()

    expected_content = """# include < iostream > / * main * / int main ( ) { return 0 ; } """
    assert generated == expected_content
def test_generate_binary_remove_comments(fake_hg_repo, tmpdir):
    """Check that a binary file is mirrored unchanged with comment removal on."""
    hg, local = fake_hg_repo
    mirror_path = os.path.join(tmpdir.strpath, "repo")
    file_name = "an_object_file"
    payload = b"\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00"

    add_file(hg, local, file_name, payload)
    revision = commit(hg)

    options = {
        "rev_start": 0,
        "rev_end": "tip",
        "limit": None,
        "tokenize": False,
        "remove_comments": True,
    }
    generator.generate(local, mirror_path, **options)

    mirror = pygit2.Repository(mirror_path)
    walk_order = pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_REVERSE
    history = list(mirror.walk(mirror.head.target, walk_order))

    expected_message = f"""Commit A an_object_file UltraBlame original commit: {revision}"""
    assert history[0].message == expected_message

    # Read back in binary mode: the bytes must be exactly what was committed.
    with open(os.path.join(mirror_path, file_name), "rb") as handle:
        mirrored = handle.read()

    assert mirrored == payload
def main():
    """Parse the command line and run the microannotate generator."""
    parser = argparse.ArgumentParser(
        description="Generate a mirror git repository where content is split by word"
    )
    parser.add_argument("cache-root", help="Cache for repository clones.")

    # The positional name contains a dash, so it cannot be read with dotted
    # attribute access; look it up in the namespace dictionary instead.
    options = vars(parser.parse_args())

    MicroannotateGenerator(options["cache-root"]).generate()
def generate(self):
    """Generate the transformed git repository step by step and push it.

    The source repository and the git mirror are prepared through the
    executor, then ``generator.generate`` is called with a limit of
    ``COMMITS_STEP`` until it reports completion. After every step the
    mirror is pushed and the DB version file is uploaded.

    When the stored DB schema is old, the mirror is initialised from scratch
    (instead of cloned) and pushes are forced.
    """
    db_path = os.path.join("data", self.git_repo_path)
    db.register(
        db_path,
        "https://s3-us-west-2.amazonaws.com/communitytc-bugbug/data/",
        VERSION,
    )

    is_old_version = db.is_old_schema(db_path)

    with ThreadPoolExecutorResult(max_workers=2) as executor:
        cloner = executor.submit(repository.clone, self.repo_dir)
        cloner.add_done_callback(
            lambda future: logger.info("mozilla-central cloned")
        )

        git_user = get_secret("GIT_USER")
        git_password = get_secret("GIT_PASSWORD")

        # Embed the credentials in the URL used for pushing.
        repo_push_url = self.repo_url.replace(
            "https://", f"https://{git_user}:{git_password}@"
        )

        if not is_old_version:
            executor.submit(self.clone_git_repo)
        else:
            executor.submit(self.init_git_repo)

    # tenacity.retry() treats a single callable positional argument as the
    # bare-decorator form and ignores any keyword arguments, so the policy
    # must be configured first and the function wrapped afterwards.
    with_retries = tenacity.retry(
        wait=tenacity.wait_fixed(30),
        stop=tenacity.stop_after_attempt(5),
    )

    with_retries(
        lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        )
    )()

    push_args = ["git", "push", repo_push_url, "master"]
    if is_old_version:
        push_args.append("--force")

    done = False
    while not done:
        done = generator.generate(
            self.repo_dir,
            self.git_repo_path,
            limit=COMMITS_STEP,
            tokenize=self.tokenize,
            remove_comments=self.remove_comments,
        )

        with_retries(
            lambda: subprocess.run(push_args, cwd=self.git_repo_path, check=True)
        )()

        # We are not using db.upload as we don't need to upload the git repo.
        upload_s3([f"{db_path}.version"])
def generate(self):
    """Clone mozilla-central, extend the wordified git mirror and push it.

    Steps:
      1. Run ``hg robustcheckout`` of mozilla-central into ``self.repo_dir``.
      2. Clone the git mirror and pull its ``master`` branch.
      3. Run ``generator.generate`` with a limit of 10000 and record whether
         it finished in a file named ``done`` (``1`` or ``0``).
      4. Push the mirror using credentials read from the secrets.

    Raises:
        hglib.error.CommandError: if the robustcheckout command fails.
        subprocess.CalledProcessError: if ``git pull`` fails for a reason
            other than the remote ``master`` ref not existing yet.
    """
    shared_dir = self.repo_dir + "-shared"
    cmd = hglib.util.cmdbuilder(
        "robustcheckout",
        "https://hg.mozilla.org/mozilla-central",
        self.repo_dir,
        purge=True,
        sharebase=shared_dir,
        networkattempts=7,
        branch=b"tip",
    )
    cmd.insert(0, hglib.HGPATH)

    proc = hglib.util.popen(cmd)
    out, err = proc.communicate()
    if proc.returncode:
        raise hglib.error.CommandError(cmd, proc.returncode, out, err)

    logger.info("mozilla-central cloned")

    git_user = get_secret("GIT_USER")
    git_password = get_secret("GIT_PASSWORD")

    repo_url = "https://github.com/marco-c/gecko-dev-wordified"
    # Derive the authenticated URL from repo_url so the two cannot drift apart.
    repo_push_url = repo_url.replace(
        "https://", f"https://{git_user}:{git_password}@"
    )
    git_repo_path = os.path.basename(repo_url)

    retry(
        lambda: subprocess.run(
            ["git", "clone", repo_url, git_repo_path], check=True
        )
    )

    try:
        retry(
            lambda: subprocess.run(
                ["git", "pull", repo_url, "master"],
                cwd=git_repo_path,
                capture_output=True,
                check=True,
            )
        )
    except subprocess.CalledProcessError as e:
        # When the repo is empty there is no master ref to pull; that is the
        # only failure that may be ignored. git reports fatal errors on
        # stderr and the capitalisation of this message differs between git
        # versions, so search both streams case-insensitively.
        output = ((e.stdout or b"") + (e.stderr or b"")).lower()
        if b"couldn't find remote ref master" not in output:
            raise

    done = generator.generate(self.repo_dir, git_repo_path, limit=10000)

    with open("done", "w") as f:
        f.write(str(1 if done else 0))

    retry(
        lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        )
    )

    retry(
        lambda: subprocess.run(
            ["git", "push", repo_push_url, "master"],
            cwd=git_repo_path,
            check=True,
        )
    )
def generate(self):
    """Generate the transformed git repository step by step and push it.

    The source repository and the git mirror are prepared through the
    executor, then ``generator.generate`` is called with a limit of
    ``COMMITS_STEP`` until it reports completion, pushing the mirror after
    every step.

    The schema version check is currently disabled (see the TODO below), so
    the mirror is always cloned and pushes are never forced.
    """
    db_path = os.path.join("data", self.git_repo_path)
    db.register(
        db_path,
        f"https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.microannotate_{self.git_repo_path}.latest/artifacts/public/",
        VERSION,
    )

    # TODO: Check the version again once we can run tasks for longer (https://bugzilla.mozilla.org/show_bug.cgi?id=1604175).
    is_old_version = False  # db.is_old_schema(db_path)

    with ThreadPoolExecutorResult(max_workers=2) as executor:
        cloner = executor.submit(repository.clone, self.repo_dir)
        cloner.add_done_callback(
            lambda future: logger.info("mozilla-central cloned")
        )

        git_user = get_secret("GIT_USER")
        git_password = get_secret("GIT_PASSWORD")

        # Embed the credentials in the URL used for pushing.
        repo_push_url = self.repo_url.replace(
            "https://", f"https://{git_user}:{git_password}@"
        )

        if not is_old_version:
            executor.submit(self.clone_git_repo)
        else:
            executor.submit(self.init_git_repo)

    # tenacity.retry() treats a single callable positional argument as the
    # bare-decorator form and ignores any keyword arguments, so the policy
    # must be configured first and the function wrapped afterwards.
    with_retries = tenacity.retry(
        wait=tenacity.wait_fixed(30),
        stop=tenacity.stop_after_attempt(5),
    )

    with_retries(
        lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        )
    )()

    push_args = ["git", "push", repo_push_url, "master"]
    if is_old_version:
        push_args.append("--force")

    done = False
    while not done:
        done = generator.generate(
            self.repo_dir,
            self.git_repo_path,
            limit=COMMITS_STEP,
            tokenize=self.tokenize,
            remove_comments=self.remove_comments,
        )

        with_retries(
            lambda: subprocess.run(push_args, cwd=self.git_repo_path, check=True)
        )()
def generate(self):
    """Clone the source repository, extend the git mirror in steps and push it.

    The mirror at ``self.repo_url`` is cloned and its ``master`` branch
    pulled. ``generator.generate`` is then run up to ``STEPS`` times, each
    limited to ``TOTAL_COMMITS // STEPS`` commits. After every step the
    completion flag is written to a file named ``done`` (``1`` or ``0``) and
    the mirror is pushed; the loop stops early once generation is complete.

    Raises:
        subprocess.CalledProcessError: if ``git pull`` fails for a reason
            other than the remote ``master`` ref not existing yet.
    """
    repository.clone(self.repo_dir)
    logger.info("mozilla-central cloned")

    git_user = get_secret("GIT_USER")
    git_password = get_secret("GIT_PASSWORD")

    # Embed the credentials in the URL used for pushing.
    repo_push_url = self.repo_url.replace(
        "https://", f"https://{git_user}:{git_password}@"
    )
    git_repo_path = os.path.basename(self.repo_url)

    retry(
        lambda: subprocess.run(
            ["git", "clone", self.repo_url, git_repo_path], check=True
        )
    )

    try:
        retry(
            lambda: subprocess.run(
                ["git", "pull", self.repo_url, "master"],
                cwd=git_repo_path,
                capture_output=True,
                check=True,
            )
        )
    except subprocess.CalledProcessError as e:
        # When the repo is empty there is no master ref to pull; that is the
        # only failure that may be ignored. git reports fatal errors on
        # stderr and the capitalisation of this message differs between git
        # versions, so search both streams case-insensitively.
        output = ((e.stdout or b"") + (e.stderr or b"")).lower()
        if b"couldn't find remote ref master" not in output:
            raise

    retry(
        lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        )
    )

    for i in range(STEPS):
        logger.info(f"Step {i} out of {STEPS}")

        done = generator.generate(
            self.repo_dir,
            git_repo_path,
            limit=TOTAL_COMMITS // STEPS,
            tokenize=self.tokenize,
            remove_comments=self.remove_comments,
        )

        with open("done", "w") as f:
            f.write(str(1 if done else 0))

        retry(
            lambda: subprocess.run(
                ["git", "push", repo_push_url, "master"],
                cwd=git_repo_path,
                check=True,
            )
        )

        if done:
            break
def test_generate_comments_removed_bad_chars(fake_hg_repo, tmpdir):
    """Check comment removal on a Java file containing non-UTF-8 bytes.

    The block comment must disappear while the raw high bytes and the
    escaped ``\\uXXXX`` sequences inside the string literal stay untouched.
    """
    hg, local = fake_hg_repo
    mirror_path = os.path.join(tmpdir.strpath, "repo")
    file_name = "file.java"

    add_file(
        hg,
        local,
        file_name,
        b"""private static String /* comment */ utf8String = "Non-Ascii 1 byte chars: \x8e\x89\x8a\x88\x8c\x8d, 2 byte chars: \\u1234 \\u1235 \\u1236";""",
    )
    revision = commit(hg)

    options = {
        "rev_start": 0,
        "rev_end": "tip",
        "limit": None,
        "tokenize": False,
        "remove_comments": True,
    }
    generator.generate(local, mirror_path, **options)

    mirror = pygit2.Repository(mirror_path)
    walk_order = pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_REVERSE
    history = list(mirror.walk(mirror.head.target, walk_order))

    expected_message = f"""Commit A file.java UltraBlame original commit: {revision}"""
    assert history[0].message == expected_message

    # Binary mode: the content is not valid text in a single encoding.
    with open(os.path.join(mirror_path, file_name), "rb") as handle:
        generated = handle.read()

    expected_content = b"""private static String utf8String = "Non-Ascii 1 byte chars: \x8e\x89\x8a\x88\x8c\x8d, 2 byte chars: \\u1234 \\u1235 \\u1236";"""
    assert generated == expected_content
def main():
    """Parse the command line and run the microannotate generator.

    Positional arguments are the cache root and the mirror repository URL;
    ``--tokenize`` and ``--remove-comments`` are optional switches.
    """
    parser = argparse.ArgumentParser(
        description="Generate a mirror git repository where content is split by word"
    )
    parser.add_argument("cache-root", help="Cache for repository clones.")
    parser.add_argument("repo-url", help="Mirror repository URL.")
    parser.add_argument(
        "--tokenize", help="Enable word-level tokenization.", action="store_true"
    )
    parser.add_argument(
        "--remove-comments", help="Enable comment removal.", action="store_true"
    )

    # The positional names contain dashes, so they cannot be read with dotted
    # attribute access; look them up in the namespace dictionary instead.
    options = vars(parser.parse_args())

    MicroannotateGenerator(
        options["cache-root"],
        options["repo-url"],
        options["tokenize"],
        options["remove_comments"],
    ).generate()
def generate(self):
    """Clone both repositories in parallel, then extend and push the mirror.

    The source repository and the git mirror are cloned concurrently. Once
    both clones have finished, ``generator.generate`` is run up to ``STEPS``
    times, each limited to ``TOTAL_COMMITS // STEPS`` commits. After every
    step the completion flag is written to a file named ``done`` (``1`` or
    ``0``) and the mirror is pushed; the loop stops early once generation is
    complete.

    Any exception raised by one of the clone tasks is re-raised here.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        cloner = executor.submit(repository.clone, self.repo_dir)
        cloner.add_done_callback(
            lambda future: logger.info("mozilla-central cloned")
        )

        git_user = get_secret("GIT_USER")
        git_password = get_secret("GIT_PASSWORD")

        # Embed the credentials in the URL used for pushing.
        repo_push_url = self.repo_url.replace(
            "https://", f"https://{git_user}:{git_password}@"
        )
        git_repo_path = os.path.basename(self.repo_url)

        git_cloner = executor.submit(self.clone_git_repo, git_repo_path)

    # Leaving the executor only waits for the tasks; an exception raised in a
    # worker stays hidden until result() is called. Surface clone failures
    # now rather than generating on top of a missing or partial clone.
    cloner.result()
    git_cloner.result()

    retry(
        lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        )
    )

    for i in range(STEPS):
        logger.info(f"Step {i} out of {STEPS}")

        done = generator.generate(
            self.repo_dir,
            git_repo_path,
            limit=TOTAL_COMMITS // STEPS,
            tokenize=self.tokenize,
            remove_comments=self.remove_comments,
        )

        with open("done", "w") as f:
            f.write(str(1 if done else 0))

        retry(
            lambda: subprocess.run(
                ["git", "push", repo_push_url, "master"],
                cwd=git_repo_path,
                check=True,
            )
        )

        if done:
            break
def test_generate_progressive(fake_hg_repo, tmpdir):
    """Check that generation can be resumed on an existing mirror.

    The first run stops at the first revision; the second run goes up to
    ``tip`` on the same output repository and must only add the missing
    commit.
    """
    hg, local = fake_hg_repo
    mirror_path = os.path.join(tmpdir.strpath, "repo")
    cpp_path = os.path.join(mirror_path, "file.cpp")
    jsm_path = os.path.join(mirror_path, "file.jsm")
    walk_order = pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_REVERSE

    add_file(
        hg,
        local,
        "file.cpp",
        """#include <iostream> /* main */ int main() { return 0; }""",
    )
    first_rev = commit(hg)

    add_file(
        hg,
        local,
        "file.cpp",
        """#include <iostream> /* main */ int main() { cout << "Hello, world!"; return 0; }""",
    )
    add_file(
        hg,
        local,
        "file.jsm",
        """function ciao(str) { // Comment one console.log(str); }""",
    )
    second_rev = commit(hg)

    first_message = f"""Commit A file.cpp UltraBlame original commit: {first_rev}"""
    second_message = f"""Commit M file.cpp A file.jsm UltraBlame original commit: {second_rev}"""

    # First pass: only up to the first revision.
    generator.generate(
        local,
        mirror_path,
        rev_start=0,
        rev_end=first_rev,
        limit=None,
        tokenize=True,
        remove_comments=False,
    )

    mirror = pygit2.Repository(mirror_path)
    history = list(mirror.walk(mirror.head.target, walk_order))

    assert len(history) == 1
    assert history[0].message == first_message

    with open(cpp_path, "r") as handle:
        cpp_content = handle.read()
    assert (
        cpp_content
        == """# include < iostream > / * main * / int main ( ) { return 0 ; } """
    )

    assert not os.path.exists(jsm_path)

    # Second pass: continue up to tip on the same mirror.
    generator.generate(
        local,
        mirror_path,
        rev_start=0,
        rev_end="tip",
        limit=None,
        tokenize=True,
        remove_comments=False,
    )

    mirror = pygit2.Repository(mirror_path)
    history = list(mirror.walk(mirror.head.target, walk_order))

    assert len(history) == 2
    assert history[0].message == first_message
    assert history[1].message == second_message

    with open(cpp_path, "r") as handle:
        cpp_content = handle.read()
    assert (
        cpp_content
        == """# include < iostream > / * main * / int main ( ) { cout < < " Hello world ! " ; return 0 ; } """
    )

    with open(jsm_path, "r") as handle:
        jsm_content = handle.read()
    assert (
        jsm_content
        == """function ciao ( str ) { / / Comment one console . log ( str ) ; } """
    )
) parser.add_argument( "repository_out_dir", help="Path to the output repository", action="store" ) parser.add_argument( "--rev-start", help="Which revision to start with (0 by default)", action="store", default="0", ) parser.add_argument( "--rev-end", help="Which revision to end with (tip by default)", action="store", default="tip", ) parser.add_argument("--tokenize", action="store_true", default=True) parser.add_argument("--remove-comments", action="store_true", default=False) args = parser.parse_args() repo_out_dir = os.path.realpath(args.repository_out_dir) generator.generate( args.repository_dir, repo_out_dir, args.rev_start, args.rev_end, args.tokenize, args.remove_comments, )
def test_generate_tokenized_operators(fake_hg_repo, tmpdir):
    """Check how operators in a C++ file come out of tokenization.

    The expected output has multi-character operators such as ``&&``,
    ``<=``, ``->`` and ``||`` separated into individual characters.
    """
    hg, local = fake_hg_repo
    mirror_path = os.path.join(tmpdir.strpath, "repo")
    file_name = "file.cpp"

    add_file(
        hg,
        local,
        file_name,
        """#include <iostream> /* main */ int main() { if (ciao > 0 && ciao.obj <= 7 && ciao.obj->prova < 42 || !bo) { int x = ciao ? 1 : 2; return 1 + 1 * 41 + 0 / ~3 + 3 % 5 - x ^ 3; } return 0; }""",
    )
    revision = commit(hg)

    options = {
        "rev_start": 0,
        "rev_end": "tip",
        "limit": None,
        "tokenize": True,
        "remove_comments": False,
    }
    generator.generate(local, mirror_path, **options)

    mirror = pygit2.Repository(mirror_path)
    walk_order = pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_REVERSE
    history = list(mirror.walk(mirror.head.target, walk_order))

    expected_message = f"""Commit A file.cpp UltraBlame original commit: {revision}"""
    assert history[0].message == expected_message

    with open(os.path.join(mirror_path, file_name), "r") as handle:
        generated = handle.read()

    expected_content = """# include < iostream > / * main * / int main ( ) { if ( ciao > 0 & & ciao . obj < = 7 & & ciao . obj - > prova < 42 | | ! bo ) { int x = ciao ? 1 : 2 ; return 1 + 1 * 41 + 0 / ~ 3 + 3 % 5 - x ^ 3 ; } return 0 ; } """
    assert generated == expected_content
def test_generate_copied_and_moved_file(fake_hg_repo, tmpdir):
    """Check how Mercurial copies and moves are reflected in the mirror.

    A copy must appear as an added file, and a move as an added file plus a
    removed file, each in its own commit. Comment removal is enabled, so the
    mirrored sources are expected without their block comments.
    """
    hg, local = fake_hg_repo
    mirror_path = os.path.join(tmpdir.strpath, "repo")

    def hg_path(name):
        # Paths are handed to the hg client as ASCII bytes.
        return bytes(os.path.join(local, name), "ascii")

    def read_mirrored(name):
        with open(os.path.join(mirror_path, name), "r") as handle:
            return handle.read()

    add_file(
        hg,
        local,
        "file.cpp",
        """#include <iostream> /* main */ int main() { return 0; }""",
    )
    add_file(
        hg,
        local,
        "file2.cpp",
        """#include <stdio.h> /* main2 */ void main() { return 42; }""",
    )
    added_rev = commit(hg)

    hg.copy(hg_path("file.cpp"), hg_path("filecopy.cpp"))
    copied_rev = commit(hg)

    hg.move(hg_path("file2.cpp"), hg_path("file2move.cpp"))
    moved_rev = commit(hg)

    # State of the Mercurial working copy after the copy and the move.
    assert os.path.exists(os.path.join(local, "file.cpp"))
    assert os.path.exists(os.path.join(local, "filecopy.cpp"))
    assert not os.path.exists(os.path.join(local, "file2.cpp"))
    assert os.path.exists(os.path.join(local, "file2move.cpp"))

    generator.generate(
        local,
        mirror_path,
        rev_start=0,
        rev_end="tip",
        limit=None,
        tokenize=False,
        remove_comments=True,
    )

    mirror = pygit2.Repository(mirror_path)
    walk_order = pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_REVERSE
    history = list(mirror.walk(mirror.head.target, walk_order))

    assert (
        history[0].message
        == f"""Commit A file.cpp A file2.cpp UltraBlame original commit: {added_rev}"""
    )
    assert (
        history[1].message
        == f"""Commit A filecopy.cpp UltraBlame original commit: {copied_rev}"""
    )
    assert (
        history[2].message
        == f"""Commit A file2move.cpp R file2.cpp UltraBlame original commit: {moved_rev}"""
    )

    original_content = read_mirrored("file.cpp")
    assert original_content == """#include <iostream> int main() { return 0; }"""

    assert not os.path.exists(os.path.join(mirror_path, "file2.cpp"))

    copied_content = read_mirrored("filecopy.cpp")
    assert copied_content == """#include <iostream> int main() { return 0; }"""

    moved_content = read_mirrored("file2move.cpp")
    assert moved_content == """#include <stdio.h> void main() { return 42; }"""
def test_generate_tokenized_python(fake_hg_repo, tmpdir):
    """Check the tokenized output produced for a small Python file."""
    hg, local = fake_hg_repo
    mirror_path = os.path.join(tmpdir.strpath, "repo")
    file_name = "file.py"

    add_file(
        hg,
        local,
        file_name,
        """import sys if sys: print("hello") elif sys != 3: print("maybe") else: print("nope") """,
    )
    revision = commit(hg)

    options = {
        "rev_start": 0,
        "rev_end": "tip",
        "limit": None,
        "tokenize": True,
        "remove_comments": False,
    }
    generator.generate(local, mirror_path, **options)

    mirror = pygit2.Repository(mirror_path)
    walk_order = pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_REVERSE
    history = list(mirror.walk(mirror.head.target, walk_order))

    expected_message = f"""Commit A file.py UltraBlame original commit: {revision}"""
    assert history[0].message == expected_message

    with open(os.path.join(mirror_path, file_name), "r") as handle:
        generated = handle.read()

    expected_content = """import sys if sys : print ( " hello " ) elif sys ! = 3 : print ( " maybe " ) else : print ( " nope " ) """
    assert generated == expected_content
def test_generate_tokenized(fake_hg_repo, tmpdir):
    """Check tokenized generation over two revisions and the commit mapping.

    Besides the mirrored content and commit messages, this verifies that
    ``generator.generate`` returns ``True`` and that the ``utils`` helpers map
    mirror commits to the original revisions and back.
    """
    hg, local = fake_hg_repo
    mirror_path = os.path.join(tmpdir.strpath, "repo")

    add_file(
        hg,
        local,
        "file.cpp",
        """#include <iostream> /* main */ int main() { return 0; }""",
    )
    first_rev = commit(hg)

    add_file(
        hg,
        local,
        "file.cpp",
        """#include <iostream> /* main */ int main() { cout << "Hello, world!"; return 0; }""",
    )
    add_file(
        hg,
        local,
        "file.jsm",
        """function ciao(str) { // Comment one console.log(str); }""",
    )
    second_rev = commit(hg)

    finished = generator.generate(
        local,
        mirror_path,
        rev_start=0,
        rev_end="tip",
        limit=None,
        tokenize=True,
        remove_comments=False,
    )
    assert finished is True

    mirror = pygit2.Repository(mirror_path)
    walk_order = pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_REVERSE
    history = list(mirror.walk(mirror.head.target, walk_order))

    assert (
        history[0].message
        == f"""Commit A file.cpp UltraBlame original commit: {first_rev}"""
    )
    assert (
        history[1].message
        == f"""Commit M file.cpp A file.jsm UltraBlame original commit: {second_rev}"""
    )

    with open(os.path.join(mirror_path, "file.cpp"), "r") as handle:
        cpp_content = handle.read()
    assert (
        cpp_content
        == """# include < iostream > / * main * / int main ( ) { cout < < " Hello world ! " ; return 0 ; } """
    )

    with open(os.path.join(mirror_path, "file.jsm"), "r") as handle:
        jsm_content = handle.read()
    assert (
        jsm_content
        == """function ciao ( str ) { / / Comment one console . log ( str ) ; } """
    )

    # Mirror commit -> original revision, one lookup at a time.
    assert utils.get_original_hash(mirror, "HEAD") == second_rev
    assert utils.get_original_hash(mirror, history[0].hex) == first_rev
    assert utils.get_original_hash(mirror, history[1].hex) == second_rev

    # Full mapping in both directions.
    to_original, to_transformed = utils.get_commit_mapping(mirror_path)
    assert to_original[history[0].hex] == first_rev
    assert to_original[history[1].hex] == second_rev
    assert to_transformed[first_rev] == history[0].hex
    assert to_transformed[second_rev] == history[1].hex