def fetch_git(url, directory, jobs, retry, timeout):
    """Dump an exposed git repository at ``url`` into ``directory``.

    Probes ``url``/.git/HEAD to confirm the repository is reachable, then
    either mirrors .git recursively (when directory listing is enabled) or
    reconstructs it by fetching well-known files, refs, packs and loose
    objects, finishing with ``git checkout .``.

    :param url: base URL of the site exposing its .git directory
    :param directory: existing, empty output directory
    :param jobs: number of parallel download workers (>= 1)
    :param retry: number of retries per request (>= 1)
    :param timeout: per-request timeout in seconds (>= 1)
    :return: 0 on success, 1 when no usable .git/HEAD is found
    """
    assert os.path.isdir(directory), "%s is not a directory" % directory
    assert not os.listdir(directory), "%s is not empty" % directory
    assert jobs >= 1, "invalid number of jobs"
    assert retry >= 1, "invalid number of retries"
    assert timeout >= 1, "invalid timeout"

    # find base url: strip trailing "/", "HEAD" and ".git" suffixes
    url = url.rstrip("/")
    if url.endswith("HEAD"):
        url = url[:-4]
    url = url.rstrip("/")
    if url.endswith(".git"):
        url = url[:-4]
    url = url.rstrip("/")

    # check for /.git/HEAD
    printf("[-] Testing %s/.git/HEAD ", url)
    response = requests.get("%s/.git/HEAD" % url, verify=False,
                            allow_redirects=False)
    printf("[%d]\n", response.status_code)

    if response.status_code != 200:
        printf("error: %s/.git/HEAD does not exist\n", url, file=sys.stderr)
        return 1
    elif not response.text.startswith("ref:"):
        printf("error: %s/.git/HEAD is not a git HEAD file\n", url,
               file=sys.stderr)
        return 1

    # check for directory listing on /.git/
    printf("[-] Testing %s/.git/ ", url)
    response = requests.get("%s/.git/" % url, verify=False,
                            allow_redirects=False)
    printf("[%d]\n", response.status_code)

    if (response.status_code == 200 and is_html(response)
            and "HEAD" in get_indexed_files(response)):
        # directory listing is enabled: mirror .git recursively
        printf("[-] Fetching .git recursively\n")
        process_tasks([".git/", ".gitignore"],
                      RecursiveDownloadWorker,
                      jobs,
                      args=(url, directory, retry, timeout))

        printf("[-] Running git checkout .\n")
        os.chdir(directory)
        subprocess.check_call(["git", "checkout", "."])
        return 0

    # no directory listing: fetch files that exist in any repository
    printf("[-] Fetching common files\n")
    # fix: the original listed applypatch-msg.sample three times
    tasks = [
        ".gitignore",
        ".git/COMMIT_EDITMSG",
        ".git/description",
        ".git/hooks/applypatch-msg.sample",
        ".git/hooks/commit-msg.sample",
        ".git/hooks/post-commit.sample",
        ".git/hooks/post-receive.sample",
        ".git/hooks/post-update.sample",
        ".git/hooks/pre-applypatch.sample",
        ".git/hooks/pre-commit.sample",
        ".git/hooks/pre-push.sample",
        ".git/hooks/pre-rebase.sample",
        ".git/hooks/pre-receive.sample",
        ".git/hooks/prepare-commit-msg.sample",
        ".git/hooks/update.sample",
        ".git/index",
        ".git/info/exclude",
        ".git/objects/info/packs",
    ]
    process_tasks(tasks, DownloadWorker, jobs,
                  args=(url, directory, retry, timeout))

    # find refs
    printf("[-] Finding refs/\n")
    tasks = [
        ".git/FETCH_HEAD",
        ".git/HEAD",
        ".git/ORIG_HEAD",
        ".git/config",
        ".git/info/refs",
        ".git/logs/HEAD",
        ".git/logs/refs/heads/master",
        ".git/logs/refs/remotes/origin/HEAD",
        ".git/logs/refs/remotes/origin/master",
        ".git/logs/refs/stash",
        ".git/packed-refs",
        ".git/refs/heads/master",
        ".git/refs/remotes/origin/HEAD",
        ".git/refs/remotes/origin/master",
        ".git/refs/stash",
    ]
    process_tasks(tasks, FindRefsWorker, jobs,
                  args=(url, directory, retry, timeout))

    # find packs advertised in .git/objects/info/packs
    printf("[-] Finding packs\n")
    tasks = []
    info_packs_path = os.path.join(directory, ".git", "objects", "info",
                                   "packs")
    if os.path.exists(info_packs_path):
        with open(info_packs_path, "r") as f:
            info_packs = f.read()
        for sha1 in re.findall(r"pack-([a-f0-9]{40})\.pack", info_packs):
            tasks.append(".git/objects/pack/pack-%s.idx" % sha1)
            tasks.append(".git/objects/pack/pack-%s.pack" % sha1)
    process_tasks(tasks, DownloadWorker, jobs,
                  args=(url, directory, retry, timeout))

    # find objects
    printf("[-] Finding objects\n")
    objs = set()
    packed_objs = set()

    # grep .git/packed-refs, .git/info/refs, .git/refs/*, .git/logs/*
    # for 40-hex-digit sha1s
    files = [
        os.path.join(directory, ".git", "packed-refs"),
        os.path.join(directory, ".git", "info", "refs"),
        os.path.join(directory, ".git", "FETCH_HEAD"),
        os.path.join(directory, ".git", "ORIG_HEAD"),
    ]
    for dirpath, _, filenames in os.walk(
            os.path.join(directory, ".git", "refs")):
        for filename in filenames:
            files.append(os.path.join(dirpath, filename))
    for dirpath, _, filenames in os.walk(
            os.path.join(directory, ".git", "logs")):
        for filename in filenames:
            files.append(os.path.join(dirpath, filename))

    for filepath in files:
        if not os.path.exists(filepath):
            continue
        with open(filepath, "r") as f:
            content = f.read()
        for obj in re.findall(r"(^|\s)([a-f0-9]{40})($|\s)", content):
            obj = obj[1]  # group 2 of the match tuple is the sha1
            objs.add(obj)

    # use .git/index to find blob objects
    index_path = os.path.join(directory, ".git", "index")
    if os.path.exists(index_path):
        index = dulwich.index.Index(index_path)
        for entry in index.iterblobs():
            objs.add(entry[1].decode())

    # use packs to find more objects to fetch, and objects that are packed
    # (packed objects need not be fetched individually)
    pack_file_dir = os.path.join(directory, ".git", "objects", "pack")
    if os.path.isdir(pack_file_dir):
        for filename in os.listdir(pack_file_dir):
            if filename.startswith("pack-") and filename.endswith(".pack"):
                pack_data_path = os.path.join(pack_file_dir, filename)
                pack_idx_path = os.path.join(pack_file_dir,
                                             filename[:-5] + ".idx")
                pack_data = dulwich.pack.PackData(pack_data_path)
                pack_idx = dulwich.pack.load_pack_index(pack_idx_path)
                pack = dulwich.pack.Pack.from_objects(pack_data, pack_idx)

                for obj_file in pack.iterobjects():
                    packed_objs.add(obj_file.sha().hexdigest())
                    objs |= set(get_referenced_sha1(obj_file))

    # fetch all loose objects, skipping those already present in packs
    printf("[-] Fetching objects\n")
    process_tasks(objs, FindObjectsWorker, jobs,
                  args=(url, directory, retry, timeout),
                  tasks_done=packed_objs)

    # git checkout, ignoring errors: the dumped repository may be incomplete
    printf("[-] Running git checkout .\n")
    os.chdir(directory)
    # fix: subprocess.DEVNULL instead of an open(os.devnull) handle that
    # was never closed
    subprocess.call(["git", "checkout", "."], stderr=subprocess.DEVNULL)
    return 0
def fetch_git(url, directory, jobs, retry, timeout):
    '''Dump an exposed git repository at ``url`` into ``directory``.

    Probes ``url``/.git/HEAD to confirm the repository is reachable, then
    either mirrors .git recursively (when directory listing is enabled) or
    reconstructs it by fetching well-known files, refs (including Magit
    wip refs), packs and loose objects, finishing with ``git checkout .``.

    :param url: base URL of the site exposing its .git directory
    :param directory: existing, empty output directory
    :param jobs: number of parallel download workers (>= 1)
    :param retry: number of retries per request (>= 1)
    :param timeout: per-request timeout in seconds (>= 1)
    :return: 0 on success, 1 when no usable .git/HEAD is found
    '''
    assert os.path.isdir(directory), '%s is not a directory' % directory
    assert not os.listdir(directory), '%s is not empty' % directory
    assert jobs >= 1, 'invalid number of jobs'
    assert retry >= 1, 'invalid number of retries'
    assert timeout >= 1, 'invalid timeout'

    # find base url: strip trailing '/', 'HEAD' and '.git' suffixes
    url = url.rstrip('/')
    if url.endswith('HEAD'):
        url = url[:-4]
    url = url.rstrip('/')
    if url.endswith('.git'):
        url = url[:-4]
    url = url.rstrip('/')

    # check for /.git/HEAD
    printf('[-] Testing %s/.git/HEAD ', url)
    response = requests.get('%s/.git/HEAD' % url, verify=False,
                            allow_redirects=False)
    printf('[%d]\n', response.status_code)

    if response.status_code != 200:
        printf('error: %s/.git/HEAD does not exist\n', url, file=sys.stderr)
        return 1
    elif not response.text.startswith('ref:'):
        printf('error: %s/.git/HEAD is not a git HEAD file\n', url,
               file=sys.stderr)
        return 1

    # check for directory listing on /.git/
    printf('[-] Testing %s/.git/ ', url)
    response = requests.get('%s/.git/' % url, verify=False,
                            allow_redirects=False)
    printf('[%d]\n', response.status_code)

    if (response.status_code == 200 and is_html(response)
            and 'HEAD' in get_indexed_files(response)):
        # directory listing is enabled: mirror .git recursively
        printf('[-] Fetching .git recursively\n')
        process_tasks(['.git/', '.gitignore'],
                      RecursiveDownloadWorker,
                      jobs,
                      args=(url, directory, retry, timeout))

        printf('[-] Running git checkout .\n')
        os.chdir(directory)
        subprocess.check_call(['git', 'checkout', '.'])
        return 0

    # no directory listing: fetch files that exist in any repository
    printf('[-] Fetching common files\n')
    # fix: the original listed applypatch-msg.sample three times
    tasks = [
        '.gitignore',
        '.git/COMMIT_EDITMSG',
        '.git/description',
        '.git/hooks/applypatch-msg.sample',
        '.git/hooks/commit-msg.sample',
        '.git/hooks/post-commit.sample',
        '.git/hooks/post-receive.sample',
        '.git/hooks/post-update.sample',
        '.git/hooks/pre-applypatch.sample',
        '.git/hooks/pre-commit.sample',
        '.git/hooks/pre-push.sample',
        '.git/hooks/pre-rebase.sample',
        '.git/hooks/pre-receive.sample',
        '.git/hooks/prepare-commit-msg.sample',
        '.git/hooks/update.sample',
        '.git/index',
        '.git/info/exclude',
        '.git/objects/info/packs',
    ]
    process_tasks(tasks, DownloadWorker, jobs,
                  args=(url, directory, retry, timeout))

    # find refs
    printf('[-] Finding refs/\n')
    tasks = [
        '.git/FETCH_HEAD',
        '.git/HEAD',
        '.git/ORIG_HEAD',
        '.git/config',
        '.git/info/refs',
        '.git/logs/HEAD',
        '.git/logs/refs/heads/master',
        '.git/logs/refs/remotes/origin/HEAD',
        '.git/logs/refs/remotes/origin/master',
        '.git/logs/refs/stash',
        '.git/packed-refs',
        '.git/refs/heads/master',
        '.git/refs/remotes/origin/HEAD',
        '.git/refs/remotes/origin/master',
        '.git/refs/stash',
        '.git/refs/wip/wtree/refs/heads/master',  # Magit
        '.git/refs/wip/index/refs/heads/master'  # Magit
    ]
    process_tasks(tasks, FindRefsWorker, jobs,
                  args=(url, directory, retry, timeout))

    # find packs advertised in .git/objects/info/packs
    printf('[-] Finding packs\n')
    tasks = []
    info_packs_path = os.path.join(directory, '.git', 'objects', 'info',
                                   'packs')
    if os.path.exists(info_packs_path):
        with open(info_packs_path, 'r') as f:
            info_packs = f.read()
        for sha1 in re.findall(r'pack-([a-f0-9]{40})\.pack', info_packs):
            tasks.append('.git/objects/pack/pack-%s.idx' % sha1)
            tasks.append('.git/objects/pack/pack-%s.pack' % sha1)
    process_tasks(tasks, DownloadWorker, jobs,
                  args=(url, directory, retry, timeout))

    # find objects
    printf('[-] Finding objects\n')
    objs = set()
    packed_objs = set()

    # grep .git/packed-refs, .git/info/refs, .git/refs/*, .git/logs/*
    # for 40-hex-digit sha1s
    files = [
        os.path.join(directory, '.git', 'packed-refs'),
        os.path.join(directory, '.git', 'info', 'refs'),
        os.path.join(directory, '.git', 'FETCH_HEAD'),
        os.path.join(directory, '.git', 'ORIG_HEAD'),
    ]
    for dirpath, _, filenames in os.walk(
            os.path.join(directory, '.git', 'refs')):
        for filename in filenames:
            files.append(os.path.join(dirpath, filename))
    for dirpath, _, filenames in os.walk(
            os.path.join(directory, '.git', 'logs')):
        for filename in filenames:
            files.append(os.path.join(dirpath, filename))

    for filepath in files:
        if not os.path.exists(filepath):
            continue
        with open(filepath, 'r') as f:
            content = f.read()
        for obj in re.findall(r'(^|\s)([a-f0-9]{40})($|\s)', content):
            obj = obj[1]  # group 2 of the match tuple is the sha1
            objs.add(obj)

    # use .git/index to find blob objects
    index_path = os.path.join(directory, '.git', 'index')
    if os.path.exists(index_path):
        index = dulwich.index.Index(index_path)
        for entry in index.iterblobs():
            objs.add(entry[1].decode())

    # use packs to find more objects to fetch, and objects that are packed
    # (packed objects need not be fetched individually)
    pack_file_dir = os.path.join(directory, '.git', 'objects', 'pack')
    if os.path.isdir(pack_file_dir):
        for filename in os.listdir(pack_file_dir):
            if filename.startswith('pack-') and filename.endswith('.pack'):
                pack_data_path = os.path.join(pack_file_dir, filename)
                pack_idx_path = os.path.join(pack_file_dir,
                                             filename[:-5] + '.idx')
                pack_data = dulwich.pack.PackData(pack_data_path)
                pack_idx = dulwich.pack.load_pack_index(pack_idx_path)
                pack = dulwich.pack.Pack.from_objects(pack_data, pack_idx)

                for obj_file in pack.iterobjects():
                    packed_objs.add(obj_file.sha().hexdigest())
                    objs |= set(get_referenced_sha1(obj_file))

    # fetch all loose objects, skipping those already present in packs
    printf('[-] Fetching objects\n')
    process_tasks(objs, FindObjectsWorker, jobs,
                  args=(url, directory, retry, timeout),
                  tasks_done=packed_objs)

    # git checkout, ignoring errors: the dumped repository may be incomplete
    printf('[-] Running git checkout .\n')
    os.chdir(directory)
    # fix: subprocess.DEVNULL instead of an open(os.devnull) handle that
    # was never closed
    subprocess.call(['git', 'checkout', '.'], stderr=subprocess.DEVNULL)
    return 0