Code example #1
import os
import re
import subprocess
import sys

import dulwich.index
import dulwich.pack
import requests

# printf, is_html, get_indexed_files, get_referenced_sha1, process_tasks and the
# *Worker classes are assumed to be defined elsewhere in the module this
# example was taken from.
def fetch_git(url, directory, jobs, retry, timeout):
    """ Dump a git repository into the output directory """

    assert os.path.isdir(directory), "%s is not a directory" % directory
    assert not os.listdir(directory), "%s is not empty" % directory
    assert jobs >= 1, "invalid number of jobs"
    assert retry >= 1, "invalid number of retries"
    assert timeout >= 1, "invalid timeout"

    # find base url
    url = url.rstrip("/")
    if url.endswith("HEAD"):
        url = url[:-4]
    url = url.rstrip("/")
    if url.endswith(".git"):
        url = url[:-4]
    url = url.rstrip("/")

    # check for /.git/HEAD
    printf("[-] Testing %s/.git/HEAD ", url)
    response = requests.get("%s/.git/HEAD" % url,
                            verify=False,
                            allow_redirects=False)
    printf("[%d]\n", response.status_code)

    if response.status_code != 200:
        printf("error: %s/.git/HEAD does not exist\n", url, file=sys.stderr)
        return 1
    elif not response.text.startswith("ref:"):
        printf("error: %s/.git/HEAD is not a git HEAD file\n",
               url,
               file=sys.stderr)
        return 1

    # check for directory listing
    printf("[-] Testing %s/.git/ ", url)
    response = requests.get("%s/.git/" % url,
                            verify=False,
                            allow_redirects=False)
    printf("[%d]\n", response.status_code)

    if response.status_code == 200 and is_html(
            response) and "HEAD" in get_indexed_files(response):
        printf("[-] Fetching .git recursively\n")
        process_tasks([".git/", ".gitignore"],
                      RecursiveDownloadWorker,
                      jobs,
                      args=(url, directory, retry, timeout))

        printf("[-] Running git checkout .\n")
        os.chdir(directory)
        subprocess.check_call(["git", "checkout", "."])
        return 0

    # no directory listing
    printf("[-] Fetching common files\n")
    tasks = [
        ".gitignore",
        ".git/COMMIT_EDITMSG",
        ".git/description",
        ".git/hooks/applypatch-msg.sample",
        ".git/hooks/commit-msg.sample",
        ".git/hooks/post-commit.sample",
        ".git/hooks/post-receive.sample",
        ".git/hooks/post-update.sample",
        ".git/hooks/pre-applypatch.sample",
        ".git/hooks/pre-commit.sample",
        ".git/hooks/pre-push.sample",
        ".git/hooks/pre-rebase.sample",
        ".git/hooks/pre-receive.sample",
        ".git/hooks/prepare-commit-msg.sample",
        ".git/hooks/update.sample",
        ".git/index",
        ".git/info/exclude",
        ".git/objects/info/packs",
    ]
    process_tasks(tasks,
                  DownloadWorker,
                  jobs,
                  args=(url, directory, retry, timeout))

    # find refs
    printf("[-] Finding refs/\n")
    tasks = [
        ".git/FETCH_HEAD",
        ".git/HEAD",
        ".git/ORIG_HEAD",
        ".git/config",
        ".git/info/refs",
        ".git/logs/HEAD",
        ".git/logs/refs/heads/master",
        ".git/logs/refs/remotes/origin/HEAD",
        ".git/logs/refs/remotes/origin/master",
        ".git/logs/refs/stash",
        ".git/packed-refs",
        ".git/refs/heads/master",
        ".git/refs/remotes/origin/HEAD",
        ".git/refs/remotes/origin/master",
        ".git/refs/stash",
    ]

    process_tasks(tasks,
                  FindRefsWorker,
                  jobs,
                  args=(url, directory, retry, timeout))

    # find packs
    printf("[-] Finding packs\n")
    tasks = []

    # use .git/objects/info/packs to find packs
    info_packs_path = os.path.join(directory, ".git", "objects", "info",
                                   "packs")
    if os.path.exists(info_packs_path):
        with open(info_packs_path, "r") as f:
            info_packs = f.read()

        for sha1 in re.findall(r"pack-([a-f0-9]{40})\.pack", info_packs):
            tasks.append(".git/objects/pack/pack-%s.idx" % sha1)
            tasks.append(".git/objects/pack/pack-%s.pack" % sha1)

    process_tasks(tasks,
                  DownloadWorker,
                  jobs,
                  args=(url, directory, retry, timeout))

    # find objects
    printf("[-] Finding objects\n")
    objs = set()
    packed_objs = set()

    # .git/packed-refs, .git/info/refs, .git/refs/*, .git/logs/*
    files = [
        os.path.join(directory, ".git", "packed-refs"),
        os.path.join(directory, ".git", "info", "refs"),
        os.path.join(directory, ".git", "FETCH_HEAD"),
        os.path.join(directory, ".git", "ORIG_HEAD"),
    ]
    for dirpath, _, filenames in os.walk(
            os.path.join(directory, ".git", "refs")):
        for filename in filenames:
            files.append(os.path.join(dirpath, filename))
    for dirpath, _, filenames in os.walk(
            os.path.join(directory, ".git", "logs")):
        for filename in filenames:
            files.append(os.path.join(dirpath, filename))

    for filepath in files:
        if not os.path.exists(filepath):
            continue

        with open(filepath, "r") as f:
            content = f.read()

        # a lookahead leaves the trailing delimiter unconsumed, so two SHA-1s
        # separated by a single space (as on .git/logs/* lines) are both found
        for match in re.findall(r"(^|\s)([a-f0-9]{40})(?=$|\s)", content):
            objs.add(match[1])

    # use .git/index to find objects
    index_path = os.path.join(directory, ".git", "index")
    if os.path.exists(index_path):
        index = dulwich.index.Index(index_path)

        for entry in index.iterblobs():
            objs.add(entry[1].decode())

    # use packs to find more objects to fetch, and objects that are packed
    pack_file_dir = os.path.join(directory, ".git", "objects", "pack")
    if os.path.isdir(pack_file_dir):
        for filename in os.listdir(pack_file_dir):
            if filename.startswith("pack-") and filename.endswith(".pack"):
                pack_data_path = os.path.join(pack_file_dir, filename)
                pack_idx_path = os.path.join(pack_file_dir,
                                             filename[:-5] + ".idx")
                pack_data = dulwich.pack.PackData(pack_data_path)
                pack_idx = dulwich.pack.load_pack_index(pack_idx_path)
                pack = dulwich.pack.Pack.from_objects(pack_data, pack_idx)

                for obj_file in pack.iterobjects():
                    packed_objs.add(obj_file.sha().hexdigest())
                    objs |= set(get_referenced_sha1(obj_file))

    # fetch all objects
    printf("[-] Fetching objects\n")
    process_tasks(objs,
                  FindObjectsWorker,
                  jobs,
                  args=(url, directory, retry, timeout),
                  tasks_done=packed_objs)

    # git checkout
    printf("[-] Running git checkout .\n")
    os.chdir(directory)

    # ignore errors; suppress stderr via subprocess.DEVNULL
    subprocess.call(["git", "checkout", "."], stderr=subprocess.DEVNULL)

    return 0
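
The example above is a function excerpt. Below is a minimal sketch of how it might be driven from the command line; the flag names and defaults are assumptions for illustration, not taken from the example's source.

import argparse
import sys

# Hypothetical entry point; flag names and defaults are assumptions.
def main():
    parser = argparse.ArgumentParser(
        description="Dump an exposed .git directory from a website")
    parser.add_argument("url", help="url of the target site")
    parser.add_argument("directory", help="existing, empty output directory")
    parser.add_argument("-j", "--jobs", type=int, default=10,
                        help="number of download workers")
    parser.add_argument("-r", "--retry", type=int, default=3,
                        help="retries per request")
    parser.add_argument("-t", "--timeout", type=int, default=3,
                        help="request timeout in seconds")
    args = parser.parse_args()
    sys.exit(fetch_git(args.url, args.directory, args.jobs, args.retry, args.timeout))

if __name__ == "__main__":
    main()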
Code example #2
import os
import re
import subprocess
import sys

import dulwich.index
import dulwich.pack
import requests

# As in example #1, printf, is_html, get_indexed_files, get_referenced_sha1,
# process_tasks and the *Worker classes are assumed to be defined elsewhere in
# the module this example was taken from.
def fetch_git(url, directory, jobs, retry, timeout):
    ''' Dump a git repository into the output directory '''

    assert os.path.isdir(directory), '%s is not a directory' % directory
    assert not os.listdir(directory), '%s is not empty' % directory
    assert jobs >= 1, 'invalid number of jobs'
    assert retry >= 1, 'invalid number of retries'
    assert timeout >= 1, 'invalid timeout'

    # find base url
    url = url.rstrip('/')
    if url.endswith('HEAD'):
        url = url[:-4]
    url = url.rstrip('/')
    if url.endswith('.git'):
        url = url[:-4]
    url = url.rstrip('/')

    # check for /.git/HEAD
    printf('[-] Testing %s/.git/HEAD ', url)
    response = requests.get('%s/.git/HEAD' % url,
                            verify=False,
                            allow_redirects=False)
    printf('[%d]\n', response.status_code)

    if response.status_code != 200:
        printf('error: %s/.git/HEAD does not exist\n', url, file=sys.stderr)
        return 1
    elif not response.text.startswith('ref:'):
        printf('error: %s/.git/HEAD is not a git HEAD file\n',
               url,
               file=sys.stderr)
        return 1

    # check for directory listing
    printf('[-] Testing %s/.git/ ', url)
    response = requests.get('%s/.git/' % url,
                            verify=False,
                            allow_redirects=False)
    printf('[%d]\n', response.status_code)

    if response.status_code == 200 and is_html(
            response) and 'HEAD' in get_indexed_files(response):
        printf('[-] Fetching .git recursively\n')
        process_tasks(['.git/', '.gitignore'],
                      RecursiveDownloadWorker,
                      jobs,
                      args=(url, directory, retry, timeout))

        printf('[-] Running git checkout .\n')
        os.chdir(directory)
        subprocess.check_call(['git', 'checkout', '.'])
        return 0

    # no directory listing
    printf('[-] Fetching common files\n')
    tasks = [
        '.gitignore',
        '.git/COMMIT_EDITMSG',
        '.git/description',
        '.git/hooks/applypatch-msg.sample',
        '.git/hooks/commit-msg.sample',
        '.git/hooks/post-commit.sample',
        '.git/hooks/post-receive.sample',
        '.git/hooks/post-update.sample',
        '.git/hooks/pre-applypatch.sample',
        '.git/hooks/pre-commit.sample',
        '.git/hooks/pre-push.sample',
        '.git/hooks/pre-rebase.sample',
        '.git/hooks/pre-receive.sample',
        '.git/hooks/prepare-commit-msg.sample',
        '.git/hooks/update.sample',
        '.git/index',
        '.git/info/exclude',
        '.git/objects/info/packs',
    ]
    process_tasks(tasks,
                  DownloadWorker,
                  jobs,
                  args=(url, directory, retry, timeout))

    # find refs
    printf('[-] Finding refs/\n')
    tasks = [
        '.git/FETCH_HEAD',
        '.git/HEAD',
        '.git/ORIG_HEAD',
        '.git/config',
        '.git/info/refs',
        '.git/logs/HEAD',
        '.git/logs/refs/heads/master',
        '.git/logs/refs/remotes/origin/HEAD',
        '.git/logs/refs/remotes/origin/master',
        '.git/logs/refs/stash',
        '.git/packed-refs',
        '.git/refs/heads/master',
        '.git/refs/remotes/origin/HEAD',
        '.git/refs/remotes/origin/master',
        '.git/refs/stash',
        '.git/refs/wip/wtree/refs/heads/master',  # Magit
        '.git/refs/wip/index/refs/heads/master',  # Magit
    ]

    process_tasks(tasks,
                  FindRefsWorker,
                  jobs,
                  args=(url, directory, retry, timeout))

    # find packs
    printf('[-] Finding packs\n')
    tasks = []

    # use .git/objects/info/packs to find packs
    info_packs_path = os.path.join(directory, '.git', 'objects', 'info',
                                   'packs')
    if os.path.exists(info_packs_path):
        with open(info_packs_path, 'r') as f:
            info_packs = f.read()

        for sha1 in re.findall(r'pack-([a-f0-9]{40})\.pack', info_packs):
            tasks.append('.git/objects/pack/pack-%s.idx' % sha1)
            tasks.append('.git/objects/pack/pack-%s.pack' % sha1)

    process_tasks(tasks,
                  DownloadWorker,
                  jobs,
                  args=(url, directory, retry, timeout))

    # find objects
    printf('[-] Finding objects\n')
    objs = set()
    packed_objs = set()

    # .git/packed-refs, .git/info/refs, .git/refs/*, .git/logs/*
    files = [
        os.path.join(directory, '.git', 'packed-refs'),
        os.path.join(directory, '.git', 'info', 'refs'),
        os.path.join(directory, '.git', 'FETCH_HEAD'),
        os.path.join(directory, '.git', 'ORIG_HEAD'),
    ]
    for dirpath, _, filenames in os.walk(
            os.path.join(directory, '.git', 'refs')):
        for filename in filenames:
            files.append(os.path.join(dirpath, filename))
    for dirpath, _, filenames in os.walk(
            os.path.join(directory, '.git', 'logs')):
        for filename in filenames:
            files.append(os.path.join(dirpath, filename))

    for filepath in files:
        if not os.path.exists(filepath):
            continue

        with open(filepath, 'r') as f:
            content = f.read()

        # a lookahead leaves the trailing delimiter unconsumed, so two SHA-1s
        # separated by a single space (as on .git/logs/* lines) are both found
        for match in re.findall(r'(^|\s)([a-f0-9]{40})(?=$|\s)', content):
            objs.add(match[1])

    # use .git/index to find objects
    index_path = os.path.join(directory, '.git', 'index')
    if os.path.exists(index_path):
        index = dulwich.index.Index(index_path)

        for entry in index.iterblobs():
            objs.add(entry[1].decode())

    # use packs to find more objects to fetch, and objects that are packed
    pack_file_dir = os.path.join(directory, '.git', 'objects', 'pack')
    if os.path.isdir(pack_file_dir):
        for filename in os.listdir(pack_file_dir):
            if filename.startswith('pack-') and filename.endswith('.pack'):
                pack_data_path = os.path.join(pack_file_dir, filename)
                pack_idx_path = os.path.join(pack_file_dir,
                                             filename[:-5] + '.idx')
                pack_data = dulwich.pack.PackData(pack_data_path)
                pack_idx = dulwich.pack.load_pack_index(pack_idx_path)
                pack = dulwich.pack.Pack.from_objects(pack_data, pack_idx)

                for obj_file in pack.iterobjects():
                    packed_objs.add(obj_file.sha().hexdigest())
                    objs |= set(get_referenced_sha1(obj_file))

    # fetch all objects
    printf('[-] Fetching objects\n')
    process_tasks(objs,
                  FindObjectsWorker,
                  jobs,
                  args=(url, directory, retry, timeout),
                  tasks_done=packed_objs)

    # git checkout
    printf('[-] Running git checkout .\n')
    os.chdir(directory)

    # ignore errors; suppress stderr via subprocess.DEVNULL
    subprocess.call(['git', 'checkout', '.'], stderr=subprocess.DEVNULL)

    return 0
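
Both examples walk from fetched pack objects to the objects they reference via a get_referenced_sha1 helper whose body is not shown. Below is a plausible sketch of such a helper, assuming the dulwich object model (Blob, Commit, Tree, Tag); the actual implementation behind these examples may differ.

from dulwich.objects import Blob, Commit, Tag, Tree

def get_referenced_sha1(obj_file):
    # Sketch only: collect the hex SHA-1s referenced by a dulwich object.
    objs = []
    if isinstance(obj_file, Blob):
        pass  # blobs reference no other objects
    elif isinstance(obj_file, Commit):
        objs.append(obj_file.tree.decode())  # root tree
        objs.extend(parent.decode() for parent in obj_file.parents)  # parent commits
    elif isinstance(obj_file, Tree):
        objs.extend(entry.sha.decode() for entry in obj_file.iteritems())  # tree entries
    elif isinstance(obj_file, Tag):
        objs.append(obj_file.object[1].decode())  # tagged object
    return objs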