Esempio n. 1
0
def main():
    """Run the selected Scrapy spiders for every configured GitHub repository.

    For each ``owner/repository`` pair in ``repos`` the pair is exported via
    the ``REPOSITORY_OWNER`` / ``REPOSITORY_NAME`` environment variables,
    stored on ``Config``, and every spider listed in ``spiders`` is started
    with ``scrapy crawl``. Prints a hint and returns early when ``repos`` is
    empty.
    """
    # Maps a GitHub owner to the list of that owner's repositories to scrape.
    # Entries are toggled by (un)commenting them.
    repos = {
        # Already done
        "tensorflow": ["tensorflow"],
        # "arduino": ["Arduino"],
        # "audacity": ["audacity"],
        # "capistrano": ["capistrano"],
        # "cloudera": ["hue"],

        # "floweisshardt": ["atf"],
        # "bardsoftware": ["ganttproject"],
        # "nginx": ["nginx"],
        # "torakiki": ["pdfsam"],
        # "scala": ["scala"],
        # "Squirrel": ["Squirrel.Windows"],
        # "tornadoweb": ["tornado"]
        # "liferay": ["liferay-learn"]
    }
    if not repos:
        print("Odkomentuj lub dodaj coś do tego pliku w zmiennej repos")
        return
    # Names of the Scrapy spiders to run; toggled by (un)commenting them.
    spiders = [
        "githubapi",
        # "contributors",
        # 'subscribers',
        # "users",
        # 'milestones'
        'releases',
    ]

    for owner, repositories in repos.items():
        for repository in repositories:
            print(f"{owner}/{repository}")
            print("Scrap api")
            # Child processes started with os.system() inherit os.environ, so
            # this works on every platform. A shell-level `set`/`export` run
            # in its own os.system() call would be lost when that shell exits.
            os.environ['REPOSITORY_OWNER'] = owner
            os.environ['REPOSITORY_NAME'] = repository
            # The in-process configuration only depends on the repository,
            # so it is set once here rather than after every spider.
            Config.set_repository_owner(owner)
            Config.set_repository_name(repository)
            for spider in spiders:
                os.system(f"scrapy crawl {spider}")

            # print("Scrap repo")
            # scrap_repo()
            print("End")
Esempio n. 2
0
def main():
    """Crawl every configured repository with each spider, then scrape its git data.

    For each ``owner/repository`` pair in ``repos`` the pair is stored on
    ``Config``, every spider in ``spiders`` is started through a shell command
    that exports ``REPOSITORY_OWNER`` / ``REPOSITORY_NAME`` before running
    ``scrapy crawl``, and finally ``scrap_repo()`` is called.
    """
    # Maps a GitHub owner to the list of that owner's repositories to scrape.
    repos = {
        # "tensorflow": ["tensorflow"],
        "arduino": ["Arduino"],
    }

    # Names of the Scrapy spiders to run for each repository.
    spiders = [
        "githubapi",
        "contributors",
        'subscribers',
        "users",
    ]

    for owner, repositories in repos.items():
        for repository in repositories:
            # Set the in-process configuration up front: scrap_repo() reads it,
            # so it must not depend on the spider loop running at least once.
            Config.set_repository_owner(owner)
            Config.set_repository_name(repository)
            # POSIX shell syntax; the exports only live inside the shell that
            # os.system() spawns, so they are chained with the crawl command.
            set_env_command = f"export REPOSITORY_OWNER={owner} && export REPOSITORY_NAME={repository}"
            for spider in spiders:
                os.system(f"{set_env_command} && scrapy crawl {spider}")
            scrap_repo()
Esempio n. 3
0
 def start_requests(self):
     """Yield one ``scrapy.Request`` per entry of ``self.urls_to_crawl``.

     ``Config.github_token()`` is called first and its result is discarded
     (presumably to load or validate the token before crawling; confirm
     against ``Config``). Every request uses ``self.parse`` as its callback.
     """
     Config.github_token()
     yield from (
         scrapy.Request(url=target, callback=self.parse)
         for target in self.urls_to_crawl
     )
Esempio n. 4
0
def scrap_repo():
    """Update the configured repository's checkout and dump git statistics to files.

    The repository is identified by ``Config.repository_owner()`` and
    ``Config.repository_name()``. All paths are built below
    ``Config.working_directory()``:

    * ``<data_dir>/commits.csv``: one ``$``-separated row per commit.
    * ``<temp_dir>/commits_hash.txt``: commit hashes.
    * ``<temp_dir>/author_names.txt`` / ``commiter_names.txt`` /
      ``user_names.txt``: local parts (text before ``@``) of author and
      committer e-mail addresses, and their de-duplicated union.
    * ``<data_dir>/files_in_repo.txt``: number of files reported by
      ``git ls-files``.
    * ``<temp_dir>/list_files.txt``: every regular file outside ``.git``,
      relative to the checkout.
    * ``<data_dir>/changed_files.txt``: one ``file,author`` line per distinct
      author of each file.

    The work is done with POSIX shell pipelines (``sort``, ``grep``, ``find``,
    ``cat``, ``wc``), so a POSIX shell environment is required.
    """
    import shlex  # local import: only this function needs shell quoting

    owner = Config.repository_owner()
    name = Config.repository_name()
    work_dir = Config.working_directory()
    print(name)

    repo_download_dir = f"{work_dir}/{Config.repo_dir()}/{owner}_{name}"
    repo_url = f"https://github.com/{owner}/{name}.git"
    Util.mkdir(repo_download_dir)
    # NOTE(review): if Util.mkdir creates repo_download_dir itself, the clone
    # branch below is unreachable; confirm what Util.mkdir does with the path.
    if os.path.exists(repo_download_dir):
        os.system(f"git -C {repo_download_dir} fetch")
        os.system(f"git -C {repo_download_dir} pull")
    else:
        os.system(f"git clone -q {repo_url} {repo_download_dir}")

    data_dir = f"{work_dir}/{Config.data_dir()}/{owner}_{name}"
    temp_dir = f"{work_dir}/{Config.temp_dir()}/{owner}_{name}"

    commits_csv_file_path = f"{data_dir}/commits.csv"
    commit_hash_csv_file_path = f"{temp_dir}/commits_hash.txt"
    author_names_csv_file_path = f"{temp_dir}/author_names.txt"
    commiter_names_csv_file_path = f"{temp_dir}/commiter_names.txt"
    user_names_csv_file_path = f"{temp_dir}/user_names.txt"
    files_in_repo_csv_file_path = f"{data_dir}/files_in_repo.txt"
    list_files_csv_file_path = f"{temp_dir}/list_files.txt"
    changed_files_csv_file_path = f"{data_dir}/changed_files.txt"

    for path in (
            commits_csv_file_path, commit_hash_csv_file_path,
            author_names_csv_file_path, commiter_names_csv_file_path,
            user_names_csv_file_path, files_in_repo_csv_file_path,
            changed_files_csv_file_path,
    ):
        Util.mkdir(path)

    # Shell-quoted variants, so paths containing spaces or shell
    # metacharacters survive interpolation into the command lines below.
    repo = shlex.quote(repo_download_dir)
    commits_out = shlex.quote(commits_csv_file_path)
    hashes_out = shlex.quote(commit_hash_csv_file_path)
    authors_out = shlex.quote(author_names_csv_file_path)
    committers_out = shlex.quote(commiter_names_csv_file_path)
    users_out = shlex.quote(user_names_csv_file_path)
    users_tmp = shlex.quote(user_names_csv_file_path + '.tmp')
    file_count_out = shlex.quote(files_in_repo_csv_file_path)

    # `sort -u` keeps one copy of every distinct line. The previous
    # `sort | uniq -u` kept only lines occurring exactly once, silently
    # dropping every author/committer with more than one commit.
    commands = [
        # CSV header; `\$` keeps the shell from expanding `$name` inside the
        # double quotes.
        r'echo "tree_hash\$commith_hash\$author_name\$author_email\$author_timestamp\$committer_name\$committer_email\$committer_timestamp" > '
        + commits_out,
        f"git -C {repo} log --pretty=format:'%T$%H$%an$%ae$%at$%cn$%ce$%ct' --abbrev-commit >> {commits_out}",
        # NOTE(review): `>>` appends, so hashes accumulate across runs;
        # confirm whether `>` was intended.
        f"git -C {repo} log --pretty=format:'%H' --abbrev-commit | sort -u >> {hashes_out}",
        f"git -C {repo} log --pretty=format:'%ae' --abbrev-commit | sort -u | grep -o '^[^@]*' > {authors_out}",
        f"git -C {repo} log --pretty=format:'%ce' --abbrev-commit | sort -u | grep -o '^[^@]*' > {committers_out}",
        f"cat {authors_out} > {users_tmp}",
        f"cat {committers_out} >> {users_tmp}",
        f"cat {users_tmp} | sort -u > {users_out}",
        f"git -C {repo} ls-files | wc -l > {file_count_out}",
    ]
    for command in commands:
        print(f"command: {command}")
        os.system(command)

    find_files_command = f"find {repo} -type f -not -path '*/.git/*' -exec echo {{}} \\;"
    # find prints "<repo_download_dir>/<relative path>"; cut off the prefix
    # and the separating slash to get paths relative to the checkout.
    prefix_length = len(repo_download_dir) + 1
    with os.popen(find_files_command) as find_output:
        repo_files = [line.strip()[prefix_length:] for line in find_output]

    with open(list_files_csv_file_path, 'w') as list_files:
        list_files.writelines(f"{path}\n" for path in repo_files)

    with open(changed_files_csv_file_path, 'w') as changed_files:
        for repo_file in progressbar.progressbar(repo_files):
            # `--` marks everything after it as a path, so a file name can
            # never be mistaken for a revision or an option.
            quoted_file = shlex.quote(repo_file)
            authors_command = f"git -C {repo} log --pretty=format:\"%an\" -- {quoted_file} | sort -u"
            with os.popen(authors_command) as authors_output:
                authors = [line.strip() for line in authors_output]
            for author in authors:
                changed_files.write(f"{repo_file},{author}\n")