# instead of capital "V" DATE_FORMAT = "%Y-%m" DATE_FORMATS = {'week': "%Y-w%v", 'day': "%Y-%m-%d"} d.DEFAULT_EXPIRES = 3600 * 24 * 30 * 12 TAC_CACHE_PATH = os.path.join(stutils.get_config('ST_FS_CACHE_PATH'), 'TAC_repo_cache') if not os.path.isdir(TAC_CACHE_PATH): os.mkdir(TAC_CACHE_PATH) ONE_YEAR = expires = 3600 * 24 * 30 * 12 fs_cache_filtered = d.typed_fs_cache('filtered', expires=ONE_YEAR) fs_cache_aggregated = d.typed_fs_cache('aggregated', expires=ONE_YEAR) cached_iterator = d.cache_iterator('raw') gh_api = stscraper.GitHubAPI() scraper = stgithub.Scraper() get_raw_commits = cached_iterator(gh_api.repo_commits) get_raw_issues = cached_iterator(gh_api.repo_issues) get_raw_issue_comments = cached_iterator(gh_api.repo_issue_comments) get_raw_issue_events = cached_iterator(gh_api.repo_issue_events) get_raw_pulls = cached_iterator(gh_api.repo_pulls) @fs_cache_filtered('user_timeline') def user_timeline(user): return pd.read_sql(""" SELECT DATE_FORMAT(c.created_at, %(date_format)s) as month, count(distinct c.project_id) as cnt FROM commits c, users u WHERE c.author_id = u.id AND u.login=%(user)s
import stscraper as scraper
import pandas as pd
import time
import re

# Output file path for the scraped JOSS GitHub repo results.
outputFilePath = 'C:\\Users\\Sun\\Desktop\\Joss_Repos_Full_ConSup.xlsx'

# Using a local Excel file to get the repository URL and check whether it is
# a GitHub repo; the local file is the output of JossGeneralRepoScraper.py.
# NOTE(review): this is a raw string with doubled backslashes, so the path
# contains literal '\\' separators (unlike outputFilePath above); Windows
# tolerates this, but the two should probably be written the same way.
df = pd.read_excel(
    r'C:\\Users\\Sun\\Desktop\\Joss_General_List_Published.xlsx',
    sheet_name=0)
rawList = df.to_dict(orient='records')

# Set up tokens for the two scrapers.
# NOTE(review): `Github` is not imported anywhere in this chunk -- presumably
# PyGithub's `from github import Github` lives at the top of the full file;
# confirm, otherwise this line raises NameError.
g = Github(login_or_token="your github token", timeout=30, retry=4)
gh_api = scraper.GitHubAPI("your github token")

# Accumulator: one column-list per output field, later written to Excel.
# NOTE(review): the dict literal is truncated in this view -- it continues
# past the end of this chunk.
jossList = {
    'Title': [],
    'RepoUrl': [],
    'DoiUrl': [],
    'RepoName': [],
    'StarsCount': [],
    'Language': [],
    'RepoHasWiki': [],
    'AnonContributorsCount': [],
    'ContributorsCount': [],
    'OpenIssuesCount': [],
    'ClosedIssuesCount': [],
    'ForksCount': [],
    'OpenPullRequestsCount': [],
import os
import re
import sys

import pandas as pd

# from GitHubAPI_Crawler.github_api import GitHubAPI
import stscraper as scraper
from termcolor import colored

import api_cache

# api = GitHubAPI()

# setup for strudel scraper
# NOTE(review): token_list is empty here, so the client is created with an
# empty token string -- confirm tokens are filled in (or injected) before
# any heavy scraping, otherwise requests run unauthenticated/rate-limited.
token_list = [
]
gh_api = scraper.GitHubAPI(','.join(token_list))

# import scripts
import full_data_access as data
import markdown_analysis as md_analysis
import notebook_analysis as nb_analysis
# import keyword_analysis as kw_analysis
import repo_analysis
import code_analysis


# generates a certain number of segments
# NOTE(review): truncated in this view -- the function body continues past
# the end of this chunk.
def get_segments(num_segments):
    # segment properties
    total_nbs = 143125  # total notebook count being partitioned into segments
    segments = []
def setUp(self):
    """Create the GitHub API client and pick the repository under test."""
    # pandas-dev/pandas is deliberately a big project: well over one page
    # of both issues and commits, so pagination paths get exercised.
    self.repo_address = 'pandas-dev/pandas'
    self.api = stscraper.GitHubAPI()
- issue_no (int): number of the issue in the project
- title (str): issue title
- created_at (str): time it was reported, YYYY-MM-DDTHH:MM:SS
- body (str): issue text
- state (str): issue status, 'open' or 'closed'
"""

from __future__ import print_function

import argparse
import logging

import stscraper as scraper
import pandas as pd

# Module-level GitHub API client shared by the helpers below.
api = scraper.GitHubAPI()


def json_imap(mapping, iterable):
    """Extract json mappings from an iterable.

    Typically it is applied to an iterator returned by an API.

    Args:
        mapping (dict): mapping, same as used by `json_map`
        iterable (Iterable): any kind of a directly iterable object.

    Yields:
        dict: one mapped item per input item, lazily.
    """
    for item in iterable:
        yield scraper.json_map(mapping, item)