Example #1
import os

import pandas as pd
import stscraper
import stgithub
import stutils
from stutils import decorators as d  # assumed import path for the "d" helpers used below

DATE_FORMAT = "%Y-%m"
# lowercase "%v" (week number) instead of capital "V"
DATE_FORMATS = {'week': "%Y-w%v", 'day': "%Y-%m-%d"}

d.DEFAULT_EXPIRES = 3600 * 24 * 30 * 12  # default cache lifetime: 12 * 30 days, in seconds

TAC_CACHE_PATH = os.path.join(stutils.get_config('ST_FS_CACHE_PATH'),
                              'TAC_repo_cache')
if not os.path.isdir(TAC_CACHE_PATH):
    os.mkdir(TAC_CACHE_PATH)

ONE_YEAR = expires = 3600 * 24 * 30 * 12  # 12 "months" of 30 days, in seconds
fs_cache_filtered = d.typed_fs_cache('filtered', expires=ONE_YEAR)
fs_cache_aggregated = d.typed_fs_cache('aggregated', expires=ONE_YEAR)
cached_iterator = d.cache_iterator('raw')
gh_api = stscraper.GitHubAPI()
scraper = stgithub.Scraper()

get_raw_commits = cached_iterator(gh_api.repo_commits)
get_raw_issues = cached_iterator(gh_api.repo_issues)
get_raw_issue_comments = cached_iterator(gh_api.repo_issue_comments)
get_raw_issue_events = cached_iterator(gh_api.repo_issue_events)
get_raw_pulls = cached_iterator(gh_api.repo_pulls)


@fs_cache_filtered('user_timeline')
def user_timeline(user):
    return pd.read_sql("""
        SELECT DATE_FORMAT(c.created_at, %(date_format)s) as month,
               count(distinct c.project_id) as cnt
        FROM commits c, users u
        WHERE c.author_id = u.id AND u.login = %(user)s
        GROUP BY month
        ORDER BY month
        """, conn,  # assumed: `conn` is a database connection defined elsewhere
        params={'date_format': DATE_FORMAT, 'user': user})
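
For context, a minimal usage sketch of the cached wrappers above (the repository slug is an arbitrary example and the layout of the raw commit JSON is assumed, not part of the original):

# illustrative only: iterate the commits of a repository through the 'raw' cache
for commit in get_raw_commits('pandas-dev/pandas'):
    print(commit['sha'])  # assumes each item is the raw GitHub commit JSON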
Example #2

import stscraper as scraper
import pandas as pd
import time
import re
from github import Github  # PyGithub client used for the second scraper below

# output file path for the scraped JOSS GitHub repo results
outputFilePath = 'C:\\Users\\Sun\\Desktop\\Joss_Repos_Full_ConSup.xlsx'

# use a local Excel file (the output of JossGeneralRepoScraper.py) to get each
# repository URL and check whether it is a GitHub repo
df = pd.read_excel(
    'C:\\Users\\Sun\\Desktop\\Joss_General_List_Published.xlsx', sheet_name=0)
rawList = df.to_dict(orient='records')

# set up API tokens for the two scrapers
g = Github(login_or_token="your github token", timeout=30, retry=4)
gh_api = scraper.GitHubAPI("your github token")

jossList = {
    'Title': [],
    'RepoUrl': [],
    'DoiUrl': [],
    'RepoName': [],
    'StarsCount': [],
    'Language': [],
    'RepoHasWiki': [],
    'AnonContributorsCount': [],
    'ContributorsCount': [],
    'OpenIssuesCount': [],
    'ClosedIssuesCount': [],
    'ForksCount': [],
    'OpenPullRequestsCount': [],
Example #3

import os
import re
import sys
import pandas as pd
# from GitHubAPI_Crawler.github_api import GitHubAPI
import stscraper as scraper
from termcolor import colored
import api_cache

# api = GitHubAPI()

# setup for strudel scraper
token_list = [
    # GitHub API tokens go here
]
gh_api = scraper.GitHubAPI(','.join(token_list))

# import scripts
import full_data_access as data
import markdown_analysis as md_analysis
import notebook_analysis as nb_analysis
# import keyword_analysis as kw_analysis
import repo_analysis
import code_analysis

# generate the requested number of segments
def get_segments(num_segments):
    
    # segment properties
    total_nbs = 143125
    segments = []
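
A minimal sketch of one way such a segmentation could be generated, assuming equal-sized index ranges over the notebook corpus (illustrative, not the original implementation):

def get_segments_sketch(num_segments, total_nbs=143125):
    # split [0, total_nbs) into num_segments contiguous (start, end) ranges
    size = total_nbs // num_segments
    segments = []
    for i in range(num_segments):
        start = i * size
        # the last segment absorbs the remainder
        end = total_nbs if i == num_segments - 1 else (i + 1) * size
        segments.append((start, end))
    return segments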
Example #4
    def setUp(self):
        self.api = stscraper.GitHubAPI()
        # choose something that is reasonably large, at least over 1 page
        # of both issues and commits
        self.repo_address = 'pandas-dev/pandas'
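
A hypothetical test method (not part of the original excerpt) that this fixture could support; the method name and the threshold are assumptions:

    def test_repo_commits(self):
        # illustrative only: expect more than one page of commits for this repo
        commits = list(self.api.repo_commits(self.repo_address))
        self.assertGreater(len(commits), 100)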
Example #5

"""Extracted issue fields:

    - issue_no (int): number of the issue in the project
    - title (str): issue title
    - created_at (str): time it was reported, YYYY-MM-DDTHH:MM:SS
    - body (str): issue text
    - state (str): issue status, 'open' or 'closed'
"""

from __future__ import print_function

import argparse
import logging

import stscraper as scraper
import pandas as pd

api = scraper.GitHubAPI()


def json_imap(mapping, iterable):
    """Extract json mappings from an iterable.

    Typically it is applied to an iterator returned by an API

    Args:
        mapping (dict): mapping, same as used by `json_map`
        iterable (Iterable): any kind of a directly iterable object.
    Returns:
        Generator: a generator of mapped items
    """
    for item in iterable:
        yield scraper.json_map(mapping, item)
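
For illustration, a minimal usage sketch; the repository slug, the column names, and the assumption that the mapping has the form {output field: field in the API response} are not part of the original:

# illustrative only: build a DataFrame of basic issue fields
ISSUE_MAPPING = {
    'issue_no': 'number',
    'title': 'title',
    'created_at': 'created_at',
    'body': 'body',
    'state': 'state',
}
issues = pd.DataFrame(
    json_imap(ISSUE_MAPPING, api.repo_issues('pandas-dev/pandas')))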