예제 #1
0
def github_releases(org: str, name: str, config: GithubConfig) -> ETLReference:
    """Sync the releases listing of ``org/name`` and return a reference to it.

    The listing URL is built by ``config.releases_url``. Every entry produced
    by ``_sync_continuous_data`` is consumed and discarded here; the sync
    itself records new entries in the ``ETLDataset`` opened for that URL.
    """
    releases_url = config.releases_url(org, name)
    with APILimiter(releases_url, DELAY) as limiter, \
            ETLDataset(releases_url) as dataset:
        synced_entries = _sync_continuous_data(limiter, dataset, releases_url,
                                               config.auth(), config.headers)
        # The generator is lazy: it must be drained for the sync to happen.
        for _ in synced_entries:
            pass

    return ETLReference(releases_url)
예제 #2
0
def github_tags(org: str, name: str, config: GithubConfig) -> ETLReference:
    """Sync the tags listing of ``org/name`` and return a reference to it.

    The listing URL is built by ``config.tags_url``. Every entry produced by
    ``_sync_continuous_data`` is consumed and discarded here; the sync itself
    records new entries in the ``ETLDataset`` opened for that URL.
    """
    tags_url = config.tags_url(org, name)
    with APILimiter(tags_url, DELAY) as limiter, \
            ETLDataset(tags_url) as dataset:
        synced_entries = _sync_continuous_data(limiter, dataset, tags_url,
                                               config.auth(), config.headers)
        # The generator is lazy: it must be drained for the sync to happen.
        for _ in synced_entries:
            pass
        # TODO: build an abstraction that will update a list of times
        # (possibly meant "items") in place. /tags doesn't return enough
        # data, so /tags/:sha will need to be called for each entry to get
        # a more complete dataset.

    return ETLReference(tags_url)
예제 #3
0
def _sync_continuous_data(
        api_limiter: APILimiter,
        etl_dataset: ETLDataset,
        base_url: str,
        auth: HTTPBasicAuth,
        headers: typing.Optional[typing.Dict[str, str]] = None,
        params: typing.Optional[typing.Dict[str, str]] = None
) -> typing.Iterator[typing.Any]:
    """Page through ``base_url`` and yield each entry not yet in ``etl_dataset``.

    This is a generator: nothing is requested until it is iterated. Pages are
    fetched one at a time, starting at page 1, with ``api_limiter.delay()``
    called before every request. Each entry that ``etl_dataset`` does not
    already contain is added to it and then yielded.

    Iteration stops when any of the following happens:

    * a page comes back empty,
    * an entry is found that ``etl_dataset`` already contains,
    * the response status is not 200, in which case the response is handed to
      ``_handle_github_error`` first.

    Args:
        api_limiter: Limiter whose ``delay()`` is invoked before each request.
        etl_dataset: Dataset queried with ``contains`` and extended with ``add``.
        base_url: Listing URL without a query string.
        auth: Credentials forwarded to ``requests.get``.
        headers: Extra request headers; defaults to no extra headers.
        params: Extra query parameters merged over the built-in
            ``state=all`` / ``sort=created``. ``page`` and ``per_page`` are
            always set by this function and override caller values.

    Yields:
        Each newly added entry, in the order the API returned it.
    """
    # GitHub's REST API paginates with `per_page` (not `limit`).
    page_size = 100

    # Fresh containers per call instead of shared mutable defaults.
    request_headers = {} if headers is None else headers
    request_params: typing.Dict[str, typing.Any] = {
        'state': 'all',
        'sort': 'created',
    }
    if params:
        request_params.update(params)

    page = 1
    while True:
        request_params['page'] = page
        request_params['per_page'] = page_size
        url = '?'.join([base_url, urlencode(request_params)])
        api_limiter.delay()
        logger.info(f'Pulling URL[{url}]')
        response = requests.get(url, headers=request_headers, auth=auth)
        if response.status_code != 200:
            _handle_github_error(response)
            return

        dataset = response.json()
        if not dataset:
            # An empty page means the listing is exhausted.
            return

        for entry in dataset:
            # `is False` is deliberate: only an explicit False from
            # `contains` counts as "not seen yet".
            if etl_dataset.contains(entry) is not False:
                # Reached data that is already stored; stop syncing.
                return
            etl_dataset.add(entry)
            yield entry

        page += 1
예제 #4
0
def github_repo(org: str, name: str, config: GithubConfig) -> ETLReference:
    """Fetch the repository document for ``org/name`` and store it.

    A 200 response has its JSON body passed to ``ETLDataset.update``. A 404 is
    logged as a probable access problem and otherwise ignored. Any other
    status raises ``NotImplementedError``.

    Raises:
        NotImplementedError: For any status code other than 200 or 404.
    """
    repo_url = config.repos_url(org, name)
    # NOTE(review): the limiter is entered but `delay()` is never called here,
    # unlike the paginated sync path -- confirm whether that is intentional.
    with APILimiter(repo_url, DELAY) as api_limiter, \
            ETLDataset(repo_url) as dataset:
        response = requests.get(repo_url,
                                auth=config.auth(),
                                headers=config.headers)
        status = response.status_code
        if status == 200:
            dataset.update(response.json())
        elif status == 404:
            logger.error(
                f'User[{config.username}] may not have access to Repo[{org}/{name}]'
            )
        else:
            raise NotImplementedError(f'{status}: {repo_url}')

    return ETLReference(repo_url)