def _scrape_sections() -> dict:
    """Get a listing of sections with their headings."""
    result = {}
    _LOG.debug("Scraping available sections")
    response = requests.get("{}/{}".format(Configuration().github_docs, Configuration().github_docs_version))
    response.raise_for_status()

    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    for topic in soup.find_all('li', {'class': 'js-topic'}):
        item = topic.h3.find_all('a')
        if len(item) != 2:
            continue

        item = item[1]
        link = _get_item_href(item)
        if link is None or len(link) != 1:
            # Omit references to the same document e.g. '/v3/search/#search-repositories'
            _LOG.debug("Omitting in-document reference %r", link)
            continue

        _add_section(result, link[0], (item.text, None))
        for child in topic.find_all('li'):
            link = _get_item_href(child.a)
            if link is None or len(link) != 1:
                # Omit references to the same document
                _LOG.debug("Omitting in-document reference %r", link)
                continue

            _add_section(result, link[0], (item.text, child.text))

    return result
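# `_get_item_href` is referenced above but not part of this excerpt.  A minimal
# sketch of what it plausibly does, inferred from its callers (a list of length
# two signals an in-document '#fragment' reference); this is a hypothetical
# illustration, not the project's actual implementation:
def _get_item_href(item):
    """Return an anchor's href split on '#', or None if there is no href."""
    if item is None or not item.get('href'):
        return None
    # '/v3/search/#search-repositories' -> ['/v3/search/', 'search-repositories']
    return item['href'].split('#')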
def _convert2swagger(internal_representation: dict) -> dict:
    """Convert the scraped internal representation into a Swagger 2.0 definition."""
    paths = {}
    for url, entries in internal_representation.items():
        for item_desc, value in entries.items():
            if not value or '@top' not in value:
                continue

            for request in value['@top'].get('@requests', []):
                endpoint = request['@endpoint'].lower()
                if endpoint not in paths:
                    paths[endpoint] = {}
                # Merge methods under one endpoint instead of overwriting the whole record.
                paths[endpoint][request['@method'].lower()] = {
                    'summary': request['@description'] or "",
                    'tags': [value['@tag']]
                }

    return {
        'paths': paths,
        "produces": [
            "application/json"
        ],
        "swagger": "2.0",
        "info": {
            "description": "GitHub API v3 swagger definition."
        },
        "host": Configuration().github_api,
        "basePath": "/{}".format(Configuration().github_docs_version),
        # Swagger 2.0 requires 'schemes' to be an array, not a plain string.
        "schemes": ["https"]
    }
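# A minimal illustration of the conversion above (hypothetical input; the
# '@top'/'@requests' keys mirror the structure built by _do_scrape below):
#
#   _convert2swagger({
#       '/v3/repos/': {
#           'List your repositories': {
#               '@top': {'@requests': [{'@endpoint': '/user/repos',
#                                       '@method': 'GET',
#                                       '@description': 'List your repositories.'}]},
#               '@tag': 'Repositories',
#           },
#       },
#   })
#
# yields a definition whose 'paths' entry contains:
#
#   {'/user/repos': {'get': {'summary': 'List your repositories.',
#                            'tags': ['Repositories']}}}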
def cli(ctx=None, verbose=0, no_color=True, user=None, password=None, token=None,
        no_validate_schemas=False, no_omit_rate_limiting=False, no_pagination=False,
        headers=None, per_page_listing=None, github_api=None):
    """Githubcap command line interface."""
    if ctx:
        ctx.auto_envvar_prefix = 'GITHUBCAP'

    setup_logging(verbose, no_color)

    if user is not None:
        Configuration().user = user
    if password is not None:
        Configuration().password = password
    if token is not None:
        Configuration().token = token
    if per_page_listing is not None:
        Configuration().per_page_listing = per_page_listing
    if github_api is not None:
        Configuration().github_api = github_api
    if headers is not None:
        Configuration().headers = _parse_cli_headers(headers)

    Configuration().omit_rate_limiting = not no_omit_rate_limiting
    Configuration().pagination = not no_pagination
    Configuration().validate_schemas = not no_validate_schemas

    _LOG.debug("Configuration: %s", attr.asdict(Configuration().instance))
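# `_parse_cli_headers` is referenced above but not shown in this excerpt.  Given
# that '-H/--headers' is documented as "a comma separated list of headers", a
# plausible sketch (hypothetical, assuming 'Name:value' pairs) would be:
def _parse_cli_headers(headers: str) -> dict:
    """Parse a comma separated 'Name:value' header listing into a dict."""
    result = {}
    for header in headers.split(','):
        name, _, value = header.partition(':')
        result[name.strip()] = value.strip()
    return result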
def _add_section(result: dict, link: str, text: typing.Tuple[str, typing.Union[None, str]]) -> None:
    """Add a section to the resulting sections mapping."""
    if link in _NO_SCRAPE_SITES:
        _LOG.debug("Skipping blacklisted section %s%s", Configuration().github_docs, link)
        return

    _LOG.debug("Found section to be scraped %r: %s%s", text, Configuration().github_docs, link)
    if link in result:
        raise ValueError("Section {!r} was already registered".format(link))

    result[link] = text
def cli_config(ctx, no_pretty=False, create=False, overwrite=False, path=None, no_print=False):
    """Manipulate githubcap configuration."""
    if no_print and not create:
        _LOG.error("Nothing to do, exiting...")
        ctx.exit(1)

    if create:
        Configuration().write2file(path, overwrite=overwrite)

    if not no_print:
        print_command_result(Configuration().to_dict(), pretty=not no_pretty)
def _do_scrape(sections: dict, schemas_dir: typing.Optional[str] = None,
               resources_dir: typing.Optional[str] = None) -> dict:
    """Scrape remote documentation and return its parsed representation."""
    _LOG.debug("Creating directory %r for resources", resources_dir)
    os.makedirs(resources_dir, exist_ok=True)
    _LOG.debug("Creating directory %r for schemas", schemas_dir)
    os.makedirs(schemas_dir, exist_ok=True)

    all_items = {}
    for link, (item_tag, item_title) in sections.items():
        record = {}
        all_items[link] = record

        url = "{}{}".format(Configuration().github_docs, link)
        _LOG.debug("Scraping %r to automatically construct classes", url)
        response = requests.get(url)
        response.raise_for_status()

        soup = bs4.BeautifulSoup(response.content, 'html.parser')
        content = soup.find_all('div', {'class': 'content'})
        if len(content) != 1:
            raise ValueError("Found multiple contents in {!r}".format(url))

        content = content[0]
        for obj in content.find_all('h2'):
            item = obj.text.strip()
            assert item not in record
            record[item] = {}

            last_desc = None
            last_section_title = None
            last_subsection_title = None
            for sibling in obj.next_siblings:
                if sibling.name == 'h2':
                    # A new top-level item starts here.
                    break

                if isinstance(sibling, bs4.element.NavigableString):
                    continue

                if sibling.name == 'p':
                    last_desc = sibling

                if sibling.name == 'h3':
                    last_section_title = sibling.text.strip()
                    last_subsection_title = None
                    _LOG.debug("Found new section %r in %r", last_section_title, item)

                if sibling.name == 'h4':
                    last_subsection_title = sibling.text.strip()
                    _LOG.debug("Found new sub-section %r in %r", last_subsection_title, item)

                if sibling.name == 'pre' and \
                        sibling.text.lstrip().startswith(('GET', 'DELETE', 'PATCH', 'POST', 'PUT')):
                    where = _where_location(record, item, last_section_title, last_subsection_title)
                    if '@requests' not in where:
                        where['@requests'] = []
                    where['@requests'].append(_parse_request_def(sibling.text, last_desc))

                if sibling.name == 'table':
                    _LOG.debug("Found table describing types for %r (subsection %r) in %r",
                               last_section_title, last_subsection_title, item)
                    where = _where_location(record, item, last_section_title, last_subsection_title)
                    type_def = _parse_type_definition(sibling)
                    if '@types' in where:
                        # A second table in the same scope describes a nested type.
                        var_name = last_desc.find('code').text
                        if '@subtypes' not in where:
                            where['@subtypes'] = []
                        where['@subtypes'].append({var_name: type_def})
                    else:
                        where['@types'] = type_def

                if sibling.name == 'pre' and 'highlight-json' in sibling.attrs.get('class', []):
                    _LOG.debug("Found JSON format in %r for %r, subsection %r",
                               last_section_title, item, last_subsection_title)
                    where = _where_location(record, item, last_section_title, last_subsection_title)
                    if '@json' not in where:
                        where['@json'] = []
                    where['@json'].append(json.loads(sibling.text))

                if sibling.name == 'pre' and 'highlight-headers' in sibling.attrs.get('class', []):
                    _LOG.debug("Found headers in %r for %r, subsection %r",
                               last_section_title, item, last_subsection_title)
                    where = _where_location(record, item, last_section_title, last_subsection_title)
                    if '@headers' not in where:
                        where['@headers'] = []
                    where['@headers'].append(sibling.text)

                if sibling.name == 'div' and 'note' in sibling.attrs.get('class', []):
                    for code_tag in sibling.find_all('code'):
                        if code_tag.text.startswith('application/vnd.github.'):
                            where = _where_location(record, item, last_section_title, last_subsection_title)
                            assert '@additional-headers' not in where, where
                            where['@additional-headers'] = code_tag.text.strip()

            if not record[item]:
                _LOG.debug("No valuable data found for %r", item)
                record.pop(item)
            else:
                record[item]['@description'] = _find_section_description(obj)
                record[item]['@tag'] = item_tag
                record[item]['@title'] = item_title

    return all_items
def validate_response(validator, response):
    """Validate a response against its schema if schema validation is turned on."""
    if Configuration().validate_schemas:
        validator(response)

    return response
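# Example usage of validate_response (REPOS_SCHEMA is hypothetical here; any
# callable that raises on an invalid payload can act as the validator):
#
#   payload = requests.get("https://{}/user/repos".format(Configuration().github_api)).json()
#   payload = validate_response(REPOS_SCHEMA, payload)  # no-op when validation is disabled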
)
@click.option('-P', '--no-pagination', is_flag=True,
              help="Do not respect pagination - do not perform multiple API calls on a paginated response.")
@click.option('-H', '--headers', type=str,
              help="A comma separated list of headers to be sent.")
@click.option('-l', '--per_page_listing', type=int, show_default=True,
              default=Configuration().per_page_listing,
              help="Number of entries in a page listing in a single API call.")
@click.option('--github-api', type=str, default=Configuration().github_api, show_default=True,
              help="GitHub API endpoint.")
@click.option('--no-validate-schemas', '-S', is_flag=True,
              help="Do not validate schemas of API responses.")
def cli(ctx=None, verbose=0, no_color=True, user=None, password=None, token=None,
        no_validate_schemas=False, no_omit_rate_limiting=False, no_pagination=False,
        headers=None, per_page_listing=None, github_api=None):