def process_config(config):
    """
    Master function to process a Scraper config file

    Walks the "GitHub", "GitLab", "Bitbucket", "TFS" and "DOE CODE" sections
    of ``config`` (in that order) and appends one ``Project`` per discovered
    repository / record to the ``releases`` list of a new ``Metadata`` object.

    Top-level keys read from ``config`` (all optional):
        agency              -- defaults to "UNKNOWN"
        method              -- defaults to "other"
        compute_labor_hours -- defaults to True; forwarded as ``labor_hours``
        contact_email       -- only checked for presence; a warning is logged
                               when it is missing
        github_gov_orgs     -- when truthy, an extra github.com instance
                               covering ``gov_orgs()`` is scanned
        GitHub / GitLab / Bitbucket / TFS -- lists of instance dicts
        DOE CODE            -- dict with "json", "url" and "api_key"

    The ``config`` mapping is not modified.

    Returns a Code.gov Metadata file
    """
    agency = config.get("agency", "UNKNOWN")
    logger.debug("Agency: %s", agency)

    method = config.get("method", "other")
    logger.debug("Inventory Method: %s", method)

    compute_labor_hours = config.get("compute_labor_hours", True)

    if config.get("contact_email") is None:
        # A default contact email is required to handle the (frequent) case
        # where a project / repository has no available contact email.
        logger.warning('Config file should contain a "contact_email"')

    logger.debug("Creating inventory from config: %s", config)
    code_gov_metadata = Metadata(agency, method)

    # Parse config for GitHub repositories
    # Work on a copy: appending the gov-orgs instance to the list stored in
    # ``config`` would alter the caller's data and add a duplicate entry each
    # time the same config is processed again.
    github_instances = list(config.get("GitHub", []))
    if config.get("github_gov_orgs", False):
        github_instances.append({"url": "https://github.com", "orgs": gov_orgs()})
    for instance in github_instances:
        url = instance.get("url", "https://github.com")
        orgs = instance.get("orgs", [])
        repos = instance.get("repos", [])
        public_only = instance.get("public_only", True)
        excluded = instance.get("exclude", [])
        token = instance.get("token")

        gh_session = github.connect(url, token)

        for repo in github.query_repos(gh_session, orgs, repos, public_only):
            # An exclusion entry may name a whole owner or a single repo.
            if repo.owner.login in excluded or repo.full_name in excluded:
                logger.info("Excluding: %s", repo.full_name)
                continue

            code_gov_project = Project.from_github3(
                repo, labor_hours=compute_labor_hours
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Parse config for GitLab repositories
    # NOTE: the "orgs" and "public_only" instance keys are not read here.
    gitlab_instances = config.get("GitLab", [])
    for instance in gitlab_instances:
        url = instance.get("url")
        repos = instance.get("repos", [])
        excluded = instance.get("exclude", [])
        token = instance.get("token")
        fetch_languages = instance.get("fetch_languages", False)

        gl_session = gitlab.connect(url, token)

        for repo in gitlab.query_repos(gl_session, repos):
            namespace = repo.namespace["path"]
            path_with_namespace = repo.path_with_namespace
            # An exclusion entry may name a namespace or a single repo.
            if namespace in excluded or path_with_namespace in excluded:
                logger.info("Excluding: %s", repo.path_with_namespace)
                continue

            code_gov_project = Project.from_gitlab(
                repo, labor_hours=compute_labor_hours, fetch_languages=fetch_languages
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Parse config for Bitbucket repositories
    # NOTE: the "orgs" and "public_only" instance keys are not read here.
    bitbucket_instances = config.get("Bitbucket", [])
    for instance in bitbucket_instances:
        url = instance.get("url")
        username = instance.get("username")
        password = instance.get("password")
        token = instance.get("token")
        excluded = instance.get("exclude", [])

        bb_session = bitbucket.connect(url, username, password, token)

        for repo in bitbucket.all_repos(bb_session):
            project = repo["project"]["key"]
            project_repo = "%s/%s" % (project, repo["slug"])
            # An exclusion entry may name a project key or "<key>/<slug>".
            if project in excluded or project_repo in excluded:
                logger.info("Excluding: %s", project_repo)
                continue

            code_gov_project = Project.from_stashy(
                repo, labor_hours=compute_labor_hours
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Parse config for TFS repositories
    tfs_instances = config.get("TFS", [])
    for instance in tfs_instances:
        url = instance.get("url")
        token = instance.get("token")

        projects = tfs.get_projects_metadata(url, token)
        for project in projects:
            code_gov_project = Project.from_tfs(
                project, labor_hours=compute_labor_hours
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Handle parsing of DOE CODE records
    # doecode.process() is called even when the section is absent, in which
    # case all three arguments are None.
    doecode_config = config.get("DOE CODE", {})
    doecode_json = doecode_config.get("json")
    doecode_url = doecode_config.get("url")
    doecode_key = doecode_config.get("api_key")

    for record in doecode.process(doecode_json, doecode_url, doecode_key):
        code_gov_project = Project.from_doecode(record)
        code_gov_metadata["releases"].append(code_gov_project)

    return code_gov_metadata
def main():
    """
    Command-line entry point: scrape code repositories for Code.gov / DOECode

    Command-line options are combined with the optional JSON file named by
    ``--config``:

    * agency, method, organization and contact email: a non-empty command-line
      value takes precedence over the config file value.
    * GitHub organizations / repositories: command-line values are appended to
      the lists found in the config file.

    The inventory is always written to ``code.json`` in the current working
    directory; with ``--to-csv`` a ``code.csv`` file is written as well.

    When ``--config`` is not given (or given without a value) an empty
    configuration is used. When a config file is named, any error raised while
    opening or parsing it (e.g. ``FileNotFoundError``,
    ``json.JSONDecodeError``) propagates to the caller.
    """
    parser = argparse.ArgumentParser(
        description='Scrape code repositories for Code.gov / DOECode')

    parser.add_argument('--agency', type=str, nargs='?', default='',
                        help='Agency Label, e.g. "DOE"')
    parser.add_argument('--method', type=str, nargs='?', default='',
                        help='Method of measuring open source')
    parser.add_argument(
        '--organization', type=str, nargs='?', default='',
        help='Force all repos to report a particular organzation')
    parser.add_argument(
        '--contact-email', type=str, nargs='?', default='',
        help='Force all repos to report a particular contact email')

    parser.add_argument('--config', type=str, nargs='?', default='',
                        help='Configuration File (*.json)')

    parser.add_argument('--github-orgs', type=str, nargs='+', default=[],
                        help='GitHub Organizations')
    parser.add_argument('--github-repos', type=str, nargs='+', default=[],
                        help='GitHub Repositories')
    parser.add_argument('--github-gov-orgs', action='store_true',
                        help='Use orgs from government.github.com/community')

    parser.add_argument('--to-csv', action='store_true',
                        help='Toggle output to CSV')

    parser.add_argument('--doecode-json', type=str, nargs='?', default='',
                        help='Path to DOECode .json file')

    parser.add_argument('--verbose', action='store_true',
                        help='Enable verbose output')

    args = parser.parse_args()

    _configure_logging(args.verbose)

    # Only touch the filesystem when a config file was actually named, and
    # close the handle deterministically once the JSON has been parsed.
    config_json = {}
    if args.config:
        with open(args.config) as config_file:
            config_json = json.load(config_file)

    agency = args.agency or config_json.get('agency', 'UNKNOWN')
    logger.debug('Agency: %s', agency)

    method = args.method or config_json.get('method', 'other')
    logger.debug('Inventory Method: %s', method)

    organization = args.organization or config_json.get('organization', '')
    logger.debug('Organization: %s', organization)

    contact_email = args.contact_email or config_json.get('contact_email', '')
    logger.debug('Contact Email: %s', contact_email)

    github_orgs = config_json.get('github_orgs', [])
    github_orgs.extend(args.github_orgs)
    logger.debug('GitHub.com Organizations: %s', github_orgs)

    # The government orgs are added after logging, so they are not part of
    # the debug line above.
    if args.github_gov_orgs:
        github_orgs.extend(gov_orgs())

    github_repos = config_json.get('github_repos', [])
    github_repos.extend(args.github_repos)
    logger.debug('GitHub.com Repositories: %s', github_repos)

    bitbucket_servers = config_json.get('bitbucket_servers', [])
    bitbucket_servers = [connect_to_bitbucket(s) for s in bitbucket_servers]
    logger.debug('Bitbucket Servers: %s', bitbucket_servers)

    doecode_json = args.doecode_json
    logger.debug('Queuing DOE Code JSON: %s', doecode_json)

    code_json = CodeGovMetadata(agency, method)

    for org_name in sorted(github_orgs, key=str.lower):
        code_json['releases'].extend(process_organization(org_name))

    for repo_name in sorted(github_repos, key=str.lower):
        code_json['releases'].append(process_repository(repo_name))

    # NOTE(review): key=str.lower only works if connect_to_bitbucket() returns
    # str objects; its return type is not visible from here -- confirm.
    for server in sorted(bitbucket_servers, key=str.lower):
        code_json['releases'].extend(process_bitbucket(server))

    if os.path.isfile(doecode_json):
        code_json['releases'].extend(process_doecode(doecode_json))
    elif doecode_json:
        logger.warning('Unbale to find DOECode json file: %s', doecode_json)

    # Force certain fields
    if organization:
        logger.debug('Forcing Organiation to: %s', organization)
        for release in code_json['releases']:
            release['organization'] = organization

    if contact_email:
        logger.debug('Forcing Contact Email to: %s', contact_email)
        for release in code_json['releases']:
            release['contact']['email'] = contact_email

    # The serialized JSON is written to disk only; it is not echoed to the
    # console, even in verbose mode.
    str_org_projects = code_json.to_json()

    logger.info('Number of Projects: %s', len(code_json['releases']))

    json_filename = 'code.json'
    logger.info('Writing output to: %s', json_filename)

    with open(json_filename, 'w') as fp:
        fp.write(str_org_projects)

    if args.to_csv:
        csv_filename = 'code.csv'
        with open(csv_filename, 'w') as fp:
            for project in code_json['releases']:
                fp.write(to_doe_csv(project) + '\n')
def process_config(config):
    """
    Master function to process a Scraper config file

    Walks the 'GitHub', 'GitLab', 'Bitbucket' and 'DOE CODE' sections of
    ``config`` (in that order) and appends one ``Project`` per discovered
    repository / record to the ``releases`` list of a new ``Metadata`` object.

    Top-level keys read from ``config`` (all optional):
        agency              -- defaults to 'UNKNOWN'
        method              -- defaults to 'other'
        compute_labor_hours -- defaults to True; forwarded as ``labor_hours``
        contact_email       -- only checked for presence; a warning is logged
                               when it is missing
        github_gov_orgs     -- when truthy, an extra github.com instance
                               covering ``gov_orgs()`` is scanned
        GitHub / GitLab / Bitbucket -- lists of instance dicts
        DOE CODE            -- dict with 'json', 'url' and 'api_key'

    The ``config`` mapping is not modified.

    Returns a Code.gov Metadata file
    """
    agency = config.get('agency', 'UNKNOWN')
    logger.debug('Agency: %s', agency)

    method = config.get('method', 'other')
    logger.debug('Inventory Method: %s', method)

    compute_labor_hours = config.get('compute_labor_hours', True)

    if config.get('contact_email') is None:
        # A default contact email is required to handle the (frequent) case
        # where a project / repository has no available contact email.
        logger.warning('Config file should contain a "contact_email"')

    logger.debug('Creating inventory from config: %s', config)
    code_gov_metadata = Metadata(agency, method)

    # Parse config for GitHub repositories
    # Work on a copy: appending the gov-orgs instance to the list stored in
    # ``config`` would alter the caller's data and add a duplicate entry each
    # time the same config is processed again.
    github_instances = list(config.get('GitHub', []))
    if config.get('github_gov_orgs', False):
        github_instances.append({
            'url': 'https://github.com',
            'orgs': gov_orgs(),
        })
    for instance in github_instances:
        url = instance.get('url', 'https://github.com')
        orgs = instance.get('orgs', [])
        repos = instance.get('repos', [])
        public_only = instance.get('public_only', True)
        excluded = instance.get('exclude', [])
        token = instance.get('token')

        gh_session = github.connect(url, token)

        for repo in github.query_repos(gh_session, orgs, repos, public_only):
            # An exclusion entry may name a whole owner or a single repo.
            if repo.owner.login in excluded or repo.full_name in excluded:
                logger.info('Excluding: %s', repo.full_name)
                continue

            code_gov_project = Project.from_github3(
                repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Parse config for GitLab repositories
    # NOTE: the 'orgs' and 'public_only' instance keys are not read here.
    gitlab_instances = config.get('GitLab', [])
    for instance in gitlab_instances:
        url = instance.get('url')
        repos = instance.get('repos', [])
        excluded = instance.get('exclude', [])
        token = instance.get('token')

        gl_session = gitlab.connect(url, token)

        for repo in gitlab.query_repos(gl_session, repos):
            namespace = repo.namespace['path']
            path_with_namespace = repo.path_with_namespace
            # An exclusion entry may name a namespace or a single repo.
            if namespace in excluded or path_with_namespace in excluded:
                logger.info('Excluding: %s', repo.path_with_namespace)
                continue

            code_gov_project = Project.from_gitlab(
                repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Parse config for Bitbucket repositories
    # NOTE: the 'orgs', 'public_only' and 'token' instance keys are not read
    # here; authentication uses username / password only.
    bitbucket_instances = config.get('Bitbucket', [])
    for instance in bitbucket_instances:
        url = instance.get('url')
        username = instance.get('username')
        password = instance.get('password')
        excluded = instance.get('exclude', [])

        bb_session = bitbucket.connect(url, username, password)

        for repo in bitbucket.all_repos(bb_session):
            project = repo['project']['key']
            project_repo = '%s/%s' % (project, repo['slug'])
            # An exclusion entry may name a project key or '<key>/<slug>'.
            if project in excluded or project_repo in excluded:
                logger.info('Excluding: %s', project_repo)
                continue

            code_gov_project = Project.from_stashy(
                repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Handle parsing of DOE CODE records
    # doecode.process() is called even when the section is absent, in which
    # case all three arguments are None.
    doecode_config = config.get('DOE CODE', {})
    doecode_json = doecode_config.get('json')
    doecode_url = doecode_config.get('url')
    doecode_key = doecode_config.get('api_key')

    for record in doecode.process(doecode_json, doecode_url, doecode_key):
        code_gov_project = Project.from_doecode(record)
        code_gov_metadata['releases'].append(code_gov_project)

    return code_gov_metadata
def process_config(config):
    """
    Master function to process a Scraper config file

    Builds a new ``Metadata`` inventory from ``config`` by scanning, in
    order, its 'GitHub', 'GitLab' and 'Bitbucket' instance lists and its
    'DOE CODE' section, appending one ``Project`` to ``releases`` for every
    repository / record that is not excluded.

    Top-level keys read from ``config`` (all optional):
        agency              -- defaults to 'UNKNOWN'
        method              -- defaults to 'other'
        compute_labor_hours -- defaults to True; forwarded as ``labor_hours``
        contact_email       -- only checked for presence; a warning is logged
                               when it is missing
        github_gov_orgs     -- when truthy, an extra github.com instance
                               covering ``gov_orgs()`` is scanned
        GitHub / GitLab / Bitbucket -- lists of instance dicts
        DOE CODE            -- dict with 'json', 'url' and 'api_key'

    The ``config`` mapping is not modified.

    Returns a Code.gov Metadata file
    """
    agency = config.get('agency', 'UNKNOWN')
    logger.debug('Agency: %s', agency)

    method = config.get('method', 'other')
    logger.debug('Inventory Method: %s', method)

    compute_labor_hours = config.get('compute_labor_hours', True)

    if config.get('contact_email') is None:
        # A default contact email is required to handle the (frequent) case
        # where a project / repository has no available contact email.
        logger.warning('Config file should contain a "contact_email"')

    logger.debug('Creating inventory from config: %s', config)
    code_gov_metadata = Metadata(agency, method)

    # Parse config for GitHub repositories
    # Copy before appending: config.get() hands back the caller's own list,
    # and mutating it would leak the synthetic gov-orgs instance into the
    # caller's config (and duplicate it on every subsequent call).
    github_instances = list(config.get('GitHub', []))
    if config.get('github_gov_orgs', False):
        github_instances.append({
            'url': 'https://github.com',
            'orgs': gov_orgs(),
        })
    for instance in github_instances:
        url = instance.get('url', 'https://github.com')
        orgs = instance.get('orgs', [])
        repos = instance.get('repos', [])
        public_only = instance.get('public_only', True)
        excluded = instance.get('exclude', [])
        token = instance.get('token')

        gh_session = github.connect(url, token)

        for repo in github.query_repos(gh_session, orgs, repos, public_only):
            # 'exclude' entries match either the owner login or 'owner/repo'.
            if repo.owner.login in excluded or repo.full_name in excluded:
                logger.info('Excluding: %s', repo.full_name)
                continue

            code_gov_project = Project.from_github3(repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Parse config for GitLab repositories
    # NOTE: the 'orgs' and 'public_only' instance keys are not read here.
    gitlab_instances = config.get('GitLab', [])
    for instance in gitlab_instances:
        url = instance.get('url')
        repos = instance.get('repos', [])
        excluded = instance.get('exclude', [])
        token = instance.get('token')

        gl_session = gitlab.connect(url, token)

        for repo in gitlab.query_repos(gl_session, repos):
            namespace = repo.namespace['path']
            path_with_namespace = repo.path_with_namespace
            # 'exclude' entries match either the namespace path or the full
            # path_with_namespace.
            if namespace in excluded or path_with_namespace in excluded:
                logger.info('Excluding: %s', repo.path_with_namespace)
                continue

            code_gov_project = Project.from_gitlab(repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Parse config for Bitbucket repositories
    # NOTE: the 'orgs', 'public_only' and 'token' instance keys are not read
    # here; authentication uses username / password only.
    bitbucket_instances = config.get('Bitbucket', [])
    for instance in bitbucket_instances:
        url = instance.get('url')
        username = instance.get('username')
        password = instance.get('password')
        excluded = instance.get('exclude', [])

        bb_session = bitbucket.connect(url, username, password)

        for repo in bitbucket.all_repos(bb_session):
            project = repo['project']['key']
            project_repo = '%s/%s' % (project, repo['slug'])
            # 'exclude' entries match either the project key or '<key>/<slug>'.
            if project in excluded or project_repo in excluded:
                logger.info('Excluding: %s', project_repo)
                continue

            code_gov_project = Project.from_stashy(repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Handle parsing of DOE CODE records
    # doecode.process() is called even when the section is absent, in which
    # case all three arguments are None.
    doecode_config = config.get('DOE CODE', {})
    doecode_json = doecode_config.get('json')
    doecode_url = doecode_config.get('url')
    doecode_key = doecode_config.get('api_key')

    for record in doecode.process(doecode_json, doecode_url, doecode_key):
        code_gov_project = Project.from_doecode(record)
        code_gov_metadata['releases'].append(code_gov_project)

    return code_gov_metadata