Exemple #1
0
def process_config(config):
    """
    Master function to process a Scraper config file

    Walks the "GitHub", "GitLab", "Bitbucket", "TFS" and "DOE CODE" sections
    of ``config``, converts every repository / record that is not excluded
    into a ``Project`` and appends it to the "releases" list of a new
    ``Metadata`` object.

    Args:
        config: Mapping with the scraper configuration. Top-level keys read
            here: "agency", "method", "compute_labor_hours",
            "contact_email", "github_gov_orgs", "GitHub", "GitLab",
            "Bitbucket", "TFS" and "DOE CODE". The mapping and the lists it
            holds are left unmodified.

    Returns a Code.gov Metadata file
    """

    agency = config.get("agency", "UNKNOWN")
    logger.debug("Agency: %s", agency)

    method = config.get("method", "other")
    logger.debug("Inventory Method: %s", method)

    compute_labor_hours = config.get("compute_labor_hours", True)

    if config.get("contact_email", None) is None:
        # A default contact email is required to handle the (frequent) case
        # where a project / repository has no available contact email.
        logger.warning('Config file should contain a "contact_email"')

    # NOTE(review): this prints the whole config, including any "token" /
    # "password" entries it holds -- confirm that is acceptable at debug level.
    logger.debug("Creating inventory from config: %s", config)
    code_gov_metadata = Metadata(agency, method)

    # Parse config for GitHub repositories
    # Work on a shallow copy: appending the government orgs instance below
    # must not grow the list owned by the caller's config, otherwise calling
    # this function again with the same config would add it a second time.
    github_instances = list(config.get("GitHub", []))
    if config.get("github_gov_orgs", False):
        github_instances.append({"url": "https://github.com", "orgs": gov_orgs()})
    for instance in github_instances:
        url = instance.get("url", "https://github.com")
        orgs = instance.get("orgs", [])
        repos = instance.get("repos", [])
        public_only = instance.get("public_only", True)
        excluded = instance.get("exclude", [])
        token = instance.get("token", None)

        gh_session = github.connect(url, token)

        for repo in github.query_repos(gh_session, orgs, repos, public_only):
            # An exclude entry may name a whole owner or a single "owner/repo".
            if repo.owner.login in excluded or repo.full_name in excluded:
                logger.info("Excluding: %s", repo.full_name)
                continue

            code_gov_project = Project.from_github3(
                repo, labor_hours=compute_labor_hours
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Parse config for GitLab repositories
    gitlab_instances = config.get("GitLab", [])
    for instance in gitlab_instances:
        url = instance.get("url")
        # orgs = instance.get('orgs', [])
        repos = instance.get("repos", [])
        # public_only = instance.get('public_only', True)
        excluded = instance.get("exclude", [])
        token = instance.get("token", None)
        fetch_languages = instance.get("fetch_languages", False)

        gl_session = gitlab.connect(url, token)

        for repo in gitlab.query_repos(gl_session, repos):
            namespace = repo.namespace["path"]
            path_with_namespace = repo.path_with_namespace
            # An exclude entry may name a namespace or a full repository path.
            if namespace in excluded or path_with_namespace in excluded:
                logger.info("Excluding: %s", repo.path_with_namespace)
                continue

            code_gov_project = Project.from_gitlab(
                repo, labor_hours=compute_labor_hours, fetch_languages=fetch_languages
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Parse config for Bitbucket repositories
    bitbucket_instances = config.get("Bitbucket", [])
    for instance in bitbucket_instances:
        url = instance.get("url")
        # orgs = instance.get('orgs', None)
        # public_only = instance.get('public_only', True)
        username = instance.get("username", None)
        password = instance.get("password", None)
        token = instance.get("token", None)
        excluded = instance.get("exclude", [])

        bb_session = bitbucket.connect(url, username, password, token)

        for repo in bitbucket.all_repos(bb_session):
            project = repo["project"]["key"]
            project_repo = "%s/%s" % (project, repo["slug"])
            # An exclude entry may name a project key or "<project>/<slug>".
            if project in excluded or project_repo in excluded:
                logger.info("Excluding: %s", project_repo)
                continue

            code_gov_project = Project.from_stashy(
                repo, labor_hours=compute_labor_hours
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Parse config for TFS repositories
    tfs_instances = config.get("TFS", [])
    for instance in tfs_instances:
        url = instance.get("url")
        token = instance.get("token", None)

        projects = tfs.get_projects_metadata(url, token)
        for project in projects:
            code_gov_project = Project.from_tfs(
                project, labor_hours=compute_labor_hours
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Handle parsing of DOE CODE records
    doecode_config = config.get("DOE CODE", {})
    doecode_json = doecode_config.get("json", None)
    doecode_url = doecode_config.get("url", None)
    doecode_key = doecode_config.get("api_key", None)

    for record in doecode.process(doecode_json, doecode_url, doecode_key):
        code_gov_project = Project.from_doecode(record)
        code_gov_metadata["releases"].append(code_gov_project)

    return code_gov_metadata
Exemple #2
0
def main():
    """
    Command-line entry point of the scraper.

    Parses the command-line arguments and merges them with the optional JSON
    configuration file: for the scalar options (agency, method, organization,
    contact email) a non-empty command-line value wins over the file, while
    the GitHub organization / repository lists from both are concatenated.
    The collected releases are written to ``code.json`` in the current
    working directory; with ``--to-csv`` they are also written to
    ``code.csv``, one line per release.

    Raises:
        FileNotFoundError: if ``--config`` names a file that does not exist.
        json.JSONDecodeError: if the ``--config`` file is not valid JSON.
    """
    parser = argparse.ArgumentParser(
        description='Scrape code repositories for Code.gov / DOECode')

    parser.add_argument('--agency',
                        type=str,
                        nargs='?',
                        default='',
                        help='Agency Label, e.g. "DOE"')
    parser.add_argument('--method',
                        type=str,
                        nargs='?',
                        default='',
                        help='Method of measuring open source')
    parser.add_argument(
        '--organization',
        type=str,
        nargs='?',
        default='',
        help='Force all repos to report a particular organization')
    parser.add_argument(
        '--contact-email',
        type=str,
        nargs='?',
        default='',
        help='Force all repos to report a particular contact email')

    parser.add_argument('--config',
                        type=str,
                        nargs='?',
                        default='',
                        help='Configuration File (*.json)')

    parser.add_argument('--github-orgs',
                        type=str,
                        nargs='+',
                        default=[],
                        help='GitHub Organizations')
    parser.add_argument('--github-repos',
                        type=str,
                        nargs='+',
                        default=[],
                        help='GitHub Repositories')
    parser.add_argument('--github-gov-orgs',
                        action='store_true',
                        help='Use orgs from government.github.com/community')

    parser.add_argument('--to-csv',
                        action='store_true',
                        help='Toggle output to CSV')

    parser.add_argument('--doecode-json',
                        type=str,
                        nargs='?',
                        default='',
                        help='Path to DOECode .json file')

    parser.add_argument('--verbose',
                        action='store_true',
                        help='Enable verbose output')

    args = parser.parse_args()

    _configure_logging(args.verbose)

    # Only read a config file when one was actually named. With nargs='?'
    # the value is '' when the option is absent and None when it is given
    # without a value; both mean "no config file". Errors for a named file
    # (missing, invalid JSON) propagate to the caller.
    if args.config:
        with open(args.config) as config_fp:
            config_json = json.load(config_fp)
    else:
        config_json = {}

    agency = config_json.get('agency', 'UNKNOWN')
    agency = args.agency or agency
    logger.debug('Agency: %s', agency)

    method = config_json.get('method', 'other')
    method = args.method or method
    logger.debug('Inventory Method: %s', method)

    organization = config_json.get('organization', '')
    organization = args.organization or organization
    logger.debug('Organization: %s', organization)

    contact_email = config_json.get('contact_email', '')
    contact_email = args.contact_email or contact_email
    logger.debug('Contact Email: %s', contact_email)

    github_orgs = config_json.get('github_orgs', [])
    github_orgs.extend(args.github_orgs)
    logger.debug('GitHub.com Organizations: %s', github_orgs)

    if args.github_gov_orgs:
        github_orgs.extend(gov_orgs())

    github_repos = config_json.get('github_repos', [])
    github_repos.extend(args.github_repos)
    logger.debug('GitHub.com Repositories: %s', github_repos)

    bitbucket_servers = config_json.get('bitbucket_servers', [])
    bitbucket_servers = [connect_to_bitbucket(s) for s in bitbucket_servers]
    logger.debug('Bitbucket Servers: %s', bitbucket_servers)

    doecode_json = args.doecode_json
    logger.debug('Queuing DOE Code JSON: %s', doecode_json)

    code_json = CodeGovMetadata(agency, method)

    # Sources are visited in case-insensitive name order.
    for org_name in sorted(github_orgs, key=str.lower):
        code_json['releases'].extend(process_organization(org_name))

    for repo_name in sorted(github_repos, key=str.lower):
        code_json['releases'].append(process_repository(repo_name))

    # NOTE(review): key=str.lower only accepts str items, but the items here
    # are whatever connect_to_bitbucket() returned -- confirm this sort works
    # for a non-empty server list.
    for bitbucket in sorted(bitbucket_servers, key=str.lower):
        code_json['releases'].extend(process_bitbucket(bitbucket))

    # Guard against a missing value (None) before touching the filesystem.
    if doecode_json and os.path.isfile(doecode_json):
        code_json['releases'].extend(process_doecode(doecode_json))
    elif doecode_json:
        logger.warning('Unable to find DOECode json file: %s', doecode_json)

    # Force certain fields
    if organization:
        logger.debug('Forcing Organization to: %s', organization)
        for release in code_json['releases']:
            release['organization'] = organization

    if contact_email:
        logger.debug('Forcing Contact Email to: %s', contact_email)
        for release in code_json['releases']:
            release['contact']['email'] = contact_email

    str_org_projects = code_json.to_json()

    # -- I don't believe we need to be outputing to JSON to the console
    #   -- Maybe if "very verbose" ?
    # if args.verbose:
    #     print(str_org_projects)

    logger.info('Number of Projects: %s', len(code_json['releases']))

    json_filename = 'code.json'
    logger.info('Writing output to: %s', json_filename)

    with open(json_filename, 'w') as fp:
        fp.write(str_org_projects)

    if args.to_csv:
        csv_filename = 'code.csv'
        with open(csv_filename, 'w') as fp:
            for project in code_json['releases']:
                fp.write(to_doe_csv(project) + '\n')
Exemple #3
0
def process_config(config):
    """
    Master function to process a Scraper config file

    Walks the 'GitHub', 'GitLab', 'Bitbucket' and 'DOE CODE' sections of
    ``config``, converts every repository / record that is not excluded into
    a ``Project`` and appends it to the 'releases' list of a new ``Metadata``
    object.

    Args:
        config: Mapping with the scraper configuration. Top-level keys read
            here: 'agency', 'method', 'compute_labor_hours',
            'contact_email', 'github_gov_orgs', 'GitHub', 'GitLab',
            'Bitbucket' and 'DOE CODE'. The mapping and the lists it holds
            are left unmodified.

    Returns a Code.gov Metadata file
    """

    agency = config.get('agency', 'UNKNOWN')
    logger.debug('Agency: %s', agency)

    method = config.get('method', 'other')
    logger.debug('Inventory Method: %s', method)

    compute_labor_hours = config.get('compute_labor_hours', True)

    if config.get('contact_email', None) is None:
        # A default contact email is required to handle the (frequent) case
        # where a project / repository has no available contact email.
        logger.warning('Config file should contain a "contact_email"')

    # NOTE(review): this prints the whole config, including any 'token' /
    # 'password' entries it holds -- confirm that is acceptable at debug level.
    logger.debug('Creating inventory from config: %s', config)
    code_gov_metadata = Metadata(agency, method)

    # Parse config for GitHub repositories
    # Work on a shallow copy: appending the government orgs instance below
    # must not grow the list owned by the caller's config, otherwise calling
    # this function again with the same config would add it a second time.
    github_instances = list(config.get('GitHub', []))
    if config.get('github_gov_orgs', False):
        github_instances.append({
            'url': 'https://github.com',
            'orgs': gov_orgs(),
        })
    for instance in github_instances:
        url = instance.get('url', 'https://github.com')
        orgs = instance.get('orgs', [])
        repos = instance.get('repos', [])
        public_only = instance.get('public_only', True)
        excluded = instance.get('exclude', [])
        token = instance.get('token', None)

        gh_session = github.connect(url, token)

        for repo in github.query_repos(gh_session, orgs, repos, public_only):
            # An exclude entry may name a whole owner or a single "owner/repo".
            if repo.owner.login in excluded or repo.full_name in excluded:
                logger.info('Excluding: %s', repo.full_name)
                continue

            code_gov_project = Project.from_github3(
                repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Parse config for GitLab repositories
    gitlab_instances = config.get('GitLab', [])
    for instance in gitlab_instances:
        url = instance.get('url')
        # orgs = instance.get('orgs', [])
        repos = instance.get('repos', [])
        # public_only = instance.get('public_only', True)
        excluded = instance.get('exclude', [])
        token = instance.get('token', None)

        gl_session = gitlab.connect(url, token)

        for repo in gitlab.query_repos(gl_session, repos):
            namespace = repo.namespace['path']
            path_with_namespace = repo.path_with_namespace
            # An exclude entry may name a namespace or a full repository path.
            if namespace in excluded or path_with_namespace in excluded:
                logger.info('Excluding: %s', repo.path_with_namespace)
                continue

            code_gov_project = Project.from_gitlab(
                repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Parse config for Bitbucket repositories
    bitbucket_instances = config.get('Bitbucket', [])
    for instance in bitbucket_instances:
        url = instance.get('url')
        # orgs = instance.get('orgs', None)
        # public_only = instance.get('public_only', True)
        # token = instance.get('token', None)
        username = instance.get('username')
        password = instance.get('password')
        excluded = instance.get('exclude', [])

        bb_session = bitbucket.connect(url, username, password)

        for repo in bitbucket.all_repos(bb_session):
            project = repo['project']['key']
            project_repo = '%s/%s' % (project, repo['slug'])
            # An exclude entry may name a project key or "<project>/<slug>".
            if project in excluded or project_repo in excluded:
                logger.info('Excluding: %s', project_repo)
                continue

            code_gov_project = Project.from_stashy(
                repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Handle parsing of DOE CODE records

    doecode_config = config.get('DOE CODE', {})
    doecode_json = doecode_config.get('json', None)
    doecode_url = doecode_config.get('url', None)
    doecode_key = doecode_config.get('api_key', None)

    for record in doecode.process(doecode_json, doecode_url, doecode_key):
        code_gov_project = Project.from_doecode(record)
        code_gov_metadata['releases'].append(code_gov_project)

    return code_gov_metadata
Exemple #4
0
def process_config(config):
    """
    Master function to process a Scraper config file

    Walks the 'GitHub', 'GitLab', 'Bitbucket' and 'DOE CODE' sections of
    ``config``, converts every repository / record that is not excluded into
    a ``Project`` and appends it to the 'releases' list of a new ``Metadata``
    object.

    Args:
        config: Mapping with the scraper configuration. Top-level keys read
            here: 'agency', 'method', 'compute_labor_hours',
            'contact_email', 'github_gov_orgs', 'GitHub', 'GitLab',
            'Bitbucket' and 'DOE CODE'. The mapping and the lists it holds
            are left unmodified.

    Returns a Code.gov Metadata file
    """

    agency = config.get('agency', 'UNKNOWN')
    logger.debug('Agency: %s', agency)

    method = config.get('method', 'other')
    logger.debug('Inventory Method: %s', method)

    compute_labor_hours = config.get('compute_labor_hours', True)

    if config.get('contact_email', None) is None:
        # A default contact email is required to handle the (frequent) case
        # where a project / repository has no available contact email.
        logger.warning('Config file should contain a "contact_email"')

    # NOTE(review): this prints the whole config, including any 'token' /
    # 'password' entries it holds -- confirm that is acceptable at debug level.
    logger.debug('Creating inventory from config: %s', config)
    code_gov_metadata = Metadata(agency, method)

    # Parse config for GitHub repositories
    # Work on a shallow copy: appending the government orgs instance below
    # must not grow the list owned by the caller's config, otherwise calling
    # this function again with the same config would add it a second time.
    github_instances = list(config.get('GitHub', []))
    if config.get('github_gov_orgs', False):
        github_instances.append({
            'url': 'https://github.com',
            'orgs': gov_orgs(),
        })
    for instance in github_instances:
        url = instance.get('url', 'https://github.com')
        orgs = instance.get('orgs', [])
        repos = instance.get('repos', [])
        public_only = instance.get('public_only', True)
        excluded = instance.get('exclude', [])
        token = instance.get('token', None)

        gh_session = github.connect(url, token)

        for repo in github.query_repos(gh_session, orgs, repos, public_only):
            # An exclude entry may name a whole owner or a single "owner/repo".
            if repo.owner.login in excluded or repo.full_name in excluded:
                logger.info('Excluding: %s', repo.full_name)
                continue

            code_gov_project = Project.from_github3(repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Parse config for GitLab repositories
    gitlab_instances = config.get('GitLab', [])
    for instance in gitlab_instances:
        url = instance.get('url')
        # orgs = instance.get('orgs', [])
        repos = instance.get('repos', [])
        # public_only = instance.get('public_only', True)
        excluded = instance.get('exclude', [])
        token = instance.get('token', None)

        gl_session = gitlab.connect(url, token)

        for repo in gitlab.query_repos(gl_session, repos):
            namespace = repo.namespace['path']
            path_with_namespace = repo.path_with_namespace
            # An exclude entry may name a namespace or a full repository path.
            if namespace in excluded or path_with_namespace in excluded:
                logger.info('Excluding: %s', repo.path_with_namespace)
                continue

            code_gov_project = Project.from_gitlab(repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Parse config for Bitbucket repositories
    bitbucket_instances = config.get('Bitbucket', [])
    for instance in bitbucket_instances:
        url = instance.get('url')
        # orgs = instance.get('orgs', None)
        # public_only = instance.get('public_only', True)
        # token = instance.get('token', None)
        username = instance.get('username')
        password = instance.get('password')
        excluded = instance.get('exclude', [])

        bb_session = bitbucket.connect(url, username, password)

        for repo in bitbucket.all_repos(bb_session):
            project = repo['project']['key']
            project_repo = '%s/%s' % (project, repo['slug'])
            # An exclude entry may name a project key or "<project>/<slug>".
            if project in excluded or project_repo in excluded:
                logger.info('Excluding: %s', project_repo)
                continue

            code_gov_project = Project.from_stashy(repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Handle parsing of DOE CODE records

    doecode_config = config.get('DOE CODE', {})
    doecode_json = doecode_config.get('json', None)
    doecode_url = doecode_config.get('url', None)
    doecode_key = doecode_config.get('api_key', None)

    for record in doecode.process(doecode_json, doecode_url, doecode_key):
        code_gov_project = Project.from_doecode(record)
        code_gov_metadata['releases'].append(code_gov_project)

    return code_gov_metadata