Beispiel #1
0
def process_config(config):
    """
    Master function to process a Scraper config file

    Reads the "GitHub", "GitLab", "Bitbucket", "TFS" and "DOE CODE" sections
    of ``config``, converts every repository / project / record found there
    into a ``Project`` and appends it to the "releases" list of a freshly
    created ``Metadata`` object.

    ``config`` is only read: the GitHub instance list is copied before the
    optional government-orgs instance is added, so calling this function
    repeatedly with the same config yields the same inventory.

    Top-level keys read from ``config``:
        agency              -- passed to ``Metadata`` (default "UNKNOWN")
        method              -- passed to ``Metadata`` (default "other")
        compute_labor_hours -- forwarded as ``labor_hours`` (default True)
        contact_email       -- only checked for presence; a warning is logged
                               when it is missing
        github_gov_orgs     -- when truthy, github.com is additionally queried
                               for the orgs returned by ``gov_orgs()``

    Returns a Code.gov Metadata file
    """

    agency = config.get("agency", "UNKNOWN")
    logger.debug("Agency: %s", agency)

    method = config.get("method", "other")
    logger.debug("Inventory Method: %s", method)

    compute_labor_hours = config.get("compute_labor_hours", True)

    if config.get("contact_email", None) is None:
        # A default contact email is required to handle the (frequent) case
        # where a project / repository has no available contact email.
        logger.warning('Config file should contain a "contact_email"')

    # NOTE(review): the instance sections read below carry "token" /
    # "password" values, so this debug line writes credentials to the log.
    logger.debug("Creating inventory from config: %s", config)
    code_gov_metadata = Metadata(agency, method)

    # Parse config for GitHub repositories
    # Copy the list: appending to the object returned by config.get() would
    # modify the caller's config and add a duplicate entry on every call.
    github_instances = list(config.get("GitHub", []))
    if config.get("github_gov_orgs", False):
        github_instances.append({"url": "https://github.com", "orgs": gov_orgs()})
    for instance in github_instances:
        url = instance.get("url", "https://github.com")
        orgs = instance.get("orgs", [])
        repos = instance.get("repos", [])
        public_only = instance.get("public_only", True)
        excluded = instance.get("exclude", [])
        token = instance.get("token", None)

        gh_session = github.connect(url, token)

        for repo in github.query_repos(gh_session, orgs, repos, public_only):
            # An "exclude" entry may name a whole owner or a single
            # "owner/repo" full name.
            if repo.owner.login in excluded or repo.full_name in excluded:
                logger.info("Excluding: %s", repo.full_name)
                continue

            code_gov_project = Project.from_github3(
                repo, labor_hours=compute_labor_hours
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Parse config for GitLab repositories
    # NOTE: "orgs" and "public_only" are not read for GitLab instances.
    gitlab_instances = config.get("GitLab", [])
    for instance in gitlab_instances:
        url = instance.get("url")
        repos = instance.get("repos", [])
        excluded = instance.get("exclude", [])
        token = instance.get("token", None)
        fetch_languages = instance.get("fetch_languages", False)

        gl_session = gitlab.connect(url, token)

        for repo in gitlab.query_repos(gl_session, repos):
            namespace = repo.namespace["path"]
            path_with_namespace = repo.path_with_namespace
            # An "exclude" entry may name a namespace or a full
            # namespace/project path.
            if namespace in excluded or path_with_namespace in excluded:
                logger.info("Excluding: %s", repo.path_with_namespace)
                continue

            code_gov_project = Project.from_gitlab(
                repo, labor_hours=compute_labor_hours, fetch_languages=fetch_languages
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Parse config for Bitbucket repositories
    # NOTE: "orgs" and "public_only" are not read for Bitbucket instances.
    bitbucket_instances = config.get("Bitbucket", [])
    for instance in bitbucket_instances:
        url = instance.get("url")
        username = instance.get("username", None)
        password = instance.get("password", None)
        token = instance.get("token", None)
        excluded = instance.get("exclude", [])

        bb_session = bitbucket.connect(url, username, password, token)

        for repo in bitbucket.all_repos(bb_session):
            project = repo["project"]["key"]
            project_repo = "%s/%s" % (project, repo["slug"])
            # An "exclude" entry may name a project key or a
            # "project/slug" pair.
            if project in excluded or project_repo in excluded:
                logger.info("Excluding: %s", project_repo)
                continue

            code_gov_project = Project.from_stashy(
                repo, labor_hours=compute_labor_hours
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Parse config for TFS repositories
    tfs_instances = config.get("TFS", [])
    for instance in tfs_instances:
        url = instance.get("url")
        token = instance.get("token", None)

        projects = tfs.get_projects_metadata(url, token)
        for project in projects:
            code_gov_project = Project.from_tfs(
                project, labor_hours=compute_labor_hours
            )
            code_gov_metadata["releases"].append(code_gov_project)

    # Handle parsing of DOE CODE records
    # doecode.process() is called even when the config has no "DOE CODE"
    # section; in that case all three arguments are None.
    doecode_config = config.get("DOE CODE", {})
    doecode_json = doecode_config.get("json", None)
    doecode_url = doecode_config.get("url", None)
    doecode_key = doecode_config.get("api_key", None)

    for record in doecode.process(doecode_json, doecode_url, doecode_key):
        code_gov_project = Project.from_doecode(record)
        code_gov_metadata["releases"].append(code_gov_project)

    return code_gov_metadata
Beispiel #2
0
def process_config(config):
    """
    Master function to process a Scraper config file

    Reads the 'GitHub', 'GitLab', 'Bitbucket', 'TFS' and 'DOE CODE' sections
    of ``config``, converts every repository / project / record found there
    into a ``Project`` and appends it to the 'releases' list of a freshly
    created ``Metadata`` object.

    ``config`` is only read: the GitHub instance list is copied before the
    optional government-orgs instance is added, so calling this function
    repeatedly with the same config yields the same inventory.

    Top-level keys read from ``config``:
        agency              -- passed to ``Metadata`` (default 'UNKNOWN')
        method              -- passed to ``Metadata`` (default 'other')
        compute_labor_hours -- forwarded as ``labor_hours`` (default True)
        contact_email       -- only checked for presence; a warning is logged
                               when it is missing
        github_gov_orgs     -- when truthy, github.com is additionally queried
                               for the orgs returned by ``gov_orgs()``

    Returns a Code.gov Metadata file
    """

    agency = config.get('agency', 'UNKNOWN')
    logger.debug('Agency: %s', agency)

    method = config.get('method', 'other')
    logger.debug('Inventory Method: %s', method)

    compute_labor_hours = config.get('compute_labor_hours', True)

    if config.get('contact_email', None) is None:
        # A default contact email is required to handle the (frequent) case
        # where a project / repository has no available contact email.
        logger.warning('Config file should contain a "contact_email"')

    # NOTE(review): the instance sections read below carry 'token' /
    # 'password' values, so this debug line writes credentials to the log.
    logger.debug('Creating inventory from config: %s', config)
    code_gov_metadata = Metadata(agency, method)

    # Parse config for GitHub repositories
    # Copy the list: appending to the object returned by config.get() would
    # modify the caller's config and add a duplicate entry on every call.
    github_instances = list(config.get('GitHub', []))
    if config.get('github_gov_orgs', False):
        github_instances.append({
            'url': 'https://github.com',
            'orgs': gov_orgs(),
        })
    for instance in github_instances:
        url = instance.get('url', 'https://github.com')
        orgs = instance.get('orgs', [])
        repos = instance.get('repos', [])
        public_only = instance.get('public_only', True)
        excluded = instance.get('exclude', [])
        token = instance.get('token', None)

        gh_session = github.connect(url, token)

        for repo in github.query_repos(gh_session, orgs, repos, public_only):
            # An 'exclude' entry may name a whole owner or a single
            # 'owner/repo' full name.
            if repo.owner.login in excluded or repo.full_name in excluded:
                logger.info('Excluding: %s', repo.full_name)
                continue

            code_gov_project = Project.from_github3(
                repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Parse config for GitLab repositories
    # NOTE: 'orgs' and 'public_only' are not read for GitLab instances.
    gitlab_instances = config.get('GitLab', [])
    for instance in gitlab_instances:
        url = instance.get('url')
        repos = instance.get('repos', [])
        excluded = instance.get('exclude', [])
        token = instance.get('token', None)
        fetch_languages = instance.get('fetch_languages', False)

        gl_session = gitlab.connect(url, token)

        for repo in gitlab.query_repos(gl_session, repos):
            namespace = repo.namespace['path']
            path_with_namespace = repo.path_with_namespace
            # An 'exclude' entry may name a namespace or a full
            # namespace/project path.
            if namespace in excluded or path_with_namespace in excluded:
                logger.info('Excluding: %s', repo.path_with_namespace)
                continue

            code_gov_project = Project.from_gitlab(
                repo,
                labor_hours=compute_labor_hours,
                fetch_languages=fetch_languages)
            code_gov_metadata['releases'].append(code_gov_project)

    # Parse config for Bitbucket repositories
    # NOTE: 'orgs', 'public_only' and 'token' are not read for Bitbucket
    # instances; the session is opened with username / password only.
    bitbucket_instances = config.get('Bitbucket', [])
    for instance in bitbucket_instances:
        url = instance.get('url')
        username = instance.get('username')
        password = instance.get('password')
        excluded = instance.get('exclude', [])

        bb_session = bitbucket.connect(url, username, password)

        for repo in bitbucket.all_repos(bb_session):
            project = repo['project']['key']
            project_repo = '%s/%s' % (project, repo['slug'])
            # An 'exclude' entry may name a project key or a
            # 'project/slug' pair.
            if project in excluded or project_repo in excluded:
                logger.info('Excluding: %s', project_repo)
                continue

            code_gov_project = Project.from_stashy(
                repo, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Parse config for TFS repositories
    tfs_instances = config.get('TFS', [])
    for instance in tfs_instances:
        url = instance.get('url')
        token = instance.get('token', None)

        projects = tfs.get_projects_metadata(url, token)
        for project in projects:
            code_gov_project = Project.from_tfs(
                project, labor_hours=compute_labor_hours)
            code_gov_metadata['releases'].append(code_gov_project)

    # Handle parsing of DOE CODE records
    # doecode.process() is called even when the config has no 'DOE CODE'
    # section; in that case all three arguments are None.
    doecode_config = config.get('DOE CODE', {})
    doecode_json = doecode_config.get('json', None)
    doecode_url = doecode_config.get('url', None)
    doecode_key = doecode_config.get('api_key', None)

    for record in doecode.process(doecode_json, doecode_url, doecode_key):
        code_gov_project = Project.from_doecode(record)
        code_gov_metadata['releases'].append(code_gov_project)

    return code_gov_metadata