Example 1
def event_push(payload, connections, tenant_parser, repo_cache):
    repo_name = payload.get("repository", {}).get("full_name")
    LOGGER.info("Handling push event for repo '%s'", repo_name)
    # NOTE (felix): We could use the installation_id later on, to update the
    # installation map only for this installation.
    # installation_id = payload.get('installation', {}).get('id')
    ref = payload.get("ref")

    # TODO (felix) Get the right connection from the configuration based on what?
    # The provider? The github url? Both?
    gh_con = connections["github"]

    repo_info = gh_con.installation_map.get(repo_name)
    if not repo_info:
        # If the repo is not part of our installation map, we might have missed the create/add event.
        # Thus, we could reinit the GitHub connection and try it again
        # TODO (felix): re-init for this installation only?
        LOGGER.info(
            "Repo '%s' is not part of our installation map, we might have missed an event. "
            "Reinitialising installation map",
            repo_name,
        )
        gh_con._prime_install_map()
        repo_info = gh_con.installation_map.get(repo_name)
        if not repo_info:
            LOGGER.error(
                "Repo '%s' still not part of our installation map, something went wrong. Skip scraping.",
                repo_name,
            )
            return

    default_branch = repo_info["default_branch"]

    # TODO validate installation id from payload against installation map?
    # If they do not match, we might have missed an installation event and
    # should update our installation map

    # A push ref has the form "refs/heads/<branch>" (or "refs/tags/<tag>");
    # everything after the second slash is the branch/tag name.
    parts = ref.split("/", 2)
    branch = parts[2]
    if branch != default_branch:
        LOGGER.info(
            "Push event contains ref %s, but default branch is %s. "
            "Won't handle event for repo %s.",
            ref,
            default_branch,
            repo_name,
        )
        return

    LOGGER.info("Handling push event for repo %s with ref %s", repo_name, ref)

    scrape_repo_list([repo_name],
                     connections,
                     tenant_parser,
                     repo_cache=repo_cache)
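
A minimal sketch of how event_push might be invoked for an incoming GitHub push webhook. The payload fields follow GitHub's push event schema; github_connection, tenant_parser and repo_cache are placeholder names standing in for whatever the scraper's setup code actually builds.

# Hypothetical driver code; github_connection, tenant_parser and repo_cache
# are assumed to come from the scraper's own initialization.
payload = {
    "repository": {"full_name": "example-org/example-repo"},
    "ref": "refs/heads/master",
    "installation": {"id": 12345},
}

connections = {"github": github_connection}
event_push(payload, connections, tenant_parser, repo_cache={})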
Example 2
def _scrape_repo_map(
    repo_map, tenants, connections, reusable_repos, scrape_time, repo_cache, delete_only
):
    # TODO It would be great if the tenant_list contains only the relevant tenants based
    # on the repository map (or whatever is the correct source). In other words:
    # It should only contain the tenants which are really "updated".

    tenant_list = []
    for tenant_name in tenants:
        # Build the tenant data for Elasticsearch
        uuid = hashlib.sha1(str.encode(tenant_name)).hexdigest()
        tenant = ZuulTenant(meta={"id": uuid})
        tenant.tenant_name = tenant_name
        tenant.scrape_time = scrape_time
        tenant_list.append(tenant)

    # Simplify the list of repos for log output and keyword match in Elasticsearch
    # NOTE (fschmidt): Elasticsearch can only work with lists
    repo_list = list(repo_map.keys())

    LOGGER.info(
        "Using scraping time: %s", datetime.strftime(scrape_time, "%Y-%m-%dT%H:%M:%SZ")
    )

    if not delete_only:
        # TODO (fschmidt): This should only be done once during initialization,
        # when a repo or installation changed or for a push event to the
        # TENANT_SOURCES_REPO.
        # This would also mean, that the tenant_configuration needs to be kept
        # in memory, e.g. in the TenantScraper itself (something like the prime
        # and reprime of the installations in the GitHub connection)
        # We also need to identify the repos that were added to / removed from
        # the tenant configuration in the push event.

        # Update tenant sources

        # First, store the tenants in Elasticsearch
        LOGGER.info("Updating %d tenant definitions in Elasticsearch", len(tenant_list))
        ZuulTenant.bulk_save(tenant_list)

        LOGGER.info("Scraping the following repositories: %s", repo_list)

        for repo_name, repo_data in repo_map.items():
            # Extract the data from the repo_data
            # (note: this shadows the outer `tenants` argument for this iteration)
            tenants = repo_data["tenants"]
            connection_name = repo_data["connection_name"]

            cached_repo = repo_cache.setdefault(repo_name, repo_data)

            # Update the scrape time in cache
            cached_repo["scrape_time"] = scrape_time

            # Initialize the repository for scraping
            con = connections.get(connection_name)
            if not con:
                LOGGER.error(
                    "Checkout of repo '%s' failed. No connection named '%s' found. "
                    "Please check your configuration file.",
                    repo_name,
                    connection_name,
                )
                # NOTE (felix): Remove the repo from the repo_list, so the outdated
                # data (which would be all data in this case) won't be deleted.
                repo_list.remove(repo_name)
                continue
            provider = con.provider
            repo_class = REPOS.get(provider)
            repo = repo_class(repo_name, con)

            # Check if the repo was created successfully, if not, skip it.
            # Possible reasons are e.g: No access (via GitHub app or Gerrit user),
            # Clone/checkout failures for plain git repos or similar.
            if not repo._repo:
                LOGGER.error(
                    "Repo '%s' could not be initialized. Skip scraping.", repo_name
                )
                continue

            # Build the data for the repo itself to be stored in Elasticsearch
            uuid = hashlib.sha1(str.encode(repo_name)).hexdigest()
            es_repo = GitRepo(meta={"id": uuid})
            es_repo.repo_name = repo_name
            es_repo.scrape_time = scrape_time
            es_repo.provider = provider

            # Scrape the repo if it is part of the tenant config
            scrape_repo(repo, tenants, reusable_repos, scrape_time)

            # Store the information for the repository itself, if it was scraped successfully
            LOGGER.info("Updating repo definition for '%s' in Elasticsearch", repo_name)
            GitRepo.bulk_save([es_repo])
    else:
        # Delete the repositories from the repo_cache
        for repo_name in repo_list:
            repo_cache.pop(repo_name, None)

    # In both cases we want to delete outdated data.
    # In case of delete_only, this will be everything!
    # NOTE (felix): In case of a config error, the repo is removed from this list
    LOGGER.info("Deleting outdated data for the following repositories: %s", repo_list)
    delete_outdated(
        scrape_time, [AnsibleRole, ZuulJob], extra_filter=Q("terms", repo=repo_list)
    )

    LOGGER.info("Deleting the following repositories (only if outdated): %s", repo_list)
    # NOTE (fschmidt): Usually, this should not delete anything we just scraped.
    delete_outdated(
        scrape_time,
        [GitRepo],
        extra_filter=Q({"terms": {"repo_name.keyword": repo_list}}),
    )
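
Judging from how _scrape_repo_map reads its arguments, each repo_map entry needs at least a "tenants" list and a "connection_name" that matches a key in the connections dict. Below is a minimal sketch of a call; the repository name, tenant name and github_connection are illustrative assumptions, and reusable_repos is simply passed through to scrape_repo.

from datetime import datetime

# Illustrative inputs, inferred from how the function accesses them.
repo_map = {
    "example-org/example-repo": {
        "tenants": ["example-tenant"],
        "connection_name": "github",
    },
}

_scrape_repo_map(
    repo_map,
    tenants=["example-tenant"],
    connections={"github": github_connection},  # assumed, initialized elsewhere
    reusable_repos={},                          # shape not shown in this excerpt
    scrape_time=datetime.utcnow(),
    repo_cache={},
    delete_only=False,
)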