def event_push(payload, connections, tenant_parser, repo_cache):
    repo_name = payload.get("repository", {}).get("full_name")
    LOGGER.info("Handling push event for repo '%s'", repo_name)

    # NOTE (felix): We could use the installation_id later on, to update the
    # installation map only for this installation.
    # installation_id = payload.get('installation', {}).get('id')

    ref = payload.get("ref")

    # TODO (felix): Get the right connection from the configuration based on what?
    # The provider? The GitHub URL? Both?
    gh_con = connections["github"]

    repo_info = gh_con.installation_map.get(repo_name)
    if not repo_info:
        # If the repo is not part of our installation map, we might have missed
        # the create/add event. Thus, we reinitialise the installation map of the
        # GitHub connection and try again.
        # TODO (felix): Re-init for this installation only?
        LOGGER.info(
            "Repo '%s' is not part of our installation map, we might have missed an event. "
            "Reinitialising installation map",
            repo_name,
        )
        gh_con._prime_install_map()
        repo_info = gh_con.installation_map.get(repo_name)
        if not repo_info:
            LOGGER.error(
                "Repo '%s' is still not part of our installation map, something "
                "went wrong. Skip scraping.",
                repo_name,
            )
            return

    default_branch = repo_info["default_branch"]

    # TODO (felix): Validate the installation id from the payload against the
    # installation map? If they do not match, we might have missed an
    # installation event and should update our installation map.

    # A push ref looks like "refs/heads/<branch>"; only pushes to the default
    # branch are scraped.
    parts = ref.split("/", 2)
    branch = parts[2]

    if branch != default_branch:
        LOGGER.info(
            "Push event contains ref %s, but default branch is %s. "
            "Won't handle event for repo %s.",
            ref,
            default_branch,
            repo_name,
        )
        return

    LOGGER.info("Handling push event for repo %s with ref %s", repo_name, ref)
    scrape_repo_list([repo_name], connections, tenant_parser, repo_cache=repo_cache)
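

# NOTE: The following dispatcher is an illustrative sketch only and not part of the
# original module. It shows one way a webhook endpoint could route parsed GitHub
# events to their handlers; EVENT_HANDLERS and dispatch_event() are assumed names,
# and only "push" (handled by event_push above) is wired up here.
EVENT_HANDLERS = {
    "push": event_push,
}


def dispatch_event(event_name, payload, connections, tenant_parser, repo_cache):
    # Look up the handler for the given GitHub event name (e.g. taken from the
    # X-GitHub-Event header) and ignore events we don't handle.
    handler = EVENT_HANDLERS.get(event_name)
    if not handler:
        LOGGER.debug("No handler registered for event '%s'. Ignoring.", event_name)
        return
    handler(payload, connections, tenant_parser, repo_cache)
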

def _scrape_repo_map(
    repo_map, tenants, connections, reusable_repos, scrape_time, repo_cache, delete_only
):
    # TODO (felix): It would be great if the tenant_list contained only the relevant
    # tenants based on the repository map (or whatever is the correct source).
    # In other words: it should only contain the tenants which are really "updated".
    tenant_list = []
    for tenant_name in tenants:
        # Build the tenant data for Elasticsearch
        uuid = hashlib.sha1(str.encode(tenant_name)).hexdigest()
        tenant = ZuulTenant(meta={"id": uuid})
        tenant.tenant_name = tenant_name
        tenant.scrape_time = scrape_time
        tenant_list.append(tenant)

    # Simplify the list of repos for log output and keyword matching in Elasticsearch
    # NOTE (fschmidt): Elasticsearch can only work with lists
    repo_list = list(repo_map.keys())

    LOGGER.info(
        "Using scraping time: %s", datetime.strftime(scrape_time, "%Y-%m-%dT%H:%M:%SZ")
    )

    if not delete_only:
        # TODO (fschmidt): This should only be done once during initialization,
        # when a repo or installation changed or for a push event to the
        # TENANT_SOURCES_REPO.
        # This would also mean that the tenant configuration needs to be kept
        # in memory, e.g. in the TenantScraper itself (something like the prime
        # and reprime of the installations in the GitHub connection).
        # We also need to identify the repos that were added to / removed from
        # the tenant configuration in the push event.

        # Update tenant sources
        # First, store the tenants in Elasticsearch
        LOGGER.info(
            "Updating %d tenant definitions in Elasticsearch", len(tenant_list)
        )
        ZuulTenant.bulk_save(tenant_list)

        LOGGER.info("Scraping the following repositories: %s", repo_list)
        for repo_name, repo_data in repo_map.items():
            # Extract the data from the repo_data
            tenants = repo_data["tenants"]
            connection_name = repo_data["connection_name"]

            cached_repo = repo_cache.setdefault(repo_name, repo_data)
            # Update the scrape time in the cache
            cached_repo["scrape_time"] = scrape_time

            # Initialize the repository for scraping
            con = connections.get(connection_name)
            if not con:
                LOGGER.error(
                    "Checkout of repo '%s' failed. No connection named '%s' found. "
                    "Please check your configuration file.",
                    repo_name,
                    connection_name,
                )
                # NOTE (felix): Remove the repo from the repo_list, so the outdated
                # data (which would be all data in this case) won't be deleted.
                repo_list.remove(repo_name)
                continue

            provider = con.provider
            repo_class = REPOS.get(provider)
            repo = repo_class(repo_name, con)

            # Check if the repo was created successfully; if not, skip it.
            # Possible reasons are e.g.: no access (via GitHub app or Gerrit user),
            # clone/checkout failures for plain git repos or similar.
            if not repo._repo:
                LOGGER.error(
                    "Repo '%s' could not be initialized. Skip scraping.", repo_name
                )
                continue

            # Build the data for the repo itself to be stored in Elasticsearch
            uuid = hashlib.sha1(str.encode(repo_name)).hexdigest()
            es_repo = GitRepo(meta={"id": uuid})
            es_repo.repo_name = repo_name
            es_repo.scrape_time = scrape_time
            es_repo.provider = provider

            # Scrape the repo if it is part of the tenant config
            scrape_repo(repo, tenants, reusable_repos, scrape_time)

            # Store the information for the repository itself, if it was scraped
            # successfully
            LOGGER.info("Updating repo definition for '%s' in Elasticsearch", repo_name)
            GitRepo.bulk_save([es_repo])
    else:
        # Delete the repositories from the repo_cache
        for repo_name in repo_list:
            repo_cache.pop(repo_name, None)

    # In both cases we want to delete outdated data.
    # In case of delete_only, this will be everything!
    # NOTE (felix): In case of a config error, the repo is removed from this list
    LOGGER.info("Deleting outdated data for the following repositories: %s", repo_list)
    delete_outdated(
        scrape_time, [AnsibleRole, ZuulJob], extra_filter=Q("terms", repo=repo_list)
    )

    LOGGER.info("Deleting the following repositories (only if outdated): %s", repo_list)
    # NOTE (fschmidt): Usually, this should not delete anything we just scraped.
    delete_outdated(
        scrape_time,
        [GitRepo],
        extra_filter=Q({"terms": {"repo_name.keyword": repo_list}}),
    )
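

# NOTE: Hedged sketch of the "delete outdated" pattern used above. The project's real
# delete_outdated() is defined elsewhere; this illustrative helper only assumes the
# idea: issue a delete-by-query for documents whose scrape_time predates the current
# run, optionally narrowed by an extra filter (e.g. the repo_list above).
def _delete_outdated_sketch(scrape_time, doc_classes, extra_filter=None):
    for doc_class in doc_classes:
        # Match everything that was scraped before the current run
        search = doc_class.search().filter("range", scrape_time={"lt": scrape_time})
        if extra_filter is not None:
            search = search.filter(extra_filter)
        # delete() runs a delete-by-query for all matching documents
        search.delete()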