def scrape_outdated(config, connections, reusable_repos, tenant_parser, repo_cache):
    scrape_interval = config["FORCE_SCRAPE_INTERVAL"]
    repo_list = []
    now = datetime.now(timezone.utc)
    threshold = now - timedelta(hours=scrape_interval)
    # Check the repo cache for entries older than the configured scrape interval
    for key, val in repo_cache.items():
        # TODO We should clean up repos containing 'None' providers some time
        if val["scrape_time"] < threshold and val.get("provider") is not None:
            repo_list.append(key)

    if repo_list:
        LOGGER.info(
            "Found repos which weren't scraped for %d hours: %s",
            scrape_interval,
            repo_list,
        )
        scrape_repo_list(
            repo_list, connections, reusable_repos, tenant_parser, repo_cache=repo_cache
        )
    else:
        LOGGER.info(
            "Found no repos which weren't scraped for %d hours", scrape_interval
        )

def scrape(ctx, full, repo):
    LOGGER.info("Hello, Zubbi!")
    config = ctx.obj["config"]

    tenant_sources_repo = config.get("TENANT_SOURCES_REPO")
    tenant_sources_file = config.get("TENANT_SOURCES_FILE")

    # Initialize objects that are needed by all subcommands
    connections = init_connections(ctx.obj["config"])
    reusable_repos = ctx.obj["config"].get("REUSABLE_PROJECTS", [])
    repo_cache = _initialize_repo_cache()
    tenant_parser = _initialize_tenant_parser(
        tenant_sources_repo, tenant_sources_file, connections
    )

    if full:
        scrape_full(connections, reusable_repos, tenant_parser)
    elif repo:
        scrape_full(connections, reusable_repos, tenant_parser, repos=repo)
    else:
        # Listen to ZMQ messages
        socket_addr = config.get("ZMQ_SUB_SOCKET_ADDRESS")
        timeout = config.get("ZMQ_SUB_TIMEOUT")
        socket = create_zmq_socket(socket_addr, timeout)

        while True:
            # Check if a periodic run is necessary
            LOGGER.debug("Checking for outdated repos")
            scrape_outdated(
                config, connections, reusable_repos, tenant_parser, repo_cache
            )

            # Listen to ZMQ messages (if configured) or wait
            if socket is None:
                LOGGER.debug(
                    "No ZMQ socket configured. Just going to wait for %d seconds.",
                    timeout,
                )
                time.sleep(timeout)
            else:
                # Check for incoming messages on ZMQ
                LOGGER.debug("Checking for incoming ZMQ messages")
                try:
                    event, payload = socket.recv_multipart()
                    handle_event(
                        event.decode("utf-8"),
                        json.loads(payload.decode("utf-8")),
                        connections,
                        reusable_repos,
                        tenant_parser,
                        repo_cache,
                    )
                except zmq.error.Again:
                    # If no message was received until the timeout, ZMQ throws
                    # zmq.error.Again: Resource temporarily unavailable
                    LOGGER.debug("Did not receive any ZMQ message")

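# Illustrative only: a minimal sketch of a publisher that could feed the loop
# above, assuming the scraper's SUB socket listens on ZMQ_SUB_SOCKET_ADDRESS
# (the "tcp://*:5556" address below is hypothetical) and that events arrive as
# [<event name>, <JSON payload>] multipart messages, as read by recv_multipart().
#
#   import json
#   import zmq
#
#   zmq_context = zmq.Context.instance()
#   pub_socket = zmq_context.socket(zmq.PUB)
#   pub_socket.bind("tcp://*:5556")  # must match the SUB side's configuration
#   pub_socket.send_multipart(
#       [
#           b"push",
#           json.dumps(
#               {"repository": {"full_name": "orga/repo"}, "ref": "refs/heads/master"}
#           ).encode("utf-8"),
#       ]
#   )
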
def event_push(payload, connections, reusable_repos, tenant_parser, repo_cache):
    repo_name = payload.get("repository", {}).get("full_name")
    LOGGER.info("Handling push event for repo '%s'", repo_name)

    # NOTE (felix): We could use the installation_id later on, to update the
    # installation map only for this installation.
    # installation_id = payload.get('installation', {}).get('id')
    ref = payload.get("ref")

    # TODO (felix) Get the right connection from the configuration based on what?
    # The provider? The github url? Both?
    gh_con = connections["github"]

    repo_info = gh_con.installation_map.get(repo_name)
    if not repo_info:
        # If the repo is not part of our installation map, we might have missed
        # the create/add event. Thus, we could reinit the GitHub connection and
        # try it again.
        # TODO (felix): re-init for this installation only?
        LOGGER.info(
            "Repo '%s' is not part of our installation map, we might have missed an event. "
            "Reinitialising installation map",
            repo_name,
        )
        gh_con._prime_install_map()
        repo_info = gh_con.installation_map.get(repo_name)
        if not repo_info:
            LOGGER.error(
                "Repo '%s' is still not part of our installation map, "
                "something went wrong. Skip scraping.",
                repo_name,
            )
            return

    default_branch = repo_info["default_branch"]

    # TODO validate installation id from payload against installation map?
    # If they do not match, we might have missed an installation event and
    # should update our installation map

    parts = ref.split("/", 2)
    branch = parts[2]

    if branch != default_branch:
        LOGGER.info(
            "Push event contains ref %s, but default branch is %s. "
            "Won't handle event for repo %s.",
            ref,
            default_branch,
            repo_name,
        )
        return

    LOGGER.info("Handling push event for repo %s with ref %s", repo_name, ref)
    scrape_repo_list(
        [repo_name], connections, reusable_repos, tenant_parser, repo_cache=repo_cache
    )

def scrape_repo(repo, tenants, reusable_repos, scrape_time):
    # NOTE: reusable_repos is accepted here to match the call in _scrape_repo_map;
    # how it is consumed (e.g. by the parser) is not shown in this section.
    job_files, role_files = Scraper(repo).scrape()
    jobs, roles = RepoParser(repo, tenants, job_files, role_files, scrape_time).parse()

    LOGGER.info("Updating %d job definitions in Elasticsearch", len(jobs))
    ZuulJob.bulk_save(jobs)

    LOGGER.info("Updating %d role definitions in Elasticsearch", len(roles))
    AnsibleRole.bulk_save(roles)

def configure_logger(verbosity):
    # Import the root logger to apply the configuration to all module loggers
    from zubbi.scraper import LOGGER

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    log_formatter = logging.Formatter(
        "%(asctime)s %(levelname)s [%(name)s] %(message)s", datefmt="%Y-%m-%dT%H:%M:%SZ"
    )
    console_handler.setFormatter(log_formatter)

    level = getattr(logging, verbosity.upper())
    environment_level = getattr(
        logging, os.environ.get("ZUBBI_VERBOSITY", verbosity).upper()
    )
    LOGGER.setLevel(min(environment_level, level))
    LOGGER.addHandler(console_handler)

def event_installation(payload, connections, reusable_repos, tenant_parser, repo_cache):
    action = payload.get("action")
    installation_id = payload.get("installation", {}).get("id")
    repositories = payload.get("repositories", [])

    LOGGER.info(
        "Handling installation event with action '%s' for installation %d",
        action,
        installation_id,
    )

    if action == "created":
        LOGGER.info("Scraping repos for new installation %d", installation_id)
        # Get list of repos from the payload
        repo_names = [r["full_name"] for r in repositories]
        # Scrape them
        scrape_repo_list(
            repo_names,
            connections,
            reusable_repos,
            tenant_parser,
            repo_cache=repo_cache,
        )

    if action == "deleted":
        LOGGER.info("Deleting data for installation %d", installation_id)
        # TODO (felix) Get the right connection from the configuration based on what?
        # The provider? The github url? Both?
        gh_con = connections["github"]

        # Get repos for this installation from our GitHubConnection as they are
        # not listed in the payload.
        # FIXME (fschmidt): When the repo map is updated between triggering and
        # handling the event, the repo_list can no longer be retrieved from
        # the installation (which is already deleted by then).
        # As a fallback, we could search for all repos in ES which start with
        # the organization (<orga>/*)
        repositories = gh_con.get_repos_for_installation(installation_id)

        if not repositories:
            LOGGER.warning(
                "Could not retrieve repo list for installation %d. "
                "Maybe we don't have access any longer. Nothing to delete.",
                installation_id,
            )
            return

        # Delete all data for those repos
        scrape_repo_list(
            repositories,
            connections,
            reusable_repos,
            tenant_parser,
            repo_cache=repo_cache,
            delete_only=True,
        )

def _initialize_repo_cache():
    """Initialize the repository cache used for scraping.

    Retrieves a list of repositories with their provider and last scraping
    time from Elasticsearch. This list can be used to check which repos need
    to be scraped (e.g. after a specific amount of time).
    """
    LOGGER.info("Initializing repository cache")
    # Initialize Repo Cache
    repo_cache = {}
    # Get all repos from Elasticsearch
    for hit in GitRepo.search().query("match_all").scan():
        # TODO (fschmidt): Maybe we can use this list as cache for the whole
        # scraper-webhook part.
        # This way, we could reduce the amount of operations needed for GitHub
        # and Elasticsearch.
        repo_cache[hit.repo_name] = hit.to_dict(skip_empty=False)
    return repo_cache

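# Illustrative cache entry (assumed shape, derived from the GitRepo fields used
# elsewhere in this module: repo_name, provider and scrape_time; the real
# Elasticsearch documents may contain additional fields):
#
#   repo_cache["orga/repo"] = {
#       "repo_name": "orga/repo",
#       "provider": "github",
#       "scrape_time": datetime(2019, 1, 1, tzinfo=timezone.utc),
#   }
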
def event_installation_repositories(
    payload, connections, reusable_repos, tenant_parser, repo_cache
):
    installation_id = payload.get("installation", {}).get("id")
    repos_added = payload.get("repositories_added")
    repos_removed = payload.get("repositories_removed")

    LOGGER.info(
        "Handling installation_repositories event for installation %d", installation_id
    )

    # TODO validate installation id from payload against installation map?
    # If they do not match, we might have missed an installation event and
    # should update our installation map

    # Scrape each added repo
    if repos_added is not None:
        LOGGER.info(
            "Scraping %d new repositories for installation %d",
            len(repos_added),
            installation_id,
        )
        # Get list of repos from the payload
        repo_names = [r["full_name"] for r in repos_added]
        # Scrape them
        scrape_repo_list(
            repo_names,
            connections,
            reusable_repos,
            tenant_parser,
            repo_cache=repo_cache,
        )

    # Just delete the data for these repos
    if repos_removed is not None:
        LOGGER.info(
            "Deleting data from %d repositories for installation %d",
            len(repos_removed),
            installation_id,
        )
        # Get list of repos from the payload
        repo_names = [r["full_name"] for r in repos_removed]
        # Delete all data for those repos
        scrape_repo_list(
            repo_names,
            connections,
            reusable_repos,
            tenant_parser,
            repo_cache=repo_cache,
            delete_only=True,
        )

def delete_outdated(scrape_time, indices, extra_filter=None):
    # Delete all outdated entries in Elasticsearch
    LOGGER.info(
        "Going to delete outdated scraping results which are older than %s",
        datetime.strftime(scrape_time, "%Y-%m-%dT%H:%M:%SZ"),
    )
    for index in indices:
        try:
            deleted_docs = index.outdated_query(scrape_time, extra_filter).delete()
            LOGGER.info(
                "Deleted %d outdated %ss in Elasticsearch",
                deleted_docs.deleted,
                index.__name__,
            )
        except ConflictError:
            LOGGER.info("Deleted 0 outdated %ss in Elasticsearch", index.__name__)

def handle_event(event, payload, connections, reusable_repos, tenant_parser, repo_cache):
    LOGGER.info("Handling event '%s'", event)
    try:
        # TODO (fschmidt): Maybe we should change this file/module to be a class
        # to get rid of this module lookup
        this_module = sys.modules[__name__]
        method = getattr(this_module, "event_{}".format(event))
    except AttributeError:
        LOGGER.warning(
            "Could not find an appropriate method to handle event '%s'", event
        )
        return

    try:
        # TODO (fschmidt): What about 'repository' events?
        # To get updates for public/private?
        # https://developer.github.com/v3/activity/events/types/#repositoryevent
        method(payload, connections, reusable_repos, tenant_parser, repo_cache)
    except Exception:
        # TODO (fschmidt): Does it make sense to catch an Exception here?
        # Could we catch anything more specific?
        LOGGER.exception("Error while handling event '%s'", event)

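# Illustrative dispatch (assumed minimal payload, not a real GitHub delivery):
# the getattr lookup above maps an event name to the matching event_<name>
# function, so a "push" event ends up in event_push().
#
#   handle_event(
#       "push",
#       {"repository": {"full_name": "orga/repo"}, "ref": "refs/heads/master"},
#       connections,
#       reusable_repos,
#       tenant_parser,
#       repo_cache,
#   )
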
def _scrape_repo_map(
    repo_map, tenants, connections, reusable_repos, scrape_time, repo_cache, delete_only
):
    # TODO It would be great if the tenant_list contained only the relevant tenants
    # based on the repository map (or whatever is the correct source). In other
    # words: It should only contain the tenants which are really "updated".
    tenant_list = []
    for tenant_name in tenants:
        # Build the tenant data for Elasticsearch
        uuid = hashlib.sha1(str.encode(tenant_name)).hexdigest()
        tenant = ZuulTenant(meta={"id": uuid})
        tenant.tenant_name = tenant_name
        tenant.scrape_time = scrape_time
        tenant_list.append(tenant)

    # Simplify the list of repos for log output and keyword match in Elasticsearch
    # NOTE (fschmidt): Elasticsearch can only work with lists
    repo_list = list(repo_map.keys())

    LOGGER.info(
        "Using scraping time: %s", datetime.strftime(scrape_time, "%Y-%m-%dT%H:%M:%SZ")
    )

    if not delete_only:
        # TODO (fschmidt): This should only be done once during initialization,
        # when a repo or installation changed or for a push event to the
        # TENANT_SOURCES_REPO.
        # This would also mean that the tenant_configuration needs to be kept
        # in memory, e.g. in the TenantScraper itself (something like the prime
        # and reprime of the installations in the GitHub connection).
        # We also need to identify the repos that were added to / removed from
        # the tenant configuration in the push event.

        # Update tenant sources
        # First, store the tenants in Elasticsearch
        LOGGER.info("Updating %d tenant definitions in Elasticsearch", len(tenant_list))
        ZuulTenant.bulk_save(tenant_list)

        LOGGER.info("Scraping the following repositories: %s", repo_list)
        for repo_name, repo_data in repo_map.items():
            # Extract the data from the repo_data
            tenants = repo_data["tenants"]
            connection_name = repo_data["connection_name"]

            cached_repo = repo_cache.setdefault(repo_name, repo_data)
            # Update the scrape time in cache
            cached_repo["scrape_time"] = scrape_time

            # Initialize the repository for scraping
            con = connections.get(connection_name)
            if not con:
                LOGGER.error(
                    "Checkout of repo '%s' failed. No connection named '%s' found. "
                    "Please check your configuration file.",
                    repo_name,
                    connection_name,
                )
                # NOTE (felix): Remove the repo from the repo_list, so the outdated
                # data (which would be all data in this case) won't be deleted.
                repo_list.remove(repo_name)
                continue

            provider = con.provider
            repo_class = REPOS.get(provider)
            repo = repo_class(repo_name, con)

            # Check if the repo was created successfully, if not, skip it.
            # Possible reasons are e.g.: No access (via GitHub app or Gerrit user),
            # clone/checkout failures for plain git repos or similar.
            if not repo._repo:
                LOGGER.error(
                    "Repo '%s' could not be initialized. Skip scraping.", repo_name
                )
                continue

            # Build the data for the repo itself to be stored in Elasticsearch
            uuid = hashlib.sha1(str.encode(repo_name)).hexdigest()
            es_repo = GitRepo(meta={"id": uuid})
            es_repo.repo_name = repo_name
            es_repo.scrape_time = scrape_time
            es_repo.provider = provider

            # Scrape the repo if it is part of the tenant config
            scrape_repo(repo, tenants, reusable_repos, scrape_time)

            # Store the information for the repository itself, if it was scraped successfully
            LOGGER.info("Updating repo definition for '%s' in Elasticsearch", repo_name)
            GitRepo.bulk_save([es_repo])
    else:
        # Delete the repositories from the repo_cache
        for repo_name in repo_list:
            repo_cache.pop(repo_name, None)

    # In both cases we want to delete outdated data.
    # In case of delete_only, this will be everything!
    # NOTE (felix): In case of a config error, the repo is removed from this list
    LOGGER.info("Deleting outdated data for the following repositories: %s", repo_list)
    delete_outdated(
        scrape_time, [AnsibleRole, ZuulJob], extra_filter=Q("terms", repo=repo_list)
    )

    LOGGER.info("Deleting the following repositories (only if outdated): %s", repo_list)
    # NOTE (fschmidt): Usually, this should not delete anything we just scraped.
    delete_outdated(
        scrape_time,
        [GitRepo],
        extra_filter=Q({"terms": {"repo_name.keyword": repo_list}}),
    )

def scrape_repo_list(
    repo_list,
    connections,
    reusable_repos,
    tenant_parser,
    repo_cache=None,
    delete_only=False,
):
    scrape_time = datetime.now(timezone.utc)

    # Simplify the usage of a non-existing repo cache
    if repo_cache is None:
        repo_cache = {}

    # Keep track of repositories that are no longer part of our tenant config
    # and thus should be deleted from Elasticsearch. Otherwise, Zubbi will loop
    # over them each time once they become outdated (older than 24 hours).
    invalid_repo_map = {}

    # Update tenant sources
    # TODO (fschmidt): This should not be necessary for each scraping. But, as we
    # don't have a mechanism yet to filter for the necessary events, we keep it
    # like this.
    tenant_parser.parse()
    repo_map = tenant_parser.repo_map
    tenant_list = tenant_parser.tenants

    filtered_repo_map = {}
    # Get the relevant repositories from the tenant parser's repo map.
    # Repos which are not part of it won't be scraped.
    for repo_name in repo_list:
        # Get the tenants from the repo map. If we get no tenants, we assume
        # that the repo is not part of the tenant config.
        repo_data = repo_map.get(repo_name, None)
        if repo_data is None:
            LOGGER.warning(
                "Repo '%s' is not part of our tenant sources. Skip scraping.", repo_name
            )
            invalid_repo_map[repo_name] = None
        else:
            # TODO Simplify this with dict/list comprehension
            filtered_repo_map[repo_name] = repo_data

    if invalid_repo_map:
        LOGGER.info(
            "The following repositories are no longer part of our tenant "
            "sources and will be deleted: %s",
            invalid_repo_map.keys(),
        )
        _scrape_repo_map(
            # TODO (felix): It would be simpler if we could provide a list here
            # as - in case the delete_only is set - we will only extract the
            # keys from the map and delete all data for those repositories.
            # Maybe we could check sometime why the split between
            # _scrape_repo_map and _scrape_repo_list was necessary. IIRC I did
            # that mainly to simplify the call via command line (when repos are
            # specified as arguments).
            invalid_repo_map,
            tenant_list,
            connections,
            reusable_repos,
            scrape_time,
            repo_cache,
            delete_only=True,
        )

    if not filtered_repo_map:
        LOGGER.info("Repo list is empty, nothing to scrape.")
        return

    return _scrape_repo_map(
        filtered_repo_map,
        tenant_list,
        connections,
        reusable_repos,
        scrape_time,
        repo_cache,
        delete_only,
    )

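# Illustrative usage (assumed wiring; in practice these objects are created in
# scrape() above and the repo names come from webhook payloads or the repo cache;
# "orga/project-config" is a hypothetical repository name):
#
#   scrape_repo_list(
#       ["orga/project-config"],
#       connections,
#       reusable_repos,
#       tenant_parser,
#       repo_cache=repo_cache,
#   )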