def _parse_username_email_from_cgit(th_tag, commit, namespace, repo): """ Parse the username and email address from a cgit "th" element of author. :param th_tag: a BeautifulSoup4 element object :param str commit: the commit being processed :param str namespace: the namespace of the repo being processed :param str repo: the repo being processed :return: a tuple of (username, email) :rtype: tuple """ person_text = th_tag.next_sibling.string # Set some defaults in the event the cgit entry is malformed username = None email = None if person_text: match = re.match( r'^.+<(?P<email>(?P<username>.+)@(?P<domain>.+))>$', person_text) if match: match_dict = match.groupdict() if match_dict['domain'].lower() == 'redhat.com': username = match_dict['username'].lower() else: # If the email isn't a Red Hat email address, then use the whole email address # as the username. This should only happen with erroneous git configurations. username = match_dict['email'].lower() email = match_dict['email'].lower() if username is None or email is None: log.error('Couldn\'t find the {0} for the commit "{1}" on repo "{2}/{3}"'.format( th_tag.string, commit, namespace, repo)) return username, email
def query_api_and_update_neo4j(self):
    """
    Scrape the Freshmaker API and upload the data to Neo4j.

    Pages through the Freshmaker events endpoint starting at ``self.freshmaker_url``,
    creating FreshmakerEvent, Advisory, FreshmakerBuild, and ContainerKojiBuild nodes
    and the relationships between them.
    """
    # Initialize session and url
    session = retry_session()
    fm_url = self.freshmaker_url
    while True:
        log.debug('Querying {0}'.format(fm_url))
        try:
            rv_json = session.get(fm_url, timeout=60).json()
        except ConnectionError:
            # TODO: Remove this once FACTORY-3955 is resolved
            log.error(
                'The connection to Freshmaker at %s failed. Skipping the rest of the scraper.',
                fm_url,
            )
            break

        for fm_event in rv_json['items']:
            try:
                int(fm_event['search_key'])
            except ValueError:
                # Skip Freshmaker Events that don't have the search_key as the Advisory ID
                continue

            log.debug('Creating FreshmakerEvent {0}'.format(fm_event['id']))
            event_params = dict(
                id_=fm_event['id'],
                event_type_id=fm_event['event_type_id'],
                message_id=fm_event['message_id'],
                state=fm_event['state'],
                state_name=fm_event['state_name'],
                state_reason=fm_event['state_reason'],
                url=fm_event['url']
            )
            if fm_event.get('time_created'):
                event_params['time_created'] = timestamp_to_datetime(fm_event['time_created'])
            if fm_event.get('time_done'):
                # Fixed: this previously read fm_event['time_created'], so time_done was
                # always stored as the event's creation time instead of its completion time.
                event_params['time_done'] = timestamp_to_datetime(fm_event['time_done'])
            event = FreshmakerEvent.create_or_update(event_params)[0]

            log.debug('Creating Advisory {0}'.format(fm_event['search_key']))
            advisory = Advisory.get_or_create(dict(
                id_=fm_event['search_key']
            ))[0]
            event.conditional_connect(event.triggered_by_advisory, advisory)

            for build_dict in fm_event['builds']:
                # To handle a faulty container build in Freshmaker
                if build_dict['build_id'] and int(build_dict['build_id']) < 0:
                    continue
                log.debug('Creating FreshmakerBuild {0}'.format(build_dict['build_id']))
                fb_params = dict(
                    id_=build_dict['id'],
                    dep_on=build_dict['dep_on'],
                    name=build_dict['name'],
                    original_nvr=build_dict['original_nvr'],
                    rebuilt_nvr=build_dict['rebuilt_nvr'],
                    state=build_dict['state'],
                    state_name=build_dict['state_name'],
                    state_reason=build_dict['state_reason'],
                    time_submitted=timestamp_to_datetime(build_dict['time_submitted']),
                    type_=build_dict['type'],
                    type_name=build_dict['type_name'],
                    url=build_dict['url']
                )
                if build_dict['time_completed']:
                    fb_params['time_completed'] = timestamp_to_datetime(
                        build_dict['time_completed'])
                if build_dict['build_id']:
                    fb_params['build_id'] = build_dict['build_id']
                fb = FreshmakerBuild.create_or_update(fb_params)[0]
                event.requested_builds.connect(fb)

                # The build ID obtained from Freshmaker API is actually a Koji task ID
                task_result = None
                if build_dict['build_id']:
                    task_result = self.get_koji_task_result(build_dict['build_id'])
                if not task_result:
                    continue

                # Extract the build ID from a task result
                xml_root = ET.fromstring(task_result)
                # TODO: Change this if a task can trigger multiple builds
                try:
                    build_id = xml_root.find(".//*[name='koji_builds'].//string").text
                except AttributeError:
                    # The task result XML didn't contain a koji_builds entry
                    build_id = None
                if not build_id:
                    continue

                log.debug('Creating ContainerKojiBuild {0}'.format(build_id))
                build_params = {
                    'id_': build_id,
                    'original_nvr': build_dict['original_nvr']
                }
                try:
                    build = ContainerKojiBuild.create_or_update(build_params)[0]
                except neomodel.exceptions.ConstraintValidationFailed:
                    # This must have errantly been created as a KojiBuild instead of a
                    # ContainerKojiBuild, so let's fix that.
                    build = KojiBuild.nodes.get_or_none(id_=build_id)
                    if not build:
                        # If there was a constraint validation failure and the build isn't just
                        # the wrong label, then we can't recover.
                        raise
                    build.add_label(ContainerKojiBuild.__label__)
                    build = ContainerKojiBuild.create_or_update(build_params)[0]
                event.successful_koji_builds.connect(build)

        # Follow the API's pagination until there is no "next" page
        if rv_json['meta'].get('next'):
            fm_url = rv_json['meta']['next']
        else:
            break
def _get_repo_info(repo_and_commit):
    """
    Query cgit for the namespace, username and email of the author.

    :param tuple repo_and_commit: contains the repo and commit to query for
    :return: a dictionary with the key commit, plus the keys namespace, author_username,
        and author_email when the commit was found in cgit
    :rtype: dict
    """
    repo, commit = repo_and_commit
    log.debug(
        'Attempting to find the cgit URL for the commit "{0}" in repo "{1}"'
        .format(commit, repo))
    session = retry_session()
    rv = {'commit': commit}
    cgit_result = None
    # The tuple of namespaces to try when determining which namespace this git module belongs
    # to since this information isn't stored in GitBZ yet
    namespaces = ('rpms', 'containers', 'modules', 'tests')
    cgit_url = getenv('ESTUARY_CGIT_URL', 'http://pkgs.devel.redhat.com/cgit/')
    for namespace in namespaces:
        url = '{0}{1}/{2}/commit/?id={3}&dt=2'.format(
            cgit_url, namespace, repo, commit)
        log.debug('Trying the URL "{0}"'.format(url))
        try:
            cgit_result = session.get(url, timeout=15)
        except ConnectionError:
            log.error('The connection to "{0}" failed'.format(url))
            continue
        if cgit_result.status_code == 200:
            # If the repo is empty, cgit oddly returns a 200 status code, so let's correct the
            # status code so that the remainder of the code knows it's a bad request
            if 'Repository seems to be empty' in cgit_result.text:
                cgit_result.status_code = 404
            else:
                # If the repo is populated and a 200 status code is returned, then we can
                # assume we found the correct repo
                break
    if not cgit_result or cgit_result.status_code != 200:
        log.error(
            'Couldn\'t find the commit "{0}" for the repo "{1}" in the namespaces: {2}'
            .format(commit, repo, ', '.join(namespaces)))
        return rv

    log.debug(
        'Found the cgit URL "{0}" for the commit "{1}" in repo "{2}"'.
        format(url, commit, repo))
    rv['namespace'] = namespace

    # Start parsing the cgit content
    soup = BeautifulSoup(cgit_result.text, 'html.parser')
    # Workaround for BS4 in EL7 since `soup.find('th', string=person)` doesn't work in
    # that environment
    th_tags = soup.find_all('th')
    data_found = {'author': False}
    for th_tag in th_tags:
        # Fixed: this was `th_tag.string in ('author')` — without the trailing comma that
        # is a substring test against the string 'author' (so 'auth' matched too), and it
        # raised TypeError when th_tag.string was None (th tags with nested markup).
        if th_tag.string in ('author',):
            data_found[th_tag.string] = True
            username_key = '{0}_username'.format(th_tag.string)
            email_key = '{0}_email'.format(th_tag.string)
            rv[username_key], rv[
                email_key] = DistGitScraper._parse_username_email_from_cgit(
                    th_tag, commit, namespace, repo)
        # If all the "th" elements we're interested in were parsed, then break from the loop
        # early
        if all(data_found.values()):
            break
    soup.decompose()
    return rv
def _get_exception_users():
    """
    Get the list of users that are explicitly whitelisted.

    If the LDAP search fails, an empty set is returned.

    :return: a set of usernames
    :rtype: set
    :raise InternalServerError: if a required configuration value is not set or the
        connection to the LDAP server fails
    """
    # Import this here so it's not required for deployments with auth disabled
    import ldap3

    base_error = '%s is not set in the server configuration'
    ldap_uri = current_app.config.get('LDAP_URI')
    if not ldap_uri:
        log.error(base_error, 'LDAP_URI')
        raise InternalServerError()
    ldap_group_dn = current_app.config.get('LDAP_EXCEPTIONS_GROUP_DN')
    if not ldap_group_dn:
        log.error(base_error, 'LDAP_EXCEPTIONS_GROUP_DN')
        raise InternalServerError()

    if ldap_uri.startswith('ldaps://'):
        # Require certificate validation against the configured CA for ldaps connections
        ca = current_app.config['LDAP_CA_CERTIFICATE']
        log.debug('Connecting to %s using SSL and the CA %s', ldap_uri, ca)
        tls = ldap3.Tls(ca_certs_file=ca, validate=ssl.CERT_REQUIRED)
        server = ldap3.Server(ldap_uri, use_ssl=True, tls=tls)
    else:
        log.debug('Connecting to %s without SSL', ldap_uri)
        server = ldap3.Server(ldap_uri)

    connection = ldap3.Connection(server)
    try:
        connection.open()
    except ldap3.core.exceptions.LDAPSocketOpenError:
        log.exception('The connection to %s failed', ldap_uri)
        raise InternalServerError()

    membership_attr = current_app.config['LDAP_GROUP_MEMBERSHIP_ATTRIBUTE']
    log.debug('Searching for the attribute %s on %s', ldap_group_dn, membership_attr)
    # Set the scope to base so only the group from LDAP_GROUP_DN is returned
    success = connection.search(ldap_group_dn, '(cn=*)', search_scope=ldap3.BASE,
                                attributes=[membership_attr])
    if not success:
        log.error(
            'The user exceptions list could not be determined because the search for the attribute '
            '%s on %s failed with %r',
            membership_attr, ldap_group_dn, connection.response,
        )
        return set()

    # Each member value is a DN such as "uid=jdoe,ou=users,..."; keep only the value of the
    # first RDN (the username). Set comprehension replaces the old set([list-comp]) idiom.
    return {
        dn.split('=')[1].split(',')[0]
        for dn in connection.response[0]['attributes'][membership_attr]
    }