Ejemplo n.º 1
0
    def commit_handler(self, msg):
        """
        Process a dist-git commit message and store the resulting data in Neo4j.

        The repo, author, and commit nodes are created or updated from the message, the Bugzilla
        bugs parsed out of the commit message are linked to the commit, and finally the commit is
        linked to its author and to its repo.

        :param dict msg: a message to be processed
        """
        headers = msg['headers']
        repo_params = {'namespace': headers['namespace'], 'name': headers['repo']}
        repo = DistGitRepo.get_or_create(repo_params)[0]

        # Red Hat email addresses are reduced to the part before the domain to form the username;
        # any other address is used unchanged as the username
        redhat_domain = '@redhat.com'
        email = headers['email'].lower()
        username = email.partition(redhat_domain)[0] if email.endswith(redhat_domain) else email
        author = User.create_or_update({'username': username, 'email': email})[0]

        payload = msg['body']['msg']
        commit_message = payload['message']
        commit_params = {
            'hash_': headers['rev'],
            'log_message': commit_message,
            'author_date': timestamp_to_datetime(payload['author_date']),
            'commit_date': timestamp_to_datetime(payload['commit_date']),
        }
        commit = DistGitCommit.create_or_update(commit_params)[0]

        # Each key of the parsed mapping is paired with the commit relationship it feeds
        relationship_names = (
            ('resolves', 'resolved_bugs'),
            ('related', 'related_bugs'),
            ('reverted', 'reverted_bugs'),
        )
        bug_rel_mapping = self.parse_bugzilla_bugs(commit_message)
        for mapping_key, relationship_name in relationship_names:
            for bug_id in bug_rel_mapping[mapping_key]:
                bug = BugzillaBug.get_or_create({'id_': bug_id})[0]
                getattr(commit, relationship_name).connect(bug)

        commit.conditional_connect(commit.author, author)
        repo.commits.connect(commit)
Ejemplo n.º 2
0
    def _update_neo4j(neo4j_url, total_results, counter_and_results):
        """
        Update Neo4j results via mapping with multiprocessing.

        The results are handled in batches of 200: at the start of each batch, cgit is queried in
        parallel (once per unique module/commit pair in that batch) and the answers are then used
        to create the nodes and relationships for every row of the batch. Rows whose repo info
        has no namespace are skipped.

        :param str neo4j_url: database url for Neo4j
        :param int total_results: the total number of results that will be processed. This is used
        for a logging statement about progress.
        :param tuple counter_and_results: a tuple where the first index is the current counter and
        the second index is a list of dictionaries representing results from Teiid
        """
        # Bound before the try block so the cleanup below can tell whether the pool was created
        pool = None
        try:
            previous_total = counter_and_results[0]
            results = counter_and_results[1]
            # Since _update_neo4j will be run in a separate process, we must configure the database
            # URL every time the method is run.
            neomodel_config.DATABASE_URL = neo4j_url
            # Create a thread pool with 4 threads to speed up queries to cgit
            pool = ThreadPool(4)
            for index, result in enumerate(results):
                if index % 200 == 0:
                    until = min(index + 200, len(results))
                    # Because of the joins in the SQL query, we end up with several rows with the
                    # same commit hash and we only want to query cgit once per commit
                    unique_commits = {(c['module'], c['sha']) for c in results[index:until]}
                    log.debug(
                        'Getting the author email addresses from cgit in parallel '
                        'for results {0} to {1}'.format(index, until))
                    repos_info = {
                        r['commit']: r
                        for r in pool.map(DistGitScraper._get_repo_info, unique_commits)
                    }
                    # This is no longer needed so it can be cleared to save RAM
                    del unique_commits
                log.info('Processing commit entry {0}/{1}'.format(
                    previous_total + index + 1, total_results))
                repo_info = repos_info[result['sha']]
                if not repo_info.get('namespace'):
                    log.info(
                        'Skipping nodes creation with commit ID {0}'.format(
                            result['commit_id']))
                    continue

                log.debug(
                    'Creating nodes associated with commit ID {0}'.format(
                        result['commit_id']))
                repo = DistGitRepo.get_or_create({
                    'namespace': repo_info['namespace'],
                    'name': result['module']
                })[0]
                commit = DistGitCommit.create_or_update({
                    'author_date': result['author_date'],
                    'commit_date': result['commit_date'],
                    'hash_': result['sha'],
                    # In case we get unicode characters in Python 2
                    'log_message': bytes(result['log_message'], 'utf-8').decode()
                })[0]
                bug = BugzillaBug.get_or_create({'id_': result['bugzilla_id']})[0]

                log.debug(
                    'Creating the user nodes associated with commit ID {0}'.
                    format(result['commit_id']))
                author = User.create_or_update({
                    'username': repo_info['author_username'],
                    'email': repo_info['author_email']
                })[0]

                log.debug(
                    'Creating the relationships associated with commit ID {0}'.
                    format(result['commit_id']))
                repo.commits.connect(commit)

                commit.conditional_connect(commit.author, author)

                if result['bugzilla_type'] == 'related':
                    commit.related_bugs.connect(bug)
                elif result['bugzilla_type'] == 'resolves':
                    commit.resolved_bugs.connect(bug)
                elif result['bugzilla_type'] == 'reverted':
                    commit.reverted_bugs.connect(bug)
                # This is no longer needed so it can be cleared to save RAM
                del repo_info
        finally:
            try:
                # The thread pool is local to this call, so shut its workers down instead of
                # leaving them behind every time this function runs
                if pool is not None:
                    pool.close()
                    pool.join()
            finally:
                # Close the DB connection after this is done processing
                db.driver.close()
Ejemplo n.º 3
0
    def update_neo4j(self, results):
        """
        Update Neo4j with the dist-git commit and push information from Teiid.

        The results are handled in batches of 200: at the start of each batch, cgit is queried in
        parallel (once per unique module/commit pair in that batch) and the JSON answers are then
        used to create the nodes and relationships for every row of the batch. Rows whose repo
        info has no namespace are skipped. The worker pool is always shut down before returning,
        including when an error is raised.

        :param list results: a list of dictionaries
        """
        pool = Pool(processes=8)
        try:
            for index, result in enumerate(results):
                if index % 200 == 0:
                    until = min(index + 200, len(results))
                    # Because of the joins in the SQL query, we end up with several rows with the
                    # same commit hash and we only want to query cgit once per commit
                    unique_commits = {(c['module'], c['sha']) for c in results[index:until]}
                    log.debug(
                        'Getting the author and committer email addresses from cgit in parallel '
                        'for results {0} to {1}'.format(index, until))
                    repos_info = {}
                    for _r in pool.map(DistGitScraper._get_repo_info, unique_commits):
                        r = json.loads(_r)
                        repos_info[r['commit']] = r
                    # This is no longer needed so it can be cleared to save RAM
                    del unique_commits
                    # A lot of RAM was allocated or used up, so let's call gc.collect() to ensure
                    # it is removed
                    gc.collect()
                log.info('Processing commit and push entry {0}/{1}'.format(
                    index + 1, len(results)))
                repo_info = repos_info[result['sha']]
                if not repo_info.get('namespace'):
                    log.info(
                        'Skipping nodes creation with commit ID {0} and push ID {1}'
                        .format(result['commit_id'], result['push_id']))
                    continue

                log.debug(
                    'Creating nodes associated with commit ID {0} and push ID {1}'.
                    format(result['commit_id'], result['push_id']))
                repo = DistGitRepo.get_or_create({
                    'namespace': repo_info['namespace'],
                    'name': result['module']
                })[0]
                # Only the part after the last "/" of the ref is kept as the branch name; this
                # raises IndexError if the ref contains no "/"
                branch_name = result['ref'].rsplit('/', 1)[1]
                branch = DistGitBranch.get_or_create({
                    'name': branch_name,
                    'repo_namespace': repo_info['namespace'],
                    'repo_name': result['module']
                })[0]
                commit = DistGitCommit.create_or_update({
                    'author_date': result['author_date'],
                    'commit_date': result['commit_date'],
                    'hash_': result['sha'],
                    # In case we get unicode characters in Python 2
                    'log_message': bytes(result['log_message'], 'utf-8').decode()
                })[0]
                push = DistGitPush.get_or_create({
                    'id_': result['push_id'],
                    'push_date': result['push_date'],
                    'push_ip': result['push_ip']
                })[0]
                bug = BugzillaBug.get_or_create({'id_': result['bugzilla_id']})[0]

                log.debug(
                    'Creating the user nodes associated with commit ID {0} and push ID {1}'
                    .format(result['commit_id'], result['push_id']))
                author = User.create_or_update({
                    'username': repo_info['author_username'],
                    'email': repo_info['author_email']
                })[0]
                committer = User.create_or_update({
                    'username': repo_info['committer_username'],
                    'email': repo_info['committer_email']
                })[0]
                pusher = User.get_or_create({'username': result['pusher']})[0]

                log.debug(
                    'Creating the relationships associated with commit ID {0} and push ID {1}'
                    .format(result['commit_id'], result['push_id']))
                repo.contributors.connect(author)
                repo.contributors.connect(committer)
                repo.contributors.connect(pusher)
                repo.commits.connect(commit)
                repo.pushes.connect(push)
                repo.branches.connect(branch)

                branch.contributors.connect(author)
                branch.contributors.connect(committer)
                branch.contributors.connect(pusher)
                branch.commits.connect(commit)
                branch.pushes.connect(push)

                push.conditional_connect(push.pusher, pusher)
                push.commits.connect(commit)

                commit.conditional_connect(commit.author, author)
                commit.conditional_connect(commit.committer, committer)

                if repo_info['parent']:
                    parent_commit = DistGitCommit.get_or_create(
                        {'hash_': repo_info['parent']})[0]
                    commit.conditional_connect(commit.parent, parent_commit)

                if result['bugzilla_type'] == 'related':
                    commit.related_bugs.connect(bug)
                elif result['bugzilla_type'] == 'resolves':
                    commit.resolved_bugs.connect(bug)
                elif result['bugzilla_type'] == 'reverted':
                    commit.reverted_bugs.connect(bug)
                # This is no longer needed so it can be cleared to save RAM
                del repo_info
        finally:
            # The pool is local to this call, so shut its worker processes down instead of
            # leaving them behind after the method returns or raises
            pool.close()
            pool.join()