def main():
    """Mine the commits of every distinct repo slug found in the issue table.

    The folder that holds the local clones is taken from the first
    command-line argument.
    """
    import sys

    # Open a new session, initialising the db tables
    db = SessionWrapper.new(init=True)
    slug_rows = db.query(GhIssue.slug).distinct()

    for row in slug_rows:
        # argv is read per iteration, exactly as before: a missing argument
        # only surfaces once there is at least one repo to process.
        get_commits(row.slug, sys.argv[1])
Example #2
0
def main():
    """Run the blamer on every repo that has at least one valid issue link."""
    db = SessionWrapper.new(init=True)

    # Only repos with valid issue links: not a pull request, commit authored
    # after the issue was opened and not after it was closed.
    valid_link = and_(IssueLink.repo_id == Repo.id,
                      IssueLink.is_pr == 0,
                      IssueLink.delta_open > 0,
                      IssueLink.delta_closed <= 0)
    repo_rows = db.query(IssueLink.repo_id,
                         Repo.slug).filter(valid_link).distinct().all()

    # Load every available token into a queue handed to each blamer
    tokens = Tokens()
    tokens_queue = Queue()
    for tok in tokens.iterator():
        tokens_queue.put(tok)
    tokens_map = {}

    for row in repo_rows:
        blamer = Blamer(tokens, tokens_queue, tokens_map)
        blamer.get_blamed_commits(row.slug, row.repo_id)
from orm.initdb import SessionWrapper
from orm.tables import Project, Commit, Blame, Repo, Bug_Commit_Timeline, Bug_Issue_Timeline, Issue_Timeline
from orm.ghissue import GhIssue, GHIssueClassification

# Module-level session used by the functions below (e.g. extract_blame).
# NOTE(review): init=True presumably also creates the db tables -- confirm
# against SessionWrapper.new.
session = SessionWrapper.new(init=True)
#session.execute("Truncate table bug_commit_timeline")


def create_dict(first_year=2012, last_year=2018, num_quarters=4):
    """Build a zero-initialised per-year, per-quarter counter table.

    Args:
        first_year: First year to include (inclusive).
        last_year: Last year to include (inclusive).
        num_quarters: Number of quarter slots per year, indexed from 0.

    Returns:
        A dict mapping each year to its own independent dict of
        ``{quarter_index: 0}``. With the defaults this covers the years
        2012..2018, each with quarters 0..3.
    """
    return {
        year: {quarter: 0 for quarter in range(num_quarters)}
        for year in range(first_year, last_year + 1)
    }


def extract_blame():

    for project in session.query(Project).filter(Project.num_commits >= 0):
        # if not (project.language =="C++" or project.language=="Java"):
        #     continue
        repo = session.query(Repo).filter_by(slug=project.name).first()
        blame_dict = create_dict()
        blamed_entries = session.query(Blame).filter_by(repo_id=repo.id)
        num_blamed_entries = blamed_entries.count()
        for entry in blamed_entries:
            blamed_commit_sha = entry.blamed_sha
            try:
                blamed_commit = session.query(Commit).filter_by(
def get_commits(slug, repos_folder):
    """Mine the commit history of a locally cloned repo into the database.

    Walks every commit reachable from HEAD. For each commit that has at least
    one parent and is not stored yet, it adds a ``Commit`` row, one
    ``CommitFiles`` row per file in the diff, and an ``IssueLink`` row for each
    issue id found by ``commit.getIssueIds()`` that has exactly one matching
    ``GhIssue`` row. After the walk it stores the contributors seen and
    updates the repo's min/max authored date and total commit count.

    Args:
        slug: Repository slug; used to look up the ``Repo`` and ``GhIssue`` rows.
        repos_folder: Folder containing the local clone of the repo.

    Returns:
        ``slug``. Any ``Exception`` raised during the analysis is logged and
        swallowed; ``KeyboardInterrupt``/``SystemExit`` are allowed to propagate.
    """
    contributors = {}
    counter = itertools.count(start=1)
    basic_classifier = BasicFileTypeClassifier()

    session = SessionWrapper.new()

    def contributor_id(name, email):
        # Contributors are keyed case-insensitively on (name, email); ids are
        # handed out in order of first appearance, starting from 1.
        key = (name.lower(), email.lower())
        if key not in contributors:
            contributors[key] = next(counter)
        return contributors[key]

    try:
        folder_name = slugToFolderName(slug)
        folder_path = os.path.join(repos_folder, folder_name)

        # Start from an inverted date range (min = now, max = ~100 years ago)
        # so that the first commit seen sets both ends.
        min_commit = datetime.now(timezone.utc)
        max_commit = min_commit - timedelta(days=100 * 365)
        total_commits = 0

        if not os.path.exists(folder_path):
            return slug

        try:
            db_repo = session.query(Repo).filter_by(slug=slug).one()
            # NOTE: the early return that used to follow this message is
            # disabled, so the walk below still runs and only adds the commits
            # that are not in the db yet (useful after a crash and re-run).
            logger.info(
                msg="Skipping analysis of commits from %s, already in the db" %
                slug)
        except exc.NoResultFound:
            db_repo = Repo(slug, min_commit, max_commit, total_commits)
            session.add(db_repo)
            session.commit()
        except exc.MultipleResultsFound:
            logger.warning(msg="Found multiple results querying for repo %s." %
                           slug)
            # There is no single repo row to attach data to; previously the
            # walk went on and failed on the unbound db_repo.
            return slug

        git_repo = pygit2.Repository(folder_path)

        last = git_repo[git_repo.head.target]

        # Fetch all commits as an iterator, and iterate it
        for c in git_repo.walk(last.id, pygit2.GIT_SORT_TIME):
            commit = CommitWrapper(c)

            total_commits += 1

            sha = commit.getSha()

            authored_datetime = commit.getAuthoredDate()

            if authored_datetime < min_commit:
                min_commit = authored_datetime
            if authored_datetime > max_commit:
                max_commit = authored_datetime

            author_name, author_email = commit.getAuthor()
            committer_name, committer_email = commit.getCommitter()
            author_id = contributor_id(author_name, author_email)
            committer_id = contributor_id(committer_name, committer_email)

            # Root commits (no parents) are counted but not stored
            parents = commit.getParents()
            num_parents = len(parents)
            if not num_parents:
                continue

            message = commit.getMessage().strip()

            try:
                session.query(Commit).filter_by(repo_id=db_repo.id,
                                                sha=sha).one()
            except exc.NoResultFound:
                pass
            else:
                # already present: go on with the next commit
                continue

            diff = commit.getDiff(git_repo)
            loc_added = diff.stats.insertions
            loc_deleted = diff.stats.deletions
            num_files_touched = diff.stats.files_changed

            # get info about changes to src files in the new commit
            all_files, src_files, num_src_files_touched, src_loc_added, src_loc_deleted = \
                CommitWrapper.get_src_changes(basic_classifier, diff)
            try:
                db_commit = Commit(db_repo.id, sha, authored_datetime,
                                   author_id, committer_id, message,
                                   num_parents, loc_added, loc_deleted,
                                   num_files_touched, all_files,
                                   src_loc_added, src_loc_deleted,
                                   num_src_files_touched, src_files)
                session.add(db_commit)
                # required to flush the pending data before adding to the
                # CommitFiles table below
                session.commit()
            except Exception:
                # The failed commit leaves the session unusable until it is
                # rolled back; without this the retry below could never work.
                session.rollback()
                # Retry with the free-text fields blanked.
                # NOTE(review): presumably the first attempt fails when one of
                # those fields cannot be stored -- confirm.
                all_files = ""
                src_files = ""
                message = ""
                db_commit = Commit(db_repo.id, sha, authored_datetime,
                                   author_id, committer_id, message,
                                   num_parents, loc_added, loc_deleted,
                                   num_files_touched, all_files,
                                   src_loc_added, src_loc_deleted,
                                   num_src_files_touched, src_files)
                session.add(db_commit)
                session.commit()

            # parse changed files per diff
            for patch in diff:
                file_name = os.path.basename(patch.delta.new_file.path)
                try:
                    session.query(CommitFiles).filter_by(
                        commit_sha=sha, repo_slug=slug,
                        file=file_name).one()
                    continue  # already present: go on with the next file
                except exc.NoResultFound:
                    lang = basic_classifier.labelFile(file_name)
                    loc_ins = 0
                    loc_del = 0
                    for hunk in patch.hunks:
                        for hl in hunk.lines:
                            if hl.origin == '-':
                                # NOTE(review): deletions accumulate as a
                                # negative number -- confirm this is what
                                # readers of CommitFiles expect.
                                loc_del -= 1
                            elif hl.origin == '+':
                                loc_ins += 1
                    session.add(
                        CommitFiles(db_repo.id, db_repo.slug, sha, file_name,
                                    loc_ins, loc_del, lang))

            session.commit()

            # Link the commit to the issues referenced in its message
            for (line_num, issue_ids) in commit.getIssueIds():

                for issue_id in issue_ids:
                    logger.info(msg="Analyzing {0} issue {1}.".format(
                        slug, issue_id))
                    try:
                        gh_issue = session.query(GhIssue).filter(
                            and_(GhIssue.slug == slug,
                                 GhIssue.issue_number == issue_id)).one()
                    except exc.MultipleResultsFound:
                        logger.warning(
                            msg="{0}: Issue {1} has multiple entries.".format(
                                slug, issue_id))
                        continue
                    except exc.NoResultFound:
                        logger.warning(
                            msg=
                            "{0}: Issue {1} no entry found in the issue table."
                            .format(slug, issue_id))
                        continue

                    try:
                        session.query(IssueLink).filter(
                            and_(IssueLink.repo_id == db_repo.id,
                                 IssueLink.sha == sha,
                                 IssueLink.issue_number == issue_id)).one()
                        print(db_repo.id, "Touch")
                        continue
                    except exc.NoResultFound:
                        # Seconds between the issue being opened/closed and
                        # the commit being authored
                        delta_open = (
                            authored_datetime - gh_issue.created_at.replace(
                                tzinfo=pytz.utc)).total_seconds()

                        if gh_issue.closed_at is not None:
                            delta_closed = (
                                authored_datetime -
                                gh_issue.closed_at.replace(
                                    tzinfo=pytz.utc)).total_seconds()
                        else:
                            delta_closed = None

                        db_link = IssueLink(db_repo.id, sha, line_num,
                                            issue_id,
                                            gh_issue.pr_num is not None,
                                            delta_open, delta_closed)
                        session.add(db_link)

            # Persist the links now, so that a rollback while storing the next
            # commit cannot discard them.
            session.commit()

        for (name, email), user_id in sorted(contributors.items(),
                                             key=lambda e: e[1]):
            try:
                session.query(User).filter(
                    and_(User.name == func.binary(name),
                         User.email == func.binary(email),
                         User.repo_id == db_repo.id)).one()
            except exc.NoResultFound:
                db_user = User(db_repo.id, user_id, name, email)
                session.add(db_user)
            except exc.MultipleResultsFound:
                # Would this happens because we allow name aliases during mining?
                # Should we deal with it? And how?
                logger.warning(
                    msg="Multiple entries for user \'{0}\' <{1}> in repo {2}".
                    format(name, email, db_repo.slug))

        db_repo.min_commit = min_commit
        db_repo.max_commit = max_commit
        db_repo.total_commits = total_commits
        session.add(db_repo)

        session.commit()

    except Exception as e:
        logger.error(msg="{0}: unknown error:\t{1}".format(slug, e))
        traceback.print_exc()

    # Returned after the try block rather than from a `finally`, which would
    # also swallow KeyboardInterrupt/SystemExit.
    return slug
Example #5
0
def get_commits(slug, repos_folder):
    """Link commits to issues and blame the lines removed by issue-closing commits.

    Walks every commit reachable from HEAD. For each issue id returned by
    ``commit.getIssueIds()`` that has exactly one ``GhIssue`` row, a new
    ``IssueLink`` is stored unless one already exists. When at least one newly
    created link is "valid" (not a pull request, commit authored after the
    issue was opened and not after it was closed) and the commit has a parent,
    the commit is stored and every line it deleted from a non-documentation
    file is blamed against the first parent. One ``Blame`` row is stored per
    (file, blamed commit) with the number of blamed lines labelled ``CG_CODE``.
    Finally the contributors and the repo's commit statistics are stored.

    Args:
        slug: Repository slug; used to look up the ``Repo`` and ``GhIssue`` rows.
        repos_folder: Folder containing the local clone of the repo.

    Returns:
        ``slug``. Any ``Exception`` raised during the analysis is logged and
        swallowed; ``KeyboardInterrupt``/``SystemExit`` are allowed to propagate.
    """
    contributors = {}
    counter = itertools.count(start=1)

    basic_classifier = BasicFileTypeClassifier()

    session = SessionWrapper.new()

    def contributor_id(name, email):
        # Contributors are keyed case-insensitively on (name, email); ids are
        # handed out in order of first appearance, starting from 1.
        key = (name.lower(), email.lower())
        if key not in contributors:
            contributors[key] = next(counter)
        return contributors[key]

    try:
        folder_name = slugToFolderName(slug)
        folder_path = os.path.join(repos_folder, folder_name)

        # Start from an inverted date range (min = now, max = ~100 years ago)
        # so that the first commit seen sets both ends.
        min_commit = datetime.now(timezone.utc)
        max_commit = min_commit - timedelta(days=100 * 365)
        total_commits = 0

        if not os.path.exists(folder_path):
            return slug

        try:
            db_repo = session.query(Repo).filter_by(slug=slug).one()
        except exc.NoResultFound:
            db_repo = Repo(slug, min_commit, max_commit, total_commits)
            session.add(db_repo)
            session.commit()
        except exc.MultipleResultsFound:
            logger.warning(msg="Found multiple results querying for repo %s." %
                           slug)
            # There is no single repo row to attach data to; previously the
            # walk went on and failed on the unbound db_repo.
            return slug

        git_repo = pygit2.Repository(folder_path)

        last = git_repo[git_repo.head.target]

        # Fetch all commits as an iterator, and iterate it
        for c in git_repo.walk(last.id, pygit2.GIT_SORT_TIME):
            commit = CommitWrapper(c)

            total_commits += 1

            sha = commit.getSha()

            authored_datetime = commit.getAuthoredDate()

            if authored_datetime < min_commit:
                min_commit = authored_datetime
            if authored_datetime > max_commit:
                max_commit = authored_datetime

            author_name, author_email = commit.getAuthor()
            committer_name, committer_email = commit.getCommitter()
            author_id = contributor_id(author_name, author_email)
            committer_id = contributor_id(committer_name, committer_email)

            message = commit.getMessage()
            if message is None:
                continue

            issue_ids = commit.getIssueIds()
            if len(issue_ids) < 1:
                continue

            # Counts only the links created in this run that are valid
            num_valid_issues = 0
            for issue_id in issue_ids:
                try:
                    # was session_travis
                    gh_issue = session.query(GhIssue).filter(
                        and_(GhIssue.slug == slug,
                             GhIssue.issue_number == issue_id)).one()
                except exc.MultipleResultsFound:
                    logger.warning(
                        msg="{0}: Issue {1} has multiple entries.".format(
                            slug, issue_id))
                    continue
                except exc.NoResultFound:
                    logger.warning(
                        msg="{0}: Issue {1} no entry found in the issue table."
                        .format(slug, issue_id))
                    continue

                try:
                    session.query(IssueLink).filter(
                        and_(IssueLink.repo_id == db_repo.id,
                             IssueLink.sha == sha,
                             IssueLink.issue_number == issue_id)).one()
                except exc.NoResultFound:
                    # TODO: why authored_datetime and not committed_datetime?
                    delta_open = (
                        authored_datetime -
                        gh_issue.created_at.replace(tzinfo=pytz.utc)
                    ).total_seconds()
                    # TODO: closed_at is important!
                    if gh_issue.closed_at is not None:
                        delta_closed = (
                            authored_datetime -
                            gh_issue.closed_at.replace(tzinfo=pytz.utc)
                        ).total_seconds()

                        if delta_open > 0 and delta_closed <= 0 and gh_issue.pr_num is None:
                            num_valid_issues += 1
                    else:
                        delta_closed = None

                    db_link = IssueLink(db_repo.id, sha, issue_id,
                                        gh_issue.pr_num is not None,
                                        delta_open, delta_closed)

                    session.add(db_link)

            if not num_valid_issues:
                continue

            first_msg_line = message.split('\n')[0]

            parents = commit.getParents()
            num_parents = len(parents)

            if not num_parents:
                continue

            sha_parent = parents[0].hex

            diff = commit.getDiff(git_repo)

            try:
                session.query(Commit).filter_by(sha=sha).one()
            except exc.NoResultFound:
                db_commit = Commit(db_repo.id, sha, authored_datetime,
                                   author_id, committer_id, first_msg_line,
                                   num_parents, diff.stats.insertions,
                                   diff.stats.deletions,
                                   diff.stats.files_changed)
                session.add(db_commit)
                session.commit()

            # TODO: this part should be extracted into a separate script
            blamed_commits = {}
            # Blamed commits ignored because they touched too many files.
            # Tracked separately so that they stay ignored on every later
            # blame hunk, not only the first time they are seen.
            oversized_shas = set()

            for patch in diff:
                old_file = patch.delta.old_file.path
                label = basic_classifier.labelFile(old_file)

                # Ignore changes to documentation files
                if label == basic_classifier.DOC:
                    continue

                line_labels = {}
                blame_counter = {}

                for hunk in patch.hunks:
                    if not hunk.old_lines:
                        continue

                    # Label every deleted line, keyed by its old line number
                    for hl in hunk.lines:
                        if hl.origin == '-':
                            stripped = hl.content.replace('\r', '').replace(
                                '\n', '')
                            line_labels[
                                hl.old_lineno] = basic_classifier.labelDiffLine(
                                    stripped)

                    try:
                        for bh in git_repo.blame(
                                old_file,
                                newest_commit=sha_parent,
                                min_line=hunk.old_start,
                                max_line=hunk.old_start + hunk.old_lines - 1):
                            blamed_sha = str(bh.final_commit_id)

                            if blamed_sha in oversized_shas:
                                continue

                            if blamed_sha not in blamed_commits:
                                try:
                                    blamed_commit = CommitWrapper(
                                        git_repo.revparse_single(blamed_sha))

                                    blamed_commits[blamed_sha] = blamed_commit

                                    blamed_num_parents = len(
                                        blamed_commit.getParents())

                                    if not blamed_num_parents:
                                        # Root commit: no diff stats available
                                        ins = None
                                        dels = None
                                        files = None
                                    else:
                                        blamed_diff = blamed_commit.getDiff(
                                            git_repo)
                                        ins = blamed_diff.stats.insertions
                                        dels = blamed_diff.stats.deletions
                                        files = blamed_diff.stats.files_changed

                                    # Ignore commits that changed 100 files or
                                    # more. `files` is None for root commits;
                                    # comparing None raised a TypeError here.
                                    if files is not None and files >= 100:
                                        oversized_shas.add(blamed_sha)
                                        continue

                                    try:
                                        session.query(Commit).filter_by(
                                            sha=blamed_sha).one()
                                    except exc.MultipleResultsFound:
                                        logger.warning(
                                            msg=
                                            "{0}: Multiple rows for blamed sha {1}."
                                            .format(slug, blamed_sha))
                                        traceback.print_exc()
                                    except exc.NoResultFound:
                                        blamed_authored_datetime = blamed_commit.getAuthoredDate(
                                        )

                                        (blamed_author_name,
                                         blamed_author_email
                                         ) = blamed_commit.getAuthor()
                                        (blamed_committer_name,
                                         blamed_committer_email
                                         ) = blamed_commit.getCommitter()

                                        blamed_author_id = contributor_id(
                                            blamed_author_name,
                                            blamed_author_email)
                                        blamed_committer_id = contributor_id(
                                            blamed_committer_name,
                                            blamed_committer_email)

                                        blamed_message = blamed_commit.getMessage(
                                        )
                                        blamed_first_msg_line = blamed_message.split(
                                            '\n')[0]

                                        session.add(
                                            Commit(db_repo.id, blamed_sha,
                                                   blamed_authored_datetime,
                                                   blamed_author_id,
                                                   blamed_committer_id,
                                                   blamed_first_msg_line,
                                                   blamed_num_parents, ins,
                                                   dels, files))
                                        session.commit()

                                except Exception as e:
                                    logger.error(
                                        msg="{0}: revparse error {1}:\t{2}".
                                        format(slug, blamed_sha, e))
                                    traceback.print_exc()

                            # Count the blamed lines that were deleted and
                            # labelled as code. Lines of the blame range that
                            # were not deleted have no label; .get() skips
                            # them instead of raising KeyError and aborting
                            # the rest of the hunk.
                            for line_num in range(
                                    bh.final_start_line_number,
                                    bh.final_start_line_number +
                                    bh.lines_in_hunk):
                                if line_labels.get(
                                        line_num) == basic_classifier.CG_CODE:
                                    blame_counter.setdefault(blamed_sha, 0)
                                    blame_counter[blamed_sha] += 1
                    except Exception as e:
                        logger.error(
                            msg="{0} blame error {1}:\t{2}".format(
                                slug, sha, e))

                for blamed_sha, num_lines in blame_counter.items():
                    b = Blame(db_repo.id, sha, old_file, label, blamed_sha,
                              num_lines)
                    session.add(b)
                session.commit()

        for (name, email), user_id in sorted(contributors.items(),
                                             key=lambda e: e[1]):
            try:
                session.query(User).filter(
                    and_(User.name == func.binary(name),
                         User.email == func.binary(email),
                         User.repo_id == db_repo.id)).one()
            except exc.NoResultFound:
                db_user = User(db_repo.id, user_id, name, email)
                session.add(db_user)
            except exc.MultipleResultsFound:
                # FIXME this shouldn't be happening
                # is it because we allow name aliases during mining?
                # How do we deal with it now?
                logger.warning(
                    msg="Multiple entries for user \'{0}\' <{1}> in repo {2}".
                    format(name, email, db_repo.slug))

        db_repo.min_commit = min_commit
        db_repo.max_commit = max_commit
        db_repo.total_commits = total_commits
        session.add(db_repo)

        session.commit()

    except Exception as e:
        logger.error(msg="{0}: unknown error:\t{1}".format(slug, e))
        traceback.print_exc()

    # Returned after the try block rather than from a `finally`, which would
    # also swallow KeyboardInterrupt/SystemExit.
    return slug
Example #6
0
    def get_blamed_commits(self, slug, db_repo_id, repos_folder='./repos'):
        """Blame the lines deleted by issue-fixing commits and store the result.

        Walks the history of the local clone of ``slug`` starting from HEAD.
        For every commit linked to at least one valid issue, the lines it
        deletes from non-binary, non-documentation files are blamed against
        the commit's first parent. For each touched file, one ``Blame`` row is
        stored per blamed commit that last modified at least one deleted line
        labelled ``CG_CODE``.

        This is not a basic SZZ implementation, as changes are classified at
        line level (e.g., changes to lines of comments are skipped).

        :param slug: repository slug; used to locate the clone, passed to
            ``self.get_gh_repo`` and reported in log messages
        :param db_repo_id: id of the repository in the database
        :param repos_folder: folder that contains the local clones
        :return: None. If the local clone cannot be opened, the error is
            logged and the method returns without doing anything else.
        """
        session = SessionWrapper.new()
        basic_classifier = BasicFileTypeClassifier()

        folder_path = os.path.join(repos_folder, slugToFolderName(slug))
        try:
            git_repo = pygit2.Repository(folder_path)
            last = git_repo[git_repo.head.target]
        except Exception:
            logger.error("Git error opening repo %s" % slug)
            return

        try:
            contributors = self.get_contributors(session, db_repo_id)
            # TODO check start number
            counter = itertools.count(start=len(contributors))
        except exc.NoResultFound:
            logger.error(
                msg="No contributors found for repo {0}.".format(slug))
            traceback.print_exc()
            # Keep both names bound: they are needed later when a blamed
            # commit that is missing from the db has to be stored.
            contributors = {}
            counter = itertools.count()

        repo, pid, gh = self.get_gh_repo(slug)

        # blamed sha -> CommitWrapper, for every blamed commit already seen
        blamed_commits = {}
        # blamed shas that were filtered out (root commit or too large); they
        # must stay ignored every time they show up again, not just the first
        ignored_shas = set()

        # Fetch all commits as an iterator, and iterate it
        for c in git_repo.walk(last.id, pygit2.GIT_SORT_TIME):
            commit = CommitWrapper(c)
            sha = commit.getSha()

            # Valid issues are those
            # 1) for which the associated commit was registered *after* the
            #    issue was open (delta open > 0)
            # 2) for which the associated commit was registered *before or
            #    exactly when* the associated issue was closed
            #    (delta closed <= 0)
            # 3) are not pull requests (is_pr == 1), just issues (is_pr == 0)
            issue_links = session.query(IssueLink).filter(
                and_(IssueLink.repo_id == db_repo_id, IssueLink.sha == sha,
                     IssueLink.is_pr == 0, IssueLink.delta_open > 0,
                     IssueLink.delta_closed <= 0))
            if not self._closes_valid_issue(issue_links, repo, pid, gh):
                continue

            logger.info("Blaming commit %s from repo %s" % (sha, slug))
            try:
                sha_parent = commit.getParents()[0].hex
            except IndexError:
                # no parent to blame against
                continue

            diff = commit.getDiff(git_repo)

            for patch in diff:
                # skip changes to binary files
                if patch.delta.is_binary:
                    continue

                old_file = patch.delta.old_file.path
                label = basic_classifier.labelFile(old_file)

                # Ignore changes to documentation files
                if label == basic_classifier.DOC:
                    continue

                # old line number -> label of the deleted line
                line_labels = {}
                # blamed sha -> number of deleted CG_CODE lines it introduced
                blame_counter = {}

                for hunk in patch.hunks:
                    if not hunk.old_lines:
                        continue

                    # Only changes to deleted lines can be tracked back to
                    # when they were first introduced: there is no parent
                    # commit that introduced a line that is being added in
                    # the current commit for the first time (ie, lines marked
                    # with a '+' in the diffs).
                    for hl in hunk.lines:
                        if hl.origin == '-':
                            content = hl.content.replace('\r', '').replace(
                                '\n', '')
                            line_labels[hl.old_lineno] = \
                                basic_classifier.labelDiffLine(content)

                    try:
                        for bh in git_repo.blame(
                                old_file,
                                newest_commit=sha_parent,
                                min_line=hunk.old_start,
                                max_line=hunk.old_start + hunk.old_lines - 1):
                            blamed_sha = str(bh.final_commit_id)

                            # if sha of commit is not already in the list of
                            # blamed commits
                            if blamed_sha not in blamed_commits:
                                try:
                                    keep = self._register_blamed_commit(
                                        session, git_repo, basic_classifier,
                                        slug, db_repo_id, blamed_sha,
                                        blamed_commits, contributors, counter)
                                except Exception as e:
                                    # best effort: the lines are still counted
                                    logger.error(
                                        msg="{0}: revparse error {1}:\t{2}"
                                        .format(slug, blamed_sha, e))
                                    traceback.print_exc()
                                else:
                                    if not keep:
                                        ignored_shas.add(blamed_sha)

                            if blamed_sha in ignored_shas:
                                continue

                            for line_num in range(
                                    bh.final_start_line_number,
                                    bh.final_start_line_number +
                                    bh.lines_in_hunk):
                                # The blamed range covers the whole old side
                                # of the hunk, so it may contain lines that
                                # were not deleted and thus have no label.
                                if line_labels.get(
                                        line_num) == basic_classifier.CG_CODE:
                                    blame_counter[blamed_sha] = \
                                        blame_counter.get(blamed_sha, 0) + 1
                    except ValueError as ve:
                        logger.error(
                            msg=
                            "{0} blame error on commit {1} probably due to changes coming from a submodule: {2}"
                            .format(slug, sha, ve))
                    except Exception as e:
                        logger.error(
                            msg="{0} Unknown blame error on commit {1}: {2}"
                            .format(slug, sha, e))
                        traceback.print_exc()

                for blamed_sha, num_lines in blame_counter.items():
                    session.add(
                        Blame(db_repo_id, sha, old_file, label, blamed_sha,
                              num_lines))
                session.commit()

    def _closes_valid_issue(self, issue_links, repo, pid, gh):
        """Tell whether at least one of the linked issues is a valid one.

        An issue is valid when it has no labels at all, or when a label in
        ``self.valid_labels`` is met before any label in
        ``self.invalid_labels`` (e.g., 'feature' or 'enhancement' issues are
        meant to be ignored, 'fix' or 'bug-fix' ones retained).

        :param issue_links: iterable of rows exposing ``issue_number``
        :param repo: object exposing ``get_issue(number)``
        :param pid: passed through to ``self.wait_if_depleted``
        :param gh: passed through to ``self.wait_if_depleted``
        :return: True as soon as a valid issue is found, False otherwise
        """
        for issue_link in issue_links:
            self.wait_if_depleted(pid, gh)
            issue = repo.get_issue(issue_link.issue_number)
            if not issue:
                continue
            if not issue.labels:  # no labels is fine
                return True
            for issue_label in issue.labels:
                if issue_label.name in self.invalid_labels:
                    break
                if issue_label.name in self.valid_labels:
                    return True
        return False

    def _register_blamed_commit(self, session, git_repo, basic_classifier,
                                slug, db_repo_id, blamed_sha, blamed_commits,
                                contributors, counter):
        """Record a newly met blamed commit and decide whether to keep it.

        The commit is added to ``blamed_commits``. If it passes the size
        filters and has no row in the db yet, a ``Commit`` row is created and
        committed; contributors missing from ``contributors`` get the next
        value of ``counter`` as id.

        :return: False if the commit must be ignored (it has no parents, or
            it changed too many files / added too many lines), True otherwise
        :raises Exception: whatever the git or db layer raises; the caller
            logs it
        """
        # TODO fine-tune both thresholds
        max_files = 50
        max_insertions = 200

        blamed_commit = CommitWrapper(git_repo.revparse_single(blamed_sha))
        blamed_commits[blamed_sha] = blamed_commit

        blamed_num_parents = len(blamed_commit.getParents())
        if not blamed_num_parents:
            # no parent, hence no diff stats to evaluate: ignore
            return False

        stats = blamed_commit.getDiff(git_repo).stats
        ins = stats.insertions
        dels = stats.deletions
        num_files = stats.files_changed

        # Ignore commits that changed 50 files or more
        if num_files >= max_files:
            return False
        # Ignore commits that added 200 lines or more
        if ins >= max_insertions:
            return False

        try:
            session.query(Commit).filter_by(sha=blamed_sha).one()
            return True
        except exc.MultipleResultsFound:
            logger.warning(
                msg="{0}: Multiple rows for blamed sha {1}.".format(
                    slug, blamed_sha))
            traceback.print_exc()
            return True
        except exc.NoResultFound:
            # TODO does it ever happen?
            logger.warning("exc.NoResultFound at line 141 of blame")

        # The blamed commit is not in the db yet: build and store its row.
        blamed_authored_datetime = blamed_commit.getAuthoredDate()

        # Lower-case both identities before touching the contributors
        # mapping, so that a failure here leaves it unchanged.
        author_name, author_email = blamed_commit.getAuthor()
        author_key = (author_name.lower(), author_email.lower())
        committer_name, committer_email = blamed_commit.getCommitter()
        committer_key = (committer_name.lower(), committer_email.lower())

        blamed_author_id = self._contributor_id(
            contributors, counter, author_key, 'author', slug)
        blamed_committer_id = self._contributor_id(
            contributors, counter, committer_key, 'committer', slug)

        blamed_first_msg_line = blamed_commit.getMessage().split('\n')[0]

        # get info about changes to src files in the new blamed commit
        (all_files, src_files, num_src_files_touched, src_loc_added,
         src_loc_deleted) = CommitWrapper.get_src_changes(
             basic_classifier, blamed_commit.getDiff(git_repo))

        session.add(
            Commit(db_repo_id, blamed_sha, blamed_authored_datetime,
                   blamed_author_id, blamed_committer_id,
                   blamed_first_msg_line, blamed_num_parents, ins, dels,
                   num_files, all_files, src_loc_added, src_loc_deleted,
                   num_src_files_touched, src_files))
        session.commit()
        return True

    def _contributor_id(self, contributors, counter, key, role, slug):
        """Return the id mapped to ``key``, assigning a new one if missing.

        :param contributors: mapping of ``(name, email)`` to id; updated in
            place when ``key`` is new
        :param counter: iterator yielding the next free id
        :param key: lower-cased ``(name, email)`` tuple
        :param role: 'author' or 'committer'; only used in the debug message
        :param slug: repository slug, only used in the debug message
        """
        if key not in contributors:
            logger.debug(
                msg=
                "Found a blamed {0} {1} - {2} not in the contributors list for repo {3}."
                .format(role, key[0], key[1], slug))
            # TODO what to do with newly added contributors here? save to db???
            contributors[key] = next(counter)
        return contributors[key]
Example #7
0
# Names exported by a star-import of this module.
# NOTE(review): the list mixes submodule names with the SessionWrapper class;
# presumably this is the ``orm`` package initializer - confirm.
__all__ = [
    'base', 'tables', 'ghissue', 'initdb', 'issue_comments', 'commit_files',
    'cross_reference', 'SessionWrapper'
]
from orm.base import Base
from orm.commit_files import CommitFiles
from orm.cross_reference import CrossReference
from orm.ghissue import GhIssue, BGhIssue, GHIssueClassification
from orm.initdb import SessionWrapper, SessionWrapper_GHT
from orm.issue_comments import IssueComment
from orm.tables import User, Repo, Commit, IssueLink, Blame, Project, Control_Repo
from orm.lindholmen import Repo_Lindholmen, UMLFile_Lindholmen, Commit_Lindholmen, Lindholmen_Issues
from orm.ght import Repo_GHT, PR_GHT, Issue_GHT, User_GHT

# Executed as a side effect of importing this module.
# NOTE(review): presumably loads the database configuration used by
# SessionWrapper.new() - confirm against orm.initdb.
SessionWrapper.load_config()