Example #1
 def get_file_id(self, file_path, commit_id):
     """Ask for the file_id for a given file_path and commit_id"""
     
     if config.debug:
         profiler_start("Getting file id for file_path %s and commit_id %d",
                         (file_path, commit_id))
     
     db = self.__dict__['db']
     cnn = db.connect()
     cursor = cnn.cursor()
     query = """SELECT file_id from file_paths
                WHERE file_path = ? AND commit_id <= ?
                ORDER BY commit_id DESC LIMIT 1"""
     cursor.execute(statement(query, db.place_holder),
                     (file_path, commit_id))
     row = cursor.fetchone()
     file_id = row[0] if row is not None else None
     
     cursor.close()
     cnn.close()
     
     if config.debug:
         profiler_stop("Getting file id for file_path %s and commit_id %d",
                        (file_path, commit_id), True)
     
     return file_id
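All of these snippets funnel their SQL through statement(), which adapts the generic "?" markers to whatever placeholder style the database driver expects. A minimal sketch of such a helper, assuming that is all it does (the projects' real implementation may differ):

def statement(query, place_holder):
    # Assumption: rewrite generic "?" markers into the driver's parameter
    # style, e.g. "%s" for MySQLdb. Not necessarily the shipped helper.
    if place_holder == "?":
        return query
    return query.replace("?", place_holder)

print(statement("SELECT 1 FROM t WHERE a = ?", "%s"))  # SELECT 1 FROM t WHERE a = %s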
Example #2
 def get_path_from_database(self, file_id, commit_id):
     """Returns the last valid path for a given file_id at commit_id
        (May have been removed afterwards!)"""
     
     if config.debug:
         profiler_start("Getting full file path for file_id %d and \
                         commit_id %d", (file_id, commit_id))
     
     db = self.__dict__['db']
     cnn = db.connect()
     
     cursor = cnn.cursor()
     query = """SELECT file_path from file_paths
                WHERE file_id=? AND commit_id <= ?
                ORDER BY commit_id DESC LIMIT 1"""
     cursor.execute(statement(query, db.place_holder), (file_id, commit_id))
     row = cursor.fetchone()
     file_path = row[0] if row is not None else None
     
     cursor.close()
     cnn.close()
     
     printdbg("get_path_from_database:\
               Path for file_id %d at commit_id %d: %s",
              (file_id, commit_id, file_path))
     if config.debug:
         profiler_stop("Getting full file path for file_id %d and\
                          commit_id %d", (file_id, commit_id), True)
     return file_path
Example #3
 def get_path_from_database(self, file_id, commit_id):
     """Returns the last valid path for a given file_id at commit_id
        (May have been removed afterwards!)"""
     
     if config.debug:
         profiler_start("Getting full file path for file_id %d and \
                         commit_id %d", (file_id, commit_id))
     
     db = self.__dict__['db']
     cnn = db.connect()
     
     cursor = cnn.cursor()
     query = """SELECT current_file_path from actions
                WHERE file_id=? AND commit_id <= ?
                ORDER BY commit_id DESC LIMIT 1"""
     cursor.execute(statement(query, db.place_holder), (file_id, commit_id))
     row = cursor.fetchone()
     file_path = row[0] if row is not None else None
     
     cursor.close()
     cnn.close()
     
     printdbg("get_path_from_database:\
               Path for file_id %d at commit_id %d: %s",
              (file_id, commit_id, file_path))
     if config.debug:
         profiler_stop("Getting full file path for file_id %d and\
                          commit_id %d", (file_id, commit_id), True)
     return file_path
Example #4
 def get_file_id(self, file_path, commit_id):
     """Ask for the file_id for a given file_path and commit_id"""
     
     if config.debug:
         profiler_start("Getting file id for file_path %s and commit_id %d",
                         (file_path, commit_id))
     
     db = self.__dict__['db']
     cnn = db.connect()
     cursor = cnn.cursor()
     query = """SELECT file_id from actions
                WHERE binary current_file_path = ? AND commit_id = ?
                ORDER BY commit_id DESC LIMIT 1"""
     cursor.execute(statement(query, db.place_holder),
                     (file_path, commit_id))
     row = cursor.fetchone()
     file_id = row[0] if row is not None else None
     
     cursor.close()
     cnn.close()
     
     if config.debug:
         profiler_stop("Getting file id for file_path %s and commit_id %d",
                        (file_path, commit_id), True)
     
     return file_id
Example #5
    def get_path(self, file_id, commit_id, repo_id):
        profiler_start("Getting path for file %d at commit %d", (file_id, commit_id))

        adj = self.__dict__['adj']
        assert adj is not None, "Matrix not updated"

        path = self.__build_path(file_id, adj)

        profiler_stop("Getting path for file %d at commit %d", (file_id, commit_id), True)
        return path
Example #6
    def update_all(self, repo_id):
        """
        update_all enables caching of adjacency matrices.
        Pros: File paths in different revisions can be
        accessed randomly, i.e. after calling update_all,
        get_path can be called with any revision in any
        order.
        Cons: It consumes significant memory to store
        the adjacency matrices

        If the config has low_memory set to true, shelve will
        be used instead, to write the cache out to disk.
        """
        profiler_start("Update all file paths")
        
        if Config().low_memory:
            self.shelve_file_name = str(time()) + "-shelve.db"
            
            # If there is an old file, shelf will complain viciously
            if os.path.exists(self.shelve_file_name):
                os.remove(self.shelve_file_name)
            
            self.__dict__['cached_adj'] = shelve.open(self.shelve_file_name, 
                                                        writeback=False)
        
        db = self.__dict__['db']
        cnn = db.connect()

        cursor = cnn.cursor()
        query = """select distinct(s.id) from scmlog s, actions a
                    where s.id = a.commit_id and repository_id=?
                    order by s.date"""
        cursor.execute(statement(query, db.place_holder), (repo_id,))
        
        old_id = -1
        all_commits = [i[0] for i in cursor.fetchall()]
        for id in all_commits:
            if old_id != id:
                adj = self.__dict__['cached_adj'].get(str(id))

                if adj is None:
                    self.update_for_revision(cursor, id, repo_id)
                    self.__dict__['cached_adj'][str(id)] = \
                    deepcopy(self.__dict__['adj'])
                old_id = id
        cursor.close()
        cnn.close()
        profiler_stop("Update all file paths", delete=True)
Example #7
    def update_all(self, repo_id):
        """
        update_all enables caching of adjacency matrices.
        Pros: File paths in different revisions can be
        accessed randomly, i.e. after calling update_all,
        get_path can be called with any revision in any
        order.
        Cons: It consumes significant memory to store
        the adjacency matrices

        If the config has low_memory set to true, shelve will
        be used instead, to write the cache out to disk.
        """
        profiler_start("Update all file paths")
        
        if Config().low_memory:
            self.shelve_file_name = str(time()) + "-shelve.db"
            
            # If there is an old file, shelf will complain viciously
            if os.path.exists(self.shelve_file_name):
                os.remove(self.shelve_file_name)
            
            self.__dict__['cached_adj'] = shelve.open(self.shelve_file_name, 
                                                        writeback=False)
        
        db = self.__dict__['db']
        cnn = db.connect()

        cursor = cnn.cursor()
        query = """select distinct(s.id) from scmlog s, actions a
                    where s.id = a.commit_id and repository_id=?
                    order by s.commit_date"""
        cursor.execute(statement(query, db.place_holder), (repo_id,))
        
        old_id = -1
        all_commits = [i[0] for i in cursor.fetchall()]
        for id in all_commits:
            if old_id != id:
                adj = self.__dict__['cached_adj'].get(str(id))

                if adj is None:
                    self.update_for_revision(cursor, id, repo_id)
                    self.__dict__['cached_adj'][str(id)] = \
                    deepcopy(self.__dict__['adj'])
                old_id = id
        cursor.close()
        cnn.close()
        profiler_stop("Update all file paths", delete=True)
Example #8
    def __build_path(self, file_id, adj):
        if file_id not in adj.adj:
            return None

        profiler_start("Building path for file %d", (file_id,))

        tokens = []
        id = file_id

        while id != -1:
            tokens.insert(0, adj.files[id])
            id = adj.adj[id]

        profiler_stop("Building path for file %d", (file_id,), True)

        return "/" + "/".join(tokens)
Example #9
    def __build_path(self, file_id, adj):
        if file_id not in adj.adj:
            return None

        profiler_start("Building path for file %d", (file_id,))

        tokens = []
        id = file_id

        while id != -1:
            tokens.insert(0, adj.files[id])
            id = adj.adj.get(id, -1)

        profiler_stop("Building path for file %d", (file_id,), True)

        return "/" + "/".join(tokens)
Example #10
class BlameJob (Job):

    class BlameContentHandler (OutputDevice):
        def __init__ (self):
            self.authors = {}

        def start_file (self, filename):
            pass

        def line (self, line):
            self.authors.setdefault (line.author, 0)
            self.authors[line.author] += 1

        def end_file (self):
            pass

        def get_authors (self):
            return self.authors

    def __init__ (self, file_id, commit_id, path, rev):
        Job.__init__(self)
        self.file_id = file_id
        self.commit_id = commit_id
        self.path = path
        self.rev = rev
        self.authors = None

    def run (self, repo, repo_uri):
        profiler_start("Running BlameJob for %s@%s", (self.path,self.rev))
        def blame_line (line, p):
            p.feed (line)

        repo_type = repo.get_type ()
        if repo_type == 'cvs':
            # CVS paths contain the module stuff
            uri = repo.get_uri_for_path (repo_uri)
            module = uri[len (repo.get_uri ()):].strip ('/')

            if module != '.':
                path = self.path[len (module):].strip ('/')
            else:
                path = self.path.strip ('/')
        else:
            path = self.path.strip ('/')

        filename = os.path.basename (self.path)
        p = create_parser (repo.get_type (), self.path)
        out = self.get_content_handler()
        p.set_output_device (out)
        wid = repo.add_watch (BLAME, blame_line, p)
        try:
            repo.blame (os.path.join (repo_uri, path), self.rev)
            self.collect_results(out)
        except RepositoryCommandError, e:
            self.failed = True
            printerr ("Command %s returned %d (%s)", (e.cmd, e.returncode, e.error))
        p.end ()
        repo.remove_watch(BLAME, wid)
        profiler_stop("Running BlameJob for %s@%s", (self.path,self.rev), delete=True)
Example #11
    def __build_path(self, file_id, adj):
        if file_id not in adj.adj:
            return None

        profiler_start("Building path for file %d", (file_id,))
        
        tokens = []
        id = file_id
        
        while id is not None and id != -1:
            tokens.insert(0, adj.files[id])
            #use get instead of index to avoid key error
            id = adj.adj.get(id) 

        profiler_stop("Building path for file %d", (file_id,), True)

        return "/" + "/".join(tokens)
Example #12
    def update_all(self, repo_id):
        profiler_start("Update all file paths")
        db = self.__dict__['db']
        cnn = db.connect ()

        cursor = cnn.cursor ()
        query = """select distinct(s.id) from scmlog s, actions a
                    where s.id = a.commit_id and repository_id=?
                    order by s.id"""
        cursor.execute (statement (query, db.place_holder), (repo_id,))        
        old_id = -1
        all_commits = [i[0] for i in cursor.fetchall ()]
        for id in all_commits:
            if old_id != id:
                self.update_for_revision (cursor, id, repo_id)
                old_id = id
        cursor.close()
        cnn.close()
        profiler_stop("Update all file paths", delete=True)
Example #13
    def get_patches(self, repo, repo_uri, repo_id, db, cursor):
        profiler_start("Hunks: fetch all patches")
        icursor = ICursor(cursor, self.INTERVAL_SIZE)
        # Get the patches from this repository
        query = """select p.commit_id, p.file_id, p.patch, s.rev
                    from patches p, scmlog s
                    where p.commit_id = s.id and
                    s.repository_id = ? and
                    p.patch is not NULL"""
        icursor.execute(statement(query, db.place_holder), (repo_id,))
        profiler_stop("Hunks: fetch all patches", delete=True)

        rs = icursor.fetchmany()

        while rs:
            for commit_id, file_id, patch_content, rev in rs:
                yield (commit_id, file_id, to_utf8(patch_content), rev)
            
            rs = icursor.fetchmany()
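get_patches never materializes the whole result set: it yields rows batch by batch via fetchmany(). The same loop shape, demonstrated with plain sqlite3 standing in for the ICursor wrapper used above:

import sqlite3

def iter_rows(cursor, batch_size=2):
    # Same shape as get_patches: fetch a batch, yield its rows, repeat
    # until fetchmany() returns an empty list.
    rs = cursor.fetchmany(batch_size)
    while rs:
        for row in rs:
            yield row
        rs = cursor.fetchmany(batch_size)

cnn = sqlite3.connect(":memory:")
cnn.execute("create table patches (id integer)")
cnn.executemany("insert into patches values (?)", [(i,) for i in range(5)])
cursor = cnn.execute("select id from patches")
print(list(iter_rows(cursor)))  # [(0,), (1,), (2,), (3,), (4,)]
cnn.close()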
Example #14
    def get_patches(self, repo, repo_uri, repo_id, db, cursor):
        profiler_start("Hunks: fetch all patches")
        icursor = ICursor(cursor, self.INTERVAL_SIZE)
        # Get the patches from this repository
        query = """select p.commit_id, p.file_id, p.patch, s.rev
                    from patches p, scmlog s
                    where p.commit_id = s.id and
                    s.repository_id = ? and
                    p.patch is not NULL"""
        icursor.execute(statement(query, db.place_holder), (repo_id, ))
        profiler_stop("Hunks: fetch all patches", delete=True)

        rs = icursor.fetchmany()

        while rs:
            for commit_id, file_id, patch_content, rev in rs:
                yield (commit_id, file_id, to_utf8(patch_content), rev)

            rs = icursor.fetchmany()
Example #15
    def update_all(self, repo_id):
        profiler_start("Update all file paths")
        db = self.__dict__['db']
        cnn = db.connect()

        cursor = cnn.cursor()
        query = """select distinct(s.id) from scmlog s, actions a
                    where s.id = a.commit_id and repository_id=?
                    order by s.id"""
        cursor.execute(statement(query, db.place_holder), (repo_id, ))
        old_id = -1
        all_commits = [i[0] for i in cursor.fetchall()]
        for id in all_commits:
            if old_id != id:
                self.update_for_revision(cursor, id, repo_id)
                old_id = id
        cursor.close()
        cnn.close()
        profiler_stop("Update all file paths", delete=True)
Example #16
    def get_commit_data(self, patch_content):
        profiler_start("get_commit_data")
        lines = [l + "\n" for l in patch_content.splitlines() if l]
        hunks = []

        for patch in [p for p in parse_patches(lines, allow_dirty=True, \
                            allow_continue=True) if isinstance(p, Patch)]:
            # This method matches parseLine in UnifiedDiffParser.java.
            # It's not necessarily intuitive: the algorithm is much harder
            # than it looks, and I spent hours trying to find a simpler
            # solution. It does, however, seem to work.
            # The trick is that each *part* of a hunk, i.e. the added,
            # deleted, and changed lines, is treated as a *new entity*.
            # The EntityDelta table does not store just diffs; it stores
            # each part of a diff.
            # I will need to copy how Sep inserts a CommitData into the
            # database to ensure things match.
            for hunk in patch.hunks:
                old_start_line = hunk.orig_pos - 1
                new_start_line = hunk.mod_pos - 1

                old_end_line = 0
                new_end_line = 0

                added = False
                deleted = False
                in_change = False

                for line in hunk.lines:
                    if isinstance(line, RemoveLine):
                        if not in_change or not deleted:
                            in_change = True
                            old_start_line += 1
                            old_end_line = old_start_line
                        else:
                            old_end_line += 1

                        deleted = True

                    elif isinstance(line, InsertLine):
                        if not in_change or not added:
                            in_change = True
                            new_start_line += 1
                            new_end_line = new_start_line
                        else:
                            new_end_line += 1

                        added = True

                    elif isinstance(line, ContextLine):
                        if in_change:
                            in_change = False
                            printdbg("Patch new name: " + patch.newname)
                            file_name = re.split(r'\s+', patch.newname)[0]
                            if file_name == "/dev/null":
                                file_name = re.split(r'\s+', patch.oldname)[0]
                            cd = CommitData(file_name)

                            if deleted:
                                cd.old_start_line = old_start_line
                                cd.old_end_line = old_end_line
                                old_start_line = old_end_line

                            if added:
                                cd.new_start_line = new_start_line
                                cd.new_end_line = new_end_line
                                new_start_line = new_end_line

                            hunks.append(cd)
                            added = deleted = False

                        old_start_line += 1
                        new_start_line += 1

                # The diff ended without a new context line
                if in_change:
                    cd = CommitData(re.split(r'\s+', patch.newname)[0])

                    if deleted:
                        cd.old_start_line = old_start_line
                        cd.old_end_line = old_end_line

                    if added:
                        cd.new_start_line = new_start_line
                        cd.new_end_line = new_end_line

                    hunks.append(cd)
        profiler_stop("get_commit_data")
        return hunks
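The bookkeeping above is easier to follow in isolation. Below is a re-implementation of the same state machine over bare '+'/'-'/' ' line tags, with plain tuples standing in for CommitData: each contiguous run of removed and/or added lines between context lines becomes one record.

def hunk_ranges(tags, orig_pos, mod_pos):
    """Returns (old_start, old_end, new_start, new_end) per change block;
    0 means that side of the block is empty."""
    ranges = []
    old_start = orig_pos - 1
    new_start = mod_pos - 1
    old_end = new_end = 0
    added = deleted = in_change = False
    for tag in tags:
        if tag == '-':
            if not in_change or not deleted:
                in_change = True
                old_start += 1
                old_end = old_start
            else:
                old_end += 1
            deleted = True
        elif tag == '+':
            if not in_change or not added:
                in_change = True
                new_start += 1
                new_end = new_start
            else:
                new_end += 1
            added = True
        else:  # a context line closes the current change block
            if in_change:
                in_change = False
                ranges.append((old_start if deleted else 0,
                               old_end if deleted else 0,
                               new_start if added else 0,
                               new_end if added else 0))
                if deleted:
                    old_start = old_end
                if added:
                    new_start = new_end
                added = deleted = False
            old_start += 1
            new_start += 1
    if in_change:  # the diff ended without a closing context line
        ranges.append((old_start if deleted else 0, old_end if deleted else 0,
                       new_start if added else 0, new_end if added else 0))
    return ranges

# One removed line and two added lines between two context lines:
print(hunk_ranges([' ', '-', '+', '+', ' '], 10, 10))  # [(11, 11, 11, 12)]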
Example #17
                execute_statement(statement(insert, db.place_holder),
                                  (file_id, commit_id, hunk.old_start_line,
                                   hunk.old_end_line, hunk.new_start_line,
                                   hunk.new_end_line),
                                  write_cursor,
                                  db,
                                  "Couldn't insert hunk, dup record?",
                                  exception=ExtensionRunError)

            connection.commit()
            progress.finished_one()

        read_cursor.close()
        connection.commit()
        connection.close()
        progress.done()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running hunks extension", delete=True)

    def backout(self, repo, uri, db):
        update_statement = """delete from hunks
                              where commit_id in (select s.id from scmlog s
                                          where s.repository_id = ?)"""

        self._do_backout(repo, uri, db, update_statement)


register_extension("Hunks", Hunks)
Example #18
                        p.patch is not NULL""", db.place_holder), (repo_id, ))
        nr_records = cursor.fetchone()[0]
        progress = Progress("[Extension PatchesLOC]", nr_records)

        patches = self.get_patches(repo, path or repo.get_uri(), repo_id, db,
                                   cursor)

        for commit_id, file_id, patch_content, rev in patches:
            (added, removed) = self.count_lines(patch_content)
            insert = """insert into patch_lines(file_id, commit_id,
                        added, removed)
                        values(?,?,?,?)"""
            execute_statement(statement(insert, db.place_holder),
                              (file_id, commit_id, added, removed),
                              cursor,
                              db,
                              "Couldn't insert patch, dup record?",
                              exception=ExtensionRunError)
            connection.commit()
            progress.finished_one()

        cursor.close()
        connection.commit()
        connection.close()
        progress.done()

        profiler_stop("Running PatchLOC extension", delete=True)


register_extension("PatchLOC", PatchLOC)
Example #19
                        where id = ?"""

            if self.fixes_bug(commit_message):
                is_bug_fix = 1
            else:
                is_bug_fix = 0

            execute_statement(statement(update, db.place_holder),
                              (is_bug_fix, row_id),
                              write_cursor,
                              db,
                              "Couldn't update scmlog",
                              exception=ExtensionRunError)

        read_cursor.close()
        connection.commit()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running BugFixMessage extension", delete=True)

    def backout(self, repo, uri, db):
        backout_statement = """update scmlog
                       set is_bug_fix = NULL
                       where repository_id = ?"""

        self._do_backout(repo, uri, db, backout_statement)


register_extension("BugFixMessage", BugFixMessage)
Example #20
    def update_for_revision(self, cursor, commit_id, repo_id):
        db = self.__dict__['db']

        if commit_id == self.__dict__['rev']:
            return
        prev_commit_id = self.__dict__['rev']
        self.__dict__['rev'] = commit_id

        profiler_start("Updating adjacency matrix for commit %d", (commit_id,))
        if self.__dict__['adj'] is None:
            adj = Adj()
            self.__dict__['adj'] = adj
        else:
            adj = self.__dict__['adj']

        rf = self.__dict__['files']
        if rf is not None:
            repo_files_id, repo_files = rf
            if repo_files_id != repo_id:
                del self.__dict__['files']
                repo_files = {}
        else:
            repo_files = {}

        if not repo_files:
            # Get and cache all the files table
            query = "select id, file_name from files where repository_id = ?"
            # profiler_start("Getting files for repository %d", (repo_id,))
            cursor.execute(statement(query, db.place_holder), (repo_id,))
            # profiler_stop("Getting files for repository %d", (repo_id,), 
            # True)
            rs = cursor.fetchmany()
            while rs:
                for id, file_name in rs:
                    repo_files[id] = file_name
                rs = cursor.fetchmany()
            self.__dict__['files'] = (repo_id, repo_files)
            adj.files = repo_files

        # Get the files that have been renamed
        # with the new name for the given rev
        query = "select af.file_id, af.new_file_name " + \
                "from actions_file_names af, files f " + \
                "where af.file_id = f.id " + \
                "and af.commit_id = ? " + \
                "and af.type = 'V' " + \
                "and f.repository_id = ?"
        # profiler_start("Getting new file names for commit %d", (commit_id,))
        cursor.execute(statement(query, db.place_holder), (commit_id, repo_id))
        # profiler_stop("Getting new file names for commit %d", (commit_id,), 
        # True)
        rs = cursor.fetchmany()
        while rs:
            for id, file_name in rs:
                adj.files[id] = file_name
            rs = cursor.fetchmany()

        # Get the new file links since the last time
        query = "select fl.parent_id, fl.file_id " + \
                "from file_links fl, files f " + \
                "where fl.file_id = f.id "
        if prev_commit_id is None:
            query += "and fl.commit_id = ? "
            args = (commit_id, repo_id)
        else:
            query += "and fl.commit_id between ? and ? "
            args = (prev_commit_id, commit_id, repo_id)
        query += "and f.repository_id = ?"
#        profiler_start("Getting file links for commit %d", (commit_id,))
        cursor.execute(statement(query, db.place_holder), args)
#        profiler_stop("Getting file links for commit %d", (commit_id,), True)
        rs = cursor.fetchmany()
        while rs:
            for f1, f2 in rs:
                adj.adj[f2] = f1
            rs = cursor.fetchmany()

        profiler_stop("Updating adjacency matrix for commit %d",
                       (commit_id,), True)
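In short, update_for_revision keeps two maps current per revision: id -> name (patched by 'V' rename actions) and id -> parent (from file_links). A toy illustration of why a rename changes the path that get_path later rebuilds:

files = {1: "src", 2: "util.py"}   # id -> current name
parents = {2: 1, 1: -1}            # id -> parent directory, -1 = root

def path_of(file_id):
    tokens = []
    node = file_id
    while node != -1:
        tokens.insert(0, files[node])
        node = parents[node]
    return "/" + "/".join(tokens)

print(path_of(2))        # /src/util.py
files[2] = "helpers.py"  # what a 'V' rename action does for this commit
print(path_of(2))        # /src/helpers.py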
Example #21
 def run(self, repo, repo_uri):
     profiler_start("Processing patch for revision %s", (self.rev))
     self.repo = repo
     self.repo_uri = repo_uri
     self.get_patch_for_commit()
     profiler_stop("Processing patch for revision %s", (self.rev))
Example #22
                execute_statement(statement(insert, db.place_holder),
                                  (file_id, commit_id,
                                   hunk.old_start_line,
                                   hunk.old_end_line,
                                   hunk.new_start_line,
                                   hunk.new_end_line),
                                   write_cursor,
                                   db,
                                   "Couldn't insert hunk, dup record?",
                                   exception=ExtensionRunError)

            connection.commit()
            progress.finished_one()

        read_cursor.close()
        connection.commit()
        connection.close()
        progress.done()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running hunks extension", delete=True)

    def backout(self, repo, uri, db):
        update_statement = """delete from hunks
                              where commit_id in (select s.id from scmlog s
                                          where s.repository_id = ?)"""

        self._do_backout(repo, uri, db, update_statement)

register_extension("Hunks", Hunks)
Example #23
                         (i,))
                
                processed_jobs = self.__process_finished_jobs(job_pool, 
                                                              write_cursor, db)
                connection.commit()
                i = i - processed_jobs
                if processed_jobs < (queuesize / 5):
                    job_pool.join()

        job_pool.join()
        self.__process_finished_jobs(job_pool, write_cursor, db)
                
        profiler_start("Inserting results in db")
        #self.__insert_many(write_cursor)
        connection.commit()
        profiler_stop("Inserting results in db")

        read_cursor.close()
        write_cursor.close()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running content extension", delete=True)
        
    def backout(self, repo, uri, db):
        update_statement = """delete from content where
                              commit_id in (select id from scmlog s
                                            where s.repository_id = ?)"""

        self._do_backout(repo, uri, db, update_statement)
Example #24

            execute_statement(statement(__insert__,
                                            self.db.place_holder),
                                  (patch_id, commit_id_new, file_id,
                                   old_class, new_class,
                                   old_function, new_function, 0),
                                  write_cursor,
                                  db,
                                  "\nCouldn't insert, duplicate patch?",
                                  exception=ExtensionRunError)
            #clear
            old_cla.clear()
            new_cla.clear()
            old_func.clear()
            new_func.clear()
            
        cnn.commit()
        write_cursor.close()
        cursor.close()
        cnn.close()
        profiler_stop("Running Patches extension", delete=True)
        
        end = time.time()
        print function_name_change_count, 'files changed name!'
        print 'num of source file:', num_of_source
        print 'num of exception:', num_of_exception
        print 'num of non_source_file:', non_source_file    
        print 'num of files can not be recovered:', num_of_unrecovered
        print 'num_of_id1:', num_of_id1
        print 'consuming time: %ss' % str(end - start)

register_extension("Analyse_patch", Analyse_patch)
    
Example #25
class HunkBlameJob(Job):
    class BlameContentHandler(BlameJob.BlameContentHandler):
        def __init__(self, hunks):
            self.hunks = hunks
            self.bug_revs = {}

        def line(self, blame_line):
            if not self.profiled:
                profiler_start("Processing blame output for %s",
                               (self.filename))
                self.profiled = True
            for hunk_id, start_line, end_line in self.hunks:
                if start_line <= blame_line.line <= end_line:
                    if self.bug_revs.get(hunk_id) is None:
                        self.bug_revs[hunk_id] = set()
                    self.bug_revs[hunk_id].add(blame_line.rev)
                    break

        def start_file(self, filename):
            self.filename = filename
            self.profiled = False

        def end_file(self):
            profiler_stop("Processing blame output for %s", (self.filename))
            if len(self.bug_revs) == 0:
                printdbg("No bug revision found in this file")

    def __init__(self, hunks, path, rev):
        Job.__init__(self)
        self.hunks = hunks
        self.path = path
        self.rev = rev
        self.bug_revs = {}

    def run(self, repo, repo_uri):
        profiler_start("Running HunkBlameJob for %s@%s", (self.path, self.rev))

        def blame_line(line, p):
            p.feed(line)

        start = sys.maxint
        end = 0
        for hunk in self.hunks:
            if hunk[1] < start:
                start = hunk[1]
            if hunk[2] > end:
                end = hunk[2]

        repo_type = repo.get_type()
        if repo_type == 'cvs':
            # CVS paths contain the module stuff
            uri = repo.get_uri_for_path(repo_uri)
            module = uri[len(repo.get_uri()):].strip('/')

            if module != '.':
                path = self.path[len(module):].strip('/')
            else:
                path = self.path.strip('/')
        else:
            path = self.path.strip('/')

        p = create_parser(repo.get_type(), self.path)
        out = self.get_content_handler()
        p.set_output_device(out)
        wid = repo.add_watch(BLAME, blame_line, p)
        try:
            repo.blame(os.path.join(repo_uri, path),
                       self.rev,
                       start=start,
                       end=end)
            self.collect_results(out)
        except RepositoryCommandError, e:
            self.failed = True
            printerr("Command %s returned %d (%s)",
                     (e.cmd, e.returncode, e.error))
        p.end()
        repo.remove_watch(BLAME, wid)
        profiler_stop("Running HunkBlameJob for %s@%s", (self.path, self.rev),
                      delete=True)
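The line() callback above attributes each blamed line to the first hunk whose [start_line, end_line] interval contains it, collecting the blamed revisions per hunk. The same test, stripped of the handler plumbing:

hunks = [(1, 5, 8), (2, 12, 12)]   # (hunk_id, start_line, end_line)
bug_revs = {}

for line_no, rev in [(6, "r10"), (7, "r11"), (12, "r9"), (20, "r9")]:
    for hunk_id, start_line, end_line in hunks:
        if start_line <= line_no <= end_line:
            bug_revs.setdefault(hunk_id, set()).add(rev)
            break  # first matching hunk wins

print(bug_revs)  # hunk 1 -> {'r10', 'r11'}, hunk 2 -> {'r9'}; line 20 matches nothing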
Example #26
            if composed:
                rev = revision.split ("|")[0]
            else:
                rev = revision

            relative_path = fr.get_path ()
            printdbg ("Path for %d at %s -> %s", (file_id, rev, relative_path))

            if repo.get_type () == 'svn' and relative_path == 'tags':
                printdbg ("Skipping file %s", (relative_path,))
                continue

            job = BlameJob (file_id, commit_id, relative_path, rev)
            job_pool.push (job)
            n_blames += 1

            if n_blames >= self.MAX_BLAMES:
                job_pool.join()
                self.process_finished_jobs (job_pool, write_cursor)
                n_blames = 0
        job_pool.join ()
        self.process_finished_jobs (job_pool, write_cursor, True)

        read_cursor.close ()
        write_cursor.close ()
        cnn.close()

        profiler_stop ("Running Blame extension", delete = True)

register_extension ("Blame", Blame)
Example #27
                    (repo.get_type()))
        except Exception, e:
            raise ExtensionRunError( \
                    "Error creating repository %s. Exception: %s" % \
                    (repo.get_uri(), str(e)))

        profiler_start("Hunks: fetch all patches")
        icursor = ICursor(read_cursor, self.INTERVAL_SIZE)
        # Get the patches from this repository
        query = """select p.commit_id, p.patch, s.rev 
                    from patches p, scmlog s 
                    where p.commit_id = s.id and
                    s.repository_id = ? and 
                    p.patch is not NULL"""
        icursor.execute(statement(query, db.place_holder), (repo_id, ))
        profiler_stop("Hunks: fetch all patches", delete=True)

        self.__prepare_table(connection)
        fp = FilePaths(db)
        rs = icursor.fetchmany()

        while rs:
            for commit_id, patch_content, rev in rs:
                for hunk in self.get_commit_data(patch_content):
                    # Get the file ID from the database for linking
                    hunk_file_name = re.sub(r'^[ab]\/', '',
                                            hunk.file_name.strip())
                    file_id = fp.get_file_id(hunk_file_name, commit_id)

                    if file_id is None:
                        printdbg("file not found")
Example #28
                        set is_bug_fix = ?
                        where id = ?"""

            if self.fixes_bug(commit_message):
                is_bug_fix = 1
            else:
                is_bug_fix = 0

            execute_statement(statement(update, db.place_holder),
                              (is_bug_fix, row_id),
                              write_cursor,
                              db,
                              "Couldn't update scmlog",
                              exception=ExtensionRunError)

        read_cursor.close()
        connection.commit()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running BugFixMessage extension", delete=True)

    def backout(self, repo, uri, db):
        backout_statement = """update scmlog
                       set is_bug_fix = NULL
                       where repository_id = ?"""

        self._do_backout(repo, uri, db, backout_statement)
          
register_extension("BugFixMessage", BugFixMessage)
Example #29
                finally:
                    inner_cursor.close()
                    
                hunks = [h for h in hunks if h[0] not in blames]
                job = HunkBlameJob(hunks, relative_path, pre_rev)
                
                job_pool.push (job)
                n_blames += 1
        
                if n_blames >= self.MAX_BLAMES:
                    processed_jobs = self.process_finished_jobs (job_pool, write_cursor)
                    n_blames -= processed_jobs
                    if processed_jobs<=self.MAX_BLAMES/5:
                        profiler_start("Joining unprocessed jobs")
                        job_pool.join()
                        profiler_stop("Joining unprocessed jobs", delete=True)
            except NotValidHunkWarning as e:
                printerr("Not a valid hunk: "+str(e))
            finally:
                file_rev = read_cursor.fetchone()

        job_pool.join ()
        self.process_finished_jobs (job_pool, write_cursor, True)

        try:
            self.__drop_cache(cnn)
        except Exception as e:
            printdbg("Couldn't drop cache because of " + str(e))

        read_cursor.close ()
        write_cursor.close ()
Example #30
            job = ContentJob(commit_id, file_id, rev, relative_path)
            job_pool.push(job)
            i = i + 1
            if i >= queuesize:
                printdbg("Content queue is now at %d, flushing to database", 
                         (i,))
                
                processed_jobs = self.__process_finished_jobs(job_pool, 
                                                              connection, db)
                i = i - processed_jobs
                if processed_jobs < (queuesize / 5):
                    job_pool.join()

        job_pool.join()
        self.__process_finished_jobs(job_pool, connection, db)

        read_cursor.close()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running content extension", delete=True)
        
    def backout(self, repo, uri, db):
        update_statement = """delete from content where
                              commit_id in (select id from scmlog s
                                            where s.repository_id = ?)"""

        self._do_backout(repo, uri, db, update_statement)

register_extension("Content", Content)
Example #31
                printdbg("FileCount queue is now at %d, flushing to database",
                         (i, ))

                processed_jobs = self.__process_finished_jobs(
                    job_pool, write_cursor, db)

                connection.commit()
                i = i - processed_jobs

                if processed_jobs < (queuesize / 5):
                    job_pool.join()

        job_pool.join()
        self.__process_finished_jobs(job_pool, write_cursor, db)
        read_cursor.close()
        connection.commit()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running FileCount extension", delete=True)

    def backout(self, repo, uri, db):
        update_statement = """update scmlog
                       set file_count = NULL
                       where repository_id = ?"""

        self._do_backout(repo, uri, db, update_statement)


register_extension("FileCount", FileCount)
Example #32
 def end_file (self):
     profiler_stop("Processing blame output for %s",(self.filename))
     if len(self.bug_revs)==0:
         printdbg("No bug revision found in this file")
Example #33
            job = ContentJob(commit_id, file_id, rev, relative_path)
            job_pool.push(job)
            i = i + 1
            if i >= queuesize:
                printdbg("Content queue is now at %d, flushing to database", (i,))

                processed_jobs = self.__process_finished_jobs(job_pool, connection, db)
                i = i - processed_jobs
                if processed_jobs < (queuesize / 5):
                    job_pool.join()

        job_pool.join()
        self.__process_finished_jobs(job_pool, connection, db)

        read_cursor.close()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running content extension", delete=True)

    def backout(self, repo, uri, db):
        update_statement = """delete from content where
                              commit_id in (select id from scmlog s
                                            where s.repository_id = ?)"""

        self._do_backout(repo, uri, db, update_statement)


register_extension("Content", Content)
Example #34
            if i >= queuesize:
                printdbg("FileCount queue is now at %d, flushing to database", 
                         (i,))

                processed_jobs = self.__process_finished_jobs(job_pool, 
                                                              write_cursor, db)

                connection.commit()
                i = i - processed_jobs
                
                if processed_jobs < (queuesize / 5):
                    job_pool.join()
        
        job_pool.join()
        self.__process_finished_jobs(job_pool, write_cursor, db)
        read_cursor.close()
        connection.commit()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running FileCount extension", delete=True)
                
    def backout(self, repo, uri, db):
        update_statement = """update scmlog
                       set file_count = NULL
                       where repository_id = ?"""

        self._do_backout(repo, uri, db, update_statement)

register_extension("FileCount", FileCount)
Example #35
                    inner_cursor.close()

                hunks = [h for h in hunks if h[0] not in blames]
                job = HunkBlameJob(hunks, relative_path, pre_rev)

                job_pool.push(job)
                n_blames += 1

                if n_blames >= self.MAX_BLAMES:
                    processed_jobs = self.process_finished_jobs(
                        job_pool, write_cursor)
                    n_blames -= processed_jobs
                    if processed_jobs <= self.MAX_BLAMES / 5:
                        profiler_start("Joining unprocessed jobs")
                        job_pool.join()
                        profiler_stop("Joining unprocessed jobs", delete=True)
            except NotValidHunkWarning as e:
                printerr("Not a valid hunk: " + str(e))
            finally:
                file_rev = read_cursor.fetchone()

        job_pool.join()
        self.process_finished_jobs(job_pool, write_cursor, True)

        try:
            self.__drop_cache(cnn)
        except Exception as e:
            printdbg("Couldn't drop cache because of " + str(e))

        read_cursor.close()
        write_cursor.close()
Example #36
                         (i, ))

                processed_jobs = self.__process_finished_jobs(
                    job_pool, write_cursor, db)
                connection.commit()
                i = i - processed_jobs
                if processed_jobs < (queuesize / 5):
                    job_pool.join()

        job_pool.join()
        self.__process_finished_jobs(job_pool, write_cursor, db)

        profiler_start("Inserting results in db")
        #self.__insert_many(write_cursor)
        connection.commit()
        profiler_stop("Inserting results in db")

        read_cursor.close()
        write_cursor.close()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running content extension", delete=True)

    def backout(self, repo, uri, db):
        update_statement = """delete from content where
                              commit_id in (select id from scmlog s
                                            where s.repository_id = ?)"""

        self._do_backout(repo, uri, db, update_statement)
Example #37
 def end_file(self):
     profiler_stop("Processing blame output for %s", (self.filename))
     if len(self.bug_revs) == 0:
         printdbg("No bug revision found in this file")
Example #38
                    (repo.get_type()))
        except Exception, e:
            raise ExtensionRunError( \
                    "Error creating repository %s. Exception: %s" % \
                    (repo.get_uri(), str(e)))
        
        profiler_start("Hunks: fetch all patches")
        icursor = ICursor(read_cursor, self.INTERVAL_SIZE)
        # Get the patches from this repository
        query = """select p.commit_id, p.patch, s.rev 
                    from patches p, scmlog s 
                    where p.commit_id = s.id and
                    s.repository_id = ? and 
                    p.patch is not NULL"""
        icursor.execute(statement(query, db.place_holder), (repo_id,))
        profiler_stop("Hunks: fetch all patches", delete=True)

        self.__prepare_table(connection)
        fp = FilePaths(db)
        rs = icursor.fetchmany()

        while rs:
            for commit_id, patch_content, rev in rs:  
                for hunk in self.get_commit_data(patch_content):
                    # Get the file ID from the database for linking
                    hunk_file_name = re.sub(r'^[ab]\/', '', 
                                            hunk.file_name.strip())
                    file_id = fp.get_file_id(hunk_file_name, commit_id)
                    
                    if file_id is None:
                        printdbg("file not found")