Example #1
    def __create_cache(self, cnn):
        cursor = cnn.cursor()

        try:
            self.__drop_cache(cnn)
        except Exception as e:
            printdbg("Couldn't drop cache because of " + str(e))
Example #2
def get_path_from_database(self, file_id, commit_id):
    """Returns the last valid path for a given file_id at commit_id
       (may have been removed afterwards!)"""

    if config.debug:
        profiler_start("Getting full file path for file_id %d and "
                       "commit_id %d", (file_id, commit_id))

    db = self.__dict__['db']
    cnn = db.connect()

    cursor = cnn.cursor()
    query = """SELECT current_file_path from actions
               WHERE file_id=? AND commit_id <= ?
               ORDER BY commit_id DESC LIMIT 1"""
    cursor.execute(statement(query, db.place_holder), (file_id, commit_id))

    # fetchone() returns None when no row matched; avoid a bare except
    row = cursor.fetchone()
    file_path = row[0] if row is not None else None

    cursor.close()
    cnn.close()

    printdbg("get_path_from_database: Path for file_id %d "
             "at commit_id %d: %s", (file_id, commit_id, file_path))
    if config.debug:
        profiler_stop("Getting full file path for file_id %d and "
                      "commit_id %d", (file_id, commit_id), True)
    return file_path
Example #3
def get_path_from_database(self, file_id, commit_id):
    """Returns the last valid path for a given file_id at commit_id
       (may have been removed afterwards!)"""

    if config.debug:
        profiler_start("Getting full file path for file_id %d and "
                       "commit_id %d", (file_id, commit_id))

    db = self.__dict__['db']
    cnn = db.connect()

    cursor = cnn.cursor()
    query = """SELECT file_path from file_paths
               WHERE file_id=? AND commit_id <= ?
               ORDER BY commit_id DESC LIMIT 1"""
    cursor.execute(statement(query, db.place_holder), (file_id, commit_id))

    # fetchone() returns None when no row matched; avoid a bare except
    row = cursor.fetchone()
    file_path = row[0] if row is not None else None

    cursor.close()
    cnn.close()

    printdbg("get_path_from_database: Path for file_id %d "
             "at commit_id %d: %s", (file_id, commit_id, file_path))
    if config.debug:
        profiler_stop("Getting full file path for file_id %d and "
                      "commit_id %d", (file_id, commit_id), True)
    return file_path
Example #4
def line_is_code(line_types_array, line_nr):
    """Decides whether a given line number is executable code"""

    try:
        line_type = line_types_array[line_nr - 1]
    except IndexError:
        printdbg("Line not in lexer output. Must be an empty line!")
        line_type = None

    return line_type == "code"
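
A quick usage sketch, assuming line_is_code (and its printdbg helper) from the snippet above is in scope; the line_types list is hypothetical sample data in the shape get_line_types returns:

line_types = ["code", "comment", "empty", "code"]

print(line_is_code(line_types, 1))   # True: first line is code
print(line_is_code(line_types, 2))   # False: a comment is not executable
print(line_is_code(line_types, 99))  # False: out of range -> None -> False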
Example #5
def __match_string(self, regexes, flags, string):
    """Checks whether a string matches any of a series of regexes"""
    for r in regexes:
        # The bit at the beginning and end matches whitespace, punctuation
        # or the start or end of a line.
        delimiters = "[\s\.,;\!\?\'\"\/\\\]"
        if re.search("(" + delimiters + "+|^)" + r +
                     "(" + delimiters + "+|$)", string, flags):
            printdbg("[STRING] matched on " + str(r) + " " + string)
            return True

    return False
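
To illustrate the delimiter trick in isolation, here is a self-contained sketch (the pattern and test strings are hypothetical): only whole, whitespace- or punctuation-bounded occurrences of the regex match.

import re

delimiters = "[\s\.,;\!\?\'\"\/\\\]"
pattern = "(" + delimiters + "+|^)" + "bug" + "(" + delimiters + "+|$)"

print(re.search(pattern, "fixes bug #123", re.IGNORECASE) is not None)     # True
print(re.search(pattern, "debugging session", re.IGNORECASE) is not None)  # False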
Example #6
def statement(query, ph_mark):
    '''Rewrites "?" placeholders to the database's own placeholder mark,
       leaving quoted string literals untouched'''
    if "?" == ph_mark or "?" not in query:
        printdbg(query)
        return query

    # Split on single quotes: even-indexed tokens lie outside string
    # literals, so only those get their placeholders rewritten
    tokens = query.split("'")
    for i in range(0, len(tokens), 2):
        tokens[i] = tokens[i].replace("?", ph_mark)

    retval = "'".join(tokens)
    printdbg(retval)

    return retval
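
A usage sketch, assuming statement() and printdbg from the snippet above: "?" placeholders are rewritten to the backend's mark, but a "?" inside a quoted literal is left alone.

q = "select id from scmlog where rev = ? and branch = 'v?.0'"

print(statement(q, "%s"))
# -> select id from scmlog where rev = %s and branch = 'v?.0'
print(statement(q, "?"))   # backend already uses "?": returned unchanged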
Example #7
    def populate_insert_args(self, job):
        authors = job.get_authors()
        file_id = job.get_file_id()
        commit_id = job.get_commit_id()

        try:
            args = [(self.id_counter + i, file_id, commit_id,
                     self.authors[key], authors[key])
                    for i, key in enumerate(authors.keys())]
        except Exception:
            printdbg("Error occurred while processing file %d @ commit %d",
                     (file_id, commit_id))
            raise
        return args
Example #8
    def __execute(self):
        """
        Executes the query for the current interval, paging through
        the results with LIMIT/OFFSET.
        """
        q = "%s LIMIT %d OFFSET %d" % (self.query, self.interval_size, self.i)
        self.i += self.interval_size

        printdbg(q)
        if self.args:
            self.cursor.execute(q, self.args)
        else:
            self.cursor.execute(q)

        self.need_exec = False
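
The same LIMIT/OFFSET paging idea as a self-contained sketch against an in-memory sqlite3 table (table name and values are hypothetical):

import sqlite3

cnn = sqlite3.connect(":memory:")
cnn.executescript("create table scmlog (id integer);"
                  "insert into scmlog values (1), (2), (3), (4), (5);")

query, interval_size, i = "select id from scmlog", 2, 0
while True:
    rows = cnn.execute("%s LIMIT %d OFFSET %d"
                       % (query, interval_size, i)).fetchall()
    if not rows:
        break
    print(rows)  # pages of at most interval_size rows
    i += interval_size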
Example #9
    def populate_insert_args(self, job):
        bug_revs = job.get_bug_revs()
        cnn = self.db.connect()
        cursor = cnn.cursor()
        args = []
        for hunk_id in bug_revs:
            for rev in bug_revs[hunk_id]:
                printdbg("Find id for rev %s" % rev)
                query = "select id from scmlog where rev = ?"
                cursor.execute(statement(query, self.db.place_holder), (rev, ))

                fetched_row = cursor.fetchone()

                if fetched_row is not None:
                    args.append((hunk_id, fetched_row[0]))

        cursor.close()
        cnn.close()
        return args
Example #10
def get_all_extensions():
    # Do something to get a list of extensions, probably like a file
    # glob, then do a get_extension on each one. Return the entire
    # _extensions list

    # Get a list of the paths that are sitting in the directory with this
    # script, i.e. all possible extensions
    possible_file_paths = glob(os.path.realpath(os.path.dirname(__file__))
                               + "/*.py")

    # This splitting extracts the file name from the path.
    # Special Python files, like __init__.py, are filtered out.
    for extension in [os.path.splitext(os.path.split(fp)[1])[0] for
                      fp in possible_file_paths
                      if (not os.path.basename(fp).startswith('__')
                          and not fp.endswith('__.py'))]:
        try:
            printdbg("Getting extension " + extension)
            get_extension(extension)
        except ExtensionUnknownError:
            pass

    return _extensions
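
The name-extraction and filtering step on its own, with hypothetical paths:

import os

possible_file_paths = ["/ext/blame.py", "/ext/hunks.py", "/ext/__init__.py"]
extensions = [os.path.splitext(os.path.split(fp)[1])[0]
              for fp in possible_file_paths
              if not os.path.basename(fp).startswith('__')]
print(extensions)  # ['blame', 'hunks'] -- __init__.py is filtered out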
Example #11
def get_line_types(repo, repo_uri, rev, path):
    """Returns an array, where each item means a line of code.
       Each item is labled 'code', 'comment' or 'empty'"""

    #profiler_start("Processing LineTypes for revision %s:%s", (self.rev, self.file_path))
    uri = os.path.join(repo_uri,
                       path)  # concat repo_uri and file_path for full path
    file_content = _get_file_content(repo, uri, rev)  # get file_content

    if file_content is None or file_content == '':
        printerr("[get_line_types] Error: No file content for " + str(rev) +
                 ":" + str(path) + " found! Skipping.")
        line_types = None
    else:
        try:
            lexer = get_lexer_for_filename(path)
        except ClassNotFound:
            try:
                printdbg("[get_line_types] Guessing lexer for" + str(rev) +
                         ":" + str(path) + ".")
                lexer = guess_lexer(file_content)
            except ClassNotFound:
                printdbg("[get_line_types] No guess or lexer found for " +
                         str(rev) + ":" + str(path) +
                         ". Using TextLexer instead.")
                lexer = TextLexer()

        if isinstance(lexer, NemerleLexer):
            # this lexer is broken and yields an unstoppable process
            # see https://bitbucket.org/birkenfeld/pygments-main/issue/706/nemerle-lexer-ends-in-an-infinite-loop
            lexer = TextLexer()

        # Not sure whether this should be skipped when the language uses
        # off-side rules (e.g. Python; see
        # http://en.wikipedia.org/wiki/Off-side_rule for a list)
        stripped_code = _strip_lines(file_content)
        lexer_output = _iterate_lexer_output(lexer.get_tokens(stripped_code))
        line_types_str = _comment_empty_or_code(lexer_output)
        line_types = line_types_str.split("\n")

    return line_types
Example #12
    def run(self, repo, uri, db):
        self.db = db
        self.repo = repo

        path = uri_to_filename(uri)
        if path is not None:
            repo_uri = repo.get_uri_for_path(path)
        else:
            repo_uri = uri

        path = uri_to_filename(uri)
        self.repo_uri = path or repo.get_uri()

        cnn = self.db.connect()

        cursor = cnn.cursor()
        cursor.execute(
            statement("SELECT id from repositories where uri = ?",
                      db.place_holder), (repo_uri, ))
        repo_id = cursor.fetchone()[0]

        # If table does not exist, the list of commits is empty,
        # otherwise it will be filled within the except block below
        commits = []

        try:
            printdbg("Creating patches table")
            self.__create_table(cnn)
        except TableAlreadyExists:
            printdbg("Patches table exists already, getting max ID")
            cursor.execute(
                statement("SELECT max(id) from patches", db.place_holder))
            id = cursor.fetchone()[0]
            if id is not None:
                DBPatch.id_counter = id + 1

            commits = self.__get_patches_for_repository(repo_id, cursor)
        except Exception as e:
            raise ExtensionRunError(str(e))
Example #13
    def run(self, repo, uri, db):
        profiler_start("Running Patches extension")
        self.db = db
        self.repo = repo

        path = uri_to_filename(uri)
        if path is not None:
            repo_uri = repo.get_uri_for_path(path)
        else:
            repo_uri = uri

        path = uri_to_filename(uri)
        self.repo_uri = path or repo.get_uri()

        cnn = self.db.connect()

        cursor = cnn.cursor()
        cursor.execute(statement("SELECT id from repositories where uri = ?",
                                 db.place_holder), (repo_uri,))
        repo_id = cursor.fetchone()[0]

        # If table does not exist, the list of commits is empty,
        # otherwise it will be filled within the except block below
        commits = []

        try:
            printdbg("Creating patches table")
            self.__create_table(cnn)
        except TableAlreadyExists:
            printdbg("Patches table exists already, getting max ID")
            cursor.execute(statement("SELECT max(id) from patches",
                                     db.place_holder))
            id = cursor.fetchone()[0]
            if id is not None:
                DBPatch.id_counter = id + 1

            commits = self.__get_patches_for_repository(repo_id, cursor)
        except Exception as e:
            raise ExtensionRunError(str(e))
Example #14
def close(self):
    """Closes FilePaths to ensure all caches are deleted"""

    if Config().low_memory:
        # FIXME: This should be closed, but sometimes shelve
        # just won't do it. The best way is to timeout the try,
        # but not closing and just deleting will do the same
        # thing, just in a more yucky way
        printdbg("Syncing shelf")
        self.__dict__['cached_adj'].sync()
        printdbg("Closing shelf")
        self.__dict__['cached_adj'].close()
        printdbg("Deleting shelve " + self.shelve_file_name)
        os.remove(self.shelve_file_name)
        # Clean up cached adj in case this gets called without
        # update_all later
        self.__dict__['cached_adj'] = {}
Example #15
        # Get code files
        query = "select f.id from file_types ft, files f " + \
                "where f.id = ft.file_id and " + \
                "ft.type in ('code', 'unknown') and " + \
                "f.repository_id = ?"
        read_cursor.execute(statement(query, db.place_holder), (repoid,))
        code_files = [item[0] for item in read_cursor.fetchall()]

        n_blames = 0
        fr = FileRevs(db, cnn, read_cursor, repoid)
        for revision, commit_id, file_id, action_type, composed in fr:
            if file_id not in code_files:
                continue

            if (file_id, commit_id) in blames:
                printdbg("%d@%d is already in the database, skip it",
                         (file_id, commit_id))
                continue

            if composed:
                rev = revision.split("|")[0]
            else:
                rev = revision

            relative_path = fr.get_path()
            printdbg("Path for %d at %s -> %s", (file_id, rev, relative_path))

            if repo.get_type() == 'svn' and relative_path == 'tags':
                printdbg("Skipping file %s", (relative_path,))
                continue

            job = BlameJob(file_id, commit_id, relative_path, rev)
Example #16
        self.__prepare_table(connection)

        i = 0

        for row in read_cursor:
            row_id = row[0]
            rev = row[1]

            job = FileCountJob(row_id, rev)
            job_pool.push(job)

            i = i + 1

            if i >= queuesize:
                printdbg("FileCount queue is now at %d, flushing to database",
                         (i, ))

                processed_jobs = self.__process_finished_jobs(
                    job_pool, write_cursor, db)

                connection.commit()
                i = i - processed_jobs

                if processed_jobs < (queuesize / 5):
                    job_pool.join()

        job_pool.join()
        self.__process_finished_jobs(job_pool, write_cursor, db)
        read_cursor.close()
        connection.commit()
        connection.close()
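
The flush pattern above (fill a bounded queue, drain whatever has finished, and fall back to a blocking join only when throughput drops) can be sketched without the real JobPool; everything below is a hypothetical stand-in:

queuesize = 4
pending = []
i = 0

def process_finished_jobs(jobs):
    # Stand-in: pretend every queued job has finished by now
    done = len(jobs)
    del jobs[:]
    return done

for rev in range(10):
    pending.append(("FileCountJob", rev))
    i += 1
    if i >= queuesize:
        print("queue at %d, flushing" % i)
        i -= process_finished_jobs(pending)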
Example #17
        except Exception as e:
            raise ExtensionRunError(
                "Error creating repository %s. Exception: %s" %
                (repo.get_uri(), str(e)))

        # Try to create a table for storing the content
        # TODO: Removed use case for choosing between all or just the HEAD,
        # should ideally put that back again. Just all for now is fine.
        try:
            self.__prepare_table(connection)
        except Exception as e:
            raise ExtensionRunError("Couldn't prepare table because " + \
                                    str(e))

        queuesize = Config().max_threads
        printdbg("Setting queuesize to " + str(queuesize))

        # This is where the threading stuff comes in, I expect
        job_pool = JobPool(repo, path or repo.get_uri(), queuesize=queuesize)

        # This filters files if they're not source files.
        # I'm pretty sure "unknown" is returning binary files too, but
        # these are implicitly left out when trying to convert to utf-8
        # after download. However, ignore them for now to speed things up
        query = "select f.id from file_types ft, files f " + \
                "where f.id = ft.file_id and " + \
                "ft.type in('code') and " + \
                "f.repository_id = ?"
        # "ft.type in('code', 'unknown') and " + \
        read_cursor.execute(statement(query, db.place_holder), (repo_id, ))
        code_files = [item[0] for item in read_cursor.fetchall()]
Example #18
        read_cursor.execute(statement(outer_query, db.place_holder),
                            (repoid, ))
        file_rev = read_cursor.fetchone()
        n_blames = 0
        fp = FilePaths(db)
        fp.update_all(repoid)
        while file_rev is not None:
            try:
                file_id, commit_id = file_rev
                pre_commit_id, pre_rev = self.__find_previous_commit(
                    file_id, commit_id)
                relative_path = fp.get_path(file_id, pre_commit_id, repoid)
                if relative_path is None:
                    raise NotValidHunkWarning(
                        "Couldn't find path for file ID %d" % file_id)
                printdbg("Path for %d at %s -> %s",
                         (file_id, pre_rev, relative_path))

                try:
                    inner_cursor = cnn.cursor()

                    inner_query = """select h.id, h.old_start_line, h.old_end_line from hunks h
                        where h.file_id = ? and h.commit_id = ?
                            and h.old_start_line is not null 
                            and h.old_end_line is not null
                            and h.file_id is not null
                            and h.commit_id is not null
                    """
                    inner_cursor.execute(
                        statement(inner_query, db.place_holder),
                        (file_id, commit_id))
                    hunks = inner_cursor.fetchall()
Example #19
        while rs:
            for commit_id, revision, composed_rev in rs:
                if commit_id in commits:
                    continue

                if composed_rev:
                    rev = revision.split("|")[0]
                else:
                    rev = revision

                job = PatchJob(rev, commit_id)
                job_pool.push(job)

                i = i + 1
                if i >= queuesize:
                    printdbg("Queue is now at %d, flushing to database", (i,))
                    job_pool.join()
                    self.__process_finished_jobs(job_pool, write_cursor, db)
                    cnn.commit()
                    i = 0

            cnn.commit()
            rs = icursor.fetchmany()

        job_pool.join()
        self.__process_finished_jobs(job_pool, write_cursor, db)
        cnn.commit()
        write_cursor.close()
        cursor.close()
        cnn.close()
Example #20
    def get_commit_data(self, patch_content):
        lines = [l + "\n" for l in patch_content.split("\n") if l]
        hunks = []

        for patch in [p for p in parse_patches(lines, allow_dirty=True, \
                            allow_continue=True) if isinstance(p, Patch)]:
            # This method matches that of parseLine in UnifiedDiffParser.java.
            # It's not necessarily intuitive, but this algorithm is much
            # harder than it looks; I spent hours trying to find a simpler
            # solution. It does, however, seem to work.
            # The trick of this method is that each *part* of a hunk,
            # i.e. added, deleted, changed, is treated as a *new entity*.
            # The EntityDelta table does not store whole diffs; it stores
            # each part of a diff.
            # I will need to copy the behavior of how Sep inserts a
            # CommitData into the database to ensure things match
            for hunk in patch.hunks:
                old_start_line = hunk.orig_pos - 1
                new_start_line = hunk.mod_pos - 1

                old_end_line = 0
                new_end_line = 0

                added = False
                deleted = False
                in_change = False

                for line in hunk.lines:
                    if isinstance(line, RemoveLine):
                        if not in_change or not deleted:
                            in_change = True
                            old_start_line += 1
                            old_end_line = old_start_line
                        else:
                            old_end_line += 1

                        deleted = True

                    elif isinstance(line, InsertLine):
                        if not in_change or not added:
                            in_change = True
                            new_start_line += 1
                            new_end_line = new_start_line
                        else:
                            new_end_line += 1

                        added = True

                    elif isinstance(line, ContextLine):
                        if in_change:
                            in_change = False
                            printdbg("Patch new name: " + patch.newname)
                            file_name = patch.newname.strip()
                            if file_name == "/dev/null":
                                file_name = patch.oldname.strip()
                            cd = CommitData(file_name)

                            if deleted:
                                cd.old_start_line = old_start_line
                                cd.old_end_line = old_end_line
                                old_start_line = old_end_line

                            if added:
                                cd.new_start_line = new_start_line
                                cd.new_end_line = new_end_line
                                new_start_line = new_end_line

                            hunks.append(cd)
                            added = deleted = False

                        old_start_line += 1
                        new_start_line += 1

                # The diff ended without a new context line
                if in_change:
                    file_name = patch.newname.strip()
                    if file_name == "/dev/null":
                        file_name = patch.oldname.strip()
                    cd = CommitData(file_name)

                    if deleted:
                        cd.old_start_line = old_start_line
                        cd.old_end_line = old_end_line

                    if added:
                        cd.new_start_line = new_start_line
                        cd.new_end_line = new_end_line

                    hunks.append(cd)
        return hunks
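
A stripped-down, self-contained sketch of the same state machine, driven by '+'/'-'/' ' markers instead of bzrlib's line objects (the function and its inputs are hypothetical, not the extension's real API):

def hunk_ranges(markers, orig_pos, mod_pos):
    # markers: one of '-', '+', ' ' per hunk line, as in a unified diff
    ranges = []
    old_start_line, new_start_line = orig_pos - 1, mod_pos - 1
    old_end_line = new_end_line = 0
    added = deleted = in_change = False

    for m in markers:
        if m == '-':
            if not in_change or not deleted:
                in_change = True
                old_start_line += 1
                old_end_line = old_start_line
            else:
                old_end_line += 1
            deleted = True
        elif m == '+':
            if not in_change or not added:
                in_change = True
                new_start_line += 1
                new_end_line = new_start_line
            else:
                new_end_line += 1
            added = True
        else:  # context line: close the pending change, if any
            if in_change:
                in_change = False
                ranges.append((old_start_line if deleted else None,
                               old_end_line if deleted else None,
                               new_start_line if added else None,
                               new_end_line if added else None))
                if deleted:
                    old_start_line = old_end_line
                if added:
                    new_start_line = new_end_line
                added = deleted = False
            old_start_line += 1
            new_start_line += 1

    if in_change:  # the diff ended without a trailing context line
        ranges.append((old_start_line if deleted else None,
                       old_end_line if deleted else None,
                       new_start_line if added else None,
                       new_end_line if added else None))
    return ranges

# Two removed lines replaced by one added line, at old/new line 11:
print(hunk_ranges([' ', '-', '-', '+', ' '], 10, 10))
# -> [(11, 12, 11, 11)]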
Example #21
            repo_id = read_cursor.fetchone()[0]
        except NotImplementedError:
            raise ExtensionRunError("Content extension is not supported for %s repos" % (repo.get_type()))
        except Exception as e:
            raise ExtensionRunError("Error creating repository %s. Exception: %s" % (repo.get_uri(), str(e)))

        # Try to create a table for storing the content
        # TODO: Removed use case for choosing between all or just the HEAD,
        # should ideally put that back again. Just all for now is fine.
        try:
            self.__prepare_table(connection)
        except Exception as e:
            raise ExtensionRunError("Couldn't prepare table because " + str(e))

        queuesize = self.MAX_THREADS
        printdbg("Setting queuesize to " + str(queuesize))

        # This is where the threading stuff comes in, I expect
        job_pool = JobPool(repo, path or repo.get_uri(), queuesize=queuesize)

        # This filters files if they're not source files.
        # I'm pretty sure "unknown" is returning binary files too, but
        # these are implicitly left out when trying to convert to utf-8
        # after download. However, ignore them for now to speed things up
        query = (
            "select f.id from file_types ft, files f "
            + "where f.id = ft.file_id and "
            + "ft.type in('code') and "
            + "f.repository_id = ?"
        )
        # "ft.type in('code', 'unknown') and " + \
Example #22
    def run(self, repo, uri, db):
        # record how many patches contain a different file name
        function_name_change_count = 0
        # only valid for my own machine; adjust to your own setup
        prefix = r'/home/moqi/Downloads/voldemort'
        # old file names
        f_of_old = open('/home/moqi/Downloads/voldemort/old', 'w')
        # new file names
        f_of_new = open('/home/moqi/Downloads/voldemort/new', 'w')
        # stores the information returned by search_lines
        search_result = {}
        # number of exceptions (e.g. /null, or a file that has been
        # deleted and thus cannot be opened); not accurate
        num_of_exception = 0
        # number of files which are not source files
        non_source_file = 0
        # number of patches whose commit_id = 1
        num_of_id1 = 0
        # number of files that cannot be recovered
        num_of_unrecovered = 0
        # old_cla contains the class definitions found in the old file
        old_cla = set()
        new_cla = set()
        old_func = set()
        new_func = set()
        # max id in table patches
        id_max = 0
        patch_id = 0
        file_id = 0
        # old_class, new_class, old_function, new_function
        old_class = ''
        new_class = ''
        old_function = ''
        new_function = ''

        __insert__ = """INSERT INTO analyse_patch (patch_id, commit_id, file_id, old_class, new_class,
                    old_function, new_function, if_id1)
                    values (?, ?, ?, ?, ?, ?, ?, ?)"""
        start = time.time()

        profiler_start("Running analyse_patch extension")
        self.db = db
        self.repo = repo

        path = uri_to_filename(uri)
        if path is not None:
            repo_uri = repo.get_uri_for_path(path)
            # added by me
            prefix = path
        else:
            repo_uri = uri

        path = uri_to_filename(uri)
        self.repo_uri = path or repo.get_uri()

        cnn = self.db.connect()

        cursor = cnn.cursor()
        write_cursor = cnn.cursor()

        cursor.execute(statement("SELECT id from repositories where uri = ?",
                                 db.place_holder), (repo_uri,))
        repo_id = cursor.fetchone()[0]

        try:
            printdbg("Creating analyse_patch table")
            self.__create_table(cnn)
        except TableAlreadyExists:
            pass
        except Exception as e:
            raise ExtensionRunError(str(e))
Example #23
        self.__prepare_table(connection)
        fp = FilePaths(db)

        patches = self.get_patches(repo, path or repo.get_uri(), repo_id, db,
                                   read_cursor)

        for commit_id, patch_content, rev in patches:
            for hunk in self.get_commit_data(patch_content):
                # Get the file ID from the database for linking
                hunk_file_name = re.sub(r'^[ab]\/', '',
                                        hunk.file_name.strip())
                file_id = fp.get_file_id(hunk_file_name, commit_id)

                if file_id is None:
                    printdbg("file not found")
                    if repo.type == "git":
                        # The likelihood is that this is a merge, not a
                        # missing ID from some data screwup.
                        # We'll just continue and throw this away
                        continue
                    else:
                        printerr("No file ID found for hunk " +
                                 hunk_file_name +
                                 " at commit " + str(commit_id))

                insert = """insert into hunks(file_id, commit_id,
                            old_start_line, old_end_line, new_start_line,
                            new_end_line)
                            values(?,?,?,?,?,?)"""
Example #24
        profiler_stop("Hunks: fetch all patches", delete=True)

        self.__prepare_table(connection)
        fp = FilePaths(db)
        rs = icursor.fetchmany()

        while rs:
            for commit_id, patch_content, rev in rs:
                for hunk in self.get_commit_data(patch_content):
                    # Get the file ID from the database for linking
                    hunk_file_name = re.sub(r'^[ab]\/', '',
                                            hunk.file_name.strip())
                    file_id = fp.get_file_id(hunk_file_name, commit_id)

                    if file_id is None:
                        printdbg("file not found")
                        if repo.type == "git":
                            # The likelihood is that this is a merge, not a
                            # missing ID from some data screwup.
                            # We'll just continue and throw this away
                            continue
                        else:
                            printerr("No file ID found for hunk " +
                                     hunk_file_name +
                                     " at commit " + str(commit_id))

                    insert = """insert into hunks(file_id, commit_id,
                                old_start_line, old_end_line, new_start_line, 
                                new_end_line)
                                values(?,?,?,?,?,?)"""
Example #25
def end_file(self):
    profiler_stop("Processing blame output for %s", (self.filename,))
    if len(self.bug_revs) == 0:
        printdbg("No bug revision found in this file")