def __process_finished_jobs(self, job_pool, write_cursor, db):
    # start = datetime.now()
    finished_job = job_pool.get_next_done(0)
    processed_jobs = 0

    # commit_id is the commit ID. For some reason, the
    # documentation advocates tablename_id as the reference,
    # but in the source, these are referred to as commit IDs.
    # Don't ask me why!
    while finished_job is not None:
        file_contents = None

        if not Config().no_content:
            file_contents = str(finished_job.file_contents)

        query = """
            insert into content(commit_id, file_id, content, loc, size)
            values(?,?,?,?,?)"""
        insert_statement = statement(query, db.place_holder)
        parameters = (finished_job.commit_id,
                      finished_job.file_id,
                      file_contents,
                      finished_job.file_number_of_lines,
                      finished_job.file_size)

        execute_statement(insert_statement, parameters,
                          write_cursor, db,
                          "Couldn't insert, duplicate record?",
                          exception=ExtensionRunError)

        processed_jobs += 1
        finished_job = job_pool.get_next_done(0)

    return processed_jobs
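# A minimal sketch of what the statement() helper used above is assumed
# to do: adapt the generic '?' placeholders in a query to the database
# driver's own placeholder token (e.g. '%s'). Hypothetical and for
# illustration only; the real helper lives in pycvsanaly2.Database.
def statement_sketch(query, place_holder):
    if place_holder == "?":
        return query
    return query.replace("?", place_holder)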
def update_all(self, repo_id):
    """
    update_all enables caching of adjacency matrices.

    Pros: File paths in different revisions can be accessed
    randomly, i.e. after calling update_all, get_path can be
    called with any revision in any order.
    Cons: It consumes significant memory to store the
    adjacency matrices.

    If the config has low_memory set to true, shelve will be
    used instead, to write the cache out to disk.
    """
    profiler_start("Update all file paths")

    if Config().low_memory:
        self.shelve_file_name = str(time()) + "-shelve.db"

        # If there is an old file, shelve will complain viciously
        if os.path.exists(self.shelve_file_name):
            os.remove(self.shelve_file_name)

        self.__dict__['cached_adj'] = shelve.open(self.shelve_file_name,
                                                  writeback=False)

    db = self.__dict__['db']
    cnn = db.connect()
    cursor = cnn.cursor()
    query = """select distinct(s.id) from scmlog s, actions a
               where s.id = a.commit_id and repository_id=?
               order by s.date"""
    cursor.execute(statement(query, db.place_holder), (repo_id,))

    old_id = -1
    all_commits = [i[0] for i in cursor.fetchall()]

    for id in all_commits:
        if old_id != id:
            adj = self.__dict__['cached_adj'].get(str(id))

            if adj is None:
                self.update_for_revision(cursor, id, repo_id)
                self.__dict__['cached_adj'][str(id)] = \
                    deepcopy(self.__dict__['adj'])
            old_id = id

    cursor.close()
    cnn.close()
    profiler_stop("Update all file paths", delete=True)
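# A hedged usage sketch of the random-access behaviour described in the
# docstring above: once update_all has been called, get_path works for
# any revision in any order. Assumes a populated CVSAnalY database and
# repository id 1; the identifiers are illustrative only.
def update_all_usage_sketch(db, commit_ids, file_id):
    fp = FilePaths(db)
    fp.update_all(1)                   # cache adjacencies for every commit
    for id in reversed(commit_ids):    # any visiting order is now fine
        print fp.get_path(file_id, id, 1)
    fp.close()                         # releases the in-memory/shelve cache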
def close(self):
    """Closes FilePaths to ensure all caches are deleted"""
    if Config().low_memory:
        # FIXME: This should be closed, but sometimes shelve
        # just won't do it. The best way is to time out the try,
        # but not closing and just deleting will do the same
        # thing, just in a more yucky way
        printdbg("Syncing shelf")
        self.__dict__['cached_adj'].sync()
        printdbg("Closing shelf")
        self.__dict__['cached_adj'].close()
        printdbg("Deleting shelve " + self.shelve_file_name)
        os.remove(self.shelve_file_name)

        # Clean up cached adj in case this gets called without
        # update_all later
        self.__dict__['cached_adj'] = {}
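# The FIXME above mentions timing out the close. A hedged sketch of one
# way to do that with a daemon thread; purely illustrative and not part
# of the original code.
import threading

def close_with_timeout_sketch(shelf, seconds=5.0):
    closer = threading.Thread(target=shelf.close)
    closer.daemon = True   # don't block interpreter exit if close() hangs
    closer.start()
    closer.join(seconds)
    return not closer.is_alive()   # True if close() finished in time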
def __init__(self, repo, uri):
    LineCounter.__init__(self, repo, uri)

    from pycvsanaly2.Config import Config
    from pycvsanaly2.CVSParser import CVSParser

    p = CVSParser()
    p.set_repository(repo, uri)

    def new_line(line, parser):
        parser.feed(line)

    reader = LogReader()
    reader.set_repo(repo, uri)

    logfile = Config().repo_logfile
    if logfile is not None:
        reader.set_logfile(logfile)

    reader.start(new_line, p)

    self.lines = p.get_added_removed_lines()
if __name__ == '__main__':
    import sys
    from pycvsanaly2.Database import create_database
    from pycvsanaly2.Config import Config

    db = create_database('sqlite', sys.argv[1])
    cnn = db.connect()

    fp = FilePaths(db)

    config = Config()
    config.profile = True

    cursor = cnn.cursor()
    cursor.execute("select s.id, file_id from scmlog s, actions a "
                   "where s.id = a.commit_id")
    old_id = -1
    for id, file_id in cursor.fetchall():
        if old_id != id:
            print "Commit ", id
            fp.update_for_revision(cursor, id, 1)
            old_id = id
        print fp.get_path(file_id, id, 1)

    cursor.close()
def __get_path_from_db(self, file_id, commit_id):
    cursor = self.cnn.cursor()
    cursor.execute(statement(self.__path_query__, self.db.place_holder),
                   (file_id, commit_id))
    path = cursor.fetchone()[0]
    cursor.close()

    return "/" + path


if __name__ == '__main__':
    import sys
    from pycvsanaly2.Database import create_database
    from pycvsanaly2.Config import Config

    config = Config()
    config.load()

    db = create_database(config.db_driver, sys.argv[1], config.db_user,
                         config.db_password, config.db_hostname)
    cnn = db.connect()
    cursor = cnn.cursor()

    fr = FileRevs(db, cnn, cursor, 1)
    for revision, commit_id, file_id, action_type, composed in fr:
        print revision, commit_id, action_type, fr.get_path()

    cursor.close()
    cnn.close()
                         (self.path, self.rev, e.cmd,
                          e.returncode, e.error))
        except Exception, e:
            failed = True
            printerr("Error obtaining %s@%s. Exception: %s",
                     (self.path, self.rev, str(e)))

        self.repo.remove_watch(LS, wid)

        if failed:
            printerr("Failure due to error")
        else:
            try:
                self.ls_lines = io.getvalue().splitlines()
                if Config().count_types:
                    self.ls_lines = [fp for fp in self.ls_lines
                                     if guess_file_type(fp) in
                                     Config().count_types]
            except Exception, e:
                printerr("Error getting ls-lines. Exception: %s",
                         (str(e),))
            finally:
                io.close()

    def _get_ls_line_count(self):
        return len(self.ls_lines)

    ls_line_count = property(_get_ls_line_count)
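# guess_file_type is defined elsewhere in pycvsanaly2. A minimal sketch
# of the kind of extension-based mapping it is assumed to perform; the
# real implementation and its set of type names may differ.
def guess_file_type_sketch(path):
    code_extensions = ('.c', '.h', '.py', '.java', '.cpp', '.hpp')
    if path.lower().endswith(code_extensions):
        return 'code'
    return 'unknown'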
                    relative_path = file_link[1]
                    break
                else:
                    file_link = cursor.fetchone()
        except CommandError as e:
            printerr(str(e) + '\n' + e.error)

        cursor.close()

        if relative_path is None:
            return None
        else:
            return relative_path.strip("/")
if __name__ == '__main__':
    import sys
    sys.path.insert(0, "../../")

from pycvsanaly2.Database import statement
from pycvsanaly2.utils import printdbg
from pycvsanaly2.profile import profiler_start, profiler_stop
from pycvsanaly2.Config import Config
from copy import deepcopy
import shelve
import os
from time import time

config = Config()


class Adj(object):
    def __init__(self):
        self.files = {}
        self.adj = {}


class FilePaths(object):
    __shared_state = {'rev': None,
                      'adj': None,
                      'files': None,
                      'cached_adj': {},
                      'db': None}
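    # The __shared_state dict above suggests FilePaths uses the Borg
    # (monostate) idiom: every instance adopts the shared dict as its
    # own __dict__, so the adjacency caches survive across
    # instantiations. The actual __init__ is not shown in this excerpt;
    # a minimal sketch of what it is assumed to look like, given that
    # callers construct the class as FilePaths(db):
    def __init__(self, db):
        self.__dict__ = self.__shared_state   # all instances share state
        self.__dict__['db'] = db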
def fixes_bug(self, commit_message):
    """Check whether a commit message indicates that a bug was fixed.

    # This is set in the config. Uncomment if you wish to try out
    # specific regexes
    #>>> Config().bug_fix_regexes = ["defect(s)?", "patch(ing|es|ed)?", \
    "bug(s|fix(es)?)?", "debug(ged)?", "fix(es|ed)?", "\#\d+"]
    #>>> Config().bug_fix_regexes_case_sensitive = ["[A-Z]+-\d+",]

    >>> b = BugFixMessage()

    # Easy ones
    >>> b.fixes_bug("Bug")
    True
    >>> b.fixes_bug("Bugs")
    True
    >>> b.fixes_bug("Fix")
    True
    >>> b.fixes_bug("Fixed")
    True
    >>> b.fixes_bug("Defect")
    True
    >>> b.fixes_bug("Defects")
    True
    >>> b.fixes_bug("Patches")
    True
    >>> b.fixes_bug("Patching")
    True

    # Embeds in sentences
    >>> b.fixes_bug("Fixed a bug")
    True
    >>> b.fixes_bug("Debugged this one")
    True
    >>> b.fixes_bug("Found a hole, which I patched, shouldn't be problem")
    True
    >>> b.fixes_bug("Put in a couple of fixes in x.java")
    True
    >>> b.fixes_bug("Implemented a bugfix")
    True
    >>> b.fixes_bug("References #1234")
    True
    >>> b.fixes_bug("Defect X is no more")
    True
    >>> b.fixes_bug("Closes JENKINS-1234")
    True

    # Embeds in long commit messages
    >>> b.fixes_bug("This was tough. Fixed now.")
    True
    >>> b.fixes_bug("Found X; debugged and solved.")
    True

    # Regression tests from Apache
    # When adding these, keep weird punctuation intact.
    >>> b.fixes_bug("Fixups to build the whole shebang once again.")
    True
    >>> b.fixes_bug("Change some INFO messages to DEBUG messages.")
    True
    >>> b.fixes_bug("Put back PR#6347")
    True
    >>> b.fixes_bug("Typo fixage..")
    True
    >>> b.fixes_bug("another typo/fixup")
    True
    >>> b.fixes_bug("Refix the entity tag comparisons")
    True
    >>> b.fixes_bug("Closeout PR#721")
    True
    >>> b.fixes_bug("SECURITY: CVE-2010-0408 (cve.mitre.org)")
    True
    >>> b.fixes_bug(" debugged the require_one and require_all")
    True
    >>> b.fixes_bug(" various style fixups / general changes")
    True
    >>> b.fixes_bug(" Win32: Eliminate useless debug error message")
    True

    # Things that shouldn't match
    # Refactoring could go either way, depending on whether you think
    # renaming/refactoring is a "bug fix." Right now, we don't call that
    # a "bug"
    >>> b.fixes_bug("Added method print_debug()")
    False
    >>> b.fixes_bug("Altered debug_log()")
    False
    >>> b.fixes_bug("NETWORK_PATCH_FIX")
    False
    >>> b.fixes_bug("Rename ap_debug_assert() to AP_DEBUG_ASSERT()")
    False
    >>> b.fixes_bug("Use bread() etc instead of fread() for "
    ...             "reading/writing")
    False
    >>> b.fixes_bug("Refactored to look cleaner")
    False
    >>> b.fixes_bug("Rewrite this yucky file")
    False
    >>> b.fixes_bug("Edited this file on 2010-12-01")
    False
    >>> b.fixes_bug("This file pertains to the A80-154 spec")
    False
    >>> b.fixes_bug("This is for March-28")
    False
    """
    if self.__match_string(Config().bug_fix_regexes,
                           re.DOTALL | re.IGNORECASE, commit_message):
        return True
    if self.__match_string(Config().bug_fix_regexes_case_sensitive,
                           re.DOTALL, commit_message):
        return True
    return False
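# __match_string is not shown in this excerpt. Judging from how it is
# called above, it is assumed to be a plain loop reporting whether any
# configured pattern matches the message; the word-boundary and
# lookbehind subtleties that make the doctests above pass are assumed
# to live in the configured regexes themselves, not in this helper.
import re

def match_string_sketch(regexes, flags, string):
    for regex in regexes:
        if re.search(regex, string, flags):
            return True
    return False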
                (repo.get_type()))
        except Exception, e:
            raise ExtensionRunError(
                "Error creating repository %s. Exception: %s" %
                (repo.get_uri(), str(e)))

        # Try to create a table for storing the content
        # TODO: Removed the use case for choosing between all revisions
        # or just HEAD; should ideally put that back again. Just "all"
        # is fine for now.
        try:
            self.__prepare_table(connection)
        except Exception as e:
            raise ExtensionRunError("Couldn't prepare table because " +
                                    str(e))

        queuesize = Config().max_threads
        printdbg("Setting queuesize to " + str(queuesize))

        # This is where the threading stuff comes in, I expect
        job_pool = JobPool(repo, path or repo.get_uri(),
                           queuesize=queuesize)

        # This filters out files that are not source files.
        # I'm pretty sure "unknown" also returns binary files, but those
        # are implicitly left out when the content is converted to utf-8
        # after download. Ignore them for now anyway, to speed things up.
        query = "select f.id from file_types ft, files f " + \
                "where f.id = ft.file_id and " + \
                "ft.type in('code') and " + \
                "f.repository_id = ?"
        # "ft.type in('code', 'unknown') and " + \
        read_cursor.execute(statement(query, db.place_holder), (repo_id, ))
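    # A hedged sketch of how the job pool above is presumably driven:
    # push one job per file id returned by the query, draining finished
    # jobs as you go so results don't pile up, then join and do a final
    # drain. push(), join() and the ContentJob name are assumptions
    # based on the usual CVSAnalY extension pattern, not taken from this
    # excerpt; written as a method so __process_finished_jobs resolves.
    def __drive_job_pool_sketch(self, job_pool, read_cursor,
                                write_cursor, db):
        processed = 0
        for (file_id,) in read_cursor.fetchall():
            job_pool.push(ContentJob(file_id))   # hypothetical job type
            processed += self.__process_finished_jobs(job_pool,
                                                      write_cursor, db)

        job_pool.join()

        return processed + self.__process_finished_jobs(job_pool,
                                                        write_cursor, db)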