def testBuildTrainDbModelData(self):
    """Rebuild the training DB from scratch and verify the encoded model data.

    The bug and fix vectors for core.DivideZero share all tokens except the
    one the fix changes (index 7: 7 -> 19).
    """
    db_path = config.getCfDbFile()

    # Start from an empty fix database.
    db = CFDatabase(db_path)
    db.clean()
    self.assertEqual(len(db.getAllFixData()), 0)
    del db

    # Full (clean) rebuild of the test database.
    buildTestDB.main(True)

    db = CFDatabase(db_path)
    records = db.getAllFixData()
    self.assertEqual(len(records), 2)
    self.assertEqual(records[0][3], 'deadcode.DeadStores')
    self.assertEqual(records[1][3], 'core.DivideZero')
    self.assertEqual(db.getLastCommit(),
                     'f2917b938f0ecbc62ad48101d034369a1ae61a19')

    # Encode the stored bug/fix snippets for core.DivideZero.
    model_builder = ModelDataBuilder()
    bug_vector = model_builder.getEncodedBugData('core.DivideZero')
    dictionary = model_builder.getDictionary()
    fix_vector = model_builder.getEncodedFixData('core.DivideZero', dictionary)

    shared_tail = [
        3, 8, 9, 10, 11, 2, 12, 13, 1, 4, 14, 15, 11, 15, 16, 4, 17, 18, 3, 4
    ]
    self.assertSequenceEqual(bug_vector,
                             [[1, 2, 3, 4, 5, 6, 1, 7] + shared_tail])
    self.assertSequenceEqual(fix_vector,
                             [[1, 2, 3, 4, 5, 6, 1, 19] + shared_tail])
def testDatabaseOpenCreateSuccess(self):
    """Opening a CFDatabase at a missing path must create an empty DB file."""
    db_path = config.getRepoDir() + "/testtmp.sqlite"

    # Make sure no stale file is left over from a previous run.
    db_file = Path(db_path)
    if db_file.is_file():
        db_file.unlink()

    # First open creates the file on disk.
    db = CFDatabase(db_path)
    db_file = Path(db_path)
    self.assertTrue(db_file.is_file())
    del db

    # Re-opening the freshly created DB yields no fix data.
    db = CFDatabase(db_path)
    self.assertEqual(len(db.getAllFixData()), 0)
    del db

    db_file.unlink()
def testDatabaseStoreAndClean(self):
    """store() must persist a record and clean() must remove all records."""
    db_path = config.getRepoDir() + "/testtmp.sqlite"
    db_file = Path(db_path)

    db = CFDatabase(db_path)
    self.assertEqual(len(db.getAllFixData()), 0)

    # Persist a single record attributed to checker 'a'.
    db.store('', '', 'a')
    self.assertEqual(len(db.getAllFixData()), 1)
    self.assertEqual(len(db.getFixDataForChecker('a')), 1)
    self.assertEqual(len(db.getFixDataForChecker('b')), 0)

    # clean() wipes everything again.
    db.clean()
    self.assertEqual(len(db.getAllFixData()), 0)
    del db

    db_file.unlink()
def testBuildTrainDbClean(self):
    """A clean rebuild of the train DB must yield the two known fixes."""
    db_path = config.getCfDbFile()

    # Start from an empty fix database.
    db = CFDatabase(db_path)
    db.clean()
    self.assertEqual(len(db.getAllFixData()), 0)
    del db

    # Full (clean) rebuild.
    buildTestDB.main(True)

    db = CFDatabase(db_path)
    records = db.getAllFixData()
    self.assertEqual(len(records), 2)
    self.assertEqual(records[0][3], 'deadcode.DeadStores')
    self.assertEqual(records[1][3], 'core.DivideZero')
    self.assertEqual(db.getLastCommit(),
                     'f2917b938f0ecbc62ad48101d034369a1ae61a19')
def prepareDb(self, clean=False):
    """Open the code-fix database, optionally wiping it first.

    :param clean: when True, all existing fix data is removed.
    """
    self.db = CFDatabase(config.getCfDbFile())
    if not clean:
        return
    self.db.clean()
class TestDbBuilder():
    """Builds the code-fix training database by walking the VCS history.

    For each commit it runs a CodeChecker analysis, diffs the results against
    the previously stored run to find resolved bugs, extracts the buggy and
    fixed code fragments, and stores them in the CFDatabase.
    """

    def __init__(self):
        # VCS access, CodeChecker results database and analyzer driver.
        self.vcs = GitProvider(config.getRepoDir())
        self.ccdb = CCDatabase(config.getCcDbFile())
        self.codeChecker = CodeChecker(config.getRepoDir())

    def loadCommitList(self, clean=False):
        """Load the commit list for the configured branch.

        The list appears to be ordered newest-first (see prepareEnv, where the
        index is moved past the end and then decremented to reach the root).
        With clean=False, iteration resumes right after the commit recorded in
        the fix database.
        """
        self.commits = self.vcs.getAllVersions(config.getBranch())
        if not clean:
            # Resume: position the cursor on the last analyzed commit.
            lastCommit = self.db.getLastCommit()
            lastIndex = self.commits.index(lastCommit) + 1
            if lastIndex < len(self.commits) - 1:
                # Drop history older than the resume point.
                self.commits = self.commits[0:lastIndex]
            self.currentCommitIndex = lastIndex - 1
        else:
            # One past the oldest commit; checkoutToNextVersion() steps onto it.
            self.currentCommitIndex = len(self.commits)

    def prepareDb(self, clean=False):
        """Open the code-fix database; wipe it when clean is True."""
        self.db = CFDatabase(config.getCfDbFile())
        if clean:
            self.db.clean()

    def checkoutToNextVersion(self):
        """Step one commit towards the branch head.

        Returns False when the history is exhausted, True otherwise.
        """
        self.currentCommitIndex = self.currentCommitIndex - 1
        if (self.currentCommitIndex < 0):
            return False
        self.vcs.checkout(self.commits[self.currentCommitIndex])
        return True

    def getDiffResolvedIds(self):
        """Return report IDs of bugs resolved since the previously stored run."""
        resolved = self.codeChecker.diffResolved(config.getCcRunName(),
                                                 config.getTmpDir(), self.ccdb)
        ids = []
        for bug in resolved:
            ids.append(bug['reportId'])
        return ids

    def convertFilePathToRepoRelativePath(self, path):
        """Convert an absolute path into a path relative to the repo root."""
        return os.path.relpath(path, config.getRepoDir())

    def extractCode(self, id):
        """Extract bug/fix code fragments for one resolved report.

        Returns an entities.FixData, or None when the bug cannot be used
        (missing data, file lookup failure, extraction error, or the fix is
        not confined to exactly one diff hunk).
        """
        bugData = self.ccdb.getNotResolvedBugData(id)  #TODO: Possible improvement for bugData
        if bugData is None:
            #TODO: Implement custom errors
            return None
        fileRelativePath = self.convertFilePathToRepoRelativePath(
            bugData.getFile())
        try:
            # currentCommitIndex + 1 is the previous (buggy) version;
            # currentCommitIndex is the version containing the fix.
            fullCodeWithBug = self.vcs.getFileContents(
                fileRelativePath, self.commits[self.currentCommitIndex + 1])
            fullCodeWithoutBug = self.vcs.getFileContents(
                fileRelativePath, self.commits[self.currentCommitIndex])
        except KeyError as extractError:
            # File not present in one of the two versions.
            return None
        diff = POSIXDiffer().diff(fullCodeWithBug, fullCodeWithoutBug)
        extractor = CodeExtractor(bugData)
        try:
            extractor.extractAll(fullCodeWithBug, diff)
        except ValueError as extractError:
            return None
        bugCodeFragment = extractor.getBugCodeFragment()
        fixCodeFragment = extractor.getFixCodeFragment()
        usedDiffs = extractor.getUsedDiffs()
        #Easy version - ignore bug if none or more than one diff used to fix
        #TODO: Possible improvement here
        if len(usedDiffs) != 1:
            return None
        return entities.FixData(bugCodeFragment, fixCodeFragment,
                                bugData.getChecker(), bugData.getMessage(),
                                bugData.getLine() - bugData.getStartLine())

    def prepareEnv(self, clean=False):
        """Prepare DB and commit list; on clean builds, analyze the root commit."""
        print('Preparing train db... ', end='')
        self.prepareDb(clean)
        print('done')
        print('Loading commit list... ', end='')
        self.loadCommitList(clean)
        print('done')
        if clean:
            # A clean build needs a baseline analysis of the oldest commit.
            print('Checking out to root... ', end='')
            self.checkoutToNextVersion()
            print('done')
            print('Initial analysis... ', end='')
            self.codeChecker.check(True)
            print('done')
            print('Storing initial results... ', end='')
            self.codeChecker.store(self.commits[self.currentCommitIndex])
            print('done')
            print('Storing version information... ', end='')
            self.db.storeLastCommit(self.commits[self.currentCommitIndex])
            print('done')
            print('Cleaning up tmp directory... ', end='')
            shutil.rmtree(config.getTmpDir())
            print('done')
            print('Cleaning up working directory... ', end='')
            self.codeChecker.clean()
            print('done')

    def findAndStoreFixDataForVersion(self):
        """Analyze the current commit and store fix data for resolved bugs."""
        print('Analyzing version', self.commits[self.currentCommitIndex],
              '... ', end='')
        self.codeChecker.check(True)
        print('done')
        print('Getting list of resolved bugs for version',
              self.commits[self.currentCommitIndex], '... ', end='')
        ids = self.getDiffResolvedIds()
        print('done')
        bugNo = 1
        allBugs = len(ids)
        anyStored = False
        for id in ids:
            print('Parsing data for bug ({0}/{1}, #{2})...'.format(
                bugNo, allBugs, id), sep='', end='')
            fixData = self.extractCode(id)
            bugNo = bugNo + 1
            print('done')
            if fixData is not None:
                print('Storing fixData... ', end='')
                self.db.store(fixData.getBugCode(), fixData.getFixCode(),
                              fixData.getChecker(), fixData.getMessage(),
                              fixData.getLine())
                anyStored = True
                print('done')
            # Commit in batches of 100 to keep transactions bounded.
            if bugNo % 100 == 0 and anyStored:
                self.db.commit()
                anyStored = False
        if anyStored:
            self.db.commit()
        print('Storing CodeChecker results for this version... ', end='')
        self.codeChecker.store(self.commits[self.currentCommitIndex])
        print('done')
        print('Storing version information... ', end='')
        self.db.storeLastCommit(self.commits[self.currentCommitIndex])
        print('done')
        print('Cleaning up tmp directory... ', end='')
        shutil.rmtree(config.getTmpDir())
        print('done')
        print('Cleaning up working directory... ', end='')
        self.codeChecker.clean()
        print('done')

    def iterateThroughVcsHistory(self):
        """Process every remaining commit, oldest to newest."""
        while self.checkoutToNextVersion():
            self.findAndStoreFixDataForVersion()

    def checkoutToTop(self):
        """Restore the working tree to the configured branch head."""
        self.vcs.checkout(config.getBranch())

    def build(self, clean=False):
        """Full pipeline: prepare, iterate history, restore branch head."""
        self.prepareEnv(clean)
        self.iterateThroughVcsHistory()
        self.checkoutToTop()
def __init__(self):
    # Shared services: fix database, C++ lexer and checker-metadata helpers.
    self.db = CFDatabase(config.getCfDbFile())
    self.lexer = CxxLexer()
    self.checkers = Checkers()
class DictionaryBuilder():
    """Builds the token-label dictionary file for a single checker.

    Tokenizes all stored bug/fix pairs, collects label candidates (UNK slots,
    frequent values, values introduced by fixes, used token IDs) and writes the
    de-duplicated list as JSON to the checker's dictionary file.
    """

    def __init__(self):
        # Fix database, C++ lexer and checker-metadata helpers.
        self.db = CFDatabase(config.getCfDbFile())
        self.lexer = CxxLexer()
        self.checkers = Checkers()

    def build(self, checker):
        """Build and write the dictionary for *checker*; no-op if no data."""
        # Load all data from DB
        print("Fetching data from database...")
        allData = self.db.getFixDataForChecker(checker)
        allDataLen = len(allData)
        print("Done, fetched {0} records".format(allDataLen))
        if allDataLen < 1:
            print("No data found")
            return

        # Tokenize all code snippets and extract extra tokens from checker's messages
        # Labelize all tokens existing only in fixed code (added data)
        # Labelize all tokens appearing more than X times
        # Labelize all C++ STL names (namespaces, constants, defines, variables, functions, headers, numeric literals)
        # Labelize all UNK token indexes
        print("Converting to tokens...")
        tokens = deque()
        tokensLen = 0
        labels = {}
        i = 0
        tokensLen = 0  # NOTE(review): duplicate assignment kept as-is
        minTokens1Len = 9999
        minTokens2Len = 9999
        maxTokens1Len = 0
        maxTokens2Len = 0
        uniqTokenIDs = {}
        for tid in range(globals.firstAvailableToken):
            uniqTokenIDs[tid] = 0
        # Reserved token IDs are always counted as used.
        uniqTokenIDs[0] = 1  # T_ZERO
        uniqTokenIDs[349] = 1  # T_SOS
        uniqTokenIDs[351] = 1  # T_UNK
        while i < allDataLen:
            # Tokenize
            tokens1 = self.lexer.tokenize(allData[i][1])
            tokens2 = self.lexer.tokenize(allData[i][2])
            extra = self.checkers.extractTokensForChecker(
                checker, allData[i][4])
            newTokens = []
            # Extract new tokens: those in the fixed code (tokens2) with no
            # matching token in the buggy code (tokens1).
            for token2 in tokens2:
                matchFound = False
                for token1 in tokens1:
                    if token1['token'] == token2['token'] and token1[
                            'has_value'] == token2['has_value']:
                        if token1['has_value']:
                            if token1['value'] == token2['value']:
                                matchFound = True
                        else:
                            matchFound = True
                if not matchFound:
                    newTokens.append(token2)
            tokens1Len = len(tokens1)
            tokens2Len = len(tokens2)
            # Statistics
            if tokens1Len < minTokens1Len:
                minTokens1Len = tokens1Len
            if tokens2Len < minTokens2Len:
                minTokens2Len = tokens2Len
            if tokens1Len > maxTokens1Len:
                maxTokens1Len = tokens1Len
            if tokens2Len > maxTokens2Len:
                maxTokens2Len = tokens2Len
            # Count occurrences of each label
            allTokens = tokens1 + tokens2 + extra
            for token in allTokens:
                value = globals.emptyValue
                if token['has_value']:
                    value = token['value']
                if value in labels:
                    labels[value] += 1
                else:
                    labels[value] = 1
                uniqTokenIDs[int(token['token'])] += 1
                tokensLen += 1
            if len(newTokens) > 0:
                tokens.append(newTokens)
            i += 1
            # Per-record progress goes to stderr so stdout stays a clean log.
            print('Done {0}, processed {1} tokens ({2}/{3}/{4}/{5})'.format(
                i, len(allTokens), tokens1Len, tokens2Len, len(extra),
                len(newTokens)),
                  file=sys.stderr)
        print("Done, converted {0} tokens".format(tokensLen))

        # Labelizing
        labelDb = [globals.emptyValue]

        # UNK
        print("Adding UNK token labels")
        for i in range(config.cfNoOfUnkTokens):
            labelDb.append("UNK_{0}".format(i))
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # Common occurrences
        print("Filtering labels, selecting only those with > {0} occurrences".
              format(config.cfLabelThreshold))
        for key in labels.keys():
            if labels[key] > config.cfLabelThreshold:
                labelDb.append(key)
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # New tokens in fixed code
        print("Filtering labels, selecting only tokens introduced with fix")
        for entry in tokens:
            for token in entry:
                if token['has_value']:
                    labelDb.append(token['value'])
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # STL part

        # Token IDs: one "T_<id>" label per token type actually seen.
        for i in range(globals.firstAvailableToken):
            if uniqTokenIDs[i] > 0:
                labelDb.append("T_{0}".format(i))

        # Printout
        print("Uniqueing labels")
        labelsUnique = list(set(labelDb))
        print("Done, current label DB has {0} entries".format(
            len(labelsUnique)))
        print("Data set info")
        print("Min no of tokens (bug): {0}".format(minTokens1Len))
        print("Min no of tokens (fix): {0}".format(minTokens2Len))
        print("Max no of tokens (bug): {0}".format(maxTokens1Len))
        print("Max no of tokens (fix): {0}".format(maxTokens2Len))
        print("Extracted labels:")
        print(labelsUnique)
        print("Token uses:")
        for i in range(globals.firstAvailableToken):
            if uniqTokenIDs[i] > 0:
                print("{0}: {1}".format(i, uniqTokenIDs[i]))

        # Save to file
        print("Writing to dictionary file")
        with open(config.cfDictFilenameFormat.format(checker), "w") as f:
            f.write(json.dumps(labelsUnique))
        print("Done, exiting...")
class LearningDataBuilder():
    """Encodes stored bug/fix pairs into the training file for one checker."""

    def __init__(self):
        # Fix database and checker-metadata helpers.
        self.db = CFDatabase(config.getCfDbFile())
        self.checkers = Checkers()

    def build(self, checker):
        """Encode all fix records for *checker* and write them as JSON lines.

        Records that contain unparsed tokens (-1) or exceed the checker's
        model limits (bug/fix/unk lengths) are reported and skipped.
        """
        # Initialize coder
        print("Initializing coder...")
        self.dictionary = Dictionary(checker)
        self.coder = Coder(self.dictionary)
        # Load all data from DB
        print("Fetching data from database...")
        allData = self.db.getFixDataForChecker(checker)
        allDataLen = len(allData)
        print("Done, fetched {0} records".format(allDataLen))
        if allDataLen < 1:
            print("No data found")
            return
        # Encode all data
        print("Encoding all data and writing to output file...")
        i = 0
        (maxBug, maxFix,
         maxUnk) = self.checkers.getModelStatsForChecker(checker)
        with open(config.cfTrainFilenameFormat.format(checker), 'w') as f:
            while i < allDataLen:
                # Checker-specific extra tokens come from the bug message.
                checkerInfo = self.checkers.extractTokensForChecker(
                    checker, allData[i][4])
                # The fix is encoded with the UNK list produced by the bug so
                # both sides share UNK assignments.
                encodedBugData, initialUnkList = self.coder.encode(
                    allData[i][1], checkerData=checkerInfo)
                encodedFixData, finalUnkList = self.coder.encode(
                    allData[i][2], unkList=initialUnkList, reverse=False)
                # -1 marks a token the coder could not map: skip the record.
                if -1 in encodedBugData:
                    print(
                        "{0}: [{2} - {3} ({1})] Some tokens were not parsed (bug), ignoring (lenUnk = {1})"
                        .format(i + 1, len(finalUnkList), len(encodedBugData),
                                len(encodedFixData)))
                elif -1 in encodedFixData:
                    print(
                        "{0}: [{2} - {3} ({1})] Some tokens were not parsed (fix), ignoring (lenUnk = {1})"
                        .format(i + 1, len(finalUnkList), len(encodedBugData),
                                len(encodedFixData)))
                elif len(encodedBugData) > maxBug or len(
                        encodedFixData) > maxFix or len(finalUnkList) > maxUnk:
                    print(
                        "{0}: [{2} - {3} ({1})] Some tokens were not parsed (lengths), ignoring (lenUnk = {1})"
                        .format(i + 1, len(finalUnkList), len(encodedBugData),
                                len(encodedFixData)))
                else:
                    print("{0}: [{2} - {3} ({1})] Done (lenUnk = {1})".format(
                        i + 1, len(finalUnkList), len(encodedBugData),
                        len(encodedFixData)))
                    f.write(
                        json.dumps({
                            'x': encodedBugData,
                            'y': encodedFixData
                        }) + '\n')
                i += 1
                # Progress line on stderr, mirroring DictionaryBuilder.
                print('Done {0}'.format(i), file=sys.stderr)
        print("All done, exiting...")
def testDatabaseOpenExistingSuccess(self):
    """Opening an existing, pre-populated database must expose its fix data."""
    db = CFDatabase(config.getRepoDir() + "/test.sqlite")
    self.assertGreater(len(db.getAllFixData()), 0)
    del db
def testBuildTrainDbIncremental(self):
    """A clean build on a partial branch followed by an incremental build on
    the main branch must end in the same state as a full rebuild."""
    saved_branch = config.getBranch()
    db_path = config.getCfDbFile()

    # Wipe any previous fix data.
    db = CFDatabase(db_path)
    db.clean()
    self.assertEqual(len(db.getAllFixData()), 0)
    del db

    # Clean build against the shorter test branch: one fix expected.
    config.setBranch('trainDbScriptIncrementalTest')
    buildTestDB.main(True)
    db = CFDatabase(db_path)
    records = db.getAllFixData()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0][3], 'deadcode.DeadStores')
    self.assertEqual(db.getLastCommit(),
                     'add691cf37da6c9d40666eac1bc8c1afda071c77')
    del db

    # Incremental build on the original branch picks up the second fix.
    config.setBranch(saved_branch)
    buildTestDB.main(False)
    db = CFDatabase(db_path)
    records = db.getAllFixData()
    self.assertEqual(len(records), 2)
    self.assertEqual(records[0][3], 'deadcode.DeadStores')
    self.assertEqual(records[1][3], 'core.DivideZero')
    self.assertEqual(db.getLastCommit(),
                     'f2917b938f0ecbc62ad48101d034369a1ae61a19')
def setUpClass(self):
    # NOTE(review): unittest requires setUpClass to be a @classmethod taking
    # `cls`; this one takes `self` — confirm the decorator/usage at the
    # definition site (not visible in this chunk).
    print("Starting up...")
    # Shared fixtures: fix database, checker helpers and the checker under test.
    self.db = CFDatabase(config.getCfDbFile())
    self.checkers = Checkers()
    self.checkerList = ['deadcode.DeadStores']
    self.checkerIndex = 0