Example #1
0
 def testBuildTrainDbModelData(self):
     """Rebuild the train DB from scratch and verify the encoded model data.

     The CodeFixes database is emptied, rebuilt with
     ``buildTestDB.main(True)`` and checked for two fix records and the
     expected last commit.  The bug and fix vectors produced by
     ``ModelDataBuilder`` for ``core.DivideZero`` are then compared against
     hard-coded sequences.
     """
     cfDbPath = config.getCfDbFile()

     # Wipe the database and release the handle before the rebuild runs.
     emptiedDb = CFDatabase(cfDbPath)
     emptiedDb.clean()
     self.assertEqual(len(emptiedDb.getAllFixData()), 0)
     del emptiedDb

     buildTestDB.main(True)

     rebuiltDb = CFDatabase(cfDbPath)
     self.assertEqual(len(rebuiltDb.getAllFixData()), 2)
     # Column 3 of each stored row is compared against a checker name.
     for rowNo, checkerName in enumerate(
             ('deadcode.DeadStores', 'core.DivideZero')):
         self.assertEqual(rebuiltDb.getAllFixData()[rowNo][3], checkerName)
     self.assertEqual(rebuiltDb.getLastCommit(),
                      'f2917b938f0ecbc62ad48101d034369a1ae61a19')

     builder = ModelDataBuilder()
     encodedBug = builder.getEncodedBugData('core.DivideZero')
     encodedFix = builder.getEncodedFixData('core.DivideZero',
                                            builder.getDictionary())

     # The two expected sequences differ only in their eighth element.
     sharedHead = [1, 2, 3, 4, 5, 6, 1]
     sharedTail = [
         3, 8, 9, 10, 11, 2, 12, 13, 1, 4, 14, 15, 11, 15, 16, 4, 17, 18, 3,
         4
     ]
     self.assertSequenceEqual(encodedBug, [sharedHead + [7] + sharedTail])
     self.assertSequenceEqual(encodedFix, [sharedHead + [19] + sharedTail])
Example #2
0
 def testDatabaseOpenCreateSuccess(self):
     """Opening a missing database file creates it, empty and reopenable.

     Any ``testtmp.sqlite`` left in the repository directory is removed
     first.  The file is deleted again in a ``finally`` block so a failing
     assertion does not leave it behind.
     """
     dbPath = config.getRepoDir() + "/testtmp.sqlite"
     dbFile = Path(dbPath)
     if dbFile.is_file():
         dbFile.unlink()
     db = None
     try:
         db = CFDatabase(dbPath)
         self.assertTrue(dbFile.is_file())
         # Drop the first handle, then reopen the file that was just created.
         db = None
         db = CFDatabase(dbPath)
         self.assertEqual(len(db.getAllFixData()), 0)
     finally:
         # Release the handle before deleting the file underneath it.
         db = None
         if dbFile.is_file():
             dbFile.unlink()
Example #3
0
 def testDatabaseStoreAndClean(self):
     """Store one record, query it by checker, then clean the database.

     The test expects to start from an empty database, so a stale
     ``testtmp.sqlite`` from an earlier aborted run is removed first.  The
     file is deleted again in a ``finally`` block so a failing assertion
     does not leave it behind.
     """
     dbPath = config.getRepoDir() + "/testtmp.sqlite"
     dbFile = Path(dbPath)
     if dbFile.is_file():
         dbFile.unlink()
     db = None
     try:
         db = CFDatabase(dbPath)
         self.assertEqual(len(db.getAllFixData()), 0)
         db.store('', '', 'a')
         self.assertEqual(len(db.getAllFixData()), 1)
         # Only checker 'a' was stored, so 'b' must come back empty.
         self.assertEqual(len(db.getFixDataForChecker('a')), 1)
         self.assertEqual(len(db.getFixDataForChecker('b')), 0)
         db.clean()
         self.assertEqual(len(db.getAllFixData()), 0)
     finally:
         # Release the handle before deleting the file underneath it.
         db = None
         if dbFile.is_file():
             dbFile.unlink()
 def testBuildTrainDbClean(self):
     """Rebuild the train DB from an emptied database and check its content.

     After ``buildTestDB.main(True)`` the database must hold two fix
     records and report the expected last commit.
     """
     cfDbPath = config.getCfDbFile()

     # Wipe the database and release the handle before the rebuild runs.
     emptiedDb = CFDatabase(cfDbPath)
     emptiedDb.clean()
     self.assertEqual(len(emptiedDb.getAllFixData()), 0)
     del emptiedDb

     buildTestDB.main(True)

     rebuiltDb = CFDatabase(cfDbPath)
     self.assertEqual(len(rebuiltDb.getAllFixData()), 2)
     # Column 3 of each stored row is compared against a checker name.
     for rowNo, checkerName in enumerate(
             ('deadcode.DeadStores', 'core.DivideZero')):
         self.assertEqual(rebuiltDb.getAllFixData()[rowNo][3], checkerName)
     self.assertEqual(rebuiltDb.getLastCommit(),
                      'f2917b938f0ecbc62ad48101d034369a1ae61a19')
Example #5
0
 def prepareDb(self, clean=False):
     """Open the CodeFixes database and optionally empty it.

     Args:
         clean: When true, ``clean()`` is called on the freshly opened
             database.
     """
     database = CFDatabase(config.getCfDbFile())
     self.db = database
     if clean:
         database.clean()
Example #6
0
class TestDbBuilder():
    """Fill the CodeFixes database with bug/fix pairs mined from VCS history.

    The builder checks out the versions of the configured branch one at a
    time, runs CodeChecker on each, asks which reports were resolved and
    stores the extracted bug and fix code fragments.
    """

    def __init__(self):
        """Create the VCS, CodeChecker-database and CodeChecker helpers."""
        self.vcs = GitProvider(config.getRepoDir())
        self.ccdb = CCDatabase(config.getCcDbFile())
        self.codeChecker = CodeChecker(config.getRepoDir())

    def loadCommitList(self, clean=False):
        """Load the version list and position the cursor for iteration.

        ``prepareDb`` must have run first when ``clean`` is false, because
        the last processed commit is read from ``self.db``.

        Args:
            clean: When true, the cursor starts one past the end of the list
                so the first ``checkoutToNextVersion`` call selects the last
                entry.  Otherwise the cursor is placed on the commit recorded
                in the database and iteration resumes from there.
        """
        # NOTE(review): the cursor walks toward index 0 and extractCode treats
        # index + 1 as the older (buggy) version, so the list is presumably
        # ordered newest first -- confirm against getAllVersions.
        self.commits = self.vcs.getAllVersions(config.getBranch())
        if clean:
            self.currentCommitIndex = len(self.commits)
            return
        lastCommit = self.db.getLastCommit()
        lastIndex = self.commits.index(lastCommit) + 1
        if lastIndex < len(self.commits) - 1:
            # Drop the entries after the commit that was processed last.
            self.commits = self.commits[0:lastIndex]
        self.currentCommitIndex = lastIndex - 1

    def prepareDb(self, clean=False):
        """Open the CodeFixes database and optionally empty it.

        Args:
            clean: When true, ``clean()`` is called on the opened database.
        """
        self.db = CFDatabase(config.getCfDbFile())
        if clean:
            self.db.clean()

    def checkoutToNextVersion(self):
        """Move the cursor one entry toward index 0 and check that commit out.

        Returns:
            False when the cursor has moved past the start of the list (no
            checkout is performed), True otherwise.
        """
        self.currentCommitIndex = self.currentCommitIndex - 1
        if self.currentCommitIndex < 0:
            return False
        self.vcs.checkout(self.commits[self.currentCommitIndex])
        return True

    def getDiffResolvedIds(self):
        """Return the ``reportId`` of every report CodeChecker lists as resolved."""
        resolved = self.codeChecker.diffResolved(config.getCcRunName(),
                                                 config.getTmpDir(), self.ccdb)
        return [bug['reportId'] for bug in resolved]

    def convertFilePathToRepoRelativePath(self, path):
        """Return ``path`` expressed relative to the configured repository dir."""
        return os.path.relpath(path, config.getRepoDir())

    def extractCode(self, id):
        """Extract the bug and fix code fragments for one resolved report.

        Args:
            id: Report id passed to ``getNotResolvedBugData``.

        Returns:
            An ``entities.FixData`` instance, or None when the report has no
            bug data, the file contents cannot be read (``KeyError``), the
            extraction fails (``ValueError``) or the number of diffs used by
            the extractor is not exactly one.
        """
        bugData = self.ccdb.getNotResolvedBugData(id)
        #TODO: Possible improvement for bugData
        if bugData is None:
            #TODO: Implement custom errors
            return None

        fileRelativePath = self.convertFilePathToRepoRelativePath(
            bugData.getFile())
        try:
            # The entry after the cursor still contains the bug; the entry
            # under the cursor is the version in which it was resolved.
            fullCodeWithBug = self.vcs.getFileContents(
                fileRelativePath, self.commits[self.currentCommitIndex + 1])
            fullCodeWithoutBug = self.vcs.getFileContents(
                fileRelativePath, self.commits[self.currentCommitIndex])
        except KeyError:
            return None

        diff = POSIXDiffer().diff(fullCodeWithBug, fullCodeWithoutBug)

        extractor = CodeExtractor(bugData)
        try:
            extractor.extractAll(fullCodeWithBug, diff)
        except ValueError:
            return None

        bugCodeFragment = extractor.getBugCodeFragment()
        fixCodeFragment = extractor.getFixCodeFragment()

        usedDiffs = extractor.getUsedDiffs()
        #Easy version - ignore bug if none or more than one diff used to fix
        #TODO: Possible improvement here
        if len(usedDiffs) != 1:
            return None
        return entities.FixData(bugCodeFragment, fixCodeFragment,
                                bugData.getChecker(), bugData.getMessage(),
                                bugData.getLine() - bugData.getStartLine())

    def _recordVersionAndCleanUp(self):
        """Store the current commit as last processed and clean up after it."""
        print('Storing version information... ', end='')
        self.db.storeLastCommit(self.commits[self.currentCommitIndex])
        print('done')
        print('Cleaning up tmp directory... ', end='')
        shutil.rmtree(config.getTmpDir())
        print('done')
        print('Cleaning up working directory... ', end='')
        self.codeChecker.clean()
        print('done')

    def prepareEnv(self, clean=False):
        """Open the database, load the commit list and, if clean, seed it.

        Args:
            clean: When true, the database is emptied and the first version
                is checked out, analysed and stored as the baseline.
        """
        print('Preparing train db... ', end='')
        self.prepareDb(clean)
        print('done')
        print('Loading commit list... ', end='')
        self.loadCommitList(clean)
        print('done')
        if not clean:
            return
        print('Checking out to root... ', end='')
        self.checkoutToNextVersion()
        print('done')
        print('Initial analysis... ', end='')
        self.codeChecker.check(True)
        print('done')
        print('Storing initial results... ', end='')
        self.codeChecker.store(self.commits[self.currentCommitIndex])
        print('done')
        self._recordVersionAndCleanUp()

    def findAndStoreFixDataForVersion(self):
        """Analyse the checked-out version and store fix data for resolved bugs.

        Stored records are committed in batches and once more after the last
        report; the version is then recorded as the last processed commit.
        """
        currentCommit = self.commits[self.currentCommitIndex]
        print('Analyzing version', currentCommit, '... ', end='')
        self.codeChecker.check(True)
        print('done')
        print('Getting list of resolved bugs for version',
              currentCommit,
              '... ',
              end='')
        ids = self.getDiffResolvedIds()
        print('done')
        allBugs = len(ids)
        anyStored = False
        for bugNo, bugId in enumerate(ids, start=1):
            print('Parsing data for bug ({0}/{1}, #{2})...'.format(
                bugNo, allBugs, bugId),
                  sep='',
                  end='')
            fixData = self.extractCode(bugId)
            print('done')
            if fixData is not None:
                print('Storing fixData... ', end='')
                self.db.store(fixData.getBugCode(), fixData.getFixCode(),
                              fixData.getChecker(), fixData.getMessage(),
                              fixData.getLine())
                anyStored = True
                print('done')
            # The batch boundary is counted one past the bug just handled,
            # i.e. a commit happens after bugs 99, 199, ...
            if (bugNo + 1) % 100 == 0 and anyStored:
                self.db.commit()
                anyStored = False
        if anyStored:
            self.db.commit()
        print('Storing CodeChecker results for this version... ', end='')
        self.codeChecker.store(currentCommit)
        print('done')
        self._recordVersionAndCleanUp()

    def iterateThroughVcsHistory(self):
        """Process every remaining version until the cursor runs out."""
        while self.checkoutToNextVersion():
            self.findAndStoreFixDataForVersion()

    def checkoutToTop(self):
        """Check out the configured branch again."""
        self.vcs.checkout(config.getBranch())

    def build(self, clean=False):
        """Run the whole build and always return the repo to the branch.

        Args:
            clean: Passed to ``prepareEnv``; when true the database is
                rebuilt from scratch.

        Any exception raised while preparing or iterating propagates to the
        caller, but the configured branch is checked out first so the
        working copy is not left on a historical commit.
        """
        try:
            self.prepareEnv(clean)
            self.iterateThroughVcsHistory()
        finally:
            self.checkoutToTop()
 def __init__(self):
     """Open the CodeFixes database and create the lexer and checker helpers."""
     cfDbPath = config.getCfDbFile()
     self.db = CFDatabase(cfDbPath)
     self.lexer = CxxLexer()
     self.checkers = Checkers()
class DictionaryBuilder():
    """Builds the label dictionary file for a single checker."""

    def __init__(self):
        """Open the CodeFixes database and create the lexer and checker helpers."""
        cfDbPath = config.getCfDbFile()
        self.db = CFDatabase(cfDbPath)
        self.lexer = CxxLexer()
        self.checkers = Checkers()

    @staticmethod
    def _sameToken(bugToken, fixToken):
        """Tell whether two token dicts share id, value flag and value."""
        sameKind = (bugToken['token'] == fixToken['token']
                    and bugToken['has_value'] == fixToken['has_value'])
        if not sameKind:
            return False
        if bugToken['has_value']:
            return bugToken['value'] == fixToken['value']
        return True

    def build(self, checker):
        """Collect labels for ``checker`` and write them to its dictionary file.

        Labels are gathered from four sources: the empty value plus the UNK
        placeholders, values seen more often than ``config.cfLabelThreshold``,
        values of tokens that appear only in the fixed code, and a ``T_<id>``
        entry for every token id that was used.  The de-duplicated list is
        written as JSON.  Progress is printed to stdout, per-record progress
        to stderr.

        Args:
            checker: Checker name used to select records and to format the
                output file name.

        Returns:
            None.  Nothing is written when no records exist for the checker.
        """
        # Load all data from DB
        print("Fetching data from database...")
        records = self.db.getFixDataForChecker(checker)
        recordCount = len(records)
        print("Done, fetched {0} records".format(recordCount))
        if recordCount < 1:
            print("No data found")
            return

        # Tokenize every bug/fix snippet, add the tokens extracted from the
        # checker message, and gather occurrence statistics on the way.
        print("Converting to tokens...")
        addedTokenGroups = deque()
        labelCounts = {}
        totalTokens = 0
        shortestBug = shortestFix = 9999
        longestBug = longestFix = 0

        tokenUses = dict.fromkeys(range(globals.firstAvailableToken), 0)
        for reservedId in (0, 349, 351):  # T_ZERO, T_SOS, T_UNK
            tokenUses[reservedId] = 1

        for recordNo, record in enumerate(records, start=1):
            bugTokens = self.lexer.tokenize(record[1])
            fixTokens = self.lexer.tokenize(record[2])
            checkerTokens = self.checkers.extractTokensForChecker(
                checker, record[4])

            # Tokens of the fixed code that have no counterpart in the bug
            # code.  Every bug token is compared, as in a plain nested loop.
            addedTokens = [
                fixToken for fixToken in fixTokens
                if not any([
                    self._sameToken(bugToken, fixToken)
                    for bugToken in bugTokens
                ])
            ]
            bugLen = len(bugTokens)
            fixLen = len(fixTokens)

            # Statistics
            shortestBug = min(shortestBug, bugLen)
            shortestFix = min(shortestFix, fixLen)
            longestBug = max(longestBug, bugLen)
            longestFix = max(longestFix, fixLen)

            # Count occurrences of each label and each token id
            recordTokens = bugTokens + fixTokens + checkerTokens
            for token in recordTokens:
                if token['has_value']:
                    label = token['value']
                else:
                    label = globals.emptyValue
                labelCounts[label] = labelCounts.get(label, 0) + 1
                tokenUses[int(token['token'])] += 1
                totalTokens += 1
            if addedTokens:
                addedTokenGroups.append(addedTokens)
            print('Done {0}, processed {1} tokens ({2}/{3}/{4}/{5})'.format(
                recordNo, len(recordTokens), bugLen, fixLen,
                len(checkerTokens), len(addedTokens)),
                  file=sys.stderr)
        print("Done, converted {0} tokens".format(totalTokens))

        # Labelizing: the empty value always comes first, then the UNK slots
        labelDb = [globals.emptyValue]
        print("Adding UNK token labels")
        labelDb.extend("UNK_{0}".format(unkNo)
                       for unkNo in range(config.cfNoOfUnkTokens))
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # Common occurrences
        print("Filtering labels, selecting only those with > {0} occurrences".
              format(config.cfLabelThreshold))
        labelDb.extend(label for label, count in labelCounts.items()
                       if count > config.cfLabelThreshold)
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # New tokens in fixed code
        print("Filtering labels, selecting only tokens introduced with fix")
        labelDb.extend(token['value'] for group in addedTokenGroups
                       for token in group if token['has_value'])
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # STL part

        # Token IDs
        usedTokenIds = [
            tokenId for tokenId in range(globals.firstAvailableToken)
            if tokenUses[tokenId] > 0
        ]
        labelDb.extend("T_{0}".format(tokenId) for tokenId in usedTokenIds)

        # Printout
        print("Uniqueing labels")
        uniqueLabels = list(set(labelDb))
        print("Done, current label DB has {0} entries".format(
            len(uniqueLabels)))
        print("Data set info")
        print("Min no of tokens (bug): {0}".format(shortestBug))
        print("Min no of tokens (fix): {0}".format(shortestFix))
        print("Max no of tokens (bug): {0}".format(longestBug))
        print("Max no of tokens (fix): {0}".format(longestFix))
        print("Extracted labels:")
        print(uniqueLabels)
        print("Token uses:")
        for tokenId in usedTokenIds:
            print("{0}: {1}".format(tokenId, tokenUses[tokenId]))

        # Save to file
        print("Writing to dictionary file")
        dictionaryPath = config.cfDictFilenameFormat.format(checker)
        with open(dictionaryPath, "w") as dictionaryFile:
            dictionaryFile.write(json.dumps(uniqueLabels))
        print("Done, exiting...")
Example #9
0
class LearningDataBuilder():
    """Encodes stored bug/fix pairs of one checker into a training file."""

    def __init__(self):
        """Open the CodeFixes database and create the checker helper."""
        cfDbPath = config.getCfDbFile()
        self.db = CFDatabase(cfDbPath)
        self.checkers = Checkers()

    def build(self, checker):
        """Encode every record of ``checker`` and write the usable ones.

        Each record is encoded twice, bug code first and fix code second,
        with the fix encoding reusing the UNK list returned by the bug
        encoding.  A record is skipped when either encoding contains ``-1``
        or when a length exceeds the limits returned by
        ``getModelStatsForChecker``.  Kept records are written as one JSON
        object per line with keys ``x`` and ``y``.

        Args:
            checker: Checker name used to load the dictionary, select the
                records and format the output file name.

        Returns:
            None.  No file is opened when no records exist for the checker.
        """
        # Initialize coder
        print("Initializing coder...")
        self.dictionary = Dictionary(checker)
        self.coder = Coder(self.dictionary)

        # Load all data from DB
        print("Fetching data from database...")
        records = self.db.getFixDataForChecker(checker)
        recordCount = len(records)
        print("Done, fetched {0} records".format(recordCount))
        if recordCount < 1:
            print("No data found")
            return

        # Encode all data
        print("Encoding all data and writing to output file...")
        limits = self.checkers.getModelStatsForChecker(checker)
        (maxBug, maxFix, maxUnk) = limits
        trainPath = config.cfTrainFilenameFormat.format(checker)
        with open(trainPath, 'w') as trainFile:
            for recordNo, record in enumerate(records, start=1):
                checkerInfo = self.checkers.extractTokensForChecker(
                    checker, record[4])
                bugCodes, bugUnks = self.coder.encode(
                    record[1], checkerData=checkerInfo)
                fixCodes, allUnks = self.coder.encode(
                    record[2], unkList=bugUnks, reverse=False)

                # Pick the report line first; only the last case is kept.
                keep = False
                if -1 in bugCodes:
                    report = "{0}: [{2} - {3} ({1})] Some tokens were not parsed (bug), ignoring (lenUnk = {1})"
                elif -1 in fixCodes:
                    report = "{0}: [{2} - {3} ({1})] Some tokens were not parsed (fix), ignoring (lenUnk = {1})"
                elif (len(bugCodes) > maxBug or len(fixCodes) > maxFix
                      or len(allUnks) > maxUnk):
                    report = "{0}: [{2} - {3} ({1})] Some tokens were not parsed (lengths), ignoring (lenUnk = {1})"
                else:
                    report = "{0}: [{2} - {3} ({1})] Done (lenUnk = {1})"
                    keep = True

                print(
                    report.format(recordNo, len(allUnks), len(bugCodes),
                                  len(fixCodes)))
                if keep:
                    sample = {'x': bugCodes, 'y': fixCodes}
                    trainFile.write(json.dumps(sample) + '\n')

                print('Done {0}'.format(recordNo), file=sys.stderr)

        print("All done, exiting...")
Example #10
0
 def testDatabaseOpenExistingSuccess(self):
     """Opening the existing ``test.sqlite`` must yield at least one record."""
     existingDb = CFDatabase(config.getRepoDir() + "/test.sqlite")
     storedRows = existingDb.getAllFixData()
     self.assertGreater(len(storedRows), 0)
     del existingDb
Example #11
0
    def testBuildTrainDbIncremental(self):
        """Build on a shorter branch first, then extend incrementally.

        A clean build on ``trainDbScriptIncrementalTest`` must store one fix
        record.  A second, non-clean build on the original branch must add
        the remaining record and advance the last commit.
        """
        originalBranch = config.getBranch()
        # The branch is global configuration: restore it even if an
        # assertion below fails, so other tests do not inherit it.
        self.addCleanup(config.setBranch, originalBranch)

        dbPath = config.getCfDbFile()
        db = CFDatabase(dbPath)
        db.clean()
        self.assertEqual(len(db.getAllFixData()), 0)
        del db

        # Step 1: clean build on the shorter branch.
        config.setBranch('trainDbScriptIncrementalTest')
        buildTestDB.main(True)
        db = CFDatabase(dbPath)
        self.assertEqual(len(db.getAllFixData()), 1)
        self.assertEqual(db.getAllFixData()[0][3], 'deadcode.DeadStores')
        self.assertEqual(db.getLastCommit(),
                         'add691cf37da6c9d40666eac1bc8c1afda071c77')
        del db

        # Step 2: incremental build on the original branch.
        config.setBranch(originalBranch)
        buildTestDB.main(False)
        db = CFDatabase(dbPath)
        self.assertEqual(len(db.getAllFixData()), 2)
        self.assertEqual(db.getAllFixData()[0][3], 'deadcode.DeadStores')
        self.assertEqual(db.getAllFixData()[1][3], 'core.DivideZero')
        self.assertEqual(db.getLastCommit(),
                         'f2917b938f0ecbc62ad48101d034369a1ae61a19')
Example #12
0
 def setUpClass(self):
     """Open the CodeFixes database and set up the checker bookkeeping.

     NOTE(review): no ``@classmethod`` decorator is visible above this
     definition in this view; unittest expects ``setUpClass`` to be a
     classmethod -- confirm against the full file.
     """
     print("Starting up...")
     cfDbPath = config.getCfDbFile()
     self.db = CFDatabase(cfDbPath)
     self.checkers = Checkers()
     # Start at the first (and only) configured checker.
     self.checkerIndex = 0
     self.checkerList = ['deadcode.DeadStores']