Ejemplo n.º 1
0
def main():
    originalDir = os.path.dirname(os.path.dirname(
        os.path.abspath(__file__)))  # vuddy root directory
    vulsDir = os.path.join(originalDir, "vul")

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('REPO', help='''Repository name''')
    arg_parser.add_argument('-a',
                            '--abstract-level',
                            required=True,
                            type=int,
                            nargs=1,
                            choices=[0, 4],
                            help='''Abstract Level''')

    args = arg_parser.parse_args()

    projName = args.REPO
    intendedAbsLvl = 4
    if args.abstract_level:
        intendedAbsLvl = args.abstract_level[0]

    projDictList = []
    hashFileMapList = []
    for i in range(0, 5):
        projDictList.append({})
        hashFileMapList.append({})

    print "loading source",
    srcFileList = parser.loadVul(os.path.join(vulsDir, projName))
    print "(done)"

    time0 = time.time()

    numFiles = len(srcFileList)
    numFuncs = 0
    numLines = 0

    pool = mp.Pool()
    func = partial(parse_function, intendedAbsLvl)
    for srcFileIdx, returnTuple in enumerate(pool.imap(func, srcFileList)):
        srcFile = returnTuple[0]
        functionInstanceList = returnTuple[1]
        functionInstanceList_New = returnTuple[2]

        print srcFileIdx + 1, '/', len(srcFileList), srcFile
        numFuncs += len(functionInstanceList)
        if len(functionInstanceList) > 0:
            numLines += functionInstanceList[0].parentNumLoc

        for fi, f in enumerate(functionInstanceList):
            f.removeListDup()
            path = f.parentFile
            path = "." + path[f.parentFile.find("/vul/"):]
            absBody = parser.abstract(f, intendedAbsLvl)[1]
            absBody = parser.normalize(absBody)
            # print absBody
            funcLen = len(absBody)
            # print funcLen, absBody
            # print len(absBody)
            hashValue = hashlib.md5(absBody).hexdigest()

            if intendedAbsLvl == 4 and len(functionInstanceList_New) > 0:
                fnew = functionInstanceList_New[fi]
                fnew.removeListDup()
                absBodyNew = parser.abstract(fnew, intendedAbsLvl)[1]
                absBodyNew = parser.normalize(absBodyNew)
                hashValueNew = hashlib.md5(absBodyNew).hexdigest()

                if hashValue == hashValueNew:
                    # if abstract bodies of old and new func are identical,
                    # don't create hash index
                    continue

            try:
                projDictList[intendedAbsLvl][funcLen].append(hashValue)
            except KeyError:
                projDictList[intendedAbsLvl][funcLen] = [hashValue]

            try:
                hashFileMapList[intendedAbsLvl][hashValue].extend(
                    [path, f.funcId])
            except KeyError:
                hashFileMapList[intendedAbsLvl][hashValue] = [path, f.funcId]

    packageInfo = config.version + ' ' + str(projName) + ' ' + str(
        numFiles) + ' ' + str(numFuncs) + ' ' + str(numLines) + '\n'
    hidxDir = os.path.join(originalDir, "hidx")
    if os.path.exists(hidxDir) is False:
        os.makedirs(hidxDir)
    hidxFile = os.path.join(
        hidxDir, "hashmark_{0}_{1}.hidx".format(intendedAbsLvl, projName))
    with open(hidxFile, 'w') as fp:
        fp.write(packageInfo)
        for key in sorted(projDictList[intendedAbsLvl]):
            fp.write(str(key) + '\t')
            for h in list(set(projDictList[intendedAbsLvl][key])):
                fp.write(h + '\t')
            fp.write('\n')

        fp.write('\n=====\n')

        for key in sorted(hashFileMapList[intendedAbsLvl]):
            fp.write(str(key) + '\t')
            for f in hashFileMapList[intendedAbsLvl][key]:
                fp.write(str(f) + '\t')
            fp.write('\n')

    print "Hash index saved to:", os.path.join(
        originalDir, "hidx",
        "hashmark_{0}_{1}.hidx".format(intendedAbsLvl, projName))
    time1 = time.time()
    print "Elapsed time:", time1 - time0
Ejemplo n.º 2
0
                body = getBody(pu.removeComment(raw))

            # A body containing exactly one ';' is treated as a single-line
            # function and is dropped below.
            kill = 1 if body.count(";") == 1 else 0

            # Number of non-blank lines in the old body
            # (1 for single-lined functions).
            cnt = sum(1 for line in body.split('\n') if len(line.strip()) > 0)

            # The patched counterpart shares the name, with "_NEW.vul" in
            # place of the last eight characters of `vul`.
            with open(os.path.join(vulsDir, dir, vul[:-8] + "_NEW.vul"),
                      'r') as fp:
                newraw = fp.read()
            newbody = getBody(pu.removeComment(newraw))

            # Drop the record when the old function is a one-liner, when the
            # normalized old and new bodies are identical, or when the new
            # file is empty.
            if kill == 1 or cnt == 1 or pu.normalize(body) == pu.normalize(
                    newbody) or len(newraw) == 0:
                vulBase = vul[:-8]
                os.remove(os.path.join(vulsDir, dir, vulBase + "_OLD.vul"))
                os.remove(os.path.join(vulsDir, dir, vulBase + "_NEW.vul"))
                os.remove(os.path.join(vulsDir, dir, vulBase + ".patch"))
                # Count removals per directory without a catch-all except.
                rmcntDict[dir] = rmcntDict.get(dir, 0) + 1

for dir in rmcntDict:
    print "removed", rmcntDict[dir], "FP records from", dir
Ejemplo n.º 3
0
def source_from_cvepatch(
        ctr,
        diffFileName):  # diffFileName holds the filename of each DIFF patch
    # diffFileName looks like: CVE-2012-2372_7a9bc620049fed37a798f478c5699a11726b3d33.diff
    global repoName
    global debugMode
    global total
    global multimodeFlag
    global dummyFunction
    global diffDir
    global originalDir

    chunksCnt = 0  # number of DIFF patches
    currentCounter = 0

    with ctr.diffFileCntLock:
        currentCounter = ctr.diffFileCnt.value
        print str(ctr.diffFileCnt.value + 1) + '/' + str(total),
        ctr.diffFileCnt.value += 1

    if os.path.getsize(os.path.join(diffDir, repoName,
                                    diffFileName)) > 1000000:
        # don't do anything with big DIFFs (merges, upgrades, ...).
        print "[-]", diffFileName, "\t(file too large)"
    else:
        diffFileNameSplitted = diffFileName.split('_')
        cveId = diffFileNameSplitted[0]  # use only one CVEid
        commitHashValue = diffFileNameSplitted[-1].split('.')[0]

        print "[+]", diffFileName, "\t(proceed)"
        with open(os.path.join(diffDir, repoName, diffFileName), 'r') as fp:
            patchLines = ''.join(fp.readlines())
            patchLinesSplitted = re.split(pat_src, patchLines)
            commitLog = patchLinesSplitted[0]
            affectedFilesList = patchLinesSplitted[1:]

        repoPath = ''
        if multimodeFlag:  # multimode DIFFs have repoPath at the beginning.
            repoPath = commitLog.split('\n')[0].rstrip()

        numAffectedFiles = len(affectedFilesList)
        for aidx, affectedFile in enumerate(affectedFilesList):
            if debugMode:
                print "\tFile # " + str(aidx +
                                        1) + '/' + str(numAffectedFiles),
            firstLine = affectedFile.split('\n')[
                0]  # git --diff a/path/filename.ext b/path/filename.ext
            affectedFileName = firstLine.split("--git ")[1].split(
                " ")[0].split("/")[-1]
            codePath = firstLine.split(' b')[1].strip()  # path/filename.ext

            if not codePath.endswith(".c") and not codePath.endswith(
                    ".cpp") and not codePath.endswith(
                        ".cc") and not codePath.endswith(
                            ".c++") and not codePath.endswith(".cxx"):
                if debugMode:
                    print "\t[-]", codePath, "(wrong extension)"
            else:
                secondLine = affectedFile.split('\n')[1]

                if secondLine.startswith(
                        "index"
                ) == 0:  # or secondLine.endswith("100644") == 0:
                    if debugMode:
                        print "\t[-]", codePath, "(invalid metadata)"  # we are looking for "index" only.
                else:
                    if debugMode:
                        print "\t[+]", codePath
                    indexHashOld = secondLine.split(' ')[1].split('..')[0]
                    indexHashNew = secondLine.split(' ')[1].split('..')[1]

                    chunksList = re.split(
                        pat_chunk,
                        affectedFile)[1:]  # diff file per chunk (in list)
                    chunksCnt += len(chunksList)

                    if multimodeFlag:
                        os.chdir(
                            os.path.join(config.gitStoragePath, repoName,
                                         repoPath))
                    else:
                        os.chdir(os.path.join(config.gitStoragePath, repoName))

                    tmpOldFileName = os.path.join(
                        originalDir, "tmp",
                        "{0}_{1}_old".format(repoName, currentCounter))
                    command_show = "\"{0}\" show {1} > {2}".format(
                        config.gitBinary, indexHashOld, tmpOldFileName)
                    os.system(command_show)

                    tmpNewFileName = os.path.join(
                        originalDir, "tmp",
                        "{0}_{1}_new".format(repoName, currentCounter))
                    command_show = "\"{0}\" show {1} > {2}".format(
                        config.gitBinary, indexHashNew, tmpNewFileName)
                    os.system(command_show)

                    os.chdir(originalDir)
                    oldFunctionInstanceList = parseutility.parseFile_shallow(
                        tmpOldFileName, "")
                    newFunctionInstanceList = parseutility.parseFile_shallow(
                        tmpNewFileName, "")

                    finalOldFunctionList = []

                    numChunks = len(chunksList)
                    for ci, chunk in enumerate(chunksList):
                        if debugMode:
                            print "\t\tChunk # " + str(ci + 1) + "/" + str(
                                numChunks),

                        chunkSplitted = chunk.split('\n')
                        chunkFirstLine = chunkSplitted[0]
                        chunkLines = chunkSplitted[1:]

                        if debugMode:
                            print chunkFirstLine
                        lineNums = pat_linenum.search(chunkFirstLine)
                        oldLines = lineNums.group(1).split(',')
                        newLines = lineNums.group(2).split(',')

                        offset = int(oldLines[0])
                        pmList = []
                        lnList = []
                        for chunkLine in chunkSplitted[1:]:
                            if len(chunkLine) != 0:
                                pmList.append(chunkLine[0])

                        for i, pm in enumerate(pmList):
                            if pm == ' ' or pm == '-':
                                lnList.append(offset + i)
                            elif pm == '+':
                                lnList.append(offset + i - 1)
                                offset -= 1
                        """ HERE, ADD CHECK FOR NEW FUNCTIONS """
                        hitOldFunctionList = []
                        for f in oldFunctionInstanceList:
                            # print f.lines[0], f.lines[1]

                            for num in range(f.lines[0], f.lines[1] + 1):
                                if num in lnList:
                                    # print "Hit at", num

                                    hitOldFunctionList.append(f)
                                    break  # found the function to be patched

                                    # if f.lines[0] <= offset <= f.lines[1]:
                                    #     print "\t\t\tOffset HIT!!", f.name
                                    # elif f.lines[0] <= bound <= f.lines[1]:
                                    #     print "\t\t\tBound  HIT!!", f.name

                        for f in hitOldFunctionList:
                            # print "Verify hitFunction", f.name
                            # print "ln",
                            for num in range(f.lines[0], f.lines[1] + 1):
                                # print num,
                                try:
                                    listIndex = lnList.index(num)
                                except ValueError:
                                    pass
                                else:
                                    if lnList.count(num) > 1:
                                        listIndex += 1
                                    # print "\nmatch:", num
                                    # print "value\t", chunkSplitted[1:][lnList.index(num)]
                                    # print "pm   \t", pmList[lnList.index(num)]
                                    if pmList[listIndex] == '+' or pmList[
                                            listIndex] == '-':
                                        # print "Maybe meaningful",
                                        flag = 0
                                        for commentKeyword in [
                                                "/*", "*/", "//", "*"
                                        ]:
                                            if chunkLines[listIndex][
                                                    1:].lstrip().startswith(
                                                        commentKeyword):
                                                flag = 1
                                                break
                                        if flag:
                                            pass
                                            # print "but not."
                                        else:
                                            # print "MEANINGFUL!!"
                                            finalOldFunctionList.append(f)
                                            break
                                    else:
                                        pass
                                        # print "Not meaningful"
                                        # print "============\n"

                    finalOldFunctionList = list(
                        set(finalOldFunctionList))  # sometimes list has dups

                    finalNewFunctionList = []
                    for fold in finalOldFunctionList:
                        flag = 0
                        for fnew in newFunctionInstanceList:
                            if fold.name == fnew.name:
                                finalNewFunctionList.append(fnew)
                                flag = 1
                                break
                        if not flag:
                            finalNewFunctionList.append(dummyFunction)

                    if debugMode:
                        print "\t\t\t", len(
                            finalNewFunctionList), "functions found."
                    vulFileNameBase = diffFileName.split(
                        '.diff')[0] + '_' + affectedFileName

                    # os.chdir(os.path.join(originalDir, "vul", repoName))

                    for index, f in enumerate(finalOldFunctionList):
                        os.chdir(originalDir)
                        oldFuncInstance = finalOldFunctionList[index]

                        fp = open(oldFuncInstance.parentFile, 'r')
                        srcFileRaw = fp.readlines()
                        fp.close()
                        finalOldFunction = ''.join(
                            srcFileRaw[oldFuncInstance.lines[0] -
                                       1:oldFuncInstance.lines[1]])

                        # oldFuncArgs = ''
                        # for ai, funcArg in enumerate(oldFuncInstance.parameterList):
                        #     oldFuncArgs += "DTYPE " + funcArg
                        #     if ai + 1 != len(oldFuncInstance.parameterList):
                        #         oldFuncArgs += ', '
                        # finalOldFunction = "DTYPE {0} ({1})\n{{ {2}\n}}"\
                        #     .format(oldFuncInstance.name, oldFuncArgs, oldFuncInstance.funcBody)

                        finalOldFuncId = str(oldFuncInstance.funcId)

                        newFuncInstance = finalNewFunctionList[index]

                        if newFuncInstance.name is None:
                            finalNewFunction = ""
                        else:
                            fp = open(newFuncInstance.parentFile, 'r')
                            srcFileRaw = fp.readlines()
                            fp.close()
                            finalNewFunction = ''.join(
                                srcFileRaw[newFuncInstance.lines[0] -
                                           1:newFuncInstance.lines[1]])

                            # finalNewFunction = finalNewFunctionList[index].funcBody

                        finalOldBody = finalOldFunction[
                            finalOldFunction.find('{') +
                            1:finalOldFunction.rfind('}')]
                        finalNewBody = finalNewFunction[
                            finalNewFunction.find('{') +
                            1:finalNewFunction.rfind('}')]
                        tmpold = parseutility.normalize(
                            parseutility.removeComment(finalOldBody))
                        tmpnew = parseutility.normalize(
                            parseutility.removeComment(finalNewBody))

                        if tmpold != tmpnew and len(tmpnew) > 0:
                            # if two are same, it means nothing but comment is patched.
                            with ctr.functionCntLock:
                                ctr.functionCnt.value += 1
                            os.chdir(os.path.join(originalDir, "vul",
                                                  repoName))
                            vulOldFileName = vulFileNameBase + '_' + finalOldFuncId + "_OLD.vul"
                            vulNewFileName = vulFileNameBase + '_' + finalOldFuncId + "_NEW.vul"
                            with open(vulOldFileName, 'w') as fp:
                                fp.write(finalOldFunction)
                            with open(vulNewFileName, 'w') as fp:
                                if finalNewFunctionList[
                                        index].name is not None:
                                    fp.write(finalNewFunction)
                                else:
                                    fp.write("")
                            diffCommand = "\"{0}\" -u {1} {2} >> {3}_{4}.patch".format(
                                config.diffBinary, vulOldFileName,
                                vulNewFileName, vulFileNameBase,
                                finalOldFuncId)
                            os.system(diffCommand)
Ejemplo n.º 4
0
# Walk every repository directory under vul/, record it in the bookkeeping
# containers, and group its *OLD.vul files by the MD5 of their normalized
# text so that identical functions end up under the same checksum.
vulsDir = os.path.join(originalDir, "vul")
dirs = os.listdir(vulsDir)
os.chdir(vulsDir)  # entries of `dirs` are used as relative paths below
for d in dirs:
    if not os.path.isdir(d):
        continue
    repolist.append(d)
    cntdict[d] = 0
    vulFiles = os.listdir(d)
    vulcntlist.append(len(vulFiles))
    for vul in vulFiles:
        if not vul.endswith("OLD.vul"):
            continue
        with open(os.path.join(d, vul), "r") as fp:
            text = '\n'.join(fp.readlines())
        text = normalize(text)
        checksum = hashlib.md5(text).hexdigest()
        # Entries are stored as "<repo dir> <file name>".
        hashdict.setdefault(checksum, []).append(d + ' ' + vul)

cnt = 0

for key in hashdict:
    if len(hashdict[key]) > 1:
        for vul in hashdict[key][1:]:
            cnt += 1
            repo = vul.split(' ')[0]
            rest = vul.split(' ')[1]
            base = rest[:-8]