def main(): originalDir = os.path.dirname(os.path.dirname( os.path.abspath(__file__))) # vuddy root directory vulsDir = os.path.join(originalDir, "vul") arg_parser = argparse.ArgumentParser() arg_parser.add_argument('REPO', help='''Repository name''') arg_parser.add_argument('-a', '--abstract-level', required=True, type=int, nargs=1, choices=[0, 4], help='''Abstract Level''') args = arg_parser.parse_args() projName = args.REPO intendedAbsLvl = 4 if args.abstract_level: intendedAbsLvl = args.abstract_level[0] projDictList = [] hashFileMapList = [] for i in range(0, 5): projDictList.append({}) hashFileMapList.append({}) print "loading source", srcFileList = parser.loadVul(os.path.join(vulsDir, projName)) print "(done)" time0 = time.time() numFiles = len(srcFileList) numFuncs = 0 numLines = 0 pool = mp.Pool() func = partial(parse_function, intendedAbsLvl) for srcFileIdx, returnTuple in enumerate(pool.imap(func, srcFileList)): srcFile = returnTuple[0] functionInstanceList = returnTuple[1] functionInstanceList_New = returnTuple[2] print srcFileIdx + 1, '/', len(srcFileList), srcFile numFuncs += len(functionInstanceList) if len(functionInstanceList) > 0: numLines += functionInstanceList[0].parentNumLoc for fi, f in enumerate(functionInstanceList): f.removeListDup() path = f.parentFile path = "." + path[f.parentFile.find("/vul/"):] absBody = parser.abstract(f, intendedAbsLvl)[1] absBody = parser.normalize(absBody) # print absBody funcLen = len(absBody) # print funcLen, absBody # print len(absBody) hashValue = hashlib.md5(absBody).hexdigest() if intendedAbsLvl == 4 and len(functionInstanceList_New) > 0: fnew = functionInstanceList_New[fi] fnew.removeListDup() absBodyNew = parser.abstract(fnew, intendedAbsLvl)[1] absBodyNew = parser.normalize(absBodyNew) hashValueNew = hashlib.md5(absBodyNew).hexdigest() if hashValue == hashValueNew: # if abstract bodies of old and new func are identical, # don't create hash index continue try: projDictList[intendedAbsLvl][funcLen].append(hashValue) except KeyError: projDictList[intendedAbsLvl][funcLen] = [hashValue] try: hashFileMapList[intendedAbsLvl][hashValue].extend( [path, f.funcId]) except KeyError: hashFileMapList[intendedAbsLvl][hashValue] = [path, f.funcId] packageInfo = config.version + ' ' + str(projName) + ' ' + str( numFiles) + ' ' + str(numFuncs) + ' ' + str(numLines) + '\n' hidxDir = os.path.join(originalDir, "hidx") if os.path.exists(hidxDir) is False: os.makedirs(hidxDir) hidxFile = os.path.join( hidxDir, "hashmark_{0}_{1}.hidx".format(intendedAbsLvl, projName)) with open(hidxFile, 'w') as fp: fp.write(packageInfo) for key in sorted(projDictList[intendedAbsLvl]): fp.write(str(key) + '\t') for h in list(set(projDictList[intendedAbsLvl][key])): fp.write(h + '\t') fp.write('\n') fp.write('\n=====\n') for key in sorted(hashFileMapList[intendedAbsLvl]): fp.write(str(key) + '\t') for f in hashFileMapList[intendedAbsLvl][key]: fp.write(str(f) + '\t') fp.write('\n') print "Hash index saved to:", os.path.join( originalDir, "hidx", "hashmark_{0}_{1}.hidx".format(intendedAbsLvl, projName)) time1 = time.time() print "Elapsed time:", time1 - time0
body = getBody(pu.removeComment(raw)) if body.count(";") == 1: kill = 1 # this function must be single-line else: kill = 0 cnt = 0 for line in body.split('\n'): if len(line.strip()) > 0: cnt += 1 # cnt will be 1 for single lined functions with open(os.path.join(vulsDir, dir, vul[:-8] + "_NEW.vul"), 'r') as fp: newraw = ''.join(fp.readlines()) newbody = getBody(pu.removeComment(newraw)) if kill == 1 or cnt == 1 or pu.normalize(body) == pu.normalize( newbody) or len(newraw) == 0: vulBase = vul[:-8] os.remove(os.path.join(vulsDir, dir, vulBase + "_OLD.vul")) os.remove(os.path.join(vulsDir, dir, vulBase + "_NEW.vul")) os.remove(os.path.join(vulsDir, dir, vulBase + ".patch")) try: rmcntDict[dir] += 1 except: rmcntDict[dir] = 1 for dir in rmcntDict: print "removed", rmcntDict[dir], "FP records from", dir
def source_from_cvepatch( ctr, diffFileName): # diffFileName holds the filename of each DIFF patch # diffFileName looks like: CVE-2012-2372_7a9bc620049fed37a798f478c5699a11726b3d33.diff global repoName global debugMode global total global multimodeFlag global dummyFunction global diffDir global originalDir chunksCnt = 0 # number of DIFF patches currentCounter = 0 with ctr.diffFileCntLock: currentCounter = ctr.diffFileCnt.value print str(ctr.diffFileCnt.value + 1) + '/' + str(total), ctr.diffFileCnt.value += 1 if os.path.getsize(os.path.join(diffDir, repoName, diffFileName)) > 1000000: # don't do anything with big DIFFs (merges, upgrades, ...). print "[-]", diffFileName, "\t(file too large)" else: diffFileNameSplitted = diffFileName.split('_') cveId = diffFileNameSplitted[0] # use only one CVEid commitHashValue = diffFileNameSplitted[-1].split('.')[0] print "[+]", diffFileName, "\t(proceed)" with open(os.path.join(diffDir, repoName, diffFileName), 'r') as fp: patchLines = ''.join(fp.readlines()) patchLinesSplitted = re.split(pat_src, patchLines) commitLog = patchLinesSplitted[0] affectedFilesList = patchLinesSplitted[1:] repoPath = '' if multimodeFlag: # multimode DIFFs have repoPath at the beginning. repoPath = commitLog.split('\n')[0].rstrip() numAffectedFiles = len(affectedFilesList) for aidx, affectedFile in enumerate(affectedFilesList): if debugMode: print "\tFile # " + str(aidx + 1) + '/' + str(numAffectedFiles), firstLine = affectedFile.split('\n')[ 0] # git --diff a/path/filename.ext b/path/filename.ext affectedFileName = firstLine.split("--git ")[1].split( " ")[0].split("/")[-1] codePath = firstLine.split(' b')[1].strip() # path/filename.ext if not codePath.endswith(".c") and not codePath.endswith( ".cpp") and not codePath.endswith( ".cc") and not codePath.endswith( ".c++") and not codePath.endswith(".cxx"): if debugMode: print "\t[-]", codePath, "(wrong extension)" else: secondLine = affectedFile.split('\n')[1] if secondLine.startswith( "index" ) == 0: # or secondLine.endswith("100644") == 0: if debugMode: print "\t[-]", codePath, "(invalid metadata)" # we are looking for "index" only. else: if debugMode: print "\t[+]", codePath indexHashOld = secondLine.split(' ')[1].split('..')[0] indexHashNew = secondLine.split(' ')[1].split('..')[1] chunksList = re.split( pat_chunk, affectedFile)[1:] # diff file per chunk (in list) chunksCnt += len(chunksList) if multimodeFlag: os.chdir( os.path.join(config.gitStoragePath, repoName, repoPath)) else: os.chdir(os.path.join(config.gitStoragePath, repoName)) tmpOldFileName = os.path.join( originalDir, "tmp", "{0}_{1}_old".format(repoName, currentCounter)) command_show = "\"{0}\" show {1} > {2}".format( config.gitBinary, indexHashOld, tmpOldFileName) os.system(command_show) tmpNewFileName = os.path.join( originalDir, "tmp", "{0}_{1}_new".format(repoName, currentCounter)) command_show = "\"{0}\" show {1} > {2}".format( config.gitBinary, indexHashNew, tmpNewFileName) os.system(command_show) os.chdir(originalDir) oldFunctionInstanceList = parseutility.parseFile_shallow( tmpOldFileName, "") newFunctionInstanceList = parseutility.parseFile_shallow( tmpNewFileName, "") finalOldFunctionList = [] numChunks = len(chunksList) for ci, chunk in enumerate(chunksList): if debugMode: print "\t\tChunk # " + str(ci + 1) + "/" + str( numChunks), chunkSplitted = chunk.split('\n') chunkFirstLine = chunkSplitted[0] chunkLines = chunkSplitted[1:] if debugMode: print chunkFirstLine lineNums = pat_linenum.search(chunkFirstLine) oldLines = lineNums.group(1).split(',') newLines = lineNums.group(2).split(',') offset = int(oldLines[0]) pmList = [] lnList = [] for chunkLine in chunkSplitted[1:]: if len(chunkLine) != 0: pmList.append(chunkLine[0]) for i, pm in enumerate(pmList): if pm == ' ' or pm == '-': lnList.append(offset + i) elif pm == '+': lnList.append(offset + i - 1) offset -= 1 """ HERE, ADD CHECK FOR NEW FUNCTIONS """ hitOldFunctionList = [] for f in oldFunctionInstanceList: # print f.lines[0], f.lines[1] for num in range(f.lines[0], f.lines[1] + 1): if num in lnList: # print "Hit at", num hitOldFunctionList.append(f) break # found the function to be patched # if f.lines[0] <= offset <= f.lines[1]: # print "\t\t\tOffset HIT!!", f.name # elif f.lines[0] <= bound <= f.lines[1]: # print "\t\t\tBound HIT!!", f.name for f in hitOldFunctionList: # print "Verify hitFunction", f.name # print "ln", for num in range(f.lines[0], f.lines[1] + 1): # print num, try: listIndex = lnList.index(num) except ValueError: pass else: if lnList.count(num) > 1: listIndex += 1 # print "\nmatch:", num # print "value\t", chunkSplitted[1:][lnList.index(num)] # print "pm \t", pmList[lnList.index(num)] if pmList[listIndex] == '+' or pmList[ listIndex] == '-': # print "Maybe meaningful", flag = 0 for commentKeyword in [ "/*", "*/", "//", "*" ]: if chunkLines[listIndex][ 1:].lstrip().startswith( commentKeyword): flag = 1 break if flag: pass # print "but not." else: # print "MEANINGFUL!!" finalOldFunctionList.append(f) break else: pass # print "Not meaningful" # print "============\n" finalOldFunctionList = list( set(finalOldFunctionList)) # sometimes list has dups finalNewFunctionList = [] for fold in finalOldFunctionList: flag = 0 for fnew in newFunctionInstanceList: if fold.name == fnew.name: finalNewFunctionList.append(fnew) flag = 1 break if not flag: finalNewFunctionList.append(dummyFunction) if debugMode: print "\t\t\t", len( finalNewFunctionList), "functions found." vulFileNameBase = diffFileName.split( '.diff')[0] + '_' + affectedFileName # os.chdir(os.path.join(originalDir, "vul", repoName)) for index, f in enumerate(finalOldFunctionList): os.chdir(originalDir) oldFuncInstance = finalOldFunctionList[index] fp = open(oldFuncInstance.parentFile, 'r') srcFileRaw = fp.readlines() fp.close() finalOldFunction = ''.join( srcFileRaw[oldFuncInstance.lines[0] - 1:oldFuncInstance.lines[1]]) # oldFuncArgs = '' # for ai, funcArg in enumerate(oldFuncInstance.parameterList): # oldFuncArgs += "DTYPE " + funcArg # if ai + 1 != len(oldFuncInstance.parameterList): # oldFuncArgs += ', ' # finalOldFunction = "DTYPE {0} ({1})\n{{ {2}\n}}"\ # .format(oldFuncInstance.name, oldFuncArgs, oldFuncInstance.funcBody) finalOldFuncId = str(oldFuncInstance.funcId) newFuncInstance = finalNewFunctionList[index] if newFuncInstance.name is None: finalNewFunction = "" else: fp = open(newFuncInstance.parentFile, 'r') srcFileRaw = fp.readlines() fp.close() finalNewFunction = ''.join( srcFileRaw[newFuncInstance.lines[0] - 1:newFuncInstance.lines[1]]) # finalNewFunction = finalNewFunctionList[index].funcBody finalOldBody = finalOldFunction[ finalOldFunction.find('{') + 1:finalOldFunction.rfind('}')] finalNewBody = finalNewFunction[ finalNewFunction.find('{') + 1:finalNewFunction.rfind('}')] tmpold = parseutility.normalize( parseutility.removeComment(finalOldBody)) tmpnew = parseutility.normalize( parseutility.removeComment(finalNewBody)) if tmpold != tmpnew and len(tmpnew) > 0: # if two are same, it means nothing but comment is patched. with ctr.functionCntLock: ctr.functionCnt.value += 1 os.chdir(os.path.join(originalDir, "vul", repoName)) vulOldFileName = vulFileNameBase + '_' + finalOldFuncId + "_OLD.vul" vulNewFileName = vulFileNameBase + '_' + finalOldFuncId + "_NEW.vul" with open(vulOldFileName, 'w') as fp: fp.write(finalOldFunction) with open(vulNewFileName, 'w') as fp: if finalNewFunctionList[ index].name is not None: fp.write(finalNewFunction) else: fp.write("") diffCommand = "\"{0}\" -u {1} {2} >> {3}_{4}.patch".format( config.diffBinary, vulOldFileName, vulNewFileName, vulFileNameBase, finalOldFuncId) os.system(diffCommand)
vulsDir = os.path.join(originalDir, "vul") dirs = os.listdir(vulsDir) os.chdir(vulsDir) for d in dirs: if os.path.isdir(d): repolist.append(d) cntdict[d] = 0 # print d # print repolist vulcntlist.append(len(os.listdir(d))) # print vulcntlist for vul in os.listdir(d): if vul.endswith("OLD.vul"): with open(os.path.join(d, vul), "r") as fp: text = '\n'.join(fp.readlines()) text = normalize(text) checksum = hashlib.md5(text).hexdigest() try: hashdict[checksum].append(d + ' ' + vul) except: hashdict[checksum] = [d + ' ' + vul] cnt = 0 for key in hashdict: if len(hashdict[key]) > 1: for vul in hashdict[key][1:]: cnt += 1 repo = vul.split(' ')[0] rest = vul.split(' ')[1] base = rest[:-8]