def generateExamples(dataset, predClasses):
    '''Print and collect example (before, after) pairs for every predicted class.

    Each class-ID in predClasses is mapped to its cluster label through
    dataset.dict_Index_Cluster. The data samples stored under that label in
    dataset.dict_ClusterRaw_DataSamples are flattened into (source, target)
    string pairs, and H.getTop_K(pairs, TOP_EG) selects the ones to show
    (per the original notes: the most frequent ones - helper not visible here).

    Returns the selected pairs, un-modified (new-lines intact), in print order.
    '''
    clusterLabels = [dataset.dict_Index_Cluster[classID] for classID in predClasses]
    # One-line rendering of the labels; used for display only
    flatLabels = [label.replace('\n', ' ') for label in clusterLabels]

    print('Top-{} Predicted Class-IDs:\n{}'.format(TOP_K, predClasses))
    print('TOP-{} Predicted Class-Labels (Error-IDs and Unique Repairs):\n{}\nExamples:'.format(TOP_K, flatLabels))

    examplesList = []
    for classID, label in zip(predClasses, clusterLabels):
        samples = dataset.dict_ClusterRaw_DataSamples[label]  # All training examples of this class
        pairs = [(H.joinList(sample.src, ''), H.joinList(sample.trgt, '')) for sample in samples]

        for rank, pair in enumerate(H.getTop_K(pairs, TOP_EG), 1):
            # Strip new-lines so that each example prints on a single line
            before = pair[0].replace('\n', '')
            after = pair[1].replace('\n', '')
            if not before.strip():
                before = '// Empty Line'
            if not after.strip():
                after = '// Empty Line'

            print('Eg #{} class-{} before: {}'.format(rank, classID, before))
            print('Eg #{} class-{} after : {}'.format(rank, classID, after))
            examplesList.append(pair)  # Keep the raw pair, not the prettified text

    return examplesList
def writeTypeKind():
    '''Dump every distinct (spelling, kind, cursor-type) token combination to ./data/TokenKind.csv.

    Reads ./data/subset-srcTrgtPairs.csv, tokenizes the 'sourceText' column of each row via
    Code(...).getTokens() / CToken, and prints a running row count with the row's 'sourceID'.
    '''
    dataDir = './data/'
    separator = '!@#$%'  # Joins kind and cursorType into a single dict key
    headers, lines = H.readCSV(dataDir + 'subset-srcTrgtPairs.csv')

    # {spell: {"<kind><separator><cursorType>": 0}} - inner dict is used as a set of unique keys
    seenBySpell = collections.defaultdict(dict)

    for rowNum, row in enumerate(lines, 1):
        codeObj = Code(row[headers.index('sourceText')])
        for rawToken in codeObj.getTokens():
            tok = CToken(rawToken, codeObj)
            seenBySpell[tok.spell][str(tok.kind) + separator + str(tok.cursorType)] = 0
        print('%s %s' % (rowNum, row[headers.index('sourceID')]))

    outRows = []
    for spell in seenBySpell:
        for kindType in seenBySpell[spell]:
            parts = kindType.split(separator)
            outRows.append([spell, parts[0], parts[1]])

    H.writeCSV(dataDir + 'TokenKind.csv', ['spell', 'kind', 'cursorTypeKind'], outRows)
def writeAbstractions():
    '''Append freshly computed source/target abstractions to rows of the src-target CSV.

    Reads ./data/subset-srcTrgtPairs.csv, adds the columns 'NEW_SrcAbs' and 'NEW_TrgtAbs'
    (the program abstraction of 'sourceText' and 'targetText', flattened with H.joinLL)
    and writes the result to ./data/subset-srcTrgtPairs_newAbs.csv.

    NOTE(review): only the first 10 rows are processed and written - looks like a
    debugging cap; confirm before relying on the output.
    '''
    baseName = './data/' + 'subset-srcTrgtPairs'
    headers, lines = H.readCSV(baseName + '.csv')
    headers.extend(['NEW_SrcAbs', 'NEW_TrgtAbs'])  # In-place, same list object as returned by readCSV

    outRows = []
    for rowNum, row in enumerate(lines[:10], 1):
        # Fetch both texts before the row is extended with the new columns
        texts = (row[headers.index('sourceText')], row[headers.index('targetText')])
        for text in texts:
            absLines = getProgAbstraction(Code(text))
            row.append(H.joinLL(absLines))  # The row itself is mutated, source first then target

        print('%s %s' % (rowNum, row[headers.index('sourceID')]))
        outRows.append(row)

    H.writeCSV(baseName + '_newAbs' + '.csv', headers, outRows)
def writeErrSets(fname):
    '''Compute the "ErrSet" column for every row of fname and write the file back in place.

    Invoke this function on a "clean" dataset - a dataset.csv which doesn't contain the ErrSet column.

    For each row the Clang error text is turned into an error-set by getErrSet, the row's
    abstract diffs are clustered under it (clusterErr), and errSet.calcIntersection is fed
    the compiler line numbers and the diff line numbers. Afterwards the CSV (fname), the
    error index (writeAllErrs) and the error/diff clusters (writeClusterErr) are written.
    '''
    headers, lines = H.readCSV(fname)
    headers.append("ErrSet")
    dictErrDiff = {}  # {CompErr1:ErrSet1, ...}
    allErrs = readAllErrors()

    numRows = len(lines)
    print('Total #src-target pairs= {}'.format(numRows))

    colErrClang = headers.index("errorClang")
    colErrLLVM = headers.index("errorLLVM")
    colLineNums = headers.index("lineNums_Abs")
    colDiffIns = headers.index("diffAbs_ins")
    colDiffDel = headers.index("diffAbs_del")

    for rowNum, row in enumerate(lines, 1):
        if rowNum % 1000 == 0:  # Progress report
            print('{} / {} done ...'.format(rowNum, len(lines)))

        diffsIns = row[colDiffIns].splitlines()
        diffsDel = row[colDiffDel].splitlines()

        rawClang = row[colErrClang]
        rawLLVM = row[colErrLLVM]
        diffLineNums = set(row[colLineNums].splitlines())

        # Normalize carriage-returns to new-lines
        errClang = rawClang.replace('\r', '\n')
        errLLVM = rawLLVM.replace('\r', '\n')  # Normalized like errClang, but not used below

        # Get the err-set (unique rep for set of errors); the middle return value is not used here
        errSet, _errExpList, compLineNums = getErrSet(allErrs, dictErrDiff, errClang)
        clusterErr(errSet, diffsIns, diffsDel)  # Cluster the diffs (add the diff to dictErrDiff)
        errSet.calcIntersection(compLineNums, diffLineNums)  # Update counts to calc precision-recall of compiler lineNums
        row.append(errSet.key)

    H.writeCSV(fname, headers, lines)
    writeAllErrs(allErrs)
    writeClusterErr(dictErrDiff)
def writeSummary(row):
    '''Append one result row to the summary CSV (fname_summary) via H.appendCSV.

    row is expected to line up, position by position, with the header names below.
    '''
    runInfo = ['time_Recorded', 'train_set', 'predict_set', 'totalNumPairs', 'Train,Valid,Test', 'numClasses']
    modelInfo = ['modelName', 'modelSummary', 'Pred@1,3,5', 'precision', 'recall', 'trainTime',
                 'max_seq_length', 'max_vocab_size']
    hyperParams = ['TRAIN_MULT_FACTOR', 'EPOCHS', 'EMBEDDING_VECTOR_LENGTH']
    details = ['classMapping_RawCluster', 'confusion_matrix', 'acc', 'loss', 'val_acc', 'val_loss']

    headers = runInfo + modelInfo + hyperParams + details
    H.appendCSV(fname_summary, headers, [row])
def getAbstractLiteral(self):
    '''Add the abstract token(s) describing this literal, chosen from its spelling and cursor type.

    - Quoted spelling (starts and ends with ' or "), or cursorType TypeKind.CONSTANTARRAY:
      emits the opening quote, then either every format specifier returned by
      self.extractFormatSpec or a single <kind>_CHAR / <kind>_STRING placeholder,
      then the closing quote. Nothing is emitted between the quotes when they are empty.
    - Spelling accepted by isInt / isFloat: emits <kind>_INT / <kind>_DOUBLE.
    - Anything else: emits <kind>_<cursorType> and, unless the cursor type is
      TypeKind.INVALID, reports the new literal type through H.errorLog.
    '''
    spell = self.spell
    quoteChars = ('\'', '"')

    # TypeKind.CONSTANTARRAY, or char / invalid literals that are still wrapped in quotes
    isQuoted = (self.cursorType == TypeKind.CONSTANTARRAY
                or (len(spell) >= 2 and spell[0] in quoteChars and spell[-1] in quoteChars))

    if isQuoted:
        self.addAbstract(spell[0], spell[0])  # Opening quote
        inner = spell[1:-1]
        if len(inner) > 0:  # Empty contents add nothing, which keeps '' / "" distinguishable
            formatSpecs = self.extractFormatSpec(inner)
            if len(formatSpecs) > 0:
                # Format specifiers replace the Char/String placeholder
                for spec in formatSpecs:
                    self.addAbstract(spec, spec)
            elif len(inner) == 1:
                self.addAbstract(str(self.kind) + '_CHAR', inner)  # Placeholder Literal_Char
            else:
                self.addAbstract(str(self.kind) + '_STRING', inner)  # Placeholder Literal_String
        self.addAbstract(spell[-1], spell[-1])  # Closing quote
        return

    if isInt(spell):
        self.addAbstract(str(self.kind) + '_INT', spell)  # Placeholder Literal_Int
        return

    if isFloat(spell):
        self.addAbstract(str(self.kind) + "_DOUBLE", spell)  # Placeholder Literal_Double
        return

    # Neither string nor int/float: fall back to the cursor type (can't abstract - mostly Invalid)
    literalType = str(self.kind) + '_' + str(self.cursorType)
    self.addAbstract(literalType, spell)
    if self.cursorType != TypeKind.INVALID:
        # Log the "special" type of literal so that it can be reviewed
        H.errorLog([['CodeID', self.codeID],
                    ['AbstractToken new literal-type', literalType],
                    ['lineNum', self.lineNum],
                    ['spell', spell]])
def setDict_Indices(self):
    '''Give every raw cluster and every label a 1-based index, and build the reverse maps.

    Keys are visited in the order produced by H.sortDictLen_Rev (per the original notes:
    "inverse length", then "key ascending" - helper not visible here). Each key receives
    the current size of its index dict plus one. dict_Index_Cluster and dict_Index_Label
    are then rebuilt as the inverses of dict_Cluster_Index and dict_Label_Index.
    '''
    clusterIndex = self.dict_Cluster_Index
    for clusterRaw, _samples in H.sortDictLen_Rev(self.dict_ClusterRaw_DataSamples):
        clusterIndex[clusterRaw] = len(clusterIndex) + 1

    labelIndex = self.dict_Label_Index
    for label, _samples in H.sortDictLen_Rev(self.dict_Label_DataSamples):
        labelIndex[label] = len(labelIndex) + 1

    # Reverse lookups: index -> cluster / label
    self.dict_Index_Cluster = dict((index, name) for name, index in self.dict_Cluster_Index.items())
    self.dict_Index_Label = dict((index, name) for name, index in self.dict_Label_Index.items())
def setDict_Cluster(self, fname, H_src=None, H_trgt=None):
    '''Load the dataset file and group its rows into self.dict_ClusterRaw_DataSamples.

    fname  : path of the dataset; read with pandas.read_excel when it contains '.xlsx',
             otherwise with H.readCSV.
    H_src  : optional header name of the source-text column, attached to each DataSample.
    H_trgt : optional header name of the target-text column, attached to each DataSample.

    The train / predict columns are located case-insensitively from self.trainSet and
    self.predictSet, as is the 'errset' column. Rows whose predict value is NaN are skipped.
    Finally the size filter, label assignment and index assignment steps are run.
    '''
    self.dict_ClusterRaw_DataSamples = {}

    if '.xlsx' in fname:
        frame = pandas.read_excel(fname, converters={'clusterID': str, 'subClassID': str})
        headers = frame.columns.tolist()
        lines = frame.values
    else:
        headers, lines = H.readCSV(fname)

    lowerHeaders = [name.lower() for name in headers]
    colTrain = lowerHeaders.index(self.trainSet.lower())
    colPredict = lowerHeaders.index(self.predictSet.lower())
    colErrSet = lowerHeaders.index('errset')

    for row in lines:
        trainRaw = row[colTrain]
        predictRaw = row[colPredict]
        errSet = row[colErrSet]

        # Source / target text are attached only when their headers were supplied
        src = row[headers.index(H_src)] if H_src else None
        trgt = row[headers.index(H_trgt)] if H_trgt else None

        if not (predictRaw == predictRaw):
            continue  # Empty cell: pandas yields NaN, which is never equal to itself

        sample = DataSample(str(trainRaw), errSet, str(predictRaw), self.predictSet, src, trgt)
        self.dict_ClusterRaw_DataSamples.setdefault(sample.clusterRaw, []).append(sample)

    # Once raw cluster/label dicts are created,
    # filter out small clusters, assign labels and indices to rest
    self.setDict_filterSize()
    self.setDict_labels()
    self.setDict_Indices()
def splitTrainTest(self):
    '''Split every cluster's data samples into train / validation / test partitions.

    Per cluster: the first ceil(CF.TRAIN_SPLIT * n) samples go to train, the next
    floor(CF.VALIDATION_SPLIT * n) to validation, and the remainder to test. The cluster
    index and the list of label indices are repeated once per sample in the matching
    y_* lists. The X_*_rawText lists are rebuilt from the sample lists at the end.

    NOTE(review): num_classes is stored as "cluster count + 1" whereas num_labels is the
    plain label count, yet both are printed minus one - confirm the asymmetry is intended.
    '''
    print(colored('\tSplitting Train+Test ...', 'magenta'))

    self.num_classes = len(self.dict_ClusterRaw_DataSamples) + 1
    self.num_labels = len(self.dict_Label_DataSamples)
    print('NumClasses= {}'.format(self.num_classes - 1))
    print('NumLabels= {}'.format(self.num_labels - 1))

    for clusterRaw, samples in H.sortDictLen_Rev(self.dict_ClusterRaw_DataSamples):
        clusterIndex = self.dict_Cluster_Index[clusterRaw]
        # Any sample will do: the labelList is the same for all samples of one clusterRaw
        labelIndices = [self.dict_Label_Index[label] for label in samples[0].labelList]

        total = len(samples)
        numTrain = int(math.ceil(CF.TRAIN_SPLIT * total))
        numValid = int(math.floor(CF.VALIDATION_SPLIT * total))
        numTest = total - numTrain - numValid
        print('Class- {} NumTrain= {} NumValid= {} NumTest= {}'.format(clusterIndex, numTrain, numValid, numTest))

        validEnd = numTrain + numValid
        self.X_train_DataSample.extend(samples[:numTrain])
        self.X_valid_DataSample.extend(samples[numTrain:validEnd])
        self.X_test_DataSample.extend(samples[validEnd:])

        self.y_train_cluster.extend([clusterIndex] * numTrain)
        self.y_valid_cluster.extend([clusterIndex] * numValid)
        self.y_test_cluster.extend([clusterIndex] * numTest)

        # The same labelIndices list object is shared by all samples of this cluster
        self.y_train_label.extend([labelIndices] * numTrain)
        self.y_valid_label.extend([labelIndices] * numValid)
        self.y_test_label.extend([labelIndices] * numTest)

    self.X_train_rawText = [sample.rawText for sample in self.X_train_DataSample]
    self.X_valid_rawText = [sample.rawText for sample in self.X_valid_DataSample]
    self.X_test_rawText = [sample.rawText for sample in self.X_test_DataSample]
def writeConfMat(self, confMat):
    '''Write the confusion-matrix rows to CF.fnameConfMat.

    confMat : iterable of row objects providing getCSV_Acc(), egBefore and getCSV_Conf().
    Each CSV row is the getCSV_Acc() values, then egBefore, then the getCSV_Conf()
    pairs flattened in order. Only the first pair is covered by a header.
    '''
    headers = ['actualClass', '#test-count', 'precision', 'recall', 'egBefore', 'predClass-1', 'predCount-1']

    def toCsvRow(confRow):
        # Accuracy columns + example + flattened (class, count) pairs
        return confRow.getCSV_Acc() + [confRow.egBefore] + [
            cell for pair in confRow.getCSV_Conf() for cell in pair
        ]

    rows = [toCsvRow(confRow) for confRow in confMat]
    H.writeCSV(CF.fnameConfMat, headers, rows)
def recordAccModel(modelName, dataset):
    '''Train and test modelName on dataset (via trainTest), then append the results to the summary CSV.

    The summary row holds: timestamp, train/predict set names, pair count, the rounded
    (train, valid, test) split, class count, model description and scores, sequence and
    vocabulary limits, hyper-parameters, the index -> class mapping, the confusion matrix
    string, and the acc / loss / val_acc / val_loss values taken from h via roundH.
    For multi-class datasets the label index and label count replace the cluster ones.
    '''
    strDeepModel, h, trainTime, prec, recall, strConfMat, predAtK = trainTest(modelName, dataset)

    timestamp = datetime.datetime.now().ctime()
    numPairs = dataset.getTotalNumPairs()

    testSplit = 1 - CF.TRAIN_SPLIT - CF.VALIDATION_SPLIT
    tvt = (round(CF.TRAIN_SPLIT, 2), round(CF.VALIDATION_SPLIT, 2), round(testSplit, 2))

    dict_index = dataset.dict_Cluster_Index
    num_classes = dataset.num_classes
    if dataset.multiClass:
        dict_index = dataset.dict_Label_Index
        num_classes = dataset.num_labels

    # One "<index> -> <class>" line per entry, in the order given by H.sortDictVal
    mappingLines = []
    for className, index in H.sortDictVal(dict_index):
        mappingLines.append(str(index) + ' -> ' + str(className))
    classMapStr = '\n'.join(mappingLines)

    row = [
        timestamp, trainSet, predictSet, numPairs, tvt, num_classes - 1,
        modelName, strDeepModel, predAtK, prec, recall, trainTime,
        dataset.max_seq_length, dataset.max_vocab_size,
        TRAIN_MULT_FACTOR, EPOCHS, CF.EMBEDDING_VECTOR_LENGTH,
        classMapStr, strConfMat,
        roundH(h, 'acc'), roundH(h, 'loss'), roundH(h, 'val_acc'), roundH(h, 'val_loss'),
    ]
    writeSummary(row)
def getAbstractIdentifier(self, symbTable):
    '''If Identifier, then add the type of identifier as Abstract token (except for special cases).

    symbTable : symbol table offering insertToken(spell, cursor) and lookup(spell).

    Special cases keep their spelling as the abstraction: tokens of a directive
    declaration (#include<>) and spellings listed in CF.IncludeIdentifiers (e.g. printf).
    Every other identifier is inserted into the symbol table and looked up; each type
    found becomes an abstract token with the spelling as its concretization. When the
    lookup finds nothing, the cursor type (probably INVALID) is used instead.
    '''
    spell = self.spell

    if self.flagIsDirective or spell in CF.IncludeIdentifiers:
        self.addAbstract(spell, spell)
        return

    symbTable.insertToken(spell, self.cursor)  # Check & add unknown variable/func declaration
    symbTypes = symbTable.lookup(spell)

    if len(symbTypes) == 0:
        # Symbol table doesn't know the type: fall back to the cursor type
        self.addAbstract(self.cursorType, spell)
        return

    for symbType in symbTypes:
        self.addAbstract(symbType, spell)

    # Log the case where symbol table and Clang disagree on a single, meaningful type
    # (types INVALID and FUNCTIONPROTO are exempt from the comparison)
    isComparable = (len(symbTypes) == 1
                    and self.cursorType != TypeKind.INVALID
                    and self.cursorType != TypeKind.FUNCTIONPROTO)
    if isComparable and symbTypes[0] != self.cursorType:
        H.errorLog([['CodeID', self.codeID],
                    ['AbstractToken SymbTab & Clang mismatch type',
                     str(symbTypes[0]) + ' and ' + str(self.cursorType)],
                    ['lineNum', self.lineNum],
                    ['spell', spell]])
def readAllErrors():
    '''Load the previously saved error indexing from CF.fnameErrorIDs, if it can be read.

    The file is expected to hold an 'index' and an 'error_message' column (per the
    original notes it comes from a previous run, with the most frequent compiler error
    at index 1). Returns {error_message: Error(error_message, index=index)}.
    An IOError while reading is ignored on purpose: whatever was loaded so far
    (possibly nothing) is returned.
    '''
    knownErrs = {}
    try:
        headers, rows = H.readCSV(CF.fnameErrorIDs)
        colIndex = headers.index('index')
        colMessage = headers.index('error_message')
        for row in rows:
            message = row[colMessage]
            knownErrs[message] = Error(message, index=row[colIndex])
    except IOError:
        pass  # No saved indexing available - start from an empty mapping
    return knownErrs
def getBuggyAbsLine(codeText):
    '''Given codeText, return the buggy abstract lines (abstraction of erroneous lines) and their line numbers.

    The line number of every severe error reported by Code(codeText).getSevereErrors()
    is considered once, in reporting order. Line numbers outside 1..len(abstraction)
    are skipped. Returns (absLinesBuggy, lineNums) as two parallel lists.
    '''
    codeObj = Code(codeText)
    absLines = getProgAbstraction(codeObj)

    absLinesBuggy, lineNums = [], []
    for err in codeObj.getSevereErrors():
        lineNum = err.line
        if not (0 < lineNum <= len(absLines)):
            continue  # Compiler reported a line that has no abstraction
        if lineNum in lineNums:
            continue  # Keep each buggy line only once
        absLinesBuggy.append(H.joinList(absLines[lineNum - 1], ' '))
        lineNums.append(lineNum)

    return absLinesBuggy, lineNums
def addRaw_Bigram(self):
    '''Extend self.rawText with one "<left><BIGRAM><right>" token per pair from H.pairwise.

    The pairs are taken over the whitespace-split tokens of the text as it is on entry;
    each new token is appended after a single space.
    '''
    tokens = self.rawText.split()
    suffixes = [' ' + left + '<BIGRAM>' + right for left, right in H.pairwise(tokens)]
    self.rawText += ''.join(suffixes)
def printProgAbstraction(fnamePath):
    '''Print the program abstraction of the source file at fnamePath, one abstract line per output line.

    fnamePath : path of the source file to read.

    The file is read inside a with-block so that its handle is closed deterministically,
    even when reading fails. Each abstract line returned by getProgAbstraction is
    rendered with H.joinList(line, ' ') before printing.
    '''
    with open(fnamePath) as srcFile:
        codeText = srcFile.read()

    codeObj = Code(codeText)
    for absLine in getProgAbstraction(codeObj):
        print(H.joinList(absLine, ' '))
def getCSV_Conf(self):
    '''Return the entries of self.predClasses as (key, value) tuples, ordered by H.sortDictVal(..., reverse=True).'''
    pairs = []
    for predClass, predCount in H.sortDictVal(self.predClasses, reverse=True):
        pairs.append((predClass, predCount))
    return pairs
def calcConfMat(self):
    '''Compute Pred@1/3/5 on the test set and fill + write the confusion matrix (Pred@1 only).

    For each topK in (1, 3, 5) the predictions from self.deepModel.getPrediction(topK)
    are compared against self.y_test. An actual class found among the predicted classes
    counts as a hit, otherwise as a miss; Pred@K is the hit percentage, rounded to 2 digits.
    Only during the topK == 1 pass are truePos / falseNeg / falsePos and the per-class
    prediction counts of self.confMatrix updated, and the sorted matrix written out.

    Returns (strConf, predAtK): the joined string form of the sorted confusion matrix
    and the list of the three Pred@K values.
    '''
    print(colored('\n\tConfusion Matrix: ...', 'magenta'))

    def hotIndices(binVector):
        # Positions flagged with 1, shifted by +1 since off-by-one with dict_index2class
        return [pos + 1 for pos, flag in enumerate(binVector) if flag == 1]

    predAtK = []
    for topK in (1, 3, 5):
        isTop1 = (topK == 1)  # Conf Matrix only for Pred@1
        countM, countN = 0, 0
        predClasses_Tests = self.deepModel.getPrediction(topK)

        for actClasses_bin, predClasses_bin in zip(self.y_test, predClasses_Tests):
            if self.multiClass:
                actIndices = hotIndices(actClasses_bin)
                predIndices = hotIndices(predClasses_bin)
            else:
                # Single actual class; the prediction row is used directly as the list of indices
                actIndices = [np.argmax(actClasses_bin)]
                predIndices = predClasses_bin

            actClasses = [self.dict_index2class[index] for index in actIndices]
            predClasses = [self.dict_index2class[index] for index in predIndices]

            for actClass in actClasses:
                if actClass in predClasses:  # True-Positive
                    countM += 1
                    if isTop1:
                        self.confMatrix[actClass].truePos += 1
                        self.confMatrix[actClass].updatePred(actClass)
                else:  # False-Negative: Not predicted at all
                    countN += 1
                    if isTop1:
                        self.confMatrix[actClass].falseNeg += 1
                        for predClass in predClasses:  # Add all confusion labels
                            self.confMatrix[actClass].updatePred(predClass)

            if isTop1:
                for predClass in predClasses:
                    if predClass not in actClasses:  # False-Positive: Predicted, but falsely
                        self.confMatrix[predClass].falsePos += 1

        if isTop1:
            sortedConfMat = [self.confMatrix[key] for key in self.getSortedConfMat()[0]]
            strConf = H.joinList(sortedConfMat)
            self.writeConfMat(sortedConfMat)

        prec_at_k = round(100 * float(countM) / (countM + countN), 2)
        predAtK.append(prec_at_k)
        print('Pred@{}= {}'.format(topK, prec_at_k))

    return strConf, predAtK
def __str__(self):
    '''Return the abstract tokens rendered by H.joinList with a single space as separator.'''
    tokens = self.abstractTokens
    return H.joinList(tokens, ' ')