def genGraph(fpPseudoCode, fopGraph):
    """Split a combined pseudocode dump into per-submission graph JSON files.

    Parameters
    ----------
    fpPseudoCode : path to a text file in which sections are introduced by a
        line ending in '_text.txt' (the section key) followed by the section
        body lines.
    fopGraph : output directory; one '<name>_graphNL.txt' file, produced by
        textToJson, is written per section.
    """
    createDirIfNotExist(fopGraph)
    with open(fpPseudoCode) as f1:
        strPseudoCodes = f1.read()
    dictPseudoCodes = {}
    # Bug fix: the original referenced currentKey before assignment (NameError)
    # when the file did not start with a '*_text.txt' key line; orphan leading
    # lines are now skipped explicitly.
    currentKey = None
    for line in strPseudoCodes.split('\n'):
        strTrim = line.strip()
        if strTrim.endswith('_text.txt'):
            currentKey = strTrim
            dictPseudoCodes[currentKey] = []
        elif currentKey is not None:
            dictPseudoCodes[currentKey].append(line)
    index = 0
    for key in dictPseudoCodes.keys():
        index = index + 1
        pseudoCodeName = key.replace('_text.txt', '')
        lstPseudoCodes = dictPseudoCodes[key]
        # '.' is escaped to ' PUNC_CHAR ' first so that newlines can then be
        # re-encoded as ' . ' sentence separators without ambiguity.
        strPseudo = '\n'.join(lstPseudoCodes).strip().replace('.', ' PUNC_CHAR ')
        strPseudo = strPseudo.replace('\n', ' . ')
        fpItemPseudoCode = fopGraph + pseudoCodeName + '_graphNL.txt'
        print('{}\t{}'.format(index, fpItemPseudoCode))
        strJson = textToJson(strPseudo)
        with open(fpItemPseudoCode, 'w') as f1:
            f1.write(strJson)
def compileMixCCodeAndSave(fopStep1, fopStep2, fopASTInfo, fopStep4GraphAll,
                           fopStep4GraphSimplify, fpLog, nlpObj, offsetContext,
                           isSaveGraph):
    """Sequentially run checkAndGenerateAST over every '*.cpp' file under
    fopStep1, accumulating per-file word counts and processing times.

    Returns the average processing time per word (0.0 when nothing was
    processed). Output folders are created and fpLog is truncated first.
    """
    for fop in (fopStep2, fopASTInfo, fopStep4GraphAll, fopStep4GraphSimplify):
        createDirIfNotExist(fop)
    with open(fpLog, 'w') as f1:
        f1.write('')
    numWordProcess = 0
    totalTimeProcess = 0
    lstCFilesStep1 = glob.glob(fopStep1 + '*.cpp')
    for i in range(0, len(lstCFilesStep1)):
        ii, itemNumWord, itemNumTime = checkAndGenerateAST(
            i, lstCFilesStep1, fopStep2, fopASTInfo, fopStep4GraphAll,
            fopStep4GraphSimplify, fpLog, nlpObj, offsetContext, isSaveGraph)
        numWordProcess = numWordProcess + itemNumWord
        totalTimeProcess = totalTimeProcess + itemNumTime
    # Bug fix: the original divided unconditionally and raised
    # ZeroDivisionError when the input folder was empty (numWordProcess == 0).
    avgTimePerWords = ((totalTimeProcess * 1.0) / numWordProcess
                       if numWordProcess > 0 else 0.0)
    # Backward compatible: the original computed this value and discarded it.
    return avgTimePerWords
def _writeTextFile(fpOut, strContent):
    # Helper: write strContent to fpOut, closing the handle promptly.
    with open(fpOut, 'w') as fff:
        fff.write(strContent)


def extractTextFiles(fpData, fopSPOCPlain, fopSPOCNested):
    """Split the SPoC TSV into per-submission text/code/line files.

    For every (subid, probid, workerid) group, writes '<w>_<p>_<s>_text.txt',
    '..._code.txt' and '..._line.txt' twice: flat into fopSPOCPlain and
    nested under fopSPOCNested/<worker>/<prob>/<sub>/. Failures for a group
    are logged and the remaining groups are still processed.
    """
    # Bug fix: the original opened the TSV and never closed the handle.
    with open(fpData) as tsv_file:
        df = pd.read_csv(tsv_file, delimiter="\t")
    df_grouped = df.groupby(['subid', 'probid', 'workerid'])
    index = 0
    lenGrouped = len(df_grouped)
    for group_name, df_group in df_grouped:
        index = index + 1
        try:
            strTotalText, strTotalCode, strTotalLine = getPseudoCodeAndCode(df_group)
            workId = df_group['workerid'].iloc[0]
            probId = df_group['probid'].iloc[0]
            subId = df_group['subid'].iloc[0]
            fopNest = fopSPOCNested + '/' + str(workId) + '/' + str(probId) + '/' + str(subId) + '/'
            createDirIfNotExist(fopNest)
            # Identical content goes to both the plain and the nested layout;
            # the original repeated six open/write/close blocks here.
            for fopOut in (fopSPOCPlain, fopNest):
                base = '{}/{}_{}_{}'.format(fopOut, workId, probId, subId)
                _writeTextFile(base + '_text.txt', strTotalText)
                _writeTextFile(base + '_code.txt', strTotalCode)
                _writeTextFile(base + '_line.txt', strTotalLine)
        except Exception as e:
            print('{}\n{}'.format(str(e), traceback.format_exc()))
        print('Index {}/{}'.format(index, lenGrouped))
def compileMixCCodeAndSave(fopStep1, fopStep2, fopASTInfo, fpLog, numOmit):
    """Generate ASTs for every '*.cpp' under fopStep1 in parallel (8 jobs)
    via checkAndGenerateAST, then print the elapsed wall-clock time."""
    # Prepare output folders and truncate the log before the run.
    createDirIfNotExist(fopStep2)
    createDirIfNotExist(fopASTInfo)
    with open(fpLog, 'w') as logFile:
        logFile.write('')
    cppFiles = glob.glob(fopStep1 + '*.cpp')
    startedAt = time.time()
    Parallel(n_jobs=8)(
        delayed(checkAndGenerateAST)(idx, cppFiles, fopStep2, fopASTInfo,
                                     fpLog, numOmit)
        for idx in range(0, len(cppFiles)))
    print(time.time() - startedAt)
def generateMixFiles(fopPseudoCode, fopCodeFile, fopOutputMix):
    """For each '<name>_code.cpp' under fopCodeFile, emit one
    '<name>_vK.cpp' per "complicated" pseudocode line K, with the matching
    code line replaced by that pseudocode as a '//' comment.

    Relies on module globals: distanceHeader (number of header lines that
    prefix the code body) and checkComplicatedPseudoCodeAndCode.
    """
    createDirIfNotExist(fopOutputMix)
    lstFileItemSPoC = glob.glob(fopCodeFile + '*_code.cpp')
    for index in range(0, len(lstFileItemSPoC)):
        fpCodeFile = lstFileItemSPoC[index]
        nameOfSubmission = os.path.basename(fpCodeFile).replace('_code.cpp', '')
        fpPseudoCode = fopPseudoCode + nameOfSubmission + '_text.txt'
        with open(fpCodeFile, 'r') as f1:
            arrCodeLines = f1.read().strip().split('\n')
        with open(fpPseudoCode, 'r') as f1:
            arrPseudoLines = f1.read().strip().split('\n')
        # Bug fix: bound the loop so arrCodeLines[indPseudoCode+distanceHeader]
        # cannot raise IndexError when the pseudo file has extra lines.
        numPairs = min(len(arrPseudoLines), len(arrCodeLines) - distanceHeader)
        for indPseudoCode in range(0, numPairs):
            if not checkComplicatedPseudoCodeAndCode(
                    arrPseudoLines[indPseudoCode].strip(),
                    arrCodeLines[indPseudoCode + distanceHeader].strip()):
                continue
            # Copy the header verbatim, then the body with one line swapped.
            lstStrCodeCombine = list(arrCodeLines[:distanceHeader])
            for ind in range(distanceHeader, len(arrCodeLines)):
                indInCode = ind - distanceHeader
                if indInCode == indPseudoCode:
                    lstStrCodeCombine.append(
                        '// {}'.format(arrPseudoLines[indPseudoCode]))
                else:
                    lstStrCodeCombine.append(arrCodeLines[ind])
            fpVersion = fopOutputMix + nameOfSubmission + '_v' + str(
                indPseudoCode + 1) + '.cpp'
            with open(fpVersion, 'w') as f1:
                f1.write('\n'.join(lstStrCodeCombine))
        print('complete {}/{} {}'.format(index, len(lstFileItemSPoC), fpCodeFile))
def compileMixCCodeAndSave(fopStep1, fopStep2, fopASTInfo, fopStep4GraphAll,
                           fopStep4GraphSimplify, fpLog, nlpObj, offsetContext,
                           isSaveGraph):
    """Sequentially run checkAndGenerateAST on every '*_text.txt' file
    under fopStep1, writing results into the four step folders."""
    # Create all output folders and truncate the log file up front.
    for fop in (fopStep2, fopASTInfo, fopStep4GraphAll, fopStep4GraphSimplify):
        createDirIfNotExist(fop)
    with open(fpLog, 'w') as logFile:
        logFile.write('')
    textFiles = glob.glob(fopStep1 + '*_text.txt')
    for idx in range(len(textFiles)):
        checkAndGenerateAST(idx, textFiles, fopStep2, fopASTInfo,
                            fopStep4GraphAll, fopStep4GraphSimplify,
                            fpLog, nlpObj, offsetContext, isSaveGraph)
def generateScript(fpDictRepo, fopMetadata, fopCommitMessage, fopRepoExtract,
                   fopRepoZip, fpLogCommandDownload, fpLogCommandExtract,
                   fpLogCommandCommit):
    """Build shell-command lists (zip download, unzip, git-log collection)
    for every 'author__repo' entry in fpDictRepo and append each list to
    its command log file. Per-entry failures are logged and skipped."""
    try:
        with open(fpDictRepo, 'r') as f1:
            arrDicts = f1.read().strip().split('\n')
        createDirIfNotExist(fopMetadata)
        createDirIfNotExist(fopCommitMessage)
        createDirIfNotExist(fopRepoZip)
        createDirIfNotExist(fopRepoExtract)
        # Ensure the three command logs exist before appending to them.
        for fpLog in (fpLogCommandDownload, fpLogCommandExtract,
                      fpLogCommandCommit):
            if not os.path.exists(fpLog):
                with open(fpLog, 'w') as f1:
                    f1.write('')
        lstCommandDownloads = []
        lstCommandExtract = []
        lstCommandCommitMessage = []
        for itemLine in arrDicts:
            try:
                arrItem = itemLine.strip().split('\t')
                if len(arrItem) >= 2:
                    strAuthor = arrItem[0].split('__')[0]
                    strRepo = arrItem[0].split('__')[1]
                    strSHA = ''
                    strZipDownload = 'https://github.com/{}/{}/tree/{}'.format(
                        strAuthor, strRepo, strSHA)
                    fopZipLocation = fopRepoZip + strAuthor + '/' + strRepo + '/'
                    createDirIfNotExist(fopZipLocation)
                    fpZipLocation = fopZipLocation + strSHA + '.zip'
                    commandDownload = 'wget {} -O {}'.format(strZipDownload,
                                                            fpZipLocation)
                    commandExtract = 'unzip {} -d {}'.format(fpZipLocation,
                                                             fopRepoExtract)
                    createDirIfNotExist(fopMetadata + strAuthor + '/')
                    fopLocalMeta = fopMetadata + strAuthor + '/' + strRepo + '/'
                    fpLogMessage = fopCommitMessage + strAuthor + '__' + strRepo + '.txt'
                    lstCommandDownloads.append(commandDownload)
                    lstCommandExtract.append(commandExtract)
                    # Commit-message collection: wipe, clone bare, log, return.
                    lstCommandCommitMessage.extend([
                        'rm -rf {}'.format(fopLocalMeta),
                        'cd {}'.format(fopMetadata + strAuthor + '/'),
                        'git clone --no-checkout https://github.com/{}/{}/'.format(
                            strAuthor, strRepo),
                        'cd {}'.format(fopLocalMeta),
                        'git log >{}'.format(fpLogMessage),
                        'cd {}'.format(fopCommitMessage),
                    ])
            except:
                traceback.print_exc()
        with open(fpLogCommandDownload, 'a') as f1:
            f1.write('\n'.join(lstCommandDownloads) + '\n')
        with open(fpLogCommandExtract, 'a') as f1:
            f1.write('\n'.join(lstCommandExtract) + '\n')
        with open(fpLogCommandCommit, 'a') as f1:
            f1.write('\n'.join(lstCommandCommitMessage) + '\n')
    except:
        traceback.print_exc()
# Sentinel tokens used when (de)serializing text and graph content.
strTabChar = ' tabChar '
strEndLineChar = '_EL_'
strEndFileChar = '_EF_'
strSplitIndent = ' IndentSplit '
strSplitJson = ' JsonSplit '
strSplitCharacterForNodeEdge = '_ABAZ_'

# Input/output layout under the raw mixed-code root.
fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopStep3V2 = fopRoot + 'step3_v2/'
fopStep5V2 = fopRoot + 'step5_v2_HGT/'
fopStep3TreesitterTokenize = fopRoot + 'step3_treesitter_tokenize/'
fopStep2Tokenize = fopRoot + 'step2_tokenize/'
fopStep2PseudoTokenize = fopRoot + 'step2_pseudo_tokenize/'
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
fopGraphEntityInfo = fopStep3V2 + 'graphEntityInfo/'
createDirIfNotExist(fopStep5V2)

# Literal dictionary: one 'key<TAB>value' per line, kept in both directions.
with open(fpDictLiterals, 'r') as f1:
    arrLits = f1.read().strip().split('\n')
dictLiteralsToValues = {}
dictValuesToLiterals = {}
for item in arrLits:
    arrTabs = item.split('\t')
    if len(arrTabs) >= 2:
        strContent = '\t'.join(arrTabs[1:])
        dictLiteralsToValues[arrTabs[0]] = strContent
        dictValuesToLiterals[strContent] = arrTabs[0]

# Number of header lines prepended to every generated code file.
distanceHeader = 33
fopDataFolder='/Users/hungphan/git/dataPapers/' fopInputSPOCData=fopDataFolder+'/SPOCDataset/spoc/' fopTrainSPOCNested= fopDataFolder + '/trainSPOCNest/' fopTrainSPOCPlain= fopDataFolder + '/trainSPOCPlain/' fpTrainData=fopInputSPOCData+'train/spoc-train.tsv' fopTestPSPOCNested= fopDataFolder + '/testPSPOCNest/' fopTestPSPOCPlain= fopDataFolder + '/testPSPOCPlain/' fpTestPData=fopInputSPOCData+'test/spoc-testp.tsv' fopTestWSPOCNested= fopDataFolder + '/testWSPOCNest/' fopTestWSPOCPlain= fopDataFolder + '/testWSPOCPlain/' fpTestWData=fopInputSPOCData+'test/spoc-testw.tsv' createDirIfNotExist(fopTrainSPOCNested) createDirIfNotExist(fopTrainSPOCPlain) createDirIfNotExist(fopTestPSPOCNested) createDirIfNotExist(fopTestPSPOCPlain) createDirIfNotExist(fopTestWSPOCNested) createDirIfNotExist(fopTestWSPOCPlain) def getPseudoCodeAndCode(df): lstText=[] lstCode=[] lstLine=[] for row_index, row in df.iterrows(): strCode=str(row['code']) strText = str(row['text']) lstText.append(strText) indent=int(row['indent'])
# Tokens and type-name constants used by the POS-graph pipeline.
strRegexCamelCases = r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))'
strStmtSplit = ' StmtSplit '
strTabChar = ' tabChar '
strEndLineChar = ' endLineChar '
strSplitIndent = ' IndentSplit '
strSplitJson = ' JsonSplit '
strParseResultsType = "<class 'pyparsing.ParseResults'>"
strStrType = "<class 'str'>"

fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fpPseudocodeAfterPOS = fopRoot + 'pseudocode_after_pos_v1.txt'
fopGraph = fopRoot + 'graphPOS/'
createDirIfNotExist(fopGraph)

with open(fpPseudocodeAfterPOS, 'r') as f1:
    arrAfterLines = f1.read().split('\n')

# Each input line is '<text>\t<python-literal POS json>'; emit one .dot
# graph per line (png path is computed but only the .dot is written here).
for i in range(0, len(arrAfterLines)):
    arrItTabs = arrAfterLines[i].split('\t')
    if len(arrItTabs) >= 2:
        jsonObj = ast.literal_eval(arrItTabs[1])
        fpGraphDot = fopGraph + str((i + 1)) + '.dot'
        fpGraphPng = fopGraph + str((i + 1)) + '.png'
        graphAll = pgv.AGraph(directed=True)
        getGraphFromJsonPOS(jsonObj, graphAll)
        graphAll.write(fpGraphDot)
dictWords = {} dictTotal = walkAndGetPOSJson(data, indexSentence, lstNonTerminals, lstTerminals) except: strJsonObj = '{}' dictTotal = {} traceback.print_exc() return dictTotal fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/' fopMixVersion = fopRoot + 'step4_mixCode/' fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt' fpPseudocodeBeforePOS = fopRoot + 'pseudocode_before_pos.txt' fpCachedFilePath = fopRoot + 'cachedFilePaths.txt' createDirIfNotExist(fopMixVersion) model_dir = find('models/bllip_wsj_no_aux').path parser = RerankingParser.from_unified_model_dir(model_dir) print('before traverse') lstFpJsonFiles = [] if not os.path.isfile(fpCachedFilePath): lstFop1 = sorted(glob.glob(fopMixVersion + '*/')) for fop1 in lstFop1: lstFop2 = sorted(glob.glob(fop1 + '*/')) for fop2 in lstFop2: lstFop3 = sorted(glob.glob(fop2 + 'v_*_label.txt')) # print(fp3) for fp3 in lstFop3: lstFpJsonFiles.append(fp3)
# shutil.copy(fopCodeVersion + fnIndexVersion + '_graph.png', # fopItemAllNumOfStmts + fnIndexVersion + '_graph.png') except: traceback.print_exc() fopRoot='../../../../dataPapers/textInSPOC/correctCodeRaw/' fopCodeFile=fopRoot+'step2_tokenize/' fopPseudoFile=fopRoot+'step2_pseudo_tokenize/' fopTreeSitterFile=fopRoot+'step3_treesitter_tokenize/' fopLabels=fopRoot+'step2_pseudo_tokenize/' fopTreeSitterFile=fopRoot+'step3_treesitter_tokenize/' fopMixVersion=fopRoot+'step4_mixCode/' fopAllocateByMainStmt=fopRoot+'step4_mainStmt/' fopAllocateByNumOfStmts=fopRoot+'step4_numOfStatements/' fpDictLiterals=fopRoot+'step2_dictLiterals_all.txt' createDirIfNotExist(fopMixVersion) createDirIfNotExist(fopAllocateByMainStmt) createDirIfNotExist(fopAllocateByNumOfStmts) f1=open(fpDictLiterals,'r') arrLits=f1.read().strip().split('\n') f1.close() dictLiterals={} for item in arrLits: arrTabs=item.split('\t') if len(arrTabs)>=2: strContent='\t'.join(arrTabs[1:]) dictLiterals[arrTabs[0]]=strContent # print('len dict {}'.format(len(dictLiterals.keys()))) # input('abc ')
model.eval() #inference import spacy nlp = spacy.load('en_core_web_sm') test_loss, test_acc = evaluate(model, test_iterator, criterion) # acc_prob1=test_acc print('test loss and test acc \n{}\t{}'.format(test_loss, test_acc)) return test_acc fopRoot = '../../../../../media/dataPapersExternal/mixCodeRaw/' fpInputText = fopRoot + 'embeddingModels/d2v/paragraph_text.txt' fopOutputML = fopRoot + 'resultMLs/rnn-lstm-small/' fpResultDetails = fopOutputML + 'result_details.txt' fpDoc2VecModel = fopRoot + 'embeddingModels/d2v/d2v.model.txt' createDirIfNotExist(fopOutputML) sys.stdout = open(fpResultDetails, 'w') # fpTrain = fopRoot + 'train.csv' # fpTextTrain = fopRoot + 'train.text.txt' fpLabelP1Train = fopOutputML + 'train.label.p1.txt' fpLabelP2Train = fopOutputML + 'train.label.p2.txt' fpLabelP3Train = fopOutputML + 'train.label.p3.txt' # fpValid = fopRoot + 'valid.csv' # fpTextValid = fopRoot + 'testP.text.txt' fpLabelP1Valid = fopOutputML + 'testP.label.p1.txt' fpLabelP2Valid = fopOutputML + 'testP.label.p2.txt' fpLabelP3Valid = fopOutputML + 'testP.label.p3.txt'
def extractPOSAndTree(fopTextCorpus, fopPOSCorpus, item):
    """POS-tag the text files of one corpus sub-folder.

    Reads '<fopTextCorpus>/<item>/*.txt' (only the first file — see the
    deliberate 'j == 1' break) and writes '<name>_pos.txt' plus
    '<name>_preprocess.txt' into '<fopPOSCorpus>/<item>/', flushing the
    accumulated lines every 1000 entries. Processing stops after 100 lines
    per file (small-corpus cap). Prints total word count, total time and
    average time per word at the end.
    """
    try:
        fopItemTextCorpus = fopTextCorpus + item + '/'
        fopItemPOSCorpus = fopPOSCorpus + item + '/'
        createDirIfNotExist(fopItemPOSCorpus)
        lstTextFiles = sorted(glob.glob(fopItemTextCorpus + '/*.txt'))
        lstPOSPerFile = []
        lstProcessTextPerFile = []
        numWordProcess = 0
        totalTimeProcess = 0
        for j in range(0, len(lstTextFiles)):
            try:
                if j == 1:
                    break  # deliberate: only the first text file is processed
                with open(lstTextFiles[j], 'r') as f1:
                    arrTexts = f1.read().split('\n')
                indexName = os.path.basename(lstTextFiles[j]).replace('.txt', '')
                fpPOS = fopItemPOSCorpus + indexName + '_pos.txt'
                fpTextPreprocess = fopItemPOSCorpus + indexName + '_preprocess.txt'
                # Truncate both outputs; batches are appended below.
                with open(fpPOS, 'w') as f1:
                    f1.write('')
                with open(fpTextPreprocess, 'w') as f1:
                    f1.write('')
                for i in range(0, len(arrTexts)):
                    try:
                        # NOTE(review): '\m' is a literal backslash-m, not an
                        # escape — confirm this replacement token is intended.
                        strItem = arrTexts[i].replace(
                            strEndLine, '\m').replace(strTabChar,
                                                      '\t').replace('// ', '')
                        start_time = time.time()
                        strPostText, strPOS = getPOSAndTreeFromText(strItem)
                        numWordProcess = numWordProcess + len(strItem.split())
                        totalTimeProcess = totalTimeProcess + (time.time() - start_time)
                        if strPostText != '':
                            lstProcessTextPerFile.append(strPostText)
                            lstPOSPerFile.append(strPOS)
                        # Flush every 1000 lines and at end-of-file.
                        if ((len(lstProcessTextPerFile) > 0)
                                and (((i + 1) == len(arrTexts))
                                     or ((i + 1) % 1000 == 0))):
                            with open(fpPOS, 'a') as f1:
                                f1.write('\n'.join(lstPOSPerFile) + '\n')
                            with open(fpTextPreprocess, 'a') as f1:
                                f1.write('\n'.join(lstProcessTextPerFile) + '\n')
                            lstPOSPerFile = []
                            lstProcessTextPerFile = []
                            print('finish write at index {} of file {}'.format(
                                i, lstTextFiles[j]))
                        if i == 100:
                            break  # deliberate cap for the small-corpus run
                    except:
                        traceback.print_exc()
            except:
                traceback.print_exc()
        # Bug fix: the original divided unconditionally and raised
        # ZeroDivisionError when no words were processed (empty folder/file).
        avgTimePerWords = ((totalTimeProcess * 1.0) / numWordProcess
                           if numWordProcess > 0 else 0.0)
        print('total time and words and avg\t{}\t{}\t{}'.format(
            numWordProcess, totalTimeProcess, avgTimePerWords))
    except:
        traceback.print_exc()
import operator import clang.cindex sys.path.append(os.path.abspath(os.path.join('../..'))) from UtilFunctions import createDirIfNotExist, getPOSInfo, writeDictToFileText, runASTGenAndSeeResult fopData = '../../../dataPapers/' fopTextInSPoC = fopData + 'textInSPOC/' fopStatisticFolderTestP = fopData + 'textInSPOC/testPOnlyC/' fpCombineASTsTestP = fopTextInSPoC + 'combineASTs_TestP.txt' fopStatisticFolderTestW = fopData + 'textInSPOC/testWOnlyC/' fopASTForCTestW = fopData + 'textInSPOC/testWASTForC/' fopASTForCTestP = fopData + 'textInSPOC/testPASTForC/' fpCombineASTsTestW = fopTextInSPoC + 'combineASTs_TestW.txt' createDirIfNotExist(fopASTForCTestP) createDirIfNotExist(fopASTForCTestW) numOmit = 30 lstFiles = sorted(glob.glob(fopStatisticFolderTestP + "*_code.cpp")) dictCountWords = {} f1 = open(fpCombineASTsTestP, 'w') f1.write('') f1.close() ''' for index in range(0,len(lstFiles)): fpCodeFileCPP=lstFiles[index] fineName=os.path.basename(fpCodeFileCPP) fpASTItem = fopASTForCTestP + fineName.replace('_code.txt', '_ast.txt') # fFile=open(fpCodeFileCPP, 'r') # strContentOfFile=fFile.read().strip()
numWordProcess, totalTimeProcess, avgTimePerWords)) except: traceback.print_exc() batch_size = 100000 strEndLine = ' ENDLINE ' strTabChar = ' TABCHAR ' strSingleComment = ' SINGLECOMMENTCHAR ' # fopCCContent='/home/hungphd/media/dataPapersExternal/apiCallPapers_v1/AlonCommentExtraction/' fopTextCorpus = '/home/hungphd/media/dataPapersExternal/textCorpus/' # fopTextPostProcessCorpus='/home/hungphd/media/dataPapersExternal/textPostProcessCorpus/' fopPOSCorpus = '/home/hungphd/media/dataPapersExternal/posCorpus_small/' # fopParseTreeCorpus='/home/hungphd/media/dataPapersExternal/treeCorpus/' createDirIfNotExist(fopPOSCorpus) lstTypesOfSDs = ['ad', 'cs', 'cc', 'cm', 'qa', 'sr'] fopStanfordParser = '/home/hungphd/git/dataPapers/stanford-parser-4.2.0' os.environ['STANFORD_PARSER'] = fopStanfordParser os.environ['STANFORD_MODELS'] = fopStanfordParser from pycorenlp import StanfordCoreNLP # for item in lstTypesOfSDs: from multiprocessing.pool import ThreadPool as Pool # from multiprocessing import Pool from multiprocessing import Process lstThreads = []
# jsonPOS=walkAndGetPOSJson(data,indexSentence,lstNonTerminals,lstTerminals) # dictTotal['children'].append(jsonPOS) strOutput=strSplitParsedTree.join(lstOutput) except: # strJsonObj = 'Error' # dictTotal=None traceback.print_exc() return strOutput fopRoot='/home/hungphd/media/dataPapersExternal/mixCodeRaw/' fopPseudoTokens=fopRoot+'step2_pseudo_tokenize/' fopEstimateTime=fopRoot+'estimate_time_pos/' fopPseudocodePOS=fopRoot+'step2_pseudocode_pos/' fpCachedFilePath=fopRoot+'cachedFilePaths_pseudo_tokenizes.txt' createDirIfNotExist(fopEstimateTime) fpEstimate=fopEstimateTime+'pos_estimation.txt' model_dir = find('models/bllip_wsj_no_aux').path parser = RerankingParser.from_unified_model_dir(model_dir) from pycorenlp import StanfordCoreNLP nlp = StanfordCoreNLP('http://localhost:9000') lstFpJsonFiles=[] if not os.path.isfile(fpCachedFilePath): lstFop1=sorted(glob.glob(fopPseudoTokens+'*/')) for fop1 in lstFop1: lstFp2=sorted(glob.glob(fop1+'*.txt')) for fp2 in lstFp2: lstFpJsonFiles.append(fp2) print('end {}'.format(fop1))
def logFileLocationForEachJavaProjects(fopItemAlonCorpus, fopItemJsonData,
                                       fpLogTotalProject, parser,
                                       isCollectFromStart):
    """Index every Java project under fopItemAlonCorpus, dump one AST file
    per Java source via getJsonDict, and append a per-project summary line
    to fpLogTotalProject. Projects already listed in that log are skipped
    unless isCollectFromStart is true (which resets the log)."""
    createDirIfNotExist(fopItemAlonCorpus)
    createDirIfNotExist(fopItemJsonData)
    lstProjectNames = glob.glob(fopItemAlonCorpus + '*/')
    dictFilesPerProject = {}
    dictAlreadyDownloadProject = {}
    if isCollectFromStart or not (os.path.exists(fpLogTotalProject)):
        with open(fpLogTotalProject, 'w') as logF:
            logF.write('')
    else:
        # Re-load the set of projects already processed in a previous run.
        with open(fpLogTotalProject, 'r') as logF:
            arrAlready = logF.read().strip().split('\n')
        for item in arrAlready:
            arrTabs = item.split('\t')
            if len(arrTabs) >= 4:
                dictAlreadyDownloadProject[arrTabs[0]] = 1
    for i in range(0, len(lstProjectNames)):
        try:
            fopProjectItem = lstProjectNames[i]
            arrFopItem = fopProjectItem.split('/')
            projectFolderName = arrFopItem[len(arrFopItem) - 2]
            if projectFolderName in dictAlreadyDownloadProject.keys():
                continue
            fpItemDataAPICalls = fopItemJsonData + projectFolderName + '.txt'
            lstJavaFiles = glob.glob(fopProjectItem + '/**/*.java',
                                     recursive=True)
            dictFilesPerProject[projectFolderName] = len(lstJavaFiles)
            # Map '1'..'N' -> java file path and persist that index file.
            dictJavaFiles = {}
            for j in range(0, len(lstJavaFiles)):
                dictJavaFiles[str(j + 1)] = lstJavaFiles[j]
            lstWriteToFiles = []
            for key in dictJavaFiles.keys():
                lstWriteToFiles.append('{}\t{}'.format(key, dictJavaFiles[key]))
            with open(fpItemDataAPICalls, 'w') as fIdx:
                fIdx.write('\n'.join(lstWriteToFiles))
            print('{} Prepare get ast project {} with {} files'.format(
                (i + 1), fopProjectItem, len(lstJavaFiles)))
            fopASTItemFather = fopItemJsonData + projectFolderName + '/'
            createDirIfNotExist(fopASTItemFather)
            numProjectRunOK = 0
            for key in dictJavaFiles.keys():
                try:
                    jsonObject = getJsonDict(dictJavaFiles[key], parser)
                    if (jsonObject is not None):
                        fpJson = fopASTItemFather + str(key) + '_ast.txt'
                        with open(fpJson, 'w') as fAst:
                            fAst.write(str(jsonObject))
                        numProjectRunOK = numProjectRunOK + 1
                except:
                    traceback.print_exc()
            print('{} End get ast project {} with {} files'.format(
                (i + 1), fopProjectItem, len(lstJavaFiles)))
            with open(fpLogTotalProject, 'a') as logF:
                logF.write('{}\t{}\t{}\t{}\n'.format(
                    projectFolderName, numProjectRunOK,
                    len(dictJavaFiles.keys()),
                    dictFilesPerProject[projectFolderName]))
        except:
            traceback.print_exc()
import glob import sys, os import operator sys.path.append(os.path.abspath(os.path.join('../..'))) from UtilFunctions import createDirIfNotExist, getPOSInfo, writeDictToFileText fopData = '../../../dataPapers/' fopTextInSPoC = fopData + 'textInSPOC/' fopTestPFolder = fopTextInSPoC + 'testPSPOCPlain/' fopTestPFolderOnlyC = fopTextInSPoC + 'testPOnlyC/' lstFiles = sorted(glob.glob(fopTestPFolder + "*_code.txt")) createDirIfNotExist(fopTestPFolderOnlyC) for i in range(0, len(lstFiles)): f1 = open(lstFiles[i], 'r') fileName = os.path.basename(lstFiles[i]) newFileName = fileName.replace('.txt', '.cpp') fpOutpurItem = fopTestPFolderOnlyC + newFileName strContent = f1.read() f1.close() lstHeaders = [ '#include <iostream>', '#include<stdio.h>', '#include <string.h>', '#include<stdlib.h>', '#include<math.h>', '#include<time.h>', '#include<ctype.h>', '#include<assert.h>', '#include<locale.h>', '#include<signal.h>', '#include<setjmp.h>', '#include<stdarg.h>', '#include<errno.h>', '#include<iomanip>', '#include <algorithm>', '#include <vector>', '#include <limits.h>', '#include <map>', '#include <set>', '#include <list>', '#include <stack>', '#include <queue>', '#include <array>', '#include <unordered_map>', '#include <unordered_set>', '#include <deque>', '#include <forward_list>'
dictAcceptableIdsForVersions[numContext].append(ancestorId) if 'children' in jsonAll.keys() and 'isNLRootNode' not in jsonAll.keys(): lstChildren = jsonAll['children'] for i in range(0, len(lstChildren)): itemChild = lstChildren[i] findAddableIdsForContext(itemChild, dictOfFatherIdMainAST, dictAcceptableIdsForVersions, mixStartLine, mixEndLine, dictAncestorToRoot) fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/' fopMixVersion = fopRoot + 'step4_mixCode/' fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt' createDirIfNotExist(fopMixVersion) f1 = open(fpDictLiterals, 'r') arrLits = f1.read().strip().split('\n') f1.close() dictLiterals = {} for item in arrLits: arrTabs = item.split('\t') if len(arrTabs) >= 2: strContent = '\t'.join(arrTabs[1:]) dictLiterals[arrTabs[0]] = strContent # print('len dict {}'.format(len(dictLiterals.keys()))) # input('abc ') print('before traverse')
# Folder layout for the embedding pipeline (glove variant).
fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopStep3V2 = fopRoot + 'step3_v2/'
fopStep5V2 = fopRoot + 'step5_v2_HGT/'
fopStep5EmbeddingModels = fopRoot + 'embeddingModels/'
fopStep5TextInfo = fopStep5EmbeddingModels + 'textInfo/'
fopFastTextModel = fopStep5EmbeddingModels + 'glove/'
fpFastTextModelBin = fopFastTextModel + 'model.bin'
fpFastTextModelText = fopFastTextModel + 'model.text.txt'
fopStep3TreesitterTokenize = fopRoot + 'step3_treesitter_tokenize/'
fopStep2CodeReplaceDict = fopRoot + 'step2_code_replaceDict/'
fopStep2PseudoTokenize = fopRoot + 'step2_pseudo_tokenize/'
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
fopGraphEntityInfo = fopStep3V2 + 'graphEntityInfo/'
fopPOSNLTK = fopRoot + 'step2_afterTranslation/sortBySimilarityScore/pos_nltk/'
fopPOSStanford = fopRoot + 'step2_afterTranslation/sortBySimilarityScore/pos_stanford/'
for fopOut in (fopStep5V2, fopStep5EmbeddingModels, fopStep5TextInfo,
               fopFastTextModel):
    createDirIfNotExist(fopOut)

# Collect every training-embedding text line, masking ',' as ' COMMA '.
lstFpTextFiles = glob.glob(fopStep5TextInfo + '*.textTrainEmb.txt')
lstStrTextInfo = []
for fpItem in lstFpTextFiles:
    with open(fpItem, 'r') as f1:
        arrTextItem = f1.read().strip().split('\n')
    for j in range(0, len(arrTextItem)):
        arrTextItem[j] = arrTextItem[j].replace(',', ' COMMA ')
    lstStrTextInfo = lstStrTextInfo + list(arrTextItem)
def getAllLeaveNodes(g):
    """Return the leaf nodes of directed graph g: out-degree 0, in-degree 1."""
    return [node for node in g.nodes()
            if g.out_degree(node) == 0 and g.in_degree(node) == 1]


# Folder layout for the NMT export step.
fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopStep4NMT = fopRoot + 'step4_NMT/'
fopStep5V2HGT = fopRoot + 'step5_v2_HGT/'
fopStep3V2 = fopRoot + 'step3_v2/'
fopStep3TreesitterTokenize = fopRoot + 'step3_treesitter_tokenize/'
fopStep2Tokenize = fopRoot + 'step2_tokenize/'
fopStep2PseudoTokenize = fopRoot + 'step2_pseudo_tokenize/'
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
createDirIfNotExist(fopStep4NMT)

# Literal dictionary loaded in both directions (id -> content, content -> id).
with open(fpDictLiterals, 'r') as f1:
    arrLits = f1.read().strip().split('\n')
dictLiterals = {}
dictLiteralsReverse = {}
for item in arrLits:
    arrTabs = item.split('\t')
    if len(arrTabs) >= 2:
        strContent = '\t'.join(arrTabs[1:])
        dictLiterals[arrTabs[0]] = strContent
        dictLiteralsReverse[strContent] = arrTabs[0]

# Number of header lines prepended to every generated code file.
distanceHeader = 33
from langdetect import detect
from sklearn.metrics import confusion_matrix
import langid

# Doc2Vec + RandomForest experiment layout (step 6 of the mixCode pipeline).
fopRoot = '/home/hungphd/git/dataPapers/textInSPOC/mixCode_v2/step6/'
fpTrainText = fopRoot + 'train.text.txt'
fpTestPText = fopRoot + 'testP.text.txt'
fpTestWText = fopRoot + 'testW.text.txt'
fpTrainLabel = fopRoot + 'train.label.txt'
fpTestPLabel = fopRoot + 'testP.label.txt'
fpTestWLabel = fopRoot + 'testW.label.txt'
fopD2VRF = fopRoot + 'result-d2v-rf/'
createDirIfNotExist(fopD2VRF)
fpOutModel = fopD2VRF + 'model.d2v'
fpOutResultDetail = fopD2VRF + 'resultDetail.txt'
fpOutResultSummary = fopD2VRF + 'resultSummary.txt'

# Accumulators: features (X_*), document keys (key_*), labels (y_*).
X_Train = []
key_Train = []
X_TestP = []
key_TestP = []
X_TestW = []
key_TestW = []
y_Train = []
y_TestP = []
y_TestW = []
lstAllText = []
except: traceback.print_exc() fopDataPapers = '../../../../dataPapers/' fopAlonCorpus = fopDataPapers + 'java-large/' fopDataAPICalls = fopDataPapers + 'apiCallPapers/' fopJsonData = fopDataPapers + 'apiCallPapers/AlonJsonData/' # fopFilesPerProjectData=fopDataPapers+'apiCallPapers/AlonFilesPerProject/' fopDataRoot = '/home/hungphd/' fopGithub = '/home/hungphd/git/' fopBuildFolder = fopDataRoot + 'build-tree-sitter/' fpLanguageSo = fopBuildFolder + 'my-languages.so' createDirIfNotExist(fopJsonData) # createDirIfNotExist(fopFilesPerProjectData) lstFopAlonCorpus = [] lstFopJsonData = [] lstFpLogAPICalls = [] lstFopFilesPerProjectData = [] lstFolderNames = ['training', 'test', 'validation'] JAVA_LANGUAGE = Language(fpLanguageSo, 'java') parser = Parser() parser.set_language(JAVA_LANGUAGE) isCollectFromStart = False for i in range(0, len(lstFolderNames)): folderName = lstFolderNames[i]
def generateMixVersionsAndLabels(jsonObject, dictLinesAndElements,
                                 dictLabelStatistics, dictLiterals, arrCodes,
                                 arrPseudos, fopCodeVersion, fonameItemAST,
                                 idCode, fopAllocateByMainStmt,
                                 fopAllocateByNumOfStmts):
    """For one submission, write the base pseudo/code pair plus one 'mix'
    version per entry in dictLinesAndElements: the main statement's lines
    are replaced by their pseudocode as a '//' comment, a label file is
    written, label statistics are updated in place, and the outputs are
    copied into per-main-statement and per-statement-count folders.
    Graph .dot files are additionally emitted when 'test' appears in
    fopCodeVersion. Any failure is printed and swallowed."""
    try:
        isOK = False  # retained from original (never read)
        indexVersion = 0
        fpItemPseudo = fopCodeVersion + '_a_pseudo.txt'
        fpItemCode = fopCodeVersion + '_a_code.cpp'
        fpItemGraphText = fopCodeVersion + '_a_graph.dot'
        fpItemGraphPng = fopCodeVersion + '_a_graph.png'
        with open(fpItemPseudo, 'w') as outF:
            outF.write('\n'.join(arrPseudos))
        with open(fpItemCode, 'w') as outF:
            outF.write('\n'.join(arrCodes))
        if 'test' in fopCodeVersion:
            # Baseline graph for the unmodified code (idChange '-1' = none).
            graph = pgv.AGraph(directed=True)
            isBlueColor = True
            idChange = '-1'
            generateGraph(jsonObject, '', arrCodes, idChange, isBlueColor, graph)
            graph.write(fpItemGraphText)
        for keyItem in dictLinesAndElements.keys():
            valItem = dictLinesAndElements[keyItem]
            indexVersion = indexVersion + 1
            fnIndexVersion = 'v_{}'.format(indexVersion)
            fpItemVersionCode = fopCodeVersion + fnIndexVersion + '_mix.cpp'
            fpItemVersionLabel = fopCodeVersion + fnIndexVersion + '_label.txt'
            mainStmt = valItem['mainStmt']
            startLineMainStmt = mainStmt['startLine']
            endLineMainStmt = mainStmt['endLine']
            # Pseudo-line indices covered by the main statement (code lines
            # are offset by the distanceHeader header block).
            lstPseudoLines = range(startLineMainStmt - distanceHeader,
                                   endLineMainStmt - distanceHeader + 1)
            lstStrPseudoLines = []
            lstStrCodeLines = []
            for it in lstPseudoLines:
                lstStrPseudoLines.append(arrPseudos[it])
                lstStrCodeLines.append(arrCodes[it + distanceHeader])
            strTotalComment = '// ' + ' , then '.join(lstStrPseudoLines)
            strTotalCode = ' '.join(lstStrCodeLines)
            (strCodeSplit, strCodeTokSplit, strPseudoSplit, strTokSplit,
             numAppear, numDisappear, percent) = checkAppearInImplementation(
                dictLiterals, strTotalComment, strTotalCode)
            # Build the mixed file: replaced span becomes one comment line
            # (on the statement's first line) plus bare '//' placeholders.
            lstMixCodes = []
            for i in range(0, len(arrCodes)):
                itemLine = arrCodes[i]
                numTabs = 0
                strLineAdd = arrCodes[i]
                if (i - distanceHeader) in lstPseudoLines:
                    lstStrs = []
                    while (itemLine[numTabs] == '\t'):
                        numTabs = numTabs + 1
                        lstStrs.append('\t')
                        if numTabs == len(itemLine):
                            break
                    if i == startLineMainStmt:
                        strLineAdd = '{}{}'.format(''.join(lstStrs),
                                                   strTotalComment)
                    else:
                        strLineAdd = '{}{}'.format(''.join(lstStrs), '//')
                lstMixCodes.append(strLineAdd)
            with open(fpItemVersionCode, 'w') as outF:
                outF.write('\n'.join(lstMixCodes))
            strMainStatement = mainStmt['type']
            strPosition = '{}-{}-{}-{}'.format(
                mainStmt['startLine'], mainStmt['startOffset'],
                mainStmt['endLine'], mainStmt['endOffset'])
            numOfLine = endLineMainStmt - startLineMainStmt + 1
            numOfStatements = 0
            lstListOfBelowStatements = []
            strDictToString = ''
            if mainStmt['singleRoot']:
                # Single-rooted statement: count itself plus replaceable children.
                numOfStatements = 1
                lstListOfBelowStatements.append(mainStmt['type'])
                if 'children' in mainStmt.keys():
                    for child in mainStmt['children']:
                        if 'isReplaceable' in child.keys() and child['isReplaceable']:
                            strStmtChild = child['type']
                            numOfStatements = numOfStatements + 1
                            lstListOfBelowStatements.append(strStmtChild)
                strDictToString = str(mainStmt)
            else:
                # Otherwise use the statements at the deepest indent level.
                indentDeepest = sorted(
                    dictLinesAndElements[keyItem]['sortedElementsByIndents'].keys())[0]
                lstPossibleStatements = dictLinesAndElements[keyItem][
                    'sortedElementsByIndents'][indentDeepest]
                numOfStatements = len(lstPossibleStatements)
                for stmt in lstPossibleStatements:
                    lstListOfBelowStatements.append(stmt['type'])
                strDictToString = strSplitJson.join(map(str, lstPossibleStatements))
            strBelowStatements = strStmtSplit.join(lstListOfBelowStatements)
            # Update the two label histograms in place.
            if strMainStatement not in dictLabelStatistics['mainStmt'].keys():
                dictLabelStatistics['mainStmt'][strMainStatement] = 1
            else:
                dictLabelStatistics['mainStmt'][strMainStatement] = \
                    dictLabelStatistics['mainStmt'][strMainStatement] + 1
            if numOfStatements not in dictLabelStatistics['numOfStatements'].keys():
                dictLabelStatistics['numOfStatements'][numOfStatements] = 1
            else:
                dictLabelStatistics['numOfStatements'][numOfStatements] = \
                    dictLabelStatistics['numOfStatements'][numOfStatements] + 1
            strLbl = '{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\t{}\t{}'.format(
                strMainStatement, strPosition, numOfLine, numOfStatements,
                strBelowStatements, strDictToString, strCodeSplit,
                strCodeTokSplit, strPseudoSplit, strTokSplit, numAppear,
                numDisappear, percent)
            with open(fpItemVersionLabel, 'w') as outF:
                outF.write(strLbl)
            if 'test' in fopCodeVersion:
                # Per-version graph with the main statement highlighted.
                graph = pgv.AGraph(directed=True)
                isBlueColor = True
                idChange = str(mainStmt['id'])
                generateGraph(jsonObject, '', arrCodes, idChange, isBlueColor,
                              graph)
                graph.write(fopCodeVersion + fnIndexVersion + '_graph.dot')
            # Copy outputs into the per-main-statement and per-count layouts.
            fopItemAllMainStmt = (fopAllocateByMainStmt + strMainStatement +
                                  '/' + fonameItemAST + '/' + idCode + '/')
            createDirIfNotExist(fopItemAllMainStmt)
            fopItemAllNumOfStmts = (fopAllocateByNumOfStmts +
                                    str(numOfStatements) + '/' +
                                    fonameItemAST + '/' + idCode + '/')
            createDirIfNotExist(fopItemAllNumOfStmts)
            shutil.copy(fopCodeVersion + fnIndexVersion + '_mix.cpp',
                        fopItemAllMainStmt + fnIndexVersion + '_mix.cpp')
            shutil.copy(fopCodeVersion + fnIndexVersion + '_label.txt',
                        fopItemAllMainStmt + fnIndexVersion + '_label.txt')
            shutil.copy(fopCodeVersion + fnIndexVersion + '_mix.cpp',
                        fopItemAllNumOfStmts + fnIndexVersion + '_mix.cpp')
            shutil.copy(fopCodeVersion + fnIndexVersion + '_label.txt',
                        fopItemAllNumOfStmts + fnIndexVersion + '_label.txt')
            if 'test' in fopCodeVersion:
                shutil.copy(fopCodeVersion + fnIndexVersion + '_graph.dot',
                            fopItemAllNumOfStmts + fnIndexVersion + '_graph.dot')
    except:
        traceback.print_exc()
# NOTE(review): flat script fragment — sets up AST extraction for C++ files from the
# SPoC 'trainOnlyC' folder; the trailing `for` loop body is cut off in this view, so
# code is left untouched and only reviewer comments are added.
# Visible behavior: imports clang bindings and project utilities, defines the data
# folder layout, globs '*_code.cpp' files (sorted), creates the output AST folder,
# truncates the combined-AST output file, then iterates over each code file deriving
# a per-file AST output path.
# FIXME(review): `fineName.replace('_code.txt', '_ast.txt')` can never match — the
# glob collects files ending in '_code.cpp', so the replace is a no-op and fpASTItem
# keeps the '.cpp' name. Presumably '_code.cpp' was intended; confirm against the
# rest of the loop (truncated here) before fixing.
# NOTE(review): `glob` is used but not imported in this visible fragment — assumed
# imported elsewhere in the file; verify.
import sys, os import operator import clang.cindex sys.path.append(os.path.abspath(os.path.join('../..'))) from UtilFunctions import createDirIfNotExist, getPOSInfo, writeDictToFileText, runASTGenAndSeeResult fopData = '../../../dataPapers/' fopTextInSPoC = fopData + 'textInSPOC/' fopStatisticFolder = fopData + 'textInSPOC/trainOnlyC/' fopASTForC = fopData + 'textInSPOC/trainASTForC/' fpCombineASTs = fopTextInSPoC + 'combineASTs.txt' lstFiles = sorted(glob.glob(fopStatisticFolder + "*_code.cpp")) createDirIfNotExist(fopASTForC) dictCountWords = {} f1 = open(fpCombineASTs, 'w') f1.write('') f1.close() # walker = Walker('') numOmit = 30 for index in range(0, len(lstFiles)): fpCodeFileCPP = lstFiles[index] fineName = os.path.basename(fpCodeFileCPP) fpASTItem = fopASTForC + fineName.replace('_code.txt', '_ast.txt') # fFile=open(fpCodeFileCPP, 'r')
# NOTE(review): flat script fragment — path configuration for the step-5 HGT /
# embedding pipeline plus the start of a vocabulary-collection pass. The innermost
# `for word in arrWords:` loop body is cut off in this view, so code is left
# untouched and only reviewer comments are added.
# Visible behavior: defines absolute folder paths under mixCodeRaw/ (tf-idf model
# files, tree-sitter tokenization, literal dictionaries, NLTK/Stanford POS folders),
# creates the step-5 output folders, then reads every '*.textTrainEmb.txt' file
# line-by-line, splitting each line into whitespace tokens — presumably to populate
# `setVocabs` (the accumulation statement is outside this view; verify).
# NOTE(review): absolute '/home/hungphd/...' root makes this script machine-specific;
# consider an environment variable or CLI argument if this is meant to be shared.
fopRoot='/home/hungphd/media/dataPapersExternal/mixCodeRaw/' fopStep3V2=fopRoot+'step3_v2/' fopStep5V2=fopRoot+'step5_v2_HGT/' fopStep5EmbeddingModels=fopRoot+'embeddingModels/' fopStep5TextInfo=fopStep5EmbeddingModels+'textInfo/' fopVectorModel= fopStep5EmbeddingModels + 'tfidf/' fpD2VModelBin= fopVectorModel + 'model.bin' fpD2VModelText= fopVectorModel + 'model.text.txt' fopStep3TreesitterTokenize=fopRoot+'step3_treesitter_tokenize/' fopStep2CodeReplaceDict=fopRoot+'step2_code_replaceDict/' fopStep2PseudoTokenize=fopRoot+'step2_pseudo_tokenize/' fpDictLiterals=fopRoot+'step2_dictLiterals_all.txt' fopGraphEntityInfo=fopStep3V2+'graphEntityInfo/' fopPOSNLTK=fopRoot+'step2_afterTranslation/sortBySimilarityScore/pos_nltk/' fopPOSStanford=fopRoot+'step2_afterTranslation/sortBySimilarityScore/pos_stanford/' createDirIfNotExist(fopStep5V2) createDirIfNotExist(fopStep5EmbeddingModels) createDirIfNotExist(fopStep5TextInfo) createDirIfNotExist(fopVectorModel) lstFpTextFiles=glob.glob(fopStep5TextInfo+'*.textTrainEmb.txt') lstStrTextInfo=[] setVocabs=set() for fpItem in lstFpTextFiles: f1=open(fpItem,'r') arrTextItem=f1.read().strip().split('\n') f1.close() for line in arrTextItem: arrWords=line.split() for word in arrWords:
# NOTE(review): flat script fragment — baseline-evaluation aggregation. It references
# `lstFpInput`, `fopStep6ResultBaseline`, `lstEmbeddingInput` and
# `fopStep5EmbeddingModels`, all defined outside this view, and the
# per-embedding-model loop is cut off mid-body, so code is left untouched.
# Visible behavior: loads each input file's lines into dictFolderContent keyed by
# basename; writes a tab-separated header row (precision/recall/fscore/acc/timings)
# to result_overall.txt; then for each embedding model starts loading the
# ProgramRoot / NLRoot vector-for-embedding text files.
# NOTE(review): fnLocation / fnSource / fnLabelOverlap / fnTrainIndex are assigned
# here but not used in the visible portion — presumably consumed later; verify
# before removing.
dictFolderContent={} # dictNewContent={} for i in range(0,len(lstFpInput)): fpItem=lstFpInput[i] nameItem=os.path.basename(fpItem) f1=open(fpItem,'r') arrContent=f1.read().strip().split('\n') f1.close() dictFolderContent[nameItem]=arrContent # dictNewContent[nameItem]=[[],[],[]] fnLocation='location.txt' fnSource='source.txt' fnLabelOverlap='label.p1.overlap.txt' fnTrainIndex='trainValidTest.index.txt' createDirIfNotExist(fopStep6ResultBaseline) lstResult=['EmbedModel\tRoot\tprecision\trecall\tfscore\tacc\ttrain time\tpredict time'] fpOverallResult=fopStep6ResultBaseline+'result_overall.txt' f1=open(fpOverallResult,'w') f1.write('\n'.join(lstResult)+'\n') f1.close() for embedModel in lstEmbeddingInput: fpEmbedVectorProgramRoot=fopStep5EmbeddingModels+embedModel+'/ProgramRoot.vectorForEmb.txt' fpEmbedVectorNLRoot = fopStep5EmbeddingModels + embedModel + '/NLRoot.vectorForEmb.txt' f1=open(fpEmbedVectorProgramRoot,'r') arrProgramRootEmb=f1.read().strip().split('\n') f1.close() f1 = open(fpEmbedVectorNLRoot, 'r') arrNLRootEmb = f1.read().strip().split('\n')
# NOTE(review): fragment truncated at both ends — it opens with the tail of a
# token-translation function (appends either a literal id `strId` or the raw token,
# joins with spaces, returns the string; the function header and the branch condition
# are outside this view), then a flat script for step-2 pseudo-code tokenization whose
# trailing per-file loop is cut off mid-body. Code left untouched.
# Visible script behavior: defines the correctCodeRaw folder layout (pseudo code,
# tree-sitter ASTs, tokenized output, literal dictionaries), creates the token output
# folder, truncates the log and the combined-pseudo output files, then recursively
# globs '**/*_text.txt' and begins iterating per file.
# NOTE(review): dictLiterals / dictReverseALs are initialized empty here — presumably
# filled inside the truncated loop; verify.
lstStringTotal.append(strId) else: lstStringTotal.append(token) strOutput = ' '.join(lstStringTotal) return strOutput fopRoot = '../../../../dataPapers/textInSPOC/correctCodeRaw/' fopCodeFile = fopRoot + 'step2_pseudo/' fopASTFile = fopRoot + 'step3_pseudo_treesitter/' fopTokASTFile = fopRoot + 'step2_pseudo_tokenize/' fpLogSuccessAndFailed = fopRoot + 'log_step2_pseudo_tok.txt' fpPseudoAll = fopRoot + 'step2_pseudo_all.txt' fpDictLiteral = fopRoot + 'step2_dictLiterals_all.txt' fpDictRvLiteral = fopRoot + 'step2_dictRvLiterals_all.txt' createDirIfNotExist(fopTokASTFile) f1 = open(fpLogSuccessAndFailed, 'w') f1.write('') f1.close() f1 = open(fpPseudoAll, 'w') f1.write('') f1.close() dictLiterals = {} dictReverseALs = {} lstFpCodes = glob.glob(fopCodeFile + '**/*_text.txt', recursive=True) for i in range(0, len(lstFpCodes)): fnItemCode = os.path.basename(lstFpCodes[i]) fopItemCode = os.path.dirname(lstFpCodes[i]) + '/'
# NOTE(review): flat script header for the step-5 mix-code data/label pipeline, plus
# the head of `def scoreName(percentage)` which runs past the end of this view (the
# elif chain and its return are cut off), so code is left untouched.
# Visible behavior: extends sys.path to the project root, imports project utilities,
# joblib parallelism, tree-sitter bindings and graph-extraction helpers; defines the
# correctCodeRaw step-4/step-5 folders; creates the output folder and globs the
# per-version subfolders.
# scoreName (visible part): buckets a percentage into a string level — '1' for
# <=10, '2' for (10,20], '3' for (20,30], '4' for (30,40], '5' for (40,50];
# presumably the truncated remainder continues in steps of 10 and returns scoreLevel
# — verify against the full file.
# NOTE(review): `from UtilFunctions import createDirIfNotExist, getPOSInfo,
# writeDictToFileText` is imported twice (second import is a subset of the first) —
# the duplicate line is redundant and could be dropped.
sys.path.append(os.path.abspath(os.path.join('../..'))) from UtilFunctions import createDirIfNotExist, getPOSInfo, writeDictToFileText, runASTGenAndSeeResult import asyncio import time from joblib import Parallel, delayed from UtilFunctions import createDirIfNotExist, getPOSInfo, writeDictToFileText from tree_sitter import Language, Parser from LibForGraphExtractionFromRawCode import getJsonDict, getTerminalValue import ast import re fopRoot = '../../../../dataPapers/textInSPOC/correctCodeRaw/' fopMixVersion = fopRoot + 'step4_mixCode/' fopMixDataAndLabel = fopRoot + 'step5_data_mixCode/' createDirIfNotExist(fopMixDataAndLabel) lstSubFolder = glob.glob(fopMixVersion + '*/') def scoreName(percentage): scoreLevel = '1' if percentage <= 10: scoreLevel = '1' elif percentage > 10 and percentage <= 20: scoreLevel = '2' elif percentage > 20 and percentage <= 30: scoreLevel = '3' elif percentage > 30 and percentage <= 40: scoreLevel = '4' elif percentage > 40 and percentage <= 50: scoreLevel = '5'