# Setup chunk for a Doc2Vec + classifier experiment: loads the paragraph-text
# corpus and prepares output folders. Formatting restored from a collapsed line;
# only comments added. NOTE(review): createDirIfNotExist is defined elsewhere.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Input corpus and result locations for this ML run.
fopRoot = '../../../../../../media/dataPapersExternal/mixCodeRaw/'
fpInputText = fopRoot + 'embeddingModels/d2v/paragraph_text.txt'
fopOutputML = fopRoot + 'resultMLs/doc2vec-rfs-small/'
fpResultDetails = fopOutputML + 'result_details.txt'
fpDoc2VecModel = fopRoot + 'embeddingModels/d2v/d2v.model.txt'
createDirIfNotExist(fopOutputML)

# Read the whole corpus into memory, one record per line.
f1 = open(fpInputText, 'r')
arrText = f1.read().strip().split('\n')
f1.close()

trainIndex = 0
lstAllData = []
num_features = 100  # embedding dimensionality used downstream
keyStartEnd = ''
dictStartEnd = {}
index = 0
lstTuplesTrainTestValid = []
arrText.reverse()  # NOTE(review): processes lines in reverse order — confirm intent
# Records appear to come in pairs of lines (step 2, starting at 1);
# the loop body continues past this chunk and is not visible here.
for i in range(1, len(arrText), 2):
from nltk.tokenize import word_tokenize
import ast
import re
import pygraphviz as pgv
import pydot
from subprocess import check_call
from graphviz import render
import copy
import nltk
from pathlib import Path

# Step-2 folder layout (pseudo-code tokens, replacement dicts, pre-translation data).
# NOTE(review): createDirIfNotExist and glob come from elsewhere in the file.
fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputStep2Pseudocode = fopRoot + 'step2_pseudo_tokenize/'
fopInputStep2ReplaceDict = fopRoot + 'step2_code_replaceDict/'
fopInputStep2BeforeTranslation = fopRoot + 'step2_beforeTranslation/step1/'
createDirIfNotExist(fopInputStep2ReplaceDict)
createDirIfNotExist(fopInputStep2BeforeTranslation)

# Gather every *_text.txt under the pseudo-code folder, reverse-sorted
# per directory and per file, preserving the original traversal order.
lstFopStep2Tok1 = sorted(glob.glob(fopInputStep2Pseudocode + '**/'), reverse=True)
lstFpStep2Pseudocode = []
for fop1 in lstFopStep2Tok1:
    lstFpItem1 = sorted(glob.glob(fop1 + '*_text.txt'), reverse=True)
    lstFpStep2Pseudocode.extend(lstFpItem1)

# Literal dictionary file: "placeholder-id <tab> literal text" per line.
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
with open(fpDictLiterals, 'r') as f1:
    arrLits = f1.read().strip().split('\n')
dictLiteralsReverse = {}
# NOTE(review): this chunk begins inside a try block of an unseen enclosing
# loop; the indentation of the leading fragment is reconstructed and should be
# confirmed against the full file. Only comments added.
        tupItem=(strSourceValue,strTargetValue,strEdgeId)
        dictHGTEdges[strNewKey].append(tupItem)
    except:
        # Broad except: best-effort edge collection, errors only logged.
        traceback.print_exc()

# Paths for building the "total graph" across mixed-code versions.
fopRoot='/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopMixVersion=fopRoot+'step4_mixCode/'
numContext=5
fopTotalGraphAll= fopRoot + 'step5_totalGraph_small/'+str(numContext)+'/'
fpFileCachedVersion=fopRoot+'cached_graph_all.txt'
fpDotTotalGraph= fopTotalGraphAll + 'total.'+str(numContext)+'.dot'
fpPngTotalGraph= fopTotalGraphAll + 'total.'+str(numContext)+'.png'
fpDictLiterals=fopRoot+'step2_dictLiterals_all.txt'
strSplitCharacterForNodeEdge=' ABAZ '
createDirIfNotExist(fopTotalGraphAll)
dictTotalElements={}
lstFpVersionFiles=[]
# Only traverse the (slow) directory tree when no cached file list exists.
if not os.path.isfile(fpFileCachedVersion):
    print('before traverse')
    lstFop1=sorted(glob.glob(fopMixVersion+'*/'))
    for fop1 in lstFop1:
        lstFop2=sorted(glob.glob(fop1+'*/'))
        for fop2 in lstFop2:
            # Each version folder contributes one g_all.dot graph file.
            lstFp3=sorted(glob.glob(fop2+'v_*_graphs/g_all.dot'))
            # print(fp3)
            for fp3 in lstFp3:
                lstFpVersionFiles.append(fp3)
        print('end {}'.format(fop1))
# Placeholder token used when flattening single-line comments.
strSingleComment = ' SINGLECOMMENTCHAR '

# Folder layout for step 2 (tokenized pseudo-code through post-translation POS data).
fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputStep2Pseudocode = fopRoot + 'step2_pseudo_tokenize/'
fopInputStep2ReplaceDict = fopRoot + 'step2_code_replaceDict/'
fopInputStep2BeforeTranslation = fopRoot + 'step2_beforeTranslation/step1/'
fopInputStep2Folds = fopRoot + 'step2_beforeTranslation/folds/'
fopInputAfterTranslation = fopRoot + 'step2_afterTranslation/'
fpSourceAfterTrans = fopInputAfterTranslation + 'source.txt'
fopPOSStanford = fopInputAfterTranslation + 'pos_stanford/'
fopPOSNLTK = fopInputAfterTranslation + 'pos_nltk/'

# Similarity-sorted variants of the POS outputs.
fopSortedBySimScore = fopInputAfterTranslation + 'sortBySimilarityScore/'
fpSortedSource = fopSortedBySimScore + 'source.txt'
fopSortedPOSStanford = fopSortedBySimScore + 'pos_stanford/'
fopSortedPOSNLTK = fopSortedBySimScore + 'pos_nltk/'
createDirIfNotExist(fopSortedPOSStanford)
createDirIfNotExist(fopSortedPOSNLTK)

# String forms used to distinguish pyparsing results from plain strings.
strParseResultsType = "<class 'pyparsing.ParseResults'>"
strStrType = "<class 'str'>"

# BLLIP reranking parser (downloads the model on first run).
nltk.download('bllip_wsj_no_aux')
model_dir = find('models/bllip_wsj_no_aux').path
parser = RerankingParser.from_unified_model_dir(model_dir)

# Stanford CoreNLP server connection (assumes a local server is running).
strServerPort = '9000'
nlpObj = StanfordCoreNLP('http://localhost:' + strServerPort)

# Append-mode output files for the incremental POS results.
fpAppendNLTKText = fopSortedBySimScore + 'appendPOS.nltk.text.txt'
fpAppendNLTKPOS = fopSortedBySimScore + 'appendPOS.nltk.pos.txt'
fpAppendStanfordText = fopSortedBySimScore + 'appendPOS.stanford.text.txt'
fpAppendStanfordPOS = fopSortedBySimScore + 'appendPOS.stanford.pos.txt'
# NOTE(review): chunk begins inside a try within a loop over lstFpJsonFiles
# (defined above this view); indentation reconstructed. Only comments added.
        str2LineToAdd='{}\t{}\t{}\tProgramRoot\t{}\n{}\t{}\t{}\tNLRoot\t{}'.format(fpItemLabel,programName,versionName,strCodeText,fpItemLabel,programName,versionName,strPseudoText)
        lstProgramText.append(str2LineToAdd)
        # Flush the buffer every 1000 files and at the very end.
        if (i+1)%1000==0 or (i+1)==len(lstFpJsonFiles):
            f1=open(fpParagraphText,'a')
            f1.write('\n'.join(lstProgramText)+'\n')
            f1.close()
            lstProgramText=[]
            print('end {} paragraphText'.format((i+1)))
    except:
        traceback.print_exc()
print('end paragraph text')

# Phase 2: infer a Doc2Vec vector for every paragraph-text record.
modelD2v=Doc2Vec.load(fpD2VModel)
if not os.path.isdir(fopParagraphEmb):
    createDirIfNotExist(fopParagraphEmb)
f1=open(fpParagraphText,'r')
arrParagraphText=f1.read().strip().split('\n')
f1.close()
lstEmbeddings=[]
indexPara=0
# NOTE(review): 'reaad' is a typo in the log message; left as-is (runtime string).
print('reaad paragraph text {}'.format(len(arrParagraphText)))
for i in range(0,len(arrParagraphText)):
    # Record layout: label \t program \t version \t content...
    arrTabs=arrParagraphText[i].split('\t')
    if len(arrTabs)>=4:
        # NOTE(review): joining with '' drops tab separators inside the content
        # — confirm this is intended rather than '\t'.join(...).
        strContent=''.join(arrTabs[3:])
        # print(strContent)
        vectorItem=modelD2v.infer_vector(word_tokenize(strContent))
        strItem='{}\t{}\t{}\t{}'.format(arrTabs[0],arrTabs[1],arrTabs[2],' '.join(map(str,vectorItem)))
        lstEmbeddings.append(strItem)
        # Flush every 2000 embeddings and at the end (body continues past this chunk).
        if len(lstEmbeddings)%2000==0 or (i+1)==len(arrParagraphText):
from bllipparser import RerankingParser
import nltk
from pyparsing import OneOrMore, nestedExpr
from nltk.tokenize import word_tokenize, sent_tokenize
from pycorenlp import StanfordCoreNLP

# Directory layout for splitting the POS-filtered data into train/valid/test.
# NOTE(review): createDirIfNotExist, glob and os come from elsewhere in the file.
fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputAfterTranslation = fopRoot + 'step2_afterTranslation/'
fopFilterSortedBySimScore = fopInputAfterTranslation + 'sortBySimilarityScore/filterByPOS/'
fopSplit = fopInputAfterTranslation + 'sortBySimilarityScore/trainValidTest/'
fpTrainValidTestIndex = fopSplit + 'trainValidTest.index.txt'
fopTrain = fopSplit + 'train/'
fopValid = fopSplit + 'valid/'
fopTest = fopSplit + 'test/'
createDirIfNotExist(fopTrain)
createDirIfNotExist(fopValid)
createDirIfNotExist(fopTest)

# Cache each input file's lines and prepare three empty buckets
# (presumably train/valid/test — confirm against later code).
lstFpInput = glob.glob(fopFilterSortedBySimScore + '*.txt')
dictFolderContent = {}
dictNewContent = {}
for fpItem in lstFpInput:
    nameItem = os.path.basename(fpItem)
    with open(fpItem, 'r') as f1:
        arrContent = f1.read().strip().split('\n')
    dictFolderContent[nameItem] = arrContent
    dictNewContent[nameItem] = [[], [], []]
import ast
import re
import pygraphviz as pgv
import pydot
from subprocess import check_call
from graphviz import render
import copy
import nltk
from pathlib import Path

# Step-2 folder layout; fold directories are created here as well.
# NOTE(review): createDirIfNotExist is defined elsewhere in the file.
fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputStep2Pseudocode = fopRoot + 'step2_pseudo_tokenize/'
fopInputStep2ReplaceDict = fopRoot + 'step2_code_replaceDict/'
fopInputStep2BeforeTranslation = fopRoot + 'step2_beforeTranslation/step1/'
fopInputStep2Folds = fopRoot + 'step2_beforeTranslation/folds/'
createDirIfNotExist(fopInputStep2ReplaceDict)
createDirIfNotExist(fopInputStep2Folds)

# Build the reverse literal dictionary: literal text -> placeholder id.
# File layout: "placeholder-id <tab> literal text" per line.
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
with open(fpDictLiterals, 'r') as f1:
    arrLits = f1.read().strip().split('\n')
dictLiteralsReverse = {}
for item in arrLits:
    # Split only on the first tab: everything after it is the literal content.
    parts = item.split('\t', 1)
    if len(parts) == 2:
        dictLiteralsReverse[parts[1]] = parts[0]

fpLocation = fopInputStep2BeforeTranslation + 'location.txt'
fpSource = fopInputStep2BeforeTranslation + 'source.txt'
# Location of the CoreNLP distribution and sentinel strings used by the
# POS-filtering step. Formatting restored; behavior unchanged.
fopStanfordCoreNLP = '/home/hungphd/git/dataPapers/stanford-corenlp-4.2.2/'
strParseResultsType = "<class 'pyparsing.ParseResults'>"
strStrType = "<class 'str'>"

# Placeholder tokens used when flattening code to single-line form.
strSplitElement = ' SPLITELEMENT '
strEndLine = ' ENDLINE '
strTabChar = ' TABCHAR '
strSingleComment = ' SINGLECOMMENTCHAR '

# Folder layout for the POS-filtered, similarity-sorted data.
fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputAfterTranslation = fopRoot + 'step2_afterTranslation/'
fopSortedBySimScore = fopInputAfterTranslation + 'sortBySimilarityScore/'
fopFilterSortedBySimScore = fopSortedBySimScore + 'filterByPOS/'
fopStep3Filer = fopRoot + 'step3_filter_pos/'
createDirIfNotExist(fopFilterSortedBySimScore)

fpFilterSortedLocation = fopFilterSortedBySimScore + 'location.txt'
fpFilterSortedSource = fopFilterSortedBySimScore + 'source.txt'
fpFilterSortedTarget = fopFilterSortedBySimScore + 'target.txt'
fpFilterSortedPred = fopFilterSortedBySimScore + 'pred.txt'
fpFilterSortedIdentifierLog = fopFilterSortedBySimScore + 'identifier_log.txt'
fpFilterSortedLabelOverlapDetection = fopFilterSortedBySimScore + 'label.p1.overlap.txt'
fpFilterSortedLabelIdentifierDetection = fopFilterSortedBySimScore + 'label.p2.identifierDetection.txt'
fpFilterSortedLabelJaccard = fopFilterSortedBySimScore + 'label.p3.jaccardSimilarity.txt'

# Load the filtered sources and their locations, one record per line.
with open(fpFilterSortedSource, 'r') as f1:
    arrSources = f1.read().strip().split('\n')
with open(fpFilterSortedLocation, 'r') as f1:
    arrLocs = f1.read().strip().split('\n')
# Placeholder tokens shared with the flattening/translation steps.
strEndLine = ' ENDLINE '
strTabChar = ' TABCHAR '
strSingleComment = ' SINGLECOMMENTCHAR '

# Similarity-sorted outputs of the translation step.
fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputAfterTranslation = fopRoot + 'step2_afterTranslation/'
fopSortedBySimScore = fopInputAfterTranslation + 'sortBySimilarityScore/'
fpSortedLocation = fopSortedBySimScore + 'location.txt'
fpSortedSource = fopSortedBySimScore + 'source.txt'
fpSortedTarget = fopSortedBySimScore + 'target.txt'
fpSortedPred = fopSortedBySimScore + 'pred.txt'
fpSortedIdentifiers = fopSortedBySimScore + 'identifier_log.txt'
fpSortedDetails = fopSortedBySimScore + 'sortedDetails.txt'
fopSortedPOSStanford = fopSortedBySimScore + 'pos_stanford/'
fopSortedPOSNLTK = fopSortedBySimScore + 'pos_nltk/'
createDirIfNotExist(fopSortedPOSStanford)
createDirIfNotExist(fopSortedPOSNLTK)

# POS-filtered subset of the sorted data.
fopFilterSortedBySimScore = fopSortedBySimScore + 'filterByPOS/'
fopStep3Filer = fopRoot + 'step3_filter_pos/'
createDirIfNotExist(fopFilterSortedBySimScore)
fpFilterSortedLocation = fopFilterSortedBySimScore + 'location.txt'
fpFilterSortedSource = fopFilterSortedBySimScore + 'source.txt'
fpFilterSortedTarget = fopFilterSortedBySimScore + 'target.txt'
fpFilterSortedPred = fopFilterSortedBySimScore + 'pred.txt'
fpFilterSortedSortDetails = fopFilterSortedBySimScore + 'sortedDetails.txt'
fpFilterSortedIdentifier = fopFilterSortedBySimScore + 'identifier_log.txt'
fpFilterSortedPOSNLTK = fopFilterSortedBySimScore + 'pos_nltk.txt'
fpFilterSortedPOSStanford = fopFilterSortedBySimScore + 'pos_stanford.txt'
# NOTE(review): chunk begins inside an unseen helper function; the indentation
# of the leading fragment is reconstructed and should be confirmed against the
# full file. Only comments added.
            break
    # if not result:
    #     print('check false')
    return result

# Folder layout for sorting the after-translation outputs by similarity score.
fopRoot='/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputAfterTranslation=fopRoot+'step2_afterTranslation/'
fopInputSortBySimilarityScore=fopInputAfterTranslation+'sortBySimilarityScore/'
fpSourceAfterTrans=fopInputAfterTranslation+'source.txt'
fpTargetAfterTrans=fopInputAfterTranslation+'target.txt'
fpLocationAfterTrans=fopInputAfterTranslation+'location.txt'
fpPredAfterTrans=fopInputAfterTranslation+'pred.txt'
createDirIfNotExist(fopInputSortBySimilarityScore)
fpSortedSourceAfterTrans=fopInputSortBySimilarityScore+'source.txt'
fpSortedTargetAfterTrans=fopInputSortBySimilarityScore+'target.txt'
fpSortedLocationAfterTrans=fopInputSortBySimilarityScore+'location.txt'
fpSortedPredAfterTrans=fopInputSortBySimilarityScore+'pred.txt'
fpSortedDetailsAfterTrans=fopInputSortBySimilarityScore+'sortedDetails.txt'
fpFuncDeclInfo=fopInputSortBySimilarityScore+'function_declarations.txt'

# Read the function-declaration index and build a lookup keyed by the
# first two tab-separated fields; the third field is a Python literal.
f1=open(fpFuncDeclInfo,'r')
arrFuncDecls=f1.read().strip().split('\n')
f1.close()
dictFuncDecls={}
for item in arrFuncDecls:
    arrTabs=item.split('\t')
    strKey=arrTabs[0]+'\t'+arrTabs[1]
    # Loop body continues past this chunk (lstVal is presumably stored in dictFuncDecls).
    lstVal=ast.literal_eval(arrTabs[2])
# NOTE(review): dictText, fopPOSStanford/fopPOSNLTK, nlpObj and parser are
# defined above this view; the final call is truncated mid-argument-list.
# Only comments added.
# Deduplicate non-empty source lines into dictText (a set).
for i in range(0, len(arrSources)):
    if arrSources[i].strip() != '':
        dictText.add(arrSources[i].strip())
print(len(dictText))
lstText = list(dictText)
lstItemText = []
lstItemPOS1 = []
lstItemPOS2 = []
indexItem = 0
# if os.path.exists(fopPOSStanford) and os.path.isdir(fopPOSStanford):
#     shutil.rmtree(fopPOSStanford)
# if os.path.exists(fopPOSNLTK) and os.path.isdir(fopPOSNLTK):
#     shutil.rmtree(fopPOSNLTK)
createDirIfNotExist(fopPOSStanford)
createDirIfNotExist(fopPOSNLTK)
# Tag each unique text with both Stanford CoreNLP and the BLLIP/NLTK parser.
for i in range(0, len(lstText)):
    try:
        strText = lstText[i]
        strPOSStanford = getGraphDependencyFromTextByStanford(strText, nlpObj)
        try:
            lstNonT = []
            lstT = []
            # Parse, take the best tree, then re-parse its PTB form with
            # pyparsing to walk the nested structure.
            best = parser.parse(strText)
            strParseContent = str(best.get_parser_best().ptb_parse)
            dataParseResult = OneOrMore( nestedExpr()).parseString(strParseContent)
            strPOSNLTK = walkAndGetPOSJSonByNLTK(dataParseResult, 0, lstNonT,
# Merges per-fold test predictions/locations back into single files.
# Chunk ends mid-loop (the opened file is read past this view). Only comments added.
import copy
import nltk
from pathlib import Path

fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputStep2Pseudocode = fopRoot + 'step2_pseudo_tokenize/'
fopInputStep2ReplaceDict = fopRoot + 'step2_code_replaceDict/'
fopInputStep2BeforeTranslation = fopRoot + 'step2_beforeTranslation/step1/'
fopInputStep2Folds = fopRoot + 'step2_beforeTranslation/folds/'
fopInputAfterTranslation = fopRoot + 'step2_afterTranslation/'
fpTestLocExpected = fopInputStep2BeforeTranslation + 'location.txt'
fpTestPredBeforeSort = fopInputAfterTranslation + 'test.pred.unsorted.txt'
fpTestLocBeforeSort = fopInputAfterTranslation + 'test.loc.unsorted.txt'
fpTestPredAfterSort = fopInputAfterTranslation + 'pred.txt'
fpTestLocAfterSort = fopInputAfterTranslation + 'pred.loc.txt'
createDirIfNotExist(fopInputAfterTranslation)

# Truncate the two "before sort" accumulator files before appending fold data.
f1 = open(fpTestLocBeforeSort, 'w')
f1.write('')
f1.close()
f1 = open(fpTestPredBeforeSort, 'w')
f1.write('')
f1.close()

dictLocBefore = {}
# Iterate folds 1..10; loop body continues past this chunk.
for i in range(1, 11):
    nameFold = 'fold-{}'.format(i)
    fpFoldTestLocation = fopInputStep2Folds + nameFold + '/test.loc.txt'
    fpFoldTestPred = fopInputStep2Folds + nameFold + '/pred.txt'
    f1 = open(fpFoldTestLocation, 'r')