def genGraph(fpPseudoCode,fopGraph):
    createDirIfNotExist(fopGraph)
    f1=open(fpPseudoCode)
    strPseudoCodes=f1.read()
    f1.close()
    dictPseudoCodes={}
    arrPseudoCodes=strPseudoCodes.split('\n')
    currentKey=None
    for i in range(0,len(arrPseudoCodes)):
        strTrim=arrPseudoCodes[i].strip()
        if strTrim.endswith('_text.txt'):
            currentKey=strTrim
            lstPseudoCodes=[]
            dictPseudoCodes[currentKey]=lstPseudoCodes
            # print(currentKey)
        elif currentKey is not None:
            # ignore any content lines that appear before the first '_text.txt' header
            dictPseudoCodes[currentKey].append(arrPseudoCodes[i])
    index=0
    for key in dictPseudoCodes.keys():
        index=index+1
        pseudoCodeName=key.replace('_text.txt','')
        lstPseudoCodes=dictPseudoCodes[key]
        strPseudo='\n'.join(lstPseudoCodes).strip().replace('.',' PUNC_CHAR ')
        strPseudo =strPseudo.replace('\n',' . ')
        # lstPseudoCodes='\n'.join(lstPseudoCodes).strip().split('\n')
        fpItemPseudoCode=fopGraph+pseudoCodeName+'_graphNL.txt'
        # createDirIfNotExist(fopItemPseudoCode)
        print('{}\t{}'.format(index,fpItemPseudoCode))
        strJson = textToJson(strPseudo)
        f1 = open(fpItemPseudoCode, 'w')
        f1.write(strJson)
        f1.close()
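
# A minimal usage sketch for genGraph (paths are hypothetical; createDirIfNotExist and
# textToJson are assumed to be defined elsewhere in this project). Each input line ending
# in '_text.txt' starts a new pseudocode block, and one '<name>_graphNL.txt' JSON file is
# written per block:
# genGraph('/path/to/pseudocode_all.txt', '/path/to/graphNL/')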
def compileMixCCodeAndSave(fopStep1, fopStep2, fopASTInfo, fopStep4GraphAll,
                           fopStep4GraphSimplify, fpLog, nlpObj, offsetContext,
                           isSaveGraph):
    createDirIfNotExist(fopStep2)
    createDirIfNotExist(fopASTInfo)
    createDirIfNotExist(fopStep4GraphAll)
    createDirIfNotExist(fopStep4GraphSimplify)

    f1 = open(fpLog, 'w')
    f1.write('')
    f1.close()
    numWordProcess = 0
    totalTimeProcess = 0
    lstCFilesStep1 = glob.glob(fopStep1 + '*.cpp')

    # t = time.time()
    # Parallel(n_jobs=8)(delayed(checkAndGenerateAST)(i,lstCFilesStep1, fopStep2, fopASTInfo,fpLog) for i in range(0,len(lstCFilesStep1)))
    for i in range(0, len(lstCFilesStep1)):
        # if i!=2:
        #     continue
        ii, itemNumWord, itemNumTime = checkAndGenerateAST(
            i, lstCFilesStep1, fopStep2, fopASTInfo, fopStep4GraphAll,
            fopStep4GraphSimplify, fpLog, nlpObj, offsetContext, isSaveGraph)
        numWordProcess = numWordProcess + itemNumWord
        totalTimeProcess = totalTimeProcess + itemNumTime
        # break
    # guard against division by zero when no files were processed
    avgTimePerWords = (totalTimeProcess * 1.0) / numWordProcess if numWordProcess > 0 else 0.0
    print('total words {} total time {} avg {}'.format(numWordProcess, totalTimeProcess, avgTimePerWords))
def extractTextFiles(fpData,fopSPOCPlain,fopSPOCNested):
    # let pandas open and close the file itself instead of leaking a handle
    df = pd.read_csv(fpData, delimiter="\t")
    df_grouped=df.groupby(['subid','probid','workerid'])
    # iterate over each group
    index=0
    lenGrouped=len(df_grouped)
    for group_name, df_group in df_grouped:

        #print('\nCREATE TABLE {}('.format(group_name))
        #print('aaa {} {}'.format(type(df_group),group_name))
        index=index+1
        try:
            strTotalText, strTotalCode,strTotalLine=getPseudoCodeAndCode(df_group)
            # print('type {}'.format(df_group['workerid']))
            # row0=df_group.itertuples()[0]
            workId=df_group['workerid'].iloc[0]
            probId=df_group['probid'].iloc[0]
            subId=df_group['subid'].iloc[0]
            fpPlainText='{}/{}_{}_{}_text.txt'.format(fopSPOCPlain, workId, probId, subId)
            fpPlainCode = '{}/{}_{}_{}_code.txt'.format(fopSPOCPlain, workId, probId, subId)
            fpPlainLine = '{}/{}_{}_{}_line.txt'.format(fopSPOCPlain, workId, probId, subId)
            fopNest= fopSPOCNested + '/' + str(workId) + '/' + str(probId) + '/' + str(subId) + '/'
            createDirIfNotExist(fopNest)
            fpNestText = '{}/{}_{}_{}_text.txt'.format(fopNest, workId, probId, subId)
            fpNestCode = '{}/{}_{}_{}_code.txt'.format(fopNest, workId, probId, subId)
            fpNestLine = '{}/{}_{}_{}_line.txt'.format(fopNest, workId, probId, subId)

            # write the same three payloads to both the plain and the nested layouts
            for fpOut, strOut in [(fpPlainText, strTotalText), (fpPlainCode, strTotalCode),
                                  (fpPlainLine, strTotalLine), (fpNestText, strTotalText),
                                  (fpNestCode, strTotalCode), (fpNestLine, strTotalLine)]:
                fff = open(fpOut, 'w')
                fff.write(strOut)
                fff.close()
        except Exception as e:
            print('{}\n{}'.format(str(e),traceback.format_exc()))
        print('Index {}/{}'.format(index,lenGrouped))
def compileMixCCodeAndSave(fopStep1, fopStep2, fopASTInfo, fpLog, numOmit):
    createDirIfNotExist(fopStep2)
    createDirIfNotExist(fopASTInfo)

    f1 = open(fpLog, 'w')
    f1.write('')
    f1.close()

    lstCFilesStep1 = glob.glob(fopStep1 + '*.cpp')

    t = time.time()
    Parallel(n_jobs=8)(delayed(checkAndGenerateAST)(
        i, lstCFilesStep1, fopStep2, fopASTInfo, fpLog, numOmit)
                       for i in range(0, len(lstCFilesStep1)))
    print(time.time() - t)
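
# The Parallel/delayed pattern used above is standard joblib: delayed(f)(args) builds a
# lazy call, and Parallel(n_jobs=...) runs the generator of calls across worker
# processes, returning results in input order. A self-contained sketch of the same
# pattern (_square stands in for checkAndGenerateAST):
from joblib import Parallel, delayed

def _square(x):
    return x * x

_demoResults = Parallel(n_jobs=2)(delayed(_square)(i) for i in range(10))
# _demoResults == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]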
def generateMixFiles(fopPseudoCode, fopCodeFile, fopOutputMix):
    createDirIfNotExist(fopOutputMix)
    lstFileItemSPoC = glob.glob(fopCodeFile + '*_code.cpp')
    indexFile = -1
    for index in range(0, len(lstFileItemSPoC)):
        fpCodeFile = lstFileItemSPoC[index]
        indexFile = indexFile + 1
        nameOfSubmission = os.path.basename(fpCodeFile).replace(
            '_code.cpp', '')
        fpPseudoCode = fopPseudoCode + nameOfSubmission + '_text.txt'
        f1 = open(fpCodeFile, 'r')
        arrCodeLines = f1.read().strip().split('\n')
        f1.close()
        f1 = open(fpPseudoCode, 'r')
        arrPseudoLines = f1.read().strip().split('\n')
        f1.close()

        # lstIndexes=getMostComplicatedPseudocode(arrPseudoLines,arrCodeLines,distanceHeader,4)
        # for indPseudoCode in range(0,len(lstIndexes)):
        for indPseudoCode in range(0, len(arrPseudoLines)):
            # guard: the code line paired with this pseudocode line must exist
            if indPseudoCode + distanceHeader >= len(arrCodeLines):
                break
            if not checkComplicatedPseudoCodeAndCode(
                    arrPseudoLines[indPseudoCode].strip(),
                    arrCodeLines[indPseudoCode + distanceHeader].strip()):
                continue
            lstStrCodeCombine = []
            for ind in range(0, distanceHeader):
                lstStrCodeCombine.append(arrCodeLines[ind])
            for ind in range(distanceHeader, len(arrCodeLines)):
                indInCode = ind - distanceHeader
                if indInCode == indPseudoCode:
                    strLinePseudo = '// {}'.format(
                        arrPseudoLines[indPseudoCode])
                    lstStrCodeCombine.append(strLinePseudo)
                else:
                    lstStrCodeCombine.append(arrCodeLines[ind])
            fpVersion = fopOutputMix + nameOfSubmission + '_v' + str(
                indPseudoCode + 1) + '.cpp'
            f1 = open(fpVersion, 'w')
            f1.write('\n'.join(lstStrCodeCombine))
            f1.close()
        print('complete {}/{} {}'.format(index + 1, len(lstFileItemSPoC),
                                         fpCodeFile))
def compileMixCCodeAndSave(fopStep1,fopStep2,fopASTInfo,fopStep4GraphAll,fopStep4GraphSimplify,fpLog,nlpObj,offsetContext,isSaveGraph):
    createDirIfNotExist(fopStep2)
    createDirIfNotExist(fopASTInfo)
    createDirIfNotExist(fopStep4GraphAll)
    createDirIfNotExist(fopStep4GraphSimplify)

    f1 = open(fpLog, 'w')
    f1.write('')
    f1.close()

    lstCFilesStep1=glob.glob(fopStep1+'*_text.txt')

    # t = time.time()
    # Parallel(n_jobs=8)(delayed(checkAndGenerateAST)(i,lstCFilesStep1, fopStep2, fopASTInfo,fpLog) for i in range(0,len(lstCFilesStep1)))
    for i in range(0, len(lstCFilesStep1)):
        # if i!=2:
        #     continue
        checkAndGenerateAST(i, lstCFilesStep1, fopStep2, fopASTInfo,fopStep4GraphAll,fopStep4GraphSimplify,fpLog,nlpObj,offsetContext,isSaveGraph)
def generateScript(fpDictRepo, fopMetadata,fopCommitMessage,fopRepoExtract,fopRepoZip,fpLogCommandDownload,fpLogCommandExtract,fpLogCommandCommit):
    try:
        f1=open(fpDictRepo,'r')
        arrDicts=f1.read().strip().split('\n')
        f1.close()
        createDirIfNotExist(fopMetadata)
        createDirIfNotExist(fopCommitMessage)
        createDirIfNotExist(fopRepoZip)
        createDirIfNotExist(fopRepoExtract)

        # initialise each command log once if it does not exist yet
        for fpLogItem in [fpLogCommandDownload, fpLogCommandExtract, fpLogCommandCommit]:
            if not os.path.exists(fpLogItem):
                f1 = open(fpLogItem, 'w')
                f1.write('')
                f1.close()

        lstCommandDownloads=[]
        lstCommandExtract = []
        lstCommandCommitMessage = []
        for i in range(0,len(arrDicts)):
            try:
                arrItem=arrDicts[i].strip().split('\t')
                if len(arrItem)>=2:
                    strAuthor=arrItem[0].split('__')[0]
                    strRepo = arrItem[0].split('__')[1]
                    strSHA=''  # commit SHA to fetch; expected to be filled in from the repo metadata
                    # GitHub's /archive/<sha>.zip endpoint serves an actual zip (the /tree/ URL is an HTML page)
                    strZipDownload='https://github.com/{}/{}/archive/{}.zip'.format(strAuthor,strRepo,strSHA)
                    fopZipLocation=fopRepoZip+strAuthor+'/'+strRepo+'/'
                    createDirIfNotExist(fopZipLocation)
                    fpZipLocation=fopZipLocation+strSHA+'.zip'
                    commandDownload='wget {} -O {}'.format(strZipDownload,fpZipLocation)
                    # print(fopRepoZip)
                    commandExtract='unzip {} -d {}'.format(fpZipLocation,fopRepoExtract)

                    createDirIfNotExist(fopMetadata+strAuthor+'/')
                    fopLocalMeta=fopMetadata+strAuthor+'/'+strRepo+'/'
                    fpLogMessage=fopCommitMessage+strAuthor+'__'+strRepo+'.txt'
                    commandFirstRemove='rm -rf {}'.format(fopLocalMeta)
                    commandCommitMessage1 ='cd {}'.format(fopMetadata+strAuthor+'/')
                    commandCommitMessage2='git clone --no-checkout https://github.com/{}/{}/'.format(strAuthor,strRepo)
                    commandCommitMessage3 = 'cd {}'.format(fopLocalMeta)
                    commandCommitMessage4 = 'git log >{}'.format(fpLogMessage)
                    commandCommitMessage5 = 'cd {}'.format(fopCommitMessage)

                    lstCommandDownloads.append(commandDownload)
                    lstCommandExtract.append(commandExtract)
                    lstCommandCommitMessage.append(commandFirstRemove)
                    lstCommandCommitMessage.append(commandCommitMessage1)
                    lstCommandCommitMessage.append(commandCommitMessage2)
                    lstCommandCommitMessage.append(commandCommitMessage3)
                    lstCommandCommitMessage.append(commandCommitMessage4)
                    lstCommandCommitMessage.append(commandCommitMessage5)
            except:
                traceback.print_exc()
        f1=open(fpLogCommandDownload,'a')
        f1.write('\n'.join(lstCommandDownloads)+'\n')
        f1.close()
        f1 = open(fpLogCommandExtract, 'a')
        f1.write('\n'.join(lstCommandExtract) + '\n')
        f1.close()
        f1 = open(fpLogCommandCommit, 'a')
        f1.write('\n'.join(lstCommandCommitMessage) + '\n')
        f1.close()

    except:
        traceback.print_exc()
strTabChar = ' tabChar '
strEndLineChar = '_EL_'
strEndFileChar = '_EF_'
strSplitIndent = ' IndentSplit '
strSplitJson = ' JsonSplit '
strSplitCharacterForNodeEdge = '_ABAZ_'

fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopStep3V2 = fopRoot + 'step3_v2/'
fopStep5V2 = fopRoot + 'step5_v2_HGT/'
fopStep3TreesitterTokenize = fopRoot + 'step3_treesitter_tokenize/'
fopStep2Tokenize = fopRoot + 'step2_tokenize/'
fopStep2PseudoTokenize = fopRoot + 'step2_pseudo_tokenize/'
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
fopGraphEntityInfo = fopStep3V2 + 'graphEntityInfo/'
createDirIfNotExist(fopStep5V2)

f1 = open(fpDictLiterals, 'r')
arrLits = f1.read().strip().split('\n')
f1.close()
dictLiteralsToValues = {}
dictValuesToLiterals = {}
for item in arrLits:
    arrTabs = item.split('\t')
    if len(arrTabs) >= 2:
        strContent = '\t'.join(arrTabs[1:])
        dictLiteralsToValues[arrTabs[0]] = strContent
        dictValuesToLiterals[strContent] = arrTabs[0]
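
# Sketch of the assumed step2_dictLiterals_all.txt format: one mapping per line,
# tab-separated, placeholder token first and the original literal after (values may
# themselves contain tabs, hence the '\t'.join above):
_exampleLine = 'LIT_STR_1\t"hello world"'
_exampleParts = _exampleLine.split('\t')
assert '\t'.join(_exampleParts[1:]) == '"hello world"'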

# sorted(glob.glob(fopMixVersion+'**/**/a_json.txt'))
distanceHeader = 33  # offset between pseudocode line i and its code line (header lines prepended to each .cpp)
fopDataFolder='/Users/hungphan/git/dataPapers/'
fopInputSPOCData=fopDataFolder+'/SPOCDataset/spoc/'
fopTrainSPOCNested= fopDataFolder + '/trainSPOCNest/'
fopTrainSPOCPlain= fopDataFolder + '/trainSPOCPlain/'
fpTrainData=fopInputSPOCData+'train/spoc-train.tsv'

fopTestPSPOCNested= fopDataFolder + '/testPSPOCNest/'
fopTestPSPOCPlain= fopDataFolder + '/testPSPOCPlain/'
fpTestPData=fopInputSPOCData+'test/spoc-testp.tsv'

fopTestWSPOCNested= fopDataFolder + '/testWSPOCNest/'
fopTestWSPOCPlain= fopDataFolder + '/testWSPOCPlain/'
fpTestWData=fopInputSPOCData+'test/spoc-testw.tsv'


createDirIfNotExist(fopTrainSPOCNested)
createDirIfNotExist(fopTrainSPOCPlain)
createDirIfNotExist(fopTestPSPOCNested)
createDirIfNotExist(fopTestPSPOCPlain)
createDirIfNotExist(fopTestWSPOCNested)
createDirIfNotExist(fopTestWSPOCPlain)

def getPseudoCodeAndCode(df):
    lstText=[]
    lstCode=[]
    lstLine=[]
    for row_index, row in df.iterrows():
        strCode=str(row['code'])
        strText = str(row['text'])
        lstText.append(strText)
        indent=int(row['indent'])
        # assumed completion: indent each code line by its indent level and keep the line id
        lstCode.append('\t' * indent + strCode)
        lstLine.append(str(row['line']))
    return '\n'.join(lstText), '\n'.join(lstCode), '\n'.join(lstLine)

strRegexCamelCases = r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))'

strStmtSplit = ' StmtSplit '
strTabChar = ' tabChar '
strEndLineChar = ' endLineChar '
strSplitIndent = ' IndentSplit '
strSplitJson = ' JsonSplit '
strParseResultsType = "<class 'pyparsing.ParseResults'>"
strStrType = "<class 'str'>"

fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fpPseudocodeAfterPOS = fopRoot + 'pseudocode_after_pos_v1.txt'
fopGraph = fopRoot + 'graphPOS/'
createDirIfNotExist(fopGraph)
f1 = open(fpPseudocodeAfterPOS, 'r')
arrAfterLines = f1.read().split('\n')
f1.close()

for i in range(0, len(arrAfterLines)):
    itemStr = arrAfterLines[i]
    arrItTabs = itemStr.split('\t')
    if len(arrItTabs) >= 2:
        strJson = arrItTabs[1]
        jsonObj = ast.literal_eval(strJson)
        fpGraphDot = fopGraph + str((i + 1)) + '.dot'
        fpGraphPng = fopGraph + str((i + 1)) + '.png'
        graphAll = pgv.AGraph(directed=True)
        getGraphFromJsonPOS(jsonObj, graphAll)
        graphAll.write(fpGraphDot)
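        # Optional rendering of the graph to PNG with pygraphviz (same approach as the
        # commented-out drawing code elsewhere in this project; needs Graphviz installed):
        # graphAll.layout(prog='dot')
        # graphAll.draw(fpGraphPng)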
        dictWords = {}
        dictTotal = walkAndGetPOSJson(data, indexSentence, lstNonTerminals,
                                      lstTerminals)
    except:
        strJsonObj = '{}'
        dictTotal = {}
        traceback.print_exc()
    return dictTotal


fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopMixVersion = fopRoot + 'step4_mixCode/'
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
fpPseudocodeBeforePOS = fopRoot + 'pseudocode_before_pos.txt'
fpCachedFilePath = fopRoot + 'cachedFilePaths.txt'
createDirIfNotExist(fopMixVersion)

model_dir = find('models/bllip_wsj_no_aux').path
parser = RerankingParser.from_unified_model_dir(model_dir)
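
# A quick sanity check for the BLLIP reranking parser loaded above; simple_parse
# returns the top-ranked Penn Treebank style bracketing for one sentence
# (bllipparser API; the sentence is a hypothetical example):
# print(parser.simple_parse('set x to the sum of a and b'))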

print('before traverse')
lstFpJsonFiles = []
if not os.path.isfile(fpCachedFilePath):
    lstFop1 = sorted(glob.glob(fopMixVersion + '*/'))
    for fop1 in lstFop1:
        lstFop2 = sorted(glob.glob(fop1 + '*/'))
        for fop2 in lstFop2:
            lstFop3 = sorted(glob.glob(fop2 + 'v_*_label.txt'))
            # print(fp3)
            for fp3 in lstFop3:
                lstFpJsonFiles.append(fp3)

fopRoot='../../../../dataPapers/textInSPOC/correctCodeRaw/'
fopCodeFile=fopRoot+'step2_tokenize/'
fopPseudoFile=fopRoot+'step2_pseudo_tokenize/'
fopTreeSitterFile=fopRoot+'step3_treesitter_tokenize/'
fopLabels=fopRoot+'step2_pseudo_tokenize/'
fopTreeSitterFile=fopRoot+'step3_treesitter_tokenize/'
fopMixVersion=fopRoot+'step4_mixCode/'
fopAllocateByMainStmt=fopRoot+'step4_mainStmt/'
fopAllocateByNumOfStmts=fopRoot+'step4_numOfStatements/'
fpDictLiterals=fopRoot+'step2_dictLiterals_all.txt'
createDirIfNotExist(fopMixVersion)
createDirIfNotExist(fopAllocateByMainStmt)
createDirIfNotExist(fopAllocateByNumOfStmts)

f1=open(fpDictLiterals,'r')
arrLits=f1.read().strip().split('\n')
f1.close()
dictLiterals={}
for item in arrLits:
    arrTabs=item.split('\t')
    if len(arrTabs)>=2:
        strContent='\t'.join(arrTabs[1:])
        dictLiterals[arrTabs[0]]=strContent

# print('len dict {}'.format(len(dictLiterals.keys())))
# input('abc ')
    model.eval()
    #inference
    import spacy
    nlp = spacy.load('en_core_web_sm')
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    # acc_prob1=test_acc
    print('test loss and test acc \n{}\t{}'.format(test_loss, test_acc))
    return test_acc


fopRoot = '../../../../../media/dataPapersExternal/mixCodeRaw/'
fpInputText = fopRoot + 'embeddingModels/d2v/paragraph_text.txt'
fopOutputML = fopRoot + 'resultMLs/rnn-lstm-small/'
fpResultDetails = fopOutputML + 'result_details.txt'
fpDoc2VecModel = fopRoot + 'embeddingModels/d2v/d2v.model.txt'
createDirIfNotExist(fopOutputML)

sys.stdout = open(fpResultDetails, 'w')

# fpTrain = fopRoot + 'train.csv'
# fpTextTrain = fopRoot + 'train.text.txt'
fpLabelP1Train = fopOutputML + 'train.label.p1.txt'
fpLabelP2Train = fopOutputML + 'train.label.p2.txt'
fpLabelP3Train = fopOutputML + 'train.label.p3.txt'

# fpValid = fopRoot + 'valid.csv'
# fpTextValid = fopRoot + 'testP.text.txt'
fpLabelP1Valid = fopOutputML + 'testP.label.p1.txt'
fpLabelP2Valid = fopOutputML + 'testP.label.p2.txt'
fpLabelP3Valid = fopOutputML + 'testP.label.p3.txt'
def extractPOSAndTree(fopTextCorpus, fopPOSCorpus, item):
    try:
        fopItemTextCorpus = fopTextCorpus + item + '/'
        # fopItemTreeCorpus = fopTextCorpus + item + '/'
        fopItemPOSCorpus = fopPOSCorpus + item + '/'
        # createDirIfNotExist(fopItemTreeCorpus)
        createDirIfNotExist(fopItemPOSCorpus)
        lstTextFiles = sorted(glob.glob(fopItemTextCorpus + '/*.txt'))
        lstPOSPerFile = []
        lstTreePerFile = []
        lstProcessTextPerFile = []

        # print('len {}'.format(len(lstTextFiles)))
        numWordProcess = 0
        totalTimeProcess = 0
        for j in range(0, len(lstTextFiles)):
            try:
                if j == 1:
                    # debug limiter: only the first text file is processed
                    break
                # start_time = time.time()
                f1 = open(lstTextFiles[j], 'r')
                arrTexts = f1.read().split('\n')
                f1.close()
                indexName = os.path.basename(lstTextFiles[j]).replace(
                    '.txt', '')
                fpPOS = fopItemPOSCorpus + indexName + '_pos.txt'
                # fpTree = fopItemPOSCorpus + indexName + '_tree.txt'
                fpTextPreprocess = fopItemPOSCorpus + indexName + '_preprocess.txt'
                # print('go here')
                f1 = open(fpPOS, 'w')
                f1.write('')
                f1.close()
                # f1 = open(fpTree, 'w')
                # f1.write('')
                # f1.close()
                f1 = open(fpTextPreprocess, 'w')
                f1.write('')
                f1.close()

                for i in range(0, len(arrTexts)):
                    try:
                        # restore newlines/tabs from their placeholder tokens ('\m' in the
                        # original was a typo for '\n') and strip comment markers
                        strItem = arrTexts[i].replace(
                            strEndLine, '\n').replace(strTabChar,
                                                      '\t').replace('// ', '')
                        start_time = time.time()
                        strPostText, strPOS = getPOSAndTreeFromText(strItem)
                        numWordItem = len(strItem.split())
                        numWordProcess = numWordProcess + numWordItem
                        end_time = time.time()
                        totalTimeProcess = totalTimeProcess + (end_time -
                                                               start_time)
                        if strPostText != '':
                            lstProcessTextPerFile.append(strPostText)
                            # lstTreePerFile.append(strTree)
                            lstPOSPerFile.append(strPOS)

                        if ((len(lstProcessTextPerFile) > 0)
                                and (((i + 1) == len(arrTexts)) or
                                     ((i + 1) % 1000 == 0))):
                            f1 = open(fpPOS, 'a')
                            f1.write('\n'.join(lstPOSPerFile) + '\n')
                            f1.close()
                            # f1 = open(fpTree, 'a')
                            # f1.write('\n'.join(lstTreePerFile)+'\n')
                            # f1.close()
                            f1 = open(fpTextPreprocess, 'a')
                            f1.write('\n'.join(lstProcessTextPerFile) + '\n')
                            f1.close()
                            lstPOSPerFile = []
                            lstTreePerFile = []
                            lstProcessTextPerFile = []
                            print('finish write at index {} of file {}'.format(
                                i, lstTextFiles[j]))

                        if i == 100:
                            # debug limiter: stop after the first 100 lines of a file
                            break
                    except:
                        traceback.print_exc()
                # duration=time.time() - start_time
                duration = 0
                # print('index {}/{} duration {}'.format(i,len(lstTextFiles),duration))
            except:
                traceback.print_exc()
        # guard against division by zero when no words were processed
        avgTimePerWords = (totalTimeProcess * 1.0) / numWordProcess if numWordProcess > 0 else 0.0
        print('total time and words and avg\t{}\t{}\t{}'.format(
            numWordProcess, totalTimeProcess, avgTimePerWords))
    except:
        traceback.print_exc()
import glob
import sys, os
import operator
import clang.cindex

sys.path.append(os.path.abspath(os.path.join('../..')))
from UtilFunctions import createDirIfNotExist, getPOSInfo, writeDictToFileText, runASTGenAndSeeResult

fopData = '../../../dataPapers/'
fopTextInSPoC = fopData + 'textInSPOC/'
fopStatisticFolderTestP = fopData + 'textInSPOC/testPOnlyC/'
fpCombineASTsTestP = fopTextInSPoC + 'combineASTs_TestP.txt'
fopStatisticFolderTestW = fopData + 'textInSPOC/testWOnlyC/'
fopASTForCTestW = fopData + 'textInSPOC/testWASTForC/'
fopASTForCTestP = fopData + 'textInSPOC/testPASTForC/'
fpCombineASTsTestW = fopTextInSPoC + 'combineASTs_TestW.txt'

createDirIfNotExist(fopASTForCTestP)
createDirIfNotExist(fopASTForCTestW)

numOmit = 30
lstFiles = sorted(glob.glob(fopStatisticFolderTestP + "*_code.cpp"))
dictCountWords = {}
f1 = open(fpCombineASTsTestP, 'w')
f1.write('')
f1.close()
'''
for index in range(0,len(lstFiles)):
    fpCodeFileCPP=lstFiles[index]
    fineName=os.path.basename(fpCodeFileCPP)
    fpASTItem = fopASTForCTestP + fineName.replace('_code.txt', '_ast.txt')
    # fFile=open(fpCodeFileCPP, 'r')
    # strContentOfFile=fFile.read().strip()
'''


batch_size = 100000
strEndLine = ' ENDLINE '
strTabChar = ' TABCHAR '
strSingleComment = ' SINGLECOMMENTCHAR '

# fopCCContent='/home/hungphd/media/dataPapersExternal/apiCallPapers_v1/AlonCommentExtraction/'
fopTextCorpus = '/home/hungphd/media/dataPapersExternal/textCorpus/'
# fopTextPostProcessCorpus='/home/hungphd/media/dataPapersExternal/textPostProcessCorpus/'
fopPOSCorpus = '/home/hungphd/media/dataPapersExternal/posCorpus_small/'
# fopParseTreeCorpus='/home/hungphd/media/dataPapersExternal/treeCorpus/'
createDirIfNotExist(fopPOSCorpus)
lstTypesOfSDs = ['ad', 'cs', 'cc', 'cm', 'qa', 'sr']

fopStanfordParser = '/home/hungphd/git/dataPapers/stanford-parser-4.2.0'
os.environ['STANFORD_PARSER'] = fopStanfordParser
os.environ['STANFORD_MODELS'] = fopStanfordParser

from pycorenlp import StanfordCoreNLP

# for item in lstTypesOfSDs:

from multiprocessing.pool import ThreadPool as Pool

# from multiprocessing import Pool
from multiprocessing import Process
lstThreads = []
      # jsonPOS=walkAndGetPOSJson(data,indexSentence,lstNonTerminals,lstTerminals)
      # dictTotal['children'].append(jsonPOS)
    strOutput=strSplitParsedTree.join(lstOutput)
  except:
    # strJsonObj = 'Error'
    # dictTotal=None
    traceback.print_exc()
  return strOutput


fopRoot='/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopPseudoTokens=fopRoot+'step2_pseudo_tokenize/'
fopEstimateTime=fopRoot+'estimate_time_pos/'
fopPseudocodePOS=fopRoot+'step2_pseudocode_pos/'
fpCachedFilePath=fopRoot+'cachedFilePaths_pseudo_tokenizes.txt'
createDirIfNotExist(fopEstimateTime)
fpEstimate=fopEstimateTime+'pos_estimation.txt'
model_dir = find('models/bllip_wsj_no_aux').path
parser = RerankingParser.from_unified_model_dir(model_dir)

from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')
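
# Minimal pycorenlp usage for the connection above. annotate posts the text to a
# CoreNLP server that must already be listening on localhost:9000 and returns JSON
# (the sentence and annotator list here are illustrative):
# posResult = nlp.annotate('read n numbers into a list',
#                          properties={'annotators': 'tokenize,ssplit,pos',
#                                      'outputFormat': 'json'})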

lstFpJsonFiles=[]
if not os.path.isfile(fpCachedFilePath):
    lstFop1=sorted(glob.glob(fopPseudoTokens+'*/'))
    for fop1 in lstFop1:
        lstFp2=sorted(glob.glob(fop1+'*.txt'))
        for fp2 in lstFp2:
            lstFpJsonFiles.append(fp2)
        print('end {}'.format(fop1))
def logFileLocationForEachJavaProjects(fopItemAlonCorpus, fopItemJsonData,
                                       fpLogTotalProject, parser,
                                       isCollectFromStart):
    createDirIfNotExist(fopItemAlonCorpus)
    createDirIfNotExist(fopItemJsonData)
    lstProjectNames = glob.glob(fopItemAlonCorpus + '*/')
    dictFilesPerProject = {}
    dictAlreadyDownloadProject = {}
    if isCollectFromStart or not (os.path.exists(fpLogTotalProject)):
        f1 = open(fpLogTotalProject, 'w')
        f1.write('')
        f1.close()
    else:
        f1 = open(fpLogTotalProject, 'r')
        arrAlready = f1.read().strip().split('\n')
        f1.close()
        for item in arrAlready:
            arrTabs = item.split('\t')
            if len(arrTabs) >= 4:
                dictAlreadyDownloadProject[arrTabs[0]] = 1
    for i in range(0, len(lstProjectNames)):
        try:
            fopProjectItem = lstProjectNames[i]
            arrFopItem = fopProjectItem.split('/')
            projectFolderName = arrFopItem[len(arrFopItem) - 2]
            if projectFolderName in dictAlreadyDownloadProject.keys():
                continue
            # print('folder name {}'.format(projectFolderName))
            fpItemDataAPICalls = fopItemJsonData + projectFolderName + '.txt'
            lstJavaFiles = glob.glob(fopProjectItem + '/**/*.java',
                                     recursive=True)
            # print('{} len {}'.format(fopProjectItem,len(lstJavaFiles)))
            dictFilesPerProject[projectFolderName] = len(lstJavaFiles)
            dictJavaFiles = {}
            for j in range(0, len(lstJavaFiles)):
                fpItemJava = lstJavaFiles[j]
                key = str(j + 1)
                dictJavaFiles[key] = fpItemJava
            lstWriteToFiles = []
            for key in dictJavaFiles.keys():
                lstWriteToFiles.append('{}\t{}'.format(key,
                                                       dictJavaFiles[key]))
            f1 = open(fpItemDataAPICalls, 'w')
            f1.write('\n'.join(lstWriteToFiles))
            f1.close()

            print('{} Prepare get ast project {} with {} files'.format(
                (i + 1), fopProjectItem, len(lstJavaFiles)))
            fopASTItemFather = fopItemJsonData + projectFolderName + '/'
            createDirIfNotExist(fopASTItemFather)
            numProjectRunOK = 0
            for key in dictJavaFiles.keys():
                try:
                    fpItemJavaFiles = dictJavaFiles[key]
                    jsonObject = getJsonDict(fpItemJavaFiles, parser)
                    if (jsonObject is not None):
                        fpJson = fopASTItemFather + str(key) + '_ast.txt'
                        f1 = open(fpJson, 'w')
                        f1.write(str(jsonObject))
                        f1.close()
                        numProjectRunOK = numProjectRunOK + 1
                except:
                    traceback.print_exc()
            print('{} End get ast project {} with {} files'.format(
                (i + 1), fopProjectItem, len(lstJavaFiles)))
            f1 = open(fpLogTotalProject, 'a')
            f1.write('{}\t{}\t{}\t{}\n'.format(
                projectFolderName, numProjectRunOK, len(dictJavaFiles.keys()),
                dictFilesPerProject[projectFolderName]))
            f1.close()
        except:
            traceback.print_exc()
import glob
import sys, os
import operator

sys.path.append(os.path.abspath(os.path.join('../..')))
from UtilFunctions import createDirIfNotExist, getPOSInfo, writeDictToFileText

fopData = '../../../dataPapers/'
fopTextInSPoC = fopData + 'textInSPOC/'
fopTestPFolder = fopTextInSPoC + 'testPSPOCPlain/'
fopTestPFolderOnlyC = fopTextInSPoC + 'testPOnlyC/'
lstFiles = sorted(glob.glob(fopTestPFolder + "*_code.txt"))
createDirIfNotExist(fopTestPFolderOnlyC)
for i in range(0, len(lstFiles)):
    f1 = open(lstFiles[i], 'r')
    fileName = os.path.basename(lstFiles[i])
    newFileName = fileName.replace('.txt', '.cpp')
    fpOutpurItem = fopTestPFolderOnlyC + newFileName
    strContent = f1.read()
    f1.close()
    lstHeaders = [
        '#include <iostream>', '#include<stdio.h>', '#include <string.h>',
        '#include<stdlib.h>', '#include<math.h>', '#include<time.h>',
        '#include<ctype.h>', '#include<assert.h>', '#include<locale.h>',
        '#include<signal.h>', '#include<setjmp.h>', '#include<stdarg.h>',
        '#include<errno.h>', '#include<iomanip>', '#include <algorithm>',
        '#include <vector>', '#include <limits.h>', '#include <map>',
        '#include <set>', '#include <list>', '#include <stack>',
        '#include <queue>', '#include <array>', '#include <unordered_map>',
        '#include <unordered_set>', '#include <deque>',
        '#include <forward_list>'
    ]
    # assumed completion: prepend the standard headers and write the snippet out as a .cpp file
    f1 = open(fpOutpurItem, 'w')
    f1.write('\n'.join(lstHeaders) + '\n' + strContent)
    f1.close()
                dictAcceptableIdsForVersions[numContext].append(ancestorId)

    if 'children' in jsonAll.keys() and 'isNLRootNode' not in jsonAll.keys():
        lstChildren = jsonAll['children']
        for i in range(0, len(lstChildren)):
            itemChild = lstChildren[i]
            findAddableIdsForContext(itemChild, dictOfFatherIdMainAST,
                                     dictAcceptableIdsForVersions,
                                     mixStartLine, mixEndLine,
                                     dictAncestorToRoot)


fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopMixVersion = fopRoot + 'step4_mixCode/'
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
createDirIfNotExist(fopMixVersion)

f1 = open(fpDictLiterals, 'r')
arrLits = f1.read().strip().split('\n')
f1.close()
dictLiterals = {}
for item in arrLits:
    arrTabs = item.split('\t')
    if len(arrTabs) >= 2:
        strContent = '\t'.join(arrTabs[1:])
        dictLiterals[arrTabs[0]] = strContent

# print('len dict {}'.format(len(dictLiterals.keys())))
# input('abc ')

print('before traverse')
fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopStep3V2 = fopRoot + 'step3_v2/'
fopStep5V2 = fopRoot + 'step5_v2_HGT/'
fopStep5EmbeddingModels = fopRoot + 'embeddingModels/'
fopStep5TextInfo = fopStep5EmbeddingModels + 'textInfo/'
fopFastTextModel = fopStep5EmbeddingModels + 'glove/'
fpFastTextModelBin = fopFastTextModel + 'model.bin'
fpFastTextModelText = fopFastTextModel + 'model.text.txt'
fopStep3TreesitterTokenize = fopRoot + 'step3_treesitter_tokenize/'
fopStep2CodeReplaceDict = fopRoot + 'step2_code_replaceDict/'
fopStep2PseudoTokenize = fopRoot + 'step2_pseudo_tokenize/'
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
fopGraphEntityInfo = fopStep3V2 + 'graphEntityInfo/'
fopPOSNLTK = fopRoot + 'step2_afterTranslation/sortBySimilarityScore/pos_nltk/'
fopPOSStanford = fopRoot + 'step2_afterTranslation/sortBySimilarityScore/pos_stanford/'
createDirIfNotExist(fopStep5V2)
createDirIfNotExist(fopStep5EmbeddingModels)
createDirIfNotExist(fopStep5TextInfo)
createDirIfNotExist(fopFastTextModel)

lstFpTextFiles = glob.glob(fopStep5TextInfo + '*.textTrainEmb.txt')

lstStrTextInfo = []
for fpItem in lstFpTextFiles:
    f1 = open(fpItem, 'r')
    arrTextItem = f1.read().strip().split('\n')
    f1.close()
    for j in range(0, len(arrTextItem)):
        arrTextItem[j] = arrTextItem[j].replace(',', ' COMMA ')
    lstStrTextInfo = lstStrTextInfo + list(arrTextItem)
def getAllLeaveNodes(g):
    leafs = [
        x for x in g.nodes() if g.out_degree(x) == 0 and g.in_degree(x) == 1
    ]
    return leafs
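
# getAllLeaveNodes expects a networkx-style directed graph: a leaf is a node with no
# outgoing edges and exactly one incoming edge. A small self-contained check
# (networkx is an assumption consistent with nodes()/out_degree/in_degree above):
import networkx as nx

_gDemo = nx.DiGraph()
_gDemo.add_edges_from([('root', 'a'), ('root', 'b'), ('a', 'c')])
assert set(getAllLeaveNodes(_gDemo)) == {'b', 'c'}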


fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopStep4NMT = fopRoot + 'step4_NMT/'
fopStep5V2HGT = fopRoot + 'step5_v2_HGT/'
fopStep3V2 = fopRoot + 'step3_v2/'
fopStep3TreesitterTokenize = fopRoot + 'step3_treesitter_tokenize/'
fopStep2Tokenize = fopRoot + 'step2_tokenize/'
fopStep2PseudoTokenize = fopRoot + 'step2_pseudo_tokenize/'
fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
createDirIfNotExist(fopStep4NMT)

f1 = open(fpDictLiterals, 'r')
arrLits = f1.read().strip().split('\n')
f1.close()
dictLiterals = {}
dictLiteralsReverse = {}
for item in arrLits:
    arrTabs = item.split('\t')
    if len(arrTabs) >= 2:
        strContent = '\t'.join(arrTabs[1:])
        dictLiterals[arrTabs[0]] = strContent
        dictLiteralsReverse[strContent] = arrTabs[0]

# sorted(glob.glob(fopMixVersion+'**/**/a_json.txt'))
distanceHeader = 33  # offset between pseudocode line i and its code line (header lines prepended to each .cpp)
from langdetect import detect
from sklearn.metrics import confusion_matrix
import langid



fopRoot='/home/hungphd/git/dataPapers/textInSPOC/mixCode_v2/step6/'
fpTrainText=fopRoot+'train.text.txt'
fpTestPText=fopRoot+'testP.text.txt'
fpTestWText=fopRoot+'testW.text.txt'
fpTrainLabel=fopRoot+'train.label.txt'
fpTestPLabel=fopRoot+'testP.label.txt'
fpTestWLabel=fopRoot+'testW.label.txt'

fopD2VRF=fopRoot+'result-d2v-rf/'
createDirIfNotExist(fopD2VRF)
fpOutModel=fopD2VRF+'model.d2v'
fpOutResultDetail=fopD2VRF+'resultDetail.txt'
fpOutResultSummary=fopD2VRF+'resultSummary.txt'

X_Train = []
key_Train = []
X_TestP = []
key_TestP = []
X_TestW = []
key_TestW = []
y_Train=[]
y_TestP=[]
y_TestW=[]
lstAllText = []


fopDataPapers = '../../../../dataPapers/'
fopAlonCorpus = fopDataPapers + 'java-large/'
fopDataAPICalls = fopDataPapers + 'apiCallPapers/'
fopJsonData = fopDataPapers + 'apiCallPapers/AlonJsonData/'
# fopFilesPerProjectData=fopDataPapers+'apiCallPapers/AlonFilesPerProject/'

fopDataRoot = '/home/hungphd/'
fopGithub = '/home/hungphd/git/'
fopBuildFolder = fopDataRoot + 'build-tree-sitter/'
fpLanguageSo = fopBuildFolder + 'my-languages.so'

createDirIfNotExist(fopJsonData)
# createDirIfNotExist(fopFilesPerProjectData)
lstFopAlonCorpus = []
lstFopJsonData = []
lstFpLogAPICalls = []
lstFopFilesPerProjectData = []

lstFolderNames = ['training', 'test', 'validation']

JAVA_LANGUAGE = Language(fpLanguageSo, 'java')
parser = Parser()
parser.set_language(JAVA_LANGUAGE)
isCollectFromStart = False
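
# The my-languages.so bundle loaded above has to be built once beforehand; with the
# py-tree-sitter API that is a one-liner (the grammar checkout path is an assumption):
# Language.build_library(fpLanguageSo, [fopGithub + 'tree-sitter-java'])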

for i in range(0, len(lstFolderNames)):
    folderName = lstFolderNames[i]
    # assumed completion: collect the per-split corpus and json folders declared above
    lstFopAlonCorpus.append(fopAlonCorpus + folderName + '/')
    lstFopJsonData.append(fopJsonData + folderName + '/')
def generateMixVersionsAndLabels(jsonObject,dictLinesAndElements,dictLabelStatistics,dictLiterals,arrCodes,arrPseudos,fopCodeVersion,fonameItemAST,idCode,fopAllocateByMainStmt,fopAllocateByNumOfStmts):
    try:
        isOK=False
        indexVersion=0
        fpItemPseudo=fopCodeVersion+'_a_pseudo.txt'
        fpItemCode=fopCodeVersion+'_a_code.cpp'
        fpItemGraphText = fopCodeVersion + '_a_graph.dot'
        fpItemGraphPng = fopCodeVersion + '_a_graph.png'
        f1=open(fpItemPseudo,'w')
        f1.write('\n'.join(arrPseudos))
        f1.close()
        f1=open(fpItemCode,'w')
        f1.write('\n'.join(arrCodes))
        f1.close()
        if 'test' in fopCodeVersion:
            graph = pgv.AGraph(directed=True)
            isBlueColor = True
            idChange = '-1'
            generateGraph(jsonObject,'',arrCodes, idChange, isBlueColor, graph)
            graph.write(fpItemGraphText)
            # graph.layout()
            # graph.draw(fpItemGraphPng)
            # (graphPydot,) = pydot.graph_from_dot_file(fpItemGraphText)
            # graphPydot.write_png(fpItemGraphPng)
            # check_call(['dot', '-Tpng', fpItemGraphText, '-o', fpItemGraphPng])
            # render('dot', 'png', fpItemGraphText)

        for keyItem in dictLinesAndElements.keys():
            valItem=dictLinesAndElements[keyItem]
            indexVersion=indexVersion+1
            fnIndexVersion='v_{}'.format(indexVersion)
            fpItemVersionCode=fopCodeVersion+fnIndexVersion+'_mix.cpp'
            fpItemVersionLabel = fopCodeVersion + fnIndexVersion + '_label.txt'
            mainStmt = valItem['mainStmt']
            startLineMainStmt=mainStmt['startLine']
            endLineMainStmt = mainStmt['endLine']

            lstPseudoLines=range(startLineMainStmt-distanceHeader,endLineMainStmt-distanceHeader+1)
            lstStrPseudoLines=[]
            lstStrCodeLines = []
            for it in lstPseudoLines:
                lstStrPseudoLines.append(arrPseudos[it])
                lstStrCodeLines.append(arrCodes[it+distanceHeader])

            strTotalComment='// '+' , then '.join(lstStrPseudoLines)
            strTotalCode=' '.join(lstStrCodeLines)
            strCodeSplit,strCodeTokSplit,strPseudoSplit,strTokSplit,numAppear,numDisappear,percent=checkAppearInImplementation(dictLiterals,strTotalComment,strTotalCode)

            lstMixCodes = []
            for i in range(0,len(arrCodes)):
                itemLine=arrCodes[i]
                numTabs=0
                strLineAdd=arrCodes[i]

                if (i-distanceHeader) in lstPseudoLines:
                    lstStrs = []
                    # count leading tabs safely: bounds-check before indexing the line
                    while numTabs < len(itemLine) and itemLine[numTabs]=='\t':
                        numTabs=numTabs+1
                        lstStrs.append('\t')
                    if i == startLineMainStmt:
                        strLineAdd='{}{}'.format(''.join(lstStrs),strTotalComment)
                    else:
                        strLineAdd = '{}{}'.format(''.join(lstStrs), '//')
                lstMixCodes.append(strLineAdd)
            f1=open(fpItemVersionCode,'w')
            f1.write('\n'.join(lstMixCodes))
            f1.close()

            strMainStatement=mainStmt['type']
            strPosition='{}-{}-{}-{}'.format(mainStmt['startLine'],mainStmt['startOffset'],mainStmt['endLine'],mainStmt['endOffset'])
            numOfLine=endLineMainStmt-startLineMainStmt+1
            numOfStatements=0
            lstListOfBelowStatements=[]
            strDictToString=''
            if mainStmt['singleRoot']:
                numOfStatements=1
                lstListOfBelowStatements.append(mainStmt['type'])
                if 'children' in mainStmt.keys():
                    for child in mainStmt['children']:
                        if 'isReplaceable' in child.keys() and child['isReplaceable']:
                            strStmtChild=child['type']
                            numOfStatements=numOfStatements+1
                            lstListOfBelowStatements.append(strStmtChild)
                strDictToString=str(mainStmt)
            else:
                indentDeepest = sorted(dictLinesAndElements[keyItem]['sortedElementsByIndents'].keys())[0]
                lstPossibleStatements = dictLinesAndElements[keyItem]['sortedElementsByIndents'][indentDeepest]
                numOfStatements=len(lstPossibleStatements)
                for stmt in lstPossibleStatements:
                    lstListOfBelowStatements.append(stmt['type'])
                strDictToString =strSplitJson.join(map(str,lstPossibleStatements))
            strBelowStatements=strStmtSplit.join(lstListOfBelowStatements)

            if strMainStatement not in dictLabelStatistics['mainStmt'].keys():
                dictLabelStatistics['mainStmt'][strMainStatement]=1
            else:
                dictLabelStatistics['mainStmt'][strMainStatement] =dictLabelStatistics['mainStmt'][strMainStatement] + 1

            if numOfStatements not in dictLabelStatistics['numOfStatements'].keys():
                dictLabelStatistics['numOfStatements'][numOfStatements]=1
            else:
                dictLabelStatistics['numOfStatements'][numOfStatements] =dictLabelStatistics['numOfStatements'][numOfStatements] + 1


            f1=open(fpItemVersionLabel,'w')
            strLbl='{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\t{}\t{}'.format(strMainStatement,strPosition,numOfLine,numOfStatements,strBelowStatements,strDictToString, strCodeSplit,strCodeTokSplit,strPseudoSplit,strTokSplit,numAppear,numDisappear,percent)
            f1.write(strLbl)
            f1.close()


            if 'test' in fopCodeVersion:
                graph = pgv.AGraph(directed=True)
                isBlueColor = True
                idChange = str(mainStmt['id'])
                generateGraph(jsonObject,'',arrCodes, idChange, isBlueColor, graph)
                graph.write(fopCodeVersion+fnIndexVersion+'_graph.dot')
                # graph.layout(prog='dot')
                # graph.draw(fopCodeVersion+fnIndexVersion+'_graph.png')


            fopItemAllMainStmt=fopAllocateByMainStmt+strMainStatement+'/'+fonameItemAST+'/'+idCode+'/'
            createDirIfNotExist(fopItemAllMainStmt)
            fopItemAllNumOfStmts = fopAllocateByNumOfStmts + str(numOfStatements) + '/' + fonameItemAST + '/' + idCode + '/'
            createDirIfNotExist(fopItemAllNumOfStmts)
            # if os.path.isdir(fopItemAllMainStmt):
            #     shutil.rmtree(fopItemAllMainStmt)
            # if os.path.isdir(fopItemAllNumOfStmts):
            #     shutil.rmtree(fopItemAllNumOfStmts)
            shutil.copy(fopCodeVersion+fnIndexVersion+'_mix.cpp', fopItemAllMainStmt+fnIndexVersion+'_mix.cpp')
            shutil.copy(fopCodeVersion+fnIndexVersion+ '_label.txt', fopItemAllMainStmt+fnIndexVersion+ '_label.txt')
            shutil.copy(fopCodeVersion+fnIndexVersion+'_mix.cpp', fopItemAllNumOfStmts+fnIndexVersion+'_mix.cpp')
            shutil.copy(fopCodeVersion + fnIndexVersion + '_label.txt',fopItemAllNumOfStmts + fnIndexVersion + '_label.txt')
            if 'test' in fopCodeVersion:
                shutil.copy(fopCodeVersion + fnIndexVersion + '_graph.dot',
                            fopItemAllNumOfStmts + fnIndexVersion + '_graph.dot')
            #     shutil.copy(fopCodeVersion + fnIndexVersion + '_graph.png',
            #                 fopItemAllNumOfStmts + fnIndexVersion + '_graph.png')
    except:
        traceback.print_exc()
import glob
import sys, os
import operator
import clang.cindex

sys.path.append(os.path.abspath(os.path.join('../..')))
from UtilFunctions import createDirIfNotExist, getPOSInfo, writeDictToFileText, runASTGenAndSeeResult

fopData = '../../../dataPapers/'
fopTextInSPoC = fopData + 'textInSPOC/'
fopStatisticFolder = fopData + 'textInSPOC/trainOnlyC/'
fopASTForC = fopData + 'textInSPOC/trainASTForC/'
fpCombineASTs = fopTextInSPoC + 'combineASTs.txt'

lstFiles = sorted(glob.glob(fopStatisticFolder + "*_code.cpp"))

createDirIfNotExist(fopASTForC)

dictCountWords = {}

f1 = open(fpCombineASTs, 'w')
f1.write('')
f1.close()

# walker = Walker('')
numOmit = 30

for index in range(0, len(lstFiles)):
    fpCodeFileCPP = lstFiles[index]
    fineName = os.path.basename(fpCodeFileCPP)
    # the glob above matches '*_code.cpp', so strip that suffix (not '_code.txt')
    fpASTItem = fopASTForC + fineName.replace('_code.cpp', '_ast.txt')
    # fFile=open(fpCodeFileCPP, 'r')
fopRoot='/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopStep3V2=fopRoot+'step3_v2/'
fopStep5V2=fopRoot+'step5_v2_HGT/'
fopStep5EmbeddingModels=fopRoot+'embeddingModels/'
fopStep5TextInfo=fopStep5EmbeddingModels+'textInfo/'
fopVectorModel= fopStep5EmbeddingModels + 'tfidf/'
fpD2VModelBin= fopVectorModel + 'model.bin'
fpD2VModelText= fopVectorModel + 'model.text.txt'
fopStep3TreesitterTokenize=fopRoot+'step3_treesitter_tokenize/'
fopStep2CodeReplaceDict=fopRoot+'step2_code_replaceDict/'
fopStep2PseudoTokenize=fopRoot+'step2_pseudo_tokenize/'
fpDictLiterals=fopRoot+'step2_dictLiterals_all.txt'
fopGraphEntityInfo=fopStep3V2+'graphEntityInfo/'
fopPOSNLTK=fopRoot+'step2_afterTranslation/sortBySimilarityScore/pos_nltk/'
fopPOSStanford=fopRoot+'step2_afterTranslation/sortBySimilarityScore/pos_stanford/'
createDirIfNotExist(fopStep5V2)
createDirIfNotExist(fopStep5EmbeddingModels)
createDirIfNotExist(fopStep5TextInfo)
createDirIfNotExist(fopVectorModel)

lstFpTextFiles=glob.glob(fopStep5TextInfo+'*.textTrainEmb.txt')

lstStrTextInfo=[]
setVocabs=set()
for fpItem in lstFpTextFiles:
    f1=open(fpItem,'r')
    arrTextItem=f1.read().strip().split('\n')
    f1.close()
    for line in arrTextItem:
        arrWords=line.split()
        for word in arrWords:
            setVocabs.add(word)
dictFolderContent={}
# dictNewContent={}
for i in range(0,len(lstFpInput)):
    fpItem=lstFpInput[i]
    nameItem=os.path.basename(fpItem)
    f1=open(fpItem,'r')
    arrContent=f1.read().strip().split('\n')
    f1.close()
    dictFolderContent[nameItem]=arrContent
    # dictNewContent[nameItem]=[[],[],[]]
fnLocation='location.txt'
fnSource='source.txt'
fnLabelOverlap='label.p1.overlap.txt'
fnTrainIndex='trainValidTest.index.txt'

createDirIfNotExist(fopStep6ResultBaseline)
lstResult=['EmbedModel\tRoot\tprecision\trecall\tfscore\tacc\ttrain time\tpredict time']
fpOverallResult=fopStep6ResultBaseline+'result_overall.txt'
f1=open(fpOverallResult,'w')
f1.write('\n'.join(lstResult)+'\n')
f1.close()
for embedModel in lstEmbeddingInput:

    fpEmbedVectorProgramRoot=fopStep5EmbeddingModels+embedModel+'/ProgramRoot.vectorForEmb.txt'
    fpEmbedVectorNLRoot = fopStep5EmbeddingModels + embedModel + '/NLRoot.vectorForEmb.txt'

    f1=open(fpEmbedVectorProgramRoot,'r')
    arrProgramRootEmb=f1.read().strip().split('\n')
    f1.close()
    f1 = open(fpEmbedVectorNLRoot, 'r')
    arrNLRootEmb = f1.read().strip().split('\n')
            lstStringTotal.append(strId)
        else:
            lstStringTotal.append(token)
    strOutput = ' '.join(lstStringTotal)
    return strOutput


fopRoot = '../../../../dataPapers/textInSPOC/correctCodeRaw/'
fopCodeFile = fopRoot + 'step2_pseudo/'
fopASTFile = fopRoot + 'step3_pseudo_treesitter/'
fopTokASTFile = fopRoot + 'step2_pseudo_tokenize/'
fpLogSuccessAndFailed = fopRoot + 'log_step2_pseudo_tok.txt'
fpPseudoAll = fopRoot + 'step2_pseudo_all.txt'
fpDictLiteral = fopRoot + 'step2_dictLiterals_all.txt'
fpDictRvLiteral = fopRoot + 'step2_dictRvLiterals_all.txt'
createDirIfNotExist(fopTokASTFile)
f1 = open(fpLogSuccessAndFailed, 'w')
f1.write('')
f1.close()

f1 = open(fpPseudoAll, 'w')
f1.write('')
f1.close()

dictLiterals = {}
dictReverseALs = {}

lstFpCodes = glob.glob(fopCodeFile + '**/*_text.txt', recursive=True)
for i in range(0, len(lstFpCodes)):
    fnItemCode = os.path.basename(lstFpCodes[i])
    fopItemCode = os.path.dirname(lstFpCodes[i]) + '/'
sys.path.append(os.path.abspath(os.path.join('../..')))
from UtilFunctions import createDirIfNotExist, getPOSInfo, writeDictToFileText, runASTGenAndSeeResult
import asyncio
import time
from joblib import Parallel, delayed

from tree_sitter import Language, Parser
from LibForGraphExtractionFromRawCode import getJsonDict, getTerminalValue
import ast
import re

fopRoot = '../../../../dataPapers/textInSPOC/correctCodeRaw/'
fopMixVersion = fopRoot + 'step4_mixCode/'
fopMixDataAndLabel = fopRoot + 'step5_data_mixCode/'
createDirIfNotExist(fopMixDataAndLabel)
lstSubFolder = glob.glob(fopMixVersion + '*/')


def scoreName(percentage):
    scoreLevel = '1'
    if percentage <= 10:
        scoreLevel = '1'
    elif percentage > 10 and percentage <= 20:
        scoreLevel = '2'
    elif percentage > 20 and percentage <= 30:
        scoreLevel = '3'
    elif percentage > 30 and percentage <= 40:
        scoreLevel = '4'
    elif percentage > 40 and percentage <= 50:
        scoreLevel = '5'
    # assumed continuation of the 10-point bucketing up to 100 percent
    elif percentage > 50 and percentage <= 60:
        scoreLevel = '6'
    elif percentage > 60 and percentage <= 70:
        scoreLevel = '7'
    elif percentage > 70 and percentage <= 80:
        scoreLevel = '8'
    elif percentage > 80 and percentage <= 90:
        scoreLevel = '9'
    else:
        scoreLevel = '10'
    return scoreLevel
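
# Usage sketch for the bucketing above (bands above 50 follow the assumed
# continuation added in scoreName):
assert scoreName(5) == '1'
assert scoreName(35) == '4'
assert scoreName(95) == '10'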