Example no. 1
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
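
# Note: createDirIfNotExist is a project helper used throughout these examples
# but not shown in any snippet; a minimal sketch of what it presumably does:
import os

def createDirIfNotExist(fopDir):
    # create the directory (and any missing parents) if it does not exist yet
    if not os.path.exists(fopDir):
        os.makedirs(fopDir)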

fopRoot = '../../../../../../media/dataPapersExternal/mixCodeRaw/'
fpInputText = fopRoot + 'embeddingModels/d2v/paragraph_text.txt'
fopOutputML = fopRoot + 'resultMLs/doc2vec-rfs-small/'
fpResultDetails = fopOutputML + 'result_details.txt'
fpDoc2VecModel = fopRoot + 'embeddingModels/d2v/d2v.model.txt'
createDirIfNotExist(fopOutputML)

f1 = open(fpInputText, 'r')
arrText = f1.read().strip().split('\n')
f1.close()

trainIndex = 0
lstAllData = []
num_features = 100

keyStartEnd = ''
dictStartEnd = {}
index = 0
lstTuplesTrainTestValid = []
arrText.reverse()
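# paragraph_text.txt stores each record as a pair of lines (cf. the writer in
# Example no. 5), so the loop below visits every second line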
for i in range(1, len(arrText), 2):
Example no. 2
from nltk.tokenize import word_tokenize
import ast
import re
import pygraphviz as pgv
import pydot
from subprocess import check_call
from graphviz import render
import copy
import nltk
from pathlib import Path
import glob  # used below but missing from the original snippet

fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputStep2Pseudocode = fopRoot + 'step2_pseudo_tokenize/'
fopInputStep2ReplaceDict = fopRoot + 'step2_code_replaceDict/'
fopInputStep2BeforeTranslation = fopRoot + 'step2_beforeTranslation/step1/'
createDirIfNotExist(fopInputStep2ReplaceDict)
createDirIfNotExist(fopInputStep2BeforeTranslation)

lstFopStep2Tok1 = sorted(glob.glob(fopInputStep2Pseudocode + '**/'),
                         reverse=True)
lstFpStep2Pseudocode = []
# gather all *_text.txt files from every tokenized-pseudocode subfolder
for fop1 in lstFopStep2Tok1:
    lstFpItem1 = sorted(glob.glob(fop1 + '*_text.txt'), reverse=True)
    for fpItem in lstFpItem1:
        lstFpStep2Pseudocode.append(fpItem)

fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
f1 = open(fpDictLiterals, 'r')
arrLits = f1.read().strip().split('\n')
f1.close()
dictLiteralsReverse = {}
Example no. 3
                        tupItem=(strSourceValue,strTargetValue,strEdgeId)
                        dictHGTEdges[strNewKey].append(tupItem)
    except:
        traceback.print_exc()
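
# Imports assumed by this snippet (its beginning is truncated in the source):
import os, glob, traceback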

fopRoot='/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopMixVersion=fopRoot+'step4_mixCode/'
numContext=5
fopTotalGraphAll= fopRoot + 'step5_totalGraph_small/'+str(numContext)+'/'
fpFileCachedVersion=fopRoot+'cached_graph_all.txt'
fpDotTotalGraph= fopTotalGraphAll + 'total.'+str(numContext)+'.dot'
fpPngTotalGraph= fopTotalGraphAll + 'total.'+str(numContext)+'.png'
fpDictLiterals=fopRoot+'step2_dictLiterals_all.txt'

strSplitCharacterForNodeEdge=' ABAZ '
createDirIfNotExist(fopTotalGraphAll)

dictTotalElements={}

lstFpVersionFiles=[]
# traverse the directory tree only when the cached file list is absent
# (the cache is presumably written after the first traversal)
if not os.path.isfile(fpFileCachedVersion):
    print('before traverse')
    lstFop1=sorted(glob.glob(fopMixVersion+'*/'))
    for fop1 in lstFop1:
        lstFop2=sorted(glob.glob(fop1+'*/'))
        for fop2 in lstFop2:
            lstFp3=sorted(glob.glob(fop2+'v_*_graphs/g_all.dot'))
            # print(fp3)
            for fp3 in lstFp3:
                lstFpVersionFiles.append(fp3)
        print('end {}'.format(fop1))
Example no. 4
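# Imports assumed by this snippet (its beginning is truncated in the source;
# cf. the import block of Example no. 6):
import nltk
from nltk.data import find
from bllipparser import RerankingParser
from pycorenlp import StanfordCoreNLP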
strSingleComment = ' SINGLECOMMENTCHAR '

fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputStep2Pseudocode = fopRoot + 'step2_pseudo_tokenize/'
fopInputStep2ReplaceDict = fopRoot + 'step2_code_replaceDict/'
fopInputStep2BeforeTranslation = fopRoot + 'step2_beforeTranslation/step1/'
fopInputStep2Folds = fopRoot + 'step2_beforeTranslation/folds/'
fopInputAfterTranslation = fopRoot + 'step2_afterTranslation/'
fpSourceAfterTrans = fopInputAfterTranslation + 'source.txt'
fopPOSStanford = fopInputAfterTranslation + 'pos_stanford/'
fopPOSNLTK = fopInputAfterTranslation + 'pos_nltk/'
fopSortedBySimScore = fopInputAfterTranslation + 'sortBySimilarityScore/'
fpSortedSource = fopSortedBySimScore + 'source.txt'
fopSortedPOSStanford = fopSortedBySimScore + 'pos_stanford/'
fopSortedPOSNLTK = fopSortedBySimScore + 'pos_nltk/'
createDirIfNotExist(fopSortedPOSStanford)
createDirIfNotExist(fopSortedPOSNLTK)

strParseResultsType = "<class 'pyparsing.ParseResults'>"
strStrType = "<class 'str'>"
# download the BLLIP parsing model via NLTK and load it with bllipparser
nltk.download('bllip_wsj_no_aux')
model_dir = find('models/bllip_wsj_no_aux').path
parser = RerankingParser.from_unified_model_dir(model_dir)

strServerPort = '9000'
# assumes a Stanford CoreNLP server is already listening on this local port
nlpObj = StanfordCoreNLP('http://localhost:' + strServerPort)

fpAppendNLTKText = fopSortedBySimScore + 'appendPOS.nltk.text.txt'
fpAppendNLTKPOS = fopSortedBySimScore + 'appendPOS.nltk.pos.txt'
fpAppendStanfordText = fopSortedBySimScore + 'appendPOS.stanford.text.txt'
fpAppendStanfordPOS = fopSortedBySimScore + 'appendPOS.stanford.pos.txt'
Example no. 5
            str2LineToAdd='{}\t{}\t{}\tProgramRoot\t{}\n{}\t{}\t{}\tNLRoot\t{}'.format(
                fpItemLabel,programName,versionName,strCodeText,
                fpItemLabel,programName,versionName,strPseudoText)
            lstProgramText.append(str2LineToAdd)
            # flush the accumulated lines to disk every 1000 programs (and at the end)
            if (i+1)%1000==0 or (i+1)==len(lstFpJsonFiles):
                f1=open(fpParagraphText,'a')
                f1.write('\n'.join(lstProgramText)+'\n')
                f1.close()
                lstProgramText=[]
                print('end {} paragraphText'.format((i+1)))
        except:
            traceback.print_exc()

print('end paragraph text')
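
# Imports assumed by this snippet (its beginning is truncated in the source):
import os, traceback
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize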

modelD2v=Doc2Vec.load(fpD2VModel)
if not os.path.isdir(fopParagraphEmb):
    createDirIfNotExist(fopParagraphEmb)
    f1=open(fpParagraphText,'r')
    arrParagraphText=f1.read().strip().split('\n')
    f1.close()
    lstEmbeddings=[]
    indexPara=0
    print('read paragraph text {}'.format(len(arrParagraphText)))
    for i in range(0,len(arrParagraphText)):
        arrTabs=arrParagraphText[i].split('\t')
        if len(arrTabs)>=4:
            strContent=''.join(arrTabs[3:])
            # print(strContent)
            # infer a Doc2Vec embedding for the tokenized paragraph text
            vectorItem=modelD2v.infer_vector(word_tokenize(strContent))
            strItem='{}\t{}\t{}\t{}'.format(
                arrTabs[0],arrTabs[1],arrTabs[2],' '.join(map(str,vectorItem)))
            lstEmbeddings.append(strItem)
        if len(lstEmbeddings)%2000==0 or (i+1)==len(arrParagraphText):
Example no. 6

from bllipparser import RerankingParser
import nltk
from pyparsing import OneOrMore, nestedExpr
from nltk.tokenize import word_tokenize, sent_tokenize
from pycorenlp import StanfordCoreNLP
# glob and os are used below but missing from the original snippet
import glob
import os

fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputAfterTranslation = fopRoot + 'step2_afterTranslation/'
fopFilterSortedBySimScore = fopInputAfterTranslation + 'sortBySimilarityScore/filterByPOS/'

fopSplit = fopInputAfterTranslation + 'sortBySimilarityScore/trainValidTest/'
fpTrainValidTestIndex = fopSplit + 'trainValidTest.index.txt'
fopTrain = fopSplit + 'train/'
fopValid = fopSplit + 'valid/'
fopTest = fopSplit + 'test/'
createDirIfNotExist(fopTrain)
createDirIfNotExist(fopValid)
createDirIfNotExist(fopTest)

lstFpInput = glob.glob(fopFilterSortedBySimScore + '*.txt')

dictFolderContent = {}
dictNewContent = {}  # per-file [train, valid, test] line buckets
for i in range(0, len(lstFpInput)):
    fpItem = lstFpInput[i]
    nameItem = os.path.basename(fpItem)
    f1 = open(fpItem, 'r')
    arrContent = f1.read().strip().split('\n')
    f1.close()
    dictFolderContent[nameItem] = arrContent
    dictNewContent[nameItem] = [[], [], []]
Example no. 7
import ast
import re
import pygraphviz as pgv
import pydot
from subprocess import check_call
from graphviz import render
import copy
import nltk
from pathlib import Path

fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputStep2Pseudocode = fopRoot + 'step2_pseudo_tokenize/'
fopInputStep2ReplaceDict = fopRoot + 'step2_code_replaceDict/'
fopInputStep2BeforeTranslation = fopRoot + 'step2_beforeTranslation/step1/'
fopInputStep2Folds = fopRoot + 'step2_beforeTranslation/folds/'
createDirIfNotExist(fopInputStep2ReplaceDict)
createDirIfNotExist(fopInputStep2Folds)

fpDictLiterals = fopRoot + 'step2_dictLiterals_all.txt'
f1 = open(fpDictLiterals, 'r')
arrLits = f1.read().strip().split('\n')
f1.close()
dictLiteralsReverse = {}  # maps literal text back to its placeholder key
for item in arrLits:
    arrTabs = item.split('\t')
    if len(arrTabs) >= 2:
        strContent = '\t'.join(arrTabs[1:])
        dictLiteralsReverse[strContent] = arrTabs[0]

fpLocation = fopInputStep2BeforeTranslation + 'location.txt'
fpSource = fopInputStep2BeforeTranslation + 'source.txt'
Example no. 8
fopStanfordCoreNLP = '/home/hungphd/git/dataPapers/stanford-corenlp-4.2.2/'

strParseResultsType = "<class 'pyparsing.ParseResults'>"
strStrType = "<class 'str'>"
strSplitElement = ' SPLITELEMENT '
strEndLine = ' ENDLINE '
strTabChar = ' TABCHAR '
strSingleComment = ' SINGLECOMMENTCHAR '

fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputAfterTranslation = fopRoot + 'step2_afterTranslation/'
fopSortedBySimScore = fopInputAfterTranslation + 'sortBySimilarityScore/'
fopFilterSortedBySimScore = fopSortedBySimScore + 'filterByPOS/'
fopStep3Filer = fopRoot + 'step3_filter_pos/'
createDirIfNotExist(fopFilterSortedBySimScore)
fpFilterSortedLocation = fopFilterSortedBySimScore + 'location.txt'
fpFilterSortedSource = fopFilterSortedBySimScore + 'source.txt'
fpFilterSortedTarget = fopFilterSortedBySimScore + 'target.txt'
fpFilterSortedPred = fopFilterSortedBySimScore + 'pred.txt'
fpFilterSortedIdentifierLog = fopFilterSortedBySimScore + 'identifier_log.txt'
fpFilterSortedLabelOverlapDetection = fopFilterSortedBySimScore + 'label.p1.overlap.txt'
fpFilterSortedLabelIdentifierDetection = fopFilterSortedBySimScore + 'label.p2.identifierDetection.txt'
fpFilterSortedLabelJaccard = fopFilterSortedBySimScore + 'label.p3.jaccardSimilarity.txt'

f1 = open(fpFilterSortedSource, 'r')
arrSources = f1.read().strip().split('\n')
f1.close()
f1 = open(fpFilterSortedLocation, 'r')
arrLocs = f1.read().strip().split('\n')
f1.close()
Example no. 9

strEndLine=' ENDLINE '
strTabChar=' TABCHAR '
strSingleComment=' SINGLECOMMENTCHAR '

fopRoot='/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputAfterTranslation=fopRoot+'step2_afterTranslation/'
fopSortedBySimScore=fopInputAfterTranslation+'sortBySimilarityScore/'
fpSortedLocation=fopSortedBySimScore+'location.txt'
fpSortedSource=fopSortedBySimScore+'source.txt'
fpSortedTarget=fopSortedBySimScore+'target.txt'
fpSortedPred=fopSortedBySimScore+'pred.txt'
fpSortedIdentifiers=fopSortedBySimScore+'identifier_log.txt'
fpSortedDetails=fopSortedBySimScore+'sortedDetails.txt'
fopSortedPOSStanford=fopSortedBySimScore+'pos_stanford/'
fopSortedPOSNLTK=fopSortedBySimScore+'pos_nltk/'
createDirIfNotExist(fopSortedPOSStanford)
createDirIfNotExist(fopSortedPOSNLTK)
fopFilterSortedBySimScore=fopSortedBySimScore+'filterByPOS/'
fopStep3Filer=fopRoot+'step3_filter_pos/'
createDirIfNotExist(fopFilterSortedBySimScore)
fpFilterSortedLocation=fopFilterSortedBySimScore+'location.txt'
fpFilterSortedSource=fopFilterSortedBySimScore+'source.txt'
fpFilterSortedTarget=fopFilterSortedBySimScore+'target.txt'
fpFilterSortedPred=fopFilterSortedBySimScore+'pred.txt'
fpFilterSortedSortDetails=fopFilterSortedBySimScore+'sortedDetails.txt'
fpFilterSortedIdentifier=fopFilterSortedBySimScore+'identifier_log.txt'
fpFilterSortedPOSNLTK=fopFilterSortedBySimScore+'pos_nltk.txt'
fpFilterSortedPOSStanford=fopFilterSortedBySimScore+'pos_stanford.txt'


Example no. 10

            break
    # if not result:
    #     print('check false')
    return result
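
# Imports assumed by this snippet (its beginning is truncated in the source):
import ast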



fopRoot='/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputAfterTranslation=fopRoot+'step2_afterTranslation/'
fopInputSortBySimilarityScore=fopInputAfterTranslation+'sortBySimilarityScore/'
fpSourceAfterTrans=fopInputAfterTranslation+'source.txt'
fpTargetAfterTrans=fopInputAfterTranslation+'target.txt'
fpLocationAfterTrans=fopInputAfterTranslation+'location.txt'
fpPredAfterTrans=fopInputAfterTranslation+'pred.txt'

createDirIfNotExist(fopInputSortBySimilarityScore)
fpSortedSourceAfterTrans=fopInputSortBySimilarityScore+'source.txt'
fpSortedTargetAfterTrans=fopInputSortBySimilarityScore+'target.txt'
fpSortedLocationAfterTrans=fopInputSortBySimilarityScore+'location.txt'
fpSortedPredAfterTrans=fopInputSortBySimilarityScore+'pred.txt'
fpSortedDetailsAfterTrans=fopInputSortBySimilarityScore+'sortedDetails.txt'
fpFuncDeclInfo=fopInputSortBySimilarityScore+'function_declarations.txt'

f1=open(fpFuncDeclInfo,'r')
arrFuncDecls=f1.read().strip().split('\n')
f1.close()
dictFuncDecls={}  # function-declaration info keyed by the first two tab-separated fields
for item in arrFuncDecls:
    arrTabs=item.split('\t')
    strKey=arrTabs[0]+'\t'+arrTabs[1]
    lstVal=ast.literal_eval(arrTabs[2])
Example no. 11
for i in range(0, len(arrSources)):
    if arrSources[i].strip() != '':
        dictText.add(arrSources[i].strip())  # dictText is a set of unique source lines
print(len(dictText))
lstText = list(dictText)

lstItemText = []
lstItemPOS1 = []
lstItemPOS2 = []
indexItem = 0

# if os.path.exists(fopPOSStanford) and os.path.isdir(fopPOSStanford):
#     shutil.rmtree(fopPOSStanford)
# if os.path.exists(fopPOSNLTK) and os.path.isdir(fopPOSNLTK):
#     shutil.rmtree(fopPOSNLTK)
createDirIfNotExist(fopPOSStanford)
createDirIfNotExist(fopPOSNLTK)

for i in range(0, len(lstText)):
    try:
        strText = lstText[i]
        strPOSStanford = getGraphDependencyFromTextByStanford(strText, nlpObj)

        try:
            lstNonT = []
            lstT = []
            best = parser.parse(strText)
            strParseContent = str(best.get_parser_best().ptb_parse)
            dataParseResult = OneOrMore(
                nestedExpr()).parseString(strParseContent)
            strPOSNLTK = walkAndGetPOSJSonByNLTK(dataParseResult, 0, lstNonT,
Example no. 12
import copy
import nltk
from pathlib import Path

fopRoot = '/home/hungphd/media/dataPapersExternal/mixCodeRaw/'
fopInputStep2Pseudocode = fopRoot + 'step2_pseudo_tokenize/'
fopInputStep2ReplaceDict = fopRoot + 'step2_code_replaceDict/'
fopInputStep2BeforeTranslation = fopRoot + 'step2_beforeTranslation/step1/'
fopInputStep2Folds = fopRoot + 'step2_beforeTranslation/folds/'
fopInputAfterTranslation = fopRoot + 'step2_afterTranslation/'
fpTestLocExpected = fopInputStep2BeforeTranslation + 'location.txt'
fpTestPredBeforeSort = fopInputAfterTranslation + 'test.pred.unsorted.txt'
fpTestLocBeforeSort = fopInputAfterTranslation + 'test.loc.unsorted.txt'
fpTestPredAfterSort = fopInputAfterTranslation + 'pred.txt'
fpTestLocAfterSort = fopInputAfterTranslation + 'pred.loc.txt'
createDirIfNotExist(fopInputAfterTranslation)
# clear the unsorted prediction/location files before appending per-fold results
f1 = open(fpTestLocBeforeSort, 'w')
f1.write('')
f1.close()
f1 = open(fpTestPredBeforeSort, 'w')
f1.write('')
f1.close()

dictLocBefore = {}

for i in range(1, 11):  # iterate over the 10 folds
    nameFold = 'fold-{}'.format(i)
    fpFoldTestLocation = fopInputStep2Folds + nameFold + '/test.loc.txt'
    fpFoldTestPred = fopInputStep2Folds + nameFold + '/pred.txt'

    f1 = open(fpFoldTestLocation, 'r')