Beispiel #1
0
def buildCommitPart():
    repos = linkOperator.selectRepoOver(5000)
    logCorpus = open('commitLog.dat', "w")
    codeCorpus = open('commitCode.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            path = getPath(highRepo[1])
            try:
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print path, ":", len(commits)
                for commit in commits:
                    words = preprocessor.preprocessToWord(
                        commit.message.decode('utf-8'))
                    if len(words):
                        # 不是空列表
                        for word in words:
                            logCorpus.write(word.encode('utf-8'))
                            logCorpus.write(" ")
                        logCorpus.write("\n")
                    diffs = gitRe.getOneDiff(commit)
                    for diff in diffs:
                        diffCode = preprocessor.processDiffCode(diff.diff)
                        if len(diffCode):
                            for code in diffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                print "***", path, ":", e
                print traceback.format_exc()
        print 'end'
Beispiel #2
0
def buildIssueAndCommitSeq(repoId, repoPath, corpusName):
    corpus = open('corpus/code%s.dat' % corpusName, "w")
    try:
        print 'start'
        try:
            # commit part
            gitRe = gitResolver.GitResolver(repoPath)
            commits = gitRe.getCommits()
            print repoPath, ":", len(commits)
            for commit in commits:
                diffs = gitRe.getOneDiff(commit)
                for diff in diffs:
                    diffCode = preprocessor.processDiffCode(diff.diff)
                    if len(diffCode):
                        for word in diffCode:
                            corpus.write(word.encode('utf-8'))
                            corpus.write(" ")
                        corpus.write("\n")

            # issue part
            issues = mysqlOperator.selectAllIssueInOneRepo(repoId)
            print repoId, ":", len(issues)
            for issue in issues:
                if issue[5]:
                    bodycode = preprocessor.getIssueCode(
                        issue[5].decode('utf-8'))
                    if len(bodycode):
                        # 不是空列表
                        for word in bodycode:
                            corpus.write(word.encode('utf-8'))
                            corpus.write(" ")
                        corpus.write("\n")
        except BaseException, e:
            print "***", repoId, ":", e
            print traceback.format_exc()
        print 'end'
Beispiel #3
0
        return -1
    else:
        return dot_val / ((a_norm * b_norm)**0.5)


# Load the two saved Doc2Vec models: one used for text, one used for code.
textModel = Doc2Vec.load("text12983151.model")
codeModel = Doc2Vec.load("code12983151.model")

# Smoke test: run the same inference three times on a fixed sentence and
# print the types of the intermediate results.
index = 0
while index < 3:
    linkList = []  # rebuilt on every pass; only read by the commented-out dump below
    titleWords = preprocessor.preprocessToWord("test is for your parents")
    # Type of the raw inferred vector, before conversion to a list.
    print type(textModel.infer_vector(titleWords))
    titelTextVec = textModel.infer_vector(titleWords).tolist()
    # Type of a single component after .tolist().
    print type(titelTextVec[0])
    diffCode = preprocessor.processDiffCode("test is for your parents")
    commitCodeVec = codeModel.infer_vector(diffCode).tolist()
    # The same text/code vector pair is appended twice.
    linkList.append({'text': titelTextVec, 'code': commitCodeVec})
    linkList.append({'text': titelTextVec, 'code': commitCodeVec})
    index += 1

    # res = json.dumps(linkList, encoding="utf-8", indent=4)
    # trainSet = open('./train/traruanhincase%d.dat' % index, "w")
    # trainSet.write(res)
    # trainSet.close()

# path = './train'
# filelist = os.listdir(path)
# for i in range(0, len(filelist)):
#     filepath = os.path.join(path, filelist[i])
#     print filepath
# -*- coding: UTF-8 -*-

from preprocessor import preprocessor
import re

# Manual check of processHTML on issue text that contains <code> spans.
print preprocessor.processHTML('''
    Examples shown in the javadoc for TESD_DSAFSA_DDS <code>ReplayingDecoder.addOption</code> seems to be wrong. In the document it shows <code>IntegerHeaderFrameDecoder, MyDecoder</code> taking multiple parameters where as in reality it can only accept one. I'm working with versions 4.0.0.CR3, 4.0.0.CR5.
    ''')

# Manual check of processDiffCode on a small diff hunk (one removed line,
# one added line).
print preprocessor.processDiffCode('''
@@ -349 +349 @@ public class JavadocUtilsTest {
-            "HTML_COMMENT", JavadocUtils.getTokenName(20077));
+            "HTML_COMMENT", JavadocUtils.getTokenName(20078));
    ''')

# Manual check of preprocessToWord on the same issue text without markup.
print preprocessor.preprocessToWord('''
Examples shown in the javadoc for TESD_DSAFSA_DDS ReplayingDecoder.addOption seems to be wrong. In the document it shows IntegerHeaderFrameDecoder, MyDecoder taking multiple parameters where as in reality it can only accept one. I'm working with versions 4.0.0.CR3, 4.0.0.CR5.
     ''')

# None is falsy, so this always takes the else branch and prints 'other'.
if None:
    print 'none'
else:
    print 'other'
# camelCase1 = re.compile(r'^[A-Z]+[a-z]+.*[A-Z]+.*$') # 3
# camelCase2 = re.compile(r'^[a-z]+.*[A-Z]+.*$') # 12
# upperCase = re.compile(r'^[A-Z]+[0-9]*$') # 7
# upperExtCase = re.compile(r'^[A-Z]*(_+[A-Z]*)+[0-9]*$') # 6
#
# print re.match(upperExtCase, 'aOption'), '1'
# print re.match(upperExtCase, 'addOption'), '2'
# print re.match(upperExtCase, 'AddToDeal'), '3'
Beispiel #5
0
def buildTrainSet(trueTable, falseTable, repoId, repoPath, trueGap, falseGap, trueCount, falseCount):
    trueStart = 1
    falseStart = 1
    trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount))
    falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount)

    index = 0
    repo = gitResolver.GitResolver(repoPath)
    while len(trueLinkList) > 0 and len(falseLinkList) > 0:
        print 'true: ', trueStart, ' to ', trueStart + trueCount
        print 'false: ', falseStart, ' to ', falseStart + falseCount
        linkList = []
        for trueLink in trueLinkList:
            commit = repo.getOneCommit(trueLink[1])
            issue = mysqlOperator.selectOneIssue(trueLink[2])
            if issue is None:
                continue

            res = {}
            res['type'] = 1
            res['commit'] = commit.message.decode('utf-8')
            res['issuetitle'] = issue[4].decode('utf-8')
            # issue body
            if issue[5]:
                res['issue'] = issue[5].decode('utf-8')
                issueCodes = []
                bodycode = preprocessor.getIssueCode(res['issue'])
                if len(bodycode):
                    issueCodes.append(bodycode)
                res['issuecode'] = issueCodes
            else:
                res['issue'] = ''
                res['issuecode'] = []

            diffs = repo.getOneDiff(commit)
            diffCodes = []
            for diff in diffs:
                diffCode = preprocessor.processDiffCode(diff.diff)
                if len(diffCode):
                    diffCodes.append(diffCode)
            res['commitcode'] = diffCodes

            linkList.append(res)

        for falseLink in falseLinkList:
            commit = repo.getOneCommit(falseLink[1])
            issue = mysqlOperator.selectOneIssue(falseLink[2])
            if issue is None:
                continue

            res = {}
            res['type'] = 0
            res['commit'] = commit.message.decode('utf-8')
            res['issuetitle'] = issue[4].decode('utf-8')
            # issue body
            if issue[5]:
                res['issue'] = issue[5].decode('utf-8')
                issueCodes = []
                bodycode = preprocessor.getIssueCode(res['issue'])
                if len(bodycode):
                    issueCodes.append(bodycode)
                res['issuecode'] = issueCodes
            else:
                res['issue'] = ''
                res['issuecode'] = []

            diffs = repo.getOneDiff(commit)
            diffCodes = []
            for diff in diffs:
                diffCode = preprocessor.processDiffCode(diff.diff)
                if len(diffCode):
                    diffCodes.append(diffCode)
            res['commitcode'] = diffCodes
            linkList.append(res)

        index += 1
        res = json.dumps(linkList, encoding="utf-8", indent=4)
        trainSet = open('./codetrain%d/codetrain%d-%d.dat' % (repoId, repoId, index), "w")
        trainSet.write(res)
        trainSet.close()
        print './codetrain%d/codetrain%d-%d.dat' % (repoId, repoId, index), 'Over'

        trueStart += trueGap
        falseStart += falseGap
        trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount))
        falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount)
    mysqlOperator.close()
    linkOperator.close()
Beispiel #6
0
def buildTrainSet(trueTable, falseTable, repoId, repoPath, trueGap, falseGap, trueCount, falseCount):
    trueStart = 1
    falseStart = 1
    textCorpus = open('frcorpus/frtext%d.dat' % repoId, "w")
    codeCorpus = open('frcorpus/frcode%d.dat' % repoId, "w")
    trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount))
    falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount)

    index = 0
    repo = gitResolver.GitResolver(repoPath)
    try:
        while len(trueLinkList) > 0 and len(falseLinkList) > 0:
            print 'true: ', trueStart, ' to ', trueStart + trueCount
            print 'false: ', falseStart, ' to ', falseStart + falseCount
            my_linkList = []
            fr_linkList = []
            for trueLink in trueLinkList:
                commit = repo.getOneCommit(trueLink[1])
                issue = mysqlOperator.selectOneIssue(trueLink[2])
                if issue is None:
                    continue

                my_res = {}
                my_res['type'] = 1
                my_res['commit'] = commit.message.decode('utf-8')
                my_res['issuetitle'] = issue[4].decode('utf-8')
                # issue body
                if issue[5]:
                    my_res['issue'] = issue[5].decode('utf-8')
                    issueCodes = []
                    bodycode = preprocessor.getIssueCode(my_res['issue'])
                    if len(bodycode):
                        issueCodes.append(bodycode)
                    my_res['issuecode'] = issueCodes
                else:
                    my_res['issue'] = ''
                    my_res['issuecode'] = []

                diffs = repo.getOneDiff(commit)
                diffCodes = []
                for diff in diffs:
                    diffCode = preprocessor.processDiffCode(diff.diff)
                    if len(diffCode):
                        diffCodes.append(diffCode)
                my_res['commitcode'] = diffCodes
                my_linkList.append(my_res)

                fr_res = {}
                fr_res['type'] = 1
                fr_res['issueText'] = []
                # issue body
                if issue[5]:
                    fr_res['issueCode'] = frpreprocesser.extractCode(issue[5].decode('utf-8'))
                    fr_res['issueText'].append(frpreprocesser.extractText(issue[5].decode('utf-8')))  # body
                else:
                    fr_res['issueCode'] = []
                fr_res['issueText'].append(frpreprocesser.extractText(issue[4].decode('utf-8')))  # title
                fr_res['commitText'] = []
                fr_res['commitCode'] = []
                fr_res['commitText'].append(frpreprocesser.extractText(commit.message.decode('utf-8')))
                comments = mysqlOperator.selectCommentInOneIssue(trueLink[2])
                for comment in comments:
                    fr_res['issueText'].append(frpreprocesser.extractText(comment[4].decode('utf-8')))
                try:
                    files = repo.getFiles(trueLink[1])
                    for changeFile in files:
                        if not changeFile['path'].endswith('.java'):
                            try:
                                fr_res['commitText'].append(
                                    frpreprocesser.extractText(changeFile['text'].decode('utf-8')))
                            except:
                                print trueLink[1], ':', changeFile['path']
                        else:
                            codes = frpreprocesser.extractCode(changeFile['text'].decode('utf-8'))
                            for code in codes:
                                if code in fr_res['issueCode']:
                                    fr_res['commitCode'].extend(codes)
                                    break
                except:
                    print 'File Fail 1:', trueLink[1]
                fr_linkList.append(fr_res)
                writeToCorpus(textCorpus, codeCorpus, fr_res['commitText'], fr_res['commitCode'])
                writeToCorpus(textCorpus, codeCorpus, fr_res['issueText'], fr_res['issueCode'])

            for falseLink in falseLinkList:
                commit = repo.getOneCommit(falseLink[1])
                issue = mysqlOperator.selectOneIssue(falseLink[2])
                if issue is None:
                    continue

                my_res = {}
                my_res['type'] = 0
                my_res['commit'] = commit.message.decode('utf-8')
                my_res['issuetitle'] = issue[4].decode('utf-8')
                # issue body
                if issue[5]:
                    my_res['issue'] = issue[5].decode('utf-8')
                    issueCodes = []
                    bodycode = preprocessor.getIssueCode(my_res['issue'])
                    if len(bodycode):
                        issueCodes.append(bodycode)
                    my_res['issuecode'] = issueCodes
                else:
                    my_res['issue'] = ''
                    my_res['issuecode'] = []

                diffs = repo.getOneDiff(commit)
                diffCodes = []
                for diff in diffs:
                    diffCode = preprocessor.processDiffCode(diff.diff)
                    if len(diffCode):
                        diffCodes.append(diffCode)
                my_res['commitcode'] = diffCodes
                my_linkList.append(my_res)

                fr_res = {}
                fr_res['type'] = 0
                fr_res['issueText'] = []
                # issue body
                if issue[5]:
                    fr_res['issueCode'] = frpreprocesser.extractCode(issue[5].decode('utf-8'))
                    fr_res['issueText'].append(frpreprocesser.extractText(issue[5].decode('utf-8')))  # body
                else:
                    fr_res['issueCode'] = []
                fr_res['issueText'].append(frpreprocesser.extractText(issue[4].decode('utf-8')))  # title
                fr_res['commitText'] = []
                fr_res['commitCode'] = []
                fr_res['commitText'].append(frpreprocesser.extractText(commit.message.decode('utf-8')))
                comments = mysqlOperator.selectCommentInOneIssue(falseLink[2])
                for comment in comments:
                    fr_res['issueText'].append(frpreprocesser.extractText(comment[4].decode('utf-8')))
                try:
                    files = repo.getFiles(falseLink[1])
                    for changeFile in files:
                        if not changeFile['path'].endswith('.java'):
                            try:
                                fr_res['commitText'].append(
                                    frpreprocesser.extractText(changeFile['text'].decode('utf-8')))
                            except:
                                print trueLink[1], ':', changeFile['path']
                        else:
                            codes = frpreprocesser.extractCode(changeFile['text'].decode('utf-8'))
                            for code in codes:
                                if code in fr_res['issueCode']:
                                    fr_res['commitCode'].extend(codes)
                                    break
                except:
                    print 'File Fail 0:', falseLink[1]
                fr_linkList.append(fr_res)
                writeToCorpus(textCorpus, codeCorpus, fr_res['commitText'], fr_res['commitCode'])
                writeToCorpus(textCorpus, codeCorpus, fr_res['issueText'], fr_res['issueCode'])

            index += 1
            res = json.dumps(my_linkList, encoding="utf-8", indent=4)
            trainSet = open('%s/codetrain%d-%d.dat' % (my_folder, repoId, index), "w")
            trainSet.write(res)
            trainSet.close()
            print '%s/codetrain%d-%d.dat' % (my_folder, repoId, index), 'Over'
            fres = json.dumps(fr_linkList, encoding="utf-8", indent=4)
            ftrainSet = open('%s/traincase%d-%d.dat' % (fr_folder, repoId, index), "w")
            ftrainSet.write(fres)
            ftrainSet.close()
            print '%s/traincase%d-%d.dat' % (fr_folder, repoId, index), 'Over'

            trueStart += trueGap
            falseStart += falseGap
            trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount))
            falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount)
    except IOError, e:
        print "***", e
        print traceback.format_exc()
Beispiel #7
0
def buildIssueAndCommit():
    repos = linkOperator.selectOneRepo(50904245)
    # repos = linkOperator.selectRepoOver(5000)
    textCorpus = open('text50904245.dat', "w")
    codeCorpus = open('code50904245.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            try:
                # commit part
                path = getPath(highRepo[1])
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print path, ":", len(commits)
                for commit in commits:
                    words = preprocessor.preprocessToWord(
                        commit.message.decode('utf-8'))
                    if len(words):
                        # 不是空列表
                        for word in words:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    diffs = gitRe.getOneDiff(commit)
                    for diff in diffs:
                        diffCode = preprocessor.processDiffCode(diff.diff)
                        preDiffCode = preprocessor.processPreDiffCode(
                            diff.diff)
                        if len(diffCode):
                            for code in diffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                        if len(preDiffCode):
                            for code in preDiffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                # issue part
                issues = mysqlOperator.selectAllIssueInOneRepo(highRepo[0])
                print highRepo[0], ":", len(issues)
                for issue in issues:
                    titleWords = preprocessor.preprocessToWord(
                        issue[4].decode('utf-8'))
                    if len(titleWords):
                        # 不是空列表
                        for word in titleWords:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    if issue[5]:
                        body = preprocessor.processHTML(
                            issue[5].decode('utf-8'))
                        bodyWords = body[1]
                        codeWords = body[0]
                        if len(bodyWords):
                            # 不是空列表
                            for word in bodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(codeWords):
                            # 不是空列表
                            for word in codeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                    comments = mysqlOperator.selectCommentInOneIssue(issue[1])
                    for comment in comments:
                        temp = preprocessor.processHTML(
                            comment[4].decode('utf-8'))
                        cBodyWords = temp[1]
                        cCodeWords = temp[0]
                        if len(cBodyWords):
                            # 不是空列表
                            for word in cBodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(cCodeWords):
                            # 不是空列表
                            for word in cCodeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                print "***", highRepo[0], ":", e
                print traceback.format_exc()
        print 'end'
Beispiel #8
0
index = 0
while len(trueLinkList) > 0 and len(falseLinkList) > 0:
    print 'true: ', trueStart, ' to ', trueStart+TRUE_COUNT
    print 'false: ', falseStart, ' to ', falseStart+FALSE_COUNT
    linkList = []
    for trueLink in trueLinkList:
        tempMap = {}
        tempMap['type'] = 1
        repo = repoMap[trueLink[0]]
        commit = repo.getOneCommit(trueLink[1])
        issue = mysqlOperator.selectOneIssue(trueLink[2])
        comments = mysqlOperator.selectCommentInOneIssue(trueLink[2])
        diffs = repo.getOneDiff(commit)
        diffCodeList = []
        for diff in diffs:
            diffCode = preprocessor.processDiffCode(diff.diff)
            preDiffCode = preprocessor.processPreDiffCode(diff.diff)
            diffCodeList.append((codeModel.infer_vector(diffCode), codeModel.infer_vector(preDiffCode)))

        # code part init
        codeMax = -1
        tempMap['commitCode'] = None
        tempMap['issueCode'] = None
        # text part init
        commitText = preprocessor.preprocessToWord(commit.message.decode('utf-8'))
        commitTextVec = textModel.infer_vector(commitText)
        tempMap['commitText'] = commitTextVec  # 确定不变
        titleWords = preprocessor.preprocessToWord(issue[4].decode('utf-8'))
        tempMap['issueText'] = textModel.infer_vector(titleWords)  # 可能改变
        textMax = similarity(commitTextVec, tempMap['issueText'])
        # issue body