Example 1
def divide_into_words(input_file, output_file, labels=None):
    res = []
    if labels is None:
        f = open(input_file, "r")
        issue_corpus = json.loads(f.read())
        f.close()

        i = 0
        length = len(issue_corpus)
        for issue in issue_corpus:
            content = unicode(issue['title']) + u': ' + unicode(issue['body'])
            words = preprocessor.preprocessToWord(content)
            res.append(words)
            # out = ""
            # for word in words:
            #     out += word+" "
            # f.write(out+"\n")
            i += 1
            print '%d / %d...............%.4f%%' % (i, length,
                                                    i * 100.0 / length)
            # if i == length/2:
            #     f = open(output_file+"-part1", "w")
            #     f.write(json.dumps(res, encoding="utf-8"))
            #     f.close()
            #     res = []
        # f = open(output_file + "-part2", "w")
        f = open(output_file, "w")
        f.write(json.dumps(res, encoding="utf-8"))
        f.close()
    else:
        for label in labels:
            f = open(input_file + label + ".ic", "r")
            issue_corpus = json.loads(f.read())
            f.close()

            i = 0
            length = len(issue_corpus)
            for issue in issue_corpus:
                content = unicode(issue['title']) + u': ' + unicode(
                    issue['body'])
                words = preprocessor.preprocessToWord(content)
                res.append(words)
                # out = ""
                # for word in words:
                #     out += word+" "
                # f.write(out+"\n")
                i += 1
                print '%d / %d...............%.4f%%' % (i, length,
                                                        i * 100.0 / length)
                # if i == length/2:
                #     f = open(output_file+"-part1", "w")
                #     f.write(json.dumps(res, encoding="utf-8"))
                #     f.close()
                #     res = []
            # f = open(output_file + "-part2", "w")
            f = open(output_file + label + ".ic", "w")
            f.write(json.dumps(res, encoding="utf-8"))
            f.close()
            res = []
    return res
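A minimal usage sketch (paths and labels below are placeholders, not taken from the original source): the function can be run either on a single corpus file or once per label.

# hypothetical call on a single corpus file; returns a list of word lists
words_per_issue = divide_into_words('./output/issue_corpus.ic',
                                    './output/words_corpus.ic')

# per-label variant: reads input_file + label + ".ic" and writes
# output_file + label + ".ic" for each label (res is reset after every label)
divide_into_words('./output/issue_', './output/words_',
                  labels=['bug', 'enhancement'])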
Example 2
def buildCommitPart():
    repos = linkOperator.selectRepoOver(5000)
    logCorpus = open('commitLog.dat', "w")
    codeCorpus = open('commitCode.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            path = getPath(highRepo[1])
            try:
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print path, ":", len(commits)
                for commit in commits:
                    words = preprocessor.preprocessToWord(
                        commit.message.decode('utf-8'))
                    if len(words):
                        # not an empty list
                        for word in words:
                            logCorpus.write(word.encode('utf-8'))
                            logCorpus.write(" ")
                        logCorpus.write("\n")
                    diffs = gitRe.getOneDiff(commit)
                    for diff in diffs:
                        diffCode = preprocessor.processDiffCode(diff.diff)
                        if len(diffCode):
                            for code in diffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                print "***", path, ":", e
                print traceback.format_exc()
        print 'end'
    finally:
        # closing clause assumed: the snippet's outer try: is otherwise left
        # without a matching except/finally, and both corpus files need closing
        logCorpus.close()
        codeCorpus.close()
Example 3
def buildIssuePart():
    repos = linkOperator.selectRepoOver(5000)
    textCorpus = open('issueText.dat', "w")
    codeCorpus = open('issueCode.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            try:
                issues = mysqlOperator.selectAllIssueInOneRepo(highRepo[0])
                print highRepo[0], ":", len(issues)
                for issue in issues:
                    titleWords = preprocessor.preprocessToWord(
                        issue[4].decode('utf-8'))
                    if len(titleWords):
                        # not an empty list
                        for word in titleWords:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    if issue[5]:
                        body = preprocessor.processHTML(
                            issue[5].decode('utf-8'))
                        bodyWords = body[1]
                        codeWords = body[0]
                        if len(bodyWords):
                            # not an empty list
                            for word in bodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(codeWords):
                            # not an empty list
                            for word in codeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                    comments = mysqlOperator.selectCommentInOneIssue(issue[1])
                    for comment in comments:
                        temp = preprocessor.processHTML(
                            comment[4].decode('utf-8'))
                        cBodyWords = temp[1]
                        cCodeWords = temp[0]
                        if len(cBodyWords):
                            # not an empty list
                            for word in cBodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(cCodeWords):
                            # not an empty list
                            for word in cCodeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                print "***", highRepo[0], ":", e
                print traceback.format_exc()
        print 'end'
    finally:
        # closing clause assumed for the truncated snippet: close both corpus files
        textCorpus.close()
        codeCorpus.close()
Example 4
def text2vec(text):
    words = preprocessor.preprocessToWord(text)
    res = []
    for word in words:
        try:
            res.append(EMBEDDING_MODEL[word])
        except KeyError:
            res.append(np.zeros(VECTOR_SIZE))
    return res
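The function relies on two module-level globals that the snippet does not show. A minimal sketch of what they could look like, assuming a gensim word-vector model (the file name is a placeholder):

from gensim.models import KeyedVectors
import numpy as np

VECTOR_SIZE = 100  # must match the dimensionality of the loaded vectors
EMBEDDING_MODEL = KeyedVectors.load("words.model")  # placeholder path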
Example 5
def text2vec(text, isHtml):
    if isHtml:
        words = preprocessor.processHTML(text)[1]
    else:
        words = preprocessor.preprocessToWord(text)
    res = []
    for word in words:
        try:
            res.append(wordModel[word])
        except KeyError:
            res.append(np.zeros(VECTOR_SIZE))
    return res
Example 6
def build_words_in_json():
    f = open('./output/issue_corpus.ic', "r")
    issue_corpus = json.loads(f.read())
    f.close()

    f = open('./output/words_corpus.ic', "w")
    f.write("[\n")
    i = 0
    length = len(issue_corpus)
    for issue in issue_corpus:
        content = unicode(issue['title']) + u': ' + unicode(issue['body'])
        # print content
        res = preprocessor.preprocessToWord(content)
        f.write(json.dumps(res, encoding="utf-8"))
        if i != length - 1:  # avoid a trailing comma after the last entry
            f.write(",\n")
        i += 1
        print '%d / %d...............%.4f%%' % (i, length, i * 100.0 / length)
    f.write("\n]")
    f.close()
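A short read-back sketch, assuming build_words_in_json() has already been run with the paths above:

# load the word lists written by build_words_in_json()
f = open('./output/words_corpus.ic', "r")
words_corpus = json.loads(f.read())
f.close()
print '%d issues preprocessed' % len(words_corpus)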
Example 7
def similarity(vec1, vec2):
    # cosine similarity of two equal-length vectors
    # (function head assumed; only the body appeared in the snippet)
    dot_val = 0.0
    a_norm = 0.0
    b_norm = 0.0
    for a, b in zip(vec1, vec2):
        dot_val += a * b
        a_norm += a**2
        b_norm += b**2
    if a_norm == 0.0 or b_norm == 0.0:
        return -1
    else:
        return dot_val / ((a_norm * b_norm)**0.5)


textModel = Doc2Vec.load("text12983151.model")
codeModel = Doc2Vec.load("code12983151.model")

index = 0
while index < 3:
    linkList = []
    titleWords = preprocessor.preprocessToWord("test is for your parents")
    print type(textModel.infer_vector(titleWords))
    titleTextVec = textModel.infer_vector(titleWords).tolist()
    print type(titleTextVec[0])
    diffCode = preprocessor.processDiffCode("test is for your parents")
    commitCodeVec = codeModel.infer_vector(diffCode).tolist()
    linkList.append({'text': titleTextVec, 'code': commitCodeVec})
    linkList.append({'text': titleTextVec, 'code': commitCodeVec})
    index += 1

    # res = json.dumps(linkList, encoding="utf-8", indent=4)
    # trainSet = open('./train/traincase%d.dat' % index, "w")
    # trainSet.write(res)
    # trainSet.close()

# path = './train'
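For illustration, the cosine helper defined at the top of this example can score two vectors inferred by the same model; the sample strings are placeholders.

# sketch: compare two inferred text vectors with the cosine helper above
vecA = textModel.infer_vector(preprocessor.preprocessToWord("first sample text"))
vecB = textModel.infer_vector(preprocessor.preprocessToWord("second sample text"))
print similarity(vecA.tolist(), vecB.tolist())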
Example 8
from preprocessor import preprocessor
import re

print preprocessor.processHTML('''
    Examples shown in the javadoc for TESD_DSAFSA_DDS <code>ReplayingDecoder.addOption</code> seems to be wrong. In the document it shows <code>IntegerHeaderFrameDecoder, MyDecoder</code> taking multiple parameters where as in reality it can only accept one. I'm working with versions 4.0.0.CR3, 4.0.0.CR5.
    ''')

print preprocessor.processDiffCode('''
@@ -349 +349 @@ public class JavadocUtilsTest {
-            "HTML_COMMENT", JavadocUtils.getTokenName(20077));
+            "HTML_COMMENT", JavadocUtils.getTokenName(20078));
    ''')

print preprocessor.preprocessToWord('''
Examples shown in the javadoc for TESD_DSAFSA_DDS ReplayingDecoder.addOption seems to be wrong. In the document it shows IntegerHeaderFrameDecoder, MyDecoder taking multiple parameters where as in reality it can only accept one. I'm working with versions 4.0.0.CR3, 4.0.0.CR5.
     ''')

if None:
    print 'none'
else:
    print 'other'
# camelCase1 = re.compile(r'^[A-Z]+[a-z]+.*[A-Z]+.*$') # 3
# camelCase2 = re.compile(r'^[a-z]+.*[A-Z]+.*$') # 12
# upperCase = re.compile(r'^[A-Z]+[0-9]*$') # 7
# upperExtCase = re.compile(r'^[A-Z]*(_+[A-Z]*)+[0-9]*$') # 6
#
# print re.match(upperExtCase, 'aOption'), '1'
# print re.match(upperExtCase, 'addOption'), '2'
# print re.match(upperExtCase, 'AddToDeal'), '3'
# print re.match(upperExtCase, 'dsadfsdf'), '4'
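A small demonstration of the identifier-style patterns commented out above, reusing the same regular expressions (re is already imported in this example):

camelCase2 = re.compile(r'^[a-z]+.*[A-Z]+.*$')
upperExtCase = re.compile(r'^[A-Z]*(_+[A-Z]*)+[0-9]*$')

# 'addOption' is lower camel case; 'TESD_DSAFSA_DDS' is an underscored upper-case name
print re.match(camelCase2, 'addOption') is not None
print re.match(upperExtCase, 'TESD_DSAFSA_DDS') is not None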
Example 9
def text2vec(text, isHtml):
    if isHtml:
        words = preprocessor.processHTML(text)[1]
    else:
        words = preprocessor.preprocessToWord(text)
    return docModel.infer_vector(words)
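A sketch of the module-level docModel this variant assumes; Example 7 loads comparable Doc2Vec models, and the file name here is only a placeholder:

from gensim.models.doc2vec import Doc2Vec

docModel = Doc2Vec.load("text12983151.model")  # any trained Doc2Vec model works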
Example 10
def buildIssueAndCommit():
    repos = linkOperator.selectOneRepo(50904245)
    # repos = linkOperator.selectRepoOver(5000)
    textCorpus = open('text50904245.dat', "w")
    codeCorpus = open('code50904245.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            try:
                # commit part
                path = getPath(highRepo[1])
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print path, ":", len(commits)
                for commit in commits:
                    words = preprocessor.preprocessToWord(
                        commit.message.decode('utf-8'))
                    if len(words):
                        # not an empty list
                        for word in words:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    diffs = gitRe.getOneDiff(commit)
                    for diff in diffs:
                        diffCode = preprocessor.processDiffCode(diff.diff)
                        preDiffCode = preprocessor.processPreDiffCode(
                            diff.diff)
                        if len(diffCode):
                            for code in diffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                        if len(preDiffCode):
                            for code in preDiffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                # issue part
                issues = mysqlOperator.selectAllIssueInOneRepo(highRepo[0])
                print highRepo[0], ":", len(issues)
                for issue in issues:
                    titleWords = preprocessor.preprocessToWord(
                        issue[4].decode('utf-8'))
                    if len(titleWords):
                        # not an empty list
                        for word in titleWords:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    if issue[5]:
                        body = preprocessor.processHTML(
                            issue[5].decode('utf-8'))
                        bodyWords = body[1]
                        codeWords = body[0]
                        if len(bodyWords):
                            # not an empty list
                            for word in bodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(codeWords):
                            # not an empty list
                            for word in codeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                    comments = mysqlOperator.selectCommentInOneIssue(issue[1])
                    for comment in comments:
                        temp = preprocessor.processHTML(
                            comment[4].decode('utf-8'))
                        cBodyWords = temp[1]
                        cCodeWords = temp[0]
                        if len(cBodyWords):
                            # not an empty list
                            for word in cBodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(cCodeWords):
                            # not an empty list
                            for word in cCodeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                print "***", highRepo[0], ":", e
                print traceback.format_exc()
        print 'end'
    finally:
        # closing clause assumed for the truncated snippet: close both corpus files
        textCorpus.close()
        codeCorpus.close()
Example 11
        commit = repo.getOneCommit(trueLink[1])
        issue = mysqlOperator.selectOneIssue(trueLink[2])
        comments = mysqlOperator.selectCommentInOneIssue(trueLink[2])
        diffs = repo.getOneDiff(commit)
        diffCodeList = []
        for diff in diffs:
            diffCode = preprocessor.processDiffCode(diff.diff)
            preDiffCode = preprocessor.processPreDiffCode(diff.diff)
            diffCodeList.append((codeModel.infer_vector(diffCode), codeModel.infer_vector(preDiffCode)))

        # code part init
        codeMax = -1
        tempMap['commitCode'] = None
        tempMap['issueCode'] = None
        # text part init
        commitText = preprocessor.preprocessToWord(commit.message.decode('utf-8'))
        commitTextVec = textModel.infer_vector(commitText)
        tempMap['commitText'] = commitTextVec  # fixed; will not change
        titleWords = preprocessor.preprocessToWord(issue[4].decode('utf-8'))
        tempMap['issueText'] = textModel.infer_vector(titleWords)  # may change
        textMax = similarity(commitTextVec, tempMap['issueText'])
        # issue body
        if issue[5]:
            body = preprocessor.processHTML(issue[5].decode('utf-8'))
            bodyTextVec = textModel.infer_vector(body[1])
            sim = similarity(commitTextVec, bodyTextVec)
            if sim > textMax:
                tempMap['issueText'] = bodyTextVec
                textMax = sim
            if len(body[0]) > 0:
                codeVec = codeModel.infer_vector(body[0])