Example #1
0
def divide_into_words(input_file, output_file, labels=None):
    res = []
    if labels is None:
        f = open(input_file, "r")
        issue_corpus = json.loads(f.read())
        f.close()

        i = 0
        length = len(issue_corpus)
        for issue in issue_corpus:
            content = unicode(issue['title']) + u': ' + unicode(issue['body'])
            words = preprocessor.preprocessToWord(content)
            res.append(words)
            # out = ""
            # for word in words:
            #     out += word+" "
            # f.write(out+"\n")
            i += 1
            print '%d / %d...............%.4f%%' % (i, length,
                                                    i * 100.0 / length)
            # if i == length/2:
            #     f = open(output_file+"-part1", "w")
            #     f.write(json.dumps(res, encoding="utf-8"))
            #     f.close()
            #     res = []
        # f = open(output_file + "-part2", "w")
        f = open(output_file, "w")
        f.write(json.dumps(res, encoding="utf-8"))
        f.close()
    else:
        for label in labels:
            f = open(input_file + label + ".ic", "r")
            issue_corpus = json.loads(f.read())
            f.close()

            i = 0
            length = len(issue_corpus)
            for issue in issue_corpus:
                content = unicode(issue['title']) + u': ' + unicode(
                    issue['body'])
                words = preprocessor.preprocessToWord(content)
                res.append(words)
                # out = ""
                # for word in words:
                #     out += word+" "
                # f.write(out+"\n")
                i += 1
                print '%d / %d...............%.4f%%' % (i, length,
                                                        i * 100.0 / length)
                # if i == length/2:
                #     f = open(output_file+"-part1", "w")
                #     f.write(json.dumps(res, encoding="utf-8"))
                #     f.close()
                #     res = []
            # f = open(output_file + "-part2", "w")
            f = open(output_file + label + ".ic", "w")
            f.write(json.dumps(res, encoding="utf-8"))
            f.close()
            res = []
    return res
Example #2
0
def buildCommitPart():
    """Build two plain-text corpora from every repo selected by
    selectRepoOver(5000): commit-message words (commitLog.dat) and
    diff-code tokens (commitCode.dat), one document per line,
    space-separated, UTF-8 encoded.
    """
    repos = linkOperator.selectRepoOver(5000)
    logCorpus = open('commitLog.dat', "w")
    codeCorpus = open('commitCode.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            # highRepo[1] presumably holds the repo name/identifier used to
            # locate the local clone -- confirm against selectRepoOver's schema.
            path = getPath(highRepo[1])
            try:
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print path, ":", len(commits)
                for commit in commits:
                    words = preprocessor.preprocessToWord(
                        commit.message.decode('utf-8'))
                    if len(words):
                        # not an empty list: emit one line per commit message
                        for word in words:
                            logCorpus.write(word.encode('utf-8'))
                            logCorpus.write(" ")
                        logCorpus.write("\n")
                    diffs = gitRe.getOneDiff(commit)
                    for diff in diffs:
                        diffCode = preprocessor.processDiffCode(diff.diff)
                        if len(diffCode):
                            for code in diffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                # Best-effort: keep going past a broken repo, log traceback.
                print "***", path, ":", e
                print traceback.format_exc()
        print 'end'
        # NOTE(review): the outer ``try`` above has no except/finally in this
        # snippet -- the source appears truncated here, so the two corpus
        # file handles are never explicitly closed in the visible code.
Example #3
0
def buildIssuePart():
    """Build two plain-text corpora from the issues of every repo selected
    by selectRepoOver(5000): prose words (issueText.dat) and embedded code
    tokens (issueCode.dat), one document per line, space-separated, UTF-8.
    """
    repos = linkOperator.selectRepoOver(5000)
    textCorpus = open('issueText.dat', "w")
    codeCorpus = open('issueCode.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            try:
                issues = mysqlOperator.selectAllIssueInOneRepo(highRepo[0])
                print highRepo[0], ":", len(issues)
                for issue in issues:
                    # issue[4] appears to be the title, issue[5] the HTML body,
                    # issue[1] the issue id -- confirm against the DB schema.
                    titleWords = preprocessor.preprocessToWord(
                        issue[4].decode('utf-8'))
                    if len(titleWords):
                        # not an empty list
                        for word in titleWords:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    if issue[5]:
                        # processHTML returns (code tokens, prose tokens).
                        body = preprocessor.processHTML(
                            issue[5].decode('utf-8'))
                        bodyWords = body[1]
                        codeWords = body[0]
                        if len(bodyWords):
                            # not an empty list
                            for word in bodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(codeWords):
                            # not an empty list
                            for word in codeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                    comments = mysqlOperator.selectCommentInOneIssue(issue[1])
                    for comment in comments:
                        temp = preprocessor.processHTML(
                            comment[4].decode('utf-8'))
                        cBodyWords = temp[1]
                        cCodeWords = temp[0]
                        if len(cBodyWords):
                            # not an empty list
                            for word in cBodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(cCodeWords):
                            # not an empty list
                            for word in cCodeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                # Best-effort: skip broken repos, log the traceback.
                print "***", highRepo[0], ":", e
                print traceback.format_exc()
        print 'end'
        # NOTE(review): the outer ``try`` has no except/finally in this
        # snippet -- the source appears truncated here.
Example #4
0
def text2vec(text):
    """Map *text* to a list of word-embedding vectors.

    Tokens missing from ``EMBEDDING_MODEL`` fall back to a zero vector
    of length ``VECTOR_SIZE``.
    """
    def embed(token):
        try:
            return EMBEDDING_MODEL[token]
        except KeyError:
            # Out-of-vocabulary token: all-zero placeholder vector.
            return np.zeros(VECTOR_SIZE)

    return [embed(token) for token in preprocessor.preprocessToWord(text)]
def text2vec(text, isHtml):
    """Convert *text* into a list of word vectors via ``wordModel``.

    When ``isHtml`` is true the prose tokens from processHTML (second
    element of its result) are used; otherwise the plain word tokenizer
    runs.  Tokens absent from ``wordModel`` map to a zero vector of
    length ``VECTOR_SIZE``.
    """
    if isHtml:
        tokens = preprocessor.processHTML(text)[1]
    else:
        tokens = preprocessor.preprocessToWord(text)

    def embed(token):
        try:
            return wordModel[token]
        except KeyError:
            # Out-of-vocabulary token: all-zero placeholder vector.
            return np.zeros(VECTOR_SIZE)

    return [embed(token) for token in tokens]
Example #6
0
def build_words_in_json():
    f = open('./output/issue_corpus.ic', "r")
    issue_corpus = json.loads(f.read())
    f.close()

    f = open('./output/words_corpus.ic', "w")
    f.write("[\n")
    i = 0
    length = len(issue_corpus)
    for issue in issue_corpus:
        content = unicode(issue['title']) + u': ' + unicode(issue['body'])
        # print content
        res = preprocessor.preprocessToWord(content)
        f.write(json.dumps(res, encoding="utf-8"))
        if i != length:
            f.write(",\n")
        i += 1
        print '%d / %d...............%.4f%%' % (i, length, i * 100.0 / length)
    f.write("\n]")
    f.close()
Example #7
0
        dot_val += a * b
        a_norm += a**2
        b_norm += b**2
    if a_norm == 0.0 or b_norm == 0.0:
        return -1
    else:
        return dot_val / ((a_norm * b_norm)**0.5)


# Ad-hoc smoke-test script: load the trained Doc2Vec models and exercise the
# preprocessor + inference pipeline with hard-coded sample inputs.
# Assumes the .model files exist in the working directory -- TODO confirm.
textModel = Doc2Vec.load("text12983151.model")
codeModel = Doc2Vec.load("code12983151.model")

index = 0
while index < 3:
    linkList = []
    titleWords = preprocessor.preprocessToWord("test is for your parents")
    print type(textModel.infer_vector(titleWords))
    titelTextVec = textModel.infer_vector(titleWords).tolist()
    print type(titelTextVec[0])
    diffCode = preprocessor.processDiffCode("test is for your parents")
    commitCodeVec = codeModel.infer_vector(diffCode).tolist()
    # The same pair is appended twice; linkList is rebuilt each iteration
    # and never consumed (the dump below is commented out).
    linkList.append({'text': titelTextVec, 'code': commitCodeVec})
    linkList.append({'text': titelTextVec, 'code': commitCodeVec})
    index += 1

    # res = json.dumps(linkList, encoding="utf-8", indent=4)
    # trainSet = open('./train/traruanhincase%d.dat' % index, "w")
    # trainSet.write(res)
    # trainSet.close()

# path = './train'
# NOTE(review): imports placed mid-file -- this is throwaway test code.
from preprocessor import preprocessor
import re

# Exercise HTML stripping on a sample issue body containing <code> tags.
print preprocessor.processHTML('''
    Examples shown in the javadoc for TESD_DSAFSA_DDS <code>ReplayingDecoder.addOption</code> seems to be wrong. In the document it shows <code>IntegerHeaderFrameDecoder, MyDecoder</code> taking multiple parameters where as in reality it can only accept one. I'm working with versions 4.0.0.CR3, 4.0.0.CR5.
    ''')

# Exercise diff-code tokenization on a sample unified-diff hunk.
print preprocessor.processDiffCode('''
@@ -349 +349 @@ public class JavadocUtilsTest {
-            "HTML_COMMENT", JavadocUtils.getTokenName(20077));
+            "HTML_COMMENT", JavadocUtils.getTokenName(20078));
    ''')

# Exercise the plain word tokenizer on the same prose without markup.
print preprocessor.preprocessToWord('''
Examples shown in the javadoc for TESD_DSAFSA_DDS ReplayingDecoder.addOption seems to be wrong. In the document it shows IntegerHeaderFrameDecoder, MyDecoder taking multiple parameters where as in reality it can only accept one. I'm working with versions 4.0.0.CR3, 4.0.0.CR5.
     ''')

# Truthiness demo: None is falsy, so this always prints 'other'.
if None:
    print 'none'
else:
    print 'other'
# camelCase1 = re.compile(r'^[A-Z]+[a-z]+.*[A-Z]+.*$') # 3
# camelCase2 = re.compile(r'^[a-z]+.*[A-Z]+.*$') # 12
# upperCase = re.compile(r'^[A-Z]+[0-9]*$') # 7
# upperExtCase = re.compile(r'^[A-Z]*(_+[A-Z]*)+[0-9]*$') # 6
#
# print re.match(upperExtCase, 'aOption'), '1'
# print re.match(upperExtCase, 'addOption'), '2'
# print re.match(upperExtCase, 'AddToDeal'), '3'
# print re.match(upperExtCase, 'dsadfsdf'), '4'
Example #9
0
def text2vec(text, isHtml):
    """Infer a single document vector for *text* with ``docModel``.

    HTML input is reduced to its prose tokens (second element of
    processHTML's result) first; plain text goes through the word
    tokenizer.
    """
    tokens = (preprocessor.processHTML(text)[1]
              if isHtml else preprocessor.preprocessToWord(text))
    return docModel.infer_vector(tokens)
Example #10
0
def buildIssueAndCommit():
    """Build combined text/code corpora for one hard-coded repo (50904245):
    commit messages + issue/comment prose go to text50904245.dat, diff code
    and embedded issue code go to code50904245.dat, one document per line.
    """
    repos = linkOperator.selectOneRepo(50904245)
    # repos = linkOperator.selectRepoOver(5000)
    textCorpus = open('text50904245.dat', "w")
    codeCorpus = open('code50904245.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            try:
                # commit part
                # highRepo[1] presumably identifies the local clone path --
                # confirm against selectOneRepo's schema.
                path = getPath(highRepo[1])
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print path, ":", len(commits)
                for commit in commits:
                    words = preprocessor.preprocessToWord(
                        commit.message.decode('utf-8'))
                    if len(words):
                        # not an empty list
                        for word in words:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    diffs = gitRe.getOneDiff(commit)
                    for diff in diffs:
                        # post-image and pre-image code of the same diff are
                        # emitted as two separate documents.
                        diffCode = preprocessor.processDiffCode(diff.diff)
                        preDiffCode = preprocessor.processPreDiffCode(
                            diff.diff)
                        if len(diffCode):
                            for code in diffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                        if len(preDiffCode):
                            for code in preDiffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                # issue part
                issues = mysqlOperator.selectAllIssueInOneRepo(highRepo[0])
                print highRepo[0], ":", len(issues)
                for issue in issues:
                    # issue[4] appears to be the title, issue[5] the HTML
                    # body, issue[1] the issue id -- confirm against schema.
                    titleWords = preprocessor.preprocessToWord(
                        issue[4].decode('utf-8'))
                    if len(titleWords):
                        # not an empty list
                        for word in titleWords:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    if issue[5]:
                        # processHTML returns (code tokens, prose tokens).
                        body = preprocessor.processHTML(
                            issue[5].decode('utf-8'))
                        bodyWords = body[1]
                        codeWords = body[0]
                        if len(bodyWords):
                            # not an empty list
                            for word in bodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(codeWords):
                            # not an empty list
                            for word in codeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                    comments = mysqlOperator.selectCommentInOneIssue(issue[1])
                    for comment in comments:
                        temp = preprocessor.processHTML(
                            comment[4].decode('utf-8'))
                        cBodyWords = temp[1]
                        cCodeWords = temp[0]
                        if len(cBodyWords):
                            # not an empty list
                            for word in cBodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(cCodeWords):
                            # not an empty list
                            for word in cCodeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                # Best-effort: keep going past a broken repo, log traceback.
                print "***", highRepo[0], ":", e
                print traceback.format_exc()
        print 'end'
        # NOTE(review): the outer ``try`` has no except/finally in this
        # snippet -- the source appears truncated here, so the corpus file
        # handles are never explicitly closed in the visible code.
Example #11
0
        commit = repo.getOneCommit(trueLink[1])
        issue = mysqlOperator.selectOneIssue(trueLink[2])
        comments = mysqlOperator.selectCommentInOneIssue(trueLink[2])
        diffs = repo.getOneDiff(commit)
        diffCodeList = []
        for diff in diffs:
            diffCode = preprocessor.processDiffCode(diff.diff)
            preDiffCode = preprocessor.processPreDiffCode(diff.diff)
            diffCodeList.append((codeModel.infer_vector(diffCode), codeModel.infer_vector(preDiffCode)))

        # code part init
        codeMax = -1
        tempMap['commitCode'] = None
        tempMap['issueCode'] = None
        # text part init
        commitText = preprocessor.preprocessToWord(commit.message.decode('utf-8'))
        commitTextVec = textModel.infer_vector(commitText)
        tempMap['commitText'] = commitTextVec  # 确定不变
        titleWords = preprocessor.preprocessToWord(issue[4].decode('utf-8'))
        tempMap['issueText'] = textModel.infer_vector(titleWords)  # 可能改变
        textMax = similarity(commitTextVec, tempMap['issueText'])
        # issue body
        if issue[5]:
            body = preprocessor.processHTML(issue[5].decode('utf-8'))
            bodyTextVec = textModel.infer_vector(body[1])
            sim = similarity(commitTextVec, bodyTextVec)
            if sim > textMax:
                tempMap['issueText'] = bodyTextVec
                textMax = sim
            if len(body[0]) > 0:
                codeVec = codeModel.infer_vector(body[0])