Example #1
def main():
    par = plyj.parser.Parser()
    file_path = "../Java/Corpus/"
    cul = []
    vocab = {}
    sentlens = {}
    sf = []
    fields = []
    ctr = 1
    blacklist = [
        "5a8beeae20366b5094d0db8148e0563", "3cd87ee90872cfcb72b3cb3b773d8efa"
    ]
    for subdir, dirs, files in os.walk(file_path):
        for f in files:
            clear = True
            for h in blacklist:
                if h in f: clear = False
            if f.endswith(".java") and clear:
                p = os.path.join(subdir, f)
                cus = e.ExtractCode(par, p)
                #cul.extend(cus)
                # process each compilation unit extracted from this file
                for i, cu in cus:
                    #print cu.getStr()
                    sf2, fi, sents = seq.getSents(cu, i, "levels")
                    sf.extend(sf2)
                    fields.extend(fi)
                    print str(ctr) + ": " + str(len(sents))
                    ctr += 1
                    for sent, vl in sents:
                        #print str(len(sf)) + " importables"
                        #print str(len(fields)) + " fields"
                        if len(sent) not in sentlens:
                            sentlens[len(sent)] = 0
                        sentlens[len(sent)] += 1
                        if len(sent) > 0:
                            for stat in sent:
                                s = getSig(stat, vl)
                                if s[0] not in vocab:
                                    vocab[s[0]] = []
                                vocab[s[0]].append(s[1:])
            #break
    for s in vocab:
        print s
        for sig in resolveSigs(vocab[s]):
            print '\t' + e.nstr(sig)
    print len(vocab)
    print len(set(sf))
    print len(set(fields))
    print sentlens
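All of the extraction examples repeat the same os.walk loop with a filename blacklist. A minimal standalone sketch of that filtering pattern, using only the standard library (the corpus path and hash are the placeholders from the example above):

import os

def iter_java_files(root, blacklist):
    # Yield paths of .java files under root whose names contain no blacklisted hash.
    for subdir, dirs, files in os.walk(root):
        for f in files:
            if f.endswith(".java") and not any(h in f for h in blacklist):
                yield os.path.join(subdir, f)

# Usage with the corpus path and blacklist from the example:
# for path in iter_java_files("../Java/Corpus/", ["5a8beeae20366b5094d0db8148e0563"]):
#     print(path)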
Example #2
def main():
    par = plyj.parser.Parser()
    file_path = "../Java/Corpus/"
    cul = []
    vocab = {}
    sentlens = {}
    sf = []
    fields = []
    ctr = 1
    blacklist = ["5a8beeae20366b5094d0db8148e0563", "3cd87ee90872cfcb72b3cb3b773d8efa"]
    for subdir, dirs, files in os.walk(file_path):
        for f in files:
            clear = True
            for h in blacklist:
                if h in f: clear = False
            if f.endswith(".java") and clear:
                p = os.path.join(subdir, f)
                cus = e.ExtractCode(par, p)
                #cul.extend(cus)
                # process each compilation unit extracted from this file
                for i, cu in cus:
                    #print cu.getStr()
                    sf2, fi, sents = seq.getSents(cu, i, "levels")
                    sf.extend(sf2)
                    fields.extend(fi)
                    print str(ctr) + ": " + str(len(sents))
                    ctr += 1
                    for sent, vl in sents:
                        #print str(len(sf)) + " importables"
                        #print str(len(fields)) + " fields"
                        if len(sent) not in sentlens:
                            sentlens[len(sent)] = 0
                        sentlens[len(sent)] += 1
                        if len(sent) > 0:
                            for stat in sent:
                                s = getSig(stat, vl)
                                if s[0] not in vocab:
                                    vocab[s[0]] = []
                                vocab[s[0]].append(s[1:])
            #break
    for s in vocab:
        print s
        for sig in resolveSigs(vocab[s]):
            print '\t' + e.nstr(sig)
    print len(vocab)
    print len(set(sf))
    print len(set(fields))
    print sentlens
Example #3
def main():
    par = plyj.parser.Parser()
    modes = ["cfs", "levels"]
    if len(sys.argv) > 1:
        mode = sys.argv[1]
    else:
        mode = "levels"
    if mode not in modes:
        mode = "levels"
    corpus_path = "../Java/Corpus/"
    data_path = "../Data/Raw"
    ####
    meth_name = "method_sentences_" + mode + ".txt"
    var_name = "variable_sentences_" + mode + ".txt"
    vocab_name = "vocab_" + mode + ".txt"
    ####
    meth_file = open(os.path.join(data_path, meth_name), 'w')
    var_file = open(os.path.join(data_path, var_name), 'w')
    vocab_file = open(os.path.join(data_path, vocab_name), 'w')
    ####
    vocab = {}
    sf = []
    fields = []
    ctr = 1
    blacklist = ["5a8beeae20366b5094d0db8148e0563", "3cd87ee90872cfcb72b3cb3b773d8efa"]
    for subdir, dirs, files in os.walk(corpus_path):
        for f in files:
            clear = True
            for h in blacklist:
                if h in f: clear = False
            if f.endswith(".java") and clear:
                p = os.path.join(subdir, f)
                cus = e.ExtractCode(par, p)
                for i, cu in cus:
                    sf2, fi, sents = seq.getSents(cu, i, mode)
                    sf.extend(sf2)
                    fields.extend(fi)
                    print str(ctr) + ": " + str(len(sents))
                    ctr += 1
                    for sent, vl in sents:
                        meth_file.write("<S2>\n")
                        meth_file.write("<S1>\n")
                        for stat, ctx in sent:
                            meth_file.write(e.nstr(t.getSig(stat, vl, False)) + ' # ' + e.nstr(ctx) + '\n')
                            s = t.getSig(stat, vl)
                            if not s[0] in vocab:
                                vocab[s[0]] = []
                            vocab[s[0]].append(s[1:])
                        meth_file.write('<END>\n')
                    vsents = seq.getVarSents(sents)
                    for vsent in vsents:
                        var_file.write("<S2>\n")
                        var_file.write("<S1>\n")
                        for stat, ctx in vsent:
                            var_file.write(e.nstr(stat) + '\n')
                        var_file.write('<END>\n')
            #break
    # write the accumulated vocabulary once the whole corpus has been walked
    for s in vocab:
        vocab_file.write(s + '\n')
        for sig in t.resolveSigs(vocab[s]):
            vocab_file.write('\t' + e.nstr(sig) + '\n')
    meth_file.close()
    var_file.close()
    vocab_file.close()
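Example #3 writes each method as a block delimited by <S2>, <S1>, and <END> markers, with one "signature # context" line per statement. Assuming that exact layout, a minimal reader that recovers the statement lists from such a file could look like this:

def read_sentence_file(path):
    # Parse the <S2>/<S1>/.../<END> blocks written above back into lists of lines.
    sents = []
    cur = None
    with open(path) as fh:
        for line in fh:
            line = line.rstrip('\n')
            if line == "<S2>":
                continue
            elif line == "<S1>":
                cur = []
            elif line == "<END>":
                if cur is not None:
                    sents.append(cur)
                cur = None
            elif cur is not None:
                cur.append(line)
    return sents

# sents = read_sentence_file("../Data/Raw/method_sentences_levels.txt")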
Example #4
def main():
    #par = plyj.parser.Parser()
    modes = ["cfs", "levels"]
    if len(sys.argv) > 1:
        mode = sys.argv[1]
    else:
        mode = "levels"
    if mode not in modes:
        mode = "levels"
    data_path = "../Data/Raw"
    new_path = "../Data/Revised"
    ####
    meth_name = "method_sentences_" + mode + ".txt"
    var_name = "variable_sentences_" + mode + ".txt"
    vocab_name = "vocab_" + mode + ".txt"
    counts_name = "counts_" + mode + ".txt"
    memm_name = "memm_" + mode + ".txt"
    ####
    meth_file = open(os.path.join(data_path, meth_name), 'r')
    var_file = open(os.path.join(data_path, var_name), 'r')
    vocab_file = open(os.path.join(data_path, vocab_name), 'r')
    nvocab_file = open(os.path.join(new_path, vocab_name), 'wb')
    count_file = open(os.path.join(new_path, counts_name), 'wb')
    memm_file = open(os.path.join(new_path, memm_name), 'wb')
    ####
    meth_sigs = build_meth_vocab(vocab_file, meth_file)
    meth_vocab_list = {}
    ctr = 0
    for f in meth_sigs:
        print f
        for n in meth_sigs[f]:
            if not (f,n) in meth_vocab_list:
                meth_vocab_list[(f, n)] = ctr
                ctr += 1
            print '\t' + str(n) + " | ",
            for i in range(n):
                print str(i-1) + ":( ",
                for ty in meth_sigs[f][n][i-1]:
                    print ty + '/' + str(meth_sigs[f][n][i-1][ty]) + ' ', 
                print ") ",
            print
    meth_vocab_list["<END>"] = ctr
    meth_vocab_list["<S1>"] = ctr+1
    meth_vocab_list["<S2>"] = ctr+2
    pot_var_vocab_list = {}
    ctr = 0
    for k in meth_vocab_list:
        if type(k) is not str:
            f, n = k
            for s in t.powerset([i-1 for i in range(n)]):
                pot_var_vocab_list[(f, n, tuple(s))] = ctr
                ctr += 1
    pot_var_vocab_list["<END>"] = ctr
    pot_var_vocab_list["<S1>"] = ctr+1
    pot_var_vocab_list["<S2>"] = ctr+2
    vsents = getVarLines(var_file)
    act_var_vocab_list = {}
    ctr = 0
    for s in vsents:
        for stat in s:
            if not stat in act_var_vocab_list:
                act_var_vocab_list[stat] = ctr
                ctr += 1
    pickle.dump((meth_sigs, meth_vocab_list, pot_var_vocab_list, act_var_vocab_list), nvocab_file)
    nvocab_file.close()
    print len(meth_vocab_list)
    print len(pot_var_vocab_list)
    print len(act_var_vocab_list)
    meth_sents = getReducedLines(meth_file)
    meth_sents = seq.getFeatures(meth_sents)
    X = [meth_sents[i][j][1] for i in range(len(meth_sents)) for j in range(len(meth_sents[i]))]
    print len(X)
    y = [meth_vocab_list[meth_sents[i][j][0]] for i in range(len(meth_sents)) for j in range(len(meth_sents[i]))]
    meth_ngram, meth_N1p, meth_ch = getNTuples(meth_sents, meth_vocab_list, "meth")
    print "N-GRAMS"
    pot_var_ngram, pot_var_N1p, pot_var_ch = getNTuples(vsents, pot_var_vocab_list, "var")
    print "N-GRAMS"
    act_var_ngram, act_var_N1p, act_var_ch = getNTuples(vsents, act_var_vocab_list, "var")
    print "N-GRAMS"
    pickle.dump(((meth_ngram, meth_N1p, meth_ch), (pot_var_ngram, pot_var_N1p, pot_var_ch), (act_var_ngram, act_var_N1p, act_var_ch)), count_file)
    count_file.close()
    MEMM = linear_model.LogisticRegression()
    if not mode == "cfs":
        MEMM.fit(X,y)
    pickle.dump(MEMM, memm_file)
    meth_file.close()
    var_file.close()
    memm_file.close()
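The MEMM in Example #4 is a scikit-learn logistic regression fit on the per-statement feature vectors X and vocabulary ids y; the pickled model is later loaded by getLL and handed to seqProb, which is not shown here. A toy, self-contained sketch of that fit/score step with made-up features (the real ones come from seq.getFeatures):

import numpy as np
from sklearn import linear_model

# Toy stand-ins for the feature vectors and label ids built in Example #4.
X = np.array([[0, 1], [1, 0], [1, 1], [0, 0]])
y = np.array([0, 1, 1, 0])

memm = linear_model.LogisticRegression()
memm.fit(X, y)

# Probability distribution over label ids for one context vector; a MEMM-style
# sequence score would sum log(prob of the observed next label) over positions.
probs = memm.predict_proba(np.array([[1, 1]]))[0]
print(probs)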
Example #5
def main():
    par = plyj.parser.Parser()
    modes = ["cfs", "levels"]
    if len(sys.argv) > 1:
        mode = sys.argv[1]
    else:
        mode = "levels"
    if mode not in modes:
        mode = "levels"
    corpus_path = "../Java/Corpus/"
    data_path = "../Data/Raw"
    ####
    meth_name = "method_sentences_" + mode + ".txt"
    var_name = "variable_sentences_" + mode + ".txt"
    vocab_name = "vocab_" + mode + ".txt"
    ####
    meth_file = open(os.path.join(data_path, meth_name), 'w')
    var_file = open(os.path.join(data_path, var_name), 'w')
    vocab_file = open(os.path.join(data_path, vocab_name), 'w')
    ####
    vocab = {}
    sf = []
    fields = []
    ctr = 1
    blacklist = [
        "5a8beeae20366b5094d0db8148e0563", "3cd87ee90872cfcb72b3cb3b773d8efa"
    ]
    for subdir, dirs, files in os.walk(corpus_path):
        for f in files:
            clear = True
            for h in blacklist:
                if h in f: clear = False
            if f.endswith(".java") and clear:
                p = os.path.join(subdir, f)
                cus = e.ExtractCode(par, p)
                for i, cu in cus:
                    sf2, fi, sents = seq.getSents(cu, i, mode)
                    sf.extend(sf2)
                    fields.extend(fi)
                    print str(ctr) + ": " + str(len(sents))
                    ctr += 1
                    for sent, vl in sents:
                        meth_file.write("<S2>\n")
                        meth_file.write("<S1>\n")
                        for stat, ctx in sent:
                            meth_file.write(
                                e.nstr(t.getSig(stat, vl, False)) + ' # ' +
                                e.nstr(ctx) + '\n')
                            s = t.getSig(stat, vl)
                            if not s[0] in vocab:
                                vocab[s[0]] = []
                            vocab[s[0]].append(s[1:])
                        meth_file.write('<END>\n')
                    vsents = seq.getVarSents(sents)
                    for vsent in vsents:
                        var_file.write("<S2>\n")
                        var_file.write("<S1>\n")
                        for stat, ctx in vsent:
                            var_file.write(e.nstr(stat) + '\n')
                        var_file.write('<END>\n')
            #break
    # write the accumulated vocabulary once the whole corpus has been walked
    for s in vocab:
        vocab_file.write(s + '\n')
        for sig in t.resolveSigs(vocab[s]):
            vocab_file.write('\t' + e.nstr(sig) + '\n')
    meth_file.close()
    var_file.close()
    vocab_file.close()
Example #6
def getLL(cu,
          i,
          seq_mode="levels",
          meth_prob="MEMM",
          var_prob=3,
          var_voc="pot",
          fill="max",
          num_guess=200):
    data_path = "../Data/Revised"
    vocab_name = "vocab_" + seq_mode + ".txt"
    counts_name = "counts_" + seq_mode + ".txt"
    memm_name = "memm_" + seq_mode + ".txt"
    vocab_file = open(os.path.join(data_path, vocab_name), 'rb')
    vocab_data = pickle.load(vocab_file)
    meth_sigs, meth_vocab_list, pot_var_vocab_list, act_var_vocab_list = vocab_data
    print "Read vocab data"
    vocab_file.close()
    if var_voc == "act":
        var_vocab_list = act_var_vocab_list
        pot_var_vocab_list = None
    else:
        var_vocab_list = pot_var_vocab_list
        act_var_vocab_list = None
    count_file = open(os.path.join(data_path, counts_name), 'rb')
    count_data = pickle.load(count_file)
    meth_count, pot_count, act_count = count_data
    print "Read count data"
    count_file.close()
    if var_voc == "act":
        var_count = act_count
        pot_count = None
    else:
        var_count = pot_count
        act_count = None
    if type(meth_prob) == int:
        meth_dat = [meth_prob, meth_vocab_list]
        meth_dat.extend(meth_count)
        prob_mode = "ngram"
    else:
        meth_count = None
        memm_file = open(os.path.join(data_path, memm_name), 'rb')
        meth_dat = pickle.load(memm_file)
        meth_dat = [meth_dat, meth_vocab_list]
        print "Read MEMM"
        prob_mode = "MEMM"
        memm_file.close()
    imp, fi, sents = seq.getSents(cu, i, seq_mode)
    word_ll = {}
    noUnk = True
    for word in meth_vocab_list:
        if type(word) is not str and word[0].split('$')[0] in i:
            ll = 0
            ctr = 0
            for sent, vl in sents:
                newsent = []
                for stat, ctx in sent:
                    if stat == "UNK":
                        newsent.append((word, ctx))
                        noUnk = False
                    else:
                        f = stat[1]
                        n = len(stat) - 2
                        newsent.append(((f, n), ctx))
                newsent = seq.getFeatures([newsent])[0]
                newsent.insert(0, "<S1>")
                newsent.insert(0, "<S2>")
                newsent.append("<END>")
                if type(meth_prob) == int:
                    ctr += len(newsent) - 2
                else:
                    ctr += len(newsent) - 3
                ll += seqProb(newsent, meth_dat, prob_mode)
            ll /= ctr
            word_ll[word] = ll
    word_ll = sorted(word_ll.items(), key=operator.itemgetter(1), reverse=True)
    top_guess = {}
    i = 0
    if noUnk:
        ll = 0
        ctr = 0  # token counter for the length normalization below
        vsents = seq.getVarSents2(sents)
        var_dat = [var_prob, var_vocab_list]
        var_dat.extend(var_count)
        for sen in vsents:
            sen.insert(0, "<S1>")
            sen.insert(0, "<S2>")
            sen.append("<END>")
            ctr += len(sen) - 2
            #print sen
            ll += varSeqProb(sen, var_dat)
        top_guess = [[["NILL"], ll / ctr]]
    else:
        if fill == "max":
            len_guess = 20
        else:
            len_guess = 1
            num_guess = 1
        while i < len(word_ll) and len(top_guess) < len_guess:
            if fill == "max":
                w = word_ll[i][0]
            else:
                w = random.choice(word_ll)[0]
            i += 1
            var_guess = var_guesses(w, cu, meth_sigs, num_guess)
            if len(var_guess) > 0:
                var_dat = [var_prob, var_vocab_list]
                var_dat.extend(var_count)
                for call in var_guess:
                    ll = 0
                    ctr = 0
                    vsents = []
                    for sent, vl in sents:
                        newsent = []
                        for stat, ctx in sent:
                            if stat == "UNK":
                                newsent.append((call, ctx))
                            else:
                                newsent.append((stat, ctx))
                        vsents.append([newsent, vl])
                    vsents = seq.getVarSents2(vsents)
                    for sen in vsents:
                        sen.insert(0, "<S1>")
                        sen.insert(0, "<S2>")
                        sen.append("<END>")
                        ctr += len(sen) - 2
                        #print sen
                        ll += varSeqProb(sen, var_dat)
                    top_guess[call] = ll / ctr
        top_guess = sorted(top_guess.items(),
                           key=operator.itemgetter(1),
                           reverse=True)
    return top_guess
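getLL normalizes each candidate's score by the number of scored tokens (ll / ctr), so candidates are compared by average per-token log-probability rather than raw sums that penalize longer methods. A toy illustration of that normalization with a stand-in bigram scorer (the real seqProb and varSeqProb are project functions and are not shown here):

import math

def toy_seq_logprob(sent, bigram_prob):
    # Sum of log P(w_t | w_{t-1}) over a padded sentence; stand-in for seqProb.
    ll = 0.0
    for prev, cur in zip(sent, sent[1:]):
        ll += math.log(bigram_prob.get((prev, cur), 1e-6))
    return ll

bigram_prob = {("<S2>", "<S1>"): 1.0, ("<S1>", "a"): 0.5, ("a", "b"): 0.25, ("b", "<END>"): 0.5}
sents = [["<S2>", "<S1>", "a", "b", "<END>"]]
ll = sum(toy_seq_logprob(s, bigram_prob) for s in sents)
ctr = sum(len(s) - 2 for s in sents)  # number of scored positions, as in getLL
print(ll / ctr)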
Example #7
File: LM.py  Project: KleinFourGroup/NLP_PL
def getLL(cu, i, seq_mode="levels", meth_prob="MEMM", var_prob=3, var_voc="pot", fill="max", num_guess=200):
    data_path = "../Data/Revised"
    vocab_name = "vocab_" + seq_mode + ".txt"
    counts_name = "counts_" + seq_mode + ".txt"
    memm_name = "memm_" + seq_mode + ".txt"
    vocab_file = open(os.path.join(data_path, vocab_name), 'rb')
    vocab_data = pickle.load(vocab_file)
    meth_sigs, meth_vocab_list, pot_var_vocab_list, act_var_vocab_list = vocab_data
    print "Read vocab data"
    vocab_file.close()
    if var_voc == "act":
        var_vocab_list = act_var_vocab_list
        pot_var_vocab_list = None
    else:
        var_vocab_list = pot_var_vocab_list
        act_var_vocab_list = None
    count_file = open(os.path.join(data_path, counts_name), 'rb')
    count_data = pickle.load(count_file)
    meth_count, pot_count, act_count = count_data
    print "Read count data"
    count_file.close()
    if var_voc == "act":
        var_count = act_count
        pot_count = None
    else:
        var_count = pot_count
        act_count = None
    if type(meth_prob) == int:
        meth_dat = [meth_prob, meth_vocab_list]
        meth_dat.extend(meth_count)
        prob_mode = "ngram"
    else:
        meth_count = None
        memm_file = open(os.path.join(data_path, memm_name), 'rb')
        meth_dat = pickle.load(memm_file)
        meth_dat = [meth_dat, meth_vocab_list]
        print "Read MEMM"
        prob_mode = "MEMM"
        memm_file.close()
    imp, fi, sents = seq.getSents(cu, i, seq_mode)
    word_ll = {}
    noUnk = True
    for word in meth_vocab_list:
        if type(word) is not str and word[0].split('$')[0] in i:
            ll = 0
            ctr = 0
            for sent, vl in sents:
                newsent = []
                for stat, ctx in sent:
                    if stat == "UNK":
                        newsent.append((word, ctx))
                        noUnk = False
                    else:
                        f = stat[1]
                        n = len(stat) - 2
                        newsent.append(((f, n), ctx))
                newsent = seq.getFeatures([newsent])[0]
                newsent.insert(0, "<S1>")
                newsent.insert(0, "<S2>")
                newsent.append("<END>")
                if type(meth_prob) == int:
                    ctr += len(newsent) - 2
                else:
                    ctr += len(newsent) - 3
                ll += seqProb(newsent, meth_dat, prob_mode)
            ll /= ctr
            word_ll[word] = ll
    word_ll = sorted(word_ll.items(), key=operator.itemgetter(1), reverse=True)
    top_guess = {}
    i = 0
    if noUnk:
        ll = 0
        ctr = 0  # token counter for the length normalization below
        vsents = seq.getVarSents2(sents)
        var_dat = [var_prob, var_vocab_list]
        var_dat.extend(var_count)
        for sen in vsents:
            sen.insert(0, "<S1>")
            sen.insert(0, "<S2>")
            sen.append("<END>")
            ctr += len(sen) - 2
            #print sen
            ll += varSeqProb(sen, var_dat)
        top_guess = [[["NILL"], ll / ctr]]
    else:
        if fill == "max":
            len_guess = 20
        else:
            len_guess = 1
            num_guess = 1
        while i < len(word_ll) and len(top_guess) < len_guess:
            if fill == "max":
                w = word_ll[i][0]
            else:
                w = random.choice(word_ll)[0]
            i += 1
            var_guess = var_guesses(w, cu, meth_sigs, num_guess)
            if len(var_guess) > 0:
                var_dat = [var_prob, var_vocab_list]
                var_dat.extend(var_count)
                for call in var_guess:
                    ll = 0
                    ctr = 0
                    vsents = []
                    for sent, vl in sents:
                        newsent = []
                        for stat, ctx in sent:
                            if stat == "UNK":
                                newsent.append((call, ctx))
                            else:
                                newsent.append((stat, ctx))
                        vsents.append([newsent, vl])
                    vsents = seq.getVarSents2(vsents)
                    for sen in vsents:
                        sen.insert(0, "<S1>")
                        sen.insert(0, "<S2>")
                        sen.append("<END>")
                        ctr += len(sen) - 2
                        #print sen
                        ll += varSeqProb(sen, var_dat)
                    top_guess[call] = ll / ctr
        top_guess = sorted(top_guess.items(), key=operator.itemgetter(1), reverse=True)
    return top_guess
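Both getLL variants finish by ranking the per-candidate averages with operator.itemgetter. The idiom on its own, with made-up candidate names and scores:

import operator

scores = {"candidateA": -0.9, "candidateB": -1.4, "candidateC": -0.5}
ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
print(ranked[0])  # highest (least negative) average log-likelihood first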