예제 #1
0
def split_plus_test():
    """Smoke test for StringUtils.split_plus.

    Converts a list of separators and a mixed ASCII/Chinese sample string to
    unicode via TypeUtils.convert_to_type, splits the string on those
    separators and prints whatever split_plus returns.
    """
    raw_separators = ['ab', ', ', ' ', 'e', '中文', ':', '|']
    raw_text = 'abc,de fgh, ijk,lmn:op|qr中文st'

    separators = TypeUtils.convert_to_type(raw_separators, unicode)
    text = TypeUtils.convert_to_type(raw_text, unicode)

    print(StringUtils.split_plus(text, separators))
예제 #2
0
def build_meth_vocab(vocab_file, meth_file):
    """Count argument tokens per function, arity and argument slot.

    Both files are rewound before reading.

    vocab_file: lines that do not start with a tab name a function (first
        whitespace-separated token); tab-indented lines below it describe one
        signature each, whose token count is taken as the arity ``n``.
    meth_file: lines containing '#' hold a call before the first '#'; the
        call is tokenised with ``t.splitLit`` into ``[func, arg, ...]``.

    Returns a nested dict ``vocab[func][n][slot][arg] -> count`` where the
    slots are numbered ``-1 .. n-2`` (slot = argument position - 1).
    """
    vocab_file.seek(0)
    meth_file.seek(0)

    vocab = {}
    current_func = ""
    for row in vocab_file:
        if row[0] == '\t':
            # Signature line: one empty counter per argument slot.
            arity = len(row.split())
            vocab[current_func][arity] = dict(
                (pos - 1, {}) for pos in range(arity)
            )
        else:
            # Header line: start a fresh table for this function.
            current_func = row.split()[0]
            vocab[current_func] = {}

    for row in meth_file:
        pieces = row.split('#')
        if len(pieces) <= 1:
            continue
        call = t.splitLit(pieces[0])
        func, args = call[0], call[1:]
        print(call)
        arity = len(args)
        for pos, arg in enumerate(args):
            # Looked up per argument so that zero-argument calls never touch
            # the table.
            counts = vocab[func][arity][pos - 1]
            counts[arg] = counts.get(arg, 0) + 1
    return vocab
예제 #3
0
def convert_to_type_test():
    """Smoke test for TypeUtils.convert_to_type; prints three conversions.

    1. a nested list of strings converted to unicode,
    2. a mixed list converted to int and the result converted to unicode,
    3. a nested mixed list converted to float.
    """
    nested_text = ['杭州西湖', ['北京']]
    print(TypeUtils.convert_to_type(nested_text, unicode))

    mixed_values = ['杭州西湖', '123', 123]
    as_ints = TypeUtils.convert_to_type(mixed_values, int)
    print(TypeUtils.convert_to_type(as_ints, unicode))

    nested_mixed = [['123', '中文'], 123]
    print(TypeUtils.convert_to_type(nested_mixed, float))
예제 #4
0
def getVarSents2(sents):
    """Build one reduced sentence per variable of every input sentence.

    sents: iterable of ``(sentence, v_list)`` pairs, where ``sentence`` is a
        sequence of ``(statement, context)`` pairs and ``v_list`` is the list
        of variables (each indexable; element 2 is compared against the
        statement tokens).

    For each variable, every statement that contains the variable's element 2
    contributes a triple ``(f, n, positions)``: ``f`` is the first element of
    ``t.getSig(statement, v_list)``, ``n`` is the number of remaining
    signature elements, and ``positions`` is the tuple of matching token
    indices, each shifted by -3.

    Returns a list with one (possibly empty) list of triples per variable.
    """
    result = []
    for statements, variables in sents:
        for variable in variables:
            mentions = []
            for statement, _context in statements:
                hits = tuple(
                    pos - 3
                    for pos, token in enumerate(statement)
                    if token == variable[2]
                )
                if not hits:
                    continue
                signature = t.getSig(statement, variables)
                mentions.append((signature[0], len(signature) - 1, hits))
            result.append(mentions)
    return result
예제 #5
0
def getVarSents(sents):
    """Build one variable-centred sentence per variable of every input sentence.

    sents: iterable of ``(sentence, v_list)`` pairs, where ``sentence`` is a
        sequence of ``(statement, context)`` pairs and ``v_list`` is the list
        of variables (each indexable; element 2 is compared against the
        statement tokens).

    For each variable, every statement that contains the variable's element 2
    contributes ``(signature, context)``. The signature is the list returned
    by ``t.getSig(statement, v_list)``, extended in place with a '|'
    separator followed by the matching token indices, each shifted by -3.

    Returns a list with one (possibly empty) list of pairs per variable.
    """
    result = []
    for statements, variables in sents:
        for variable in variables:
            mentions = []
            for statement, context in statements:
                hits = [
                    pos - 3
                    for pos, token in enumerate(statement)
                    if token == variable[2]
                ]
                if not hits:
                    continue
                signature = t.getSig(statement, variables)
                signature.extend(['|'] + hits)
                mentions.append((signature, context))
            result.append(mentions)
    return result
예제 #6
0
def getVarSents2(sents):
    """Build one reduced sentence per variable of every input sentence.

    sents: iterable of ``(sentence, v_list)`` pairs, where ``sentence`` is a
        sequence of ``(statement, context)`` pairs and ``v_list`` is the list
        of variables (each indexable; element 2 is compared against the
        statement tokens).

    For each variable, every statement that contains the variable's element 2
    contributes a triple ``(f, n, positions)``: ``f`` is the first element of
    ``t.getSig(statement, v_list)``, ``n`` is the number of remaining
    signature elements, and ``positions`` is the tuple of matching token
    indices, each shifted by -3.

    Returns a list with one (possibly empty) list of triples per variable.
    """
    result = []
    for statements, variables in sents:
        for variable in variables:
            mentions = []
            for statement, _context in statements:
                hits = tuple(
                    pos - 3
                    for pos, token in enumerate(statement)
                    if token == variable[2]
                )
                if not hits:
                    continue
                signature = t.getSig(statement, variables)
                mentions.append((signature[0], len(signature) - 1, hits))
            result.append(mentions)
    return result
예제 #7
0
def getVarSents(sents):
    """Build one variable-centred sentence per variable of every input sentence.

    sents: iterable of ``(sentence, v_list)`` pairs, where ``sentence`` is a
        sequence of ``(statement, context)`` pairs and ``v_list`` is the list
        of variables (each indexable; element 2 is compared against the
        statement tokens).

    For each variable, every statement that contains the variable's element 2
    contributes ``(signature, context)``. The signature is the list returned
    by ``t.getSig(statement, v_list)``, extended in place with a "|"
    separator followed by the matching token indices, each shifted by -3.

    Returns a list with one (possibly empty) list of pairs per variable.
    """
    result = []
    for statements, variables in sents:
        for variable in variables:
            mentions = []
            for statement, context in statements:
                hits = [
                    pos - 3
                    for pos, token in enumerate(statement)
                    if token == variable[2]
                ]
                if not hits:
                    continue
                signature = t.getSig(statement, variables)
                signature.extend(["|"] + hits)
                mentions.append((signature, context))
            result.append(mentions)
    return result
예제 #8
0
def getVarLines(var_file):
    """Parse a variable-sentence file into a list of sentences.

    The file is rewound and read line by line:

    * A line starting with '<' is a marker (e.g. <S1>, <S2>, <END>) and is
      stored as its text without the line terminator.
    * Any other line is expected to look like ``call | i j ...``. The part
      before the first '|' is tokenised with ``t.splitLit``; the part after
      it is parsed as whitespace-separated integers. The line is stored as
      ``(func, n, inf)`` where ``func`` is the first call token, ``n`` the
      number of remaining call tokens and ``inf`` the tuple of integers.

    An ``<END>`` marker closes the current sentence. Lines after the last
    ``<END>`` do not form a sentence and are not returned.

    Returns a list of sentences, each a list of markers and triples.
    """
    var_file.seek(0)
    sents = []
    sent = []
    for line in var_file:
        if line.startswith('<'):
            # Strip the terminator explicitly so that a final line without a
            # newline, or a CRLF-terminated line, is still recognised.
            marker = line.rstrip('\r\n')
            sent.append(marker)
            if marker == "<END>":
                sents.append(sent)
                sent = []
        else:
            parts = line.split('|')
            inf = tuple(map(int, parts[1].split()))
            call = t.splitLit(parts[0])
            sent.append((call[0], len(call) - 1, inf))
    return sents
예제 #9
0
def getReducedLines(meth_file):
    """Parse a method-sentence file into a list of reduced sentences.

    The file is rewound and read line by line:

    * A line starting with '<' is a marker (e.g. <S1>, <S2>, <END>) and is
      stored as ``(marker_text, ())`` without the line terminator.
    * Any other line is expected to look like ``call # ctx ...``. The part
      before the first '#' is tokenised with ``t.splitLit``; the part after
      it is split on whitespace. The line is stored as
      ``((func, n), inf)`` where ``func`` is the first call token, ``n`` the
      number of remaining call tokens and ``inf`` the tuple of context
      tokens.

    An ``<END>`` marker closes the current sentence. Lines after the last
    ``<END>`` do not form a sentence and are not returned.

    Returns a list of sentences, each a list of ``(stat, inf)`` pairs.
    """
    meth_file.seek(0)
    sents = []
    sent = []
    for line in meth_file:
        if line.startswith('<'):
            # Strip the terminator explicitly so that a final line without a
            # newline, or a CRLF-terminated line, is still recognised.
            marker = line.rstrip('\r\n')
            sent.append((marker, ()))
            if marker == "<END>":
                sents.append(sent)
                sent = []
        else:
            parts = line.split('#')
            inf = tuple(parts[1].split())
            call = t.splitLit(parts[0])
            sent.append(((call[0], len(call) - 1), inf))
    return sents
예제 #10
0
def format_station(station):
    """
    purpose:
        format a measure as a station value with hundreds offset
        0+00 notation
    arguments:
        station: number
            station measurement; a negative value is formatted from its
            magnitude and prefixed with "-" (e.g. -150 -> "-1+50")
    return value: string
        measurement formatted as 0+00, keeping any fractional part
        (e.g. 5 -> "0+05", 12.5 -> "0+12.5", 1234.56 -> "12+34.56")
        None if station is not numeric or any error occurs
    """

    try:
        if not TypeUtils.is_numeric(station):
            return None
        sign = "-" if station < 0 else ""
        # Split off the fractional part once; `dot` and `fraction` are empty
        # strings when the value has no decimal point.
        whole, dot, fraction = str(abs(station)).partition(".")
        # Pad to three digits so there is always at least one digit before
        # the "+" and exactly two after it.
        whole = whole.zfill(3)
        return "{0}{1}+{2}{3}{4}".format(
            sign, whole[:-2], whole[-2:], dot, fraction
        )
    except Exception:
        # Documented contract: any failure is reported as None.
        return None
예제 #11
0
def main():
    """Extract method and variable sentences from the Java corpus.

    The mode is read from ``sys.argv[1]`` ("cfs" or "levels"); a missing or
    unknown value falls back to "levels". Every ``.java`` file under
    ``../Java/Corpus/`` whose name contains none of the blacklisted hashes is
    parsed with ``e.ExtractCode``, and for each compilation unit the
    sentences from ``seq.getSents`` are written to three files under
    ``../Data/Raw``:

    * method_sentences_<mode>.txt - one ``signature # context`` line per
      statement, framed by <S2>/<S1> and <END> markers
    * variable_sentences_<mode>.txt - the sentences from ``seq.getVarSents``
      in the same framing
    * vocab_<mode>.txt - each signature head followed by its tab-indented
      resolved signatures (``t.resolveSigs``)
    """
    par = plyj.parser.Parser()
    modes = ["cfs", "levels"]
    mode = sys.argv[1] if len(sys.argv) > 1 else "levels"
    if mode not in modes:
        mode = "levels"
    corpus_path = "../Java/Corpus/"
    data_path = "../Data/Raw"
    ####
    meth_name = "method_sentences_" + mode + ".txt"
    var_name = "variable_sentences_" + mode + ".txt"
    vocab_name = "vocab_" + mode + ".txt"
    ####
    blacklist = ["5a8beeae20366b5094d0db8148e0563", "3cd87ee90872cfcb72b3cb3b773d8efa"]
    vocab = {}
    ctr = 1
    opened = []
    try:
        # Track each handle as soon as it is opened so that all of them are
        # closed even if a later open or the extraction itself fails.
        for name in (meth_name, var_name, vocab_name):
            opened.append(open(os.path.join(data_path, name), 'w'))
        meth_file, var_file, vocab_file = opened
        ####
        for subdir, _dirs, files in os.walk(corpus_path):
            for f in files:
                if not f.endswith(".java"):
                    continue
                if any(h in f for h in blacklist):
                    continue
                p = os.path.join(subdir, f)
                cus = e.ExtractCode(par, p)
                for i, cu in cus:
                    # Only the sentences are used here; the other two
                    # results are ignored.
                    _sf, _fields, sents = seq.getSents(cu, i, mode)
                    print(str(ctr) + ": " + str(len(sents)))
                    ctr += 1
                    for sent, vl in sents:
                        meth_file.write("<S2>\n")
                        meth_file.write("<S1>\n")
                        for stat, ctx in sent:
                            meth_file.write(e.nstr(t.getSig(stat, vl, False)) + ' # ' + e.nstr(ctx) + '\n')
                            s = t.getSig(stat, vl)
                            vocab.setdefault(s[0], []).append(s[1:])
                        meth_file.write('<END>\n')
                    vsents = seq.getVarSents(sents)
                    for vsent in vsents:
                        var_file.write("<S2>\n")
                        var_file.write("<S1>\n")
                        for stat, _ctx in vsent:
                            var_file.write(e.nstr(stat) + '\n')
                        var_file.write('<END>\n')
        # The vocabulary is written exactly once, after the whole corpus has
        # been walked; dumping it per directory would repeat every entry
        # collected so far for each directory visited.
        for s in vocab:
            vocab_file.write(s + '\n')
            for sig in t.resolveSigs(vocab[s]):
                vocab_file.write('\t' + e.nstr(sig) + '\n')
    finally:
        for handle in opened:
            handle.close()
예제 #12
0
def main():
    #par = plyj.parser.Parser()
    modes = ["cfs", "levels"]
    if len(sys.argv) > 1:
        mode = sys.argv[1]
    else:
        mode = "levels"
    if mode not in modes:
        mode = "levels"
    data_path = "../Data/Raw"
    new_path = "../Data/Revised"
    ####
    meth_name = "method_sentences_" + mode + ".txt"
    var_name = "variable_sentences_" + mode + ".txt"
    vocab_name = "vocab_" + mode + ".txt"
    counts_name = "counts_" + mode + ".txt"
    memm_name = "memm_" + mode + ".txt"
    ####
    meth_file = open(os.path.join(data_path, meth_name), 'r')
    var_file = open(os.path.join(data_path, var_name), 'r')
    vocab_file = open(os.path.join(data_path, vocab_name), 'r')
    nvocab_file = open(os.path.join(new_path, vocab_name), 'wb')
    count_file = open(os.path.join(new_path, counts_name), 'wb')
    memm_file = open(os.path.join(new_path, memm_name), 'wb')
    ####
    meth_sigs = build_meth_vocab(vocab_file, meth_file)
    meth_vocab_list = {}
    ctr = 0
    for f in meth_sigs:
        print f
        for n in meth_sigs[f]:
            if not (f,n) in meth_vocab_list:
                meth_vocab_list[(f, n)] = ctr
                ctr += 1
            print '\t' + str(n) + " | ",
            for i in range(n):
                print str(i-1) + ":( ",
                for ty in meth_sigs[f][n][i-1]:
                    print ty + '/' + str(meth_sigs[f][n][i-1][ty]) + ' ', 
                print ") ",
            print
    meth_vocab_list["<END>"] = ctr
    meth_vocab_list["<S1>"] = ctr+1
    meth_vocab_list["<S2>"] = ctr+2
    pot_var_vocab_list = {}
    ctr = 0
    for k in meth_vocab_list:
        if type(k) is not str:
            f, n = k
            for s in t.powerset([i-1 for i in range(n)]):
                pot_var_vocab_list[(f, n, tuple(s))] = ctr
                ctr += 1
    pot_var_vocab_list["<END>"] = ctr
    pot_var_vocab_list["<S1>"] = ctr+1
    pot_var_vocab_list["<S2>"] = ctr+2
    vsents = getVarLines(var_file)
    act_var_vocab_list = {}
    ctr = 0
    for s in vsents:
        for stat in s:
            if not stat in act_var_vocab_list:
                act_var_vocab_list[stat] = ctr
                ctr += 1
    pickle.dump((meth_sigs, meth_vocab_list, pot_var_vocab_list, act_var_vocab_list), nvocab_file)
    nvocab_file.close()
    print len(meth_vocab_list)
    print len(pot_var_vocab_list)
    print len(act_var_vocab_list)
    meth_sents = getReducedLines(meth_file)
    meth_sents = seq.getFeatures(meth_sents)
    X = [meth_sents[i][j][1] for i in range(len(meth_sents)) for j in range(len(meth_sents[i]))]
    print len(X)
    y = [meth_vocab_list[meth_sents[i][j][0]] for i in range(len(meth_sents)) for j in range(len(meth_sents[i]))]
    meth_ngram, meth_N1p, meth_ch = getNTuples(meth_sents, meth_vocab_list, "meth")
    print "N-GRAMS"
    pot_var_ngram, pot_var_N1p, pot_var_ch = getNTuples(vsents, pot_var_vocab_list, "var")
    print "N-GRAMS"
    act_var_ngram, act_var_N1p, act_var_ch = getNTuples(vsents, act_var_vocab_list, "var")
    print "N-GRAMS"
    pickle.dump(((meth_ngram, meth_N1p, meth_ch), (pot_var_ngram, pot_var_N1p, pot_var_ch), (act_var_ngram, act_var_N1p, act_var_ch)), count_file)
    count_file.close()
    MEMM = linear_model.LogisticRegression()
    if not mode == "cfs":
        MEMM.fit(X,y)
    pickle.dump(MEMM, memm_file)
    meth_file.close()
    var_file.close()
    memm_file.close()
예제 #13
0
def main():
    """Extract method and variable sentences from the Java corpus.

    The mode is read from ``sys.argv[1]`` ("cfs" or "levels"); a missing or
    unknown value falls back to "levels". Every ``.java`` file under
    ``../Java/Corpus/`` whose name contains none of the blacklisted hashes is
    parsed with ``e.ExtractCode``, and for each compilation unit the
    sentences from ``seq.getSents`` are written to three files under
    ``../Data/Raw``:

    * method_sentences_<mode>.txt - one ``signature # context`` line per
      statement, framed by <S2>/<S1> and <END> markers
    * variable_sentences_<mode>.txt - the sentences from ``seq.getVarSents``
      in the same framing
    * vocab_<mode>.txt - each signature head followed by its tab-indented
      resolved signatures (``t.resolveSigs``)
    """
    par = plyj.parser.Parser()
    modes = ["cfs", "levels"]
    mode = sys.argv[1] if len(sys.argv) > 1 else "levels"
    if mode not in modes:
        mode = "levels"
    corpus_path = "../Java/Corpus/"
    data_path = "../Data/Raw"
    ####
    meth_name = "method_sentences_" + mode + ".txt"
    var_name = "variable_sentences_" + mode + ".txt"
    vocab_name = "vocab_" + mode + ".txt"
    ####
    blacklist = [
        "5a8beeae20366b5094d0db8148e0563", "3cd87ee90872cfcb72b3cb3b773d8efa"
    ]
    vocab = {}
    ctr = 1
    opened = []
    try:
        # Track each handle as soon as it is opened so that all of them are
        # closed even if a later open or the extraction itself fails.
        for name in (meth_name, var_name, vocab_name):
            opened.append(open(os.path.join(data_path, name), 'w'))
        meth_file, var_file, vocab_file = opened
        ####
        for subdir, _dirs, files in os.walk(corpus_path):
            for f in files:
                if not f.endswith(".java"):
                    continue
                if any(h in f for h in blacklist):
                    continue
                p = os.path.join(subdir, f)
                cus = e.ExtractCode(par, p)
                for i, cu in cus:
                    # Only the sentences are used here; the other two
                    # results are ignored.
                    _sf, _fields, sents = seq.getSents(cu, i, mode)
                    print(str(ctr) + ": " + str(len(sents)))
                    ctr += 1
                    for sent, vl in sents:
                        meth_file.write("<S2>\n")
                        meth_file.write("<S1>\n")
                        for stat, ctx in sent:
                            meth_file.write(
                                e.nstr(t.getSig(stat, vl, False)) + ' # ' +
                                e.nstr(ctx) + '\n')
                            s = t.getSig(stat, vl)
                            vocab.setdefault(s[0], []).append(s[1:])
                        meth_file.write('<END>\n')
                    vsents = seq.getVarSents(sents)
                    for vsent in vsents:
                        var_file.write("<S2>\n")
                        var_file.write("<S1>\n")
                        for stat, _ctx in vsent:
                            var_file.write(e.nstr(stat) + '\n')
                        var_file.write('<END>\n')
        # The vocabulary is written exactly once, after the whole corpus has
        # been walked; dumping it per directory would repeat every entry
        # collected so far for each directory visited.
        for s in vocab:
            vocab_file.write(s + '\n')
            for sig in t.resolveSigs(vocab[s]):
                vocab_file.write('\t' + e.nstr(sig) + '\n')
    finally:
        for handle in opened:
            handle.close()
예제 #14
0
def calc_coords(starting_point, distance, h_angle, v_angle):
    """
    purpose:
        Calculate coordinates given a starting point, distance, horizontal angle, and vertical angle
    arguments:
        starting_point: tuple of float
            (x, y, z) coordinates of the starting point
        distance: float
            3D distance to the new point; must be > 0
        h_angle: float
            Horizontal angle in degrees, counter clockwise from east
            0 <= h_angle < 360
        v_angle: float
            Vertical angle in degrees, 180 degrees from zenith to nadir
            Horizon is 90
            0 <= v_angle <= 180
    return value: dictionary
        success: boolean
            False when validation fails or any exception is raised
        coords: tuple
            (x, y, z) coordinates of the new point; () on failure
        messages: list
            one error message on failure, empty on success
    """

    ret_dict = {"messages": [], "coords": ()}

    def _fail(message):
        # Record the failure and hand back the result dictionary.
        ret_dict["messages"].append(message)
        ret_dict["success"] = False
        return ret_dict

    try:
        # validate starting_point
        if len(starting_point) != 3:
            return _fail("Error in calc_coords: starting_point does not have 3 items")
        # validate distance
        if not TypeUtils.is_numeric(distance):
            return _fail("Error in calc_coords: distance must be a number")
        if distance <= 0:
            return _fail("Error in calc_coords: distance must be > 0")
        # validate h_angle
        if not TypeUtils.is_numeric(h_angle):
            return _fail("Error in calc_coords: horizontal angle must be a number")
        if not 0 <= h_angle < 360:
            return _fail("Error in calc_coords: horizontal angle much be between 0 and 360")
        # validate v_angle (its own range, independent of h_angle)
        if not TypeUtils.is_numeric(v_angle):
            return _fail("Error in calc_coords: vertical angle must be a number")
        if not 0 <= v_angle <= 180:
            return _fail("Error in calc_coords: vertical angle much be between 0 and 180")
        # get the starting coordinates
        x1 = starting_point[0]
        y1 = starting_point[1]
        z1 = starting_point[2]
        # compute the new coordinates: v_angle is measured from the zenith,
        # so sin(v) scales the horizontal component and cos(v) the vertical
        h_rad = math.radians(h_angle)
        v_rad = math.radians(v_angle)
        horizontal = distance * math.sin(v_rad)
        x2 = x1 + (horizontal * math.cos(h_rad))
        y2 = y1 + (horizontal * math.sin(h_rad))
        z2 = z1 + (distance * math.cos(v_rad))
        # return the new point
        ret_dict["coords"] = (x2, y2, z2)
        ret_dict["success"] = True
    except Exception as exc:
        return _fail("Error: {0}".format(str(exc)))
    return ret_dict