def generate_pos_training(pos_training_file):
    """Build an HDF5 training file of POS-tag id sequences, one row per document.

    Each document's POS-tag list (from the normalized text->POS mapping) is
    converted to integer ids via ``pos_tag_dict``, post-padded to
    ``max_len_text``, and written into the "input" dataset of
    ``data/<pos_training_file>.hdf5``.

    :param pos_training_file: basename (no extension) of the HDF5 output file.
    """
    raw_text_dir = read.read_from_json('raw_data_dir')
    max_len_text = read.get_char2id_dict(raw_text_dir)
    text_pos_text_dict = read.read_json(
        "data/pos/text_pos_text_dict_normalized")
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    pos_tag_dict = read.read_from_json("pos_tag_dict")
    data_size = len(raw_text_dir)

    # Context manager ensures the HDF5 handle is flushed and closed even if
    # an exception is raised mid-write (the original leaked the open file).
    with h5py.File("data/" + pos_training_file + ".hdf5", "w") as f:
        dset = f.create_dataset("input", (data_size, max_len_text),
                                dtype='int8')

        for data_id in range(data_size):
            pos_list = text_pos_text_dict[raw_dir_simple[data_id]]
            print(raw_dir_simple[data_id])

            # pad_sequences expects a batch, hence the extra list nesting.
            text_inputs = [[pos_tag_dict[pos] for pos in pos_list]]
            data_x = pad_sequences(text_inputs,
                                   dtype='int8',
                                   maxlen=max_len_text,
                                   padding="post")
            dset[data_id, :] = data_x[0]
def get_list_cd(start=0, end=63):
    """Collect unique (token, tag) pairs tagged "CD" (cardinal number) across
    documents ``start``..``end - 1`` and save them to ``data/pos/cd_list``.

    :param start: index of the first document to scan (default 0, matching
        the original hard-coded value).
    :param end: one past the last document index (default 63).
    """
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    cd_list = []
    for data_id in range(start, end):
        pos = read.read_json("data/pos/" + raw_dir_simple[data_id])
        # pos appears to be a list of sentences, each a list of
        # (token, tag) pairs -- TODO confirm against read.read_json output.
        for pos_sen in pos:
            for pos_token in pos_sen:
                if pos_token[1] == "CD" and pos_token not in cd_list:
                    cd_list.append(pos_token)
                    print(pos_token)
    read.save_json("data/pos/cd_list", cd_list)
def get_list_punctuation(start=0, end=63, p_list=None):
    """Collect unique (token, tag) pairs whose token contains any of the
    substrings in ``p_list`` and save them to ``data/pos/punctuation_list``.

    :param start: index of the first document to scan (default 0, matching
        the original hard-coded value).
    :param end: one past the last document index (default 63).
    :param p_list: punctuation substrings to search for; defaults to
        ``["/", ":", "-"]``. ``None`` sentinel avoids a mutable default.
    """
    if p_list is None:
        p_list = ["/", ":", "-"]

    raw_dir_simple = read.read_from_json('raw_dir_simple')
    punctuation_list = []
    for data_id in range(start, end):
        pos = read.read_json("data/pos/" + raw_dir_simple[data_id])
        for pos_sen in pos:
            for pos_token in pos_sen:
                # pos_token[0] is the surface token; keep first-seen order.
                if any(e in pos_token[0] for e in p_list):
                    if pos_token not in punctuation_list:
                        punctuation_list.append(pos_token)
                        print(pos_token)
    read.save_json("data/pos/punctuation_list", punctuation_list)
# Example #4 (score: 0) -- stray listing text from the original source,
# commented out so the module does not raise a NameError on import.
def span2xmlfiles(exp, target, num_docs=10):
    """Convert predicted span labels into Anafora XML, one file per document.

    Reads ``<exp>/span_label_all<target>`` -- a per-document list of
    (start, end, type) spans -- wraps each span in an AnaforaEntity, and
    writes ``<exp>/<doc>/<doc>.TimeNorm.gold.completed.xml``.

    :param exp: experiment directory holding the span file and output dirs.
    :param target: suffix of the span-label file to read.
    :param num_docs: number of documents to convert (default 10, matching
        the original hard-coded range).
    """
    import anafora

    raw_dir_simple = read1.read_from_json('raw_dir_simple')
    # Hoist the JSON read out of the loop: the original re-parsed the whole
    # span file once per document. os.path.join replaces the hard-coded
    # Windows "\\" separators so the function is portable.
    all_spans = read1.read_json(os.path.join(exp, "span_label_all" + target))
    for data_id in range(num_docs):
        data_spans = all_spans[data_id]
        data = anafora.AnaforaData()
        # entity_id replaces the original counter named ``id`` (builtin shadow).
        for entity_id, data_span in enumerate(data_spans):
            e = anafora.AnaforaEntity()
            # +1 makes the stored end index exclusive -- preserved from the
            # original; presumably what Anafora expects, verify if changed.
            e.spans = ((int(data_span[0]), int(data_span[1]) + 1), )
            e.type = data_span[2]
            e.id = str(entity_id) + "@e@" + raw_dir_simple[data_id]
            data.annotations.append(e)
        print(data)
        data.indent()

        outdir = os.path.join(exp, raw_dir_simple[data_id])
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        data.to_file(os.path.join(
            outdir, raw_dir_simple[data_id] + ".TimeNorm.gold.completed.xml"))
#generate_pos()

# Script-level setup: load the corpus file lists and the char<->id vocabulary
# used by the POS-alignment loop below.
start = 0
end = 63
raw_text_dir = read.read_from_json('raw_data_dir')
raw_dir_simple = read.read_from_json('raw_dir_simple')

max_len_text = read.get_char2id_dict(raw_text_dir)
char2int = read.read_from_json('char2int')
# Invert the char -> id mapping. A dict comprehension avoids the original
# generator expression's shadowing of the builtins ``int`` and ``char``.
int2char = {idx: ch for ch, idx in char2int.items()}

text_pos_text_dict = dict()
for data_id in range(start, end):
    print raw_dir_simple[data_id]
    pos = read.read_json("data/pos/" + raw_dir_simple[data_id])
    raw_text = read.read_from_dir(raw_text_dir[data_id])

    text_inputs = [[char2int[char] for char in raw_text]]
    postag = list()
    index = 0
    for line in raw_text.splitlines():
        if len(line) == 0:
            postag.append('\n')
            index += 1
        else:
            token_index = 0
            term = ""
            for char in line:
                # if term =="leade":
                #     print "ok"