def features_extraction(raw_data_dir,
                        preprocessed_path,
                        model_path,
                        data_folder="",
                        mode="train"):
    max_len = 350
    pad = 3
    # one fixed-length row of character ids per sentence
    input_char = list()
    char2int = read.readfrom_json(char2int_path)

    total = 0
    for data_id in range(0, len(raw_data_dir)):
        print(raw_data_dir[data_id])
        #preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id], raw_data_dir[data_id]) - TODO
        preprocessed_file_path = os.path.join(preprocessed_path,
                                              raw_data_dir[data_id])
        sent_span_list_file = read.readfrom_json(preprocessed_file_path +
                                                 "_sent")
        print(len(sent_span_list_file))

        n_sent = len(sent_span_list_file)
        for index in range(n_sent):
            total += 1
            input_char.append(
                get_idx_from_sent("\n", sent_span_list_file[index][0],
                                  char2int, max_len, pad))

        print("Finished processing file: ", raw_data_dir[data_id])
    print(total)
    input_char = np.asarray(input_char, dtype="int16")

    if not os.path.exists(model_path):
        os.makedirs(model_path)
    read.save_hdf5(model_path + "/input" + data_folder, ["char"], [input_char],
                   ['int16'])
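
get_idx_from_sent is called above but not defined in this excerpt. A minimal sketch of what such a helper might look like, assuming it surrounds the sentence with `pad` boundary marks, maps unknown tokens to 0, and zero-pads to a fixed length of max_len + 2 * pad (356 for the values above); the later offset arithmetic subtracting n_marks = 3 is consistent with this layout:

def get_idx_from_sent(mark, sentence, token2int, max_len, pad):
    # Hypothetical reconstruction: `mark` is the boundary symbol ("\n" for
    # characters, "Cc" for unicode categories), repeated `pad` times on each
    # side; unknown tokens map to 0; the result is zero-padded to a fixed
    # length of max_len + 2 * pad entries.
    idx = [token2int.get(mark, 0)] * pad
    idx += [token2int.get(tok, 0) for tok in sentence[:max_len]]
    idx += [token2int.get(mark, 0)] * pad
    idx += [0] * (max_len + 2 * pad - len(idx))
    return idx
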
def get_train():
    # Keep every training file that is not held out in the dev split.
    file_dev = read.readfrom_json("data/dev_file_simple")
    train_all_simple = read.readfrom_json("data/train_all_simple")
    train = [
        train_file for train_file in train_all_simple
        if train_file not in file_dev
    ]
    read.savein_json("data/train_simple", train)
Example #3
def generate_output_multiclass(model,
                               input,
                               gold,
                               doc_list_sub,
                               processed_path,
                               output_pred_path,
                               pred=True,
                               data_folder="",
                               format_abbre=".TimeNorm.system.completed.xml"):
    non_operator = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    labels_index = [non_operator, operator, operator]
    classes, probs = output.make_prediction_function_multiclass(
        input, model, output_pred_path)
    if pred:
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder,
                           probs)

    spans = list()
    int2labels = list()
    for index in range(len(classes)):

        class_loc = output.found_location_with_constraint(classes[index])
        span = output.loc2span(class_loc, probs[index], post_process=False)
        spans.append(span)

        one_hot = read.counterList2Dict(list(enumerate(labels_index[index],
                                                       1)))
        # one_hot maps class id -> label (ids are 1-based; 0 is background)
        label2int = {label: idx for idx, label in one_hot.items()}
        int2label = {idx: label for label, idx in label2int.items()}
        int2labels.append(int2label)

    n_marks = 3
    sent_index = 0

    for data_id in range(0, len(doc_list_sub)):
        sent_spans = read.readfrom_json(
            os.path.join(processed_path, doc_list_sub[data_id],
                         doc_list_sub[data_id] + "_sent"))
        data_span = list()
        for sent_span in sent_spans:
            for index in range(len(classes)):
                span_list = spans[index][sent_index]
                if len(span_list[0]) >= 1:
                    for posi_start, posi_end, label in span_list:
                        data_span.append([
                            posi_start - n_marks + sent_span[1],
                            posi_end - n_marks + sent_span[1],
                            int2labels[index][label]
                        ])
            sent_index += 1
        data = span2xmlfiles(data_span, doc_list_sub[data_id])
        output_path = os.path.join(output_pred_path, doc_list_sub[data_id],
                                   doc_list_sub[data_id])
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    del classes, probs, input
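
The offset arithmetic above converts sentence-relative model positions back to document offsets: the model sees sentences padded with n_marks boundary symbols, so a predicted position subtracts n_marks and adds the sentence's document start, sent_span[1]. A worked example (values made up for illustration):

n_marks, sent_start = 3, 500
posi_start, posi_end = 10, 14   # model positions inside the padded sentence
doc_start = posi_start - n_marks + sent_start   # 507
doc_end = posi_end - n_marks + sent_start       # 511
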
def evaluate(xml_path, output_pred_path, raw_data_path, doc_list,
             output_format):
    gold_count = 0
    pred_count = 0
    true_count = 0
    print('xml_path: %s' % xml_path)
    print('doc_list: %d files' % len(doc_list))
    for file_id in range(len(doc_list)):
        print('path:', os.path.join(xml_path, doc_list[file_id] + "_tag"))
        print('exists: %s' % os.path.exists(
            os.path.join(xml_path, doc_list[file_id] + "_tag.txt")))
        if os.path.exists(
                os.path.join(xml_path, doc_list[file_id] + "_tag.txt")):
            gold_tag_dict = get_gold_dict(
                read.readfrom_json(
                    os.path.join(xml_path, doc_list[file_id] + "_tag")))
            output_path = os.path.join(output_pred_path, doc_list[file_id],
                                       doc_list[file_id] + output_format)
            raw_text_path = os.path.join(raw_data_path, doc_list[file_id])
            pre_tag_dict = process.extract_xmltag_anafora_pred(
                output_path, read.readfrom_txt(raw_text_path))
            scores = calculate_score(gold_tag_dict, pre_tag_dict)
            gold_count += scores[0]
            pred_count += scores[1]
            true_count += scores[2]
            # running totals: metrics are re-printed after each file
            metrics(true_count, pred_count, gold_count)
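
metrics is not defined in this excerpt. A plausible sketch, assuming it prints running precision/recall/F1 from the accumulated counts:

def metrics(true_count, pred_count, gold_count):
    # Hypothetical reconstruction: precision over predicted spans, recall
    # over gold spans, and their harmonic mean.
    precision = true_count / pred_count if pred_count else 0.0
    recall = true_count / gold_count if gold_count else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    print("P: %.4f  R: %.4f  F1: %.4f" % (precision, recall, f1))
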
Example #5
def features_extraction(raw_data_dir,
                        preprocessed_path,
                        model_path,
                        data_folder="",
                        mode="train"):
    max_len = 350
    pad = 3
    input_char = list()
    input_pos = list()
    input_unic = list()
    char2int = read.readfrom_json(char2int_path)
    pos2int = read.readfrom_json(pos2int_path)
    unicode2int = read.readfrom_json(unicode2int_path)
    total = 0
    for data_id in range(0, len(raw_data_dir)):
        print(raw_data_dir[data_id])
        preprocessed_file_path = os.path.join(preprocessed_path,
                                              raw_data_dir[data_id],
                                              raw_data_dir[data_id])
        sent_span_list_file = read.readfrom_json(preprocessed_file_path +
                                                 "_sent")
        print(len(sent_span_list_file))
        pos_sentences_character = read.readfrom_json(preprocessed_file_path +
                                                     "_pos")
        print(len(pos_sentences_character))
        unicode_sentences_character = read.readfrom_json(
            preprocessed_file_path + "_unicodecategory")
        print(len(unicode_sentences_character))
        n_sent = len(sent_span_list_file)
        for index in range(n_sent):
            total += 1
            input_char.append(
                get_idx_from_sent("\n", sent_span_list_file[index][0],
                                  char2int, max_len, pad))
            input_pos.append(
                get_idx_from_sent("\n", pos_sentences_character[index],
                                  pos2int, max_len, pad))
            input_unic.append(
                get_idx_from_sent("Cc", unicode_sentences_character[index],
                                  unicode2int, max_len, pad))
        print("Finished processing file: ", raw_data_dir[data_id])
    print(total)
    input_char = np.asarray(input_char, dtype="int")
    input_pos = np.asarray(input_pos, dtype="int")
    input_unic = np.asarray(input_unic, dtype="int")

    if not os.path.exists(model_path):
        os.makedirs(model_path)
    # NOTE: the arrays are stored as int8 here, whereas the earlier variant
    # used int16; int8 only holds ids up to 127, so a larger character
    # vocabulary would need the wider dtype.
    read.save_hdf5(model_path + "/input" + data_folder,
                   ["char", "pos", "unic"],
                   [input_char, input_pos, input_unic],
                   ['int8', 'int8', 'int8'])
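
read.save_hdf5 is a project helper not shown here. A minimal sketch with the same call signature, under the assumption that it wraps h5py and writes one dataset per name (the ".hdf5" suffix is a guess):

import h5py
import numpy as np

def save_hdf5(path, names, arrays, dtypes):
    # one dataset per name, cast to the requested storage dtype
    with h5py.File(path + ".hdf5", "w") as f:
        for name, arr, dt in zip(names, arrays, dtypes):
            f.create_dataset(name, data=np.asarray(arr, dtype=dt))
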
Example #7
def get_idx_from_sent(sent, word_idx_map):
    # NOTE: the original def line was missing from this excerpt; this
    # two-argument signature is reconstructed from the body, which maps each
    # word to its index (0 for out-of-vocabulary) and right-pads with 4 up
    # to a fixed length of 356 (max_len 350 plus 2 * 3 boundary marks).
    x = []
    for word in sent:
        if word in word_idx_map:
            x.append(word_idx_map[word])
        else:
            x.append(0)

    while len(x) < 356:
        x.append(4)
    return x
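
A quick check of the padding behaviour of the two-argument variant reconstructed above, with a toy vocabulary (illustration only):

word_idx_map = {"the": 1, "cat": 2}           # toy vocabulary
x = get_idx_from_sent(["the", "cat", "sat"], word_idx_map)
assert x[:3] == [1, 2, 0]                     # "sat" is out-of-vocabulary
assert len(x) == 356 and x[3] == 4            # right-padded with 4
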


char, pos, unicate = read.load_hdf5("data/cvcolon_train_input",
                                    ["char", "pos", "unic"])

char2int = read.readfrom_json("data/char2int")
int2char = {index: char for char, index in char2int.items()}
sent = list()
sent_len = list()
for char_x_sent in char:  # 2637    8820     12760     ####2637     6183    3940     7140
    # ids 0 (padding) and 88 (presumably the boundary mark) become spaces
    sent_single = [
        int2char[i] if i != 88 and i != 0 else ' ' for i in char_x_sent
    ]
    sent.append(sent_single)

import torch
forward_flairTorch = torch.load("data/lm-news-english-forward-v0.2rc.pt")
# NOTE: the excerpt breaks off mid-expression here; the completion below is
# a guess, assuming the checkpoint stores a byte-keyed symbol table under
# its 'dictionary' key (as early flair language-model checkpoints did).
dictionary = {
    k.decode('utf8'): v
    for k, v in forward_flairTorch['dictionary'].items()
}

def output_encoding(
    raw_data_dir,
    preprocessed_path,
    model_path,
    data_folder="",
    activation="softmax",
    type="interval"
):  # type is one of ["interval", "operator", "explicit_operator", "implicit_operator"]
    target_labels = defaultdict(float)
    if type not in [
            "interval", "operator", "explicit_operator", "implicit_operator"
    ]:
        return
    interval = read.textfile2list(non_operator_path)
    operator = read.textfile2list(operator_path)
    max_len = 350
    n_marks = 3
    max_len_text = max_len + 2 * n_marks
    n_output = 0
    final_labels = 0

    if activation == "sigmoid":
        final_labels = interval + operator
        n_output = len(final_labels)
    elif activation == "softmax":
        if "interval" in type:
            final_labels = interval
        elif "operator" in type:
            final_labels = operator
        n_output = len(final_labels) + 1

    one_hot = read.counterList2Dict(list(enumerate(final_labels, 1)))
    output_one_hot = {y: x for x, y in one_hot.items()}

    sample_weights_output = []
    outputs = []
    total_with_timex = 0
    n_sent_total = 0
    for data_id in range(0, len(raw_data_dir)):
        #preprocessed_file_path = os.path.join(preprocessed_path, raw_data_dir[data_id], raw_data_dir[data_id]) - TODO
        preprocessed_file_path = os.path.join(preprocessed_path,
                                              raw_data_dir[data_id])
        sent_span_list_file = read.readfrom_json(preprocessed_file_path +
                                                 "_sent")
        tag_span_list_file = read.readfrom_json(preprocessed_file_path +
                                                "_tag")
        n_sent = len(tag_span_list_file)
        n_sent_total += n_sent
        for index in range(n_sent):
            sent_info = sent_span_list_file[index]
            tag_info = tag_span_list_file[index]

            sentence_start = sent_info[1]
            label_encoding_sent = np.zeros((max_len_text, n_output))
            if activation == "softmax":
                label_encoding_sent[:, 0] = 1
            sample_weights_sent = np.zeros(max_len_text)

            for label in tag_info:
                posi, info = label
                position = int(posi) - sentence_start
                posi_end = int(info[0]) - sentence_start
                info_new = list(set(info[2:]))

                if activation == "sigmoid":

                    label_indices = [
                        output_one_hot[token_tag] for token_tag in info_new
                        if token_tag in output_one_hot
                    ]
                    k = np.sum(np.eye(n_output)[[
                        sigmoid_index - 1 for sigmoid_index in label_indices
                    ]],
                               axis=0)

                    label_encoding_sent[position + n_marks:posi_end +
                                        n_marks, :] = np.repeat([k],
                                                                posi_end -
                                                                position,
                                                                axis=0)

                elif activation == "softmax":
                    if "explicit" in type or "interval" in type:
                        target_label = process.get_explict_label(
                            info_new, interval, operator)
                    elif "implicit" in type.split("_"):
                        target_label = process.get_implict_label(
                            info_new, interval, operator)
                    for token_tag in target_label:
                        if token_tag in final_labels:
                            target_labels[token_tag] += 1.0

                    label_indices = [
                        output_one_hot[token_tag] for token_tag in target_label
                        if token_tag in final_labels
                    ]
                    if len(label_indices) != 0:
                        k = np.sum(np.eye(n_output)[[
                            softmax_index for softmax_index in label_indices
                        ]],
                                   axis=0)
                        label_encoding_sent[position + n_marks:posi_end +
                                            n_marks, :] = np.repeat([k],
                                                                    posi_end -
                                                                    position,
                                                                    axis=0)
                t = len(label_indices)
                if t >= 1:
                    # weight labelled positions by one randomly chosen class
                    # id; unlabelled positions keep weight 0
                    sample_weights_sent[position + n_marks:posi_end + n_marks] = \
                        label_indices[randint(0, t - 1)]
            sample_weights_output.append(sample_weights_sent)
            outputs.append(label_encoding_sent)
            total_with_timex += 1
            #print total_with_timex
    print(n_sent_total)
    sample_weights = np.asarray(sample_weights_output)
    sample_weights = get_sample_weights_multiclass(n_output, sample_weights,
                                                   0.05)
    #print target_labels
    np.save(
        model_path + "/sample_weights" + data_folder + "_" + type + "_" +
        activation, sample_weights)
    read.save_hdf5(
        model_path + "/output" + data_folder + "_" + type + "_" + activation,
        [type + "_" + activation], [outputs], ['int8'])
def generate_output_multiclass(sent_len,
                               model,
                               input,
                               doc_list_sub,
                               processed_path,
                               output_pred_path,
                               pred=True,
                               data_folder="",
                               format_abbre=".TimeNorm.system.completed.xml"):
    non_operator = read.textfile2list(non_operator_path)
    print('non_operator:', non_operator)
    operator = read.textfile2list(operator_path)
    print('operator:', operator)
    labels_index = [non_operator, operator, operator]
    print('labels_index:', labels_index)
    classes, probs = output.make_prediction_function_multiclass(
        input, model, output_pred_path)
    print('sent_len:', sent_len)
    print('classes:', classes)
    if pred:
        np.save(output_pred_path + "/y_predict_classes" + data_folder, classes)
        read.savein_pickle(output_pred_path + "/y_predict_proba" + data_folder,
                           probs)

    spans = list()
    int2labels = list()
    for index in range(len(classes)):
        class_loc = output.found_location_with_constraint(
            classes[index], sent_len)
        print('class_loc:', class_loc)
        span = output.loc2span(class_loc, probs[index], post_process=False)
        print('span:', span)
        spans.append(span)

        one_hot = read.counterList2Dict(list(enumerate(labels_index[index],
                                                       1)))
        # one_hot maps class id -> label (ids are 1-based; 0 is background)
        label2int = {label: idx for idx, label in one_hot.items()}
        int2label = {idx: label for label, idx in label2int.items()}
        int2labels.append(int2label)

    n_marks = 3
    sent_index = 0

    for data_id in range(0, len(doc_list_sub)):
        print('processing:', doc_list_sub[data_id])
        sent_spans = read.readfrom_json(
            os.path.join(processed_path, doc_list_sub[data_id] + "_sent"))
        data_span = list()
        for sent_span in sent_spans:
            for index in range(len(classes)):
                span_list = spans[index][sent_index]
                if len(span_list[0]) >= 1:
                    for posi_start, posi_end, label in span_list:
                        print('posi_start: %s posi_end: %s label: %s' %
                              (posi_start, posi_end, label))
                        data_span.append([
                            posi_start - n_marks + sent_span[1],
                            posi_end - n_marks + sent_span[1],
                            int2labels[index][label]
                        ])
            sent_index += 1
        print('data_span:', data_span)
        data = span2xmlfiles(data_span, doc_list_sub[data_id])
        output_path = os.path.join(output_pred_path, doc_list_sub[data_id],
                                   doc_list_sub[data_id])
        read.create_folder(output_path)
        data.to_file(output_path + format_abbre)
    del classes, probs, input