# NOTE: this module uses Python 2 syntax and assumes the project-local helpers
# `read` (JSON/text I/O), `extract_tag`, `get_explict_label`, and
# `get_implict_label` are available in scope.
import h5py
import numpy as np
from collections import OrderedDict


def get_word_tag(start, end):
    multi_labels = read.textfile2list("data/label/multi-hot.txt")
    multi_hot = read.counterList2Dict(list(enumerate(multi_labels, 1)))
    multi_hot = {y: x for x, y in multi_hot.iteritems()}

    raw_text_dir = read.read_from_json('raw_data_dir')
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    max_len = 0  # running maximum sentence length in words (106 in earlier runs)
    for data_id in range(start, end):
        xml_tags = read.read_from_json("training_sentence/xml_tags/" +
                                       raw_dir_simple[data_id])
        sent_spans = read.read_from_json(
            "training_sentence/word_level_sentence/" + raw_dir_simple[data_id])
        word_level_tags = list()
        for sent_index in range(len(sent_spans)):
            tags = list()
            for word_index in range(len(sent_spans[sent_index][0])):
                word_span = sent_spans[sent_index][1][word_index]
                if len(xml_tags[sent_index]) == 0:
                    tags.append(0)
                elif word_span[0] == int(xml_tags[sent_index][0][0]) and \
                        word_span[1] == int(xml_tags[sent_index][0][1][0]):
                    # Word span matches the next annotation exactly: map its
                    # first recognised label to a multi-hot index.
                    xml_tag = extract_tag(xml_tags[sent_index][0])
                    intersection = [x for x in xml_tag if x in multi_labels]
                    if len(intersection) > 0:
                        tags.append(multi_hot[intersection[0]])
                    else:
                        tags.append(0)  # keep one tag per word even when no known label matches
                    xml_tags[sent_index].pop(0)
                elif word_span[1] < int(xml_tags[sent_index][0][0]):
                    tags.append(0)  # word ends before the next annotation starts
                else:
                    tags.append(0)
                    # Discard annotations that end at or before the end of this word.
                    while len(xml_tags[sent_index]) > 0 and \
                            int(xml_tags[sent_index][0][1][0]) <= int(word_span[1]):
                        xml_tags[sent_index].pop(0)

            word_level_tags.append(tags)

            max_len = max(len(tags), max_len)
        print max_len
        read.save_in_json(
            "training_sentence/word_level_sentence_tag/" +
            raw_dir_simple[data_id], word_level_tags)
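
# A hedged illustration (not part of the original module) of the label-index
# mapping built at the top of get_word_tag: labels are enumerated from 1 and
# the dict is inverted so each label string maps to its integer id, leaving 0
# free for untagged words.  The label names below are hypothetical, not the
# real contents of data/label/multi-hot.txt.
def _label2int_example():
    example_labels = ["Year", "Month-Of-Year", "Day-Of-Month"]
    label2int = {label: idx for idx, label in enumerate(example_labels, 1)}
    assert label2int["Month-Of-Year"] == 2  # 0 stays reserved for "no label"
    return label2int
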
def prediction_debugging(char_x, predict, true, posi, instance_length, labels, char2int):

    #labels = read.textfile2list("data/label/one-hot_all.txt")
    one_hot = read.counterList2Dict(list(enumerate(labels, 1)))
    one_hot = {y:x for x,y in one_hot.iteritems()}

    int2char = dict((idx, char) for char, idx in char2int.items())
    int2label = dict((idx, label) for label, idx in one_hot.items())


    start, end = posi
    for index in range(start, end):
        raw_text = [int2char[i] for i in char_x[index][0:instance_length[index]]]
        gold_character = true[index]
        prediction = predict[index]
        imprecise = dict()
        imprecise_gold = dict()
        nonrecall = dict()
        imprecise_term = ""

        nonrecall_term = ""

        ####################### precision #############################
        for key in prediction.keys():
            if key not in gold_character or prediction[key] != gold_character[key]:
                imprecise[key] = int2label[prediction[key]]  # multiclass debugging
                #imprecise[key] = prediction[key]            # binary-class debugging
                if key in gold_character and prediction[key] != gold_character[key]:
                    imprecise_gold[key] = int2label[gold_character[key]]
                if (key - 1) in imprecise:
                    # Extend the current imprecise term with this character.
                    if key < len(raw_text):
                        imprecise_term += raw_text[key]
                else:
                    # Start a new imprecise term, prefixed with its position.
                    imprecise_term += " " + str(key) + ": " + raw_text[key]


        ########################### recall ##########################
        for key in gold_character.keys():
            if key not in prediction:
                nonrecall[key] = int2label[gold_character[key]]  # multiclass debugging
                #nonrecall[key] = gold_character[key]            # binary-class debugging
                if (key - 1) in nonrecall:
                    nonrecall_term += raw_text[key]
                else:
                    nonrecall_term += " " + str(key) + ": " + raw_text[key]

        if len(imprecise) > 0 or len(nonrecall) > 0:
            print "sentence", index, ''.join(raw_text)
        if len(imprecise) > 0:
            new1 = OrderedDict(sorted(imprecise.items(), key=lambda t: t[0]))
            new3 = OrderedDict(sorted(imprecise_gold.items(), key=lambda t: t[0]))

            print "imprecise: ", new1, "    gold:", new3
            print "imprecise: ", imprecise_term

        if len(nonrecall) > 0:
            new2 = OrderedDict(sorted(nonrecall.items(), key=lambda t: t[0]))
            print "non_recall: ", new2
            print "non_recall: ", nonrecall_term
        if len(imprecise) > 0 or len(nonrecall) > 0:
            print "\n"
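
# A minimal sketch (not part of the original module) of the per-position
# comparison that prediction_debugging performs: positions that are predicted
# but missing from the gold labels, or that carry a different label, count
# against precision; gold positions missing from the prediction count against
# recall.  Argument names and the example values are illustrative assumptions.
def _compare_positions(prediction, gold):
    # prediction, gold: dicts mapping character position -> label id
    imprecise = {k: v for k, v in prediction.items()
                 if k not in gold or gold[k] != v}
    nonrecall = {k: v for k, v in gold.items() if k not in prediction}
    return imprecise, nonrecall

# _compare_positions({3: 1, 4: 1, 9: 2}, {3: 1, 4: 2, 5: 2})
# -> ({4: 1, 9: 2}, {5: 2})
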
def get_interval_inputs(n_marks, outputfilename1, outputfilename2):
    raw_dir_simple = read.read_from_json('raw_dir_simple')
    data_size = 1171  ## overall
    max_len_text = 606 + 2 * n_marks
    explicit_labels1 = read.textfile2list("data/label/explicit_label1.txt")
    explicit_labels2 = read.textfile2list("data/label/explicit_label2.txt")

    labels = explicit_labels1 + explicit_labels2

    one_hot = read.counterList2Dict(list(enumerate(labels, 1)))
    one_hot = {y: x for x, y in one_hot.iteritems()}
    n_softmax = len(labels) + 2

    f = h5py.File("data/" + outputfilename1 + str(n_marks) + ".hdf5", "w")
    dset = f.create_dataset("input", (data_size, max_len_text), dtype='int8')
    total_with_timex = 0

    for data_id in range(10, 63):
        sent_spans = read.read_from_json("training_sentence/sentences/" +
                                         raw_dir_simple[data_id])
        xmltags = read.read_from_json("training_sentence/xml_tags/" +
                                      raw_dir_simple[data_id])
        n_sent = len(sent_spans)
        print raw_dir_simple[data_id]
        for index in range(n_sent):
            # Index layout per sentence (the slice bounds below assume n_marks == 3):
            # n_softmax - 1 marks the boundary positions, n_softmax marks unlabeled
            # in-sentence characters, 1..len(labels) are label ids, 0 is padding.
            softmax_index = np.zeros(max_len_text, dtype=np.int8)
            softmax_index[0:3] = n_softmax - 1
            sentence_start = sent_spans[index][1]
            sentence_stop = sent_spans[index][2]
            len_sentence = sentence_stop - sentence_start
            softmax_index[3:3 + len_sentence] = n_softmax
            softmax_index[3 + len_sentence:len_sentence + 6] = n_softmax - 1
            for label in xmltags[index]:
                posi, info = label
                position = int(posi) - sentence_start
                posi_end = int(info[0]) - sentence_start
                info.pop(0)
                info.pop(0)
                info_new = list(set(info))

                explicit_label = get_explict_label(info_new, explicit_labels1,
                                                   explicit_labels2)

                # Check whether explicit_label is one of the operator labels.
                if explicit_label in labels:
                    label2int = one_hot[explicit_label]
                    # Shift by n_marks to account for the boundary marks.
                    softmax_index[position + n_marks:posi_end + n_marks] = \
                        np.repeat(label2int, posi_end - position)

            dset[total_with_timex] = softmax_index
            total_with_timex += 1
    print total_with_timex

    data_size = 251
    f2 = h5py.File("data/" + outputfilename2 + str(n_marks) + ".hdf5", "w")
    dset2 = f2.create_dataset("input", (data_size, max_len_text), dtype='int8')
    total_with_timex = 0

    for data_id in range(0, 10):
        sent_spans = read.read_from_json("training_sentence/sentences/" +
                                         raw_dir_simple[data_id])
        xmltags = read.read_from_json("training_sentence/xml_tags/" +
                                      raw_dir_simple[data_id])
        n_sent = len(sent_spans)
        print raw_dir_simple[data_id]
        for index in range(n_sent):
            # Same per-sentence layout as above (slice bounds assume n_marks == 3).
            softmax_index = np.zeros(max_len_text, dtype=np.int8)
            softmax_index[0:3] = n_softmax - 1
            sentence_start = sent_spans[index][1]
            sentence_stop = sent_spans[index][2]
            len_sentence = sentence_stop - sentence_start
            softmax_index[3:3 + len_sentence] = n_softmax
            softmax_index[3 + len_sentence:len_sentence + 6] = n_softmax - 1
            for label in xmltags[index]:
                posi, info = label
                position = int(posi) - sentence_start
                posi_end = int(info[0]) - sentence_start
                info.pop(0)
                info.pop(0)
                info_new = list(set(info))

                explicit_label = get_explict_label(info_new, explicit_labels1,
                                                   explicit_labels2)

                # Check whether explicit_label is one of the operator labels.
                if explicit_label in labels:
                    label2int = one_hot[explicit_label]
                    # Shift by n_marks to account for the boundary marks.
                    softmax_index[position + n_marks:posi_end + n_marks] = \
                        np.repeat(label2int, posi_end - position)
            dset2[total_with_timex] = softmax_index
            total_with_timex += 1
    print total_with_timex
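
# A minimal sketch, under the assumption n_marks == 3 (the slice bounds above
# are hard-coded to 3), of the per-sentence index vector that
# get_interval_inputs fills: n_softmax - 1 marks the boundary positions,
# n_softmax marks unlabeled in-sentence characters, 1..len(labels) are label
# ids, and 0 is padding.  The helper below is illustrative only.
def _sentence_layout_example(len_sentence, n_marks, n_softmax, max_len_text):
    vec = np.zeros(max_len_text, dtype=np.int8)
    vec[0:n_marks] = n_softmax - 1                                          # leading marks
    vec[n_marks:n_marks + len_sentence] = n_softmax                         # unlabeled characters
    vec[n_marks + len_sentence:2 * n_marks + len_sentence] = n_softmax - 1  # trailing marks
    return vec
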
def get_multi_hot_labels_with_timex(
        n_marks, outputfilename1,
        outputfilename2):  # builds multi-hot (sigmoid) label tensors; a binary-class variant is commented out below
    from copy import deepcopy
    raw_dir_simple = read.read_from_json('raw_dir_simple')

    #data_size = 464   # total training sentences with a time expression (counts from printing total, total_with_timex over documents 0-63: 0:63 without time ex 1422, with time ex 558; 10:63 with time ex 464, without time ex 1171)
    #data_size = 1171  # total training sentences
    data_size = 278  # total training sentences with positive operators

    max_len_text = 606 + 2 * n_marks  # 606 = max sentence length without marks; NYT19980206.0466 has document_length = 10802
    #max_len_text = 10802 + 2 * n_marks  # document-level alternative

    multi_labels = read.textfile2list("data/label/multi-hot.txt")
    ############### multiclass classification #############
    multi_hot = read.counterList2Dict(list(enumerate(multi_labels, 1)))
    multi_hot = {y: x for x, y in multi_hot.iteritems()}
    n_sigmoid = len(multi_labels) + 1
    #####################binary_classification ############
    # one_hot = {label: 1 for label in labels}
    # n_softmax = 1
    #######################################################

    f = h5py.File("data/" + outputfilename1 + str(n_marks) + ".hdf5", "w")
    dset = f.create_dataset("input", (data_size, max_len_text, n_sigmoid),
                            dtype='int8')
    total_with_timex = 0
    #n_sents = list()
    for data_id in range(10, 63):
        xmltags = read.read_from_json("training_sentence/xml_tags/" +
                                      raw_dir_simple[data_id])
        sent_spans = read.read_from_json("training_sentence/sentences/" +
                                         raw_dir_simple[data_id])

        n_sent = len(xmltags)
        #print n_sent
        #n_sents.append(n_sent)
        for index in range(n_sent):
            sigmoid_labels = np.zeros((max_len_text, n_sigmoid), dtype=np.int8)
            sigmoid_labels[:, 0] = 1
            sentence_start = sent_spans[index][1]
            # Only keep sentences that contain a time expression; dropping this
            # check (and dedenting the block below) would include every sentence.
            if len(xmltags[index]) != 0:
                a = deepcopy(xmltags[index])
                xml_tag = extract_tag(a)
                intersection = [x for x in xml_tag if x in multi_labels]
                if len(intersection) > 0:
                    for label in xmltags[index]:
                        posi, info = label
                        position = int(posi) - sentence_start
                        posi_end = int(info[0]) - sentence_start
                        info.pop(0)
                        info.pop(0)
                        info_new = list(set(info))

                        sigmoid_index = list()
                        for tag_label in info_new:
                            if tag_label in multi_labels:
                                sigmoid_index.append(multi_hot[tag_label])
                        if len(sigmoid_index) != 0:
                            # One multi-hot row, repeated for every character in the span.
                            k = np.sum(np.eye(n_sigmoid)[sigmoid_index], axis=0)
                            sigmoid_labels[position + n_marks:posi_end + n_marks, :] = \
                                np.repeat([k], posi_end - position, axis=0)
                        ##################### add marks ########################################
                        # softmax_index[position + n_marks:posi_end + n_marks] = np.repeat(index, posi_end - position)
                        ##################### without marks ####################################
                        # softmax_index[position:posi_end] = np.repeat(index, posi_end - position)

                    ############################  multiclass ####################################
                    #softmax_labels = np.eye(n_softmax)[softmax_index]
                    ############################  binaryclass ###################################
                    #softmax_labels = softmax_index.reshape(softmax_index.shape + (1,))
                    ##########################################################################
                    dset[total_with_timex] = sigmoid_labels
                    total_with_timex += 1
    print total_with_timex

    data_size = 251
    f1 = h5py.File("data/" + outputfilename2 + str(n_marks) + ".hdf5", "w")
    dset1 = f1.create_dataset("input", (data_size, max_len_text, n_sigmoid),
                              dtype='int8')

    total_with_timex = 0

    for data_id in range(0, 10):
        xmltags = read.read_from_json("training_sentence/xml_tags/" +
                                      raw_dir_simple[data_id])
        sent_spans = read.read_from_json("training_sentence/sentences/" +
                                         raw_dir_simple[data_id])

        n_sent = len(xmltags)
        for index in range(n_sent):
            sigmoid_labels = np.zeros((max_len_text, n_sigmoid), dtype=np.int8)
            sigmoid_labels[:, 0] = 1
            sentence_start = sent_spans[index][1]

            for label in xmltags[index]:
                posi, info = label
                position = int(posi) - sentence_start
                posi_end = int(info[0]) - sentence_start
                info.pop(0)
                info.pop(0)
                info_new = list(set(info))

                sigmoid_index = list()
                for tag_label in info_new:
                    if tag_label in multi_labels:
                        sigmoid_index.append(multi_hot[tag_label])
                if len(sigmoid_index) != 0:
                    k = np.sum(np.eye(n_sigmoid)[sigmoid_index], axis=0)
                    sigmoid_labels[position + n_marks:posi_end + n_marks, :] = \
                        np.repeat([k], posi_end - position, axis=0)

            dset1[total_with_timex] = sigmoid_labels
            total_with_timex += 1
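
# A hedged sketch (illustrative values only) of how a multi-hot row is built
# above: each label id selects a row of the identity matrix and the rows are
# summed into a single 0/1 vector per character position, with column 0
# reserved for "no label".
def _multi_hot_row_example():
    n_sigmoid = 5
    sigmoid_index = [1, 3]  # two label ids attached to one annotated span
    row = np.sum(np.eye(n_sigmoid)[sigmoid_index], axis=0)
    # row == array([0., 1., 0., 1., 0.])
    return row
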
def get_one_hot_labels_with_timex(
        n_marks, outputfilename1,
        outputfilename2):  # builds one-hot (softmax) label tensors; a binary-class variant is commented out below
    raw_dir_simple = read.read_from_json('raw_dir_simple')

    data_size = 1171  # total training sentences with time ex (counts from printing total, total_with_timex over documents 0-63: 0:63 without time ex 1422, with time ex 558; 10:63 with time ex 464, without time ex 1171)
    #data_size = 395  # total training sentences
    max_len_text = 606 + 2 * n_marks  # 606 = max sentence length without marks; NYT19980206.0466 has document_length = 10802
    #max_len_text = 10802 + 2 * n_marks  # document-level alternative
    explicit_labels1 = read.textfile2list("data/label/explicit_label1_new.txt")
    explicit_labels2 = read.textfile2list("data/label/explicit_label2.txt")

    labels = explicit_labels2

    ############### multiclass classification #############
    one_hot = read.counterList2Dict(list(enumerate(labels, 1)))
    one_hot = {y: x for x, y in one_hot.iteritems()}
    n_softmax = len(labels) + 1
    #####################binary_classification ############
    # one_hot = {label: 1 for label in labels}
    # n_softmax = 1
    #######################################################

    f = h5py.File("data/" + outputfilename1 + str(n_marks) + ".hdf5", "w")
    dset = f.create_dataset("input", (data_size, max_len_text, n_softmax),
                            dtype='int8')
    total_with_timex = 0

    for data_id in range(10, 63):
        xmltags = read.read_from_json("training_sentence/xml_tags/" +
                                      raw_dir_simple[data_id])
        sent_spans = read.read_from_json("training_sentence/sentences/" +
                                         raw_dir_simple[data_id])

        n_sent = len(xmltags)
        for index in range(n_sent):
            softmax_index = np.zeros(max_len_text, dtype=np.int8)
            sentence_start = sent_spans[index][1]

            # The commented-out check below would restrict the output to
            # sentences that contain a time expression (dedent the loop below
            # accordingly):
            # if not len(xmltags[index]) == 0:
            #     xml_tag = extract_tag1(xmltags[index], explicit_labels1, explicit_labels2)
            #     intersection = [x for x in xml_tag if x in labels]
            #     if len(intersection) > 0:
            for label in xmltags[index]:
                posi, info = label
                position = int(posi) - sentence_start
                posi_end = int(info[0]) - sentence_start
                info.pop(0)
                info.pop(0)
                info_new = list(set(info))

                explicit_label = get_implict_label(info_new, explicit_labels1,
                                                   explicit_labels2)

                # Check whether explicit_label is one of the operator labels.
                if explicit_label in explicit_labels2:
                    label2int = one_hot[explicit_label]
                    # Shift by n_marks to account for the boundary marks.
                    softmax_index[position + n_marks:posi_end + n_marks] = \
                        np.repeat(label2int, posi_end - position)
                ##################### without marks ########################################
                # softmax_index[position:posi_end] = np.repeat(index, posi_end - position)

            ############################  multiclass ####################################
            softmax_labels = np.eye(n_softmax)[softmax_index]
            ############################  binaryclass ###################################
            #softmax_labels = softmax_index.reshape(softmax_index.shape + (1,))
            ##########################################################################
            dset[total_with_timex] = softmax_labels
            total_with_timex += 1
    print total_with_timex

    data_size = 251
    f1 = h5py.File("data/" + outputfilename2 + str(n_marks) + ".hdf5", "w")
    dset1 = f1.create_dataset("input", (data_size, max_len_text, n_softmax),
                              dtype='int8')

    total_with_timex = 0

    for data_id in range(0, 10):
        xmltags = read.read_from_json("training_sentence/xml_tags/" +
                                      raw_dir_simple[data_id])
        sent_spans = read.read_from_json("training_sentence/sentences/" +
                                         raw_dir_simple[data_id])

        n_sent = len(xmltags)
        for index in range(n_sent):
            softmax_index = np.zeros(max_len_text, dtype=np.int8)
            sentence_start = sent_spans[index][1]
            if not len(xmltags[index]) == 0:
                for label in xmltags[index]:
                    posi, info = label
                    position = int(posi) - sentence_start
                    posi_end = int(info[0]) - sentence_start
                    info.pop(0)
                    info.pop(0)
                    info_new = list(set(info))
                    explicit_label = get_implict_label(info_new,
                                                       explicit_labels1,
                                                       explicit_labels2)
                    # Check whether explicit_label is one of the operator labels.
                    if explicit_label in explicit_labels2:
                        label2int = one_hot[explicit_label]
                        # Shift by n_marks to account for the boundary marks.
                        softmax_index[position + n_marks:posi_end + n_marks] = \
                            np.repeat(label2int, posi_end - position)
                    ##################### without marks ########################################
                    # softmax_index[position:posi_end] = np.repeat(index, posi_end - position)

            ############################  multiclass ####################################
            softmax_labels = np.eye(n_softmax)[softmax_index]
            ############################  binaryclass ###################################
            #softmax_labels = softmax_index.reshape(softmax_index.shape + (1,))
            ##########################################################################
            dset1[total_with_timex] = softmax_labels
            total_with_timex += 1
    print total_with_timex
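
# A hedged sketch (illustrative sizes, not the real dataset dimensions) of the
# one-hot expansion used above: indexing np.eye(n_softmax) with the
# per-character class ids turns a (max_len_text,) vector of ids into a
# (max_len_text, n_softmax) matrix of one-hot rows.
def _one_hot_expansion_example():
    n_softmax = 4
    softmax_index = np.array([0, 2, 2, 1], dtype=np.int8)
    softmax_labels = np.eye(n_softmax)[softmax_index]
    assert softmax_labels.shape == (4, 4)
    return softmax_labels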