Code example #1
import class_defs


def create_markable_for_coref_id_and_str(doc_obj, sent_obj, coref_id,
                                         ante_str):
    # spacy_get_tokenized_word is a helper defined elsewhere in the project.
    tokenized_ante_str = spacy_get_tokenized_word(doc_obj, ante_str)
    max_len = len(sent_obj.word_list)
    len_of_ante_str = len(tokenized_ante_str)
    max_start_idx = -1
    max_end_idx = -1

    if len_of_ante_str == 0:
        return

    # Scan for the first occurrence of the tokenized antecedent. The upper
    # bound keeps the inner comparison from running past the word list.
    for i in range(max_len - len_of_ante_str + 1):
        # Check if the first token matches before comparing the rest.
        if sent_obj.word_list[i].word != tokenized_ante_str[0]:
            continue
        match = True
        for j in range(1, len_of_ante_str):
            if sent_obj.word_list[i + j].word != tokenized_ante_str[j]:
                match = False
                break
        if match:
            # The full antecedent pattern was found in the sentence.
            max_start_idx = i
            max_end_idx = i + len_of_ante_str - 1
            break

    if max_start_idx != -1 and max_end_idx != -1:
        markable_obj = class_defs.markable(max_start_idx, max_end_idx, -1, -1,
                                           coref_id,
                                           class_defs.MARKABLE_FLAG_ANTECEDENT)
        sent_obj.gold_markables.append(markable_obj)
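
All six examples lean on a project-local class_defs module and simple word/sentence containers whose definitions are not part of these listings. The sketch below is a hypothetical reconstruction, inferred only from the attributes the functions touch; the names of the third and fourth markable fields are guesses. Saved as class_defs.py (with word and sentence alongside), it is enough to run the usage snippets that follow.

# Hypothetical reconstruction; only the attribute names actually used by
# the examples are known, the rest is assumption.
MARKABLE_FLAG_ANTECEDENT = 1
MARKABLE_FLAG_ANAPHOR = 2


class markable:
    def __init__(self, w_s_idx, w_e_idx, min_s_idx, min_e_idx, coref_id,
                 flags):
        self.w_s_idx = w_s_idx      # start word index of the (max) span
        self.w_e_idx = w_e_idx      # end word index of the (max) span
        self.min_s_idx = min_s_idx  # minimal-span start, -1 if unset (name assumed)
        self.min_e_idx = min_e_idx  # minimal-span end, -1 if unset (name assumed)
        self.coref_id = coref_id    # coreference chain ID, 0 if unassigned
        self.flags = flags          # antecedent / anaphor marker


class word:
    def __init__(self, word, pos_tag, NER_tag, chunk_tag):
        self.word = word            # surface token
        self.pos_tag = pos_tag      # Penn Treebank POS tag, e.g. "NNP"
        self.NER_tag = NER_tag      # BIO entity tag: "B", "I", or "O"
        self.chunk_tag = chunk_tag  # BIO chunk tag, e.g. "B-NP"


class sentence:
    def __init__(self, word_list):
        self.word_list = word_list   # list of word objects
        self.markables = []          # system markables, sorted by w_s_idx
        self.gold_markables = []     # gold markables from the key file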
Code example #2
import class_defs


def take_care_of_missed_antecedents(doc_obj, sent_obj, sent_num):
    # Filter the gold markables down to the antecedents.
    antecedent_markables = [
        m for m in sent_obj.gold_markables
        if m.flags == class_defs.MARKABLE_FLAG_ANTECEDENT
    ]

    # No gold antecedents, so nothing can have been missed.
    if not antecedent_markables:
        return

    for g_marker in antecedent_markables:
        # Check if this g_marker's identical twin is found in our markable
        # list, which is kept sorted by start index.
        inserted_or_found = False
        for j, o_marker in enumerate(sent_obj.markables):
            if (o_marker.w_s_idx == g_marker.w_s_idx
                    and o_marker.w_e_idx == g_marker.w_e_idx):
                if o_marker.flags != g_marker.flags:
                    # Same span but mismatched flags: mark it as an antecedent.
                    o_marker.flags = class_defs.MARKABLE_FLAG_ANTECEDENT
                inserted_or_found = True
                break
            elif o_marker.w_s_idx > g_marker.w_s_idx:
                # We have passed the sorted position without finding the
                # span, so insert a copy of the gold markable here.
                new_marker = class_defs.markable(g_marker.w_s_idx,
                                                 g_marker.w_e_idx, -1, -1,
                                                 g_marker.coref_id,
                                                 g_marker.flags)
                sent_obj.markables.insert(j, new_marker)
                inserted_or_found = True
                break

        if not inserted_or_found:
            # The gold span starts after every detected markable: append it.
            new_marker = class_defs.markable(g_marker.w_s_idx,
                                             g_marker.w_e_idx, -1, -1,
                                             g_marker.coref_id, g_marker.flags)
            sent_obj.markables.append(new_marker)
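
A minimal check of the reconciliation path, using the hypothetical stubs sketched after example #1 (assuming they are importable as class_defs). The doc_obj and sent_num arguments are unused by the function, so placeholders suffice:

from class_defs import markable, sentence, MARKABLE_FLAG_ANTECEDENT

sent = sentence([])
sent.markables = [markable(0, 1, -1, -1, 0, 0),
                  markable(5, 6, -1, -1, 0, 0)]
sent.gold_markables = [markable(3, 4, -1, -1, "7",
                                MARKABLE_FLAG_ANTECEDENT)]
take_care_of_missed_antecedents(None, sent, 0)
# The missed gold span (3, 4) is inserted between the detected spans,
# keeping sent.markables sorted: (0, 1), (3, 4), (5, 6).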
Code example #3
import class_defs


def spacy_compute_markable_table(sent_obj):
    len_lst = len(sent_obj.word_list)
    markable_lst = []
    m_start_idx = -1
    m_end_idx = -1

    for i in range(len_lst):
        curr_word = sent_obj.word_list[i]
        pos_tag = curr_word.pos_tag
        NER_tag = curr_word.NER_tag
        np_tag = curr_word.chunk_tag

        if (np_tag == "B-NP" or NER_tag == "B") and m_start_idx == -1:
            # A noun phrase or named entity begins here.
            m_start_idx = i
            m_end_idx = i
        elif np_tag == "I-NP" or NER_tag == "I":
            # Continue the span in progress.
            m_end_idx = i
        elif np_tag == "B-NP" or NER_tag == "B":
            # A new chunk starts immediately after the previous one: emit
            # the open span before starting a new one, so back-to-back
            # phrases are not lost.
            markable_lst.append(
                class_defs.markable(m_start_idx, m_end_idx, -1, -1, 0, 0))
            m_start_idx = i
            m_end_idx = i
        else:
            if m_start_idx != -1:
                # The span in progress has ended; emit it.
                markable_lst.append(
                    class_defs.markable(m_start_idx, m_end_idx, -1, -1, 0, 0))
                m_start_idx = -1
                m_end_idx = -1

            if pos_tag in ("PRP", "PRP$", "WP", "WP$",
                           "NN", "NNS", "NNP", "NNPS"):
                # A pronoun or noun outside any chunk is its own markable.
                markable_lst.append(class_defs.markable(i, i, -1, -1, 0, 0))

    if m_start_idx != -1:
        # The sentence ended while a span was still open.
        markable_lst.append(
            class_defs.markable(m_start_idx, m_end_idx, -1, -1, 0, 0))

    # Debug output: one line per markable.
    for marker in markable_lst:
        print()
        for i in range(marker.w_s_idx, marker.w_e_idx + 1):
            print(sent_obj.word_list[i].word, " ", end="")

    return markable_lst
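
A quick sanity check against the stub classes sketched after example #1 (the tag values follow the BIO and Penn Treebank conventions the function tests for):

from class_defs import sentence, word

words = [word("John", "NNP", "B", "B-NP"),
         word("saw", "VBD", "O", "O"),
         word("the", "DT", "O", "B-NP"),
         word("dog", "NN", "O", "I-NP")]
table = spacy_compute_markable_table(sentence(words))
# Prints and returns two markables: (0, 0) for "John" and
# (2, 3) for "the dog".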
Code example #4
import class_defs


def compute_markable_table(sent_obj):
    len_lst = len(sent_obj.word_list)
    markable_lst = []
    markable_obj = None

    for i in range(len_lst):
        curr_word = sent_obj.word_list[i]
        pos_tag = curr_word.pos_tag
        np_tag = curr_word.chunk_tag

        # A pronoun or noun that is not part of any noun phrase becomes a
        # single-token markable.
        if (pos_tag in ("PRP", "PRP$", "WP", "WP$",
                        "NN", "NNS", "NNP", "NNPS") and np_tag == "O"):
            markable_lst.append(class_defs.markable(i, i, -1, -1, 0, 0))
            continue

        if np_tag == "B-NP":
            # A noun phrase begins here.
            markable_obj = class_defs.markable(i, i, -1, -1, 0, 0)
        elif np_tag == "I-NP" and markable_obj is not None:
            # Extend the noun phrase in progress.
            markable_obj.w_e_idx = i

        # Emit the phrase when the sentence ends or the next token does
        # not continue it (this also keeps back-to-back phrases separate).
        if markable_obj is not None and np_tag in ("B-NP", "I-NP"):
            if (i == len_lst - 1
                    or sent_obj.word_list[i + 1].chunk_tag != "I-NP"):
                markable_lst.append(markable_obj)
                markable_obj = None

    return markable_lst
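
This variant differs from example #3 only in ignoring the NER tags; the same stub sentence from the previous snippet yields the same table:

table = compute_markable_table(sentence(words))
# Again (0, 0) for "John" and (2, 3) for "the dog"; a noun or pronoun
# whose chunk tag is "O" comes out as its own single-token markable.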
Code example #5
import nltk

import class_defs


def extract_markables_from_input_file(doc_obj, line_num, sent_tag_unrem,
                                      sent_tag_rem):
    coref_id_string = ""
    antecedent = None
    sent_tag_unrem = nltk.word_tokenize(sent_tag_unrem)
    sent_tag_rem = nltk.word_tokenize(sent_tag_rem)
    begin_index = -1
    end_index = -1
    max_len = len(sent_tag_unrem)
    number_of_completed_corefs = 0
    for index in range(max_len):
        tok = sent_tag_unrem[index]
        if tok == "ID=":
            # Check that this "ID=" comes from a <COREF ID="X"> tag.
            if (index >= 2 and index + 2 < max_len
                    and sent_tag_unrem[index - 2] == "<"
                    and sent_tag_unrem[index - 1] == "COREF"
                    and sent_tag_unrem[index + 1] == "''"):
                coref_id_string = sent_tag_unrem[index + 2]
                # The antecedent starts 5 tokens past "ID=", after the
                # quoted ID and the closing ">".
                begin_index = index + 5
        elif (tok == "<" and index + 1 < max_len
                and sent_tag_unrem[index + 1] == "/COREF"
                and begin_index != -1):
            antecedent = sent_tag_unrem[begin_index:index]
            # Map the indices into the tag-removed tokenization:
            #   7 tokens for <S ID="X">
            #   7 tokens for the current <COREF ID="X">
            #  10 tokens (7 + 3) for each completed <COREF ...></COREF> pair
            begin_index = begin_index - (number_of_completed_corefs *
                                         10) - 7 - 7
            end_index = index - (number_of_completed_corefs * 10) - 7 - 7 - 1
            create_markable_flag = True

            if antecedent != sent_tag_rem[begin_index:end_index + 1]:
                # The recovered span does not line up; skip this one.
                #print ("Mismatched Antecedent")
                #print ("Coreference ID ", coref_id_string, "Unremoved Antecedent ", antecedent)
                #print ("Coreference ID ", coref_id_string, "Removed Antecedent ", sent_tag_rem[begin_index:end_index + 1])
                create_markable_flag = False

            # Create a markable_obj for the antecedent span.
            if create_markable_flag:
                markable_obj = class_defs.markable(
                    begin_index, end_index, -1, -1, coref_id_string,
                    class_defs.MARKABLE_FLAG_ANTECEDENT)
                sent_obj = doc_obj.sentences[line_num]
                sent_obj.gold_markables.append(markable_obj)
            begin_index = -1
            number_of_completed_corefs += 1
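
The 7/7/10 offsets above depend entirely on how nltk.word_tokenize splits the SGML-style tags. A quick check (exact token counts can shift across NLTK versions, so treat this as an illustration consistent with the "''" test in the code above):

import nltk

print(nltk.word_tokenize('<S ID="9">'))
# ['<', 'S', 'ID=', "''", '9', "''", '>']       -> 7 tokens
print(nltk.word_tokenize('<COREF ID="4">'))
# ['<', 'COREF', 'ID=', "''", '4', "''", '>']   -> 7 tokens
print(nltk.word_tokenize('</COREF>'))
# ['<', '/COREF', '>']                          -> 3 tokens
# So 7 + 7 tokens precede the first antecedent, and each completed
# <COREF ...></COREF> pair accounts for 10 (7 + 3) extra tokens.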
Code example #6
import nltk

import class_defs


def handle_key_file(doc_obj, kfp):
    coref_id_string = ""
    for line in kfp:
        line = line.strip('\n')
        if len(line) < 2:
            continue

        # Pattern check: does the line introduce a coreference ID?
        if "<COREF ID=" in line:
            tokens = nltk.word_tokenize(line)
            # For a line like <COREF ID="12">, token 4 is the ID itself.
            coref_id_string = tokens[4]
        else:
            # The line carries brace-delimited fields:
            # {sentence_num} {max span} {min span}
            list_of_str = []
            string_required = ""
            for ch in line:
                if ch == "{":
                    string_required = ""
                elif ch == "}":
                    list_of_str.append(string_required.lstrip(' '))
                    string_required = ""
                else:
                    string_required += ch
            #Debug Print
            #print ("Sentence Num :", list_of_str[0], "Max :", list_of_str[1], "Min :", list_of_str[2])
            # Now that all the anaphor fields are in a list:
            # 1. Get the sentence from the doc.
            # 2. Tokenize the max and min strings.
            # 3. Scan the sentence word_list for the span each one covers.
            sentence_obj = doc_obj.sentences[int(list_of_str[0])]
            tokenized_max = nltk.word_tokenize(list_of_str[1])
            tokenized_min = nltk.word_tokenize(list_of_str[2])
            max_len = len(sentence_obj.word_list)
            max_start_idx = -1
            max_end_idx = -1
            min_start_idx = -1
            min_end_idx = -1

            len_of_max_str = len(tokenized_max)
            for i in range(max_len - len_of_max_str + 1):
                # Check if the first token matches before comparing the rest.
                if sentence_obj.word_list[i].word != tokenized_max[0]:
                    continue
                match = True
                for j in range(1, len_of_max_str):
                    if sentence_obj.word_list[i + j].word != tokenized_max[j]:
                        match = False
                        break
                if match:
                    # The max pattern was found in the sentence.
                    max_start_idx = i
                    max_end_idx = i + len_of_max_str - 1
                    break

            len_of_min_str = len(tokenized_min)
            for i in range(max_len - len_of_min_str + 1):
                if sentence_obj.word_list[i].word != tokenized_min[0]:
                    continue
                match = True
                for j in range(1, len_of_min_str):
                    if sentence_obj.word_list[i + j].word != tokenized_min[j]:
                        match = False
                        break
                if match:
                    # The min pattern was found in the sentence.
                    min_start_idx = i
                    min_end_idx = i + len_of_min_str - 1
                    break

            markable_obj = class_defs.markable(
                max_start_idx, max_end_idx, min_start_idx, min_end_idx,
                coref_id_string, class_defs.MARKABLE_FLAG_ANAPHOR)
            sentence_obj.gold_markables.append(markable_obj)
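
handle_key_file expects doc_obj.sentences to be populated already and kfp to be an open file handle. A hypothetical driver (build_doc stands in for the project's actual document-loading code, which is not shown in these listings):

doc_obj = build_doc("input.txt")  # hypothetical project loader
with open("input.key") as kfp:
    handle_key_file(doc_obj, kfp)
for n, sent in enumerate(doc_obj.sentences):
    print(n, len(sent.gold_markables), "gold markables")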