def evaluate(xml_path, output_pred_path, raw_data_path, doc_list,
             output_format):
    gold_count = 0
    pred_count = 0
    true_count = 0
    print('xml_path: %s' % xml_path)
    print('doc_list length: %d' % len(doc_list))
    for file_id in range(len(doc_list)):
        # read.readfrom_json presumably appends its own extension; the
        # existence check therefore adds ".txt" explicitly.
        tag_path = os.path.join(xml_path, doc_list[file_id] + "_tag")
        print('path: %s' % tag_path)
        print('path exists: %s' % os.path.exists(tag_path + ".txt"))
        if os.path.exists(tag_path + ".txt"):
            gold_tag_dict = get_gold_dict(read.readfrom_json(tag_path))
            output_path = os.path.join(output_pred_path, doc_list[file_id],
                                       doc_list[file_id] + output_format)
            raw_text_path = os.path.join(raw_data_path, doc_list[file_id])
            pre_tag_dict = process.extract_xmltag_anafora_pred(
                output_path, read.readfrom_txt(raw_text_path))
            scores = calculate_score(gold_tag_dict, pre_tag_dict)
            gold_count += scores[0]
            pred_count += scores[1]
            true_count += scores[2]
            # Report running totals after each document.
            metrics(true_count, pred_count, gold_count)
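
# Hedged sketch: `metrics` is not defined in this snippet. Judging from the
# call site above, it takes (true_count, pred_count, gold_count); a minimal
# stand-in computing precision/recall/F1 could look like this (the name and
# output format are assumptions):
def metrics_sketch(true_count, pred_count, gold_count):
    precision = true_count / pred_count if pred_count else 0.0
    recall = true_count / gold_count if gold_count else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    print('precision: %.4f  recall: %.4f  f1: %.4f' % (precision, recall, f1))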
Example #2
def document_level_2_sentence_level(file_dir, raw_data_path, preprocessed_path,
                                    xml_path, file_format):

    max_len_all = list()
    char_vocab = defaultdict(float)

    for data_id in range(0, len(file_dir)):
        raw_text_path = os.path.join(raw_data_path, file_dir[data_id],
                                     file_dir[data_id])
        preprocessed_file_path = os.path.join(preprocessed_path,
                                              file_dir[data_id],
                                              file_dir[data_id])

        raw_text = read.readfrom_txt(raw_text_path)
        raw_text = process.text_normalize(raw_text)
        sent_span_list_file, max_len_file, char_vocab = split_by_sentence(
            raw_text, char_vocab)

        max_len_all += max_len_file
        read.savein_json(preprocessed_file_path + "_sent", sent_span_list_file)
        if xml_path != "":
            xml_file_path = os.path.join(xml_path, file_dir[data_id],
                                         file_dir[data_id] + file_format)
            posi_info_dict = process.extract_xmltag_anafora(
                xml_file_path, raw_text)
            sent_tag_list_file = xml_tag_in_sentence(sent_span_list_file,
                                                     posi_info_dict)
            read.savein_json(preprocessed_file_path + "_tag",
                             sent_tag_list_file)

    max_len_all.sort(reverse=True)
    max_len_file_name = "/".join(
        preprocessed_path.split('/')[:-1]) + "/max_len_sent"
    read.savein_json(max_len_file_name, max_len_all)
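
# Hedged sketch: `split_by_sentence` is not shown in these examples. A
# minimal stand-in consistent with its call signature above -- returning
# (sentence, start, end) spans, a list of sentence lengths, and the updated
# character vocabulary -- might be (the regex and names are assumptions):
import re

def split_by_sentence_sketch(raw_text, char_vocab):
    sent_span_list, sent_lengths = [], []
    for match in re.finditer(r'[^.!?]+[.!?]?', raw_text):
        sentence = match.group()
        if not sentence.strip():
            continue
        sent_span_list.append((sentence, match.start(), match.end()))
        sent_lengths.append(len(sentence))
        for char in sentence:
            char_vocab[char] += 1.0  # running character counts
    return sent_span_list, sent_lengths, char_vocab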
Example #3
def textfile2list_twa_st():
    # file_path_st and twa_cuis are module-level globals defined elsewhere.
    data = read.readfrom_txt(file_path_st)
    txt_list = {}  # maps CUI (column 0) -> semantic-type value (column 2)
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in twa_cuis:
            txt_list[line[0]] = line[2]
    read.save_in_json("data/TwADR-L/cui_st_dict", txt_list)
Example #4
def document_level_2_sentence_level(file_dir, raw_data_path, preprocessed_path,
                                    xml_path, file_format):

    max_len_all = list()

    char_vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    unicode_vocab = defaultdict(float)
    word_vocab = defaultdict(float)

    for data_id in range(0, len(file_dir)):
        raw_text_path = os.path.join(raw_data_path, file_dir[data_id],
                                     file_dir[data_id])
        preprocessed_file_path = os.path.join(preprocessed_path,
                                              file_dir[data_id],
                                              file_dir[data_id])

        raw_text = read.readfrom_txt(raw_text_path)
        raw_text = process.text_normalize(raw_text)
        sent_span_list_file, max_len_file, char_vocab = split_by_sentence(
            raw_text, char_vocab)

        max_len_all += max_len_file

        pos_sentences, pos_vocab = process.get_pos_sentence(
            sent_span_list_file, pos_vocab)
        # Cached alternative:
        # pos_sentences = read.readfrom_json("data/pos_sentences")
        # read.savein_json("data/pos_sentences", pos_sentences)
        word_sentences, word_vocab = process.get_words(sent_span_list_file,
                                                       word_vocab)
        pos_sentences_character = process.word_pos_2_character_pos(
            sent_span_list_file, pos_sentences)
        unicode_sentences_character, unicode_vocab = process.get_unicode(
            sent_span_list_file, unicode_vocab)

        read.savein_json(preprocessed_file_path + "_sent", sent_span_list_file)
        read.savein_json(preprocessed_file_path + "_pos",
                         pos_sentences_character)
        read.savein_json(preprocessed_file_path + "_unicodecategory",
                         unicode_sentences_character)
        read.savein_json(preprocessed_file_path + "_words", word_sentences)
        if xml_path != "":
            xml_file_path = os.path.join(xml_path, file_dir[data_id],
                                         file_dir[data_id] + file_format)
            posi_info_dict = process.extract_xmltag_anafora(
                xml_file_path, raw_text)
            sent_tag_list_file = xml_tag_in_sentence(sent_span_list_file,
                                                     posi_info_dict)
            read.savein_json(preprocessed_file_path + "_tag",
                             sent_tag_list_file)

    #read.savein_json("data/word_vocab", word_vocab)
    max_len_all.sort(reverse=True)
    max_len_file_name = "/".join(
        preprocessed_path.split('/')[:-1]) + "/max_len_sent"
    read.savein_json(max_len_file_name, max_len_all)
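
# Hedged sketch: `process.get_unicode` is not shown. The vocabularies above
# are defaultdict(float) counters; for the unicode-category features, a
# minimal per-character counter could use the standard library (the function
# name is an assumption):
import unicodedata
from collections import defaultdict

def count_unicode_categories(text, unicode_vocab=None):
    vocab = unicode_vocab if unicode_vocab is not None else defaultdict(float)
    for char in text:
        # unicodedata.category returns labels such as 'Lu', 'Nd', 'Po'.
        vocab[unicodedata.category(char)] += 1.0
    return vocab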
Example #5
def textfile2list_smm4h_st():
    file_path_st = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRSTY.RRF"
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    cuis = [cui_list[0] for cui_list in code_cuis.values()]  # first CUI per code
    data = read.readfrom_txt(file_path_st)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            txt_list[line[0]] = line[2]
    read.save_in_json("data/SMM4H/cui_st_dict", txt_list)
Example #6
def textfile2list_twa():
    data = read.readfrom_txt(file_path_synonym)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in twa_cuis:
            txt_list.setdefault(line[0], []).append(line[14])

    read.save_in_json("data/TwADR-L/cui_dict", txt_list)
Example #7
def textfile2list_smm4h():
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    cuis = [cui_list[0] for cui_list in code_cuis.values()]  # first CUI per code
    file_path_synonym = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    data = read.readfrom_txt(file_path_synonym)
    txt_list = []
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            txt_list.append(line)

    read.save_in_json("data/SMM4H/synonyms_all", txt_list)
Example #8
def textfile2list_twa():
    data = read.readfrom_txt(file_path)
    cuis_twa = []
    txt_list = []
    for line in data.splitlines():
        if "SNO" in line:
            print(line)
        line = line.split('|')
        if line[0] in twa:
            cuis_twa.append(line[0])
            txt_list.append(line)

    read.save_in_json("data/TwADR-L/synonyms", txt_list)
    read.save_in_json("data/TwADR-L/cuis", list(set(cuis_twa)))
Example #9
def textfile2list_smm4h():
    file_path = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    smm4h = read.read_from_json("data/SMM4H/labels_ori")

    data = read.readfrom_txt(file_path)
    cuis_smm4h = []
    txt_list = []
    for line in data.splitlines():
        # if "MDR" in line:
        #     print(line)
        line = line.split('|')
        if "MDR" in line[11]:
            cuis_smm4h.append(line[0])
            txt_list.append(line)

    read.save_in_json("data/SMM4H/synonyms", txt_list)
    read.save_in_json("data/SMM4H/cuis", list(set(cuis_smm4h)))
Example #10
def textfile2list_ask():
    data = read.readfrom_txt(file_path)
    cui_code_ask = {}
    codes = []
    for line in data.splitlines():
        line = line.split('|')
        if line[13] in ask:
            codes.append(line[13])
            cui_code_ask.setdefault(line[0], []).append(line[13])
            # txt_list.append(line)

    # read.save_in_json("data/AskAPatient/synonyms",txt_list)
    read.save_in_json("data/AskAPatient/cui_codes", cui_code_ask)
    read.save_in_json("data/AskAPatient/codes", list(set(codes)))
Example #11
def get_snomed_rxnorm_umls():

    rxnorm_term = read.read_from_json(
        "/extra/dongfangxu9/umls/processed/rxnorm_dict")
    snomed_term = read.read_from_json(
        "/extra/dongfangxu9/umls/processed/snomed_dict")
    cui_all = set(rxnorm_term) | set(snomed_term)  # set gives O(1) membership checks

    cui_all_synonyms = {}
    # print(len(cui_all))
    data = read.readfrom_txt(
        "/extra/dongfangxu9/umls/umls_2017_subset/2017AB/META/MRCONSO.RRF")
    for line in data.splitlines():
        line_split = line.split('|')
        if line_split[0] in cui_all:
            cui_all_synonyms = add_dict(cui_all_synonyms, line_split[0],
                                        line_split)
    # The original snippet ends without persisting cui_all_synonyms;
    # return it so callers can save or inspect the result.
    return cui_all_synonyms
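
# Hedged sketch: `add_dict` is used above but never defined in these
# examples. Given how it is called (dict, key, value -> dict), a minimal
# implementation that collects values per key would be:
def add_dict(dictionary, key, value):
    dictionary.setdefault(key, []).append(value)
    return dictionary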
Example #12
def document_level_2_sentence_level(file_dir, raw_data_path, preprocessed_path,
                                    xml_path, file_format):

    max_len_all = list()
    char_vocab = defaultdict(float)

    for data_id in range(0, len(file_dir)):
        #raw_text_path = os.path.join(raw_data_path,file_dir[data_id],file_dir[data_id])
        #preprocessed_file_path = os.path.join(preprocessed_path,file_dir[data_id],file_dir[data_id])
        raw_text_path = os.path.join(raw_data_path, file_dir[data_id])
        preprocessed_file_path = os.path.join(preprocessed_path,
                                              file_dir[data_id])

        raw_text = read.readfrom_txt(raw_text_path)
        raw_text = process.text_normalize(raw_text)
        #print('raw_text - %s' % raw_text)
        #print('raw_text AFTER NORMALIZE - %s' % raw_text)
        sent_span_list_file, max_len_file, char_vocab = split_by_sentence(
            raw_text, char_vocab)
        #print('sent_span_list_file - %s, max_len_file - %s,char_vocab - %s ' % (sent_span_list_file, max_len_file, char_vocab))
        max_len_all += max_len_file
        read.savein_json(preprocessed_file_path + "_sent", sent_span_list_file)
        if xml_path != "":
            xml_file_path = os.path.join(xml_path, file_dir[data_id],
                                         file_dir[data_id] + file_format)
            #xml_file_path = os.path.join(xml_path, file_dir[data_id] + file_format)
            print('xml_file_path - %s' % xml_file_path)
            posi_info_dict = process.extract_xmltag_anafora(
                xml_file_path, raw_text)
            print('posi_info_dict - ')
            for key, value in posi_info_dict.items():
                print(key, value)
            sent_tag_list_file = xml_tag_in_sentence(sent_span_list_file,
                                                     posi_info_dict)
            #print('sent_tag_list_file - %s' % sent_tag_list_file)
            read.savein_json(preprocessed_file_path + "_tag",
                             sent_tag_list_file)

    print('max_len_all - %s' % max_len_all)
    max_len_all.sort(reverse=True)
    max_len_file_name = "/".join(
        preprocessed_path.split('/')[:-1]) + "/max_len_sent"
    read.savein_json(max_len_file_name, max_len_all)
Example #13
def process_umls():

    cui_all_snomed = {}
    cui_all_rxnorm = {}
    data = read.readfrom_txt(
        "/extra/dongfangxu9/umls/umls_2017_subset/2017AB/META/MRCONSO.RRF")

    for line in data.splitlines():
        line_split = line.split('|')
        # Columns: 0 = CUI, 11 = source vocabulary, 12 = term type,
        # 14 = term name, 16 = suppress flag.
        if "SNOMEDCT" in line_split[11] and line_split[16] == "N":
            cui_all_snomed = add_dict(cui_all_snomed, line_split[0],
                                      line_split[14])

        if "RXNORM" in line_split[11] and line_split[16] == "N":
            cui_all_rxnorm = add_dict(cui_all_rxnorm, line_split[0],
                                      line_split[14])

    read.save_in_json("/extra/dongfangxu9/umls/processed/snomed_dict",
                      cui_all_snomed)
    read.save_in_json("/extra/dongfangxu9/umls/processed/rxnorm_dict",
                      cui_all_rxnorm)
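
# Illustration of the MRCONSO.RRF column indices the comment in the loop
# above relies on (0 = CUI, 11 = source vocabulary, 14 = term name,
# 16 = suppress flag). The sample line is made up, not real UMLS data.
sample_conso = ("C0000001|ENG|P|L0000001|PF|S0000001|Y|A0000001||||"
                "SNOMEDCT_US|PT|100001|Example term||N||")
cols = sample_conso.split('|')
if "SNOMEDCT" in cols[11] and cols[16] == "N":
    print(cols[0], cols[14])  # -> C0000001 Example term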