def get_list_name(file_list_name):
    """Save the base names of the ``THYMEColonFinal`` entries of a file list.

    The entries are loaded with ``read.textfile2list(file_list_name)``. Each
    entry containing the substring ``"THYMEColonFinal"`` is reduced to the
    text after its last ``"/"``. The resulting list is written with
    ``read.savein_json`` to ``file_list_name`` with every ``".txt"`` replaced
    by ``"_simple"``.

    Args:
        file_list_name: Name of the list file handed to
            ``read.textfile2list``; also the basis of the output name.
    """
    simple_names = []
    for entry in read.textfile2list(file_list_name):
        # Entries that do not mention the corpus marker are skipped.
        if "THYMEColonFinal" not in entry:
            continue
        simple_names.append(entry.rsplit("/", 1)[-1])

    output_name = file_list_name.replace(".txt", "_simple")
    read.savein_json(output_name, simple_names)
def get_train():
    """Save the entries of ``train_all_simple`` that are absent from the dev list.

    Loads ``data/dev_file_simple`` and ``data/train_all_simple`` with
    ``read.readfrom_json``, keeps (in their existing order) the entries of the
    latter that do not occur in the former, and writes the result to
    ``data/train_simple`` with ``read.savein_json``.
    """
    dev_entries = read.readfrom_json("data/dev_file_simple")
    all_entries = read.readfrom_json("data/train_all_simple")

    remaining = []
    for candidate in all_entries:
        if candidate in dev_entries:
            continue
        remaining.append(candidate)

    read.savein_json("data/train_simple", remaining)
# Example no. 3
# 0
def document_level_2_sentence_level(file_dir, raw_data_path, preprocessed_path,
                                    xml_path, file_format):
    """Split each document into sentences and save the per-document results.

    For every name in ``file_dir`` the raw text is read from
    ``<raw_data_path>/<name>/<name>``, normalized with
    ``process.text_normalize`` and passed to ``split_by_sentence``. The
    sentence list is saved to ``<preprocessed_path>/<name>/<name>_sent``.
    When ``xml_path`` is not the empty string, the annotations in
    ``<xml_path>/<name>/<name><file_format>`` are extracted with
    ``process.extract_xmltag_anafora``, mapped onto the sentences by
    ``xml_tag_in_sentence`` and saved with the ``_tag`` suffix.

    After all documents are processed, the collected ``max_len_file`` values
    are sorted in descending order and saved as ``max_len_sent`` under
    ``preprocessed_path`` with its last ``/``-separated component removed.

    Args:
        file_dir: Sequence of document names.
        raw_data_path: Root directory of the raw text files.
        preprocessed_path: Root directory for the generated JSON files.
        xml_path: Root directory of the annotation files, or ``""`` to skip
            the tag extraction.
        file_format: Suffix appended to the document name to form the
            annotation file name.
    """
    sentence_lengths = []
    # Threaded through split_by_sentence on every call; not saved here.
    char_vocabulary = defaultdict(float)

    for doc_name in file_dir:
        source_path = os.path.join(raw_data_path, doc_name, doc_name)
        output_prefix = os.path.join(preprocessed_path, doc_name, doc_name)

        text = process.text_normalize(read.readfrom_txt(source_path))
        sentence_spans, doc_lengths, char_vocabulary = split_by_sentence(
            text, char_vocabulary)

        sentence_lengths.extend(doc_lengths)
        read.savein_json(output_prefix + "_sent", sentence_spans)

        if xml_path == "":
            # No annotation root supplied: only the sentence file is written.
            continue

        annotation_path = os.path.join(xml_path, doc_name,
                                       doc_name + file_format)
        tag_positions = process.extract_xmltag_anafora(annotation_path, text)
        sentence_tags = xml_tag_in_sentence(sentence_spans, tag_positions)
        read.savein_json(output_prefix + "_tag", sentence_tags)

    sentence_lengths.sort(reverse=True)
    # Everything before the last "/" of preprocessed_path.
    parent_dir = preprocessed_path.rpartition('/')[0]
    read.savein_json(parent_dir + "/max_len_sent", sentence_lengths)
def document_level_2_sentence_level(file_dir, raw_data_path, preprocessed_path,
                                    xml_path, file_format):
    """Split each document into sentences, printing debug output on the way.

    Flat-layout variant: for every name in ``file_dir`` the raw text is read
    from ``<raw_data_path>/<name>`` and the outputs are written with the
    prefix ``<preprocessed_path>/<name>`` (an earlier revision used a
    ``<name>/<name>`` sub-directory layout for both). The text is normalized
    with ``process.text_normalize`` and split by ``split_by_sentence``; the
    sentence list is saved with the ``_sent`` suffix.

    When ``xml_path`` is not the empty string, annotations are read from
    ``<xml_path>/<name>/<name><file_format>``, the path and every extracted
    key/value pair are printed, and the per-sentence tags produced by
    ``xml_tag_in_sentence`` are saved with the ``_tag`` suffix.

    Finally the collected ``max_len_file`` values are printed (unsorted),
    sorted in descending order and saved as ``max_len_sent`` under
    ``preprocessed_path`` with its last ``/``-separated component removed.

    Args:
        file_dir: Sequence of document names.
        raw_data_path: Directory containing the raw text files.
        preprocessed_path: Directory receiving the generated JSON files.
        xml_path: Root directory of the annotation files, or ``""`` to skip
            the tag extraction.
        file_format: Suffix appended to the document name to form the
            annotation file name.
    """
    sentence_lengths = []
    # Threaded through split_by_sentence on every call; not saved here.
    char_vocabulary = defaultdict(float)

    for doc_name in file_dir:
        source_path = os.path.join(raw_data_path, doc_name)
        output_prefix = os.path.join(preprocessed_path, doc_name)

        text = process.text_normalize(read.readfrom_txt(source_path))
        sentence_spans, doc_lengths, char_vocabulary = split_by_sentence(
            text, char_vocabulary)

        sentence_lengths.extend(doc_lengths)
        read.savein_json(output_prefix + "_sent", sentence_spans)

        if xml_path == "":
            # No annotation root supplied: only the sentence file is written.
            continue

        # NOTE(review): the annotation lookup still uses the nested
        # <name>/<name> layout while the raw/preprocessed paths are flat --
        # confirm this asymmetry is intended.
        annotation_path = os.path.join(xml_path, doc_name,
                                       doc_name + file_format)
        print('xml_file_path - %s' % annotation_path)

        tag_positions = process.extract_xmltag_anafora(annotation_path, text)
        print('posi_info_dict - ')
        for tag_key, tag_value in tag_positions.items():
            print(tag_key, tag_value)

        sentence_tags = xml_tag_in_sentence(sentence_spans, tag_positions)
        read.savein_json(output_prefix + "_tag", sentence_tags)

    # Printed before sorting, so the output shows collection order.
    print('max_len_all - %s' % sentence_lengths)
    sentence_lengths.sort(reverse=True)
    # Everything before the last "/" of preprocessed_path.
    parent_dir = preprocessed_path.rpartition('/')[0]
    read.savein_json(parent_dir + "/max_len_sent", sentence_lengths)
# Example no. 5
# 0
def document_level_2_sentence_level(file_dir, raw_data_path, preprocessed_path,
                                    xml_path, file_format):
    """Split each document into sentences and save several per-sentence views.

    For every name in ``file_dir`` the raw text is read from
    ``<raw_data_path>/<name>/<name>``, normalized with
    ``process.text_normalize`` and split by ``split_by_sentence``. Using the
    output prefix ``<preprocessed_path>/<name>/<name>``, the following files
    are saved with ``read.savein_json``:

    * ``_sent``: the sentence list returned by ``split_by_sentence``;
    * ``_pos``: the result of ``process.word_pos_2_character_pos`` applied
      to the sentences and the output of ``process.get_pos_sentence``;
    * ``_unicodecategory``: the result of ``process.get_unicode``;
    * ``_words``: the result of ``process.get_words``;
    * ``_tag``: only when ``xml_path`` is not the empty string; the output
      of ``xml_tag_in_sentence`` for the annotations extracted from
      ``<xml_path>/<name>/<name><file_format>``.

    After all documents are processed, the collected ``max_len_file`` values
    are sorted in descending order and saved as ``max_len_sent`` under
    ``preprocessed_path`` with its last ``/``-separated component removed.

    Args:
        file_dir: Sequence of document names.
        raw_data_path: Root directory of the raw text files.
        preprocessed_path: Root directory for the generated JSON files.
        xml_path: Root directory of the annotation files, or ``""`` to skip
            the tag extraction.
        file_format: Suffix appended to the document name to form the
            annotation file name.
    """
    sentence_lengths = []

    # Accumulators threaded through the helper calls across all documents;
    # none of them is saved by this function.
    char_vocabulary = defaultdict(float)
    pos_vocabulary = defaultdict(float)
    unicode_vocabulary = defaultdict(float)
    word_vocabulary = defaultdict(float)

    for doc_name in file_dir:
        source_path = os.path.join(raw_data_path, doc_name, doc_name)
        output_prefix = os.path.join(preprocessed_path, doc_name, doc_name)

        text = process.text_normalize(read.readfrom_txt(source_path))
        sentence_spans, doc_lengths, char_vocabulary = split_by_sentence(
            text, char_vocabulary)

        sentence_lengths.extend(doc_lengths)

        word_level_pos, pos_vocabulary = process.get_pos_sentence(
            sentence_spans, pos_vocabulary)
        words, word_vocabulary = process.get_words(sentence_spans,
                                                   word_vocabulary)
        char_level_pos = process.word_pos_2_character_pos(
            sentence_spans, word_level_pos)
        unicode_categories, unicode_vocabulary = process.get_unicode(
            sentence_spans, unicode_vocabulary)

        # Written in this fixed order, one JSON file per suffix.
        outputs = (
            ("_sent", sentence_spans),
            ("_pos", char_level_pos),
            ("_unicodecategory", unicode_categories),
            ("_words", words),
        )
        for suffix, payload in outputs:
            read.savein_json(output_prefix + suffix, payload)

        if xml_path == "":
            # No annotation root supplied: the tag file is not produced.
            continue

        annotation_path = os.path.join(xml_path, doc_name,
                                       doc_name + file_format)
        tag_positions = process.extract_xmltag_anafora(annotation_path, text)
        sentence_tags = xml_tag_in_sentence(sentence_spans, tag_positions)
        read.savein_json(output_prefix + "_tag", sentence_tags)

    sentence_lengths.sort(reverse=True)
    # Everything before the last "/" of preprocessed_path.
    parent_dir = preprocessed_path.rpartition('/')[0]
    read.savein_json(parent_dir + "/max_len_sent", sentence_lengths)