Example #1
    def __init__(self,
                 train_processed_path,
                 abbr_inventory_path,
                 use_pretrain=False,
                 use_softmax=False):
        """
        Initialize environment & model.
        """
        # Initialize processor and tokenizer
        self.pre_processor = TextProcessor(
            [white_space_remover_upmc, sub_deid_patterns_upmc])
        self.tokenizer = CoreNLPTokenizer()
        self.post_processor = TextProcessor(
            [AbbrDetector(abbr_inventory_path)])
        self.filter_processor = TextProcessor(
            [TextTokenFilter(), repeat_non_word_remover])
        # Load model
        train_path = train_processed_path + '/fasttext'
        if use_pretrain:
            model_path = train_path + '/model/pre_train'
        else:
            model_path = train_path + '/model'
        if use_softmax:
            model_file = model_path + '/all_softmax.bin'
        else:
            model_file = model_path + '/all.bin'
        self.model = load_model(model_file)
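
For reference, the two flags above select among four model files under the fastText training directory. A minimal sketch that just mirrors that path logic; the helper name resolve_model_file is hypothetical and added only for illustration:

# Hypothetical helper mirroring the flag logic in __init__ above.
def resolve_model_file(train_processed_path, use_pretrain=False, use_softmax=False):
    model_dir = train_processed_path + '/fasttext/model'
    if use_pretrain:
        model_dir += '/pre_train'
    return model_dir + ('/all_softmax.bin' if use_softmax else '/all.bin')

# The four flag combinations map to the four candidate .bin files:
for pretrain in (False, True):
    for softmax in (False, True):
        print(pretrain, softmax, resolve_model_file('train_processed', pretrain, softmax))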
Example #2
"""
Helper functions for ShARe/CLEF dataset.

"""

import os
import re
import tqdm
from collections import defaultdict
from preprocess.text_helper import white_space_remover, repeat_non_word_remover, recover_upper_cui, is_valid_abbr
from preprocess.text_helper import TextProcessor, CoreNLPTokenizer, TextTokenFilter
from preprocess.file_helper import txt_writer, json_writer, json_reader
from preprocess.dataset.mimic_preprocess import sub_deid_patterns_mimic


toknizer = CoreNLPTokenizer()


def add_annotation_share(folder_path):
    """
    Add annotations and build the abbreviation sense inventory.
    Replaces each original abbreviation "AB" with "abbr|AB|C0123456 ".
    """
    print("Processing annotations...")
    # read original data
    abbr_dict = defaultdict(list)
    abbr_invalid_dict = defaultdict(list)
    file_list = sorted(os.listdir(folder_path))

    docs_processed = []
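
For reference, the "abbr|AB|C0123456" marker described in the docstring can be illustrated with a toy rewrite; the one-entry inventory and the mark_token helper below are hypothetical and only illustrate the format:

# Hypothetical illustration of the "abbr|AB|CUI" marker format described above.
toy_inventory = {"AB": "C0123456"}  # abbreviation -> CUI (assumed shape)

def mark_token(token):
    cui = toy_inventory.get(token)
    return "abbr|%s|%s" % (token, cui) if cui else token

print(" ".join(mark_token(t) for t in "pt seen with AB today".split()))
# -> pt seen with abbr|AB|C0123456 today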
Example #3
            if long_form in lf2cui_only_have_cui:
                UMN_sense_cui_inventory[abbr][long_form] = lf2cui_only_have_cui[long_form]
            else:
                UMN_sense_cui_inventory[abbr][long_form] = None
    json_writer(UMN_sense_cui_inventory,
                umn_processed_path + "/UMN_sense_cui_inventory.json")

    #############################
    # Process UMN documents
    #############################

    umn_txt_marked = add_abbr_marker_umn(umn_txt)

    # Initialize processor and tokenizer
    processor = TextProcessor([white_space_remover, sub_deid_patterns_umn])

    toknizer = CoreNLPTokenizer()
    token_filter = TextTokenFilter()
    filter_processor = TextProcessor(
        [token_filter, repeat_non_word_remover, recover_upper_cui])

    # pre-processing
    umn_txt = processor.process_texts(umn_txt_marked, n_jobs=30)
    # tokenizing
    umn_txt_tokenized = toknizer.process_texts(umn_txt, n_jobs=30)
    # add real annotations
    umn_txt_annotated = add_annotation_umn(UMN_sense_cui_inventory,
                                           umn_txt_tokenized)
    # Filter trivial tokens and remove repeated non-word tokens
    umn_txt_filtered = filter_processor.process_texts(umn_txt_annotated,
                                                      n_jobs=30)
    # Write to file
Example #4
    # Read original sense inventory (only one-word abbrs)
    MSH_sense_inventory_one_word, MSH_sense_inventory = sense_inventory_msh(msh_path+"/benchmark_mesh.txt", abbr_list)

    # save sense inventory to json
    json_writer(MSH_sense_inventory_one_word, msh_processed_path + "/MSH_sense_inventory_one_word.json")
    json_writer(MSH_sense_inventory, msh_processed_path + "/MSH_sense_inventory.json")

    #############################
    # Process MSH documents (only one-word abbrs)
    #############################
    msh_txt_annotated = add_annotation_msh(MSH_sense_inventory_one_word, msh_path)

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover])
    toknizer = CoreNLPTokenizer()
    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])

    # pre-processing
    msh_txt = processor.process_texts(msh_txt_annotated, n_jobs=10)
    # tokenizing
    msh_txt_tokenized = toknizer.process_texts(msh_txt, n_jobs=10)
    # Filter trivial tokens and remove repeated non-word tokens
    msh_txt_filtered = filter_processor.process_texts(msh_txt_tokenized, n_jobs=10)
    # Write to file
    txt_writer(msh_txt_filtered, msh_processed_path+"/msh_processed.txt")
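
The same staged pattern (whitespace/de-id cleanup, CoreNLP tokenization, token filtering) recurs across these dataset scripts. A minimal sketch of running it on an in-memory list, assuming the repo's preprocess package is importable and a CoreNLP server is reachable for the tokenizer:

from preprocess.text_helper import (TextProcessor, CoreNLPTokenizer,
                                    TextTokenFilter, white_space_remover,
                                    repeat_non_word_remover)

# Toy documents; the abbr marker follows the format used in this repo.
docs = ["Patient  seen  for  abbr|AB|C0123456  follow  up",
        "Second   note   text"]

pre_processor = TextProcessor([white_space_remover])
tokenizer = CoreNLPTokenizer()
filter_processor = TextProcessor([TextTokenFilter(), repeat_non_word_remover])

docs = pre_processor.process_texts(docs, n_jobs=1)
docs_tokenized = tokenizer.process_texts(docs, n_jobs=1)
docs_filtered = filter_processor.process_texts(docs_tokenized, n_jobs=1)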
Example #5
class AbbrDisambiguation:
    def __init__(self,
                 train_processed_path,
                 abbr_inventory_path,
                 use_pretrain=False,
                 use_softmax=False):
        """
        Initialize environment & model.
        """
        # Initialize processor and tokenizer
        self.pre_processor = TextProcessor(
            [white_space_remover_upmc, sub_deid_patterns_upmc])
        self.tokenizer = CoreNLPTokenizer()
        self.post_processor = TextProcessor(
            [AbbrDetector(abbr_inventory_path)])
        self.filter_processor = TextProcessor(
            [TextTokenFilter(), repeat_non_word_remover])
        # Load model
        train_path = train_processed_path + '/fasttext'
        if use_pretrain:
            model_path = train_path + '/model/pre_train'
        else:
            model_path = train_path + '/model'
        if use_softmax:
            model_file = model_path + '/all_softmax.bin'
        else:
            model_file = model_path + '/all.bin'
        self.model = load_model(model_file)

    def process_single_text(self, text, save_json_path=None):
        """
        Process one text.
        """
        #############################
        # Process document
        #############################

        # pre-processing
        text = self.pre_processor.process_single_text(text)
        # tokenizing
        text_tokenized = self.tokenizer.process_single_text(text)
        # detect abbrs
        text_detected = self.post_processor.process_single_text(text_tokenized)
        # Filter trivial tokens and remove repeated non-word tokens
        text_filtered = self.filter_processor.process_single_text(
            text_detected)

        #############################
        # Build index
        #############################

        result_collector = AbbrInstanceCollectorUPMC([text_detected])
        abbr_index_result, document_no_mark_result = result_collector.generate_inverted_index()
        result_global_idx_mapper = global_instance_idx_mapper(
            abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC([text_filtered])
        abbr_index_pred, document_no_mark_pred = pred_collector.generate_inverted_index()
        abbr_instances_pred = instance_generator(abbr_index_pred,
                                                 Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################

        wsd_results = fasttext_classifier(self.model, abbr_index_pred,
                                          abbr_instances_pred,
                                          result_global_idx_mapper)
        return save_result_to_json(wsd_results, document_no_mark_result,
                                   save_json_path)

    def process_texts(self, text_list, save_json_path=None, n_jobs=8):
        """
        Process list of texts.
        """
        #############################
        # Process document
        #############################

        # pre-processing
        text = self.pre_processor.process_texts(text_list, n_jobs=n_jobs)
        # tokenizing
        text_tokenized = self.tokenizer.process_texts(text, n_jobs=n_jobs)
        # detect abbrs
        text_detected = self.post_processor.process_texts(text_tokenized,
                                                          n_jobs=n_jobs)
        # Filter trivial tokens and remove repeated non-word tokens
        text_filtered = self.filter_processor.process_texts(text_detected,
                                                            n_jobs=n_jobs)

        #############################
        # Build index
        #############################
        print("Building index...")
        result_collector = AbbrInstanceCollectorUPMC(text_detected)
        abbr_index_result, document_no_mark_result = result_collector.generate_inverted_index()
        result_global_idx_mapper = global_instance_idx_mapper(
            abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC(text_filtered)
        abbr_index_pred, document_no_mark_pred = pred_collector.generate_inverted_index()
        abbr_instances_pred = instance_generator(abbr_index_pred,
                                                 Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################
        print("Predicting...")
        wsd_results = fasttext_classifier(self.model, abbr_index_pred,
                                          abbr_instances_pred,
                                          result_global_idx_mapper)
        return save_result_to_json(wsd_results, document_no_mark_result,
                                   save_json_path)
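
A minimal usage sketch for the class above. The paths are placeholders, and it assumes the fastText model files already exist under the '/fasttext/model' layout shown in __init__ and that a CoreNLP server is running for the tokenizer:

# Usage sketch; the paths below are placeholders, not paths from this repo.
wsd = AbbrDisambiguation(train_processed_path='/data/upmc/processed',
                         abbr_inventory_path='/data/upmc/abbr_inventory',
                         use_pretrain=True,
                         use_softmax=False)

note = "Pt presents with AB today, follow up in 2 weeks."
wsd_result = wsd.process_single_text(note, save_json_path='/tmp/wsd_result.json')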
Example #6
    PATH_PROCESSED_INVENTORY_PKL = BASE_FOLDER + 'sense_inventory/final_cleaned_sense_inventory.cased.processed.pkl'

    # Get pickle generated from mimic_inventory.py
    inventory = pickle_reader(PATH_PROCESSED_INVENTORY_PKL)
    inventory_rmapper = inventory['longform-abbr_cui']

    ######################################
    # Processing
    ######################################

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover,
        sub_deid_patterns_mimic])

    toknizer = CoreNLPTokenizer()

    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter])

    remove_repeat_processor = TextProcessor([repeat_non_word_remover])

    for i in range(42):

        # read file
        filename = 'processed_text_chunk_%s.json' % i
        print("-"*50)
        print("Start File for %s" % filename)
        mimic_txt = []
        mimic_present_senses = []
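
The rest of the loop body is not shown in this excerpt. Based on the pattern used in the other dataset scripts above, each chunk would typically be pushed through the same stages. A sketch of that pattern, not necessarily the elided code, assuming mimic_txt has been filled with the chunk's raw note strings:

        # Sketch of the usual per-chunk stages (continuing the loop above);
        # mimic_txt is assumed to hold the raw notes read from this chunk.
        mimic_txt = processor.process_texts(mimic_txt, n_jobs=30)
        mimic_txt_tokenized = toknizer.process_texts(mimic_txt, n_jobs=30)
        mimic_txt_filtered = filter_processor.process_texts(mimic_txt_tokenized, n_jobs=30)
        mimic_txt_cleaned = remove_repeat_processor.process_texts(mimic_txt_filtered, n_jobs=30)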
Example #7
    # with open(dataset_path + "/training_data.txt") as input, open(dataset_path + "/training_data_fixed.txt", "w") as output:
    #     for line in input:
    #         new_line = " ".join([replace(token) for token in line.rstrip("\n").split(" ")])
    #         output.write(new_line + "\n")

    #############################
    # Process DataSet documents (only one-word abbrs)
    #############################

    # dataset_txt_annotated = txt_reader(dataset_path + "/training_data_fixed.txt")

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover])

    toknizer = CoreNLPTokenizer()

    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])

    all_processor = TextProcessor([
        white_space_remover,
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])

    # # pre-processing
    # dataset_txt = processor.process_texts(dataset_txt_annotated, n_jobs=30)
Example #8
    # save sense inventory to json
    json_writer(sense_inventory,
                dataset_processed_path + "/dataset_sense_inventory.json")

    #############################
    # Process DataSet documents (only one-word abbrs)
    #############################

    dataset_txt_annotated = add_annotation_dataset(sense_inventory,
                                                   dataset_path)

    # Initialize processor and tokenizer
    processor = TextProcessor([white_space_remover, sub_deid_patterns_dataset])

    toknizer = CoreNLPTokenizer()

    token_filter = TextTokenFilter()
    filter_processor = TextProcessor(
        [token_filter, repeat_non_word_remover, recover_upper_cui])

    # pre-processing
    dataset_txt = processor.process_texts(dataset_txt_annotated, n_jobs=30)
    # tokenizing
    dataset_txt_tokenized = toknizer.process_texts(dataset_txt, n_jobs=30)
    # Filter trivial tokens and remove repeated non-word tokens
    dataset_txt_filtered = filter_processor.process_texts(
        dataset_txt_tokenized, n_jobs=30)
    # Write to file
    txt_writer(dataset_txt_filtered,
               dataset_processed_path + "/dataset_processed.txt")