""" Helper functions for ShARe/CLEF dataset. """ import os import re import tqdm from collections import defaultdict from preprocess.text_helper import white_space_remover, repeat_non_word_remover, recover_upper_cui, is_valid_abbr from preprocess.text_helper import TextProcessor, CoreNLPTokenizer, TextTokenFilter from preprocess.file_helper import txt_writer, json_writer, json_reader from preprocess.dataset.mimic_preprocess import sub_deid_patterns_mimic toknizer = CoreNLPTokenizer() def add_annotation_share(folder_path): """ Add annotation and build abbr sense inventory. To replace original abbr "AB" to "abbr|AB|C0123456 " """ print("Processing annotations...") # read original data abbr_dict = defaultdict(list) abbr_invalid_dict = defaultdict(list) file_list = sorted(os.listdir(folder_path)) docs_processed = []
            UMN_sense_cui_inventory[abbr][long_form] = lf2cui_only_have_cui[long_form]
        else:
            UMN_sense_cui_inventory[abbr][long_form] = None

json_writer(UMN_sense_cui_inventory, umn_processed_path + "/UMN_sense_cui_inventory.json")

#############################
# Process UMN documents
#############################

umn_txt_marked = add_abbr_marker_umn(umn_txt)

# Initialize processor and tokenizer
processor = TextProcessor([white_space_remover, sub_deid_patterns_umn])
toknizer = CoreNLPTokenizer()
token_filter = TextTokenFilter()
filter_processor = TextProcessor(
    [token_filter, repeat_non_word_remover, recover_upper_cui])

# pre-processing
umn_txt = processor.process_texts(umn_txt_marked, n_jobs=30)
# tokenizing
umn_txt_tokenized = toknizer.process_texts(umn_txt, n_jobs=30)
# add real annotations
umn_txt_annotated = add_annotation_umn(UMN_sense_cui_inventory, umn_txt_tokenized)
# Filter trivial tokens and Remove repeat non-words
umn_txt_filtered = filter_processor.process_texts(umn_txt_annotated, n_jobs=30)

# Write to file
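# Hedged sketch of the JSON written above (example values are placeholders,
# not taken from the real inventory): each abbreviation maps to its long forms,
# and each long form maps to a CUI when one was found in lf2cui_only_have_cui,
# or to None otherwise, e.g.:
#
#   {
#       "AB": {
#           "some long form": "C0123456",
#           "another long form": None
#       }
#   }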
# Read original sense inventory (only one word abbrs)
MSH_sense_inventory_one_word, MSH_sense_inventory = sense_inventory_msh(msh_path + "/benchmark_mesh.txt", abbr_list)

# save sense inventory to json
json_writer(MSH_sense_inventory_one_word, msh_processed_path + "/MSH_sense_inventory_one_word.json")
json_writer(MSH_sense_inventory, msh_processed_path + "/MSH_sense_inventory.json")

#############################
# Process MSH documents (only one word abbrs)
#############################

msh_txt_annotated = add_annotation_msh(MSH_sense_inventory_one_word, msh_path)

# Initialize processor and tokenizer
processor = TextProcessor([white_space_remover])
toknizer = CoreNLPTokenizer()
token_filter = TextTokenFilter()
filter_processor = TextProcessor(
    [token_filter, repeat_non_word_remover, recover_upper_cui])

# pre-processing
msh_txt = processor.process_texts(msh_txt_annotated, n_jobs=10)
# tokenizing
msh_txt_tokenized = toknizer.process_texts(msh_txt, n_jobs=10)
# Filter trivial tokens and Remove repeat non-words
msh_txt_filtered = filter_processor.process_texts(msh_txt_tokenized, n_jobs=10)

# Write to file
txt_writer(msh_txt_filtered, msh_processed_path + "/msh_processed.txt")
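# Optional sanity check (sketch, not in the original script; assumes
# process_texts returns exactly one processed document per input document,
# which the write step above relies on): the filtered corpus should line up
# 1:1 with the annotated input.
assert len(msh_txt_filtered) == len(msh_txt_annotated)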
class AbbrDisambiguation:

    def __init__(self, train_processed_path, abbr_inventory_path, use_pretrain=False, use_softmax=False):
        """Initialize environment & model."""
        # Initialize processor and tokenizer
        self.pre_processor = TextProcessor(
            [white_space_remover_upmc, sub_deid_patterns_upmc])
        self.tokenizer = CoreNLPTokenizer()
        self.post_processor = TextProcessor(
            [AbbrDetector(abbr_inventory_path)])
        self.filter_processor = TextProcessor(
            [TextTokenFilter(), repeat_non_word_remover])

        # Load model
        train_path = train_processed_path + '/fasttext'
        if use_pretrain:
            model_path = train_path + '/model/pre_train'
        else:
            model_path = train_path + '/model'
        if use_softmax:
            model_file = model_path + '/all_softmax.bin'
        else:
            model_file = model_path + '/all.bin'
        self.model = load_model(model_file)

    def process_single_text(self, text, save_json_path=None):
        """Process one text."""
        #############################
        # Process document
        #############################

        # pre-processing
        text = self.pre_processor.process_single_text(text)
        # tokenizing
        text_tokenized = self.tokenizer.process_single_text(text)
        # detect abbrs
        text_detected = self.post_processor.process_single_text(text_tokenized)
        # Filter trivial tokens and Remove repeat non-words
        text_filtered = self.filter_processor.process_single_text(text_detected)

        #############################
        # Build index
        #############################
        result_collector = AbbrInstanceCollectorUPMC([text_detected])
        abbr_index_result, document_no_mark_result = result_collector.generate_inverted_index()
        result_global_idx_mapper = global_instance_idx_mapper(abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC([text_filtered])
        abbr_index_pred, document_no_mark_pred = pred_collector.generate_inverted_index()
        abbr_instances_pred = instance_generator(abbr_index_pred, Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################
        wsd_results = fasttext_classifier(self.model, abbr_index_pred, abbr_instances_pred, result_global_idx_mapper)

        return save_result_to_json(wsd_results, document_no_mark_result, save_json_path)

    def process_texts(self, text_list, save_json_path=None, n_jobs=8):
        """Process list of texts."""
        #############################
        # Process document
        #############################

        # pre-processing
        text = self.pre_processor.process_texts(text_list, n_jobs=n_jobs)
        # tokenizing
        text_tokenized = self.tokenizer.process_texts(text, n_jobs=n_jobs)
        # detect abbrs
        text_detected = self.post_processor.process_texts(text_tokenized, n_jobs=n_jobs)
        # Filter trivial tokens and Remove repeat non-words
        text_filtered = self.filter_processor.process_texts(text_detected, n_jobs=n_jobs)

        #############################
        # Build index
        #############################
        print("Building index...")
        result_collector = AbbrInstanceCollectorUPMC(text_detected)
        abbr_index_result, document_no_mark_result = result_collector.generate_inverted_index()
        result_global_idx_mapper = global_instance_idx_mapper(abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC(text_filtered)
        abbr_index_pred, document_no_mark_pred = pred_collector.generate_inverted_index()
        abbr_instances_pred = instance_generator(abbr_index_pred, Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################
        print("Predicting...")
        wsd_results = fasttext_classifier(self.model, abbr_index_pred, abbr_instances_pred, result_global_idx_mapper)

        return save_result_to_json(wsd_results, document_no_mark_result, save_json_path)
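# Usage sketch (paths and the sample note below are hypothetical placeholders,
# not the authors' real data): build the disambiguator once, then run it on a
# single note or on a batch. Both methods return whatever save_result_to_json
# produces for the predicted senses.
if __name__ == '__main__':
    wsd = AbbrDisambiguation(
        train_processed_path='/data/upmc/processed',     # hypothetical path
        abbr_inventory_path='/data/abbr_inventory',      # hypothetical path
        use_pretrain=False,
        use_softmax=False)

    single_result = wsd.process_single_text(
        "Pt c/o SOB and CP, hx of CHF.",
        save_json_path='/tmp/single_note_wsd.json')      # hypothetical path

    batch_results = wsd.process_texts(
        ["Pt c/o SOB and CP, hx of CHF.", "PE ruled out by CTA."],
        save_json_path='/tmp/batch_wsd.json',            # hypothetical path
        n_jobs=2)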
PATH_PROCESSED_INVENTORY_PKL = BASE_FOLDER + 'sense_inventory/final_cleaned_sense_inventory.cased.processed.pkl'

# Get pickle generated from mimic_inventory.py
inventory = pickle_reader(PATH_PROCESSED_INVENTORY_PKL)
inventory_rmapper = inventory['longform-abbr_cui']

######################################
# Processing
######################################

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover,
    sub_deid_patterns_mimic])

toknizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter])
remove_repeat_processor = TextProcessor([repeat_non_word_remover])

for i in range(42):
    # read file
    filename = 'processed_text_chunk_%s.json' % i
    print("-" * 50)
    print("Start File for %s" % filename)
    mimic_txt = []
    mimic_present_senses = []
# with open(dataset_path + "/training_data.txt") as input, open(dataset_path + "/training_data_fixed.txt", "w") as output:
#     for line in input:
#         new_line = " ".join([replace(token) for token in line.rstrip("\n").split(" ")])
#         output.write(new_line + "\n")

#############################
# Process DataSet documents (only one word abbrs)
#############################
# dataset_txt_annotated = txt_reader(dataset_path + "/training_data_fixed.txt")

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover])

toknizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter, repeat_non_word_remover, recover_upper_cui])

all_processor = TextProcessor([
    white_space_remover,
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# # pre-processing
# dataset_txt = processor.process_texts(dataset_txt_annotated, n_jobs=30)
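# Runnable sketch of the commented-out token fix above (the authors' actual
# `replace` function is not shown in this excerpt; `fix_token` is a hypothetical
# stand-in that passes tokens through unchanged and would be swapped for the
# real normalization).
def fix_token(token):
    return token


def rewrite_training_data(dataset_path):
    with open(dataset_path + "/training_data.txt") as fin, \
            open(dataset_path + "/training_data_fixed.txt", "w") as fout:
        for line in fin:
            fout.write(" ".join(fix_token(t) for t in line.rstrip("\n").split(" ")) + "\n")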
# save sense inventory to json
json_writer(sense_inventory, dataset_processed_path + "/dataset_sense_inventory.json")

#############################
# Process DataSet documents (only one word abbrs)
#############################
dataset_txt_annotated = add_annotation_dataset(sense_inventory, dataset_path)

# Initialize processor and tokenizer
processor = TextProcessor([white_space_remover, sub_deid_patterns_dataset])
toknizer = CoreNLPTokenizer()
token_filter = TextTokenFilter()
filter_processor = TextProcessor(
    [token_filter, repeat_non_word_remover, recover_upper_cui])

# pre-processing
dataset_txt = processor.process_texts(dataset_txt_annotated, n_jobs=30)
# tokenizing
dataset_txt_tokenized = toknizer.process_texts(dataset_txt, n_jobs=30)
# Filter trivial tokens and Remove repeat non-words
dataset_txt_filtered = filter_processor.process_texts(dataset_txt_tokenized, n_jobs=30)

# Write to file
txt_writer(dataset_txt_filtered, dataset_processed_path + "/dataset_processed.txt")
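# Optional spot check (sketch, not part of the original script; assumes the
# annotated list is non-empty and that process_single_text mirrors the batched
# process_texts used above): push one document through the same chain to
# eyeball the annotation, tokenization and filtering before a full n_jobs=30 run.
sample = processor.process_single_text(dataset_txt_annotated[0])
sample = toknizer.process_single_text(sample)
print(filter_processor.process_single_text(sample))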