def generate_test_files(txt_path, test_processed_path):
    os.makedirs(test_processed_path, exist_ok=True)
    # Find abbrs, build abbr index
    print("Loading Test data...")
    test_collector = AbbrInstanceCollector(txt_path)
    abbr_index, test_no_mark = test_collector.generate_inverted_index()
    # Save files
    txt_writer(test_no_mark, test_processed_path + '/test_no_mark.txt')
    abbr_index.save(test_processed_path + '/abbr_index_data.pkl')
def generate_train_files(txt_path, train_processed_path):
    os.makedirs(train_processed_path, exist_ok=True)
    # Find abbrs, build abbr index
    print("Loading TRAIN data...")
    train_collector = AbbrInstanceCollector(txt_path)
    abbr_index, train_no_mark = train_collector.generate_inverted_index()
    # Save files
    txt_writer(train_no_mark, train_processed_path + '/train_no_mark.txt')
    abbr_index.save(train_processed_path + '/abbr_index_data.pkl')
    # Train Word2Vec on the marker-free training text
    print("Training Word2Vec...")
    model = gensim.models.Word2Vec(Corpus(train_no_mark), workers=30, min_count=1)
    model.save(train_processed_path + '/train.model')
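# --- Usage sketch (not part of the original module) ------------------------
# How the two generators above might be invoked together; both paths are
# placeholders. generate_train_files additionally trains and saves a
# Word2Vec model next to the index files.
if __name__ == '__main__':
    generate_train_files('/path/to/processed/train.txt', '/path/to/processed/train')
    generate_test_files('/path/to/processed/test.txt', '/path/to/processed/test')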
def abbr_job(abbr, abbr_index, abbr_idx_mapper, docs, content_dir, window_size):
    corpus = AbbrCorpus(abbr, abbr_index, docs, window_size=window_size)
    corpus_content = corpus.content_generator()

    dataset = []
    for global_instance_idx, doc_id, pos, content_pos, content, label in corpus_content:
        # Re-insert the abbreviation into its context window and build a
        # fastText-style "__label__<sense> <context>" training line
        content.insert(content_pos, abbr)
        content = " ".join(content)
        dataset.append("__label__{} {}".format(label, content))

    # Save this abbr's dataset to a text file named by its index
    index = abbr_idx_mapper['abbr2idx'][abbr]
    txt_writer(dataset, content_dir + '%d.txt' % index)
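# --- Dispatch sketch (an assumption, not the project's actual runner) ------
# abbr_job writes one "<idx>.txt" per abbreviation, so the per-abbr jobs are
# independent and can run in parallel. This sketch uses joblib (an assumption;
# the project may use its own multiprocessing helpers), assumes abbr_index can
# be iterated to yield its abbreviations, and builds abbr_idx_mapper in the
# {'abbr2idx': {...}} shape that abbr_job reads. The window_size default is
# arbitrary.
from joblib import Parallel, delayed

def run_abbr_jobs(abbr_index, docs, content_dir, window_size=5, n_jobs=8):
    abbrs = list(abbr_index)
    abbr_idx_mapper = {'abbr2idx': {abbr: idx for idx, abbr in enumerate(abbrs)}}
    Parallel(n_jobs=n_jobs)(
        delayed(abbr_job)(abbr, abbr_index, abbr_idx_mapper, docs, content_dir, window_size)
        for abbr in abbrs)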
def process_annotated_data(txt_preprocessed_path, upmc_processed_path, train_ratio=0.8, n_jobs=30):
    os.makedirs(upmc_processed_path, exist_ok=True)
    upmc_txt_annotated = txt_reader(txt_preprocessed_path)
    # Pre-processing
    upmc_txt = all_processor.process_texts(upmc_txt_annotated, n_jobs=n_jobs)
    # Train/test split (train_ratio of the documents go to train)
    random.shuffle(upmc_txt)
    num_instances = len(upmc_txt)
    train_idx = set(random.sample(range(num_instances), int(train_ratio * num_instances)))
    upmc_train_txt = []
    upmc_test_txt = []
    for idx, txt in enumerate(tqdm.tqdm(upmc_txt)):
        if idx in train_idx:
            upmc_train_txt.append(txt)
        else:
            upmc_test_txt.append(txt)
    # Write to file
    txt_writer(upmc_train_txt, upmc_processed_path + "/upmc_train.txt")
    txt_writer(upmc_test_txt, upmc_processed_path + "/upmc_test.txt")
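# --- Reproducibility sketch (a suggestion, not part of the original code) ---
# process_annotated_data splits with the module-level random state, so the
# partition changes from run to run; seeding beforehand makes it repeatable.
# The seed value and both paths below are placeholders.
import random

random.seed(0)
process_annotated_data('/path/to/upmc_annotated.txt',
                       '/path/to/upmc_processed',
                       train_ratio=0.8, n_jobs=30)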
all_sense_inventory = merge_inventories(train_sense_inventory, test_sense_inventory)
all_sense_inventory_invalid = merge_inventories(train_sense_inventory_invalid, test_sense_inventory_invalid)

# Save sense inventories to JSON
json_writer(train_sense_inventory, share_processed_path + "/train_sense_inventory.json")
json_writer(test_sense_inventory, share_processed_path + "/test_sense_inventory.json")
json_writer(all_sense_inventory, share_processed_path + "/all_sense_inventory.json")
json_writer(all_sense_inventory_invalid, share_processed_path + "/all_sense_inventory_invalid.json")

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover,
    sub_deid_patterns_mimic])

toknizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# Pre-processing
share_txt = processor.process_texts(share_txt_all_annotated, n_jobs=30)
# Tokenizing
share_txt_tokenized = toknizer.process_texts(share_txt, n_jobs=30)
# Filter trivial tokens and remove repeated non-words
share_txt_filtered = filter_processor.process_texts(share_txt_tokenized, n_jobs=30)
# Write to file
txt_writer(share_txt_filtered, share_processed_path + "/share_all_processed.txt")
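# --- Refactoring sketch (a suggestion, not existing project code) -----------
# The preprocess -> tokenize -> filter chain above is repeated almost verbatim
# for the UMN, MSH, and other corpora below; a small helper could factor it
# out. The optional deid_fn parameter stands in for the corpus-specific de-id
# pattern substitution (e.g. sub_deid_patterns_mimic); all other names come
# from the surrounding scripts.
def run_standard_pipeline(texts, deid_fn=None, n_jobs=30):
    pre_steps = [white_space_remover] + ([deid_fn] if deid_fn else [])
    pre_processed = TextProcessor(pre_steps).process_texts(texts, n_jobs=n_jobs)
    tokenized = CoreNLPTokenizer().process_texts(pre_processed, n_jobs=n_jobs)
    filter_processor = TextProcessor(
        [TextTokenFilter(), repeat_non_word_remover, recover_upper_cui])
    return filter_processor.process_texts(tokenized, n_jobs=n_jobs)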
        UMN_sense_cui_inventory[abbr][long_form] = None

json_writer(UMN_sense_cui_inventory, umn_processed_path + "/UMN_sense_cui_inventory.json")

#############################
# Process UMN documents
#############################

umn_txt_marked = add_abbr_marker_umn(umn_txt)

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover,
    sub_deid_patterns_umn])

toknizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# Pre-processing
umn_txt = processor.process_texts(umn_txt_marked, n_jobs=30)
# Tokenizing
umn_txt_tokenized = toknizer.process_texts(umn_txt, n_jobs=30)
# Add real annotations
umn_txt_annotated = add_annotation_umn(UMN_sense_cui_inventory, umn_txt_tokenized)
# Filter trivial tokens and remove repeated non-words
umn_txt_filtered = filter_processor.process_texts(umn_txt_annotated, n_jobs=30)
# Write to file
txt_writer(umn_txt_filtered, umn_processed_path + "/umn_processed.txt")
    txt_list_processed_sorted = sorted(txt_list_processed, key=operator.itemgetter(0))
    return [txt for _, txt in txt_list_processed_sorted]


if __name__ == '__main__':

    ######################################
    # Read texts from dataset
    ######################################

    # BASE_FOLDER = '/home/mengr/Project/wsd/wsd_data/'
    dataset_paths = DataSetPaths(environment='luoz3_x1')
    DATASET_PATH = dataset_paths.upmc_all_no_mark_txt
    OUTPUT_PATH = dataset_paths.upmc_all_no_mark_folder
    PATH_PROCESSED_INVENTORY_PKL = dataset_paths.sense_inventory_pkl

    # Get pickle generated from mimic_inventory.py
    inventory = pickle_reader(PATH_PROCESSED_INVENTORY_PKL)
    inventory_rmapper = inventory['longform-abbr_cui']

    ######################################
    # Processing
    ######################################

    with open(DATASET_PATH, 'r') as f:
        txt_list = f.readlines()
    print("Loaded %d docs from %s" % (len(txt_list), DATASET_PATH))

    # Replace long forms with abbrs
    mimic_txt_processed = longform_replacer(txt_list, inventory_rmapper, n_jobs=50)
    # Save to file
    txt_writer(mimic_txt_processed, OUTPUT_PATH + 'train_no_mark_longform_replaced.txt')
# Read original sense inventory (only one-word abbrs)
MSH_sense_inventory_one_word, MSH_sense_inventory = sense_inventory_msh(msh_path + "/benchmark_mesh.txt", abbr_list)

# Save sense inventories to JSON
json_writer(MSH_sense_inventory_one_word, msh_processed_path + "/MSH_sense_inventory_one_word.json")
json_writer(MSH_sense_inventory, msh_processed_path + "/MSH_sense_inventory.json")

#############################
# Process MSH documents (only one-word abbrs)
#############################

msh_txt_annotated = add_annotation_msh(MSH_sense_inventory_one_word, msh_path)

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover])

toknizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# Pre-processing
msh_txt = processor.process_texts(msh_txt_annotated, n_jobs=10)
# Tokenizing
msh_txt_tokenized = toknizer.process_texts(msh_txt, n_jobs=10)
# Filter trivial tokens and remove repeated non-words
msh_txt_filtered = filter_processor.process_texts(msh_txt_tokenized, n_jobs=10)
# Write to file
txt_writer(msh_txt_filtered, msh_processed_path + "/msh_processed.txt")
######################################
# Read texts from dataset
######################################

# File paths
data_path = "/home/luoz3/wsd_data"
upmc_all_path = data_path + "/upmc/batch1_4"
upmc_all_processed_path = upmc_all_path + "/processed"
os.makedirs(upmc_all_processed_path, exist_ok=True)

#############################
# Process DataSet documents (only one-word abbrs)
#############################

# Initialize processor (no tokenizer needed for this corpus)
token_filter = TextTokenFilter()
processor = TextProcessor([
    white_space_remover,
    token_filter,
    repeat_non_word_remover,
])

upmc_all_txt = txt_reader(data_path + "/upmc_batch1_4/upmc_no_mark_new.txt")
# Pre-processing
upmc_all_txt = processor.process_texts(upmc_all_txt, n_jobs=30)
# Write to file
txt_writer(upmc_all_txt, upmc_all_processed_path + "/train_no_mark.txt")
    # Read file
    filename = 'processed_text_chunk_%s.json' % i
    print("-" * 50)
    print("Start File for %s" % filename)
    mimic_txt = []
    mimic_present_senses = []
    if not os.path.exists(PATH_FOLDER + filename):
        continue
    for line in open(PATH_FOLDER + filename, "r"):
        obj = json.loads(line)
        text = obj['TEXT']
        present_senses = obj['present_senses']
        mimic_txt.append(text)
        mimic_present_senses.append(present_senses)

    # Pre-processing
    mimic_txt = processor.process_texts(mimic_txt, n_jobs=30)
    # Tokenizing
    mimic_txt_tokenized = toknizer.process_texts(mimic_txt, n_jobs=40)
    # Filter trivial tokens
    mimic_txt_filtered = filter_processor.process_texts(mimic_txt_tokenized, n_jobs=40)
    # Replace long forms with abbrs
    mimic_txt_processed = longform_replacer(mimic_txt_filtered, mimic_present_senses, inventory_rmapper, n_jobs=16)
    # Remove repeated non-words
    mimic_txt_processed = remove_repeat_processor.process_texts(mimic_txt_processed, n_jobs=40)
    # Save to file
    txt_writer(mimic_txt_processed, PATH_FOLDER_PROCESSED + '%s.txt' % filename[:-5])
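# --- Chunk-loading sketch (refactoring suggestion, not original code) -------
# The inline reading loop above could be isolated into a helper; the
# 'TEXT'/'present_senses' keys are taken directly from that loop, and each
# chunk file is assumed to be JSON-lines with one document per line.
import json

def load_chunk(path):
    texts, present_senses = [], []
    with open(path, "r") as f:
        for line in f:
            obj = json.loads(line)
            texts.append(obj['TEXT'])
            present_senses.append(obj['present_senses'])
    return texts, present_senses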
# Save sense inventory to JSON
json_writer(sense_inventory, dataset_processed_path + "/dataset_sense_inventory.json")

#############################
# Process DataSet documents (only one-word abbrs)
#############################

dataset_txt_annotated = add_annotation_dataset(sense_inventory, dataset_path)

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover,
    sub_deid_patterns_dataset])

toknizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# Pre-processing
dataset_txt = processor.process_texts(dataset_txt_annotated, n_jobs=30)
# Tokenizing
dataset_txt_tokenized = toknizer.process_texts(dataset_txt, n_jobs=30)
# Filter trivial tokens and remove repeated non-words
dataset_txt_filtered = filter_processor.process_texts(dataset_txt_tokenized, n_jobs=30)
# Write to file
txt_writer(dataset_txt_filtered, dataset_processed_path + "/dataset_processed.txt")