def predict_fasttext_classifier(train_processed_path, test_processed_path, use_pretrain=False, use_softmax=False):
    """Predict senses for all test instances with a single global fastText model.

    Loads the combined model trained over all abbreviations and predicts a
    sense for every test instance whose true label is among the model's
    labels; instances with unseen labels get ``sense_pred=None``.

    :param train_processed_path: root dir of processed training data
    :param test_processed_path: root dir of processed test data
    :param use_pretrain: load the model variant trained from pre-trained vectors
    :param use_softmax: load the softmax-loss model variant
    :return: list of InstancePred, sorted by global instance index
    """
    def _remove_label_prefix(label):
        # BUG FIX: the original used str.lstrip("__label__"), which strips
        # any of the *characters* {'_','l','a','b','e'} from the left and
        # can corrupt labels (e.g. "able_x" -> "x"). Remove the literal
        # "__label__" prefix instead.
        prefix = "__label__"
        return label[len(prefix):] if label.startswith(prefix) else label

    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    # Load abbr index for the test set
    test_abbr_idx_mapper = pickle_reader(test_processed_path + '/fasttext/abbr_idx_mapper.pkl')
    test_abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    # Load model (single model over all abbreviations)
    if use_softmax:
        model_file = model_path + '/all_softmax.bin'
    else:
        model_file = model_path + '/all.bin'
    model = load_model(model_file)
    label_set = set(map(_remove_label_prefix, model.get_labels()))

    instance_collection = []
    # generate testing data
    for abbr, test_abbr_idx in tqdm(test_abbr_idx_mapper['abbr2idx'].items()):
        eval_abbr_instance_list = txt_reader(test_processed_path + '/fasttext/dataset/%d.txt' % test_abbr_idx)
        # position of the current instance inside this abbr's dataset file
        abbr_instance_idx = 0
        for doc_id, pos_list in test_abbr_index[abbr].items():
            for global_instance_idx, pos, label in pos_list:
                if label not in label_set:
                    # true sense never seen by the model -> no prediction
                    instance_collection.append(
                        InstancePred(index=global_instance_idx, abbr=abbr, sense_pred=None))
                else:
                    # get instance: first token is the label, rest is context
                    tokens = eval_abbr_instance_list[abbr_instance_idx].split()
                    label_in_txt = _remove_label_prefix(tokens[0])
                    assert label == label_in_txt
                    context = " ".join(tokens[1:])
                    instance_collection.append(
                        InstancePred(
                            index=global_instance_idx,
                            abbr=abbr,
                            sense_pred=_remove_label_prefix(model.predict(context)[0][0])))
                abbr_instance_idx += 1
    # sort collection list based on global instance idx
    instance_collection = sorted(instance_collection, key=lambda x: x.index)
    return instance_collection
def generate_whole_dataset(processed_path, shuffle=False):
    """Merge every per-abbreviation fastText dataset file into one file.

    Concatenates ``<processed_path>/fasttext/dataset/<idx>.txt`` for each
    indexed abbreviation and writes the result (optionally shuffled) to
    ``<processed_path>/fasttext/dataset/all.txt``.
    """
    fasttext_root = processed_path + '/fasttext'
    abbr_idx_mapper = pickle_reader(fasttext_root + '/abbr_idx_mapper.pkl')
    with open(fasttext_root + '/dataset/all.txt', 'w') as out_file:
        merged_lines = []
        for abbr, abbr_idx in tqdm(abbr_idx_mapper['abbr2idx'].items()):
            merged_lines.extend(txt_reader(fasttext_root + '/dataset/%d.txt' % abbr_idx))
        if shuffle:
            random.shuffle(merged_lines)
        out_file.write("\n".join(merged_lines))
def load_umn(umn_file_path, remove_umn_senses=True):
    """Parse the UMN abbreviation dataset file.

    Each line is pipe-delimited; fields 0, 1 and 3 hold the abbreviation,
    its sense and the start offset, field 6 the raw text.

    :param umn_file_path: path to the latin-1 encoded UMN file
    :param remove_umn_senses: skip records whose sense is a UMN-internal one
        (as decided by ``is_umn_senses``)
    :return: (list of (abbr, sense, start) tuples, parallel list of texts)
    """
    annotations = []
    texts = []
    for record in txt_reader(umn_file_path, encoding="latin-1"):
        fields = record.split("|")
        abbr, sense, start = fields[0], fields[1], fields[3]
        if remove_umn_senses and is_umn_senses(sense):
            continue
        annotations.append((abbr, sense, start))
        texts.append(fields[6])
    return annotations, texts
def sense_inventory_msh(benchmark_mesh_file_path, abbr_list):
    """Build MSH sense inventories restricted to the given abbreviations.

    The benchmark file is tab-delimited: first column an abbreviation,
    remaining columns its candidate CUIs.

    :param benchmark_mesh_file_path: path to the MSH benchmark file
    :param abbr_list: collection of abbreviations to keep
    :return: (inventory limited to one-word abbrs, full inventory),
        each mapping abbr -> list of CUIs
    """
    full_inventory = {}
    one_word_inventory = {}
    for row in txt_reader(benchmark_mesh_file_path):
        fields = row.split("\t")
        abbr, cuis = fields[0], fields[1:]
        if abbr not in abbr_list:
            continue
        full_inventory[abbr] = cuis
        if " " not in abbr:
            one_word_inventory[abbr] = cuis
    return one_word_inventory, full_inventory
def process_annotated_data(txt_preprocessed_path, upmc_processed_path, train_ratio=0.8, n_jobs=30):
    """Pre-process annotated UPMC text and split it into train/test files.

    :param txt_preprocessed_path: input text file (one document per line)
    :param upmc_processed_path: output directory (created if missing)
    :param train_ratio: fraction of documents assigned to the training split
    :param n_jobs: worker count passed to the text processor
    """
    os.makedirs(upmc_processed_path, exist_ok=True)
    annotated_docs = txt_reader(txt_preprocessed_path)
    # pre-processing
    processed_docs = all_processor.process_texts(annotated_docs, n_jobs=n_jobs)
    # train/test split: sample train_ratio of the indices for training
    random.shuffle(processed_docs)
    num_instances = len(processed_docs)
    train_idx = set(random.sample(range(num_instances), int(train_ratio * num_instances)))
    train_docs, test_docs = [], []
    for idx, doc in enumerate(tqdm.tqdm(processed_docs)):
        (train_docs if idx in train_idx else test_docs).append(doc)
    # Write to file
    txt_writer(train_docs, upmc_processed_path + "/upmc_train.txt")
    txt_writer(test_docs, upmc_processed_path + "/upmc_test.txt")
def generate_test_content(test_processed_path, train_processed_path):
    """Compute and save content vectors for each abbreviation in the test set.

    Uses the word2vec model trained on the training corpus; one pickle file
    per abbreviation is written under ``<test_processed_path>/content_vectors/``.
    """
    # word2vec model trained on the training corpus
    w2v_model = gensim.models.Word2Vec.load(train_processed_path + '/train.model')
    # abbreviation occurrence index over the test documents
    abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    # NOTE: despite being test data, this Doc feeds the same abbr_job
    # pipeline used at training time
    docs = Doc(txt_reader(test_processed_path + "/test_no_mark.txt"))
    # Build index for abbrs (used as pickle file names)
    abbr_idx_mapper = build_index_of_abbrs(abbr_index)
    pickle_writer(abbr_idx_mapper, test_processed_path + '/abbr_idx_mapper.pkl')
    # Save all content vectors to pickle files
    content_dir = test_processed_path + '/content_vectors/'
    os.makedirs(content_dir, exist_ok=True)
    print("Saving content vectors...")
    print(len(abbr_index))
    for abbr in tqdm.tqdm(abbr_index):
        abbr_job(abbr, abbr_index, abbr_idx_mapper, docs, w2v_model, content_dir)
def generate_test_data(test_processed_path, window_size=5):
    """Build per-abbreviation fastText dataset files for the test set.

    :param test_processed_path: root dir of processed test data
    :param window_size: context window size handed to abbr_job
    """
    # abbreviation occurrence index and test documents
    abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    docs = Doc(txt_reader(test_processed_path + "/test_no_mark.txt"))
    fasttext_root = test_processed_path + '/fasttext'
    os.makedirs(fasttext_root, exist_ok=True)
    # Build index for abbrs (used as dataset file names)
    abbr_idx_mapper = build_index_of_abbrs(abbr_index)
    pickle_writer(abbr_idx_mapper, fasttext_root + '/abbr_idx_mapper.pkl')
    dataset_dir = fasttext_root + '/dataset/'
    os.makedirs(dataset_dir, exist_ok=True)
    print("Building dataset for fastText...")
    print(len(abbr_index))
    for abbr in tqdm(abbr_index):
        abbr_job(abbr, abbr_index, abbr_idx_mapper, docs, dataset_dir, window_size)
def __init__(self, dataset_file_path):
    """Load the corpus from *dataset_file_path*.

    :param dataset_file_path: path handed to the project's txt_reader
        helper; presumably one document per line — TODO confirm.
    """
    self.corpus = txt_reader(dataset_file_path)
def predict_fasttext_classifier_multi_model(train_processed_path, test_processed_path, use_pretrain=False):
    """Predict senses using one fastText model per abbreviation.

    For each test abbreviation: unseen abbrs/labels get ``sense_pred=None``,
    abbrs with a single training sense are answered trivially, and the rest
    are batch-predicted by the abbr's dedicated model.

    :param train_processed_path: root dir of processed training data
    :param test_processed_path: root dir of processed test data
    :param use_pretrain: load model variants trained from pre-trained vectors
    :return: list of InstancePred, sorted by global instance index
    """
    def _remove_label_prefix(label):
        # BUG FIX: the original used str.lstrip("__label__"), which strips
        # any of the *characters* {'_','l','a','b','e'} from the left and
        # can corrupt labels. Remove the literal "__label__" prefix instead.
        prefix = "__label__"
        return label[len(prefix):] if label.startswith(prefix) else label

    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    # Load abbr index
    test_abbr_idx_mapper = pickle_reader(test_processed_path + '/fasttext/abbr_idx_mapper.pkl')
    test_abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    train_abbr_idx_mapper = pickle_reader(train_processed_path + '/fasttext/abbr_idx_mapper.pkl')
    train_abbr2idx = train_abbr_idx_mapper['abbr2idx']
    train_abbr_label_set = pickle_reader(train_processed_path + '/fasttext/abbr_label_set.pkl')

    instance_collection = []
    # generate testing data
    for abbr, test_abbr_idx in tqdm(test_abbr_idx_mapper['abbr2idx'].items()):
        if abbr not in train_abbr_label_set:
            # abbr never seen in training: no model exists, emit None predictions
            for doc_id, pos_list in test_abbr_index[abbr].items():
                for global_instance_idx, pos, label in pos_list:
                    instance_collection.append(
                        InstancePred(index=global_instance_idx, abbr=abbr, sense_pred=None))
        else:
            train_label_set = train_abbr_label_set[abbr]
            eval_abbr_instance_list = txt_reader(test_processed_path + '/fasttext/dataset/%d.txt' % test_abbr_idx)
            abbr_instance_idx = 0
            # contexts (and their global indices) deferred for batch prediction
            context_list, global_idx_list = [], []
            for doc_id, pos_list in test_abbr_index[abbr].items():
                for global_instance_idx, pos, label in pos_list:
                    # if true label not in train collection
                    if label not in train_label_set:
                        instance_collection.append(
                            InstancePred(index=global_instance_idx, abbr=abbr, sense_pred=None))
                    # if only have 1 CUI, no model needed
                    elif len(train_label_set) == 1:
                        instance_collection.append(
                            InstancePred(index=global_instance_idx, abbr=abbr, sense_pred=label))
                    # need predict
                    else:
                        # get instance: first token is the label, rest is context
                        tokens = eval_abbr_instance_list[abbr_instance_idx].split()
                        label_in_txt = _remove_label_prefix(tokens[0])
                        assert label == label_in_txt
                        context_list.append(" ".join(tokens[1:]))
                        global_idx_list.append(global_instance_idx)
                    abbr_instance_idx += 1
            # predict all collected contexts with this abbr's model in one batch
            if len(context_list) > 0:
                # Load model for this abbreviation
                model_file = model_path + '/%d.bin' % train_abbr2idx[abbr]
                model = load_model(model_file)
                predict_list = model.predict(context_list)[0]
                for idx, predict in zip(global_idx_list, predict_list):
                    instance_collection.append(
                        InstancePred(index=idx, abbr=abbr,
                                     sense_pred=_remove_label_prefix(predict[0])))
    # sort collection list based on global instance idx
    instance_collection = sorted(instance_collection, key=lambda x: x.index)
    return instance_collection
###################################### # Read texts from dataset ###################################### # File paths data_path = "/home/luoz3/wsd_data" upmc_all_path = data_path + "/upmc/batch1_4" upmc_all_processed_path = upmc_all_path + "/processed" os.makedirs(upmc_all_processed_path, exist_ok=True) ############################# # Process DataSet documents (only one word abbrs) ############################# # Initialize processor and tokenizer token_filter = TextTokenFilter() processor = TextProcessor([ white_space_remover, token_filter, repeat_non_word_remover, ]) upmc_all_txt = txt_reader(data_path + "/upmc_batch1_4/upmc_no_mark_new.txt") # pre-processing upmc_all_txt = processor.process_texts(upmc_all_txt, n_jobs=30) # Write to file txt_writer(upmc_all_txt, upmc_all_processed_path+"/train_no_mark.txt") print()