def get_septic_labels(self, dir_clamp, dir_labels):
    clamp_obj = Clamp()
    labels = dict()
    n_septic = 0

    for i in range(self.n_sepsis_sample_docs + self.n_negative_docs):
        cur_label = 'non_septic'
        is_infected = False
        n_present_labs = 0

        entities = clamp_obj.get_entities(str(i) + '.txt', dir_clamp)
        for cur_entity in entities:
            if not is_infected:
                if (self.is_pneumonia_and_empyema(cur_entity.mention)
                        or self.get_regex_match(cur_entity.mention, self.regex_meningitis)
                        or self.get_regex_match(cur_entity.mention, self.regex_endocarditis)
                        or self.get_regex_match(cur_entity.mention, self.regex_other_infections)
                        ) and cur_entity.assertion.lower() == 'present':
                    # an infection term is mentioned and not negated in the sentence
                    is_infected = True

            if n_present_labs < 2:
                if (self.get_regex_match(cur_entity.mention, self.regex_labs_temp)
                        or self.get_regex_match(cur_entity.mention, self.regex_labs_wbc)
                        or self.get_regex_match(cur_entity.mention, self.regex_mental_status)
                        or self.get_regex_match(cur_entity.mention, self.regex_labs_tachycardia)
                        or self.get_regex_match(cur_entity.mention, self.regex_labs_tachypnea)
                        or self.get_regex_match(cur_entity.mention, self.regex_labs_hyperglycemia)
                        ) and cur_entity.assertion.lower() == 'present':
                    # a patient condition term is mentioned and
                    # not negated in the sentence
                    n_present_labs += 1

        # septic if the patient has an infection and
        # at least two of the pre-specified conditions
        if is_infected and n_present_labs >= 2:
            cur_label = 'septic'
            n_septic += 1

        labels[str(i)] = cur_label

    print("Number of instances labeled as septic: {} of total {} instances"
          .format(n_septic, self.n_sepsis_sample_docs + self.n_negative_docs))

    FileUtils.write_json(labels, 'sepsis_labels.json', dir_labels)
    return labels
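# Hypothetical usage sketch (the class and directory names below are
# assumptions, not from the original code): get_septic_labels reads CLAMP
# entity output for files named '<i>.txt', applies the infection-plus-two-
# conditions rule, and writes 'sepsis_labels.json' to dir_labels.
#
#     labeler = SepsisLabeler()  # hypothetical class exposing the method above
#     labels = labeler.get_septic_labels(dir_clamp='../clamp_out/',
#                                        dir_labels='../labels/')
#     # labels: {'0': 'non_septic', '1': 'septic', ...}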
class Newsgroups:
    PATH_DIR_CORPUS = realpath('../dataset/newsgroups/')
    print(PATH_DIR_CORPUS)

    FNAME_TRAIN = 'train_newsgroups.csv'
    FNAME_VAL = 'val_newsgroups.csv'
    FNAME_TEST = 'test_newsgroups.csv'
    FNAME_LABELDICT = 'newsgroups_labeldict.json'

    PATH_DIR_OUT = realpath('../out/')
    TOKENIZER = spacy_eng_tokenizer
    LABEL_DICT = FileUtils.read_json(FNAME_LABELDICT, PATH_DIR_CORPUS)

    PRETRAINED_EMBS = True
    PATH_DIR_EMBS = '/home/corpora/word_embeddings/'
    FNAME_EMBS = 'glove.840B.300d.txt'
    N_DIM_EMBS = 300
    embs_from_disk = True
    FNAME_EMBS_WT = 'pretrained_embs_newsgroups.npy'

    load_encoder = True
    FNAME_ENCODER = 'corpus_encoder_newsgroups.json'
    PATH_ENCODER = realpath('../out/')

    train_model = True
    model_name = 'lstm'
    n_layers = 3
    n_hid = 600
    n_emb = N_DIM_EMBS
    dropout = 0.3
    bidir = True

    test_mode = 'val'  # val | test
def save(self, fname, dir_out='../out/'):
    seqs = self.corpus_encoder.get_decoded_sequences(self.corpus, strip_angular=True)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        golds = self.corpus.label_encoder.inverse_transform(self.corpus.get_labels())
        preds = self.corpus.label_encoder.inverse_transform(self.preds)

    if not isinstance(golds, list):
        golds = golds.tolist()
    if not isinstance(preds, list):
        preds = preds.tolist()

    # save the sequences, the importance scores, and
    # the gold and predicted labels as a JSON file
    FileUtils.write_json(
        {'seq_lst': seqs, 'imp_scores': self.imp_scores, 'gold': golds, 'pred': preds},
        fname, dir_out)
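# For reference, the JSON written by save() has the shape sketched below. The
# field names come from the write_json call above; the values are illustrative
# only:
#
#     {
#         "seq_lst": [["patient", "febrile", ...], ...],
#         "imp_scores": [[0.02, 0.71, ...], ...],
#         "gold": ["septic", ...],
#         "pred": ["non_septic", ...]
#     }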
@classmethod
def from_imp(cls, pooling, model, corpus, encoder, dir_in='../out/'):
    # reconstruct the file name that the importance scores were saved under
    fname = 'imp_scores_' + model.model_type + \
            '_hid' + str(model.hidden_dim) + '_emb' + str(model.emb_dim) + \
            '_' + splitext(corpus.fname)[0] + '_' + pooling + '.json'
    json_file = FileUtils.read_json(fname, dir_in)

    inst = cls(pooling, model, corpus, encoder, json_file['imp_scores'],
               corpus.label_encoder.transform(json_file['pred']))
    return inst
def sg_param_search(seqs, scores, eval_obj):
    prec = dict()
    best_prec, best_min_n, best_max_n, best_skip = 0., None, None, None

    for min_n in range(1, 5):
        for max_n in range(min_n, min_n + 4):
            for skip in range(11):
                sg = SeqImpSkipGram.from_seqs(seqs, scores, min_n=min_n,
                                              max_n=max_n, skip=skip, topk=50)
                cur_prec = eval_obj.avg_prec_sg(sg.top_sg_seqs)
                # convert the key to a string for JSON serialization
                prec[repr((min_n, max_n, skip))] = cur_prec
                if cur_prec > best_prec:
                    best_prec, best_min_n, best_max_n, best_skip = \
                        cur_prec, min_n, max_n, skip
                print("Average precision at min_n {}, max_n {}, skip {} is: {}"
                      .format(min_n, max_n, skip, cur_prec))
                if max_n == 1:
                    # all skip values yield the same unigrams,
                    # so iterate over skip only once
                    break

    print("Maximum precision {} for min_n {}, max_n {} and skip {}".format(
        best_prec, best_min_n, best_max_n, best_skip))
    FileUtils.write_json(prec, 'sg_param_search.json', '../out/')

    return best_min_n, best_max_n, best_skip
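# Hypothetical usage sketch (variable names are assumptions, not from the
# original module): seqs would be the decoded token sequences, scores the
# matching per-token importance scores, and eval_obj an evaluator exposing
# avg_prec_sg(). The best parameters can then be reused to build the final
# skip-gram set:
#
#     best_min_n, best_max_n, best_skip = sg_param_search(seqs, scores, eval_obj)
#     sg = SeqImpSkipGram.from_seqs(seqs, scores, min_n=best_min_n,
#                                   max_n=best_max_n, skip=best_skip, topk=50)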
def main(f_labels, dir_labels, dir_corpus, fname_suffix, dir_csv, dir_clamp=None):
    label_dict = FileUtils.read_json(f_labels, dir_labels)

    sorted_labels = sorted(label_dict.items())
    fname_lst = [i for i, j in sorted_labels]  # file names in sorted order
    all_labels_lst = [j for i, j in sorted_labels]  # labels sorted according to file names

    train_idx, val_idx, test_idx = split_data(fname_lst, all_labels_lst)
    # train_idx, val_idx, test_idx = read_splits('/home/madhumita/sepsis_synthetic/splits/')

    write_csv(train_idx, label_dict, dir_corpus, dir_clamp,
              'train_' + fname_suffix + '.csv', dir_csv, ["label", "text"])
    write_csv(val_idx, label_dict, dir_corpus, dir_clamp,
              'val_' + fname_suffix + '.csv', dir_csv, ["label", "text"])
    write_csv(test_idx, label_dict, dir_corpus, dir_clamp,
              'test_' + fname_suffix + '.csv', dir_csv, ["label", "text"])
def process_model(ds):
    train_corp = CSVCorpus(ds.FNAME_TRAIN, realpath(ds.PATH_DIR_CORPUS), 'train',
                           ds.TOKENIZER, ds.LABEL_DICT)
    val_corp = CSVCorpus(ds.FNAME_VAL, realpath(ds.PATH_DIR_CORPUS), 'val',
                         ds.TOKENIZER, ds.LABEL_DICT)
    test_corp = CSVCorpus(ds.FNAME_TEST, realpath(ds.PATH_DIR_CORPUS), 'test',
                          ds.TOKENIZER, ds.LABEL_DICT)

    if ds.load_encoder:
        if not exists(realpath(join(ds.PATH_ENCODER, ds.FNAME_ENCODER))):
            raise FileNotFoundError("Encoder not found")
        # load the serialized encoder
        corpus_encoder = CorpusEncoder.from_json(ds.FNAME_ENCODER, ds.PATH_ENCODER)
    else:
        print("Initializing vocabulary")
        corpus_encoder = CorpusEncoder.from_corpus(train_corp)

        if not exists(realpath(ds.PATH_ENCODER)):
            makedirs(realpath(ds.PATH_ENCODER))
        print("Serializing corpus encoder")
        corpus_encoder.to_json(ds.FNAME_ENCODER, realpath(ds.PATH_ENCODER))

    print("Vocab size:", len(corpus_encoder.vocab))

    # model file name, e.g. newsgroups_lstm_3layer_hid600_emb300_dropout0.3_bidirTrue.tar
    # for the Newsgroups config; [6:] strips the 'train_' prefix
    f_model = splitext(ds.FNAME_TRAIN)[0][6:] + '_' + ds.model_name + \
        '_' + str(ds.n_layers) + 'layer' + \
        '_hid' + str(ds.n_hid) + \
        '_emb' + str(ds.n_emb) + \
        '_dropout' + str(ds.dropout) + \
        '_bidir' + str(ds.bidir) + '.tar'

    if ds.train_model:
        if ds.PRETRAINED_EMBS:
            # get the embedding weight matrix
            if ds.embs_from_disk:
                print("Loading word embeddings matrix ...")
                weights = FileUtils.read_numpy(ds.FNAME_EMBS_WT,
                                               realpath(ds.PATH_DIR_OUT))
            else:
                weights = EmbeddingUtils.get_embedding_weight(
                    ds.FNAME_EMBS, realpath(ds.PATH_DIR_EMBS), ds.N_DIM_EMBS,
                    corpus_encoder.vocab.word2idx)
                print("Saving word embeddings matrix ...")
                FileUtils.write_numpy(weights, ds.FNAME_EMBS_WT,
                                      realpath(ds.PATH_DIR_OUT))
            weights = torch.from_numpy(weights).type(torch.FloatTensor)
        else:
            weights = None
        print("Word embeddings loaded!")

        net_params = {
            'n_layers': ds.n_layers,
            'hidden_dim': ds.n_hid,
            'vocab_size': corpus_encoder.vocab.size,
            'padding_idx': corpus_encoder.vocab.pad,
            'embedding_dim': ds.n_emb,
            'emb_weights': weights,
            'dropout': ds.dropout,
            'label_size': len(ds.LABEL_DICT.keys()),
            'batch_size': 64,
            'bidir': ds.bidir
        }
        classifier = LSTMClassifier(**net_params)

        n_epochs = 50
        lr = 0.001
        optimizer = torch.optim.Adam(classifier.parameters(), lr=lr)
        classifier.train_model(train_corp, corpus_encoder, n_epochs, optimizer, val_corp)
        classifier.save(f_model=f_model, dir_model=realpath(ds.PATH_DIR_OUT))
    else:
        print("Loading model", f_model)
        classifier = LSTMClassifier.load(f_model=f_model,
                                         dir_model=realpath(ds.PATH_DIR_OUT))

    if ds.test_mode == 'val':
        eval_corp = val_corp
    elif ds.test_mode == 'test':
        eval_corp = test_corp
    else:
        raise ValueError("Specify val|test corpus for evaluation")

    print("Testing on {} data".format(ds.test_mode))

    # get predictions
    y_pred, y_true = classifier.predict(eval_corp, corpus_encoder)

    # compute scoring metrics
    print("Macro F1 score: ", f1_score(y_true=y_true, y_pred=y_pred, average='macro'))
    print("Accuracy %", accuracy_score(y_true=y_true, y_pred=y_pred) * 100)
def get_septic_notes(self, septic_hadm_ids, fname_notes=FNAME_NOTES, dir_in=PATH_MIMICIII):
    print("Loading notes csv")
    notes_df = PandasUtils.load_csv(fname_notes, dir_in)

    print("Removing error entries")
    prev_len = notes_df.shape[0]
    notes_df = notes_df[notes_df['ISERROR'] != 1]
    assert notes_df.shape[0] < prev_len, "None of the entries were removed"

    print("Removing leading and trailing spaces")
    notes_df['TEXT'] = notes_df['TEXT'].str.strip()

    print("Converting text to lowercase")
    notes_df['TEXT'] = notes_df['TEXT'].str.lower()

    print("Removing blank and NA entries from TEXT and HADM_ID columns")
    notes_df['TEXT'].replace('', np.nan, inplace=True)
    notes_df.dropna(subset=['HADM_ID', 'TEXT'], inplace=True)

    print("Converting HADM ID to int")
    notes_df['HADM_ID'] = notes_df['HADM_ID'].astype('int64')

    print("Converting chartdate to datetime")
    notes_df['CHARTDATE'] = pd.to_datetime(notes_df['CHARTDATE'], format='%Y-%m-%d')
    # print("All data types", notes_df.dtypes)

    print("Dropping duplicates")
    notes_df = notes_df.drop_duplicates(
        subset=['SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME',
                'CATEGORY', 'DESCRIPTION', 'TEXT'],
        keep='first')

    print("Adding septic labels")
    notes_df['SEPTIC'] = np.where(
        notes_df['HADM_ID'].isin(septic_hadm_ids), "septic", "non_septic")

    len_all_notes = [len(cur_note.split()) for cur_note in list(notes_df['TEXT'])]
    print("Average length of notes: ", statistics.mean(len_all_notes))
    print("Total number of notes: ", len(len_all_notes))
    print("Number of septic notes: ",
          notes_df[notes_df['SEPTIC'] == "septic"].shape[0])

    print("All categories of notes")
    print(set(notes_df['CATEGORY']))

    print("Removing social work notes")
    notes_df = notes_df[notes_df['CATEGORY'] != "Social Work"]

    print("Removing rehabilitation notes")
    notes_df = notes_df[notes_df['CATEGORY'] != "Rehab Services"]

    print("Removing nutrition notes")
    notes_df = notes_df[notes_df['CATEGORY'] != "Nutrition"]

    print("Removing discharge notes to prevent direct mention of sepsis")
    notes_df = notes_df[notes_df['CATEGORY'] != "Discharge summary"]

    print("New categories, ", set(notes_df['CATEGORY']))
    print("Total number of notes: ", notes_df.shape[0])
    print("Number of septic notes: ",
          notes_df[notes_df['SEPTIC'] == "septic"].shape[0])

    # keep only the most recent note for every admission
    note_subset = notes_df.loc[notes_df.groupby('HADM_ID').CHARTDATE.idxmax()]
    print("Number of notes after selecting last note per admission: ",
          note_subset.shape[0])
    print("Number of septic notes after selecting last note per admission: ",
          note_subset[note_subset['SEPTIC'] == "septic"].shape[0])

    hadm_ids = list(note_subset[note_subset['SEPTIC'] == "septic"]['HADM_ID'])
    n_mention_sepsis = 0
    for hadm_id in hadm_ids:
        if 'sepsis' in note_subset[note_subset['HADM_ID'] == hadm_id]['TEXT'].item():
            n_mention_sepsis += 1
            # print(note_subset[note_subset['HADM_ID'] == hadm_id]['TEXT'].item())
    print("Number of septic cases that mention sepsis: ", n_mention_sepsis)

    print("Serializing data")
    label_dict = {}  # {"HADM_ID": "septic"/"non_septic"}
    for hadm_id in note_subset['HADM_ID'].tolist():
        cur_label = note_subset[note_subset['HADM_ID'] == hadm_id]['SEPTIC'].item()
        label_dict[str(hadm_id)] = cur_label

        text = note_subset[note_subset['HADM_ID'] == hadm_id]['TEXT'].item()
        FileUtils.write_txt(text, str(hadm_id) + '.txt', PATH_MIMICIII_SEPSIS_TEXT)

    # write the labels json file
    FileUtils.write_json(label_dict, FNAME_LABELS, PATH_MIMICIII_SEPSIS_LABELS)

    # serialize the pandas dataframe as csv
    note_subset.to_csv(join(PATH_MIMICIII_SEPSIS, "mimic_sepsis_subset_df.csv"))
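# A minimal, self-contained sketch of the groupby/idxmax idiom used above to
# keep only the most recent note per admission (toy data, not MIMIC-III):

import pandas as pd

df = pd.DataFrame({
    'HADM_ID': [1, 1, 2],
    'CHARTDATE': pd.to_datetime(['2019-01-01', '2019-01-05', '2019-01-02']),
    'TEXT': ['first note', 'latest note', 'only note'],
})
# idxmax() returns, per HADM_ID, the row index of the latest CHARTDATE;
# .loc then selects exactly those rows.
latest = df.loc[df.groupby('HADM_ID').CHARTDATE.idxmax()]
print(latest['TEXT'].tolist())  # ['latest note', 'only note']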
def write_dataset(self, dir_out):
    write_csv(self.x_train, self.y_train, 'train_newsgroups.csv', dir_out)
    write_csv(self.x_val, self.y_val, 'val_newsgroups.csv', dir_out)
    write_csv(self.x_test, self.y_test, 'test_newsgroups.csv', dir_out)

    FileUtils.write_json(self.label_dict, 'newsgroups_labeldict.json', dir_out)
from src.utils import MathUtils, FileUtils, Validator
from src.mdchar import *

validator = Validator()
mathUtils = MathUtils()
fileUtils = FileUtils()


class RSA:
    def generate_key(self):
        """Generates the public key, taking all required inputs from the user"""
        [p, q] = validator.get_p_and_q_input()

        # Size of the finite set of values, needed so that we can
        # reverse the steps taken to encrypt the message
        n = p * q
        while n <= 26:
            print("[!] P * Q must be greater than 26")
            p = validator.get_prime_input("P")
            q = validator.get_prime_input("Q")
            n = p * q

        totiente = mathUtils.totiente(p, q)
        e = validator.get_e_input(totiente)

        fileUtils.write_file(f"{n} {e}", "public_key")
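# A self-contained sketch of the textbook RSA math that generate_key() relies
# on, using small toy primes. It assumes mathUtils.totiente computes
# (p - 1) * (q - 1); the helper names above are the repo's, everything below
# is illustrative only:

p, q = 7, 11
n = p * q                    # 77 > 26, so every letter index fits in the modulus
totient = (p - 1) * (q - 1)  # 60
e = 13                       # any e with 1 < e < totient and gcd(e, totient) == 1
d = pow(e, -1, totient)      # private exponent: modular inverse of e (Python 3.8+)

m = 20                       # a plaintext letter index, e.g. 'u' -> 20
c = pow(m, e, n)             # encryption with the public key: c = m^e mod n
assert pow(c, d, n) == m     # decryption with the private key recovers m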