import os


class EHRTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, data_dir, special_tokens=("[PAD]", "[CLS]", "[MASK]")):
        self.vocab = Voc()
        # special tokens
        self.vocab.add_sentence(special_tokens)
        self.rx_voc = self.add_vocab(os.path.join(data_dir, 'rx-vocab.txt'))
        self.dx_voc = self.add_vocab(os.path.join(data_dir, 'dx-vocab.txt'))

    def add_vocab(self, vocab_file):
        voc = self.vocab
        specific_voc = Voc()
        with open(vocab_file, 'r') as fin:
            for code in fin:
                voc.add_sentence([code.rstrip('\n')])
                specific_voc.add_sentence([code.rstrip('\n')])
        return specific_voc

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.vocab.word2idx[token])
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids back into tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.vocab.idx2word[i])
        return tokens
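# The original project's Voc class is not included in the snippet above; the
# tokenizer only relies on add_sentence(), word2idx and idx2word. The class
# below is a minimal sketch reconstructed from those calls, not the project's
# actual implementation.
class Voc(object):
    def __init__(self):
        self.idx2word = {}
        self.word2idx = {}

    def add_sentence(self, sentence):
        # Assign the next free integer id to each previously unseen token.
        for word in sentence:
            if word not in self.word2idx:
                self.idx2word[len(self.word2idx)] = word
                self.word2idx[word] = len(self.word2idx)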
# Variant of EHRTokenizer.__init__ that additionally loads vocabularies
# restricted to codes appearing in multi-visit data.
def __init__(self, data_dir, special_tokens=("[PAD]", "[CLS]", "[MASK]")):
    self.vocab = Voc()
    # special tokens
    self.vocab.add_sentence(special_tokens)
    self.rx_voc = self.add_vocab(os.path.join(data_dir, 'rx-vocab.txt'))
    self.dx_voc = self.add_vocab(os.path.join(data_dir, 'dx-vocab.txt'))

    # codes only present in multi-visit data
    self.rx_voc_multi = Voc()
    self.dx_voc_multi = Voc()
    self.rx_voc_multi_pa = Voc()
    self.dx_voc_multi_pa = Voc()
    with open(os.path.join(data_dir, 'rx-vocab-multi.txt'), 'r') as fin:
        for code in fin:
            self.rx_voc_multi.add_sentence([code.rstrip('\n')])
    with open(os.path.join(data_dir, 'dx-vocab-multi.txt'), 'r') as fin:
        for code in fin:
            self.dx_voc_multi.add_sentence([code.rstrip('\n')])
    with open(os.path.join(data_dir, 'rx-vocab-multi-pa.txt'), 'r') as fin:
        for code in fin:
            self.rx_voc_multi_pa.add_sentence([code.rstrip('\n')])
    with open(os.path.join(data_dir, 'dx-vocab-multi-pa.txt'), 'r') as fin:
        for code in fin:
            self.dx_voc_multi_pa.add_sentence([code.rstrip('\n')])
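# Hypothetical usage of the tokenizer (the 'data/' path is a placeholder and is
# assumed to contain the rx-vocab / dx-vocab text files read above):
tokenizer = EHRTokenizer('data/')
ids = tokenizer.convert_tokens_to_ids(['[CLS]', '[MASK]'])
print(tokenizer.convert_ids_to_tokens(ids))   # ['[CLS]', '[MASK]']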
def build_atc_tree(unique_codes):
    res = []
    graph_voc = Voc()
    root_node = 'atc_root'
    for code in unique_codes:
        sample = [code] + [code[:i] for i in [4, 3, 1]] + [root_node]
        graph_voc.add_sentence(sample)
        res.append(sample)
    return res, graph_voc
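# Illustrative call showing the hierarchy produced by the prefix slicing above
# (the ATC code 'A01AA01' is an arbitrary example, not taken from the data):
samples, atc_voc = build_atc_tree(['A01AA01'])
# samples == [['A01AA01', 'A01A', 'A01', 'A', 'atc_root']]
# i.e. each sample lists the full code, its 4-, 3- and 1-character ATC
# prefixes, and a shared 'atc_root' node.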
def build_icd9_tree(unique_codes):
    res = []
    graph_voc = Voc()
    root_node = 'icd9_root'
    level3_dict = expand_level2()
    for code in unique_codes:
        level1 = code
        level2 = level1[:4] if level1[0] == 'E' else level1[:3]
        level3 = level3_dict[level2]
        level4 = root_node
        sample = [level1, level2, level3, level4]
        graph_voc.add_sentence(sample)
        res.append(sample)
    return res, graph_voc
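# The ICD-9 hierarchy is built per code: the full code, its category prefix
# (4 characters for E-codes, 3 otherwise), the chapter-level entry returned by
# expand_level2(), and a shared 'icd9_root'. The codes below are illustrative
# only, and the level-3 values depend on expand_level2(), which is not shown:
#
#   '4280'  -> level2 '428',  level3 = level3_dict['428'],  level4 'icd9_root'
#   'E9305' -> level2 'E930', level3 = level3_dict['E930'], level4 'icd9_root'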
import gzip

import pandas as pd
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from tqdm import tqdm

# Voc here is the project's SMILES tokenizer (built from a token-vocabulary
# file); its import is project-specific and not shown in this snippet.


def corpus(input, output, suffix='sdf'):
    if suffix == 'sdf':
        inf = gzip.open(input)
        mols = Chem.ForwardSDMolSupplier(inf)
        # mols = [mol for mol in suppl]
    else:
        df = pd.read_table(input).Smiles.dropna()
        mols = [Chem.MolFromSmiles(s) for s in df]
    voc = Voc('data/voc_smiles.txt')
    charger = rdMolStandardize.Uncharger()
    chooser = rdMolStandardize.LargestFragmentChooser()
    disconnector = rdMolStandardize.MetalDisconnector()
    normalizer = rdMolStandardize.Normalizer()
    words = set()
    canons = []
    tokens = []
    smiles = set()
    for mol in tqdm(mols):
        try:
            mol = disconnector.Disconnect(mol)
            mol = normalizer.normalize(mol)
            mol = chooser.choose(mol)
            mol = charger.uncharge(mol)
            mol = disconnector.Disconnect(mol)
            mol = normalizer.normalize(mol)
            smileR = Chem.MolToSmiles(mol, 0)
            smiles.add(Chem.CanonSmiles(smileR))
        except Exception:
            print('Parsing Error:')  # , Chem.MolToSmiles(mol))
    for smile in tqdm(smiles):
        token = voc.split(smile) + ['EOS']
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redundant', smile)
            continue
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))
    log = open(output + '_voc.txt', 'w')
    log.write('\n'.join(sorted(words)))
    log.close()
    log = pd.DataFrame()
    log['Smiles'] = canons
    log['Token'] = tokens
    log = log.drop_duplicates(subset='Smiles')
    log.to_csv(output + '_corpus.txt', sep='\t', index=False)
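# Hypothetical invocations (file names are placeholders): with suffix='sdf' the
# function expects a gzipped SDF, otherwise a tab-separated table with a
# 'Smiles' column; either way it writes '<output>_voc.txt' and
# '<output>_corpus.txt'.
corpus('data/chembl_subset.sdf.gz', 'data/chembl', suffix='sdf')
corpus('data/ligands.tsv', 'data/ligands', suffix='tsv')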
import pandas as pd

from utils import Voc

voc = Voc()
df = pd.read_csv("./data/char.train.csv")
voc.add(df["content"])
voc.dumps("./data/voc.json")
# Fragment: the first lines are the tail of the Configs1 definition; the rest
# loads the trained BilstmAspectAttPool model and the saved vocabulary for
# evaluation (the torch / numpy / pandas / tqdm imports live at the top of the
# original file and are not repeated here).
        self.num_layers = 2
        self.pool_kernal = 4
        self.dim_after_pool = int(
            np.ceil((self.hid_dim * 2 - self.pool_kernal) / self.pool_kernal) + 1)
        self.aspect_dim = 64

filename = "./data/char.valid.csv"

# %%
configs = Configs1()
model = BilstmAspectAttPool(configs)
model.load_state_dict(torch.load("./model-zoo/bilstm_aspect_att_pool2.pt"))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
voc = Voc()
voc.loads("./data/voc.json")
df = pd.read_csv("./data/char.test.csv")
columns = df.columns[-20:]
pred_df = df[columns].copy()

# %%
output_list = []
y_list = []

# %%
for i, content in tqdm(enumerate(df["content"])):
    seq = voc.sentence2idx(content.split(" "))
    seq_len = torch.LongTensor([len(seq)])
    seq = torch.LongTensor(seq)
    seq = seq.unsqueeze(-1)
    seq = seq.to(device)
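# The JSON-backed Voc used by the two snippets above (utils.Voc) is not shown;
# it must support add(), dumps(), loads() and sentence2idx(). The class below
# is a minimal sketch consistent with those calls, not the project's actual
# implementation.
import json


class Voc:
    def __init__(self):
        self.word2idx = {}

    def add(self, sentences):
        # `sentences` is an iterable of space-separated strings, e.g. df["content"].
        for sentence in sentences:
            for word in str(sentence).split(" "):
                self.word2idx.setdefault(word, len(self.word2idx))

    def dumps(self, path):
        with open(path, "w") as f:
            json.dump(self.word2idx, f)

    def loads(self, path):
        with open(path) as f:
            self.word2idx = json.load(f)

    def sentence2idx(self, words):
        # Unknown words fall back to index 0 in this sketch.
        return [self.word2idx.get(w, 0) for w in words]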
import re
import pickle
import os
import torch
import yaml

from utils import Voc, Config, dump_pickle

FILEPATH = '/home/lanco/zhaoliang/KB/en_concept_net_extracted.csv'
ROOTPATH = '/home/lanco/zhaoliang/KB/'

edgeList = []
errorList = []
nodeList = set()
relationList = []

config = Config(os.path.join(ROOTPATH, 'config.yml'))
voc = Voc(config)

try:
    with open(FILEPATH, 'r') as file:
        for index, line in enumerate(file):
            if index % 100000 == 0:
                print('processing %d' % index)
            lineSearch = re.search(
                r"/a/\[/r/(.+)/,/c/en/(.+?)/.*,/c/en/(.+)/\]", line)
            if lineSearch is not None and lineSearch.group(
                    1) is not None and lineSearch.group(2) is not None:
                voc.addWord(lineSearch.group(3))
                voc.addWord(lineSearch.group(2))
                if lineSearch.group(1) not in relationList:
                    relationList.append(lineSearch.group(1))
            else:
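# Illustrative only: a ConceptNet assertion URI looks roughly like the string
# below, and the regular expression above pulls the relation and the two
# English concepts out of it (this concrete line is made up, not taken from
# the actual CSV dump):
m = re.search(r"/a/\[/r/(.+)/,/c/en/(.+?)/.*,/c/en/(.+)/\]",
              "/a/[/r/IsA/,/c/en/dog/,/c/en/animal/]")
print(m.group(1), m.group(2), m.group(3))   # IsA dog animal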
import time
import pickle
import unicodedata

import torch

# Voc, normalizeString and indexesFromSentence are assumed to come from the
# project's own chatbot modules (not shown in this snippet).

with open('dic.pkl', 'rb') as f:
    dic = pickle.load(f)

loadFilename = "300000_checkpoint.tar"

USE_CUDA = torch.cuda.is_available()
# device = torch.device("cuda" if USE_CUDA else "cpu")
device = torch.device("cpu")

PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

corpus_name = 'expand_abbr'
voc = Voc(corpus_name)

attn_model = 'dot'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 100
MAX_LENGTH = 200


def evaluate(sentence, max_length=MAX_LENGTH):
    time_start = time.time()
    sentence = normalizeString(sentence)
    sentence = unicodedata.normalize('NFD', sentence)
    indexes_batch = [indexesFromSentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
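# indexesFromSentence is not included in the snippet. In the standard PyTorch
# chatbot-tutorial pattern this script appears to follow, it maps each word to
# its index in the Voc and appends the end-of-sentence token; the sketch below
# assumes that pattern and a tutorial-style voc.word2index lookup dict:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]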