def get_sentences():
    # Each "sentence" is the ICD-10 code followed by the tokens of its short
    # description, so codes and description words appear in one context window.
    data = load_description()
    ids = lmap(lambda x: x['icd10_code'].strip(), data)
    input2 = lmap(lambda x: x['short_desc'], data)
    desc_tokens = lmap(nltk.word_tokenize, input2)
    sentences = []
    for id_token, desc_token in zip(ids, desc_tokens):
        sentences.append([id_token] + desc_token)
    return sentences
def extract_data(data, word2idx):
    def tokens_to_idx(tokens):
        return [word2idx[t] for t in tokens]

    ids = lmap(lambda x: x['order_number'], data)
    icd10_codes = lmap(lambda x: x['icd10_code'].strip(), data)
    desc_list = lmap(lambda x: x['short_desc'], data)
    train_tokens = lmap(tokens_to_idx, tokenize(desc_list))
    return icd10_codes, ids, train_tokens
def fit_and_tokenize(input2):
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(input2)
    input2 = lmap(tf.keras.preprocessing.text.text_to_word_sequence, input2)
    print(input2[0])
    # texts_to_sequences_generator yields the index sequences lazily.
    enc_text = tokenizer.texts_to_sequences_generator(input2)
    return enc_text, tokenizer
def build_voca(data):
    short_desc_list = lmap(lambda x: x['short_desc'], data)
    all_text = tokenize(short_desc_list)
    voca = set(flatten(all_text))
    n_output_voca = len(voca)
    word2idx = {word: idx for idx, word in enumerate(voca)}
    return n_output_voca, word2idx
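
# Illustrative sketch (not part of the original pipeline): build_voca() and
# extract_data() are meant to be chained on the same description records.
# The helper name below is hypothetical.
def _example_prepare_arrays():
    data = load_description()
    n_output_voca, word2idx = build_voca(data)
    icd10_codes, ids, train_tokens = extract_data(data, word2idx)
    return n_output_voca, icd10_codes, ids, train_tokens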
def do_eval(w2v):
    data = load_description()
    ids = set(lmap(lambda x: x['icd10_code'].strip(), data))
    ap_list = []
    for e in data[:1000]:
        word = e['icd10_code'].strip()
        terms = nltk.word_tokenize(e['short_desc'])
        # Rank the nearest neighbors of the code, excluding other ICD-10 codes,
        # and score how many of them are words from the code's own description.
        ranked_list = [w for w in w2v.most_similar(word, topn=50) if w[0] not in ids]

        def is_correct(w_pair):
            return w_pair[0] in terms

        AP = AP_from_binary(lmap(is_correct, ranked_list), len(terms))
        ap_list.append(AP)
    print("MAP", sum(ap_list) / len(ap_list))
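
# Illustrative sketch, assuming the w2v passed to do_eval() is a gensim
# KeyedVectors trained on the sentences from get_sentences(). gensim is not
# imported elsewhere in this file, and the hyperparameters are placeholders
# (vector_size is the gensim 4.x name; older versions call it size).
def _example_train_and_eval_w2v():
    from gensim.models import Word2Vec
    sentences = get_sentences()
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    do_eval(model.wv)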
def input_fn(data, tokenizer, max_seq):
    input2 = lmap(lambda x: x['short_desc'], data)
    ids = lmap(lambda x: x['order_number'], data)
    code_ids = [[e] for e in ids]
    input2 = lmap(tf.keras.preprocessing.text.text_to_word_sequence, input2)
    enc_text = tokenizer.texts_to_sequences_generator(input2)

    def fit_to_length(e):
        # Truncate or zero-pad each token sequence to exactly max_seq entries.
        e = e[:max_seq]
        e = e + (max_seq - len(e)) * [0]
        return [e]

    enc_desc_tokens = lmap(fit_to_length, enc_text)
    dataset = tf.data.Dataset.from_tensor_slices({
        str_code_id: code_ids,
        str_desc_tokens: enc_desc_tokens,
    })
    dataset = dataset.shuffle(1000)
    dataset = dataset.repeat()
    return dataset
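
# Illustrative sketch: the dataset returned by input_fn() is unbatched and
# repeats forever, so take() is needed to bound any inspection loop. The helper
# name is hypothetical; tokenizer is assumed to come from fit_and_tokenize().
def _example_inspect_dataset(data, tokenizer, max_seq=1):
    for element in input_fn(data, tokenizer, max_seq).take(3):
        print(element)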
def train_loop(args):
    model_dir = os.path.join(model_path, "w2v_model")
    data = load_description()
    n_input_voca = data[-1]['order_number']
    input2 = lmap(lambda x: x['short_desc'], data)
    dim = 1000
    max_seq = 1
    batch_size = int(args.batch_size)
    epochs = int(args.epochs)
    lr = float(args.lr)
    print("learning rate", lr)
    print("Batch_size", batch_size)
    enc_text, tokenizer = fit_and_tokenize(input2)
    # Tokenizer.get_config() serializes word_index to a JSON string, so len() on
    # it is not the vocabulary size; read the dict attribute directly instead.
    n_output_voca = len(tokenizer.word_index)
    random.shuffle(data)
    train_size = int(0.1 * len(data))
    train_data = data[:train_size]
    val_data = data[train_size:]
    step_per_epoch = int(len(train_data) / batch_size)
    max_step = step_per_epoch * epochs
    config = tf.estimator.RunConfig().replace(
        keep_checkpoint_max=1,
        log_step_count_steps=10,
        save_checkpoints_steps=step_per_epoch)
    tf_logging = logging.getLogger('tensorflow')
    tf_logging.setLevel(logging.DEBUG)
    print("Building_estimator")
    estimator = tf.estimator.Estimator(
        model_dir=model_dir,
        model_fn=build_model_fn(lr, dim, max_seq, n_input_voca, n_output_voca),
        config=config)
    print("start training")
    # estimator.train(
    #     input_fn=lambda: input_fn(train_data, tokenizer, max_seq),
    #     steps=max_step)
    # Estimator.predict() takes no `steps` argument and returns a lazy
    # generator, so the predictions are consumed to make it actually run.
    predictions = estimator.predict(
        input_fn=lambda: input_fn(train_data, tokenizer, max_seq))
    for _ in predictions:
        pass
def work(args):
    data = load_description()
    n_input_voca = data[-1]['order_number']
    input2 = lmap(lambda x: x['short_desc'], data)
    dim = 1000
    max_seq = 1
    batch_size = int(args.batch_size)
    epochs = int(args.epochs)
    lr = float(args.lr)
    print("learning rate", lr)
    print("Batch_size", batch_size)
    enc_text, tokenizer = fit_and_tokenize(input2)
    random.shuffle(data)
    train_size = int(0.1 * len(data))
    train_data = data[:train_size]
    val_data = data[train_size:]
    step_per_epoch = int(len(train_data) / batch_size)
    train_dataset = input_fn(train_data, tokenizer, max_seq)
    val_dataset = input_fn(val_data, tokenizer, max_seq)
    # As in train_loop(): read word_index directly rather than via get_config(),
    # which JSON-serializes it.
    n_output_voca = len(tokenizer.word_index)
    loss, model = build_model(dim, max_seq, n_input_voca, n_output_voca)
    model.add_loss(loss)
    # model = multi_gpu_model(model, 4, cpu_relocation=True)
    optimizer = tf.keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(optimizer=optimizer)
    # Keras rejects a batch_size argument when the input is a tf.data.Dataset,
    # so any batching has to happen inside input_fn instead.
    model.fit(
        train_dataset,
        # validation_data=val_dataset,
        # validation_steps=3000,
        epochs=epochs,
        steps_per_epoch=step_per_epoch)
    model.save('my_model.h5')
def work2():
    data = load_description()
    n_input_voca = data[-1]['order_number']
    input2 = lmap(lambda x: x['short_desc'], data)
    dim = 100
    max_seq = 30
    all_text = tokenize(input2)
    voca = set(flatten(all_text))
    word2idx = {word: idx for idx, word in enumerate(voca)}

    def tokens_to_idx(tokens):
        return [word2idx[t] for t in tokens]

    random.shuffle(data)
    train_size = int(0.9 * len(data))
    train_data = data[:train_size]
    val_data = data[train_size:]
    train_ids = lmap(lambda x: x['order_number'], train_data)
    icd10_codes = lmap(lambda x: x['icd10_code'].strip(), train_data)
    train_desc = lmap(lambda x: x['short_desc'], train_data)
    # Descriptions must be tokenized before index lookup (as in extract_data);
    # mapping tokens_to_idx over raw strings would iterate characters.
    train_tokens = lmap(tokens_to_idx, tokenize(train_desc))
    n_output_voca = len(voca)
    print("n_output_voca", n_output_voca)
    W2 = np.random.normal(0, 1, [n_output_voca + 1, dim])
    W1 = np.zeros([n_input_voca + 1, dim])
    add_subword = False
    code_id_to_code = {}
    code_to_code_id = {}
    for code_id, icd10_code, text_seq in zip(train_ids, icd10_codes, train_tokens):
        # A code's embedding is the sum of its description-word embeddings.
        for idx in text_seq:
            W1[code_id] += W2[idx]
        code_id_to_code[code_id] = icd10_code
        code_to_code_id[icd10_code] = code_id
        if add_subword:
            # Optionally also add the embeddings of already-seen code prefixes.
            for j in range(1, len(icd10_code) - 1):
                substr = icd10_code[:j]
                if substr in code_to_code_id:
                    W1[code_id] += W1[code_to_code_id[substr]]
    new_w2 = [W2[i] / np.linalg.norm(W2[i]) for i in range(n_output_voca)]
    all_voca = []
    for code_id in train_ids:
        icd10_code = code_id_to_code[code_id]
        all_voca.append((icd10_code, W1[code_id]))
    print("Testing")
    AP_list = []
    for code_id, text_seq in zip(train_ids, train_tokens):
        a = W1[code_id] / np.linalg.norm(W1[code_id])
        scored = [(j, np.dot(a, new_w2[j])) for j in range(n_output_voca)]
        scored.sort(key=lambda x: x[1], reverse=True)
        ranked_list = scored[:50]
        terms = text_seq

        def is_correct(w_pair):
            return w_pair[0] in terms

        AP = AP_from_binary(lmap(is_correct, ranked_list), len(terms))
        AP_list.append(AP)
        print("1")
        if len(AP_list) > 100:
            break
    print(sum(AP_list) / len(AP_list))
def encode_data(vocabulary_set, text_list):
    # text_to_word_sequence() is only a string splitter and has no .encode();
    # this appears to intend a vocabulary-based encoder. A TokenTextEncoder from
    # tensorflow_datasets (assumption: `import tensorflow_datasets as tfds`)
    # matches the (vocabulary_set, text_list) signature used here.
    encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
    return lmap(encoder.encode, text_list)
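
# Illustrative sketch (assumption): a minimal command-line entry point. The flag
# names mirror the attributes that work() and train_loop() read from `args`;
# the default values are placeholders, not settings from the original.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=256)
    parser.add_argument("--epochs", default=10)
    parser.add_argument("--lr", default=1e-3)
    work(parser.parse_args())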