def predict(text, name):
    """Translate English *text* with the `name` encoder/decoder pair (beam width 3)."""
    tokens = clean(text, 'en').split()
    pad_inds = sent2ind(tokens, en_word_inds, seq_len, keep_oov=True)
    src = torch.LongTensor([pad_inds]).to(device)
    encoder = map_item(name + '_encode', models)
    decoder = map_item(name + '_decode', models)
    with torch.no_grad():
        encoder.eval()
        decoder.eval()
        return search(decoder, encoder(src), src, cand=3)
def predict(text, name, mode):
    """Encode *text* with model `name` and decode its state via the `mode` search."""
    src_sent = ' '.join([clean(text), eos])
    src_seq = word2ind.texts_to_sequences([src_sent])[0]
    src_pad = pad_sequences([src_seq], maxlen=seq_len, padding='pre', truncating='pre')
    encoder = map_item(name + '_encode', models)
    decoder = map_item(name + '_decode', models)
    decode_func = map_item(mode, funcs)
    state = encoder.predict(src_pad)
    return decode_func(decoder, state, cand=3)
def load_model(name, embed_mat, device, mode):
    """Load the `mode` sub-network of saved model `name`, copying matching weights."""
    weights = torch.Tensor(embed_mat)
    full_model = torch.load(map_item(name, paths), map_location=device)
    arch = map_item('_'.join([name, mode]), archs)
    part = arch(weights).to(device)
    part_state = part.state_dict()
    for full_key, tensor in full_model.state_dict().items():
        # drop the leading submodule prefix so keys line up with the partial net
        sub_key = full_key.partition('.')[2]
        if sub_key in part_state:
            part_state[sub_key] = tensor
    part.load_state_dict(part_state)
    return part
def define_encode(name, embed_mat, seq_len):
    """Build the (untrained) encoder graph for architecture `name`."""
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num, output_dim=embed_len,
                      input_length=seq_len, name='embed')
    inp = Input(shape=(seq_len,))
    hidden = map_item(name, funcs)(embed(inp))
    model = Model(inp, hidden)
    if __name__ == '__main__':
        # only render the architecture diagram when run as a script
        plot_model(model, map_item(name + '_plot', paths), show_shapes=True)
    return model
def test(name, sent1s, labels):
    """Report corpus perplexity of the `name` encoder/decoder on held-out data.

    Relies on the module-level `sent2s` (decoder input sequences, padded with
    zeros) — presumably aligned index-wise with `sent1s` and `labels`;
    TODO confirm against the caller.
    """
    encode = map_item(name + '_encode', models)
    states = encode.predict(sent1s)
    decode = map_item(name + '_decode', models)
    probs = decode.predict([sent2s, states])
    len_sum, log_sum = 0, 0.0
    for sent2, label, prob in zip(sent2s, labels, probs):
        bound = sum(sent2 > 0)  # number of non-padding positions
        len_sum = len_sum + bound
        for i in range(bound):
            log_sum = log_sum + np.log(prob[i][label[i]])
    # BUG FIX: the summed log-probs are natural logs, so perplexity is
    # exp(-mean), not 2**(-mean) — the old 2-base exponent understated it.
    print('\n%s %s %.2f' % (name, 'perp:', np.exp(-log_sum / len_sum)))
def load_model(name, embed_mat, pos_mat, att_mat, device, mode):
    """Load the `mode` half of saved transformer `name`, copying prefixed weights."""
    weights = torch.Tensor(embed_mat)
    full_state = torch.load(map_item(name, paths), map_location=device).state_dict()
    arch = map_item('_'.join([name, mode]), archs)
    # the decoder additionally takes the attention mask matrix
    if mode == 'decode':
        part = arch(weights, pos_mat, att_mat, head, stack).to(device)
    else:
        part = arch(weights, pos_mat, head, stack).to(device)
    part_state = part.state_dict()
    for sub_key in part_state:
        qual_key = '.'.join([mode, sub_key])
        if qual_key in full_state:
            part_state[sub_key] = full_state[qual_key]
    part.load_state_dict(part_state)
    return part
def rnn_predict(words, name):
    """Tag *words* with RNN model `name`; returns one label index per word."""
    seq = word2ind.texts_to_sequences([' '.join(words)])[0]
    padded = pad_sequences([seq], maxlen=seq_len)
    probs = map_item(name, models).predict(padded)[0]
    # padding is on the left, so the last min(len, seq_len) steps are the words
    keep = min(len(words), seq_len)
    return np.argmax(probs, axis=1)[-keep:]
def test(name, sents, labels):
    """Evaluate classifier `name`: per-class prec/rec to CSV, overall f1/acc to stdout."""
    model = map_item(name, models)
    preds = np.argmax(model.predict(sents), axis=1)
    precs = precision_score(labels, preds, average=None)
    recs = recall_score(labels, preds, average=None)
    with open(map_item(name, paths), 'w') as f:
        f.write('label,prec,rec' + '\n')
        for ind in range(class_num):
            f.write('%s,%.2f,%.2f\n' % (ind_labels[ind], precs[ind], recs[ind]))
    f1 = f1_score(labels, preds, average='weighted')
    print('\n%s f1: %.2f - acc: %.2f\n' % (name, f1, accuracy_score(labels, preds)))
    if detail:
        # print every misclassified example from the module-level `texts`
        for text, gold, guess in zip(texts, labels, preds):
            if gold != guess:
                print('{}: {} -> {}'.format(text, ind_labels[gold], ind_labels[guess]))
def predict(text, name):
    """Classify *text* with model `name`; return 'label prob' pairs, most likely first."""
    text = clean(text)
    seq = word2ind.texts_to_sequences([text])[0]
    pad_seq = pad_sequences([seq], maxlen=seq_len)
    model = map_item(name, models)
    probs = model.predict(pad_seq)[0]
    order = np.argsort(-probs)
    formats = ['{} {:.3f}'.format(ind_labels[ind], probs[ind]) for ind in order]
    if name == 'adnn':
        # visualise attention over the trailing characters of the input
        core = map_item(name + '_core', models)
        atts = core.predict(pad_seq)[0]
        plot_att(text, atts[-len(text):])
    return ', '.join(formats)
def fit(name, max_epoch, en_embed_mat, zh_embed_mat, path_feats, detail):
    """Train translation model `name` with manual LR decay and early stopping.

    Splits the tensorized features in half into train/dev loaders, trains up
    to `max_epoch` epochs, saves the best model (lowest dev loss) to its path,
    divides the learning rate by 10 after 3 non-improving epochs, and stops
    once the rate would drop below 1e-5.
    """
    tensors = tensorize(load_feat(path_feats), device)
    # first half of the feature tensors = train, second half = dev
    bound = int(len(tensors) / 2)
    train_loader, dev_loader = get_loader(tensors[:bound]), get_loader(tensors[bound:])
    en_embed_mat, zh_embed_mat = torch.Tensor(en_embed_mat), torch.Tensor(zh_embed_mat)
    arch = map_item(name, archs)
    model = arch(en_embed_mat, zh_embed_mat, pos_mat, att_mat, head, stack).to(device)
    # ignore_index=0 skips padding positions; 'sum' so loss scales with token count
    loss_func = CrossEntropyLoss(ignore_index=0, reduction='sum')
    learn_rate, min_rate = 1e-3, 1e-5
    min_dev_loss = float('inf')
    # trap_count = consecutive epochs without dev-loss improvement
    trap_count, max_count = 0, 3
    print('\n{}'.format(model))
    train, epoch = True, 0
    while train and epoch < max_epoch:
        epoch = epoch + 1
        model.train()
        # NOTE(review): a fresh Adam is built every epoch, which resets its
        # moment estimates; presumably done so `learn_rate` changes take
        # effect — confirm this is intended rather than optim.param_groups.
        optim = Adam(model.parameters(), lr=learn_rate)
        start = time.time()
        train_loss, train_acc = batch_train(model, loss_func, optim, train_loader, detail)
        delta = time.time() - start
        with torch.no_grad():
            model.eval()
            dev_loss, dev_acc = batch_dev(model, loss_func, dev_loader)
        extra = ''
        if dev_loss < min_dev_loss:
            # improvement: checkpoint the whole model and reset the patience counter
            extra = ', val_loss reduce by {:.3f}'.format(min_dev_loss - dev_loss)
            min_dev_loss = dev_loss
            trap_count = 0
            torch.save(model, map_item(name, paths))
        else:
            trap_count = trap_count + 1
            if trap_count > max_count:
                # patience exhausted: decay LR, or stop if already at the floor
                learn_rate = learn_rate / 10
                if learn_rate < min_rate:
                    extra = ', early stop'
                    train = False
                else:
                    extra = ', learn_rate divide by 10'
                    trap_count = 0
        epoch_print(epoch, delta, train_loss, train_acc, dev_loss, dev_acc, extra)
def compile(name, embed_mat, seq_len):
    """Build and compile the triplet network `name` with a shared trainable embedding."""
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num, output_dim=embed_len, weights=[embed_mat],
                      input_length=seq_len, trainable=True)
    inputs = [Input(shape=(seq_len,)) for _ in range(3)]
    func = map_item(name, funcs)
    output = func(*[embed(inp) for inp in inputs])
    model = Model(inputs, output)
    model.summary()
    plot_model(model, map_item(name + '_plot', paths), show_shapes=True)
    model.compile(loss=triple_loss, optimizer=Adam(lr=0.001), metrics=[triple_acc])
    return model
def fit(name, epoch, embed_mat, triples, margin):
    """Train triplet model `name`; targets are constant margins for the triplet loss."""
    anchors, pos_sents, neg_sents = triples
    margins = np.ones(len(anchors)) * margin
    model = compile(name, embed_mat, len(anchors[0]))
    check_point = ModelCheckpoint(map_item(name, paths), monitor='val_loss',
                                  verbose=True, save_best_only=True)
    model.fit([anchors, pos_sents, neg_sents], margins, batch_size=batch_size,
              epochs=epoch, verbose=True, callbacks=[check_point], validation_split=0.2)
def fit(path_train):
    """Fit every registered topic model on the pickled corpus and persist results."""
    with open(path_train, 'rb') as fin:
        corpus = pk.load(fin)
    for name, build in funcs.items():
        model = build(corpus, id2word=word2ind, num_topics=topic_num)
        # save the human-readable topic/keyword dict alongside the model pickle
        save_dict(name, model.show_topics(num_words=key_num))
        with open(map_item(name, paths), 'wb') as fout:
            pk.dump(model, fout)
def ml_predict(text1, text2, name):
    """Score the match probability of two texts with classic-ML model `name`."""
    pair = [clean(text1), clean(text2)]
    # bag-of-words -> SVD -> pair-feature merge pipeline
    feats = merge(svd.transform(bow.transform(pair)))
    model = map_item(name, models)
    prob = model.predict_proba(feats)[0][1]
    return '{:.3f}'.format(prob)
def compile(name, embed_mat, seq_len, class_num):
    """Build and compile classifier `name` over a trainable pretrained embedding."""
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num, output_dim=embed_len, weights=[embed_mat],
                      input_length=seq_len, trainable=True)
    inp = Input(shape=(seq_len,))
    output = map_item(name, funcs)(embed(inp), class_num)
    model = Model(inp, output)
    model.summary()
    plot_model(model, map_item(name + '_plot', paths), show_shapes=True)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.001),
                  metrics=['accuracy'])
    return model
def xgb_fit(sents, labels):
    """Fit a gradient-boosted-tree multiclass classifier and pickle it."""
    clf = XGBC(max_depth=5, learning_rate=0.1, objective='multi:softmax',
               n_estimators=100, booster='gbtree')
    clf.fit(sents, labels)
    with open(map_item('xgb', paths), 'wb') as f:
        pk.dump(clf, f)
def nn_predict(text1, text2, name):
    """Score the match probability of two texts with neural pair model `name`."""
    pads = [pad_sequences(word2ind.texts_to_sequences([clean(text)]), maxlen=seq_len)
            for text in (text1, text2)]
    model = map_item(name, models)
    prob = model.predict(pads)[0][0]
    return '{:.3f}'.format(prob)
def svm_fit(sents, labels):
    """Fit a linear SVM (probability-calibrated, class-balanced) and pickle it."""
    clf = SVC(C=1.0, kernel='linear', max_iter=1000, probability=True,
              class_weight='balanced', verbose=True)
    clf.fit(sents, labels)
    with open(map_item('svm', paths), 'wb') as f:
        pk.dump(clf, f)
def test(name, sents, labels):
    """Evaluate torch classifier `name`: per-class prec/rec to CSV, f1/acc to stdout."""
    sents, labels = tensorize([sents, labels], device)
    model = map_item(name, models)
    with torch.no_grad():
        model.eval()
        probs = F.softmax(model(sents), dim=1)
        preds = torch.max(probs, dim=1)[1]
    precs = precision_score(labels, preds, average=None)
    recs = recall_score(labels, preds, average=None)
    with open(map_item(name, paths), 'w') as f:
        f.write('label,prec,rec' + '\n')
        for ind in range(class_num):
            f.write('%s,%.2f,%.2f\n' % (ind_labels[ind], precs[ind], recs[ind]))
    f1 = f1_score(labels, preds, average='weighted')
    print('\n%s f1: %.2f - acc: %.2f\n' % (name, f1, accuracy_score(labels, preds)))
    if detail:
        # print every misclassified example from the module-level `texts`
        for text, gold, guess in zip(texts, labels.numpy(), preds.numpy()):
            if gold != guess:
                print('{}: {} -> {}'.format(text, ind_labels[gold], ind_labels[guess]))
def compile(name, embed_mat, seq_len, class_num):
    """Build and compile sequence tagger `name` with a CRF output layer."""
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num, output_dim=embed_len, weights=[embed_mat],
                      input_length=seq_len, trainable=True)
    inp = Input(shape=(seq_len,))
    crf = CRF(class_num)
    output = map_item(name, funcs)(embed(inp), crf)
    model = Model(inp, output)
    model.summary()
    plot_model(model, map_item(name + '_plot', paths), show_shapes=True)
    # the CRF layer supplies its own loss and accuracy
    model.compile(loss=crf.loss_function, optimizer=Adam(lr=0.001), metrics=[crf.accuracy])
    return model
def test_pair(name, pairs, flags, thre):
    """Threshold siamese distances at `thre`; print accuracy, then each error."""
    model = map_item(name, models)
    sent1s, sent2s = pairs
    dists = model.predict([sent1s, sent2s]).ravel()
    preds = dists > thre
    print('\n%s %s %.2f\n' % (name, 'acc:', accuracy_score(flags, preds)))
    # misclassified pairs, with raw texts from module-level text1s/text2s
    for flag, dist, text1, text2, pred in zip(flags, dists, text1s, text2s, preds):
        if flag != pred:
            print('{} {:.3f} {} | {}'.format(flag, dist, text1, text2))
def save_dict(name, topics):
    """Dump each topic's {keyword: score} mapping to the model's `_dict` JSON path.

    `topics` is gensim-style show_topics output: (index, 'score*"word" + ...').
    """
    topic_dicts = []
    for _, joined in topics:
        entries = [item.split('*') for item in joined.split(' + ')]
        # word comes quoted, e.g. "cat" — strip the surrounding quote characters
        topic_dicts.append({word[1:-1]: float(score) for score, word in entries})
    with open(map_item(name + '_dict', paths), 'w') as f:
        json.dump(topic_dicts, f, ensure_ascii=False, indent=4)
def test(name, sents, labels, thre):
    """Evaluate matcher `name` at decision threshold `thre`; print f1 and accuracy.

    `sents` is a plain feature matrix for the classic-ML models ('svm'/'xgb')
    and a (sent1s, sent2s) pair of padded sequences for the neural models.
    """
    model = map_item(name, models)
    # membership test instead of the chained `== or ==` comparison
    if name in ('svm', 'xgb'):
        probs = model.predict_proba(sents)[:, 1]
    else:
        sent1s, sent2s = sents
        probs = model.predict([sent1s, sent2s])
    preds = probs > thre
    f1 = f1_score(labels, preds)
    print('\n%s f1: %.2f - acc: %.2f' % (name, f1, accuracy_score(labels, preds)))
def cache(sents, labels):
    """Encode the split sentence groups with every model and pickle the merged cores."""
    sent_mat, label_mat = split(sents, labels)
    for name, model in models.items():
        # renamed the loop variable (was `sents`, shadowing the parameter)
        encode_mat = [model.predict(group) for group in sent_mat]
        # NOTE(review): clean() rebinds label_mat, so models after the first
        # see already-cleaned labels — confirm that is intended.
        encode_mat, label_mat = clean(encode_mat, label_mat)
        core_sents, core_labels = merge(encode_mat, label_mat)
        path_cache = map_item(name + '_cache', paths)
        with open(path_cache, 'wb') as f:
            pk.dump((core_sents, core_labels), f)
def define_model(name, embed_mat, seq_len, class_num):
    """Build the (untrained) CRF tagger graph for architecture `name`."""
    vocab_num, embed_len = embed_mat.shape
    if name == 'cnn_crf':
        # widen the input for the convolutional variant's window overhang
        seq_len = seq_len + win_len - 1
    embed = Embedding(input_dim=vocab_num, output_dim=embed_len, input_length=seq_len)
    inp = Input(shape=(seq_len,))
    crf = CRF(class_num)
    output = map_item(name, funcs)(embed(inp), crf)
    return Model(inp, output)
def define_model(name, embed_mat, seq_len):
    """Build the (untrained) triplet graph for architecture `name` with shared embedding."""
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num, output_dim=embed_len, input_length=seq_len)
    inputs = [Input(shape=(seq_len,)) for _ in range(3)]
    func = map_item(name, funcs)
    output = func(*[embed(inp) for inp in inputs])
    return Model(inputs, output)
def compile(name, embed_mat, seq_len):
    """Build and compile pair-matching model `name` with a shared trainable embedding."""
    vocab_num, embed_len = embed_mat.shape
    embed = Embedding(input_dim=vocab_num, output_dim=embed_len, weights=[embed_mat],
                      input_length=seq_len, trainable=True, name='embed')
    inp1, inp2 = Input(shape=(seq_len,)), Input(shape=(seq_len,))
    func = map_item(name, funcs)
    output = func(embed(inp1), embed(inp2))
    model = Model([inp1, inp2], output)
    model.summary()
    plot_model(model, map_item(name + '_plot', paths), show_shapes=True)
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001),
                  metrics=['accuracy'])
    return model
def predict(text, name):
    """Translate English *text* with model `name`; optionally plot attention.

    Appends EOS to the cleaned source, left-pads it to `seq_len`, encodes once,
    then beam-searches (cand=3) the decoder. When `name` is 'att' and the
    module runs as a script, it additionally feeds the BOS-prefixed prediction
    back through the `_core` model to visualise the attention weights.
    """
    en_text = clean(text, 'en')
    en_text = ' '.join([en_text, eos])
    en_words = en_text.split()
    en_pad_seq = sent2ind(en_words, en_word_inds, seq_len, 'pre', keep_oov=True)
    en_sent = torch.LongTensor([en_pad_seq]).to(device)
    encode = map_item(name + '_encode', models)
    decode = map_item(name + '_decode', models)
    with torch.no_grad():
        encode.eval()
        state = encode(en_sent)
        decode.eval()
        zh_pred = search(decode, state, cand=3)
        if name == 'att' and __name__ == '__main__':
            # NOTE(review): zh_text is a *string*, so sent2ind/plot_att iterate
            # it character-by-character — presumably intended for Chinese
            # char-level output; confirm against sent2ind's expectations.
            zh_text = bos + zh_pred
            zh_pad_seq = sent2ind(zh_text, zh_word_inds, seq_len, 'post', keep_oov=True)
            zh_sent = torch.LongTensor([zh_pad_seq]).to(device)
            core = map_item(name + '_core', models)
            atts = core(zh_sent, state)[0]
            # drop the source EOS token; shift target left and re-append EOS
            plot_att(en_words[:-1], zh_text[1:] + eos, atts)
        return zh_pred
def dnn_predict(words, name):
    """Tag *words* with windowed DNN `name`: one centred win_len window per word."""
    seq = word2ind.texts_to_sequences([' '.join(words)])[0]
    # zero-pad both ends so every word gets a full centred window
    pad = [0] * int((win_len - 1) / 2)
    padded = pad + seq + pad
    windows = np.array([padded[start:start + win_len]
                        for start in range(len(padded) - win_len + 1)])
    model = map_item(name, models)
    probs = model.predict(windows)
    return np.argmax(probs, axis=1)
def test(name, sents, labels, thre):
    """Masked binary evaluation of torch model `name` at threshold `thre`."""
    sents, labels = tensorize([sents, labels], device)
    model = map_item(name, models)
    with torch.no_grad():
        model.eval()
        probs = torch.squeeze(torch.sigmoid(model(sents)), dim=-1)
    # labels of -1 mark padding positions and are excluded from the metrics
    mask = labels > -1
    kept_probs = probs.masked_select(mask)
    kept_labels = labels.masked_select(mask)
    kept_preds = kept_probs > thre
    f1 = f1_score(kept_labels, kept_preds)
    print('\n%s f1: %.2f - acc: %.2f' % (name, f1, accuracy_score(kept_labels, kept_preds)))