def __init__(self, config, model):
    """Create the tagger's parameters inside ``model``.

    Args:
        config: settings object. This initializer reads ``embedding_dim``,
            ``use_char_rnn``, ``charlstm_hidden_dim`` (only when the
            character RNN is enabled), ``hidden_dim``, ``label2idx``,
            ``idx2labels``, ``word2idx``, ``word_embedding`` and
            ``dropout``.
        model: parameter collection; ``add_parameters`` and
            ``add_lookup_parameters`` are called on it and it is handed to
            the RNN builders.
    """
    self.num_layers = 1
    self.input_dim = config.embedding_dim
    self.model = model

    # Optional character-level encoder.
    self.use_char_rnn = config.use_char_rnn
    if self.use_char_rnn:
        self.char_rnn = CharRNN(config, model)
    else:
        self.char_rnn = None

    # With a character encoder, its hidden size is added to the word
    # embedding size to form the BiLSTM input size.
    if self.char_rnn:
        bilstm_input_dim = self.input_dim + config.charlstm_hidden_dim
    else:
        bilstm_input_dim = self.input_dim
    self.bilstm = dy.BiRNNBuilder(
        1, bilstm_input_dim, config.hidden_dim, self.model, dy.LSTMBuilder)
    print("Input to word-level BiLSTM size: %d" % (bilstm_input_dim))
    print("BiLSTM hidden size: %d" % (config.hidden_dim))

    # Label inventory.
    self.num_labels = len(config.label2idx)
    self.label2idx = config.label2idx
    self.labels = config.idx2labels

    # Projection from BiLSTM outputs to label scores, then the
    # label-by-label transition table.
    label_count = self.num_labels
    self.linear_w = self.model.add_parameters(
        (label_count, config.hidden_dim))
    self.linear_bias = self.model.add_parameters((label_count, ))
    self.transition = self.model.add_lookup_parameters(
        (label_count, label_count))

    # Word embedding table, initialised from ``config.word_embedding``.
    num_words = len(config.word2idx)
    self.word2idx = config.word2idx
    print("Word Embedding size: %d x %d" % (num_words, self.input_dim))
    self.word_embedding = self.model.add_lookup_parameters(
        (num_words, self.input_dim), init=config.word_embedding)

    self.dropout = config.dropout
def main(config):
    """Load the poem corpus, build a ``CharRNN`` and run its training.

    Args:
        config: settings object; reads ``file_name``, ``batch_size``,
            ``epoch_size``, ``num_layers``, ``learning_rate``, ``rnn_size``
            and ``checkpoint_dir``.
    """
    corpus = process_poems(config.file_name, start_token='S', end_token='E')
    # The fourth element (id -> word map) is returned but not used here.
    vocabulary, token_to_id, encoded_poems, _id_to_token = corpus

    batches = generate_batch(config.batch_size, encoded_poems, token_to_id)

    # NOTE(review): the +1 presumably reserves one extra id — confirm
    # against CharRNN / process_poems.
    vocab_size = len(vocabulary) + 1

    with tf.Session() as session:
        # NOTE(review): the trailing positional ``False`` is an opaque flag
        # of CharRNN's constructor — confirm its meaning there.
        trainer = CharRNN(
            session,
            config.epoch_size,
            config.num_layers,
            config.batch_size,
            config.learning_rate,
            vocab_size,
            config.rnn_size,
            batches,
            config.checkpoint_dir,
            False,
        )
        trainer.train()
def __init__(self, config, model, mask):
    """Create the tagger's parameters inside ``model`` and store ``mask``.

    Args:
        config: settings object. This initializer reads ``embedding_dim``,
            ``use_char_rnn``, ``charlstm_hidden_dim`` (only when the
            character RNN is enabled), ``hidden_dim``, ``label2idx``,
            ``idx2labels``, ``word2idx``, ``word_embedding`` and
            ``dropout``.
        model: parameter collection; ``add_parameters`` and
            ``add_lookup_parameters`` are called on it and it is handed to
            the RNN builders.
        mask: stored unchanged on ``self.mask``; not otherwise used here.
    """
    self.num_layers = 1
    self.input_dim = config.embedding_dim
    self.model = model

    # Optional character-level encoder.
    self.use_char_rnn = config.use_char_rnn
    if self.use_char_rnn:
        self.char_rnn = CharRNN(config, model)
    else:
        self.char_rnn = None

    # With a character encoder, its hidden size is added to the word
    # embedding size to form the BiLSTM input size.
    if self.char_rnn:
        bilstm_input_dim = self.input_dim + config.charlstm_hidden_dim
    else:
        bilstm_input_dim = self.input_dim
    self.bilstm = dy.BiRNNBuilder(
        1, bilstm_input_dim, config.hidden_dim, self.model, dy.LSTMBuilder)
    print("Input to word-level BiLSTM size: %d" % (bilstm_input_dim))
    print("BiLSTM hidden size: %d" % (config.hidden_dim))

    # Label inventory.
    self.num_labels = len(config.label2idx)
    self.label2idx = config.label2idx
    self.labels = config.idx2labels

    label_count = self.num_labels
    self.linear_w = self.model.add_parameters(
        (label_count, config.hidden_dim))
    self.linear_bias = self.model.add_parameters((label_count, ))

    # Transition table: random start values, with the START row and the
    # STOP column set to -1e10, then the label-pair constraints applied by
    # ``init_iobes_constraint`` (which edits the array in place).
    start_id = self.label2idx[START]
    stop_id = self.label2idx[STOP]
    initial_transitions = np.random.rand(label_count, label_count)
    initial_transitions[start_id, :] = -1e10
    initial_transitions[:, stop_id] = -1e10
    self.init_iobes_constraint(initial_transitions)
    self.transition = self.model.add_lookup_parameters(
        (label_count, label_count), init=initial_transitions)

    # Word embedding table, initialised from ``config.word_embedding``.
    num_words = len(config.word2idx)
    self.word2idx = config.word2idx
    print("Word Embedding size: %d x %d" % (num_words, self.input_dim))
    self.word_embedding = self.model.add_lookup_parameters(
        (num_words, self.input_dim), init=config.word_embedding)

    self.mask = mask
    self.dropout = config.dropout
def train(args):
    """Train a ``CharRNN`` and save the weights with the best test perplexity.

    Reads ``transcription_train.txt`` / ``transcription_test.txt``, writes
    the hyper-parameters needed to rebuild the network to
    ``state_vars.pkl`` and, every ``args.print_every`` epochs, evaluates on
    the test data and saves the weights to ``bestModel.t7`` when the
    perplexity improved.

    Args:
        args: namespace providing ``hidden_size``, ``embedding_dim``,
            ``n_layers``, ``learning_rate``, ``n_epochs`` and
            ``print_every`` (it is also forwarded to ``train_batch`` and
            ``train_set``).

    Returns:
        The trained network (the in-memory model after the last epoch, not
        necessarily the checkpointed best one).
    """
    # load data
    text, vocab_size, mapping = load_data('transcription_train.txt')
    test = load_test_file('transcription_test.txt')

    # Dump few states to use in generation. The context manager makes sure
    # the file is flushed and closed instead of leaking the handle.
    with open('state_vars.pkl', 'wb') as state_file:
        dump([vocab_size, args.hidden_size, args.embedding_dim,
              args.n_layers], state_file)

    # Create the network and the loss function.
    my_net = CharRNN(hidden_size=args.hidden_size,
                     embedding_dim=args.embedding_dim,
                     output_size=vocab_size,
                     n_layers=args.n_layers)
    loss_fn = torch.nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        # Move the network and the loss to the GPU.
        my_net = my_net.cuda()
        loss_fn = loss_fn.cuda()

    # Build the optimizer only after the model sits on its final device, as
    # recommended by the torch.optim documentation.
    optim = torch.optim.Adam(my_net.parameters(), lr=args.learning_rate)

    prev_ppl = 1000000
    for epoch in range(1, args.n_epochs + 1):
        start_time = timer()
        loss = train_batch(my_net, optim, loss_fn, args,
                           *train_set(args, text, len(text), mapping))

        if epoch % args.print_every == 0:
            val_loss = evaluate(my_net, loss_fn, mapping, test)
            ppl = math.exp(val_loss)
            # The reported time covers the current epoch only.
            print(
                "Epoch {} : Training Loss: {:.5f}, Test ppl: {:.5f}, Time elapsed {:.2f} mins"
                .format(epoch, loss, ppl, (timer() - start_time) / 60))

            if ppl < prev_ppl:
                prev_ppl = ppl
                torch.save(my_net.state_dict(), 'bestModel.t7')
                print("Perplexity reduced, saving model !!")

    return my_net
print("\ntrain/dev/test size: {:d}/{:d}/{:d}\n".format(len(train_y), len(dev_y), len(test_y))) with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): # Instantiate our model rnn = CharRNN(vocabulary_size, FLAGS.sentence_length, FLAGS.batch_size, 2, embedding_size=FLAGS.embedding_dim, hidden_dim=FLAGS.hidden_dim, num_layers=FLAGS.num_layers, loss=FLAGS.loss_type) # Generate input batches (using tensorflow) with tf.variable_scope("input"): placeholder_x = tf.placeholder(tf.int32, train_x.shape) placeholder_y = tf.placeholder(tf.float32, train_y.shape) train_x_var = tf.Variable(placeholder_x, trainable=False, collections=[]) train_y_var = tf.Variable(placeholder_y, trainable=False, collections=[]) x_slice, y_slice = tf.train.slice_input_producer(
import theano
import theano.tensor as T
import sys
import random

###############################
#
# Prepare the data
#
###############################

# f = open("../data/reuters21578/reut2-002.sgm")
# The context manager closes the corpus file even if reading fails.
with open("../data/tinyshakespeare/input.txt") as f:
    text = f.read()

rnn = CharRNN()
seq_len = 150


def train(eta, iters):
    """Run ``iters`` training steps on random windows of ``text``.

    Each step takes ``seq_len`` characters as input and the same window
    shifted by one character as target, calls ``rnn.train`` and prints the
    returned cost.

    Args:
        eta: forwarded to ``rnn.train`` as its third argument
            (presumably the learning rate -- confirm against CharRNN).
        iters: number of training steps.

    Raises:
        ValueError: if ``text`` has fewer than ``seq_len + 1`` characters.
    """
    # random.randint is inclusive on both ends and the target is the input
    # shifted by one, so a window starting at j is only complete when
    # j + 1 + seq_len <= len(text). Floor division keeps the index an int
    # on both Python 2 and Python 3.
    last_window = (len(text) - seq_len - 1) // seq_len
    if last_window < 0:
        raise ValueError("text is too short for seq_len=%d" % seq_len)

    for it in xrange(iters):
        i = random.randint(0, last_window)
        j = i * seq_len
        X = text[j:(j + seq_len)]
        Y = text[(j + 1):(j + 1 + seq_len)]
        # NOTE(review): the trailing 1.0 is an opaque argument of
        # rnn.train -- confirm its meaning there.
        print("iteration: %s, cost: %s" % (
            str(it), str(rnn.train(one_hot(X), one_hot(Y), eta, 1.0))))
_, hidden = model(primer[:, p], hidden) input = primer[:, -1] predicted = in_text # generate a fixed number of characters, to generate indefinite string replace this with while loop for _ in range(n_chars): # predict character y, hidden = model(input, hidden) _, yhat = y.max(dim=1) yhat = yhat.data.cpu()[0] char = reverse_mapping[yhat] predicted += char input = get_encoded_sequence(char, mapping) return predicted vocab_size, hidden_size, embedding_dim, n_layers = load(open('state_vars.pkl', 'rb')) # load the model model = CharRNN(hidden_size=hidden_size, embedding_dim=embedding_dim, output_size=vocab_size, n_layers=n_layers) if torch.cuda.is_available(): model = model.cuda() model.load_state_dict(torch.load('model.t7')) # load the mapping mapping = load(open('mapping.pkl', 'rb')) reverse_mapping = load(open('reverse_mapping.pkl', 'rb')) # Generate few sentences print(generate_seq(model, mapping, reverse_mapping, 'టాలు కూడా వ', 3000)) print(generate_seq(model, mapping, reverse_mapping, 'కదా అంత మంచ', 3000)) print(generate_seq(model, mapping, reverse_mapping, 'ును అందుకని', 3000))
class Partial_Perceptron:
    """BiLSTM tagger trained with a structured-perceptron style loss.

    Positions whose gold tag is ``"O"`` are treated specially: in
    ``negative_log`` they are flagged as "prediction" positions and
    ``forward_labeled`` leaves them out of the path score.
    """

    def __init__(self, config, model):
        """Create the tagger's parameters inside ``model``.

        Args:
            config: settings object; reads ``embedding_dim``,
                ``use_char_rnn``, ``charlstm_hidden_dim`` (only when the
                character RNN is enabled), ``hidden_dim``, ``label2idx``,
                ``idx2labels``, ``word2idx``, ``word_embedding`` and
                ``dropout``.
            model: parameter collection the parameters are added to.
        """
        self.num_layers = 1
        self.input_dim = config.embedding_dim
        self.model = model
        self.use_char_rnn = config.use_char_rnn
        self.char_rnn = CharRNN(config, model) if self.use_char_rnn else None
        input_size = self.input_dim if not self.char_rnn else self.input_dim + config.charlstm_hidden_dim
        self.bilstm = dy.BiRNNBuilder(1, input_size, config.hidden_dim,
                                      self.model, dy.LSTMBuilder)
        print("Input to word-level BiLSTM size: %d" % (input_size))
        print("BiLSTM hidden size: %d" % (config.hidden_dim))
        # self.bilstm.set_dropout(config.dropout_bilstm)
        self.num_labels = len(config.label2idx)
        self.label2idx = config.label2idx
        self.labels = config.idx2labels
        self.o_id = self.label2idx["O"]

        self.linear_w = self.model.add_parameters(
            (self.num_labels, config.hidden_dim))
        self.linear_bias = self.model.add_parameters((self.num_labels, ))

        # Random start values; the START row and the STOP column are set to
        # -1e10, then the label-pair constraints are applied in place.
        trans_np = np.random.rand(self.num_labels, self.num_labels)
        trans_np[self.label2idx[START], :] = -1e10
        trans_np[:, self.label2idx[STOP]] = -1e10
        self.init_iobes_constraint(trans_np)
        self.transition = self.model.add_lookup_parameters(
            (self.num_labels, self.num_labels), init=trans_np)

        vocab_size = len(config.word2idx)
        self.word2idx = config.word2idx
        print("Word Embedding size: %d x %d" % (vocab_size, self.input_dim))
        self.word_embedding = self.model.add_lookup_parameters(
            (vocab_size, self.input_dim), init=config.word_embedding)
        self.dropout = config.dropout

    def init_iobes_constraint(self, trans_np):
        """Set ``trans_np[l2, l1]`` to -1e10 for rejected label pairs.

        A pair (previous ``l1``, next ``l2``) is rejected when
        ``check_bies_constraint`` returns a falsy value for the two label
        strings. START and STOP are skipped on both sides. ``trans_np`` is
        modified in place.
        """
        special_ids = (self.label2idx[START], self.label2idx[STOP])
        for l1 in range(self.num_labels):  # previous label
            if l1 in special_ids:
                continue
            for l2 in range(self.num_labels):  # next label
                if l2 in special_ids:
                    continue
                if not check_bies_constraint(self.labels[l1], self.labels[l2]):
                    trans_np[l2, l1] = -1e10

    def build_graph_with_char(self, x, all_chars, is_train):
        """Return per-position label scores using word + character features.

        Args:
            x: word ids, one per position.
            all_chars: per-word character input passed to
                ``self.char_rnn.forward_char``.
            is_train: when true, dropout is applied to each concatenated
                input vector.
        """
        embeddings = []
        for w, chars in zip(x, all_chars):
            word_emb = self.word_embedding[w]
            f, b = self.char_rnn.forward_char(chars)
            concat = dy.concatenate([word_emb, f, b])
            embeddings.append(
                dy.dropout(concat, self.dropout) if is_train else concat)
        lstm_out = self.bilstm.transduce(embeddings)
        features = [
            dy.affine_transform([self.linear_bias, self.linear_w, rep])
            for rep in lstm_out
        ]
        return features

    def build_graph(self, x, is_train):
        """Return per-position label scores using word embeddings only.

        Dropout is applied to the embeddings when ``is_train`` is true.
        """
        # dy.renew_cg()
        if is_train:
            embeddings = [
                dy.dropout(self.word_embedding[w], self.dropout) for w in x
            ]
        else:
            embeddings = [self.word_embedding[w] for w in x]
        lstm_out = self.bilstm.transduce(embeddings)
        features = [
            dy.affine_transform([self.linear_bias, self.linear_w, rep])
            for rep in lstm_out
        ]
        return features

    def forward_unlabeled(self, features, output):
        """Max-score forward pass that skips positions tagged ``"O"``.

        Args:
            features: per-position label score expressions.
            output: gold tag ids, aligned with ``features``.

        Returns:
            The result of ``max_score`` over the terminal column.
        """
        init_alphas = [-1e10] * self.num_labels
        init_alphas[self.label2idx[START]] = 0
        for_expr = dy.inputVector(init_alphas)
        for pos, obs in enumerate(features):
            if output[pos] == self.o_id:
                # "O" positions leave the running scores untouched.
                continue
            # The transition term is added only at the first position or
            # when the previous position is not "O".
            use_transition = pos == 0 or output[pos - 1] != self.o_id
            alphas_t = []
            for next_tag in range(self.num_labels):
                obs_broadcast = dy.concatenate([dy.pick(obs, next_tag)] *
                                               self.num_labels)
                if use_transition:
                    next_tag_expr = for_expr + self.transition[
                        next_tag] + obs_broadcast
                else:
                    next_tag_expr = for_expr + obs_broadcast
                alphas_t.append(max_score(next_tag_expr))
            # Updated only for scored positions, so an empty column is
            # never concatenated.
            for_expr = dy.concatenate(alphas_t)
        if output[-1] != self.o_id:
            terminal_expr = for_expr + self.transition[self.label2idx[STOP]]
        else:
            terminal_expr = for_expr
        alpha = max_score(terminal_expr)
        return alpha

    # Labeled network score
    def forward_labeled(self, id, features, tags, is_prediction):
        """Score the tag path ``tags``, leaving out flagged positions.

        Args:
            id: unused in this method.
            features: per-position label score expressions.
            tags: tag ids, aligned with ``features``.
            is_prediction: one boolean per position; flagged positions
                contribute nothing, and the transition into a position is
                dropped when the previous position is flagged.

        Returns:
            The accumulated score expression.
        """
        score = dy.scalarInput(0)
        # Both lists are shifted by one through a START entry; new lists are
        # built, so the caller's arguments are not modified.
        tags = [self.label2idx[START]] + tags
        is_prediction = [False] + is_prediction
        for i, obs in enumerate(features):
            if is_prediction[i + 1]:
                continue
            emission = dy.pick(obs, tags[i + 1])
            if not is_prediction[i]:
                score = score + dy.pick(self.transition[tags[i + 1]],
                                        tags[i]) + emission
            else:
                score = score + emission
        if not is_prediction[-1]:
            labeled_score = score + dy.pick(
                self.transition[self.label2idx[STOP]], tags[-1])
        else:
            labeled_score = score
        return labeled_score

    def negative_log_bak(self, id, x, y, x_chars=None):
        """Older loss: ``forward_unlabeled`` score minus the gold path score.

        ``forward_labeled`` requires an ``is_prediction`` mask; it is built
        here the same way ``negative_log`` builds it (true where the gold
        tag is ``"O"``). Previously the argument was missing and the call
        raised ``TypeError``.
        """
        features = self.build_graph(
            x, True) if not self.use_char_rnn else self.build_graph_with_char(
                x, x_chars, True)
        is_prediction = [tag == self.o_id for tag in y]
        unlabed_score = self.forward_unlabeled(features, y)
        labeled_score = self.forward_labeled(id, features, y, is_prediction)
        return unlabed_score - labeled_score

    def negative_log(self, id, x, y, x_chars=None):
        """Loss: score of the Viterbi path minus score of the gold path.

        Both paths are scored by ``forward_labeled`` with the same mask,
        which is true where the gold tag is ``"O"``.

        Args:
            id: forwarded to ``forward_labeled``.
            x: word ids.
            y: gold tag ids.
            x_chars: character input, used only when ``use_char_rnn`` is set.
        """
        features = self.build_graph(
            x, True) if not self.use_char_rnn else self.build_graph_with_char(
                x, x_chars, True)
        is_prediction = [tag == self.o_id for tag in y]
        best_path, _ = self.viterbi_decoding(features)
        unlabeled_score = self.forward_labeled(id, features, best_path,
                                               is_prediction)
        labeled_score = self.forward_labeled(id, features, y, is_prediction)
        return unlabeled_score - labeled_score

    def viterbi_decoding(self, features):
        """Return ``(best_path, path_score)`` for the given label scores.

        ``best_path`` is a list of tag ids (START removed) and
        ``path_score`` is the expression picked for the best terminal tag.
        """
        backpointers = []
        init_vvars = [-1e10] * self.num_labels
        init_vvars[
            self.label2idx[START]] = 0  # <Start> has all the probability
        for_expr = dy.inputVector(init_vvars)
        trans_exprs = [self.transition[idx] for idx in range(self.num_labels)]
        for obs in features:
            bptrs_t = []
            vvars_t = []
            for next_tag in range(self.num_labels):
                next_tag_expr = for_expr + trans_exprs[next_tag]
                next_tag_arr = next_tag_expr.npvalue()
                best_tag_id = np.argmax(next_tag_arr)
                bptrs_t.append(best_tag_id)
                vvars_t.append(dy.pick(next_tag_expr, best_tag_id))
            for_expr = dy.concatenate(vvars_t) + obs
            backpointers.append(bptrs_t)
        # Perform final transition to terminal
        terminal_expr = for_expr + trans_exprs[self.label2idx[STOP]]
        terminal_arr = terminal_expr.npvalue()
        best_tag_id = np.argmax(terminal_arr)
        path_score = dy.pick(terminal_expr, best_tag_id)
        # Reverse over the backpointers to get the best path
        best_path = [best_tag_id
                     ]  # Start with the tag that was best for terminal
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()  # Remove the start symbol
        best_path.reverse()
        assert start == self.label2idx[START]
        # Return best path and best path's score
        return best_path, path_score

    def decode(self, x, x_chars=None):
        """Return the Viterbi-best label strings for word ids ``x``.

        Dropout is disabled; ``x_chars`` is used only when ``use_char_rnn``
        is set.
        """
        features = self.build_graph(
            x, False) if not self.use_char_rnn else self.build_graph_with_char(
                x, x_chars, False)
        best_path, path_score = self.viterbi_decoding(features)
        best_path = [self.labels[tag_id] for tag_id in best_path]
        return best_path
def main():
    """Rank the true blend bases against negative candidates.

    For every sentence context, the text before the mask is fed to the
    forward model and the reversed text after the mask to the backward
    model. The true first/last base and each negative candidate are then
    scored with ``conditioned_loss`` and the rank of the true base is
    recorded. Per-instance results go to ``<output>.log`` and per-form
    averages to ``<output>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--char-dict', help='character list file location')
    parser.add_argument('--fwd-in', help='forward model file location')
    parser.add_argument('--bwd-in', help='backward model file location')
    parser.add_argument('--layers', type=int, default=NUM_LAYERS)
    parser.add_argument('--hid-dim', type=int, default=HIDDEN_DIM)
    parser.add_argument('--emb-dim', type=int, default=EMB_DIM)
    parser.add_argument('--anns', help='annotations')
    parser.add_argument('--contexts', help='contexts file location')
    parser.add_argument('--neg-samps', help='negative samples file location')
    parser.add_argument('--not-nltk',
                        action='store_true',
                        help='input was not tokenized using nltk')
    parser.add_argument('--output', help='output file location')
    args = parser.parse_args()

    with open(args.char_dict) as chars_f:
        char_dict = {c: i for i, c in enumerate(chars_f.read())}
    num_chars = len(char_dict)
    print(f'loaded {num_chars} characters')

    # load backup bases
    bases = {}
    with open(args.anns) as in_f:
        in_f.readline()  # header
        for l in in_f:
            # Annotation, Form, Bases, Semantic Affixes, Triv Affixes, Additional Segment Count, PAXOBS, Blend Type
            _, fullform, basess, _, _, _, _, _ = l.strip().split('\t')
            bases[fullform] = basess.split(' ')

    fmodel = CharRNN(num_chars,
                     args.emb_dim,
                     args.hid_dim,
                     n_layers=args.layers)
    fmodel.load_state_dict(torch.load(args.fwd_in))
    fmodel.eval()

    bmodel = CharRNN(num_chars,
                     args.emb_dim,
                     args.hid_dim,
                     n_layers=args.layers)
    bmodel.load_state_dict(torch.load(args.bwd_in))
    bmodel.eval()

    loss_crit = torch.nn.CrossEntropyLoss()

    sent_contexts_df = pd.read_csv(args.contexts, sep=SENT_CONTEXT_DELIM)
    # 'records' is the documented orient name; the abbreviation 'record' is
    # rejected by pandas 2.x.
    neg_samps = pd.read_csv(args.neg_samps,
                            sep=NEGSAMP_DELIM).to_dict(orient='records')

    # Group the negative samples by form once instead of scanning the whole
    # list for every sentence context; per-form order is preserved.
    negs_by_form = defaultdict(list)
    for samp in neg_samps:
        negs_by_form[samp['FORM']].append(samp)

    results = defaultdict(list)
    resultlog = []
    exceptions = 0
    for ix, row in tqdm(sent_contexts_df.iterrows()):
        neo = row['neologism']
        if neo not in bases:
            continue
        f_tru = bases[neo][0]
        b_tru = bases[neo][-1]
        instkey = (neo, f_tru, b_tru)

        # .get avoids inserting empty entries into the grouping.
        negs = negs_by_form.get(neo, [])
        if not negs:
            # Placeholder row for forms without negative candidates.
            instres = (10000, 10000, 0, 0, 'CHECK', 'CHECK')
            results[instkey].append(instres)
            continue

        sent = '\\n' + row['sentence_context'] + '\\n'
        if not args.not_nltk:
            sent = nltk_clean(sent)
        tnes = rev(sent)

        # find the location for each bases's start
        start_loc = sent.find(MASK_TEXT)
        end_loc = tnes.find(rev(MASK_TEXT))
        sent_chars = [enc_c(c, char_dict) for c in sent[:start_loc]]
        tnes_chars = [enc_c(c, char_dict) for c in tnes[:end_loc]]

        # run each model on the input
        fwd_outs, f_hids = fmodel(torch.tensor(sent_chars).view(1, -1))
        f_out_last = fwd_outs[:, -1, :].view(
            1, 1, num_chars)  # needed for predicting first candidate char
        f_hid_last = f_hids[:, -1, :].view(args.layers, 1, args.hid_dim)

        bwd_outs, b_hids = bmodel(torch.tensor(tnes_chars).view(1, -1))
        b_out_last = bwd_outs[:, -1, :].view(1, 1, num_chars)
        b_hid_last = b_hids[:, -1, :].view(args.layers, 1, args.hid_dim)

        # evaluate loss on each candidate
        fcand_losses = {}
        bcand_losses = {}

        ftrg = [enc_c(c, char_dict) for c in f_tru]
        fcand_losses[f_tru] = conditioned_loss(ftrg, fmodel, loss_crit,
                                               f_hid_last, f_out_last)
        # The backward model reads reversed text, so targets are reversed.
        btrg = [enc_c(c, char_dict) for c in rev(b_tru)]
        bcand_losses[b_tru] = conditioned_loss(btrg, bmodel, loss_crit,
                                               b_hid_last, b_out_last)

        for n in negs:
            w = n['NEGATIVE']
            if n['PLACE'] == "PRE":
                if w not in fcand_losses:  # should always be the case
                    trg = [enc_c(c, char_dict) for c in w]
                    fcand_losses[w] = conditioned_loss(
                        trg, fmodel, loss_crit, f_hid_last, f_out_last)
            elif n['PLACE'] == "SUF":
                if w not in bcand_losses:  # should always be the case
                    trg = [enc_c(c, char_dict) for c in rev(w)]
                    bcand_losses[w] = conditioned_loss(
                        trg, bmodel, loss_crit, b_hid_last, b_out_last)
            else:
                raise Exception(f'unknown location value: {n["PLACE"]}')

        # complete from bases
        if f_tru not in fcand_losses:
            fcand_losses[f_tru] = 0.0
        if b_tru not in bcand_losses:
            bcand_losses[b_tru] = 0.0

        # rank: 1-based position of the true base among the sorted losses
        ftnll = fcand_losses[f_tru]
        btnll = bcand_losses[b_tru]
        fnlls = sorted(fcand_losses.values())
        bnlls = sorted(bcand_losses.values())
        frank = fnlls.index(ftnll) + 1
        brank = bnlls.index(btnll) + 1

        instres = (frank, brank, fnlls[0], bnlls[0], len(fnlls), len(bnlls))
        instlog = (f'{ftnll:.3f}', f'{btnll:.3f}', str(frank), str(brank),
                   f'{fnlls[0]:.3f}', f'{bnlls[0]:.3f}', str(len(fnlls)),
                   str(len(bnlls)))
        results[instkey].append(instres)
        resultlog.append(instkey + instlog)

    # Forms that never produced a result still get a placeholder row.
    for b, bs in bases.items():
        k = (b, bs[0], bs[-1])
        if k not in results:
            instres = (10000, 10000, 0, 0, 'CHECK', 'CHECK')
            results[k].append(instres)

    with open(args.output + '.log', 'w') as outf:
        outf.write(
            'form\tpref\tsuf\tpref nll\tsuf nll\tpref rank\tsuf rank\tpref min\tsuf min\t#prefs\t#sufs\n'
        )
        for res in resultlog:
            outf.write('\t'.join(res) + '\n')

    with open(args.output, 'w') as outf:
        outf.write(
            'Form\tPref\tSuf\tNULL\tBoth rank\tPref rank\tSuf rank\tpref max\tsuf max\t#prefs\t#sufs\n'
        )
        for k, resl in sorted(results.items()):
            mean_frank = np.average([r[0] for r in resl])
            mean_brank = np.average([r[1] for r in resl])
            both_rank = mean_frank * mean_brank
            mean_fmax = np.average([r[2] for r in resl])
            mean_bmax = np.average([r[3] for r in resl])
            assert len(set([r[4] for r in resl
                            ])) == 1, f'uneven prefix candidates in {k}'
            assert len(set([r[5] for r in resl
                            ])) == 1, f'uneven suffix candidates in {k}'
            outf.write(
                '\t'.join(k) +
                f'\t\t{both_rank:.1f}\t{mean_frank:.1f}\t{mean_brank:.1f}\t{mean_fmax:.3f}\t{mean_bmax:.3f}\t{resl[0][-2]}\t{resl[0][-1]}\n'
            )

    print(
        f'finished with {exceptions} unfound true values. reporting {len(resultlog)} results from {len(results)} blends.'
    )
SYMBOL_TABLE = os.path.join('../saved_model', 'vocab.sym') if args.type and os.path.exists(SYMBOL_TABLE): all_characters = list(set(open(SYMBOL_TABLE).read())) else: file = open(args.filename).read() print('Loaded file', args.filename) print('File length', len(file)/80, 'lines') all_characters = list(set(file)) with open(SYMBOL_TABLE, 'w') as vocab: print("".join(all_characters), file=vocab) n_characters = len(all_characters) decoder = CharRNN(n_characters, args.hidden_size, n_characters, n_layers=args.n_layers) decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args.learning_rate) criterion = nn.CrossEntropyLoss() if args.type: # Enter typing mode print ('Typing Mode...') decoder = torch.load('../saved_model/linux.pt') from typing import build_getch with build_getch() as getch: try: getchar = getch()
# Report how many labels each split holds.
print("\ntrain/dev/test size: {:d}/{:d}/{:d}\n".format(len(train_y), len(dev_y), len(test_y)))

with tf.Graph().as_default():
    # Session options come straight from the command-line flags.
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Instantiate our model
        # NOTE(review): the positional literal 2 is an opaque constructor
        # argument (possibly the number of classes) -- confirm in CharRNN.
        rnn = CharRNN(
            vocabulary_size,
            FLAGS.sentence_length,
            FLAGS.batch_size,
            2,
            embedding_size=FLAGS.embedding_dim,
            hidden_dim=FLAGS.hidden_dim,
            num_layers=FLAGS.num_layers,
            loss=FLAGS.loss_type)

        # Generate input batches (using tensorflow)
        with tf.variable_scope("input"):
            # Placeholders shaped like the full training arrays; each one is
            # used as the initial value of a non-trainable variable that
            # belongs to no collection.
            placeholder_x = tf.placeholder(tf.int32, train_x.shape)
            placeholder_y = tf.placeholder(tf.float32, train_y.shape)
            train_x_var = tf.Variable(placeholder_x, trainable=False, collections=[])
            train_y_var = tf.Variable(placeholder_y, trainable=False, collections=[])
            # Slice single (x, y) examples for FLAGS.num_epochs epochs, then
            # group them into batches of FLAGS.batch_size.
            x_slice, y_slice = tf.train.slice_input_producer([train_x_var, train_y_var], num_epochs=FLAGS.num_epochs)
            x_batch, y_batch = tf.train.batch([x_slice, y_slice], batch_size=FLAGS.batch_size)

        # Define Training procedure
class BiLSTM_CRF:
    """BiLSTM tagger with a CRF-style transition table.

    Trained with the difference between the log-sum-exp score over all
    paths and the score of the gold path; decoded with Viterbi.
    """

    def __init__(self, config, model):
        """Create the tagger's parameters inside ``model``.

        Args:
            config: settings object; reads ``embedding_dim``,
                ``use_char_rnn``, ``charlstm_hidden_dim`` (only when the
                character RNN is enabled), ``hidden_dim``, ``label2idx``,
                ``idx2labels``, ``word2idx``, ``word_embedding`` and
                ``dropout``.
            model: parameter collection the parameters are added to.
        """
        self.num_layers = 1
        self.input_dim = config.embedding_dim
        self.model = model
        self.use_char_rnn = config.use_char_rnn
        self.char_rnn = CharRNN(config, model) if self.use_char_rnn else None
        input_size = self.input_dim if not self.char_rnn else self.input_dim + config.charlstm_hidden_dim
        self.bilstm = dy.BiRNNBuilder(1, input_size, config.hidden_dim,
                                      self.model, dy.LSTMBuilder)
        print("Input to word-level BiLSTM size: %d" % (input_size))
        print("BiLSTM hidden size: %d" % (config.hidden_dim))
        # self.bilstm.set_dropout(config.dropout_bilstm)
        self.num_labels = len(config.label2idx)
        self.label2idx = config.label2idx
        self.labels = config.idx2labels

        self.linear_w = self.model.add_parameters(
            (self.num_labels, config.hidden_dim))
        self.linear_bias = self.model.add_parameters((self.num_labels, ))

        self.transition = self.model.add_lookup_parameters(
            (self.num_labels, self.num_labels))
        vocab_size = len(config.word2idx)
        self.word2idx = config.word2idx
        print("Word Embedding size: %d x %d" % (vocab_size, self.input_dim))
        self.word_embedding = self.model.add_lookup_parameters(
            (vocab_size, self.input_dim), init=config.word_embedding)
        self.dropout = config.dropout

    def save_shared_parameters(self):
        """Save the encoder components with ``dy.save`` under "basename".

        The character-RNN components are included only when a character
        RNN exists. Previously they were dereferenced unconditionally,
        which raised ``AttributeError`` when ``use_char_rnn`` was false.
        """
        print("Saving the encoder parameter")
        shared = []
        if self.char_rnn is not None:
            shared.extend([
                self.char_rnn.char_emb, self.char_rnn.fw_lstm,
                self.char_rnn.bw_lstm
            ])
        shared.extend([self.word_embedding, self.bilstm])
        dy.save("basename", shared)

    def build_graph_with_char(self, x, all_chars, is_train):
        """Return per-position label scores using word + character features.

        Args:
            x: word ids, one per position.
            all_chars: per-word character input passed to
                ``self.char_rnn.forward_char``.
            is_train: when true, dropout is applied to each concatenated
                input vector.
        """
        embeddings = []
        for w, chars in zip(x, all_chars):
            word_emb = self.word_embedding[w]
            f, b = self.char_rnn.forward_char(chars)
            concat = dy.concatenate([word_emb, f, b])
            embeddings.append(
                dy.dropout(concat, self.dropout) if is_train else concat)
        lstm_out = self.bilstm.transduce(embeddings)
        features = [
            dy.affine_transform([self.linear_bias, self.linear_w, rep])
            for rep in lstm_out
        ]
        return features

    def build_graph(self, x, is_train):
        """Return per-position label scores using word embeddings only.

        Dropout is applied to the embeddings when ``is_train`` is true.
        """
        # dy.renew_cg()
        if is_train:
            embeddings = [
                dy.dropout(self.word_embedding[w], self.dropout) for w in x
            ]
        else:
            embeddings = [self.word_embedding[w] for w in x]
        lstm_out = self.bilstm.transduce(embeddings)
        features = [
            dy.affine_transform([self.linear_bias, self.linear_w, rep])
            for rep in lstm_out
        ]
        return features

    def forward_unlabeled(self, features):
        """Return the log-sum-exp score over all tag paths.

        Starts with all mass on START, combines transition and emission
        scores position by position, and ends with the transition to STOP.
        """
        init_alphas = [-1e10] * self.num_labels
        init_alphas[self.label2idx[START]] = 0
        for_expr = dy.inputVector(init_alphas)
        for obs in features:
            alphas_t = []
            for next_tag in range(self.num_labels):
                # The emission score of next_tag is repeated so it can be
                # added to every previous-tag entry.
                obs_broadcast = dy.concatenate([dy.pick(obs, next_tag)] *
                                               self.num_labels)
                next_tag_expr = for_expr + self.transition[
                    next_tag] + obs_broadcast
                alphas_t.append(log_sum_exp(next_tag_expr, self.num_labels))
            for_expr = dy.concatenate(alphas_t)
        terminal_expr = for_expr + self.transition[self.label2idx[STOP]]
        alpha = log_sum_exp(terminal_expr, self.num_labels)
        return alpha

    # Labeled network score
    def forward_labeled(self, features, tags):
        """Return the score of the gold path.

        Args:
            features: per-position label score expressions.
            tags: gold labels as strings (mapped through ``label2idx``).
        """
        score = dy.scalarInput(0)
        tags = [self.label2idx[w] for w in tags]
        tags = [self.label2idx[START]] + tags
        for i, obs in enumerate(features):
            score = score + dy.pick(self.transition[tags[i + 1]],
                                    tags[i]) + dy.pick(obs, tags[i + 1])
        labeled_score = score + dy.pick(self.transition[self.label2idx[STOP]],
                                        tags[-1])
        return labeled_score

    def negative_log(self, x, y, x_chars=None):
        """Return the all-paths score minus the gold-path score.

        Args:
            x: word ids.
            y: gold labels as strings.
            x_chars: character input, used only when ``use_char_rnn`` is set.
        """
        features = self.build_graph(
            x, True) if not self.use_char_rnn else self.build_graph_with_char(
                x, x_chars, True)
        unlabed_score = self.forward_unlabeled(features)
        labeled_score = self.forward_labeled(features, y)
        return unlabed_score - labeled_score

    def viterbi_decoding(self, features):
        """Return ``(best_path, path_score)`` for the given label scores.

        ``best_path`` is a list of tag ids (START removed) and
        ``path_score`` is the expression picked for the best terminal tag.
        """
        backpointers = []
        init_vvars = [-1e10] * self.num_labels
        init_vvars[
            self.label2idx[START]] = 0  # <Start> has all the probability
        for_expr = dy.inputVector(init_vvars)
        trans_exprs = [self.transition[idx] for idx in range(self.num_labels)]
        for obs in features:
            bptrs_t = []
            vvars_t = []
            for next_tag in range(self.num_labels):
                next_tag_expr = for_expr + trans_exprs[next_tag]
                next_tag_arr = next_tag_expr.npvalue()
                best_tag_id = np.argmax(next_tag_arr)
                bptrs_t.append(best_tag_id)
                vvars_t.append(dy.pick(next_tag_expr, best_tag_id))
            for_expr = dy.concatenate(vvars_t) + obs
            backpointers.append(bptrs_t)
        # Perform final transition to terminal
        terminal_expr = for_expr + trans_exprs[self.label2idx[STOP]]
        terminal_arr = terminal_expr.npvalue()
        best_tag_id = np.argmax(terminal_arr)
        path_score = dy.pick(terminal_expr, best_tag_id)
        # Reverse over the backpointers to get the best path
        best_path = [best_tag_id
                     ]  # Start with the tag that was best for terminal
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()  # Remove the start symbol
        best_path.reverse()
        assert start == self.label2idx[START]
        # Return best path and best path's score
        return best_path, path_score

    def decode(self, x, x_chars=None):
        """Return the Viterbi-best label strings for word ids ``x``.

        Dropout is disabled; ``x_chars`` is used only when ``use_char_rnn``
        is set.
        """
        features = self.build_graph(
            x, False) if not self.use_char_rnn else self.build_graph_with_char(
                x, x_chars, False)
        best_path, path_score = self.viterbi_decoding(features)
        best_path = [self.labels[tag_id] for tag_id in best_path]
        return best_path
# Training script: fits a CharRNN on the encoded text from
# prepare_hamlet_text and writes checkpoints to ./model-100/.
from prepare_hamlet_text import text_ints, chars
from char_rnn_utiles import reshape_data
from char_rnn import CharRNN

batch_size = 64
num_steps = 100

# reshape_data turns the encoded text into training inputs and targets.
# NOTE(review): presumably arranged as batch_size x num_steps windows --
# confirm in char_rnn_utiles.
train_x, train_y = reshape_data(text_ints, batch_size, num_steps)

# One output class per entry of ``chars``.
rnn = CharRNN(num_classes=len(chars), batch_size=batch_size)
rnn.train(train_x, train_y, num_epochs=100, ckpt_dir='./model-100/')
class Soft_BiLSTM_CRF:
    """BiLSTM-CRF tagger whose labeled score is computed from per-position marginals.

    Word embeddings (optionally concatenated with the two vectors returned by
    the character encoder) go through a BiLSTM and an affine layer to produce
    one score vector per token.  ``self.transition[next_tag]`` holds the
    scores of moving from every previous label into ``next_tag``.
    """

    def __init__(self, config, model):
        """Create every parameter on ``model`` using the settings in ``config``."""
        self.num_layers = 1
        self.input_dim = config.embedding_dim
        self.model = model

        # Optional character-level encoder.
        self.use_char_rnn = config.use_char_rnn
        if self.use_char_rnn:
            self.char_rnn = CharRNN(config, model)
        else:
            self.char_rnn = None

        # The BiLSTM input grows by the char-LSTM size when the encoder is on.
        if self.char_rnn:
            input_size = self.input_dim + config.charlstm_hidden_dim
        else:
            input_size = self.input_dim
        self.bilstm = dy.BiRNNBuilder(1, input_size, config.hidden_dim,
                                      self.model, dy.LSTMBuilder)
        print("Input to word-level BiLSTM size: %d" % (input_size))
        print("BiLSTM hidden size: %d" % (config.hidden_dim))

        # Label inventory.
        self.num_labels = len(config.label2idx)
        self.label2idx = config.label2idx
        self.labels = config.idx2labels

        # Projection from BiLSTM output to per-label scores.
        self.linear_w = self.model.add_parameters(
            (self.num_labels, config.hidden_dim))
        self.linear_bias = self.model.add_parameters((self.num_labels, ))

        # Transition table indexed as [next, previous]: nothing may enter
        # START and nothing may leave STOP.
        n_tags = self.num_labels
        trans_np = np.random.rand(n_tags, n_tags)
        trans_np[self.label2idx[START], :] = -1e10
        trans_np[:, self.label2idx[STOP]] = -1e10
        self.init_iobes_constraint(trans_np)
        self.transition = self.model.add_lookup_parameters(
            (n_tags, n_tags), init=trans_np)

        # Word embedding table initialised from the configured matrix.
        vocab_size = len(config.word2idx)
        self.word2idx = config.word2idx
        print("Word Embedding size: %d x %d" % (vocab_size, self.input_dim))
        self.word_embedding = self.model.add_lookup_parameters(
            (vocab_size, self.input_dim), init=config.word_embedding)

        self.dropout = config.dropout

    def init_iobes_constraint(self, trans_np):
        """Block label pairs rejected by ``check_bies_constraint``.

        ``trans_np`` is modified in place: entry ``[next, previous]`` becomes
        ``-1e10`` for each rejected (previous, next) pair.  START and STOP are
        skipped on both sides.
        """
        boundary = (self.label2idx[START], self.label2idx[STOP])
        regular = [idx for idx in range(self.num_labels)
                   if idx not in boundary]
        for prev_id in regular:
            for next_id in regular:
                if check_bies_constraint(self.labels[prev_id],
                                         self.labels[next_id]):
                    continue
                trans_np[next_id, prev_id] = -1e10

    def _encode(self, embeddings):
        """Run the BiLSTM over ``embeddings`` and project each output to label scores."""
        hidden = self.bilstm.transduce(embeddings)
        return [
            dy.affine_transform([self.linear_bias, self.linear_w, state])
            for state in hidden
        ]

    def _features(self, x, x_chars, is_train):
        """Pick the graph builder that matches ``self.use_char_rnn``."""
        if self.use_char_rnn:
            return self.build_graph_with_char(x, x_chars, is_train)
        return self.build_graph(x, is_train)

    def _log_one_hot(self, label):
        """Input vector that is 0 at ``label`` and -1e10 everywhere else."""
        scores = [-1e10] * self.num_labels
        scores[self.label2idx[label]] = 0
        return dy.inputVector(scores)

    def _broadcast(self, obs, tag):
        """Repeat the score of ``tag`` in ``obs`` ``num_labels`` times."""
        return dy.concatenate([dy.pick(obs, tag)] * self.num_labels)

    def _finish_viterbi(self, prev_scores, rows, history):
        """Take the final hop into STOP and follow the back-pointers.

        Returns ``(path, path_score)``; ``path`` excludes the START symbol.
        """
        final = prev_scores + rows[self.label2idx[STOP]]
        last_tag = np.argmax(final.npvalue())
        path_score = dy.pick(final, last_tag)

        path = [last_tag]
        for step_ptrs in reversed(history):
            last_tag = step_ptrs[last_tag]
            path.append(last_tag)
        start = path.pop()  # the walk ends on the START symbol; drop it
        path.reverse()
        assert start == self.label2idx[START]
        return path, path_score

    def build_graph_with_char(self, x, all_chars, is_train):
        """Score each token from its word embedding plus character vectors.

        Dropout with rate ``self.dropout`` is applied to the concatenated
        input only when ``is_train`` is truthy.
        """
        embeddings = []
        for word, chars in zip(x, all_chars):
            word_emb = self.word_embedding[word]
            fwd, bwd = self.char_rnn.forward_char(chars)
            joined = dy.concatenate([word_emb, fwd, bwd])
            if is_train:
                joined = dy.dropout(joined, self.dropout)
            embeddings.append(joined)
        return self._encode(embeddings)

    def build_graph(self, x, is_train):
        """Score each token from its word embedding alone.

        Dropout with rate ``self.dropout`` is applied to each embedding only
        when ``is_train`` is truthy.
        """
        embeddings = []
        for word in x:
            vec = self.word_embedding[word]
            if is_train:
                vec = dy.dropout(vec, self.dropout)
            embeddings.append(vec)
        return self._encode(embeddings)

    def forward_unlabeled(self, features):
        """Forward recursion over all label sequences; returns the final score.

        Uses the external ``log_sum_exp`` helper at every step (presumably a
        log-sum-exp over the vector — confirm against its definition).
        """
        n_tags = self.num_labels
        alphas = self._log_one_hot(START)
        for obs in features:
            step = []
            for next_tag in range(n_tags):
                emit = self._broadcast(obs, next_tag)
                arriving = alphas + self.transition[next_tag] + emit
                step.append(log_sum_exp(arriving, n_tags))
            alphas = dy.concatenate(step)
        closing = alphas + self.transition[self.label2idx[STOP]]
        return log_sum_exp(closing, n_tags)

    def forward_labeled(self, id, features, marginals):
        """Forward recursion with ``marginals[pos]`` added at every position.

        ``id`` is accepted but not used by this method.
        """
        n_tags = self.num_labels
        alphas = self._log_one_hot(START)
        marginal = dy.inputTensor(marginals)
        for pos, obs in enumerate(features):
            step = []
            for next_tag in range(n_tags):
                emit = self._broadcast(obs, next_tag)
                arriving = alphas + self.transition[next_tag] + emit
                step.append(log_sum_exp(arriving, n_tags))
            alphas = dy.concatenate(step) + marginal[pos]
        closing = alphas + self.transition[self.label2idx[STOP]]
        return log_sum_exp(closing, n_tags)

    def negative_log(self, id, x, y, x_chars=None, marginals=None):
        """Training loss: unlabeled forward score minus labeled forward score.

        ``y`` is accepted but not used here; the labeled score is driven by
        ``marginals``.
        """
        features = self._features(x, x_chars, True)
        unlabeled = self.forward_unlabeled(features)
        labeled = self.forward_labeled(id, features, marginals)
        return unlabeled - labeled

    def viterbi_decoding(self, features):
        """Unconstrained Viterbi search.

        Returns ``(best_path, path_score)`` where ``best_path`` is a list of
        label indices and ``path_score`` is the expression for its score.
        """
        prev_scores = self._log_one_hot(START)
        rows = [self.transition[tag] for tag in range(self.num_labels)]

        history = []
        for obs in features:
            step_ptrs = []
            step_scores = []
            for row in rows:
                candidate = prev_scores + row
                winner = np.argmax(candidate.npvalue())
                step_ptrs.append(winner)
                step_scores.append(dy.pick(candidate, winner))
            prev_scores = dy.concatenate(step_scores) + obs
            history.append(step_ptrs)

        return self._finish_viterbi(prev_scores, rows, history)

    def constrained_viterbi_decoding(self, features, tags, is_prediction):
        """Viterbi search with some positions pinned to a given label.

        Where ``is_prediction[pos]`` is falsy, every label other than
        ``tags[pos]`` gets a -1e10 mask in place of its transition row.
        Returns ``(best_path, path_score)`` with label indices.
        """
        n_tags = self.num_labels
        prev_scores = self._log_one_hot(START)
        rows = [self.transition[tag] for tag in range(n_tags)]

        history = []
        for pos, obs in enumerate(features):
            step_ptrs = []
            step_scores = []
            pinned = not is_prediction[pos]
            block = dy.inputVector([-1e10] * n_tags) if pinned else None
            for next_tag in range(n_tags):
                if not pinned or next_tag == tags[pos]:
                    candidate = prev_scores + rows[next_tag]
                else:
                    candidate = prev_scores + block
                winner = np.argmax(candidate.npvalue())
                step_ptrs.append(winner)
                step_scores.append(dy.pick(candidate, winner))
            prev_scores = dy.concatenate(step_scores) + obs
            history.append(step_ptrs)

        return self._finish_viterbi(prev_scores, rows, history)

    def decode(self,
               x,
               x_chars=None,
               is_constrained=False,
               y=None,
               is_prediction=None):
        """Decode ``x`` with plain or constrained Viterbi.

        The unconstrained path is returned as entries of ``self.labels``; the
        constrained path is returned as raw label indices.
        """
        features = self._features(x, x_chars, False)
        if is_constrained:
            best_path, _ = self.constrained_viterbi_decoding(
                features, y, is_prediction)
            return best_path
        best_path, _ = self.viterbi_decoding(features)
        return [self.labels[tag] for tag in best_path]

    def max_marginal_decode(self, x, x_chars=None, y=None,
                            is_prediction=None):
        """Forward-backward pass that returns one marginal vector per position.

        Positions where ``is_prediction[pos]`` is falsy are pinned to
        ``y[pos]`` by adding -1e10 to every other label.  Each returned entry
        is the ``.value()`` of ``alpha + beta - final_alpha - features[pos]``.
        """
        n_tags = self.num_labels
        features = self._features(x, x_chars, False)

        # Forward pass.
        alphas = self._log_one_hot(START)
        all_alphas = []
        for pos, obs in enumerate(features):
            step = []
            for next_tag in range(n_tags):
                emit = self._broadcast(obs, next_tag)
                arriving = alphas + self.transition[next_tag] + emit
                if (not is_prediction[pos]) and next_tag != y[pos]:
                    arriving = arriving + dy.inputVector([-1e10] * n_tags)
                step.append(log_sum_exp(arriving, n_tags))
            alphas = dy.concatenate(step)
            all_alphas.append(alphas)
        final_alpha = log_sum_exp(
            alphas + self.transition[self.label2idx[STOP]], n_tags)
        final_alpha.forward()

        # Backward pass.
        # NOTE(review): the double transpose is preserved as written;
        # presumably it turns the lookup table into a matrix expression in the
        # orientation the backward recursion needs — confirm against DyNet.
        previous_trans = dy.transpose(dy.transpose(self.transition))
        betas = self._log_one_hot(STOP)
        all_betas = []
        for rev_pos, obs in enumerate(features[::-1]):
            tail_idx = -rev_pos - 1  # same position, counted from the end
            step = []
            for previous_tag in range(n_tags):
                emit = self._broadcast(obs, previous_tag)
                leaving = betas + previous_trans[previous_tag] + emit
                if (not is_prediction[tail_idx]
                    ) and previous_tag != y[tail_idx]:
                    leaving = leaving + dy.inputVector([-1e10] * n_tags)
                step.append(log_sum_exp(leaving, n_tags))
            betas = dy.concatenate(step)
            all_betas.append(betas)
        final_beta = log_sum_exp(
            betas + previous_trans[self.label2idx[START]], n_tags)
        final_beta.forward()

        # Both passes add the position's own scores, so subtract them once.
        marginals = []
        for alpha_t, beta_t, obs in zip(all_alphas, reversed(all_betas),
                                        features):
            marginals.append((alpha_t + beta_t - final_alpha - obs).value())
        return marginals