import time
import numpy as np
import theano
import theano.tensor as T

# `Parameters`, `updates` and `model` come from the rest of the repo;
# the theano_toolkit import paths are an assumption based on the
# helpers used below.
from theano_toolkit.parameters import Parameters
from theano_toolkit import updates

import model


def make_functions(input_size, output_size, mem_size, mem_width,
                   hidden_sizes=[100]):
    start_time = time.time()

    # Batched sequences laid out as (batch, time, channels), int8.
    input_seqs = T.btensor3('input_sequences')
    output_seqs = T.btensor3('output_sequences')

    P = Parameters()
    process = model.build(P, input_size, output_size,
                          mem_size, mem_width, hidden_sizes[0])
    outputs = process(T.cast(input_seqs, 'float32'))

    # Copy task: the input is the pattern plus delimiter steps, so the
    # target occupies the last (time - 2) // 2 steps. The last two
    # channels are delimiter bits and are excluded from the loss.
    output_length = (input_seqs.shape[1] - 2) // 2
    Y = output_seqs[:, -output_length:, :-2]
    Y_hat = T.nnet.sigmoid(outputs[:, -output_length:, :-2])
    cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat, Y))
    # Same loss rescaled to bits per sequence, for reporting.
    bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2)

    params = P.values()
    cost = cross_entropy  # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params)

    print "Computing gradients",
    grads = T.grad(cost, wrt=params)
    # `clip_length` is the gradient-clipping threshold, defined
    # elsewhere in the original file.
    grads = updates.clip_deltas(grads, np.float32(clip_length))
    print "Done. (%0.3f s)" % (time.time() - start_time)

    start_time = time.time()
    print "Compiling function",
    P_learn = Parameters()
    update_pairs = updates.rmsprop(params, grads,
                                   learning_rate=1e-4, P=P_learn)
    train = theano.function(
        inputs=[input_seqs, output_seqs],
        outputs=cross_entropy,
        updates=update_pairs,
    )
    test = theano.function(
        inputs=[input_seqs, output_seqs],
        outputs=bits_loss,
    )
    print "Done. (%0.3f s)" % (time.time() - start_time)
    print P.parameter_count()
    return P, P_learn, train, test
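# --------------------------------------------------------------------
# Usage sketch (not in the original file): driving the compiled
# functions on randomly generated copy-task batches. The batch layout
# (a start marker, the bit pattern, a recall marker, then blank steps)
# and the NTM sizes below are assumptions inferred from the slicing
# conventions in make_functions.

def copy_batch(batch_size, seq_len, width):
    total_len = 2 * seq_len + 2  # matches output_length = (T - 2) // 2
    inp = np.zeros((batch_size, total_len, width + 2), dtype=np.int8)
    out = np.zeros((batch_size, total_len, width + 2), dtype=np.int8)
    bits = np.random.randint(0, 2, size=(batch_size, seq_len, width))
    inp[:, 0, width] = 1                   # start-of-input marker
    inp[:, 1:seq_len + 1, :width] = bits   # present the pattern
    inp[:, seq_len + 1, width + 1] = 1     # recall marker
    out[:, -seq_len:, :width] = bits       # target: reproduce the pattern
    return inp, out


if __name__ == '__main__':
    P, P_learn, train, test = make_functions(
        input_size=10, output_size=10, mem_size=128, mem_width=20)
    for step in xrange(10000):
        inp, out = copy_batch(16, np.random.randint(1, 21), 8)
        score = train(inp, out)
        if step % 100 == 0:
            print step, score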
# From the bAbI attention-model training script (a separate file from
# make_functions above); `P`, `story`, `idxs`, `qstn`, `ans_lbl`,
# `ans_evds`, `vocab_size` and `evidence_count` are defined earlier in
# that file.
attention = model.build(
    P,
    word_rep_size=128,
    stmt_hidden_size=128,
    diag_hidden_size=128,
    vocab_size=vocab_size,
    output_size=vocab_size,
    map_fun_size=128,
    evidence_count=evidence_count,
)
output_evds, output_ans = attention(story, idxs, qstn)

# Negative log-likelihood of the answer token and of both supporting
# evidence statements.
cross_entropy = -T.log(output_ans[ans_lbl]) \
              + -T.log(output_evds[0][ans_evds[0]]) \
              + -T.log(output_evds[1][ans_evds[1]])
# cost += -T.log(ordered_probs(output_evds, ans_evds))

print "Done."
print "Parameter count:", P.parameter_count()

print "Calculating gradient expression...",
params = P.values()
cost = cross_entropy
grads = T.grad(cost, wrt=params)
print "Done."

inputs = [story, idxs, qstn, ans_lbl, ans_evds]
outputs = cross_entropy
# Serialise the compute graph (pickle protocol 2) so the native
# compilation step can pick it up.
pickle.dump(
    (inputs, outputs, params, grads),
    open("compute_tree.pkl", "wb"), 2
)

print "Compiling native...",
lr = T.fscalar('lr')
acc, update = make_functions(inputs, outputs, params, grads, lr)
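# --------------------------------------------------------------------
# Hypothetical sketch (not from the original source) of what a helper
# with the signature make_functions(inputs, outputs, params, grads, lr)
# could look like: `acc` accumulates gradients over several examples
# into shared storage and `update` applies one averaged SGD step. The
# actual (natively compiled) helper may differ.

def make_functions_sketch(inputs, outputs, params, grads, lr):
    acc_grads = [
        theano.shared(np.zeros(p.get_value().shape,
                               dtype=theano.config.floatX))
        for p in params
    ]
    count = theano.shared(np.float32(0))
    # Evaluate one example's cost and fold its gradients into the
    # accumulators.
    acc = theano.function(
        inputs=inputs,
        outputs=outputs,
        updates=[(a, a + g) for a, g in zip(acc_grads, grads)]
                + [(count, count + 1)],
    )
    # Apply the averaged accumulated gradient, then reset the buffers.
    update = theano.function(
        inputs=[lr],
        outputs=[],
        updates=[(p, p - lr * a / count)
                 for p, a in zip(params, acc_grads)]
                + [(a, T.zeros_like(a)) for a in acc_grads]
                + [(count, T.zeros_like(count))],
    )
    return acc, update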