def step(inp_t, inpgate_t, h1_tm1, h2_tm1, h3_tm1):
    # 3 stacked GRU layers; each layer's output is forked into the input
    # and gate projections for the layer above
    h1 = GRU(inp_t, inpgate_t, h1_tm1, h1_dim, h1_dim, random_state)
    h1_t, h1gate_t = GRUFork([h1], [h1_dim], h2_dim, random_state)
    h2 = GRU(h1_t, h1gate_t, h2_tm1, h2_dim, h2_dim, random_state)
    h2_t, h2gate_t = GRUFork([h2], [h2_dim], h3_dim, random_state)
    h3 = GRU(h2_t, h2gate_t, h3_tm1, h3_dim, h3_dim, random_state)
    return h1, h2, h3

h1, h2, h3 = scan(step, [inp_proj, inpgate_proj], [init_h1, init_h2, init_h3])
final_h1, final_h2, final_h3 = [ni(h1, -1), ni(h2, -1), ni(h3, -1)]

pred = Linear([h3], [h3_dim], out_dim, random_state)
cost = tf.reduce_mean(categorical_crossentropy(softmax(pred), target))
# cost in bits
# cost = cost * 1.44269504089

params = tf.trainable_variables()
print_network(params)
grads = tf.gradients(cost, params)
grads = [tf.clip_by_value(grad, -grad_clip, grad_clip) for grad in grads]
opt = tf.train.AdamOptimizer(learning_rate)
updates = opt.apply_gradients(zip(grads, params))


def _loop(itr, sess, inits=None, do_updates=True):
    if inits is None:
        i_h1 = np.zeros((batch_size, h1_dim)).astype("float32")
        i_h2 = np.zeros((batch_size, h2_dim)).astype("float32")
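The commented-out rescaling above is presumably the standard nats-to-bits conversion: the cross entropy comes out in natural-log units, and multiplying by log2(e) reports the same quantity in bits. A minimal sketch of where the constant comes from (standard library only):

import math

# log2(e) == 1 / ln(2); multiplying a nat-valued cross entropy by this
# constant expresses the same cost in bits.
nats_to_bits = 1.0 / math.log(2.0)
print(nats_to_bits)  # 1.4426950408889634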
              outgate_t + outctxgate_proj, h1_tm1,
              dec_h1_dim, dec_h1_dim, random_state,
              mask=outmask_t)
    return dec_h1

dec_h1 = scan(dec_step, [out_proj, outgate_proj, target_mask], [init_dec_h1])

# Add decode context with shape/broadcast games
ctx = broadcast(final_enc_h1, dec_h1)
pred = Linear([dec_h1, ctx], [dec_h1_dim, enc_h1_dim], out_dim, random_state)
full_cost = categorical_crossentropy(softmax(pred), target)
# mask out padded timesteps before averaging
cost = tf.reduce_mean(target_mask * full_cost)
# cost in bits
# cost = cost * 1.44269504089

params = tf.trainable_variables()
print_network(params)
grads = tf.gradients(cost, params)
grad_clip = 5.0
grads = [tf.clip_by_value(grad, -grad_clip, grad_clip) for grad in grads]
opt = tf.train.AdamOptimizer(learning_rate)
updates = opt.apply_gradients(zip(grads, params))


def _loop(X_mb, X_mb_mask, y_mb, y_mb_mask, do_updates=True):
    i_enc_h1 = np.zeros((batch_size, enc_h1_dim)).astype("float32")
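A side note on the masked cost above: tf.reduce_mean(target_mask * full_cost) averages over every position of the padded tensor, masked steps included. If a per-valid-timestep average were wanted instead, a hypothetical variant (reusing full_cost and target_mask from above) would divide the masked sum by the mask total:

# Hypothetical variant, not the code above: average only over timesteps
# where target_mask is nonzero.
masked_sum = tf.reduce_sum(target_mask * full_cost)
cost_per_step = masked_sum / tf.maximum(tf.reduce_sum(target_mask), 1.0)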
target_note_embed = Multiembedding(note_target, n_note_symbols, note_embed_dim, random_state)
# Automask-ed views of the target embedding, so the prediction for note i
# does not see its own target
target_note_masked = Automask(target_note_embed, n_notes)

costs = []
note_preds = []
duration_preds = []
for i in range(n_notes):
    note_pred = Linear([h1, h2, target_note_masked[i]],
                       [h_dim, h_dim, n_notes * note_embed_dim],
                       note_out_dims[i], random_state,
                       weight_norm=False)
    # reweight by empirical counts?
    n = categorical_crossentropy(softmax(note_pred), note_target[:, :, i],
                                 class_weights={0: .001})
    cost = tf.reduce_sum(n)
    note_preds.append(note_pred)
    costs.append(cost)

cost = sum(costs)  # / (sequence_length * batch_size)
# cost in bits
# cost = cost * 1.44269504089

params = tf.trainable_variables()
grads = tf.gradients(cost, params)
grads = [tf.clip_by_value(grad, -grad_clip, grad_clip) for grad in grads]
opt = tf.train.AdamOptimizer(learning_rate)
updates = opt.apply_gradients(zip(grads, params))
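The class_weights={0: .001} argument all but removes symbol 0 from each per-note loss, and the "reweight by empirical counts?" note hints that frequency-based weights were under consideration. As a rough illustration of per-class loss weighting (plain numpy, not the library's categorical_crossentropy):

import numpy as np

# Illustration only: scale each example's negative log likelihood by a
# weight looked up from its true class (here class 0 is nearly ignored).
probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1]])           # softmax outputs
labels = np.array([0, 1])                     # true classes
class_weights = {0: 0.001, 1: 1.0, 2: 1.0}
w = np.array([class_weights[int(l)] for l in labels])
per_example = w * -np.log(probs[np.arange(len(labels)), labels])
loss = per_example.sum()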
                       note_out_dims[i], random_state,
                       weight_norm=weight_norm_outputs,
                       name=name_note)
    duration_pred = Linear([h1[:, :, :h_dim], scan_inp,
                            target_note_masked[i], target_duration_masked[i]],
                           [h_dim, scan_inp_dim,
                            n_notes * note_embed_dim, n_notes * duration_embed_dim],
                           duration_out_dims[i], random_state,
                           weight_norm=weight_norm_outputs,
                           name=name_dur)
    n = categorical_crossentropy(softmax(note_pred), note_target[:, :, i])
    d = categorical_crossentropy(softmax(duration_pred), duration_target[:, :, i])
    # crossed weighting: pitch term scaled by the duration vocabulary size,
    # duration term by the pitch vocabulary size, then normalized
    cost = (n_duration_symbols * tf.reduce_mean(n)
            + n_note_symbols * tf.reduce_mean(d))
    cost /= (n_duration_symbols + n_note_symbols)
    note_preds.append(note_pred)
    duration_preds.append(duration_pred)
    costs.append(cost)

# 4 notes pitch and 4 notes duration
cost = sum(costs) / float(n_notes + n_notes)

params = tf.trainable_variables()
grads = tf.gradients(cost, params)
grads = [tf.clip_by_value(grad, -grad_clip, grad_clip) for grad in grads]
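Each per-voice cost above combines the two heads with crossed weights: the pitch cross entropy n is scaled by n_duration_symbols, the duration cross entropy d by n_note_symbols, and the result is normalized by their sum before the final average over the 2 * n_notes heads. A tiny worked example with made-up sizes:

# Made-up symbol counts and losses, only to show how the crossed
# weighting behaves; the real values come from the dataset vocabularies.
n_note_symbols = 50
n_duration_symbols = 10
pitch_ce = 2.0      # stand-in for tf.reduce_mean(n)
duration_ce = 3.0   # stand-in for tf.reduce_mean(d)
cost = (n_duration_symbols * pitch_ce + n_note_symbols * duration_ce)
cost /= (n_duration_symbols + n_note_symbols)
print(cost)  # (10 * 2.0 + 50 * 3.0) / 60 == 2.8333...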