def mask_chorale(self, chorale, constraints_location=None):
    """
    Replace the unconstrained positions of a chorale by the per-voice
    "no constraint" index.

    :param chorale: (batch_size, num_voices, chorale_length)
    :param constraints_location: optional tensor with the same size as
        chorale; it is used as a multiplicative mask (presumably 0/1 valued:
        1 keeps the note, 0 replaces it). When None, a random mask is drawn.
    :return: tensor with the same size as chorale
    """
    # Always drawn, even when a mask is supplied, so the state of the
    # `random` generator evolves identically on both paths.
    keep_threshold = random.random() * 0.5

    if constraints_location is not None:
        assert constraints_location.size() == chorale.size()
        mask = cuda_variable(constraints_location)
    else:
        random_mask = torch.rand(*chorale.size()) < keep_threshold
        mask = cuda_variable(random_mask.long())

    batch_size, _, chorale_length = chorale.size()

    # For each voice the "no constraint" symbol is the index right after
    # the last entry of its note2index dictionary.
    dict_sizes = [
        len(note2index)
        for note2index in self.chorale_dataset.note2index_dicts
    ]
    no_constraint = torch.from_numpy(np.array(dict_sizes))
    # (num_voices,) -> (1, num_voices, 1) -> (batch, num_voices, length)
    no_constraint = no_constraint[None, :, None].long().clone()
    no_constraint = cuda_variable(
        no_constraint.repeat(batch_size, 1, chorale_length))

    kept_notes = chorale * mask
    masked_notes = no_constraint * (1 - mask)
    return kept_notes + masked_notes
def loss_and_acc_on_epoch(self, data_loader, train=True):
    """
    Run one pass over data_loader and return the mean loss and accuracy.

    For every batch only a single tick (index
    sequences_size * subdivision / 2, presumably the middle of the
    sequence) is used as prediction target.

    :param data_loader: iterable of (chorale, metadata) batches that
        supports len()
    :param train: when True, backpropagate the loss and step the optimizer
        on every batch
    :return: (float, float) mean loss and mean accuracy over the batches
    :raises ZeroDivisionError: if data_loader is empty
    """
    mean_loss = 0
    mean_accuracy = 0

    # The target tick does not depend on the batch: compute it once.
    t = int(self.chorale_dataset.sequences_size *
            self.chorale_dataset.subdivision / 2)

    # tqdm wraps the loader itself (not enumerate(...)) so that it can read
    # its length and display a total / ETA.
    for chorale, metadata in tqdm(data_loader):
        chorale, metadata = (cuda_variable(chorale.long()),
                             cuda_variable(metadata.long()))

        self.optimizer.zero_grad()
        weights = self.forward(chorale=chorale, metadata=metadata)

        targets = chorale[:, :, t]
        targets = targets.transpose(0, 1)
        # targets is now (num_voices, batch)

        weights = [
            weight_per_voice[:, t, :] for weight_per_voice in weights
        ]
        # list of (batch, num_notes)

        loss = self.mean_crossentropy_loss(weights=weights,
                                           targets=targets)
        if train:
            loss.backward()
            self.optimizer.step()

        # compute mean loss and accuracy
        mean_loss += to_numpy(loss.mean())[0]
        accuracy = self.mean_accuracy(weights=weights, targets=targets)
        mean_accuracy += to_numpy(accuracy)[0]

    mean_loss /= len(data_loader)
    mean_accuracy /= len(data_loader)
    return (mean_loss, mean_accuracy)
def forward(self, chorale: Variable, metadata: Variable):
    """
    Compute, for every voice, the unnormalised note scores at every tick.

    :param chorale: (batch, num_voices, length in ticks)
    :param metadata: (batch, num_voices, length in ticks, num_metadatas)
    :return: list (one entry per voice) of scores of size
        (batch, chorale_length, num_notes)
    """
    batch_size, num_voices, chorale_length = chorale.size()
    sequence_length = num_voices * chorale_length

    # === embed as wrapped sequence ===
    embedded_notes = self.embed_chorale(chorale)
    embedded_metadata = self.embed_metadata(metadata, chorale)

    # === LSTM on constraints ===
    output_constraints = self.output_lstm_constraints(embedded_metadata)

    # === LSTM on notes ===
    # Shift the notes one step to the right (zero vector first, last note
    # dropped) so each step is fed the previous note.
    start_padding = cuda_variable(
        torch.zeros(batch_size, 1, self.note_embedding_dim))
    previous_notes = embedded_notes[:, :sequence_length - 1, :]
    shifted_notes = torch.cat([start_padding, previous_notes], 1)
    if self.dropout_input_prob > 0:
        shifted_notes = self.drop_input(shifted_notes)

    lstm_input = torch.cat([shifted_notes, output_constraints], 2)
    hidden = self.init_hidden(batch_size=batch_size, type='generation')
    output_gen, hidden = self.lstm_generation(lstm_input, hidden)

    # Shared linear layer + ReLU, applied independently to every time slice
    activations = []
    for time_slice in output_gen.split(split_size=1, dim=1):
        activations.append(F.relu(self.linear_1(time_slice)))
    weights = torch.cat(activations, 1)
    weights = weights.view(batch_size, chorale_length, num_voices,
                           self.num_units_linear)

    # One output layer per voice. No softmax here: CrossEntropy includes a
    # LogSoftMax layer.
    scores_per_voice = []
    voice_slices = weights.split(split_size=1, dim=2)
    for voice, linear_layer in zip(voice_slices, self.linear_ouput_notes):
        scores_per_voice.append(linear_layer(voice[:, :, 0, :]))
    return scores_per_voice
def output_lstm_constraints(self, flat_embedded_metadata):
    """
    Run the constraint LSTM over the time-reversed sequence and return its
    outputs put back in the original time order.

    :param flat_embedded_metadata: (batch_size, length, total_embedding_dim)
    :return: outputs of self.lstm_constraint, indexed along dimension 1 in
        the same order as the input
    """
    batch_size = flat_embedded_metadata.size(0)
    hidden = self.init_hidden(batch_size=batch_size, type='constraint')

    # Indices length-1, ..., 0: selecting with them reverses dimension 1,
    # and selecting a second time restores the original order.
    seq_length = flat_embedded_metadata.size(1)
    reversed_indices = list(reversed(range(seq_length)))
    reversed_indices = cuda_variable(torch.LongTensor(reversed_indices))

    reversed_input = flat_embedded_metadata.index_select(
        1, reversed_indices)
    reversed_output, hidden = self.lstm_constraint(reversed_input, hidden)
    return reversed_output.index_select(1, reversed_indices)
def fill(self, ascii_input):
    """
    Sample a chorale that follows the constraints given as symbols.

    :param ascii_input: one sequence of note symbols per voice; every symbol
        is looked up in the note2index dictionary of its voice, except 'NC'
        which stands for "no constraint". All voices are assumed to have the
        length of the first one.
    :return: (score, gen_chorale, tensor_metadata) where gen_chorale is the
        sampled index chorale, score its conversion by
        chorale_dataset.tensor_chorale_to_score and tensor_metadata the
        embedded metadata fed to the constraint LSTM
    """
    self.eval()

    # constants
    num_voices = self.chorale_dataset.num_voices
    padding_size = num_voices * 8 * self.chorale_dataset.subdivision
    # NOTE(review): the scores are *multiplied* by this value before the
    # softmax; with 1. it has no effect.
    temperature = 1.
    chorale_length = len(ascii_input[0])

    # preprocessing
    # Symbols -> indices; 'NC' maps to len(d), the index right after the
    # last entry of the voice dictionary.
    constraint_metadata = [[
        d[c] if c != 'NC' else len(d) for c in ascii_voice
    ] for d, ascii_voice in zip(self.chorale_dataset.note2index_dicts,
                                ascii_input)]
    constraint_metadata = torch.from_numpy(
        np.array(constraint_metadata)).long()
    # A trailing dimension is added for the padding helper and removed
    # right after; padding_size ticks are requested on both sides.
    constraint_metadata = self.chorale_dataset.extract_metadata_with_padding(
        constraint_metadata[:, :, None],
        -padding_size,
        end_tick=chorale_length + padding_size)[:, :, 0]
    constraint_metadata = cuda_variable(constraint_metadata, volatile=True)
    constraint_metadata = self.embed_chorale(
        constraint_metadata[None, :, :])

    other_metadata = cuda_variable(torch.from_numpy(
        np.array([
            metadata.generate(chorale_length + 2 * padding_size)
            for metadata in self.chorale_dataset.metadatas
        ])), volatile=True)
    # add voice index?!
    other_metadata = torch.cat(
        [other_metadata, torch.zeros_like(other_metadata)], 0)
    other_metadata = other_metadata.transpose(0, 1)
    other_metadata = other_metadata[None, None, :, :]
    # NOTE(review): embed_metadata only receives the metadata here, while
    # forward() also passes the chorale - confirm the signature allows it.
    other_metadata = self.embed_metadata(other_metadata)

    tensor_metadata = torch.cat([
        other_metadata,
        constraint_metadata,
    ], 2)

    # generated chorale
    gen_chorale = self.chorale_dataset.empty_chorale(chorale_length)

    output_constraints = self.output_lstm_constraints(tensor_metadata)
    hidden = self.init_hidden(batch_size=1, type='generation')

    # Warm up the generation LSTM over the leading padding: it is fed the
    # value stored at time 0 of the empty chorale for each voice in turn
    # (presumably the start symbols - TODO confirm).
    for tick_index in range(padding_size):
        voice_index = tick_index % num_voices
        # notes
        time_slice = gen_chorale[voice_index, 0]
        time_slice = torch.from_numpy(np.array([time_slice]))[None, :]
        note = self.note_embeddings[voice_index](cuda_variable(
            time_slice, volatile=True))
        time_slice = note
        # concat with first metadata
        time_slice_cat = torch.cat(
            (time_slice,
             output_constraints[:, tick_index:tick_index + 1, :]), 2)
        output_gen, hidden = self.lstm_generation(time_slice_cat, hidden)

    # Drop the padded ticks on both sides
    output_constraints = output_constraints[:,
                                            padding_size:-padding_size, :]

    # generation:
    # tick_index enumerates the flattened sequence (all voices of time 0,
    # then all voices of time 1, ...). Each step reads the note at
    # tick_index and samples the note at tick_index + 1.
    for tick_index in range(-1, chorale_length * num_voices - 1):
        voice_index = tick_index % num_voices
        time_index = (tick_index - voice_index) // num_voices

        next_voice_index = (tick_index + 1) % num_voices
        next_time_index = (tick_index + 1 - next_voice_index) // num_voices

        if tick_index == -1:
            # Nothing has been generated yet: feed the initial value of the
            # last voice.
            last_start_symbol = gen_chorale[-1, 0]
            last_start_symbol = torch.from_numpy(
                np.array([last_start_symbol]))[None, :]
            time_slice = self.note_embeddings[-1](cuda_variable(
                (last_start_symbol), volatile=True))
        else:
            time_slice = gen_chorale[voice_index, time_index]
            time_slice = torch.from_numpy(np.array([time_slice]))[None, :]
            note = self.note_embeddings[voice_index](cuda_variable(
                time_slice, volatile=True))
            time_slice = note

        # The constraint output used is the one of the tick being predicted
        time_slice_cat = torch.cat(
            (time_slice,
             output_constraints[:, tick_index + 1:tick_index + 2, :]), 2)

        output_gen, hidden = self.lstm_generation(time_slice_cat, hidden)

        weights = F.relu(self.linear_1(output_gen[:, 0, :]))
        weights = self.linear_ouput_notes[next_voice_index](weights)

        # compute predictions
        # temperature
        weights = weights * temperature
        # weights is (batch, num_notes): normalise over the notes. The
        # dimension is explicit because the implicit choice is deprecated.
        preds = F.softmax(weights, dim=1)

        # first batch element
        preds = to_numpy(preds[0])
        new_pitch_index = np.random.choice(np.arange(
            self.num_notes_per_voice[next_voice_index]), p=preds)

        gen_chorale[next_voice_index, next_time_index] = int(
            new_pitch_index)

    score = self.chorale_dataset.tensor_chorale_to_score(
        tensor_chorale=gen_chorale)
    return score, gen_chorale, tensor_metadata
def generate(self,
             original_tensor_chorale,
             tensor_metadata,
             constraints_location,
             temperature=1.):
    """
    Sample a chorale note by note, conditioned on metadata and constraints.

    :param original_tensor_chorale: chorale passed to embed_metadata together
        with constraints_location (a batch dimension is prepended here)
    :param tensor_metadata: (num_voices, chorale_length, num_metadatas)
    :param constraints_location: tensor forwarded to embed_metadata with a
        batch dimension prepended (presumably marks the constrained notes of
        original_tensor_chorale - TODO confirm)
    :param temperature: factor the output scores are multiplied by before
        the softmax
    :return: (score, gen_chorale, tensor_metadata) where gen_chorale is the
        sampled index chorale, score its conversion by
        chorale_dataset.tensor_chorale_to_score and tensor_metadata the
        unchanged input metadata
    """
    self.eval()
    original_tensor_chorale = cuda_variable(original_tensor_chorale,
                                            volatile=True)

    # The unpacking also checks that tensor_metadata is 3-dimensional; the
    # number of metadatas itself is not needed.
    num_voices, chorale_length, _ = tensor_metadata.size()

    # generated chorale
    gen_chorale = self.chorale_dataset.empty_chorale(chorale_length)

    m = cuda_variable(tensor_metadata[None, :, :, :], volatile=True)
    m = self.embed_metadata(
        m,
        original_tensor_chorale[None, :, :],
        constraints_location=constraints_location[None, :, :])

    output_constraints = self.output_lstm_constraints(m)
    hidden = self.init_hidden(batch_size=1, type='generation')

    # Warm up the generation LSTM: it is fed the value stored at time 0 of
    # the empty chorale for each voice in turn (presumably the start
    # symbols - TODO confirm).
    warmup_ticks = (self.chorale_dataset.num_voices * 4 *
                    self.chorale_dataset.subdivision - 1)
    for tick_index in range(warmup_ticks):
        voice_index = tick_index % self.chorale_dataset.num_voices
        # notes
        time_slice = gen_chorale[voice_index, 0]
        time_slice = torch.from_numpy(np.array([time_slice]))[None, :]
        note = self.note_embeddings[voice_index](cuda_variable(
            time_slice, volatile=True))
        time_slice = note
        time_slice_cat = torch.cat(
            (time_slice,
             output_constraints[:, tick_index + 1:tick_index + 2, :]), 2)
        output_gen, hidden = self.lstm_generation(time_slice_cat, hidden)

    # generation:
    # tick_index enumerates the flattened sequence (all voices of time 0,
    # then all voices of time 1, ...). Each step reads the note at
    # tick_index and samples the note at tick_index + 1.
    for tick_index in range(-1, chorale_length * num_voices - 1):
        voice_index = tick_index % num_voices
        time_index = (tick_index - voice_index) // num_voices

        next_voice_index = (tick_index + 1) % num_voices
        next_time_index = (tick_index + 1 - next_voice_index) // num_voices

        if tick_index == -1:
            # Nothing has been generated yet: feed the initial value of the
            # last voice.
            last_start_symbol = gen_chorale[-1, 0]
            last_start_symbol = torch.from_numpy(
                np.array([last_start_symbol]))[None, :]
            time_slice = self.note_embeddings[-1](cuda_variable(
                (last_start_symbol), volatile=True))
        else:
            time_slice = gen_chorale[voice_index, time_index]
            time_slice = torch.from_numpy(np.array([time_slice]))[None, :]
            note = self.note_embeddings[voice_index](cuda_variable(
                time_slice, volatile=True))
            time_slice = note

        # The constraint output used is the one of the tick being predicted
        time_slice_cat = torch.cat(
            (time_slice,
             output_constraints[:, tick_index + 1:tick_index + 2, :]), 2)

        output_gen, hidden = self.lstm_generation(time_slice_cat, hidden)

        weights = F.relu(self.linear_1(output_gen[:, 0, :]))
        weights = self.linear_ouput_notes[next_voice_index](weights)

        # compute predictions
        # temperature
        weights = weights * temperature
        # weights is (batch, num_notes): normalise over the notes. The
        # dimension is explicit because the implicit choice is deprecated.
        preds = F.softmax(weights, dim=1)

        # first batch element
        preds = to_numpy(preds[0])
        new_pitch_index = np.random.choice(np.arange(
            self.num_notes_per_voice[next_voice_index]), p=preds)

        gen_chorale[next_voice_index, next_time_index] = int(
            new_pitch_index)

    score = self.chorale_dataset.tensor_chorale_to_score(
        tensor_chorale=gen_chorale)
    return score, gen_chorale, tensor_metadata