def decode(acoustic, model, sess, branch_factor=50, beam_size=200, weight=None,
           hash_length=10, out=None, weight_model_dict=None, weight_model=None,
           verbose=False, gt=None):
    """
    Transduce the given acoustic probabilistic piano roll into a binary piano roll.

    If no weight model is in use (``weight_model`` is falsy and ``gt`` is None) and
    ``weight[0][0] == 1.0``, the language model is skipped entirely and the acoustic
    input is simply thresholded at 0.5.

    Parameters
    ==========
    acoustic : matrix
        A probabilistic piano roll, 88 x T, containing values between 0.0 and 1.0
        inclusive. acoustic[p, t] represents the probability of pitch p being present
        at frame t.

    model : Model
        The language model to use for the transduction process.

    sess : tf.session
        The session for the given model.

    branch_factor : int
        The number of samples to use per frame. Defaults to 50.

    beam_size : int
        The beam size for the search. Defaults to 200.

    weight : matrix
        A 2 x (1 or 88) matrix, whose first row is the weight for the acoustic model and
        whose 2nd row is the weight for the language model, either for each pitch (2x88)
        or across all pitches (2x1). Each column in the matrix should be normalized to
        sum to 1. Defaults to [[0.8], [0.2]] (also used when None is passed).

    hash_length : int
        The history length for the hashed beam. If two states do not differ in the past
        hash_length frames, only the most probable one is saved in the beam.
        Defaults to 10.

    out : string
        The directory in which to save the outputs, or None to not save anything. When
        given, one pickle file ``data_<frame_num>.pkl`` is written per frame.
        Defaults to None.

    weight_model_dict : dict
        A dictionary containing information about the weight model to use, if any.
        Defaults to None, which uses the static weight of the weight parameter.

    weight_model : sklearn.model or tf.keras.Model
        The model to be used as a weight_model, or None to use static weighting.

    verbose : bool
        Print progress in number of frames. Defaults to False (no printing).

    gt : matrix
        The ground truth piano roll, 88 x T. If given, this will be used to always use
        the optimum weight for each frame. Defaults to None.

    Returns
    =======
    piano_roll : np.ndarray
        An 88 x T binary piano roll, where a 1 represents the presence of a pitch
        at a given frame.

    priors : np.ndarray
        An 88 x T matrix, giving the prior assigned to each pitch detection by the
        most probable language model state.

    weights : np.ndarray
        An 88 X T matrix, giving the acoustic weights for each pitch at each frame.

    combined_priors : np.ndarray
        The combined priors stored by the most probable state (the value of its
        ``get_combined_priors()``). In the acoustic-only shortcut this is the
        ``acoustic`` input itself.
    """
    if weight is None:
        # Built per call: a row of this matrix is handed to the beam states below, so a
        # shared default list would be visible across calls.
        weight = [[0.8], [0.2]]

    # No-op for ndarrays (and subclasses); lets nested lists work in the shortcut below.
    acoustic = np.asanyarray(acoustic)

    if gt is not None:
        # Any truthy value enables the dynamic-weight branch; gt itself is forwarded to
        # run_weight_model() on every frame.
        weight_model = True

    # Pure acoustic decoding: nothing for the language model to contribute.
    if (not weight_model) and weight[0][0] == 1.0:
        return ((acoustic > 0.5).astype(int), np.zeros(acoustic.shape),
                np.ones(acoustic.shape), acoustic)

    weights_all = None
    priors_all = None

    beam = Beam()
    beam.add_initial_state(model, sess)

    # Iterate frame by frame: T x 88.
    acoustic = np.transpose(acoustic)

    for frame_num, frame in enumerate(acoustic):
        if verbose and frame_num % 20 == 0:
            print(str(frame_num) + " / " + str(acoustic.shape[0]))

        # Run the LSTM! (The initial state already carries a prior for frame 0.)
        if frame_num != 0:
            run_lstm(sess, model, beam)

        # Here, beam contains a list of states, with sample histories, priors, and LSTM
        # hidden_states, but needs to be updated with weights and combined_priors when
        # sampling.

        # Here, we are calculating dynamic weights or priors if we are using gt or a
        # weight_model.
        if weight_model:
            weights_all, priors_all = run_weight_model(gt, weight_model, weight_model_dict,
                                                       beam, acoustic, frame_num)

        new_beam = Beam()

        # Here we sample from each state in the beam
        for i, state in enumerate(beam):
            # Dynamic results are laid out as consecutive blocks of 88 entries per state.
            state_slice = slice(i * 88, (i + 1) * 88)
            weight_this = weights_all[:, state_slice] if weights_all is not None else weight

            if priors_all is not None:
                prior = np.squeeze(priors_all[state_slice])
            else:
                prior = np.squeeze(weight_this[0] * frame + weight_this[1] * state.prior)

            # Update state
            state.update_from_weight_model(weight_this[0], prior)

            for log_prob, sample in itertools.islice(enumerate_samples(prior), branch_factor):
                # Binarize the sample (return from enumerate_samples is an array of indexes)
                binary_sample = np.zeros(88)
                binary_sample[sample] = 1

                # Transition on sample
                new_beam.add(state.transition(binary_sample, log_prob))

        new_beam.cut_to_size(beam_size, min(hash_length, frame_num + 1))
        beam = new_beam

        if out:
            output = [(s.get_piano_roll(), s.get_priors(), s.get_weights(),
                       s.get_combined_priors()) for s in beam]
            with open(os.path.join(out, 'data_' + str(frame_num) + '.pkl'), 'wb') as file:
                pickle.dump(output, file)

    top_state = beam.get_top_state()
    return (top_state.get_piano_roll(), top_state.get_priors(),
            top_state.get_weights(), top_state.get_combined_priors())
def decode_pitchwise(piano_roll, acoustic, model, sess, pitches, beam_size=200,
                     weight=None, hash_length=10, uncertainty=0.0):
    """
    Transduce the given binary piano roll prior into a binary piano roll by changing
    the given pitches.

    Note that ``piano_roll`` is modified in place (the rows listed in ``pitches`` are
    overwritten) and the same object is returned.

    Parameters
    ==========
    piano_roll : np.ndarray
        A binary piano roll, 88 X T.

    acoustic : np.ndarray
        A probabilistic piano roll, 88 x T, containing values between 0.0 and 1.0
        inclusive. acoustic[p, t] represents the probability of pitch p being present
        at frame t.

    model : Model
        The language model to use for the transduction process. Its ``n_notes``
        attribute determines the context window: (n_notes - 1) / 2 rows on each side
        of the pitch being transduced.

    sess : tf.session
        The session for the given model.

    pitches : list(int)
        A list of the pitches to transduce. If empty, ``piano_roll`` is returned
        unchanged without running the model.

    beam_size : int
        The beam size for the search. Defaults to 200.

    weight : matrix
        A 2 x 1 matrix, whose first row is the weight for the acoustic model and whose
        2nd row is the weight for the language model. As a special case, if
        ``weight[0][0] == -1`` the two probabilities are multiplied instead of being
        combined as a weighted sum. Defaults to [[0.8], [0.2]] (also used when None is
        passed).

    hash_length : int
        The history length for the hashed beam. If two states do not differ in the past
        hash_length frames, only the most probable one is saved in the beam.
        Defaults to 10.

    uncertainty : float
        The uncertainty of the LSTM prior outputs. The outputs will be scaled to a range
        of size (1 - 2*uncertainty), centered around 0.5. Specifically,
        (0.0, 1.0) -> (0.0+uncertainty, 1.0-uncertainty). Defaults to 0.0.

    Returns
    =======
    pr : np.ndarray
        The resulting binary piano roll, 88 X T.
    """
    # Nothing to transduce: skip the per-frame model calls entirely.
    if len(pitches) == 0:
        return piano_roll

    if weight is None:
        # Built per call: weight[0] is handed to the beam states below, so a shared
        # default list would be visible across calls.
        weight = [[0.8], [0.2]]

    window = int((model.n_notes - 1) / 2)
    # Zero-pad above and below so every pitch has a full window of context rows.
    padding = np.zeros((window, piano_roll.shape[1]))
    pr_padded = np.vstack((padding, piano_roll, padding))

    # One beam per pitch
    beams = []
    for _ in pitches:
        beam = Beam()
        beam.add_initial_state(model, sess, iterative_pw=True)
        beams.append(beam)

    # Iterate frame by frame: T x 88.
    acoustic = np.transpose(acoustic)

    for frame_num, frame in enumerate(acoustic):
        # Run the LSTM! (All states of all beams are passed in a single call.)
        if frame_num != 0:
            run_lstm_pitchwise_iterative(sess, model, [s for beam in beams for s in beam],
                                         uncertainty=uncertainty)

        # Here, the beams contain a list of states, with sample histories, priors, and
        # hidden_states, but needs to be updated with weights and combined_priors when
        # sampling.
        new_beams = [Beam() for _ in pitches]

        # Here we sample from each state in each beam
        for pitch, beam, new_beam in zip(pitches, beams, new_beams):
            # In the padded roll, row `pitch + window` is the pitch itself, so this slice
            # is the pitch plus `window` rows of context on either side.
            pr_windowed = pr_padded[pitch:pitch + 2 * window + 1, frame_num]

            for state in beam:
                if weight[0][0] == -1:
                    # Product mode: the two scores are not renormalized to sum to 1.
                    prior = frame[pitch] * state.prior[0]
                    anti_prior = (1 - frame[pitch]) * (1 - state.prior[0])
                else:
                    prior = np.squeeze(weight[0][0] * frame[pitch] +
                                       weight[1][0] * state.prior[0])
                    anti_prior = 1 - prior

                # Update state
                state.update_from_weight_model(weight[0], [prior])

                # Branch on both outcomes: pitch on (1) and pitch off (0).
                for log_prob, sample in zip(np.log([prior, anti_prior]), [1, 0]):
                    if window == 0:
                        # Handled separately: pr_windowed[-0:] would be the whole slice.
                        sample_full = np.array([sample])
                    else:
                        sample_full = np.concatenate(
                            (pr_windowed[:window], [sample], pr_windowed[-window:]))

                    # Transition on sample
                    new_beam.add(state.transition(sample_full, log_prob))

            new_beam.cut_to_size(beam_size, min(hash_length, frame_num + 1))

        beams = new_beams

    # The middle row of each top state's piano roll is the transduced pitch.
    for pitch, beam in zip(pitches, beams):
        piano_roll[pitch, :] = beam.get_top_state().get_piano_roll()[window, :]

    return piano_roll
def get_weight_data(gt, acoustic, model, sess, branch_factor=50, beam_size=200, union=False,
                    weight=None, hash_length=10, gt_only=False, history=5, min_diff=0.01,
                    features=False, verbose=False):
    """
    Collect weight-model data points (x, y, diffs) by running the beam search over one
    piece and, at every frame, recording each beam state's pitches for which the
    language and acoustic priors disagree.

    Parameters
    ==========
    gt : matrix
        The ground truth binary piano roll, 88 x T.

    acoustic : matrix
        A probabilistic piano roll, 88 x T, containing values between 0.0 and 1.0
        inclusive. acoustic[p, t] represents the probability of pitch p being present
        at frame t.

    model : Model
        The language model to use for the transduction process.

    sess : tf.session
        The session for the given model.

    branch_factor : int
        The number of samples to use per frame. Halved (rounded down) when union is
        True. Defaults to 50.

    beam_size : int
        The beam size for the search. Defaults to 200.

    union : boolean
        True to use union sampling. False (default) to use joint sampling with the
        weight.

    weight : matrix
        A 2 x 1 matrix, whose first row is the weight for the acoustic model and whose
        2nd row is the weight for the language model. Each column should be normalized
        to sum to 1. Defaults to [[0.5], [0.5]] (also used when None is passed).

    hash_length : int
        The history length for the hashed beam. If two states do not differ in the past
        hash_length frames, only the most probable one is saved in the beam.
        Defaults to 10.

    gt_only : boolean
        True to transition only on the ground truth sample (from the top state of the
        beam), no matter its rank. False to transition normally. Defaults to False.

    history : int
        How many frames to save in the x data point. Defaults to 5.

    min_diff : float
        The minimum difference (between language and acoustic) to save a data point.
        If not positive, all 88 pitches are saved for every state. Defaults to 0.01.

    features : boolean
        Whether to use features in the weight_model's data points. Defaults to False.

    verbose : boolean
        Print progress every 20 frames. Defaults to False (no printing).

    Returns
    =======
    x : np.ndarray
        The x data from this decoding process, one row per data point, as produced by
        decode.create_weight_x_sk. An empty (0, 0) array if no data point was saved.

    y : np.array
        The y data from this decoding process (the ground truth value of each data
        point). A data-length array.

    diffs : np.array
        The differences between the language and acoustic model priors for each
        data point.
    """
    # NOTE(review): `decode.<fn>` below must resolve to the module that provides
    # create_weight_x_sk / enumerate_samples / get_log_prob, not to a function named
    # `decode` -- confirm against this file's imports.
    if weight is None:
        # Built per call: this list is handed to the beam states below, so a shared
        # default list would be visible across calls.
        weight = [[0.5], [0.5]]

    if union:
        # Half of the samples come from each of the two sampling passes.
        branch_factor = int(branch_factor / 2)

    # Data points are collected per state and joined once at the end; growing the
    # arrays inside the loop would re-copy everything gathered so far each time.
    x_chunks = []
    y_chunks = [np.zeros(0)]
    diff_chunks = [np.zeros(0)]

    beam = Beam()
    beam.add_initial_state(model, sess)

    # Iterate frame by frame: T x 88.
    gt = np.transpose(gt)
    acoustic = np.transpose(acoustic)

    for frame_num, frame in enumerate(acoustic):
        if frame_num % 20 == 0 and verbose:
            print(str(frame_num) + " / " + str(acoustic.shape[0]))

        gt_frame = gt[frame_num, :]

        # Get data
        for state in beam:
            if min_diff > 0:
                # Keep only the pitches where the two priors differ by more than min_diff.
                close = np.isclose(np.squeeze(state.prior), np.squeeze(frame),
                                   rtol=0.0, atol=min_diff)
                pitches = np.argwhere(1 - close)[:, 0]
            else:
                pitches = np.arange(88)

            if len(pitches) > 0:
                x_chunks.append(decode.create_weight_x_sk(state, acoustic, frame_num, history,
                                                          pitches=pitches, features=features))
                y_chunks.append(np.ravel(gt_frame[pitches]))
                diff_chunks.append(np.ravel(np.abs(np.squeeze(frame)[pitches] -
                                                   np.squeeze(state.prior)[pitches])))

        # Gather all computations to perform them batched
        states = []
        samples = []
        weights = []
        priors = []

        if gt_only:
            states = [beam.get_top_state()]
            samples = [gt_frame]
            weights = [[[1.0], [0.0]]]
            priors = [np.squeeze(states[0].prior)]
        else:
            # Used for union sampling: the acoustic samples, in hashable form.
            acoustic_samples = set()

            # Acoustic sampling is done separately because the acoustic samples will be
            # identical for every state.
            if union or weight[0][0] == 1.0:
                # If sampling method is acoustic (or union), we generate the same samples
                # for every current hypothesis
                for _, sample in itertools.islice(
                        decode.enumerate_samples(frame, beam.beam[0].prior,
                                                 weight=[[1.0], [0.0]]),
                        branch_factor):
                    binary_sample = np.zeros(88)
                    binary_sample[sample] = 1

                    # This is used to check for overlaps in union case
                    if union:
                        acoustic_samples.add(tuple(binary_sample))

                    for state in beam:
                        states.append(state)
                        priors.append(np.squeeze(state.prior))
                        weights.append(weight)
                        samples.append(binary_sample)

            if union or weight[0][0] != 1.0:
                for state in beam:
                    sample_weight = [[0.0], [1.0]] if union else weight
                    for _, sample in itertools.islice(
                            decode.enumerate_samples(frame, state.prior, weight=sample_weight),
                            branch_factor):
                        binary_sample = np.zeros(88)
                        binary_sample[sample] = 1

                        # Overlap with acoustic sample in union case. Skip this sample.
                        if union and tuple(binary_sample) in acoustic_samples:
                            continue

                        priors.append(np.squeeze(state.prior))
                        states.append(state)
                        samples.append(binary_sample)
                        weights.append(weight)

        log_probs, combined_priors = decode.get_log_prob(
            np.array(samples), np.array(frame), np.array(priors), np.array(weights))

        # One (1 x 88) step per gathered sample.
        np_samples = np.zeros((len(samples), 1, 88))
        for i, sample in enumerate(samples):
            np_samples[i, 0, :] = sample

        hidden_states, priors = model.run_one_step([s.hidden_state for s in states],
                                                   np_samples, sess)

        beam = Beam()
        for hidden_state, prior, log_prob, state, sample, w, combined_prior in zip(
                hidden_states, priors, log_probs, states, samples, weights, combined_priors):
            beam.add(state.transition(sample, log_prob, hidden_state, prior, w,
                                      combined_prior))

        beam.cut_to_size(beam_size, min(hash_length, frame_num + 1))

    x = np.vstack(x_chunks) if x_chunks else np.zeros((0, 0))
    y = np.concatenate(y_chunks)
    diffs = np.concatenate(diff_chunks)

    return x, y, diffs