Python Beam.get_top_state Exemples, beam.Beam.get_top_state Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : decode.py Projet : yeweili94/MLM_decoding

def decode(acoustic,
           model,
           sess,
           branch_factor=50,
           beam_size=200,
           weight=[[0.8], [0.2]],
           hash_length=10,
           out=None,
           weight_model_dict=None,
           weight_model=None,
           verbose=False,
           gt=None):
    """
    Transduce the given acoustic probabilistic piano roll into a binary piano roll.

    The decode is a frame-by-frame beam search: for every state in the beam, the
    acoustic frame and the state's language-model prior are combined, the
    branch_factor most probable binary frames are enumerated, and the resulting
    transitions are collected into a new beam which is then pruned.

    Parameters
    ==========
    acoustic : matrix
        A probabilistic piano roll, 88 x T, containing values between 0.0 and 1.0
        inclusive. acoustic[p, t] represents the probability of pitch p being present
        at frame t.

    model : Model
        The language model to use for the transduction process.

    sess : tf.session
        The session for the given model.

    branch_factor : int
        The number of samples to use per frame. Defaults to 50.

    beam_size : int
        The beam size for the search. Defaults to 200.

    weight : matrix
        A 2 x (1 or 88) matrix, whose first row is the weight for the acoustic model and whose 2nd
        row is the weight for the language model, either for each pitch (2x88) or across all pitches
        (2x1). Each column in the matrix should be normalized to sum to 1. Defaults to [[0.8], [0.2]].
        This function only reads the value; it never modifies it.

    hash_length : int
        The history length for the hashed beam. If two states do not differ in the past hash_length
        frames, only the most probable one is saved in the beam. Defaults to 10.

    out : string
        The directory in which to save the outputs, or None to not save anything. Defaults to None.
        When set, one file named data_<frame_num>.pkl is pickled per frame, holding a list with one
        (piano_roll, priors, weights, combined_priors) tuple per state left in the beam.

    weight_model_dict : dict
        A dictionary containing information about the weight model to use, if any. Defaults to None,
        which uses the static weight of the weight parameter.

    weight_model : sklearn.model or tf.keras.Model
        The model to be used as a weight_model, or None to use static weighting. It is replaced
        by True when gt is given.

    verbose : bool
        Print progress in number of frames. Defaults to False (no printing).

    gt : matrix
        The ground truth piano roll, 88 x T. If given, this will be used to always use the optimum
        weight for each frame. Defaults to None.


    Returns
    =======
    piano_roll : np.ndarray
        An 88 x T binary piano roll, where a 1 represents the presence of a pitch
        at a given frame.

    priors : np.ndarray
        An 88 x T matrix, giving the prior assigned to each pitch detection by the
        most probable language model state.

    weights : np.ndarray
        An 88 X T matrix, giving the acoustic weights for each pitch at each frame.

    combined_priors : np.ndarray
        The combined (weighted acoustic + language) priors of the most probable state.

    Notes
    =====
    When no weight model is in use and weight[0][0] == 1.0 (acoustic model only), no
    search is run: the result is the acoustic roll thresholded at 0.5, with all-zero
    priors, all-one weights, and the acoustic roll itself as the combined priors.
    """
    # NOTE(review): assumes 88 pitches throughout, including in the column layout of
    # whatever run_weight_model returns -- confirm before generalising.
    num_pitches = 88

    if gt is not None:
        # A ground truth forces the dynamic-weight path below; the value only needs
        # to be truthy here.
        weight_model = True

    if (not weight_model) and weight[0][0] == 1.0:
        # Pure acoustic decoding: the language model cannot change the outcome.
        return ((acoustic > 0.5).astype(int), np.zeros(acoustic.shape),
                np.ones(acoustic.shape), acoustic)

    weights_all = None
    priors_all = None

    beam = Beam()
    beam.add_initial_state(model, sess)

    # After the transpose every row is one frame.
    acoustic = np.transpose(acoustic)
    num_frames = acoustic.shape[0]

    for frame_num, frame in enumerate(acoustic):
        if verbose and frame_num % 20 == 0:
            print(str(frame_num) + " / " + str(num_frames))

        # Run the LSTM (not needed on the first frame, which uses the initial state).
        if frame_num != 0:
            run_lstm(sess, model, beam)

        # Here, beam contains a list of states, with sample histories, priors, and LSTM
        # hidden_states, but needs to be updated with weights and combined_priors when sampling.

        # Dynamic weights or priors, if we are using gt or a weight_model.
        if weight_model:
            weights_all, priors_all = run_weight_model(gt, weight_model,
                                                       weight_model_dict, beam,
                                                       acoustic, frame_num)

        new_beam = Beam()

        # Sample from each state in the beam.
        for i, state in enumerate(beam):
            # Dynamic outputs are indexed per state in consecutive blocks of num_pitches.
            state_block = slice(i * num_pitches, (i + 1) * num_pitches)

            if weights_all is not None:
                weight_this = weights_all[:, state_block]
            else:
                weight_this = weight

            if priors_all is not None:
                prior = np.squeeze(priors_all[state_block])
            else:
                prior = np.squeeze(weight_this[0] * frame +
                                   weight_this[1] * state.prior)

            # Update state
            state.update_from_weight_model(weight_this[0], prior)

            for log_prob, sample in itertools.islice(enumerate_samples(prior),
                                                     branch_factor):
                # Binarize the sample (return from enumerate_samples is an array of indexes)
                binary_sample = np.zeros(num_pitches)
                binary_sample[sample] = 1

                # Transition on sample
                new_beam.add(state.transition(binary_sample, log_prob))

        # Early frames have fewer than hash_length frames of history to hash on.
        new_beam.cut_to_size(beam_size, min(hash_length, frame_num + 1))
        beam = new_beam

        if out:
            output = [(s.get_piano_roll(), s.get_priors(), s.get_weights(),
                       s.get_combined_priors()) for s in beam]
            pkl_path = os.path.join(out, 'data_' + str(frame_num) + '.pkl')
            with open(pkl_path, 'wb') as pkl_file:
                pickle.dump(output, pkl_file)

    top_state = beam.get_top_state()
    return (top_state.get_piano_roll(), top_state.get_priors(),
            top_state.get_weights(), top_state.get_combined_priors())

Exemple #2

0

Afficher le fichier

Fichier : decode.py Projet : yeweili94/MLM_decoding

def decode_pitchwise(piano_roll,
                     acoustic,
                     model,
                     sess,
                     pitches,
                     beam_size=200,
                     weight=[[0.8], [0.2]],
                     hash_length=10,
                     uncertainty=0.0):
    """
    Re-decode the given pitches of a binary piano roll, running one beam search per pitch.

    Each requested pitch gets its own beam. At every frame a state branches on the
    pitch being on (1) or off (0); the surrounding pitches inside the model's window
    are copied unchanged from the input piano roll.

    Parameters
    ==========
    piano_roll : np.ndarray
        A binary piano roll, 88 X T. The rows listed in pitches are overwritten in place,
        and this same array is returned.

    acoustic : np.ndarray
        A probabilistic piano roll, 88 x T, containing values between 0.0 and 1.0
        inclusive. acoustic[p, t] represents the probability of pitch p being present
        at frame t.

    model : Model
        The language model to use for the transduction process. Its n_notes attribute
        sets the window: (n_notes - 1) / 2 pitches on each side of the decoded pitch.

    sess : tf.session
        The session for the given model.

    pitches : list(int)
        A list of the pitches to transduce.

    beam_size : int
        The beam size for the search. Defaults to 200.

    weight : matrix
        A 2 x 1 matrix, whose first row is the weight for the acoustic model and whose 2nd
        row is the weight for the language model. Defaults to [[0.8], [0.2]].
        If weight[0][0] is -1, the acoustic and language probabilities are multiplied
        instead of being mixed linearly, and the states are not given a weight update.

    hash_length : int
        The history length for the hashed beam. If two states do not differ in the past hash_length
        frames, only the most probable one is saved in the beam. Defaults to 10.

    uncertainty : float
        The uncertainty of the LSTM prior outputs. The outputs will be scaled
        to a range of size (1 - 2*uncertainty), centered around 0.5. Specifically,
        (0.0, 1.0) -> (0.0+uncertainty, 1.0-uncertainty). Defaults to 0.0.

    Returns
    =======
    pr : np.ndarray
        The resulting binary piano roll, 88 X T (the piano_roll argument, modified).
    """
    half_window = int((model.n_notes - 1) / 2)

    # Zero-pad the pitch axis so that a full window exists around every pitch.
    edge = np.zeros((half_window, piano_roll.shape[1]))
    padded_roll = np.vstack((edge, piano_roll, edge))

    def fresh_beam():
        # A beam holding only the model's initial (pitchwise) state.
        start = Beam()
        start.add_initial_state(model, sess, iterative_pw=True)
        return start

    # One beam per pitch
    beams = [fresh_beam() for _ in pitches]

    for frame_num, frame in enumerate(np.transpose(acoustic)):
        # Run the LSTM over the states of every beam at once (not on the first frame).
        if frame_num != 0:
            every_state = [s for beam in beams for s in beam]
            run_lstm_pitchwise_iterative(sess,
                                         model,
                                         every_state,
                                         uncertainty=uncertainty)

        # Here, the beams contain a list of states, with sample histories, priors, and
        # hidden_states, but need weights and combined_priors when sampling.
        hash_history = min(hash_length, frame_num + 1)
        successors = []

        for pitch, beam in zip(pitches, beams):
            # Row `pitch` of the padded roll is `half_window` rows below the pitch itself,
            # so this slice is centred on the pitch being decoded.
            context = padded_roll[pitch:pitch + 2 * half_window + 1, frame_num]
            successor = Beam()

            for state in beam:
                if weight[0][0] == -1:
                    # Product mode: multiply acoustic and language probabilities.
                    p_on = frame[pitch] * state.prior[0]
                    p_off = (1 - frame[pitch]) * (1 - state.prior[0])
                else:
                    # Linear mixture of the two models.
                    p_on = np.squeeze(weight[0][0] * frame[pitch] +
                                      weight[1][0] * state.prior[0])
                    p_off = 1 - p_on
                    # Update state
                    state.update_from_weight_model(weight[0], [p_on])

                branch_log_probs = np.log([p_on, p_off])

                for log_prob, sample in zip(branch_log_probs, [1, 0]):
                    if half_window == 0:
                        sample_full = np.array([sample])
                    else:
                        below = context[:half_window]
                        above = context[-half_window:]
                        sample_full = np.concatenate((below, [sample], above))

                    # Transition on sample
                    successor.add(state.transition(sample_full, log_prob))

            successor.cut_to_size(beam_size, hash_history)
            successors.append(successor)

        beams = successors

    # Copy the centre row of each best hypothesis back into the caller's piano roll.
    for pitch, beam in zip(pitches, beams):
        best_roll = beam.get_top_state().get_piano_roll()
        piano_roll[pitch, :] = best_roll[half_window, :]

    return piano_roll

Exemple #3

0

Afficher le fichier

def get_weight_data(gt,
                    acoustic,
                    model,
                    sess,
                    branch_factor=50,
                    beam_size=200,
                    union=False,
                    weight=[[0.5], [0.5]],
                    hash_length=10,
                    gt_only=False,
                    history=5,
                    min_diff=0.01,
                    features=False,
                    verbose=False):
    """
    Run a beam-search decode and collect weight-model training data along the way.

    At every frame, for every state in the beam, the pitches whose language prior and
    acoustic value differ by more than min_diff yield one data point each: an x row
    (from decode.create_weight_x_sk), a y value (the ground truth for that pitch) and
    the absolute prior difference. The beam is then advanced one frame.

    Parameters
    ==========
    gt : matrix
        The ground truth binary piano roll, 88 x T.

    acoustic : matrix
        A probabilistic piano roll, 88 x T, containing values between 0.0 and 1.0
        inclusive. acoustic[p, t] represents the probability of pitch p being present
        at frame t.

    model : Model
        The language model to use for the transduction process.

    sess : tf.session
        The session for the given model.

    branch_factor : int
        The number of samples to use per frame. Defaults to 50. With union sampling,
        half of this (rounded down) is used for each of the two samplers.

    beam_size : int
        The beam size for the search. Defaults to 200.

    union : boolean
        True to use union sampling. False (default) to use joint sampling with the weight.

    weight : matrix
        A 2 x 1 matrix (nested list), whose first row is the weight for the acoustic model and
        whose 2nd row is the weight for the language model. It should be normalized to sum to 1.
        Defaults to [[0.5], [0.5]]. This function only reads the value.

    hash_length : int
        The history length for the hashed beam. If two states do not differ in the past hash_length
        frames, only the most probable one is saved in the beam. Defaults to 10.

    gt_only : boolean
        True to transition only on the ground truth sample, no matter its rank. False to transition
        normally. Defaults to False.

    history : int
        How many frames to save in the x data point. Defaults to 5.

    min_diff : float
        The minimum difference (between language and acoustic) to save a data point. Defaults to 0.01.
        If not positive, every pitch is saved.

    features : boolean
        Whether to use features in the weight_model's data points. Defaults to False.

    verbose : boolean
        Print progress every 20 frames. Defaults to False (no printing).


    Returns
    =======
    x : np.ndarray
        The x data from this decoding process, one row per data point, as produced by
        decode.create_weight_x_sk. An empty (0 x 0) matrix if no data point was saved.

    y : np.array
        The y data from this decoding process. A data-length array.

    diffs : np.array
        The differences between the language and acoustic model priors for each data point.
    """
    if union:
        # Each of the two samplers (acoustic, language) gets half of the budget.
        branch_factor = int(branch_factor / 2)

    # Data is gathered in pieces and joined once at the end; growing the arrays inside
    # the loop would copy everything collected so far on every state.
    # NOTE(review): assumes decode.create_weight_x_sk returns a fresh 2-D array -- confirm.
    x_chunks = []
    # The empty float seeds keep the float dtype np.append gave the original arrays.
    y_chunks = [np.zeros(0)]
    diff_chunks = [np.zeros(0)]

    beam = Beam()
    beam.add_initial_state(model, sess)

    # After the transposes every row is one frame.
    gt = np.transpose(gt)
    acoustic = np.transpose(acoustic)
    num_frames = acoustic.shape[0]

    for frame_num, frame in enumerate(acoustic):
        if frame_num % 20 == 0 and verbose:
            print(str(frame_num) + " / " + str(num_frames))
        gt_frame = gt[frame_num, :]

        # Get data
        for state in beam:
            if min_diff > 0:
                # Keep only the pitches where the two models disagree by more than min_diff.
                agree = np.isclose(np.squeeze(state.prior),
                                   np.squeeze(frame),
                                   rtol=0.0,
                                   atol=min_diff)
                pitches = np.argwhere(np.logical_not(agree))[:, 0]
            else:
                pitches = np.arange(88)

            if len(pitches) > 0:
                x_chunks.append(
                    decode.create_weight_x_sk(state,
                                              acoustic,
                                              frame_num,
                                              history,
                                              pitches=pitches,
                                              features=features))
                y_chunks.append(gt_frame[pitches])
                diff_chunks.append(
                    np.abs(
                        np.squeeze(frame)[pitches] -
                        np.squeeze(state.prior)[pitches]))

        # Gather all computations to perform them batched. The four lists below are
        # parallel: entry k describes the k-th candidate transition.
        states = []
        samples = []
        weights = []
        priors = []

        if gt_only:
            states = [beam.get_top_state()]
            samples = [gt_frame]
            weights = [[[1.0], [0.0]]]
            priors = [np.squeeze(states[0].prior)]

        else:
            # Acoustic samples already generated, used to skip duplicates in the union case.
            acoustic_samples = set()

            if union or weight[0][0] == 1.0:
                # Acoustic sampling is done once: with a weight of [[1.0], [0.0]] the samples
                # are identical for every current hypothesis.
                for _, sample in itertools.islice(
                        decode.enumerate_samples(frame,
                                                 beam.beam[0].prior,
                                                 weight=[[1.0], [0.0]]),
                        branch_factor):
                    binary_sample = np.zeros(88)
                    binary_sample[sample] = 1

                    if union:
                        acoustic_samples.add(tuple(binary_sample))

                    for state in beam:
                        states.append(state)
                        priors.append(np.squeeze(state.prior))
                        weights.append(weight)
                        samples.append(binary_sample)

            if union or weight[0][0] != 1.0:
                for state in beam:
                    # Union sampling draws these samples from the language model alone.
                    sample_weight = [[0.0], [1.0]] if union else weight
                    for _, sample in itertools.islice(
                            decode.enumerate_samples(frame,
                                                     state.prior,
                                                     weight=sample_weight),
                            branch_factor):
                        binary_sample = np.zeros(88)
                        binary_sample[sample] = 1

                        # Overlap with acoustic sample in union case. Skip this sample.
                        if union and tuple(binary_sample) in acoustic_samples:
                            continue

                        states.append(state)
                        priors.append(np.squeeze(state.prior))
                        weights.append(weight)
                        samples.append(binary_sample)

        log_probs, combined_priors = decode.get_log_prob(
            np.array(samples), np.array(frame), np.array(priors),
            np.array(weights))

        # The model is stepped with a batch of shape (num_samples, 1, 88).
        np_samples = np.zeros((len(samples), 1, 88))
        for i, sample in enumerate(samples):
            np_samples[i, 0, :] = sample

        hidden_states, next_priors = model.run_one_step(
            [s.hidden_state for s in states], np_samples, sess)

        beam = Beam()
        for hidden_state, prior, log_prob, state, sample, w, combined_prior in zip(
                hidden_states, next_priors, log_probs, states, samples,
                weights, combined_priors):
            beam.add(
                state.transition(sample, log_prob, hidden_state, prior, w,
                                 combined_prior))

        # Early frames have fewer than hash_length frames of history to hash on.
        beam.cut_to_size(beam_size, min(hash_length, frame_num + 1))

    x = np.vstack(x_chunks) if x_chunks else np.zeros((0, 0))
    y = np.concatenate([np.ravel(chunk) for chunk in y_chunks])
    diffs = np.concatenate([np.ravel(chunk) for chunk in diff_chunks])

    return x, y, diffs