# Imports needed by the snippets below; the project-local `decode` module and
# the `Beam`/`Model` classes are assumed importable from the surrounding
# project (exact module paths unknown here). Example #3 appears to be defined
# inside `decode` itself, since it calls run_lstm() and enumerate_samples()
# unqualified.
import itertools
import os
import pickle

import numpy as np


def get_weight_data(gt,
                    acoustic,
                    model,
                    sess,
                    branch_factor=50,
                    beam_size=200,
                    union=False,
                    weight=[[0.5], [0.5]],
                    hash_length=10,
                    gt_only=False,
                    history=5,
                    min_diff=0.01,
                    features=False,
                    verbose=False,
                    no_mlm=False):
    """
    Get the average ranks of the ground truth frame from decode.enumerate_samples().

    Parameters
    ==========
    gt : matrix
        The ground truth binary piano roll, 88 x T.

    acoustic : matrix
        A probabilistic piano roll, 88 x T, containing values between 0.0 and 1.0
        inclusive. acoustic[p, t] represents the probability of pitch p being present
        at frame t.

    model : Model
        The language model to use for the transduction process.

    sess : tf.session
        The session for the given model.

    branch_factor : int
        The number of samples to use per frame. Defaults to 50.

    beam_size : int
        The beam size for the search. Defaults to 200.

    union : boolean
        True to use union sampling. False (default) to use joint sampling with the weight.

    weight : list
        A 2 x 1 nested list, whose first element is the weight for the acoustic model and whose
        2nd element is the weight for the language model. The weights should be normalized to
        sum to 1. Defaults to [[0.5], [0.5]].

    hash_length : int
        The history length for the hashed beam. If two states do not differ in the past hash_length
        frames, only the most probable one is saved in the beam. Defaults to 10.

    gt_only : boolean
        True to transition only on the ground truth sample, no matter its rank. False to transition
        normally. Defaults to False.

    history : int
        How many frames to save in the x data point. Defaults to 5.

    min_diff : float
        The minimum difference (between language and acoustic) to save a data point. Defaults to 0.01.

    features : boolean
        Whether to use features in the weight_model's data points. Defaults to False.

    verbose : boolean
        Print progress in number of frames. Defaults to False (no printing).

    no_mlm : boolean
        Whether to suppress MLM-based inputs. Defaults to False.


    Returns
    =======
    x : np.ndarray
        The x data from this decoding process. A (data x 7) size matrix.

    y : np.array
        The y data from this decoding process. A data-length array.

    diffs : np.array
        The differences between the language and acoustic model priors for each data point.
    """
    weights_all = None
    priors_all = None

    beam = Beam()
    beam.add_initial_state(model, sess)

    acoustic = np.transpose(acoustic)

    x = np.zeros((0, 0))
    y = np.zeros(0)
    diffs = np.zeros(0)

    gt = np.transpose(gt)

    for frame_num, (gt_frame, frame) in enumerate(zip(gt, acoustic)):
        if verbose and frame_num % 20 == 0:
            print(str(frame_num) + " / " + str(acoustic.shape[0]))

        # Run the LSTM!
        if frame_num != 0:
            decode.run_lstm(sess, model, beam)

        # Here, beam contains a list of states, with sample histories, priors, and LSTM hidden_states,
        # but needs to be updated with weights and combined_priors when sampling.

        # Get data
        for state in beam:
            pitches = np.argwhere(1 - np.isclose(np.squeeze(state.prior),
                                                 np.squeeze(frame),
                                                 rtol=0.0,
                                                 atol=min_diff)
                                  )[:, 0] if min_diff > 0 else np.arange(88)

            if len(pitches) > 0:
                if len(x) > 0:
                    x = np.vstack((x,
                                   decode.create_weight_x_sk(state,
                                                             acoustic,
                                                             frame_num,
                                                             history,
                                                             pitches=pitches,
                                                             features=features,
                                                             no_mlm=no_mlm)))
                else:
                    x = decode.create_weight_x_sk(state,
                                                  acoustic,
                                                  frame_num,
                                                  history,
                                                  pitches=pitches,
                                                  features=features,
                                                  no_mlm=no_mlm)
                y = np.append(y, gt_frame[pitches])
                diffs = np.append(
                    diffs,
                    np.abs(
                        np.squeeze(frame)[pitches] -
                        np.squeeze(state.prior)[pitches]))

        new_beam = Beam()

        # Here we sample from each state in the beam
        if gt_only:
            # Transition only on the ground truth sample, from the top state;
            # the bare `state` name here would otherwise leak from the
            # data-gathering loop above.
            new_beam.add(beam.get_top_state().transition(gt_frame, 0.0))

        else:
            for i, state in enumerate(beam):
                weight_this = weights_all[:, i * 88:(
                    i + 1) * 88] if weights_all is not None else weight

                if priors_all is not None:
                    prior = np.squeeze(priors_all[i * 88:(i + 1) * 88])
                else:
                    prior = np.squeeze(weight_this[0] * frame +
                                       weight_this[1] * state.prior)

                # Update state
                state.update_from_weight_model(weight_this[0], prior)

                for log_prob, sample in itertools.islice(
                        decode.enumerate_samples(prior), branch_factor):

                    # Binarize the sample (return from enumerate_samples is an array of indexes)
                    binary_sample = np.zeros(88)
                    binary_sample[sample] = 1

                    # Transition on sample
                    new_beam.add(state.transition(binary_sample, log_prob))

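        # Prune: keep at most beam_size states, merging states whose last
        # min(hash_length, frame_num + 1) frames of sample history are
        # identical (only the most probable survivor is kept).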
        new_beam.cut_to_size(beam_size, min(hash_length, frame_num + 1))
        beam = new_beam

    return x, y, diffs
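
# Hedged, self-contained sketch of the `min_diff` pitch selection used in
# get_weight_data above: a pitch yields a data point only when the
# language-model prior and the acoustic frame disagree by more than
# `min_diff`. Values below are made up for illustration; only numpy is needed.
min_diff = 0.01
prior = np.full(88, 0.5)                  # hypothetical language-model prior
frame = np.full(88, 0.5)                  # hypothetical acoustic frame
frame[[10, 20]] = [0.60, 0.52]            # two pitches where the models disagree
pitches = np.argwhere(1 - np.isclose(prior, frame, rtol=0.0, atol=min_diff))[:, 0]
print(pitches)                            # -> [10 20]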
Example #2
def get_weight_data(gt,
                    acoustic,
                    model,
                    sess,
                    branch_factor=50,
                    beam_size=200,
                    union=False,
                    weight=[[0.5], [0.5]],
                    hash_length=10,
                    gt_only=False,
                    history=5,
                    min_diff=0.01,
                    features=False,
                    verbose=False):
    """
    Get the average ranks of the ground truth frame from decode.enumerate_samples().
    
    Parameters
    ==========
    gt : matrix
        The ground truth binary piano roll, 88 x T.
    
    acoustic : matrix
        A probabilistic piano roll, 88 x T, containing values between 0.0 and 1.0
        inclusive. acoustic[p, t] represents the probability of pitch p being present
        at frame t.
        
    model : Model
        The language model to use for the transduction process.
        
    sess : tf.session
        The session for the given model.
        
    branch_factor : int
        The number of samples to use per frame. Defaults to 50.
        
    beam_size : int
        The beam size for the search. Defaults to 200.
        
    union : boolean
        True to use union sampling. False (default) to use joint sampling with the weight.
        
    weight : list
        A 2 x 1 nested list, whose first element is the weight for the acoustic model and whose
        2nd element is the weight for the language model. The weights should be normalized to
        sum to 1. Defaults to [[0.5], [0.5]].
        
    hash_length : int
        The history length for the hashed beam. If two states do not differ in the past hash_length
        frames, only the most probable one is saved in the beam. Defaults to 10.
        
    gt_only : boolean
        True to transition only on the ground truth sample, no matter its rank. False to transition
        normally. Defaults to False.
        
    history : int
        How many frames to save in the x data point. Defaults to 5.
        
    min_diff : float
        The minimum difference (between language and acoustic) to save a data point. Defaults to 0.01.
        
    features : boolean
        Whether to use features in the weight_model's data points. Defaults to False.

    verbose : boolean
        Print progress in number of frames. Defaults to False (no printing).

    
    Returns
    =======
    x : np.ndarray
        The x data from this decoding process. A (data x 7) size matrix.
        
    y : np.array
        The y data from this decoding process. A data-length array.
        
    diffs : np.array
        The differences between the language and acoustic model priors for each data point.
    """
    if union:
        branch_factor = int(branch_factor / 2)

    x = np.zeros((0, 0))
    y = np.zeros(0)
    diffs = np.zeros(0)

    beam = Beam()
    beam.add_initial_state(model, sess)

    gt = np.transpose(gt)

    acoustic = np.transpose(acoustic)

    for frame_num, frame in enumerate(acoustic):
        if frame_num % 20 == 0 and verbose:
            print(str(frame_num) + " / " + str(acoustic.shape[0]))
        gt_frame = gt[frame_num, :]

        states = []
        samples = []
        weights = []
        priors = []

        # Used for union sampling
        unique_samples = []

        # Get data
        for state in beam:
            pitches = np.argwhere(1 - np.isclose(np.squeeze(state.prior),
                                                 np.squeeze(frame),
                                                 rtol=0.0,
                                                 atol=min_diff)
                                  )[:, 0] if min_diff > 0 else np.arange(88)
            if len(pitches) > 0:
                if len(x) > 0:
                    x = np.vstack(
                        (x,
                         decode.create_weight_x_sk(state,
                                                   acoustic,
                                                   frame_num,
                                                   history,
                                                   pitches=pitches,
                                                   features=features)))
                else:
                    x = decode.create_weight_x_sk(state,
                                                  acoustic,
                                                  frame_num,
                                                  history,
                                                  pitches=pitches,
                                                  features=features)
                y = np.append(y, gt_frame[pitches])
                diffs = np.append(
                    diffs,
                    np.abs(
                        np.squeeze(frame)[pitches] -
                        np.squeeze(state.prior)[pitches]))

        # Gather all computations to perform them batched
        # Acoustic sampling is done separately because the acoustic samples will be identical for every state.
        if gt_only:
            states = [beam.get_top_state()]
            samples = [gt_frame]
            weights = [[[1.0], [0.0]]]
            priors = [np.squeeze(states[0].prior)]

        else:
            if union or weight[0][0] == 1.0:
                # If sampling method is acoustic (or union), we generate the same samples for every current hypothesis
                for _, sample in itertools.islice(
                        decode.enumerate_samples(frame,
                                                 beam.beam[0].prior,
                                                 weight=[[1.0], [0.0]]),
                        branch_factor):
                    binary_sample = np.zeros(88)
                    binary_sample[sample] = 1

                    # This is used to check for overlaps in union case
                    if union:
                        unique_samples.append(list(binary_sample))

                    for i, state in enumerate(beam):
                        weight_this = weight
                        states.append(state)
                        priors.append(np.squeeze(state.prior))
                        weights.append(weight_this)
                        samples.append(binary_sample)

            if union or weight[0][0] != 1.0:
                for i, state in enumerate(beam):
                    sample_weight = [[0.0], [1.0]] if union else weight
                    for _, sample in itertools.islice(
                            decode.enumerate_samples(frame,
                                                     state.prior,
                                                     weight=sample_weight),
                            branch_factor):

                        binary_sample = np.zeros(88)
                        binary_sample[sample] = 1

                        # Overlap with acoustic sample in union case. Skip this sample.
                        if not (union
                                and list(binary_sample) in unique_samples):
                            weight_this = weight

                            priors.append(np.squeeze(state.prior))
                            states.append(state)
                            samples.append(binary_sample)
                            weights.append(weight_this)

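        # Batch-score every gathered (state, sample) pair at once, then advance
        # the LSTM one step for all of them in a single call below.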
        log_probs, combined_priors = decode.get_log_prob(
            np.array(samples), np.array(frame), np.array(priors),
            np.array(weights))

        np_samples = np.zeros((len(samples), 1, 88))
        for i, sample in enumerate(samples):
            np_samples[i, 0, :] = sample

        hidden_states, priors = model.run_one_step(
            [s.hidden_state for s in states], np_samples, sess)

        beam = Beam()
        for hidden_state, prior, log_prob, state, sample, w, combined_prior in zip(
                hidden_states, priors, log_probs, states, samples, weights,
                combined_priors):
            beam.add(
                state.transition(sample, log_prob, hidden_state, prior, w,
                                 combined_prior))

        beam.cut_to_size(beam_size, min(hash_length, frame_num + 1))

    return x, y, diffs
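
# Hedged sketch of the union-sampling overlap check in the function above:
# samples proposed by the acoustic model are remembered in `unique_samples`,
# and any identical sample later proposed by the language model is skipped,
# so the union of the two sample sets contains no duplicates. Toy 4-pitch
# vectors; only numpy is required.
unique_samples = []
for sample in (np.array([0, 2]), np.array([1])):  # hypothetical acoustic samples
    binary_sample = np.zeros(4)
    binary_sample[sample] = 1
    unique_samples.append(list(binary_sample))

lm_sample = np.array([0, 2])                      # language model proposes a duplicate
binary_sample = np.zeros(4)
binary_sample[lm_sample] = 1
if list(binary_sample) in unique_samples:
    print("duplicate: skipped")                   # -> this branch runs
else:
    print("new sample: kept")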
Example #3
def decode(acoustic,
           model,
           sess,
           branch_factor=50,
           beam_size=200,
           weight=[[0.8], [0.2]],
           hash_length=10,
           out=None,
           weight_model_dict=None,
           weight_model=None,
           verbose=False,
           gt=None):
    """
    Transduce the given acoustic probabilistic piano roll into a binary piano roll.

    Parameters
    ==========
    acoustic : matrix
        A probabilistic piano roll, 88 x T, containing values between 0.0 and 1.0
        inclusive. acoustic[p, t] represents the probability of pitch p being present
        at frame t.

    model : Model
        The language model to use for the transduction process.

    sess : tf.session
        The session for the given model.

    branch_factor : int
        The number of samples to use per frame. Defaults to 50.

    beam_size : int
        The beam size for the search. Defaults to 200.

    weight : matrix
        A 2 x (1 or 88) matrix, whose first row is the weight for the acoustic model and whose 2nd
        row is the weight for the language model, either for each pitch (2x88) or across all pitches
        (2x1). Each column in the matrix should be normalized to sum to 1. Defaults to [[0.8], [0.2]].

    hash_length : int
        The history length for the hashed beam. If two states do not differ in the past hash_length
        frames, only the most probable one is saved in the beam. Defaults to 10.

    out : string
        The directory in which to save the outputs, or None to not save anything. Defaults to None.

    weight_model_dict : dict
        A dictionary containing information about the weight model to use, if any. Defaults to None,
        which uses the static weight of the weight parameter.

    weight_model : sklearn.model or tf.keras.Model
        The model to be used as a weight_model, or None to use static weighting.

    verbose : bool
        Print progress in number of frames. Defaults to False (no printing).

    gt : matrix
        The ground truth piano roll, 88 x T. If given, this will be used to always use the optimum
        weight for each frame. Defaults to None.


    Returns
    =======
    piano_roll : np.ndarray
        An 88 x T binary piano roll, where a 1 represents the presence of a pitch
        at a given frame.

    priors : np.ndarray
        An 88 x T matrix, giving the prior assigned to each pitch detection by the
        most probable language model state.

    weights : np.ndarray
        An 88 x T matrix, giving the acoustic weights for each pitch at each frame.

    combined_priors : np.ndarray
        An 88 x T matrix, giving the combined (weighted) prior for each pitch at
        each frame from the most probable state.
    """
    if gt is not None:
        weight_model = True

    if (not weight_model) and weight[0][0] == 1.0:
        return (acoustic > 0.5).astype(int), np.zeros(acoustic.shape), np.ones(
            acoustic.shape), acoustic

    weights_all = None
    priors_all = None

    beam = Beam()
    beam.add_initial_state(model, sess)

    acoustic = np.transpose(acoustic)

    for frame_num, frame in enumerate(acoustic):
        if verbose and frame_num % 20 == 0:
            print(str(frame_num) + " / " + str(acoustic.shape[0]))

        # Run the LSTM!
        if frame_num != 0:
            run_lstm(sess, model, beam)

        # Here, beam contains a list of states, with sample histories, priors, and LSTM hidden_states,
        # but needs to be updated with weights and combined_priors when sampling.

        # Here, we are calculating dynamic weights or priors if we are using gt or a weight_model
        if weight_model:
            weights_all, priors_all = run_weight_model(gt, weight_model,
                                                       weight_model_dict, beam,
                                                       acoustic, frame_num)

        new_beam = Beam()

        # Here we sample from each state in the beam
        for i, state in enumerate(beam):
            weight_this = weights_all[:, i * 88:(
                i + 1) * 88] if weights_all is not None else weight

            if priors_all is not None:
                prior = np.squeeze(priors_all[i * 88:(i + 1) * 88])
            else:
                prior = np.squeeze(weight_this[0] * frame +
                                   weight_this[1] * state.prior)

            # Update state
            state.update_from_weight_model(weight_this[0], prior)

            for log_prob, sample in itertools.islice(enumerate_samples(prior),
                                                     branch_factor):

                # Binarize the sample (return from enumerate_samples is an array of indexes)
                binary_sample = np.zeros(88)
                binary_sample[sample] = 1

                # Transition on sample
                new_beam.add(state.transition(binary_sample, log_prob))

        new_beam.cut_to_size(beam_size, min(hash_length, frame_num + 1))
        beam = new_beam

        if out:
            output = [(s.get_piano_roll(), s.get_priors(), s.get_weights(),
                       s.get_combined_priors()) for s in beam]
            with open(os.path.join(out, 'data_' + str(frame_num) + '.pkl'),
                      'wb') as file:
                pickle.dump(output, file)

    top_state = beam.get_top_state()
    return top_state.get_piano_roll(), top_state.get_priors(
    ), top_state.get_weights(), top_state.get_combined_priors()
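
# Hedged, self-contained sketch of the static weighting in decode() above:
# with no weight model, the combined prior is a per-pitch convex combination
# of the acoustic frame and the language-model prior. Toy 3-pitch values;
# only numpy is required.
weight = [[0.8], [0.2]]                  # acoustic weight, language-model weight
frame = np.array([0.9, 0.1, 0.5])        # hypothetical acoustic probabilities
lm_prior = np.array([0.6, 0.3, 0.5])     # hypothetical language-model prior
combined = np.squeeze(weight[0] * frame + weight[1] * lm_prior)
print(combined)                          # -> approx. [0.84, 0.14, 0.5]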
Example #4
def get_gt_rank(gt,
                acoustic,
                model,
                sess,
                branch_factor=50,
                beam_size=200,
                union=False,
                weight=[0.5, 0.5],
                hash_length=10,
                gt_max=None,
                gt_only=False):
    """
    Get the average ranks of the ground truth frame from decode.enumerate_samples().
    
    Parameters
    ==========
    gt : matrix
        The ground truth binary piano roll, 88 x T.
    
    acoustic : matrix
        A probabilistic piano roll, 88 x T, containing values between 0.0 and 1.0
        inclusive. acoustic[p, t] represents the probability of pitch p being present
        at frame t.
        
    model : Model
        The language model to use for the transduction process.
        
    sess : tf.session
        The session for the given model.
        
    branch_factor : int
        The number of samples to use per frame. Defaults to 50.
        
    beam_size : int
        The beam size for the search. Defaults to 200.
        
    union : boolean
        True to use union sampling. False (default) to use joint sampling with the weight.
        
    weight : list
        A length-2 list, whose first element is the weight for the acoustic model and whose 2nd
        element is the weight for the language model. This list should be normalized to sum to 1.
        Defaults to [0.5, 0.5].
        
    hash_length : int
        The history length for the hashed beam. If two states do not differ in the past hash_length
        frames, only the most probable one is saved in the beam. Defaults to 10.
        
    gt_max : int
        The maximum rank to check for the ground truth sample. Defaults to None (no limit).
        
    gt_only : boolean
        True to transition only on the ground truth sample, no matter its rank. False to transition
        normally. Defaults to False.
        
    
    Returns
    =======
    ranks : list
        A list of the ranks of the ground truth sample for each transition.
    """
    if union:
        branch_factor = int(branch_factor / 2)

    beam = Beam()
    beam.add_initial_state(model, sess)

    gt = np.transpose(gt)
    ranks = []

    if gt_max is not None and union:
        gt_max = int(gt_max / 2)

    for frame_num, frame in enumerate(np.transpose(acoustic)):
        print(str(frame_num) + " / " + str(acoustic.shape[1]))
        gt_frame = np.nonzero(gt[frame_num, :])[0]

        states = []
        samples = []
        log_probs = []

        # Used for union sampling
        unique_samples = []

        # Gather all computations to perform them batched
        # Acoustic sampling is done separately because the acoustic samples will be identical for every state.
        if union or weight[0] == 1.0:
            # If sampling method is acoustic (or union), we generate the same samples for every current hypothesis
            rank_ac, enumerated_samples = get_rank_and_samples(
                gt_frame, frame, beam.beam[0].prior, [1.0, 0.0],
                0 if gt_only else branch_factor, gt_max)
            if not union:
                ranks.append(rank_ac)

            if gt_only:
                enumerated_samples = [gt_frame]

            for sample in enumerated_samples:
                binary_sample = np.zeros(88)
                binary_sample[sample] = 1

                # This is used to check for overlaps in union case
                if union:
                    unique_samples.append(list(binary_sample))

                for state in beam:
                    states.append(state)
                    samples.append(binary_sample)
                    log_probs.append(
                        decode.get_log_prob(binary_sample, frame, state.prior,
                                            weight))

        if union or weight[0] != 1.0:
            for state in beam:
                rank_la, enumerated_samples = get_rank_and_samples(
                    gt_frame, frame, state.prior,
                    [0.0, 1.0] if union else weight,
                    0 if gt_only else branch_factor, gt_max)

                if union:
                    ranks.append(min(rank_ac, rank_la))
                else:
                    ranks.append(rank_la)

                if gt_only and not union:
                    enumerated_samples = [gt_frame]

                for sample in enumerated_samples:
                    binary_sample = np.zeros(88)
                    binary_sample[sample] = 1

                    # Overlap with acoustic sample in union case. Skip this sample.
                    if not (union and list(binary_sample) in unique_samples):
                        states.append(state)
                        samples.append(binary_sample)
                        log_probs.append(
                            decode.get_log_prob(binary_sample, frame,
                                                state.prior, weight))

        np_samples = np.zeros((len(samples), 1, 88))
        for i, sample in enumerate(samples):
            np_samples[i, 0, :] = sample

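        # Advance the LSTM one step for every sampled transition in a single
        # batched call, reusing each source state's hidden state.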
        hidden_states, priors = model.run_one_step(
            [s.hidden_state for s in states], np_samples, sess)

        beam = Beam()
        for hidden_state, prior, log_prob, state, sample in zip(
                hidden_states, priors, log_probs, states, samples):
            beam.add(state.transition(sample, log_prob, hidden_state, prior))

        beam.cut_to_size(beam_size, min(hash_length, frame_num + 1))

    return ranks
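
# Hedged follow-up sketch: get_gt_rank returns one rank per transition, so the
# average rank over a piece is just the mean of the returned list. Toy values
# below; a real call needs the trained model, session, and piano rolls.
ranks = [1, 3, 2, 1]
avg_rank = sum(ranks) / len(ranks)
print(avg_rank)   # -> 1.75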