def generate(mode='random'):
    """Collect model-dynamics training data and pickle the replay buffer.

    Rolls out episodes in the VTL environment under one of three sampling
    policies ('random', 'linear_transition', 'sigmoid_transition'), records
    videos per episode, and saves the filled replay buffer next to them.
    """
    vtl_dir = r'C:\Study\SpeechAcquisitionModel\src\VTL'
    speaker_fname = os.path.join(vtl_dir, 'JD2.speaker')
    lib_path = os.path.join(vtl_dir, 'VocalTractLab2.dll')
    max_ep_duration = 5000
    timestep = 20
    episode_length = 50
    env = VTLEnv(lib_path, speaker_fname, timestep, max_episode_duration=max_ep_duration)
    preproc = AudioPreprocessor(numcep=12, winlen=timestep / 1000)
    replay_buffer = ReplayBuffer(1000000)

    num_samples = 500000

    # One timestamped output directory per run.
    dt = str(datetime.datetime.now().strftime("%m_%d_%Y_%I_%M_%p_%S"))
    run_dir = r'C:\Study\SpeechAcquisitionModel\data\raw\VTL_model_dynamics_' + mode + '_' + dt
    video_dir = run_dir + r'\Videos'
    buffer_fname = run_dir + r'\replay_buffer.pkl'
    os.makedirs(video_dir, exist_ok=True)

    # Dispatch on the requested exploration mode. Unknown modes fall
    # through silently and an empty buffer is still pickled, matching the
    # original if/elif control flow.
    generators = {
        'random': generate_model_dynamics_training_data_random_policy,
        'linear_transition': generate_model_dynamics_training_data_linear_transition,
        'sigmoid_transition': generate_model_dynamics_training_data_sigmoid_transition,
    }
    if mode in generators:
        generators[mode](env, preproc, replay_buffer, num_samples,
                         episode_length, video_dir=video_dir)

    with open(buffer_fname, mode='wb') as f:
        pickle.dump(replay_buffer, f)
# --- Example #2 ---
def main():
    """Build an MFCC goal trajectory from a pickled articulatory reference
    and launch training.

    The reference pickle stores (tract_params, glottis_params) arrays; this
    function replays them through the VTL environment, converts the produced
    audio to MFCC frames, and hands (actions, goals, states) to ``train``.
    """
    speaker_fname = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'JD2.speaker')
    lib_path = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL', 'VocalTractLab2.dll')
    ep_duration = 5000
    timestep = 20
    env = VTLEnv(lib_path, speaker_fname, timestep, max_episode_duration=ep_duration)
    preproc = AudioPreprocessor(numcep=13, winlen=timestep / 1000)
    settings = {
            'state_dim': env.state_dim,
            'action_dim': env.action_dim,
            'state_bound': env.state_bound,
            # actions restricted to a fifth of the full parameter range
            'action_bound': [(p[0] / 5, p[1] / 5) for p in env.action_bound ], #env.action_bound,
            'goal_dim': preproc.get_dim(),
            'goal_bound': [(-50, 50) for _ in range(preproc.get_dim())],
            'episode_length': 40,
            'minibatch_size': 512,
            'max_train_per_simulation': 50,
            'save_video_step': 200,

            'actor_tau': 0.01,
            'actor_learning_rate': 0.000001,

            'model_dynamics_learning_rate': 0.05,

            'summary_dir': r'C:\Study\SpeechAcquisitionModel\reports\summaries',
            'videos_dir': r'C:\Study\SpeechAcquisitionModel\reports\videos'
        }

    replay_buffer = ReplayBuffer(100000)

    reference_fname = r'C:\Study\SpeechAcquisitionModel\src\VTL\references\a_i.pkl'
    with open(reference_fname, 'rb') as f:
        (tract_params, glottis_params) = pickle.load(f)
        # full articulatory state = tract params followed by glottis params
        target_trajectory = np.hstack((np.array(tract_params), np.array(glottis_params)))
    # generate audio and then goal target trajectory based on given state space target trajectory
    s0 = env.reset(target_trajectory[0])
    g0 = np.zeros(preproc.get_dim())
    target_actions = []
    target_goals = []
    target_goals.append(g0)
    target_states = []
    target_states.append(s0)

    # NOTE(review): the range stops at len(target_trajectory) - 2, so the
    # final reference frame is never replayed — confirm this is intentional.
    for i in range(1, len(target_trajectory) - 1):
        # action = delta needed to reach the next reference state
        action = np.subtract(target_trajectory[i], s0)
        s1, audio = env.step(action)
        # convert float audio to 16-bit PCM before feature extraction
        wav_audio = np.int16(audio * (2 ** 15 - 1))
        mfcc = preproc(wav_audio, env.audio_sampling_rate)
        isnans = np.isnan(mfcc)
        if isnans.any():
            print(mfcc)
            print("NAN OCCURED")
            raise TypeError("NAN in target")
        g1 = np.reshape(mfcc, (preproc.get_dim()))

        target_actions.append(action)
        target_goals.append(g1)
        target_states.append(s1)



        s0 = s1
        g0 = g1

    # NOTE(review): the first real MFCC frame is overwritten with the second,
    # presumably to drop an initial transient — confirm against the trainer.
    target_goals[1] = target_goals[2]
    target_trajectory = (target_actions, target_goals, target_states)
    train(settings, env, replay_buffer, preproc, target_trajectory)
    return
# --- Example #3 ---
        return x


# --- Script setup: VTL environment, MFCC preprocessing, and a pre-recorded
# replay buffer of simple-transition episodes loaded from disk. ---
speaker_fname = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL',
                             'JD2.speaker')
lib_path = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL',
                        'VocalTractLab2.dll')
ep_duration = 5000   # max episode duration, ms
timestep = 20        # environment step, ms
episode_length = 40
env = VTLEnv(lib_path,
             speaker_fname,
             timestep,
             max_episode_duration=ep_duration)
# NOTE(review): timestep is in ms, so this is sampling_rate * 20 samples —
# a window length in samples would normally use timestep / 1000; confirm.
win_len = int(timestep * env.audio_sampling_rate)
preproc = AudioPreprocessor(numcep=13, winlen=timestep / 1000)

dir_name = r'C:\Study\SpeechAcquisitionModel\data\raw\VTL_model_dynamics_simple_transition_08_23_2018_10_52_AM_54'
video_dir = dir_name + r'\Videos'
buffer_fname = dir_name + r'\replay_buffer.pkl'
with open(buffer_fname, mode='rb') as f:
    replay_buffer = pickle.load(f)

# state / action / goal dimensions and bounds used downstream
s_dim = env.state_dim
a_dim = env.action_dim
g_dim = preproc.get_dim()

s_bound = env.state_bound
# Fix: the original assigned env.action_bound and immediately overwrote it
# on the next line; keep only the scaled-down bound that is actually used.
a_bound = [(p[0] / 5, p[1] / 5) for p in env.action_bound]
g_bound = [(-40, 40) for _ in range(g_dim)]
def train(*args, **kwargs):
    """Collect episodes with a fixed stochastic policy and fit a stochastic
    LSTM model-dynamics network on them.

    Expected ``kwargs``: 'train' (device, learning_rate, learning_rate_eps,
    num_steps, minibatch_size, updates_per_episode), 'preprocessing_params',
    'preproc_net_fname', 'model_dynamics_params', 'policy_params',
    'vtl_dir', 'reference_fname', 'buffer_size', 'save_dir'.
    """
    print(kwargs)

    torch.random.manual_seed(0)


    device = kwargs['train']['device']

    # 1. Init audio preprocessing
    preproc = AudioPreprocessor(**kwargs['preprocessing_params'])
    sr = kwargs['preprocessing_params']['sample_rate']

    # 2. Load pretrained preprocessing net (maps MFCC frames to goal space)
    preproc_net = torch.load(kwargs['preproc_net_fname']).to(device)

    # 3. Init model dynamics net
    md_net = StochasticLstmModelDynamics(**kwargs['model_dynamics_params']).to(device)
    optim = torch.optim.Adam(md_net.parameters(),
                             lr=kwargs['train']['learning_rate'],
                             eps=kwargs['train']['learning_rate_eps'])

    # 4. Init Policy (eval mode below — it is only used to generate data)
    policy = SimpleStochasticPolicy(**kwargs['policy_params']).to(device)

    # 5. Init environment
    speaker_fname = os.path.join(kwargs['vtl_dir'], 'JD2.speaker')
    lib_path = os.path.join(kwargs['vtl_dir'], 'VocalTractLab2.dll')
    ep_duration = 1000
    timestep = 20
    num_steps_per_ep = ep_duration // timestep

    env = VTLEnv(lib_path, speaker_fname, timestep, max_episode_duration=ep_duration)

    # 6. Load reference for policy (goal-space embedding of the reference wav)
    reference_wav_fname = kwargs['reference_fname']
    reference_preproc = torch.from_numpy(preproc(reference_wav_fname)[np.newaxis]).float().to(device)
    _, _, reference = preproc_net(reference_preproc, seq_lens=np.array([reference_preproc.shape[1]]))
    reference = reference.detach().cpu().numpy().squeeze()

    # 7. Init replay buffer
    replay_buffer = ReplayBuffer(kwargs['buffer_size'])

    # 8. Train loop
    params = kwargs['train']
    policy.eval()
    md_net.train()
    for i in range(params['num_steps']):

        # --- roll out one episode with the current policy ---
        state = env.reset()
        goal_state = np.zeros(kwargs['model_dynamics_params']['goal_dim'])
        states = [state]
        actions = []
        goal_states = [goal_state]
        hidden = None
        for step in range(num_steps_per_ep):
            policy_input = np.concatenate((state, reference[step, :]))[np.newaxis]
            policy_input = torch.from_numpy(policy_input).float().to(device)
            action, _, _ = policy(policy_input)
            action = action.detach().cpu().numpy().squeeze()
            # freeze glottis parameters; only vocal-tract params are actuated
            action[env.number_vocal_tract_parameters:] = 0.
            action = action * 0.1  # reduce amplitude for now
            new_state, audio = env.step(action, True)

            preproc_audio = preproc(audio, sr)[np.newaxis]
            preproc_audio = torch.from_numpy(preproc_audio).float().to(device)
            _, hidden, new_goal_state = preproc_net(preproc_audio,
                                                    seq_lens=np.array([preproc_audio.shape[1]]),
                                                    hidden=hidden)
            new_goal_state = new_goal_state.detach().cpu().numpy().squeeze()

            states.append(new_state)
            goal_states.append(new_goal_state)
            actions.append(action)

            state = new_state
            goal_state = new_goal_state

            env.render()

        replay_buffer.add((states, goal_states, actions, None, None))

        # --- fit the model-dynamics net on sampled minibatches ---
        minibatch_size = kwargs['train']['minibatch_size']
        if replay_buffer.size() > minibatch_size:
            num_updates_per_epoch = kwargs['train']['updates_per_episode']
            for k in range(num_updates_per_epoch):
                # sample minibatch
                s0, g0, a, _, _ = replay_buffer.sample_batch(minibatch_size)

                # train
                seq_len = a.shape[1]
                goal_dim = kwargs['model_dynamics_params']["goal_dim"]

                s_bound = env.state_bound
                a_bound = env.action_bound

                s = torch.from_numpy(normalize(s0, s_bound)).float().to(device)
                g = torch.from_numpy(g0).float().to(device)
                a = torch.from_numpy(normalize(a, a_bound)).float().to(device)


                # forward prop
                s_pred, g_pred, s_prob, g_prob, state_dists, goal_dists = md_net(s[:, :-1, :], g[:, :-1, :], a)

                # MSE losses are computed for monitoring only; the optimized
                # objective is the negative log-likelihood below.
                mse_loss = MSELoss(reduction='sum')(g_pred, g[:, 1:, :]) / (seq_len * kwargs['train']['minibatch_size'])

                loss = -goal_dists.log_prob(g[:, 1:, :]).sum(dim=-1, keepdim=True).mean()

                state_mse_loss = MSELoss(reduction='sum')(s_pred, s[:, 1:, :]) / (seq_len * kwargs['train']['minibatch_size'])
                state_loss = -state_dists.log_prob(s[:, 1:, :]).sum(dim=-1, keepdim=True).mean()
                total_loss = loss + state_loss
                # backprop
                optim.zero_grad()
                total_loss.backward()
                optim.step()

                # baseline: error of predicting "no change" in goal space
                dynamics = MSELoss(reduction='sum')(g[:, 1:, :], g[:, :-1, :]) / (seq_len * kwargs['train']['minibatch_size'])

            print("\rstep: {} | stochastic_loss: {:.4f} | loss: {:.4f}| actual_dynamics: {:.4f} |  state stochastic loss: {:.4f} | state_loss: {:.4f}".format(
                      i,
                      loss.detach().cpu().item(),
                      mse_loss.detach().cpu().item(),
                      dynamics.detach().cpu().item(),
                      state_loss.detach().cpu().item(),
                      state_mse_loss.detach().cpu().item()),
                  end="")
            # Fix: the original tested `step % 100`, but after the episode
            # loop `step` is always num_steps_per_ep - 1 (49), so the
            # newline was never printed. Key on the episode counter instead.
            if i % 100 == 0:
                print()

    # 9. Save model
    dt = str(datetime.datetime.now().strftime("%m_%d_%Y_%I_%M_%p"))
    md_fname = os.path.join(kwargs['save_dir'], '{}_{}.pt'.format("rnn_md", dt))
    torch.save(md_net, md_fname)
def train(*args, **kwargs):
    """Fit an LSTM model-dynamics network on a pre-recorded dataset.

    Samples minibatches from a pickled pandas DataFrame with 'states',
    'actions' and 'audio' columns, embeds the audio with a pretrained
    preprocessing net, and regresses next acoustic states.

    Expected ``kwargs``: 'train' (device, learning_rate, learning_rate_eps,
    num_steps, minibatch_size), 'preprocessing_params', 'preproc_net_fname',
    'model_dynamics_params', 'data_fname'.
    """
    print(kwargs)

    device = kwargs['train']['device']

    # 1. Init audio preprocessing
    preproc = AudioPreprocessor(**kwargs['preprocessing_params'])
    sr = kwargs['preprocessing_params']['sample_rate']

    # 2. Load pretrained preprocessing net (MFCC -> acoustic state space)
    preproc_net = torch.load(kwargs['preproc_net_fname']).to(device)

    # 3. Init model dynamics net
    md_net = LstmModelDynamics(**kwargs['model_dynamics_params']).to(device)
    optim = torch.optim.Adam(md_net.parameters(),
                             lr=kwargs['train']['learning_rate'],
                             eps=kwargs['train']['learning_rate_eps'])

    # 4. Load training set
    data_fname = kwargs['data_fname']
    df = pd.read_pickle(data_fname)

    # 5. Train loop
    params = kwargs['train']
    md_net.train()
    for i in range(params['num_steps']):
        sample = df.sample(n=kwargs['train']['minibatch_size'])
        states = np.stack(sample.loc[:, 'states'].values)
        actions = np.stack(sample.loc[:, 'actions'].values)
        audio = np.stack(sample.loc[:, 'audio'].values)

        preproc_audio = np.array([preproc(audio[j], sr) for j in range(audio.shape[0])])

        # Fix: the original converted preproc_audio to a tensor twice and
        # immediately discarded the first (raw-MFCC) `acoustic_states`
        # assignment; convert once and feed it to the preprocessing net.
        preproc_tensor = torch.from_numpy(preproc_audio).float().to(device)
        # NOTE(review): seq_lens is a single-element array taken from the
        # sequence axis — presumably all samples share one length; confirm.
        _, _, acoustic_states = preproc_net(preproc_tensor,
                                            seq_lens=np.array([preproc_audio.shape[-2]]))

        seq_len = actions.shape[1]
        acoustic_state_dim = kwargs['model_dynamics_params']["acoustic_state_dim"]

        # forward prop
        lstm_outs, predicted_acoustic_states = md_net(acoustic_states,
                                           torch.from_numpy(states[:, :seq_len, :]).float().to(device),
                                           torch.from_numpy(actions).float().to(device))

        # compute error: predicted next acoustic state vs actual next state
        loss = MSELoss(reduction='sum')(predicted_acoustic_states[:, :-1, :].contiguous().view(-1, acoustic_state_dim),
                                        acoustic_states[:, 1:, :].contiguous().view(-1, acoustic_state_dim)) / (seq_len * kwargs['train']['minibatch_size'])

        # backprop
        optim.zero_grad()
        loss.backward()
        optim.step()

        # baseline: error of predicting "no change" between adjacent frames
        dynamics = MSELoss(reduction='sum')(acoustic_states[:, :-1, :].contiguous().view(-1, acoustic_state_dim),
                                        acoustic_states[:, 1:, :].contiguous().view(-1, acoustic_state_dim)) / (seq_len * kwargs['train']['minibatch_size'])

        print("\rstep: {} | loss: {:.4f}| actual_dynamics: {:.4f}".format(i, loss.detach().cpu().item(), dynamics.detach().cpu().item()), end="")
# Load pretrained classifier weights. NOTE(review): `lstm_net` and
# `lstm_net_fname` (and `lstm_model_settings` below) must be defined
# earlier in the file, outside this excerpt.
lstm_net.load_state_dict(torch.load(lstm_net_fname))

# instantiate environment and its properties
speaker_fname = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL',
                             'JD2.speaker')
lib_path = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL',
                        'VocalTractLab2.dll')
ep_duration = 5000  # max episode duration, ms
timestep = 20       # environment step, ms
episode_length = 40
env = VTLEnv(lib_path,
             speaker_fname,
             timestep,
             max_episode_duration=ep_duration)
# NOTE(review): timestep is in ms — a window length in samples would
# normally be timestep / 1000 * sampling_rate; confirm intended value.
win_len = int(timestep * env.audio_sampling_rate)
preproc = AudioPreprocessor(numcep=12, winlen=timestep / 1000)
replay_buffer = ReplayBuffer(1000000)

# state / action / goal dimensions used downstream
s_dim = env.state_dim
a_dim = env.action_dim
# remember that lstm hidden state is a tuple h, c so we have to predict tuple (h, c)
g_dim = 2 * lstm_model_settings['hidden_reccurent_cells_count']

s_bound = env.state_bound
a_bound = env.action_bound
g_bound = [(-1., 1.) for _ in range(g_dim)]

# exploration noise added to policy actions
action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(env.action_dim),
                                            sigma=0.01)

n_minibatch_size = 512
def main():
    """Set up the environment, the LSTM classifier and the goal trajectory,
    then launch training.

    The goal space is the classifier's concatenated LSTM hidden state
    (h, c); the target trajectory is built by feeding the reference wav's
    MFCC frames through the classifier one at a time and recording the
    hidden state after each frame.
    """
    speaker_fname = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL',
                                 'JD2.speaker')
    lib_path = os.path.join(r'C:\Study\SpeechAcquisitionModel\src\VTL',
                            'VocalTractLab2.dll')
    ep_duration = 5000
    timestep = 20
    env = VTLEnv(lib_path,
                 speaker_fname,
                 timestep,
                 max_episode_duration=ep_duration)
    preproc = AudioPreprocessor(numcep=12,
                                winlen=timestep / 1000,
                                winstep=timestep / 1000)
    # load lstm net for classification
    lstm_net_fname = r'C:\Study\SpeechAcquisitionModel\reports\VTL_sigmoid_transition_classification\checkpoints\simple_lstm_08_29_2018_03_13_PM_acc_0.9961.pt'
    lstm_net_classes = 25
    lstm_model_settings = {
        'dct_coefficient_count': 12,
        'label_count': lstm_net_classes + 2,
        'hidden_reccurent_cells_count': 50,
        'winlen': 0.02,
        'winstep': 0.02
    }

    lstm_net = LstmNet(lstm_model_settings)
    lstm_net.load_state_dict(torch.load(lstm_net_fname))

    settings = {
        'state_dim': env.state_dim,
        'action_dim': env.action_dim,
        'state_bound': env.state_bound,
        # actions restricted to a fifth of the full parameter range
        'action_bound': [(p[0] / 5, p[1] / 5) for p in env.action_bound],
        # goal = concatenated LSTM hidden state (h, c), hence the factor 2
        'goal_dim': lstm_model_settings['hidden_reccurent_cells_count'] * 2,
        'goal_bound': [(-1., 1.)
                       for _ in range(lstm_model_settings['hidden_reccurent_cells_count'] * 2)],
        'episode_length': 40,
        'minibatch_size': 512,
        'max_train_per_simulation': 50,
        'save_video_step': 200,
        'summary_dir': r'C:\Study\SpeechAcquisitionModel\reports\summaries',
        'videos_dir': r'C:\Study\SpeechAcquisitionModel\reports\videos'
    }

    replay_buffer = ReplayBuffer(100000)

    # load target sound
    reference_wav_fname = r'C:\Study\SpeechAcquisitionModel\data\raw\VTL_model_dynamics_sigmoid_transition_08_28_2018_03_57_PM_03\Videos\a_i\episode_08_28_2018_03_57_PM_06.wav'
    reference_s0 = get_cf('a')
    reference_mfcc = preproc(reference_wav_fname)
    # feed target sound to lstm net and get target goal from hidden state

    hidden = None
    target_trajectory = []
    for i in range(reference_mfcc.shape[0]):
        net_input = torch.from_numpy(
            np.reshape(reference_mfcc[i, :],
                       (1, 1, reference_mfcc.shape[1]))).float()
        _, hidden, _ = lstm_net(net_input, np.array([1]), hidden)
        # Fix: the original concatenated hidden[0] with itself; the goal is
        # the full (h, c) pair (goal_dim is 2 x hidden cells), so the second
        # half must be the cell state hidden[1].
        t = np.concatenate([
            hidden[0].detach().numpy().flatten(),
            hidden[1].detach().numpy().flatten()
        ])
        target_trajectory.append(t)

    train(settings, env, replay_buffer, preproc, lstm_net, target_trajectory,
          reference_s0)
    return