Beispiel #1
0
 def load_model_package(cls, package):
     """Reconstruct a model instance from a serialized checkpoint dict.

     Args:
         package: checkpoint dictionary with keys 'hidden_size',
             'hidden_layers', 'labels', 'audio_conf', 'rnn_type',
             'state_dict', and optionally 'bidirectional' (treated as
             True when absent).

     Returns:
         An instance of ``cls`` with the checkpoint weights loaded.
     """
     # Backwards compatibility: legacy checkpoints store 'audio_conf' as a
     # plain dict — upgrade it in place to a structured SpectConfig.
     # TODO: remove this shim in the next release.
     legacy_conf = package['audio_conf']
     if OmegaConf.get_type(legacy_conf) == dict:
         package['audio_conf'] = SpectConfig(
             sample_rate=legacy_conf['sample_rate'],
             window_size=legacy_conf['window_size'],
             window=SpectrogramWindow(legacy_conf['window']),
         )
     model = cls(
         rnn_hidden_size=package['hidden_size'],
         nb_layers=package['hidden_layers'],
         labels=package['labels'],
         audio_conf=package['audio_conf'],
         rnn_type=supported_rnns[package['rnn_type']],
         bidirectional=package.get('bidirectional', True),
     )
     model.load_state_dict(package['state_dict'])
     return model
class OptimizerConfig:
    """Settings for a decoder hyper-parameter search driven by optuna.

    NOTE(review): looks like a structured-config class (Hydra/OmegaConf
    style); any @dataclass decorator is outside this excerpt — confirm.
    """
    model_path: str = ''  # Path to the trained model checkpoint to evaluate
    test_path: str = ''  # Path to test manifest or csv
    is_character_based: bool = True  # Use CER or WER for finding optimal parameters
    lm_path: str = ''  # Path to the language model used during decoding
    beam_width: int = 10  # Beam width used by the beam-search decoder
    alpha_from: float = 0.0  # Lower bound of the alpha search range
    alpha_to: float = 3.0  # Upper bound of the alpha search range
    beta_from: float = 0.0  # Lower bound of the beta search range
    beta_to: float = 1.0  # Upper bound of the beta search range
    n_trials: int = 500  # Number of trials for optuna
    n_jobs: int = 2  # Number of parallel jobs for optuna
    precision: int = 16  # Numeric precision, presumably 16 or 32 bit — confirm with trainer
    batch_size: int = 1  # For dataloader
    num_workers: int = 1  # For dataloader
    spect_cfg: SpectConfig = SpectConfig()  # Spectrogram settings (single class-level default instance)
Beispiel #3
0

def normalize_tensor(x):
    """Scale *x* so its elements sum to 1, i.e. return ``x / x.sum()``."""
    total = x.sum()
    return x / total



if __name__ == '__main__':
    # Smoke-test: push a few samples from a local CommonVoice manifest through
    # a freshly constructed DeepSpeech model and print the output shapes.
    # Label inventory: CTC blank '_', apostrophe, A-Z and space — 29 symbols.
    LABELS = ["_", "'", "A", "B", "C", "D", "E", "F", "G",
              "H", "I", "J", "K", "L", "M", "N", "O", "P",
              "Q", "R", "S", "T", "U", "V", "W", "X", "Y",
              "Z", " "
              ]

    # Hard-coded local dataset location — machine-specific.
    path_input = '/home/coml/Documents/Victoria/noise_classifier/deepspeech_model/data/CommonVoice_dataset/train'
    test_dataset = SpectrogramDataset(audio_conf=SpectConfig(), input_path=path_input, labels=LABELS)
    test_loader = AudioDataLoader(dataset=test_dataset)
    # Model built with default configs; precision=32 keeps everything in fp32.
    model = DeepSpeech(labels=LABELS, precision=32, spect_cfg=SpectConfig(),
                       optim_cfg=AdamConfig(), model_cfg=BiDirectionalConfig()) # args: 'labels', 'model_cfg', 'precision', 'optim_cfg', and 'spect_cfg'
    # im = test_loader[0]

    NUM_CLASSES = 29 # Corresponds to the length of the labels
    # layer = nn.Linear()

    # NOTE(review): data[0] appears to be the batched spectrograms and data[3]
    # the per-sample input lengths — confirm against AudioDataLoader's collate.
    for i, data in enumerate(test_loader, 0):
        # print('DATA \n',  data[1], '\n', data[2], '\n', data[3], '\n')
        print('\n Sample {}: input length {}'.format(i, data[3]))
        out, length = model.forward(data[0], data[3])
        print('   Outputs shapes: {} {}'.format(out.shape, length))
        print('Final output', out)
Beispiel #4
0
        'Raindrop', 'Run', 'Scissors', 'Screaming', 'Shatter', 'Sigh',
        'Sink_(filling_or_washing)', 'Skateboard', 'Slam', 'Sneeze', 'Squeak',
        'Stream', 'Strum', 'Tap', 'Tick-tock', 'Toilet_flush',
        'Traffic_noise_and_roadway_noise', 'Trickle_and_dribble',
        'Walk_and_footsteps', 'Water_tap_and_faucet', 'Waves_and_surf',
        'Whispering', 'Writing', 'Yell', 'Zipper_(clothing)'
    }

    # Keep only non-human sound classes; ALL_LABELS and HUMAN_LABELS are set
    # literals defined just above this excerpt.
    LABELS = list(set.difference(ALL_LABELS, HUMAN_LABELS))

    import matplotlib

    # Select the TkAgg interactive backend before pyplot is imported so the
    # plt.show() window below can open.
    matplotlib.use('TkAgg')
    import matplotlib.pyplot as plt
    # Hard-coded local dataset location — machine-specific.
    path_input = '/home/coml/Documents/Victoria/noise_classifier/deepspeech_model/data/Freesound_dataset'
    data = SpectrogramDataset(audio_conf=SpectConfig(),
                              input_path=path_input,
                              labels=LABELS)

    # Print type, shape and label length of the first four dataset items.
    # NOTE(review): the data[i][0] / data[i][1] indexing assumes each item is
    # (spectrogram, transcript) — confirm against SpectrogramDataset.__getitem__.
    print('First element \n', type(data[0][0]), data[0][0].shape,
          len(data[0][1]))
    print('Second element \n', type(data[1][0]), data[1][0].shape,
          len(data[1][1]))
    print('Third element \n', type(data[2][0]), data[2][0].shape,
          len(data[2][1]))
    print('Fourth element \n', type(data[3][0]), data[3][0].shape,
          len(data[3][1]))

    # Render the first spectrogram as an image.
    plt.imshow(data[0][0].detach().numpy())
    plt.show()
    """