# Example 1
def __getitem__(self, index):
    x = self.load_sample(self.files[index])
    x = VoiceActivityDetector.from_picture_to_tensor(x)
    if self.mode == 'test':
        # Unlabeled data: return only the input tensor.
        return x
    # Labeled data: also return the integer id of the class name.
    label_id = VoiceActivityDetector.LABEL_TO_IDX[self.labels[index]]
    return x, label_id
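# For context, a minimal sketch of the kind of Dataset class this method
# could belong to. Everything below is an assumption, not the project's
# actual code: labels are recovered from each file's parent directory name,
# matching how the training script in Example 4 builds its label list.
from pathlib import Path

from torch.utils.data import Dataset


class TrainVadDataset(Dataset):
    def __init__(self, files, mode='train'):
        self.files = files                                  # spectrogram .png paths
        self.labels = [Path(f).parent.name for f in files]  # one class per directory
        self.mode = mode                                    # 'train' / 'val' / 'test'

    def __len__(self):
        return len(self.files)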

# Example 2
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('model_path',
                        type=str,
                        help='The path where the model is stored')
    parser.add_argument('audio_path',
                        type=str,
                        help='The path to the audio file to be processed')
    args = parser.parse_args()

    print(f'Processing on: {VoiceActivityDetector.DEVICE}')

    # ========================================================
    detector = VoiceActivityDetector()
    detector.load(args.model_path)

    rate, signal, labels = load_labeled_audio(args.audio_path)

    # Keep only the first 30 seconds of the recording for this benchmark.
    signal = signal[int(0 * rate):int(30 * rate)]
    labels = labels[int(0 * rate):int(30 * rate)]

    detector.setup(rate)

    buffer_sizes = list(range(20, 151, 5))  # ms
    ratios = []

    for buffer_size in buffer_sizes:
        print(f'buffer_size = {buffer_size}')
        st = time.time()
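        # Hypothetical continuation (the source cuts off here): stream the
        # clip through the detector in buffer_size-millisecond chunks and
        # record processing time relative to real time. The chunk loop is a
        # sketch; the project's actual streaming API is not shown.
        chunk_f = int(np.ceil(rate * buffer_size / 1000))  # buffer_size is in ms
        for left in range(0, len(signal), chunk_f):
            chunk = signal[left:left + chunk_f]
            # ... feed `chunk` to the detector here ...
        ratios.append((time.time() - st) / (len(signal) / rate))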
# Example 3
    parser.add_argument('--net-window-size',
                        type=float,
                        default=0.05,
                        help='Window size of the neural network, in seconds')
    parser.add_argument('--net-step-size-ratio',
                        type=float,
                        default=0.5,
                        help='Step size of the neural network as a fraction '
                             'of the window size, in [0, 1]')
    parser.add_argument('model_path',
                        type=str,
                        help='The path where the model will be saved')
    parser.add_argument('--arc',
                        type=str,
                        default='cnn',
                        help='Architecture type of the neural network')
    args = parser.parse_args()

    # vars(args) exposes the parsed arguments as a dict; copy it so the
    # pops below do not mutate args itself.
    params: dict = copy.deepcopy(vars(args))

    # Keep only the detector's hyperparameters.
    params.pop('model_path')
    params.pop('arc')

    detector = VoiceActivityDetector(params)

    print(f'Saving model...\n{detector}')
    detector.save(args.model_path)
    print('Done')
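    # Aside (hypothetical check, not in the source): argparse maps the
    # '--net-window-size' flag to the underscored key 'net_window_size',
    # so after the pops above, params is a plain dict of hyperparameters.
    assert 'net_window_size' in params and 'model_path' not in params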
# Example 4
    # NOTE: type=bool would treat any non-empty string (even 'False') as
    # True, so a boolean action is used instead (Python 3.9+).
    parser.add_argument(
        '--cuda',
        action=argparse.BooleanOptionalAction,
        default=True
    )
    args = parser.parse_args()

    if args.cuda and torch.cuda.is_available():
        VoiceActivityDetector.DEVICE = torch.device('cuda')
    else:
        VoiceActivityDetector.DEVICE = torch.device('cpu')

    print(f'Processing on: {VoiceActivityDetector.DEVICE}')

    detector = VoiceActivityDetector()
    detector.load(args.model_path)

    dataset_dir = Path(args.dataset_dir)
    # sorted() already returns a list, so no extra list() call is needed.
    dataset_paths = sorted(dataset_dir.rglob('*.png'))
    # The class label of each image is the name of its parent directory.
    labels = [path.parent.name for path in dataset_paths]

    X_train, X_val, y_train, y_val = train_test_split(
        dataset_paths,
        labels,
        test_size=args.val_ratio,
        shuffle=True
    )

    train_dataset = TrainVadDataset(X_train, mode='train')
    val_dataset = TrainVadDataset(X_val, mode='val')
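    # Hypothetical next step (not shown in the source): wrap the datasets in
    # DataLoaders for batched training. batch_size is illustrative only;
    # requires `from torch.utils.data import DataLoader`.
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)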
# Example 5
    parser.add_argument('--buffer-size',
                        type=float,
                        default=0.05,
                        help='The buffer size for audio pieces, in seconds')
    args = parser.parse_args()

    if args.cuda and torch.cuda.is_available():
        VoiceActivityDetector.DEVICE = torch.device('cuda')
    else:
        VoiceActivityDetector.DEVICE = torch.device('cpu')

    print(f'Processing on: {VoiceActivityDetector.DEVICE}')

    # ========================================================
    detector = VoiceActivityDetector()
    detector.load(args.model_path)

    rate, signal, labels = load_labeled_audio(args.audio_path)

    # Optionally restrict processing to a sub-range for quicker experiments:
    # signal = signal[int(500 * rate): int(750 * rate)]
    # labels = labels[int(500 * rate): int(750 * rate)]

    ts = np.linspace(0, len(signal) / rate, num=len(signal))

    detector.setup(rate)
    stream_buffer = StreamBuffer(rate)

    buffer_size_f = int(np.ceil(rate * args.buffer_size))  # buffer length in frames
    signal_size_f = len(signal)                            # total length in frames
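    # Hypothetical continuation (the source cuts off here): walk the signal
    # in fixed-size buffers. How a chunk is pushed into stream_buffer depends
    # on the project's API, which is not shown.
    for left_f in range(0, signal_size_f, buffer_size_f):
        chunk = signal[left_f:left_f + buffer_size_f]  # the last chunk may be shorter
        # ... hand `chunk` to stream_buffer / the detector here ...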
# Example 6
def __getitem__(self, index):
    # Cut one window of sample_pxl_width pixels out of the spectrogram
    # (both are assumed to be stored on the dataset instance).
    pxl_l = self.pxl_ls[index]
    x = self.spectrogram[:, pxl_l:pxl_l + self.sample_pxl_width, :]
    x = Image.fromarray(x)
    x = VoiceActivityDetector.from_picture_to_tensor(x)
    return x
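# For context, the offsets self.pxl_ls used above can be generated by sliding
# a window across the spectrogram's width. A minimal sketch; the 50% overlap
# default is an assumption, not taken from the source.
def make_window_offsets(spec_width, sample_pxl_width, overlap=0.5):
    step_pxl = max(1, int(sample_pxl_width * (1 - overlap)))
    return list(range(0, spec_width - sample_pxl_width + 1, step_pxl))


# e.g. a 1000-px-wide spectrogram with 100-px windows:
# make_window_offsets(1000, 100) -> [0, 50, 100, ..., 900]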
# Example 7
    parser.add_argument(
        '--mat-output-path',
        type=str,
        help='The path to the .mat file where labels will be stored')
    parser.add_argument('--device',
                        type=str,
                        help='Device type for computations',
                        default='cuda')
    parser.add_argument(
        '--statistics-path',
        type=str,
        help='The path to the file where processing statistics will be stored')
    args = parser.parse_args()

    # ========================================================
    detector = VoiceActivityDetector()
    detector.load(args.model_path)

    rate, signal, labels = load_labeled_audio(args.audio_path)

    ts = np.linspace(0, len(signal) / rate, num=len(signal))

    # ========================================================
    spectrogram = build_spectrogram(
        signal,
        rate,
        n_filters=detector.params['n_filters'],
        window_size_s=detector.params['window_size'],
        step_size_ratio=detector.params['step_size_ratio'])
    # ========================================================
    net_window_size_f = int(rate * detector.params['net_window_size'])
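    # For concreteness (using Example 3's defaults; 16 kHz is an assumed rate):
    # net_window_size_f = int(16000 * 0.05) = 800 frames per net window. With
    # a step ratio of 0.5 the hop between windows would be 400 frames; the
    # 'net_step_size_ratio' key below is assumed from Example 3's flag.
    net_step_size_f = int(net_window_size_f * detector.params['net_step_size_ratio'])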