Example 1
def load_model(device, model_path, is_cuda):
    model = DeepSpeech.load_model(model_path)
    model.eval()
    model = model.to(device)
    if is_cuda and model.mixed_precision:
        model = convert_model_to_half(model)
    return model
Example 2
def load_model(device, model_path, use_half):
    model = DeepSpeech.load_model(model_path)
    model.eval()
    model = model.to(device)
    if use_half:
        model = model.half()
    return model
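A minimal usage sketch for the helper above; the import path and checkpoint path are illustrative assumptions, not taken from the examples:

import torch
from model import DeepSpeech  # assumed module layout of deepspeech.pytorch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = load_model(device, 'models/librispeech_pretrained.pth', use_half=False)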
Example 3
def main():
    import argparse
    global model, spect_parser, decoder, args
    parser = argparse.ArgumentParser(description='DeepSpeech transcription server')
    parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    parser.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
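A hypothetical client call against this server, assuming the Flask app registers a POST /transcribe route that accepts a file upload (the route itself is not shown in the snippet):

import requests

# 'audio.wav' is a placeholder path; the port matches the argparse default above
with open('audio.wav', 'rb') as f:
    response = requests.post('http://localhost:8888/transcribe', files={'file': f})
print(response.json())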
Example 4
def load_model(device, model_path, model_name, use_half):
    if model_name == 'DeepSpeech':
        model = DeepSpeech.load_model(model_path)
    elif model_name == 'DFCNN':
        model = DFCNN.load_model(model_path)
    else:
        raise ValueError("Unsupported model name: {}".format(model_name))
    model.eval()
    model = model.to(device)
    if use_half:
        model = model.half()
    return model
Example 5
    def __init__(self, model_path):
        """Load a pretrained DeepSpeech model and set up its decoder and audio parser.

        :param model_path: path to the serialized model checkpoint
        """
        assert os.path.exists(model_path), "Cannot find model at {}".format(
            model_path)
        self.deep_speech_model = DeepSpeech.load_model(model_path)
        self.deep_speech_model.eval()
        labels = DeepSpeech.get_labels(self.deep_speech_model)
        self.audio_conf = DeepSpeech.get_audio_conf(self.deep_speech_model)
        self.decoder = GreedyDecoder(labels)
        self.parser = SpectrogramParser(self.audio_conf, normalize=True)
Example 6
def load_model(device, model_path, use_half):

    # use load_model method from DeepSpeech class
    model = DeepSpeech.load_model(model_path)

    # set model to eval
    model.eval()

    # put model on device (GPU/CPU)
    model = model.to(device)

    # if half-precision (FP16) inference was requested, convert the model weights
    if use_half:
        model = model.half()

    # return the model
    return model
Example 7
parser.add_argument('--verbose',
                    action="store_true",
                    help="print out decoded output and error of each sample")
no_decoder_args = parser.add_argument_group(
    "No Decoder Options", "Configuration options for when no decoder is "
    "specified")
no_decoder_args.add_argument('--output-path',
                             default=None,
                             type=str,
                             help="Where to save raw acoustic output")
parser = add_decoder_args(parser)
args = parser.parse_args()

if __name__ == '__main__':
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = model.to(device)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
Example 8
beam_args.add_argument('--lm_alpha',
                       default=0.8,
                       type=float,
                       help='Language model weight')
beam_args.add_argument('--lm_beta1',
                       default=1,
                       type=float,
                       help='Language model word bonus (all words)')
beam_args.add_argument('--lm_beta2',
                       default=1,
                       type=float,
                       help='Language model word bonus (IV words)')
args = parser.parse_args()

if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels,
                                 beam_width=args.beam_width,
                                 top_paths=1,
                                 space_index=labels.index(' '),
                                 blank_index=labels.index('_'),
                                 lm_path=args.lm_path,
                                 trie_path=args.trie_path,
                                 lm_alpha=args.lm_alpha,
                                 lm_beta1=args.lm_beta1,
                                 lm_beta2=args.lm_beta2)
Example 9
    def attack2(self, init_delta, target, model_path):
        self.delta2 = torch.FloatTensor(init_delta).cuda()
        self.delta2.requires_grad = True
        self.rescale = torch.ones((self.batch_size, 1)).cuda()
        self.final_deltas = [None] * self.batch_size
        self.alpha = torch.ones((self.batch_size, )).cuda() * 1
        #self.alpha = 1

        model = DeepSpeech.load_model(model_path)
        model = model.cuda()

        self.optim21 = torch.optim.Adam([self.delta2], lr=2)
        self.optim22 = torch.optim.Adam([self.delta2], lr=self.lr2)

        criterion = CTCLoss()

        th_batch = []
        psd_max_batch = []
        for ii in range(self.batch_size):
            th, _, psd_max = generate_th(self.original[ii].cpu().numpy(),
                                         fs=16000,
                                         window_size=2048)
            th_batch.append(th)
            psd_max_batch.append(psd_max)
        th_batch = np.array(th_batch)
        psd_max_batch = np.array(psd_max_batch)
        th_batch = torch.FloatTensor(th_batch).cuda()
        psd_max_batch = torch.FloatTensor(psd_max_batch).cuda()

        MAX = self.num_iterations2
        model.train()
        loss_th = [np.inf] * self.batch_size
        for i in range(MAX):
            # print out some debug information every 10 iterations
            #print(self.delta)
            apply_delta = torch.clamp(
                self.delta2, -2000,
                2000) * self.rescale  #[batch_size * max_audio_len]
            new_input = apply_delta * self.mask + self.original  #[batch_size * max_audio_len]
            #pass_in = torch.clamp(new_input + self.noise, -2**15, 2**15-1) #[batch_size * max_audio_len]
            pass_in = torch.clamp(new_input, -2**15, 2**15 - 1)
            pass_in = torch.div(pass_in, 2**15)  #[batch_size * max_audio_len]
            logits, logits_sizes = get_logits(pass_in, self.lengths.int(),
                                              model)  #[batch_size * T * H]
            logits_ = logits.transpose(0, 1)
            # loss

            loss2 = criterion(logits_, self.target_phrase, logits_sizes,
                              self.target_phrase_lengths).cuda()
            loss_value_2 = loss2.item()
            self.optim21.zero_grad()
            loss2.backward(retain_graph=True)
            self.delta2.grad = torch.sign(self.delta2.grad)
            self.optim21.step()

            loss1 = 0
            loss1_each = []
            for ii in range(self.batch_size):
                psd = psd_transform(apply_delta[ii],
                                    psd_max_batch[ii],
                                    win_length=2048,
                                    win_step=512)
                loss1 += self.alpha[ii] * torch.mean(
                    torch.relu(psd - th_batch[ii]))
                loss1_each.append(
                    torch.mean(torch.relu(psd - th_batch[ii])).item())
                #psd_num = psd.cpu().detach().numpy()
                #th_ = th_batch[ii].cpu().detach().numpy()

            loss1 = loss1 / self.batch_size
            loss_value_1 = np.mean(loss1_each)
            self.optim22.zero_grad()
            loss1.backward()
            for ii in range(self.batch_size):
                self.delta2.grad[ii] = self.alpha[ii] * torch.sign(
                    self.delta2.grad[ii])

            #grad = np.sum(self.delta2.grad.cpu().numpy())
            #if grad != grad:
            #    print("NaN")

            self.optim22.step()

            apply_delta_ = torch.clamp(self.delta2, -2000, 2000) * self.rescale

            print('loss: ', loss_value_1, loss_value_2)

            if i + 1 == 2000:
                param_groups = self.optim21.param_groups
                for g in param_groups:
                    g['lr'] = 0.1
                param_groups = self.optim22.param_groups
                for g in param_groups:
                    g['lr'] = 0.1
            if i + 1 == 3200:
                param_groups = self.optim21.param_groups
                for g in param_groups:
                    g['lr'] = 0.01
                param_groups = self.optim22.param_groups
                for g in param_groups:
                    g['lr'] = 0.01

            if (i + 1) % 10 == 0:
                decode_out, _ = self.decoder.decode(logits, logits_sizes)
                print(i + 1, decode_out[0], [target[0]])

            for ii in range(self.batch_size):
                if ((i + 1) % 50 == 0
                        and decode_out[ii] == [target[ii].upper()]) or (
                            i == MAX - 1 and self.final_deltas[ii] is None):
                    self.alpha[ii] = 1.2 * self.alpha[ii]
                    if self.alpha[ii] > 1000:
                        self.alpha[ii] = 1000
                    # Adjust the best solution found so far
                    if loss1_each[ii] < loss_th[ii]:
                        loss_th[ii] = loss1_each[ii]
                        self.final_deltas[ii] = new_input[ii][
                            0:self.lengths[ii].int()].cpu().detach().numpy()
                    print("up alpha=%f" % (self.alpha[ii]))

                if ((i + 1) % 100 == 0
                        and decode_out[ii] != [target[ii].upper()]):
                    self.alpha[ii] = 0.6 * self.alpha[ii]
                    '''
                    if self.alpha <= 100:
                        self.alpha = 100
                    else:
                        # Adjust the best solution found so far
                        print("down alpha=%f" % (self.alpha))
                    '''
                    print("down alpha=%f" % (self.alpha[ii]))
        return self.final_deltas
Example 10
    def attack1(self, audios, lengths, max_audio_len, targets, model_path):
        self.max_audio_len = max_audio_len
        self.original = torch.FloatTensor(audios).cuda()
        self.lengths = torch.FloatTensor(lengths)
        #define some variables
        self.delta1 = torch.zeros((self.batch_size, self.max_audio_len)).cuda()
        self.delta1.requires_grad = True
        self.rescale = torch.ones((self.batch_size, 1)).cuda()
        self.mask = torch.FloatTensor(
            np.array([[1 if i < l else 0 for i in range(self.max_audio_len)]
                      for l in self.lengths])).cuda()
        self.final_deltas = [None] * self.batch_size

        self.target_phrase_lengths = torch.IntTensor(self.batch_size)
        self.target_phrase = []
        for x in range(self.batch_size):
            phrase = list(
                filter(
                    None,
                    [self.labels_map.get(c)
                     for c in list(targets[x].upper())]))
            self.target_phrase_lengths[x] = len(phrase)
            self.target_phrase.extend(phrase)
        self.target_phrase = torch.IntTensor(self.target_phrase)
        #print(self.target_phrase.size(), self.target_phrase_lengths)
        model = DeepSpeech.load_model(model_path)
        model = model.cuda()
        self.optim1 = torch.optim.Adam([self.delta1], lr=self.lr1)

        criterion = CTCLoss()

        MAX = self.num_iterations1
        model.train()
        #self.noise = torch.randn(self.delta1.shape).cuda()  #[batch_size * max_audio_len]
        for i in range(MAX):

            # print out some debug information every 10 iterations
            apply_delta = torch.clamp(
                self.delta1, -2000,
                2000) * self.rescale  #[batch_size * max_audio_len]
            new_input = apply_delta * self.mask + self.original  #[batch_size * max_audio_len]
            #pass_in = torch.clamp(new_input + self.noise, -2**15, 2**15-1) #[batch_size * max_audio_len]
            pass_in = torch.clamp(new_input, -2**15, 2**15 - 1)
            pass_in = torch.div(pass_in, 2**15)  #[batch_size * max_audio_len]
            logits, logits_sizes = get_logits(pass_in, self.lengths.int(),
                                              model)  #[batch_size * T * H]
            logits_ = logits.transpose(0, 1)
            # loss
            if not np.isinf(self.l2penalty):
                loss = torch.mean(
                    (new_input - self.original)
                    **2) + self.l2penalty * criterion(
                        logits_, self.target_phrase, logits_sizes,
                        self.target_phrase_lengths).cuda()
            else:
                loss = criterion(logits_, self.target_phrase, logits_sizes,
                                 self.target_phrase_lengths).cuda()
            loss_value = loss.item()
            # optimize
            self.optim1.zero_grad()
            loss.backward()
            # grad sign
            self.delta1.grad = torch.sign(self.delta1.grad)
            self.optim1.step()

            print('loss: ', loss_value)
            if (i + 1) % 10 == 0:
                decode_out, _ = self.decoder.decode(logits, logits_sizes)
                #print(decode_out, targets)

            for ii in range(self.batch_size):
                if ((i + 1) % 10 == 0
                        and decode_out[ii] == [targets[ii].upper()]) or (
                            i == MAX - 1 and self.final_deltas[ii] is None):
                    bound_tmp = torch.max(torch.abs(self.delta1[ii])).item()
                    if self.rescale[ii][0] * 2000 > bound_tmp:
                        print("It's way over", bound_tmp / 2000.0)
                        self.rescale[ii][0] = bound_tmp / 2000.0

                    self.rescale[ii][0] *= .8

                    # Adjust the best solution found so far
                    self.final_deltas[ii] = new_input[ii].cpu().detach().numpy()
                    print("bound=%f" % (2000 * self.rescale[ii][0]))

        return self.final_deltas
Example 11
def transcribe(audio_path, parser, model, decoder, cuda=False):
    spect = parser.parse_audio(audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    if cuda:
        spect = spect.cuda()
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    return decoded_output, decoded_offsets


if __name__ == '__main__':
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
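A short usage sketch driving transcribe() with the objects built above; SpectrogramParser is assumed importable as in Examples 3 and 5, and 'audio.wav' is a placeholder path:

    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    decoded_output, decoded_offsets = transcribe('audio.wav', spect_parser, model,
                                                 decoder, cuda=args.cuda)
    print(decoded_output[0][0])  # best hypothesis for the single input utterance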
Example 12
model_file = model_path + "\\librispeech_pretrained.pth"
cuda = "store_true"
batch_size = 20
num_workers = 4
decoder = "greedy"
verbose = "store_true"
top_paths = 1
beam_width = 10
lm_path = None
alpha = 0.8
beta = 1
cutoff_top_n = 40
cutoff_prob = 1
lm_workers = 1

model = DeepSpeech.load_model(model_file)
model.eval()

labels = DeepSpeech.get_labels(model)
audio_conf = DeepSpeech.get_audio_conf(model)

if decoder == "beam":
    from decoder import BeamCTCDecoder

    decoder = BeamCTCDecoder(labels, lm_path=lm_path, alpha=alpha, beta=beta,
                             cutoff_top_n=cutoff_top_n, cutoff_prob=cutoff_prob,
                             beam_width=beam_width, num_processes=lm_workers)
elif decoder == "greedy":
    decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
else:
    decoder = None
Example 13
            cer += cer_inst
        total_cer += cer
        total_wer += wer

    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)

    return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]


if __name__ == '__main__':
    if args.lm_path is None:
        print("error: LM must be provided for tuning")
        sys.exit(1)

    model = DeepSpeech.load_model(args.model_path, cuda=False)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest, labels=labels,
                                      normalize=True)

    logits = np.load(args.logits)
    batch_size = logits[0][0].shape[0]

    results = []


    def result_callback(result):
        results.append(result)
Example 14
args = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_num)

if __name__ == '__main__':
    # args.cuda = True
    # args.verbose = True
    # args.decoder = "beam"
    for a in range(args.start_epoch, args.end_epoch + 1):
        from jiwer import wer
        t0 = time.time()
        model_path = os.path.join(args.model_path,
                                  "deepspeech_{}.pth".format(str(a)))
        torch.set_grad_enabled(False)

        if not args.finetune:
            model = DeepSpeech.load_model(model_path)

            if args.cuda:
                model.cuda()
            model.eval()

            labels = DeepSpeech.get_labels(model)
            C_labels = labels.copy()
            with open(args.E2C) as label_file:
                E2C = json.load(label_file)
            with open(args.C2E) as label_file:
                C2E = json.load(label_file)
            for i, v in enumerate(C_labels):
                if v in E2C:
                    C_labels[i] = E2C[v]
Example 15
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window,
                      noise_dir=args.noise_dir,
                      noise_prob=args.noise_prob,
                      noise_levels=(args.noise_min, args.noise_max))

    rnn_type = args.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    #model = DeepSpeech(rnn_hidden_size=args.hidden_size,
    #                   nb_layers=args.hidden_layers,
    #                   labels=labels,
    #                   rnn_type=supported_rnns[rnn_type],
    #                   audio_conf=audio_conf,
    #                   bidirectional=args.bidirectional)
    model = DeepSpeech.load_model('models/librispeech_pretrained.pth')
    parameters = model.parameters()
    #optimizer = torch.optim.SGD(parameters, lr=args.lr,
    #                            momentum=args.momentum, nesterov=True)

    decoder = GreedyDecoder(labels)
    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
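A minimal sketch of wrapping these datasets in batch loaders, assuming deepspeech.pytorch's AudioDataLoader (which pads the variable-length spectrograms within each batch) and the usual --batch-size / --num-workers arguments on args:

    from data.data_loader import AudioDataLoader

    train_loader = AudioDataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)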