Example No. 1
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            # x = x.to("cuda")
            x = x.to(device)
            outs, out_lens = model(x, x_lens)
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset : offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = decoder.convert_to_strings(ys)
            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                cer += decoder.cer(trans, ref) / float(len(ref))
        cer /= len(dataloader.dataset)
    model.train()
    return cer


if __name__ == "__main__":
    with open("./labels.json") as f:
        vocabulary = json.load(f)
        vocabulary = "".join(vocabulary)
    model = GatedConv(vocabulary)
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    model.to(device)
    train(model)
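
Note: `decoder.cer` above appears to return an unnormalized character-level edit distance, since the loop divides it by `len(ref)`. A minimal stand-in, assuming the `python-Levenshtein` package rather than the project's actual implementation:

import Levenshtein

def cer_distance(transcript, reference):
    # Character-level Levenshtein (edit) distance; the evaluation loop
    # divides this by len(reference) to obtain the character error rate.
    return Levenshtein.distance(transcript, reference)
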
Example No. 2
import torch
import feature
from models.conv import GatedConv
import torch.nn.functional as F
from ctcdecode import CTCBeamDecoder
from config import lm_path, pretrained_model_path

alpha = 0.8
beta = 0.3
cutoff_top_n = 40
cutoff_prob = 1.0
beam_width = 32
num_processes = 4
blank_index = 0

model = GatedConv.load(pretrained_model_path)
model.eval()

decoder = CTCBeamDecoder(
    model.vocabulary,
    lm_path,
    alpha,
    beta,
    cutoff_top_n,
    cutoff_prob,
    beam_width,
    num_processes,
    blank_index,
)

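This file configures the beam decoder but is truncated before using it; `feature` and `torch.nn.functional` are imported but never called above. A plausible continuation in the style of MASR's beam-search script; `feature.load_audio`, `feature.spectrogram`, and the `model.cnn` attribute are assumptions here, not confirmed by the snippet:

def predict(path):
    # Load a wav file and compute its spectrogram, adding a batch dimension.
    wav = feature.load_audio(path)       # assumed helper in the feature module
    spec = feature.spectrogram(wav)      # assumed helper in the feature module
    spec = spec.unsqueeze(0)
    with torch.no_grad():
        out = model.cnn(spec)            # assumed to yield (batch, vocab, time)
        out = F.softmax(out, 1)
    out_len = torch.tensor([out.size(-1)])
    out = out.permute(0, 2, 1)           # CTCBeamDecoder expects (batch, time, vocab)
    beams, _, _, beam_lens = decoder.decode(out, out_len)
    best, best_len = beams[0][0], beam_lens[0][0]
    return "".join(model.vocabulary[i] for i in best[:best_len].tolist())
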
Example No. 3
import _init_path
import platform
from models.conv import GatedConv

use_lm = True
if use_lm:
    import beamdecode

system_type = platform.system()
if system_type == 'Windows':
    model = GatedConv.load("AboutDL\\语音识别MASR\\pretrained\\gated-conv.pth")
    # import scipy
    # _, receipt_data = scipy.io.wavfile.read("E:\\打开欢呼比.wav")
    # text = model.predict(receipt_data)  # turned out to give the same result as passing the path
    text = model.predict("E:\\打开欢呼比.wav")
elif system_type == 'Linux':
    model = GatedConv.load('AboutDL/语音识别MASR/pretrained/gated-conv.pth')
    text = model.predict(
        "/media/yangjinming/DATA/Dataset/PrimeWords/d/d2/d25104a2-6be0-4950-9ec0-42e8e1303492.wav"
    )

print("识别结果:", text)
Example No. 4
    print("decoding")
    with torch.no_grad():
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            x = x.to(device)
            outs, out_lens = model(x, x_lens)
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset:offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = decoder.convert_to_strings(ys)
            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                cer += decoder.cer(trans, ref) / float(len(ref))
        cer /= len(dataloader.dataset)
    model.train()
    return cer


if __name__ == "__main__":
    with open("../data_aishell/labels.json", encoding='utf-8') as f:
        vocabulary = "".join(json.load(f))
    model = GatedConv(vocabulary)
    epoch = 40
    model.load_state_dict(torch.load("pretrained/model_{}.pth".format(epoch)))
    print("reload model: pretrained/model_{}.pth".format(epoch))
    model.to(device)
    train(model, start_epoch=epoch)
Example No. 5
import _init_path
from models.conv import GatedConv
import json

# model = GatedConv.load("pretrained/gated-conv.pth")

# model.to_train()

# model.fit("train.manifest", "train.manifest")

with open("data_aishell/labels.json") as f:
    vocabulary = json.load(f)
    vocabulary = "".join(vocabulary)
model = GatedConv(vocabulary)

model.to_train()

model.fit("/home/dolan/Desktop/masr/data_aishell/train.index",
          "/home/dolan/Desktop/masr/data_aishell/dev.index",
          "/home/dolan/Desktop/masr/data_aishell/labels.json", 10)
Example No. 6
import _init_path
from models.conv import GatedConv
import pre_transform_2 as pt2
import enhance_speach as es
# import beamdecode

model = GatedConv.load("pretrained/gated-conv.pth")
# model = GatedConv.load("pretrained/model_110.pth")

# with open("train_index","w") as f:
# 	for filename in os.listdir('data/'):
# 		# print(filename)
# 		s1 = filename[:8]
# 		s2 = ""
# 		for x in s1:
# 			s2 += trans[x]
# 		s1 = "data/" + s1 +".wav"
# 		f.write(s1+","+s2+"\n")

# text = model.predict("test.wav")
# text = model.predict("12345_man.wav")
# text = model.predict("678910_man.wav")
# text = model.predict("862409_in.wav")
# text = model.predict("20164239_kuai.wav")
# text = model.predict("20164762_kuai.wav")
# text = model.predict("20164762_man.wav")
# text = model.predict("data/20166565.wav")
# text = model.predict("20164786.wav")
input_file_src = "record.wav"
# output_file_src = "record_out.wav"
# es.denoise(input_file_src,output_file_src)
Example No. 7
# import _init_path
from models.conv import GatedConv

# model = GatedConv.load("pretrained/gated-conv.pth")
# model = GatedConv.load("pretrained/model_62.pth")
model = GatedConv.load("pretrained2/model_81.pth")

text = model.predict("./sample_audio/8_16.wav")

print("")
print("识别结果:")
print(text)
Example No. 8
import _init_path
from models.conv import GatedConv

model = GatedConv.load("语音识别MASR/pretrained/gated-conv.pth")
model.to_train()
model.fit("data/train.index", "data/dev.index", train_batch_size=2)
Example No. 9
__mtime__ = '20210318'
import os
from models.conv import GatedConv
from config import pretrained_model_path

model = GatedConv.load(os.path.join('..', pretrained_model_path))

text = model.predict("../data_aishell/BAC009S0765W0130.wav")

print("")
print("识别结果:")
print(text)
Example No. 10
    cer = 0
    print("decoding")
    with torch.no_grad():
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            x = x.to("cuda")
            outs, out_lens = model(x, x_lens)
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset : offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = decoder.convert_to_strings(ys)
            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                cer += decoder.cer(trans, ref) / float(len(ref))
        cer /= len(dataloader.dataset)
    model.train()
    return cer


if __name__ == "__main__":
    with open("data_aishell/labels.json") as f:
        vocabulary = json.load(f)
        vocabulary = "".join(vocabulary)
    model = GatedConv(vocabulary)
    model.to("cuda")
    train(model)
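
Note: this example hardcodes "cuda" and will fail on a CPU-only machine; the `device` variable used in Example No. 1 is the more portable pattern. A minimal sketch:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
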
Example No. 11
import torch
import feature
from models.conv import GatedConv
import torch.nn.functional as F
from ctcdecode import CTCBeamDecoder

alpha = 0.8
beta = 0.3
lm_path = "/home/db/bing/yuyingshibie/masr/lm/zh_giga.no_cna_cmn.prune01244.klm"
cutoff_top_n = 40
cutoff_prob = 1.0
beam_width = 32
num_processes = 4
blank_index = 0

model = GatedConv.load(
    "/home/db/bing/yuyingshibie/masr/pretrained/gated-conv.pth")
model.eval()

decoder = CTCBeamDecoder(
    model.vocabulary,
    lm_path,
    alpha,
    beta,
    cutoff_top_n,
    cutoff_prob,
    beam_width,
    num_processes,
    blank_index,
)

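For contrast with the beam search configured above, greedy (best-path) CTC decoding simply takes the argmax at each frame, collapses repeats, and drops blanks. A generic sketch, not part of the original file:

def greedy_decode(probs, vocabulary, blank_index=0):
    # probs: (time, vocab) per-frame probabilities for a single utterance.
    best = probs.argmax(dim=1).tolist()
    chars, prev = [], blank_index
    for idx in best:
        # CTC collapse rule: drop blanks and merge consecutive repeats.
        if idx != prev and idx != blank_index:
            chars.append(vocabulary[idx])
        prev = idx
    return "".join(chars)
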
Example No. 12
import _init_path
from models.conv import GatedConv

# model = GatedConv.load("pretrained/gated-conv.pth")
model = GatedConv.load("pretrained/model_3.pth")

text = model.predict("./sample_audio/test.wav")

print("")
print("识别结果:")
print(text)
Example No. 13
parser.add_argument('--lm-alpha-from', default=0.0, type=float, help='Start of the language model weight (alpha) range to tune')
parser.add_argument('--lm-alpha-to', default=3.0, type=float, help='End of the language model weight (alpha) range to tune')
parser.add_argument('--lm-beta-from', default=0.0, type=float,
					help='Start of the language model word bonus (beta, applied to all words) range to tune')
parser.add_argument('--lm-beta-to', default=0.5, type=float,
					help='End of the language model word bonus (beta, applied to all words) range to tune')
parser.add_argument('--lm-num-alphas', default=45, type=int, help='Number of alpha candidates for tuning')
parser.add_argument('--lm-num-betas', default=5, type=int, help='Number of beta candidates for tuning')
parser = add_decoder_args(parser)
args = parser.parse_args()

if args.lm_path is None:
	print("error: LM must be provided for tuning")
	sys.exit(1)

model = GatedConv.load(args.model_path)

saved_output = np.load(args.saved_output, allow_pickle=True)


def init(beam_width, blank_index, lm_path):
	global decoder, ae_decoder
	decoder = BeamCTCDecoder(model.vocabulary, lm_path=lm_path, beam_width=beam_width, num_processes=args.lm_workers,
							 blank_index=blank_index)
	ae_decoder = GreedyDecoder(model.vocabulary)


def decode_dataset(params):
	lm_alpha, lm_beta = params
	global decoder
	decoder._decoder.reset_params(lm_alpha, lm_beta)
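
The file is truncated here, before `decode_dataset` finishes or the search is driven. A hypothetical driver for the (alpha, beta) grid search, consistent with the arguments defined above; `args.beam_width`, `args.blank_index`, and `args.lm_workers` are assumed to come from `add_decoder_args`:

from itertools import product
from multiprocessing import Pool

cand_alphas = np.linspace(args.lm_alpha_from, args.lm_alpha_to, args.lm_num_alphas)
cand_betas = np.linspace(args.lm_beta_from, args.lm_beta_to, args.lm_num_betas)
params_grid = list(product(cand_alphas, cand_betas))

# Each worker calls init() once to build its own decoder, then evaluates
# one (alpha, beta) pair per decode_dataset call.
with Pool(args.lm_workers, init, [args.beam_width, args.blank_index, args.lm_path]) as pool:
	results = pool.map(decode_dataset, params_grid)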