def train(config):
    """Train a SpeechModel according to *config*.

    Builds train/dev/test splits, optionally warm-starts from
    config["input_file"], trains with SGD + cross-entropy, periodically
    evaluates on the dev set (every config["dev_every"] epochs), saves the
    best model (lowest average dev loss) to config["output_file"], and
    finally runs a full test-set evaluation.
    """
    train_set, dev_set, test_set = mod.SpeechDataset.splits(config)
    model = mod.SpeechModel(config)
    if config["input_file"]:
        # Warm-start from an existing checkpoint.
        model.load(config["input_file"])
    if not config["no_cuda"]:
        torch.cuda.set_device(config["gpu_no"])
        model.cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    criterion = nn.CrossEntropyLoss()
    min_loss = sys.float_info.max
    train_loader = data.DataLoader(train_set, batch_size=config["batch_size"],
                                   shuffle=True, drop_last=True)
    dev_loader = data.DataLoader(dev_set, batch_size=min(len(dev_set), 100),
                                 shuffle=True)
    test_loader = data.DataLoader(test_set, batch_size=min(len(test_set), 100),
                                  shuffle=True)
    step_no = 0
    for epoch_idx in range(config["n_epochs"]):
        for batch_idx, (model_in, labels) in enumerate(train_loader):
            model.train()
            optimizer.zero_grad()
            if not config["no_cuda"]:
                model_in = model_in.cuda()
                labels = labels.cuda()
            model_in = Variable(model_in, requires_grad=False)
            scores = model(model_in)
            labels = Variable(labels, requires_grad=False)
            loss = criterion(scores, labels)
            loss.backward()
            optimizer.step()
            step_no += 1
            print_eval("train step #{}".format(step_no), scores, labels, loss)
        if epoch_idx % config["dev_every"] == config["dev_every"] - 1:
            model.eval()
            # FIX: the original compared each individual dev *batch* loss to
            # min_loss and saved inside the loop, so the checkpoint tracked
            # single-batch noise. Average the loss over the whole dev set and
            # decide once per evaluation instead.
            dev_losses = []
            for model_in, labels in dev_loader:
                if not config["no_cuda"]:
                    model_in = model_in.cuda()
                    labels = labels.cuda()
                model_in = Variable(model_in, requires_grad=False)
                labels = Variable(labels, requires_grad=False)
                scores = model(model_in)
                loss = criterion(scores, labels)
                dev_losses.append(loss.cpu().data.numpy()[0])
                print_eval("dev", scores, labels, loss)
            avg_dev_loss = sum(dev_losses) / len(dev_losses)
            if avg_dev_loss < min_loss:
                min_loss = avg_dev_loss
                model.save(config["output_file"])
    evaluate(config, model, test_loader)
def evaluate(config, model=None, test_loader=None):
    """Evaluate *model* on the test set and print the overall accuracy.

    When *test_loader* or *model* is None, each is constructed from
    *config* (the model is loaded from config["input_file"]). Accuracy is
    accumulated per batch, weighted by batch size, so the result is
    independent of the batch size used.
    """
    if not config["no_cuda"]:
        # FIX: the original called set_device twice (once per lazily-built
        # object) when both loader and model were constructed here.
        torch.cuda.set_device(config["gpu_no"])
    if not test_loader:
        _, _, test_set = mod.SpeechDataset.splits(config)
        # FIX: was batch_size=len(test_set), which pushes the entire test
        # set through in one batch (OOM risk) and was inconsistent with
        # train()'s min(len, 100). The weighted sum below makes the final
        # accuracy identical under any batching.
        test_loader = data.DataLoader(test_set,
                                      batch_size=min(len(test_set), 100))
    if not model:
        model = mod.SpeechModel(config)
        model.load(config["input_file"])
        if not config["no_cuda"]:
            model.cuda()
    model.eval()
    criterion = nn.CrossEntropyLoss()
    results = []
    total = 0
    for model_in, labels in test_loader:
        if not config["no_cuda"]:
            model_in = model_in.cuda()
            labels = labels.cuda()
        model_in = Variable(model_in, requires_grad=False)
        labels = Variable(labels, requires_grad=False)
        scores = model(model_in)
        loss = criterion(scores, labels)
        # print_eval returns the batch accuracy; weight it by batch size.
        results.append(print_eval("test", scores, labels, loss) * model_in.size(0))
        total += model_in.size(0)
    print("final test accuracy: {}".format(sum(results) / total))
def __init__(self):
    """Set up the recognition model, audio-capture parameters and PyAudio."""
    # Recognition backend.
    self.model = md.SpeechModel()
    # Capture format: mono float32 samples at 16 kHz.
    self.FORMAT = pyaudio.paFloat32
    self.CHANNELS = 1
    self.SAMPLE_RATE = 16000
    # Samples read per stream chunk.
    self.CHUNK = 1024 * 3
    # NOTE(review): presumably a signal-level cutoff for detecting speech
    # vs. silence — confirm against the method that reads it.
    self.THRESHOLD = 0.1
    # PortAudio handle used to open the input stream.
    self.p = pyaudio.PyAudio()
import model import librosa import os path_name = "/home/minhhiu/MyProjects/Compressed Speech Data/full_command_data/test/wav/" cmd_list = [] m = model.SpeechModel() correct = 0 total = 0 with open("./model/cmd_list.txt", "r", encoding="utf8") as cmd_r: for i in range(71): cmd = cmd_r.readline() cmd = cmd.lower().strip("\n") cmd_list.append(cmd) print(cmd_list) for d in os.listdir(path_name): path = os.path.join(path_name, d) for file in os.listdir(path): wav_path = os.path.join(path, file) txt_path = wav_path.replace("wav", "txt") audio, _ = librosa.load(wav_path, sr=16000, mono=True) audio = audio.reshape(-1, 1) pred_index = m.predict(audio) with open(txt_path, "r", encoding="utf8") as fr:
# Grid search over GMM/MFCC hyper-parameters: number of HMM components,
# number of GMM mixtures, analysis window size/stride (seconds) and MFCC
# coefficient count. Tracks the setting with the highest accuracy.
n_coms = np.arange(3, 6)
n_mixs = np.arange(3, 6)
window_sizes = np.arange(0.015, 0.045, 0.01)
strides = np.arange(0.005, 0.015, 0.005)
n_mfccs = np.array([6, 9, 13])
params = itertools.product(n_coms, n_mixs, window_sizes, strides, n_mfccs)

max_setting = {}
max_acc = -np.inf
for n_com, n_mix, window_size, stride, n_mfcc in params:
    print('______')
    # FIX: the original used a backslash continuation *inside* the string
    # literal, which embedded the source file's indentation into the printed
    # message; adjacent-literal concatenation keeps the output on one line.
    print('n_com = {n_com}, n_mix = {n_mix}, window_size = {window_size}, '
          'stride = {stride}, n_mfcc = {n_mfcc}'.format(
              n_com=n_com, n_mix=n_mix, window_size=window_size,
              stride=stride, n_mfcc=n_mfcc))
    data_loader = model.DataLoader(path='tiengviet', n_mfcc=n_mfcc,
                                   window_size=window_size, overlap=stride,
                                   test_size=0.1, shuffle=True)
    data_loader.load_data()
    m = model.SpeechModel(data_loader, n_com=n_com, n_mix=n_mix)
    m.fit()
    acc = m.cal_accuracy()
    print(acc)
    if acc > max_acc:
        max_acc = acc
        # Replace the best setting atomically instead of key-by-key.
        max_setting = {'n_com': n_com, 'n_mix': n_mix,
                       'window_size': window_size, 'stride': stride,
                       'n_mfcc': n_mfcc}