def train_nn(speech_data, speech_alignment):
    """Train a Theano feed-forward NN language-ID/VAD classifier on aligned speech.

    Loads cached features from ``fetures_file_name`` (sic) when present, otherwise
    regenerates them via ``gen_features``.  Trains ``tffnn.TheanoFFNN`` epoch by
    epoch, printing cross-validation and training accuracy, progressively adding
    hidden layers, and saving the model whenever the current epoch is the best
    cross-validation result so far.

    NOTE(review): relies on many module-level globals (prev_frames, next_frames,
    hidden_units, batch_size, method, learning_rate, ...) defined outside this
    view -- confirm against the enclosing module.

    Args:
        speech_data: speech feature sources, passed through to gen_features.
        speech_alignment: matching alignment (MLF) sources.
    """
    print
    print datetime.datetime.now()
    print

    random.seed(0)

    # Try to reuse the cached feature arrays; on a cache miss (missing file)
    # regenerate everything from the raw speech data.
    try:
        f = open(fetures_file_name, "rb")
        # The arrays are read back in the exact order they were saved.
        crossvalid_x = np.load(f)
        crossvalid_y = np.load(f)
        train_x = np.load(f)
        train_y = np.load(f)
        tx_m = np.load(f)     # per-feature mean (normalisation)
        tx_std = np.load(f)   # per-feature std (normalisation)
        f.close()
    except IOError:
        crossvalid_x, crossvalid_y, train_x, train_y, tx_m, tx_std = gen_features(speech_data, speech_alignment)

    # The network input is the centre frame plus prev/next context frames
    # concatenated, hence the multiplication of the per-frame width.
    input_size = train_x.shape[1] * (prev_frames + 1 + next_frames)
    # Labels are assumed to be 0..max contiguous class ids.
    output_size = np.amax(train_y) + 1

    print "The shape of non-multiplied training data: ", train_x.shape, train_y.shape
    print "The shape of non-multiplied test data: ", crossvalid_x.shape, crossvalid_y.shape

    # add prev, and last frames
    # Replicate the normalisation vectors so they cover the stacked context window.
    tx_m = np.tile(tx_m, prev_frames + 1 + next_frames)
    tx_std = np.tile(tx_std, prev_frames + 1 + next_frames)

    print
    print datetime.datetime.now()
    print
    print "prev_frames +1+ last_frames:", prev_frames, 1, next_frames
    print "The shape of training data: ", train_x.shape, train_y.shape
    print "The shape of test data: ", crossvalid_x.shape, crossvalid_y.shape
    print "The shape of tx_m, tx_std: ", tx_m.shape, tx_std.shape
    print "The output size: ", output_size
    print

    e = tffnn.TheanoFFNN(input_size, hidden_units, hidden_layers, output_size,
                         hidden_activation=hact, weight_l2=weight_l2,
                         training_set_x=train_x, training_set_y=train_y,
                         prev_frames=prev_frames, next_frames=next_frames,
                         amplify_center_frame=amplify_center_frame,
                         batch_size=batch_size)
    e.set_input_norm(tx_m, tx_std)

    dc_acc = []  # per-epoch cross-validation accuracies
    dt_acc = []  # per-epoch training accuracies
    epoch = 0
    # Current number of hidden layers; grown up to hidden_layers + hidden_layers_add.
    i_hidden_layers = hidden_layers

    while True:
        print
        print '-' * 80
        print 'Predictions'
        print '-' * 80

        # Evaluate on held-out data, then on training data.
        predictions_y, gold_y = e.predict(crossvalid_x, batch_size, prev_frames, next_frames, crossvalid_y)
        c_acc, c_sil = get_accuracy(gold_y, predictions_y)
        predictions_y, gold_y = e.predict(train_x, batch_size, prev_frames, next_frames, train_y)
        t_acc, t_sil = get_accuracy(gold_y, predictions_y)

        dc_acc.append(c_acc)
        dt_acc.append(t_acc)

        print
        print "method, hact, max_frames, max_files, max_frames_per_segment, trim_segments, batch_size, max_epoch, " \
              "hidden_units, hidden_layers, hidden_layers_add, prev_frames, next_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only "
        print method, hact, max_frames, max_files, max_frames_per_segment, trim_segments, batch_size, max_epoch, \
            hidden_units, hidden_layers, hidden_layers_add, prev_frames, next_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only
        print "Epoch: %d" % (epoch, )
        print
        print "Cross-validation stats"
        print "------------------------"
        print "Epoch predictive accuracy: %0.4f" % c_acc
        print "Last epoch accs:", ["%.4f" % x for x in dc_acc[-20:]]
        print "Epoch sil bias: %0.2f" % c_sil
        print
        print "Training stats"
        print "------------------------"
        print "Epoch predictive accuracy: %0.4f" % t_acc
        print "Last epoch accs:", ["%.4f" % x for x in dt_acc[-20:]]
        print "Epoch sil bias: %0.2f" % t_sil
        print
        print "Best results"
        print "------------------------"
        print "Best iteration:", np.argmax(dc_acc)
        print "Best iteration - cross-validation acc: %.4f" % dc_acc[np.argmax(dc_acc)]
        print "Best iteration - training acc: %.4f" % dt_acc[np.argmax(dc_acc)]
        print
        print datetime.datetime.now()
        print

        # Save the model only when this epoch is the best cross-validation
        # result so far (argmax returns the first maximal index).
        if epoch == np.argmax(dc_acc):
            print
            print "Saving the FFNN and TFFN models"
            print
            # Encode the full hyper-parameter configuration into the file name.
            file_name = "model_voip/lid_nnt_%d_hu%d_hl%d_hla%d_pf%d_nf%d_acf_%.1f_mfr%d_mfl%d_mfps%d_ts%d_usec0%d_usedelta%d_useacc%d_mbo%d_bs%d" % \
                (input_size, hidden_units, hidden_layers, hidden_layers_add, prev_frames, next_frames, amplify_center_frame,
                 max_frames, max_files, max_frames_per_segment, trim_segments,
                 usec0, usedelta, useacc, mel_banks_only, batch_size)

            # e.params is assumed to interleave weight/bias pairs: [W0, b0, W1, b1, ...];
            # copy each layer into a plain (non-Theano) FFNN for deployment.
            nn = ffnn.FFNN()
            for w, b in [e.params[n:n + 2] for n in range(0, len(e.params), 2)]:
                nn.add_layer(w.get_value(), b.get_value())
            nn.set_input_norm(tx_m, tx_std)
            nn.save(file_name + ".ffnn")
            e.save(file_name + ".tffnn")

        if epoch == max_epoch:
            break
        epoch += 1

        # NOTE(review): `epoch % 1 == 0` is always true, so this effectively
        # means "every epoch after the first" -- likely a leftover from a
        # configurable schedule; kept as-is.
        if epoch > 1 and epoch % 1 == 0:
            # Grow the network until it reaches hidden_layers + hidden_layers_add layers.
            if i_hidden_layers < hidden_layers + hidden_layers_add:
                print
                print '-' * 80
                print 'Adding a hidden layer: ', i_hidden_layers + 1
                print '-' * 80
                e.add_hidden_layer(hidden_units)
                i_hidden_layers += 1

        print
        print '-' * 80
        print 'Training'
        print '-' * 80
        # Learning rate decays hyperbolically with the epoch number.
        e.train(method=method,
                learning_rate=learning_rate * learning_rate_decay / (learning_rate_decay + epoch))
def train_nn(speech_data, speech_alignment):
    """Train a theanets Hessian-free classifier for sil/speech (VAD) frame labels.

    Builds an MFCC stream aligned to MLF labels, splits frames randomly into a
    cross-validation and a training set, trains a 4-hidden-layer
    ``theanets.Classifier`` one HF run per epoch while printing accuracy stats,
    and finally exports the learned weights into a plain ``ffnn.FFNN`` model file.

    NOTE(review): depends on module-level globals (usec0, last_frames, max_frames,
    hidden_units, max_epoch, ...) defined outside this view -- confirm against
    the enclosing module.

    Args:
        speech_data: iterable of speech feature sources (one per alignment).
        speech_alignment: iterable of MLF alignment sources.
    """
    vta = MLFMFCCOnlineAlignedArray(usec0=usec0, n_last_frames=last_frames,
                                    usedelta=usedelta, useacc=useacc,
                                    mel_banks_only=mel_banks_only)
    sil_count = 0
    speech_count = 0
    # Register every (data, alignment) pair and tally label durations.
    for sd, sa in zip(speech_data, speech_alignment):
        mlf_speech = load_mlf(sa, max_files, max_frames_per_segment)
        vta.append_mlf(mlf_speech)
        vta.append_trn(sd)
        sil_count += mlf_speech.count_length('sil')
        speech_count += mlf_speech.count_length('speech')

    print "The length of sil segments: ", sil_count
    print "The length of speech segments: ", speech_count

    # Peek at the first (frame, label) pair to learn the input width.
    mfcc = vta.__iter__().next()
    print "MFCC length:", len(mfcc[0])
    input_size = len(mfcc[0])

    e = theanets.Experiment(theanets.Classifier,
                            layers=(input_size, hidden_units, hidden_units,
                                    hidden_units, hidden_units, 2),
                            optimize="hf",           # Hessian-free optimisation
                            num_updates=30,
                            validate=1,
                            initial_lambda=0.1,
                            preconditioner=True if preconditioner else False,
                            hidden_dropouts=hidden_dropouts,
                            weight_l2=weight_l2,
                            batch_size=500,
                            )

    random.seed(0)

    print "Generating the cross-validation and train MFCC features"
    crossvalid_x = []
    crossvalid_y = []
    train_x = []
    train_y = []
    i = 0
    for frame, label in vta:
        # Shift mel-bank features by a constant offset when only mel banks are used.
        frame = frame - (10.0 if mel_banks_only else 0.0)

        if i % (max_frames / 10) == 0:
            print "Already processed: %.2f%% of data" % (100.0 * i / max_frames)

        if i > max_frames:
            break

        # Randomly route roughly crossvalid_frames of the max_frames frames
        # to the held-out set; everything else goes to training.
        if random.random() < float(crossvalid_frames) / max_frames:
            # sample validation (test) data
            crossvalid_x.append(frame)
            if label == "sil":
                crossvalid_y.append(0)   # class 0 = silence
            else:
                crossvalid_y.append(1)   # class 1 = speech
        else:
            train_x.append(frame)
            if label == "sil":
                train_y.append(0)
            else:
                train_y.append(1)

        i += 1

    # theanets expects [inputs, int32 targets] pairs.
    crossvalid = [np.array(crossvalid_x), np.array(crossvalid_y).astype('int32')]
    train = [np.array(train_x), np.array(train_y).astype('int32')]

    print
    print "The length of training data: ", len(train_x)
    print "The length of test data: ", len(crossvalid_x)
    print

    # Rolling windows of the most recent 20 epoch accuracies.
    dc_acc = deque(maxlen=20)
    dt_acc = deque(maxlen=20)
    epoch = 0
    while True:
        # NOTE(review): predict() is fed the raw Python lists here, not the
        # np.array versions built above -- presumably the network accepts
        # array-likes; confirm against theanets.
        predictions_y = e.network.predict(crossvalid_x)
        c_acc, c_sil = get_accuracy(crossvalid_y, predictions_y)
        predictions_y = e.network.predict(train_x)
        t_acc, t_sil = get_accuracy(train_y, predictions_y)

        print
        print "max_frames, max_files, max_frames_per_segment, trim_segments, max_epoch, hidden_units, last_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only, preconditioner, hidden_dropouts, weight_l2"
        print max_frames, max_files, max_frames_per_segment, trim_segments, max_epoch, hidden_units, last_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only, preconditioner, hidden_dropouts, weight_l2
        print "Epoch: %d" % (epoch, )
        print
        print "Cross-validation stats"
        print "------------------------"
        print "Epoch predictive accuracy: %0.2f" % c_acc
        # Accs are appended *after* printing, so this shows previous epochs only.
        print "Last epoch accs:", ["%.2f" % x for x in dc_acc]
        print "Epoch sil bias: %0.2f" % c_sil
        print
        print "Training stats"
        print "------------------------"
        print "Epoch predictive accuracy: %0.2f" % t_acc
        print "Last epoch accs:", ["%.2f" % x for x in dt_acc]
        print "Epoch sil bias: %0.2f" % t_sil

        if epoch == max_epoch:
            break
        epoch += 1

        # One HF training run per epoch.
        e.run(train, crossvalid)

        dc_acc.append(c_acc)
        dt_acc.append(t_acc)

    # Export the final network (after the last epoch) into a plain FFNN model.
    nn = ffnn.FFNN()
    for w, b in zip(e.network.weights, e.network.biases):
        nn.add_layer(w.get_value(), b.get_value())
    nn.save(file_name="model_voip/vad_sds_mfcc_is%d_hu%d_lf%d_mfr%d_mfl%d_mfps%d_ts%d_usec0%d_usedelta%d_useacc%d_mbo%d.nn" % \
            (input_size, hidden_units, last_frames, max_frames, max_files,
             max_frames_per_segment, trim_segments, usec0, usedelta, useacc, mel_banks_only))
def train_nn(speech_data, speech_alignment): print print datetime.datetime.now() print random.seed(0) try: f = open(fetures_file_name, "rb") crossvalid_x = np.load(f) crossvalid_y = np.load(f) train_x = np.load(f) train_y = np.load(f) f.close() except: vta = MLFMFCCOnlineAlignedArray(usec0=usec0, n_last_frames=last_frames, usedelta=usedelta, useacc=useacc, mel_banks_only=mel_banks_only) sil_count = 0 speech_count = 0 for sd, sa in zip(speech_data, speech_alignment): mlf_speech = load_mlf(sa, max_files, max_frames_per_segment) vta.append_mlf(mlf_speech) vta.append_trn(sd) sil_count += mlf_speech.count_length('sil') speech_count += mlf_speech.count_length('speech') print "The length of sil segments: ", sil_count print "The length of speech segments: ", speech_count mfcc = vta.__iter__().next() print "MFCC length:", len(mfcc[0]) input_size = len(mfcc[0]) print "Generating the cross-validation and train MFCC features" crossvalid_x = [] crossvalid_y = [] train_x = [] train_y = [] i = 0 for frame, label in vta: # downcast frame = frame.astype(np.float32) # frame = frame - (10.0 if mel_banks_only else 0.0) if i % (max_frames / 10) == 0: print "Already processed: %.2f%% of data" % (100.0 * i / max_frames) if i > max_frames: break if random.random() < float(crossvalid_frames) / max_frames: # sample validation (test) data crossvalid_x.append(frame) if label == "sil": crossvalid_y.append(0) else: crossvalid_y.append(1) else: train_x.append(frame) if label == "sil": train_y.append(0) else: train_y.append(1) i += 1 gc.collect() crossvalid_x = np.array(crossvalid_x).astype(np.float32) gc.collect() crossvalid_y = np.array(crossvalid_y).astype('int32') gc.collect() train_x = np.array(train_x).astype(np.float32) gc.collect() train_y = np.array(train_y).astype('int32') gc.collect() # normalise the data tx_m = np.mean(train_x, axis=0) tx_std = np.std(train_x, axis=0) gc.collect() crossvalid_x -= tx_m gc.collect() crossvalid_x /= tx_std gc.collect() train_x -= tx_m gc.collect() 
train_x /= tx_std gc.collect() print 'Saving data to:', fetures_file_name f = open(fetures_file_name, "wb") np.save(f, crossvalid_x) np.save(f, crossvalid_y) np.save(f, train_x) np.save(f, train_y) f.close() print print datetime.datetime.now() print print "The shape of training data: ", train_x.shape, train_y.shape print "The shape of test data: ", crossvalid_x.shape, crossvalid_y.shape print input_size = train_x.shape[1] e = ffnn.TheanoFFNN2(input_size, hidden_units, hidden_layers, 2, hidden_activation=hact, weight_l2=weight_l2) dc_acc = [] dt_acc = [] epoch = 0 while True: print print '-' * 80 print 'Predictions' print '-' * 80 predictions_y = e.predict(crossvalid_x, batch_size) c_acc, c_sil = get_accuracy(crossvalid_y, predictions_y) predictions_y = e.predict(train_x, batch_size) t_acc, t_sil = get_accuracy(train_y, predictions_y) dc_acc.append(c_acc) dt_acc.append(t_acc) print print "method, hact, max_frames, max_files, max_frames_per_segment, trim_segments, batch_size, max_epoch, hidden_units, last_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only " print method, hact, max_frames, max_files, max_frames_per_segment, trim_segments, batch_size, max_epoch, hidden_units, last_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only print "Epoch: %d" % (epoch, ) print print "Cross-validation stats" print "------------------------" print "Epoch predictive accuracy: %0.4f" % c_acc print "Last epoch accs:", ["%.4f" % x for x in dc_acc[-20:]] print "Epoch sil bias: %0.2f" % c_sil print print "Training stats" print "------------------------" print "Epoch predictive accuracy: %0.4f" % t_acc print "Last epoch accs:", ["%.4f" % x for x in dt_acc[-20:]] print "Epoch sil bias: %0.2f" % t_sil print print "Best results" print "------------------------" print "Best iteration:", np.argmax(dc_acc) print "Best iteration - cross-validation acc: %.4f" % dc_acc[np.argmax( dc_acc)] print "Best iteration - training acc: %.4f" % dt_acc[np.argmax( dc_acc)] print 
print datetime.datetime.now() print if epoch == np.argmax(dc_acc): print print "Saving the FFNN model" print nn = ffnn.FFNN() for w, b in e.params: nn.add_layer(w.get_value(), b.get_value()) nn.save(file_name = "model_voip/vad_sds_mfcc_is%d_hu%d_lf%d_mfr%d_mfl%d_mfps%d_ts%d_usec0%d_usedelta%d_useacc%d_mbo%d.nnt" % \ (input_size, hidden_units, last_frames, max_frames, max_files, max_frames_per_segment, trim_segments, usec0, usedelta, useacc, mel_banks_only)) if epoch == max_epoch: break epoch += 1 print print '-' * 80 print 'Training' print '-' * 80 e.train(train_x, train_y, method=method, learning_rate=learning_rate * learning_rate_decay / (learning_rate_decay + epoch), batch_size=batch_size)