def getData(database_binaryDir, datasetType, processedDir):
    # just get the names
    testVolunteerNumbers = ["13F", "15F", "21M", "23M", "24M", "25M", "28M", "29M", "30F", "31F",
                            "34M", "36F", "37F", "43F", "47M", "51F", "54M"]
    testVolunteers = [str(testNumber) + ".pkl" for testNumber in testVolunteerNumbers]
    lipspeakers = ["Lipspkr1.pkl", "Lipspkr2.pkl", "Lipspkr3.pkl"]
    allSpeakers = [f for f in os.listdir(database_binaryDir) if
                   os.path.isfile(os.path.join(database_binaryDir, f)) and os.path.splitext(f)[1] == ".pkl"]
    trainVolunteers = [f for f in allSpeakers if not (f in testVolunteers or f in lipspeakers)]

    if datasetType == "combined":
        trainingSpeakerFiles = trainVolunteers + lipspeakers
        testSpeakerFiles = testVolunteers
    else:  # datasetType == "volunteers" or "lipspeakers"
        trainingSpeakerFiles = trainVolunteers
        testSpeakerFiles = testVolunteers
    # else:
    #     raise Exception("invalid dataset entered")
    datasetFiles = [trainingSpeakerFiles, testSpeakerFiles]

    # get a sample of the dataset to debug the network
    if datasetType == "lipspeakers":
        lipspkr_path = os.path.expanduser("~/TCDTIMIT/combinedSR/TCDTIMIT/binaryLipspeakers/allLipspeakersTest.pkl")
        logger_combined.info("data: %s", lipspkr_path)
        data = unpickle(lipspkr_path)
    else:
        data, _, _ = preprocessingCombined.getOneSpeaker(trainingSpeakerFiles[0],
                                                         sourceDataDir=database_binaryDir,
                                                         storeProcessed=True, processedDir=processedDir,
                                                         trainFraction=1.0, validFraction=0.0,
                                                         verbose=False)
    return data, datasetFiles, testSpeakerFiles
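# A minimal usage sketch for getData (illustrative only; the directory layout below mirrors the
# paths used elsewhere in this script, and running it requires the binary .pkl files to exist):
#
#     database_binaryDir = os.path.expanduser("~/TCDTIMIT/combinedSR/TCDTIMIT/binary")
#     processedDir = database_binaryDir + "_finalProcessed"
#     data, datasetFiles, testSpeakerFiles = getData(database_binaryDir, "combined", processedDir)
#     trainingSpeakerFiles, _ = datasetFiles
#     print(len(trainingSpeakerFiles), "training speakers /", len(testSpeakerFiles), "test speakers")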
#         speakerFile=speakerFile, sourceDataDir=database_binaryDir,
#         trainFraction=0.8, validFraction=0.2,
#         storeProcessed=True, loadData=False, processedDir=processedDir, logger=logger_RNN)
#
# for speakerFile in testSpeakerFiles:
#     logger_RNN.info("%s", os.path.basename(speakerFile))
#     preprocessingCombined.getOneSpeaker(
#             speakerFile=speakerFile, sourceDataDir=database_binaryDir,
#             trainFraction=0.0, validFraction=0.0,
#             storeProcessed=True, loadData=False, processedDir=processedDir, logger=logger_RNN)

# used for debugging
dataset_test, _, _ = preprocessingCombined.getOneSpeaker(trainingSpeakerFiles[0],
                                                         sourceDataDir=database_binaryDir,
                                                         storeProcessed=True, processedDir=processedDir,
                                                         trainFraction=1.0, validFraction=0.0,
                                                         verbose=True)

##### BUILDING MODEL #####
logger_RNN.info('\n* Building network ...')
RNN_network = NeuralNetwork('RNN', dataset_test, batch_size=batch_size,
                            num_features=nbMFCCs, n_hidden_list=N_HIDDEN_LIST,
                            num_output_units=nbPhonemes,
                            bidirectional=BIDIRECTIONAL, addDenseLayers=ADD_DENSE_LAYERS,
                            seed=0,
def trainNetwork(AUDIO_LSTM_HIDDEN_LIST, CNN_NETWORK, cnn_features, LIP_RNN_HIDDEN_LIST, DENSE_HIDDEN_LIST,
                 datasetType, runType, LR_start, forceTrain):
    ##### SCRIPT META VARIABLES #####
    VERBOSE = True
    compute_confusion = False  # TODO: at the moment this is not implemented

    batch_size_audio = 1  # only works when processing one video at a time; the lipreading CNN then processes all images of that video as one batch
    max_num_epochs = 20

    nbMFCCs = 39     # number of audio features to use -> see 'utils.py' in convertToPkl under processDatabase
    nbPhonemes = 39  # number of output neurons

    # AUDIO_LSTM_HIDDEN_LIST = [256, 256]
    BIDIRECTIONAL = True

    # # lipreading
    # CNN_NETWORK = "google"
    # # when using the CNN-LSTM combo: what to feed to the LSTM? direct conv outputs, or first pass them through dense layers?
    # cnn_features = 'conv'  # 'dense' -> 39 outputs as input to the LSTM
    # LIP_RNN_HIDDEN_LIST = None  # e.g. [256, 256]; set to None to disable the CNN-LSTM architecture
    #
    # # after concatenation of audio and lipreading, which dense layers before the softmax?
    # DENSE_HIDDEN_LIST = [128, 64, 64]  # [128, 128, 128, 128]
    #
    # # Decaying LR
    # LR_start = 0.001
    logger_combined.info("LR_start = %s", str(LR_start))
    LR_fin = 0.0000001
    # logger_combined.info("LR_fin = %s", str(LR_fin))
    # LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)  # each epoch: LR := LR * LR_decay
    LR_decay = 0.7071
    logger_combined.info("LR_decay = %s", str(LR_decay))

    # Set locations for DATA, LOG, PARAMETERS, TRAIN info
    dataset = "TCDTIMIT"
    root_dir = os.path.expanduser('~/TCDTIMIT/combinedSR/' + dataset)
    database_binaryDir = root_dir + '/binary'
    processedDir = database_binaryDir + "_finalProcessed"

    # datasetType = "lipspeakers"  # or "volunteers"
    if datasetType == "lipspeakers":
        loadPerSpeaker = False
    else:
        loadPerSpeaker = True

    store_dir = root_dir + os.sep + "results" + os.sep \
                + ("CNN_LSTM" if LIP_RNN_HIDDEN_LIST is not None else "CNN") + os.sep + datasetType
    if not os.path.exists(store_dir):
        os.makedirs(store_dir)

    # which part of the network to train/save/...
    # runType = 'audio'
    # runType = 'lipreading'
    # runType = 'combined'
    ###########################

    # audio network + cnnNetwork + classifierNetwork
    model_name = "RNN__" + str(len(AUDIO_LSTM_HIDDEN_LIST)) + "_LSTMLayer" \
                 + '_'.join([str(layer) for layer in AUDIO_LSTM_HIDDEN_LIST]) \
                 + "_nbMFCC" + str(nbMFCCs) + ("_bidirectional" if BIDIRECTIONAL else "_unidirectional") + "__" \
                 + "CNN_" + CNN_NETWORK + "_" + cnn_features \
                 + ("_lipRNN_" if LIP_RNN_HIDDEN_LIST is not None else "") \
                 + ('_'.join([str(layer) for layer in LIP_RNN_HIDDEN_LIST]) if LIP_RNN_HIDDEN_LIST is not None else "") + "__" \
                 + "FC_" + '_'.join([str(layer) for layer in DENSE_HIDDEN_LIST]) + "__" \
                 + dataset + "_" + datasetType
    model_load = os.path.join(store_dir, model_name + ".npz")
    model_save = os.path.join(store_dir, model_name)

    # for loading stored audio models
    audio_dataset = "combined"  # TCDTIMIT + TIMIT datasets
    audio_model_name = str(len(AUDIO_LSTM_HIDDEN_LIST)) + "_LSTMLayer" \
                       + '_'.join([str(layer) for layer in AUDIO_LSTM_HIDDEN_LIST]) \
                       + "_nbMFCC" + str(nbMFCCs) \
                       + ("_bidirectional" if BIDIRECTIONAL else "_unidirectional") + "_" + audio_dataset
    audio_model_dir = os.path.expanduser("~/TCDTIMIT/audioSR/" + audio_dataset + "/results")
    audio_model_path = os.path.join(audio_model_dir, audio_model_name + ".npz")

    # for loading stored lipreading models
    lip_model_dir = os.path.join(os.path.expanduser('~/TCDTIMIT/lipreading/' + dataset + "/results"))
    viseme = False
    network_type = "google"
    lip_CNN_model_name = datasetType + "_" + network_type + "_" + ("viseme" if viseme else "phoneme") + str(nbPhonemes)
    CNN_model_path = os.path.join(lip_model_dir, lip_CNN_model_name + ".npz")

    # for CNN-LSTM networks
    if LIP_RNN_HIDDEN_LIST is not None:
        lip_CNN_LSTM_model_name = lip_CNN_model_name + "_LSTM" + '_'.join([str(layer) for layer in LIP_RNN_HIDDEN_LIST])
        lip_CNN_LSTM_model_path = os.path.join(lip_model_dir, lip_CNN_LSTM_model_name + ".npz")

    # log file
    logFile = store_dir + os.sep + model_name + '.log'
    if os.path.exists(logFile):
        fh = logging.FileHandler(logFile)       # append to existing log
    else:
        fh = logging.FileHandler(logFile, 'w')  # create a new logFile
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger_combined.addHandler(fh)
    print("log file: ", logFile)
    #############################################################

    logger_combinedtools.info("\n\n\n\n STARTING NEW EVALUATION/TRAINING SESSION AT "
                              + strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    ##### IMPORTING DATA #####
    logger_combined.info('  data source: ' + database_binaryDir)
    logger_combined.info('  model target: ' + model_save + '.npz')

    storeProcessed = True  # if you have roughly 10 GB of disk space, you can speed things up by not reprocessing
    # the data each iteration: the processed files are generated the first time this program encounters them,
    # or you can generate them manually with datasetToPkl.py

    # just get the names
    testVolunteerNumbers = ["13F", "15F", "21M", "23M", "24M", "25M", "28M", "29M", "30F", "31F",
                            "34M", "36F", "37F", "43F", "47M", "51F", "54M"]
    testVolunteers = [str(testNumber) + ".pkl" for testNumber in testVolunteerNumbers]
    lipspeakers = ["Lipspkr1.pkl", "Lipspkr2.pkl", "Lipspkr3.pkl"]
    allSpeakers = [f for f in os.listdir(database_binaryDir) if
                   os.path.isfile(os.path.join(database_binaryDir, f)) and os.path.splitext(f)[1] == ".pkl"]
    trainVolunteers = [f for f in allSpeakers if not (f in testVolunteers or f in lipspeakers)]

    if datasetType == "combined":
        trainingSpeakerFiles = trainVolunteers + lipspeakers
        testSpeakerFiles = testVolunteers
    else:  # datasetType == "volunteers" or "lipspeakers"
        trainingSpeakerFiles = trainVolunteers
        testSpeakerFiles = testVolunteers
    # else:
    #     raise Exception("invalid dataset entered")

    datasetFiles = [trainingSpeakerFiles, testSpeakerFiles]

    # get a sample of the dataset to debug the network
    if datasetType == "lipspeakers":
        lipspkr_path = os.path.expanduser("~/TCDTIMIT/lipreading/TCDTIMIT/binaryPerVideo/allLipspeakersTest.pkl")
        data = unpickle(lipspkr_path)
    else:
        data, _, _ = preprocessingCombined.getOneSpeaker(trainingSpeakerFiles[0],
                                                         sourceDataDir=database_binaryDir,
                                                         storeProcessed=True, processedDir=processedDir,
                                                         trainFraction=1.0, validFraction=0.0,
                                                         verbose=False)
    # import pdb; pdb.set_trace()

    ##### BUILDING MODEL #####
    logger_combined.info('\n\n* Building network ...')

    network = NeuralNetwork('combined', dataset=data, loadPerSpeaker=loadPerSpeaker,
                            num_features=nbMFCCs, lstm_hidden_list=AUDIO_LSTM_HIDDEN_LIST,
                            num_output_units=nbPhonemes, bidirectional=BIDIRECTIONAL,
                            cnn_network=CNN_NETWORK, cnn_features=cnn_features,
                            lipRNN_hidden_list=LIP_RNN_HIDDEN_LIST,
                            dense_hidden_list=DENSE_HIDDEN_LIST,
                            debug=False)

    # print the number of parameters
    nb_params_CNN_noDense = lasagne.layers.count_params(network.CNN_lout_features)
    nb_params_CNN = lasagne.layers.count_params(network.CNN_lout)
    nb_params_lipreading = lasagne.layers.count_params(network.lipreading_lout_features)
    nb_params_RNN = lasagne.layers.count_params(network.audioNet_lout_features)
    nb_params = lasagne.layers.count_params(network.combined_lout)

    logger_combined.info(" # params lipreading Total: %s", nb_params_lipreading)
    if LIP_RNN_HIDDEN_LIST is not None:
        logger_combined.info(" # params lipRNN:           %s", nb_params_lipreading - nb_params_CNN)
    if cnn_features == 'dense':
        logger_combined.info(" # params CNN:              %s", nb_params_CNN)
    else:
        logger_combined.info(" # params CNN:              %s", nb_params_CNN_noDense)
    logger_combined.info(" # params audio LSTM:       %s", nb_params_RNN)
    logger_combined.info(" # params combining FC:     %s", nb_params - nb_params_lipreading - nb_params_RNN)
    logger_combined.info(" # params whole network:    %s", nb_params)

    # Try to load a stored model
    success = {}  # stores whether each model was loaded successfully (i.e. it was trained before and stored)
    # If it was, there is no need to train it again; we can just evaluate the test set to get the results we want.
    logger_combined.info(' Network built. \n\nTrying to load stored model: %s', model_load)
    success['combined'] = network.load_model(model_type='combined', model_path=model_load)
    if not success['combined']:
        logger_combined.warning("No complete network found, loading parts...")

        logger_combined.info("CNN : %s", CNN_model_path)
        success['lipreading'] = network.load_model(model_type='CNN', model_path=CNN_model_path)

        if LIP_RNN_HIDDEN_LIST is not None:
            logger_combined.info("CNN_LSTM : %s", lip_CNN_LSTM_model_path)
            success['lipreading'] = network.load_model(model_type='CNN_LSTM', model_path=lip_CNN_LSTM_model_path)

        logger_combined.info("RNN : %s", audio_model_path)
        success['audio'] = network.load_model(model_type='RNN', model_path=audio_model_path)

    ##### COMPILING FUNCTIONS #####
    logger_combined.info("\n\n* Compiling functions ...")
    network.build_functions(train=True, debug=False)

    # get the name of the model we're training/evaluating
    if runType == 'audio':
        model_save = audio_model_path
    elif runType == 'lipreading':
        if LIP_RNN_HIDDEN_LIST is not None:
            model_save = lip_CNN_LSTM_model_path
        else:
            model_save = CNN_model_path
    elif runType == 'combined':
        model_save = model_load
    else:
        raise IOError("can't save network params; network output not found")
    model_save = model_save.replace(".npz", "")

    # if the runType model already exists (and was loaded successfully), there is no need to train it; just evaluate.
    if success[runType] and not forceTrain:
        network.finalNetworkEvaluation(save_name=model_save,
                                       database_binaryDir=database_binaryDir,
                                       processedDir=processedDir,
                                       runType=runType,
                                       storeProcessed=storeProcessed,
                                       testSpeakerFiles=testSpeakerFiles)
    else:  # the network doesn't exist yet, so we need to train it first
        ##### TRAINING #####
        logger_combined.info("\n\n* Training ...")
        network.train(datasetFiles, database_binaryDir=database_binaryDir, runType=runType,
                      storeProcessed=True, processedDir=processedDir,
                      num_epochs=max_num_epochs,
                      batch_size=batch_size_audio,
                      LR_start=LR_start, LR_decay=LR_decay,
                      compute_confusion=False, debug=False, save_name=model_save)

    logger_combined.info("\n\n* Done")
    logger_combined.info('Total time: {:.3f}'.format(time.time() - program_start_time))

    # close the log file handler so we can log to a new file later
    fh.close()
    logger_combined.removeHandler(fh)
    print(logger_combined.handlers)
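# A sketch of a driver call (not necessarily how the original repository invokes trainNetwork):
# the hyperparameter values below are the defaults that appear, commented out, inside trainNetwork;
# the datasetType, runType and forceTrain values are illustrative assumptions.
if __name__ == "__main__":
    trainNetwork(AUDIO_LSTM_HIDDEN_LIST=[256, 256],
                 CNN_NETWORK="google",
                 cnn_features='conv',           # or 'dense': feed dense-layer outputs to the lip RNN
                 LIP_RNN_HIDDEN_LIST=None,      # e.g. [256, 256] to enable the CNN-LSTM architecture
                 DENSE_HIDDEN_LIST=[128, 64, 64],
                 datasetType="lipspeakers",     # or "volunteers" / "combined"
                 runType='combined',            # or 'audio' / 'lipreading'
                 LR_start=0.001,
                 forceTrain=False)              # assumption: False = evaluate if a stored model already exists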
def evalTEST(self, testSpeakerFiles, sourceDataDir=None, storeProcessed=False, processedDir=None,
             verbose=False, logger=logger_RNNtools):
    test_acc = 0
    test_cost = 0
    test_topk_acc = 0
    nb_test_batches = 0

    # for each speaker, pass over the train set, then the test set (test = other files); save the results
    for speakerFile in tqdm(testSpeakerFiles, total=len(testSpeakerFiles)):
        logger.debug("processing %s", speakerFile)
        train, val, test = preprocessingCombined.getOneSpeaker(
                speakerFile=speakerFile, sourceDataDir=sourceDataDir,
                trainFraction=0.0, validFraction=0.0,
                storeProcessed=storeProcessed, processedDir=processedDir, logger=logger)

        images_train, mfccs_train, audioLabels_train, validLabels_train, validAudioFrames_train = train
        images_val, mfccs_val, audioLabels_val, validLabels_val, validAudioFrames_val = val
        images_test, mfccs_test, audioLabels_test, validLabels_test, validAudioFrames_test = test

        if verbose:
            logger.debug("the number of training examples is:   %s", len(images_train))
            logger.debug("the number of validation examples is: %s", len(images_val))
            logger.debug("the number of test examples is:       %s", len(images_test))

        # get results for the test set
        test_cost_one, test_acc_one, test_topk_acc_one, test_batches_one = self.val_epoch(
                mfccs=mfccs_test, validLabels=audioLabels_test, valid_frames=validAudioFrames_test)
        test_acc += test_acc_one
        test_cost += test_cost_one
        test_topk_acc += test_topk_acc_one
        nb_test_batches += test_batches_one

        if verbose:
            logger.debug("  this speaker's results:")
            logger.debug("\ttest cost:           %s", test_cost_one / test_batches_one)
            logger.debug("\ttest acc rate:       %s %%", test_acc_one / test_batches_one * 100)
            logger.debug("\ttest top 3 acc rate: %s %%", test_topk_acc_one / test_batches_one * 100)

    # get the average over all speakers
    test_cost /= nb_test_batches
    test_acc = test_acc / nb_test_batches * 100
    test_topk_acc = test_topk_acc / nb_test_batches * 100
    return test_cost, test_acc, test_topk_acc
def evalTRAINING(self, trainingSpeakerFiles, LR, shuffleEnabled=True, sourceDataDir=None,
                 storeProcessed=False, processedDir=None, verbose=False, logger=logger_RNNtools):
    train_cost = 0
    val_acc = 0
    val_cost = 0
    val_topk_acc = 0
    nb_train_batches = 0
    nb_val_batches = 0

    # for each speaker, pass over the train set, then the val set (test = other files); save the results
    for speakerFile in tqdm(trainingSpeakerFiles, total=len(trainingSpeakerFiles)):
        logger.debug("processing %s", speakerFile)
        train, val, test = preprocessingCombined.getOneSpeaker(
                speakerFile=speakerFile, sourceDataDir=sourceDataDir,
                trainFraction=0.8, validFraction=0.2,
                storeProcessed=storeProcessed, processedDir=processedDir, logger=logger)

        if shuffleEnabled:
            train = self.shuffle(train)

        images_train, mfccs_train, audioLabels_train, validLabels_train, validAudioFrames_train = train
        images_val, mfccs_val, audioLabels_val, validLabels_val, validAudioFrames_val = val
        images_test, mfccs_test, audioLabels_test, validLabels_test, validAudioFrames_test = test

        if verbose:
            logger.debug("the number of training examples is:   %s", len(images_train))
            logger.debug("the number of validation examples is: %s", len(images_val))
            logger.debug("the number of test examples is:       %s", len(images_test))

        train_cost_one, train_batches_one = self.train_epoch(
                mfccs=mfccs_train, validLabels=audioLabels_train,
                valid_frames=validAudioFrames_train, LR=LR)
        train_cost += train_cost_one
        nb_train_batches += train_batches_one

        # get results for the validation set
        val_cost_one, val_acc_one, val_topk_acc_one, val_batches_one = self.val_epoch(
                mfccs=mfccs_val, validLabels=audioLabels_val, valid_frames=validAudioFrames_val)
        val_cost += val_cost_one
        val_acc += val_acc_one
        val_topk_acc += val_topk_acc_one
        nb_val_batches += val_batches_one

        if verbose:
            logger.debug("  this speaker's results:")
            logger.debug("\ttraining cost:             %s", train_cost_one / train_batches_one)
            logger.debug("\tvalidation cost:           %s", val_cost_one / val_batches_one)
            logger.debug("\tvalidation acc rate:       %s %%", val_acc_one / val_batches_one * 100)
            logger.debug("\tvalidation top 3 acc rate: %s %%", val_topk_acc_one / val_batches_one * 100)

    # get the average over all speakers
    train_cost /= nb_train_batches
    val_cost /= nb_val_batches
    val_acc = val_acc / nb_val_batches * 100            # convert to %
    val_topk_acc = val_topk_acc / nb_val_batches * 100  # convert to %
    return train_cost, val_cost, val_acc, val_topk_acc
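# A hedged sketch (not the repository's actual training loop) of how evalTRAINING and evalTEST
# are typically combined into an epoch loop with the multiplicative LR decay used in trainNetwork
# (each epoch: LR := LR * LR_decay). 'network' stands for an instance of this class; all other
# names are assumptions for illustration, so the snippet is left commented out.
#
# LR = LR_start
# for epoch in range(max_num_epochs):
#     train_cost, val_cost, val_acc, val_topk_acc = network.evalTRAINING(
#             trainingSpeakerFiles, LR, sourceDataDir=database_binaryDir,
#             storeProcessed=True, processedDir=processedDir)
#     test_cost, test_acc, test_topk_acc = network.evalTEST(
#             testSpeakerFiles, sourceDataDir=database_binaryDir,
#             storeProcessed=True, processedDir=processedDir)
#     logger_RNNtools.info("Epoch %s: val acc = %.2f %%, test acc = %.2f %%", epoch, val_acc, test_acc)
#     LR *= LR_decay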
allAudioLabels_val = []
allValidLabels_val = []
allValidAudioFrames_val = []

allImages_test = []
allMfccs_test = []
allAudioLabels_test = []
allValidLabels_test = []
allValidAudioFrames_test = []

if runType == 'normal':
    for lipspeaker in lipspeakers:
        train, val, test = preprocessingCombined.getOneSpeaker(
                lipspeaker, sourceDataDir=database_binaryDir,
                storeProcessed=False, processedDir=processedDir,
                trainFraction=trainFraction, validFraction=validFraction,
                viseme=viseme, verbose=False, logger=logger_combined)

        images_train, mfccs_train, audioLabels_train, validLabels_train, validAudioFrames_train = train
        images_val, mfccs_val, audioLabels_val, validLabels_val, validAudioFrames_val = val
        images_test, mfccs_test, audioLabels_test, validLabels_test, validAudioFrames_test = test

        allImages_train += images_train
        allMfccs_train += mfccs_train
        allAudioLabels_train += audioLabels_train
        allValidLabels_train += validLabels_train
        allValidAudioFrames_train += validAudioFrames_train

        allImages_val += images_val
# # import numpy as np
# validAudioFrames = np.reshape(validAudioFrames[0], (1, validAudioFrames[0].shape))

## Get the images per video for the lipspeakers
import preprocessingCombined

allImages_train = []; allMfccs_train = []; allAudioLabels_train = []; allValidLabels_train = []; allValidAudioFrames_train = []
allImages_val = []; allMfccs_val = []; allAudioLabels_val = []; allValidLabels_val = []; allValidAudioFrames_val = []
allImages_test = []; allMfccs_test = []; allAudioLabels_test = []; allValidLabels_test = []; allValidAudioFrames_test = []

for lipspeaker in lipspeakers:
    train, val, test = preprocessingCombined.getOneSpeaker(lipspeaker,
                                                           sourceDataDir=database_binaryDir,
                                                           storeProcessed=False, processedDir=processedDir,
                                                           trainFraction=0.7, validFraction=0.1,
                                                           verbose=False)
    images_train, mfccs_train, audioLabels_train, validLabels_train, validAudioFrames_train = train
    images_val, mfccs_val, audioLabels_val, validLabels_val, validAudioFrames_val = val
    images_test, mfccs_test, audioLabels_test, validLabels_test, validAudioFrames_test = test

    allImages_train += images_train
    allMfccs_train += mfccs_train
    allAudioLabels_train += audioLabels_train
    allValidLabels_train += validLabels_train
    allValidAudioFrames_train += validAudioFrames_train

    allImages_val += images_val
    allMfccs_val += mfccs_val
    allAudioLabels_val += audioLabels_val
    allValidLabels_val += validLabels_val