import os  # standard library; the project-local helpers unpickle, preprocessingCombined and logger_combined are assumed to be importable as well

def getData(database_binaryDir, datasetType, processedDir):
    # speaker file names used for the train/test split
    testVolunteerNumbers = ["13F", "15F", "21M", "23M", "24M", "25M", "28M", "29M", "30F", "31F", "34M", "36F", "37F",
                            "43F", "47M", "51F", "54M"]
    testVolunteers = [str(testNumber) + ".pkl" for testNumber in testVolunteerNumbers]
    lipspeakers = ["Lipspkr1.pkl", "Lipspkr2.pkl", "Lipspkr3.pkl"]
    allSpeakers = [f for f in os.listdir(database_binaryDir) if
                   os.path.isfile(os.path.join(database_binaryDir, f)) and os.path.splitext(f)[1] == ".pkl"]
    trainVolunteers = [f for f in allSpeakers if not (f in testVolunteers or f in lipspeakers)]
    if datasetType == "combined":
        trainingSpeakerFiles = trainVolunteers + lipspeakers
        testSpeakerFiles = testVolunteers
    else:  # datasetType == "volunteers":
        trainingSpeakerFiles = trainVolunteers
        testSpeakerFiles = testVolunteers
    # else:
    #     raise Exception("invalid dataset entered")
    datasetFiles = [trainingSpeakerFiles, testSpeakerFiles]
    # get a sample of the dataset to debug the network
    if datasetType == "lipspeakers":
        lipspkr_path = os.path.expanduser("~/TCDTIMIT/combinedSR/TCDTIMIT/binaryLipspeakers/allLipspeakersTest.pkl")
        logger_combined.info("data: %s", lipspkr_path)
        data = unpickle(lipspkr_path)
    else:
        data, _, _ = preprocessingCombined.getOneSpeaker(trainingSpeakerFiles[0],
                                                         sourceDataDir=database_binaryDir,
                                                         storeProcessed=True,
                                                         processedDir=processedDir,
                                                         trainFraction=1.0, validFraction=0.0,
                                                         verbose=False)

    return data, datasetFiles, testSpeakerFiles
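
A minimal usage sketch (the directory layout is taken from trainNetwork further below; adapt the paths to your setup):

root_dir = os.path.expanduser('~/TCDTIMIT/combinedSR/TCDTIMIT')
database_binaryDir = root_dir + '/binary'
processedDir = database_binaryDir + "_finalProcessed"
# returns a debug sample, [trainingSpeakerFiles, testSpeakerFiles], and the list of test speaker files
data, datasetFiles, testSpeakerFiles = getData(database_binaryDir, "combined", processedDir)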
Example #2
# for speakerFile in trainingSpeakerFiles:
#     logger_RNN.info("%s", os.path.basename(speakerFile))
#     preprocessingCombined.getOneSpeaker(
#             speakerFile=speakerFile, sourceDataDir=database_binaryDir,
#             trainFraction=0.8, validFraction=0.2,
#             storeProcessed=True, loadData=False, processedDir=processedDir, logger=logger_RNN)
#
# for speakerFile in testSpeakerFiles:
#     logger_RNN.info("%s", os.path.basename(speakerFile))
#     preprocessingCombined.getOneSpeaker(
#             speakerFile=speakerFile, sourceDataDir=database_binaryDir,
#             trainFraction=0.0, validFraction=0.0,
#             storeProcessed=True, loadData=False, processedDir=processedDir, logger=logger_RNN)

#used for debugging
dataset_test, _, _ = preprocessingCombined.getOneSpeaker(
    trainingSpeakerFiles[0],
    sourceDataDir=database_binaryDir,
    storeProcessed=True,
    processedDir=processedDir,
    trainFraction=1.0,
    validFraction=0.0,
    verbose=True)

##### BUILDING MODEL #####
logger_RNN.info('\n* Building network ...')
RNN_network = NeuralNetwork('RNN',
                            dataset_test,
                            batch_size=batch_size,
                            num_features=nbMFCCs,
                            n_hidden_list=N_HIDDEN_LIST,
                            num_output_units=nbPhonemes,
                            bidirectional=BIDIRECTIONAL,
                            addDenseLayers=ADD_DENSE_LAYERS,
                            seed=0,
Example #3
def trainNetwork(AUDIO_LSTM_HIDDEN_LIST, CNN_NETWORK, cnn_features, LIP_RNN_HIDDEN_LIST, DENSE_HIDDEN_LIST, datasetType, runType, LR_start, forceTrain):
    ##### SCRIPT META VARIABLES #####
    VERBOSE = True
    compute_confusion = False  # TODO: ATM this is not implemented
    
    batch_size_audio = 1  # only works when processing 1 video at a time; the lipreading CNN then processes all the images in this video as one batch
    max_num_epochs = 20
    
    nbMFCCs = 39 # num of features to use -> see 'utils.py' in convertToPkl under processDatabase
    nbPhonemes = 39  # number of output neurons
    #AUDIO_LSTM_HIDDEN_LIST = [256, 256]
    BIDIRECTIONAL = True
    
    # # lipreading
    # CNN_NETWORK = "google"
    # # using CNN-LSTM combo: what to input to LSTM? direct conv outputs or first through dense layers?
    # cnn_features = 'conv' #'dense' # 39 outputs as input to LSTM
    # LIP_RNN_HIDDEN_LIST = None #[256,256]  # set to None to disable CNN-LSTM architecture
    #
    # # after concatenation of audio and lipreading, which dense layers before softmax?
    # DENSE_HIDDEN_LIST = [128,64,64] #[128,128,128,128]
    #
    # # Decaying LR
    # LR_start = 0.001
    logger_combined.info("LR_start = %s", str(LR_start))
    LR_fin = 0.0000001
    # logger_combined.info("LR_fin = %s", str(LR_fin))
    #LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)  # each epoch, LR := LR * LR_decay
    LR_decay= 0.7071
    logger_combined.info("LR_decay = %s", str(LR_decay))
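    # With this multiplicative schedule the LR after n epochs is LR_start * LR_decay**n.
    # The commented-out formula above would land exactly on LR_fin after num_epochs epochs;
    # the hard-coded 0.7071 (~ sqrt(0.5)) instead roughly halves the LR every two epochs.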
    
    # Set locations for DATA, LOG, PARAMETERS, TRAIN info
    dataset = "TCDTIMIT"
    root_dir = os.path.expanduser('~/TCDTIMIT/combinedSR/' + dataset)
    database_binaryDir = root_dir + '/binary'
    processedDir = database_binaryDir + "_finalProcessed"
    
    # datasetType = "lipspeakers"  # ""volunteers";
    if datasetType == "lipspeakers": loadPerSpeaker = False
    else: loadPerSpeaker = True
    
    store_dir = root_dir + os.sep + "results" + os.sep + ("CNN_LSTM" if LIP_RNN_HIDDEN_LIST != None else "CNN") + os.sep + datasetType
    if not os.path.exists(store_dir): os.makedirs(store_dir)
    
    # # which part of the network to train/save/...
    # # runType = 'audio'
    # # runType = 'lipreading'
    # runType = 'combined'
    ###########################
    
    
    # audio network + cnnNetwork + classifierNetwork
    model_name = "RNN__" + str(len(AUDIO_LSTM_HIDDEN_LIST)) + "_LSTMLayer" + '_'.join([str(layer) for layer in AUDIO_LSTM_HIDDEN_LIST]) \
                         + "_nbMFCC" + str(nbMFCCs) + ("_bidirectional" if BIDIRECTIONAL else "_unidirectional") +  "__" \
                 + "CNN_" + CNN_NETWORK + "_" + cnn_features \
                 + ("_lipRNN_" if LIP_RNN_HIDDEN_LIST != None else "") + ('_'.join([str(layer) for layer in LIP_RNN_HIDDEN_LIST]) if LIP_RNN_HIDDEN_LIST != None else "")  + "__" \
                 + "FC_" + '_'.join([str(layer) for layer in DENSE_HIDDEN_LIST]) + "__" \
                 + dataset + "_" + datasetType
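    # Illustrative result (assuming AUDIO_LSTM_HIDDEN_LIST=[256, 256], CNN_NETWORK='google', cnn_features='conv',
    # LIP_RNN_HIDDEN_LIST=None, DENSE_HIDDEN_LIST=[128, 64, 64], datasetType='lipspeakers'):
    #   RNN__2_LSTMLayer256_256_nbMFCC39_bidirectional__CNN_google_conv__FC_128_64_64__TCDTIMIT_lipspeakers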
    model_load = os.path.join(store_dir, model_name + ".npz")
    model_save = os.path.join(store_dir, model_name)
    
    # for loading stored audio models
    audio_dataset = "combined" # TCDTIMIT + TIMIT datasets
    audio_model_name = str(len(AUDIO_LSTM_HIDDEN_LIST)) + "_LSTMLayer" + '_'.join(
            [str(layer) for layer in AUDIO_LSTM_HIDDEN_LIST]) + "_nbMFCC" + str(nbMFCCs) + \
                       ("_bidirectional" if BIDIRECTIONAL else "_unidirectional") + "_" + audio_dataset
    audio_model_dir = os.path.expanduser("~/TCDTIMIT/audioSR/"+audio_dataset+"/results")
    audio_model_path = os.path.join(audio_model_dir, audio_model_name + ".npz")
    
    # for loading stored lipreading models
    lip_model_dir = os.path.join(os.path.expanduser('~/TCDTIMIT/lipreading/' + dataset + "/results"))
    viseme = False; network_type = "google"
    lip_CNN_model_name = datasetType + "_" + network_type + "_" + ("viseme" if viseme else "phoneme") + str(nbPhonemes)
    CNN_model_path = os.path.join(lip_model_dir, lip_CNN_model_name + ".npz")
    
    # for CNN-LSTM networks
    if LIP_RNN_HIDDEN_LIST != None:
        lip_CNN_LSTM_model_name = lip_CNN_model_name + "_LSTM" + '_'.join([str(layer) for layer in LIP_RNN_HIDDEN_LIST])
        lip_CNN_LSTM_model_path = os.path.join(lip_model_dir, lip_CNN_LSTM_model_name + ".npz")
    
    # log file
    logFile = store_dir + os.sep + model_name + '.log'
    if os.path.exists(logFile):
        fh = logging.FileHandler(logFile)       # append to existing log
    else:
        fh = logging.FileHandler(logFile, 'w')  # create new logFile
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger_combined.addHandler(fh)

    print("log file: ", logFile)
    #############################################################
    
    
    logger_combined.info("\n\n\n\n STARTING NEW EVALUATION/TRAINING SESSION AT " + strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    
    ##### IMPORTING DATA #####
    
    logger_combined.info('  data source: ' + database_binaryDir)
    logger_combined.info('  model target: ' + model_save + '.npz')
    
    storeProcessed = True  # if you have about 10 GB of disk space, this speeds things up by not reprocessing the data on every iteration
    # you can just run this program and it will generate the files the first time it encounters them, or generate them manually with datasetToPkl.py

    # speaker file names used for the train/test split
    testVolunteerNumbers = ["13F", "15F", "21M", "23M", "24M", "25M", "28M", "29M", "30F", "31F", "34M", "36F", "37F",
                            "43F", "47M", "51F", "54M"]
    testVolunteers = [str(testNumber) + ".pkl" for testNumber in testVolunteerNumbers]
    lipspeakers = ["Lipspkr1.pkl", "Lipspkr2.pkl", "Lipspkr3.pkl"]
    allSpeakers = [f for f in os.listdir(database_binaryDir) if
                   os.path.isfile(os.path.join(database_binaryDir, f)) and os.path.splitext(f)[1] == ".pkl"]
    trainVolunteers = [f for f in allSpeakers if not (f in testVolunteers or f in lipspeakers)]

    if datasetType == "combined":
        trainingSpeakerFiles = trainVolunteers + lipspeakers
        testSpeakerFiles = testVolunteers
    else:  # datasetType == "volunteers"
        trainingSpeakerFiles = trainVolunteers
        testSpeakerFiles = testVolunteers
    # else:
    #     raise Exception("invalid dataset entered")
    datasetFiles = [trainingSpeakerFiles, testSpeakerFiles]


    # get a sample of the dataset to debug the network
    if datasetType == "lipspeakers":
        lipspkr_path = os.path.expanduser("~/TCDTIMIT/lipreading/TCDTIMIT/binaryPerVideo/allLipspeakersTest.pkl")
        data = unpickle(lipspkr_path)
    else:
        data, _, _ = preprocessingCombined.getOneSpeaker(trainingSpeakerFiles[0],
                                                             sourceDataDir=database_binaryDir,
                                                             storeProcessed=True,
                                                             processedDir=processedDir,
                                                             trainFraction=1.0, validFraction=0.0,
                                                             verbose=False)

    # import pdb;pdb.set_trace()

    ##### BUILDING MODEL #####
    logger_combined.info('\n\n* Building network ...')
    network = NeuralNetwork('combined', dataset=data, loadPerSpeaker = loadPerSpeaker,
                            num_features=nbMFCCs, lstm_hidden_list=AUDIO_LSTM_HIDDEN_LIST,
                            num_output_units=nbPhonemes, bidirectional=BIDIRECTIONAL,
                            cnn_network=CNN_NETWORK, cnn_features = cnn_features,
                            lipRNN_hidden_list=LIP_RNN_HIDDEN_LIST,
                            dense_hidden_list=DENSE_HIDDEN_LIST,
                            debug=False)

    # print number of parameters
    nb_params_CNN_noDense   = lasagne.layers.count_params(network.CNN_lout_features)
    nb_params_CNN           = lasagne.layers.count_params(network.CNN_lout)
    nb_params_lipreading    = lasagne.layers.count_params(network.lipreading_lout_features)
    nb_params_RNN           = lasagne.layers.count_params(network.audioNet_lout_features)
    nb_params               = lasagne.layers.count_params(network.combined_lout)
    logger_combined.info(" # params lipreading Total: %s", nb_params_lipreading)

    if LIP_RNN_HIDDEN_LIST != None:
        logger_combined.info(" # params lipRNN:           %s", nb_params_lipreading - nb_params_CNN)

    if cnn_features == 'dense':
        logger_combined.info(" # params CNN:              %s", nb_params_CNN)
    else:
        logger_combined.info(" # params CNN:              %s", nb_params_CNN_noDense)

    logger_combined.info(" # params audio LSTM:       %s", nb_params_RNN)
    logger_combined.info(" # params combining FC:     %s", nb_params - nb_params_lipreading - nb_params_RNN)
    logger_combined.info(" # params whole network:    %s", nb_params)



    # Try to load stored model
    success = {} # dict that stores whether each model was loaded successfully (i.e. it was trained before and stored on disk)
                 # If so, there is no need to train it again; we can just evaluate the test set to get the results we want.
    logger_combined.info(' Network built. \n\nTrying to load stored model: %s', model_load)
    success['combined'] = network.load_model(model_type='combined', model_path=model_load)
    if not success['combined']:
        logger_combined.warning("No complete network found, loading parts...")

        logger_combined.info("CNN : %s", CNN_model_path)
        success['lipreading'] = network.load_model(model_type='CNN', model_path=CNN_model_path)

        if LIP_RNN_HIDDEN_LIST != None:
            logger_combined.info("CNN_LSTM : %s", lip_CNN_LSTM_model_path)
            success['lipreading'] = network.load_model(model_type='CNN_LSTM', model_path=lip_CNN_LSTM_model_path)

        logger_combined.info("RNN : %s", audio_model_path)
        success['audio'] = network.load_model(model_type='RNN', model_path=audio_model_path)


    ##### COMPILING FUNCTIONS #####
    logger_combined.info("\n\n* Compiling functions ...")
    network.build_functions(train=True, debug=False)

    # get the name of the model we're training/evaluating
    if runType == 'audio':
        model_save = audio_model_path
    elif runType == 'lipreading':
        if LIP_RNN_HIDDEN_LIST != None:
            model_save = lip_CNN_LSTM_model_path
        else:
            model_save = CNN_model_path
    elif runType == 'combined':
        model_save = model_load
    else:
        raise IOError("can't save network params; network output not found")
    model_save = model_save.replace(".npz", "")


    # if the model for this runType already exists (and was loaded successfully), there is no need to train it; just evaluate.
    if success[runType] and not forceTrain:
        network.finalNetworkEvaluation(save_name=model_save,
                                       database_binaryDir=database_binaryDir,
                                       processedDir=processedDir, runType=runType,
                                       storeProcessed=storeProcessed,
                                       testSpeakerFiles=testSpeakerFiles)

    else: # network doesn't exist, we need to train it first.
        ##### TRAINING #####
        logger_combined.info("\n\n* Training ...")



        network.train(datasetFiles, database_binaryDir=database_binaryDir, runType=runType,
                      storeProcessed=True, processedDir=processedDir,
                      num_epochs=max_num_epochs,
                      batch_size=batch_size_audio, LR_start=LR_start, LR_decay=LR_decay,
                      compute_confusion=False, debug=False, save_name=model_save)

    logger_combined.info("\n\n* Done")
    logger_combined.info('Total time: {:.3f}'.format(time.time() - program_start_time))

    # close the log file handler to be able to log to new file
    fh.close()
    logger_combined.removeHandler(fh)
    print(logger_combined.handlers)
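
An illustrative call to trainNetwork, filled in with the default hyperparameters that appear commented out inside the function (forceTrain=False is an assumption):

trainNetwork(AUDIO_LSTM_HIDDEN_LIST=[256, 256],
             CNN_NETWORK="google",
             cnn_features='conv',             # or 'dense'
             LIP_RNN_HIDDEN_LIST=None,        # e.g. [256, 256] to enable the CNN-LSTM architecture
             DENSE_HIDDEN_LIST=[128, 64, 64],
             datasetType="lipspeakers",       # or "volunteers" / "combined"
             runType="combined",              # or 'audio' / 'lipreading'
             LR_start=0.001,
             forceTrain=False)                # assumed value: when True, retrain even if a stored model exists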
Example #4
    def evalTEST(self,
                 testSpeakerFiles,
                 sourceDataDir=None,
                 storeProcessed=False,
                 processedDir=None,
                 verbose=False,
                 logger=logger_RNNtools):

        test_acc = 0
        test_cost = 0
        test_topk_acc = 0
        nb_test_batches = 0
        # for each test speaker, pass over its data (with trainFraction=0 and validFraction=0 everything ends up in the test set) and accumulate the results.
        for speakerFile in tqdm(testSpeakerFiles, total=len(testSpeakerFiles)):
            logger.debug("processing %s", speakerFile)
            train, val, test = preprocessingCombined.getOneSpeaker(
                speakerFile=speakerFile,
                sourceDataDir=sourceDataDir,
                trainFraction=0.0,
                validFraction=0.0,
                storeProcessed=storeProcessed,
                processedDir=processedDir,
                logger=logger)

            images_train, mfccs_train, audioLabels_train, validLabels_train, validAudioFrames_train = train
            images_val, mfccs_val, audioLabels_val, validLabels_val, validAudioFrames_val = val
            images_test, mfccs_test, audioLabels_test, validLabels_test, validAudioFrames_test = test

            if verbose:
                logger.debug("the number of training examples is: %s",
                             len(images_train))
                logger.debug("the number of valid examples is:    %s",
                             len(images_val))
                logger.debug("the number of test examples is:     %s",
                             len(images_test))

            # get results for the test set
            test_cost_one, test_acc_one, test_topk_acc_one, test_batches_one = self.val_epoch(
                mfccs=mfccs_test,
                validLabels=audioLabels_test,
                valid_frames=validAudioFrames_test)
            test_acc += test_acc_one
            test_cost += test_cost_one
            test_topk_acc += test_topk_acc_one
            nb_test_batches += test_batches_one

            if verbose:
                logger.debug("  this speaker results: ")
                logger.debug("\ttest cost:   %s",
                             test_cost_one / test_batches_one)
                logger.debug("\ttest acc rate:  %s %%",
                             test_acc_one / test_batches_one * 100)
                logger.debug("\ttest top 3 acc rate:  %s %%",
                             test_topk_acc_one / test_batches_one * 100)

        # get the average over all speakers
        test_cost /= nb_test_batches
        test_acc = test_acc / nb_test_batches * 100
        test_topk_acc = test_topk_acc / nb_test_batches * 100

        return test_cost, test_acc, test_topk_acc
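
A hedged sketch of a per-epoch driver around these helpers (evalTRAINING is shown in the next example; `net` is assumed to be the trained network object, and the other names come from trainNetwork above):

LR = LR_start
for epoch in range(max_num_epochs):
    # one pass over the training speakers (80/20 train/val split per speaker), then over the test speakers
    train_cost, val_cost, val_acc, val_topk_acc = net.evalTRAINING(
        trainingSpeakerFiles, LR,
        sourceDataDir=database_binaryDir,
        storeProcessed=True, processedDir=processedDir)
    test_cost, test_acc, test_topk_acc = net.evalTEST(
        testSpeakerFiles,
        sourceDataDir=database_binaryDir,
        storeProcessed=True, processedDir=processedDir)
    LR *= LR_decay  # exponential decay, as in trainNetwork above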
Example #5
    def evalTRAINING(self,
                     trainingSpeakerFiles,
                     LR,
                     shuffleEnabled=True,
                     sourceDataDir=None,
                     storeProcessed=False,
                     processedDir=None,
                     verbose=False,
                     logger=logger_RNNtools):
        train_cost = 0
        val_acc = 0
        val_cost = 0
        val_topk_acc = 0
        nb_train_batches = 0
        nb_val_batches = 0

        # for each speaker, pass over the train set, then val set. (test is other files). save the results.
        for speakerFile in tqdm(trainingSpeakerFiles,
                                total=len(trainingSpeakerFiles)):
            logger.debug("processing %s", speakerFile)
            train, val, test = preprocessingCombined.getOneSpeaker(
                speakerFile=speakerFile,
                sourceDataDir=sourceDataDir,
                trainFraction=0.8,
                validFraction=0.2,
                storeProcessed=storeProcessed,
                processedDir=processedDir,
                logger=logger)

            if shuffleEnabled: train = self.shuffle(train)
            images_train, mfccs_train, audioLabels_train, validLabels_train, validAudioFrames_train = train
            images_val, mfccs_val, audioLabels_val, validLabels_val, validAudioFrames_val = val
            images_test, mfccs_test, audioLabels_test, validLabels_test, validAudioFrames_test = test

            if verbose:
                logger.debug("the number of training examples is: %s",
                             len(images_train))
                logger.debug("the number of valid examples is:    %s",
                             len(images_val))
                logger.debug("the number of test examples is:     %s",
                             len(images_test))

            train_cost_one, train_batches_one = self.train_epoch(
                mfccs=mfccs_train,
                validLabels=audioLabels_train,
                valid_frames=validAudioFrames_train,
                LR=LR)
            train_cost += train_cost_one
            nb_train_batches += train_batches_one

            # get results for validation  set
            val_cost_one, val_acc_one, val_topk_acc_one, val_batches_one = self.val_epoch(
                mfccs=mfccs_val,
                validLabels=audioLabels_val,
                valid_frames=validAudioFrames_val)
            val_cost += val_cost_one
            val_acc += val_acc_one
            val_topk_acc += val_topk_acc_one
            nb_val_batches += val_batches_one

            if verbose:
                logger.debug("  this speaker results: ")
                logger.debug("\ttraining cost:     %s",
                             train_cost_one / train_batches_one)
                logger.debug("\tvalidation cost:   %s",
                             val_cost_one / val_batches_one)
                logger.debug("\tvalidation acc rate:  %s %%",
                             val_acc_one / val_batches_one * 100)
                logger.debug("\tvalidation top 3 acc rate:  %s %%",
                             val_topk_acc_one / val_batches_one * 100)

        # get the average over all speakers
        train_cost /= nb_train_batches
        val_cost /= nb_val_batches
        val_acc = val_acc / nb_val_batches * 100  # convert to %
        val_topk_acc = val_topk_acc / nb_val_batches * 100  # convert to %

        return train_cost, val_cost, val_acc, val_topk_acc
allAudioLabels_val = []
allValidLabels_val = []
allValidAudioFrames_val = []
allImages_test = []
allMfccs_test = []
allAudioLabels_test = []
allValidLabels_test = []
allValidAudioFrames_test = []

if runType == 'normal':
    for lipspeaker in lipspeakers:
        train, val, test = preprocessingCombined.getOneSpeaker(
            lipspeaker,
            sourceDataDir=database_binaryDir,
            storeProcessed=False,
            processedDir=processedDir,
            trainFraction=trainFraction,
            validFraction=validFraction,
            viseme=viseme,
            verbose=False,
            logger=logger_combined)
        images_train, mfccs_train, audioLabels_train, validLabels_train, validAudioFrames_train = train
        images_val, mfccs_val, audioLabels_val, validLabels_val, validAudioFrames_val = val
        images_test, mfccs_test, audioLabels_test, validLabels_test, validAudioFrames_test = test

        allImages_train += images_train
        allMfccs_train += mfccs_train
        allAudioLabels_train += audioLabels_train
        allValidLabels_train += validLabels_train
        allValidAudioFrames_train += validAudioFrames_train

        allImages_val += images_val
Example #7
#
# import numpy as np
# validAudioFrames = np.reshape(validAudioFrames[0], (1, validAudioFrames[0].shape))



## Get images per video for lipspeakers
import preprocessingCombined

allImages_train = []; allMfccs_train = []; allAudioLabels_train=[]; allValidLabels_train=[]; allValidAudioFrames_train = []
allImages_val = []; allMfccs_val = []; allAudioLabels_val=[]; allValidLabels_val=[]; allValidAudioFrames_val = []
allImages_test = []; allMfccs_test = []; allAudioLabels_test=[]; allValidLabels_test=[]; allValidAudioFrames_test = []

for lipspeaker in lipspeakers:
    train, val, test = preprocessingCombined.getOneSpeaker(lipspeaker,
                                                           sourceDataDir=database_binaryDir,
                                                           storeProcessed=False, processedDir=processedDir,
                                                           trainFraction=0.7, validFraction=0.1, verbose=False)
    images_train, mfccs_train, audioLabels_train, validLabels_train, validAudioFrames_train = train
    images_val, mfccs_val, audioLabels_val, validLabels_val, validAudioFrames_val = val
    images_test, mfccs_test, audioLabels_test, validLabels_test, validAudioFrames_test = test

    allImages_train += images_train
    allMfccs_train += mfccs_train
    allAudioLabels_train += audioLabels_train
    allValidLabels_train += validLabels_train
    allValidAudioFrames_train += validAudioFrames_train

    allImages_val += images_val
    allMfccs_val += mfccs_val
    allAudioLabels_val += audioLabels_val
    allValidLabels_val += validLabels_val