def copyDBFiles(rootDir, names, targetRoot):
    import os
    from os.path import relpath
    from shutil import copyfile, copytree
    dirList = []
    fileList = []
    for root, dirs, files in os.walk(rootDir):
        for dirname in dirs:
            for name in names:
                if name in dirname:
                    dirList.append(os.path.join(root, dirname))
        for file in files:
            name, extension = os.path.splitext(file)
            # copy phoneme files as well
            if extension == ".txt":  # TODO: change to .VPHN once renaming is done
                fileList.append(os.path.join(root, file))

    print("First 10 files to be copied: ", fileList[0:10])
    print("first 10 dirs to be copied: ", dirList[0:10])

    if query_yes_no(
            "Are you sure you want to copy all these directories from %s to %s?"
            % (rootDir, targetRoot), "yes"):
        nbCopiedDirs = 0
        nbCopiedFiles = 0

        for dir in dirList:
            # relpath takes (path, start); the original call had the arguments
            # swapped, which produced a wrong relative path
            relativePath = relpath(dir, rootDir)
            relativePath = relativePath.replace('/mouths_gray_120', '')
            dest = os.path.join(targetRoot, relativePath)
            #print("copying dir:", dir, " to: ", dest)
            copytree(dir, dest)
            nbCopiedDirs += 1

        for file in fileList:
            relativePath = relpath(file, rootDir)
            dest = os.path.join(targetRoot, relativePath)
            # copyfile fails if the destination directory does not exist yet
            os.makedirs(os.path.dirname(dest), exist_ok=True)
            #print("copying file:", file, " to: ", dest)
            copyfile(file, dest)
            nbCopiedFiles += 1

        print(nbCopiedDirs, " directories have been copied to ", targetRoot)
        print(nbCopiedFiles, " files have been copied to ", targetRoot)
    return dirList
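
# Both helpers above depend on a query_yes_no prompt that this example does not
# show (elsewhere it is imported from general_tools). A minimal sketch of such
# a helper (an assumption, not the original implementation):
def query_yes_no(question, default="yes"):
    """Ask a yes/no question on stdin; return True for yes, False for no."""
    valid = {"yes": True, "y": True, "no": False, "n": False}
    prompt = " [Y/n] " if default == "yes" else " [y/N] "
    while True:
        choice = input(question + prompt).strip().lower()
        if choice == "" and default is not None:
            return valid[default]
        if choice in valid:
            return valid[choice]
        print("Please answer 'yes' or 'no'.")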
def deleteDirs(rootDir, names):
    import os
    import shutil
    dirList = []
    for root, dirs, files in os.walk(rootDir):
        for dirname in dirs:
            for name in names:
                if name in dirname:
                    dirList.append(os.path.join(root, dirname))
    print(dirList)
    if query_yes_no(
            "Are you sure you want to delete all these directories AND THEIR CONTENTS under %s?"
            % rootDir, "yes"):
        nbRemoved = 0
        for dir in dirList:
            print('Deleting dir: %s' % dir)
            shutil.rmtree(dir)
            nbRemoved += 1
        print(nbRemoved, " directories have been deleted")
    return dirList
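
# Hypothetical usage of the two helpers above (the paths are placeholders):
# copiedDirs = copyDBFiles("/data/database", ["mouths_gray_120"], "/data/database_copy")
# deleteDirs("/data/database", ["mouths_gray_120"])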
    def evaluateModel(self,
                      BIDIRECTIONAL,
                      N_HIDDEN_LIST,
                      batch_size,
                      dataName,
                      wavDir,
                      data_store_dir,
                      meanStd_path,
                      model_load,
                      nbMFCCs,
                      store_dir,
                      force_overwrite=False):
        logger_evaluate.info("\n\n\n")

        ####### THE DATA you want to evaluate ##########
        data_store_path = data_store_dir + os.sep + dataName.replace(
            '/', '_') + "_nbMFCC" + str(nbMFCCs)
        if not os.path.exists(data_store_dir): os.makedirs(data_store_dir)
        predictions_path = store_dir + os.sep + dataName.replace(
            '/', '_') + "_predictions.pkl"

        # log file
        logFile = store_dir + os.sep + "Evaluation" + dataName.replace(
            '/', '_') + '.log'
        if os.path.exists(logFile) and not force_overwrite:
            from general_tools import query_yes_no
            # the original passed an unformatted "%s" string; fill in logFile
            if not query_yes_no(
                    "Log file already exists at %s\n Do you want to evaluate again and overwrite?"
                    % logFile, "yes"):
                logger_evaluate.info(
                    "Log file already exists, not re-evaluating.")
                return 0
        fh = logging.FileHandler(logFile, 'w')  # create new logFile
        fh.setLevel(logging.INFO)
        fh.setFormatter(formatter)
        logger_evaluate.addHandler(fh)
        logger_evaluate.info("\n  MODEL:    %s", model_load)
        logger_evaluate.info("\n  WAV_DIR:  %s", wavDir)
        logger_evaluate.info("\n  PREDICTS: %s", predictions_path)
        logger_evaluate.info("\n  LOG:      %s", logFile)
        logger_evaluate.info("\n")

        # GATHERING DATA
        logger_evaluate.info("* Gathering Data ...")
        if os.path.exists(data_store_path + ".pkl"):
            [inputs, targets,
             valid_frames] = unpickle(data_store_path + ".pkl")
            calculateAccuracy = True
            logger_evaluate.info(
                "Successfully loaded preprocessed data, with targets")

        elif os.path.exists(
                data_store_path + "_noTargets.pkl"
        ):  # TODO: make it work for unlabeled datasets. see RNN_tools_lstm.py, eg iterate_minibatch_noTargets.
            [inputs] = unpickle(data_store_path + "_noTargets.pkl")
            calculateAccuracy = False  # we can't as we don't know the correct labels
            logger_evaluate.info(
                "Successfully loaded preprocessed data, no targets")

        else:
            logger_evaluate.info("Data not found, preprocessing...")

            # From WAVS, generate X, y and valid_frames; also store under data_store_dir
            def preprocessLabeledWavs(wavDir, store_dir, name):
                # assumes fixWavs has already been run
                # convert to pkl
                X, y, valid_frames = preprocessWavs.preprocess_dataset(
                    source_path=wavDir,
                    nbMFCCs=nbMFCCs,
                    logger=logger_evaluate)

                X_data_type = 'float32'
                X = preprocessWavs.set_type(X, X_data_type)
                y_data_type = 'int32'
                y = preprocessWavs.set_type(y, y_data_type)
                valid_frames_data_type = 'int32'
                valid_frames = preprocessWavs.set_type(valid_frames,
                                                       valid_frames_data_type)

                return X, y, valid_frames

            def preprocessUnlabeledWavs(wavDir, store_dir, name):  #TODO
                # assumes fixWavs has already been run
                # convert to pkl
                X = preprocessWavs.preprocess_unlabeled_dataset(
                    source_path=wavDir,
                    nbMFCCs=nbMFCCs,
                    logger=logger_evaluate)

                X_data_type = 'float32'
                X = preprocessWavs.set_type(X, X_data_type)

                return X

            # load wavs and labels
            wav_files = transform.loadWavs(wavDir)
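            # Keep only the last three directory levels plus the filename,
            # so logs show a short relative name instead of the full path.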
            wav_filenames = [
                os.sep.join(wav_file.split(os.sep)[-4:])
                for wav_file in wav_files
            ]
            logger_evaluate.info("Found %s files to evaluate \n Example: %s",
                                 len(wav_filenames), wav_filenames[0])
            label_files = transform.loadPhns(wavDir)

            # if source dir doesn't contain labels, we can't calculate accuracy
            calculateAccuracy = True
            if len(wav_files) != len(label_files):
                calculateAccuracy = False
                inputs = preprocessUnlabeledWavs(wavDir=wavDir,
                                                 store_dir=store_dir,
                                                 name=dataName)
            else:
                inputs, targets, valid_frames = preprocessLabeledWavs(
                    wavDir=wavDir, store_dir=store_dir, name=dataName)

            # normalize inputs using dataset Mean and Std_dev;  convert to float32 for GPU evaluation
            with open(meanStd_path, 'rb') as cPickle_file:
                [mean_val, std_val] = cPickle.load(cPickle_file)
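            # normalize presumably applies (x - mean_val) / std_val per feature;
            # the exact behavior is defined in preprocessWavs.normalize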
            inputs = preprocessWavs.normalize(inputs, mean_val, std_val)

            # just to be sure
            X_data_type = 'float32'
            inputs = preprocessWavs.set_type(inputs, X_data_type)

            # Print some information
            logger_evaluate.debug("* Data information")
            logger_evaluate.debug('  inputs')
            logger_evaluate.debug('%s %s', type(inputs), len(inputs))
            logger_evaluate.debug('%s %s', type(inputs[0]), inputs[0].shape)
            logger_evaluate.debug('%s %s', type(inputs[0][0]),
                                  inputs[0][0].shape)
            logger_evaluate.debug('%s', type(inputs[0][0][0]))
            logger_evaluate.debug('y train')
            logger_evaluate.debug('  %s %s', type(targets), len(targets))
            logger_evaluate.debug('  %s %s', type(targets[0]),
                                  targets[0].shape)
            logger_evaluate.debug('  %s %s', type(targets[0][0]),
                                  targets[0][0].shape)

            # slice to have a number of inputs that is a multiple of batch size
            logger_evaluate.info(
                "Not evaluating %s last files (batch size mismatch)",
                len(inputs) % batch_size)
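            # `[:-(n % batch_size) or None]` keeps the whole list when n is an
            # exact multiple of batch_size: -0 is falsy, so `or None` turns the
            # slice into [:None].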
            inputs = inputs[:-(len(inputs) % batch_size) or None]
            if calculateAccuracy:
                targets = targets[:-(len(targets) % batch_size) or None]
                valid_frames = valid_frames[:-(len(valid_frames) %
                                               batch_size) or None]

            # pad the inputs to process batches easily
            inputs = pad_sequences_X(inputs)
            if calculateAccuracy: targets = pad_sequences_y(targets)

            # save the preprocessed data
            logger_evaluate.info("storing preprocessed data to: %s",
                                 data_store_path)
            if calculateAccuracy:
                general_tools.saveToPkl(data_store_path + '.pkl',
                                        [inputs, targets, valid_frames])
            else:
                general_tools.saveToPkl(data_store_path + '_noTargets.pkl',
                                        [inputs])

        # Gather filenames; for debugging
        wav_files = transform.loadWavs(wavDir)
        wav_filenames = [
            os.sep.join(wav_file.split(os.sep)[-4:])
            for wav_file in wav_files
        ]
        logger_evaluate.debug(" # inputs: %s, # wav files: %s", len(inputs),
                              len(wav_files))

        # make a copy of the data because we might need it again to calculate accuracy, and the iterator removes elements from the array
        inputs_bak = copy.deepcopy(inputs)
        if calculateAccuracy:
            targets_bak = copy.deepcopy(targets)
            valid_frames_bak = copy.deepcopy(valid_frames)

        logger_evaluate.info("* Evaluating: pass over Evaluation Set")

        if calculateAccuracy:  # if .phn files are provided, we can check our predictions
            logger_evaluate.info(
                "Getting predictions and calculating accuracy...")
            avg_error, avg_acc, predictions = self.RNN_network.run_epoch(
                X=inputs, y=targets, valid_frames=valid_frames,
                get_predictions=True, batch_size=batch_size)

            logger_evaluate.info("All batches, avg Accuracy: %s", avg_acc)
            inputs = inputs_bak
            targets = targets_bak
            valid_frames = valid_frames_bak

            #uncomment if you want to save everything in one place (takes quite a lot of storage space)
            #general_tools.saveToPkl(predictions_path, [inputs, predictions, targets, valid_frames, avg_acc])

        else:
            # TODO fix this
            predictions = []  # must be initialized; the original used it before assignment
            for inputs, masks, seq_lengths in tqdm(
                    iterate_minibatches_noTargets(inputs,
                                                  batch_size=batch_size,
                                                  shuffle=False),
                    total=len(inputs) // batch_size):
                # get predictions
                nb_inputs = len(inputs)  # usually batch size, but could be lower
                prediction = self.RNN_network.predictions_fn(inputs, masks)
                prediction = np.reshape(prediction, (nb_inputs, -1))
                predictions = predictions + list(prediction)

            inputs = inputs_bak
            #general_tools.saveToPkl(predictions_path, [inputs, predictions])

        # Print information about the predictions
        logger_evaluate.info("* Done")
        end_evaluation_time = time.time()
        eval_duration = end_evaluation_time - program_start_time
        logger_evaluate.info('Total time: {:.3f}'.format(eval_duration))
        # Print the results
        try:
            printEvaluation(wav_filenames,
                            inputs,
                            predictions,
                            targets,
                            valid_frames,
                            avg_acc,
                            range(len(inputs)),
                            logger=logger_evaluate,
                            only_final_accuracy=True)
        except Exception:
            pdb.set_trace()  # drop into the debugger if printing fails
        logger_evaluate.info(
            'Evaluation duration: {:.3f}'.format(eval_duration))
        logger_evaluate.info(
            'Printing duration: {:.3f}'.format(time.time() -
                                               end_evaluation_time))

        # close the log handler
        fh.close()
        logger_evaluate.removeHandler(fh)
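
# Hypothetical call of evaluateModel (all argument values are placeholders):
# evaluator.evaluateModel(BIDIRECTIONAL=True, N_HIDDEN_LIST=[256, 256],
#                         batch_size=64, dataName="TIMIT/test",
#                         wavDir="/data/TIMIT/test", data_store_dir="./stored/",
#                         meanStd_path="./stored/meanStd.pkl",
#                         model_load="best_model.npz", nbMFCCs=39,
#                         store_dir="./results/")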
Example 4
FRAC_TRAINING = 1 - FRAC_TEST  # the val set will be FRAC_TRAINING * FRAC_VAL = 9% of the data; train is 90 - 9 = 81%, test is 10%

##### Everything below is calculated automatically ##########
#############################################################

# store path
target = os.path.join(
    outputDir,
    os.path.basename(dataRootDir) + '_' + str(nbMFCCs) + '_ch')
target_path = target + '.pkl'
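# e.g. with dataRootDir="/data/myDataset" and nbMFCCs=39 (hypothetical values),
# target_path becomes outputDir + "/myDataset_39_ch.pkl"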
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

# Already exists; ask whether to overwrite
if os.path.exists(target_path):
    if not general_tools.query_yes_no(target_path + " exists. Overwrite?",
                                      "no"):
        raise Exception("Not Overwriting")

# set log file
logFile = outputDir + os.sep + os.path.basename(target) + '.log'
fh = logging.FileHandler(logFile, 'w')  # create new logFile
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)

if DEBUG:
    logger.info(
        'DEBUG mode: \tACTIVE, only a small dataset will be preprocessed')
    target_path = target + '_DEBUG.pkl'
else:
    logger.info('DEBUG mode: \tINACTIVE')