Example #1
def genSplit(testDataDir):
    splitConfig = cfg.nnDataDirNameTest + os.path.sep + cfg.splitConfig
    writeSplitConfig(splitConfig)
    datamv   = 'NONE'        
    splitCmd = 'python %s' % (os.path.join(cfg.path_pyTools_scripts, 'dataGen/GenSynData.py'))
    splitCmd = '%s %s %s %s %s' % (splitCmd, splitConfig, testDataDir, testDataDir, datamv)
    exe_cmd(splitCmd, cfg.debug)
    display.self_print('Output features are generated to %s' % (testDataDir), 'highlight')
def createFileLst(dataDirs, dataExts, dataDim, dataListDirs, trnList, valList):
    """ create data lists 
        output *.scp will be in dataListDirs
    """
    dataDirs = dataDirs.split(',')
    dataExts = dataExts.split(',')
    dataDims = [int(x) for x in dataDim.split('_')]
    assert len(dataDirs) == len(
        dataExts), 'Error: sub_1_prepare_list.py dataDirs and dataExts wrong'

    # get the cross-set of file lists
    dataList = lstdirNoExt(dataDirs[0], dataExts[0])
    for dataDir, dataExt in zip(dataDirs[1:], dataExts[1:]):
        listTmp = lstdirNoExt(dataDir, dataExt)
        dataList = crossSet(dataList, listTmp)

    # check if file exists
    if len(dataList) < 1:
        display.self_print("Error: fail to found data. Please check:", 'error')
        display.self_print(
            "path_acous_feats, ext_acous_feats, path_waveform in config.py;",
            'error')
        display.self_print("Please also check the names of input data files.",
                           'error')
        raise Exception("Error: fail to generate file list.")

    # check if data exists
    pre_defined_trn_list = readwrite.read_txt_list(trnList)
    pre_defined_val_list = readwrite.read_txt_list(valList)
    diff_trn_list = diff_list(pre_defined_trn_list, dataList)
    diff_val_list = diff_list(pre_defined_val_list, dataList)
    if len(diff_trn_list):
        display.self_print("Error: training data missing. Please check:",
                           'error')
        print(diff_trn_list)
        raise Exception("Error: fail to prepare file list.")

    if len(diff_val_list):
        display.self_print("Error: validation data missing. Please check:",
                           'error')
        print(diff_val_list)
        raise Exception("Error: fail to prepare file list.")

    # before start, take a simple test on the configuration of feature dimension
    frameNum = None
    for inputDir, featDim, featName in zip(dataDirs[0:-1], dataDims[0:-1],
                                           dataExts[0:-1]):
        inputFile = os.path.join(inputDir,
                                 dataList[0]) + '.' + featName.lstrip('.')
        if os.path.isfile(inputFile):
            tmpframeNum = readwrite.read_raw_mat(inputFile, featDim).shape[0]
            if frameNum is None or frameNum < tmpframeNum:
                frameNum = tmpframeNum

    for inputDir, featDim, featName in zip(dataDirs[0:-1], dataDims[0:-1],
                                           dataExts[0:-1]):
        inputFile = os.path.join(inputDir,
                                 dataList[0]) + '.' + featName.lstrip('.')
        if os.path.isfile(inputFile):
            tmpframeNum = readwrite.read_raw_mat(inputFile, featDim).shape[0]
            if np.abs(frameNum - tmpframeNum) * 1.0 / frameNum > 0.1:
                if featDim == readwrite.read_raw_mat(inputFile, 1).shape[0]:
                    pass
                else:
                    display.self_print("Large mismatch of frame numbers %s" %
                                       (inputFile))
                    display.self_print(
                        "Please check whether inputDim are correct", 'error')
                    display.self_print("Or check input features are corrupted",
                                       'error')
                    raise Exception("Error: mismatch of frame numbers")

    display.self_print('Generating data lists into %s' % (dataListDirs),
                       'highlight')

    if True:
        trainSet = pre_defined_trn_list
        valSet = pre_defined_val_list

        if len(valSet) > len(trainSet):
            display.self_print(
                "Warning: validation set is larger than training set",
                'warning')
            display.self_print("It's better to change train_utts in config.py",
                               'warning')

        trainFileOut = dataListDirs + os.path.sep + 'train.lst'
        trainFilePtr = open(trainFileOut, 'w')
        for fileName in trainSet:
            trainFilePtr.write('%s\n' % (fileName))
        trainFilePtr.close()

        if len(valSet):
            valFileOut = dataListDirs + os.path.sep + 'val.lst'
            valFilePtr = open(valFileOut, 'w')
            for fileName in valSet:
                valFilePtr.write('%s\n' % (fileName))
            valFilePtr.close()
        display.self_print(
            '\ttrain/val sizes: %d, %d' % (len(trainSet), len(valSet)),
            'warning')

    # done
    return
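A minimal usage sketch for the list-checking variant above (not part of the original scripts): the directories, extensions, and dimensions below are placeholders. dataDirs and dataExts are comma-separated strings, dataDim is an underscore-separated string, and trnList/valList point to pre-defined text lists with one utterance name per line.

# Hypothetical call; replace the paths, extensions, and dimensions with the
# values from your own configuration.
createFileLst('/path/to/mgc,/path/to/lf0,/path/to/wav',
              '.mgc,.lf0,.wav',
              '60_1_1',
              '/path/to/lists',
              '/path/to/lists/train_predefined.lst',
              '/path/to/lists/val_predefined.lst')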
Example #3
def exe_cmd(cmd, debug=False):

    display.self_print(cmd + '\n', 'highlight')
    if not debug:
        os.system(cmd)
Example #4
            pass

        tmp_data_nc_config = cfg.tmp_test_data_nc_config

        cmd = 'sh %s' % (
            cfg.path_scripts) + os.path.sep + 'sub_05_package_datanc.sh'
        cmd = cmd + ' %s %s' % (tmp_sub_nc_dir, tmp_idx_dir)
        cmd = cmd + ' testset %s %s %s' % (tmp_test_lst, tmp_data_nc_config,
                                           cfg.path_pyTools_scripts)
        exe_cmd(cmd, cfg.debug)

    if os.path.isfile(tmp_test_nc_scp):
        tmp_test_data_nc_list = readwrite.read_txt_list(tmp_test_nc_scp)
        if len(tmp_test_data_nc_list) < 1:
            display.self_print(
                'Error: not found test data.nc in %s' % (tmp_sub_nc_dir),
                'error')
            quit()
        tmp_test_data_nc_args = ','.join(tmp_test_data_nc_list)
    else:
        display.self_print('Error: not found %s' % (tmp_test_nc_scp), 'error')
        quit()

    # No need to get F0 mean and std
    if False:
        # get F0 mean and std
        dimCnt = 0
        f0Dim = -1

        meanstd_data = readwrite.read_raw_mat(tmp_mv_data, 1)
        for acousExt, acousDim in zip(cfg.ext_acous_feats,
Example #5
    tmp_trn_nc_dir = tmp_nc_dir + os.path.sep + cfg.tmp_nc_dir_train
    tmp_trn_nc_scp = tmp_trn_nc_dir + os.path.sep + 'data.scp'
    tmp_val_nc_dir = tmp_nc_dir + os.path.sep + cfg.tmp_nc_dir_val
    tmp_val_nc_scp = tmp_val_nc_dir + os.path.sep + 'data.scp'

    tmp_mv_data = os.getcwd() + os.path.sep + cfg.tmp_name_mean_file

    tmp_mdn_config = os.getcwd() + os.path.sep + cfg.tmp_data_dir
    tmp_mdn_config = tmp_mdn_config + os.path.sep + cfg.tmp_mdn_config_name

    # Get the string of training data.nc files
    if os.path.isfile(tmp_trn_nc_scp):
        tmp_trn_data_nc_list = readwrite.read_txt_list(tmp_trn_nc_scp)
        if len(tmp_trn_data_nc_list) < 1:
            display.self_print(
                'Error: not found train data.nc in %s' % (tmp_trn_nc_dir),
                'error')
            quit()
        tmp_trn_data_nc_args = ','.join(tmp_trn_data_nc_list)
    else:
        display.self_print('Error: not found %s' % (tmp_trn_nc_scp), 'error')
        quit()

    if os.path.isfile(tmp_val_nc_scp):
        tmp_val_data_nc_list = readwrite.read_txt_list(tmp_val_nc_scp)
        if len(tmp_val_data_nc_list) < 1:
            display.self_print('Warning: val data.nc is not used', 'warning')
            tmp_val_data_nc_args = ''
        tmp_val_data_nc_args = ','.join(tmp_val_data_nc_list)
    else:
        display.self_print(
Example #6
def prepareData(dataDir):

    # create directories
    try:
        os.mkdir(dataDir)
    except OSError:
        pass

    dataListPath = dataDir + os.path.sep + 'lists'
    try:
        os.mkdir(dataListPath)
    except OSError:
        pass

    dataRawDir = dataDir + os.path.sep + cfg.idxDirName
    try:
        os.mkdir(dataRawDir)
    except OSError:
        pass

    dataLinkDir = dataDir + os.path.sep + cfg.linkDirname
    try:
        os.mkdir(dataLinkDir)
    except OSError:
        pass

    testdataDirs = []

    # create file list
    for inputDirSet, outputDirSet, dataPart in zip(cfg.test_inputDirs,
                                                   cfg.test_outputDirs,
                                                   cfg.test_dataDivision):

        display.self_print('Processing ' + dataPart + ' data', 'highlight')

        # get the cross-set of file list
        listInput = lstdirNoExt(inputDirSet[0])
        if cfg.outputUttNum > 0:
            fileList = listInput[0:cfg.outputUttNum]
        else:
            fileList = listInput

        if inputDirSet:
            for inputDir in inputDirSet:
                listInput2 = lstdirNoExt(inputDir)
                fileList = crossSet(fileList, listInput2)

        if outputDirSet:
            for outputDir in outputDirSet:
                listOutput2 = lstdirNoExt(outputDir)
                fileList = crossSet(fileList, listOutput2)

        # writing file lst
        fileListFilePath = dataListPath + os.path.sep + dataPart + '.lst'
        filePtr = open(fileListFilePath, 'w')
        for fileName in fileList:
            filePtr.write('%s\n' % (fileName))
        filePtr.close()

        # create file directories
        dataSaveDir = dataDir + os.path.sep + dataPart
        testdataDirs.append(dataSaveDir)
        try:
            os.mkdir(dataSaveDir)
        except OSError:
            pass

        inputScpList = []
        outputScpList = []

        # create the fileName + fileExt lists
        for inputDir, featDim, featName in zip(inputDirSet, cfg.inputDim,
                                               cfg.inputExt):
            tmpFileScp = dataSaveDir + os.path.sep + featName + '.scp'
            inputScpList.append(tmpFileScp)
            filePtr = open(tmpFileScp, 'w')
            for fileName in fileList:
                filePtr.write('%s%s%s.%s\n' %
                              (inputDir, os.path.sep, fileName, featName))
                os.system("ln -f -s %s%s%s.%s %s%s%s.%s" %
                          (inputDir, os.path.sep, fileName, featName,
                           dataLinkDir, os.path.sep, fileName, featName))
            filePtr.close()

        if outputDirSet:
            for outputDir, featDim, featName in zip(outputDirSet,
                                                    cfg.outputDim,
                                                    cfg.outputExt):
                tmpFileScp = dataSaveDir + os.path.sep + featName + '.scp'
                outputScpList.append(tmpFileScp)
                filePtr = open(tmpFileScp, 'w')
                for fileName in fileList:
                    filePtr.write('%s%s%s.%s\n' %
                                  (outputDir, os.path.sep, fileName, featName))
                    os.system("ln -f -s %s%s%s.%s %s%s%s.%s" %
                              (outputDir, os.path.sep, fileName, featName,
                               dataLinkDir, os.path.sep, fileName, featName))
                filePtr.close()

        filePtr = open(dataSaveDir + os.path.sep + cfg.idxFileName + '.scp',
                       'w')
        for fileName in fileList:
            filePtr.write('%s%s%s.%s\n' %
                          (dataRawDir, os.path.sep, fileName, cfg.idxFileName))
        filePtr.close()

        # create the lab index lists
        cmd = 'python %s/dataPrepare/getLabIdx5ms.py' % (
            cfg.path_pyTools_scripts)
        cmd = '%s %s %s %s %s %s %s' % (cmd, inputDirSet[0], cfg.inputExt[0],
                                        cfg.inputDim[0], dataRawDir,
                                        cfg.idxFileName, fileListFilePath)
        display.self_print('Creating idx', 'highlight')
        exe_cmd(cmd, cfg.debug)

        # write data_config.cfg
        writeDataConfig(dataSaveDir + os.path.sep + 'data_config.py',
                        cfg.idxFileName + '.scp', cfg.fileNumInEachNCPack)

        # pack data
        packDataCmd = 'sh %s/sub_05_package_datanc.sh %s %s' % (
            cfg.path_scripts, dataSaveDir, cfg.path_pyTools_scripts)
        display.self_print('Packing data', 'highlight')
        exe_cmd(packDataCmd, cfg.debug)
    return testdataDirs
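The repeated try/os.mkdir/except OSError blocks above create a directory only if it does not already exist. A minimal sketch of the same idiom using os.makedirs (available since Python 3.2), shown only for comparison and not used by the original scripts:

import os

def make_dir_quiet(path):
    # Equivalent of the try/os.mkdir/except OSError pattern used above:
    # create the directory and silently ignore the case where it already exists.
    os.makedirs(path, exist_ok=True)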
Example #7
def wavCreate(testDataDir):

    mlpgFlag = []
    pythonScript = os.path.join(cfg.path_pyTools_scripts, 'synWav.py')
    if cfg.wavformGenerator == 'WORLD':
        for featName in cfg.wavGenWorldRequire:
            try:
                mlpgFlagIdx = cfg.outputExt.index(featName)
                mlpgFlag.append(cfg.mlpgFlag[mlpgFlagIdx])
            except ValueError:
                print("outputExt in synconfig has no %s " % (featName))
                raise Exception("WORLD vocoder requires %s " % (featName))

    elif cfg.wavformGenerator == 'STRAIGHT':
        for featName in cfg.wavGenWorldRequire:
            try:
                mlpgFlagIdx = cfg.outputExt.index(featName)
                mlpgFlag.append(cfg.mlpgFlag[mlpgFlagIdx])
            except ValueError:
                print("outputExt in synconfig has no %s " % (featName))
                raise Exception("STRAIGHT vocoder requires %s " % (featName))

    else:
        raise Exception("Unknown wavFormGenerator = %s" %
                        (cfg.wavFormGenerator))

    if sum(mlpgFlag) > 0:
        datamv = cfg.mlpgVar
        assert os.path.isdir(datamv), "Cannot find mlpgVar = %s for MLPG" % (datamv)
    else:
        datamv = 'NONE'

    if hasattr(cfg, 'vu_threshold'):
        vuThres = cfg.vu_threshold
        display.self_print("Use vu_threshold %f" % (vuThres), 'highlight')
    else:
        vuThres = 0.5

    exe_cmd("ls %s/*.htk > %s/gen.scp" % (testDataDir, testDataDir), cfg.debug)
    wavCmd = 'python %s %s %f %s %d %s %d %s %d %s %s %s %s %f' \
             % (pythonScript, testDataDir, cfg.wavPostFilter,
                testDataDir, mlpgFlag[0],
                testDataDir, mlpgFlag[1],
                testDataDir, mlpgFlag[2],
                "%s/gen.scp" % (testDataDir),
                cfg.wavformGenerator,
                datamv, cfg.path_pyTools_scripts, vuThres)

    exe_cmd(wavCmd, cfg.debug)

    if cfg.lf0UV:
        uvScript = os.path.join(cfg.path_pyTools_scripts,
                                'dataGen/useUVonLf0.py')
        if cfg.lf0UVExternalDir is None:
            uvCmd = 'python %s %s %s %s %s %s' % (uvScript, testDataDir,
                                                  vuThres, cfg.lf0ext,
                                                  cfg.vuvext, testDataDir)
        else:
            uvCmd = 'python %s %s %s %s %s %s' % (
                uvScript, testDataDir, cfg.lf0UVExternalThre, cfg.lf0ext,
                cfg.lf0UVExternalExt, cfg.lf0UVExternalDir)
        exe_cmd(uvCmd, cfg.debug)

    if cfg.lf02f0:
        f0Script = os.path.join(cfg.path_pyTools, 'speechTools/f0convert.py')
        f0Cmd = 'python %s %s %s %s' % (f0Script, testDataDir, cfg.lf0ext,
                                        cfg.f0ext)
        exe_cmd(f0Cmd, cfg.debug)
Example #8
        f0Script = os.path.join(cfg.path_pyTools, 'speechTools/f0convert.py')
        f0Cmd = 'python %s %s %s %s' % (f0Script, testDataDir, cfg.lf0ext,
                                        cfg.f0ext)
        exe_cmd(f0Cmd, cfg.debug)


if __name__ == "__main__":
    # prepare test data
    if cfg.step03:
        display.self_print_with_date('Step3. Generating from networks', 'h')

        if cfg.step03Prepare_DATA is True:
            testDataDirs = prepareData(cfg.nnDataDirNameTest)
        else:
            testDataDirs = []
            display.self_print('Skip packing data', 'highlight')

        # for each test data dir
        for testDataDir in testDataDirs:
            display.self_print(testDataDir, 'highlight')
            if cfg.step03NNGen is True:
                # generate output data (HTK format) from CURRENNT
                genSynCfg(testDataDir)
                # extract the output features from the HTK generated by CURRENNT
                genSplit(cfg.outputDir)
            else:
                display.self_print('Skip generating output from network',
                                   'highlight')

        if cfg.step03WaveFormGen is True:
            # generate waveform
Example #9
def prepareData():
    """ prepreData: 
        1. create the file list
        2. create the symbolic link to the feature data
        3. create the index file (used by CURRENNT)
        4. create package data of index file (data.nc)
        5. calculate the mean and std for a specific data set
    """
    # create directories
    dataDir = cfg.nnDataDirName
    try:
        os.mkdir(dataDir)
    except OSError:
        pass

    dataListPath = dataDir + os.path.sep + 'lists'
    try:
        os.mkdir(dataListPath)
    except OSError:
        pass

    dataRawDir = dataDir + os.path.sep + cfg.idxDirName
    try:
        os.mkdir(dataRawDir)
    except OSError:
        pass

    # decide whether create the symbolic link to each file
    if len(cfg.inputDirs) == 1 and len(cfg.outputDirs) == 1:
        # no validation set
        flagFileUseSymbolLink = False
    elif listSameContent(cfg.inputDirs) and listSameContent(
            cfg.outputDirs) and listSameContent(
                cfg.inputDirs[0]) and listSameContent(cfg.outputDirs[0]):
        # all data have been in the same directory
        flagFileUseSymbolLink = False
    else:
        flagFileUseSymbolLink = True

    #dataLinkDir = dataDir + os.path.sep + cfg.linkDirname
    dataLinkDirInput = dataDir + os.path.sep + cfg.linkDirname_input
    dataLinkDirOutput = dataDir + os.path.sep + cfg.linkDirname_output
    # prepare for data link
    if flagFileUseSymbolLink:
        try:
            os.mkdir(dataLinkDirInput)
            os.mkdir(dataLinkDirOutput)
        except OSError:
            pass
    else:
        if os.path.islink(dataLinkDirInput):
            os.system("rm %s" % (dataLinkDirInput))
        if os.path.islink(dataLinkDirOutput):
            os.system("rm %s" % (dataLinkDirOutput))
        os.system("ln -s %s %s" % (cfg.inputDirs[0][0], dataLinkDirInput))
        os.system("ln -s %s %s" % (cfg.outputDirs[0][0], dataLinkDirOutput))

    # create file list
    for dataList, inputDirSet, outputDirSet, dataPart in zip(
            cfg.dataLists, cfg.inputDirs, cfg.outputDirs, cfg.dataDivision):

        display.self_print('Processing ' + dataPart + ' data', 'highlight')

        if dataList is None:
            # get the cross-set of file list
            listInput = readwrite.list_file_name_in_dir(inputDirSet[0])
            listOutput = readwrite.list_file_name_in_dir(outputDirSet[0])
            fileList = listInput
            if inputDirSet:
                for inputDir in inputDirSet:
                    listInput2 = readwrite.list_file_name_in_dir(inputDir)
                    fileList, diffSet = crossSet(fileList, listInput2)
                    tmpName = os.path.join(
                        dataListPath,
                        dataPart + os.path.basename(inputDir) + '.dif.lst')
                    readwrite.write_txt_list(diffSet, tmpName)

            if outputDirSet:
                for outputDir in outputDirSet:
                    listOutput2 = readwrite.list_file_name_in_dir(outputDir)
                    fileList, diffSet = crossSet(fileList, listOutput2)
                    tmpName = os.path.join(
                        dataListPath,
                        dataPart + os.path.basename(outputDir) + '.dif.lst')
                    readwrite.write_txt_list(diffSet, tmpName)

            # writing the list of file name
            random.shuffle(fileList)
            fileListFilePath = dataListPath + os.path.sep + dataPart + '.lst'
            readwrite.write_txt_list(fileList, fileListFilePath)
        else:
            fileListFilePath = dataListPath + os.path.sep + dataPart + '.lst'
            os.system("cp %s %s" % (dataList, fileListFilePath))
            fileList = readwrite.read_txt_list(fileListFilePath)

        # before start, take a simple test on the configuration of feature dimension
        frameNum = None
        for inputDir, featDim, featName in zip(inputDirSet, cfg.inputDim,
                                               cfg.inputExt):
            inputFile = os.path.join(inputDir, fileList[0]) + '.' + featName
            if os.path.isfile(inputFile):
                tmpframeNum = readwrite.read_raw_mat(inputFile,
                                                     featDim).shape[0]
                if frameNum is None:
                    frameNum = tmpframeNum
                elif np.abs(frameNum - tmpframeNum) * 1.0 / frameNum > 0.1:
                    display.self_print("Large mismatch of frame numbers %s" %
                                       (fileList[0]))
                    display.self_print(
                        "Please check whether inputDim are correct", 'error')
                    display.self_print("Or check input features are corrupted",
                                       'error')
                    raise Exception("Error: mismatch of frame numbers")

        for outputDir, featDim, featName in zip(outputDirSet, cfg.outputDim,
                                                cfg.outputExt):
            outputFile = os.path.join(outputDir, fileList[0]) + '.' + featName
            if os.path.isfile(outputFile):
                tmpframeNum = readwrite.read_raw_mat(outputFile,
                                                     featDim).shape[0]
                if np.abs(frameNum - tmpframeNum) * 1.0 / frameNum > 0.1:
                    display.self_print("Large mismatch of frame numbers %s" %
                                       (fileList[0]))
                    display.self_print(
                        "Please check whether inputDim are correct", 'error')
                    display.self_print("Or check input features are corrupted",
                                       'error')
                    raise Exception("Error: mismatch of frame numbers")

        # create file directories
        dataSaveDir = dataDir + os.path.sep + dataPart
        try:
            os.mkdir(dataSaveDir)
        except OSError:
            pass

        inputScpList = []
        outputScpList = []

        # create the fileName + fileExt lists
        # create symbolic link
        for inputDir, featDim, featName in zip(inputDirSet, cfg.inputDim,
                                               cfg.inputExt):
            tmpFileScp = dataSaveDir + os.path.sep + featName + '.scp'
            inputScpList.append(tmpFileScp)
            filePtr = open(tmpFileScp, 'w')
            for fileName in fileList:
                # write full path to the feature
                filePtr.write('%s%s%s.%s\n' %
                              (inputDir, os.path.sep, fileName, featName))
                if cfg.step01Prepare_LINK is True and flagFileUseSymbolLink:
                    os.system("ln -f -s %s%s%s.%s %s%s%s.%s" % \
                              (inputDir, os.path.sep, fileName, featName,
                               dataLinkDirInput, os.path.sep, fileName, featName))
            filePtr.close()

        for outputDir, featDim, featName in zip(outputDirSet, cfg.outputDim,
                                                cfg.outputExt):
            tmpFileScp = dataSaveDir + os.path.sep + featName + '.scp'
            outputScpList.append(tmpFileScp)
            filePtr = open(tmpFileScp, 'w')
            for fileName in fileList:
                filePtr.write('%s%s%s.%s\n' %
                              (outputDir, os.path.sep, fileName, featName))
                if cfg.step01Prepare_LINK is True and flagFileUseSymbolLink:
                    os.system("ln -f -s %s%s%s.%s %s%s%s.%s" % \
                              (outputDir, os.path.sep, fileName, featName,
                               dataLinkDirOutput, os.path.sep, fileName, featName))
            filePtr.close()

        # create index file list
        filePtr = open(dataSaveDir + os.path.sep + cfg.idxFileName + '.scp',
                       'w')
        for fileName in fileList:
            filePtr.write('%s%s%s.%s\n' %
                          (dataRawDir, os.path.sep, fileName, cfg.idxFileName))
        filePtr.close()

        # create index files
        if cfg.step01Prepare_IDX is True or cfg.step01Prepare_PACK is True:
            # create the lab index lists
            cmd = 'python %s/dataPrepare/getLabIdx5ms.py' % (
                cfg.path_pyTools_scripts)
            cmd = '%s %s %s %s %s %s %s' % (
                cmd, inputDirSet[0], cfg.inputExt[0], cfg.inputDim[0],
                dataRawDir, cfg.idxFileName, fileListFilePath)
            display.self_print('Creating time index files', 'highlight')
            exe_cmd(cmd, cfg.debug)
        else:
            display.self_print('skip creating time index', 'highlight')

        # package the data
        if cfg.step01Prepare_IDX is True or cfg.step01Prepare_PACK is True:
            # write data_config.cfg
            writeDataConfig(dataSaveDir + os.path.sep + 'data_config.py',
                            cfg.idxFileName + '.scp', cfg.fileNumInEachNCPack)
            # pack data
            packDataCmd = 'sh %s/sub_05_package_datanc.sh %s %s' % (
                cfg.path_scripts, dataSaveDir, cfg.path_pyTools_scripts)

            display.self_print('Packing data', 'highlight')
            exe_cmd(packDataCmd, cfg.debug)
        else:
            display.self_print('skip packing data', 'highlight')

    # create file list
    for inputDirSet, outputDirSet, dataPart in zip(cfg.inputDirs,
                                                   cfg.outputDirs,
                                                   cfg.dataDivision):

        dataSaveDir = dataDir + os.path.sep + dataPart
        inputScpList = []
        outputScpList = []

        for inputDir, featDim, featName in zip(inputDirSet, cfg.inputDim,
                                               cfg.inputExt):
            inputScpList.append(dataSaveDir + os.path.sep + featName + '.scp')

        for outputDir, featDim, featName in zip(outputDirSet, cfg.outputDim,
                                                cfg.outputExt):
            outputScpList.append(dataSaveDir + os.path.sep + featName + '.scp')

        # calculate mean and std
        if dataPart == cfg.computMeanStdOn and cfg.step01Prepare_MV is True:
            display.self_print('Calculating mean and std', 'highlight')

            meanStdTool.meanStdNormMask(
                inputScpList, cfg.inputDim, cfg.inputNormMask,
                dataSaveDir + os.path.sep + cfg.nnDataInputMV)
            display.self_print(
                "\nSave input mean-std as %s" %
                (os.path.join(dataSaveDir, cfg.nnDataInputMV)), 'highlight')

            meanStdTool.meanStdNormMask(
                outputScpList, cfg.outputDim, cfg.outputNormMask,
                dataSaveDir + os.path.sep + cfg.nnDataOutputMV)
            display.self_print(
                "\nSave output mean-std as %s" %
                (os.path.join(dataSaveDir, cfg.nnDataOutputMV)), 'highlight')
        else:
            display.self_print('skip calculating mean and std', 'highlight')
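For orientation, a sketch of the directory layout that prepareData() builds; the names in angle brackets come from the cfg attributes referenced above and depend on the configuration:

# <cfg.nnDataDirName>/
#     lists/                      <dataPart>.lst and *.dif.lst files (one utterance name per line)
#     <cfg.idxDirName>/           time index files created by getLabIdx5ms.py
#     <cfg.linkDirname_input>/    symbolic links to the input feature files (when links are used)
#     <cfg.linkDirname_output>/   symbolic links to the output feature files (when links are used)
#     <dataPart>/                 per-division <featName>.scp lists, data_config.py,
#                                 data.nc packages written by sub_05_package_datanc.sh,
#                                 and the <cfg.nnDataInputMV>/<cfg.nnDataOutputMV> mean-std files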
def trainCURRENNT(model_dir):

    os.chdir(model_dir)
    display.self_print('******** train config ******\n', 'highlight')
    os.system("cat ./%s" % (cfg.nnModelCfgname))
    display.self_print('****************************\n', 'highlight')

    runCmd = '%s --options_file %s --verbose 1' % (cfg.path_currennt,
                                                   cfg.nnModelCfgname)
    runCmd = runCmd + ' >%s 2>%s' % (cfg.tmp_network_trn_log,
                                     cfg.tmp_network_trn_err)

    display.self_print("GPU job submitted. Please wait until terminated.",
                       'ok')
    display.self_print("Please open another terminal to check nvidia-smi",
                       'ok')

    display.self_print(
        "Also check %s" % (os.path.join(model_dir, cfg.tmp_network_trn_log)),
        'ok')
    display.self_print(
        "Also check %s" % (os.path.join(model_dir, cfg.tmp_network_trn_err)),
        'ok')
    exe_cmd(runCmd, cfg.debug)
    display.self_print_with_date(
        'Process terminated. Please check %s %s' %
        (os.path.join(model_dir, cfg.tmp_network_trn_log),
         os.path.join(model_dir, cfg.tmp_network_trn_err)), 'ok')
Example #11
def exe_cmd(cmd, debug=False):
    display.self_print("Execute command:", 'ok')
    display.self_print(cmd + '\n', 'highlight')
    if not debug:
        try:
            subprocess.check_call(cmd, shell=True)
            display.self_print("Command is successfully executed:\n%s\n\n" % (cmd), 'ok')
        except subprocess.CalledProcessError as e:
            display.self_print("Failed to run:" + cmd, 'error')
            display.self_print("Please check the printed error message", 'error')
            display.self_print("Process terminated with %s" % (e.returncode), 'error')
            sys.exit(1)
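A short usage note for this exe_cmd variant (assuming the surrounding imports of subprocess, sys, and display are present): with debug=True the command is only printed; otherwise it is run through the shell and a non-zero exit status terminates the script.

exe_cmd('ls -l', debug=True)   # dry run: print the command without executing it
exe_cmd('ls -l')               # execute via the shell; exits on failure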
            display.self_print_with_date('step1.1 copying data lists', 'm')
            tmp_acous_path = ','.join(cfg.path_acous_feats)
            tmp_feat_ext = ','.join(cfg.ext_acous_feats)
            tmp_feat_dim = '_'.join([str(x) for x in cfg.dim_acous_feats])
            cmd = 'python %s' % (
                cfg.path_scripts) + os.path.sep + 'sub_01_check_list.py'
            cmd = cmd + ' %s,%s' % (tmp_acous_path, cfg.path_waveform)
            cmd = cmd + ' %s,.wav' % (tmp_feat_ext)
            cmd = cmd + ' %s_1' % (tmp_feat_dim)
            cmd = cmd + ' %s' % (tmp_data_scp_dir)
            cmd = cmd + ' %s' % (cfg.trn_list)
            cmd = cmd + ' %s' % (cfg.val_list)
            exe_cmd(cmd, cfg.debug)
        else:
            display.self_print(
                'cannot find %s %s' % (cfg.trn_list, cfg.val_list),
                'error')
            quit()
    else:
        display.self_print_with_date('step1.1 generating data lists', 'm')
        tmp_acous_path = ','.join(cfg.path_acous_feats)
        tmp_feat_ext = ','.join(cfg.ext_acous_feats)
        tmp_feat_dim = '_'.join([str(x) for x in cfg.dim_acous_feats])
        cmd = 'python %s' % (
            cfg.path_scripts) + os.path.sep + 'sub_01_prepare_list.py'
        cmd = cmd + ' %s,%s' % (tmp_acous_path, cfg.path_waveform)
        cmd = cmd + ' %s,.wav' % (tmp_feat_ext)
        cmd = cmd + ' %s_1' % (tmp_feat_dim)
        cmd = cmd + ' %s' % (tmp_data_scp_dir)
        cmd = cmd + ' %f' % (cfg.train_utts)
        exe_cmd(cmd, cfg.debug)
def PrepareScp(InScpFile, OutScpFile, inDim, outDim, allScp, datadir, txtScp, txtDim):
    """ Count the number of frames of each input and output file.
    """
    assert len(InScpFile)==len(inDim), \
        "Unequal length of input scp and in dim"
    assert len(OutScpFile)==0 or len(OutScpFile)==len(outDim), \
        "Unequal length of output scp and in dim"
    
    numSeque = 0                                    # number of sequence
    numFrame = 0                                    # number of total frame
    inPatDim = 0                                    # dimension of input 
    ouPatDim = 0                                    # dimension of output
    maxSeqLe = 0                                    # maximum length of name sequence
    fileLabBuffer = ['',]
    fileInBuffer = ['',]
    fileOutBuffer = ['',]
    seqLenBuffer = [0,]
    
    # Pre-process the input file
    #  check the duration of each input file
    #  keep the shortest duration of input files for one entry
    
    display.self_print("\nNote: Different feature files of one utterance may be different in length.",
                       'warning')
    display.self_print("Trim value shows how many data are discarded in order to match the shortest file.",
                       'warning')
    display.self_print("Large Trim value may indicate data length mismatch.", 'warning')
    display.self_print("Please check configuration carefully if it happens!\n",
                       'warning')
    print("Processing the input file")
    for scpFile, dim, scpIndex in zip(InScpFile, inDim, range(len(InScpFile))):
        fPtr = open(scpFile,'r')
        fileCtr = 0
        
        while 1:
            line = fPtr.readline()
            if len(line.strip())==0:
                break
            
            if scpIndex==0:
                numSeque = numSeque + 1
            fileline = line.strip()
            
            # only check for relative path
            #if not os.path.isfile(fileline):
            #    fileline = datadir + os.path.sep + fileline                
            assert os.path.isfile(fileline), "Can't find file"+fileline

            if scpIndex==0:                         # loading lab file
                fileLabBuffer.append(fileline)
                if len(fileline) >  maxSeqLe:
                    maxSeqLe = len(fileline)
            
            fileInBuffer.append(fileline)           # check the time step of file
            
            if scpIndex>0 and fileline==fileLabBuffer[fileCtr+1]:
                pass                     # if this file is the same as the input lab file
                                         # pass it (for the case when several columns of the 
                                         # input lab will be extracted based on inputMask)
            else:
                try:
                    tempFrame = funcs.Bytes(fileline, dim)/np.dtype(dataType).itemsize
                except TypeError:
                    tempFrame = funcs.Bytes(fileline.encode(), dim)/np.dtype(dataType).itemsize
                    
                if scpIndex==0:
                    numFrame = numFrame + tempFrame
                    seqLenBuffer.append(tempFrame)
                else:
                    if np.abs(seqLenBuffer[fileCtr+1] - tempFrame) / float(seqLenBuffer[fileCtr+1]) > 0.1:
                        if seqLenBuffer[fileCtr+1] > tempFrame:
                            addiFrame = seqLenBuffer[fileCtr+1]-tempFrame
                            display.self_print("%s has %d data, less than %d " % (fileline, tempFrame, seqLenBuffer[fileCtr+1]), 'warning')
                            display.self_print("Other files will be trimmed %d data to fit %s" % (addiFrame,fileline), 'warning')
                        elif seqLenBuffer[fileCtr+1] < tempFrame:
                            addiFrame = -1*seqLenBuffer[fileCtr+1]+tempFrame
                            display.self_print("Trim %d from %s" % (addiFrame,fileline), 'warning')
                
                    if seqLenBuffer[fileCtr+1]>tempFrame:
                        seqLenBuffer[fileCtr+1]=tempFrame
                    
                        
            fileCtr = fileCtr + 1
            print("Input %d %d\r" % (scpIndex, fileCtr), end=' ')
            sys.stdout.flush()
            #sys.stdout.write("\rInput:"+str(fileCtr))
        print("")
        if fPtr.tell() == os.fstat(fPtr.fileno()).st_size:
            flagTer = True                            # all files have been processed
        fPtr.close()    
    assert len(fileInBuffer)-1==(len(inDim)*numSeque), "Unequal file input numbers"
    
    
    # Pre-process the output file
    #  check the duration of output file
    print("Processing the output file")
    if len(OutScpFile)==0:                    # void output files
        for dim in outDim:
            for x in range(numSeque):
                fileOutBuffer.append('#')
    else:                                    # multiple output files
        for scpFile, dim, scpIndex in zip(OutScpFile, outDim, range(len(OutScpFile))):
            fPtr = open(scpFile,'r')
            fileCtr = 0
        
            while 1:
                line = fPtr.readline()
                fileline = line.strip()
                if len(line.strip())==0:
                    break
                
                # only check for relative path
                #if not os.path.isfile(fileline):
                #    fileline = datadir + os.path.sep + fileline
                assert os.path.isfile(fileline), "Can't find file"+fileline

                #print line                
                fileOutBuffer.append(fileline)
                try:
                    tempFrame = funcs.Bytes(fileline, dim)/np.dtype(dataType).itemsize
                except TypeError:
                    tempFrame = funcs.Bytes(fileline.encode(), dim)/np.dtype(dataType).itemsize

                if np.abs(seqLenBuffer[fileCtr+1] - tempFrame) / float(seqLenBuffer[fileCtr+1]) > 0.1:
                    if seqLenBuffer[fileCtr+1] > tempFrame:
                        addiFrame = seqLenBuffer[fileCtr+1]-tempFrame
                        display.self_print("%s has %d data, less than %d " % (fileline, tempFrame, seqLenBuffer[fileCtr+1]), 'warning')
                        display.self_print("Other files will be trimmed %d data to fit %s" % (addiFrame,fileline), 'warning')
                    elif seqLenBuffer[fileCtr+1] < tempFrame:
                        addiFrame = -1*seqLenBuffer[fileCtr+1]+tempFrame
                        display.self_print("Trim %d from %s" % (addiFrame,fileline), 'warning')
                
                if seqLenBuffer[fileCtr+1]>tempFrame:
                    seqLenBuffer[fileCtr+1]=tempFrame
                
                fileCtr = fileCtr + 1
                print("Output %d %d\r" % (scpIndex, fileCtr), end=' ')
                sys.stdout.flush()
            print("")
            fPtr.close()
        assert len(fileOutBuffer)-1==(len(outDim)*numSeque), "Unequal file output numbers"

    # if text scp exists, check
    if len(txtScp) > 0:
        textBuffer = []
        with open(txtScp, 'r') as filePtr:
            for i, fileline in enumerate(filePtr):
                filename = fileline.rstrip('\n')
                name1 = os.path.splitext(os.path.basename(filename))[0]
                name2 = os.path.splitext(os.path.basename(fileLabBuffer[i+1]))[0]
                assert name1==name2, "textScpFile unmatch %s, %s, %d-th line" % (name1, name2, i)
                textBuffer.append(filename)
        assert len(textBuffer)==(len(fileLabBuffer)-1), "textScpFile, unmatched length"
    else:
        textBuffer = []
        
    # Write the scp for packaging data
    scpFileCtr = 1
    fileCtr = 0
    fPtr = open(allScp+str(scpFileCtr), mode='w')
    nameBuf = ['', ]
    numFrameBuf = [0, ]
    numUttBuf  = [0, ]
    frameBuf = 0
    assert numSeque, "Found no utterance to pack"
        
    for i in range(numSeque):
        outputline = "%s %d %d %d" %  \
                     (
                         os.path.splitext(os.path.basename(fileLabBuffer[i+1]))[0], 
                         len(inDim), 
                         len(outDim), 
                         seqLenBuffer[i+1]
                     )
        frameBuf = frameBuf + seqLenBuffer[i+1]
        
        for j in range(len(inDim)):
            index = (j)*numSeque+i+1
            outputline = outputline + " %d %s" % (inDim[j], fileInBuffer[index])
        for j in range(len(outDim)):
            index = (j)*numSeque+i+1
            outputline = outputline + " %d %s" % (outDim[j], fileOutBuffer[index])
        fileCtr = fileCtr + 1
        fPtr.write(outputline)
        
        if len(textBuffer) > 0:
            assert os.path.isfile(textBuffer[i]), "Can't find %s" % (textBuffer[i])
            try:
                itemNum = funcs.Bytes(textBuffer[i], txtDim)/np.dtype(dataType).itemsize
            except TypeError:
                itemNum = funcs.Bytes(textBuffer[i].encode(), txtDim)/np.dtype(dataType).itemsize
            temp = " %d %d %s" % (itemNum, txtDim, textBuffer[i])
            fPtr.write(temp)

        fPtr.write("\n")
        flagLock = False
        
        if fileCtr >= flushThreshold:
            numUttBuf.append(fileCtr)
            numFrameBuf.append(frameBuf)
            fileCtr = 0
            frameBuf = 0        
            fPtr.close()
            nameBuf.append(allScp+str(scpFileCtr))
            scpFileCtr = scpFileCtr + 1
            fPtr = open(allScp+str(scpFileCtr), mode='w')
            flagLock = True
    if flagLock==False:    
        nameBuf.append(allScp+str(scpFileCtr))
        numUttBuf.append(fileCtr)
        numFrameBuf.append(frameBuf)
    fPtr.close()    

    # Return
    assert sum(numUttBuf)==numSeque, "Unequal utterance number"
    return numSeque, numFrame, maxSeqLe, numFrameBuf[1:], numUttBuf[1:], nameBuf[1:]
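For reference, a hedged illustration of the line format that PrepareScp writes to the packaging scp files; the utterance name, dimensions, and paths below are invented:

# <name> <numInputStreams> <numOutputStreams> <seqLen> {<inDim> <inPath>} ... {<outDim> <outPath>} ... [<txtLen> <txtDim> <txtPath>]
# e.g. with a 382-dim lab input, a 60-dim mgc input and a 60-dim mgc output:
#   utt0001 2 1 600 382 /path/lab/utt0001.lab 60 /path/mgc/utt0001.mgc 60 /path/out/utt0001.mgc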
    try:
        acous_dim = int(sys.argv[4])
    except ValueError:
        raise Exception("Error: input acoustic dim incorrect %s, sub_08" %
                        (sys.argv[4]))

    try:
        waveform_quantization_bits = int(sys.argv[5])
    except ValueError:
        raise Exception(
            "Error: waveform quantization config incorrect %s, sub_08" %
            (sys.argv[5]))

    if not os.path.isfile(network_path):
        display.self_print('Error: not found %s' % (network_path), 'error')
        quit()

    network_path_tmp = network_path + '.tmp'
    with open(network_path, 'r') as file_ptr:
        network_data = json.load(file_ptr)

    # change if necessary
    if 'layers' in network_data:
        for layer_idx in range(len(network_data['layers'])):
            # change time resolution for condition layer
            if 'resolution' in network_data['layers'][layer_idx]:
                if network_data['layers'][layer_idx]['resolution'] > 1:
                    network_data['layers'][layer_idx][
                        'resolution'] = resolution
def createFileLst(dataDirs,
                  dataExts,
                  dataDim,
                  dataListDirs,
                  trainortest,
                  trainSetRatio=0.8,
                  random_seed=12345):
    """ create data lists 
        output *.scp will be in dataListDirs
    """
    dataDirs = dataDirs.split(',')
    dataExts = dataExts.split(',')
    dataDims = [int(x) for x in dataDim.split('_')]
    assert len(dataDirs) == len(
        dataExts), 'Error: sub_1_prepare_list.py dataDirs and dataExts wrong'

    # get the cross-set of file lists
    dataList = lstdirNoExt(dataDirs[0], dataExts[0])
    for dataDir, dataExt in zip(dataDirs[1:], dataExts[1:]):
        listTmp = lstdirNoExt(dataDir, dataExt)
        dataList = crossSet(dataList, listTmp)

    # check if file exists
    if len(dataList) < 1:
        display.self_print("Error: fail to generate file list. Please check:",
                           'error')
        display.self_print(
            "path_acous_feats, ext_acous_feats, path_waveform in config.py;",
            'error')
        display.self_print("Please also check the names of input data files.",
                           'error')
        raise Exception("Error: fail to generate file list.")

    # randomize the data file list
    # sort at first,
    dataList.sort()
    random.seed(random_seed)
    random.shuffle(dataList)

    # before start, take a simple test on the configuration of feature dimension
    frameNum = None
    for inputDir, featDim, featName in zip(dataDirs[0:-1], dataDims[0:-1],
                                           dataExts[0:-1]):
        inputFile = os.path.join(inputDir,
                                 dataList[0]) + '.' + featName.lstrip('.')
        if os.path.isfile(inputFile):
            tmpframeNum = readwrite.read_raw_mat(inputFile, featDim).shape[0]
            if frameNum is None or frameNum < tmpframeNum:
                frameNum = tmpframeNum

    for inputDir, featDim, featName in zip(dataDirs[0:-1], dataDims[0:-1],
                                           dataExts[0:-1]):
        inputFile = os.path.join(inputDir,
                                 dataList[0]) + '.' + featName.lstrip('.')
        if os.path.isfile(inputFile):
            tmpframeNum = readwrite.read_raw_mat(inputFile, featDim).shape[0]
            if np.abs(frameNum - tmpframeNum) * 1.0 / frameNum > 0.1:
                if featDim == readwrite.read_raw_mat(inputFile, 1).shape[0]:
                    pass
                else:
                    display.self_print("Large mismatch of frame numbers %s" %
                                       (inputFile))
                    display.self_print(
                        "Please check whether inputDim are correct", 'error')
                    display.self_print("Or check input features are corrupted",
                                       'error')
                    raise Exception("Error: mismatch of frame numbers")

    display.self_print('Generating data lists into %s' % (dataListDirs),
                       'highlight')

    if trainortest == 'test':
        # generating for test set
        display.self_print('\ttest size: %d' % (len(dataList)), 'highlight')
        testFileOut = dataListDirs + os.path.sep + 'test.lst'
        testFilePtr = open(testFileOut, 'w')
        for fileName in dataList:
            testFilePtr.write('%s\n' % (fileName))
        testFilePtr.close()
    else:
        # determine the train and validatition set if necessary
        if trainSetRatio is not None and trainSetRatio > 0.0:
            if trainSetRatio < 1.0:
                # a ratio
                trainSetDivide = int(np.round(len(dataList) * trainSetRatio))
            elif trainSetRatio < len(dataList):
                # absolute value of the number of training utterances
                trainSetDivide = int(trainSetRatio)
            else:
                # a default ratio 0.8
                display.self_print(
                    'Warning: train_utts = 0.8 is used to divide train/val',
                    'warning')
                trainSetDivide = int(np.round(len(dataList) * 0.8))

            trainSet = dataList[0:trainSetDivide]
            valSet = dataList[trainSetDivide:]

            if len(valSet) > len(trainSet):
                display.self_print(
                    "Warning: validation set is larger than training set",
                    'warning')
                display.self_print(
                    "It's better to change train_utts in config.py", 'warning')

            trainFileOut = dataListDirs + os.path.sep + 'train.lst'
            trainFilePtr = open(trainFileOut, 'w')
            for fileName in trainSet:
                trainFilePtr.write('%s\n' % (fileName))
            trainFilePtr.close()

            if len(valSet):
                valFileOut = dataListDirs + os.path.sep + 'val.lst'
                valFilePtr = open(valFileOut, 'w')
                for fileName in valSet:
                    valFilePtr.write('%s\n' % (fileName))
                valFilePtr.close()

            display.self_print(
                '\ttrain/val sizes: %d, %d' % (len(trainSet), len(valSet)),
                'warning')
        else:
            display.self_print('\ttrain/val sizes: %d, 0' % (len(dataList)),
                               'warning')
            trainFileOut = dataListDirs + os.path.sep + 'train.lst'
            trainFilePtr = open(trainFileOut, 'w')
            for fileName in dataList:
                trainFilePtr.write('%s\n' % (fileName))
            trainFilePtr.close()
    # done
    return
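A minimal usage sketch for this train/test variant (placeholder paths, extensions, and dimensions): the intersection of the feature directories is shuffled with the given seed and split into train.lst and val.lst according to trainSetRatio.

# Hypothetical call; 80% of the shuffled utterances go to train.lst,
# the rest to val.lst, both written under /path/to/lists.
createFileLst('/path/to/mgc,/path/to/lf0,/path/to/wav',
              '.mgc,.lf0,.wav',
              '60_1_1',
              '/path/to/lists',
              'train',
              trainSetRatio=0.8)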