import os

import numpy as np

# `dl` (the deeplearn protobuf module), `util`, and `KaldiReadIn` are
# supplied by the surrounding project; their import lines are not part of
# this listing.
def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern,
                  sDataProtoFile):

    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern

    uttIDBatch = []
    uttIDLength = []
    featMatBatch = None
    batchSz = -1
    iFileIdx = 0
    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    uttID, featMat = kaldiIn.next()
    while featMat is not None:
        if batchSz < 0:
            # Rows per output file: cap each file at ~400 MiB of float32
            # features (4 bytes per value, featMat.shape[1] values per row).
            batchSz = 400 * 1024 * 1024 // (4 * featMat.shape[1])
            # One extra column holds the dummy label appended at write time.
            datasetInfo.dimensions = featMat.shape[1] + 1
            datasetInfo.label_start_index = datasetInfo.dimensions - 1

        if featMatBatch is None:
            featMatBatch = featMat
        else:
            featMatBatch = np.vstack([featMatBatch, featMat])
        uttIDBatch.append(uttID)
        uttIDLength.append(featMat.shape[0])

        # Batch full: append a zero label column and write the rows out
        # as one protobuf matrix file.
        if featMatBatch.shape[0] >= batchSz:
            util.WriteProto(
                sDataFileTemplate % iFileIdx,
                util.npy2ProtoMat(
                    np.hstack(
                        [featMatBatch,
                         np.zeros((featMatBatch.shape[0], 1))])))
            iFileIdx += 1
            datasetInfo.size += featMatBatch.shape[0]
            featMatBatch = None
        uttID, featMat = kaldiIn.next()
    kaldiIn.close()

    # Flush the last (possibly partial) batch.
    if featMatBatch is not None:
        util.WriteProto(
            sDataFileTemplate % iFileIdx,
            util.npy2ProtoMat(
                np.hstack([featMatBatch,
                           np.zeros((featMatBatch.shape[0], 1))])))
        iFileIdx += 1
        datasetInfo.size += featMatBatch.shape[0]
    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
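
For reference, a minimal invocation sketch. The paths and file names below
are hypothetical placeholders, not values taken from the source project;
note that sDataFileTemplate must contain a %d slot for the file index:

# Hypothetical call: batch feats.ark into data/test_*.pbm files and write
# the dataset descriptor proto alongside them.
uttIDs, uttLens = createTestSet(
    ark_file='feats.ark',
    sDataFileTemplate='data/test_%d.pbm',
    sDataDir='data',
    sDataFilePattern='test_%d.pbm',
    sDataProtoFile='data/dataset_test.proto')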
Example #4
    # (This example begins mid-function: `arguments` and `ark_file` are
    # defined on earlier lines that the listing does not show.)
    wdir = os.path.abspath(arguments['wdir'])
    output_file_prefix = arguments['output_file_prefix']
    sModelFile = arguments['model_file']
    sDeeplearnPath = arguments['deeplearn_path']

    # paths for output files
    output_scp = output_file_prefix + '.scp'
    output_ark = output_file_prefix + '.ark'
    removeFile(output_scp)
    removeFile(output_ark)

    sDataDir = os.path.join(wdir, 'data')
    if not os.path.exists(sDataDir):
        os.mkdir(sDataDir)

    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    kaldiOut = KaldiWriteOut(output_scp, output_ark)
    kaldiOut.open()
    uttIDBatch = []
    uttIDLength = []
    featMatBatch = None
    batchSz = -1
    uttID, featMat = kaldiIn.next()
    while featMat is not None:
        if batchSz < 0:
            # Cap each in-memory batch at ~300 MiB of float32 features.
            batchSz = 300 * 1024 * 1024 // (4 * featMat.shape[1])

        if featMatBatch is None:
            featMatBatch = featMat
        else:
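            # The listing breaks off at this point. Judging by the identical
            # loop in createTestSet above, the accumulation presumably
            # continues like so (a reconstruction, not the original code):
            featMatBatch = np.vstack([featMatBatch, featMat])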