def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern,
                  sDataProtoFile):
    """Convert a Kaldi ark file into PBM test-set shards plus a dataset proto.

    Reads (uttID, featMat) pairs from *ark_file*, buffers them into shards of
    roughly 400 MB (assuming 4-byte floats), appends a zero-filled label
    column to each shard, and writes each shard with util.WriteProto using
    *sDataFileTemplate* % shard_index as the file name.  Finally the
    DatabaseInfo proto describing the dataset is written to *sDataProtoFile*.

    Args:
        ark_file: path to the Kaldi ark file to read.
        sDataFileTemplate: printf-style template (one integer slot) naming
            the shard files.
        sDataDir: directory prefix recorded in the proto.
        sDataFilePattern: shard file pattern recorded in the proto.
        sDataProtoFile: output path for the DatabaseInfo proto.

    Returns:
        Tuple (uttIDBatch, uttIDLength): all utterance ids seen, in read
        order, and the frame count of each.
    """
    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern

    uttIDBatch = []
    uttIDLength = []
    pendingMats = []   # feature matrices buffered for the current shard
    pendingRows = 0    # total rows currently buffered
    batchSz = -1
    iFileIdx = 0
    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    try:
        uttID, featMat = kaldiIn.next()
        while featMat is not None:
            if batchSz < 0:
                # ~400 MB per shard at 4 bytes per feature value.  Floor
                # division keeps batchSz an int under Python 3 (plain `/`
                # would yield a float here).
                batchSz = 400 * 1024 * 1024 // (4 * featMat.shape[1])
                # One extra column is reserved for the (sparse) label.
                datasetInfo.dimensions = featMat.shape[1] + 1
                datasetInfo.label_start_index = datasetInfo.dimensions - 1

            # Buffer matrices and stack once per shard instead of
            # vstack-ing on every utterance, which is quadratic in the
            # number of rows per shard.
            pendingMats.append(featMat)
            pendingRows += featMat.shape[0]
            uttIDBatch.append(uttID)
            uttIDLength.append(featMat.shape[0])

            if pendingRows >= batchSz:
                featMatBatch = np.vstack(pendingMats)
                util.WriteProto(
                    sDataFileTemplate % iFileIdx,
                    util.npy2ProtoMat(
                        np.hstack(
                            [featMatBatch,
                             np.zeros((featMatBatch.shape[0], 1))])))
                iFileIdx += 1
                datasetInfo.size += featMatBatch.shape[0]
                pendingMats = []
                pendingRows = 0
            uttID, featMat = kaldiIn.next()
    finally:
        # Ensure the reader is released even if a write fails mid-loop.
        kaldiIn.close()

    # Flush the final partial shard, if any rows remain buffered.
    if pendingMats:
        featMatBatch = np.vstack(pendingMats)
        util.WriteProto(
            sDataFileTemplate % iFileIdx,
            util.npy2ProtoMat(
                np.hstack([featMatBatch,
                           np.zeros((featMatBatch.shape[0], 1))])))
        iFileIdx += 1
        datasetInfo.size += featMatBatch.shape[0]
    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern, sDataProtoFile):
    """Write a Kaldi ark file out as PBM test-set shards and a dataset proto.

    NOTE(review): this is a duplicate of the createTestSet defined earlier in
    this file and, being later, shadows it — confirm which copy is intended
    and remove the other.

    Args:
        ark_file: path to the Kaldi ark file to read.
        sDataFileTemplate: printf-style template (one integer slot) naming
            the shard files.
        sDataDir: directory prefix recorded in the proto.
        sDataFilePattern: shard file pattern recorded in the proto.
        sDataProtoFile: output path for the DatabaseInfo proto.

    Returns:
        Tuple (uttIDBatch, uttIDLength): utterance ids in read order and
        the frame count of each.
    """
    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern

    uttIDBatch = []
    uttIDLength = []
    featMatBatch = None
    batchSz = -1
    iFileIdx = 0
    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    uttID, featMat = kaldiIn.next()
    while featMat is not None:
        if batchSz < 0:
            # ~400 MB per shard at 4 bytes per value.  Floor division keeps
            # batchSz an int under Python 3 (`/` would produce a float).
            batchSz = 400 * 1024 * 1024 // (4 * featMat.shape[1])
            # Last column is reserved for the (sparse) label.
            datasetInfo.dimensions = featMat.shape[1] + 1
            datasetInfo.label_start_index = datasetInfo.dimensions - 1

        if featMatBatch is None:
            featMatBatch = featMat
        else:
            featMatBatch = np.vstack([featMatBatch, featMat])
        uttIDBatch.append(uttID)
        uttIDLength.append(featMat.shape[0])

        # Flush a shard once enough rows have accumulated; the appended
        # zeros column is the placeholder label.
        if featMatBatch.shape[0] >= batchSz:
            util.WriteProto(sDataFileTemplate % iFileIdx, util.npy2ProtoMat(np.hstack([featMatBatch, np.zeros((featMatBatch.shape[0], 1))])))
            iFileIdx += 1
            datasetInfo.size += featMatBatch.shape[0]
            featMatBatch = None
        uttID, featMat = kaldiIn.next()
    kaldiIn.close()

    # Flush the final partial shard, if any.
    if featMatBatch is not None:
        util.WriteProto(sDataFileTemplate % iFileIdx, util.npy2ProtoMat(np.hstack([featMatBatch, np.zeros((featMatBatch.shape[0], 1))])))
        iFileIdx += 1
        datasetInfo.size += featMatBatch.shape[0]
    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
        else:
            featMatBatch = np.vstack([featMatBatch, featMat])
        uttIDBatch.append(uttID)
        uttIDLength.append(featMat.shape[0])
        
        if featMatBatch.shape[0] >= batchSz:
            featOut = extractRepresentation(featMatBatch, sDataDir, sDeeplearnPath, sModelFile)
            rIdx = 0
            for i, uId in enumerate(uttIDBatch):
                kaldiOut.write(uId, featOut[rIdx:(rIdx + uttIDLength[i]), :])
                rIdx += uttIDLength[i]
            
            featMatBatch = None
            uttIDBatch = []
            uttIDLength = []
        uttID, featMat = kaldiIn.next()
    
    # final batch
    if featMatBatch.shape[0] > 0:
        featOut = extractRepresentation(featMatBatch, sDataDir, sDeeplearnPath, sModelFile)
        rIdx = 0
        for i, uId in enumerate(uttIDBatch):
            kaldiOut.write(uId, featOut[rIdx:(rIdx + uttIDLength[i]), :])
            rIdx += uttIDLength[i]
    kaldiIn.close()
    kaldiOut.close()




# Example #4 (paste residue — the snippet below is a fragment whose
# enclosing function header was lost)
        if featMatBatch is None:
            featMatBatch = featMat
        else:
            featMatBatch = np.vstack([featMatBatch, featMat])
        uttIDBatch.append(uttID)
        uttIDLength.append(featMat.shape[0])

        if featMatBatch.shape[0] >= batchSz:
            featOut = extractRepresentation(featMatBatch, sDataDir,
                                            sDeeplearnPath, sModelFile)
            rIdx = 0
            for i, uId in enumerate(uttIDBatch):
                kaldiOut.write(uId, featOut[rIdx:(rIdx + uttIDLength[i]), :])
                rIdx += uttIDLength[i]

            featMatBatch = None
            uttIDBatch = []
            uttIDLength = []
        uttID, featMat = kaldiIn.next()

    # final batch
    if featMatBatch.shape[0] > 0:
        featOut = extractRepresentation(featMatBatch, sDataDir, sDeeplearnPath,
                                        sModelFile)
        rIdx = 0
        for i, uId in enumerate(uttIDBatch):
            kaldiOut.write(uId, featOut[rIdx:(rIdx + uttIDLength[i]), :])
            rIdx += uttIDLength[i]
    kaldiIn.close()
    kaldiOut.close()