def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern, sDataProtoFile):
    """Convert a Kaldi feature archive into batched deeplearn PBM test-set files.

    Reads utterances from *ark_file*, accumulates their feature matrices into
    batches of roughly 400 MB of float32 data, appends a zero label column to
    each batch, and writes every batch as a proto matrix file named by
    *sDataFileTemplate* (a ``%``-style template taking the batch index).
    Finally writes a DatabaseInfo proto describing the dataset to
    *sDataProtoFile*.

    Args:
        ark_file: path to the Kaldi .ark file to read (opened via KaldiReadIn).
        sDataFileTemplate: ``%d``-style template for per-batch output files.
        sDataDir: directory recorded as the dataset's path_prefix.
        sDataFilePattern: file pattern recorded in the DatasetInfo proto.
        sDataProtoFile: output path for the DatabaseInfo proto.

    Returns:
        Tuple ``(uttIDBatch, uttIDLength)``: the utterance IDs in read order
        and the number of frames contributed by each utterance.
    """
    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern

    uttIDBatch = []
    uttIDLength = []
    featMatBatch = None
    batchSz = -1          # rows per output batch; computed from the first utterance
    iFileIdx = 0

    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    uttID, featMat = kaldiIn.next()
    while featMat is not None:
        if batchSz < 0:
            # Target ~400 MB of float32 per batch. BUG FIX: use floor
            # division so batchSz stays an int under Python 3 (plain `/`
            # yields a float there; `//` is equivalent under Python 2).
            batchSz = 400 * 1024 * 1024 // (4 * featMat.shape[1])
            # +1 for the zero label column appended below.
            datasetInfo.dimensions = featMat.shape[1] + 1
            datasetInfo.label_start_index = datasetInfo.dimensions - 1
        if featMatBatch is None:
            featMatBatch = featMat
        else:
            featMatBatch = np.vstack([featMatBatch, featMat])
        uttIDBatch.append(uttID)
        uttIDLength.append(featMat.shape[0])
        if featMatBatch.shape[0] >= batchSz:
            # Flush the full batch: append a zero label column and write it.
            util.WriteProto(
                sDataFileTemplate % iFileIdx,
                util.npy2ProtoMat(
                    np.hstack(
                        [featMatBatch, np.zeros((featMatBatch.shape[0], 1))])))
            iFileIdx += 1
            datasetInfo.size += featMatBatch.shape[0]
            featMatBatch = None
        uttID, featMat = kaldiIn.next()
    kaldiIn.close()

    # Flush the final partial batch, if any frames remain.
    if featMatBatch is not None:
        util.WriteProto(
            sDataFileTemplate % iFileIdx,
            util.npy2ProtoMat(
                np.hstack([featMatBatch, np.zeros((featMatBatch.shape[0], 1))])))
        iFileIdx += 1
        datasetInfo.size += featMatBatch.shape[0]

    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern, sDataProtoFile):
    """Write a Kaldi archive out as batched deeplearn PBM test-set files.

    NOTE(review): this is a duplicate definition of ``createTestSet`` —
    whichever ``def`` appears later in the module shadows the earlier one.
    Confirm which copy is intended and delete the other.

    Streams utterances from *ark_file*, stacking feature matrices until a
    batch holds ~400 MB of float32 data, then writes each batch (with a
    zero label column appended) via ``sDataFileTemplate % batch_index``.
    A DatabaseInfo proto describing the resulting dataset is written to
    *sDataProtoFile*.

    Returns:
        ``(uttIDBatch, uttIDLength)`` — utterance IDs in read order and the
        frame count of each utterance.
    """
    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern

    uttIDBatch, uttIDLength = [], []
    featMatBatch = None
    batchSz = -1   # rows per batch, derived from the first utterance's width
    iFileIdx = 0

    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    uttID, featMat = kaldiIn.next()
    while featMat is not None:
        if batchSz < 0:
            # BUG FIX: floor division keeps batchSz integral on Python 3
            # (`/` would produce a float); identical result on Python 2.
            batchSz = 400 * 1024 * 1024 // (4 * featMat.shape[1])
            # Dimensions include the zero label column added at write time.
            datasetInfo.dimensions = featMat.shape[1] + 1
            datasetInfo.label_start_index = datasetInfo.dimensions - 1
        if featMatBatch is None:
            featMatBatch = featMat
        else:
            featMatBatch = np.vstack([featMatBatch, featMat])
        uttIDBatch.append(uttID)
        uttIDLength.append(featMat.shape[0])
        if featMatBatch.shape[0] >= batchSz:
            # Batch is full: append label column, write, reset accumulator.
            util.WriteProto(
                sDataFileTemplate % iFileIdx,
                util.npy2ProtoMat(
                    np.hstack(
                        [featMatBatch, np.zeros((featMatBatch.shape[0], 1))])))
            iFileIdx += 1
            datasetInfo.size += featMatBatch.shape[0]
            featMatBatch = None
        uttID, featMat = kaldiIn.next()
    kaldiIn.close()

    # Emit whatever frames are left over as a final, smaller batch.
    if featMatBatch is not None:
        util.WriteProto(
            sDataFileTemplate % iFileIdx,
            util.npy2ProtoMat(
                np.hstack([featMatBatch, np.zeros((featMatBatch.shape[0], 1))])))
        iFileIdx += 1
        datasetInfo.size += featMatBatch.shape[0]

    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
else: featMatBatch = np.vstack([featMatBatch, featMat]) uttIDBatch.append(uttID) uttIDLength.append(featMat.shape[0]) if featMatBatch.shape[0] >= batchSz: featOut = extractRepresentation(featMatBatch, sDataDir, sDeeplearnPath, sModelFile) rIdx = 0 for i, uId in enumerate(uttIDBatch): kaldiOut.write(uId, featOut[rIdx:(rIdx + uttIDLength[i]), :]) rIdx += uttIDLength[i] featMatBatch = None uttIDBatch = [] uttIDLength = [] uttID, featMat = kaldiIn.next() # final batch if featMatBatch.shape[0] > 0: featOut = extractRepresentation(featMatBatch, sDataDir, sDeeplearnPath, sModelFile) rIdx = 0 for i, uId in enumerate(uttIDBatch): kaldiOut.write(uId, featOut[rIdx:(rIdx + uttIDLength[i]), :]) rIdx += uttIDLength[i] kaldiIn.close() kaldiOut.close()
if featMatBatch is None: featMatBatch = featMat else: featMatBatch = np.vstack([featMatBatch, featMat]) uttIDBatch.append(uttID) uttIDLength.append(featMat.shape[0]) if featMatBatch.shape[0] >= batchSz: featOut = extractRepresentation(featMatBatch, sDataDir, sDeeplearnPath, sModelFile) rIdx = 0 for i, uId in enumerate(uttIDBatch): kaldiOut.write(uId, featOut[rIdx:(rIdx + uttIDLength[i]), :]) rIdx += uttIDLength[i] featMatBatch = None uttIDBatch = [] uttIDLength = [] uttID, featMat = kaldiIn.next() # final batch if featMatBatch.shape[0] > 0: featOut = extractRepresentation(featMatBatch, sDataDir, sDeeplearnPath, sModelFile) rIdx = 0 for i, uId in enumerate(uttIDBatch): kaldiOut.write(uId, featOut[rIdx:(rIdx + uttIDLength[i]), :]) rIdx += uttIDLength[i] kaldiIn.close() kaldiOut.close()