# Assumed module-level imports (expected at the top of this file):
# glob, os, subprocess, numpy as np, the util helpers, the deeplearn
# protobuf module as dl, and the KaldiReadIn / read_dataset readers.


def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern,
                  sDataProtoFile):
    # Describe the test set for the deeplearn data handler.
    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern

    uttIDBatch = []
    uttIDLength = []
    featMatBatch = None
    batchSz = -1
    iFileIdx = 0

    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    uttID, featMat = kaldiIn.next()
    while featMat is not None:
        if batchSz < 0:
            # Size batches to roughly 400 MB of float32 features
            # (integer division).
            batchSz = 400 * 1024 * 1024 // (4 * featMat.shape[1])
            # One extra column holds the (dummy) label.
            datasetInfo.dimensions = featMat.shape[1] + 1
            datasetInfo.label_start_index = datasetInfo.dimensions - 1
        if featMatBatch is None:
            featMatBatch = featMat
        else:
            featMatBatch = np.vstack([featMatBatch, featMat])
        uttIDBatch.append(uttID)
        uttIDLength.append(featMat.shape[0])
        if featMatBatch.shape[0] >= batchSz:
            # Flush the accumulated batch, appending a zero label column.
            util.WriteProto(
                sDataFileTemplate % iFileIdx,
                util.npy2ProtoMat(
                    np.hstack(
                        [featMatBatch,
                         np.zeros((featMatBatch.shape[0], 1))])))
            iFileIdx += 1
            datasetInfo.size += featMatBatch.shape[0]
            featMatBatch = None
        uttID, featMat = kaldiIn.next()
    kaldiIn.close()

    # Flush the last, partially filled batch.
    if featMatBatch is not None:
        util.WriteProto(
            sDataFileTemplate % iFileIdx,
            util.npy2ProtoMat(
                np.hstack([featMatBatch,
                           np.zeros((featMatBatch.shape[0], 1))])))
        iFileIdx += 1
        datasetInfo.size += featMatBatch.shape[0]

    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
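# A minimal usage sketch for createTestSet. The paths below are
# hypothetical; KaldiReadIn is assumed to yield (uttID, featMat) pairs
# and (None, None) at end of stream, as the loop above relies on. Note
# that sDataFileTemplate must contain a %d placeholder whose expansions
# are matched by sDataFilePattern:
#
#   uttIDs, uttLens = createTestSet(
#       ark_file='feats.ark',
#       sDataFileTemplate='/tmp/test/data%05d.pbm',
#       sDataDir='/tmp/test',
#       sDataFilePattern='data[0-9]*.pbm',
#       sDataProtoFile='/tmp/test/data.pbtxt')
#   # uttIDs[i] names the i-th utterance; uttLens[i] is its frame count,
#   # so callers can slice per-utterance rows back out of the output.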
def pfile2Proto(pfilePath, filePrefix, pbmDir):
    # Read the pfile in ~400 MB partitions.
    pfile = read_dataset(pfilePath, {'partition': 1024 * 1024 * 400})
    dsInfo = dl.DatasetInfo()
    dsInfo.data_format = dl.DatasetInfo.PBM
    dsInfo.sparse_label = True
    dsInfo.file_pattern = '%s[0-9]*.pbm' % filePrefix
    dim = None
    sz = 0
    for i, (data, label) in enumerate(zip(pfile.feat_mats, pfile.label_vecs)):
        # Append the label vector as an extra column and write one
        # .pbm file per partition.
        dataset = util.npy2ProtoMat(np.hstack([data, label[:, None]]))
        util.WriteProto(
            os.path.join(pbmDir, '%s%05d.pbm' % (filePrefix, i)), dataset)
        if dim is None:
            dim = data.shape[1] + 1
        if dim != data.shape[1] + 1:
            print dim, sz, data.shape, label.shape
        assert dim == data.shape[1] + 1
        sz += data.shape[0]
    dsInfo.size = sz
    dsInfo.dimensions = dim
    dsInfo.label_start_index = dim - 1
    return dsInfo
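# A minimal usage sketch for pfile2Proto, with hypothetical paths. It
# assumes read_dataset returns an object with parallel feat_mats and
# label_vecs lists, and that dl.DatasetInfo has a TRAIN_SET enum value
# by analogy with the TEST_SET used elsewhere in this file. The returned
# DatasetInfo is merged into a DatabaseInfo by the caller:
#
#   dsInfo = pfile2Proto('train.pfile', 'train', '/tmp/pbm')
#   dbInfo = dl.DatabaseInfo()
#   dbInfo.name = 'dataset_train'
#   dbInfo.data_handler = 'deeplearn'
#   dbInfo.path_prefix = '/tmp/pbm'
#   dbInfo.data.add().CopyFrom(dsInfo)
#   dbInfo.data[0].type = dl.DatasetInfo.TRAIN_SET  # assumed enum value
#   util.WriteProto('/tmp/pbm/data.pbtxt', dbInfo)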
def extractRepresentation(data, wdir, sDeeplearnPath, sModelFile):
    # Append a dummy label column; the network ignores it at test time.
    data = np.hstack([data, np.zeros((data.shape[0], 1))])
    npyData = util.npy2ProtoMat(data)
    sDataFile = os.path.join(wdir, 'input.pbm')
    util.WriteProto(sDataFile, npyData)

    # Describe the single-file test set.
    sDataProtoFile = os.path.join(wdir, 'data.pbtxt')
    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_extract'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = wdir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = data.shape[0]
    datasetInfo.dimensions = data.shape[1]
    datasetInfo.label_start_index = datasetInfo.dimensions - 1
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = 'input.pbm'
    util.WriteProto(sDataProtoFile, dbInfo)

    # Build the evaluation operation that dumps the 'conv2' activations.
    sEvalOpFile = os.path.join(wdir, 'eval.pbtxt')
    sExtractedActs = os.path.join(wdir, 'acts')
    sLayerName = 'conv2'
    if not os.path.exists(sExtractedActs):
        os.mkdir(sExtractedActs)
    evalOp = dl.Operation()
    evalOp.name = 'extract'
    evalOp.stop_condition.all_processed = True
    evalOp.operation_type = dl.Operation.TEST
    evalOp.data_proto = sDataProtoFile
    evalOp.randomize = False
    evalOp.get_last_piece = True
    evalOp.verbose = False
    evalOp.extracted_layers.append(sLayerName)
    evalOp.extracted_output_dir = sExtractedActs
    evalOp.extracted_data_format = dl.DatasetInfo.PBM
    evalOp.extracted_data_sets.append(dl.DatasetInfo.TEST_SET)
    util.WriteProto(sEvalOpFile, evalOp)

    # Remove stale activation files from any previous run.
    sOutFileTemplate = os.path.join(sExtractedActs, sLayerName, '*.pbm')
    for s in sorted(glob.glob(sOutFileTemplate)):
        try:
            os.remove(s)
        except OSError:
            pass

    # Run the network.
    args = [sDeeplearnPath, 'extract', sModelFile,
            '--eval-op=%s' % sEvalOpFile]
    pr = subprocess.Popen(args, stderr=subprocess.STDOUT)
    pr.wait()
    if pr.returncode != 0:
        print 'Failed to extract representations'
        exit(1)

    # Read back the extracted activations and stack them in file order.
    mOutput = None
    for s in sorted(glob.glob(sOutFileTemplate)):
        m = util.proto2Npy(util.ReadProto(s, dl.Matrix()))
        if mOutput is None:
            mOutput = m
        else:
            mOutput = np.vstack([mOutput, m])
    if mOutput.shape[0] != data.shape[0]:
        print 'Invalid results'
        exit(1)
    return mOutput
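# A minimal usage sketch for extractRepresentation. Paths, the binary
# location, and the feature dimensions are hypothetical; the model file
# must define a layer named 'conv2', since that name is hard-coded above:
#
#   feats = np.random.randn(1000, 440)      # 1000 frames, 440-dim input
#   reps = extractRepresentation(
#       feats, '/tmp/extract', '/usr/local/bin/deeplearn', 'model.pbtxt')
#   assert reps.shape[0] == feats.shape[0]  # one representation per frame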