# Assumed imports for these snippets: NumPy and the standard library, plus the
# project's own modules (the deeplearn protobuf bindings imported as `dl`, the
# `util` proto helpers, `KaldiReadIn`, and `read_dataset`), which are not shown
# here.
import glob
import os
import subprocess

import numpy as np


def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern,
                  sDataProtoFile):
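    """Convert a Kaldi ark file into .pbm test-set shards plus a data proto.

    Each batch of feature rows gets a dummy zero label column appended and is
    written via sDataFileTemplate; a DatabaseInfo describing the test set is
    written to sDataProtoFile.  Returns (uttIDBatch, uttIDLength): the
    utterance IDs and the number of frames per utterance, in write order.
    """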

    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern

    uttIDBatch = []
    uttIDLength = []
    featMatBatch = None
    batchSz = -1
    iFileIdx = 0
    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    uttID, featMat = kaldiIn.next()
    while featMat is not None:
        if batchSz < 0:
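            # cap each output shard at roughly 400 MB of float32 data
            # (4 bytes per value), expressed as a row count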
            batchSz = 400 * 1024 * 1024 // (4 * featMat.shape[1])
            datasetInfo.dimensions = featMat.shape[1] + 1
            datasetInfo.label_start_index = datasetInfo.dimensions - 1

        if featMatBatch is None:
            featMatBatch = featMat
        else:
            featMatBatch = np.vstack([featMatBatch, featMat])
        uttIDBatch.append(uttID)
        uttIDLength.append(featMat.shape[0])

        if featMatBatch.shape[0] >= batchSz:
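            # batch is large enough: append a dummy zero label column and
            # flush it to the next .pbm shard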
            util.WriteProto(
                sDataFileTemplate % iFileIdx,
                util.npy2ProtoMat(
                    np.hstack(
                        [featMatBatch,
                         np.zeros((featMatBatch.shape[0], 1))])))
            iFileIdx += 1
            datasetInfo.size += featMatBatch.shape[0]
            featMatBatch = None
        uttID, featMat = kaldiIn.next()
    kaldiIn.close()

    # flush the last, partially filled batch
    if featMatBatch is not None:
        util.WriteProto(
            sDataFileTemplate % iFileIdx,
            util.npy2ProtoMat(
                np.hstack([featMatBatch,
                           np.zeros((featMatBatch.shape[0], 1))])))
        iFileIdx += 1
        datasetInfo.size += featMatBatch.shape[0]
    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
def pfile2Proto(pfilePath, filePrefix, pbmDir):
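    """Convert a pfile into .pbm shards under pbmDir.

    Each partition is written with its label vector appended as the last
    column; a DatasetInfo describing the shards is returned.
    """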
    pfile = read_dataset(pfilePath, {'partition': 1024 * 1024 * 400})

    dsInfo = dl.DatasetInfo()
    dsInfo.data_format = dl.DatasetInfo.PBM
    dsInfo.sparse_label = True
    dsInfo.file_pattern = '%s[0-9]*.pbm' % filePrefix
    dim = None
    sz = 0
    for i, (data, label) in enumerate(zip(pfile.feat_mats, pfile.label_vecs)):
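        # append the label vector as the last column and write one .pbm shard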
        dataset = util.npy2ProtoMat(np.hstack([data, label[:, None]]))
        util.WriteProto(os.path.join(pbmDir, '%s%05d.pbm' % (filePrefix, i)),
                        dataset)
        if dim is None:
            dim = data.shape[1] + 1
        if dim != data.shape[1] + 1:
            print dim, sz, data.shape, label.shape
        assert dim == data.shape[1] + 1
        sz += data.shape[0]
    dsInfo.size = sz
    dsInfo.dimensions = dim
    dsInfo.label_start_index = dim - 1
    return dsInfo
def extractRepresentation(data, wdir, sDeeplearnPath, sModelFile):
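    """Run the deeplearn binary over `data` and return the extracted
    activations of one layer as a single numpy matrix.
    """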
    # append label column
    data = np.hstack([data, np.zeros((data.shape[0], 1))])
    npyData = util.npy2ProtoMat(data)
    sDataFile = os.path.join(wdir, 'input.pbm')
    util.WriteProto(sDataFile, npyData)

    sDataProtoFile = os.path.join(wdir, 'data.pbtxt')
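    # describe the single-file test set in a DatabaseInfo proto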
    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_extract'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = wdir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = data.shape[0]
    datasetInfo.dimensions = data.shape[1]
    datasetInfo.label_start_index = datasetInfo.dimensions - 1
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = 'input.pbm'
    util.WriteProto(sDataProtoFile, dbInfo)

    sEvalOpFile = os.path.join(wdir, 'eval.pbtxt')
    sExtractedActs = os.path.join(wdir, 'acts')
    sLayerName = 'conv2'
    if not os.path.exists(sExtractedActs):
        os.mkdir(sExtractedActs)
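    # the Operation proto asks deeplearn for a TEST pass that dumps the
    # activations of sLayerName for the whole test set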
    evalOp = dl.Operation()
    evalOp.name = 'extract'
    evalOp.stop_condition.all_processed = True
    evalOp.operation_type = dl.Operation.TEST
    evalOp.data_proto = sDataProtoFile
    evalOp.randomize = False
    evalOp.get_last_piece = True
    evalOp.verbose = False
    evalOp.extracted_layers.append(sLayerName)
    evalOp.extracted_output_dir = sExtractedActs
    evalOp.extracted_data_format = dl.DatasetInfo.PBM
    evalOp.extracted_data_sets.append(dl.DatasetInfo.TEST_SET)
    util.WriteProto(sEvalOpFile, evalOp)

    sOutFileTemplate = os.path.join(sExtractedActs, sLayerName, '*.pbm')
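    # remove any activation files left over from a previous run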
    for s in sorted(glob.glob(sOutFileTemplate)):
        try:
            os.remove(s)
        except Exception:
            pass

    # run the network...
    args = [
        sDeeplearnPath, 'extract', sModelFile,
        '--eval-op=%s' % sEvalOpFile
    ]
    pr = subprocess.Popen(args, stderr=subprocess.STDOUT)
    pr.wait()
    if pr.returncode != 0:
        print 'Failed to extract representations'
        exit(1)

    # read back the extracted activations and stack them into one matrix
    mOutput = None
    for s in sorted(glob.glob(sOutFileTemplate)):
        m = util.proto2Npy(util.ReadProto(s, dl.Matrix()))
        if mOutput is None:
            mOutput = m
        else:
            mOutput = np.vstack([mOutput, m])

    if mOutput is None or mOutput.shape[0] != data.shape[0]:
        print 'Invalid results'
        exit(1)
    return mOutput