# These examples assume the surrounding project's imports: numpy as np, the
# deeplearn protobuf definitions as dl, the project's util helpers
# (WriteProto, ReadProto, npy2ProtoMat, proto2Npy), and the standard
# os / glob / subprocess modules.
def createTestSet(ark_file, sDataFileTemplate, sDataDir, sDataFilePattern,
                  sDataProtoFile):

    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_test'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = sDataDir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = 0
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = sDataFilePattern

    uttIDBatch = []
    uttIDLength = []
    featMatBatch = None
    batchSz = -1
    iFileIdx = 0
    # Stream (uttID, featMat) pairs from the Kaldi ark file.
    kaldiIn = KaldiReadIn(ark_file)
    kaldiIn.open()
    uttID, featMat = kaldiIn.next()
    while featMat is not None:
        if batchSz < 0:
            # Size batches so each .pbm shard holds roughly 400 MB of float32
            # features; one extra column is reserved for the label.
            batchSz = 400 * 1024 * 1024 / (4 * featMat.shape[1])
            datasetInfo.dimensions = featMat.shape[1] + 1
            datasetInfo.label_start_index = datasetInfo.dimensions - 1

        if featMatBatch is None:
            featMatBatch = featMat
        else:
            featMatBatch = np.vstack([featMatBatch, featMat])
        uttIDBatch.append(uttID)
        uttIDLength.append(featMat.shape[0])

        # Once the batch is full, append a dummy zero label column and flush
        # it to the next numbered .pbm shard.
        if featMatBatch.shape[0] >= batchSz:
            util.WriteProto(
                sDataFileTemplate % iFileIdx,
                util.npy2ProtoMat(
                    np.hstack(
                        [featMatBatch,
                         np.zeros((featMatBatch.shape[0], 1))])))
            iFileIdx += 1
            datasetInfo.size += featMatBatch.shape[0]
            featMatBatch = None
        uttID, featMat = kaldiIn.next()
    kaldiIn.close()

    # last batch
    if featMatBatch is not None:
        util.WriteProto(
            sDataFileTemplate % iFileIdx,
            util.npy2ProtoMat(
                np.hstack([featMatBatch,
                           np.zeros((featMatBatch.shape[0], 1))])))
        iFileIdx += 1
        datasetInfo.size += featMatBatch.shape[0]
    util.WriteProto(sDataProtoFile, dbInfo)
    return (uttIDBatch, uttIDLength)
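A minimal call sketch; every path below is a hypothetical placeholder, not a value from the original code:

# Hypothetical paths for illustration only.
uttIDs, uttLens = createTestSet(
    '/path/to/test.ark',                # Kaldi ark file with feature matrices
    '/path/to/pbm/test_part%05d.pbm',   # template for the numbered .pbm shards
    '/path/to/pbm',                     # becomes path_prefix in the data proto
    'test_part[0-9]*.pbm',              # file_pattern matching those shards
    '/path/to/pbm/data_test.pbtxt')     # where the DatabaseInfo proto is written
# uttIDs / uttLens give the utterance order and per-utterance frame counts.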
Example #2
def pfile2Proto(pfilePath, filePrefix, pbmDir):
    # Read the pfile in partitions of roughly 400 MB.
    pfile = read_dataset(pfilePath, {'partition': 1024 * 1024 * 400})

    dsInfo = dl.DatasetInfo()
    dsInfo.data_format = dl.DatasetInfo.PBM
    dsInfo.sparse_label = True
    dsInfo.file_pattern = '%s[0-9]*.pbm' % filePrefix
    dim = None
    sz = 0
    for i, (data, label) in enumerate(zip(pfile.feat_mats, pfile.label_vecs)):
        dataset = util.npy2ProtoMat(np.hstack([data, label[:, None]]))
        util.WriteProto(os.path.join(pbmDir, '%s%05d.pbm' % (filePrefix, i)),
                        dataset)
        if dim is None:
            dim = data.shape[1] + 1
        # Every partition must have the same feature dimensionality.
        if dim != data.shape[1] + 1:
            print dim, sz, data.shape, label.shape
        assert dim == data.shape[1] + 1
        sz += data.shape[0]
    dsInfo.size = sz
    dsInfo.dimensions = dim
    dsInfo.label_start_index = dim - 1
    return dsInfo
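A hedged usage sketch; the pfile path and output directory are placeholders:

# Hypothetical paths for illustration only.
dsInfo = pfile2Proto('/path/to/train.pfile', 'train_part', '/path/to/pbm')
dsInfo.type = dl.DatasetInfo.TRAIN_SET  # the caller assigns the split type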
Example #3
def createPbmDataset(pfiles, pbmDir, protoFilePath, gpuMem):
    assert gpuMem > 0.1

    dbInfo = dl.DatabaseInfo()

    for (name, sPath) in pfiles:
        assert name in ('train', 'valid', 'test')

        dsInfo = pfile2Proto(sPath, name + '_part', pbmDir)
        if name == 'train':
            dsInfo.type = dl.DatasetInfo.TRAIN_SET
        elif name == 'valid':
            dsInfo.type = dl.DatasetInfo.EVAL_SET
        else:
            dsInfo.type = dl.DatasetInfo.TEST_SET
        dbInfo.data.extend([dsInfo])

    dbInfo.name = 'dataset'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.main_memory = 6.0
    dbInfo.gpu_memory = float(gpuMem)
    dbInfo.path_prefix = pbmDir
    util.WriteProto(protoFilePath, dbInfo)
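A minimal call sketch; the pfile paths and GPU memory figure are placeholders:

# Hypothetical paths and memory budget for illustration only.
createPbmDataset(
    [('train', '/path/to/train.pfile'),
     ('valid', '/path/to/valid.pfile'),
     ('test', '/path/to/test.pfile')],
    '/path/to/pbm', '/path/to/pbm/data.pbtxt', gpuMem=4.0)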
Example #4
evalOp.randomize = False
    evalOp.get_last_piece = True
    evalOp.verbose = False

    # test on all models
    for i in xrange(0, config.PHONES):
        for j in xrange(i + 1, config.PHONES):
            sResultFile = os.path.join(sResultDir, '%d_%d.csv' % (i, j))

            if os.path.exists(sResultFile):
                continue
            print 'Testing for %d-%d, writing results to %s' % (i, j,
                                                                sResultFile)

            evalOp.result_file = sResultFile
            util.WriteProto(sEvalOpFile, evalOp)
            sModelFile = sModelFiles % (i, j)

            args = [
                sDeeplearnPath, 'eval', sModelFile,
                '--eval-op=%s' % sEvalOpFile
            ]
            pr = subprocess.Popen(args, stderr=subprocess.STDOUT)
            pr.wait()
            if pr.returncode != 0:
                print 'Failed to test %d-%d' % (i, j)
                exit(1)

    # run majorityVote, compute "probabilities"
    sHardVoteFile = os.path.join(sMajorVoteDir, 'hard.csv')
    sSoftVoteFile = os.path.join(sMajorVoteDir, 'soft.csv')
Example #5
    if not os.path.exists(sOutputModelFile):
        # modify architecture...
        sModelDir = os.path.join(wdir, 'model/')
        createDir(sModelDir)
        sCurrentDir = os.path.split(os.path.realpath(
            os.path.abspath(__file__)))[0]
        model = util.ReadProto(
            os.path.join(sCurrentDir, 'prototype/conv_timit.pbtxt'),
            dl.ModelData())
        model.name = 'spn_conv'
        for n in model.nodes:
            if n.name == 'output':
                n.dimension = num_outputs
                break
        sModelFile = os.path.join(sModelDir, 'spn_conv.pbtxt')
        util.WriteProto(sModelFile, model)

        trainOp = util.ReadProto(
            os.path.join(sCurrentDir, 'prototype/train.pbtxt'), dl.Operation())
        sCheckpointDir = os.path.join(sModelDir, 'cp')
        trainOp.name = 'train'
        trainOp.data_proto = sDataProtoFile
        trainOp.checkpoint_directory = sCheckpointDir
        trainOp.verbose = False
        sTrainOpFile = os.path.join(sModelDir, 'train.pbtxt')
        util.WriteProto(sTrainOpFile, trainOp)

        evalOp = util.ReadProto(
            os.path.join(sCurrentDir, 'prototype/eval.pbtxt'), dl.Operation())
        evalOp.data_proto = sDataProtoFile
        evalOp.verbose = False
Example #6
def extractRepresentation(data, wdir, sDeeplearnPath, sModelFile):
    # append label column
    data = np.hstack([data, np.zeros((data.shape[0], 1))])
    npyData = util.npy2ProtoMat(data)
    sDataFile = os.path.join(wdir, 'input.pbm')
    util.WriteProto(sDataFile, npyData)

    sDataProtoFile = os.path.join(wdir, 'data.pbtxt')
    dbInfo = dl.DatabaseInfo()
    dbInfo.name = 'dataset_extract'
    dbInfo.data_handler = 'deeplearn'
    dbInfo.path_prefix = wdir
    datasetInfo = dbInfo.data.add()
    datasetInfo.data_format = dl.DatasetInfo.PBM
    datasetInfo.size = data.shape[0]
    datasetInfo.dimensions = data.shape[1]
    datasetInfo.label_start_index = datasetInfo.dimensions - 1
    datasetInfo.sparse_label = True
    datasetInfo.type = dl.DatasetInfo.TEST_SET
    datasetInfo.file_pattern = 'input.pbm'
    util.WriteProto(sDataProtoFile, dbInfo)

    sEvalOpFile = os.path.join(wdir, 'eval.pbtxt')
    sExtractedActs = os.path.join(wdir, 'acts')
    sLayerName = 'conv2'
    if not os.path.exists(sExtractedActs):
        os.mkdir(sExtractedActs)
    evalOp = dl.Operation()
    evalOp.name = 'extract'
    evalOp.stop_condition.all_processed = True
    evalOp.operation_type = dl.Operation.TEST
    evalOp.data_proto = sDataProtoFile
    evalOp.randomize = False
    evalOp.get_last_piece = True
    evalOp.verbose = False
    evalOp.extracted_layers.append(sLayerName)
    evalOp.extracted_output_dir = sExtractedActs
    evalOp.extracted_data_format = dl.DatasetInfo.PBM
    evalOp.extracted_data_sets.append(dl.DatasetInfo.TEST_SET)
    util.WriteProto(sEvalOpFile, evalOp)

    # Extracted activations are written as .pbm files under <acts>/<layer>/;
    # remove any left over from a previous run.
    sOutFileTemplate = os.path.join(sExtractedActs, sLayerName, '*.pbm')
    for s in sorted(glob.glob(sOutFileTemplate)):
        try:
            os.remove(s)
        except Exception:
            pass

    # run the network...
    args = [
        sDeeplearnPath, 'extract', sModelFile,
        '--eval-op=%s' % sEvalOpFile
    ]
    pr = subprocess.Popen(args, stderr=subprocess.STDOUT)
    pr.wait()
    if pr.returncode != 0:
        print 'Failed to extract representations'
        exit(1)

    # Read the extracted activations back and stack them into one matrix.
    mOutput = None
    for s in sorted(glob.glob(sOutFileTemplate)):
        m = util.proto2Npy(util.ReadProto(s, dl.Matrix()))
        if mOutput is None:
            mOutput = m
        else:
            mOutput = np.vstack([mOutput, m])

    if mOutput.shape[0] != data.shape[0]:
        print 'Invalid results'
        exit(1)
    return mOutput
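A hedged usage sketch; the input array, working directory, deeplearn binary path and model file below are placeholders:

# Hypothetical inputs for illustration only.
feats = np.random.rand(1000, 440).astype(np.float32)
acts = extractRepresentation(feats, '/tmp/extract_wdir',
                             '/path/to/deeplearn', '/path/to/spn_conv.pbtxt')
# acts holds the 'conv2' activations, one row per input row.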