Code example #1
import numpy as np  # required for np.array below

def create_datasets(all_data):
    train_data, train_metadata = all_data['train']
    valid_data, valid_metadata = all_data['valid']
    finaltrain_data, finaltrain_metadata = all_data['finaltrain']
    test_data, test_metadata = all_data['test']
    lbl = np.array([int(data[1]) for data in test_data])
    spatial_dimensions = 1  # 1: use all inputs as-is; 0: reduce the input size from 6 to 3

    def reduce_dimensionality(mlproblem_data, mlproblem_metadata):
        mlproblem_metadata['input_size'] = 3  # we need to change the input size from 6 to 3.
        return [mlproblem_data[0][:3], mlproblem_data[1]]

    if spatial_dimensions == 1:
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(train_data, train_metadata)
        validset = trainset.apply_on(valid_data, valid_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
        testset = trainset.apply_on(test_data, test_metadata)

    elif spatial_dimensions == 0:
        import mlpython.mlproblems.generic as mlpg
        trainset = mlpg.PreprocessedProblem(data=train_data,
                                            metadata=train_metadata,
                                            preprocess=reduce_dimensionality)
        validset = trainset.apply_on(valid_data, valid_metadata)
        testset = trainset.apply_on(test_data, test_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(trainset, trainset.metadata)
        validset = trainset.apply_on(validset,validset.metadata)
        finaltrainset = trainset.apply_on(finaltrainset,finaltrainset.metadata)
        testset = trainset.apply_on(testset,testset.metadata)

    return {'trainset': trainset, 'validset': validset,
            'finaltrainset': finaltrainset, 'testset': testset,
            'ground_truth': lbl}
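A minimal usage sketch, assuming all_data maps split names to (data, metadata) pairs as produced by the data_utils.load_data call shown in Code example #2 (the directory path below is a placeholder):

all_data = data_utils.load_data(dir_path='/path/to/dataset',  # placeholder path
                                input_size=6,
                                train_filename=None,
                                test_filename=None,
                                background_filename=None,
                                load_to_memory=False)
datasets = create_datasets(all_data)
print datasets['trainset'].metadata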
Code example #2
import numpy as np  # required for np.array below
# note: 'data_utils' is assumed to be a project-local module providing load_data(...)

def load_data(dataset_directory, dataset_name):
    print "Loading datasets ..."
    import os
    repo = os.environ.get('MLPYTHON_DATASET_REPO')
    if repo is None:
        raise ValueError(
            'environment variable MLPYTHON_DATASET_REPO is not defined')
    dataset_dir = os.path.join(repo, dataset_directory, dataset_name)

    input_size = 6
    spatial_dimensions = 1
    all_data = data_utils.load_data(dir_path=dataset_dir,
                                    input_size=input_size,
                                    train_filename=None,
                                    test_filename=None,
                                    background_filename=None,
                                    load_to_memory=False)

    train_data, train_metadata = all_data['train']
    valid_data, valid_metadata = all_data['valid']
    finaltrain_data, finaltrain_metadata = all_data['finaltrain']
    test_data, test_metadata = all_data['test']
    lbl = np.array([int(data[1]) for data in test_data])

    def reduce_dimensionality(mlproblem_data, mlproblem_metadata):
        mlproblem_metadata['input_size'] = 3  # we need to change the input size from 6 to 3.
        return [mlproblem_data[0][:3], mlproblem_data[1]]

    if spatial_dimensions == 1:
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(train_data, train_metadata)
        validset = trainset.apply_on(valid_data, valid_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
        testset = trainset.apply_on(test_data, test_metadata)

    elif spatial_dimensions == 0:
        import mlpython.mlproblems.generic as mlpg
        trainset = mlpg.PreprocessedProblem(data=train_data,
                                            metadata=train_metadata,
                                            preprocess=reduce_dimensionality)
        validset = trainset.apply_on(valid_data, valid_metadata)
        testset = trainset.apply_on(test_data, test_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(trainset, trainset.metadata)
        validset = trainset.apply_on(validset, validset.metadata)
        finaltrainset = trainset.apply_on(finaltrainset,
                                          finaltrainset.metadata)
        testset = trainset.apply_on(testset, testset.metadata)

    return {
        'finaltrainset': finaltrainset,
        'testset': testset,
        'ground_truth': lbl
    }
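A hedged usage sketch; the repository path and the directory/name arguments are all placeholders and must point at data in the layout data_utils.load_data expects:

import os
os.environ.setdefault('MLPYTHON_DATASET_REPO', '/path/to/repo')  # placeholder path
result = load_data('my_dataset_dir', 'my_dataset')  # placeholder names
print result['ground_truth'][:10]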
Code example #3
File: store.py  Project: pgcool/TMBP
def get_classification_problem(name,
                               dataset_dir=None,
                               load_to_memory=True,
                               **kw):
    """
    Creates train/valid/test classification MLProblems from dataset ``name``.

    ``name`` must be one of the supported datasets (see variable
    ``classification_names`` of this module).

    Option ``load_to_memory`` determines whether the dataset should
    be loaded into memory or always read from its files.

    If environment variable MLPYTHON_DATASET_REPO has been set to a
    valid directory path, this function will look into its appropriate
    subdirectory to find the dataset. Alternatively the subdirectory path
    can be given by the user through option ``dataset_dir``.
    """

    if name not in classification_names:
        raise ValueError('dataset ' + name +
                         ' unknown for classification learning')

    exec 'import mlpython.datasets.' + name + ' as mldataset'

    if dataset_dir is None:
        # Try to find dataset in MLPYTHON_DATASET_REPO
        import os
        repo = os.environ.get('MLPYTHON_DATASET_REPO')
        if repo is None:
            raise ValueError(
                'environment variable MLPYTHON_DATASET_REPO is not defined')
        dataset_dir = os.path.join(repo, name)

    all_data = mldataset.load(dataset_dir, load_to_memory=load_to_memory, **kw)

    train_data, train_metadata = all_data['train']
    valid_data, valid_metadata = all_data['valid']
    test_data, test_metadata = all_data['test']

    import mlpython.mlproblems.classification as mlpb
    trainset = mlpb.ClassificationProblem(train_data, train_metadata)
    validset = trainset.apply_on(valid_data, valid_metadata)
    testset = trainset.apply_on(test_data, test_metadata)

    return trainset, validset, testset
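A usage sketch, assuming 'mnist' appears in classification_names and has already been downloaded under MLPYTHON_DATASET_REPO (the dataset name is an assumption; substitute any supported name):

trainset, validset, testset = get_classification_problem('mnist',  # assumed dataset name
                                                         load_to_memory=True)
print trainset.metadata['targets']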
Code example #4
# fragment: assumes all_data and spatial_dimensions are defined as in Code example #2
train_data, train_metadata = all_data['train']
valid_data, valid_metadata = all_data['valid']
finaltrain_data, finaltrain_metadata = all_data['finaltrain']
test_data, test_metadata = all_data['test']


def reduce_dimensionality(mlproblem_data, mlproblem_metadata):
    mlproblem_metadata['input_size'] = 3  # we need to change the input size from 6 to 3.
    return [mlproblem_data[0][:3], mlproblem_data[1]]


if spatial_dimensions == 1:
    import mlpython.mlproblems.classification as mlpb
    trainset = mlpb.ClassificationProblem(train_data, train_metadata)
    validset = trainset.apply_on(valid_data, valid_metadata)
    finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
    testset = trainset.apply_on(test_data, test_metadata)

elif spatial_dimensions == 0:
    import mlpython.mlproblems.generic as mlpg
    trainset = mlpg.PreprocessedProblem(data=train_data,
                                        metadata=train_metadata,
                                        preprocess=reduce_dimensionality)
    validset = trainset.apply_on(valid_data, valid_metadata)
    testset = trainset.apply_on(test_data, test_metadata)
    finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
    import mlpython.mlproblems.classification as mlpb
    trainset = mlpb.ClassificationProblem(trainset, trainset.metadata)
    validset = trainset.apply_on(validset, validset.metadata)
    finaltrainset = trainset.apply_on(finaltrainset, finaltrainset.metadata)
    testset = trainset.apply_on(testset, testset.metadata)
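The elif branch chains two wrappers: PreprocessedProblem trims each input from 6 to 3 features, then ClassificationProblem maps string targets to integer class ids, and apply_on replays the same pipeline on the other splits. A minimal self-contained sketch of that two-stage wrapping on toy data (the values and the function name here are made up):

import mlpython.mlproblems.generic as mlpg
import mlpython.mlproblems.classification as mlpb

toy_data = [[(1.0, 2.0), 'A'], [(3.0, 4.0), 'B']]
toy_metadata = {'length': 2, 'input_size': 2, 'targets': ['A', 'B']}

def keep_first_feature(example, metadata):
    # a preprocess callback gets one example plus the problem metadata,
    # updates the metadata, and returns the transformed example
    metadata['input_size'] = 1
    return [example[0][:1], example[1]]

pre = mlpg.PreprocessedProblem(toy_data, toy_metadata,
                               preprocess=keep_first_feature)
clf = mlpb.ClassificationProblem(pre, pre.metadata)
for example in clf:
    print example  # first feature kept, target replaced by its class id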
Code example #5
# fragment: pb1, pb2, and features are assumed; see the reconstructed setup after this example
pb3 = mlpbgen.MergedProblem([pb2, pb1])
print 'pb3:'
for example in pb3:
    print example
print 'metadata:', pb3.metadata
print ''

pb4 = mlpbgen.PreprocessedProblem(pb3, preprocess=features)
print 'pb4:'
for example in pb4:
    print example
print 'metadata:', pb4.metadata
print ''

pb5 = mlpbclass.ClassificationProblem(pb4)
print 'pb5:'
for example in pb5:
    print example
print 'metadata:', pb5.metadata
print ''

pb6 = mlpbclass.ClassSubsetProblem(pb5, subset=set(['A', 'C']))
print 'pb6:'
for example in pb6:
    print example
print 'metadata:', pb6.metadata
print ''

pb7 = mlpbgen.SubsetFieldsProblem(pb6, fields=[0, 0, 1])
print 'pb7 (final):'
for example in pb7:
    print example
print 'metadata:', pb7.metadata
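This fragment's setup is not shown; reconstructed from Code example #6, it would be:

import mlpython.mlproblems.generic as mlpbgen
import mlpython.mlproblems.classification as mlpbclass

raw_data = zip(range(6), ['A', 'A', 'B', 'C', 'A', 'B'])
metadata = {'length': 6, 'targets': ['A', 'B', 'C'], 'input_size': 1}

def features(example, metadata):
    metadata['input_size'] = 2
    return ((example[0], example[0]), example[1])

pb1 = mlpbgen.MLProblem(raw_data, metadata)
pb2 = mlpbgen.SubsetProblem(pb1, subset=set([1, 3, 5]))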
Code example #6
import mlpython.mlproblems.generic as mlpbgen
import mlpython.mlproblems.classification as mlpbclass

def test_mlproblem_combinations():
    """
    Test a combination of many different MLProblems.
    """

    raw_data = zip(range(6), ['A', 'A', 'B', 'C', 'A', 'B'])
    metadata = {'length': 6, 'targets': ['A', 'B', 'C'], 'input_size': 1}

    def features(example, metadata):
        metadata['input_size'] = 2
        return ((example[0], example[0]), example[1])

    pb1 = mlpbgen.MLProblem(raw_data, metadata)
    print 'pb1', pb1.metadata
    pb2 = mlpbgen.SubsetProblem(pb1, subset=set([1, 3, 5]))
    print 'pb2', pb2.metadata
    pb3 = mlpbgen.MergedProblem([pb2, pb1])
    print 'pb3', pb3.metadata
    pb4 = mlpbgen.PreprocessedProblem(pb3, preprocess=features)
    print 'pb4', pb4.metadata
    pb5 = mlpbclass.ClassificationProblem(pb4)
    print 'pb5', pb5.metadata
    pb6 = mlpbclass.ClassSubsetProblem(pb5, subset=set(['A', 'C']))
    print 'pb6', pb6.metadata
    pb7 = mlpbgen.SubsetFieldsProblem(pb6, fields=[0, 0, 1])
    print 'pb7', pb7.metadata

    final_data = [[(1, 1), (1, 1), 0],
                  [(3, 3), (3, 3), 1],
                  [(0, 0), (0, 0), 0],
                  [(1, 1), (1, 1), 0],
                  [(3, 3), (3, 3), 1],
                  [(4, 4), (4, 4), 0]]
    final_metadata = {
        'input_size': 2,
        'targets': set(['A', 'C']),
        'class_to_id': {
            'A': 0,
            'C': 1
        }
    }

    for ex1, ex2 in zip(pb7, final_data):
        assert cmp(ex1, ex2) == 0
    print pb7.metadata, final_metadata
    assert cmp(pb7.metadata, final_metadata) == 0

    raw_data2 = zip(range(6, 10), ['C', 'B', 'A', 'C'])
    metadata2 = {'length': 4, 'targets': ['A', 'B', 'C'], 'input_size': 1}

    pbtest = pb7.apply_on(raw_data2, metadata2)
    final_test_data = [[(6, 6), (6, 6), 1], [(8, 8), (8, 8), 0],
                       [(9, 9), (9, 9), 1]]
    final_test_metadata = {
        'input_size': 2,
        'targets': set(['A', 'C']),
        'class_to_id': {
            'A': 0,
            'C': 1
        }
    }

    for ex1, ex2 in zip(pbtest, final_test_data):
        assert cmp(ex1, ex2) == 0
    assert cmp(pbtest.metadata, final_test_metadata) == 0
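Note that this test is Python 2 only: cmp was removed in Python 3. If it were ported, the cmp(...) == 0 assertions could be written with plain equality (assuming the examples compare as plain lists), e.g.:

for ex1, ex2 in zip(pb7, final_data):
    assert ex1 == ex2
assert pb7.metadata == final_metadata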