Example #1
def load(dir_path,load_to_memory=False,fold=1):
    """
    Loads the LETOR 4.0 MQ2007 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    This dataset comes with 5 predefined folds, which can be specified
    with option ``fold`` (default = 1). 
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'length'``

    """
    
    input_size=46
    dir_path = os.path.expanduser(dir_path)
    sparse=False

    if fold not in [1,2,3,4,5]:
        raise ValueError('There are 5 predefined folds. Option fold should be an integer between 1 and 5')

    def convert(feature,value):
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line,convert,int,sparse,input_size)

    n_queries = [ [ 1017, 339, 336 ],
                  [ 1017, 336, 339 ],
                  [ 1014, 339, 339 ],
                  [ 1014, 339, 339 ],
                  [ 1014, 339, 339 ] ]

    lengths = [ [42158, 13813, 13652],
                [41958, 13652, 14013],
                [41320, 14013, 14290],
                [41478, 14290, 13855],
                [41955, 13855, 13813] ]
    
    # Get data file paths
    train_file,valid_file,test_file = [os.path.join(dir_path, 'MQ2007/Fold' + str(fold) + '/' + ds + '.txt') for ds in ['train','vali','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,),(1,)],[np.float64,int,int],l) for d,l in zip([train,valid,test],lengths[fold-1])]
        
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                                        'scores':range(3),
                                        'n_queries':nq,
                                        'length':l,
                                        'n_pairs':l} for nq,l in zip(n_queries[fold-1],lengths[fold-1])]

    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
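For context, here is a minimal usage sketch of the loader above. The data directory and the module name used in the import are assumptions made purely for illustration; with this loader, each example is expected to be an (input vector, relevance score, query id) triple.

# Hypothetical usage sketch (module name and data path are assumptions):
# the MQ2007 files are expected under <dir_path>/MQ2007/Fold<fold>/.
import mq2007  # hypothetical module containing the load() above

datasets = mq2007.load('~/data/letor', load_to_memory=True, fold=2)
train_data, train_meta = datasets['train']
print(train_meta['n_queries'], train_meta['length'])

# Each example should be an (input vector, relevance score, query id) triple.
for features, score, qid in train_data:
    break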
Example #2
def load(file_path,input_size=13649,load_to_memory=True):
    """
    Loads a LIBSVM dataset in path ``file_path``.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """
    dataFile = os.path.expanduser(file_path)
    
    def load_line(line):
        return mlio.libsvm_load_line(line,
                                     convert_target=str,
                                     sparse=True,
                                     input_size=input_size,
                                     input_type=np.int32)[0]

    # Get data
    data = mlio.load_from_file(dataFile,load_line)
    if load_to_memory:
        data = [x for x in data]
        length = len(data)
    else:
        length = 0
        stream = open(dataFile)
        for l in stream:
            length+=1
        stream.close()
            
    # Get metadata
    data_meta = {'input_size':input_size,'length':length}
    
    return (data,data_meta)
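The ``load_to_memory`` switch above either materializes the stream into a list or, in the streaming case, counts lines to fill in ``'length'``. Below is a small self-contained sketch of that pattern, with a toy parser standing in for mlio; all names here are stand-ins, not the actual library API.

# Toy illustration of the load_to_memory pattern (names are stand-ins, not mlio).
import os, tempfile

def toy_load(file_path, load_to_memory=True):
    parse = lambda line: line.split()                 # stand-in for load_line
    if load_to_memory:
        with open(file_path) as f:
            data = [parse(l) for l in f]              # materialize all examples
        length = len(data)
    else:
        data = (parse(l) for l in open(file_path))    # lazy generator
        length = sum(1 for _ in open(file_path))      # count lines for 'length'
    return data, {'length': length}

path = os.path.join(tempfile.gettempdir(), 'toy.txt')
with open(path, 'w') as f:
    f.write("1 2 3\n4 5 6\n")
print(toy_load(path)[1])   # -> {'length': 2}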
Example #3
def load(dir_path,load_to_memory=False):
    """
    Loads the NIPS 0-12 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``

    """
    
    input_size=500
    dir_path = os.path.expanduser(dir_path)
    def load_line(line):
        tokens = line.split()
        return np.array([int(i) for i in tokens[:-1]]) #The last element is bogus (don't ask...)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'nips-0-12_all_shuffled_bidon_target_' + ds + '.amat') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [400,100,1240]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,)],[np.float64],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                              'length':l} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #4
def load(dir_path,load_to_memory=False):
    """
    Loads a binarized version of MNIST. 

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """
    
    input_size=784
    dir_path = os.path.expanduser(dir_path)
    def load_line(line):
        tokens = line.split()
        return np.array([int(i) for i in tokens])

    train_file,valid_file,test_file = [os.path.join(dir_path, 'binarized_mnist_' + ds + '.amat') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [50000,10000,10000]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,)],[np.float64],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                              'length':l} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #5
def load(dir_path,load_to_memory=False):
    """
    Loads the Abalone dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``

    """
    
    input_size = 8
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        return mlio.libsvm_load_line(line, float, float, sparse=False, input_size=input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'abalone_' + ds + '.libsvm') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [3341, 418, 418]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,)],[np.float64,np.float64],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size, 'length':l} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
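Several loaders in this collection rely on ``mlio.libsvm_load_line``. The sketch below is a rough stand-in, not mlio's implementation: it parses the usual "<target> <index>:<value> ..." LIBSVM line format into a dense input vector and a target, assuming 1-based feature indices.

# Rough stand-in for dense LIBSVM line parsing (illustrative only, not mlio).
import numpy as np

def dense_libsvm_line(line, convert_input=float, convert_target=float, input_size=8):
    tokens = line.split()
    target = convert_target(tokens[0])        # leading token is the target
    x = np.zeros(input_size)
    for tok in tokens[1:]:
        idx, val = tok.split(':')
        x[int(idx) - 1] = convert_input(val)  # assume 1-based feature indices
    return x, target

print(dense_libsvm_line("0.15 1:0.5 3:-0.2 8:1.0"))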
Example #6
def load(dir_path, load_to_memory=False):
    """
    Loads the DNA dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 180
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1, 2])
    target_mapping = {'1': 0, '2': 1, '3': 2}

    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        return mlio.libsvm_load_line(line,
                                     convert_target=convert_target,
                                     sparse=False,
                                     input_size=input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'dna_scale_' + ds + '.libsvm')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [1400, 600, 1186]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [np.float64, int],
                               l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #7
def load(dir_path, load_to_memory=False):
    """
    Loads the RCV1 dataset. This is actually a smaller version of it, with 150 inputs
    and binary targets.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 150
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1])
    target_mapping = {'0': 0, '1': 1}

    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path,
                     'rcv1_all_subset.binary_' + ds + '_voc_150.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [40000, 10000, 150000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [np.float64, int],
                               l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #8
def load(dir_path, load_to_memory=False):
    """
    Loads the Rectangles images dataset. 

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 784
    dir_path = os.path.expanduser(dir_path)
    targets = set(range(2))

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), float(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'rectangles_images_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [10000, 2000, 50000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [np.float64, int],
                               l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #9
def load(dir_path, load_to_memory=False):
    """
    Loads the Housing dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``

    """

    input_size = 13
    #targets = set(range(2))
    #targets = set([0,1])
    #target_mapping = {'-1':0,'+1':1}
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        return mlio.libsvm_load_line(line,
                                     float,
                                     float,
                                     sparse=False,
                                     input_size=input_size)
        #return mlio.libsvm_load_line(line,convert_target=convert_target,sparse=False,input_size=input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'housing_' + ds + '.libsvm')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [404, 51, 51]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )],
                               [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    #train_meta,valid_meta,test_meta = [{'input_size':input_size, 'length':l,'targets':targets} for l in lengths]
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #10
def load(dir_path, load_to_memory=False):
    """
    Loads the OCR letters dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 128
    targets = set(range(26))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        seq_size = len(tokens) // (input_size + 1)
        input = np.zeros((seq_size, input_size))
        target = -1 * np.ones((seq_size), dtype=int)

        example = np.array([int(i) for i in tokens]).reshape((seq_size, -1))
        input[:] = example[:, :input_size]
        target[:] = example[:, input_size]
        return input, target

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'ocr_letters_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [5502, 688, 687]
    if load_to_memory:
        train = [example for example in train]
        valid = [example for example in valid]
        test = [example for example in test]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
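Below is a self-contained sketch of the per-line parsing above: each line concatenates seq_size blocks of input_size pixel values plus one label, and the reshape separates pixels from labels. A tiny input_size and made-up tokens are used here so the arithmetic is easy to follow.

# Miniature version of the OCR-letters line parsing (toy sizes, not real data).
import numpy as np

input_size = 4
tokens = "1 0 1 0 7  0 1 1 0 3".split()              # two characters of a word
seq_size = len(tokens) // (input_size + 1)            # -> 2
example = np.array([int(t) for t in tokens]).reshape((seq_size, -1))
inputs = example[:, :input_size]                       # (2, 4) pixel rows
labels = example[:, input_size]                        # (2,) letter labels
print(inputs)
print(labels)                                          # -> [7 3]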
Example #11
def load(dir_path, load_to_memory=True, sparse=False, binary_input=False):
    """
    Loads the 20 news groups dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs have been put in binary format, and the vocabulary has been
    restricted to 2000 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 2000
    targets = set(range(20))
    dir_path = os.path.expanduser(dir_path)

    def convert_target(target_str):
        return int(target_str) - 1

    def load_line(line):
        return mlio.libsvm_load_line(line,
                                     convert_target=convert_target,
                                     sparse=sparse,
                                     input_size=input_size,
                                     input_type=np.int32)

    # Get data
    train_file, valid_file, test_file = [
        os.path.join(dir_path, ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]
    if load_to_memory:
        train, valid, test = [[x for x in f] for f in [train, valid, test]]

    lengths = [10284, 1000, 7502]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'targets': targets,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #12
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the OCR letters dataset.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.
    
    Defined metadata: 
    - 'input_size'
    - 'targets'
    - 'length'

    References: Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
                Larochelle, Bengio and Turian
                link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

                OCR dataset (web page)
                link: http://www.seas.upenn.edu/~taskar/ocr/
    """

    input_size = 128
    targets = set(range(26))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), int(tokens[-1]))
        #return mlio.libsvm_load_line(line,float,int,sparse,input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'ocr_letters_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [32152, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [dtype, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #13
def load(dir_path, load_to_memory=False, fold=1):
    """
    Loads the LETOR 4.0 MQ2008 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    This dataset comes with 5 predefined folds, which can be specified
    with option ``fold`` (default = 1). 
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'length'``

    """

    input_size = 46
    dir_path = os.path.expanduser(dir_path)
    sparse = False

    if fold not in [1, 2, 3, 4, 5]:
        raise ValueError("There are 5 predefined folds. Option fold should be an integer between 1 and 5")

    def convert(feature, value):
        if feature != "qid":
            raise ValueError("Unexpected feature")
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line, convert, int, sparse, input_size)

    n_queries = [[471, 157, 156], [471, 156, 157], [470, 157, 157], [470, 157, 157], [470, 157, 157]]

    lengths = [[9630, 2707, 2874], [9404, 2874, 2933], [8643, 2933, 3635], [8514, 3635, 3062], [9442, 3062, 2707]]

    # Get data file paths
    train_file, valid_file, test_file = [
        os.path.join(dir_path, "MQ2008/Fold" + str(fold) + "/" + ds + ".txt") for ds in ["train", "vali", "test"]
    ]
    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line) for f in [train_file, valid_file, test_file]]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,), (1,)], [np.float64, int, int], l)
            for d, l in zip([train, valid, test], lengths[fold - 1])
        ]

    train_meta, valid_meta, test_meta = [
        {"input_size": input_size, "scores": range(3), "n_queries": nq, "length": l, "n_pairs": l}
        for nq, l in zip(n_queries[fold - 1], lengths[fold - 1])
    ]

    return {"train": (train, train_meta), "valid": (valid, valid_meta), "test": (test, test_meta)}
Example #14
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the NIPS 0-12 dataset.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.
    
    Defined metadata: 
    - 'input_size'
    - 'length'

    References: Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
                Larochelle, Bengio and Turian
                link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

                LIBSVM Data: Classification, Regression, and Multi-label (web page)
                link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """

    input_size = 500
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return np.array([int(i) for i in tokens[:-1]
                         ])  #The last element is bogus (don't ask...)

    train_file, valid_file, test_file = [
        os.path.join(dir_path,
                     'nips-0-12_all_shuffled_bidon_target_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [400, 100, 1240]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, )], [dtype], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #15
def load(dir_path, load_to_memory=False):
    """
    Loads the MNIST dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs have been normalized between 0 and 1.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 784
    targets = set(range(10))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), int(tokens[-1]))
        #return mlio.libsvm_load_line(line,float,int,sparse,input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'mnist_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [50000, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [np.float64, int],
                               l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #16
def load(dir_path, load_to_memory=False):
    """
    Loads the occluded MNIST dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs and targets have been converted to a binary format.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """

    input_size = 784
    target_size = 784
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:input_size]]),
                np.array([int(i) for i in tokens[input_size:]]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'occluded_mnist_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [50000, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (target_size, )],
                               [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'target_size': target_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #17
def load(dir_path, load_to_memory=False):
    """
    Loads the 20-newsgroups dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs have been put in binary format, and the vocabulary has been
    restricted to 5000 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 5000
    targets = set(range(20))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, '20newsgroups_' + ds + '_binary_5000_voc.txt')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [9578, 1691, 7505]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [np.float64, int],
                               l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #18
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads a binarized version of MNIST. 

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.
    
    Defined metadata: 
    - 'input_size'
    - 'length'

    Reference: On the Quantitative Analysis of Deep Belief Networks
               Salakhutdinov and Murray
               link: http://www.mit.edu/~rsalakhu/papers/dbn_ais.pdf

               The MNIST database of handwritten digits (web page)
               Yann LeCun and Corinna Cortes
               link: http://yann.lecun.com/exdb/mnist/
    """

    input_size = 784
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return np.array([int(i) for i in tokens])

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'binarized_mnist_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [50000, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, )], [dtype], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #19
def load(dir_path, load_to_memory=False):
    """
    SARCOS inverse dynamics dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """

    input_size = 21
    target_size = 7
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:input_size]]),
                np.array([float(i) for i in tokens[input_size:]]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'sarcos_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [40036, 4448, 4449]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (target_size, )],
                               [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'target_size': target_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #20
def load(dir_path, load_to_memory=False):
    """
    Loads the CAData (California housing prices) dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``

    """

    input_size = 8
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        return mlio.libsvm_load_line(line,
                                     float,
                                     float,
                                     sparse=False,
                                     input_size=input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'cadata_' + ds + '.libsvm')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [16512, 2064, 2064]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )],
                               [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #21
def load_data(dir_path, input_size=6, targets=set(['0','1','2','3','4']), train_filename=None, test_filename=None, background_filename=None, load_to_memory=True):
    """
    Loads a dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    import mlpython.misc.io as mlio

    # Known metadata
    dir_path = os.path.expanduser(dir_path)

    # Check whether the train/valid/test files already exist; if not, load the data and create them
    train_file, valid_file, finaltrain_file, test_file = [os.path.join(dir_path, ds + '.txt') for ds in ['trainset','validset','finaltrainset','testset']]
    if os.path.exists(train_file):
        print("Train/valid/test files exist, loading data...")
    else:
        print("Train/valid/test files do not exist, creating them...")
        if train_filename is None or test_filename is None:
            print('ERROR, NO TRAIN/TEST FILENAMES GIVEN')
            sys.exit(1)
        else:
            create_files(dir_path,train_filename,test_filename,background_filename,input_size)
        
    # train/valid/test files should exist by now
    if load_to_memory:
        train_data, valid_data, finaltrain_data, test_data = [mlio.libsvm_load(filename=f, input_size=input_size)[0] for f in [train_file, valid_file, finaltrain_file, test_file]]
    else:
        def load_line(line):
            return mlio.libsvm_load_line(line,input_size=input_size)
            
        train_data, valid_data, finaltrain_data, test_data = [mlio.load_from_file(filename=f,load_line=load_line) for f in [train_file, valid_file, finaltrain_file, test_file]]
            
    # Get metadata
    with open(os.path.join(dir_path,'metadata.txt'),'r') as f:
        train_meta,valid_meta,finaltrain_meta,test_meta = [{'input_size':input_size,'length':int(f.readline()[:-1]),'targets':targets} for i in range(4)]
        test_meta['len_bg'] = int(f.readline()[:-1])
        label_weights = {}
        for _ in range(len(targets)):
            label, weight = f.readline()[:-1].split(':')
            label_weights[label] = float(weight)
        finaltrain_meta['label_weights'] = label_weights
        
    return {'train':(train_data,train_meta),'valid':(valid_data,valid_meta), 'finaltrain':(finaltrain_data,finaltrain_meta),'test':(test_data,test_meta)}
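The exact metadata.txt written by create_files is not shown here; the sketch below simply mirrors the read order in load_data above (four split lengths, a background-set length, then one "label:weight" line per target), with made-up values.

# Sketch of a metadata.txt consistent with the read order above (values made up).
import os, tempfile

path = os.path.join(tempfile.gettempdir(), 'metadata.txt')
with open(path, 'w') as f:
    f.write("100\n20\n120\n50\n30\n0:1.0\n1:2.5\n2:1.0\n3:0.5\n4:1.0\n")

with open(path) as f:
    lengths = [int(f.readline()) for _ in range(4)]   # train/valid/finaltrain/test
    len_bg = int(f.readline())                        # background-set length
    label_weights = {}
    for line in f:
        label, weight = line.strip().split(':')
        label_weights[label] = float(weight)
print(lengths, len_bg, label_weights)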
Example #22
def load(dir_path, load_to_memory=False):
    """
    Loads the NIPS 0-12 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``

    """

    input_size = 500
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return np.array([int(i) for i in tokens[:-1]
                         ])  #The last element is bogus (don't ask...)

    train_file, valid_file, test_file = [
        os.path.join(dir_path,
                     'nips-0-12_all_shuffled_bidon_target_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [400, 100, 1240]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, )], [np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #23
def load(dir_path, load_to_memory=False):
    """
    Loads the Corel5k dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """

    input_size = 499
    target_size = 374
    dir_path = os.path.expanduser(dir_path)

    def convert_target(target_str):
        targets = np.zeros((target_size))
        for l in target_str.split(","):
            id = int(l)
            targets[id] = 1
        return targets

    def load_line(line):
        return mlio.libsvm_load_line(line, convert_target=convert_target, sparse=False, input_size=input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, "corel5k_" + ds + ".libsvm") for ds in ["train", "valid", "test"]
    ]
    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line) for f in [train_file, valid_file, test_file]]

    lengths = [3600, 900, 500]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (target_size,)], [np.float64, bool], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {"input_size": input_size, "target_size": target_size, "length": l} for l in lengths
    ]

    return {"train": (train, train_meta), "valid": (valid, valid_meta), "test": (test, test_meta)}
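Here is a standalone sketch of the multi-label target conversion used above: a comma-separated string of label ids becomes a fixed-size indicator vector. A small target_size is used for readability; the function name is illustrative.

# Toy version of convert_target: label-id list -> binary indicator vector.
import numpy as np

def to_indicator(target_str, target_size=10):
    targets = np.zeros(target_size)
    if target_str != '':                  # some datasets have empty label lists
        for l in target_str.split(','):
            targets[int(l)] = 1
    return targets

print(to_indicator("0,3,7"))   # -> [1. 0. 0. 1. 0. 0. 0. 1. 0. 0.]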
Example #24
def load(dir_path,load_to_memory=True,sparse=False,binary_input=False):
    """
    Loads the RCV2 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs have been put in binary format, and the vocabulary has been
    restricted to 10000 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'target_size'``
    * ``'length'``

    """

    input_size=10000
    target_size=103
    dir_path = os.path.expanduser(dir_path)
    def convert_target(target_str):
        targets = np.zeros((target_size))
        for l in target_str.split(','):
            id = int(l)
            targets[id] = 1
        return targets

    def load_line(line):
        return mlio.libsvm_load_line(line,convert_target=convert_target,sparse=sparse,input_size=input_size,input_type=np.int32)

    # Get data
    train_file,valid_file,test_file = [os.path.join(dir_path, ds + '.txt') for ds in ['train','valid','test']]
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]
    if load_to_memory:
        train,valid,test = [ [x for x in f] for f in [train,valid,test] ]

    #lengths = [784414,10000,10000]
    lengths = [392207,10000,402207]

    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,'targets':set(range(2)),'target_size':target_size,'length':l} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #25
def load(dir_path, load_to_memory=False):
    """
    Loads the occluded MNIST dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs and targets have been converted to a binary format.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """

    input_size = 784
    target_size = 784
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:input_size]]), np.array([int(i) for i in tokens[input_size:]]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, "occluded_mnist_" + ds + ".txt") for ds in ["train", "valid", "test"]
    ]
    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line) for f in [train_file, valid_file, test_file]]

    lengths = [50000, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (target_size,)], [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {"input_size": input_size, "target_size": target_size, "length": l} for l in lengths
    ]

    return {"train": (train, train_meta), "valid": (valid, valid_meta), "test": (test, test_meta)}
Example #26
0
def load(dir_path,load_to_memory=False,dtype=np.float64):
    """
    Loads the DNA dataset.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.
    
    Defined metadata: 
    - 'input_size'
    - 'targets'
    - 'length'

    References: Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
                Larochelle, Bengio and Turian
                link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

                LIBSVM Data: Classification, Regression, and Multi-label (web page)
                link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """
    
    input_size=180
    dir_path = os.path.expanduser(dir_path)
    targets = set([0,1,2])
    target_mapping = {'1':0,'2':1,'3':2}
    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        return mlio.libsvm_load_line(line,convert_target=convert_target,sparse=False,input_size=input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'dna_scale_' + ds + '.libsvm') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [1400,600,1186]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,)],[dtype,int],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                              'length':l,'targets':targets} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #27
def load(dir_path,load_to_memory=False):
    """
    Loads the MajMin dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """
    
    input_size=389
    target_size=96
    dir_path = os.path.expanduser(dir_path)
    
    def convert_target(target_str):
        targets = np.zeros((target_size))
        if target_str != '':
            for l in target_str.split(','):
                id = int(l)
                targets[id] = 1
        return targets

    def load_line(line):
        return mlio.libsvm_load_line(line,convert_target=convert_target,sparse=False,input_size=input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'majmin_' + ds + '.libsvm') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [1587,471,480]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(target_size,)],[np.float64,bool],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,'target_size':target_size,
                                        'length':l} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #28
def load(dir_path, load_to_memory=False):
    """
    Loads the MNIST background-random dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """

    input_size = 784
    dir_path = os.path.expanduser(dir_path)
    targets = set(range(10))

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), int(float(tokens[-1])))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, "mnist_background_random_" + ds + ".amat") for ds in ["train", "valid", "test"]
    ]
    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line) for f in [train_file, valid_file, test_file]]

    lengths = [10000, 2000, 50000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{"input_size": input_size, "length": l, "targets": targets} for l in lengths]

    return {"train": (train, train_meta), "valid": (valid, valid_meta), "test": (test, test_meta)}
Example #29
def load(dir_path,load_to_memory=False):
    """
    Loads the MTurk dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """
    
    input_size=389
    target_size=95
    dir_path = os.path.expanduser(dir_path)
    
    def convert_target(target_str):
        targets = np.zeros((target_size))
        if target_str != '':
            for l in target_str.split(','):
                id = int(l)
                targets[id] = 1
        return targets

    def load_line(line):
        return mlio.libsvm_load_line(line,convert_target=convert_target,sparse=False,input_size=input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'mturk_' + ds + '.libsvm') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [597,123,195]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(target_size,)],[np.float64,bool],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,'target_size':target_size,
                                        'length':l} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #30
def load(dir_path,load_to_memory=False):
    """
    Loads the RCV1 dataset. This is actually a smaller version of it, with 150 inputs
    and binary targets.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    
    input_size=150
    dir_path = os.path.expanduser(dir_path)
    targets = set([0,1])
    target_mapping = {'0':0,'1':1}
    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:-1]]),int(tokens[-1]))

    train_file,valid_file,test_file = [os.path.join(dir_path, 'rcv1_all_subset.binary_' + ds + '_voc_150.amat') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [40000,10000,150000]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,)],[np.float64,int],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                              'length':l,'targets':targets} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #31
def load(dir_path,load_to_memory=False):
    """
    Loads the Mushrooms dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of
    data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    
    input_size=112
    dir_path = os.path.expanduser(dir_path)
    targets = set([0,1])
    target_mapping = {'1':0,'2':1}
    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        return mlio.libsvm_load_line(line,convert_target=convert_target,sparse=False,input_size=input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'mushrooms_' + ds + '.libsvm') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [2000,500,5624]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,)],[np.float64,int],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                              'length':l,'targets':targets} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #32
def load(dir_path,load_to_memory=False):
    """
    Loads the 20-newsgroups dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs have been put in binary format, and the vocabulary has been
    restricted to 5000 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    
    input_size=5000
    targets = set(range(20))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]),int(tokens[-1]))

    train_file,valid_file,test_file = [os.path.join(dir_path, '20newsgroups_' + ds + '_binary_5000_voc.txt') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [9578,1691,7505]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,)],[np.float64,int],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                              'length':l,'targets':targets} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #33
def load(dir_path,load_to_memory=False):
    """
    Loads the MNIST dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs have been normalized between 0 and 1.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    
    input_size=784
    targets = set(range(10))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]),int(tokens[-1]))
        #return mlio.libsvm_load_line(line,float,int,sparse,input_size)

    train_file,valid_file,test_file = [os.path.join(dir_path, 'mnist_' + ds + '.txt') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [50000,10000,10000]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,)],[np.float64,int],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                              'length':l,'targets':targets} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #34
def load(dir_path,load_to_memory=False):
    """
    Corrupted OCR letters dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs and targets are binary.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """
    
    input_size=16*(32+3)
    target_size=16*(32+3)
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:input_size]]), np.array([float(i) for i in tokens[input_size:]]))

    train_file,valid_file,test_file = [os.path.join(dir_path, 'corrupted_ocr_letters_' + ds + '.txt') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [10000,2000,2000]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(target_size,)],[np.float64,np.float64],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,'target_size':target_size,
                                        'length':l} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #35
def load(dir_path,load_to_memory=False):
    """
    Labeled Faces in the Wild, occluded faces dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs and targets have been converted to be in the [0,1] interval.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """
    
    input_size=1024
    target_size=1024
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i)/255 for i in tokens[:input_size]]), np.array([float(i)/255 for i in tokens[input_size:]]))

    train_file,valid_file,test_file = [os.path.join(dir_path, 'occluded_faces_lfw_' + ds + '.txt') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [11089,1149,1117]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(target_size,)],[np.float64,np.float64],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size,'target_size':target_size,
                                        'length':l} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #36
def load(dir_path,load_to_memory=False):
    """
    Loads the Rectangles images dataset. 

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    
    input_size = 784
    dir_path = os.path.expanduser(dir_path)
    targets = set(range(2))
        
    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), float(tokens[-1]))
        


    train_file,valid_file,test_file = [os.path.join(dir_path, 'rectangles_images_' + ds + '.amat') for ds in ['train','valid','test']]
    # Get data
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    lengths = [10000, 2000, 50000]
    if load_to_memory:
        train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,)],[np.float64,int],l) for d,l in zip([train,valid,test],lengths)]
        
    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size, 'length':l, 'targets':targets} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #37
def load(dir_path, load_to_memory=False):
    """
    Loads the CAData (California housing prices) dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``

    """

    input_size = 8
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        return mlio.libsvm_load_line(line, float, float, sparse=False, input_size=input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, "cadata_" + ds + ".libsvm") for ds in ["train", "valid", "test"]
    ]
    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line) for f in [train_file, valid_file, test_file]]

    lengths = [16512, 2064, 2064]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{"input_size": input_size, "length": l} for l in lengths]

    return {"train": (train, train_meta), "valid": (valid, valid_meta), "test": (test, test_meta)}
Example #38
0
def load(dir_path,load_to_memory=True,sparse=False):
    """
    Loads the NIPS abstracts dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    The inputs have been put in binary format, and the vocabulary has been
    restricted to 13649 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """

    input_size=13649
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        return mlio.libsvm_load_line(line,convert_target=str,sparse=sparse,input_size=input_size,input_type=np.int32)[0]

    # Get data
    train_file,valid_file,test_file = [os.path.join(dir_path, ds + '.txt') for ds in ['train','valid','test']]
    train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]
    if load_to_memory:
        train,valid,test = [ [x for x in f] for f in [train,valid,test] ]

    lengths = [1640,50,50]

    # Get metadata
    train_meta,valid_meta,test_meta = [{'input_size':input_size, 'length':l} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
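Since the [0] index in load_line keeps only the input, each loaded example here is a single binary bag-of-words vector over the 13649-word vocabulary. A hypothetical sanity check (the module name 'nips_abstracts' and the path are assumptions), counting active words per abstract with the dense default sparse=False:

import nips_abstracts  # hypothetical module holding the load() above

data = nips_abstracts.load('~/data/nips_abstracts', load_to_memory=True, sparse=False)
train, train_meta = data['train']

# With sparse=False each example is a dense length-13649 binary vector.
word_counts = [x.sum() for x in train]
print(len(word_counts), train_meta['input_size'])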
Example #39
0
def load(dir_path, load_to_memory=False):
    """
    Loads the 32 x 32 pixels version of the Street View House Numbers (SVHN) dataset.

    The original 32 x 32 pixels dataset is in color, but it is converted
    to grayscale by this module, with values in [0,1]. The original
    training set is also split into a new training set and a validation
    set. Finally, the original dataset also includes extra labeled
    examples, which are supposed to be easier to classify. Those are
    added to the training set.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``
    * ``'targets'``
    * ``'class_to_id'``

    """

    input_size = 1024
    dir_path = os.path.expanduser(dir_path)

    # Put in grayscale, in [0,1]
    def to_grayscale_normalized(example):
        x, y = example
        new_x = (x[:input_size] * 0.3 + x[input_size:(2 * input_size)] * 0.59 +
                 x[(2 * input_size):(3 * input_size)] * 0.11) / 255.
        return (new_x, y)

    class TransformedIterator:
        def __init__(self, iter, transform):
            self.iter = iter
            self.transform = transform

        def __iter__(self):
            for ex in self.iter:
                yield self.transform(ex)

    if load_to_memory:
        train_inputs = np.load(os.path.join(dir_path,
                                            'train_inputs_32x32.npy'))
        valid_inputs = np.load(os.path.join(dir_path,
                                            'valid_inputs_32x32.npy'))
        test_inputs = np.load(os.path.join(dir_path, 'test_inputs_32x32.npy'))
        train_targets = np.load(
            os.path.join(dir_path, 'train_targets_32x32.npy'))
        valid_targets = np.load(
            os.path.join(dir_path, 'valid_targets_32x32.npy'))
        test_targets = np.load(os.path.join(dir_path,
                                            'test_targets_32x32.npy'))

        train = TransformedIterator(
            mlio.IteratorWithFields(
                np.hstack([train_inputs,
                           train_targets.reshape(-1, 1)]),
                ((0, input_size), (input_size, input_size + 1))),
            to_grayscale_normalized)
        valid = TransformedIterator(
            mlio.IteratorWithFields(
                np.hstack([valid_inputs,
                           valid_targets.reshape(-1, 1)]),
                ((0, input_size), (input_size, input_size + 1))),
            to_grayscale_normalized)
        test = TransformedIterator(
            mlio.IteratorWithFields(
                np.hstack([test_inputs,
                           test_targets.reshape(-1, 1)]),
                ((0, input_size), (input_size, input_size + 1))),
            to_grayscale_normalized)

    else:

        def load_line(line):
            tokens = line.split()
            return (np.array([float(i) for i in tokens[:-1]]), int(tokens[-1]))

        train_file, valid_file, test_file = [
            os.path.join(dir_path, ds + '_32x32.txt')
            for ds in ['train', 'valid', 'test']
        ]
        # Get data
        train, valid, test = [
            TransformedIterator(mlio.load_from_file(f, load_line),
                                to_grayscale_normalized)
            for f in [train_file, valid_file, test_file]
        ]

    # Get metadata
    lengths = [594388, 10000, 26032]
    targets = set(range(1, 11))
    class_to_id = {}
    for t in range(10):
        class_to_id[t + 1] = t

    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets,
        'class_to_id': class_to_id
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
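The grayscale conversion above splits each row into concatenated red, green and blue planes, weights them 0.3/0.59/0.11 and divides by 255. A standalone illustration of the same arithmetic on a toy two-pixel image:

import numpy as np

input_size = 2                        # two pixels per channel plane
x = np.array([255.0, 0.0,             # red plane
              255.0, 0.0,             # green plane
              255.0, 0.0])            # blue plane
gray = (x[:input_size] * 0.3
        + x[input_size:2 * input_size] * 0.59
        + x[2 * input_size:3 * input_size] * 0.11) / 255.
print(gray)  # [1. 0.]: the white pixel maps to 1, the black pixel to 0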
Example #40
0
def load(dir_path,load_to_memory=False):
    """
    Loads the 32 x 32 pixels version of the Street View House Numbers (SVHN) dataset.

    The original 32 x 32 pixels dataset is in color, but it is converted
    to grayscale by this module, with values in [0,1]. The original
    training set is also split into a new training set and a validation
    set. Finally, the original dataset also includes extra labeled
    examples, which are supposed to be easier to classify. Those are
    added to the training set.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``
    * ``'targets'``
    * ``'class_to_id'``

    """

    input_size=1024
    dir_path = os.path.expanduser(dir_path)

    # Put in grayscale, in [0,1]
    def to_grayscale_normalized(example):
        x,y = example
        new_x = (x[:input_size]*0.3 + x[input_size:(2*input_size)]*0.59 + x[(2*input_size):(3*input_size)]*0.11)/255.
        return (new_x,y)

    class TransformedIterator:
        def __init__(self,iter,transform):
            self.iter = iter
            self.transform = transform
            
        def __iter__(self):
            for ex in self.iter:
                yield self.transform(ex)

    if load_to_memory:
        train_inputs = np.load(os.path.join(dir_path,'train_inputs_32x32.npy'))
        valid_inputs = np.load(os.path.join(dir_path,'valid_inputs_32x32.npy'))
        test_inputs = np.load(os.path.join(dir_path,'test_inputs_32x32.npy'))
        train_targets = np.load(os.path.join(dir_path,'train_targets_32x32.npy'))
        valid_targets = np.load(os.path.join(dir_path,'valid_targets_32x32.npy'))
        test_targets = np.load(os.path.join(dir_path,'test_targets_32x32.npy'))


        train = TransformedIterator(mlio.IteratorWithFields(np.hstack([train_inputs,train_targets.reshape(-1,1)]),((0,input_size),(input_size,input_size+1))),
                                    to_grayscale_normalized)
        valid = TransformedIterator(mlio.IteratorWithFields(np.hstack([valid_inputs,valid_targets.reshape(-1,1)]),((0,input_size),(input_size,input_size+1))),
                                    to_grayscale_normalized)
        test = TransformedIterator(mlio.IteratorWithFields(np.hstack([test_inputs,test_targets.reshape(-1,1)]),((0,input_size),(input_size,input_size+1))),
                                   to_grayscale_normalized)

    else:
        def load_line(line):
            tokens = line.split()
            return (np.array([float(i) for i in tokens[:-1]]),int(tokens[-1]))

        train_file,valid_file,test_file = [os.path.join(dir_path, ds + '_32x32.txt') for ds in ['train','valid','test']]
        # Get data
        train,valid,test = [TransformedIterator(mlio.load_from_file(f,load_line),to_grayscale_normalized) for f in [train_file,valid_file,test_file]]

    # Get metadata
    lengths = [594388,10000,26032]
    targets = set(range(1,11))
    class_to_id = {}
    for t in range(10):
        class_to_id[t+1] = t
        
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                                        'length':l,'targets':targets,
                                        'class_to_id':class_to_id} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
Example #41
0
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the RCV1 dataset. This is actually a smaller version of it, with 150 inputs
    and binary targets.

    The data is given by a dictionary mapping from strings
    'train', 'valid' and 'test' to the associated pair of data and metadata.
    
    Defined metadata: 
    - 'input_size'
    - 'targets'
    - 'length'

    References: Tractable Multivariate Binary Density Estimation and the Restricted Boltzmann Forest
                Larochelle, Bengio and Turian
                link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

                LIBSVM Data: Classification, Regression, and Multi-label (web page)
                link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """

    input_size = 150
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1])
    target_mapping = {'0': 0, '1': 1}

    def convert_target(target):
        return target_mapping[target]

    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path,
                     'rcv1_all_subset.binary_' + ds + '_voc_150.amat')
        for ds in ['train', 'valid', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]

    lengths = [40000, 10000, 150000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, )], [dtype, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #42
0
def load(dir_path,load_to_memory=False):
    """
    Loads the CIFAR-10 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``
    * ``'targets'``
    * ``'class_to_id'``

    """

    input_size=3072
    dir_path = os.path.expanduser(dir_path)
    if load_to_memory:
        batch1 = mlio.load(os.path.join(dir_path,'data_batch_1'))
        batch2 = mlio.load(os.path.join(dir_path,'data_batch_2'))
        batch3 = mlio.load(os.path.join(dir_path,'data_batch_3'))
        batch4 = mlio.load(os.path.join(dir_path,'data_batch_4'))
        batch5 = mlio.load(os.path.join(dir_path,'data_batch_5'))
        testbatch = mlio.load(os.path.join(dir_path,'test_batch'))

        train_data = np.vstack([batch1['data'],batch2['data'],batch3['data'],batch4['data']])
        train_labels = np.hstack([batch1['labels'],batch2['labels'],batch3['labels'],batch4['labels']])
        train = mlio.IteratorWithFields(np.hstack([train_data,train_labels.reshape(-1,1)]),((0,input_size),(input_size,input_size+1)))

        valid_data = batch5['data']
        valid_labels = np.array(batch5['labels'])
        valid = mlio.IteratorWithFields(np.hstack([valid_data,valid_labels.reshape(-1,1)]),((0,input_size),(input_size,input_size+1)))

        test_data = testbatch['data']
        test_labels = np.array(testbatch['labels'])
        test = mlio.IteratorWithFields(np.hstack([test_data,test_labels.reshape(-1,1)]),((0,input_size),(input_size,input_size+1)))

    else:
        def load_line(line):
            tokens = line.split()
            return (np.array([int(i) for i in tokens[:-1]]),int(tokens[-1]))

        train_file,valid_file,test_file = [os.path.join(dir_path, 'cifar-10-' + ds + '.txt') for ds in ['train','valid','test']]
        # Get data
        train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]

    # Get metadata
    lengths = [40000,10000,10000]
    other_meta = mlio.load(os.path.join(dir_path,'batches.meta'))
    label_names = other_meta['label_names']
    targets = set(label_names)
    class_to_id = {}
    for i,c in enumerate(label_names):
        class_to_id[c] = i
        
    train_meta,valid_meta,test_meta = [{'input_size':input_size,
                                        'length':l,'targets':targets,
                                        'class_to_id':class_to_id} for l in lengths]
    
    return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
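The class_to_id metadata maps the label names read from batches.meta to integer ids. A small sketch of translating between names and ids; the label list below is the usual CIFAR-10 one, assumed here rather than read from the file:

label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']  # assumed, normally read from batches.meta
class_to_id = {c: i for i, c in enumerate(label_names)}
id_to_class = {i: c for c, i in class_to_id.items()}

print(class_to_id['cat'])   # 3
print(id_to_class[5])       # 'dog'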
Example #43
0
def load(dir_path, load_to_memory=False):
    """
    Loads the CIFAR-10 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``
    * ``'targets'``
    * ``'class_to_id'``

    """

    input_size = 3072
    dir_path = os.path.expanduser(dir_path)
    if load_to_memory:
        batch1 = mlio.load(os.path.join(dir_path, 'data_batch_1'))
        batch2 = mlio.load(os.path.join(dir_path, 'data_batch_2'))
        batch3 = mlio.load(os.path.join(dir_path, 'data_batch_3'))
        batch4 = mlio.load(os.path.join(dir_path, 'data_batch_4'))
        batch5 = mlio.load(os.path.join(dir_path, 'data_batch_5'))
        testbatch = mlio.load(os.path.join(dir_path, 'test_batch'))

        train_data = np.vstack(
            [batch1['data'], batch2['data'], batch3['data'], batch4['data']])
        train_labels = np.hstack([
            batch1['labels'], batch2['labels'], batch3['labels'],
            batch4['labels']
        ])
        train = mlio.IteratorWithFields(
            np.hstack([train_data, train_labels.reshape(-1, 1)]),
            ((0, input_size), (input_size, input_size + 1)))

        valid_data = batch5['data']
        valid_labels = np.array(batch5['labels'])
        valid = mlio.IteratorWithFields(
            np.hstack([valid_data, valid_labels.reshape(-1, 1)]),
            ((0, input_size), (input_size, input_size + 1)))

        test_data = testbatch['data']
        test_labels = np.array(testbatch['labels'])
        test = mlio.IteratorWithFields(
            np.hstack([test_data, test_labels.reshape(-1, 1)]),
            ((0, input_size), (input_size, input_size + 1)))

    else:

        def load_line(line):
            tokens = line.split()
            return (np.array([int(i) for i in tokens[:-1]]), int(tokens[-1]))

        train_file, valid_file, test_file = [
            os.path.join(dir_path, 'cifar-10-' + ds + '.txt')
            for ds in ['train', 'valid', 'test']
        ]
        # Get data
        train, valid, test = [
            mlio.load_from_file(f, load_line)
            for f in [train_file, valid_file, test_file]
        ]

    # Get metadata
    lengths = [40000, 10000, 10000]
    other_meta = mlio.load(os.path.join(dir_path, 'batches.meta'))
    label_names = other_meta['label_names']
    targets = set(label_names)
    class_to_id = {}
    for i, c in enumerate(label_names):
        class_to_id[c] = i

    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets,
        'class_to_id': class_to_id
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
Example #44
0
def load(dir_path,load_to_memory=False,home_made_valid_split=False):
    """
    Loads the Yahoo! Learning to Rank Challenge, Set 2 data.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    Option ``home_made_valid_split`` determines whether the original
    training set should be further split into a "home made"
    train/valid split (default: False). If True, the dictionary mapping
    will contain 4 keys instead of 3: ``'train'`` (home made training set), 
    ``'valid'`` (home made validation set), ``'test'`` (original validation set)
    and ``'test2'`` (original test set).

    **Defined metadata:**

    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'n_pairs'``
    * ``'length'``

    """
    
    input_size=700
    dir_path = os.path.expanduser(dir_path)
    sparse=False

    def convert(feature,value):
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line,convert,int,sparse,input_size)

    if home_made_valid_split:
        n_queries = [1000,266,1266,3798]
        lengths = [27244,7571,34881,103174]

        train_file,valid_file,test_file,test2_file = [os.path.join(dir_path, 'set2.' + ds + '.txt') for ds in ['in_house_train','in_house_valid','valid','test']]
        # Get data
        train,valid,test,test2 = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file,test2_file]]

        if load_to_memory:
            train,valid,test,test2 = [mlio.MemoryDataset(d,[(input_size,),(1,),(1,)],[np.float64,int,int],l) for d,l in zip([train,valid,test,test2],lengths)]

        # Get metadata
        train_meta,valid_meta,test_meta,test2_meta = [{'input_size':input_size,
                                                       'scores':range(5),
                                                       'n_queries':nq,
                                                       'length':l,
                                                       'n_pairs':l} for nq,l in zip(n_queries,lengths)]

        return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta),'test2':(test2,test2_meta)}
    else:
        n_queries = [1266,1266,3798]
        lengths = [34815,34881,103174]

        # Get data file paths
        train_file,valid_file,test_file = [os.path.join(dir_path, 'set2.' + ds + '.txt') for ds in ['train','valid','test']]
        # Get data
        train,valid,test = [mlio.load_from_file(f,load_line) for f in [train_file,valid_file,test_file]]
        if load_to_memory:
            train,valid,test = [mlio.MemoryDataset(d,[(input_size,),(1,),(1,)],[np.float64,int,int],l) for d,l in zip([train,valid,test],lengths)]

        train_meta,valid_meta,test_meta = [{'input_size':input_size,
                                            'scores':range(5),
                                            'n_queries':nq,
                                            'length':l,
                                            'n_pairs':l} for nq,l in zip(n_queries,lengths)]

        return {'train':(train,train_meta),'valid':(valid,valid_meta),'test':(test,test_meta)}
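Each example produced by load_line above is assumed to be an (input, relevance score, query id) triple, with convert() turning the 'qid' field into an integer. A hypothetical usage sketch grouping documents by query (the module name 'yahoo_ltrc2' and the data path are assumptions):

from collections import defaultdict
import numpy as np
import yahoo_ltrc2  # hypothetical module holding the load() above

data = yahoo_ltrc2.load('~/data/yahoo_ltrc2', load_to_memory=True)
train, train_meta = data['train']

by_query = defaultdict(list)
for x, score, qid in train:
    q = int(np.asarray(qid).ravel()[0])     # qid may be stored as a length-1 array
    s = int(np.asarray(score).ravel()[0])   # relevance score in range(5)
    by_query[q].append((x, s))

print(len(by_query), train_meta['n_queries'])  # both should count the training queries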
Example #45
0
def load(dir_path, load_to_memory=False, fold=1):
    """
    Loads the LETOR 4.0 MQ2008 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    This dataset comes with 5 predefined folds, which can be specified
    with option ``fold`` (default = 1). 
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'length'``

    """

    input_size = 46
    dir_path = os.path.expanduser(dir_path)
    sparse = False

    if fold not in [1, 2, 3, 4, 5]:
        raise ValueError(
            'There are 5 predefined folds. Option fold should be an integer between 1 and 5'
        )

    def convert(feature, value):
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line, convert, int, sparse, input_size)

    n_queries = [[471, 157, 156], [471, 156, 157], [470, 157, 157],
                 [470, 157, 157], [470, 157, 157]]

    lengths = [[9630, 2707, 2874], [9404, 2874, 2933], [8643, 2933, 3635],
               [8514, 3635, 3062], [9442, 3062, 2707]]

    # Get data file paths
    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'MQ2008/Fold' + str(fold) + '/' + ds + '.txt')
        for ds in ['train', 'vali', 'test']
    ]
    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, ), (1, )],
                               [np.float64, int, int], l)
            for d, l in zip([train, valid, test], lengths[fold - 1])
        ]

    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'scores': range(3),
        'n_queries': nq,
        'length': l,
        'n_pairs': l
    } for nq, l in zip(n_queries[fold - 1], lengths[fold - 1])]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
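The per-fold bookkeeping above simply indexes the n_queries and lengths tables with fold - 1; each entry pairs a number of queries with the corresponding number of query-document pairs. A short standalone illustration of that selection:

n_queries = [[471, 157, 156], [471, 156, 157], [470, 157, 157],
             [470, 157, 157], [470, 157, 157]]
lengths = [[9630, 2707, 2874], [9404, 2874, 2933], [8643, 2933, 3635],
           [8514, 3635, 3062], [9442, 3062, 2707]]

fold = 3
for split, nq, l in zip(['train', 'valid', 'test'],
                        n_queries[fold - 1], lengths[fold - 1]):
    print(split, nq, 'queries,', l, 'query-document pairs')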
Example #46
0
def load_data(dir_path,
              input_size=6,
              targets=set(['0', '1', '2', '3', '4']),
              train_filename=None,
              test_filename=None,
              background_filename=None,
              load_to_memory=True):
    """
    Loads a dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    import mlpython.misc.io as mlio

    # Known metadata
    dir_path = os.path.expanduser(dir_path)

    # Check whether the train/valid/test files already exist; if not, load the data and create them
    train_file, valid_file, finaltrain_file, test_file = [
        os.path.join(dir_path, ds + '.txt')
        for ds in ['trainset', 'validset', 'finaltrainset', 'testset']
    ]
    if os.path.exists(train_file):
        print "Train/valid/test files exist, loading data..."
    else:
        print "Train/valid/test file do not exist, creating them..."
        if train_filename is None or test_filename is None:
            print('ERROR, NO TRAIN/TEST FILENAMES GIVEN')
            sys.exit(1)
        else:
            create_files(dir_path, train_filename, test_filename,
                         background_filename, input_size)

    # train/valid/test files should exist by now
    if load_to_memory:
        train_data, valid_data, finaltrain_data, test_data = [
            mlio.libsvm_load(filename=f, input_size=input_size)[0]
            for f in [train_file, valid_file, finaltrain_file, test_file]
        ]
    else:

        def load_line(line):
            return mlio.libsvm_load_line(line, input_size=input_size)

        train_data, valid_data, finaltrain_data, test_data = [
            mlio.load_from_file(filename=f, load_line=load_line)
            for f in [train_file, valid_file, finaltrain_file, test_file]
        ]

    # Get metadata
    with open(os.path.join(dir_path, 'metadata.txt'), 'r') as f:
        train_meta, valid_meta, finaltrain_meta, test_meta = [{
            'input_size': input_size,
            'length': int(f.readline()[:-1]),
            'targets': targets
        } for i in range(4)]
        test_meta['len_bg'] = int(f.readline()[:-1])
        label_weights = {}
        for _ in range(len(targets)):
            label, weight = f.readline()[:-1].split(':')
            label_weights[label] = float(weight)
        finaltrain_meta['label_weights'] = label_weights

    return {
        'train': (train_data, train_meta),
        'valid': (valid_data, valid_meta),
        'finaltrain': (finaltrain_data, finaltrain_meta),
        'test': (test_data, test_meta)
    }
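The parsing loop above implies a particular layout for metadata.txt: four split lengths (train, valid, finaltrain, test), then the background-set length, then one 'label:weight' line per target. The string below is a made-up illustration of that inferred layout, not an actual file from the dataset:

example_metadata = """\
800
100
900
200
5000
0:0.2
1:0.2
2:0.2
3:0.2
4:0.2
"""
lines = example_metadata.splitlines()
train_len, valid_len, finaltrain_len, test_len = (int(l) for l in lines[:4])
len_bg = int(lines[4])
label_weights = {lbl: float(w) for lbl, w in (l.split(':') for l in lines[5:])}
print(train_len, len_bg, label_weights)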
Example #47
0
def load(dir_path, load_to_memory=False, home_made_valid_split=False):
    """
    Loads the Yahoo! Learning to Rank Challenge, Set 2 data.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    Option ``home_made_valid_split`` determines whether the original
    training set should be further split into a "home made"
    train/valid split (default: False). If True, the dictionary mapping
    will contain 4 keys instead of 3: ``'train'`` (home made training set), 
    ``'valid'`` (home made validation set), ``'test'`` (original validation set)
    and ``'test2'`` (original test set).

    **Defined metadata:**

    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'n_pairs'``
    * ``'length'``

    """

    input_size = 700
    dir_path = os.path.expanduser(dir_path)
    sparse = False

    def convert(feature, value):
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line, convert, int, sparse, input_size)

    if home_made_valid_split:
        n_queries = [1000, 266, 1266, 3798]
        lengths = [27244, 7571, 34881, 103174]

        train_file, valid_file, test_file, test2_file = [
            os.path.join(dir_path, 'set2.' + ds + '.txt')
            for ds in ['in_house_train', 'in_house_valid', 'valid', 'test']
        ]
        # Get data
        train, valid, test, test2 = [
            mlio.load_from_file(f, load_line)
            for f in [train_file, valid_file, test_file, test2_file]
        ]

        if load_to_memory:
            train, valid, test, test2 = [
                mlio.MemoryDataset(d, [(input_size, ), (1, ), (1, )],
                                   [np.float64, int, int], l)
                for d, l in zip([train, valid, test, test2], lengths)
            ]

        # Get metadata
        train_meta, valid_meta, test_meta, test2_meta = [{
            'input_size': input_size,
            'scores': range(5),
            'n_queries': nq,
            'length': l,
            'n_pairs': l
        } for nq, l in zip(n_queries, lengths)]

        return {
            'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta),
            'test2': (test2, test2_meta)
        }
    else:
        n_queries = [1266, 1266, 3798]
        lengths = [34815, 34881, 103174]

        # Get data file paths
        train_file, valid_file, test_file = [
            os.path.join(dir_path, 'set2.' + ds + '.txt')
            for ds in ['train', 'valid', 'test']
        ]
        # Get data
        train, valid, test = [
            mlio.load_from_file(f, load_line)
            for f in [train_file, valid_file, test_file]
        ]
        if load_to_memory:
            train, valid, test = [
                mlio.MemoryDataset(d, [(input_size, ), (1, ), (1, )],
                                   [np.float64, int, int], l)
                for d, l in zip([train, valid, test], lengths)
            ]

        train_meta, valid_meta, test_meta = [{
            'input_size': input_size,
            'scores': range(5),
            'n_queries': nq,
            'length': l,
            'n_pairs': l
        } for nq, l in zip(n_queries, lengths)]

        return {
            'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)
        }