Example #1
def test_data_home():
    # get_data_home will point to a pre-existing folder
    data_home = get_data_home(data_home=DATA_HOME)
    assert_equal(data_home, DATA_HOME)
    assert_true(os.path.exists(data_home))

    # clear_data_home will delete both the content and the folder it-self
    clear_data_home(data_home=data_home)
    assert_false(os.path.exists(data_home))

    # if the folder is missing it will be created again
    data_home = get_data_home(data_home=DATA_HOME)
    assert_true(os.path.exists(data_home))
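For context, a minimal usage sketch of the two helpers exercised by this test; the custom path below is only an illustration:

import os
from sklearn.datasets import get_data_home, clear_data_home

# Resolve the cache directory (defaults to ~/scikit_learn_data, or the
# SCIKIT_LEARN_DATA environment variable when it is set); the folder is
# created on demand.
data_home = get_data_home()
print(data_home, os.path.exists(data_home))

# An explicit folder can be passed instead, and removed again afterwards.
custom_home = get_data_home(data_home="/tmp/sklearn_cache_demo")  # hypothetical path
clear_data_home(data_home=custom_home)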
Example #2
    def get_unclassified_data(self):
        source_path = os.path.join(get_data_home(), 'tweets_unclassified\\' + self.disease)
        file_paths = []
        for root, directories, files in os.walk(source_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                file_paths.append(file_path)
        print 'unclassified data loaded from ' + str(file_paths)

        tweets = []
        for file_path in file_paths:
            line_num = 0
            with codecs.open(file_path, 'r') as f:
                for line in f:
                    line_num += 1
                    if line_num == 1:
                        # skip the header line
                        continue
                    try:
                        tweets.append(Tweet(line))
                    except Exception:
                        # line_num must be converted to str before concatenation
                        print "Unexpected error in line " + str(line_num) + ":", pickle.sys.exc_info()[0]
        print 'unclassified tweets loaded ' + str(len(tweets))
        return tweets
Example #3
def setup_module():
    check_skip_network()

    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = os.path.join(get_data_home(), "RCV1")
    if not os.path.exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
def setup_working_with_text_data():
    if IS_PYPY and os.environ.get('CI', None):
        raise SkipTest('Skipping too slow test with PyPy on CI')
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
def fetch_vega_spectrum(data_home=None):
    data_home = get_data_home(data_home)
    refspec_file = os.path.join(data_home, REFSPEC_URL.split('/')[-1])
    if not os.path.exists(refspec_file):
        print "downnloading from %s" % REFSPEC_URL
        F = urllib2.urlopen(REFSPEC_URL)
        open(refspec_file, 'w').write(F.read())

    F = open(refspec_file)

    data = np.loadtxt(F)
    return data
def fetch_filter(filter, data_home=None):
    data_home = get_data_home(data_home)
    assert filter in 'ugriz'
    url = URL % filter
    loc = os.path.join(data_home, '%s.dat' % filter)
    if not os.path.exists(loc):
        print "downloading from %s" % url
        F = urllib2.urlopen(url)
        open(loc, 'w').write(F.read())

    F = open(loc)

    data = np.loadtxt(F)
    return data
def fetch_sdss_spec_data(data_home=None):
    data_home = get_data_home(data_home)

    local_file = os.path.join(data_home, os.path.basename(DATA_URL))

    # data directory is password protected so the public can't access it    
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML')
    handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    opener = urllib2.build_opener(handler)

    # download training data
    if not os.path.exists(local_file):
        fhandle = opener.open(DATA_URL)
        open(local_file, 'w').write(fhandle.read())

    return np.load(local_file)
def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.

    """

    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        """Download the dataset."""
        print("downloading dataset (once and for all) into %s" %
              data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            #print (doc)
            yield doc
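A brief usage sketch for the generator above (it assumes the ReutersParser helper referenced in the body is defined in the same module):

# Iterate lazily over the Reuters-21578 documents; each item is a dict
# with 'title', 'body' and 'topics' keys as described in the docstring.
n_acq = 0
for doc in stream_reuters_documents():
    if 'acq' in doc['topics']:
        n_acq += 1
print("documents tagged 'acq':", n_acq)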
Example #9
def _fetch_drug_protein(data_home=None):
    """Fetch drug-protein dataset from the server"""

    base_url = "http://cbio.ensmp.fr/~yyamanishi/substr-domain/"

    # check if this data set has been already downloaded
    data_home = get_data_home(data_home)
    data_home = os.path.join(data_home, 'drug-protein')
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    for base_name in ["drug_repmat.txt", "target_repmat.txt",
                      "inter_admat.txt"]:
        filename = os.path.join(data_home, base_name)

        if not os.path.exists(filename):
            urlname = base_url + base_name

            print("Download data at {}".format(urlname))

            try:
                url = urlopen(urlname)
            except HTTPError as e:
                if e.code == 404:
                    e.msg = "Dataset drug-protein '%s' not found." % base_name
                raise

            try:
                with open(filename, 'w+b') as fhandle:
                    shutil.copyfileobj(url, fhandle)
            except:
                os.remove(filename)
                raise

            url.close()

    return data_home
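A small sketch of how the cached files above might be consumed afterwards; it only assumes the three filenames used by the fetcher:

import os

# _fetch_drug_protein returns the local 'drug-protein' cache directory.
data_dir = _fetch_drug_protein()
for name in ["drug_repmat.txt", "target_repmat.txt", "inter_admat.txt"]:
    path = os.path.join(data_dir, name)
    print(name, os.path.getsize(path), "bytes")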
Example #10
    def create_data(self):
        data_home = get_data_home()
        cache_path = os.path.join(data_home, 'cache\\' + self.disease + self._cl_cut + '\\' + self.cache_name)

        if os.path.exists(cache_path):
            return

        # e.g. C:\Users\[user]\scikit_learn_data\hiv
        # disease_path = os.path.join(data_home, self.disease)
        # e.g. C:\Users\[user]\scikit_learn_data\tweets\hiv
        tweets_path = os.path.join(data_home, 'tweets', self.disease + self._cl_cut)
        if not os.path.exists(tweets_path):
            return
        '''
        *** Manual process:
        Save annotation files as 'Text (MS-DOS)(*.txt)', e.g. tweets1.txt (all annotation files should keep the same format)

        *** Automated process:
        1. Get file names from the C:\Users\[user]\scikit_learn_data\tweets\hiv
        2. For each file read all tweets line by line (only those where the category is not empty)
        3. For each tweet generate a unique file
        '''

        train_path = os.path.join(tweets_path, self.train_folder)
        train_output_path = os.path.join(data_home, self.train_folder,  self.disease + self._cl_cut)
        if not os.path.exists(train_output_path):
            os.makedirs(train_output_path)

        test_path = os.path.join(tweets_path, self.test_folder)
        test_output_path = os.path.join(data_home, self.test_folder,  self.disease + self._cl_cut)
        if not os.path.exists(test_output_path):
            os.makedirs(test_output_path)

        train_tweets = self._load_tweets(train_path)
        self._generate_singular_tweet_files(train_tweets, train_output_path)
        test_tweets = self._load_tweets(test_path)
        self._generate_singular_tweet_files(test_tweets, test_output_path)
Example #11
else:
    # make prediction
    testData = dataAdapter.get_unclassified_data(categories=categories)
    # predicted = clf.classifier.predict(testData.data)
    predicted_prob = clf.classifier.predict_proba(testData.data)
    print('predict done')

    #for i in range(len(testData.data)):
    #    probabilities = predicted_prob[i]
    #    zero_prob = probabilities[0]
    #    one_prob = probabilities[1]
    #
    #    if one_prob > 0.1:
    #        print one_prob
    #
    #    #testData.data[i].append(str(clf.labels[predicted_prob[i]]))

    file_dir = os.path.join(get_data_home(), 'output')

    if not os.path.exists(file_dir):
        os.makedirs(file_dir)

    # np.savetxt(os.path.join(file_dir, "predicted.csv"), predicted, delimiter=",")
    np.savetxt(os.path.join(file_dir, "predicted_prob.csv"),
               predicted_prob,
               delimiter=",")

    print("done")

print('done!')
Example #12
import numpy as np

from sklearn.datasets import fetch_covtype, get_data_home
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import zero_one_loss
from joblib import Memory  # Memory is provided by joblib, not sklearn.utils
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'covertype_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='C', random_state=13):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    y = (data['target'] != 1).astype(np.int)

    # Create train-test split (as [Joachims, 2006])
    chunk_size = 1000
    data_chunks = list(partition(chunk_size, testData))

    print ('start prediction')

    for i,chunk in enumerate(data_chunks):
        t0 = time()
        predicted = clf.classifier.predict(list(chunk))
        ranTime = time() - t0
        print ('progress ' + str(round((i+1)/float(len(data_chunks)) * 100,2)) + '% last_predict_time=' + str(ranTime))
        for j in range(len(chunk)):
            testData[i*chunk_size+j].talk_about = str(clf.labels[predicted[j]])

    print ('predict done')

    file_dir = os.path.join(get_data_home(), 'output', disease, cl_cut)

    if not os.path.exists(file_dir):
        os.makedirs(file_dir)

    file_path = os.path.join(file_dir, 'output.txt')

    with codecs.open(file_path, "w", "utf-8") as text_file:
        for i in range(len(testData)):
            try:
                tweet = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\n". \
                    format(testData[i].tweet_id,
                           testData[i].query,
                           testData[i].disease,
                           testData[i].created_at,
                           testData[i].screen_name,
from sklearn.datasets import fetch_olivetti_faces
from sklearn.datasets import fetch_lfw_people
from sklearn.datasets import get_data_home


if __name__ == "__main__":
    fetch_olivetti_faces()

    print("Loading Labeled Faces Data (~200MB)")
    fetch_lfw_people(min_faces_per_person=70, resize=0.4)
    print("=> Success!")
    print("Data saved in %s" % get_data_home())
Example #15
def setup_rcv1():
    check_skip_network()
    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = join(get_data_home(), "RCV1")
    if not exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
Example #16
def setup_twenty_newsgroups():
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
Example #17
def fetch_jrcacquis(
        langs=None,
        data_path=None,
        years=None,
        ignore_unclassified=True,
        cat_filter=None,
        cat_threshold=0,
        parallel=None,
        most_frequent=-1,
        DOWNLOAD_URL_BASE='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'
):

    assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'
    if not langs:
        langs = JRC_LANGS
    else:
        if isinstance(langs, str): langs = [langs]
        for l in langs:
            if l not in JRC_LANGS:
                raise ValueError(
                    'Language %s is not among the valid languages in JRC-Acquis v3'
                    % l)

    if not data_path:
        data_path = get_data_home()

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0
    for l in langs:
        file_name = 'jrc-' + l + '.tgz'
        archive_path = join(data_path, file_name)

        if not os.path.exists(archive_path):
            print(
                "downloading language-specific dataset (once and for all) into %s"
                % data_path)
            DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
            download_file(DOWNLOAD_URL, archive_path)
            print("untarring dataset...")
            tarfile.open(archive_path, 'r:gz').extractall(data_path)

        documents_dir = join(data_path, l)

        print("Reading documents...")
        read = 0
        for dir in list_dirs(documents_dir):
            year = int(dir)
            if years is None or year in years:
                year_dir = join(documents_dir, dir)
                pickle_name = join(data_path,
                                   'jrc_' + l + '_' + dir + '.pickle')
                if os.path.exists(pickle_name):
                    print("loading from file %s" % pickle_name)
                    l_y_documents = pickle.load(open(pickle_name, "rb"))
                    read += len(l_y_documents)
                else:
                    l_y_documents = []
                    all_documents = list_files(year_dir)
                    empty = 0
                    for i, doc_file in enumerate(all_documents):
                        try:
                            jrc_doc = parse_document(join(year_dir, doc_file),
                                                     year)
                        except ValueError:
                            jrc_doc = None

                        if jrc_doc and (not ignore_unclassified
                                        or jrc_doc.categories):
                            l_y_documents.append(jrc_doc)
                        else:
                            empty += 1
                        if len(all_documents) > 50 and (
                            (i + 1) % (len(all_documents) // 50) == 0):
                            print('\r\tfrom %s: completed %d%%' %
                                  (year_dir,
                                   int((i + 1) * 100.0 / len(all_documents))),
                                  end='')
                        read += 1
                    print(
                        '\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n'
                        % (year_dir, i + 1, empty),
                        end='')
                    print("\t\t(Pickling object for future runs in %s)" %
                          pickle_name)
                    pickle.dump(l_y_documents, open(pickle_name, 'wb'),
                                pickle.HIGHEST_PROTOCOL)
                request += l_y_documents
        print("Read %d documents for language %s\n" % (read, l))
        total_read += read
    print("Read %d documents in total" % (total_read))

    if parallel == 'force':
        request = _force_parallel(request, langs)
    elif parallel == 'avoid':
        request = random_sampling_avoiding_parallel(request)

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats
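A hedged usage sketch for the loader above; 'en' is assumed to be one of the codes in JRC_LANGS, and the helpers it calls (parse_document, list_dirs, download_file) are assumed to live in the same module:

# Fetch only the English documents for two years and keep the
# 10 most frequent categories; returns (documents, categories).
docs, cats = fetch_jrcacquis(langs='en', years=[2005, 2006], most_frequent=10)
print(len(docs), 'documents,', len(cats), 'categories')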
Example #18
def setup_twenty_newsgroups():
    data_home = get_data_home()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
import numpy as np

from sklearn.datasets import fetch_covtype, get_data_home
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import zero_one_loss
from sklearn.externals.joblib import Memory
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'covertype_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='C', random_state=13):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True,
                         shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    y = (data['target'] != 1).astype(np.int)
Example #20
def fetch_mnist(data_home=None):
    mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
    data_home = get_data_home(data_home=data_home)
    data_home = os.path.join(data_home, 'mldata')
    if not os.path.exists(data_home):
        os.makedirs(data_home)
    mnist_save_path = os.path.join(data_home, "mnist-original.mat")
    if not os.path.exists(mnist_save_path):
        mnist_url = urllib.request.urlopen(mnist_alternative_url)
        with open(mnist_save_path, "wb") as matlab_file:
            copyfileobj(mnist_url, matlab_file)


#Step 1. Downloading the Data (MNIST)

print(get_data_home())

fetch_mnist()
mnist = fetch_mldata('MNIST original')

# These are the images
# There are 70,000 images (28 by 28 images for a dimensionality of 784)
print("Number of images: ", mnist.data.shape)
# These are the labels
print("Labels: ", mnist.target.shape)

#Step 2.  Splitting Data into Training and Test Sets (MNIST)
train_img, test_img, train_lbl, test_lbl = train_test_split(mnist.data,
                                                            mnist.target,
                                                            test_size=1 / 7.0,
                                                            random_state=0)
Example #21
from sklearn.metrics import confusion_matrix, accuracy_score
from skimage.transform import rotate

import missinglink

project = missinglink.SkLearnProject()

# Optional: Name this experiment. `display_name` is always visible in the experiments
# table. While the `description` is accessible by clicking the note icon.
project.set_properties(display_name="MNIST", description="Using scikit-learn")

print(__doc__)

# Load data from https://www.openml.org/d/554
print("Loading data")
print("Data home: {}".format(get_data_home()))
data, target = fetch_openml('mnist_784', version=1, return_X_y=True)
rotate = False
model_type = "forest"
#model_type = "mlp"

# rescale the data, use the traditional train/test split
print("Rescaling {} datapoints".format(data.shape))
data = data / 255.
split = 10000  # out of 70000
data_train, data_test = data[:split], data[split:]
target_train, target_test = target[:split], target[split:]

if rotate:
    print("Adding rotation")
    data_train = np.append(
Example #22
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_openml('mnist_784')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
Example #23
def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")
def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
Example #25
def setup_labeled_faces():
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")
Example #26
def fetch_datasets(data_home=None,
                   filter_data=None,
                   download_if_missing=True,
                   random_state=None,
                   shuffle=False,
                   verbose=False):
    """Load the benchmark datasets from Zenodo, downloading it if necessary.

    Parameters
    ----------
    data_home : string, optional (default=None)
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    filter_data : tuple of str/int or None, optional (default=None)
        A tuple containing the ID or the name of the datasets to be returned.
        Refer to the above table to get the ID and name of the datasets.

    download_if_missing : boolean, optional (default=True)
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, optional (default=None)
        Random state for shuffling the dataset.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : bool, optional (default=False)
        Whether to shuffle dataset.

    verbose : bool, optional (default=False)
        Show information regarding the fetching.

    Returns
    -------
    datasets : OrderedDict of Bunch object,
        The order is defined by ``filter_data``. Each Bunch object ---
        referred to as dataset --- has the following attributes:

    dataset.data : ndarray, shape (n_samples, n_features)

    dataset.target : ndarray, shape (n_samples, )

    dataset.DESCR : string
        Description of each dataset.

    Notes
    -----
    This collection of datasets has been proposed in [1]_. The
    characteristics of the available datasets are presented in the table
    below.

    +--+--------------+-------------------------------+-------+---------+-----+
    |ID|Name          | Repository & Target           | Ratio | #S      | #F  |
    +==+==============+===============================+=======+=========+=====+
    |1 |ecoli         | UCI, target: imU              | 8.6:1 | 336     | 7   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |2 |optical_digits| UCI, target: 8                | 9.1:1 | 5,620   | 64  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |3 |satimage      | UCI, target: 4                | 9.3:1 | 6,435   | 36  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |4 |pen_digits    | UCI, target: 5                | 9.4:1 | 10,992  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |5 |abalone       | UCI, target: 7                | 9.7:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |6 |sick_euthyroid| UCI, target: sick euthyroid   | 9.8:1 | 3,163   | 42  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |7 |spectrometer  | UCI, target: >=44             | 11:1  | 531     | 93  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |8 |car_eval_34   | UCI, target: good, v good     | 12:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |9 |isolet        | UCI, target: A, B             | 12:1  | 7,797   | 617 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |10|us_crime      | UCI, target: >0.65            | 12:1  | 1,994   | 100 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |11|yeast_ml8     | LIBSVM, target: 8             | 13:1  | 2,417   | 103 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |12|scene         | LIBSVM, target: >one label    | 13:1  | 2,407   | 294 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |13|libras_move   | UCI, target: 1                | 14:1  | 360     | 90  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |14|thyroid_sick  | UCI, target: sick             | 15:1  | 3,772   | 52  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |15|coil_2000     | KDD, CoIL, target: minority   | 16:1  | 9,822   | 85  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |16|arrhythmia    | UCI, target: 06               | 17:1  | 452     | 278 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |17|solar_flare_m0| UCI, target: M->0             | 19:1  | 1,389   | 32  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |18|oil           | UCI, target: minority         | 22:1  | 937     | 49  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |19|car_eval_4    | UCI, target: vgood            | 26:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |20|wine_quality  | UCI, wine, target: <=4        | 26:1  | 4,898   | 11  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |21|letter_img    | UCI, target: Z                | 26:1  | 20,000  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |22|yeast_me2     | UCI, target: ME2              | 28:1  | 1,484   | 8   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |23|webpage       | LIBSVM, w7a, target: minority | 33:1  | 34,780  | 300 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |24|ozone_level   | UCI, ozone, data              | 34:1  | 2,536   | 72  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |25|mammography   | UCI, target: minority         | 42:1  | 11,183  | 6   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+

    References
    ----------
    .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
       Imbalanced Data Learning and their Application in Bioinformatics."
       Dissertation, Georgia State University, (2011).
    """

    data_home = get_data_home(data_home=data_home)
    zenodo_dir = join(data_home, "zenodo")
    datasets = OrderedDict()

    if filter_data is None:
        filter_data_ = MAP_NAME_ID.keys()
    else:
        list_data = MAP_NAME_ID.keys()
        filter_data_ = []
        for it in filter_data:
            if isinstance(it, six.string_types):
                if it not in list_data:
                    raise ValueError('{} is not a dataset available. '
                                     'The available datasets are {}'.format(
                                         it, list_data))
                else:
                    filter_data_.append(it)
            elif isinstance(it, int):
                if it < 1 or it > 27:
                    raise ValueError('The dataset with the ID={} is not an '
                                     'available dataset. The IDs are '
                                     '{}'.format(it, range(1, 28)))
                else:
                    # The index start at one, then we need to remove one
                    # to not have issue with the indexing.
                    filter_data_.append(MAP_ID_NAME[it])
            else:
                raise ValueError('The value in the tuple should be str or int.'
                                 ' Got {} instead.'.format(type(it)))

    # go through the list and check if the data are available
    for it in filter_data_:
        filename = PRE_FILENAME + str(MAP_NAME_ID[it]) + POST_FILENAME
        filename = join(zenodo_dir, filename)
        available = isfile(filename)

        if download_if_missing and not available:
            makedirs(zenodo_dir, exist_ok=True)
            if verbose:
                print("Downloading %s" % URL)
            f = BytesIO(urlopen(URL).read())
            tar = tarfile.open(fileobj=f)
            tar.extractall(path=zenodo_dir)
        elif not download_if_missing and not available:
            raise IOError("Data not found and `download_if_missing` is False")

        data = np.load(filename)
        X, y = data['data'], data['label']

        if shuffle:
            ind = np.arange(X.shape[0])
            rng = check_random_state(random_state)
            rng.shuffle(ind)
            X = X[ind]
            y = y[ind]

        datasets[it] = Bunch(data=X, target=y, DESCR=it)

    return datasets
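A minimal call sketch for the fetcher above, based only on its documented parameters:

# Fetch a single dataset by name; the result is an OrderedDict of Bunch
# objects keyed by dataset name.
datasets = fetch_datasets(filter_data=('ecoli',), shuffle=True, random_state=0)
ecoli = datasets['ecoli']
print(ecoli.data.shape, ecoli.target.shape, ecoli.DESCR)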
Example #27
    def get_data(self, subset='train', categories=None, shuffle=True, random_state=42):
        data_home = get_data_home()
        cache_path = os.path.join(data_home, 'cache\\' + self.disease + self._cl_cut + '\\' + self.cache_name)
        train_path = os.path.join(data_home, self.train_folder, self.disease + self._cl_cut)
        test_path = os.path.join(data_home, self.test_folder, self.disease + self._cl_cut)
        cache = None
        if os.path.exists(cache_path):
            try:
                with open(cache_path, 'rb') as f:
                    compressed_content = f.read()
                uncompressed_content = codecs.decode(
                    compressed_content, 'zlib_codec')
                cache = pickle.loads(uncompressed_content)
            except Exception as e:
                print(80 * '_')
                print('Cache loading failed')
                print(80 * '_')
                print(e)

        if cache is None:
            cache = self.get_cache(train_path=train_path, test_path=test_path, cache_path=cache_path)

        if subset in ('train', 'test'):
            data = cache[subset]
        elif subset == 'all':
            data_lst = list()
            target = list()
            filenames = list()
            for subset in ('train', 'test'):
                data = cache[subset]
                data_lst.extend(data.data)
                target.extend(data.target)
                filenames.extend(data.filenames)

            data.data = data_lst
            data.target = np.array(target)
            data.filenames = np.array(filenames)
        else:
            raise ValueError(
                "subset can only be 'train', 'test' or 'all', got '%s'" % subset)

        data.description = 'The HIV dataset'

        if categories is not None:
            labels = [(data.target_names.index(cat), cat) for cat in categories]
            # Sort the categories to have the ordering of the labels
            labels.sort()
            labels, categories = zip(*labels)
            mask = np.in1d(data.target, labels)
            data.filenames = data.filenames[mask]
            data.target = data.target[mask]
            # searchsorted to have continuous labels
            data.target = np.searchsorted(labels, data.target)
            data.target_names = list(categories)
            # Use an object array to shuffle: avoids memory copy
            data_lst = np.array(data.data, dtype=object)
            data_lst = data_lst[mask]
            data.data = data_lst.tolist()

        if shuffle:
            random_state = validation.check_random_state(random_state)
            indices = np.arange(data.target.shape[0])
            random_state.shuffle(indices)
            data.filenames = data.filenames[indices]
            data.target = data.target[indices]
            # Use an object array to shuffle: avoids memory copy
            data_lst = np.array(data.data, dtype=object)
            data_lst = data_lst[indices]
            data.data = data_lst.tolist()

        return data
def setup_twenty_newsgroups():
    data_home = get_data_home()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
import io
from scipy.io.arff import loadarff
import matplotlib.pyplot  as plt
from sklearn.datasets import get_data_home
from sklearn.externals.joblib import Memory
from sklearn.neural_network import MLPClassifier
try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen

memory = Memory(get_data_home())
@memory.cache()
def fetch_mnist():
    content = urlopen('https://www.openml.org/data/download/52667/mnist_784.arff').read()
    data,meta = loadarff(io.StringIO(content.decode('utf8')))
    data = data.view([('pixels', '<f8', 784), ('class', '|S1')])
    return data['pixels'],data['class']
x, y = fetch_mnist()
x_train, x_test = x[:6000], x[6000:]
y_train, y_test = y[:6000], y[6000:]
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.1)
mlp.fit(x_train, y_train)
print("Training set score: %f" % mlp.score(x_train, y_train))
print("Test set score: %f" % mlp.score(x_test, y_test))
fig,axes = plt.subplots(4,4)
vmin, vmax = mlp.coefs_[0].min(),mlp.coefs_[0].max()
for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin,
               vmax=.5 * vmax)
    ax.set_xticks(())
Example #30
def setup_labeled_faces():
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")
Example #31
def setup_working_with_text_data():
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
Example #32
plt.legend(loc='upper right')
plt.show()
'''
x_index = 0
y_index = 3
'''
for label,color in zip(range(len(d1.target_names)),colors):
    plt.scatter(d1.data[d1.target == label, x_index], d1.data[d1.target == label, y_index], label=d1.target_names[label], color=color)  # scatter plot

plt.xlabel(d1.feature_names[x_index])
plt.ylabel(d1.feature_names[y_index])
plt.legend(loc='upper left')
plt.show()

'''

'''
fig = plt.figure(figsize=(6,6))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

for i in range(64):
    ax = fig.add_subplot(8,8,i+1,xticks=[],yticks=[])
    ax.imshow(d3.images[i],cmap=plt.cm.binary,interpolation="nearest")
    ax.text(0,7,str(d3.target[i]))
plt.show()
'''

#china = datasets.load_sample_image('china.jpg')

print(datasets.get_data_home())
Example #33
# Each number generator uses the same seed to avoid coupling issues between
# estimators.
op.add_option("--random-seed",
              dest="random_seed", default=13, type=int,
              help="Common seed used by random number generator.")

op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
joblib_cache_folder = os.path.join(get_data_home(), 'covertype_benchmark_data')
m = Memory(joblib_cache_folder, mmap_mode='r')


# Load the data, then cache and memmap the train/test split
@m.cache
def load_data(dtype=np.float32, order='C'):
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=opts.random_seed)
    X, y = data['data'], data['target']
    X = np.asarray(X, dtype=dtype)
    
    if order.lower() == 'f':
Example #34
from sklearn.datasets import get_data_home
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.externals.joblib import Memory
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
Example #35
#In [4]: boston.
#boston.DESCR         boston.items         boston.target
#boston.clear         boston.iteritems     boston.update
#boston.copy          boston.iterkeys      boston.values
#boston.data          boston.itervalues    boston.viewitems
#boston.feature_names boston.keys          boston.viewkeys
#boston.fromkeys      boston.pop           boston.viewvalues
#boston.get           boston.popitem
#boston.has_key       boston.setdefault

#housing = datasets.fetch_california_housing()

#downloading Cal. housing from http://lib.stat.cmu.edu/modules.php?op=modload&nam
#e=Downloads&file=index&req=getit&lid=83 to C:\Users\c01843\scikit_learn_data

datasets.get_data_home()
#Out[8]: 'C:\\Users\\c01843\\scikit_learn_data'

X, y = boston.data, boston.target

datasets.make_biclusters
datasets.make_blobs
datasets.make_checkerboard
datasets.make_circles
datasets.make_classification
#datasets.make_biclusters                datasets.make_friedman3                 datasets.make_s_curve
#datasets.make_blobs                     datasets.make_gaussian_quantiles        datasets.make_sparse_coded_signal
#datasets.make_checkerboard              datasets.make_hastie_10_2               datasets.make_sparse_spd_matrix
#datasets.make_circles                   datasets.make_low_rank_matrix           datasets.make_sparse_uncorrelated
#datasets.make_classification            datasets.make_moons                     datasets.make_spd_matrix
#datasets.make_friedman1                 datasets.make_multilabel_classification datasets.make_swiss_roll
Example #36
def setup_twenty_newsgroups():
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
Example #37
def fetch_datasets(
    *,
    data_home=None,
    filter_data=None,
    download_if_missing=True,
    random_state=None,
    shuffle=False,
    verbose=False,
):
    """Load the benchmark datasets from Zenodo, downloading it if necessary.

    .. versionadded:: 0.3

    Parameters
    ----------
    data_home : str, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    filter_data : tuple of str/int, default=None
        A tuple containing the ID or the name of the datasets to be returned.
        Refer to the above table to get the ID and name of the datasets.

    download_if_missing : bool, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, default=None
        Random state for shuffling the dataset.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : bool, default=False
        Whether to shuffle dataset.

    verbose : bool, default=False
        Show information regarding the fetching.

    Returns
    -------
    datasets : OrderedDict of Bunch object,
        The order is defined by ``filter_data``. Each Bunch object ---
        referred to as dataset --- has the following attributes:

    dataset.data : ndarray of shape (n_samples, n_features)

    dataset.target : ndarray of shape (n_samples,)

    dataset.DESCR : str
        Description of each dataset.

    Notes
    -----
    This collection of datasets has been proposed in [1]_. The
    characteristics of the available datasets are presented in the table
    below.

    +--+--------------+-------------------------------+-------+---------+-----+
    |ID|Name          | Repository & Target           | Ratio | #S      | #F  |
    +==+==============+===============================+=======+=========+=====+
    |1 |ecoli         | UCI, target: imU              | 8.6:1 | 336     | 7   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |2 |optical_digits| UCI, target: 8                | 9.1:1 | 5,620   | 64  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |3 |satimage      | UCI, target: 4                | 9.3:1 | 6,435   | 36  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |4 |pen_digits    | UCI, target: 5                | 9.4:1 | 10,992  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |5 |abalone       | UCI, target: 7                | 9.7:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |6 |sick_euthyroid| UCI, target: sick euthyroid   | 9.8:1 | 3,163   | 42  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |7 |spectrometer  | UCI, target: >=44             | 11:1  | 531     | 93  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |8 |car_eval_34   | UCI, target: good, v good     | 12:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |9 |isolet        | UCI, target: A, B             | 12:1  | 7,797   | 617 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |10|us_crime      | UCI, target: >0.65            | 12:1  | 1,994   | 100 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |11|yeast_ml8     | LIBSVM, target: 8             | 13:1  | 2,417   | 103 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |12|scene         | LIBSVM, target: >one label    | 13:1  | 2,407   | 294 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |13|libras_move   | UCI, target: 1                | 14:1  | 360     | 90  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |14|thyroid_sick  | UCI, target: sick             | 15:1  | 3,772   | 52  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |15|coil_2000     | KDD, CoIL, target: minority   | 16:1  | 9,822   | 85  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |16|arrhythmia    | UCI, target: 06               | 17:1  | 452     | 278 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |17|solar_flare_m0| UCI, target: M->0             | 19:1  | 1,389   | 32  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |18|oil           | UCI, target: minority         | 22:1  | 937     | 49  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |19|car_eval_4    | UCI, target: vgood            | 26:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |20|wine_quality  | UCI, wine, target: <=4        | 26:1  | 4,898   | 11  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |21|letter_img    | UCI, target: Z                | 26:1  | 20,000  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |22|yeast_me2     | UCI, target: ME2              | 28:1  | 1,484   | 8   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |23|webpage       | LIBSVM, w7a, target: minority | 33:1  | 34,780  | 300 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |24|ozone_level   | UCI, ozone, data              | 34:1  | 2,536   | 72  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |25|mammography   | UCI, target: minority         | 42:1  | 11,183  | 6   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+

    References
    ----------
    .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
       Imbalanced Data Learning and their Application in Bioinformatics."
       Dissertation, Georgia State University, (2011).
    """

    data_home = get_data_home(data_home=data_home)
    zenodo_dir = join(data_home, "zenodo")
    datasets = OrderedDict()

    if filter_data is None:
        filter_data_ = MAP_NAME_ID.keys()
    else:
        list_data = MAP_NAME_ID.keys()
        filter_data_ = []
        for it in filter_data:
            if isinstance(it, str):
                if it not in list_data:
                    raise ValueError(
                        f"{it} is not a dataset available. "
                        f"The available datasets are {list_data}"
                    )
                else:
                    filter_data_.append(it)
            elif isinstance(it, int):
                if it < 1 or it > 27:
                    raise ValueError(
                        f"The dataset with the ID={it} is not an "
                        f"available dataset. The IDs are "
                        f"{range(1, 28)}"
                    )
                else:
                    # The index start at one, then we need to remove one
                    # to not have issue with the indexing.
                    filter_data_.append(MAP_ID_NAME[it])
            else:
                raise ValueError(
                    f"The value in the tuple should be str or int."
                    f" Got {type(it)} instead."
                )

    # go through the list and check if the data are available
    for it in filter_data_:
        filename = PRE_FILENAME + str(MAP_NAME_ID[it]) + POST_FILENAME
        filename = join(zenodo_dir, filename)
        available = isfile(filename)

        if download_if_missing and not available:
            makedirs(zenodo_dir, exist_ok=True)
            if verbose:
                print("Downloading %s" % URL)
            f = BytesIO(urlopen(URL).read())
            tar = tarfile.open(fileobj=f)
            tar.extractall(path=zenodo_dir)
        elif not download_if_missing and not available:
            raise IOError("Data not found and `download_if_missing` is False")

        data = np.load(filename)
        X, y = data["data"], data["label"]

        if shuffle:
            ind = np.arange(X.shape[0])
            rng = check_random_state(random_state)
            rng.shuffle(ind)
            X = X[ind]
            y = y[ind]

        datasets[it] = Bunch(data=X, target=y, DESCR=it)

    return datasets
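As with the earlier variant, a short call sketch based only on the documented parameters (numeric IDs map to dataset names via the table above):

import numpy as np

# Fetch two datasets by their IDs and inspect their class balance.
selected = fetch_datasets(filter_data=(1, 27), verbose=True)
for name, bunch in selected.items():
    values, counts = np.unique(bunch.target, return_counts=True)
    print(name, dict(zip(values.tolist(), counts.tolist())))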