Example #1
def load_data(shuffle=True, n_cols=None):
    train_path = get_file(
        'P1B1.train.csv',
        origin=
        'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.train.csv'
    )
    test_path = get_file(
        'P1B1.test.csv',
        origin=
        'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.test.csv'
    )

    usecols = list(range(n_cols)) if n_cols else None

    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)

    df_train = df_train.drop('case_id', 1).astype(np.float32)
    df_test = df_test.drop('case_id', 1).astype(np.float32)

    if shuffle:
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)

    X_train = df_train.as_matrix()
    X_test = df_test.as_matrix()

    scaler = MaxAbsScaler()
    mat = np.concatenate((X_train, X_test), axis=0)
    mat = scaler.fit_transform(mat)

    X_train = mat[:X_train.shape[0], :]
    X_test = mat[X_train.shape[0]:, :]

    return X_train, X_test
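A minimal usage sketch for the loader above (not part of the original example). It assumes the function is importable from a module, hypothetically named p1b1_data here, and that the module defines the `seed` used for shuffling; the returned matrices are MaxAbs-scaled float32 arrays with matching feature columns.

from p1b1_data import load_data  # hypothetical module name for the example above

X_train, X_test = load_data(shuffle=True, n_cols=100)
assert X_train.shape[1] == X_test.shape[1]         # train and test share feature columns
print(X_train.dtype, X_train.shape, X_test.shape)  # float32 arrays scaled to [-1, 1]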
Example #2
def load_data(shuffle=True, n_cols=None):
    train_path = get_file(
        'P1B2.train.csv',
        origin=
        'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.train.csv'
    )
    test_path = get_file(
        'P1B2.test.csv',
        origin=
        'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.test.csv'
    )

    usecols = list(range(n_cols)) if n_cols else None

    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)

    if shuffle:
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)

    X_train = df_train.iloc[:, 2:].as_matrix()
    X_test = df_test.iloc[:, 2:].as_matrix()

    y_train = pd.get_dummies(df_train[['cancer_type']]).as_matrix()
    y_test = pd.get_dummies(df_test[['cancer_type']]).as_matrix()

    return (X_train, y_train), (X_test, y_test)
Example #3
def load_cell_proteome(ncols=None, scaling='std', add_prefix=True):
    """Load cell line microRNA data, sub-select columns randomly if
        specificed, scale the selected data and return a pandas
        dataframe.

    Parameters
    ----------
    ncols : int or None
        number of columns to randomly subselect (default None : use all data)
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    add_prefix: True or False
        add feature namespace prefix
    """

    path1 = get_file(P1B3_URL + 'nci60_proteome_log2.transposed.tsv')
    path2 = get_file(P1B3_URL + 'nci60_kinome_log2.transposed.tsv')

    df = global_cache.get(path1)
    if df is None:
        df = pd.read_csv(path1, sep='\t', engine='c')
        global_cache[path1] = df

    df_k = global_cache.get(path2)
    if df_k is None:
        df_k = pd.read_csv(path2, sep='\t', engine='c')
        global_cache[path2] = df_k

    df = df.set_index('CellLine')
    df_k = df_k.set_index('CellLine')

    if add_prefix:
        df = df.add_prefix('prot.')
        df_k = df_k.add_prefix('kino.')
    else:
        df_k = df_k.add_suffix('.K')

    df = df.merge(df_k, left_index=True, right_index=True)

    index = df.index.map(lambda x: x.replace('.', ':'))

    total = df.shape[1]
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        df = df.iloc[:, usecols]

    df = impute_and_scale(df, scaling)
    df = df.astype(np.float32)

    df.index = index
    df.index.names = ['CELLNAME']
    df = df.reset_index()

    return df
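The proteome/kinome loader above returns one row per cell line keyed by a CELLNAME column, so a common follow-up is merging it with another cell-line feature table on that key. A hedged sketch, assuming load_cell_proteome and a companion loader such as load_cell_expression_u133p2 (shown later in this collection) are importable from the same module and that both use the same CELLNAME naming convention:

df_prot = load_cell_proteome(ncols=500, scaling='std')
df_expr = load_cell_expression_u133p2(ncols=500, scaling='std')

# inner join on the shared CELLNAME key to build one combined feature row per cell line
df_cell = df_prot.merge(df_expr, on='CELLNAME', how='inner')
print(df_cell.shape)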
Example #4
def _load_fashion_mnist():
    dirname = os.path.join('datasets', 'fashion-mnist')
    base = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/'
    files = [
        'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz',
        't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz'
    ]

    paths = []
    for fname in files:
        paths.append(get_file(fname, origin=base + fname,
                              cache_subdir=dirname))

    with gzip.open(paths[0], 'rb') as lbpath:
        y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[1], 'rb') as imgpath:
        x_train = np.frombuffer(imgpath.read(), np.uint8,
                                offset=16).reshape(len(y_train), 28, 28)

    with gzip.open(paths[2], 'rb') as lbpath:
        y_test = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[3], 'rb') as imgpath:
        x_test = np.frombuffer(imgpath.read(), np.uint8,
                               offset=16).reshape(len(y_test), 28, 28)

    # prevent compatibility issues
    x_train = np.expand_dims(x_train, axis=-1)
    y_train = np.expand_dims(y_train, axis=-1)
    x_test = np.expand_dims(x_test, axis=-1)
    y_test = np.expand_dims(y_test, axis=-1)

    return (x_train, y_train), (x_test, y_test)
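An illustrative next step (not from the original source): the Fashion-MNIST arrays come back as uint8 with a trailing channel axis, so they are typically cast to float32 and scaled to [0, 1] before training.

import numpy as np

(x_train, y_train), (x_test, y_test) = _load_fashion_mnist()
x_train = x_train.astype(np.float32) / 255.0
x_test = x_test.astype(np.float32) / 255.0
print(x_train.shape, y_train.shape)  # (60000, 28, 28, 1) (60000, 1)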
Example #5
def _load_cifar10():
    dirname = 'cifar-10-batches-py'
    origin = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    path = get_file(dirname, origin=origin, untar=True)

    num_train_samples = 50000

    x_train = np.empty((num_train_samples, 3, 32, 32), dtype='uint8')
    y_train = np.empty((num_train_samples, ), dtype='uint8')

    for i in range(1, 6):
        fpath = os.path.join(path, 'data_batch_' + str(i))
        (x_train[(i - 1) * 10000:i * 10000, :, :, :],
         y_train[(i - 1) * 10000:i * 10000]) = _load_batch(fpath)

    fpath = os.path.join(path, 'test_batch')
    x_test, y_test = _load_batch(fpath)

    y_train = np.reshape(y_train, (len(y_train), 1))
    y_test = np.reshape(y_test, (len(y_test), 1))

    # make channels last dimension
    x_train = x_train.transpose(0, 2, 3, 1)
    x_test = x_test.transpose(0, 2, 3, 1)

    return (x_train, y_train), (x_test, y_test)
def load_data(path="data/imdb.pkl", n_words=100000, maxlen=None, test_split=0.2, seed=113):
    path = get_file(path, origin="https://s3.amazonaws.com/text-datasets/imdb.pkl")

    if path.endswith(".gz"):
        f = gzip.open(path, 'rb')
    else:
        f = open(path, 'rb')

    X, labels = cPickle.load(f)
    random.seed(seed)
    random.shuffle(X)
    random.seed(seed)
    random.shuffle(labels)

    f.close()

    if maxlen:
        new_X = []
        new_labels = []
        for x, y in zip(X, labels):
            if len(x) < maxlen:
                new_X.append(x)
                new_labels.append(y)
        X = new_X
        labels = new_labels

    X = [[1 if w >= n_words else w for w in x] for x in X]
    X_train = X[:int(len(X)*(1-test_split))]
    y_train = labels[:int(len(X)*(1-test_split))]

    X_test = X[int(len(X)*(1-test_split)):]
    y_test = labels[int(len(X)*(1-test_split)):]

    return (X_train, y_train), (X_test, y_test)
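A usage sketch (added for illustration, not in the original): the reviews come back as variable-length lists of word indices, so a fixed-length representation is usually built by truncating long sequences and padding short ones before batching. Plain-Python padding keeps the sketch framework-agnostic.

(X_train, y_train), (X_test, y_test) = load_data(n_words=20000, maxlen=400)

def pad(seqs, length, pad_value=0):
    # truncate sequences longer than `length` and right-pad shorter ones with pad_value
    return [s[:length] + [pad_value] * max(0, length - len(s)) for s in seqs]

X_train = pad(X_train, 200)
X_test = pad(X_test, 200)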
Example #7
    def download_data(self, file_dir, download_dir):
        # Open file path
        imdb_root = os.path.join(file_dir, "aclImdb")

        if not os.path.isdir(imdb_root):
            logger.info("Downloading IMDB dataset")
            if download_dir is None:
                download_dir = os.path.dirname(os.path.normpath(file_dir))

            # ensure directories exist
            if not os.path.isdir(download_dir):
                mkdir_p(download_dir)
            if not os.path.isdir(file_dir):
                mkdir_p(file_dir)

            # download file
            downloaded_file_path = get_file(
                "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
                download_dir)
            # then extract it
            if not os.path.isdir(os.path.join(file_dir, 'aclImdb')):
                logger.info("Extracting IMDB dataset")
                tar = tarfile.open(downloaded_file_path, mode="r:gz")
                tar.extractall(path=file_dir)
                tar.close()

        # output data location
        return imdb_root
Example #8
def load_data():
    """Loads CIFAR10 dataset.

    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
    """
    dirname = 'cifar-10-batches-py'
    origin = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    path = get_file(dirname, origin=origin, untar=True)

    num_train_samples = 50000

    x_train = np.zeros((num_train_samples, 3, 32, 32), dtype='uint8')
    y_train = np.zeros((num_train_samples,), dtype='uint8')

    for i in range(1, 6):
        fpath = os.path.join(path, 'data_batch_' + str(i))
        data, labels = load_batch(fpath)
        x_train[(i - 1) * 10000: i * 10000, :, :, :] = data
        y_train[(i - 1) * 10000: i * 10000] = labels

    fpath = os.path.join(path, 'test_batch')
    x_test, y_test = load_batch(fpath)

    y_train = np.reshape(y_train, (len(y_train), 1))
    y_test = np.reshape(y_test, (len(y_test), 1))

    if K.image_data_format() == 'channels_last':
        x_train = x_train.transpose(0, 2, 3, 1)
        x_test = x_test.transpose(0, 2, 3, 1)

    return (x_train, y_train), (x_test, y_test)
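An illustrative follow-up (not part of the Keras source): the labels arrive as integer class ids of shape (N, 1), so one-hot encoding and pixel scaling are the usual preprocessing steps before fitting a classifier.

import numpy as np

(x_train, y_train), (x_test, y_test) = load_data()

num_classes = 10
x_train = x_train.astype(np.float32) / 255.0
x_test = x_test.astype(np.float32) / 255.0
y_train_onehot = np.eye(num_classes, dtype=np.float32)[y_train.ravel()]  # (50000, 10)
y_test_onehot = np.eye(num_classes, dtype=np.float32)[y_test.ravel()]    # (10000, 10)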
Example #9
def load_data(path="data/reuters.pkl",
              n_words=100000,
              maxlen=None,
              test_split=0.2,
              seed=113):
    path = get_file(
        path, origin="https://s3.amazonaws.com/text-datasets/reuters.pkl")

    if path.endswith(".gz"):
        f = gzip.open(path, 'rb')
    else:
        f = open(path, 'rb')

    X, labels = cPickle.load(f)
    f.close()

    if maxlen:
        new_X = []
        new_labels = []
        for x, y in zip(X, labels):
            if len(x) < maxlen:
                new_X.append(x)
                new_labels.append(y)
        X = new_X
        labels = new_labels

    X = [[1 if w >= n_words else w for w in x] for x in X]
    X_train = X[:int(len(X) * (1 - test_split))]
    y_train = labels[:int(len(X) * (1 - test_split))]

    X_test = X[int(len(X) * (1 - test_split)):]
    y_test = labels[int(len(X) * (1 - test_split)):]

    return (X_train, y_train), (X_test, y_test)
Example #10
def load_data(test_split=0.1, seed=113):
    dirname = "cifar-10-batches-py"
    origin = "http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    path = get_file(dirname, origin=origin, untar=True)

    nb_samples = 50000
    X = np.zeros((nb_samples, 3, 32, 32), dtype="uint8")
    y = np.zeros((nb_samples, ), dtype="uint8")
    for i in range(1, 6):
        fpath = path + '/data_batch_' + str(i)
        f = open(fpath, 'rb')
        d = cPickle.load(f)
        f.close()
        data = d["data"]
        labels = d["labels"]

        data = data.reshape(data.shape[0], 3, 32, 32)
        X[(i - 1) * 10000:i * 10000, :, :, :] = data
        y[(i - 1) * 10000:i * 10000] = labels

    np.random.seed(seed)
    np.random.shuffle(X)
    np.random.seed(seed)
    np.random.shuffle(y)

    y = np.reshape(y, (len(y), 1))

    X_train = X[:int(len(X) * (1 - test_split))]
    y_train = y[:int(len(X) * (1 - test_split))]

    X_test = X[int(len(X) * (1 - test_split)):]
    y_test = y[int(len(X) * (1 - test_split)):]

    return (X_train, y_train), (X_test, y_test)
Example #11
def load_data(file_path=None):
    ''' Function that takes in a path to the Google questions-words.txt
        word analogy file, opens it, removes topic tags and returns a list
        of the analogies

        @Arguments:
            file_path -- (optional) personal system file path to the
                questions-words.txt data set (or others of
                a similar structure)

                The Questions-Words Dataset is of the following format per row:
                    'WordA WordB WordC, WordD'

        @Return:
            A list of strings representing analogies
    '''
    word_analogies = list()

    # Open file path
    if not file_path:
        file_path = get_file(
            "https://word2vec.googlecode.com/svn/trunk/questions-words.txt")

    # Questions word file
    try:
        qw = open(file_path, 'r')
    except IOError, e:
        print "IO Error:", e.errno, file_path
Example #12
def load_data(path="reuters.pkl", nb_words=None, skip_top=0, maxlen=None, test_split=0.2, seed=113):
    path = get_file(path, origin="https://s3.amazonaws.com/text-datasets/reuters.pkl")
    f = open(path, 'rb')

    X, labels = cPickle.load(f)
    f.close()
    random.seed(seed)
    random.shuffle(X)
    random.seed(seed)
    random.shuffle(labels)

    if maxlen:
        new_X = []
        new_labels = []
        for x, y in zip(X, labels):
            if len(x) < maxlen:
                new_X.append(x)
                new_labels.append(y)
        X = new_X
        labels = new_labels

    if not nb_words:
        nb_words = max([max(x) for x in X])

    X = [[0 if (w >= nb_words or w < skip_top) else w for w in x] for x in X]
    X_train = X[:int(len(X)*(1-test_split))]
    y_train = labels[:int(len(X)*(1-test_split))]

    X_test = X[int(len(X)*(1-test_split)):]
    y_test = labels[int(len(X)*(1-test_split)):]

    return (X_train, y_train), (X_test, y_test)
Example #13
def load_data(file_path=None):
    ''' Function that takes in a path to the Google questions-words.txt
        word analogy file, opens it, removes topic tags and returns a list
        of the analogies

        @Arguments:
            file_path -- (optional) personal system file path to the
                questions-words.txt data set (or others of
                a similar structure)

                The Questions-Words Dataset is of the following format per row:
                    'WordA WordB WordC, WordD'

        @Return:
            A list of strings representing analogies
    '''
    word_analogies = list()

    # Open file path
    if not file_path:
        file_path = get_file("https://word2vec.googlecode.com/svn/trunk/questions-words.txt")

    # Questions word file
    try:
        qw = open(file_path, 'r')
    except IOError, e:
        print "IO Error:", e.errno, file_path
Example #14
def load_data(file_path='/data/amazon/reviews_Home_and_Kitchen.json.gz',
              amazon_url = "http://snap.stanford.edu/data/amazon/"
                           "productGraph/categoryFiles/"
                           "reviews_Home_and_Kitchen.json.gz"):
    ''' Function that takes in a path to the Stanford SNAP Amazon review
        data, opens it, and yields the review text and sentiment of each
        review

        @Arguments:
            file_path -- (optional) personal system file path to the
                SNAP Stanford data set (or others of a similar structure)

            amazon_url -- (optional) URI of the data set, in case it needs to be
                downloaded. Defaults to Home and Kitchen reviews
        @Return:
            A generator over (review text, sentiment) tuples, one per Amazon review
    '''

    # Open file path
    if not os.path.isfile(file_path):
        file_path = get_file(amazon_url, os.path.dirname(file_path))

    # Parse Amazon Reviews GZip file
    with gzip.open(file_path, 'r') as f:
        for l in f:
            try:
                review_text, sentiment = process_amazon_json(l)
                yield review_text.decode("latin1"), sentiment
            except BoringException as e:
                logger.info(e)
                continue
Example #15
    def download_data(self, file_dir, download_dir):
        # Open file path
        imdb_root = os.path.join(file_dir, "aclImdb")

        if not os.path.isdir(imdb_root):
            logger.info("Downloading IMDB dataset")
            if download_dir is None:
                download_dir = os.path.dirname(os.path.normpath(file_dir))

            # ensure directories exist
            if not os.path.isdir(download_dir):
                mkdir_p(download_dir)
            if not os.path.isdir(file_dir):
                mkdir_p(file_dir)

            # download file
            downloaded_file_path = get_file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
                                 download_dir)
            # then extract it
            if not os.path.isdir(os.path.join(file_dir, 'aclImdb')):
                logger.info("Extracting IMDB dataset")
                tar = tarfile.open(downloaded_file_path, mode="r:gz")
                tar.extractall(path=file_dir)
                tar.close()

        # output data location
        return imdb_root
Example #16
def load_data(test_split=0.1, seed=113):
    dirname = "cifar-10-batches-py"
    origin = "http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    path = get_file(dirname, origin=origin, untar=True)

    nb_samples = 50000
    X = np.zeros((nb_samples, 3, 32, 32), dtype="uint8")
    y = np.zeros((nb_samples,))
    for i in range(1, 6):
        fpath = path + '/data_batch_' + str(i)
        f = open(fpath, 'rb')
        d = cPickle.load(f)
        f.close()
        data = d["data"]
        labels = d["labels"]
        
        data = data.reshape(data.shape[0], 3, 32, 32)
        X[(i-1)*10000:i*10000, :, :, :] = data
        y[(i-1)*10000:i*10000] = labels

    np.random.seed(seed)
    np.random.shuffle(X)
    np.random.seed(seed)
    np.random.shuffle(y)

    y = np.reshape(y, (len(y), 1))

    X_train = X[:int(len(X)*(1-test_split))]
    y_train = y[:int(len(X)*(1-test_split))]

    X_test = X[int(len(X)*(1-test_split)):]
    y_test = y[int(len(X)*(1-test_split)):]

    return (X_train, y_train), (X_test, y_test)
Example #17
def load_data(file_dir="./.downloads", download_dir="./.downloads"):
    ''' Function that yields records from the IMDB reviews dataset

        @Arguments:
            file_dir -- personal system file path to the
                unzipped IMDB data set (so, a directory). If this does
                not exist, the archive will be downloaded and unzipped here
            download_dir -- what directory to download the actual archive to? Can be None,
                in which case it defaults to the parent directory of file_path.
                The archive will only be downloaded if necessary

        @Return:
            A generator over tuples of movie reviews and their sentiment
    '''
    # Open file path
    imdb_root = os.path.join(file_dir, "aclImdb")
    if not os.path.isdir(imdb_root):
        logger.info("Downloading IMDB dataset")
        if download_dir is None:
            download_dir = os.path.dirname(os.path.normpath(file_dir))
        downloaded_file_path = get_file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
                             download_dir)
        # then extract it 
        if not os.path.isdir(os.path.join(file_dir, 'aclImdb')):
            logger.info("Extracting IMDB dataset")
            tar = tarfile.open(downloaded_file_path, mode="r:gz")
            tar.extractall(path=file_dir)
            tar.close()

    imdb_train = os.path.join(imdb_root, "train")
    imdb_test = os.path.join(imdb_root, "test")
    imdb_train_pos = os.path.join(imdb_train, "pos")
    imdb_train_neg = os.path.join(imdb_train, "neg")
    imdb_test_pos = os.path.join(imdb_test, "pos")
    imdb_test_neg = os.path.join(imdb_test, "neg")
    
    # Specifies positive and negative files
    pos_train = os.listdir(imdb_train_pos)
    pos_train = [(os.path.join(imdb_train_pos, file_name), pos_label) for file_name in pos_train]
    pos_test = os.listdir(imdb_test_pos)
    pos_test = [(os.path.join(imdb_test_pos, file_name), pos_label) for file_name in pos_test]

    neg_train = os.listdir(imdb_train_neg)
    neg_train = [(os.path.join(imdb_train_neg, file_name), neg_label) for file_name in neg_train]
    neg_test = os.listdir(imdb_test_neg)
    neg_test = [(os.path.join(imdb_test_neg, file_name), neg_label) for file_name in neg_test]

    all_data = pos_train + pos_test + neg_train + neg_test

    # Combines data and shuffles it.
    random.shuffle(all_data)

    for (file_path, sentiment) in all_data:
        # Open the movie review
        f = open(file_path, 'r')
        yield (f.read().decode('utf-8'), sentiment)
        # Closes f on the following next() call by user
        f.close()
Example #18
def load_dose_response(min_logconc=-4.,
                       max_logconc=-4.,
                       subsample=None,
                       fraction=False):
    """Load cell line response to different drug compounds, sub-select response for a specific
        drug log concentration range and return a pandas dataframe.

    Parameters
    ----------
    min_logconc : -3, -4, -5, -6, -7, optional (default -4)
        min log concentration of drug to return cell line growth
    max_logconc : -3, -4, -5, -6, -7, optional (default -4)
        max log concentration of drug to return cell line growth
    subsample: None, 'naive_balancing' (default None)
        subsampling strategy to use to balance the data based on growth
    fraction: bool (default False)
        divide growth percentage by 100
    """

    path = get_file(P1B3_URL + 'NCI60_dose_response_with_missing_z5_avg.csv')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path,
                         sep=',',
                         engine='c',
                         na_values=['na', '-', ''],
                         dtype={
                             'NSC': object,
                             'CELLNAME': str,
                             'LOG_CONCENTRATION': np.float32,
                             'GROWTH': np.float32
                         })
        global_cache[path] = df

    df = df[(df['LOG_CONCENTRATION'] >= min_logconc)
            & (df['LOG_CONCENTRATION'] <= max_logconc)]

    df = df[['NSC', 'CELLNAME', 'GROWTH', 'LOG_CONCENTRATION']]

    if subsample and subsample == 'naive_balancing':
        df1 = df[df['GROWTH'] <= 0]
        df2 = df[(df['GROWTH'] > 0) & (df['GROWTH'] < 50)].sample(
            frac=0.7, random_state=SEED)
        df3 = df[(df['GROWTH'] >= 50) & (df['GROWTH'] <= 100)].sample(
            frac=0.18, random_state=SEED)
        df4 = df[df['GROWTH'] > 100].sample(frac=0.01, random_state=SEED)
        df = pd.concat([df1, df2, df3, df4])

    if fraction:
        df['GROWTH'] /= 100

    df = df.set_index(['NSC'])

    return df
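A hedged usage sketch for the dose-response loader above (it assumes the surrounding module also defines the SEED and P1B3_URL names referenced inside the function): fix a single log concentration, apply the naive balancing subsample, and express growth as a fraction.

df = load_dose_response(min_logconc=-5., max_logconc=-5.,
                        subsample='naive_balancing', fraction=True)
print(df.shape)
# mean growth per cell line at the chosen concentration
print(df.groupby('CELLNAME')['GROWTH'].mean().head())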
Example #19
    def download_data(self, file_path='/data/amazon/reviews_Home_and_Kitchen.json.gz',
                            amazon_url =   "http://snap.stanford.edu/data/amazon/"
                                           "productGraph/categoryFiles/"
                                           "reviews_Home_and_Kitchen.json.gz"):
        # download data if necessary
        filename_url = os.path.basename(amazon_url)
        dir_data = os.path.dirname(file_path)
        if not os.path.isfile(file_path):
            file_downloaded = get_file(amazon_url, dir_data)
            shutil.move(os.path.join(dir_data, filename_url), file_path)

        # return parent data directory
        return dir_data
Example #20
    def download_data(self, file_path):

        # download file
        if not os.path.isfile(file_path):

            # download and save file from internet
            logger.info("Downloading {}...".format(file_path))
            file_downloaded = get_file("http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip")

            # extract csv
            filename = 'training.1600000.processed.noemoticon.csv'
            file_dir = os.path.dirname(file_path)
            with ZipFile(file_downloaded, 'r') as zp:
                zp.extract(filename, path=file_dir)
                shutil.move(os.path.join(file_dir, filename), file_path)
Example #21
def load_data(file_path=None, download_path="./.downloads", dest_path="./.downloads"):
    """ Function that takes in a path to the IMDB movie review dataset
        word analogy file, opens it, removes topic tags and returns a list
        of the analogies

        @Arguments:
            file_path -- (optional) personal system file path to the
                IMDB data set in gzip form(or others of
                a similar structure)

        @Return:
            A generator over a tuples of Movie reviews and their sentiment
    """
    # Open file path
    if not file_path:
        print "Downloading IMDB dataset"
        file_path = get_file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", download_path)

    # If file has not been extracted, then extract it
    # to the downloads folder. This will save a lot of time
    if not os.path.isdir(os.path.join(dest_path, "aclImdb")):
        print ("Extracting IMDB dataset")
        tar = tarfile.open(file_path, mode="r:gz")
        tar.extractall(path=dest_path)
        tar.close()

    # Specifies positive and negative files
    pos_train = os.listdir("./.downloads/aclImdb/train/pos")
    pos_train = [(os.path.join("./.downloads/aclImdb/train/pos", file_name), pos_label) for file_name in pos_train]
    pos_test = os.listdir("./.downloads/aclImdb/test/pos")
    pos_test = [(os.path.join("./.downloads/aclImdb/test/pos", file_name), pos_label) for file_name in pos_test]

    neg_train = os.listdir("./.downloads/aclImdb/train/neg")
    neg_train = [(os.path.join("./.downloads/aclImdb/train/neg", file_name), neg_label) for file_name in neg_train]
    neg_test = os.listdir("./.downloads/aclImdb/test/neg")
    neg_test = [(os.path.join("./.downloads/aclImdb/test/neg", file_name), neg_label) for file_name in neg_test]

    all_data = pos_train + pos_test + neg_train + neg_test

    # Combines data and shuffles it.
    random.shuffle(all_data)

    for (file_path, sentiment) in all_data:
        # Open the movie review
        f = open(file_path, "r")
        yield (f.read(), sentiment)
        # Closes f on the following next() call by user
        f.close()
Example #22
def load_data(file_path=None):
    ''' Function that takes in a path to the IMDB movie review dataset
        archive, downloads and extracts it if necessary, and yields the
        movie reviews with their sentiment labels

        @Arguments:
            file_path -- (optional) personal system file path to the
                IMDB data set in gzip form (or others of
                a similar structure)

        @Return:
            A generator over tuples of movie reviews and their sentiment
    '''
    # Open file path
    if not file_path:
        print "Downloading IMDB dataset"
        file_path = get_file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz")

    # If file has not been extracted, then extract it 
    # to the downloads folder. This will save a lot of time
    if not os.path.isdir('./.downloads/aclImdb'):
        print("Extracting IMDB dataset")
        tar = tarfile.open(file_path, mode="r:gz")
        tar.extractall(path="./.downloads")
        tar.close()

    # Specifies positive and negative files
    pos_train = os.listdir('./.downloads/aclImdb/train/pos')
    pos_train = [(os.path.join('./.downloads/aclImdb/train/pos', file_name), 'pos') for file_name in pos_train]
    pos_test = os.listdir('./.downloads/aclImdb/test/pos')
    pos_test = [(os.path.join('./.downloads/aclImdb/test/pos', file_name), 'pos') for file_name in pos_test]

    neg_train = os.listdir('./.downloads/aclImdb/train/neg')
    neg_train = [(os.path.join('./.downloads/aclImdb/train/neg', file_name), 'neg') for file_name in neg_train]
    neg_test = os.listdir('./.downloads/aclImdb/test/neg')
    neg_test = [(os.path.join('./.downloads/aclImdb/test/neg', file_name), 'neg') for file_name in neg_test]

    all_data = pos_train + pos_test + neg_train + neg_test

    # Combines data and shuffles it.
    random.shuffle(all_data)

    for (file_path, sentiment) in all_data:
        # Open the movie review
        f = open(file_path, 'r')
        yield (f.read(), sentiment)
        # Closes f on the following next() call by user
        f.close()
Example #23
def load_data(path="mnist.pkl.gz"):
    path = get_file(path, origin="https://s3.amazonaws.com/img-datasets/mnist.pkl.gz")

    if path.endswith(".gz"):
        f = gzip.open(path, 'rb')
    else:
        f = open(path, 'rb')

    if sys.version_info < (3,):
        data = cPickle.load(f)
    else:
        data = cPickle.load(f, encoding="bytes")

    f.close()

    return data  # (X_train, y_train), (X_test, y_test)
Example #24
def get_word_index(path='imdb_word_index.json'):
    """Retrieves the dictionary mapping word indices back to words.

  Arguments:
      path: where to cache the data (relative to `~/.keras/dataset`).

  Returns:
      The word index dictionary.
  """
    path = get_file(
        path,
        origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json')
    f = open(path)
    data = json.load(f)
    f.close()
    return data
Example #25
def load_data(path="mnist.pkl.gz"):
    path = get_file(
        path, origin="https://s3.amazonaws.com/img-datasets/mnist.pkl.gz")

    if path.endswith(".gz"):
        f = gzip.open(path, 'rb')
    else:
        f = open(path, 'rb')

    if sys.version_info < (3, ):
        data = cPickle.load(f)
    else:
        data = cPickle.load(f, encoding="bytes")

    f.close()
    return data  # (X_train, y_train), (X_test, y_test)
Example #26
def _load_mnist(path='mnist.npz'):
    path = get_file(path,
                    origin='https://s3.amazonaws.com/img-datasets/mnist.npz',
                    file_hash='8a61469f7ea1b51cbae51d4f78837e45')
    f = np.load(path)
    x_train, y_train = f['x_train'], f['y_train']
    x_test, y_test = f['x_test'], f['y_test']
    f.close()

    # prevent compatibility issues
    x_train = np.expand_dims(x_train, axis=-1)
    y_train = np.expand_dims(y_train, axis=-1)
    x_test = np.expand_dims(x_test, axis=-1)
    y_test = np.expand_dims(y_test, axis=-1)

    return (x_train, y_train), (x_test, y_test)
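A short usage sketch (illustrative only): flattening and rescaling the MNIST digits returned above, e.g. to feed a dense autoencoder.

import numpy as np

(x_train, y_train), (x_test, y_test) = _load_mnist()
x_train = x_train.reshape(len(x_train), -1).astype(np.float32) / 255.0  # (60000, 784)
x_test = x_test.reshape(len(x_test), -1).astype(np.float32) / 255.0     # (10000, 784)
print(x_train.shape, x_test.shape)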
Example #27
def download_data(file_path):

    url_weibo = "http://weiboscope.jmsc.hku.hk/datazip/week{}.zip"

    if not os.path.exists(file_path) or not check_for_csvs(file_path):
        # download repository files and unzip them
        try:
            os.makedirs(file_path)
        except OSError as e:
            logger.debug(e)
            if not os.path.isdir(file_path):
                raise
        for remote_path in [ url_weibo.format(a) for a in [ str(b) for b in range(1, 52) ] ]:
            local_zip = get_file(remote_path, file_path)
            with ZipFile(local_zip) as zf:
                zf.extractall(file_path)
Example #28
    def download_data(self, file_path):

        # download file
        if not os.path.isfile(file_path):

            # download and save file from internet
            logger.info("Downloading {}...".format(file_path))
            file_downloaded = get_file(
                "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip"
            )

            # extract csv
            filename = 'training.1600000.processed.noemoticon.csv'
            file_dir = os.path.dirname(file_path)
            with ZipFile(file_downloaded, 'r') as zp:
                zp.extract(filename, path=file_dir)
                shutil.move(os.path.join(file_dir, filename), file_path)
Example #29
def load_data():
    """Loads the MNIST dataset.

    # Arguments
        path: path where to cache the dataset locally
            (relative to ~/.evolutionary-learning/datasets).

    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
    """
    path = get_file('mnist.npz',
                    origin='https://s3.amazonaws.com/img-datasets/mnist.npz',
                    file_hash='8a61469f7ea1b51cbae51d4f78837e45')
    f = np.load(path)
    x_train, y_train = f['x_train'], f['y_train']
    x_test, y_test = f['x_test'], f['y_test']
    f.close()
    return (x_train, y_train), (x_test, y_test)
Example #30
def download_data(file_path):

    url_weibo = "http://weiboscope.jmsc.hku.hk/datazip/week{}.zip"

    if not os.path.exists(file_path) or not check_for_csvs(file_path):
        # download repository files and unzip them
        try:
            os.makedirs(file_path)
        except OSError as e:
            logger.debug(e)
            if not os.path.isdir(file_path):
                raise
        for remote_path in [
                url_weibo.format(a) for a in [str(b) for b in range(1, 52)]
        ]:
            local_zip = get_file(remote_path, file_path)
            with ZipFile(local_zip) as zf:
                zf.extractall(file_path)
Example #31
def load_data(
        path='/home/inorganic-bandstructure/band-inversion/band_inv-01.jpg'):
    """Loads the MNIST dataset.
    # Arguments
        path: path where to cache the dataset locally
            (relative to ~/.keras/datasets).
    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
    """
    path = get_file(
        path,
        '/home/inorganic-bandstructure/band-inversion/band_inv-01.jpg',
        file_hash='adh340')
    f = np.load(path)
    x_train, y_train = f['x_train'], f['y_train']
    x_test, y_test = f['x_test'], f['y_test']
    f.close()
    return (x_train, y_train), (x_test, y_test)
Example #32
def load_drug_descriptors(ncols=None, scaling='std', add_prefix=True):
    """Load drug descriptor data, sub-select columns of drugs descriptors
        randomly if specificed, impute and scale the selected data, and return a
        pandas dataframe.

    Parameters
    ----------
    ncols : int or None
        number of columns (drugs descriptors) to randomly subselect (default None : use all data)
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    add_prefix: True or False
        add feature namespace prefix
    """

    path = get_file(P1B3_URL + 'descriptors.2D-NSC.5dose.filtered.txt')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path,
                         sep='\t',
                         engine='c',
                         na_values=['na', '-', ''],
                         dtype=np.float32)
        global_cache[path] = df

    df1 = pd.DataFrame(df.loc[:, 'NAME'].astype(int).astype(str))
    df1.rename(columns={'NAME': 'NSC'}, inplace=True)

    df2 = df.drop('NAME', 1)
    if add_prefix:
        df2 = df2.add_prefix('dragon7.')

    total = df2.shape[1]
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        df2 = df2.iloc[:, usecols]

    df2 = impute_and_scale(df2, scaling)
    df2 = df2.astype(np.float32)

    df_dg = pd.concat([df1, df2], axis=1)

    return df_dg
Example #33
def get_list_of_data_files(GP):

    import pilot2_datasets as p2
    reload(p2)
    print ('Reading Data...')
    ## Identify the data set selected
    data_set=p2.data_sets[GP['set_sel']][0]
    ## Get the MD5 hash for the proper data set
    data_hash=p2.data_sets[GP['set_sel']][1]
    print ('Reading Data Files... %s->%s' % (GP['set_sel'], data_set))
    ## Check if the data files are in the data director, otherwise fetch from FTP
    data_file = get_file(data_set, origin='http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot2/'+data_set+'.tar.gz', untar=True, md5_hash=data_hash)
    data_dir = os.path.join(os.path.dirname(data_file), data_set)
    ## Make a list of all of the data files in the data set
    data_files=glob.glob('%s/*.npz'%data_dir)

    fields = p2.gen_data_set_dict()

    return (data_files, fields)
Example #34
def _load_freyface(path='frey_rawface.mat'):
    img_dims = [28, 20]
    train_size = 1685  # ???
    path = get_file(path,
                    origin='https://cs.nyu.edu/~roweis/data/frey_rawface.mat')
    f = loadmat(path)
    x_train = f['ff'][:, :train_size]
    x_test = f['ff'][:, train_size:]

    # reformat data to match expected format
    x_train = x_train.transpose()
    x_train = np.reshape(x_train, tuple([train_size] + img_dims), order='C')
    x_train = np.expand_dims(x_train, axis=-1)
    x_test = x_test.transpose()
    x_test = np.reshape(x_test, tuple([x_test.shape[0]] + img_dims), order='C')
    x_test = np.expand_dims(x_test, axis=-1)

    return (x_train, np.zeros(shape=(train_size, 1), dtype=np.uint8)), \
           (x_test, np.zeros(shape=(x_test.shape[0], 1), dtype=np.uint8))
Example #35
def load_cell_expression_5platform(ncols=None, scaling='std', add_prefix=True):
    """Load 5-platform averaged cell line expression data, sub-select
        columns of gene expression randomly if specificed, scale the
        selected data and return a pandas dataframe.

    Parameters
    ----------
    ncols : int or None
        number of columns (gene expression) to randomly subselect (default None : use all data)
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    add_prefix: True or False
        add feature namespace prefix
    """

    path = get_file(
        P1B3_URL +
        'RNA_5_Platform_Gene_Transcript_Averaged_intensities.transposed.txt')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, sep='\t', engine='c', na_values=['na', '-', ''])
        global_cache[path] = df

    df1 = df['CellLine']
    df1 = df1.map(lambda x: x.replace('.', ':'))
    df1.name = 'CELLNAME'

    df2 = df.drop('CellLine', 1)
    if add_prefix:
        df2 = df2.add_prefix('expr_5p.')

    total = df2.shape[1]
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        df2 = df2.iloc[:, usecols]

    df2 = impute_and_scale(df2, scaling)
    df2 = df2.astype(np.float32)
    df = pd.concat([df1, df2], axis=1)

    return df
Example #36
def load_smiles(verbose=False):
    """ (ap) Load SMILES data (Simplified Molecular-Input Line-Entry System).
    Args:
    Returns:
    """
    path = get_file(P1B3_URL + 'ChemStructures_Consistent.smiles')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, sep='\t', engine='c',
                         dtype=str)  # (ap) update this command
        global_cache[path] = df

    # TODO maybe do some processing (data augmentation; check if strings are valid)
    df_smiles = df

    if verbose:
        print('SMILES shape {}'.format(df_smiles.shape))
        print('SMILES columns {}'.format(df.columns))

    return df_smiles
Example #37
def load_data(file_path=None, verbose=False):
    ''' Function that takes in a path to the Stanford SNAP Amazon review
        data, opens it, and yields a dictionary of information for each
        review

        @Arguments:
            file_path -- (optional) personal system file path to the
                SNAP Stanford data set (or others of a similar structure)

        @Return:
            A generator over dictionaries, one per Amazon review
    '''
    # Open file path
    if not file_path:
        file_path = get_file("https://snap.stanford.edu/data/amazon/all.txt.gz")

    # Parse Amazon Reviews GZip file -- taken from Stanford SNAP page
    try:
        f = gzip.open(file_path, 'r')
    except IOError, e:
        print "IO Error", e.code, file_path
Example #38
def _load_cifar_100(label_mode='fine'):
    if label_mode not in ['fine', 'coarse']:
        raise ValueError('`label_mode` must be one of `"fine"`, `"coarse"`.')

    dirname = 'cifar-100-python'
    origin = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
    path = get_file(dirname, origin=origin, untar=True)

    fpath = os.path.join(path, 'train')
    x_train, y_train = _load_batch(fpath, label_key=label_mode + '_labels')

    fpath = os.path.join(path, 'test')
    x_test, y_test = _load_batch(fpath, label_key=label_mode + '_labels')

    y_train = np.reshape(y_train, (len(y_train), 1))
    y_test = np.reshape(y_test, (len(y_test), 1))

    # make channels last dimension
    x_train = x_train.transpose(0, 2, 3, 1)
    x_test = x_test.transpose(0, 2, 3, 1)

    return (x_train, y_train), (x_test, y_test)
Example #39
def load_data(file_path=None, verbose=False):
    ''' Function that takes in a path to the Stanford SNAP Amazon review
        data, opens it, and yields a dictionary of information for each
        review

        @Arguments:
            file_path -- (optional) personal system file path to the
                SNAP Stanford data set (or others of a similar structure)

        @Return:
            A generator over dictionaries, one per Amazon review
    '''
    # Open file path
    if not file_path:
        file_path = get_file(
            "https://snap.stanford.edu/data/amazon/all.txt.gz")

    # Parse Amazon Reviews GZip file -- taken from Stanford SNAP page
    try:
        f = gzip.open(file_path, 'r')
    except IOError, e:
        print "IO Error", e.code, file_path
Example #40
def load_cell_expression_u133p2(ncols=None, scaling='std', add_prefix=True):
    """Load U133_Plus2 cell line expression data prepared by Judith,
        sub-select columns of gene expression randomly if specificed,
        scale the selected data and return a pandas dataframe.

    Parameters
    ----------
    ncols : int or None
        number of columns (gene expression) to randomly subselect (default None : use all data)
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    add_prefix: True or False
        add feature namespace prefix
    """
    path = get_file(
        'http://bioseed.mcs.anl.gov/~fangfang/p1h/GSE32474_U133Plus2_GCRMA_gene_median.txt'
    )

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, sep='\t', engine='c')
        global_cache[path] = df

    df1 = df['CELLNAME']
    df2 = df.drop('CELLNAME', 1)
    if add_prefix:
        df2 = df2.add_prefix('expr.')

    total = df2.shape[1]
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        df2 = df2.iloc[:, usecols]

    df2 = impute_and_scale(df2, scaling)
    df2 = df2.astype(np.float32)
    df = pd.concat([df1, df2], axis=1)

    return df
Example #41
def load_drug_autoencoded_AG(ncols=None, scaling='std', add_prefix=True):
    """Load drug latent representation from Aspuru-Guzik's variational
    autoencoder, sub-select columns of drugs randomly if specificed,
    impute and scale the selected data, and return a pandas dataframe

    Parameters
    ----------
    ncols : int or None
        number of columns (drug latent representations) to randomly subselect (default None : use all data)
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    add_prefix: True or False
        add feature namespace prefix
    """
    path = get_file(P1B3_URL +
                    'Aspuru-Guzik_NSC_latent_representation_292D.csv')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, engine='c', dtype=np.float32)
        global_cache[path] = df

    df1 = pd.DataFrame(df.loc[:, 'NSC'].astype(int).astype(str))
    df2 = df.drop('NSC', 1)
    if add_prefix:
        df2 = df2.add_prefix('smiles_latent_AG.')

    total = df2.shape[1]
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        df2 = df2.iloc[:, usecols]

    df2 = impute_and_scale(df2, scaling)
    df2 = df2.astype(np.float32)

    df = pd.concat([df1, df2], axis=1)

    return df
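A usage sketch (assumption: load_dose_response from earlier in this collection lives in the same module): the latent-representation frame keeps 'NSC' as an ordinary column, so it can be joined against the dose-response table on that key.

df_latent = load_drug_autoencoded_AG(ncols=64, scaling='std')
df_response = load_dose_response(fraction=True).reset_index()

# one row per (drug, cell line) measurement, with the drug's latent features attached
df_joined = df_response.merge(df_latent, on='NSC', how='inner')
print(df_joined.shape)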
Example #42
def load_data(file_path, which_set='train', form='pinyin', train_pct=1.0, nr_records=None, rng_seed=None, min_length=None, max_length=None, pad_out=False):
    """
    Load data from Open Weiboscope corpus of Sina Weibo posts. Options are available for encoding
    of returned text data. 

    @Arguments:
        file_path -- path to downloaded, unzipped Open Weiboscope
            data (a directory). If this path does not exist or is not given, load_data
            will create the path and download the data (string)
        which_set -- whether to iterate over train or testing set. You should
            also set train_pct and rng_seed to non-default values if you specify this
            (string)
        form -- return results in hanzi, pinyin romanization?
            can take values of 'hanzi', 'pinyin' (string)
        train_pct -- what percent of dataset should go to training (remainder goes to test)?  (float)
        nr_records -- if not None, gives the maximum number of records this generator should yield.
            will yield fewer records if the corpus exhausted before nr_records records are yielded
        rng_seed -- value for seeding random number generator
        min_length -- enforce a minimum length, in characters, for the 
            dataset? Counted in hanzi for form='hanzi' and in roman characters 
            for form='pinyin'. Texts that are too short will be excluded. (int)
        max_length -- enforce a maximum length, in characters, for the dataset?
            Counted in hanzi or roman characters as appropriate (see above).
            Texts that are too long will be truncated at the end. (int)
        pad_out -- for texts shorter than max_length, should they be padded out
            at the end with spaces?

    @Return:
        a generator over tuples of review text (unicode or numpy array) and whether or not
        the tweet was deleted (bool)

    """

    if not os.path.exists(file_path):
        # download repository files and unzip them
        os.makedirs(file_path)
        for remote_path in [ "http://weiboscope.jmsc.hku.hk/datazip/week{}.zip".format(a) for a in [ str(b) for b in range(1, 52) ] ]:
            local_zip = get_file(remote_path, file_path)
            with ZipFile(local_zip) as zf:
                zf.extractall(file_path)

    # get list of weekNN.csv files at file_path
    ow_files = [ os.path.join(file_path, f) for f in os.listdir(file_path) if re.match(r"week[0-9]{,2}\.csv", f) is not None ]
    assert ow_files != []
    
    # strategy: randomize order of weeks (individual files), sample in order from each week.
    try:
        random.seed(rng_seed)
    except:
        pass
    random.shuffle(ow_files)
    split_on = int(len(ow_files) * train_pct)
    data_sets = {}
    logger.debug("Shuffle order: {}, split on {}".format(ow_files, split_on))
    data_sets['train'], data_sets['test'] = ow_files[:split_on], ow_files[split_on:]
    logger.debug(data_sets)
    nr_yielded = 0
    for table_path in data_sets[which_set]:
        with codecs.open(table_path, "r", encoding="utf-8") as f:
            logging.debug("In file {}".format(table_path))
            for line in unicode_csv_reader(f):
                try:
                    records_split = line
                    post_id = records_split[0]
                    
                    if len(records_split) != 11:
                        raise BadRecordException("Comma split error on mid={} in "
                                         "file {} (len of record: {})".format(
                                            post_id, 
                                            os.path.basename(table_path),
                                            len(records_split)))
            
                    # different fields of post record 
                    post_text = records_split[6]
                    post_retweeted = records_split[1] != ''
                    post_deleted = records_split[9] != ''
                   
                    if not post_retweeted:
                        if form=='hanzi':
                            record_txt, sentiment = enforce_length(
                                post_text, min_length, max_length, 
                                pad_out), post_deleted
                            yield record_txt, sentiment
                        elif form=='pinyin':
                            record_txt, sentiment = enforce_length(
                                romanize_tweet(post_text), min_length, 
                                max_length, pad_out), post_deleted
                            yield record_txt, sentiment
                        else:
                            raise Exception("Unknown form '{}' (should be 'hanzi' "
                                            "or 'pinyin')".format(form))
                        # limit number of records retrieved?
                        nr_yielded += 1
                        if nr_records is not None and nr_yielded >= nr_records:
                            return
                # log various exception cases from loop body
                except TextTooShortException:
                    logger.info("Record {} thrown out (too short)".format(post_id))
                except BadRecordException as e:
                    logger.info(e)
                except IndexError as e:
                    logger.info(e)
                except UnicodeEncodeError as e:
                    logger.info(e)

                except GeneratorExit:
                    return
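A hedged usage sketch for the Weiboscope generator above (it assumes the helpers it references, such as romanize_tweet and enforce_length, are importable alongside it; '/data/open_weiboscope' is a hypothetical local path): pull a small pinyin-form sample from the training split.

gen = load_data('/data/open_weiboscope', which_set='train', form='pinyin',
                train_pct=0.8, nr_records=5, rng_seed=42,
                min_length=10, max_length=200, pad_out=True)

for text, was_deleted in gen:
    print(was_deleted, text[:60])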
Example #43
def get_p1_file(link):
    fname = os.path.basename(link)
    return get_file(fname, origin=link, cache_subdir='Pilot1')
Example #44
        @Return:
            A list of tuples of the following format:
                (tweets/features, sentiment label)
    '''
    tweet_to_sentiment = list()

    # Open file path
    if file_path:
        try:
            twitter_csv = open(file_path, 'r')
        except IOError, e:
            print "IO Error:", e.code, file_path
    else:
        # Dowloads and saves locally the zip file from internet
        file_path = get_file("http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip")

        with ZipFile(file_path, 'r') as zp:
            twitter_csv = zp.open('training.1600000.processed.noemoticon.csv')

    # Perform parsing of CSV file
    reader = latin_csv_reader(twitter_csv, delimiter=',')
    for i, tweet in enumerate(reader):
        # Prints progress every 10000 words read
        if verbose and i % 10000 == 0:
            logging.info("PROGRESS: at tweet #%s", i)

        # Gets tweets string from line in csv
        tweet_string = tweet[5]
        # Gets feature from Sentiment dictionary
        sent = Sentiment[int(tweet[0])]
Example #45
    ## Import keras modules
    from keras.optimizers import SGD, RMSprop, Adam
    from keras.datasets import mnist
    from keras.callbacks import LearningRateScheduler, ModelCheckpoint
    from keras import callbacks
    from keras.layers.advanced_activations import ELU
    from keras.preprocessing.image import ImageDataGenerator

    batch_size = GP['batch_size']
    ##### Read Data ########
    print('Reading Data...')
    datagen = hf.ImageNoiseDataGenerator(corruption_level=GP['noise_factor'])
    data_set = p2.data_sets[opts.set_sel][0]
    data_hash = p2.data_sets[opts.set_sel][1]
    print('Reading Data Files... %s->%s' % (opts.set_sel, data_set))
    data_file = get_file(data_set, origin='http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot2/'+data_set+'.tar.gz', untar=True, md5_hash=data_hash)
    data_dir = os.path.join(os.path.dirname(data_file), data_set)
    data_files = glob.glob('%s/*.npy' % data_dir)

    X = np.load(data_files[0])
    data = hf.get_data(X, case=opts.case)
    X_train, y_train = hf.create_dataset(data, GP['look_back'], look_forward=GP['look_forward'])  ## convert data to a sequence
    temporal_dim = X_train.shape[1]
    input_dim = X_train.shape[2]

    print('X_train type and shape:', X_train.dtype, X_train.shape)
    print('X_train.min():', X_train.min())
    print('X_train.max():', X_train.max())

    ### Define Model, Solver and Compile ##########
    print('Define the model and compile')
Example #46
def do_10_fold():
    shared_nnet_spec= [ 1200 ]
    individual_nnet_spec0= [ 1200, 1200 ]
    individual_nnet_spec1= [ 1200, 1200 ]
    individual_nnet_spec2= [ 1200, 1200 ]
    individual_nnet_spec = [ individual_nnet_spec0, individual_nnet_spec1, individual_nnet_spec2 ]

    learning_rate = 0.01
    batch_size = 10
    n_epochs = 10
    dropout = 0.0


    truth0 = []
    pred0 = []

    truth1 = []
    pred1 = []

    truth2 = []
    pred2 = []


   ## Read files
    file_path = os.path.dirname(os.path.realpath(__file__))
    print file_path
    lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
    sys.path.append(lib_path)

    from data_utils import get_file
    origin = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P3B1/P3B1_data.tgz'
    data_loc = get_file('P3B1_data.tgz', origin, untar=True, md5_hash=None, cache_subdir='P3B1')

    print 'Data downloaded and stored at: ' + data_loc
    data_path = os.path.dirname(data_loc)
    print data_path

    for fold in range( 1 ):

        feature_train_0 = np.genfromtxt( data_path + '/task0_' + str( fold ) + '_train_feature.csv', delimiter= ',' )
        truth_train_0 = np.genfromtxt( data_path + '/task0_' + str( fold ) + '_train_label.csv', delimiter= ',' )
        feature_test_0 = np.genfromtxt( data_path + '/task0_' + str( fold ) + '_test_feature.csv', delimiter= ',' )
        truth_test_0 = np.genfromtxt( data_path + '/task0_' + str( fold ) + '_test_label.csv', delimiter= ',' )

        feature_train_1 = np.genfromtxt( data_path + '/task1_' + str( fold ) + '_train_feature.csv', delimiter= ',' )
        truth_train_1 = np.genfromtxt( data_path + '/task1_' + str( fold ) + '_train_label.csv', delimiter= ',' )
        feature_test_1 = np.genfromtxt( data_path + '/task1_' + str( fold ) + '_test_feature.csv', delimiter= ',' )
        truth_test_1 = np.genfromtxt( data_path + '/task1_' + str( fold ) + '_test_label.csv', delimiter= ',' )

        feature_train_2 = np.genfromtxt( data_path + '/task2_' + str( fold ) + '_train_feature.csv', delimiter= ',' )
        truth_train_2 = np.genfromtxt( data_path + '/task2_' + str( fold ) + '_train_label.csv', delimiter= ',' )
        feature_test_2 = np.genfromtxt( data_path + '/task2_' + str( fold ) + '_test_feature.csv', delimiter= ',' )
        truth_test_2 = np.genfromtxt( data_path + '/task2_' + str( fold ) + '_test_label.csv', delimiter= ',' )

        features_train = [ feature_train_0, feature_train_1, feature_train_2 ]
        truths_train = [ truth_train_0, truth_train_1, truth_train_2 ]
        features_test = [ feature_test_0, feature_test_1, feature_test_2 ]
        truths_test = [ truth_test_0, truth_test_1, truth_test_2 ]


        ret = run_mtl(
            features_train= features_train,
            truths_train= truths_train,
            features_test= features_test,
            truths_test= truths_test,
            shared_nnet_spec= shared_nnet_spec,
            individual_nnet_spec= individual_nnet_spec,
            learning_rate= learning_rate,
            batch_size= batch_size,
            n_epochs= n_epochs,
            dropout= dropout
        )

        truth0.extend( ret[ 0 ][ 0 ] )
        pred0.extend( ret[ 0 ][ 1 ] )

        truth1.extend( ret[ 1 ][ 0 ] )
        pred1.extend( ret[ 1 ][ 1 ] )

        truth2.extend( ret[ 2 ][ 0 ] )
        pred2.extend( ret[ 2 ][ 1 ] )


    print 'Task 1: Primary site - Macro F1 score', f1_score( truth0, pred0, average= 'macro' )
    print 'Task 1: Primary site - Micro F1 score', f1_score( truth0, pred0, average= 'micro' )

    print 'Task 2: Tumor laterality - Macro F1 score', f1_score( truth1, pred1, average= 'macro' )
    print 'Task 2: Tumor laterality - Micro F1 score', f1_score( truth1, pred1, average= 'micro' )

    print 'Task 3: Histological grade - Macro F1 score', f1_score( truth2, pred2, average= 'macro' )
    print 'Task 3: Histological grade - Micro F1 score', f1_score( truth2, pred2, average= 'micro' )
Example #47
def get_word_index(path="reuters_word_index.pkl"):
    path = get_file(path, origin="https://s3.amazonaws.com/text-datasets/reuters_word_index.pkl")
    f = open(path, 'rb')
    return cPickle.load(f)