def test_fetch_rcv1():
    try:
        data1 = fetch_rcv1(shuffle=False, download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("Download RCV1 dataset to run this test.")
        raise

    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert_true(sp.issparse(X1))
    assert_true(sp.issparse(Y1))
    assert_equal(60915113, X1.data.size)
    assert_equal(2606875, Y1.data.size)

    # test shapes
    assert_equal((804414, 47236), X1.shape)
    assert_equal((804414, 103), Y1.shape)
    assert_equal((804414,), s1.shape)
    assert_equal(103, len(cat_list))

    # test ordering of categories
    first_categories = [u'C11', u'C12', u'C13', u'C14', u'C15', u'C151']
    assert_array_equal(first_categories, cat_list[:6])

    # test number of samples for some categories
    some_categories = ('GMIL', 'E143', 'CCAT')
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert_equal(num, Y1[:, j].data.size)

    # test shuffling and subset
    data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77,
                       download_if_missing=False)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # test return_X_y option
    fetch_func = partial(fetch_rcv1, shuffle=False, subset='train',
                         download_if_missing=False)
    check_return_X_y(data2, fetch_func)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
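For reference, the return_X_y option exercised above can also be used directly; a usage sketch, assuming a scikit-learn version where fetch_rcv1 supports return_X_y (0.20+):

from sklearn.datasets import fetch_rcv1

# returns the (data, target) pair instead of a Bunch
X, y = fetch_rcv1(subset='train', return_X_y=True)
print(X.shape, y.shape)  # (23149, 47236) (23149, 103)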
Example #2
def test_fetch_rcv1():
    try:
        data1 = fetch_rcv1(shuffle=False, download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("Download RCV1 dataset to run this test.")
        raise

    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert_true(sp.issparse(X1))
    assert_true(sp.issparse(Y1))
    assert_equal(60915113, X1.data.size)
    assert_equal(2606875, Y1.data.size)

    # test shapes
    assert_equal((804414, 47236), X1.shape)
    assert_equal((804414, 103), Y1.shape)
    assert_equal((804414,), s1.shape)
    assert_equal(103, len(cat_list))

    # test number of samples for some categories
    some_categories = ('GMIL', 'E143', 'CCAT')
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert_equal(num, Y1[:, j].data.size)

    # test shuffling and subset
    data2 = fetch_rcv1(shuffle=True,
                       subset='train',
                       random_state=77,
                       download_if_missing=False)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
Example #3
    def run(self, step_limit):
        self.train()

        rcv1 = fetch_rcv1(subset='train')
        train_data = rcv1.data
        train_label = rcv1.target

        rcv1 = fetch_rcv1(subset='test', random_state=1)
        test_data = rcv1.data
        test_label = rcv1.target

        with tf.Session() as sess:
            tf.global_variables_initializer().run()

            path = "LSTM/" + str(step_limit) + "rcv1"
            saver = NNutils.save(path, sess)
            writer, writer_test, merged = NNutils.graph(path, sess)

            step = sess.run(self.global_step)
            while step < step_limit:
                print("step :", step)

                for start, end in zip(
                        range(0, train_data.shape[0], self.batch_size),
                        range(self.batch_size, train_data.shape[0],
                              self.batch_size)):
                    data = scipy.sparse.coo_matrix(train_data[start:end])
                    label = scipy.sparse.coo_matrix(train_label[start:end])
                    indices = np.array([data.row, data.col]).T
                    # the label matrix needs its own indices; reusing the
                    # feature indices would mis-place the label values
                    label_indices = np.array([label.row, label.col]).T

                    summary, _, loss, step = sess.run(
                        [merged, self.training, self.cost, self.global_step],
                        feed_dict={self.x: (indices, data.data, data.shape),
                                   self.y: (label_indices, label.data, label.shape),
                                   self.dropout_conv: 1.0,
                                   self.dropout_normal: 1.0})

                    if step % 50 == 0:
                        writer.add_summary(summary, step)
                        print(step, datetime.now(), loss)

                        # the placeholders are sparse, so the test batch must
                        # also be fed as (indices, values, shape) tuples
                        test_x = scipy.sparse.coo_matrix(test_data[0:1000])
                        test_y = scipy.sparse.coo_matrix(test_label[0:1000])

                        summary, loss, accuracy = sess.run(
                            [merged, self.cost, self.accuracy],
                            feed_dict={self.x: (np.array([test_x.row, test_x.col]).T, test_x.data, test_x.shape),
                                       self.y: (np.array([test_y.row, test_y.col]).T, test_y.data, test_y.shape),
                                       self.dropout_conv: 1.0,
                                       self.dropout_normal: 1.0})

                        writer_test.add_summary(summary, step)

                        print("test results : ", accuracy, loss)
Example #4
def test_fetch_rcv1_true_and_test():
    # force the download so the second fetch can rely on the local cache
    hold = fetch_rcv1(download_if_missing=True, shuffle=True)

    data3 = fetch_rcv1(download_if_missing=False, shuffle=True, subset="test")
    X3, Y3 = data3.data, data3.target
    catlist2, s3 = data3.target_names.tolist(), data3.sample_id

    assert_equal((781265, 47236), X3.shape)
    assert_equal((781265, 103), Y3.shape)

    first_categories = [u'C11', u'C12', u'C13', u'C14', u'C15', u'C151']
    assert_array_equal(first_categories, catlist2[:6])
Example #5
def _download_rcv1():
    """
    Download the RCV1 dataset via scikit-learn.
    :return: The train and test sets.
    """
    from sklearn.datasets import fetch_rcv1
    print "downloading rcv1 train data...."
    newsgroups_train = fetch_rcv1(subset='train')
    print "downloading rcv1 test data...."
    newsgroups_test = fetch_rcv1(subset='test')
    train_set = (newsgroups_train.data, newsgroups_train.target)
    test_set = (newsgroups_test.data, newsgroups_test.target)

    return train_set, test_set
Example #7
 def __init__(self, dataset=None):
     self.dataset = dataset if dataset is not None else fetch_rcv1()
     self.parent_hierarchy, self.children_hierarchy = self.__get_rcv1_hierarchy()
     # CSC format speeds up the single-label (column) indexing used later
     self.csc_target = csc_matrix(self.dataset.target)
Example #8
def train():
    convertdict()
    print 'Start training a multiclass Naive Bayes classifier... ' + str(
        trainingnum) + ' training samples are used.'
    rcv1 = fetch_rcv1(data_home=Paths.rcv1DataHome, random_state=1)

    X_train = rcv1.data[:trainingnum]
    Y_train = rcv1.target[:trainingnum]

    X_test = rcv1.data[trainingnum:]
    Y_test = rcv1.target[trainingnum:]
    multiClassClf = OneVsRestClassifier(MultinomialNB()).fit(X_train, Y_train)

    joblib.dump(multiClassClf, Paths.pklDataPath + 'NBClassifier.pkl')
    predictionTrain = multiClassClf.predict(X_train)
    print 'Train accuracy:'
    print accuracy_score(predictionTrain.toarray(), Y_train.toarray())
    print 'Train Recall:'
    print recall_score(predictionTrain.toarray(),
                       Y_train.toarray(),
                       average='macro')
    print 'F1 Score:'
    print f1_score(predictionTrain.toarray(),
                   Y_train.toarray(),
                   average='macro')
    prediction = multiClassClf.predict(X_test)
    print 'Accuracy is: ',
    print accuracy_score(prediction.toarray(), Y_test.toarray())
    print 'F1 Score:'
    print f1_score(prediction.toarray(), Y_test.toarray(), average='macro')
Example #9
def load_reuters(nb_words=2000, test_split=0.2):

    rcv1 = fetch_rcv1()

    ind_ccat = (rcv1.target[:, 33] == 1).toarray().reshape(804414)
    ind_ecat = (rcv1.target[:, 59] == 1).toarray().reshape(804414)
    ind_gcat = (rcv1.target[:, 70] == 1).toarray().reshape(804414)
    ind_mcat = (rcv1.target[:, 102] == 1).toarray().reshape(804414)

    # keep only documents labelled with exactly one of the four
    # top-level categories (single-label samples)
    ind_valid = np.logical_or(
        np.logical_and(np.logical_xor(ind_ccat, ind_mcat),
                       np.logical_and(~ind_gcat, ~ind_ecat)),
        np.logical_and(np.logical_xor(ind_gcat, ind_ecat),
                       np.logical_and(~ind_ccat, ~ind_mcat)))

    y = rcv1.target[ind_valid, ].toarray()[:, [33, 59, 70, 102]].argmax(axis=1)

    ind_word = np.argsort(np.bincount(
        rcv1.data[ind_valid, ].nonzero()[1]))[::-1][0:nb_words]

    X = rcv1.data[ind_valid, ][:, ind_word].toarray()

    X_train = X[:int(len(X) * (1 - test_split))]
    y_train = y[:int(len(X) * (1 - test_split))]

    X_test = X[int(len(X) * (1 - test_split)):]
    y_test = y[int(len(X) * (1 - test_split)):]

    input_shape = (nb_words, )

    return (X_train, y_train), (X_test, y_test), input_shape
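A usage sketch (the signature mirrors the Keras-style dataset loaders it imitates):

(X_train, y_train), (X_test, y_test), input_shape = load_reuters(nb_words=2000)
print(X_train.shape, y_train.shape, input_shape)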
Example #10
def load_data():
	rcv1 = fetch_rcv1()
	X = rcv1.data.T
	num_samples = X.shape[1]

	# Find the index for 'CCAT'
	ccat_index = -1
	for i, label in enumerate(rcv1.target_names):
		if label == 'CCAT':
			ccat_index = i
			break

	# Convert encoding to {-1, 1}
	Y = np.zeros((1, num_samples))

	numpos = 0
	numneg = 0
	for i in range(rcv1.target.shape[0]):
		y = rcv1.target[i, ccat_index]
		if y == 1:
			numpos += 1
			Y[0, i] = 1
		else:
			numneg += 1
			Y[0, i] = -1

	return (X.tocsc(), Y, numpos, numneg)
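The per-element loop above touches one sparse entry at a time, which is slow for 804,414 documents. A vectorized alternative (a sketch, not part of the original snippet) materializes the single CCAT column once:

import numpy as np

def load_data_vectorized(rcv1, ccat_index):
    # pull the CCAT column out as a dense 1-D array in one shot
    col = rcv1.target[:, ccat_index].toarray().ravel()
    Y = np.where(col == 1, 1.0, -1.0).reshape(1, -1)
    numpos = int((col == 1).sum())
    numneg = col.size - numpos
    return rcv1.data.T.tocsc(), Y, numpos, numneg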
Example #11
def load_data_rcv1_test():

    rcv1 = fetch_rcv1()

    # everything after the first 23149 documents is the official test split
    X_coo = rcv1.data[23149:].tocoo()

    Y_coo = rcv1.target[23149:].tocoo()

    values = X_coo.data
    indices = np.vstack((X_coo.row, X_coo.col))
    indices = np.vstack((X_coo.row, X_coo.col))

    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    shape = X_coo.shape

    X_sparse = torch.sparse.DoubleTensor(i, v, torch.Size(shape))

    indices = np.vstack((Y_coo.row, Y_coo.col))
    values = Y_coo.data

    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    shape = Y_coo.shape

    Y_sparse = torch.sparse.DoubleTensor(i, v, torch.Size(shape))

    return X_sparse, Y_sparse
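Recent PyTorch releases expose torch.sparse_coo_tensor, which collapses the two conversions above into a single helper; a minimal sketch (assumes PyTorch >= 1.0, not the original code):

import numpy as np
import torch

def scipy_coo_to_torch(m):
    # scipy sparse matrix -> torch sparse COO tensor (float64, as above)
    coo = m.tocoo()
    indices = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data.astype(np.float64))
    return torch.sparse_coo_tensor(indices, values, coo.shape)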
Example #12
 def __init__(self, data_name, train=True):
     self.train = train
     self.data = data_name
     if data_name == 'rcv1':
         self.rcv1 = fetch_rcv1()
         X_train, Y_train, X_test, Y_test = rcv1_test(self.rcv1)
         if train:
             self.samples = X_train
         else:
             self.samples = X_test
     else:
         if data_name == 'yelp':
             X_train, Y_train, X_test, Y_test, train_ids, test_ids = yelp_test(
             )
         elif data_name == 'nyt':
             X_train, Y_train, X_test, Y_test, train_ids, test_ids = nyt_test(
             )
         else:
             X_train, Y_train, X_test, Y_test, train_ids, test_ids = fungo_test(
                 data_name)
         if train:
             self.samples = X_train
             self.ids = train_ids
         else:
             self.samples = X_test
             self.ids = test_ids
Example #13
def build_file_rcv1():
    rcv1 = fetch_rcv1()

    fi = open("rcv1.txt","w")
    for sample in rcv1.data[1:5]:
        print(sample.toarray())
        fi.write(str(sample.toarray()) + '\n')
    fi.close()
Example #14
 def load_data(self):
     rcv1 = fetch_rcv1(subset='train', download_if_missing=False)
     x = rcv1.data.A  # numpy.float64
     x = x.astype(np.float32)  # cast the dtype, otherwise it errors downstream
     self.xArray = torch.from_numpy(x)
     y = rcv1.target.A
     y = y.astype(np.float32)  # cast the dtype, otherwise it errors downstream
     self.yArray = torch.from_numpy(y)
Example #15
def load_validation_data(path_to_ids):
    data = fetch_rcv1(subset='test')
    ids = pd.read_csv(path_to_ids, names=['id'], dtype=np.int32)
    mask = np.isin(data.sample_id, ids['id'])
    validation_data = data.data[mask]
    validation_target = data.target[mask].toarray()
    validation_ids = data.sample_id[mask]
    return validation_data, validation_target, validation_ids
Example #16
def rcv1():
    rcv1 = fetch_rcv1(subset='train')
    # train_data = rcv1.data
    # train_label = rcv1.target

    train_data = csr_matrix(rcv1.data[0:1000]).toarray()
    # train_data2 = csr_matrix(rcv1.data).toarray()
    # print(train_data.shape)
    train_data = tf.train.batch([train_data], 128)
Example #17
def rcv1_test():
    from sklearn.datasets import fetch_rcv1
    rcv1 = fetch_rcv1()
    # the first 23149 documents form the official training split
    X_train = rcv1.data[:23149]
    Y_train = rcv1.target[:23149]
    X_test = rcv1.data[23149:]
    Y_test = rcv1.target[23149:]
    print(Y_train[:2])
    print(rcv1.target_names[34], rcv1.target_names[59])
    return X_train, Y_train, X_test, Y_test
Example #18
 def load_data(self):
     rcv1 = fetch_rcv1(subset='train', download_if_missing=False)
     x = rcv1.data.A  # numpy.float64
     x = x.astype(np.float32)
     self.xArray = torch.from_numpy(x)
     print("length = ", len(self.xArray))
     # csr_matrix -> numpy.ndarray -> torch.tensor
     y = rcv1.target.A
     y = y.astype(np.float32)  # cast the dtype, otherwise it errors downstream
     self.yArray = torch.from_numpy(y)
Example #19
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == "lfw_people":
        X = fetch_lfw_people().data
    elif dataset_name == "20newsgroups":
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == "olivetti_faces":
        X = fetch_olivetti_faces().data
    elif dataset_name == "rcv1":
        X = fetch_rcv1().data
    elif dataset_name == "CIFAR":
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [
            unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
            for i in range(5)
        ]
        X = np.vstack(X1)
        del X1
    elif dataset_name == "SVHN":
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)["X"]
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == "low rank matrix":
        X = make_low_rank_matrix(
            n_samples=500,
            n_features=int(1e4),
            effective_rank=100,
            tail_strength=0.5,
            random_state=random_state,
        )
    elif dataset_name == "uncorrelated matrix":
        X, _ = make_sparse_uncorrelated(n_samples=500,
                                        n_features=10000,
                                        random_state=random_state)
    elif dataset_name == "big sparse matrix":
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name, parser="auto").data
    return X
Example #20
def get_datasets_rcv1(subset='train', shuffle=True, random_state=42):
    """
    Retrieve the RCV1 dataset.
    :param subset: train, test or all
    :param shuffle: shuffle the data or not
    :param random_state: seed integer to shuffle the dataset
    :return: data and labels of RCV1
    """
    # fetch_rcv1 has no `categories` parameter, unlike fetch_20newsgroups
    datasets = fetch_rcv1(subset=subset, shuffle=shuffle, random_state=random_state)
    return datasets
Example #21
def make_rcv1_data():
    print('Loading RCV1 features ...')
    rcv1_dic = fetch_rcv1(subset='test')
    X = rcv1_dic.data
    y = rcv1_dic.target
    save_npz(os.path.join(DATASETS_PATH, 'rcv1_X'), X)
    y = np.asarray(y.todense())[:, 0]  # binary target: first category vs rest
    y = np.where(y == 1, 1, -1)
    print(np.unique(y, return_counts=True))
    np.save(os.path.join(DATASETS_PATH, 'rcv1_y'), y)
    print(' ... Dataset created !')
    return
Example #22
def print_some(pred, Y, k=500):
    pred_tmp = pred[:k].todense().tolist()
    from sklearn.datasets import fetch_rcv1
    rcv1 = fetch_rcv1()
    for tmp, y in zip(pred_tmp, Y.todense().tolist()):
        for i in range(len(tmp)):
            if tmp[i] == 1:
                print(rcv1.target_names[i], end=' ')
        print()
        for i in range(len(tmp)):
            if y[i] == 1:
                print(rcv1.target_names[i], end=' ')
        print('---')
Example #23
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
              for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        X = make_low_rank_matrix(n_samples=500, n_features=np.int(1e4),
                                 effective_rank=100, tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = np.int(1e6)
        size = np.int(1e6)
        small_size = np.int(1e4)
        data = np.random.normal(0, 1, np.int(sparsity/10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_mldata(dataset_name).data
    return X
Example #24
def get_data(split_type='random'):
    if os.path.exists('./data/features.npz') and os.path.exists(
            './data/labels.npz'):
        features = load_sparse_csr('data/features.npz')
        labels = load_sparse_csr('data/labels.npz')
        if ((os.path.exists('./data/first-indices-{}.npy'.format(split_type))
             and os.path.exists(
                 './data/second-indices-{}.npy'.format(split_type)))
                or os.path.exists(
                    './data/first-labels-{}.npy'.format(split_type))):

            if not split_type == 'coarse':
                first_features = load_sparse_csr(
                    './data/first-features-{}.npz'.format(split_type))
                first_labels = load_sparse_csr(
                    './data/first-labels-{}.npz'.format(split_type))

                second_features = load_sparse_csr(
                    './data/second-features-{}.npz'.format(split_type))
                second_labels = load_sparse_csr(
                    './data/second-labels-{}.npz'.format(split_type))

                # Get split indices array
                first_ind = np.load(
                    './data/first-indices-{}.npy'.format(split_type))
                second_ind = np.load(
                    './data/second-indices-{}.npy'.format(split_type))
                return first_features, first_labels, second_features, second_labels, (
                    first_ind, second_ind)
            else:
                first_labels = np.load('./data/first-labels-coarse.npy')
                second_labels = np.load('./data/second-labels-coarse.npy')
                return features, first_labels, second_labels
        else:
            if split_type == 'coarse':
                return coarse_categorical_split(features, labels)
            else:
                return split(features, labels, split_type)

    else:
        rcv1 = fetch_rcv1()
        features, labels = rcv1.data, rcv1.target
        save_sparse_csr('data/features', features)
        save_sparse_csr('data/labels', labels)

        if split_type == 'coarse':
            return coarse_categorical_split(features, labels)
        else:
            return split(features, labels, split_type)
Example #25
def preprocess(cache_location, output_location):

    np.random.seed(10000019)
    print("Fetching RCV1 dataset")
    rcv1 = fetch_rcv1()

    print("Shape of the data:", rcv1.data.shape)

    print("Index of CCAT:", rcv1.target_names.tolist().index("CCAT"))

    # get the first SIZE samples
    features = rcv1.data[:SIZE]
    categories = rcv1.target[:SIZE]

    # convert labels to 1, -1
    # our classification is binary: in/out of class 33
    print("Converting labels")
    labels = np.array([mk_label(row.toarray()[0, 33]) for row in categories])

    # test the sklearn classifier
    classify(features, labels)

    # shuffle the dataset
    print("Shuffling dataset")
    index = np.arange(np.shape(features)[0])
    np.random.shuffle(index)
    features = features[index, :]
    labels = labels[index]

    classify(features, labels)

    # shrink the dataset
    print("Shrinking to size")
    features = features[:SHRUNK_SIZE]
    labels = labels[:SHRUNK_SIZE]

    classify(features, labels)

    # save the dataset
    print("Saving")
    np.save(os.path.join(output_location, FILENAME_D), features.data)
    np.save(os.path.join(output_location, FILENAME_INDICES), features.indices)
    np.save(os.path.join(output_location, FILENAME_INDPTR), features.indptr)
    np.save(os.path.join(output_location, FILENAME_Y), labels)

    # print statistics
    print("Shape of the data is:", features.shape)
Example #26
def import_data():
	rcv1 = fetch_rcv1()

	# rows whose column 33 (CCAT) equals 1
	aa = rcv1['target'][:, 33]
	kk = list(aa.toarray().reshape(-1, ).astype("int"))
	positive_ind = [i for i, x in enumerate(kk) if x == 1]
	negative_ind = [i for i, x in enumerate(kk) if x == 0]

	# generate new -1 and 1 target
	# len(postive_ind)+len(negative_ind) = 804,414
	new_target = np.ones(804414) #check how long
	for i in negative_ind:
	    new_target[i] = -1

	new_rcv1 = sparse.hstack([rcv1['data'],new_target.reshape(-1,1)]) #804414x47237
	csr_data = new_rcv1.tocsr()

	return csr_data
Example #27
def evaluate_score(train_dataset, test_dataset, number_of_labels):
    # train = fetch_rcv1(subset='train', shuffle=True, random_state=42)
    # number_of_labels = 10
    train = deepcopy(train_dataset)
    test = deepcopy(test_dataset)

    train.target = train.target[:, range(number_of_labels)]
    bool_array = np.zeros(shape=train.target.shape[0], dtype=bool)

    for i in range(number_of_labels):
        bool_array = np.logical_or(
            bool_array,
            np.array(((train.target[:, i] == 1).todense())).flatten())

    train.data = train.data[bool_array, :]
    train.target = train.target[bool_array, :]

    print("Total number of train documents : " + str(train.target.shape[0]))

    classifier = RandomForestClassifier(n_estimators=10)
    classifier.fit(train.data, train.target.todense())

    # use the deep-copied test_dataset argument instead of re-fetching
    test.data = test.data[:30000, :]
    test.target = test.target[:30000, :]
    test.target = test.target[:, range(number_of_labels)]

    bool_array = np.zeros(shape=test.target.shape[0], dtype=bool)

    for i in range(number_of_labels):
        bool_array = np.logical_or(
            bool_array,
            np.array(((test.target[:, i] == 1).todense())).flatten())

    test.data = test.data[bool_array, :]
    test.target = test.target[bool_array, :]

    print("Total number of test documents : " + str(test.target.shape[0]))

    predicted = classifier.predict(test.data)

    # print("Jaccard Similarity Score is : "+str(jaccard_similarity_score(test.target, predicted)))
    return jaccard_similarity_score(test.target, predicted)
Example #28
def sparse():
    rcv1 = fetch_rcv1(subset='train')

    train = rcv1.data

    print("들어감")
    a = scipy.sparse.coo_matrix(train[0:2])

    indices = np.array([a.row, a.col], dtype=np.int64).T
    # indices = np.array([[3, 2, 0], [4, 5, 1]], dtype=np.int64)
    values = a.data
    print(a.shape)

    # print(b.get_shape())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        b = tf.SparseTensor(indices=np.array([a.row, a.col]).T,
                            values=a.data,
                            dense_shape=a.shape)
        print(sess.run(b))
Example #29
def load(name):
    """
    Load the database from Lazy Initialized Dictionary with its known name.
    :param name: Name of database
    :return: tuple(X, y)
    """
    databases = LazyDict({
        'breast_cancer': lambda: load_breast_cancer(return_X_y=True),
        'cov_type': lambda: itemgetter('data', 'target')(fetch_covtype()),
        'digits': lambda: load_digits(return_X_y=True),
        'iris': lambda: load_iris(return_X_y=True),
        'kddcup99': lambda: fetch_kddcup99(return_X_y=True),
        'lfw': lambda: fetch_lfw_people(return_X_y=True),
        'mnist': lambda: openml.fetch_openml('mnist_784', version=1,
                                             return_X_y=True),
        'news_groups': lambda: itemgetter('data', 'target')(
            fetch_20newsgroups_vectorized(subset='all')),
        'olivetti_faces': lambda: itemgetter('data', 'target')(
            fetch_olivetti_faces()),
        'rcv1': lambda: fetch_rcv1(random_state=0, return_X_y=True),
        'wine': lambda: load_wine(return_X_y=True)
    })
    return databases.get(name)
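A usage sketch (names as above; the LazyDict defers each fetch until first access):

X, y = load('rcv1')
print(X.shape, y.shape)  # (804414, 47236) (804414, 103)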
Example #30
def load_sklearn_dataset(data_set_name="covtype", n=1000, d=10):
    if data_set_name == "covtype":
        covtype = fetch_covtype()
        X = normalize(covtype.data[:n])
        y = covtype.target[:n]
        return X, y

    if data_set_name == "rcv1":
        rcv1 = fetch_rcv1()
        X = normalize(rcv1.data[:n])
        y = rcv1.target[:n]
        return X, y

    if data_set_name == "lfw":
        lfw = fetch_lfw_people()
        print(lfw.data.shape)
        print(lfw.target.shape)
        X = normalize(lfw.data[:n], axis=1)
        # MinMaxScaler().transform(lfw.data[:n])
        # scale(lfw.data[:n])

        y = lfw.target[:n]
        return X, y
Example #31
def main():
    # Fetch the rcv1 dataset from sklearn.
    rcv1 = fetch_rcv1()

    # Clean and reformat the dataset.
    target = rcv1['target'].todense()
    label = np.array(target[:, 33]).reshape(1, -1)[0]
    # astype copies the data; assigning to label.dtype would merely
    # reinterpret the float64 buffer as int8
    label = label.astype('int8')
    label[label == 0] = -1

    # Create numpy array of training data.
    training_data = rcv1['data'][0:100000, :]

    # Assign labels to training data.
    training_label = label[0:100000]

    test_data = rcv1['data'][100000:, :]
    test_label = label[100000:]

    # Save the training and test datasets to disk.
    np.save('test_data_rcv1.npy', test_data)
    np.save('test_label_rcv1', test_label)
    np.save('training_data_rcv1', training_data)
    np.save('training_label_rcv1', training_label)
Example #32
        error_rate.append(error(train_data, train_label, w))
        #Test error for problem 5
        error_rate_test.append(error(test_data, test_label, w))
    return error_rate, error_rate_test


def error(data, label, w):
    predict = w.dot(data.transpose()).toarray()
    wx = predict.transpose() * label
    res = np.sum(wx < 0)
    error_rate = res / label.shape[0]
    return error_rate


if __name__ == '__main__':
    rcv1 = fetch_rcv1()
    CCAT, label, data = problem1a(rcv1)
    train_data, train_label, test_data, test_label = problem1b(rcv1, 100000)
    Train_error_P, Test_error_P, min_train_P, min_test_P = problem2(
        train_data, train_label, test_data, test_label, 0.0001, 2000, 50)
    Train_error_A, Test_error_A, min_train_A, min_test_A = problem3(
        train_data, train_label, test_data, test_label, 1e-7, 0.0001, 2000, 50)
    loss1, accuracy1, error_DNN1 = problem4(train_data, train_label, test_data,
                                            test_label, 5, 375)
#    loss2, accuracy2, error_DNN2 = problem4(train_data,train_label,test_data,test_label,2,100)
#    loss3, accuracy3, error_DNN3 = problem4(train_data,train_label,test_data,test_label,3,100)
#    y = [error_DNN1,error_DNN2,error_DNN3]
#    x = [1,2,3]
#    plt.figure()
#    plt.plot(x,y)
#    plt.xlabel('Hidden layers')
Example #33
def load_data_multi_classes_rcv1():

    print('start loading data...')

    rcv1 = fetch_rcv1()

    # the first 23149 documents form the official training split
    X_coo = rcv1.data[0:23149].tocoo()

    Y_coo = rcv1.target[0:23149].tocoo()

    values = X_coo.data
    indices = np.vstack((X_coo.row, X_coo.col))

    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    shape = X_coo.shape

    X = torch.sparse.DoubleTensor(i, v, torch.Size(shape)).to_dense()

    indices = np.vstack((Y_coo.row, Y_coo.col))
    values = Y_coo.data

    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    shape = Y_coo.shape

    Y = torch.sparse.DoubleTensor(i, v, torch.Size(shape)).to_dense()

    # remap labels to consecutive integers 0..k-1 if they are not already
    Y_uniques = torch.unique(Y)

    if not (set(Y_uniques.numpy()) == set(range(Y_uniques.shape[0]))):
        Y_copy = torch.zeros(Y.shape)

        for k in range(Y_uniques.shape[0]):
            Y_copy[(Y == Y_uniques[k]).nonzero()[:, 0]] = k

        Y = Y_copy

    print('X_max::', torch.max(X))
    print('X_min::', torch.min(X))
    print('Y_max::', torch.max(Y))
    print('Y_min::', torch.min(Y))
    print('x_shape::', X.shape)
    print('y_shape::', Y[:, 0].shape)

    print('loading data done...')

    return split_train_test_data(X, Y[:, 0], 0.1, True)
Example #34
def exp(solvers, penalty, single_target,
        n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    dtypes_mapping = {
        "float64": np.float64,
        "float32": np.float32,
    }

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()

        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)

        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n

    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(fit_single)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            dtype=dtype,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for dtype in dtypes_mapping.values())

    res = []
    idx = 0
    for dtype_name in dtypes_mapping.keys():
        for solver in solvers:
            if not (skip_slow and
                    solver == 'lightning' and
                    penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                dtype=dtype_name,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
Example #35
    for (name, _, _, train_losses, _, _, durations) in clfs:
        pobj_final.append(train_losses[-1])

    indices = np.argsort(pobj_final)
    pobj_best = pobj_final[indices[0]]

    for (name, _, _, train_losses, _, _, durations) in clfs:
        log_pobj = np.log(abs(np.array(train_losses) - pobj_best)) / np.log(10)

        plt.plot(durations, log_pobj, '-o', label=name)
        plt.legend(loc=0)
        plt.xlabel("seconds")
        plt.ylabel("log(best - train_loss)")


rcv1 = fetch_rcv1()
X = rcv1.data
n_samples, n_features = X.shape

# consider the binary classification problem 'CCAT' vs the rest
ccat_idx = rcv1.target_names.tolist().index('CCAT')
y = rcv1.target.tocsc()[:, ccat_idx].toarray().ravel().astype(np.float64)
y[y == 0] = -1

# parameters
C = 1.
fit_intercept = True
tol = 1.0e-14

# max_iter range
sgd_iter_range = list(range(1, 121, 10))
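The fragment stops before the sweep itself; a minimal sketch of how the max_iter range might be consumed (assumed code, not the original benchmark):

import time
from sklearn.linear_model import SGDClassifier

for max_iter in sgd_iter_range:
    # alpha = 1 / (C * n_samples) converts the C parametrization to SGD's
    clf = SGDClassifier(alpha=1. / (C * n_samples), fit_intercept=fit_intercept,
                        tol=tol, max_iter=max_iter)
    t0 = time.time()
    clf.fit(X, y)
    print(max_iter, time.time() - t0, clf.score(X, y))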
Example #36
def exp(solvers, penalties, single_target, n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    mem = Memory(cachedir=expanduser('~/cache'), verbose=0)

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()

        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)

        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n

    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    cached_fit = mem.cache(fit_single)
    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(cached_fit)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for penalty in penalties)

    res = []
    idx = 0
    for solver in solvers:
        for penalty in penalties:
            if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
Example #37
from sklearn.datasets import fetch_rcv1
import scipy.io as sio
import numpy
import gzip, cPickle
import os

# target_dir = '/home/bo/Data/RCV1/Processed'
# data_home = '/home/bo/Data'
#target_dir = '/project/sidir001/yang4173/Data/RCV1/Processed'
#data_home = '/project/sidir001/yang4173/Data'
target_dir = 'data/RCV1/Processed'
data_home = 'data'

cwd = os.getcwd()
data = fetch_rcv1(data_home=data_home, download_if_missing=True)
names = data.target_names

ind = numpy.full(len(names), False, dtype = bool)
f = open(data_home + '/RCV1/rcv1.topics.hier.orig.txt', 'r')
count = 0
for i in range(len(names) + 1):
    s = f.readline()
    if s[9:12] == 'CAT':
        ind[i - 1] = True
        count = count + 1
f.close()

labels = data.target[:][:, ind].copy()
labels = labels.toarray()
t = labels.sum(axis = 1, keepdims = False)