def test_fetch_rcv1():
    try:
        data1 = fetch_rcv1(shuffle=False, download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("Download RCV1 dataset to run this test.")
    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert_true(sp.issparse(X1))
    assert_true(sp.issparse(Y1))
    assert_equal(60915113, X1.data.size)
    assert_equal(2606875, Y1.data.size)

    # test shapes
    assert_equal((804414, 47236), X1.shape)
    assert_equal((804414, 103), Y1.shape)
    assert_equal((804414,), s1.shape)
    assert_equal(103, len(cat_list))

    # test ordering of categories
    first_categories = [u'C11', u'C12', u'C13', u'C14', u'C15', u'C151']
    assert_array_equal(first_categories, cat_list[:6])

    # test number of samples for some categories
    some_categories = ('GMIL', 'E143', 'CCAT')
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert_equal(num, Y1[:, j].data.size)

    # test shuffling and subset
    data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77,
                       download_if_missing=False)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # test return_X_y option
    fetch_func = partial(fetch_rcv1, shuffle=False, subset='train',
                         download_if_missing=False)
    check_return_X_y(data2, fetch_func)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
def test_fetch_rcv1():
    try:
        data1 = fetch_rcv1(shuffle=False, download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("Download RCV1 dataset to run this test.")
    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert_true(sp.issparse(X1))
    assert_true(sp.issparse(Y1))
    assert_equal(60915113, X1.data.size)
    assert_equal(2606875, Y1.data.size)

    # test shapes
    assert_equal((804414, 47236), X1.shape)
    assert_equal((804414, 103), Y1.shape)
    assert_equal((804414,), s1.shape)
    assert_equal(103, len(cat_list))

    # test number of samples for some categories
    some_categories = ('GMIL', 'E143', 'CCAT')
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert_equal(num, Y1[:, j].data.size)

    # test shuffling and subset
    data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77,
                       download_if_missing=False)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
def run(self, step_limit):
    self.train()
    rcv1 = fetch_rcv1(subset='train')
    train_data = rcv1.data
    train_label = rcv1.target

    rcv1 = fetch_rcv1(subset='test', random_state=1)
    test_data = rcv1.data
    test_label = rcv1.target

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        path = "LSTM/" + str(step_limit) + "rcv1"
        saver = NNutils.save(path, sess)
        writer, writer_test, merged = NNutils.graph(path, sess)

        step = sess.run(self.global_step)
        while step < step_limit:
            print("step :", step)
            for start, end in zip(
                    range(0, train_data.shape[0], self.batch_size),
                    range(self.batch_size, train_data.shape[0], self.batch_size)):
                data = scipy.sparse.coo_matrix(train_data[start:end])
                label = scipy.sparse.coo_matrix(train_label[start:end])
                indices = np.array([data.row, data.col]).T
                # the label tensor needs its own COO indices; the original
                # reused `indices` from the feature matrix, which mismatches
                # label.data in length
                label_indices = np.array([label.row, label.col]).T

                summary, _, loss, step = sess.run(
                    [merged, self.training, self.cost, self.global_step],
                    feed_dict={self.x: (indices, data.data, data.shape),
                               self.y: (label_indices, label.data, label.shape),
                               self.dropout_conv: 1.0,
                               self.dropout_normal: 1.0})

                if step % 50 == 0:
                    writer.add_summary(summary, step)
                    print(step, datetime.now(), loss)

            summary, loss, accuracy = sess.run(
                [merged, self.cost, self.accuracy],
                feed_dict={self.x: test_data[0:1000],
                           self.y: test_label[0:1000],
                           self.dropout_conv: 1.0,
                           self.dropout_normal: 1.0})
            writer_test.add_summary(summary, step)
            print("test results : ", accuracy, loss)
def test_fetch_rcv1_true_and_test():
    hold = fetch_rcv1(download_if_missing=True, shuffle=True)
    data3 = fetch_rcv1(download_if_missing=False, shuffle=True, subset="test")
    X3, Y3 = data3.data, data3.target
    catlist2, s3 = data3.target_names.tolist(), data3.sample_id

    assert_equal((781265, 47236), X3.shape)
    assert_equal((781265, 103), Y3.shape)

    first_categories = [u'C11', u'C12', u'C13', u'C14', u'C15', u'C151']
    assert_array_equal(first_categories, catlist2[:6])
def _download_rcv1():
    """
    Download the rcv1 dataset via scikit-learn.

    :return: The train and test sets.
    """
    from sklearn.datasets import fetch_rcv1
    print("downloading rcv1 train data....")
    newsgroups_train = fetch_rcv1(subset='train')
    print("downloading rcv1 test data....")
    newsgroups_test = fetch_rcv1(subset='test')
    train_set = (newsgroups_train.data, newsgroups_train.target)
    test_set = (newsgroups_test.data, newsgroups_test.target)
    return train_set, test_set
def __init__(self, dataset=None):
    self.dataset = dataset if dataset is not None else fetch_rcv1()
    self.parent_hierarchy, self.children_hierarchy = self.__get_rcv1_hierarchy()
    # CSC format makes single-label (column) indexing fast
    self.csc_target = csc_matrix(self.dataset.target)
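# A minimal sketch of the design choice above: column slicing is cheap on CSC
# matrices, so per-label document lookups avoid repeated format conversions.
# The toy variable names here are illustrative only.
import numpy as np
from scipy.sparse import csc_matrix

toy_target = csc_matrix(np.array([[1, 0], [0, 1], [1, 1]]))
docs_with_label_0 = toy_target[:, 0].nonzero()[0]  # rows where label 0 is set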
def train():
    convertdict()
    print('Start training a multiclass Naive Bayesian Classifier... '
          + str(trainingnum) + ' training samples are used.')
    rcv1 = fetch_rcv1(data_home=Paths.rcv1DataHome, random_state=1)
    X_train = rcv1.data[:trainingnum]
    Y_train = rcv1.target[:trainingnum]
    X_test = rcv1.data[trainingnum:]
    Y_test = rcv1.target[trainingnum:]

    multiClassClf = OneVsRestClassifier(MultinomialNB()).fit(X_train, Y_train)
    joblib.dump(multiClassClf, Paths.pklDataPath + 'NBClassifier.pkl')

    # sklearn metrics take (y_true, y_pred); the original passed them reversed
    predictionTrain = multiClassClf.predict(X_train)
    print('Train accuracy:')
    print(accuracy_score(Y_train.toarray(), predictionTrain.toarray()))
    print('Train Recall:')
    print(recall_score(Y_train.toarray(), predictionTrain.toarray(),
                       average='macro'))
    print('F1 Score:')
    print(f1_score(Y_train.toarray(), predictionTrain.toarray(),
                   average='macro'))

    prediction = multiClassClf.predict(X_test)
    print('Accuracy is:', accuracy_score(Y_test.toarray(), prediction.toarray()))
    print('F1 Score:')
    print(f1_score(Y_test.toarray(), prediction.toarray(), average='macro'))
def load_reuters(nb_words=2000, test_split=0.2):
    rcv1 = fetch_rcv1()
    # columns 33, 59, 70, 102 are the top-level categories CCAT, ECAT, GCAT, MCAT
    ind_ccat = (rcv1.target[:, 33] == 1).toarray().reshape(804414)
    ind_ecat = (rcv1.target[:, 59] == 1).toarray().reshape(804414)
    ind_gcat = (rcv1.target[:, 70] == 1).toarray().reshape(804414)
    ind_mcat = (rcv1.target[:, 102] == 1).toarray().reshape(804414)

    # keep documents that carry exactly one of the four top-level categories
    ind_valid = np.logical_or(
        np.logical_and(np.logical_xor(ind_ccat, ind_mcat),
                       np.logical_and(~ind_gcat, ~ind_ecat)),
        np.logical_and(np.logical_xor(ind_gcat, ind_ecat),
                       np.logical_and(~ind_ccat, ~ind_mcat)))
    y = rcv1.target[ind_valid].toarray()[:, [33, 59, 70, 102]].argmax(axis=1)

    # keep the nb_words most frequent features
    ind_word = np.argsort(np.bincount(
        rcv1.data[ind_valid].nonzero()[1]))[::-1][0:nb_words]
    X = rcv1.data[ind_valid][:, ind_word].toarray()

    X_train = X[:int(len(X) * (1 - test_split))]
    y_train = y[:int(len(X) * (1 - test_split))]
    X_test = X[int(len(X) * (1 - test_split)):]
    y_test = y[int(len(X) * (1 - test_split)):]
    input_shape = (nb_words,)
    return (X_train, y_train), (X_test, y_test), input_shape
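# A hedged alternative sketch: resolve the four top-level category columns by
# name instead of hard-coding 33, 59, 70, 102 (CCAT, ECAT, GCAT, MCAT per the
# variable names above); this also avoids the hard-coded sample count.
from sklearn.datasets import fetch_rcv1

rcv1 = fetch_rcv1()
names = rcv1.target_names.tolist()
cat_cols = [names.index(c) for c in ('CCAT', 'ECAT', 'GCAT', 'MCAT')]
n_samples = rcv1.target.shape[0]
ind_cats = [(rcv1.target[:, j] == 1).toarray().reshape(n_samples)
            for j in cat_cols]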
def load_data():
    rcv1 = fetch_rcv1()
    X = rcv1.data.T
    num_samples = X.shape[1]

    # Find the index for 'CCAT'
    ccat_index = -1
    for i, label in enumerate(rcv1.target_names):
        if label == 'CCAT':
            ccat_index = i
            break

    # Convert encoding to {-1, 1}
    Y = np.zeros((1, num_samples))
    numpos = 0
    numneg = 0
    for i in range(rcv1.target.shape[0]):
        y = rcv1.target[i, ccat_index]
        if y == 1:
            numpos += 1
            Y[0, i] = 1
        else:
            numneg += 1
            Y[0, i] = -1

    return (X.tocsc(), Y, numpos, numneg)
def load_data_rcv1_test():
    rcv1 = fetch_rcv1()
    # everything after the first 23149 samples is the official test set
    X_coo = rcv1.data[23149:].tocoo()
    Y_coo = rcv1.target[23149:].tocoo()

    indices = np.vstack((X_coo.row, X_coo.col))
    values = X_coo.data
    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    X_sparse = torch.sparse.DoubleTensor(i, v, torch.Size(X_coo.shape))

    indices = np.vstack((Y_coo.row, Y_coo.col))
    values = Y_coo.data
    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    Y_sparse = torch.sparse.DoubleTensor(i, v, torch.Size(Y_coo.shape))

    return X_sparse, Y_sparse
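# The same COO -> torch sparse conversion appears twice above; a small helper
# sketch (hypothetical name `coo_to_torch_sparse`) factors it out.
import numpy as np
import torch

def coo_to_torch_sparse(coo):
    """Convert a scipy.sparse COO matrix to a torch double sparse tensor."""
    indices = torch.LongTensor(np.vstack((coo.row, coo.col)))
    values = torch.DoubleTensor(coo.data)
    return torch.sparse.DoubleTensor(indices, values, torch.Size(coo.shape))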
def __init__(self, data_name, train=True):
    self.train = train
    self.data = data_name
    if data_name == 'rcv1':
        self.rcv1 = fetch_rcv1()
        X_train, Y_train, X_test, Y_test = rcv1_test(self.rcv1)
        if train:
            self.samples = X_train
        else:
            self.samples = X_test
    else:
        if data_name == 'yelp':
            X_train, Y_train, X_test, Y_test, train_ids, test_ids = yelp_test()
        elif data_name == 'nyt':
            X_train, Y_train, X_test, Y_test, train_ids, test_ids = nyt_test()
        else:
            X_train, Y_train, X_test, Y_test, train_ids, test_ids = fungo_test(
                data_name)
        if train:
            self.samples = X_train
            self.ids = train_ids
        else:
            self.samples = X_test
            self.ids = test_ids
def build_file_rcv1():
    rcv1 = fetch_rcv1()
    fi = open("rcv1.txt", "w")
    for sample in rcv1.data[1:5]:
        print(sample.toarray())
        fi.write(str(sample.toarray()) + '\n')
    fi.close()
def load_data(self):
    rcv1 = fetch_rcv1(subset='train', download_if_missing=False)
    x = rcv1.data.A  # numpy.float64
    x = x.astype(np.float32)  # cast the dtype, otherwise torch raises an error
    self.xArray = torch.from_numpy(x)
    y = rcv1.target.A
    y = y.astype(np.float32)  # cast the dtype, otherwise torch raises an error
    self.yArray = torch.from_numpy(y)
def load_validation_data(path_to_ids):
    data = fetch_rcv1(subset='test')
    ids = pd.read_csv(path_to_ids, names=['id'], dtype=np.int32)
    mask = np.isin(data.sample_id, ids['id'])
    validation_data = data.data[mask]
    validation_target = data.target[mask].toarray()
    validation_ids = data.sample_id[mask]
    return validation_data, validation_target, validation_ids
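# Hypothetical usage sketch for the loader above; 'validation_ids.csv' is an
# assumed file containing one integer RCV1 sample id per line.
X_val, y_val, ids_val = load_validation_data('validation_ids.csv')
print(X_val.shape, y_val.shape, ids_val.shape)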
def rcv1():
    rcv1 = fetch_rcv1(subset='train')
    # densify only the first 1000 samples; the full matrix would be huge
    train_data = csr_matrix(rcv1.data[0:1000]).toarray()
    train_data = tf.train.batch([train_data], 128)
def rcv1_test():
    from sklearn.datasets import fetch_rcv1
    rcv1 = fetch_rcv1()
    # the first 23149 samples form the official training set
    X_train = rcv1.data[:23149]
    Y_train = rcv1.target[:23149]
    X_test = rcv1.data[23149:]
    Y_test = rcv1.target[23149:]
    print(Y_train[:2])
    print(rcv1.target_names[34], rcv1.target_names[59])
    return X_train, Y_train, X_test, Y_test
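# Equivalent split via fetch_rcv1's subset argument: per the scikit-learn
# documentation, subset='train' returns the first 23149 samples and
# subset='test' the remaining 781265, matching the manual slicing above.
from sklearn.datasets import fetch_rcv1

train = fetch_rcv1(subset='train')
test = fetch_rcv1(subset='test')
X_train, Y_train = train.data, train.target
X_test, Y_test = test.data, test.target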
def load_data(self):
    rcv1 = fetch_rcv1(subset='train', download_if_missing=False)
    x = rcv1.data.A  # numpy.float64
    x = x.astype(np.float32)
    self.xArray = torch.from_numpy(x)
    print("length = ", len(self.xArray))
    # csr_matrix -> numpy.ndarray -> torch.tensor
    y = rcv1.target.A
    y = y.astype(np.float32)  # cast the dtype, otherwise torch raises an error
    self.yArray = torch.from_numpy(y)
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)
    if dataset_name == "lfw_people":
        X = fetch_lfw_people().data
    elif dataset_name == "20newsgroups":
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == "olivetti_faces":
        X = fetch_olivetti_faces().data
    elif dataset_name == "rcv1":
        X = fetch_rcv1().data
    elif dataset_name == "CIFAR":
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
              for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == "SVHN":
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)["X"]
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == "low rank matrix":
        X = make_low_rank_matrix(
            n_samples=500,
            n_features=int(1e4),
            effective_rank=100,
            tail_strength=0.5,
            random_state=random_state,
        )
    elif dataset_name == "uncorrelated matrix":
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == "big sparse matrix":
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name, parser="auto").data
    return X
def get_datasets_rcv1(subset='train', shuffle=True, random_state=42):
    """
    Retrieve the RCV1 dataset.

    Note: fetch_rcv1 accepts no `categories` argument (unlike
    fetch_20newsgroups), so that parameter is not supported here.

    :param subset: train, test or all
    :param shuffle: shuffle the samples or not
    :param random_state: seed integer to shuffle the dataset
    :return: data and labels of RCV1
    """
    datasets = fetch_rcv1(subset=subset, shuffle=shuffle,
                          random_state=random_state)
    return datasets
def make_rcv1_data():
    print('Loading RCV1 features ...')
    rcv1_dic = fetch_rcv1(subset='test')
    X = rcv1_dic.data
    y = rcv1_dic.target
    save_npz(os.path.join(DATASETS_PATH, 'rcv1_X'), X)

    y = y.todense()
    y = np.array([1 if y_i[0][0, 0] == 1 else -1 for y_i in y])
    print(np.unique(y, return_counts=True))
    np.save(os.path.join(DATASETS_PATH, 'rcv1_y'), y)
    print(' ... Dataset created !')
    return
def print_some(pred, Y, k=500):
    pred_tmp = pred[:k].todense().tolist()
    from sklearn.datasets import fetch_rcv1
    rcv1 = fetch_rcv1()
    for tmp, y in zip(pred_tmp, Y.todense().tolist()):
        for i in range(len(tmp)):
            if tmp[i] == 1:
                print(rcv1.target_names[i], end=' ')
        print()
        for i in range(len(tmp)):
            if y[i] == 1:
                print(rcv1.target_names[i], end=' ')
        print('---')
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)
    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
              for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        # int() replaces the removed np.int alias
        X = make_low_rank_matrix(n_samples=500, n_features=int(1e4),
                                 effective_rank=100, tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_mldata(dataset_name).data
    return X
def get_data(split_type='random'):
    if os.path.exists('./data/features.npz') and os.path.exists('./data/labels.npz'):
        features = load_sparse_csr('data/features.npz')
        labels = load_sparse_csr('data/labels.npz')
        if ((os.path.exists('./data/first-indices-{}.npy'.format(split_type))
                and os.path.exists('./data/second-indices-{}.npy'.format(split_type)))
                or os.path.exists('./data/first-labels-{}.npy'.format(split_type))):
            if not split_type == 'coarse':
                first_features = load_sparse_csr(
                    './data/first-features-{}.npz'.format(split_type))
                first_labels = load_sparse_csr(
                    './data/first-labels-{}.npz'.format(split_type))
                second_features = load_sparse_csr(
                    './data/second-features-{}.npz'.format(split_type))
                second_labels = load_sparse_csr(
                    './data/second-labels-{}.npz'.format(split_type))
                # Get split indices arrays
                first_ind = np.load('./data/first-indices-{}.npy'.format(split_type))
                second_ind = np.load('./data/second-indices-{}.npy'.format(split_type))
                return (first_features, first_labels, second_features,
                        second_labels, (first_ind, second_ind))
            else:
                first_labels = np.load('./data/first-labels-coarse.npy')
                second_labels = np.load('./data/second-labels-coarse.npy')
                return features, first_labels, second_labels
        else:
            if split_type == 'coarse':
                return coarse_categorical_split(features, labels)
            else:
                return split(features, labels, split_type)
    else:
        rcv1 = fetch_rcv1()
        save_sparse_csr('data/features', rcv1.data)
        save_sparse_csr('data/labels', rcv1.target)
        # use the in-memory copies; the original referenced undefined
        # `features`/`labels` in this branch
        features, labels = rcv1.data, rcv1.target
        if split_type == 'coarse':
            return coarse_categorical_split(features, labels)
        else:
            return split(features, labels, split_type)
def preprocess(cache_location, output_location):
    np.random.seed(10000019)
    print("Fetching RCV1 dataset")
    rcv1 = fetch_rcv1()
    print("Shape of the data:", rcv1.data.shape)
    print("Index of CCAT:", rcv1.target_names.tolist().index("CCAT"))

    # get the first SIZE samples
    features = rcv1.data[:SIZE]
    categories = rcv1.target[:SIZE]

    # convert labels to 1, -1
    # our classification is binary: in/out of class 33 (CCAT)
    print("Converting labels")
    labels = np.array([mk_label(row.toarray()[0, 33]) for row in categories])

    # test the sklearn classifier
    classify(features, labels)

    # shuffle the dataset
    print("Shuffling dataset")
    index = np.arange(np.shape(features)[0])
    np.random.shuffle(index)
    features = features[index, :]
    labels = labels[index]
    classify(features, labels)

    # shrink the dataset
    print("Shrinking to size")
    features = features[:SHRUNK_SIZE]
    labels = labels[:SHRUNK_SIZE]
    classify(features, labels)

    # save the dataset
    print("Saving")
    np.save(os.path.join(output_location, FILENAME_D), features.data)
    np.save(os.path.join(output_location, FILENAME_INDICES), features.indices)
    np.save(os.path.join(output_location, FILENAME_INDPTR), features.indptr)
    np.save(os.path.join(output_location, FILENAME_Y), labels)

    # print statistics
    print("Shape of the data is:", features.shape)
def import_data():
    rcv1 = fetch_rcv1()
    # do we have a simpler way to find the indices of rows containing 1?
    # (see the vectorized sketch below)
    aa = rcv1['target'][:, 33]
    kk = list(aa.toarray().reshape(-1,).astype("int"))
    positive_ind = [i for i, x in enumerate(kk) if x == 1]
    negative_ind = [i for i, x in enumerate(kk) if x == 0]

    # generate a new -1/1 target;
    # len(positive_ind) + len(negative_ind) = 804,414
    new_target = np.ones(804414)
    for i in negative_ind:
        new_target[i] = -1

    # append the target as a final column: 804414 x 47237
    new_rcv1 = sparse.hstack([rcv1['data'], new_target.reshape(-1, 1)])
    csr_data = new_rcv1.tocsr()
    return csr_data
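# A vectorized sketch answering the question in the comment above: the
# positive/negative index scan and the per-element loop can be replaced with
# array operations on the CCAT column (index 33).
import numpy as np
from scipy import sparse
from sklearn.datasets import fetch_rcv1

rcv1 = fetch_rcv1()
ccat = rcv1['target'][:, 33].toarray().ravel()
new_target = np.where(ccat == 1, 1.0, -1.0)
new_rcv1 = sparse.hstack([rcv1['data'], new_target.reshape(-1, 1)]).tocsr()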
def evaluate_score(train_dataset, test_dataset, number_of_labels):
    train = deepcopy(train_dataset)
    train.target = train.target[:, range(number_of_labels)]

    # keep only documents that carry at least one of the selected labels
    bool_array = np.zeros(shape=train.target.shape[0], dtype=bool)
    for i in range(number_of_labels):
        bool_array = np.logical_or(
            bool_array,
            np.array(((train.target[:, i] == 1).todense())).flatten())
    train.data = train.data[bool_array, :]
    train.target = train.target[bool_array, :]
    print("Total number of train documents : " + str(train.target.shape[0]))

    classifier = RandomForestClassifier(n_estimators=10)
    classifier.fit(train.data, train.target.todense())

    # note: the `test_dataset` argument is ignored; the test subset is
    # re-fetched here (the original also deep-copied it, then discarded it)
    test = fetch_rcv1(subset='test', shuffle=True, random_state=42)
    test.data = test.data[:30000, :]
    test.target = test.target[:30000, :]
    test.target = test.target[:, range(number_of_labels)]

    bool_array = np.zeros(shape=test.target.shape[0], dtype=bool)
    for i in range(number_of_labels):
        bool_array = np.logical_or(
            bool_array,
            np.array(((test.target[:, i] == 1).todense())).flatten())
    test.data = test.data[bool_array, :]
    test.target = test.target[bool_array, :]
    print("Total number of test documents : " + str(test.target.shape[0]))

    predicted = classifier.predict(test.data)
    return jaccard_similarity_score(test.target, predicted)
def sparse():
    rcv1 = fetch_rcv1(subset='train')
    train = rcv1.data
    print("loaded")  # translated from Korean
    a = scipy.sparse.coo_matrix(train[0:2])
    indices = np.array([a.row, a.col], dtype=np.int64).T
    values = a.data
    print(a.shape)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        b = tf.SparseTensor(indices=np.array([a.row, a.col]).T,
                            values=a.data,
                            dense_shape=a.shape)
        print(sess.run(b))
def load(name):
    """
    Load the database from a lazily initialized dictionary by its known name.

    :param name: Name of database
    :return: tuple(X, y)
    """
    databases = LazyDict({
        'breast_cancer': lambda: load_breast_cancer(return_X_y=True),
        'cov_type': lambda: itemgetter('data', 'target')(fetch_covtype()),
        'digits': lambda: load_digits(return_X_y=True),
        'iris': lambda: load_iris(return_X_y=True),
        'kddcup99': lambda: load_kddcup99(),
        'lfw': lambda: fetch_lfw_people(return_X_y=True),
        'mnist': lambda: openml.fetch_openml('mnist_784', version=1,
                                             return_X_y=True),
        'news_groups': lambda: itemgetter('data', 'target')(
            fetch_20newsgroups_vectorized(subset='all')),
        'olivetti_faces': lambda: itemgetter('data', 'target')(
            fetch_olivetti_faces()),
        'rcv1': lambda: fetch_rcv1(random_state=0, return_X_y=True),
        'wine': lambda: load_wine(return_X_y=True),
    })
    return databases.get(name)
def load_sklearn_dataset(data_set_name="covtype", n=1000, d=10):
    if data_set_name == "covtype":
        covtype = fetch_covtype()
        X = normalize(covtype.data[:n])
        y = covtype.target[:n]
        return X, y
    if data_set_name == "rcv1":
        rcv1 = fetch_rcv1()
        X = normalize(rcv1.data[:n])
        y = rcv1.target[:n]
        return X, y
    if data_set_name == "lfw":
        lfw = fetch_lfw_people()
        print(lfw.data.shape)
        print(lfw.target.shape)
        X = normalize(lfw.data[:n], axis=1)
        y = lfw.target[:n]
        return X, y
def main():
    # Fetch the rcv1 dataset from sklearn.
    rcv1 = fetch_rcv1()

    # Clean and reformat the dataset.
    target = rcv1['target'].todense()
    label = np.array(target[:, 33]).reshape(1, -1)[0]
    label.dtype = 'int8'
    label[label == 0] = -1

    # Create numpy array of training data.
    training_data = rcv1['data'][0:100000, :]
    # Assign labels to training data.
    training_label = label[0:100000]
    test_data = rcv1['data'][100000:, :]
    test_label = label[100000:]

    # Save the training and test datasets to disk.
    np.save('test_data_rcv1.npy', test_data)
    np.save('test_label_rcv1', test_label)
    np.save('training_data_rcv1', training_data)
    np.save('training_label_rcv1', training_label)
    error_rate.append(error(train_data, train_label, w))
    # Test error for problem 5
    error_rate_test.append(error(test_data, test_label, w))
    return error_rate, error_rate_test


def error(data, label, w):
    predict = w.dot(data.transpose()).toarray()
    wx = predict.transpose() * label
    res = np.sum(wx < 0)
    error_rate = res / label.shape[0]
    return error_rate


if __name__ == '__main__':
    rcv1 = fetch_rcv1()
    CCAT, label, data = problem1a(rcv1)
    train_data, train_label, test_data, test_label = problem1b(rcv1, 100000)
    Train_error_P, Test_error_P, min_train_P, min_test_P = problem2(
        train_data, train_label, test_data, test_label, 0.0001, 2000, 50)
    Train_error_A, Test_error_A, min_train_A, min_test_A = problem3(
        train_data, train_label, test_data, test_label, 1e-7, 0.0001, 2000, 50)
    loss1, accuracy1, error_DNN1 = problem4(train_data, train_label,
                                            test_data, test_label, 5, 375)
def load_data_multi_classes_rcv1(is_classification=True):
    # `is_classification` was referenced but never defined in the original;
    # it is exposed as a parameter here
    print('start loading data...')
    rcv1 = fetch_rcv1()

    # the first 23149 samples form the official training set
    X_coo = rcv1.data[0:23149].tocoo()
    Y_coo = rcv1.target[0:23149].tocoo()

    indices = np.vstack((X_coo.row, X_coo.col))
    values = X_coo.data
    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    X = torch.sparse.DoubleTensor(i, v, torch.Size(X_coo.shape)).to_dense()

    indices = np.vstack((Y_coo.row, Y_coo.col))
    values = Y_coo.data
    i = torch.LongTensor(indices)
    v = torch.DoubleTensor(values)
    Y = torch.sparse.DoubleTensor(i, v, torch.Size(Y_coo.shape)).to_dense()

    if is_classification:
        # remap arbitrary label values onto 0..k-1
        Y_uniques = torch.unique(Y)
        if not (set(Y_uniques.numpy()) == set(range(Y_uniques.shape[0]))):
            Y_copy = torch.zeros(Y.shape)
            for k in range(Y_uniques.shape[0]):
                Y_copy[(Y == Y_uniques[k]).nonzero()[:, 0]] = k
            Y = Y_copy

    print('X_max::', torch.max(X))
    print('X_min::', torch.min(X))
    print('Y_max::', torch.max(Y))
    print('Y_min::', torch.min(Y))
    print('x_shape::', X.shape)
    print('y_shape::', Y[:, 0].shape)
    print('loading data done...')
    return split_train_test_data(X, Y[:, 0], 0.1, True)
def exp(solvers, penalty, single_target, n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    dtypes_mapping = {
        "float64": np.float64,
        "float32": np.float32,
    }

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()
        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)
        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n
    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(fit_single)(solver, X, y,
                            penalty=penalty,
                            single_target=single_target,
                            dtype=dtype,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for dtype in dtypes_mapping.values())

    res = []
    idx = 0
    for dtype_name in dtypes_mapping.keys():
        for solver in solvers:
            if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                dtype=dtype_name,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
for (name, _, _, train_losses, _, _, durations) in clfs:
    pobj_final.append(train_losses[-1])

indices = np.argsort(pobj_final)
pobj_best = pobj_final[indices[0]]

for (name, _, _, train_losses, _, _, durations) in clfs:
    log_pobj = np.log(abs(np.array(train_losses) - pobj_best)) / np.log(10)
    plt.plot(durations, log_pobj, '-o', label=name)

plt.legend(loc=0)
plt.xlabel("seconds")
plt.ylabel("log(best - train_loss)")

rcv1 = fetch_rcv1()
X = rcv1.data
n_samples, n_features = X.shape

# consider the binary classification problem 'CCAT' vs the rest
ccat_idx = rcv1.target_names.tolist().index('CCAT')
y = rcv1.target.tocsc()[:, ccat_idx].toarray().ravel().astype(np.float64)
y[y == 0] = -1

# parameters
C = 1.
fit_intercept = True
tol = 1.0e-14

# max_iter range
sgd_iter_range = list(range(1, 121, 10))
def exp(solvers, penalties, single_target, n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    mem = Memory(cachedir=expanduser('~/cache'), verbose=0)

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()
        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)
        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n
    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    cached_fit = mem.cache(fit_single)
    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(cached_fit)(solver, X, y,
                            penalty=penalty,
                            single_target=single_target,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for penalty in penalties)

    res = []
    idx = 0
    for solver in solvers:
        for penalty in penalties:
            if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
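# A hypothetical invocation of the benchmark harness above, assuming
# fit_single is defined as in the surrounding script; the solver and penalty
# strings follow the names the function already checks for.
if __name__ == '__main__':
    exp(solvers=['saga', 'liblinear', 'lightning'],
        penalties=['l1', 'l2'],
        single_target=True,
        n_samples=10000,
        max_iter=10,
        dataset='rcv1',
        n_jobs=1,
        skip_slow=True)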
from sklearn.datasets import fetch_rcv1
import scipy.io as sio
import numpy
import gzip, cPickle
import os

# target_dir = '/home/bo/Data/RCV1/Processed'
# data_home = '/home/bo/Data'
# target_dir = '/project/sidir001/yang4173/Data/RCV1/Processed'
# data_home = '/project/sidir001/yang4173/Data'
target_dir = 'data/RCV1/Processed'
data_home = 'data'
cwd = os.getcwd()

data = fetch_rcv1(data_home=data_home, download_if_missing=True)
names = data.target_names
ind = numpy.full(len(names), False, dtype=bool)

# mark the categories whose names end in 'CAT' in the topic hierarchy file
f = open(data_home + '/RCV1/rcv1.topics.hier.orig.txt', 'r')
count = 0
for i in range(len(names) + 1):
    s = f.readline()
    if s[9:12] == 'CAT':
        ind[i - 1] = True
        count = count + 1
f.close()

labels = data.target[:][:, ind].copy()
labels = labels.toarray()
t = labels.sum(axis=1, keepdims=False)