import os
import sys
from time import time

import scipy.sparse as sp

from scikits.learn.datasets import load_mlcomp
from scikits.learn.feature_extraction.text import Vectorizer
from scikits.learn.metrics import classification_report

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "Please follow these instructions to get started:"
    print __doc__
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Training a linear SVM (hinge loss and L2 regularizer)..."
parameters = {
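# classification_report summarizes per-class precision, recall and f1 in one
# call, which makes the three separate metric imports redundant. A minimal
# sketch on toy labels (not the newsgroups data):
#
#     print classification_report([0, 0, 1, 1, 2], [0, 1, 1, 1, 2])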
import os
import sys
from time import time

import scipy.sparse as sp

from scikits.learn.datasets import load_mlcomp
from scikits.learn.feature_extraction.text import Vectorizer
from scikits.learn.linear_model.sparse import SGDClassifier
from scikits.learn.metrics import confusion_matrix
from scikits.learn.metrics import classification_report
from scikits.learn.naive_bayes import MultinomialNB

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
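# A hedged sketch of the natural continuation: the test documents are mapped
# into the vocabulary learned on the training set (transform, not
# fit_transform), and the imported MultinomialNB is fit on the sparse
# training matrix. The alpha value is an illustrative assumption, not taken
# from the original script.
news_test = load_mlcomp('20news-18828', 'test')
X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
y_test = news_test.target

clf = MultinomialNB(alpha=0.01).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print classification_report(y_test, y_pred)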
import os
import sys
from time import time

import scipy.sparse as sp

from scikits.learn.datasets import load_mlcomp
from scikits.learn.feature_extraction.text import Vectorizer
from scikits.learn.linear_model.sparse import SGDClassifier
from scikits.learn.metrics import confusion_matrix
from scikits.learn.metrics import classification_report
from scikits.learn.naive_bayes import MultinomialNB

if "MLCOMP_DATASETS_HOME" not in os.environ:
    print "Please follow these instructions to get started:"
    print __doc__
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp("20news-18828", "train")
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
news_test = load_mlcomp("20news-18828", "test")
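# A hedged sketch of the benchmark this variant appears to build toward:
# vectorize the test set, fit the imported SGDClassifier and print a
# confusion matrix. The hyperparameter values are illustrative assumptions.
X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
y_test = news_test.target

clf = SGDClassifier(alpha=0.0001, n_iter=50, penalty="l2")
clf.fit(X_train, y_train)
print confusion_matrix(y_test, clf.predict(X_test))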
import os
import sys
from time import time

import scipy.sparse as sp
import pylab as pl

from scikits.learn.datasets import load_mlcomp
from scikits.learn.svm.sparse import SVC
from scikits.learn.metrics import confusion_matrix

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "Please follow these instructions to get started:"
    print __doc__
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
t0 = time()
news_train = load_mlcomp('20news-18828', 'train', sparse=True)
print "done in %fs" % (time() - t0)

print "news_train.data is sparse: ",
print sp.issparse(news_train.data)

# The documents have been hashed into TF-IDF (Term Frequencies times Inverse
# Document Frequencies) vectors of a fixed dimension.
print "n_samples: %d, n_features: %d" % news_train.data.shape

print "Training a linear SVM (hinge loss and L2 regularizer)..."
parameters = {
    'kernel': 'linear',
    'C': 10,
}
print "parameters:", parameters
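# A minimal sketch of how the parameters dict would be consumed, assuming the
# script goes on to fit the sparse SVC on the precomputed TF-IDF vectors:
t0 = time()
clf = SVC(**parameters).fit(news_train.data, news_train.target)
print "done in %fs" % (time() - t0)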
import os
import sys
from time import time

import scipy.sparse as sp
import pylab as pl

from scikits.learn.datasets import load_mlcomp
from scikits.learn.feature_extraction.text.sparse import Vectorizer
from scikits.learn.sgd.sparse import SGD
from scikits.learn.metrics import confusion_matrix
from scikits.learn.metrics import classification_report

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "Please follow these instructions to get started:"
    print __doc__
    sys.exit(0)

# Load two categories from the training set (binary classification)
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train',
                         categories=['alt.atheism', 'comp.graphics'])
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Training a linear SVM (hinge loss and L2 regularizer) using SGD.\n"\
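# A hedged sketch of the evaluation this binary variant leads into, using the
# imported confusion_matrix; `y_test` and `y_pred` are hypothetical names not
# defined above:
#
#     cm = confusion_matrix(y_test, y_pred)
#     print cm
#     pl.matshow(cm)
#     pl.show()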
import os
import sys
from time import time

import numpy as np

from scikits.learn.datasets import load_mlcomp
from scikits.learn.metrics import confusion_matrix
# from scikits.learn.svm.sparse import LinearSVC
from scikits.learn.sgd.sparse import SGD

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "Please follow these instructions to get started:"
    print __doc__
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
t0 = time()
news_train = load_mlcomp('20news-18828', 'train', sparse=True)
print "done in %fs" % (time() - t0)

print "Creating binary classification task\n"\
      "alt.atheism vs. comp.graphics"
target = news_train.target
pos = 0  # alt.atheism
neg = 1  # comp.graphics

# Select the rows of both classes and shuffle them deterministically
pos_idx = np.where(target == pos)[0]
neg_idx = np.where(target == neg)[0]
idx = np.concatenate((pos_idx, neg_idx))
np.random.seed(13)
np.random.shuffle(idx)
data = news_train.data[idx]
target = news_train.target[idx]
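# A hedged sketch of a train/test split over the shuffled binary task; the
# two-thirds cut-off and the SGD hyperparameters are illustrative
# assumptions, not values from the original script.
n_train = int(0.66 * idx.shape[0])
clf = SGD(alpha=0.0001, n_iter=50, penalty='l2')
clf.fit(data[:n_train], target[:n_train])
print confusion_matrix(target[n_train:], clf.predict(data[n_train:]))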
import os
import time
import random

import pylab as pl

from scikits.learn.datasets import load_mlcomp
from codemaker.embedding import SDAEmbedder
from codemaker.evaluation import Neighbors, local_match, pairwise_distances

pl.clf()

data_file = "20news_data.npy"
target_file = "20news_target.npy"

if not os.path.exists(data_file):
    print "Loading 20 newsgroups training set... "
    t0 = time.time()
    news_train = load_mlcomp('20news-18828', 'train')
    X, y = news_train.data, news_train.target
    n_samples, n_features = X.shape
    print "done in %fs" % (time.time() - t0)
    print "n_samples: %d, n_features: %d" % (n_samples, n_features)

    # reshuffle:
    print "Reshuffling the data"
    random.seed(0)
    permutation = range(n_samples)
    random.shuffle(permutation)
    X = X[permutation]
    y = y[permutation]

    # sample part of X to be used for plotting
    plot_size = 5000
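    # A hedged sketch of the caching step this block sets up: persist the
    # shuffled arrays to the two .npy files declared above so later runs can
    # skip the expensive load. numpy is imported locally because the excerpt
    # above does not import it.
    import numpy as np
    np.save(data_file, X)
    np.save(target_file, y)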