Example #1
import os
import sys
from time import time

import scipy.sparse as sp

from scikits.learn.datasets import load_mlcomp
from scikits.learn.feature_extraction.text import Vectorizer
from scikits.learn.metrics import classification_report

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "Please follow those instructions to get started:"
    print __doc__
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Training a linear SVM (hinge loss and L2 regularizer)..."
parameters = {
    'loss': 'hinge',    # hinge loss, as announced above
    'penalty': 'l2',    # L2 regularizer
}
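
The excerpt breaks off at the parameter definition. A minimal sketch of how training and evaluation might continue, assuming the SGDClassifier used by the other examples and scoring on the training split only (the excerpt never loads the test set):

from scikits.learn.linear_model.sparse import SGDClassifier

# Sketch, not part of the original example: fit an SGD-trained linear SVM
# with the parameters above and report per-class precision/recall/F1.
clf = SGDClassifier(**parameters).fit(X_train, y_train)
print classification_report(y_train, clf.predict(X_train))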
Example #2
import os
import sys
from time import time

import scipy.sparse as sp

from scikits.learn.datasets import load_mlcomp
from scikits.learn.feature_extraction.text import Vectorizer
from scikits.learn.linear_model.sparse import SGDClassifier
from scikits.learn.metrics import confusion_matrix
from scikits.learn.metrics import classification_report
from scikits.learn.naive_bayes import MultinomialNB


if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
Example #3
import os
import sys
from time import time

import scipy.sparse as sp

from scikits.learn.datasets import load_mlcomp
from scikits.learn.feature_extraction.text import Vectorizer
from scikits.learn.linear_model.sparse import SGDClassifier
from scikits.learn.metrics import confusion_matrix
from scikits.learn.metrics import classification_report
from scikits.learn.naive_bayes import MultinomialNB


if "MLCOMP_DATASETS_HOME" not in os.environ:
    print "Please follow those instructions to get started:"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp("20news-18828", "train")
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
news_test = load_mlcomp("20news-18828", "test")
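
The excerpt stops right after the test split is loaded. A minimal sketch of a plausible continuation, assuming MultinomialNB accepts the sparse matrices produced by the vectorizer:

# Sketch, not part of the original example: transform the test documents
# with the vocabulary learned on the training set, fit multinomial naive
# Bayes, and show the confusion matrix on the held-out documents.
X_test = vectorizer.transform((open(f).read()
                               for f in news_test.filenames))
y_test = news_test.target

clf = MultinomialNB().fit(X_train, y_train)
print confusion_matrix(y_test, clf.predict(X_test))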
Example #4
import os
import sys
from time import time

import scipy.sparse as sp
import pylab as pl

from scikits.learn.datasets import load_mlcomp
from scikits.learn.svm.sparse import SVC
from scikits.learn.metrics import confusion_matrix

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "Please follow those instructions to get started:"
    print __doc__
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
t0 = time()
news_train = load_mlcomp('20news-18828', 'train', sparse=True)
print "done in %fs" % (time() - t0)

print "news_train.data is sparse: ",
print sp.issparse(news_train.data)

# The documents have been hashed into TF-IDF (Term Frequencies times Inverse
# Document Frequencies) vectors of a fixed dimension.
print "n_samples: %d, n_features: %d" % news_train.data.shape

print "Training a linear SVM (hinge loss and L2 regularizer)..."
parameters = {
    'kernel': 'linear',
    'C': 10,
}
print "parameters:", parameters
Example #5
import os
import sys
from time import time

import scipy.sparse as sp
import pylab as pl

from scikits.learn.datasets import load_mlcomp
from scikits.learn.feature_extraction.text.sparse import Vectorizer
from scikits.learn.sgd.sparse import SGD
from scikits.learn.metrics import confusion_matrix
from scikits.learn.metrics import classification_report

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "Please follow those instructions to get started:"
    print __doc__
    sys.exit(0)

# Load two categories from the training set (binary classification)
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train',
                         categories=['alt.atheism', 'comp.graphics'])

print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Training a linear SVM (hinge loss and L2 regularizer) using SGD.\n"\
Example #6
import os
import sys
from time import time

import numpy as np

from scikits.learn.datasets import load_mlcomp
from scikits.learn.metrics import confusion_matrix

# from scikits.learn.svm.sparse import LinearSVC
from scikits.learn.sgd.sparse import SGD


if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "Please follow those instructions to get started:"
    print __doc__
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
t0 = time()
news_train = load_mlcomp('20news-18828', 'train', sparse=True)
print "done in %fs" % (time() - t0)

print "Creating binary classification task\n"\
      "alt.atheism vs. comp.graphics"
target = news_train.target
pos = 0 # alt.atheism
neg = 1 # comp.graphics
pos_idx = np.where(target == pos)[0]
neg_idx = np.where(target == neg)[0]
idx = np.concatenate((pos_idx, neg_idx))
np.random.seed(13)
np.random.shuffle(idx)
data = news_train.data[idx]
target = news_train.target[idx]
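
The excerpt ends once the binary task is assembled. A rough sketch of a train/test split and fit, assuming the SGD estimator follows the usual fit/predict convention and that the sparse matrix supports row slicing:

# Sketch, not part of the original example: hold out the last quarter of
# the shuffled documents and evaluate with a confusion matrix.
n_train = int(0.75 * data.shape[0])
clf = SGD().fit(data[:n_train], target[:n_train])
print confusion_matrix(target[n_train:], clf.predict(data[n_train:]))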
Example #7
import os
import time
import random

import pylab as pl

from scikits.learn.datasets import load_mlcomp
from codemaker.embedding import SDAEmbedder
from codemaker.evaluation import Neighbors, local_match, pairwise_distances

pl.clf()
data_file = "20news_data.npy"
target_file = "20news_target.npy"

if not os.path.exists(data_file):
    print "Loading 20 newsgroups training set... "
    t0 = time.time()
    news_train = load_mlcomp('20news-18828', 'train')
    X, y = news_train.data, news_train.target
    n_samples, n_features = X.shape
    print "done in %fs" % (time.time() - t0)
    print "n_samples: %d, n_features: %d" % (n_samples, n_features)

    # reshuffle:
    print "Reshuffling the data"
    random.seed(0)
    permutation = range(n_samples)
    random.shuffle(permutation)
    X = X[permutation]
    y = y[permutation]

    # sample part of X to be used for plotting
    plot_size = 5000
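    # Sketch, not part of the original example: the excerpt stops at the
    # sampling step. A plausible continuation slices off the first
    # plot_size points for plotting and caches the shuffled arrays under
    # the .npy filenames declared above (an assumption based on the names).
    X_plot, y_plot = X[:plot_size], y[:plot_size]

    import numpy as np
    np.save(data_file, X)
    np.save(target_file, y)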