Example #1
def loadMatrix(matrixFileName):
    """
    Load the sparse matrix in the libsvm format from the given file.
    Returns the csr matrix and an index to row ids.
    i.e., rowids is a list of the row ids that precede each line.
    """
    return load_svmlight_file(matrixFileName)
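A brief usage sketch (not part of the original example; the file name is a placeholder) showing how the pair returned by loadMatrix is typically unpacked:

# Hypothetical call; 'cooccurrences.svm' stands in for a real libsvm file.
mat, rowids = loadMatrix('cooccurrences.svm')
print(mat.shape)      # dimensions of the CSR matrix
print(rowids[:5])     # leading values of the first five lines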
def get_sampled(datafile):
    n_sample = 100
    t_label = '3'
    output_num = 0

    sampled_file = os.path.join(tmp_dir, 'tmp.sample')
    fout = open(sampled_file, 'w')
    for idx, line in enumerate(open(datafile)):
        if n_sample == 0:
            break

        tokens = line.split(' ')
        label = tokens[0]

        if t_label == label:
            print output_num, idx
            fout.write(line)
            n_sample -= 1
            output_num += 1

    fout.close()

    X, y = load_svmlight_file(sampled_file)

    return X
def test_load_svmlight_file():
    X, y = load_svmlight_file(datafile)

    # test X's shape
    assert_equal(X.indptr.shape[0], 4)
    assert_equal(X.shape[0], 3)
    assert_equal(X.shape[1], 20)
    assert_equal(y.shape[0], 3)

    # test X's non-zero values
    for i, j, val in ((0, 1, 2.5), (0, 9, -5.2), (0, 14, 1.5),
                     (1, 4, 1.0), (1, 11, -3),
                     (2, 19, 27)):

        assert_equal(X[i, j], val)

    # tests X's zero values
    assert_equal(X[0, 2], 0)
    assert_equal(X[0, 4], 0)
    assert_equal(X[1, 7], 0)
    assert_equal(X[1, 15], 0)
    assert_equal(X[2, 17], 0)

    # test can change X's values
    X[0, 1] *= 2
    assert_equal(X[0, 1], 5)

    # test y
    assert_array_equal(y, [1, 2, 3])
Example #6
def mergeSparse(f1, f2, f3):
    X1, y1 = io.load_svmlight_file(f1)
    X2, y2 = io.load_svmlight_file(f2)

    if (y1.shape == y2.shape):
        X = sp.hstack([X1, X2])
    else:
        sys.stderr.write('Error: Different number of examples in files: ' +
                         str(y1.shape) + ' != ' + str(y2.shape) + '\n')
        return

    if (y1 == y2).sum() != y1.shape[0]:
        sys.stderr.write(
            'Warning: Label mismatch. Are you merging features of the same subset?\nI will use the labels of the first argument\n'
        )

    y = y1
    io.dump_svmlight_file(X, y, f3)
    return
def test_dump():
    try:
        Xs, y = load_svmlight_file(datafile)
        tmpfile = "tmp_dump.txt"
        dump_svmlight_file(Xs, y, tmpfile, zero_based=False)
        X2, y2 = sk_load_svmlight_file(tmpfile)
        assert_array_equal(Xs.toarray(), X2.toarray())
        assert_array_equal(y, y2)
    finally:
        os.remove(tmpfile)
Example #9
def changeSparseLabels(f1, f2, labelFunction=AvsI): 
	Xold, yold = io.load_svmlight_file(f1)

	vectorizedFunction = np.vectorize(labelFunction, otypes=[np.int32])
	ynew = vectorizedFunction(yold)

	X = Xold[ynew != 0, :]
	y = ynew[ynew != 0]
	io.dump_svmlight_file(X, y, f2)
	return
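AvsI is not defined in this snippet. A hypothetical label function with the contract changeSparseLabels expects (non-zero for rows to keep, 0 for rows to drop) might look like this:

# Hypothetical stand-in for AvsI: keep classes 1 and 2 as +1/-1, drop the rest.
def one_vs_two(label):
    if label == 1:
        return 1
    if label == 2:
        return -1
    return 0  # rows mapped to 0 are removed by changeSparseLabels

# changeSparseLabels('all_classes.svm', 'binary.svm', one_vs_two)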
Example #10
	def load_data(self, rel_path):
		'''
		Loads data from a SVMLight file using the svmlight_loader
		library: https://github.com/mblondel/svmlight-loader
		Returns a list of the dataset and the labels
		'''
		abs_path = os.path.abspath(rel_path)

		(x_train, labels) = svml.load_svmlight_file(abs_path)
		return [x_train, labels]
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()

    for X in (Xs, Xd):
        f = StringIO()
        dump_svmlight_file(X, y, f, zero_based=False)
        f.seek(0)
        X2, y2 = sk_load_svmlight_file(f)
        assert_array_equal(Xd, X2.toarray())
        assert_array_equal(y, y2)
def test_load_svmlight_file_n_features():
    X, y = load_svmlight_file(datafile, n_features=14)

    # test X's shape
    assert_equal(X.indptr.shape[0], 4)
    assert_equal(X.shape[0], 3)
    assert_equal(X.shape[1], 14)

    # test X's non-zero values
    for i, j, val in ((0, 1, 2.5), (0, 9, -5.2), (1, 4, 1.0), (1, 11, -3)):

        assert_equal(X[i, j], val)
Example #14
def compute_LMI(matrixFileName):
    """
    First reads the co-occurrence matrix from matrixFileName,
    then computes the PPMI values for that matrix.
    """
    mat, rowids = load_svmlight_file(matrixFileName)
    (nrows, ncols) = mat.shape
    colTotals = np.zeros(ncols, dtype=DTYPE)
    for j in range(0, ncols):
        colTotals[j] = np.sum(mat[:, j].data)
    N = np.sum(colTotals)
    for i in range(0, nrows):
        row = mat[i, :]
        rowTotal = np.sum(row.data)
        for j in row.indices:
            mat[i,
                j] = max(0, np.log(
                    (mat[i, j] * N) / (rowTotal * colTotals[j])))
    return mat
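A minimal sketch (not from the original project) of the same PPMI formula applied to a small dense toy matrix, which may make the loop in compute_LMI easier to follow:

import numpy as np

# Toy co-occurrence counts: 2 rows, 3 columns.
counts = np.array([[2.0, 1.0, 0.0],
                   [0.0, 3.0, 1.0]])
N = counts.sum()
row_totals = counts.sum(axis=1, keepdims=True)
col_totals = counts.sum(axis=0, keepdims=True)

with np.errstate(divide='ignore'):
    pmi = np.log((counts * N) / (row_totals * col_totals))
ppmi = np.maximum(0.0, pmi)  # clamp negative (and -inf) associations to 0

print(ppmi)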
Example #15
def process():
    """ 
    Demonstrates the LogisticRegression class 
    """
    dataset = "rcv1"
    rate = 1000
    epochs = 500
    train_X, train_y = load_svmlight_file("../data/%s/%s.train" %
                                          (dataset, dataset))
    N, D = train_X.shape
    train_y = 0.5 * (train_y + numpy.ones(N, dtype=int))
    shared_X = numpy.asarray(train_X.toarray(), dtype=theano.config.floatX)
    shared_y = numpy.asarray(train_y, dtype=theano.config.floatX)

    x = T.matrix('x')
    y = T.vector('y')
    LR = LogisticRegression(x, N, D)
    cost = LR.negative_log_likelihood(y)
    g_w = T.grad(cost=cost, wrt=LR.w)
    g_b = T.grad(cost=cost, wrt=LR.b)
    updates = [(LR.w, LR.w - rate * g_w), (LR.b, LR.b - rate * g_b)]
    train_model = theano.function(inputs=[x, y],
                                  outputs=cost,
                                  updates=updates,
                                  allow_input_downcast=True)
    err, wval, bval = LR.errors(y)
    test_model = theano.function(inputs=[x, y],
                                 outputs=[err, wval, bval],
                                 allow_input_downcast=True)
    print "Train instances =", N
    print "Dimensionality  =", D
    for t in range(epochs):
        likelihood = train_model(shared_X, shared_y)
        err_val, w_val, b_val = test_model(shared_X, shared_y)
        norm = numpy.dot(w_val, w_val)
        print "Epoch %d: Likelihood = %f Errors = %f b = %s norm = %s" % (
            t, likelihood, err_val, str(b_val), str(norm))

def test_load_invalid_file():
    load_svmlight_file(invalidfile)
Example #17
def main():
    #X, y = load_svmlight_file('./heart_scale')
    #X, y = load_svmlight_file('./large.dat')
    X, y = load_svmlight_file('./kddb')
    y[y==0] = -1
    gradient(X, y)
def test_not_a_filename():
    load_svmlight_file(1)
def test_invalid_filename():
    load_svmlight_file("trou pic nic douille")
Example #23
#!/usr/bin/env python2
import numpy as np
import pylab as pl
import svmlight_loader as io
import sys

from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, chi2

###############################################################################
# import some data to play with

X, y = io.load_svmlight_file('mergedCPK-TP/output_bound10_ps4_f427_cyclic.AvsI')
# X, y = io.load_svmlight_file(sys.argv[1])


# ###############################################################################
pl.figure(1)
pl.clf()

X_indices = np.arange(X.shape[-1])

###############################################################################
# Univariate feature selection with the chi-squared test for feature scoring.
# We keep the 10% highest-scoring features.
selector = SelectPercentile(chi2, percentile=10)
selector.fit(X, y)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
pl.bar(X_indices - .45, scores, width=.2,
       label=r'Univariate score ($-Log(p_{value})$)', 
Example #24
    def load_from_text(filepaths,
                       dtype=np.float32,
                       max_score=None,
                       min_feature=None,
                       max_feature=None,
                       has_sorted_relevances=False,
                       purge=False):
        '''
        Load queries in the svmlight format from the specified file(s).

        SVMlight format example (one line):

            5[\s]qid:8[\s]103:1.0[\s]...[\s]981:1.0 982:1.0 # comment[\n]

        Parameters:
        -----------
        filepaths: string or list of strings
            The location of the dataset file(s).

        dtype: data-type, optional (default is np.float32)
            The desired data-type for the document feature vectors. Here,
            the default value (np.float32) is chosen for mere optimization
            purposes.

        max_score: int, optional (default is None)
            The maximum relevance score value. If None, the value is derived
            from the relevance scores in the file.

        min_feature: int or None, optional (default is None)
            The minimum feature identifier, which is present in the dataset. If
            None, this value is read from the data. This parameter is important
            because of internal feature remapping: in case of loading different
            parts of a dataset (folds), some features may be present in one
            part and may not be present in another (because all its values are
            0) - this would create inconsistent feature mappings between
            the parts.

        max_feature: int or None, optional (default is None)
            The maximum feature identifier, which is present in the dataset. If
            None, this value is read from the data. This parameter is important
            because of internal feature remapping, see `min_feature` for more.

        has_sorted_relevances: bool, optional (default is False)
            If True, it indicates that the relevance scores of the queries
            in the file are sorted in decreasing order.

        purge: bool, optional (default is False)
            If True, all queries which have documents with the same relevance
            labels are removed. If False, no query is removed.
        '''
        # Arrays used to build CSR matrix of query-document vectors.
        data, indices, indptr = [], [], [0]

        # Relevance score, query ID, query hash, and document hash.
        relevances = []

        query_ids = []
        query_indptr = [0]
        prev_qid = None

        # If only single filepath is given, not a list.
        if isinstance(filepaths, str):
            filepaths = [filepaths]

        n_purged_queries = 0
        n_purged_documents = 0

        def purge_query(qid, data, indices, indptr):
            '''Remove the last query added to the set according to `purge`.'''
            raise NotImplementedError  # purging is disabled here; the code below is never reached
            if not purge or qid is None:
                return 0

            r = relevances[query_indptr[-2]]

            i = query_indptr[-2]
            while i < query_indptr[-1] and relevances[i] == r:
                i += 1

            if i == query_indptr[-1]:
                n = query_indptr.pop()

                del query_ids[-1]

                del indices[indptr[query_indptr[-1]]:]
                del data[indptr[query_indptr[-1]]:]

                del relevances[query_indptr[-1]:]
                del indptr[query_indptr[-1] + 1:]

                return n - query_indptr[-1]
            else:
                return 0

        for filepath in filepaths:
            lineno = 0  # Used just to report invalid lines (if any).

            logger.info('Reading queries from %s.' % filepath)

            #--------------------------------------------------------------------------------
            # Call svmlight_loader here.
            #--------------------------------------------------------------------------------
            (feature_vectors, relevances,
             qids) = svmlight_loader.load_svmlight_file(filepath,
                                                        query_ids=True)

            # Set up query_indptr and query_ids from qids
            last_query_id = None

            # The loop below mimics the original way of building query_indptr:
            # starting from [0], it appends a new pointer whenever a new query
            # id appears and otherwise increments the last pointer.
            for i in range(0, len(qids)):
                if qids[i] != last_query_id:
                    query_indptr.append(query_indptr[-1] + 1)
                    query_ids.append(qids[i])
                    last_query_id = qids[i]
                else:
                    query_indptr[-1] += 1

            logger.info(
                'Read %d queries and %d documents out of which '
                '%d queries and %d documents were discarded.' %
                (len(query_indptr) + n_purged_queries - 1, query_indptr[-1] +
                 n_purged_documents, n_purged_queries, n_purged_documents))

        # Empty dataset.
        if len(query_indptr) == 1:
            raise ValueError('the input seems to be empty')

        # Set the minimum feature ID, if not given. Note that `indices` is no
        # longer populated after the switch to svmlight_loader, so in practice
        # min_feature has to be supplied by the caller.
        if min_feature is None:
            min_feature = min(indices)

        if max_feature is None:
            raise NotImplementedError  # deriving max_feature from the data is not supported here; the remapping below is never reached
            # Remap the features for a proper conversion into dense matrix.
            feature_indices = np.unique(np.r_[min_feature, indices])
            indices = np.searchsorted(feature_indices, indices)
        else:
            #assert min(indices) >= min_feature, ('there is a feature with id '
            #        'smaller than min_feature: %d < %d' % (min(indices),
            #                                               min_feature))

            assert feature_vectors.shape[1] <= max_feature, (
                'there is a feature with id '
                'greater than max_feature: %d > %d' %
                (feature_vectors.shape[1], max_feature))

            feature_indices = np.arange(min_feature,
                                        max_feature,
                                        dtype='int32')

        # Free the copies of the feature_vectors in non-Numpy arrays (if any),
        # this is important in order not to waste memory for the transfer of
        # the feature vectors to dense format (default option).
        del data, indices, indptr

        feature_vectors = feature_vectors.toarray()
        #--------------------------------------------------------------------------------
        # KDR: things that just get passed through:
        # max_score
        # has_sorted_relevances
        #--------------------------------------------------------------------------------
        # KDR: things we need to construct:
        # feature_vectors: CSR matrix
        # relevances: list of what I would call "labels", one per line
        # query_indptr: row pointers to the start of each query
        # query_ids: just a list of query ids in the order they appear (relies on the input being sorted)
        # feature_indices: sequential list of features (current logic is fine)

        # Create and return a Queries object.
        return Queries(feature_vectors,
                       relevances,
                       query_indptr,
                       max_score=max_score,
                       has_sorted_relevances=has_sorted_relevances,
                       query_ids=query_ids,
                       feature_indices=feature_indices)
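A small, self-contained reproduction (illustrative values only) of the qid-grouping loop above, showing how query_indptr and query_ids come out for a toy sequence of query ids:

# Toy reproduction of the qid-grouping logic in load_from_text.
qids = [1, 1, 2, 2, 2]

query_indptr = [0]
query_ids = []
last_query_id = None
for qid in qids:
    if qid != last_query_id:
        query_indptr.append(query_indptr[-1] + 1)
        query_ids.append(qid)
        last_query_id = qid
    else:
        query_indptr[-1] += 1

print(query_indptr)  # [0, 2, 5]: documents 0-1 belong to query 1, 2-4 to query 2
print(query_ids)     # [1, 2]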
Example #25
    if (len(sys.argv) > 4) or (len(sys.argv) < 3):
        sys.stderr.write(
            'libsvmFeatureFilter.py: Wrong number of arguments. Must be 2 or 3.\n'
        )
        sys.exit(1)
    else:
        if (len(sys.argv) == 3):
            svmFilename = sys.argv[1]
            filterFile = sys.stdin
            outFilename = sys.argv[2]
        else:  # len(sys.argv) == 4
            svmFilename = sys.argv[1]
            filterFile = open(sys.argv[2], 'r')
            outFilename = sys.argv[3]

    X, y = io.load_svmlight_file(svmFilename)
    featureSubset = np.loadtxt(filterFile)
    filterFile.close()

    sanitycheck = np.unique(featureSubset)
    print sanitycheck

    X_col = sp.csc_matrix(X)
    if X.shape[1] != featureSubset.shape[0]:
        sys.stderr.write(
            'libsvmFeatureFilter.py: Dimension mismatch of filter and features. Assuming that the feature list is a prefix of the filter list.\n'
        )
        featureSubset.resize(X.shape[1])

    # filter columns
    X_sub = X[:, featureSubset == 1]
Example #26
#! /usr/bin/env python
#from sklearn.datasets import load_svmlight_file
from svmlight_loader import load_svmlight_file
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score 
import pickle

print 'Loading feature file...'
X, y = load_svmlight_file('FeatureVectorBlueDefault.txt')

print 'Training classifier...'
clf = RandomForestClassifier(n_estimators=3, criterion='entropy', max_depth=20)
clf.fit(X, y)

print 'Saving model...'
model_file = open('forest.model', 'wb')  # binary mode for pickle; avoids shadowing the built-in 'file'
pickle.dump(clf, model_file)
model_file.close()
#scores = cross_val_score(clf, X, y)
#print scores.mean()
Example #27
    def train(self, path):
        train_sparse, self.training_labels = svml.load_svmlight_file(path)
        train_dense = train_sparse.todense()
        self.training_data = np.asarray(train_dense)
Example #28
import os
import math
import operator
import numpy as np
import matplotlib.pyplot as mplt
import svmlight_loader as svm  # alias inferred from the calls below; the original import is not shown in this snippet

# Number of nearest neighbors to use
k = 30
assert k <= 300
labels = ['electronic', 'metal', 'rap', 'classical']

# Load SVMLight files as numpy arrays
currdir = os.path.dirname(os.path.abspath(__file__))
trainfile = os.path.join(currdir, "data", "songsv1.train")
testfile = os.path.join(currdir, "data", "songsv1.test.txt")
x_train, y_train = svm.load_svmlight_file(trainfile)
x_test, y_test = svm.load_svmlight_file(testfile)

# Convert sparse matrix to dense matrix
x_train = x_train.todense()
x_test = x_test.todense()

x_train = np.array(x_train)
x_test = np.array(x_test)

# Statistics
num_correct = 0
num_correct_electronic = 0
num_correct_metal = 0
num_correct_rap = 0
num_correct_classical = 0
#!/usr/bin/python

from scipy.sparse import hstack
import svmlight_loader as io
import sys

if len(sys.argv) < 4:
    print '''Merge two files in libsvm / svmlight format
into a single file.

Parameters: inFile1 inFile2 outFile

implemented June 2014 by Pascal Welke'''

fileOne = sys.argv[1]
fileTwo = sys.argv[2]
fileThree = sys.argv[3]

xOne, yOne = io.load_svmlight_file(fileOne)
xTwo, yTwo = io.load_svmlight_file(fileTwo)

X = hstack((xOne, xTwo))

io.dump_svmlight_file(X, yOne, fileThree, False)
Example #30
    def load_test_file(self, path):
        test_sparse, self.test_labels = svml.load_svmlight_file(path)
        test_dense = test_sparse.todense()
        self.test_data = np.asarray(test_dense)
Example #31
#!/usr/bin/python
'''Usage: name INFILE OUTFILE PERCENTILE
Where 
	INFILE is a file in libSVM format
	OUTFILE will be a file in libSVM format containing the chosen percentile of top features
	PERCENTILE is an integer specifying the amount of features you want to keep.'''

import numpy as np
import pylab as pl
import svmlight_loader as io
import sys

from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, chi2

#################################################################
X, y = io.load_svmlight_file(sys.argv[1])

#################################################################
# Univariate feature selection with the chi-squared test for feature scoring.
# The percentile of features to keep is given on the command line.
selector = SelectPercentile(chi2, percentile=int(sys.argv[3]))
selector.fit(X, y)

#################################################################
# store output in file
Xsmall = selector.transform(X)
io.dump_svmlight_file(Xsmall, y, sys.argv[2], False)
Example #32
    def get_data(in_filename, n_features, **kwargs):
        data = load_svmlight_file(in_filename,
                                  n_features=n_features,
                                  dtype=np.float32)
        return data[0], data[1]