Example #1
def predict_PLSR(x_filename, y_filename, model_filename, showError):
    """
    Read the PLSR model from the model_fname and read the X matrix from 
    x_filename. Write the predicted output to the y_filename.
    """
    sys.stderr.write("Predicting PLSR...")
    startTime = time.time()
    X = loadMatrix(x_filename)[0].todense()
    model = open(model_filename)
    pls2 = pickle.load(model)
    model.close()
    Y = pls2.predict(X)
    n = X.shape[0]
    # write the predicted matrix with dummy row labels 1..n, as the docstring states
    dump_svmlight_file(Y, np.arange(1, n + 1), y_filename, zero_based=True)
    endTime = time.time()
    sys.stderr.write(" took %ss\n" % str(round(endTime - startTime, 2)))

    if showError:
        Xnorm = np.linalg.norm(X, ord='fro')
        Error = np.linalg.norm((X - Y), ord='fro')
        rate = (100 * Error) / Xnorm
        print "Approximation Error Percentage = %f%%" % rate
        print "Frobenius norm of the original matrix =", Xnorm
        print "Frobenius norm of the error matrix =", Error
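(For context: a minimal sketch of how the pickled model and the X matrix that predict_PLSR expects could be produced. It assumes scikit-learn's PLSRegression; the file names and toy data are illustrative, not the original training code.)

import pickle

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.datasets import dump_svmlight_file

# Toy data standing in for the real X and Y matrices.
X = np.random.randn(200, 50)
Y = X + 0.01 * np.random.randn(200, 50)

# Fit a PLSR model and pickle it so that predict_PLSR can load it later.
pls2 = PLSRegression(n_components=10)
pls2.fit(X, Y)
with open("../work/plsr.model", "wb") as model_file:
    pickle.dump(pls2, model_file)

# Write X in svmlight format with dummy row labels 1..n, mirroring the input side.
dump_svmlight_file(X, np.arange(1, X.shape[0] + 1), "../work/X", zero_based=True)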
Example #3
def generate_random_matrix_PLSR():
    """
    Generate random X and Y matrices to test PLSR.
    """
    n = 1000
    m = 500
    X = np.random.randn(n, m)
    dump_svmlight_file(X, np.arange(1, n + 1), "../work/Y")
def test_dump():
    try:
        Xs, y = load_svmlight_file(datafile)
        tmpfile = "tmp_dump.txt"
        dump_svmlight_file(Xs, y, tmpfile, zero_based=False)
        X2, y2 = sk_load_svmlight_file(tmpfile)
        assert_array_equal(Xs.toarray(), X2.toarray())
        assert_array_equal(y, y2)
    finally:
        os.remove(tmpfile)
Example #6
def changeSparseLabels(f1, f2, labelFunction=AvsI): 
	Xold, yold = io.load_svmlight_file(f1)

	vectorizedFunction = np.vectorize(labelFunction, otypes=[np.int32])
	ynew = vectorizedFunction(yold)

	X = Xold[ynew != 0, :]
	y = ynew[ynew != 0]
	io.dump_svmlight_file(X, y, f2)
	return
Example #8
def make_svm_file(labelfile, datafile, resultfile):
	y = np.loadtxt(labelfile)
	X = np.loadtxt(datafile)

	if len(y) != len(X):
		sys.stderr.write('dense2svm.py: Data and Labels have different dimensions. I give up.\n')
		return		

	io.dump_svmlight_file(X, y, resultfile)
	return
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()

    for X in (Xs, Xd):
        f = StringIO()
        dump_svmlight_file(X, y, f, zero_based=False)
        f.seek(0)
        X2, y2 = sk_load_svmlight_file(f)
        assert_array_equal(Xd, X2.toarray())
        assert_array_equal(y, y2)
Example #10
def caffe_batch_extract_predictionmap(network_proto, dense_network_proto, network_weights, mean_protofile, imagelist, outfile, src_layers, batch_size=100, dst_layers=['fc6-conv', 'fc7-conv', 'fc8-conv']):
	caffe.set_mode_cpu()	
	
	# load learned weights
	print 'Loading network weights...'
	net = DenseNet(network_proto, dense_network_proto, network_weights, mean_protofile, src_layers=src_layers, dst_layers=dst_layers)

	# verify the image list again to make sure it only contains valid image files
	print 'Loading images and their labels...'
	# resolve batch_size before computing the first batch boundaries
	if batch_size == -1:
		batch_size = len(imagelist)

	start_ix = 0
	stop_ix = start_ix + batch_size

	X = None
	first_time = True
	while True:
		images_data = []
		for img in imagelist[start_ix:stop_ix]:
			if os.path.isfile(img):
				images_data.append(caffe.io.load_image(img))
			else:
				continue
		print '... a batch of ', len(images_data), 'images were loaded'
#		stop_ix = len(images_data)
		tic = time.time()
		# start extraction
		print 'extracting features...'
		x = net.predict_densemap(images_data)
		toc = time.time()
		print '...elapsed time ', (toc-tic)/batch_size, 'secs per image'
	
		if first_time:
			X = x
			first_time = False
		else:
			X = np.r_[X, x]	
		# batch incremental
		start_ix = stop_ix
		stop_ix += batch_size
		if start_ix >= len(imagelist):
			break

	print 'Writing feature to file...'
	# one dummy label per extracted feature row (some images may have been skipped)
	dump_svmlight_file(X, np.zeros(X.shape[0], dtype=np.int32), outfile)
	print 'DONE.'
Example #11
def mergeSparse(f1, f2, f3):
    X1, y1 = io.load_svmlight_file(f1)
    X2, y2 = io.load_svmlight_file(f2)

    if (y1.shape == y2.shape):
        X = sp.hstack([X1, X2])
    else:
        sys.stderr.write('Error: Different number of examples in files: ' +
                         str(y1.shape) + ' != ' + str(y2.shape) + '\n')
        return

    if (y1 == y2).sum() != y1.shape[0]:
        sys.stderr.write(
            'Warning: Label mismatch. Are you merging features of the same subset?\nI will use the labels of the first argument\n'
        )

    y = y1
    io.dump_svmlight_file(X, y, f3)
    return
Example #12
def make_svm_file(labelfile, datafile, resultfile):
	y = np.loadtxt(labelfile)
	X = np.loadtxt(datafile)

	if len(y) != len(X):
		sys.stderr.write('minhash2svm.py: Data and Labels have different dimensions. I give up.\n')
		return		

	# Check if data is nonnegative.
	# Also check if data contains zeros. If so, shift values up and generate a warning
	unique_values = np.unique(X)
	zero_count = (unique_values == 0).sum()
	nonnegative_count = (unique_values >= 0).sum()
	if nonnegative_count == unique_values.shape[0]:
		if zero_count > 0:
			sys.stderr.write('minhash2svm.py: Data file contains zero positions. Shifting values up by one.\n')
			X = X + 1
	else:
		sys.stderr.write('minhash2svm.py: Data file contains nonpositive values. I give up.\n')
		return

	io.dump_svmlight_file(X, y, resultfile)
	return
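(A side note on why the zero shift above matters: the svmlight/libsvm format stores only nonzero entries, so a legitimate MinHash value of 0 would simply never appear in the dump. A small sketch, using scikit-learn's dump_svmlight_file and a throwaway file name, makes this visible.)

import numpy as np
from sklearn.datasets import dump_svmlight_file

X = np.array([[0.0, 3.0],
              [2.0, 5.0]])
y = np.array([1, -1])
dump_svmlight_file(X, y, "zero_demo.svm", zero_based=False)

# The first row lists only feature 2: the feature whose value was 0 is simply absent,
# which is why minhash2svm.py shifts all values up by one before dumping.
print(open("zero_demo.svm").read())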
Example #13
from numpy import *
from scipy.sparse import csr_matrix

from svmlight_loader import dump_svmlight_file

positives = loadtxt('positives_imputed.csv',delimiter=',')
negatives = loadtxt('negatives_imputed.csv', delimiter=',')

positives = positives[:,1:]
negatives = negatives[:,1:]

positives_samples = positives.shape[0]
negatives_samples = negatives.shape[0]

y = ones(positives_samples+negatives_samples, dtype='int64')
y[0:negatives_samples] = -1

X = vstack((negatives,positives))
X = csr_matrix(X)

dump_svmlight_file(X, y, "Jul2_training.svmlight", zero_based=False)
def select_model(X, y):
    # make holdout-holdin split
    (
        X_in,
        X_out,
        y_in,
        y_out,
        indices_in,
        indices_out,
        removed_features,
    ) = dh.split_holdout(X, y)

    logging.info("Writing holdin-holdout split data and info to file.")

    dump_svmlight_file(X_in,
                       y_in,
                       os.path.join(s.OPT_DIRP, "holdin.svm"),
                       zero_based=True)
    dump_svmlight_file(X_out,
                       y_out,
                       os.path.join(s.OPT_DIRP, "holdout.svm"),
                       zero_based=True)
    with open(os.path.join(s.OPT_DIRP, "holdinout_split_indices.json"),
              "wt") as f:
        json.dump(
            {
                "holdin": indices_in.tolist(),
                "holdout": indices_out.tolist(),
                "num_features": X_in.shape[1],
            },
            f,
        )

    steps, param_grids = pipeline.make_pipelines(s.PIPE_STEPS,
                                                 alt_order=s.ALT_ORDER)
    steps_param_grids = zip(steps, param_grids)

    if (
            s.PARTIALRUN
    ):  # filter with partial run info from the list pkl generated by reporter.py
        partialinfo = json.load(open(s.PARTIALRUN, "rt"))
        steps_param_grids = pipeline.filter_partialrun(steps_param_grids,
                                                       partialinfo)

    all_results = {}
    fit_pred_duration = {}
    cv_pipe_dir = os.path.join(s.OPT_DIRP, "cv_pipelines")
    util.ensure_dir(cv_pipe_dir)

    for (steps, param_grid) in steps_param_grids:

        # generate a human readable name for the current pipeline from the Pipeline object
        pipe_name = []
        for (name, step) in steps:
            if not "SelectPercentile" in str(step):
                pipe_name.append(str(step).split("(")[0].lower())
            else:
                pipe_name.append(
                    str(step.score_func.func_name).split("(")[0].lower())
        pipe_name = "+".join(pipe_name)
        DATASET_NAME = "{}_{}".format(
            pipe_name,
            s.DATASET_NAME)  # append the dataset name with pipeline name for
        # logging and metadata purposes
        pipe_opt_dir = os.path.join(cv_pipe_dir, pipe_name)
        util.ensure_dir(pipe_opt_dir)

        pipe = Pipeline(steps)
        grid_search = GridSearchCV(
            pipe,
            param_grid=param_grid,
            scoring=pipeline.my_scorer,
            n_jobs=s.CV_N_JOBS,
            cv=s.CV,
            verbose=10,
            error_score=0,
            return_train_score=False,
        )

        logging.info("{}: Doing modelselection with {}.".format(
            pipe_name, grid_search))
        start_pipefit = timeit.default_timer()
        grid_search.fit(X_in, y_in)

        # save grid_search object
        logging.info("{}: Pickling crossvalidation object..".format(pipe_name))
        dump(
            grid_search,
            os.path.join(pipe_opt_dir,
                         "%s_grid_search.joblibpkl" % s.TIMESTAMP),
            compress=1,
        )

        # save all intermediate results
        all_results[pipe_name] = grid_search.cv_results_
        with open(os.path.join(s.OPT_DIRP, "all_pipeline_cv_results.pkl"),
                  "wb") as all_res_out:
            pickle.dump(all_results, all_res_out)

        logging.info(
            "{}: Evaluating winning model on holdout test set.".format(
                pipe_name))

        logging.info("{}: Evaluating holdout performance.".format(pipe_name))
        y_pred = grid_search.predict(X_out).astype(int)
        y_out_true_y_out_pred = {
            "y_out_true": y_out.tolist(),
            "y_out_pred": y_pred.tolist(),
        }
        with open(os.path.join(pipe_opt_dir, "y_out_true-y_out_pred.json"),
                  "wt") as f:
            json.dump(y_out_true_y_out_pred, f)

        # save all intermediate fit and predict durations
        elapsed = timeit.default_timer() - start_pipefit
        fit_pred_duration[pipe_name] = elapsed
        json.dump(
            fit_pred_duration,
            open(
                os.path.join(s.OPT_DIRP,
                             "all_pipeline_fit_predict_duration.json"), "wt"),
        )

        precision, recall, fscore, support = precision_recall_fscore_support(
            y_out, y_pred, average=s.SCORE_AVERAGING)
        acc = accuracy_score(y_out, y_pred)
        if s.MULTICLASS:
            auc = None
        else:
            auc = roc_auc_score(y_out, y_pred)

        # make report
        params = grid_search.best_params_
        winscore = grid_search.best_score_
        ablation_name = "blah"
        report = ("%s\t%s\t%s"
                  "\nSettings: %s"
                  "\nTested parameters: %s"
                  "\nWinning parameters: %s"
                  "\nWinning model CV score: %s %s"
                  "\nHoldout score:"
                  "\nfscore\tprecision\trecall\tacc\tauc"
                  "\n%s\t%s\t%s\t%s\t%s" % (
                      s.DATA_FP,
                      ablation_name,
                      str(pipe.get_params()),
                      s.__file__,
                      s.PIPE_STEPS,
                      params,
                      winscore,
                      s.SCORER_METRIC,
                      fscore,
                      precision,
                      recall,
                      acc,
                      auc,
                  ))
        print(report)
        with open(os.path.join(pipe_opt_dir, "%s_results.txt" % s.TIMESTAMP),
                  "wt") as f:
            f.write(report)
        report_as_dict = {
            "data_path": s.DATA_FP,
            "feature_groups": ablation_name,
            # 'classifier_type': str(type(clf)),
            "settings": str(s.__file__),
            "param_grid": str(s.PIPE_STEPS),
            "best_params": str(params),
            "score_grid_search": winscore,
            "metric_grid_search": s.SCORER_METRIC,
            "fscore_holdout": fscore,
            "precision_holdout": precision,
            "recall_holdout": recall,
            "acc_holdout": acc,
            "auc_holdout": auc,
            "support_holdout": support,
            "predictions_holdout": y_pred.tolist(),
            "y_true_holdout": y_out.tolist(),
        }

        with open(
                os.path.join(pipe_opt_dir, "%s_finalreport.txt" % s.TIMESTAMP),
                "wt") as f:
            f.write(report)
        with open(os.path.join(pipe_opt_dir, "report.json"), "wt") as f:
            json.dump(report_as_dict, f)

        logging.info("{}: Model selection done. Duration: {}".format(
            pipe_name.upper(), str(datetime.timedelta(seconds=elapsed))))

    logging.info("DONE.")
Example #15
#!/usr/bin/python
'''Usage: name INFILE OUTFILE PERCENTILE
Where 
	INFILE is a file in libSVM format
	OUTFILE will be a file in libSVM format containing the chosen percentile of top features
	PERCENTILE is an integer specifying the percentage of features you want to keep.'''

import numpy as np
import pylab as pl
import svmlight_loader as io
import sys

from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, chi2

#################################################################
X, y = io.load_svmlight_file(sys.argv[1])

#################################################################
# Univariate feature selection with the chi-squared test for feature scoring
# Keep the percentile of top features given on the command line
selector = SelectPercentile(chi2, percentile=int(sys.argv[3]))
selector.fit(X, y)

#################################################################
# store output in file
Xsmall = selector.transform(X)
io.dump_svmlight_file(Xsmall, y, sys.argv[2], False)
Example #16
	for key, value in lmdb_cursor:
		if i == chunk_size:
			break
		datum.ParseFromString(value)
		label = datum.label
		data.append(caffe.io.datum_to_array(datum).ravel())
		i += 1
	return np.array(data, dtype=np.float32)

def caffe_lmdb2csr(lmdb_file, gt_file, out_file):
	imgs = []
	lbls = []
	with open(gt_file, 'rt') as fin:
		for line in fin:
			try:
				img, lbl = line.strip().split(' ')
			except ValueError, e:
				print(e)
				print(line)
				raise
			imgs.append(img)
			lbls.append(lbl)
	X = caffe_get_data_chunk(lmdb_file, len(imgs))
	if X.shape[0] != len(imgs):
#		print 'Length mismatch between ', gt_file, ' and ', lmdb_file
#		print ' ', X.shape[0], ' vs ', len(imgs)
		raise ValueError('Length mismatch between ' + gt_file + ' and ' + lmdb_file)
                                                                 
	# labels were read from the ground-truth file as strings; convert to integers
	dump_svmlight_file(X, np.array(lbls, dtype=np.int32), out_file)
                                                                        
Example #17
def caffe_batch_extract_features(network_proto, network_weights, mean_protofile, imagelist_file, outfile, blob_names=['fc7'], batch_size=100, use_gpu=True, cuda_dev=0):
	# load learned weights
	if not os.path.isfile(mean_protofile):
		raise ValueError('mean file not found!')
		
	if os.path.isfile(outfile):
		print 'file exist. exit.'
		return
	
	if not mean_protofile.split('.')[-1] == 'npy':
		print 'Converting mean protofile into numpy format...'
		blob = caffe.proto.caffe_pb2.BlobProto()
		data = open(mean_protofile, 'rb').read()
		blob.ParseFromString(data)
		arr = np.array(caffe.io.blobproto_to_array(blob))[0]
		np.save(os.path.join(os.path.dirname(mean_protofile), os.path.basename(mean_protofile).split('.')[0] + '.npy'), arr)
	else:
		print 'Loading mean file...'
		arr = np.load(mean_protofile)
	
	net = Extractor(network_proto, network_weights, mean=arr.mean(1).mean(1), raw_scale=255, channel_swap=(2,1,0), image_dims=(256,256))
	# verify the image list again to make sure it only contains valid image files
	print 'Extracting features from listing file ', imagelist_file, '...'

	# load the imagelist and labelist
	imagelist = []
	with open(imagelist_file, 'rt') as fin:
		for line in fin:
			fpath = line.strip().split(' ')
			fpath = fpath[0]
			imagelist.append(fpath)
	print 'Total ', len(imagelist), ' images are enlisted'

	if batch_size == -1:
		batch_size = len(imagelist)

	# compute the first batch boundaries only after batch_size is known
	start_ix = 0
	stop_ix = start_ix + batch_size

	while True:
		images_data = []
		for img in imagelist[start_ix:stop_ix]:
			if os.path.isfile(img):
				try:
					images_data.append(caffe.io.load_image(img))
				except:
					print 'Warning: unknown/bad format file'
			else:
				raise ValueError('Image file(s) not found: ' + img)
		print '... a batch of ', len(images_data), 'images were loaded'
#		stop_ix = len(images_data)
		tic = time.time()
		# start extraction
#		print 'extracting features...'
		if len(blob_names) == 1:
			x = net.compute_featvecs(images_data, blob_names[0])
		else:
			x = net.compute_compound_featvecs(images_data, blob_names)
#		x = x.reshape((x.shape[0], x.shape[1]))
		toc = time.time()
		print '...elapsed time ', (toc-tic)/batch_size, 'secs per image'

#		print 'Writing feature to file...'
		dump_svmlight_file(x, np.zeros((x.shape[0], 1), dtype=np.int32), outfile, do_append=True)

		# batch incremental
		start_ix = stop_ix
		stop_ix += batch_size
		if start_ix >= len(imagelist):
			break

	print 'DONE.'
Example #18
def saveMatrix(mat, rowIndex, matrixFileName, zero_based=True):
    """
    Write the matrix and the row index to external text files.
    """
    return dump_svmlight_file(mat, rowIndex, matrixFileName, zero_based=zero_based)
#!/usr/bin/python

from scipy.sparse import hstack
import svmlight_loader as io
import sys

if len(sys.argv) < 4:
    print '''Merge two files in libsvm / svmlight format
into a single file.

Parameters: inFile1 inFile2 outFile

implemented June 2014 by Pascal Welke'''
    sys.exit(1)

fileOne = sys.argv[1]
fileTwo = sys.argv[2]
fileThree = sys.argv[3]

xOne, yOne = io.load_svmlight_file(fileOne)
xTwo, yTwo = io.load_svmlight_file(fileTwo)

X = hstack((xOne, xTwo))

io.dump_svmlight_file(X, yOne, fileThree, False)
Example #20
            maxtid = max(maxtid, tid)
            # print(str(i) + ' : ' + str(tids[i]))
            i += 1
        pid += 1
    patternFile.close()

    # print(n_transactions)
    # print(pid)
    # print(maxtid)

    return sp.csr_matrix((data, (tids, pids)), shape=(n_transactions, pid))


if __name__ == '__main__':
    if len(sys.argv) != 4:
        sys.stderr.write('fsgtid2libsvm: Error, wrong number of arguments: ' +
                         str(len(sys.argv) - 1) + ' (expected: 3)\n')
        sys.exit(1)

    transactionFile = open(sys.argv[2], 'r')
    labels = getTransactionLabels(transactionFile)
    transactionFile.close()

    # novel variant that reduces memory usage
    data = tidSparseLoader2(sys.argv[1], labels.shape[0])

    # patternFile = open(sys.argv[1], 'r')
    # data = tidSparseLoader(patternFile, labels.shape[0])
    # patternFile.close()

    io.dump_svmlight_file(data, labels, sys.argv[3])
Example #21
    else:
        if (len(sys.argv) == 3):
            svmFilename = sys.argv[1]
            filterFile = sys.stdin
            outFilename = sys.argv[2]
        else:  # len(sys.argv) == 4
            svmFilename = sys.argv[1]
            filterFile = open(sys.argv[2], 'r')
            outFilename = sys.argv[3]

    X, y = io.load_svmlight_file(svmFilename)
    featureSubset = np.loadtxt(filterFile)
    filterFile.close()

    sanitycheck = np.unique(featureSubset)
    print sanitycheck

    X_col = sp.csc_matrix(X)
    if X.shape[1] != featureSubset.shape[0]:
        sys.stderr.write(
            'libsvmFeatureFilter.py: Dimension mismatch of filter and features. Assuming that the feature list is a prefix of the filter list.\n'
        )
        featureSubset.resize(X.shape[1])

    # filter columns using the column-oriented copy built above
    X_sub = X_col[:, featureSubset == 1]
    # sort indices (otherwise the column indices of the result are out of order)
    X_sub.sort_indices()

    io.dump_svmlight_file(sp.csr_matrix(X_sub), y, outFilename)