def predict_PLSR(x_filename, y_filename, model_filename, showError):
    """
    Read the PLSR model from model_filename and the X matrix from x_filename.
    Write the predicted output to y_filename.
    """
    sys.stderr.write("Predicting PLSR...")
    startTime = time.time()
    X = loadMatrix(x_filename)[0].todense()
    model = open(model_filename)
    pls2 = pickle.load(model)
    model.close()
    Y = pls2.predict(X)
    n = X.shape[0]
    # write the predicted matrix to y_filename in SVMLight format
    dump_svmlight_file(Y, np.arange(1, n + 1), y_filename, zero_based=True)
    endTime = time.time()
    sys.stderr.write(" took %ss\n" % str(round(endTime - startTime, 2)))
    if showError:
        Xnorm = np.linalg.norm(X, ord='fro')
        Error = np.linalg.norm((X - Y), ord='fro')
        rate = (100 * Error) / Xnorm
        print "Approximation Error Percentage = %f%%" % rate
        print "Frobenius norm of the original matrix =", Xnorm
        print "Frobenius norm of the error matrix =", Error
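# predict_PLSR above assumes an already-fitted PLS model pickled to disk. The
# following is only a sketch of what the training side might look like,
# assuming scikit-learn's PLSRegression and the same loadMatrix / pickle
# conventions; the function name train_PLSR and the component count n are
# illustrative, not taken from the original code.
from sklearn.cross_decomposition import PLSRegression
import pickle

def train_PLSR(x_filename, y_filename, model_filename, n):
    """
    Hypothetical training counterpart to predict_PLSR: fit a PLS regression
    model mapping X to Y and pickle it to model_filename.
    """
    X = loadMatrix(x_filename)[0].todense()
    Y = loadMatrix(y_filename)[0].todense()
    pls2 = PLSRegression(n_components=n)
    pls2.fit(X, Y)
    with open(model_filename, "wb") as model:
        pickle.dump(pls2, model)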
def generate_random_matrix_PLSR():
    """
    Generate random X and Y matrices to test PLSR.
    """
    n = 1000
    m = 500
    X = np.random.randn(n, m)
    dump_svmlight_file(X, np.arange(1, n + 1), "../work/Y")
def test_dump():
    try:
        Xs, y = load_svmlight_file(datafile)
        tmpfile = "tmp_dump.txt"
        dump_svmlight_file(Xs, y, tmpfile, zero_based=False)
        X2, y2 = sk_load_svmlight_file(tmpfile)
        assert_array_equal(Xs.toarray(), X2.toarray())
        assert_array_equal(y, y2)
    finally:
        os.remove(tmpfile)
def changeSparseLabels(f1, f2, labelFunction=AvsI):
    Xold, yold = io.load_svmlight_file(f1)
    vectorizedFunction = np.vectorize(labelFunction, otypes=[np.int32])
    ynew = vectorizedFunction(yold)
    X = Xold[ynew != 0, :]
    y = ynew[ynew != 0]
    io.dump_svmlight_file(X, y, f2)
    return
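# The default labelFunction AvsI is not defined in this snippet. Below is a
# minimal sketch of such a label-mapping function, assuming it keeps two
# classes (mapped to +1 / -1) and maps everything else to 0 so that
# changeSparseLabels drops those examples; the class ids used here are
# placeholders, not taken from the original code.
def AvsI(label):
    if label == 1:    # hypothetical id of class "A"
        return 1
    elif label == 9:  # hypothetical id of class "I"
        return -1
    else:
        return 0      # mapped to 0, filtered out by changeSparseLabels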
def make_svm_file(labelfile, datafile, resultfile):
    y = np.loadtxt(labelfile)
    X = np.loadtxt(datafile)
    if len(y) != len(X):
        sys.stderr.write('dense2svm.py: Data and Labels have different dimensions. I give up.\n')
        return
    io.dump_svmlight_file(X, y, resultfile)
    return
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()
    for X in (Xs, Xd):
        f = StringIO()
        dump_svmlight_file(X, y, f, zero_based=False)
        f.seek(0)
        X2, y2 = sk_load_svmlight_file(f)
        assert_array_equal(Xd, X2.toarray())
        assert_array_equal(y, y2)
def caffe_batch_extract_predictionmap(network_proto, dense_network_proto, network_weights,
                                      mean_protofile, imagelist, outfile, src_layers,
                                      batch_size=100,
                                      dst_layers=['fc6-conv', 'fc7-conv', 'fc8-conv']):
    caffe.set_mode_cpu()
    # load learned weights
    print 'Loading network weights...'
    net = DenseNet(network_proto, dense_network_proto, network_weights, mean_protofile,
                   src_layers=src_layers, dst_layers=dst_layers)
    # verify the image list again to make sure it only contains valid image files
    print 'Loading images and their labels...'
    if batch_size == -1:
        batch_size = len(imagelist)
    start_ix = 0
    stop_ix = start_ix + batch_size
    X = None
    first_time = True
    while True:
        images_data = []
        for img in imagelist[start_ix:stop_ix]:
            if os.path.isfile(img):
                images_data.append(caffe.io.load_image(img))
            else:
                continue
        print '... a batch of ', len(images_data), 'images were loaded'
        # stop_ix = len(images_data)
        tic = time.time()
        # start extraction
        print 'extracting features...'
        x = net.predict_densemap(images_data)
        toc = time.time()
        print '...elapsed time ', (toc - tic) / batch_size, 'secs per image'
        if first_time:
            X = x
            first_time = False
        else:
            X = np.r_[X, x]
        # batch incremental
        start_ix = stop_ix
        stop_ix += batch_size
        if start_ix >= len(imagelist):
            break
    print 'Writing feature to file...'
    dump_svmlight_file(X, np.zeros((len(imagelist), 1)), outfile)
    print 'DONE.'
def mergeSparse(f1, f2, f3):
    X1, y1 = io.load_svmlight_file(f1)
    X2, y2 = io.load_svmlight_file(f2)
    if (y1.shape == y2.shape):
        X = sp.hstack([X1, X2])
    else:
        sys.stderr.write('Error: Different number of examples in files: '
                         + str(y1.shape) + ' != ' + str(y2.shape) + '\n')
        return
    if (y1 == y2).sum() != y1.shape[0]:
        sys.stderr.write('Warning: Label mismatch. Are you merging features of the same subset?\n'
                         'I will use the labels of the first argument\n')
    y = y1
    io.dump_svmlight_file(X, y, f3)
    return
def make_svm_file(labelfile, datafile, resultfile):
    y = np.loadtxt(labelfile)
    X = np.loadtxt(datafile)
    if len(y) != len(X):
        sys.stderr.write('minhash2svm.py: Data and Labels have different dimensions. I give up.\n')
        return
    # Check that the data is nonnegative. If it contains zeros, shift all values
    # up by one and warn (zero entries would otherwise be dropped by the sparse
    # SVMLight format).
    unique_values = np.unique(X)
    zero_count = (unique_values == 0).sum()
    nonnegative_count = (unique_values >= 0).sum()
    if nonnegative_count == unique_values.shape[0]:
        if zero_count > 0:
            sys.stderr.write('minhash2svm.py: Data file contains zero positions. Shifting values up by one.\n')
            X = X + 1
    else:
        sys.stderr.write('minhash2svm.py: Data file contains negative values. I give up.\n')
        return
    io.dump_svmlight_file(X, y, resultfile)
    return
from numpy import *
from scipy.sparse import csr_matrix
from svmlight_loader import dump_svmlight_file

positives = loadtxt('positives_imputed.csv', delimiter=',')
negatives = loadtxt('negatives_imputed.csv', delimiter=',')
positives = positives[:, 1:]
negatives = negatives[:, 1:]

positives_samples = positives.shape[0]
negatives_samples = negatives.shape[0]

y = ones(positives_samples + negatives_samples, dtype='int64')
y[0:negatives_samples] = -1

X = vstack((negatives, positives))
X = csr_matrix(X)

dump_svmlight_file(X, y, "Jul2_training.svmlight", zero_based=False)
def select_model(X, y):
    # make holdout-holdin split
    (
        X_in,
        X_out,
        y_in,
        y_out,
        indices_in,
        indices_out,
        removed_features,
    ) = dh.split_holdout(X, y)
    logging.info("Writing holdin-holdout split data and info to file.")
    dump_svmlight_file(X_in, y_in, os.path.join(s.OPT_DIRP, "holdin.svm"), zero_based=True)
    dump_svmlight_file(X_out, y_out, os.path.join(s.OPT_DIRP, "holdout.svm"), zero_based=True)
    with open(os.path.join(s.OPT_DIRP, "holdinout_split_indices.json"), "wt") as f:
        json.dump(
            {
                "holdin": indices_in.tolist(),
                "holdout": indices_out.tolist(),
                "num_features": X_in.shape[1],
            },
            f,
        )

    steps, param_grids = pipeline.make_pipelines(s.PIPE_STEPS, alt_order=s.ALT_ORDER)
    steps_param_grids = zip(steps, param_grids)
    if s.PARTIALRUN:
        # filter with partial run info from the list pkl generated by reporter.py
        partialinfo = json.load(open(s.PARTIALRUN, "rt"))
        steps_param_grids = pipeline.filter_partialrun(steps_param_grids, partialinfo)

    all_results = {}
    fit_pred_duration = {}
    cv_pipe_dir = os.path.join(s.OPT_DIRP, "cv_pipelines")
    util.ensure_dir(cv_pipe_dir)
    for (steps, param_grid) in steps_param_grids:
        # generate a human-readable name for the current pipeline from the Pipeline object
        pipe_name = []
        for (name, step) in steps:
            if not "SelectPercentile" in str(step):
                pipe_name.append(str(step).split("(")[0].lower())
            else:
                pipe_name.append(str(step.score_func.func_name).split("(")[0].lower())
        pipe_name = "+".join(pipe_name)
        # append the dataset name with the pipeline name for logging and metadata purposes
        DATASET_NAME = "{}_{}".format(pipe_name, s.DATASET_NAME)
        pipe_opt_dir = os.path.join(cv_pipe_dir, pipe_name)
        util.ensure_dir(pipe_opt_dir)

        pipe = Pipeline(steps)
        grid_search = GridSearchCV(
            pipe,
            param_grid=param_grid,
            scoring=pipeline.my_scorer,
            n_jobs=s.CV_N_JOBS,
            cv=s.CV,
            verbose=10,
            error_score=0,
            return_train_score=False,
        )
        logging.info("{}: Doing modelselection with {}.".format(pipe_name, grid_search))
        start_pipefit = timeit.default_timer()
        grid_search.fit(X_in, y_in)

        # save grid_search object
        logging.info("{}: Pickling crossvalidation object..".format(pipe_name))
        dump(
            grid_search,
            os.path.join(pipe_opt_dir, "%s_grid_search.joblibpkl" % s.TIMESTAMP),
            compress=1,
        )
        # save all intermediate results
        all_results[pipe_name] = grid_search.cv_results_
        with open(os.path.join(s.OPT_DIRP, "all_pipeline_cv_results.pkl"), "wb") as all_res_out:
            pickle.dump(all_results, all_res_out)

        logging.info("{}: Evaluating winning model on holdout test set.".format(pipe_name))
        logging.info("{}: Evaluating holdout performance.".format(pipe_name))
        y_pred = grid_search.predict(X_out).astype(int)
        y_out_true_y_out_pred = {
            "y_out_true": y_out.tolist(),
            "y_out_pred": y_pred.tolist(),
        }
        with open(os.path.join(pipe_opt_dir, "y_out_true-y_out_pred.json"), "wt") as f:
            json.dump(y_out_true_y_out_pred, f)

        # save all intermediate fit and predict durations
        elapsed = timeit.default_timer() - start_pipefit
        fit_pred_duration[pipe_name] = elapsed
        json.dump(
            fit_pred_duration,
            open(os.path.join(s.OPT_DIRP, "all_pipeline_fit_predict_duration.json"), "wt"),
        )

        precision, recall, fscore, support = precision_recall_fscore_support(
            y_out, y_pred, average=s.SCORE_AVERAGING)
        acc = accuracy_score(y_out, y_pred)
        if s.MULTICLASS:
            auc = None
        else:
            auc = roc_auc_score(y_out, y_pred)

        # make report
        params = grid_search.best_params_
        winscore = grid_search.best_score_
        ablation_name = "blah"
        report = ("%s\t%s\t%s"
                  "\nSettings: %s"
                  "\nTested parameters: %s"
                  "\nWinning parameters: %s"
                  "\nWinning model CV score: %s %s"
                  "\nHoldout score:"
                  "\nfscore\tprecision\trecall\tacc\tauc"
                  "\n%s\t%s\t%s\t%s\t%s" % (
                      s.DATA_FP,
                      ablation_name,
                      str(pipe.get_params()),
                      s.__file__,
                      s.PIPE_STEPS,
                      params,
                      winscore,
                      s.SCORER_METRIC,
                      fscore,
                      precision,
                      recall,
                      acc,
                      auc,
                  ))
        print(report)
        with open(os.path.join(pipe_opt_dir, "%s_results.txt" % s.TIMESTAMP), "wt") as f:
            f.write(report)

        report_as_dict = {
            "data_path": s.DATA_FP,
            "feature_groups": ablation_name,
            # 'classifier_type': str(type(clf)),
            "settings": str(s.__file__),
            "param_grid": str(s.PIPE_STEPS),
            "best_params": str(params),
            "score_grid_search": winscore,
            "metric_grid_search": s.SCORER_METRIC,
            "fscore_holdout": fscore,
            "precision_holdout": precision,
            "recall_holdout": recall,
            "acc_holdout": acc,
            "auc_holdout": auc,
            "support_holdout": support,
            "predictions_holdout": y_pred.tolist(),
            "y_true_holdout": y_out.tolist(),
        }
        with open(os.path.join(pipe_opt_dir, "%s_finalreport.txt" % s.TIMESTAMP), "wt") as f:
            f.write(report)
        with open(os.path.join(pipe_opt_dir, "report.json"), "wt") as f:
            json.dump(report_as_dict, f)

        logging.info("{}: Model selection done. Duration: {}".format(
            pipe_name.upper(), str(datetime.timedelta(seconds=elapsed))))
    logging.info("DONE.")
#!/usr/bin/python
'''Usage: name INFILE OUTFILE PERCENTILE

Where INFILE is a file in libSVM format,
OUTFILE will be a file in libSVM format containing the chosen percentile of top features, and
PERCENTILE is an integer specifying the percentage of features you want to keep.'''

import numpy as np
import pylab as pl
import svmlight_loader as io
import sys

from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, chi2

#################################################################
X, y = io.load_svmlight_file(sys.argv[1])

#################################################################
# Univariate feature selection with the chi-squared test for feature scoring.
# The percentile of features to keep is taken from the command line.
selector = SelectPercentile(chi2, percentile=int(sys.argv[3]))
selector.fit(X, y)

#################################################################
# store output in file
Xsmall = selector.transform(X)
io.dump_svmlight_file(Xsmall, y, sys.argv[2], False)
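# For reference, a minimal self-contained sketch of the same chi2 +
# SelectPercentile selection on a toy in-memory matrix, using scikit-learn's
# dump_svmlight_file instead of the command-line files; the output file name
# and the percentile value are made up for illustration.
import numpy as np
from sklearn.datasets import dump_svmlight_file
from sklearn.feature_selection import SelectPercentile, chi2

# toy nonnegative data (chi2 requires nonnegative features): 6 examples, 8 features
X = np.abs(np.random.randn(6, 8))
y = np.array([0, 1, 0, 1, 0, 1])

# keep the top 25% of features ranked by the chi-squared statistic
selector = SelectPercentile(chi2, percentile=25)
Xsmall = selector.fit_transform(X, y)

# write the reduced matrix in SVMLight / libSVM format
dump_svmlight_file(Xsmall, y, "selected_features.libsvm", zero_based=False)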
    # (fragment: the tail of a chunk-reading helper, presumably the
    #  caffe_get_data_chunk() called below, which collects up to chunk_size
    #  flattened datum arrays from an LMDB cursor)
    for key, value in lmdb_cursor:
        if i == chunk_size:
            break
        datum.ParseFromString(value)
        label = datum.label
        data.append(caffe.io.datum_to_array(datum).ravel())
        i += 1
    return np.array(data, dtype=np.float32)


def caffe_lmdb2csr(lmdb_file, gt_file, out_file):
    imgs = []
    lbls = []
    with open(gt_file, 'rt') as fin:
        for line in fin:
            try:
                img, lbl = line.strip().split(' ')
            except ValueError, e:
                print(e)
                print(line)
                raise
            imgs.append(img)
            lbls.append(lbl)
    X = caffe_get_data_chunk(lmdb_file, len(imgs))
    if X.shape[0] != len(imgs):
        # print 'Length mismatch between ', gt_file, ' and ', lmdb_file
        # print '   ', X.shape[0], ' vs ', len(imgs)
        raise ValueError('Length mismatch between ' + gt_file + ' and ' + lmdb_file)
    dump_svmlight_file(X, np.array(lbls), out_file)
def caffe_batch_extract_features(network_proto, network_weights, mean_protofile,
                                 imagelist_file, outfile, blob_names=['fc7'],
                                 batch_size=100, use_gpu=True, cuda_dev=0):
    # load learned weights
    if not os.path.isfile(mean_protofile):
        raise ValueError('mean file not found!')
    if os.path.isfile(outfile):
        print 'file exists. exit.'
        return
    if not mean_protofile.split('.')[-1] == 'npy':
        print 'Converting mean protofile into numpy format...'
        blob = caffe.proto.caffe_pb2.BlobProto()
        data = open(mean_protofile, 'rb').read()
        blob.ParseFromString(data)
        arr = np.array(caffe.io.blobproto_to_array(blob))[0]
        np.save(os.path.join(os.path.dirname(mean_protofile),
                             os.path.basename(mean_protofile).split('.')[0] + '.npy'), arr)
    else:
        print 'Loading mean file...'
        arr = np.load(mean_protofile)
    net = Extractor(network_proto, network_weights,
                    mean=arr.mean(1).mean(1),
                    raw_scale=255,
                    channel_swap=(2, 1, 0),
                    image_dims=(256, 256))
    # verify the image list again to make sure it only contains valid image files
    print 'Extracting features from listing file ', imagelist_file, '...'
    # load the image list and label list
    imagelist = []
    with open(imagelist_file, 'rt') as fin:
        for line in fin:
            fpath = line.strip().split(' ')
            fpath = fpath[0]
            imagelist.append(fpath)
    print 'Total ', len(imagelist), ' images are enlisted'
    if batch_size == -1:
        batch_size = len(imagelist)
    start_ix = 0
    stop_ix = start_ix + batch_size
    while True:
        images_data = []
        for img in imagelist[start_ix:stop_ix]:
            if os.path.isfile(img):
                try:
                    images_data.append(caffe.io.load_image(img))
                except:
                    print 'Warning: unknown/bad format file'
            else:
                raise ValueError('Image file(s) not found: ' + img)
        print '... a batch of ', len(images_data), 'images were loaded'
        # stop_ix = len(images_data)
        tic = time.time()
        # start extraction
        # print 'extracting features...'
        if len(blob_names) == 1:
            x = net.compute_featvecs(images_data, blob_names[0])
        else:
            x = net.compute_compound_featvecs(images_data, blob_names)
        # x = x.reshape((x.shape[0], x.shape[1]))
        toc = time.time()
        print '...elapsed time ', (toc - tic) / batch_size, 'secs per image'
        # print 'Writing feature to file...'
        dump_svmlight_file(x, np.zeros((x.shape[0], 1), dtype=np.int32), outfile, do_append=True)
        # batch incremental
        start_ix = stop_ix
        stop_ix += batch_size
        if start_ix >= len(imagelist):
            break
    print 'DONE.'
def saveMatrix(mat, rowIndex, matrixFileName, zero_based=True):
    """
    Write the matrix and the row index to an external text file in SVMLight format.
    """
    return dump_svmlight_file(mat, rowIndex, matrixFileName, zero_based)
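# The loadMatrix helper used by predict_PLSR is not shown in these snippets.
# Below is a minimal sketch of a counterpart to saveMatrix, assuming it simply
# wraps load_svmlight_file and returns the matrix together with its row index
# (consistent with predict_PLSR taking element [0] of the result).
from sklearn.datasets import load_svmlight_file

def loadMatrix(matrixFileName):
    """
    Hypothetical counterpart to saveMatrix: read a matrix and its row index
    back from an SVMLight-format file and return them as a (matrix, rowIndex) pair.
    """
    mat, rowIndex = load_svmlight_file(matrixFileName)
    return mat, rowIndex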
#!/usr/bin/python

from scipy.sparse import hstack
import svmlight_loader as io
import sys

if len(sys.argv) < 4:
    print '''Merge two files in libsvm / svmlight format into a single file.

Parameters: inFile1 inFile2 outFile

implemented June 2014 by Pascal Welke'''
    sys.exit(1)

fileOne = sys.argv[1]
fileTwo = sys.argv[2]
fileThree = sys.argv[3]

xOne, yOne = io.load_svmlight_file(fileOne)
xTwo, yTwo = io.load_svmlight_file(fileTwo)

X = hstack((xOne, xTwo))
io.dump_svmlight_file(X, yOne, fileThree, False)
            # (fragment: the tail of a pattern-file loader, presumably the
            #  tidSparseLoader2() called below)
            maxtid = max(maxtid, tid)
            # print(str(i) + ' : ' + str(tids[i]))
            i += 1
        pid += 1
    patternFile.close()
    # print(n_transactions)
    # print(pid)
    # print(maxtid)
    return sp.csr_matrix((data, (tids, pids)), shape=(n_transactions, pid))


if __name__ == '__main__':
    if len(sys.argv) != 4:
        sys.stderr.write('fsgtid2libsvm: Error, wrong number of arguments: '
                         + str(len(sys.argv) - 1) + ' (expected: 3)\n')
        sys.exit(1)

    transactionFile = open(sys.argv[2], 'r')
    labels = getTransactionLabels(transactionFile)
    transactionFile.close()

    # novel variant that reduces memory usage
    data = tidSparseLoader2(sys.argv[1], labels.shape[0])
    # patternFile = open(sys.argv[1], 'r')
    # data = tidSparseLoader(patternFile, labels.shape[0])
    # patternFile.close()

    io.dump_svmlight_file(data, labels, sys.argv[3])
else:
    if (len(sys.argv) == 3):
        svmFilename = sys.argv[1]
        filterFile = sys.stdin
        outFilename = sys.argv[2]
    else:  # len(sys.argv) == 4
        svmFilename = sys.argv[1]
        filterFile = open(sys.argv[2], 'r')
        outFilename = sys.argv[3]

    X, y = io.load_svmlight_file(svmFilename)
    featureSubset = np.loadtxt(filterFile)
    filterFile.close()

    sanitycheck = np.unique(featureSubset)
    print sanitycheck

    X_col = sp.csc_matrix(X)

    if X.shape[1] != featureSubset.shape[0]:
        sys.stderr.write('libsvmFeatureFilter.py: Dimension mismatch of filter and features. '
                         'Assuming that the feature list is a prefix of the filter list.\n')
        featureSubset.resize(X.shape[1])

    # filter columns
    X_sub = X[:, featureSubset == 1]

    # sort matrix (otherwise column indices are somehow inverted)
    X_sub.sort_indices()

    io.dump_svmlight_file(sp.csr_matrix(X_sub), y, outFilename)