def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()
    for X in (Xs, Xd):
        for zero_based in (True, False):
            for dtype in [np.float32, np.float64]:
                f = BytesIO()
                # we need to pass a comment to get the version info in;
                # LibSVM doesn't grok comments so they're not put in by
                # default anymore.
                dump_svmlight_file(X.astype(dtype), y, f, comment="test",
                                   zero_based=zero_based)
                f.seek(0)
                comment = f.readline()
                assert_in("scikit-learn %s" % sklearn.__version__, comment)
                comment = f.readline()
                assert_in(["one", "zero"][zero_based] + "-based", comment)
                X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based)
                assert_equal(X2.dtype, dtype)
                if dtype == np.float32:
                    # allow a rounding error at the last decimal place
                    assert_array_almost_equal(Xd.astype(dtype), X2.toarray(), 4)
                else:
                    # allow a rounding error at the last decimal place
                    assert_array_almost_equal(Xd.astype(dtype), X2.toarray(), 15)
                assert_array_equal(y, y2)
def train(self, examples, outDir, parameters, classifyExamples=None, dummy=False):
    outDir = os.path.abspath(outDir)
    examples = self.getExampleFile(examples, dummy=dummy)
    classifyExamples = self.getExampleFile(classifyExamples, dummy=dummy)
    # Return a new classifier instance for following the training process
    # and using the model
    classifier = copy.copy(self)
    classifier.parameters = parameters
    classifier._filesToRelease = [examples, classifyExamples]
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    trainFeatures, trainClasses = datasets.load_svmlight_file(examples)
    develFeatures = develClasses = None  # stay defined when there is no devel set
    if classifyExamples is not None:
        develFeatures, develClasses = datasets.load_svmlight_file(classifyExamples, trainFeatures.shape[1])
    binarizer = preprocessing.LabelBinarizer()
    binarizer.fit(trainClasses)
    trainClasses = binarizer.transform(trainClasses)
    if classifyExamples is not None:
        develClasses = binarizer.transform(develClasses)
    print >> sys.stderr, "Training Keras model with parameters:", parameters
    parameters = Parameters.get(parameters, {"TEES.classifier": "KerasClassifier", "layers": 5,
                                             "lr": 0.001, "epochs": 1, "batch_size": 64, "patience": 10})
    np.random.seed(10)
    classifier.kerasModel = classifier._defineModel(outDir, parameters, trainFeatures, trainClasses,
                                                    develFeatures, develClasses)
    classifier._fitModel(outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses)
def gridSearch():
    X_train, y_train = load_svmlight_file(svmPath + "/" + trainFile)
    X_test, y_test = load_svmlight_file(svmPath + "/" + testFile,
                                        n_features=X_train.shape[1])
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                         'C': [1, 10, 100, 1000]}]
    # , {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

    # training
    # clf = svm.SVC(kernel='linear')
    # clf.fit(X_features, trainingLabels)
    scores = ['precision', 'recall']
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
        clf.fit(X_train, y_train)
        print("Best parameters set found on development set:")
        print()
        print(clf.best_estimator_)
        print()
        print("Grid scores on development set:")
        print()
        for params, mean_score, scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))
        print()
        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
def scale_mnist8m():
    from sklearn.datasets import load_svmlight_file
    print "loading train", datetime.datetime.now()
    dd_train = load_svmlight_file(base_folder_mnist + "mnist8m_6_8_train.libsvm")
    print "loading test", datetime.datetime.now()
    dd_test = load_svmlight_file(base_folder_mnist + "mnist8m_6_8_test.libsvm")
    Xtrain = dd_train[0]
    Xtest = dd_test[0]
    Ytrain = dd_train[1]
    Ytest = dd_test[1]
    Xtrain = csr_matrix((Xtrain.data, Xtrain.indices, Xtrain.indptr),
                        shape=(Xtrain.shape[0], 786))
    Xtest = csr_matrix((Xtest.data, Xtest.indices, Xtest.indptr),
                       shape=(Xtest.shape[0], 786))
    from sklearn.externals import joblib
    print "densifying train", datetime.datetime.now()
    Xtrain = Xtrain.todense()
    print "densifying test", datetime.datetime.now()
    Xtest = Xtest.todense()
    print "dumping train", datetime.datetime.now()
    joblib.dump((np.asarray(Xtrain), Ytrain), base_folder_mnist + "mnist8m_6_8_train_reshaped")
    # joblib.load(base_folder + "mnist8m_6_8_train_touple_small")
    print "dumping test", datetime.datetime.now()
    joblib.dump((np.asarray(Xtest), Ytest), base_folder_mnist + "mnist8m_6_8_test_reshaped")
    print "finished", datetime.datetime.now()
def test():
    x_train, y_train = load_svmlight_file("D:/traindata/12trainset")
    x_train = x_train.todense()  # todense() returns a new matrix; the bare call is a no-op
    x_test, y_test = load_svmlight_file("D:/traindata/12testset")
    x_test = x_test.todense()
    print(x_train.shape)
    # classifier
    clf = SVC(kernel='rbf')
    ovrclf = OneVsRestClassifier(clf, -1)
    # parameter
    parameters = [{'estimator__C': [2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 1, 2**1, 2**2, 2**3, 2**4, 2**5],
                   'estimator__kernel': ['rbf'],
                   'estimator__gamma': [2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 1, 2**1, 2**2, 2**3, 2**4, 2**5]},
                  {'estimator__C': [2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 1, 2**1, 2**2, 2**3, 2**4, 2**5],
                   'estimator__kernel': ['linear']}]
    para = {'estimator__C': [2**-5, 2**-4],
            'estimator__kernel': ['rbf'],
            'estimator__gamma': [2**-1, 1]}
    # scoring
    sougou_score = make_scorer(score_func, greater_is_better=False)
    # cross-validation iterator
    sfk = c_v.StratifiedKFold(y_train, shuffle=True, n_folds=5, random_state=0)
    # grid search
    gsclf = g_s.GridSearchCV(ovrclf, param_grid=para, cv=sfk, scoring=sougou_score)
    gsclf.fit(x_train, y_train)
    print("best score: ", gsclf.best_score_)
    print("best parameters: ", gsclf.best_params_)
    y_pred = gsclf.predict(x_test)
    # result
    target_names = ['0', '1', '2', '3']
    sum_y = np.sum((np.array(y_pred) - np.array(y_test))**2)
    print(classification_report(y_test, y_pred, target_names=target_names))
    print("sougouVal: ", float(sum_y) / y_pred.shape[0])
    print(time.time() - start_time)
def test_load_with_offsets(sparsity, n_samples, n_features):
    rng = np.random.RandomState(0)
    X = rng.uniform(low=0.0, high=1.0, size=(n_samples, n_features))
    if sparsity:
        X[X < sparsity] = 0.0
    X = sp.csr_matrix(X)
    y = rng.randint(low=0, high=2, size=n_samples)

    f = BytesIO()
    dump_svmlight_file(X, y, f)
    f.seek(0)

    size = len(f.getvalue())

    # put some marks that are likely to happen anywhere in a row
    mark_0 = 0
    mark_1 = size // 3
    length_0 = mark_1 - mark_0
    mark_2 = 4 * size // 5
    length_1 = mark_2 - mark_1

    # load the original sparse matrix into 3 independent CSR matrices
    X_0, y_0 = load_svmlight_file(f, n_features=n_features,
                                  offset=mark_0, length=length_0)
    X_1, y_1 = load_svmlight_file(f, n_features=n_features,
                                  offset=mark_1, length=length_1)
    X_2, y_2 = load_svmlight_file(f, n_features=n_features, offset=mark_2)

    y_concat = np.concatenate([y_0, y_1, y_2])
    X_concat = sp.vstack([X_0, X_1, X_2])
    assert_array_almost_equal(y, y_concat)
    assert_array_almost_equal(X.toarray(), X_concat.toarray())
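# A minimal sketch (not part of the test above) of what offset/length enable in
# practice: splitting one large svmlight file into byte ranges and parsing the
# chunks in parallel. The file path and chunk count are illustrative
# assumptions; n_features must be pinned so every chunk gets the same width.
import os
from joblib import Parallel, delayed
from sklearn.datasets import load_svmlight_file

def load_chunks_in_parallel(path, n_features, n_chunks=4):
    size = os.path.getsize(path)
    # byte marks may fall mid-line; the parser skips the partial first line of
    # a chunk and keeps reading past `length` to finish the last line, so the
    # chunks partition the file exactly, as asserted in the test above.
    marks = [size * i // n_chunks for i in range(n_chunks)] + [size]
    jobs = (delayed(load_svmlight_file)(path, n_features=n_features,
                                        offset=marks[i],
                                        length=marks[i + 1] - marks[i])
            for i in range(n_chunks))
    return Parallel(n_jobs=n_chunks)(jobs)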
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()
    for X in (Xs, Xd):
        for zero_based in (True, False):
            for dtype in [np.float32, np.float64]:
                f = BytesIO()
                dump_svmlight_file(X.astype(dtype), y, f, zero_based=zero_based)
                f.seek(0)
                comment = f.readline()
                assert_in("scikit-learn %s" % sklearn.__version__, comment)
                comment = f.readline()
                assert_in(["one", "zero"][zero_based] + "-based", comment)
                X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based)
                assert_equal(X2.dtype, dtype)
                if dtype == np.float32:
                    # allow a rounding error at the last decimal place
                    assert_array_almost_equal(Xd.astype(dtype), X2.toarray(), 4)
                else:
                    # allow a rounding error at the last decimal place
                    assert_array_almost_equal(Xd.astype(dtype), X2.toarray(), 15)
                assert_array_equal(y, y2)
def test_load_with_long_qid():
    # load svmfile with longint qid attribute
    data = b("""
    1 qid:0 0:1 1:2 2:3
    0 qid:72048431380967004 0:1440446648 1:72048431380967004 2:236784985
    0 qid:-9223372036854775807 0:1440446648 1:72048431380967004 2:236784985
    3 qid:9223372036854775807 0:1440446648 1:72048431380967004 2:236784985""")
    X, y, qid = load_svmlight_file(BytesIO(data), query_id=True)

    true_X = [[1, 2, 3],
              [1440446648, 72048431380967004, 236784985],
              [1440446648, 72048431380967004, 236784985],
              [1440446648, 72048431380967004, 236784985]]
    true_y = [1, 0, 0, 3]
    trueQID = [0, 72048431380967004, -9223372036854775807, 9223372036854775807]
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    f = BytesIO()
    dump_svmlight_file(X, y, f, query_id=qid, zero_based=True)
    f.seek(0)
    X, y, qid = load_svmlight_file(f, query_id=True, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    f.seek(0)
    X, y = load_svmlight_file(f, query_id=False, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
def test_dump_comment():
    X, y = load_svmlight_file(datafile)
    X = X.toarray()

    f = BytesIO()
    ascii_comment = "This is a comment\nspanning multiple lines."
    dump_svmlight_file(X, y, f, comment=ascii_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)

    # XXX we have to update this to support Python 3.x
    utf8_comment = "It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc"
    f = BytesIO()
    assert_raises(UnicodeDecodeError,
                  dump_svmlight_file, X, y, f, comment=utf8_comment)

    unicode_comment = utf8_comment.decode("utf-8")
    f = BytesIO()
    dump_svmlight_file(X, y, f, comment=unicode_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)

    f = BytesIO()
    assert_raises(ValueError,
                  dump_svmlight_file, X, y, f, comment="I've got a \0.")
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  C, n_fold=5):
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG, filename='lr_{}.log'.format(C))

    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)

    clf = LR(penalty='l2', dual=True, C=C, class_weight='auto', random_state=2015)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    lloss = 0.
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        lloss += log_loss(y[i_val], p_val[i_val])

    logging.info('Log Loss = {:.4f}'.format(lloss))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def classification_subfeature(train, test, outclss):
    fields = iot.read_fields()
    print len(fields)
    foi = ['liwc_anal.result.i', 'liwc_anal.result.we', 'liwc_anal.result.affect',
           'liwc_anal.result.posemo', 'liwc_anal.result.negemo', 'liwc_anal.result.bio',
           'liwc_anal.result.body', 'liwc_anal.result.health', 'liwc_anal.result.ingest']
    indices = [np.where(fields == f)[0][0] for f in foi]
    print fields[indices]

    '''Load training data'''
    X_train, y_train = load_svmlight_file(train)
    X_train = X_train.toarray()[:, indices]
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    print X_train.shape

    '''Load test data'''
    X_test, y_test = load_svmlight_file(test)
    X_test = X_test.toarray()[:, indices]
    X_test = scaler.transform(X_test)
    print X_test.shape

    svc_lin = SVC(kernel='linear', class_weight='balanced')
    y_lin = svc_lin.fit(X_train, y_train).predict(X_test)
    # pickle.dump(y_test, open(outid, 'w'))
    pickle.dump(y_lin, open(outclss, 'w'))
def train_and_test(domain_dir, sentences):
    train_dir = os.path.join(domain_dir, "train")
    test_dir = os.path.join(domain_dir, "test")
    X_train, y_train = load_svmlight_file(os.path.join(train_dir, "feature_vector"))
    X_test, y_test = load_svmlight_file(os.path.join(test_dir, "feature_vector"))
    clf = LogisticRegression(C=1.0, intercept_scaling=1, dual=False,
                             fit_intercept=True, penalty="l2", tol=0.0001)
    print("fit..")
    clf.fit(X_train, y_train)
    print("fit end...")
    y_train_predict = clf.predict(X_train)
    print(f1_score(y_train, y_train_predict))
    y = clf.predict(X_test)
    f = open(os.path.join(test_dir, "relation.classifier"), "w", encoding="utf8")
    i = 0
    for sentence in sentences:
        flag = False
        str_list = []
        str_list.append("S\t{0}".format(sentence.text))
        for pair in sentence.candidate_relation:
            if y[i] != 0:
                flag = True
                str_list.append("R\t{0}\t{1}\t{2}\t{3}".format(
                    sentence.print_phrase(pair[0]).lower(),
                    sentence.print_phrase(pair[1]).lower(),
                    list(pair[0]), list(pair[1])))
            i += 1
        if flag:
            for s in str_list:
                print(s, file=f)
    f.close()
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_fold=5):
    feature_name = os.path.basename(train_file)[:-10]
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='esb_xg_grid_colsub_{}.log'.format(feature_name))

    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)

    xg = xgb.XGBClassifier()
    param = {'learning_rate': [.01, .03, .05],
             'max_depth': [4, 5, 6],
             'n_estimators': [400, 600]}
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)
    clf = GridSearchCV(xg, param, scoring='log_loss', verbose=1, cv=cv)

    logging.info('Cross validation for grid search...')
    clf.fit(X, y)
    p = clf.predict_proba(X)[:, 1]
    logging.info('best model = {}'.format(clf.best_estimator_))
    logging.info('best score = {:.4f}'.format(clf.best_score_))

    logging.info('Retraining with 100% data...')
    clf.best_estimator_.fit(X, y)
    p_tst = clf.best_estimator_.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def check_data_compatibility(self):
    try:
        load_svmlight_file(self.input_path)
        return True
    except Exception as ex:
        print ex.message
        return False
def load(self, dataset=None, data_dir="/home/drunkeneye/lab/data", verbose=None):
    if verbose is None:
        verbose = self.verbose
    if dataset is None:
        dataset = self.name

    # first try to load the data 'directly'
    try:
        filePath = os.path.join(data_dir, dataset, dataset)
        if verbose:
            print("  Trying to load data set from {}".format(filePath))
        self.X, self.y = load_svmlight_file(filePath)
        self.X = np.asarray(self.X.todense())
        if verbose:
            print("  Loaded from {}".format(filePath))
        return
    except:
        pass

    # next try
    try:
        filePath = os.path.join(data_dir, dataset, dataset + ".combined.scaled")
        if verbose:
            print("  Trying to load data set from {}".format(filePath))
        self.X, self.y = load_svmlight_file(filePath)
        self.X = np.asarray(self.X.todense())
        if verbose:
            print("  Loaded from {}".format(filePath))
        return
    except:
        pass
def run(train_fp, test_fp, pred_fp, key_fp):
    keys = []
    load(key_fp, keys)
    X_train, y_train = load_svmlight_file(train_fp)
    X_test, y_test = load_svmlight_file(test_fp)
    # dtrain = xgb.DMatrix(train_fp)
    # dtest = xgb.DMatrix(test_fp)
    params = {}
    with open("lr_reg.params", 'r') as f:
        params = json.load(f)
    print "[%s] [INFO] params: %s\n" % (t_now(), str(params))
    model = linear_model.Ridge(alpha=params['alpha'])
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # model = xgb.train(params, dtrain, params['n_round'])
    # model = xgb.train(params, dtrain, params['n_round'], obj=customed_obj_1)
    # pred = model.predict(dtest, ntree_limit=params['n_round'])
    # pred = model.predict(dtest)
    f = open(pred_fp, 'w')
    for i in range(len(keys)):
        f.write(keys[i] + "," + str(max(1.0, pred[i])) + "\n")
    f.close()
    return 0
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est, depth, n_fold=5):
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='rf_{}_{}.log'.format(n_est, depth))

    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)

    clf = RF(n_estimators=n_est, max_depth=depth, random_state=2015)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    lloss = 0.
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        lloss += log_loss(y[i_val], p_val[i_val])

    logging.info('Log Loss = {:.4f}'.format(lloss))

    logging.info('Retraining with 100% data...')
    clf.fit(X.todense(), y)
    p_tst = clf.predict_proba(X_tst.todense())[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def main():
    # svm_para = {'C': 10.0, 'kernel': 'rbf', 'gamma': 1.667, 'verbose': False}
    # svm_para = {'kernel': 'linear', 'verbose': False}
    # loading data
    # X_train, y_train = datasets.load_svmlight_file(r'./dataset/mnist_train_784_poly_8vr.dat')
    # X_train, y_train = datasets.load_svmlight_file(r'./dataset/covtype_tr_2vr.data')
    # svm_para = {'C': 10.0, 'kernel': 'rbf', 'gamma': 0.00002, 'tol': 0.01, 'verbose': False}

    # census
    svm_para = {"C": 10.0, "kernel": "rbf", "gamma": 1.667, "verbose": False}
    X_train, y_train = datasets.load_svmlight_file(r"./dataset/census.train")

    # test random sampling
    RS_SVM = RandomSamplingSVM(svm_para)
    start_time = time.time()
    model = RS_SVM.train_one_half_v2(X_train, y_train)
    print("--- %s seconds ---" % (time.time() - start_time), flush=True)
    if model is None:
        print("Can not train the dataset", flush=True)
    else:
        # only touch model attributes after the None check
        print("Remain SVs: " + str(model.n_support_), flush=True)
        # X_test, y_test = datasets.load_svmlight_file(r'./dataset/mnist_test_784_poly_8vr.dat')
        # X_test, y_test = datasets.load_svmlight_file(r'./dataset/covtype_tst_2vr.data')
        X_test, y_test = datasets.load_svmlight_file(r"./dataset/census.train")
        ratio = model.score(X_test, y_test)
        print(ratio)
        print("--- %s seconds ---" % (time.time() - start_time), flush=True)
def load_data(dataset1, dataset2=None, make_dense=False):
    """Loads the dataset(s) given in the svmlight / libsvm format

    **Parameters**

    * dataset1 (*str*) - Path to the file of the first dataset.
    * dataset2 (*str or None*) - If not None, path to the file of second dataset
    * make_dense (*boolean*) - Whether to return dense matrices instead of sparse ones

    **Returns**

    * (X_pool, X_test, y_pool, y_test) - Pool and test files if two files are provided
    * (X, y) - The single dataset
    """
    if dataset2:
        X_pool, y_pool = load_svmlight_file(dataset1)
        _, num_feat = X_pool.shape
        X_test, y_test = load_svmlight_file(dataset2, n_features=num_feat)
        if make_dense:
            X_pool = X_pool.todense()
            X_test = X_test.todense()
        return (X_pool, X_test, y_pool, y_test)
    else:
        X, y = load_svmlight_file(dataset1)
        if make_dense:
            X = X.todense()
        return X, y
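# A brief usage sketch for the loader above; the file names are hypothetical
# placeholders rather than files shipped with the original code.
X_pool, X_test, y_pool, y_test = load_data("pool.svmlight", "test.svmlight")
print(X_pool.shape, X_test.shape)  # the second file is loaded with the same n_features

X, y = load_data("single.svmlight", make_dense=True)  # dense matrices instead of CSR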
def svm():
    # load data
    x_train, y_train = load_svmlight_file("12trainset")
    x_train = x_train.todense()  # assign the result; the bare call is a no-op
    x_test, y_test = load_svmlight_file("12testdata")
    x_test = x_test.todense()

    sk = SelectKBest(f_classif, 9).fit(x_train, y_train)
    x_new = sk.transform(x_train)
    x_newtest = sk.transform(x_test)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())

    # classifier
    clf = SVC(C=2, gamma=2)
    ovrclf = OneVsRestClassifier(clf, -1)
    ovrclf.fit(x_train, y_train)
    y_pred = ovrclf.predict(x_test)

    # write result
    with open("result.txt", "w") as fw:
        for st in y_pred.tolist():
            fw.write(str(st) + '\n')
    print(np.array(y_pred).shape)

    target_names = ['0', '1', '2', '3']
    # result
    # sum_y = np.sum((np.array(y_pred) - np.array(y_test))**2)
    # print(classification_report(y_test, y_pred, target_names=target_names))
    # print("sougouVal: ", float(sum_y) / y_pred.shape[0])
    print(time.time() - start_time)
def test_load_compressed():
    X, y = load_svmlight_file(datafile)

    with NamedTemporaryFile(prefix="sklearn-test", suffix=".gz") as tmp:
        tmp.close()  # necessary under windows
        with open(datafile, "rb") as f:
            with gzip.open(tmp.name, "wb") as fh_out:
                shutil.copyfileobj(f, fh_out)
        Xgz, ygz = load_svmlight_file(tmp.name)
        # because we "close" it manually and write to it,
        # we need to remove it manually.
        os.remove(tmp.name)
    assert_array_almost_equal(X.toarray(), Xgz.toarray())
    assert_array_almost_equal(y, ygz)

    with NamedTemporaryFile(prefix="sklearn-test", suffix=".bz2") as tmp:
        tmp.close()  # necessary under windows
        with open(datafile, "rb") as f:
            with BZ2File(tmp.name, "wb") as fh_out:
                shutil.copyfileobj(f, fh_out)
        Xbz, ybz = load_svmlight_file(tmp.name)
        # because we "close" it manually and write to it,
        # we need to remove it manually.
        os.remove(tmp.name)
    assert_array_almost_equal(X.toarray(), Xbz.toarray())
    assert_array_almost_equal(y, ybz)
def train_predict_lr_cv(train_file, test_file, predict_train_file,
                        predict_test_file, c, n_fold=10):
    logger.info("Reading in the training data")
    X_trn, y_trn = load_svmlight_file(train_file)
    X_trn = X_trn.todense()

    logger.info("Reading in the test data")
    X_tst, _ = load_svmlight_file(test_file)
    X_tst = X_tst.todense()

    logger.info('Normalizing data')
    scaler = StandardScaler()
    X_trn = scaler.fit_transform(X_trn)
    X_tst = scaler.transform(X_tst)

    cv = cross_validation.StratifiedKFold(y_trn, n_folds=n_fold, shuffle=True,
                                          random_state=1)

    yhat_tst = np.zeros((X_tst.shape[0], ))
    yhat_trn = np.zeros((X_trn.shape[0], ))
    for i, (i_trn, i_val) in enumerate(cv, start=1):
        logger.info('Training CV #{}'.format(i))
        clf = LogisticRegression(C=c, class_weight=None, random_state=2013)
        clf.fit(X_trn[i_trn], y_trn[i_trn])

        yhat_trn[i_val] = clf.predict_proba(X_trn[i_val])[:, 1]
        yhat_tst += np.array(clf.predict_proba(X_tst)[:, 1]) / n_fold

    auc_cv = metrics.roc_auc_score(y_trn, yhat_trn)
    logger.info('AUC CV: {}'.format(auc_cv))

    logger.info("Writing test predictions to file")
    np.savetxt(predict_train_file, yhat_trn, fmt='%.6f', delimiter=',')
    np.savetxt(predict_test_file, yhat_tst, fmt='%.6f', delimiter=',')
def loadData():
    data1, target = load_svmlight_file('dataset/text.scale')
    data2, target = load_svmlight_file('dataset/following.scale')
    data1, data2, target = shuffle(data1, data2, target)
    return (data1, data2, target)
def load_covtype():
    try:
        x, y = da.load_svmlight_file("data/covtype/covtype.sample04_train", 54)
        x_test, y_test = da.load_svmlight_file("data/covtype/covtype.sample04_test", 54)
    except Exception:
        x, y = da.load_svmlight_file("../data/covtype/covtype.sample04_train", 54)
        x_test, y_test = da.load_svmlight_file("../data/covtype/covtype.sample04_test", 54)
    return x, x_test, y, y_test
def drop_fn(train_file, drop_file):
    x_train, y_train = load_svmlight_file(train_file)
    x_fn, y_fn = load_svmlight_file(drop_file, n_features=x_train.shape[1])
    iterations = 0
    while 1:
        print 'iteration:%d' % iterations
        iterations += 1
        train_set = update_model((x_train, y_train), (x_fn, y_fn))
def nn_classify():
    # train_X, Y = load_svmlight_file('data/train_metrix')
    # rows = pd.read_csv('data/log_test2.csv', index_col=0).sort_index().index.unique()
    # train_X = pd.read_csv('data/train_tfidf.csv', index_col=0)
    # test_X = pd.read_csv('data/test_tfidf.csv', index_col=0)
    # select = SelectPercentile(f_classif, percentile=50)
    # select.fit(train_X, Y)
    # train_X = select.transform(train_X)
    # test_X = select.transform(test_X)
    # print 'dump train...'
    # dump_svmlight_file(train_X, Y, 'data/train_last')
    # test_Y = [0] * (test_X.shape[0])
    # print 'dump test...'
    # dump_svmlight_file(test_X, test_Y, 'data/test_last')
    train_X, Y = load_svmlight_file('data/train_last')
    test_X, test_Y = load_svmlight_file('data/test_last')
    train_X = train_X.toarray()
    test_X = test_X.toarray()
    Y = [int(y) - 1 for y in Y]
    print 'Y:', len(Y)
    rows = pd.read_csv('data/log_test2.csv', index_col=0).sort_index().index.unique()
    train_n = train_X.shape[0]
    m = train_X.shape[1]
    test_n = test_X.shape[0]
    print train_n, m,  # test_n
    train_data = ClassificationDataSet(m, 1, nb_classes=12)
    test_data = ClassificationDataSet(m, 1, nb_classes=12)
    # test_data = ClassificationDataSet(test_n, m, nb_classes=12)
    for i in range(train_n):
        train_data.addSample(np.ravel(train_X[i]), Y[i])
    for i in range(test_n):
        test_data.addSample(test_X[i], test_Y[i])  # use the test labels, not the train ones
    trndata = train_data
    # tstdata = train_data
    trndata._convertToOneOfMany()
    # tstdata._convertToOneOfMany()
    test_data._convertToOneOfMany()
    # first train all the classifiers on the training set
    print 'train classify...'
    fnn = buildNetwork(trndata.indim, 400, trndata.outdim, outclass=SoftmaxLayer)
    trainer = BackpropTrainer(fnn, dataset=trndata, momentum=0.1,
                              learningrate=0.01, verbose=True, weightdecay=0.01)
    trainer.trainEpochs(3)
    # print 'Percent Error on Test dataset: ', percentError(
    #     trainer.testOnClassData(dataset=tstdata))
    print 'end train classify'
    pre_y = trainer.testOnClassData(dataset=trndata)
    print metrics.classification_report(Y, pre_y)
    pre_y = trainer.testOnClassData(dataset=test_data)
    print 'write result...'
    print 'before:', pre_y[:100]
    pre_y = [int(y) + 1 for y in pre_y]
    print 'after:', pre_y[:100]
    DataFrame(pre_y, index=rows).to_csv('data/info_test2.csv', header=False)
    print 'end...'
def load_train_test():
    current_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    train, train_labels = load_svmlight_file(current_path + "/Data/Processed/TrainSet.svm")
    test, test_labels = load_svmlight_file(current_path + "/Data/Processed/TestSet.svm")
    return train, test, train_labels, test_labels
def test_dump():
    X_sparse, y_dense = load_svmlight_file(datafile)
    X_dense = X_sparse.toarray()
    y_sparse = sp.csr_matrix(y_dense)

    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]

    for X in (X_sparse, X_dense, X_sliced):
        for y in (y_sparse, y_dense, y_sliced):
            for zero_based in (True, False):
                for dtype in [np.float32, np.float64, np.int32]:
                    f = BytesIO()
                    # we need to pass a comment to get the version info in;
                    # LibSVM doesn't grok comments so they're not put in by
                    # default anymore.

                    if (sp.issparse(y) and y.shape[0] == 1):
                        # make sure y's shape is: (n_samples, n_labels)
                        # when it is sparse
                        y = y.T

                    dump_svmlight_file(X.astype(dtype), y, f, comment="test",
                                       zero_based=zero_based)
                    f.seek(0)

                    comment = f.readline()
                    comment = str(comment, "utf-8")
                    assert_in("scikit-learn %s" % sklearn.__version__, comment)

                    comment = f.readline()
                    comment = str(comment, "utf-8")
                    assert_in(["one", "zero"][zero_based] + "-based", comment)

                    X2, y2 = load_svmlight_file(f, dtype=dtype,
                                                zero_based=zero_based)
                    assert_equal(X2.dtype, dtype)
                    assert_array_equal(X2.sorted_indices().indices, X2.indices)

                    X2_dense = X2.toarray()
                    if dtype == np.float32:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(X_dense.astype(dtype), X2_dense, 4)
                        assert_array_almost_equal(y_dense.astype(dtype), y2, 4)
                    else:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(X_dense.astype(dtype), X2_dense, 15)
                        assert_array_almost_equal(y_dense.astype(dtype), y2, 15)
def test_lambdarank(self):
    X_train, y_train = load_svmlight_file(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.train'))
    X_test, y_test = load_svmlight_file(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.test'))
    q_train = np.loadtxt(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.train.query'))
    q_test = np.loadtxt(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     '../../examples/lambdarank/rank.test.query'))
    gbm = lgb.LGBMRanker()
    gbm.fit(X_train, y_train, group=q_train,
            eval_set=[(X_test, y_test)], eval_group=[q_test],
            eval_at=[1, 3], early_stopping_rounds=5, verbose=False,
            callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_iter=100, hidden=4, lrate=.1, n_fold=5):
    _, y_val = load_svmlight_file(train_file)
    cv = StratifiedKFold(y_val, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y_val)
    lloss = 0.
    for i_trn, i_val in cv:
        clf = NN(n=10000, h=hidden, a=lrate, seed=2015)
        logging.info('Epoch\tTrain\tValid')
        logging.info('=========================')
        for i_iter in range(n_iter):
            lloss_trn = 0.
            cnt_trn = 0
            for i, (x, y) in enumerate(clf.read_sparse(train_file)):
                if i in i_val:
                    p_val[i] = clf.predict(x)
                else:
                    p = clf.predict(x)
                    clf.update(x, p - y)
                    lloss_trn += logloss(y, p)
                    cnt_trn += 1
            lloss_trn /= cnt_trn
            lloss_val = log_loss(y_val[i_val], p_val[i_val])
            if (i_iter == 0) or ((i_iter + 1) % int(n_iter / 10) == 0) or (i_iter == n_iter - 1):
                logging.info('#{:4d}\t{:.4f}\t{:.4f}'.format(i_iter + 1, lloss_trn, lloss_val))
        lloss += lloss_val

    logging.info('Log Loss = {:.4f}'.format(lloss / n_fold))

    logging.info('Retraining with 100% data...')
    clf = NN(n=10000, h=hidden, a=lrate, seed=2015)
    for i_iter in range(n_iter):
        for x, y in clf.read_sparse(train_file):
            p = clf.predict(x)
            clf.update(x, p - y)
        logging.info('#{:4d}'.format(i_iter + 1))

    _, y_tst = load_svmlight_file(test_file)
    p_tst = np.zeros_like(y_tst)
    for i, (x, _) in enumerate(clf.read_sparse(test_file)):
        p_tst[i] = clf.predict(x)

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
def test_load_svmlight_file_multilabel():
    X, y = load_svmlight_file(multifile, multilabel=True)
    assert_equal(y, [(0, 1), (2,), (1, 2)])
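# For context, a minimal sketch of the multilabel svmlight convention the test
# above relies on: each line carries a comma-separated (possibly empty) list of
# labels before the feature:value pairs, and y comes back as one tuple of
# labels per sample. The in-memory file here is an illustration, not the
# `multifile` fixture used in the test.
from io import BytesIO
from sklearn.datasets import load_svmlight_file

data = BytesIO(b"0,1 2:2.5 10:-5.2\n2 5:1.0\n1,2 20:27\n")
X, y = load_svmlight_file(data, multilabel=True, zero_based=True)
print(y)  # [(0.0, 1.0), (2.0,), (1.0, 2.0)]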
    return train_face, train_face_number, test_face, test_face_number  # tuple


def resizeSVHDShape(matrix):
    svhd = np.zeros((5000, 3072))
    [rows, cols] = svhd.shape
    for r in range(rows):
        for c in range(cols):
            svhd[r][c] = matrix[int((c % 1024) / 32)][(c % 1024) % 32][int(c / 1024)][r]
    return svhd


if __name__ == "__main__":
    x_train2, y_train2 = ds.load_svmlight_file(
        'F:/projects/vec2vec/data-clear-xlren/data-clear/movie/train.bow')
    x_train = x_train2.toarray()
    y_train = y_train2
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    x_train = x_train[0:2000, :]
    y_train = y_train[0:2000]
    models = []
    emb_size = 64
    num_neighbors = 16
    print(x_train.shape)
def sensorless(path_sensorless):
    from sklearn.datasets import load_svmlight_file
    data = load_svmlight_file(path_sensorless + "/Sensorless.scale")
    dense_vector = np.zeros((data[0].shape[0], data[0].shape[1]))
    data[0].toarray(out=dense_vector)
    return dense_vector, data[1]
# kafkaproducer.start()
# import os
# import shutil
# path = r'/home/sunbite/video/action_youtube_naudio'
# for dirpath, dirnames, filenames in os.walk(path):
#     for filename in filenames:
#         print(os.path.join(dirpath, filename))
# my_feature = GetFeatures.get_features('v_shooting_01_01_0.jpg')
# print(my_feature)
# ssh = paramiko.SSHClient()
# ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# ssh.connect(hostname='10.3.11.131', username='******', password='******')
# stdin, stdout, stderr = ssh.exec_command('who')
# print()
# print(stdout.read())
# vd = VideoDetector.VideoDetector("/home/sunbite/video/action_youtube_naudio/basketball/v_shooting_01_01.avi",
#                                  "/home/sunbite/Co_KNN_SVM_TMP/CoKNNSVM.model")
# print(vd.getLabel())
x, y = datasets.load_svmlight_file("/home/sunbite/dataset/dataset")
print(x.todense())
print(y)
train_x, test_x, train_y, test_y = train_test_split(
    x.todense(), y, test_size=0.2, random_state=42, shuffle=False)
print(len(train_x))
print(len(test_x))
print(len(train_y))
print(len(test_y))
X = ssp.hstack([
    # X,
    X_char_1,
    X_char_2,
    X_char_3,
    X_char_1_q,
    X_char_2_q,
    X_char_3_q,
    # X_char_4_5_6_q,
    # sim_char_2,
    # sim_char_3,
]).tocsr()

dump_svmlight_file(X, y, path + 'data.svm')
data, y_all = load_svmlight_file(path + 'data.svm')
y_all = y
data = X
del X
del y

X = data[:len_train]
y = y_all[:len_train]
X_t = data[len_train:]
del data
del y_all


def make_mf_lr(X, y, clf, X_test, n_round=3):
    n = X.shape[0]
    '''
def test_load_invalid_file():
    load_svmlight_file(invalidfile)
def get_data(): data = load_svmlight_file("housing_scale") return data[0], data[1]
def test_invalid_filename():
    load_svmlight_file("trou pic nic douille")
from sklearn.datasets import load_svmlight_file
feature_vectors, targets = load_svmlight_file("training_data_file.IDF")

import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif

classifiers = ['MultinomialNB', 'BernoulliNB', 'KNeighborsClassifier', 'SVC']
for n, clf in enumerate([MultinomialNB(), BernoulliNB(), KNeighborsClassifier(), SVC()]):
    chi2_scores = []
    mutual_info_scores = []
    for k in range(100, 5000, 100):
        X_new1 = SelectKBest(chi2, k=k).fit_transform(feature_vectors, targets)
        X_new2 = SelectKBest(mutual_info_classif, k=k).fit_transform(feature_vectors, targets)

        print(classifiers[n] + ' Chi Squared')
        scores = cross_val_score(clf, X_new1, targets, cv=5, scoring='f1_macro', verbose=0)
        chi2_scores.append(scores.mean())
        print("F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

        print(classifiers[n] + ' Mutual Information')
        scores = cross_val_score(clf, X_new2, targets, cv=5, scoring='f1_macro', verbose=0)
        mutual_info_scores.append(scores.mean())
        print("F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
from __future__ import absolute_import

import numpy as np
from sklearn.datasets import load_svmlight_file

from rrf import RRF

DATA_DIR = "example_data"
N_RUNS = 10

if __name__ == '__main__':
    print("=========== Test classification on cod-rna data ===========")
    # X in [-1, 1]
    X, y = load_svmlight_file(DATA_DIR + "/cod-rna.scale", n_features=8)
    X = X.toarray()

    score = np.zeros(N_RUNS)
    train_time = np.zeros(N_RUNS)
    test_time = np.zeros(N_RUNS)
    model = [None] * N_RUNS

    for r in range(N_RUNS):
        idx = np.random.permutation(X.shape[0])
        c = RRF(loss='hinge', task="classification",
                learning_rate=0.003, learning_rate_gamma=0.0001,
accuracy = []
train_accu = []
for i in range(n):
    binary = False
    if nclasses[i] == 2:
        binary = True
    feature_zbased = False
    if 'HIGGS' in datasets[i] or 'rna' in datasets[i]:
        feature_zbased = True
    model_name = '{}/models/rxgb/{}/{}_rxgb.model'.format(home, datasets[i], datasets[i])
    file_name = '{}/data/rxgb/{}_test.svm'.format(home, datasets[i])
    bst = xgb.Booster()
    bst.load_model(model_name)
    model = xgboost_wrapper(bst, binary=binary)
    test_data, test_labels = load_svmlight_file(file_name, n_features[i])
    test_data = test_data.toarray()
    test_labels = test_labels.astype('int')
    y = model.predict(test_data)
    temp = pd.DataFrame()
    temp['true'] = test_labels
    temp['pred'] = y
    correct = 0
    correct_classified = []
    for j in range(temp.shape[0]):
        if temp.iloc[j]['true'] == temp.iloc[j]['pred']:
            correct = correct + 1
            correct_classified.append(j)
    selected = random.sample(correct_classified, min(sample_size[i], correct))
    accu = correct / temp.shape[0]
    accuracy.append(accu)
from sklearn.datasets import load_svmlight_file
from sklearn.externals import joblib
from sklearn.svm import LinearSVC
import random
import pickle
import numpy as np

X_train, Y_train = load_svmlight_file("../datasets/news20/news20.binary.bz2")
print(X_train.shape)
print(Y_train.shape)

## randomly pick training instances
trainIndices = np.array(random.sample(range(0, 19996), 100))
allIndices = np.array([i for i in range(19996)])
## the rest are testing instances
testIndices = np.setdiff1d(allIndices, trainIndices)
print("testing number", testIndices.shape)

## loads training instances
trainInstances = X_train[trainIndices]
trainLabels = Y_train[trainIndices]
## loads testing instances
testInstances = X_train[testIndices]
testLabels = Y_train[testIndices]

## dumps the files
pickle.dump(testIndices, open("testIndices.pkl", "wb"))
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 29 15:29:57 2018

@author: MiaoWangqian
"""
from sklearn import datasets
from sklearn.model_selection import train_test_split
from scipy import sparse
import numpy as np
import time

#%%
filename = "D:/UCD_STA141C/hw1/news20.binary.bz2"
X, y = datasets.load_svmlight_file(filename)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_train = sparse.csr_matrix(y_train).T
y_test = sparse.csr_matrix(y_test).T
omega = np.random.randn(1355191).reshape(1355191, 1)
omega = sparse.csr_matrix(omega)
lamda = 1

#%%
# gradient
def h(x):
    return 1 / (1 + np.exp(x))

def Gradient(X, y, omega, lamda):
    a = np.array(sparse.csr_matrix.todense(X @ omega))
    b = np.array(sparse.csr_matrix.todense(y))
    g = X.T @ sparse.csr_matrix(-b * h(b * a)) + lamda * omega
def test_not_a_filename():
    # in python 3 integers are valid file opening arguments (taken as unix
    # file descriptors)
    load_svmlight_file(.42)
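# Hedged aside on the comment above: load_svmlight_file accepts a path string,
# an open file-like object, or an int (treated as a file descriptor on
# Python 3), which is why a float is the natural "not a filename" probe here.
# A sketch of the file-object form, with a placeholder path:
from sklearn.datasets import load_svmlight_file

with open("train.svmlight", "rb") as fh:  # must be opened in binary mode
    X, y = load_svmlight_file(fh)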
def get_data(name):
    data = load_svmlight_file(name)
    return data[0], data[1]
import math
import random
import numpy as np
import pandas as pd
from sympy import symbols, diff
from sklearn import datasets as ds
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file
import matplotlib.pyplot as plt

x, y = load_svmlight_file("F:/data/Experiment/exp1/housing_scale")
X = x.todense()  # transfer sparse matrix to dense matrix

# adding a column of ones ahead of the data (intercept term)
one = np.mat(np.ones((506, 1)))
X = np.c_[one, X]

# split the dataset
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)
y_train = (np.mat(y_train)).T
y_test = (np.mat(y_test)).T

# solve for the parameter w (closed-form least squares)
def calW(x, y):
    return (x.T * x).I * (x.T * y)

# loss function
def loss(x, y, w):
    return ((y - x * w).T * (y - x * w)) / 2
from sklearn import datasets
from scipy.spatial.distance import pdist, squareform
import scipy as scip
import numpy as np
import math

SPLICE_LOCATION = "/home/anirudhan/workspace/foudnations-of-machine-learning/hw2/libsvm-3.20/tools/splice_hw/"
DEGREE = 3

sigma = math.sqrt(1 / (2 * 0.03125))
negSigmaSq = sigma * sigma * -1

X, Y = datasets.load_svmlight_file(SPLICE_LOCATION + "splice_noise_train.txt.scale")
X = X.toarray()
gamma = 1.0 / X.shape[1]

pairwise_dists = squareform(pdist(X, 'euclidean'))
k_g = scip.exp(pairwise_dists**2 / negSigmaSq)
k_p = np.dot(X, X.T)
k_p = np.multiply(k_p, gamma)
k_p = np.power(k_p, DEGREE)
k_sum = k_g + k_p

Xt, Yt = datasets.load_svmlight_file(SPLICE_LOCATION + "splice_noise_test.txt.scale")
Xt = Xt.toarray()
def get_data(file_input, separator='\t'):
    if 'libsvm' not in file_input:
        file_input = other2libsvm(file_input, separator)
    data = datasets.load_svmlight_file(file_input)
    return data[0], data[1]
def test_load_zero_based():
    f = BytesIO(b"-1 4:1.\n1 0:1\n")  # bytes literal: BytesIO rejects str on Python 3
    load_svmlight_file(f, zero_based=False)
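# Hedged aside: with zero_based=False the indices are taken as one-based, so
# the literal index 0 in the buffer above is invalid and the call is expected
# to raise ValueError. Passing zero_based="auto" instead lets the loader infer
# the convention from the file contents; a minimal sketch:
from io import BytesIO
from sklearn.datasets import load_svmlight_file

f = BytesIO(b"-1 4:1.\n1 0:1\n")
X, y = load_svmlight_file(f, zero_based="auto")  # inferred as zero-based
print(X.shape)  # (2, 5): feature columns 0..4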
def get_malicious():
    data = load_svmlight_file(DIR_PREFIX + "adult.libsvm")
    return _scale(data[0].todense()), data[1]
def get_data(data):
    my_data = load_svmlight_file(data)
    return my_data[0], my_data[1]
def _get_X_y(dataset, multilabel, replace=False):
    """Load a LIBSVM dataset as sparse X and observation y/Y.
    If X and y already exists as npz and npy, they are not redownloaded unless
    replace=True."""

    # some files are compressed, some are not:
    if NAMES[dataset].endswith('.bz2'):
        stripped_name = NAMES[dataset][:-4]
    else:
        stripped_name = NAMES[dataset]

    ext = '.npz' if multilabel else '.npy'
    y_path = DATA_HOME / f"{stripped_name}_target{ext}"
    X_path = DATA_HOME / f"{stripped_name}_data"  # no ext to handle npy or npz
    if (replace or not y_path.exists()
            or not ((X_path.parent / (X_path.name + '.npy')).exists()
                    or (X_path.parent / (X_path.name + '.npz')).exists())):
        # above, do not use .with_suffix bc of datasets like a1a.t, where the
        # method would replace the .t by .npz
        tmp_path = DATA_HOME / stripped_name

        # Download the dataset
        source_path = DATA_HOME / NAMES[dataset]
        if not source_path.parent.exists():
            source_path.parent.mkdir(parents=True)
        download_libsvm(dataset, source_path, replace=replace)

        # decompress file only if it is compressed
        if NAMES[dataset].endswith('.bz2'):
            decompressor = BZ2Decompressor()
            print("Decompressing...")
            with open(tmp_path, "wb") as f, open(source_path, "rb") as g:
                for data in iter(lambda: g.read(100 * 1024), b''):
                    f.write(decompressor.decompress(data))
            source_path.unlink()

        n_features_total = N_FEATURES[dataset]

        print("Loading svmlight file...")
        with open(tmp_path, 'rb') as f:
            X, y = load_svmlight_file(f, n_features=n_features_total,
                                      multilabel=multilabel)

        tmp_path.unlink()

        # if X's density is more than 0.5, store it in dense format:
        if len(X.data) >= 0.5 * X.shape[0] * X.shape[1]:
            X = X.toarray(order='F')
            np.save(X_path, X)
        else:
            X = sparse.csc_matrix(X)
            X.sort_indices()
            sparse.save_npz(X_path, X)

        if multilabel:
            indices = np.array([lab for labels in y for lab in labels])
            indptr = np.cumsum([0] + [len(labels) for labels in y])
            data = np.ones_like(indices)
            Y = sparse.csr_matrix((data, indices, indptr))
            sparse.save_npz(y_path, Y)
            return X, Y
        else:
            np.save(y_path, y)
    else:
        try:
            X = sparse.load_npz(X_path.parent / (X_path.name + '.npz'))
        except FileNotFoundError:
            X = np.load(X_path.parent / (X_path.name + '.npy'))

        if multilabel:
            y = sparse.load_npz(y_path)
        else:
            y = np.load(y_path)

    return X, y
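# A short usage sketch under stated assumptions: NAMES, N_FEATURES, DATA_HOME
# and download_libsvm come from the surrounding module, and "a1a" is only a
# hypothetical key in NAMES, used for illustration.
X, y = _get_X_y("a1a", multilabel=False)    # downloads, parses, and caches
X2, y2 = _get_X_y("a1a", multilabel=False)  # second call loads from the cache
assert X.shape == X2.shape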
import numpy as np
from sklearn.datasets import load_svmlight_file

V, y = load_svmlight_file('U1.lsvm', 100)
U, y = load_svmlight_file('M1.lsvm', 100)
U = np.array(U.todense())
V = np.array(V.todense())
np.savetxt('U.csv', U, fmt='%f', delimiter=',', newline='\n')
np.savetxt('V.csv', V, fmt='%f', delimiter=',', newline='\n')
def load_data(path):
    x_train, y_train = datasets.load_svmlight_file(path)
    x_train = x_train.todense()  # assign the result; todense() does not modify in place
    return x_train, y_train
def classification(test_size):
    '''Perform classification on each of the features, using each of
    Naive Bayes and Logistic Regression. The test_size is given.
    '''
    test_features = [('./lab3_data/1000/train.tokens.svmlight', 'tokens', 0.1),
                     ('./lab3_data/1000/train.pos.svmlight', 'pos', 0.5),
                     ('./lab3_data/1000/train.ner.svmlight', 'ner', 0.5),
                     ('./lab3_data/1000/train.sent_length.svmlight', 'sent_length', 0.5)]
    all_acc, all_f1 = [], []
    # X_train_all, X_test_all, y_train_all, y_test_all = [], [], [], []
    for file, feature, alpha in test_features:
        X, y = load_svmlight_file(file)
        print(X.shape, y.shape)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        # Stacking for training on all features
        # X_train_all, X_test_all, y_train_all, y_test_all = hstack(X_train_all, X_train), hstack(X_test_all, X_test), hstack(y_train_all, y_train), hstack(y_test_all, y_test)

        # Classify and report on scores
        for classifier in ['nb', 'maxent']:
            clf = train_clf(X_train, y_train, classifier, alpha)
            y_pred = evaluate(clf, X_test)
            classifier_name = 'Naive Bayes' if classifier == 'nb' else 'Logistic Regression'
            print('=' * 53)
            print(classifier_name + ' on ' + feature)
            print('-' * 53)
            acc = compute_score(y_test, y_pred, score='acc')
            f1 = compute_score(y_test, y_pred, score='f1', average='macro')
            all_acc.append(acc)
            all_f1.append(f1)
            print('acc = ', acc)
            target_names = ['author ' + str(n) for n in range(20)]
            print(classification_report(y_test, y_pred, target_names=target_names))

    # Prepare a figure to display
    clf_types = ['NB tokens', 'LR tokens', 'NB POS', 'LR POS',
                 'NB NER', 'LR NER', 'NB Size', 'LR Size']
    fig, ax = plt.subplots()
    index = np.arange(len(all_f1))
    bar_width = 0.35
    opacity = 0.8
    rects1 = plt.bar(index, all_acc, bar_width, alpha=opacity, color='b', label='Acc')
    rects2 = plt.bar(index + bar_width, all_f1, bar_width, alpha=opacity, color='g', label='Macro-F1')
    plt.xlabel('Clf type')
    plt.ylabel('Scores')
    plt.title('Scores by clf type')
    plt.xticks(index + bar_width, [str(m) for m in clf_types])
    plt.legend()
    plt.show()
        split = split[:-1]
        for j, feature in enumerate(split):
            if j == 0:
                split[0] = int(split[0])
            else:
                split2 = feature.split(':')
                split[j] = float(split2[1])
        # print split
        Y_test_list.append(split[0])
        X_test_list.append(split[1:])

X_test = numpy.array(X_test_list)
Y_test = numpy.array(Y_test_list)
# print sum(numpy.isinf(X_train))

# Use load_svmlight_file
X_train, Y_train = load_svmlight_file("../feats/train_formatted.lsvm")
X_train = X_train.toarray()
X_test, Y_test = load_svmlight_file("../feats/test_formatted.lsvm")
X_test = X_test.toarray()
# print X_train

# LDA
clf = LDA()
clf.fit(X_train, Y_train)
lda_pred = clf.predict(X_test)
accuracy = sum(lda_pred == Y_test) / Y_test.size
print 'LDA Accuracy: ' + str(accuracy)

# QDA
clf = QDA()
clf.fit(X_train, Y_train)
def test_load_libsvm():
    datasets = {
        "eurlex-4k": {
            "file": os.path.join(TEST_DATA_PATH, "Eurlex/eurlex_test.txt"),
            "sklearn_args": {"multilabel": True, "zero_based": True,
                             "n_features": 5000, "offset": 1}
        },
        "amazonCat-13k": {
            "file": os.path.join(TEST_DATA_PATH, "AmazonCat/amazonCat_test.txt"),
            "sklearn_args": {"multilabel": True, "zero_based": True,
                             "n_features": 203882, "offset": 1}
        },
        "amazonCat-14k": {
            "file": os.path.join(TEST_DATA_PATH, "AmazonCat-14K/amazonCat-14K_test.txt"),
            "sklearn_args": {"multilabel": True, "zero_based": True,
                             "n_features": 597540, "offset": 1}
        },
        "wiki10-31k": {
            "file": os.path.join(TEST_DATA_PATH, "Wiki10/wiki10_test.txt"),
            "sklearn_args": {"multilabel": True, "zero_based": True,
                             "n_features": 101938, "offset": 1}
        },
    }

    for d, v in datasets.items():
        download_dataset(d, subset='test', format='bow', root=TEST_DATA_PATH)

        print("\n{} time comparison:".format(d))
        t_start = time()
        sk_X, sk_Y = load_svmlight_file(v["file"], **v["sklearn_args"])
        print("\tsklearn.datasets.load_svmlight_file time: {}s".format(time() - t_start))

        t_start = time()
        nxc_X1, nxc_Y_list = load_libsvm_file(v["file"], labels_format="list")
        print("\tnapkinXC.datasets.load_libsvm_file time: {}s".format(time() - t_start))

        t_start = time()
        nxc_X2, nxc_Y_csrm = load_libsvm_file(v["file"], labels_format="csr_matrix")
        print("\tnapkinXC.datasets.load_libsvm_file time: {}s".format(time() - t_start))

        assert np.array_equal(nxc_X1.indptr, nxc_X2.indptr)
        assert np.array_equal(nxc_X1.indices, nxc_X2.indices)
        assert np.array_equal(nxc_X1.data, nxc_X2.data)

        assert np.array_equal(nxc_X1.indptr, sk_X.indptr)
        assert np.array_equal(nxc_X1.indices, sk_X.indices)
        assert np.allclose(nxc_X1.data, sk_X.data)

        assert nxc_X1.shape[0] == nxc_Y_csrm.shape[0]
        assert len(nxc_Y_list) == len(sk_Y)
        for nxc_y, sk_y in zip(nxc_Y_list, sk_Y):
            assert len(nxc_y) == len(sk_y)
            assert all(y1 == y2 for y1, y2 in zip(nxc_y, sk_y))
        for i in range(1, len(data.columns)):
            plt.plot(data[data.columns[0]], data[data.columns[i]])
    else:
        x = range(len(data))
        plt.xticks(x, data[data.columns[0]], rotation='vertical')
        for i in range(1, len(data.columns)):
            plt.plot(x, data[data.columns[i]])
    plt.legend(data.columns[1:], loc='upper left')
    plt.xlabel(data.columns[0])
    plt.ylabel('Accuracy')
    plt.title('Accuracy plot for ' + fileName)
    plt.show()

# =================================== Main =======================================
file = 'vision_cuboids_histogram.txt.gz'
X_train, y_train = load_svmlight_file(gzip.open(path + "train\\" + file))
X_test, y_test = load_svmlight_file(gzip.open(path + "test\\" + file))
X_val, y_val = load_svmlight_file(gzip.open(path + "validation\\" + file))

X_train = X_train[y_train != 31]
X_test = X_test[y_test != 31]
X_val = X_val[y_val != 31]
y_train = y_train[y_train != 31]
y_test = y_test[y_test != 31]
y_val = y_val[y_val != 31]

tech = 'LinearSVC'
C = 0.5
X_train_new, X_test_new, X_val_new = featureSelection(X_train, X_test, X_val, y_train,
                                                      log=True, tech=tech, C=C)
data_df = pd.DataFrame()
n_guass = 2
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.cross_validation import KFold
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib import pyplot as plt

data, target = load_svmlight_file('data/E2006.train')

# To use the Lasso instead, change the following:
# from sklearn.linear_model import Lasso
# met = Lasso(alpha=0.1)
met = ElasticNet(alpha=0.1)

kf = KFold(len(target), n_folds=5)
pred = np.zeros_like(target)
for train, test in kf:
    met.fit(data[train], target[train])
    pred[test] = met.predict(data[test])

print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred)))
print('')
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
import time

cancer = load_svmlight_file('data/breast-cancer')
cancer_X = cancer[0].toarray()
cancer_y = cancer[1]
dna = load_svmlight_file('data/dna')
dna_X = dna[0].toarray()
dna_y = dna[1]

params_svm = {
    'scale': [True, False],
    'test_size': [0.1, 0.2, 0.3, 0.4, 0.5],
    'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'cancer_dim': [2, 3, 5, 8, 10],
    'dna_dim': [2, 5, 10, 20, 50, 100, 150, 180]
}
params_mlp = {
    'scale': [True, False],
    'test_size': [0.1, 0.2, 0.3, 0.4, 0.5],
    'layers': [(10,), (100,), (10, 10), (100, 100), (200, 200), (100, 200, 100)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],