def _fit_and_log_accuracy(clfs, X_train, y_train, X_test, y_test):
    """Fit every classifier on the training split and log its test accuracy.

    For each classifier this logs its class name, its parameters, and then
    the accuracy of its predictions on ``X_test`` against ``y_test``.
    """
    for clf in clfs:
        logging.info("training " + type(clf).__name__)
        logging.info("param:" + str(clf.get_params()))
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        logging.info(scorer.accuracy_score(y_test, pred))


def main():
    """Benchmark four classifiers on the SBS and DIFF vector datasets.

    Configures INFO-level logging, then trains LinearSVC, BernoulliNB and
    two SVC variants (rbf and poly kernels) first on the splits returned by
    ``data_vector_sbs()`` and then on those returned by
    ``data_vector_diff()``, logging the test accuracy of each.

    The same estimator instances are re-fitted for the second dataset.
    The validation split returned by the loaders is not used.
    """
    logging.basicConfig(level=logging.INFO)
    clfs = [LinearSVC(), BernoulliNB(), SVC(kernel='rbf'), SVC(kernel='poly')]

    logging.info("SBS Vector TESTS =======================================")
    (X_train, y_train), (_X_val, _y_val), (X_test, y_test) = data_vector_sbs()
    _fit_and_log_accuracy(clfs, X_train, y_train, X_test, y_test)

    logging.info("DIFF Vector TESTS =======================================")
    (X_train, y_train), (_X_val, _y_val), (X_test, y_test) = data_vector_diff()
    _fit_and_log_accuracy(clfs, X_train, y_train, X_test, y_test)
def try_one():
    """Train one LinearSVC on word-count vectors and log its test accuracy.

    The splits come from ``prep.data_vector_diff`` using a non-binary,
    word-level ``CountVectorizer``. The validation split is unpacked but
    not used.
    """
    vectorizer = CountVectorizer(binary=False, analyzer='word')
    train_split, val_split, test_split = prep.data_vector_diff(vectorizer)
    X_train, y_train = train_split
    X_val, y_val = val_split
    X_test, y_test = test_split

    model = LinearSVC()
    model.fit(X_train, y_train)
    acc = scorer.accuracy_score(y_test, model.predict(X_test))
    logging.info("ACC: " + str(acc))
def confusionMatrix(true_class, predicted_class):
    """Print the accuracy and macro-averaged precision/recall/F-score.

    NOTE(review): despite the function name and the ' con-mat: ' label in
    the output, the value printed after the accuracy is the tuple returned
    by ``precision_recall_fscore_support(..., average='macro')``, not a
    confusion matrix. The confusion matrix that used to be computed here
    was never used, so that dead computation has been removed.

    :param true_class: ground-truth labels
    :param predicted_class: predicted labels
    :return: None (results are printed)
    """
    accuracy = accuracy_score(true_class, predicted_class)
    macro_scores = precision_recall_fscore_support(
        true_class, predicted_class, average='macro')
    print('accuracy ', accuracy, ' con-mat: ', macro_scores)
def accu_score(y_pred, y_true):
    '''
    Accuracy metric for missions 1 and 2.

    :param y_pred: predicted labels
    :param y_true: ground-truth labels
    :return: the value returned by ``accuracy_score`` for these labels
    '''
    return accuracy_score(y_true=y_true, y_pred=y_pred)
def shallow_classify_method():
    """
    Classify the data returned by ``mr_read_files`` using TF-IDF features.

    The data is split 80/20 (``random_state=2``), vectorised with a
    ``TfidfVectorizer`` capped at 5000 features, and then classified with a
    linear SVM and a LightGBM classifier. For each model the predictions
    and the test accuracy are printed.

    NOTE(review): the LightGBM model uses the test split as its
    early-stopping ``eval_set``, so its reported accuracy is measured on
    data that influenced training. Consider a separate validation split.

    :return: None (results are printed)
    """
    vocabulary_size = 5000
    data, labels = mr_read_files()
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.2, random_state=2)

    # Fit the vocabulary/IDF weights on the training documents only.
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=vocabulary_size,
                                 min_df=2,
                                 stop_words='english',
                                 use_idf=True)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    # svm 0.7018 (accuracy recorded by the original author)
    clf = LinearSVC(C=10, max_iter=2000, verbose=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(y_pred)
    # accuracy_score takes (y_true, y_pred); the value is the same either
    # way, but the documented order is used here.
    print(scorer.accuracy_score(y_test, y_pred))

    # gbm 0.6929 (accuracy recorded by the original author)
    gbm = lgb.LGBMClassifier(objective='binary',
                             n_estimators=200,
                             learning_rate=0.3,
                             max_depth=4)
    gbm.fit(X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric='binary_logloss',
            early_stopping_rounds=10,
            verbose=0)
    y_pred = gbm.predict(X_test)
    print(y_pred)
    print(scorer.accuracy_score(y_test, y_pred))
def all_diff():
    """Log LinearSVC and BernoulliNB test accuracy for every vectorizer.

    Iterates over the ``(name, vectorizer)`` pairs in
    ``vectorization_methods``, builds the splits with
    ``prep.data_vector_diff`` and trains fresh classifiers for each one.
    The validation split is unpacked but not used.
    """
    for method_name, vec_method in vectorization_methods.items():
        logging.info("Vectorization Method: " + method_name)

        train_split, val_split, test_split = prep.data_vector_diff(vec_method)
        X_train, y_train = train_split
        X_val, y_val = val_split
        X_test, y_test = test_split

        for clf in (LinearSVC(), BernoulliNB()):
            clf_name = type(clf).__name__
            logging.info("training " + clf_name)
            clf.fit(X_train, y_train)
            acc = scorer.accuracy_score(y_test, clf.predict(X_test))
            logging.info("ACC: " + str(acc))
class_weight=None, coef0=0, decision_function_shape=None, degree=3, gamma='auto', kernel='rbf', max_iter=1000, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False).fit(x_train, y_train) y_predict = reg.predict(x[split:]) df = df.assign(p_trend=pd.Series(np.zeros(len(x))).values) df['p_trend'][split:] = y_predict accuracy = scorer.accuracy_score(df['Signal'][split:], df['p_trend'][split:]) df = df.assign(ret=pd.Series(np.zeros(len(x))).values) df['ret'] = np.log(df['Open'].shift(-1) / df['Open']) df = df.assign(ret1=pd.Series(np.zeros(len(x))).values) df['ret1'] = df['p_trend'] * df['ret'] df = df.assign(cu_ret1=pd.Series(np.zeros(len(x))).values) df['cu_ret1'] = np.cumsum(df['ret1'][split:]) df = df.assign(cu_ret=pd.Series(np.zeros(len(x))).values) df['cu_ret'] = np.cumsum(df['ret'][split:]) std = pd.expanding_std(df['cu_ret1']) sharpe = (df['cu_ret1'] - df['cu_ret']) / std sharpe = sharpe[split:].mean() print("\n\n ACCURACY :", accuracy) plt.plot(df['cu_ret1'], color='b', label='Strategy Returns') plt.plot(df['cu_ret'], color='g', label='Market Returns') plt.figtext(0.14, 0.7, s='Sharpe ratio: %.2f' % sharpe)
def write_files(self,
                output_results_path,
                output_classifier_name="EE",
                write_train=True,
                write_test=True,
                overwrite=False):
    """Write the ensemble's train and/or test predictions to results files.

    Predictions are taken as the class with the highest value in
    ``self.ee_train_dists`` / ``self.ee_test_dists`` and are written
    through ``write_results_to_uea_format``.

    Args:
        output_results_path: String - path to where output results will be
            written
        output_classifier_name: String - the name of the composite ensemble
            classifier in the output files
        write_train: boolean - true will write train files for the
            ensemble, false will skip training files
        write_test: boolean - true will write test files for the ensemble,
            false will skip test files
        overwrite: boolean - if true, any existing train/test files will be
            over-written. False prevents file overwriting

    Returns:
        None. When nothing is left to write (both flags false, or the
        files already exist and ``overwrite`` is false) a message is
        printed and the method returns without writing.
    """
    if not write_train and not write_test:
        print(
            "Train and test writing both set to false - method will terminate without doing anything"
        )
        return

    def fold_file(fold_prefix):
        # Location this method checks for an existing results file of one
        # split ("train" or "test") of this dataset and resample.
        return "/".join([
            str(output_results_path),
            str(output_classifier_name),
            "Predictions",
            str(self.dataset_name),
            fold_prefix + "Fold" + str(self.resample_id) + ".csv",
        ])

    def write_split(split, probs, actual_class_vals):
        # Derive hard predictions from the per-class values, score them,
        # and hand everything to the shared results writer.
        preds = self.classes_[np.argmax(probs, axis=1)]
        acc = accuracy_score(actual_class_vals, preds)
        second = str(self.distance_measures)
        third = str(acc) + ",NA,NA,-1,-1," + str(len(
            self.classes_)) + "," + str(self.classes_)
        write_results_to_uea_format(
            second_line=second,
            third_line=third,
            output_path=output_results_path,
            classifier_name=output_classifier_name,
            resample_seed=self.resample_id,
            predicted_class_vals=preds,
            actual_probas=probs,
            dataset_name=self.dataset_name,
            actual_class_vals=actual_class_vals,
            split=split)

    if not overwrite:
        # Drop any split whose results file is already on disk.
        if write_train:
            full_path = fold_file("train")
            if os.path.exists(full_path):
                print(
                    full_path +
                    " already exists and overwrite set to false, not writing Train"
                )
                write_train = False

        if write_test:
            full_path = fold_file("test")
            if os.path.exists(full_path):
                print(
                    full_path +
                    " already exists and overwrite set to false, not writing Test"
                )
                write_test = False

        if not write_train and not write_test:
            print(
                "Train and test files both already exist and overwrite set to false - method will terminate without doing anything"
            )
            return

    if write_train:
        write_split('TRAIN', self.ee_train_dists,
                    self.actual_train_class_vals)

    if write_test:
        write_split('TEST', self.ee_test_dists, self.actual_test_class_vals)
def score(self, y_pred, y):
    """Return the accuracy of ``y_pred`` against the targets ``y``.

    Both arguments are converted with their ``.numpy()`` method before
    being passed to ``accuracy_score`` (targets first), with
    ``normalize=True``.
    """
    targets = y.numpy()
    predictions = y_pred.numpy()
    return accuracy_score(targets, predictions, normalize=True)
# Fit an RBF-kernel SVC on the first 80% of the rows and derive the
# predicted signal and return columns. `Df` and `y` must already be
# defined by the preceding part of the script.
print(Df.head())
Df = Df.dropna()
X = Df[['Open', 'high', 'low', 'close']]

# Positional split: first 80% of the rows for fitting, the rest for
# prediction.
t = .8
split = int(t * len(Df))

reg = SVC(C=1, cache_size=200, class_weight=None, coef0=0,
          decision_function_shape=None, degree=3, gamma='auto',
          kernel='rbf', max_iter=1000, probability=False,
          random_state=None, shrinking=True, tol=0.001, verbose=False)
reg.fit(X[:split], y[:split])
y_predict = reg.predict(X[split:])

# Build the prediction column as a plain array (zeros before the split)
# and attach it in one step. The previous chained assignment
# `Df['P_Trend'][split:] = ...` is unreliable and has no effect under
# pandas copy-on-write.
p_trend = np.zeros(len(X))
p_trend[split:] = y_predict
Df = Df.assign(P_Trend=p_trend)
accuracy = scorer.accuracy_score(Df['Signal'][split:], Df['P_Trend'][split:])

# Log return from this row's open to the next row's open.
Df['Ret'] = np.log(Df['Open'].shift(-1) / Df['Open'])
# Return scaled by the predicted signal.
Df['Ret1'] = Df['P_Trend'] * Df['Ret']

# Cumulative returns over the prediction rows only; index alignment
# leaves the rows before `split` as NaN.
Df['Cu_Ret1'] = np.cumsum(Df['Ret1'][split:])
# Fixed: this previously read `Dp['Ret']`, an undefined name.
Df['Cu_Ret'] = np.cumsum(Df['Ret'][split:])
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()


#########################################################
### your code goes here ###
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

# Single-argument print(...) calls behave the same under Python 2 and 3.
t0 = time()
gnb.fit(features_train, labels_train)
print("training time: {} s".format(round(time() - t0, 3)))

t1 = time()
predicted = gnb.predict(features_test)
# Fixed label: this interval measures prediction, not training.
print("prediction time: {} s".format(round(time() - t1, 3)))

# NOTE(review): `sklearn.metrics.scorer` is a private module path that newer
# scikit-learn releases no longer provide; `from sklearn.metrics import
# accuracy_score` is the public equivalent. Left as-is because other code in
# this file refers to the `scorer` name.
from sklearn.metrics import scorer
print(scorer.accuracy_score(labels_test, predicted))
#########################################################
def fit(self, data_src, data_tar, onehot=False, plot = False):
    """
    Train the network on source batches with an MMD penalty against target batches.

    A fresh TensorFlow graph is built on every call. The minimised loss is
    ``self.lamb * loss_mmd + mean softmax cross-entropy + sum of the 'losses'
    collection``, where ``loss_mmd`` is chosen by ``self.kernel_type``
    ('linear', 'poly' or 'rbf'; any other value gives a constant 0).

    :param data_src: source-domain data; batches are drawn with
        ``data_src.dataset.train.next_batch(self.batch_size_src)``
    :param data_tar: target-domain data; batches are drawn with
        ``data_tar.dataset.train.next_batch(self.batch_size_tar)`` and the
        labels are discarded
    :param onehot: when False, source labels are passed through
        ``self._transformlabel(ys, self.n_classes)`` before being fed
        (presumably converting them to one-hot -- confirm against that helper)
    :param plot: when True, the per-step ``self.mmd`` value and mean
        classification loss are recorded and, after training, plotted
        (each divided by its maximum), saved to ``./demo.png`` and shown
    :return: None; checkpoints are written via ``saver.save`` every
        ``self.save_step`` steps
    :raises ValueError: if ``self.kernel_type == 'rbf'`` and
        ``self.sigma_list`` is None
    """
    # Per-step histories are only allocated when plotting is requested.
    loss_mmds = None
    loss_srcs = None
    iters = None
    if plot == True:
        loss_mmds = []
        loss_srcs = []
        iters = []
    with tf.Graph().as_default() as g:
        global_step = tf.Variable(0, trainable=False)
        # Feature placeholders have a fixed batch dimension, so every fed
        # batch must contain exactly batch_size_src / batch_size_tar rows.
        X_src_placeholder = tf.placeholder(shape=[self.batch_size_src, self.input_dim], dtype=tf.float32, name='Xsrc')
        X_tar_placeholder = tf.placeholder(shape=[self.batch_size_tar, self.input_dim], dtype=tf.float32, name='Xtar')
        y_placeholder = tf.placeholder(tf.float32, [None, self.n_classes], name='y-input')
        # self.dann returns the logits plus one hidden tensor per domain.
        output, hidden_src, hidden_tar = self.dann(X_src_placeholder, X_tar_placeholder)
        loss_y = tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=y_placeholder)
        # Discrepancy term between the source and target hidden tensors.
        if self.kernel_type == 'linear':
            loss_mmd = self.linear_mmd2(hidden_src, hidden_tar)
        elif self.kernel_type == 'poly':
            loss_mmd = self.poly_mmd2(hidden_src, hidden_tar, self.kernel_param)
        elif self.kernel_type == 'rbf':
            if self.sigma_list is None:
                raise ValueError("sigma list is None??")
            loss_mmd = self.mix_rbf_mmd2(hidden_src, hidden_tar, sigma_list=self.sigma_list)
        else:
            # Unknown kernel type: no discrepancy penalty.
            loss_mmd = tf.constant(0, dtype='float')
        loss_y_mean = tf.reduce_mean(loss_y)
        # The 'losses' collection presumably holds regularisation terms
        # registered while building the network -- TODO confirm in self.dann.
        loss = tf.constant(self.lamb) * loss_mmd + loss_y_mean + tf.add_n(tf.get_collection('losses'))
        # Optimiser selection; unrecognised names fall back to plain
        # gradient descent.
        if self.optimizer == 'GD':
            train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss,global_step=global_step)
        elif self.optimizer == 'Adam':
            train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(loss,global_step=global_step)
        elif self.optimizer == 'Adg':
            train_step = tf.train.AdagradOptimizer(self.learning_rate).minimize(loss,global_step=global_step)
        else:
            train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss, global_step=global_step)
        saver = tf.train.Saver()
        with tf.Session() as sess:
            tf.global_variables_initializer().run()
            for i in range(self.training_steps):
                xs, ys = data_src.dataset.train.next_batch(self.batch_size_src)
                xt, _ = data_tar.dataset.train.next_batch(self.batch_size_tar)
                # Flatten each batch to (batch_size, input_dim) to match the
                # placeholder shapes.
                xs = np.reshape(xs, (self.batch_size_src, self.input_dim))
                xt = np.reshape(xt, (self.batch_size_tar, self.input_dim))
                if onehot == False:
                    ys = self._transformlabel(ys, self.n_classes)
                # One optimisation step; also fetch the values needed for
                # logging and plotting.
                _, loss_, loss_y_mean_, loss_mmd_, output_, hidden_src_, hidden_tar_ = sess.run(
                    [train_step, loss, loss_y_mean, loss_mmd, output, hidden_src, hidden_tar],
                    feed_dict={
                        X_src_placeholder: xs,
                        X_tar_placeholder: xt,
                        y_placeholder: ys
                    })
                # Accuracy on the current source batch (argmax of logits vs.
                # argmax of the fed labels).
                acc = accuracy_score(np.argmax(output_, 1), np.argmax(ys, 1))
                if plot == True:
                    loss_mmds.append(self.mmd(hidden_src_, hidden_tar_))
                    loss_srcs.append(loss_y_mean_)
                    iters.append(i)
                if i%self.print_step == 0:
                    print("After {} training steps\n loss_mmd on training batch is:{}"
                          "\n loss_y on training batch is:{}"
                          "\n loss on training batch is:{}".format(i, loss_mmd_, loss_y_mean_, loss_))
                    print("accuracy on train set:{}".format(acc))
                    print("mmd: ", self.mmd(hidden_src_, hidden_tar_))
                if i%self.save_step == 0:
                    saver.save(sess, os.path.join(MODEL_SAVE_PATH, MODEL_NAME), global_step=global_step)
    if plot == True:
        # Both curves are scaled by their own maximum and drawn on twin
        # y-axes. NOTE(review): max() raises ValueError if no training step
        # ran (empty history).
        fig = plt.figure(figsize=(8, 4))
        ax1 = fig.add_subplot(111)
        p1, = ax1.plot(iters, np.array(loss_mmds) / max(loss_mmds), 'b-', label='mmd')
        ax1.set_ylabel('MMD')
        ax1.set_title("Iters")
        ax1.yaxis.label.set_color(p1.get_color())
        ax2 = ax1.twinx()
        p2, = ax2.plot(iters, np.array(loss_srcs) / max(loss_srcs), 'g--', label='src loss')
        ax2.set_ylabel("Loss SRC")
        ax2.yaxis.label.set_color(p2.get_color())
        plt.savefig('./demo.png')
        plt.show()