Exemple #1
0
def main():
    """Fit every candidate classifier on two data sets and log test accuracy.

    The splits come from ``data_vector_sbs()`` and then
    ``data_vector_diff()``. The same four estimator instances are re-fitted
    for each data set; the validation split is unpacked but not used.
    """
    logging.basicConfig(level=logging.INFO)

    candidates = [LinearSVC(), BernoulliNB(), SVC(kernel='rbf'), SVC(kernel='poly')]

    sections = (
        ("SBS Vector TESTS =======================================", data_vector_sbs),
        ("DIFF Vector TESTS =======================================", data_vector_diff),
    )
    for banner, load_splits in sections:
        logging.info(banner)
        (train_x, train_y), (_val_x, _val_y), (test_x, test_y) = load_splits()
        for estimator in candidates:
            logging.info("training " + type(estimator).__name__)
            logging.info("param:" + str(estimator.get_params()))

            estimator.fit(train_x, train_y)
            predictions = estimator.predict(test_x)
            logging.info(scorer.accuracy_score(test_y, predictions))
def try_one():
    """Fit one LinearSVC on word-count vectors and log its test accuracy.

    The splits are produced by ``prep.data_vector_diff`` using a
    non-binary, word-level ``CountVectorizer``; the validation split is
    unpacked but not used.
    """
    vectorizer = CountVectorizer(binary=False, analyzer='word')
    (train_x, train_y), (_val_x, _val_y), (test_x, test_y) = \
        prep.data_vector_diff(vectorizer)

    svm = LinearSVC()
    svm.fit(train_x, train_y)

    accuracy = scorer.accuracy_score(test_y, svm.predict(test_x))
    logging.info("ACC: " + str(accuracy))
Exemple #3
0
def confusionMatrix(true_class, predicted_class):
    """Print accuracy and macro-averaged precision/recall/F-score.

    Args:
        true_class: ground-truth labels.
        predicted_class: predicted labels for the same samples.

    Returns:
        The result of ``confusion_matrix(true_class, predicted_class)``.
        The matrix used to be computed and then thrown away (the function
        returned ``None``); returning it makes the computation usable
        while leaving the printed output unchanged.
    """
    nf_mat = confusion_matrix(true_class, predicted_class)
    accuracy = accuracy_score(true_class, predicted_class)
    # NOTE(review): the 'con-mat' label is followed by the
    # precision/recall/F-score/support tuple, not by the confusion matrix.
    # The printed text is kept as-is so existing output does not change.
    print(
        'accuracy  ', accuracy, '  con-mat: ',
        precision_recall_fscore_support(true_class,
                                        predicted_class,
                                        average='macro'))
    return nf_mat
Exemple #4
0
def accu_score(y_pred, y_true):
    '''
    Accuracy of ``y_pred`` against ``y_true`` (original note: mission 1&2).

    Thin wrapper around ``accuracy_score``; note that this function takes
    the predictions first and the ground truth second.
    :param y_pred: predicted labels
    :param y_true: ground-truth labels
    :return: whatever ``accuracy_score`` returns for the two label sets
    '''
    return accuracy_score(y_true=y_true, y_pred=y_pred)
def shallow_classify_method():
    """
    Classify using TF-IDF features.

    Reads documents and labels with ``mr_read_files``, holds out 20% of
    them for testing, vectorizes the text with TF-IDF, then fits a
    LinearSVC and a LightGBM classifier in turn, printing each model's
    predictions and accuracy on the held-out part.
    :return: None
    """
    max_vocab = 5000
    documents, targets = mr_read_files()

    docs_train, docs_test, targets_train, targets_test = train_test_split(
        documents, targets, test_size=0.2, random_state=2)

    tfidf = TfidfVectorizer(max_df=0.5,
                            max_features=max_vocab,
                            min_df=2,
                            stop_words='english',
                            use_idf=True)
    # The vocabulary is learned from the training documents only.
    features_train = tfidf.fit_transform(docs_train)
    features_test = tfidf.transform(docs_test)

    # Linear SVM (original note: "svm 0.7018")
    svm = LinearSVC(C=10, max_iter=2000, verbose=1)
    svm.fit(features_train, targets_train)
    svm_pred = svm.predict(features_test)
    print(svm_pred)
    print(scorer.accuracy_score(svm_pred, targets_test))

    # LightGBM (original note: "gbm 0.6929")
    booster = lgb.LGBMClassifier(objective='binary',
                                 n_estimators=200,
                                 learning_rate=0.3,
                                 max_depth=4)
    # The held-out set doubles as the early-stopping evaluation set.
    booster.fit(features_train,
                targets_train,
                eval_set=[(features_test, targets_test)],
                eval_metric='binary_logloss',
                early_stopping_rounds=10,
                verbose=0)
    gbm_pred = booster.predict(features_test)
    print(gbm_pred)
    print(scorer.accuracy_score(gbm_pred, targets_test))
def all_diff():
    """Log LinearSVC and BernoulliNB test accuracy for every vectorization method.

    Iterates over ``vectorization_methods`` and, for each entry, obtains
    the splits from ``prep.data_vector_diff``, then fits two freshly
    created classifiers on the training split and scores them on the test
    split. The validation split is unpacked but not used.
    """
    for label, vectorizer in vectorization_methods.items():
        logging.info("Vectorization Method: " + label)
        (train_x, train_y), (_val_x, _val_y), (test_x, test_y) = \
            prep.data_vector_diff(vectorizer)

        for estimator in [LinearSVC(), BernoulliNB()]:
            logging.info("training " + type(estimator).__name__)
            estimator.fit(train_x, train_y)
            accuracy = scorer.accuracy_score(test_y, estimator.predict(test_x))
            logging.info("ACC: " + str(accuracy))
Exemple #7
0
          class_weight=None,
          coef0=0,
          decision_function_shape=None,
          degree=3,
          gamma='auto',
          kernel='rbf',
          max_iter=1000,
          probability=False,
          random_state=None,
          shrinking=True,
          tol=0.001,
          verbose=False).fit(x_train, y_train)
# Predict on the hold-out rows (everything from position `split` onward).
y_predict = reg.predict(x[split:])
# p_trend starts as zero for every row; only hold-out rows get predictions.
df = df.assign(p_trend=pd.Series(np.zeros(len(x))).values)
# Write through the frame positionally. The chained form
# df['p_trend'][split:] = ... can assign into a temporary copy and leave
# df itself unchanged.
df.iloc[split:, df.columns.get_loc('p_trend')] = y_predict
accuracy = scorer.accuracy_score(df['Signal'][split:], df['p_trend'][split:])
# Log return from this row's Open to the next row's Open (last row is NaN).
# The columns below are assigned directly; pre-filling them with zeros was
# redundant because each one is overwritten in full.
df['ret'] = np.log(df['Open'].shift(-1) / df['Open'])
# Return obtained by following the predicted trend.
df['ret1'] = df['p_trend'] * df['ret']
# Cumulative sums over the hold-out rows only; index alignment leaves the
# earlier rows as NaN.
df['cu_ret1'] = np.cumsum(df['ret1'][split:])
df['cu_ret'] = np.cumsum(df['ret'][split:])
# pd.expanding_std() no longer exists in pandas; the expanding-window
# accessor computes the same expanding standard deviation.
std = df['cu_ret1'].expanding().std()
sharpe = (df['cu_ret1'] - df['cu_ret']) / std
sharpe = sharpe[split:].mean()
print("\n\n ACCURACY :", accuracy)
plt.plot(df['cu_ret1'], color='b', label='Strategy Returns')
plt.plot(df['cu_ret'], color='g', label='Market Returns')
plt.figtext(0.14, 0.7, s='Sharpe ratio: %.2f' % sharpe)
Exemple #8
0
    def write_files(self,
                    output_results_path,
                    output_classifier_name="EE",
                    write_train=True,
                    write_test=True,
                    overwrite=False):
        """
        Write the ensemble's train and/or test predictions via
        ``write_results_to_uea_format``.

        Args:
            output_results_path: String - path to where output results will be written
            output_classifier_name: String - the name of the composite ensemble classifier in the output files
            write_train: boolean - true will write train files for the ensemble, false will skip training files
            write_test: boolean - true will write test files for the ensemble, false will skip test files
            overwrite: boolean - if true, any existing train/test files will be over-written. False prevents file overwriting

        Returns:
            None. When nothing is left to write (both flags false, or both
            files already present with ``overwrite`` false) a message is
            printed and the method returns without writing.
        """

        def fold_path(prefix):
            # File checked for the overwrite guard:
            # <root>/<classifier>/Predictions/<dataset>/<prefix><resample>.csv
            return (str(output_results_path) + "/" +
                    str(output_classifier_name) + "/Predictions/" +
                    str(self.dataset_name) + "/" + prefix +
                    str(self.resample_id) + ".csv")

        def write_split(split, probs, actual_class_vals):
            # Predicted class = class with the highest ensemble probability.
            preds = self.classes_[np.argmax(probs, axis=1)]
            acc = accuracy_score(actual_class_vals, preds)
            second = str(self.distance_measures)
            third = str(acc) + ",NA,NA,-1,-1," + str(len(
                self.classes_)) + "," + str(self.classes_)
            write_results_to_uea_format(
                second_line=second,
                third_line=third,
                output_path=output_results_path,
                classifier_name=output_classifier_name,
                resample_seed=self.resample_id,
                predicted_class_vals=preds,
                actual_probas=probs,
                dataset_name=self.dataset_name,
                actual_class_vals=actual_class_vals,
                split=split)

        # Flags are tested by truthiness throughout. The earlier mix of
        # `is False`, `is True` and plain truthiness let a truthy
        # non-True value skip the overwrite guard yet still be written.
        if not write_train and not write_test:
            print(
                "Train and test writing both set to false - method will terminate without doing anything"
            )
            return

        if not overwrite:
            if write_train:
                full_path = fold_path("trainFold")
                if os.path.exists(full_path):
                    # Only the message is printed; the `Warning` class
                    # object used to be passed to print() as well and
                    # showed up in the output.
                    print(
                        full_path +
                        " already exists and overwrite set to false, not writing Train"
                    )
                    write_train = False

            if write_test:
                full_path = fold_path("testFold")
                if os.path.exists(full_path):
                    print(
                        full_path +
                        " already exists and overwrite set to false, not writing Test"
                    )
                    write_test = False

        if not write_train and not write_test:
            print(
                "Train and test files both already exist and overwrite set to false - method will terminate without doing anything"
            )
            return

        if write_train:
            write_split('TRAIN', self.ee_train_dists,
                        self.actual_train_class_vals)

        if write_test:
            write_split('TEST', self.ee_test_dists,
                        self.actual_test_class_vals)
 def score(self, y_pred, y):
     """Return the normalized accuracy of ``y_pred`` against ``y``.

     Both arguments are converted with their ``.numpy()`` method before
     being handed to ``accuracy_score``, with ``y`` as the ground truth.
     """
     targets = y.numpy()
     predictions = y_pred.numpy()
     return accuracy_score(targets, predictions, normalize=True)
print(Df.head())
Df = Df.dropna()

X = Df[['Open','high','low','close']]

# The first 80% of the rows are used for fitting, the rest for evaluation.
t = .8
split = int(t*len(Df))

reg = SVC(C=1,cache_size=200,class_weight=None,coef0=0,decision_function_shape=None,degree=3,gamma='auto',kernel='rbf',max_iter=1000,probability=False,random_state=None,shrinking=True,tol=0.001,verbose=False)
# `y` is expected to be defined before this point in the script.
reg.fit(X[:split],y[:split])
y_predict = reg.predict(X[split:])

# P_Trend starts as zero for every row; only hold-out rows get predictions.
Df = Df.assign(P_Trend=pd.Series(np.zeros(len(X))).values)
# Write through the frame positionally. The chained form
# Df['P_Trend'][split:] = ... can assign into a temporary copy and leave
# Df itself unchanged.
Df.iloc[split:, Df.columns.get_loc('P_Trend')] = y_predict
accuracy = scorer.accuracy_score(Df['Signal'][split:],Df['P_Trend'][split:])

# Log return from this row's Open to the next row's Open (last row is NaN).
# The columns below are assigned directly; pre-filling them with zeros was
# redundant because each one is overwritten in full.
Df['Ret'] = np.log(Df['Open'].shift(-1)/Df['Open'])

# Return obtained by following the predicted trend.
Df['Ret1'] = Df['P_Trend']*Df['Ret']

# Cumulative sums over the hold-out rows only; index alignment leaves the
# earlier rows as NaN.
Df['Cu_Ret1'] = np.cumsum(Df['Ret1'][split:])

# Uses Df here: the previous `Dp` was an undefined name and raised NameError.
Df['Cu_Ret'] = np.cumsum(Df['Ret'][split:])


Exemple #11
0
import sys
from time import time

sys.path.append("../tools/")
from email_preprocess import preprocess

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

# Each timing line is built as a single string so that print(...) produces
# the same text under Python 2 and Python 3.
t0 = time()
gnb.fit(features_train, labels_train)
print("training time: %s s" % round(time() - t0, 3))

# This interval covers predict(), so it is labelled as prediction time
# (it was previously mislabelled "training time").
t1 = time()
predicted = gnb.predict(features_test)
print("prediction time: %s s" % round(time() - t1, 3))

# accuracy_score is imported from its public location; the
# sklearn.metrics.scorer module is private and absent from newer releases.
from sklearn.metrics import accuracy_score

print(accuracy_score(labels_test, predicted))

#########################################################
Exemple #12
0
    def fit(self, data_src, data_tar, onehot=False, plot = False):
        """
        Train on paired source/target batches, checkpointing along the way.

        A fresh TensorFlow graph is built on every call. The optimised loss
        is ``self.lamb * loss_mmd + mean(softmax cross-entropy on the source
        batch) + sum of the 'losses' collection``, where ``loss_mmd`` is
        chosen by ``self.kernel_type``. ``self.training_steps`` optimisation
        steps are run.

        :param data_src: source data; batches are drawn with
            ``data_src.dataset.train.next_batch`` and both samples and
            labels are used.
        :param data_tar: target data; batches are drawn the same way and the
            labels are discarded.
        :param onehot: when False, source labels are passed through
            ``self._transformlabel(ys, self.n_classes)`` before being fed;
            otherwise they are fed unchanged.
        :param plot: when True, a per-step MMD value and the source loss are
            recorded, then plotted, saved to ``./demo.png`` and shown.
        :return: None
        :raises ValueError: if ``self.kernel_type == 'rbf'`` and
            ``self.sigma_list`` is None.
        """
        # History buffers are only allocated when plotting is requested.
        loss_mmds = None
        loss_srcs = None
        iters = None
        if plot == True:
            loss_mmds = []
            loss_srcs = []
            iters = []
        with tf.Graph().as_default() as g:
            global_step = tf.Variable(0, trainable=False)
            # The input placeholders have fixed batch dimensions, so every fed
            # batch must contain exactly batch_size_src / batch_size_tar rows.
            X_src_placeholder = tf.placeholder(shape=[self.batch_size_src, self.input_dim], dtype=tf.float32, name='Xsrc')
            X_tar_placeholder = tf.placeholder(shape=[self.batch_size_tar, self.input_dim], dtype=tf.float32, name='Xtar')
            y_placeholder = tf.placeholder(tf.float32, [None, self.n_classes], name='y-input')

            output, hidden_src, hidden_tar = self.dann(X_src_placeholder, X_tar_placeholder)

            loss_y = tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=y_placeholder)
            # Select the discrepancy term; an unrecognised kernel_type yields a
            # constant zero, i.e. no discrepancy penalty.
            if self.kernel_type == 'linear':
                loss_mmd = self.linear_mmd2(hidden_src, hidden_tar)
            elif self.kernel_type == 'poly':
                loss_mmd = self.poly_mmd2(hidden_src, hidden_tar, self.kernel_param)
            elif self.kernel_type == 'rbf':
                if self.sigma_list is None:
                    raise ValueError("sigma list is None??")
                loss_mmd = self.mix_rbf_mmd2(hidden_src, hidden_tar, sigma_list=self.sigma_list)
            else:
                loss_mmd = tf.constant(0, dtype='float')
            loss_y_mean = tf.reduce_mean(loss_y)
            # Adds every tensor registered in the 'losses' collection.
            # NOTE(review): presumably regularisation terms registered inside
            # self.dann - confirm; tf.add_n needs at least one tensor there.
            loss = tf.constant(self.lamb) * loss_mmd + loss_y_mean + tf.add_n(tf.get_collection('losses'))

            # Any optimizer name other than 'GD', 'Adam' or 'Adg' falls back to
            # plain gradient descent.
            if self.optimizer == 'GD':
                train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss,global_step=global_step)
            elif self.optimizer == 'Adam':
                train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(loss,global_step=global_step)
            elif self.optimizer == 'Adg':
                train_step = tf.train.AdagradOptimizer(self.learning_rate).minimize(loss,global_step=global_step)
            else:
                train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss,
                                                                                            global_step=global_step)

            saver = tf.train.Saver()
            with tf.Session() as sess:
                tf.global_variables_initializer().run()
                for i in range(self.training_steps):
                    xs, ys = data_src.dataset.train.next_batch(self.batch_size_src)
                    xt, _ = data_tar.dataset.train.next_batch(self.batch_size_tar)

                    # Reshape each batch to (batch_size, input_dim) to match
                    # the placeholders.
                    xs = np.reshape(xs, (self.batch_size_src, self.input_dim))
                    xt = np.reshape(xt, (self.batch_size_tar, self.input_dim))

                    if onehot == False:
                        ys = self._transformlabel(ys, self.n_classes)
                    # One optimisation step; the loss values and network outputs
                    # for this batch are fetched in the same run.
                    _, loss_, loss_y_mean_, loss_mmd_, output_, hidden_src_, hidden_tar_ = sess.run([train_step, loss, loss_y_mean, loss_mmd, output,
                                                                           hidden_src, hidden_tar],
                                                              feed_dict={
                                                                  X_src_placeholder: xs,
                                                                  X_tar_placeholder: xt,
                                                                  y_placeholder: ys
                                                              })
                    # Accuracy on the current source training batch (argmax of
                    # the network output vs. argmax of the fed labels).
                    acc = accuracy_score(np.argmax(output_, 1),
                                         np.argmax(ys, 1))

                    if plot == True:
                        # The recorded MMD comes from self.mmd on the fetched
                        # hidden outputs, not from the loss_mmd tensor.
                        loss_mmds.append(self.mmd(hidden_src_, hidden_tar_))
                        loss_srcs.append(loss_y_mean_)
                        iters.append(i)
                    if i%self.print_step == 0:
                        print("After {} training steps\n loss_mmd on training batch is:{}"
                              "\n loss_y on training batch is:{}"
                              "\n loss on training batch is:{}".format(i, loss_mmd_, loss_y_mean_, loss_))
                        print("accuracy on train set:{}".format(acc))
                        print("mmd: ", self.mmd(hidden_src_, hidden_tar_))
                    # A checkpoint is also written at step 0. MODEL_SAVE_PATH and
                    # MODEL_NAME are defined outside this method.
                    if i%self.save_step == 0:
                        saver.save(sess, os.path.join(MODEL_SAVE_PATH, MODEL_NAME), global_step=global_step)

        if plot == True:
            # Both curves are scaled by their own maximum and share one x-axis.
            # NOTE(review): a maximum of zero would make these divisions fail
            # or produce NaN/inf - confirm the values are always positive.
            fig = plt.figure(figsize=(8, 4))
            ax1 = fig.add_subplot(111)
            p1, = ax1.plot(iters, np.array(loss_mmds) / max(loss_mmds), 'b-', label='mmd')
            ax1.set_ylabel('MMD')
            # NOTE(review): "Iters" is set as the plot title, not the x label.
            ax1.set_title("Iters")
            ax1.yaxis.label.set_color(p1.get_color())
            ax2 = ax1.twinx()
            p2, = ax2.plot(iters, np.array(loss_srcs) / max(loss_srcs), 'g--', label='src loss')
            ax2.set_ylabel("Loss SRC")
            ax2.yaxis.label.set_color(p2.get_color())
            plt.savefig('./demo.png')
            plt.show()