def main(_):
    train_set = [path + '\\' + data_name + '\\' + data_name + '_TRAIN']
    test_set = [path + '\\' + data_name + '\\' + data_name + '_TEST']
    model_url = path + '\\' + 'cnn' + '\\' + data_name + '\\'
    best = 0.0
    data, label = get_data(train_set, data_set.length, data_set.classes_num,
                           batch_size, False)
    if retrain:
        # Wipe any previous checkpoints so the estimator starts fresh.
        shutil.rmtree(model_url, ignore_errors=True)
    model = Net()
    hps = {
        'learning_rate': learning_rate,
    }
    estimator = tf.estimator.Estimator(model.model_fn, model_url, params=hps)
    logging_hook = tf.train.LoggingTensorHook({}, every_n_iter=100, at_end=True)
    # Alternate train/evaluate rounds, tracking the best test accuracy seen.
    for _ in range(125):
        estimator.train(
            lambda: get_data(train_set, data_set.length, data_set.classes_num,
                             batch_size, True),
            hooks=[logging_hook], steps=steps)
        result = estimator.evaluate(
            lambda: get_data(test_set, data_set.length, data_set.classes_num,
                             data_set.test_size, False),
            steps=1)
        if best < result['accuracy']:
            best = result['accuracy']
    print('The best accuracy is', best)
    print('The best error is', 1 - best)
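# Note on the lambdas above: tf.estimator calls each input_fn with no
# arguments and expects a tf.data.Dataset yielding (features, labels), or the
# (features, labels) tensors directly. A minimal sketch of that contract,
# assuming in-memory numpy arrays x and y (hypothetical; the real get_data
# reads the on-disk *_TRAIN/*_TEST files):
def example_input_fn(x, y, batch_size, shuffle):
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(x)).repeat()
    return dataset.batch(batch_size)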
import pickle
from itertools import product

import numpy as np
import pandas as pd
import xgboost as xgb
# sklearn.cross_validation was removed in scikit-learn 0.20; the same classes
# now live in sklearn.model_selection.
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from submissions.submit import make_submission
from data_prepare import get_data, get_sub
from utils import Timer

with Timer():
    X_train, X_test, features = get_data()
print('Data loaded')

validate = False
RF = False

# Hyperparameter grid (single values here, but product() below supports lists).
depths = [10]
etas = [0.08]     # learning rate
alphas = [0.2]    # L1 weight penalty
num_ests = [100]  # number of boosting rounds; renamed so the loop variable
                  # below no longer shadows the list

log_file = open('./submissions/model_params.log', 'w+')
headers = ("eta \t alpha \t num_est \t depth \t loss_train \t "
           "loss_val \t loss_test\n")
log_file.write(headers)

for depth, eta, alpha, num_est in product(depths, etas, alphas, num_ests):
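    # Sketch of a plausible loop body (hypothetical: the original body is not
    # shown here, and y_train is assumed to come from data_prepare alongside
    # X_train): train a booster for this grid point and log its training loss.
    params = {'max_depth': depth, 'eta': eta, 'alpha': alpha,
              'objective': 'binary:logistic', 'eval_metric': 'logloss'}
    dtrain = xgb.DMatrix(X_train[features], label=y_train)
    bst = xgb.train(params, dtrain, num_boost_round=num_est)
    loss_train = log_loss(y_train, bst.predict(dtrain))
    log_file.write(f"{eta}\t{alpha}\t{num_est}\t{depth}\t{loss_train}\t-\t-\n")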
    batch_inputs = torch.zeros((batch_size, max_doc_len, max_sent_len),
                               dtype=torch.int64)
    batch_masks = torch.zeros((batch_size, max_doc_len, max_sent_len),
                              dtype=torch.int64)
    batch_labels = torch.LongTensor(doc_labels)
    # Copy each document's word ids into the padded tensor and mark real
    # (non-padding) positions in the mask.
    for b in range(batch_size):
        for sent_idx in range(doc_lens[b]):
            sent_data = batch_data[b][2][sent_idx]  # one sentence
            for word_idx in range(sent_data[0]):
                batch_inputs[b, sent_idx, word_idx] = sent_data[1][word_idx]
                batch_masks[b, sent_idx, word_idx] = 1
    if use_cuda:
        batch_inputs = batch_inputs.to(device)
        batch_masks = batch_masks.to(device)
        batch_labels = batch_labels.to(device)
    return (batch_inputs, batch_masks), batch_labels


if __name__ == '__main__':
    train_data, dev_data, test_data = get_data(fold_num, dev_fold)
    vocab = Vocab(train_data)
    model = Model(vocab)
    trainer = Trainer(model, vocab, train_data, dev_data, test_data)
    trainer.train()
    print(vocab._label2id)
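# For reference, the record layout the batching loop above assumes (field
# names here are descriptive guesses; the real structure is produced by
# get_data/Vocab upstream):
#   batch_data[b]  -> (doc_id, doc_label, sentences)
#   sentences[i]   -> (sent_len, word_ids)   # i.e. sent_data[0] / sent_data[1]
# A toy document under that layout:
example_doc = ('doc_0', 1, [(3, [12, 45, 7]), (2, [8, 99])])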
kline2_params = {
    'window': 256,
}
params_list.append(kline2_params)
func_list.append(feature_kline2)

label_by_multi_ma_params = {
    'window': [3, 5, 10]
}
params_list.append(label_by_multi_ma_params)
func_list.append(label_by_multi_ma)

construct_feature_func = partial(construct_features,
                                 params_list=params_list,
                                 func_list=func_list,
                                 test=True)
ohlcv_list = get_data(file_name="~/cs_market.csv", stks=['002277.XSHE'])
stk_features_list = construct_features_for_stocks(ohlcv_list,
                                                  construct_feature_func)
print(len(stk_features_list))
print(stk_features_list[0].columns)
# i_columns = ['ma_1', 'ma_2', 'ma_3', 'ma_5', 'ma_8', 'ma_13', 'ma_21',
#              'ma_34', 'ma_55']

f = stk_features_list[0]
f = f.reset_index().reset_index()  # expose the row number as an 'index' column
print(f.columns)

# Plot the close price and overlay the up/down labels as colored scatters.
fig, ax = plt.subplots(1, figsize=(21, 7))
f.loc[:, 'close'].plot(ax=ax)
f[f["label"] == -1].plot.scatter(x='index', y='close', s=15, c='green',
                                 ax=ax, label="down")
f[f["label"] == 1].plot.scatter(x='index', y='close', s=15, c='red',
                                ax=ax, label="up")
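# Sketch of the construct_features contract implied by the paired lists above
# (an assumption, not the actual implementation): each feature/labelling
# function is applied to the OHLCV frame with its matching params dict,
# accumulating new columns; the test flag presumably toggles train/test
# behaviour inside the real functions.
def construct_features_sketch(df, params_list, func_list, test=False):
    for func, params in zip(func_list, params_list):
        df = func(df, **params)
    return df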
def create_strategy(filename: str, columns_list: List[str], som_width: int,
                    som_height: int, n_iter: int, sigma=0.3,
                    learning_rate=0.01) -> tuple:
    """
    Creates a strategy which can be used in the testing part of the script.

    - reads preprocessed data already split into training and testing sets
    - trains the SOM model
    - calculates the mean profit per cluster in the training and testing datasets
    - gets the mean profits

    Arguments:
        filename: name of the file with data
        columns_list: list of columns which should be kept in the training data
        som_width: width of the SOM map
        som_height: height of the SOM map
        n_iter: number of iterations for the SOM map
        sigma: sigma parameter for the SOM map
        learning_rate: learning rate for the SOM map
    Returns:
        len(df_profit_per_cluster_train): number of clusters used in the training data
        len(df_profit_per_cluster_test): number of clusters used in the testing data
        buy_clusters_mean_profit_train: mean profit of buy clusters on the training data
        sell_clusters_mean_profit_train: mean profit of sell clusters on the training data
        buy_clusters_mean_profit_test: mean profit of buy clusters on the testing data
        sell_clusters_mean_profit_test: mean profit of sell clusters on the testing data
    """
    # get prepared data
    df, df_prepared, df_train, df_test, df_train_columns = get_data(
        filename, columns_list)

    # train som
    final_df_train, final_df_test = train_som(som_width, som_height, df,
                                              df_train, df_test,
                                              df_train_columns, n_iter,
                                              sigma=sigma,
                                              learning_rate=learning_rate)

    # get profit per cluster in the train and test datasets
    df_profit_per_cluster_train = get_profit_per_cluster(final_df_train)
    df_profit_per_cluster_test = get_profit_per_cluster(final_df_test)

    # get mean profit for the buy and sell classes in the training and testing datasets
    try:
        buy_clusters_mean_profit_train, buy_clusters_list, \
            sell_clusters_mean_profit_train, sell_clusters_list = \
            get_mean_profit_per_class_from_train_df(df_profit_per_cluster_train)
        buy_clusters_mean_profit_test, sell_clusters_mean_profit_test = \
            get_mean_profit_per_class_from_test_df(df_profit_per_cluster_test,
                                                   buy_clusters_list,
                                                   sell_clusters_list)
    # if the data was assigned to fewer than 3 clusters
    except Exception:
        buy_clusters_mean_profit_train, sell_clusters_mean_profit_train, \
            buy_clusters_mean_profit_test, sell_clusters_mean_profit_test = \
            None, None, None, None

    return len(df_profit_per_cluster_train), len(df_profit_per_cluster_test), \
        buy_clusters_mean_profit_train, sell_clusters_mean_profit_train, \
        buy_clusters_mean_profit_test, sell_clusters_mean_profit_test
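# Usage sketch (the filename and feature columns are hypothetical): evaluate
# one parameter setting and skip it when the clustering degenerated, which
# create_strategy signals by returning None for the profits.
(n_clusters_train, n_clusters_test,
 buy_train, sell_train,
 buy_test, sell_test) = create_strategy('prepared_data.csv',
                                        ['ret_1d', 'rsi_14'],
                                        som_width=8, som_height=8,
                                        n_iter=5000)
if buy_train is not None:
    print(f'train buy/sell mean profit: {buy_train:.4f} / {sell_train:.4f}')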
def create_final_strategy(filename: str, columns_list: List[str],
                          som_width=Config.som_width,
                          som_height=Config.som_height,
                          n_iter=Config.n_iter, sigma=Config.sigma,
                          learning_rate=Config.learning_rate) -> tuple:
    """
    Used for creating the final strategy (not for testing).

    - reads preprocessed data already split into training and testing sets
    - trains the SOM model
    - calculates the mean profit per cluster in the training dataset
    - gets the lists of sell and buy clusters

    Arguments:
        filename: name of the file with data
        columns_list: list of columns which should be kept in the training data
        som_width: width of the SOM map
        som_height: height of the SOM map
        n_iter: number of iterations for the SOM map
        sigma: sigma parameter for the SOM map
        learning_rate: learning rate for the SOM map
    Returns:
        final_df_train: training dataset
        final_df_test: testing dataset
        buy_clusters_list: list of buy clusters
        sell_clusters_list: list of sell clusters
    """
    print(f'Creating final strategy for parameters:\n'
          f'map_size: {som_width}x{som_height}\n'
          f'n_iter: {n_iter}\nsigma: {sigma}\nlr: {learning_rate}')

    # get prepared data
    df, df_prepared, df_train, df_test, df_train_columns = get_data(
        filename, columns_list)

    # train som
    final_df_train, final_df_test = train_som(som_width, som_height, df,
                                              df_train, df_test,
                                              df_train_columns, n_iter,
                                              sigma=sigma,
                                              learning_rate=learning_rate)

    # get profit per cluster in the train dataset
    df_profit_per_cluster_train = get_profit_per_cluster(final_df_train)
    assert len(df_profit_per_cluster_train) >= 3, \
        "Algorithm returned fewer than 3 clusters."

    # rank clusters by profit: the top third become buy clusters,
    # the bottom third become sell clusters
    df_profit_per_cluster = df_profit_per_cluster_train.sort_values(
        by='profit', ascending=False)
    group_size = int(len(df_profit_per_cluster) / 3)
    buy_clusters_list = list(df_profit_per_cluster.iloc[:group_size]['cluster'])
    sell_clusters_list = list(df_profit_per_cluster.iloc[-group_size:]['cluster'])

    return final_df_train, final_df_test, buy_clusters_list, sell_clusters_list
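# Usage sketch (filename and feature columns are hypothetical): build the
# final strategy, then mark each test row as buy (+1), sell (-1), or flat (0)
# depending on which cluster list its SOM cluster landed in. This assumes
# final_df_test carries a 'cluster' column, as get_profit_per_cluster implies.
final_train, final_test, buy_clusters, sell_clusters = create_final_strategy(
    'prepared_data.csv', ['ret_1d', 'rsi_14'])
final_test['signal'] = final_test['cluster'].map(
    lambda c: 1 if c in buy_clusters else (-1 if c in sell_clusters else 0))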
label_by_ma_price_params = {
    'window': 250,
    'next_ma_window': 3,
    'quantile_list': [0, 0.1, 0.3, 0.7, 0.9, 1]
}
params_list.append(label_by_ma_price_params)
func_list.append(label_by_ma_price)

construct_feature_func = partial(construct_features,
                                 params_list=params_list,
                                 func_list=func_list,
                                 test=True)
# Note: use a raw string (or forward slashes) for Windows paths so "\m" is
# not treated as an escape sequence.
data_set, reverse_func = get_data(
    file_name=r"E:\market_data/cs_market.csv",
    stks=zz500[200:205],
    construct_feature_func=construct_feature_func,
    split_dates=["2016-01-01", "2017-01-01"])

# Print the label distribution of each split to check for class imbalance.
for tag in ['train', 'validate', 'test']:
    data_set[tag]['label2'] = data_set[tag]['label'].map(reverse_func)
    labels = data_set[tag]['label2'].unique().tolist()
    labels.sort()
    print(tag)
    for label in labels:
        selected = data_set[tag][data_set[tag]['label2'] == label]
        print("{}: {}".format(label, len(selected) / len(data_set[tag])))

idx_slice = pd.IndexSlice
stks = data_set['train'].index.get_level_values('code').unique().tolist()
stks.sort()
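# Example use of the IndexSlice set up above (assuming a (date, code)
# MultiIndex ordering, which is a guess; only the existence of a 'code' level
# is known from get_level_values): pull one stock's rows from the training split.
one_stk = data_set['train'].loc[idx_slice[:, stks[0]], :]
print(one_stk.shape)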