def test(ind):
    """Evaluate the compiled individual on the test split and write one
    result per row to test.txt (same format as the original: value + newline)."""
    print("=> test")
    test_data = load_data(file='test', norm=True)
    func = toolbox.compile(expr=ind)
    lines = []
    for _, row in test_data.iterrows():
        features = row[1:].tolist()
        # print(f"{func(*features)}")
        lines.append(f"{func(*features)}\n")
    with open("test.txt", 'w') as f:
        f.write("".join(lines))
def validate(ind):
    """Evaluate the compiled individual on the validation split and write
    'calc: <prediction>, label: <truth>' lines to validation.txt."""
    print("=> validate")
    val_data = load_data(file='validate', norm=True)
    func = toolbox.compile(expr=ind)
    lines = []
    for _, row in val_data.iterrows():
        features = row[1:].tolist()
        label = row[0]
        lines.append(f"calc: {func(*features)}, label: {label}\n")
        # print(f"calc: {func(*features)}, label: {label}")
    with open("validation.txt", 'w') as f:
        f.write("".join(lines))
def Multi_stocktrain(num, file_name, days, time_stamp, model, epochs, batch_sizes):
    '''
    Fine-tune an existing model on several randomly chosen extra stocks
    (excluding the stock being predicted) to augment the training data.

    :param num: number of additional stock files to use as training data
    :param file_name: the stock file being predicted (excluded from sampling)
    :param days: predict the mean closing price over the next `days` days
    :param time_stamp: how many past days of data are used per sample
    :param model: the (already built) model to keep training
    :param epochs: training epochs per extra stock
    :param batch_sizes: batch size for model.fit
    :return: model — the further-trained model
    '''
    files = os.listdir('stock_data')
    files.remove(file_name)
    # BUG FIX: the original sampled indices from range(1, len(files)), which
    # could never select the first remaining file; sample filenames directly.
    for name in random.sample(files, num):
        print(name)
        # BUG FIX: use os.path.join instead of a hard-coded "\\" separator
        # so the path also works on non-Windows systems.
        file_path = os.path.join('stock_data', name)
        data = handle_data.load_data(file_path)
        data = handle_data.K_mean(data, days)
        scaler = MinMaxScaler(feature_range=(0, 1))
        train = scaler.fit_transform(data)
        x_train, y_train = [], []
        # BUG FIX: the original inner loop reused `i` and shadowed the outer
        # loop variable; use a distinct index name.
        for t in range(time_stamp, len(train) - days + 1):
            x_train.append(train[t - time_stamp:t])
            y_train.append(train[t, 0])
        x_train, y_train = np.array(x_train), np.array(y_train)
        model.fit(x_train, y_train, epochs=epochs, batch_size=batch_sizes, verbose=1)
    return model
6) constants are always floats? if they can be integers - how the gaussian noise works? 7) in mutation: arity preservation OR arity disruption? """
# take only features
# df = df.iloc[:, 1:]
# dataset = [{'x': x,
#             'y': y,
#             LABEL: z}
#            for x, y, z in [(3, 6, 16), (4, 12, 45), (5, 10, 48), (2, 9, 13.5)]
#            ]

# Module-level GP setup: load and normalize the dataset, then declare the
# terminal set (one Variable per feature title) and the operator set used
# to build expression trees.
data = normalize(load_data())
variables = [Variable(i) for i in titles]
operators = [PLUS, MULTIPLY, SUBTRACT, DIVIDE]  # SQUARED,
components = tuple(variables + operators)  # [Constant(range=(2, 6), integer=True)]

# Size limits for generated expression trees.
max_depth=10
max_nodes=30


class DomainChromosome(GPChromosome):
    """GPChromosome pre-configured with this module's components and tree-size
    limits, so callers can construct chromosomes without repeating the setup."""

    def __init__(self, components=components, max_depth=max_depth, max_nodes=max_nodes, **kwargs):
        super().__init__(components, max_depth=max_depth, max_nodes=max_nodes, **kwargs)
def Modle_preday(file_name, days, time_stamp, division): ''' :param file_name: 对应的是股票数据的文件名 :param days: 预测未来几天内的收盘价的平均值 :param time_stamp: 一共使用前多少天的数据进行预测 :param division: 把数据分成测试集和训练集的比例 :return: ''' #提取已经储存好的股票的历史数据 file_path = "stock_data\\" + file_name data = handle_data.load_data(file_path) data = handle_data.K_mean(data, days) # 划分训练集以及测试集 divide = division * data.shape[0] train = data[data.index <= divide] test = data[data.index > divide] # 数据归一化 scaler = MinMaxScaler(feature_range=(0, 1)) s_train = scaler.fit_transform(train) s_test = scaler.fit_transform(test) # 测试集与训练集 x_train, y_train = [], [] for i in range(time_stamp, len(train) - days + 1): x_train.append(s_train[i - time_stamp:i]) y_train.append(s_train[i, 0]) x_train, y_train = np.array(x_train), np.array(y_train) x_test, y_test = [], [] for i in range(time_stamp, len(s_test) - days + 1): x_test.append(s_test[i - time_stamp:i]) y_test.append(test.iloc[i, 0]) x_test = np.array(x_test) history = LossHistory() # 创建模型 epochs = 10 batch_size = 16 #model = Model_Lstm(x_train.shape[-1], x_train.shape[1]) model = Model_Create(x_train.shape[-1], x_train.shape[1]) model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, callbacks=[history]) # 训练网络 #Multi_stocktrain(2, file_name, days, time_stamp, model, 3, batch_size) #保存模型 model_file = "model_save\\" + file_name.strip('.csv') + ".h5" model.save(model_file) #使用测试集进行预测 predict_price = model.predict(x_test) scaler.fit_transform(pd.DataFrame(test['close'].values)) predict_price = scaler.inverse_transform(predict_price) # 模型效果指标 pre = predict_price.reshape(1, -1)[0] pre = pre.tolist() handle_data.calPerformance(y_test, pre) #计算预测误差绝对值的平均数 sum = 0 for i in range(0, len(y_test)): sum += abs(y_test[i] - pre[i]) err = sum / len(y_test) print(err) # 图像展示 dict_data = {'pre': pre, 'close': y_test} data_pd = pd.DataFrame(dict_data) data_pd.plot() plt.plot(data_pd[['pre', 'close']]) plt.show() #损失值的图像展示 history.loss_plot() #未来三天的预测值 
Strategy(test.iloc[-1, 4], pre[-1], err) #print(pre[-1]) #把预测数据和目标值存入文件 predata_file = 'pre_data\\' + file_name data_pd.to_csv(predata_file) #handle_data.stock_info(20) #Modle_preday('000001.SZ.csv', 3, 50, 2/3) # for root, dirs, files in os.walk('stock_data'): # print(files)
import numpy as np
import csv
from sklearn import linear_model

if __name__ == "__main__":
    from handle_data import load_data, dump_results

    # Train a logistic-regression classifier on the training CSV and
    # predict labels for the test CSV.
    x, y = load_data("../train.csv")
    # BUG FIX: the original used Python 2 `print` statements, which are
    # syntax errors under Python 3 (the rest of this file uses f-strings).
    print("Loaded training set")
    x_test, y_test = load_data("../test.csv", test=True)
    print("Loaded testing set")
    logreg = linear_model.LogisticRegression(solver='lbfgs', n_jobs=3, max_iter=200)
    logreg.fit(x, y)
    print("Classifier trained")
    y_predict = logreg.predict(x_test)
    print("Predictions ready")
    # NOTE(review): y_predict is computed but never passed to dump_results();
    # confirm whether dump_results should take the predictions as an argument.
    dump_results()