def get_feature_value():
    """Build the DATE/feature/VALUE table and write it to FEATURE_VECTOR.csv.

    For every name in FEATURE_NAME_LIST, reads <name>.csv from the market
    data directory, maps each date to the opening price (column 1), and
    appends that value (or 'N/A' when the date is missing) to the per-date
    row in the global featureValue dict, keyed by the dates of valueDict.
    """
    header = []
    for feature_name in FEATURE_NAME_LIST:
        header.append(feature_name)
        rows = CommonUtil.read_csv(
            MarketDataCrawler.MARKET_DATA_PATH + '/' + feature_name + '.csv')
        # date -> opening price; rows[0] is the CSV title row and is skipped.
        per_date = {
            CommonUtil.get_datetime_from_string_(row[0]).date(): float(row[1])
            for row in rows[1:]
        }
        for value_key in valueDict.keys():
            featureValue.setdefault(value_key, []).append(
                per_date.get(value_key, 'N/A'))
    # Title row: DATE, <feature names...>, VALUE_NAME.
    header.append(VALUE_NAME)
    header.insert(0, 'DATE')
    table = [header]
    for key in featureValue.keys():
        row = featureValue[key]
        row.append(valueDict[key])  # target value goes last
        row.insert(0, key)          # date goes first
        table.append(row)
    CommonUtil.write_csv('../files/marketdata/FEATURE_VECTOR.csv', table)
def get_value_list():
    """Load the target series into the global valueDict: date -> closing price.

    Reads <VALUE_NAME>.csv from the market data directory; column 0 holds
    the timestamp, column 2 the closing price. The title row is skipped.
    """
    rows = CommonUtil.read_csv(
        MarketDataCrawler.MARKET_DATA_PATH + '/' + VALUE_NAME + '.csv')
    for row in rows[1:]:
        sample_date = CommonUtil.get_datetime_from_string_(row[0]).date()
        valueDict[sample_date] = float(row[2])
def adjust_feature_vector():
    """Forward-fill 'N/A' cells from the previous row.

    Reads FEATURE_VECTOR.csv, replaces every 'N/A' (except in column 0,
    the date) with the value of the same column on the previous row, and
    writes the result to ADJUSTED_FEATURE_VECTOR.csv. Row 0 (the title)
    seeds the fill and is left untouched.
    """
    rows = CommonUtil.read_csv('../files/marketdata/FEATURE_VECTOR.csv')
    prev = rows[0]
    for row_index in range(1, len(rows)):
        row = rows[row_index]
        for col in range(1, len(row)):
            if row[col] == 'N/A':
                row[col] = prev[col]
        rows[row_index] = row
        prev = row
    CommonUtil.write_csv('../files/marketdata/ADJUSTED_FEATURE_VECTOR.csv',
                         rows)
def reduce_feature_vector():
    """Drop rarely-occurring feature columns from the feature-vector CSV.

    Counts, per feature column, how many data rows hold a non-'0.0' value,
    keeps only the columns reaching ``feature_count_threshold`` (plus the
    last column, the target), and writes the reduced table to
    REDUCED_FEATURE_VECTOR_<minute>.csv.

    Relies on module globals populated elsewhere: featureDict (via
    prepare_feature), featureVectorList, and the path/constant globals.
    """
    logger.info("In Reduce Feature Vector...")
    prepare_feature()
    # Number of feature columns before reduction (TARGET column excluded).
    origin_feature_num = len(featureDict.keys())
    global featureVectorList
    reduced_feature_vector_list = list()
    feature_list = list()  # indices of the columns to keep
    feature_count_threshold = 2  # minimum non-zero rows to keep a column
    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    featureVectorList = CommonUtil.read_csv(file_path)
    feature_count_dict = dict()
    feature_count_list = [0] * origin_feature_num
    is_title = True
    # Count per-column occurrences; CSV cells are strings, so the zero
    # value is the literal '0.0'. The title row is skipped.
    for feature_vector in featureVectorList:
        if is_title:
            is_title = False
        else:
            for feature_value_index in range(0, origin_feature_num):
                if feature_vector[feature_value_index] != '0.0':
                    feature_count_list[feature_value_index] += 1
    feature_index = 0
    # Select columns meeting the threshold. NOTE(review): assumes the
    # iteration order of featureDict matches the CSV column order —
    # confirm against prepare_feature.
    for key in featureDict.keys():
        feature_count = feature_count_list[feature_index]
        feature_count_dict[key] = feature_count
        if feature_count >= feature_count_threshold:
            feature_list.append(feature_index)
        feature_index += 1
    logger.info(str('Reduce Feature Vector to: ' + str(len(feature_list))))
    # Always keep the last column (the target value).
    feature_list.append(origin_feature_num)
    # Assemble the feature vectors restricted to the kept columns.
    for feature_vector in featureVectorList:
        reduced_feature_vector = list()
        for feature_value_index in range(0, origin_feature_num + 1):
            if feature_value_index in feature_list:
                try:
                    reduced_feature_vector.append(
                        feature_vector[feature_value_index])
                except IndexError:
                    # Short rows are logged and skipped cell-wise, not fatal.
                    logger.error(feature_vector)
                    logger.error(feature_value_index)
        reduced_feature_vector_list.append(reduced_feature_vector)
    file_path = REDUCED_FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, reduced_feature_vector_list)
    logger.info("Reduce Feature Vector Done!")
def feature_col_count():
    """Log, per feature, how many news rows carry a non-'0' value.

    Reads the news-feature CSV (column 0 is assumed to be a non-feature
    field such as the timestamp; feature values start at column 1 —
    TODO confirm against the writer of NEWS_FEATURE_PATH), counts the
    rows where each column differs from the literal string '0', and logs
    one "key,count" line per feature in featureDict.
    """
    logger.info("In Count Feature Appear...")
    prepare_feature()
    feature_count_dict = dict()
    global newsFeatureList
    newsFeatureList = CommonUtil.read_csv(NEWS_FEATURE_PATH)
    feature_count_list = [0] * len(featureDict.keys())
    for feature_vector in newsFeatureList:
        feature_index = 0
        # Column 0 skipped; feature_index tracks the feature position.
        for feature_value_index in range(1, len(feature_vector)):
            if feature_vector[feature_value_index] != '0':
                feature_count_list[feature_index] += 1
            feature_index += 1
    feature_index = 0
    for key in featureDict.keys():
        feature_count = feature_count_list[feature_index]
        feature_index += 1
        feature_count_dict[key] = feature_count
        # BUG FIX: feature_count is an int — `key + "," + feature_count`
        # raised TypeError (cannot concatenate str and int).
        row_item = key + "," + str(feature_count)
        logger.info(row_item)
    logger.info("Count Feature Appear Done!")
print(dataNum) for i in range(dataNum - 1): if y[i] == 0: y[i] = 0 elif y[i] < 0: y[i] = -1 else: y[i] = 1 return y if __name__ == "__main__": # 加载样本数据集 n = 120 csv_file = CommonUtil.read_csv('../files/files_train/files_' + str(n) + 'min/REDUCED_FEATURE_VECTOR_' + str(n) + '.csv') dataNum = len(csv_file) featureNum = len(csv_file[0]) - 1 print("Dimension of feature", featureNum) dataMat = np.array(csv_file) X = dataMat[1:, 0:featureNum].astype(float) y = dataMat[1:, featureNum].astype(float) y = convert2class(y, dataNum) # 转换为类别 # 神经网络对数据尺度敏感,所以最好在训练前标准化,或者归一化,或者缩放到[-1,1] scaler = StandardScaler() # 标准化转换 scaler.fit(X) # 训练标准化对象 X = scaler.transform(X) # 转换数据集 # solver='lbfgs', MLP的求解方法:L-BFGS 在小数据上表现较好,Adam 较为鲁棒,SGD在参数调整较优时会有最佳表现(分类效果与迭代次数);SGD标识随机梯度下降。 # alpha:L2的参数:MLP是可以支持正则化的,默认为L2,具体参数需要调整
print(dataNum) for i in range(dataNum - 1): if y[i] == 0: y[i] = 0 elif y[i] < 0: y[i] = -1 else: y[i] = 1 return y if __name__ == "__main__": # 加载样本数据集 n = 120 csv_file = CommonUtil.read_csv('../files/files_train/files_' + str(n) + 'min/REDUCED_FEATURE_VECTOR_' + str(n) + '.csv') dataNum = len(csv_file) featureNum = len(csv_file[0]) - 1 print("Dimension of feature", featureNum) dataMat = np.array(csv_file) X = dataMat[1:, 0:featureNum].astype(float) y = dataMat[1:, featureNum].astype(float) y = convert2class(y, dataNum) # 转换为类别 X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0) # 数据集分割 # 训练模型 model = xgb.XGBClassifier(max_depth=3, learning_rate=0.1,
# -*- coding:utf-8 -*- import numpy as np import matplotlib.pyplot as plt from sklearn import ensemble from sklearn.utils import shuffle from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error from util import CommonUtil from sklearn.model_selection import train_test_split # ############################################################################# # 导入数据 # 加载数据集 file_dir = 'C:/Users/yuzhe/Desktop/OptionAnalysis/files/' csv_file = CommonUtil.read_csv(file_dir + 'TestUSDIndex.csv') dataNum = len(csv_file) featureNum = len(csv_file[0])-2 print("特征的维度", featureNum) dataMat = np.array(csv_file) X = dataMat[1:, 1: featureNum].astype(float) y = dataMat[1:, featureNum].astype(float) ''' # 将y标签的增长率转化为增、跌、不变三种标签 for i in range(dataNum-1): if y[i] == 0: y[i] = 0 elif y[i]<0: y[i] = -1 else:y[i]= 1 ''' # 数据集分
from sklearn.grid_search import GridSearchCV from sklearn.pipeline import Pipeline from sklearn.svm import SVC from sklearn.preprocessing import StandardScaler import numpy as np from sklearn.model_selection import train_test_split from util import CommonUtil # 加载样本数据集 if __name__ == '__main__': n = 180 csv_file = CommonUtil.read_csv( 'C:/Users/yuzhe/Desktop/OptionAnalysis/files/files_' + n.__str__() + 'min/REDUCED_FEATURE_VECTOR_' + n.__str__() + '.csv') dataNum = len(csv_file) featureNum = len(csv_file[0]) - 1 print("Dimension of feature", featureNum) dataMat = np.array(csv_file) X = dataMat[1:, 0:featureNum].astype(float) y = dataMat[1:, featureNum].astype(float) print(dataNum) for i in range(dataNum - 1): if y[i] == 0: y[i] = 0 elif y[i] < 0: y[i] = -1 else: y[i] = 1 X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0) # 数据集分割 pipe_scv = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))]) param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
def generate_feature_vector():
    """Join sampled prices with decayed news influence into feature vectors.

    For every processed price sample inside [PRICE_START_TIME,
    PRICE_END_TIME), accumulates the decay-weighted news feature values
    over each minute since the previous sample, appends the scaled price
    delta as the last element, and writes the rows (with a title row) to
    FEATURE_VECTOR_<minute>.csv.

    Relies on module globals: featureDict, featureVectorList,
    newsFeatureList, processedPriceList, and the time/path constants.
    """
    logger.info("In Generate Feature Vector...")
    prepare_feature()
    # Title row: one column per feature plus the TARGET column.
    title_list = list(featureDict.keys())
    title_list.append('TARGET')
    featureVectorList.append(title_list)
    feature_size = len(featureDict.keys())
    global newsFeatureList
    newsFeatureList = CommonUtil.read_csv(NEWS_FEATURE_PATH)
    global processedPriceList
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    processedPriceList = CommonUtil.read_csv(file_path)
    # News spans 2016-06-30..2017-12-29; prices span 2016-07-01..2017-12-29.
    last_news_begin = 0
    news_feature_begin_index = last_news_begin
    # Synthetic "previous" price item seeding the first delta computation.
    pre_price_item = list()
    pre_price_item.append(PRICE_START_TIME)
    pre_price_item.append(0)
    price_start_time = CommonUtil.get_datetime_from_string(PRICE_START_TIME)
    price_end_time = CommonUtil.get_datetime_from_string(PRICE_END_TIME)
    # Re-stamp news arriving while the market is closed to
    # NEWS_INFLUENCE_MOST minutes before the market opens.
    for news_index in range(0, len(newsFeatureList)):
        news_feature = newsFeatureList[news_index]
        news_time = news_feature[0]
        # Reset the news timestamp (column 0).
        news_feature[0] = CommonUtil.\
            reset_news_time(news_time, NEWS_INFLUENCE_MOST, MARKET_OPEN_TIME,
                            MARKET_CLOSE_TIME)
        newsFeatureList[news_index] = news_feature
    for current_price_item in processedPriceList:
        current_price_time = CommonUtil.get_datetime_from_string(
            current_price_item[0])
        if price_start_time <= current_price_time < price_end_time:
            # Scaled, rounded price change since the previous sample.
            price_delta = round(
                (float(current_price_item[1]) - float(pre_price_item[1])) *
                FEATURE_VECTOR_SCALE, CURRENCY_PAIR_PRECISION)
            pre_price_time = CommonUtil.get_datetime_from_string(
                pre_price_item[0])
            logger.debug(current_price_time)
            # Total news influence between pre_price_time and
            # current_price_time; last_interval_minutes >= 1.
            last_interval_minutes = int(
                CommonUtil.get_interval_seconds(current_price_time,
                                                pre_price_time) / 60)
            influence_feature_vector = [0.0] * feature_size
            # Accumulate the news influence at every per-minute sample
            # point between the two prices.
            is_influenced_price = False
            for minute_i in range(0, last_interval_minutes):
                # Evaluation instant: minute_i + 1 minutes after
                # pre_price_time, up to and including current_price_time.
                time_i = CommonUtil.get_minute_changed(pre_price_time,
                                                       minute_i + 1)
                # Find the first news item whose influence reaches time_i.
                for news_feature_begin_index in range(last_news_begin,
                                                      len(newsFeatureList)):
                    interval_seconds = CommonUtil.get_interval_seconds(
                        time_i,
                        CommonUtil.get_datetime_from_string(
                            newsFeatureList[news_feature_begin_index][0]))
                    # News item inside the decay window still influences
                    # this instant.
                    if 0 <= interval_seconds <= \
                            NEWS_INFLUENCE_DACAY_THRESHOLD * 60:
                        # Advance the end index past every news item not
                        # later than time_i.
                        for news_feature_end_index in range(
                                news_feature_begin_index,
                                len(newsFeatureList)):
                            if CommonUtil.get_datetime_from_string(
                                    newsFeatureList[
                                        news_feature_end_index][0]) \
                                    > time_i:
                                break
                        str_begin_end = str(minute_i + 1) + ': news->' + str(
                            news_feature_begin_index) + ' : ' + str(
                            news_feature_end_index - 1)
                        logger.debug(str_begin_end)
                        # Add each in-window news item's features, scaled
                        # by its time-decayed influence score.
                        for news_feature_index in range(
                                news_feature_begin_index,
                                news_feature_end_index):
                            current_news_feature = newsFeatureList[
                                news_feature_index]
                            influence_score = decay_influence(
                                CommonUtil.get_datetime_from_string(
                                    current_news_feature[0]), time_i)
                            for value_i in range(0, feature_size):
                                influence_feature_vector[value_i] += \
                                    float(current_news_feature[value_i + 1]) \
                                    * influence_score
                        is_influenced_price = True
                        break
                    elif interval_seconds < 0:
                        # News is later than time_i; nothing earlier can
                        # match either, so stop scanning.
                        break
            # Remember where scanning stopped so the next sample resumes
            # from there instead of from the list head.
            last_news_begin = news_feature_begin_index
            if is_influenced_price:
                # Row layout: feature influences..., price delta (TARGET).
                influence_feature_vector.append(price_delta)
                featureVectorList.append(influence_feature_vector)
            pre_price_item = current_price_item
    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, featureVectorList)
    logger.info("Generate Feature Vector Done!")
def _append_average_price(sample_price_list, sample_datetime):
    """Average one sample window's prices and append the result row.

    Appends [formatted sample time, rounded average price] to the global
    processedPriceList. Caller guarantees sample_price_list is non-empty.
    """
    average_price = round(sum(sample_price_list) / len(sample_price_list),
                          CURRENCY_PAIR_PRECISION + 2)
    sample_datetime_str = CommonUtil.get_string_from_datetime(sample_datetime)
    processedPriceList.append([sample_datetime_str, average_price])


def process_original_price():
    """Down-sample the raw price stream to PRICE_SAMPLE_MINUTE averages.

    Walks originalPriceList (read from ORIGINAL_PRICE_PATH), groups prices
    into half-open windows of PRICE_SAMPLE_MINUTE minutes centered on each
    sample instant, writes each window's average to processedPriceList,
    and saves the result to PROCESSED_PRICE_PATH_<minute>.csv.
    """
    logger.info("In Process Original Price...")
    global originalPriceList
    originalPriceList = CommonUtil.read_csv(ORIGINAL_PRICE_PATH)
    sample_datetime = None
    sample_price_list = list()
    # For every raw price row: [timestamp string, price].
    for original_price in originalPriceList:
        logger.debug('price time: ' + original_price[0])
        price_datetime = CommonUtil.get_datetime_from_string(original_price[0])
        price_value = float(original_price[1])
        if sample_datetime is None:
            # First iteration: start sampling at the configured start time.
            sample_datetime = CommonUtil.get_datetime_from_string(
                PRICE_START_TIME)
        time_interval = CommonUtil.get_interval_seconds(
            price_datetime, sample_datetime)
        # Price falls before the sampling window (far earlier than the
        # sample instant): move on to the next price.
        if time_interval < -PRICE_SAMPLE_MINUTE * 60 / 2:
            continue
        # Price is past the current window: flush the finished window's
        # average (if any), then advance to the next sample instant.
        while time_interval >= PRICE_SAMPLE_MINUTE * 60 / 2:
            if len(sample_price_list) > 0:
                _append_average_price(sample_price_list, sample_datetime)
                # Start collecting for the next window.
                sample_price_list = list()
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, PRICE_SAMPLE_MINUTE, MARKET_OPEN_TIME,
                MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(
                price_datetime, sample_datetime)
            logger.debug('sample datetime:' +
                         CommonUtil.get_string_from_datetime(sample_datetime))
            # Sampling ran past the configured end time: stop advancing.
            if sample_datetime > CommonUtil.get_datetime_from_string(
                    PRICE_END_TIME):
                break
        # Price belongs to the current window, half-open [start, end).
        sample_price_list.append(price_value)
    # Flush the final window's prices, if any remain.
    if len(sample_price_list) > 0:
        _append_average_price(sample_price_list, sample_datetime)
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, processedPriceList)
    logger.info("Process Original Price Done!")
def read_segmented_news():
    """Load the segmented-news CSV into the global newsSegmentationList."""
    global newsSegmentationList
    logger.info("In Read Segmented News...")
    newsSegmentationList = CommonUtil.read_csv(SEGMENTED_NEWS_PATH)
    logger.info("Read Segmented News Done!")