Beispiel #1
0
def feature_about():
    """Scan the raw news Excel sheet for feature keywords and, for every
    occurrence, collect a window of up to 6 consecutive words starting at
    the matching word; write all windows to FEATURE_ABOUT_PATH as CSV.

    Relies on module-level names: NewsUtil, CommonUtil, Segmentor,
    SentenceSplitter, cws_model_path, CFETSFX_LEXICON_PATH,
    RAW_NEWS_DEMO_PATH, FEATURE_ABOUT_PATH, logger.
    """
    # Fetch the feature keyword mapping (values are the keywords searched for).
    feature_dict = NewsUtil.get_feature()
    # Collect the nearest 5 following words for each feature found in the news.
    logger.info("In Prepare Raw News...")
    raw_news_data = CommonUtil.read_excel(RAW_NEWS_DEMO_PATH)
    raw_news_table = raw_news_data.sheet_by_index(0)
    raw_news_rows = raw_news_table.nrows
    segmentor = Segmentor()  # initialize the LTP segmenter instance
    segmentor.load_with_lexicon(cws_model_path,
                                CFETSFX_LEXICON_PATH)  # load model; 2nd arg is the external lexicon file path
    feature_about_list = list()
    for rowN in range(0, raw_news_rows):
        # Column 2 presumably holds the news body text — TODO confirm sheet layout.
        news_content = raw_news_table.cell_value(rowN, 2)
        sentences = SentenceSplitter.split(news_content)
        for sentence in sentences:
            print(sentence)
            # word segmentation
            words = segmentor.segment(sentence)
            print(list(words))
            for word_index in range(0, len(words)):
                word = words[word_index]
                for feature_word in feature_dict.values():
                    if feature_word in word:
                        about_list = list()
                        count = 0
                        # Collect up to 6 words starting at the match.
                        # NOTE(review): incrementing word_index here does NOT
                        # advance the enclosing range() loop, so overlapping
                        # windows are emitted when nearby words also match —
                        # confirm this is the intended behavior.
                        while word_index < len(words) and count < 6:
                            about_list.append(words[word_index])
                            count += 1
                            word_index += 1
                        feature_about_list.append(about_list)
                        print(about_list)
                        break
    segmentor.release()
    CommonUtil.write_csv(FEATURE_ABOUT_PATH, feature_about_list)
def link_vec(NEWS_VEC, PRICE_VEC, p):
    """Join price vectors with news vectors on their first element (the key)
    and write the combined rows to the CSV at path *p*.

    For each row in PRICE_VEC, look for a NEWS_VEC row with the same key
    (index 0). On a match, the output row is a copy of the news row with the
    price row's elements 1 and 2 appended. Without a match, a zero-padded
    news part of the same width is created so every output row lines up.

    :param NEWS_VEC: list of rows; row[0] is the join key.
    :param PRICE_VEC: list of rows; row[0] is the join key, row[1] and
        row[2] are the price fields appended to every output row.
    :param p: output CSV path passed to CommonUtil.write_csv.
    """
    ALL_VEC = []
    # Width of a news row, used to zero-pad unmatched prices.  Computed up
    # front — the original read the leaked inner-loop variable after the
    # loop, which raised NameError whenever NEWS_VEC was empty.
    news_width = len(NEWS_VEC[0]) if NEWS_VEC else 1
    for each_price_vec in PRICE_VEC:
        matched = False
        for each_news_vec in NEWS_VEC:
            if each_news_vec[0] == each_price_vec[0]:
                print("matched!")
                # Copy so the source news row is not mutated.
                vec_of_all = each_news_vec[::]
                vec_of_all.append(each_price_vec[1])
                vec_of_all.append(each_price_vec[2])
                ALL_VEC.append(vec_of_all)
                matched = True
                break
        if not matched:
            print("not matched! Created!")
            # Key, zeros for the missing news features, then the price fields.
            list_temp = [each_price_vec[0]]
            list_temp += [0] * (news_width - 1)
            list_temp.append(each_price_vec[1])
            list_temp.append(each_price_vec[2])
            ALL_VEC.append(list_temp)

    CommonUtil.write_csv(p, ALL_VEC)
Beispiel #3
0
def get_feature_value():
    """Read one market-data CSV per feature name, map each date to the
    feature's opening price, collect per-date feature values into the
    module-level featureValue dict (filling missing dates with 'N/A'),
    and write the assembled feature-vector table to
    '../files/marketdata/FEATURE_VECTOR.csv'.
    """
    feature_vector_list = []
    feature_name_list = []
    for feature_name in FEATURE_NAME_LIST:
        feature_name_list.append(feature_name)
        csv_path = (MarketDataCrawler.MARKET_DATA_PATH + '/' +
                    feature_name + '.csv')
        rows = CommonUtil.read_csv(csv_path)
        # date -> opening price for this feature (skip the header row).
        feature_dict = {}
        for row in rows[1:]:
            row_date = CommonUtil.get_datetime_from_string_(row[0]).date()
            # column 1 is the opening price
            feature_dict[row_date] = float(row[1])
        for value_key in valueDict.keys():
            # 'N/A' marks dates with no data for this feature.
            feature_value = feature_dict.get(value_key, 'N/A')
            feature_items = featureValue.get(value_key, [])
            feature_items.append(feature_value)
            featureValue[value_key] = feature_items
    feature_name_list.append(VALUE_NAME)
    feature_name_list.insert(0, 'DATE')
    feature_vector_list.append(feature_name_list)
    for key in featureValue.keys():
        row = featureValue[key]
        row.append(valueDict[key])
        row.insert(0, key)
        feature_vector_list.append(row)
    CommonUtil.write_csv('../files/marketdata/FEATURE_VECTOR.csv',
                         feature_vector_list)
Beispiel #4
0
def news_sentiment():
    """Score the sentiment of each mapped news item via the Baidu NLP API
    and build per-news feature vectors.

    Appends to the module-level newsItemList and newsFeatureList, then
    writes both to CSV (NEWS_ITEM_PATH, NEWS_FEATURE_PATH).
    """
    logger.info("In News Sentiment...")
    count = 1
    for mapped_news in newsMappedList:
        feature_vector_item = list()
        news_index = mapped_news[0]
        news_time = mapped_news[1]
        feature_vector_item.append(news_index)
        feature_vector_item.append(news_time)
        feature_vector = list()
        keyword_sentiment_dict = dict()
        # list indices start at 0, so subtract 1
        news_mapped = newsList[news_index - 1]
        for mapped_news_index in range(2, len(mapped_news)):
            keyword = mapped_news[mapped_news_index]
            # NOTE(review): the full news body (news_mapped[2]) is scored,
            # not the keyword, so every keyword of one news item receives
            # the same sentiment value — confirm this is intended.
            sentiment_result = BaiduNLPProcessor.sentiment_classify(
                news_mapped[2])
            keyword_sentiment_dict[keyword] = sentiment_result
        keys = featureDict.keys()
        for key in keys:
            if featureDict[key] in keyword_sentiment_dict.keys():
                feature_vector.append(keyword_sentiment_dict[featureDict[key]])
            else:
                feature_vector.append(0)
        feature_vector_item.append(feature_vector)
        newsItemList.append(feature_vector_item)
        # NOTE(review): feature_vector is the same object just appended into
        # feature_vector_item, so this insert also mutates the nested list
        # stored in newsItemList — verify that is the intended output.
        feature_vector.insert(0, news_time)
        newsFeatureList.append(feature_vector)
        logger.info(count)
        count += 1
    CommonUtil.write_csv(NEWS_ITEM_PATH, newsItemList)
    CommonUtil.write_csv(NEWS_FEATURE_PATH, newsFeatureList)
    logger.info("News Sentiment Done!")
Beispiel #5
0
def news_segment():
    """Run the Baidu NLP lexer over every news item and persist the
    segmented rows (id, timestamp string, tokens...) to
    SEGMENTED_NEWS_PATH via the module-level newsSegmentationList.
    """
    logger.info("In Segment News...")
    for count, news_item in enumerate(newsList, start=1):
        tokens = BaiduNLPProcessor.lexer(news_item[2])
        tokens.insert(0, news_item[0])
        tokens.insert(1, CommonUtil.get_string_from_datetime(news_item[1]))
        newsSegmentationList.append(tokens)
        # progress indicator, one line per processed news item
        logger.info(count)
    CommonUtil.write_csv(SEGMENTED_NEWS_PATH, newsSegmentationList)
    logger.info("Segment News...Done!")
Beispiel #6
0
def adjust_feature_vector():
    """Forward-fill 'N/A' cells in the feature-vector CSV with the value
    from the previous row (column 0 — the date — is left untouched), then
    write the result to ADJUSTED_FEATURE_VECTOR.csv.
    """
    rows = CommonUtil.read_csv('../files/marketdata/FEATURE_VECTOR.csv')
    previous = rows[0]
    for row_idx in range(1, len(rows)):
        row = rows[row_idx]
        for col in range(1, len(row)):
            if row[col] == 'N/A':
                row[col] = previous[col]
        rows[row_idx] = row
        previous = row
    CommonUtil.write_csv('../files/marketdata/ADJUSTED_FEATURE_VECTOR.csv',
                         rows)
Beispiel #7
0
def reduce_feature_vector():
    """Drop feature columns that are non-zero in fewer than
    feature_count_threshold rows of the feature-vector CSV, and write the
    reduced vectors to REDUCED_FEATURE_VECTOR_PATH.

    Reads/overwrites the module-level featureVectorList; depends on
    prepare_feature() having populated featureDict, and assumes the CSV
    column order matches the iteration order of featureDict.keys().
    """
    logger.info("In Reduce Feature Vector...")
    prepare_feature()
    origin_feature_num = len(featureDict.keys())
    global featureVectorList
    reduced_feature_vector_list = list()
    # indices of the feature columns that survive the reduction
    feature_list = list()
    feature_count_threshold = 2
    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    featureVectorList = CommonUtil.read_csv(file_path)
    feature_count_dict = dict()
    # per-column count of rows whose value differs from the string '0.0'
    feature_count_list = [0] * origin_feature_num
    is_title = True
    for feature_vector in featureVectorList:
        if is_title:
            # skip the header row when counting
            is_title = False
        else:
            for feature_value_index in range(0, origin_feature_num):
                if feature_vector[feature_value_index] != '0.0':
                    feature_count_list[feature_value_index] += 1
    feature_index = 0
    # NOTE(review): this pairs featureDict.keys() positionally with the CSV
    # columns — correct only if prepare_feature() built the columns in the
    # same order; confirm.
    for key in featureDict.keys():
        feature_count = feature_count_list[feature_index]
        feature_count_dict[key] = feature_count
        if feature_count >= feature_count_threshold:
            feature_list.append(feature_index)
        feature_index += 1
    logger.info(str('Reduce Feature Vector to: ' + str(len(feature_list))))
    # always keep the last column (the TARGET value)
    feature_list.append(origin_feature_num)
    # assemble the feature vectors whose counts exceeded the threshold
    for feature_vector in featureVectorList:
        reduced_feature_vector = list()
        for feature_value_index in range(0, origin_feature_num + 1):
            if feature_value_index in feature_list:
                try:
                    reduced_feature_vector.append(
                        feature_vector[feature_value_index])
                except IndexError:
                    # row shorter than expected — log and keep going
                    logger.error(feature_vector)
                    logger.error(feature_value_index)
        reduced_feature_vector_list.append(reduced_feature_vector)
    file_path = REDUCED_FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, reduced_feature_vector_list)
    logger.info("Reduce Feature Vector Done!")
Beispiel #8
0
# Report the fitted linear model, export predictions, and plot them.
# The coefficients
print('回归系数: \n', clf.coef_)
print('截距: \n', clf.intercept_)

print("Mean Absolute error:", mean_absolute_error(y_test, y_pred))
# The mean squared error
print("Mean squared error:", mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
# BUG FIX: the original passed the score as a second print() argument
# ("print('模型得分: %.2f', clf.score(...))"), so the literal format string
# was printed unformatted; use %-formatting as on the R2 line below.
print('模型得分: %.2f' % clf.score(X_test, y_test))
print('R2 score: %.2f' % r2_score(y_test, y_pred))

# Reshape predictions and targets into column vectors for CSV export.
y_pred = clf.predict(X_test)
y_pred = y_pred.reshape(len(y_pred), 1)
y_test = y_test.reshape(len(y_test), 1)

# write_csv('C:/Users/yuzhe/Desktop/OptionAnalysis/files/result.csv', X_test)
CommonUtil.write_csv('C:/Users/yuzhe/Desktop/OptionAnalysis/files/y_test.csv',
                     y_test)
CommonUtil.write_csv('C:/Users/yuzhe/Desktop/OptionAnalysis/files/y_pred.csv',
                     y_pred)

###############################
# Plot predictions against a simple 0..len-1 sample index.
X_test = list(range(len(y_test)))

plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=2)
plt.xticks(())
plt.yticks(np.linspace(6.5, 7, 10))
plt.show()
Beispiel #9
0
def generate_feature_vector():
    """Combine news sentiment features with sampled price deltas into
    training vectors and write them to FEATURE_VECTOR_PATH.

    For every sampled price change it sums the decayed influence of all
    news items within NEWS_INFLUENCE_DACAY_THRESHOLD minutes of each
    minute between the two price samples, and appends the price delta as
    the TARGET column. Uses module-level state populated elsewhere
    (featureDict, newsFeatureList, processedPriceList, featureVectorList).
    """
    logger.info("In Generate Feature Vector...")
    prepare_feature()
    # build the header row
    title_list = list(featureDict.keys())
    title_list.append('TARGET')
    featureVectorList.append(title_list)
    feature_size = len(featureDict.keys())
    global newsFeatureList
    newsFeatureList = CommonUtil.read_csv(NEWS_FEATURE_PATH)
    global processedPriceList
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    processedPriceList = CommonUtil.read_csv(file_path)
    # news run 20160630-20171229, prices run 20160701-20171229
    last_news_begin = 0
    news_feature_begin_index = last_news_begin
    # synthetic "previous price" so the first real sample has a baseline
    pre_price_item = list()
    pre_price_item.append(PRICE_START_TIME)
    pre_price_item.append(0)
    price_start_time = CommonUtil.get_datetime_from_string(PRICE_START_TIME)
    price_end_time = CommonUtil.get_datetime_from_string(PRICE_END_TIME)
    # shift news that arrived while the market was closed to
    # NEWS_INFLUENCE_MOST minutes before the next open
    for news_index in range(0, len(newsFeatureList)):
        news_feature = newsFeatureList[news_index]
        news_time = news_feature[0]
        # reset the news timestamp
        news_feature[0] = CommonUtil.\
            reset_news_time(news_time, NEWS_INFLUENCE_MOST, MARKET_OPEN_TIME, MARKET_CLOSE_TIME)
        newsFeatureList[news_index] = news_feature
    for current_price_item in processedPriceList:
        current_price_time = CommonUtil.get_datetime_from_string(
            current_price_item[0])
        if price_start_time <= current_price_time < price_end_time:
            # scaled price change between consecutive samples (the TARGET)
            price_delta = round(
                (float(current_price_item[1]) - float(pre_price_item[1])) *
                FEATURE_VECTOR_SCALE, CURRENCY_PAIR_PRECISION)
            pre_price_time = CommonUtil.get_datetime_from_string(
                pre_price_item[0])
            logger.debug(current_price_time)
            # total news influence between pre_price_time and current_price_time
            # last_interval_minutes >= 1
            last_interval_minutes = int(
                CommonUtil.get_interval_seconds(current_price_time,
                                                pre_price_time) / 60)
            influence_feature_vector = [0.0] * feature_size
            # accumulate news influence at every minute between the two samples
            is_influenced_price = False
            for minute_i in range(0, last_interval_minutes):
                # evaluation minute: after pre_price_time, up to and
                # including current_price_time
                time_i = CommonUtil.get_minute_changed(pre_price_time,
                                                       minute_i + 1)
                # find the news items influencing this minute
                for news_feature_begin_index in range(last_news_begin,
                                                      len(newsFeatureList)):
                    interval_seconds = CommonUtil.get_interval_seconds(
                        time_i,
                        CommonUtil.get_datetime_from_string(
                            newsFeatureList[news_feature_begin_index][0]))
                    # a news item falls within the influence window
                    if 0 <= interval_seconds <= NEWS_INFLUENCE_DACAY_THRESHOLD * 60:
                        # NOTE(review): if this inner range is empty (begin
                        # index == list length), news_feature_end_index is
                        # never bound and the lines below would raise
                        # NameError — confirm the data makes that impossible.
                        for news_feature_end_index in range(
                                news_feature_begin_index,
                                len(newsFeatureList)):
                            if CommonUtil.get_datetime_from_string(newsFeatureList[news_feature_end_index][0]) \
                                    > time_i:
                                break
                        str_begin_end = str(minute_i + 1) + ': news->' + str(
                            news_feature_begin_index) + ' : ' + str(
                                news_feature_end_index - 1)
                        logger.debug(str_begin_end)
                        for news_feature_index in range(
                                news_feature_begin_index,
                                news_feature_end_index):
                            current_news_feature = newsFeatureList[
                                news_feature_index]
                            # decay factor based on news age at time_i
                            influence_score = decay_influence(
                                CommonUtil.get_datetime_from_string(
                                    current_news_feature[0]), time_i)
                            for value_i in range(0, feature_size):
                                influence_feature_vector[value_i] += float(current_news_feature[value_i + 1]) \
                                                                     * influence_score
                        is_influenced_price = True
                        break
                    elif interval_seconds < 0:
                        # news is in the future relative to time_i; stop scanning
                        break
                # resume the next scan where this one left off
                last_news_begin = news_feature_begin_index
            if is_influenced_price:
                influence_feature_vector.append(price_delta)
                featureVectorList.append(influence_feature_vector)
        pre_price_item = current_price_item
    file_path = FEATURE_VECTOR_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, featureVectorList)
    logger.info("Generate Feature Vector Done!")
Beispiel #10
0
def _flush_sample_prices(sample_price_list, sample_datetime):
    """Append the average of sample_price_list, stamped with
    sample_datetime, to the module-level processedPriceList.
    No-op when the list is empty.
    """
    if len(sample_price_list) > 0:
        average_price = round(sum(sample_price_list) / len(sample_price_list),
                              CURRENCY_PAIR_PRECISION + 2)
        sample_datetime_str = CommonUtil.get_string_from_datetime(
            sample_datetime)
        # record [sample time string, averaged price]
        processedPriceList.append([sample_datetime_str, average_price])


def process_original_price():
    """Down-sample the raw price series into PRICE_SAMPLE_MINUTE buckets.

    Reads ORIGINAL_PRICE_PATH, averages all prices falling into each
    sample window (window is [t - half, t + half) around the sample
    time), appends the averages to the module-level processedPriceList,
    and writes them to PROCESSED_PRICE_PATH_<minutes>.csv.
    """
    logger.info("In Process Original Price...")
    global originalPriceList
    originalPriceList = CommonUtil.read_csv(ORIGINAL_PRICE_PATH)
    sample_datetime = None
    sample_price_list = list()
    # walk every raw price row in order
    for original_price in originalPriceList:
        logger.debug('price time: ' + original_price[0])
        price_datetime = CommonUtil.get_datetime_from_string(original_price[0])
        price_value = float(original_price[1])
        if sample_datetime is None:
            # first row: start sampling from the configured start time
            sample_datetime = CommonUtil.get_datetime_from_string(
                PRICE_START_TIME)
        time_interval = CommonUtil.get_interval_seconds(
            price_datetime, sample_datetime)
        # price is far earlier than the current sample point: skip it
        if time_interval < -PRICE_SAMPLE_MINUTE * 60 / 2:
            continue
        # price is past the current window: flush the window's average and
        # advance sample points until the window covers this price
        while time_interval >= PRICE_SAMPLE_MINUTE * 60 / 2:
            _flush_sample_prices(sample_price_list, sample_datetime)
            # reset the window's price list
            sample_price_list = list()
            # move to the next sample point (respecting market hours)
            sample_datetime = CommonUtil.get_next_sample_time(
                sample_datetime, PRICE_SAMPLE_MINUTE, MARKET_OPEN_TIME,
                MARKET_CLOSE_TIME)
            time_interval = CommonUtil.get_interval_seconds(
                price_datetime, sample_datetime)
        logger.debug('sample datetime:' +
                     CommonUtil.get_string_from_datetime(sample_datetime))
        # sampling has run past the configured end time: stop
        if sample_datetime > CommonUtil.get_datetime_from_string(
                PRICE_END_TIME):
            break
        # price belongs to the current window [t - half, t + half)
        sample_price_list.append(price_value)
    # flush whatever remains in the final sample window
    _flush_sample_prices(sample_price_list, sample_datetime)
    file_path = PROCESSED_PRICE_PATH + '_' + str(
        PRICE_SAMPLE_MINUTE) + CSV_FILE_SUFFIX
    CommonUtil.write_csv(file_path, processedPriceList)
    logger.info("Process Original Price Done!")