Beispiel #1
0
    def regressor_dev_test(self, dev_set, dev_value_set, dev_date_set, dev_stock_id_set, save_clsfy_path="mlp_trade_regressor", is_cv=False, include_top_list = None):
        """Score the in-memory regressor (``self.mlp_regressor``) on a dev set.

        Args:
            dev_set: Feature rows handed to ``self.mlp_regressor.predict``.
            dev_value_set: Target values aligned with ``dev_set``.
            dev_date_set: Per-sample dates, forwarded to ``get_avg_price_change``.
            dev_stock_id_set: Per-sample stock ids, forwarded likewise.
            save_clsfy_path: Not read by this method.
            is_cv: When falsy, the metrics are printed to stdout.
            include_top_list: Forwarded to ``get_avg_price_change``; a falsy
                value is replaced by ``[1]``.

        Returns:
            ``(rmse, first entry of the average-price-change result,
            share of samples whose prediction and target have the same sign)``.
        """
        top_n_list = include_top_list if include_top_list else [1]

        predicted = np.array(self.mlp_regressor.predict(dev_set))
        observed = np.array(dev_value_set)
        rmse = calculate_rmse(observed, predicted)

        # Only the first of the three returned sequences is used here.
        price_change_result, _, _ = get_avg_price_change(
            predicted, observed, dev_date_set, dev_stock_id_set,
            include_top_list=top_n_list)

        # A product >= 0 means both values share a sign (or one of them is 0).
        same_sign_count = sum(
            1 for guess, truth in zip(predicted, observed) if guess * truth >= 0)
        same_sign_ratio = same_sign_count / len(predicted)

        if not is_cv:
            separator = "----------------------------------------------------------------------------------------"
            print(separator)
            print("actual_value_list, ", observed)
            print("pred_value_list, ", predicted)
            print("polarity: {}".format(same_sign_ratio))
            print("mrse: {}".format(rmse))
            print("avg_price_change: {}".format(price_change_result))
            print(separator)

        return rmse, price_change_result[0], same_sign_ratio
Beispiel #2
0
    def regressor_dev(self, save_clsfy_path="mlp_trade_regressor", is_cv=False, include_top_list = None):
        """Evaluate a pickled regressor on the dev set stored on this instance.

        The regressor is unpickled from ``save_clsfy_path`` and run on
        ``self.dev_set``. Predictions and ``self.dev_value_set`` are grouped by
        the matching entry of ``self.dev_date_set``; F1, accuracy (on the
        'pos'/'neg' sign labels) and RMSE are computed per date group and then
        averaged over the groups.

        Args:
            save_clsfy_path: Path of the pickled regressor.
            is_cv: When falsy, the metrics are printed to stdout.
            include_top_list: Forwarded to ``get_avg_price_change``; a falsy
                value is replaced by ``[1]``.

        Returns:
            ``(week_average_rmse, avg_price_change_tuple,
            week_average_accuracy, week_average_f1)``.
        """
        if not include_top_list:
            include_top_list = [1]

        # NOTE(security): pickle.load can execute arbitrary code; only point
        # save_clsfy_path at files this project wrote itself.
        # The context manager closes the handle that a bare open() would leak.
        with open(save_clsfy_path, "rb") as regressor_file:
            mlp_regressor = pickle.load(regressor_file)

        pred_value_list = np.array(mlp_regressor.predict(self.dev_set))
        actual_value_list = np.array(self.dev_value_set)
        avg_price_change_tuple, _var_tuple, _std_tuple = get_avg_price_change(
            pred_value_list, actual_value_list,
            self.dev_date_set, self.dev_stock_id_set,
            include_top_list=include_top_list)

        # Group sign labels and raw values by date.
        pred_label_dict_by_week = collections.defaultdict(list)
        golden_label_dict_by_week = collections.defaultdict(list)
        pred_value_dict_by_week = collections.defaultdict(list)
        golden_value_dict_by_week = collections.defaultdict(list)

        for i, predict_value in enumerate(pred_value_list):
            # Index (rather than zip) so a too-short date/value list still
            # fails loudly instead of silently dropping samples.
            date = self.dev_date_set[i]
            golden_value = actual_value_list[i]
            # classification view
            pred_label_dict_by_week[date].append('pos' if predict_value >= 0 else 'neg')
            golden_label_dict_by_week[date].append('pos' if golden_value >= 0 else 'neg')
            # regression view
            pred_value_dict_by_week[date].append(predict_value)
            golden_value_dict_by_week[date].append(golden_value)

        # Per-date metrics.
        week_f1_list = []
        week_accuracy_list = []
        week_rmse_list = []
        for date, week_pred_labels in pred_label_dict_by_week.items():
            week_golden_labels = golden_label_dict_by_week[date]

            _, average_f1 = compute_average_f1(week_pred_labels, week_golden_labels)
            week_f1_list.append(average_f1)

            correct = sum(
                1 for pred_label, golden_label
                in zip(week_pred_labels, week_golden_labels)
                if pred_label == golden_label)
            week_accuracy_list.append(correct / len(week_golden_labels))

            week_rmse_list.append(
                calculate_rmse(golden_value_dict_by_week[date],
                               pred_value_dict_by_week[date]))

        week_average_f1 = np.average(week_f1_list)
        week_average_accuracy = np.average(week_accuracy_list)
        week_average_rmse = np.average(week_rmse_list)

        if not is_cv:
            print("----------------------------------------------------------------------------------------")
            print("actual_value_list, ", actual_value_list)
            print("pred_value_list, ", pred_value_list)
            print("week_average_accuracy: {}".format(week_average_accuracy))
            print("week_average_f1: {}".format(week_average_f1))
            print("week_average_rmse: {}".format(week_average_rmse))
            print("week_average_price_change: {}".format(avg_price_change_tuple))
            print("----------------------------------------------------------------------------------------")

        return week_average_rmse, avg_price_change_tuple, week_average_accuracy, week_average_f1
Beispiel #3
0
    def baseline_reg_dev(self, target_folder):
        """Evaluate a baseline that predicts from the 'percent_change_price' feature.

        The header line of the first file listed in ``target_folder`` is used to
        locate the 'percent_change_price' column; each sample in
        ``self.dev_set`` is then "predicted" as that feature times 0.01.
        F1, accuracy (on the 'pos'/'neg' sign labels) and RMSE are computed per
        date group of ``self.dev_date_set`` and averaged over the groups.

        Args:
            target_folder: Folder whose first listed file supplies the header.
                NOTE(review): ``os.listdir`` order is arbitrary, so this
                presumably relies on every file sharing one header - confirm.

        Returns:
            ``(week_average_rmse, avg_price_change_tuple,
            week_average_accuracy, week_average_f1,
            date_actual_avg_priceChange_list)``.
        """
        file1 = os.listdir(target_folder)[0]
        file1_path = os.path.join(target_folder, file1)
        with open(file1_path, 'r') as f:
            # Only the header is needed, so read one line instead of the file.
            # Every second comma-separated field is kept as a feature name.
            feature_list = f.readline().strip().split(',')[::2]

        key_index = feature_list.index('percent_change_price')
        print ("key_index: ", key_index)

        pred_value_list = [
            float(dev_sample[key_index]) * 0.01 for dev_sample in self.dev_set
        ]
        actual_value_list = np.array(self.dev_value_set)

        date_list = self.dev_date_set
        stock_id_list = self.dev_stock_id_set

        avg_price_change_tuple, _var_tuple, _std_tuple = get_avg_price_change(
            pred_value_list, actual_value_list, date_list, stock_id_list,
            include_top_list=[1])

        date_actual_avg_priceChange_list = get_chosen_stock_return(
            pred_value_list, actual_value_list, date_list, stock_id_list,
            include_top_list=None)

        # Group sign labels and raw values by date.
        pred_label_dict_by_week = collections.defaultdict(list)
        golden_label_dict_by_week = collections.defaultdict(list)
        pred_value_dict_by_week = collections.defaultdict(list)
        golden_value_dict_by_week = collections.defaultdict(list)

        for i, predict_value in enumerate(pred_value_list):
            # Index (rather than zip) so a too-short date/value list still
            # fails loudly instead of silently dropping samples.
            date = date_list[i]
            golden_value = actual_value_list[i]
            # classification view
            pred_label_dict_by_week[date].append('pos' if predict_value >= 0 else 'neg')
            golden_label_dict_by_week[date].append('pos' if golden_value >= 0 else 'neg')
            # regression view
            pred_value_dict_by_week[date].append(predict_value)
            golden_value_dict_by_week[date].append(golden_value)

        # Per-date metrics.
        week_f1_list = []
        week_accuracy_list = []
        week_rmse_list = []
        for date, week_pred_labels in pred_label_dict_by_week.items():
            week_golden_labels = golden_label_dict_by_week[date]

            _, average_f1 = compute_average_f1(week_pred_labels, week_golden_labels)
            week_f1_list.append(average_f1)

            correct = sum(
                1 for pred_label, golden_label
                in zip(week_pred_labels, week_golden_labels)
                if pred_label == golden_label)
            week_accuracy_list.append(correct / len(week_golden_labels))

            week_rmse_list.append(
                calculate_rmse(golden_value_dict_by_week[date],
                               pred_value_dict_by_week[date]))

        week_average_f1 = np.average(week_f1_list)
        week_average_accuracy = np.average(week_accuracy_list)
        week_average_rmse = np.average(week_rmse_list)

        return week_average_rmse, avg_price_change_tuple, week_average_accuracy, week_average_f1, \
               date_actual_avg_priceChange_list
Beispiel #4
0
        # Train the model, then collect its dev predictions together with the
        # matching actual values, dates and stock ids.
        rnn1.train_rnn()
        pred_value_list, actual_value_list, date_list, stock_id_list = rnn1.reg_dev_for_moving_window_test(
        )
        #
        #print ("date_list: ", date_list)
        # Turn the numeric values into sign labels: >= 0 is 'pos', otherwise 'neg'.
        pred_label_list_temp = [
            'pos' if x >= 0 else 'neg' for x in pred_value_list
        ]
        actual_label_list_temp = [
            'pos' if x >= 0 else 'neg' for x in actual_value_list
        ]
        # Append this run's labels and dates to accumulators created outside
        # this fragment.
        pred_label_list.extend(pred_label_list_temp)
        actual_label_list.extend(actual_label_list_temp)
        data_list_for_classification.extend(date_list)
        #
        # Regression metrics for this run.
        rmse = calculate_rmse(actual_value_list, pred_value_list)
        avg_price_change_tuple, var_tuple, std_tuple = get_avg_price_change(
            pred_value_list, actual_value_list, date_list, stock_id_list)

        chosen_stock_return_list_temp = get_chosen_stock_return(
            pred_value_list, actual_value_list, date_list, stock_id_list)
        # Only the first entry of each result sequence is kept.
        avg_price_change_1 = avg_price_change_tuple[
            0]  # Strategy: choose the top 1 stock each week
        var_1 = var_tuple[0]
        std_1 = std_tuple[0]
        avg_price_change_list.append(avg_price_change_1)
        print("chosen_stock_return_list_temp: ", chosen_stock_return_list_temp)
        var_list.append(var_1)
        std_list.append(std_1)
        chosen_stock_return_list.extend(chosen_stock_return_list_temp)
        rmse_list.append(rmse)
Beispiel #5
0
def _read_actual_price_changes(data_folder):
    """Parse the actual price change per (date, stock) from file names.

    Each file name in *data_folder* must contain ``<date>_<stock>_`` and a
    ``#<price change>#`` field; the files themselves are never opened.

    Returns a ``(dates, stocks, actual)`` triple where ``actual[date][stock]``
    is the parsed float and pairs that were never seen read as 0.
    """
    dates = set()
    stocks = set()
    actual = collections.defaultdict(lambda: collections.defaultdict(int))
    for file_name in os.listdir(data_folder):
        date_str = re.findall(r'([0-9\-]+)_', file_name)[0]
        stock_name = re.findall(r'[0-9\-]+_([A-Za-z0-9]+)_', file_name)[0]
        price_change = float(re.findall(r'#([0-9\.\-]+)#', file_name)[0])
        actual[date_str][stock_name] = price_change
        dates.add(date_str)
        stocks.add(stock_name)
    return dates, stocks, actual


def stock_prediction_baseline_reg(train_data_folder_name,
                                  test_data_folder_name,
                                  is_random=False,
                                  is_highest_profit=False,
                                  random_seed=None,
                                  data_set='a_share'):
    """Run a file-name based baseline and return one price change per test date.

    Actual price changes are parsed from the file names under
    ``<parent_folder>/data/<data_set>/<train|test folder name>``. The strategy
    is chosen by the flags (``is_random`` wins over ``is_highest_profit``):

    * ``is_random``: pick one test stock at random per date and record its
      actual price change.
    * ``is_highest_profit``: record the highest actual price change per date.
    * otherwise: predict each stock's change as its actual change on the
      preceding date, pick the stock with the highest prediction and record
      its actual change; RMSE per date is printed along the way.

    Args:
        train_data_folder_name: Sub-folder holding the training files.
        test_data_folder_name: Sub-folder holding the test files.
        is_random: Use the random-pick strategy.
        is_highest_profit: Use the best-possible-pick strategy.
        random_seed: If not None, the RNG is reseeded with
            ``i + random_seed`` before the i-th date's random pick.
        data_set: Data-set folder name under ``data``.

    Returns:
        List of actual price changes, one per test date in sorted order.
    """
    print("Baseline for regression!")

    # (1.) build classifer
    # NOTE(review): the instance is never used below; the call is kept only in
    # case construction has side effects - confirm and remove.
    mlp_classifier1 = MlpTradeRegressor()

    # (2.) data folder
    train_data_folder = os.path.join(parent_folder, 'data', data_set,
                                     train_data_folder_name)
    test_data_folder = os.path.join(parent_folder, 'data', data_set,
                                    test_data_folder_name)

    # read train data set
    train_date_set, train_stock_set, train_actual = _read_actual_price_changes(
        train_data_folder)
    train_data_list = sorted(train_date_set)
    print("train_data_list: ", train_data_list)
    print("train_stock_set: ", train_stock_set)

    # read test data set
    NUMBER_OF_PREVIOUS_WEEK = 2
    test_date_set, test_stock_set, actual = _read_actual_price_changes(
        test_data_folder)
    sorted_test_dates = sorted(test_date_set)
    # A sorted list gives a reproducible iteration / tie-break order and is a
    # valid population for random.sample (sets are rejected on Python 3.11+).
    stock_list = sorted(test_stock_set)

    # baseline algorithm: prepend the last training dates so the first test
    # dates have a "previous" date to look back on.
    train_last_date_list = train_data_list[-NUMBER_OF_PREVIOUS_WEEK:]
    complete_test_date_list = train_last_date_list + sorted_test_dates
    for last_date in train_last_date_list:
        for stock in stock_list:
            actual[last_date][stock] = train_actual[last_date][stock]

    # First position of every date, equivalent to list.index() but O(1).
    first_index = {}
    for position, date in enumerate(complete_test_date_list):
        first_index.setdefault(date, position)

    predicted = collections.defaultdict(dict)
    for test_date in sorted_test_dates:
        previous_index = first_index[test_date] - 1
        for stock in stock_list:
            if previous_index < 0:
                # No earlier date exists. Index -1 would wrap around to the
                # LAST date and leak a future value, so predict 0 instead.
                predicted[test_date][stock] = 0
            else:
                previous_date = complete_test_date_list[previous_index]
                predicted[test_date][stock] = actual[previous_date][stock]

    print("complete_test_date_list: ", complete_test_date_list)
    print("test_data_set: ", test_date_set)
    print("test_stock_set: ", test_stock_set)

    # get the predicted value
    predict_pc_list = []
    if is_random:
        for i, test_date in enumerate(sorted_test_dates):
            # "is not None" so that a seed of 0 is honoured as well.
            if random_seed is not None:
                random.seed(i + random_seed)
            best_stock = random.sample(stock_list, 1)[0]
            predict_pc_list.append(actual[test_date][best_stock])
        print("predict_pc_list: ", predict_pc_list)

    elif is_highest_profit:
        for test_date in sorted_test_dates:
            highest_return = max(
                (actual[test_date][stock] for stock in stock_list),
                default=float('-inf'))
            predict_pc_list.append(highest_return)
        print("predict_pc_list: ", predict_pc_list)

    else:
        rmse_list = []
        for test_date in sorted_test_dates:
            date_predicted = predicted[test_date]
            date_actual = actual[test_date]
            complete_predict_value_list = [date_predicted[s] for s in stock_list]
            complete_actual_value_list = [date_actual[s] for s in stock_list]
            rmse_list.append(calculate_rmse(complete_actual_value_list,
                                            complete_predict_value_list))
            # max() keeps the first stock on ties, like a strict '>' scan.
            best_stock = max(stock_list, key=date_predicted.__getitem__)
            predict_pc_list.append(date_actual[best_stock])
            print("predict_pc_list: ", predict_pc_list)
            print("avg_pc: ", np.average(predict_pc_list))
            print("avg_rmse: ", np.average(rmse_list))

    return predict_pc_list
Beispiel #6
0
# ==========================================================================================================
# Resolve the directory two levels above this file and append its
# 'general_functions' sub-folder to sys.path so the local import below resolves.
parent_folder = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
mlp_path = os.path.join(parent_folder, 'general_functions')
sys.path.append(mlp_path)
# ==========================================================================================================

# ==========================================================================================================
# local package import
# ==========================================================================================================
from trade_general_funcs import calculate_rmse
# ==========================================================================================================

# ==========================================================================================================
# TEST-[1] <<<RMSE>>>
# ==========================================================================================================

# TODO complete the unit test
# Fixed sample vectors used to compare the two RMSE computations below.
y_actual = np.array([1, 2, 3.2, 8.7, 5.5, 112])
y_predicted = np.array([2, 3.9, 4.1, 0, 34.3, 9.9])

# rms1: reference value, the square root of mean_squared_error.
# rms2: the project's calculate_rmse on the same inputs.
rms1 = math.sqrt(mean_squared_error(y_actual, y_predicted))
rms2 = calculate_rmse(y_actual, y_predicted)

# The two values are only printed for manual comparison; nothing is asserted.
print("rms1: {}".format(rms1))
print("rms2: {}".format(rms2))

# ==========================================================================================================

# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# IMPORT IMPORT IMPORT IMPORT IMPORT IMPORT IMPORT IMPORT IMPORT IMPORT IMPORT IMPORT IMPORT IMPORT IMPORT I
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>