Example #1
    def test_2_dump_instruments(self):
        ori_ins = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
        res_ins = set(D.list_instruments(D.instruments("all"), as_list=True))
        assert len(ori_ins - res_ins) == len(res_ins - ori_ins) == 0, "dump instruments failed"
Example #2
    def instrument_callback(self, ibody, task_uri):
        """Target function for the established process when the received task asks for instrument data.

        Call the data provider to acquire data and publish the instrument data.
        """

        instruments = ibody["instruments"]
        start_time = ibody["start_time"]
        end_time = ibody["end_time"]
        if start_time == "None":
            start_time = None
        if end_time == "None":
            end_time = None
        freq = ibody["freq"]
        as_list = ibody["as_list"]
        status_code = 0
        # TODO: add exceptions detection and modify status_code
        self.logger.debug("process instrument data at %f" % time.time())
        try:
            instrument_result = D.list_instruments(instruments, start_time, end_time, freq, as_list)
            if isinstance(instrument_result, dict):
                instrument_result = {i: [(str(s), str(e)) for s, e in t] for i, t in instrument_result.items()}
            self.logger.debug("finish processing instrument data and publish message at %f" % time.time())
            self.publish_message("instrument", instrument_result, status_code, task_uri)
        except Exception as e:
            self.logger.exception("Error while processing request %.200s", e)
            self.publish_message("instrument", None, 1, task_uri, str(e))
Example #3
    def test_handler_storage(self):
        # init data handler
        data_handler = TestHandler(**self.data_handler_kwargs)

        # init data handler with hashing storage
        data_handler_hs = TestHandler(**self.data_handler_kwargs,
                                      infer_processors=["HashStockFormat"])

        fetch_start_time = "2019-01-01"
        fetch_end_time = "2019-12-31"
        instruments = D.instruments(market=self.market)
        instruments = D.list_instruments(instruments=instruments,
                                         start_time=fetch_start_time,
                                         end_time=fetch_end_time,
                                         as_list=True)

        with TimeInspector.logt("random fetch with DataFrame Storage"):

            # single stock
            for i in range(100):
                random_index = np.random.randint(len(instruments), size=1)[0]
                fetch_stock = instruments[random_index]
                data_handler.fetch(selector=(fetch_stock,
                                             slice(fetch_start_time,
                                                   fetch_end_time)),
                                   level=None)

            # multi stocks
            for i in range(100):
                random_indexs = np.random.randint(len(instruments), size=5)
                fetch_stocks = [
                    instruments[_index] for _index in random_indexs
                ]
                data_handler.fetch(selector=(fetch_stocks,
                                             slice(fetch_start_time,
                                                   fetch_end_time)),
                                   level=None)

        with TimeInspector.logt("random fetch with HasingStock Storage"):

            # single stock
            for i in range(100):
                random_index = np.random.randint(len(instruments), size=1)[0]
                fetch_stock = instruments[random_index]
                data_handler_hs.fetch(selector=(fetch_stock,
                                                slice(fetch_start_time,
                                                      fetch_end_time)),
                                      level=None)

            # multi stocks
            for i in range(100):
                random_indexs = np.random.randint(len(instruments), size=5)
                fetch_stocks = [
                    instruments[_index] for _index in random_indexs
                ]
                data_handler_hs.fetch(selector=(fetch_stocks,
                                                slice(fetch_start_time,
                                                      fetch_end_time)),
                                      level=None)
Example #4
    def _gen_stock_dataset(self, config, conf_type):
        try:
            path = config.pop("path")
        except KeyError as e:
            raise ValueError("Must specify the path to save the dataset.") from e

        if os.path.isfile(path + "tmp_dataset.pkl"):
            start = time.time()
            print_log("Dataset exists, load from disk.", __name__)
        else:
            start = time.time()
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            print_log("Generating dataset", __name__)
            self._prepare_calender_cache()
            dataset = init_instance_by_config(config)
            print_log(f"Dataset init, time cost: {time.time() - start:.2f}", __name__)
            dataset.config(dump_all=False, recursive=True)
            dataset.to_pickle(path + "tmp_dataset.pkl")

        with open(path + "tmp_dataset.pkl", "rb") as f:
            new_dataset = pkl.load(f)

        instruments = D.instruments(market="all")
        stock_list = D.list_instruments(
            instruments=instruments, start_time=self.start_time, end_time=self.end_time, freq="1min", as_list=True
        )

        def generate_dataset(stock):
            if os.path.isfile(path + stock + ".pkl"):
                print("exist " + stock)
                return
            self._init_qlib(self.qlib_conf)
            new_dataset.handler.config(**{"instruments": [stock]})
            if conf_type == "backtest":
                new_dataset.handler.setup_data()
            else:
                new_dataset.handler.setup_data(init_type=DataHandlerLP.IT_LS)
            new_dataset.config(dump_all=True, recursive=True)
            new_dataset.to_pickle(path + stock + ".pkl")

        Parallel(n_jobs=32)(delayed(generate_dataset)(stock) for stock in stock_list)
Example #5
def prepareTrainDataset(ifSavePortfolioIndex=False):
    print(
        "------------------------ Begin to prepare train dataset... ------------------------"
    )

    # read config file
    cf = configparser.ConfigParser()
    cf.read("config/config.ini")

    minDaysRange = int(cf.get("Parameter", "minDaysRange"))

    # offset of days
    numberOfYears = int(cf.get("Parameter", "numberOfYears"))
    numberOfMonths = int(cf.get("Parameter", "numberOfMonths"))
    numberOfDays = int(cf.get("Parameter", "numberOfDays"))

    # qlib init
    qlib.init(provider_uri='data/bin')

    # use the trading calendar as the reference for trading days
    calendar = D.calendar(freq='day')
    lastDay = calendar[-1]  # 2021-02-10 00:00:00
    firstDay = lastDay - DateOffset(years=numberOfYears,
                                    months=numberOfMonths,
                                    days=numberOfDays)  # 2018-02-10 00:00:00

    # exclude the influence of days without trading
    calendarBetweenFirstDayAndLastDay = D.calendar(freq='day',
                                                   start_time=firstDay,
                                                   end_time=lastDay)
    firstDayToAnalyze = calendarBetweenFirstDayAndLastDay[0]
    lastDayToAnalyze = calendarBetweenFirstDayAndLastDay[-1]

    # get portfolio
    pathOfDfSparsePortfolio = cf.get("Analyze", "pathOfDfSparsePortfolio")
    if not os.path.exists(pathOfDfSparsePortfolio):
        getSparseMatrixForPortfolioInAllFunds()
    dfSparsePortfolio = pd.read_csv(pathOfDfSparsePortfolio, index_col=0)

    if ifSavePortfolioIndex:
        dfPortfolioIndex = dfSparsePortfolio["FullElements"]
        dfPortfolioIndex.to_csv("data/dfPortfolioIndex.csv")

    folderToSaveTrainDataset = getFolderNameInConfig(
        "folderToSaveTrainDataset")  # the folder to save train dataset
    folderToSaveTestDataset = getFolderNameInConfig(
        "folderToSaveTestDataset")  # the folder to save test dataset

    count = 0
    instruments = D.instruments(market='all')
    for file in D.list_instruments(instruments=instruments, as_list=True):
        fundCode = file.split("_")[0]  # 000001

        if count % 100 == 0:
            print("count = %s\tfundCode=%s" % (count, fundCode))

        try:
            # skip this fund if its portfolio cannot be found
            try:
                dfSparsePortfolioForThisFund = dfSparsePortfolio[[fundCode]]
            except KeyError:
                continue

            # fetch the fund's accumulative net asset value (empty rows are dropped below)
            df = D.features([file], ['$AccumulativeNetAssetValue'],
                            start_time=firstDayToAnalyze,
                            end_time=lastDayToAnalyze)
            df.columns = ['AccumulativeNetAssetValue']
            #df = df.unstack(level=0)
            df["datetime"] = df.index.levels[1]

            # reset the index
            df = df.dropna(axis=0, subset=['datetime']).reset_index(drop=True)

            # like http://fundf10.eastmoney.com/jjjz_010476.html, the return in 30 days is 26%, so the annualized return is too high
            if df.shape[0] <= minDaysRange:
                continue

            # count the days between first day and last day
            day = df['datetime']
            # TODO: how about fund 519858, which traded on 2018-01-28 (a Sunday)?
            firstDayInThisFund = day[day.first_valid_index()]  # 2018-02-12 00:00:00, 2018-02-10 is Saturday
            lastDayInThisFund = day[day.last_valid_index()]  # 2021-02-10 00:00:00

            # must have value in latest day
            if (lastDayInThisFund - lastDayToAnalyze).days != 0:
                continue

            df['daysDiffWithLastDay'] = df['datetime'].apply(
                lambda x: (lastDayInThisFund - x).days)

            # get the value in important days
            netValueInFirstDay = df[df['datetime'] == firstDayInThisFund][
                "AccumulativeNetAssetValue"].tolist()[0]  # 4.046

            # build the train dataset from funds founded more than 3 years ago
            if (firstDayInThisFund - firstDayToAnalyze).days <= 0:
                # compute the adjust factor; the value over 3 years can be obtained by adjustFactorToLatestDay * (value[0]/value[day])
                df["adjustFactorToLatestDay"] = df[
                    "AccumulativeNetAssetValue"] / netValueInFirstDay
                df = df[["daysDiffWithLastDay", "adjustFactorToLatestDay"]]

                # abandon the latest day, it's meaningless
                df.reset_index(drop=True, inplace=True)
                df = df.T.drop(labels=0, axis=1).T
                # reset index to concat with dfSparsePortfolioForThisFund
                df.reset_index(drop=True, inplace=True)
                df = df.T

                # duplicate to concat with dfSparsePortfolioForThisFund
                dfSparsePortfolioForThisFund = pd.concat(
                    [dfSparsePortfolioForThisFund.T] * df.shape[1])
                # reset index to concat with dfSparsePortfolioForThisFund
                dfSparsePortfolioForThisFund = dfSparsePortfolioForThisFund.reset_index(
                    drop=True).T

                dfDataset = pd.concat([dfSparsePortfolioForThisFund, df],
                                      axis=0)
                dfDataset.to_csv(
                    os.path.join(folderToSaveTrainDataset,
                                 "%s.csv" % fundCode))
            else:
                dfInFirstDay = df[df['datetime'] ==
                                  firstDayInThisFund].reset_index(drop=True)
                dfInFirstDay = dfInFirstDay[["daysDiffWithLastDay"]].T
                dfInFirstDay[fundCode] = dfInFirstDay[0]
                dfDataset = pd.concat(
                    [dfSparsePortfolioForThisFund, dfInFirstDay[[fundCode]]],
                    axis=0)
                dfDataset.to_csv(
                    os.path.join(folderToSaveTestDataset, "%s.csv" % fundCode))

            count += 1
        except Exception as e:
            print("fundCode = %s\terror = %s" % (fundCode, e))
            continue

    print("------------------------ Done. ------------------------")
Example #6
    def test_1_dump_instruments(self):
        self.DUMP_DATA.dump_instruments()
        ori_ins = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.iterdir()))
        res_ins = set(D.list_instruments(D.instruments("all"), as_list=True))
        assert len(ori_ins - res_ins) == len(res_ins - ori_ins) == 0, "dump instruments failed"
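
All six examples share the same basic pattern: initialize qlib against a data provider, build an instrument config with D.instruments, and expand it with D.list_instruments. A minimal standalone sketch, with a placeholder provider_uri and date range:

import qlib
from qlib.data import D

# Point qlib at a local data directory (placeholder path).
qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")

# Build an instrument config for a market and expand it into the list of codes
# that were tradable inside the given date range.
instruments = D.instruments(market="all")
stock_list = D.list_instruments(
    instruments=instruments,
    start_time="2019-01-01",
    end_time="2019-12-31",
    as_list=True,
)
print(len(stock_list), stock_list[:5])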