Example #1
def load_dataset(market='csi300'):

    # features
    fields = []
    names = []

    fields += ['$open/$close']  # NOTE: Ref($open, 0) != $open
    fields += ['Ref($open, %d)/$close' % d for d in range(1, 60)]
    names  += ['OPEN%d'%d for d in range(60)]

    fields += ['$high/$close']
    fields += ['Ref($high, %d)/$close' % d for d in range(1, 60)]
    names  += ['HIGH%d'%d for d in range(60)]

    fields += ['$low/$close']
    fields += ['Ref($low, %d)/$close' % d for d in range(1, 60)]
    names  += ['LOW%d'%d for d in range(60)]

    fields += ['$close/$close']  # 1
    fields += ['Ref($close, %d)/$close' % d for d in range(1, 60)]
    names  += ['CLOSE%d'%d for d in range(60)]

    fields += ['$vwap/$close']
    fields += ['Ref($vwap, %d)/$close' % d for d in range(1, 60)]
    names  += ['VWAP%d'%d for d in range(60)]

    # fields += ['Log($volume/$volume)']  # 1
    # fields += ['Log(Ref($volume, %d)/$volume)' % d for d in range(1, 60)]
    # names  += ['VOLUME%d'%d for d in range(60)]

    fields += ['$volume/$volume']  # 1
    fields += ['Ref($volume, %d)/$volume' % d for d in range(1, 60)]
    names  += ['VOLUME%d'%d for d in range(60)]

    # labels
    labels = ['Ref($vwap, -2)/Ref($vwap, -1)-1']
    label_names = ['LABEL0']

    ## load features
    print('loading features...')
    df = D.features(D.instruments(market), fields, start_time='2007-01-01')
    df.columns = names
    print('load features over')
    ## load labels
    if len(labels):
        print('loading labels...')
        df_labels = D.features(D.instruments('all'), labels, start_time='2007-01-01')
        df_labels.columns = label_names
        df[label_names] = df_labels
        print('load labels over')

    return df, names, label_names
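
The snippet above assumes qlib has already been initialized and that D is imported from qlib.data; a minimal setup sketch (the provider path is an assumption, not part of the original example):

import qlib
from qlib.data import D  # the D used inside load_dataset

# hypothetical data location; point provider_uri at your own qlib bin data
qlib.init(provider_uri="~/.qlib/qlib_data/cn_data", region="cn")

df, feature_names, label_names = load_dataset(market="csi300")
print(df[feature_names].shape, df[label_names].shape)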
Example #2
    def load_data(self):
        ret = D.calendar(start_time='2010-01-01', end_time='2017-12-31', freq='day')[:2]
        print(ret)

        instruments = D.instruments('csi300')  # ['SH600570','SH600000']
        fields = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low']
        data = D.features(instruments, fields, start_time='2010-01-01', end_time='2017-12-31', freq='day')
Example #3
    def load_group_df(
        self,
        instruments,
        exprs: list,
        names: list,
        start_time: Union[str, pd.Timestamp] = None,
        end_time: Union[str, pd.Timestamp] = None,
        gp_name: str = None,
    ) -> pd.DataFrame:
        if instruments is None:
            warnings.warn("`instruments` is not set, will load all stocks")
            instruments = "all"
        if isinstance(instruments, str):
            instruments = D.instruments(instruments, filter_pipe=self.filter_pipe)
        elif self.filter_pipe is not None:
            warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list")

        freq = self.freq[gp_name] if isinstance(self.freq, dict) else self.freq
        df = D.features(
            instruments, exprs, start_time, end_time, freq=freq, inst_processors=self.inst_processor.get(gp_name, [])
        )
        df.columns = names
        if self.swap_level:
            df = df.swaplevel().sort_index()  # NOTE: if swaplevel, return <datetime, instrument>
        return df
Example #4
 def test_2_dump_instruments(self):
     ori_ins = set(
         map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
     res_ins = set(D.list_instruments(D.instruments("all"), as_list=True))
     assert len(ori_ins - res_ins) == 0, "dump instruments failed"
Example #5
    def test_0_qlib_data(self):

        GetData().qlib_data_cn(QLIB_DIR)
        df = D.features(D.instruments("csi300"), self.FIELDS)
        self.assertListEqual(list(df.columns), self.FIELDS,
                             "get qlib data failed")
        self.assertFalse(df.dropna().empty, "get qlib data failed")
Example #6
    def test_handler_storage(self):
        # init data handler
        data_handler = TestHandler(**self.data_handler_kwargs)

        # init data handler with hashing storage
        data_handler_hs = TestHandler(**self.data_handler_kwargs,
                                      infer_processors=["HashStockFormat"])

        fetch_start_time = "2019-01-01"
        fetch_end_time = "2019-12-31"
        instruments = D.instruments(market=self.market)
        instruments = D.list_instruments(instruments=instruments,
                                         start_time=fetch_start_time,
                                         end_time=fetch_end_time,
                                         as_list=True)

        with TimeInspector.logt("random fetch with DataFrame Storage"):

            # single stock
            for i in range(100):
                random_index = np.random.randint(len(instruments), size=1)[0]
                fetch_stock = instruments[random_index]
                data_handler.fetch(selector=(fetch_stock,
                                             slice(fetch_start_time,
                                                   fetch_end_time)),
                                   level=None)

            # multi stocks
            for i in range(100):
                random_indexs = np.random.randint(len(instruments), size=5)
                fetch_stocks = [
                    instruments[_index] for _index in random_indexs
                ]
                data_handler.fetch(selector=(fetch_stocks,
                                             slice(fetch_start_time,
                                                   fetch_end_time)),
                                   level=None)

        with TimeInspector.logt("random fetch with HasingStock Storage"):

            # single stock
            for i in range(100):
                random_index = np.random.randint(len(instruments), size=1)[0]
                fetch_stock = instruments[random_index]
                data_handler_hs.fetch(selector=(fetch_stock,
                                                slice(fetch_start_time,
                                                      fetch_end_time)),
                                      level=None)

            # multi stocks
            for i in range(100):
                random_indexs = np.random.randint(len(instruments), size=5)
                fetch_stocks = [
                    instruments[_index] for _index in random_indexs
                ]
                data_handler_hs.fetch(selector=(fetch_stocks,
                                                slice(fetch_start_time,
                                                      fetch_end_time)),
                                      level=None)
Example #7
 def testClose(self):
     close_p = D.features(D.instruments("csi300"),
                          ["Ref($close, 1)/$close - 1"])
     close_desc = close_p.describe(percentiles=np.arange(0.1, 1.0, 0.1))
     print(close_desc)
     self.assertLessEqual(abs(close_desc.loc["90%"][0]), 0.1,
                          "Close value is abnormal")
     self.assertLessEqual(abs(close_desc.loc["10%"][0]), 0.1,
                          "Close value is abnormal")
Example #8
    def _get_old_data(self, qlib_data_dir: [str, Path]):
        import qlib
        from qlib.data import D

        qlib_data_dir = str(Path(qlib_data_dir).expanduser().resolve())
        qlib.init(provider_uri=qlib_data_dir, expression_cache=None, dataset_cache=None)
        df = D.features(D.instruments("all"), ["$close/$factor", "$adjclose/$close"])
        df.columns = [self._ori_close_field, self._first_close_field]
        return df
Example #9
def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"):

    universe = D.features(D.instruments("csi300"), ["$close"],
                          start_time=start_time).swaplevel().sort_index()

    price_all = (D.features(
        D.instruments("all"), ["$close"],
        start_time=start_time).squeeze().unstack(level="instrument"))

    # StructuredCovEstimator is a statistical risk model
    riskmodel = StructuredCovEstimator()

    for i in range(T - 1, len(price_all)):

        date = price_all.index[i]
        ref_date = price_all.index[i - T + 1]

        print(date)

        codes = universe.loc[date].index
        price = price_all.loc[ref_date:date, codes]

        # calculate return and remove extreme return
        ret = price.pct_change()
        ret.clip(ret.quantile(0.025),
                 ret.quantile(0.975),
                 axis=1,
                 inplace=True)

        # run risk model
        F, cov_b, var_u = riskmodel.predict(ret,
                                            is_price=False,
                                            return_decomposed_components=True)

        # save risk data
        root = riskdata_root + "/" + date.strftime("%Y%m%d")
        os.makedirs(root, exist_ok=True)

        pd.DataFrame(F, index=codes).to_pickle(root + "/factor_exp.pkl")
        pd.DataFrame(cov_b).to_pickle(root + "/factor_cov.pkl")
        # for specific_risk we follow the convention to save volatility
        pd.Series(np.sqrt(var_u),
                  index=codes).to_pickle(root + "/specific_risk.pkl")
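
The pickles saved above can be read back with pandas when the risk data is needed later (for example by a portfolio optimizer); a minimal sketch with a hypothetical date folder:

import pandas as pd

root = "./riskdata/20200102"  # assumed folder produced by prepare_data for one date
factor_exp = pd.read_pickle(root + "/factor_exp.pkl")        # per-stock factor exposures
factor_cov = pd.read_pickle(root + "/factor_cov.pkl")        # factor covariance matrix
specific_risk = pd.read_pickle(root + "/specific_risk.pkl")  # per-stock volatility (see note above)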
Example #10
    def _get_all_1d_data(self):
        import qlib
        from qlib.data import D

        qlib.init(provider_uri=self.qlib_data_1d_dir)
        df = D.features(D.instruments("all"), ["$paused", "$volume", "$factor", "$close"], freq="day")
        df.reset_index(inplace=True)
        df.rename(columns={"datetime": self._date_field_name, "instrument": self._symbol_field_name}, inplace=True)
        df.columns = list(map(lambda x: x[1:] if x.startswith("$") else x, df.columns))
        return df
Example #11
def fill_1min_using_1d(
    data_1min_dir: [str, Path],
    qlib_data_1d_dir: [str, Path],
    max_workers: int = 16,
    date_field_name: str = "date",
    symbol_field_name: str = "symbol",
):
    """Use 1d data to fill in the missing symbols relative to 1min

    Parameters
    ----------
    data_1min_dir: str
        1min data dir
    qlib_data_1d_dir: str
        1d qlib data (bin data) dir, from: https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format
    max_workers: int
        ThreadPoolExecutor(max_workers), by default 16
    date_field_name: str
        date field name, by default date
    symbol_field_name: str
        symbol field name, by default symbol

    """
    data_1min_dir = Path(data_1min_dir).expanduser().resolve()
    qlib_data_1d_dir = Path(qlib_data_1d_dir).expanduser().resolve()

    min_date, max_date = get_date_range(data_1min_dir, max_workers, date_field_name)
    symbols_1min = get_symbols(data_1min_dir)

    qlib.init(provider_uri=str(qlib_data_1d_dir))
    data_1d = D.features(D.instruments("all"), ["$close"], min_date, max_date, freq="day")

    miss_symbols = set(data_1d.index.get_level_values(level="instrument").unique()) - set(symbols_1min)
    if not miss_symbols:
        logger.warning("More symbols in 1min than 1d, no padding required")
        return

    logger.info(f"miss_symbols  {len(miss_symbols)}: {miss_symbols}")
    tmp_df = pd.read_csv(list(data_1min_dir.glob("*.csv"))[0])
    columns = tmp_df.columns
    _si = tmp_df[symbol_field_name].first_valid_index()
    is_lower = tmp_df.loc[_si][symbol_field_name].islower()
    for symbol in tqdm(miss_symbols):
        if is_lower:
            symbol = symbol.lower()
        index_1d = data_1d.loc(axis=0)[symbol.upper()].index
        index_1min = generate_minutes_calendar_from_daily(index_1d)
        index_1min.name = date_field_name
        _df = pd.DataFrame(columns=columns, index=index_1min)
        if date_field_name in _df.columns:
            del _df[date_field_name]
        _df.reset_index(inplace=True)
        _df[symbol_field_name] = symbol
        _df["paused_num"] = 0
        _df.to_csv(data_1min_dir.joinpath(f"{symbol}.csv"), index=False)
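
Following the docstring above, a minimal invocation sketch (both directories are assumptions):

fill_1min_using_1d(
    data_1min_dir="~/stock_data/1min_csv",         # assumed directory of 1min csv files
    qlib_data_1d_dir="~/.qlib/qlib_data/cn_data",  # assumed 1d qlib bin data directory
    max_workers=8,
)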
Example #12
 def testClose(self):
     close_p = D.features(D.instruments('csi300'),
                          ['Ref($close, 1)/$close - 1'])
     close_desc = close_p.describe(percentiles=np.arange(0.1, 0.9, 0.1))
     print(close_desc)
     self.assertLessEqual(abs(close_desc.loc["80%"][0]), 0.1,
                          "Close value is abnormal")
     self.assertLessEqual(abs(close_desc.loc["max"][0]), 0.2,
                          "Close value is abnormal")
     self.assertGreaterEqual(close_desc.loc["min"][0], -0.2,
                             "Close value is abnormal")
Example #13
    def test_0_qlib_data(self):

        GetData().qlib_data(name="qlib_data_simple",
                            target_dir=QLIB_DIR,
                            region="cn",
                            interval="1d",
                            version="latest")
        df = D.features(D.instruments("csi300"), self.FIELDS)
        self.assertListEqual(list(df.columns), self.FIELDS,
                             "get qlib data failed")
        self.assertFalse(df.dropna().empty, "get qlib data failed")
Example #14
    def load_group_df(self, instruments, exprs: list, names: list, start_time=None, end_time=None) -> pd.DataFrame:
        if instruments is None:
            warnings.warn("`instruments` is not set, will load all stocks")
            instruments = "all"
        if isinstance(instruments, str):
            instruments = D.instruments(instruments, filter_pipe=self.filter_pipe)
        elif self.filter_pipe is not None:
            warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list")

        df = D.features(instruments, exprs, start_time, end_time)
        df.columns = names
        df = df.swaplevel().sort_index()  # NOTE: always return <datetime, instrument>
        return df
Example #15
    def testCSI300(self):
        close_p = D.features(D.instruments("csi300"), ["$close"])
        size = close_p.groupby("datetime").size()
        cnt = close_p.groupby("datetime").count()["$close"]
        size_desc = size.describe(percentiles=np.arange(0.1, 1.0, 0.1))
        cnt_desc = cnt.describe(percentiles=np.arange(0.1, 1.0, 0.1))

        print(size_desc)
        print(cnt_desc)

        self.assertLessEqual(size_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
        self.assertGreaterEqual(size_desc.loc["80%"], 290, "Insufficient number of CSI300 constituent stocks")

        self.assertLessEqual(cnt_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
Example #16
    def setUpClass(cls, enable_1d_type="simple", enable_1min=False) -> None:
        # use default data
        super().setUpClass(enable_1d_type, enable_1min)
        nameDFilter = NameDFilter(name_rule_re="SH600110")
        instruments = D.instruments("csi300", filter_pipe=[nameDFilter])
        start_time = "2005-01-04"
        end_time = "2005-12-31"
        freq = "day"

        instruments_d = DatasetD.get_instruments_d(instruments, freq)
        cls.instruments_d = instruments_d
        cal = Cal.calendar(start_time, end_time, freq)
        cls.cal = cal
        cls.start_time = cal[0]
        cls.end_time = cal[-1]
        cls.inst = list(instruments_d.keys())[0]
        cls.spans = list(instruments_d.values())[0]
Example #17
    def _gen_stock_dataset(self, config, conf_type):
        try:
            path = config.pop("path")
        except KeyError as e:
            raise ValueError("Must specify the path to save the dataset.") from e

        if os.path.isfile(path + "tmp_dataset.pkl"):
            start = time.time()
            print_log("Dataset exists, load from disk.", __name__)
        else:
            start = time.time()
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            print_log("Generating dataset", __name__)
            self._prepare_calender_cache()
            dataset = init_instance_by_config(config)
            print_log(f"Dataset init, time cost: {time.time() - start:.2f}", __name__)
            dataset.config(dump_all=False, recursive=True)
            dataset.to_pickle(path + "tmp_dataset.pkl")

        with open(path + "tmp_dataset.pkl", "rb") as f:
            new_dataset = pkl.load(f)

        instruments = D.instruments(market="all")
        stock_list = D.list_instruments(
            instruments=instruments, start_time=self.start_time, end_time=self.end_time, freq="1min", as_list=True
        )

        def generate_dataset(stock):
            if os.path.isfile(path + stock + ".pkl"):
                print("exist " + stock)
                return
            self._init_qlib(self.qlib_conf)
            new_dataset.handler.config(**{"instruments": [stock]})
            if conf_type == "backtest":
                new_dataset.handler.setup_data()
            else:
                new_dataset.handler.setup_data(init_type=DataHandlerLP.IT_LS)
            new_dataset.config(dump_all=True, recursive=True)
            new_dataset.to_pickle(path + stock + ".pkl")

        Parallel(n_jobs=32)(delayed(generate_dataset)(stock) for stock in stock_list)
Example #18
    def testCSI300(self):
        close_p = D.features(D.instruments('csi300'), ['$close'])
        size = close_p.groupby('datetime').size()
        cnt = close_p.groupby('datetime').count()
        size_desc = size.describe(percentiles=np.arange(0.1, 0.9, 0.1))
        cnt_desc = cnt.describe(percentiles=np.arange(0.1, 0.9, 0.1))

        print(size_desc)
        print(cnt_desc)

        self.assertLessEqual(size_desc.loc["max"][0], 305,
                             "Excessive number of CSI300 constituent stocks")
        self.assertLessEqual(
            size_desc.loc["80%"][0], 290,
            "Insufficient number of CSI300 constituent stocks")

        self.assertLessEqual(cnt_desc.loc["max"][0], 305,
                             "Excessive number of CSI300 constituent stocks")
        self.assertEqual(cnt_desc.loc["80%"][0], 300,
                         "Insufficient number of CSI300 constituent stocks")
Example #19
    def generate_target_weight_position(self, score, current, trade_start_time,
                                        trade_end_time):

        trade_date = trade_start_time
        pre_date = get_pre_trading_date(trade_date,
                                        future=True)  # previous trade date

        # load risk data
        outs = self.get_risk_data(pre_date)
        if outs is None:
            self.logger.warning(
                f"no risk data for {pre_date:%Y-%m-%d}, skip optimization")
            return None
        factor_exp, factor_cov, specific_risk, universe, blacklist = outs

        # transform score
        # NOTE: for stocks missing score, we always assume they have the lowest score
        score = score.reindex(universe).fillna(score.min()).values

        # get current weight
        # NOTE: if a stock is not in universe, its current weight will be zero
        cur_weight = current.get_stock_weight_dict(only_stock=False)
        cur_weight = np.array([cur_weight.get(stock, 0) for stock in universe])
        assert all(cur_weight >= 0), "current weight has negative values"
        cur_weight = cur_weight / self.get_risk_degree(
            trade_date)  # sum of weight should be risk_degree
        if cur_weight.sum() > 1 and self.verbose:
            self.logger.warning(
                f"previous total holdings excess risk degree (current: {cur_weight.sum()})"
            )

        # load bench weight
        bench_weight = D.features(D.instruments("all"),
                                  [f"${self.market}_weight"],
                                  start_time=pre_date,
                                  end_time=pre_date).squeeze()
        bench_weight.index = bench_weight.index.droplevel(level="datetime")
        bench_weight = bench_weight.reindex(universe).fillna(0).values

        # whether stock tradable
        # NOTE: currently we use last day volume to check whether tradable
        tradable = D.features(D.instruments("all"), ["$volume"],
                              start_time=pre_date,
                              end_time=pre_date).squeeze()
        tradable.index = tradable.index.droplevel(level="datetime")
        tradable = tradable.reindex(universe).gt(0).values
        mask_force_hold = ~tradable

        # mask force sell
        mask_force_sell = np.array([stock in blacklist for stock in universe],
                                   dtype=bool)

        # optimize
        weight = self.optimizer(
            r=score,
            F=factor_exp,
            cov_b=factor_cov,
            var_u=specific_risk**2,
            w0=cur_weight,
            wb=bench_weight,
            mfh=mask_force_hold,
            mfs=mask_force_sell,
        )

        target_weight_position = {
            stock: weight
            for stock, weight in zip(universe, weight) if weight > 0
        }

        if self.verbose:
            self.logger.info("trade date: {:%Y-%m-%d}".format(trade_date))
            self.logger.info("number of holding stocks: {}".format(
                len(target_weight_position)))
            self.logger.info("total holding weight: {:.6f}".format(
                weight.sum()))

        return target_weight_position
Example #20
def prepareTrainDataset(ifSavePortfolioIndex=False):
    print(
        "------------------------ Begin to prepare train dataset... ------------------------"
    )

    # read config file
    cf = configparser.ConfigParser()
    cf.read("config/config.ini")

    minDaysRange = int(cf.get("Parameter", "minDaysRange"))

    # offset of days
    numberOfYears = int(cf.get("Parameter", "numberOfYears"))
    numberOfMonths = int(cf.get("Parameter", "numberOfMonths"))
    numberOfDays = int(cf.get("Parameter", "numberOfDays"))

    # qlib init
    qlib.init(provider_uri='data/bin')

    # use one fund be the standard of trading day
    calendar = D.calendar(freq='day')
    lastDay = calendar[-1]  # 2021-02-10 00:00:00
    firstDay = lastDay - DateOffset(years=numberOfYears,
                                    months=numberOfMonths,
                                    days=numberOfDays)  # 2018-02-10 00:00:00

    # exclude the influence of days without trading
    calendarBetweenFirstDayAndLastDay = D.calendar(freq='day',
                                                   start_time=firstDay,
                                                   end_time=lastDay)
    firstDayToAnalyze = calendarBetweenFirstDayAndLastDay[0]
    lastDayToAnalyze = calendarBetweenFirstDayAndLastDay[-1]

    # get portfolio
    pathOfDfSparsePortfolio = cf.get("Analyze", "pathOfDfSparsePortfolio")
    if not os.path.exists(pathOfDfSparsePortfolio):
        getSparseMatrixForPortfolioInAllFunds()
    dfSparsePortfolio = pd.read_csv(pathOfDfSparsePortfolio, index_col=0)

    if ifSavePortfolioIndex:
        dfPortfolioIndex = dfSparsePortfolio["FullElements"]
        dfPortfolioIndex.to_csv("data/dfPortfolioIndex.csv")

    folderToSaveTrainDataset = getFolderNameInConfig(
        "folderToSaveTrainDataset")  # the folder to save train dataset
    folderToSaveTestDataset = getFolderNameInConfig(
        "folderToSaveTestDataset")  # the folder to save test dataset

    count = 0
    instruments = D.instruments(market='all')
    for file in D.list_instruments(instruments=instruments, as_list=True):
        fundCode = file.split("_")[0]  # 000001

        if count % 100 == 0:
            print("count = %s\tfundCode=%s" % (count, fundCode))

        try:
            # skip this fund if we can't find its portfolio
            try:
                dfSparsePortfolioForThisFund = dfSparsePortfolio[[fundCode]]
            except KeyError:
                continue

            # read file and remove empty line
            df = D.features([file], ['$AccumulativeNetAssetValue'],
                            start_time=firstDayToAnalyze,
                            end_time=lastDayToAnalyze)
            df.columns = ['AccumulativeNetAssetValue']
            #df = df.unstack(level=0)
            df["datetime"] = df.index.levels[1]

            # reset the index
            df = df.dropna(axis=0, subset=['datetime']).reset_index(drop=True)

            # like http://fundf10.eastmoney.com/jjjz_010476.html, the return in 30 days is 26%, so the annualized return is too high
            if df.shape[0] <= minDaysRange:
                continue

            # count the days between first day and last day
            day = df['datetime']
            # TODO: how about fund 519858, which trade in 2018-01-28 (Sunday)
            firstDayInThisFund = day[
                day.first_valid_index()]  # 2018-02-12 00:00:00, 2018-02-10 is Saturday
            lastDayInThisFund = day[
                day.last_valid_index()]  # 2021-02-10 00:00:00

            # must have value in latest day
            if (lastDayInThisFund - lastDayToAnalyze).days != 0:
                continue

            df['daysDiffWithLastDay'] = df['datetime'].apply(
                lambda x: (lastDayInThisFund - x).days)

            # get the value in important days
            netValueInFirstDay = df[df['datetime'] == firstDayInThisFund][
                "AccumulativeNetAssetValue"].tolist()[0]  # 4.046

            # get train dataset which found more than 3 years
            if (firstDayInThisFund - firstDayToAnalyze).days <= 0:
                # count the adjust factor, we can get the value in 3 years by adjustFactorToLatestDay * (value[0]/value[day])
                df["adjustFactorToLatestDay"] = df[
                    "AccumulativeNetAssetValue"] / netValueInFirstDay
                df = df[["daysDiffWithLastDay", "adjustFactorToLatestDay"]]

                # abandon the latest day, it's meaningless
                df.reset_index(drop=True, inplace=True)
                df = df.T.drop(labels=0, axis=1).T
                # reset index to concat with dfSparsePortfolioForThisFund
                df.reset_index(drop=True, inplace=True)
                df = df.T

                # duplicate to concat with dfSparsePortfolioForThisFund
                dfSparsePortfolioForThisFund = pd.concat(
                    [dfSparsePortfolioForThisFund.T] * df.shape[1])
                # reset index to concat with dfSparsePortfolioForThisFund
                dfSparsePortfolioForThisFund = dfSparsePortfolioForThisFund.reset_index(
                    drop=True).T

                dfDataset = pd.concat([dfSparsePortfolioForThisFund, df],
                                      axis=0)
                dfDataset.to_csv(
                    os.path.join(folderToSaveTrainDataset,
                                 "%s.csv" % fundCode))
            else:
                dfInFirstDay = df[df['datetime'] ==
                                  firstDayInThisFund].reset_index(drop=True)
                dfInFirstDay = dfInFirstDay[["daysDiffWithLastDay"]].T
                dfInFirstDay[fundCode] = dfInFirstDay[0]
                dfDataset = pd.concat(
                    [dfSparsePortfolioForThisFund, dfInFirstDay[[fundCode]]],
                    axis=0)
                dfDataset.to_csv(
                    os.path.join(folderToSaveTestDataset, "%s.csv" % fundCode))

            count += 1
        except Exception as e:
            print("fundCode = %s\terror = %s" % (fundCode, e))
            continue

    print("------------------------ Done. ------------------------")
Example #21
 def test_1_dump_instruments(self):
     self.DUMP_DATA.dump_instruments()
     ori_ins = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.iterdir()))
     res_ins = set(D.list_instruments(D.instruments("all"), as_list=True))
     assert len(ori_ins - res_ins) == 0, "dump instruments failed"
Example #22
def get_features(fields):
    qlib.init(provider_uri=TestAutoData.provider_uri,
              expression_cache=None,
              dataset_cache=None,
              joblib_backend="loky")
    return D.features(D.instruments("csi300"), fields)