Example #1
 def _dump_instruments(self):
     logger.info("start dump instruments......")
     _fun = partial(self._get_date, is_begin_end=True)
     # only keep files whose symbol is not already in the existing instruments list
     new_stock_files = sorted(
         filter(
             lambda x: fname_to_code(x.name[: -len(self.file_suffix)].strip().lower()).upper()
             not in self._old_instruments,
             self.csv_files,
         )
     )
     with tqdm(total=len(new_stock_files)) as p_bar:
         with ProcessPoolExecutor(max_workers=self.works) as execute:
             # compute (begin, end) dates in parallel; map() yields results in input order,
             # so zip() keeps each file paired with its own date range
             for file_path, (_begin_time, _end_time) in zip(
                     new_stock_files, execute.map(_fun, new_stock_files)):
                 if isinstance(_begin_time, pd.Timestamp) and isinstance(_end_time, pd.Timestamp):
                     symbol = fname_to_code(self.get_symbol_from_file(file_path).lower()).upper()
                     _dt_map = self._old_instruments.setdefault(symbol, dict())
                     _dt_map[self.INSTRUMENTS_START_FIELD] = self._format_datetime(_begin_time)
                     _dt_map[self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end_time)
                 p_bar.update()
     _inst_df = pd.DataFrame.from_dict(self._old_instruments, orient="index")
     _inst_df.index.names = [self.symbol_field_name]
     self.save_instruments(_inst_df.reset_index())
     logger.info("end of instruments dump.\n")
Example #2
    def _dump_bin(self, file_or_data: Union[Path, pd.DataFrame],
                  calendar_list: List[pd.Timestamp]):
        if not calendar_list:
            logger.warning("calendar_list is empty")
            return
        if isinstance(file_or_data, pd.DataFrame):
            if file_or_data.empty:
                return
            code = fname_to_code(
                str(file_or_data.iloc[0][self.symbol_field_name]).lower())
            df = file_or_data
        elif isinstance(file_or_data, Path):
            code = self.get_symbol_from_file(file_or_data)
            df = self._get_source_data(file_or_data)
        else:
            raise ValueError(f"not support {type(file_or_data)}")
        if df is None or df.empty:
            logger.warning(f"{code} data is None or empty")
            return

        # drop duplicate rows first, otherwise reindexing against the calendar raises an exception
        df = df.drop_duplicates(self.date_field_name)

        # features save dir
        features_dir = self._features_dir.joinpath(code_to_fname(code).lower())
        features_dir.mkdir(parents=True, exist_ok=True)
        self._data_to_bin(df, calendar_list, features_dir)
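
The drop_duplicates call above matters because pandas refuses to reindex a frame whose index contains duplicate labels and raises ValueError. A minimal sketch of the failure and the fix; the column names and dates are made up for illustration, and _data_to_bin itself is not reproduced here:

import pandas as pd

calendar = pd.date_range("2020-01-01", periods=3, freq="D")
# toy frame with a duplicated date; column names are illustrative only
df = pd.DataFrame(
    {"date": ["2020-01-01", "2020-01-01", "2020-01-02"], "close": [10.0, 10.0, 10.5]}
)
df["date"] = pd.to_datetime(df["date"])

try:
    df.set_index("date").reindex(calendar)  # duplicated 2020-01-01 -> ValueError
except ValueError as e:
    print("reindex failed:", e)

deduped = df.drop_duplicates("date").set_index("date").reindex(calendar)
print(deduped)
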
Example #3
    def _dump_features(self):
        logger.info("start dump features......")
        error_code = {}
        with ProcessPoolExecutor(max_workers=self.works) as executor:
            futures = {}
            for _code, _df in self._all_data.groupby(self.symbol_field_name):
                _code = fname_to_code(str(_code).lower()).upper()
                _start, _end = self._get_date(_df, is_begin_end=True)
                if not (isinstance(_start, pd.Timestamp) and isinstance(_end, pd.Timestamp)):
                    continue
                if _code in self._update_instruments:
                    self._update_instruments[_code][self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end)
                    futures[executor.submit(self._dump_bin, _df, self._update_calendars)] = _code
                else:
                    # new stock
                    _dt_range = self._update_instruments.setdefault(_code, dict())
                    _dt_range[self.INSTRUMENTS_START_FIELD] = self._format_datetime(_start)
                    _dt_range[self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end)
                    futures[executor.submit(self._dump_bin, _df, self._new_calendar_list)] = _code

            with tqdm(total=len(futures)) as p_bar:
                for _future in as_completed(futures):
                    try:
                        _future.result()
                    except Exception:
                        error_code[futures[_future]] = traceback.format_exc()
                    p_bar.update()
            logger.info(f"dump bin errors: {error_code}")

        logger.info("end of features dump.\n")
Example #4
 def save_instruments(self, instruments_data: Union[list, pd.DataFrame]):
     self._instruments_dir.mkdir(parents=True, exist_ok=True)
     instruments_path = str(self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME).resolve())
     if isinstance(instruments_data, pd.DataFrame):
         _df_fields = [self.symbol_field_name, self.INSTRUMENTS_START_FIELD, self.INSTRUMENTS_END_FIELD]
         instruments_data = instruments_data.loc[:, _df_fields]
         instruments_data[self.symbol_field_name] = instruments_data[self.symbol_field_name].apply(
             lambda x: fname_to_code(x.lower()).upper()
         )
         instruments_data.to_csv(instruments_path, header=False, sep=self.INSTRUMENTS_SEP, index=False)
     else:
         np.savetxt(instruments_path, instruments_data, fmt="%s", encoding="utf-8")
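
For reference, the DataFrame branch above writes a headerless, separator-delimited file with one symbol per row. A minimal sketch of the resulting layout, assuming a tab separator (the actual value of INSTRUMENTS_SEP is not shown in these examples) and illustrative column and file names:

import pandas as pd

# illustrative frame; the real field names come from symbol_field_name,
# INSTRUMENTS_START_FIELD and INSTRUMENTS_END_FIELD
inst = pd.DataFrame(
    {
        "symbol": ["SH600000", "SZ000001"],
        "start_datetime": ["2020-01-02", "2020-01-02"],
        "end_datetime": ["2020-12-31", "2021-06-30"],
    }
)
# header=False and index=False match the call above; "\t" is an assumed separator
inst.to_csv("instruments_example.txt", header=False, sep="\t", index=False)
# the file then contains lines like:
# SH600000<TAB>2020-01-02<TAB>2020-12-31
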
Example #5
 def get_symbol_from_file(self, file_path: Path) -> str:
     return fname_to_code(file_path.name[: -len(self.file_suffix)].strip().lower())
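
The slice file_path.name[: -len(self.file_suffix)] simply strips a fixed suffix before the name is normalized and passed to fname_to_code. A standalone sketch of that step, assuming a ".csv" suffix and leaving fname_to_code out of the picture:

from pathlib import Path

FILE_SUFFIX = ".csv"  # assumed value of self.file_suffix

def symbol_from_file(file_path: Path) -> str:
    # strip the suffix, then normalize whitespace and case
    return file_path.name[: -len(FILE_SUFFIX)].strip().lower()

print(symbol_from_file(Path("SH600000.csv")))  # -> "sh600000"
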
Example #6
 def symbol_to_yahoo(self, symbol):
     return fname_to_code(symbol)
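
All six examples funnel symbols through fname_to_code, and Example #2 uses its counterpart code_to_fname, so instrument codes and file names round-trip consistently. A usage sketch, assuming the two helpers are importable from qlib.utils as in the qlib code base and act as inverses for ordinary symbols:

# assumes qlib is installed; fname_to_code/code_to_fname come from qlib.utils
from qlib.utils import code_to_fname, fname_to_code

code = "SH600000"
fname = code_to_fname(code)                  # file-system-safe name for the symbol
assert fname_to_code(fname).upper() == code  # round-trips back to the code
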