def _dump_instruments(self):
    """Add date ranges for not-yet-known instruments and save the table.

    Every file in ``self.csv_files`` whose upper-cased symbol is not already
    a key of ``self._old_instruments`` is handed to ``self._get_date`` (with
    ``is_begin_end=True``) in a process pool. When both returned values are
    ``pd.Timestamp`` objects, the formatted begin/end dates are stored in
    ``self._old_instruments`` under the instrument's symbol. The full mapping
    is then converted to a DataFrame and passed to ``self.save_instruments``.
    """
    logger.info("start dump instruments......")
    date_range_of = partial(self._get_date, is_begin_end=True)
    # Derive the symbol through the shared helper so that this membership
    # test and the rest of the class agree on how a file name maps to a symbol.
    new_stock_files = sorted(
        path
        for path in self.csv_files
        if self.get_symbol_from_file(path).upper() not in self._old_instruments
    )
    with tqdm(total=len(new_stock_files)) as p_bar:
        with ProcessPoolExecutor(max_workers=self.works) as executor:
            # executor.map yields results in input order, so zip pairs each
            # file with its own (begin, end) result.
            date_ranges = executor.map(date_range_of, new_stock_files)
            for file_path, (begin_time, end_time) in zip(new_stock_files, date_ranges):
                if isinstance(begin_time, pd.Timestamp) and isinstance(end_time, pd.Timestamp):
                    # The helper's result goes through fname_to_code once more
                    # before upper-casing; kept as-is so stored keys do not change.
                    symbol = fname_to_code(self.get_symbol_from_file(file_path).lower()).upper()
                    date_map = self._old_instruments.setdefault(symbol, dict())
                    date_map[self.INSTRUMENTS_START_FIELD] = self._format_datetime(begin_time)
                    date_map[self.INSTRUMENTS_END_FIELD] = self._format_datetime(end_time)
                # Advance once per file, including files that were skipped.
                p_bar.update()
    instruments_df = pd.DataFrame.from_dict(self._old_instruments, orient="index")
    instruments_df.index.names = [self.symbol_field_name]
    self.save_instruments(instruments_df.reset_index())
    logger.info("end of instruments dump.\n")
def _dump_bin(self, file_or_data: [Path, pd.DataFrame], calendar_list: List[pd.Timestamp]): if not calendar_list: logger.warning("calendar_list is empty") return if isinstance(file_or_data, pd.DataFrame): if file_or_data.empty: return code = fname_to_code( str(file_or_data.iloc[0][self.symbol_field_name]).lower()) df = file_or_data elif isinstance(file_or_data, Path): code = self.get_symbol_from_file(file_or_data) df = self._get_source_data(file_or_data) else: raise ValueError(f"not support {type(file_or_data)}") if df is None or df.empty: logger.warning(f"{code} data is None or empty") return # try to remove dup rows or it will cause exception when reindex. df = df.drop_duplicates(self.date_field_name) # features save dir features_dir = self._features_dir.joinpath(code_to_fname(code).lower()) features_dir.mkdir(parents=True, exist_ok=True) self._data_to_bin(df, calendar_list, features_dir)
def _dump_features(self):
    """Submit one ``_dump_bin`` job per symbol group and collect failures.

    ``self._all_data`` is grouped by ``self.symbol_field_name``. Groups whose
    begin/end dates (from ``self._get_date``) are not both ``pd.Timestamp``
    are skipped. Symbols already in ``self._update_instruments`` get their end
    date refreshed and are dumped against ``self._update_calendars``; other
    symbols get a new start/end entry and are dumped against
    ``self._new_calendar_list``. Tracebacks of failed jobs are gathered per
    symbol and logged once all jobs have finished.
    """
    logger.info("start dump features......")
    failures = {}
    with ProcessPoolExecutor(max_workers=self.works) as executor:
        pending = {}
        for raw_code, frame in self._all_data.groupby(self.symbol_field_name):
            symbol = fname_to_code(str(raw_code).lower()).upper()
            first_dt, last_dt = self._get_date(frame, is_begin_end=True)
            both_valid = isinstance(first_dt, pd.Timestamp) and isinstance(last_dt, pd.Timestamp)
            if not both_valid:
                continue
            if symbol in self._update_instruments:
                # known instrument: only its end date moves
                self._update_instruments[symbol][self.INSTRUMENTS_END_FIELD] = self._format_datetime(last_dt)
                calendars = self._update_calendars
            else:
                # new stock
                date_range = self._update_instruments.setdefault(symbol, dict())
                date_range[self.INSTRUMENTS_START_FIELD] = self._format_datetime(first_dt)
                date_range[self.INSTRUMENTS_END_FIELD] = self._format_datetime(last_dt)
                calendars = self._new_calendar_list
            job = executor.submit(self._dump_bin, frame, calendars)
            pending[job] = symbol

        with tqdm(total=len(pending)) as p_bar:
            for finished in as_completed(pending):
                try:
                    finished.result()
                except Exception:
                    # keep going; record the traceback under the symbol that failed
                    failures[pending[finished]] = traceback.format_exc()
                p_bar.update()
        logger.info(f"dump bin errors: {failures}")

    logger.info("end of features dump.\n")
def save_instruments(self, instruments_data: Union[list, pd.DataFrame]):
    """Write instruments to ``self.INSTRUMENTS_FILE_NAME`` in the instruments dir.

    The instruments directory is created if it does not exist.

    Parameters
    ----------
    instruments_data
        A ``pd.DataFrame`` containing the symbol, start and end columns
        (written header-less, separated by ``self.INSTRUMENTS_SEP``, with the
        symbol column normalised through ``fname_to_code`` and upper-cased),
        or any other value accepted by ``np.savetxt`` with ``fmt="%s"``
        (written as-is).
    """
    self._instruments_dir.mkdir(parents=True, exist_ok=True)
    instruments_path = str(self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME).resolve())

    if not isinstance(instruments_data, pd.DataFrame):
        np.savetxt(instruments_path, instruments_data, fmt="%s", encoding="utf-8")
        return

    _df_fields = [self.symbol_field_name, self.INSTRUMENTS_START_FIELD, self.INSTRUMENTS_END_FIELD]
    instruments_data = instruments_data.loc[:, _df_fields]
    # Coerce to str before lower-casing, as the other symbol normalisations in
    # this class do, so non-string symbols (e.g. numeric codes) do not raise.
    instruments_data[self.symbol_field_name] = instruments_data[self.symbol_field_name].apply(
        lambda x: fname_to_code(str(x).lower()).upper()
    )
    instruments_data.to_csv(instruments_path, header=False, sep=self.INSTRUMENTS_SEP, index=False)
def get_symbol_from_file(self, file_path: Path) -> str:
    """Return the symbol encoded in ``file_path``'s file name.

    The last ``len(self.file_suffix)`` characters of the file name are
    dropped, the remainder is stripped of surrounding whitespace and
    lower-cased, and the result is passed through ``fname_to_code``.
    """
    file_name = file_path.name
    suffix_len = len(self.file_suffix)
    # name[:-0] is name[:0] == "", so an empty suffix must leave the name intact.
    stem = file_name[:-suffix_len] if suffix_len else file_name
    return fname_to_code(stem.strip().lower())
def symbol_to_yahoo(self, symbol):
    """Return ``symbol`` passed through ``fname_to_code``.

    NOTE(review): the name suggests the result is a Yahoo-style ticker; the
    actual transformation is whatever ``fname_to_code`` does — confirm there.
    """
    yahoo_symbol = fname_to_code(symbol)
    return yahoo_symbol