def remove_temp_files():
    """Delete temporary log/cache/download directories, then recreate them.

    Best-effort cleanup: each directory under the data root is removed and
    ``data_root`` is called again afterwards (which, presumably, recreates
    the directory when missing -- TODO confirm against ``cnswd.utils``).
    """
    # NOTE(review): 'geckordriver' looks like a typo for 'geckodriver', but
    # renaming it would change which directory gets purged -- confirm first.
    dirs = ['geckordriver', 'webcache', 'download']
    for d in dirs:
        path = data_root(d)
        try:
            shutil.rmtree(path)
        except FileNotFoundError:
            # Directory already absent -- nothing to remove.
            pass
        except PermissionError:
            # Probably in use by a background process; skip it (best effort).
            pass
        # Recreate the (now empty) directory.
        data_root(d)
def bcolz_table_path(table_name):
    """Return the on-disk bcolz path for *table_name*, creating the parent dir.

    Parameters
    ----------
    table_name : str
        Logical table name; the file is named ``<table_name>.bcolz``.

    Returns
    -------
    str
        Absolute path ``<data_root('bcolz')>/<table_name>.bcolz``.
    """
    root_dir = data_root('bcolz')
    # exist_ok=True avoids the check-then-create race of the original
    # os.path.exists() + os.makedirs() pair.
    os.makedirs(root_dir, exist_ok=True)
    return os.path.join(root_dir, f'{table_name}.bcolz')
def data_path(code, date_str):
    """Return the pickle path for stock *code* on *date_str*.

    Parameters
    ----------
    code : str
        Stock code; becomes a subdirectory under ``data_root('cjmx')``.
    date_str : str
        Anything ``pd.Timestamp`` accepts; formatted as ``YYYYMMDD``.

    Returns
    -------
    str
        Path ``<data_root('cjmx')>/<code>/<YYYYMMDD>.pkl``.
    """
    name = pd.Timestamp(date_str).strftime(r'%Y%m%d')
    data_dir = os.path.join(data_root('cjmx'), code)
    # makedirs(exist_ok=True) also creates missing parents; the original
    # os.mkdir would fail if 'cjmx' did not exist yet, and the
    # exists()-then-mkdir pair raced with concurrent callers.
    os.makedirs(data_dir, exist_ok=True)
    return os.path.join(data_dir, f'{name}.pkl')
import os
import sys
import pickle

import pandas as pd
from pandas.tseries.offsets import BDay, Week, MonthBegin, QuarterBegin, Hour, Minute, Second
from hashlib import md5
from six import iteritems  # retained for compatibility; no longer used below
import logbook

from cnswd.constants import MARKET_START
from cnswd.utils import data_root

logger = logbook.Logger(__name__)

TEMP_DIR = data_root('webcache')
DEFAULT_TIME_STR = '18:00:00'  # time of day the website publishes new data
DEFAULT_FREQ = 'D'


def hash_args(*args, **kwargs):
    """Define a unique string for any set of representable args.

    Returns the hex MD5 digest of ``"a1_a2_...:k1=v1_k2=v2_..."`` built
    from the ``str()`` of every positional and keyword argument.
    """
    arg_string = '_'.join(str(arg) for arg in args)
    # Python 3: dict.items() replaces six.iteritems. Iteration order is the
    # kwargs insertion order in both cases, so digests are unchanged.
    kwarg_string = '_'.join(f'{key}={value}' for key, value in kwargs.items())
    combined = ':'.join([arg_string, kwarg_string])
    hasher = md5()
    hasher.update(combined.encode('utf-8'))
    return hasher.hexdigest()
EARLIEST_POSSIBLE_DATE = pd.Timestamp('2002-1-4', tz='UTC') DB_COLS_NAME = [ 'm0', 'm1', 'm2', 'm3', 'm6', 'm9', 'y1', 'y3', 'y5', 'y7', 'y10', 'y15', 'y20', 'y30', 'y40', 'y50' ] DB_INDEX_NAME = 'date' OUTPUT_COLS_NAME = [ '0month', '1month', '2month', '3month', '6month', '9month', '1year', '3year', '5year', '7year', '10year', '15year', '20year', '30year', '40year', '50year' ] OUTPUT_INDEX_NAME = 'Time Period' DATA_DIR = data_root('treasury') # 在该目录存储国债利率数据 def read_local_data(): """读取本地文件数据""" dfs = [] for root, _, files in os.walk(DATA_DIR): for name in files: if name.endswith("xlsx"): file_path = os.path.join(root, name) df = pd.read_excel(file_path, index_col='日期', parse_dates=True) dfs.append(df) return pd.concat(dfs) def download_last_year():
def __init__(self, download_path=None):
    """Start a headless browser session against www.sse.com.cn.

    Parameters
    ----------
    download_path : str, optional
        Download directory; defaults to ``data_root('download')``.
        Resolved lazily here rather than in the signature, where the
        original evaluated it once at import time (stale if the data
        root changes, plus a side effect on import).
    """
    if download_path is None:
        download_path = data_root('download')
    # NOTE(review): the original accepted download_path but never stored it;
    # kept on the instance so other methods can use it -- confirm callers.
    self.download_path = download_path
    self.host_url = 'http://www.sse.com.cn'
    logger.notice('初始化无头浏览器......')
    self.driver = make_headless_browser()
    self.wait = WebDriverWait(self.driver, MAX_WAIT_SECOND)
import os  # was missing: os.path.join below raised NameError at import time
import sys
import time

import numpy as np
import pandas as pd

from cnswd.websource.exceptions import RetryException
from cnswd.utils import data_root, loop_period_by
from cnswd.websource.cninfo.constants import DB_NAME, DB_DATE_FREQ, TS_NAME, TS_DATE_FREQ
from cnswd.websource.cninfo.data_browse import DataBrowse
from cnswd.websource.cninfo.thematic_statistics import ThematicStatistics
from cnswd.sql.base import get_engine, get_session

from .base import DB_DATE_FIELD, DB_MODEL_MAPS, TS_DATE_FIELD, TS_MODEL_MAPS
from .units import fixed_data

# CSV tracking crawl progress per task index.
record_path = os.path.join(data_root('record'), 'cninfo.csv')


def get_record(index):
    """Return the progress record for *index* from the local CSV as a dict.

    If the CSV does not exist yet, a default '未执行' (not-executed) row is
    created for *index* and written to disk.
    """
    try:
        return pd.read_csv(record_path, index_col=0).loc[index].to_dict()
    except FileNotFoundError:
        df = pd.DataFrame(
            {
                '完成状态': '未执行',
                '尝试次数': 0,
                '完成时间': pd.Timestamp('now'),
                '备注': ''
            },
            index=[index])
        df.to_csv(record_path)
        # NOTE(review): as visible here this path ends without a return
        # (yields None); the function may continue beyond this chunk --
        # left unchanged rather than guessing at the missing tail.