def test_get_china_stock_list():
    """Exercise technical.get_security_list for China A-share stocks.

    Covers exchange filtering, inclusive code ranges, explicit code lists
    and the Elasticsearch-backed mode ('es').
    """
    print(settings.FOOLTRADER_STORE_PATH)

    # Both exchanges: a Shenzhen code is present with its sina industry.
    listing = technical.get_security_list('stock', exchanges=['sh', 'sz'])
    assert '000001' in listing.index
    assert listing.loc['000001', 'sinaIndustry'] == '金融行业'

    # Shanghai only.
    listing = technical.get_security_list('stock', exchanges=['sh'])
    assert '600000' in listing.index
    assert listing.loc['600000', 'sinaIndustry'] == '金融行业'

    # Code range filter is inclusive at both ends.
    listing = technical.get_security_list('stock', exchanges=['sh', 'sz'],
                                          start_code='000338', end_code='600388')
    assert '000338' in listing.index
    assert '600388' in listing.index
    assert '600389' not in listing.index

    # Explicit code list restricts the result exactly.
    listing = technical.get_security_list('stock', exchanges=['sh', 'sz'],
                                          codes=['300027', '000002'])
    assert len(listing.index) == 2

    # ES mode returns multi-valued fields as plain lists.
    listing = technical.get_security_list('stock', exchanges=['sh', 'sz'], mode='es')
    assert type(listing.loc['600004', 'sinaArea']) == list
    assert '广州' in (listing.loc['600004', 'sinaArea'])
    assert '广东' in (listing.loc['600004', 'sinaArea'])
def init_env():
    """Initialize the on-disk data layout under FOOLTRADER_STORE_PATH.

    Creates per-stock directories, index k-data directories, and future
    exchange cache/data directories. Prints guidance when the store path
    does not exist.
    """
    if not os.path.exists(FOOLTRADER_STORE_PATH):
        # BUG FIX: the original printed the literal "{} is a wrong path"
        # without substituting the actual path.
        print("{} is a wrong path".format(FOOLTRADER_STORE_PATH))
        print("please set env FOOLTRADER_STORE_PATH to working path or set it in settings.py")
    else:
        # Initialize the stock directories.
        for _, item in get_security_list(exchanges=EXCHANGE_LIST_COL).iterrows():
            mkdir_for_stock(item)

        # Initialize the index directories.
        for _, item in get_security_list(security_type='index',
                                         exchanges=['sh', 'sz', 'nasdaq']).iterrows():
            kdata_dir = get_kdata_dir(item)
            if not os.path.exists(kdata_dir):
                os.makedirs(kdata_dir)

        # Initialize the future-exchange directories.
        for exchange in ['shfe', 'dce', 'zce']:
            exchange_cache_dir = get_exchange_cache_dir(security_type='future',
                                                        exchange=exchange)
            if not os.path.exists(exchange_cache_dir):
                os.makedirs(exchange_cache_dir)

            # NOTE(review): this always targets 'shfe' even inside the loop —
            # looks intentional (only shfe has yearly day_kdata cache); confirm.
            exchange_cache_dir = get_exchange_cache_dir(security_type='future',
                                                        exchange='shfe',
                                                        the_year=datetime.datetime.today().year,
                                                        data_type="day_kdata")
            if not os.path.exists(exchange_cache_dir):
                os.makedirs(exchange_cache_dir)

            exchange_dir = get_exchange_dir(security_type='future', exchange=exchange)
            if not os.path.exists(exchange_dir):
                os.makedirs(exchange_dir)
def crawl_index_quote():
    """Crawl daily k-data for every index, then backfill market-summary fields.

    For each index security, downloads any missing day k-data from 163, and
    for the major market indices re-crawls summary data (turnover rate,
    total/market cap, PE) on dates where those columns are missing.
    """
    for _, security_item in get_security_list(security_type='index').iterrows():
        # Crawl daily k-data.
        logger.info("{} get index kdata start".format(security_item['code']))
        start_date, _ = get_latest_download_trading_date(security_item, source='163')
        end_date = pd.Timestamp.today()
        if start_date > end_date:
            # Already up to date — nothing to download.
            logger.info("{} kdata is ok".format(security_item['code']))
        else:
            process_crawl(StockKdata163Spider, {"security_item": security_item,
                                                "start_date": start_date,
                                                "end_date": end_date})
        logger.info("{} get index kdata from 163 end".format(security_item['code']))

        # Fetch market summary data for [Shanghai, Shenzhen, SME board, ChiNext].
        if security_item['id'] in ['index_sh_000001', 'index_sz_399106', 'index_sz_399005',
                                   'index_sz_399006']:
            # if security_item['id'] in ['index_sz_399106', 'index_sz_399005', 'index_sz_399006']:
            df = get_kdata(security_item=security_item)
            # Keep only rows where any summary column is still missing.
            df = df[df['turnoverRate'].isna() | df['tCap'].isna() | df['mCap'].isna() | df[
                'pe'].isna()]
            if not df.empty:
                dates = df.index.strftime('%Y-%m-%d').tolist()
                # if security_item['id'] == 'index_sz_399106':
                #     dates = [the_date for the_date in dates if
                #              pd.Timestamp(the_date).date().year >= 2018]
                if dates:
                    process_crawl(StockSummarySpider, {"security_item": security_item,
                                                       "the_dates": dates})
def legacy_kdata_to_csv():
    """Convert legacy per-period JSON k-data files into CSV files.

    Scans the old-layout k-data directory of every security, for both the
    adjusted (fuquan) and unadjusted variants, and rewrites each JSON file
    as a CSV at the new-layout path. Already-converted files are skipped.
    """
    for index, security_item in get_security_list().iterrows():
        for fuquan in (True, False):
            # Renamed from `dir` to avoid shadowing the builtin.
            legacy_dir = get_kdata_dir_old(security_item, fuquan)
            if not os.path.exists(legacy_dir):
                continue

            json_files = [os.path.join(legacy_dir, f) for f in os.listdir(legacy_dir)
                          if ('all' not in f and 'json' in f
                              and os.path.isfile(os.path.join(legacy_dir, f)))]
            for json_file in json_files:
                # Legacy names encode the period as "<part0>_<part1>...".
                tmp = os.path.basename(json_file).split('_')
                if fuquan:
                    csv_path = get_kdata_path(security_item, tmp[0], tmp[1], 'hfq')
                    if not os.path.exists(csv_path):
                        df = pd.read_json(json_file, dtype={'code': str})
                        logger.info("{} to {}".format(json_file, csv_path))
                        df = df.loc[:, ['timestamp', 'code', 'low', 'open', 'close', 'high',
                                        'volume', 'turnover', 'securityId', 'fuquan']]
                        df.columns = KDATA_COLUMN_SINA_FQ
                        df.to_csv(csv_path, index=False)
                else:
                    csv_path = get_kdata_path(security_item, tmp[0], tmp[1], 'bfq')
                    if not os.path.exists(csv_path):
                        df = pd.read_json(json_file, dtype={'code': str})
                        logger.info("{} to {}".format(json_file, csv_path))
                        df = df.loc[:, KDATA_COLUMN_SINA]
                        df.to_csv(csv_path, index=False)
def kdata_to_es(security_type='stock', start_code=None, end_code=None, force=False):
    """Index k-data of the given security type into Elasticsearch.

    :param security_type: 'stock', 'index' or 'cryptocurrency'
    :param start_code: inclusive lower bound on security code (optional)
    :param end_code: inclusive upper bound on security code (optional)
    :param force: passed through to df_to_es to force re-indexing
    :raises ValueError: for an unsupported security_type
    """
    codes = None
    if security_type == 'stock':
        doc_type = StockKData
    elif security_type == 'index':
        doc_type = IndexKData
    elif security_type == 'cryptocurrency':
        doc_type = CryptoCurrencyKData
        codes = CRYPTOCURRENCY_CODE
    else:
        # Previously fell through and crashed later with NameError on doc_type.
        raise ValueError("unsupported security_type: {}".format(security_type))

    for _, security_item in get_security_list(security_type=security_type,
                                              start_code=start_code,
                                              end_code=end_code,
                                              codes=codes).iterrows():
        index_name = get_es_kdata_index(security_item['type'], security_item['exchange'])
        df = get_kdata(security_item, generate_id=True)
        df_to_es(df, doc_type=doc_type, index_name=index_name,
                 security_item=security_item, force=force)
def kdata_to_es(security_type='stock', start_code=None, end_code=None, force=False):
    """Index k-data into Elasticsearch, optionally incrementally.

    When force is False, a term query on the security id is passed to
    df_to_es so only that security's existing documents are considered.

    :raises ValueError: for an unsupported security_type
    """
    if security_type == 'stock':
        doc_type = StockKData
    elif security_type == 'index':
        doc_type = IndexKData
    elif security_type == 'cryptocurrency':
        doc_type = CryptoCurrencyKData
    else:
        # Previously fell through and crashed later with NameError on doc_type.
        raise ValueError("unsupported security_type: {}".format(security_type))

    for _, security_item in get_security_list(security_type=security_type,
                                              start_code=start_code,
                                              end_code=end_code).iterrows():
        index_name = get_es_kdata_index(security_item['type'], security_item['exchange'])

        query = None
        if not force:
            query = {"term": {"securityId": ""}}
            query["term"]["securityId"] = security_item["id"]

        df = get_kdata(security_item, generate_id=True)
        df_to_es(df, doc_type=doc_type, index_name=index_name, query=query, force=force)
def start_requests(self):
    """Emit one download request per security's finance CSV.

    If a single security_item is configured in the spider settings, only
    that security is fetched; otherwise every NASDAQ security is fetched.
    """
    configured = self.settings.get("security_item")
    if configured is not None:
        targets = [configured]
    else:
        targets = [item for _, item in get_security_list(exchanges=['nasdaq']).iterrows()]

    for item in targets:
        data_url = self.get_finance_url(item['code'])
        data_path = get_finance_path(item)
        yield Request(url=data_url,
                      meta={'path': data_path, 'item': item},
                      callback=self.download_finance_csv)
def remove_old_kdata():
    """Delete legacy-layout k-data directories.

    Only the fuquan (adjusted) variant is removed — this mirrors the
    original behavior; the unadjusted directory is left untouched.
    """
    for index, security_item in get_security_list().iterrows():
        for fuquan in (True, False):
            # Renamed from `dir` to avoid shadowing the builtin.
            old_dir = get_kdata_dir_old(security_item, fuquan)
            if os.path.exists(old_dir):
                # NOTE(review): only the fuquan==True directory is removed —
                # presumably intentional; confirm before widening.
                if fuquan:
                    logger.info("remove {}".format(old_dir))
                    shutil.rmtree(old_dir)
def finance_report_event_to_csv():
    """Normalize finance-report event CSVs to the standard column layout."""
    for index, security_item in get_security_list().iterrows():
        the_path = get_finance_report_event_path(security_item)
        if not os.path.exists(the_path):
            continue
        events = pd.read_csv(the_path)
        events = events.rename(columns={'reportEventDate': 'timestamp',
                                        'reportDate': 'reportPeriod'})
        events = events.loc[:, EVENT_STOCK_FINANCE_REPORT_COL]
        events.to_csv(get_finance_report_event_path(security_item), index=False)
        logger.info("transform {} report event".format(security_item['code']))
def start_requests(self):
    """Yield one forecast-data request per listed security."""
    for _, item in get_security_list().iterrows():
        forecast_url = self.get_forecast_url(item['code'])
        yield Request(url=forecast_url,
                      headers=DEFAULT_KDATA_HEADER,
                      meta={'item': item},
                      callback=self.download_forecast_data)
def remove_old_tick():
    """Delete downloaded .xls tick files, keeping lock/error marker files."""
    for index, security_item in get_security_list().iterrows():
        # Renamed from `dir` to avoid shadowing the builtin.
        tick_dir = get_tick_dir(security_item)
        if os.path.exists(tick_dir):
            files = [os.path.join(tick_dir, f) for f in os.listdir(tick_dir)
                     if ('xls' in f and 'lock' not in f and 'error' not in f
                         and os.path.isfile(os.path.join(tick_dir, f)))]
            for f in files:
                logger.info("remove {}".format(f))
                os.remove(f)
def start_requests(self):
    """Yield requests for one configured security, or for every security."""
    configured = self.settings.get("security_item")
    if configured is not None:
        yield from self.yield_request(configured)
    else:
        for _, item in get_security_list().iterrows():
            yield from self.yield_request(item)
def forecast_event_to_csv():
    """Convert legacy forecast-event JSON files to CSV, then remove the JSON."""
    for index, security_item in get_security_list().iterrows():
        the_path = get_forecast_event_path(security_item)
        if not os.path.exists(the_path):
            continue
        events = pd.read_json(get_forecast_event_path(security_item))
        events = events.rename(columns={'reportDate': 'timestamp'})
        events = events.loc[:, EVENT_STOCK_FINANCE_FORECAST_COL]
        events.to_csv(get_finance_forecast_event_path(security_item), index=False)
        logger.info("transform {} forecast event".format(security_item['code']))
        os.remove(the_path)
def _crawl_sheet_if_needed(security_item, report_type, path_func, items_func,
                           current_report_period):
    """Crawl one finance sheet when its file is missing or out of date."""
    path = path_func(security_item)
    if not os.path.exists(path):
        process_crawl(StockFinanceSpider, {"security_item": security_item,
                                           "report_type": report_type})
    else:
        current_items = items_func(security_item)
        # The current report period has not been crawled yet.
        if current_report_period != current_items[-1]['reportPeriod']:
            process_crawl(StockFinanceSpider, {"security_item": security_item,
                                               "report_type": report_type})


def crawl_finance_data(start_code=STOCK_START_CODE, end_code=STOCK_END_CODE):
    """Crawl finance report events and the three finance sheets per security.

    For each security in [start_code, end_code]: crawl report events first
    (later steps depend on them), then crawl the balance sheet, income
    statement and cash-flow statement when missing or stale. Failures for
    one security are logged and do not stop the loop.
    """
    for _, security_item in get_security_list(start_code=start_code,
                                              end_code=end_code).iterrows():
        try:
            # Crawl events first; subsequent crawls depend on them.
            process_crawl(StockFinanceReportEventSpider, {"security_item": security_item})

            current_report_period = get_report_period()

            # The three sheets follow the same "missing or stale" rule.
            _crawl_sheet_if_needed(security_item, "balance_sheet",
                                   get_balance_sheet_path, get_balance_sheet_items,
                                   current_report_period)
            _crawl_sheet_if_needed(security_item, "income_statement",
                                   get_income_statement_path, get_income_statement_items,
                                   current_report_period)
            _crawl_sheet_if_needed(security_item, "cash_flow",
                                   get_cash_flow_statement_path, get_cash_flow_statement_items,
                                   current_report_period)
        except Exception as e:
            logger.exception(e)
def usa_stock_finance_to_es(force=False):
    """Index finance summaries of the selected US (NASDAQ) stocks into ES."""
    us_stocks = get_security_list(security_type='stock', exchanges=['nasdaq'],
                                  codes=US_STOCK_CODES)
    for _, security_item in us_stocks.iterrows():
        summary_df = get_finance_summary_items(security_item)
        # NOTE: 'timestamp_filed' (sic) is the keyword df_to_es expects.
        df_to_es(summary_df, doc_type=FinanceSummary, timestamp_filed='reportPeriod',
                 security_item=security_item, force=force)
def security_meta_to_es(security_type='stock'):
    """Index the security list (meta data) of one security type into ES.

    :param security_type: 'stock', 'cryptocurrency' or 'index'
    :raises ValueError: for an unsupported security_type
    """
    if security_type == 'stock':
        doc_type = StockMeta
    elif security_type == 'cryptocurrency':
        doc_type = CryptocurrencyMeta
    elif security_type == 'index':
        doc_type = IndexMeta
    else:
        # Previously fell through to a NameError on doc_type.
        raise ValueError("unsupported security_type: {}".format(security_type))

    df = get_security_list(security_type=security_type)
    df_to_es(df, doc_type, force=True)
def start_requests(self):
    """Yield requests for one configured security, or for the full code range."""
    configured = self.settings.get("security_item")
    if configured is not None:
        yield from self.yield_request(configured)
    else:
        for _, item in get_security_list(start_code=STOCK_START_CODE,
                                         end_code=STOCK_END_CODE).iterrows():
            yield from self.yield_request(item)
def start_requests(self):
    """Start the sina category crawl for the configured category type.

    Clears any previously stored category values on the cached SH/SZ
    security lists, then requests the category page matching
    self.category_type (industry / concept / area). Unknown types yield
    nothing.
    """
    self.category_type = self.settings.get("category_type")
    self.sh_df = get_security_list(exchanges=['sh'])
    self.sz_df = get_security_list(exchanges=['sz'])
    self.file_lock = threading.RLock()

    # Clear the old category data before re-crawling.
    self.sh_df[self.category_type] = None
    self.sz_df[self.category_type] = None

    category_urls = {
        'sinaIndustry': 'http://vip.stock.finance.sina.com.cn/q/view/newSinaHy.php',
        'sinaConcept': 'http://money.finance.sina.com.cn/q/view/newFLJK.php?param=class',
        'sinaArea': 'http://money.finance.sina.com.cn/q/view/newFLJK.php?param=area',
    }
    url = category_urls.get(self.category_type)
    if url is None:
        return
    yield Request(url=url, callback=self.download_sina_category)
def check_result():
    """Log the directory of any security with missing day-k CSVs or ticks."""
    for index, security_item in get_security_list().iterrows():
        for fuquan in ('bfq', 'hfq'):
            dayk_path = get_kdata_path(security_item, fuquan=fuquan)
            if not os.path.exists(dayk_path):
                # logger.warn is a deprecated alias of logger.warning
                logger.warning(get_security_dir(security_item))

        # Renamed from `dir` to avoid shadowing the builtin.
        tick_dir = get_tick_dir(security_item)
        if os.path.exists(tick_dir):
            files = [os.path.join(tick_dir, f) for f in os.listdir(tick_dir)
                     if ('csv' in f and os.path.isfile(os.path.join(tick_dir, f)))]
            if not files:
                logger.warning(get_security_dir(security_item))
def merge_kdata_to_one(security_item=None, replace=False, fuquan='bfq'):
    """Merge per-period k-data CSV fragments into one day-k CSV per security.

    :param security_item: a single security to merge, or None for all
    :param replace: True writes the merged frame directly; False merges it
                    into the current day-k file via the sina spider helper
    :param fuquan: 'bfq'/'hfq' for one variant, falsy for both
    """
    # BUG FIX: the original tested `type(security_item) != 'NoneType'`,
    # comparing a type object against a string — always True, so the
    # all-securities branch was unreachable.
    if security_item is not None:
        items = pd.DataFrame().append(security_item).iterrows()
    else:
        items = get_security_list().iterrows()

    if fuquan:
        fuquans = [fuquan]
    else:
        fuquans = ['bfq', 'hfq']

    for index, security_item in items:
        for fuquan in fuquans:
            dayk_path = get_kdata_path(security_item, source='sina', fuquan=fuquan)

            if fuquan == 'hfq':
                df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_SINA_FQ)
            else:
                df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_SINA)

            the_dir = get_kdata_dir(security_item, fuquan=fuquan)
            if os.path.exists(the_dir):
                files = [os.path.join(the_dir, f) for f in os.listdir(the_dir)
                         if ('dayk.csv' not in f
                             and os.path.isfile(os.path.join(the_dir, f)))]
                for f in files:
                    df = df.append(pd.read_csv(f, dtype=str), ignore_index=True)

                if df.size > 0:
                    df = df.set_index(df['timestamp'])
                    df.index = pd.to_datetime(df.index)
                    df = df.sort_index()
                    logger.info("{} to {}".format(security_item['code'], dayk_path))
                    if replace:
                        df.to_csv(dayk_path, index=False)
                    else:
                        StockKDataSinaSpider.merge_to_current_kdata(
                            security_item, df, fuquan=fuquan)

                # Remove the merged fragments.
                for f in files:
                    logger.info("remove {}".format(f))
                    os.remove(f)

                if fuquan == 'hfq':
                    StockKDataSinaSpider.add_factor_to_163(security_item)
def start_requests(self):
    """Start crawling in one of two modes.

    1) security_item / trading_dates not configured: full download.
    2) configured: targeted repair of the given security and dates.
    """
    configured = self.settings.get("security_item")
    trading_dates = self.settings.get("trading_dates")
    fuquan = self.settings.get("fuquan")

    if configured is not None:
        yield from self.yield_request(configured, trading_dates, fuquan)
    else:
        for _, item in get_security_list(start_code=STOCK_START_CODE,
                                         end_code=STOCK_END_CODE).iterrows():
            yield from self.yield_request(item)
def handle_error_tick():
    """Retry converting failed/error tick downloads to CSV.

    Files that fail again are renamed with a ".fatal" suffix so they are
    not retried forever.
    """
    for index, security_item in get_security_list().iterrows():
        # Renamed from `dir` to avoid shadowing the builtin.
        tick_dir = get_tick_dir(security_item)
        if os.path.exists(tick_dir):
            files = [os.path.join(tick_dir, f) for f in os.listdir(tick_dir)
                     if (('fatal' in f or 'error' in f)
                         and os.path.isfile(os.path.join(tick_dir, f)))]
            for f in files:
                try:
                    the_date = get_file_name(f)
                    csv_path = get_tick_path(security_item, the_date)
                    if not os.path.exists(csv_path):
                        logger.info("{} to {}".format(f, csv_path))
                        sina_tick_to_csv(security_item, f, the_date)
                except Exception as e:
                    # logger.warn is a deprecated alias of logger.warning
                    logger.warning(e)
                    os.rename(f, f + ".fatal")
def usa_stock_finance_to_es(force=False):
    """Index US (NASDAQ) finance summaries into ES, optionally incrementally."""
    for _, security_item in get_security_list(security_type='stock',
                                              exchanges=['nasdaq'],
                                              codes=US_STOCK_CODES).iterrows():
        # When not forcing, scope the ES query to this security's documents.
        query = {"term": {"securityId": security_item["id"]}} if not force else None

        summary_df = get_finance_summary_items(security_item)
        df_to_es(summary_df, doc_type=FinanceSummary, timestamp_filed='reportPeriod',
                 query=query, force=force)
def finance_sheet_to_es(sheet_type=None, start_code=None, end_code=None, force=False):
    """Index finance sheets into ES.

    :param sheet_type: one of 'balance_sheet', 'income_statement',
                       'cash_flow_statement', or None for all three
    :raises ValueError: for an unknown sheet_type
    """
    # Doc types and item getters per sheet type (order preserved).
    sheet_doc_types = {
        'balance_sheet': BalanceSheet,
        'income_statement': IncomeStatement,
        'cash_flow_statement': CashFlowStatement,
    }
    sheet_item_getters = {
        'balance_sheet': get_balance_sheet_items,
        'income_statement': get_income_statement_items,
        'cash_flow_statement': get_cash_flow_statement_items,
    }

    if sheet_type is None:
        sheet_types = list(sheet_doc_types)
    elif sheet_type in sheet_doc_types:
        sheet_types = [sheet_type]
    else:
        # Previously fell through to a NameError on doc_type/items.
        raise ValueError("unsupported sheet_type: {}".format(sheet_type))

    for sheet_type in sheet_types:
        doc_type = sheet_doc_types[sheet_type]
        es_index_mapping(sheet_type, doc_type)

        for _, security_item in get_security_list(start_code=start_code,
                                                  end_code=end_code).iterrows():
            try:
                items = sheet_item_getters[sheet_type](security_item)
                df = pd.DataFrame(items)
                df = index_df_with_time(df, index='reportPeriod')
                df_to_es(df, doc_type=doc_type, timestamp_filed='reportPeriod',
                         security_item=security_item, force=force)
            except Exception:
                # BUG FIX: the original passed `e` as a %-format arg to a
                # message with no placeholder; logger.exception already
                # records the traceback.
                logger.exception(
                    "index {} {} failed".format(security_item['code'], sheet_type))
def crawl_stock_quote(start_code=STOCK_START_CODE, end_code=STOCK_END_CODE, crawl_tick=True):
    """Crawl stock quotes for the code range [start_code, end_code].

    Per security: download missing daily k-data from 163, then backfill any
    trading dates missing from the sina data (both bfq and hfq variants).
    The tick-crawl branch is deliberately disabled (sina's tick service is
    no longer available).
    """
    # Crawl stock k-data.
    for _, security_item in get_security_list(start_code=start_code, end_code=end_code).iterrows():
        # Crawl daily k-data.
        logger.info("{} get stock kdata start".format(security_item['code']))
        start_date, _ = get_latest_download_trading_date(security_item, source='163')
        end_date = pd.Timestamp.today()
        if start_date > end_date:
            # Already up to date — nothing to download.
            logger.info("{} stock kdata is ok".format(security_item['code']))
        else:
            process_crawl(StockKdata163Spider, {"security_item": security_item,
                                                "start_date": start_date,
                                                "end_date": end_date})
        logger.info("{} get stock kdata from 163 end".format(security_item['code']))

        # Backfill sina data on dates 163 has but sina does not.
        base_dates = set(get_trading_dates(security_item, source='163'))
        for fuquan in ('bfq', 'hfq'):
            sina_dates = set(get_trading_dates(security_item, source='sina', fuquan=fuquan))
            diff_dates = base_dates - sina_dates
            if diff_dates:
                logger.info("{} get {} kdata from sina start".format(security_item['code'],
                                                                     fuquan))
                process_crawl(StockKDataSinaSpider, {"security_item": security_item,
                                                     "trading_dates": diff_dates,
                                                     "fuquan": fuquan})
                logger.info("{} get {} kdata from sina end".format(security_item['code'],
                                                                   fuquan))
            else:
                logger.info("{} {} kdata from sina is ok".format(security_item['code'],
                                                                 fuquan))

        # Crawl ticks.
        # FIXME: the sina tick service is no longer available — the
        # "and False" intentionally disables this whole branch.
        if crawl_tick and False:
            tick_dates = {x for x in base_dates if x >= settings.START_TICK_DATE}
            diff_dates = tick_dates - set(get_available_tick_dates(security_item))
            if diff_dates:
                logger.info("{} get tick start".format(security_item['code']))
                process_crawl(StockTickSpider, {"security_item": security_item,
                                                "trading_dates": diff_dates})
                logger.info("{} get tick end".format(security_item['code']))
            else:
                logger.info("{} tick is ok".format(security_item['code']))
def finance_event_to_es(event_type='finance_forecast', start_code=None, end_code=None,
                        force=False):
    """Index finance forecast/report events into ES.

    :param event_type: 'finance_forecast' or 'finance_report'
    :raises ValueError: for an unsupported event_type
    """
    if event_type == 'finance_forecast':
        doc_type = FinanceForecastEvent
    elif event_type == 'finance_report':
        doc_type = FinanceReportEvent
    else:
        # Previously fell through to a NameError on doc_type.
        raise ValueError("unsupported event_type: {}".format(event_type))

    for _, security_item in get_security_list(start_code=start_code,
                                              end_code=end_code).iterrows():
        if event_type == 'finance_forecast':
            df = get_finance_forecast_event(security_item)
        else:
            df = get_finance_report_event(security_item)
        df_to_es(df, doc_type=doc_type, security_item=security_item, force=force)
def restore_kdata():
    """Deduplicate and normalize timestamps in stored k-data CSVs.

    Covers the 163 (bfq) file and both sina variants for securities in
    ['600000', '600017'].
    """
    for index, security_item in get_security_list(start_code='600000',
                                                  end_code='600017').iterrows():
        path_163 = get_kdata_path(security_item, source='163', fuquan='bfq')
        df = pd.read_csv(path_163, dtype=str)
        df = time_index_df(df)
        if 'id' in df.columns:
            df = df.drop(['id'], axis=1)
        # Drop duplicated timestamps, keeping the first occurrence.
        df = df[~df.index.duplicated(keep='first')]
        # BUG FIX: the original discarded the result of this apply, leaving
        # 163 timestamps un-normalized (the sina branch below assigns it).
        df.timestamp = df.timestamp.apply(lambda x: to_time_str(x))
        df.to_csv(path_163, index=False)

        for fuquan in ('hfq', 'bfq'):
            path_sina = get_kdata_path(security_item, source='sina', fuquan=fuquan)
            df = pd.read_csv(path_sina, dtype=str)
            df = time_index_df(df)
            if 'id' in df.columns:
                df = df.drop(['id'], axis=1)
            df = df[~df.index.duplicated(keep='first')]
            df.timestamp = df.timestamp.apply(lambda x: to_time_str(x))
            df.to_csv(path_sina, index=False)
def finance_event_to_es(event_type='finance_forecast', start_code=None, end_code=None,
                        force=False):
    """Index finance forecast/report events into ES, optionally incrementally.

    When force is False, a term query on the security id scopes the ES
    operation to that security's existing documents.

    :raises ValueError: for an unsupported event_type
    """
    if event_type == 'finance_forecast':
        doc_type = FinanceForecastEvent
    elif event_type == 'finance_report':
        doc_type = FinanceReportEvent
    else:
        # Previously fell through to a NameError on doc_type.
        raise ValueError("unsupported event_type: {}".format(event_type))

    for _, security_item in get_security_list(start_code=start_code,
                                              end_code=end_code).iterrows():
        query = None
        if not force:
            query = {"term": {"securityId": ""}}
            query["term"]["securityId"] = security_item["id"]

        if event_type == 'finance_forecast':
            df = get_finance_forecast_event(security_item)
        else:
            df = get_finance_report_event(security_item)

        df_to_es(df, doc_type=doc_type, query=query, force=force)
def check_convert_result():
    """Verify each merged day-k CSV equals the concatenation of its fragments."""
    for index, security_item in get_security_list().iterrows():
        for fuquan in ('bfq', 'hfq'):
            dayk_path = get_kdata_path(security_item, fuquan=fuquan)
            if os.path.exists(dayk_path):
                df_result = pd.read_csv(dayk_path)

                if fuquan == 'hfq':
                    df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_SINA_FQ)
                else:
                    df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_SINA)

                # Renamed from `dir` to avoid shadowing the builtin.
                kdata_dir = get_kdata_dir(security_item, fuquan=fuquan)
                if os.path.exists(kdata_dir):
                    files = [os.path.join(kdata_dir, f) for f in os.listdir(kdata_dir)
                             if ('day' not in f and 'csv' in f
                                 and os.path.isfile(os.path.join(kdata_dir, f)))]
                    for f in files:
                        df = df.append(pd.read_csv(f), ignore_index=True)
                    assert_df(df, df_result)
                    logger.info("{} merge as one ok".format(security_item['code']))
def finance_sheet_to_es(sheet_type='balance_sheet', start_code=None, end_code=None,
                        force=False):
    """Index one finance sheet type into ES, optionally incrementally.

    :param sheet_type: 'balance_sheet', 'income_statement' or
                       'cash_flow_statement'
    :raises ValueError: for an unsupported sheet_type
    """
    if sheet_type == 'balance_sheet':
        doc_type = BalanceSheet
    elif sheet_type == 'income_statement':
        doc_type = IncomeStatement
    elif sheet_type == 'cash_flow_statement':
        doc_type = CashFlowStatement
    else:
        # Previously fell through to a NameError on doc_type/items.
        raise ValueError("unsupported sheet_type: {}".format(sheet_type))

    es_index_mapping(sheet_type, doc_type)

    for _, security_item in get_security_list(start_code=start_code,
                                              end_code=end_code).iterrows():
        query = None
        if not force:
            query = {"term": {"securityId": ""}}
            query["term"]["securityId"] = security_item["id"]

        if sheet_type == 'balance_sheet':
            items = get_balance_sheet_items(security_item)
        elif sheet_type == 'income_statement':
            items = get_income_statement_items(security_item)
        else:
            items = get_cash_flow_statement_items(security_item)

        df = pd.DataFrame(items)
        df = index_df_with_time(df, index='reportPeriod')
        df_to_es(df, doc_type=doc_type, timestamp_filed='reportPeriod',
                 query=query, force=force)