def download_forecast_data(self, response):
    """
    Parse the forecast table of a response and store it as a CSV file.

    Every row of the ``dataTable`` element after the first one is turned
    into one forecast record. The records are de-duplicated, restricted to
    the columns in ``EVENT_STOCK_FINANCE_FORECAST_COL``, indexed with
    ``index_df_with_time`` and written to the path returned by
    ``get_finance_forecast_event_path``.

    Parameters
    ----------
    response : response object
        must expose ``xpath``, ``url`` and ``meta``; ``meta['item']`` is the
        security item the page belongs to (presumably a scrapy response)

    Returns
    -------
    None
        any exception raised while parsing or saving is logged with its
        traceback and not propagated
    """
    security_item = response.meta['item']
    trs = response.xpath('//*[@id="dataTable"]//tr').extract()

    forecast_jsons = []

    try:
        # Skip the first row (assumed to be the table header).
        for tr in trs[1:]:
            tds = Selector(text=tr).xpath('//td//text()').extract()
            tds = [x.strip() for x in tds if x.strip()]

            # Convert the performance-change string to float. A value of the
            # form "a%~b%" is a range: "a" becomes changeStart, "b" change.
            change_str = tds[7]
            change_start = None
            if '~' in change_str:
                change_start, _, change = change_str.partition('~')
            else:
                change = change_str

            if change:
                change = float(change.strip('%')) / 100
            if change_start:
                change_start = float(change_start.strip('%')) / 100

            # preEPS may be empty or not numeric; keep None in that case.
            try:
                preEPS = float(tds[6])
            except (ValueError, TypeError):
                preEPS = None

            forecast_jsons.append({
                "securityId": security_item['id'],
                "timestamp": tds[3],
                "reportPeriod": tds[4],
                "type": tds[2],
                "description": tds[5],
                "preEPS": preEPS,
                "changeStart": change_start,
                "change": change,
            })

        if forecast_jsons:
            df = pd.DataFrame(forecast_jsons)
            df = df.drop_duplicates()
            # Column selection needs .loc; plain df[:, cols] is not valid
            # DataFrame indexing and always raised, so nothing was saved.
            df = df.loc[:, EVENT_STOCK_FINANCE_FORECAST_COL]
            df = index_df_with_time(df)
            df.to_csv(get_finance_forecast_event_path(security_item), index=False)

    except Exception as e:
        self.logger.exception(
            'error when getting k data url={} error={}'.format(
                response.url, e))
def get_finance_report_event(security_item, index='timestamp'):
    """
    Load the stored finance report events of a security.

    Parameters
    ----------
    security_item : SecurityItem or str
        the security item, its id or its code
    index : {'timestamp','reportPeriod'}
        column passed to ``index_df_with_time`` as the index of the
        returned frame, default is 'timestamp'

    Returns
    -------
    DataFrame
        the events read from the finance report event CSV, or an empty
        frame when that file does not exist
    """
    security_item = to_security_item(security_item)
    csv_path = get_finance_report_event_path(security_item)

    # Nothing stored for this security: hand back an empty frame.
    if not os.path.exists(csv_path):
        return pd.DataFrame()

    events = pd.read_csv(csv_path)
    return index_df_with_time(events, index=index)
def on_tick(self, tick_item):
    """
    Collect the ticks of one day and trade once on the 14:50 money flow.

    Ticks are accumulated in ``self.df``. When a tick whose timestamp
    contains "14:50:" arrives, the signed turnover
    (``turnover * direction``) of the collected ticks is summed: a positive
    sum buys when no position is held, a negative sum sells when a position
    is held. After that evaluation ``self.today_traded`` is set, and
    further ticks are ignored until a tick fails the ``is_same_date`` check
    against ``self.current_time``.

    Parameters
    ----------
    tick_item : mapping
        must provide 'timestamp' (a string), 'securityId' and 'price';
        'turnover' and 'direction' are read from the accumulated frame, so
        presumably every tick carries them too
    """
    # Intraday only, ultra-short-term.
    if not is_same_date(self.current_time, tick_item['timestamp']):
        # Tick is on a different date than self.current_time: re-arm
        # trading and discard the ticks collected so far.
        self.today_traded = False
        if not self.df.empty:
            self.df = pd.DataFrame()
        return

    if self.today_traded:
        return

    if not self.df.empty and self.df.index.size == 1:
        self.df = index_df_with_time(self.df)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    self.df = pd.concat([self.df, pd.DataFrame([tick_item])],
                        ignore_index=True)

    # At 14:50, compute the money flow of the day.
    if "14:50:" not in tick_item['timestamp']:
        return

    money_flow = (self.df['turnover'] * self.df['direction']).sum()
    security_id = tick_item['securityId']

    def adjusted_price():
        # Use the backward-adjusted price: raw price times the kdata factor
        # of the tick's date. Only evaluated when an order is placed.
        factor = get_kdata(
            security_id,
            the_date=pd.Timestamp(tick_item['timestamp']).date())['factor']
        return tick_item['price'] * factor

    if money_flow > 0:
        # Net inflow: buy unless a position is already held.
        if not self.account_service.get_position(security_id):
            self.buy(security_id, current_price=adjusted_price())
    elif money_flow < 0:
        # Net outflow: sell when a position is held.
        if self.account_service.get_position(security_id):
            self.sell(security_id, current_price=adjusted_price())

    self.today_traded = True
def get_finance_report_event(security_item, index='reportEventDate'):
    """
    Load the stored finance report events of a security.

    Parameters
    ----------
    security_item : SecurityItem or str
        the security item, its id or its code
    index : {'reportEventDate','reportDate'}
        column passed to ``index_df_with_time`` as the index of the
        returned frame, default is 'reportEventDate'

    Returns
    -------
    DataFrame
        the events read from the 'finance_report' event CSV, or an empty
        frame when that file does not exist
    """
    csv_path = get_event_path(security_item, event='finance_report')

    # Nothing stored for this security: hand back an empty frame.
    if not os.path.exists(csv_path):
        return pd.DataFrame()

    events = pd.read_csv(csv_path)
    return index_df_with_time(events, index=index)
def on_tick(self, tick_item):
    """
    Collect the ticks of one day and trade once on the 14:50 money flow.

    Ticks are accumulated in ``self.df``. When a tick whose timestamp
    contains "14:50:" arrives, the signed turnover
    (``turnover * direction``) of the collected ticks is summed: a positive
    sum buys when no position is held, a negative sum sells when a position
    is held. After that evaluation ``self.today_traded`` is set, and
    further ticks are ignored until a tick fails the ``is_same_date`` check
    against ``self.current_time``.

    Parameters
    ----------
    tick_item : mapping
        must provide 'timestamp' (a string), 'securityId' and 'price';
        'turnover' and 'direction' are read from the accumulated frame, so
        presumably every tick carries them too
    """
    # Intraday only, ultra-short-term.
    if not is_same_date(self.current_time, tick_item['timestamp']):
        # Tick is on a different date than self.current_time: re-arm
        # trading and discard the ticks collected so far.
        self.today_traded = False
        if not self.df.empty:
            self.df = pd.DataFrame()
        return

    if self.today_traded:
        return

    if not self.df.empty and self.df.index.size == 1:
        self.df = index_df_with_time(self.df)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    self.df = pd.concat([self.df, pd.DataFrame([tick_item])],
                        ignore_index=True)

    # At 14:50, compute the money flow of the day.
    if "14:50:" not in tick_item['timestamp']:
        return

    money_flow = (self.df['turnover'] * self.df['direction']).sum()
    security_id = tick_item['securityId']

    def adjusted_price():
        # Use the backward-adjusted price: raw price times the kdata factor
        # of the tick's date. Only evaluated when an order is placed.
        factor = get_kdata(
            security_id,
            the_date=pd.Timestamp(tick_item['timestamp']).date())['factor']
        return tick_item['price'] * factor

    if money_flow > 0:
        # Net inflow: buy unless a position is already held.
        if not self.account_service.get_position(security_id):
            self.buy(security_id, current_price=adjusted_price())
    elif money_flow < 0:
        # Net outflow: sell when a position is held.
        if self.account_service.get_position(security_id):
            self.sell(security_id, current_price=adjusted_price())

    self.today_traded = True
def get_finance_report_event(security_item, index='reportEventDate'):
    """
    Load the stored finance report events of a security.

    Parameters
    ----------
    security_item : object accepted by ``get_event_path``
        identifies the security whose events are read
    index : str
        column passed to ``index_df_with_time`` as the index of the
        returned frame, default is 'reportEventDate'

    Returns
    -------
    DataFrame
        the events read from the 'finance_report' event CSV, or an empty
        frame when that file does not exist
    """
    csv_path = get_event_path(security_item, event='finance_report')

    # Nothing stored for this security: hand back an empty frame.
    if not os.path.exists(csv_path):
        return pd.DataFrame()

    events = pd.read_csv(csv_path)
    return index_df_with_time(events, index=index)
def download_sp500_pe(self, response):
    """
    Parse the PE table of a response and add its rows to ``self.df_pe``.

    Every row of the ``datatable`` element after the first one yields a
    record with a ``timestamp`` (first cell, through ``to_time_str``) and a
    ``pe`` value (second cell, through ``to_float``). The records are
    appended to ``self.df_pe``, which is then re-indexed with
    ``index_df_with_time``.

    Parameters
    ----------
    response : response object
        must expose ``xpath`` and ``url`` (presumably a scrapy response)

    Returns
    -------
    None
        any exception raised while parsing is logged with its traceback
        and not propagated
    """
    # Local import: this block did not reference pandas directly before.
    import pandas as pd

    trs = response.xpath('//*[@id="datatable"]/tr').extract()

    price_jsons = []

    try:
        # Skip the first row (assumed to be the table header).
        for tr in trs[1:]:
            tds = Selector(text=tr).xpath('//td//text()').extract()
            tds = [x.strip() for x in tds if x.strip()]

            price_jsons.append({"timestamp": to_time_str(tds[0]),
                                "pe": to_float(tds[1])})

        if price_jsons:
            # DataFrame.append was removed in pandas 2.0; concat is the
            # replacement.
            self.df_pe = pd.concat([self.df_pe, pd.DataFrame(price_jsons)],
                                   ignore_index=True)
            self.df_pe = index_df_with_time(self.df_pe)
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        self.logger.exception('error when getting sp500 pe url={} error={}'.format(response.url, e))
def download_sp500_pe(self, response):
    """
    Parse the PE table of a response and add its rows to ``self.df_pe``.

    Every row of the ``datatable`` element after the first one yields a
    record with a ``timestamp`` (first cell, through ``to_time_str``) and a
    ``pe`` value (second cell, through ``to_float``). The records are
    appended to ``self.df_pe``, which is then re-indexed with
    ``index_df_with_time``.

    Parameters
    ----------
    response : response object
        must expose ``xpath`` and ``url`` (presumably a scrapy response)

    Returns
    -------
    None
        any exception raised while parsing is logged with its traceback
        and not propagated
    """
    # Local import: this block did not reference pandas directly before.
    import pandas as pd

    trs = response.xpath('//*[@id="datatable"]/tr').extract()

    price_jsons = []

    try:
        # Skip the first row (assumed to be the table header).
        for tr in trs[1:]:
            tds = Selector(text=tr).xpath('//td//text()').extract()
            tds = [x.strip() for x in tds if x.strip()]

            price_jsons.append({"timestamp": to_time_str(tds[0]),
                                "pe": to_float(tds[1])})

        if price_jsons:
            # DataFrame.append was removed in pandas 2.0; concat is the
            # replacement.
            self.df_pe = pd.concat([self.df_pe, pd.DataFrame(price_jsons)],
                                   ignore_index=True)
            self.df_pe = index_df_with_time(self.df_pe)
    except Exception as e:
        self.logger.exception('error when getting sp500 pe url={} error={}'.format(response.url, e))
def finance_sheet_to_es(sheet_type=None, start_code=None, end_code=None, force=False):
    """
    Index finance sheets of the selected securities into elasticsearch.

    Parameters
    ----------
    sheet_type : {'balance_sheet','income_statement','cash_flow_statement'} or None
        the sheet to index; None (the default) indexes all three
    start_code : str, optional
        passed to ``get_security_list`` to select the securities
    end_code : str, optional
        passed to ``get_security_list`` to select the securities
    force : bool
        passed through to ``df_to_es``

    Raises
    ------
    ValueError
        if ``sheet_type`` is not None and not one of the known sheet types

    Notes
    -----
    A failure for one security is logged with its traceback and does not
    stop the remaining securities.
    """
    known_types = ('balance_sheet', 'income_statement', 'cash_flow_statement')

    if sheet_type is None:
        sheet_types = list(known_types)
    else:
        sheet_types = [sheet_type]

    # Fail early and clearly; previously an unknown type surfaced as an
    # unbound ``doc_type`` variable.
    unknown = [name for name in sheet_types if name not in known_types]
    if unknown:
        raise ValueError(
            "unknown sheet_type {!r}, expected one of {}".format(
                unknown[0], known_types))

    # sheet type -> (document type, function returning the sheet items)
    handlers = {
        'balance_sheet': (BalanceSheet, get_balance_sheet_items),
        'income_statement': (IncomeStatement, get_income_statement_items),
        'cash_flow_statement': (CashFlowStatement, get_cash_flow_statement_items),
    }

    for current_type in sheet_types:
        doc_type, get_items = handlers[current_type]

        es_index_mapping(current_type, doc_type)

        for _, security_item in get_security_list(
                start_code=start_code, end_code=end_code).iterrows():
            try:
                df = pd.DataFrame(get_items(security_item))
                df = index_df_with_time(df, index='reportPeriod')

                df_to_es(df,
                         doc_type=doc_type,
                         timestamp_filed='reportPeriod',
                         security_item=security_item,
                         force=force)
            except Exception:
                # logger.exception already appends the active exception;
                # passing it as an extra positional argument broke the
                # log formatting because the message has no placeholder.
                logger.exception(
                    "index {} {} failed".format(security_item['code'],
                                                current_type))
def download_finance_csv(self, response):
    """
    Save a downloaded finance CSV under normalized column names.

    When the response's content type is exactly 'text/csv', the body is
    parsed, the columns are renamed, ``code``, ``securityId`` and ``id``
    (``<securityId>_<reportDate>``) columns are added, the frame is
    indexed on ``reportDate``, missing values are replaced by 0 and the
    result is written to ``response.meta['path']``. Otherwise an error
    with the url, content type and body is logged.

    Parameters
    ----------
    response : response object
        must expose ``headers``, ``body``, ``url`` and ``meta`` with the
        keys 'path' and 'item' (presumably a scrapy response)
    """
    content_type_header = response.headers.get('content-type', None)

    # The header may be missing; decoding None crashed before. The value is
    # decoded only when it is bytes, so a str header is accepted as well.
    if isinstance(content_type_header, bytes):
        content_type = content_type_header.decode("utf-8")
    else:
        content_type = content_type_header

    if content_type == 'text/csv':
        path = response.meta['path']
        security_item = response.meta['item']

        # The literal string 'None' in the file marks a missing value.
        df = pd.read_csv(io.BytesIO(response.body), na_values='None')
        df.columns = [
            "reportDate",
            "shares",
            "sharesAdjusted",
            "factor",
            "totalAssets",
            "totalCurrentAssets",
            "totalLiabilities",
            "totalCurrentLiabilities",
            "bookValue",
            "minorityBookValue",
            "preferredEquity",
            "goodwill",
            "longTermBorrowing",
            "operatingRevenue",
            "netProfit",
            "netProfitAttributedToParentCompanyOwner",
            "EPS",
            "dilutedEPS",
            "DPS",
            "netCashFlowsFromOperatingActivities",
            "netCashFlowsFromInvesting",
            "netCashFlowsFromFinancingActivities",
            "cashChange",
            "cashAtTheEndOfPeriod",
            "capitalExpenditures",
            "price",
            "priceHigh",
            "priceLow",
            "ROE",
            "ROA",
            "BVPS",
            "PB",
            "PE",
            "cumulativeDividendsPerShare",
            "dividendPayoutRatio",
            "longTermDebtToEquityRatio",
            "equityToAssetsRatio",
            "netMargin",
            "assetTurnover",
            "freeCashFlowPerShare",
            "currentRatio",
        ]

        df['code'] = security_item['code']
        df['securityId'] = security_item['id']
        # Row id: "<securityId>_<reportDate>".
        df['id'] = df[['securityId', 'reportDate']].apply(
            lambda x: '_'.join(x.astype(str)), axis=1)
        df = index_df_with_time(df, index='reportDate')

        df.fillna(0, inplace=True)
        df.to_csv(path, index=False)
    else:
        self.logger.error(
            "get finance csv error:url={} content type={} body={}".format(
                response.url, content_type_header, response.body))
def download_fi_report_event_data(self, response):
    """
    Parse finance report events from a response and merge them into the CSV.

    The events already stored for the security are loaded first. Each
    report link on the page becomes one event row. If the first link on
    the page is already stored (same event date and title) the method
    returns without writing. Otherwise the new rows are added to the
    stored ones, de-duplicated on ('id', 'title') keeping the last
    occurrence, indexed on 'reportEventDate' and written back.

    Parameters
    ----------
    response : response object
        must expose ``xpath``, ``url`` and ``meta`` with the keys 'item'
        and 'period_type' (presumably a scrapy response)

    Returns
    -------
    None
        any exception raised while parsing or saving is logged with its
        traceback and not propagated
    """
    security_item = response.meta['item']
    period_type = response.meta['period_type']

    path = get_event_path(security_item, event='finance_report')

    df = event.get_finance_report_event(security_item, index='reportEventDate')

    try:
        report_event_dates = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
        report_event_dates = [date.strip() for date in report_event_dates if date.strip()]

        report_contents = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()

        # Rows are collected first and added in one go; DataFrame.append
        # was removed in pandas 2.0 and copied the frame on every call.
        new_rows = []
        for i, tr in enumerate(report_contents):
            link = Selector(text=tr)
            href = link.xpath('//@href').extract()[0]
            title = link.xpath('//text()').extract()[0]
            report_period = self.report_period_from_title(title, period_type, report_event_dates[i])

            # If the latest event has already been fetched, return directly.
            if i == 0 and not df.empty:
                latest = pd.Timestamp(report_event_dates[0]).date()
                # `in` replaces Index.contains, removed in pandas 1.0.
                if latest in df.index and (df.loc[latest, 'title'] == title):
                    self.logger.info(
                        "{} {} report has been the latest".format(security_item['code'], report_period))
                    return

            new_rows.append({
                "id": "{}_{}_{}".format(security_item['id'], report_event_dates[i], report_period),
                "securityId": security_item['id'],
                "reportEventDate": report_event_dates[i],
                "url": "http://vip.stock.finance.sina.com.cn" + href,
                "title": title,
                "reportDate": report_period})

        if new_rows:
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

        if not df.empty:
            df = df.drop_duplicates(subset=['id', 'title'], keep='last')
            df = index_df_with_time(df, index='reportEventDate')
            df.to_csv(path, index=False)
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        self.logger.exception('error when getting k data url={} error={}'.format(response.url, e))
def download_fi_report_event_data(self, response):
    """
    Parse finance report events from a response and store them as CSV.

    Each report link on the page becomes one event row holding the
    security id, the event timestamp, the absolute url, the title and the
    report period derived by ``self.report_period_from_title``. The rows
    are de-duplicated, indexed with ``index_df_with_time`` and written to
    the path returned by ``get_finance_report_event_path``.

    Parameters
    ----------
    response : response object
        must expose ``xpath``, ``url`` and ``meta`` with the keys 'item'
        and 'period_type' (presumably a scrapy response)

    Returns
    -------
    None
        any exception raised while parsing or saving is logged with its
        traceback and not propagated
    """
    security_item = response.meta['item']
    period_type = response.meta['period_type']

    path = get_finance_report_event_path(security_item)

    try:
        report_timestamps = response.xpath(
            '//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
        report_timestamps = [
            date.strip() for date in report_timestamps if date.strip()
        ]

        report_contents = response.xpath(
            '//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()

        # Rows are collected first and the frame is built once;
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call.
        rows = []
        for i, tr in enumerate(report_contents):
            link = Selector(text=tr)
            href = link.xpath('//@href').extract()[0]
            title = link.xpath('//text()').extract()[0]
            report_period = self.report_period_from_title(
                title, period_type, report_timestamps[i])

            rows.append({
                "securityId": security_item['id'],
                "timestamp": report_timestamps[i],
                "url": "http://vip.stock.finance.sina.com.cn" + href,
                "title": title,
                "reportPeriod": report_period,
            })

        df = pd.DataFrame(rows)
        if not df.empty:
            df = df.drop_duplicates()
            df = index_df_with_time(df)
            df.to_csv(path, index=False)
    except Exception:
        self.logger.exception('error when getting k data url={}'.format(
            response.url))
def get_finance_forecast_event(security_item):
    """
    get forecast items.

    Parameters
    ----------
    security_item : SecurityItem or str
        the security item,id or code

    Returns
    -------
    DataFrame
        the forecast events read from the finance forecast event CSV and
        indexed with ``index_df_with_time``, or an empty frame when that
        file does not exist
    """
    security_item = to_security_item(security_item)
    # The path must come from the path helper. Calling this function itself
    # here recursed without end and never reached the file check.
    path = get_finance_forecast_event_path(security_item)

    if os.path.exists(path):
        df = pd.read_csv(path)
        df = index_df_with_time(df)
    else:
        df = pd.DataFrame()
    return df
def finance_sheet_to_es(sheet_type='balance_sheet', start_code=None, end_code=None, force=False):
    """
    Index one kind of finance sheet of the selected securities into elasticsearch.

    Parameters
    ----------
    sheet_type : {'balance_sheet','income_statement','cash_flow_statement'}
        the sheet to index, default is 'balance_sheet'
    start_code : str, optional
        passed to ``get_security_list`` to select the securities
    end_code : str, optional
        passed to ``get_security_list`` to select the securities
    force : bool
        passed through to ``df_to_es``; when false, a term query on the
        security id is passed along as well

    Raises
    ------
    ValueError
        if ``sheet_type`` is not one of the known sheet types
    """
    known_types = ('balance_sheet', 'income_statement', 'cash_flow_statement')

    # Fail early and clearly; previously an unknown type surfaced as an
    # unbound ``doc_type`` variable.
    if sheet_type not in known_types:
        raise ValueError(
            "unknown sheet_type {!r}, expected one of {}".format(
                sheet_type, known_types))

    # sheet type -> (document type, function returning the sheet items)
    handlers = {
        'balance_sheet': (BalanceSheet, get_balance_sheet_items),
        'income_statement': (IncomeStatement, get_income_statement_items),
        'cash_flow_statement': (CashFlowStatement, get_cash_flow_statement_items),
    }
    doc_type, get_items = handlers[sheet_type]

    es_index_mapping(sheet_type, doc_type)

    for _, security_item in get_security_list(start_code=start_code, end_code=end_code).iterrows():
        query = None
        if not force:
            query = {"term": {"securityId": security_item["id"]}}

        df = pd.DataFrame(get_items(security_item))
        df = index_df_with_time(df, index='reportPeriod')

        df_to_es(df, doc_type=doc_type, timestamp_filed='reportPeriod', query=query, force=force)
def download_fi_report_event_data(self, response):
    """
    Parse finance report events from a response and merge them into the CSV.

    The events already stored for the security are loaded first. Each
    report link on the page becomes one event row. If the first link on
    the page is already stored (same event date and title) the method
    returns without writing. Otherwise the new rows are added to the
    stored ones, de-duplicated on ('id', 'title') keeping the last
    occurrence, indexed on 'reportEventDate' and written back.

    Parameters
    ----------
    response : response object
        must expose ``xpath``, ``url`` and ``meta`` with the keys 'item'
        and 'period_type' (presumably a scrapy response)

    Returns
    -------
    None
        any exception raised while parsing or saving is logged with its
        traceback and not propagated
    """
    security_item = response.meta['item']
    period_type = response.meta['period_type']

    path = get_event_path(security_item, event='finance_report')

    df = event.get_finance_report_event(security_item,
                                        index='reportEventDate')

    try:
        report_event_dates = response.xpath(
            '//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
        report_event_dates = [
            date.strip() for date in report_event_dates if date.strip()
        ]

        report_contents = response.xpath(
            '//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()

        # Rows are collected first and added in one go; DataFrame.append
        # was removed in pandas 2.0 and copied the frame on every call.
        new_rows = []
        for i, tr in enumerate(report_contents):
            link = Selector(text=tr)
            href = link.xpath('//@href').extract()[0]
            title = link.xpath('//text()').extract()[0]
            report_period = self.report_period_from_title(
                title, period_type, report_event_dates[i])

            # If the latest event has already been fetched, return directly.
            if i == 0 and not df.empty:
                latest = pd.Timestamp(report_event_dates[0]).date()
                # `in` replaces Index.contains, removed in pandas 1.0.
                if latest in df.index and (df.loc[latest, 'title'] == title):
                    self.logger.info(
                        "{} {} report has been the latest".format(
                            security_item['code'], report_period))
                    return

            new_rows.append({
                "id": "{}_{}_{}".format(security_item['id'],
                                        report_event_dates[i],
                                        report_period),
                "securityId": security_item['id'],
                "reportEventDate": report_event_dates[i],
                "url": "http://vip.stock.finance.sina.com.cn" + href,
                "title": title,
                "reportDate": report_period,
            })

        if new_rows:
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

        if not df.empty:
            df = df.drop_duplicates(subset=['id', 'title'], keep='last')
            df = index_df_with_time(df, index='reportEventDate')
            df.to_csv(path, index=False)
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        self.logger.exception(
            'error when getting k data url={} error={}'.format(
                response.url, e))
def download_finance_csv(self, response):
    """
    Save a downloaded finance CSV under normalized column names.

    When the response's content type is exactly 'text/csv', the body is
    parsed, the columns are renamed, ``code``, ``securityId`` and ``id``
    (``<securityId>_<reportDate>``) columns are added, the frame is
    indexed on ``reportDate``, missing values are replaced by 0 and the
    result is written to ``response.meta['path']``. Otherwise an error
    with the url, content type and body is logged.

    Parameters
    ----------
    response : response object
        must expose ``headers``, ``body``, ``url`` and ``meta`` with the
        keys 'path' and 'item' (presumably a scrapy response)
    """
    content_type_header = response.headers.get('content-type', None)

    # The header may be missing; decoding None crashed before. The value is
    # decoded only when it is bytes, so a str header is accepted as well.
    if isinstance(content_type_header, bytes):
        content_type = content_type_header.decode("utf-8")
    else:
        content_type = content_type_header

    if content_type == 'text/csv':
        path = response.meta['path']
        security_item = response.meta['item']

        # The literal string 'None' in the file marks a missing value.
        df = pd.read_csv(io.BytesIO(response.body), na_values='None')
        df.columns = [
            "reportDate",
            "shares",
            "sharesAdjusted",
            "factor",
            "totalAssets",
            "totalCurrentAssets",
            "totalLiabilities",
            "totalCurrentLiabilities",
            "bookValue",
            "minorityBookValue",
            "preferredEquity",
            "goodwill",
            "longTermBorrowing",
            "operatingRevenue",
            "netProfit",
            "netProfitAttributedToParentCompanyOwner",
            "EPS",
            "dilutedEPS",
            "DPS",
            "netCashFlowsFromOperatingActivities",
            "netCashFlowsFromInvesting",
            "netCashFlowsFromFinancingActivities",
            "cashChange",
            "cashAtTheEndOfPeriod",
            "capitalExpenditures",
            "price",
            "priceHigh",
            "priceLow",
            "ROE",
            "ROA",
            "BVPS",
            "PB",
            "PE",
            "cumulativeDividendsPerShare",
            "dividendPayoutRatio",
            "longTermDebtToEquityRatio",
            "equityToAssetsRatio",
            "netMargin",
            "assetTurnover",
            "freeCashFlowPerShare",
            "currentRatio",
        ]

        df['code'] = security_item['code']
        df['securityId'] = security_item['id']
        # Row id: "<securityId>_<reportDate>".
        df['id'] = df[['securityId', 'reportDate']].apply(
            lambda x: '_'.join(x.astype(str)), axis=1)
        df = index_df_with_time(df, index='reportDate')

        df.fillna(0, inplace=True)
        df.to_csv(path, index=False)
    else:
        self.logger.error(
            "get finance csv error:url={} content type={} body={}".format(
                response.url, content_type_header, response.body))