def record(self, entity, start, end, size, timestamps, http_session):
    """Download quarterly k-data tables for ``entity`` and return them as records.

    One page is requested per ``(year, quarter)`` pair returned by
    ``get_year_quarters(start, now_pd_timestamp(Region.CHN))``. A quarter is
    skipped when its page contains no parseable table or fewer than five tables.

    Args:
        entity: Security being recorded; its ``code``, ``name`` and
            ``timestamp`` attributes are read.
        start: Start point passed to ``get_year_quarters``.
        end: Not used by this method.
        size: Not used by this method.
        timestamps: Not used by this method.
        http_session: Session object forwarded to ``request_get``.

    Returns:
        list[dict]: One dict per row, ordered by ``timestamp``. An empty list
        is returned when no quarter produced a table.
    """
    quarters = get_year_quarters(start, now_pd_timestamp(Region.CHN))
    # Drop the first quarter when the entity's timestamp and ``start`` are not
    # the same date, unless it is the only quarter.
    if not is_same_date(entity.timestamp, start) and len(quarters) > 1:
        quarters = quarters[1:]

    security_item = entity
    level = self.level.value

    # Collect the per-quarter frames and concatenate once at the end instead
    # of re-copying the accumulated frame on every iteration.
    frames = []
    for year, quarter in quarters:
        query_url = self.url.format(security_item.code, year, quarter)
        response = request_get(http_session, query_url)
        response.encoding = 'gbk'

        try:
            dfs = pd.read_html(response.text)
        except ValueError as error:
            self.logger.error(
                f'skip ({year}-{quarter:02d}){security_item.code}{security_item.name}({error})'
            )
            self.sleep()
            continue

        if len(dfs) < 5:
            self.sleep()
            continue

        # The fifth table holds the data; its first row is discarded
        # (presumably a header row - confirm against the page layout).
        # Copy after slicing so the column assignments below operate on an
        # independent frame rather than on a slice.
        df = dfs[4].iloc[1:].copy()
        df.columns = [
            'timestamp', 'open', 'high', 'close', 'low', 'volume', 'turnover'
        ]
        df['name'] = security_item.name
        df['level'] = level
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df['provider'] = Provider.Sina
        frames.append(df)

        self.logger.info(
            f'({security_item.code}{security_item.name})({year}-{quarter:02d})'
        )
        self.sleep()

    # Sorting an empty, column-less frame by 'timestamp' raises KeyError, so
    # return the empty result explicitly.
    if not frames:
        return []

    result_df = pd.concat(frames).sort_values(by='timestamp')
    return result_df.to_dict(orient='records')
def record(self, entity, start, end, size, timestamps):
    """Download quarterly k-data tables for ``entity`` and return them as records.

    One page is requested per ``(year, quarter)`` pair returned by
    ``get_year_quarters(start)``, with a 10 second pause after each attempt
    (whether it succeeded or was skipped). A quarter is skipped when its page
    contains no parseable table or fewer than five tables.

    Args:
        entity: Security being recorded; its ``code``, ``name`` and
            ``timestamp`` attributes are read.
        start: Start point passed to ``get_year_quarters``.
        end: Not used by this method.
        size: Not used by this method.
        timestamps: Not used by this method.

    Returns:
        list[dict]: One dict per row, ordered by ``timestamp``. An empty list
        is returned when no quarter produced a table.
    """
    pause_seconds = 10.0

    quarters = get_year_quarters(start)
    # Drop the first quarter when the entity's timestamp and ``start`` are not
    # the same date, unless it is the only quarter.
    if not is_same_date(entity.timestamp, start) and len(quarters) > 1:
        quarters = quarters[1:]

    security_item = entity
    level = self.level.value

    # Collect the per-quarter frames and concatenate once at the end instead
    # of re-copying the accumulated frame on every iteration.
    frames = []
    for year, quarter in quarters:
        query_url = self.url.format(security_item.code, year, quarter)
        # NOTE(review): no timeout is passed, so a stalled connection blocks
        # this call indefinitely; consider adding one together with a policy
        # for handling the resulting exception.
        response = requests.get(query_url)
        response.encoding = "gbk"

        try:
            dfs = pd.read_html(response.text)
        except ValueError as error:
            self.logger.error(
                f"skip ({year}-{quarter:02d}){security_item.code}{security_item.name}({error})"
            )
            time.sleep(pause_seconds)
            continue

        if len(dfs) < 5:
            time.sleep(pause_seconds)
            continue

        # The fifth table holds the data; its first row is discarded
        # (presumably a header row - confirm against the page layout).
        # Copy after slicing so the column assignments below operate on an
        # independent frame rather than on a slice.
        df = dfs[4].iloc[1:].copy()
        df.columns = [
            "timestamp", "open", "high", "close", "low", "volume", "turnover"
        ]
        df["name"] = security_item.name
        df["level"] = level
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        df["provider"] = "sina"
        frames.append(df)

        self.logger.info(
            f"({security_item.code}{security_item.name})({year}-{quarter:02d})"
        )
        time.sleep(pause_seconds)

    # Sorting an empty, column-less frame by "timestamp" raises KeyError, so
    # return the empty result explicitly.
    if not frames:
        return []

    result_df = pd.concat(frames).sort_values(by="timestamp")
    return result_df.to_dict(orient="records")
def generate_request_param(self, security_item, start, end, size, timestamp):
    """Assemble the parameters describing which quarters to request.

    The quarters come from ``get_year_quarters(start)``. The leading quarter
    is dropped when the security's timestamp and ``start`` are not the same
    date and more than one quarter is available.

    Returns:
        dict: keys ``'security_item'``, ``'quarters'`` and ``'level'`` (the
        value of ``self.level``).
    """
    pending = get_year_quarters(start)

    same_day = is_same_date(security_item.timestamp, start)
    if not same_day and len(pending) > 1:
        pending = pending[1:]

    request_param = {'security_item': security_item}
    request_param['quarters'] = pending
    request_param['level'] = self.level.value
    return request_param
def record(self, entity, start, end, size, timestamps, http_session):
    """Download quarterly k-data tables for ``entity`` into one DataFrame.

    One page is requested per ``(year, quarter)`` pair returned by
    ``get_year_quarters(start, now_pd_timestamp(Region.CHN))``. A quarter is
    skipped when ``sync_get`` returns ``None``, when the page contains no
    parseable table, or when it contains fewer than five tables.

    Args:
        entity: Security being recorded; its ``code``, ``name`` and
            ``timestamp`` attributes are read.
        start: Start point passed to ``get_year_quarters``.
        end: Not used by this method.
        size: Not used by this method.
        timestamps: Not used by this method.
        http_session: Session object forwarded to ``sync_get``.

    Returns:
        The concatenated frame with columns ``timestamp``, ``open``, ``high``,
        ``close``, ``low``, ``volume``, ``turnover`` and ``level`` when
        ``pd_is_not_null`` accepts it, otherwise ``None``.
    """
    quarters = get_year_quarters(start, now_pd_timestamp(Region.CHN))
    # Drop the first quarter when the entity's timestamp and ``start`` are not
    # the same date, unless it is the only quarter.
    if not is_same_date(entity.timestamp, start) and len(quarters) > 1:
        quarters = quarters[1:]

    security_item = entity
    level = self.level.value

    # Collect the per-quarter frames and concatenate once at the end instead
    # of re-copying the accumulated frame on every iteration.
    frames = []
    for year, quarter in quarters:
        query_url = self.url.format(security_item.code, year, quarter)
        text = sync_get(http_session, query_url, encoding='gbk', return_type='text')
        if text is None:
            continue

        try:
            dfs = pd.read_html(text)
        except ValueError as error:
            self.logger.error(
                f'skip ({year}-{quarter:02d}){security_item.code}{security_item.name}({error})'
            )
            self.sleep()
            continue

        if len(dfs) < 5:
            self.sleep()
            continue

        # The fifth table holds the data; its first row is discarded
        # (presumably a header row - confirm against the page layout).
        df = dfs[4].iloc[1:].copy()
        df.columns = [
            'timestamp', 'open', 'high', 'close', 'low', 'volume', 'turnover'
        ]
        frames.append(df)
        self.sleep()

    # pd.concat rejects an empty list, so fall back to an empty frame and let
    # pd_is_not_null decide, exactly as before.
    result_df = pd.concat(frames) if frames else pd.DataFrame()
    if pd_is_not_null(result_df):
        result_df['level'] = level
        return result_df
    return None
def record(self, security_item, start, end, size, timestamps):
    """Scrape unadjusted ('bfq') and adjusted ('hfq') k-data rows for a security.

    For a quarter, the page for each of 'bfq' and 'hfq' is fetched from the
    URL built by ``self.get_kdata_url`` and the rows of the table with id
    ``FundHoldSharesTable`` are parsed. 'bfq' rows create new entries; 'hfq'
    rows are merged into the entry with the same id, adding the ``hfq_*``
    prices and ``factor``. The last factor seen is also stored in
    ``self.latest_factors`` under the security's id.

    NOTE(review): the indentation of this block was reconstructed from
    flattened text. ``return kdatas`` is placed inside the quarter loop, so
    only the first quarter is processed per call and ``None`` is returned
    when there are no quarters - confirm against the original file.

    Args:
        security_item: Security being recorded; ``id``, ``code``, ``name``
            and ``timestamp`` are read.
        start: Start point passed to ``get_year_quarters``.
        end: Not used by this method.
        size: Not used by this method.
        timestamps: Not used by this method.

    Returns:
        list[dict]: The k-data entries collected for the quarter.
    """
    the_quarters = get_year_quarters(start)
    # Skip the first quarter when the security's timestamp and ``start`` are
    # not the same date (that quarter is treated as already recorded), unless
    # it is the only quarter.
    if not is_same_date(security_item.timestamp, start) and len(the_quarters) > 1:
        the_quarters = the_quarters[1:]
    for year, quarter in the_quarters:
        kdatas = []
        # 'bfq' must be processed before 'hfq': the hfq pass looks up the
        # entries created by the bfq pass.
        for fuquan in ['bfq', 'hfq']:
            the_url = self.get_kdata_url(security_item.code, year, quarter, fuquan)
            resp = requests.get(the_url)
            # Every table row except the first one.
            trs = Selector(text=resp.text).xpath(
                '//*[@id="FundHoldSharesTable"]/tr[position()>1 and position()<=last()]'
            ).extract()
            for idx, tr in enumerate(trs):
                tds = Selector(text=tr).xpath('//td//text()').extract()
                # Keep only the non-blank cell texts, stripped.
                tds = [x.strip() for x in tds if x.strip()]
                # NOTE(review): ``open`` shadows the builtin within this method.
                open = tds[1]
                high = tds[2]
                close = tds[3]
                low = tds[4]
                volume = tds[5]
                turnover = tds[6]
                # Only the hfq rows are read for an eighth cell (the factor).
                if fuquan == 'hfq':
                    factor = tds[7]
                the_timestamp = to_pd_timestamp(tds[0])
                the_id = generate_kdata_id(security_id=security_item.id, timestamp=the_timestamp, level=self.level)
                if fuquan == 'hfq':
                    # The bfq entries were collected first; find the matching
                    # one so the hfq values can be added to it.
                    existed = [
                        item for item in kdatas if item['id'] == the_id
                    ]
                    if existed:
                        kdata = existed[0]
                    else:
                        # No bfq entry for this id: build one by dividing the
                        # hfq prices by the factor.
                        self.logger.error(
                            "bfq not got for:{}".format(the_id))
                        kdata = {
                            'id': the_id,
                            'timestamp': the_timestamp,
                            'name': security_item.name,
                            'level': self.level.value,
                            'open': to_float(open) / to_float(factor),
                            'close': to_float(close) / to_float(factor),
                            'high': to_float(high) / to_float(factor),
                            'low': to_float(low) / to_float(factor),
                            'volume': to_float(volume),
                            'turnover': to_float(turnover)
                        }
                        kdatas.append(kdata)
                    kdata['hfq_open'] = to_float(open)
                    kdata['hfq_high'] = to_float(high)
                    kdata['hfq_close'] = to_float(close)
                    kdata['hfq_low'] = to_float(low)
                    kdata['factor'] = to_float(factor)
                    # Overwritten on every hfq row, so the last row processed wins.
                    self.latest_factors[security_item.id] = to_float(
                        factor)
                else:
                    kdatas.append({
                        'id': the_id,
                        'timestamp': the_timestamp,
                        'name': security_item.name,
                        'level': self.level.value,
                        'open': to_float(open),
                        'close': to_float(close),
                        'high': to_float(high),
                        'low': to_float(low),
                        'volume': to_float(volume),
                        'turnover': to_float(turnover)
                    })
        return kdatas