def __crawl(self, date, proc=1):
    self.logger.debug(f'RecommendationItem crawling {proc} : {date}')
    self.param['startDt'] = date
    self.param['endDt'] = date
    self.param['proc'] = proc
    curr_page = 1
    max_page = 1
    item_df = pd.DataFrame([])
    while curr_page <= max_page:
        self.param['curPage'] = curr_page
        res = requests.post(RECOMM_URL, data=self.param)
        res = json.loads(res.text)
        if len(res['data']) > 0:
            res = pd.DataFrame.from_dict(res['data'])
            item_df = pd.concat([item_df, res])
            max_page = int(res['TOTROW'].values[0])
        curr_page += 1
    self.logger.debug(
        f'RecommendationItem crawl complete {proc}: {date} ({len(item_df)})'
    )
    timer.random_sleep(min_delay=self.delay)
    item_df.columns = map(str.lower, item_df.columns)
    return item_df
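# Illustrative only: a minimal driver sketch for __crawl, assuming it is added to
# the same crawler class (so the name-mangled __crawl resolves) and that
# self.param, self.delay and self.logger are already set up. The method name
# crawl_date_range and the '%Y-%m-%d' date format are assumptions, not part of
# the original code.
def crawl_date_range(self, start_date, end_date, proc=1):
    frames = []
    for date in pd.date_range(start_date, end_date):
        # __crawl handles a single day; collect one frame per day
        frames.append(self.__crawl(date.strftime('%Y-%m-%d'), proc=proc))
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame([])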
def crawl_fnguide(self, cmp_cd):
    self.logger.debug('Fnguide crawling start')
    header = {
        'Host': 'comp.fnguide.com',
    }
    result = []
    cmp_dict = dict()
    gicode = 'A%06d' % int(cmp_cd)
    cmp_dict['code'] = cmp_cd
    for url in [INVEST_URL, FINANCE_RATIO_URL]:
        res = requests.get(f'{url}?gicode={gicode}', headers=header)
        soup = BeautifulSoup(res.text, 'lxml')
        table_list = soup.find_all('table', attrs={'class': 'us_table_ty1'})
        for tb in table_list:
            trs = tb.find_all('tr')[1:]
            for tr in trs:
                td = list(tr.children)
                if int(td[1].attrs.get('colspan', 0)) > 0:
                    continue
                # indicator name / value pair
                key = td[1].find_all('span', attrs={'class': 'txt_acd'})
                if len(key) > 0:
                    key = key[0].text
                else:
                    key = td[1].text
                key = key.strip()
                val = td[-2].text.strip()
                if key == 'EV/EBITDA':
                    # reported on a yearly cycle, so use the previous year's value
                    val = td[-4].text.strip()
                if len(val) > 0:
                    try:
                        cmp_dict[key] = float(val.replace(',', ''))
                    except ValueError:
                        pass
    result.append(cmp_dict)
    timer.random_sleep(min_delay=self.delay)
    df_result = pd.DataFrame(result)
    try:
        df_result = df_result[[
            'code', 'EPS', 'CFPS', 'BPS', 'SPS', 'EV/EBITDA', 'ROE'
        ]]
    except KeyError:
        self.logger.debug(
            f"{df_result[['code']]}, KeyError : ['EV/EBITDA'] not in index"
        )
        df_result = None
    self.logger.debug('Fnguide crawling complete')
    return df_result
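# Illustrative only: calling crawl_fnguide for one listed company. 'crawler' is a
# placeholder instance name and '005930' is just an example ticker; when the
# fnguide tables parse cleanly, the returned frame has the columns selected above
# (code, EPS, CFPS, BPS, SPS, EV/EBITDA, ROE), otherwise None is returned.
# df = crawler.crawl_fnguide('005930')
# if df is not None:
#     print(df.iloc[0]['EPS'], df.iloc[0]['ROE'])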
def __crawl_stock_price(self, stock_code, max_page=250):
    sise_list = []
    page = 1
    last_date = ''
    while page <= max_page:
        _url = SISE_URL.format(code=stock_code, page=page)
        res = requests.get(_url)
        _list = self.__parse_sise_list(res.text)
        if not _list:
            # stop when a page returns no rows
            break
        sise_list.extend(_list)
        # stop once prices reach 2010.11 or the page no longer advances
        if _list[0][0].startswith('2010.11') or _list[0][0] == last_date:
            break
        last_date = _list[0][0]
        page += 1
        timer.random_sleep(min_delay=self.delay)
    return sise_list
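# Illustrative only: __parse_sise_list is not shown above. A minimal sketch of
# what it could look like for a daily-price page, assuming each data row is a
# <tr> whose first cell is a 'YYYY.MM.DD' date followed by numeric cells, and
# assuming re and BeautifulSoup are imported as elsewhere in this module. The
# actual markup of the target page may differ; adjust the selectors accordingly.
def __parse_sise_list(self, html):
    soup = BeautifulSoup(html, 'lxml')
    rows = []
    for tr in soup.find_all('tr'):
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        # keep only rows that start with a date such as '2020.01.02'
        if cells and re.match(r'\d{4}\.\d{2}\.\d{2}', cells[0]):
            rows.append(cells)
    return rows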
def articles_id(column_id):
    article_list = list()
    offset = zhihu.Controller()
    while not offset.is_end():
        print(GET_ARTICLES_ID)
        response = net.column_spider(column_id, offset.next_offset())
        if response is None:
            raise ValueError('Response is None')
        content = response.text
        totals = re.search(r'"totals":\W(\d+)', content).group(1)
        offset.totals = int(totals)
        article_id_list = re.findall(r'"id":\W(\d+)', content)
        offset.increase(len(article_id_list))
        article_list.extend(article_id_list)
        article_id_list.clear()
        timer.random_sleep(end=zhihu.SLEEP)
    return article_list
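# Illustrative only: zhihu.Controller is referenced but not shown. A minimal
# sketch of an offset controller with the interface used above (is_end,
# next_offset, increase, and a writable totals); the details of the real class
# may differ, and the default page size here is an assumption.
class Controller:
    def __init__(self):
        self.totals = None   # set once the first response reveals the total count
        self._offset = 0

    def is_end(self):
        # not finished until totals is known and the offset has caught up
        return self.totals is not None and self._offset >= self.totals

    def next_offset(self):
        return self._offset

    def increase(self, count):
        self._offset += count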
def articles(column_id, warehouse):
    articles_list = articles_id(column_id)
    request_times = dict([(i, 0) for i in articles_list])
    warehouse = column_warehouse(column_id, warehouse)
    while len(articles_list) != 0:
        article_id = articles_list.pop(0)
        try:
            ar = article(article_id, warehouse)
            print(ar)
        except ValueError:
            # re-queue a failed article up to 5 times, counting every failure
            if request_times.get(article_id) < 5:
                articles_list.append(article_id)
            request_times[article_id] += 1
        timer.random_sleep(end=zhihu.SLEEP)
    # report the articles that still failed after 5 attempts
    for article_id, times in request_times.items():
        if times >= 5:
            print(net.article_spider_url(article_id))
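# Illustrative only: driving the column crawl end to end. The column id and
# warehouse path are placeholders; column_warehouse and article are assumed to
# behave as in the functions above (the warehouse being where crawled articles
# are stored).
if __name__ == '__main__':
    articles('example-column-id', './warehouse')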