    def __crawl(self, date, proc=1):
        self.logger.debug(f'RecommendationItem crawling {proc} : {date}')

        self.param['startDt'] = date
        self.param['endDt'] = date
        self.param['proc'] = proc
        curr_page = 1
        max_page = 1
        item_df = pd.DataFrame([])
        while curr_page <= max_page:
            self.param['curPage'] = curr_page
            res = requests.post(RECOMM_URL, data=self.param)
            res = res.json()
            if len(res['data']) > 0:
                page_df = pd.DataFrame.from_dict(res['data'])
                item_df = pd.concat([item_df, page_df])
                # update the paging bound from the TOTROW field of the response
                max_page = int(page_df['TOTROW'].values[0])
            curr_page += 1
        self.logger.debug(
            f'RecommendationItem crawl complete {proc}: {date} ({len(item_df)})'
        )

        timer.random_sleep(min_delay=self.delay)
        item_df.columns = map(str.lower, item_df.columns)
        return item_df
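
Every snippet in this set throttles its requests with timer.random_sleep, whose implementation is not shown. A minimal sketch of such a helper, assuming it only needs a lower bound (min_delay) and an optional upper bound (end); the real signature in the timer module is an assumption:

import random
import time


def random_sleep(min_delay=0.5, end=None):
    # Hypothetical stand-in for timer.random_sleep: pause for a random
    # interval between min_delay and end (defaults to twice min_delay).
    upper = end if end is not None else min_delay * 2
    time.sleep(random.uniform(min_delay, upper))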
Example #2
    def crawl_fnguide(self, cmp_cd):
        self.logger.debug('Fnguide crawling start')

        header = {
            'Host': 'comp.fnguide.com',
        }
        result = []
        cmp_dict = {'code': cmp_cd}
        # fnguide expects 'A' plus the zero-padded six-digit company code
        gicode = 'A%06d' % int(cmp_cd)

        for url in [INVEST_URL, FINANCE_RATIO_URL]:
            res = requests.get(f'{url}?gicode={gicode}', headers=header)
            soup = BeautifulSoup(res.text, 'lxml')
            table_list = soup.find_all('table',
                                       attrs={'class': 'us_table_ty1'})
            for tb in table_list:
                trs = tb.find_all('tr')[1:]  # skip the header row
                for tr in trs:
                    td = list(tr.children)

                    # rows whose label cell spans multiple columns carry no value; skip them
                    if int(td[1].attrs.get('colspan', 0)) > 0:
                        continue

                    # metric key: prefer the span.txt_acd label when present
                    key = td[1].find_all('span', attrs={'class': 'txt_acd'})
                    if len(key) > 0:
                        key = key[0].text
                    else:
                        key = td[1].text
                    key = key.strip()

                    val = td[-2].text.strip()
                    if key == 'EV/EBITDA':  # reported annually, so use last year's value
                        val = td[-4].text.strip()
                    if len(val) > 0:
                        try:
                            cmp_dict[key] = float(val.replace(',', ''))
                        except ValueError:
                            # skip cells that are not numeric
                            pass
        result.append(cmp_dict)

        timer.random_sleep(min_delay=self.delay)

        df_result = pd.DataFrame(result)
        try:
            df_result = df_result[[
                'code', 'EPS', 'CFPS', 'BPS', 'SPS', 'EV/EBITDA', 'ROE'
            ]]
        except KeyError:
            self.logger.debug(
                f"{df_result[['code']]}, KeyError : ['EV/EBITDA'] not in index"
            )
            df_result = None

        self.logger.debug('Fnguide crawling complete')
        return df_result
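
To show the reshaping step at the end of crawl_fnguide in isolation, here is a self-contained sketch that builds the same one-row-per-company frame from an invented metric dict (the values below are made up and are not fnguide data):

import pandas as pd

# Invented values purely to illustrate the output shape of crawl_fnguide.
cmp_dict = {'code': '000000', 'EPS': 1000.0, 'CFPS': 1500.0, 'BPS': 9000.0,
            'SPS': 12000.0, 'EV/EBITDA': 5.0, 'ROE': 10.0}
df_result = pd.DataFrame([cmp_dict])
df_result = df_result[['code', 'EPS', 'CFPS', 'BPS', 'SPS', 'EV/EBITDA', 'ROE']]
print(df_result)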
Example #3
    def __crawl_stock_price(self, stock_code, max_page=250):
        # walk the daily price pages until Nov 2010 or the last page is reached
        sise_list = []
        page = 1
        last_date = ''
        while page <= max_page:
            _url = SISE_URL.format(code=stock_code, page=page)
            res = requests.get(_url)
            _list = self.__parse_sise_list(res.text)
            if not _list:
                break
            sise_list.extend(_list)
            # stop once prices reach 2010.11 or the page repeats (past the last page)
            if _list[0][0].startswith('2010.11') or _list[0][0] == last_date:
                break
            last_date = _list[0][0]
            page += 1
            timer.random_sleep(min_delay=self.delay)

        return sise_list
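
__parse_sise_list is called above but not included in the snippet. A hypothetical sketch of what such a parser could look like, assuming each price page carries a plain HTML table with the date in the first cell of every data row (the actual page layout is an assumption):

from bs4 import BeautifulSoup


def parse_sise_list(html):
    # Hypothetical stand-in for self.__parse_sise_list: one tuple per data
    # row, skipping header/spacer rows that have no date in the first cell.
    rows = []
    soup = BeautifulSoup(html, 'lxml')
    for tr in soup.find_all('tr'):
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        if cells and cells[0]:
            rows.append(tuple(cells))
    return rows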
Example #4
def articles_id(column_id):
    article_list = list()
    offset = zhihu.Controller()
    while not offset.is_end():
        print(GET_ARTICLES_ID)
        response = net.column_spider(column_id, offset.next_offset())
        if response is None:
            raise ValueError('Response is None')
        content = response.text
        # the payload reports the total article count, which tells the controller when to stop
        totals = re.search(r'"totals":\W(\d+)', content).group(1)
        offset.totals = int(totals)
        article_id_list = re.findall(r'"id":\W(\d+)', content)
        offset.increase(len(article_id_list))
        article_list.extend(article_id_list)
        timer.random_sleep(end=zhihu.SLEEP)
    return article_list
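
zhihu.Controller drives the paging above but is not shown. A minimal sketch of such an offset controller, where is_end, next_offset, increase and totals mirror the calls in articles_id and everything else is an assumption:

class Controller:
    # Hypothetical offset controller: tracks how many article ids have been
    # fetched against the total reported by the first response.

    def __init__(self):
        self.totals = None   # filled in from the first response
        self._fetched = 0

    def is_end(self):
        return self.totals is not None and self._fetched >= self.totals

    def next_offset(self):
        return self._fetched

    def increase(self, count):
        self._fetched += count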
Example #5
def articles(column_id, warehouse):
    articles_list = articles_id(column_id)
    request_times = {i: 0 for i in articles_list}
    warehouse = column_warehouse(column_id, warehouse)
    while len(articles_list) != 0:
        article_id = articles_list.pop(0)
        try:
            ar = article(article_id, warehouse)
            print(ar)
        except ValueError:
            # re-queue a failed article at most five times
            if request_times.get(article_id) < 5:
                articles_list.append(article_id)
                request_times[article_id] += 1
        timer.random_sleep(end=zhihu.SLEEP)
    # list the URLs of the articles that hit the retry limit
    for article_id, times in request_times.items():
        if times >= 5:
            print(net.article_spider_url(article_id))
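
The retry bookkeeping in articles (re-queue a failed id at most five times, then report it) can be exercised on its own. A self-contained sketch with a fake fetcher standing in for article; all names below are illustrative:

import random


def fetch(article_id):
    # Fake fetcher that fails randomly to exercise the retry path.
    if random.random() < 0.3:
        raise ValueError('transient failure')
    return 'article ' + article_id


queue = ['1', '2', '3']
attempts = {i: 0 for i in queue}
while queue:
    article_id = queue.pop(0)
    try:
        print(fetch(article_id))
    except ValueError:
        if attempts[article_id] < 5:
            queue.append(article_id)
            attempts[article_id] += 1

for article_id, times in attempts.items():
    if times >= 5:
        print('gave up on article', article_id)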