def handle(self, *args, **options):
    """Crawl the Kukmin Ilbo (국민일보) article-list pages and register new articles.

    Builds list-page URLs for today (and for yesterday as well, when it is
    before 06:00, since late items may still belong to the previous day),
    fetches each page, and creates an ``Article`` row per listed item.
    Sleeps 10 seconds between items as a politeness delay toward the site.
    """
    urls = []
    now = datetime.now()
    dates = [now]
    # Requirement: when fewer than 6 hours have passed since midnight,
    # crawl the previous day's list as well.
    if now.hour < 6:
        dates.append(now - timedelta(1))
    for adate in dates:
        # Walk pages 10 down to 1 so older entries are visited first.
        for page in range(10, 0, -1):
            urls.append(
                f'http://news.kmib.co.kr/article/list.asp'
                f'?sid1=all&sid2=&page={page}&sdate={adate:%Y%m%d}&st='
            )
    for url in urls:
        try:
            _header = {'User-Agent': self.press.user_agent}
            _response = requests.get(url, headers=_header)
            _response.encoding = self.press.encoding
            _response.close()
        except requests.exceptions.ConnectionError:
            # Back off 5-15 minutes before terminating — presumably so an
            # external scheduler does not restart the crawler immediately.
            # TODO(review): confirm exit-on-error is intended vs. retrying.
            time.sleep(random.randrange(5 * 60, 15 * 60))
            exit()
        soup = BeautifulSoup(_response.text, 'lxml')
        soup_list = soup.select_one('.nws_list')
        for item in soup_list.select('div.nws'):
            # NOTE: the previous version also extracted the summary
            # ('dd.tx') and thumbnail ('p.pic a img') here, but never used
            # them; the dead code (which could crash on a missing summary
            # string) has been removed.
            anchor = item.select_one('dt').select_one('a')
            article_url = anchor['href']
            title = anchor.string
            datetime_str = item.select_one('dd.date').string
            datetime_obj = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M')
            # List pages carry naive KST timestamps; make them timezone-aware.
            datetime_obj = timezone.make_aware(
                datetime_obj, timezone=pytz.timezone('Asia/Seoul'), is_dst=False)
            # create_new() returns None when the URL is already known;
            # skip it after the usual politeness delay.
            if Article.create_new(press=self.press, url=article_url,
                                  title=title, datetime=datetime_obj) is None:
                time.sleep(10)
                continue
            print(f'국민일보: {datetime_obj}: {title}: {article_url}')
            time.sleep(10)
def handle(self, *args, **options):
    """Crawl the Yonhap News (연합뉴스) headline-list pages and register new articles.

    Visits list pages 10 down to 1 under ``self.url_base``, parses each
    headline entry, and creates an ``Article`` row for every URL not seen
    before. Sleeps 10 seconds between items as a politeness delay.
    """
    urls = ['{}/news/{}'.format(self.url_base, idx) for idx in range(10, 0, -1)]
    for url in urls:
        try:
            _header = {'User-Agent': self.press.user_agent}
            _response = requests.get(url, headers=_header)
            _response.encoding = self.press.encoding
            _response.close()
        except requests.exceptions.ConnectionError:
            # Back off 5-15 minutes before terminating — presumably so an
            # external scheduler does not restart the crawler immediately.
            # TODO(review): confirm exit-on-error is intended vs. retrying.
            time.sleep(random.randrange(5 * 60, 15 * 60))
            exit()
        soup_body = BeautifulSoup(_response.text, 'lxml')
        soup_list = soup_body.select_one('.headline-list ul')
        soup_section = soup_list.find_all('li', {'class': 'section02'})
        # The page lists newest entries first; iterate in reverse so that
        # articles are registered oldest-first.
        for item in reversed(soup_section):
            headline = item.select_one('.news-tl').select_one('a')
            title = headline.string
            url_parsed = urlparse(headline['href'])
            # Keep host + path only (drops scheme, query and fragment).
            article_url = url_parsed.netloc + url_parsed.path
            datetime_string = item.select_one('.lead').select_one('.p-time').string
            # The list page omits the year ('%m-%d %H:%M'). Previously the
            # year was hard-coded to 2019; assume the current year instead,
            # rolling back one year if that would place the article in the
            # future (e.g. a Dec 31 item crawled on Jan 1).
            now = datetime.datetime.now()
            datetime_obj = datetime.datetime.strptime(
                datetime_string, '%m-%d %H:%M').replace(year=now.year)
            if datetime_obj > now:
                datetime_obj = datetime_obj.replace(year=now.year - 1)
            # Timestamps are naive KST; make them timezone-aware.
            datetime_obj = timezone.make_aware(
                datetime_obj, timezone=pytz.timezone('Asia/Seoul'), is_dst=False)
            # create_new() returns None when the URL is already known;
            # skip it after the usual politeness delay.
            if Article.create_new(press=self.press, url=article_url,
                                  title=title, datetime=datetime_obj) is None:
                time.sleep(10)
                continue
            print(f'연합뉴스: {datetime_obj}: {title}: {article_url}')
            time.sleep(10)