Example no. 1
0
    def handle(self, *args, **options):
        """
        Crawl the Kookmin Ilbo (국민일보) article-list pages and register new articles.

        Builds list-page URLs for today (and also for yesterday when fewer
        than six hours of the current day have passed), fetches each page,
        parses the article entries, and hands each one to Article.create_new.
        """
        urls = []

        now = datetime.now()
        dates = [now]
        if now.hour < 6:
            # Requirement: if fewer than six hours of the day have passed,
            # also crawl the previous day so late-night articles are caught.
            dates.append(now - timedelta(1))

        for adate in dates:
            # Walk pages in reverse so older articles are processed first.
            for page in range(10, 0, -1):
                url = f'http://news.kmib.co.kr/article/list.asp?sid1=all&sid2=&page={page}&sdate={adate:%Y%m%d}&st='
                urls.append(url)

        for url in urls:
            try:
                _header = {
                    'User-Agent': self.press.user_agent
                }
                _response = requests.get(url, headers=_header)
                _response.encoding = self.press.encoding
                _response.close()

            except requests.exceptions.ConnectionError:
                # Back off for a random 5-15 minutes, then abort this run;
                # the next scheduled invocation retries from scratch.
                time.sleep(random.randrange(5 * 60, 15 * 60))
                exit()

            soup = BeautifulSoup(_response.text, 'lxml')
            soup_list = soup.select_one('.nws_list')
            if soup_list is None:
                # Layout changed or an error page was served; skip this page.
                continue

            for item in soup_list.select('div.nws'):
                anchor = item.select_one('dt').select_one('a')
                url = anchor['href']
                title = anchor.string
                datetime_str = item.select_one('dd.date').string
                datetime_obj = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M')
                datetime_obj = timezone.make_aware(datetime_obj, timezone=pytz.timezone('Asia/Seoul'), is_dst=False)

                # None means the article already exists (or creation failed);
                # throttle briefly and move on.
                if Article.create_new(press=self.press, url=url, title=title, datetime=datetime_obj) is None:
                    time.sleep(10)
                    continue

                print(f'국민일보: {datetime_obj}: {title}: {url}')

            time.sleep(10)
Example no. 2
0
    def handle(self, *args, **options):
        """
        Crawl the Yonhap News (연합뉴스) headline-list pages and register new articles.

        Builds list-page URLs under self.url_base, fetches each page, parses
        the headline entries, and hands each one to Article.create_new.
        """
        urls = []

        # Walk pages in reverse so older articles are processed first.
        for idx in range(10, 0, -1):
            urls.append("{}/news/{}".format(self.url_base, idx))

        for url in urls:
            try:
                _header = {
                    'User-Agent': self.press.user_agent
                }
                _response = requests.get(url, headers=_header)
                _response.encoding = self.press.encoding
                _response.close()

            except requests.exceptions.ConnectionError:
                # Back off for a random 5-15 minutes, then abort this run;
                # the next scheduled invocation retries from scratch.
                time.sleep(random.randrange(5 * 60, 15 * 60))
                exit()

            soup_body = BeautifulSoup(_response.text, 'lxml')
            soup_list = soup_body.select_one('.headline-list ul')
            if soup_list is None:
                # Layout changed or an error page was served; skip this page.
                continue

            soup_section = soup_list.find_all('li', {'class': 'section02'})

            # Iterate oldest-first so articles are registered in order.
            for item in reversed(soup_section):
                anchor = item.select_one('.news-tl').select_one('a')
                title = anchor.string
                url_orig = anchor['href']
                url_parsed = urlparse(url_orig)
                url = url_parsed.netloc + url_parsed.path

                datetime_string = item.select_one('.lead').select_one('.p-time').string
                # The list page omits the year, so fill in the current year
                # instead of a hard-coded one. NOTE(review): around New Year
                # this can mislabel December articles fetched in January.
                datetime_obj = datetime.strptime(datetime_string, '%m-%d %H:%M').replace(year=datetime.now().year)
                datetime_obj = timezone.make_aware(datetime_obj, timezone=pytz.timezone('Asia/Seoul'), is_dst=False)

                # None means the article already exists (or creation failed);
                # throttle briefly and move on.
                if Article.create_new(press=self.press, url=url, title=title, datetime=datetime_obj) is None:
                    time.sleep(10)
                    continue

                print(f'연합뉴스: {datetime_obj}: {title}: {url}')

            time.sleep(10)