def parse_page(url):
    """Scrape the given Habr listing page and persist every post found on it.

    Relies on module-level helpers `get_html` and `save_news` (defined
    elsewhere in this file). Relative day words ("сегодня"/"вчера") in the
    post timestamp are resolved against the current clock; dates that still
    fail to parse fall back to `datetime.now()`.
    """
    print('пошел собирать заголовки')
    # Russian month names are required for strptime's %B directive; the
    # locale identifier differs between Windows and POSIX systems.
    if platform.system() == 'Windows':
        locale.setlocale(locale.LC_ALL, 'russian')
    else:
        locale.setlocale(locale.LC_TIME, 'ru_RU')
    html = get_html(url)
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        rows = soup.find_all('li', class_='content-list__item content-list__item_post shortcuts_item')
        for row in rows:
            try:
                # Use a dedicated local name; the original code reassigned
                # (shadowed) the `url` parameter inside this loop.
                post_url = row.find('h2').find('a')['href']
                title = row.find('h2').find('a').text
                date = row.find('header', class_='post__meta').find('span', class_='post__time').text
                # Habr renders fresh posts with relative day words; replace
                # them with absolute dates before parsing.
                if 'сегодня' in date:
                    today = datetime.now()
                    date = date.replace('сегодня', today.strftime('%d %B %Y'))
                elif 'вчера' in date:
                    yesterday = datetime.now() - timedelta(days=1)
                    date = date.replace('вчера', yesterday.strftime('%d %B %Y'))
                try:
                    date = datetime.strptime(date, '%d %B %Y в %H:%M')
                except ValueError:
                    # Unknown date format: store "now" rather than drop the post.
                    date = datetime.now()
                save_news(post_url, title, date)
            except (AttributeError, TypeError):
                # Best-effort scraping: silently skip rows whose markup
                # doesn't match the expected structure.
                pass
def get_news_snippets():
    """Fetch the newest "python" search results from Habr and store them.

    Depends on module-level helpers: get_html, parse_habr_date, save_news.
    """
    html = get_html('https://habr.com/ru/search/?target_type=posts&q=python&order_by=date')
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        # find_all is the modern BeautifulSoup name; findAll is a
        # deprecated bs3-era alias.
        all_news = soup.find('ul', class_='content-list_posts').find_all('li', class_='content-list__item_post')
        for news in all_news:
            title = news.find('a', class_='post__title_link').text
            url = news.find('a', class_='post__title_link')['href']
            published = news.find('span', class_='post__time').text
            published = parse_habr_date(published)
            save_news(title, url, published)
def get_news_snippets():
    """Collect fresh "python" posts from the Habr search feed and save each one."""
    page = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if page:
        tree = BeautifulSoup(page, "html.parser")
        post_list = tree.find("ul", class_="content-list_posts")
        for item in post_list.find_all("li", class_="content-list__item_post"):
            link = item.find("a", class_="post__title_link")
            raw_time = item.find("span", class_="post__time").text
            # date_translate converts Habr's human-readable timestamp.
            save_news(link.text, link["href"], date_translate(raw_time))
    # NOTE(review): the function always ends with False, mirroring the
    # original — presumably callers treat it as a status flag; confirm.
    return False
def get_news_snippets():
    """Scrape Habr's "python" search results and write each hit to storage.

    Depends on module-level helpers: get_html, parse_habr_date, save_news.
    """
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        # Parse the raw HTML into an element tree we can query.
        soup = BeautifulSoup(html, 'html.parser')
        # Select every post snippet in the result list (find_all replaces
        # the deprecated findAll alias).
        all_news = soup.find("ul", class_="content-list_posts").find_all('li', class_='content-list__item_post')
        for news in all_news:
            title = news.find('a', class_="post__title_link").text      # headline text
            url = news.find('a', class_="post__title_link")["href"]     # headline link (attributes are accessed dict-style)
            published = news.find('span', class_="post__time").text     # publication-time string
            published = parse_habr_date(published)
            save_news(title, url, published)  # persist the assembled record
def get_news_snippets():
    """Fetch Habr's newest "python" posts and persist each one.

    Uses module-level helpers get_html, parse_habr_date and save_news.
    """
    html = get_html(
        "https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        # Parsed HTML as a navigable soup tree.
        soup = BeautifulSoup(html, "html.parser")
        # find_all supersedes the deprecated findAll alias.
        all_news = soup.find("ul", class_="content-list_posts").find_all(
            "li", class_="content-list__item_post")
        for news in all_news:
            title = news.find('a', class_="post__title_link").text
            url = news.find('a', class_="post__title_link")['href']
            published = news.find('span', class_="post__time").text
            published = parse_habr_date(published)
            save_news(title, url, published)
def get_python_news():
    """Pull the recent-posts list from python.org/blogs and store every entry."""
    page = get_html("https://www.python.org/blogs/")
    if not page:
        return
    soup = BeautifulSoup(page, "html.parser")
    recent = soup.find('ul', class_='list-recent-posts')
    for entry in recent.findAll('li'):
        anchor = entry.find('a')
        raw_date = entry.find('time')['datetime']
        try:
            posted = datetime.strptime(raw_date, '%Y-%m-%d')
        except ValueError:
            # Malformed datetime attribute: fall back to the current time.
            posted = datetime.now()
        save_news(anchor.text, anchor['href'], posted)
def parsehabr(html, keyword):
    """Parse a Habr articles-list page and save every article snippet found.

    Args:
        html: raw page HTML; falsy values mean "nothing to parse".
        keyword: search keyword stored alongside each saved article.

    Returns:
        False in all cases (preserved from the original for any callers
        that check the result).
    """
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        all_articles = soup.find(class_='tm-articles-list').findAll(class_='tm-articles-list__item')
        for article in all_articles:
            try:
                # Look the title link up once instead of twice.
                link = article.find('a', class_='tm-article-snippet__title-link')
                title = link.text
                url = 'https://habr.com' + link['href']
                published = article.find('span', class_='tm-article-snippet__datetime-published').find('time')['datetime']
                try:
                    published = datetime.strptime(published, '%Y-%m-%dT%H:%M:%S.%fZ')
                except ValueError:
                    # Unexpected timestamp format: store "now" instead of dropping the item.
                    published = datetime.now()
                save_news(title, url, published, keyword)
            except AttributeError:
                # Markup mismatch for this item; report and continue with the rest.
                # (Dropped the pointless f-prefix on a placeholder-free string.)
                print('Error!!!')
    return False