Esempio n. 1
0
def parse_page(url):
    """Scrape a Habr listing page and persist each post via save_news().

    Sets a Russian locale first so that month names produced by
    strftime() match the Russian month names embedded in Habr's
    human-readable date strings.

    :param url: listing-page URL handed to get_html()
    :return: None
    """
    print('пошел собирать заголовки')
    # The locale name differs between Windows and POSIX systems.
    if platform.system() == 'Windows':
        locale.setlocale(locale.LC_ALL, 'russian')
    else:
        locale.setlocale(locale.LC_TIME, 'ru_RU')

    html = get_html(url)
    if not html:
        # Download failed — nothing to parse.
        return
    soup = BeautifulSoup(html, 'html.parser')
    rows = soup.find_all('li', class_='content-list__item content-list__item_post shortcuts_item')
    for row in rows:
        try:
            # Hoist the repeated h2>a lookup and avoid shadowing the
            # `url` parameter with the per-post link.
            link = row.find('h2').find('a')
            post_url = link['href']
            title = link.text
            date = row.find('header', class_='post__meta').find('span', class_='post__time').text
            # Habr uses relative day words ("today"/"yesterday"); substitute
            # the concrete date so strptime() below can parse the string.
            if 'сегодня' in date:
                date = date.replace('сегодня', datetime.now().strftime('%d %B %Y'))
            elif 'вчера' in date:
                yesterday = datetime.now() - timedelta(days=1)
                date = date.replace('вчера', yesterday.strftime('%d %B %Y'))
            try:
                date = datetime.strptime(date, '%d %B %Y в %H:%M')
            except ValueError:
                # Unparseable date: fall back to "now" rather than drop the post.
                date = datetime.now()
            save_news(post_url, title, date)
        except (AttributeError, TypeError):
            # Row without the expected markup (e.g. a promo block) — skip it.
            pass
Esempio n. 2
0
def get_news_snippets():
    """Fetch the Habr search results for 'python' and save each post.

    For every post found, extracts title, link and publication time and
    hands them to save_news(). Does nothing when the download fails.

    :return: None
    """
    html = get_html('https://habr.com/ru/search/?target_type=posts&q=python&order_by=date')
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        # Prefer the modern snake_case find_all over the legacy findAll alias.
        all_news = soup.find('ul', class_='content-list_posts').find_all('li', class_='content-list__item_post')
        for news in all_news:
            # Hoist the anchor lookup instead of querying it twice.
            link = news.find('a', class_='post__title_link')
            title = link.text
            url = link['href']
            published = news.find('span', class_='post__time').text
            published = parse_habr_date(published)
            save_news(title, url, published)
Esempio n. 3
0
def get_news_snippets():
    """Fetch Habr search results for 'python' and save each post.

    Extracts title, link and translated publication date for every post
    and passes them to save_news().

    :return: False, unconditionally (kept for backward compatibility —
        callers may rely on the falsy result; success is not signalled).
    """
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        soup = BeautifulSoup(html, "html.parser")
        all_news = soup.find("ul", class_="content-list_posts").find_all("li", class_="content-list__item_post")
        for news in all_news:
            # Look the title anchor up once and reuse it for text and href.
            link = news.find("a", class_="post__title_link")
            title = link.text
            url = link["href"]
            published = news.find("span", class_="post__time").text
            published = date_translate(published)
            save_news(title, url, published)
    return False
Esempio n. 4
0
def get_news_snippets():
    """Fetch Habr search results for 'python' and store each post.

    Parses the page, extracts title / link / publication time for every
    post snippet and writes them to the database via save_news().

    :return: None
    """
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        # Parse the HTML into an element tree the library can query.
        soup = BeautifulSoup(html, 'html.parser')
        # Select the post items from the results list (snake_case find_all
        # instead of the legacy findAll alias).
        all_news = soup.find("ul", class_="content-list_posts").find_all('li', class_='content-list__item_post')
        for news in all_news:
            # Query the title anchor once; attributes are accessed like dict keys.
            link = news.find('a', class_="post__title_link")
            title = link.text
            url = link["href"]
            published = news.find('span', class_="post__time").text
            published = parse_habr_date(published)
            # Persist the assembled record.
            save_news(title, url, published)
Esempio n. 5
0
def get_news_snippets():
    """Collect 'python' post snippets from Habr search and save them."""
    search_url = "https://habr.com/ru/search/?target_type=posts&q=python&order_by=date"
    html = get_html(search_url)
    if not html:
        return
    # Build the element tree for querying.
    soup = BeautifulSoup(html, "html.parser")
    posts_list = soup.find("ul", class_="content-list_posts")
    for snippet in posts_list.find_all("li", class_="content-list__item_post"):
        # One lookup for the title anchor serves both text and href.
        anchor = snippet.find('a', class_="post__title_link")
        raw_time = snippet.find('span', class_="post__time").text
        save_news(anchor.text, anchor['href'], parse_habr_date(raw_time))
Esempio n. 6
0
def get_python_news():
    """Scrape the python.org blogs page and save each recent post.

    Reads the machine-readable ``datetime`` attribute of each post's
    <time> element; falls back to the current time when it does not
    match the expected ISO date format.

    :return: None
    """
    html = get_html("https://www.python.org/blogs/")
    if html:
        soup = BeautifulSoup(html, "html.parser")
        all_news = soup.find('ul', class_='list-recent-posts')
        all_news = all_news.find_all('li')
        for news in all_news:
            # Hoist the anchor lookup instead of querying it twice.
            link = news.find('a')
            title = link.text
            url = link['href']
            published = news.find('time')['datetime']
            try:
                published = datetime.strptime(published, '%Y-%m-%d')
            except ValueError:
                # Unexpected date format — record "now" instead of failing.
                published = datetime.now()
            save_news(title, url, published)
Esempio n. 7
0
def parsehabr(html, keyword):
    """Parse a Habr articles page and save every article found.

    :param html: page markup (falsy value means the fetch failed)
    :param keyword: search keyword stored alongside each article
    :return: False, unconditionally (kept for backward compatibility —
        success is not signalled to the caller).
    """
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        all_articles = soup.find(class_='tm-articles-list').findAll(class_='tm-articles-list__item')
        for article in all_articles:
            try:
                # Hoist the repeated title-link lookup.
                link = article.find('a', class_='tm-article-snippet__title-link')
                title = link.text
                # hrefs on Habr are site-relative; prepend the host.
                url = 'https://habr.com' + link['href']
                published = article.find('span', class_='tm-article-snippet__datetime-published').find('time')['datetime']
                try:
                    published = datetime.strptime(published, '%Y-%m-%dT%H:%M:%S.%fZ')
                except ValueError:
                    # Unparseable timestamp — fall back to "now".
                    published = datetime.now()
                save_news(title, url, published, keyword)
            except AttributeError:
                # Missing markup in this item (e.g. an ad card) — report and continue.
                print('Error!!!')
    return False