Example #1
import locale
import platform
from datetime import datetime, timedelta

from bs4 import BeautifulSoup


def parse_page(url):
    print('collecting headlines')
    # Switch to a Russian locale so strftime('%B') produces the Russian
    # month names that appear in Habr's dates.
    if platform.system() == 'Windows':
        locale.setlocale(locale.LC_ALL, 'russian')
    else:
        locale.setlocale(locale.LC_TIME, 'ru_RU')

    html = get_html(url)
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        rows = soup.find_all('li', class_='content-list__item content-list__item_post shortcuts_item')
        for row in rows:
            try:
                post_url = row.find('h2').find('a')['href']  # renamed so it does not shadow the url parameter
                title = row.find('h2').find('a').text
                date = row.find('header', class_='post__meta').find('span', class_='post__time').text
                # Habr shows relative dates: 'сегодня' (today) and 'вчера' (yesterday);
                # replace them with absolute dates before parsing.
                if 'сегодня' in date:
                    today = datetime.now()
                    date = date.replace('сегодня', today.strftime('%d %B %Y'))
                elif 'вчера' in date:
                    yesterday = datetime.now() - timedelta(days=1)
                    date = date.replace('вчера', yesterday.strftime('%d %B %Y'))
                try:
                    # 'в' is Russian for 'at', as in '01 мая 2021 в 12:34'
                    date = datetime.strptime(date, '%d %B %Y в %H:%M')
                except ValueError:
                    date = datetime.now()
                save_news(post_url, title, date)
            except (AttributeError, TypeError):
                # Skip rows that lack the expected markup
                pass
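All of these examples rely on a get_html helper that is never shown. A minimal sketch of what it presumably looks like, built on requests; the error handling here is an assumption, not the original implementation:

import requests

def get_html(url):
    # Hypothetical helper: return the page HTML, or None on any network error
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None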
Example #2
def get_habr_snippets():
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        all_news = soup.find('ul', class_='content-list_posts').find_all('li', class_='content-list__item_post')
        for news in all_news:
            link = news.find('a', class_='post__title_link')
            title = link.text
            url = link['href']
            published = news.find('span', class_='post__time').text
            print(title, url, published)
Example #3
def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            news_text = soup.find('div', class_='post__text-html').decode_contents()
            if news_text:
                news.text = news_text
                db.session.add(news)
                db.session.commit()
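Many of these examples also assume a News model and a db session defined elsewhere in the project (a Flask-SQLAlchemy app, judging by News.query and db.session). A plausible model matching the attributes used across the examples (url, title, published, text); the exact column types and constraints are assumptions:

from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

class News(db.Model):
    # Hypothetical model; attribute names follow their usage in the examples
    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String, nullable=False)
    url = db.Column(db.String, unique=True, nullable=False)
    published = db.Column(db.DateTime)
    text = db.Column(db.Text, nullable=True)  # filled in later by get_news_content()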
Example #4
def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            article = soup.find('div', class_='tm-article-body').decode_contents()
            if article:
                news.text = article
                db.session.add(news)
                db.session.commit()
Example #5
def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            # .decode_contents() returns the inner HTML instead of plain text
            article = soup.find('div', class_='post__text-html').decode_contents()
            if article:
                news.text = article
                db.session.add(news)
                db.session.commit()
Example #6
def get_news_snippets():
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        soup = BeautifulSoup(html, 'html.parser')  # Parse the HTML into an element tree the library can query
        all_news = soup.find("ul", class_="content-list_posts").find_all('li', class_='content-list__item_post')  # Select the snippet elements
        for news in all_news:
            title = news.find('a', class_="post__title_link").text  # Extract the headline text
            url = news.find('a', class_="post__title_link")["href"]  # Extract the link (attributes are accessed like dictionary keys)
            published = news.find('span', class_="post__time").text  # Extract the publication time
            published = parse_habr_date(published)
            save_news(title, url, published)  # Once assembled, write the record to the database
Example #7
def get_news_snippets():
    html = get_html("https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if not html:
        return False
    soup = BeautifulSoup(html, "html.parser")
    all_news = soup.find("ul", class_="content-list_posts").find_all("li", class_="content-list__item_post")
    for news in all_news:
        title = news.find("a", class_="post__title_link").text
        url = news.find("a", class_="post__title_link")["href"]
        published = news.find("span", class_="post__time").text
        published = date_translate(published)
        save_news(title, url, published)
    return True
Example #8
def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            news_text = soup.find(
                'div',
                class_='post__text-html').decode_contents()  # get the inner HTML
            if news_text:
                news.text = news_text  # store it in the text column of the news table
                db.session.add(news)  # add the row to the session
                db.session.commit()
Example #9
def get_news_snippets():  # Snippets are the small news blocks on the page
    html = get_html(
        "https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        all_news = soup.find('ul', class_='content-list_posts').find_all(
            'li', class_='content-list__item_post')
        for news in all_news:
            title = news.find('a', class_='post__title_link').text
            url = news.find('a', class_='post__title_link')['href']
            published = news.find('span', class_='post__time').text
            published = parse_habr_date(published)
            save_news(title, url, published)
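parse_habr_date (like the date_translate helper in Example #7) is never defined either. Judging by Example #1, which inlines the same logic, it converts Habr's Russian relative dates into datetime objects; a sketch under that assumption, which needs the Russian locale configured as in Example #1:

from datetime import datetime, timedelta

def parse_habr_date(date):
    # Hypothetical helper mirroring the inline date handling of Example #1
    if 'сегодня' in date:  # 'today'
        date = date.replace('сегодня', datetime.now().strftime('%d %B %Y'))
    elif 'вчера' in date:  # 'yesterday'
        yesterday = datetime.now() - timedelta(days=1)
        date = date.replace('вчера', yesterday.strftime('%d %B %Y'))
    try:
        return datetime.strptime(date, '%d %B %Y в %H:%M')
    except ValueError:
        return datetime.now()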
Example #10
def habr_news_func():
    html = get_html(
        "https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        all_habr_news = soup.find('ul', class_='content-list_posts').find_all(
            'li', class_='content-list__item_post')
        for habr_news in all_habr_news:
            title = habr_news.find('a', class_='post__title_link').text
            url = habr_news.find('a', class_='post__title_link')['href']
            published = habr_news.find('span', class_='post__time').text
            published = parse_habr_date(published)
            save_news(title, url, published)
Example #11
def get_text():
    print('Collecting article texts')
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        print(f'{news.id} - has no text yet')
        html = get_html(news.url)
        print(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            full = soup.find('div', class_='post__body post__body_full').decode_contents()
            if full:
                news.text = full
                db.session.add(news)
                db.session.commit()
Example #12
def get_python_news():
    html = get_html("https://www.python.org/blogs/")
    if html:
        soup = BeautifulSoup(html, "html.parser")
        all_news = soup.find('ul', class_='list-recent-posts').find_all('li')
        for news in all_news:
            title = news.find('a').text
            url = news.find('a')['href']
            published = news.find('time')['datetime']
            try:
                published = datetime.strptime(published, '%Y-%m-%d')
            except ValueError:
                published = datetime.now()
            save_news(title, url, published)
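save_news is also assumed rather than shown. Given how the News model is used elsewhere, a plausible sketch that skips URLs already stored; the duplicate check is an assumption:

def save_news(title, url, published):
    # Hypothetical helper: insert a snippet unless the URL is already in the table
    exists = News.query.filter(News.url == url).count()
    if not exists:
        news = News(title=title, url=url, published=published)
        db.session.add(news)
        db.session.commit()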
Example #13
def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            # Plain text only:
            # article = soup.find('div', class_='post__text-html').text
            # Full HTML markup:
            article = soup.find('div',
                                class_='post__text-html').decode_contents()
            if article:
                news.text = article
                db.session.add(news)
                db.session.commit()
Example #14
def get_news_snippets():
    html = get_html(
        "https://habr.com/ru/search/?target_type=posts&q=python&order_by=date")
    if html:
        soup = BeautifulSoup(
            html, "html.parser")  # Parse the HTML into a soup tree
        all_news = soup.find("ul", class_="content-list_posts").find_all(
            "li", class_="content-list__item_post")
        for news in all_news:
            title = news.find('a', class_="post__title_link").text
            url = news.find('a', class_="post__title_link")['href']
            published = news.find('span', class_="post__time").text
            published = parse_habr_date(published)
            save_news(title, url, published)
Example #15
def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        try:
            if html:
                soup = BeautifulSoup(html, "html.parser")
                # Use .decode_contents() instead of .text to keep the HTML markup
                news_text = soup.find(
                    'div', class_='post__text-html').decode_contents()
                if news_text:
                    news.text = news_text
                    db.session.add(news)
                    db.session.commit()
        except AttributeError:
            print("Данных <div class='post__text-html' нет на странице")
Example #16
def get_news_content():
    news_without_text = News.query.filter(
        News.text.is_(None)
    )  # is_() compares against SQL NULL; the query is iterable even without calling .all()
    for news in news_without_text:
        html = get_html(news.url)
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            news_text = soup.find(
                'div', class_='post__text-html'
            ).decode_contents()  # decode_contents() returns the block's HTML, not just its text
            if news_text:
                news.text = news_text
                db.session.add(news)
                db.session.commit()
Example #17
def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)

        if not html:
            continue

        soup = BeautifulSoup(html, "html.parser")
        article = soup.find("div", class_="post__text-html").decode_contents()

        if not article:
            continue

        news.text = article
        db.session.add(news)
        db.session.commit()
Example #18
def get_news_content():
    news_without_text = News.query.filter(News.text.is_(None))
    for news in news_without_text:
        html = get_html(news.url)
        try:
            if html:
                soup = BeautifulSoup(html, 'html.parser')
                article = soup.find(
                    'div', class_='post__text-html'
                ).decode_contents()  # decode_contents() returns the div's inner HTML rather than just its text
                if article:
                    news.text = article
                    db.session.add(news)
                    db.session.commit()
        except AttributeError:
            continue
Example #19
def get_news_snippets():
    html = get_html(
        'https://habr.com/ru/search/?target_type=posts&q=python&order_by=date')
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        all_news = soup.find('ul', class_='content-list_posts').find_all(
            'li', class_='content-list__item_post')
        for news in all_news:
            try:
                title = news.find('a', class_='post__title_link').text
                url = news.find('a', class_='post__title_link')['href']
                published = news.find('span', class_='post__time').text
                published = parse_habr_date(published)
                save_news(title, url, published)
                print(title, url, published)
            except AttributeError:
                # Fall back to the alternative snippet markup Habr sometimes serves
                title = news.find('a', class_='preview-data__title-link').text
                url = news.find('a', class_='preview-data__title-link')['href']
                published = news.find(
                    'span', class_='preview-data__time-published').text
                published = parse_habr_date(published)
                save_news(title, url, published)
Example #20
def get_news():
    hubs = ['python', 'web_testing', 'it_testing', 'data_engineering', 'bigdata']
    for hub in hubs:
        url = 'https://habr.com/ru/hub/' + hub
        parsehabr(get_html(url), hub)
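parsehabr is not shown in this example. A minimal sketch, assuming it takes the raw hub HTML plus the hub slug and reuses the selectors from the search-page examples above:

from bs4 import BeautifulSoup

def parsehabr(html, hub):
    # Hypothetical implementation: parse one hub page and store its snippets
    if not html:
        return
    soup = BeautifulSoup(html, 'html.parser')
    posts = soup.find('ul', class_='content-list_posts').find_all(
        'li', class_='content-list__item_post')
    for post in posts:
        link = post.find('a', class_='post__title_link')
        published = post.find('span', class_='post__time').text
        # The hub slug could be stored alongside each snippet to tell sources apart
        save_news(link.text, link['href'], parse_habr_date(published))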