import re
from urllib.parse import urlsplit

import bs4
import pandas as pd
import requests
from tqdm import tqdm

# clean_text, replace_month_with_digit and the months mapping are project
# helpers that are not part of this listing; a sketch of them follows the
# first crawler below.


def crawlLinks(links):
    rows = []

    for link in tqdm(links):
        try:
            rq = requests.get(link)

            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                meta = page.select('.head')[0]
                headline = meta.h1.text.strip()

                # Skip pages without a headline, e.g. roundup teasers such as
                # "Вижте 50-те най-четени мнения в сайта ни за годината"
                # ("See the 50 most-read opinions on our site this year")
                if headline == '':
                    continue
                info = clean_text(
                    meta.select('.article-date')[0].text.split('(')[0]) if len(
                        meta.select('.article-date')) > 0 else ''

                # 30.12.2019 10:33
                articleDate = info.split(';')[0] if info != '' else ''
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(
                        1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate,
                                                 format='%d.%m.%Y  %H:%M')

                author = info.split(';')[1] if ';' in info else None
                views = requests.get(
                    'https://www.24chasa.bg/Article/{id}/4'.format(
                        id=re.search(r'(\d+)$', link).group(1))).text
                article_text = ' '.join([
                    clean_text(par.text)
                    for par in page.select('.content')[0].select('p')
                ]).split('Tweet')[0] if len(
                    page.select('.content')) > 0 else ''

                # shares - will need selenium for that
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text

                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'author': clean_text(author),
                    'date': articleDate,
                    #'shares': shares,
                    'views': clean_text(views),
                    'article_text': article_text
                })
        except Exception:
            # skip links that fail to download or parse
            continue

    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []

    for link, section in tqdm(list(links)):
        try:
            rq = requests.get(link)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")

                articleTitle = page.select('h1')[0].text if len(
                    page.select('h1')) > 0 else ''
                articleSubtitle = page.select('h2.subtitle')[0].text if len(
                    page.select('h2.subtitle')) > 0 else ''

                articleDate = page.select(
                    '.article-time')[0].text.split(', oбновена')[0] if len(
                        page.select('.article-time')) > 0 else ''
                articleDate = clean_text(articleDate)
                month_name = re.search('([а-яА-Я]+)', articleDate)
                if month_name is not None:
                    month_name = month_name.group(1)
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name))
                    articleDate = pd.to_datetime(articleDate,
                                                 format='%d %m %Y,  %H:%M')

                category = page.select(
                    'div.article-category')[0].a.text if len(
                        page.select('div.article-category')) > 0 else ''
                comments = page.select('.commentsButtonNumber')[0].text if len(
                    page.select('.commentsButtonNumber')) > 0 else ''
                article_text = ' '.join([
                    clean_text(par.text)
                    for par in page.select('.article-text')[0].select('p')
                ])

                # article-tags
                # select() always returns a list, so the original
                # "is not None" check was always true; test for emptiness
                tags = page.select('.article-tags')
                tags = ' - '.join([
                    clean_text(tag.text) for tag in tags[0].select('a')
                ]) if tags else None

                rows.append({
                    'link': link,
                    'section': section,
                    'comments': clean_text(comments),
                    'title': clean_text(articleTitle),
                    'subtitle': clean_text(articleSubtitle),
                    'date': articleDate,
                    'category': category,
                    'tags': tags,
                    'article_text': article_text
                })
        except Exception:
            # skip links that fail to download or parse
            continue

    return pd.DataFrame(rows)
def get_date(date):
    date = date.replace('Публикувана в ', '')
    # next() with a default avoids the IndexError a bare [0] would raise
    # when no known month name appears in the string
    month_name = next((m for m in months if m in date), None)
    # "15:54 на 21 февруари, 2020 год."
    # "12:03 на 3 септември, 2020 год."
    articleDate = date.replace(
        month_name,
        replace_month_with_digit(month_name)) if month_name is not None else date
    articleDate = pd.to_datetime(articleDate, format='%H:%M на %d %m, %Y год.')

    return articleDate
def crawlLinks(links):
    rows = []

    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = urlsplit(link).netloc
            category = re.search(domain + '/([^/]+)', link).group(1)

            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")

                headline = page.select('.content')[0].h1.text.strip()
                meta = clean_text(page.select('.article-tools')[0].text) if len(page.select('.article-tools')) > 0 else '' 

                # 14:41, 30 дек 19
                articleDate = re.search('(.*),', meta).group(1) if re.search('(.*),', meta) is not None else ''
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(month_name, replace_month_with_digit(
                        month_name)) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%H:%M, %d %m %y')

                views = re.search(r'(\d+)$', meta).group(1) if re.search(r'(\d+)$', meta) is not None else ''
                comments = page.select('.comments')[0].text.strip() if len(page.select('.comments')) > 0 else ''
                article_body = page.select('.article-content')[0].select('p') if len(page.select('.article-content')) > 0 else []
                if article_body:
                    author = article_body[0].text
                    article_text = ' '.join([clean_text(par.text)
                                            for par in article_body[1:] if '<' not in par.text])
                    # drop inline style/script residue up to the first '}'
                    # (a no-op when none is present)
                    article_text = article_text[article_text.find('}') + 1:].strip()
                else:
                    article_text = ''
                    author = ''

                tags = ' - '.join(
                    [clean_text(tag.text) for tag in page.select('.tags')[0].select('li') if tag != ',' and tag != "\n"]) \
                    if len(page.select('.tags')) > 0 else None
                tags = clean_text(tags)
                
                rows.append({'link': link,
                             'title': clean_text(headline),
                             'author': clean_text(author),
                             'date': articleDate,
                             'category': category,
                             'comments': clean_text(comments),
                             'views': clean_text(views),
                             'tags': tags,
                             'article_text': article_text})
        except Exception:
            # skip links that fail to download or parse
            continue

    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []

    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = urlsplit(link).netloc
            category = re.search(domain + '/([^/]+)', link).group(1)

            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                titles = page.select('.text-wrapper')[0]
                headline = titles.h2.text
                subtitle = page.select('.text-wrapper')[0].p.text
                meta = page.select('.additional-info')[0] if len(
                    page.select('.additional-info')) > 0 else ''
                date_author_info = clean_text(
                    meta.select('.timestamp')[0].text) if len(
                        meta.select('.timestamp')) > 0 else ''
                author = re.search(r':([А-Яа-я\s]+$)', date_author_info)
                author = author.group(
                    1).strip() if author is not None else None

                # 10:21                   27 декември 2019
                articleDate = ' '.join(date_author_info.split(
                    '|')[0:2]).strip() if date_author_info != '' else ''
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(
                        1) if month_name is not None else None
                    articleDate = articleDate.replace(month_name, replace_month_with_digit(month_name)) \
                        if month_name is not None else articleDate
                    articleDate = pd.to_datetime(
                        articleDate, format='%H:%M                   %d %m %Y')

                views = meta.select('#articleViews')[0].text if len(
                    meta.select('#articleViews')) > 0 else ''
                comments = meta.select('.comments')[0].text if len(
                    meta.select('.comments')) > 0 else ''
                article_text = ' '.join([
                    par.text.strip()
                    for par in page.select('.article-body')[0].select('p')
                ]) if len(page.select('.article-body')) > 0 else ''
                """
                window._io_config=window._io_config||{};window._io_config["0.2.0"]=window._io_config["0.2.0"]||[];window._io_config["0.2.0"].push({"page_url":"https:\/\/dnes.dir.bg\/temida\/vks-i-da-otkradnat-kolata-tryabva-da-si-plashtash-lizinga"
"page_url_canonical":"https:\/\/dnes.dir.bg\/temida\/vks-i-da-otkradnat-kolata-tryabva-da-si-plashtash-lizinga"
"page_title":"\u0412\u041a\u0421:\u0418\u0434\u0430\u043e\u0442\u043a\u0440\u0430\u0434\u043d\u0430\u0442\u043a\u043e\u043b\u0430\u0442\u0430
\u0442\u0440\u044f\u0431\u0432\u0430\u0434\u0430\u0441\u0438\u043f\u043b\u0430\u0449\u0430\u0448\u043b\u0438\u0437\u0438\u043d\u0433\u0430|\u0414\u043d\u0435\u0441.dir.bg"
"page_type":"article"
"page_language":"bg"
"article_authors":["\u041a\u0430\u043b\u0438\u043d\u041a\u0430\u043c\u0435\u043d\u043e\u0432"]
"article_categories":["\u0422\u0435\u043c\u0438\u0434\u0430"]
"article_subcategories":[]
"article_type":"image"
"article_word_count":425
"article_publication_date":"Fri
03Jan2020:52:40+0200"});
"""

                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'subtitle': clean_text(subtitle),
                    'comments': clean_text(comments),
                    'author': clean_text(author),
                    'date': articleDate,
                    'views': clean_text(views),
                    'category': category,
                    'article_text': article_text
                })
        except Exception:
            # skip links that fail to download or parse
            continue

    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []
    for link in tqdm(links):
        try:
            rq = requests.get(link)

            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                category = page.select(
                    '.gtm-ArticleBreadcrumb-click'
                )[0].text if len(
                    page.select('.gtm-ArticleBreadcrumb-click')) > 0 else ''
                headline = page.select('.title-wrap-roboto')[0].h1.text.strip(
                ) if len(page.select('.title-wrap-roboto')) > 0 else ''

                # Skip video-only pages without a headline, e.g.
                # "Гледайте цялата емисия" ("Watch the full broadcast")
                if headline == '':
                    continue

                subtitle = page.select('.article-sub-title')[0].text.strip(
                ) if len(page.select('.article-sub-title')) > 0 else ''
                #author = page.select('.author-name')
                #author = author[0].text if author is not None else None

                # 21 ноември 2019  19:42
                articleDate = page.select('.date-time')[0].text if len(
                    page.select('.date-time')) > 0 else ''
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(
                        1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate,
                                                 format='%d %m %Y  %H:%M')

                article_body = page.select('.article-body')[0].find_all(
                    'p',
                    a=False) if len(page.select('.article-body')) > 0 else ''
                # "in par" tested the Tag's children, not its text, so the
                # gallery/photo/video filter never matched; test par.text
                article_text = ' '.join([
                    clean_text(par.text) for par in article_body
                    if 'ГАЛЕРИЯ' not in par.text and 'СНИМКИ' not in par.text
                    and 'ВИДЕО' not in par.text
                ])

                #tags

                tags_start_phrase = 'w2g.targeting = '
                start_ind = rq.text.find(tags_start_phrase)
                end_ind = rq.text.find(';', start_ind)
                aoi = rq.text[start_ind +
                              len(tags_start_phrase):end_ind].strip()
                tags = re.findall('([а-яА-Я]+)', aoi)
                tags = ' - '.join(
                    clean_text(tag.replace("'", '').strip())
                    for tag in tags) if len(tags) > 0 else None

                #shares
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text
                """
                function
                getCookie(k)
                {
                return (document.cookie.match('(^|; )' + k + '=([^;]*)') | | 0)[2]
                }
                // header
                bidding
                targeting.Main
                script is loaded
                via
                GTM
                var
                w2g = w2g | | {};
                w2g.targeting = {
                cid: 'news',
                bid: 'view',
                aid: '273680',
                catid: '12',
                subcatid: '4',
                procatid: '1',
                prpage: '0',
                safe: '1',
                tag: 'тенис',
                tag: 'джейми',
                tag: 'мъри',
                tag: 'григор',
                tag: 'димитров',
                tag: 'александър',
                tag: 'лазаров',
                tag: 'великобритания',
                tag: 'българия'
            };
                """

                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'subtitle': clean_text(subtitle),
                    #'author': clean_text(author),
                    'date': articleDate,
                    'tags': tags,
                    #'shares': shares,
                    'category': category,
                    'article_text': article_text
                })
        except Exception:
            # skip links that fail to download or parse
            continue

    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []

    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = urlsplit(link).netloc
            category = re.search(domain + '/([^/]+)', link).group(1)

            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                info = page.select('#news_details')[0] if len(
                    page.select('#news_details')) > 0 else ''

                headline = info.h1.text.strip()
                subtitle = info.h2.text.strip()

                meta = info.select('.info')[0].select('div')

                # 30 Декември, 2019 15:26
                articleDate = meta[0].text.split('Публикувана:')[1].strip()
                month_name = re.search('([а-яА-Я]+)', articleDate)
                month_name = month_name.group(
                    1) if month_name is not None else None
                articleDate = articleDate.replace(
                    month_name, replace_month_with_digit(
                        month_name)) if month_name is not None else articleDate
                articleDate = pd.to_datetime(articleDate,
                                             format='%d %m, %Y %H:%M')

                # meta[1] holds "<comments> ... <views>"; guard the index
                # (the original tested len > 0 but read meta[1]) and
                # pre-initialise so both fields exist even when it is missing
                comments = views = ''
                meta = meta[1].text.strip() if len(meta) > 1 else ''
                if meta != '':
                    comments = re.search(r'(^\d+)', meta).group(1)
                    views = re.search(r'(\d+)$', meta).group(1)

                author = page.select('.linksProfile')[0].text if len(
                    page.select('.linksProfile')) > 0 else ''

                article_body = page.select('#news_content')[0].select(
                    'p') if len(page.select('#news_content')) > 0 else ''
                article_text = ' '.join([
                    clean_text(par.text) for par in article_body
                    if '<' not in par.text
                ]) if article_body != '' else ''
                tags = ' - '.join(
                    [clean_text(tag.text) for tag in page.select('.tags')[0].select('a') if tag != ',' and tag != "\n"]) \
                    if len(page.select('.tags')) > 0 else None

                #shares
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text

                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'subtitle': clean_text(subtitle),
                    'author': clean_text(author),
                    'date': articleDate,
                    'category': category,
                    'comments': clean_text(comments),
                    # 'shares': shares,
                    'views': clean_text(views),
                    'tags': tags,
                    'article_text': article_text
                })
        except Exception:
            # skip links that fail to download or parse
            continue

    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []

    for link in tqdm(links):
        try:
            rq = requests.get(link)

            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")

                category = page.find(
                    'div', attrs={
                        'class': 'printing_large_text_toolbar'
                    }).text if page.find(
                        'div', attrs={'class': 'printing_large_text_toolbar'
                                      }) is not None else ''
                headline = page.select('#news_heading')[0].h1.text.strip(
                ) if len(page.select('#news_heading')) > 0 else ''
                shares = page.select(".social_count")[0].text.strip() if len(
                    page.select(".social_count")) > 0 else ''
                comments = page.select('.comments')[0].text.strip() if len(
                    page.select('.comments')) else ''
                views = page.select(
                    '.btn_reads')[0].text.split('Прочетена')[1].strip() if len(
                        page.select('.btn_reads')) > 0 else ''
                article_text = clean_text(
                    page.select('#news_content')[0].text) if len(
                        page.select('#news_content')) > 0 else ''

                # 01 януари 2020 | 16:26 - Обновена
                articleDate = page.find('td', attrs={'id': 'news_heading'})
                articleDate = articleDate.find(
                    'span', attrs={
                        'class': 'dark_text'
                    }).text if articleDate is not None and articleDate.find(
                        'span', attrs={'class': 'dark_text'
                                       }) is not None else ''
                articleDate = articleDate.split(
                    '- Обновена')[0].strip() if articleDate != '' else ''
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(
                        1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate,
                                                 format='%d %m %Y | %H:%M')

                author = page.select(
                    '#author_box')[0].select('h5')[0].a.text if len(
                        page.select('#author_box')) > 0 else ''
                tags = " - ".join([
                    clean_text(i.text)
                    for i in page.find('div', attrs={
                        'class': 'news_tags'
                    }).findAll('span')
                ])

                rows.append({
                    'link': link,
                    'title': clean_text(headline),
                    'author': clean_text(author),
                    'date': articleDate,
                    'category': clean_text(category),
                    'tags': tags,
                    'comments': clean_text(comments),
                    'views': clean_text(views),
                    'shares': clean_text(shares),
                    'article_text': article_text
                })
        except Exception:
            # skip links that fail to download or parse
            continue

    return pd.DataFrame(rows)
def crawlLinks(links):
    rows = []

    for link in tqdm(list(links)):
        try:
            rq = requests.get(link)
            domain = urlsplit(link).netloc
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")

                # find() does not accept a bare dict; match the class directly
                if page.find(class_='article-post'):
                    body = page.select('.article-post')[0]
                    headline = body.select('h1')[0].text if len(body.select('h1')) else ''
                    subtitle = None

                    #metadata
                    location = body.select('.location')[0].text if len(body.select('.location')) else ''
                    articleDate = body.select('.fa-calendar')[0].text if len(body.select('.fa-calendar')) else ''
                    views = body.select('.fa-eye')[0].text if len(body.select('.fa-eye')) else ''
                    comments = body.select('.fa-comments-o')[0].text if len(body.select('.fa-comments-o')) else ''
                    comments = comments.split(" ")[0] if comments != '' else ''
                    # select() returns a list, so index it before chaining,
                    # and read the nested <a> as tag.a, not tag['a']
                    tags = ' - '.join([tag.a.text for tag in body.select('.tags')[0].select('li')]) if len(body.select('.tags')) > 0 else ''
                else: 
                    headline = page.select('.post-title')[0].text if len(page.select('.post-title')) else ''
                    subtitle = page.select('.post-subtitle')[0].text if len(page.select('.post-subtitle')) else ''

                    #metadata
                    simpleShare = page.select('.simple-share')[0] if len(page.select('.simple-share')) > 0 else None
                    li = simpleShare.find_all('li') if simpleShare is not None else []
                    location = li[0].text if len(li) > 0 else ''
                    articleDate = li[1].text if len(li) > 1 else ''
                    views = li[2].text if len(li) > 2 else ''
                    views = views.split(" ")[0] if views != '' else ''
                    comments = li[3].text if len(li) > 3 else ''
                    comments = comments.split(" ")[0] if comments != '' else ''
                    tags = ' - '.join([tag.a.text for tag in page.select('.tags-widget')[0].select('li')[1:]]) if len(page.select('.tags-widget')) > 0 else ''

                # 30 Дек. 2019, 16:13
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    if month_name is not None:
                        month_name = month_name.group(1)
                        articleDate = articleDate.replace(month_name, replace_month_with_digit(month_name))
                        articleDate = pd.to_datetime(articleDate, format='%d %m %Y,  %H:%M')

                article_text = clean_text(page.select('.post-content')[0].select('div')[2].text) if len(page.select('.post-content')) > 0 else ''

                rows.append({'link': link,
                             'title': clean_text(headline),
                             'subtitle': clean_text(subtitle),
                             'location': clean_text(location),
                             'comments': clean_text(comments),
                             'date': articleDate,
                             'views': clean_text(views),
                             'category': category,
                             'tags': clean_text(tags),
                             'article_text': article_text})
        except Exception:
            # skip links that fail to download or parse
            continue

    return pd.DataFrame(rows)