# Shared imports used by the crawlers in this section.
import re
from urllib.parse import urlsplit

import bs4
import pandas as pd
import requests
from tqdm import tqdm


# 24chasa.bg: collect title, author, date, view count, and body text for each article link.
def crawlLinks(links):
    articles_content = pd.DataFrame()
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                meta = page.select('.head')[0]
                headline = meta.h1.text.strip()
                # Вижте 50-те най-четени мнения в сайта ни за годината
                if headline == '':
                    continue
                info = clean_text(
                    meta.select('.article-date')[0].text.split('(')[0]) if len(
                        meta.select('.article-date')) > 0 else ''
                # e.g. "30.12.2019 10:33"
                articleDate = info.split(';')[0] if info != '' else ''
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%d.%m.%Y %H:%M')
                author = info.split(';')[1] if ';' in info else None
                # The view counter is served by a separate endpoint keyed on the article id.
                views = requests.get(
                    'https://www.24chasa.bg/Article/{id}/4'.format(
                        id=re.search(r'(\d+)$', link).group(1))).text
                article_text = ' '.join([
                    clean_text(par.text)
                    for par in page.select('.content')[0].select('p')
                ]).split('Tweet')[0] if len(page.select('.content')) > 0 else ''
                # shares - will need selenium for that
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text
                articles_content = articles_content.append(
                    {
                        'link': link,
                        'title': clean_text(headline),
                        'author': clean_text(author),
                        'date': articleDate,
                        # 'shares': shares,
                        'views': clean_text(views),
                        'article_text': article_text
                    },
                    ignore_index=True)
        except Exception:
            # Skip articles that fail to download or parse.
            continue
    return articles_content

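# The crawlers in this section rely on helpers defined elsewhere in the project:
# clean_text(), replace_month_with_digit(), and a `months` dict (used by get_date
# further down). The definitions below are only a minimal sketch of plausible
# implementations so that the snippets run standalone; the real helpers may differ
# (for example, some sites abbreviate month names such as "дек", which the real
# mapping would also need to cover).
months = {
    'януари': '01', 'февруари': '02', 'март': '03', 'април': '04',
    'май': '05', 'юни': '06', 'юли': '07', 'август': '08',
    'септември': '09', 'октомври': '10', 'ноември': '11', 'декември': '12'
}


def replace_month_with_digit(month_name):
    # Assumed behaviour: map a Bulgarian month name to its zero-padded month number,
    # falling back to the input so str.replace() becomes a no-op on unknown names.
    return months.get(month_name.strip().lower(), month_name)


def clean_text(text):
    # Assumed behaviour: tolerate None and collapse runs of whitespace.
    if text is None:
        return None
    return re.sub(r'\s+', ' ', str(text)).strip()
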
# Crawler for (link, section) pairs: collects title, subtitle, date, category,
# comment count, tags, and body text.
def crawlLinks(links):
    articlesContent = pd.DataFrame()
    for link, section in tqdm(list(links)):
        try:
            rq = requests.get(link)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                articleTitle = page.select('h1')[0].text if len(
                    page.select('h1')) > 0 else ''
                articleSubtitle = page.select('h2.subtitle')[0].text if len(
                    page.select('h2.subtitle')) > 0 else ''
                articleDate = page.select(
                    '.article-time')[0].text.split(', oбновена')[0] if len(
                        page.select('.article-time')) > 0 else ''
                articleDate = clean_text(articleDate)
                month_name = re.search('([а-яА-Я]+)', articleDate)
                if month_name is not None:
                    month_name = month_name.group(1)
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name))
                    articleDate = pd.to_datetime(articleDate,
                                                 format='%d %m %Y, %H:%M')
                category = page.select(
                    'div.article-category')[0].a.text if len(
                        page.select('div.article-category')) > 0 else ''
                comments = page.select('.commentsButtonNumber')[0].text if len(
                    page.select('.commentsButtonNumber')) > 0 else ''
                article_text = ' '.join([
                    clean_text(par.text)
                    for par in page.select('.article-text')[0].select('p')
                ])
                # article-tags
                tags = page.select('.article-tags')
                tags = ' - '.join([
                    clean_text(tag.text) for tag in tags[0].select('a')
                ]) if len(tags) > 0 else None
                articlesContent = articlesContent.append(
                    {
                        'link': link,
                        'section': section,
                        'comments': clean_text(comments),
                        'title': clean_text(articleTitle),
                        'subtitle': clean_text(articleSubtitle),
                        'date': articleDate,
                        'category': category,
                        'tags': tags,
                        'article_text': article_text
                    },
                    ignore_index=True)
        except Exception:
            continue
    return articlesContent

def get_date(date):
    date = date.replace('Публикувана в ', '')
    # e.g. "15:54 на 21 февруари, 2020 год."
    #      "12:03 на 3 септември, 2020 год."
    matches = [m for m in months.keys() if m in date]
    month_name = matches[0] if len(matches) > 0 else None
    articleDate = date.replace(
        month_name, replace_month_with_digit(month_name)
    ) if month_name is not None else date
    articleDate = pd.to_datetime(articleDate, format='%H:%M на %d %m, %Y год.')
    return articleDate

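# Quick check of get_date against the date format quoted above (assuming the `months`
# mapping contains 'февруари' -> '02'):
#     get_date('Публикувана в 15:54 на 21 февруари, 2020 год.')
#     # -> Timestamp('2020-02-21 15:54:00')
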
def crawlLinks(links):
    articles_content = pd.DataFrame()
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                headline = page.select('.content')[0].h1.text.strip()
                meta = clean_text(page.select('.article-tools')[0].text) if len(
                    page.select('.article-tools')) > 0 else ''
                # e.g. "14:41, 30 дек 19"
                articleDate = re.search('(.*),', meta).group(1) if re.search(
                    '(.*),', meta) is not None else ''
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%H:%M, %d %m %y')
                views = re.search(r'(\d+)$', meta).group(1) if re.search(
                    r'(\d+)$', meta) is not None else ''
                comments = page.select('.comments')[0].text.strip() if len(
                    page.select('.comments')) > 0 else ''
                article_body = page.select('.article-content')[0].select('p') if len(
                    page.select('.article-content')) > 0 else ''
                if article_body != '':
                    author = article_body[0].text
                    article_text = ' '.join([
                        clean_text(par.text) for par in article_body[1:]
                        if '<' not in par.text
                    ])
                    article_text = article_text[article_text.find('}') + 1:].strip()
                else:
                    article_text = ''
                    author = ''
                tags = ' - '.join(
                    [clean_text(tag.text) for tag in page.select('.tags')[0].select('li')
                     if tag != ',' and tag != "\n"]) \
                    if len(page.select('.tags')) > 0 else None
                tags = clean_text(tags)
                articles_content = articles_content.append(
                    {
                        'link': link,
                        'title': clean_text(headline),
                        'author': clean_text(author),
                        'date': articleDate,
                        'category': category,
                        'comments': clean_text(comments),
                        'views': clean_text(views),
                        'tags': tags,
                        'article_text': article_text
                    },
                    ignore_index=True)
        except Exception:
            continue
    return articles_content

def crawlLinks(links):
    articles_content = pd.DataFrame()
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                titles = page.select('.text-wrapper')[0]
                headline = titles.h2.text
                subtitle = page.select('.text-wrapper')[0].p.text
                meta = page.select('.additional-info')[0] if len(
                    page.select('.additional-info')) > 0 else ''
                date_author_info = clean_text(
                    meta.select('.timestamp')[0].text) if len(
                        meta.select('.timestamp')) > 0 else ''
                author = re.search(r':([А-Яа-я\s]+$)', date_author_info)
                author = author.group(1).strip() if author is not None else None
                # e.g. "10:21 27 декември 2019"
                articleDate = ' '.join(date_author_info.split(
                    '|')[0:2]).strip() if date_author_info != '' else ''
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%H:%M %d %m %Y')
                views = meta.select('#articleViews')[0].text if len(
                    meta.select('#articleViews')) > 0 else ''
                comments = meta.select('.comments')[0].text if len(
                    meta.select('.comments')) > 0 else ''
                article_text = ' '.join([
                    par.text.strip()
                    for par in page.select('.article-body')[0].select('p')
                ]) if len(page.select('.article-body')) > 0 else ''
                # Sample of the window._io_config payload embedded in dnes.dir.bg pages,
                # kept for reference (not parsed by this crawler):
                """
                window._io_config=window._io_config||{};
                window._io_config["0.2.0"]=window._io_config["0.2.0"]||[];
                window._io_config["0.2.0"].push({
                    "page_url":"https:\/\/dnes.dir.bg\/temida\/vks-i-da-otkradnat-kolata-tryabva-da-si-plashtash-lizinga"
                    "page_url_canonical":"https:\/\/dnes.dir.bg\/temida\/vks-i-da-otkradnat-kolata-tryabva-da-si-plashtash-lizinga"
                    "page_title":"\u0412\u041a\u0421:\u0418\u0434\u0430\u043e\u0442\u043a\u0440\u0430\u0434\u043d\u0430\u0442\u043a\u043e\u043b\u0430\u0442\u0430 \u0442\u0440\u044f\u0431\u0432\u0430\u0434\u0430\u0441\u0438\u043f\u043b\u0430\u0449\u0430\u0448\u043b\u0438\u0437\u0438\u043d\u0433\u0430|\u0414\u043d\u0435\u0441.dir.bg"
                    "page_type":"article"
                    "page_language":"bg"
                    "article_authors":["\u041a\u0430\u043b\u0438\u043d\u041a\u0430\u043c\u0435\u043d\u043e\u0432"]
                    "article_categories":["\u0422\u0435\u043c\u0438\u0434\u0430"]
                    "article_subcategories":[]
                    "article_type":"image"
                    "article_word_count":425
                    "article_publication_date":"Fri 03Jan2020:52:40+0200"
                });
                """
                articles_content = articles_content.append(
                    {
                        'link': link,
                        'title': clean_text(headline),
                        'subtitle': clean_text(subtitle),
                        'comments': clean_text(comments),
                        'author': clean_text(author),
                        'date': articleDate,
                        'views': clean_text(views),
                        'category': category,
                        'article_text': article_text
                    },
                    ignore_index=True)
        except Exception:
            continue
    return articles_content

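# The window._io_config payload quoted above carries extra article metadata (authors,
# categories, word count, publication date) that this crawler does not use. If it were
# ever needed, a field could be pulled straight out of the raw HTML; a hedged sketch,
# using only the "article_word_count" key visible in the sample:
def extract_word_count(html):
    # Returns the embedded word count as an int, or None when the payload is absent.
    m = re.search(r'"article_word_count"\s*:\s*(\d+)', html)
    return int(m.group(1)) if m else None
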
def crawlLinks(links):
    articles_content = pd.DataFrame()
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                category = page.select(
                    '.gtm-ArticleBreadcrumb-click')[0].text if len(
                        page.select('.gtm-ArticleBreadcrumb-click')) > 0 else ''
                headline = page.select('.title-wrap-roboto')[0].h1.text.strip(
                ) if len(page.select('.title-wrap-roboto')) > 0 else ''
                # Гледайте цялата емисия
                if headline == '':
                    continue
                subtitle = page.select('.article-sub-title')[0].text.strip(
                ) if len(page.select('.article-sub-title')) > 0 else ''
                # author = page.select('.author-name')
                # author = author[0].text if author is not None else None
                # e.g. "21 ноември 2019 19:42"
                articleDate = page.select('.date-time')[0].text if len(
                    page.select('.date-time')) > 0 else ''
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%d %m %Y %H:%M')
                article_body = page.select('.article-body')[0].find_all(
                    'p', a=False) if len(page.select('.article-body')) > 0 else ''
                article_text = ' '.join([
                    clean_text(par.text) for par in article_body
                    if 'ГАЛЕРИЯ' not in par and 'СНИМКИ' not in par and 'ВИДЕО' not in par
                ])
                # tags: pulled out of the inline w2g.targeting script (see the sample below)
                tags_start_phrase = 'w2g.targeting = '
                start_ind = rq.text.find(tags_start_phrase)
                end_ind = rq.text.find(';', start_ind)
                aoi = rq.text[start_ind + len(tags_start_phrase):end_ind].strip()
                tags = re.findall('([а-яА-Я]+)', aoi)
                tags = ' - '.join(
                    clean_text(tag.replace("'", '').strip())
                    for tag in tags) if len(tags) > 0 else None
                # shares
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text
                """
                function getCookie(k) {
                    return (document.cookie.match('(^|; )' + k + '=([^;]*)') || 0)[2]
                }
                // header bidding targeting. Main script is loaded via GTM
                var w2g = w2g || {};
                w2g.targeting = {
                    cid: 'news', bid: 'view', aid: '273680', catid: '12',
                    subcatid: '4', procatid: '1', prpage: '0', safe: '1',
                    tag: 'тенис', tag: 'джейми', tag: 'мъри', tag: 'григор',
                    tag: 'димитров', tag: 'александър', tag: 'лазаров',
                    tag: 'великобритания', tag: 'българия'
                };
                """
                articles_content = articles_content.append(
                    {
                        'link': link,
                        'title': clean_text(headline),
                        'subtitle': clean_text(subtitle),
                        # 'author': clean_text(author),
                        'date': articleDate,
                        'tags': tags,
                        # 'shares': shares,
                        'category': category,
                        'article_text': article_text
                    },
                    ignore_index=True)
        except Exception:
            continue
    return articles_content

# Crawler collecting title, subtitle, author, date, category, comment and view counts,
# tags, and body text.
def crawlLinks(links):
    articles_content = pd.DataFrame()
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                info = page.select('#news_details')[0] if len(
                    page.select('#news_details')) > 0 else ''
                headline = info.h1.text.strip()
                subtitle = info.h2.text.strip()
                meta = info.select('.info')[0].select('div')
                # e.g. "30 Декември, 2019 15:26"
                articleDate = meta[0].text.split('Публикувана:')[1].strip()
                month_name = re.search('([а-яА-Я]+)', articleDate)
                month_name = month_name.group(1) if month_name is not None else None
                articleDate = articleDate.replace(
                    month_name, replace_month_with_digit(month_name)
                ) if month_name is not None else articleDate
                articleDate = pd.to_datetime(articleDate, format='%d %m, %Y %H:%M')
                meta = meta[1].text.strip() if len(meta) > 0 else ''
                # Default to empty counts when the counters block is missing.
                comments = ''
                views = ''
                if meta != '':
                    comments = re.search(r'(^\d+)', meta).group(1)
                    views = re.search(r'(\d+)$', meta).group(1)
                author = page.select('.linksProfile')[0].text if len(
                    page.select('.linksProfile')) > 0 else ''
                article_body = page.select('#news_content')[0].select('p') if len(
                    page.select('#news_content')) > 0 else ''
                article_text = ' '.join([
                    clean_text(par.text) for par in article_body
                    if '<' not in par.text
                ]) if article_body != '' else ''
                tags = ' - '.join(
                    [clean_text(tag.text) for tag in page.select('.tags')[0].select('a')
                     if tag != ',' and tag != "\n"]) \
                    if len(page.select('.tags')) > 0 else None
                # shares
                # shares = page.select('.inlineBlock')[1].select('.span')[-1].text
                articles_content = articles_content.append(
                    {
                        'link': link,
                        'title': clean_text(headline),
                        'subtitle': clean_text(subtitle),
                        'author': clean_text(author),
                        'date': articleDate,
                        'category': category,
                        'comments': clean_text(comments),
                        # 'shares': shares,
                        'views': clean_text(views),
                        'tags': tags,
                        'article_text': article_text
                    },
                    ignore_index=True)
        except Exception:
            continue
    return articles_content

def crawlLinks(links):
    articles_content = pd.DataFrame()
    for link in tqdm(links):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                category = page.find(
                    'div', attrs={'class': 'printing_large_text_toolbar'}
                ).text if page.find(
                    'div', attrs={'class': 'printing_large_text_toolbar'}) is not None else ''
                headline = page.select('#news_heading')[0].h1.text.strip(
                ) if len(page.select('#news_heading')) > 0 else ''
                shares = page.select(".social_count")[0].text.strip() if len(
                    page.select(".social_count")) > 0 else ''
                comments = page.select('.comments')[0].text.strip() if len(
                    page.select('.comments')) else ''
                views = page.select(
                    '.btn_reads')[0].text.split('Прочетена')[1].strip() if len(
                        page.select('.btn_reads')) > 0 else ''
                article_text = clean_text(
                    page.select('#news_content')[0].text) if len(
                        page.select('#news_content')) > 0 else ''
                # e.g. "01 януари 2020 | 16:26 - Обновена"
                articleDate = page.find('td', attrs={'id': 'news_heading'})
                articleDate = articleDate.find(
                    'span', attrs={'class': 'dark_text'}
                ).text if articleDate is not None and articleDate.find(
                    'span', attrs={'class': 'dark_text'}) is not None else ''
                articleDate = articleDate.split(
                    '- Обновена')[0].strip() if articleDate != '' else ''
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    month_name = month_name.group(1) if month_name is not None else None
                    articleDate = articleDate.replace(
                        month_name, replace_month_with_digit(month_name)
                    ) if month_name is not None else articleDate
                    articleDate = pd.to_datetime(articleDate, format='%d %m %Y | %H:%M')
                author = page.select(
                    '#author_box')[0].select('h5')[0].a.text if len(
                        page.select('#author_box')) > 0 else ''
                tags = " - ".join([
                    clean_text(i.text)
                    for i in page.find('div', attrs={'class': 'news_tags'}).findAll('span')
                ])
                articles_content = articles_content.append(
                    {
                        'link': link,
                        'title': clean_text(headline),
                        'author': clean_text(author),
                        'date': articleDate,
                        'category': clean_text(category),
                        'tags': tags,
                        'comments': clean_text(comments),
                        'views': clean_text(views),
                        'shares': clean_text(shares),
                        'article_text': article_text
                    },
                    ignore_index=True)
        except Exception:
            continue
    return articles_content

# Crawler handling two page layouts ('.article-post' and '.post-title'): collects title,
# subtitle, location, date, views, comments, category, tags, and body text.
def crawlLinks(links):
    articlesContent = pd.DataFrame()
    for link in tqdm(list(links)):
        try:
            rq = requests.get(link)
            domain = "{0.netloc}".format(urlsplit(link))
            category = re.search(domain + '/([^/]+)', link).group(1)
            if rq.status_code == 200:
                page = bs4.BeautifulSoup(rq.text, features="html.parser")
                if len(page.select('.article-post')) > 0:
                    body = page.select('.article-post')[0]
                    headline = body.select('h1')[0].text if len(body.select('h1')) else ''
                    subtitle = None
                    # metadata
                    location = body.select('.location')[0].text if len(body.select('.location')) else ''
                    articleDate = body.select('.fa-calendar')[0].text if len(body.select('.fa-calendar')) else ''
                    views = body.select('.fa-eye')[0].text if len(body.select('.fa-eye')) else ''
                    comments = body.select('.fa-comments-o')[0].text if len(body.select('.fa-comments-o')) else ''
                    comments = comments.split(" ")[0] if comments != '' else ''
                    tags = ' - '.join([
                        tag.a.text for tag in body.select('.tags')[0].select('li')
                    ]) if len(body.select('.tags')) > 0 else ''
                else:
                    headline = page.select('.post-title')[0].text if len(page.select('.post-title')) else ''
                    subtitle = page.select('.post-subtitle')[0].text if len(page.select('.post-subtitle')) else ''
                    # metadata
                    simpleShare = page.select('.simple-share')[0] if len(page.select('.simple-share')) > 0 else ''
                    li = simpleShare.find_all('li')
                    location = li[0].text if len(li) > 0 else ''
                    articleDate = li[1].text if len(li) > 1 else ''
                    views = li[2].text if len(li) > 2 else ''
                    views = views.split(" ")[0] if views != '' else ''
                    comments = li[3].text if len(li) > 3 else ''
                    comments = comments.split(" ")[0] if comments != '' else ''
                    tags = ' - '.join([
                        tag.a.text for tag in page.select('.tags-widget')[0].select('li')[1:]
                    ]) if len(page.select('.tags-widget')) > 0 else ''
                # e.g. "30 Дек. 2019, 16:13"
                if articleDate != '':
                    month_name = re.search('([а-яА-Я]+)', articleDate)
                    if month_name is not None:
                        month_name = month_name.group(1)
                        articleDate = articleDate.replace(month_name, replace_month_with_digit(month_name))
                        articleDate = pd.to_datetime(articleDate, format='%d %m %Y, %H:%M')
                article_text = clean_text(
                    page.select('.post-content')[0].select('div')[2].text
                ) if len(page.select('.post-content')) > 0 else ''
                articlesContent = articlesContent.append(
                    {
                        'link': link,
                        'title': clean_text(headline),
                        'subtitle': clean_text(subtitle),
                        'location': clean_text(location),
                        'comments': clean_text(comments),
                        'date': articleDate,
                        'views': clean_text(views),
                        'category': category,
                        'tags': clean_text(tags),
                        'article_text': article_text
                    },
                    ignore_index=True)
        except Exception:
            continue
    return articlesContent

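# Usage sketch (the file names here are hypothetical; in the project the link lists
# come from the per-site listing crawlers). Note that DataFrame.append() was removed
# in pandas 2.0, so these crawlers need pandas < 2.0 as written, or the append calls
# replaced with collecting row dicts in a list and building the frame once at the end.
if __name__ == '__main__':
    with open('article_links.txt') as f:   # assumed input: one article URL per line
        links = [line.strip() for line in f if line.strip()]
    articles = crawlLinks(links)
    articles.to_csv('articles.csv', index=False)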