def parse(url):
    """Scrape a Savon sanomat article and return the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Force UTF-8 rather than trusting the server-declared charset.
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(role='main')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Strip scripts and the author portrait so they don't leak into the text.
    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='article-author__image'))

    categories = processor.collect_categories(
        article.find_all(class_='article__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article__published'))
    author = processor.collect_text(article.find(class_='article__author'))
    title = processor.collect_text(article.find(class_='article__title'))
    text = processor.collect_text(article.find(class_='article__body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='article__images'), '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Savon sanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Scrape an Iltalehti article and return the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # This site serves Latin-1, not UTF-8.
    r.encoding = 'iso-8859-1'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(id='container_keski')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='kp-share-area'))

    # Category links live outside the article container, hence soup.
    categories = processor.collect_categories(soup.find_all(class_='sel'))
    datetime_list = processor.collect_datetime(article.find(class_='juttuaika'))

    # Drop the profile link inside the byline before collecting the name.
    # NOTE(review): assumes a 'author' div is always present — verify.
    author_div = article.find(class_='author')
    processor.decompose(author_div.find('a'))
    author = processor.collect_text(author_div, True)

    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='ingressi'), True)
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='kuvateksti'))

    # Remove image blocks before gathering the body text.
    processor.decompose_all(article.find_all(class_='kuvamiddle'))
    text = processor.collect_text(article.find('isense'))

    return processor.create_dictionary('Iltalehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Scrape a Suomen uutiset article and return the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='somebar'))
    processor.decompose(article.find(class_='tags'))

    categories = processor.collect_categories(
        article.find_all(class_='post-category'))

    # Timestamp is e.g. "<month name> 1, 2017 12:30"; convert_month maps the
    # month name to its number so strptime can parse "%m %d %Y %H:%M".
    datetime_string = article.find(class_='timestamp').get_text(' ', strip=True)
    datetime_string = processor.convert_month(datetime_string.replace(',', ''))
    datetime_list = [datetime.strptime(datetime_string, '%m %d %Y %H:%M')]

    author = processor.collect_text(
        article.find(class_='article-page-writer'), True)
    title = processor.collect_text(article.find(class_='post-title'))
    text = processor.collect_text(article.find(class_='post-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Suomen uutiset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Scrape a Faktabaari article and return the common article dict.

    Categories and captions are not available on this site, so they are
    returned as the empty placeholders ``[u'']``.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(id='main-content')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='reviewpic'))

    datetime_list = processor.collect_datetime(article.find(class_='published'))
    author = processor.collect_text(article.find(class_='author'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='entry-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')

    return processor.create_dictionary('Faktabaari', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       u'', text, images, [u''])
def parse(url):
    """Scrape a Taloussanomat article and return the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='single-article')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-links'))
    processor.decompose_all(article.find_all(class_='article-ad-block'))

    # NOTE(review): a single tag (find, not find_all) is passed here, unlike
    # the other parsers — confirm collect_categories accepts a lone element.
    categories = processor.collect_categories(
        soup.find(class_='section-title'))
    datetime_list = processor.collect_datetime(article.find('time'))
    author = processor.collect_text(article.find(class_='byline'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='ingress'))
    text = processor.collect_text(article.find(class_='body'))
    # Image srcs are protocol-relative; prefix them with "http:".
    images = processor.collect_images(article.find_all('img'), 'src', 'http:')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Taloussanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def get_answers(answers_html_element, to_user):
    """Extract every answer under *answers_html_element* as a list of dicts.

    Each dict carries likes, user, user_role, time, text, the user the
    answer replies to (*to_user*), and that answer's own nested comments.
    """
    collected = []
    for answer_element in answers_html_element:
        container = answer_element.find(class_='answer-container')

        # Grab the vote count first, then drop the whole action bar so it
        # cannot bleed into the collected text fields.
        likes = processor.collect_text(
            container.find(class_='action-bar-vote-count'))
        processor.decompose(container.find(class_='action-bar'))

        user = processor.collect_text(
            container.find(class_='user-info-name'))
        entry = {
            'likes': likes,
            'user': user,
            'user_role': processor.collect_text(
                container.find(class_='user-info-role')),
            'time': str(processor.collect_datetime(
                container.find(class_='user-info-timestamp'))[0]),
            'text': processor.collect_text(container.find(class_='answer')),
            'to': to_user,
            # Comments under an answer are addressed to the answer's author.
            'comments': get_comments(
                answer_element.find(class_='comments'), user),
        }
        collected.append(entry)
    return collected
def parse(url):
    """Scrape a Kd-lehti article and return the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    # NOTE(review): assumes <header> always exists inside <article> — verify.
    processor.decompose_all(article.find('header').find_all('img'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_="meta-sidebar"))

    categories = processor.collect_categories(article.find_all(class_='cat'))
    datetime_list = processor.collect_datetime(article.find(class_='date'))
    author = processor.collect_text(article.find(class_='author'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='ingress'), True)
    text = processor.collect_text(article.find(class_='content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='featured-image'))

    return processor.create_dictionary('Kd-lehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Scrape an Etelä-Suomen Sanomat article into the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='mainArticle-content-wrapper')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    # Section links live in the article header.
    header = article.find(id='main-article-header')
    categories = processor.collect_categories(
        header.find_all(class_='section'))

    datetime_list = processor.collect_datetime(
        article.find(class_='article-date'))
    author = processor.collect_text(article.find(class_='authorName'))
    title = processor.collect_text(article.find(class_='main-article-header'))
    text = processor.collect_text(article.find(class_='body'))

    # Drop the author portrait and the subscribe box before collecting images.
    processor.decompose(article.find(class_='authorPicture'))
    processor.decompose(article.find(id='main-subscribe'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='main-media-caption'))

    return processor.create_dictionary('Etelä-Suomen Sanomat', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, u'',
                                       text, images, captions)
def parse(url):
    """Scrape a Helsingin uutiset article into the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    departments = article.find(class_='field-name-field-department-tref')
    categories = processor.collect_categories(departments.find_all('a'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'))

    # Byline block is optional; fall back to an empty author string.
    author = article.find(class_='author')
    if author is not None:
        processor.decompose(author.find(class_='img'))
        author = processor.collect_text(author.find('h3'))
    else:
        author = u''

    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(
        article.find(class_='field field-name-body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='img'), '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    return processor.create_dictionary('Helsingin uutiset', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, u'',
                                       text, images, captions)
def parse(url):
    """Scrape a Kouvolan sanomat article into the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_='sticky-inner-wrapper'))

    # Breadcrumb: skip the first (front page) and last (article) entries.
    categories_list = soup.find(class_='breadcrumb').find_all('li')[1:-1]
    categories = processor.collect_categories(categories_list)

    datetime_list = processor.collect_datetime(article.find(class_='meta'))

    # Join all bylines into one comma-separated author string, then remove
    # the byline block so it does not end up in the body text.
    authors = article.find(class_='authors')
    author = ','.join(processor.collect_text(div.find('p'))
                      for div in authors.find_all(class_='author'))
    processor.decompose(authors)

    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='lead'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    # Strip non-body chrome, then collect whatever text remains.
    processor.decompose(article.find(class_='sticky-outer-wrapper active'))
    processor.decompose(article.find('header'))
    processor.decompose(article.find('footer'))
    text = processor.collect_text(article)

    return processor.create_dictionary('Kouvolan sanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Scrape a Hyvä terveys article into the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='region bottom'))
    processor.decompose(
        article.find(class_='field-name-field-related-content'))

    categories = processor.collect_categories(
        article.find_all(class_='field-name-field-category'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'), 'timedate')
    author = processor.collect_text(
        article.find(class_='field-name-field-author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(
        article.find(class_='field-name-field-summary'))
    text = processor.collect_text(article.find(class_='field-name-field-body'))

    # Lazy-loading placeholder images are not real content; filter them out.
    images = [img
              for img in processor.collect_images(
                  article.find_all('img'), 'src', '')
              if 'placeholder' not in img]
    captions = processor.collect_image_captions(
        article.find_all(class_='file-image-description-caption'))

    return processor.create_dictionary('Hyvä terveys', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Scrape an Aamuset article into the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(
        article.find_all(class_='views-field-field-aamuset-related-images'))

    # Tag list lives outside the article element.
    categories_element = soup.find(class_='tsv3-c-as-articletags')
    categories = processor.collect_categories(
        categories_element.find_all('li'))

    datetime_list = processor.collect_datetime(article.find('time'))

    # Collect the byline, then remove it so it stays out of the body text.
    author = processor.collect_text(article.find(class_='kirjoittaja'))
    processor.decompose(article.find(class_='kirjoittaja'))

    title = processor.collect_text(article.find(class_='otsikko'))
    text = processor.collect_text(
        article.find(class_='tsv3-c-as-article__textitem--teksti'))
    # Image srcs are site-relative; prefix with the site root.
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.aamuset.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='tsv3-c-as-article__attachment__caption'))

    return processor.create_dictionary('Aamuset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Scrape a Verkkouutiset article into the common article dict.

    Categories are not available on this site and are returned as the
    placeholder ``[u'']``.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')

    # Byline text carries a trailing " |" separator; strip it.
    author = processor.collect_text(article.find(class_='posted-on'))
    author = author.replace(' |', '')
    processor.decompose(article.find(class_='entry-meta'))

    title = processor.collect_text(article.find(class_='entry-title'))

    # Collect the ingress, then remove it so the body text excludes it.
    ingress = processor.collect_text(
        article.find(class_='entry-content__ingress'))
    processor.decompose(article.find(class_='entry-content__ingress'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='entry-header__caption'))
    text = processor.collect_text(article.find(class_='entry-content'))

    return processor.create_dictionary('Verkkouutiset', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Scrape an Uusi Suomi article into the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='region-content-inner')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Remove scripts, comments and author decoration before collecting.
    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all('noscript'))
    processor.decompose(article.find(id='comments'))
    processor.decompose(article.find(class_='contributor'))
    processor.decompose(article.find(class_='field-name-field-author-image'))

    categories = processor.collect_categories(
        article.find_all(class_='field-name-field-category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all(class_='date-display-single'), 'content')
    author = processor.collect_text(article.find(class_='author-name'))
    title = processor.collect_text(article.find(id='page-title'))
    text = processor.collect_text(article.find(class_='field-name-body'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='field-name-field-image-description'))

    return processor.create_dictionary('Uusi Suomi', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Scrape an MTV article into the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='content')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Remove scripts, inline styles and every ad container.
    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='ad'))
    processor.decompose_all(article.find_all(class_='ad-container'))
    processor.decompose_all(article.find_all('style'))
    processor.decompose(article.find(id='fullWidthBottom'))

    categories = processor.collect_categories(
        article.find_all(class_='article-category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(class_='author-name'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='lead-paragraph'))
    text = processor.collect_text(article.find(class_='editorial'))
    # Image srcs are protocol-relative; prefix them with "http:".
    images = processor.collect_images_by_parent(
        article.find_all(class_='img-container'), 'http:')
    captions = processor.collect_image_captions(
        article.find_all(class_='figcaption'))

    return processor.create_dictionary('Mtv', url, r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
def parse(url):
    """Scrape a Kansan uutiset article into the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find('footer'))
    processor.decompose_all(article.find_all(class_='cb-module-title'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose_all(article.find_all('aside'))

    categories = processor.collect_categories(
        article.find_all(class_='cb-category'))
    datetime_list = processor.collect_datetime(article.find(class_='cb-date'))
    author = processor.collect_text(article.find(class_='cb-author'))
    title = processor.collect_text(article.find(class_='entry-title'))
    # The ingress is the leading <h4> inside the entry content.
    ingress = processor.collect_text(
        article.find(class_='cb-entry-content').find('h4'), True)
    text = processor.collect_text(article.find(class_='cb-entry-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    return processor.create_dictionary('Kansan uutiset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def get_comments(comments_html_element, to_user):
    """Extract the comments under *comments_html_element* as a list of dicts.

    Each dict carries likes, user, user_role, time, text, the user being
    replied to (*to_user*), and — when the comment quotes another one —
    a 'quote' dict with the quoted user and quoted text.
    """
    collected = []
    listing = comments_html_element.find(class_='comments-list')
    if not listing:
        return collected

    for element in listing.find_all(class_='comment'):
        # Read the vote count before stripping the action bar, so the bar's
        # text never pollutes the collected fields.
        likes = processor.collect_text(
            element.find(class_='action-bar-vote-count'))
        processor.decompose(element.find(class_='action-bar'))

        entry = {
            'likes': likes,
            'user': processor.collect_text(
                element.find(class_='user-info-name')),
            'user_role': processor.collect_text(
                element.find(class_='user-info-role')),
            'time': str(processor.collect_datetime(
                element.find(class_='user-info-timestamp'))[0]),
            'text': processor.collect_text(
                element.find(class_='comment-text')),
            'to': to_user,
            'quote': {},
        }

        quoted = element.find('blockquote')
        if quoted:
            entry['quote']['quoted_user'] = processor.collect_text(
                quoted.find('header').find('strong'))
            entry['quote']['text'] = processor.collect_text(
                quoted.find(class_='text-muted blockquote-collapse-body'))

        collected.append(entry)
    return collected
def parse(url):
    """Scrape a Kokemäenjokilaakson uutiset article into the common dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='post-single')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='avatar'))

    # This site exposes its metadata through schema.org itemprops.
    categories = processor.collect_categories(
        article.find_all(itemprop='articleSection'))
    datetime_list = processor.collect_datetime(
        article.find(itemprop='dateCreated datePublished'))
    author = processor.collect_text(article.find(rel='author'))
    title = processor.collect_text(article.find(itemprop='headline'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='sopuli-image-caption'))

    # Drop media blocks before collecting the body text.
    processor.decompose_all(article.find_all(itemprop='associatedMedia'))
    text = processor.collect_text(article.find(itemprop='articleBody'))

    return processor.create_dictionary('Kokemäenjokilaakson uutiset', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, u'',
                                       text, images, captions)
def parse_from_archive(url, content):
    """Parse an archived Keskisuomalainen article from raw HTML *content*.

    Unlike the live parsers this takes the page markup directly instead of
    fetching it, and reports status 200 on success.
    """
    article = BeautifulSoup(content, "html.parser")
    if article is None:  # defensive; BeautifulSoup() itself never returns None
        return processor.create_dictionary('Keskisuomalainen', url, 404, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    # The date line starts with the section name, e.g. "Kotimaa 1.2.2017".
    meta = article.find(class_='date')
    categories = [processor.collect_text(meta).split(' ')[0]]
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(article.find(class_='author'), True)
    processor.decompose(meta)

    # Title and ingress may each be split over several heading elements.
    title = ' '.join(processor.collect_text(part, True)
                     for part in article.find_all('h2')).strip()
    ingress = ' '.join(processor.collect_text(part, True)
                       for part in article.find_all('h4')).strip()

    # The last paragraph is boilerplate, not article text.
    processor.decompose(article.find_all('p')[-1])
    text = processor.collect_text(article)

    return processor.create_dictionary('Keskisuomalainen', url, 200,
                                       categories, datetime_list, author,
                                       title, ingress, text, [u''], [u''])
def parse(url):
    """Scrape a Kaleva article into the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-container')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='article__related'))
    processor.decompose_all(
        article.find_all(class_='smartblock--juttusivu-markkinointi'))

    # The meta line starts with the section name, e.g. "Oulu 1.2.2017".
    meta = article.find(class_='news__meta')
    categories = [processor.collect_text(meta).split(' ')[0]]
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(meta.find(class_='news__source'))

    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='article__text'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='image__caption'))

    return processor.create_dictionary('Kaleva', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Scrape an Aamulehti article into the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-content')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-articles-container'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    # Strip the category and "updated" stamps from the meta block so only
    # the publication timestamp is left for collect_datetime.
    datetime_data = article.find(class_='post-meta')
    processor.decompose(datetime_data.find(class_='category'))
    processor.decompose(datetime_data.find(class_='updated'))
    datetime_list = processor.collect_datetime(datetime_data)

    author = processor.collect_text(article.find(class_='author--main'))
    title = processor.collect_text(article.find(class_='heading--main'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    # Remove image wrappers before collecting the body text.
    processor.decompose_all(article.find_all(class_='image-wrapper'))
    text = processor.collect_text(article.find(class_='content--main'))

    return processor.create_dictionary('Aamulehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Scrape a Mahorkka article into the common article dict.

    Returns an empty ``processor.create_dictionary(...)`` result when the
    request 404s or the main content container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='post')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_='author-avatar'))
    processor.decompose(
        article.find(id='after-single-post-widget-zone-single-post'))
    processor.decompose(article.find(id='sidebar'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(itemprop='name'))
    # NOTE(review): leading space in ' xt-post-title' looks accidental but is
    # kept verbatim — confirm it actually matches before changing it.
    title = processor.collect_text(article.find(class_=' xt-post-title'))
    text = processor.collect_text(article.find(class_='post-body'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Mahorkka', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Scrape a Demokraatti article into the common article dict.

    Captions are not available on this site and are returned as the
    placeholder ``[u'']``.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='keywords-block'))
    processor.decompose_all(article.find_all(class_='share-buttons-block'))
    # The last paragraph is boilerplate, not article text.
    processor.decompose(article('p')[-1])
    processor.decompose(article.footer)
    processor.decompose(article.find(class_='wp-user-avatar'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    # Drop the category label from the date block, collect the timestamp,
    # then remove the whole block so it stays out of the body text.
    datetime_data = article.find(class_='single-post-date')
    processor.decompose(datetime_data.find(class_='category'))
    datetime_list = processor.collect_datetime(datetime_data)
    processor.decompose(article.find(class_='single-post-date'))

    author = processor.collect_text(
        article.find(class_='post-author').find('li'))
    title = processor.collect_text(article.find(class_='entry-title'))
    text = processor.collect_text(article.find(class_='post-content'))
    # Image srcs are site-relative; prefix with the site root.
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'https://demokraatti.fi')

    return processor.create_dictionary('Demokraatti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, [u''])
def parse(url):
    """Scrape a Vihreä lanka article into the common article dict.

    Categories are not available on this site and are returned as the
    placeholder ``[u'']``.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='node-wrap')
    if article is None:  # layout changed or not an article page
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Strip scripts, share widgets, tag lists and other non-article chrome.
    # (The original also decomposed 'kredIso' a second time; after the first
    # removal that find() returns None, so the duplicate call was a no-op.)
    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='kredIso'))
    processor.decompose_all(article.find_all(class_='tyrkkyBox'))
    processor.decompose(article.find(class_='avainsanat'))
    processor.decompose(article.find(class_='twitter-share-button'))
    processor.decompose(article.find(class_='fb-like'))
    processor.decompose(article.find(class_='moreLanka'))
    processor.decompose(article.find('cite'))

    # Collect timestamp and byline from the meta block, then remove it.
    meta = article.find(class_='juttutiedot')
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(meta.find(class_='author'))
    processor.decompose(meta)

    title = processor.collect_text(article.find('h2'), True)
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='kuvaTekstiIso'))

    # Remove caption elements before collecting the remaining body text.
    processor.decompose_all(article.find_all(class_='kuvaTekstiIso'))
    processor.decompose_all(article.find_all('figcaption'))
    text = processor.collect_text(article)

    return processor.create_dictionary('Vihreä lanka', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       u'', text, images, captions)