Example #1
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(role='main')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='article-author__image'))

    categories = processor.collect_categories(
        article.find_all(class_='article__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article__published'))
    author = processor.collect_text(article.find(class_='article__author'))
    title = processor.collect_text(article.find(class_='article__title'))
    text = processor.collect_text(article.find(class_='article__body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='article__images'), '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Savon sanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
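
These parsers all assume that requests, BeautifulSoup and a project-local processor helper module have been imported; none of those imports are shown. The following is a minimal sketch of what the processor interface could look like, inferred purely from the call sites in these examples: the function names and argument order are real, but the bodies and the dictionary keys are assumptions.

# Sketch of the assumed processor helpers; inferred from usage, not the real module.
def decompose(tag):
    # Remove one tag from the tree; find() may return None, so tolerate misses.
    if tag is not None:
        tag.decompose()

def decompose_all(tags):
    # Remove every tag in a result set.
    for tag in tags:
        decompose(tag)

def collect_text(tag, strip=False):
    # Flatten a tag to unicode text; a missing tag becomes an empty string.
    # The meaning of the second flag is not shown in the examples; it is
    # treated here as "strip surrounding whitespace".
    if tag is None:
        return u''
    text = tag.get_text(' ', strip=True)
    return text.strip() if strip else text

def create_dictionary(domain, url, status, categories, datetime_list, author,
                      title, ingress, text, images, captions):
    # Package one article into the uniform dictionary every parser returns.
    # Key names are assumptions; only the parameter order is visible above.
    return {'domain': domain, 'url': url, 'http_status': status,
            'categories': categories, 'datetime_list': datetime_list,
            'author': author, 'title': title, 'ingress': ingress,
            'text': text, 'images': images, 'captions': captions}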
Example #2
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'iso-8859-1'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(id='container_keski')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='kp-share-area'))

    categories = processor.collect_categories(soup.find_all(class_='sel'))
    datetime_list = processor.collect_datetime(article.find(class_='juttuaika'))

    author_div = article.find(class_='author')
    processor.decompose(author_div.find('a'))
    author = processor.collect_text(author_div, True)

    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='ingressi'), True)
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='kuvateksti'))

    processor.decompose_all(article.find_all(class_='kuvamiddle'))

    text = processor.collect_text(article.find('isense'))

    return processor.create_dictionary('Iltalehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #3
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='somebar'))
    processor.decompose(article.find(class_='tags'))

    categories = processor.collect_categories(
        article.find_all(class_='post-category'))

    datetime_string = article.find(class_='timestamp').get_text(' ', strip=True)
    datetime_string = processor.convert_month(datetime_string.replace(',', ''))
    datetime_list = [datetime.strptime(datetime_string, '%m %d %Y %H:%M')]

    author = processor.collect_text(
        article.find(class_='article-page-writer'), True)
    title = processor.collect_text(article.find(class_='post-title'))
    text = processor.collect_text(article.find(class_='post-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Suomen uutiset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
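
Example #3 is the only parser here that builds its datetime by hand: it strips the comma from the timestamp, lets processor.convert_month swap the month name for its number, and hands the result to datetime.strptime with the format '%m %d %Y %H:%M'. convert_month itself is not shown; the sketch below is a hypothetical stand-in that assumes Finnish month names (these are Finnish news sites).

from datetime import datetime

# Hypothetical stand-in for processor.convert_month; assumes Finnish month names.
MONTHS = {u'tammikuu': '1', u'helmikuu': '2', u'maaliskuu': '3',
          u'huhtikuu': '4', u'toukokuu': '5', u'kesäkuu': '6',
          u'heinäkuu': '7', u'elokuu': '8', u'syyskuu': '9',
          u'lokakuu': '10', u'marraskuu': '11', u'joulukuu': '12'}

def convert_month(datetime_string):
    # Replace a leading month name with its number so '%m ...' can parse it.
    for name, number in MONTHS.items():
        if datetime_string.lower().startswith(name):
            return number + datetime_string[len(name):]
    return datetime_string

# 'Maaliskuu 5 2017 12:30' -> '3 5 2017 12:30' -> datetime(2017, 3, 5, 12, 30)
print(datetime.strptime(convert_month(u'Maaliskuu 5 2017 12:30'),
                        '%m %d %Y %H:%M'))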
Example #4
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(id='main-content')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='reviewpic'))

    datetime_list = processor.collect_datetime(
        article.find(class_='published'))
    author = processor.collect_text(article.find(class_='author'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='entry-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')

    return processor.create_dictionary('Faktabaari', url, r.status_code, [u''],
                                       datetime_list, author, title, u'', text,
                                       images, [u''])
Example #5
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='single-article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-links'))
    processor.decompose_all(article.find_all(class_='article-ad-block'))

    categories = processor.collect_categories(
        soup.find(class_='section-title'))
    datetime_list = processor.collect_datetime(article.find('time'))
    author = processor.collect_text(article.find(class_='byline'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='ingress'))
    text = processor.collect_text(article.find(class_='body'))
    images = processor.collect_images(article.find_all('img'), 'src', 'http:')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Taloussanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #6
def get_answers(answers_html_element, to_user):
    answers = []

    for answer in answers_html_element:
        answer_data = {}
        answer_div = answer.find(class_='answer-container')

        answer_data['likes'] = processor.collect_text(
            answer_div.find(class_='action-bar-vote-count'))
        processor.decompose(answer_div.find(class_='action-bar'))

        answer_data['user'] = processor.collect_text(
            answer_div.find(class_='user-info-name'))
        answer_data['user_role'] = processor.collect_text(
            answer_div.find(class_='user-info-role'))
        answer_data['time'] = str(
            processor.collect_datetime(
                answer_div.find(class_='user-info-timestamp'))[0])
        answer_data['text'] = processor.collect_text(
            answer_div.find(class_='answer'))
        answer_data['to'] = to_user
        answer_data['comments'] = get_comments(answer.find(class_='comments'),
                                               answer_data['user'])
        answers.append(answer_data)

    return answers
Example #7
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find('header').find_all('img'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_='meta-sidebar'))

    categories = processor.collect_categories(article.find_all(class_='cat'))
    datetime_list = processor.collect_datetime(article.find(class_='date'))
    author = processor.collect_text(article.find(class_='author'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='ingress'), True)
    text = processor.collect_text(article.find(class_='content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='featured-image'))

    return processor.create_dictionary('Kd-lehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #8
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='mainArticle-content-wrapper')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    header = article.find(id='main-article-header')
    categories = processor.collect_categories(
        header.find_all(class_='section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article-date'))
    author = processor.collect_text(article.find(class_='authorName'))
    title = processor.collect_text(article.find(class_='main-article-header'))
    text = processor.collect_text(article.find(class_='body'))

    processor.decompose(article.find(class_='authorPicture'))
    processor.decompose(article.find(id='main-subscribe'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='main-media-caption'))

    return processor.create_dictionary('Etelä-Suomen Sanomat', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, u'',
                                       text, images, captions)
Example #9
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    departments = article.find(class_='field-name-field-department-tref')
    categories = processor.collect_categories(departments.find_all('a'))

    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'))

    author = article.find(class_='author')
    if author is not None:
        processor.decompose(author.find(class_='img'))
        author = processor.collect_text(author.find('h3'))
    else:
        author = u''

    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='field field-name-body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='img'), '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    return processor.create_dictionary('Helsingin uutiset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #10
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_='sticky-inner-wrapper'))

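    # Keep only the middle breadcrumb items: [1:-1] drops the first and last
    # entries (typically the front page and the article itself).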
    categories_list = soup.find(class_='breadcrumb').find_all('li')[1:-1]
    categories = processor.collect_categories(categories_list)

    datetime_list = processor.collect_datetime(article.find(class_='meta'))

    authors = article.find(class_='authors')
    author = ''
    for div in authors.find_all(class_='author'):
        author += processor.collect_text(div.find('p')) + ','
    author = author[:-1]

    processor.decompose(authors)

    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='lead'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    processor.decompose(article.find(class_='sticky-outer-wrapper active'))
    processor.decompose(article.find('header'))
    processor.decompose(article.find('footer'))

    text = processor.collect_text(article)

    return processor.create_dictionary('Kouvolan sanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
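
The author loop in this example accumulates a comma-separated string with += and then slices off the trailing comma. An equivalent, slightly more idiomatic form of that step, against the same assumed processor helper, is a join over a generator:

    author = ','.join(
        processor.collect_text(div.find('p'))
        for div in authors.find_all(class_='author'))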
Example #11
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='region bottom'))
    processor.decompose(
        article.find(class_='field-name-field-related-content'))

    categories = processor.collect_categories(
        article.find_all(class_='field-name-field-category'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'), 'timedate')
    author = processor.collect_text(
        article.find(class_='field-name-field-author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(
        article.find(class_='field-name-field-summary'))
    text = processor.collect_text(article.find(class_='field-name-field-body'))

    images = []
    for img in processor.collect_images(article.find_all('img'), 'src', ''):
        if 'placeholder' not in img:
            images.append(img)

    captions = processor.collect_image_captions(
        article.find_all(class_='file-image-description-caption'))

    return processor.create_dictionary('Hyvä terveys', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
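
The placeholder filter above can equally be written as a list comprehension; the behaviour is identical:

    images = [
        img
        for img in processor.collect_images(article.find_all('img'), 'src', '')
        if 'placeholder' not in img
    ]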
Example #12
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(
        article.find_all(class_='views-field-field-aamuset-related-images'))

    categories_element = soup.find(class_='tsv3-c-as-articletags')
    categories = processor.collect_categories(
        categories_element.find_all('li'))

    datetime_list = processor.collect_datetime(article.find('time'))

    author = processor.collect_text(article.find(class_='kirjoittaja'))
    processor.decompose(article.find(class_='kirjoittaja'))

    title = processor.collect_text(article.find(class_='otsikko'))
    text = processor.collect_text(
        article.find(class_='tsv3-c-as-article__textitem--teksti'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.aamuset.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='tsv3-c-as-article__attachment__caption'))

    return processor.create_dictionary('Aamuset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #13
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')

    author = processor.collect_text(article.find(class_='posted-on'))
    author = author.replace(' |', '')

    processor.decompose(article.find(class_='entry-meta'))

    title = processor.collect_text(article.find(class_='entry-title'))

    ingress = processor.collect_text(
        article.find(class_='entry-content__ingress'))
    processor.decompose(article.find(class_='entry-content__ingress'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='entry-header__caption'))
    text = processor.collect_text(article.find(class_='entry-content'))

    return processor.create_dictionary('Verkkouutiset', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       ingress, text, images, captions)
Example #14
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='region-content-inner')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all('noscript'))
    processor.decompose(article.find(id='comments'))
    processor.decompose(article.find(class_='contributor'))
    processor.decompose(article.find(class_='field-name-field-author-image'))

    categories = processor.collect_categories(
        article.find_all(class_='field-name-field-category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all(class_='date-display-single'), 'content')
    author = processor.collect_text(article.find(class_='author-name'))
    title = processor.collect_text(article.find(id='page-title'))
    text = processor.collect_text(article.find(class_='field-name-body'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='field-name-field-image-description'))

    return processor.create_dictionary('Uusi Suomi', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #15
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='content')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='ad'))
    processor.decompose_all(article.find_all(class_='ad-container'))
    processor.decompose_all(article.find_all('style'))
    processor.decompose(article.find(id='fullWidthBottom'))

    categories = processor.collect_categories(
        article.find_all(class_='article-category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(class_='author-name'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='lead-paragraph'))
    text = processor.collect_text(article.find(class_='editorial'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='img-container'), 'http:')
    captions = processor.collect_image_captions(
        article.find_all(class_='figcaption'))

    return processor.create_dictionary('Mtv', url, r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
Example #16
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find('footer'))
    processor.decompose_all(article.find_all(class_='cb-module-title'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose_all(article.find_all('aside'))

    categories = processor.collect_categories(
        article.find_all(class_='cb-category'))
    datetime_list = processor.collect_datetime(article.find(class_='cb-date'))
    author = processor.collect_text(article.find(class_='cb-author'))
    title = processor.collect_text(article.find(class_='entry-title'))
    ingress = processor.collect_text(
        article.find(class_='cb-entry-content').find('h4'), True)
    text = processor.collect_text(article.find(class_='cb-entry-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    return processor.create_dictionary('Kansan uutiset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #17
def get_comments(comments_html_element, to_user):
    comments = []

    comments_list = comments_html_element.find(class_='comments-list')

    if comments_list:

        for comment_div in comments_list.find_all(class_='comment'):
            comment_data = {}

            comment_data['likes'] = processor.collect_text(
                comment_div.find(class_='action-bar-vote-count'))
            processor.decompose(comment_div.find(class_='action-bar'))

            comment_data['user'] = processor.collect_text(
                comment_div.find(class_='user-info-name'))
            comment_data['user_role'] = processor.collect_text(
                comment_div.find(class_='user-info-role'))
            comment_data['time'] = str(
                processor.collect_datetime(
                    comment_div.find(class_='user-info-timestamp'))[0])
            comment_data['text'] = processor.collect_text(
                comment_div.find(class_='comment-text'))
            comment_data['to'] = to_user
            comment_data['quote'] = {}

            blockquote = comment_div.find('blockquote')
            if blockquote:
                comment_data['quote']['quoted_user'] = processor.collect_text(
                    blockquote.find('header').find('strong'))
                comment_data['quote']['text'] = processor.collect_text(
                    blockquote.find(
                        class_='text-muted blockquote-collapse-body'))

            comments.append(comment_data)

    return comments
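
Together with get_answers from Example #6, this builds a nested thread structure. Under the same assumptions about the processor helpers, a single answer would come out shaped roughly like this (all field values are illustrative, not taken from a real page):

{
    'likes': u'3',
    'user': u'Matti',
    'user_role': u'moderator',
    'time': '2017-03-05 12:30:00',
    'text': u'...',
    'to': u'Liisa',
    'comments': [
        {'likes': u'1', 'user': u'Liisa', 'user_role': u'',
         'time': '2017-03-05 13:00:00', 'text': u'...', 'to': u'Matti',
         'quote': {'quoted_user': u'Matti', 'text': u'...'}},
    ],
}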
Example #18
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='post-single')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='avatar'))

    categories = processor.collect_categories(
        article.find_all(itemprop='articleSection'))
    datetime_list = processor.collect_datetime(
        article.find(itemprop='dateCreated datePublished'))
    author = processor.collect_text(article.find(rel='author'))
    title = processor.collect_text(article.find(itemprop='headline'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='sopuli-image-caption'))

    processor.decompose_all(article.find_all(itemprop='associatedMedia'))
    text = processor.collect_text(article.find(itemprop='articleBody'))

    return processor.create_dictionary('Kokemäenjokilaakson uutiset', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, u'', text,
                                       images, captions)
Example #19
def parse_from_archive(url, content):
    article = BeautifulSoup(content, "html.parser")

    if article is None:
        return processor.create_dictionary('Keskisuomalainen', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    meta = article.find(class_='date')

    categories = [processor.collect_text(meta).split(' ')[0]]
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(article.find(class_='author'), True)

    processor.decompose(meta)

    title_parts = article.find_all('h2')
    title = ''
    for part in title_parts:
        title += processor.collect_text(part, True) + ' '
    title = title.strip()

    ingress_parts = article.find_all('h4')
    ingress = ''
    for part in ingress_parts:
        ingress += processor.collect_text(part, True) + ' '
    ingress = ingress.strip()

    processor.decompose(article.find_all('p')[-1])

    text = processor.collect_text(article)

    return processor.create_dictionary('Keskisuomalainen', url, 200,
                                       categories, datetime_list, author,
                                       title, ingress, text, [u''], [u''])
Example #20
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-container')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='article__related'))
    processor.decompose_all(
        article.find_all(class_='smartblock--juttusivu-markkinointi'))

    meta = article.find(class_='news__meta')

    categories = [processor.collect_text(meta).split(' ')[0]]
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(meta.find(class_='news__source'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='article__text'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='image__caption'))

    return processor.create_dictionary('Kaleva', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #21
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-content')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-articles-container'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    datetime_data = article.find(class_='post-meta')
    processor.decompose(datetime_data.find(class_='category'))
    processor.decompose(datetime_data.find(class_='updated'))
    datetime_list = processor.collect_datetime(datetime_data)

    author = processor.collect_text(article.find(class_='author--main'))
    title = processor.collect_text(article.find(class_='heading--main'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    processor.decompose_all(article.find_all(class_='image-wrapper'))
    text = processor.collect_text(article.find(class_='content--main'))

    return processor.create_dictionary('Aamulehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #22
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='post')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_='author-avatar'))
    processor.decompose(
        article.find(id='after-single-post-widget-zone-single-post'))
    processor.decompose(article.find(id='sidebar'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(itemprop='name'))
    title = processor.collect_text(article.find(class_='xt-post-title'))
    text = processor.collect_text(article.find(class_='post-body'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Mahorkka', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #23
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='keywords-block'))
    processor.decompose_all(article.find_all(class_='share-buttons-block'))
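    # article('p') is shorthand for article.find_all('p'); this removes the
    # last paragraph of the article.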
    processor.decompose(article('p')[-1])
    processor.decompose(article.footer)
    processor.decompose(article.find(class_='wp-user-avatar'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    datetime_data = article.find(class_='single-post-date')
    processor.decompose(datetime_data.find(class_='category'))
    datetime_list = processor.collect_datetime(datetime_data)

    processor.decompose(article.find(class_='single-post-date'))

    author = processor.collect_text(
        article.find(class_='post-author').find('li'))
    title = processor.collect_text(article.find(class_='entry-title'))
    text = processor.collect_text(article.find(class_='post-content'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'https://demokraatti.fi')

    return processor.create_dictionary('Demokraatti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, [u''])
Example #24
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='node-wrap')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='kredIso'))
    processor.decompose_all(article.find_all(class_='tyrkkyBox'))
    processor.decompose(article.find(class_='avainsanat'))
    processor.decompose(article.find(class_='twitter-share-button'))
    processor.decompose(article.find(class_='fb-like'))
    processor.decompose(article.find(class_='moreLanka'))
    processor.decompose(article.find('cite'))

    meta = article.find(class_='juttutiedot')
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(meta.find(class_='author'))
    processor.decompose(meta)

    title = processor.collect_text(article.find('h2'), True)
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='kuvaTekstiIso'))

    processor.decompose_all(article.find_all(class_='kuvaTekstiIso'))
    processor.decompose_all(article.find_all('figcaption'))

    text = processor.collect_text(article)

    return processor.create_dictionary('Vihreä lanka', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       u'', text, images, captions)
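
Every parser shares the same contract: it takes a URL and returns the dictionary built by processor.create_dictionary. A minimal driver, assuming one of the modules above is importable as a site-specific parser (the import path and URL here are hypothetical):

import json

from parsers import aamulehti  # hypothetical import path for Example #21

article = aamulehti.parse('https://www.aamulehti.fi/uutiset/example-article')
print(json.dumps(article, default=str, ensure_ascii=False, indent=2))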