def mail():
    """Collect headline entries from the Daily Mail front page.

    Parses ``http://www.dailymail.co.uk`` with ``soupifier``, looks up the
    ``h2`` elements carrying the ``linkro-darkred`` class, and passes them
    through ``tenifier`` (presumably a cap on the number of headlines --
    confirm against that helper). For each remaining heading the first
    ``a`` match is resolved to a link, the linked page is parsed, and the
    element with ``itemprop="articleBody"`` is handed to ``bodifier``.

    Returns:
        The list that ``appendifier`` was called with once per headline.
    """
    url = "http://www.dailymail.co.uk"
    mail_headlines_array = []

    front_page = soupifier(url)
    heading_query = {
        "soup": front_page,
        "tag": 'h2',
        "attribute": 'class',
        "names": 'linkro-darkred',
        "find": 0
    }
    headings = tenifier(searchifier(heading_query))

    for heading in headings:
        # With "find": 0 the result is indexable; only the first anchor
        # inside the heading is used.
        anchor_query = {
            "soup": heading,
            "tag": 'a',
            "names": None,
            "find": 0
        }
        anchor = searchifier(anchor_query)[0]
        link = linkifier(url, anchor)

        article_page = soupifier(link)
        body_query = {
            "soup": article_page,
            "tag": 'div',
            "attribute": 'itemprop',
            "names": 'articleBody',
            "find": 1
        }
        body = bodifier(searchifier(body_query))

        appendifier(mail_headlines_array, anchor, link, url, body)

    return mail_headlines_array
def guardian():
    """Collect headline entries from the Guardian UK front page.

    Parses ``https://www.theguardian.com/uk``, narrows the page to the
    ``section`` whose id is ``headlines``, and searches it for the headline
    overlay anchors. The matches go through ``tenifier`` (presumably a cap
    on the number of headlines -- confirm against that helper). Each
    anchor is resolved to a link against ``https://www.theguardian.com/``,
    the linked page is parsed, and its article-body ``div`` is handed to
    ``bodifier``.

    Returns:
        The list that ``appendifier`` was called with once per headline.
    """
    url = "https://www.theguardian.com/uk"
    guardianuk_headlines_array = []

    front_page = soupifier(url)
    section_query = {
        "soup": front_page,
        "tag": 'section',
        "attribute": 'id',
        "names": 'headlines',
        "find": 0
    }
    headlines_section = searchifier(section_query)

    anchor_query = {
        "soup": headlines_section,
        "tag": 'a',
        "attribute": 'class',
        "names": 'u-faux-block-link__overlay js-headline-text',
        "find": 0
    }
    anchors = tenifier(searchifier(anchor_query))

    for anchor in anchors:
        link = linkifier('https://www.theguardian.com/', anchor)

        article_page = soupifier(link)
        body_query = {
            "soup": article_page,
            "tag": 'div',
            "attribute": 'class',
            "names": 'content__article-body from-content-api js-article__body',
            "find": 0
        }
        body = bodifier(searchifier(body_query))

        # The site's /uk landing URL (not the link base) is recorded
        # alongside each entry.
        appendifier(guardianuk_headlines_array, anchor, link, url, body)

    return guardianuk_headlines_array
def bbc():
    """Collect headline entries from the BBC News front page.

    Parses ``http://www.bbc.co.uk/news`` and searches it for anchors with
    the ``gs-c-promo-heading`` class. The matches go through ``tenifier``
    (presumably a cap on the number of headlines -- confirm against that
    helper). Each anchor is resolved to a link against
    ``http://www.bbc.co.uk/``, the linked page is parsed, and a body
    ``div`` is looked up using three candidate class names before being
    handed to ``bodifier``.

    Returns:
        The list that ``appendifier`` was called with once per headline.
    """
    url = "http://www.bbc.co.uk/news"
    bbc_headlines_array = []

    front_page = soupifier(url)
    promo_query = {
        "soup": front_page,
        "tag": 'a',
        "attribute": 'class',
        "names": 'gs-c-promo-heading',
        "find": 0
    }
    promos = tenifier(searchifier(promo_query))

    for promo in promos:
        link = linkifier('http://www.bbc.co.uk/', promo)

        article_page = soupifier(link)
        # Several class names are supplied for the body container;
        # how searchifier treats a list of names is defined by that helper.
        body_query = {
            "soup": article_page,
            "tag": 'div',
            "attribute": 'class',
            "names": ['story-body__inner','vxp-media__body','story-body sp-story-body gel-body-copy'],
            "find": 0
        }
        body = bodifier(searchifier(body_query))

        appendifier(bbc_headlines_array, promo, link, url, body)

    return bbc_headlines_array
def _independent_article_body(link):
    """Parse the page at *link* and return bodifier()'s result for its 'body-content' div."""
    article_page = soupifier(link)
    body_soup = searchifier({"soup": article_page, "tag": 'div', "attribute": 'class', "names": 'body-content', "find": 1})
    return bodifier(body_soup)


def independent():
    """Collect headline entries from the Independent front page.

    Two regions of ``https://www.independent.co.uk`` are processed:

    1. The ``splash-row`` inside ``section-content``: every ``content``
       div that contains an ``h2`` contributes one entry, linked through
       the first ``a`` found inside it.
    2. The ``eight-articles-dmpu position-left`` block: for every
       ``content`` div, each ``a`` match except the first contributes one
       entry whose headline is the ``headline`` div inside that anchor.

    For every entry the linked article page is parsed and its
    ``body-content`` div is handed to ``bodifier``.

    Returns:
        The list that ``appendifier`` was called with once per entry.
    """
    independent_headlines_array = []
    url = "https://www.independent.co.uk"
    soup = soupifier(url)

    # --- Region 1: splash row ------------------------------------------
    section_content = searchifier({"soup": soup, "tag": 'section', "attribute": 'class', "names": 'section-content', "find": 1})
    splash_row = searchifier({"soup": section_content, "tag": 'div', "attribute": 'class', "names": 'splash-row', "find": 1})
    splash_items = searchifier({"soup": splash_row, "tag": 'div', "attribute": 'class', "names": 'content', "find": 0})

    for item in splash_items:
        # Only content divs that carry an <h2> are treated as headlines.
        top_two = searchifier({"soup": item, "tag": 'h2', "names": None, "find": 1})
        if not top_two:
            continue
        anchor = searchifier({"soup": item, "tag": 'a', "names": None, "find": 1})
        link = linkifier(url, anchor)
        body = _independent_article_body(link)
        # The whole content div is recorded as the headline element here.
        appendifier(independent_headlines_array, item, link, url, body)

    # --- Region 2: eight-articles block ---------------------------------
    eight_articles_dmpu = searchifier({"soup": soup, "tag": 'div', "attribute": 'class', "names": 'eight-articles-dmpu position-left', "find": 1})
    top_eight = searchifier({"soup": eight_articles_dmpu, "tag": 'div', "attribute": 'class', "names": 'content', "find": 0})

    for item in top_eight:
        # The first anchor of each content div is skipped.
        anchors = searchifier({"soup": item, "tag": 'a', "names": None, "find": 0})[1:]
        for anchor in anchors:
            link = linkifier(url, anchor)
            headline = searchifier({"soup": anchor, "tag": 'div', "attribute": 'class', "names": 'headline', "find": 1})
            body = _independent_article_body(link)
            appendifier(independent_headlines_array, headline, link, url, body)

    return independent_headlines_array
def sun():
    """Collect headline entries from The Sun front page.

    Parses ``https://www.thesun.co.uk`` and takes at most ten ``a``
    elements with the ``text-anchor-wrap`` class. For each one the
    headline is the ``p.teaser__subdeck`` element found inside it (the
    result of ``find``, passed on unchanged) and the link is the anchor's
    raw ``href`` value. The linked page is parsed and, when it has at least
    one ``div.article__content``, the stripped text of every ``p`` in the
    first such div becomes the body; otherwise the body is an empty list.

    Returns:
        The list that ``appendifier`` was called with once per anchor.
    """
    url = "https://www.thesun.co.uk"
    the_sun_headlines_array = []

    front_page = soupifier(url)
    teasers = front_page.find_all('a', {'class': 'text-anchor-wrap'}, limit=10)

    for teaser in teasers:
        headline = teaser.find('p', {'class': 'teaser__subdeck'})
        # Unlike the other scrapers, the href is used as-is (no linkifier).
        link = teaser['href']

        article_page = soupifier(link)
        containers = article_page.find_all('div', {'class': 'article__content'})
        body = (
            [paragraph.text.strip() for paragraph in containers[0].find_all('p')]
            if containers
            else []
        )

        appendifier(the_sun_headlines_array, headline, link, url, body)

    return the_sun_headlines_array
def telegraph():
    """Collect headline entries from the Telegraph front page.

    Parses ``https://www.telegraph.co.uk`` and searches for ``h3``
    elements using two candidate class names. The matches go through
    ``tenifier`` (presumably a cap on the number of headlines -- confirm
    against that helper). For each heading a link source is looked up with
    a two-entry query list (an ``a`` tag, then a
    ``list-of-entities__item-headline-text`` span); if ``linkifier``
    returns ``False`` for it, the heading's parent element is tried
    instead. The linked page is parsed and its body is looked up with
    another two-entry query list (``article`` tag, then
    ``div.js-article-inner``) before being handed to ``bodifier``.

    Returns:
        The list that ``appendifier`` was called with once per heading.
    """
    url = "https://www.telegraph.co.uk"
    telegraph_headlines_array = []

    front_page = soupifier(url)
    heading_query = {
        "soup": front_page,
        "tag": 'h3',
        "attribute": 'class',
        "names": ['list-of-entities__item-body-headline','list-headline'],
        "find": 0
    }
    headings = tenifier(searchifier(heading_query))

    for heading in headings:
        # NOTE(review): a list of queries is presumably tried in order as
        # fallbacks -- confirm against searchifier.
        link_source_queries = [
            {
                "soup": heading,
                "tag": 'a',
                "names": None,
                "find": 1
            },
            {
                "soup": heading,
                "tag": 'span',
                "attribute": 'class',
                "names": 'list-of-entities__item-headline-text',
                "find": 1
            },
        ]
        link_source = searchifier(link_source_queries)
        link = linkifier(url, link_source)
        if link is False:
            # No usable link from the heading itself; retry with its parent.
            link_source = heading.parent
            link = linkifier(url, link_source)

        article_page = soupifier(link)
        body_queries = [
            {
                "soup": article_page,
                "tag": 'article',
                "names": None,
                "find": 0
            },
            {
                "soup": article_page,
                "tag": 'div',
                "attribute": 'class',
                "names": 'js-article-inner',
                "find": 0
            },
        ]
        body = bodifier(searchifier(body_queries))

        # The original <h3> element (not the link source) is recorded as
        # the headline.
        appendifier(telegraph_headlines_array, heading, link, url, body)

    return telegraph_headlines_array