def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, "div", {"class": "article-body"})
    body = "".join(
        [p.text for p in body.findAll("p") if not check_for_strong_link(p)])

    # hack for coronavirus tag that appears in later articles
    unwanted = [
        "Fox News Flash top headlines are here. Check out what's clicking on Foxnews.com.",
        "Get all the latest news on\xa0coronavirus\xa0and more delivered daily to your inbox.\xa0Sign up here.",
    ]
    for unw in unwanted:
        # str.replace returns a new string - keep the result
        body = body.replace(unw, "")

    # TODO
    scripts = soup.findAll("script", attrs={"type": "application/ld+json"})
    assert len(scripts) == 2
    app = str(scripts[0].contents[0])
    app = app.replace("\n", "")
    app = json.loads(app)
    headline = app['headline']
    published = app['datePublished']

    return {
        **fox,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
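# `check_for_strong_link` is called above but not defined in this section.
# A minimal sketch, assuming it flags promo paragraphs whose entire text sits
# inside a <strong> link ("CLICK HERE ..." cross-links). `fox` is likewise
# assumed to be a module-level dict of paper metadata, e.g.
# fox = {"newspaper_id": "fox"}.
def check_for_strong_link(p_tag):
    """Assumed helper: True if the paragraph is only a <strong> cross-link."""
    strong = p_tag.find("strong")
    return (strong is not None and strong.find("a") is not None
            and strong.text.strip() == p_tag.text.strip())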
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, 'div', {'id': 'article-body'})
    except utils.ParserError:
        # fall back to the alternate layout
        body = utils.find_one_tag(soup, 'section', {"class": "article__main"})
    body = "".join([p.text for p in body.findAll("p")])

    app = utils.find_application_json(soup, find='headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **nzherald,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
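# `utils.find_one_tag` is used throughout but not shown. A sketch of its
# assumed contract - return the single matching tag and raise ParserError
# otherwise, which is what the try/except blocks in these parsers rely on:
class ParserError(Exception):
    """Raised when a page doesn't match the expected layout."""


def find_one_tag(soup, name, attrs=None):
    tags = soup.findAll(name, attrs if attrs else {})
    if len(tags) != 1:
        raise ParserError(
            f"expected one <{name}> matching {attrs}, found {len(tags)}")
    return tags[0]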
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(
        soup, "span", {
            "class":
            "sics-component__story__body sics-component__story__body--nativform"
        })
    body = "".join([p.text for p in body.findAll("p")])

    published = soup.findAll("meta", attrs={"itemprop": "datePublished"})
    assert len(published) == 1
    published = published[0]["content"]

    headline = soup.findAll("h1", attrs={"itemprop": "headline"})
    assert len(headline) == 1
    headline = headline[0].text

    return {
        **stuff,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
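# `utils.request` is also assumed rather than shown. From its call sites it
# returns the raw HTML, the parsed soup, the underlying response, the final
# URL after redirects, and an 'error' key on failure - a sketch under those
# assumptions:
import requests
from bs4 import BeautifulSoup


def request(url, headers=None):
    try:
        res = requests.get(url, headers=headers)
    except requests.RequestException as err:
        return {'error': str(err)}
    return {
        'response': res,
        'url': res.url,  # final URL after any redirects
        'html': res.text,
        'soup': BeautifulSoup(res.text, 'html.parser'),
    }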
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, "section", {"name": "articleBody"})
    body = "".join([p.text for p in body.findAll("p")])

    noise = [
        "The Times is committed to publishing a diversity of letters to the editor. We’d like to hear what you think about this or any of our articles. Here are some tips. And here’s our email: [email protected] The New York Times Opinion section on Facebook, Twitter (@NYTopinion) and Instagram.",
        "Want climate news in your inbox? Sign up here for Climate Fwd:, our email newsletter.",
        "For more news on climate and the environment, follow @NYTClimate on Twitter.",
    ]
    for n in noise:
        # iterate over the noise strings (not the body) and keep the result
        body = body.replace(n, "")

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **nytimes,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
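# `utils.find_application_json` pulls `headline` / `datePublished` out of the
# page's ld+json blocks for several parsers. A sketch mirroring the manual
# version in the Fox parser above, assuming it returns the first payload that
# contains the requested key:
import json


def find_application_json(soup, find):
    scripts = soup.findAll("script", attrs={"type": "application/ld+json"})
    for script in scripts:
        data = json.loads(str(script.contents[0]).replace("\n", ""))
        if find in data:
            return data
    raise ParserError(f"no application/ld+json block with key '{find}'")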
def parse_guardian_html(url):
    r = request(url)
    html = r['html']
    soup = r['soup']

    try:
        body = find_one_tag(
            soup, 'div', {
                'class':
                'article-body-commercial-selector css-79elbk article-body-viewer-selector'
            })
    except ParserError:
        body = find_one_tag(soup, 'div', {'itemprop': 'articleBody'})
    body = "".join([p.text for p in body.findAll("p")])

    headline = find_one_tag(soup, 'meta', {'property': 'og:title'})['content']
    published = find_one_tag(
        soup, 'meta', {'property': 'article:published_time'})['content']

    return {
        "newspaper_id": "guardian",
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_guardian_article_id(url),
        "date_published": published,
    }
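# `get_guardian_article_id` (and the per-paper `get_article_id` helpers used
# elsewhere in this section) are not shown. A plausible sketch, assuming the
# id is derived from the URL path:
from urllib.parse import urlparse


def get_guardian_article_id(url):
    path = urlparse(url).path.strip('/')
    return 'guardian/' + path.replace('/', '-')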
def check_url(url):
    # weird redirect to /av/
    if url == 'https://www.bbc.com/news/science-environment-52926683':
        return False

    # `The papers` - just a list of images - not sure how to check without parsing
    if url == 'https://www.bbc.com/news/uk-scotland-53961637':
        return False

    if url == 'https://www.bbc.com/news/science_and_environment':
        return False

    # check that it is in the news section
    u = urlparse(url)
    if u.path and u.path.split('/')[1] != "news":
        return False
    if len(u.path.split('/')) > 3:
        return False

    # check to see if it redirects
    res = request(url)
    if res['response'].status_code == 404:
        return False
    elif res['url'] != url:
        return check_url(res['url'])

    # check that the url ends with an integer
    matcher = re.compile(r'.*-\d+$')
    if matcher.match(url):
        return url
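# A usage sketch - section pages and non-news paths are rejected without a
# network request, per the checks above (the /sport/ URL is made up for
# illustration):
assert not check_url('https://www.bbc.com/news/science_and_environment')
assert not check_url('https://www.bbc.com/sport/football-12345678')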
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, "p", {"class": "description-text"})
    except utils.ParserError:
        body = utils.find_one_tag(soup, "p", {"class": "article-text row"})
    body = "".join([p.text for p in body.findAll("p")])

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **paper,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, "div", {"itemprop": "articleBody"})
    # drop the final <p>
    body = "".join([p.text for p in body.findAll("p")[:-1]])

    noise = [
        "This article is republished from The Conversation under a Creative Commons license. Read the original article here. ",
    ]
    for n in noise:
        body = body.replace(n, "")

    headline = utils.find_one_tag(soup, "title").text
    published = utils.find_one_tag(
        soup, "meta", {"itemprop": "datePublished"})["content"]

    return {
        **newshub,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
def test_bbc_bad_redirects():
    bad_redirect_urls = (
        "https://www.bbc.com/news/science-environment-51129250",
    )
    for u in bad_redirect_urls:
        r = request(u)
        assert not check_url(r['url'])
        assert not check_url(u)
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(
        soup, "div", {
            "itemprop": "text",
            "class": "ds-layout-grid ds-layout-grid--edged layout-article-body"
        })
    new_body = []
    for p_tag in body.findAll("p"):
        if 'class' in p_tag.attrs.keys():
            if 'article__body' in p_tag.attrs['class'][0]:
                new_body.append(p_tag.text)
    body = "".join(new_body)

    unwanted = [
        "For more coverage of climate change, register for The Climate Issue, our fortnightly newsletter, or visit our climate-change hub",
        'Sign up to our new fortnightly climate-change newsletter hereThis article appeared in the Leaders section of the print edition under the headline "The climate issue"',
    ]
    body = clean_string(body, unwanted)

    headline = utils.find_one_tag(soup, "span", {
        "class": "article__headline",
        "itemprop": "headline"
    }).text

    app = utils.find_one_tag(soup, "script", {"type": "application/json"})
    app = json.loads(app.text)
    if 'metadata' in app["props"]["pageProps"].keys():
        meta = app["props"]["pageProps"]["metadata"]
        published = meta["datePublished"]
    else:
        published = app['props']['pageProps']['content'][0]['datePublished']

    return {
        **economist,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
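# `clean_string` is assumed from this call site to strip each unwanted
# substring from the body - a minimal sketch:
def clean_string(body, unwanted):
    for unw in unwanted:
        body = body.replace(unw, "")
    return body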
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, "p", {"class": "description-text"})
        body = body.text
    except utils.ParserError:
        body = utils.find_one_tag(soup, "p", {"class": "article-text row"})
        body = "".join([p.text for p in body.findAll("p")])

    scripts = soup.findAll("script")
    for script in scripts:
        if script.string:
            if "publishedDate" in script.string:
                # get_text() & .text not working
                data = script.string
                data = data.replace("window.__INITIAL_STATE__ = ", "")
                data = data.replace("\n", "")
                data = data.replace(";", "")
                data = data.split("window.__ENV__ =")[0]
                data = json.loads(data)
                data = data["items"]
                key = list(data.keys())[0]
                published = int(
                    data[key]["content"]["attributes"]["publishedDate"])
                published = datetime.fromtimestamp(
                    published / 1000).isoformat()
                break
    else:
        # no script carried a publishedDate - fall back to the epoch
        published = datetime.fromtimestamp(0).isoformat()

    headline = utils.find_one_tag(soup, 'title').text
    headline = headline.split("|")[0]

    return {
        **skyau,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
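# `publishedDate` arrives as a Unix timestamp in milliseconds, hence the
# `/ 1000` before `datetime.fromtimestamp`. A minimal check of the conversion
# (pinned to UTC here; the parser itself uses the local timezone):
from datetime import datetime, timezone

assert datetime.fromtimestamp(0, tz=timezone.utc).isoformat() == \
    '1970-01-01T00:00:00+00:00'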
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    # hope it's going to be the first one :)
    try:
        body = soup.findAll("div", {"class": "longText"})[0]
        body = "".join([
            p.text for p in body.findAll("p", recursive=False) if not p.attrs
        ])
    except IndexError:
        raise utils.ParserError('no longText')

    try:
        headline = utils.find_one_tag(soup, 'title', {
            "id": None
        }).text.split('|')[0].strip(' ')

        date = soup.findAll('div', {'class': 'col1 dim'})[0].findAll('li')
        for li in date:
            st = li.findAll('strong')
            if st[0].text == 'Date':
                date = li.text
                break
        date = date.split('\n')[1]
        published = datetime.strptime(date, '%d.%m.%Y').isoformat()
    except IndexError:
        raise utils.ParserError('no headline or date')

    return {
        **dw,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
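# The date appears on the page as `dd.mm.yyyy`, so `strptime` with '%d.%m.%Y'
# recovers an ISO timestamp at midnight - the example date is illustrative:
from datetime import datetime

assert datetime.strptime('27.08.2020', '%d.%m.%Y').isoformat() == \
    '2020-08-27T00:00:00'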
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, 'article')
        body = ''.join([p.text for p in body.findAll('p')])
    except utils.ParserError:
        body = utils.find_one_tag(
            soup, "div", {
                "class": "article-body js-article-container",
                "itemprop": "articleBody"
            })
        body = body.findAll("p")
        body = "".join(p.text for p in body
                       if "c-letters-cta__text" not in p.attrs.values())

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    # sometimes can be "" in the ld+json
    if headline == "":
        headline = utils.find_one_tag(soup, "h1", {
            "class": "c-article-header__hed"
        }).text
    published = app['datePublished']

    return {
        **atlantic,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
def parse_url(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = utils.request(url, headers=headers)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, "div", {"itemprop": "articleBody"})
    body = "".join([p.text for p in body.findAll("p")[:-1]])

    headline = utils.find_one_tag(soup, 'title').text.split('|')[0]
    published = utils.find_one_tag(
        soup, 'meta', {'property': 'article:published_time'})['content']

    return {
        **dailymail,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
def parse_url(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = utils.request(url, headers=headers)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, 'div', {'id': 'main'})
    body = [
        p.text for p in body.findAll('p')
        if p.attrs == {} or p.attrs == {'dir': 'ltr'}
    ]
    body = ''.join(body)

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **independent,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
def parse_url(url):
    r = request(url)
    if 'error' in r.keys():
        return {'error': r['error']}
    html = r['html']
    soup = r['soup']

    body = find_one_tag(soup, 'article')
    text_blocks = body.findAll("div", attrs={'data-component': 'text-block'})
    body = []
    for block in text_blocks:
        body.extend(block.findAll("p", attrs={'class': None}))

    deep_body = []
    for p_tag in body:
        # style tags were slipping into the p tag
        for s in p_tag('style'):
            s.decompose()
        text = p_tag.get_text()

        # skip the last p tag if it holds a link - often Twitter or `Read more here`
        if p_tag.find('a') and p_tag is body[-1]:
            continue
        deep_body.append(text)
    body = "".join(deep_body)

    app = utils.find_application_json(soup, find='headline')
    return {
        "newspaper_id": "bbc",
        "body": body,
        "article_id": get_bbc_article_id(url),
        "headline": app['headline'],
        "article_url": url,
        "html": html,
        "date_published": app["datePublished"],
    }
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, "div", {"itemprop": "articleBody"})
    new_body = []
    for pt in body.findAll("div"):
        flag = False
        if 'class' in pt.attrs.keys():
            for class_ in pt.attrs['class']:
                if 'zn-body__paragraph' in class_:
                    flag = True
        if flag:
            new_body.append(pt.text)
    body = "".join(new_body)

    headline = utils.find_one_tag(soup, 'title', {
        "id": None
    }).text.replace(" - CNN", "")

    published = utils.find_one_tag(soup, 'meta', {'itemprop': 'datePublished'})
    published = published['content']

    return {
        **cnn,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, "div", {"class": "main-article-body"})
        body = body.findAll("p")
    except utils.ParserError:
        # possible to have multiple 'text section' divs
        body = soup.findAll("div", {"class": "text section"})
        p_tags = []
        for b in body:
            p_tags.extend(b.findAll("p"))
        body = p_tags

        if len(body) == 0:
            body = utils.find_one_tag(
                soup, "div", {"class": "wysiwyg wysiwyg--all-content"})
            body = body.findAll("p")

    body = "".join([p.text for p in body])

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = strip_aljazzera_dt(app["datePublished"])

    return {
        **aljazeera,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
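# `strip_aljazzera_dt` is not defined here; from its name and call site it
# presumably normalises the ld+json datetime. A sketch, assuming the goal is
# to drop a fractional-seconds/timezone suffix:
def strip_aljazzera_dt(dt):
    # e.g. '2020-05-11T10:30:00.000Z' -> '2020-05-11T10:30:00'
    return dt.split('.')[0]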
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, "div", {"class": "article-body"})
    except utils.ParserError:
        body = utils.find_one_tag(
            soup, "div", {"class": "ent-article-body ent-layout-centered"})

    new_body = []
    for p in body.findAll("p"):
        if 'data-elm-loc' in p.attrs.keys():
            new_body.append(p.text)
        if 'class' in p.attrs.keys():
            if 'font--body' in p.attrs['class']:
                new_body.append(p.text)
    body = "".join(new_body)

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **washington_post,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }