def parse_url(url): response = utils.request(url) soup = response['soup'] html = response['html'] try: body = utils.find_one_tag(soup, "p", {"class": "description-text"}) except utils.ParserError: body = utils.find_one_tag(soup, "p", {"class": "article-text row"}) body = "".join([p.text for p in body.findAll("p")]) app = utils.find_application_json(soup, 'headline') headline = app['headline'] published = app['datePublished'] return { **paper, "body": body, "headline": headline, "article_url": url, "html": html, "article_id": get_article_id(url), "date_published": published, }
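#  Every parser in this file leans on a shared `utils` module and a
#  `get_article_id` helper that are defined elsewhere in the repo, and the
#  `**paper`, `**nzherald`, `**nytimes`, ... spreads pull in per-newspaper
#  metadata dicts (assumed shape: {'newspaper_id': 'nytimes', ...}).
#  The sketch below is an assumption reconstructed from the call sites in
#  this section -- it shows the shapes these helpers would need to have,
#  not their real bodies.

import json

import requests
from bs4 import BeautifulSoup


class ParserError(Exception):
    """Raised when an expected tag is missing -- parsers catch this to fall back."""


def request(url, headers=None):
    #  assumed shape: returns both the raw html and a parsed soup
    raw = requests.get(url, headers=headers)
    return {'html': raw.text, 'soup': BeautifulSoup(raw.text, 'html.parser')}


def find_one_tag(soup, tag, attrs=None):
    #  assumed shape: like soup.find, but fails loudly so callers can fall back
    found = soup.find(tag, attrs)
    if found is None:
        raise ParserError(f"could not find {tag} {attrs}")
    return found


def find_application_json(soup, find=None):
    #  assumed shape: parses the ld+json metadata block in the page head
    for script in soup.findAll('script', attrs={'type': 'application/ld+json'}):
        data = json.loads(script.text)
        if find is None or find in data:
            return data
    raise ParserError("no matching application/ld+json block")


def get_article_id(url):
    #  assumed shape: the last path segment makes a stable, unique id
    return url.strip('/').split('/')[-1]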
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, 'div', {'id': 'article-body'})
    except utils.ParserError:
        body = utils.find_one_tag(soup, 'section', {"class": "article__main"})

    body = "".join([p.text for p in body.findAll("p")])

    app = utils.find_application_json(soup, find='headline')
    headline = app['headline']
    published = app['datePublished']
    return {
        **nzherald,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
def parse_url(url): response = utils.request(url) soup = response['soup'] html = response['html'] body = utils.find_one_tag(soup, "section", {"name": "articleBody"}) body = "".join([p.text for p in body.findAll("p")]) noise = [ "The Times is committed to publishing a diversity of letters to the editor. We’d like to hear what you think about this or any of our articles. Here are some tips. And here’s our email: [email protected] The New York Times Opinion section on Facebook, Twitter (@NYTopinion) and Instagram.", "Want climate news in your inbox? Sign up here for Climate Fwd:, our email newsletter.", "For more news on climate and the environment, follow @NYTClimate on Twitter.", ] for n in body: body.replace(n, "") app = utils.find_application_json(soup, 'headline') headline = app['headline'] published = app['datePublished'] return { **nytimes, "body": body, "headline": headline, "article_url": url, "html": html, "article_id": get_article_id(url), "date_published": published, }
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, 'article')
        body = ''.join([p.text for p in body.findAll('p')])
    except utils.ParserError:
        body = utils.find_one_tag(
            soup,
            "div",
            {"class": "article-body js-article-container", "itemprop": "articleBody"},
        )
        body = body.findAll("p")
        #  BeautifulSoup stores the class attribute as a list, so check it
        #  directly -- p.attrs.values() holds lists, not strings, and would
        #  never match the bare string
        body = "".join(
            p.text for p in body
            if "c-letters-cta__text" not in p.get("class", [])
        )

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    #  the headline can sometimes be "" in the ld+json
    if headline == "":
        headline = utils.find_one_tag(
            soup, "h1", {"class": "c-article-header__hed"}
        ).text

    published = app['datePublished']
    return {
        **atlantic,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
def parse_url(url):
    #  send a browser User-Agent with the request
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = utils.request(url, headers=headers)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, 'div', {'id': 'main'})
    #  keep only plain paragraphs (no attrs, or just dir=ltr)
    body = [
        p.text for p in body.findAll('p')
        if p.attrs == {} or p.attrs == {'dir': 'ltr'}
    ]
    body = ''.join(body)

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']
    return {
        **independent,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
def parse_url(url):
    #  use the namespaced helpers, consistent with the other parsers
    r = utils.request(url)
    if 'error' in r.keys():
        return {'error': r['error']}

    html = r['html']
    soup = r['soup']

    body = utils.find_one_tag(soup, 'article')
    text_blocks = body.findAll("div", attrs={'data-component': 'text-block'})
    body = []
    for block in text_blocks:
        body.extend(block.findAll("p", attrs={'class': None}))

    deep_body = []
    for p_tag in body:
        #  style tags were slipping into the p tag
        for s in p_tag('style'):
            s.decompose()
        text = p_tag.get_text()
        #  skip the last p tag when it holds a link, often to Twitter
        #  or 'Read more here'
        if p_tag.find('a') and p_tag is body[-1]:
            continue
        deep_body.append(text)
    body = "".join(deep_body)

    app = utils.find_application_json(soup, find='headline')
    return {
        "newspaper_id": "bbc",
        "body": body,
        "article_id": get_bbc_article_id(url),
        "headline": app['headline'],
        "article_url": url,
        "html": html,
        "date_published": app["datePublished"],
    }
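#  `get_bbc_article_id` is defined elsewhere in the repo.  A hedged sketch,
#  assuming BBC story URLs end in a unique slug such as
#  https://www.bbc.co.uk/news/science-environment-56837908 -- the real
#  helper may extract the id differently:

def get_bbc_article_id(url):
    #  assumption: the trailing path segment is unique enough to key on
    return url.strip('/').split('/')[-1]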
def parse_url(url): response = utils.request(url) soup = response['soup'] html = response['html'] try: body = utils.find_one_tag(soup, "div", {"class": "main-article-body"}) body = body.findAll("p") except utils.ParserError: # possible to have multiple 'text section' divs body = soup.findAll("div", {"class": "text section"}) p_tags = [] for b in body: p_tags.extend(b.findAll("p")) body = p_tags if len(body) == 0: body = utils.find_one_tag(soup, "div", {"class": "wysiwyg wysiwyg--all-content"}) body = body.findAll("p") body = "".join([p.text for p in body]) app = utils.find_application_json(soup, 'headline') headline = app['headline'] published = app['datePublished'] return { **aljazeera, "body": body, "headline": headline, "article_url": url, "html": html, "article_id": get_article_id(url), "date_published": strip_aljazzera_dt(app["datePublished"]), }
def parse_url(url): response = utils.request(url) soup = response['soup'] html = response['html'] try: body = utils.find_one_tag(soup, "div", {"class": "article-body"}) except utils.ParserError: body = utils.find_one_tag( soup, "div", {"class": "ent-article-body ent-layout-centered"}) new_body = [] for p in body.findAll("p"): if 'data-elm-loc' in p.attrs.keys(): new_body.append(p.text) if 'class' in p.attrs.keys(): if 'font--body' in p.attrs['class']: new_body.append(p.text) body = "".join(new_body) app = utils.find_application_json(soup, 'headline') headline = app['headline'] published = app['datePublished'] return { **washington_post, "body": body, "headline": headline, "article_url": url, "html": html, "article_id": get_article_id(url), "date_published": published, }