Example #1
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, "div", {"class": "article-body"})
    body = "".join(
        [p.text for p in body.findAll("p") if not check_for_strong_link(p)])

    unwanted = [
        "Fox News Flash top headlines are here. Check out what's clicking on Foxnews.com.",
        "Get all the latest news on\xa0coronavirus\xa0and more delivered daily to your inbox.\xa0Sign up here.",
    ]
    #  hack for coronavirus tag that appears in later articles
    for unw in unwanted:
        body = body.replace(unw, "")

    #  TODO - manual ld+json extraction for headline & date published
    scripts = soup.findAll("script", attrs={"type": "application/ld+json"})
    assert len(scripts) == 2
    app = str(scripts[0].contents[0])
    app = app.replace("\n", "")
    app = json.loads(app)
    headline = app['headline']
    published = app['datePublished']

    return {
        **fox,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
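
The check_for_strong_link helper is not defined in these examples. A minimal sketch of what it presumably does, assuming the Fox News promo paragraphs it filters out wrap a link in a <strong> tag:

def check_for_strong_link(p_tag):
    #  assumption: promo paragraphs hold an <a> inside a <strong>
    strong = p_tag.find("strong")
    return strong is not None and strong.find("a") is not None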
Example #2
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, 'div', {'id': 'article-body'})
    except utils.ParserError:
        body = utils.find_one_tag(soup, 'section', {"class": "article__main"})

    body = "".join([p.text for p in body.findAll("p")])

    app = utils.find_application_json(soup, find='headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **nzherald,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
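
utils.find_one_tag and utils.ParserError appear in nearly every parser but are never shown. A plausible sketch, assuming find_one_tag enforces that exactly one tag matches and raises ParserError otherwise, which is what the try/except fallbacks above rely on:

class ParserError(Exception):
    pass

def find_one_tag(soup, tag, attrs=None):
    #  assumption: exactly one matching tag is required
    tags = soup.findAll(tag, attrs=attrs or {})
    if len(tags) != 1:
        raise ParserError(f"expected one <{tag}>, found {len(tags)}")
    return tags[0]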
Example #3
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(
        soup, "span", {
            "class":
            "sics-component__story__body sics-component__story__body--nativform"
        })
    body = "".join([p.text for p in body.findAll("p")])

    published = soup.findAll("meta", attrs={"itemprop": "datePublished"})
    assert len(published) == 1
    published = published[0]["content"]

    headline = soup.findAll("h1", attrs={"itemprop": "headline"})
    assert len(headline) == 1
    headline = headline[0].text

    return {
        **stuff,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
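
utils.find_application_json is also undefined here; it presumably automates what Example #1 does by hand with the ld+json scripts. A hedged sketch, reusing the ParserError from the previous sketch:

import json

def find_application_json(soup, find='headline'):
    #  assumption: return the first ld+json payload carrying the `find` key
    for script in soup.findAll("script", attrs={"type": "application/ld+json"}):
        data = json.loads(str(script.contents[0]).replace("\n", ""))
        if find in data:
            return data
    raise ParserError(f"no application/ld+json containing '{find}'")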
Example #4
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, "section", {"name": "articleBody"})
    body = "".join([p.text for p in body.findAll("p")])
    noise = [
        "The Times is committed to publishing a diversity of letters to the editor. We’d like to hear what you think about this or any of our articles. Here are some tips. And here’s our email: [email protected] The New York Times Opinion section on Facebook, Twitter (@NYTopinion) and Instagram.",
        "Want climate news in your inbox? Sign up here for Climate Fwd:, our email newsletter.",
        "For more news on climate and the environment, follow @NYTClimate on Twitter.",
    ]
    for n in noise:
        body = body.replace(n, "")

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **nytimes,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
Example #5
def parse_guardian_html(url):
    r = request(url)

    html = r['html']
    soup = r['soup']

    try:
        body = find_one_tag(
            soup, 'div', {
                'class':
                'article-body-commercial-selector css-79elbk article-body-viewer-selector'
            })

    except ParserError:
        body = find_one_tag(soup, 'div', {'itemprop': 'articleBody'})

    body = "".join([p.text for p in body.findAll("p")])

    headline = find_one_tag(soup, 'meta', {'property': 'og:title'})['content']
    published = find_one_tag(soup, 'meta',
                             {'property': 'article:published_time'})['content']

    return {
        "newspaper_id": "guardian",
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_guardian_article_id(url),
        "date_published": published,
    }
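
get_guardian_article_id is not shown either; a hypothetical implementation that derives a stable id from the URL path:

from urllib.parse import urlparse

def get_guardian_article_id(url):
    #  hypothetical: use the flattened URL path as the article id
    return urlparse(url).path.strip('/').replace('/', '-')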
Example #6
def check_url(url):
    #  weird redirect to /av/
    if url == 'https://www.bbc.com/news/science-environment-52926683':
        return False
    #  `The papers` - just a list of images - not sure how to check without parsing
    if url == 'https://www.bbc.com/news/uk-scotland-53961637':
        return False
    if url == 'https://www.bbc.com/news/science_and_environment':
        return False

    #  check to see if it is the news section
    u = urlparse(url)
    if u.path and u.path.split('/')[1] != "news":
        return False

    if len(u.path.split('/')) > 3:
        return False

    #  check to see if it redirects
    res = request(url)
    if res['response'].status_code == 404:
        return False
    elif res['url'] != url:
        return check_url(res['url'])

    #  check to see if url ends with an integer
    matcher = re.compile(r'.*-\d+$')
    if matcher.match(url):
        return url
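
Note the asymmetric contract: check_url returns the URL itself on success and False (or implicitly None) otherwise, so callers must treat the result as truthy rather than strictly boolean. Two illustrative calls with made-up URLs that fail before any network request:

assert not check_url('https://www.bbc.com/sport/football-53000000')  # not under /news/
assert not check_url('https://www.bbc.com/news/uk/extra-53000000')   # path nested too deep
#  a well-formed /news/... URL ending in digits is returned unchanged,
#  subject to the live redirect and 404 checks inside request()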
Example #7
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, "p", {"class": "description-text"})

    except utils.ParserError:
        body = utils.find_one_tag(soup, "p", {"class": "article-text row"})

    body = "".join([p.text for p in body.findAll("p")])

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **paper,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
Example #8
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, "div", {"itemprop": "articleBody"})
    body = "".join([p.text for p in body.findAll("p")[:-1]])

    noise = [
        "This article is republished from The Conversation under a Creative Commons license. Read the original article here. ",
    ]
    for n in noise:
        body = body.replace(n, "")

    headline = utils.find_one_tag(soup, "title").text
    published = utils.find_one_tag(soup, "meta",
                                   {"itemprop": "datePublished"})["content"]

    return {
        **newshub,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
Example #9
def test_bbc_bad_redirects():
    bad_redirect_urls = (
        "https://www.bbc.com/news/science-environment-51129250", )
    for u in bad_redirect_urls:
        r = request(u)
        assert not check_url(r['url'])
        assert not check_url(u)
Example #10
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(
        soup, "div", {
            "itemprop": "text",
            "class": "ds-layout-grid ds-layout-grid--edged layout-article-body"
        })

    new_body = []
    for p_tag in body.findAll("p"):
        if 'class' in p_tag.attrs.keys():
            if 'article__body' in p_tag.attrs['class'][0]:
                new_body.append(p_tag.text)

    body = "".join(new_body)
    unwanted = [
        "For more coverage of climate change, register for The Climate Issue, our fortnightly newsletter, or visit our climate-change hub",
        'Sign up to our new fortnightly climate-change newsletter hereThis article appeared in the Leaders section of the print edition under the headline "The climate issue"',
    ]
    body = clean_string(body, unwanted)

    headline = utils.find_one_tag(soup, "span", {
        "class": "article__headline",
        "itemprop": "headline"
    }).text

    app = utils.find_one_tag(soup, "script", {"type": "application/json"})
    app = json.loads(app.text)

    if 'metadata' in app["props"]["pageProps"].keys():
        meta = app["props"]["pageProps"]["metadata"]
        published = meta["datePublished"]
    else:
        published = app['props']['pageProps']['content'][0]['datePublished']

    return {
        **economist,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
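
clean_string is referenced but never defined in these examples; presumably it strips each unwanted snippet, along these lines:

def clean_string(body, unwanted):
    #  assumption: remove every boilerplate snippet from the body text
    for u in unwanted:
        body = body.replace(u, "")
    return body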
Example #11
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, "p", {"class": "description-text"})
        body = body.text
    except utils.ParserError:
        body = utils.find_one_tag(soup, "p", {"class": "article-text row"})
        body = "".join([p.text for p in body.findAll("p")])

    #  default to the epoch so `published` is always defined
    published = datetime.fromtimestamp(0).isoformat()
    scripts = soup.findAll("script")
    for script in scripts:
        if script.string and "publishedDate" in script.string:
            #  get_text() & .text not working
            data = script.string
            data = data.replace("window.__INITIAL_STATE__ = ", "")
            data = data.replace("\n", "")
            data = data.replace(";", "")
            data = data.split("window.__ENV__ =")[0]
            data = json.loads(data)
            data = data["items"]
            key = list(data.keys())[0]
            published = int(
                data[key]["content"]["attributes"]["publishedDate"])
            published = datetime.fromtimestamp(published / 1000).isoformat()
            break

    headline = utils.find_one_tag(soup, 'title').text
    headline = headline.split("|")[0]

    return {
        **skyau,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
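
The publishedDate buried in the Sky News Australia state blob is a millisecond epoch, hence the division by 1000. A standalone sketch of the extraction with an invented payload:

import json
from datetime import datetime

raw = 'window.__INITIAL_STATE__ = {"items": {"abc": {"content": {"attributes": {"publishedDate": "1598918400000"}}}}};'
data = json.loads(raw.replace("window.__INITIAL_STATE__ = ", "").replace(";", ""))
key = list(data["items"].keys())[0]
ms = int(data["items"][key]["content"]["attributes"]["publishedDate"])
print(datetime.fromtimestamp(ms / 1000).isoformat())  # 2020-09-01T00:00:00 (local timezone dependent)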
Example #12
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    #  hope it's going to be the first one :)
    try:
        body = soup.findAll("div", {"class": "longText"})[0]
        body = "".join([
            p.text for p in body.findAll("p", recursive=False) if not p.attrs
        ])
    except IndexError:
        raise utils.ParserError('no longText')
    try:
        headline = utils.find_one_tag(soup, 'title', {
            "id": None
        }).text.split('|')[0].strip(' ')

        rows = soup.findAll('div', {'class': 'col1 dim'})[0].findAll('li')
        for li in rows:
            st = li.findAll('strong')
            if st[0].text == 'Date':
                date = li.text
                break

        date = date.split('\n')[1]
        published = datetime.strptime(date, '%d.%m.%Y').isoformat()

    except IndexError:
        raise utils.ParserError('no headline or date')

    return {
        **dw,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
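
The DW date cell concatenates a label and a value, which is why the text is split on the newline before strptime. A small illustration with a made-up value:

from datetime import datetime

date = "Date\n07.06.2020"  # invented example of the <li> text
date = date.split('\n')[1]
print(datetime.strptime(date, '%d.%m.%Y').isoformat())  # 2020-06-07T00:00:00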
Example #13
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, 'article')
        body = ''.join([p.text for p in body.findAll('p')])

    except utils.ParserError:
        body = utils.find_one_tag(
            soup, "div", {
                "class": "article-body js-article-container",
                "itemprop": "articleBody"
            })
        body = body.findAll("p")
        body = "".join(p.text for p in body
                       if "c-letters-cta__text" not in p.attrs.values())

    app = utils.find_application_json(soup, 'headline')

    headline = app['headline']
    #  sometimes can be "" in the ld+json
    if headline == "":
        headline = utils.find_one_tag(soup, "h1", {
            "class": "c-article-header__hed"
        }).text

    published = app['datePublished']

    return {
        **atlantic,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
Example #14
def parse_url(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = utils.request(url, headers=headers)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, "div", {"itemprop": "articleBody"})
    body = "".join([p.text for p in body.findAll("p")[:-1]])

    headline = utils.find_one_tag(soup, 'title').text.split('|')[0]

    published = utils.find_one_tag(
        soup, 'meta', {'property': 'article:published_time'})['content']

    return {
        **dailymail,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
Example #15
def parse_url(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = utils.request(url, headers=headers)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, 'div', {'id': 'main'})
    body = [
        p.text for p in body.findAll('p')
        if p.attrs == {} or p.attrs == {'dir': 'ltr'}
    ]
    body = ''.join(body)

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **independent,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
Example #16
def parse_url(url):
    r = request(url)
    if 'error' in r.keys():
        return {'error': r['error']}
    html = r['html']
    soup = r['soup']

    body = find_one_tag(soup, 'article')
    text_blocks = body.findAll("div", attrs={'data-component': 'text-block'})

    body = []
    for block in text_blocks:
        body.extend(block.findAll("p", attrs={'class': None}))

    deep_body = []
    for p_tag in body:
        #  style tags were slipping into the p tag
        for s in p_tag('style'):
            s.decompose()

        text = p_tag.get_text()
        #  skip the last p tag when it holds a link - often Twitter or 'Read more here'
        if not (p_tag.find('a') and p_tag is body[-1]):
            deep_body.append(text)
    body = "".join(deep_body)
    app = utils.find_application_json(soup, find='headline')
    return {
        "newspaper_id": "bbc",
        "body": body,
        "article_id": get_bbc_article_id(url),
        "headline": app['headline'],
        "article_url": url,
        "html": html,
        "date_published": app["datePublished"],
    }
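
request / utils.request is the shared entry point for every parser. Judging from the keys accessed across these examples ('soup', 'html', 'response', 'url' and, in the BBC parser, 'error'), it probably looks something like this sketch:

import requests
from bs4 import BeautifulSoup

def request(url, headers=None):
    #  assumption: wrap requests + BeautifulSoup, surfacing failures as an 'error' key
    try:
        response = requests.get(url, headers=headers)
    except requests.RequestException as e:
        return {'error': str(e)}
    html = response.text
    return {
        'response': response,
        'url': response.url,  # final URL after any redirects
        'html': html,
        'soup': BeautifulSoup(html, 'html.parser'),
    }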
Example #17
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, "div", {"itemprop": "articleBody"})
    new_body = []
    for pt in body.findAll("div"):
        flag = False

        if 'class' in pt.attrs.keys():
            class_ = pt.attrs['class']

            for thing in class_:
                if 'zn-body__paragraph' in thing:
                    flag = True

        if flag:
            new_body.append(pt.text)

    body = new_body
    body = "".join(body)

    headline = utils.find_one_tag(soup, 'title', {
        "id": None
    }).text.replace(" - CNN", "")
    published = utils.find_one_tag(soup, 'meta', {'itemprop': 'datePublished'})
    published = published['content']

    return {
        **cnn,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
Example #18
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, "div", {"class": "main-article-body"})
        body = body.findAll("p")

    except utils.ParserError:
        #  possible to have multiple 'text section' divs
        body = soup.findAll("div", {"class": "text section"})
        p_tags = []
        for b in body:
            p_tags.extend(b.findAll("p"))
        body = p_tags

    if len(body) == 0:
        body = utils.find_one_tag(soup, "div", {"class": "wysiwyg wysiwyg--all-content"})
        body = body.findAll("p")

    body = "".join([p.text for p in body])

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **aljazeera,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": strip_aljazzera_dt(app["datePublished"]),
    }
Example #19
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, "div", {"class": "article-body"})

    except utils.ParserError:
        body = utils.find_one_tag(
            soup, "div", {"class": "ent-article-body ent-layout-centered"})

    new_body = []
    for p in body.findAll("p"):

        if 'data-elm-loc' in p.attrs.keys():
            new_body.append(p.text)

        if 'class' in p.attrs.keys():
            if 'font--body' in p.attrs['class']:
                new_body.append(p.text)

    body = "".join(new_body)
    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **washington_post,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
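
Every parser returns the same contract - a dict with 'body', 'headline', 'article_url', 'html', 'article_id' and 'date_published' merged over the newspaper metadata - so a caller can dispatch on the domain. A hypothetical sketch (in practice each parse_url lives in its own newspaper module, so the registry entries would be module-qualified):

from urllib.parse import urlparse

PARSERS = {
    'www.theguardian.com': parse_guardian_html,
    #  ... one entry per newspaper parser above
}

def parse_article(url):
    parser = PARSERS.get(urlparse(url).netloc)
    if parser is None:
        raise ValueError(f"no parser registered for {url}")
    return parser(url)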