Example 1
def mercury_scraper(link):
    """
    Fetch an article through the MercuryParser API and return its plain text.

    Parameters
    ----------
    link : str
        URL of the article to scrape.

    Returns
    -------
    str or None
        The article body with HTML tags stripped, a notice string for PDF
        links, or ``None`` when the URL is unreachable (non-200 response).
    """
    # SECURITY NOTE(review): API key is hard-coded in source control; it
    # should be moved to an environment variable or a secrets store.
    API_KEY = 'ETkLjaGuTmB4FF0eQxWwPwUNjIeJwTDOJhKgigYA'

    USER_AGENT = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }

    # Mercury cannot parse PDF documents; bail out early.
    if link.lower().endswith('.pdf'):
        return "PDF file can't be accessed at this time."

    # Probe the URL first; if the page is not reachable, skip parsing.
    # (Was an implicit None fall-through in the original; made explicit.)
    if requests.get(link, headers=USER_AGENT).status_code != 200:
        return None

    parser = MercuryParser(api_key=API_KEY)
    article = parser.parse_article(link)
    text = article.json()['content']

    # MercuryParser still returns HTML markup;
    # BeautifulSoup strips the tags down to plain text.
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()
Example 2
def test_parse_article():
    """A live Medium post should yield the expected title, domain and HTTP 200."""
    article_url = "https://medium.com/swlh/alexa-play-some-music-isnt-the-only-time-amazon-is-listening-to-you-a556df19613f"  # noqa
    response = MercuryParser().parse_article(article_url)
    payload = response.json()
    assert "Alexa, play some music" in payload["title"]
    assert payload["domain"] == "medium.com"
    assert response.status_code == 200
Example 3
def test_parse_article():
    '''Smoke-test authenticated article parsing against a live Medium post.'''
    target = 'https://medium.com/swlh/alexa-play-some-music-isnt-the-only-time-amazon-is-listening-to-you-a556df19613f'  # noqa
    parser = MercuryParser(API_KEY)
    response = parser.parse_article(target)
    body = response.json()
    assert 'Alexa, play some music' in body['title']
    assert body['domain'] == 'medium.com'
    assert response.status_code == 200
Example 4
import os
import argparse
import time

import html2text

from dotenv import load_dotenv, find_dotenv

if __name__ == "__main__":
    # Pull MERCURY_PARSER_KEY (and any other settings) from a .env file.
    load_dotenv(find_dotenv())

    # Configure the HTML-to-text converter to emit prose only.
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True

    cli = argparse.ArgumentParser()
    cli.add_argument('urls',
                     help='The urls to parse.',
                     metavar='N',
                     nargs='+')
    options = cli.parse_args()

    # Raises KeyError if the key is missing — fail fast before any requests.
    mercury = MercuryParser(api_key=os.environ['MERCURY_PARSER_KEY'])

    for target in options.urls:
        print("Parsing", target, "...")
        raw_html = mercury.parse_article(target).json()['content']
        plain_text = converter.handle(raw_html)
        with open(slugify(target) + ".txt", "wb") as out_file:
            out_file.write(plain_text.encode('utf8'))
        # Throttle so we don't hammer the API.
        time.sleep(1)