Example #1
def create_doc(event: utils.LambdaEvent) -> str:
    """Build clean HTML file from URL source and store it to S3."""
    utils.Log.info("Fetch content from %s", event["url"])
    requests = helpers.import_non_stdlib_module("requests")
    response = requests.get(url=event["url"])

    if response.status_code != 200:
        raise utils.HandledError("Error downloading %s: "
                                 "HTTP status code %d" %
                                 (event["url"], response.status_code),
                                 status_code=response.status_code)

    utils.Log.info("Create readability-clean HTML text from %s source",
                   event["url"])
    readability = helpers.import_non_stdlib_module("readability")

    doc = readability.Document(response.text)

    utils.Log.debug("Document title:\n%s", doc.title())
    utils.Log.debug("Document readability-cleaned content:\n%s", doc.summary())

    now = datetime.utcnow()
    file_name = f"pocket-{event['item_id']}" if "item_id" in event else uuid4()
    key_name = now.strftime(f"%Y/%m/%d/{file_name}.html")

    aws.put_object_to_s3_bucket(key=key_name,
                                bucket=env["DOCUMENT_BUCKET"],
                                body=bytes(doc.summary(), encoding="utf-8"))

    file_url = f"s3://{env['DOCUMENT_BUCKET']}/{key_name}"

    utils.Log.info("File %s created successfully", file_url)

    return f"success: {file_url}"
def _poll_new_posts() -> list:
    """Poll RSS/Atom feed and return new entries since yesterday at midnight."""
    now = datetime.utcnow()
    yesterday = now - timedelta(days=1)

    # Jekyll's RSS plugin timestamps articles at midnight of their publishing day,
    # so we look at the window from yesterday at 00:00 up to (but excluding) today at 00:00.
    yesterday_midnight = helpers.midnightify(yesterday)
    today_midnight = helpers.midnightify(now)

    utils.Log.info("Considering new entries from %s between '%s' and '%s'",
                   env["BLOG_FEED_URL"], yesterday_midnight, today_midnight)

    feedparser = helpers.import_non_stdlib_module("feedparser")

    # Unfortunately our RSS feed doesn't support ETAG yet,
    # i.e. we need to fetch the whole feed content every time.
    utils.Log.info("Fetching content from %s", env["BLOG_FEED_URL"])
    source = feedparser.parse(env["BLOG_FEED_URL"])

    struct_to_datetime = helpers.struct_to_datetime

    new_posts = [
        entry.link for entry in reversed(source.entries)
        if yesterday_midnight <= struct_to_datetime(entry.published_parsed) < today_midnight
    ]

    if new_posts:
        utils.Log.info("Found %d new posts", len(new_posts))
        utils.Log.info("New posts: %s", new_posts)
    else:
        utils.Log.info("No new posts found")

    return new_posts
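
# helpers.midnightify and helpers.struct_to_datetime are not shown in this excerpt.
# Minimal sketches of the behaviour the filter above relies on (implementations
# are assumptions based on usage, not the actual helpers):
from calendar import timegm
from datetime import datetime
from time import struct_time


def midnightify(date: datetime) -> datetime:
    """Return `date` with its time component reset to 00:00:00."""
    return date.replace(hour=0, minute=0, second=0, microsecond=0)


def struct_to_datetime(struct: struct_time) -> datetime:
    """Convert a UTC time.struct_time (e.g. feedparser's published_parsed) to datetime."""
    return datetime.utcfromtimestamp(timegm(struct))
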
def scrape_page(url: str) -> dict:
    """Scrape title and description from webpage at `url`."""
    utils.Log.info("Scraping %s in search of title and description", url)
    output = {"url": url}

    utils.Log.debug("Fetching content from %s", url)
    page = helpers.send_http_request(url=url, method="GET").text

    utils.Log.debug("Parsing content with BeautifulSoup4")
    bs4 = helpers.import_non_stdlib_module("bs4")
    soup = bs4.BeautifulSoup(page, "html.parser")

    try:
        utils.Log.debug("Searching for categories meta tag")
        found = soup.find("meta", {"name": "categories"})
        categories = found["content"].split(",") if found else []
        categories = [cat.strip() for cat in categories if cat]

    except TypeError as error:
        utils.Log.warning("Could not find any categories meta tag: %s", error)
        categories = []  # fall back to an empty list so `output` can still be built

    output.update({
        "title": soup.find("title").contents[0],
        "categories": categories,
        "description": soup.find("meta", {"name": "description"})["content"],
    })

    utils.Log.debug("Parsing done. Output: %s", output)
    utils.Log.info("Scraping completed successfully")

    return output
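
# Hypothetical result of scrape_page for a page exposing the usual title,
# description and categories meta tags (URL and values are illustrative only):
#
#   scrape_page("https://example.com/some-post")
#   {"url": "https://example.com/some-post",
#    "title": "Some post",
#    "categories": ["aws", "python"],
#    "description": "A short summary of the post"}
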
def test_import_non_stdlib_module():
    mod = helpers.import_non_stdlib_module("pytest")
    assert isinstance(mod, ModuleType)
    assert mod.__name__ == "pytest"
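
# helpers.import_non_stdlib_module itself is not included in this excerpt. Based on
# the test above, a minimal sketch of the behaviour it is assumed to provide (a thin
# wrapper around importlib; the real helper may add error handling):
from importlib import import_module
from types import ModuleType


def import_non_stdlib_module(module_name: str) -> ModuleType:
    """Dynamically import a third-party module by name and return the module object."""
    return import_module(module_name)
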
'''Get page speed information from the Google PageSpeed API and store it in a DynamoDB table.'''
from os import environ as env
from statistics import mean
from typing import Tuple

import utils
import utils.aws as aws
import utils.handlers as handlers
import utils.helpers as helpers

requests = helpers.import_non_stdlib_module("requests")  # pylint: disable=invalid-name

GOOGLE_PAGESPEED_API_URL = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"


def _get_average_pagespeed_score_and_timestamp(url: str) -> Tuple[float, str]:
    """Return average of audit responses from Google PageSpeed API"""
    helpers.validate_url(url)

    utils.Log.info("Fetching data for %s from %s", url,
                   GOOGLE_PAGESPEED_API_URL)
    response = requests.get(url=GOOGLE_PAGESPEED_API_URL,
                            params={
                                "url": url,
                                "key": env["GOOGLE_PAGESPEED_API_KEY"],
                            })
    response = response.json()
    utils.Log.debug("Response content: %s", response)

    # Assumption (not in the original excerpt): read the PageSpeed v5 response via
    # lighthouseResult.categories scores and analysisUTCTimestamp; the original
    # implementation may parse the response differently.
    score = float(
        mean(val["score"]
             for val in response["lighthouseResult"]["categories"].values()))

    return score, response["analysisUTCTimestamp"]
Example #6
def create_epub(event: utils.LambdaEvent) -> str:
    """Build EPUB file from URL source and store it to S3."""
    utils.Log.info("Fetch content from %s", event["url"])
    requests = helpers.import_non_stdlib_module("requests")
    response = requests.get(url=event["url"])

    if response.status_code != 200:
        raise utils.HandledError("Error downloading %s: "
                                 "HTTP status code %d" %
                                 (event["url"], response.status_code),
                                 status_code=response.status_code)

    utils.Log.info("Create Markdown text from %s source", event["url"])
    html2text = helpers.import_non_stdlib_module("html2text")
    markdown_maker = html2text.HTML2Text()
    markdown_maker.ignore_links = True
    markdown = markdown_maker.handle(response.text)
    utils.Log.debug("Markdown content:\n%s", markdown)

    utils.Log.info("Create temporary file to store epub content")
    epub = NamedTemporaryFile(suffix=".epub")
    utils.Log.debug("tempfile created: %s", epub.name)

    try:
        completed = run(["pandoc", "--version"],
                        check=True,
                        capture_output=True,
                        text=True)
        utils.Log.debug(completed.stdout)

        pandoc_cmd = [
            "pandoc",
            "--quiet",
            "--from=markdown",
            "--to=epub",
            f"--metadata=title:'{event['title']}'",
            f"--output={epub.name}",
        ]
        timeout = 200
        utils.Log.info("Executing %s", join(pandoc_cmd))
        run(pandoc_cmd,
            input=bytes(markdown, encoding="utf-8"),
            check=True,
            timeout=timeout)
        utils.Log.info("EPUB creation completed (%d bytes)",
                       stat(epub.name).st_size)

    except TimeoutExpired as error:
        raise utils.HandledError(
            "Error: pandoc execution exceeded timeout of %d seconds" % timeout,
            status_code=500) from error

    except CalledProcessError as error:
        raise utils.HandledError("Error: %s" % error,
                                 status_code=500) from error

    now = datetime.utcnow()
    file_name = f"pocket-{event['item_id']}" if "item_id" in event else uuid4()
    key_name = now.strftime(f"%Y/%m/%d/{file_name}.epub")

    aws.put_object_to_s3_bucket(key=key_name,
                                bucket=env["EPUB_BUCKET"],
                                body=epub)

    file_url = f"s3://{env['EPUB_BUCKET']}/{key_name}"

    utils.Log.info("File %s created successfully", file_url)

    return f"success: {file_url}"
Example #7
'''Get page speed information from the Google PageSpeed API and store it in a DynamoDB table.'''
from os import environ as env
from datetime import datetime
from typing import Dict

import utils
import utils.aws as aws
import utils.handlers as handlers
import utils.helpers as helpers

feedparser = helpers.import_non_stdlib_module("feedparser")  # pylint: disable=invalid-name


def _get_last_update(feed_url: str) -> datetime:
    '''Fetch RSS entries and return the update datetime of the most recent entry.'''
    utils.Log.info("Fetching content from %s", feed_url)
    feed = feedparser.parse(feed_url)

    if not feed.get('entries'):
        raise IndexError(f"Unexpected response content from {feed_url}")

    last_update = feed['entries'][0]['updated_parsed']
    utils.Log.debug('%s updated_parsed: %s', feed_url, last_update)
    return helpers.struct_to_datetime(last_update)


def _get_stored_timestamp() -> Dict[str, str]:
    '''Scan DynamoDB table, return dict of `url: timestamp` items.'''
    data = aws.scan_dynamodb_table(table_name=env['DYNAMODB_TABLE'])

    if data["Count"] == 0:
        utils.Log.warning("Table %s is empty", env['DYNAMODB_TABLE'])