Beispiel #1
0
def article(article_id):
    """Retrieve a sanitized article.

    Looks up the article whose ``_id`` is the hex form of ``article_id`` in
    the ``articles`` collection and returns its metadata as JSON.  For every
    request method other than ``HEAD`` the stored text is fetched from the
    object store and included in the response as ``body``.

    :param article_id: UUID of the article, in any textual form accepted by
        :class:`uuid.UUID`.
    :returns: a ``200`` response with mimetype ``application/json`` and an
        ``Access-Control-Allow-Origin`` header taken from the
        ``server.domain`` parameter.

    Aborts with ``404`` when ``article_id`` is not a valid UUID, when no
    article matches, or when the article has no ``etag`` yet (its body has
    not been sanitized).

    Request
    -------

    ::

        GET /44d85795-248d-5899-b8ca-ac2bd8233755

    Response
    --------

    .. note::
        The following is formatted for readability and does not match the
        actual response from the API.  Also, the body parameter has been
        shortened to fit this example more concisely.

    ::

        HTTP/1.0 200 Ok

        {
          "body": "…Singularity, an Alternative Openstack Guest Agent | Hackery &c…",
          "url": "http://blog.alunduil.com/posts/singularity-an-alternative-openstack-guest-agent.html",
          "created_at": {"$date": 1374007667571},
          "etag": "6e2f69536ca15cc18260bffe7583b849",
          "_id": "03db19bb92205b4fb5fc3c4c0e4b1279",
          "parsed_at": {"$date": 1374008521414},
          "size": 9964
        }

    """

    try:
        _id = uuid.UUID(article_id).hex
    except ValueError:
        # A malformed identifier cannot name an existing article; report it
        # as missing instead of letting the error surface as a 500.
        abort(404)

    article = get_collection("articles").find_one({"_id": _id})

    logger.debug("article: %s", article)

    if article is None or "etag" not in article:
        # 404 not only if the object doesn't exist but also if we haven't
        # sanitized the body yet.
        abort(404)

    # The storage location is internal bookkeeping: remove it from the
    # document so it is not serialized into the response.
    container_name = article.pop("text_container_name")
    object_name = article.pop("text_object_name")

    logger.debug("article: %s", article)

    # TODO Catch connection issues and return Temporarily Unavailable.
    if request.method != "HEAD":
        # HEAD responses carry no body, so skip the object store round trip.
        data = get_container(container_name).get_object(object_name).fetch()

        logger.debug("type(data): %s", type(data))
        logger.debug("len(data): %s", len(data))

        article["body"] = data

    response = make_response(json.dumps(article, default=json_util.default), 200)

    response.mimetype = "application/json"

    response.headers["Access-Control-Allow-Origin"] = Parameters()["server.domain"]

    return response
Beispiel #2
0
def sanitize_html_consumer(channel, method, header, body):
    """Download and sanitize the HTML for the given article.

    The HTML should be simplified as much as possible without modifying the
    feel of the structure to someone reading the content of the body of the
    document.

    .. note::
        Analysis will be necessary that shows the statistics on sanitized HTML
        size for a determination as to whether we can store it inline in Mongo
        or out of band in an object store like Rackspace Cloud Files.

    The decisions and algorithms used for streamlining the HTML are not
    proprietary in any way and can be used and modified under the terms of this
    file's licensing but more importantly can be improved or modified if
    imperfections are found.

    Processing steps:

    1. Read the article ``_id`` from the JSON message ``body`` and load the
       article from the ``articles`` collection.
    2. Issue a ``HEAD`` request against the article's URL and read its
       ``ETag`` header.
    3. If the ETag is absent or differs from the stored one, download the
       page, extract its text, upload the text to the object store and
       update the article document.
    4. Acknowledge the message.

    :param channel: channel the message arrived on; used to acknowledge it.
    :param method: delivery metadata; ``method.delivery_tag`` is acknowledged.
    :param header: message properties (unused).
    :param body: JSON document containing at least an ``_id`` key.

    A message whose ``_id`` matches no article is logged and acknowledged
    without further processing.

    """

    _id = json.loads(body)["_id"]

    logger.debug("article._id: %s", _id)

    articles = get_collection("articles")

    # Exclude "_id" so the document can be handed to "$set" below unchanged.
    article = articles.find_one({"_id": _id}, {"_id": 0})

    if article is None:
        # Nothing to sanitize; acknowledge so the message is not left
        # unacknowledged after a crash on the missing document.
        logger.warning("no article found for _id: %s; discarding message", _id)
        channel.basic_ack(delivery_tag=method.delivery_tag)
        return

    head_request = urllib2.Request(article["url"])
    head_request.get_method = lambda: "HEAD"

    response = urllib2.urlopen(head_request)

    try:
        logger.debug("response: %s", response)
        logger.debug("response.info(): %s", response.info())
        logger.debug("response.info().__class__: %s", response.info().__class__)

        etag = response.info().getheader("etag")
    finally:
        response.close()

    # TODO Check Last-Modified?
    # TODO Use expires to set the next poll?
    # TODO Respect Cache-Control?
    # TODO Other header considerations.
    # TODO Use Content-Type to set encoding?

    # A server that sends no ETag gives us nothing to compare against, so
    # always fetch in that case; otherwise a never-parsed article (stored
    # etag missing) would compare None to None and be skipped forever.
    if etag is None or article.get("etag") != etag:
        logger.info("Parsing full HTML of %s", article["url"])

        article["etag"] = etag

        response = urllib2.urlopen(article["url"])

        try:
            soup = bs4.BeautifulSoup(response.read())
        finally:
            response.close()

        # TODO Use sanitize(soup) here when more than plain text is required.
        html = soup.get_text()

        article["parsed_at"] = datetime.datetime.now()

        # NOTE(review): sys.getsizeof reports the in-memory size of the
        # Python object, not the encoded byte length -- confirm which one
        # "size" is meant to record.
        size = sys.getsizeof(html)

        logger.debug("HTML Size: %s B", size)
        article["size"] = size

        # Split the canonical UUID at its first dash: the leading group
        # names the container, the remainder names the object.
        container_part, object_part = str(uuid.UUID(_id)).split("-", 1)

        article["text_container_name"] = "margarine-" + container_part
        article["text_object_name"] = object_part

        logger.info("Uploading text to cloudfiles")

        # NOTE(review): the payload is the text from get_text() yet it is
        # stored as text/html -- confirm the intended content type.
        container = get_container(article["text_container_name"])
        container.store_object(article["text_object_name"], html, content_type="text/html")

        logger.info("Uploaded text to cloudfiles")

        articles.update({"_id": _id}, {"$set": article}, upsert=True)

    logger.info("finished processing article: %s", article["url"])

    channel.basic_ack(delivery_tag=method.delivery_tag)