Code example #1
File: scrape.py  Project: xsankorre/unsourced
import datetime
import json

# Assumed project-internal imports (module paths are guesses, not shown on this page):
# from unsourced.forms import EnterArticleForm
# from unsourced.models import Article, ArticleURL
# from unsourced.scrapeomat import Status

def process_scraped(url, response):
    """ process http response from scrapomat, return an article (or raise exception) """
    enter_form = EnterArticleForm(url=url)
    if response.error:
        # scrapomat down :-(
        raise Exception("Sorry, there was a problem reading the article.")

    results = json.loads(response.body)
    if results['status'] != Status.SUCCESS:
        error_messages = {
            Status.PAYWALLED:
            u"Sorry, that article seems to be behind a paywall.",
            Status.PARSE_ERROR:
            u"Sorry, we couldn't read the article",
            Status.BAD_REQ:
            u"Sorry, that URL doesn't look like an article",
            Status.NET_ERROR:
            u"Sorry, we couldn't read that article - is the URL correct?",
        }
        err_msg = error_messages.get(results['status'], u"Unknown error")

        raise Exception(err_msg)

    scraped_art = results['article']
    scraped_art['pubdate'] = datetime.datetime.fromtimestamp(
        scraped_art['pubdate'])
    # use entry form to validate everything's there (ugh!)
    enter_form.url.data = url
    enter_form.title.data = scraped_art['headline']
    enter_form.pubdate.data = scraped_art['pubdate']
    if not enter_form.validate():
        raise Exception(u"Sorry, we weren't able to automatically read all the details")

    # if we've got this far, we now have all the details needed to load the article into the DB. Yay!
    url_objs = [ArticleURL(url=u) for u in scraped_art['urls']]
    art = Article(scraped_art['headline'], scraped_art['permalink'],
                  scraped_art['pubdate'], url_objs)
    return art
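
For context, a minimal sketch of how process_scraped might be driven from a Tornado handler. The scrapomat endpoint URL, the handler class, and the error template are assumptions for illustration, not taken from the snippets on this page:

import urllib

import tornado.httpclient
import tornado.web


class ScrapeHandler(BaseHandler):  # hypothetical handler, for illustration only
    @tornado.web.asynchronous
    def get(self):
        url = self.get_argument("url")
        # assumed scrapomat endpoint; the real service address isn't shown above
        scrape_url = "http://localhost:8889/scrape?" + urllib.urlencode({"url": url})
        http = tornado.httpclient.AsyncHTTPClient()
        http.fetch(scrape_url,
                   callback=lambda response: self.on_response(url, response))

    def on_response(self, url, response):
        try:
            art = process_scraped(url, response)
        except Exception as err:
            self.render("error.html", message=unicode(err))  # assumed template
            return
        # same persistence pattern as the post() handler below
        self.session.add(art)
        self.session.commit()
        self.redirect("/art/%d" % (art.id,))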
Code example #2
    def post(self):
        form = EnterArticleForm(TornadoMultiDict(self))
        if not form.validate():
            self.render("enterarticle.html", form=form, notice=None)
            return

        # done - add the article to the db
        url = form.url.data
        title = form.title.data
        pubdate = form.pubdate.data

        url_objs = [ArticleURL(url=url)]
        art = Article(title, url, pubdate, url_objs)
        action = Action('art_add', self.current_user, article=art)
        self.session.add(art)
        self.session.add(action)
        self.session.commit()

        # all done. phew.
        self.redirect("/art/%d" % (art.id,))
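
The post() handler above wraps itself in a TornadoMultiDict so WTForms can read Tornado's request arguments. A common sketch of such an adapter looks like the following; this is an assumption about its shape, not the project's actual code:

class TornadoMultiDict(object):
    """Adapt a tornado.web.RequestHandler to the multidict interface WTForms expects."""

    def __init__(self, handler):
        self.handler = handler

    def __iter__(self):
        return iter(self.handler.request.arguments)

    def __len__(self):
        return len(self.handler.request.arguments)

    def __contains__(self, name):
        return name in self.handler.request.arguments

    def getlist(self, name):
        # WTForms calls getlist() to fetch all submitted values for a field name
        return self.handler.get_arguments(name, strip=False)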
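
Both examples validate input through an EnterArticleForm with url, title, and pubdate fields. Its definition isn't shown on this page; a minimal WTForms sketch consistent with the usage above might look like this (field types and validators are guesses):

from wtforms import Form, TextField, DateField, validators


class EnterArticleForm(Form):
    # field names inferred from the handlers above; validators are assumptions
    url = TextField(u"URL", [validators.Required(), validators.URL()])
    title = TextField(u"Title", [validators.Required()])
    pubdate = DateField(u"Date published", [validators.Optional()])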