Example 1
    @tornado.web.asynchronous   # keep the connection open until we finish
    @tornado.gen.engine         # needed for the yield gen.Task(...) call below
    def get(self):
        url = self.get_argument('url', None)
        if url is None:
            # blank url - prompt for one
            form = SubmitArticleForm()
            self.render("addarticle.html", form=form, notice='')
            return

        # basic validation
        form = SubmitArticleForm(TornadoMultiDict(self))
        if not form.validate():
            self.render("addarticle.html", form=form, notice='')
            return

        # article already in db?
        art = self.session.query(Article).join(ArticleURL).\
                filter(ArticleURL.url==url).first()
        print "ART: ",art

        if art is None:
            # nope. try scraping it.
            params = {'url': url}
            scrape_url = config.settings.scrapeomat + '/scrape?' + urllib.urlencode(params)
            http = tornado.httpclient.AsyncHTTPClient()

            response = yield tornado.gen.Task(http.fetch, scrape_url)

            try:
                art = scrape.process_scraped(url, response)
            except Exception as err:
                # uhoh... we weren't able to scrape it. If user wants article, they'll have to log
                # in and enter the details themselves...

                login_next_url = None
                enter_form = EnterArticleForm(url=url)
                if self.current_user is None:
                    params = {'url': url}
                    login_next_url = '/enterarticle?' + urllib.urlencode(params)
                notice = unicode(err)
                notice += " Please enter the details manually (or try again later)."
                self.render("enterarticle.html", form=enter_form, notice=notice, login_next_url=login_next_url)
                return

            # ok, add the new article to the db (with an action)
            user = self.current_user
            if user is None:
                user = self.get_anon_user()
            action = Action('art_add', user, article=art)
            self.session.add(art)
            self.session.add(action)
            self.session.commit()

        # all done
        self.redirect("/art/%d" % (art.id,))
        return
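Examples 1 and 3 build their forms from TornadoMultiDict(self), an adapter that is not shown in these snippets. Below is a minimal sketch of what such an adapter usually looks like, assuming WTForms' formdata protocol (a getlist method plus the standard container methods) and Tornado's request.arguments dict of byte-string lists:

class TornadoMultiDict(object):
    """Adapt a Tornado RequestHandler so WTForms can read its arguments."""
    def __init__(self, handler):
        self.handler = handler

    def __iter__(self):
        return iter(self.handler.request.arguments)

    def __len__(self):
        return len(self.handler.request.arguments)

    def __contains__(self, name):
        return name in self.handler.request.arguments

    def getlist(self, name):
        # Tornado stores argument values as lists of byte strings;
        # WTForms expects unicode strings.
        return [v.decode("utf-8") for v in
                self.handler.request.arguments.get(name, [])]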
Example 2
def process_scraped(url, response):
    """ process http response from scrapeomat, return an article (or raise exception) """
    scraped_art = None
    enter_form = EnterArticleForm(url=url)
    err_msg = None
    if response.error:
        # scrapeomat down :-(
        raise Exception("Sorry, there was a problem reading the article.")

    results = json.loads(response.body)
    if results['status'] != Status.SUCCESS:
        error_messages = {
            Status.PAYWALLED: u"Sorry, that article seems to be behind a paywall.",
            Status.PARSE_ERROR: u"Sorry, we couldn't read the article",
            Status.BAD_REQ: u"Sorry, that URL doesn't look like an article",
            Status.NET_ERROR: u"Sorry, we couldn't read that article - is the URL correct?",
        }
        err_msg = error_messages.get(results['status'], "Unknown error")

        raise Exception(err_msg)

    scraped_art = results['article']
    scraped_art['pubdate'] = datetime.datetime.fromtimestamp(
        scraped_art['pubdate'])
    # use entry form to validate everything's there (ugh!)
    enter_form.url.data = url
    enter_form.title.data = scraped_art['headline']
    enter_form.pubdate.data = scraped_art['pubdate']
    if not enter_form.validate():
        scraped_art = None
        err_msg = u"Sorry, we weren't able to automatically read all the details"
        raise Exception(err_msg)

    # if we've got this far, we now have all the details needed to load the article into the DB. Yay!
    url_objs = [ArticleURL(url=u) for u in scraped_art['urls']]
    art = Article(scraped_art['headline'], scraped_art['permalink'],
                  scraped_art['pubdate'], url_objs)
    return art
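process_scraped compares results['status'] against members of a Status class that is defined elsewhere (shared with the scrapeomat service). A plausible sketch of that enumeration follows; the member names come from the code above, but the numeric values are invented for illustration:

class Status(object):
    """Result codes returned by the scrapeomat's /scrape endpoint.
    The names match the usage above; the values are illustrative only."""
    SUCCESS = 0
    BAD_REQ = 1
    PAYWALLED = 2
    PARSE_ERROR = 3
    NET_ERROR = 4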
Example 3
    def post(self):
        form = EnterArticleForm(TornadoMultiDict(self))
        if not form.validate():
            self.render("enterarticle.html", form=form, notice=None)
            return

        # done - add the article to the db
        url = form.url.data
        title = form.title.data
        pubdate = form.pubdate.data

        url_objs = [ArticleURL(url=url)]
        art = Article(title, url, pubdate, url_objs)
        action = Action('art_add', self.current_user, article=art)
        self.session.add(art)
        self.session.add(action)
        self.session.commit()

        # all done. phew.
        self.redirect("/art/%d" % (art.id,))
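The handlers create Article and ArticleURL objects and query them through SQLAlchemy (see the join in Example 1), but the models themselves are not shown. Here is a rough sketch consistent with how the constructors are called above; the table names and column types are guesses for illustration:

from sqlalchemy import Column, DateTime, ForeignKey, Integer, String
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class ArticleURL(Base):
    __tablename__ = 'article_url'
    id = Column(Integer, primary_key=True)
    article_id = Column(Integer, ForeignKey('article.id'))
    url = Column(String(512), nullable=False)

    def __init__(self, url):
        self.url = url

class Article(Base):
    __tablename__ = 'article'
    id = Column(Integer, primary_key=True)
    headline = Column(String(512))
    permalink = Column(String(512))
    pubdate = Column(DateTime)
    urls = relationship(ArticleURL, backref='article')

    def __init__(self, headline, permalink, pubdate, urls):
        self.headline = headline
        self.permalink = permalink
        self.pubdate = pubdate
        self.urls = urls

The Action audit-trail class used by the handlers, with its ('art_add', user, article=...) constructor, would be declared in the same vein.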
Example 4
    def get(self):
        form = EnterArticleForm(TornadoMultiDict(self))
        self.render("enterarticle.html", form=form, notice=None)
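To tie the snippets together, the two handlers would be registered with a Tornado application roughly as follows. The handler class names are assumptions; only the /enterarticle path actually appears in the code above (Example 1 builds a login redirect to it):

import tornado.ioloop
import tornado.web

application = tornado.web.Application([
    (r"/addarticle", AddArticleHandler),      # Example 1: GET with ?url=...
    (r"/enterarticle", EnterArticleHandler),  # Examples 3 and 4: manual entry
])

if __name__ == "__main__":
    application.listen(8888)
    tornado.ioloop.IOLoop.instance().start()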