def get(self):
    """Handle a submitted article URL: look it up, scrape it if unknown, redirect.

    With no ``url`` argument, or when SubmitArticleForm validation fails,
    "addarticle.html" is rendered. If an Article is already stored for the
    URL the request redirects straight to it. Otherwise the scrapeomat
    service is asked to scrape the page and the resulting Article is stored
    together with an 'art_add' Action, credited to the current user or to
    the anonymous user when nobody is logged in.

    If scraping fails, "enterarticle.html" is rendered with the failure
    message so the details can be entered by hand.

    NOTE(review): this method yields a tornado.gen.Task, so it presumably
    runs under a tornado.gen decorator declared outside this block - confirm.
    """
    url = self.get_argument('url', None)
    if url is None:
        # blank url - prompt for one
        form = SubmitArticleForm()
        self.render("addarticle.html", form=form, notice='')
        return

    # basic validation
    form = SubmitArticleForm(TornadoMultiDict(self))
    if not form.validate():
        self.render("addarticle.html", form=form, notice='')
        return

    # article already in db?
    art = self.session.query(Article).join(ArticleURL).\
        filter(ArticleURL.url == url).first()

    if art is None:
        # nope. try scraping it.
        # urllib.urlencode() calls str() on each value, which raises
        # UnicodeEncodeError for non-ASCII unicode text, so encode first.
        if isinstance(url, unicode):
            raw_url = url.encode('utf-8')
        else:
            raw_url = url
        # The same query string serves both the scraper request and the
        # login redirect below, so build it once.
        url_query = urllib.urlencode({'url': raw_url})

        scrape_url = config.settings.scrapeomat + '/scrape?' + url_query
        http = tornado.httpclient.AsyncHTTPClient()
        response = yield tornado.gen.Task(http.fetch, scrape_url)

        try:
            art = scrape.process_scraped(url, response)
        except Exception as err:
            # process_scraped signals every failure with a plain Exception
            # whose message is meant for the user, hence the broad catch.
            # uhoh... we weren't able to scrape it. If user wants article,
            # they'll have to log in and enter the details themselves...
            login_next_url = None
            enter_form = EnterArticleForm(url=url)
            if self.current_user is None:
                login_next_url = '/enterarticle?' + url_query
            notice = unicode(err)
            notice += " Please enter the details manually (or try again later)."
            self.render("enterarticle.html", form=enter_form, notice=notice,
                        login_next_url=login_next_url)
            return

        # ok, add the new article to the db (with an action)
        user = self.current_user
        if user is None:
            user = self.get_anon_user()
        action = Action('art_add', user, article=art)
        self.session.add(art)
        self.session.add(action)
        self.session.commit()

    # all done
    self.redirect("/art/%d" % (art.id,))
def process_scraped(url, response):
    """Turn a scrapomat HTTP response into an Article, or raise an Exception.

    Args:
        url: the URL that was submitted; used to fill the validation form.
        response: the HTTP response from the scrapomat. Only its ``error``
            and ``body`` attributes are read; ``body`` is parsed as JSON.

    Returns:
        A new Article built from the scraped headline, permalink, pubdate
        and urls.

    Raises:
        Exception: if the request failed, the body could not be decoded,
            the scraper reported a non-success status, or required details
            are missing. Every message is a complete "Sorry, ..." style
            sentence (or "Unknown error") rather than an internal error
            string.
    """
    unreadable_msg = "Sorry, there was a problem reading the article."
    incomplete_msg = u"Sorry, we weren't able to automatically read all the details"

    if response.error:
        # scrapomat down :-(
        raise Exception(unreadable_msg)

    try:
        results = json.loads(response.body)
        status = results['status']
    except (ValueError, TypeError, KeyError):
        # Body was not JSON, was missing, or lacked a status field: report
        # it like any other failed read instead of leaking parser text.
        raise Exception(unreadable_msg)

    if status != Status.SUCCESS:
        error_messages = {
            Status.PAYWALLED: u"Sorry, that article seems to be behind a paywall.",
            Status.PARSE_ERROR: u"Sorry, we couldn't read the article",
            Status.BAD_REQ: u"Sorry, that URL doesn't look like an article",
            Status.NET_ERROR: u"Sorry, we couldn't read that article - is the URL correct?",
        }
        raise Exception(error_messages.get(status, "Unknown error"))

    try:
        scraped_art = results['article']
        headline = scraped_art['headline']
        permalink = scraped_art['permalink']
        urls = scraped_art['urls']
        pubdate = datetime.datetime.fromtimestamp(scraped_art['pubdate'])
    except (KeyError, TypeError, ValueError, OverflowError):
        # A missing field or an unusable timestamp means the scrape is
        # incomplete; say so instead of surfacing a bare key name.
        raise Exception(incomplete_msg)

    # use entry form to validate everything's there (ugh!)
    # Built only now so the early error exits above do not construct it.
    enter_form = EnterArticleForm(url=url)
    enter_form.url.data = url
    enter_form.title.data = headline
    enter_form.pubdate.data = pubdate
    if not enter_form.validate():
        raise Exception(incomplete_msg)

    # if we've got this far, we now have all the details needed to load the
    # article into the DB. Yay!
    url_objs = [ArticleURL(url=u) for u in urls]
    return Article(headline, permalink, pubdate, url_objs)
def post(self):
    """Create an article from the manually-entered details form.

    When the EnterArticleForm fails validation, "enterarticle.html" is
    rendered again with that form. Otherwise a new Article (with a single
    ArticleURL) and an 'art_add' Action for the current user are added to
    the session and committed, and the request redirects to the article.
    """
    entry = EnterArticleForm(TornadoMultiDict(self))

    if entry.validate():
        # done - add the article to the db
        link = entry.url.data
        headline = entry.title.data
        published = entry.pubdate.data

        article = Article(headline, link, published, [ArticleURL(url=link)])
        added = Action('art_add', self.current_user, article=article)

        db = self.session
        db.add(article)
        db.add(added)
        db.commit()

        # all done. phew.
        self.redirect("/art/%d" % (article.id,))
    else:
        self.render("enterarticle.html", form=entry, notice=None)
def process_scraped(url, response):
    """Turn a scrapomat HTTP response into an Article, or raise an Exception.

    Args:
        url: the URL that was submitted; used to fill the validation form.
        response: the HTTP response from the scrapomat. Only its ``error``
            and ``body`` attributes are read; ``body`` is parsed as JSON.

    Returns:
        A new Article built from the scraped headline, permalink, pubdate
        and urls.

    Raises:
        Exception: if the request failed, the body could not be decoded,
            the scraper reported a non-success status, or required details
            are missing. Every message is a complete "Sorry, ..." style
            sentence (or "Unknown error") rather than an internal error
            string.
    """
    unreadable_msg = "Sorry, there was a problem reading the article."
    incomplete_msg = u"Sorry, we weren't able to automatically read all the details"

    if response.error:
        # scrapomat down :-(
        raise Exception(unreadable_msg)

    try:
        results = json.loads(response.body)
        status = results['status']
    except (ValueError, TypeError, KeyError):
        # Body was not JSON, was missing, or lacked a status field: report
        # it like any other failed read instead of leaking parser text.
        raise Exception(unreadable_msg)

    if status != Status.SUCCESS:
        error_messages = {
            Status.PAYWALLED: u"Sorry, that article seems to be behind a paywall.",
            Status.PARSE_ERROR: u"Sorry, we couldn't read the article",
            Status.BAD_REQ: u"Sorry, that URL doesn't look like an article",
            Status.NET_ERROR: u"Sorry, we couldn't read that article - is the URL correct?",
        }
        raise Exception(error_messages.get(status, "Unknown error"))

    try:
        scraped_art = results['article']
        headline = scraped_art['headline']
        permalink = scraped_art['permalink']
        urls = scraped_art['urls']
        pubdate = datetime.datetime.fromtimestamp(scraped_art['pubdate'])
    except (KeyError, TypeError, ValueError, OverflowError):
        # A missing field or an unusable timestamp means the scrape is
        # incomplete; say so instead of surfacing a bare key name.
        raise Exception(incomplete_msg)

    # use entry form to validate everything's there (ugh!)
    # Built only now so the early error exits above do not construct it.
    enter_form = EnterArticleForm(url=url)
    enter_form.url.data = url
    enter_form.title.data = headline
    enter_form.pubdate.data = pubdate
    if not enter_form.validate():
        raise Exception(incomplete_msg)

    # if we've got this far, we now have all the details needed to load the
    # article into the DB. Yay!
    url_objs = [ArticleURL(url=u) for u in urls]
    return Article(headline, permalink, pubdate, url_objs)
def get(self):
    """Render "enterarticle.html" with an EnterArticleForm and no notice.

    The form is constructed from TornadoMultiDict(self), i.e. from this
    handler.
    """
    self.render(
        "enterarticle.html",
        form=EnterArticleForm(TornadoMultiDict(self)),
        notice=None,
    )