import os
from datetime import datetime

from flask import flash, redirect, render_template, request, url_for
from flask_login import current_user  # assumed source of current_user

from .forms import CrawlersForm           # import path assumed; not in the excerpt
from .models import Crawler, CrawlerPage  # import path assumed


def crawlers_update():  # FIXME ugly code to do both edit/update
    if request.method == 'GET':
        # initial render: populate the form from the stored crawler
        id = request.args.get('id', '')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(None, crawler)
    else:
        id = request.form.get('id')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(request.form)
        if form.validate():
            now = datetime.utcnow()
            crawler.name = request.form.get('name')
            crawler.runnable = request.form.get('runnable')
            crawler.gspread_link = request.form.get('gspread_link')
            crawler.url = None
            crawler.updated_at = now
            crawler.save()
            flash('Crawler was updated')
            return redirect(url_for('crawlers.crawlers_list'))
        # on validation errors, fall through and re-render the form
    return render_template('crawler.html', current_user=current_user,
                           form=form, new_crawler=False, id=id)
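# For reference, a minimal sketch of what CrawlersForm could look like, built
# only from how crawlers_update() uses it: CrawlersForm(request.form) and
# CrawlersForm(None, crawler) match the plain wtforms Form(formdata, obj)
# signature, and the field names mirror the request.form keys read above.
# This is an illustrative assumption, not the project's actual form; it is
# named CrawlersFormSketch to avoid clashing with the real import.
from wtforms import BooleanField, Form, StringField, validators


class CrawlersFormSketch(Form):
    name = StringField('Name', [validators.InputRequired()])
    runnable = BooleanField('Runnable')
    gspread_link = StringField('Google Sheet link',
                               [validators.Optional(), validators.URL()])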
def crawlers_summary():
    id = request.args.get('id', None)
    if id is None:
        flash('Error: id is missing for Crawler summary page!')
        return redirect(url_for('crawlers.crawlers_list'))
    c = Crawler.get(Crawler.id == id)
    pages = (CrawlerPage
             .select()
             .where(CrawlerPage.name == c.name)
             .order_by(CrawlerPage.timestamp.desc()))
    return render_template('crawlers_summary.html', current_user=current_user,
                           pages=pages, id=id)
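# For reference, a hypothetical sketch of the peewee models these views rely
# on. Field and method names are taken from their usage in this file; the
# types, the is_runnable property, and crawl() are assumptions, and the
# classes carry a *Sketch suffix so they do not shadow the real imports.
import datetime as dt

import peewee

db = peewee.SqliteDatabase(':memory:')  # placeholder database for the sketch


class BaseModelSketch(peewee.Model):
    class Meta:
        database = db


class CrawlerSketch(BaseModelSketch):
    name = peewee.CharField()
    runnable = peewee.BooleanField(default=False)
    gspread_link = peewee.CharField(null=True)
    url = peewee.CharField(null=True)
    updated_at = peewee.DateTimeField(null=True)
    crawled_at = peewee.DateTimeField(null=True)
    crawl_status = peewee.CharField(null=True)

    @property
    def is_runnable(self):
        # async_spider() checks is_runnable; assumed to wrap `runnable`
        return bool(self.runnable)


class CrawlerPageSketch(BaseModelSketch):
    name = peewee.CharField()
    timestamp = peewee.DateTimeField(default=dt.datetime.utcnow)

    @classmethod
    def crawl(cls, crawler):
        # The real crawl logic is not part of this excerpt; it is expected to
        # persist the fetched pages and return how many were crawled.
        raise NotImplementedError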
def async_spider(app, crawler_id):
    now = datetime.utcnow()
    print("%s async_spider started..." % datetime.utcnow())
    print("\tPID=%s" % os.getpid())
    print("\tcrawler_id=%s" % crawler_id)
    crawler = Crawler.get(Crawler.id == crawler_id)
    if crawler.is_runnable:
        # delete crawled pages before crawling it again:
        dq = CrawlerPage.delete().where(CrawlerPage.name == crawler.name)
        deleted_count = dq.execute()
        crawler.crawled_at = now
        crawler.crawl_status = 'crawling'
        crawler.save()
        pages_len = 0
        try:
            pages_len = CrawlerPage.crawl(crawler)
        finally:
            # record the outcome even if the crawl raises
            crawler.crawl_status = "crawled %s pages" % pages_len
            crawler.save()
        print("\tnumber of pages crawled=%s" % pages_len)
    print("%s async_spider ended" % datetime.utcnow())
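# async_spider() prints its PID and receives the Flask app object, which
# suggests it is meant to run outside the request process. A minimal sketch of
# how a view could hand it off with the standard library's multiprocessing
# module; start_crawl and this wiring are assumptions, not code from the
# original excerpt.
from multiprocessing import Process


def start_crawl(app, crawler_id):
    # run the crawl in a child process so the HTTP request that triggered it
    # can return immediately instead of blocking until the crawl finishes
    p = Process(target=async_spider, args=(app, crawler_id))
    p.start()
    return p.pid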