def crawlers_update():
    # FIXME: ugly code to handle both the edit (GET) and update (POST) cases.
    if request.method == 'GET':
        id = request.args.get('id', '')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(None, crawler)
    else:
        id = request.form.get('id')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(request.form)
        form.validate()
        if not form.errors:
            now = datetime.utcnow()
            crawler.name = request.form.get('name')
            crawler.runnable = request.form.get('runnable')
            crawler.gspread_link = request.form.get('gspread_link')
            crawler.url = None
            crawler.updated_at = now
            crawler.save()
            form = CrawlersForm(None, crawler)
            flash('Crawler was updated')
            return redirect(url_for('crawlers.crawlers_list'))
    # GET, or POST with validation errors: re-render the edit form.
    return render_template('crawler.html',
                           current_user=current_user,
                           form=form,
                           new_crawler=False,
                           id=id)
def db_create():
    db.connect()
    # Passing True means "fail silently" if the table already exists.
    User.create_table(True)
    SocialCounter.create_table(True)
    SocialCount.create_table(True)
    ChannelCounter.create_table(True)
    Channel.create_table(True)
    UserFunction.create_table(True)
    FunctionResult.create_table(True)
    Crawler.create_table(True)
    CrawlerPage.create_table(True)
    Mozscape.create_table(True)
    MozscapeResult.create_table(True)
    MozscapeIndexMetadata.create_table(True)
    db.close()
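# Hypothetical sketch (not from the original source) of what the shared `db`
# object used above could look like, assuming the models are peewee models
# backed by a single database instance; the database filename is an assumption.
from peewee import SqliteDatabase

db = SqliteDatabase('app.db')
# db_create() would then be called once at setup time to create any missing tables.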
def crawlers_create():
    form = CrawlersForm(request.form)
    form.validate()
    new_crawler = True
    if not form.errors:
        now = datetime.utcnow()
        crawler = Crawler.create(
            name=request.form.get('name'),
            runnable=request.form.get('runnable'),
            gspread_link=request.form.get('gspread_link'),
            # url=request.form.get('url'),
            url=None,
            crawl_status=None,
            crawled_at=None,
            created_at=now,
            updated_at=now)
        new_crawler = False
        form = CrawlersForm(None, crawler)
        flash('Crawler was created')
        return redirect(url_for('crawlers.crawlers_list'))
    # Validation failed: re-render the form with its errors.
    return render_template('crawler.html',
                           current_user=current_user,
                           form=form,
                           new_crawler=new_crawler)
def crawlers():
    crawlers = Crawler.select()
    for crawler in crawlers:
        if crawler.is_runnable():
            # Delete the crawler's previously crawled pages before crawling it again.
            dq = CrawlerPage.delete().where(CrawlerPage.name == crawler.name)
            deleted_count = dq.execute()
            pages = CrawlerPage.crawl(crawler)
    print("%s job: crawlers processed" % datetime.utcnow())
def crawlers_summary():
    id = request.args.get('id', None)
    if id is None:
        flash('Error: id is missing for Crawler summary page!')
        return redirect(url_for('crawlers.crawlers_list'))
    c = Crawler.get(Crawler.id == id)
    # Show the most recently crawled pages first.
    pages = CrawlerPage.select().where(CrawlerPage.name == c.name).order_by(
        CrawlerPage.timestamp.desc())
    return render_template('crawlers_summary.html',
                           current_user=current_user,
                           pages=pages,
                           id=id)
def async_spider(app, crawler_id):
    now = datetime.utcnow()
    print("%s async_spider started..." % datetime.utcnow())
    print("\tPID=%s" % os.getpid())
    print("\tcrawler_id=%s" % crawler_id)
    crawler = Crawler.get(Crawler.id == crawler_id)
    if crawler.is_runnable():
        # Delete previously crawled pages before crawling again.
        dq = CrawlerPage.delete().where(CrawlerPage.name == crawler.name)
        deleted_count = dq.execute()
        crawler.crawled_at = now
        crawler.crawl_status = 'crawling'
        crawler.save()
        pages_len = 0
        try:
            pages_len = CrawlerPage.crawl(crawler)
        finally:
            # Record the outcome even if the crawl raised an exception.
            crawler.crawl_status = "crawled %s pages" % pages_len
            crawler.save()
        print("\tnumber of pages crawled=%s" % pages_len)
    print("%s async_spider ended" % datetime.utcnow())
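# A minimal sketch (not part of the original code) of how async_spider might be
# started without blocking a web request. The helper name `start_spider` and the
# use of multiprocessing are assumptions; async_spider printing its own PID
# suggests it is meant to run outside the request-handling process.
import multiprocessing

def start_spider(app, crawler_id):
    # Spawn a worker process and return immediately; the child updates the
    # Crawler's crawl_status as it progresses, so callers can poll for it.
    p = multiprocessing.Process(target=async_spider, args=(app, crawler_id))
    p.start()
    return p.pid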
def crawlers_list():
    crawlers = Crawler.select()
    return render_template('crawlers_list.html',
                           current_user=current_user,
                           crawlers=crawlers)