Example #1
# Imports needed by this example; the model/form module paths are assumed,
# since they are not shown in the source.
from datetime import datetime

from flask import flash, redirect, render_template, request, url_for
from flask_login import current_user  # assumed: current_user comes from Flask-Login

from .forms import CrawlersForm  # assumed location
from .models import Crawler      # assumed location


def crawlers_update():
    # FIXME ugly code to do both edit/update
    if request.method == 'GET':
        # GET: pre-fill the form with the existing crawler so it can be edited.
        id = request.args.get('id', '')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(None, crawler)
    else:
        # POST: validate the submitted form and persist the changes.
        id = request.form.get('id')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(request.form)
        if form.validate():
            crawler.name = request.form.get('name')
            crawler.runnable = request.form.get('runnable')
            crawler.gspread_link = request.form.get('gspread_link')
            crawler.url = None
            crawler.updated_at = datetime.utcnow()
            crawler.save()
            flash('Crawler was updated')
            return redirect(url_for('crawlers.crawlers_list'))
        # On validation errors, fall through and re-render the form with form.errors set.
    return render_template('crawler.html',
                           current_user=current_user,
                           form=form,
                           new_crawler=False,
                           id=id)
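Neither example shows how these view functions are wired up, but the url_for('crawlers.crawlers_list') calls imply a crawlers blueprint. Below is a minimal registration sketch; the URL rules, the app setup, and the assumption that crawlers_update above is in scope are mine, not from the source.

# Hypothetical wiring -- the blueprint name comes from url_for('crawlers.crawlers_list');
# the URL rules and app setup are assumptions. A crawlers_list view (not shown in
# the source) would also need to be registered for the redirects to resolve.
from flask import Blueprint, Flask

crawlers = Blueprint('crawlers', __name__)

# crawlers_update branches on request.method, so it must accept POST as well as GET.
crawlers.add_url_rule('/crawlers/update', 'crawlers_update', crawlers_update,
                      methods=['GET', 'POST'])

app = Flask(__name__)
app.register_blueprint(crawlers)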
Example #2
# Imports needed by this example; the model module path is again assumed.
import os
from datetime import datetime

from flask import flash, redirect, render_template, request, url_for
from flask_login import current_user  # assumed: Flask-Login
from .models import Crawler, CrawlerPage  # assumed location


def crawlers_summary():
    id = request.args.get('id', None)
    if id is None:
        flash('Error: id is missing for Crawler summary page!')
        return redirect(url_for('crawlers.crawlers_list'))
    crawler = Crawler.get(Crawler.id == id)
    # All pages crawled for this crawler, newest first.
    pages = CrawlerPage.select().where(CrawlerPage.name == crawler.name).order_by(
        CrawlerPage.timestamp.desc())
    return render_template('crawlers_summary.html',
                           current_user=current_user,
                           pages=pages,
                           id=id)
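These queries only work against Peewee models that expose the fields they reference. The sketch below reconstructs what Crawler and CrawlerPage might look like from the attributes used in these examples; field types, the database, and the is_runnable property are assumptions, and CrawlerPage.crawl(), used by async_spider below, is not reproduced.

# Hypothetical Peewee models inferred from the attributes these examples use.
# Field types, defaults, and the database are assumptions, not from the source.
from datetime import datetime

from peewee import CharField, DateTimeField, Model, SqliteDatabase, TextField

db = SqliteDatabase('crawlers.db')  # assumed database


class BaseModel(Model):
    class Meta:
        database = db


class Crawler(BaseModel):
    name = CharField()
    runnable = CharField(null=True)
    gspread_link = TextField(null=True)
    url = TextField(null=True)
    crawl_status = CharField(null=True)
    crawled_at = DateTimeField(null=True)
    updated_at = DateTimeField(default=datetime.utcnow)

    @property
    def is_runnable(self):
        # async_spider checks crawler.is_runnable while the update view sets
        # crawler.runnable; this bridging property is an assumption.
        return bool(self.runnable)


class CrawlerPage(BaseModel):
    name = CharField(index=True)  # name of the crawler that fetched the page
    timestamp = DateTimeField(default=datetime.utcnow)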
def async_spider(app, crawler_id):
    # `app` is unused in this snippet; presumably it is needed for an
    # application context when the spider runs outside a request.
    now = datetime.utcnow()
    print("%s async_spider started..." % now)
    print("\tPID=%s" % os.getpid())
    print("\tcrawler_id=%s" % crawler_id)
    crawler = Crawler.get(Crawler.id == crawler_id)
    if crawler.is_runnable:
        # Delete previously crawled pages before crawling again;
        # execute() returns the number of deleted rows.
        dq = CrawlerPage.delete().where(CrawlerPage.name == crawler.name)
        deleted_count = dq.execute()
        crawler.crawled_at = now
        crawler.crawl_status = 'crawling'
        crawler.save()
        pages_len = 0
        try:
            pages_len = CrawlerPage.crawl(crawler)
        finally:
            # Record the final status even if crawling raised an exception.
            crawler.crawl_status = "crawled %s pages" % pages_len
            crawler.save()
            print("\tnumber of pages crawled=%s" % pages_len)
    print("%s async_spider ended" % datetime.utcnow())