def process_item(self, spider, item):
    """Keep a bounded sample of items per spider.

    Stores *item* (keyed by ``item.guid``) until the per-spider sample
    count reaches the module-level ``items_per_spider`` limit; the item
    is always passed through unchanged.  If the module-level
    ``close_spider`` flag is set, the spider is closed the moment the
    limit is reached.

    NOTE(review): ``stats``, ``log``, ``items_per_spider``,
    ``close_spider`` and ``scrapyengine`` are module globals not visible
    in this chunk — presumably bound at extension setup; verify.
    """
    count = stats.get_value("items_sampled", 0, spider=spider)
    if count >= items_per_spider:
        # Sample already full for this spider: pass the item through untouched.
        return item
    self.items[item.guid] = item
    count += 1
    stats.set_value("items_sampled", count, spider=spider)
    log.msg("Sampled %s" % item, spider=spider, level=log.INFO)
    # Optionally stop the spider as soon as the sample quota is met.
    if close_spider and count == items_per_spider:
        scrapyengine.close_spider(spider)
    return item
def webconsole_control(self, wc_request):
    """Handle control-form submissions from the web console.

    Reads action lists (domain names) from ``wc_request.args`` and
    performs the requested stop / remove / schedule / re-schedule
    operations, returning an HTML fragment summarizing what was done.

    Recognized keys in ``wc_request.args`` (each maps to a list of
    domain names): ``stop_running_domains``, ``remove_pending_domains``,
    ``add_pending_domains``, ``rerun_finished_domains``.
    """
    args = wc_request.args
    s = "<hr />\n"

    if "stop_running_domains" in args:
        s += "<p>"
        stopped_domains = []
        for domain in args["stop_running_domains"]:
            if domain in self.running:
                scrapyengine.close_spider(self.running[domain])
                stopped_domains.append(domain)
        s += "Stopped spiders: <ul><li>%s</li></ul>" % "</li><li>".join(stopped_domains)
        s += "</p>"

    if "remove_pending_domains" in args:
        removed = []
        for domain in args["remove_pending_domains"]:
            if scrapyengine.spider_scheduler.remove_pending_domain(domain):
                removed.append(domain)
        if removed:
            s += "<p>"
            # BUG FIX: report only the domains actually removed. The original
            # joined the full requested list, silently ignoring the `removed`
            # result it had just computed.
            s += "Removed scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(removed)
            s += "</p>"

    if "add_pending_domains" in args:
        for domain in args["add_pending_domains"]:
            if domain not in scrapyengine.scheduler.pending_requests:
                scrapymanager.crawl(domain)
        s += "<p>"
        # NOTE(review): this message lists every requested domain, including
        # those skipped because they were already pending — confirm intended.
        s += "Scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["add_pending_domains"])
        s += "</p>"

    if "rerun_finished_domains" in args:
        for domain in args["rerun_finished_domains"]:
            if domain not in scrapyengine.scheduler.pending_requests:
                scrapymanager.crawl(domain)
                # Drop from the finished set only when actually re-scheduled.
                # NOTE(review): original was a single collapsed line — the
                # nesting of this statement is inferred; confirm against VCS.
                self.finished.remove(domain)
        s += "<p>"
        # NOTE(review): same caveat as above — the report includes domains
        # that were not actually re-scheduled.
        s += "Re-scheduled finished spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["rerun_finished_domains"])
        s += "</p>"

    return s
def item_passed(self, item, spider):
    """Count one passed item for *spider*; close the spider once the
    configured ``self.itempassed`` threshold is hit.

    NOTE(review): ``scrapyengine`` is a module global not visible in
    this chunk.
    """
    new_total = self.counts[spider] + 1
    self.counts[spider] = new_total
    if new_total == self.itempassed:
        scrapyengine.close_spider(spider, 'closespider_itempassed')