def get(self, datasetID):
    dataset = Dataset.get_by_id(long(datasetID))
    crawl = Crawl(dataset=dataset, status='QUEUED')
    crawl.put()
    # Queue the crawl almost immediately (5-second countdown)
    crawl.queue(5)
    return webapp2.redirect('/datasets/' + datasetID)
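# The handler above defers work through Crawl.queue(seconds). A minimal sketch
# of what such a method might look like on the Crawl model, assuming the
# standard App Engine push task queue API; the '/crawl/run' worker URL and the
# 'crawl' parameter name are hypothetical, not taken from this codebase.
from google.appengine.api import taskqueue

def queue(self, countdown):
    # Enqueue a push-queue task that invokes the crawl worker
    # after `countdown` seconds.
    taskqueue.add(url='/crawl/run',
                  params={'crawl': self.key.id()},
                  countdown=countdown)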
def get(self, datasetID):
    startIn = self.request.get('start').split(':')
    if len(startIn) == 2:
        logging.info('Queuing harvest in ' + startIn[0] + ' hours ' + startIn[1] + ' minutes')
        seconds = int(startIn[0]) * 3600 + int(startIn[1]) * 60
        dataset = Dataset.get_by_id(long(datasetID))
        # TODO store 'interval' param in dataset object (if any)
        crawl = Crawl(dataset=dataset, status='QUEUED')
        crawl.put()
        crawl.queue(seconds)
        return webapp2.redirect('/datasets/' + datasetID)
    else:
        # TODO decent error handling
        logging.info('Invalid crawl time: ' + self.request.get('start'))
        return webapp2.redirect('/datasets/' + datasetID + '?error=true')
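# For example, a request with ?start=2:30 asks for a delay of 2 hours and
# 30 minutes, which the handler converts to a task countdown in seconds:
startIn = '2:30'.split(':')                              # ['2', '30']
seconds = int(startIn[0]) * 3600 + int(startIn[1]) * 60  # 7200 + 1800 = 9000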
import logging
from datetime import datetime


class Crawler(object):
    def __init__(self, robot, crawl=None):
        self.robot = robot
        self.urlsource = robot.urlsource
        self._crawl = crawl

    def get_jobs(self):
        # Create one Job per URL, numbered in the order the source yields them.
        seq = 0
        for url in self.robot.urlsource.get_urls():
            job = Job(
                crawl_key=self.crawl.key,
                url=url,
                seq_num=seq,
                created_at=datetime.utcnow(),
                status='new'
            )
            seq += 1
            yield job

    def run_job(self, job, request_id):
        ck = self.crawl.key
        logging.info('Running job #{} from crawl #{}'.format(job.seq_num, ck.id()))
        job.request_id = request_id
        # self.status = 'failure'
        # try:
        #     self.request_id = os.environ.get('REQUEST_LOG_ID')
        #     dldr = Downloader()
        #     html = dldr.html(self.url)
        #     if crawl.robot.datasets:
        #         self.result = crawl.robot.process_datasets(html)
        #     self.status = 'success'
        # finally:
        #     # FIXME: If no more tries - suppress exception and set status=failed
        #     self.put()
        #     logging.info('Finished job {} from crawl {}'.format(self, self.crawl_key))
        #     self.crawl_key.get().finish(self.status, self.result)

    @property
    def crawl(self):
        # Lazily create and persist a Crawl under this robot on first access.
        if not self._crawl:
            self._crawl = Crawl(parent=self.robot.key)
            self._crawl.put()
        return self._crawl
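# A possible completion of run_job, adapted from the commented-out outline
# above (which reads as if it once lived on the Job model, hence the `self`
# references there). Downloader, process_datasets and Crawl.finish come from
# those comments; the behavior around the retry FIXME is only a sketch, not
# this project's actual code.
import os

def run_job(self, job, request_id):
    job.request_id = request_id or os.environ.get('REQUEST_LOG_ID')
    job.status = 'failure'
    try:
        html = Downloader().html(job.url)
        if self.robot.datasets:
            job.result = self.robot.process_datasets(html)
        job.status = 'success'
    finally:
        # FIXME: if no retries remain, suppress the exception and
        # set status='failed' instead of re-raising.
        job.put()
        logging.info('Finished job {} from crawl {}'.format(job, job.crawl_key))
        job.crawl_key.get().finish(job.status, job.result)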