Example #1
 def get(self, datasetID):
     dataset = Dataset.get_by_id(long(datasetID))   
     crawl = Crawl(dataset=dataset, status='QUEUED')
     crawl.put()
     
     ''' Queue the crawl immediately '''
     crawl.queue(5)      
     return webapp2.redirect('/datasets/' + datasetID)
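The queue() method called here is not shown in these examples. A minimal sketch of what it might look like, assuming the standard App Engine task queue API and a hypothetical /crawls/run/<id> handler:

 def queue(self, countdown):
     # Sketch (assumption): enqueue a push task that triggers this crawl
     # after `countdown` seconds via the App Engine task queue.
     from google.appengine.api import taskqueue
     taskqueue.add(url='/crawls/run/' + str(self.key().id()),
                   method='GET',
                   countdown=countdown)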
Example #2
 def get(self, datasetID):
     startIn = self.request.get('start').split(':')
     if len(startIn) == 2:
         logging.info('Queuing harvest in ' + startIn[0] + ' hours ' + startIn[1] + ' minutes')
         seconds = int(startIn[0]) * 3600 + int(startIn[1]) * 60
         dataset = Dataset.get_by_id(long(datasetID))
         
         ''' TODO store 'interval' param in dataset object (if any) '''                    
         
         crawl = Crawl(dataset=dataset, status='QUEUED')
         crawl.put()
         crawl.queue(seconds)
         return webapp2.redirect('/datasets/' + datasetID)        
     else:
         ''' TODO decent error handling '''
         logging.info('Invalid crawl time: ' + self.request.get('start'))
         return webapp2.redirect('/datasets/' + datasetID + '?error=true')
Example #3
 def get(self, datasetID):
     dataset = Dataset.get_by_id(long(datasetID))
     for crawl in Crawl.all().filter('dataset =', dataset).run():
         crawl.delete()
         
     for dump in Dumpfile.all().filter('dataset =', dataset).run():
         dump.delete()
     
     dataset.delete()
     logging.info('Deleted dataset ' + datasetID)
     return webapp2.redirect('/datasets')
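The handler above issues one datastore delete per child entity. A hedged alternative sketch that batches the deletes with keys-only queries, under the same db API:

 def get(self, datasetID):
     # Sketch of a batched variant (assumption): collect child keys with
     # keys-only queries and delete them in bulk instead of one at a time.
     dataset = Dataset.get_by_id(long(datasetID))
     db.delete(list(Crawl.all(keys_only=True).filter('dataset =', dataset)))
     db.delete(list(Dumpfile.all(keys_only=True).filter('dataset =', dataset)))
     dataset.delete()
     return webapp2.redirect('/datasets')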
Example #4
class Crawler(object):
    def __init__(self, robot, crawl=None):
        self.robot = robot
        self.urlsource = robot.urlsource
        self._crawl = crawl

    def get_jobs(self):
        seq = 0
        for url in self.robot.urlsource.get_urls():
            job = Job(
                crawl_key=self.crawl.key,
                url=url,
                seq_num=seq, created_at=datetime.utcnow(),
                status='new'
            )
            yield job
            seq += 1

    def run_job(self, job, request_id):
        ck = self.crawl.key
        logging.info('Running job #{} from crawl #{}'.format(job.seq_num, ck.id()))
        job.request_id = request_id
        # self.status = 'failure'
        # try:
        # self.request_id = os.environ.get('REQUEST_LOG_ID')
        #     dldr = Downloader()
        #     html = dldr.html(self.url)
        #     if crawl.robot.datasets:
        #         self.result = crawl.robot.process_datasets(html)
        #     self.status = 'success'
        # finally:
        #     # FIXME: If no more tries - suppress exception and set status=failed
        #     self.put()
        #     logging.info('Finished job {} from crawl {}'.format(self, self.crawl_key))
        #     self.crawl_key.get().finish(self.status, self.result)

    @property
    def crawl(self):
        if not self._crawl:
            self._crawl = Crawl(parent=self.robot.key)
            self._crawl.put()
        return self._crawl
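get_jobs() only yields unsaved Job entities, and run_job() expects a request id; a hedged usage sketch of how a caller might drive the class (the run_all helper and the ndb batch put are assumptions, though REQUEST_LOG_ID appears in the commented-out code above):

import os
from google.appengine.ext import ndb

def run_all(crawler):
    # Sketch: save the generated jobs in one batch, then execute them.
    jobs = list(crawler.get_jobs())
    ndb.put_multi(jobs)
    request_id = os.environ.get('REQUEST_LOG_ID')
    for job in jobs:
        crawler.run_job(job, request_id)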
Example #5
    def get(self, crawlID):
        logging.info("Starting execution of task " + crawlID)
        crawl = Crawl.get_by_id(long(crawlID))

        startTime = time.time()
        crawl.status = "RUNNING"
        crawl.save()

        """ Download VoID """
        voidURI = crawl.dataset.voidURI
        logging.info("Downloading VoID from " + voidURI)
        result = urlfetch.fetch(voidURI)
        logging.info("Completed download from " + voidURI + " with HTTP " + str(result.status_code))

        if result.status_code != 200:
            crawl.changeDetected = False
            crawl.status = "ERROR"
            crawl.message = "VoID Download Failed"
        else:
            """ Compute VoID hash """
            voidHash = self.computeHash(result.content)
            if crawl.dataset.voidHash != voidHash:
                """ VoID changed - update DB """
                logging.info("VoID file at " + voidURI + " changed")
                crawl.dataset.voidHash = voidHash
                crawl.dataset.save()

                self.processVoID(result.content, crawl)

                """ No matter what the task status for the data dumps """
                """ is - just overwrite with info about VoID change   """
                crawl.changeDetected = True
                crawl.message = "VoID file changed"
            else:
                """ No change in VoID """
                logging.info("No changes in VoID file " + voidURI)

                self.processVoID(result.content, crawl)

        """ Complete task log with timing info """
        crawl.finishedAt = datetime.datetime.utcnow()
        crawl.duration = int(time.time() - startTime)
        crawl.save()
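computeHash() is not included in the snippet; a plausible helper using hashlib (the choice of digest is an assumption):

 def computeHash(self, content):
     # Sketch: hash the raw VoID document so the next crawl can detect changes.
     # md5 is an assumption; any stable digest would do.
     import hashlib
     return hashlib.md5(content).hexdigest()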
Example #6
 def get(self):
     crawls = Crawl.all().order("-queuedFor")
     self.render_response("crawls/crawls_listall.html", **{"crawls": crawls})
Example #7
 def get(self):
     Crawl.deleteAll()
     return webapp2.redirect("/crawls")
Example #8
        lat=51.5069549,
        lng=-0.1275326,
        terrace=False,
        description=
        'Bit epic really getting all up close and personal with Admiral Nelson. ',
        location='Central',
        hero=
        'https://thenudge.com/wp-content/uploads/1970/01/trafalgar-st-james-rooftop-1920x849.jpg'
    )
    db.session.add(trafalgar)

    ################ CRAWLS ######################

    history_crawl = Crawl(
        name='Historic Bar Crawl',
        description=
        'Start off this crawl by having a couple of beers overlooking the Thames at one of the oldest pubs in London, The Grapes. Charles Dickens was a patron, and even made reference to the pub in his novel Our Mutual Friend. Take a stroll along the river to the Prospect of Whitby, the hostelry of choice of "Hanging" Judge Jeffreys, scourge of the Monmouth Rebellion. He lived nearby and a replica gallows and noose hangs by the Thameside window, commemorating his custom. According to legend, criminals would be tied up to the posts at low tide and left there to drown when the tide came in. Views from the pub were sketched by both Turner and Whistler, and the writers Charles Dickens and Samuel Pepys are known to have paused to sup here. Next head from the river towards the Blind Beggar, site of the notorious Kray murder in the 1960s and the location of William Booth\'s first sermon, which led to the creation of the Salvation Army. Finally end up at the Royal Oak, another Kray twin haunt situated next to London\'s best known flower market.',
        creator=mike)

    history_crawl.stops = [
        Stop(bar=grapes, order=0),
        Stop(bar=whitby, order=1),
        Stop(bar=blind_beggar, order=2),
        Stop(bar=royal_oak, order=3)
    ]
    db.session.add(history_crawl)

    rooftop_crawl = Crawl(
        name='Rooftops of London',
        description=
        'If you\'re like me you too can become the chimp you were born to be and swing from rooftop to rooftop bar, traversing the central sights... even if you fail to finish it, start early and you will catch the golden hour at one of the best viewing spots in London',
Example #9
 def view_crawl(self, mid, cid):
     robot = Robot.get_by_id(int(mid), parent=self.current_user.key)
     crawl = Crawl.get_by_id(int(cid), parent=robot.key)
     jobs = crawl.jobs
     self.render_response('robot/crawl.html', robot=robot, crawl=crawl, jobs=jobs)
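The jobs attribute read from the crawl is not defined in these snippets; a plausible sketch of such a property on the Crawl model, assuming jobs are looked up through the crawl_key field set in Crawler.get_jobs():

 @property
 def jobs(self):
     # Sketch (assumption): fetch this crawl's jobs in sequence order via
     # the crawl_key field set when the jobs were created.
     return Job.query(Job.crawl_key == self.key).order(Job.seq_num).fetch()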
Example #10
 def view(self, mid):
     robot = Robot.get_by_id(int(mid), parent=self.current_user.key)
     crawls = Crawl.query(ancestor=robot.key).order(-Crawl.started_at).fetch()
     datasets = DataSet.query(ancestor=robot.key).fetch()
     self.render_response('robot/view.html', robot=robot, mid=mid, crawls=crawls,
                          datasets=datasets, schedules=SCHEDULES)
Example #11
 def crawl(self):
     if not self._crawl:
         self._crawl = Crawl(parent=self.robot.key)
         self._crawl.put()
     return self._crawl