Example #1
    def get(self):
        # Reset the datastore: remove every existing dataset and dump file
        Dataset.deleteAll()
        Dumpfile.deleteAll()

        # Re-create the datasets from the predefined YAML file
        with open("predefined_datasets.yaml", 'r') as f:
            for dataset in yaml.safe_load(f):
                Dataset(name=dataset['name'], voidURI=dataset['voidURI']).put()

        return webapp2.redirect('/datasets')
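The handler assumes predefined_datasets.yaml is a YAML list whose entries carry name and voidURI keys. The file itself is not part of this listing, so the snippet below is only a hypothetical sketch of a document that yaml.safe_load would parse into the structure the loop expects (the dataset name and URL are invented):

import yaml

# Hypothetical stand-in for predefined_datasets.yaml; the field values are invented.
SAMPLE = """
- name: Example Dataset
  voidURI: http://example.org/void.ttl
"""

for dataset in yaml.safe_load(SAMPLE):
    print(dataset['name'], dataset['voidURI'])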
Example #2
    def processVoID(self, content, task):
        """ Process line by line """

        changeDetected = False
        status = "SUCCESS"
        message = "No Changes"

        abortCrawl = False

        for line in content.split("\n"):
            if abortCrawl:
                logging.info("Aborting dataset crawl")
                break

            if line.strip().startswith("void:dataDump"):
                """ Get known dump file meta-data for this URI, if it's recorded in DB """
                dumpfileURL = line[line.find("<") + 1 : line.find(">")]
                lastKnownDump = Dumpfile.all().filter("uri =", dumpfileURL).fetch(1)
                lastKnownHash = lastKnownDump[0].hash if len(lastKnownDump) > 0 else None

                """ Download the dump file """
                logging.info("Downloading dump from " + dumpfileURL)
                logging.info("Previously recorded hash for this file: " + str(lastKnownHash))

                downloadPending = True
                retryCount = 0

                while downloadPending and retryCount < 9:
                    try:
                        result = urlfetch.fetch(dumpfileURL)
                        logging.info("Completed download from " + dumpfileURL + " with HTTP " + str(result.status_code))

                        if result.status_code != 200:
                            status = "ERROR"
                            message = "Dump Download(s) Failed"
                            downloadPending = False
                            abortCrawl = True

                        else:
                            downloadPending = False
                            dumpfileHash = self.computeHash(result.content)
                            if dumpfileHash != lastKnownHash:
                                logging.info("Change detected in dump " + dumpfileURL)

                                changeDetected = True
                                message = "Data changed"

                                # Record the new hash so the next crawl compares against it
                                Dumpfile(
                                    dataset=task.dataset, uri=dumpfileURL, hash=dumpfileHash
                                ).put()
                    except urlfetch.Error:
                        retryCount += 1
                        logging.info("Download failed - retry #" + str(retryCount))

                if downloadPending:
                    # All retries were exhausted without a successful download
                    status = "ERROR"
                    message = "Dump Download(s) Failed"
                    abortCrawl = True

        task.changeDetected = changeDetected
        task.status = status
        task.message = message
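processVoID calls a computeHash helper that is not shown in this listing. A minimal sketch, assuming all the change detection needs is a stable digest of the raw dump bytes, could be built on hashlib (the host class name here is invented):

import hashlib

class CrawlWorker(object):  # hypothetical host class for processVoID
    def computeHash(self, content):
        # Stable digest: identical dump content always yields the identical hash,
        # so comparing against the stored hash detects any byte-level change.
        return hashlib.sha256(content).hexdigest()

Any digest with that stability property would do; SHA-256 is just a safe default.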
Example #3
    def get(self, datasetID):
        dataset = Dataset.get_by_id(long(datasetID))

        # Remove all crawl records and dump files that reference this dataset
        for crawl in Crawl.all().filter('dataset =', dataset).run():
            crawl.delete()

        for dump in Dumpfile.all().filter('dataset =', dataset).run():
            dump.delete()

        # Finally, remove the dataset itself
        dataset.delete()
        logging.info('Deleted dataset ' + datasetID)
        return webapp2.redirect('/datasets')
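All three handlers redirect to /datasets, and this one receives datasetID from the URL, so the application presumably maps them with webapp2 routes. The actual URL patterns and handler class names are not in this listing; a guess at the wiring might look like:

import webapp2

# Hypothetical route table; the URL patterns and handler names are assumptions.
app = webapp2.WSGIApplication([
    webapp2.Route('/datasets/reset', handler='handlers.ResetDatasetsHandler'),
    webapp2.Route('/datasets/<datasetID>/delete', handler='handlers.DeleteDatasetHandler'),
])

webapp2 passes the named <datasetID> segment to the handler's get method as a keyword argument, matching the get(self, datasetID) signature above.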