import logging

import webapp2
from google.appengine.api import urlfetch

# Dataset, Crawl and Dumpfile are the app's datastore models, assumed to be
# imported from the models module alongside these handlers.


def processVoID(self, content, task):
    """Process a VoID description line by line, checking each dataDump for changes."""
    changeDetected = False
    status = "SUCCESS"
    message = "No Changes"
    abortCrawl = False
    for line in content.split("\n"):
        if abortCrawl:
            logging.info("Aborting dataset crawl")
            break
        if line.strip().startswith("void:dataDump"):
            # Get the known dump file meta-data for this URI, if it is recorded in the DB.
            dumpfileURL = line[line.find("<") + 1:line.find(">")]
            lastKnownDump = Dumpfile.all().filter("uri =", dumpfileURL).fetch(1)
            lastKnownHash = lastKnownDump[0].hash if len(lastKnownDump) > 0 else None

            # Download the dump file, retrying on transient failures.
            logging.info("Downloading dump from " + dumpfileURL)
            logging.info("Previously recorded hash for this file: " + str(lastKnownHash))
            downloadPending = True
            retryCount = 0
            while downloadPending and retryCount < 9:
                try:
                    result = urlfetch.fetch(dumpfileURL)
                    logging.info("Completed download from " + dumpfileURL +
                                 " with HTTP " + str(result.status_code))
                    if result.status_code != 200:
                        status = "ERROR"
                        message = "Dump Download(s) Failed"
                        downloadPending = False
                        abortCrawl = True
                    else:
                        downloadPending = False
                        dumpfileHash = self.computeHash(result.content)
                        if dumpfileHash != lastKnownHash:
                            logging.info("Change detected in dump " + dumpfileURL)
                            changeDetected = True
                            message = "Data changed"
                            # Update the existing record if there is one, so the
                            # next crawl compares against the latest hash rather
                            # than a stale duplicate entity.
                            if lastKnownDump:
                                lastKnownDump[0].hash = dumpfileHash
                                lastKnownDump[0].put()
                            else:
                                Dumpfile(dataset=task.dataset,
                                         uri=dumpfileURL,
                                         hash=dumpfileHash).put()
                except Exception:
                    retryCount += 1
                    logging.info("Download failed - retry #" + str(retryCount))
            if downloadPending:
                # All retries were exhausted without a successful response.
                status = "ERROR"
                message = "Dump Download(s) Failed"
                abortCrawl = True
    task.changeDetected = changeDetected
    task.status = status
    task.message = message
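
# processVoID expects one void:dataDump statement per line, with the dump URL
# enclosed in angle brackets, e.g. (sample data for illustration only):
#
#   void:dataDump <http://example.org/dumps/dataset.nt.gz> ;
#
# It also relies on a computeHash helper defined elsewhere on the crawler
# class. Below is a minimal sketch of what that helper presumably looks like,
# assuming a plain content digest is all that change detection needs; the name
# matches the call above, but this implementation is an assumption.
import hashlib


def computeHash(self, content):
    # Hypothetical: any stable digest works here; MD5 suffices since the hash
    # is only compared for equality, not used for security.
    return hashlib.md5(content).hexdigest()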
def get(self, datasetID):
    """Delete a dataset together with its crawl history and dump file records."""
    dataset = Dataset.get_by_id(long(datasetID))
    if dataset is None:
        # No such dataset; return a 404 rather than failing on delete().
        self.abort(404)
    for crawl in Crawl.all().filter('dataset =', dataset).run():
        crawl.delete()
    for dump in Dumpfile.all().filter('dataset =', dataset).run():
        dump.delete()
    dataset.delete()
    logging.info('Deleted dataset ' + datasetID)
    return webapp2.redirect('/datasets')
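
# A sketch of how this delete handler might be routed, assuming the get method
# above lives on a webapp2.RequestHandler subclass named DeleteDatasetHandler
# (the class name and URL pattern are assumptions, not from the original
# source). webapp2 passes the <datasetID> route variable as the keyword
# argument the method signature expects:
#
#   app = webapp2.WSGIApplication([
#       webapp2.Route('/datasets/delete/<datasetID>', handler=DeleteDatasetHandler),
#       # ... plus the /datasets listing page the redirect above targets
#   ])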