def main():
    # Stage: synchronise the stored URL set with the currently valid one.
    disk.getFileLockOrDie("locks/backend.pid")
    storedUrlsSet = frozenset(storage.getUrls())
    logging.info("Got %s stored urls", len(storedUrlsSet))
    validUrlsSet = frozenset(getValidUrls())
    if len(validUrlsSet) == 0:
        logging.error("No valid URL found")
        sys.exit(1)
    logging.info("Found %s valid urls", len(validUrlsSet))
    # Purge stored URLs that are no longer valid ...
    invalidUrlsSet = storedUrlsSet - validUrlsSet
    for url in invalidUrlsSet:
        storage.purge(url)
    # ... and hand the valid-but-unknown ones over for discovery.
    discoverUnknownUrls(validUrlsSet - storedUrlsSet)
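Every stage opens with disk.getFileLockOrDie, which is not shown in this excerpt. A minimal sketch of what it could look like, assuming an fcntl-based exclusive lock (the implementation is a guess; only the function name and the lock-file path come from the code above):

import fcntl
import os
import sys

_lockFile = None  # module-level so the lock is held for the whole run

def getFileLockOrDie(path):
    # Hypothetical sketch: take an exclusive, non-blocking lock on `path`
    # and abort when another instance of the stage already holds it.
    global _lockFile
    _lockFile = open(path, "w")
    try:
        fcntl.flock(_lockFile, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        sys.exit("Lock %s is held by another instance, exiting" % path)
    _lockFile.write("%d\n" % os.getpid())
    _lockFile.flush()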
def main():
    # One-off migration: re-key the stored creation dates from the old
    # hash scheme to the one used by AntiFraud._getHashKey.
    disk.getFileLockOrDie("locks/backend.pid")
    newMap = {}
    oldMap = load(AntiFraud.REAL_DATES_PATH)
    analyses = report.fetchReport()
    for analysis in analyses:
        oldHashKey = getOldHashKey(analysis)
        newHashKey = AntiFraud._getHashKey(analysis)
        createdDate = oldMap.get(oldHashKey)
        if createdDate is not None:
            # "3000" is a sentinel that compares greater than any real
            # date string, so the oldest date wins when keys collide.
            olderDate = min(newMap.get(newHashKey, "3000"), createdDate)
            newMap[newHashKey] = olderDate
            logging.debug("Found date %s for %s", olderDate, newHashKey)
    if len(newMap) == 0:
        raise Exception("No dates found.")
    save(newMap, AntiFraud.REAL_DATES_PATH)
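The load and save helpers are not shown either. A plausible sketch, assuming the date map is persisted as JSON (the file format is an assumption; only the call sites appear above):

import json

def load(path):
    # Hypothetical sketch: read the persisted map, or start with an
    # empty one when the file does not exist yet.
    try:
        with open(path) as f:
            return json.load(f)
    except IOError:
        return {}

def save(mapping, path):
    with open(path, "w") as f:
        json.dump(mapping, f)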
def main():
    # Stage: walk the listing page by page and queue URLs we have not
    # seen before; stop at the first page that yields nothing new.
    disk.getFileLockOrDie("locks/backend.pid")
    num_found = 0
    for index in xrange(10**6):
        wasNew = False
        urls = urlCollector.collectUrls(index)
        num_found += len(urls)
        for url in urls:
            if not storage.isDiscovered(url):
                wasNew = True
                logging.info("Discovered new url: %s", url)
                storage.storeUrl(url)
                tobe.toDownload(url)
        if not wasNew:
            break
    if num_found == 0:
        logging.error("No valid URL discovered")
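The tobe module is the hand-off between stages. A sketch of the interface the stages above rely on, assuming one plain-text queue file per stage (the file layout and the underscore-prefixed helpers are assumptions; the public function names appear in the code):

import os

_QUEUE_DIR = "tobe"  # assumed location of the queue files

def _append(stage, url):
    with open(os.path.join(_QUEUE_DIR, stage), "a") as f:
        f.write(url + "\n")

def _readAll(stage):
    path = os.path.join(_QUEUE_DIR, stage)
    if not os.path.exists(path):
        return []
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def _clear(stage):
    path = os.path.join(_QUEUE_DIR, stage)
    if os.path.exists(path):
        os.remove(path)

def toDownload(url):
    _append("download", url)

def getToBeDownloaded():
    return _readAll("download")

def nothingToBeDownloaded():
    _clear("download")

# toAnalyse/toReport and their get/nothing counterparts would follow
# the same pattern for the "analyse" and "report" queues.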
def main():
    # Stage: download everything queued for download and pass the
    # successfully fetched URLs on to the analysis stage.
    disk.getFileLockOrDie("locks/backend.pid")
    options, args = parseArgs()
    if options.fix:
        # Recovery mode: retry every stored URL whose content is missing.
        urls = [url for url in storage.getUrls()
                if not storage.isDownloaded(url)]
    else:
        urls = tobe.getToBeDownloaded()
    logging.info("Downloading %s urls", len(urls))
    for url in urls:
        content = _try_download(url)
        if content is not None:
            storage.storeContent(url, content)
            tobe.toAnalyse(url)
    tobe.nothingToBeDownloaded()
    logging.info("Downloaded %s urls", len(urls))
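_try_download is referenced but not defined in this excerpt. A hedged sketch of a helper that swallows failures and returns None, so the loop above can simply skip a URL that could not be fetched (urllib2 and the timeout value are assumptions):

import logging
import urllib2

def _try_download(url):
    # Hypothetical sketch: fetch the page body; log and return None on
    # any network error instead of aborting the whole stage.
    # urllib2.URLError subclasses IOError in Python 2, so one except
    # clause covers connection errors, HTTP errors and timeouts.
    try:
        return urllib2.urlopen(url, timeout=30).read()
    except IOError as e:
        logging.warning("Download of %s failed: %s", url, e)
        return None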
def main():
    # Stage: analyse downloaded URLs, queue them for reporting and
    # refresh the report itself.
    disk.getFileLockOrDie("locks/backend.pid")
    options, args = parseArgs()
    if options.fix:
        # Recovery mode: re-run the analysis over everything downloaded,
        # dropping the report queue so it is rebuilt from scratch.
        urls = [url for url in storage.getUrls() if storage.isDownloaded(url)]
        tobe.nothingToBeReported()
        if not options.clean:
            urls = _skipAnalysed(urls)
    else:
        urls = tobe.getToBeAnalysed()
    for url in urls:
        analysis = analyser.analyseUrl(url)
        storage.storeAnalysis(url, analysis)
        tobe.toReport(url)
    tobe.nothingToBeAnalysed()
    logging.info("Analysed %s urls", len(urls))
    cleanStart = options.fix or options.clean
    updateReport(cleanStart)
    warmer.updateWarmPicture()
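_skipAnalysed is a small helper not shown here. A one-line sketch, assuming storage exposes an isAnalysed predicate (that name is a guess; only the helper's name and call site come from the code above):

def _skipAnalysed(urls):
    # Hypothetical sketch: keep only URLs with no stored analysis yet.
    return [url for url in urls if not storage.isAnalysed(url)]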