def crawl(mod):
    """Crawl every repository of one distro module and store its releases.

    For each repo returned by ``mod.get_repos(test)`` this prints a progress
    line, calls ``mod.crawl_repo(repo)``, passes the releases to
    ``downstream.add_releases``, calls
    ``cache.evict([(None, repo.distro_id)])`` when at least one new release
    was reported, and finally records the crawl marker through
    ``downstream.set_last_crawl``.

    Reads the module-level names ``test`` and ``last``. When ``last`` is
    falsy, ``repo.last_crawl`` is reset to ``None`` before the repo is
    crawled.

    Args:
        mod: object providing ``get_repos(test)`` and ``crawl_repo(repo)``.

    Returns:
        None.
    """
    # NOTE(review): an identical definition of crawl appears a second time in
    # this file; the later one replaces this one when the module is loaded.
    # NOTE(review): the driver code appends this function's return value to
    # its stats list, so it records None per distro -- confirm that is wanted.
    release_cache = Cache()
    repo_list = mod.get_repos(test)

    for position, repo in enumerate(repo_list):
        # Progress line: zero-based position, total count, then the repo.
        print("%s/%s %s" % (position, len(repo_list), repo))
        started = time.clock()

        if not last:
            repo.last_crawl = None

        crawl_marker, releases = mod.crawl_repo(repo)
        new_count = downstream.add_releases(
            repo, releases, test, release_cache)

        if new_count > 0:
            release_cache.evict([(None, repo.distro_id)])

        downstream.set_last_crawl(repo, crawl_marker, test)

        # The layout mirrors the Python 2 print statement exactly: no space
        # is inserted after an item that ends in a tab.
        print("\t%s new releases \t\t%s secs"
              % (new_count, time.clock() - started))
def crawl(mod):
    """Crawl every repository of one distro module and store its releases.

    For each repo returned by ``mod.get_repos(test)`` this prints a progress
    line, calls ``mod.crawl_repo(repo)``, passes the releases to
    ``downstream.add_releases``, calls
    ``cache.evict([(None, repo.distro_id)])`` when at least one new release
    was reported, and finally records the crawl marker through
    ``downstream.set_last_crawl``.

    Reads the module-level names ``test`` and ``last``. When ``last`` is
    falsy, ``repo.last_crawl`` is reset to ``None`` before the repo is
    crawled.

    Args:
        mod: object providing ``get_repos(test)`` and ``crawl_repo(repo)``.

    Returns:
        None.
    """
    # NOTE(review): an identical definition of crawl appears earlier in this
    # file; this later one is the definition that stays bound after loading.
    # NOTE(review): the driver code appends this function's return value to
    # its stats list, so it records None per distro -- confirm that is wanted.
    release_cache = Cache()
    repo_list = mod.get_repos(test)

    for position, repo in enumerate(repo_list):
        # Progress line: zero-based position, total count, then the repo.
        print("%s/%s %s" % (position, len(repo_list), repo))
        started = time.clock()

        if not last:
            repo.last_crawl = None

        crawl_marker, releases = mod.crawl_repo(repo)
        new_count = downstream.add_releases(
            repo, releases, test, release_cache)

        if new_count > 0:
            release_cache.evict([(None, repo.distro_id)])

        downstream.set_last_crawl(repo, crawl_marker, test)

        # The layout mirrors the Python 2 print statement exactly: no space
        # is inserted after an item that ends in a tab.
        print("\t%s new releases \t\t%s secs"
              % (new_count, time.clock() - started))
# Driver: crawl each configured downstream distro, then each upstream source,
# call cache.evict([(None, None)]), and pickle the collected (target, result)
# pairs to crawl_stats/<unix timestamp>.pickle.
#
# NOTE(review): this driver sequence appears twice in this file, so the whole
# crawl would run twice -- confirm whether the duplicate is intended.
# NOTE(review): time.clock() reports CPU time on Unix and wall-clock time on
# Windows; for a network-bound crawl the printed "seconds" may be far below
# elapsed time -- confirm which measurement is wanted.
total_start = time.clock()
stats = []

for d in downstream_targets:
    s = time.clock()
    try:
        stats.append((d, crawl(DISTROS[d])))
    except Exception:
        # Best effort: report the failure and continue with the next distro.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit stop the run.
        print("error from distro: %s" % (d,))
        print(traceback.format_exc())
    gc.collect()
    print("%s distro seconds" % (time.clock() - s,))

for u in upstream_targets:
    s = time.clock()
    try:
        stats.append((u, UPSTREAM[u].crawl(test)))
    except Exception:
        # Same best-effort policy as for the distros above.
        print("error from upstream: %s" % (u,))
        print(traceback.format_exc())
    gc.collect()
    print("%s upstream seconds" % (time.clock() - s,))

cache = Cache()
cache.evict([(None, None)])

print("%s seconds total" % (time.clock() - total_start,))

# Binary mode is the mode the pickle documentation recommends; the with-block
# closes the file even if pickle.dump raises.
with open("crawl_stats/" + str(int(time.time())) + ".pickle", "wb") as save_to:
    pickle.dump(stats, save_to)
# Driver: crawl each configured downstream distro, then each upstream source,
# call cache.evict([(None, None)]), and pickle the collected (target, result)
# pairs to crawl_stats/<unix timestamp>.pickle.
#
# NOTE(review): this driver sequence appears twice in this file, so the whole
# crawl would run twice -- confirm whether the duplicate is intended.
# NOTE(review): time.clock() reports CPU time on Unix and wall-clock time on
# Windows; for a network-bound crawl the printed "seconds" may be far below
# elapsed time -- confirm which measurement is wanted.
total_start = time.clock()
stats = []

for d in downstream_targets:
    s = time.clock()
    try:
        stats.append((d, crawl(DISTROS[d])))
    except Exception:
        # Best effort: report the failure and continue with the next distro.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit stop the run.
        print("error from distro: %s" % (d,))
        print(traceback.format_exc())
    gc.collect()
    print("%s distro seconds" % (time.clock() - s,))

for u in upstream_targets:
    s = time.clock()
    try:
        stats.append((u, UPSTREAM[u].crawl(test)))
    except Exception:
        # Same best-effort policy as for the distros above.
        print("error from upstream: %s" % (u,))
        print(traceback.format_exc())
    gc.collect()
    print("%s upstream seconds" % (time.clock() - s,))

cache = Cache()
cache.evict([(None, None)])

print("%s seconds total" % (time.clock() - total_start,))

# Binary mode is the mode the pickle documentation recommends; the with-block
# closes the file even if pickle.dump raises.
with open("crawl_stats/" + str(int(time.time())) + ".pickle", "wb") as save_to:
    pickle.dump(stats, save_to)