# Pull URL
# Strip query string
# Check whether the URL has been seen:
#   if not seen: save to sqlite
#   if not seen: push to queue
#   if seen: do nothing

seen = {}

if __name__ == '__main__':
    from helpers import client

    ingest = client.queue('ingest')
    scrape = client.queue('scrape')

    while True:
        # Claim a batch of URLs from the ingest queue, forward the ones we
        # have not seen yet to the scrape queue, and drop the duplicates.
        claimed = ingest.claim(ttl=180, grace=60)
        send = []
        for msg in claimed:
            msg.delete()
            if seen.get(msg.body):
                print "skipping %s, seen %d pages" % (msg.body, len(seen))
                continue
            print "Sending along %s" % msg.body
            seen[msg.body] = True
            send.append({'body': msg.body, 'ttl': 180})
        if len(send):
            scrape.post(send)
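The plan in the comments mentions saving seen URLs to sqlite, but the loop above only keeps them in an in-memory dict, so a restart forgets everything that has already been forwarded. A minimal sketch of a sqlite-backed seen set, using only the standard-library sqlite3 module, might look like this; the SeenStore name and the seen.db file are made up for illustration:

import sqlite3


class SeenStore(object):
    """Tracks already-forwarded URLs in a sqlite database so the
    de-duplication survives restarts. Names here are illustrative."""

    def __init__(self, path='seen.db'):
        self.conn = sqlite3.connect(path)
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS seen (url TEXT PRIMARY KEY)')
        self.conn.commit()

    def add(self, url):
        """Record a URL; returns False if it was already recorded."""
        try:
            self.conn.execute('INSERT INTO seen (url) VALUES (?)', (url,))
            self.conn.commit()
            return True
        except sqlite3.IntegrityError:
            return False

With something like that in place, the loop could replace the seen.get() check and the seen[msg.body] = True assignment with a single store.add(msg.body) call.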
        # (tail of scrape_generator, which is defined earlier in this file
        #  along with the sys and helpers imports; this branch resolves a
        #  page-relative link against the parent page's scheme, host and path)
        else:
            yield parent.scheme + '://' + parent.netloc + parent.path + '/' + link

seen = {}


def scrapable(uri):
    # Only follow links under the root URL given on the command line,
    # and only follow each URL once.
    if not uri.startswith(sys.argv[1]):
        return False
    if seen.get(uri):
        return False
    seen[uri] = True
    return True


if __name__ == '__main__':
    scrape = client.queue('scrape')
    ingest = client.queue('ingest')
    complete = client.queue('completed')

    while True:
        claimed = scrape.claim(ttl=180, grace=60, limit=1)
        for msg in claimed:
            messages = [{'body': u, 'ttl': 180}
                        for u in scrape_generator(msg.body)
                        if scrapable(u)]
            if len(messages):
                ingest.post(messages)
            complete.post({'body': msg.body, 'ttl': 300})
            msg.delete()
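Only the last branch of scrape_generator appears above. For reference, a hypothetical version of such a generator can be built entirely from the Python 2 standard library: fetch the page with urllib2, collect href attributes with HTMLParser, and resolve them against the parent URL with urlparse. Everything here except the final else branch is an assumption, not the post's actual implementation:

import urllib2
from urlparse import urlparse
from HTMLParser import HTMLParser


class LinkParser(HTMLParser):
    """Collects the href attribute of every anchor tag."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)


def scrape_generator(url):
    parent = urlparse(url)
    parser = LinkParser()
    parser.feed(urllib2.urlopen(url).read())
    for link in parser.links:
        if link.startswith('http'):
            # Already absolute
            yield link
        elif link.startswith('/'):
            # Root-relative: join with the scheme and host of the parent page
            yield parent.scheme + '://' + parent.netloc + link
        else:
            # Page-relative: join with the parent page's path
            yield parent.scheme + '://' + parent.netloc + parent.path + '/' + link

A real scraper would also want error handling and a content-type check before feeding the response body to the parser.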
# 1. Take a URL from the CLI
# 2. Push it to Zaqar
# 3. Listen on the `completed` queue and print each finished URL
import sys
import datetime

from helpers import client

if __name__ == '__main__':
    # Seed the pipeline with the starting URL.
    q = client.queue('ingest')
    q.post({'body': sys.argv[1], 'ttl': 300})

    # Then sit on the completed queue and print every URL as it finishes.
    complete = client.queue('completed')
    while True:
        claimed = complete.claim(ttl=180, grace=60)
        for msg in claimed:
            print "%s %s" % (datetime.datetime.now(), msg.body)
            msg.delete()
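All three scripts lean on a small helpers module that hands back ready-to-use Zaqar queues. That module is not shown here; one plausible version, assuming python-zaqarclient's v1 API and a local Zaqar endpoint, is sketched below. The URL and every auth option are placeholders:

# helpers.py (hypothetical): wrap python-zaqarclient so the scripts can do
# `from helpers import client` and then `client.queue('ingest')`.
from zaqarclient.queues.v1 import client as zaqar

URL = 'http://localhost:8888'  # assumed Zaqar endpoint

conf = {
    'auth_opts': {
        'options': {
            'client_uuid': 'replace-with-a-uuid',
            'os_auth_token': 'replace-with-a-token',
            'os_auth_url': 'http://keystone.example.com:5000/v3.0/',
            'os_project_id': 'replace-with-a-project-id',
        },
    },
}

client = zaqar.Client(URL, conf=conf)

Keeping the endpoint and credentials in one module means the ingest, scrape, and kickoff scripts stay as short as the ones above.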