Beispiel #1
0
# Pull URL
# Strip query string
# Check whether the URL already exists (has been seen)
# If not: save to sqlite
# If not: push to queue
# If yes: do nothing

# URLs already forwarded by this process. Used as a set: every stored value
# is True, only the keys matter.
seen = {}

if __name__ == '__main__':
    # Deduplicating relay: claim messages from the `ingest` queue and forward
    # each message body to the `scrape` queue the first time it is seen.
    from helpers import client
    ingest = client.queue('ingest')
    scrape = client.queue('scrape')

    while True:
        claimed = ingest.claim(ttl=180, grace=60)
        send = []
        for msg in claimed:
            # NOTE(review): the message is deleted before its body is
            # forwarded, so a failure in scrape.post() below drops the URL.
            # Kept as-is to preserve the existing delivery behavior.
            msg.delete()
            url = msg.body
            if url in seen:
                # Single-argument print(...) behaves the same on Python 2
                # and Python 3.
                print("skipping %s, seen %d pages" % (url, len(seen)))
                continue
            print("Sending along %s" % url)
            seen[url] = True
            send.append({'body': url, 'ttl': 180})

        # Only post when at least one new URL was collected in this batch.
        if send:
            scrape.post(send)
Beispiel #2
0
        else:
            yield parent.scheme + '://' + parent.netloc + parent.path + '/' + link


# URIs that scrapable() has already accepted; values are always True.
seen = {}


def scrapable(uri):
    """Return True the first time an in-scope URI is offered, else False.

    A URI is in scope when it starts with the prefix given as the first
    command-line argument (``sys.argv[1]``). Accepting a URI records it in
    the module-level ``seen`` dict, so later calls with the same URI return
    False. Out-of-scope URIs are never recorded.
    """
    in_scope = uri.startswith(sys.argv[1])
    if in_scope and not seen.get(uri):
        seen[uri] = True
        return True
    return False


if __name__ == '__main__':
    # Scraper worker: claim one page URL at a time from `scrape`, post every
    # newly discovered, scrapable link to `ingest`, report the page on
    # `completed`, then delete the claimed message.
    scrape = client.queue('scrape')
    ingest = client.queue('ingest')
    complete = client.queue('completed')

    while True:
        for msg in scrape.claim(ttl=180, grace=60, limit=1):
            discovered = []
            for link in scrape_generator(msg.body):
                if scrapable(link):
                    discovered.append({'body': link, 'ttl': 180})

            # Skip the post entirely when the page yielded nothing new.
            if discovered:
                ingest.post(discovered)

            complete.post({'body': msg.body, 'ttl': 300})
            # Delete only after the results have been posted.
            msg.delete()
Beispiel #3
0
# 1. Take a URL from the CLI
# 2. Push it to the Zaqar `ingest` queue
# 3. Listen on the `completed` queue and print each message
import sys
import datetime
from helpers import client

if __name__ == '__main__':
    # Seed the pipeline with the URL given on the command line, then print
    # every message that arrives on the `completed` queue, with a timestamp.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: %s URL" % sys.argv[0])

    q = client.queue('ingest')

    q.post({'body': sys.argv[1], 'ttl': 300})

    complete = client.queue('completed')
    while True:
        claimed = complete.claim(ttl=180, grace=60)
        for msg in claimed:
            # Single-argument print(...) behaves the same on Python 2 and
            # Python 3.
            print("%s %s" % (datetime.datetime.now(), msg.body))
            msg.delete()