def process(status=lambda x: None):
    """Process all of the outstanding raw messages.

    Calls ``process_one_element`` in a loop, buffering the keyword
    mapping returned for each element. The buffered mappings are merged
    with ``dict_merge`` and written to the index with ``batch_insert``,
    and the batched polished mutations are sent, every time the batch
    counter passes ``BATCH_SIZE``, when processing stops, and when the
    run is interrupted from the keyboard.

    Args:
        status: Function that accepts a string to report status.
            Right now "status" is just "." for each processed item,
            "!" for each batched save, and "done" followed by a newline
            when the loop finishes normally.

    Todo:
        We should be using multigets to reduce the load on cassandra.
    """
    q = common.Que()
    pool, raw, polished, index = common.open_cassandra_connections()
    polished = polished.batch()
    batched_keywords = []
    counter = 0

    def flush_batch():
        # Write the buffered keywords to the index first, then send the
        # queued polished mutations (same order as the original code).
        if batched_keywords:
            index.batch_insert(dict_merge(*batched_keywords))
            # Empty the buffer in place so the next flush does not
            # re-send keywords that were already written.
            del batched_keywords[:]
        polished.send()

    try:
        while True:
            keywords = process_one_element(q, pool, raw, polished)
            # A falsy result stops the loop.
            # NOTE(review): presumably this means "nothing left to
            # process"; if an element can legitimately yield no keywords
            # this would end the run early -- confirm against
            # process_one_element.
            if not keywords:
                flush_batch()
                break
            # Buffer this element's keywords; without this the index
            # was never written at all.
            batched_keywords.append(keywords)
            counter += 1
            status('.')
            # Strict ">" means a flush happens after BATCH_SIZE + 1 items.
            if counter > BATCH_SIZE:
                status('!')
                flush_batch()
                counter = 0
        status('done\n')
    except KeyboardInterrupt:
        # Persist whatever was fully processed before the interrupt,
        # including the index entries for the polished rows being sent.
        flush_batch()
#!/usr/bin/env python
# Maintenance script: deletes the 'pending' key through the Redis
# connection and truncates the Cassandra column families listed below.
# Destructive -- everything stored in those column families is removed.
import common

# Remove the 'pending' key via the Redis connection.
common.open_redis_connection().delete('pending')

# Only the pool is needed here; the column family handles are ignored.
pool, _raw, _polished, _index = common.open_cassandra_connections()

for cf in ('RawData', 'PolishedData', 'index'):
    # Single-argument print() behaves the same on Python 2 and 3. The two
    # spaces reproduce the output of the old `print "Truncating ", cf`.
    print("Truncating  %s" % cf)
    pool.truncate(cf)
def __init__(self):
    """Set up the batch queue, the Cassandra handles and the counter.

    ``self.raw`` ends up holding the object returned by ``.batch()`` on
    the second value from ``common.open_cassandra_connections()``; the
    third and fourth values are not used. ``self.counter`` starts at 1.
    """
    # Create the queue before opening the Cassandra connections, as the
    # original code does.
    self.que = common.BatchQue()

    connections = common.open_cassandra_connections()
    # Unpacking enforces that exactly four values were returned.
    pool, raw_cf, _unused_polished, _unused_index = connections

    self.pool = pool
    self.raw = raw_cf
    # Replace the plain handle with its batch interface.
    self.raw = self.raw.batch()
    self.counter = 1