import logging

from common import engine

log = logging.getLogger(__name__)

raw_table = engine.get_table('raw')
#status_table = engine.get_table('raw_dumped')

BATCH_SIZE = 10000


def dump_batches():
    """Dump one batch of raw tweets to a JSON file and delete them.

    Returns True if a full batch was written, False once fewer than
    BATCH_SIZE rows remain in the table.
    """
    if len(raw_table) < BATCH_SIZE:
        log.info("Not enough entries remaining.")
        return False
    data, min_id = [], None
    log.info("Fetching %s raw tweets...", BATCH_SIZE)
    # Delete rows as they are read, inside a transaction, so a failed
    # dump rolls back instead of losing tweets.
    engine.begin()
    for row in raw_table.find(_limit=BATCH_SIZE, order_by=['id']):
        if min_id is None:
            min_id = row['id']
        data.append(row['json'])
        raw_table.delete(id=row['id'])
    log.info("Saving file...")
    # The dump file is named after the lowest tweet id in the batch.
    with open('dumps/raw_%s.json' % min_id, 'wb') as fh:
        fh.write('\n'.join(data).encode('utf-8'))
    engine.commit()
    return True
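
# Usage sketch (not in the original script): dump_batches() returns True
# while full batches remain, so a driver loop can drain the table. The
# logging setup is an assumption about how the script is invoked.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    while dump_batches():
        pass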
import json
import logging

from common import engine
from dataset.freeze.format.fjson import JSONEncoder
import sqlalchemy.sql.expression as sql

log = logging.getLogger(__name__)

hashtags_tbl = engine.get_table('hashtags').table
#status_table = engine.get_table('raw_dumped')


def dump_hashtag(tag):
    """Collect all statuses (joined with their users) carrying ``tag``."""
    data = []
    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    # Join statuses to their users and hashtags, select both tables with
    # labelled columns to avoid name clashes, then filter on the tag
    # (case-insensitively) and order by status id.
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    q = q.join(hashtags_tbl, status_tbl.c.id == hashtags_tbl.c.status_id)
    q = sql.select([status_tbl, user_tbl], from_obj=q, use_labels=True)
    q = q.where(hashtags_tbl.c.text.ilike(tag))
    q = q.order_by(hashtags_tbl.c.status_id.asc())
    for row in engine.query(q):
        data.append(row)
        #data.append(json.loads(row['raw_json']))
    return data

#for json_file in os.listdir('dumps'):
#    print json_file, len(data)
#    #min_id = int(json_file.split('.', 1)[0].split('_', 1)[-1])
#    fh = open('dumps/%s' % json_file, 'rb')
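
# Usage sketch (not in the original): dump_hashtag() only accumulates rows,
# so a caller must serialize them. The command-line entry point and output
# filename are assumptions for illustration; the JSONEncoder imported above
# is reused on the assumption it handles datetime values from the query.
if __name__ == '__main__':
    import sys
    logging.basicConfig(level=logging.INFO)
    rows = dump_hashtag(sys.argv[1])
    out_path = 'dumps/hashtag_%s.json' % sys.argv[1].strip('#')
    with open(out_path, 'w') as fh:
        json.dump([dict(r) for r in rows], fh, cls=JSONEncoder)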