def iterate_from_s3(game_id, bucket, logname, start_time, end_time, verbose=True):
    assert start_time > 0

    # to protect against same-time collisions, create a unique fake "PID" for MongoDB row _ids
    sha = hashlib.sha1()
    sha.update(game_id)
    dig = sha.digest()
    fake_pid = (ord(dig[1]) << 8) | ord(dig[0])

    s3 = SpinS3.S3(SpinConfig.aws_key_file())
    last_id_time = -1
    id_serial = 0

    for t in xrange(86400 * (start_time // 86400), 86400 * (end_time // 86400), 86400): # for each day
        y, m, d = SpinConfig.unix_to_cal(t)
        prefix = '%04d%02d/%s-%04d%02d%02d-%s' % (y, m, SpinConfig.game_id_long(override_game_id=game_id), y, m, d, logname)

        for entry in s3.list_bucket(bucket, prefix=prefix):
            filename = entry['name'].split('/')[-1]
            if verbose: print 'reading', filename

            if entry['name'].endswith('.zip'):
                tf = tempfile.NamedTemporaryFile(prefix=logname + '-' + filename, suffix='.zip')
                s3.get_file(bucket, entry['name'], tf.name)
                unzipper = subprocess.Popen(['unzip', '-q', '-p', tf.name], stdout=subprocess.PIPE)
            elif entry['name'].endswith('.gz'):
                tf = tempfile.NamedTemporaryFile(prefix=logname + '-' + filename, suffix='.gz')
                s3.get_file(bucket, entry['name'], tf.name)
                unzipper = subprocess.Popen(['gunzip', '-c', tf.name], stdout=subprocess.PIPE)
            else:
                raise Exception('unhandled file extension: ' + entry['name'])

            for line in unzipper.stdout.xreadlines():
                row = SpinJSON.loads(line)
                if row['time'] < start_time: continue # skip ahead
                elif row['time'] >= end_time: break

                if '_id' not in row:
                    # synthesize a fake MongoDB row ID
                    if row['time'] != last_id_time:
                        last_id_time = row['time']
                        id_serial = 0
                    row['_id'] = SpinNoSQLId.creation_time_id(row['time'], pid=fake_pid, serial=id_serial)
                    assert SpinNoSQLId.is_valid(row['_id'])
                    id_serial += 1

                # note: there's a small chance this could end up duplicating an event
                # at the boundary of an S3 import and MongoDB import
                if verbose: print row
                yield row
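# Usage sketch for iterate_from_s3(): replay one UTC day of archived
# "metrics" events from the 'spinpunch-logs' bucket (the same bucket
# do_slave() below reads from). The game_id, the day chosen, and the
# print consumer are illustrative assumptions, not values from config.json.
def _example_replay_metrics_day():
    day_start = 86400 * (1388620800 // 86400) # 2014-01-02 00:00 UTC
    for row in iterate_from_s3('mf', 'spinpunch-logs', 'metrics',
                               day_start, day_start + 86400, verbose=False):
        # every yielded row carries 'time' and a valid '_id'
        print row['time'], row.get('event_name', 'unknown')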
def do_upload(nosql_client, table, verbose, dry_run, keep_local):
    if not table['s3_name']: return

    msg_fd = sys.stderr if verbose else NullFD()
    s3_logs = SpinS3.S3(s3_key_file_for_logs)
    print >> msg_fd, '%s: upload' % (table['table_name'])
    tbl = nosql_client._table(table['table_name'])

    # find earliest timestamp
    first = list(tbl.find({}, {'time': 1}).sort([('time', 1)]).limit(1))
    if not first:
        print >> msg_fd, 'no records'
        return
    start_time = first[0]['time']

    # snap to day boundary
    start_time = 86400 * (start_time // 86400)
    today_start = 86400 * (time_now // 86400)

    # check each full UTC day from start_time, stopping before the current day
    while start_time < today_start:
        date_str = time.strftime('%Y%m%d', time.gmtime(start_time))
        year_month = date_str[:-2]
        obj_name = '%s/%s-%s-%s.json.%s' % (year_month, SpinConfig.game_id_long(), date_str, table['s3_name'], table['compression'])
        print >> msg_fd, ' checking %s/%s...' % (log_bucket, obj_name),
        msg_fd.flush()

        if s3_logs.exists(log_bucket, obj_name, has_read_permission=False):
            print >> msg_fd, 'already exists, skipping.'
        else:
            # upload one day's data
            print >> msg_fd, 'does not exist, dumping...'

            # spit out the entries to a flat file using SpinLog
            tf_name = '%s/%s-%s-%s.json' % (tempfile.gettempdir(), SpinConfig.game_id_long(), date_str, table['s3_name'])
            try:
                target = SpinLog.SimpleJSONLog(tf_name, buffer=-1)
                cursor = tbl.find({'time': {'$gte': start_time, '$lt': start_time + 86400}}).sort([('time', 1)])
                total = cursor.count()
                count = 0
                for row in cursor:
                    if '_id' in row:
                        if type(row['_id']) is bson.objectid.ObjectId:
                            row['_id'] = SpinNoSQL.NoSQLClient.decode_object_id(row['_id'])
                    assert 'time' in row
                    t = row['time']
                    del row['time']
                    target.event(t, row)
                    count += 1
                    if count == 1 or count == total or (count % 1000) == 0:
                        print >> msg_fd, '\r %d/%d %s dump' % (count, total, table['table_name']),
                print >> msg_fd, 'finished'
                target.close()

                # compress the file
                obj_file_name = os.path.basename(obj_name)
                print >> msg_fd, ' compressing', os.path.basename(tf_name), '->', os.path.basename(obj_file_name), '...',
                msg_fd.flush()
                assert table['compression'] == 'zip'
                save_cwd = os.getcwd()
                try:
                    os.chdir(os.path.dirname(tf_name))
                    args = ['/usr/bin/zip', '-q', os.path.basename(obj_file_name), os.path.basename(tf_name)]
                    subprocess.check_call(args)
                    print >> msg_fd, 'done'

                    print >> msg_fd, ' uploading', obj_file_name, '->', log_bucket + ':' + obj_name, '...',
                    msg_fd.flush()
                    if not dry_run:
                        s3_logs.put_file(log_bucket, obj_name, os.path.basename(obj_file_name))
                finally:
                    safe_unlink(os.path.basename(obj_file_name))
                    os.chdir(save_cwd)
            finally:
                if keep_local:
                    print >> msg_fd, ' KEEPING', tf_name
                else:
                    safe_unlink(tf_name)
            print >> msg_fd, 'done'

        start_time += 86400
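# Usage sketch for do_upload(): 'table' is a descriptor dict whose keys are
# exactly the ones do_upload() reads above; the values and the pre-built
# nosql_client are illustrative assumptions, not configuration from this repo.
def _example_upload_one_table(nosql_client):
    table = {'table_name': 'log_credits', # MongoDB collection to archive
             's3_name': 'credits',        # middle component of the S3 object name
             'compression': 'zip'}        # do_upload() asserts this is 'zip'
    # dry_run=True exercises the dump and compress steps without writing to S3
    do_upload(nosql_client, table, verbose=True, dry_run=True, keep_local=False)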
def do_slave(task):
    date = task['date']
    game_id = task['game_id']
    verbose = task['verbose']
    dry_run = task['dry_run']
    commit_interval = task['commit_interval']

    start_time = SpinConfig.cal_to_unix((int(date[0:4]), int(date[4:6]), int(date[6:8])))
    end_time = start_time + 86400

    gamedata = SpinJSON.load(open(SpinConfig.gamedata_filename(override_game_id=game_id)))
    STORE = {}
    [get_store_items(STORE, sku) for sku in gamedata['store']['catalog']]

    if verbose: print >> sys.stderr, 'converting date', date, 'start_time', start_time, 'end_time', end_time, '...'

    if not verbose: filterwarnings('ignore', category=MySQLdb.Warning)
    cfg = SpinConfig.get_mysql_config(game_id + '_upcache')
    con = MySQLdb.connect(*cfg['connect_args'], **cfg['connect_kwargs'])
    store_table = cfg['table_prefix'] + game_id + '_store'

    s3 = SpinS3.S3(SpinConfig.aws_key_file())
    bucket = 'spinpunch-logs'

    batch = 0
    total = 0
    cur = con.cursor()

    for entry in s3.list_bucket(bucket, prefix='%s/%s-%s-metrics.json' % (date[0:6], SpinConfig.game_id_long(override_game_id=game_id), date)):
        filename = entry['name'].split('/')[-1]
        if verbose: print >> sys.stderr, 'reading', filename

        if entry['name'].endswith('.zip'):
            tf = tempfile.NamedTemporaryFile(prefix='old_metrics_to_mysql-' + filename, suffix='.zip')
            s3.get_file(bucket, entry['name'], tf.name)
            unzipper = subprocess.Popen(['unzip', '-q', '-p', tf.name], stdout=subprocess.PIPE)
        elif entry['name'].endswith('.gz'):
            # stream the gzipped log straight out of S3 without a temp file
            fd = s3.get_open(bucket, entry['name'], allow_keepalive=False)
            unzipper = subprocess.Popen(['gunzip', '-c', '-'], stdin=fd.fileno(), stdout=subprocess.PIPE)
        else:
            raise Exception('unhandled file extension: ' + entry['name'])

        for line in unzipper.stdout.xreadlines():
            if '5120_buy_item' in line: #and ('item:token' in line):
                # note: this rebinds 'entry' from the S3 listing entry to the
                # parsed event; safe because entry['name'] is not used below
                entry = SpinJSON.loads(line)
                if entry['event_name'] != '5120_buy_item': continue

                if 'price_currency' not in entry:
                    # old metric, need to fill in manually
                    if entry['items'][0]['spec'] in STORE:
                        entry['price_currency'] = 'item:token'
                        entry['price'] = STORE[entry['items'][0]['spec']]

                if verbose: print >> sys.stderr, SpinJSON.dumps(entry)
                if entry.get('price_currency', 'unknown') != 'item:token': continue

                if '_id' in entry:
                    entry_id = entry['_id']
                else:
                    id_generator.set_time(int(time.time()))
                    entry_id = id_generator.generate() # arbitrary

                assert len(entry['items']) == 1
                item = entry['items'][0]
                keyvals = [('_id', entry_id),
                           ('time', entry['time']),
                           ('user_id', entry['user_id']),
                           ('price', entry['price']),
                           ('currency', entry['price_currency']),
                           ('item', item['spec']),
                           ('stack', item.get('stack', 1))]

                query = "INSERT INTO " + store_table + \
                        " (" + ', '.join(['`'+k+'`' for k, v in keyvals]) + ")" + \
                        " VALUES (" + ', '.join(['%s'] * len(keyvals)) + ")"
                if dry_run:
                    print >> sys.stderr, query, [v for k, v in keyvals]
                else:
                    cur.execute(query, [v for k, v in keyvals])

                batch += 1
                total += 1
                if commit_interval > 0 and batch >= commit_interval:
                    batch = 0
                    con.commit()
                    cur = con.cursor()

    if verbose: print >> sys.stderr, total, 'inserted'
    if not dry_run: con.commit()
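# Usage sketch for do_slave(): each task is a plain dict, normally fanned out
# over a list of dates via SpinParallel. The keys are the ones do_slave()
# reads above; the values here are illustrative.
def _example_convert_one_day():
    do_slave({'date': '20140101',        # UTC day to convert, YYYYMMDD
              'game_id': 'mf',           # hypothetical game id
              'verbose': True,
              'dry_run': True,           # print INSERT statements instead of executing them
              'commit_interval': 1000})  # commit to MySQL after this many rows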
# set TMPDIR environment variable to a suitable location

import sys, os, getopt, time, tempfile, shutil
import SpinS3
import SpinUserDB
import SpinConfig
import SpinParallel
import SpinSingletonProcess
import subprocess

date_str = time.strftime('%Y%m%d', time.gmtime())

# autoconfigure based on config.json
game_id = SpinConfig.config['game_id']
backup_bucket = 'spinpunch-backups'
backup_obj_prefix = '%s-player-data-%s/' % (SpinConfig.game_id_long(), date_str)
s3_key_file_for_db = SpinConfig.aws_key_file()
s3_key_file_for_backups = SpinConfig.aws_key_file()

# swallows progress output when not running verbose
class NullFD(object):
    def write(self, stuff): pass

def backup_s3_dir(title, bucket_name, prefix='', ignore_errors=False, verbose=False):