def fix_missing_files():
    """ Finds replays set as "ARCHIVED" that are missing a corresponding file stored in S3. Re-adds them to GC queue. """
    _error = "MISSING_S3_FILE"

    all_s3_replay_ids = [
        replay_file.key[8:-8]
        for replay_file in dotabank_bucket.list()
        if replay_file.key[:8] == "replays/"
    ]

    archived_replays_no_file = Replay.query.filter(
        Replay.state == 'ARCHIVED',
        Replay.id.notin_(all_s3_replay_ids)
    ).all()

    for replay in archived_replays_no_file:
        if not should_fix_be_attempted(replay.id, _error):
            # Tag as "DOWNLOAD_ERROR" because we can't fix this - the problem is
            # entirely in Valve's (or their partners') domain.
            replay.state = "DOWNLOAD_ERROR"
            replay.local_uri = None
            replay.dl_done_time = None
            db.session.add(replay)
            db.session.commit()
            continue

        print("Replay {} is \"ARCHIVED\" but does not have a file stored on S3. Re-adding to GC queue.".format(
            replay.id
        ))

        replay.state = "WAITING_DOWNLOAD"  # Switch state back to WAITING_DOWNLOAD.
        Replay.add_dl_job(replay)
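# should_fix_be_attempted is called by every fix function here but is not defined
# in this module. The sketch below is only an assumption about the behaviour the
# callers rely on (log the attempt, refuse once the retry cap is hit); the real
# helper lives elsewhere and may record more detail, e.g. the `extra` dict.
def should_fix_be_attempted(replay_id, error, extra=None):
    # Refuse once this replay has reached the configured retry cap; callers then
    # mark the replay as errored instead of re-queueing it.
    previous_attempts = ReplayAutoFix.query.filter(
        ReplayAutoFix.replay_id == replay_id
    ).count()
    if previous_attempts >= app.config.get('MAX_REPLAY_FIX_ATTEMPTS'):
        return False

    # Record this attempt so future runs can see the history (the constructor
    # kwargs here are assumed).
    db.session.add(ReplayAutoFix(replay_id=replay_id, error=error))
    db.session.commit()
    return True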
def small_replay_exodus(self):
    small_replay_files = {
        replay_file.key[8:-8]: replay_file.size
        for replay_file in dotabank_bucket.list()
        if replay_file.key[:8] == "replays/" and replay_file.size < (1024 * 1024)
    }

    small_replays = Replay.query.filter(Replay.id.in_(small_replay_files.keys())).all()

    replays_removed = []  # IDs of removed replays
    for replay in small_replays:
        # Save local URI so we can remove the file from S3 after we've changed the database.
        local_uri = replay.local_uri

        # Clean up metadata associated with an archived replay.
        replay.dl_done_time = None
        replay.local_uri = None
        replay.state = "WAITING_DOWNLOAD"

        # Save new state to database.
        db.session.add(replay)
        db.session.commit()

        # Remove bad file from S3.
        dotabank_bucket.delete_key(local_uri or "replays/{}.dem.bz2".format(replay.id))

        # Add a new download job.
        Replay.add_dl_job(replay)

        # Note that we've done things to this replay.
        replays_removed.append(replay.id)

    return jsonify(
        success=True,
        replays_removed=replays_removed
    )
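# small_replay_exodus takes self and returns jsonify(...), which suggests it is a
# method on an admin-only Flask view. A minimal sketch of how such a method might
# be exposed; the base class, import path, and URL rule are assumptions for
# illustration rather than the project's actual wiring.
from flask.ext.admin import BaseView, expose


class MaintenanceView(BaseView):

    @expose('/small_replay_exodus/')
    def small_replay_exodus(self):
        # Body as defined above: re-queue the small replays and delete their S3 files.
        pass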
def fix_long_waiting_download():
    """ Finds replays that have been "WAITING_DOWNLOAD" for over 24 hours, and re-adds them to the DL queue. """
    _error = "LONGEST_WAIT_OF_MY_LIFE"

    replay_waiting_download_over24hrs = Replay.query.filter(
        Replay.state == "WAITING_DOWNLOAD",
        Replay.gc_done_time <= (datetime.utcnow() - timedelta(hours=24))  # Over 24 hrs ago
    ).all()

    for replay in replay_waiting_download_over24hrs:
        if not should_fix_be_attempted(replay.id, _error):
            # Tag as "DOWNLOAD_ERROR" because we can't fix this - the problem is
            # entirely in Valve's (or their partners') domain.
            replay.state = "DOWNLOAD_ERROR"
            replay.local_uri = None
            replay.dl_done_time = None
            db.session.add(replay)
            db.session.commit()
            continue

        print("Replay {} has been \"WAITING_DOWNLOAD\" for over 24 hours. Re-adding to DL queue.".format(
            replay.id
        ))

        replay.state = "WAITING_DOWNLOAD"  # State remains WAITING_DOWNLOAD; just re-add the download job.
        Replay.add_dl_job(replay)
def fix_small_replays():
    """ Finds replays with a tiny filesize and re-adds them to the GC queue (we probably downloaded an error page). """
    _error = "SMALL_REPLAY"

    # FIXME: This step will take longer and longer the more replays we store. It would be more efficient to store
    # the filesize in our local database after a file has been archived, and then directly query the database.
    small_replay_files = {
        replay_file.key[8:-8]: replay_file.size
        for replay_file in dotabank_bucket.list()
        if replay_file.key[:8] == "replays/" and replay_file.size < (1024 * 1024)
    }

    small_replays = db.session.query(
        Replay,
        db.func.count(ReplayAutoFix.id)
    ).filter(
        Replay.state == "ARCHIVED",  # Ignore non-archived files (they shouldn't be in S3 if they aren't archived, but vOv)
        Replay.id.in_(small_replay_files.keys()),  # Check the replays that the S3 call above has flagged as small
        ReplayAutoFix.replay_id == Replay.id
    ).group_by(
        ReplayAutoFix.replay_id
    ).having(
        db.func.count(ReplayAutoFix.id) < app.config.get('MAX_REPLAY_FIX_ATTEMPTS')  # Ignore replays that have exceeded max fix attempts
    ).all()

    for replay, fix_attempts in small_replays:
        if not should_fix_be_attempted(replay.id, _error, extra={
            'file_size': small_replay_files[unicode(replay.id)]
        }):
            continue

        print("Replay {} has a small file stored on S3 ({} bytes). Re-adding to DL queue.".format(
            replay.id,
            small_replay_files[unicode(replay.id)]
        ))

        replay.state = "WAITING_GC"  # Switch state back to WAITING_GC.
        Replay.add_dl_job(replay)
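# The FIXME in fix_small_replays suggests recording each archived file's size
# locally rather than listing the whole S3 bucket on every run. A sketch of the
# query that change would enable, assuming a hypothetical Replay.file_size column
# populated at archive time (no such column exists in the code shown here):
def find_small_archived_replays():
    return Replay.query.filter(
        Replay.state == "ARCHIVED",
        Replay.file_size < (1024 * 1024)  # Under 1 MiB; probably a downloaded error page.
    ).all()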
def requeue_waiting_downloads(self):
    waiting_downloads = Replay.query.filter(Replay.state == "WAITING_DOWNLOAD").all()

    done = []
    for replay in waiting_downloads:
        if Replay.add_dl_job(replay):
            done.append(replay.id)

    return jsonify(
        success=True,
        readded=done
    )
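# The three fix_* functions above take no arguments, so a scheduled maintenance
# task could run them back to back. Whether (and how often) the project actually
# schedules them this way is not shown here; this wrapper is only an illustrative
# sketch.
def run_auto_fixes():
    fix_missing_files()
    fix_long_waiting_download()
    fix_small_replays()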