def update_session_status(session, status, archive_id=False):
    """Update stored session item status"""
    if archive_id:
        ytarchive().sessionsUpdate({
            'id': session.id,
            'archive_id': archive_id,
            'state': status
        })
    else:
        ytarchive().sessionsUpdate({'id': session.id, 'state': status})
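# Usage sketch for update_session_status: mark a session as syncing, or
# attach an archive.org identifier once the first upload succeeds. The
# identifier value here is illustrative, copied from the sample data below.
update_session_status(session, c.SESSION_SYNCING)
update_session_status(session, c.SESSION_SYNCED, archive_id="om-1-179")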
def store_unharvested_file(file):
    """Insert a new live file into our stored cache"""
    ytarchive().filesInsert({
        'id': file.id,
        'session_id': file.session_id,
        'url': file.url,
        'type': file.type,
        'state': c.FILE_NEW
    })
    log(file, "New File", c.FILE_NEW)
def sync():
    args = get_args()
    if 'site' in args and args.site:
        session = ytarchive().sessionsGet(id=None, params={
            'state': c.SESSION_PROCESSED,
            'site_id': args.site,
            'sort': 'last_updated:asc',
            'limit': 1
        })
    else:
        session = ytarchive().sessionsGet(id=None, params={
            'state': c.SESSION_PROCESSED,
            'sort': 'last_updated:asc',
            'limit': 1
        })
    if session:
        update_session_status(session, c.SESSION_SYNCING)
        log(session, "Session queued for archive.org sync", c.SESSION_SYNCING)
        update_success = True
        delete_success = True
        archive_id = session_archive_id(session)
        archive_info = False
        session_files = ytarchive().filesGet(id=None, params={
            'session_id': session.id,
            'state': c.FILE_PROCESSED
        })
        # remove any unchanged files and mark them as finished
        if session.archive_id:
            archive_info = get_item(archive_id)
            session_files = finish_unchanged_files(archive_info, session_files)
        update_success = archive_update(archive_id, session, session_files,
                                        archive_info)
        # remove any files no longer present on session
        # note that we never delete videos or sessions for permanent backup
        removed_session_files = ytarchive().filesGetRemoved(session.id)
        if removed_session_files:
            delete_success = archive_delete_removed_files(
                archive_id, removed_session_files, session)
        if not delete_success:
            update_session_status(session, c.SESSION_FAILED)
            log(session, "Failed to delete items from archive.org",
                c.SESSION_FAILED, c.LOG_ERROR)
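# A minimal sketch of the session_archive_id helper sync() relies on. The
# sample data pairs archive_id "om-1-179" with site_id=1 and session id 179,
# so the format below is an inference from that data, not a confirmed spec.
def session_archive_id(session):
    """Build the archive.org item identifier for a session."""
    return "om-{}-{}".format(session.site_id, session.id)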
def log_harvest_run_end(site, items_added, items_updated, items_deleted):
    """Store the end time and operation summary for a finished harvest_run"""
    message = "{} items added, {} updated, {} deleted".format(
        items_added, items_updated, items_deleted)
    ytarchive().logsInsert({
        'time': time.time(),
        'site_id': site['site_id'],
        'type': 'harvest_run',
        'severity': c.LOG_STATUS,
        'message': message,
        'state': 'harvested'
    })
def get_stored_sessions(site):
    """Get previously stored site sessions to compare against live"""
    stored_sessions = ytarchive().sessionsGet(
        id=None, params={'site_id': site['site_id']})
    if not stored_sessions:
        stored_sessions = []
    return stored_sessions
def validate():
    """Check all synced files to make sure their md5 hash matches the hash
    stored on Archive.org"""
    site_id = None
    args = get_args()
    if 'site' in args and args.site:
        site_id = args.site
    synced_session = ytarchive().sessionsGetSyncedOldest(site_id)
    if synced_session:
        session_log = ytarchive().logsGetSynced(synced_session.id)
        # only validate sessions that finished syncing over an hour ago
        validation_time = time.time() - (60 * 60)
        if session_log and session_log.time < validation_time:
            archive_info = get_item(synced_session.archive_id)
            session_files = ytarchive().filesGetSynced(synced_session.id)
            valid = validate_files(session_files, archive_info)
            if valid:
                ytarchive().sessionsUpdate({
                    'id': synced_session.id,
                    'validated': True
                })
                log(synced_session, "Session files validated",
                    c.SESSION_SYNCED)
                cleanup_files(synced_session)
            else:
                ytarchive().sessionsUpdate({
                    'id': synced_session.id,
                    'state': c.SESSION_FAILED,
                    'validated': False
                })
                log(synced_session, "Session files failed validation",
                    c.SESSION_FAILED, c.LOG_ERROR)
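# Hedged sketch of the validate_files step used above, assuming get_item
# returns an internetarchive.Item whose .files metadata list carries 'name'
# and 'md5' entries per uploaded file; deriving the remote name from the
# stored filepath is also an assumption.
import os


def validate_files(session_files, archive_info):
    """Compare stored md5 hashes against the hashes archive.org reports."""
    archive_hashes = {f['name']: f.get('md5') for f in archive_info.files}
    for session_file in session_files:
        name = os.path.basename(session_file.filepath)
        if archive_hashes.get(name) != session_file.md5:
            update_session_file_validation(session_file, False)
            return False
        update_session_file_validation(session_file, True)
    return True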
def update_session_file_status(session_file, status):
    """Update stored file status and properties based on state"""
    if status == c.FILE_PROCESSED:
        data = {
            'state': status,
            'md5': session_file.md5,
            'id': session_file.id}
    elif status == c.FILE_FETCHED:
        data = {
            'state': status,
            'filepath': session_file.filepath,
            'id': session_file.id}
    else:
        data = {
            'state': status,
            'id': session_file.id}
    ytarchive().filesUpdate(data)
def mark_stored_files_for_removal(stored_files):
    """Mark stored files that have finished processing, but are no longer
    present on the live site, as removed"""
    for stored_file in stored_files:
        current_state = stored_file.state
        # avoid deleting files that are actively being processed
        condition = ((current_state == c.FILE_NEW
                      or current_state == c.FILE_SYNCED)
                     and stored_file.validated)
        if condition:
            ytarchive().filesUpdate({
                'id': stored_file.id,
                'url': stored_file.url,
                'type': stored_file.type,
                'state': c.FILE_REMOVED
            })
            log(stored_file, "File removed", c.FILE_REMOVED)
def get(self, id=None):
    """Return a single log entry by id, or a filtered list of entries"""
    try:
        data = ytarchive().logsGet(id, request.args)
    except ValueError as e:
        abort(400, str(e))
    schema = LogSchema() if id else LogSchema(many=True)
    data = schema.dump(data)
    return response(results=data, id=id, type='log')
def last_run_time(site):
    """Get the unix timestamp from the last harvest run for a given site"""
    results = ytarchive().logsGet(id=None, params={
        'site_id': site['site_id'],
        'type': 'harvest_run',
        'state': 'harvesting',
        'sort': 'time:desc',
        'limit': 1
    })
    if results:
        return results[0].time
    # no previous run logged; fall back to the current time
    return time.time()
def process():
    """Download files and metadata for the oldest updated session"""
    site_id = None
    args = get_args()
    if 'site' in args and args.site:
        site_id = args.site
    updated_session = ytarchive().sessionsGetChangedOldest(site_id)
    if updated_session:
        ytarchive().sessionsUpdate({'id': updated_session.id,
                                    'state': c.SESSION_FETCHING})
        log(updated_session, "Files queued for download", c.SESSION_FETCHING)
        updated_session_files = ytarchive().filesGetNewChanged(
            updated_session.id)
        downloaded_session_files = download_session_files(
            updated_session, updated_session_files)
        log(updated_session, "Files downloaded locally", c.SESSION_FETCHED)
        ytarchive().sessionsUpdate({'id': updated_session.id,
                                    'state': c.SESSION_PROCESSING})
        hash_session_files(updated_session, downloaded_session_files)
        ytarchive().sessionsUpdate({'id': updated_session.id,
                                    'state': c.SESSION_PROCESSED})
        log(updated_session, "Files hashed", c.SESSION_PROCESSED)
def store_sessions(site, last_run_time, created_after):
    """Loop through all sessions that have changed since the last harvest run
    and update their status in the youtube_archive MySQL database
    """
    # previously harvested sessions
    stored_sessions = get_stored_sessions(site)
    stored_sessions = dict_by_id(stored_sessions)
    # live sessions that have been updated after the last harvest run
    sessions = get_live_sessions(site, stored_sessions, last_run_time,
                                 created_after)
    sessions = sessions_map_sessions(sessions)
    results = {'new': [], 'updated': [], 'skipped': []}
    for session in sessions:
        current_state = session_current_state(session, stored_sessions)
        next_state = session_next_state(current_state)
        message = session_state_message(next_state)
        if next_state == c.SESSION_NEW:
            item = session_map_item(site, session, stored_sessions,
                                    c.SESSION_NEW)
            ytarchive().sessionsInsert(item)
            results['new'].append(session)
            store_files(session)
        # session is new but already has archive information so we avoid it
        elif next_state == c.SESSION_UNMANAGED:
            item = session_map_item(site, session, stored_sessions,
                                    c.SESSION_UNMANAGED)
            ytarchive().sessionsInsert(item)
            results['skipped'].append(session)
        elif next_state == c.SESSION_SKIPPED:
            results['skipped'].append(session)
        # session has been previously imported, check for updates
        else:
            # only update if the session has been changed since last import
            if stored_sessions[session.id].last_updated < session.updated:
                # only update items that are not currently being processed
                if session_processing_completed(session, current_state,
                                                stored_sessions):
                    item = session_map_item(site, session, stored_sessions,
                                            next_state)
                    ytarchive().sessionsUpdate(item)
                    results['updated'].append(session)
                    store_files(session)
        log(session, message, next_state)
    return results
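# A minimal sketch of the dict_by_id helper referenced above, keying the
# stored session list by id so the state lookups inside the loop stay
# constant time; the one-liner below is inferred from its usage, not taken
# from the project source.
def dict_by_id(items):
    """Map a list of records to a dict keyed by their id attribute."""
    return {item.id: item for item in items}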
def update_session_file_validation(session_file, validated):
    """Shortcut to update the stored validated status for a file"""
    ytarchive().filesUpdate({'id': session_file.id, 'validated': validated})
def get_stored_files(session):
    """Get previously stored session files to compare against live"""
    stored_files = ytarchive().filesGet(id=None,
                                        params={'session_id': session.id})
    return stored_files
#!/usr/bin/env python3
from sqlalchemy_declarative import Session, File, Log
from db import ytarchive

session = ytarchive().db

# sample session
new_session = Session(
    id=179,
    site_id=1,
    group="Admin",
    archive_collection_id="coloradochannel",
    archive_id="om-1-179",
    title="Creating a Foundation for Change new title",
    category="City Council",
    state="synced",
    created=1489783849,
    last_updated=1531858777,
    validated=1)
session.add(new_session)
session.commit()

# sample session
new_session = Session(
    id=180,
    site_id=1,
    group="Admin",
    archive_collection_id="coloradochannel",
    archive_id="om-1-180",
    title="Another session title",
    # remaining fields assumed: placeholder values mirroring the first sample
    category="City Council",
    state="synced",
    created=1489783849,
    last_updated=1531858777,
    validated=1)
session.add(new_session)
session.commit()
def update_file_status(file, status):
    """Update stored session file status"""
    ytarchive().filesUpdate({'id': file.id, 'state': status})
def update_session_file_status(session_file, status):
    """Shortcut to update stored file state"""
    ytarchive().filesUpdate({'id': session_file.id, 'state': status})