Beispiel #1
0
def update_session_status(session, status, archive_id=False):
    """Persist a new state for a stored session.

    When a truthy ``archive_id`` is supplied it is written together with the
    state; otherwise only the state is updated.
    """
    # Keys are inserted in the order: id, archive_id (optional), state.
    payload = {'id': session.id}
    if archive_id:
        payload['archive_id'] = archive_id
    payload['state'] = status
    ytarchive().sessionsUpdate(payload)
Beispiel #2
0
def store_unharvested_file(file):
    """Record a newly discovered live file in the stored cache.

    The file is inserted with the ``c.FILE_NEW`` state and the insertion is
    logged afterwards.
    """
    new_record = {
        'id': file.id,
        'session_id': file.session_id,
        'url': file.url,
        'type': file.type,
        'state': c.FILE_NEW,
    }
    ytarchive().filesInsert(new_record)
    log(file, "New File", c.FILE_NEW)
Beispiel #3
0
def sync():
    """Sync the least recently updated processed session to archive.org.

    Picks one session in the ``c.SESSION_PROCESSED`` state (optionally limited
    to the site given on the command line), marks it as syncing, uploads its
    processed files and removes files that are no longer present on the
    session. If that removal fails the session is marked as failed.
    """
    args = get_args()

    # One query for both cases; the site filter is only added when a site
    # was passed on the command line.
    query = {'state': c.SESSION_PROCESSED}
    if 'site' in args and args.site:
        query['site_id'] = args.site
    query['sort'] = 'last_updated:asc'
    query['limit'] = 1

    session = ytarchive().sessionsGet(id=None, params=query)
    if not session:
        return

    update_session_status(session, c.SESSION_SYNCING)
    log(session, "Session queued for archive.org sync", c.SESSION_SYNCING)

    update_success = True
    delete_success = True
    archive_id = session_archive_id(session)
    archive_info = False
    processed_filter = {
        'session_id': session.id,
        'state': c.FILE_PROCESSED,
    }
    session_files = ytarchive().filesGet(id=None, params=processed_filter)

    # remove any unchanged files and mark them as finished
    if session.archive_id:
        archive_info = get_item(archive_id)
        session_files = finish_unchanged_files(archive_info, session_files)
    # NOTE: the result of the update is stored but not consulted below.
    update_success = archive_update(
        archive_id, session, session_files, archive_info)

    # remove any files no longer present on session
    # note that we never delete videos or sessions for permanent backup
    removed_session_files = ytarchive().filesGetRemoved(session.id)
    if removed_session_files:
        delete_success = archive_delete_removed_files(
            archive_id, removed_session_files, session)

    if not delete_success:
        update_session_status(session, c.SESSION_FAILED)
        log(session, "Failed to delete items from archive.org",
            c.SESSION_FAILED, c.LOG_ERROR)
Beispiel #4
0
def log_harvest_run_end(site, items_added, items_updated, items_deleted):
    """Write the closing log entry for a finished harvest run.

    The entry carries the current time, the site id and a summary message
    with the number of items added, updated and deleted.
    """
    summary = "%s items added, %s updated, %s deleted" % (
        items_added, items_updated, items_deleted)
    entry = {
        'time': time.time(),
        'site_id': site['site_id'],
        'type': 'harvest_run',
        'severity': c.LOG_STATUS,
        'message': summary,
        'state': 'harvested',
    }
    ytarchive().logsInsert(entry)
Beispiel #5
0
def get_stored_sessions(site):
    """Fetch the sessions already stored for ``site``.

    Used to compare stored sessions against the live ones. A falsy lookup
    result is replaced by an empty list.
    """
    site_filter = {'site_id': site['site_id']}
    return ytarchive().sessionsGet(id=None, params=site_filter) or []
Beispiel #6
0
def validate():
    """Check all synced files to make sure their md5 hash matches the hash
    stored on Archive.org.

    Works on the oldest synced session (optionally limited to the site given
    on the command line). The check only runs when the session's sync log
    entry is more than one hour old. On success the session is flagged as
    validated and its files are cleaned up; on failure the session is moved
    to ``c.SESSION_FAILED`` and an error is logged.
    """
    site_id = None
    args = get_args()
    if 'site' in args and args.site:
        site_id = args.site

    synced_session = ytarchive().sessionsGetSyncedOldest(site_id)
    if not synced_session:
        return

    session_log = ytarchive().logsGetSynced(synced_session.id)
    # Only sessions whose sync log entry is older than one hour are checked.
    validation_time = time.time() - (60 * 60)
    if not (session_log and session_log.time < validation_time):
        return

    archive_info = get_item(synced_session.archive_id)
    session_files = ytarchive().filesGetSynced(synced_session.id)
    valid = validate_files(session_files, archive_info)

    if valid:
        ytarchive().sessionsUpdate({
            'id': synced_session.id,
            'validated': True
        })
        log(synced_session, "Session files validated", c.SESSION_SYNCED)
        cleanup_files(synced_session)
    else:
        ytarchive().sessionsUpdate({
            'id': synced_session.id,
            'state': c.SESSION_FAILED,
            'validated': False
        })
        log(synced_session, "Session files failed validation",
            c.SESSION_FAILED, c.LOG_ERROR)
0
def update_session_file_status(session_file, status):
    """Store a new state for a file, plus the property tied to that state.

    ``c.FILE_PROCESSED`` also stores the file's md5, ``c.FILE_FETCHED`` also
    stores its filepath; any other state stores the state alone.
    """
    # Keys are inserted in the order: state, extra property (optional), id.
    data = {'state': status}
    if status == c.FILE_PROCESSED:
        data['md5'] = session_file.md5
    elif status == c.FILE_FETCHED:
        data['filepath'] = session_file.filepath
    data['id'] = session_file.id

    ytarchive().filesUpdate(data)
Beispiel #8
0
def mark_stored_files_for_removal(stored_files):
    """Flag stored files that are no longer on the live site as removed.

    Only validated files whose state is ``c.FILE_NEW`` or ``c.FILE_SYNCED``
    are flagged; each flagged file is logged.
    """
    for stored_file in stored_files:
        # avoid deleting files that are actively being processed
        settled = stored_file.state in (c.FILE_NEW, c.FILE_SYNCED)
        if not (settled and stored_file.validated):
            continue

        removal = {
            'id': stored_file.id,
            'url': stored_file.url,
            'type': stored_file.type,
            'state': c.FILE_REMOVED,
        }
        ytarchive().filesUpdate(removal)
        log(stored_file, "File removed", c.FILE_REMOVED)
Beispiel #9
0
    def get(self, id=None):
        """Return one log entry when ``id`` is given, otherwise a list of them.

        The request's query arguments are passed to the lookup. A
        ``ValueError`` raised by the lookup is handed to ``abort`` with
        status 400.
        """
        try:
            records = ytarchive().logsGet(id, request.args)
        except ValueError as err:
            abort(400, str(err))

        # A single record and a collection need different schema setups.
        if id:
            schema = LogSchema()
        else:
            schema = LogSchema(many=True)

        return response(results=schema.dump(records), id=id, type='log')
Beispiel #10
0
def last_run_time(site):
    """Return the unix timestamp of the last harvest run for ``site``.

    Reads the newest 'harvest_run' log entry in the 'harvesting' state; when
    there is none, the current time is returned instead.
    """
    newest_run_query = {
        'site_id': site['site_id'],
        'type': 'harvest_run',
        'state': 'harvesting',
        'sort': 'time:desc',
        'limit': 1,
    }
    latest = ytarchive().logsGet(id=None, params=newest_run_query)
    if not latest:
        return time.time()
    return latest[0].time
Beispiel #11
0
def process():
    """Download files and metadata for the oldest updated session.

    Works on the oldest changed session (optionally limited to the site given
    on the command line): its new/changed files are downloaded and hashed,
    and the session state is moved through ``c.SESSION_FETCHING``,
    ``c.SESSION_PROCESSING`` and ``c.SESSION_PROCESSED`` along the way.
    """
    site_id = None
    args = get_args()
    if 'site' in args and args.site:
        site_id = args.site

    updated_session = ytarchive().sessionsGetChangedOldest(site_id)

    if updated_session:
        ytarchive().sessionsUpdate({
            'id': updated_session.id,
            'state': c.SESSION_FETCHING
        })
        log(updated_session, "Files queued for download", c.SESSION_FETCHING)
        updated_session_files = ytarchive().filesGetNewChanged(
            updated_session.id)
        downloaded_session_files = download_session_files(
            updated_session, updated_session_files)
        # The fetched state is only logged here, not written to the session.
        log(updated_session, "Files downloaded locally", c.SESSION_FETCHED)
        ytarchive().sessionsUpdate({
            'id': updated_session.id,
            'state': c.SESSION_PROCESSING
        })
        hash_session_files(updated_session, downloaded_session_files)
        ytarchive().sessionsUpdate({
            'id': updated_session.id,
            'state': c.SESSION_PROCESSED
        })
        log(updated_session, "Files hashed", c.SESSION_PROCESSED)
Beispiel #12
0
def store_sessions(site, last_run_time, created_after):
    """Store the state of every session changed since the last harvest run.

    Live sessions are compared against the previously stored ones and are
    inserted, updated or skipped in the youtube_archive database accordingly.
    Returns a dict with the sessions grouped under 'new', 'updated' and
    'skipped'.
    """
    # previously harvested sessions, keyed by id
    stored_sessions = dict_by_id(get_stored_sessions(site))

    # live sessions that have been updated after the last harvest run
    live_sessions = sessions_map_sessions(
        get_live_sessions(site, stored_sessions, last_run_time,
                          created_after))
    results = {'new': [], 'updated': [], 'skipped': []}

    for session in live_sessions:
        current_state = session_current_state(session, stored_sessions)
        next_state = session_next_state(current_state)
        message = session_state_message(next_state)

        if next_state == c.SESSION_NEW:
            ytarchive().sessionsInsert(
                session_map_item(site, session, stored_sessions,
                                 c.SESSION_NEW))
            results['new'].append(session)
            store_files(session)
        elif next_state == c.SESSION_UNMANAGED:
            # session is new but already has archive information, so it is
            # stored as unmanaged and otherwise left alone
            ytarchive().sessionsInsert(
                session_map_item(site, session, stored_sessions,
                                 c.SESSION_UNMANAGED))
            results['skipped'].append(session)
        elif next_state == c.SESSION_SKIPPED:
            results['skipped'].append(session)
        elif (stored_sessions[session.id].last_updated < session.updated
              and session_processing_completed(session, current_state,
                                               stored_sessions)):
            # previously imported session: update only if it changed since
            # the last import and is not currently being processed
            ytarchive().sessionsUpdate(
                session_map_item(site, session, stored_sessions, next_state))
            results['updated'].append(session)
            store_files(session)
        log(session, message, next_state)
    return results
Beispiel #13
0
def update_session_file_validation(session_file, validated):
    """Store the given ``validated`` flag for a stored file."""
    change = {'id': session_file.id, 'validated': validated}
    ytarchive().filesUpdate(change)
Beispiel #14
0
def get_stored_files(session):
    """Fetch the files already stored for ``session``.

    Used to compare stored files against the live ones. The lookup result is
    returned unchanged.
    """
    session_filter = {'session_id': session.id}
    return ytarchive().filesGet(id=None, params=session_filter)
Beispiel #15
0
#!/usr/bin/env python3

from sqlalchemy_declarative import Session, File, Log
from db import ytarchive

# Database handle exposed by ytarchive(); records are staged with add() and
# written with commit() below (presumably a SQLAlchemy session -- confirm
# against db.ytarchive).
session = ytarchive().db

# sample session: a record that is already in the "synced" state and
# flagged as validated
new_session = Session(
    id=179,
    site_id=1,
    group="Admin",
    archive_collection_id="coloradochannel",
    archive_id="om-1-179",
    title="Creating a Foundation for Change new title",
    category="City Council",
    state="synced",
    created=1489783849,
    last_updated=1531858777,
    validated=1)
# Stage the record and write it immediately.
session.add(new_session)
session.commit()

# sample session
new_session = Session(
    id=180,
    site_id=1,
    group="Admin",
    archive_collection_id="coloradochannel",
    archive_id="om-1-180",
    title="Another session title",
Beispiel #16
0
def update_file_status(file, status):
    """Store a new state for a stored session file."""
    change = {'id': file.id, 'state': status}
    ytarchive().filesUpdate(change)
Beispiel #17
0
def update_session_file_status(session_file, status):
    """Store a new state for a stored file; nothing else is updated."""
    change = {'id': session_file.id, 'state': status}
    ytarchive().filesUpdate(change)