# Standard-library, Django, and DRF imports used by the functions below; the
# Kolibri-internal helpers (get_mounted_drive_by_id, paths, annotation,
# channel_import, LocalFile, etc.) are assumed to be imported from their
# respective kolibri modules.
import logging
import os

import requests
from django.core.cache import cache
from django.core.management import call_command
from rest_framework import serializers
from rest_framework.response import Response

logger = logging.getLogger(__name__)


def get_available_checksums_from_disk(channel_id, drive_id):
    try:
        basepath = get_mounted_drive_by_id(drive_id).datafolder
    except KeyError:
        raise LocationError("Drive with id {} does not exist".format(drive_id))
    PER_DISK_CACHE_KEY = "DISK_AVAILABLE_CHECKSUMS_{basepath}".format(
        basepath=basepath
    )
    PER_DISK_PER_CHANNEL_CACHE_KEY = "DISK_AVAILABLE_CHECKSUMS_{basepath}_{channel_id}".format(
        basepath=basepath, channel_id=channel_id
    )
    if PER_DISK_PER_CHANNEL_CACHE_KEY not in cache:
        if PER_DISK_CACHE_KEY not in cache:
            content_dir = get_content_storage_dir_path(datafolder=basepath)
            disk_checksums = []
            for _, _, files in os.walk(content_dir):
                for name in files:
                    checksum = os.path.splitext(name)[0]
                    # Only add valid checksums formatted according to our
                    # standard filename convention
                    if checksum_regex.match(checksum):
                        disk_checksums.append(checksum)
            # Cache is per device, so a relatively long lived one should
            # be fine.
            cache.set(PER_DISK_CACHE_KEY, disk_checksums, 3600)
        else:
            disk_checksums = cache.get(PER_DISK_CACHE_KEY)
        checksums = set(
            LocalFile.objects.filter(
                files__contentnode__channel_id=channel_id
            ).values_list("id", flat=True)
        ).intersection(set(disk_checksums))
        cache.set(PER_DISK_PER_CHANNEL_CACHE_KEY, checksums, 3600)
    else:
        checksums = cache.get(PER_DISK_PER_CHANNEL_CACHE_KEY)
    return checksums
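# Illustrative sketch (not from the original source): how the two-level cache
# above behaves across calls. The first call walks the drive's content
# directory and warms both cache keys; repeat calls within the hour are served
# straight from PER_DISK_PER_CHANNEL_CACHE_KEY. The ids used here are
# hypothetical placeholders.
def _example_checksum_lookup():
    channel_id = "95a52b386f2c485cb2f1da9cfb13d56f"  # hypothetical channel id
    drive_id = "local-drive-1"  # hypothetical drive id
    cold = get_available_checksums_from_disk(channel_id, drive_id)  # walks disk
    warm = get_available_checksums_from_disk(channel_id, drive_id)  # cache hit
    assert cold == warm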
def channeldiffstats(self, request):
    job_metadata = {}
    channel_id = request.data.get("channel_id")
    method = request.data.get("method")
    drive_id = request.data.get("drive_id")
    baseurl = request.data.get("baseurl")

    # request validation and job metadata info
    if not channel_id:
        raise serializers.ValidationError("The channel_id field is required.")
    if not method:
        raise serializers.ValidationError("The method field is required.")
    if method == "network":
        baseurl = baseurl or conf.OPTIONS["Urls"]["CENTRAL_CONTENT_BASE_URL"]
        job_metadata["baseurl"] = baseurl
        # get channel version metadata
        url = get_channel_lookup_url(baseurl=baseurl, identifier=channel_id)
        resp = requests.get(url)
        channel_metadata = resp.json()
        job_metadata["new_channel_version"] = channel_metadata[0]["version"]
    elif method == "disk":
        if not drive_id:
            raise serializers.ValidationError(
                "The drive_id field is required when using the 'disk' method."
            )
        job_metadata = _add_drive_info(job_metadata, request.data)
        # get channel version metadata
        drive = get_mounted_drive_by_id(drive_id)
        channel_metadata = read_channel_metadata_from_db_file(
            get_content_database_file_path(channel_id, drive.datafolder)
        )
        job_metadata["new_channel_version"] = channel_metadata.version
    else:
        raise serializers.ValidationError(
            "The 'method' field should be either 'network' or 'disk'."
        )

    job_metadata.update(
        {
            "type": "CHANNELDIFFSTATS",
            "started_by": request.user.pk,
            "channel_id": channel_id,
        }
    )
    job_id = priority_queue.enqueue(
        diff_stats,
        channel_id,
        method,
        drive_id=drive_id,
        baseurl=baseurl,
        extra_metadata=job_metadata,
        track_progress=False,
        cancellable=True,
    )
    resp = _job_to_response(priority_queue.fetch_job(job_id))
    return Response(resp)
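# Illustrative request payloads (a sketch, not from the original source):
# these dicts mirror the fields that channeldiffstats above reads and
# validates. All values are hypothetical.
NETWORK_DIFF_REQUEST = {
    "channel_id": "95a52b386f2c485cb2f1da9cfb13d56f",  # hypothetical
    "method": "network",
    # "baseurl" is optional; CENTRAL_CONTENT_BASE_URL is used when omitted
}
DISK_DIFF_REQUEST = {
    "channel_id": "95a52b386f2c485cb2f1da9cfb13d56f",  # hypothetical
    "method": "disk",
    "drive_id": "local-drive-1",  # required whenever method is "disk"
}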
def checksums_from_drive_id(self, drive_id, instance):
    # Validate the drive id up front so the caller gets a serializer error
    # rather than a LocationError from the lookup helper.
    try:
        get_mounted_drive_by_id(drive_id)
    except KeyError:
        raise serializers.ValidationError(
            "The external drive with given drive id {} does not exist.".format(
                drive_id
            )
        )
    # get_available_checksums_from_disk expects the drive id, not the drive's
    # datafolder; it resolves the mount point itself.
    return get_available_checksums_from_disk(instance.channel_id, drive_id)
def startdiskcontentimport(self, request):
    try:
        channel_id = request.data["channel_id"]
    except KeyError:
        raise serializers.ValidationError("The channel_id field is required.")
    try:
        drive_id = request.data["drive_id"]
    except KeyError:
        raise serializers.ValidationError("The drive_id field is required.")
    try:
        drive = get_mounted_drive_by_id(drive_id)
    except KeyError:
        raise serializers.ValidationError(
            "That drive_id was not found in the list of drives."
        )

    # optional arguments
    node_ids = request.data.get("node_ids", None)
    exclude_node_ids = request.data.get("exclude_node_ids", None)
    if node_ids and not isinstance(node_ids, list):
        raise serializers.ValidationError("node_ids must be a list.")
    if exclude_node_ids and not isinstance(exclude_node_ids, list):
        raise serializers.ValidationError("exclude_node_ids must be a list.")

    job_metadata = {"type": "DISKCONTENTIMPORT", "started_by": request.user.pk}
    job_id = get_queue().enqueue(
        call_command,
        "importcontent",
        "disk",
        channel_id,
        drive.datafolder,
        node_ids=node_ids,
        exclude_node_ids=exclude_node_ids,
        extra_metadata=job_metadata,
        track_progress=True,
        cancellable=True,
    )
    resp = _job_to_response(get_queue().fetch_job(job_id))
    return Response(resp)
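# Illustrative payload (a sketch, not from the original source) for the
# startdiskcontentimport endpoint above. node_ids and exclude_node_ids are
# optional, but must be lists when provided; all values are hypothetical.
DISK_CONTENT_IMPORT_REQUEST = {
    "channel_id": "95a52b386f2c485cb2f1da9cfb13d56f",  # hypothetical
    "drive_id": "local-drive-1",  # hypothetical
    "node_ids": ["2a5071d2f24b4501b11ad5d132895a28"],  # optional subset to import
    "exclude_node_ids": None,  # optional subset to skip
}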
def _add_drive_info(import_task, task_description):
    try:
        drive_id = task_description["drive_id"]
    except KeyError:
        raise serializers.ValidationError("The drive_id field is required.")
    try:
        drive = get_mounted_drive_by_id(drive_id)
    except KeyError:
        raise serializers.ValidationError(
            "That drive_id was not found in the list of drives."
        )
    import_task.update({"drive_id": drive_id, "datafolder": drive.datafolder})
    return import_task
def startdiskchannelimport(self, request):
    # Load the required parameters
    try:
        channel_id = request.data["channel_id"]
    except KeyError:
        raise serializers.ValidationError("The channel_id field is required.")
    try:
        drive_id = request.data["drive_id"]
    except KeyError:
        raise serializers.ValidationError("The drive_id field is required.")
    try:
        drive = get_mounted_drive_by_id(drive_id)
    except KeyError:
        raise serializers.ValidationError(
            "That drive_id was not found in the list of drives."
        )

    job_metadata = {"type": "DISKCHANNELIMPORT", "started_by": request.user.pk}
    job_id = get_queue().enqueue(
        call_command,
        "importchannel",
        "disk",
        channel_id,
        drive.datafolder,
        extra_metadata=job_metadata,
        cancellable=True,
    )
    resp = _job_to_response(get_queue().fetch_job(job_id))
    return Response(resp)
def _localexport(
    channel_id,
    drive_id,
    update_progress=None,
    check_for_cancel=None,
    node_ids=None,
    exclude_node_ids=None,
    extra_metadata=None,
):
    drive = get_mounted_drive_by_id(drive_id)
    call_command(
        "exportchannel",
        channel_id,
        drive.datafolder,
        update_progress=update_progress,
        check_for_cancel=check_for_cancel,
    )
    try:
        call_command(
            "exportcontent",
            channel_id,
            drive.datafolder,
            node_ids=node_ids,
            exclude_node_ids=exclude_node_ids,
            update_progress=update_progress,
            check_for_cancel=check_for_cancel,
        )
    except UserCancelledError:
        # If the user cancelled mid-export, remove the partially written
        # channel database from the drive before re-raising.
        try:
            os.remove(
                get_content_database_file_path(
                    channel_id, datafolder=drive.datafolder
                )
            )
        except OSError:
            pass
        raise
def diff_stats(channel_id, method, drive_id=None, baseurl=None):
    """
    Download the channel database to an upgraded path.
    Annotate the local file availability of the upgraded channel db.
    Calculate diff stats comparing the default db and the annotated channel db.
    """
    # upgraded content database path
    source_path = paths.get_upgrade_content_database_file_path(channel_id)
    # annotated db to be used for calculating diff stats
    destination_path = paths.get_annotated_content_database_file_path(channel_id)
    try:
        if method == "network":
            call_command(
                "importchannel", "network", channel_id, baseurl=baseurl, no_upgrade=True
            )
        elif method == "disk":
            drive = get_mounted_drive_by_id(drive_id)
            call_command(
                "importchannel", "disk", channel_id, drive.datafolder, no_upgrade=True
            )

        # create all fields/tables at the annotated destination db, based on
        # the current schema version
        bridge = Bridge(
            sqlite_file_path=destination_path, schema_version=CURRENT_SCHEMA_VERSION
        )
        bridge.Base.metadata.create_all(bridge.engine)

        # initialize the import manager based on the annotated destination
        # path, pulling from the source db path
        import_manager = channel_import.initialize_import_manager(
            channel_id,
            cancel_check=False,
            source=source_path,
            destination=destination_path,
        )

        # import channel data from the source db path
        import_manager.import_channel_data()
        import_manager.end()

        # annotate file availability on the destination db
        annotation.set_local_file_availability_from_disk(destination=destination_path)

        # get the count of resources in the annotated db but not in the
        # default db
        new_resources_count = count_new_resources_available_for_import(
            destination_path, channel_id
        )
        # get the count of leaf nodes which are in the default db, but not in
        # the annotated db
        resources_to_be_deleted_count = count_removed_resources(
            destination_path, channel_id
        )
        # get the ids of leaf nodes which are now incomplete due to missing
        # local files
        updated_resources_ids = automatically_updated_resource_ids(
            destination_path, channel_id
        )

        # remove the annotated database
        try:
            os.remove(destination_path)
        except OSError as e:
            logger.info(
                "Tried to remove {}, but exception {} occurred.".format(
                    destination_path, e
                )
            )

        # annotate job metadata with the diff stats
        job = get_current_job()
        if job:
            job.extra_metadata["new_resources_count"] = new_resources_count
            job.extra_metadata["deleted_resources_count"] = resources_to_be_deleted_count
            job.extra_metadata["updated_node_ids"] = updated_resources_ids
            job.save_meta()
    except UserCancelledError:
        # remove the annotated database before re-raising
        try:
            os.remove(destination_path)
        except OSError:
            pass
        raise
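# Illustrative direct invocation (a sketch, not from the original source).
# channeldiffstats above normally enqueues diff_stats on the priority queue,
# in which case the counts land in job.extra_metadata; called directly, with
# the hypothetical ids below, it runs the same import/annotate/count/cleanup
# pipeline and simply skips the job bookkeeping (get_current_job() is falsy).
def _example_diff_stats_call():
    diff_stats(
        "95a52b386f2c485cb2f1da9cfb13d56f",  # hypothetical channel id
        "disk",
        drive_id="local-drive-1",  # hypothetical drive id
    )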