def update_job_metadata(self, total_bytes_to_transfer, total_resource_count):
    job = get_current_job()
    if job:
        job.extra_metadata["file_size"] = total_bytes_to_transfer
        job.extra_metadata["total_resources"] = total_resource_count
        job.save_meta()

def _ping(started, server, checkrate):
    try:
        ping_once(started, server=server)
        connection.close()
        return
    except ConnectionError:
        logger.warn(
            "Ping failed (could not connect). Trying again in {} minutes.".format(
                checkrate
            )
        )
    except Timeout:
        logger.warn(
            "Ping failed (connection timed out). Trying again in {} minutes.".format(
                checkrate
            )
        )
    except RequestException as e:
        logger.warn(
            "Ping failed ({})! Trying again in {} minutes.".format(e, checkrate)
        )
    connection.close()
    job = get_current_job()
    if job and job in scheduler:
        scheduler.change_execution_time(
            job, datetime.datetime.now() + datetime.timedelta(seconds=checkrate * 60)
        )

def handle_async(self, *args, **options):
    channel_id = options["channel_id"]
    node_ids = options["node_ids"]
    exclude_node_ids = options["exclude_node_ids"]
    force_delete = options["force_delete"]

    try:
        channel = ChannelMetadata.objects.get(pk=channel_id)
    except ChannelMetadata.DoesNotExist:
        raise CommandError(
            "Channel matching id {id} does not exist".format(id=channel_id)
        )

    delete_all_metadata = delete_metadata(
        channel, node_ids, exclude_node_ids, force_delete
    )

    unused_files = LocalFile.objects.get_unused_files()
    # Get orphan files that are being deleted
    total_file_deletion_operations = unused_files.count()
    job = get_current_job()
    if job:
        total_file_deletion_size = unused_files.aggregate(Sum("file_size")).get(
            "file_size__sum", 0
        )
        job.extra_metadata["file_size"] = total_file_deletion_size
        job.extra_metadata["total_resources"] = total_file_deletion_operations
        job.save_meta()

    progress_extra_data = {"channel_id": channel_id}

    # One extra progress tick for the orphan-file cleanup, plus one more if the
    # channel database file is also being removed.
    additional_progress = sum((1, bool(delete_all_metadata)))

    with self.start_progress(
        total=total_file_deletion_operations + additional_progress
    ) as progress_update:

        for file in LocalFile.objects.delete_unused_files():
            progress_update(1, progress_extra_data)

        with db_task_write_lock:
            LocalFile.objects.delete_orphan_file_objects()

        progress_update(1, progress_extra_data)

        if delete_all_metadata:
            try:
                os.remove(get_content_database_file_path(channel_id))
            except OSError:
                pass

            progress_update(1, progress_extra_data)

def handle_async(self, *args, **options):
    # set language for the translation of the messages
    locale = settings.LANGUAGE_CODE if not options["locale"] else options["locale"]
    translation.activate(locale)

    self.overall_error = ""
    job = get_current_job()

    facility = self.get_facility(options)
    if not facility:
        self.overall_error = str(MESSAGES[NO_FACILITY])

    else:
        log_type = options["log_type"]

        log_info = classes_info[log_type]

        if options["output_file"] is None:
            filename = log_info["filename"].format(facility.name, facility.id[:4])
        else:
            filename = options["output_file"]

        filepath = os.path.join(os.getcwd(), filename)

        queryset = log_info["queryset"]

        total_rows = queryset.count()

        with self.start_progress(total=total_rows) as progress_update:
            try:
                for row in csv_file_generator(
                    facility, log_type, filepath, overwrite=options["overwrite"]
                ):
                    progress_update(1)
            except (ValueError, IOError) as e:
                self.overall_error = str(MESSAGES[FILE_WRITE_ERROR].format(e))

    if job:
        job.extra_metadata["overall_error"] = self.overall_error
        job.extra_metadata["filename"] = ntpath.basename(filepath)
        job.save_meta()

    else:
        if self.overall_error:
            raise CommandError(self.overall_error)
        else:
            logger.info(
                "Created csv file {} with {} lines".format(filepath, total_rows)
            )

    translation.deactivate()

def _diskimport(
    channel_id,
    directory,
    drive_id=None,
    update_progress=None,
    check_for_cancel=None,
    node_ids=None,
    is_updating=False,
    exclude_node_ids=None,
    extra_metadata=None,
):

    call_command(
        "importchannel",
        "disk",
        channel_id,
        directory,
        update_progress=update_progress,
        check_for_cancel=check_for_cancel,
    )

    # Make some real-time updates to the metadata
    job = get_current_job()

    if job:
        # Signal to UI that the DB-downloading step is done so it knows to display
        # progress correctly
        job.update_progress(0, 1.0)
        job.extra_metadata["database_ready"] = True

        # Add the channel name if it wasn't added initially
        if job.extra_metadata.get("channel_name", "") == "":
            job.extra_metadata["channel_name"] = get_channel_name(channel_id)

        job.save_meta()

    # Skip importcontent step if updating and no nodes have changed
    if is_updating and (node_ids is not None) and len(node_ids) == 0:
        pass
    else:
        call_command(
            "importcontent",
            "disk",
            channel_id,
            directory,
            drive_id=drive_id,
            node_ids=node_ids,
            exclude_node_ids=exclude_node_ids,
            update_progress=update_progress,
            check_for_cancel=check_for_cancel,
        )

def cancelable_job():
    """
    Test function for checking if a job is cancelable. Meant to be used in a job
    cancel test case.

    It sleeps for half a second and then checks for cancellation, 10 times in
    total. The check is done through the current job's check_for_cancel()
    method, which the BBQ framework provides when a job is set to be
    cancellable; calling it makes the thread check whether a cancellation has
    been requested, and exit early if so.

    :return: None
    """
    job = get_current_job()
    for _ in range(10):
        time.sleep(0.5)
        if job.check_for_cancel():
            return

def _remoteimport(
    channel_id,
    baseurl,
    peer_id=None,
    update_progress=None,
    check_for_cancel=None,
    node_ids=None,
    is_updating=False,
    exclude_node_ids=None,
    extra_metadata=None,
):

    call_command(
        "importchannel",
        "network",
        channel_id,
        baseurl=baseurl,
        update_progress=update_progress,
        check_for_cancel=check_for_cancel,
    )

    # Make some real-time updates to the metadata
    job = get_current_job()

    if job:
        # Signal to UI that the DB-downloading step is done so it knows to display
        # progress correctly
        job.update_progress(0, 1.0)
        job.extra_metadata["database_ready"] = True

        # Add the channel name if it wasn't added initially
        if job.extra_metadata.get("channel_name", "") == "":
            job.extra_metadata["channel_name"] = get_channel_name(channel_id)

        job.save_meta()

    call_command(
        "importcontent",
        "network",
        channel_id,
        baseurl=baseurl,
        peer_id=peer_id,
        node_ids=node_ids,
        exclude_node_ids=exclude_node_ids,
        import_updates=is_updating,
        update_progress=update_progress,
        check_for_cancel=check_for_cancel,
    )

def update_progress_cancelable_job():
    """
    Test function for checking if a job is cancelable when it updates progress.
    Meant to be used in a job cancel with progress update test case.

    It sleeps for half a second, updates the job's progress, and then checks
    for cancellation, 10 times in total. The check is done through the current
    job's check_for_cancel() method, which the iceqube framework provides when
    a job is set to be cancellable; calling it makes the thread check whether a
    cancellation has been requested, and exit early if so.

    :return: None
    """
    job = get_current_job()
    for i in range(10):
        time.sleep(0.5)
        job.update_progress(i, 9)
        if job.check_for_cancel():
            return

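# Framework-agnostic sketch of the cooperative-cancellation pattern the two test
# functions above exercise: do work in small slices and check for a cancellation
# signal between slices. This is an illustration only; threading.Event stands in
# for the job's check_for_cancel() hook, and none of these names belong to the
# task framework's real API.
import threading
import time


def cancelable_loop(cancel_event, iterations=10, interval=0.5):
    """Sleep in short slices, exiting early if cancellation is requested."""
    for i in range(iterations):
        time.sleep(interval)
        if cancel_event.is_set():
            return i  # how many slices completed before cancellation
    return iterations


# Usage: run the loop in a worker thread and request cancellation from outside.
cancel_event = threading.Event()
worker = threading.Thread(target=cancelable_loop, args=(cancel_event,))
worker.start()
cancel_event.set()
worker.join()
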
def _diskimport(
    channel_id,
    directory,
    drive_id=None,
    update_progress=None,
    check_for_cancel=None,
    node_ids=None,
    is_updating=False,
    exclude_node_ids=None,
    extra_metadata=None,
):

    call_command(
        "importchannel",
        "disk",
        channel_id,
        directory,
        update_progress=update_progress,
        check_for_cancel=check_for_cancel,
    )

    # Add the channel name if it wasn't added initially
    job = get_current_job()
    if job and job.extra_metadata.get("channel_name", "") == "":
        job.extra_metadata["channel_name"] = get_channel_name(channel_id)
        job.save_meta()

    # Skip importcontent step if updating and no nodes have changed
    if is_updating and (node_ids is not None) and len(node_ids) == 0:
        pass
    else:
        call_command(
            "importcontent",
            "disk",
            channel_id,
            directory,
            drive_id=drive_id,
            node_ids=node_ids,
            exclude_node_ids=exclude_node_ids,
            update_progress=update_progress,
            check_for_cancel=check_for_cancel,
        )

def handle_async(self, *args, **options):
    # set language for the translation of the messages
    locale = settings.LANGUAGE_CODE if not options["locale"] else options["locale"]
    translation.activate(locale)
    translate_labels()

    self.overall_error = []
    filepath = self.get_filepath(options)
    facility = self.get_facility(options)
    job = get_current_job()
    total_rows = FacilityUser.objects.filter(facility=facility).count()

    with self.start_progress(total=total_rows) as progress_update:
        try:
            for row in csv_file_generator(
                facility,
                filepath,
                overwrite=options["overwrite"],
            ):
                progress_update(1)
        except (ValueError, IOError) as e:
            self.overall_error.append(MESSAGES[FILE_WRITE_ERROR].format(e))
            raise CommandError(self.overall_error[-1])

        # freeze error messages translations:
        self.overall_error = [str(msg) for msg in self.overall_error]

        if job:
            job.extra_metadata["overall_error"] = self.overall_error
            job.extra_metadata["users"] = total_rows
            job.extra_metadata["filename"] = ntpath.basename(filepath)
            job.save_meta()
        else:
            logger.info(
                "Created csv file {} with {} lines".format(filepath, total_rows)
            )

    translation.deactivate()

def handle_async(self, *args, **options):
    # initialize stats data structures:
    self.overall_error = []
    db_new_classes = []
    db_update_classes = []
    classes_to_clear = []
    db_new_users = []
    db_update_users = []
    users_to_delete = []
    per_line_errors = []

    # set language for the translation of the messages
    locale = settings.LANGUAGE_CODE if not options["locale"] else options["locale"]
    translation.activate(locale)

    self.job = get_current_job()
    filepath = options["filepath"]
    self.default_facility = self.get_facility(options)
    self.number_lines = self.get_number_lines(filepath)
    self.exit_if_error()

    with self.start_progress(total=100) as self.progress_update:
        # validate csv headers:
        has_header = self.csv_headers_validation(filepath)
        if not has_header:
            self.overall_error.append(MESSAGES[INVALID_HEADER])
        self.exit_if_error()
        self.progress_update(1)  # state=csv_headers
        try:
            with open(filepath) as f:
                reader = csv.DictReader(f, strict=True)
                per_line_errors, classes, users, roles = self.csv_values_validation(
                    reader, self.header_translation
                )
        except (ValueError, FileNotFoundError, csv.Error) as e:
            self.overall_error.append(MESSAGES[FILE_READ_ERROR].format(e))
            self.exit_if_error()

        (
            db_new_users,
            db_update_users,
            keeping_users,
            more_line_errors,
        ) = self.build_users_objects(users)
        per_line_errors += more_line_errors
        (
            db_new_classes,
            db_update_classes,
            fixed_classes,
        ) = self.build_classes_objects(classes)
        classes = fixed_classes
        users_to_delete, classes_to_clear = self.get_delete(
            options, keeping_users, db_update_classes
        )
        per_line_errors += self.db_validate_list(db_new_users, users=True)
        per_line_errors += self.db_validate_list(db_update_users, users=True)
        # progress = 91%
        per_line_errors += self.db_validate_list(db_new_classes)
        per_line_errors += self.db_validate_list(db_update_classes)

        if not options["dryrun"]:
            self.delete_users(users_to_delete)
            # clear users from classes not included in the csv:
            Membership.objects.filter(collection__in=classes_to_clear).delete()

            # bulk_create and bulk_update are not possible with current Morango:
            db_users = db_new_users + db_update_users
            for user in db_users:
                user.save()
            # assign roles to users:
            users_data = {u.username: u for u in db_users}
            self.add_roles(users_data, roles)

            db_created_classes = []
            for classroom in db_new_classes:
                created_class = Classroom.objects.create(
                    name=classroom.name, parent=classroom.parent
                )
                db_created_classes.append(created_class)
            # hack to get ids created by Morango:
            db_new_classes = db_created_classes

            self.add_classes_memberships(
                classes, users_data, db_new_classes + db_update_classes
            )
            self.remove_memberships(keeping_users, classes[0], classes[1])

        classes_report = {
            "created": len(db_new_classes),
            "updated": len(db_update_classes),
            "cleared": len(classes_to_clear),
        }
        users_report = {
            "created": len(db_new_users),
            "updated": len(db_update_users),
            "deleted": len(users_to_delete),
        }

        self.output_messages(
            per_line_errors,
            classes_report,
            users_report,
            filepath,
            options["errorlines"],
        )

    translation.deactivate()

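# Minimal sketch of the strict CSV read performed during the validation step in
# handle_async above. The file name and the error handling shown here are
# illustrative; the real command opens the user-supplied "filepath" and feeds
# the reader into csv_values_validation() with translated headers.
import csv

try:
    with open("users.csv") as f:
        reader = csv.DictReader(f, strict=True)
        for row in reader:
            # Each row is a dict keyed by the header names; strict=True makes the
            # underlying reader raise csv.Error on malformed input (for example,
            # bad quoting) instead of silently accepting it.
            print(row)
except (ValueError, FileNotFoundError, csv.Error) as e:
    print("Could not read the CSV file: {}".format(e))
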
def handle(self, *args, **options):
    self.job = get_current_job()
    return self.handle_async(*args, **options)

def _transfer(  # noqa: max-complexity=16
    self,
    method,
    channel_id,
    path=None,
    drive_id=None,
    node_ids=None,
    exclude_node_ids=None,
    baseurl=None,
    peer_id=None,
    renderable_only=True,
    import_updates=False,
):
    try:
        if not import_updates:
            nodes_for_transfer = get_nodes_to_transfer(
                channel_id,
                node_ids,
                exclude_node_ids,
                False,
                renderable_only=renderable_only,
                drive_id=drive_id,
                peer_id=peer_id,
            )
            total_resource_count = (
                nodes_for_transfer.exclude(kind=content_kinds.TOPIC)
                .values("content_id")
                .distinct()
                .count()
            )
            (
                files_to_download,
                total_bytes_to_transfer,
            ) = calculate_files_to_transfer(nodes_for_transfer, False)
        else:
            (
                total_resource_count,
                files_to_download,
                total_bytes_to_transfer,
            ) = get_import_data_for_update(
                channel_id,
                renderable_only=renderable_only,
                drive_id=drive_id,
                peer_id=peer_id,
            )
    except LocationError:
        if drive_id:
            raise CommandError(
                "The external drive with given drive id {} does not exist.".format(
                    drive_id
                )
            )
        if peer_id:
            raise CommandError(
                "The network location with the id {} does not exist".format(peer_id)
            )
    except ValueError:
        if import_updates:
            raise CommandError(
                "Tried to perform an channel update import when update data was not available"
            )
        raise

    job = get_current_job()

    if job:
        job.extra_metadata["file_size"] = total_bytes_to_transfer
        job.extra_metadata["total_resources"] = total_resource_count
        job.save_meta()

    number_of_skipped_files = 0
    transferred_file_size = 0
    file_checksums_to_annotate = []
    public = None

    # If we're downloading, check listing status
    if method == DOWNLOAD_METHOD:
        public = lookup_channel_listing_status(channel_id=channel_id, baseurl=baseurl)

    resources_before_transfer = (
        ContentNode.objects.filter(channel_id=channel_id, available=True)
        .exclude(kind=content_kinds.TOPIC)
        .values("content_id")
        .distinct()
        .count()
    )

    dummy_bytes_for_annotation = annotation.calculate_dummy_progress_for_annotation(
        node_ids, exclude_node_ids, total_bytes_to_transfer
    )

    with self.start_progress(
        total=total_bytes_to_transfer + dummy_bytes_for_annotation
    ) as overall_progress_update:
        if method == DOWNLOAD_METHOD:
            session = requests.Session()

        file_transfers = []
        for f in files_to_download:

            if self.is_cancelled():
                break

            filename = f.get_filename()
            try:
                dest = paths.get_content_storage_file_path(filename)
            except InvalidStorageFilenameError:
                # If the destination file name is malformed, just stop now.
                overall_progress_update(f.file_size)
                continue

            # if the file already exists, add its size to our overall progress, and skip
            if os.path.isfile(dest) and os.path.getsize(dest) == f.file_size:
                overall_progress_update(f.file_size)
                file_checksums_to_annotate.append(f.id)
                transferred_file_size += f.file_size
                continue

            # determine where we're downloading/copying from, and create appropriate transfer object
            if method == DOWNLOAD_METHOD:
                url = paths.get_content_storage_remote_url(filename, baseurl=baseurl)
                filetransfer = transfer.FileDownload(
                    url, dest, session=session, cancel_check=self.is_cancelled
                )
                file_transfers.append((f, filetransfer))
            elif method == COPY_METHOD:
                try:
                    srcpath = paths.get_content_storage_file_path(
                        filename, datafolder=path
                    )
                except InvalidStorageFilenameError:
                    # If the source file name is malformed, just stop now.
                    overall_progress_update(f.file_size)
                    continue
                filetransfer = transfer.FileCopy(
                    srcpath, dest, cancel_check=self.is_cancelled
                )
                file_transfers.append((f, filetransfer))

        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            batch_size = 100
            # ThreadPoolExecutor allows us to download files concurrently,
            # greatly reducing download time in most cases. However, loading
            # all the downloads into the pool requires considerable memory,
            # so we divide the downloads into batches to keep memory usage down.
            # In batches of 100, total RAM usage doesn't exceed 250MB in testing.
            while len(file_transfers) > 0:
                future_file_transfers = {}
                for i in range(batch_size):
                    if len(file_transfers) > 0:
                        f, filetransfer = file_transfers.pop()
                        future = executor.submit(
                            self._start_file_transfer,
                            f,
                            filetransfer,
                            overall_progress_update,
                        )
                        future_file_transfers[future] = (f, filetransfer)

                for future in concurrent.futures.as_completed(future_file_transfers):
                    f, filetransfer = future_file_transfers[future]
                    try:
                        status = future.result()
                        if self.is_cancelled():
                            break

                        if status == FILE_SKIPPED:
                            number_of_skipped_files += 1
                        else:
                            file_checksums_to_annotate.append(f.id)
                            transferred_file_size += f.file_size
                    except transfer.TransferCanceled:
                        break
                    except Exception as e:
                        logger.error(
                            "An error occurred during content import: {}".format(e)
                        )
                        if (
                            isinstance(e, requests.exceptions.HTTPError)
                            and e.response.status_code == 404
                        ) or (isinstance(e, OSError) and e.errno == 2):
                            # Continue file import when the current file is not found from the source and is skipped.
                            overall_progress_update(f.file_size)
                            number_of_skipped_files += 1
                            continue
                        else:
                            self.exception = e
                            break

        with db_task_write_lock:
            annotation.set_content_visibility(
                channel_id,
                file_checksums_to_annotate,
                node_ids=node_ids,
                exclude_node_ids=exclude_node_ids,
                public=public,
            )

        resources_after_transfer = (
            ContentNode.objects.filter(channel_id=channel_id, available=True)
            .exclude(kind=content_kinds.TOPIC)
            .values("content_id")
            .distinct()
            .count()
        )

        if job:
            job.extra_metadata["transferred_file_size"] = transferred_file_size
            job.extra_metadata["transferred_resources"] = (
                resources_after_transfer - resources_before_transfer
            )
            job.save_meta()

        if number_of_skipped_files > 0:
            logger.warning(
                "{} files are skipped, because errors occurred during the import.".format(
                    number_of_skipped_files
                )
            )

        overall_progress_update(dummy_bytes_for_annotation)

        if self.exception:
            raise self.exception

        if self.is_cancelled():
            self.cancel()

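# Stand-alone sketch of the batched ThreadPoolExecutor pattern used in _transfer
# above: submit work in fixed-size batches and drain each batch before queuing
# more, so the futures never all sit in memory at once. download_one() and the
# example URLs below are hypothetical placeholders, not part of the real import
# command or its helpers.
import concurrent.futures


def download_one(url):
    # Placeholder for a real transfer; just echo the URL back as the result.
    return url


def batched_download(urls, batch_size=100, max_workers=10):
    completed = []
    pending = list(urls)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        while pending:
            # Submit at most batch_size jobs, then wait for them all to finish,
            # keeping only one batch of futures alive at a time.
            batch = [pending.pop() for _ in range(min(batch_size, len(pending)))]
            futures = {executor.submit(download_one, url): url for url in batch}
            for future in concurrent.futures.as_completed(futures):
                completed.append(future.result())
    return completed


# Usage: two placeholder URLs processed in batches of one.
print(batched_download(["https://example.com/a", "https://example.com/b"], batch_size=1))
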
def make_job_updates(flag):
    job = get_current_job()
    for i in range(3):
        job.update_progress(i, 2)
    set_flag(flag)

def _transfer(  # noqa: max-complexity=16
    self,
    method,
    channel_id,
    path=None,
    drive_id=None,
    node_ids=None,
    exclude_node_ids=None,
    baseurl=None,
    peer_id=None,
    renderable_only=True,
):
    try:
        nodes_for_transfer = get_nodes_to_transfer(
            channel_id,
            node_ids,
            exclude_node_ids,
            False,
            renderable_only=renderable_only,
            drive_id=drive_id,
            peer_id=peer_id,
        )
    except LocationError:
        if drive_id:
            raise CommandError(
                "The external drive with given drive id {} does not exist.".format(
                    drive_id
                )
            )
        if peer_id:
            raise CommandError(
                "The network location with the id {} does not exist".format(peer_id)
            )

    total_resource_count = (
        nodes_for_transfer.exclude(kind=content_kinds.TOPIC)
        .values("content_id")
        .distinct()
        .count()
    )

    (files_to_download, total_bytes_to_transfer) = calculate_files_to_transfer(
        nodes_for_transfer, False
    )

    job = get_current_job()

    if job:
        job.extra_metadata["file_size"] = total_bytes_to_transfer
        job.extra_metadata["total_resources"] = total_resource_count
        job.save_meta()

    number_of_skipped_files = 0
    transferred_file_size = 0
    file_checksums_to_annotate = []

    resources_before_transfer = (
        ContentNode.objects.filter(channel_id=channel_id, available=True)
        .exclude(kind=content_kinds.TOPIC)
        .values("content_id")
        .distinct()
        .count()
    )

    with self.start_progress(
        total=total_bytes_to_transfer
    ) as overall_progress_update:
        exception = None  # Exception that is not caught by the retry logic

        if method == DOWNLOAD_METHOD:
            session = requests.Session()

        for f in files_to_download:

            if self.is_cancelled():
                break

            filename = f.get_filename()
            try:
                dest = paths.get_content_storage_file_path(filename)
            except InvalidStorageFilenameError:
                # If the destination file name is malformed, just stop now.
                overall_progress_update(f.file_size)
                continue

            # if the file already exists, add its size to our overall progress, and skip
            if os.path.isfile(dest) and os.path.getsize(dest) == f.file_size:
                overall_progress_update(f.file_size)
                file_checksums_to_annotate.append(f.id)
                transferred_file_size += f.file_size
                continue

            # determine where we're downloading/copying from, and create appropriate transfer object
            if method == DOWNLOAD_METHOD:
                url = paths.get_content_storage_remote_url(filename, baseurl=baseurl)
                filetransfer = transfer.FileDownload(url, dest, session=session)
            elif method == COPY_METHOD:
                try:
                    srcpath = paths.get_content_storage_file_path(
                        filename, datafolder=path
                    )
                except InvalidStorageFilenameError:
                    # If the source file name is malformed, just stop now.
                    overall_progress_update(f.file_size)
                    continue
                filetransfer = transfer.FileCopy(srcpath, dest)

            finished = False
            try:
                while not finished:
                    finished, status = self._start_file_transfer(
                        f, filetransfer, overall_progress_update
                    )

                    if self.is_cancelled():
                        break

                    if status == FILE_TRANSFERRED:
                        file_checksums_to_annotate.append(f.id)
                        transferred_file_size += f.file_size
                    elif status == FILE_SKIPPED:
                        number_of_skipped_files += 1
            except Exception as e:
                exception = e
                break

        with db_task_write_lock:
            annotation.set_content_visibility(
                channel_id,
                file_checksums_to_annotate,
                node_ids=node_ids,
                exclude_node_ids=exclude_node_ids,
            )

        resources_after_transfer = (
            ContentNode.objects.filter(channel_id=channel_id, available=True)
            .exclude(kind=content_kinds.TOPIC)
            .values("content_id")
            .distinct()
            .count()
        )

        if job:
            job.extra_metadata["transferred_file_size"] = transferred_file_size
            job.extra_metadata["transferred_resources"] = (
                resources_after_transfer - resources_before_transfer
            )
            job.save_meta()

        if number_of_skipped_files > 0:
            logger.warning(
                "{} files are skipped, because errors occurred during the import.".format(
                    number_of_skipped_files
                )
            )

        if exception:
            raise exception

        if self.is_cancelled():
            self.cancel()

def diff_stats(channel_id, method, drive_id=None, baseurl=None):
    """
    Download the channel database to an upgraded path.
    Annotate the local file availability of the upgraded channel db.
    Calculate diff stats comparing default db and annotated channel db.
    """
    # upgraded content database path
    source_path = paths.get_upgrade_content_database_file_path(channel_id)
    # annotated db to be used for calculating diff stats
    destination_path = paths.get_annotated_content_database_file_path(channel_id)
    try:
        if method == "network":
            call_command(
                "importchannel", "network", channel_id, baseurl=baseurl, no_upgrade=True
            )
        elif method == "disk":
            drive = get_mounted_drive_by_id(drive_id)
            call_command(
                "importchannel", "disk", channel_id, drive.datafolder, no_upgrade=True
            )

        # create all fields/tables at the annotated destination db, based on the current schema version
        bridge = Bridge(
            sqlite_file_path=destination_path, schema_version=CURRENT_SCHEMA_VERSION
        )
        bridge.Base.metadata.create_all(bridge.engine)

        # initialize import manager based on annotated destination path, pulling from source db path
        import_manager = channel_import.initialize_import_manager(
            channel_id,
            cancel_check=False,
            source=source_path,
            destination=destination_path,
        )

        # import channel data from source db path
        import_manager.import_channel_data()
        import_manager.end()

        # annotate file availability on destination db
        annotation.set_local_file_availability_from_disk(destination=destination_path)

        # get the diff count between what's on the default db and the annotated db
        new_resources_count = count_new_resources_available_for_import(
            destination_path, channel_id
        )
        # get the count for leaf nodes which are in the default db, but not in the annotated db
        resources_to_be_deleted_count = count_removed_resources(
            destination_path, channel_id
        )
        # get the ids of leaf nodes which are now incomplete due to missing local files
        updated_resources_ids = automatically_updated_resource_ids(
            destination_path, channel_id
        )

        # remove the annotated database
        try:
            os.remove(destination_path)
        except OSError as e:
            logger.info(
                "Tried to remove {}, but exception {} occurred.".format(
                    destination_path, e
                )
            )

        # annotate job metadata with diff stats
        job = get_current_job()
        if job:
            job.extra_metadata["new_resources_count"] = new_resources_count
            job.extra_metadata["deleted_resources_count"] = resources_to_be_deleted_count
            job.extra_metadata["updated_node_ids"] = updated_resources_ids
            job.save_meta()

    except UserCancelledError:
        # remove the annotated database
        try:
            os.remove(destination_path)
        except OSError:
            pass
        raise