def test_row_uniqueness(self, upload_job, csv_raw_1, csv_raw_2, csv_parsed_1, csv_parsed_2):
    registration_row_1 = RegistrationBulkUploadRow.create(upload_job, csv_raw_1, csv_parsed_1)
    registration_row_1.save()
    # Same raw CSV implies the same row hash, so the second save must violate uniqueness
    registration_row_2 = RegistrationBulkUploadRow.create(upload_job, csv_raw_1, csv_parsed_2)
    with pytest.raises(IntegrityError):
        registration_row_2.save()

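# The IntegrityError above implies a database-level uniqueness guarantee on the
# content hash. A minimal sketch of the presumed model field (hypothetical, not
# the actual OSF definition -- the real field type and options may differ):
#
#     row_hash = models.CharField(unique=True, ...)
#
# Since `row_hash` is derived from the raw CSV, two rows created from the same
# raw line collide on save even when their parsed payloads differ.
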
def test_row_object_eq(self, upload_job, csv_raw_1, csv_raw_2, csv_parsed_1, csv_parsed_2):
    registration_row_1 = RegistrationBulkUploadRow.create(upload_job, csv_raw_1, csv_parsed_1)
    registration_row_2 = RegistrationBulkUploadRow.create(upload_job, csv_raw_2, csv_parsed_2)
    registration_row_3 = RegistrationBulkUploadRow.create(upload_job, csv_raw_2, csv_parsed_2)
    assert registration_row_1 != registration_row_2
    assert registration_row_2 == registration_row_3

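# The assertions above imply that equality for unsaved rows is content-based:
# two rows built from the same raw and parsed CSV compare equal. One
# implementation consistent with this test (a sketch, not necessarily the
# actual OSF method) would be:
#
#     def __eq__(self, other):
#         return isinstance(other, RegistrationBulkUploadRow) and self.row_hash == other.row_hash
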
@pytest.fixture()
def registration_row_invalid_affiliation(self, upload_job_done_error, csv_parsed_invalid_affiliation):
    # A random UUID stands in for the raw CSV so the fixture row gets a unique row hash
    row = RegistrationBulkUploadRow.create(upload_job_done_error, str(uuid.uuid4()), csv_parsed_invalid_affiliation)
    row.save()
    return row

@pytest.fixture()
def registration_row_invalid_extra_bib_1(self, upload_job_done_partial, csv_parsed_extra_bib):
    row = RegistrationBulkUploadRow.create(upload_job_done_partial, str(uuid.uuid4()), csv_parsed_extra_bib)
    row.save()
    return row

def test_row_object_hash(self, upload_job, csv_raw_2, csv_parsed_2, row_hash_2):
    registration_row = RegistrationBulkUploadRow.create(upload_job, csv_raw_2, csv_parsed_2)
    # Before save, a row hashes by its content hash
    assert registration_row.__hash__() == hash(row_hash_2)
    registration_row.save()
    registration_row.reload()
    # After save, a row hashes by its primary key
    assert registration_row.__hash__() == hash(registration_row.pk)

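# The two assertions above pin down the presumed hash semantics: an unsaved row
# hashes by content, a saved row by primary key. A sketch consistent with the
# test (hypothetical, not necessarily the actual OSF implementation):
#
#     def __hash__(self):
#         return hash(self.pk) if self.pk else hash(self.row_hash)
#
# This content-based hashing for unsaved rows is what lets the preparation task
# below de-duplicate unsaved rows with a plain `set`.
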
def test_row_creation(self, upload_job, csv_raw_1, csv_parsed_1, row_hash_1):
    registration_row = RegistrationBulkUploadRow.create(upload_job, csv_raw_1, csv_parsed_1)
    registration_row.save()
    registration_row.reload()
    assert registration_row.draft_registration is None
    assert registration_row.is_completed is False
    assert registration_row.is_picked_up is False
    assert registration_row.row_hash == row_hash_1
    assert registration_row.upload == upload_job
    assert registration_row.csv_raw == csv_raw_1
    assert registration_row.csv_parsed == csv_parsed_1

@pytest.fixture()
def registration_row_3(self, upload_job_done_partial, csv_parsed_1):
    row = RegistrationBulkUploadRow.create(upload_job_done_partial, str(uuid.uuid4()), csv_parsed_1)
    row.save()
    return row

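# For reference, the `csv_parsed` fixtures consumed above follow the shape the
# preparation task below reads: a dict with a `metadata` sub-dict keyed by
# column names. A hypothetical minimal example (only `metadata`, `Title`, and
# `External ID` are attested by the task code; everything else is assumed):
#
#     @pytest.fixture()
#     def csv_parsed_1(self):
#         return {'metadata': {'Title': 'Example Registration', 'External ID': 'ROW-1'}}
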
def prepare_for_registration_bulk_creation(payload_hash, initiator_id, provider_id, parsing_output, dry_run=False):

    # Check the initiator
    initiator = OSFUser.load(initiator_id)
    if not initiator:
        message = f'Bulk upload preparation failure: initiator [id={initiator_id}] not found'
        return handle_internal_error(initiator=None, provider=None, message=message, dry_run=dry_run)

    # Check the provider
    try:
        provider = RegistrationProvider.objects.get(_id=provider_id)
    except RegistrationProvider.DoesNotExist:
        message = f'Bulk upload preparation failure: registration provider [_id={provider_id}] not found'
        return handle_internal_error(initiator=initiator, provider=None, message=message, dry_run=dry_run)
    except RegistrationProvider.MultipleObjectsReturned:
        message = f'Bulk upload preparation failure: multiple registration providers returned for [_id={provider_id}]'
        return handle_internal_error(initiator=initiator, provider=None, message=message, dry_run=dry_run)

    # Check the parsing output
    if not parsing_output:
        message = 'Bulk upload preparation failure: missing parser output as task input'
        return handle_internal_error(initiator=initiator, provider=provider, message=message, dry_run=dry_run)

    # Check the schema
    schema_id = parsing_output.get('schema_id', None)
    try:
        schema = RegistrationSchema.objects.get(_id=schema_id)
    except RegistrationSchema.DoesNotExist:
        message = f'Bulk upload preparation failure: registration schema [_id={schema_id}] not found'
        return handle_internal_error(initiator=initiator, provider=provider, message=message, dry_run=dry_run)
    except RegistrationSchema.MultipleObjectsReturned:
        message = f'Bulk upload preparation failure: multiple registration schemas [_id={schema_id}] returned'
        return handle_internal_error(initiator=initiator, provider=provider, message=message, dry_run=dry_run)

    # Create the bulk upload job
    upload = RegistrationBulkUploadJob.create(payload_hash, initiator, provider, schema)
    logger.info(f'Creating a registration bulk upload job with [hash={upload.payload_hash}] ...')
    if not dry_run:
        try:
            upload.save()
        except ValidationError:
            sentry.log_exception()
            message = 'Bulk upload preparation failure: failed to create the job'
            return handle_internal_error(initiator=initiator, provider=provider, message=message, dry_run=dry_run)
        upload.reload()
        logger.info(f'Bulk upload job created: [pk={upload.id}, hash={upload.payload_hash}]')
    else:
        logger.info('Dry run: job insertion skipped')

    # Create registration rows for the bulk upload job
    registration_rows = parsing_output.get('registrations', [])
    if not registration_rows:
        message = 'Bulk upload preparation failure: missing registration rows'
        return handle_internal_error(initiator=initiator, provider=provider, message=message, dry_run=dry_run)
    initial_row_count = len(registration_rows)
    logger.info(f'Preparing [{initial_row_count}] registration rows for bulk creation ...')

    bulk_upload_rows = set()
    draft_error_list = []
    try:
        for registration_row in registration_rows:
            bulk_upload_row = RegistrationBulkUploadRow.create(
                upload,
                registration_row.get('csv_raw', ''),
                registration_row.get('csv_parsed'),
            )
            metadata = bulk_upload_row.csv_parsed.get('metadata', {}) or {}
            row_external_id = metadata.get('External ID', 'N/A')
            row_title = metadata.get('Title', 'N/A')
            # Check for duplicates against rows already in the database
            if RegistrationBulkUploadRow.objects.filter(row_hash=bulk_upload_row.row_hash).exists():
                error = 'Duplicate rows - existing row found in the system'
                exception = RegistrationBulkCreationRowError(upload.id, 'N/A', row_title, row_external_id, error=error)
                logger.error(exception.long_message)
                sentry.log_message(exception.long_message)
                draft_error_list.append(exception.short_message)
            # Deliberately no `return` or `continue` here: a row that duplicates the database
            # may also duplicate another row within the CSV, and both errors should be recorded.
            # Check for duplicates within the CSV
            if bulk_upload_row in bulk_upload_rows:
                error = 'Duplicate rows - CSV contains duplicate rows'
                exception = RegistrationBulkCreationRowError(upload.id, 'N/A', row_title, row_external_id, error=error)
                logger.error(exception.long_message)
                sentry.log_message(exception.long_message)
                draft_error_list.append(exception.short_message)
            else:
                bulk_upload_rows.add(bulk_upload_row)
    except Exception as e:
        upload.delete()
        return handle_internal_error(initiator=initiator, provider=provider, message=repr(e), dry_run=dry_run)

    # Cancel the preparation task if duplicates were found in the CSV and/or in the database
    if draft_error_list:
        upload.delete()
        mails.send_mail(
            to_addr=initiator.username,
            mail=mails.REGISTRATION_BULK_UPLOAD_FAILURE_DUPLICATES,
            fullname=initiator.fullname,
            count=initial_row_count,
            draft_errors=draft_error_list,
            osf_support_email=settings.OSF_SUPPORT_EMAIL,
        )
        return

    if dry_run:
        logger.info('Dry run: complete. Bulk creation did not run and emails were not sent.')
        return

    # Bulk-insert the de-duplicated rows
    try:
        logger.info(f'Bulk creating [{len(bulk_upload_rows)}] registration rows ...')
        created_objects = RegistrationBulkUploadRow.objects.bulk_create(bulk_upload_rows)
    except (ValueError, IntegrityError):
        upload.delete()
        sentry.log_exception()
        message = 'Bulk upload preparation failure: failed to create the rows'
        return handle_internal_error(initiator=initiator, provider=provider, message=message, dry_run=dry_run)
    logger.info(f'[{len(created_objects)}] rows successfully prepared.')

    # Mark the job as initialized
    upload.state = JobState.INITIALIZED
    try:
        upload.save()
    except ValidationError:
        upload.delete()
        sentry.log_exception()
        message = 'Bulk upload preparation failure: job state update failed'
        return handle_internal_error(initiator=initiator, provider=provider, message=message, dry_run=dry_run)
    logger.info(
        f'Bulk upload preparation finished: [upload={upload.id}, state={upload.state.name}, '
        f'provider={upload.provider._id}, schema={upload.schema._id}, initiator={upload.initiator._id}]',
    )
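
# Example invocation (illustrative only: the ids, schema id, and parsing output
# below are made up, and the error message 'missing parser output as task input'
# suggests this normally runs as an asynchronous task rather than a direct call):
#
#     prepare_for_registration_bulk_creation(
#         payload_hash='<sha256-of-uploaded-csv>',
#         initiator_id='abc12',
#         provider_id='osf',
#         parsing_output={
#             'schema_id': 'prereg',
#             'registrations': [
#                 {
#                     'csv_raw': 'Example Registration,ROW-1,...',
#                     'csv_parsed': {'metadata': {'Title': 'Example Registration', 'External ID': 'ROW-1'}},
#                 },
#             ],
#         },
#         dry_run=True,
#     )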