def test_row_uniqueness(self, upload_job, csv_raw_1, csv_raw_2,
                        csv_parsed_1, csv_parsed_2):
    registration_row_1 = RegistrationBulkUploadRow.create(
        upload_job, csv_raw_1, csv_parsed_1)
    registration_row_1.save()
    # A second row built from the same raw CSV produces the same row hash,
    # so saving it violates the uniqueness constraint.
    registration_row_2 = RegistrationBulkUploadRow.create(
        upload_job, csv_raw_1, csv_parsed_2)
    with pytest.raises(IntegrityError):
        registration_row_2.save()

def test_row_object_eq(self, upload_job, csv_raw_1, csv_raw_2,
                       csv_parsed_1, csv_parsed_2):
    registration_row_1 = RegistrationBulkUploadRow.create(
        upload_job, csv_raw_1, csv_parsed_1)
    registration_row_2 = RegistrationBulkUploadRow.create(
        upload_job, csv_raw_2, csv_parsed_2)
    registration_row_3 = RegistrationBulkUploadRow.create(
        upload_job, csv_raw_2, csv_parsed_2)
    # Equality is value-based: rows created from identical inputs compare
    # equal even before they are saved.
    assert registration_row_1 != registration_row_2
    assert registration_row_2 == registration_row_3
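The IntegrityError above implies a database-level unique constraint on the row hash. A minimal Django sketch of that shape, inferred from the test rather than taken from the actual OSF model:

from django.db import models

class HashedRowSketch(models.Model):
    # Hypothetical stand-in for RegistrationBulkUploadRow: duplicating
    # row_hash is what makes the second save() above raise IntegrityError.
    row_hash = models.CharField(max_length=255, unique=True)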
Example #3
@pytest.fixture()
def registration_row_invalid_affiliation(self, upload_job_done_error,
                                         csv_parsed_invalid_affiliation):
    # A saved row whose parsed CSV carries an invalid affiliation.
    row = RegistrationBulkUploadRow.create(upload_job_done_error,
                                           str(uuid.uuid4()),
                                           csv_parsed_invalid_affiliation)
    row.save()
    return row
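A fixture like this is consumed by name; a sketch of a consuming test (the test name and assertion are illustrative, not from the OSF suite):

def test_invalid_affiliation_row_exists(self, registration_row_invalid_affiliation):
    # pytest injects the saved row built by the fixture above.
    assert registration_row_invalid_affiliation.pk is not None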
Example #4
@pytest.fixture()
def registration_row_invalid_extra_bib_1(self, upload_job_done_partial,
                                         csv_parsed_extra_bib):
    # A saved row whose parsed CSV contains an extra bibliographic entry.
    row = RegistrationBulkUploadRow.create(upload_job_done_partial,
                                           str(uuid.uuid4()),
                                           csv_parsed_extra_bib)
    row.save()
    return row
def test_row_object_hash(self, upload_job, csv_raw_2, csv_parsed_2,
                         row_hash_2):
    registration_row = RegistrationBulkUploadRow.create(
        upload_job, csv_raw_2, csv_parsed_2)
    # Before the row is persisted, its hash is derived from the CSV content.
    assert hash(registration_row) == hash(row_hash_2)
    registration_row.save()
    registration_row.reload()
    # Once saved, the hash is derived from the primary key instead.
    assert hash(registration_row) == hash(registration_row.pk)
def test_row_creation(self, upload_job, csv_raw_1, csv_parsed_1,
                      row_hash_1):
    registration_row = RegistrationBulkUploadRow.create(
        upload_job, csv_raw_1, csv_parsed_1)
    registration_row.save()
    registration_row.reload()
    # A freshly created row has no draft registration yet and no progress
    # flags set; it simply mirrors its inputs plus the computed row hash.
    assert registration_row.draft_registration is None
    assert registration_row.is_completed is False
    assert registration_row.is_picked_up is False
    assert registration_row.row_hash == row_hash_1
    assert registration_row.upload == upload_job
    assert registration_row.csv_raw == csv_raw_1
    assert registration_row.csv_parsed == csv_parsed_1
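Together, test_row_object_hash and test_row_creation pin down the row's identity lifecycle: content-based before save, pk-based after. A sketch of `__hash__`/`__eq__` consistent with those tests (an inference from the assertions, not the actual OSF implementation):

def __hash__(self):
    # Hypothetical: unsaved rows hash by CSV content; saved rows hash by
    # primary key, the switch test_row_object_hash asserts around save().
    return hash(self.pk) if self.pk else hash(self.row_hash)

def __eq__(self, other):
    # Value-based comparison, matching test_row_object_eq above.
    return isinstance(other, type(self)) and self.row_hash == other.row_hash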
Example #7
@pytest.fixture()
def registration_row_3(self, upload_job_done_partial, csv_parsed_1):
    # A saved row attached to a partially completed upload job.
    row = RegistrationBulkUploadRow.create(upload_job_done_partial,
                                           str(uuid.uuid4()), csv_parsed_1)
    row.save()
    return row
Example #8
def prepare_for_registration_bulk_creation(payload_hash,
                                           initiator_id,
                                           provider_id,
                                           parsing_output,
                                           dry_run=False):
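    """Validate the bulk-upload inputs (initiator, provider, parsing output,
    schema), create the RegistrationBulkUploadJob, screen every row for
    duplicates, and bulk-insert the resulting RegistrationBulkUploadRow
    objects; any failure aborts via handle_internal_error.
    """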

    # Check initiator
    initiator = OSFUser.load(initiator_id)
    if not initiator:
        message = f'Bulk upload preparation failure: initiator [id={initiator_id}] not found'
        return handle_internal_error(initiator=None,
                                     provider=None,
                                     message=message,
                                     dry_run=dry_run)

    # Check provider
    try:
        provider = RegistrationProvider.objects.get(_id=provider_id)
    except RegistrationProvider.DoesNotExist:
        message = f'Bulk upload preparation failure: registration provider [_id={provider_id}] not found'
        return handle_internal_error(initiator=initiator,
                                     provider=None,
                                     message=message,
                                     dry_run=dry_run)
    except RegistrationProvider.MultipleObjectsReturned:
        message = f'Bulk upload preparation failure: multiple registration providers returned for [_id={provider_id}]'
        return handle_internal_error(initiator=initiator,
                                     provider=None,
                                     message=message,
                                     dry_run=dry_run)

    # Check parsing output
    if not parsing_output:
        message = 'Bulk upload preparation failure: missing parser output as task input'
        return handle_internal_error(initiator=initiator,
                                     provider=provider,
                                     message=message,
                                     dry_run=dry_run)

    # Check schema
    schema_id = parsing_output.get('schema_id', None)
    try:
        schema = RegistrationSchema.objects.get(_id=schema_id)
    except RegistrationSchema.DoesNotExist:
        message = f'Bulk upload preparation failure: registration schema [_id={schema_id}] not found'
        return handle_internal_error(initiator=initiator,
                                     provider=provider,
                                     message=message,
                                     dry_run=dry_run)
    except RegistrationSchema.MultipleObjectsReturned:
        message = f'Bulk upload preparation failure: multiple registration schemas [_id={schema_id}] returned'
        return handle_internal_error(initiator=initiator,
                                     provider=provider,
                                     message=message,
                                     dry_run=dry_run)

    # Create the bulk upload job
    upload = RegistrationBulkUploadJob.create(payload_hash, initiator,
                                              provider, schema)
    logger.info(
        f'Creating a registration bulk upload job with [hash={upload.payload_hash}] ...'
    )
    if not dry_run:
        try:
            upload.save()
        except ValidationError:
            sentry.log_exception()
            message = 'Bulk upload preparation failure: failed to create the job'
            return handle_internal_error(initiator=initiator,
                                         provider=provider,
                                         message=message,
                                         dry_run=dry_run)
        upload.reload()
        logger.info(
            f'Bulk upload job created: [pk={upload.id}, hash={upload.payload_hash}]'
        )
    else:
        logger.info('Dry run: insertion did not happen')

    # Create registration rows for the bulk upload job
    registration_rows = parsing_output.get('registrations', [])
    if not registration_rows:
        message = 'Bulk upload preparation failure: missing registration rows'
        return handle_internal_error(initiator=initiator,
                                     provider=provider,
                                     message=message,
                                     dry_run=dry_run)
    initial_row_count = len(registration_rows)
    logger.info(
        f'Preparing [{initial_row_count}] registration rows for bulk creation ...'
    )

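    # Rows accumulate in a set, so the in-CSV duplicate check below relies on
    # the value-based __eq__/__hash__ behavior shown in the tests above.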
    bulk_upload_rows = set()
    draft_error_list = []
    try:
        for registration_row in registration_rows:
            bulk_upload_row = RegistrationBulkUploadRow.create(
                upload,
                registration_row.get('csv_raw', ''),
                registration_row.get('csv_parsed'),
            )
            metadata = bulk_upload_row.csv_parsed.get('metadata', {}) or {}
            row_external_id = metadata.get('External ID', 'N/A')
            row_title = metadata.get('Title', 'N/A')
            # Check duplicates with the database
            if RegistrationBulkUploadRow.objects.filter(
                    row_hash=bulk_upload_row.row_hash).exists():
                error = 'Duplicate rows - existing row found in the system'
                exception = RegistrationBulkCreationRowError(upload.id,
                                                             'N/A',
                                                             row_title,
                                                             row_external_id,
                                                             error=error)
                logger.error(exception.long_message)
                sentry.log_message(exception.long_message)
                draft_error_list.append(exception.short_message)
            # No `return` or `continue` after a database duplicate: the row
            # must still go through the in-CSV duplicate check below.
            # Check duplicates within the CSV
            if bulk_upload_row in bulk_upload_rows:
                error = 'Duplicate rows - CSV contains duplicate rows'
                exception = RegistrationBulkCreationRowError(upload.id,
                                                             'N/A',
                                                             row_title,
                                                             row_external_id,
                                                             error=error)
                logger.error(exception.long_message)
                sentry.log_message(exception.long_message)
                draft_error_list.append(exception.short_message)
            else:
                bulk_upload_rows.add(bulk_upload_row)
    except Exception as e:
        upload.delete()
        return handle_internal_error(initiator=initiator,
                                     provider=provider,
                                     message=repr(e),
                                     dry_run=dry_run)

    # Cancel the preparation task if duplicates are found in the CSV and/or in DB
    if draft_error_list:
        upload.delete()
        mails.send_mail(
            to_addr=initiator.username,
            mail=mails.REGISTRATION_BULK_UPLOAD_FAILURE_DUPLICATES,
            fullname=initiator.fullname,
            count=initial_row_count,
            draft_errors=draft_error_list,
            osf_support_email=settings.OSF_SUPPORT_EMAIL,
        )
        return

    if dry_run:
        logger.info(
            'Dry run: complete. Bulk creation did not run and emails are not sent.'
        )
        return

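    # The duplicate pre-check above does not fully guard bulk_create: a
    # concurrent upload could insert an identical row hash in the meantime,
    # which is presumably why IntegrityError is still handled here.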
    try:
        logger.info(
            f'Bulk creating [{len(bulk_upload_rows)}] registration rows ...')
        created_objects = RegistrationBulkUploadRow.objects.bulk_create(
            bulk_upload_rows)
    except (ValueError, IntegrityError):
        upload.delete()
        sentry.log_exception()
        message = 'Bulk upload preparation failure: failed to create the rows.'
        return handle_internal_error(initiator=initiator,
                                     provider=provider,
                                     message=message,
                                     dry_run=dry_run)
    logger.info(f'[{len(created_objects)}] rows successfully prepared.')

    upload.state = JobState.INITIALIZED
    try:
        upload.save()
    except ValidationError:
        upload.delete()
        sentry.log_exception()
        message = 'Bulk upload preparation failure: job state update failed'
        return handle_internal_error(initiator=initiator,
                                     provider=provider,
                                     message=message,
                                     dry_run=dry_run)
    logger.info(
        f'Bulk upload preparation finished: [upload={upload.id}, state={upload.state.name}, '
        f'provider={upload.provider._id}, schema={upload.schema._id}, initiator={upload.initiator._id}]',
    )