def test_api_1(self):
        """Run the BAM validator API over good and bad subread files."""
        bam_path = op.join(DATA_DIR, "tst_1_subreads.bam")
        _validators = bam.get_validators(validate_index=True)

        def _run_validators(f, expected_failures, validators=_validators):
            collected = []

            def _apply(target, validator_class):
                # Run every validator of the requested class against target,
                # accumulating any resulting error objects.
                for checker in validators:
                    if isinstance(checker, validator_class):
                        if not checker.validate(target):
                            collected.extend(checker.to_errors(target))

            _apply(f, ValidateFileObject)
            for read_group in f.peer.header["RG"]:
                _apply(read_group, ValidateReadGroup)
            for record in f:
                _apply(record, ValidateRecord)
            # Compare the distinct error class names against expectations.
            found = sorted({type(err).__name__ for err in collected})
            expected = sorted(set(expected_failures))
            self.assertEqual(found, expected)

        _run_validators(f=pbcore.io.BamReader(bam_path), expected_failures=[])
        # now a bad one
        bam_path = op.join(DATA_DIR, "tst_2_subreads.bam")
        _run_validators(f=pbcore.io.BamReader(bam_path), expected_failures=[
            'AlignmentCigarError',
            'AlignmentCigarMatchError', 'AlignmentNotUniqueError',
            'AlignmentUnmappedError', 'BasecallerVersionError',
            'MissingCodecError', 'MissingIndexError', 'MissingPlatformError',
            'QnameFormatError', 'QnameRangeError',
            'ReadGroupChemistryError',
            'ReadGroupIdMismatchError', "ReadLengthError", 'TagValueError',
            'UninitializedSNRError', 'UnsortedError'])
        # a good unaligned file
        bam_path = op.join(DATA_DIR, "tst_3_subreads.bam")
        _run_validators(f=pbcore.io.BamReader(bam_path), expected_failures=[])
        # a bad unaligned file
        bam_path = op.join(DATA_DIR, "tst_4_subreads.bam")
        unaligned_bam = pbcore.io.BamReader(bam_path)
        _run_validators(f=unaligned_bam, expected_failures=[
            'BasecallerVersionError', 'MissingCodecError',
            'PulseFeatureError', 'QnameHoleNumberError', 'QnameMovieError',
            'ReadGroupChemistryError', 'UninitializedSNRError',
            'UnmappedPropertiesError', 'UnsortedError',
            'WrongPlatformError'])
        # Re-check the same unaligned file with aligned-mode validators.
        _validators = bam.get_validators(aligned=True)
        _run_validators(f=unaligned_bam, expected_failures=[
            'BasecallerVersionError',
            'FileNotAlignedError', 'MissingCodecError', 'PulseFeatureError',
            'QnameHoleNumberError', 'QnameMovieError',
            'ReadGroupChemistryError',
            'UninitializedSNRError', 'UnsortedError',
            'WrongPlatformError'],
            validators=_validators)
# Example #2
    def test_api_1(self):
        """Run the BAM validator API over good and bad subread files."""
        path = op.join(DATA_DIR, "tst_1_subreads.bam")
        _validators = bam.get_validators(validate_index=True)

        def _run_validators(f, expected_failures, validators=_validators):
            issues = []

            def _check(obj, validator_class):
                # Gather errors from every validator of the given class that
                # rejects ``obj``.
                issues.extend(
                    err
                    for v in validators
                    if isinstance(v, validator_class) and not v.validate(obj)
                    for err in v.to_errors(obj))

            _check(f, ValidateFileObject)
            for read_group in f.peer.header["RG"]:
                _check(read_group, ValidateReadGroup)
            for record in f:
                _check(record, ValidateRecord)
            # Only the distinct error class names are compared.
            found = sorted({type(e).__name__ for e in issues})
            expected = sorted(set(expected_failures))
            assert found == expected

        _run_validators(f=pbcore.io.BamReader(path), expected_failures=[])
        # now a bad one
        path = op.join(DATA_DIR, "tst_2_subreads.bam")
        _run_validators(
            f=pbcore.io.BamReader(path),
            expected_failures=[
                'AlignmentCigarMatchError', 'AlignmentNotUniqueError',
                'AlignmentUnmappedError', 'BasecallerVersionError',
                'MissingCodecError', 'MissingIndexError',
                'MissingPlatformError', 'QnameFormatError', 'QnameRangeError',
                'ReadGroupChemistryError', 'ReadGroupIdMismatchError',
                "ReadLengthError", 'TagValueError', 'UninitializedSNRError',
                'UnsortedError'
            ])
        # a good unaligned file
        path = op.join(DATA_DIR, "tst_3_subreads.bam")
        _run_validators(f=pbcore.io.BamReader(path), expected_failures=[])
        # a bad unaligned file
        path = op.join(DATA_DIR, "tst_4_subreads.bam")
        unaligned_bam = pbcore.io.BamReader(path)
        _run_validators(f=unaligned_bam,
                        expected_failures=[
                            'BasecallerVersionError', 'MissingCodecError',
                            'QnameHoleNumberError', 'QnameMovieError',
                            'ReadGroupChemistryError', 'UninitializedSNRError',
                            'UnmappedPropertiesError', 'UnsortedError',
                            'WrongPlatformError'
                        ])
        # Re-check the same unaligned file with aligned-mode validators.
        _validators = bam.get_validators(aligned=True)
        _run_validators(f=unaligned_bam,
                        expected_failures=[
                            'BasecallerVersionError', 'FileNotAlignedError',
                            'MissingCodecError', 'QnameHoleNumberError',
                            'QnameMovieError', 'ReadGroupChemistryError',
                            'UninitializedSNRError', 'UnsortedError',
                            'WrongPlatformError'
                        ],
                        validators=_validators)
# Example #3
def validate_dataset(
        file_name,
        dataset_type=None,
        reference=None,
        quick=False,
        max_errors=None,
        max_records=None,
        contents=None,
        aligned=None,
        validate_index=False,
        strict=False,
        permissive_headers=False):
    """
    Open a PacBio dataset and run every applicable validator against it.

    :param file_name: path to the dataset XML (or underlying data file)
    :param dataset_type: expected dataset class name; the matching reader in
        pbcore.io is used, with openDataSet as the fallback for unknown names
    :param reference: accepted for API compatibility (unused in this function)
    :param quick: passed to get_context_class to select a faster context
    :param max_errors: stop after collecting this many errors
    :param max_records: examine at most this many records
    :param contents: expected read type ("SUBREAD" or "CCS"); inferred from
        the opened dataset class when None
    :param aligned: whether BAM contents are expected to be aligned
    :param validate_index: also validate the accompanying index file
    :param strict: additionally run XML-schema and file-name validators
    :param permissive_headers: relax BAM header checks
    :returns: ``(errors, metrics)`` from run_validators, or ``(errors, {})``
        when the file cannot even be opened by the reader
    """
    assert os.path.isfile(os.path.realpath(file_name)), file_name
    # Resolve the reader for the requested type; unknown names fall back to
    # the generic openDataSet factory.
    ReaderClass = getattr(pbcore.io, str(dataset_type), pbcore.io.openDataSet)
    log.debug("ReaderClass: %s", ReaderClass.__name__)
    try:
        ds = ReaderClass(file_name, strict=True)
    except Exception as e:
        # In strict mode the reader raises (e.g. IOError) if the requested
        # dataset type does not agree with the XML; it can also raise other
        # errors when the underlying files are broken and read eagerly.
        # Either way further validation is pointless - report and stop.
        if False:  # flip to True to log the full traceback when debugging
            _, _, ex_traceback = sys.exc_info()
            tb_lines = traceback.format_exception(e.__class__, e, ex_traceback)
            log.error("\n".join(tb_lines))
        return [ReaderError.from_args(file_name, str(e))], {}
    log.debug("Dataset type: %s", ds.__class__.__name__)
    actual_dataset_type = _dataset_type(ds)
    log.debug("Actual type:  %s", actual_dataset_type)
    # Infer the expected read type when the caller did not specify one.
    if isinstance(ds, pbcore.io.SubreadSet) and contents is None:
        contents = "SUBREAD"
    elif isinstance(ds, pbcore.io.ConsensusReadSet) and contents is None:
        contents = "CCS"
    elif isinstance(ds, pbcore.io.AlignmentSet):
        pass  # no content type is inferred for alignment datasets
    validators = [
        ValidateEncoding(),
        ValidateRootTag(),
        ValidateResources(),
        ValidateDatasetType(dataset_type),
        ValidateMetadata(),
        ValidateNamespace(),
        ValidateRandomAccess(),
    ]
    # HDF5 datasets do not support these record-level checks.
    if actual_dataset_type not in DatasetTypes.HDF5_DATASET:
        validators.extend([
            ValidateResourcesOpen(),
            ValidateNumRecords(),
        ])
        if validate_index:
            validators.append(ValidateIndex())
    if strict:
        validators.extend([
            ValidateXML(),
            ValidateFileName(file_name),
        ])
    additional_validation_function = None
    opened_class_name = ds.__class__.__name__
    # XXX not sure this is ideal - what if it opens as a ReferenceSet but we
    # asked for an AlignmentSet?  This is caught by ValidateDatasetType, but
    # we'd still check for Fasta file errors.
    if opened_class_name in DatasetTypes.FASTA_DATASET:
        validators_ = fasta.get_validators(validate_raw_format=False)
        validators_.insert(0, ValidateFastaRaw())
        validators.extend(validators_)
    elif opened_class_name in DatasetTypes.BAM_DATASET:
        validators_ = bam.get_validators(aligned=aligned,
                                         contents=contents,
                                         include_file_validators=False,
                                         permissive_headers=permissive_headers)
        validators_.insert(0, ValidateSorting())
        validators_.insert(0, ValidateContents(aligned=aligned,
                                               content_type=contents))
        validators.extend(validators_)
        additional_validation_function = _validate_read_groups

    def ReaderClass_wrapper(*args, **kwds):
        # Suppress reader log noise while the validation context re-opens the
        # file; restore normal logging afterwards.
        logging.disable(logging.CRITICAL)
        try:
            return DatasetReader(ReaderClass, *args, **kwds)
        finally:
            logging.disable(logging.NOTSET)

    context_class = get_context_class(
        quick=quick,
        max_errors=max_errors,
        max_records=max_records)
    errors, metrics = run_validators(
        context_class=context_class,
        path=file_name,
        reader_class=ReaderClass_wrapper,
        validators=validators,
        additional_validation_function=additional_validation_function)
    return errors, metrics