Example #1
def test_2(self):
    """Validate a file whose records contain errors"""
    validators = [
        ValidateTxtCatRecord(), ValidateTxtDogRecord(), ValidateTxtFile()]
    file_path = "file.doc"
    contents = ["cat dog ", "cat dog bird", "cat dog tree"]
    # records with errors
    contents.extend(["fish"] * 5)
    _write_example_file(contents, file_path)
    # cap the number of collected errors at 2
    errors, metrics = run_validators(_to_max_errors(2), file_path,
                                     TextFileReader, validators)
    self.assertEqual(len(errors), 2)
    # stop after validating a single record
    errors, metrics = run_validators(_to_max_records(1), file_path,
                                     TextFileReader, validators)
    self.assertEqual(len(errors), 1)
    os.remove(file_path)
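
The helpers used in these tests are defined elsewhere in the test module. A minimal sketch of what they might look like, assuming _write_example_file writes one record per line and that _to_max_errors/_to_max_records wrap get_context_class:

# Hypothetical reconstructions; the real helpers live elsewhere in the suite.
def _write_example_file(contents, file_path):
    # One record per line, as TextFileReader presumably expects.
    with open(file_path, "w") as f:
        f.write("\n".join(contents) + "\n")

def _to_max_errors(n):
    # Context class that aborts validation after n errors.
    return get_context_class(max_errors=n)

def _to_max_records(n):
    # Context class that stops after validating n records.
    return get_context_class(max_records=n)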
Example #2
def test_1(self):
    """Validate a correctly formatted file"""
    validators = [ValidateTxtCatRecord(), ValidateTxtDogRecord(),
                  ValidateTxtFile()]
    file_path = "file.txt"
    contents = ["cat dog ", "cat dog bird", "cat dog tree"]
    _write_example_file(contents, file_path)
    errors, metrics = run_validators(ValidatorContextFirstError, file_path,
                                     TextFileReader, validators)
    os.remove(file_path)
    self.assertEqual(len(errors), 0)
Example #3
def test_3(self):
    """Test for consistent behavior when a validator is broken"""
    validators = [ValidateTxtCatRecord(), ValidateTxtDogRecord(),
                  ValidateTxtFile(), ValidateBad()]
    file_path = "file.txt"
    contents = ["cat dog ", "cat dog bird", "cat dog tree"]
    # one record with errors
    contents.append("fish")
    _write_example_file(contents, file_path)
    errors, metrics = run_validators(ValidatorErrorContext, file_path,
                                     TextFileReader, validators)
    os.remove(file_path)
    self.assertEqual(len(errors), 6)
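
ValidateBad above is presumably a deliberately broken validator, used to check that run_validators keeps going and reports a consistent error count when one validator misbehaves. A minimal sketch, assuming the validators expose a per-record validate hook (the interface details are an assumption):

# Hypothetical stand-in for the broken validator; the real class is defined
# elsewhere in the test module, and the base class/hook name are assumed.
class ValidateBad(object):
    def validate(self, record):
        # Always fail, simulating a buggy validator implementation.
        raise RuntimeError("deliberately broken validator")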
Example #4
def validate_bam(file_name,
                 reference=None,
                 aligned=None,
                 contents=None,
                 quick=False,
                 max_errors=None,
                 max_records=None,
                 validate_index=False):
    """
    Main API entry point for running BAM validation.

    Example:

    .. doctest::

        >>> from pbcoretools.pbvalidate.bam import validate_bam
        >>> from pbcore import data
        >>> bam_file = data.getBamAndCmpH5()[0]
        >>> errors, metrics = validate_bam(file_name=bam_file)
        >>> len(errors)
        231
        >>> print(errors[0])
        Mismatch between specified and expected read group ID: a9a22406c5 in file, but computed as b89a4406
        >>> unmapped_file = data.getUnalignedBam()
        >>> errors, metrics = validate_bam(file_name=unmapped_file)
        >>> len(errors)
        118
        >>> print(errors[0])
        This file has not been sorted by QNAME, or the header has not been updated.
        >>> errors, metrics = validate_bam(file_name=unmapped_file,
        ...     aligned=True, contents="CCS")
        >>> len(errors)
        120
    """
    validators = get_validators(aligned=aligned, contents=contents,
                                validate_index=validate_index)
    e, m = run_validators(
        context_class=get_context_class(
            quick=quick,
            max_errors=max_errors,
            max_records=max_records),
        path=file_name,
        reader_class=_get_reader(file_name, reference),
        validators=validators,
        additional_validation_function=validate_read_groups)
    return e, m
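
get_context_class, used both here and in the tests above, selects how aggressively validation short-circuits; its implementation is not shown in this excerpt. A plausible sketch, where binding the limits with functools.partial is an assumption:

import functools

# Sketch only: ValidatorContextFirstError and ValidatorErrorContext appear in
# the tests above; how the limits are bound to the context is assumed here.
def get_context_class(quick=False, max_errors=None, max_records=None):
    if quick:
        # Bail out at the first error encountered.
        return ValidatorContextFirstError
    if max_errors is not None or max_records is not None:
        # Pre-bind the limits so run_validators can instantiate the context
        # with its usual arguments.
        return functools.partial(ValidatorErrorContext,
                                 max_errors=max_errors,
                                 max_records=max_records)
    # Default: collect every error.
    return ValidatorErrorContext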
Example #5
def validate_dataset(
        file_name,
        dataset_type=None,
        reference=None,
        quick=False,
        max_errors=None,
        max_records=None,
        contents=None,
        aligned=None,
        validate_index=False,
        strict=False,
        permissive_headers=False):
    assert os.path.isfile(os.path.realpath(file_name))
    ds = None
    ReaderClass = getattr(pbcore.io, str(dataset_type), pbcore.io.openDataSet)
    log.debug("ReaderClass: %s" % ReaderClass.__name__)
    try:
        ds = ReaderClass(file_name, strict=True)
    except Exception as e:
        # In strict mode the reader raises an IOError if the requested
        # dataset type does not agree with the XML; it can also raise other
        # errors if the underlying files are broken and read immediately.
        # Either way there is no point in doing any additional validation,
        # so report the failure as a single reader error.
        errors = [ReaderError.from_args(file_name, str(e))]
        return errors, {}
    log.debug("Dataset type: %s" % ds.__class__.__name__)
    actual_dataset_type = _dataset_type(ds)
    log.debug("Actual type:  %s" % actual_dataset_type)
    if isinstance(ds, pbcore.io.SubreadSet) and contents is None:
        contents = "SUBREAD"
    elif isinstance(ds, pbcore.io.ConsensusReadSet) and contents is None:
        contents = "CCS"
    validators = [
        ValidateEncoding(),
        ValidateRootTag(),
        ValidateResources(),
        ValidateDatasetType(dataset_type),
        ValidateMetadata(),
        ValidateNamespace(),
        ValidateRandomAccess(),
    ]
    if actual_dataset_type not in DatasetTypes.HDF5_DATASET:
        validators.extend([
            ValidateResourcesOpen(),
            ValidateNumRecords(),
        ])
        if validate_index:
            validators.append(ValidateIndex())
    if strict:
        validators.extend([
            ValidateXML(),
            ValidateFileName(file_name),
        ])
    additional_validation_function = None
    opened_class_name = ds.__class__.__name__
    # XXX not sure this is ideal - what if it opens as a ReferenceSet but we
    # asked for an AlignmentSet?  This is caught by ValidateDatasetType, but
    # we'd still check for Fasta file errors.
    if opened_class_name in DatasetTypes.FASTA_DATASET:
        validators_ = fasta.get_validators(validate_raw_format=False)
        validators_.insert(0, ValidateFastaRaw())
        validators.extend(validators_)
    elif opened_class_name in DatasetTypes.BAM_DATASET:
        validators_ = bam.get_validators(aligned=aligned,
                                         contents=contents,
                                         include_file_validators=False,
                                         permissive_headers=permissive_headers)
        validators_.insert(0, ValidateSorting())
        validators_.insert(0, ValidateContents(aligned=aligned,
                                               content_type=contents))
        validators.extend(validators_)
        additional_validation_function = _validate_read_groups

    def ReaderClass_wrapper(*args, **kwds):
        # Suppress reader log noise while the dataset is re-opened for
        # validation; logging is restored even if the open fails.
        logging.disable(logging.CRITICAL)
        try:
            return DatasetReader(ReaderClass, *args, **kwds)
        finally:
            logging.disable(logging.NOTSET)

    context_class = get_context_class(
        quick=quick,
        max_errors=max_errors,
        max_records=max_records)
    errors, metrics = run_validators(
        context_class=context_class,
        path=file_name,
        reader_class=ReaderClass_wrapper,
        validators=validators,
        additional_validation_function=additional_validation_function)
    return errors, metrics
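
validate_dataset has no doctest of its own; a minimal usage sketch, where the file name is only a placeholder:

# Illustrative call; "movie.subreadset.xml" is a placeholder path.
errors, metrics = validate_dataset(
    file_name="movie.subreadset.xml",
    dataset_type="SubreadSet",
    max_errors=10,
    validate_index=True)
for err in errors:
    print(err)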