Example #1
def validate_file(file_name, strict=False, validate_index=False):
    validators = fasta.get_validators(
        strict=strict,
        validate_index=validate_index)
    return fasta.run_validators(
        context_class=ValidatorErrorContext,
        path=file_name,
        reader_class=pbcore.io.FastaReader,
        validators=validators)
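A minimal usage sketch for the function above. The file name is illustrative, and the (errors, metrics) return shape is an assumption, inferred from run_validators in Example #3:

# Hypothetical driver; assumes fasta.run_validators() returns an
# (errors, metrics) pair like run_validators() in Example #3.
errors, metrics = validate_file("reads.fasta", strict=True,
                                validate_index=True)
for err in errors:
    print(err)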
Example #3
def validate_dataset(
        file_name,
        dataset_type=None,
        reference=None,
        quick=False,
        max_errors=None,
        max_records=None,
        contents=None,
        aligned=None,
        validate_index=False,
        strict=False,
        permissive_headers=False):
    assert os.path.isfile(os.path.realpath(file_name))
    ReaderClass = getattr(pbcore.io, str(dataset_type), pbcore.io.openDataSet)
    log.debug("ReaderClass: %s" % ReaderClass.__name__)
    try:
        ds = ReaderClass(file_name, strict=True)
    except Exception as e:
        # In strict mode the reader raises an IOError if the requested
        # dataset type does not agree with the XML; it can also raise other
        # errors if something is wrong with the underlying files and it
        # tries to read them immediately.  Either way there is no point in
        # doing any additional validation, so report a single ReaderError.
        # The traceback is logged at debug level since it may indicate bugs.
        _, _, ex_traceback = sys.exc_info()
        tb_lines = traceback.format_exception(e.__class__, e, ex_traceback)
        log.debug("\n".join(tb_lines))
        errors = [ReaderError.from_args(file_name, str(e))]
        return errors, {}
    log.debug("Dataset type: %s" % ds.__class__.__name__)
    actual_dataset_type = _dataset_type(ds)
    log.debug("Actual type:  %s" % actual_dataset_type)
    if contents is None:
        if isinstance(ds, pbcore.io.SubreadSet):
            contents = "SUBREAD"
        elif isinstance(ds, pbcore.io.ConsensusReadSet):
            contents = "CCS"
    validators = [
        ValidateEncoding(),
        ValidateRootTag(),
        ValidateResources(),
        ValidateDatasetType(dataset_type),
        ValidateMetadata(),
        ValidateNamespace(),
        ValidateRandomAccess(),
    ]
    if actual_dataset_type not in DatasetTypes.HDF5_DATASET:
        validators.extend([
            ValidateResourcesOpen(),
            ValidateNumRecords(),
        ])
        if validate_index:
            validators.append(ValidateIndex())
    if strict:
        validators.extend([
            ValidateXML(),
            ValidateFileName(file_name),
        ])
    additional_validation_function = None
    opened_class_name = ds.__class__.__name__
    # Note: if the file opens as a ReferenceSet but an AlignmentSet was
    # requested, ValidateDatasetType catches the type mismatch, but the
    # Fasta-level checks below still run.
    if opened_class_name in DatasetTypes.FASTA_DATASET:
        validators_ = fasta.get_validators(validate_raw_format=False)
        validators_.insert(0, ValidateFastaRaw())
        validators.extend(validators_)
    elif opened_class_name in DatasetTypes.BAM_DATASET:
        validators_ = bam.get_validators(aligned=aligned,
                                         contents=contents,
                                         include_file_validators=False,
                                         permissive_headers=permissive_headers)
        validators_.insert(0, ValidateSorting())
        validators_.insert(0, ValidateContents(aligned=aligned,
                                               content_type=contents))
        validators.extend(validators_)
        additional_validation_function = _validate_read_groups

    def ReaderClass_wrapper(*args, **kwds):
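        # Suppress pbcore's own logging while the reader is constructed;
        # genuine problems are reported through the validators instead.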
        logging.disable(logging.CRITICAL)
        try:
            return DatasetReader(ReaderClass, *args, **kwds)
        finally:
            logging.disable(logging.NOTSET)
    context_class = get_context_class(
        quick=quick,
        max_errors=max_errors,
        max_records=max_records)
    errors, metrics = run_validators(
        context_class=context_class,
        path=file_name,
        reader_class=ReaderClass_wrapper,
        validators=validators,
        additional_validation_function=additional_validation_function)
    return errors, metrics
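A short usage sketch for validate_dataset. The file name and arguments are hypothetical; the function returns the (errors, metrics) pair produced by run_validators:

# Hypothetical usage: validate a SubreadSet XML, including its indices.
errors, metrics = validate_dataset("movie.subreadset.xml",
                                   dataset_type="SubreadSet",
                                   validate_index=True)
for err in errors:
    print(err)
print(metrics)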