Ejemplo n.º 1
0
def test_index_and_collect_contigs__fai_files(tmp_path):
    """Indexing the same FASTA file twice must leave the .fai file untouched.

    A FASTA file is written to `tmp_path`, indexed twice, and the stat result
    of "test.fasta.fai" taken after each call is required to be equal.
    """
    fasta_file = tmp_path / "test.fasta"
    fai_file = tmp_path / "test.fasta.fai"

    with fasta_file.open("wt") as handle:
        _TEST_FASTA_1_A.write(handle)

    # The index is expected to be written by the first call only; if the
    # second call rewrote it, the two stat results would differ
    snapshots = []
    for _ in range(2):
        FASTA.index_and_collect_contigs(fasta_file)
        snapshots.append(fai_file.stat())

    before, after = snapshots
    assert before == after
Ejemplo n.º 2
0
def _collect_fasta_contigs(filename, cache={}):
    if filename in cache:
        return cache[filename]

    if not os.path.exists(filename + ".fai"):
        print_info("      - Index does not exist for %r; this may "
                   "take a while ..." % (filename,))

    cache[filename] = contigs = dict(FASTA.index_and_collect_contigs(filename))
    return contigs
Ejemplo n.º 3
0
def _collect_fasta_contigs(filename, cache={}):
    if filename in cache:
        return cache[filename]

    if not os.path.exists(filename + ".fai"):
        print_info("      - Index does not exist for %r; this may "
                   "take a while ..." % (filename,))

    cache[filename] = contigs = dict(FASTA.index_and_collect_contigs(filename))
    return contigs
Ejemplo n.º 4
0
def _collect_fasta_contigs(filename, cache={}):
    if filename in cache:
        return cache[filename]

    if not os.path.exists(filename + ".fai"):
        log = logging.getLogger(__name__)
        log.info("Indexing %r; this may take a while", filename)

    cache[filename] = contigs = FASTA.index_and_collect_contigs(filename)
    return contigs
Ejemplo n.º 5
0
def _validate_prefixes(makefiles):
    """Validates prefixes and regions-of-interest, including an implementation
    of the checks included in GATK, which require that the FASTA for the human
    genome is ordered 1 .. 23. This is required since GATK will not run with
    human genomes in a different order.

    Every prefix gets its "IndexFormat" key set: ".bai" by default, or ".csi"
    if the longest contig exceeds _BAM_MAX_SEQUENCE_LENGTH. A prefix whose
    FASTA file does not exist is reported with a warning and keeps ".bai".
    A FASTA path that has been fully validated is not validated again; later
    prefixes with the same path reuse the index format chosen for it.

    Raises MakefileError if the FASTA file cannot be indexed or if a
    regions-of-interest BED file cannot be read.
    """
    already_validated = {}
    print_info("  - Validating prefixes ...")
    for makefile in makefiles:
        # The human-genome check is only passed as fatal when the makefile
        # enables the RealignedBAM feature
        uses_gatk = makefile["Options"]["Features"]["RealignedBAM"]
        # .values() / .items() are used instead of the iter* variants so that
        # the loops behave the same on Python 2 and Python 3
        for prefix in makefile["Prefixes"].values():
            path = prefix["Path"]
            if path in already_validated:
                prefix["IndexFormat"] = already_validated[path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                print_warn("    - Reference FASTA file does not exist:\n"
                           "      %r" % (path, ))
                continue
            elif not os.path.exists(path + ".fai"):
                print_info("    - Index does not exist for %r; this may "
                           "take a while ..." % (path, ))

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error, ))

            # Implementation of GATK checks for the human genome; this is
            # given the contigs as returned, before the conversion to a dict
            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=uses_gatk)

            contigs = dict(contigs)
            regions_of_interest = prefix.get("RegionsOfInterest", {})
            for (name, fpath) in regions_of_interest.items():
                try:
                    # read_bed_file returns iterator; it is consumed in full
                    # so that errors anywhere in the file are raised here
                    for _ in bedtools.read_bed_file(fpath, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s" %
                                        (name, prefix["Name"], error))

            if max(contigs.values()) > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn("    - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files." %
                           (path, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            already_validated[path] = prefix
Ejemplo n.º 6
0
def _validate_prefixes(makefiles):
    """Validates prefixes and regions-of-interest, including an implementation
    of the checks included in GATK, which require that the FASTA for the human
    genome is ordered 1 .. 23. This is required since GATK will not run with
    human genomes in a different order.

    Every prefix gets its "IndexFormat" key set: ".bai" by default, or ".csi"
    if the longest contig exceeds _BAM_MAX_SEQUENCE_LENGTH. A prefix whose
    FASTA file does not exist is reported with a warning and keeps ".bai".
    A FASTA path that has been fully validated is not validated again; later
    prefixes with the same path reuse the index format chosen for it.

    Raises MakefileError if the FASTA file cannot be indexed or if a
    regions-of-interest BED file cannot be read.
    """
    already_validated = {}
    print_info("  - Validating prefixes ...")
    for makefile in makefiles:
        # The human-genome check is only passed as fatal when the makefile
        # enables the RealignedBAM feature
        uses_gatk = makefile["Options"]["Features"]["RealignedBAM"]
        # .values() / .items() are used instead of the iter* variants so that
        # the loops behave the same on Python 2 and Python 3
        for prefix in makefile["Prefixes"].values():
            path = prefix["Path"]
            if path in already_validated:
                prefix["IndexFormat"] = already_validated[path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                print_warn("    - Reference FASTA file does not exist:\n"
                           "      %r" % (path,))
                continue
            elif not os.path.exists(path + ".fai"):
                print_info("    - Index does not exist for %r; this may "
                           "take a while ..." % (path,))

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            # Implementation of GATK checks for the human genome; this is
            # given the contigs as returned, before the conversion to a dict
            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=uses_gatk)

            contigs = dict(contigs)
            regions_of_interest = prefix.get("RegionsOfInterest", {})
            for (name, fpath) in regions_of_interest.items():
                try:
                    # read_bed_file returns iterator; it is consumed in full
                    # so that errors anywhere in the file are raised here
                    for _ in bedtools.read_bed_file(fpath, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s"
                                        % (name, prefix["Name"], error))

            if max(contigs.values()) > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn("    - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files."
                           % (path, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            already_validated[path] = prefix
Ejemplo n.º 7
0
def test_index_and_collect_contigs__duplicate_names(tmp_path):
    """Check the contigs collected from a FASTA file built from three records.

    _TEST_FASTA_1_A, _TEST_FASTA_2 and _TEST_FASTA_1_B are written to one
    file, in that order, and the collected contigs must equal
    {"seq1": 25, "seq2": 7}.
    NOTE(review): going by the test name, the 1_A and 1_B records presumably
    share the name "seq1" -- confirm against the fixture definitions.
    """
    fasta_file = tmp_path / "test.fasta"
    with fasta_file.open("wt") as handle:
        for record in (_TEST_FASTA_1_A, _TEST_FASTA_2, _TEST_FASTA_1_B):
            record.write(handle)

    expected = {"seq1": 25, "seq2": 7}
    observed = FASTA.index_and_collect_contigs(fasta_file)
    assert observed == expected
Ejemplo n.º 8
0
    def _stat_prefixes(cls, prefixes):
        """Returns (size, number of contigs) for a set of BWA prefix."""
        genomes = {}
        for prefix in prefixes:
            contigs = FASTA.index_and_collect_contigs(prefixes[prefix]["Reference"])

            genomes[prefix] = {
                "Size": sum(contigs.values()),
                "NContigs": len(contigs),
            }

        return genomes
Ejemplo n.º 9
0
def _validate_prefixes(makefiles):
    logger = logging.getLogger(__name__)
    already_validated = {}
    logger.info("Validating FASTA files")
    for makefile in makefiles:
        for prefix in makefile["Prefixes"].values():
            path = prefix["Path"]
            if path in already_validated:
                prefix["IndexFormat"] = already_validated[path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                logger.warn("Reference FASTA file does not exist: %r", path)
                continue
            elif not os.path.exists(path + ".fai"):
                logger.info("Indexing FASTA at %r", path)

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error, ))

            regions_of_interest = prefix.get("RegionsOfInterest", {})
            for (name, fpath) in regions_of_interest.items():
                try:
                    # read_bed_file returns iterator
                    for _ in bedtools.read_bed_file(fpath, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s" %
                                        (name, prefix["Name"], error))

            if max(contigs.values()) > _BAM_MAX_SEQUENCE_LENGTH:
                logger.warn(
                    "FASTA file %r contains sequences longer "
                    "than %i! CSI index files will be used instead "
                    "of BAI index files.",
                    path,
                    _BAM_MAX_SEQUENCE_LENGTH,
                )
                prefix["IndexFormat"] = ".csi"

            already_validated[path] = prefix