Example #1
0
def _collect_sequence_names(bed_file, fasta_file, min_columns=6):
    """Returns a frozenset containing the names of the regions in 'bed_file'.

    The BED file is read with the contigs collected from 'fasta_file' passed
    to the reader. Records sharing a name must also share the same contig and
    the same strand; the first record seen with a given name defines the
    expected (contig, strand) pair for that name.

    Raises MakefileError if two records with the same name are located on
    different contigs or on different strands.
    """
    # NOTE(review): 'min_columns' is accepted but not used; a fixed minimum of
    # 6 columns is always requested below -- confirm whether this is intended.
    contigs = _collect_fasta_contigs(fasta_file)
    sequences = {}

    for record in read_bed_file(bed_file, min_columns=6, contigs=contigs):
        current = (record.contig, record.strand)
        # Stores 'current' if the name is new, otherwise returns the pair
        # recorded for the first region with this name
        reference = sequences.setdefault(record.name, current)

        if current[0] != reference[0]:
            # The trailing space after "to" is required; the adjacent string
            # literals are joined as-is and would otherwise read "tocontinue."
            raise MakefileError("Regions in %r with the same name (%r) "
                                "are located on different contigs (%r and "
                                "%r); note that PALEOMIX assumes that "
                                "regions with the same name constitute "
                                "parts of a single consecutive sequence, "
                                "which must therefore be located on one "
                                "strand of a single sequence. Please "
                                "rename one or more of these regions to "
                                "continue." % (bed_file, record.name,
                                               current[0], reference[0]))
        elif current[1] != reference[1]:
            raise MakefileError("Regions in %r with the same name (%r) "
                                "are located on different strands; note "
                                "that PALEOMIX assumes that regions with "
                                "the same name constitute parts of a "
                                "single consecutive sequence, and that "
                                "these must therefore be located on the "
                                "same strand." % (bed_file, record.name,))

    # Only the region names (the dictionary keys) are returned
    return frozenset(sequences)
Example #2
0
def _collect_sequence_names(bed_file, fasta_file, min_columns=6):
    """Returns a frozenset containing the names of the regions in 'bed_file'.

    The BED file is read with the contigs collected from 'fasta_file' passed
    to the reader. Records sharing a name must also share the same contig and
    the same strand; the first record seen with a given name defines the
    expected (contig, strand) pair for that name.

    Raises MakefileError if two records with the same name are located on
    different contigs or on different strands.
    """
    # NOTE(review): 'min_columns' is accepted but not used; a fixed minimum of
    # 6 columns is always requested below -- confirm whether this is intended.
    contigs = _collect_fasta_contigs(fasta_file)
    sequences = {}

    for record in read_bed_file(bed_file, min_columns=6, contigs=contigs):
        current = (record.contig, record.strand)
        # Stores 'current' if the name is new, otherwise returns the pair
        # recorded for the first region with this name
        reference = sequences.setdefault(record.name, current)

        if current[0] != reference[0]:
            # The trailing space after "to" is required; the adjacent string
            # literals are joined as-is and would otherwise read "tocontinue."
            raise MakefileError("Regions in %r with the same name (%r) "
                                "are located on different contigs (%r and "
                                "%r); note that PALEOMIX assumes that "
                                "regions with the same name constitute "
                                "parts of a single consecutive sequence, "
                                "which must therefore be located on one "
                                "strand of a single sequence. Please "
                                "rename one or more of these regions to "
                                "continue." % (bed_file, record.name,
                                               current[0], reference[0]))
        elif current[1] != reference[1]:
            raise MakefileError("Regions in %r with the same name (%r) "
                                "are located on different strands; note "
                                "that PALEOMIX assumes that regions with "
                                "the same name constitute parts of a "
                                "single consecutive sequence, and that "
                                "these must therefore be located on the "
                                "same strand." % (bed_file, record.name,))

    # Only the region names (the dictionary keys) are returned
    return frozenset(sequences)
Example #3
0
def _validate_prefixes(makefiles):
    """Validates the reference FASTA file and the regions-of-interest of each
    prefix in the given makefiles, and selects the BAM index format to use.

    For every prefix, "IndexFormat" is set to ".bai", or to ".csi" if the
    longest sequence in the FASTA file exceeds _BAM_MAX_SEQUENCE_LENGTH. A
    FASTA path that has been fully validated is not checked again; later
    prefixes using the same path reuse the previously selected index format.
    A missing FASTA file only results in a warning.

    The contigs are also passed to _do_validate_hg_prefix, the implementation
    of the GATK checks for the human genome; that check is requested to be
    fatal only if the "RealignedBAM" feature is enabled for the makefile.

    Raises MakefileError if the FASTA file cannot be indexed, or if a
    regions-of-interest BED file cannot be read.
    """
    checked = {}
    print_info("  - Validating prefixes ...")
    for makefile in makefiles:
        gatk_enabled = makefile["Options"]["Features"]["RealignedBAM"]
        for prefix in makefile["Prefixes"].itervalues():
            fasta_path = prefix["Path"]
            if fasta_path in checked:
                # Same FASTA as an earlier prefix; reuse the earlier result
                prefix["IndexFormat"] = checked[fasta_path]["IndexFormat"]
                continue

            # A valid value is required, even if the FASTA file is missing
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(fasta_path):
                print_warn("    - Reference FASTA file does not exist:\n"
                           "      %r" % (fasta_path,))
                continue

            if not os.path.exists(fasta_path + ".fai"):
                print_info("    - Index does not exist for %r; this may "
                           "take a while ..." % (fasta_path,))

            try:
                contigs = FASTA.index_and_collect_contigs(fasta_path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            # Implementation of GATK checks for the human genome; must be
            # given the contigs as returned above, before conversion to a dict
            _do_validate_hg_prefix(makefile, prefix, contigs,
                                   fatal=gatk_enabled)

            contigs = dict(contigs)
            rois = prefix.get("RegionsOfInterest", {})
            for (roi_name, roi_path) in rois.iteritems():
                try:
                    # read_bed_file returns an iterator, which must be
                    # exhausted for every record to be read
                    records = bedtools.read_bed_file(roi_path,
                                                     contigs=contigs)
                    for _record in records:
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s"
                                        % (roi_name, prefix["Name"], error))

            longest = max(contigs.itervalues())
            if longest > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn("    - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files."
                           % (fasta_path, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            checked[fasta_path] = prefix
Example #4
0
def _validate_prefixes(makefiles):
    """Checks the reference FASTA and regions-of-interest files of all
    prefixes in the makefiles, and picks the BAM index format per prefix.

    The "IndexFormat" of each prefix is set to ".bai", unless the longest
    sequence in its FASTA file exceeds _BAM_MAX_SEQUENCE_LENGTH, in which
    case it is set to ".csi". Once a FASTA path has been fully validated,
    other prefixes with the same path simply copy its index format. If the
    FASTA file does not exist, a warning is printed and the prefix is
    skipped.

    Contigs are furthermore handed to _do_validate_hg_prefix, the
    implementation of the GATK checks for the human genome, with 'fatal' set
    to the value of the makefile's "RealignedBAM" feature.

    Raises MakefileError on failure to index the FASTA file or to read a
    regions-of-interest BED file.
    """
    validated_prefixes = {}
    print_info("  - Validating prefixes ...")
    for makefile in makefiles:
        is_fatal = makefile["Options"]["Features"]["RealignedBAM"]
        for prefix in makefile["Prefixes"].itervalues():
            reference = prefix["Path"]
            if reference in validated_prefixes:
                validated = validated_prefixes[reference]
                prefix["IndexFormat"] = validated["IndexFormat"]
                continue

            # Default that remains in place if the FASTA file is missing
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(reference):
                print_warn("    - Reference FASTA file does not exist:\n"
                           "      %r" % (reference,))
                continue

            if not os.path.exists(reference + ".fai"):
                print_info("    - Index does not exist for %r; this may "
                           "take a while ..." % (reference,))

            try:
                contigs = FASTA.index_and_collect_contigs(reference)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            # GATK checks for the human genome; called with the contigs in
            # the form returned above, prior to the conversion to a dict
            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=is_fatal)

            contigs = dict(contigs)
            bed_files = prefix.get("RegionsOfInterest", {})
            for (bed_name, bed_path) in bed_files.iteritems():
                try:
                    # The reader is an iterator; looping over it is what
                    # causes the records to actually be read
                    reader = bedtools.read_bed_file(bed_path, contigs=contigs)
                    for _unused in reader:
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s"
                                        % (bed_name, prefix["Name"], error))

            max_length = max(contigs.itervalues())
            if max_length > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn("    - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files."
                           % (reference, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            validated_prefixes[reference] = prefix
Example #5
0
def collect_bed_regions(filename):
    """Reads the BED file 'filename' and returns a list containing the
    frozen form (record.freeze()) of every record in the file.

    Records for which len(record) is less than 4 are named after their
    contig, with a trailing '*' appended.
    """
    regions = []
    # Maps each contig / name string to the first equal string encountered,
    # so that records with equal contigs / names share a single string object
    name_cache = {}
    for record in read_bed_file(filename):
        if len(record) < 4:
            record.name = "%s*" % (record.contig,)

        # setdefault (rather than get) is required for the cache to actually
        # be populated; with get the cache would always remain empty
        record.contig = name_cache.setdefault(record.contig, record.contig)
        record.name = name_cache.setdefault(record.name, record.name)

        regions.append(record.freeze())

    return regions
Example #6
0
def collect_bed_regions(filename):
    """Reads the BED file 'filename' and returns a list containing the
    frozen form (record.freeze()) of every record in the file.

    Records for which len(record) is less than 4 are named after their
    contig, with a trailing '*' appended.
    """
    regions = []
    # Maps each contig / name string to the first equal string encountered,
    # so that records with equal contigs / names share a single string object
    name_cache = {}
    for record in read_bed_file(filename):
        if len(record) < 4:
            record.name = "%s*" % (record.contig, )

        # setdefault (rather than get) is required for the cache to actually
        # be populated; with get the cache would always remain empty
        record.contig = name_cache.setdefault(record.contig, record.contig)
        record.name = name_cache.setdefault(record.name, record.name)

        regions.append(record.freeze())

    return regions
Example #7
0
def collect_regions(bedfile, bam_input_handle):
    """Returns the regions that are to be genotyped / pileup'd.

    If 'bedfile' is None, one (contig, start, end) tuple is returned for each
    reference sequence in 'bam_input_handle', with start set to 0 and end set
    to the length of the sequence. Otherwise the records in the BED file are
    read, sorted using sort_bed_by_bamfile, and the result of
    merge_bed_regions is returned.
    """
    if bedfile is None:
        # No BED file; every reference sequence is covered in its entirety
        return [(contig, 0, size)
                for (contig, size) in zip(bam_input_handle.references,
                                          bam_input_handle.lengths)]

    bed_records = list(read_bed_file(bedfile))
    # Sorts the list in place; the return value is not used
    sort_bed_by_bamfile(bam_input_handle, bed_records)
    return merge_bed_regions(bed_records)
Example #8
0
def collect_regions(bedfile, bam_input_handle):
    """Collects the regions to be genotyped / pileup'd.

    With a BED file, the records are read into a list, sorted by calling
    sort_bed_by_bamfile with the BAM handle, and then passed through
    merge_bed_regions. Without one ('bedfile' is None), a list of tuples
    (name, 0, length) is built from the references and lengths of
    'bam_input_handle', one tuple per reference sequence.
    """
    if bedfile is None:
        references = bam_input_handle.references
        lengths = bam_input_handle.lengths
        # One whole-sequence region per reference in the BAM file
        return [(name, 0, length) for (name, length) in zip(references, lengths)]

    records = list(read_bed_file(bedfile))
    # The list is sorted in place before merging
    sort_bed_by_bamfile(bam_input_handle, records)
    merged = merge_bed_regions(records)
    return merged
Example #9
0
def _validate_prefixes(makefiles):
    logger = logging.getLogger(__name__)
    already_validated = {}
    logger.info("Validating FASTA files")
    for makefile in makefiles:
        for prefix in makefile["Prefixes"].values():
            path = prefix["Path"]
            if path in already_validated:
                prefix["IndexFormat"] = already_validated[path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                logger.warn("Reference FASTA file does not exist: %r", path)
                continue
            elif not os.path.exists(path + ".fai"):
                logger.info("Indexing FASTA at %r", path)

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error, ))

            regions_of_interest = prefix.get("RegionsOfInterest", {})
            for (name, fpath) in regions_of_interest.items():
                try:
                    # read_bed_file returns iterator
                    for _ in bedtools.read_bed_file(fpath, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s" %
                                        (name, prefix["Name"], error))

            if max(contigs.values()) > _BAM_MAX_SEQUENCE_LENGTH:
                logger.warn(
                    "FASTA file %r contains sequences longer "
                    "than %i! CSI index files will be used instead "
                    "of BAI index files.",
                    path,
                    _BAM_MAX_SEQUENCE_LENGTH,
                )
                prefix["IndexFormat"] = ".csi"

            already_validated[path] = prefix
Example #10
0
    def _run(self, config, temp):
        """Writes the records of the input BED file to the output file, with
        each record extended by 'self._amount' in both directions.

        Contig lengths are read from the first two tab-separated columns of
        'self._fai_file'. The start of each record is not allowed to drop
        below 0 and the end is not allowed to exceed the length of the
        contig. The output is written to the path returned by reroot_path
        for 'temp' and 'self._outfile'.

        Raises NodeError if the same contig name occurs more than once in
        the .fai file.
        """
        lengths = {}
        with open(self._fai_file) as fai_handle:
            for fai_line in fai_handle:
                # Only the first two columns are used; the rest is discarded
                contig, size, _rest = fai_line.split('\t', 2)
                if contig in lengths:
                    message = ('Reference genome contains multiple '
                               'identically named contigs (%r)!')
                    raise NodeError(message % (contig,))

                lengths[contig] = int(size)

        with open(reroot_path(temp, self._outfile), 'w') as out_handle:
            for bed in read_bed_file(self._infile, contigs=lengths):
                contig_length = lengths[bed.contig]
                # Extend in both directions, clamped to the contig bounds
                bed.start = max(0, bed.start - self._amount)
                bed.end = min(bed.end + self._amount, contig_length)

                out_handle.write('%s\n' % (bed,))
Example #11
0
    def _run(self, config, temp):
        """Pads every record in the input BED file by 'self._amount' on both
        sides, and writes the padded records to the output file.

        The length of each contig is taken from 'self._fai_file' (first
        column is the name, second column is the length). Padded coordinates
        are limited to the range from 0 to the contig length. Records are
        written, one per line, to the path returned by reroot_path for
        'temp' and 'self._outfile'.

        Raises NodeError if a contig name is listed more than once in the
        .fai file.
        """
        contig_sizes = {}
        with open(self._fai_file) as index_file:
            for row in index_file:
                # Columns following the name and length are not needed
                seq_name, seq_length, _ignored = row.split('\t', 2)
                if seq_name in contig_sizes:
                    raise NodeError(
                        'Reference genome contains multiple '
                        'identically named contigs (%r)!' % (seq_name,)
                    )

                contig_sizes[seq_name] = int(seq_length)

        with open(reroot_path(temp, self._outfile), 'w') as output_file:
            for region in read_bed_file(self._infile, contigs=contig_sizes):
                upper_bound = contig_sizes[region.contig]
                # Neither coordinate may fall outside of the contig
                region.start = max(0, region.start - self._amount)
                region.end = min(region.end + self._amount, upper_bound)

                output_file.write('%s\n' % (region,))
Example #12
0
    def _run(self, config, temp):
        """Pads the records of the input BED file by 'self._amount' using
        pad_bed_records, and writes the records produced by
        merge_bed_records to the output file.

        Contig lengths are read from the first two tab-separated columns of
        'self._fai_file' and are passed to pad_bed_records as the maximum
        sizes. The output is written, one record per line, to the path
        returned by reroot_path for 'temp' and 'self._outfile'.

        Raises NodeError if the same contig name occurs more than once in
        the .fai file.
        """
        lengths = {}
        with open(self._fai_file) as fai_handle:
            for fai_line in fai_handle:
                # Only the first two columns are used; the rest is discarded
                contig, size, _rest = fai_line.split("\t", 2)
                if contig in lengths:
                    message = ("Reference genome contains multiple "
                               "identically named contigs (%r)!")
                    raise NodeError(message % (contig,))

                lengths[contig] = int(size)

        with open(reroot_path(temp, self._outfile), "w") as out_handle:
            # All records are loaded into a list before padding and merging
            bed_records = list(read_bed_file(self._infile, contigs=lengths))
            pad_bed_records(
                records=bed_records,
                padding=self._amount,
                max_sizes=lengths,
            )

            for merged in merge_bed_records(bed_records):
                out_handle.write("%s\n" % (merged,))