Esempio n. 1
0
def _collect_sequence_names(bed_file, fasta_file, min_columns=6):
    """Return the names of the regions listed in a BED file.

    The records of `bed_file` are read using `read_bed_file`, which is given
    the contigs collected from `fasta_file` by `_collect_fasta_contigs`.
    Records sharing a name must all be located on the same contig and on the
    same strand as the first record carrying that name.

    bed_file -- passed to `read_bed_file`; also quoted in error messages.
    fasta_file -- passed to `_collect_fasta_contigs`.
    min_columns -- forwarded to `read_bed_file`; defaults to 6.
        NOTE(review): `record.name` and `record.strand` are read below, so
        lower values are presumably not usable -- confirm against
        `read_bed_file`.

    Returns a frozenset of record names. Raises MakefileError if records with
    the same name differ in contig or in strand.
    """
    contigs = _collect_fasta_contigs(fasta_file)
    # Maps a region name to the (contig, strand) of its first record
    sequences = {}

    # The caller's min_columns is forwarded rather than a hard-coded 6
    records = read_bed_file(bed_file,
                            min_columns=min_columns,
                            contigs=contigs)
    for record in records:
        current = (record.contig, record.strand)
        # Returns the first (contig, strand) observed for this name, which is
        # `current` itself if the name has not been seen before
        reference = sequences.setdefault(record.name, current)

        if current[0] != reference[0]:
            raise MakefileError("Regions in %r with the same name (%r) "
                                "are located on different contigs (%r and "
                                "%r); note that PALEOMIX assumes that "
                                "regions with the same name constitute "
                                "parts of a single consecutive sequence, "
                                "which must therefore be located on one "
                                "strand of a single sequence. Please "
                                "rename one or more of these regions to "
                                "continue." % (bed_file, record.name,
                                               current[0], reference[0]))
        elif current[1] != reference[1]:
            raise MakefileError("Regions in %r with the same name (%r) "
                                "are located on different strands; note "
                                "that PALEOMIX assumes that regions with "
                                "the same name constitute parts of a "
                                "single consecutive sequence, and that "
                                "these must therefore be located on the "
                                "same strand." % (bed_file, record.name,))

    return frozenset(sequences)
Esempio n. 2
0
def _mangle_prefixes(makefile):
    records = []
    for (name, values) in makefile.get("Prefixes", {}).items():
        if "*" in name[:-1]:
            raise MakefileError("The character '*' is not allowed in Prefix "
                                "names; if you wish to select multiple .fasta "
                                "files using a search-string, then use the "
                                "prefix name '%s*' instead and specify the "
                                "wildcards in the 'Path'." %
                                (name.replace("*", "")))
        elif name.endswith("*"):
            records.extend(_glob_prefixes(values, values["Path"]))

        else:
            records.append((name, values))

    prefixes = {}
    for (name, record) in records:
        if name in prefixes:
            raise MakefileError("Multiple prefixes with the same name: %s" %
                                name)

        if not record["Path"].endswith(".fasta"):
            raise MakefileError("Path for prefix %r does not end with "
                                ".fasta:\n   %r" % (name, record["Path"]))

        record["Name"] = name
        record["Reference"] = record["Path"]
        prefixes[name] = record

    if not prefixes:
        raise MakefileError("At least one prefix must be specified")

    makefile["Prefixes"] = prefixes
Esempio n. 3
0
    def _collect_subsets(roi, subset, path):
        """Load the names listed in a subset file and record them for `roi`.

        The subset file is the region's BED path with its extension swapped
        for "<subset>.names" (via `swap_ext`). Blank lines and lines starting
        with '#' are ignored. Every name must be among the sequences stored
        under the `None` key of the region's "Sequences"; the file and the
        names are then stored under `subset` in the region's "SubsetFiles"
        and "Sequences".

        Raises MakefileError if `roi` is unknown (`path` is quoted in that
        message), if the file does not exist, or if it lists unknown names.
        """
        if roi not in subsets_by_regions:
            raise MakefileError("Subset of unknown region (%r) requested at %r"
                                % (roi, path))
        region = subsets_by_regions[roi]

        names_path = swap_ext(region["BED"], subset + ".names")
        if not os.path.isfile(names_path):
            raise MakefileError("Subset file does not exist for Regions Of "
                                "Interest:\n  Region = %r\n  Subset = %r\n"
                                "  Path   = %r"
                                % (roi, subset, names_path))

        with open(names_path) as handle:
            stripped = (text.strip() for text in handle)
            names = set(text for text in stripped
                        if text and not text.startswith("#"))

        unknown = sorted(names - region["Sequences"][None])
        if unknown:
            header = ("Unknown sequences in subset file:\n"
                      "  File   = %r\n  Region = %r\n  Subset = %r\n"
                      "  Unknown sequence names =") \
                      % (names_path, roi, subset)
            # At most five names are listed, to keep the message readable
            if len(unknown) > 5:
                unknown = unknown[:5] + ["..."]
            raise MakefileError("\n    - ".join([header] + unknown))

        region["SubsetFiles"][subset] = (names_path,)
        region["Sequences"][subset] = frozenset(names)
Esempio n. 4
0
def _determine_lane_type(prefixes, data, path):
    if isinstance(data, types.StringTypes):
        return "Raw"
    elif isinstance(data, types.DictType):
        if all((key in _READ_TYPES) for key in data):
            for (key, files) in data.iteritems():
                is_paired = paths.is_paired_end(files)

                if is_paired and (key != "Paired"):
                    raise MakefileError("Error at Barcode level; Path "
                                        "includes {Pair} key, but read-type "
                                        "is not Paired:\n    %r" %
                                        (" :: ".join(path + (key, )), ))
                elif not is_paired and (key == "Paired"):
                    raise MakefileError(
                        "Error at Barcode level; Paired pre-"
                        "trimmed reads specified, but path "
                        "does not contain {Pair} key:\n    %r" %
                        (" :: ".join(path + (key, )), ))

            return "Trimmed"

    raise MakefileError("Error at Barcode level; keys must either be "
                        "prefix-names, OR 'Paired', 'Single', 'Collapsed', "
                        "'CollapsedTruncated', or 'Singleton'. "
                        "Found: %s" % (", ".join(data), ))
Esempio n. 5
0
def _update_filtering(mkfile):
    samples = mkfile["Project"]["Samples"]
    groups  = mkfile["Project"]["Groups"]

    filtering = {}
    for (target, filter_by) in mkfile["Project"]["FilterSingletons"].iteritems():
        if target.startswith("<") and target.endswith(">"):
            raise MakefileError("Singleton-filtering must be specified per "
                                "sample, not by groups: %r" % (target,))
        elif target not in samples:
            raise MakefileError("Unknown/Invalid sample specifed for singleton filtering: %r" % (target,))
        elif target in filter_by:
            raise MakefileError("Attempting to filter singleton in sample using itself as comparison: %r" % (target,))

        path = "Project:FilterSingletons:%s" % (target,)
        filtering[target] = _select_samples(filter_by, groups, samples, path)

        # Implicit inclusion is allowed, since that is useful in some cases,
        # where we want to filter a sample based on the group it is a member of
        if target in filtering[target]:
            # The target itself must be excluded, as including it is invalid
            filtering[target] = filtering[target] - set((target,))
            print_warn("Warning: Sample %r is singleton-filtered using a "
                       "group it is also a member of; this may be by mistake."
                       % (target,))

        if not filtering[target]:
            raise MakefileError("No samples specified by which to "
                                "singleton-filter by for %r" % (target,))

    mkfile["Project"]["FilterSingletons"] = filtering
Esempio n. 6
0
def _validate_prefixes(makefiles):
    """Validates prefixes and regions-of-interest, including an implementation
    of the checks included in GATK, which require that the FASTA for the human
    genome is ordered 1 .. 23. This is required since GATK will not run with
    human genomes in a different order.

    Each prefix is assigned an "IndexFormat" of ".bai", or of ".csi" if the
    FASTA contains a sequence longer than _BAM_MAX_SEQUENCE_LENGTH. A warning
    is printed for FASTA files that do not exist, and these are otherwise
    skipped. Paths that have been validated are not validated again. The
    human genome check is fatal only if the "RealignedBAM" feature is enabled
    for the makefile.

    Raises MakefileError if a FASTA file cannot be indexed or if a
    regions-of-interest file cannot be read.
    """
    already_validated = {}
    print_info("  - Validating prefixes ...")
    for makefile in makefiles:
        uses_gatk = makefile["Options"]["Features"]["RealignedBAM"]
        for prefix in makefile["Prefixes"].values():
            path = prefix["Path"]
            if path in already_validated:
                prefix["IndexFormat"] = already_validated[path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                print_warn("    - Reference FASTA file does not exist:\n"
                           "      %r" % (path, ))
                continue
            elif not os.path.exists(path + ".fai"):
                print_info("    - Index does not exist for %r; this may "
                           "take a while ..." % (path, ))

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            # 'except X as e' is valid on Python 2.6+ and on Python 3,
            # unlike the Python 2 only 'except X, e' form
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error, ))

            # Implementation of GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=uses_gatk)

            contigs = dict(contigs)
            regions_of_interest = prefix.get("RegionsOfInterest", {})
            for (name, fpath) in regions_of_interest.items():
                try:
                    # read_bed_file returns iterator
                    for _ in bedtools.read_bed_file(fpath, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s" %
                                        (name, prefix["Name"], error))

            if max(contigs.values()) > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn("    - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files." %
                           (path, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            already_validated[path] = prefix
Esempio n. 7
0
def _select_samples(select, groups, samples, path):
    selection = set()
    for group in select:
        if group.startswith("<") and group.endswith(">"):
            key = tuple(group[1:-1].split("/"))
            if key not in groups:
                raise MakefileError("Unknown group specifed for filtering %r: %r" % (path, key))
            selection.update(groups[key])
        elif group in samples:
            selection.add(group)
        else:
            raise MakefileError("Unknown/Invalid group specifed for filtering %r: %r" % (path, group))
    return selection
Esempio n. 8
0
def _validate_makefiles_duplicate_files(makefiles):
    filenames = collections.defaultdict(list)
    for makefile in makefiles:
        iterator = _iterate_over_records(makefile)
        for (target, sample, library, barcode, record) in iterator:
            for realpath in map(os.path.realpath, record["Data"].values()):
                filenames[realpath].append((target, sample, library, barcode))

    has_overlap = {}
    for (filename, records) in filenames.items():
        if len(records) > 1:
            has_overlap[filename] = list(set(records))

    logger = logging.getLogger(__name__)
    by_records = sorted(
        zip(list(has_overlap.values()), list(has_overlap.keys())))
    for (records, pairs) in itertools.groupby(by_records, lambda x: x[0]):
        pairs = list(pairs)
        description = _describe_files_in_multiple_records(records, pairs)

        if len(set(record[0] for record in records)) != len(records):
            message = "Path included multiple times in target:\n"
            raise MakefileError(message + description)
        else:
            logger.warn("WARNING: Path included in multiple targets:\n%s",
                        description)
Esempio n. 9
0
def _do_validate_hg_prefix(makefile, prefix, contigs, fatal):
    """Report prefixes for which `_is_invalid_hg_prefix(contigs)` is true.

    Nothing is done unless `_is_invalid_hg_prefix` returns a true value for
    `contigs`. Otherwise a message naming the makefile (taken from
    makefile["Statistics"]["Filename"]) and the prefix path is either raised
    as a MakefileError, if `fatal` is true, or printed as a warning.
    """
    if _is_invalid_hg_prefix(contigs):
        template = (
            "Prefix appears to be a human genome, but chromosomes are ordered\n"
            "lexically (chr1, chr10, chr11, ...), rather than numerically\n"
            "(chr1, chr2, chr3, ...):\n\n"
            "  Makefile = %s\n"
            "  Prefix   = %s\n\n"
            "GATK requires that human chromosomes are ordered numerically;\n%s\n"
            "See the documentation at the GATK website for more information:\n  "
            "http://www.broadinstitute.org/gatk/guide/article?id=1204\n"
        )

        prefix_path = prefix["Path"]
        mkfile_path = makefile["Statistics"]["Filename"]

        if not fatal:
            details = \
                "You will not be able to use the resulting BAM file with GATK."
            warning = template % (mkfile_path, prefix_path, details)
            print_warn("\nWARNING:\n", warning, sep="")
            return

        details = "Either disable GATK in the makefile, or fix the prefix."
        raise MakefileError(template % (mkfile_path, prefix_path, details))
Esempio n. 10
0
def _update_and_check_max_read_depth(options, mkfile):
    if any(subdd["VCF_Filter"]["MaxReadDepth"] == "auto"
           for subdd in mkfile["Genotyping"].itervalues()):
        print_info("    - Determinining max-depth from depth-histograms ...")

    for (key, settings) in mkfile["Genotyping"].iteritems():
        required_keys = set()
        for sample in mkfile["Project"]["Samples"].itervalues():
            if sample["GenotypingMethod"].lower() == "samtools":
                required_keys.add(sample["Name"])

        max_depths = settings["VCF_Filter"]["MaxReadDepth"]
        if isinstance(max_depths, types.DictType):
            # Extra keys are allowed, to make it easier
            # to temporarily disable a sample
            missing_keys = required_keys - set(max_depths)
            if missing_keys:
                missing_keys = "\n    - ".join(sorted(missing_keys))
                message = "MaxReadDepth not specified for the following " \
                          "samples for %r:\n    - %s" % (key, missing_keys)
                raise MakefileError(message)

        elif isinstance(max_depths, types.StringTypes):
            assert max_depths.lower() == "auto", max_depths
            prefix = mkfile["Project"]["Regions"][key]["Prefix"]

            settings["VCF_Filter"]["MaxReadDepth"] \
                = _read_max_depths(options, prefix, required_keys)
        else:
            max_depths = dict.fromkeys(required_keys, max_depths)
            settings["VCF_Filter"]["MaxReadDepth"] = max_depths
Esempio n. 11
0
def _raise_missing_files(description, path, template):
    """Raise a MakefileError stating that no files were found for a path.

    description -- the kind of reads, inserted before "reads" in the message.
    path -- sequence of strings, joined with " :: " to give the location in
        the makefile.
    template -- the path for which no files were found.
    """
    location = " :: ".join(path)
    message = (
        "No files found for %s reads using path %r; "
        "specified in makefile at %r. Please verify that the "
        "path is correct, and update the makefile!"
    )

    raise MakefileError(message % (description, template, location))
Esempio n. 12
0
def _mangle_lanes(makefile):
    formatter = string.Formatter()
    prefixes = makefile["Prefixes"]
    for (target_name, samples) in makefile["Targets"].iteritems():
        for (sample_name, libraries) in samples.iteritems():
            for (library_name, lanes) in libraries.iteritems():
                options = lanes.pop("Options")

                for (lane, data) in lanes.iteritems():
                    path = (target_name, sample_name, library_name, lane)

                    _validate_lane_paths(data, path, formatter)

                    lane_type = _determine_lane_type(prefixes, data, path)

                    if lane_type == "Trimmed" and \
                            options["QualityOffset"] == "Solexa":
                        path = " :: ".join(
                            (target_name, sample_name, library_name, lane))

                        raise MakefileError("Pre-trimmed Solexa data is not "
                                            "supported; please convert the "
                                            "quality scores to Phred (offset "
                                            "33 or 64) to continue:\n"
                                            "    Path = %s" % (path, ))

                    lanes[lane] = {
                        "Type": lane_type,
                        "Data": data,
                        "Options": options
                    }
Esempio n. 13
0
def _validate_makefiles_duplicate_files(makefiles):
    filenames = collections.defaultdict(list)
    for makefile in makefiles:
        iterator = _iterate_over_records(makefile)
        for (target, sample, library, barcode, record) in iterator:
            current_filenames = []
            if record["Type"] == "Raw":
                for raw_filenames in record["Data"].itervalues():
                    current_filenames.extend(raw_filenames)
            else:
                current_filenames.extend(record["Data"].values())

            for realpath in map(os.path.realpath, current_filenames):
                filenames[realpath].append((target, sample, library, barcode))

    has_overlap = {}
    for (filename, records) in filenames.iteritems():
        if len(records) > 1:
            has_overlap[filename] = list(set(records))

    by_records = sorted(zip(has_overlap.values(), has_overlap.keys()))
    for (records, pairs) in itertools.groupby(by_records, lambda x: x[0]):
        pairs = list(pairs)
        description = _describe_files_in_multiple_records(records, pairs)

        if len(set(record[0] for record in records)) != len(records):
            message = "Path included multiple times in target:\n"
            raise MakefileError(message + description)
        else:
            print_warn("WARNING: Path included in multiple targets:\n%s\n" %
                       (description, ))
Esempio n. 14
0
def collect_files(path, template):
    """Collect the filenames described by a path template.

    Templates for which `is_paired_end` is true yield a dict with the keys
    "PE_1" and "PE_2", obtained by formatting the template with Pair=1 and
    Pair=2; other templates yield a dict with the single key "SE". Templates
    for which `_has_glob_magic` is true are expanded using `_sorted_glob`,
    while other templates are used as they are.

    `path` is the location in the makefile, and is used in error messages.
    A MakefileError is raised if an expanded template matches no files, or
    if the numbers of mate 1 and mate 2 files differ.
    """
    if not is_paired_end(template):
        if not _has_glob_magic(template):
            return {"SE": [template]}

        matches = _sorted_glob(template)
        if not matches:
            _raise_missing_files("single-end", path, template)

        return {"SE": matches}

    if not _has_glob_magic(template):
        return {
            "PE_1": [template.format(Pair=1)],
            "PE_2": [template.format(Pair=2)],
        }

    mate_1 = _sorted_glob(template.format(Pair=1))
    mate_2 = _sorted_glob(template.format(Pair=2))

    if not mate_1 and not mate_2:
        _raise_missing_files("paired-end", path, template)
    elif len(mate_1) != len(mate_2):
        raise MakefileError(
            "Unequal number of mate 1 and mate 2 "
            "files found at path %r; found %i mate 1 "
            "files, and %i mate 2 files; specified in "
            "makefile at %r. Please verify that the "
            "path is correct, and update the makefile!" %
            (template, len(mate_1), len(mate_2), " :: ".join(path)))

    return {"PE_1": mate_1, "PE_2": mate_2}
Esempio n. 15
0
def _update_regions(options, mkfile):
    """Move "RegionsOfInterest" to "Regions" and fill in derived values.

    For every region the "Name", "Desc" ("<Prefix>.<Name>"), "BED" and
    "FASTA" paths are set, the latter two being located in
    `options.regions_root` and `options.prefix_root`. Both files must exist.
    The names collected by `_collect_sequence_names` are stored under the
    `None` key of "Sequences", "SubsetFiles" is initialized, and "Genotypes"
    is set to map every sample to a FASTA path below `options.destination`.

    Raises MakefileError if no regions are specified, if a region has no
    "Prefix", if a required file is missing, or if the BED file cannot be
    read.
    """
    logging.getLogger(__name__).info("Validating regions of interest")

    project = mkfile["Project"]
    regions = project["Regions"] = project.pop("RegionsOfInterest")
    if not regions:
        raise MakefileError("No regions of interest have been specified; "
                            "no analyses will be performed.")

    for (name, region) in regions.items():
        if "Prefix" not in region:
            raise MakefileError("No genome specified for regions %r" %
                                (name, ))

        region["Name"] = name
        region["Desc"] = "{Prefix}.{Name}".format(**region)
        region["BED"] = os.path.join(options.regions_root,
                                     region["Desc"] + ".bed")
        region["FASTA"] = os.path.join(options.prefix_root,
                                       region["Prefix"] + ".fasta")

        for (desc, path) in (("Regions file", region["BED"]),
                             ("Reference sequence", region["FASTA"])):
            if not os.path.isfile(path):
                raise MakefileError("%s does not exist for %r:\n  Path = %r" %
                                    (desc, name, path))

        # Collects seq. names / validate regions
        try:
            names = _collect_sequence_names(bed_file=region["BED"],
                                            fasta_file=region["FASTA"])
        except (IOError, BEDError) as error:
            raise MakefileError("Error reading regions-of-interest %r:\n%s" %
                                (name, error))

        region["Sequences"] = {None: names}
        region["SubsetFiles"] = {None: ()}

        genotypes = region["Genotypes"] = {}
        for sample_name in project["Samples"]:
            filename = ".".join((sample_name, region["Desc"], "fasta"))
            genotypes[sample_name] = os.path.join(options.destination,
                                                  project["Title"],
                                                  "genotypes", filename)
Esempio n. 16
0
def _validate_prefixes(makefiles):
    logger = logging.getLogger(__name__)
    already_validated = {}
    logger.info("Validating FASTA files")
    for makefile in makefiles:
        for prefix in makefile["Prefixes"].values():
            path = prefix["Path"]
            if path in already_validated:
                prefix["IndexFormat"] = already_validated[path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                logger.warn("Reference FASTA file does not exist: %r", path)
                continue
            elif not os.path.exists(path + ".fai"):
                logger.info("Indexing FASTA at %r", path)

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error, ))

            regions_of_interest = prefix.get("RegionsOfInterest", {})
            for (name, fpath) in regions_of_interest.items():
                try:
                    # read_bed_file returns iterator
                    for _ in bedtools.read_bed_file(fpath, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s" %
                                        (name, prefix["Name"], error))

            if max(contigs.values()) > _BAM_MAX_SEQUENCE_LENGTH:
                logger.warn(
                    "FASTA file %r contains sequences longer "
                    "than %i! CSI index files will be used instead "
                    "of BAI index files.",
                    path,
                    _BAM_MAX_SEQUENCE_LENGTH,
                )
                prefix["IndexFormat"] = ".csi"

            already_validated[path] = prefix
Esempio n. 17
0
def _check_sexes(mkfile):
    """Validate the sexes specified for regions and for samples.

    All regions must list the same sexes as keys of "HomozygousContigs", and
    at least one sex must be listed. Every sample must specify a "Sex" found
    among these; the deprecated "Gender" key is accepted in place of "Sex",
    in which case it is renamed, but the two may not be combined. A warning
    is printed for contigs listed in "HomozygousContigs" that are not among
    those collected from the FASTA files of the regions.

    Raises MakefileError if any of the above requirements are not met.
    """
    all_contigs = set()
    contigs_sexes = set()
    regions_sexes = set()
    # .values() is used since .itervalues() does not exist on Python 3
    for regions in mkfile["Project"]["Regions"].values():
        all_contigs.update(_collect_fasta_contigs(regions["FASTA"]))

        for contigs in regions["HomozygousContigs"].values():
            contigs_sexes.update(contigs)

        current_sexes = set(regions["HomozygousContigs"])
        if not regions_sexes:
            regions_sexes = current_sexes
        elif regions_sexes != current_sexes:
            raise MakefileError("List of sexes for regions %r does not "
                                "match other regions" % (regions["Name"],))

    if not regions_sexes:
        raise MakefileError("No sexes have been specified in makefile; "
                            "please list all sample sexes and assosiated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].values():
        if sample.get("Sex") is None:
            if sample.get("Gender") is None:
                # 1-tuples ensure that the name is formatted as a single
                # value, regardless of its type
                raise MakefileError("Please specify a sex for sample %r, or "
                                    "'NA' if not applicable."
                                    % (sample["Name"],))

            sample["Sex"] = sample.pop("Gender")
        elif sample.get("Gender") is not None:
            raise MakefileError("Both a Sex and a Gender has been specified "
                                "sample %r; the Gender field is deprecated, "
                                "please only use the Sex field."
                                % (sample["Name"],))

        if sample["Sex"] not in regions_sexes:
            sexes = ", ".join(map(repr, regions_sexes))
            message = "Sample %r has unknown sex %r; known sexes are %s" \
                % (sample["Name"], sample["Sex"], sexes)
            raise MakefileError(message)

    unknown_contigs = contigs_sexes - all_contigs
    if unknown_contigs:
        # Sorted so that the order of the listed contigs is deterministic
        print_warn("WARNING: Unknown contig(s) in 'HomozygousContigs':\n"
                   "    - " + "\n    - ".join(sorted(unknown_contigs)))
        print_warn("Please verify that the list(s) of contigs is correct!")
Esempio n. 18
0
def _check_indels_and_msa(mkfile):
    msa     = mkfile["MultipleSequenceAlignment"]
    regions = mkfile["Project"]["Regions"]
    for (name, subdd) in regions.iteritems():
        msa_enabled = msa[name]["Enabled"]

        if subdd["IncludeIndels"] and not msa_enabled:
            raise MakefileError("Regions %r includes indels, but MSA is disabled!" % (name,))
Esempio n. 19
0
def _update_subsets(mkfile, steps):
    subsets_by_regions = mkfile["Project"]["Regions"]

    def _collect_subsets(roi, subset, path):
        if roi not in subsets_by_regions:
            raise MakefileError("Subset of unknown region (%r) requested at %r"
                                % (roi, path))

        roi_fname = swap_ext(subsets_by_regions[roi]["BED"], subset + ".names")
        if not os.path.isfile(roi_fname):
            raise MakefileError("Subset file does not exist for Regions Of "
                                "Interest:\n  Region = %r\n  Subset = %r\n"
                                "  Path   = %r"
                                % (roi, subset, roi_fname))

        sequences = set()
        with open(roi_fname) as handle:
            for line in handle:
                line = line.strip()
                if line and not line.startswith("#"):
                    sequences.add(line)

        known_seqs = subsets_by_regions[roi]["Sequences"][None]
        unknown_seqs = sequences - known_seqs
        if unknown_seqs:
            message = ("Unknown sequences in subset file:\n"
                       "  File   = %r\n  Region = %r\n  Subset = %r\n"
                       "  Unknown sequence names =") \
                       % (roi_fname, roi, subset)
            unknown_seqs = list(sorted(unknown_seqs))
            if len(unknown_seqs) > 5:
                unknown_seqs = unknown_seqs[:5] + ["..."]
            message = "\n    - ".join([message] + unknown_seqs)
            raise MakefileError(message)

        subsets_by_regions[roi]["SubsetFiles"][subset] = (roi_fname,)
        subsets_by_regions[roi]["Sequences"][subset] = frozenset(sequences)

    if "phylogeny:examl" in steps:
        for (key, subdd) in mkfile["PhylogeneticInference"].iteritems():
            for (subkey, roidd) in subdd["RegionsOfInterest"].iteritems():
                if subkey not in subsets_by_regions:
                    message = \
                        "Unknown regions name in phylogenetic inference:\n" \
                        "\tPath = PhylogeneticInference:%s:RegionsOfInterest" \
                        "\n\tName = %s"
                    raise MakefileError(message % (key, subkey))

                roidd["Name"] = subkey

                if roidd.get("SubsetRegions") is not None:
                    path = "PhylogeneticInference:%s:RegionsOfInterest:%s" % (key, subkey)
                    _collect_subsets(subkey, roidd["SubsetRegions"], path)

    if "paml:codeml" in steps:
        for (roi, subset) in mkfile["PAML"]["codeml"]["SubsetRegions"].iteritems():
            _collect_subsets(roi, subset, "PAML:codeml:SubsetRegions")
Esempio n. 20
0
def _validate_makefiles_duplicate_targets(makefiles):
    targets = set()
    for makefile in makefiles:
        for target in makefile["Targets"]:
            if target in targets:
                raise MakefileError("Target name '%s' used multiple times; "
                                    "output files would be clobbered!" %
                                    target)
            targets.add(target)
Esempio n. 21
0
def _check_bam_sequences(options, mkfile, steps):
    """Check that the BAM files contains the reference sequences found in the
    FASTA file, matched by name and length; extra sequences are permitted. This
    check is only done if genotyping is to be carried out, to reduce the
    overhead of reading the BAM file headers.

    """
    if ("genotype" not in steps) and ("genotyping" not in steps):
        return

    print_info("    - Validating BAM files ...")
    bam_files = {}
    for regions in mkfile["Project"]["Regions"].itervalues():
        for sample in mkfile["Project"]["Samples"].itervalues():
            filename = os.path.join(options.samples_root, "%s.%s.bam"
                                    % (sample["Name"], regions["Prefix"]))
            if regions["Realigned"]:
                filename = add_postfix(filename, ".realigned")

            if os.path.exists(filename):
                bam_files[filename] = _collect_fasta_contigs(regions["FASTA"])

    for (filename, contigs) in bam_files.iteritems():
        with pysam.Samfile(filename) as handle:
            bam_contigs = dict(zip(handle.references, handle.lengths))

            for (contig, length) in contigs.iteritems():
                bam_length = bam_contigs.get(contig)

                if bam_length is None:
                    message = ("Reference sequence missing from BAM file; "
                               "BAM file aligned against different prefix?\n"
                               "    BAM file = %s\n    Sequence name = %s") \
                               % (filename, contig)
                    raise MakefileError(message)
                elif bam_length != length:
                    message = ("Length of reference sequence in FASTA differs "
                               "from length of sequence in BAM file; BAM file "
                               "aligned against different prefix?\n"
                               "    BAM file = %s\n"
                               "    Length in FASTA = %s\n"
                               "    Length in BAM = %s") \
                               % (filename, length, bam_length)
                    raise MakefileError(message)
Esempio n. 22
0
def _check_sexes(mkfile):
    """Validate the sexes listed for the regions and samples of a makefile.

    The keys of each region's "HomozygousContigs" table are taken as that
    region's list of sexes. All regions in mkfile["Project"]["Regions"] must
    list the same sexes, and every sample in mkfile["Project"]["Samples"]
    must have a "Sex" value found in that list.

    Contigs named in "HomozygousContigs" that are not among those returned by
    _collect_fasta_contigs for the regions' "FASTA" entries are reported as
    warnings only; they do not cause a failure.

    Raises:
        MakefileError: If the regions disagree on the list of sexes, if no
            sexes are listed at all, or if a sample has no sex or a sex that
            is not in the list.
    """
    all_contigs = set()
    contigs_sexes = set()
    # None means "no region seen yet". An empty set is a genuine (empty) list
    # of sexes and must still be compared against later regions; otherwise the
    # outcome of the consistency check depends on the iteration order.
    regions_sexes = None
    for regions in mkfile["Project"]["Regions"].values():
        all_contigs.update(_collect_fasta_contigs(regions["FASTA"]))

        for contigs in regions["HomozygousContigs"].values():
            contigs_sexes.update(contigs)

        current_sexes = set(regions["HomozygousContigs"])
        if regions_sexes is None:
            regions_sexes = current_sexes
        elif regions_sexes != current_sexes:
            raise MakefileError("List of sexes for regions %r does not "
                                "match other regions" % (regions["Name"], ))

    if not regions_sexes:
        raise MakefileError("No sexes have been specified in makefile; "
                            "please list all sample sexes and associated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].values():
        if sample.get("Sex") is None:
            raise MakefileError("Please specify a sex for sample %r, or "
                                "'NA' if not applicable." % (sample["Name"], ))
        elif sample["Sex"] not in regions_sexes:
            # Sort the formatted names so the message is reproducible
            sexes = ", ".join(sorted(map(repr, regions_sexes)))
            message = "Sample %r has unknown sex %r; known sexes are %s" % (
                sample["Name"],
                sample["Sex"],
                sexes,
            )
            raise MakefileError(message)

    unknown_contigs = contigs_sexes - all_contigs
    if unknown_contigs:
        log = logging.getLogger(__name__)
        log.warning("Unknown contig(s) in 'HomozygousContigs':")
        for name in sorted(unknown_contigs):
            log.warning("  - %r", name)
        log.warning("Please verify that the list(s) of contigs is correct!")
Esempio n. 23
0
def _validate_makefile_libraries(makefile):
    """Ensure that no library is shared between samples within a target.

    The records produced by _iterate_over_records(makefile) are grouped by
    (target, library), and the sample of each record is collected per group.

    Raises:
        MakefileError: If a library in a target is listed under more than one
            sample.
    """
    libraries = collections.defaultdict(set)
    for (target, sample, library, _, _) in _iterate_over_records(makefile):
        libraries[(target, library)].add(sample)

    # .items() is available on both Python 2 and 3, unlike .iteritems()
    for ((target, library), samples) in libraries.items():
        if len(samples) > 1:
            # Sample names are sorted so that the message is reproducible
            raise MakefileError("Library '%s' in target '%s' spans multiple "
                                "samples: %s" %
                                (library, target, ", ".join(sorted(samples))))
Esempio n. 24
0
def _update_genotyping(mkfile):
    genotyping = mkfile["Genotyping"]
    defaults = genotyping.pop("Defaults")
    defaults.setdefault("Padding", 5)
    defaults["VCF_Filter"].setdefault("MaxReadDepth", 0)

    for (key, subdd) in genotyping.items():
        if subdd.get("GenotypeEntirePrefix"):
            message = ("GenotypeEntirePrefix is only allowed for prefixes "
                       "using default parameters, but is set for %r" % (key, ))
            raise MakefileError(message)

    for key in mkfile["Project"]["Regions"]:
        genotyping[key] = fill_dict(genotyping.get(key, {}), defaults)

    regions = set(genotyping)
    unknown_regions = regions - set(mkfile["Project"]["Regions"])
    if unknown_regions:
        raise MakefileError("Unknown Regions of Interest in Genotyping: %s" %
                            (", ".join(unknown_regions), ))
Esempio n. 25
0
    def _build_bwa_algorithm(self, config, prefix, record, parameters):
        """Build a BWAAlgorithmNode from the given node parameters.

        The parameters returned by self._set_pe_input_files(parameters) are
        extended with the BWA aligner options ("mapping_options") and the
        BWA cleanup options ("cleanup_options") before the node is created.
        The config, prefix and record arguments are not used by this method.

        Raises:
            MakefileError: If the "QualityOffset" option is not 33.
        """
        if self.options["QualityOffset"] != 33:
            # The message is raised verbatim (no %-formatting is applied), so
            # it must not contain format placeholders such as "%r".
            raise MakefileError(
                "Mapping with BWA using the selected algorithm currently does "
                "not support QualityOffsets other than 33; please convert "
                "your FASTQ if you wish to proceed.")

        parameters = self._set_pe_input_files(parameters)
        parameters["mapping_options"] = self.options["Aligners"]["BWA"]
        parameters["cleanup_options"] = self._cleanup_options("BWA")

        return BWAAlgorithmNode(**parameters)
Esempio n. 26
0
def _glob_prefixes(template, pattern):
    filename = None
    for filename in glob.iglob(pattern):
        name = os.path.basename(filename).split(".")[0]
        _VALID_PREFIX_NAME(("Prefixes", name), name)
        new_prefix = copy.copy(template)
        new_prefix["Path"] = filename

        yield (name, new_prefix)

    if filename is None:
        raise MakefileError("Did not find any matches for search string %r" %
                            (pattern, ))
Esempio n. 27
0
def _update_msa(mkfile):
    msa      = mkfile["MultipleSequenceAlignment"]
    defaults = msa.pop("Defaults")
    defaults.setdefault("Program", "MAFFT")
    defaults["MAFFT"].setdefault("Algorithm", "MAFFT")

    for key in mkfile["Project"]["Regions"]:
        msa[key] = fill_dict(msa.get(key, {}), defaults)

    unknown_regions = set(msa) - set(mkfile["Project"]["Regions"])
    if unknown_regions:
        raise MakefileError("Unknown Regions of Interest in Genotyping: %s" \
                            % (", ".join(unknown_regions),))
Esempio n. 28
0
def _validate_makefiles_features(makefiles):
    for makefile in makefiles:
        features = makefile["Options"]["Features"]
        roi_enabled = False

        for prefix in makefile["Prefixes"].itervalues():
            roi_enabled |= bool(prefix.get("RegionsOfInterest"))

        if features["Depths"] and roi_enabled:
            if not (features["RawBAM"] or features["RealignedBAM"]):
                raise MakefileError("The feature 'Depths' (depth histograms) "
                                    "with RegionsOfInterest enabled, requires "
                                    "that either the feature 'RawBAM' or the "
                                    "feature 'RalignedBAM' is enabled.")
Esempio n. 29
0
    def _build_bwa_algorithm(self, config, prefix, record, parameters):
        """Build the node(s) for mapping reads with a BWAAlgorithmNode.

        A customized BWAAlgorithmNode is created from the parameters, the BWA
        aligner options are applied to its "aln" command, and the result of
        self._finalize_nodes for that node is returned. The record argument
        is not used by this method.

        Raises:
            MakefileError: If the "QualityOffset" option is not 33.
        """
        if self.options["QualityOffset"] != 33:
            # The message is raised verbatim (no %-formatting is applied), so
            # it must not contain format placeholders such as "%r".
            raise MakefileError("Mapping with BWA using the selected "
                                "algorithm currently does not support "
                                "QualityOffsets other than 33; please "
                                "convert your FASTQ if you wish to proceed.")

        # NOTE(review): the return value is discarded, so this presumably
        # updates parameters in place -- confirm against the helper.
        self._set_pe_input_files(parameters)
        node = BWAAlgorithmNode.customize(**parameters)

        apply_options(node.commands["aln"],
                      self.options["Aligners"]["BWA"])

        return self._finalize_nodes(config, prefix, parameters, node)
Esempio n. 30
0
def _validate_makefiles_duplicate_targets(config, makefiles):
    targets = set()
    for makefile in makefiles:
        destination = config.destination
        if destination is None:
            filename = makefile["Statistics"]["Filename"]
            destination = os.path.dirname(filename)

        for target in makefile["Targets"]:
            key = (destination, target)
            if key in targets:
                raise MakefileError("Target name '%s' used multiple times; "
                                    "output files would be clobbered!" %
                                    target)
            targets.add(key)