Example #1
def _check_genders(mkfile):
    all_contigs = set()
    contigs_genders = set()
    regions_genders = set()
    for regions in mkfile["Project"]["Regions"].itervalues():
        all_contigs.update(_collect_fasta_contigs(regions))

        for contigs in regions["HomozygousContigs"].itervalues():
            contigs_genders.update(contigs)

        current_genders = set(regions["HomozygousContigs"])
        if not regions_genders:
            regions_genders = current_genders
        elif regions_genders != current_genders:
            raise MakefileError("List of genders for regions %r does not "
                                "match other regions" % (regions["Name"], ))

    if not regions_genders:
        raise MakefileError("No genders have been specified in makefile; "
                            "please list all sample genders and assosiated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].itervalues():
        if sample["Gender"] not in regions_genders:
            genders = ", ".join(map(repr, regions_genders))
            message = "Sample %r has unknown gender %r; known genders are %s" \
                % (sample["Name"], sample["Gender"], genders)
            raise MakefileError(message)

    unknown_contigs = contigs_genders - all_contigs
    if unknown_contigs:
        print_warn(
            "WARNING: Unknown contig(s) in 'HomozygousContigs':\n    - " +
            "\n    - ".join(unknown_contigs))
        print_warn("Please verify that the list(s) of contigs is correct!")
Example #2
def _update_prefixes(makefile):
    prefixes = {}
    for (name, values) in makefile.get("Prefixes", {}).iteritems():
        filename = values["Path"]
        if name.endswith("*"):
            records = []
            for fname in glob.glob(filename):
                name = os.path.basename(fname).split(".")[0]
                _VALID_PREFIX_NAME(("Prefixes", name), name)
                new_prefix = copy.copy(values)
                new_prefix["Path"] = fname

                records.append((name, new_prefix))
            if not records:
                raise MakefileError("Did not find any matches for glob %s" %
                                    repr(filename))
        else:
            records = [(name, values)]

        for (name, record) in records:
            if name in prefixes:
                raise MakefileError(
                    "Multiple prefixes with the same name: %s" % name)

            if not record["Path"].endswith(".fasta"):
                raise MakefileError("Path for prefix %r does not end with "
                                    ".fasta:\n   %r" % (name, record["Path"]))

            record["Name"] = name
            record["Reference"] = record["Path"]
            prefixes[name] = record

    if not prefixes:
        raise MakefileError("At least one prefix must be specified")
    makefile["Prefixes"] = prefixes
Example #3
def _determine_lane_type(prefixes, data, path):
    if isinstance(data, types.StringTypes):
        return "Raw"
    elif isinstance(data, types.DictType):
        if all((key in _READ_TYPES) for key in data):
            for (key, files) in data.iteritems():
                is_paired = paths.is_paired_end(files)

                if is_paired and (key != "Paired"):
                    raise MakefileError("Error at Barcode level; Path "
                                        "includes {Pair} key, but read-type "
                                        "is not Paired:\n    "
                                        "%s:%s" % (":".join(path), key))
                elif not is_paired and (key == "Paired"):
                    raise MakefileError("Error at Barcode level; Paired pre-"
                                        "trimmed reads specified, but path "
                                        "does not contain {Pair} key:\n    "
                                        "%s:%s" % (":".join(path), key))

            return "Trimmed"
        elif all((key in prefixes) for key in data):
            return "BAMs"

    raise MakefileError("Error at Barcode level; keys must either be "
                        "prefix-names, OR 'Paired', 'Single' or 'Collapsed'. "
                        "Found: %s" % (", ".join(data), ))
Example #4
    def _collect_subsets(roi, subset, path):
        if roi not in subsets_by_regions:
            raise MakefileError(
                "Subset of unknown region (%r) requested at %r" % (roi, path))

        roi_fname = swap_ext(subsets_by_regions[roi]["BED"], subset + ".names")
        if not os.path.isfile(roi_fname):
            raise MakefileError(
                ("Subset file does not exist for Regions Of Interest:\n"
                 "  Region = %r\n  Subset = %r\n  Path   = %r") %
                (roi, subset, roi_fname))

        sequences = set()
        with open(roi_fname) as handle:
            for line in handle:
                line = line.strip()
                if line and not line.startswith("#"):
                    sequences.add(line)

        known_seqs = subsets_by_regions[roi]["Sequences"][None]
        unknown_seqs = sequences - known_seqs
        if unknown_seqs:
            message = ("Unknown sequences in subset file:\n"
                       "  File   = %r\n  Region = %r\n  Subset = %r\n"
                       "  Unknown sequence names =") \
                       % (roi_fname, roi, subset)
            unknown_seqs = list(sorted(unknown_seqs))
            if len(unknown_seqs) > 5:
                unknown_seqs = unknown_seqs[:5] + ["..."]
            message = "\n    - ".join([message] + unknown_seqs)
            raise MakefileError(message)

        subsets_by_regions[roi]["SubsetFiles"][subset] = (roi_fname, )
        subsets_by_regions[roi]["Sequences"][subset] = frozenset(sequences)
Example #5
def _collect_and_validate_regions(regions):
    contigs = _collect_fasta_contigs(regions)
    parser = pysam.asBed()
    sequences = set()
    with open(regions["BED"]) as bedhandle:
        for (line_num, line) in enumerate(bedhandle):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            try:
                bed = parser(line, len(line))
                # Force evaluation of (lazily parsed) properties
                bed_start = bed.start
                bed_end = bed.end
            except ValueError, error:
                raise MakefileError(
                    ("Error parsing line %i in regions file:\n"
                     "  Path = %r\n  Line = %r\n\n%s") %
                    (line_num + 1, regions["BED"], line, error))

            if len(bed) < 6:
                url = "http://genome.ucsc.edu/FAQ/FAQformat.html#format1"
                name = repr(bed.name) if len(bed) > 3 else "unnamed record"
                raise MakefileError(("Region at line #%i (%s) does not "
                                     "contain the expected number of fields; "
                                     "the first 6 fields are required. C.f. "
                                     "defination at\n   %s\n\nPath = %r") %
                                    (line_num, name, url, regions["BED"]))

            contig_len = contigs.get(bed.contig)
            if contig_len is None:
                raise MakefileError(("Regions file contains contig not found "
                                     "in reference:\n  Path = %r\n  Contig = "
                                     "%r\n\nPlease ensure that all contig "
                                     "names match the reference names!") %
                                    (regions["BED"], bed.contig))
            elif not (0 <= int(bed_start) < int(bed_end) <= contig_len):
                raise MakefileError(("Regions file contains invalid region:\n"
                                     "  Path   = %r\n  Contig = %r\n"
                                     "  Start  = %s\n  End    = %s\n\n"
                                     "Expected 0 <= Start < End <= %i!") %
                                    (regions["BED"], bed.contig, bed.start,
                                     bed.end, contig_len))
            elif bed.strand not in "+-":
                raise MakefileError(
                    ("Regions file contains invalid region:\n"
                     "  Path   = %r\n  Line = %i\n  Name = %r\n"
                     "  Strand is %r, expected '+' or '-'.") %
                    (regions["BED"], line_num + 1, bed.name, bed.strand))

            sequences.add(bed.name)
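For reference, a record must carry at least the first six UCSC BED columns (contig, start, end, name, score, strand) to pass the validation above; coordinates are 0-based and half-open. A hypothetical tab-separated line:

chr1	1000	2000	region_a	0	+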
Example #6
def _do_validate_hg_prefix(makefile, prefix, contigs, fatal):
    if not _is_invalid_hg_prefix(contigs):
        return

    message = \
        "Prefix appears to be a human genome, but chromosomes are ordered\n" \
        "lexically (chr1, chr10, chr11, ...), rather than numerically\n" \
        "(chr1, chr2, chr3, ...):\n\n" \
        "  Makefile = %s\n" \
        "  Prefix   = %s\n\n" \
        "GATK requires that human chromosomes are ordered numerically;\n%s\n" \
        "See the documentation at the GATK website for more information:\n  " \
        "http://www.broadinstitute.org/gatk/guide/article?id=1204\n"

    prefix_path = prefix["Path"]
    mkfile_path = makefile["Statistics"]["Filename"]
    if fatal:
        details = "Either disable GATK in the makefile, or fix the prefix."
        message %= (mkfile_path, prefix_path, details)

        raise MakefileError(message)
    else:
        details = \
            "You will not be able to use the resulting BAM file with GATK."
        message %= (mkfile_path, prefix_path, details)
        print_warn("\nWARNING:\n", message, file=sys.stderr, sep="")
Example #7
def _validate_makefiles_duplicate_files(makefiles):
    filenames = collections.defaultdict(list)
    for makefile in makefiles:
        iterator = _iterate_over_records(makefile)
        for (target, sample, library, barcode, record) in iterator:
            current_filenames = []
            if record["Type"] == "Raw":
                for raw_filenames in record["Data"].itervalues():
                    current_filenames.extend(raw_filenames)
            else:
                current_filenames.extend(record["Data"].values())

            for realpath in map(os.path.realpath, current_filenames):
                filenames[realpath].append((target, sample, library, barcode))

    has_overlap = {}
    for (filename, records) in filenames.iteritems():
        if len(records) > 1:
            has_overlap[filename] = list(set(records))

    by_records = sorted(zip(has_overlap.values(), has_overlap.keys()))
    for (records, pairs) in itertools.groupby(by_records, lambda x: x[0]):
        pairs = list(pairs)
        description = _describe_files_in_multiple_records(records, pairs)

        if len(set(record[0] for record in records)) != len(records):
            message = "Path included multiple times in target:\n"
            raise MakefileError(message + description)
        else:
            print_warn("WARNING: Path included in multiple targets:",
                       file=sys.stderr)
            print_warn(description, file=sys.stderr)
            print_warn(file=sys.stderr)
Example #8
def _update_subsets(mkfile, steps):
    subsets_by_regions = mkfile["Project"]["Regions"]

    def _collect_subsets(roi, subset, path):
        if roi not in subsets_by_regions:
            raise MakefileError(
                "Subset of unknown region (%r) requested at %r" % (roi, path))

        roi_fname = swap_ext(subsets_by_regions[roi]["BED"], subset + ".names")
        if not os.path.isfile(roi_fname):
            raise MakefileError(
                ("Subset file does not exist for Regions Of Interest:\n"
                 "  Region = %r\n  Subset = %r\n  Path   = %r") %
                (roi, subset, roi_fname))

        sequences = set()
        with open(roi_fname) as handle:
            for line in handle:
                line = line.strip()
                if line and not line.startswith("#"):
                    sequences.add(line)

        known_seqs = subsets_by_regions[roi]["Sequences"][None]
        unknown_seqs = sequences - known_seqs
        if unknown_seqs:
            message = ("Unknown sequences in subset file:\n"
                       "  File   = %r\n  Region = %r\n  Subset = %r\n"
                       "  Unknown sequence names =") \
                       % (roi_fname, roi, subset)
            unknown_seqs = list(sorted(unknown_seqs))
            if len(unknown_seqs) > 5:
                unknown_seqs = unknown_seqs[:5] + ["..."]
            message = "\n    - ".join([message] + unknown_seqs)
            raise MakefileError(message)

        subsets_by_regions[roi]["SubsetFiles"][subset] = (roi_fname, )
        subsets_by_regions[roi]["Sequences"][subset] = frozenset(sequences)

    if "phylogeny:examl" in steps:
        for (key, subdd) in mkfile["PhylogeneticInference"].iteritems():
            for (subkey, roidd) in subdd["RegionsOfInterest"].iteritems():
                if subkey not in subsets_by_regions:
                    message = \
                        "Unknown regions name in phylogenetic inference:\n" \
                        "\tPath = PhylogeneticInference:%s:RegionsOfInterest" \
                        "\n\tName = %s"
                    raise MakefileError(message % (key, subkey))

                roidd["Name"] = subkey

                if roidd.get("SubsetRegions") is not None:
                    path = "PhylogeneticInference:%s:RegionsOfInterest:%s" % (
                        key, subkey)
                    _collect_subsets(subkey, roidd["SubsetRegions"], path)

    if "paml:codeml" in steps:
        for (roi,
             subset) in mkfile["PAML"]["codeml"]["SubsetRegions"].iteritems():
            _collect_subsets(roi, subset, "PAML:codeml:SubsetRegions")
Example #9
def _select_samples(select, groups, samples, path):
    selection = set()
    for group in select:
        if group.startswith("<") and group.endswith(">"):
            key = tuple(group[1:-1].split("/"))
            if key not in groups:
                raise MakefileError(
                    "Unknown group specifed for filtering %r: %r" %
                    (path, key))
            selection.update(groups[key])
        elif group in samples:
            selection.add(group)
        else:
            raise MakefileError(
                "Unknown/Invalid group specifed for filtering %r: %r" %
                (path, group))
    return selection
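A hypothetical call illustrating the two selector forms: "<Group/Subgroup>" keys into the groups mapping by tuple path, while bare names must be known samples.

groups = {("Ancient", ): set(["Sample1", "Sample2"]),
          ("Ancient", "HighCov"): set(["Sample1"])}
samples = set(["Sample1", "Sample2", "Sample3"])

_select_samples(["<Ancient/HighCov>", "Sample3"], groups, samples, "Example")
# -> set(["Sample1", "Sample3"])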
Example #10
def _check_bam_sequences(options, mkfile, steps):
    """Check that the BAM files contains the reference sequences found in the
    FASTA file, matched by name and length; extra sequences are permitted. This
    check is only done if genotyping is to be carried out, to reduce the
    overhead of reading the BAM file headers.

    """
    if ("genotype" not in steps) and ("genotyping" not in steps):
        return

    print_info("    - Validating BAM files ...")
    bam_files = {}
    for regions in mkfile["Project"]["Regions"].itervalues():
        for sample in mkfile["Project"]["Samples"].itervalues():
            filename = os.path.join(
                options.samples_root,
                "%s.%s.bam" % (sample["Name"], regions["Prefix"]))
            if regions["Realigned"]:
                filename = add_postfix(filename, ".realigned")

            if os.path.exists(filename):
                bam_files[filename] = _collect_fasta_contigs(regions)

    for (filename, contigs) in bam_files.iteritems():
        with pysam.Samfile(filename) as handle:
            bam_contigs = dict(zip(handle.references, handle.lengths))

            for (contig, length) in contigs.iteritems():
                bam_length = bam_contigs.get(contig)

                if bam_length is None:
                    message = ("Reference sequence missing from BAM file; "
                               "BAM file aligned against different prefix?\n"
                               "    BAM file = %s\n    Sequence name = %s") \
                               % (filename, contig)
                    raise MakefileError(message)
                elif bam_length != length:
                    message = ("Length of reference sequence in FASTA differs "
                               "from length of sequence in BAM file; BAM file "
                               "aligned against different prefix?\n"
                               "    BAM file = %s\n"
                               "    Length in FASTA = %s\n"
                               "    Length in BAM = %s") \
                               % (filename, length, bam_length)
                    raise MakefileError(message)
Example #11
def _check_indels_and_msa(mkfile):
    msa = mkfile["MultipleSequenceAlignment"]
    regions = mkfile["Project"]["Regions"]
    for (name, subdd) in regions.iteritems():
        msa_enabled = msa[name]["Enabled"]

        if subdd["IncludeIndels"] and not msa_enabled:
            raise MakefileError(
                "Regions %r includes indels, but MSA is disabled!" % (name, ))
Example #12
def _update_prefixes(makefile):
    prefixes = {}
    for (name, values) in makefile.get("Prefixes", {}).iteritems():
        filename = values["Path"]
        if "*" in name[:-1]:
            raise MakefileError("The character '*' is not allowed in Prefix "
                                "names; if you use to select .fasta files "
                                "using a search-string, then use the prefix "
                                "name '%s*' instead and specify the wildcards "
                                "in the 'Path' instead." % (name.replace(
                                    "*",
                                    "",
                                )))
        elif name.endswith("*"):
            records = []
            for fname in glob.glob(filename):
                name = os.path.basename(fname).split(".")[0]
                _VALID_PREFIX_NAME(("Prefixes", name), name)
                new_prefix = copy.copy(values)
                new_prefix["Path"] = fname

                records.append((name, new_prefix))
            if not records:
                raise MakefileError("Did not find any matches for glob %s" %
                                    repr(filename))
        else:
            records = [(name, values)]

        for (name, record) in records:
            if name in prefixes:
                raise MakefileError(
                    "Multiple prefixes with the same name: %s" % name)

            if not record["Path"].endswith(".fasta"):
                raise MakefileError("Path for prefix %r does not end with "
                                    ".fasta:\n   %r" % (name, record["Path"]))

            record["Name"] = name
            record["Reference"] = record["Path"]
            prefixes[name] = record

    if not prefixes:
        raise MakefileError("At least one prefix must be specified")
    makefile["Prefixes"] = prefixes
Example #13
def _validate_makefile_libraries(makefile):
    libraries = collections.defaultdict(set)
    iterator = _iterate_over_records(makefile)
    for (target, sample, library, _, _) in iterator:
        libraries[(target, library)].add(sample)

    for ((target, library), samples) in libraries.iteritems():
        if len(samples) > 1:
            raise MakefileError("Library '%s' in target '%s' spans multiple "
                                " samples: %s" %
                                (library, target, ", ".join(samples)))
Example #14
def _read_max_depths(filename, prefix, sample):
    if filename in _DEPTHS_CACHE:
        return _DEPTHS_CACHE[filename]

    max_depth = None
    try:
        with open(filename) as handle:
            for row in parse_padded_table(handle):
                if row["Name"] == sample and \
                        row["Sample"] == "*" and \
                        row["Library"] == "*" and \
                        row["Contig"] == "*":
                    max_depth = row["MaxDepth"]
                    break
            else:
                raise MakefileError("Could not find MaxDepth in "
                                    "depth-histogram: %r" % (filename, ))

    except (OSError, IOError), error:
        raise MakefileError("Error reading depth-histogram (%s): %s" %
                            (filename, error))
Example #15
def _update_regions(options, mkfile):
    print_info("    - Validating regions of interest ...")
    mkfile["Project"]["Regions"] = mkfile["Project"].pop("RegionsOfInterest")

    for (name, subdd) in mkfile["Project"]["Regions"].iteritems():
        if "Prefix" not in subdd:
            raise MakefileError("No genome specified for regions %r" %
                                (name, ))

        subdd["Name"] = name
        subdd["Desc"] = "{Prefix}.{Name}".format(**subdd)
        subdd["BED"] = os.path.join(options.regions_root,
                                    subdd["Desc"] + ".bed")
        subdd["FASTA"] = os.path.join(options.prefix_root,
                                      subdd["Prefix"] + ".fasta")

        required_files = (("Regions file", subdd["BED"],
                           None), ("Reference sequence", subdd["FASTA"], None),
                          ("Reference sequence index", subdd["FASTA"] + ".fai",
                           "Please index using 'samtools faidx %s'" %
                           (subdd["FASTA"], )))

        for (desc, path, instructions) in required_files:
            if not os.path.isfile(path):
                message = "%s does not exist for %r:\n  Path = %r" \
                                % (desc, name, path)
                if instructions:
                    message = "%s\n%s" % (message, instructions)
                raise MakefileError(message)

        # Collects seq. names / validate regions
        subdd["Sequences"] = {None: _collect_and_validate_regions(subdd)}
        subdd["SubsetFiles"] = {None: ()}

        sampledd = subdd["Genotypes"] = {}
        for sample_name in mkfile["Project"]["Samples"]:
            fasta_file = ".".join((sample_name, subdd["Desc"], "fasta"))
            sampledd[sample_name] = os.path.join(options.destination,
                                                 mkfile["Project"]["Title"],
                                                 "genotypes", fasta_file)
Example #16
def _update_msa(mkfile):
    msa = mkfile["MultipleSequenceAlignment"]
    defaults = msa.pop("Defaults")
    defaults.setdefault("Program", "MAFFT")
    defaults["MAFFT"].setdefault("Algorithm", "MAFFT")

    for key in mkfile["Project"]["Regions"]:
        msa[key] = fill_dict(msa.get(key, {}), defaults)

    unknown_regions = set(msa) - set(mkfile["Project"]["Regions"])
    if unknown_regions:
        raise MakefileError("Unknown Regions of Interest in Genotyping: %s" \
                            % (", ".join(unknown_regions),))
Example #17
def _update_genotyping(mkfile):
    genotyping = mkfile["Genotyping"]
    defaults = genotyping.pop("Defaults")
    defaults.setdefault("Padding", 5)
    defaults["VCF_Filter"].setdefault("MaxReadDepth", 0)

    for (key, subdd) in genotyping.iteritems():
        if subdd.get("GenotypeEntirePrefix"):
            message = "GenotypeEntirePrefix is only allowed for prefixes " \
                      "using default parameters, but is set for %r" % (key,)
            raise MakefileError(message)

    for key in mkfile["Project"]["Regions"]:
        subdd = fill_dict(genotyping.get(key, {}), defaults)
        subdd["Random"]["--padding"] = subdd["Padding"]
        genotyping[key] = subdd

    regions = set(genotyping)
    unknown_regions = regions - set(mkfile["Project"]["Regions"])
    if unknown_regions:
        raise MakefileError("Unknown Regions of Interest in Genotyping: %s" \
                            % (", ".join(unknown_regions),))
Example #18
def _validate_makefiles_features(makefiles):
    for makefile in makefiles:
        features = makefile["Options"]["Features"]
        roi_enabled = False

        for prefix in makefile["Prefixes"].itervalues():
            roi_enabled |= bool(prefix.get("RegionsOfInterest"))

        if "Depths" in features and roi_enabled:
            if not (("Raw BAM" in features) or ("Realigned BAM") in features):
                raise MakefileError("The feature 'Depths' (depth histograms) "
                                    "with RegionsOfInterest enabled, requires "
                                    "that either the feature 'Raw BAM' or the "
                                    "feature 'Raligned BAM' is enabled.")
Example #19
def _update_filtering(mkfile):
    samples = mkfile["Project"]["Samples"]
    groups = mkfile["Project"]["Groups"]

    filtering = {}
    for (target,
         filter_by) in mkfile["Project"]["FilterSingletons"].iteritems():
        if target.startswith("<") and target.endswith(">"):
            raise MakefileError("Singleton-filtering must be specified per "
                                "sample, not by groups: %r" % (target, ))
        elif target not in samples:
            raise MakefileError(
                "Unknown/Invalid sample specifed for singleton filtering: %r" %
                (target, ))
        elif target in filter_by:
            raise MakefileError(
                "Attempting to filter singleton in sample using itself as comparison: %r"
                % (target, ))

        path = "Project:FilterSingletons:%s" % (target, )
        filtering[target] = _select_samples(filter_by, groups, samples, path)

        # Implicit inclusion is allowed, since that is useful in some cases,
        # where we want to filter a sample based on the group it is a member of
        if target in filtering[target]:
            # The target itself must be excluded, as including it is invalid
            filtering[target] = filtering[target] - set((target, ))
            print_warn(
                "Warning: Sample %r is singleton-filtered using a "
                "group it is also a member of; this may be by mistake." %
                (target, ))

        if not filtering[target]:
            raise MakefileError("No samples specified by which to "
                                "singleton-filter by for %r" % (target, ))

    mkfile["Project"]["FilterSingletons"] = filtering
Example #20
def _validate_makefiles_duplicate_targets(config, makefiles):
    targets = set()
    for makefile in makefiles:
        destination = config.destination
        if destination is None:
            filename = makefile["Statistics"]["Filename"]
            destination = os.path.dirname(filename)

        for target in makefile["Targets"]:
            key = (destination, target)
            if key in targets:
                raise MakefileError("Target name '%s' used multiple times; "
                                    "output files would be clobbered!" %
                                    target)
            targets.add(key)
Example #21
    def _collect_samples(samples_dict, path=()):
        current_samples = {}
        for (key, subdd) in samples_dict.iteritems():
            if key.startswith("<") and key.endswith(">"):
                key = key.lstrip("<").rstrip(">")
                current_samples.update(_collect_samples(subdd, path + (key, )))
            elif key not in samples:
                samples.add(key)
                subdd["Name"] = key
                current_samples[key] = subdd
            else:
                raise MakefileError("Duplicate sample-name: %r" % (key, ))

        groups[path] = current_samples
        return current_samples
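A hypothetical input for the closure above: "<...>" keys denote (possibly nested) groups, plain keys are samples; it fills the enclosing samples set and groups dict as a side effect.

_collect_samples({"<Ancient>": {"Sample1": {}, "Sample2": {}},
                  "Sample3": {}})
# groups[("Ancient", )] -> {"Sample1": ..., "Sample2": ...}
# groups[()]            -> all three samples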
Example #22
def _mangle_makefile(mkfile):
    _collapse_taxa(mkfile)
    _update_intervals(mkfile)
    _update_filtering(mkfile)
    mkfile["Nodes"] = ()

    padding = mkfile["Genotyping"]["Padding"]
    mkfile["Genotyping"]["Random"]["--padding"] = padding

    excluded_groups = set(mkfile["Phylogenetic Inference"]["ExcludeGroups"])
    unknown_groups  = excluded_groups - set(mkfile["Project"]["Taxa"])
    if unknown_groups:
        raise MakefileError("Unknown taxa in 'Phylogenetic Inference:ExcludeGroups': '%s'" \
                            % ("', '".join(unknown_groups)))

    return mkfile
Example #23
    def _do_update_options(options, data, path):
        options = copy.deepcopy(options)
        if "Options" in data:
            if "Features" in data["Options"]:
                raise MakefileError("Features may only be specified at root "
                                    "level, not at %r" % (":".join(path), ))

            # Fill out missing values using those of prior levels
            options = fill_dict(destination=data.pop("Options"),
                                source=options)
            _update_possibly_empty_lists(options)

        if len(path) < 2:
            for key in data:
                if key != "Options":
                    _do_update_options(options, data[key], path + (key, ))
        else:
            data["Options"] = options
Example #24
def _validate_hg_prefixes(makefiles):
    """Implementation of the checks included in GATK, which require that the
    FASTA for the human genome is ordered 1 .. 23.
    """
    already_validated = set()
    print_info("  - Validating prefixes ...", file=sys.stderr)
    for makefile in makefiles:
        uses_gatk = "Realigned BAM" in makefile["Options"]["Features"]
        for prefix in makefile["Prefixes"].itervalues():
            path = prefix["Path"]
            if path in already_validated:
                continue

            if not os.path.exists(path):
                print_info("    - Reference FASTA file does not exist:\n"
                           "      %r" % (path, ),
                           file=sys.stderr)
                continue
            elif not os.path.exists(path + ".fai"):
                print_info("    - Index does not exist for %r; this may "
                           "take a while ..." % (path, ),
                           file=sys.stderr)

                if not os.access(os.path.dirname(path), os.W_OK):
                    message = \
                        "FASTA index for prefix is missing, but folder is\n" \
                        "not writable, so it cannot be created:\n" \
                        "  Prefix = %s\n\n" \
                        "Either change permissions on the folder, or move\n" \
                        "the prefix to different location." % (path,)
                    raise MakefileError(message)

                # Use pysam to index the file
                pysam.Fastafile(path).close()

            contigs = []
            with open(path + ".fai") as handle:
                for line in handle:
                    name, size, _ = line.split('\t', 2)
                    contigs.append((name, int(size)))

            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=uses_gatk)
            already_validated.add(path)
Example #25
def _collect_fasta_contigs(regions):
    filename = regions["FASTA"] + ".fai"
    if filename in _CONTIGS_CACHE:
        return _CONTIGS_CACHE[filename]

    contigs = {}
    with open(filename) as faihandle:
        for line in faihandle:
            name, length, _ = line.split(None, 2)
            if name in contigs:
                message = ("Reference contains multiple identically named "
                           "sequences:\n  Path = %r\n  Name = %r\n"
                           "Please ensure that sequences have unique names") \
                           % (regions["FASTA"], name)
                raise MakefileError(message)

            contigs[name] = int(length)

    _CONTIGS_CACHE[filename] = contigs
    return contigs
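The parsed .fai index (as written by 'samtools faidx') is tab-separated with five columns: sequence name, length, byte offset, bases per line and bytes per line; only the first two are used above. Illustrative contents (offsets invented):

chr1	248956422	112	60	61
chrM	16569	253105714	60	61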
Example #26
def _split_lanes_by_filenames(makefile):
    iterator = _iterate_over_records(makefile)
    for (target, sample, library, barcode, record) in iterator:
        if record["Type"] == "Raw":
            template = record["Data"]
            record["Data"] = files = paths.collect_files(template)
            split = record["Options"]["SplitLanesByFilenames"]

            if (split == True) or (isinstance(split, list) and
                                   (barcode in split)):
                if any(
                        missing_files(file_set)
                        for file_set in files.itervalues()):
                    raise MakefileError("Unable to split by filename for "
                                        "search-string '%s', did not find any "
                                        "files; please verify that the path"
                                        "is correct and update the makefile." %
                                        template)
                elif any(len(v) > 1 for v in files.itervalues()):
                    template = makefile["Targets"][target][sample][
                        library].pop(barcode)
                    keys = ("SE", ) if ("SE" in files) else ("PE_1", "PE_2")

                    input_files = [files[key] for key in keys]
                    input_files_iter = itertools.izip_longest(*input_files)
                    for (index, filenames) in enumerate(input_files_iter,
                                                        start=1):
                        assert len(filenames) == len(keys)
                        assert len(filenames[0]) == len(filenames[-1])
                        new_barcode = "%s_%03i" % (barcode, index)

                        current = copy.deepcopy(template)
                        current["Data"] = dict(
                            (key, [filename])
                            for (key, filename) in zip(keys, filenames))
                        current["Tags"]["PU_cur"] = new_barcode

                        makefile["Targets"][target][sample][library][
                            new_barcode] = current
Example #27
def _update_and_check_max_read_depth(options, mkfile):
    if any(subdd["VCF_Filter"]["MaxReadDepth"] == "auto"
           for subdd in mkfile["Genotyping"].itervalues()):
        print_info("    - Determinining max-depth from depth-histograms ...")

    for (key, settings) in mkfile["Genotyping"].iteritems():
        required_keys = set()
        for sample in mkfile["Project"]["Samples"].itervalues():
            if sample["GenotypingMethod"].lower() == "samtools":
                required_keys.add(sample["Name"])

        max_depths = settings["VCF_Filter"]["MaxReadDepth"]
        if isinstance(max_depths, types.DictType):
            # Extra keys are allowed, to make it easier
            # to temporarily disable a sample
            missing_keys = required_keys - set(max_depths)
            if missing_keys:
                missing_keys = "\n    - ".join(sorted(missing_keys))
                message = "MaxReadDepth not specified for the following " \
                          "samples for %r:\n    - %s" % (key, missing_keys)
                raise MakefileError(message)

        elif isinstance(max_depths, types.StringTypes):
            assert max_depths.lower() == "auto", max_depths
            prefix = mkfile["Project"]["Regions"][key]["Prefix"]
            max_depths = {}

            for sample in required_keys:
                fname = "%s.%s.depths" % (sample, prefix)
                fpath = os.path.join(options.samples_root, fname)
                max_depths[sample] = _read_max_depths(fpath, prefix, sample)

            settings["VCF_Filter"]["MaxReadDepth"] = max_depths
        else:
            max_depths = dict.fromkeys(required_keys, max_depths)
            settings["VCF_Filter"]["MaxReadDepth"] = max_depths
Example #28
                        row["Sample"] == "*" and \
                        row["Library"] == "*" and \
                        row["Contig"] == "*":
                    max_depth = row["MaxDepth"]
                    break
            else:
                raise MakefileError("Could not find MaxDepth in "
                                    "depth-histogram: %r" % (filename, ))

    except (OSError, IOError), error:
        raise MakefileError("Error reading depth-histogram (%s): %s" %
                            (filename, error))

    if max_depth == "NA":
        raise MakefileError("MaxDepth is not calculated for sample (%s);\n"
                            "cannot determine MaxDepth values automatically." %
                            (filename, ))
    max_depth = int(max_depth)

    print_info("        - %s.%s = %i" % (sample, prefix, max_depth))
    _DEPTHS_CACHE[filename] = max_depth
    return max_depth


_DEPTHS_CACHE = {}
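The depth-histogram read by _read_max_depths is assumed to be a whitespace-padded table (cf. parse_padded_table) with at least the columns referenced above; the row used is the one where Name equals the sample and Sample, Library and Contig are all "*". Illustrative contents:

Name      Sample    Library   Contig    MaxDepth
Sample1   *         *         *         42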

