Beispiel #1
0
def _do_validate_hg_prefix(makefile, prefix, contigs, fatal):
    """Report a prefix whose contigs are flagged by _is_invalid_hg_prefix.

    Nothing is done unless _is_invalid_hg_prefix(contigs) is true. Otherwise
    a message naming the makefile and the prefix path is built; when 'fatal'
    is set the message is raised as a MakefileError, and when it is not the
    message is printed as a warning using print_warn.
    """
    if not _is_invalid_hg_prefix(contigs):
        return

    prefix_path = prefix["Path"]
    mkfile_path = makefile["Statistics"]["Filename"]

    # Only the advice differs between the fatal and the non-fatal report
    if fatal:
        advice = "Either disable GATK in the makefile, or fix the prefix."
    else:
        advice = \
            "You will not be able to use the resulting BAM file with GATK."

    report = (
        "Prefix appears to be a human genome, but chromosomes are ordered\n"
        "lexically (chr1, chr10, chr11, ...), rather than numerically\n"
        "(chr1, chr2, chr3, ...):\n\n"
        "  Makefile = %s\n"
        "  Prefix   = %s\n\n"
        "GATK requires that human chromosomes are ordered numerically;\n%s\n"
        "See the documentation at the GATK website for more information:\n  "
        "http://www.broadinstitute.org/gatk/guide/article?id=1204\n"
    ) % (mkfile_path, prefix_path, advice)

    if fatal:
        raise MakefileError(report)

    print_warn("\nWARNING:\n", report, sep="")
Beispiel #2
0
def _validate_makefiles_duplicate_files(makefiles):
    """Detect input files that are listed by more than one record.

    Every filename of every record yielded by _iterate_over_records is
    resolved using os.path.realpath and associated with the (target, sample,
    library, barcode) tuple of that record. Paths associated with more than
    one entry are then grouped by the records that share them.

    Raises MakefileError if a group contains two records with the same
    target; otherwise a warning is printed for the group using print_warn.
    """
    records_by_path = collections.defaultdict(list)
    for makefile in makefiles:
        iterator = _iterate_over_records(makefile)
        for (target, sample, library, barcode, record) in iterator:
            if record["Type"] == "Raw":
                # Each value is itself a collection of filenames
                current_filenames = []
                for raw_filenames in record["Data"].itervalues():
                    current_filenames.extend(raw_filenames)
            else:
                current_filenames = list(record["Data"].values())

            key = (target, sample, library, barcode)
            for realpath in map(os.path.realpath, current_filenames):
                records_by_path[realpath].append(key)

    # Sorting the de-duplicated records gives every path shared by the same
    # records an identical key; the iteration order of a set is arbitrary, so
    # a plain list(set(...)) could split such paths across several groups.
    has_overlap = {}
    for (filename, records) in records_by_path.items():
        if len(records) > 1:
            has_overlap[filename] = sorted(set(records))

    # groupby only merges adjacent items, hence the sort by (records, path)
    by_records = sorted((records, filename)
                        for (filename, records) in has_overlap.items())
    for (records, pairs) in itertools.groupby(by_records, lambda x: x[0]):
        pairs = list(pairs)
        description = _describe_files_in_multiple_records(records, pairs)

        # NOTE(review): a path listed twice by one and the same record
        # collapses to a single entry above and is only reported as a
        # warning; confirm that this is intended.
        if len(set(record[0] for record in records)) != len(records):
            message = "Path included multiple times in target:\n"
            raise MakefileError(message + description)
        else:
            print_warn("WARNING: Path included in multiple targets:\n%s\n" %
                       (description, ))
Beispiel #3
0
def build_mito_nodes(config, root, bamfile, dependencies=()):
    """Build the chain of nodes used to analyze a mitochondrial BAM file.

    If config.database.mitochondria is None, a warning is printed and an
    empty tuple is returned. Otherwise a MitoConsensusNode, a
    RAxMLRapidBSNode and a DrawPhylogenyNode are created, each depending on
    the previous one, and a 1-tuple containing the last node is returned.
    """
    if config.database.mitochondria is None:
        warning = ("WARNING: Zonkey database %r does not contain "
                   "mitochondrial  sequences; cannot analyze MT BAM %r!\n"
                   % (config.tablefile, bamfile))
        print_warn(warning)
        return ()

    figures_root = os.path.join(root, "figures")
    results_root = os.path.join(root, "results", "mitochondria")

    # Consensus sequences; the ".phy" file at this prefix is fed to RAxML
    sequences_prefix = os.path.join(results_root, "sequences")
    consensus_node = mitochondria.MitoConsensusNode(
        database=config.tablefile,
        bamfile=bamfile,
        output_prefix=sequences_prefix,
        dependencies=dependencies)

    raxml_template = os.path.join(results_root, "raxml_%s")
    raxml_builder = RAxMLRapidBSNode.customize(
        input_alignment=sequences_prefix + ".phy",
        output_template=raxml_template,
        dependencies=(consensus_node,))
    raxml_builder.command.set_option("-N", 100)
    raxml_builder.command.set_option("-m", "GTRGAMMA")
    raxml_node = raxml_builder.build_node()

    tree_node = mitochondria.DrawPhylogenyNode(
        samples=os.path.join(figures_root, "samples.txt"),
        treefile=raxml_template % ("bestTree",),
        bootstraps=raxml_template % ("bootstrap",),
        output_prefix=os.path.join(figures_root, "mitochondria", "mito_phylo"),
        dependencies=(raxml_node,))

    return (tree_node,)
Beispiel #4
0
def _validate_makefiles_duplicate_files(makefiles):
    """Check whether any input file is used by more than one record.

    The filenames of each record yielded by _iterate_over_records are
    resolved using os.path.realpath and mapped to the (target, sample,
    library, barcode) tuples of the records listing them. Paths mapped to
    more than one entry are grouped by the records that share them.

    Raises MakefileError if two records in a group share a target; for any
    other group a warning is printed using print_warn.
    """
    users_of_path = collections.defaultdict(list)
    for makefile in makefiles:
        iterator = _iterate_over_records(makefile)
        for (target, sample, library, barcode, record) in iterator:
            current_filenames = []
            if record["Type"] == "Raw":
                # Each value is itself a collection of filenames
                for raw_filenames in record["Data"].itervalues():
                    current_filenames.extend(raw_filenames)
            else:
                current_filenames.extend(record["Data"].values())

            record_key = (target, sample, library, barcode)
            for realpath in map(os.path.realpath, current_filenames):
                users_of_path[realpath].append(record_key)

    # The de-duplicated records are sorted so that paths shared by the same
    # records get identical keys; set iteration order is arbitrary, so using
    # list(set(...)) could scatter such paths over several groups.
    has_overlap = dict(
        (filename, sorted(set(records)))
        for (filename, records) in users_of_path.items()
        if len(records) > 1
    )

    # groupby only merges adjacent items, hence the sort by (records, path)
    by_records = sorted((records, filename)
                        for (filename, records) in has_overlap.items())
    for (records, pairs) in itertools.groupby(by_records, lambda x: x[0]):
        pairs = list(pairs)
        description = _describe_files_in_multiple_records(records, pairs)

        # NOTE(review): a path listed twice by one and the same record
        # collapses to a single entry above and is only reported as a
        # warning; confirm that this is intended.
        targets = set(record[0] for record in records)
        if len(targets) != len(records):
            message = "Path included multiple times in target:\n"
            raise MakefileError(message + description)
        else:
            print_warn("WARNING: Path included in multiple targets:\n%s\n"
                       % (description,))
Beispiel #5
0
def build_mito_nodes(config, root, bamfile, dependencies=()):
    """Create the nodes needed to analyze a mitochondrial BAM file.

    When config.database.mitochondria is None a warning is printed and an
    empty tuple is returned. Otherwise three nodes are created in sequence
    (MitoConsensusNode, RAxMLRapidBSNode, DrawPhylogenyNode), each depending
    on the one before, and the final node is returned in a 1-tuple.
    """
    if config.database.mitochondria is None:
        print_warn("WARNING: Zonkey database %r does not contain "
                   "mitochondrial  sequences; cannot analyze MT BAM %r!\n"
                   % (config.tablefile, bamfile))
        return ()

    mito_results = os.path.join(root, "results", "mitochondria")
    sample_table = os.path.join(root, "figures", "samples.txt")

    # Step 1: consensus sequences written using the 'sequences' prefix
    consensus_prefix = os.path.join(mito_results, "sequences")
    consensus = mitochondria.MitoConsensusNode(database=config.tablefile,
                                               bamfile=bamfile,
                                               output_prefix=consensus_prefix,
                                               dependencies=dependencies)

    # Step 2: RAxML run on the ".phy" file found at the consensus prefix
    raxml_files = os.path.join(mito_results, "raxml_%s")
    raxml = RAxMLRapidBSNode.customize(input_alignment=consensus_prefix + ".phy",
                                       output_template=raxml_files,
                                       dependencies=(consensus,))
    for (option, value) in (("-N", 100), ("-m", "GTRGAMMA")):
        raxml.command.set_option(option, value)
    raxml = raxml.build_node()

    # Step 3: plot built from the RAxML 'bestTree' and 'bootstrap' files
    plot_prefix = os.path.join(root, "figures", "mitochondria", "mito_phylo")
    plot = mitochondria.DrawPhylogenyNode(samples=sample_table,
                                          treefile=raxml_files % ("bestTree",),
                                          bootstraps=raxml_files % ("bootstrap",),
                                          output_prefix=plot_prefix,
                                          dependencies=(raxml,))

    return (plot,)
Beispiel #6
0
def _update_filtering(mkfile):
    samples = mkfile["Project"]["Samples"]
    groups  = mkfile["Project"]["Groups"]

    filtering = {}
    for (target, filter_by) in mkfile["Project"]["FilterSingletons"].iteritems():
        if target.startswith("<") and target.endswith(">"):
            raise MakefileError("Singleton-filtering must be specified per "
                                "sample, not by groups: %r" % (target,))
        elif target not in samples:
            raise MakefileError("Unknown/Invalid sample specifed for singleton filtering: %r" % (target,))
        elif target in filter_by:
            raise MakefileError("Attempting to filter singleton in sample using itself as comparison: %r" % (target,))

        path = "Project:FilterSingletons:%s" % (target,)
        filtering[target] = _select_samples(filter_by, groups, samples, path)

        # Implicit inclusion is allowed, since that is useful in some cases,
        # where we want to filter a sample based on the group it is a member of
        if target in filtering[target]:
            # The target itself must be excluded, as including it is invalid
            filtering[target] = filtering[target] - set((target,))
            print_warn("Warning: Sample %r is singleton-filtered using a "
                       "group it is also a member of; this may be by mistake."
                       % (target,))

        if not filtering[target]:
            raise MakefileError("No samples specified by which to "
                                "singleton-filter by for %r" % (target,))

    mkfile["Project"]["FilterSingletons"] = filtering
Beispiel #7
0
def _do_validate_hg_prefix(makefile, prefix, contigs, fatal):
    """Warn about, or reject, a prefix flagged by _is_invalid_hg_prefix.

    Returns immediately if _is_invalid_hg_prefix(contigs) is false. Otherwise
    a message containing the makefile filename and the prefix path is
    produced; it is raised as a MakefileError if 'fatal' is set, and printed
    using print_warn if it is not.
    """
    if not _is_invalid_hg_prefix(contigs):
        return

    template = "".join((
        "Prefix appears to be a human genome, but chromosomes are ordered\n",
        "lexically (chr1, chr10, chr11, ...), rather than numerically\n",
        "(chr1, chr2, chr3, ...):\n\n",
        "  Makefile = %s\n",
        "  Prefix   = %s\n\n",
        "GATK requires that human chromosomes are ordered numerically;\n%s\n",
        "See the documentation at the GATK website for more information:\n  ",
        "http://www.broadinstitute.org/gatk/guide/article?id=1204\n",
    ))

    prefix_path = prefix["Path"]
    mkfile_path = makefile["Statistics"]["Filename"]

    if not fatal:
        consequence = \
            "You will not be able to use the resulting BAM file with GATK."
        text = template % (mkfile_path, prefix_path, consequence)
        print_warn("\nWARNING:\n", text, sep="")
        return

    remedy = "Either disable GATK in the makefile, or fix the prefix."
    raise MakefileError(template % (mkfile_path, prefix_path, remedy))
Beispiel #8
0
def _check_genders(mkfile):
    """Validate the genders used by the regions and samples of a project.

    The keys of 'HomozygousContigs' are treated as the known genders and
    must be identical for all regions; every sample must have one of these
    genders. Contigs listed under 'HomozygousContigs' that are not returned
    by _collect_fasta_contigs for any region only trigger a warning.

    Raises MakefileError if the regions disagree on the genders, if no
    genders are specified at all, or if a sample has an unknown gender.
    """
    all_contigs = set()
    contigs_genders = set()
    # None means "no region seen yet"; an empty set cannot serve as the
    # sentinel, since a region without genders would then be silently
    # replaced by the next region rather than compared against it.
    regions_genders = None
    for regions in mkfile["Project"]["Regions"].itervalues():
        all_contigs.update(_collect_fasta_contigs(regions["FASTA"]))

        for contigs in regions["HomozygousContigs"].itervalues():
            contigs_genders.update(contigs)

        current_genders = set(regions["HomozygousContigs"])
        if regions_genders is None:
            regions_genders = current_genders
        elif regions_genders != current_genders:
            raise MakefileError("List of genders for regions %r does not "
                                "match other regions" % (regions["Name"],))

    # Covers both "no regions at all" and "regions without any genders"
    if not regions_genders:
        raise MakefileError("No genders have been specified in makefile; "
                            "please list all sample genders and assosiated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].itervalues():
        if sample["Gender"] not in regions_genders:
            # Sorted so that the message does not depend on set order
            genders = ", ".join(map(repr, sorted(regions_genders)))
            message = "Sample %r has unknown gender %r; known genders are %s" \
                % (sample["Name"], sample["Gender"], genders)
            raise MakefileError(message)

    unknown_contigs = contigs_genders - all_contigs
    if unknown_contigs:
        print_warn("WARNING: Unknown contig(s) in 'HomozygousContigs':\n    - "
                   + "\n    - ".join(sorted(unknown_contigs)))
        print_warn("Please verify that the list(s) of contigs is correct!")
Beispiel #9
0
def _update_filtering(mkfile):
    samples = mkfile["Project"]["Samples"]
    groups  = mkfile["Project"]["Groups"]

    filtering = {}
    for (target, filter_by) in mkfile["Project"]["FilterSingletons"].iteritems():
        if target.startswith("<") and target.endswith(">"):
            raise MakefileError("Singleton-filtering must be specified per "
                                "sample, not by groups: %r" % (target,))
        elif target not in samples:
            raise MakefileError("Unknown/Invalid sample specifed for singleton filtering: %r" % (target,))
        elif target in filter_by:
            raise MakefileError("Attempting to filter singleton in sample using itself as comparison: %r" % (target,))

        path = "Project:FilterSingletons:%s" % (target,)
        filtering[target] = _select_samples(filter_by, groups, samples, path)

        # Implicit inclusion is allowed, since that is useful in some cases,
        # where we want to filter a sample based on the group it is a member of
        if target in filtering[target]:
            # The target itself must be excluded, as including it is invalid
            filtering[target] = filtering[target] - set((target,))
            print_warn("Warning: Sample %r is singleton-filtered using a "
                       "group it is also a member of; this may be by mistake."
                       % (target,))

        if not filtering[target]:
            raise MakefileError("No samples specified by which to "
                                "singleton-filter by for %r" % (target,))

    mkfile["Project"]["FilterSingletons"] = filtering
Beispiel #10
0
def _validate_prefixes(makefiles):
    """Validate the prefixes, and their regions-of-interest, of all makefiles.

    This includes an implementation of the checks performed by GATK, which
    requires that the FASTA for the human genome is ordered 1 .. 23, as GATK
    will not run with human genomes in a different order. The "IndexFormat"
    of each prefix is set to ".bai", or to ".csi" if any sequence is longer
    than _BAM_MAX_SEQUENCE_LENGTH.

    Raises MakefileError if a FASTA file cannot be indexed, or if a
    regions-of-interest file cannot be read.
    """
    checked = {}
    print_info("  - Validating prefixes ...")
    for makefile in makefiles:
        gatk_enabled = makefile["Options"]["Features"]["RealignedBAM"]
        for prefix in makefile["Prefixes"].itervalues():
            fasta = prefix["Path"]
            if fasta in checked:
                # Same FASTA as an earlier prefix; reuse its index format
                prefix["IndexFormat"] = checked[fasta]["IndexFormat"]
                continue

            # A valid value is required even when the FASTA file is missing
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(fasta):
                print_warn("    - Reference FASTA file does not exist:\n"
                           "      %r" % (fasta,))
                continue

            if not os.path.exists(fasta + ".fai"):
                print_info("    - Index does not exist for %r; this may "
                           "take a while ..." % (fasta,))

            try:
                contigs = FASTA.index_and_collect_contigs(fasta)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            # Implementation of the GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs,
                                   fatal=gatk_enabled)

            contigs = dict(contigs)
            rois = prefix.get("RegionsOfInterest", {})
            for (roi_name, roi_path) in rois.iteritems():
                try:
                    # read_bed_file returns an iterator, which must be
                    # consumed in full for the file to be checked
                    for _ in bedtools.read_bed_file(roi_path, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s" %
                                        (roi_name, prefix["Name"], error))

            longest = max(contigs.values())
            if longest > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn("    - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files." %
                           (fasta, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            checked[fasta] = prefix
Beispiel #11
0
def _validate_prefixes(makefiles):
    """Check every prefix of every makefile and pick its BAM index format.

    Besides validating the FASTA index and any regions-of-interest, this
    implements the checks included in GATK, which require that the FASTA for
    the human genome is ordered 1 .. 23; GATK will not run with human genomes
    in a different order. Each prefix gets an "IndexFormat" of ".bai", or of
    ".csi" if a sequence exceeds _BAM_MAX_SEQUENCE_LENGTH.

    Raises MakefileError if indexing a FASTA file fails, or if a
    regions-of-interest file cannot be read.
    """
    seen_prefixes = {}
    print_info("  - Validating prefixes ...")
    for makefile in makefiles:
        realigned = makefile["Options"]["Features"]["RealignedBAM"]
        for prefix in makefile["Prefixes"].itervalues():
            path = prefix["Path"]
            previous = seen_prefixes.get(path)
            if path in seen_prefixes:
                # Already validated; only the index format has to be copied
                prefix["IndexFormat"] = previous["IndexFormat"]
                continue

            # Must hold a valid value, even if the FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            fasta_exists = os.path.exists(path)
            if not fasta_exists:
                print_warn("    - Reference FASTA file does not exist:\n"
                           "      %r" % (path,))
                continue
            elif not os.path.exists(path + ".fai"):
                print_info("    - Index does not exist for %r; this may "
                           "take a while ..." % (path,))

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            # Implementation of the GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=realigned)

            contigs = dict(contigs)
            for (name, fpath) in prefix.get("RegionsOfInterest",
                                            {}).iteritems():
                try:
                    # read_bed_file returns an iterator; it is exhausted
                    # here purely for the sake of the checks it performs
                    records = bedtools.read_bed_file(fpath, contigs=contigs)
                    for _ in records:
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s"
                                        % (name, prefix["Name"], error))

            if max(contigs.values()) > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn("    - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files."
                           % (path, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            seen_prefixes[path] = prefix
Beispiel #12
0
def _check_sexes(mkfile):
    """Validate the sexes used by the regions and samples of a project.

    The keys of 'HomozygousContigs' are treated as the known sexes and must
    be identical for all regions; every sample must have one of these sexes.
    A sample specifying only the deprecated 'Gender' field has that value
    moved to 'Sex'. Contigs listed under 'HomozygousContigs' that are not
    returned by _collect_fasta_contigs for any region only trigger a warning.

    Raises MakefileError if the regions disagree on the sexes, if no sexes
    are specified at all, if a sample specifies neither or both of 'Sex' and
    'Gender', or if a sample has an unknown sex.
    """
    all_contigs = set()
    contigs_sexes = set()
    # None means "no region seen yet"; an empty set cannot serve as the
    # sentinel, since a region without sexes would then be silently replaced
    # by the next region rather than compared against it.
    regions_sexes = None
    for regions in mkfile["Project"]["Regions"].itervalues():
        all_contigs.update(_collect_fasta_contigs(regions["FASTA"]))

        for contigs in regions["HomozygousContigs"].itervalues():
            contigs_sexes.update(contigs)

        current_sexes = set(regions["HomozygousContigs"])
        if regions_sexes is None:
            regions_sexes = current_sexes
        elif regions_sexes != current_sexes:
            raise MakefileError("List of sexes for regions %r does not "
                                "match other regions" % (regions["Name"],))

    # Covers both "no regions at all" and "regions without any sexes"
    if not regions_sexes:
        raise MakefileError("No sexes have been specified in makefile; "
                            "please list all sample sexes and assosiated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].itervalues():
        if sample.get("Sex") is None:
            if sample.get("Gender") is None:
                raise MakefileError("Please specify a sex for sample %r, or "
                                    "'NA' if not applicable."
                                    % (sample["Name"],))

            # Only the deprecated field was given; store it under 'Sex'
            sample["Sex"] = sample.pop("Gender")
        elif sample.get("Gender") is not None:
            raise MakefileError("Both a Sex and a Gender has been specified "
                                "sample %r; the Gender field is deprecated, "
                                "please only use the Sex field."
                                % (sample["Name"],))

        if sample["Sex"] not in regions_sexes:
            # Sorted so that the message does not depend on set order
            sexes = ", ".join(map(repr, sorted(regions_sexes)))
            message = "Sample %r has unknown sex %r; known sexes are %s" \
                % (sample["Name"], sample["Sex"], sexes)
            raise MakefileError(message)

    unknown_contigs = contigs_sexes - all_contigs
    if unknown_contigs:
        print_warn("WARNING: Unknown contig(s) in 'HomozygousContigs':\n"
                   "    - " + "\n    - ".join(sorted(unknown_contigs)))
        print_warn("Please verify that the list(s) of contigs is correct!")
Beispiel #13
0
def _validate_makefile_adapters(makefile):
    """Warn if a default adapter sequence is given in the wrong orientation.

    Specifying the adapter in the wrong orientation for AdapterRemoval is a
    typical mistake when using the --pcr2 option. Both the AdapterRemoval
    options of each record and those of the makefile itself are inspected;
    nothing is raised, warnings are only printed using print_warn.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    mate_2_adapter = \
        "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    # Option -> value that indicates that the wrong orientation was used
    suspicious = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": mate_2_adapter,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(mate_2_adapter),
    }

    # Per-record options first, followed by the options of the makefile
    option_sets = [
        record.get("Options", {}).get("AdapterRemoval", {})
        for (_, _, _, _, record) in _iterate_over_records(makefile)
    ]
    option_sets.append(makefile.get("Options", {}).get("AdapterRemoval", {}))

    found = dict.fromkeys(suspicious, False)
    for options in option_sets:
        for (flag, sequence) in suspicious.items():
            if options.get(flag) == sequence:
                found[flag] = True

    if not any(found.values()):
        return

    print_warn(
        "WARNING: An adapter specified for AdapterRemoval "
        "corresponds to the default sequence, but is reverse "
        "complemented. Please make sure that this is intended! ",
        end="")

    if found["--pcr2"]:
        print_warn("For --pcr2, the sequence given should be the "
                   "reverse complement of the sequence observed in the "
                   "mate 2 FASTQ file.\n")

    if found["--adapter2"]:
        print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                   "should be exactly as observed in the FASTQ reads.\n")
Beispiel #14
0
def _validate_makefile_adapters(makefile):
    """Look for default adapter sequences given in the wrong orientation.

    This is a typical mistake when using the --pcr2 option of AdapterRemoval.
    The AdapterRemoval options of every record, and of the makefile itself,
    are compared against the known sequences; matches only result in
    warnings being printed using print_warn.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    # --pcr2 expects the reverse complement of the mate 2 adapter sequence,
    # while --adapter2 (AdapterRemoval v2) expects the regular sequence; the
    # values below are therefore the ones that should NOT be seen.
    wrong_values = {}
    wrong_values["--pcr2"] = adapter_2
    wrong_values["--adapter2"] = sequences.reverse_complement(adapter_2)

    flagged = dict.fromkeys(wrong_values, False)

    def _flag_matches(options):
        # Marks each option whose value equals the wrongly oriented adapter
        for option in wrong_values:
            if options.get(option) == wrong_values[option]:
                flagged[option] = True

    for (_, _, _, _, record) in _iterate_over_records(makefile):
        _flag_matches(record.get("Options", {}).get("AdapterRemoval", {}))

    _flag_matches(makefile.get("Options", {}).get("AdapterRemoval", {}))

    if True in flagged.values():
        print_warn("WARNING: An adapter specified for AdapterRemoval "
                   "corresponds to the default sequence, but is reverse "
                   "complemented. Please make sure that this is intended! ",
                   end="")

        if flagged["--pcr2"]:
            print_warn("For --pcr2, the sequence given should be the "
                       "reverse complement of the sequence observed in the "
                       "mate 2 FASTQ file.\n")

        if flagged["--adapter2"]:
            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                       "should be exactly as observed in the FASTQ reads.\n")
Beispiel #15
0
            name = cand_sample.split('.', 1)[0]
            name_mapping[name] = cand_sample
            name_counts[name] = name_counts.get(name, 0) + 1

        if name_mapping.get(sample) == 1:
            # Sample name (with some extensions) found
            # This is typical if 'paleomix depths' has been run manually.
            max_depth = max_depths[name_mapping[sample]]
        elif len(max_depths) == 1:
            # Just one sample in the depth histogram; even though it does not
            # match, we assume that this is the correct table. This is because
            # manually generating files / renaming files would otherwise cause
            # failure when using 'MaxDepth: auto'.
            (cand_sample, max_depth), = max_depths.items()
            print_warn("        - Name in depths file not as expected; "
                       "found %r, not %r:"
                       % (cand_sample, sample))

    if max_depth is None:
        raise MakefileError("MaxDepth for %r not found in depth-histogram: %r"
                            % (sample, filename))
    elif max_depth == "NA":
        raise MakefileError("MaxDepth is not calculated for sample %r; "
                            "cannot determine MaxDepth values automatically."
                            % (filename,))
    elif not max_depth.isdigit():
        raise MakefileError("MaxDepth is not a valid for sample %r in %r; "
                            "expected integer, found %r."
                            % (sample, filename, max_depth))

    max_depth = int(max_depth)
Beispiel #16
0
            name = cand_sample.split('.', 1)[0]
            name_mapping[name] = cand_sample
            name_counts[name] = name_counts.get(name, 0) + 1

        if name_mapping.get(sample) == 1:
            # Sample name (with some extensions) found
            # This is typical if 'paleomix depths' has been run manually.
            max_depth = max_depths[name_mapping[sample]]
        elif len(max_depths) == 1:
            # Just one sample in the depth histogram; even though it does not
            # match, we assume that this is the correct table. This is because
            # manually generating files / renaming files would otherwise cause
            # failure when using 'MaxDepth: auto'.
            (cand_sample, max_depth), = max_depths.items()
            print_warn("        - Name in depths file not as expected; "
                       "found %r, not %r:"
                       % (cand_sample, sample))

    if max_depth is None:
        raise MakefileError("MaxDepth for %r not found in depth-histogram: %r"
                            % (sample, filename))
    elif max_depth == "NA":
        raise MakefileError("MaxDepth is not calculated for sample %r; "
                            "cannot determine MaxDepth values automatically."
                            % (filename,))
    elif not max_depth.isdigit():
        raise MakefileError("MaxDepth is not a valid for sample %r in %r; "
                            "expected integer, found %r."
                            % (sample, filename, max_depth))

    max_depth = int(max_depth)