Beispiel #1
0
def _do_validate_hg_prefix(makefile, prefix, contigs, fatal):
    if not _is_invalid_hg_prefix(contigs):
        return

    message = \
        "Prefix appears to be a human genome, but chromosomes are ordered\n" \
        "lexically (chr1, chr10, chr11, ...), rather than numerically\n" \
        "(chr1, chr2, chr3, ...):\n\n" \
        "  Makefile = %s\n" \
        "  Prefix   = %s\n\n" \
        "GATK requires that human chromosomes are ordered numerically;\n%s\n" \
        "See the documentation at the GATK website for more information:\n  " \
        "http://www.broadinstitute.org/gatk/guide/article?id=1204\n"

    prefix_path = prefix["Path"]
    mkfile_path = makefile["Statistics"]["Filename"]
    if fatal:
        details = "Either disable GATK in the makefile, or fix the prefix."
        message %= (mkfile_path, prefix_path, details)

        raise MakefileError(message)
    else:
        details = \
            "You will not be able to use the resulting BAM file with GATK."
        message %= (mkfile_path, prefix_path, details)
        print_warn("\nWARNING:\n", message, file=sys.stderr, sep="")
Beispiel #2
0
def _update_filtering(mkfile):
    samples = mkfile["Project"]["Samples"]
    groups = mkfile["Project"]["Groups"]

    filtering = {}
    for (target, filter_by) in mkfile["Project"]["FilterSingletons"].iteritems():
        if target.startswith("<") and target.endswith(">"):
            raise MakefileError("Singleton-filtering must be specified per " "sample, not by groups: %r" % (target,))
        elif target not in samples:
            raise MakefileError("Unknown/Invalid sample specifed for singleton filtering: %r" % (target,))
        elif target in filter_by:
            raise MakefileError("Attempting to filter singleton in sample using itself as comparison: %r" % (target,))

        path = "Project:FilterSingletons:%s" % (target,)
        filtering[target] = _select_samples(filter_by, groups, samples, path)

        # Implicit inclusion is allowed, since that is useful in some cases,
        # where we want to filter a sample based on the group it is a member of
        if target in filtering[target]:
            # The target itself must be excluded, as including it is invalid
            filtering[target] = filtering[target] - set((target,))
            print_warn(
                "Warning: Sample %r is singleton-filtered using a "
                "group it is also a member of; this may be by mistake." % (target,)
            )

        if not filtering[target]:
            raise MakefileError("No samples specified by which to " "singleton-filter by for %r" % (target,))

    mkfile["Project"]["FilterSingletons"] = filtering
Beispiel #3
0
def _check_genders(mkfile):
    all_contigs = set()
    contigs_genders = set()
    regions_genders = set()
    for regions in mkfile["Project"]["Regions"].itervalues():
        all_contigs.update(_collect_fasta_contigs(regions))
        for contigs in regions["HomozygousContigs"].itervalues():
            contigs_genders.update(contigs)

        current_genders = set(regions["HomozygousContigs"])
        if not regions_genders:
            regions_genders = current_genders
        elif regions_genders != current_genders:
            raise MakefileError("List of genders for regions %r does not "
                                "match other regions" % (regions["Name"],))

    if not regions_genders:
        raise MakefileError("No genders have been specified in makefile; "
                            "please list all sample genders and assosiated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].itervalues():
        if sample["Gender"] not in regions_genders:
            genders = ", ".join(map(repr, regions_genders))
            message = "Sample %r has unknown gender %r; known genders are %s" \
                % (sample["Name"], sample["Gender"], genders)
            raise MakefileError(message)

    unknown_contigs = contigs_genders - all_contigs
    if unknown_contigs:
        print_warn("WARNING: Unknown contig(s) in 'HomozygousContigs':\n    - "
                   + "\n    - ".join(unknown_contigs))
        print_warn("Please verify that the list(s) of contigs is correct!")
Beispiel #4
0
def _check_genders(mkfile):
    all_contigs = set()
    contigs_genders = set()
    regions_genders = set()
    for regions in mkfile["Project"]["Regions"].itervalues():
        all_contigs.update(_collect_fasta_contigs(regions))

        for contigs in regions["HomozygousContigs"].itervalues():
            contigs_genders.update(contigs)

        current_genders = set(regions["HomozygousContigs"])
        if not regions_genders:
            regions_genders = current_genders
        elif regions_genders != current_genders:
            raise MakefileError("List of genders for regions %r does not "
                                "match other regions" % (regions["Name"], ))

    if not regions_genders:
        raise MakefileError("No genders have been specified in makefile; "
                            "please list all sample genders and assosiated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].itervalues():
        if sample["Gender"] not in regions_genders:
            genders = ", ".join(map(repr, regions_genders))
            message = "Sample %r has unknown gender %r; known genders are %s" \
                % (sample["Name"], sample["Gender"], genders)
            raise MakefileError(message)

    unknown_contigs = contigs_genders - all_contigs
    if unknown_contigs:
        print_warn(
            "WARNING: Unknown contig(s) in 'HomozygousContigs':\n    - " +
            "\n    - ".join(unknown_contigs))
        print_warn("Please verify that the list(s) of contigs is correct!")
Beispiel #5
0
def _do_validate_hg_prefix(makefile, prefix, contigs, fatal):
    if not _is_invalid_hg_prefix(contigs):
        return

    message = \
        "Prefix appears to be a human genome, but chromosomes are ordered\n" \
        "lexically (chr1, chr10, chr11, ...), rather than numerically\n" \
        "(chr1, chr2, chr3, ...):\n\n" \
        "  Makefile = %s\n" \
        "  Prefix   = %s\n\n" \
        "GATK requires that human chromosomes are ordered numerically;\n%s\n" \
        "See the documentation at the GATK website for more information:\n  " \
        "http://www.broadinstitute.org/gatk/guide/article?id=1204\n"

    prefix_path = prefix["Path"]
    mkfile_path = makefile["Statistics"]["Filename"]
    if fatal:
        details = "Either disable GATK in the makefile, or fix the prefix."
        message %= (mkfile_path, prefix_path, details)

        raise MakefileError(message)
    else:
        details = \
            "You will not be able to use the resulting BAM file with GATK."
        message %= (mkfile_path, prefix_path, details)
        print_warn("\nWARNING:\n", message, file=sys.stderr, sep="")
Beispiel #6
0
def _validate_makefiles_duplicate_files(makefiles):
    filenames = collections.defaultdict(list)
    for makefile in makefiles:
        iterator = _iterate_over_records(makefile)
        for (target, sample, library, barcode, record) in iterator:
            current_filenames = []
            if record["Type"] == "Raw":
                for raw_filenames in record["Data"].itervalues():
                    current_filenames.extend(raw_filenames)
            else:
                current_filenames.extend(record["Data"].values())

            for realpath in map(os.path.realpath, current_filenames):
                filenames[realpath].append((target, sample, library, barcode))

    has_overlap = {}
    for (filename, records) in filenames.iteritems():
        if len(records) > 1:
            has_overlap[filename] = list(set(records))

    by_records = sorted(zip(has_overlap.values(), has_overlap.keys()))
    for (records, pairs) in itertools.groupby(by_records, lambda x: x[0]):
        pairs = list(pairs)
        description = _describe_files_in_multiple_records(records, pairs)

        if len(set(record[0] for record in records)) != len(records):
            message = "Path included multiple times in target:\n"
            raise MakefileError(message + description)
        else:
            print_warn("WARNING: Path included in multiple targets:",
                       file=sys.stderr)
            print_warn(description, file=sys.stderr)
            print_warn(file=sys.stderr)
Beispiel #7
0
def _validate_makefiles_duplicate_files(makefiles):
    filenames = collections.defaultdict(list)
    for makefile in makefiles:
        iterator = _iterate_over_records(makefile)
        for (target, sample, library, barcode, record) in iterator:
            current_filenames = []
            if record["Type"] == "Raw":
                for raw_filenames in record["Data"].itervalues():
                    current_filenames.extend(raw_filenames)
            else:
                current_filenames.extend(record["Data"].values())

            for realpath in map(os.path.realpath, current_filenames):
                filenames[realpath].append((target, sample, library, barcode))

    has_overlap = {}
    for (filename, records) in filenames.iteritems():
        if len(records) > 1:
            has_overlap[filename] = list(set(records))

    by_records = sorted(zip(has_overlap.values(), has_overlap.keys()))
    for (records, pairs) in itertools.groupby(by_records, lambda x: x[0]):
        pairs = list(pairs)
        description = _describe_files_in_multiple_records(records, pairs)

        if len(set(record[0] for record in records)) != len(records):
            message = "Path included multiple times in target:\n"
            raise MakefileError(message + description)
        else:
            print_warn("WARNING: Path included in multiple targets:",
                       file=sys.stderr)
            print_warn(description, file=sys.stderr)
            print_warn(file=sys.stderr)
Beispiel #8
0
def _update_filtering(mkfile):
    samples = mkfile["Project"]["Samples"]
    groups = mkfile["Project"]["Groups"]

    filtering = {}
    for (target,
         filter_by) in mkfile["Project"]["FilterSingletons"].iteritems():
        if target.startswith("<") and target.endswith(">"):
            raise MakefileError("Singleton-filtering must be specified per "
                                "sample, not by groups: %r" % (target, ))
        elif target not in samples:
            raise MakefileError(
                "Unknown/Invalid sample specifed for singleton filtering: %r" %
                (target, ))
        elif target in filter_by:
            raise MakefileError(
                "Attempting to filter singleton in sample using itself as comparison: %r"
                % (target, ))

        path = "Project:FilterSingletons:%s" % (target, )
        filtering[target] = _select_samples(filter_by, groups, samples, path)

        # Implicit inclusion is allowed, since that is useful in some cases,
        # where we want to filter a sample based on the group it is a member of
        if target in filtering[target]:
            # The target itself must be excluded, as including it is invalid
            filtering[target] = filtering[target] - set((target, ))
            print_warn(
                "Warning: Sample %r is singleton-filtered using a "
                "group it is also a member of; this may be by mistake." %
                (target, ))

        if not filtering[target]:
            raise MakefileError("No samples specified by which to "
                                "singleton-filter by for %r" % (target, ))

    mkfile["Project"]["FilterSingletons"] = filtering
Beispiel #9
0
def _validate_makefile_adapters(makefile):
    """Checks for the default adapter sequences specified in the wrong
    orientation for AdapterRemoval, which is a typical mistake when using
    the --pcr2 option.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    tests = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": adapter_2,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(adapter_2)
    }

    def check_options(options, results):
        for key, value in tests.iteritems():
            if options.get(key) == value:
                results[key] = True

    results = dict.fromkeys(tests, False)
    for (_, _, _, _, record) in _iterate_over_records(makefile):
        adapterrm_opt = record.get("Options", {}).get("AdapterRemoval", {})
        check_options(adapterrm_opt, results)

    adapterrm_opt = makefile.get("Options", {}).get("AdapterRemoval", {})
    check_options(adapterrm_opt, results)

    if any(results.itervalues()):
        print_warn(
            "WARNING: An adapter specified for AdapterRemoval "
            "corresponds to the default sequence, but is reverse "
            "complemented. Please make sure that this is intended! ",
            end="")

        if results["--pcr2"]:
            print_warn("For --pcr2, the sequence given should be the "
                       "reverse complement of the sequence observed in the "
                       "mate 2 FASTQ file.\n")

        if results["--adapter2"]:
            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                       "should be exactly as observed in the FASTQ reads.\n")
Beispiel #10
0
def _validate_makefile_adapters(makefile):
    """Checks for the default adapter sequences specified in the wrong
    orientation for AdapterRemoval, which is a typical mistake when using
    the --pcr2 option.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    tests = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": adapter_2,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(adapter_2)
    }

    def check_options(options, results):
        for key, value in tests.iteritems():
            if options.get(key) == value:
                results[key] = True

    results = dict.fromkeys(tests, False)
    for (_, _, _, _, record) in _iterate_over_records(makefile):
        adapterrm_opt = record.get("Options", {}).get("AdapterRemoval", {})
        check_options(adapterrm_opt, results)

    adapterrm_opt = makefile.get("Options", {}).get("AdapterRemoval", {})
    check_options(adapterrm_opt, results)

    if any(results.itervalues()):
        print_warn("WARNING: An adapter specified for AdapterRemoval "
                   "corresponds to the default sequence, but is reverse "
                   "complemented. Please make sure that this is intended! ",
                   end="")

        if results["--pcr2"]:
            print_warn("For --pcr2, the sequence given should be the "
                       "reverse complement of the sequence observed in the "
                       "mate 2 FASTQ file.\n")

        if results["--adapter2"]:
            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                       "should be exactly as observed in the FASTQ reads.\n")