Example #1
def add_bootstrap_support(destination, replicate, bootstrap):
    if not (replicate and bootstrap):
        return filter(None, (replicate, bootstrap))

    replicate_file = os.path.join(destination, "replicates.newick")
    bootstrap_file = os.path.join(destination, "bootstraps.newick")
    output_file = add_postfix(replicate_file, ".support")

    return NewickSupportNode(main_tree_files=replicate_file,
                             support_tree_files=bootstrap_file,
                             output_file=output_file,
                             dependencies=(bootstrap, replicate)),
Example #2
def add_bootstrap_support(destination, replicate, bootstrap):
    if not (replicate and bootstrap):
        return filter(None, (replicate, bootstrap))

    replicate_file = os.path.join(destination, "replicates.newick")
    bootstrap_file = os.path.join(destination, "bootstraps.newick")
    output_file    = add_postfix(replicate_file, ".support")

    return NewickSupportNode(main_tree_files    = replicate_file,
                             support_tree_files = bootstrap_file,
                             output_file        = output_file,
                             dependencies       = (bootstrap, replicate)),
Example #3
def _check_bam_sequences(options, mkfile, steps):
    """Check that the BAM files contains the reference sequences found in the
    FASTA file, matched by name and length; extra sequences are permitted. This
    check is only done if genotyping is to be carried out, to reduce the
    overhead of reading the BAM file headers.

    """
    if ("genotype" not in steps) and ("genotyping" not in steps):
        return

    print_info("    - Validating BAM files ...")
    bam_files = {}
    for regions in mkfile["Project"]["Regions"].itervalues():
        for sample in mkfile["Project"]["Samples"].itervalues():
            filename = os.path.join(
                options.samples_root,
                "%s.%s.bam" % (sample["Name"], regions["Prefix"]))
            if regions["Realigned"]:
                filename = add_postfix(filename, ".realigned")

            if os.path.exists(filename):
                bam_files[filename] = _collect_fasta_contigs(regions)

    for (filename, contigs) in bam_files.iteritems():
        with pysam.Samfile(filename) as handle:
            bam_contigs = dict(zip(handle.references, handle.lengths))

            for (contig, length) in contigs.iteritems():
                bam_length = bam_contigs.get(contig)

                if bam_length is None:
                    message = ("Reference sequence missing from BAM file; "
                               "BAM file aligned against different prefix?\n"
                               "    BAM file = %s\n    Sequence name = %s") \
                               % (filename, contig)
                    raise MakefileError(message)
                elif bam_length != length:
                    message = ("Length of reference sequence in FASTA differs "
                               "from length of sequence in BAM file; BAM file "
                               "aligned against different prefix?\n"
                               "    BAM file = %s\n"
                               "    Length in FASTA = %s\n"
                               "    Length in BAM = %s") \
                               % (filename, length, bam_length)
                    raise MakefileError(message)
Example #4
def _check_bam_sequences(options, mkfile, steps):
    """Check that the BAM files contains the reference sequences found in the
    FASTA file, matched by name and length; extra sequences are permitted. This
    check is only done if genotyping is to be carried out, to reduce the
    overhead of reading the BAM file headers.

    """
    if ("genotype" not in steps) and ("genotyping" not in steps):
        return

    print_info("    - Validating BAM files ...")
    bam_files = {}
    for regions in mkfile["Project"]["Regions"].itervalues():
        for sample in mkfile["Project"]["Samples"].itervalues():
            filename = os.path.join(options.samples_root, "%s.%s.bam" % (sample["Name"], regions["Prefix"]))
            if regions["Realigned"]:
                filename = add_postfix(filename, ".realigned")

            if os.path.exists(filename):
                bam_files[filename] = _collect_fasta_contigs(regions)

    for (filename, contigs) in bam_files.iteritems():
        with pysam.Samfile(filename) as handle:
            bam_contigs = dict(zip(handle.references, handle.lengths))

            for (contig, length) in contigs.iteritems():
                bam_length = bam_contigs.get(contig)

                if bam_length is None:
                    message = (
                        "Reference sequence missing from BAM file; "
                        "BAM file aligned against different prefix?\n"
                        "    BAM file = %s\n    Sequence name = %s"
                    ) % (filename, contig)
                    raise MakefileError(message)
                elif bam_length != length:
                    message = (
                        "Length of reference sequence in FASTA differs "
                        "from length of sequence in BAM file; BAM file "
                        "aligned against different prefix?\n"
                        "    BAM file = %s\n"
                        "    Length in FASTA = %s\n"
                        "    Length in BAM = %s"
                    ) % (filename, length, bam_length)
                    raise MakefileError(message)
Example #5
def build_regions_nodes(regions, padding, dependencies=()):
    destination = add_postfix(regions["BED"], ".padded_%ibp" % (padding, ))

    if not padding:
        return regions["BED"], dependencies

    if destination not in _BED_CACHE:
        dependencies = list(dependencies)
        dependencies.append(build_fasta_index_node(regions["FASTA"]))
        _BED_CACHE[destination] \
            = SlopBedNode(genome=regions["FASTA"] + ".fai",
                          infile=regions["BED"],
                          outfile=destination,
                          from_start=padding,
                          from_end=padding,
                          dependencies=dependencies)

    return destination, (_BED_CACHE[destination], )
Example #6
def build_regions_nodes(regions, padding, dependencies=()):
    destination = add_postfix(regions["BED"], ".padded_%ibp" % (padding,))

    if not padding:
        return regions["BED"], dependencies

    if destination not in _BED_CACHE:
        dependencies = list(dependencies)
        dependencies.append(build_fasta_index_node(regions["FASTA"]))
        _BED_CACHE[destination] \
            = SlopBedNode(genome=regions["FASTA"] + ".fai",
                          infile=regions["BED"],
                          outfile=destination,
                          from_start=padding,
                          from_end=padding,
                          dependencies=dependencies)

    return destination, (_BED_CACHE[destination],)
Example #7
def build_genotyping_bedfile_nodes(options, genotyping, sample, regions,
                                   dependencies):
    bamfile = "%s.%s.bam" % (sample, regions["Prefix"])
    bamfile = os.path.join(options.samples_root, bamfile)
    if regions["Realigned"]:
        bamfile = add_postfix(bamfile, ".realigned")

    prefix = regions["Genotypes"][sample]
    padding, bedfile, node = genotyping["Padding"], None, dependencies
    if not genotyping["GenotypeEntirePrefix"]:
        bedfile, nodes = \
            build_regions_nodes(regions, padding, dependencies)
        bai_node = build_bam_index_node(bamfile)
        dependencies = nodes + (bai_node, )
    else:
        prefix = os.path.join(os.path.dirname(prefix),
                              "%s.%s.TEMP" % (sample, regions["Prefix"]))

        dependencies += (build_bam_index_node(bamfile), )

    return prefix, bamfile, bedfile, dependencies
Example #8
def build_genotyping_bedfile_nodes(options, genotyping, sample, regions,
                                   dependencies):
    bamfile = "%s.%s.bam" % (sample, regions["Prefix"])
    bamfile = os.path.join(options.samples_root, bamfile)
    if regions["Realigned"]:
        bamfile = add_postfix(bamfile, ".realigned")

    prefix = regions["Genotypes"][sample]
    padding, bedfile, node = genotyping["Padding"], None, dependencies
    if not genotyping["GenotypeEntirePrefix"]:
        bedfile, nodes = \
            build_regions_nodes(regions, padding, dependencies)
        bai_node = build_bam_index_node(bamfile)
        dependencies = nodes + (bai_node,)
    else:
        prefix = os.path.join(os.path.dirname(prefix),
                              "%s.%s.TEMP" % (sample, regions["Prefix"]))

        dependencies += (build_bam_index_node(bamfile),)

    return prefix, bamfile, bedfile, dependencies
Example #9
def build_sampling_nodes(options, genotyping, sample, regions, dependencies):
    fasta_file = regions["Genotypes"][sample]
    pileup_file = swap_ext(fasta_file, ".pileup.bgz")

    padding = genotyping["Padding"]
    slop, node = build_regions_nodes(regions, padding, dependencies)

    bam_file = "%s.%s.bam" % (sample, regions["Prefix"])
    bam_file = os.path.join(options.samples_root, bam_file)
    if regions["Realigned"]:
        bam_file = add_postfix(bam_file, ".realigned")
    bai_node = build_bam_index_node(bam_file)

    genotype = GenotypeRegionsNode.customize(pileup_only=True,
                                             reference=regions["FASTA"],
                                             bedfile=slop,
                                             infile=bam_file,
                                             outfile=pileup_file,
                                             nbatches=options.samtools_max_threads,
                                             dependencies=node + (bai_node,))
    apply_samtools_options(genotype.command, genotyping["MPileup"],
                           "--mpileup-argument")
    genotype = genotype.build_node()

    tabix = TabixIndexNode(infile=pileup_file,
                           preset="pileup",
                           dependencies=genotype)

    builder = SampleRegionsNode(infile=pileup_file,
                                bedfile=regions["BED"],
                                outfile=fasta_file,
                                dependencies=tabix)

    faidx = FastaIndexNode(infile=fasta_file,
                           dependencies=builder)

    return (faidx,)
Example #10
def build_sampling_nodes(options, genotyping, sample, regions, dependencies):
    fasta_file = regions["Genotypes"][sample]
    pileup_file = swap_ext(fasta_file, ".pileup.bgz")

    padding = genotyping["Padding"]
    slop, node = build_regions_nodes(regions, padding, dependencies)

    bam_file = "%s.%s.bam" % (sample, regions["Prefix"])
    bam_file = os.path.join(options.samples_root, bam_file)
    if regions["Realigned"]:
        bam_file = add_postfix(bam_file, ".realigned")
    bai_node = build_bam_index_node(bam_file)

    genotype = GenotypeRegionsNode.customize(
        pileup_only=True,
        reference=regions["FASTA"],
        bedfile=slop,
        infile=bam_file,
        outfile=pileup_file,
        nbatches=options.samtools_max_threads,
        dependencies=node + (bai_node, ))
    apply_samtools_options(genotype.command, genotyping["MPileup"],
                           "--mpileup-argument")
    genotype = genotype.build_node()

    tabix = TabixIndexNode(infile=pileup_file,
                           preset="pileup",
                           dependencies=genotype)

    builder = SampleRegionsNode(infile=pileup_file,
                                bedfile=regions["BED"],
                                outfile=fasta_file,
                                dependencies=tabix)

    faidx = FastaIndexNode(infile=fasta_file, dependencies=builder)

    return (faidx, )
Example #11
def main(argv):
    config, args = parse_options(argv)
    if config is None:
        return 1

    # Get default options for bam_pipeline
    bam_config, _ = bam_cfg.parse_config(args)
    makefiles = bam_pipeline.read_makefiles(bam_config, args)
    # Build .fai files for reference .fasta files
    bam_pipeline.index_references(bam_config, makefiles)

    for makefile in makefiles:
        mkfile_fname = makefile["Statistics"]["Filename"]
        bam_config.destination = os.path.dirname(mkfile_fname)
        tasks = bam_pipeline.build_pipeline_full(bam_config, makefile,
                                                 return_nodes=False)

        make_dirs(config.destination)
        makefile_name = add_postfix(makefile["Statistics"]["Filename"],
                                    config.postfix)
        makefile_path = reroot_path(config.destination, makefile_name)
        if samefile(makefile["Statistics"]["Filename"], makefile_path):
            sys.stderr.write("ERROR: Would overwrite source makefile at %r\n" % (makefile_path,))
            sys.stderr.write("       Please set --destination and/or --output-name-postfix\n")
            sys.stderr.write("       before continuing.\n")
            return 1

        print("Writing makefile", makefile_path)

        found_prefix = False
        for prefix in makefile["Prefixes"]:
            if prefix != config.prefix:
                print("%sSkipping %s" % (_INDENTATION, prefix))
            else:
                found_prefix = True

        if not found_prefix:
            sys.stderr.write("\nERROR:\n")
            sys.stderr.write("Could not find prefix %r in %r! Aborting ...\n"
                             % (config.prefix, mkfile_fname))
            return 1

        with open(makefile_path, "w") as makefile_handle:
            bam_mkfile.print_header(dst=makefile_handle)
            makefile_handle.write("\n" * 3)

            for target in tasks:
                target_name = add_postfix(target.name, config.postfix)
                print("%sTarget: %s -> %s" % (_INDENTATION,
                                              target.name,
                                              target_name))

                makefile_handle.write('%s"%s":\n' % (_INDENTATION * 0,
                                                     target_name))
                for prefix in target.prefixes:
                    if prefix.name != config.prefix:
                        continue

                    for sample in prefix.samples:
                        print("%sSample: %s" % (_INDENTATION * 2, sample.name))

                        makefile_handle.write('%s"%s":\n' % (_INDENTATION * 1,
                                                             sample.name))

                        for library in sample.libraries:
                            print("%sLibrary: %s" % (_INDENTATION * 3,
                                                     library.name))
                            makefile_handle.write('%s"%s":\n'
                                                  % (_INDENTATION * 2,
                                                     library.name))

                            sink_cache = {}
                            destination = os.path.join(target_name,
                                                       "reads",
                                                       sample.name,
                                                       library.name)

                            for lane in library.lanes:
                                convert_reads(config, destination, lane, sink_cache)
                            ReadSink.close_all_sinks()

                            for lane_name in sorted(sink_cache):
                                makefile_handle.write('%s"%s":\n' % (_INDENTATION * 3, lane_name))
                                for (reads_type, sink) in sorted(sink_cache[lane_name].items()):
                                    makefile_handle.write('%s%s "%s"\n'
                                                          % (_INDENTATION * 4,
                                                             ("%s:" % (reads_type,)).ljust(20),
                                                             sink.filename))
                                makefile_handle.write("\n")
        print("\tDone ...")
        print()

    return 0
Example #12
def main(argv):
    config, args = parse_options(argv)
    if config is None:
        return 1

    # Get default options for bam_pipeline
    bam_config, _ = bam_cfg.parse_config(args)
    makefiles = bam_pipeline.read_makefiles(bam_config, args)
    # Build .fai files for reference .fasta files
    bam_pipeline.index_references(bam_config, makefiles)

    for makefile in makefiles:
        mkfile_fname = makefile["Statistics"]["Filename"]
        bam_config.destination = os.path.dirname(mkfile_fname)
        tasks = bam_pipeline.build_pipeline_full(bam_config,
                                                 makefile,
                                                 return_nodes=False)

        make_dirs(config.destination)
        makefile_name = add_postfix(makefile["Statistics"]["Filename"],
                                    config.postfix)
        makefile_path = reroot_path(config.destination, makefile_name)
        if samefile(makefile["Statistics"]["Filename"], makefile_path):
            sys.stderr.write("ERROR: Would overwrite source makefile at %r\n" %
                             (makefile_path, ))
            sys.stderr.write(
                "       Please set --destination and/or --output-name-postfix\n"
            )
            sys.stderr.write("       before continuing.\n")
            return 1

        print("Writing makefile", makefile_path)

        found_prefix = False
        for prefix in makefile["Prefixes"]:
            if prefix != config.prefix:
                print("%sSkipping %s" % (_INDENTATION, prefix))
            else:
                found_prefix = True

        if not found_prefix:
            sys.stderr.write("\nERROR:\n")
            sys.stderr.write("Could not find prefix %r in %r! Aborting ...\n" %
                             (config.prefix, mkfile_fname))
            return 1

        with open(makefile_path, "w") as makefile_handle:
            bam_mkfile.print_header(dst=makefile_handle)
            makefile_handle.write("\n" * 3)

            for target in tasks:
                target_name = add_postfix(target.name, config.postfix)
                print("%sTarget: %s -> %s" %
                      (_INDENTATION, target.name, target_name))

                makefile_handle.write('%s"%s":\n' %
                                      (_INDENTATION * 0, target_name))
                for prefix in target.prefixes:
                    if prefix.name != config.prefix:
                        continue

                    for sample in prefix.samples:
                        print("%sSample: %s" % (_INDENTATION * 2, sample.name))

                        makefile_handle.write('%s"%s":\n' %
                                              (_INDENTATION * 1, sample.name))

                        for library in sample.libraries:
                            print("%sLibrary: %s" %
                                  (_INDENTATION * 3, library.name))
                            makefile_handle.write(
                                '%s"%s":\n' % (_INDENTATION * 2, library.name))

                            sink_cache = {}
                            destination = os.path.join(target_name, "reads",
                                                       sample.name,
                                                       library.name)

                            for lane in library.lanes:
                                convert_reads(config, destination, lane,
                                              sink_cache)
                            ReadSink.close_all_sinks()

                            for lane_name in sorted(sink_cache):
                                makefile_handle.write(
                                    '%s"%s":\n' %
                                    (_INDENTATION * 3, lane_name))
                                for (reads_type, sink) in sorted(
                                        sink_cache[lane_name].items()):
                                    makefile_handle.write(
                                        '%s%s "%s"\n' %
                                        (_INDENTATION * 4,
                                         ("%s:" % (reads_type, )).ljust(20),
                                         sink.filename))
                                makefile_handle.write("\n")
        print("\tDone ...")
        print()

    return 0
Example #13
def test_add_postfix__dot_postfix():
    assert_equal(add_postfix("name.foo", ".pf"), "name.pf.foo")
Example #14
def test_add_postfix__no_ext__underscore_postfix():
    assert_equal(add_postfix("name", "_pf"), "name_pf")
Example #15
def test_add_postfix__no_ext__dot_postfix():
    assert_equal(add_postfix("name", ".pf"), "name.pf")
Example #16
def test_add_postfix__no_ext__no_postfix():
    assert_equal(add_postfix("name", ""), "name")
Example #17
def test_add_postfix__underscore_postfix():
    assert_equal(add_postfix("name.foo", "_pf"), "name_pf.foo")
Example #18
def test_add_postfix__no_postfix():
    assert_equal(add_postfix("name.foo", ""), "name.foo")