Exemple #1
0
def _build_table_rows(args):
    """Build one table row for every raw-data lane listed in the makefiles.

    Every file in `args.makefile` is parsed with `_parse_makefile`, which
    yields (target, sample, library, lane, path) records. Records whose path
    is a dict are reported as pre-processed data and skipped; every other
    record becomes a dict holding fixed default columns plus the MAKEFILE_*
    values taken from the record.

    Returns the list of row dicts, in the order the records were parsed.
    """
    # Columns with the same default value in every row
    defaults = (
        ("sample_alias", "*"),
        ("instrument_model", "*"),
        ("library_source", "GENOMIC"),
        ("library_selection", "RANDOM"),
        ("library_strategy", "WGS"),
        ("design_description", ""),
        ("library_construction_protocol", ""),
        ("insert_size", "0"),
    )

    table = []
    for makefile in args.makefile:
        for record in _parse_makefile(makefile):
            target, sample, library, lane, path = record

            if isinstance(path, dict):
                location = (target, sample, library, lane)
                ui.print_err("WARNING: Found pre-processed data "
                             "at %s:%s:%s:%s; cannot collect raw "
                             "FASTQ data." % location)
                continue

            entry = dict(defaults)
            entry["MAKEFILE_TARGET"] = target
            entry["MAKEFILE_SAMPLE"] = sample
            entry["MAKEFILE_LIBRARY"] = library
            entry["MAKEFILE_LANE"] = lane
            entry["MAKEFILE_PATH"] = path
            table.append(entry)

    return table
Exemple #2
0
def _read_sample_table(config, filename):
    """Parses a 2 - 3 column tab-separated table containing, on each row, a
    name to be used for a sample in the first column, and then the paths to
    either one or two BAM files, which must represent a single nuclear or
    a single mitochondrial alignment (2 columns), or both (3 columns).

    Blank lines and lines starting with '#' are skipped. The result is stored
    in `config.samples` as a dict mapping each sample name to a dict with the
    keys "Root" (`config.destination` joined with the name) and "Files" (the
    list of paths on that row).

    Returns True on success. If a row has the wrong number of columns or a
    sample name is repeated, an error is printed and None is returned.
    """
    print_info("Reading table of samples from %r" % (filename,))

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Built as a real list (rather than with filter()) so that len()
            # and slicing behave the same on Python 2 and Python 3; empty
            # columns are dropped.
            columns = line.rstrip('\r\n').split('\t')
            fields = [column for column in columns if column]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i; "
                          "expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return

            name = fields[0]
            if name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {"Root": os.path.join(config.destination, name),
                             "Files": fields[1:]}

    return True
Exemple #3
0
def _build_table_rows(args):
    """Build one table row for every raw-data lane listed in the makefiles.

    Every file in `args.makefile` is parsed with `_parse_makefile`, which
    yields (target, sample, library, lane, path) records. Records whose path
    is a dict are reported as pre-processed data and skipped; every other
    record becomes a dict holding fixed default columns plus the MAKEFILE_*
    values taken from the record.

    Returns the list of row dicts, in the order the records were parsed.
    """
    # Columns with the same default value in every row
    defaults = (
        ("sample_alias", "*"),
        ("instrument_model", "*"),
        ("library_source", "GENOMIC"),
        ("library_selection", "RANDOM"),
        ("library_strategy", "WGS"),
        ("design_description", ""),
        ("library_construction_protocol", ""),
        ("insert_size", "0"),
    )

    table = []
    for makefile in args.makefile:
        for record in _parse_makefile(makefile):
            target, sample, library, lane, path = record

            if isinstance(path, dict):
                location = (target, sample, library, lane)
                ui.print_err("WARNING: Found pre-processed data "
                             "at %s:%s:%s:%s; cannot collect raw "
                             "FASTQ data." % location)
                continue

            entry = dict(defaults)
            entry["MAKEFILE_TARGET"] = target
            entry["MAKEFILE_SAMPLE"] = sample
            entry["MAKEFILE_LIBRARY"] = library
            entry["MAKEFILE_LANE"] = lane
            entry["MAKEFILE_PATH"] = path
            table.append(entry)

    return table
Exemple #4
0
def main(argv):
    """Main function; takes a list of arguments but excluding sys.argv[0].

    The arguments are parsed with `parse_args` and the function selected by
    the parsed arguments (`args.function`) is called with them; its return
    value is returned. An ENAError raised by that call is reported through
    `ui.print_err` instead of being propagated.
    """
    args = parse_args(argv)

    # 'except ... as ...' is valid on Python 2.6+ and mandatory on Python 3,
    # where the older 'except Type, name' form is a syntax error.
    try:
        return args.function(args)
    except ENAError as error:
        ui.print_err("FATAL ERROR:\n  %s" % (error,))
Exemple #5
0
def main(argv):
    """Main function; takes a list of arguments but excluding sys.argv[0].

    The arguments are parsed with `parse_args` and the function selected by
    the parsed arguments (`args.function`) is called with them; its return
    value is returned. An ENAError raised by that call is reported through
    `ui.print_err` instead of being propagated.
    """
    args = parse_args(argv)

    # 'except ... as ...' is valid on Python 2.6+ and mandatory on Python 3,
    # where the older 'except Type, name' form is a syntax error.
    try:
        return args.function(args)
    except ENAError as error:
        ui.print_err("FATAL ERROR:\n  %s" % (error,))
Exemple #6
0
def main_wrapper(process_func, argv, ext):
    """Parse the command line and load the optional BED regions.

    `args.regions` is set to None unless `args.regions_fpath` is given, in
    which case it holds the result of `collect_bed_regions`. If that call
    raises ValueError, an error is printed and 1 is returned.

    NOTE(review): `process_func` is not used in the portion of the function
    shown here; confirm against the complete definition.
    """
    args = parse_arguments(argv, ext)
    args.regions = None
    if args.regions_fpath:
        # 'except ... as ...' works on Python 2.6+ and Python 3 alike
        try:
            args.regions = collect_bed_regions(args.regions_fpath)
        except ValueError as error:
            print_err("ERROR: Failed to parse BED file %r:\n%s" %
                      (args.regions_fpath, error))
            return 1
Exemple #7
0
def main_wrapper(process_func, argv, ext):
    """Parse the command line and load the optional BED regions.

    `args.regions` is set to None unless `args.regions_fpath` is given, in
    which case it holds the result of `collect_bed_regions`. If that call
    raises ValueError, an error is printed and 1 is returned.

    NOTE(review): `process_func` is not used in the portion of the function
    shown here; confirm against the complete definition.
    """
    args = parse_arguments(argv, ext)
    args.regions = None
    if args.regions_fpath:
        # 'except ... as ...' works on Python 2.6+ and Python 3 alike
        try:
            args.regions = collect_bed_regions(args.regions_fpath)
        except ValueError as error:
            print_err("ERROR: Failed to parse BED file %r:\n%s"
                      % (args.regions_fpath, error))
            return 1
Exemple #8
0
def main(argv):
    """Open the database and the BAM file named on the command line.

    The database at `args.database` is loaded as a `database.ZonkeyDB` and
    `args.bam` is opened with `pysam.Samfile`. If opening the BAM raises
    IOError or ValueError, the error is printed and 1 is returned.

    NOTE(review): `sequences` and `handle` are not used in the portion of the
    function shown here; confirm against the complete definition.
    """
    args = parse_args(argv)
    data = database.ZonkeyDB(args.database)
    sequences = data.mitochondria

    # 'except ... as ...' works on Python 2.6+ and Python 3 alike
    try:
        handle = pysam.Samfile(args.bam)
    except (IOError, ValueError) as error:
        ui.print_err("Error reading BAM file: %s" % (error,))
        return 1
Exemple #9
0
def main(argv):
    """Open the database and the BAM file named on the command line.

    The database at `args.database` is loaded as a `database.ZonkeyDB` and
    `args.bam` is opened with `pysam.Samfile`. If opening the BAM raises
    IOError or ValueError, the error is printed and 1 is returned.

    NOTE(review): `sequences` and `handle` are not used in the portion of the
    function shown here; confirm against the complete definition.
    """
    args = parse_args(argv)
    data = database.ZonkeyDB(args.database)
    sequences = data.mitochondria

    # 'except ... as ...' works on Python 2.6+ and Python 3 alike
    try:
        handle = pysam.Samfile(args.bam)
    except (IOError, ValueError) as error:
        ui.print_err("Error reading BAM file: %s" % (error,))
        return 1
Exemple #10
0
def parse_run_config(config, args):
    """Validate the positional arguments and load the database.

    Between 2 and 4 positional arguments are required; otherwise the usage
    text is printed and None is returned. The first argument is stored as
    `config.tablefile` and opened as a `database.ZonkeyDB`, which is stored
    as `config.database`. If the database raises ZonkeyDBError, the error is
    printed and None is returned.
    """
    if not (2 <= len(args) <= 4):
        print_usage()
        return

    config.multisample = False
    config.tablefile = args[0]

    # 'except ... as ...' works on Python 2.6+ and Python 3 alike
    try:
        config.database = database.ZonkeyDB(config.tablefile)
    except database.ZonkeyDBError as error:
        print_err("ERROR reading database %r: %s" % (config.tablefile, error))
        return
Exemple #11
0
    def __init__(self, config, prefix, samples, features, target):
        """Collect the BAMs and nodes of all samples for one prefix.

        `prefix` is a mapping providing at least "Name", "IndexFormat" and
        "Path", and optionally "Label" and "RegionsOfInterest". `features`
        is a mapping with the flags "RawBAM" and "RealignedBAM" selecting
        which BAMs are built.

        If realigned BAMs are requested for a prefix whose index format is
        '.csi', a warning is printed once per prefix path (tracked in
        `_CSI_WARNINGS`) and raw BAMs are built instead. If neither kind of
        BAM is built, the BAMs of the individual samples are used as-is.
        """
        self.name = prefix["Name"]
        self.label = prefix.get("Label") or self.name
        self.roi = prefix.get("RegionsOfInterest", {})

        self.samples = safe_coerce_to_tuple(samples)
        self.folder = config.destination
        self.target = target

        # The mapping is handed to update() directly, as is done for
        # self.bams further down; this avoids the Python 2-only iteritems().
        files_and_nodes = {}
        for sample in self.samples:
            files_and_nodes.update(sample.bams)

        self.datadup_check = self._build_dataduplication_node(
            prefix, files_and_nodes)

        build_raw_bam = features["RawBAM"]
        build_realigned_bam = features["RealignedBAM"]
        if build_realigned_bam and prefix['IndexFormat'] == '.csi':
            if prefix['Path'] not in _CSI_WARNINGS:
                # The format argument is a 1-tuple so that the value is
                # always treated as a single item by the '%' operator.
                ui.print_err("\nWARNING: Realigned BAMs enabled for reference "
                             "genome %r, but the file contains sequences too "
                             "large for GATK, which does not support .csi "
                             "index files. Raw BAMs will be built instead of "
                             "realigned BAMs, for this reference sequence." %
                             (prefix['Path'],))

                # TODO: Add reference to FAQ when written.

            _CSI_WARNINGS.add(prefix['Path'])
            build_realigned_bam = False
            build_raw_bam = True

        self.bams = {}
        if build_raw_bam:
            self.bams.update(
                self._build_raw_bam(config, prefix, files_and_nodes))

        if build_realigned_bam:
            self.bams.update(
                self._build_realigned_bam(config, prefix, files_and_nodes))

        # Nothing was built for this prefix; fall back to the sample BAMs
        if not self.bams:
            for sample in self.samples:
                self.bams.update(sample.bams)

        nodes = [self.datadup_check]
        for sample in self.samples:
            nodes.extend(sample.nodes)
        self.nodes = tuple(nodes)
Exemple #12
0
    def validate_bam(self, filename):
        """Validates a sample BAM file, checking that it is either a valid
        mitochondrial BAM (aligned against one of the reference mt sequences),
        or that it is a valid nuclear BAM (aligned against the reference).

        Returns one of INVALID_BAMFILE, NUC_BAMFILE, and MITO_BAMFILE.

        NOTE(review): in the code shown here, a BAM that cannot be opened
        (ValueError or IOError from pysam) results in an error message and a
        bare return, i.e. None; confirm the documented return values.
        """
        print_info("  - Validating BAM file %r ... " % (filename,))

        # 'except ... as ...' works on Python 2.6+ and Python 3 alike
        try:
            handle = pysam.Samfile(filename)
        except (ValueError, IOError) as error:
            print_err("Error reading BAM: %s" % (error,))
            return
Exemple #13
0
def parse_run_config(config, args):
    """Validate the positional arguments and load the database.

    Between 2 and 4 positional arguments are required; otherwise the usage
    text is printed and None is returned. The first argument is stored as
    `config.tablefile` and opened as a `database.ZonkeyDB`, which is stored
    as `config.database`. If the database raises ZonkeyDBError, the error is
    printed and None is returned.
    """
    if not (2 <= len(args) <= 4):
        print_usage()
        return

    config.multisample = False
    config.tablefile = args[0]

    # 'except ... as ...' works on Python 2.6+ and Python 3 alike
    try:
        config.database = database.ZonkeyDB(config.tablefile)
    except database.ZonkeyDBError as error:
        print_err("ERROR reading database %r: %s"
                  % (config.tablefile, error))
        return
Exemple #14
0
    def validate_bam(self, filename):
        """Validates a sample BAM file, checking that it is either a valid
        mitochondrial BAM (aligned against one of the reference mt sequences),
        or that it is a valid nuclear BAM (aligned against the reference).

        Returns one of INVALID_BAMFILE, NUC_BAMFILE, and MITO_BAMFILE.

        NOTE(review): in the code shown here, a BAM that cannot be opened
        (ValueError or IOError from pysam) results in an error message and a
        bare return, i.e. None; confirm the documented return values.
        """
        print_info("  - Validating BAM file %r ... " % (filename, ))

        # 'except ... as ...' works on Python 2.6+ and Python 3 alike
        try:
            handle = pysam.Samfile(filename)
        except (ValueError, IOError) as error:
            print_err("Error reading BAM: %s" % (error, ))
            return
Exemple #15
0
    def __init__(self, config, prefix, samples, features, target):
        """Collect the BAMs and nodes of all samples for one prefix.

        `prefix` is a mapping providing at least "Name", "IndexFormat" and
        "Path", and optionally "Label" and "RegionsOfInterest". `features`
        is a mapping with the flags "RawBAM" and "RealignedBAM" selecting
        which BAMs are built.

        If realigned BAMs are requested for a prefix whose index format is
        '.csi', a warning is printed once per prefix path (tracked in
        `_CSI_WARNINGS`) and raw BAMs are built instead. If neither kind of
        BAM is built, the BAMs of the individual samples are used as-is.
        """
        self.name = prefix["Name"]
        self.label = prefix.get("Label") or self.name
        self.roi = prefix.get("RegionsOfInterest", {})

        self.samples = safe_coerce_to_tuple(samples)
        self.folder = config.destination
        self.target = target

        # The mapping is handed to update() directly, as is done for
        # self.bams further down; this avoids the Python 2-only iteritems().
        files_and_nodes = {}
        for sample in self.samples:
            files_and_nodes.update(sample.bams)

        self.datadup_check = self._build_dataduplication_node(
            prefix, files_and_nodes)

        build_raw_bam = features["RawBAM"]
        build_realigned_bam = features["RealignedBAM"]
        if build_realigned_bam and prefix['IndexFormat'] == '.csi':
            if prefix['Path'] not in _CSI_WARNINGS:
                # The format argument is a 1-tuple so that the value is
                # always treated as a single item by the '%' operator.
                ui.print_err("\nWARNING: Realigned BAMs enabled for reference "
                             "genome %r, but the file contains sequences too "
                             "large for GATK, which does not support .csi "
                             "index files. Raw BAMs will be built instead of "
                             "realigned BAMs, for this reference sequence."
                             % (prefix['Path'],))

                # TODO: Add reference to FAQ when written.

            _CSI_WARNINGS.add(prefix['Path'])
            build_realigned_bam = False
            build_raw_bam = True

        self.bams = {}
        if build_raw_bam:
            self.bams.update(self._build_raw_bam(
                config, prefix, files_and_nodes))

        if build_realigned_bam:
            self.bams.update(self._build_realigned_bam(
                config, prefix, files_and_nodes))

        # Nothing was built for this prefix; fall back to the sample BAMs
        if not self.bams:
            for sample in self.samples:
                self.bams.update(sample.bams)

        nodes = [self.datadup_check]
        for sample in self.samples:
            nodes.extend(sample.nodes)
        self.nodes = tuple(nodes)
Exemple #16
0
def _validate_mito_bam(data, handle, info):
    """Look for a mitochondrial contig from the database in the BAM.

    The first BAM contig whose name is a key of `data.mitochondria` is
    checked: its length must equal the length of the database sequence of
    that name, not counting '-' characters. The BAM is indexed if no '.bai'
    file is found next to it, and idxstats is used to check that the contig
    has hits. If so, `info.mt_contig`, `info.mt_length` and
    `info.mt_padding` are set.

    Returns False only if the contig length does not match the database;
    True otherwise, including when the database has no mitochondrial data,
    when no BAM contig is known, or when the contig has no hits (in which
    case a warning is printed and `info` is left untouched).
    """
    mitochondria = data.mitochondria
    if mitochondria is None:
        # No mitochondrial data .. skip phylogeny
        return True

    contig_names = handle.references
    shortest = min(len(entry.sequence)
                   for entry in mitochondria.itervalues())

    for contig, contig_length in zip(contig_names, handle.lengths):
        if contig not in mitochondria:
            continue

        reference = mitochondria[contig].sequence
        expected_length = len(reference) - reference.count("-")

        if bam_length_differs(contig_length, expected_length):
            message = ("ERROR: Length of mitochondrial contig %r (%i bp) "
                       "does not match the length of the corresponding "
                       "sequence in the database (%i bp)")
            print_err(message % (contig, contig_length, expected_length))
            return False

        bam_path = handle.filename
        has_index = (os.path.exists(bam_path + '.bai')
                     or os.path.exists(swap_ext(bam_path, '.bai')))
        if not has_index:
            print_info('    - Attempting to index BAM file %r!' % (bam_path,))
            pysam.index(bam_path)

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        idxstats_text = "".join(pysam.idxstats(bam_path))
        for row in idxstats_text.split('\n'):
            row = row.strip()
            if row:
                name, _, hits, _ = row.split('\t')
                if name == contig and int(hits) == 0:
                    message = ("WARNING: Mitochondrial BAM (%r) does not "
                               "contain any reads aligned to contig %r; "
                               "inferring an phylogeny is not possible.")
                    print_err(message % (bam_path, name))
                    return True

        info.mt_contig = contig
        info.mt_length = contig_length
        info.mt_padding = len(reference) - shortest

        # Only the first known contig is considered
        return True

    return True


def bam_length_differs(observed, expected):
    """Return True if the observed contig length is not the expected one."""
    return observed != expected
Exemple #17
0
def _validate_mito_bam(data, handle, info):
    """Look for a mitochondrial contig from the database in the BAM.

    The first BAM contig whose name is a key of `data.mitochondria` is
    checked: its length must equal the length of the database sequence of
    that name, not counting '-' characters. The BAM is indexed if no '.bai'
    file is found next to it, and idxstats is used to check that the contig
    has hits. If so, `info.mt_contig`, `info.mt_length` and
    `info.mt_padding` are set.

    Returns False only if the contig length does not match the database;
    True otherwise, including when the database has no mitochondrial data,
    when no BAM contig is known, or when the contig has no hits (in which
    case a warning is printed and `info` is left untouched).
    """
    mitochondria = data.mitochondria
    if mitochondria is None:
        # No mitochondrial data .. skip phylogeny
        return True

    contig_names = handle.references
    shortest = min(len(entry.sequence)
                   for entry in mitochondria.itervalues())

    for contig, contig_length in zip(contig_names, handle.lengths):
        if contig not in mitochondria:
            continue

        reference = mitochondria[contig].sequence
        expected_length = len(reference) - reference.count("-")

        if contig_length != expected_length:
            message = ("ERROR: Length of mitochondrial contig %r (%i bp) "
                       "does not match the length of the corresponding "
                       "sequence in the database (%i bp)")
            print_err(message % (contig, contig_length, expected_length))
            return False

        bam_path = handle.filename
        has_index = (os.path.exists(bam_path + '.bai')
                     or os.path.exists(swap_ext(bam_path, '.bai')))
        if not has_index:
            print_info('    - Attempting to index BAM file %r!' % (bam_path,))
            pysam.index(bam_path)

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        idxstats_text = "".join(pysam.idxstats(bam_path))
        for row in idxstats_text.split('\n'):
            row = row.strip()
            if row:
                name, _, hits, _ = row.split('\t')
                if name == contig and int(hits) == 0:
                    message = ("WARNING: Mitochondrial BAM (%r) does not "
                               "contain any reads aligned to contig %r; "
                               "inferring an phylogeny is not possible.")
                    print_err(message % (bam_path, name))
                    return True

        info.mt_contig = contig
        info.mt_length = contig_length
        info.mt_padding = len(reference) - shortest

        # Only the first known contig is considered
        return True

    return True
Exemple #18
0
def parse_config(argv):
    """Parse the command line and resolve the requested command.

    The first positional argument is looked up in `_CMD_ALIASES` and stored
    as `config.command`; the "dryrun" command is turned into "run" with
    `config.dry_run` set to True. The remaining positional arguments are
    passed on to `parse_run_config`, whose result is returned.

    Returns None, after printing the usage text or an error, if no positional
    arguments were given or the command is unknown.
    """
    config, positional = _parse_arguments(argv)
    if not positional:
        print_usage()
        return None

    command = _CMD_ALIASES.get(positional[0])
    config.command = command
    if command is None:
        print_err("ERROR: Unknown command %r" % (positional[0],))
        return None

    if command == "dryrun":
        config.command = "run"
        config.dry_run = True

    return parse_run_config(config, positional[1:])
Exemple #19
0
def parse_config(argv):
    """Parse the command line and resolve the requested command.

    The first positional argument is looked up in `_CMD_ALIASES` and stored
    as `config.command`; the "dryrun" command is turned into "run" with
    `config.dry_run` set to True. The remaining positional arguments are
    passed on to `parse_run_config`, whose result is returned.

    Returns None, after printing the usage text or an error, if no positional
    arguments were given or the command is unknown.
    """
    config, positional = _parse_arguments(argv)
    if not positional:
        print_usage()
        return None

    command = _CMD_ALIASES.get(positional[0])
    config.command = command
    if command is None:
        print_err("ERROR: Unknown command %r" % (positional[0],))
        return None

    if command == "dryrun":
        config.command = "run"
        config.dry_run = True

    return parse_run_config(config, positional[1:])
Exemple #20
0
def _validate_nuclear_bam(data, handle, info):
    """Check that the BAM contains the nuclear contigs listed in the database.

    BAM contig names are converted with `contig_name_to_plink_name` and
    compared against `data.contigs`, a mapping of contig name to a dict with
    a "Size" entry. If every database contig is present with the expected
    size, `info.nuclear` is set to True.

    Returns False, after printing an error, if a contig has the wrong length
    or if only some of the database contigs are present. Returns True
    otherwise, including when none of the contigs are present (in which case
    `info.nuclear` is not set).
    """
    # Check that chromosomes are of expected size; unused chroms are ignored.
    bam_contigs = dict(
        zip(map(contig_name_to_plink_name, handle.references), handle.lengths))
    ref_contigs = data.contigs

    contigs_found = {}
    for name, stats in sorted(ref_contigs.items()):
        if name not in bam_contigs:
            contigs_found[name] = False
        elif bam_contigs[name] != stats["Size"]:
            # The expected length is the one from the data file; the length
            # found is the one recorded in the BAM header.
            print_err("\nERROR: Chrom %r in the BAM does not match the "
                      "length specified in data file:\n"
                      "    - Expected: %i\n"
                      "    - Found: %i" %
                      (name, stats["Size"], bam_contigs[name]))

            return False
        else:
            contigs_found[name] = True

    if any(contigs_found.values()):
        if not all(contigs_found.values()):
            print_err("\nERROR: Not all nuclear chromosomes found in BAM:")
            for name in sorted(ref_contigs):
                is_found = "Found" if contigs_found[name] else "Not found!"
                print_err("  - %s: %s" % (name, is_found))

            return False
        else:
            info.nuclear = True

    return True
Exemple #21
0
def _validate_nuclear_bam(data, handle, info):
    """Check that the BAM contains the nuclear contigs listed in the database.

    BAM contig names are converted with `contig_name_to_plink_name` and
    compared against `data.contigs`, a mapping of contig name to a dict with
    a "Size" entry. If every database contig is present with the expected
    size, `info.nuclear` is set to True.

    Returns False, after printing an error, if a contig has the wrong length
    or if only some of the database contigs are present. Returns True
    otherwise, including when none of the contigs are present (in which case
    `info.nuclear` is not set).
    """
    # Check that chromosomes are of expected size; unused chroms are ignored.
    bam_contigs = dict(zip(map(contig_name_to_plink_name, handle.references),
                           handle.lengths))
    ref_contigs = data.contigs

    contigs_found = {}
    for name, stats in sorted(ref_contigs.items()):
        if name not in bam_contigs:
            contigs_found[name] = False
        elif bam_contigs[name] != stats["Size"]:
            # The expected length is the one from the data file; the length
            # found is the one recorded in the BAM header.
            print_err("\nERROR: Chrom %r in the BAM does not match the "
                      "length specified in data file:\n"
                      "    - Expected: %i\n"
                      "    - Found: %i"
                      % (name, stats["Size"], bam_contigs[name]))

            return False
        else:
            contigs_found[name] = True

    if any(contigs_found.values()):
        if not all(contigs_found.values()):
            print_err("\nERROR: Not all nuclear chromosomes found in BAM:")
            for name in sorted(ref_contigs):
                is_found = "Found" if contigs_found[name] else "Not found!"
                print_err("  - %s: %s" % (name, is_found))

            return False
        else:
            info.nuclear = True

    return True
Exemple #22
0
def _read_sample_table(config, filename):
    """Parses a 2 - 3 column tab-separated table containing, on each row, a
    name to be used for a sample in the first column, and then the paths to
    either one or two BAM files, which must represent a single nuclear or
    a single mitochondrial alignment (2 columns), or both (3 columns).

    Blank lines and lines starting with '#' are skipped, and whitespace
    around each column is removed. Sample names may only contain ASCII
    letters, digits, '.', '-' and '_'. The result is stored in
    `config.samples` as a dict mapping each sample name to a dict with the
    keys "Root" (`config.destination` joined with the name) and "Files" (the
    list of paths on that row).

    Returns True on success. If a row has the wrong number of columns, or a
    sample name is invalid or repeated, an error is printed and None is
    returned.
    """
    print_info("Reading table of samples from %r" % (filename, ))
    # ascii_letters is locale-independent and available on Python 2 and 3
    valid_characters = frozenset(
        string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Built as a real list (rather than with filter()) so that len()
            # and slicing behave the same on Python 2 and Python 3.
            columns = (column.strip() for column in line.split('\t'))
            fields = [column for column in columns if column]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i: "
                          "Expected 2 or 3 columns, found %i; please "
                          "correct file before continuing." %
                          (filename, linenum, len(fields)))
                return

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Sorted so that the message is the same on every run
                print_err("Error reading sample table (%r) at line %i: "
                          "Sample name contains illegal character(s). Only "
                          "letters, numbers, and '-', '_', and '.' are "
                          "allowed, but found %r in name %r " %
                          (filename, linenum,
                           "".join(sorted(invalid_letters)), name))
                return
            elif name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {
                "Root": os.path.join(config.destination, name),
                "Files": fields[1:]
            }

    return True
Exemple #23
0
def _read_sample_table(config, filename):
    """Parses a 2 - 3 column tab-separated table containing, on each row, a
    name to be used for a sample in the first column, and then the paths to
    either one or two BAM files, which must represent a single nuclear or
    a single mitochondrial alignment (2 columns), or both (3 columns).

    Blank lines and lines starting with '#' are skipped, and whitespace
    around each column is removed. Sample names may only contain ASCII
    letters, digits, '.', '-' and '_'. The result is stored in
    `config.samples` as a dict mapping each sample name to a dict with the
    keys "Root" (`config.destination` joined with the name) and "Files" (the
    list of paths on that row).

    Returns True on success. If a row has the wrong number of columns, or a
    sample name is invalid or repeated, an error is printed and None is
    returned.
    """
    print_info("Reading table of samples from %r" % (filename,))
    # ascii_letters is locale-independent and available on Python 2 and 3
    valid_characters = frozenset(
        string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Built as a real list (rather than with filter()) so that len()
            # and slicing behave the same on Python 2 and Python 3.
            columns = (column.strip() for column in line.split('\t'))
            fields = [column for column in columns if column]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i: "
                          "Expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Sorted so that the message is the same on every run
                print_err("Error reading sample table (%r) at line %i: "
                          "Sample name contains illegal character(s). Only "
                          "letters, numbers, and '-', '_', and '.' are "
                          "allowed, but found %r in name %r "
                          % (filename, linenum,
                             "".join(sorted(invalid_letters)), name))
                return
            elif name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {"Root": os.path.join(config.destination, name),
                             "Files": fields[1:]}

    return True
Exemple #24
0
    args = parse_args(argv)
    data = database.ZonkeyDB(args.database)
    sequences = data.mitochondria

    try:
        handle = pysam.Samfile(args.bam)
    except (IOError, ValueError), error:
        ui.print_err("Error reading BAM file: %s" % (error,))
        return 1

    with handle:
        bam_info = data.validate_bam_handle(handle)
        if bam_info is None:
            return 1
        elif not bam_info.is_mitochondrial:
            ui.print_err("ERROR: BAM does not contain any known mitochondrial "
                         "sequence found in BAM ..")
            return 1

        reference = sequences[bam_info.mt_contig]
        stats, majority = majority_sequence(handle,
                                            padding=bam_info.mt_padding,
                                            contig_name=bam_info.mt_contig,
                                            contig_length=bam_info.mt_length)

        sequences["Sample"] = FASTA(name="Sample",
                                    meta=None,
                                    sequence=align_majority(reference.sequence,
                                                            majority))

        # Truncate all sequences to match the (now) unpadded sample sequence
        sequences = truncate_sequences(sequences, "Sample")
Exemple #25
0
def _process_samples(config):
    """Validate the BAM files of each sample and sort them by content.

    For every sample in `config.samples`, each path under "Files" is checked
    with `config.database.validate_bam`. The list under "Files" is then
    replaced by a dict with the keys "Nuc" and/or "Mito" naming the BAM used
    for the nuclear and the mitochondrial genome, respectively.

    Returns True if all samples were processed. Returns False, after printing
    an error, if a file is not a valid BAM, contains no usable contigs, or if
    a sample lists two nuclear or two mitochondrial BAMs.
    """
    for name, info in sorted(config.samples.items()):
        files = {}

        if name == "-":
            print_info("Validating unnamed sample ...")
        else:
            print_info("Validating sample %r ..." % (name, ))

        for filename in info.pop("Files"):
            filetype = config.database.validate_bam(filename)
            if not filetype:
                print_err("ERROR: File is not a valid BAM file: %r" %
                          (filename, ))
                return False

            if filetype.is_nuclear and filetype.is_mitochondrial:
                if "Nuc" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False
                elif "Mito" in files:
                    print_err("WARNING: Nuclear + mitochondrial BAM, and "
                              "mitochondrial BAM specified; the mitochondrial "
                              "genome in the first BAM will not be used!")

                files["Nuc"] = filename
                # A mitochondrial BAM recorded earlier takes precedence
                files.setdefault("Mito", filename)
            elif filetype.is_nuclear:
                if "Nuc" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False

                files["Nuc"] = filename
            elif filetype.is_mitochondrial:
                if "Mito" in files:
                    # This branch concerns mitochondrial, not nuclear, BAMs
                    print_err("ERROR: Two mitochondrial BAMs specified!")
                    return False

                files["Mito"] = filename
            else:
                print_err("ERROR: BAM does not contain usable nuclear "
                          "or mitochondrial contigs: %r" % (filename, ))
                return False

        config.samples[name]["Files"] = files

    return True
Exemple #26
0
    config.multisample = False
    config.tablefile = args[0]

    try:
        config.database = database.ZonkeyDB(config.tablefile)
    except database.ZonkeyDBError, error:
        print_err("ERROR reading database %r: %s" % (config.tablefile, error))
        return

    known_samples = set(config.database.samples) | set(("Sample", ))
    unknown_samples = set(config.treemix_outgroup) - known_samples
    if unknown_samples:
        print_err("ERROR: Argument --treemix-outgroup includes unknown "
                  "sample(s): %s; known samples are %s. Note that "
                  "names are case-sensitive." %
                  (", ".join(map(repr, sorted(unknown_samples))), ", ".join(
                      map(repr, sorted(known_samples)))))
        return

    if config.command in ("mito", "example"):
        if len(args) != 2:
            print_err("ERROR: Wrong number of arguments!")
            print_usage()
            return

        config.destination = args[1]
        config.samples = {}
    elif len(args) == 2:
        filename = args[1]
        config.destination = fileutils.swap_ext(filename, ".zonkey")
Exemple #27
0
        try:
            args.regions = collect_bed_regions(args.regions_fpath)
        except ValueError, error:
            print_err("ERROR: Failed to parse BED file %r:\n%s" %
                      (args.regions_fpath, error))
            return 1

    print_msg("Opening %r" % (args.infile, ))
    with pysam.Samfile(args.infile) as handle:
        sort_order = handle.header.get('HD', {}).get('SO')
        if sort_order is None:
            print_warn("WARNING: BAM file %r is not marked as sorted!" %
                       (args.infile, ))
        elif sort_order != 'coordinate':
            print_err("ERROR: BAM file %r is %s-sorted, but only "
                      "coordinate-sorted BAMs are supported!" %
                      (args.infile, sort_order))
            return 1

        sort_bed_by_bamfile(handle, args.regions)
        return process_func(handle, args)


def _get_readgroup(record):
    """Return the value of the record's "RG" tag, or None if it has none.

    A missing tag is signalled by `record.get_tag` raising KeyError.
    """
    try:
        readgroup = record.get_tag("RG")
    except KeyError:
        readgroup = None

    return readgroup


def _get_readgroup_ignored(_):
Exemple #28
0
        try:
            args.regions = collect_bed_regions(args.regions_fpath)
        except ValueError, error:
            print_err("ERROR: Failed to parse BED file %r:\n%s"
                      % (args.regions_fpath, error))
            return 1

    print_msg("Opening %r" % (args.infile,))
    with pysam.Samfile(args.infile) as handle:
        sort_order = handle.header.get('HD', {}).get('SO')
        if sort_order is None:
            print_warn("WARNING: BAM file %r is not marked as sorted!"
                       % (args.infile,))
        elif sort_order != 'coordinate':
            print_err("ERROR: BAM file %r is %s-sorted, but only "
                      "coordinate-sorted BAMs are supported!"
                      % (args.infile, sort_order))
            return 1

        sort_bed_by_bamfile(handle, args.regions)
        return process_func(handle, args)


def _get_readgroup(record):
    """Return the value of the record's "RG" tag, or None if it has none.

    A missing tag is signalled by `record.get_tag` raising KeyError.
    """
    try:
        readgroup = record.get_tag("RG")
    except KeyError:
        readgroup = None

    return readgroup


def _get_readgroup_ignored(_):
Exemple #29
0
    args = parse_args(argv)
    data = database.ZonkeyDB(args.database)
    sequences = data.mitochondria

    try:
        handle = pysam.Samfile(args.bam)
    except (IOError, ValueError), error:
        ui.print_err("Error reading BAM file: %s" % (error,))
        return 1

    with handle:
        bam_info = data.validate_bam_handle(handle)
        if bam_info is None:
            return 1
        elif not bam_info.is_mitochondrial:
            ui.print_err("ERROR: BAM does not contain any known mitochondrial "
                         "sequence found in BAM ..")
            return 1

        reference = sequences[bam_info.mt_contig]
        stats, majority = majority_sequence(handle,
                                            padding=bam_info.mt_padding,
                                            contig_name=bam_info.mt_contig,
                                            contig_length=bam_info.mt_length)

        sequences["Sample"] = FASTA(name="Sample",
                                    meta=None,
                                    sequence=align_majority(reference.sequence,
                                                            majority))

        # Truncate all sequences to match the (now) unpadded sample sequence
        sequences = truncate_sequences(sequences, "Sample")
Exemple #30
0
def _process_samples(config):
    """Validate the BAM files of each sample and sort them by content.

    For every sample in `config.samples`, each path under "Files" is checked
    with `config.database.validate_bam`. The list under "Files" is then
    replaced by a dict with the keys "Nuc" and/or "Mito" naming the BAM used
    for the nuclear and the mitochondrial genome, respectively.

    Returns True if all samples were processed. Returns False, after printing
    an error, if a file is not a valid BAM, contains no usable contigs, or if
    a sample lists two nuclear or two mitochondrial BAMs.
    """
    for name, info in sorted(config.samples.items()):
        files = {}

        if name == "-":
            print_info("Validating unnamed sample ...")
        else:
            print_info("Validating sample %r ..." % (name,))

        for filename in info.pop("Files"):
            filetype = config.database.validate_bam(filename)
            if not filetype:
                print_err("ERROR: File is not a valid BAM file: %r"
                          % (filename,))
                return False

            if filetype.is_nuclear and filetype.is_mitochondrial:
                if "Nuc" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False
                elif "Mito" in files:
                    print_err("WARNING: Nuclear + mitochondrial BAM, and "
                              "mitochondrial BAM specified; the mitochondrial "
                              "genome in the first BAM will not be used!")

                files["Nuc"] = filename
                # A mitochondrial BAM recorded earlier takes precedence
                files.setdefault("Mito", filename)
            elif filetype.is_nuclear:
                if "Nuc" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False

                files["Nuc"] = filename
            elif filetype.is_mitochondrial:
                if "Mito" in files:
                    # This branch concerns mitochondrial, not nuclear, BAMs
                    print_err("ERROR: Two mitochondrial BAMs specified!")
                    return False

                files["Mito"] = filename
            else:
                print_err("ERROR: BAM does not contain usable nuclear "
                          "or mitochondrial contigs: %r" % (filename,))
                return False

        config.samples[name]["Files"] = files

    return True
Exemple #31
0
    config.multisample = False
    config.tablefile = args[0]

    try:
        config.database = database.ZonkeyDB(config.tablefile)
    except database.ZonkeyDBError, error:
        print_err("ERROR reading database %r: %s"
                  % (config.tablefile, error))
        return

    known_samples = set(config.database.samples) | set(("Sample",))
    unknown_samples = set(config.treemix_outgroup) - known_samples
    if unknown_samples:
        print_err("ERROR: Argument --treemix-outgroup includes unknown "
                  "sample(s): %s; known samples are %s. Note that "
                  "names are case-sensitive."
                  % (", ".join(map(repr, sorted(unknown_samples))),
                     ", ".join(map(repr, sorted(known_samples)))))
        return

    if config.command in ("mito", "example"):
        if len(args) != 2:
            print_err("ERROR: Wrong number of arguments!")
            print_usage()
            return

        config.destination = args[1]
        config.samples = {}
    elif len(args) == 2:
        filename = args[1]
        config.destination = fileutils.swap_ext(filename, ".zonkey")