Esempio n. 1
0
def test_natsorted():
    """Check that ``hgbw.natsorted`` orders chromosome names as expected.

    Each case pairs an unsorted list of names with the order the sorter
    must produce: numbered chromosomes ascending, then x, y and finally
    the mitochondrial name (``m`` / ``chrMT``).
    """
    cases = (
        (
            ["2", "3", "4", "m", "x", "1", "y"],
            ["1", "2", "3", "4", "x", "y", "m"],
        ),
        (
            ["chr1", "chr4", "chr5", "chr2", "chr3", "chrMT", "chrY", "chrX"],
            ["chr1", "chr2", "chr3", "chr4", "chr5", "chrX", "chrY", "chrMT"],
        ),
    )

    for unsorted_names, wanted in cases:
        got = hgbw.natsorted(unsorted_names)
        message = "Sorted output was %s\nExpected: %s" % (got, wanted)
        assert got == wanted, message
Esempio n. 2
0
def alignment_tileset_info(samfile, chromsizes):
    """
    Get the tileset info for a bam file

    Parameters
    ----------
    samfile: object exposing `references` and `lengths`
        Only consulted when `chromsizes` is None. The chromosome names and
        lengths are then taken from it and put into natural order.
        (Presumably a pysam.AlignmentFile -- confirm against the caller.)
    chromsizes: mapping of chromosome name -> size, or None
        Anything with an `items()` method yielding (name, size) pairs,
        e.g. a pandas.Series or a dict. Its iteration order is used as
        the chromosome order.

    Returns
    -------
    tileset_info: {'min_pos': [0],
                    'max_pos': [total_length],
                    'max_width': tile_size * 2 ** max_zoom,
                    'tile_size': 256,
                    'chromsizes': [[name, size], ...],
                    'max_zoom': int,
                    'max_tile_width': 100000
                    }
    """
    if chromsizes is not None:
        # `items()` is available on both dicts and pandas Series;
        # `iteritems()` was removed in pandas 2.0.
        chromsizes_list = [
            [chrom, int(size)] for chrom, size in chromsizes.items()
        ]
        total_length = sum(size for _, size in chromsizes_list)
    else:
        total_length = sum(samfile.lengths)

        references = np.array(samfile.references)
        lengths = np.array(samfile.lengths)

        # remember each reference's length before reordering the names
        ref_lengths = dict(zip(references, lengths))
        references = ctbw.natsorted(references)

        chromsizes_list = [(ref, int(ref_lengths[ref])) for ref in references]

    tile_size = 256

    if total_length > 0:
        # log2 is exact for powers of two, whereas log(x) / log(2) can land
        # slightly above the integer and add a spurious zoom level after
        # ceil(). Clamp at 0 so genomes shorter than one tile still get a
        # valid (non-negative) zoom level.
        max_zoom = max(0, math.ceil(math.log2(total_length / tile_size)))
    else:
        # nothing to tile; log2(0) would raise
        max_zoom = 0

    # this should eventually be a configurable option
    MAX_TILE_WIDTH = 100000

    tileset_info = {
        "min_pos": [0],
        "max_pos": [total_length],
        "max_width": tile_size * 2**max_zoom,
        "tile_size": tile_size,
        "chromsizes": chromsizes_list,
        "max_zoom": max_zoom,
        "max_tile_width": MAX_TILE_WIDTH,
    }

    return tileset_info
Esempio n. 3
0
def test_natsorted():
    """Verify the chromosome ordering produced by ``hgbi.natsorted``.

    Covers bare names and ``chr``-prefixed names; in both cases the
    numbered entries come first, followed by x, y and the mitochondrial
    entry.
    """
    bare_in = ['2', '3', '4', 'm', 'x', '1', 'y']
    bare_out = ['1', '2', '3', '4', 'x', 'y', 'm']

    prefixed_in = [
        'chr1', 'chr4', 'chr5', 'chr2', 'chr3', 'chrMT', 'chrY', 'chrX'
    ]
    prefixed_out = [
        'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chrX', 'chrY', 'chrMT'
    ]

    for names, expected in ((bare_in, bare_out), (prefixed_in, prefixed_out)):
        result = hgbi.natsorted(names)
        failure = 'Sorted output was %s\nExpected: %s' % (result, expected)
        assert result == expected, failure
Esempio n. 4
0
def load_reads(samfile,
               start_pos,
               end_pos,
               chromsizes=None,
               index_filename=None,
               cache=None):
    """
    Load the reads overlapping a region given in absolute (concatenated
    chromosome) coordinates. Returns a dictionary of parallel lists with
    one entry per read.

    Parameters:
    -----------
    samfile: pysam.AlignmentFile
        A pysam entry into an indexed bam file
    start_pos: int
        The start position of the sampled region
    end_pos: int
        The end position of the sampled region
    chromsizes: pandas.Series
        A listing of chromosome sizes (name -> size); anything with an
        `items()` method works. If not provided, the chromosome list will
        be extracted from the bam file header and naturally sorted.
    index_filename: str
        Passed to `load_bai_index` to load the index used for estimating
        how much data the query would return
    cache:
        An object that implements the `get`, `set` and `exists` methods
        for caching data. Only consulted for reads whose query length
        exceeds 40000.

    Returns
    -------
    results: dict
        Either `{"error": message}` when the estimated amount of data
        exceeds the limit, or a dictionary whose values are lists holding
        one entry per mapped read: "id", "from", "to", "md", "chrName",
        "chrOffset", "cigar", "mapq", "tags.HP", "strand", "variants" and
        "cigars". The "m1From", "m1To", "m2From" and "m2To" lists are
        present but are not filled in by this function.
    """
    if chromsizes is not None:
        # `items()` is available on both dicts and pandas Series;
        # `iteritems()` was removed in pandas 2.0.
        chromsizes_list = [
            [chrom, int(size)] for chrom, size in chromsizes.items()
        ]
    else:
        references = np.array(samfile.references)
        ref_lengths = dict(zip(references, samfile.lengths))

        # we're going to create a natural ordering for references
        # e.g. (chr1, chr2,..., chr10, chr11...chr22,chrX, chrY, chrM...)
        references = ctbw.natsorted(references)

        # look the lengths up by name: after sorting, the header order of
        # the lengths no longer matches the order of the names
        chromsizes_list = [(ref, int(ref_lengths[ref])) for ref in references]

    lengths = [size for _, size in chromsizes_list]
    abs_chrom_offsets = np.r_[0, np.cumsum(lengths)]

    results = {
        "id": [],
        "from": [],
        "to": [],
        "md": [],
        "chrName": [],
        "chrOffset": [],
        "cigar": [],
        "m1From": [],
        "m1To": [],
        "m2From": [],
        "m2To": [],
        "mapq": [],
        "tags.HP": [],
        "strand": [],
        "variants": [],
        "cigars": [],
    }

    strands = {True: "-", False: "+"}

    # NOTE(review): `idx` is indexed below with the chromosome's position in
    # `chromsizes_list` -- confirm that `load_bai_index` returns its entries
    # in that same order.
    idx = load_bai_index(index_filename)

    total_size = 0
    # check the size of the file to load to get an approximation
    # of whether we're going to return too much data
    for cid, start, end in abs2genomic(lengths, start_pos, end_pos):
        total_size += est_query_size_ix(idx[cid], start, end)

    MAX_SIZE = 4e6
    if total_size > MAX_SIZE:
        return {"error": f"Tile encompasses too much data: {total_size}"}

    for cid, start, end in abs2genomic(lengths, start_pos, end_pos):
        # skip segments that fall past the last known chromosome (checked
        # before the offset lookup so it cannot index out of range)
        if cid >= len(chromsizes_list):
            continue

        chr_offset = int(abs_chrom_offsets[cid])
        seq_name = f"{chromsizes_list[cid][0]}"

        for read in samfile.fetch(seq_name, start, end):
            if read.is_unmapped:
                continue

            # mates share a query name, so tag them to keep ids distinct
            id_suffix = ""
            if read.is_paired:
                if read.is_read1:
                    id_suffix = "_1"
                if read.is_read2:
                    id_suffix = "_2"

            read_id = read.query_name + id_suffix
            results["id"].append(read_id)
            results["from"].append(int(read.reference_start + chr_offset))
            results["to"].append(int(read.reference_end + chr_offset))
            results["chrName"].append(read.reference_name)
            results["chrOffset"].append(chr_offset)
            results["cigar"].append(read.cigarstring)
            results["mapq"].append(read.mapq)

            # For ONT reads retrieving the variants can be a lengthy
            # procedure. We can try to cache them
            use_cache = read.query_length > 40000
            variants = (get_cached_variants(cache, read_id)
                        if use_cache else None)

            if not variants:
                if read.query_sequence:
                    try:
                        # with_seq=True yields lowercase reference bases at
                        # mismatching positions
                        variants = [
                            (r[0], r[1], read.query_sequence[r[0]])
                            for r in read.get_aligned_pairs(
                                with_seq=True, matches_only=True)
                            if start <= r[1] <= end and r[2] is not None
                            and r[2].islower()
                        ]
                    except ValueError:
                        # Probably MD tag not present
                        variants = []

                    if use_cache:
                        set_cached_variants(cache, read_id, variants)
                else:
                    # no sequence stored for this read
                    variants = []

            # always append exactly one entry so that "variants" stays
            # aligned with the other per-read lists
            results["variants"].append(variants)

            results["cigars"].append(get_cigar_substitutions(read))
            tags = dict(read.tags)
            results["tags.HP"].append(tags.get("HP", 0))
            results["strand"].append(strands[read.is_reverse])

            try:
                md_tag = read.get_tag("MD")
            except KeyError:
                md_tag = ""
            results["md"].append(md_tag)

    return results
Esempio n. 5
0
def load_reads(samfile, start_pos, end_pos, chrom_order=None):
    """
    Load the reads overlapping a region given in absolute (concatenated
    chromosome) coordinates. Returns a dictionary of parallel lists with
    one entry per read.

    Parameters:
    -----------
    samfile: pysam.AlignmentFile
        A pysam entry into an indexed bam file
    start_pos: int
        The start position of the sampled region
    end_pos: int
        The end position of the sampled region
    chrom_order: ['chr1', 'chr2',...]
        A listing of chromosome names to use as the order. Names that are
        not present in the bam header are ignored. If not provided (or
        empty), the header's chromosomes are used in natural order.

    Returns
    -------
    results: dict
        A dictionary whose values are lists holding one entry per mapped
        read: "id", "from", "to", "md", "chrName", "chrOffset" and
        "cigar".
    """
    references = np.array(samfile.references)
    lengths = np.array(samfile.lengths)

    ref_lengths = dict(zip(references, lengths))

    if chrom_order is not None and len(chrom_order) > 0:
        # Use the caller's order. Names, lengths and offsets are all derived
        # from this one list so that the chromosome index returned by
        # `abs2genomic` refers to the same chromosome in each of them.
        references = [name for name in chrom_order if name in ref_lengths]
    else:
        # we're going to create a natural ordering for references
        # e.g. (chr1, chr2,..., chr10, chr11...chr22,chrX, chrY, chrM...)
        references = ctbw.natsorted(references)

    lengths = [ref_lengths[r] for r in references]
    abs_chrom_offsets = np.r_[0, np.cumsum(lengths)]

    results = {
        "id": [],
        "from": [],
        "to": [],
        "md": [],
        "chrName": [],
        "chrOffset": [],
        "cigar": [],
    }

    for cid, start, end in abs2genomic(lengths, start_pos, end_pos):
        # skip segments that fall past the last known chromosome (checked
        # before the offset lookup so it cannot index out of range)
        if cid >= len(references):
            continue

        chr_offset = int(abs_chrom_offsets[cid])
        seq_name = f"{references[cid]}"

        for read in samfile.fetch(seq_name, start, end):
            if read.is_unmapped:
                continue

            results["id"].append(read.query_name)
            results["from"].append(int(read.reference_start + chr_offset))
            results["to"].append(int(read.reference_end + chr_offset))
            results["chrName"].append(read.reference_name)
            results["chrOffset"].append(chr_offset)
            results["cigar"].append(read.cigarstring)

            try:
                md_tag = read.get_tag("MD")
            except KeyError:
                # keep "md" aligned with the other lists when the tag is
                # missing
                md_tag = ""
            results["md"].append(md_tag)

    return results