Example #1
0
def test_non_existing_read_name2():
    """Index a ReadSet with a key whose first component was never added.

    The only read is created with third constructor argument 1, while the
    lookup key uses 2 together with the existing name 'Read A'.
    """
    reads = ReadSet()
    read = Read('Read A', 56, 1)
    for position, allele, quality in ((100, 1, 37), (101, 0, 18)):
        read.add_variant(position, allele, quality)
    reads.add(read)
    # NOTE(review): presumably this lookup is expected to fail — confirm how
    # the expectation is declared (no decorator/context manager visible here).
    reads[(2, 'Read A')]
Example #2
0
def test_non_existing_read_name():
    """Index a ReadSet with a read name ('foo') that was never added."""
    reads = ReadSet()
    read = Read('Read A', 56)
    for position, allele, quality in ((100, 1, 37), (101, 0, 18)):
        read.add_variant(position, allele, quality)
    reads.add(read)
    # NOTE(review): presumably this lookup is expected to fail — confirm how
    # the expectation is declared (no decorator/context manager visible here).
    reads[(0, 'foo')]
Example #3
0
def string_to_readset(s, n_alleles, w=None, sample_ids=None):
    """Build a ReadSet from a text matrix with one read per line.

    s -- multi-line string; every non-blank character is an allele digit and
         its column (1-based) times 10 becomes the variant position
    n_alleles -- length of the per-variant quality list
    w -- optional multi-line string of single-digit weights laid out like s;
         without it every weight is 1
    sample_ids -- optional sequence indexed by line number; when given, each
                  read is created with an extra 0 and its sample id

    The quality of a variant is a list holding the weight for every allele
    except the observed one, which gets 0. The ReadSet is printed and returned.
    """
    rows = textwrap.dedent(s).strip().split('\n')
    weight_rows = None if w is None else textwrap.dedent(w).strip().split('\n')
    readset = ReadSet()
    for index, row in enumerate(rows):
        if not row:
            continue
        name = 'Read {}'.format(index + 1)
        if sample_ids is None:
            read = Read(name, 50)
        else:
            read = Read(name, 50, 0, sample_ids[index])
        for column, char in enumerate(row):
            if char == ' ':
                continue
            weight = 1 if weight_rows is None else int(weight_rows[index][column])
            quality = [weight] * n_alleles
            allele = int(char)
            quality[allele] = 0
            read.add_variant(position=(column + 1) * 10,
                             allele=allele,
                             quality=quality)
        assert len(read) > 1, 'Reads covering less than two variants are not allowed'
        readset.add(read)
    print(readset)
    return readset
Example #4
0
def matrix_to_readset(lines):
    """Build a ReadSet from lines in matrix format.

    Each line is whitespace-separated: a 1-based running read index followed
    by (offset, allele-string) pairs. Every character of an allele string
    becomes a variant at position (offset + column) * 10 with quality 1.
    The resulting ReadSet is printed and returned.
    """
    readset = ReadSet()
    for expected_index, line in enumerate(lines, start=1):
        fields = line.split()
        assert len(fields) % 2 == 1, "Not in matrix format."

        index = int(fields[0])
        # Read indices must count up from 1 without gaps.
        assert index == expected_index, "Not in matrix format."

        read = Read("Read {}".format(index), 50)
        for pair in range(len(fields) // 2):
            offset = int(fields[2 * pair + 1])
            alleles = fields[2 * pair + 2]
            for column, char in enumerate(alleles):
                read.add_variant(position=(offset + column) * 10,
                                 allele=int(char),
                                 quality=1)
        readset.add(read)

    print(readset)
    return readset
Example #5
0
def test_readset2():
    """Reads sharing a name stay distinguishable through their third constructor argument."""
    entries = (('Read A', 1, 23), ('Read A', 2, 70), ('Read B', 3, 23))
    readset = ReadSet()
    for name, mapq, source in entries:
        readset.add(Read(name, mapq, source))
    # Lookup key is (third constructor argument, name).
    for name, mapq, source in entries:
        assert readset[(source, name)].mapqs == (mapq, )
Example #6
0
def test_non_existing_read_name():
    """Looking up a read name that was never added raises KeyError."""
    reads = ReadSet()
    read = Read("Read A", 56)
    for position, allele, quality in ((100, 1, 37), (101, 0, 18)):
        read.add_variant(position, allele, quality)
    reads.add(read)
    missing_key = (0, "foo")
    with raises(KeyError):
        _ = reads[missing_key]
Example #7
0
def test_readset2():
    """Reads with equal names but different third constructor arguments are kept apart."""
    readset = ReadSet()
    # key (third constructor argument, name) -> mapping quality given to Read
    expected_mapq = {
        (23, "Read A"): 1,
        (70, "Read A"): 2,
        (23, "Read B"): 3,
    }
    for (source, name), mapq in expected_mapq.items():
        readset.add(Read(name, mapq, source))
    for key, mapq in expected_mapq.items():
        assert readset[key].mapqs == (mapq, )
Example #8
0
def test_non_existing_read_name2():
    """An existing name combined with an unknown first key component raises KeyError."""
    reads = ReadSet()
    read = Read('Read A', 56, 1)
    for position, allele, quality in ((100, 1, 37), (101, 0, 18)):
        read.add_variant(position, allele, quality)
    reads.add(read)
    # The read was created with 1 as third argument; the key asks for 2.
    missing_key = (2, 'Read A')
    with raises(KeyError):
        _ = reads[missing_key]
Example #9
0
def merge_readsets(readsets) -> ReadSet:
    """Collect the reads of every readset in the mapping into one sorted ReadSet.

    readsets -- mapping whose values are iterables of reads; the keys are not used

    Every read must already be sorted (checked by assertion).
    """
    merged = ReadSet()
    for _sample, sample_reads in readsets.items():
        for read in sample_reads:
            assert read.is_sorted(), "Add a read.sort() here"
            merged.add(read)
    merged.sort()
    return merged
Example #10
0
def draw_plots(
    block_readsets,
    clustering,
    threading,
    haplotypes,
    cut_positions,
    genotype_list_multi,
    phasable_variant_table,
    plot_clusters,
    plot_threading,
    output,
):
    """Write the requested diagnostic plots for a phasing result.

    The reads of all block readsets are pooled into one ReadSet first.
    With plot_clusters set, draw_clustering writes to output + ".clusters.pdf";
    with plot_threading set, draw_threading writes to output + ".threading.pdf".
    """
    logger.info("Generating plots ...")

    # Pool the per-block reads into one readset for plotting.
    all_reads = ReadSet()
    for readset in block_readsets:
        for read in readset:
            all_reads.add(read)

    if plot_clusters:
        draw_clustering(
            all_reads,
            clustering,
            phasable_variant_table,
            output + ".clusters.pdf",
            genome_space=False,
        )

    if plot_threading:
        index, _rev_index = get_position_map(all_reads)
        coverage = get_coverage(all_reads, clustering, index)
        draw_threading(
            all_reads,
            clustering,
            coverage,
            threading,
            cut_positions,
            haplotypes,
            phasable_variant_table,
            genotype_list_multi,
            output + ".threading.pdf",
        )
Example #11
0
def string_to_readset(s,
                      w=None,
                      sample_ids=None,
                      source_id=0,
                      scale_quality=None):
    """Build a ReadSet from a text matrix with one read per line.

    s -- multi-line string; every non-blank character is an allele digit and
         its column (1-based) times 10 becomes the variant position
    w -- optional multi-line string of single-digit weights laid out like s;
         without it every weight is 1
    sample_ids -- optional sequence indexed by line number; when given, the
                  matching entry is passed to Read as fourth argument
    source_id -- passed to every Read as third argument
    scale_quality -- optional factor each weight is multiplied with

    The resulting ReadSet is printed and returned. Every read must cover at
    least two variants (checked by assertion).
    """
    s = textwrap.dedent(s).strip()
    if w is not None:
        w = textwrap.dedent(w).strip().split("\n")
    rs = ReadSet()
    for index, line in enumerate(s.split("\n")):
        if not line:
            continue
        name = "Read {}".format(index + 1)
        if sample_ids is None:
            read = Read(name, 50, source_id)
        else:
            read = Read(name, 50, source_id, sample_ids[index])
        for pos, c in enumerate(line):
            if c == " ":
                continue
            q = 1 if w is None else int(w[index][pos])
            # Identity check: "== None" would invoke a custom __eq__.
            if scale_quality is not None:
                q = q * scale_quality
            read.add_variant(position=(pos + 1) * 10, allele=int(c), quality=q)
        assert len(
            read) > 1, "Reads covering less than two variants are not allowed"
        rs.add(read)
    print(rs)
    return rs
Example #12
0
def test_readset():
    """Exercise ReadSet insertion order, sorting, positions and keyed lookup."""
    rs = ReadSet()

    read_a = Read('Read A', 56)
    read_a.add_variant(100, 1, 37)
    read_a.add_variant(101, 0, 18)
    rs.add(read_a)

    read_b = Read('Read B', 0)
    read_b.add_variant(101, 0, 23)
    rs.add(read_b)

    read_c = Read('Read C', 17)
    read_c.add_variant(99, 1, 27)
    read_c.add_variant(80, 1, 17)
    # Replace the second variant (position 80) through item assignment.
    read_c[1] = Variant(position=105, allele=0, quality=14)
    rs.add(read_c)

    # Before sorting, reads come back in insertion order.
    for slot, name in enumerate(('Read A', 'Read B', 'Read C')):
        assert rs[slot].name == name

    rs.sort()

    # should be sorted after finalization
    for slot, name in enumerate(('Read C', 'Read A', 'Read B')):
        assert rs[slot].name == name

    assert len(rs) == 3

    assert rs.get_positions() == [99, 100, 101, 105]

    # Lookup by (0, name) key.
    found = rs[(0, 'Read A')]
    assert found.name == 'Read A'
    assert found.mapqs == (56, ), str(found.mapqs)

    found = rs[(0, 'Read B')]
    assert found.name == 'Read B'
    assert found.mapqs == (0, )

    found = rs[(0, 'Read C')]
    assert found.name == 'Read C'
    assert found.mapqs == (17, )
    assert len(found) == 2
    assert found[0] == Variant(position=99, allele=1, quality=27)
    assert found[1] == Variant(position=105, allele=0, quality=14)
Example #13
0
def phase_single_individual(readset, phasable_variant_table, sample,
                            phasing_param, output, timers):
    """
    Phase one sample block by block and summarize the result.

    Steps, in order: build the genotype list, compute block starts (depending
    on phasing_param.block_cut_sensitivity), split the readset into blocks,
    phase every block with phase_single_block, aggregate the blockwise
    results, and finally derive component maps and superreads.

    readset -- reads of the sample; must provide get_positions() and len()
    phasable_variant_table -- passed to create_genotype_list and draw_plots
    sample -- passed to create_genotype_list
    phasing_param -- object read for block_cut_sensitivity, ploidy,
                     plot_clusters and plot_threading
    output -- passed through to draw_plots
    timers -- object with start(name) / stop(name)

    Returns a tuple (components, haploid_components, superreads):
    components maps a position (and position + 1) to the first position of
    its cut interval; haploid_components maps the same keys to a list with
    one entry per haplotype; superreads is a ReadSet with one Read per
    haplotype.
    """

    # Compute the genotypes that belong to the variant table and create a list of all genotypes
    genotype_list = create_genotype_list(phasable_variant_table, sample)

    # Select reads, only for debug
    # selected_reads = select_reads(readset, 120, preferred_source_ids = vcf_source_ids)
    # readset = selected_reads

    # Precompute block borders based on read coverage and linkage between variants
    logger.info("Detecting connected components with weak interconnect ..")
    timers.start("detecting_blocks")
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)
    # Sensitivity 0: a single block; 1: single-linkage block starts;
    # anything else: block starts computed with single_linkage=False.
    if phasing_param.block_cut_sensitivity == 0:
        block_starts = [0]
    elif phasing_param.block_cut_sensitivity == 1:
        block_starts = compute_linkage_based_block_starts(readset,
                                                          index,
                                                          phasing_param.ploidy,
                                                          single_linkage=True)
    else:
        block_starts = compute_linkage_based_block_starts(readset,
                                                          index,
                                                          phasing_param.ploidy,
                                                          single_linkage=False)

    # Set block borders and split readset
    # ext_block_starts carries num_vars as sentinel so that block i spans
    # ext_block_starts[i] .. ext_block_starts[i + 1] (exclusive).
    ext_block_starts = block_starts + [num_vars]
    # A block is non-singleton when it spans at least two variants.
    num_non_singleton_blocks = len([
        i for i in range(len(block_starts))
        if ext_block_starts[i] < ext_block_starts[i + 1] - 1
    ])
    logger.info(
        "Split heterozygous variants into {} blocks (and {} singleton blocks)."
        .format(num_non_singleton_blocks,
                len(block_starts) - num_non_singleton_blocks))

    block_readsets = split_readset(readset, ext_block_starts, index)
    timers.stop("detecting_blocks")

    # Process blocks independently
    (
        blockwise_clustering,
        blockwise_paths,
        blockwise_haplotypes,
        blockwise_cut_positions,
        blockwise_haploid_cuts,
    ) = ([], [], [], [], [])
    processed_non_singleton_blocks = 0
    for block_id, block_readset in enumerate(block_readsets):
        block_start = ext_block_starts[block_id]
        block_end = ext_block_starts[block_id + 1]
        block_num_vars = block_end - block_start

        # Sanity check: the split readset covers exactly the block's variants.
        assert len(block_readset.get_positions()) == block_num_vars

        if block_num_vars > 1:
            # Only print for non-singleton block
            processed_non_singleton_blocks += 1
            logger.info(
                "Processing block {} of {} with {} reads and {} variants.".
                format(
                    processed_non_singleton_blocks,
                    num_non_singleton_blocks,
                    len(block_readset),
                    block_num_vars,
                ))

        # Singleton blocks are phased as well; only the log line is skipped.
        genotype_slice = genotype_list[block_start:block_end]
        clustering, path, haplotypes, cut_positions, haploid_cuts = phase_single_block(
            block_readset, genotype_slice, phasing_param, timers)

        blockwise_clustering.append(clustering)
        blockwise_paths.append(path)
        blockwise_haplotypes.append(haplotypes)
        blockwise_cut_positions.append(cut_positions)
        blockwise_haploid_cuts.append(haploid_cuts)

    # Aggregate blockwise results
    clustering, threading, haplotypes, cut_positions, haploid_cuts = aggregate_phasing_blocks(
        block_starts,
        block_readsets,
        blockwise_clustering,
        blockwise_paths,
        blockwise_haplotypes,
        blockwise_cut_positions,
        blockwise_haploid_cuts,
        phasing_param,
    )

    # Summarize data for VCF file
    accessible_positions = sorted(readset.get_positions())
    components = {}
    haploid_components = {}

    # Every variant between two consecutive cuts is assigned the position of
    # the first variant of that interval; num_vars closes the last interval.
    ext_cuts = cut_positions + [num_vars]
    for i, cut_pos in enumerate(cut_positions):
        for pos in range(ext_cuts[i], ext_cuts[i + 1]):
            components[accessible_positions[pos]] = accessible_positions[
                ext_cuts[i]]
            # NOTE(review): position + 1 is registered as well; the reason is
            # not visible in this function — confirm against the consumer.
            components[accessible_positions[pos] +
                       1] = accessible_positions[ext_cuts[i]]
            haploid_components[
                accessible_positions[pos]] = [0] * phasing_param.ploidy
            haploid_components[accessible_positions[pos] +
                               1] = [0] * phasing_param.ploidy

    # Same interval assignment, but per haplotype j using its own cut list.
    for j in range(phasing_param.ploidy):
        ext_cuts = haploid_cuts[j] + [num_vars]
        for i, cut_pos in enumerate(haploid_cuts[j]):
            for pos in range(ext_cuts[i], ext_cuts[i + 1]):
                haploid_components[accessible_positions[pos]][
                    j] = accessible_positions[ext_cuts[i]]
                haploid_components[accessible_positions[pos] +
                                   1][j] = accessible_positions[ext_cuts[i]]

    # One superread per haplotype, carrying the phased alleles.
    superreads = ReadSet()
    for i in range(phasing_param.ploidy):
        read = Read("superread {}".format(i + 1), 0, 0)
        # insert alleles
        for j, allele in enumerate(haplotypes[i]):
            # "n" entries are skipped, i.e. produce no variant on the superread.
            if allele == "n":
                continue
            allele = int(allele)
            # TODO: Needs changes for multi-allelic and we might give an actual quality value
            read.add_variant(accessible_positions[j], allele, 0)
        superreads.add(read)

    # Plot option
    if phasing_param.plot_clusters or phasing_param.plot_threading:
        timers.start("create_plots")
        # NOTE(review): phasing_param is passed as a single argument here;
        # confirm this matches the draw_plots signature used by this module.
        draw_plots(
            block_readsets,
            clustering,
            threading,
            haplotypes,
            cut_positions,
            genotype_list,
            phasable_variant_table,
            phasing_param,
            output,
        )
        timers.stop("create_plots")

    # Return results
    return components, haploid_components, superreads
Example #14
0
def phase_single_individual(readset, phasable_variant_table, sample, phasing_param, output, timers):
    """
    Phase one sample block by block and summarize the result.

    Steps, in order: build the genotype list, compute block starts (depending
    on phasing_param.block_cut_sensitivity), split the readset into blocks,
    phase every block (sequentially when phasing_param.threads == 1, otherwise
    through a process pool), aggregate the blockwise results, and finally
    derive component maps and superreads.

    readset -- reads of the sample; must provide get_positions() and len()
    phasable_variant_table -- passed to create_genotype_list and draw_plots
    sample -- passed to create_genotype_list
    phasing_param -- object read for block_cut_sensitivity, ploidy, threads,
                     plot_clusters and plot_threading
    output -- passed through to draw_plots
    timers -- object with start(name) / stop(name)

    Returns a tuple (components, haploid_components, superreads):
    components maps a position (and position + 1) to the first position of
    its cut interval; haploid_components maps the same keys to a list with
    one entry per haplotype; superreads is a ReadSet with one Read per
    haplotype.
    """

    # Compute the genotypes that belong to the variant table and create a list of all genotypes
    genotype_list = create_genotype_list(phasable_variant_table, sample)

    # Select reads, only for debug
    # selected_reads = select_reads(readset, 120, preferred_source_ids = vcf_source_ids)
    # readset = selected_reads

    # Precompute block borders based on read coverage and linkage between variants
    logger.info("Detecting connected components with weak interconnect ..")
    timers.start("detecting_blocks")
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)
    # Sensitivity 0: a single block; 1: single-linkage block starts;
    # anything else: block starts computed with single_linkage=False.
    if phasing_param.block_cut_sensitivity == 0:
        block_starts = [0]
    elif phasing_param.block_cut_sensitivity == 1:
        block_starts = compute_linkage_based_block_starts(
            readset, index, phasing_param.ploidy, single_linkage=True
        )
    else:
        block_starts = compute_linkage_based_block_starts(
            readset, index, phasing_param.ploidy, single_linkage=False
        )

    # Set block borders and split readset
    # ext_block_starts carries num_vars as sentinel so that block i spans
    # ext_block_starts[i] .. ext_block_starts[i + 1] (exclusive).
    ext_block_starts = block_starts + [num_vars]
    # A block is non-singleton when it spans at least two variants.
    num_non_singleton_blocks = len(
        [i for i in range(len(block_starts)) if ext_block_starts[i] < ext_block_starts[i + 1] - 1]
    )
    logger.info(
        "Split heterozygous variants into {} blocks (and {} singleton blocks).".format(
            num_non_singleton_blocks, len(block_starts) - num_non_singleton_blocks
        )
    )

    block_readsets = split_readset(readset, ext_block_starts, index)
    timers.stop("detecting_blocks")

    # Process blocks independently
    (
        blockwise_clustering,
        blockwise_paths,
        blockwise_haplotypes,
        blockwise_cut_positions,
        blockwise_haploid_cuts,
    ) = ([], [], [], [], [])

    # Create genotype slices for blocks
    genotype_slices = []
    for block_id, block_readset in enumerate(block_readsets):
        block_start = ext_block_starts[block_id]
        block_end = ext_block_starts[block_id + 1]
        block_num_vars = block_end - block_start

        # Sanity check: the split readset covers exactly the block's variants.
        assert len(block_readset.get_positions()) == block_num_vars
        genotype_slices.append(genotype_list[block_start:block_end])

    processed_non_singleton_blocks = 0
    # use process pool for multiple threads
    if phasing_param.threads == 1:
        # for single-threading, process everything individually to minimize memory footprint
        for block_id, block_readset in enumerate(block_readsets):
            block_num_vars = ext_block_starts[block_id + 1] - ext_block_starts[block_id]
            if block_num_vars > 1:
                # Only print for non-singleton block
                processed_non_singleton_blocks += 1
                logger.info(
                    "Processing block {} of {} with {} reads and {} variants.".format(
                        processed_non_singleton_blocks,
                        num_non_singleton_blocks,
                        len(block_readset),
                        block_num_vars,
                    )
                )

            # Singleton blocks are phased as well; only the log line is skipped.
            clustering, path, haplotypes, cut_positions, haploid_cuts = phase_single_block(
                block_readset, genotype_slices[block_id], phasing_param, timers
            )

            blockwise_clustering.append(clustering)
            blockwise_paths.append(path)
            blockwise_haplotypes.append(haplotypes)
            blockwise_cut_positions.append(cut_positions)
            blockwise_haploid_cuts.append(haploid_cuts)

    else:
        # sort block readsets in descending order by number of reads
        # joblist holds (block index, number of reads) pairs.
        joblist = [(i, len(block_readsets[i])) for i in range(len(block_readsets))]
        joblist.sort(key=lambda x: -x[1])

        timers.start("phase_blocks")

        # process large jobs first, 4/3-approximation for scheduling problem
        with Pool(processes=phasing_param.threads) as pool:
            # The following bare string is an expression statement used as a
            # block comment; it has no runtime effect.
            """
            TODO: Python's multiprocessing makes hard copies of the passed
            arguments, which is not trivial for cython objects, especially when they
            contain pointers to other cython objects. Any passed object must be
            (de)serializable (in Python: pickle).
            All other objects created in the main thread are also accessible by the
            workers, but they are handled via the copy-on-write policy. This means,
            that e.g. the large main readset is not hardcopied for every thread,
            as long as it is not modified there. Since this would cause a massive
            waste of memory, this must not be done and the main readset must
            also never be passed as argument to the workers.
            """
            # Note: the name block_readset in the loop header below is bound to
            # the second joblist element, i.e. the read count, and is unused;
            # the actual readset is fetched through block_readsets[block_id].
            process_results = [
                pool.apply_async(
                    phase_single_block_mt,
                    (
                        block_readsets[block_id],
                        genotype_slices[block_id],
                        phasing_param,
                        timers,
                        block_id,
                        job_id,
                        num_non_singleton_blocks,
                    ),
                )
                for job_id, (block_id, block_readset) in enumerate(joblist)
            ]
            # Block until every job has finished.
            blockwise_results = [res.get() for res in process_results]

            # reorder results again
            # The last element of each result tuple is the block id (see the
            # unpacking below), so this restores the original block order.
            blockwise_results.sort(key=lambda x: x[-1])

            # collect all blockwise results
            for (
                clustering,
                path,
                haplotypes,
                cut_positions,
                haploid_cuts,
                block_id,
            ) in blockwise_results:
                blockwise_clustering.append(clustering)
                blockwise_paths.append(path)
                blockwise_haplotypes.append(haplotypes)
                blockwise_cut_positions.append(cut_positions)
                blockwise_haploid_cuts.append(haploid_cuts)

        timers.stop("phase_blocks")

    # Aggregate blockwise results
    clustering, threading, haplotypes, cut_positions, haploid_cuts = aggregate_phasing_blocks(
        block_starts,
        block_readsets,
        blockwise_clustering,
        blockwise_paths,
        blockwise_haplotypes,
        blockwise_cut_positions,
        blockwise_haploid_cuts,
        phasing_param,
    )

    # Summarize data for VCF file
    accessible_positions = sorted(readset.get_positions())
    components = {}
    haploid_components = {}

    # Every variant between two consecutive cuts is assigned the position of
    # the first variant of that interval; num_vars closes the last interval.
    ext_cuts = cut_positions + [num_vars]
    for i, cut_pos in enumerate(cut_positions):
        for pos in range(ext_cuts[i], ext_cuts[i + 1]):
            components[accessible_positions[pos]] = accessible_positions[ext_cuts[i]]
            # NOTE(review): position + 1 is registered as well; the reason is
            # not visible in this function — confirm against the consumer.
            components[accessible_positions[pos] + 1] = accessible_positions[ext_cuts[i]]
            haploid_components[accessible_positions[pos]] = [0] * phasing_param.ploidy
            haploid_components[accessible_positions[pos] + 1] = [0] * phasing_param.ploidy

    # Same interval assignment, but per haplotype j using its own cut list.
    for j in range(phasing_param.ploidy):
        ext_cuts = haploid_cuts[j] + [num_vars]
        for i, cut_pos in enumerate(haploid_cuts[j]):
            for pos in range(ext_cuts[i], ext_cuts[i + 1]):
                haploid_components[accessible_positions[pos]][j] = accessible_positions[ext_cuts[i]]
                haploid_components[accessible_positions[pos] + 1][j] = accessible_positions[
                    ext_cuts[i]
                ]

    # One superread per haplotype, carrying the phased alleles.
    superreads = ReadSet()
    for i in range(phasing_param.ploidy):
        read = Read("superread {}".format(i + 1), 0, 0)
        # insert alleles
        for j, allele in enumerate(haplotypes[i]):
            # "n" entries are skipped, i.e. produce no variant on the superread.
            if allele == "n":
                continue
            allele = int(allele)
            # TODO: Needs changes for multi-allelic and we might give an actual quality value
            read.add_variant(accessible_positions[j], allele, 0)
        superreads.add(read)

    # Plot option
    if phasing_param.plot_clusters or phasing_param.plot_threading:
        timers.start("create_plots")
        draw_plots(
            block_readsets,
            clustering,
            threading,
            haplotypes,
            cut_positions,
            genotype_list,
            phasable_variant_table,
            phasing_param.plot_clusters,
            phasing_param.plot_threading,
            output,
        )
        timers.stop("create_plots")

    # Return results
    return components, haploid_components, superreads
Example #15
0
    def read(self, chromosome, variants, sample, *, read_vcf=True, regions=None):
        """
        Load the reads for one sample on one chromosome.

        Returns the pair (readset, vcf_source_ids): readset is a sorted
        ReadSet, vcf_source_ids the set of source ids assigned to reads that
        were created from phased VCF blocks.

        Pass read_vcf=False to skip the phased blocks from the VCFs.
        Problems with the reference or the alignment files are reported as
        CommandLineError; a sample missing from the BAM/CRAM files only
        triggers a warning and yields an empty ReadSet.
        """
        reader = self._readset_reader
        if self._ignore_read_groups:
            for_sample = ""
        else:
            for_sample = "for sample {!r} ".format(sample)
        logger.info("Reading alignments %sand detecting alleles ...", for_sample)

        try:
            if self._fasta:
                reference = self._fasta[chromosome]
            else:
                reference = None
        except KeyError:
            raise CommandLineError(
                "Chromosome {!r} present in VCF file, but not in the reference FASTA {!r}".format(
                    chromosome, self._fasta.filename
                )
            )

        # Without read groups, no sample filter is passed to the reader.
        if self._ignore_read_groups:
            bam_sample = None
        else:
            bam_sample = sample

        try:
            readset = reader.read(chromosome, variants, bam_sample, reference, regions)
        except SampleNotFoundError:
            logger.warning("Sample %r not found in any BAM/CRAM file.", bam_sample)
            readset = ReadSet()
        except ReadSetError as e:
            raise CommandLineError(e)
        except ReferenceNotFoundError:
            # Suggest the same name with the "chr" prefix toggled, if present.
            alternative = chromosome[3:] if chromosome.startswith("chr") else "chr" + chromosome
            message = "The chromosome {!r} was not found in the BAM/CRAM file.".format(chromosome)
            if reader.has_reference(alternative):
                message += " Found {!r} instead".format(alternative)
            raise CommandLineError(message)

        vcf_source_ids = set()
        if read_vcf:
            # TODO this is a bit clumsy
            if self._vcfs is None:
                raise ValueError("call PhasedInputReader.read_vcfs() first")
            # Add phasing information from VCF files, if present
            sample_id = self._numeric_sample_ids[sample]
            for vcf_index, vcf in enumerate(self._vcfs):
                if chromosome not in vcf:
                    continue
                variant_table = vcf[chromosome]
                # VCF source ids are numbered after the reader's n_paths.
                source_id = reader.n_paths + vcf_index
                vcf_source_ids.add(source_id)
                phased_reads = variant_table.phased_blocks_as_reads(
                    sample, variants, source_id, sample_id
                )
                for phased_read in phased_reads:
                    readset.add(phased_read)

        # TODO is this necessary?
        for single_read in readset:
            single_read.sort()
        readset.sort()

        logger.info(
            "Found %d reads covering %d variants", len(readset), len(readset.get_positions()),
        )
        return readset, vcf_source_ids
Example #16
0
def create_testinstance1():
    """Create a fixed test instance.

    Returns the tuple (readset, var_pos, clustering, genotypes): a ReadSet
    built from a hard-coded allele matrix (column j of a row is a variant at
    var_pos[j]), the list of variant positions, a list of read-index
    clusters, and one {allele: count} dict per variant position.
    """
    var_pos = [
        24, 56, 89, 113, 162, 166, 187, 205, 211, 248, 273,
        299, 307, 324, 351, 370, 378, 400, 441, 455, 478, 492,
    ]

    readset = ReadSet()

    # One row per read; blanks mark columns the read does not cover.
    matrix = [
        "0011000",
        "11010100",
        " 101011010",
        " 0001011000",
        "  11001001",
        "  0010100000",
        "   100010001",
        "       0100000101",
        "    101110001",
        "        0001110011",
        "        1010001010",
        "     011100011",
        "         0010100111",
        "          1010101011",
        "          0101001110",
        "              01000001",
        "              01010001",
        "                101100",
        "                111010",
    ]

    for row_index, row in enumerate(matrix):
        read = Read(name="read" + str(row_index), mapq=15)
        for column, char in enumerate(row):
            if char != " ":
                read.add_variant(var_pos[column], int(char), 0)
        readset.add(read)

    clustering = [
        [0, 4, 6],
        [1, 2],
        [7, 10, 13],
        [9, 12, 14],
        [3, 5, 8, 11],
        [15, 16],
        [17],
        [18],
    ]

    # (count of allele 0, count of allele 1) per variant position.
    allele_counts = [
        (2, 1), (2, 1), (2, 1), (1, 2), (2, 1), (2, 1), (2, 1), (2, 1),
        (2, 1), (3, 0), (2, 1), (2, 1), (2, 1), (1, 2), (2, 1), (2, 1),
        (1, 2), (2, 1), (1, 2), (2, 1), (2, 1), (2, 1),
    ]
    genotypes = [{0: count0, 1: count1} for count0, count1 in allele_counts]

    return readset, var_pos, clustering, genotypes
Example #17
0
    def merge(self, readset):
        """
        Return a set of reads after merging together subsets of reads
        (into super reads) from an input readset according to a
        probabilistic model of how likely sets of reads are to appear
        together on one haplotype and on opposite haplotypes.

        readset -- the input .core.ReadSet object

        The model parameters are taken from the instance:
        self._error_rate -- the probability that a nucleotide is wrong
        self._max_error_rate -- the maximum error rate of any edge of the
        read merging graph allowed before we discard it
        self._positive_threshold -- the threshold of the ratio between the
        probabilities that a pair of reads come from the same haplotype and
        different haplotypes
        self._negative_threshold -- the (more conservative) threshold of the
        same ratio used as evidence that two reads must not be merged

        Returns a new ReadSet; an empty input yields an empty ReadSet.
        """
        logger.info(
            "Merging %d reads with error rate %.2f, maximum error rate %.2f, "
            "positive threshold %d and negative threshold %d ...",
            len(readset),
            self._error_rate,
            self._max_error_rate,
            self._positive_threshold,
            self._negative_threshold,
        )
        logger.debug("Merging started.")
        gblue = Graph()
        gred = Graph()
        gnotblue = Graph()
        gnotred = Graph()

        # Probability that any nucleotide is wrong
        error_rate = self._error_rate
        logger.debug("Error Rate: %s", error_rate)

        # If an edge has too many errors, we discard it since it is not reliable
        max_error_rate = self._max_error_rate
        logger.debug("Max Error Rate: %s", max_error_rate)

        # Threshold of the ratio between the probabilities that the two reads come from
        # the same side or from different sides
        thr = self._positive_threshold
        logger.debug("Positive Threshold: %s", thr)

        # Threshold_neg is a more conservative threshold for the evidence
        # that two reads should not be clustered together.
        thr_neg = self._negative_threshold
        logger.debug("Negative Threshold: %s", thr_neg)

        # Translate the probability-ratio thresholds into required differences
        # between the number of matching and mismatching sites.
        thr_diff = 1 + int(log(thr, (1 - error_rate) / (error_rate / 3)))
        thr_neg_diff = 1 + int(
            log(thr_neg, (1 - error_rate) / (error_rate / 3)))
        logger.debug("Thr. Diff.: %s - Thr. Neg. Diff.: %s", thr_diff,
                     thr_neg_diff)

        logger.debug("Start reading the reads...")
        read_id = 0
        orig_reads = {}
        # Reads that may still overlap the current one, keyed by read id.
        queue = {}
        for read in readset:
            read_id += 1
            begin_str = read[0][0]
            snps = []
            orgn = []
            for variant in read:

                site = variant[0]
                zyg = variant[1]
                qual = variant[2]

                orgn.append([str(site), str(zyg), str(qual)])
                if int(zyg) == 0:
                    snps.append("G")
                else:
                    snps.append("C")

            begin = int(begin_str)
            end = begin + len(snps)
            orig_reads[read_id] = orgn

            sites = "".join(snps)
            for graph in (gblue, gnotblue, gred, gnotred):
                graph.add_node(read_id, begin=begin, end=end, sites=sites)
            queue[read_id] = {"begin": begin, "end": end, "sites": snps}
            # Drop queued reads that end before the current read begins.
            for stale in [
                    key for key in queue.keys() if queue[key]["end"] <= begin
            ]:
                del queue[stale]
            for other_id in queue.keys():
                if read_id == other_id:
                    continue
                match, mismatch = eval_overlap(queue[other_id], queue[read_id])
                if (match + mismatch >= thr_neg_diff and min(match, mismatch) /
                    (match + mismatch) <= max_error_rate
                        and match - mismatch >= thr_diff):
                    gblue.add_edge(other_id,
                                   read_id,
                                   match=match,
                                   mismatch=mismatch)
                    # NOTE(review): the red / not-blue checks sit inside the
                    # blue-edge condition (match - mismatch >= thr_diff), so
                    # they can only be satisfied if a threshold difference is
                    # non-positive — confirm this nesting is intended.
                    if mismatch - match >= thr_diff:
                        gred.add_edge(other_id,
                                      read_id,
                                      match=match,
                                      mismatch=mismatch)
                    if match - mismatch >= thr_neg_diff:
                        gnotred.add_edge(other_id,
                                         read_id,
                                         match=match,
                                         mismatch=mismatch)
                    if mismatch - match >= thr_neg_diff:
                        gnotblue.add_edge(other_id,
                                          read_id,
                                          match=match,
                                          mismatch=mismatch)

        logger.debug("Finished reading the reads.")
        logger.debug("Number of reads: %s", read_id)
        for label, graph in (
            ("Blue Graph", gblue),
            ("Non-Blue Graph", gnotblue),
            ("Red Graph", gred),
            ("Non-Red Graph", gnotred),
        ):
            logger.debug(label)
            logger.debug(
                "Nodes: %s - Edges: %s - ConnComp: %s",
                number_of_nodes(graph),
                number_of_edges(graph),
                len(list(connected_components(graph))),
            )

        # We consider the notblue edges as an evidence that two reads
        # should not be merged together
        # Since we want to merge each blue connected components into
        # a single superread, we check each notblue edge (r1, r2) and
        # we remove some blue edges so that r1 and r2 are not in the
        # same blue connected component

        blue_component = {}
        for component_index, conncomp in enumerate(
                connected_components(gblue)):
            for v in conncomp:
                blue_component[v] = component_index

        # Keep only the notblue edges that are inside a blue connected component
        good_notblue_edges = [(v, w) for (v, w) in gnotblue.edges()
                              if blue_component[v] == blue_component[w]]

        for (u, v) in good_notblue_edges:
            while v in node_connected_component(gblue, u):
                path = shortest_path(gblue, source=u, target=v)
                # Remove the edge with the smallest support
                # A better strategy is to weight each edge with -log p
                # and remove the minimum (u,v)-cut
                w, x = min(
                    zip(path[:-1], path[1:]),
                    key=lambda p: gblue[p[0]][p[1]]["match"] - gblue[p[0]][p[
                        1]]["mismatch"],
                )
                gblue.remove_edge(w, x)

        # Merge blue components (somehow)
        logger.debug("Started Merging Reads...")
        superreads = {}  # superreads given by the clusters (if clustering)
        rep = {}  # cluster representative of a read in a cluster

        # The smallest read id of every multi-read component represents it.
        for cc in connected_components(gblue):
            if len(cc) > 1:
                r = min(cc)
                superreads[r] = {}
                for member in cc:
                    rep[member] = r

        # Accumulate, per representative and site, the summed quality of
        # each allele over all reads of the cluster.
        for member, tokens in orig_reads.items():
            if member in rep:
                r = rep[member]
                for tok in tokens:
                    site = int(tok[0])
                    zyg = int(tok[1])
                    qual = int(tok[2])
                    if site not in superreads[r]:
                        superreads[r][site] = [0, 0]
                    superreads[r][site][zyg] += qual

        # Build the output once, after all qualities have been accumulated.
        # (This used to sit inside the loop above, which rebuilt the result
        # for every read and left merged_reads unbound for an empty input.)
        merged_reads = ReadSet()
        for readn, member in enumerate(orig_reads):
            if member in rep and member != rep[member]:
                # Absorbed into the superread of its representative.
                continue
            read = Read("read" + str(readn))
            if member in rep:
                for site in sorted(superreads[member]):
                    z = superreads[member][site]
                    # Majority allele wins (ties go to allele 0); the quality
                    # is the margin between the two accumulated sums.
                    if z[0] >= z[1]:
                        read.add_variant(site, 0, z[0] - z[1])
                    else:
                        read.add_variant(site, 1, z[1] - z[0])
            else:
                for tok in orig_reads[member]:
                    read.add_variant(int(tok[0]), int(tok[1]), int(tok[2]))
            merged_reads.add(read)

        logger.debug("Finished merging reads.")
        logger.info(
            "... after merging: merged %d reads into %d reads",
            len(readset),
            len(merged_reads),
        )

        return merged_reads
Example #18
0
def test_readscoring_toy():
    """
    Score a toy read set of seven staggered reads and check the sign pattern
    of the pairwise similarities.

    Read k (0-based) is named "name<k+1>", has mapping quality 15 and covers
    the four consecutive positions k .. k+3, each with quality 1. The expected
    sign of a pair's score depends only on how far apart the two reads start.
    """
    # One row per read: the alleles at its four consecutive positions.
    allele_rows = (
        (0, 0, 0, 1),
        (1, 0, 0, 1),
        (0, 1, 0, 1),
        (0, 1, 0, 0),
        (0, 1, 1, 0),
        (0, 0, 0, 1),
        (1, 0, 0, 1),
    )

    readset = ReadSet()
    for start, alleles in enumerate(allele_rows):
        read = Read("name{}".format(start + 1), 15)
        for shift, allele in enumerate(alleles):
            read.add_variant(start + shift, allele, 1)
        readset.add(read)

    sim = scoreReadsetGlobal(readset, 2, 2)

    # Expected sign of the score, keyed by the distance between read indices.
    sign_check = {
        1: lambda score: score < 0.0,
        2: lambda score: score > 0.0,
        3: lambda score: score <= 0.0,
        4: lambda score: score >= 0.0,
        5: lambda score: score <= 0.0,
        6: lambda score: score >= 0.0,
    }
    read_count = len(allele_rows)
    for first in range(read_count):
        for second in range(first + 1, read_count):
            assert sign_check[second - first](sim.get(first, second))
Example #19
0
def run_genotype(
    phase_input_files,
    variant_file,
    reference=None,
    output=sys.stdout,
    samples=None,
    chromosomes=None,
    ignore_read_groups=False,
    indels=True,
    mapping_quality=20,
    max_coverage=15,
    nopriors=False,
    ped=None,
    recombrate=1.26,
    genmap=None,
    gt_qual_threshold=0,
    prioroutput=None,
    constant=0.0,
    overhang=10,
    affine_gap=False,
    gap_start=10,
    gap_extend=7,
    mismatch=15,
    write_command_line_header=True,
    use_ped_samples=False,
):
    """
    Run the genotyping algorithm and write the resulting genotypes and
    genotype likelihoods to a VCF.

    For now: this function only runs the genotyping algorithm. Genotype likelihoods for
    all variants are computed using the forward backward algorithm.

    For every chromosome table yielded by the VCF reader the function
    (1) computes prior genotype likelihoods per sample (from the reads, or
    uniform if ``nopriors`` is set), (2) optionally writes these priors to
    ``prioroutput``, (3) builds one ``GenotypeDPTable`` per family/individual
    and overwrites genotypes and likelihoods at the positions covered by the
    selected reads, and (4) writes the table to ``output``. A timing summary
    is logged at the end.

    Arguments:
    phase_input_files -- input files handed to PhasedInputReader (BAMs/VCFs).
    variant_file -- VCF with the variants to genotype; also used as template
        (``in_path``) for the output writers.
    reference -- reference handed to PhasedInputReader (may be None).
    output -- file object the genotyped VCF is written to.
    samples -- names of the samples to genotype; if empty/None, all samples
        of the VCF are used.
    chromosomes -- if non-empty, only these chromosomes are genotyped; all
        others are written out unchanged.
    ignore_read_groups -- forwarded to PhasedInputReader; on a multi-sample
        VCF this requires ``samples`` to be given.
    indels -- forwarded to the readers and writers.
    mapping_quality -- forwarded to PhasedInputReader as ``mapq_threshold``.
    max_coverage -- coverage budget; divided evenly among the members of a
        family (at least 1 per sample) for read selection.
    nopriors -- if true, use uniform priors (1/3 each) instead of priors
        computed from the reads.
    ped -- PED file used to set up families; may be None.
    recombrate -- uniform recombination rate in cM/Mb, used unless both
        ``ped`` and ``genmap`` are given.
    genmap -- genetic map for region-specific recombination rates; only used
        together with ``ped``.
    gt_qual_threshold -- phred-scaled threshold, converted to a probability
        and passed to determine_genotype.
    prioroutput -- path of an extra VCF receiving only the prior likelihoods;
        None disables it.
    constant -- value added to each of the three prior genotype likelihoods
        before they are renormalized.
    overhang, affine_gap, gap_start, gap_extend, mismatch -- forwarded to
        PhasedInputReader (as ``overhang``, ``affine``, ``gap_start``,
        ``gap_extend`` and ``default_mismatch``).
    write_command_line_header -- if true, a command-line string is passed to
        the VCF writers; otherwise None is passed.
    use_ped_samples -- if true (and ``ped`` is given), replace ``samples`` by
        the members of all complete trios in the PED file.

    Raises CommandLineError if ``ignore_read_groups`` is used on a
    multi-sample VCF without ``samples``, or if a requested sample is not
    present in the VCF.
    """
    timers = StageTimer()
    logger.info(
        "This is WhatsHap (genotyping) %s running under Python %s",
        __version__,
        platform.python_version(),
    )
    if write_command_line_header:
        command_line = "(whatshap {}) {}".format(__version__,
                                                 " ".join(sys.argv[1:]))
    else:
        command_line = None
    # The ExitStack owns every reader/writer opened below and closes them
    # all when the block is left, also on errors.
    with ExitStack() as stack:
        # read the given input files (BAMs, VCFs, ref...)
        numeric_sample_ids = NumericSampleIds()
        phased_input_reader = stack.enter_context(
            PhasedInputReader(
                phase_input_files,
                reference,
                numeric_sample_ids,
                ignore_read_groups,
                indels=indels,
                mapq_threshold=mapping_quality,
                overhang=overhang,
                affine=affine_gap,
                gap_start=gap_start,
                gap_extend=gap_extend,
                default_mismatch=mismatch,
            ))
        # remembered for the timing summary, which runs after the stack closed
        show_phase_vcfs = phased_input_reader.has_vcfs

        # vcf writer for final genotype likelihoods
        vcf_writer = stack.enter_context(
            GenotypeVcfWriter(command_line=command_line,
                              in_path=variant_file,
                              out_file=output))
        # vcf writer for only the prior likelihoods (if output is desired)
        prior_vcf_writer = None
        if prioroutput is not None:
            # the output file itself is also registered with the stack so it
            # gets closed together with the writer
            prior_vcf_writer = stack.enter_context(
                GenotypeVcfWriter(
                    command_line=command_line,
                    in_path=variant_file,
                    out_file=stack.enter_context(open(prioroutput, "w")),
                ))

        # parse vcf with input variants
        # remove all likelihoods that may already be present
        vcf_reader = stack.enter_context(
            VcfReader(
                variant_file,
                indels=indels,
                genotype_likelihoods=False,
                ignore_genotypes=True,
            ))

        if ignore_read_groups and not samples and len(vcf_reader.samples) > 1:
            raise CommandLineError(
                "When using --ignore-read-groups on a VCF with "
                "multiple samples, --sample must also be used.")
        if not samples:
            samples = vcf_reader.samples

        # if --use-ped-samples is set, use only samples from PED file
        if ped and use_ped_samples:
            samples = set()
            for trio in PedReader(ped):
                # incomplete trios (any member missing) contribute no samples
                if trio.child is None or trio.mother is None or trio.father is None:
                    continue
                samples.add(trio.mother)
                samples.add(trio.father)
                samples.add(trio.child)

        # every requested sample must exist in the VCF
        vcf_sample_set = set(vcf_reader.samples)
        for sample in samples:
            if sample not in vcf_sample_set:
                raise CommandLineError(
                    "Sample {!r} requested on command-line not found in VCF".
                    format(sample))

        # A genetic map is only honoured together with a PED file; in every
        # other case the uniform recombination rate is used.
        if ped and genmap:
            logger.info(
                "Using region-specific recombination rates from genetic map %s.",
                genmap,
            )
            recombination_cost_computer = GeneticMapRecombinationCostComputer(
                genmap)
        else:
            if ped:
                logger.info("Using uniform recombination rate of %g cM/Mb.",
                            recombrate)
            recombination_cost_computer = UniformRecombinationCostComputer(
                recombrate)

        samples = frozenset(samples)
        families, family_trios = setup_families(samples, ped,
                                                numeric_sample_ids,
                                                max_coverage)

        # Read phase information provided as VCF files, if provided.
        with timers("parse_phasing_vcfs"):
            phased_input_reader.read_vcfs()

        # compute genotype likelihood threshold
        # (phred-scaled quality q -> probability 1 - 10^(-q/10))
        gt_prob = 1.0 - (10**(-gt_qual_threshold / 10.0))

        # one variant table per chromosome
        for variant_table in timers.iterate("parse_vcf", vcf_reader):

            # create a mapping of genome positions to indices
            var_to_pos = dict()
            for i in range(len(variant_table.variants)):
                var_to_pos[variant_table.variants[i].position] = i

            chromosome = variant_table.chromosome
            if (not chromosomes) or (chromosome in chromosomes):
                logger.info("======== Working on chromosome %r", chromosome)
            else:
                # chromosome not requested: copy it to the output(s) untouched
                logger.info(
                    "Leaving chromosome %r unchanged (present in VCF but not requested by option --chromosome)",
                    chromosome,
                )
                vcf_writer.write_genotypes(chromosome,
                                           variant_table,
                                           indels,
                                           leave_unchanged=True)
                if prioroutput is not None:
                    prior_vcf_writer.write_genotypes(chromosome,
                                                     variant_table,
                                                     indels,
                                                     leave_unchanged=True)
                continue

            positions = [v.position for v in variant_table.variants]
            if not nopriors:
                # compute prior genotype likelihoods based on all reads
                for sample in samples:
                    logger.info("---- Initial genotyping of %s", sample)
                    # NOTE: this timer block also covers the prior genotype
                    # computation below, not only the reading of the BAM.
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                            read_vcf=False,
                        )
                        readset.sort()
                        genotypes, genotype_likelihoods = compute_genotypes(
                            readset, positions)
                        # recompute genotypes based on given threshold
                        reg_genotype_likelihoods = []
                        for gl in range(len(genotype_likelihoods)):
                            # add `constant` to each of the three likelihoods
                            # and renormalize so that they sum to one again
                            norm_sum = (genotype_likelihoods[gl][0] +
                                        genotype_likelihoods[gl][1] +
                                        genotype_likelihoods[gl][2] +
                                        3 * constant)
                            regularized = PhredGenotypeLikelihoods([
                                (genotype_likelihoods[gl][0] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][1] + constant) /
                                norm_sum,
                                (genotype_likelihoods[gl][2] + constant) /
                                norm_sum,
                            ])
                            genotypes[gl] = determine_genotype(
                                regularized, gt_prob)
                            assert isinstance(genotypes[gl], Genotype)
                            reg_genotype_likelihoods.append(regularized)
                        variant_table.set_genotype_likelihoods_of(
                            sample,
                            [
                                PhredGenotypeLikelihoods(list(gl))
                                for gl in reg_genotype_likelihoods
                            ],
                        )
                        variant_table.set_genotypes_of(sample, genotypes)
            else:

                # use uniform genotype likelihoods for all individuals
                for sample in samples:
                    variant_table.set_genotype_likelihoods_of(
                        sample,
                        [PhredGenotypeLikelihoods([1 / 3, 1 / 3, 1 / 3])] *
                        len(positions),
                    )

            # if desired, output the priors in separate vcf
            if prioroutput is not None:
                prior_vcf_writer.write_genotypes(chromosome, variant_table,
                                                 indels)

            # Iterate over all families to process, i.e. a separate DP table is created
            # for each family.
            for representative_sample, family in sorted(families.items()):
                if len(family) == 1:
                    logger.info("---- Processing individual %s",
                                representative_sample)
                else:
                    logger.info("---- Processing family with individuals: %s",
                                ",".join(family))
                # split the coverage budget evenly among the family members,
                # but never go below a coverage of 1 per sample
                max_coverage_per_sample = max(1, max_coverage // len(family))
                logger.info("Using maximum coverage per sample of %dX",
                            max_coverage_per_sample)
                trios = family_trios[representative_sample]
                assert (len(family) == 1) or (len(trios) > 0)

                # Get the reads belonging to each sample
                readsets = dict()
                for sample in family:
                    with timers("read_bam"):
                        readset, vcf_source_ids = phased_input_reader.read(
                            chromosome,
                            variant_table.variants,
                            sample,
                        )

                    with timers("select"):
                        # only reads covering at least two variants are kept
                        readset = readset.subset([
                            i for i, read in enumerate(readset)
                            if len(read) >= 2
                        ])
                        logger.info(
                            "Kept %d reads that cover at least two variants each",
                            len(readset),
                        )
                        selected_reads = select_reads(
                            readset,
                            max_coverage_per_sample,
                            preferred_source_ids=vcf_source_ids,
                        )
                    readsets[sample] = selected_reads

                # Merge reads into one ReadSet (note that each Read object
                # knows the sample it originated from).
                all_reads = ReadSet()
                for sample, readset in readsets.items():
                    for read in readset:
                        assert read.is_sorted(), "Add a read.sort() here"
                        all_reads.add(read)

                all_reads.sort()

                # Determine which variants can (in principle) be phased
                accessible_positions = sorted(all_reads.get_positions())
                logger.info(
                    "Variants covered by at least one phase-informative "
                    "read in at least one individual after read selection: %d",
                    len(accessible_positions),
                )

                # Create Pedigree
                pedigree = Pedigree(numeric_sample_ids)
                for sample in family:
                    # genotypes are assumed to be unknown, so ignore information that
                    # might already be present in the input vcf
                    all_genotype_likelihoods = variant_table.genotype_likelihoods_of(
                        sample)
                    # restrict the priors to the accessible positions
                    genotype_l = [
                        all_genotype_likelihoods[var_to_pos[a_p]]
                        for a_p in accessible_positions
                    ]
                    pedigree.add_individual(
                        sample,
                        [
                            Genotype([])
                            for i in range(len(accessible_positions))
                        ],
                        genotype_l,
                    )
                for trio in trios:
                    pedigree.add_relationship(
                        father_id=trio.father,
                        mother_id=trio.mother,
                        child_id=trio.child,
                    )

                recombination_costs = recombination_cost_computer.compute(
                    accessible_positions)

                # Finally, run genotyping algorithm
                with timers("genotyping"):
                    problem_name = "genotyping"
                    logger.info(
                        "Genotype %d sample%s by solving the %s problem ...",
                        len(family),
                        "s" if len(family) > 1 else "",
                        problem_name,
                    )
                    forward_backward_table = GenotypeDPTable(
                        numeric_sample_ids,
                        all_reads,
                        recombination_costs,
                        pedigree,
                        accessible_positions,
                    )
                    # store results
                    for s in family:
                        # Start from the values currently stored in the table;
                        # only entries at accessible positions are replaced,
                        # all other positions keep their prior values.
                        likelihood_list = variant_table.genotype_likelihoods_of(
                            s)
                        genotypes_list = variant_table.genotypes_of(s)

                        for pos in range(len(accessible_positions)):
                            likelihoods = forward_backward_table.get_genotype_likelihoods(
                                s, pos)

                            # compute genotypes from likelihoods and store information
                            geno = determine_genotype(likelihoods, gt_prob)
                            assert isinstance(geno, Genotype)
                            genotypes_list[var_to_pos[
                                accessible_positions[pos]]] = geno
                            likelihood_list[var_to_pos[
                                accessible_positions[pos]]] = likelihoods

                        variant_table.set_genotypes_of(s, genotypes_list)
                        variant_table.set_genotype_likelihoods_of(
                            s, likelihood_list)

            with timers("write_vcf"):
                logger.info("======== Writing VCF")
                vcf_writer.write_genotypes(chromosome, variant_table, indels)
                logger.info("Done writing VCF")

            logger.debug("Chromosome %r finished", chromosome)

    # All readers and writers are closed at this point; report the timings.
    logger.info("\n== SUMMARY ==")
    total_time = timers.total()
    log_memory_usage()
    logger.info(
        "Time spent reading BAM:                      %6.1f s",
        timers.elapsed("read_bam"),
    )
    logger.info(
        "Time spent parsing VCF:                      %6.1f s",
        timers.elapsed("parse_vcf"),
    )
    if show_phase_vcfs:
        logger.info(
            "Time spent parsing input phasings from VCFs: %6.1f s",
            timers.elapsed("parse_phasing_vcfs"),
        )
    logger.info("Time spent selecting reads:                  %6.1f s",
                timers.elapsed("select"))
    logger.info(
        "Time spent genotyping:                          %6.1f s",
        timers.elapsed("genotyping"),
    )
    logger.info(
        "Time spent writing VCF:                      %6.1f s",
        timers.elapsed("write_vcf"),
    )
    logger.info(
        "Time spent on rest:                          %6.1f s",
        total_time - timers.sum(),
    )
    logger.info("Total elapsed time:                          %6.1f s",
                total_time)
Example #20
0
def gfa_to_readset(gfa_filename,
                   split_gap=100,
                   w=None,
                   sample_ids=None,
                   source_id=0,
                   scale_quality=None):
    """
    Build a ReadSet from the path (P) lines of a tab-separated GFA file.

    Segment (S) lines provide the sequence length of every node; node names
    must be integers. For every P line, the node ids of its steps (each step
    with its last character, the orientation sign, removed) are deduplicated
    and sorted numerically. The sorted ids are then cut into segments: a new
    segment starts whenever the summed sequence length of the nodes whose ids
    lie strictly between two consecutive path nodes exceeds ``split_gap``.
    NOTE(review): this treats numeric id order as the order of nodes along
    the graph -- confirm this holds for the input graphs.

    Within a segment, every node of the path becomes a variant with allele 1
    and quality ``-10 * log10(1 - 1/coverage + 0.001)``, where coverage is the
    number of times the node occurs in any P line of the file. Every skipped
    node between two consecutive path nodes becomes a variant with allele 0
    and quality 1. Only the longest segment of each path (the first one on
    ties) is added to the ReadSet, as a Read named after the path with
    mapping quality 50.

    Arguments:
    gfa_filename -- path of the GFA file to read.
    split_gap -- maximum summed length of skipped nodes tolerated inside one
        segment.
    w, sample_ids, scale_quality -- accepted for interface compatibility but
        currently unused.
    source_id -- source id passed to every created Read.

    Returns the sorted ReadSet with one Read per P line.

    Raises KeyError if a skipped node id has no S line, and ValueError if a
    node name is not an integer.
    """
    node_length = {}
    node_coverage = {}
    paths = []  # one (path name, node ids in file order) entry per P line

    # A single pass over the file is enough: lengths and coverages are only
    # needed once all lines have been seen, so the paths are buffered.
    with open(gfa_filename) as gfa_file:
        for line in gfa_file:
            fields = line.strip().split("\t")
            record_type = fields[0]
            if record_type == "S":
                node_length[int(fields[1])] = len(fields[2])
            elif record_type == "P":
                # strip the trailing character (orientation) of every step
                node_ids = [int(step[:-1]) for step in fields[2].split(",")]
                # coverage counts every occurrence, including repeats
                # within the same path
                for node_id in node_ids:
                    node_coverage[node_id] = node_coverage.get(node_id, 0) + 1
                paths.append((fields[1], node_ids))

    def visited_quality(node_id):
        # the 0.001 keeps the logarithm finite for nodes with coverage 1
        return -10 * math.log10(1 - 1.0 / node_coverage[node_id] + 0.001)

    rs = ReadSet()
    for path_name, node_ids in paths:
        path = sorted(set(node_ids))
        path_length = len(path)
        longest_read = None
        i = 0
        while i < path_length:
            # start a new segment at path[i]
            read = Read("{}".format(path_name), 50, source_id)
            last = path[i]
            read.add_variant(position=last,
                             allele=1,
                             quality=visited_quality(last))
            i += 1
            # extend the segment while the gap to the next node is small
            while i < path_length:
                curr = path[i]
                skipped = range(last + 1, curr)
                dist = sum(node_length[node_id] for node_id in skipped)
                if dist > split_gap:
                    break
                # nodes jumped over by the path are recorded as absent
                for node_id in skipped:
                    read.add_variant(position=node_id, allele=0, quality=1)
                read.add_variant(position=curr,
                                 allele=1,
                                 quality=visited_quality(curr))
                last = curr
                i += 1
            # strict comparison: the earliest segment wins on equal length
            if longest_read is None or len(read) > len(longest_read):
                longest_read = read
        rs.add(longest_read)
    rs.sort()
    return rs