Esempio n. 1
0
def variant_context(
        reference_fasta,
        contig,
        inclusive_start,
        inclusive_end,
        alt,
        context_length):
    """
    Retrieve the surrounding reference region of a variant.

    SNVs are canonicalized so the reference base is a pyrimidine (C/T). For
    indels the reverse complement will still be taken if the first base of
    the reference is not a pyrimidine, but since the reference will also be
    reversed, that doesn't guarantee it will start with a pyrimidine.

    Parameters
    ----------
    reference_fasta : FastaReference
        reference sequence from pyfaidx package; indexed by contig name,
        then sliced, and the ``.seq`` attribute of the slice is read

    contig : str
        Chromosome of the variant

    inclusive_start : int
        start of the variant in 1-based inclusive coordinates

    inclusive_end : int
        end of the variant in 1-based inclusive coordinates

    alt : string
        alt sequence

    context_length : int
        number of bases on either side of the variant to return

    Returns
    ---------
    A tuple of (5', mutation, 3') where
        5' - bases immediately 5 prime to the mutation; shorter than
            ``context_length`` when the variant lies closer than that to
            the start of the contig

        mutation - the ref sequence followed by a > character followed by
            the alt sequence

        3' - bases immediately 3 prime to the mutation
    """

    # Move from 1-based inclusive coordinates to 0-based half-open coordinates
    start = int(inclusive_start) - 1
    end = int(inclusive_end)

    full_sequence = reference_fasta[contig]

    # Clamp the left edge at the contig start. A negative slice start would
    # be interpreted relative to the *end* of the sequence, which yields an
    # empty or unrelated 5' context for variants near the contig start.
    left_start = max(0, start - context_length)

    left = str(full_sequence[left_start:start].seq).upper()
    middle = str(full_sequence[start:end].seq).upper()
    right = str(full_sequence[end:end + context_length].seq).upper()

    # Complement and reverse the context if necessary so the ref base is a
    # pyrimidine (C/T)
    if middle[0] in ('A', 'G'):
        # On the opposite strand the flanks swap sides as well
        context_5prime = pyfaidx.complement(right)[::-1]
        context_3prime = pyfaidx.complement(left)[::-1]
        context_mutation = "%s>%s" % (
            pyfaidx.complement(middle)[::-1], pyfaidx.complement(alt)[::-1])
    else:
        context_5prime = left
        context_3prime = right
        context_mutation = "%s>%s" % (middle, alt)

    return (context_5prime, context_mutation, context_3prime)
Esempio n. 2
0
def complement_seq(seq):
    """Return ``seq`` passed through ``pyfaidx.complement``."""
    complemented = pyfaidx.complement(seq)
    return complemented
def calc_nuc_counts(pos_signal_bedtool, neg_signal_bedtool, fasta_filename,
                    revcomp_strand, min_counts,
                    offset_min, offset_max, region_size,
                    ignore_chroms, only_chroms, verbose):
    """Tally the sequences found at a range of offsets around signal sites.

    Parameters
    ----------
    pos_signal_bedtool, neg_signal_bedtool : iterable
        rows exposing ``chrom``, ``start`` and ``count``; the first iterable
        is treated as the '+' strand, the second as the '-' strand
    fasta_filename : str
        passed to ``Fasta(..., as_raw=True)`` to fetch sequences
    revcomp_strand : bool
        when true, '+' strand windows are reverse complemented; when false,
        '-' strand windows are
    min_counts : number
        rows whose ``count`` is below this value are skipped
    offset_min, offset_max : int
        inclusive range of offsets examined for every row
    region_size : int
        length of the window fetched at each offset
    ignore_chroms : container
        rows whose ``chrom`` is in this container are skipped
    only_chroms : container
        when non-empty, rows whose ``chrom`` is not in it are skipped
    verbose : bool
        when true, a summary of the settings is written to stderr

    Returns
    -------
    (total_sites, nuc_counts) where ``total_sites`` is the number of rows
    that passed all filters and ``nuc_counts`` maps offset -> Counter of
    sequence -> summed ``row.count``. Only sequences of exactly
    ``region_size`` characters are kept.
    """

    if verbose:
        msg = ">> analyzing sequences ...\n"
        msg += ">> ignore:%s only:%s\n" % \
            (str(ignore_chroms), str(only_chroms))
        msg += ">> offset range: %d to %d\n" % (offset_min, offset_max)
        msg += ">> region size: %d\n" % (region_size)
        msg += ">> revcomp strand: %s\n" % str(revcomp_strand)
        # write() instead of the Python-2-only print statement; the extra
        # newline reproduces what the print statement appended
        sys.stderr.write(msg + "\n")

    seq_fasta = Fasta(fasta_filename, as_raw=True)

    nuc_counts = defaultdict(Counter)

    bedtools = (pos_signal_bedtool, neg_signal_bedtool)
    strands = ('+', '-')

    # total number of sites examined
    total_sites = 0

    # builtin zip works on Python 2 and 3 alike (izip is Python 2 only)
    for bedtool, strand in zip(bedtools, strands):

        for row in bedtool:

            # skip data based on specified chromosomes
            if row.chrom in ignore_chroms:
                continue
            if only_chroms and row.chrom not in only_chroms:
                continue

            # skip data if counts are too low
            if row.count < min_counts:
                continue

            # sites in bedgraph examined - must come after all checks
            # above
            total_sites += 1

            for offset in range(offset_min, offset_max + 1):

                # upstream offsets are negative values
                if strand == '+':
                    start = row.start + offset
                elif strand == '-':
                    start = row.start - offset

                if region_size == 1:
                    # half open at the position of interest
                    end = start + region_size
                else:
                    # make sure that the 3' most position in a region
                    # is the base of interest
                    if strand == '+':
                        end = start + 1  # include position with + 1
                        start = end - region_size
                    else:
                        # negative strand
                        end = start + region_size

                # A window that begins before the chromosome start has no
                # valid sequence. Negative slice indices are interpreted
                # relative to the sequence end, so fetching it would count
                # unrelated bases.
                if start < 0:
                    continue

                nucs = seq_fasta[row.chrom][start:end]

                #  1. libs where the captured strand is sequenced
                #     are the correct polarity as-is (i.e. Excision-seq
                #     libs)
                #  2. libs where the *copy* of the captured strand
                #     is sequenced should be revcomplemented (i.e.
                #     circularization-based libs)

                if (strand == '+' and revcomp_strand) or \
                   (strand == '-' and not revcomp_strand):
                    nucs = complement(nucs[::-1])

                nuc_counts[offset][nucs] += row.count

    # remove nucs that are not len region_size; collect the keys first so
    # the Counter is never resized while it is being iterated
    for counts in nuc_counts.values():
        wrong_length = [nuc for nuc in counts if len(nuc) != region_size]
        for nuc in wrong_length:
            del counts[nuc]

    return total_sites, nuc_counts
Esempio n. 4
0
def calc_nuc_counts(pos_signal_bedtool, neg_signal_bedtool, fasta_filename,
                    revcomp_strand, min_counts, offset_min, offset_max,
                    region_size):
    """Tally the sequences found at a range of offsets around signal sites.

    Parameters
    ----------
    pos_signal_bedtool, neg_signal_bedtool : iterable
        rows exposing ``chrom``, ``start`` and ``count``; the first iterable
        is treated as the '+' strand, the second as the '-' strand
    fasta_filename : str
        passed to ``Fasta(..., as_raw=True)`` to fetch sequences
    revcomp_strand : bool
        when true, '+' strand windows are reverse complemented; when false,
        '-' strand windows are
    min_counts : number
        rows whose ``count`` is below this value are skipped
    offset_min, offset_max : int
        inclusive range of offsets examined for every row
    region_size : int
        length of the window fetched at each offset

    Returns
    -------
    (total_sites, nuc_counts) where ``total_sites`` is the number of rows
    that passed the count filter and ``nuc_counts`` maps offset -> Counter
    of sequence -> summed ``row.count``. Only sequences of exactly
    ``region_size`` characters are kept.
    """

    seq_fasta = Fasta(fasta_filename, as_raw=True)

    nuc_counts = defaultdict(Counter)

    bedtools = (pos_signal_bedtool, neg_signal_bedtool)
    strands = ('+', '-')

    # total number of sites examined
    total_sites = 0

    for bedtool, strand in zip(bedtools, strands):

        for row in bedtool:

            # skip data if counts are too low
            if row.count < min_counts:
                continue

            # sites in bedgraph examined - must come after all checks
            # above
            total_sites += 1

            for offset in range(offset_min, offset_max + 1):

                # upstream offsets are negative values
                if strand == '+':
                    start = row.start + offset
                elif strand == '-':
                    start = row.start - offset

                if region_size == 1:
                    # half open at the position of interest
                    end = start + region_size
                else:
                    # make sure that the 3' most position in a region
                    # is the base of interest
                    if strand == '+':
                        end = start + 1  # include position with + 1
                        start = end - region_size
                    else:
                        # negative strand
                        end = start + region_size

                # A window that begins before the chromosome start has no
                # valid sequence. Negative slice indices are interpreted
                # relative to the sequence end, so fetching it would count
                # unrelated bases.
                if start < 0:
                    continue

                nucs = seq_fasta[row.chrom][start:end]

                #  1. libs where the captured strand is sequenced
                #     are the correct polarity as-is (i.e. Excision-seq
                #     libs)
                #  2. libs where the *copy* of the captured strand
                #     is sequenced should be revcomplemented (i.e.
                #     circularization-based libs)

                if (strand == '+' and revcomp_strand) or \
                   (strand == '-' and not revcomp_strand):
                    nucs = complement(nucs[::-1])

                nuc_counts[offset][nucs] += row.count

    # remove nucs that are not len region_size (e.g. windows truncated at
    # the end of a chromosome); collect the keys first so the Counter is
    # never resized while it is being iterated
    for counts in nuc_counts.values():
        wrong_length = [nuc for nuc in counts if len(nuc) != region_size]
        for nuc in wrong_length:
            del counts[nuc]

    return total_sites, nuc_counts
Esempio n. 5
0
    def extract(self, interval, variants, anchor, fixed_len=True, use_strand=None, **kwargs):
        """
        Return the sequence of `interval` with `variants` applied.

        Args:
            interval: pybedtools.Interval Region of interest from
                which to query the sequence. 0-based
            variants: List[cyvcf2.Variant]: variants overlapping the `interval`.
                can also be indels. 1-based
            anchor: absolute position w.r.t. the interval start. (0-based).
                E.g. for an interval of `chr1:10-20` the anchor of 10 denotes
                the point chr1:10 in the 0-based coordinate system. Values
                outside the interval are clamped to its start/end.
            fixed_len: if True, the return sequence will have the same length
                as the `interval` (e.g. `interval.end - interval.start`)
            use_strand (bool, optional): if True, the extracted sequence
                is reverse complemented in case interval.strand == "-".
                Overrides `self.use_strand`

        Returns:
            A single sequence (`str`) with all the variants applied.
        """
        # Keep the anchor inside [interval.start, interval.end]
        anchor = max(min(anchor, interval.end), interval.start)

        # 1. Turn the variants into pairs and split those overlapping the
        # anchor; without a fixed length, also split at the interval borders
        pairs = self._split_overlapping(
            self._variant_to_sequence(variants), anchor)
        if not fixed_len:
            pairs = self._split_overlapping(
                pairs, interval.start, which='right')
            pairs = self._split_overlapping(
                pairs, interval.end, which='left')
        pairs = list(pairs)

        def pair_start(pair):
            # start coordinate of the first element of a pair
            return pair[0].start

        # 2. Partition the pairs around the anchor: those starting at or
        # after it in ascending order, those starting before it in
        # descending order (i.e. both walk away from the anchor)
        at_or_after_anchor = sorted(
            (pair for pair in pairs if pair_start(pair) >= anchor),
            key=pair_start
        )
        before_anchor = sorted(
            (pair for pair in pairs if pair_start(pair) < anchor),
            key=pair_start,
            reverse=True
        )

        # 3. With a fixed length the fetched range is recomputed from the
        # variants; otherwise the interval bounds are used unchanged
        if fixed_len:
            fetch_start, fetch_end = self._updated_interval(
                interval, at_or_after_anchor, before_anchor)
        else:
            fetch_start = interval.start
            fetch_end = interval.end

        # 4. Builders register, from the anchor outwards, which reference
        # and variant stretches make up each side
        before_builder = self._downstream_builder(
            before_anchor, interval, anchor, fetch_start)
        after_builder = self._upstream_builder(
            at_or_after_anchor, interval, anchor, fetch_end)

        # 5. Fetch the reference once and hand it to both builders
        reference = self._fetch(interval, fetch_start, fetch_end)
        after_builder.restore(reference)
        before_builder.restore(reference)

        # 6. Concatenate each side, trim to the fixed length if requested,
        # then join the side before the anchor with the side after it
        before_str = before_builder.concat()
        after_str = after_builder.concat()
        if fixed_len:
            before_str, after_str = self._cut_to_fix_len(
                before_str, after_str, interval, anchor)
        result = before_str + after_str

        # An explicit use_strand argument wins over the instance setting
        strand_aware = self.use_strand if use_strand is None else use_strand
        if strand_aware and interval.strand == '-':
            # reverse-complement
            result = complement(result)[::-1]

        return result
Esempio n. 6
0
def variant_context(reference_fasta, contig, inclusive_start, inclusive_end,
                    alt, context_length):
    """
    Retrieve the surrounding reference region of a variant.

    SNVs are canonicalized so the reference base is a pyrimidine (C/T). For
    indels the reverse complement will still be taken if the first base of
    the reference is not a pyrimidine, but since the reference will also be
    reversed, that doesn't guarantee it will start with a pyrimidine.

    Parameters
    ----------
    reference_fasta : FastaReference
        reference sequence from pyfaidx package; indexed by contig name,
        then sliced, and the ``.seq`` attribute of the slice is read

    contig : str
        Chromosome of the variant

    inclusive_start : int
        start of the variant in 1-based inclusive coordinates

    inclusive_end : int
        end of the variant in 1-based inclusive coordinates

    alt : string
        alt sequence

    context_length : int
        number of bases on either side of the variant to return

    Returns
    ---------
    A tuple of (5', mutation, 3') where
        5' - bases immediately 5 prime to the mutation; shorter than
            ``context_length`` when the variant lies closer than that to
            the start of the contig

        mutation - the ref sequence followed by a > character followed by
            the alt sequence

        3' - bases immediately 3 prime to the mutation
    """

    # Move from 1-based inclusive coordinates to 0-based half-open coordinates
    start = int(inclusive_start) - 1
    end = int(inclusive_end)

    full_sequence = reference_fasta[contig]

    # Clamp the left edge at the contig start. A negative slice start would
    # be interpreted relative to the *end* of the sequence, which yields an
    # empty or unrelated 5' context for variants near the contig start.
    left_start = max(0, start - context_length)

    left = str(full_sequence[left_start:start].seq).upper()
    middle = str(full_sequence[start:end].seq).upper()
    right = str(full_sequence[end:end + context_length].seq).upper()

    # Complement and reverse the context if necessary so the ref base is a
    # pyrimidine (C/T)
    if middle[0] in ('A', 'G'):
        # On the opposite strand the flanks swap sides as well
        context_5prime = pyfaidx.complement(right)[::-1]
        context_3prime = pyfaidx.complement(left)[::-1]
        context_mutation = "%s>%s" % (pyfaidx.complement(middle)[::-1],
                                      pyfaidx.complement(alt)[::-1])
    else:
        context_5prime = left
        context_3prime = right
        context_mutation = "%s>%s" % (middle, alt)

    return (context_5prime, context_mutation, context_3prime)
Esempio n. 7
0
def test_comp_empty():
    """The complement of the empty string is the empty string."""
    result = complement('')
    assert result == ''
Esempio n. 8
0
def test_comp_valid():
    """Check a known complement prefix and that complementing twice round-trips."""
    expected_prefix = "AACTTCTAAAnCG"
    complemented = complement(comp_valid)
    assert complemented.startswith(expected_prefix)

    # Complementing the complement must reproduce the original sequence
    round_trip = complement(complement(comp_valid))
    assert round_trip == comp_valid
Esempio n. 9
0
def test_comp_invalid():
    """Call ``complement`` on the ``comp_invalid`` fixture.

    NOTE(review): nothing is asserted here, so as written this only fails if
    the call raises. If the call is expected to raise for invalid input, the
    expectation (e.g. a decorator) is not visible in this snippet - confirm.
    """
    complement(comp_invalid)