def variant_context(reference_fasta, contig, inclusive_start, inclusive_end,
                    alt, context_length):
    """
    Retrieve the surrounding reference region from a variant.

    SNVs are canonicalized so the reference base is a pyrimidine (C/T). For
    indels the reverse complement will still be taken if the first base of
    the reference is not a pyrimidine, but since the reference will also be
    reversed, that doesn't guarantee it will start with a pyrimidine.

    Parameters
    ----------
    reference_fasta : FastaReference
        reference sequence from pyfaidx package
    contig : str
        Chromosome of the variant
    inclusive_start : int
        start of the variant in 1-based inclusive coordinates
    inclusive_end : int
        end of the variant in 1-based inclusive coordinates
    alt : string
        alt sequence
    context_length : int
        number of bases on either side of the variant to return

    Returns
    ---------
    A tuple of (5', mutation, 3') where
        5' - bases immediately 5 prime to the mutation
        3' - bases immediately 3 prime to the mutation
        mutation - the ref sequence followed by a > character followed by
        the alt sequence

    When the variant lies closer than ``context_length`` bases to the start
    of the contig, the flank on that side is truncated to the bases that
    actually exist.
    """
    # Move from 1-based inclusive coordinates to 0-based half-open ones
    start = int(inclusive_start) - 1
    end = int(inclusive_end)

    full_sequence = reference_fasta[contig]

    # Clamp the left flank at the contig start. A negative slice start is
    # interpreted relative to the END of the sequence, which would return
    # unrelated bases (or nothing) instead of a truncated flank.
    left_start = max(start - context_length, 0)
    if left_start < start:
        left = str(full_sequence[left_start:start].seq).upper()
    else:
        # No bases available (or requested) on the left; avoid issuing a
        # zero-width slice to the FASTA backend.
        left = ""
    middle = str(full_sequence[start:end].seq).upper()
    right = str(full_sequence[end:end + context_length].seq).upper()

    # Complement and reverse the context if necessary so the ref base is a
    # pyrimidine (C/T)
    if middle[0] in ('A', 'G'):
        # On the opposite strand the flanks swap sides
        context_5prime = pyfaidx.complement(right)[::-1]
        context_3prime = pyfaidx.complement(left)[::-1]
        context_mutation = "%s>%s" % (
            pyfaidx.complement(middle)[::-1],
            pyfaidx.complement(alt)[::-1])
    else:
        context_5prime = left
        context_3prime = right
        context_mutation = "%s>%s" % (middle, alt)

    return (context_5prime, context_mutation, context_3prime)
def complement_seq(seq):
    """Return the result of ``pyfaidx.complement`` applied to ``seq``."""
    complemented = pyfaidx.complement(seq)
    return complemented
def calc_nuc_counts(pos_signal_bedtool, neg_signal_bedtool, fasta_filename,
                    revcomp_strand, min_counts, offset_min, offset_max,
                    region_size, ignore_chroms, only_chroms, verbose):
    ''' main routine for calculating nuc_counts

    Every row of the two signal bedtools is visited; rows of
    ``pos_signal_bedtool`` are treated as '+' strand and rows of
    ``neg_signal_bedtool`` as '-' strand. Each row must expose ``chrom``,
    ``start`` and ``count``. For each kept row and each offset in
    ``offset_min..offset_max`` (inclusive) a window of ``region_size``
    bases is read from the FASTA and its ``row.count`` is added to the
    tally for that offset.

    Rows are skipped when their chromosome is in ``ignore_chroms``, when
    ``only_chroms`` is non-empty and does not contain it, or when
    ``row.count`` is below ``min_counts``.

    Returns a tuple ``(total_sites, nuc_counts)`` where ``total_sites`` is
    the number of rows that passed the filters and ``nuc_counts`` maps
    offset -> Counter of sequence -> summed counts. Sequences whose length
    differs from ``region_size`` are removed before returning.
    '''

    if verbose:
        msg = ">> analyzing sequences ...\n"
        msg += ">> ignore:%s only:%s\n" % \
            (str(ignore_chroms), str(only_chroms))
        msg += ">> offset range: %d to %d\n" % (offset_min, offset_max)
        msg += ">> region size: %d\n" % (region_size)
        msg += ">> revcomp strand: %s\n" % str(revcomp_strand)
        # write() works on Python 2 and 3; the extra newline reproduces
        # what the former ``print >>sys.stderr`` statement appended
        sys.stderr.write(msg + "\n")

    seq_fasta = Fasta(fasta_filename, as_raw=True)

    nuc_counts = defaultdict(Counter)

    bedtools = (pos_signal_bedtool, neg_signal_bedtool)
    strands = ('+', '-')

    # total number of sites examined
    total_sites = 0

    # builtin zip is sufficient for two items and exists on Python 2 and 3
    for bedtool, strand in zip(bedtools, strands):

        for row in bedtool:

            # skip data based on specified chromosomes
            if row.chrom in ignore_chroms:
                continue
            if only_chroms and row.chrom not in only_chroms:
                continue

            # skip data if counts are too low
            if row.count < min_counts:
                continue

            # sites in bedgraph examined - must come after all checks
            # above
            total_sites += 1

            for offset in range(offset_min, offset_max + 1):

                # upstream offsets are negative values
                if strand == '+':
                    start = row.start + offset
                elif strand == '-':
                    start = row.start - offset

                if region_size == 1:
                    # half open at the position of interest
                    end = start + region_size
                else:
                    # make sure that the 3' most position in a region
                    # is the base of interest
                    if strand == '+':
                        end = start + 1  # include position with + 1
                        start = end - region_size
                    else:
                        # negative strand
                        end = start + region_size

                # A window starting before the chromosome cannot be
                # fetched: a negative slice start is interpreted relative
                # to the END of the sequence and would tally unrelated
                # bases, so such windows are skipped.
                if start < 0:
                    continue

                nucs = seq_fasta[row.chrom][start:end]

                # 1. libs where the captured strand is sequenced
                #    are the correct polarity as-is (i.e. Excision-seq
                #    libs)
                # 2. libs where the *copy* of the captured strand
                #    is sequenced should be revcomplemented (i.e.
                #    circularization-based libs)
                if (strand == '+' and revcomp_strand) or \
                        (strand == '-' and not revcomp_strand):
                    nucs = complement(nucs[::-1])

                nuc_counts[offset][nucs] += row.count

    # remove nucs that are not len region_size; the keys to drop are
    # collected first because a dict must not change size while it is
    # being iterated
    for counts in nuc_counts.values():
        wrong_length = [nuc for nuc in counts if len(nuc) != region_size]
        for nuc in wrong_length:
            counts.pop(nuc)

    return total_sites, nuc_counts
def calc_nuc_counts(pos_signal_bedtool, neg_signal_bedtool, fasta_filename,
                    revcomp_strand, min_counts, offset_min, offset_max,
                    region_size):
    ''' main routine for calculating nuc_counts

    Every row of the two signal bedtools is visited; rows of
    ``pos_signal_bedtool`` are treated as '+' strand and rows of
    ``neg_signal_bedtool`` as '-' strand. Each row must expose ``chrom``,
    ``start`` and ``count``. Rows whose ``count`` is below ``min_counts``
    are skipped. For each remaining row and each offset in
    ``offset_min..offset_max`` (inclusive) a window of ``region_size``
    bases is read from the FASTA and its ``row.count`` is added to the
    tally for that offset.

    This variant performs no chromosome filtering and does not drop
    sequences whose length differs from ``region_size``.

    Returns a tuple ``(total_sites, nuc_counts)`` where ``total_sites`` is
    the number of rows that passed the count filter and ``nuc_counts``
    maps offset -> Counter of sequence -> summed counts.
    '''

    seq_fasta = Fasta(fasta_filename, as_raw=True)

    nuc_counts = defaultdict(Counter)

    bedtools = (pos_signal_bedtool, neg_signal_bedtool)
    strands = ('+', '-')

    # total number of sites examined
    total_sites = 0

    for bedtool, strand in zip(bedtools, strands):

        for row in bedtool:

            # skip data if counts are too low
            if row.count < min_counts:
                continue

            # sites in bedgraph examined - must come after all checks
            # above
            total_sites += 1

            for offset in range(offset_min, offset_max + 1):

                # upstream offsets are negative values
                if strand == '+':
                    start = row.start + offset
                elif strand == '-':
                    start = row.start - offset

                if region_size == 1:
                    # half open at the position of interest
                    end = start + region_size
                else:
                    # make sure that the 3' most position in a region
                    # is the base of interest
                    if strand == '+':
                        end = start + 1  # include position with + 1
                        start = end - region_size
                    else:
                        # negative strand
                        end = start + region_size

                # A window starting before the chromosome cannot be
                # fetched: a negative slice start is interpreted relative
                # to the END of the sequence and would tally unrelated
                # bases, so such windows are skipped.
                if start < 0:
                    continue

                nucs = seq_fasta[row.chrom][start:end]

                # 1. libs where the captured strand is sequenced
                #    are the correct polarity as-is (i.e. Excision-seq
                #    libs)
                # 2. libs where the *copy* of the captured strand
                #    is sequenced should be revcomplemented (i.e.
                #    circularization-based libs)
                if (strand == '+' and revcomp_strand) or \
                        (strand == '-' and not revcomp_strand):
                    nucs = complement(nucs[::-1])

                nuc_counts[offset][nucs] += row.count

    return total_sites, nuc_counts
def extract(self, interval, variants, anchor, fixed_len=True,
            use_strand=None, **kwargs):
    """
    Return the sequence of `interval` with all `variants` applied.

    Args:
        interval: pybedtools.Interval Region of interest from
            which to query the sequence. 0-based
        variants: List[cyvcf2.Variant]: variants overlapping the
            `interval`. Can also be indels. 1-based
        anchor: absolute position w.r.t. the interval start (0-based).
            E.g. for an interval of `chr1:10-20` the anchor of 10 denotes
            the point chr1:10 in the 0-based coordinate system. Values
            outside the interval are clamped to its start/end.
        fixed_len: if True, the returned sequence will have the same
            length as the `interval` (e.g. `interval.end - interval.start`)
        use_strand (bool, optional): if True, the extracted sequence
            is reverse complemented in case interval.strand == "-".
            Overrides `self.use_strand`

    Returns:
        A single sequence (`str`) with all the variants applied.
    """
    def ref_start(pair):
        # each pair is keyed by the start of its first element
        return pair[0].start

    # Preprocessing: keep the anchor inside the interval
    anchor = max(min(anchor, interval.end), interval.start)
    pairs = self._variant_to_sequence(variants)

    # 1. Split variants overlapping with the anchor, and with the
    #    interval start/end when the length is not fixed
    pairs = self._split_overlapping(pairs, anchor)
    if not fixed_len:
        pairs = self._split_overlapping(
            pairs, interval.start, which='right')
        pairs = self._split_overlapping(
            pairs, interval.end, which='left')
    pairs = list(pairs)

    # 2. Partition the variants around the anchor; each side is ordered
    #    by increasing distance from the anchor
    upstream_variants = sorted(
        [pair for pair in pairs if ref_start(pair) >= anchor],
        key=ref_start)
    downstream_variants = sorted(
        [pair for pair in pairs if ref_start(pair) < anchor],
        key=ref_start, reverse=True)

    # 3. Extend start and end position for deletions
    if fixed_len:
        istart, iend = self._updated_interval(
            interval, upstream_variants, downstream_variants)
    else:
        istart, iend = interval.start, interval.end

    # 4. Walk outwards from the anchor, registering the reference
    #    intervals and the variant intervals in the builders
    down_builder = self._downstream_builder(
        downstream_variants, interval, anchor, istart)
    up_builder = self._upstream_builder(
        upstream_variants, interval, anchor, iend)

    # 5. Fetch the sequence and restore intervals in the builders
    fetched = self._fetch(interval, istart, iend)
    up_builder.restore(fetched)
    down_builder.restore(fetched)

    # 6. Concatenate each side, trim to the fixed length if requested,
    #    and join the two sides
    down_part = down_builder.concat()
    up_part = up_builder.concat()
    if fixed_len:
        down_part, up_part = self._cut_to_fix_len(
            down_part, up_part, interval, anchor)
    result = down_part + up_part

    if use_strand is None:
        use_strand = self.use_strand
    if use_strand and interval.strand == '-':
        # reverse-complement
        return complement(result)[::-1]
    return result
def variant_context(reference_fasta, contig, inclusive_start, inclusive_end,
                    alt, context_length):
    """
    Retrieve the surrounding reference region from a variant.

    SNVs are canonicalized so the reference base is a pyrimidine (C/T). For
    indels the reverse complement will still be taken if the first base of
    the reference is not a pyrimidine, but since the reference will also be
    reversed, that doesn't guarantee it will start with a pyrimidine.

    Parameters
    ----------
    reference_fasta : FastaReference
        reference sequence from pyfaidx package
    contig : str
        Chromosome of the variant
    inclusive_start : int
        start of the variant in 1-based inclusive coordinates
    inclusive_end : int
        end of the variant in 1-based inclusive coordinates
    alt : string
        alt sequence
    context_length : int
        number of bases on either side of the variant to return

    Returns
    ---------
    A tuple of (5', mutation, 3') where
        5' - bases immediately 5 prime to the mutation
        3' - bases immediately 3 prime to the mutation
        mutation - the ref sequence followed by a > character followed by
        the alt sequence

    When the variant lies closer than ``context_length`` bases to the start
    of the contig, the flank on that side is truncated to the bases that
    actually exist.
    """
    # Move from 1-based inclusive coordinates to 0-based half-open ones
    start = int(inclusive_start) - 1
    end = int(inclusive_end)

    full_sequence = reference_fasta[contig]

    # Clamp the left flank at the contig start. A negative slice start is
    # interpreted relative to the END of the sequence, which would return
    # unrelated bases (or nothing) instead of a truncated flank.
    left_start = max(start - context_length, 0)
    if left_start < start:
        left = str(full_sequence[left_start:start].seq).upper()
    else:
        # No bases available (or requested) on the left; avoid issuing a
        # zero-width slice to the FASTA backend.
        left = ""
    middle = str(full_sequence[start:end].seq).upper()
    right = str(full_sequence[end:end + context_length].seq).upper()

    # Complement and reverse the context if necessary so the ref base is a
    # pyrimidine (C/T)
    if middle[0] in ('A', 'G'):
        # On the opposite strand the flanks swap sides
        context_5prime = pyfaidx.complement(right)[::-1]
        context_3prime = pyfaidx.complement(left)[::-1]
        context_mutation = "%s>%s" % (
            pyfaidx.complement(middle)[::-1],
            pyfaidx.complement(alt)[::-1])
    else:
        context_5prime = left
        context_3prime = right
        context_mutation = "%s>%s" % (middle, alt)

    return (context_5prime, context_mutation, context_3prime)
def test_comp_empty():
    """Complementing an empty string must give back an empty string."""
    result = complement('')
    assert result == ''
def test_comp_valid():
    """Check the complement of ``comp_valid`` and that it round-trips.

    The complement must start with the expected prefix, and complementing
    twice must reproduce the original sequence.
    """
    expected_prefix = "AACTTCTAAAnCG"
    complemented = complement(comp_valid)
    assert complemented.startswith(expected_prefix)

    round_trip = complement(complement(comp_valid))
    assert round_trip == comp_valid
def test_comp_invalid():
    """Call ``complement`` on the ``comp_invalid`` fixture.

    NOTE(review): no assertion is made on the outcome here. Presumably an
    expected-exception check (decorator or context manager) belongs with
    this test -- confirm against the test suite.
    """
    _ = complement(comp_invalid)