def get_seq(self, chrom, start, end, strand):
    """Return the sequence of ``chrom`` from ``start`` through ``end`` inclusive.

    The chromosome name is first passed through ``misc.match_chrom_format``
    together with the names present in ``self.fasta``, and the result is used
    as the lookup key.  When ``strand`` is ``"-"`` the slice is returned
    reverse-complemented; any other strand value returns it unchanged.
    """
    known_chroms = list(self.fasta.keys())
    chrom = misc.match_chrom_format(chrom, known_chroms)

    # ``end`` is inclusive, hence the +1 on the slice bound.
    region = self.fasta[chrom][start:end + 1]

    return misc.reverse_comp(region) if strand == "-" else region
def align(self, seq):
    """Align ``seq`` against every aligner in ``self.names_to_aligners``.

    For each aligner, both ``seq`` and its reverse complement are aligned and
    the higher-scoring result is kept; a tie goes to the reverse complement.
    One ``pysam.AlignedSegment`` is produced per aligner, with
    ``reference_id`` set to the aligner's position in iteration order and the
    alignment score stored in the ``AS`` tag.

    Returns:
        list of ``pysam.AlignedSegment``, in aligner iteration order.
    """
    rc_seq = reverse_comp(seq)
    segments = []

    for ref_index, ref_name in enumerate(self.names_to_aligners):
        aligner = self.names_to_aligners[ref_name]
        forward = aligner.align(seq)
        reverse = aligner.align(rc_seq)

        # Strict comparison: equal scores fall through to the reverse strand.
        forward_wins = forward.score > reverse.score
        if forward_wins:
            best, query = forward, seq
        else:
            best, query = reverse, rc_seq

        segment = pysam.AlignedSegment()
        segment.reference_id = ref_index
        segment.query_sequence = query
        segment.reference_start = best.ref_begin
        segment.set_tag("AS", best.score)
        segment.cigarstring = best.cigar_string
        if not forward_wins:
            segment.is_reverse = True

        segments.append(segment)

    return segments
def simulate_read_pair(sequence, start, length=150, isize=400, flip=False):
    """Build a synthetic read pair from ``sequence``.

    The first read is ``sequence[start:start + length]``.  The second read is
    the reverse complement of the ``length`` bases ending at ``start + isize``.
    When ``flip`` is true the two reads swap places in the returned pair.

    NOTE(review): ``read_stats`` is neither a parameter nor a local here, so
    it must resolve to a module-level name at call time -- confirm it exists.

    Returns:
        ReadPair wrapping the two reads as ``Alignment`` objects.
    """
    fragment_end = start + isize

    first = pysam.AlignedSegment()
    first.query_sequence = sequence[start:start + length]

    second = pysam.AlignedSegment()
    mate_bases = sequence[fragment_end - length:fragment_end]
    second.query_sequence = reverse_comp(mate_bases)

    if flip:
        first, second = second, first

    return ReadPair(Alignment(first), Alignment(second), read_stats)
def simple_dotplot(s1, s2, wordsize=8, scale=650):
    """Compute a binned k-mer dot-plot matrix comparing ``s1`` against ``s2``.

    Every k-mer of ``s1`` is indexed by start position; each k-mer of ``s2``
    (and its reverse complement) is then looked up in that index and the
    matching cells are incremented.  Positions on both axes are divided by a
    single bin size so the longer sequence spans roughly ``scale`` bins.

    Args:
        s1: sequence plotted along the columns.
        s2: sequence plotted along the rows.
        wordsize: k-mer length used for exact matching.
        scale: size, in bins, of the longer axis of the output matrix.

    Returns:
        2-D numpy array of shape (height, width).  Rows are flipped before
        returning, so the start of ``s2`` is in the last row.  An empty
        (0, 0) array is returned when the longer sequence is exactly
        ``wordsize`` long, since no k-mers are considered in that case.
    """
    # NOTE(review): len - wordsize start positions are used, so the final
    # k-mer of each sequence is not considered; kept as-is to preserve the
    # output dimensions callers already see.
    l1 = len(s1) - wordsize
    l2 = len(s2) - wordsize
    longest = max(l1, l2)

    if longest == 0:
        # Previously a ZeroDivisionError (l / 0); nothing to plot.
        return numpy.zeros((0, 0))

    width = int(numpy.ceil(l1 / longest * scale))
    height = int(numpy.ceil(l2 / longest * scale))
    mat = numpy.zeros((height, width))

    # One bin size is shared by both axes.  The original formula divides by
    # (width - 1), which is zero when s1 collapses into a single column, so
    # fall back to the row axis (or to one all-encompassing bin).
    if width > 1:
        binsize = l1 / (width - 1)
    elif height > 1:
        binsize = l2 / (height - 1)
    else:
        binsize = float(max(longest, 1))

    # Index the start positions of every k-mer in s1.
    kmer_to_pos1 = collections.defaultdict(list)
    for i in range(l1):
        kmer_to_pos1[s1[i:i + wordsize]].append(i)

    # Mark forward and reverse-complement k-mer matches from s2.
    for i in range(l2):
        word = s2[i:i + wordsize]
        row = int(i / binsize)
        for kmer in (word, misc.reverse_comp(word)):
            # .get() avoids inserting an empty list into the defaultdict for
            # every k-mer of s2 that is absent from s1.
            hits = kmer_to_pos1.get(kmer)
            if not hits:
                continue
            cols = (numpy.array(hits) / binsize).astype(int)
            # NOTE(review): with repeated column indices numpy applies this
            # += once per distinct cell, so several hits in the same bin
            # count as 1 for this k-mer.  numpy.add.at would accumulate them;
            # left unchanged to keep existing output values.
            mat[row, cols] += 1

    return mat[::-1]
def get_seq(self, chrom, start, end, strand):
    """Return bases ``start`` through ``end`` (inclusive) of contig ``chrom``.

    The contig is looked up in ``self.names_to_contigs``.  A ``strand`` of
    ``"-"`` yields the reverse complement of the slice; any other value
    returns the slice unchanged.
    """
    contig = self.names_to_contigs[chrom]

    # ``end`` is inclusive, hence the +1 on the slice bound.
    region = contig[start:end + 1]

    if strand != "-":
        return region
    return misc.reverse_comp(region)
def original_sequence(self):
    """Return the query sequence, reverse-complemented when ``is_reverse`` is set.

    When ``self.is_reverse`` is false, ``self.query_sequence`` is returned
    unchanged.
    """
    if not self.is_reverse:
        return self.query_sequence
    return misc.reverse_comp(self.query_sequence)