def get_inferred_sequence(self, forward_read, reverse_read, is_reverse):
    """Build the sequence implied by an aligned forward/reverse read pair.

    The reference between ``forward_read.reference_start`` and
    ``reverse_read.reference_end`` is read from ``self.genome_dict``, the
    strict soft-clipped bases reported by ``sctools`` are attached to both
    ends, and ``self.context_width`` bases are then removed from each end.

    Args:
        forward_read: Read providing the contig name, the start coordinate
            and the left soft-clipped bases.
        reverse_read: Read providing the end coordinate and the right
            soft-clipped bases.
        is_reverse: When truthy, the result is passed through
            ``misc.revcomp`` before being returned.

    Returns:
        An ``(inferred_sequence, contig_edge)`` tuple. ``contig_edge`` is
        ``True`` when the forward read's left soft-clip position is below
        zero, or the reverse read's right soft-clip position is at or
        beyond the length of the contig; otherwise it is ``False``.
    """
    contig = forward_read.reference_name
    start = forward_read.reference_start
    end = reverse_read.reference_end
    contig_sequence = self.genome_dict[contig]

    inferred_sequence = (
        sctools.left_softclipped_sequence_strict(forward_read)
        + ''.join(contig_sequence[start:end])
        + sctools.right_softclipped_sequence_strict(reverse_read)
    )

    # A width of 0 must leave the sequence untouched: seq[0:-0] is seq[0:0],
    # which would silently discard the entire sequence.
    width = self.context_width
    if width != 0:
        inferred_sequence = inferred_sequence[width:-width]

    if is_reverse:
        inferred_sequence = misc.revcomp(inferred_sequence)

    # The right-hand test is only evaluated when the left-hand one fails,
    # matching an if/elif chain.
    clipped_past_start = (
        sctools.is_left_softclipped_strict(forward_read)
        and sctools.left_softclipped_position(forward_read) < 0
    )
    contig_edge = bool(
        clipped_past_start
        or (
            sctools.is_right_softclipped_strict(reverse_read)
            and sctools.right_softclipped_position(reverse_read) >= len(contig_sequence)
        )
    )

    return inferred_sequence, contig_edge
def get_inferred_sequences(pairs, genome_dict, add_softclipped_bases=False):
    """Infer one sequence per read pair from the reference they align to.

    For every ``(read1, read2)`` pair the reference slice from
    ``read1.reference_start`` to ``read2.reference_end`` is taken from
    ``genome_dict``. When ``read1.query_name`` contains exactly two
    underscores, its middle field is parsed as an integer context width;
    that many bases are trimmed from both ends and the reported coordinates
    are moved inwards by the same amount. Other names use a width of 0.

    Args:
        pairs: Iterable of ``(read1, read2)`` tuples.
        genome_dict: Mapping from reference name to a sliceable sequence.
        add_softclipped_bases: When true, the strict soft-clipped bases on
            the left of ``read1`` and on the right of ``read2`` are attached
            to the reference slice before trimming.

    Returns:
        A list of ``(name, length, contig_edge, sequence)`` tuples, where
        ``name`` is ``"<contig>:<start>-<end>"`` and ``contig_edge`` is
        ``True`` when a soft clip extends past either end of the contig.
    """
    inferred_sequences = []

    for read1, read2 in pairs:
        query_name = read1.query_name
        contig = read1.reference_name

        # Names with exactly two underscores carry the context width in the
        # middle field; every other name implies no trimming.
        if query_name.count('_') == 2:
            context_width = int(query_name.split('_')[-2])
        else:
            context_width = 0

        name = (
            contig + ':'
            + str(read1.reference_start + context_width) + '-'
            + str(read2.reference_end - context_width)
        )

        inferred_sequence = genome_dict[contig][read1.reference_start:read2.reference_end]
        if add_softclipped_bases:
            inferred_sequence = (
                sctools.left_softclipped_sequence_strict(read1)
                + inferred_sequence
                + sctools.right_softclipped_sequence_strict(read2)
            )

        # Skip trimming for a zero width: seq[0:-0] is seq[0:0], which would
        # silently empty the sequence.
        if context_width != 0:
            inferred_sequence = inferred_sequence[context_width:-context_width]

        # A trailing field of '2' in the read name requests the reverse
        # complement.
        if query_name.split('_')[-1] == '2':
            inferred_sequence = misc.revcomp(inferred_sequence)

        # The right-hand test is only evaluated when the left-hand one fails.
        contig_edge = bool(
            (
                sctools.is_left_softclipped_strict(read1)
                and sctools.left_softclipped_position(read1) < 0
            )
            or (
                sctools.is_right_softclipped_strict(read2)
                and sctools.right_softclipped_position(read2) >= len(genome_dict[read2.reference_name])
            )
        )

        inferred_sequences.append((name, len(inferred_sequence), contig_edge, inferred_sequence))

    return inferred_sequences
def prefilter_reads(bam, database_dict, min_perc_identity, max_internal_softclip_prop, max_edge_distance):
    """Discard unsuitable reads and index the remainder by pair, contig and flank.

    A read is dropped when its percent identity is below
    ``min_perc_identity``. Forward reads are further dropped when they have
    a right soft clip that ends inside the contig with a proportion above
    ``max_internal_softclip_prop``, when they start more than
    ``max_edge_distance`` from position 0, or when their left soft clip
    starts further than that from position 0. Reverse reads get the
    mirrored checks against the end of the contig.

    Args:
        bam: Iterable of aligned reads.
        database_dict: Mapping from reference name to a sized sequence.
        min_perc_identity: Lowest percent identity that is still kept.
        max_internal_softclip_prop: Largest tolerated soft-clip proportion
            for a clip lying inside the contig.
        max_edge_distance: Largest tolerated distance from the contig edge.

    Returns:
        Nested ``defaultdict`` such that
        ``result[pair_id][reference_name][flank_id]`` is a list of reads.
        ``pair_id`` and ``flank_id`` are the two ``_``-separated fields of
        the read name.

    Raises:
        ValueError: If a kept read's name does not split into exactly two
            fields on ``_``.
    """

    def _flank_level():
        return defaultdict(list)

    def _contig_level():
        return defaultdict(_flank_level)

    def _forward_rejected(read):
        # Evaluated left to right with short-circuiting, like an if/elif chain.
        return (
            (
                sctools.is_right_softclipped_strict(read)
                and sctools.right_softclipped_position(read) < len(database_dict[read.reference_name])
                and sctools.right_softclip_proportion(read) > max_internal_softclip_prop
            )
            or read.reference_start > max_edge_distance
            or (
                sctools.is_left_softclipped_strict(read)
                and abs(sctools.left_softclip_reference_start(read)) > max_edge_distance
            )
        )

    def _reverse_rejected(read):
        if (
            sctools.is_left_softclipped_strict(read)
            and sctools.left_softclipped_position(read) >= 0
            and sctools.left_softclip_proportion(read) > max_internal_softclip_prop
        ):
            return True

        contig_length = len(database_dict[read.reference_name])
        if contig_length - read.reference_end > max_edge_distance:
            return True

        return (
            sctools.is_right_softclipped_strict(read)
            and abs(contig_length - sctools.right_softclip_reference_end(read)) > max_edge_distance
        )

    kept = defaultdict(_contig_level)

    for read in bam:
        if pysamtools.get_perc_identity(read) < min_perc_identity:
            continue

        if read.is_reverse:
            rejected = _reverse_rejected(read)
        else:
            rejected = _forward_rejected(read)
        if rejected:
            continue

        pair_id, flank_id = read.query_name.split('_')
        kept[pair_id][read.reference_name][flank_id].append(read)

    return kept
def filter_pairs_max_internal_softclip_prop(self, max_internal_softclip_prop):
    """Drop pairs from ``self.pairs`` whose soft clipping is unacceptable.

    A pair is removed when any of the following holds:

    * the forward read has a left soft clip longer than 1 and the reverse
      read has a right soft clip longer than 1;
    * the forward read is right soft-clipped, ends before the reverse read
      ends, and its right soft-clip proportion exceeds the limit;
    * the reverse read is left soft-clipped, starts after the forward read
      starts, and its left soft-clip proportion exceeds the limit.

    Args:
        max_internal_softclip_prop: Largest tolerated soft-clip proportion
            for the second and third rules.

    Returns:
        None. ``self.pairs`` is replaced by a new list of surviving pairs.
    """

    def _should_discard(pair):
        fwd = pair.forward_read
        rev = pair.reverse_read

        clipped_on_both_outer_ends = (
            sctools.is_left_softclipped_strict(fwd)
            and sctools.get_left_softclip_length(fwd) > 1
            and sctools.is_right_softclipped_strict(rev)
            and sctools.get_right_softclip_length(rev) > 1
        )
        if clipped_on_both_outer_ends:
            return True

        forward_clipped_inside = (
            sctools.is_right_softclipped_strict(fwd)
            and fwd.reference_end < rev.reference_end
            and sctools.right_softclip_proportion(fwd) > max_internal_softclip_prop
        )
        if forward_clipped_inside:
            return True

        reverse_clipped_inside = (
            sctools.is_left_softclipped_strict(rev)
            and rev.reference_start > fwd.reference_start
            and sctools.left_softclip_proportion(rev) > max_internal_softclip_prop
        )
        return bool(reverse_clipped_inside)

    self.pairs = [pair for pair in self.pairs if not _should_discard(pair)]
def __prefilter_reads(self):
    """Replace ``self.bam`` with the list of reads that pass the pre-filter.

    A read is dropped when its percent identity is below
    ``self.min_perc_identity``. A forward read is also dropped when it has
    a left soft clip longer than 1 whose position is not below zero. A
    reverse read is also dropped when it has a right soft clip longer than
    1 whose position is below the length of its contig in
    ``self.genome_dict``.

    Returns:
        None. ``self.bam`` is rebound to a new list of the kept reads.
    """
    retained = []

    for read in self.bam:
        if pysamtools.get_perc_identity(read) < self.min_perc_identity:
            continue

        if read.is_reverse:
            # Reverse strand: reject a right clip that ends inside the contig.
            clipped_inside_contig = (
                sctools.is_right_softclipped_strict(read)
                and sctools.get_right_softclip_length(read) > 1
                and sctools.right_softclipped_position(read) < len(self.genome_dict[read.reference_name])
            )
        else:
            # Forward strand: reject a left clip that starts inside the contig.
            clipped_inside_contig = (
                sctools.is_left_softclipped_strict(read)
                and sctools.get_left_softclip_length(read) > 1
                and sctools.left_softclipped_position(read) >= 0
            )

        if not clipped_inside_contig:
            retained.append(read)

    self.bam = retained