Example #1
0
def get_inferred_sequences(pairs, genome_dict, add_softclipped_bases=False):
    """Derive the reference sequence spanned by each read pair.

    For every ``(read1, read2)`` pair the slice
    ``genome_dict[read1.reference_name][read1.reference_start:read2.reference_end]``
    is taken and, when requested, flanked by the strict soft-clipped
    sequences reported by ``sctools`` for ``read1`` (left) and ``read2``
    (right).

    The query name of ``read1`` is split on ``'_'``:

    * If it has exactly three fields, the middle field is parsed as an
      integer context width. That many characters are trimmed from each
      end of the sequence and the reported coordinates are moved inwards
      by the same amount.
    * If the last field is ``'2'``, the sequence is passed through
      ``misc.revcomp``.

    Args:
        pairs: Iterable of ``(read1, read2)`` tuples. The reads must expose
            ``query_name``, ``reference_name``, ``reference_start`` and
            ``reference_end`` (presumably pysam aligned segments -- confirm
            against callers).
        genome_dict: Mapping from reference name to a sliceable sequence
            that supports ``len()``.
        add_softclipped_bases: When true, prepend/append the soft-clipped
            bases of the pair to the reference slice before trimming.

    Returns:
        A list of ``(name, length, contig_edge, sequence)`` tuples, one per
        pair, where ``name`` is ``"<reference>:<start>-<end>"`` and
        ``contig_edge`` is ``True`` when a soft-clip position falls before
        index 0 or at/after the end of the reference sequence.

    Raises:
        ValueError: If a three-field query name has a non-integer middle
            field.
    """
    inferred_sequences = []

    for read1, read2 in pairs:
        name_fields = read1.query_name.split('_')

        # Only a three-field name carries a context width; ``None`` marks
        # "no width field" so the coordinates are then reported untouched.
        context_width = int(name_fields[-2]) if len(name_fields) == 3 else None

        contig = read1.reference_name
        start = read1.reference_start
        end = read2.reference_end

        if context_width is None:
            name_start, name_end = start, end
        else:
            name_start, name_end = start + context_width, end - context_width
        name = contig + ':' + str(name_start) + '-' + str(name_end)

        inferred_sequence = genome_dict[contig][start:end]

        if add_softclipped_bases:
            inferred_sequence = (
                sctools.left_softclipped_sequence_strict(read1)
                + inferred_sequence
                + sctools.right_softclipped_sequence_strict(read2)
            )

        # ``seq[w:-w]`` with ``w == 0`` is ``seq[0:0]`` (empty), so trimming
        # is skipped when there is no context to remove.
        if context_width:
            inferred_sequence = inferred_sequence[context_width:-context_width]

        if name_fields[-1] == '2':
            inferred_sequence = misc.revcomp(inferred_sequence)

        # The contig length is only looked up when read2 is right-clipped,
        # exactly as in the chained ``and`` below.
        contig_edge = bool(
            (sctools.is_left_softclipped_strict(read1)
             and sctools.left_softclipped_position(read1) < 0)
            or (sctools.is_right_softclipped_strict(read2)
                and sctools.right_softclipped_position(read2)
                >= len(genome_dict[read2.reference_name]))
        )

        inferred_sequences.append(
            (name, len(inferred_sequence), contig_edge, inferred_sequence))

    return inferred_sequences
    def get_inferred_sequence(self, forward_read, reverse_read, is_reverse):
        """Build the inferred sequence spanned by a forward/reverse read pair.

        The reference slice between ``forward_read.reference_start`` and
        ``reverse_read.reference_end`` is joined into a string, flanked by
        the strict soft-clipped sequences reported by ``sctools``, and then
        trimmed by ``self.context_width`` characters at each end.

        Args:
            forward_read: Read supplying the contig name, the start
                coordinate and the left soft-clip.
            reverse_read: Read supplying the end coordinate and the right
                soft-clip.
            is_reverse: When true, the result is passed through
                ``misc.revcomp``.

        Returns:
            A ``(inferred_sequence, contig_edge)`` tuple. ``contig_edge`` is
            ``True`` when a soft-clip position falls before index 0 or
            at/after the end of ``self.genome_dict[contig]``.
        """
        contig = forward_read.reference_name
        start = forward_read.reference_start
        end = reverse_read.reference_end

        inferred_sequence = ''.join(self.genome_dict[contig][start:end])

        inferred_sequence = (
            sctools.left_softclipped_sequence_strict(forward_read)
            + inferred_sequence
            + sctools.right_softclipped_sequence_strict(reverse_read)
        )

        # ``seq[w:-w]`` with ``w == 0`` is ``seq[0:0]`` (empty), so a zero
        # context width must leave the sequence untouched.
        if self.context_width != 0:
            inferred_sequence = inferred_sequence[
                self.context_width:-self.context_width]

        if is_reverse:
            inferred_sequence = misc.revcomp(inferred_sequence)

        contig_edge = False
        if sctools.is_left_softclipped_strict(forward_read) and \
                sctools.left_softclipped_position(forward_read) < 0:
            contig_edge = True
        elif sctools.is_right_softclipped_strict(reverse_read) and \
                sctools.right_softclipped_position(reverse_read) >= len(self.genome_dict[contig]):
            contig_edge = True

        return inferred_sequence, contig_edge
def prefilter_reads(bam, database_dict, min_perc_identity,
                    max_internal_softclip_prop, max_edge_distance):
    """Drop unsuitable reads and group the survivors by pair and terminus.

    A read is discarded when ``pysamtools.get_perc_identity`` is below
    ``min_perc_identity`` or when the strand-specific checks below reject it.
    Surviving reads must have a query name of the form
    ``"<pair_id>_<terminus_id>"`` (exactly one underscore).

    Args:
        bam: Iterable of reads.
        database_dict: Mapping from reference name to a sequence supporting
            ``len()``.
        min_perc_identity: Minimum percent identity a read must reach.
        max_internal_softclip_prop: Largest tolerated soft-clip proportion
            for a clip that lies inside the reference.
        max_edge_distance: Largest tolerated distance between the alignment
            (or its soft-clip) and the relevant end of the reference.

    Returns:
        Nested ``defaultdict``:
        ``pair_id -> reference_name -> terminus_id -> [reads]``.

    Raises:
        ValueError: If a surviving read's query name does not split into
            exactly two fields on ``'_'``.
    """

    def _terminus_map():
        return defaultdict(list)

    def _contig_map():
        return defaultdict(_terminus_map)

    def _contig_length(aln):
        # Looked up lazily so the lookup happens only where the checks need it.
        return len(database_dict[aln.reference_name])

    def _forward_rejected(aln):
        # Right clip inside the reference that is proportionally too long.
        if (sctools.is_right_softclipped_strict(aln)
                and sctools.right_softclipped_position(aln) < _contig_length(aln)
                and sctools.right_softclip_proportion(aln) > max_internal_softclip_prop):
            return True
        # Alignment starts too far from the start of the reference.
        if aln.reference_start > max_edge_distance:
            return True
        # Left clip starts too far from the start of the reference.
        return bool(
            sctools.is_left_softclipped_strict(aln)
            and abs(sctools.left_softclip_reference_start(aln)) > max_edge_distance
        )

    def _reverse_rejected(aln):
        # Left clip inside the reference that is proportionally too long.
        if (sctools.is_left_softclipped_strict(aln)
                and sctools.left_softclipped_position(aln) >= 0
                and sctools.left_softclip_proportion(aln) > max_internal_softclip_prop):
            return True
        # Alignment ends too far from the end of the reference.
        if _contig_length(aln) - aln.reference_end > max_edge_distance:
            return True
        # Right clip ends too far from the end of the reference.
        return bool(
            sctools.is_right_softclipped_strict(aln)
            and abs(_contig_length(aln)
                    - sctools.right_softclip_reference_end(aln)) > max_edge_distance
        )

    grouped = defaultdict(_contig_map)

    for aln in bam:
        if pysamtools.get_perc_identity(aln) < min_perc_identity:
            continue

        rejected = _reverse_rejected if aln.is_reverse else _forward_rejected
        if rejected(aln):
            continue

        pair_id, terminus_id = aln.query_name.split('_')
        grouped[pair_id][aln.reference_name][terminus_id].append(aln)

    return grouped
Example #4
0
    def __prefilter_reads(self):
        """Replace ``self.bam`` with the list of reads that pass the filters.

        A read is dropped when ``pysamtools.get_perc_identity`` is below
        ``self.min_perc_identity``, or when it carries a strict soft-clip
        longer than one base whose position lies inside the reference:
        the left clip for forward reads, the right clip for reverse reads.
        """

        def should_drop(aln):
            if pysamtools.get_perc_identity(aln) < self.min_perc_identity:
                return True

            if aln.is_reverse:
                # Reverse strand: right clip that stays within the contig.
                return (
                    sctools.is_right_softclipped_strict(aln)
                    and sctools.get_right_softclip_length(aln) > 1
                    and sctools.right_softclipped_position(aln)
                    < len(self.genome_dict[aln.reference_name])
                )

            # Forward strand: left clip that stays within the contig.
            return (
                sctools.is_left_softclipped_strict(aln)
                and sctools.get_left_softclip_length(aln) > 1
                and sctools.left_softclipped_position(aln) >= 0
            )

        self.bam = [aln for aln in self.bam if not should_drop(aln)]