Example #1
    def extract_boundary_sequences(self):
        read_pairs = self.get_read_pairs()
        trimmed_read_pairs = self.trim_barcodes(read_pairs)

        total_reads = 0
        well_formed = 0
        long_enough = 0
    
        counters = {'positions': {orientation: Counter() for orientation in orientations},
                    'control_ids': Counter(),
                    'polyA_lengths': Counter(),
                    'left_ids': Counter(),
                    'right_ids': Counter(),
                    'joint_lengths': Counter(),
                   }

        with open(self.file_names['five_prime_boundaries'], 'w') as fives_fh, \
             open(self.file_names['three_prime_boundaries'], 'w') as threes_fh:

            for R1, R2 in trimmed_read_pairs:
                total_reads += 1
                five_payload_read, three_payload_read = TIF_seq_structure.find_boundary_sequences(R1, R2, counters)
                if five_payload_read and three_payload_read:
                    well_formed += 1
                    if len(five_payload_read.seq) >= self.min_payload_length and \
                       len(three_payload_read.seq) >= self.min_payload_length:
                        long_enough += 1
                        fives_fh.write(fastq.make_record(*five_payload_read))
                        threes_fh.write(fastq.make_record(*three_payload_read))

        # Pop these counts off of counters so that whatever is left at the
        # end can be written directly to the id_counts file.
        position_counts = counters.pop('positions')
        for orientation in orientations:
            key = '{0}_positions'.format(orientation)
            array = counts_to_array(position_counts[orientation])
            self.write_file(key, array)

        polyA_lengths = counts_to_array(counters.pop('polyA_lengths'))
        self.write_file('polyA_lengths', polyA_lengths)

        joint_lengths = counts_to_array(counters.pop('joint_lengths'), dim=2)
        self.write_file('joint_lengths', joint_lengths)
        
        self.write_file('id_counts', counters)

        self.summary.extend(
            [('Total read pairs', total_reads),
             ('Well-formed', well_formed),
             ('Long enough', long_enough),
            ],
        )
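
Example #1 writes each surviving read with fastq.make_record(*read), but the fastq module itself isn't shown. A minimal sketch of what Read and make_record presumably look like, assuming the standard four-line FASTQ layout:

from collections import namedtuple

# Presumed structure of the fastq helpers: a Read is just (name, seq, qual),
# which is why make_record(*five_payload_read) unpacks cleanly.
Read = namedtuple('Read', ['name', 'seq', 'qual'])

def make_record(name, seq, qual):
    # Standard four-line FASTQ record.
    return '@{0}\n{1}\n+\n{2}\n'.format(name, seq, qual)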
Example #2
    def zero_padded_array(self, counts):
        array = utilities.counts_to_array(counts)
        if len(array) < self.max_read_length + 1:
            padded_array = np.zeros(self.max_read_length + 1, int)
            padded_array[:len(array)] += array
        else:
            padded_array = array
        return padded_array
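
As a quick illustration of the padding step in zero_padded_array, a histogram that only saw read lengths 0-2 gets extended with zero bins out to max_read_length + 1 entries (the value 5 below is hypothetical):

import numpy as np

array = np.array([3, 0, 7])          # counts for observed lengths 0-2
max_read_length = 5                  # hypothetical value

padded_array = np.zeros(max_read_length + 1, int)
padded_array[:len(array)] += array   # -> array([3, 0, 7, 0, 0, 0])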
    def filter_mappings(self):
        num_unmapped = 0
        num_entirely_genomic = 0
        num_nonunique = 0
        num_unique = 0

        nongenomic_lengths = Counter()

        sam_file = pysam.Samfile(self.file_names['accepted_hits'])
    
        region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                      load_references=True,
                                                      sam_file=sam_file,
                                                     )

        extended_sorter = sam.AlignmentSorter(sam_file.references,
                                              sam_file.lengths,
                                              self.file_names['extended'],
                                             )
        filtered_sorter = sam.AlignmentSorter(sam_file.references,
                                              sam_file.lengths,
                                              self.file_names['extended_filtered'],
                                             )

        extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher) for mapping in sam_file)
        mapping_groups = utilities.group_by(extended_mappings, lambda m: m.qname)

        with extended_sorter, filtered_sorter:
            for qname, group in mapping_groups:
                for m in group:
                    extended_sorter.write(m)

                min_nongenomic_length = min(trim.get_nongenomic_length(m) for m in group)
                nongenomic_lengths[min_nongenomic_length] += 1
                if min_nongenomic_length == 0:
                    num_entirely_genomic += 1
                    continue
                
                nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
                if nonunique:
                    num_nonunique += 1
                    continue
                
                num_unique += 1
                
                for m in group:
                    filtered_sorter.write(m)

        self.summary.extend(
            [('Mapped with no non-genomic A\'s', num_entirely_genomic),
             ('Nonunique', num_nonunique),
             ('Unique', num_unique),
            ],
        )

        nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
        self.write_file('nongenomic_lengths', nongenomic_lengths)
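
utilities.group_by isn't shown, but from how it is used above (each group is passed to len and iterated more than once), it presumably wraps itertools.groupby and materializes each group, roughly:

from itertools import groupby

def group_by(iterable, key_function):
    # Presumed behavior: like itertools.groupby, but with each group
    # realized as a list so callers can take len(group) and loop over it
    # repeatedly. Only adjacent items with equal keys are grouped, so the
    # input must already be sorted by key (e.g. name-sorted mappings).
    for key, group in groupby(iterable, key_function):
        yield key, list(group)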
Example #4
def extract_fragment_lengths(bam_file_name):
    def concordantly_mapped(mapping):
        return not mapping.mate_is_unmapped and \
               not mapping.is_unmapped and \
               mapping.rnext == mapping.tid and \
               abs(mapping.tlen) < 10000

    bam_fh = pysam.Samfile(bam_file_name)
    TLENs = (abs(mapping.tlen) for mapping in bam_fh
             if mapping.is_read1 and concordantly_mapped(mapping))
    fragment_lengths = Counter(TLENs)

    # Note that counts_to_array implicitly discards negative key values.
    fragment_lengths = utilities.counts_to_array(fragment_lengths)

    return fragment_lengths
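
Given the comment above about negative keys, utilities.counts_to_array presumably builds a dense array indexed by key; this sketch is an assumption, with the dim=2 branch matching the joint_lengths call in Example #1:

import numpy as np

def counts_to_array(counts, dim=1):
    # Presumed behavior: a dense integer array whose index is the key.
    # Negative keys can't serve as sizes or indices here, so they are
    # implicitly discarded, as noted above.
    if dim == 1:
        keys = [key for key in counts if key >= 0]
    else:
        keys = [key for key in counts if min(key) >= 0]
    if not keys:
        return np.zeros((1,) * dim, int)
    if dim == 1:
        array = np.zeros(max(keys) + 1, int)
    else:
        shape = tuple(max(key[d] for key in keys) + 1 for d in range(dim))
        array = np.zeros(shape, int)
    for key in keys:
        array[key] = counts[key]
    return array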
Example #5
    def collapse_fragments(self):
        get_position = annotation.make_convertor(self.MappingAnnotation,
                                                 self.PositionAnnotation,
                                                )
        get_fragment = annotation.make_convertor(self.MappingAnnotation,
                                                 self.FragmentAnnotation,
                                                )
        amplification_counts = Counter()

        sq_lines = sam.get_sq_lines(self.merged_file_names['sorted_clean_sam'])
        sam_lines = self.get_sorted_sam_lines()

        with open(self.file_names['collapsed_sam'], 'w') as collapsed_fh, \
             open(self.file_names['collisions'], 'w') as collisions_fh:
            
            for sq_line in sq_lines:
                collapsed_fh.write(sq_line)

            position_groups = utilities.group_by(sam_lines, get_position)
            for position_annotation, position_lines in position_groups:
                fragment_counts = Counter()
                position_count = len(position_lines)
                fragment_groups = utilities.group_by(position_lines, get_fragment)
                for fragment_annotation, fragment_lines in fragment_groups:
                    fragment_count = len(fragment_lines)
                    fragment_counts[fragment_count] += 1
                    amplification_counts['{},{}'.format(position_count, fragment_count)] += 1
                    collapsed_annotation = self.CollapsedAnnotation(count=fragment_count, **fragment_annotation)
                    new_line = sam.splice_in_name(fragment_lines[0], collapsed_annotation.identifier)
                    collapsed_fh.write(new_line)
                fragment_counts = utilities.counts_to_array(fragment_counts)
                if position_count > 100:
                    collisions_fh.write(position_annotation.identifier + '\n')
                    collisions_fh.write(','.join(map(str, fragment_counts)) + '\n')

        sam.make_sorted_bam(self.file_names['collapsed_sam'],
                            self.file_names['collapsed_bam'],
                           )

        self.write_file('amplification_counts', amplification_counts)
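
sam.splice_in_name isn't shown; since QNAME is the first tab-separated field of a SAM line, it presumably just rewrites that field on the representative line, along these lines:

def splice_in_name(sam_line, new_name):
    # Presumed behavior: replace the QNAME field (the first tab-separated
    # field of a SAM line), here with an identifier that encodes the
    # duplicate count for the collapsed fragment group.
    fields = sam_line.split('\t')
    fields[0] = new_name
    return '\t'.join(fields)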
    def trim_reads(self, read_pairs):
        total_reads = 0
        long_enough_reads = 0
        trimmed_lengths = Counter()
        barcodes = Counter()
        
        for R1, R2 in read_pairs:
            total_reads += 1
            barcodes[R2.seq[:len(self.barcode)]] += 1

            # R2 isn't expected to contain adapter sequence because it
            # would have to get through the poly(A) tail first.
            position = adapters.find_adapter(self.adapter_in_R1, 3, R1.seq)
            trimmed_lengths[position] += 1
            if position < 12:
                continue
            long_enough_reads += 1

            R1_slice = slice(None, position)
            # position points to where the barcode starts in R1. The length
            # of the trimmed R2 read should be equal to position.
            R2_slice = slice(len(self.barcode), len(self.barcode) + position)

            processed_R1 = fastq.Read(R1.name, R1.seq[R1_slice], R1.qual[R1_slice])
            processed_R2 = fastq.Read(R2.name, R2.seq[R2_slice], R2.qual[R2_slice])
            
            yield processed_R1, processed_R2

        trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
        self.write_file('trimmed_lengths', trimmed_lengths)
        self.write_file('barcodes', barcodes)
        self.summary.extend(
            [('Total read pairs', total_reads),
             ('Long enough', long_enough_reads),
            ]
        )
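
Note that trim_reads is a generator: the trailing write_file and summary calls only run once a consumer exhausts it. A hypothetical driver (the 'trimmed_R1' and 'trimmed_R2' file name keys are invented for illustration):

    def write_trimmed_reads(self):
        # Hypothetical consumer; exhausting the generator is what triggers
        # the bookkeeping at the end of trim_reads.
        read_pairs = self.get_read_pairs()
        with open(self.file_names['trimmed_R1'], 'w') as R1_fh, \
             open(self.file_names['trimmed_R2'], 'w') as R2_fh:
            for R1, R2 in self.trim_reads(read_pairs):
                R1_fh.write(fastq.make_record(*R1))
                R2_fh.write(fastq.make_record(*R2))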
Example #7
    def filter_mappings(self):
        num_unmapped = 0
        num_entirely_genomic = 0
        num_nonunique = 0
        num_unique = 0

        nongenomic_lengths = Counter()

        sam_file = pysam.Samfile(self.file_names['accepted_hits'])

        region_fetcher = genomes.build_region_fetcher(
            self.file_names['genome'],
            load_references=True,
            sam_file=sam_file,
        )

        extended_sorter = sam.AlignmentSorter(
            sam_file.references,
            sam_file.lengths,
            self.file_names['extended'],
        )
        filtered_sorter = sam.AlignmentSorter(
            sam_file.references,
            sam_file.lengths,
            self.file_names['extended_filtered'],
        )

        extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher)
                             for mapping in sam_file)
        mapping_groups = utilities.group_by(extended_mappings,
                                            lambda m: m.qname)

        with extended_sorter, filtered_sorter:
            for qname, group in mapping_groups:
                for m in group:
                    extended_sorter.write(m)

                min_nongenomic_length = min(
                    trim.get_nongenomic_length(m) for m in group)
                nongenomic_lengths[min_nongenomic_length] += 1
                if min_nongenomic_length == 0:
                    num_entirely_genomic += 1
                    continue

                nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
                if nonunique:
                    num_nonunique += 1
                    continue

                num_unique += 1

                for m in group:
                    filtered_sorter.write(m)

        self.summary.extend([
            ('Mapped with no non-genomic A\'s', num_entirely_genomic),
            ('Nonunique', num_nonunique),
            ('Unique', num_unique),
        ])

        nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
        self.write_file('nongenomic_lengths', nongenomic_lengths)
    def trim_reads(self, read_pairs):
        total_reads = 0
        long_enough_reads = 0
        trimmed_lengths = Counter()
        barcodes = Counter()

        truncated_in_R1 = self.adapter_in_R1[1:]
        truncated_in_R2 = self.adapter_in_R2[1:]

        for R1, R2 in read_pairs:
            total_reads += 1
            barcodes[R2.seq[:len(self.barcode)]] += 1

            # Check for the case in which the expected overhang base is
            # missing in primer dimers.
            R1_dimer_distance = adapters.adapter_hamming_distance(
                R1.seq,
                truncated_in_R1,
                len(R1.seq),
                len(truncated_in_R1),
                len(self.barcode),
            )
            R2_dimer_distance = adapters.adapter_hamming_distance(
                R2.seq,
                truncated_in_R2,
                len(R2.seq),
                len(truncated_in_R2),
                len(self.barcode),
            )
            if R1_dimer_distance <= 3 and R2_dimer_distance <= 3:
                position = len(self.barcode)
            else:
                position = adapters.consistent_paired_position(
                    R1.seq,
                    R2.seq,
                    self.adapter_in_R1,
                    self.adapter_in_R2,
                    19,
                    3,
                )
            if position is not None:
                trimmed_lengths[position] += 1
                if position - len(self.barcode) < 12:
                    continue
            else:
                position = len(R1.seq)

            long_enough_reads += 1

            payload_slice = slice(len(self.barcode), position)

            processed_R1 = fastq.Read(R1.name, R1.seq[payload_slice],
                                      R1.qual[payload_slice])
            processed_R2 = fastq.Read(R2.name, R2.seq[payload_slice],
                                      R2.qual[payload_slice])

            yield processed_R1, processed_R2

        trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
        self.write_file('trimmed_lengths', trimmed_lengths)
        self.write_file('barcodes', barcodes)
        self.summary.extend([
            ('Total read pairs', total_reads),
            ('Long enough', long_enough_reads),
        ])
    def trim_reads(self, read_pairs):
        total_reads = 0
        long_enough_reads = 0
        trimmed_lengths = Counter()
        barcodes = Counter()

        truncated_in_R1 = self.adapter_in_R1[1:]
        truncated_in_R2 = self.adapter_in_R2[1:]
        
        for R1, R2 in read_pairs:
            total_reads += 1
            barcodes[R2.seq[:len(self.barcode)]] += 1

            # Check for the case in which the expected overhang base is
            # missing in primer dimers.
            R1_dimer_distance = adapters.adapter_hamming_distance(R1.seq,
                                                                  truncated_in_R1,
                                                                  len(R1.seq),
                                                                  len(truncated_in_R1),
                                                                  len(self.barcode),
                                                                 )
            R2_dimer_distance = adapters.adapter_hamming_distance(R2.seq,
                                                                  truncated_in_R2,
                                                                  len(R2.seq),
                                                                  len(truncated_in_R2),
                                                                  len(self.barcode),
                                                                 )
            if R1_dimer_distance <= 3 and R2_dimer_distance <= 3:
                position = len(self.barcode)
            else:
                position = adapters.consistent_paired_position(R1.seq,
                                                               R2.seq,
                                                               self.adapter_in_R1,
                                                               self.adapter_in_R2,
                                                               19,
                                                               3,
                                                              )
            if position is not None:
                trimmed_lengths[position] += 1
                if position - len(self.barcode) < 12:
                    continue
            else:
                position = len(R1.seq)

            long_enough_reads += 1

            payload_slice = slice(len(self.barcode), position)

            processed_R1 = fastq.Read(R1.name, R1.seq[payload_slice], R1.qual[payload_slice])
            processed_R2 = fastq.Read(R2.name, R2.seq[payload_slice], R2.qual[payload_slice])
            
            yield processed_R1, processed_R2

        trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
        self.write_file('trimmed_lengths', trimmed_lengths)
        self.write_file('barcodes', barcodes)
        self.summary.extend(
            [('Total read pairs', total_reads),
             ('Long enough', long_enough_reads),
            ]
        )
    def combine_mappings(self):
        num_unmapped = 0
        num_R1_unmapped = 0
        num_R2_unmapped = 0
        num_nonunique = 0
        num_discordant = 0
        num_disoriented = 0
        num_concordant = 0

        tlens = Counter()

        R1_mappings = pysam.Samfile(self.file_names['R1_accepted_hits'])
        R1_unmapped = pysam.Samfile(self.file_names['R1_unmapped'])
        all_R1 = sam.merge_by_name(R1_mappings, R1_unmapped)
        R1_grouped = utilities.group_by(all_R1, lambda m: m.qname)

        R2_mappings = pysam.Samfile(self.file_names['R2_accepted_hits'])
        R2_unmapped = pysam.Samfile(self.file_names['R2_unmapped'])
        all_R2 = sam.merge_by_name(R2_mappings, R2_unmapped)
        R2_grouped = utilities.group_by(all_R2, lambda m: m.qname)

        group_pairs = izip(R1_grouped, R2_grouped)

        alignment_sorter = sam.AlignmentSorter(R1_mappings.references,
                                               R1_mappings.lengths,
                                               self.file_names['combined'],
                                              )

        with alignment_sorter:
            for (R1_qname, R1_group), (R2_qname, R2_group) in group_pairs:
                if fastq.get_pair_name(R1_qname) != fastq.get_pair_name(R2_qname):
                    # Ensure that the iteration through the two streams stays in sync.
                    raise ValueError('R1 and R2 groups are out of sync: {0} {1}'.format(R1_qname, R2_qname))
                
                R1_unmapped = any(m.is_unmapped for m in R1_group)
                R2_unmapped = any(m.is_unmapped for m in R2_group)
                if R1_unmapped:
                    num_R1_unmapped += 1
                if R2_unmapped:
                    num_R2_unmapped += 1
                if R1_unmapped or R2_unmapped:
                    num_unmapped += 1
                    continue

                R1_nonunique = len(R1_group) > 1 or any(m.mapq < 40 for m in R1_group)
                R2_nonunique = len(R2_group) > 1 or any(m.mapq < 40 for m in R2_group)
                if R1_nonunique or R2_nonunique:
                    num_nonunique += 1
                    continue
                
                R1_m = R1_group.pop()
                R2_m = R2_group.pop()

                R1_strand = sam.get_strand(R1_m)
                R2_strand = sam.get_strand(R2_m)

                tlen = max(R1_m.aend, R2_m.aend) - min(R1_m.pos, R2_m.pos)
                discordant = (R1_m.tid != R2_m.tid) or (R1_strand == R2_strand) or (tlen > 10000)
                if discordant:
                    num_discordant += 1
                    continue
                
                # Reminder: the protocol produces anti-sense reads.
                if R1_strand == '-':
                    if R1_m.pos < R2_m.pos:
                        num_disoriented += 1
                        continue

                elif R1_strand == '+':
                    if R2_m.pos < R1_m.pos:
                        num_disoriented += 1
                        continue
                
                combined_read = paired_end.combine_paired_mappings(R1_m, R2_m)
                
                tlens[tlen] += 1

                if combined_read:
                    # Flip combined_read back to the sense strand.
                    combined_read.is_reverse = not combined_read.is_reverse

                    trim.set_nongenomic_length(combined_read, 0)
                    
                    alignment_sorter.write(combined_read)

                    num_concordant += 1

        self.summary.extend(
            [('Unmapped', num_unmapped),
             ('R1 unmapped', num_R1_unmapped),
             ('R2 unmapped', num_R2_unmapped),
             ('Nonunique', num_nonunique),
             ('Discordant', num_discordant),
             ('Unexpected orientation', num_disoriented),
             ('Concordant', num_concordant),
            ],
        )

        tlens = utilities.counts_to_array(tlens)
        self.write_file('tlens', tlens)
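
sam.merge_by_name isn't shown; for the lockstep iteration above to work, each merged stream must come out name-sorted, so it presumably performs a k-way merge on query name, roughly:

import heapq

def merge_by_name(*mapping_iterators):
    # Presumed behavior: each input yields name-sorted alignments; a k-way
    # merge on qname keeps the combined stream name-sorted, which is what
    # lets combine_mappings walk the R1 and R2 groups in lockstep.
    def keyed(index, iterator):
        for mapping in iterator:
            # The stream index breaks ties so that mapping objects
            # themselves never need to be compared.
            yield mapping.qname, index, mapping
    streams = [keyed(i, it) for i, it in enumerate(mapping_iterators)]
    for qname, index, mapping in heapq.merge(*streams):
        yield mapping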