Example #1
0
    def extract_boundary_sequences(self):
        ''' Write out the five- and three-prime payloads of each read pair.

        Read pairs come from self.get_read_pairs() and are passed through
        self.trim_barcodes(). Each trimmed pair is handed to
        TIF_seq_structure.find_boundary_sequences together with a dict of
        counters. A pair is written to the 'five_prime_boundaries' and
        'three_prime_boundaries' files only if both returned payload reads
        are truthy and both sequences are at least self.min_payload_length
        long.

        Afterwards the per-orientation position counts, the polyA lengths
        and the joint lengths are converted with counts_to_array and written
        out, the remaining counters are written as 'id_counts', and three
        totals are added to self.summary.
        '''
        pairs = self.trim_barcodes(self.get_read_pairs())

        n_pairs = 0
        n_well_formed = 0
        n_long_enough = 0

        # 'positions' holds one Counter per orientation; every other entry
        # is a plain Counter. Keys are inserted in a fixed order.
        counters = {'positions': {orientation: Counter() for orientation in orientations}}
        for counter_name in ('control_ids',
                             'polyA_lengths',
                             'left_ids',
                             'right_ids',
                             'joint_lengths',
                            ):
            counters[counter_name] = Counter()

        with open(self.file_names['five_prime_boundaries'], 'w') as fives_fh:
            with open(self.file_names['three_prime_boundaries'], 'w') as threes_fh:
                for R1, R2 in pairs:
                    n_pairs += 1

                    five_read, three_read = TIF_seq_structure.find_boundary_sequences(R1, R2, counters)
                    if not (five_read and three_read):
                        continue
                    n_well_formed += 1

                    if (len(five_read.seq) < self.min_payload_length or
                            len(three_read.seq) < self.min_payload_length):
                        continue
                    n_long_enough += 1

                    fives_fh.write(fastq.make_record(*five_read))
                    threes_fh.write(fastq.make_record(*three_read))

        # Entries are popped as they are written so that whatever is left in
        # counters at the end can go straight into the id_counts file.
        per_orientation = counters.pop('positions')
        for orientation in orientations:
            self.write_file('{0}_{1}'.format(orientation, 'positions'),
                            counts_to_array(per_orientation[orientation]),
                           )

        self.write_file('polyA_lengths',
                        counts_to_array(counters.pop('polyA_lengths')))
        self.write_file('joint_lengths',
                        counts_to_array(counters.pop('joint_lengths'), dim=2))

        self.write_file('id_counts', counters)

        summary_rows = [
            ('Total read pairs', n_pairs),
            ('Well-formed', n_well_formed),
            ('Long enough', n_long_enough),
        ]
        self.summary.extend(summary_rows)
Example #2
0
 def preprocess(self):
     ''' Copy every read into the 'preprocessed_reads' file.

     tophat can't handle named pipes, so the reads have to be written to
     an actual file. The number of reads written is added to self.summary
     as 'Total reads'.
     '''
     read_source = self.get_reads()
     num_written = 0

     with open(self.file_names['preprocessed_reads'], 'w') as out_fh:
         # Counting from 1 leaves num_written equal to the number of reads
         # seen; it stays 0 if there are none.
         for num_written, read in enumerate(read_source, 1):
             out_fh.write(fastq.make_record(*read))

     self.summary.extend([('Total reads', num_written)])
    def preprocess(self):
        ''' Materialize the reads in a real file.

        tophat can't handle named pipes, so each read from self.get_reads()
        is formatted with fastq.make_record and written to the
        'preprocessed_reads' file. The read count is then added to
        self.summary under 'Total reads'.
        '''
        incoming = self.get_reads()
        destination = self.file_names['preprocessed_reads']

        count = 0
        with open(destination, 'w') as sink:
            for fields in incoming:
                count += 1
                sink.write(fastq.make_record(*fields))

        summary_rows = [('Total reads', count)]
        self.summary.extend(summary_rows)
    def trim_reads(self, read_pairs):
        ''' Yield barcode- and adapter-trimmed versions of each read pair.

        For every (R1, R2) in read_pairs, a trim position is determined:
        len(self.barcode) if both reads look like primer dimers, otherwise
        whatever adapters.consistent_paired_position reports. If a position
        was found and it leaves fewer than 12 bases after the barcode, the
        pair is skipped. If no position was found, the reads are kept up to
        len(R1.seq). The yielded reads cover [len(self.barcode), position).

        This is a generator: the 'trimmed_lengths' and 'barcodes' files and
        the summary entries are only written once read_pairs has been
        consumed completely.

        Args:
            read_pairs: iterable of (R1, R2) pairs whose members expose
                name, seq and qual attributes.

        Yields:
            (processed_R1, processed_R2), where processed_R1 is a
            fastq.Read and processed_R2 is the return value of
            fastq.make_record.
        '''
        total_reads = 0
        long_enough_reads = 0
        trimmed_lengths = Counter()
        barcodes = Counter()

        # The barcode length does not change per read, so compute it once.
        barcode_length = len(self.barcode)

        # Adapters with their first base removed, used for the primer dimer
        # check below.
        truncated_in_R1 = self.adapter_in_R1[1:]
        truncated_in_R2 = self.adapter_in_R2[1:]

        for R1, R2 in read_pairs:
            total_reads += 1
            # Tally whatever sits where the barcode is expected in R2.
            barcodes[R2.seq[:barcode_length]] += 1

            # Check for weird thing where expected overhang base doesn't
            # exist in primer dimers.
            R1_dimer_distance = adapters.adapter_hamming_distance(
                R1.seq,
                truncated_in_R1,
                len(R1.seq),
                len(truncated_in_R1),
                barcode_length,
            )
            R2_dimer_distance = adapters.adapter_hamming_distance(
                R2.seq,
                truncated_in_R2,
                len(R2.seq),
                len(truncated_in_R2),
                barcode_length,
            )
            if R1_dimer_distance <= 3 and R2_dimer_distance <= 3:
                # Both reads match the truncated adapter right after the
                # barcode, so there is no payload at all.
                position = barcode_length
            else:
                # NOTE(review): 19 and 3 are passed through unchanged;
                # presumably a minimum comparison length and a maximum
                # number of mismatches - confirm against
                # adapters.consistent_paired_position.
                position = adapters.consistent_paired_position(
                    R1.seq,
                    R2.seq,
                    self.adapter_in_R1,
                    self.adapter_in_R2,
                    19,
                    3,
                )

            if position is None:
                # No trim position found: keep everything after the barcode.
                position = len(R1.seq)
            else:
                trimmed_lengths[position] += 1
                if position - barcode_length < 12:
                    # Fewer than 12 bases would remain after the barcode.
                    continue

            long_enough_reads += 1

            payload_slice = slice(barcode_length, position)

            processed_R1 = fastq.Read(R1.name, R1.seq[payload_slice],
                                      R1.qual[payload_slice])
            # NOTE(review): R1 is wrapped in fastq.Read while R2 goes through
            # fastq.make_record - this looks asymmetric; confirm that
            # consumers really expect different types for the two mates.
            processed_R2 = fastq.make_record(R2.name, R2.seq[payload_slice],
                                             R2.qual[payload_slice])

            yield processed_R1, processed_R2

        trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
        self.write_file('trimmed_lengths', trimmed_lengths)
        self.write_file('barcodes', barcodes)
        self.summary.extend([
            ('Total read pairs', total_reads),
            ('Long enough', long_enough_reads),
        ])
    def trim_reads(self, read_pairs):
        ''' Yield barcode- and adapter-trimmed versions of each read pair.

        For every (R1, R2) in read_pairs, a trim position is determined:
        len(self.barcode) if both reads look like primer dimers, otherwise
        whatever adapters.consistent_paired_position reports. If a position
        was found and it leaves fewer than 12 bases after the barcode, the
        pair is skipped. If no position was found, the reads are kept up to
        len(R1.seq). The yielded reads cover [len(self.barcode), position).

        This is a generator: the 'trimmed_lengths' and 'barcodes' files and
        the summary entries are only written once read_pairs has been
        consumed completely.

        Args:
            read_pairs: iterable of (R1, R2) pairs whose members expose
                name, seq and qual attributes.

        Yields:
            (processed_R1, processed_R2), where processed_R1 is a
            fastq.Read and processed_R2 is the return value of
            fastq.make_record.
        '''
        total_reads = 0
        long_enough_reads = 0
        trimmed_lengths = Counter()
        barcodes = Counter()

        # The barcode length does not change per read, so compute it once.
        barcode_length = len(self.barcode)

        # Adapters with their first base removed, used for the primer dimer
        # check below.
        truncated_in_R1 = self.adapter_in_R1[1:]
        truncated_in_R2 = self.adapter_in_R2[1:]

        for R1, R2 in read_pairs:
            total_reads += 1
            # Tally whatever sits where the barcode is expected in R2.
            barcodes[R2.seq[:barcode_length]] += 1

            # Check for weird thing where expected overhang base doesn't
            # exist in primer dimers.
            R1_dimer_distance = adapters.adapter_hamming_distance(R1.seq,
                                                                  truncated_in_R1,
                                                                  len(R1.seq),
                                                                  len(truncated_in_R1),
                                                                  barcode_length,
                                                                 )
            R2_dimer_distance = adapters.adapter_hamming_distance(R2.seq,
                                                                  truncated_in_R2,
                                                                  len(R2.seq),
                                                                  len(truncated_in_R2),
                                                                  barcode_length,
                                                                 )
            if R1_dimer_distance <= 3 and R2_dimer_distance <= 3:
                # Both reads match the truncated adapter right after the
                # barcode, so there is no payload at all.
                position = barcode_length
            else:
                # NOTE(review): 19 and 3 are passed through unchanged;
                # presumably a minimum comparison length and a maximum
                # number of mismatches - confirm against
                # adapters.consistent_paired_position.
                position = adapters.consistent_paired_position(R1.seq,
                                                               R2.seq,
                                                               self.adapter_in_R1,
                                                               self.adapter_in_R2,
                                                               19,
                                                               3,
                                                              )

            if position is None:
                # No trim position found: keep everything after the barcode.
                position = len(R1.seq)
            else:
                trimmed_lengths[position] += 1
                if position - barcode_length < 12:
                    # Fewer than 12 bases would remain after the barcode.
                    continue

            long_enough_reads += 1

            payload_slice = slice(barcode_length, position)

            processed_R1 = fastq.Read(R1.name, R1.seq[payload_slice], R1.qual[payload_slice])
            # NOTE(review): R1 is wrapped in fastq.Read while R2 goes through
            # fastq.make_record - this looks asymmetric; confirm that
            # consumers really expect different types for the two mates.
            processed_R2 = fastq.make_record(R2.name, R2.seq[payload_slice], R2.qual[payload_slice])

            yield processed_R1, processed_R2

        trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
        self.write_file('trimmed_lengths', trimmed_lengths)
        self.write_file('barcodes', barcodes)
        self.summary.extend(
            [('Total read pairs', total_reads),
             ('Long enough', long_enough_reads),
            ]
        )