def extract_boundary_sequences(self):
    ''' Find 5' and 3' payload reads in each trimmed read pair and write
    them to the boundary fastq files, accumulating position/length/id
    statistics along the way.
    '''
    pairs = self.get_read_pairs()
    trimmed_pairs = self.trim_barcodes(pairs)

    n_total = 0
    n_well_formed = 0
    n_long_enough = 0

    # Tallies filled in by find_boundary_sequences as a side effect.
    stats = {
        'positions': {orientation: Counter() for orientation in orientations},
        'control_ids': Counter(),
        'polyA_lengths': Counter(),
        'left_ids': Counter(),
        'right_ids': Counter(),
        'joint_lengths': Counter(),
    }

    with open(self.file_names['five_prime_boundaries'], 'w') as fives_fh, \
         open(self.file_names['three_prime_boundaries'], 'w') as threes_fh:
        for R1, R2 in trimmed_pairs:
            n_total += 1
            five_read, three_read = TIF_seq_structure.find_boundary_sequences(R1, R2, stats)
            if not (five_read and three_read):
                continue
            n_well_formed += 1
            long_enough = (len(five_read.seq) >= self.min_payload_length and
                           len(three_read.seq) >= self.min_payload_length)
            if long_enough:
                n_long_enough += 1
                fives_fh.write(fastq.make_record(*five_read))
                threes_fh.write(fastq.make_record(*three_read))

    # Pop off of stats so that what is left at the end can be written
    # directly to the id_counts file.
    position_counts = stats.pop('positions')
    for orientation in orientations:
        key = '{0}_{1}'.format(orientation, 'positions')
        self.write_file(key, counts_to_array(position_counts[orientation]))

    self.write_file('polyA_lengths', counts_to_array(stats.pop('polyA_lengths')))
    self.write_file('joint_lengths', counts_to_array(stats.pop('joint_lengths'), dim=2))
    self.write_file('id_counts', stats)

    self.summary.extend([
        ('Total read pairs', n_total),
        ('Well-formed', n_well_formed),
        ('Long enough', n_long_enough),
    ])
def preprocess(self):
    ''' tophat can't handle named pipes, so need to make a file. '''
    total_reads = 0
    with open(self.file_names['preprocessed_reads'], 'w') as preprocessed_file:
        for read in self.get_reads():
            total_reads += 1
            preprocessed_file.write(fastq.make_record(*read))

    self.summary.extend([
        ('Total reads', total_reads),
    ])
def preprocess(self):
    ''' tophat can't handle named pipes, so need to make a file. '''
    reads = self.get_reads()
    count = 0
    with open(self.file_names['preprocessed_reads'], 'w') as out_fh:
        for read in reads:
            count += 1
            record = fastq.make_record(*read)
            out_fh.write(record)

    self.summary.extend([('Total reads', count)])
def trim_reads(self, read_pairs):
    ''' Trim the barcode prefix and any detected adapter suffix from each
    read pair, yielding (processed_R1, processed_R2) pairs of fastq.Read
    payloads.

    Side effects: writes 'trimmed_lengths' and 'barcodes' files and
    appends run totals to self.summary.
    '''
    total_reads = 0
    long_enough_reads = 0
    trimmed_lengths = Counter()
    barcodes = Counter()

    # Drop the expected overhang base so the dimer comparison below can
    # start immediately after the barcode.
    truncated_in_R1 = self.adapter_in_R1[1:]
    truncated_in_R2 = self.adapter_in_R2[1:]

    for R1, R2 in read_pairs:
        total_reads += 1
        barcodes[R2.seq[:len(self.barcode)]] += 1

        # Check for weird thing where expected overhang base doesn't
        # exist in primer dimers.
        R1_dimer_distance = adapters.adapter_hamming_distance(
            R1.seq,
            truncated_in_R1,
            len(R1.seq),
            len(truncated_in_R1),
            len(self.barcode),
        )
        R2_dimer_distance = adapters.adapter_hamming_distance(
            R2.seq,
            truncated_in_R2,
            len(R2.seq),
            len(truncated_in_R2),
            len(self.barcode),
        )
        if R1_dimer_distance <= 3 and R2_dimer_distance <= 3:
            # Primer dimer: the payload is empty, so the pair is dropped
            # by the length filter below.
            position = len(self.barcode)
        else:
            position = adapters.consistent_paired_position(
                R1.seq,
                R2.seq,
                self.adapter_in_R1,
                self.adapter_in_R2,
                19,
                3,
            )
        # PEP 8: identity comparison with None, not equality.
        if position is not None:
            trimmed_lengths[position] += 1
            if position - len(self.barcode) < 12:
                # Payload too short to be useful.
                continue
        else:
            # No adapter found - keep the full read.
            position = len(R1.seq)

        long_enough_reads += 1
        payload_slice = slice(len(self.barcode), position)
        processed_R1 = fastq.Read(R1.name, R1.seq[payload_slice], R1.qual[payload_slice])
        # BUG FIX: processed_R2 was built with fastq.make_record (a
        # serialized record string) while processed_R1 used fastq.Read,
        # yielding a type-mismatched pair. Both are now fastq.Read.
        processed_R2 = fastq.Read(R2.name, R2.seq[payload_slice], R2.qual[payload_slice])
        yield processed_R1, processed_R2

    trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
    self.write_file('trimmed_lengths', trimmed_lengths)
    self.write_file('barcodes', barcodes)
    self.summary.extend([
        ('Total read pairs', total_reads),
        ('Long enough', long_enough_reads),
    ])
def trim_reads(self, read_pairs):
    ''' Trim the barcode prefix and any detected adapter suffix from each
    read pair, yielding (processed_R1, processed_R2) pairs of fastq.Read
    payloads.

    Side effects: writes 'trimmed_lengths' and 'barcodes' files and
    appends run totals to self.summary.
    '''
    total_reads = 0
    long_enough_reads = 0
    trimmed_lengths = Counter()
    barcodes = Counter()

    # Drop the expected overhang base so the dimer comparison below can
    # start immediately after the barcode.
    truncated_in_R1 = self.adapter_in_R1[1:]
    truncated_in_R2 = self.adapter_in_R2[1:]

    for R1, R2 in read_pairs:
        total_reads += 1
        barcodes[R2.seq[:len(self.barcode)]] += 1

        # Check for weird thing where expected overhang base doesn't
        # exist in primer dimers.
        R1_dimer_distance = adapters.adapter_hamming_distance(R1.seq,
                                                              truncated_in_R1,
                                                              len(R1.seq),
                                                              len(truncated_in_R1),
                                                              len(self.barcode),
                                                             )
        R2_dimer_distance = adapters.adapter_hamming_distance(R2.seq,
                                                              truncated_in_R2,
                                                              len(R2.seq),
                                                              len(truncated_in_R2),
                                                              len(self.barcode),
                                                             )
        if R1_dimer_distance <= 3 and R2_dimer_distance <= 3:
            # Primer dimer: the payload is empty, so the pair is dropped
            # by the length filter below.
            position = len(self.barcode)
        else:
            position = adapters.consistent_paired_position(R1.seq,
                                                           R2.seq,
                                                           self.adapter_in_R1,
                                                           self.adapter_in_R2,
                                                           19,
                                                           3,
                                                          )
        # PEP 8: identity comparison with None, not equality.
        if position is not None:
            trimmed_lengths[position] += 1
            if position - len(self.barcode) < 12:
                # Payload too short to be useful.
                continue
        else:
            # No adapter found - keep the full read.
            position = len(R1.seq)

        long_enough_reads += 1
        payload_slice = slice(len(self.barcode), position)
        processed_R1 = fastq.Read(R1.name, R1.seq[payload_slice], R1.qual[payload_slice])
        # BUG FIX: processed_R2 was built with fastq.make_record (a
        # serialized record string) while processed_R1 used fastq.Read,
        # yielding a type-mismatched pair. Both are now fastq.Read.
        processed_R2 = fastq.Read(R2.name, R2.seq[payload_slice], R2.qual[payload_slice])
        yield processed_R1, processed_R2

    trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
    self.write_file('trimmed_lengths', trimmed_lengths)
    self.write_file('barcodes', barcodes)
    self.summary.extend(
        [('Total read pairs', total_reads),
         ('Long enough', long_enough_reads),
        ]
    )