def get_read_pairs(self):
    ''' Pair up fastq files in data_dir and return an iterator over read pairs.

    Files are paired by name: two files whose names are identical except for
    a trailing '1' or '2' before the extension (e.g. sample_R1.fastq /
    sample_R2.fastq) form an R1/R2 pair.  For each pair, this piece's share
    of the reads is extracted with split_file.piece and the pairs from all
    files are chained into a single iterator.

    Raises ValueError if a file name does not end in '1' or '2', or if any
    file lacks its mate.
    '''
    data_fns = glob.glob(os.path.join(self.data_dir, '*.fastq')) + \
               glob.glob(os.path.join(self.data_dir, '*.fq'))

    # Map common prefix -> {1: R1 file name, 2: R2 file name}
    fn_pairs = defaultdict(lambda: {1: None, 2: None})
    for data_fn in data_fns:
        head, tail = os.path.split(data_fn)
        root, ext = os.path.splitext(tail)
        # Expect root to end in either 1 or 2; fail with a clear message
        # instead of an opaque int() ValueError (or IndexError on an
        # empty root).
        if not root or root[-1] not in ('1', '2'):
            raise ValueError('file name does not end in 1 or 2: {0}'.format(data_fn))
        prefix, which_member = root[:-1], int(root[-1])
        fn_pairs[prefix][which_member] = data_fn

    read_pairs_list = []
    for prefix in sorted(fn_pairs):
        R1_fn = fn_pairs[prefix][1]
        R2_fn = fn_pairs[prefix][2]
        if R1_fn is None or R2_fn is None:
            raise ValueError("unpaired file names in data_dir")

        R1_lines = split_file.piece(R1_fn, self.num_pieces, self.which_piece, "fastq")
        R2_lines = split_file.piece(R2_fn, self.num_pieces, self.which_piece, "fastq")
        read_pairs = fastq.read_pairs(R1_lines, R2_lines,
                                      standardize_names=True,
                                      ensure_sanger_encoding=True,
                                      )
        read_pairs_list.append(read_pairs)

    # Lazily concatenate the per-file-pair iterators.
    all_read_pairs = chain.from_iterable(read_pairs_list)
    return all_read_pairs
def get_reads(self):
    ''' Yield the reads in this piece of every data file.

    Files may use different fastq encodings from one another (but each
    individual file must be internally consistent); all reads are emitted
    with standardized names and Sanger quality encoding.  Per-file and
    overall read counts are logged and recorded in self.summary.
    '''
    total_reads = 0
    for fn in self.data_fns:
        reads_in_this_file = 0
        piece = split_file.piece(fn,
                                 self.num_pieces,
                                 self.which_piece,
                                 'fastq',
                                 )
        parsed = fastq.reads(piece,
                             standardize_names=True,
                             ensure_sanger_encoding=True,
                             )
        for read in parsed:
            total_reads += 1
            reads_in_this_file += 1
            yield read
            # Periodic progress report.
            if total_reads % 10000 == 0:
                logging.info('{0:,} reads processed'.format(total_reads))

        tail = os.path.basename(fn)
        self.summary.append(('Reads in {0}'.format(tail), reads_in_this_file))

    logging.info('{0:,} total reads processed'.format(total_reads))
    self.summary.append(('Total reads', total_reads))
def get_sorted_sam_lines(self):
    ''' Return this piece's SAM lines from the merged, sorted SAM file.

    The piece boundary is keyed on mapping position so that all mappings
    to the same position land in the same piece rather than being split
    across two pieces.
    '''
    position_key = annotation.make_convertor(self.MappingAnnotation,
                                             self.PositionAnnotation,
                                             )
    return split_file.piece(self.merged_file_names['sorted_clean_sam'],
                            self.num_pieces,
                            self.which_piece,
                            'sam',
                            key=position_key,
                            )
def get_reads(self):
    """ Generate the reads in this piece of each data file.

    Handles a mixture of fastq encodings across files (each file must be
    internally consistent).  Reads come out with standardized names in
    Sanger encoding; counts are logged and appended to self.summary.
    """
    total_reads = 0
    for file_name in self.data_fns:
        file_piece = split_file.piece(file_name, self.num_pieces,
                                      self.which_piece, "fastq")
        read_iter = fastq.reads(file_piece,
                                standardize_names=True,
                                ensure_sanger_encoding=True)

        file_count = 0
        for file_count, read in enumerate(read_iter, 1):
            yield read
            # Log progress every 10,000 reads overall.
            if (total_reads + file_count) % 10000 == 0:
                logging.info("{0:,} reads processed".format(total_reads + file_count))
        total_reads += file_count

        head, tail = os.path.split(file_name)
        self.summary.append(("Reads in {0}".format(tail), file_count))

    logging.info("{0:,} total reads processed".format(total_reads))
    self.summary.append(("Total reads", total_reads))