Example 1
    # Requires glob, os, collections.defaultdict, and itertools.chain, plus the
    # package-local split_file and fastq helper modules.
    def get_read_pairs(self):
        data_fns = glob.glob(self.data_dir + "/*.fastq") + glob.glob(self.data_dir + "/*.fq")
        fn_pairs = defaultdict(lambda: {1: None, 2: None})
        for data_fn in data_fns:
            root, _ = os.path.splitext(os.path.basename(data_fn))
            # Expect root to end in 1 (read 1) or 2 (read 2); anything else
            # makes int() raise ValueError here.
            prefix, which_member = root[:-1], int(root[-1])
            fn_pairs[prefix][which_member] = data_fn

        read_pairs_list = []

        for prefix in sorted(fn_pairs):
            R1_fn = fn_pairs[prefix][1]
            R2_fn = fn_pairs[prefix][2]

            if R1_fn is None or R2_fn is None:
                raise ValueError("unpaired file names in data_dir")

            R1_lines = split_file.piece(R1_fn, self.num_pieces, self.which_piece, "fastq")
            R2_lines = split_file.piece(R2_fn, self.num_pieces, self.which_piece, "fastq")
            read_pairs = fastq.read_pairs(R1_lines, R2_lines, standardize_names=True, ensure_sanger_encoding=True)
            read_pairs_list.append(read_pairs)

        all_read_pairs = chain.from_iterable(read_pairs_list)

        return all_read_pairs
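
The load-bearing idea in this example is the trailing-digit naming convention:
R1 and R2 files share a filename root that differs only in a final 1 or 2. A
minimal standalone sketch of just that grouping step, with hypothetical file
names for illustration:

import os
from collections import defaultdict

def pair_fastq_files(data_fns):
    # Group files whose roots differ only in a trailing '1' or '2',
    # e.g. sample_1.fastq and sample_2.fastq form one pair.
    fn_pairs = defaultdict(lambda: {1: None, 2: None})
    for data_fn in data_fns:
        root, _ = os.path.splitext(os.path.basename(data_fn))
        prefix, which_member = root[:-1], int(root[-1])
        fn_pairs[prefix][which_member] = data_fn
    return dict(fn_pairs)

# Hypothetical inputs; the method above gets them from glob.glob on data_dir.
pairs = pair_fastq_files(['data/sample_1.fastq', 'data/sample_2.fastq'])
assert pairs == {'sample_': {1: 'data/sample_1.fastq',
                             2: 'data/sample_2.fastq'}}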
Example 2
    # Requires logging, os, and the package-local split_file and fastq modules.
    def get_reads(self):
        ''' A generator over the reads in a piece of each data file.
            Can handle a mixture of different fastq encodings across (but not
            within) files.
        '''
        total_reads = 0
        for file_name in self.data_fns:
            total_reads_from_file = 0
            file_piece = split_file.piece(
                file_name,
                self.num_pieces,
                self.which_piece,
                'fastq',
            )
            for read in fastq.reads(file_piece,
                                    standardize_names=True,
                                    ensure_sanger_encoding=True):
                yield read

                total_reads += 1
                total_reads_from_file += 1
                if total_reads % 10000 == 0:
                    logging.info('{0:,} reads processed'.format(total_reads))

            tail = os.path.basename(file_name)
            self.summary.append(
                ('Reads in {0}'.format(tail), total_reads_from_file))

        logging.info('{0:,} total reads processed'.format(total_reads))

        self.summary.append(('Total reads', total_reads))
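
Because get_reads is a generator, reads stream through one at a time, and each
per-file summary entry is appended only after that file's piece is exhausted.
A consumption sketch, assuming a hypothetical experiment object that exposes
this method and reads carrying a seq attribute:

# Hypothetical driver: experiment is assumed to be an instance of the class
# defining get_reads, with data_fns, num_pieces, and which_piece already set.
total_length = 0
n = 0
for read in experiment.get_reads():
    total_length += len(read.seq)  # the seq attribute is an assumption
    n += 1
if n > 0:
    print('mean read length: {0:.1f}'.format(total_length / n))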
Example 3
    # Requires the package-local annotation and split_file modules.
    def get_sorted_sam_lines(self):
        ''' Get this piece's set of SAM lines from the merged SAM file, ensuring
            that the piece boundary doesn't break up a set of mappings to the
            same position.
        '''
        get_position = annotation.make_convertor(self.MappingAnnotation,
                                                 self.PositionAnnotation,
                                                )

        sam_lines = split_file.piece(self.merged_file_names['sorted_clean_sam'],
                                     self.num_pieces,
                                     self.which_piece,
                                     'sam',
                                     key=get_position,
                                    )
        return sam_lines
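
The key argument is what keeps a piece boundary from splitting a run of
mappings to the same position: a boundary may only fall where the key value
changes. A sketch of that invariant (not the library's actual implementation),
using itertools.groupby:

from itertools import groupby

def split_respecting_key(lines, num_pieces, key):
    # Illustration only: partition lines into chunks whose boundaries
    # never fall inside a run of equal key() values.
    runs = [list(g) for _, g in groupby(lines, key=key)]
    per_piece = max(1, -(-len(runs) // num_pieces))  # ceiling division
    for i in range(0, len(runs), per_piece):
        yield [line for run in runs[i:i + per_piece] for line in run]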