Example #1
0
    def get_short_reads_for_splits_dict(self):
        """Return the short reads that map to the splits of interest.

        The splits in `self.split_names_of_interest` are grouped by contig and merged
        into sequential blocks, each of which becomes one (contig_id, start, stop)
        region. Every BAM file in `self.input_bam_files` is then fetched for the reads
        that fall in those regions.

        Returns:
            dict: read sequences (`query_sequence`) keyed by
            '<contig_id>_<start>_<stop>_<query_name>_<bam_file_name>'.

        NOTE: this only handles 'single reads'. Reads that share a query name within
        the same region and BAM file end up under the same key and overwrite each
        other, so the paired-end case still has to be taken into account.
        """
        short_reads_for_splits_dict = {}

        self.progress.new('Accessing reads')
        self.progress.update('Reading splits info from the contigs database ...')
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        try:
            splits_basic_info = contigs_db.db.get_table_as_dict(t.splits_info_table_name)
        finally:
            # disconnect even if reading the table fails
            contigs_db.disconnect()

        self.progress.update('Identifying contigs associated with splits ...')
        contigs_involved = utils.get_contigs_splits_dict(self.split_names_of_interest, splits_basic_info)

        # this variable will hold a list of (contig_id, start, stop) tuples
        # for each contig and the start and stop positions of sequential blocks
        # of splits identified within them
        contig_start_stops = []

        self.progress.update('Computing start/stops positions of interest in %d contigs ...' % (len(contigs_involved)))
        for contig_id in contigs_involved:
            splits_in_contig = contigs_involved[contig_id]

            # hand over a concrete list rather than a dict view
            splits_order = list(splits_in_contig.keys())
            sequential_blocks = ccollections.GetSequentialBlocksOfSplits(splits_order).process()

            for sequential_block in sequential_blocks:
                first_split = splits_in_contig[sequential_block[0]]
                last_split = splits_in_contig[sequential_block[-1]]

                contig_start_stops.append((contig_id,
                                           splits_basic_info[first_split]['start'],
                                           splits_basic_info[last_split]['end']))

        # at this point contig_start_stops knows every contig we are interested in, and
        # their start and stop positions based on what split ids were requested. we
        # shall go through each bam file the user is interested, and get those short reads
        # that map to regions of interest.
        #
        # the entries returned by `fetch` expose many more attributes than the two used
        # below ('query_name' and 'query_sequence'), e.g. 'is_paired', 'is_read1',
        # 'is_read2', 'is_reverse', 'is_secondary', 'is_supplementary', 'is_unmapped',
        # 'mapping_quality', 'cigarstring', 'reference_start', 'reference_end',
        # 'query_qualities', 'template_length', 'tags'.
        for bam_file_path in self.input_bam_files:
            bam_file_name = '.'.join(os.path.basename(bam_file_path).split('.')[:-1])

            self.progress.update('Creating a dictionary of matching short reads in %s ...' % bam_file_name)

            bam_file = pysam.Samfile(bam_file_path, 'rb')
            try:
                for contig_id, start, stop in contig_start_stops:
                    # the region part of the key is the same for every read in the region
                    region = '_'.join([contig_id, str(start), str(stop)])

                    for entry in bam_file.fetch(contig_id, start, stop):
                        read_key = '_'.join([region, entry.query_name, bam_file_name])
                        short_reads_for_splits_dict[read_key] = entry.query_sequence
            finally:
                # do not leak one open file handle per BAM file
                bam_file.close()

        self.progress.end()

        return short_reads_for_splits_dict
Example #2
0
    def store_contigs_fasta(self):
        """Store the contig sequences of this bin and record its basic statistics.

        This is not an easy problem. We split contigs into smaller sequences at the beginning. Only
        a part of a given contig may be used during the binning process. On the other hand we can't
        simply store sequences of splits; whenever possible, we must store the entire sequence of
        the contig (only if all splits are selected from a contig into the same bin). So, this
        function first identifies all splits coming from the same parent, then identifies sequential
        blocks of splits (see `ccollections.GetSequentialBlocksOfSplits`), then checks whether a
        block covers the whole contig. If that is the case, it puts the contig as a single entry,
        with the identical FASTA id to the original contig in the assembly file. Otherwise it appends
        `_partial_X_Y` to the FASTA id, X and Y being the start and stop positions.

        In quick mode (`self.summary.quick`) no file is written: only 'total_length' and
        'num_contigs' in `self.bin_info_dict` are computed from the splits info.

        Otherwise the function writes 'original_split_names.txt', 'contigs.fa', 'num_contigs.txt'
        and 'total_length.txt', fills the same two `self.bin_info_dict` entries, and appends the
        length of every written sequence to `self.contig_lengths`.
        """
        splits_basic_info = self.summary.splits_basic_info

        if self.summary.quick:
            # test membership against a set, and scan the splits info only once
            split_ids = set(self.split_ids)
            selected = [splits_basic_info[split_name] for split_name in splits_basic_info if split_name in split_ids]

            self.bin_info_dict['total_length'] = sum(entry['length'] for entry in selected)
            self.bin_info_dict['num_contigs'] = len({entry['parent'] for entry in selected})
            return

        self.progress.update('Creating the FASTA file ...')

        # store original split names:
        self.store_data_in_file('original_split_names.txt', '\n'.join(self.split_ids))

        fasta_file = self.get_output_file_handle('contigs.fa')

        # some null values:
        self.bin_info_dict['total_length'] = 0
        self.bin_info_dict['num_contigs'] = 0

        try:
            # this dict will keep all the contig ids found in this bin with split names ordered:
            contigs_represented = utils.get_contigs_splits_dict(self.split_ids, splits_basic_info)

            # now it is time to go through each contig found in contigs_represented to
            # figure out what fraction of the contig is in fact in this bin
            for contig_id in contigs_represented:
                splits_in_contig = contigs_represented[contig_id]

                # hand over a concrete list rather than a dict view
                splits_order = list(splits_in_contig.keys())

                self.progress.update('Creating the FASTA file :: Identifying sequential blocks ...')
                # this is critical: sequential_blocks is a list of one or more lists, where each item
                # describes a range of splits that follow each other to represent a coherent
                # chunk of the parent sequence (if all splits from a contig are selected into this bin,
                # then there would be one list item that spans across the entire contig):
                sequential_blocks = ccollections.GetSequentialBlocksOfSplits(splits_order).process()

                for sequential_block in sequential_blocks:
                    self.progress.update('Creating the FASTA file :: Identifying the portion of contig represented ...')
                    first_split = splits_in_contig[sequential_block[0]]
                    last_split = splits_in_contig[sequential_block[-1]]

                    contig_sequence_start_in_splits = splits_basic_info[first_split]['start']
                    contig_sequence_end_in_splits = splits_basic_info[last_split]['end']

                    # so this much of the contig is represented by its splits:
                    total_contig_length_in_splits = contig_sequence_end_in_splits - contig_sequence_start_in_splits

                    # and this is its actual length:
                    contig_sequence_length = self.summary.contigs_basic_info[contig_id]['length']

                    if contig_sequence_length == total_contig_length_in_splits:
                        # the entirety of the contig is represented!
                        appendix = ''
                    else:
                        appendix = '_partial_%d_%d' % (contig_sequence_start_in_splits, contig_sequence_end_in_splits)

                    self.progress.update('Creating the FASTA file :: Reconstructing contig sequence from splits ...')
                    # join once instead of growing the string split by split
                    sequence = ''.join(self.summary.split_sequences[splits_in_contig[split_order]]
                                       for split_order in sequential_block)

                    fasta_id = contig_id + appendix

                    self.progress.update('Creating the FASTA file :: Writing contig sequence into file ...')
                    fasta_file.write('>%s\n' % fasta_id)
                    fasta_file.write('%s\n' % textwrap.fill(sequence, 80, break_on_hyphens=False))

                    # fill in basic info about contigs in bin
                    len_seq = len(sequence)
                    self.bin_info_dict['total_length'] += len_seq
                    self.contig_lengths.append(len_seq)
                    self.bin_info_dict['num_contigs'] += 1
        finally:
            # release the output handle even if something above fails
            fasta_file.close()

        self.store_data_in_file('num_contigs.txt', '%d' % self.bin_info_dict['num_contigs'])
        self.store_data_in_file('total_length.txt', '%d' % self.bin_info_dict['total_length'])
Example #3
0
    def get_short_reads_for_splits_dict(self):
        """Return the short reads that map to the splits of interest.

        The splits in `self.split_names_of_interest` are grouped by contig and merged
        into sequential blocks, each of which becomes one (contig_id, start, stop)
        region. Every BAM file in `self.input_bam_files` is then fetched for the reads
        that fall in those regions.

        Returns:
            dict: a dict of dicts. With `self.split_R1_and_R2` set it has the keys
            'R1', 'R2' and 'UNPAIRED', otherwise the single key 'all'. Each inner
            dict maps '<contig_id>_<start>_<stop>_<query_name>_<bam_file_name>' to
            the read's `query_sequence`.

        Raises:
            ConfigError: if the instance is not initialized, or if
                `self.split_names_of_interest` is empty.
        """
        if not self.initialized:
            raise ConfigError('The `GetReadsFromBAM` class is not initialized :/ Ad hoc use of this class is\
                               OK, but in that case you should set `self.initialized` to True, and provide\
                               the split names of interest manually.')

        if not len(self.split_names_of_interest):
            raise ConfigError("The split names of interest set is empty. This should have never happened. Good\
                               job.")

        short_reads_for_splits_dict = {}
        if self.split_R1_and_R2:
            short_reads_for_splits_dict['R1'] = {}
            short_reads_for_splits_dict['R2'] = {}
            short_reads_for_splits_dict['UNPAIRED'] = {}
        else:
            short_reads_for_splits_dict['all'] = {}

        self.progress.new('Accessing reads')
        self.progress.update('Reading splits info from the contigs database ...')
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        try:
            splits_basic_info = contigs_db.db.get_table_as_dict(t.splits_info_table_name)
        finally:
            # disconnect even if reading the table fails
            contigs_db.disconnect()

        self.progress.update('Identifying contigs associated with splits ...')
        contigs_involved = utils.get_contigs_splits_dict(self.split_names_of_interest, splits_basic_info)

        # this variable will hold a list of (contig_id, start, stop) tuples
        # for each contig and the start and stop positions of sequential blocks
        # of splits identified within them
        contig_start_stops = []

        self.progress.update('Computing start/stops positions of interest in %d contigs ...' % (len(contigs_involved)))
        for contig_id in contigs_involved:
            splits_in_contig = contigs_involved[contig_id]
            splits_order = list(splits_in_contig.keys())
            sequential_blocks = ccollections.GetSequentialBlocksOfSplits(splits_order).process()

            for sequential_block in sequential_blocks:
                first_split = splits_in_contig[sequential_block[0]]
                last_split = splits_in_contig[sequential_block[-1]]

                contig_start_stops.append((contig_id,
                                           splits_basic_info[first_split]['start'],
                                           splits_basic_info[last_split]['end']))

        # at this point contig_start_stops knows every contig we are interested in, and
        # their start and stop positions based on what split ids were requested. we
        # shall go through each bam file the user is interested, and get those short reads
        # that map to regions of interest.
        #
        # the read objects returned by `fetch` expose many more attributes than the ones
        # used below ('query_name', 'query_sequence', 'is_paired', 'is_read1'), e.g.
        # 'is_read2', 'is_reverse', 'is_secondary', 'is_supplementary', 'is_unmapped',
        # 'mapping_quality', 'cigarstring', 'reference_start', 'reference_end',
        # 'query_qualities', 'template_length', 'tags'.
        for bam_file_path in self.input_bam_files:
            bam_file_name = filesnpaths.get_name_from_file_path(bam_file_path)

            bam_file_object = BAMFileObject(bam_file_path).get()

            try:
                self.progress.update('Creating a dictionary of matching short reads in %s ...' % bam_file_name)

                if self.split_R1_and_R2:
                    # paired reads whose mate has not been seen yet, keyed by defline
                    has_unknown_mate = {}

                    for contig_id, start, stop in contig_start_stops:
                        for read in bam_file_object.fetch(contig_id, start, stop):

                            defline = '_'.join([contig_id, str(start), str(stop), read.query_name, bam_file_name])

                            if not read.is_paired:
                                short_reads_for_splits_dict['UNPAIRED'][defline] = read.query_sequence

                            elif defline in has_unknown_mate:
                                # `read`s mate has already been read. so assign the read and the mate
                                # to their respective 'R1' and 'R2' dictionaries, then remove the mate
                                # from has_unknown_mate since its mate is now known.
                                read_DIRECTION = 'R1' if read.is_read1 else 'R2'
                                mate_DIRECTION = 'R2' if read_DIRECTION == 'R1' else 'R1'
                                short_reads_for_splits_dict[mate_DIRECTION][defline] = has_unknown_mate[defline]
                                short_reads_for_splits_dict[read_DIRECTION][defline] = read.query_sequence
                                del has_unknown_mate[defline]

                            else:
                                has_unknown_mate[defline] = read.query_sequence

                    # whatever is left never met its mate in the regions of this BAM file
                    short_reads_for_splits_dict['UNPAIRED'].update(has_unknown_mate)
                else:
                    for contig_id, start, stop in contig_start_stops:
                        for read in bam_file_object.fetch(contig_id, start, stop):
                            defline = '_'.join([contig_id, str(start), str(stop), read.query_name, bam_file_name])
                            short_reads_for_splits_dict['all'][defline] = read.query_sequence
            finally:
                # close the handle even if fetching reads fails
                bam_file_object.close()

        self.progress.end()

        return short_reads_for_splits_dict
Example #4
0
    def get_short_reads_for_splits_dict(self):
        """Return the short reads that map to the splits of interest.

        The splits in `self.split_names_of_interest` are grouped by contig and merged
        into sequential blocks, each of which becomes one (contig_id, start, stop)
        region. Every BAM file in `self.input_bam_files` is then fetched for the reads
        that fall in those regions.

        Returns:
            dict: read sequences (`query_sequence`) keyed by
            '<contig_id>_<start>_<stop>_<query_name>_<bam_file_name>'.

        NOTE: this only handles 'single reads'. Reads that share a query name within
        the same region and BAM file end up under the same key and overwrite each
        other, so the paired-end case still has to be taken into account.
        """
        short_reads_for_splits_dict = {}

        self.progress.new('Accessing reads')
        self.progress.update('Reading splits info from the contigs database ...')
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        try:
            splits_basic_info = contigs_db.db.get_table_as_dict(t.splits_info_table_name)
        finally:
            # disconnect even if reading the table fails
            contigs_db.disconnect()

        self.progress.update('Identifying contigs associated with splits ...')
        contigs_involved = utils.get_contigs_splits_dict(self.split_names_of_interest, splits_basic_info)

        # this variable will hold a list of (contig_id, start, stop) tuples
        # for each contig and the start and stop positions of sequential blocks
        # of splits identified within them
        contig_start_stops = []

        self.progress.update('Computing start/stops positions of interest in %d contigs ...' % (len(contigs_involved)))
        for contig_id in contigs_involved:
            splits_in_contig = contigs_involved[contig_id]

            # hand over a concrete list rather than a dict view
            splits_order = list(splits_in_contig.keys())
            sequential_blocks = ccollections.GetSequentialBlocksOfSplits(splits_order).process()

            for sequential_block in sequential_blocks:
                first_split = splits_in_contig[sequential_block[0]]
                last_split = splits_in_contig[sequential_block[-1]]

                contig_start_stops.append((contig_id,
                                           splits_basic_info[first_split]['start'],
                                           splits_basic_info[last_split]['end']))

        # at this point contig_start_stops knows every contig we are interested in, and
        # their start and stop positions based on what split ids were requested. we
        # shall go through each bam file the user is interested, and get those short reads
        # that map to regions of interest.
        #
        # the entries returned by `fetch` expose many more attributes than the two used
        # below ('query_name' and 'query_sequence'), e.g. 'is_paired', 'is_read1',
        # 'is_read2', 'is_reverse', 'is_secondary', 'is_supplementary', 'is_unmapped',
        # 'mapping_quality', 'cigarstring', 'reference_start', 'reference_end',
        # 'query_qualities', 'template_length', 'tags'.
        for bam_file_path in self.input_bam_files:
            bam_file_name = '.'.join(os.path.basename(bam_file_path).split('.')[:-1])

            self.progress.update('Creating a dictionary of matching short reads in %s ...' % bam_file_name)

            bam_file = pysam.Samfile(bam_file_path, 'rb')
            try:
                for contig_id, start, stop in contig_start_stops:
                    # the region part of the key is the same for every read in the region
                    region = '_'.join([contig_id, str(start), str(stop)])

                    for entry in bam_file.fetch(contig_id, start, stop):
                        read_key = '_'.join([region, entry.query_name, bam_file_name])
                        short_reads_for_splits_dict[read_key] = entry.query_sequence
            finally:
                # do not leak one open file handle per BAM file
                bam_file.close()

        self.progress.end()

        return short_reads_for_splits_dict
Example #5
0
    def store_contigs_fasta(self):
        """Store the contig sequences of this bin and record its basic statistics.

        This is not an easy problem. We split contigs into smaller sequences at the beginning. Only
        a part of a given contig may be used during the binning process. On the other hand we can't
        simply store sequences of splits; whenever possible, we must store the entire sequence of
        the contig (only if all splits are selected from a contig into the same bin). So, this
        function first identifies all splits coming from the same parent, then identifies sequential
        blocks of splits (see `ccollections.GetSequentialBlocksOfSplits`), then checks whether a
        block covers the whole contig. If that is the case, it puts the contig as a single entry,
        with the identical FASTA id to the original contig in the assembly file. Otherwise it appends
        `_partial_X_Y` to the FASTA id, X and Y being the start and stop positions.

        In quick mode (`self.summary.quick`) no file is written: only 'total_length' and
        'num_contigs' in `self.bin_info_dict` are computed from the splits info.

        Otherwise the function writes 'original_split_names.txt', 'contigs.fa', 'num_contigs.txt'
        and 'total_length.txt', fills the same two `self.bin_info_dict` entries, and appends the
        length of every written sequence to `self.contig_lengths`.
        """
        splits_basic_info = self.summary.splits_basic_info

        if self.summary.quick:
            # test membership against a set, and scan the splits info only once
            split_ids = set(self.split_ids)
            selected = [
                splits_basic_info[split_name]
                for split_name in splits_basic_info
                if split_name in split_ids
            ]

            self.bin_info_dict['total_length'] = sum(
                entry['length'] for entry in selected)
            self.bin_info_dict['num_contigs'] = len(
                {entry['parent'] for entry in selected})
            return

        self.progress.update('Creating the FASTA file ...')

        # store original split names:
        self.store_data_in_file('original_split_names.txt',
                                '\n'.join(self.split_ids))

        fasta_file = self.get_output_file_handle('contigs.fa')

        # some null values:
        self.bin_info_dict['total_length'] = 0
        self.bin_info_dict['num_contigs'] = 0

        try:
            # this dict will keep all the contig ids found in this bin with split names ordered:
            contigs_represented = utils.get_contigs_splits_dict(
                self.split_ids, splits_basic_info)

            # now it is time to go through each contig found in contigs_represented to
            # figure out what fraction of the contig is in fact in this bin
            for contig_id in contigs_represented:
                splits_in_contig = contigs_represented[contig_id]

                # hand over a concrete list rather than a dict view
                splits_order = list(splits_in_contig.keys())

                self.progress.update(
                    'Creating the FASTA file :: Identifying sequential blocks ...')
                # this is critical: sequential_blocks is a list of one or more lists, where each item
                # describes a range of splits that follow each other to represent a coherent
                # chunk of the parent sequence (if all splits from a contig are selected into this bin,
                # then there would be one list item that spans across the entire contig):
                sequential_blocks = ccollections.GetSequentialBlocksOfSplits(
                    splits_order).process()

                for sequential_block in sequential_blocks:
                    self.progress.update(
                        'Creating the FASTA file :: Identifying the portion of contig represented ...'
                    )
                    first_split = splits_in_contig[sequential_block[0]]
                    last_split = splits_in_contig[sequential_block[-1]]

                    contig_sequence_start_in_splits = splits_basic_info[
                        first_split]['start']
                    contig_sequence_end_in_splits = splits_basic_info[
                        last_split]['end']

                    # so this much of the contig is represented by its splits:
                    total_contig_length_in_splits = contig_sequence_end_in_splits - contig_sequence_start_in_splits

                    # and this is its actual length:
                    contig_sequence_length = self.summary.contigs_basic_info[
                        contig_id]['length']

                    if contig_sequence_length == total_contig_length_in_splits:
                        # the entirety of the contig is represented!
                        appendix = ''
                    else:
                        appendix = '_partial_%d_%d' % (
                            contig_sequence_start_in_splits,
                            contig_sequence_end_in_splits)

                    self.progress.update(
                        'Creating the FASTA file :: Reconstructing contig sequence from splits ...'
                    )
                    # join once instead of growing the string split by split
                    sequence = ''.join(
                        self.summary.split_sequences[splits_in_contig[split_order]]
                        for split_order in sequential_block)

                    fasta_id = contig_id + appendix

                    self.progress.update(
                        'Creating the FASTA file :: Writing contig sequence into file ...'
                    )
                    fasta_file.write('>%s\n' % fasta_id)
                    fasta_file.write(
                        '%s\n' %
                        textwrap.fill(sequence, 80, break_on_hyphens=False))

                    # fill in basic info about contigs in bin
                    len_seq = len(sequence)
                    self.bin_info_dict['total_length'] += len_seq
                    self.contig_lengths.append(len_seq)
                    self.bin_info_dict['num_contigs'] += 1
        finally:
            # release the output handle even if something above fails
            fasta_file.close()

        self.store_data_in_file('num_contigs.txt',
                                '%d' % self.bin_info_dict['num_contigs'])
        self.store_data_in_file('total_length.txt',
                                '%d' % self.bin_info_dict['total_length'])