def list_contigs(self):
    """Print a two-column listing (contig name, pretty-printed length) and return.

    If `self.input_file_path` is set, contig names/lengths come from the BAM
    header (sorted longest first); otherwise they come from the serialized
    profile at `self.serialized_profile_path` (sorted by length ascending).
    """
    import signal
    # Restore default SIGPIPE handling so piping the output into e.g. `head`
    # terminates quietly instead of raising a broken-pipe traceback.
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    if self.input_file_path:
        self.progress.new('Init')
        self.progress.update('Reading BAM File')
        self.bam = pysam.Samfile(self.input_file_path, 'rb')
        self.progress.end()

        self.contig_names = self.bam.references
        self.contig_lengths = self.bam.lengths

        utils.check_contig_names(self.contig_names)

        for tpl in sorted(zip(self.contig_lengths, self.contig_names), reverse=True):
            # FIX: parenthesized print -- the bare `print` statement is
            # Python-2-only; this single-argument form behaves identically
            # under Python 2 and Python 3.
            print('%-40s %s' % (tpl[1], pp(int(tpl[0]))))
    else:
        self.progress.new('Init')
        self.progress.update('Reading serialized profile')
        self.contigs = dictio.read_serialized_object(self.serialized_profile_path)
        self.progress.end()

        self.run.info('profile_loaded_from', self.serialized_profile_path)
        self.run.info('num_contigs', pp(len(self.contigs)))

        for tpl in sorted([(int(self.contigs[contig].length), contig) for contig in self.contigs]):
            print('%-40s %s' % (tpl[1], pp(int(tpl[0]))))
def list_contigs(self):
    """Print a two-column listing (contig name, pretty-printed length) and return.

    If `self.input_file_path` is set, contig names/lengths come from the BAM
    header (sorted longest first); otherwise they come from the serialized
    profile at `self.serialized_profile_path` (sorted by length ascending).
    """
    import signal
    # Restore default SIGPIPE handling so piping the output into e.g. `head`
    # terminates quietly instead of raising a broken-pipe traceback.
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    if self.input_file_path:
        self.progress.new('Init')
        self.progress.update('Reading BAM File')
        self.bam = pysam.Samfile(self.input_file_path, 'rb')
        self.progress.end()

        self.contig_names = self.bam.references
        # FIX: attribute was misspelled `contig_lenghts`; renamed to match
        # the spelling used by every other version of this method. It is set
        # and read only within this method, so the rename is self-contained.
        self.contig_lengths = self.bam.lengths

        utils.check_contig_names(self.contig_names)

        for tpl in sorted(zip(self.contig_lengths, self.contig_names), reverse=True):
            # FIX: parenthesized print -- the bare `print` statement is
            # Python-2-only; this single-argument form behaves identically
            # under Python 2 and Python 3.
            print('%-40s %s' % (tpl[1], pp(int(tpl[0]))))
    else:
        self.progress.new('Init')
        self.progress.update('Reading serialized profile')
        self.contigs = dictio.read_serialized_object(self.serialized_profile_path)
        self.progress.end()

        self.run.info('profile_loaded_from', self.serialized_profile_path)
        self.run.info('num_contigs', pp(len(self.contigs)))

        for tpl in sorted([(int(self.contigs[contig].length), contig) for contig in self.contigs]):
            print('%-40s %s' % (tpl[1], pp(int(tpl[0]))))
def init_mock_profile(self):
    """Initialize a 'blank' profile (no BAM file) from contigs-db information alone.

    Pulls contig names and lengths from `self.contigs_basic_info`, applies the
    -M minimum-contig-length filter, reports the resulting numbers through
    `self.run`, and stores the summary values into the profile database.
    NOTE(review): assumes `self.contigs_basic_info`, `self.progress`,
    `self.run`, `self.output_directory`, `self.profile_db_path`,
    `self.layer_additional_data` and `self.layer_additional_keys` are set up
    by the caller -- confirm against `__init__`.
    """
    self.progress.new('Init')
    self.progress.update('...')
    self.num_reads_mapped = 0  # no BAM file, so nothing is mapped
    self.progress.end()

    self.contig_names = list(self.contigs_basic_info.keys())
    self.contig_lengths = [self.contigs_basic_info[contig_name]['length'] for contig_name in self.contigs_basic_info]
    self.total_length = sum(self.contig_lengths)
    self.num_contigs = len(self.contig_names)

    utils.check_contig_names(self.contig_names)

    self.run.info('input_bam', None)
    self.run.info('output_dir', self.output_directory, display_only=True)
    self.run.info('total_reads_mapped', pp(int(self.num_reads_mapped)))
    self.run.info('num_contigs', pp(self.num_contigs))

    # check for the -M parameter.
    # NOTE(review): presumably this updates num_contigs / num_splits /
    # total_length as a side effect, since they are reported below -- verify.
    self.remove_contigs_that_are_shorter_than_min_contig_length()

    self.run.info('num_contigs_after_M', self.num_contigs, display_only=True)
    self.run.info('num_contigs', self.num_contigs, quiet=True)
    self.run.info('num_splits', self.num_splits)
    self.run.info('total_length', self.total_length)

    # persist the summary values into the profile database
    profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
    profile_db.db.set_meta_value('num_splits', self.num_splits)
    profile_db.db.set_meta_value('num_contigs', self.num_contigs)
    profile_db.db.set_meta_value('total_length', self.total_length)
    profile_db.disconnect()

    self.layer_additional_data['total_reads_mapped'] = self.num_reads_mapped
    self.layer_additional_keys.append('total_reads_mapped')
def init_mock_profile(self):
    """Initialize a 'blank' profile (no BAM file) from contigs-db information alone.

    Pulls contig names and lengths from `self.contigs_basic_info`, applies the
    min/max contig-length filters, reports the resulting numbers through
    `self.run`, and stores the summary values into the profile database.
    NOTE(review): assumes `self.contigs_basic_info`, `self.progress`,
    `self.run`, `self.output_directory`, `self.profile_db_path`,
    `self.layer_additional_data` and `self.layer_additional_keys` are set up
    by the caller -- confirm against `__init__`.
    """
    self.progress.new('Init')
    self.progress.update('...')
    self.num_reads_mapped = 0  # no BAM file, so nothing is mapped
    self.progress.end()

    self.contig_names = list(self.contigs_basic_info.keys())
    self.contig_lengths = [self.contigs_basic_info[contig_name]['length'] for contig_name in self.contigs_basic_info]
    self.total_length = sum(self.contig_lengths)
    self.num_contigs = len(self.contig_names)

    utils.check_contig_names(self.contig_names)

    self.run.info('input_bam', None)
    self.run.info('output_dir', self.output_directory, display_only=True)
    self.run.info('total_reads_mapped', pp(int(self.num_reads_mapped)))
    self.run.info('num_contigs', pp(self.num_contigs))

    # check for the -M parameter.
    # NOTE(review): presumably this updates num_contigs / num_splits /
    # total_length as a side effect, since they are reported below -- verify.
    self.remove_contigs_based_on_min_max_contig_length()

    self.run.info('num_contigs_after_M', self.num_contigs, display_only=True)
    self.run.info('num_contigs', self.num_contigs, quiet=True)
    self.run.info('num_splits', self.num_splits)
    self.run.info('total_length', self.total_length)

    # persist the summary values into the profile database
    profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
    profile_db.db.set_meta_value('num_splits', self.num_splits)
    profile_db.db.set_meta_value('num_contigs', self.num_contigs)
    profile_db.db.set_meta_value('total_length', self.total_length)
    profile_db.disconnect()

    self.layer_additional_data['total_reads_mapped'] = self.num_reads_mapped
    self.layer_additional_keys.append('total_reads_mapped')
def init_profile_from_BAM(self):
    """Initialize the profile from a BAM file.

    Opens the BAM via `bamops.BAMFileObject`, records contig names/lengths,
    optionally restricts them to `self.contig_names_of_interest`, applies the
    min/max contig-length filters, validates contig names against the contigs
    database, and stores summary values into the profile database.
    NOTE(review): assumes `self.input_file_path`, `self.run`, `self.progress`,
    `self.contig_names_of_interest`, `self.contig_names_in_contigs_db` and
    `self.profile_db_path` are set up by the caller -- confirm in `__init__`.
    """
    self.progress.new('Init')
    self.progress.update('Reading BAM File')
    self.bam = bamops.BAMFileObject(self.input_file_path, run=self.run, progress=self.progress).get()
    self.num_reads_mapped = self.bam.mapped
    self.progress.end()

    self.contig_names = self.bam.references
    self.contig_lengths = self.bam.lengths

    utils.check_contig_names(self.contig_names)

    self.run.info('input_bam', self.input_file_path)
    self.run.info('output_dir', self.output_directory, display_only=True)
    self.run.info('total_reads_mapped', pp(int(self.num_reads_mapped)))
    self.run.info('num_contigs', pp(len(self.contig_names)))

    if self.contig_names_of_interest:
        # keep only the contigs of interest that actually appear in the BAM
        indexes = [self.contig_names.index(r) for r in self.contig_names_of_interest if r in self.contig_names]
        self.contig_names = [self.contig_names[i] for i in indexes]
        self.contig_lengths = [self.contig_lengths[i] for i in indexes]
        self.run.info('num_contigs_selected_for_analysis', pp(len(self.contig_names)))

    # it brings good karma to let the user know what the hell is wrong with their data:
    self.check_contigs_without_any_gene_calls(self.contig_names)

    # check for the -M parameter.
    # NOTE(review): presumably this also sets num_contigs / num_splits /
    # total_length reported below -- verify against its definition.
    self.remove_contigs_based_on_min_max_contig_length()

    # let's see whether the user screwed up to follow the simple instructions
    # mentioned here: http://merenlab.org/2015/05/01/anvio-tutorial/#preparation
    for contig_name in self.contig_names:
        if contig_name not in self.contig_names_in_contigs_db:
            raise ConfigError("At least one contig name in your BAM file does not match contig names stored in the\
                               contigs database. For instance, this is one contig name found in your BAM file: '%s',\
                               and this is another one found in your contigs database: '%s'. You may be using an\
                               contigs database for profiling that has nothing to do with the BAM file you are\
                               trying to profile, or you may have failed to fix your contig names in your FASTA file\
                               prior to mapping, which is described here: %s"\
                                    % (contig_name, self.contig_names_in_contigs_db.pop(), 'http://goo.gl/Q9ChpS'))

    self.run.info('num_contigs_after_M', self.num_contigs, display_only=True)
    self.run.info('num_contigs', self.num_contigs, quiet=True)
    self.run.info('num_splits', self.num_splits)
    self.run.info('total_length', self.total_length)
    self.run.info('max_coverage_depth', pp(self.max_coverage_depth))

    # persist the summary values into the profile database
    profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
    profile_db.db.set_meta_value('num_splits', self.num_splits)
    profile_db.db.set_meta_value('num_contigs', self.num_contigs)
    profile_db.db.set_meta_value('total_length', self.total_length)
    profile_db.disconnect()

    self.layer_additional_data['total_reads_mapped'] = self.num_reads_mapped
    self.layer_additional_keys.append('total_reads_mapped')
def list_contigs(self):
    """List every contig in the BAM file, longest first, one per line.

    Each output line shows the contig name left-padded to 40 columns
    followed by its pretty-printed length.
    """
    import signal
    # Default SIGPIPE disposition lets `anvi-... | head` exit quietly
    # instead of dumping a broken-pipe traceback.
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    self.progress.new('Init')
    self.progress.update('Reading BAM File')
    self.bam = pysam.Samfile(self.input_file_path, 'rb')
    self.progress.end()

    self.contig_names = self.bam.references
    self.contig_lengths = self.bam.lengths

    utils.check_contig_names(self.contig_names)

    # sort (length, name) pairs descending so the longest contigs come first
    listing = sorted(zip(self.contig_lengths, self.contig_names), reverse=True)
    for length, contig_name in listing:
        print('%-40s %s' % (contig_name, pp(int(length))))
def init_profile_from_BAM(self):
    """Initialize the profile from a BAM file (legacy, Python-2-era version).

    Opens the BAM with pysam, records contig names/lengths, optionally
    restricts them to `self.contig_names_of_interest`, drops contigs shorter
    than -M, validates contig names against the contigs database, computes the
    contig-to-splits mapping from the contigs database, and stores summary
    values into the profile database.
    Raises ConfigError when the BAM is unreadable, unindexed, when no contig
    survives the -M filter, or when contig names mismatch the contigs db.
    """
    self.progress.new('Init')
    self.progress.update('Reading BAM File')
    try:
        self.bam = pysam.Samfile(self.input_file_path, 'rb')
    except ValueError as e:
        self.progress.end()
        # FIX: `raise E, "msg"` is Python-2-only syntax; the call form
        # `raise E("msg")` is equivalent and valid in Python 2 and 3.
        raise ConfigError('Are you sure "%s" is a BAM file? Because samtools is not happy with it: """%s"""' % (self.input_file_path, e))
    self.progress.end()

    self.contig_names = self.bam.references
    # NOTE(review): attribute name is misspelled ('lenghts'); kept as-is
    # because other code in this version of the class may read it -- rename
    # everywhere in one pass, not here.
    self.contig_lenghts = self.bam.lengths

    utils.check_contig_names(self.contig_names)

    try:
        # `.mapped` requires an index; pysam raises ValueError without one
        self.num_reads_mapped = self.bam.mapped
    except ValueError:
        raise ConfigError("It seems the BAM file is not indexed. See 'anvi-init-bam' script.")

    runinfo = self.generate_output_destination('RUNINFO')
    self.run.init_info_file_obj(runinfo)
    self.run.info('input_bam', self.input_file_path)
    self.run.info('output_dir', self.output_directory, display_only=True)
    self.run.info('total_reads_mapped', pp(int(self.num_reads_mapped)))
    self.run.info('num_contigs', pp(len(self.contig_names)))

    if self.contig_names_of_interest:
        # keep only the contigs of interest that actually appear in the BAM
        indexes = [self.contig_names.index(r) for r in self.contig_names_of_interest if r in self.contig_names]
        self.contig_names = [self.contig_names[i] for i in indexes]
        self.contig_lenghts = [self.contig_lenghts[i] for i in indexes]
        self.run.info('num_contigs_selected_for_analysis', pp(len(self.contig_names)))

    # it brings good karma to let the user know what the hell is wrong with their data:
    self.check_contigs_without_any_gene_calls(self.contig_names)

    # check for the -M parameter.
    contigs_longer_than_M = set()
    for i in range(0, len(self.contig_names)):
        if self.contig_lenghts[i] >= self.min_contig_length:
            contigs_longer_than_M.add(i)

    if not len(contigs_longer_than_M):
        # FIX: Python-2-only raise syntax replaced by the call form.
        raise ConfigError("0 contigs larger than %s nts." % pp(self.min_contig_length))
    else:
        self.contig_names = [self.contig_names[i] for i in contigs_longer_than_M]
        self.contig_lenghts = [self.contig_lenghts[i] for i in contigs_longer_than_M]
        self.num_contigs = len(self.contig_names)    # we will store these two
        self.total_length = sum(self.contig_lenghts) # into the db in a second.

    # let's see whether the user screwed up to follow the simple instructions
    # mentioned here: http://merenlab.org/2015/05/01/anvio-tutorial/#preparation
    for contig_name in self.contig_names:
        if contig_name not in self.contig_names_in_contigs_db:
            # FIX: Python-2-only raise syntax replaced by the call form.
            raise ConfigError("At least one contig name in your BAM file does not match contig names stored in the\
                               contigs database. For instance, this is one contig name found in your BAM file: '%s',\
                               and this is another one found in your contigs database: '%s'. You may be using an\
                               contigs database for profiling that has nothing to do with the BAM file you are\
                               trying to profile, or you may have failed to fix your contig names in your FASTA file\
                               prior to mapping, which is described here: %s"\
                                    % (contig_name, self.contig_names_in_contigs_db.pop(), 'http://goo.gl/Q9ChpS'))

    # finally, compute contig splits.
    contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
    self.splits_in_contigs_db = contigs_db.db.get_table_as_dict(t.splits_info_table_name)
    contigs_db.disconnect()

    contigs_longer_than_M = set(self.contig_names) # for fast access
    self.split_names = set([])
    self.contig_name_to_splits = {}
    for split_name in sorted(self.splits_in_contigs_db.keys()):
        parent = self.splits_in_contigs_db[split_name]['parent']

        if parent not in contigs_longer_than_M:
            continue

        self.split_names.add(split_name)

        # FIX: dict.has_key() was removed in Python 3; `key in dict` is the
        # equivalent form in both Python 2 and 3.
        if parent in self.contig_name_to_splits:
            self.contig_name_to_splits[parent].append(split_name)
        else:
            self.contig_name_to_splits[parent] = [split_name]

    # we just recovered number of splits that are coming from contigs
    # longer than M:
    self.num_splits = len(self.split_names)

    self.run.info('num_contigs_after_M', self.num_contigs, display_only=True)
    self.run.info('num_contigs', self.num_contigs, quiet=True)
    self.run.info('num_splits', self.num_splits)
    self.run.info('total_length', self.total_length)

    # persist the summary values into the profile database
    profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
    profile_db.db.set_meta_value('num_splits', self.num_splits)
    profile_db.db.set_meta_value('num_contigs', self.num_contigs)
    profile_db.db.set_meta_value('total_length', self.total_length)
    profile_db.db.set_meta_value('total_reads_mapped', int(self.num_reads_mapped))
    profile_db.disconnect()
def init_profile_from_BAM(self):
    """Initialize the profile from a BAM file (bamops version with inline -M filter).

    Opens the BAM via `bamops.BAMFileObject`, records contig names/lengths,
    optionally restricts them to `self.contig_names_of_interest`, drops
    contigs shorter than -M, validates contig names against the contigs
    database, computes the contig-to-splits mapping from
    `self.splits_basic_info`, and stores summary values into the profile
    database.
    Raises ConfigError when no contig survives the -M filter or when contig
    names mismatch the contigs db.
    """
    self.progress.new('Init')
    self.progress.update('Reading BAM File')
    self.bam = bamops.BAMFileObject(self.input_file_path, run=self.run, progress=self.progress).get()
    self.num_reads_mapped = self.bam.mapped
    self.progress.end()

    self.contig_names = self.bam.references
    self.contig_lengths = self.bam.lengths

    utils.check_contig_names(self.contig_names)

    runinfo = self.generate_output_destination('RUNINFO')
    self.run.init_info_file_obj(runinfo)
    self.run.info('input_bam', self.input_file_path)
    self.run.info('output_dir', self.output_directory, display_only=True)
    self.run.info('total_reads_mapped', pp(int(self.num_reads_mapped)))
    self.run.info('num_contigs', pp(len(self.contig_names)))

    if self.contig_names_of_interest:
        # keep only the contigs of interest that actually appear in the BAM
        indexes = [self.contig_names.index(r) for r in self.contig_names_of_interest if r in self.contig_names]
        self.contig_names = [self.contig_names[i] for i in indexes]
        self.contig_lengths = [self.contig_lengths[i] for i in indexes]
        self.run.info('num_contigs_selected_for_analysis', pp(len(self.contig_names)))

    # it brings good karma to let the user know what the hell is wrong with their data:
    self.check_contigs_without_any_gene_calls(self.contig_names)

    # check for the -M parameter.
    contigs_longer_than_M = set()
    for i in range(0, len(self.contig_names)):
        if self.contig_lengths[i] >= self.min_contig_length:
            contigs_longer_than_M.add(i)

    if not len(contigs_longer_than_M):
        # FIX: `raise E, "msg"` is Python-2-only syntax; the call form
        # `raise E("msg")` is equivalent and valid in Python 2 and 3.
        raise ConfigError("0 contigs larger than %s nts." % pp(self.min_contig_length))
    else:
        self.contig_names = [self.contig_names[i] for i in contigs_longer_than_M]
        self.contig_lengths = [self.contig_lengths[i] for i in contigs_longer_than_M]
        self.num_contigs = len(self.contig_names)    # we will store these two
        self.total_length = sum(self.contig_lengths) # into the db in a second.

    # let's see whether the user screwed up to follow the simple instructions
    # mentioned here: http://merenlab.org/2015/05/01/anvio-tutorial/#preparation
    for contig_name in self.contig_names:
        if contig_name not in self.contig_names_in_contigs_db:
            # FIX: Python-2-only raise syntax replaced by the call form.
            raise ConfigError("At least one contig name in your BAM file does not match contig names stored in the\
                               contigs database. For instance, this is one contig name found in your BAM file: '%s',\
                               and this is another one found in your contigs database: '%s'. You may be using an\
                               contigs database for profiling that has nothing to do with the BAM file you are\
                               trying to profile, or you may have failed to fix your contig names in your FASTA file\
                               prior to mapping, which is described here: %s"\
                                    % (contig_name, self.contig_names_in_contigs_db.pop(), 'http://goo.gl/Q9ChpS'))

    contigs_longer_than_M = set(self.contig_names) # for fast access
    self.split_names = set([])
    self.contig_name_to_splits = {}
    for split_name in sorted(self.splits_basic_info.keys()):
        parent = self.splits_basic_info[split_name]['parent']

        if parent not in contigs_longer_than_M:
            continue

        self.split_names.add(split_name)

        # FIX: dict.has_key() was removed in Python 3; `key in dict` is the
        # equivalent form in both Python 2 and 3.
        if parent in self.contig_name_to_splits:
            self.contig_name_to_splits[parent].append(split_name)
        else:
            self.contig_name_to_splits[parent] = [split_name]

    # we just recovered number of splits that are coming from contigs
    # longer than M:
    self.num_splits = len(self.split_names)

    self.run.info('num_contigs_after_M', self.num_contigs, display_only=True)
    self.run.info('num_contigs', self.num_contigs, quiet=True)
    self.run.info('num_splits', self.num_splits)
    self.run.info('total_length', self.total_length)

    # persist the summary values into the profile database
    profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
    profile_db.db.set_meta_value('num_splits', self.num_splits)
    profile_db.db.set_meta_value('num_contigs', self.num_contigs)
    profile_db.db.set_meta_value('total_length', self.total_length)
    profile_db.db.set_meta_value('total_reads_mapped', int(self.num_reads_mapped))
    profile_db.disconnect()