def __init__(self, args=None):
    """Set profiler defaults and optionally override them from parsed arguments.

    Parameters:
        args: optional namespace-like object. When truthy, the attributes read
              below (`input_file`, `annotation_db_path`, `profile`,
              `output_directory`, `list_contigs`, `min_contig_length`,
              `min_mean_coverage`, `min_coverage_for_variability`,
              `cluster_contigs`, `sample_id`, `report_variability_full`,
              `overwrite_output_destinations`, `contigs_of_interest`) replace
              the defaults.

    Raises:
        ConfigError: if `args.contigs_of_interest` names a file that does not exist.
    """
    self.args = None
    self.input_file_path = None
    self.annotation_db_path = None
    self.serialized_profile_path = None
    self.split_length = 20000
    self.output_directory = None
    self.list_contigs_and_exit = None
    self.min_contig_length = 10000
    self.min_mean_coverage = 0
    # if a nucleotide position is covered less than this, don't bother
    self.min_coverage_for_variability = 10
    self.contig_names_of_interest = None
    self.contigs_shall_be_clustered = False
    # don't apply any noise filtering, and simply report ALL base frequencies
    self.report_variability_full = False
    self.overwrite_output_destinations = False

    # these used to be assigned only when `args` was given, which left instances
    # created without arguments missing the attributes entirely.
    self.sample_id = None
    self.number_of_threads = 4
    self.no_trehading = True  # (sic) attribute name kept as is for compatibility

    if args:
        self.args = args
        self.input_file_path = args.input_file
        self.annotation_db_path = args.annotation_db_path
        self.serialized_profile_path = args.profile
        self.output_directory = args.output_directory
        self.list_contigs_and_exit = args.list_contigs
        self.min_contig_length = args.min_contig_length
        self.min_mean_coverage = args.min_mean_coverage
        self.min_coverage_for_variability = args.min_coverage_for_variability
        self.contigs_shall_be_clustered = args.cluster_contigs
        self.sample_id = args.sample_id
        self.report_variability_full = args.report_variability_full
        self.overwrite_output_destinations = args.overwrite_output_destinations

        if args.contigs_of_interest:
            if not os.path.exists(args.contigs_of_interest):
                raise ConfigError("Contigs file (%s) is missing..." % (args.contigs_of_interest))

            # one contig name per line; blank lines and lines starting with '#'
            # are ignored. the context manager makes sure the handle is closed.
            with open(args.contigs_of_interest) as contigs_file:
                self.contig_names_of_interest = set(
                    line.strip() for line in contigs_file if line.strip() and not line.startswith("#")
                )

    self.bam = None
    self.contigs = {}
    self.genes_in_contigs = {}
    self.contig_names_in_annotation_db = None

    self.database_paths = {"ANNOTATION.db": self.annotation_db_path}

    self.profile_db_path = None

    self.clustering_configs = constants.clustering_configs["single"]

    self.progress = terminal.Progress()
    self.run = terminal.Run(width=35)

    self.metadata = Metadata(self.progress)

    # following variable will be populated during the profiling, and its content will eventually
    # be stored in t.variable_positions_table_name
    self.variable_positions_table_entries = []
class BAMProfiler:
    """Creates an uber class for BAM file operations.

    An instance profiles one sample, either from a BAM file
    (`input_file_path`) or from a previously serialized profile
    (`serialized_profile_path`), against an annotation database, and writes
    the results into an output directory. `_run()` is the entry point that
    drives the other methods.
    """

    def __init__(self, args=None):
        """Set profiler defaults and optionally override them from parsed arguments.

        Parameters:
            args: optional namespace-like object. When truthy, the attributes
                  read below (`input_file`, `annotation_db_path`, `profile`,
                  `output_directory`, `list_contigs`, `min_contig_length`,
                  `min_mean_coverage`, `min_coverage_for_variability`,
                  `cluster_contigs`, `sample_id`, `report_variability_full`,
                  `overwrite_output_destinations`, `contigs_of_interest`)
                  replace the defaults.

        Raises:
            ConfigError: if `args.contigs_of_interest` names a file that does not exist.
        """
        self.args = None
        self.input_file_path = None
        self.annotation_db_path = None
        self.serialized_profile_path = None
        self.split_length = 20000
        self.output_directory = None
        self.list_contigs_and_exit = None
        self.min_contig_length = 10000
        self.min_mean_coverage = 0
        # if a nucleotide position is covered less than this, don't bother
        self.min_coverage_for_variability = 10
        self.contig_names_of_interest = None
        self.contigs_shall_be_clustered = False
        # don't apply any noise filtering, and simply report ALL base frequencies
        self.report_variability_full = False
        self.overwrite_output_destinations = False

        # these used to be assigned only when `args` was given (or only while profiling a
        # BAM file, in the case of split_names), which made other code paths fail with
        # AttributeError.
        self.sample_id = None
        self.number_of_threads = 4
        self.no_trehading = True  # (sic) attribute name kept as is for compatibility
        self.split_names = set()

        if args:
            self.args = args
            self.input_file_path = args.input_file
            self.annotation_db_path = args.annotation_db_path
            self.serialized_profile_path = args.profile
            self.output_directory = args.output_directory
            self.list_contigs_and_exit = args.list_contigs
            self.min_contig_length = args.min_contig_length
            self.min_mean_coverage = args.min_mean_coverage
            self.min_coverage_for_variability = args.min_coverage_for_variability
            self.contigs_shall_be_clustered = args.cluster_contigs
            self.sample_id = args.sample_id
            self.report_variability_full = args.report_variability_full
            self.overwrite_output_destinations = args.overwrite_output_destinations

            if args.contigs_of_interest:
                if not os.path.exists(args.contigs_of_interest):
                    raise ConfigError("Contigs file (%s) is missing..." % (args.contigs_of_interest))

                # one contig name per line; blank lines and lines starting with '#'
                # are ignored. the context manager makes sure the handle is closed.
                with open(args.contigs_of_interest) as contigs_file:
                    self.contig_names_of_interest = set(
                        line.strip() for line in contigs_file if line.strip() and not line.startswith("#")
                    )

        self.bam = None
        self.contigs = {}
        self.genes_in_contigs = {}
        self.contig_names_in_annotation_db = None

        self.database_paths = {"ANNOTATION.db": self.annotation_db_path}

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs["single"]

        self.progress = terminal.Progress()
        self.run = terminal.Run(width=35)

        self.metadata = Metadata(self.progress)

        # following variable will be populated during the profiling, and its content will eventually
        # be stored in t.variable_positions_table_name
        self.variable_positions_table_entries = []

    def init_dirs_and_dbs(self):
        """Prepare the output directory and create the single profile database.

        Reads `split_length`, `annotation_hash` and the contig names from the
        annotation database, then creates PROFILE.db in the output directory
        with the meta values of this run.

        Raises:
            ConfigError: if no annotation database path was given.
        """
        if not self.annotation_db_path:
            raise ConfigError(
                "You can not run profiling without an annotation database. You can create "
                "one using 'anvi-gen-annotation-database'. Not sure how? Please see the "
                "user manual."
            )

        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory or self.input_file_path + "-ANVIO_PROFILE",
            ok_if_exists=self.overwrite_output_destinations,
        )

        self.progress.new("Initializing")

        self.progress.update("Creating the output directory ...")
        filesnpaths.gen_output_directory(
            self.output_directory, self.progress, delete_if_exists=self.overwrite_output_destinations
        )

        self.progress.update("Initializing the annotation database ...")
        annotation_db = dbops.AnnotationDatabase(self.annotation_db_path)
        self.split_length = int(annotation_db.meta["split_length"])
        self.annotation_hash = annotation_db.meta["annotation_hash"]
        self.contig_names_in_annotation_db = set(
            annotation_db.db.get_table_as_dict(t.contigs_info_table_name, string_the_key=True).keys()
        )
        annotation_db.disconnect()

        self.progress.update(
            'Creating a new single profile database with annotation hash "%s" ...' % self.annotation_hash
        )
        self.profile_db_path = self.generate_output_destination("PROFILE.db")
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        meta_values = {
            "db_type": "profile",
            "sample_id": self.sample_id,
            "samples": self.sample_id,
            "merged": False,
            "contigs_clustered": self.contigs_shall_be_clustered,
            "min_coverage_for_variability": self.min_coverage_for_variability,
            "default_view": "single",
            "min_contig_length": self.min_contig_length,
            "report_variability_full": self.report_variability_full,
            "annotation_hash": self.annotation_hash,
        }
        profile_db.create(meta_values)

        self.progress.end()

    def _run(self):
        """Run the whole profiling workflow for one sample.

        Validates the arguments, resolves the sample id, optionally lists the
        contigs and exits, and otherwise profiles (or loads) the contigs and
        stores every result in the output directory and the profile database.
        """
        self.check_args()

        self.set_sample_id()

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        self.init_dirs_and_dbs()

        # we will set up things here so the information in the annotation_db
        # can be utilized directly from within the contigs for loop. contig to
        # gene associations will be stored in self.genes_in_contigs dictionary for
        # fast access.
        if self.annotation_db_path:
            self.populate_genes_in_contigs_dict()

        self.run.info("anvio", anvio.__version__)
        self.run.info("profiler_version", anvio.__profile__version__)
        self.run.info("sample_id", self.sample_id)
        self.run.info("profile_db", self.profile_db_path, display_only=True)
        self.run.info("annotation_db", bool(self.annotation_db_path))
        self.run.info("annotation_hash", self.annotation_hash)
        self.run.info("cmd_line", utils.get_cmd_line())
        self.run.info("merged", False)
        self.run.info("split_length", self.split_length)
        self.run.info("min_contig_length", self.min_contig_length)
        self.run.info("min_mean_coverage", self.min_mean_coverage)
        self.run.info("clustering_performed", self.contigs_shall_be_clustered)
        self.run.info("min_coverage_for_variability", self.min_coverage_for_variability)
        self.run.info("report_variability_full", self.report_variability_full)

        # this is kinda important. we do not run full-blown profile function if we are dealing with a
        # summarized profile...
        if self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
            self.store_profile()
            self.store_summarized_profile_for_each_split()
        else:
            self.init_serialized_profile()
            self.store_summarized_profile_for_each_split()

        self.generate_variabile_positions_table()
        self.generate_gene_coverages_table()

        # here we store both metadata and TNF information into the database:
        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        self.metadata.store_metadata_for_contigs_and_splits(self.sample_id, self.contigs, profile_db.db)
        profile_db.disconnect()

        # the only view for the single PROFILE database is ready, and already
        # set as the default view. store the info in the db:
        views_table = dbops.TableForViews(self.profile_db_path, anvio.__profile__version__)
        views_table.append("single", "metadata_splits")
        views_table.store()

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        runinfo_serialized = self.generate_output_destination("RUNINFO.cp")
        self.run.info("runinfo", runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized, strip_prefix=self.output_directory)

        self.run.quit()

    def populate_genes_in_contigs_dict(self):
        """Fill `self.genes_in_contigs` from the genes-in-contigs table.

        Each contig name maps to a set of `(gene, start, stop)` tuples.
        """
        self.progress.new("Annotation")
        self.progress.update("Reading genes in contigs table")
        annotation_db = dbops.AnnotationDatabase(self.annotation_db_path)
        genes_in_contigs_table = annotation_db.db.get_table_as_dict(
            t.genes_contigs_table_name, t.genes_contigs_table_structure
        )
        annotation_db.disconnect()

        self.progress.update("Populating ORFs dictionary for each contig ...")
        for gene in genes_in_contigs_table:
            e = genes_in_contigs_table[gene]
            self.genes_in_contigs.setdefault(e["contig"], set()).add((gene, e["start"], e["stop"]))

        self.progress.end()
        self.run.info(
            "annotation_db", "%d genes processed successfully." % len(genes_in_contigs_table), display_only=True
        )

    def generate_variabile_positions_table(self):
        """Store the column profiles of every split in the variability table.

        Each column profile is tagged with this run's sample id before it is
        appended to the table.
        """
        variable_positions_table = dbops.TableForVariability(
            self.profile_db_path, anvio.__profile__version__, progress=self.progress
        )

        self.progress.new("Storing variability information")
        for contig in self.contigs.values():
            for split in contig.splits:
                for column_profile in split.column_profiles.values():
                    column_profile["sample_id"] = self.sample_id
                    variable_positions_table.append(column_profile)

        variable_positions_table.store()
        self.progress.end()
        self.run.info("variable_positions_table", True, quiet=True)

    def generate_gene_coverages_table(self):
        """Analyze gene coverages for every contig that has annotated genes and store them."""
        gene_coverages_table = dbops.TableForGeneCoverages(
            self.profile_db_path, anvio.__profile__version__, progress=self.progress
        )

        self.progress.new("Profiling genes")
        num_contigs = len(self.contigs)
        for i, contig in enumerate(self.contigs):
            self.progress.update("Processing contig %d of %d" % (i + 1, num_contigs))
            # if no open reading frames were found in a contig, it wouldn't have an entry in the annotation
            # table, therefore there wouldn't be any record of it in contig_ORFs; so we better check ourselves
            # before we wreck ourselves and the ultimately the analysis of this poor user:
            if contig in self.genes_in_contigs:
                gene_coverages_table.analyze_contig(
                    self.contigs[contig], self.sample_id, self.genes_in_contigs[contig]
                )

        gene_coverages_table.store()
        self.progress.end()
        self.run.info("gene_coverages_table", True, quiet=True)

    def set_sample_id(self):
        """Validate the given sample id, or derive one from the input path.

        For a BAM file the id is the upper-cased file name up to '.BAM', with
        '-' and '.' replaced by '_' and an 's' prefix if it starts with a
        digit. For a serialized profile it is the name of the directory that
        contains the profile.
        """
        if self.sample_id:
            utils.check_sample_id(self.sample_id)
            return

        if self.input_file_path:
            self.input_file_path = os.path.abspath(self.input_file_path)
            self.sample_id = os.path.basename(self.input_file_path).upper().split(".BAM")[0]
            self.sample_id = self.sample_id.replace("-", "_")
            self.sample_id = self.sample_id.replace(".", "_")
            if self.sample_id[0] in constants.digits:
                self.sample_id = "s" + self.sample_id
            utils.check_sample_id(self.sample_id)

        if self.serialized_profile_path:
            self.serialized_profile_path = os.path.abspath(self.serialized_profile_path)
            self.sample_id = os.path.basename(os.path.dirname(self.serialized_profile_path))

    def check_contigs_without_any_ORFs(self, contig_names):
        """Warn the user if some of `contig_names` have no genes in the annotation.

        Does nothing when there is no annotation database or when every contig
        has an entry in `self.genes_in_contigs`.

        Parameters:
            contig_names: iterable of contig names to check.
        """
        if not self.annotation_db_path:
            return

        contig_names = set(contig_names)
        contigs_without_annotation = [c for c in contig_names if c not in self.genes_in_contigs]

        if not contigs_without_annotation:
            return

        import random

        num_missing = len(contigs_without_annotation)
        # the message template already says "there", so only the verb and the count go here
        # (the singular form used to render as "there there is one contig").
        if num_missing > 1:
            how_many = "are %d contigs" % num_missing
        else:
            how_many = "is one contig"

        self.run.warning(
            "You have instructed profiling to use an annotation database, "
            "however, there %s in your BAM file that did not get annotated. Which means "
            "whatever method you used to identify open reading frames in these contigs "
            "failed to find any open reading frames in those. Which may be normal "
            "(a) if your contigs are very short, or (b) if your gene finder is not "
            "capable of dealing with your stuff. If you know what you are doing, that "
            "is fine. Otherwise please double check. Here is one contig missing "
            'annotation if you would like to play: "%s"' % (how_many, random.choice(contigs_without_annotation))
        )

    def init_serialized_profile(self):
        """Load contigs from a serialized profile and apply the contig filters.

        Contigs that are not in `contig_names_of_interest` (when given) and
        contigs shorter than `min_contig_length` are removed.

        Raises:
            ConfigError: via `check_contigs()` if no contigs are left.
        """
        self.progress.new("Init")
        self.progress.update("Reading serialized profile")
        self.contigs = dictio.read_serialized_object(self.serialized_profile_path)
        self.progress.end()

        self.run.info("profile_loaded_from", self.serialized_profile_path)
        self.run.info("num_contigs", pp(len(self.contigs)))

        if self.contig_names_of_interest:
            contigs_to_discard = set(c for c in self.contigs if c not in self.contig_names_of_interest)

            for contig in contigs_to_discard:
                self.contigs.pop(contig)

            self.run.info("num_contigs_selected_for_analysis", pp(len(self.contigs)))

        self.check_contigs()

        # it brings good karma to let the user know what the hell is wrong with their data:
        self.check_contigs_without_any_ORFs(self.contigs.keys())

        contigs_to_discard = set(
            contig.name for contig in self.contigs.values() if contig.length < self.min_contig_length
        )

        if contigs_to_discard:
            for contig in contigs_to_discard:
                self.contigs.pop(contig)
            self.run.info("contigs_raw_longer_than_M", len(self.contigs))

        self.check_contigs()

        # cluster_contigs() restricts clustering to self.split_names, which used to be set only
        # when profiling from a BAM file; collect the names of the splits we ended up with.
        self.split_names = set(split.name for contig in self.contigs.values() for split in contig.splits)

    def list_contigs(self):
        """Print contig names and lengths found in the BAM file or the serialized profile."""
        import signal

        # restore the default SIGPIPE behavior, presumably so piping the listing into tools
        # such as `head` ends quietly.
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

        if self.input_file_path:
            self.progress.new("Init")
            self.progress.update("Reading BAM File")
            self.bam = pysam.Samfile(self.input_file_path, "rb")
            self.progress.end()

            self.contig_names = self.bam.references
            self.contig_lenghts = self.bam.lengths

            utils.check_contig_names(self.contig_names)

            for tpl in sorted(zip(self.contig_lenghts, self.contig_names), reverse=True):
                print("%-40s %s" % (tpl[1], pp(int(tpl[0]))))
        else:
            self.progress.new("Init")
            self.progress.update("Reading serialized profile")
            self.contigs = dictio.read_serialized_object(self.serialized_profile_path)
            self.progress.end()

            self.run.info("profile_loaded_from", self.serialized_profile_path)
            self.run.info("num_contigs", pp(len(self.contigs)))

            # NOTE(review): this listing is sorted ascending while the BAM listing above is
            # sorted descending -- confirm whether the difference is intended.
            for tpl in sorted([(int(self.contigs[contig].length), contig) for contig in self.contigs]):
                print("%-40s %s" % (tpl[1], pp(int(tpl[0]))))

    def init_profile_from_BAM(self):
        """Open the BAM file, select the contigs to profile and work out their splits.

        Applies the contigs-of-interest and minimum-length filters, makes sure
        every remaining contig is known to the annotation database, maps
        contigs to their splits, and stores the summary numbers in the profile
        database.

        Raises:
            ConfigError: if the file is not a readable BAM file, is not
                indexed, has no contig of at least `min_contig_length`, or
                contains a contig name missing from the annotation database.
        """
        self.progress.new("Init")
        self.progress.update("Reading BAM File")
        try:
            self.bam = pysam.Samfile(self.input_file_path, "rb")
        except ValueError as e:
            self.progress.end()
            raise ConfigError(
                'Are you sure "%s" is a BAM file? Because samtools is not happy with it: """%s"""'
                % (self.input_file_path, e)
            )
        self.progress.end()

        self.contig_names = self.bam.references
        self.contig_lenghts = self.bam.lengths

        utils.check_contig_names(self.contig_names)

        try:
            self.num_reads_mapped = self.bam.mapped
        except ValueError:
            raise ConfigError("It seems the BAM file is not indexed. See 'anvi-init-bam' script.")

        runinfo = self.generate_output_destination("RUNINFO")
        self.run.init_info_file_obj(runinfo)
        self.run.info("input_bam", self.input_file_path)
        self.run.info("output_dir", self.output_directory, display_only=True)
        self.run.info("total_reads_mapped", pp(int(self.num_reads_mapped)))
        self.run.info("num_contigs", pp(len(self.contig_names)))

        if self.contig_names_of_interest:
            # single pass over the BAM contigs: keeps their order in the BAM file and avoids
            # one linear `.index()` search per contig of interest.
            indexes = [i for i, name in enumerate(self.contig_names) if name in self.contig_names_of_interest]
            self.contig_names = [self.contig_names[i] for i in indexes]
            self.contig_lenghts = [self.contig_lenghts[i] for i in indexes]
            self.run.info("num_contigs_selected_for_analysis", pp(len(self.contig_names)))

        # it brings good karma to let the user know what the hell is wrong with their data:
        self.check_contigs_without_any_ORFs(self.contig_names)

        # check for the -M parameter.
        indexes_longer_than_M = [
            i for i, length in enumerate(self.contig_lenghts) if length >= self.min_contig_length
        ]

        if not indexes_longer_than_M:
            raise ConfigError("0 contigs larger than %s nts." % pp(self.min_contig_length))

        self.contig_names = [self.contig_names[i] for i in indexes_longer_than_M]
        self.contig_lenghts = [self.contig_lenghts[i] for i in indexes_longer_than_M]
        self.num_contigs = len(self.contig_names)  # we will store these two
        self.total_length = sum(self.contig_lenghts)  # into the db in a second.

        # let's see whether the user screwed up to follow the simple instructions
        # mentioned here: http://merenlab.org/2015/05/01/anvio-tutorial/#preparation
        for contig_name in self.contig_names:
            if contig_name not in self.contig_names_in_annotation_db:
                # peek at one name without removing it from the set; `set.pop()` would mutate
                # the set and raise KeyError (hiding this error) if the set were empty.
                example_in_db = next(iter(self.contig_names_in_annotation_db), None)
                raise ConfigError(
                    "At least one contig name in your BAM file does not match contig names stored in the "
                    "annotation database. For instance, this is one contig name found in your BAM file: '%s', "
                    "and this is another one found in your annotation database: '%s'. You may be using an "
                    "annotation database for profiling that has nothing to do with the BAM file you are "
                    "trying to profile, or you may have failed to fix your contig names in your FASTA file "
                    "prior to mapping, which is described here: %s"
                    % (contig_name, example_in_db, "http://goo.gl/Q9ChpS")
                )

        # finally, compute contig splits.
        annotation_db = dbops.AnnotationDatabase(self.annotation_db_path)
        self.splits_in_annotation_db = annotation_db.db.get_table_as_dict(t.splits_info_table_name)
        annotation_db.disconnect()

        contigs_longer_than_M = set(self.contig_names)  # for fast access
        self.split_names = set([])
        self.contig_name_to_splits = {}
        for split_name in self.splits_in_annotation_db:
            parent = self.splits_in_annotation_db[split_name]["parent"]

            if parent not in contigs_longer_than_M:
                continue

            self.split_names.add(split_name)
            self.contig_name_to_splits.setdefault(parent, []).append(split_name)

        # we just recovered number of splits that are coming from contigs
        # longer than M:
        self.num_splits = len(self.split_names)

        self.run.info("num_contigs_after_M", self.num_contigs, display_only=True)
        self.run.info("num_contigs", self.num_contigs, quiet=True)
        self.run.info("num_splits", self.num_splits)
        self.run.info("total_length", self.total_length)

        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        profile_db.db.set_meta_value("num_splits", self.num_splits)
        profile_db.db.set_meta_value("num_contigs", self.num_contigs)
        profile_db.db.set_meta_value("total_length", self.total_length)
        profile_db.db.set_meta_value("total_reads_mapped", int(self.num_reads_mapped))
        profile_db.disconnect()

    def generate_output_destination(self, postfix, directory=False):
        """Return the path of `postfix` inside the output directory.

        Parameters:
            postfix: file or directory name relative to the output directory.
            directory: when True, (re)create the path as an empty directory,
                       removing whatever directory tree already exists there.

        Returns:
            The joined path.
        """
        return_path = os.path.join(self.output_directory, postfix)

        if directory == True:
            if os.path.exists(return_path):
                shutil.rmtree(return_path)
            os.makedirs(return_path)

        return return_path

    def profile(self):
        """Big deal function.

        Builds a Contig (with its Split objects) for every selected contig,
        analyzes coverage, drops contigs whose mean coverage is below
        `min_mean_coverage`, and runs the auxiliary analysis on the rest. The
        surviving contigs end up in `self.contigs`.

        Raises:
            ConfigError: via `check_contigs()` if no contig survives.
        """
        # this has to live outside of the loop: it used to be re-created for every contig, so
        # it only ever knew about the last one (and did not exist at all without contigs).
        discarded_contigs_due_to_C = set([])

        num_contigs = len(self.contig_names)

        # So we start with essential stats. In the section below, we will simply go through each contig
        # in the BAM file and populate the contigs dictionary for the first time.
        for i, contig_name in enumerate(self.contig_names):
            contig = Contig(contig_name)
            contig.length = self.contig_lenghts[i]
            contig.split_length = self.split_length
            contig.min_coverage_for_variability = self.min_coverage_for_variability
            contig.report_variability_full = self.report_variability_full

            self.progress.new(
                'Profiling "%s" (%d of %d) (%s nts)' % (contig.name, i + 1, num_contigs, pp(int(contig.length)))
            )

            # populate contig with empty split objects
            for split_name in self.contig_name_to_splits[contig_name]:
                s = self.splits_in_annotation_db[split_name]
                split = Split(split_name, contig_name, s["order_in_parent"], s["start"], s["end"])
                contig.splits.append(split)

            # analyze coverage for each split
            contig.analyze_coverage(self.bam, self.progress)

            # test the mean coverage of the contig.
            if contig.coverage.mean < self.min_mean_coverage:
                # discard this contig and continue
                discarded_contigs_due_to_C.add(contig.name)
                self.progress.end()
                continue

            contig.analyze_auxiliary(self.bam, self.progress)

            self.progress.end()

            # add contig to the dict.
            self.contigs[contig_name] = contig

        if discarded_contigs_due_to_C:
            self.run.info("contigs_after_C", pp(len(self.contigs)))

        # set contig abundance
        set_contigs_abundance(self.contigs)

        self.check_contigs()

    def store_profile(self):
        """Serialize `self.contigs` into PROFILE.cp in the output directory."""
        output_file = self.generate_output_destination("PROFILE.cp")
        self.progress.new("Storing Profile")
        self.progress.update("Serializing information for %s contigs ..." % pp(len(self.contigs)))
        dictio.write_serialized_object(self.contigs, output_file)
        self.progress.end()
        self.run.info("profile_dict", output_file)

    def store_summarized_profile_for_each_split(self):
        """Write one summary file per split, plus an index of split name to file.

        Summary files go into a freshly created SUMMARY directory and are
        named after a running, zero-padded split counter. The index is stored
        in SUMMARY.cp with the output directory prefix stripped from its paths.
        """
        summary_index = {}
        summary_index_output_path = self.generate_output_destination("SUMMARY.cp")
        summary_dir = self.generate_output_destination("SUMMARY", directory=True)
        self.progress.new("Storing summary files")

        num_contigs = len(self.contigs)
        counter = 1  # counts splits, and names the summary file of each split

        for contig_no, contig in enumerate(self.contigs):
            # report the contig number; the split counter used to be printed here as "contig N"
            self.progress.update("working on contig %s of %s" % (pp(contig_no + 1), pp(num_contigs)))
            for split in self.contigs[contig].splits:
                # summary_dir already lives under the output directory; joining it with the
                # output directory once more would duplicate a relative output directory.
                split_summary_path = os.path.join(summary_dir, "%.6d.cp" % counter)
                dictio.write_serialized_object(
                    {
                        self.sample_id: {
                            "coverage": split.coverage.c,
                            "variability": split.auxiliary.v,
                            "competing_nucleotides": split.auxiliary.competing_nucleotides,
                        }
                    },
                    split_summary_path,
                )
                summary_index[split.name] = split_summary_path
                counter += 1

        self.progress.end()
        self.run.info("profile_summary_dir", summary_dir)
        dictio.write_serialized_object(
            dictio.strip_prefix_from_dict_values(summary_index, self.output_directory), summary_index_output_path
        )
        self.run.info("profile_summary_index", summary_index_output_path)

    def check_contigs(self):
        """Raise ConfigError if there are no contigs left to work with."""
        if not len(self.contigs):
            raise ConfigError("0 contigs to work with. Bye.")

    def cluster_contigs(self):
        """Run every configured clustering and store the resulting trees in the profile database.

        A clustering that fails is reported with a warning and skipped, so the
        remaining configurations still run.
        """
        for config_name in self.clustering_configs:
            config_path = self.clustering_configs[config_name]

            config = ClusteringConfiguration(
                config_path, self.output_directory, db_paths=self.database_paths, row_ids_of_interest=self.split_names
            )

            try:
                newick = clustering.order_contigs_simple(config, progress=self.progress)
            except Exception as e:
                # best effort on purpose: report the failure and move on to the next config.
                self.run.warning('Clustering has failed for "%s": "%s"' % (config_name, e))
                self.progress.end()
                continue

            dbops.add_hierarchical_clustering_to_db(
                self.profile_db_path,
                config_name,
                newick,
                make_default=config_name == constants.single_default,
                run=self.run,
            )

    def check_args(self):
        """Validate the combination of input paths and the numeric thresholds.

        Raises:
            ConfigError: if neither or both of the input sources are declared,
                a serialized profile is given without an output directory, a
                declared input does not exist, or a threshold is negative.
        """
        if (not self.input_file_path) and (not self.serialized_profile_path):
            raise ConfigError(
                "You must declare either an input file, or a serialized profile. Use '--help' "
                "to learn more about the command line parameters."
            )
        if self.input_file_path and self.serialized_profile_path:
            raise ConfigError("You can't declare both an input file and a serialized profile.")
        if self.serialized_profile_path and (not self.output_directory):
            raise ConfigError("When loading serialized profiles, you need to declare an output directory.")
        if self.input_file_path and not os.path.exists(self.input_file_path):
            raise ConfigError("No such file: '%s'" % self.input_file_path)
        if self.serialized_profile_path and not os.path.exists(self.serialized_profile_path):
            raise ConfigError("No such file: '%s'" % self.serialized_profile_path)
        if not self.min_coverage_for_variability >= 0:
            raise ConfigError("Minimum coverage for variability must be 0 or larger.")
        if not self.min_mean_coverage >= 0:
            raise ConfigError("Minimum mean coverage must be 0 or larger.")
        if not self.min_contig_length >= 0:
            raise ConfigError("Minimum contig length must be 0 or larger.")