def sanity_check(self):
    """Validate the inputs of the split operation and pick the bins to work on.

    Checks the output directory (via `filesnpaths.check_output_directory` with
    `ok_if_exists=False`), requires both a contigs and a profile database path,
    verifies the two databases are compatible, and rejects blank or non-profile
    databases. It then initializes a `ProfileSummarizer` from `self.args` and
    sets `self.bin_names_of_interest` to either every bin id known to the
    summary (sorted) or to the single requested `self.bin_name`.

    Raises:
        ConfigError: if a database path is missing, the profile database is
            blank or not of type 'profile', or `self.bin_name` is not among
            the bin ids of the summary.
    """
    self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=False)

    if not self.contigs_db_path:
        raise ConfigError("You must provide a contigs database for this operation.")

    if not self.profile_db_path:
        raise ConfigError("No profile db no cookie. Bye.")

    utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    profile_db = dbops.ProfileDatabase(self.profile_db_path)
    try:
        if profile_db.meta['blank']:
            raise ConfigError("The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!")

        if profile_db.meta['db_type'] != 'profile':
            raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile "
                              "database. There is something wrong here.")
    finally:
        # release the database handle even when one of the checks above fails
        profile_db.disconnect()

    self.summary = summarizer.ProfileSummarizer(self.args)
    self.summary.init()

    self.bin_names_of_interest = sorted(self.summary.bin_ids)

    if self.bin_name:
        if self.bin_name not in self.bin_names_of_interest:
            raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")

        # a specific bin was requested and it exists: narrow the work down to it
        self.bin_names_of_interest = [self.bin_name]
def get_summary_object_for_profile_db(self, profile_db_path, init_gene_coverages=True):
    """Build and initialize a `ProfileSummarizer` for one profile database.

    The genomes registered under `profile_db_path` in
    `self.unique_profile_db_path_to_internal_genome_name` are looked up once.
    The collection id and the profile/contigs database paths are taken from
    the first of those genomes; the bin ids are collected from all of them.

    Args:
        profile_db_path: key into
            `self.unique_profile_db_path_to_internal_genome_name`.
        init_gene_coverages: forwarded to the summarizer arguments as
            `init_gene_coverages`.

    Returns:
        The summarizer, after `init()` and `init_collection_profile()` have
        been called on it.
    """
    # Resolve everything with the key the caller passed in. Doing the lookup once
    # keeps later lookups from depending on a re-assigned `profile_db_path`.
    genome_names = self.unique_profile_db_path_to_internal_genome_name[profile_db_path]
    genomes = self.descriptions.genomes
    first_genome = genomes[genome_names[0]]

    collection_name = first_genome['collection_id']
    stored_profile_db_path = first_genome['profile_db_path']
    contigs_db_path = first_genome['contigs_db_path']

    # poor-man's whatever
    bin_names_list = [genomes[g]['bin_id'] for g in genome_names]

    ARGS = summarizer.ArgsTemplateForSummarizerClass()
    ARGS.profile_db = stored_profile_db_path
    ARGS.contigs_db = contigs_db_path
    ARGS.skip_init_functions = True
    ARGS.init_gene_coverages = init_gene_coverages
    ARGS.collection_name = collection_name
    ARGS.bin_names_list = bin_names_list
    ARGS.output_dir = None

    summary = summarizer.ProfileSummarizer(ARGS)
    summary.init()
    summary.init_collection_profile(collection_name)

    return summary
def sanity_check(self):
    """Validate the inputs of the split operation and pick the bins to work on.

    Checks the output directory (via `filesnpaths.check_output_directory` with
    `ok_if_exists=False`), requires both a contigs and a profile database path,
    verifies the two databases are compatible, and only accepts a profile
    database whose meta says it is of type 'profile', not blank, and merged.
    It then initializes a `ProfileSummarizer` from `self.args` and sets
    `self.bin_names_of_interest` to either every bin id known to the summary
    (sorted) or to the single requested `self.bin_name`.

    Raises:
        ConfigError: if a database path is missing, the profile database does
            not pass the checks above, or `self.bin_name` is not among the bin
            ids of the summary.
    """
    self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=False)

    if not self.contigs_db_path:
        raise ConfigError("You must provide a contigs database for this operation.")

    if not self.profile_db_path:
        raise ConfigError("No profile db no cookie. Bye.")

    dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    profile_db = dbops.ProfileDatabase(self.profile_db_path)
    try:
        meta = profile_db.meta
        if meta['db_type'] != 'profile' or meta['blank'] or not meta['merged']:
            raise ConfigError("You an only split merged profiles :/ We hope this is not a moment of a terrible disappointment. "
                              "If it is, you should consider writing to us.")
    finally:
        # the handle is only needed for the meta checks; do not leave it open
        profile_db.disconnect()

    self.summary = summarizer.ProfileSummarizer(self.args)
    self.summary.init()

    self.bin_names_of_interest = sorted(self.summary.bin_ids)

    if self.bin_name:
        if self.bin_name not in self.bin_names_of_interest:
            raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")

        # a specific bin was requested and it exists: narrow the work down to it
        self.bin_names_of_interest = [self.bin_name]
def gen_summary(args, d, request, response, collection_name):
    """Run a summary for `collection_name` and return a JSON string describing the result.

    The return value is always a JSON-encoded dict: `{'error': ...}` when the
    instance is read-only, runs in manual mode, has an unknown mode, or when the
    summarizer raises; `{'path': ...}` pointing at the summary index page
    otherwise.

    Args:
        args: object with a `read_only` attribute.
        d: object providing `manual_mode`, `mode`, `taxonomic_level` and the
            database paths for the active mode.
        request: not used by this function.
        response: passed to `set_default_headers`.
        collection_name: name of the collection to summarize.
    """
    set_default_headers(response)

    if args.read_only:
        return json.dumps({'error': "Sorry! This is a read-only instance."})

    if d.manual_mode:
        return json.dumps({'error': "Creating summaries is only possible with proper anvi'o runs at the moment :/"})

    run.info_single('A summary of collection "%s" has been requested.' % collection_name)

    # start from an empty args template; the shared parameters go in first
    summarizer_args = summarizer.ArgsTemplateForSummarizerClass()
    summarizer_args.collection_name = collection_name
    summarizer_args.taxonomic_level = d.taxonomic_level

    # the summary is written next to the database it is generated from
    summary_dir_name = 'SUMMARY_%s' % collection_name

    if d.mode == 'pan':
        summarizer_args.pan_db = d.pan_db_path
        summarizer_args.genomes_storage = d.genomes_storage_path
        db_dir = os.path.dirname(summarizer_args.pan_db)
        summarizer_args.output_dir = os.path.join(db_dir, summary_dir_name)
    elif d.mode == 'full':
        summarizer_args.profile_db = d.profile_db_path
        summarizer_args.contigs_db = d.contigs_db_path
        db_dir = os.path.dirname(summarizer_args.profile_db)
        summarizer_args.output_dir = os.path.join(db_dir, summary_dir_name)
    else:
        return json.dumps({'error': 'We do not know anything about this mode: "%s"' % d.mode})

    # any failure in the summarizer is reported back to the client instead of being raised
    try:
        if d.mode == 'pan':
            summary = summarizer.PanSummarizer(summarizer_args, r=run, p=progress)
        else:
            summary = summarizer.ProfileSummarizer(summarizer_args, r=run, p=progress)

        summary.process()
    except Exception as e:
        failure = 'Something failed in the "%s" summary mode. This is what we know: %s' % (d.mode, e)
        return json.dumps({'error': failure})

    run.info_single('HTML output for summary is ready: %s' % summary.index_html)

    return json.dumps({'path': "summary/%s/index.html" % (collection_name)})
def __init__(self, args, run=run, progress=progress):
    """Initialize the instance from `args` and load gene coverage/detection data.

    Every parameter is read from `args.__dict__`; a parameter that is not
    present there is stored as None. After `sanity_check()`, the data comes from
    one of three places: a text file (no profile database given), a
    `ProfileSummarizer` (a collection name was given), or a `ProfileSuperclass`
    (a profile database without a collection).

    Note that `args.init_gene_coverages` is set to True on the caller's `args`
    whenever a profile database is used.
    """
    self.run = run
    self.progress = progress

    def arg_or_none(key):
        # missing arguments are treated the same as arguments set to None
        return args.__dict__.get(key)

    self.gene_coverages_data_file_path = arg_or_none('data_file')
    self.gene_detection_data_file_path = arg_or_none('gene_detection_data_file')
    self.profile_db_path = arg_or_none('profile_db')
    self.output_file_prefix = arg_or_none('output_file_prefix')
    self.alpha = arg_or_none('alpha')
    self.beta = arg_or_none('beta')
    self.gamma = arg_or_none('gamma')
    self.eta = arg_or_none('eta')
    self.zeta = arg_or_none('zeta')
    self.additional_layers_to_append = arg_or_none('additional_layers_to_append')
    self.samples_information_to_append = arg_or_none('samples_information_to_append')
    self.number_of_positive_samples = None
    self.collection_name = arg_or_none('collection_name')
    self.bin_id = arg_or_none('bin_id')
    self.bin_ids_file_path = arg_or_none('bin_ids_file')
    self.store_gene_detections_and_gene_coverages_tables = arg_or_none('store_gene_detections_and_gene_coverages_tables')

    # empty containers that are filled in once the data is loaded
    self.gene_coverages = {}
    self.gene_detection = {}
    self.samples = {}
    self.positive_samples = {}
    self.negative_samples = {}
    self.gene_class_information = {}
    self.samples_information = {}
    self.profile_db = {}

    self.sanity_check()

    if self.profile_db_path is None:
        self.get_data_from_txt_file()
        return

    # load sample list and gene_coverage_dict from the merged profile db
    args.init_gene_coverages = True

    if self.collection_name:
        self.summary = summarizer.ProfileSummarizer(args)
        self.summary.init()
        return

    self.profile_db = ProfileSuperclass(args)
    self.profile_db.init_gene_coverages_and_detection_dicts()
    self.gene_coverages = self.profile_db.gene_coverages_dict
    self.gene_detection = self.profile_db.gene_detection_dict

    # the sample names are the keys of the first per-gene entry
    first_gene_entry = next(iter(self.gene_coverages.values()))
    self.samples = set(first_gene_entry.keys())
def sanity_check(self):
    """Validate the inputs of the split operation and pick the bins to work on.

    Checks the output directory (via `filesnpaths.check_output_directory` with
    `ok_if_exists=True`), requires both a contigs and a profile database path,
    verifies the two databases are compatible, and rejects blank or non-profile
    databases. It then initializes a `ProfileSummarizer` from `self.args` (with
    `delete_output_directory_if_exists` forced to False) and sets
    `self.bin_names_of_interest` to either every bin id known to the summary
    (sorted) or to the single requested `self.bin_name`.

    Raises:
        ConfigError: if a database path is missing, the profile database is
            blank or not of type 'profile', or `self.bin_name` is not among
            the bin ids of the summary.
    """
    self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=True)

    if not self.contigs_db_path:
        raise ConfigError("You must provide a contigs database for this operation.")

    if not self.profile_db_path:
        raise ConfigError("No profile db no cookie. Bye.")

    utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    profile_db = dbops.ProfileDatabase(self.profile_db_path)
    try:
        if profile_db.meta['blank']:
            raise ConfigError("The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!")

        if profile_db.meta['db_type'] != 'profile':
            raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile "
                              "database. There is something wrong here.")
    finally:
        # release the database handle even when one of the checks above fails
        profile_db.disconnect()

    # if this is not set to false, the summarizer class attempts to remove the main output directory
    # upon initialization. not doing that is useful in this context since this allows multiple
    # anvi-split runs to work on bins in the same collection in parallel:
    self.args.delete_output_directory_if_exists = False

    self.summary = summarizer.ProfileSummarizer(self.args)
    self.summary.init()

    self.bin_names_of_interest = sorted(self.summary.bin_ids)

    if self.bin_name:
        if self.bin_name not in self.bin_names_of_interest:
            raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")

        # a specific bin was requested and it exists: narrow the work down to it
        self.bin_names_of_interest = [self.bin_name]
def __init__(self, args, run=run, progress=progress):
    """Initialize the instance from `args`, read the sample filters, and load data.

    Every parameter is read from `args.__dict__`; a parameter that is not
    present there is stored as None. If `exclude_samples` / `include_samples`
    name a file, the first tab-separated field of every line is collected into
    `self.samples_to_exclude` / `self.samples_to_include`. After
    `sanity_check()`, data is loaded from the profile database (when one is
    given), either through a `ProfileSummarizer` (collection name given) or a
    `ProfileSuperclass`.

    Note that `args.init_gene_coverages` is set to True on the caller's `args`
    whenever a profile database is used.

    Raises:
        ConfigError: if a sample file is given but yields no sample names.
    """
    self.run = run
    self.progress = progress

    def A(key):
        # missing arguments are treated the same as arguments set to None
        return args.__dict__.get(key)

    def read_first_column(path):
        # Default text mode already translates newlines. The old 'rU' mode is
        # rejected by Python 3.11+, and the handle must be closed after reading.
        with open(path) as sample_file:
            return {line.split('\t')[0].strip() for line in sample_file}

    self.gene_coverages_data_file_path = A('data_file')
    self.gene_detections_data_file_path = A('gene_detection_data_file')
    self.profile_db_path = A('profile_db')
    self.output_file_prefix = A('output_file_prefix')
    self.alpha = A('alpha')
    self.additional_layers_to_append = A('additional_layers_to_append')
    self.samples_information_to_append = A('samples_information_to_append')
    self.collection_name = A('collection_name')
    self.bin_id = A('bin_id')
    self.bin_ids_file_path = A('bin_ids_file')
    self.exclude_samples = A('exclude_samples')
    self.include_samples = A('include_samples')

    # NOTE: `pd.DataFrame.empty` is the class-level property object, not an empty
    # frame; it is kept as the "not initialized yet" placeholder the original used.
    self.profile_db = {}
    self.coverage_values_per_nt = {}
    self.gene_coverages = pd.DataFrame.empty
    self.gene_detections = pd.DataFrame.empty
    self.samples = {}
    self.sample_detection_information_was_initiated = False
    self.positive_samples = []
    self.number_of_positive_samples = None
    self.negative_samples = pd.DataFrame.empty
    self.number_of_negative_samples = None
    self.gene_class_information = pd.DataFrame.empty
    self.samples_detection_information = pd.DataFrame.empty
    self.gene_presence_absence_in_samples = pd.DataFrame.empty
    self.gene_coverages_filtered = pd.DataFrame.empty
    self.additional_description = ''
    self.total_length = None
    self.samples_coverage_stats_dicts_was_initiated = False
    self.samples_coverage_stats_dicts = pd.DataFrame.empty
    self.non_outlier_indices = {}

    if self.exclude_samples:
        # check that there is a file like this
        filesnpaths.is_file_exists(self.exclude_samples)
        self.samples_to_exclude = read_first_column(self.exclude_samples)

        if not self.samples_to_exclude:
            raise ConfigError("You asked to exclude samples, but provided an empty list.")

        run.info('Excluding Samples', 'The following samples will be excluded: %s' % self.samples_to_exclude,)
    else:
        self.samples_to_exclude = set()

    if self.include_samples:
        # check that there is a file like this
        filesnpaths.is_file_exists(self.include_samples)
        self.samples_to_include = read_first_column(self.include_samples)

        if not self.samples_to_include:
            raise ConfigError("You provided an empty list of samples to include.")

        run.info('Including Samples', 'The following samples will be included: %s' % self.samples_to_include,)
    else:
        self.samples_to_include = set()

    # run sanity check on all input arguments
    self.sanity_check()

    if self.profile_db_path is None:
        # TODO: this will probably be removed because we don't save the coverage information in nucleotide level.
        pass
    else:
        # load sample list and gene_coverage_dict from the merged profile db
        args.init_gene_coverages = True
        if self.collection_name:
            self.summary = summarizer.ProfileSummarizer(args)
            self.summary.init()
            self.init_samples(self.summary.p_meta['samples'])
        else:
            self.profile_db = ProfileSuperclass(args)
            self.init_samples(self.profile_db.p_meta['samples'])

            self.profile_db.init_split_coverage_values_per_nt_dict()
            self.profile_db.init_gene_level_coverage_stats_dicts()
            self.coverage_values_per_nt = get_coverage_values_per_nucleotide(
                self.profile_db.split_coverage_values_per_nt_dict, self.samples)

            # comply with the new design and get gene_coverages and gene_detection dicts from
            # gene_level_coverage_stats_dict.
            gene_coverages, gene_detection = self.get_gene_coverages_and_gene_detection_dicts()

            self.init_coverage_and_detection_dataframes(gene_coverages, gene_detection)

            # getting the total length of all contigs
            self.total_length = self.profile_db.p_meta['total_length']
def __init__(self, args, run=run, progress=progress):
    """Initialize the instance from `args`, read the excluded samples, and load data.

    Every parameter is read from `args.__dict__`; a parameter that is not
    present there is stored as None. If `exclude_samples` names a file, the
    first tab-separated field of every line is collected into
    `self.samples_to_exclude`. After `sanity_check()`, data comes from a text
    file (no profile database given), a `ProfileSummarizer` (collection name
    given), or a `ProfileSuperclass`, whose gene coverage and detection dicts
    are turned into data frames with the excluded samples dropped.

    Note that `args.init_gene_coverages` is set to True on the caller's `args`
    whenever a profile database is used.
    """
    self.run = run
    self.progress = progress

    def A(key):
        # missing arguments are treated the same as arguments set to None
        return args.__dict__.get(key)

    self.gene_coverages_data_file_path = A('data_file')
    self.gene_detections_data_file_path = A('gene_detection_data_file')
    self.profile_db_path = A('profile_db')
    self.output_file_prefix = A('output_file_prefix')
    self.alpha = A('alpha')
    self.beta = A('beta')
    self.gamma = A('gamma')
    self.eta = A('eta')
    self.zeta = A('zeta')
    self.additional_layers_to_append = A('additional_layers_to_append')
    self.samples_information_to_append = A('samples_information_to_append')
    self.collection_name = A('collection_name')
    self.bin_id = A('bin_id')
    self.bin_ids_file_path = A('bin_ids_file')
    self.store_gene_detections_and_gene_coverages_tables = A('store_gene_detections_and_gene_coverages_tables')
    self.exclude_samples = A('exclude_samples')

    # NOTE: `pd.DataFrame.empty` is the class-level property object, not an empty
    # frame; it is kept as the "not initialized yet" placeholder the original used.
    self.gene_coverages = pd.DataFrame.empty
    self.gene_detections = pd.DataFrame.empty
    self.samples = {}
    self.positive_samples = pd.DataFrame.empty
    self.number_of_positive_samples = None
    self.negative_samples = pd.DataFrame.empty
    self.number_of_negative_samples = None
    self.gene_class_information = pd.DataFrame.empty
    self.samples_information = pd.DataFrame.empty
    self.profile_db = {}
    self.gene_presence_absence_in_samples = pd.DataFrame.empty
    self.gene_coverages_filtered = pd.DataFrame.empty

    if self.exclude_samples:
        # check that there is a file like this
        filesnpaths.is_file_exists(self.exclude_samples)

        # Default text mode already translates newlines. The old 'rU' mode is
        # rejected by Python 3.11+, and the handle must be closed after reading.
        with open(self.exclude_samples) as sample_file:
            self.samples_to_exclude = {line.split('\t')[0].strip() for line in sample_file}

        run.info('Excluding Samples', 'The following samples will be excluded: %s' % self.samples_to_exclude,)
    else:
        self.samples_to_exclude = set()

    self.sanity_check()

    if self.profile_db_path is None:
        self.get_data_from_txt_file()
    else:
        # load sample list and gene_coverage_dict from the merged profile db
        args.init_gene_coverages = True
        if self.collection_name:
            self.summary = summarizer.ProfileSummarizer(args)
            self.summary.init()
        else:
            self.profile_db = ProfileSuperclass(args)
            self.profile_db.init_gene_coverages_and_detection_dicts()

            self.gene_coverages = pd.DataFrame.from_dict(
                self.profile_db.gene_coverages_dict, orient='index', dtype=float)
            self.gene_coverages.drop(self.samples_to_exclude, axis=1, inplace=True)
            self.Ng = len(self.gene_coverages.index)

            self.gene_detections = pd.DataFrame.from_dict(
                self.profile_db.gene_detection_dict, orient='index', dtype=float)
            self.gene_detections.drop(self.samples_to_exclude, axis=1, inplace=True)

            # whatever columns survive the exclusion are the samples we work with
            self.samples = set(self.gene_coverages.columns)