def init_samples_txt(self):
    """Sanity check the samples table held in `self.samples_information`.

    The checks run in this order: a 'sample' column must exist, sample names
    must be unique, every name must pass `u.check_sample_id`, and both the
    'r1' and 'r2' columns must be present. Entries in 'r1'/'r2' that do not
    end with '.fastq' or '.fastq.gz' only trigger a warning.

    Raises
    ------
    ConfigError
        If one of the checks above fails, or if anything goes wrong while
        inspecting the 'r1'/'r2' values (e.g. an entry that is not a string).
    """
    samples = self.samples_information

    if 'sample' not in samples.columns.values:
        raise ConfigError(
            "You know what. This '%s' file does not look anything like a samples file."
            % self.samples_txt_file)

    sample_names = samples['sample']
    if len(sample_names) != len(set(sample_names)):
        raise ConfigError(
            "Names of samples in your samples_txt file must be unique. It looks like "
            "some names appear twice in your file: %s" % self.samples_txt_file)

    for name in sample_names:
        try:
            u.check_sample_id(name)
        except ConfigError as err:
            # re-raise with the file path so the user knows where to look
            raise ConfigError(
                "While processing the samples txt file ('%s'), anvi'o ran into "
                "the following error: %s" % (self.samples_txt_file, err))

    has_read_columns = 'r1' in samples.columns and 'r2' in samples
    if not has_read_columns:
        raise ConfigError(
            "Looks like your samples_txt file, '%s', is not properly formatted. We are "
            "not sure what's wrong, but we expected to find columns with titles 'r1' "
            "and 'r2' and we did not find such columns." % self.samples_txt_file)

    read_files = [*samples['r1'], *samples['r2']]

    odd_names = []
    try:
        for file_name in read_files:
            if not file_name.endswith(('.fastq', '.fastq.gz')):
                odd_names.append(file_name)
    except Exception as e:
        # anything unexpected here means the table itself is malformed
        raise ConfigError(
            f"The format of your samples txt file does not seem to be working for "
            f"anvi'o. This is what the downstream processes had to say: '{e}'. "
            f"Please double-check columns in your samples txt.")

    if odd_names:
        run.warning(
            "We noticed some of your sequence files in '%s' do not end with either "
            "'.fastq' or '.fastq.gz'. That's okay, but anvi'o decided it should warn "
            "you. Here are the first 5 such files that have unconventional file "
            "extensions: %s." % (self.samples_txt_file, ', '.join(odd_names[:5])))
def init_samples_txt(self):
    """Sanity check the samples table held in `self.samples_information`.

    The checks run in this order: a 'sample' column must exist, sample names
    must be unique, every name must pass `u.check_sample_id`, the 'r1' and
    'r2' columns must be present, and every entry in 'r1'/'r2' must end with
    '.fastq' or '.fastq.gz'.

    Raises
    ------
    ConfigError
        If any of the checks above fails.
    """
    samples = self.samples_information

    # NOTE: messages are built with adjacent string literals rather than
    # backslash continuations inside the literal, so that no stray
    # whitespace or backslashes end up in what the user reads.
    if 'sample' not in samples.columns.values:
        raise ConfigError("You know what. This '%s' file does not look anything like "
                          "a samples file." % self.samples_txt_file)

    sample_names = samples['sample']
    if len(sample_names) != len(set(sample_names)):
        raise ConfigError("Names of samples in your samples_txt file must be unique. "
                          "It looks like some names appear twice in your file: %s"
                          % self.samples_txt_file)

    for sample in sample_names:
        try:
            u.check_sample_id(sample)
        except ConfigError as e:
            # re-raise with the file path so the user knows where to look
            raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran "
                              "into the following error: %s" % (self.samples_txt_file, e))

    if 'r1' not in samples.columns or 'r2' not in samples.columns:
        raise ConfigError("Looks like your samples_txt file, '%s', is not properly formatted. "
                          "We are not sure what's wrong, but we expected to find columns with "
                          "titles 'r1' and 'r2' and we did not find such columns."
                          % self.samples_txt_file)

    fastq_file_names = list(samples['r1']) + list(samples['r2'])
    bad_fastq_names = [s for s in fastq_file_names
                       if not s.endswith(('.fastq', '.fastq.gz'))]

    if bad_fastq_names:
        raise ConfigError("We require tha all fastq file names end with either '.fastq' "
                          "or '.fastq.gz'. Some or all of the file names in %s aren't formatted "
                          "accordingly. These are the file names we don't like: %s"
                          % (self.samples_txt_file, ', '.join(bad_fastq_names)))
def init(self):
    """Initialize the workflow and load/validate the samples txt file.

    Runs the parent `init()`, reads the file given by the 'samples_txt'
    config parameter into `self.samples_information` (tab-separated), checks
    that it has a 'sample' column whose entries all pass
    `u.check_sample_id`, and finally calls `self.sanity_check_for_kraken()`.

    Raises
    ------
    ConfigError
        If the 'sample' column is missing or a sample name is rejected.
    """
    super().init()

    # loading the samples.txt file
    samples_txt_file = self.get_param_value_from_config(['samples_txt'])
    filesnpaths.is_file_exists(samples_txt_file)

    # getting the samples information (names, [group], path to r1, path to r2) from samples.txt
    self.samples_information = pd.read_csv(samples_txt_file, sep='\t', index_col=False)

    # NOTE: messages are built with adjacent string literals rather than
    # backslash continuations inside the literal, so that no stray
    # whitespace or backslashes end up in what the user reads.
    if 'sample' not in self.samples_information.columns.values:
        raise ConfigError("You know what. This '%s' file does not look anything like "
                          "a samples file." % samples_txt_file)

    for sample in self.samples_information['sample']:
        try:
            u.check_sample_id(sample)
        except ConfigError as e:
            # re-raise with the file path so the user knows where to look
            raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran "
                              "into the following error: %s" % (samples_txt_file, e))

    self.sanity_check_for_kraken()
def set_sample_id(self):
    """Make sure `self.sample_id` is set and passes `utils.check_sample_id`.

    When no sample id was provided, one is derived from the base name of
    `self.output_directory`: '-' characters become '_', and an 's' is
    prepended if the first character is in `constants.digits`.
    """
    if not self.sample_id:
        # fall back to the name of the output directory
        self.sample_id = os.path.basename(self.output_directory).replace('-', '_')

        if self.sample_id[0] in constants.digits:
            self.sample_id = 's' + self.sample_id

    # both the user-provided and the derived id go through the same check
    utils.check_sample_id(self.sample_id)
def load_references_for_removal(self):
    """Load and perform some sanity checks on the references for removal.

    Reads `self.references_for_removal_txt` into
    `self.references_for_removal`, merges it into `self.fasta_information`,
    and then verifies that:

    - every reference name passes `u.check_sample_id`,
    - every entry has a 'path' column,
    - uncompressed files are FASTA formatted (files ending with '.gz' are
      only checked for existence, and a warning lists them),
    - in references mode, no name is shared with `self.contigs_information`.

    Finally sets `self.remove_short_reads_based_on_references` to True unless
    the 'dont_remove_just_map' config parameter is set.

    Raises
    ------
    ConfigError
        If a reference name is rejected, the 'path' column is missing, or a
        name appears in both the fasta txt and the references for removal txt.
    """
    self.references_for_removal = u.get_TAB_delimited_file_as_dictionary(
        self.references_for_removal_txt)

    # adding the references_for_removal to the fasta_information dict
    self.fasta_information.update(self.references_for_removal)

    for ref_name in self.references_for_removal:
        try:
            u.check_sample_id(ref_name)
        except ConfigError as e:
            # report the file that is actually being processed here, i.e. the
            # references for removal txt (not the samples txt)
            raise ConfigError(
                "While processing the references for removal txt file ('%s'), anvi'o ran into the following error: "
                "%s" % (self.references_for_removal_txt, e))

    files_that_end_with_gz = []
    for ref_dict in self.references_for_removal.values():
        if 'path' not in ref_dict:
            raise ConfigError(
                'Yor references for removal txt file is not formatted properly. It must have only two columns '
                'with the headers "reference" and "path".')

        path = ref_dict['path']
        if path.endswith('.gz'):
            filesnpaths.is_file_exists(path)
            files_that_end_with_gz.append(path)
        else:
            # if the file is not compressed then we can verify that it is a fasta file
            filesnpaths.is_file_fasta_formatted(path)

    if files_that_end_with_gz:
        run.warning(
            'The following reference for removal files are compressed: %s. '
            'That\'s fine, but it means that we will skip the '
            'sanity check to verify that this is actually '
            'a properly formatted fasta file. Things are '
            'probably Ok, this is just one of these occasions '
            'in which anvi\'o is oversharing.' % ', '.join(files_that_end_with_gz))

    if self.references_mode:
        # Make sure that the user didn't give the same name to references and references_for_removal
        ref_name_in_both = [
            r for r in self.references_for_removal if r in self.contigs_information
        ]
        if ref_name_in_both:
            raise ConfigError(
                'You must have unique names for your fasta files in your fasta txt file '
                'and your references for removal txt file. These are the names that appear '
                'in both: %s' % ', '.join(ref_name_in_both))

    dont_remove = self.get_param_value_from_config(
        ['remove_short_reads_based_on_references', 'dont_remove_just_map'])
    if not dont_remove:
        self.remove_short_reads_based_on_references = True
def set_sample_id(self):
    """Settle on a value for `self.sample_id`.

    A sample id that is already set is only passed to
    `utils.check_sample_id`. Otherwise:

    - if `self.input_file_path` is set, it is made absolute and the id is
      derived from its base name (upper-cased, everything from '.BAM' on
      dropped, '-' and '.' replaced with '_', and an 's' prepended when the
      first character is in `constants.digits`), then checked;
    - if `self.serialized_profile_path` is set, it is made absolute and the
      id becomes the name of its parent directory (this takes precedence
      over an id derived from the input file, and is not checked).
    """
    if self.sample_id:
        utils.check_sample_id(self.sample_id)
        return

    if self.input_file_path:
        self.input_file_path = os.path.abspath(self.input_file_path)

        file_name = os.path.basename(self.input_file_path)
        self.sample_id = file_name.upper().split('.BAM')[0]
        self.sample_id = self.sample_id.replace('-', '_').replace('.', '_')

        if self.sample_id[0] in constants.digits:
            self.sample_id = 's' + self.sample_id

        utils.check_sample_id(self.sample_id)

    if self.serialized_profile_path:
        self.serialized_profile_path = os.path.abspath(self.serialized_profile_path)

        profile_dir = os.path.dirname(self.serialized_profile_path)
        self.sample_id = os.path.basename(profile_dir)
def set_sample_id(self):
    """Validate the given sample id, or derive one from the input paths.

    With `self.sample_id` already set, it is only run through
    `utils.check_sample_id`. Without it, the id is built from the base name
    of `self.input_file_path` when that is set (upper-cased, cut at '.BAM',
    '-' and '.' turned into '_', 's' prepended if the first character is in
    `constants.digits`) and checked. When `self.serialized_profile_path` is
    set, the name of its parent directory is used as the id instead, without
    a check. Both paths are replaced with their absolute versions.
    """
    if self.sample_id:
        utils.check_sample_id(self.sample_id)
        return

    bam_path = self.input_file_path
    if bam_path:
        self.input_file_path = os.path.abspath(bam_path)

        stem = os.path.basename(self.input_file_path).upper().split(".BAM")[0]
        # one pass that maps both '-' and '.' to '_'
        self.sample_id = stem.translate(str.maketrans("-.", "__"))

        starts_with_digit = self.sample_id[0] in constants.digits
        if starts_with_digit:
            self.sample_id = "s" + self.sample_id

        utils.check_sample_id(self.sample_id)

    profile_path = self.serialized_profile_path
    if profile_path:
        self.serialized_profile_path = os.path.abspath(profile_path)
        parent_dir, _ = os.path.split(self.serialized_profile_path)
        self.sample_id = os.path.basename(parent_dir)
def check_project_name(self):
    """Check the name of the tRNA-seq project.

    Nothing is checked unless `self.run_anvi_merge_trnaseq` is set. In that
    case the '--project-name' parameter of 'anvi_merge_trnaseq' must be
    present in the config and must pass `u.check_sample_id`.

    Raises
    ------
    ConfigError
        If the project name is missing or rejected.
    """
    if not self.run_anvi_merge_trnaseq:
        return

    name_in_config = self.get_param_value_from_config(
        ['anvi_merge_trnaseq', '--project-name'])

    if not name_in_config:
        raise ConfigError(
            "Since you are running anvi-merge-trnaseq, please provide a project "
            "name for the sample(s) in the config file.")

    try:
        u.check_sample_id(name_in_config)
    except ConfigError as err:
        # add the offending name to the message before passing it on
        raise ConfigError(
            "While checking the project name, '%s', anvi'o ran into the "
            "following error: %s" % (name_in_config, err))
def check_sample_names(self):
    """Check that the name of each tRNA-seq library is anvi'o-compliant.

    Every entry of the 'sample' column of `self.sample_info` is passed to
    `u.check_sample_id`, after which the names are required to be unique.

    Raises
    ------
    ConfigError
        If a name is rejected or if names are repeated.
    """
    names = self.sample_info['sample']

    for name in names:
        try:
            u.check_sample_id(name)
        except ConfigError as err:
            # add the file path to the message before passing it on
            raise ConfigError(
                "While processing the samples_txt file, '%s', anvi'o ran into "
                "the following error: %s" % (self.samples_txt_file, err))

    has_duplicates = len(set(names)) != len(names)
    if has_duplicates:
        raise ConfigError(
            "Sample names in the samples_txt file, '%s', must be unique."
            % self.samples_txt_file)
def sanity_check_for_samples_txt(self):
    """Sanity check the samples table held in `self.samples_information`.

    The checks run in this order: a 'sample' column must exist, every sample
    name must pass `u.check_sample_id`, the 'r1' and 'r2' columns must be
    present, and every entry in 'r1'/'r2' must end with '.fastq' or
    '.fastq.gz'.

    Raises
    ------
    ConfigError
        If any of the checks above fails.
    """
    samples = self.samples_information

    # NOTE: messages are built with adjacent string literals rather than
    # backslash continuations inside the literal, so that no stray
    # whitespace or backslashes end up in what the user reads.
    if 'sample' not in samples.columns.values:
        raise ConfigError("You know what. This '%s' file does not look anything like "
                          "a samples file." % self.samples_txt_file)

    for sample in samples['sample']:
        try:
            u.check_sample_id(sample)
        except ConfigError as e:
            # re-raise with the file path so the user knows where to look
            raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran "
                              "into the following error: %s" % (self.samples_txt_file, e))

    # without this check a missing column would surface as a bare KeyError below
    if 'r1' not in samples.columns or 'r2' not in samples.columns:
        raise ConfigError("Looks like your samples_txt file, '%s', is not properly formatted. "
                          "We are not sure what's wrong, but we expected to find columns with "
                          "titles 'r1' and 'r2' and we did not find such columns."
                          % self.samples_txt_file)

    fastq_file_names = list(samples['r1']) + list(samples['r2'])
    bad_fastq_names = [s for s in fastq_file_names
                       if not s.endswith(('.fastq', '.fastq.gz'))]

    if bad_fastq_names:
        raise ConfigError("We require tha all fastq file names end with either '.fastq' "
                          "or '.fastq.gz'. Some or all of the file names in %s aren't formatted "
                          "accordingly. These are the file names we don't like: %s"
                          % (self.samples_txt_file, ', '.join(bad_fastq_names)))
def load_references_for_removal(self):
    """Load and perform some sanity checks on the references for removal.

    Reads `self.references_for_removal_txt` into
    `self.references_for_removal`, merges it into `self.fasta_information`,
    and then verifies that:

    - every reference name passes `u.check_sample_id`,
    - every entry has a 'path' column,
    - uncompressed files are FASTA formatted (files ending with '.gz' are
      only checked for existence, and a warning lists them),
    - in references mode, no name is shared with `self.contigs_information`.

    Finally sets `self.remove_short_reads_based_on_references` to True unless
    the 'dont_remove_just_map' config parameter is set.

    Raises
    ------
    ConfigError
        If a reference name is rejected, the 'path' column is missing, or a
        name appears in both the fasta txt and the references for removal txt.
    """
    self.references_for_removal = u.get_TAB_delimited_file_as_dictionary(self.references_for_removal_txt)

    # adding the references_for_removal to the fasta_information dict
    self.fasta_information.update(self.references_for_removal)

    # NOTE: messages below are built with adjacent string literals rather
    # than backslash continuations inside the literal, so that no stray
    # whitespace or backslashes end up in what the user reads.
    for sample in self.references_for_removal.keys():
        try:
            u.check_sample_id(sample)
        except ConfigError as e:
            # report the file that is actually being processed here, i.e. the
            # references for removal txt (not the samples txt)
            raise ConfigError("While processing the references for removal txt file ('%s'), "
                              "anvi'o ran into the following error: "
                              "%s" % (self.references_for_removal_txt, e))

    files_that_end_with_gz = []
    for ref_dict in self.references_for_removal.values():
        if 'path' not in ref_dict:
            raise ConfigError('Yor references for removal txt file is not formatted properly. '
                              'It must have only two columns '
                              'with the headers "reference" and "path".')

        if ref_dict['path'].endswith('.gz'):
            filesnpaths.is_file_exists(ref_dict['path'])
            files_that_end_with_gz.append(ref_dict['path'])
        else:
            # if the file is not compressed then we can verify that it is a fasta file
            filesnpaths.is_file_fasta_formatted(ref_dict['path'])

    if files_that_end_with_gz:
        run.warning('The following reference for removal files are compressed: %s. '
                    'That\'s fine, but it means that we will skip the '
                    'sanity check to verify that this is actually '
                    'a properly formatted fasta file. Things are '
                    'probably Ok, this is just one of these occasions '
                    'in which anvi\'o is oversharing.' % ', '.join(files_that_end_with_gz))

    if self.references_mode:
        # Make sure that the user didn't give the same name to references and references_for_removal
        ref_name_in_both = [r for r in self.references_for_removal if r in self.contigs_information]
        if ref_name_in_both:
            raise ConfigError('You must have unique names for your fasta files in your fasta txt file '
                              'and your references for removal txt file. '
                              'These are the names that appear '
                              'in both: %s' % ', '.join(ref_name_in_both))

    dont_remove = self.get_param_value_from_config(['remove_short_reads_based_on_references',
                                                    'dont_remove_just_map'])
    if not dont_remove:
        self.remove_short_reads_based_on_references = True
def NameIsOK(n):
    """Return True if `check_sample_id` accepts `n`, False if it raises ConfigError."""
    try:
        check_sample_id(n)
    except ConfigError:
        is_ok = False
    else:
        is_ok = True

    return is_ok
def check_samples_txt(self):
    """Sanity check the tRNA-seq samples table held in `self.sample_info`.

    The expected columns depend on `self.run_iu_merge_pairs`: 'sample',
    'split', 'r1' and 'r2' when it is set, and 'sample', 'split' and 'fasta'
    otherwise. After the header check, every sample name must pass
    `u.check_sample_id` and every listed sequence file must exist.

    Split types that are not in `TRNASeqWorkflow.known_split_types`, and
    sequence files with unconventional extensions, only trigger warnings.

    Raises
    ------
    ConfigError
        If columns are missing, a sample name is rejected, or a listed
        FASTQ/FASTA file cannot be found.
    """
    if self.run_iu_merge_pairs:
        proper_header = ['sample', 'split', 'r1', 'r2']
    else:
        proper_header = ['sample', 'split', 'fasta']

    missing_columns = [
        column_title for column_title in proper_header
        if column_title not in self.sample_info.columns
    ]
    if missing_columns:
        # the first placeholder is the path of the file, not the table itself
        raise ConfigError(
            "The samples_txt file, '%s', is not properly formatted, "
            "as the following columns are missing: '%s'."
            % (self.samples_txt_file, ', '.join(missing_columns)))

    for sample_name in self.sample_info['sample']:
        try:
            u.check_sample_id(sample_name)
        except ConfigError as e:
            raise ConfigError(
                "While processing the samples_txt file, '%s', "
                "Anvi'o ran into the following error: %s" % (self.samples_txt_file, e))

    unknown_split_types = [
        split_type for split_type in self.sample_info['split']
        if split_type not in TRNASeqWorkflow.known_split_types
    ]
    if unknown_split_types:
        run.warning(
            "Some of the names of split types in the samples_txt file, '%s', "
            "are not what we were expecting (%s). "
            "That's okay, but Anvi'o decided it should warn you. "
            "Here are the names of split types that are not in our little list: %s. "
            % (self.samples_txt_file,
               ', '.join(TRNASeqWorkflow.known_split_types),
               ', '.join(sorted(set(unknown_split_types)))))

    if self.run_iu_merge_pairs:
        fastq_paths = self.sample_info['r1'].tolist() + self.sample_info['r2'].tolist()

        bad_fastq_paths = [
            s for s in fastq_paths if not filesnpaths.is_file_exists(s, dont_raise=True)
        ]
        if bad_fastq_paths:
            raise ConfigError(
                "The following FASTQ files in the samples_txt file, '%s', cannot be found: %s."
                % (self.samples_txt_file, ', '.join(bad_fastq_paths)))

        bad_fastq_names = [
            s for s in fastq_paths
            if not s.endswith(('.fq', '.fq.gz', '.fastq', '.fastq.gz'))
        ]
        if bad_fastq_names:
            run.warning(
                "Some of the sequence files in the samples_txt file, '%s', "
                "do not end with '.fq', '.fq.gz', 'fastq' or '.fastq.gz'. "
                "That's okay, but Anvi'o decided it should warn you. "
                "Here are the first 5 such files that have unconventional file extensions: %s."
                % (self.samples_txt_file, ', '.join(bad_fastq_names[:5])))
    else:
        fasta_paths = self.sample_info['fasta'].tolist()

        bad_fasta_paths = [
            s for s in fasta_paths if not filesnpaths.is_file_exists(s, dont_raise=True)
        ]
        if bad_fasta_paths:
            raise ConfigError(
                "The following FASTA files in the samples_txt file, '%s', cannot be found: %s."
                % (self.samples_txt_file, ', '.join(bad_fasta_paths)))

        bad_fasta_names = [
            s for s in fasta_paths
            if not s.endswith(('.fa', '.fa.gz', '.fasta', '.fasta.gz'))
        ]
        if bad_fasta_names:
            run.warning(
                "Some of the FASTA files in the samples_txt file, '%s', "
                "do not end with '.fa', '.fa.gz', 'fasta' or '.fasta.gz'. "
                "That's okay, but Anvi'o decided it should warn you. "
                "Here are the first 5 such files that have unconventional file extensions: %s."
                % (self.samples_txt_file, ', '.join(bad_fasta_names[:5])))