Exemple #1
0
    def init_samples_txt(self):
        """Sanity-check the samples table stored in `self.samples_information`.

        Raises ConfigError when the 'sample' column is absent, when sample names
        repeat, when `u.check_sample_id` rejects a name, when the 'r1'/'r2'
        columns are absent, or when the extension check on the listed files
        cannot be carried out. Files that do not end with '.fastq' or
        '.fastq.gz' only produce a warning through `run.warning`.
        """
        samples = self.samples_information

        if 'sample' not in samples.columns.values:
            raise ConfigError("You know what. This '%s' file does not look anything like "
                              "a samples file." % self.samples_txt_file)

        sample_names = samples['sample']
        if len(set(sample_names)) != len(sample_names):
            raise ConfigError("Names of samples in your samples_txt file must be unique. "
                              "It looks like some names appear twice in your file: %s" % self.samples_txt_file)

        # one try block around the whole loop; the first rejected name aborts it
        try:
            for sample_name in sample_names:
                u.check_sample_id(sample_name)
        except ConfigError as e:
            raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran into the following error: "
                              "%s" % (self.samples_txt_file, e))

        has_read_columns = 'r1' in samples.columns and 'r2' in samples
        if not has_read_columns:
            raise ConfigError("Looks like your samples_txt file, '%s', is not properly formatted. "
                              "We are not sure what's wrong, but we expected to find columns with "
                              "titles 'r1' and 'r2' and we did not find such columns." % self.samples_txt_file)

        read_files = [*samples['r1'], *samples['r2']]
        try:
            bad_fastq_names = [name for name in read_files
                               if not (name.endswith('.fastq') or name.endswith('.fastq.gz'))]
        except Exception as e:
            raise ConfigError(f"The format of your samples txt file does not seem to be working for anvi'o. This is "
                              f"what the downstream processes had to say: '{e}'. Please double-check columns in "
                              f"your samples txt.")

        if not bad_fastq_names:
            return

        run.warning("We noticed some of your sequence files in '%s' do not end with either '.fastq' "
                    "or '.fastq.gz'. That's okay, but anvi'o decided it should warn you. Here are the first "
                    "5 such files that have unconventional file extensions: %s." \
                     % (self.samples_txt_file, ', '.join(bad_fastq_names[:5])))
Exemple #2
0
    def init_samples_txt(self):
        """Validate the samples table stored in `self.samples_information`.

        Checks, in order: the 'sample' column exists, sample names are unique,
        every name passes `u.check_sample_id`, the 'r1' and 'r2' columns exist,
        and every entry in them ends with '.fastq' or '.fastq.gz'.

        Raises:
            ConfigError: if any of the checks above fails.
        """
        samples = self.samples_information

        # Messages are built from adjacent string literals: a backslash
        # continuation inside a literal would embed the indentation of the
        # next source line in the text shown to the user.
        if 'sample' not in samples.columns.values:
            raise ConfigError("You know what. This '%s' file does not look anything like "
                              "a samples file." % self.samples_txt_file)

        sample_names = samples['sample']
        if len(sample_names) != len(set(sample_names)):
            raise ConfigError("Names of samples in your samples_txt file must be unique. "
                              "It looks like some names appear twice in your file: %s" % self.samples_txt_file)

        for sample in sample_names:
            try:
                u.check_sample_id(sample)
            except ConfigError as e:
                raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran into the following error: "
                                  "%s" % (self.samples_txt_file, e)) from e

        if 'r1' not in samples.columns or 'r2' not in samples.columns:
            raise ConfigError("Looks like your samples_txt file, '%s', is not properly formatted. "
                              "We are not sure what's wrong, but we expected to find columns with "
                              "titles 'r1' and 'r2' and we did not find such columns." % self.samples_txt_file)

        fastq_file_names = list(samples['r1']) + list(samples['r2'])
        try:
            bad_fastq_names = [s for s in fastq_file_names if not s.endswith(('.fastq', '.fastq.gz'))]
        except AttributeError as e:
            # a non-string entry (for instance an empty cell parsed as NaN) has no `endswith`
            raise ConfigError("Some entries in the 'r1' and 'r2' columns of your samples_txt file, '%s', "
                              "do not look like file names (%s). Please double-check these "
                              "columns." % (self.samples_txt_file, e)) from e

        if bad_fastq_names:
            raise ConfigError("We require that all fastq file names end with either '.fastq' "
                              "or '.fastq.gz'. Some or all of the file names in %s aren't formatted "
                              "accordingly. These are the file names we don't like: "
                              "%s" % (self.samples_txt_file, ', '.join(bad_fastq_names)))
Exemple #3
0
    def init(self):
        """Run the parent `init`, load the samples_txt table and validate its sample names.

        The path of the table comes from the 'samples_txt' config parameter; the
        table is read as a TAB-delimited file into `self.samples_information`.

        Raises:
            ConfigError: if the table has no 'sample' column, or if
                `u.check_sample_id` rejects one of the sample names.
        """
        super().init()

        # loading the samples.txt file
        samples_txt_file = self.get_param_value_from_config(['samples_txt'])
        filesnpaths.is_file_exists(samples_txt_file)
        # getting the samples information (names, [group], path to r1, path to r2) from samples.txt
        self.samples_information = pd.read_csv(samples_txt_file,
                                               sep='\t',
                                               index_col=False)

        # Messages are built from adjacent string literals: a backslash
        # continuation inside a literal would embed the indentation of the
        # next source line in the text shown to the user.
        if 'sample' not in self.samples_information.columns.values:
            raise ConfigError("You know what. This '%s' file does not look anything like "
                              "a samples file." % samples_txt_file)

        for sample in self.samples_information['sample']:
            try:
                u.check_sample_id(sample)
            except ConfigError as e:
                raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran into the following error: "
                                  "%s" % (samples_txt_file, e)) from e

        self.sanity_check_for_kraken()
Exemple #4
0
    def init_samples_txt(self):
        """Validate the samples table stored in `self.samples_information`.

        Checks, in order: the 'sample' column exists, sample names are unique,
        every name passes `u.check_sample_id`, the 'r1' and 'r2' columns exist,
        and every entry in them ends with '.fastq' or '.fastq.gz'.

        Raises:
            ConfigError: if any of the checks above fails.
        """
        samples = self.samples_information

        # Messages are built from adjacent string literals: a backslash
        # continuation inside a literal would embed the indentation of the
        # next source line in the text shown to the user.
        if 'sample' not in samples.columns.values:
            raise ConfigError("You know what. This '%s' file does not look anything like "
                              "a samples file." % self.samples_txt_file)

        sample_names = samples['sample']
        if len(sample_names) != len(set(sample_names)):
            raise ConfigError("Names of samples in your samples_txt file must be unique. "
                              "It looks like some names appear twice in your file: %s" % self.samples_txt_file)

        for sample in sample_names:
            try:
                u.check_sample_id(sample)
            except ConfigError as e:
                raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran into the following error: "
                                  "%s" % (self.samples_txt_file, e)) from e

        if 'r1' not in samples.columns or 'r2' not in samples.columns:
            raise ConfigError("Looks like your samples_txt file, '%s', is not properly formatted. "
                              "We are not sure what's wrong, but we expected to find columns with "
                              "titles 'r1' and 'r2' and we did not find such columns." % self.samples_txt_file)

        fastq_file_names = list(samples['r1']) + list(samples['r2'])
        try:
            bad_fastq_names = [s for s in fastq_file_names if not s.endswith(('.fastq', '.fastq.gz'))]
        except AttributeError as e:
            # a non-string entry (for instance an empty cell parsed as NaN) has no `endswith`
            raise ConfigError("Some entries in the 'r1' and 'r2' columns of your samples_txt file, '%s', "
                              "do not look like file names (%s). Please double-check these "
                              "columns." % (self.samples_txt_file, e)) from e

        if bad_fastq_names:
            raise ConfigError("We require that all fastq file names end with either '.fastq' "
                              "or '.fastq.gz'. Some or all of the file names in %s aren't formatted "
                              "accordingly. These are the file names we don't like: "
                              "%s" % (self.samples_txt_file, ', '.join(bad_fastq_names)))
Exemple #5
0
 def set_sample_id(self):
     """Validate `self.sample_id`, deriving it from the output directory name when unset.

     A derived id is the last component of `self.output_directory` with '-'
     replaced by '_' and an 's' prepended when it starts with a character
     found in `constants.digits`. The id, given or derived, is passed to
     `utils.check_sample_id`.
     """
     if self.sample_id:
         utils.check_sample_id(self.sample_id)
         return

     # normpath drops a trailing separator: without it 'out/' has an empty
     # basename and the first-character test below raises IndexError
     self.sample_id = os.path.basename(os.path.normpath(self.output_directory))
     self.sample_id = self.sample_id.replace('-', '_')
     # the emptiness guard covers the file-system root, whose basename is ''
     if self.sample_id and self.sample_id[0] in constants.digits:
         self.sample_id = 's' + self.sample_id
     utils.check_sample_id(self.sample_id)
Exemple #6
0
 def set_sample_id(self):
     """Validate `self.sample_id`, deriving it from the output directory name when unset.

     A derived id is the last component of `self.output_directory` with '-'
     replaced by '_' and an 's' prepended when it starts with a character
     found in `constants.digits`. The id, given or derived, is passed to
     `utils.check_sample_id`.
     """
     if self.sample_id:
         utils.check_sample_id(self.sample_id)
         return

     # normpath drops a trailing separator: without it 'out/' has an empty
     # basename and the first-character test below raises IndexError
     self.sample_id = os.path.basename(os.path.normpath(self.output_directory))
     self.sample_id = self.sample_id.replace('-', '_')
     # the emptiness guard covers the file-system root, whose basename is ''
     if self.sample_id and self.sample_id[0] in constants.digits:
         self.sample_id = 's' + self.sample_id
     utils.check_sample_id(self.sample_id)
Exemple #7
0
    def load_references_for_removal(self):
        """Load and perform some sanity checks on the references for removal.

        Reads `self.references_for_removal_txt` into `self.references_for_removal`,
        merges it into `self.fasta_information`, validates reference names and
        paths, and sets `self.remove_short_reads_based_on_references` to True
        unless the 'dont_remove_just_map' config parameter is set.

        Raises:
            ConfigError: if a reference name is rejected by `u.check_sample_id`,
                if an entry has no 'path' column, or (in references mode) if a
                name also appears in `self.contigs_information`.
        """
        self.references_for_removal = u.get_TAB_delimited_file_as_dictionary(
            self.references_for_removal_txt)
        # adding the references_for_removal to the fasta_information dict
        self.fasta_information.update(self.references_for_removal)

        for sample in self.references_for_removal:
            try:
                u.check_sample_id(sample)
            except ConfigError as e:
                # report the file that was actually read here, not the samples txt
                raise ConfigError(
                    "While processing the references for removal txt file ('%s'), anvi'o ran into the following error: "
                    "%s" % (self.references_for_removal_txt, e)) from e

        files_that_end_with_gz = []
        for ref_dict in self.references_for_removal.values():
            if 'path' not in ref_dict:
                raise ConfigError(
                    'Your references for removal txt file is not formatted properly. It must have only two columns '
                    'with the headers "reference" and "path".')
            if ref_dict['path'].endswith('.gz'):
                filesnpaths.is_file_exists(ref_dict['path'])
                files_that_end_with_gz.append(ref_dict['path'])
            else:
                # if the file is not compressed then we can verify that it is a fasta file
                filesnpaths.is_file_fasta_formatted(ref_dict['path'])

        if files_that_end_with_gz:
            run.warning(
                'The following reference for removal files are compressed: %s. '
                'That\'s fine, but it means that we will skip the '
                'sanity check to verify that this is actually '
                'a properly formatted fasta file. Things are '
                'probably Ok, this is just one of these occasions '
                'in which anvi\'o is oversharing.' %
                ', '.join(files_that_end_with_gz))

        if self.references_mode:
            # Make sure that the user didn't give the same name to references and references_for_removal
            ref_name_in_both = [
                r for r in self.references_for_removal
                if r in self.contigs_information
            ]
            if ref_name_in_both:
                raise ConfigError(
                    'You must have unique names for your fasta files in your fasta txt file '
                    'and your references for removal txt file. These are the names that appear '
                    'in both: %s' % ', '.join(ref_name_in_both))
        dont_remove = self.get_param_value_from_config(
            ['remove_short_reads_based_on_references', 'dont_remove_just_map'])
        if not dont_remove:
            self.remove_short_reads_based_on_references = True
Exemple #8
0
 def set_sample_id(self):
     """Validate `self.sample_id`, or derive one from the input paths when it is unset.

     With an input file path, the id is the upper-cased file name up to '.BAM',
     with '-' and '.' replaced by '_' and an 's' prepended when it starts with
     a character found in `constants.digits`; that id goes through
     `utils.check_sample_id`. With a serialized profile path, the id becomes
     the name of the directory holding the profile. Both paths are stored
     back as absolute paths.
     """
     if self.sample_id:
         utils.check_sample_id(self.sample_id)
         return

     if self.input_file_path:
         self.input_file_path = os.path.abspath(self.input_file_path)
         bam_name = os.path.basename(self.input_file_path)
         self.sample_id = bam_name.upper().split('.BAM')[0].replace('-', '_').replace('.', '_')
         if self.sample_id[0] in constants.digits:
             self.sample_id = 's' + self.sample_id
         utils.check_sample_id(self.sample_id)

     if self.serialized_profile_path:
         self.serialized_profile_path = os.path.abspath(self.serialized_profile_path)
         profile_dir = os.path.dirname(self.serialized_profile_path)
         self.sample_id = os.path.basename(profile_dir)
Exemple #9
0
 def set_sample_id(self):
     """Check the sample id if one is set; otherwise build it from the input paths.

     An id built from `self.input_file_path` is the upper-cased file name cut
     at ".BAM", with "-" and "." turned into "_" and an "s" prefix when its
     first character is in `constants.digits`; it is then checked with
     `utils.check_sample_id`. An id built from `self.serialized_profile_path`
     is the name of the profile's parent directory.
     """
     if self.sample_id:
         utils.check_sample_id(self.sample_id)
         return

     if self.input_file_path:
         self.input_file_path = os.path.abspath(self.input_file_path)
         candidate = os.path.basename(self.input_file_path).upper().split(".BAM")[0]
         for unwanted in ("-", "."):
             candidate = candidate.replace(unwanted, "_")
         self.sample_id = candidate
         if self.sample_id[0] in constants.digits:
             self.sample_id = "s" + self.sample_id
         utils.check_sample_id(self.sample_id)

     if self.serialized_profile_path:
         absolute_profile_path = os.path.abspath(self.serialized_profile_path)
         self.serialized_profile_path = absolute_profile_path
         self.sample_id = os.path.basename(os.path.dirname(absolute_profile_path))
Exemple #10
0
 def set_sample_id(self):
     """Make sure `self.sample_id` is set and, where applicable, validated.

     A given id is checked with `utils.check_sample_id`. Otherwise the id is
     taken from the input file name (upper-cased, cut at '.BAM', '-' and '.'
     replaced by '_', 's' prepended when the first character is in
     `constants.digits`, then checked), and/or from the name of the directory
     that contains the serialized profile.
     """
     if not self.sample_id:
         if self.input_file_path:
             self.input_file_path = os.path.abspath(self.input_file_path)
             file_name = os.path.basename(self.input_file_path)
             stem, *_ = file_name.upper().split('.BAM')
             self.sample_id = stem.replace('-', '_').replace('.', '_')
             starts_with_digit = self.sample_id[0] in constants.digits
             if starts_with_digit:
                 self.sample_id = 's' + self.sample_id
             utils.check_sample_id(self.sample_id)
         if self.serialized_profile_path:
             self.serialized_profile_path = os.path.abspath(self.serialized_profile_path)
             containing_dir = os.path.dirname(self.serialized_profile_path)
             self.sample_id = os.path.basename(containing_dir)
     else:
         utils.check_sample_id(self.sample_id)
Exemple #11
0
 def check_project_name(self):
     """Validate the tRNA-seq project name when anvi-merge-trnaseq is going to run.

     Does nothing unless `self.run_anvi_merge_trnaseq` is set. Raises
     ConfigError when the config has no project name or when
     `u.check_sample_id` rejects it.
     """
     if not self.run_anvi_merge_trnaseq:
         return

     project_name = self.get_param_value_from_config(['anvi_merge_trnaseq', '--project-name'])
     if not project_name:
         raise ConfigError("Since you are running anvi-merge-trnaseq, "
                           "please provide a project name for the sample(s) in the config file.")

     try:
         u.check_sample_id(project_name)
     except ConfigError as e:
         message = ("While checking the project name, '%s', "
                    "anvi'o ran into the following error: %s" % (project_name, e))
         raise ConfigError(message)
Exemple #12
0
    def check_sample_names(self):
        """Validate the names in the 'sample' column of `self.sample_info`.

        Every name is passed to `u.check_sample_id`; afterwards the names are
        required to be unique. Raises ConfigError on the first failure.
        """
        sample_names = self.sample_info['sample']

        # one try block around the whole loop; the first rejected name aborts it
        try:
            for sample_name in sample_names:
                u.check_sample_id(sample_name)
        except ConfigError as e:
            raise ConfigError("While processing the samples_txt file, '%s', "
                              "anvi'o ran into the following error: %s" % (self.samples_txt_file, e))

        has_duplicates = len(sample_names) != len(set(sample_names))
        if has_duplicates:
            raise ConfigError("Sample names in the samples_txt file, '%s', must be unique." % self.samples_txt_file)
Exemple #13
0
    def sanity_check_for_samples_txt(self):
        """Validate the samples table stored in `self.samples_information`.

        Checks, in order: the 'sample' column exists, every sample name passes
        `u.check_sample_id`, the 'r1' and 'r2' columns exist, and every entry
        in them ends with '.fastq' or '.fastq.gz'.

        Raises:
            ConfigError: if any of the checks above fails.
        """
        samples = self.samples_information

        # Messages are built from adjacent string literals: a backslash
        # continuation inside a literal would embed the indentation of the
        # next source line in the text shown to the user.
        if 'sample' not in samples.columns.values:
            raise ConfigError("You know what. This '%s' file does not look anything like "
                              "a samples file." % self.samples_txt_file)

        for sample in samples['sample']:
            try:
                u.check_sample_id(sample)
            except ConfigError as e:
                raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran into the following error: "
                                  "%s" % (self.samples_txt_file, e)) from e

        # without this check a missing column surfaces as a bare KeyError below
        if 'r1' not in samples.columns or 'r2' not in samples.columns:
            raise ConfigError("Looks like your samples_txt file, '%s', is not properly formatted. "
                              "We are not sure what's wrong, but we expected to find columns with "
                              "titles 'r1' and 'r2' and we did not find such columns." % self.samples_txt_file)

        fastq_file_names = list(samples['r1']) + list(samples['r2'])
        try:
            bad_fastq_names = [s for s in fastq_file_names if not s.endswith(('.fastq', '.fastq.gz'))]
        except AttributeError as e:
            # a non-string entry (for instance an empty cell parsed as NaN) has no `endswith`
            raise ConfigError("Some entries in the 'r1' and 'r2' columns of your samples_txt file, '%s', "
                              "do not look like file names (%s). Please double-check these "
                              "columns." % (self.samples_txt_file, e)) from e

        if bad_fastq_names:
            raise ConfigError("We require that all fastq file names end with either '.fastq' "
                              "or '.fastq.gz'. Some or all of the file names in %s aren't formatted "
                              "accordingly. These are the file names we don't like: "
                              "%s" % (self.samples_txt_file, ', '.join(bad_fastq_names)))
Exemple #14
0
    def sanity_check_for_samples_txt(self):
        """Validate the samples table stored in `self.samples_information`.

        Checks, in order: the 'sample' column exists, every sample name passes
        `u.check_sample_id`, the 'r1' and 'r2' columns exist, and every entry
        in them ends with '.fastq' or '.fastq.gz'.

        Raises:
            ConfigError: if any of the checks above fails.
        """
        samples = self.samples_information

        # Messages are built from adjacent string literals: a backslash
        # continuation inside a literal would embed the indentation of the
        # next source line in the text shown to the user.
        if 'sample' not in samples.columns.values:
            raise ConfigError("You know what. This '%s' file does not look anything like "
                              "a samples file." % self.samples_txt_file)

        for sample in samples['sample']:
            try:
                u.check_sample_id(sample)
            except ConfigError as e:
                raise ConfigError("While processing the samples txt file ('%s'), anvi'o ran into the following error: "
                                  "%s" % (self.samples_txt_file, e)) from e

        # without this check a missing column surfaces as a bare KeyError below
        if 'r1' not in samples.columns or 'r2' not in samples.columns:
            raise ConfigError("Looks like your samples_txt file, '%s', is not properly formatted. "
                              "We are not sure what's wrong, but we expected to find columns with "
                              "titles 'r1' and 'r2' and we did not find such columns." % self.samples_txt_file)

        fastq_file_names = list(samples['r1']) + list(samples['r2'])
        try:
            bad_fastq_names = [s for s in fastq_file_names if not s.endswith(('.fastq', '.fastq.gz'))]
        except AttributeError as e:
            # a non-string entry (for instance an empty cell parsed as NaN) has no `endswith`
            raise ConfigError("Some entries in the 'r1' and 'r2' columns of your samples_txt file, '%s', "
                              "do not look like file names (%s). Please double-check these "
                              "columns." % (self.samples_txt_file, e)) from e

        if bad_fastq_names:
            raise ConfigError("We require that all fastq file names end with either '.fastq' "
                              "or '.fastq.gz'. Some or all of the file names in %s aren't formatted "
                              "accordingly. These are the file names we don't like: "
                              "%s" % (self.samples_txt_file, ', '.join(bad_fastq_names)))
Exemple #15
0
    def load_references_for_removal(self):
        """Load and perform some sanity checks on the references for removal.

        Reads `self.references_for_removal_txt` into `self.references_for_removal`,
        merges it into `self.fasta_information`, validates reference names and
        paths, and sets `self.remove_short_reads_based_on_references` to True
        unless the 'dont_remove_just_map' config parameter is set.

        Raises:
            ConfigError: if a reference name is rejected by `u.check_sample_id`,
                if an entry has no 'path' column, or (in references mode) if a
                name also appears in `self.contigs_information`.
        """
        self.references_for_removal = u.get_TAB_delimited_file_as_dictionary(self.references_for_removal_txt)
        # adding the references_for_removal to the fasta_information dict
        self.fasta_information.update(self.references_for_removal)

        # Messages are built from adjacent string literals: a backslash
        # continuation inside a literal would embed the indentation of the
        # next source line in the text shown to the user.
        for sample in self.references_for_removal:
            try:
                u.check_sample_id(sample)
            except ConfigError as e:
                # report the file that was actually read here, not the samples txt
                raise ConfigError("While processing the references for removal txt file ('%s'), anvi'o ran into "
                                  "the following error: %s" % (self.references_for_removal_txt, e)) from e

        files_that_end_with_gz = []
        for ref_dict in self.references_for_removal.values():
            if 'path' not in ref_dict:
                raise ConfigError('Your references for removal txt file is not formatted properly. It must have '
                                  'only two columns with the headers "reference" and "path".')
            if ref_dict['path'].endswith('.gz'):
                filesnpaths.is_file_exists(ref_dict['path'])
                files_that_end_with_gz.append(ref_dict['path'])
            else:
                # if the file is not compressed then we can verify that it is a fasta file
                filesnpaths.is_file_fasta_formatted(ref_dict['path'])

        if files_that_end_with_gz:
            run.warning("The following reference for removal files are compressed: %s. "
                        "That's fine, but it means that we will skip the "
                        "sanity check to verify that this is actually "
                        "a properly formatted fasta file. Things are "
                        "probably Ok, this is just one of these occasions "
                        "in which anvi'o is oversharing." % ', '.join(files_that_end_with_gz))

        if self.references_mode:
            # Make sure that the user didn't give the same name to references and references_for_removal
            ref_name_in_both = [r for r in self.references_for_removal if r in self.contigs_information]
            if ref_name_in_both:
                raise ConfigError('You must have unique names for your fasta files in your fasta txt file '
                                  'and your references for removal txt file. These are the names that appear '
                                  'in both: %s' % ', '.join(ref_name_in_both))
        dont_remove = self.get_param_value_from_config(['remove_short_reads_based_on_references', 'dont_remove_just_map'])
        if not dont_remove:
            self.remove_short_reads_based_on_references = True
Exemple #16
0
def NameIsOK(n):
    """Return True when `check_sample_id` accepts `n`, False when it raises ConfigError."""
    try:
        check_sample_id(n)
    except ConfigError:
        return False
    else:
        return True
def NameIsOK(n):
    """Tell whether `n` passes `check_sample_id` (a ConfigError means it does not)."""
    is_ok = True
    try:
        check_sample_id(n)
    except ConfigError:
        is_ok = False
    return is_ok
Exemple #18
0
    def check_samples_txt(self):
        """Validate the header, sample names, split types and file paths of the samples_txt table.

        The expected columns depend on `self.run_iu_merge_pairs`: 'sample',
        'split', 'r1', 'r2' when it is set, otherwise 'sample', 'split',
        'fasta'. Unknown split types and unconventional file extensions only
        produce warnings through `run.warning`.

        Raises:
            ConfigError: if a column is missing, if `u.check_sample_id` rejects
                a sample name, or if a listed FASTQ/FASTA file cannot be found.
        """
        if self.run_iu_merge_pairs:
            proper_header = ['sample', 'split', 'r1', 'r2']
        else:
            proper_header = ['sample', 'split', 'fasta']
        missing_columns = [
            column_title for column_title in proper_header
            if column_title not in self.sample_info.columns
        ]
        if missing_columns:
            # name the file in the message, not the table that was read from it
            raise ConfigError(
                "The samples_txt file, '%s', is not properly formatted, "
                "as the following columns are missing: '%s'." %
                (self.samples_txt_file, ', '.join(missing_columns)))

        for sample_name in self.sample_info['sample']:
            try:
                u.check_sample_id(sample_name)
            except ConfigError as e:
                raise ConfigError(
                    "While processing the samples_txt file, '%s', "
                    "Anvi'o ran into the following error: %s" %
                    (self.samples_txt_file, e)) from e

        unknown_split_types = [
            split_type for split_type in self.sample_info['split']
            if split_type not in TRNASeqWorkflow.known_split_types
        ]
        if unknown_split_types:
            run.warning(
                "Some of the names of split types in the samples_txt file, '%s', "
                "are not what we were expecting (%s). "
                "That's okay, but Anvi'o decided it should warn you. "
                "Here are the names of split types that are not in our little list: %s. "
                % (self.samples_txt_file,
                   ', '.join(TRNASeqWorkflow.known_split_types),
                   ', '.join(sorted(set(unknown_split_types)))))

        if self.run_iu_merge_pairs:
            fastq_paths = self.sample_info['r1'].tolist() + self.sample_info['r2'].tolist()
            bad_fastq_paths = [
                s for s in fastq_paths
                if not filesnpaths.is_file_exists(s, dont_raise=True)
            ]
            if bad_fastq_paths:
                raise ConfigError(
                    "The following FASTQ files in the samples_txt file, '%s', cannot be found: %s."
                    % (self.samples_txt_file, ', '.join(bad_fastq_paths)))
            bad_fastq_names = [
                s for s in fastq_paths
                if not s.endswith(('.fq', '.fq.gz', '.fastq', '.fastq.gz'))
            ]
            if bad_fastq_names:
                run.warning(
                    "Some of the sequence files in the samples_txt file, '%s', "
                    "do not end with '.fq', '.fq.gz', '.fastq' or '.fastq.gz'. "
                    "That's okay, but Anvi'o decided it should warn you. "
                    "Here are the first 5 such files that have unconventional file extensions: %s."
                    % (self.samples_txt_file, ', '.join(bad_fastq_names[:5])))
        else:
            fasta_paths = self.sample_info['fasta'].tolist()

            bad_fasta_paths = [
                s for s in fasta_paths
                if not filesnpaths.is_file_exists(s, dont_raise=True)
            ]
            if bad_fasta_paths:
                raise ConfigError(
                    "The following FASTA files in the samples_txt file, '%s', cannot be found: %s."
                    % (self.samples_txt_file, ', '.join(bad_fasta_paths)))

            bad_fasta_names = [
                s for s in fasta_paths
                if not s.endswith(('.fa', '.fa.gz', '.fasta', '.fasta.gz'))
            ]
            if bad_fasta_names:
                run.warning(
                    "Some of the FASTA files in the samples_txt file, '%s', "
                    "do not end with '.fa', '.fa.gz', '.fasta' or '.fasta.gz'. "
                    "That's okay, but Anvi'o decided it should warn you. "
                    "Here are the first 5 such files that have unconventional file extensions: %s."
                    % (self.samples_txt_file, ', '.join(bad_fasta_names[:5])))