Beispiel #1
0
    def _genomes_to_process(self, genome_dir, batchfile, extension):
        """Get genomes to process.

        Parameters
        ----------
        genome_dir : str
            Directory containing genomes.
        batchfile : str
            File describing genomes.
        extension : str
            Extension of files to process.

        Returns
        -------
        genomic_files : d[genome_id] -> FASTA file
            Map of genomes to their genomic FASTA files.
        """

        genomic_files, tln_tables = dict(), dict()
        if genome_dir:
            for f in os.listdir(genome_dir):
                if f.endswith(extension):
                    genome_id = remove_extension(f, extension)
                    genomic_files[genome_id] = os.path.join(genome_dir, f)

        elif batchfile:
            batchfile_fh = Batchfile(batchfile)
            genomic_files, tln_tables = batchfile_fh.genome_path, batchfile_fh.genome_tln

        # Check that all of the genome IDs are valid.
        for genome_key in genomic_files:
            self._verify_genome_id(genome_key)

        # Check that the prefix is valid and the path exists
        invalid_paths = list()
        for genome_key, genome_path in genomic_files.items():

            if not os.path.isfile(genome_path):
                invalid_paths.append((genome_key, genome_path))

        # Report on any invalid paths
        if len(invalid_paths) > 0:
            self.warnings.info(f'Reading from batchfile: {batchfile}')
            self.warnings.error(f'The following {len(invalid_paths)} genomes '
                                f'have invalid paths specified in the batchfile:')
            for g_path, g_gid in invalid_paths:
                self.warnings.info(f'{g_gid}\t{g_path}')
            raise GTDBTkExit(f'There are {len(invalid_paths)} paths in the '
                             f'batchfile which do not exist, see gtdb.warnings.log')

        if len(genomic_files) == 0:
            if genome_dir:
                self.logger.error('No genomes found in directory: %s. Check '
                                  'the --extension flag used to identify '
                                  'genomes.' % genome_dir)
            else:
                self.logger.error('No genomes found in batch file: %s. Please '
                                  'check the format of this file.' % batchfile)
            raise GTDBTkExit

        invalid_genomes = set(genomic_files.keys()) & set(get_reference_ids())
        if len(invalid_genomes) > 0:
            self.warnings.info(f'The following {len(invalid_genomes)} have the '
                               f'same ID as GTDB-Tk reference genomes:')
            for invalid_genome in sorted(invalid_genomes):
                self.warnings.info(invalid_genome)
            raise GTDBTkExit(f'You have {len(invalid_genomes)} genomes with the '
                             f'same id as GTDB-Tk reference genomes, please '
                             f'rename them. See gtdb.warnings.log.')

        return genomic_files, tln_tables
Beispiel #2
0
    def _genomes_to_process(self, genome_dir, batchfile, extension):
        """Get genomes to process.

        Parameters
        ----------
        genome_dir : str
            Directory containing genomes.
        batchfile : str
            File describing genomes.
        extension : str
            Extension of files to process.

        Returns
        -------
        genomic_files : d[genome_id] -> FASTA file
            Map of genomes to their genomic FASTA files.
        """

        genomic_files, tln_tables = dict(), dict()
        if genome_dir:
            for f in os.listdir(genome_dir):
                if f.endswith(extension):
                    genome_id = remove_extension(f, extension)
                    genomic_files[genome_id] = os.path.join(genome_dir, f)

        elif batchfile:
            with open(batchfile, "r") as fh:
                for line_no, line in enumerate(fh):
                    line_split = line.strip().split("\t")
                    if line_split[0] == '':
                        continue  # blank line

                    if len(line_split) not in {2, 3}:
                        raise GTDBTkExit('Batch file must contain either 2 '
                                         'columns (detect translation table), '
                                         'or 3 (specify translation table).')

                    if len(line_split) == 2:
                        genome_file, genome_id = line_split
                    elif len(line_split) == 3:
                        genome_file, genome_id, tln_table = line_split
                        if tln_table not in {'4', '11'}:
                            raise GTDBTkExit(
                                'Specified translation table must '
                                'be either 4, or 11.')
                        tln_tables[genome_id] = int(tln_table)

                    self._verify_genome_id(genome_id)

                    if genome_file is None or genome_file == '':
                        raise GTDBTkExit('Missing genome file on line %d.' %
                                         (line_no + 1))
                    elif genome_id is None or genome_id == '':
                        raise GTDBTkExit('Missing genome ID on line %d.' %
                                         (line_no + 1))
                    elif genome_id in genomic_files:
                        raise GTDBTkExit(
                            'Genome ID %s appears multiple times.' % genome_id)
                    if genome_file in genomic_files.values():
                        self.logger.warning(
                            'Genome file appears multiple times: %s' %
                            genome_file)

                    genomic_files[genome_id] = genome_file

        # Check that the prefix is valid and the path exists
        invalid_paths = list()
        for genome_key, genome_path in genomic_files.items():

            if not os.path.isfile(genome_path):
                invalid_paths.append((genome_key, genome_path))

        # Report on any invalid paths
        if len(invalid_paths) > 0:
            self.warnings.info(f'Reading from batchfile: {batchfile}')
            self.warnings.error(
                f'The following {len(invalid_paths)} genomes '
                f'have invalid paths specified in the batchfile:')
            for g_path, g_gid in invalid_paths:
                self.warnings.info(f'{g_gid}\t{g_path}')
            raise GTDBTkExit(
                f'There are {len(invalid_paths)} paths in the '
                f'batchfile which do not exist, see gtdb.warnings.log')

        if len(genomic_files) == 0:
            if genome_dir:
                self.logger.error('No genomes found in directory: %s. Check '
                                  'the --extension flag used to identify '
                                  'genomes.' % genome_dir)
            else:
                self.logger.error('No genomes found in batch file: %s. Please '
                                  'check the format of this file.' % batchfile)
            raise GTDBTkExit

        invalid_genomes = set(genomic_files.keys()) & set(get_reference_ids())
        if len(invalid_genomes) > 0:
            self.warnings.info(
                f'The following {len(invalid_genomes)} have the '
                f'same ID as GTDB-Tk reference genomes:')
            for invalid_genome in sorted(invalid_genomes):
                self.warnings.info(invalid_genome)
            raise GTDBTkExit(
                f'You have {len(invalid_genomes)} genomes with the '
                f'same id as GTDB-Tk reference genomes, please '
                f'rename them. See gtdb.warnings.log.')

        return genomic_files, tln_tables