def _genomes_to_process(self, genome_dir, batchfile, extension):
    """Get genomes to process.

    Genomes are collected from ``genome_dir`` when it is given; otherwise
    they are taken from ``batchfile``. The collected genomes are then
    validated before being returned.

    Parameters
    ----------
    genome_dir : str
        Directory containing genomes.
    batchfile : str
        File describing genomes.
    extension : str
        Extension of files to process.

    Returns
    -------
    genomic_files : d[genome_id] -> FASTA file
        Map of genomes to their genomic FASTA files.
    tln_tables : d[genome_id] -> translation table
        Translation tables as exposed by the parsed batchfile; empty when
        genomes are read from a directory.

    Raises
    ------
    GTDBTkExit
        If a genome path is not an existing file, if no genomes were
        found, or if a genome ID is also a GTDB-Tk reference genome ID.
        ``self._verify_genome_id`` is called on every ID and may reject
        it as well (its behaviour is defined elsewhere).
    """
    genomic_files, tln_tables = dict(), dict()
    if genome_dir:
        for f in os.listdir(genome_dir):
            if f.endswith(extension):
                genome_id = remove_extension(f, extension)
                genomic_files[genome_id] = os.path.join(genome_dir, f)

    elif batchfile:
        batchfile_fh = Batchfile(batchfile)
        genomic_files = batchfile_fh.genome_path
        tln_tables = batchfile_fh.genome_tln

    # Check that all of the genome IDs are valid.
    for genome_key in genomic_files:
        self._verify_genome_id(genome_key)

    # Collect every genome whose path is not an existing regular file.
    invalid_paths = [(genome_key, genome_path)
                     for genome_key, genome_path in genomic_files.items()
                     if not os.path.isfile(genome_path)]

    # Report on any invalid paths, naming the source the genomes actually
    # came from. A directory entry can match the extension and still fail
    # os.path.isfile (e.g. a sub-directory or a broken symlink), in which
    # case blaming the batchfile would be misleading.
    if invalid_paths:
        n_invalid = len(invalid_paths)
        if genome_dir:
            self.warnings.info(f'Reading from directory: {genome_dir}')
            self.warnings.error(f'The following {n_invalid} genomes '
                                f'in the genome directory are not files:')
            exit_msg = (f'There are {n_invalid} paths in the '
                        f'genome directory which are not files, '
                        f'see gtdb.warnings.log')
        else:
            self.warnings.info(f'Reading from batchfile: {batchfile}')
            self.warnings.error(f'The following {n_invalid} genomes '
                                f'have invalid paths specified in the batchfile:')
            exit_msg = (f'There are {n_invalid} paths in the '
                        f'batchfile which do not exist, see gtdb.warnings.log')

        # Emitted as "<path>\t<genome ID>", the same column order the
        # original report produced.
        for genome_key, genome_path in invalid_paths:
            self.warnings.info(f'{genome_path}\t{genome_key}')
        raise GTDBTkExit(exit_msg)

    if len(genomic_files) == 0:
        if genome_dir:
            self.logger.error('No genomes found in directory: %s. Check '
                              'the --extension flag used to identify '
                              'genomes.' % genome_dir)
        else:
            self.logger.error('No genomes found in batch file: %s. Please '
                              'check the format of this file.' % batchfile)
        # The reason has already been logged, so no message is attached.
        raise GTDBTkExit

    # User genomes must not reuse the ID of a GTDB-Tk reference genome.
    invalid_genomes = set(genomic_files) & set(get_reference_ids())
    if invalid_genomes:
        self.warnings.info(f'The following {len(invalid_genomes)} have the '
                           f'same ID as GTDB-Tk reference genomes:')
        for invalid_genome in sorted(invalid_genomes):
            self.warnings.info(invalid_genome)
        raise GTDBTkExit(f'You have {len(invalid_genomes)} genomes with the '
                         f'same id as GTDB-Tk reference genomes, please '
                         f'rename them. See gtdb.warnings.log.')

    return genomic_files, tln_tables
def _genomes_to_process(self, genome_dir, batchfile, extension):
    """Get genomes to process.

    Genomes are collected from ``genome_dir`` when it is given; otherwise
    they are parsed from ``batchfile``, a tab-separated file with one
    genome per row: ``<genome file>\\t<genome ID>[\\t<translation table>]``.

    Parameters
    ----------
    genome_dir : str
        Directory containing genomes.
    batchfile : str
        File describing genomes.
    extension : str
        Extension of files to process.

    Returns
    -------
    genomic_files : d[genome_id] -> FASTA file
        Map of genomes to their genomic FASTA files.
    tln_tables : d[genome_id] -> int
        Translation table (4 or 11) for each genome whose batchfile row
        specified one; empty when genomes are read from a directory.

    Raises
    ------
    GTDBTkExit
        If the batchfile is malformed (wrong column count, unsupported
        translation table, missing field, duplicated genome ID), if a
        genome path is not an existing file, if no genomes were found, or
        if a genome ID is also a GTDB-Tk reference genome ID.
        ``self._verify_genome_id`` is called on every batchfile ID and may
        reject it as well (its behaviour is defined elsewhere).
    """
    genomic_files, tln_tables = dict(), dict()
    if genome_dir:
        for f in os.listdir(genome_dir):
            if f.endswith(extension):
                genome_id = remove_extension(f, extension)
                genomic_files[genome_id] = os.path.join(genome_dir, f)

    elif batchfile:
        # Genome files already registered. Kept as a set so the duplicate
        # check is O(1) per row instead of scanning genomic_files.values().
        seen_files = set()
        with open(batchfile, "r") as fh:
            for line_no, line in enumerate(fh, start=1):
                line_split = line.strip().split("\t")
                if line_split[0] == '':
                    continue  # blank line

                if len(line_split) not in {2, 3}:
                    raise GTDBTkExit('Batch file must contain either 2 '
                                     'columns (detect translation table), '
                                     'or 3 (specify translation table).')

                if len(line_split) == 2:
                    genome_file, genome_id = line_split
                else:
                    genome_file, genome_id, tln_table = line_split
                    if tln_table not in {'4', '11'}:
                        raise GTDBTkExit('Specified translation table must '
                                         'be either 4, or 11.')
                    tln_tables[genome_id] = int(tln_table)

                self._verify_genome_id(genome_id)

                # The genome-file check is defensive: rows whose first
                # column is empty were already skipped as blank above.
                if not genome_file:
                    raise GTDBTkExit('Missing genome file on line %d.' % line_no)
                elif not genome_id:
                    raise GTDBTkExit('Missing genome ID on line %d.' % line_no)
                elif genome_id in genomic_files:
                    raise GTDBTkExit('Genome ID %s appears multiple times.' % genome_id)

                # A repeated file is only a warning; a repeated ID is fatal.
                if genome_file in seen_files:
                    self.logger.warning('Genome file appears multiple times: %s' % genome_file)

                genomic_files[genome_id] = genome_file
                seen_files.add(genome_file)

    # Collect every genome whose path is not an existing regular file.
    invalid_paths = [(genome_key, genome_path)
                     for genome_key, genome_path in genomic_files.items()
                     if not os.path.isfile(genome_path)]

    # Report on any invalid paths, naming the source the genomes actually
    # came from. A directory entry can match the extension and still fail
    # os.path.isfile (e.g. a sub-directory or a broken symlink), in which
    # case blaming the batchfile would be misleading.
    if invalid_paths:
        n_invalid = len(invalid_paths)
        if genome_dir:
            self.warnings.info(f'Reading from directory: {genome_dir}')
            self.warnings.error(f'The following {n_invalid} genomes '
                                f'in the genome directory are not files:')
            exit_msg = (f'There are {n_invalid} paths in the '
                        f'genome directory which are not files, '
                        f'see gtdb.warnings.log')
        else:
            self.warnings.info(f'Reading from batchfile: {batchfile}')
            self.warnings.error(f'The following {n_invalid} genomes '
                                f'have invalid paths specified in the batchfile:')
            exit_msg = (f'There are {n_invalid} paths in the '
                        f'batchfile which do not exist, see gtdb.warnings.log')

        # Emitted as "<path>\t<genome ID>", matching the batchfile's own
        # column order and the output of the original report.
        for genome_key, genome_path in invalid_paths:
            self.warnings.info(f'{genome_path}\t{genome_key}')
        raise GTDBTkExit(exit_msg)

    if not genomic_files:
        if genome_dir:
            self.logger.error('No genomes found in directory: %s. Check '
                              'the --extension flag used to identify '
                              'genomes.' % genome_dir)
        else:
            self.logger.error('No genomes found in batch file: %s. Please '
                              'check the format of this file.' % batchfile)
        # The reason has already been logged, so no message is attached.
        raise GTDBTkExit

    # User genomes must not reuse the ID of a GTDB-Tk reference genome.
    invalid_genomes = set(genomic_files) & set(get_reference_ids())
    if invalid_genomes:
        self.warnings.info(f'The following {len(invalid_genomes)} have the '
                           f'same ID as GTDB-Tk reference genomes:')
        for invalid_genome in sorted(invalid_genomes):
            self.warnings.info(invalid_genome)
        raise GTDBTkExit(f'You have {len(invalid_genomes)} genomes with the '
                         f'same id as GTDB-Tk reference genomes, please '
                         f'rename them. See gtdb.warnings.log.')

    return genomic_files, tln_tables