Esempio n. 1
0
    def _parse_fastani_results(self, fastout_file, dict_results):
        """Parse a FastANI output file and add its hits to ``dict_results``.

        Every line is split on whitespace and read as: query (user) genome
        path, reference genome path, ANI, and two counts. The alignment
        fraction (AF) is the fourth column divided by the fifth, rounded to
        two decimal places.

        Parameters
        ----------
        fastout_file : str
            Path to the FastANI output file.
        dict_results : dict
            Results gathered so far. It is updated in place; an existing
            entry for the same user/reference pair is overwritten.

        Returns
        -------
        dict
            dict_results[user_g] = {ref_genome1: {"af": af, "ani": ani},
                                    ref_genome2: {"af": af, "ani": ani}}
        """
        with open(fastout_file, 'r') as handle:
            for record in handle:
                # split() without arguments already ignores surrounding
                # whitespace, including the trailing newline.
                columns = record.split()
                ref_name = os.path.basename(columns[1])
                ref_genome = ref_name.replace(Config.FASTANI_GENOMES_EXT, "")
                user_g = remove_extension(os.path.basename(columns[0]))
                ani = float(columns[2])
                af = round(float(columns[3]) / float(columns[4]), 2)

                hit = {"ani": ani, "af": af}
                if user_g not in dict_results:
                    dict_results[user_g] = {ref_genome: hit}
                else:
                    dict_results[user_g][ref_genome] = hit
        return dict_results
Esempio n. 2
0
    def _parse_fastani_results_reverse(self, fastout_file, dict_parser_distance):
        """Parse the FastANI output of the reverse comparison, keeping the best values.

        In the reverse run the first column is the reference genome and the
        second column is the user genome. For a user/reference pair that is
        already present, ANI and AF are each raised to the larger of the
        stored and the newly parsed value (independently of one another);
        unknown pairs are simply added.

        Parameters
        ----------
        fastout_file : str
            Path to the FastANI output file.
        dict_parser_distance : dict
            Dictionary of user genomes vs. reference genomes with ANI and AF.
            It is updated in place.

        Returns
        -------
        dict
            dict_parser_distance[user_g] = {ref_genome1: {"af": af, "ani": ani},
                                            ref_genome2: {"af": af, "ani": ani}}
        """
        # TODO: Merge _parse_fastani_results and _parse_fastani_results_reverse
        with open(fastout_file) as handle:
            for record in handle:
                # split() without arguments already ignores surrounding
                # whitespace, including the trailing newline.
                columns = record.split()
                ref_name = os.path.basename(columns[0])
                ref_genome = ref_name.replace(Config.FASTANI_GENOMES_EXT, "")
                user_g = remove_extension(os.path.basename(columns[1]))
                ani = float(columns[2])
                af = round(float(columns[3]) / float(columns[4]), 2)

                # First hit for this user genome.
                if user_g not in dict_parser_distance:
                    dict_parser_distance[user_g] = {
                        ref_genome: {"ani": ani, "af": af}}
                    continue

                # First hit for this user/reference pair.
                per_reference = dict_parser_distance.get(user_g)
                if ref_genome not in per_reference:
                    dict_parser_distance[user_g][ref_genome] = {
                        "ani": ani, 'af': af}
                    continue

                # Pair already known: keep the larger ANI and the larger AF.
                best = per_reference.get(ref_genome)
                if best.get('ani') < ani:
                    best["ani"] = ani
                if best.get('af') < af:
                    best["af"] = af
        return dict_parser_distance
Esempio n. 3
0
    def _genomes_to_process(self, genome_dir, batchfile, extension):
        """Collect and validate the genomes that should be processed.

        Genomes come from ``genome_dir`` when it is given, otherwise from
        ``batchfile``.

        Parameters
        ----------
        genome_dir : str
            Directory containing genomes.
        batchfile : str
            File describing genomes.
        extension : str
            Extension of files to process.

        Returns
        -------
        genomic_files : d[genome_id] -> FASTA file
            Map of genomes to their genomic FASTA files.
        tln_tables : dict
            ``genome_tln`` of the parsed batch file; empty when genomes are
            read from a directory.

        Raises
        ------
        GTDBTkExit
            If a path is not a file, no genomes are found, or a genome ID
            matches a GTDB-Tk reference genome ID.
        """
        genomic_files, tln_tables = {}, {}
        if genome_dir:
            for file_name in os.listdir(genome_dir):
                if not file_name.endswith(extension):
                    continue
                genome_id = remove_extension(file_name, extension)
                genomic_files[genome_id] = os.path.join(genome_dir, file_name)
        elif batchfile:
            batchfile_fh = Batchfile(batchfile)
            genomic_files = batchfile_fh.genome_path
            tln_tables = batchfile_fh.genome_tln

        # Every genome ID has to pass validation before paths are inspected.
        for gid in genomic_files:
            self._verify_genome_id(gid)

        # Gather (genome ID, path) pairs whose path is not an existing file.
        invalid_paths = [(gid, path) for gid, path in genomic_files.items()
                         if not os.path.isfile(path)]

        # Report on any invalid paths (written as "<path>\t<genome ID>").
        if invalid_paths:
            n_invalid = len(invalid_paths)
            self.warnings.info(f'Reading from batchfile: {batchfile}')
            self.warnings.error(f'The following {n_invalid} genomes '
                                f'have invalid paths specified in the batchfile:')
            for gid, path in invalid_paths:
                self.warnings.info(f'{path}\t{gid}')
            raise GTDBTkExit(f'There are {n_invalid} paths in the '
                             f'batchfile which do not exist, see gtdb.warnings.log')

        if len(genomic_files) == 0:
            if genome_dir:
                message = ('No genomes found in directory: %s. Check '
                           'the --extension flag used to identify '
                           'genomes.' % genome_dir)
            else:
                message = ('No genomes found in batch file: %s. Please '
                           'check the format of this file.' % batchfile)
            self.logger.error(message)
            raise GTDBTkExit

        # User genome IDs must not collide with reference genome IDs.
        invalid_genomes = set(genomic_files.keys()).intersection(
            get_reference_ids())
        if invalid_genomes:
            n_clashes = len(invalid_genomes)
            self.warnings.info(f'The following {n_clashes} have the '
                               f'same ID as GTDB-Tk reference genomes:')
            for clashing_id in sorted(invalid_genomes):
                self.warnings.info(clashing_id)
            raise GTDBTkExit(f'You have {n_clashes} genomes with the '
                             f'same id as GTDB-Tk reference genomes, please '
                             f'rename them. See gtdb.warnings.log.')

        return genomic_files, tln_tables
Esempio n. 4
0
File: main.py Progetto: 31380/GTDBTk
    def _genomes_to_process(self, genome_dir, batchfile, extension):
        """Get genomes to process.

        Genomes come from ``genome_dir`` when it is given, otherwise from
        the two-column, tab-separated ``batchfile`` (genome file, genome ID).

        Parameters
        ----------
        genome_dir : str
            Directory containing genomes.
        batchfile : str
            File describing genomes.
        extension : str
            Extension of files to process.

        Returns
        -------
        genomic_files : d[genome_id] -> FASTA file
            Map of genomes to their genomic FASTA files.

        Raises
        ------
        GenomeBatchfileMalformed
            If a batch file line does not have exactly 2 columns.
        GTDBTkExit
            If a genome file or ID is missing, an ID is duplicated, an ID
            starts with a reference prefix, a path is not a file, or no
            genomes are found.
        """

        genomic_files = {}
        if genome_dir:
            for f in os.listdir(genome_dir):
                if f.endswith(extension):
                    genome_id = remove_extension(f)
                    genomic_files[genome_id] = os.path.join(genome_dir, f)

        elif batchfile:
            # Files registered so far. A set makes the duplicate-file check
            # O(1) per line instead of scanning genomic_files.values().
            seen_files = set()
            with open(batchfile, "r") as fh:
                for line_no, line in enumerate(fh, start=1):
                    line_split = line.strip().split("\t")
                    if line_split[0] == '':
                        continue  # blank line

                    if len(line_split) != 2:
                        self.logger.error(
                            'Batch file must contain exactly 2 columns.')
                        raise GenomeBatchfileMalformed

                    genome_file, genome_id = line_split
                    self._verify_genome_id(genome_id)

                    if not genome_file:
                        raise GTDBTkExit('Missing genome file on line %d.' %
                                         line_no)
                    elif not genome_id:
                        raise GTDBTkExit('Missing genome ID on line %d.' %
                                         line_no)
                    elif genome_id in genomic_files:
                        raise GTDBTkExit(
                            'Genome ID %s appears multiple times.' % genome_id)
                    if genome_file in seen_files:
                        # Same file under several IDs is allowed, only warned.
                        self.logger.warning(
                            'Genome file appears multiple times: %s' %
                            genome_file)

                    genomic_files[genome_id] = genome_file
                    seen_files.add(genome_file)

        # Check that the prefix is valid and the path exists
        reference_prefixes = ("RS_", "GB_", "UBA")
        invalid_paths = list()
        for genome_key, genome_path in genomic_files.items():
            if genome_key.startswith(reference_prefixes):
                self.logger.error(
                    "Submitted genomes start with the same prefix"
                    " (RS_,GB_,UBA) as reference genomes in"
                    " GTDB-Tk. This will cause issues for"
                    " downstream analysis.")
                raise GTDBTkExit

            if not os.path.isfile(genome_path):
                invalid_paths.append((genome_key, genome_path))

        # Report on any invalid paths (written as "<path>\t<genome ID>")
        if invalid_paths:
            self.warnings.info(f'Reading from batchfile: {batchfile}')
            self.warnings.error(
                f'The following {len(invalid_paths)} genomes '
                f'have invalid paths specified in the batchfile:')
            for genome_key, genome_path in invalid_paths:
                self.warnings.info(f'{genome_path}\t{genome_key}')
            raise GTDBTkExit(
                f'There are {len(invalid_paths)} paths in the '
                f'batchfile which do not exist, see gtdb.warnings.log')

        if len(genomic_files) == 0:
            if genome_dir:
                self.logger.error('No genomes found in directory: %s. Check '
                                  'the --extension flag used to identify '
                                  'genomes.' % genome_dir)
            else:
                self.logger.error('No genomes found in batch file: %s. Please '
                                  'check the format of this file.' % batchfile)
            raise GTDBTkExit

        return genomic_files
Esempio n. 5
0
    def _producer(self, genome_file):
        """Apply prodigal to genome with most suitable translation table.

        When ``self.called_genes`` is set, the input is treated as an
        already-called protein file and is only copied to the output
        directory. Otherwise Prodigal is run once per candidate translation
        table (``self.translation_table`` if set, else 4 and 11) inside a
        temporary directory, and the files of the selected table are copied
        to the output directory.

        Parameters
        ----------
        genome_file : str
            Fasta file for genome.

        Returns
        -------
        tuple or None
            ``(genome_id, aa_gene_file, nt_gene_file, gff_file,
            best_translation_table, coding_density_4, coding_density_11,
            probability_4, probability_11)``. Values that were not computed
            are -1. ``None`` is returned when the genome is empty, Prodigal
            exits with a non-zero code, or no genes were called.
        """

        genome_id = remove_extension(genome_file)

        aa_gene_file = os.path.join(self.output_dir, genome_id + '_genes.faa')
        nt_gene_file = os.path.join(self.output_dir, genome_id + '_genes.fna')
        gff_file = os.path.join(self.output_dir, genome_id + '.gff')

        best_translation_table = -1
        table_coding_density = {4: -1, 11: -1}
        table_prob = {4: -1, 11: -1}
        if self.called_genes:
            # Copy without going through a shell: paths containing spaces or
            # shell metacharacters are handled correctly, and a failed copy
            # raises instead of being silently ignored.
            shutil.copyfile(os.path.abspath(genome_file), aa_gene_file)
        else:
            seqs = read_fasta(genome_file)

            if len(seqs) == 0:
                self.logger.warning(
                    'Cannot call Prodigal on an empty genome. Skipped: {}'.
                    format(genome_file))
                return None

            tmp_dir = tempfile.mkdtemp()
            try:
                # determine number of bases
                total_bases = sum(len(seq) for seq in seqs.values())

                # call genes under different translation tables
                if self.translation_table:
                    translation_tables = [self.translation_table]
                else:
                    translation_tables = [4, 11]

                # check if there are sufficient bases to calculate prodigal parameters
                if total_bases < 100000 or self.meta:
                    proc_str = 'meta'  # use best precalculated parameters
                else:
                    proc_str = 'single'  # estimate parameters from data

                # If this is a gzipped genome, re-write the uncompressed genome file to disk
                # (done once; the same file serves every translation table)
                prodigal_input = genome_file
                if genome_file.endswith('.gz'):
                    prodigal_input = os.path.join(
                        tmp_dir,
                        os.path.basename(genome_file[0:-3]) + '.fna')
                    write_fasta(seqs, prodigal_input)

                translation_table_gffs = dict()
                tln_table_stats = dict()
                for translation_table in translation_tables:
                    table_dir = os.path.join(tmp_dir, str(translation_table))
                    os.makedirs(table_dir)
                    aa_gene_file_tmp = os.path.join(table_dir,
                                                    genome_id + '_genes.faa')
                    nt_gene_file_tmp = os.path.join(table_dir,
                                                    genome_id + '_genes.fna')

                    args = [
                        'prodigal', '-m', '-p', proc_str, '-q', '-f', 'gff',
                        '-g', str(translation_table), '-a', aa_gene_file_tmp,
                        '-d', nt_gene_file_tmp, '-i', prodigal_input
                    ]
                    if self.closed_ends:
                        args.append('-c')

                    self.logger.debug('{}: {}'.format(genome_id,
                                                      ' '.join(args)))

                    # universal_newlines=True makes the captured output text
                    # rather than bytes on Python 3; the GFF is later written
                    # to a file opened in text mode, which rejects bytes.
                    proc = subprocess.Popen(args,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.STDOUT,
                                            universal_newlines=True)
                    # stderr is merged into stdout, so only stdout is used
                    gff_stdout, _ = proc.communicate()

                    translation_table_gffs[translation_table] = gff_stdout

                    if proc.returncode != 0:
                        self.logger.warning(
                            'Prodigal returned a non-zero exit code while processing: {}'
                            .format(genome_file))
                        return None

                    # determine coding density
                    prodigal_parser = ProdigalGeneFeatureParser(gff_stdout)

                    # Skip if no genes were called.
                    if prodigal_parser.n_sequences_processed() == 0:
                        self.logger.warning(
                            'No genes were called! Check the quality of your genome. Skipped: {}'
                            .format(genome_file))
                        return None

                    # Save the statistics for this translation table
                    prodigal_stats = prodigal_parser.generate_statistics()
                    tln_table_stats[translation_table] = prodigal_stats
                    table_coding_density[
                        translation_table] = prodigal_stats.coding_density

                # determine best translation table
                if not self.translation_table:

                    # Logistic classifier coefficients
                    b0 = 12.363017423768538
                    bi = np.array([
                        0.01212327382066545, -0.9250857181041326,
                        -0.10176647009345675, 0.7733711446656522,
                        0.6355731038236031, -0.1631355971443377,
                        -0.14713264317198863, -0.10320909026025472,
                        0.09621494439016824, 0.4992209080695785,
                        1.159933669041023, -0.0507139271834123,
                        1.2619603455217179, 0.24392226222721214,
                        -0.08567859197118802, -0.18759562346413916,
                        0.13136209122186523, -0.1399459561138417,
                        2.08086235029142, 0.6917662070950119
                    ])

                    # Scale x
                    scaler_mean = np.array([
                        0.0027036907781622732, -1.8082140490218692,
                        -8.511942254988097e-08, 19.413811775420918,
                        12.08719100126732, 249.89521467118365,
                        0.0011868456444391487, -0.0007358432829349235,
                        0.004750880986023392, -0.04096159411654551,
                        -0.12505492579693805, -0.03749033894554058,
                        0.13053986993752234, -0.15914556336256136,
                        -0.6075506034967058, 0.06704648371665446,
                        0.04316693333324335, 0.26905236546875266,
                        0.010326462563249823, 333.3320678912514
                    ])
                    scaler_scale = np.array([
                        0.08442772272873166, 2.043313786484819,
                        2.917510891467501e-05, 22.577812640992242,
                        12.246767248868036, 368.87834547339907,
                        0.0014166252200216657, 0.0014582164250905056,
                        0.025127203671053467, 0.5095427815162036,
                        0.2813128128116135, 0.2559877920464989,
                        1.274371529860827, 0.7314782174742842,
                        1.6885750374356985, 0.17019369029012987,
                        0.15376309021975043, 0.583965556283342,
                        0.025076680822882474, 544.3648797867784
                    ])
                    # Feature vector: table 11 statistics minus table 4
                    # statistics, standardised with the scaler constants.
                    xi = np.array(tln_table_stats[11]) - np.array(
                        tln_table_stats[4])
                    xi -= scaler_mean
                    xi /= scaler_scale

                    # If xi are all 0, then P(11) = 1.
                    prob_tbl_11 = 1 / (1 + np.exp(-1 * (b0 + (bi * xi).sum())))
                    best_translation_table = 11 if prob_tbl_11 >= 0.5 else 4
                    table_prob[4] = 1.0 - prob_tbl_11
                    table_prob[11] = prob_tbl_11

                else:
                    best_translation_table = self.translation_table

                best_dir = os.path.join(tmp_dir, str(best_translation_table))
                shutil.copyfile(
                    os.path.join(best_dir, genome_id + '_genes.faa'),
                    aa_gene_file)
                shutil.copyfile(
                    os.path.join(best_dir, genome_id + '_genes.fna'),
                    nt_gene_file)
                with open(gff_file, 'w') as f:
                    f.write(translation_table_gffs[best_translation_table])
            finally:
                # clean up temporary files on every exit path, including
                # early returns and exceptions
                shutil.rmtree(tmp_dir)
        return (genome_id, aa_gene_file, nt_gene_file, gff_file,
                best_translation_table, table_coding_density[4],
                table_coding_density[11], table_prob[4], table_prob[11])
Esempio n. 6
0
    def _genomes_to_process(self, genome_dir, batchfile, extension):
        """Get genomes to process.

        Genomes come from ``genome_dir`` when it is given, otherwise from
        the two-column, tab-separated ``batchfile`` (genome file, genome ID).
        Insertion order is preserved in the returned mapping.

        Parameters
        ----------
        genome_dir : str
            Directory containing genomes.
        batchfile : str
            File describing genomes.
        extension : str
            Extension of files to process.

        Returns
        -------
        genomic_files : d[genome_id] -> FASTA file
            Map of genomes to their genomic FASTA files.

        Raises
        ------
        GenomeBatchfileMalformed
            If a batch file line is malformed or a genome ID is duplicated.
        GenomeNameInvalid
            If a genome ID starts with a reference prefix (RS_, GB_, UBA).
        NoGenomesFound
            If no genomes are found.
        """

        genomic_files = OrderedDict()
        if genome_dir:
            self.logger.debug(
                'Looking for genomes with extension *.{} in: {}'.format(
                    extension, genome_dir))
            for f in os.listdir(genome_dir):
                if f.endswith(extension):
                    genome_id = remove_extension(f)
                    genomic_files[genome_id] = os.path.join(genome_dir, f)
                    self.logger.debug('Found genome: {}'.format(genome_id))

        elif batchfile:
            self.logger.debug(
                'Using genomes specified in: {}'.format(batchfile))
            # Files registered so far. A set makes the duplicate-file check
            # O(1) per line instead of scanning genomic_files.values().
            seen_files = set()
            with open(batchfile, 'r') as f:
                # Iterate the handle directly rather than loading every line
                # into memory with readlines().
                for line_no, line in enumerate(f, start=1):
                    line_split = line.strip().split('\t')
                    if line_split[0] == '':
                        continue  # blank line

                    if len(line_split) != 2:
                        self.logger.error(
                            'Batch file must contain exactly 2 columns.')
                        raise GenomeBatchfileMalformed

                    genome_file, genome_id = line_split
                    self._assert_genome_id_valid(genome_id)

                    if not genome_file:
                        self.logger.error('Missing genome file on line %d.' %
                                          line_no)
                        raise GenomeBatchfileMalformed
                    elif not genome_id:
                        self.logger.error('Missing genome ID on line %d.' %
                                          line_no)
                        raise GenomeBatchfileMalformed
                    elif genome_id in genomic_files:
                        self.logger.error(
                            'Genome ID %s appear multiple times.' % genome_id)
                        raise GenomeBatchfileMalformed
                    if genome_file in seen_files:
                        # Same file under several IDs is allowed, only warned.
                        self.logger.warning(
                            'Genome file appears multiple times: %s' %
                            genome_file)

                    genomic_files[genome_id] = genome_file
                    seen_files.add(genome_file)
                    self.logger.debug('Found genome {} at: {}'.format(
                        genome_id, genome_file))

        # Iterating the mapping yields its keys on both Python 2 and 3;
        # iterkeys() only exists on Python 2.
        reference_prefixes = ("RS_", "GB_", "UBA")
        for genome_key in genomic_files:
            if genome_key.startswith(reference_prefixes):
                self.logger.error(
                    "Submitted genomes start with the same prefix (RS_,GB_,UBA) as "
                    "reference genomes in GTDB-Tk. This will cause issues for "
                    "downstream analysis.")
                raise GenomeNameInvalid

        if len(genomic_files) == 0:
            if genome_dir:
                self.logger.error(
                    'No genomes found in directory: %s. Check the --extension flag used to identify '
                    'genomes.' % genome_dir)
            else:
                self.logger.error(
                    'No genomes found in batch file: %s. Please check the format of this file.'
                    % batchfile)
            raise NoGenomesFound

        return genomic_files
Esempio n. 7
0
    def _genomes_to_process(self, genome_dir, batchfile, extension):
        """Get genomes to process.

        Genomes come from ``genome_dir`` when it is given, otherwise from
        the tab-separated ``batchfile``. Each batch file line holds the
        genome file and genome ID, optionally followed by a translation
        table (4 or 11).

        Parameters
        ----------
        genome_dir : str
            Directory containing genomes.
        batchfile : str
            File describing genomes.
        extension : str
            Extension of files to process.

        Returns
        -------
        genomic_files : d[genome_id] -> FASTA file
            Map of genomes to their genomic FASTA files.
        tln_tables : d[genome_id] -> int
            Translation table for each genome whose batch file line
            specified one; empty when genomes are read from a directory.

        Raises
        ------
        GTDBTkExit
            If the batch file is malformed, a path is not a file, no genomes
            are found, or a genome ID matches a GTDB-Tk reference genome ID.
        """

        genomic_files, tln_tables = dict(), dict()
        if genome_dir:
            for f in os.listdir(genome_dir):
                if f.endswith(extension):
                    genome_id = remove_extension(f, extension)
                    genomic_files[genome_id] = os.path.join(genome_dir, f)

        elif batchfile:
            # Files registered so far. A set makes the duplicate-file check
            # O(1) per line instead of scanning genomic_files.values().
            seen_files = set()
            with open(batchfile, "r") as fh:
                for line_no, line in enumerate(fh, start=1):
                    line_split = line.strip().split("\t")
                    if line_split[0] == '':
                        continue  # blank line

                    if len(line_split) not in {2, 3}:
                        raise GTDBTkExit('Batch file must contain either 2 '
                                         'columns (detect translation table), '
                                         'or 3 (specify translation table).')

                    if len(line_split) == 2:
                        genome_file, genome_id = line_split
                    else:
                        genome_file, genome_id, tln_table = line_split
                        if tln_table not in {'4', '11'}:
                            raise GTDBTkExit(
                                'Specified translation table must '
                                'be either 4, or 11.')
                        tln_tables[genome_id] = int(tln_table)

                    self._verify_genome_id(genome_id)

                    if not genome_file:
                        raise GTDBTkExit('Missing genome file on line %d.' %
                                         line_no)
                    elif not genome_id:
                        raise GTDBTkExit('Missing genome ID on line %d.' %
                                         line_no)
                    elif genome_id in genomic_files:
                        raise GTDBTkExit(
                            'Genome ID %s appears multiple times.' % genome_id)
                    if genome_file in seen_files:
                        # Same file under several IDs is allowed, only warned.
                        self.logger.warning(
                            'Genome file appears multiple times: %s' %
                            genome_file)

                    genomic_files[genome_id] = genome_file
                    seen_files.add(genome_file)

        # Check that every path points to an existing file
        invalid_paths = [(genome_key, genome_path)
                         for genome_key, genome_path in genomic_files.items()
                         if not os.path.isfile(genome_path)]

        # Report on any invalid paths (written as "<path>\t<genome ID>")
        if invalid_paths:
            self.warnings.info(f'Reading from batchfile: {batchfile}')
            self.warnings.error(
                f'The following {len(invalid_paths)} genomes '
                f'have invalid paths specified in the batchfile:')
            for genome_key, genome_path in invalid_paths:
                self.warnings.info(f'{genome_path}\t{genome_key}')
            raise GTDBTkExit(
                f'There are {len(invalid_paths)} paths in the '
                f'batchfile which do not exist, see gtdb.warnings.log')

        if len(genomic_files) == 0:
            if genome_dir:
                self.logger.error('No genomes found in directory: %s. Check '
                                  'the --extension flag used to identify '
                                  'genomes.' % genome_dir)
            else:
                self.logger.error('No genomes found in batch file: %s. Please '
                                  'check the format of this file.' % batchfile)
            raise GTDBTkExit

        # User genome IDs must not collide with reference genome IDs
        invalid_genomes = set(genomic_files.keys()) & set(get_reference_ids())
        if invalid_genomes:
            self.warnings.info(
                f'The following {len(invalid_genomes)} have the '
                f'same ID as GTDB-Tk reference genomes:')
            for invalid_genome in sorted(invalid_genomes):
                self.warnings.info(invalid_genome)
            raise GTDBTkExit(
                f'You have {len(invalid_genomes)} genomes with the '
                f'same id as GTDB-Tk reference genomes, please '
                f'rename them. See gtdb.warnings.log.')

        return genomic_files, tln_tables