Ejemplo n.º 1
0
    def call_genes(self, options):
        """Call genes in every input genome and write a per-genome summary.

        Runs Prodigal over the genome files found in ``options.genome_dir``
        and writes ``call_genes.summary.tsv`` into ``options.output_dir``.
        The program exits (via ``sys.exit()``) when no genome files are found.

        Parameters
        ----------
        options : object
            Parsed command options; this method reads ``output_dir``,
            ``genome_dir``, ``genome_ext``, ``cpus`` and ``force_table``.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - call_genes] Identifying genes within genomes.')
        self.logger.info('*******************************************************************************')

        make_sure_path_exists(options.output_dir)

        genome_files = self._genome_files(options.genome_dir, options.genome_ext)
        if not genome_files:
            self.logger.warning('  [Warning] No genome files found. Check the --genome_ext flag used to identify genomes.')
            sys.exit()

        prodigal = Prodigal(options.cpus)
        summary_stats = prodigal.run(genome_files, False, options.force_table, False, options.output_dir)

        # write gene calling summary; the context manager guarantees the file
        # is closed even if formatting one of the rows raises
        summary_file = os.path.join(options.output_dir, 'call_genes.summary.tsv')
        with open(summary_file, 'w') as fout:
            fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
            # items() instead of iteritems() so this works on Python 2 and 3
            for genome_id, stats in summary_stats.items():
                fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                         stats.best_translation_table,
                                                         stats.coding_density_4,
                                                         stats.coding_density_11))

        self.logger.info('')
        self.logger.info('  Identified genes written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()
Ejemplo n.º 2
0
    def call_genes(self, options):
        """Identify genes in genomes and, optionally, in unbinned scaffolds.

        Prodigal is run over every genome file found in
        ``options.genome_nt_dir``; results go to ``options.output_dir``.
        When ``options.unbinned_file`` is set, Prodigal is run once more on
        that file with ``meta=True`` and the results go to an ``unbinned``
        sub-directory. The program exits if the nucleotide check on the
        genome files fails.
        """
        for header_line in ('',
                            '*******************************************************************************',
                            ' [RefineM - call_genes] Identifying genes within genomes.',
                            '*******************************************************************************'):
            self.logger.info(header_line)

        nt_dir = options.genome_nt_dir
        check_dir_exists(nt_dir)
        out_dir = options.output_dir
        make_sure_path_exists(out_dir)

        nt_files = self._genome_files(nt_dir, options.genome_ext)
        all_nucleotide = self._check_nuclotide_seqs(nt_files)
        if not all_nucleotide:
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        # genes for the binned genomes
        gene_caller = Prodigal(options.cpus)
        gene_caller.run(nt_files, out_dir)
        self.logger.info('  Genes in genomes written to: %s' % out_dir)

        # genes for the unbinned scaffolds, only when such a file was given
        unbinned = options.unbinned_file
        if unbinned:
            unbinned_dir = os.path.join(out_dir, 'unbinned')
            gene_caller.run([unbinned], unbinned_dir, meta=True)
            self.logger.info('  Genes in unbinned scaffolds written to: %s' % unbinned_dir)

        self.time_keeper.print_time_stamp()
Ejemplo n.º 3
0
    def call_genes(self, options):
        """Call genes in the input genomes and write a per-genome summary.

        Runs Prodigal over the files selected by ``options.input_genomes``
        and ``options.file_ext`` and writes ``call_genes.summary.tsv`` into
        ``options.output_dir``.

        Parameters
        ----------
        options : object
            Parsed command options; this method reads ``output_dir``,
            ``input_genomes``, ``file_ext``, ``cpus``, ``silent`` and
            ``force_table``.
        """

        make_sure_path_exists(options.output_dir)

        genome_files = self._input_files(options.input_genomes,
                                         options.file_ext)

        prodigal = Prodigal(options.cpus, not options.silent)
        summary_stats = prodigal.run(genome_files, options.output_dir, False,
                                     options.force_table, False)

        # write gene calling summary; the context manager closes the file
        # even if formatting one of the rows raises
        summary_file = os.path.join(options.output_dir,
                                    'call_genes.summary.tsv')
        with open(summary_file, 'w') as fout:
            fout.write(
                'Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n'
            )
            for genome_id, stats in summary_stats.items():
                fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' %
                           (genome_id, stats.best_translation_table,
                            stats.coding_density_4, stats.coding_density_11))

        self.logger.info('Identified genes written to: %s' %
                         options.output_dir)
Ejemplo n.º 4
0
    def call_genes(self, options):
        """Call genes in the input genomes and write a per-genome summary.

        Runs Prodigal (with ``closed_ends=True``) over the files selected by
        ``options.input_genomes`` and ``options.file_ext`` and writes
        ``call_genes.summary.tsv`` into ``options.output_dir``.

        Parameters
        ----------
        options : object
            Parsed command options; this method reads ``output_dir``,
            ``input_genomes``, ``file_ext``, ``cpus``, ``silent`` and
            ``force_table``.
        """

        make_sure_path_exists(options.output_dir)

        genome_files = self._input_files(options.input_genomes, options.file_ext)

        prodigal = Prodigal(options.cpus, not options.silent)
        summary_stats = prodigal.run(genome_files,
                                     options.output_dir,
                                     called_genes=False,
                                     translation_table=options.force_table,
                                     meta=False,
                                     closed_ends=True)

        # write gene calling summary; the context manager closes the file
        # even if formatting one of the rows raises
        summary_file = os.path.join(options.output_dir, 'call_genes.summary.tsv')
        with open(summary_file, 'w') as fout:
            fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
            for genome_id, stats in summary_stats.items():
                fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                         stats.best_translation_table,
                                                         stats.coding_density_4,
                                                         stats.coding_density_11))

        self.logger.info('Identified genes written to: %s' % options.output_dir)
Ejemplo n.º 5
0
    def _runProdigal(self, fasta_path):
        """Run Prodigal on a single FASTA file and rename its output files.

        Output is written to the ``self.userAnnotationDir`` sub-directory of
        the directory containing ``fasta_path``. Besides the renamed gene
        files, a ``prodigal_translation_table.tsv`` file and a checksum file
        for the protein file are written.

        Parameters
        ----------
        fasta_path : str
            Path to FASTA file to process.

        Returns
        -------
        tuple
            ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file)``
            paths of the files written.
        """

        temp_dir, fasta_file = os.path.split(fasta_path)
        output_dir = os.path.join(temp_dir, self.userAnnotationDir)
        # genome ID is the file name up to (excluding) its last underscore
        # NOTE(review): a file name without '_' loses its last character here
        genome_id = fasta_file[0:fasta_file.rfind('_')]

        prodigal = BioLibProdigal(1, False)
        summary_stats = prodigal.run([fasta_path], output_dir)
        # a single genome was submitted, so take its (only) entry;
        # keys() is wrapped in list() because dict views are not
        # subscriptable on Python 3
        summary_stats = summary_stats[list(summary_stats.keys())[0]]

        # rename output files to adhere to GTDB conventions
        aa_gene_file = os.path.join(
            output_dir, genome_id + ConfigMetadata.PROTEIN_FILE_SUFFIX)
        shutil.move(summary_stats.aa_gene_file, aa_gene_file)

        nt_gene_file = os.path.join(
            output_dir, genome_id + ConfigMetadata.NT_GENE_FILE_SUFFIX)
        shutil.move(summary_stats.nt_gene_file, nt_gene_file)

        gff_file = os.path.join(output_dir,
                                genome_id + ConfigMetadata.GFF_FILE_SUFFIX)
        shutil.move(summary_stats.gff_file, gff_file)

        # save translation table information
        translation_table_file = os.path.join(
            output_dir, 'prodigal_translation_table.tsv')
        with open(translation_table_file, 'w') as fout:
            fout.write(
                '%s\t%d\n' %
                ('best_translation_table',
                 summary_stats.best_translation_table))
            fout.write(
                '%s\t%.2f\n' %
                ('coding_density_4', summary_stats.coding_density_4 * 100))
            fout.write(
                '%s\t%.2f\n' %
                ('coding_density_11', summary_stats.coding_density_11 * 100))

        # store the checksum of the protein file next to it
        checksum = sha256(aa_gene_file)
        with open(aa_gene_file + ConfigMetadata.CHECKSUM_SUFFIX, 'w') as fout:
            fout.write(checksum)

        return (aa_gene_file, nt_gene_file, gff_file, translation_table_file)
Ejemplo n.º 6
0
    def call_genes(self, options):
        """Identify genes in genomes and, optionally, in unbinned scaffolds.

        Prodigal is run over every genome file found in
        ``options.genome_nt_dir``; results go to ``options.output_dir``.
        When ``options.unbinned_file`` is set, Prodigal is run once more on
        that file with ``meta=True`` and the results go to an ``unbinned``
        sub-directory. The program exits if the nucleotide check on the
        genome files fails.
        """

        nt_dir = options.genome_nt_dir
        check_dir_exists(nt_dir)
        out_dir = options.output_dir
        make_sure_path_exists(out_dir)

        nt_files = self._genome_files(nt_dir, options.genome_ext)
        all_nucleotide = self._check_nuclotide_seqs(nt_files)
        if not all_nucleotide:
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()

        # genes for the binned genomes
        gene_caller = Prodigal(options.cpus)
        gene_caller.run(nt_files, out_dir)
        self.logger.info('Genes in genomes written to: %s' % out_dir)

        # nothing more to do unless a file of unbinned scaffolds was given
        unbinned = options.unbinned_file
        if not unbinned:
            return

        unbinned_dir = os.path.join(out_dir, 'unbinned')
        gene_caller.run([unbinned], unbinned_dir, meta=True)
        self.logger.info('Genes in unbinned scaffolds written to: %s' % unbinned_dir)
Ejemplo n.º 7
0
    def _run_prodigal(self, genome_id, fasta_path):
        """Run Prodigal on a single FASTA file and rename its output files.

        Output is written to ``<self.marker_gene_dir>/<genome_id>``. When
        ``self.proteins`` is set, only the protein file is renamed and the
        remaining entries of the returned tuple are ``None``.

        Parameters
        ----------
        genome_id : str
            Genome ID used to name the output directory and output files.
        fasta_path : str
            Path to FASTA file to process.

        Returns
        -------
        tuple
            ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file)``.
        """

        output_dir = os.path.join(self.marker_gene_dir, genome_id)

        prodigal = BioLibProdigal(1, False)
        summary_stats = prodigal.run([fasta_path], output_dir, called_genes=self.proteins)
        # a single genome was submitted, so take its (only) entry;
        # keys() is wrapped in list() because dict views are not
        # subscriptable on Python 3
        summary_stats = summary_stats[list(summary_stats.keys())[0]]

        # rename output files to adhere to GTDB conventions and desired genome ID
        aa_gene_file = os.path.join(output_dir, genome_id + self.protein_file_suffix)
        shutil.move(summary_stats.aa_gene_file, aa_gene_file)

        nt_gene_file = None
        gff_file = None
        translation_table_file = None
        if not self.proteins:
            nt_gene_file = os.path.join(output_dir, genome_id + self.nt_gene_file_suffix)
            shutil.move(summary_stats.nt_gene_file, nt_gene_file)

            gff_file = os.path.join(output_dir, genome_id + self.gff_file_suffix)
            shutil.move(summary_stats.gff_file, gff_file)

            # save translation table information
            translation_table_file = os.path.join(output_dir, 'prodigal_translation_table.tsv')
            with open(translation_table_file, 'w') as fout:
                fout.write('%s\t%d\n' % ('best_translation_table', summary_stats.best_translation_table))
                fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats.coding_density_4 * 100))
                fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats.coding_density_11 * 100))

        return (aa_gene_file, nt_gene_file, gff_file, translation_table_file)
Ejemplo n.º 8
0
    def _run_prodigal(self, genome_paths):
        """Run Prodigal on genomes.

        For each genome directory, the ``<assembly_id>_genomic.fna`` file is
        queued for gene calling and the translation table is parsed from the
        ``<assembly_id>_genomic.gff`` file. Prodigal output is written to
        ``self.tmp_dir`` and then moved into a ``prodigal`` sub-directory
        next to each genomic file, together with a translation table file
        and a checksum of the protein file.

        Genomes with an empty genomic or GFF file are skipped with a
        warning. A missing GFF file, or a GFF file from which no translation
        table can be parsed, terminates the program.

        Parameters
        ----------
        genome_paths : dict
            Maps a genome identifier to that genome's directory; only the
            directories are used here.
        """
        # function-scope import: files are moved with shutil rather than by
        # building `mv` shell commands
        import shutil

        # get genome path and translation table for each file
        self.logger.info('Determining genomic file and translation table for each of the %d genomes.' % len(genome_paths))
        genome_files = []
        translation_table = {}
        for gpath in genome_paths.values():
            assembly_id = os.path.basename(os.path.normpath(gpath))
            canonical_gid = assembly_id[0:assembly_id.find('_', 4)]

            genome_file = os.path.join(gpath, assembly_id + '_genomic.fna')
            has_genome_file = os.path.exists(genome_file)
            if has_genome_file:
                if os.stat(genome_file).st_size == 0:
                    self.logger.warning('Genomic file appears to be empty: %s' % genome_file)
                    continue
            else:
                self.logger.warning('Genomic file appears to be missing: %s' % genome_file)

            gff_file = os.path.join(gpath, assembly_id + '_genomic.gff')
            if not os.path.exists(gff_file):
                self.logger.warning('GFF appears to be missing: %s' % gff_file)
                sys.exit(-1)

            if os.stat(gff_file).st_size == 0:
                self.logger.warning('GFF appears to be empty: %s' % gff_file)
                continue

            tt = self._parse_translation_table(gff_file)
            if not tt:
                self.logger.warning('Unable to determine translation table for: %s' % gff_file)
                sys.exit(-1)
            translation_table[canonical_gid] = tt

            # queue the genome only once its translation table is known;
            # queuing it before the GFF checks left genomes with an empty GFF
            # in the list without a table entry, causing a KeyError below
            if has_genome_file:
                genome_files.append(genome_file)

        # run Prodigal on each genome
        self.logger.info('Running Prodigal on %d genomes.' % len(genome_files))
        prodigal = Prodigal(cpus=self.cpus)
        summary_stats = prodigal.run(genome_files,
                                     translation_table=translation_table,
                                     output_dir=self.tmp_dir)

        # move results into individual genome directories
        self.logger.info('Moving files and calculating checksums.')
        for genome_file in genome_files:
            genome_path, genome_id = ntpath.split(genome_file)
            genome_id = remove_extension(genome_id)
            canonical_gid = genome_id[0:genome_id.find('_', 4)]

            aa_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.faa')
            nt_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.fna')
            gff_file = os.path.join(self.tmp_dir, genome_id + '.gff')

            prodigal_path = os.path.join(genome_path, 'prodigal')
            if not os.path.exists(prodigal_path):
                os.makedirs(prodigal_path)
            new_aa_gene_file = os.path.join(prodigal_path, canonical_gid + '_protein.faa')
            new_nt_gene_file = os.path.join(prodigal_path, canonical_gid + '_protein.fna')
            new_gff_file = os.path.join(prodigal_path, canonical_gid + '_protein.gff')

            # shutil.move copes with spaces/shell metacharacters in paths and
            # raises on failure instead of silently continuing
            shutil.move(aa_gene_file, new_aa_gene_file)
            shutil.move(nt_gene_file, new_nt_gene_file)
            shutil.move(gff_file, new_gff_file)

            # save translation table information
            stats = summary_stats[genome_id]
            translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
            with open(translation_table_file, 'w') as fout:
                if translation_table[canonical_gid]:
                    fout.write('%s\t%d\t%s\n' % ('best_translation_table',
                                                 stats.best_translation_table,
                                                 'used table specified by NCBI'))
                else:
                    fout.write('%s\t%d\n' % ('best_translation_table', stats.best_translation_table))
                    fout.write('%s\t%.2f\n' % ('coding_density_4', stats.coding_density_4 * 100))
                    fout.write('%s\t%.2f\n' % ('coding_density_11', stats.coding_density_11 * 100))

            # store the checksum of the protein file next to it
            checksum = sha256(new_aa_gene_file)
            with open(new_aa_gene_file + '.sha256', 'w') as fout:
                fout.write(checksum)
Ejemplo n.º 9
0
    def run(self, input_dir, tmp_dir, threads):
        """Call genes with Prodigal for every unprocessed genome.

        Genomes are expected at
        ``<input_dir>/<genome_dir>/<assembly_id>/<assembly_id>_genomic.fna``.
        A genome is skipped when its ``prodigal/<genome_id>_protein.faa``
        file already exists and matches the checksum stored in the
        accompanying ``.sha256`` file. Prodigal output is written to
        ``tmp_dir`` and then moved into a ``prodigal`` sub-directory next to
        each genome file, together with a translation table file and a
        checksum of the protein file.

        Parameters
        ----------
        input_dir : str
            Root directory to scan for genomes.
        tmp_dir : str
            Directory Prodigal writes to before results are moved.
        threads : int
            Passed to Prodigal as its ``cpus`` argument.
        """
        # function-scope import: files are moved with shutil rather than by
        # building `mv` shell commands
        import shutil

        # get path to all unprocessed genome files
        # (single-argument print(...) behaves the same on Python 2 and 3)
        print('Reading genomes.')
        genome_files = []
        for genome_dir in os.listdir(input_dir):
            cur_genome_dir = os.path.join(input_dir, genome_dir)
            if not os.path.isdir(cur_genome_dir):
                continue

            for assembly_id in os.listdir(cur_genome_dir):
                assembly_dir = os.path.join(cur_genome_dir, assembly_id)
                genome_id = assembly_id[0:assembly_id.find('_', 4)]

                # check if prodigal has already been called
                aa_gene_file = os.path.join(assembly_dir, 'prodigal',
                                            genome_id + '_protein.faa')
                if os.path.exists(aa_gene_file):
                    # verify checksum
                    checksum_file = aa_gene_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(aa_gene_file)
                        with open(checksum_file) as fin:
                            cur_checksum = fin.readline().strip()
                        if checksum == cur_checksum:
                            continue

                genome_file = os.path.join(assembly_dir,
                                           assembly_id + '_genomic.fna')
                if os.path.exists(genome_file):
                    if os.stat(genome_file).st_size == 0:
                        print('[Warning] Genome file appears to be empty: %s' % genome_file)
                    else:
                        genome_files.append(genome_file)

        print('  Number of unprocessed genomes: %d' % len(genome_files))

        # run prodigal on each genome
        print('Running prodigal.')
        prodigal = Prodigal(cpus=threads)
        summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

        # move results into individual genome directories
        print('Moving files and calculating checksums.')
        for genome_file in genome_files:
            genome_path, genome_id = ntpath.split(genome_file)
            genome_id = remove_extension(genome_id)

            aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa')
            nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna')
            gff_file = os.path.join(tmp_dir, genome_id + '.gff')

            genome_root = genome_id[0:genome_id.find('_', 4)]
            prodigal_path = os.path.join(genome_path, 'prodigal')
            if not os.path.exists(prodigal_path):
                os.makedirs(prodigal_path)
            new_aa_gene_file = os.path.join(prodigal_path,
                                            genome_root + '_protein.faa')
            new_nt_gene_file = os.path.join(prodigal_path,
                                            genome_root + '_protein.fna')
            new_gff_file = os.path.join(prodigal_path,
                                        genome_root + '_protein.gff')

            # shutil.move copes with spaces/shell metacharacters in paths and
            # raises on failure instead of silently continuing
            shutil.move(aa_gene_file, new_aa_gene_file)
            shutil.move(nt_gene_file, new_nt_gene_file)
            shutil.move(gff_file, new_gff_file)

            # save translation table information
            stats = summary_stats[genome_id]
            translation_table_file = os.path.join(
                prodigal_path, 'prodigal_translation_table.tsv')
            with open(translation_table_file, 'w') as fout:
                fout.write('%s\t%d\n' %
                           ('best_translation_table',
                            stats.best_translation_table))
                fout.write('%s\t%.2f\n' %
                           ('coding_density_4', stats.coding_density_4 * 100))
                fout.write('%s\t%.2f\n' %
                           ('coding_density_11', stats.coding_density_11 * 100))

            # store the checksum of the protein file next to it
            checksum = sha256(new_aa_gene_file)
            with open(new_aa_gene_file + '.sha256', 'w') as fout:
                fout.write(checksum)
Ejemplo n.º 10
0
def _imap_with_progress(func, items, processes):
    """Apply ``func`` to every item in a ``MyPool`` and show a progress bar.

    The pool is always shut down before returning: closed after a clean run,
    terminated if an error occurs while collecting results.
    NOTE(review): assumes MyPool follows the multiprocessing.Pool API
    (close/terminate/join) — confirm against its definition.
    """
    pool = MyPool(processes)
    try:
        results = list(
            tqdm.tqdm(
                pool.imap_unordered(func, items),
                total=len(items),
            ))
        pool.close()
    except BaseException:
        pool.terminate()
        raise
    finally:
        pool.join()
    return results


def main(args):
    """Predict genes for all genomes, then analyze and combine protein damage.

    Steps: read the JSON file named by ``args["<files>"]``, run Prodigal on
    its ``"genomes"`` entry, run ``pa.analyze_proteins`` for every
    (community, genome) pair and finally run ``combine_files`` once per
    community. With ``--debug`` set, the work is done serially in this
    process instead of in worker pools.

    Parameters
    ----------
    args : dict
        Parsed command-line arguments; reads the keys ``--debug``,
        ``--procs``, ``--cpus``, ``--min-len``, ``<files>``, ``--tmp`` and
        ``--out-dir``.
    """
    global debug
    debug = True if args["--debug"] else None

    sys.excepthook = exceptionHandler

    nproc = int(args["--procs"])
    ncpu = int(args["--cpus"])
    min_len = int(args["--min-len"])

    with open(args["<files>"], "r") as json_file:
        files = json.load(json_file)

    genome_files = files["genomes"]
    # communities are the keys of the "reads" section
    comms = list(files["reads"].keys())

    # exist_ok=True makes a separate isdir() check unnecessary
    tmp_dir = args["--tmp"]
    os.makedirs(tmp_dir, exist_ok=True)

    tmp_damage = os.path.join(tmp_dir, "damage-aa")
    os.makedirs(tmp_damage, exist_ok=True)

    out_dir = args["--out-dir"]
    os.makedirs(out_dir, exist_ok=True)

    # never request more processes than there are genomes
    p_procs = min(nproc * ncpu, len(genome_files))

    logging.info(
        "Predicting genes from genomes usin {} processes...".format(p_procs))
    output_dir = os.path.join(tmp_dir, "gene_prediction")
    os.makedirs(output_dir, exist_ok=True)

    prodigal = Prodigal(cpus=p_procs, verbose=True)
    gene_preds = prodigal.run(
        genome_files=genome_files,
        output_dir=output_dir,
        called_genes=False,
        translation_table=None,
        meta=False,
        closed_ends=False,
    )

    # keep only the fields the protein analysis needs
    gene_pred = {
        genome_id: {
            "faa": pred.aa_gene_file,
            "fna": pred.nt_gene_file,
            "translation_table": pred.best_translation_table,
        }
        for genome_id, pred in gene_preds.items()
    }
    genome_ids = list(gene_pred.keys())

    func = partial(
        pa.analyze_proteins,
        files=files,
        gene_predictions=gene_pred,
        min_len=min_len,
        outdir=tmp_damage,
        debug=debug,
        nproc=nproc,
    )

    logging.info("Finding damage in codons...")
    comm_files = list(product(comms, genome_ids))

    if debug is True:
        # list() forces evaluation of the lazy map
        list(map(func, comm_files))
    else:
        _imap_with_progress(func, comm_files, ncpu)
    logging.info("Combining files...")

    # never request more processes than there are communities
    p_procs = min(nproc * ncpu, len(comms))

    func = partial(combine_files, tmp_damage=tmp_damage, out_dir=out_dir)
    if debug is True:
        list(map(func, comms))
    else:
        _imap_with_progress(func, comms, p_procs)

    logging.info("Protein analysis done.")
Ejemplo n.º 11
0
    def run(self, input_dir, tmp_dir, threads):
        """Call genes with Prodigal for every genome under ``input_dir``.

        Genomes are expected at
        ``<input_dir>/<genome_dir>/<assembly_id>/<assembly_id>_genomic.fna``.
        Skipping genomes whose existing gene calls pass a checksum test is
        implemented but currently disabled, so every non-empty genome file is
        processed. Prodigal output is written to ``tmp_dir`` and then moved
        into a ``prodigal`` sub-directory next to each genome file, together
        with a translation table file and a checksum of the protein file.

        Parameters
        ----------
        input_dir : str
            Root directory to scan for genomes.
        tmp_dir : str
            Directory Prodigal writes to before results are moved.
        threads : int
            Passed to Prodigal as its ``cpus`` argument.
        """
        # function-scope import: files are moved with shutil rather than by
        # building `mv` shell commands
        import shutil

        # Skipping genomes whose existing calls pass the checksum test is
        # deliberately disabled: for safety, genes are re-called for all
        # genomes. Set to True to re-enable the skip.
        reuse_verified_calls = False

        # get path to all unprocessed genome files
        # (single-argument print(...) behaves the same on Python 2 and 3)
        print('Reading genomes.')
        genome_files = []
        for genome_dir in os.listdir(input_dir):
            cur_genome_dir = os.path.join(input_dir, genome_dir)
            if not os.path.isdir(cur_genome_dir):
                continue

            for assembly_id in os.listdir(cur_genome_dir):
                assembly_dir = os.path.join(cur_genome_dir, assembly_id)
                genome_id = assembly_id[0:assembly_id.find('_', 4)]

                # check if prodigal has already been called
                if reuse_verified_calls:
                    aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa')
                    if os.path.exists(aa_gene_file):
                        # verify checksum
                        checksum_file = aa_gene_file + '.sha256'
                        if os.path.exists(checksum_file):
                            checksum = sha256(aa_gene_file)
                            with open(checksum_file) as fin:
                                cur_checksum = fin.readline().strip()
                            if checksum == cur_checksum:
                                continue

                genome_file = os.path.join(assembly_dir, assembly_id + '_genomic.fna')
                if os.path.exists(genome_file):
                    if os.stat(genome_file).st_size == 0:
                        print('[Warning] Genome file appears to be empty: %s' % genome_file)
                    else:
                        genome_files.append(genome_file)

        print('  Number of unprocessed genomes: %d' % len(genome_files))

        # run prodigal on each genome
        print('Running prodigal.')
        prodigal = Prodigal(cpus=threads)
        summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

        # move results into individual genome directories
        print('Moving files and calculating checksums.')
        for genome_file in genome_files:
            genome_path, genome_id = ntpath.split(genome_file)
            genome_id = remove_extension(genome_id)

            aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa')
            nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna')
            gff_file = os.path.join(tmp_dir, genome_id + '.gff')

            genome_root = genome_id[0:genome_id.find('_', 4)]
            prodigal_path = os.path.join(genome_path, 'prodigal')
            if not os.path.exists(prodigal_path):
                os.makedirs(prodigal_path)
            new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
            new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
            new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

            # shutil.move copes with spaces/shell metacharacters in paths and
            # raises on failure instead of silently continuing
            shutil.move(aa_gene_file, new_aa_gene_file)
            shutil.move(nt_gene_file, new_nt_gene_file)
            shutil.move(gff_file, new_gff_file)

            # save translation table information
            stats = summary_stats[genome_id]
            translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
            with open(translation_table_file, 'w') as fout:
                fout.write('%s\t%d\n' % ('best_translation_table', stats.best_translation_table))
                fout.write('%s\t%.2f\n' % ('coding_density_4', stats.coding_density_4 * 100))
                fout.write('%s\t%.2f\n' % ('coding_density_11', stats.coding_density_11 * 100))

            # store the checksum of the protein file next to it
            checksum = sha256(new_aa_gene_file)
            with open(new_aa_gene_file + '.sha256', 'w') as fout:
                fout.write(checksum)