def __init__(self, output_dir):
        """Prepare the output directory, logging, and BLAST result record.

        Parameters
        ----------
        output_dir : str
            Output directory; created if it does not exist and passed to
            logger_setup together with the log file name.
        """

        check_dependencies(['blastn', 'makeblastdb'])

        # create the output directory on first use
        output_dir_missing = not os.path.exists(output_dir)
        if output_dir_missing:
            os.makedirs(output_dir)

        self.output_dir = output_dir

        logger_setup(output_dir,
                     "in_silico_probes.log",
                     "in_silico_probes",
                     __version__,
                     False)
        self.logger = logging.getLogger('timestamp')

        # BLAST output format specifier; its columns are listed in the same
        # order as the BlastHit fields defined below
        self.output_fmt = '6 qseqid qlen qseq sseqid slen sseq length mismatch gaps pident bitscore evalue'

        blast_hit_fields = [
            'query_id',
            'query_len',
            'query_aln_seq',
            'subject_id',
            'subject_len',
            'subject_aln_seq',
            'aln_len',
            'mismatch',
            'gaps',
            'perc_identity',
            'bitscore',
            'evalue',
        ]
        self.BlastHit = namedtuple('BlastHit', blast_hit_fields)
Example #2
0
    def lsu_tree(self, options):
        """Infer 23S tree spanning GTDB genomes.

        Checks the required programs and input files, makes sure the output
        directory exists, then runs the 'lsu' RNA workflow.
        """

        required_programs = ['esl-sfetch', 'cmsearch', 'cmalign',
                             'esl-alimask', 'FastTreeMP', 'blastn']
        check_dependencies(required_programs)

        check_file_exists(options.gtdb_metadata_file)
        check_file_exists(options.gtdb_lsu_file)
        make_sure_path_exists(options.output_dir)

        workflow = RNA_Workflow(options.cpus)

        # positional arguments for RNA_Workflow.run(), in call order;
        # NOTE: options.reps_only and options.user_genomes are commented out
        # of the original argument list (they sat just before genome_list)
        run_args = (
            'lsu',
            options.gtdb_metadata_file,
            options.gtdb_lsu_file,
            options.min_lsu_length,
            options.min_scaffold_length,
            options.min_quality,
            options.max_contigs,
            options.min_N50,
            not options.disable_tax_filter,
            options.genome_list,
            options.output_dir,
        )
        workflow.run(*run_args)

        self.logger.info('Results written to: %s' % options.output_dir)
    def __init__(self, af_sp, max_genomes, ani_cache_file, cpus, output_dir):
        """Store run settings and build the FastANI helper and stats records.

        Parameters
        ----------
        af_sp : stored as self.af_sp
        max_genomes : stored as self.max_genomes_for_stats
        ani_cache_file : passed to FastANI
        cpus : stored as self.cpus and passed to FastANI
        output_dir : stored as self.output_dir
        """

        check_dependencies(['fastANI', 'mash'])

        self.logger = logging.getLogger('timestamp')

        self.output_dir = output_dir
        self.cpus = cpus
        self.af_sp = af_sp

        # maximum number of randomly selected genomes to
        # consider when calculating pairwise statistics
        self.max_genomes_for_stats = max_genomes

        self.fastani = FastANI(ani_cache_file, cpus)

        # record types for the computed ANI statistics
        self.RepStats = namedtuple('RepStats', ['min_ani',
                                                'mean_ani',
                                                'std_ani',
                                                'median_ani'])
        pairwise_fields = ['min_ani',
                           'mean_ani',
                           'std_ani',
                           'median_ani',
                           'ani_to_medoid',
                           'mean_ani_to_medoid',
                           'mean_ani_to_rep',
                           'ani_below_95']
        self.PairwiseStats = namedtuple('PairwiseStats', pairwise_fields)
Example #4
0
    def infer(self, options):
        """Infer tree from MSA using the FastTree wrapper.

        Reads msa_file, out_dir, cpus, prot_model, prefix and suffix from
        options; the multithreaded FastTree is used when cpus > 1.
        """

        self.logger.warning("Tree inference is still under development!")

        check_file_exists(options.msa_file)
        make_sure_path_exists(options.out_dir)

        # the required executable depends on whether several CPUs are requested
        required_program = 'FastTreeMP' if options.cpus > 1 else 'FastTree'
        check_dependencies([required_program])

        model = options.prot_model
        self.logger.info('Inferring tree with FastTree using %s+GAMMA.' % model)
        tree_builder = FastTree(multithreaded=(options.cpus > 1))

        out_dir = options.out_dir
        unrooted_tree_file = os.path.join(
            out_dir, options.prefix + options.suffix + '.unrooted.tree')
        tree_log_file = os.path.join(out_dir, options.prefix + '.tree.log')
        fasttree_log_file = os.path.join(out_dir, 'fasttree.log')

        tree_builder.run(options.msa_file,
                         'prot',
                         options.prot_model,
                         unrooted_tree_file,
                         tree_log_file,
                         fasttree_log_file)

        self.logger.info('Done.')
Example #5
0
    def lsu_tree(self, options):
        """Infer 23S tree spanning GTDB genomes.

        The input files named in options must exist; results are produced by
        RNA_Workflow.run() for the 'lsu' gene and the output directory is
        logged at the end.
        """

        check_dependencies(['esl-sfetch', 'cmsearch', 'cmalign',
                            'esl-alimask', 'FastTreeMP', 'blastn'])

        # validate inputs before starting the workflow
        check_file_exists(options.gtdb_metadata_file)
        check_file_exists(options.gtdb_lsu_file)
        make_sure_path_exists(options.output_dir)

        lsu_workflow = RNA_Workflow(options.cpus)

        # the workflow takes a positive "apply filter" flag, while the
        # option is expressed as a "disable" switch
        apply_tax_filter = not options.disable_tax_filter

        # NOTE: options.reps_only and options.user_genomes are commented out
        # of the original call (just before genome_list)
        lsu_workflow.run('lsu',
                         options.gtdb_metadata_file,
                         options.gtdb_lsu_file,
                         options.min_lsu_length,
                         options.min_scaffold_length,
                         options.min_quality,
                         options.max_contigs,
                         options.min_N50,
                         apply_tax_filter,
                         options.genome_list,
                         options.output_dir)

        self.logger.info('Results written to: %s' % options.output_dir)
Example #6
0
    def __init__(self, cpus):
        """Check for the mash executable and store the CPU count.

        Parameters
        ----------
        cpus : stored as self.cpus
        """

        check_dependencies(['mash'])

        self.logger = logging.getLogger('timestamp')
        self.cpus = cpus
    def __init__(self, cpus, output_dir):
        """Check for the gtdbtk executable and store the run settings.

        Parameters
        ----------
        cpus : stored as self.cpus
        output_dir : stored as self.output_dir
        """

        check_dependencies(['gtdbtk'])

        self.logger = logging.getLogger('timestamp')

        self.output_dir = output_dir
        self.cpus = cpus
Example #8
0
    def infer(self, options):
        """Infer an unrooted tree from an MSA by running FastTree.

        Output files are written to options.out_dir and are named
        <prefix>[<suffix>].unrooted.tree, .tree.log and .fasttree.log; the
        suffix is only used when options has a 'suffix' attribute.

        Parameters
        ----------
        options : object
            Parsed options. Reads msa_file, out_dir, cpus, prot_model,
            prefix, no_support, no_gamma and, when present, suffix.

        Raises
        ------
        ValueError
            If options.prot_model is not one of JTT, WAG or LG.
        """

        # FastTree command-line flag for each supported protein model
        # (JTT adds no flag); an unknown model previously surfaced as an
        # unbound-variable error when the command was assembled
        model_flags = {'JTT': '', 'WAG': ' -wag', 'LG': ' -lg'}
        if options.prot_model not in model_flags:
            raise ValueError(
                'Unsupported protein model: %s (expected one of: %s)' %
                (options.prot_model, ', '.join(sorted(model_flags))))
        model_str = model_flags[options.prot_model]

        check_file_exists(options.msa_file)
        make_sure_path_exists(options.out_dir)

        # the multithreaded binary is used when more than one CPU is requested
        program = 'FastTreeMP' if options.cpus > 1 else 'FastTree'
        check_dependencies([program])

        gamma_str = '' if options.no_gamma else ' -gamma'
        support_str = ' -nosupport' if options.no_support else ''

        # only report +GAMMA when the -gamma flag is actually passed
        model_desc = options.prot_model + ('+GAMMA' if gamma_str else '')
        self.logger.info('Inferring tree with FastTree using %s.' % model_desc)

        # options without a 'suffix' attribute get un-suffixed file names
        base_name = options.prefix + getattr(options, 'suffix', '')
        output_tree = os.path.join(options.out_dir,
                                   base_name + '.unrooted.tree')
        tree_log = os.path.join(options.out_dir, base_name + '.tree.log')
        fasttree_log = os.path.join(options.out_dir,
                                    base_name + '.fasttree.log')

        # NOTE(review): paths are interpolated into a shell command unquoted,
        # so paths containing spaces or shell metacharacters will break it
        cmd = '%s -quiet%s%s%s -log %s %s > %s 2> %s' % (
            program, support_str, model_str, gamma_str, tree_log,
            options.msa_file, output_tree, fasttree_log)
        self.logger.info('Running: %s' % cmd)

        # report a failed run instead of announcing success unconditionally
        status = os.system(cmd)
        if status != 0:
            self.logger.error(
                'FastTree returned non-zero status %d; see %s.' %
                (status, fasttree_log))
            return

        self.logger.info('Done.')
    def __init__(self, cpus):
        """Check for the mash executable, store settings, and log its version.

        Parameters
        ----------
        cpus : stored as self.cpus
        """

        check_dependencies(['mash'])

        self.logger = logging.getLogger('timestamp')
        self.cpus = cpus

        mash_version = self._get_version()
        self.logger.info('Using Mash v{}.'.format(mash_version))
Example #10
0
    def __init__(self, gbk_arc_assembly_file, gbk_bac_assembly_file,
                 rfq_arc_assembly_file, rfq_bac_assembly_file):
        """Check for tRNAscan-SE and parse the four assembly summary files.

        The result of parseAssemblySummary() is kept as self.domain_dict.
        """
        check_dependencies(['tRNAscan-SE'])

        # file name ending used for genome files
        self.genome_file_ext = '_genomic.fna'

        assembly_files = (gbk_arc_assembly_file,
                          gbk_bac_assembly_file,
                          rfq_arc_assembly_file,
                          rfq_bac_assembly_file)
        self.domain_dict = self.parseAssemblySummary(*assembly_files)
    def __init__(self, ani_cache_file, cpus, output_dir):
        """Check for fastANI, store settings, and create the FastANI helper.

        Parameters
        ----------
        ani_cache_file : passed to FastANI
        cpus : stored as self.cpus and passed to FastANI
        output_dir : stored as self.output_dir
        """

        check_dependencies(['fastANI'])

        self.logger = logging.getLogger('timestamp')

        self.output_dir = output_dir
        self.cpus = cpus

        self.fastani = FastANI(ani_cache_file, cpus)
Example #12
0
    def __init__(self, ani_cache_file, cpus):
        """Check for fastANI, store settings, and read the ANI cache.

        Parameters
        ----------
        ani_cache_file : stored as self.ani_cache_file
        cpus : stored as self.cpus
        """

        check_dependencies(['fastANI'])

        self.logger = logging.getLogger('timestamp')
        self.cpus = cpus

        # the cache file must be recorded before the cache is read
        self.ani_cache_file = ani_cache_file
        self._read_cache()
Example #13
0
    def parse_options(self, options):
        """Dispatch the selected subcommand to the method implementing it.

        Returns 0 after the command has run; an unrecognized command is
        logged as an error and the process exits with status -1.
        """

        logging.basicConfig(format='', level=logging.INFO)

        check_dependencies(('FastTree', 'hmmsearch'))

        # subcommand name -> name of the method that implements it
        command_methods = {
            'ssu_tree': 'ssu_tree',
            'lsu_tree': 'lsu_tree',
            'rna_tree': 'rna_tree',
            'rna_dump': 'rna_dump',
            'derep_tree': 'derep_tree',
            'bootstrap': 'bootstrap',
            'jk_markers': 'jk_markers',
            'jk_taxa': 'jk_taxa',
            'combine': 'combine',
            'midpoint': 'midpoint',
            'outgroup': 'outgroup',
            'propagate': 'propagate',
            'fill_ranks': 'fill_ranks',
            'strip': 'strip',
            'rm_support': 'rm_support',
            'pull': 'pull',
            'append': 'append',
            'prune': 'prune',
            'pd': 'phylogenetic_diversity',
            'pd_clade': 'phylogenetic_diversity_clade',
            'arb_records': 'arb_records',
        }

        method_name = command_methods.get(options.subparser_name)
        if method_name is not None:
            # the method is looked up only for the selected command
            getattr(self, method_name)(options)
        else:
            self.logger.error('Unknown GenomeTreeTk command: ' + options.subparser_name + '\n')
            sys.exit(-1)

        return 0
Example #14
0
    def __init__(self):
        """Set up the logger, check for hmmpress, and record HMM file paths."""

        self.logger = logging.getLogger('timestamp')

        check_dependencies(['hmmpress'])

        # HMM file locations are hard-coded for the ACE servers;
        # edit these when running on a different system
        self.PFAM_HMMS = '/srv/db/pfam/33.1/Pfam-A.hmm'
        self.TIGRFAM_HMMS = '/srv/db/tigrfam/15.0/TIGRFAMs_15.0_HMM/tigrfam.hmm'
Example #15
0
    def __init__(self, cpus=1):
        """Check external programs and store the taxonomy settings.

        Parameters
        ----------
        cpus : int
            Stored as self.cpus (default 1).
        """

        check_dependencies(['pplacer', 'guppy', 'fastANI'])

        self.logger = logging.getLogger('timestamp')
        self.cpus = cpus

        self.taxonomy_file = Config.TAXONOMY_FILE

        # rank prefixes in the order they are listed here
        self.order_rank = ["d__", "p__", "c__", "o__", 'f__', 'g__']
Example #16
0
    def __init__(self, dpi=96):
        """Initialize the plot with fixed dimensions and font sizes.

        Parameters
        ----------
        dpi : int
            Stored as self.dpi after the base class has been initialized.
        """
        self.logger = logging.getLogger()

        # settings handed to AbstractPlot; the dpi field here is the
        # constant 96, not the dpi argument
        Options = namedtuple('Options', ['width',
                                         'height',
                                         'tick_font_size',
                                         'label_font_size',
                                         'dpi'])
        plot_options = Options(width=6,
                               height=6,
                               tick_font_size=12,
                               label_font_size=12,
                               dpi=96)
        AbstractPlot.__init__(self, plot_options)

        self.dpi = dpi

        check_dependencies(['genometreetk'])
Example #17
0
    def __init__(self):
        """Check external programs and copy the rank tables from Taxonomy."""

        check_dependencies(['comparem', 'diamond', 'makeblastdb'])

        self.time_keeper = TimeKeeper()

        # label stored for later use under this attribute name
        self.underclassified = 'underclassified'

        # rank metadata exposed by the Taxonomy class
        self.rank_labels = Taxonomy.rank_labels
        self.rank_index = Taxonomy.rank_index
        self.rank_prefixes = Taxonomy.rank_prefixes
Example #18
0
    def __init__(self, dpi=96):
        """Set up a 6 x 6 plot with 12-point tick and label fonts.

        Parameters
        ----------
        dpi : int
            Kept as self.dpi; assigned after AbstractPlot.__init__ runs.
        """
        self.logger = logging.getLogger()

        option_fields = ('width', 'height', 'tick_font_size',
                         'label_font_size', 'dpi')
        Options = namedtuple('Options', option_fields)

        # the base class always receives dpi=96, independent of the argument
        base_options = Options(6, 6, 12, 12, 96)
        AbstractPlot.__init__(self, base_options)

        self.dpi = dpi

        check_dependencies(['genometreetk'])
Example #19
0
    def identify(self, genomes, out_dir, prefix):
        """Identify marker genes in genomes.

        Runs Prodigal on the genomes, searches the resulting protein files
        with TigrfamSearch and PfamSearch, and then reports the identified
        marker genes. An IOError is logged and suppressed; any other
        exception is logged and re-raised.

        Parameters
        ----------
        genomes : genomes to process; passed to Prodigal.run()
        out_dir : directory under which the marker gene directory is placed
        prefix : passed to _report_identified_marker_genes()
        """

        check_dependencies(['prodigal', 'hmmsearch'])

        try:
            self.logger.info(
                'Identifying markers in %d genomes with %d threads.' %
                (len(genomes), self.cpus))

            # gene calling
            self.logger.info("Running Prodigal to identify genes.")
            self.marker_gene_dir = os.path.join(out_dir,
                                                Config.MARKER_GENE_DIR)
            gene_caller = Prodigal(self.cpus,
                                   False,
                                   self.marker_gene_dir,
                                   self.protein_file_suffix,
                                   self.nt_gene_file_suffix,
                                   self.gff_file_suffix)
            called_genomes = gene_caller.run(genomes)

            # annotate the called genes against TIGRFAM and Pfam
            self.logger.info("Identifying TIGRFAM protein families.")
            aa_gene_files = []
            for genome_id in called_genomes.keys():
                aa_gene_files.append(called_genomes[genome_id]['aa_gene_path'])

            tigrfam = TigrfamSearch(self.cpus,
                                    self.tigrfam_hmms,
                                    self.protein_file_suffix,
                                    self.tigrfam_suffix,
                                    self.tigrfam_top_hit_suffix,
                                    self.checksum_suffix,
                                    self.marker_gene_dir)
            tigrfam.run(aa_gene_files)

            self.logger.info("Identifying Pfam protein families.")
            pfam = PfamSearch(self.cpus,
                              self.pfam_hmm_dir,
                              self.protein_file_suffix,
                              self.pfam_suffix,
                              self.pfam_top_hit_suffix,
                              self.checksum_suffix,
                              self.marker_gene_dir)
            pfam.run(aa_gene_files)

            self._report_identified_marker_genes(called_genomes,
                                                 out_dir,
                                                 prefix)

        except IOError as io_err:
            # I/O problems are reported but do not propagate
            self.logger.error(str(io_err))
            self.logger.error("GTDB-Tk has encountered an error.")

        except Exception as err:
            self.logger.error(str(err))
            raise
Example #20
0
    def __init__(self, cpus, silent=False):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        silent : bool
            Stored as self.silent (default False).
        """

        self.logger = logging.getLogger('timestamp')

        check_dependencies(['blastn', 'blastp', 'makeblastdb'])

        self.silent = silent
        self.cpus = cpus

        # BLAST output format specifiers, keyed by table type
        self.output_fmt = {
            'standard': '6',
            'custom': '6 qseqid qlen sseqid stitle slen length pident evalue bitscore',
        }

        # task names grouped by program
        self.blastn_tasks = {'blastn', 'blastn-short', 'dc-megablast', 'megablast', 'rmblastn'}
        self.blastp_tasks = {'blastp', 'blastp-fast', 'blastp-short'}

        # record for a hit in the 'standard' table
        self.BlastHit = namedtuple('BlastHit', [
            'query_id',
            'subject_id',
            'perc_identity',
            'aln_length',
            'mismatch_count',
            'gap_open_count',
            'query_start',
            'query_end',
            'subject_start',
            'subject_end',
            'evalue',
            'bitscore',
        ])

        # record for a hit in the 'custom' table (same column order)
        self.BlastHitCustom = namedtuple('BlastHitCustom', [
            'query_id',
            'query_len',
            'subject_id',
            'subject_annotation',
            'subject_len',
            'alignment_len',
            'perc_identity',
            'evalue',
            'bitscore',
        ])

        self.BlastHitHomologs = namedtuple('BlastHitHomologs', [
            'query_id',
            'subject_id',
            'subject_annotation',
            'perc_identity',
            'query_perc_aln_len',
            'subject_perc_aln_len',
            'evalue',
            'bitscore',
        ])
Example #21
0
    def __init__(self, ani_cache_file, cpus):
        """Check for fastANI, read the ANI cache, and log the FastANI version.

        Parameters
        ----------
        ani_cache_file : stored as self.ani_cache_file
        cpus : stored as self.cpus
        """

        check_dependencies(['fastANI'])

        self.logger = logging.getLogger('timestamp')
        self.cpus = cpus

        # the cache file must be recorded before the cache is read
        self.ani_cache_file = ani_cache_file
        self._read_cache()

        fastani_version = self._get_version()
        self.logger.info('Using FastANI v{}.'.format(fastani_version))
    def __init__(self, cpus, output_dir):
        """Check for gtdbtk, store settings, and enforce the CPU limit.

        Parameters
        ----------
        cpus : int
            Stored as self.cpus; values above 64 are rejected.
        output_dir : stored as self.output_dir

        Exits the process with status -1 (after logging an error) when more
        than 64 CPUs are requested.
        """

        check_dependencies(['gtdbtk'])

        self.output_dir = output_dir
        self.logger = logging.getLogger('timestamp')

        self.cpus = cpus
        if self.cpus > 64:
            # message wording fixed: pplacer "stalls", it does not "stale"
            self.logger.error(
                'Testing indicates pplacer will stall if used with more than 64 CPUs.')
            sys.exit(-1)
Example #23
0
    def __init__(self, cpus):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use during homology search.
        """

        required_programs = ['mafft', 'muscle', 'seqmagick', 'trimal']
        check_dependencies(required_programs)

        self.cpus = cpus
        self.logger = logging.getLogger('timestamp')
    def __init__(self, tmp_dir='/tmp/', cpus=1):
        """Store settings, check external programs, and record database paths.

        Parameters
        ----------
        tmp_dir : str
            Stored as self.tmp_dir (default '/tmp/').
        cpus : int
            Stored as self.cpus (default 1).
        """

        self.cpus = cpus
        self.tmp_dir = tmp_dir

        check_dependencies(['prodigal', 'hmmsearch', 'pfam_search.pl'])

        self.logger = logging.getLogger('timestamp')

        # hard-coded database locations
        self.pfam_hmm_dir = '/srv/db/pfam/33.1/'
        self.tigrfam_hmms = '/srv/whitlam/bio/db/tigrfam/15.0/TIGRFAMs_15.0_HMM/tigrfam.hmm'

        self.protein_file_ext = '_protein.faa'
Example #25
0
    def __init__(self, gbk_arc_assembly_file, gbk_bac_assembly_file,
                 rfq_arc_assembly_file, rfq_bac_assembly_file, cpus):
        """Check for tRNAscan-SE, store settings, and parse assembly summaries.

        The four assembly summary files are passed to parseAssemblySummary()
        and its result is kept as self.domain_dict; cpus is stored as
        self.cpus.
        """
        check_dependencies(['tRNAscan-SE'])

        self.logger = logging.getLogger('timestamp')

        self.cpus = cpus
        self.genome_file_ext = '_genomic.fna'

        assembly_files = (gbk_arc_assembly_file,
                          gbk_bac_assembly_file,
                          rfq_arc_assembly_file,
                          rfq_bac_assembly_file)
        self.domain_dict = self.parseAssemblySummary(*assembly_files)
Example #26
0
    def __init__(self, cpus):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use during homology search.
        """

        required_programs = ['FastTreeMP', 'raxmlHPC-PTHREADS-SSE3']
        check_dependencies(required_programs)

        self.cpus = cpus
        self.logger = logging.getLogger('timestamp')
Example #27
0
    def rogue_test(self, options):
        """Rogue taxa command.

        Validates the input tree directory and taxonomy file, makes sure the
        output directory exists, and runs RogueTest with the given options.
        """

        check_dir_exists(options.input_tree_dir)
        check_file_exists(options.taxonomy_file)
        make_sure_path_exists(options.output_dir)

        # genometreetk is only checked for when decoration is requested
        if options.decorate:
            check_dependencies(['genometreetk'])

        rogue_tester = RogueTest()
        rogue_tester.run(options.input_tree_dir,
                         options.taxonomy_file,
                         options.outgroup_taxon,
                         options.decorate,
                         options.output_dir)

        self.logger.info('Finished rogue taxa test.')
Example #28
0
    def __init__(self, ani_cache_file, cpus, output_dir):
        """Check external programs, store settings, and create the ANI cache.

        Parameters
        ----------
        ani_cache_file : passed to ANI_Cache
        cpus : stored as self.cpus and passed to ANI_Cache
        output_dir : stored as self.output_dir
        """

        check_dependencies(['fastANI', 'mash'])

        self.logger = logging.getLogger('timestamp')

        self.output_dir = output_dir
        self.cpus = cpus

        # fixed thresholds
        self.af_sp = 0.65
        self.min_mash_ani = 90.0

        self.ani_cache = ANI_Cache(ani_cache_file, cpus)
Example #29
0
    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s).

        The subcommand in options.subparser_name is dispatched to the method
        of the same name. Returns 0 after the command has run. An unknown
        command is logged as an error and the process exits with status 1
        (previously the exit status was 0, signalling success).
        """

        check_dependencies(('diamond', 'ktImportText'))

        # every supported subcommand is implemented by a same-named method
        commands = (
            'scaffold_stats',
            'genome_stats',
            'taxon_profile',
            'taxon_filter',
            'outliers',
            'ssu_erroneous',
            'kmeans',
            'dbscan',
            'split',
            'manual',
            'reference',
            'compatible',
            'unique',
            'bin_compare',
            'modify_bin',
            'filter_bins',
            'call_genes',
            'unbinned',
        )

        command = options.subparser_name
        if command in commands:
            getattr(self, command)(options)
        else:
            self.logger.error('Unknown RefineM command: ' +
                              options.subparser_name + '\n')
            # non-zero status so calling scripts can detect the failure
            sys.exit(1)

        return 0
Example #30
0
    def __init__(self, cpus, output_dir):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        output_dir : str
            Directory to store results.
        """

        self.logger = logging.getLogger('timestamp')

        required_programs = ('diamond', 'ktImportText')
        check_dependencies(required_programs)

        self.output_dir = output_dir
        self.cpus = cpus
Example #31
0
    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s).

        Dispatches on options.subparser_name ('nucleotide', 'gene' or
        'rna'), logs 'Done.' and returns 0. The 'rna' command additionally
        checks for blastn and nhmmer before running. An unknown command is
        logged as an error and the process exits with status 1 (previously
        the exit status was 0, signalling success).
        """

        command = options.subparser_name
        if command == 'nucleotide':
            self.nucleotide(options)
        elif command == 'gene':
            self.gene(options)
        elif command == 'rna':
            # these programs are only needed by the rna pipeline
            check_dependencies(['blastn', 'nhmmer'])
            self.rna(options)
        else:
            self.logger.error('  [Error] Unknown RefineM command: ' + options.subparser_name + '\n')
            # non-zero status so calling scripts can detect the failure
            sys.exit(1)

        self.logger.info('Done.')

        return 0
Example #32
0
    def rna_tree(self, options):
        """Infer 16S + 23S tree spanning GTDB genomes.

        Checks that the SSU/LSU alignments and trees exist, makes sure the
        output directory exists, and combines them with RNA_Workflow.
        """

        check_dependencies(['FastTreeMP'])

        # all four inputs must be present before the workflow starts
        check_file_exists(options.ssu_msa)
        check_file_exists(options.ssu_tree)
        check_file_exists(options.lsu_msa)
        check_file_exists(options.lsu_tree)
        make_sure_path_exists(options.output_dir)

        workflow = RNA_Workflow(options.cpus)
        workflow.combine(options.ssu_msa,
                         options.ssu_tree,
                         options.lsu_msa,
                         options.lsu_tree,
                         options.output_dir)

        self.logger.info('Results written to: %s' % options.output_dir)
Example #33
0
    def __init__(self, cpus, output_dir):
        """Check external programs, create the output directory, set thresholds.

        Parameters
        ----------
        cpus : stored as self.cpus
        output_dir : str
            Created if it does not exist; stored as self.output_dir.
        """

        check_dependencies(['mash', 'fastANI'])

        output_dir_missing = not os.path.exists(output_dir)
        if output_dir_missing:
            os.makedirs(output_dir)

        self.cpus = cpus
        self.output_dir = output_dir

        self.logger = logging.getLogger('timestamp')

        # genome assignment parameters:
        # assign genomes above this ANI value
        self.ani_threshold = 95
        # assign genomes above this alignment fraction AF
        self.af_threshold = 0.65
        # assign genomes with Mash above this ANI threshold
        self.mash_ani_threshold = 96.5
Example #34
0
    def __init__(self, cpus, output_dir):
        """Verify mash/fastANI, ensure the output directory, set thresholds.

        Parameters
        ----------
        cpus : stored as self.cpus
        output_dir : str
            Stored as self.output_dir; the directory is created when absent.
        """

        required_programs = ['mash', 'fastANI']
        check_dependencies(required_programs)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        self.logger = logging.getLogger('timestamp')

        self.output_dir = output_dir
        self.cpus = cpus

        # genome assignment parameters
        self.mash_ani_threshold = 96.5  # assign genomes with Mash above this ANI threshold
        self.ani_threshold = 95         # assign genomes above this ANI value
        self.af_threshold = 0.65        # assign genomes above this alignment fraction AF
Example #35
0
    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s).

        Dispatches on options.subparser_name ('nucleotide', 'gene' or
        'ssu'), logs 'Done.' and returns 0. An unknown command is logged as
        an error and the process exits with status 1 (previously the exit
        status was 0, signalling success).
        """

        # ('blastn') is just the string 'blastn', not a one-element tuple, so
        # the helper was handed a sequence of single characters; pass a list
        check_dependencies(['blastn'])

        command = options.subparser_name
        if command == 'nucleotide':
            self.nucleotide(options)
        elif command == 'gene':
            self.gene(options)
        elif command == 'ssu':
            self.ssu(options)
        else:
            self.logger.error('  [Error] Unknown RefineM command: ' + options.subparser_name + '\n')
            # non-zero status so calling scripts can detect the failure
            sys.exit(1)

        self.logger.info('Done.')

        return 0
    def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
        """Check external programs, store settings, create the FastANI helper.

        Parameters
        ----------
        ani_sp : stored as self.ani_sp
        af_sp : stored as self.af_sp
        ani_cache_file : passed to FastANI
        cpus : stored as self.cpus and passed to FastANI
        output_dir : stored as self.output_dir
        """

        check_dependencies(['fastANI', 'mash'])

        self.logger = logging.getLogger('timestamp')

        self.output_dir = output_dir
        self.cpus = cpus

        self.af_sp = af_sp
        self.ani_sp = ani_sp
        self.min_mash_ani = 90.0

        # string spellings kept under the name true_str
        self.true_str = ['t', 'T', 'true', 'True']

        self.fastani = FastANI(ani_cache_file, cpus)
Example #37
0
    def rna_tree(self, options):
        """Infer 16S + 23S tree spanning GTDB genomes.

        The SSU and LSU alignments and trees named in options must exist;
        they are combined by RNA_Workflow.combine() into options.output_dir.
        """

        check_dependencies(['FastTreeMP'])

        check_file_exists(options.ssu_msa)
        check_file_exists(options.ssu_tree)
        check_file_exists(options.lsu_msa)
        check_file_exists(options.lsu_tree)
        make_sure_path_exists(options.output_dir)

        combiner = RNA_Workflow(options.cpus)

        # positional arguments for combine(), in call order
        combine_args = (options.ssu_msa,
                        options.ssu_tree,
                        options.lsu_msa,
                        options.lsu_tree,
                        options.output_dir)
        combiner.combine(*combine_args)

        self.logger.info('Results written to: %s' % options.output_dir)
    def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
        """Check external programs, store settings, and create the ANI cache.

        Parameters
        ----------
        ani_sp : stored as self.ani_sp
        af_sp : stored as self.af_sp
        ani_cache_file : passed to ANI_Cache
        cpus : stored as self.cpus and passed to ANI_Cache
        output_dir : stored as self.output_dir
        """

        check_dependencies(['fastANI', 'mash'])

        self.logger = logging.getLogger('timestamp')

        self.output_dir = output_dir
        self.cpus = cpus

        self.af_sp = af_sp
        self.ani_sp = ani_sp
        self.min_mash_ani = 90.0

        # string spellings kept under the name true_str
        self.true_str = ['t', 'T', 'true', 'True']

        # record with fields ani, af and gid
        self.ClusteredGenome = namedtuple('ClusteredGenome',
                                          ['ani', 'af', 'gid'])

        self.ani_cache = ANI_Cache(ani_cache_file, cpus)
Example #39
0
    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s).

        The subcommand in options.subparser_name is dispatched to the method
        of the same name. Returns 0 after the command has run. An unknown
        command is logged as an error and the process exits with status 1
        (previously the exit status was 0, signalling success).
        """

        logging.basicConfig(format='', level=logging.INFO)

        check_dependencies(('diamond', 'ktImportText'))

        # every supported subcommand is implemented by a same-named method
        commands = (
            'scaffold_stats',
            'genome_stats',
            'gene_profile',
            'outliers',
            'cluster',
            'reference',
            'compatible',
            'unique',
            'bin_compare',
            'modify',
            'call_genes',
            'unbinned',
            'tetra_compare',
        )

        # the leftover debug statement that printed the options object for
        # 'scaffold_stats' has been removed; it was a Python 2 print
        # statement, which is a syntax error under Python 3
        command = options.subparser_name
        if command in commands:
            getattr(self, command)(options)
        else:
            self.logger.error('  [Error] Unknown RefineM command: ' + options.subparser_name + '\n')
            # non-zero status so calling scripts can detect the failure
            sys.exit(1)

        return 0
Example #40
0
    def __init__(self, ani_cache_file, cpus, output_dir):
        """Check external programs, create the ANI cache and stats records.

        Parameters
        ----------
        ani_cache_file : passed to ANI_Cache
        cpus : stored as self.cpus and passed to ANI_Cache
        output_dir : stored as self.output_dir
        """

        check_dependencies(['fastANI', 'mash'])

        self.logger = logging.getLogger('timestamp')

        self.output_dir = output_dir
        self.cpus = cpus

        self.ani_cache = ANI_Cache(ani_cache_file, cpus)

        # maximum number of randomly selected genomes to
        # consider when calculating pairwise statistics
        self.max_genomes_for_stats = 25

        rep_fields = ['min_ani', 'mean_ani', 'std_ani', 'median_ani']
        self.RepStats = namedtuple('RepStats', rep_fields)

        pairwise_fields = ['min_ani',
                           'mean_ani',
                           'std_ani',
                           'median_ani',
                           'ani_to_medoid',
                           'mean_ani_to_medoid',
                           'ani_below_95']
        self.PairwiseStats = namedtuple('PairwiseStats', pairwise_fields)
Example #41
0
    def ssu_tree(self, options):
        """Infer 16S tree spanning GTDB genomes.

        Runs the dependency check, checks the two input files and the
        output directory through the helper functions, then hands the
        user options to an RNA_Workflow run for the 'ssu' gene.

        Parameters
        ----------
        options
            Parsed command-line options; the attributes read here are
            listed in the workflow call below.
        """

        required_programs = ['mothur', 'ssu-align', 'ssu-mask', 'FastTreeMP', 'blastn']
        check_dependencies(required_programs)

        check_file_exists(options.gtdb_metadata_file)
        check_file_exists(options.gtdb_ssu_file)
        make_sure_path_exists(options.output_dir)

        workflow = RNA_Workflow(options.cpus)
        workflow.run(
            'ssu',
            options.gtdb_metadata_file,
            options.gtdb_ssu_file,
            options.min_ssu_length,
            options.min_scaffold_length,
            options.min_quality,
            options.max_contigs,
            options.min_N50,
            # the option is a "disable" flag, so it is inverted before being passed
            not options.disable_tax_filter,
            options.genome_list,
            options.output_dir,
            options.align_method,
        )

        message = 'Results written to: %s' % options.output_dir
        self.logger.info(message)
Example #42
0
    # process each genome
    print 'Generating metadata for each genome:'
    parallel = Parallel(cpus = cpus)
    parallel.run(self._producer,
                  None,
                  input_files,
                  self._progress)

if __name__ == '__main__':
  # Script entry point. NOTE(review): the print statements below are
  # Python 2 syntax; this block will not parse under Python 3.

  # Banner assembled from the module-level program metadata.
  print __prog_name__ + ' v' + __version__ + ': ' + __prog_desc__
  print '  by ' + __author__ + ' (' + __email__ + ')' + '\n'

  # Command-line interface: three positional arguments plus an optional
  # thread count (default 32). The formatter class shows defaults in --help.
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('rna_gene', choices=['ssu', 'lsu_23S'], help="rRNA gene to process")
  parser.add_argument('ncbi_genome_dir', help='base directory leading to NCBI archaeal and bacterial genome assemblies')
  parser.add_argument('user_genome_dir', help='base directory leading to user genomes or NONE to skip')
  parser.add_argument('-t', '--threads', help='number of CPUs to use', type=int, default=32)

  args = parser.parse_args()

  # Run the dependency check for 'genometk' before starting any work.
  check_dependencies(['genometk'])

  try:
    p = RNA()
    p.run(args.rna_gene, args.ncbi_genome_dir, args.user_genome_dir, args.threads)
  except SystemExit:
    # A SystemExit raised during the run is reported here and not re-raised.
    print "\nControlled exit resulting from an unrecoverable error or warning."
  except:
    # Any other exception: report its type, then re-raise it unchanged.
    print "\nUnexpected error:", sys.exc_info()[0]
    raise
Example #43
0
    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s).

        Configures logging, runs the dependency check, and dispatches to
        the method that handles `options.subparser_name`.

        Parameters
        ----------
        options
            Parsed command-line options; `subparser_name` selects the
            handler and the whole object is passed on to it.

        Returns
        -------
        int
            0 after the selected handler returns.

        Raises
        ------
        SystemExit
            With exit status 1 if `options.subparser_name` is not a
            recognized command.
        """

        logging.basicConfig(format='', level=logging.INFO)

        check_dependencies(('FastTree', 'hmmsearch'))

        # commands handled by a method with the same name as the command
        same_name_commands = (
            'ssu_tree', 'lsu_tree', 'rna_tree', 'rna_dump', 'derep_tree',
            'bootstrap', 'jk_markers', 'jk_taxa', 'combine', 'midpoint',
            'outgroup', 'qc_genomes', 'mash_dist', 'select_type_genomes',
            'cluster_named_types', 'cluster_de_novo', 'cluster_user',
            'tree_gids', 'assign', 'rep_compare', 'cluster_stats',
            'propagate', 'fill_ranks', 'strip', 'pull', 'append',
            'arb_records',
        )

        # explicit whitelist mapping command -> handler method name; only
        # names listed here are ever looked up on the instance
        handler_names = dict((name, name) for name in same_name_commands)
        handler_names['pd'] = 'phylogenetic_diversity'
        handler_names['pd_clade'] = 'phylogenetic_diversity_clade'

        command = options.subparser_name
        handler_name = handler_names.get(command)
        if handler_name is None:
            self.logger.error('Unknown GenomeTreeTk command: ' + command + '\n')
            # exit with a non-zero status so the failure is visible to the
            # calling shell; a bare sys.exit() would report success
            sys.exit(1)

        # the handler is resolved lazily so only the selected method is
        # looked up on the instance
        getattr(self, handler_name)(options)

        return 0
Example #44
0
  def __init__(self, pfam_hmm_dir='/srv/db/pfam/27/', protein_file_ext='_protein.faa'):
    """Run the dependency check for hmmsearch and record Pfam settings.

    Parameters
    ----------
    pfam_hmm_dir : str
        Directory stored as `self.pfam_hmm_dir`. Defaults to the
        previously hard-coded location so existing callers are unaffected.
    protein_file_ext : str
        Extension stored as `self.protein_file_ext`.
    """
    check_dependencies(['hmmsearch'])

    # formerly a fixed absolute path; now overridable per instance
    self.pfam_hmm_dir = pfam_hmm_dir

    self.protein_file_ext = protein_file_ext
Example #45
0
    def __init__(self):
        """Run the dependency check for prokka and set the genome file extension."""
        required_programs = ["prokka"]
        check_dependencies(required_programs)

        # extension stored for later use by this instance
        self.genome_file_ext = "_genomic.fna"
Example #46
0
  def __init__(self,
               tigrfam_hmms='/srv/whitlam/bio/db/tigrfam/15.0/TIGRFAMs_15.0_HMM/tigrfam.hmm',
               protein_file_ext='_protein.faa'):
    """Run the dependency check for hmmsearch and record TIGRFAM settings.

    Parameters
    ----------
    tigrfam_hmms : str
        Path stored as `self.tigrfam_hmms`. Defaults to the previously
        hard-coded location so existing callers are unaffected.
    protein_file_ext : str
        Extension stored as `self.protein_file_ext`.
    """
    check_dependencies(['hmmsearch'])

    # formerly a fixed absolute path; now overridable per instance
    self.tigrfam_hmms = tigrfam_hmms

    self.protein_file_ext = protein_file_ext
Example #47
0
 def __init__(self):
     check_dependencies(['prodigal'])