Beispiel #1
0
    def __init__(self):
        """Run the dependency check and set up rank lookups and timing."""

        required_programs = ['comparem', 'diamond', 'makeblastdb']
        check_dependencies(required_programs)

        # label stored on the instance for later use
        self.underclassified = 'underclassified'

        # expose the rank tables defined on Taxonomy through this instance
        for attr_name in ('rank_prefixes', 'rank_index', 'rank_labels'):
            setattr(self, attr_name, getattr(Taxonomy, attr_name))

        self.time_keeper = TimeKeeper()
Beispiel #2
0
    def run(self, genome_id_file, marker_id_file, model, output_dir):
        """Infer a genome tree from a concatenated marker gene alignment.

        Marker gene HMMs are gathered, genes are aligned for every requested
        genome, the alignments are concatenated, a tree is inferred with
        FastTree and a short summary report is written to the output
        directory.

        Parameters
        ----------
        genome_id_file : str
            File specifying unique ids of genomes to include in tree.
        marker_id_file : str
            File specifying unique ids of marker genes to use for inference.
        model : str ['wag' or 'jtt']
            Model of evolution to use.
        output_dir : str
            Directory to store results.
        """

        time_keeper = TimeKeeper()

        output_alignment_dir = os.path.join(output_dir, 'alignments')
        make_sure_path_exists(output_alignment_dir)

        output_model_dir = os.path.join(output_dir, 'hmm_models')
        make_sure_path_exists(output_model_dir)

        # read directory for each genome
        genome_dirs = read_genome_dir_file(self.genome_dir_file)

        # read genomes within the ingroup
        ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
        genome_ids = ncbi_genome_ids.union(user_genome_ids)
        self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
        self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
        self.logger.info('User genomes: %d' % len(user_genome_ids))

        # get marker genes
        self.logger.info('Reading marker genes.')
        marker_genes = read_marker_id_file(marker_id_file)
        self.logger.info('Read %d marker genes.' % len(marker_genes))

        # gather all single-copy HMMs into a single model file
        hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
        hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
        self.logger.info('Generating marker gene HMM model files.')
        self._fetch_marker_models(marker_genes, hmm_model_out, hmm_info_out,
                                  output_model_dir)

        # align gene sequences
        align_markers = AlignMarkers(self.cpus)
        align_markers.run(genome_ids, genome_dirs, marker_genes, True,
                          output_alignment_dir, output_model_dir)

        # create concatenated alignment file
        self.logger.info('Concatenating alignments.')
        concatenated_alignment_file = os.path.join(
            output_dir, 'concatenated_alignment.faa')
        marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
        create_concatenated_alignment(genome_ids, marker_genes,
                                      output_alignment_dir,
                                      concatenated_alignment_file, marker_file)

        # create concatenated genome tree
        self.logger.info('Inferring concatenated genome tree.')
        concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
        concatenated_tree_log = os.path.join(output_dir,
                                             'concatenated.tree.log')
        log_file = os.path.join(output_dir, 'fasttree.log')
        fast_tree = FastTree(multithreaded=True)
        fast_tree.run(concatenated_alignment_file, 'prot', model,
                      concatenated_tree, concatenated_tree_log, log_file)

        # generate summary report; the context manager guarantees the file
        # is closed even if one of the writes raises
        report_out = os.path.join(output_dir, 'infer_workflow.log')
        with open(report_out, 'w') as fout:
            fout.write('[infer]\n')
            fout.write('Genome Id file: %s\n' % genome_id_file)
            fout.write('Marker Id file: %s\n' % marker_id_file)
            fout.write('Model of evolution: %s\n' % model)
            fout.write(time_keeper.get_time_stamp())
Beispiel #3
0
 def __init__(self):
     """Initialization.

     Stores the root logger (``logging.getLogger()`` with no name) and a new
     ``TimeKeeper`` instance on the object.
     """
     self.logger = logging.getLogger()
     self.time_keeper = TimeKeeper()
Beispiel #4
0
class OptionsParser(object):
    def __init__(self):
        """Initialization.

        Stores the root logger (``logging.getLogger()`` with no name) and a
        new ``TimeKeeper`` instance on the object.
        """
        self.logger = logging.getLogger()
        self.time_keeper = TimeKeeper()

    def outliers(self, options):
        """Create information for identifying taxonomic outliers."""

        # mandatory input files
        for required_file in (options.input_tree, options.taxonomy_file):
            check_file_exists(required_file)

        # optional input files are only validated when they were supplied
        for optional_file in (options.plot_taxa_file,
                              options.trusted_taxa_file):
            if optional_file:
                check_file_exists(optional_file)

        out_dir = options.output_dir
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # polyphyly highlighting cannot be requested without the F-measure
        # table flag
        polyphyly_without_table = (options.highlight_polyphyly
                                   and not options.fmeasure_table)
        if polyphyly_without_table:
            self.logger.error(
                "The '--highlight_polyphyly' flag must be used with the '--fmeasure_table' flag."
            )
            return

        outlier_finder = Outliers(options.skip_mpld3, options.dpi, out_dir)
        outlier_finder.run(options.input_tree,
                           options.taxonomy_file,
                           options.viral,
                           options.plot_taxa_file,
                           options.plot_dist_taxa_only,
                           options.plot_domain,
                           options.highlight_polyphyly,
                           options.highlight_taxa_file,
                           options.trusted_taxa_file,
                           options.fixed_root,
                           options.min_children,
                           options.min_support,
                           options.mblet,
                           options.fmeasure_table,
                           options.min_fmeasure,
                           options.fmeasure_mono,
                           options.verbose_table)

        self.logger.info('Done.')

    def scale_tree(self, options):
        """Rescale the branch lengths of a rooted tree using RED values."""

        input_tree = options.input_tree
        check_file_exists(input_tree)

        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(
            input_tree,
            schema='newick',
            rooting='force-rooted',
            preserve_underscores=True)

        self.logger.info('Scaling tree based on RED.')
        red_calculator = RelativeDistance()
        red_calculator.decorate_rel_dist(tree)

        def is_not_root(node):
            """Filter used to leave the seed node out of the traversal."""
            return node != tree.seed_node

        # each branch becomes the difference between the rel_dist value of
        # the node and that of its parent
        for node in tree.preorder_node_iter(is_not_root):
            node.edge_length = node.rel_dist - node.parent_node.rel_dist

        tree.write_to_path(
            options.output_tree,
            schema='newick',
            suppress_rooting=True,
            unquoted_underscores=True)

        self.logger.info('Done.')

    def compare_red(self, options):
        """Compare RED values of taxa calculated over different trees.

        Two tab-separated RED tables are read (first line skipped as a
        header; column 1 is the taxon, column 2 the lineage and column 3 the
        median RED). The lineage is taken from the first table only. The
        first line of ``options.red_dict2`` is evaluated to obtain a mapping
        from rank label to median RED.

        One row per taxon is written to ``options.output_table`` with both
        RED values, their difference, and a 'Changed rank' column. That
        column is 'True (<rank>: <distance>)' when the RED in the second
        table is more than 0.1 away from the median of the taxon's own rank
        and closest to the median of another rank, 'False' otherwise, and
        'NA' when the taxon is missing from the second table or is a domain.

        Parameters
        ----------
        options : argparse-style namespace
            Must provide red_table1, red_table2, red_dict2, output_table and
            viral.
        """

        check_file_exists(options.red_table1)
        check_file_exists(options.red_table2)
        check_file_exists(options.red_dict2)

        # NOTE(review): eval() executes whatever expression the file
        # contains; only use dictionary files from a trusted source. If the
        # file is always a plain literal, ast.literal_eval would be safer.
        with open(options.red_dict2) as f:
            median_reds = eval(f.readline())

        red1 = {}
        red2 = {}
        lineage = {}
        for red_values, red_file in ((red1, options.red_table1),
                                    (red2, options.red_table2)):
            with open(red_file) as f:
                f.readline()  # skip header

                for line in f:
                    if not line.strip():
                        continue  # tolerate blank lines

                    line_split = line.strip().split('\t')
                    taxon = line_split[0]
                    red_values[taxon] = float(line_split[2])

                    # identity test: comparing dictionaries by content is
                    # slow and is wrong whenever both tables happen to hold
                    # the same entries
                    if red_values is red1:
                        lineage[taxon] = line_split[1]

        red1_label = os.path.splitext(os.path.basename(options.red_table1))[0]
        red2_label = os.path.splitext(os.path.basename(options.red_table2))[0]

        with open(options.output_table, 'w') as fout:
            fout.write(
                'Taxon\tLineage\t%s\t%s\tDifference\tAbs. Difference\tChanged rank\n'
                % (red1_label, red2_label))

            all_taxa = set(red1).union(red2)
            if options.viral:
                sorted_taxa = sort_viral_taxa(all_taxa)
            else:
                sorted_taxa = Taxonomy().sort_taxa(all_taxa)

            for taxon in sorted_taxa:
                r1 = red1.get(taxon)
                r2 = red2.get(taxon)

                if r2 is None:
                    # taxon only present in the first table
                    fout.write('%s\t%s\t%.3f\t%s\t%s\t%s\t%s\n' %
                               (taxon, lineage[taxon], r1, 'NA', 'NA', 'NA',
                                'NA'))
                    continue

                if r1 is None:
                    # taxon only present in the second table
                    fout.write('%s\t%s\t%s\t%.3f\t%s\t%s' %
                               (taxon, 'NA', 'NA', r2, 'NA', 'NA'))
                else:
                    fout.write(
                        '%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f' %
                        (taxon, lineage[taxon], r1, r2, r1 - r2,
                         abs(r1 - r2)))

                rank_prefix = taxon[0:3]
                if rank_prefix == 'd__':
                    # domains are not assessed, but the row must still be
                    # terminated or it merges with the next taxon's row
                    fout.write('\tNA\n')
                    continue

                if options.viral:
                    rank_label = VIRAL_RANK_LABELS[VIRAL_RANK_PREFIXES.index(
                        rank_prefix)]
                else:
                    rank_label = Taxonomy.rank_labels[
                        Taxonomy.rank_prefixes.index(rank_prefix)]
                rank_median = median_reds[rank_label]

                # only look for a better fitting rank when the RED is more
                # than 0.1 away from the median of the current rank
                closest_rank = rank_label
                closest_dist = 1e6
                if r2 < rank_median - 0.1 or r2 > rank_median + 0.1:
                    for rank, median_red in median_reds.items():
                        dist = abs(r2 - median_red)
                        if dist < closest_dist:
                            closest_dist = dist
                            closest_rank = rank

                if rank_label != closest_rank:
                    fout.write('\tTrue (%s: %.3f)' %
                               (closest_rank, closest_dist))
                else:
                    fout.write('\tFalse')
                fout.write('\n')

    def mark_tree(self, options):
        """Mark tree command."""

        check_file_exists(options.input_tree)

        marker = MarkTree()

        # the options carry negated ("no_*") flags; MarkTree.run receives
        # the inverted values
        percentile = not options.no_percentile
        relative_divergence = not options.no_relative_divergence
        prediction = not options.no_prediction

        marker.run(options.input_tree,
                   options.output_tree,
                   options.min_support,
                   options.only_named_clades,
                   options.min_length,
                   percentile,
                   relative_divergence,
                   prediction,
                   options.thresholds)

        self.logger.info('Marked tree written to: %s' % options.output_tree)

    def rogue_test(self, options):
        """Rogue taxa command."""

        # validate inputs and make sure the output location is available
        check_dir_exists(options.input_tree_dir)
        check_file_exists(options.taxonomy_file)
        make_sure_path_exists(options.output_dir)

        # genometreetk is only checked for when decoration was requested
        if options.decorate:
            check_dependencies(['genometreetk'])

        rogue_tester = RogueTest()
        rogue_tester.run(options.input_tree_dir,
                         options.taxonomy_file,
                         options.outgroup_taxon,
                         options.decorate,
                         options.output_dir)

        self.logger.info('Finished rogue taxa test.')

    def decorate(self, options):
        """Place internal taxonomic labels on tree."""

        for required_file in (options.input_tree, options.taxonomy_file):
            check_file_exists(required_file)

        decorator = Decorate()
        decorator.run(options.input_tree,
                      options.taxonomy_file,
                      options.viral,
                      options.trusted_taxa_file,
                      options.min_children,
                      options.min_support,
                      options.skip_rd_refine,
                      options.output_tree)

        self.logger.info('Finished decorating tree.')

    def taxon_stats(self, options):
        """Taxon stats command.

        Writes a tab-separated table to ``options.output_file`` with one row
        per taxon. Columns for ranks above the taxon's own rank hold '-';
        the remaining columns hold the number of distinct descendants found
        at each successive level of the ``taxon_children`` mapping.

        Parameters
        ----------
        options : argparse-style namespace
            Must provide taxonomy_file and output_file.
        """

        check_file_exists(options.taxonomy_file)

        taxonomy = Taxonomy().read(options.taxonomy_file)
        taxon_children = Taxonomy().taxon_children(taxonomy)

        # the context manager closes the table even if a lookup fails midway
        with open(options.output_file, 'w') as fout:
            fout.write('Taxa')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t# named %s' % rank)
            fout.write('\t# extant taxon with complete taxonomy')
            fout.write('\n')

            for rank_prefix in Taxonomy.rank_prefixes:
                # find taxa at the specified rank
                cur_taxa = sorted(taxon for taxon in taxon_children
                                  if taxon.startswith(rank_prefix))

                for taxon in cur_taxa:
                    fout.write(taxon)

                    # pad the columns of ranks above the current one
                    fout.write('\t-' * Taxonomy.rank_index[rank_prefix])

                    # walk down one level per iteration, counting the
                    # distinct children found at each level
                    next_taxa = [taxon]
                    for _ in range(Taxonomy.rank_index[rank_prefix],
                                   Taxonomy.rank_index['s__'] + 1):
                        children_taxa = set()
                        for t in next_taxa:
                            children_taxa.update(taxon_children[t])

                        fout.write('\t%d' % len(children_taxa))
                        next_taxa = children_taxa
                    fout.write('\n')

        self.logger.info('Summary statistics written to: %s' %
                         options.output_file)

    def robustness_plot(self, options):
        """Robustness plot command"""

        # banner announcing the command
        banner_lines = (
            '',
            '*******************************************************************************',
            ' [PhyloRank - robustness_plot] Plotting distances across a set of tree.',
            '*******************************************************************************',
        )
        for banner_line in banner_lines:
            self.logger.info(banner_line)

        plotter = RobustnessPlot()
        plotter.run(options.rank,
                    options.input_tree_dir,
                    options.full_tree_file,
                    options.derep_tree_file,
                    options.taxonomy_file,
                    options.output_prefix,
                    options.min_children,
                    options.title)

        self.time_keeper.print_time_stamp()

    def rd_ranks(self, options):
        """Calculate number of taxa for specified rd thresholds."""

        # validate the input tree and prepare the output directory
        check_file_exists(options.input_tree)
        make_sure_path_exists(options.output_dir)

        rd_ranker = RdRanks()
        rd_ranker.run(options.input_tree,
                      options.thresholds,
                      options.output_dir)

        self.logger.info('Done.')

    def bl_dist(self, options):
        """Calculate distribution of branch lengths at each taxonomic rank."""

        # validate the input tree and prepare the output directory
        check_file_exists(options.input_tree)
        make_sure_path_exists(options.output_dir)

        bl_distribution = BranchLengthDistribution()
        bl_distribution.run(options.input_tree,
                            options.trusted_taxa_file,
                            options.min_children,
                            options.taxonomy_file,
                            options.output_dir)

        self.logger.info('Done.')

    def bl_optimal(self, options):
        """Determine branch length for best congruency with existing taxonomy.

        Delegates the search to ``BranchLengthDistribution.optimal`` and logs
        the returned branch length together with the number of correct and
        incorrect taxa and the resulting precision.

        Parameters
        ----------
        options : argparse-style namespace
            Must provide input_tree, rank, min_dist, max_dist, step_size and
            output_table.
        """

        b = BranchLengthDistribution()
        optimal_bl, correct_taxa, incorrect_taxa = b.optimal(
            options.input_tree, options.rank, options.min_dist,
            options.max_dist, options.step_size, options.output_table)

        # precision is undefined when no taxa were evaluated; report NaN
        # rather than failing with a ZeroDivisionError
        total_taxa = correct_taxa + incorrect_taxa
        if total_taxa:
            prec = float(correct_taxa) / total_taxa
        else:
            prec = float('nan')

        self.logger.info('Optimal branch length is %f.' % optimal_bl)
        self.logger.info(
            'This results in %d correct and %d incorrect taxa (precision = %.2f).'
            % (correct_taxa, incorrect_taxa, prec))

    def bl_decorate(self, options):
        """Decorate tree using a mean branch length criterion."""

        check_file_exists(options.input_tree)

        bl_distribution = BranchLengthDistribution()
        bl_distribution.decorate(options.input_tree,
                                 options.taxonomy_file,
                                 options.threshold,
                                 options.rank,
                                 options.retain_named_lineages,
                                 options.keep_labels,
                                 options.prune,
                                 options.output_tree)

        self.logger.info('Done.')

    def bl_table(self, options):
        """Produce table with number of lineages for increasing mean branch lengths."""

        for required_file in (options.input_tree, options.taxon_category):
            check_file_exists(required_file)

        bl_distribution = BranchLengthDistribution()
        bl_distribution.table(options.input_tree,
                              options.taxon_category,
                              options.step_size,
                              options.output_table)

        self.logger.info('Done.')

    def rank_res(self, options):
        """Calculate taxonomic resolution at each rank.

        Internal, labelled nodes of the input tree are tallied by the rank
        prefixes found in their taxon label and by the lowest rank of that
        label. Genomes in the taxonomy file whose lineage holds only the
        bare rank prefix at some rank (e.g. 'p__') are tallied as having
        species-level resolution at that rank. A rank-by-rank table is
        written to ``options.output_file``; when ``options.taxa_file`` is
        set, every tallied taxon is also listed in that file.

        Parameters
        ----------
        options : argparse-style namespace
            Must provide input_tree, taxonomy_file, taxa_file and
            output_file.
        """

        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        taxa_out = None
        if options.taxa_file:
            taxa_out = open(options.taxa_file, 'w')

        # try/finally ensures the optional taxa file is closed even if the
        # tree or taxonomy file cannot be processed
        try:
            if taxa_out is not None:
                taxa_out.write('Rank\tLowest Rank\tTaxon\n')

            # determine taxonomic resolution of named groups
            tree = dendropy.Tree.get_from_path(options.input_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)

            rank_res = defaultdict(lambda: defaultdict(int))
            for node in tree.preorder_node_iter(
                    lambda n: n != tree.seed_node):
                if not node.label or node.is_leaf():
                    continue

                _support, taxon_name, _auxiliary_info = parse_label(
                    node.label)
                if not taxon_name:
                    continue

                # prefix of the last taxon in a (possibly multi-rank) label
                lowest_rank = [x.strip()
                               for x in taxon_name.split(';')][-1][0:3]
                for rank_prefix in Taxonomy.rank_prefixes:
                    if rank_prefix not in taxon_name:
                        continue

                    rank_res[rank_prefix][lowest_rank] += 1
                    if taxa_out is not None:
                        rank_prefix_name = Taxonomy.rank_labels[
                            Taxonomy.rank_index[rank_prefix]]
                        lowest_rank_name = Taxonomy.rank_labels[
                            Taxonomy.rank_index[lowest_rank]]
                        taxa_out.write('%s\t%s\t%s\n' %
                                       (rank_prefix_name, lowest_rank_name,
                                        taxon_name))

            # identify any singleton taxa which are treated as having
            # species level resolution
            with open(options.taxonomy_file) as f:
                for line in f:
                    if not line.strip():
                        continue  # tolerate blank lines

                    # strip the line ending, otherwise the final rank keeps
                    # a trailing newline and a bare 's__' is never matched
                    line_split = line.rstrip('\r\n').split('\t')
                    genome_id = line_split[0]
                    taxonomy = [t.strip() for t in line_split[1].split(';')]

                    # zip() stops at the shorter sequence, so truncated
                    # lineages no longer raise an IndexError
                    for rank_prefix, taxon in zip(Taxonomy.rank_prefixes,
                                                  taxonomy):
                        if taxon != rank_prefix:
                            continue

                        # this taxon is undefined at the specified rank so
                        # must be the sole representative; e.g., a p__
                        # indicates a taxon that represents a novel phylum
                        rank_res[rank_prefix]['s__'] += 1
                        if taxa_out is not None:
                            rank_prefix_name = Taxonomy.rank_labels[
                                Taxonomy.rank_index[rank_prefix]]
                            taxa_out.write('%s\t%s\t%s (%s)\n' %
                                           (rank_prefix_name, 'species',
                                            taxon, genome_id))
        finally:
            if taxa_out is not None:
                taxa_out.close()

        # write out results
        with open(options.output_file, 'w') as fout:
            fout.write('Category')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t' + rank)
            fout.write('\n')

            for i, rank_prefix in enumerate(Taxonomy.rank_prefixes[1:]):
                fout.write(Taxonomy.rank_labels[i + 1])

                for j, r in enumerate(Taxonomy.rank_prefixes[1:]):
                    if i >= j:
                        fout.write('\t' +
                                   str(rank_res[r].get(rank_prefix, 0)))
                    else:
                        fout.write('\t-')
                fout.write('\n')

        self.logger.info('Done.')

    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s)"""

        logging.basicConfig(format='', level=logging.INFO)

        # check_dependencies(('diamond', 'ktImportText'))

        if options.subparser_name == 'outliers':
            self.outliers(options)
        elif options.subparser_name == 'scale_tree':
            self.scale_tree(options)
        elif options.subparser_name == 'compare_red':
            self.compare_red(options)
        elif options.subparser_name == 'mark_tree':
            self.mark_tree(options)
        elif options.subparser_name == 'rogue_test':
            self.rogue_test(options)
        elif options.subparser_name == 'decorate':
            self.decorate(options)
        elif options.subparser_name == 'taxon_stats':
            self.taxon_stats(options)
        elif options.subparser_name == 'robustness_plot':
            self.robustness_plot(options)
        elif options.subparser_name == 'rd_ranks':
            self.rd_ranks(options)
        elif options.subparser_name == 'bl_dist':
            self.bl_dist(options)
        elif options.subparser_name == 'bl_optimal':
            self.bl_optimal(options)
        elif options.subparser_name == 'bl_decorate':
            self.bl_decorate(options)
        elif options.subparser_name == 'bl_table':
            self.bl_table(options)
        elif options.subparser_name == 'rank_res':
            self.rank_res(options)
        else:
            self.logger.error('  [Error] Unknown PhyloRank command: ' +
                              options.subparser_name + '\n')
            sys.exit()

        return 0
Beispiel #5
0
 def __init__(self):
     """Store the root logger and a new ``TimeKeeper`` on the instance."""
     self.logger = logging.getLogger()
     self.time_keeper = TimeKeeper()
Beispiel #6
0
class OptionsParser():
    def __init__(self):
        """Store the root logger and a new ``TimeKeeper`` on the instance."""
        self.logger = logging.getLogger()
        self.time_keeper = TimeKeeper()

    def _genome_files(self, genome_dir, genome_ext):
        """Collect the genome files found in a directory.

        Parameters
        ----------
        genome_dir : str
            Directory containing genomes of interest.
        genome_ext : str
            Extension of genome files.

        Returns
        -------
        list
            Paths (directory joined with file name) of all files in the
            directory ending with the given extension. The program exits
            when no such file is found.
        """

        check_dir_exists(genome_dir)

        genome_files = [os.path.join(genome_dir, file_name)
                        for file_name in os.listdir(genome_dir)
                        if file_name.endswith(genome_ext)]

        if len(genome_files) == 0:
            self.logger.warning('  [Warning] No genomes found. Check the --genome_ext flag used to identify genomes.')
            sys.exit()

        return genome_files

    def _write_usage_profile(self, genome_usage, feature_set, output_file):
        """Write out occurrence of specified features for each genome.

        Parameters
        ----------
        genome_usage : d[genome_id][feature] -> count
            Occurrence of genomic feature in genome
        feature_set : iterable
            All genomic features.
        output_file : str
            File to produce.
        """

        sorted_feature_set = sorted(feature_set)

        fout = open(output_file, 'w')
        fout.write('Genome ID')
        for feature in sorted_feature_set:
            fout.write('\t' + feature)
        fout.write('\n')

        totals = defaultdict(int)
        for genome_id, features in genome_usage.iteritems():
            for feature in sorted_feature_set:
                totals[genome_id] += features.get(feature, 0)

        for genome_id, features in genome_usage.iteritems():
            fout.write(genome_id)

            for feature in sorted_feature_set:
                fout.write('\t%.2f%%' % (features.get(feature, 0) * 100.0 / totals[genome_id]))
            fout.write('\n')

    def ani(self, options):
        """ANI command"""

        # banner announcing the command
        banner_lines = (
            '',
            '*******************************************************************************',
            ' [CompareM - ani] Calculating the ANI between genome pairs.',
            '*******************************************************************************',
            '',
        )
        for banner_line in banner_lines:
            self.logger.info(banner_line)

        make_sure_path_exists(options.output_dir)

        # the returned list is not used here, but the call still terminates
        # the program when no genome files are found
        self._genome_files(options.genome_dir, options.genome_ext)

        self.logger.info('')
        self.logger.info('  Average nucleotide identity information written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def call_genes(self, options):
        """Call genes command.

        Runs Prodigal over all genome files found in ``options.genome_dir``
        and writes a per-genome summary to 'call_genes.summary.tsv' in
        ``options.output_dir``.

        Parameters
        ----------
        options : argparse-style namespace
            Must provide genome_dir, genome_ext, cpus, force_table and
            output_dir.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - call_genes] Identifying genes within genomes.')
        self.logger.info('*******************************************************************************')

        make_sure_path_exists(options.output_dir)

        # _genome_files() exits the program itself when nothing is found, so
        # no additional emptiness check is required here
        genome_files = self._genome_files(options.genome_dir, options.genome_ext)

        prodigal = Prodigal(options.cpus)
        summary_stats = prodigal.run(genome_files, False, options.force_table, False, options.output_dir)

        # write gene calling summary; items() works on both Python 2 and 3,
        # and the context manager closes the file even if a write fails
        summary_file = os.path.join(options.output_dir, 'call_genes.summary.tsv')
        with open(summary_file, 'w') as fout:
            fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
            for genome_id, stats in summary_stats.items():
                fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                         stats.best_translation_table,
                                                         stats.coding_density_4,
                                                         stats.coding_density_11))

        self.logger.info('')
        self.logger.info('  Identified genes written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def rblast(self, options):
        """Reciprocal blast command.

        Protein files in ``options.protein_dir`` are rewritten into a
        'genes' sub-directory of the output directory with the genome id
        appended to every gene id, and are then searched against each other
        with blastp (``options.blastp``) or diamond.

        Parameters
        ----------
        options : argparse-style namespace
            Must provide protein_dir, protein_ext, output_dir, cpus, evalue,
            blastp and, for diamond, per_identity.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - rblast] Performing reciprocal blast between genomes.')
        self.logger.info('*******************************************************************************')

        check_dir_exists(options.protein_dir)
        make_sure_path_exists(options.output_dir)

        aa_gene_files = [os.path.join(options.protein_dir, f)
                         for f in os.listdir(options.protein_dir)
                         if f.endswith(options.protein_ext)]

        if not aa_gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            sys.exit()

        # modify gene ids to include genome ids in order to ensure
        # all gene identifiers are unique across the set of genomes,
        # also removes the trailing asterisk used to identify the stop
        # codon
        self.logger.info('')
        self.logger.info('  Appending genome identifiers to all gene identifiers.')
        gene_out_dir = os.path.join(options.output_dir, 'genes')
        make_sure_path_exists(gene_out_dir)
        modified_aa_gene_files = []
        for gf in aa_gene_files:
            genome_id = remove_extension(gf)

            aa_file = os.path.join(gene_out_dir, genome_id + '.faa')
            with open(aa_file, 'w') as fout:
                for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                    fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')

                    # endswith() also copes with an empty sequence, where
                    # indexing seq[-1] would raise an IndexError
                    if seq.endswith('*'):
                        seq = seq[:-1]
                    fout.write(seq + '\n')

            modified_aa_gene_files.append(aa_file)

        # perform the reciprocal blast with blastp or diamond
        self.logger.info('')
        if options.blastp:
            rblast = ReciprocalBlast(options.cpus)
            rblast.run(modified_aa_gene_files, options.evalue, options.output_dir)

            # concatenate all blast tables to mimic output of diamond, all hits
            # for a given genome MUST be in consecutive order to fully mimic
            # the expected results from diamond
            self.logger.info('')
            self.logger.info('  Creating single file with all blast hits (be patient!).')
            blast_files = sorted(f for f in os.listdir(options.output_dir) if f.endswith('.blastp.tsv'))
            hit_tables = [os.path.join(options.output_dir, f) for f in blast_files]
            concatenate_files(hit_tables, os.path.join(options.output_dir, 'all_hits.tsv'))
        else:
            rdiamond = ReciprocalDiamond(options.cpus)
            rdiamond.run(modified_aa_gene_files, options.evalue, options.per_identity, options.output_dir)

        self.logger.info('')
        self.logger.info('  Reciprocal blast hits written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def aai(self, options):
        """AAI command"""

        # banner announcing the command
        banner_lines = (
            '',
            '*******************************************************************************',
            ' [CompareM - aai] Calculating the AAI between homologs in genome pairs.',
            '*******************************************************************************',
            '',
        )
        for banner_line in banner_lines:
            self.logger.info(banner_line)

        check_dir_exists(options.rblast_dir)
        make_sure_path_exists(options.output_dir)

        # genome ids are derived from the '.faa' files in the 'genes'
        # sub-directory of the reciprocal blast directory
        protein_dir = os.path.join(options.rblast_dir, 'genes')
        genome_ids = [remove_extension(file_name, '.faa')
                      for file_name in os.listdir(protein_dir)
                      if file_name.endswith('.faa')]

        if len(genome_ids) == 0:
            self.logger.warning('  [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            sys.exit()

        calculator = AAICalculator(options.cpus)
        calculator.run(genome_ids,
                       protein_dir,
                       options.rblast_dir,
                       options.per_identity,
                       options.per_aln_len,
                       options.write_shared_genes,
                       options.output_dir)

        shared_genes_dir = os.path.join(options.output_dir, calculator.shared_genes)
        self.logger.info('')
        self.logger.info('  Identified homologs between genome pairs written to: %s' % shared_genes_dir)

        self.time_keeper.print_time_stamp()

    def aa_usage(self, options):
        """Amino acid usage command"""

        # banner announcing the command
        banner_lines = (
            '',
            '*******************************************************************************',
            ' [CompareM - aa_usage] Calculating amino acid usage within each genome.',
            '*******************************************************************************',
            '',
        )
        for banner_line in banner_lines:
            self.logger.info(banner_line)

        check_dir_exists(options.protein_dir)

        # collect the files holding called genes
        gene_files = [os.path.join(options.protein_dir, file_name)
                      for file_name in os.listdir(options.protein_dir)
                      if file_name.endswith(options.protein_ext)]

        # warn the user and stop when there is nothing to process
        if not gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            return

        # calculate amino acid usage
        usage_calculator = AminoAcidUsage(options.cpus)
        genome_aa_usage, aa_set = usage_calculator.run(gene_files)

        # write out results
        self._write_usage_profile(genome_aa_usage, aa_set, options.output_file)

        self.logger.info('')
        self.logger.info('  Amino acid usage written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()

    def codon_usage(self, options):
        """Codon usage command"""

        # banner announcing the command
        banner_lines = (
            '',
            '*******************************************************************************',
            ' [CompareM - codon_usage] Calculating codon usage within each genome.',
            '*******************************************************************************',
            '',
        )
        for banner_line in banner_lines:
            self.logger.info(banner_line)

        check_dir_exists(options.gene_dir)

        # collect the files holding called genes
        gene_files = [os.path.join(options.gene_dir, file_name)
                      for file_name in os.listdir(options.gene_dir)
                      if file_name.endswith(options.gene_ext)]

        # warn the user and stop when there is nothing to process
        if not gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
            return

        # calculate codon usage; the third returned value is not needed here
        usage_calculator = CodonUsage(options.cpus, options.keep_ambiguous)
        genome_codon_usage, codon_set, _mean_length = usage_calculator.run(gene_files)

        # write out results
        self._write_usage_profile(genome_codon_usage, codon_set, options.output_file)

        self.logger.info('')
        self.logger.info('  Codon usage written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()

    def stop_usage(self, options):
        """Stop codon usage command.

        Selects the entries of options.gene_dir whose names end with
        options.gene_ext, runs CodonUsage restricted to stop codons and
        writes a tab-separated table to options.output_file.

        The header row holds one column per codon; when run() returns a
        truthy mean-length mapping, each codon is followed by a
        '<codon>: avg. seq. length' column. Every following row starts with a
        genome id, then the count for each codon (0 when absent) and, if
        applicable, the mean length formatted with one decimal or 'na' when
        it is missing or zero.

        Parameters
        ----------
        options : object
            Parsed options exposing gene_dir, gene_ext, cpus and output_file.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - stop_usage] Calculating stop codon usage within each genome.')
        self.logger.info('*******************************************************************************')
        self.logger.info('')

        check_dir_exists(options.gene_dir)

        # get list of files with called genes
        gene_files = []
        files = os.listdir(options.gene_dir)
        for f in files:
            if f.endswith(options.gene_ext):
                gene_files.append(os.path.join(options.gene_dir, f))

        # warn user if no files were found
        if len(gene_files) == 0:
            self.logger.warning('  [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
            return

        # calculate stop codon usage
        codon_usage = CodonUsage(options.cpus, keep_ambiguous=False, stop_codon_only=True)
        genome_codon_usage, codon_set, mean_gene_length = codon_usage.run(gene_files)

        # write out results; the context manager guarantees the table is
        # flushed and the handle released even if a write fails part-way
        with open(options.output_file, 'w') as fout:
            # header row
            for codon in codon_set:
                fout.write('\t' + codon)
                if mean_gene_length:
                    fout.write('\t' + codon + ': avg. seq. length')
            fout.write('\n')

            # one row per genome
            for genome_id, codons in genome_codon_usage.iteritems():
                fout.write(genome_id)

                for codon in codon_set:
                    fout.write('\t%d' % codons.get(codon, 0))

                    if mean_gene_length:
                        mean_len = mean_gene_length[genome_id].get(codon, None)
                        if mean_len:
                            fout.write('\t%.1f' % mean_len)
                        else:
                            # missing (or zero) mean length is reported as 'na'
                            fout.write('\tna')
                fout.write('\n')

        self.logger.info('')
        self.logger.info('  Stop codon usage written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()

    def kmer_usage(self, options):
        """Run the 'kmer_usage' command.

        Validates options.k, locates the genome files through
        self._genome_files, computes the kmer profile with KmerUsage and
        writes it to options.output_file.

        Parameters
        ----------
        options : object
            Parsed options exposing k, cpus, genome_dir, genome_ext and
            output_file.

        Raises
        ------
        SystemExit
            With status 0 when options.k is not positive or exceeds 10.
        """
        stars = '*******************************************************************************'
        for message in ('',
                        stars,
                        ' [CompareM - kmer_usage] Calculating kmer usage within each genome.',
                        stars,
                        ''):
            self.logger.info(message)

        # k must be positive and at most 10; anything else ends the run here
        if options.k <= 0 or options.k > 10:
            self.logger.warning('[Warning] CompareM only support kmers with k <= 10.')
            sys.exit(0)

        genome_files = self._genome_files(options.genome_dir, options.genome_ext)

        # calculate kmer usage
        kmer_counter = KmerUsage(options.k, options.cpus)
        genome_kmer_usage, kmer_set = kmer_counter.run(genome_files)

        # write out results
        self.logger.info('')
        self.logger.info('  Writing kmer profile to file (be patient!).')
        self._write_usage_profile(genome_kmer_usage, kmer_set, options.output_file)

        self.logger.info('')
        self.logger.info('  Kmer usage written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()

    def lgt_di(self, options):
        """Run the 'lgt_di' command (dinucleotide usage of genes).

        Selects the entries of options.gene_dir whose names end with
        options.gene_ext and passes them, together with options.crit_value
        and options.output_dir, to LgtDinucleotide.run. When no entry
        matches, a warning is logged and the method returns early.

        Parameters
        ----------
        options : object
            Parsed options exposing gene_dir, gene_ext, cpus, crit_value and
            output_dir.
        """
        stars = '*******************************************************************************'
        for message in ('',
                        stars,
                        ' [CompareM - lgt_di] Calculating dinuceotide (3rd,1st) usage of genes.',
                        stars,
                        ''):
            self.logger.info(message)

        check_dir_exists(options.gene_dir)

        # called-gene files are selected purely by filename suffix
        called_gene_files = [os.path.join(options.gene_dir, entry)
                             for entry in os.listdir(options.gene_dir)
                             if entry.endswith(options.gene_ext)]

        # nothing to process: point the user at the flag that selects files
        if not called_gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
            return

        LgtDinucleotide(options.cpus).run(called_gene_files, options.crit_value, options.output_dir)

        self.logger.info('')
        self.logger.info('  Dinucleotide usage written to directory: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def lgt_codon(self, options):
        """Run the 'lgt_codon' command (codon usage of genes).

        Selects the entries of options.gene_dir whose names end with
        options.gene_ext and passes them, together with options.output_dir,
        to LgtCodon.run. When no entry matches, a warning is logged and the
        method returns early.

        Parameters
        ----------
        options : object
            Parsed options exposing gene_dir, gene_ext, cpus and output_dir.
        """
        stars = '*******************************************************************************'
        for message in ('',
                        stars,
                        ' [CompareM - lgt_codon] Calculating codon usage of genes.',
                        stars,
                        ''):
            self.logger.info(message)

        check_dir_exists(options.gene_dir)

        # called-gene files are selected purely by filename suffix
        called_gene_files = [os.path.join(options.gene_dir, entry)
                             for entry in os.listdir(options.gene_dir)
                             if entry.endswith(options.gene_ext)]

        # nothing to process: point the user at the flag that selects files
        if not called_gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
            return

        LgtCodon(options.cpus).run(called_gene_files, options.output_dir)

        self.logger.info('')
        self.logger.info('  Codon usage written to directory: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def unique(self, options):
        """Run the 'unique' command.

        Only the command banner and a time stamp are emitted here; the
        options argument is not read.
        """
        stars = '*******************************************************************************'
        for message in ('',
                        stars,
                        ' [CompareM - unique] Identifying genes present in a single genome.',
                        stars):
            self.logger.info(message)

        self.time_keeper.print_time_stamp()

    def pcoa_plot(self, options):
        """Run the 'pcoa_plot' command.

        Logs the command banner and hands options.aai_summary_file to
        PCoA.plot.

        Parameters
        ----------
        options : object
            Parsed options exposing aai_summary_file.
        """
        stars = '*******************************************************************************'
        for message in ('',
                        stars,
                        ' [CompareM - pcoa_plot] Generating PCoA plot showing relative similarity of genomes.',
                        stars,
                        '',
                        '  Performing PCoA.'):
            self.logger.info(message)

        PCoA().plot(options.aai_summary_file)

        self.time_keeper.print_time_stamp()

    def heatmap(self, options):
        """Run the 'heatmap' command.

        Logs the command banner, builds a Heatmap from
        options.aai_summary_file and options.output_file, and plots it with
        the requested cluster, method and metric settings.

        Parameters
        ----------
        options : object
            Parsed options exposing aai_summary_file, output_file, cluster,
            method and metric.
        """
        stars = '*******************************************************************************'
        for message in ('',
                        stars,
                        ' [CompareM - heatmap] Generating heatmap showing relative similarity of genomes.',
                        stars,
                        '',
                        '  Making heatmap.'):
            self.logger.info(message)

        plotter = Heatmap(options.aai_summary_file, options.output_file)
        plotter.plot(options.cluster, options.method, options.metric)

        self.time_keeper.print_time_stamp()

    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s).

        Configures the logging level, normalises the optional 'file'
        attribute and then invokes the method matching
        options.subparser_name. The 'aai_wf' command chains call_genes,
        rblast and aai, rewriting the directory options between the steps.

        Parameters
        ----------
        options : object
            Parsed command-line options. Must expose subparser_name; the
            attributes bVerbose, bQuiet and file are optional.

        Returns
        -------
        int
            Always 0 once a command has been run.

        Raises
        ------
        SystemExit
            If options.subparser_name is not a known command.
        """
        # bVerbose/bQuiet may be absent from the namespace; only that case
        # falls back to INFO. A bare except here would also swallow Ctrl-C
        # and genuine programming errors.
        try:
            if options.bVerbose:
                level = logging.DEBUG
            elif options.bQuiet:
                level = logging.ERROR
            else:
                level = logging.INFO
        except AttributeError:
            level = logging.INFO
        logging.basicConfig(format='', level=level)

        # an output 'file' of "stdout" is represented by the empty string;
        # options without a 'file' attribute are left untouched
        if getattr(options, 'file', None) == "stdout":
            options.file = ''

        # commands whose handler method carries the same name as the command
        direct_commands = ('call_genes', 'rblast', 'aai',
                           'aa_usage', 'codon_usage', 'kmer_usage',
                           'stop_usage', 'lgt_di', 'lgt_codon',
                           'unique', 'pcoa_plot', 'heatmap')

        command = options.subparser_name
        if command == 'aai_wf':
            # full workflow: gene calling -> reciprocal blast -> AAI, each
            # step reading the previous step's output below root_dir
            root_dir = options.output_dir
            make_sure_path_exists(root_dir)

            options.output_dir = os.path.join(root_dir, 'genes')
            self.call_genes(options)

            options.protein_ext = 'faa'
            options.protein_dir = os.path.join(root_dir, 'genes')
            options.output_dir = os.path.join(root_dir, 'rblast')
            self.rblast(options)

            options.output_dir = root_dir
            options.rblast_dir = os.path.join(root_dir, 'rblast')
            self.aai(options)
        elif command in direct_commands:
            getattr(self, command)(options)
        else:
            self.logger.error('  [Error] Unknown CompareM command: "' + options.subparser_name + '"\n')
            sys.exit()

        return 0
Beispiel #7
0
 def __init__(self):
     """Initialization.

     Stores a logger and a TimeKeeper on the instance.
     """
     # getLogger() without a name returns the root logger
     self.logger = logging.getLogger()
     # presumably used to report elapsed time at the end of a command -- TODO confirm
     self.time_keeper = TimeKeeper()
Beispiel #8
0
class OptionsParser():
    def __init__(self):
        """Initialization.

        Stores the logger and the TimeKeeper that the command methods use
        for their banner output and closing time stamp.
        """
        # getLogger() without a name returns the root logger
        self.logger = logging.getLogger()
        # commands call self.time_keeper.print_time_stamp() when they finish
        self.time_keeper = TimeKeeper()
        
    #~ def item_eval(item):
        #~ try:
            #~ return ast.literal_eval(item)
        #~ except ValueError:
            #~ return item

    def _genome_files(self, genome_dir, genome_ext):
        """Collect the genome files found in a directory.

        Files are selected purely by filename suffix.

        Parameters
        ----------
        genome_dir : str
            Directory containing genomes of interest.
        genome_ext : str
            Extension of genome files.

        Returns
        -------
        list
            Paths (genome_dir joined with the filename) of matching files.

        Raises
        ------
        SystemExit
            If no file in genome_dir ends with genome_ext.
        """

        check_dir_exists(genome_dir)

        matches = [os.path.join(genome_dir, entry)
                   for entry in os.listdir(genome_dir)
                   if entry.endswith(genome_ext)]

        # an empty selection almost always means a wrong extension flag
        if len(matches) == 0:
            self.logger.warning('  [Warning] No genomes found. Check the --genome_ext or --protein_ext flag used to identify genomes.')
            sys.exit()

        return matches

    def _check_nuclotide_seqs(self, seq_files):
        """Check if files contain sequences in nucleotide space.

        Parameters
        ----------
        seq_files : iterable
            Sequence files to check.

        Returns
        -------
        boolean
            True if files can be treated as containing nucleotide sequences.
        """

        for seq_file in seq_files:
            if not seq_io.is_nucleotide(seq_file):
                print('Expected all files to contain sequences in nucleotide space.')
                print('File %s appears like it may contain amino acids sequences.' % seq_file)

                yes_response = query_yes_no('Do all files contain only nucleotide sequences?', default='no')
                if not yes_response:
                    return False

        return True

    def _check_protein_seqs(self, seq_files):
        """Check if files contain sequences in amino acid space.

        Parameters
        ----------
        seq_files : iterable
            Sequence files to check.

        Returns
        -------
        boolean
            True if files can be treated as containing amino acid sequences.
        """

        for seq_file in seq_files:
            if not seq_io.is_protein(seq_file):
                print('Expected all files to contain sequences in amino acid space.')
                print('File %s appears like it may contain nucleotide sequences.' % seq_file)

                yes_response = query_yes_no('Do all files contain only amino acid sequences?', default='no')
                if not yes_response:
                    return False

        return True

    def scaffold_stats(self, options):
        """Scaffold statistics command.

        Validates that the scaffold file and the genome files contain
        nucleotide sequences, obtains a coverage profile and tetranucleotide
        signatures (computing them when no precomputed file is supplied) and
        writes the combined table to <output_dir>/scaffold_stats.tsv.

        Parameters
        ----------
        options : object
            Parsed options exposing scaffold_file, genome_nt_dir, genome_ext,
            output_dir, coverage_file, bam_files, cov_all_reads,
            cov_min_align, cov_max_edit_dist, tetra_file and cpus.

        Raises
        ------
        SystemExit
            If the scaffold file or the genome files are not accepted as
            nucleotide sequences.
        """
        # the raw option dump goes through the logger at DEBUG level rather
        # than straight to stdout
        self.logger.debug('Options: %s', options)

        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - scaffold_stats] Calculating statistics for scaffolds.')
        self.logger.info('*******************************************************************************')

        check_file_exists(options.scaffold_file)

        if not self._check_nuclotide_seqs([options.scaffold_file]):
            self.logger.warning('[Warning] Scaffold file must contain nucleotide sequences.')
            sys.exit()

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        make_sure_path_exists(options.output_dir)

        # get coverage information
        if not options.coverage_file:
            if not options.bam_files:
                # no way to derive coverage: continue without a coverage file
                self.logger.warning('\n  [Warning] One or more BAM files must be specified in order to calculate coverage profiles.')
                coverage_file = None
            else:
                coverage = Coverage(options.cpus)
                coverage_file = os.path.join(options.output_dir, 'coverage.tsv')
                coverage.run(options.bam_files, coverage_file, options.cov_all_reads, options.cov_min_align, options.cov_max_edit_dist)
                self.logger.info('')
                self.logger.info('  Coverage profiles written to: %s' % coverage_file)
        else:
            coverage_file = options.coverage_file

        # get tetranucleotide signatures
        if not options.tetra_file:
            self.logger.info('')
            tetra = Tetranucleotide(options.cpus)
            tetra_file = os.path.join(options.output_dir, 'tetra.tsv')
            signatures = tetra.run(options.scaffold_file)
            tetra.write(signatures, tetra_file)
            self.logger.info('  Tetranucleotide signatures written to: %s' % tetra_file)
        else:
            tetra_file = options.tetra_file

        # write out scaffold statistics
        stats_output = os.path.join(options.output_dir, 'scaffold_stats.tsv')
        stats = ScaffoldStats(options.cpus)
        stats.run(options.scaffold_file, genome_files, tetra_file, coverage_file, stats_output)

        self.logger.info('  Scaffold statistic written to: %s' % stats_output)

        self.time_keeper.print_time_stamp()

    def genome_stats(self, options):
        """Run the 'genome_stats' command.

        Reads the scaffold statistics from options.scaffold_stats_file,
        feeds them to GenomeStats.run and writes the result to
        options.output_file.

        Parameters
        ----------
        options : object
            Parsed options exposing scaffold_stats_file, cpus and
            output_file.
        """
        stars = '*******************************************************************************'
        for message in ('',
                        stars,
                        ' [RefineM - genome_stats] Calculating statistics for genomes.',
                        stars):
            self.logger.info(message)

        check_file_exists(options.scaffold_stats_file)

        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        per_scaffold = ScaffoldStats(options.cpus)
        per_scaffold.read(options.scaffold_stats_file)

        # summarise per genome, then persist the summary
        per_genome = GenomeStats()
        per_genome.run(per_scaffold)
        per_genome.write(options.output_file)

        self.logger.info('  Genome statistic written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()

    def gene_profile(self, options):
        """Run the 'gene_profile' command.

        Checks the required input files, collects the protein files of the
        genomes and builds taxonomic profiles with GeneProfile, which
        receives options.output_dir as its result location.

        Parameters
        ----------
        options : object
            Parsed options exposing output_dir, scaffold_stats_file,
            taxonomy_file, db_file, genome_prot_dir, protein_ext, cpus,
            per_to_classify, evalue and per_identity.

        Raises
        ------
        SystemExit
            If the gene files are not accepted as amino acid sequences.
        """
        stars = '*******************************************************************************'
        for message in ('',
                        stars,
                        ' [RefineM - gene_profile] Generating taxonomic profiles from genes.',
                        stars):
            self.logger.info(message)

        make_sure_path_exists(options.output_dir)
        for required_file in (options.scaffold_stats_file,
                              options.taxonomy_file,
                              options.db_file):
            check_file_exists(required_file)

        protein_files = self._genome_files(options.genome_prot_dir, options.protein_ext)
        if not self._check_protein_seqs(protein_files):
            self.logger.warning('[Warning] All files must contain amino acid sequences.')
            sys.exit()

        # build gene profile
        profiler = GeneProfile(options.cpus, options.output_dir)
        profiler.run(protein_files,
                     options.scaffold_stats_file,
                     options.db_file,
                     options.taxonomy_file,
                     options.per_to_classify,
                     options.evalue,
                     options.per_identity)

        self.logger.info('')
        self.logger.info('  Results written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def outliers(self, options):
        """Outlier command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - outliers] Identifying scaffolds with divergent characteristics.')
        self.logger.info('*******************************************************************************')

        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        genome_stats = GenomeStats()
        genome_stats = genome_stats.run(scaffold_stats)

        # identify outliers
        outliers = Outliers()
        outlier_file = os.path.join(options.output_dir, 'outliers.tsv')
        outliers.identify(scaffold_stats, genome_stats,
                                      options.gc_perc, options.td_perc,
                                      options.cov_corr, options.cov_perc,
                                      options.report_type, outlier_file)
        self.logger.info('  Outlier information written to: ' + outlier_file)

        # create outlier plots
        self.logger.info('')

        highlight_scaffolds_ids = {}
        if options.highlight_file:
            for line in open(options.highlight_file):
                line_split = line.strip().split('\t')
                if len(line_split) > 1:
                    highlight_scaffolds_ids[line_split[0]] = [float(x.strip()) / 255.0 for x in line_split[1].split(',')]
                else:
                    highlight_scaffolds_ids[line_split[0]] = [1.0, 0, 0]

        link_scaffold_ids = []
        if options.links_file:
            with open(options.links_file) as links_file:
                for line in links_file:
                    #print line.strip().split('\t')
                    link_scaffold_ids.append([ast.literal_eval(item) if i not in (0,2) else item for i,item in enumerate((line.strip().split('\t')))])
            #link_scaffold_ids.append(line.strip().split('\t') for line in open(options.links_file))
            
        #print list(link_scaffold_ids[0])
        
        # create plots
        genomes_processed = 0
        plot_dir = os.path.join(options.output_dir, 'plots')
        make_sure_path_exists(plot_dir)
        genome_plots = defaultdict(list)
        for genome_id, gs in genome_stats.iteritems():
            genomes_processed += 1

            sys.stdout.write('  Plotting scaffold distribution for %d of %d (%.1f%%) genomes.\r' %
                                                                                            (genomes_processed,
                                                                                             len(genome_stats),
                                                                                             genomes_processed * 100.0 / len(genome_stats)))
            sys.stdout.flush()

            genome_scaffold_stats = {}
            for scaffold_id in scaffold_stats.scaffolds_in_genome[genome_id]:
                genome_scaffold_stats[scaffold_id] = scaffold_stats.stats[scaffold_id]

            if options.individual_plots:
                #~ # GC plot
                #~ gc_plots = GcPlots(options)
                #~ gc_plots.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids, gs.mean_gc, outliers.gc_dist, [options.gc_perc])
#~ 
                #~ output_plot = os.path.join(plot_dir, genome_id + '.gc_plots.' + options.image_type)
                #~ gc_plots.save_plot(output_plot, dpi=options.dpi)
                #~ gc_plots.save_html(os.path.join(plot_dir, genome_id + '.gc_plots.html'))

                # TD plot
                td_plots = TdPlots(options)
                td_plots.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids, gs.mean_signature, outliers.td_dist, [options.td_perc])

                output_plot = os.path.join(plot_dir, genome_id + '.td_plots.' + options.image_type)
                td_plots.save_plot(output_plot, dpi=options.dpi)
                td_plots.save_html(os.path.join(plot_dir, genome_id + '.td_plots.html'))

                #~ # mean absolute deviation of coverage profiles
                #~ cov_perc_plots = CovPercPlots(options)
                #~ cov_perc_plots.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids, gs.mean_coverage, [options.cov_perc])
#~ 
                #~ output_plot = os.path.join(plot_dir, genome_id + '.cov_perc.' + options.image_type)
                #~ cov_perc_plots.save_plot(output_plot, dpi=options.dpi)
                #~ cov_perc_plots.save_html(os.path.join(plot_dir, genome_id + '.cov_perc.html'))
#~ 
                #~ # coverage correlation plots
                #~ if len(gs.mean_coverage) > 1:
                    #~ cov_corr_plots = CovCorrPlots(options)
                    #~ cov_corr_plots.plot(genome_scaffold_stats, highlight_scaffolds_ids, gs.mean_coverage, [options.cov_corr])
#~ 
                    #~ output_plot = os.path.join(plot_dir, genome_id + '.cov_corr.' + options.image_type)
                    #~ cov_corr_plots.save_plot(output_plot, dpi=options.dpi)
                    #~ cov_corr_plots.save_html(os.path.join(plot_dir, genome_id + '.cov_corr.html'))

            #~ # combined distribution, GC vs. coverage, and tetranucleotide signature plots
            #~ combined_plots = CombinedPlots(options)
            #~ combined_plots.plot(genome_scaffold_stats,
                            #~ highlight_scaffolds_ids, link_scaffold_ids, gs,
                            #~ outliers.gc_dist, outliers.td_dist,
                            #~ options.gc_perc, options.td_perc, options.cov_perc)
#~ 
            #~ output_plot = os.path.join(plot_dir, genome_id + '.combined.' + options.image_type)
            #~ combined_plots.save_plot(output_plot, dpi=options.dpi)
            #~ combined_plots.save_html(os.path.join(plot_dir, genome_id + '.combined.html'))
#~ 
            #~ genome_plots[genome_id].append(('Combined', genome_id + '.combined.html'))
#~ 
            #~ # combined plot of distributions
            #~ dist_plots = DistributionPlots(options)
            #~ dist_plots.plot(genome_scaffold_stats,
                            #~ highlight_scaffolds_ids,
                            #~ link_scaffold_ids,
                            #~ gs,
                            #~ outliers.gc_dist, outliers.td_dist,
                            #~ options.gc_perc, options.td_perc, options.cov_perc)
#~ 
            #~ output_plot = os.path.join(plot_dir, genome_id + '.dist_plot.' + options.image_type)
            #~ dist_plots.save_plot(output_plot, dpi=options.dpi)
            #~ dist_plots.save_html(os.path.join(plot_dir, genome_id + '.dist_plot.html'))
#~ 
            #~ genome_plots[genome_id].append(('Distributions', genome_id + '.dist_plot.html'))
#~ 
            #~ # GC vs. coverage plot
            #~ gc_cov_plot = GcCovPlot(options)
            #~ gc_cov_plot.plot(genome_scaffold_stats,
                             #~ highlight_scaffolds_ids, link_scaffold_ids,
                             #~ gs.mean_gc, gs.mean_coverage)
#~ 
            #~ output_plot = os.path.join(plot_dir, genome_id + '.gc_coverge.' + options.image_type)
            #~ gc_cov_plot.save_plot(output_plot, dpi=options.dpi)
            #~ gc_cov_plot.save_html(os.path.join(plot_dir, genome_id + '.gc_coverge.html'))
#~ 
            #~ genome_plots[genome_id].append(('GC vs. coverage', genome_id + '.gc_coverge.html'))

            # tetranucleotide signature PCA plot
            tetra = TetraPcaPlot(options)
            tetra.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids)

            output_plot = os.path.join(plot_dir, genome_id + '.tetra_pca.' + options.image_type)
            tetra.save_plot(output_plot, dpi=options.dpi)
            tetra.save_html(os.path.join(plot_dir, genome_id + '.tetra_pca.html'))

            genome_plots[genome_id].append(('Tetra PCA', genome_id + '.tetra_pca.html'))

        sys.stdout.write('\n')

        outliers.create_html_index(plot_dir, genome_plots)

        self.logger.info('  Outlier plots written to: ' + plot_dir)

        self.time_keeper.print_time_stamp()

    def cluster(self, options):
        """Run the 'cluster' command.

        Reads the scaffold statistics and hands them, together with the
        clustering settings, the genome file and the output directory, to
        Cluster.run.

        Parameters
        ----------
        options : object
            Parsed options exposing scaffold_stats_file, genome_file,
            output_dir, cpus, num_clusters, num_components, K, no_coverage,
            no_pca and iterations.
        """
        stars = '*******************************************************************************'
        for message in ('',
                        stars,
                        ' [RefineM - cluster] Partitioning bin into clusters.',
                        stars):
            self.logger.info(message)

        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        per_scaffold = ScaffoldStats()
        per_scaffold.read(options.scaffold_stats_file)

        partitioner = Cluster(options.cpus)
        partitioner.run(per_scaffold,
                        options.num_clusters,
                        options.num_components,
                        options.K,
                        options.no_coverage,
                        options.no_pca,
                        options.iterations,
                        options.genome_file,
                        options.output_dir)

        self.logger.info('')
        self.logger.info('  Partitioned sequences written to: ' + options.output_dir)

        self.time_keeper.print_time_stamp()

    def reference(self, options):
        """Run the 'reference' command.

        Checks the input files, collects the reference protein files and
        runs Reference over them; the value returned by Reference.run is
        reported as the location of the results.

        Parameters
        ----------
        options : object
            Parsed options exposing scaffold_prot_file, scaffold_stats_file,
            output_dir, ref_genome_prot_dir, protein_ext, cpus, db_file,
            evalue and per_identity.

        Raises
        ------
        SystemExit
            If the reference files are not accepted as amino acid sequences.
        """
        stars = '*******************************************************************************'
        for message in ('',
                        stars,
                        '[RefineM - reference] Identifying scaffolds similar to specific genome(s).',
                        stars):
            self.logger.info(message)

        check_file_exists(options.scaffold_prot_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        ref_protein_files = self._genome_files(options.ref_genome_prot_dir, options.protein_ext)
        if not self._check_protein_seqs(ref_protein_files):
            self.logger.warning('[Warning] All files must contain amino acid sequences.')
            sys.exit()

        searcher = Reference(options.cpus, options.output_dir)
        results_path = searcher.run(options.scaffold_prot_file,
                                    options.scaffold_stats_file,
                                    ref_protein_files,
                                    options.db_file,
                                    options.evalue,
                                    options.per_identity)

        self.logger.info('')
        self.logger.info('  Results written to: ' + results_path)

        self.time_keeper.print_time_stamp()

    def compatible(self, options):
        """Run the 'compatible' command.

        Reads the scaffold statistics, derives per-genome statistics, asks
        Reference.homology_check for putative homologs and lets
        Outliers.compatible write its findings to
        <output_dir>/compatible.tsv.

        Parameters
        ----------
        options : object
            Parsed options exposing reference_file, scaffold_stats_file,
            output_dir, min_genes, perc_genes, gc_perc, td_perc, cov_corr,
            cov_perc and report_type.
        """
        stars = '*******************************************************************************'
        for message in ('',
                        stars,
                        '[RefineM - compatible] Identify scaffolds with compatible genomic statistics.',
                        stars):
            self.logger.info(message)

        check_file_exists(options.reference_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        # read scaffold statistics and calculate genome stats
        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        per_scaffold = ScaffoldStats()
        per_scaffold.read(options.scaffold_stats_file)

        per_genome = GenomeStats().run(per_scaffold)

        # identify putative homologs to reference genomes; perc_genes is
        # converted to float before being passed on
        homology_checker = Reference(1, None)
        putative_homologs = homology_checker.homology_check(options.reference_file,
                                                            options.min_genes,
                                                            float(options.perc_genes))

        # identify scaffolds compatible with bins
        results_path = os.path.join(options.output_dir, 'compatible.tsv')
        Outliers().compatible(putative_homologs, per_scaffold, per_genome,
                              options.gc_perc, options.td_perc,
                              options.cov_corr, options.cov_perc,
                              options.report_type, results_path)

        self.logger.info('')
        self.logger.info('  Results written to: ' + results_path)

        self.time_keeper.print_time_stamp()

    def modify(self, options):
        """Run the 'modify' command.

        Exactly one kind of modification may be requested: explicit
        'add'/'remove' lists, an outlier file, or a compatible file.
        Invalid combinations log a warning and terminate via sys.exit().

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            output_genome, add, remove, outlier_file, compatible_file,
            genome_file, scaffold_file and unique_only.
        """
        for message in ('',
                        '*******************************************************************************',
                        ' [RefineM - modify] Modifying scaffolds in genome.',
                        '*******************************************************************************'):
            self.logger.info(message)

        make_sure_path_exists(os.path.dirname(options.output_genome))

        # the two mutually exclusive ways of specifying a modification
        manual_edit = options.add or options.remove
        file_edit = options.outlier_file or options.compatible_file

        if not (manual_edit or file_edit):
            self.logger.warning('  [Warning] No modification to bin requested.\n')
            sys.exit()

        if manual_edit and file_edit:
            self.logger.warning("  [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n")
            sys.exit()

        if options.outlier_file and options.compatible_file:
            self.logger.warning("  [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
            sys.exit()

        # only the add/remove path reports sequences it could not process
        failed_to_add, failed_to_remove = [], []
        if manual_edit:
            failed_to_add, failed_to_remove = genome_tk.modify(options.genome_file,
                                                               options.scaffold_file,
                                                               options.add,
                                                               options.remove,
                                                               options.output_genome)
        elif options.outlier_file:
            Outliers().remove_outliers(options.genome_file,
                                       options.outlier_file,
                                       options.output_genome)
        elif options.compatible_file:
            editor = Outliers()
            if options.unique_only:
                add_compatible = editor.add_compatible_unique
            else:
                add_compatible = editor.add_compatible_closest
            add_compatible(options.scaffold_file,
                           options.genome_file,
                           options.compatible_file,
                           options.output_genome)

        failure_reports = (
            ('  [Warning] Failed to add the following sequence(s):', failed_to_add),
            ('  [Warning] Failed to remove the following sequence(s):', failed_to_remove),
        )
        for heading, failed_ids in failure_reports:
            if failed_ids:
                self.logger.warning(heading)
                for seq_id in failed_ids:
                    self.logger.warning('    %s' % seq_id)

        self.logger.info('')
        self.logger.info('  Modified genome written to: ' + options.output_genome)

        self.time_keeper.print_time_stamp()

    def call_genes(self, options):
        """Run the 'call_genes' command.

        Runs Prodigal over the genome files found in the genome
        directory and, when an unbinned scaffold file is given, over that
        file as well (with meta=True) into an 'unbinned' subdirectory.

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            genome_nt_dir, genome_ext, output_dir, cpus and unbinned_file.
        """
        for message in ('',
                        '*******************************************************************************',
                        ' [RefineM - call_genes] Identifying genes within genomes.',
                        '*******************************************************************************'):
            self.logger.info(message)

        check_dir_exists(options.genome_nt_dir)
        make_sure_path_exists(options.output_dir)

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        all_nucleotide = self._check_nuclotide_seqs(genome_files)
        if not all_nucleotide:
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        # genes within each genome
        gene_caller = Prodigal(options.cpus)
        gene_caller.run(genome_files, options.output_dir)
        self.logger.info('  Genes in genomes written to: %s' % options.output_dir)

        # genes within the unbinned scaffolds, if provided
        if options.unbinned_file:
            unbinned_dir = os.path.join(options.output_dir, 'unbinned')
            gene_caller.run([options.unbinned_file], unbinned_dir, meta=True)
            self.logger.info('  Genes in unbinned scaffolds written to: %s' % unbinned_dir)

        self.time_keeper.print_time_stamp()

    def unique(self, options):
        """Run the 'unique' command.

        Reports sequences that occur more than once, either within a
        single genome or shared between a pair of genomes, as returned
        by genome_tk.unique().

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            genome_nt_dir and genome_ext.
        """
        for message in ('',
                        '*******************************************************************************',
                        '[RefineM - unique] Ensuring sequences are assigned to a single genome.',
                        '*******************************************************************************'):
            self.logger.info(message)

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        duplicates = genome_tk.unique(genome_files)

        self.logger.info('')
        if len(duplicates) == 0:
            self.logger.info('  Pass: All sequences were identified exactly once.')
            self.time_keeper.print_time_stamp()
            return

        self.logger.info('  Fail: One or more sequences were observed multiple times.')

        # visit each unordered pair of genomes once, including a genome
        # paired with itself
        ordered_ids = sorted(duplicates.keys())
        for index, genome_idA in enumerate(ordered_ids):
            for genome_idB in ordered_ids[index:]:
                repeated = duplicates[genome_idA][genome_idB]
                if len(repeated) == 0:
                    continue

                self.logger.info('')
                if genome_idA != genome_idB:
                    self.logger.info('  There are %d sequences shared between %s and %s:' % (len(repeated), genome_idA, genome_idB))
                else:
                    self.logger.info('  There are %d sequences present more than once in %s:' % (len(repeated), genome_idA))

                for seq_id in repeated:
                    self.logger.info('    %s' % seq_id)

        self.time_keeper.print_time_stamp()

    def bin_compare(self, options):
        """Run the 'bin_compare' command.

        Collects the genome files of two directories, verifies that both
        sets contain nucleotide sequences, and passes them to
        BinComparer.run() together with the scaffold and output files.

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            genome_nt_dir1, genome_nt_dir2, genome_ext1, genome_ext2,
            scaffold_file and output_file.
        """
        for message in ('',
                        '*******************************************************************************',
                        '[RefineM - bin_compare] Comparing two sets of genomes.',
                        '*******************************************************************************'):
            self.logger.info(message)

        check_dir_exists(options.genome_nt_dir1)
        check_dir_exists(options.genome_nt_dir2)

        # first set of genomes
        first_set = self._genome_files(options.genome_nt_dir1, options.genome_ext1)
        first_ok = self._check_nuclotide_seqs(first_set)
        if not first_ok:
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        # second set of genomes
        second_set = self._genome_files(options.genome_nt_dir2, options.genome_ext2)
        second_ok = self._check_nuclotide_seqs(second_set)
        if not second_ok:
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        BinComparer().run(first_set,
                          second_set,
                          options.scaffold_file,
                          options.output_file)

        self.logger.info('')
        self.logger.info('  Detailed bin comparison written to: ' + options.output_file)

        self.time_keeper.print_time_stamp()

    def unbinned(self, options):
        """Run the 'unbinned' command.

        Obtains sequences from Unbinned.run() for the given genome files,
        scaffold file and minimum sequence length, then writes them to
        the output file as FASTA.

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            genome_nt_dir, genome_ext, scaffold_file, min_seq_len and
            output_file.
        """
        for message in ('',
                        '*******************************************************************************',
                        ' [RefineM - unbinned] Identify unbinned scaffolds.',
                        '*******************************************************************************'):
            self.logger.info(message)

        check_dir_exists(options.genome_nt_dir)

        binned_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(binned_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        leftover_seqs = Unbinned().run(binned_files,
                                       options.scaffold_file,
                                       options.min_seq_len)
        seq_io.write_fasta(leftover_seqs, options.output_file)

        self.logger.info('')
        self.logger.info('  Unbinned scaffolds written to: ' + options.output_file)

        self.time_keeper.print_time_stamp()
        
        
    def tetra_compare(self, options):
        """Run the 'tetra_compare' command.

        Splits the scaffold file into windows with WindowGen, rewrites
        several fields of `options` to point at the generated files, and
        then runs the scaffold_stats and outliers commands with the
        modified options. Note that `options` is mutated in place.

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            scaffold_file, genome_nt_dir, genome_ext, output_dir, cpus,
            window_size and gap_size; scaffold_file, genome_nt_dir,
            links_file and scaffold_stats_file are overwritten.
        """
        for message in ('',
                        '*******************************************************************************',
                        ' [RefineM - tetra_compare] compare tetranucleotide frequencies',
                        '*******************************************************************************'):
            self.logger.info(message)

        check_file_exists(options.scaffold_file)

        scaffolds_ok = self._check_nuclotide_seqs([options.scaffold_file])
        if not scaffolds_ok:
            self.logger.warning('[Warning] Scaffold file must contain nucleotide sequences.')
            sys.exit()

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        genomes_ok = self._check_nuclotide_seqs(genome_files)
        if not genomes_ok:
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        make_sure_path_exists(options.output_dir)

        window_gen = WindowGen(options.cpus)
        windows_file, links_file = window_gen.write_windows(options.scaffold_file,
                                                            options.output_dir,
                                                            options.window_size,
                                                            options.gap_size)

        # redirect the downstream commands to the generated window files
        options.scaffold_file = windows_file
        print(options.scaffold_file)
        # expects one genome - the scaffolds file
        options.genome_nt_dir = os.path.dirname(windows_file)
        print(options.genome_nt_dir)
        options.links_file = links_file
        print(options.links_file)

        self.scaffold_stats(options)

        options.scaffold_stats_file = os.path.join(options.output_dir, 'scaffold_stats.tsv')
        print(options.scaffold_stats_file)

        self.outliers(options)
        

    def parse_options(self, options):
        """Dispatch to the method implementing the requested command.

        Every recognised command name is also the name of the method
        that handles it. An unrecognised name is logged as an error and
        the program terminates via sys.exit().

        Parameters
        ----------
        options : object
            Parsed command-line options; `subparser_name` selects the
            command and the whole object is passed to its handler.

        Returns
        -------
        int
            Always 0 when a command was dispatched.
        """

        logging.basicConfig(format='', level=logging.INFO)

        check_dependencies(('diamond', 'ktImportText'))

        known_commands = ('scaffold_stats',
                          'genome_stats',
                          'gene_profile',
                          'outliers',
                          'cluster',
                          'reference',
                          'compatible',
                          'unique',
                          'bin_compare',
                          'modify',
                          'call_genes',
                          'unbinned',
                          'tetra_compare')

        command = options.subparser_name
        if command in known_commands:
            if command == 'scaffold_stats':
                print(options)
            # handler is looked up lazily so only the requested method
            # needs to exist
            getattr(self, command)(options)
        else:
            self.logger.error('  [Error] Unknown RefineM command: ' + options.subparser_name + '\n')
            sys.exit()

        return 0
    def run(self, genome_id_file,
                    marker_id_file,
                    model,
                    output_dir):
        """Infer a genome tree from concatenated marker gene alignments.

        Builds HMM model files for the marker genes, aligns the marker
        genes of the selected genomes, concatenates the alignments, runs
        FastTree on the result, and writes a short workflow report to
        'infer_workflow.log' in the output directory.

        Parameters
        ----------
        genome_id_file : str
            File specifying unique ids of genomes to include in tree.
        marker_id_file : str
            File specifying unique ids of marker genes to use for inference.
        model : str ['wag' or 'jtt']
            Model of evolution to use.
        output_dir : str
            Directory to store results.
        """

        time_keeper = TimeKeeper()

        output_alignment_dir = os.path.join(output_dir, 'alignments')
        make_sure_path_exists(output_alignment_dir)

        output_model_dir = os.path.join(output_dir, 'hmm_models')
        make_sure_path_exists(output_model_dir)

        # read directory for each genome
        genome_dirs = read_genome_dir_file(self.genome_dir_file)

        # read genomes within the ingroup
        ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
        genome_ids = ncbi_genome_ids.union(user_genome_ids)
        self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
        self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
        self.logger.info('User genomes: %d' % len(user_genome_ids))

        # get marker genes
        self.logger.info('Reading marker genes.')
        marker_genes = read_marker_id_file(marker_id_file)
        self.logger.info('Read %d marker genes.' % len(marker_genes))

        # gather all single-copy HMMs into a single model file
        hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
        hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
        self.logger.info('Generating marker gene HMM model files.')
        self._fetch_marker_models(marker_genes, hmm_model_out, hmm_info_out, output_model_dir)

        # align gene sequences
        align_markers = AlignMarkers(self.cpus)
        align_markers.run(genome_ids, genome_dirs, marker_genes, True, output_alignment_dir, output_model_dir)

        # create concatenated alignment file
        self.logger.info('Concatenating alignments.')
        concatenated_alignment_file = os.path.join(output_dir, 'concatenated_alignment.faa')
        marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
        create_concatenated_alignment(genome_ids, marker_genes, output_alignment_dir, concatenated_alignment_file, marker_file)

        # create concatenated genome tree
        self.logger.info('Inferring concatenated genome tree.')
        concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
        concatenated_tree_log = os.path.join(output_dir, 'concatenated.tree.log')
        log_file = os.path.join(output_dir, 'fasttree.log')
        fast_tree = FastTree(multithreaded=True)
        fast_tree.run(concatenated_alignment_file, 'prot', model, concatenated_tree, concatenated_tree_log, log_file)

        # generate summary report; the context manager guarantees the
        # file is closed even if one of the writes raises
        report_out = os.path.join(output_dir, 'infer_workflow.log')
        with open(report_out, 'w') as fout:
            fout.write('[infer]\n')
            fout.write('Genome Id file: %s\n' % genome_id_file)
            fout.write('Marker Id file: %s\n' % marker_id_file)
            fout.write('Model of evolution: %s\n' % model)
            fout.write(time_keeper.get_time_stamp())
Beispiel #10
0
class OptionsParser():
    def __init__(self):
        """Set up the root logger and a time keeper for the commands."""
        # both values are built left to right, so the logger is still
        # obtained before the time keeper is created
        self.logger, self.time_keeper = logging.getLogger(), TimeKeeper()

    def outliers(self, options):
        """Run the 'outliers' command.

        Verifies the input tree and any optional taxa files exist,
        creates the output directory when it is missing, and forwards
        the options to Outliers.run().
        """

        check_file_exists(options.input_tree)

        # both taxa files are optional
        if options.plot_taxa_file:
            check_file_exists(options.plot_taxa_file)
        if options.trusted_taxa_file:
            check_file_exists(options.trusted_taxa_file)

        output_dir_present = os.path.exists(options.output_dir)
        if not output_dir_present:
            os.makedirs(options.output_dir)

        Outliers(options.dpi).run(options.input_tree,
                                  options.taxonomy_file,
                                  options.output_dir,
                                  options.plot_taxa_file,
                                  options.plot_dist_taxa_only,
                                  options.plot_domain,
                                  options.trusted_taxa_file,
                                  options.fixed_root,
                                  options.min_children,
                                  options.min_support,
                                  options.verbose_table)

        self.logger.info('Done.')
        
    def tree_diff(self, options):
        """Run the 'tree_diff' command on two input trees.

        Verifies both trees exist, creates the output directory when it
        is missing, and forwards the options to TreeDiff.run().
        """

        for tree_file in (options.input_tree1, options.input_tree2):
            check_file_exists(tree_file)

        output_dir_present = os.path.exists(options.output_dir)
        if not output_dir_present:
            os.makedirs(options.output_dir)

        TreeDiff().run(options.input_tree1,
                       options.input_tree2,
                       options.output_dir,
                       options.min_support,
                       options.min_taxa,
                       options.named_only)

        self.logger.info('Done.')
        
    def tree_tax_diff(self, options):
        """Run the 'tree_tax_diff' command on two input trees.

        Verifies both trees exist, creates the output directory when it
        is missing, and forwards the options to TaxDiff.tree_tax_diff().
        """

        for tree_file in (options.input_tree1, options.input_tree2):
            check_file_exists(tree_file)

        output_dir_present = os.path.exists(options.output_dir)
        if not output_dir_present:
            os.makedirs(options.output_dir)

        TaxDiff().tree_tax_diff(options.input_tree1,
                                options.input_tree2,
                                options.output_dir)

        self.logger.info('Done.')
        
    def tax_diff(self, options):
        """Run the 'tax_diff' command on two taxonomy files.

        Verifies both taxonomy files exist, creates the output directory
        when it is missing, and forwards the options to
        TaxDiff.tax_diff().
        """

        for taxonomy_file in (options.tax1_file, options.tax2_file):
            check_file_exists(taxonomy_file)

        output_dir_present = os.path.exists(options.output_dir)
        if not output_dir_present:
            os.makedirs(options.output_dir)

        TaxDiff().tax_diff(options.tax1_file,
                           options.tax2_file,
                           options.include_user_taxa,
                           options.output_dir)

        self.logger.info('Done.')
        
    def dist_plot(self, options):
        """Run the 'dist_plot' command.

        Verifies the input tree and any optional taxa files exist, then
        forwards the options to DistributionPlot.run().
        """

        check_file_exists(options.input_tree)

        # both taxa files are optional
        if options.plot_taxa_file:
            check_file_exists(options.plot_taxa_file)
        if options.trusted_taxa_file:
            check_file_exists(options.trusted_taxa_file)

        DistributionPlot().run(options.input_tree,
                               options.output_prefix,
                               options.plot_taxa_file,
                               options.trusted_taxa_file,
                               options.min_children,
                               options.min_support)

        self.logger.info('Done.')

    def mark_tree(self, options):
        """Run the 'mark_tree' command.

        Verifies the input tree exists and forwards the options to
        MarkTree.run(). The three 'no_*' options are negated before
        being passed on.
        """

        check_file_exists(options.input_tree)

        marker = MarkTree()

        # command-line flags are opt-out; MarkTree.run() receives the
        # negated values
        percentile_flag = not options.no_percentile
        relative_divergence_flag = not options.no_relative_divergence
        prediction_flag = not options.no_prediction

        marker.run(options.input_tree,
                   options.output_tree,
                   options.min_support,
                   options.only_named_clades,
                   options.min_length,
                   percentile_flag,
                   relative_divergence_flag,
                   prediction_flag,
                   options.thresholds)

        self.logger.info('Marked tree written to: %s' % options.output_tree)
        
    def decorate(self, options):
        """Run the 'decorate' command.

        Verifies the input tree and taxonomy file exist, then forwards
        the options to Decorate.run().
        """

        for required_file in (options.input_tree, options.taxonomy_file):
            check_file_exists(required_file)

        Decorate().run(options.input_tree,
                       options.taxonomy_file,
                       options.trusted_taxa_file,
                       options.min_children,
                       options.min_support,
                       options.output_tree)

        self.logger.info('Finished decorating tree.')
   
    def pull(self, options):
        """Run the 'pull' command.

        Reads taxonomy from the input tree with
        Taxonomy.read_from_tree(), passes every entry through
        Taxonomy.fill_missing_ranks() unless `no_rank_fill` is set, and
        writes the result to the output file.
        """
        check_file_exists(options.input_tree)

        tree_taxonomy = Taxonomy().read_from_tree(options.input_tree)

        skip_rank_fill = options.no_rank_fill
        if not skip_rank_fill:
            # only values of existing keys are replaced, so the
            # mapping can be updated while it is iterated
            for taxon_id, ranks in tree_taxonomy.iteritems():
                tree_taxonomy[taxon_id] = Taxonomy().fill_missing_ranks(ranks)

        Taxonomy().write(tree_taxonomy, options.output_file)

        self.logger.info('Taxonomy strings written to: %s' % options.output_file)

    def validate(self, options):
        """Run the 'validate' command.

        Reads the taxonomy file, runs Taxonomy.validate() on it and logs
        either that no errors were found or the number of entries in
        each of the four returned error collections.
        """

        check_file_exists(options.taxonomy_file)

        validator = Taxonomy()
        taxa = validator.read(options.taxonomy_file)

        # command-line flags are opt-out; validate() receives the
        # negated values
        with_prefix = not options.no_prefix
        with_all_ranks = not options.no_all_ranks
        with_hierarchy = not options.no_hierarhcy
        with_species = not options.no_species
        errors = validator.validate(taxa,
                                    with_prefix,
                                    with_all_ranks,
                                    with_hierarchy,
                                    with_species,
                                    True)

        invalid_ranks, invalid_prefixes, invalid_species_name, invalid_hierarchies = errors

        total_errors = sum([len(e) for e in errors])
        if total_errors:
            summary = (('Identified %d incomplete taxonomy strings.', invalid_ranks),
                       ('Identified %d rank prefix errors.', invalid_prefixes),
                       ('Identified %d invalid species names.', invalid_species_name),
                       ('Identified %d taxa with multiple parents.', invalid_hierarchies))
            for template, offenders in summary:
                self.logger.info(template % len(offenders))
        else:
            self.logger.info('No errors identified in taxonomy file.')

    def append(self, options):
        """Run the 'append' command.

        Appends '|' followed by the ';'-joined taxonomy of each leaf to
        that leaf's label and writes the modified tree to the output
        file. Logs an error and terminates with exit status -1 if a leaf
        label has no entry in the taxonomy file.

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            input_tree, taxonomy_file and output_tree.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [PhyloRank - append] Appending taxonomy to extant tree labels.')
        self.logger.info('*******************************************************************************')

        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        taxonomy = Taxonomy().read(options.taxonomy_file)

        tree = dendropy.Tree.get_from_path(options.input_tree,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        for leaf in tree.leaf_node_iter():
            # single lookup; identity test is the correct check for None
            taxa = taxonomy.get(leaf.label)
            if taxa is None:
                self.logger.error('Taxonomy file does not contain an entry for %s.' % leaf.label)
                sys.exit(-1)
            leaf.label = leaf.label + '|' + ';'.join(taxa)

        tree.write_to_path(options.output_tree,
                            schema='newick',
                            suppress_rooting=True,
                            unquoted_underscores=True)

        self.logger.info('')
        self.logger.info('  Decorated tree written to: %s' % options.output_tree)

        self.time_keeper.print_time_stamp()

    def taxon_stats(self, options):
        """Run the 'taxon_stats' command.

        Writes a tab-separated table with one row per taxon. Columns for
        ranks above the taxon's own rank are filled with '-'; the
        remaining columns hold the number of distinct children found
        when descending one further level at a time through the mapping
        returned by Taxonomy.taxon_children().

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            taxonomy_file and output_file.
        """
        check_file_exists(options.taxonomy_file)

        taxonomy = Taxonomy().read(options.taxonomy_file)
        taxon_children = Taxonomy().taxon_children(taxonomy)

        species_index = Taxonomy.rank_index['s__']

        # context manager guarantees the table is flushed and closed even
        # if a lookup below raises
        with open(options.output_file, 'w') as fout:
            fout.write('Taxa')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t# named %s' % rank)
            fout.write('\t# extant taxon with complete taxonomy')
            fout.write('\n')

            for rank_prefix in Taxonomy.rank_prefixes:
                cur_rank_index = Taxonomy.rank_index[rank_prefix]

                # taxa at the current rank, in sorted order
                cur_taxa = sorted(taxon for taxon in taxon_children
                                  if taxon.startswith(rank_prefix))

                for taxon in cur_taxa:
                    fout.write(taxon)
                    fout.write('\t-' * cur_rank_index)

                    # descend one level per iteration, counting the
                    # distinct children reached at each level
                    next_taxa = [taxon]
                    for _ in range(cur_rank_index, species_index + 1):
                        children_taxa = set()
                        for t in next_taxa:
                            children_taxa.update(taxon_children[t])

                        fout.write('\t%d' % len(children_taxa))
                        next_taxa = children_taxa
                    fout.write('\n')

        self.logger.info('Summary statistics written to: %s' % options.output_file)

    def robustness_plot(self, options):
        """Run the 'robustness_plot' command.

        Forwards the options to RobustnessPlot.run() and prints a time
        stamp when it returns.
        """
        for message in ('',
                        '*******************************************************************************',
                        ' [PhyloRank - robustness_plot] Plotting distances across a set of tree.',
                        '*******************************************************************************'):
            self.logger.info(message)

        plotter = RobustnessPlot()
        plotter.run(options.rank,
                    options.input_tree_dir,
                    options.full_tree_file,
                    options.derep_tree_file,
                    options.taxonomy_file,
                    options.output_prefix,
                    options.min_children,
                    options.title)

        self.time_keeper.print_time_stamp()

    def rd_ranks(self, options):
        """Run the 'rd_ranks' command.

        Verifies the input tree exists, ensures the output directory
        exists, and forwards the tree, thresholds and output directory
        to RdRanks.run().
        """

        check_file_exists(options.input_tree)
        make_sure_path_exists(options.output_dir)

        RdRanks().run(options.input_tree,
                      options.thresholds,
                      options.output_dir)

        self.logger.info('Done.')
        
    def bl_dist(self, options):
        """Run the 'bl_dist' command.

        Verifies the input tree exists, ensures the output directory
        exists, and forwards the options to
        BranchLengthDistribution.run().
        """

        check_file_exists(options.input_tree)
        make_sure_path_exists(options.output_dir)

        BranchLengthDistribution().run(options.input_tree,
                                       options.trusted_taxa_file,
                                       options.min_children,
                                       options.taxonomy_file,
                                       options.output_dir)

        self.logger.info('Done.')
        
    def bl_optimal(self, options):
        """Run the 'bl_optimal' command.

        Obtains the optimal branch length and the number of correct and
        incorrect taxa from BranchLengthDistribution.optimal(), then logs
        them together with the precision correct / (correct + incorrect).

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            input_tree, rank, min_dist, max_dist, step_size and
            output_table.
        """

        # fail early on a missing tree, as the sibling bl_* commands do
        check_file_exists(options.input_tree)

        b = BranchLengthDistribution()
        optimal_bl, correct_taxa, incorrect_taxa = b.optimal(options.input_tree,
                                                                options.rank,
                                                                options.min_dist,
                                                                options.max_dist,
                                                                options.step_size,
                                                                options.output_table)

        # precision is undefined when no taxa were evaluated; report 0
        # instead of raising ZeroDivisionError
        total_taxa = correct_taxa + incorrect_taxa
        prec = float(correct_taxa) / total_taxa if total_taxa else 0.0

        self.logger.info('Optimal branch length is %f.' % optimal_bl)
        self.logger.info('This results in %d correct and %d incorrect taxa (precision = %.2f).' % (correct_taxa, incorrect_taxa, prec))
        
    def bl_decorate(self, options):
        """Run the 'bl_decorate' command.

        Verifies the input tree exists and forwards the options to
        BranchLengthDistribution.decorate().
        """

        check_file_exists(options.input_tree)

        BranchLengthDistribution().decorate(options.input_tree,
                                            options.taxonomy_file,
                                            options.threshold,
                                            options.rank,
                                            options.retain_named_lineages,
                                            options.keep_labels,
                                            options.prune,
                                            options.output_tree)

        self.logger.info('Done.')
        
    def bl_table(self, options):
        """Run the 'bl_table' command.

        Verifies the input tree and taxon category file exist, then
        forwards the options to BranchLengthDistribution.table().
        """

        for required_file in (options.input_tree, options.taxon_category):
            check_file_exists(required_file)

        BranchLengthDistribution().table(options.input_tree,
                                         options.taxon_category,
                                         options.step_size,
                                         options.output_table)

        self.logger.info('Done.')
        
    def rank_res(self, options):
        """Calculate taxonomic resolution at each rank.

        Counts, for every rank prefix found in a named internal node of
        the tree, the lowest rank given in that node's label. Genomes
        whose taxonomy holds only the bare rank prefix at some rank are
        additionally counted as species-level resolution for that rank.
        The counts are written as a table to the output file and, when
        `taxa_file` is set, the individual taxa are written to it.

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            input_tree, taxonomy_file, taxa_file and output_file.
        """

        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        taxa_out = None
        if options.taxa_file:
            taxa_out = open(options.taxa_file, 'w')

        # try/finally ensures the optional taxa file is closed even if
        # parsing the tree or taxonomy fails
        try:
            if taxa_out is not None:
                taxa_out.write('Rank\tLowest Rank\tTaxon\n')

            # determine taxonomic resolution of named groups
            tree = dendropy.Tree.get_from_path(options.input_tree,
                                                schema='newick',
                                                rooting='force-rooted',
                                                preserve_underscores=True)

            rank_res = defaultdict(lambda: defaultdict(int))
            for node in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                if not node.label or node.is_leaf():
                    continue

                _support, taxon_name, _auxiliary_info = parse_label(node.label)

                if taxon_name:
                    lowest_rank = [x.strip() for x in taxon_name.split(';')][-1][0:3]
                    for rank_prefix in Taxonomy.rank_prefixes:
                        if rank_prefix in taxon_name:
                            rank_res[rank_prefix][lowest_rank] += 1
                            if taxa_out is not None:
                                rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                                lowest_rank_name = Taxonomy.rank_labels[Taxonomy.rank_index[lowest_rank]]
                                taxa_out.write('%s\t%s\t%s\n' % (rank_prefix_name, lowest_rank_name, taxon_name))

            # identify any singleton taxa which are treated as having species level resolution
            with open(options.taxonomy_file) as taxonomy_in:
                for line in taxonomy_in:
                    if not line.strip():
                        # tolerate blank lines rather than failing on
                        # the missing taxonomy column
                        continue

                    line_split = line.split('\t')
                    genome_id = line_split[0]
                    # strip each rank so the trailing newline does not
                    # prevent the last rank from matching its prefix
                    taxonomy = [x.strip() for x in line_split[1].split(';')]

                    for i, rank_prefix in enumerate(Taxonomy.rank_prefixes):
                        if taxonomy[i] == rank_prefix:
                            # this taxa is undefined at the specified rank so
                            # must be the sole representative; e.g., a p__
                            # indicates a taxon that represents a novel phyla
                            rank_res[rank_prefix]['s__'] += 1
                            if taxa_out is not None:
                                rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                                taxa_out.write('%s\t%s\t%s (%s)\n' % (rank_prefix_name, 'species', taxonomy[i], genome_id))
        finally:
            if taxa_out is not None:
                taxa_out.close()

        # write out results
        with open(options.output_file, 'w') as fout:
            fout.write('Category')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t' + rank)
            fout.write('\n')

            for i, rank_prefix in enumerate(Taxonomy.rank_prefixes[1:]):
                fout.write(Taxonomy.rank_labels[i+1])

                for j, r in enumerate(Taxonomy.rank_prefixes[1:]):
                    if i >= j:
                        fout.write('\t' + str(rank_res[r].get(rank_prefix, 0)))
                    else:
                        fout.write('\t-')
                fout.write('\n')

        self.logger.info('Done.')

    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s)"""

        logging.basicConfig(format='', level=logging.INFO)

        # check_dependencies(('diamond', 'ktImportText'))

        if(options.subparser_name == 'outliers'):
            self.outliers(options)
        elif(options.subparser_name == 'mark_tree'):
            self.mark_tree(options)
        elif(options.subparser_name == 'tree_diff'):
            self.tree_diff(options)
        elif(options.subparser_name == 'tree_tax_diff'):
            self.tree_tax_diff(options)
        elif(options.subparser_name == 'tax_diff'):
            self.tax_diff(options)
        elif(options.subparser_name == 'decorate'):
            self.decorate(options)
        elif(options.subparser_name == 'pull'):
            self.pull(options)
        elif(options.subparser_name == 'validate'):
            self.validate(options)
        elif(options.subparser_name == 'append'):
            self.append(options)
        elif(options.subparser_name == 'taxon_stats'):
            self.taxon_stats(options)
        elif(options.subparser_name == 'robustness_plot'):
            self.robustness_plot(options)
        elif(options.subparser_name == 'dist_plot'):
            self.dist_plot(options)
        elif(options.subparser_name == 'rd_ranks'):
            self.rd_ranks(options)
        elif(options.subparser_name == 'bl_dist'):
            self.bl_dist(options)
        elif(options.subparser_name == 'bl_optimal'):
            self.bl_optimal(options)
        elif(options.subparser_name == 'bl_decorate'):
            self.bl_decorate(options)
        elif(options.subparser_name == 'bl_table'):
            self.bl_table(options)    
        elif(options.subparser_name == 'rank_res'):
            self.rank_res(options)
        else:
            self.logger.error('  [Error] Unknown PhyloRank command: ' + options.subparser_name + '\n')
            sys.exit()

        return 0
Beispiel #11
0
class MakeDatabase(object):
    """Make a dereplicated database of genes.

    Dereplication is done between genes within a named taxonomic
    group (e.g., genomes in the same genus) and is based on the
    average amino acid identity (AAI) between genes. Groups with large
    numbers of taxa can take an excessive amount of time to
    process so are subsampled to a specific number of taxa.
    Subsampling is done in a manner which aims to retain
    phylogenetic diversity and thus helps ensure a good
    distribution of genes within the group. Care is taken
    to ensure type strains are retained during dereplication.

    Note: this script is tailored to IMG in that it assumes
    a certain directory structure and file extensions. It also
    corrects a number of common issues with IMG genomes:
      - non-ascii characters in fasta header lines
      - hyphens at the start of some protein sequences
    """

    def __init__(self):
        """Check external programs and set up rank tables and timer."""

        # external programs this class shells out to
        required_programs = ['comparem', 'diamond', 'makeblastdb']
        check_dependencies(required_programs)

        # label under which genomes lacking a name at the target rank are pooled
        self.underclassified = 'underclassified'

        # instance-level aliases for the rank tables exposed by Taxonomy
        (self.rank_prefixes,
         self.rank_index,
         self.rank_labels) = (Taxonomy.rank_prefixes,
                              Taxonomy.rank_index,
                              Taxonomy.rank_labels)

        self.time_keeper = TimeKeeper()

    def read_taxonomy(self, input_taxonomy):
        """Read taxonomy file.

        Taxonomy file should have the following format:
            <genome_id><TAB><taxonomy_str>

            where taxonomy_str is in GreenGenes format:
                d__Bacteria;p__Firmicutes;...

        Blank lines are ignored. Whitespace around each taxon in the
        taxonomy string is removed.

        Parameters
        ----------
        input_taxonomy : str
            Taxonomy file.

        Returns
        -------
        dict
            Taxonomy for each genome id, as a list of taxa ordered as
            they appear in the taxonomy string.

        Raises
        ------
        IndexError
            If a non-blank line does not contain a tab-separated
            taxonomy string.
        """

        taxonomy = {}
        # context manager guarantees the handle is released even on error
        with open(input_taxonomy) as fin:
            for line in fin:
                if not line.strip():
                    continue  # tolerate blank (e.g. trailing) lines

                line_split = line.split('\t')
                taxonomy[line_split[0]] = [x.strip() for x in line_split[1].split(';')]

        return taxonomy

    def read_type_strain(self, type_strain_file):
        """Read type strain file.

        The type strain file should have the following format:
            <genome_id><TAB><genome_name>

        Only the first column is used, so lines consisting of a bare
        genome id are also accepted. Blank lines are ignored.

        Parameters
        ----------
        type_strain_file : str
            File specifying type strains.

        Returns
        -------
        set
            Set of all genome ids specified as type strains.
        """

        type_strains = set()
        # context manager guarantees the handle is released even on error
        with open(type_strain_file) as fin:
            for line in fin:
                # drop the line terminator first so an id on a line without
                # a tab does not end up with a trailing newline
                genome_id = line.rstrip('\r\n').split('\t')[0]
                if genome_id:
                    type_strains.add(genome_id)

        return type_strains

    def select_taxa(self, genome_list, taxonomy, type_strains, max_taxa):
        """Select subset of genomes with a good distribution across named groups.

        Groups genomes into named groups and subsamples evenly across
        these groups. Ideally, genomes would be grouped into species, but
        some genomes may not have a species identifier. Such genomes are
        assigned to the most specific named group possible. Any genome
        marked as a type strain will be retained.

        Parameters
        ----------
        genome_list : iterable of genome ids
            Genomes to subsample.
        taxonomy : d[genome_id] -> [domain, ..., species]
            Taxonomy of each genome.
        type_strains : iterable
            Genome identifiers of type strains.
        max_taxa : int
            Number of genomes to retain.

        Returns
        -------
        iterable
            Subsampled list of genomes.
        """

        if len(genome_list) <= max_taxa:
            return genome_list

        reduced_genome_list = []

        # group genomes into the most specific named groups possible
        groups = defaultdict(set)
        for genome_id in genome_list:
            # add in type strains regardless of taxonomy
            if genome_id in type_strains:
                reduced_genome_list.append(genome_id)
                continue

            # get first classified rank
            for rank_index in xrange(self.rank_index['s__'], -1, -1):
                taxa = taxonomy[genome_id][rank_index]
                if taxa != self.rank_prefixes[rank_index]:
                    break

            groups[taxa].add(genome_id)

        # sample genomes from each named group
        while len(reduced_genome_list) < max_taxa:
            genomes_to_select = max_taxa - len(reduced_genome_list)
            genomes_per_group = max(genomes_to_select / len(groups), 1)
            for taxa, genome_ids in groups.iteritems():
                selected_genomes = random.sample(genome_ids, min(len(genome_ids), genomes_per_group))
                groups[taxa] = genome_ids.difference(selected_genomes)

                reduced_genome_list.extend(selected_genomes)

                if len(reduced_genome_list) == max_taxa:
                    break  # special case where we are adding single genomes from each group

        return reduced_genome_list

    def write_gene_file(self, gene_out, gene_dir, genome_list, taxonomy, genes_to_ignore):
        """Write genes to output stream.

        Genomes whose gene file is missing or empty are skipped with a
        warning printed to stdout.

        Parameters
        ----------
        gene_out : stream
            Output stream.
        gene_dir : str
            Directory containing called genes in amino acid space, one
            <genome_id>.faa file per genome.
        genome_list : iterable
            Genomes to process.
        taxonomy : d[genome_id] -> [domain, ..., species]
            Taxonomy of each genome. Currently unused; kept so existing
            callers continue to work.
        genes_to_ignore : set
            Genes which should not be written to file.

        Returns
        -------
        int
            Number of genes written to the output stream.
        """

        # set gives O(1) membership tests inside the per-character filter
        printable_chars = set(string.printable)

        genes_kept = 0
        for genome_id in genome_list:
            genome_gene_file = os.path.join(gene_dir, genome_id + '.faa')
            if not os.path.exists(genome_gene_file):
                print('[WARNING] Missing gene file for genome %s.' % genome_gene_file)
                continue

            if os.stat(genome_gene_file).st_size == 0:
                print('[WARNING] Gene file is empty for genome %s.' % genome_gene_file)
                continue

            for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
                if gene_id in genes_to_ignore:
                    continue

                # IMG headers sometimes contain non-ascii characters which cause
                # problems with BLAST and DIAMOND so these are explicitly filtered
                # out; join() yields a string on both Python 2 and Python 3
                annotation = ''.join(ch for ch in annotation if ch in printable_chars)

                # a few IMG genomes contain protein sequences which start with a
                # hyphen; startswith() is also safe for an empty sequence
                if seq.startswith('-'):
                    seq = seq[1:]

                gene_out.write('>' + gene_id + ' ' + annotation + '\n')
                gene_out.write(seq + '\n')
                genes_kept += 1

        return genes_kept

    def img_gene_id_to_scaffold_id(self, genome_dir, genome_id, output_dir):
        """Modify IMG gene ids to format which explicitly gives scaffold names.

        For downstream processing it is often necessary to know which scaffold
        a gene is contained on. IMG uses unique identifiers for genes. As such,
        these are changed to the following format:

        <scaffold_id>_<gene #> <annotation> [IMG gene id]

        Reads <genome_id>.gff and <genome_id>.genes.faa from genome_dir and
        writes <genome_id>.faa to output_dir.

        Parameters
        ----------
        genome_dir : str
            Directory with files for genome.
        genome_id : str
            Unique identifier of genome.
        output_dir : str
            Directory to contain modified fasta files.

        Raises
        ------
        KeyError
            If a gene in the fasta file has no entry in the GFF file.
        """

        # determine source scaffold for each gene
        gene_id_to_scaffold_id = {}
        gene_number = defaultdict(int)
        with open(os.path.join(genome_dir, genome_id + '.gff')) as gff:
            for line in gff:
                if line[0] == '#' or not line.strip():
                    continue  # comment or blank line

                # strip the line terminator so the final (attribute) column is
                # truly empty when it has no content and so that a gene id which
                # is the only attribute does not carry a trailing newline
                line_split = line.rstrip('\r\n').split('\t')
                scaffold_id = line_split[0]
                info = line_split[8]
                if info != '':  # this will be empty for non-protein coding genes
                    gene_id = info.split(';')[0].replace('ID=', '')

                    gene_number[scaffold_id] += 1
                    gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])

        # write out gene file with modified identifiers
        genome_gene_file = os.path.abspath(os.path.join(genome_dir, genome_id + '.genes.faa'))

        with open(os.path.join(output_dir, genome_id + '.faa'), 'w') as fout:
            for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
                annotation = annotation[annotation.find(' ') + 1:]  # remove additional gene id from annotation
                annotation += ' [IMG Gene ID: ' + gene_id + ']'  # append IMG gene id for future reference

                fout.write('>' + gene_id_to_scaffold_id[gene_id] + ' ' + annotation + '\n')
                fout.write(seq + '\n')

    def amend_gene_identifies(self, gene_dir, output_dir):
        """Modify gene ids to include source genome id.

        The following format is used:
          <gene_id>~<genome_id>

        Every file in gene_dir is processed and written to output_dir as
        <genome_id>.faa. A trailing '*' (stop codon) is removed from each
        sequence.

        Parameters
        ----------
        gene_dir : str
            Directory with fasta files containing protein sequences.
        output_dir : str
            Directory to contain modified fasta files. Created if it
            does not already exist.
        """

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for f in os.listdir(gene_dir):
            gf = os.path.join(gene_dir, f)
            # NOTE(review): presumably remove_extension() returns the bare file
            # name without directory or extension - verify against its definition
            genome_id = remove_extension(gf)

            aa_file = os.path.join(output_dir, genome_id + '.faa')
            # context manager closes the output even if reading the input fails
            with open(aa_file, 'w') as fout:
                for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                    fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
                    # endswith() is safe for an empty sequence, unlike seq[-1]
                    if seq.endswith('*'):
                        seq = seq[:-1]
                    fout.write(seq + '\n')

    def filter_aai(self, tmp_dir, gene_dir, ammended_gene_dir, per_identity, per_aln_len, cpus):
        """Filter genes with similar amino acid identity.

        Runs 'comparem rblast' followed by 'comparem aai', reads the resulting
        shared gene tables, and greedily picks one representative per set of
        homologous genes. Genes with the most homologs are considered first.

        Parameters
        ----------
        tmp_dir : str
            Temporary directory for storing results.
        gene_dir : str
            Directory with fasta files containing protein sequences.
        ammended_gene_dir : str
            Path at which a symbolic link to the protein sequences with
            amended gene ids is created.
        per_identity : float
            Percent identity for subsampling similar genes.
        per_aln_len : float
            Percent alignment length for subsampling similar genes.
        cpus : int
            Number of cpus to use.

        Returns
        -------
        genes_to_remove : set
            Unique identifiers of genes to filter.

        Raises
        ------
        subprocess.CalledProcessError
            If either CompareM command exits with a non-zero status.
        """

        # local import: the module-level imports are outside this method
        import subprocess

        # Commands are passed as argument lists so that paths containing spaces
        # or shell metacharacters are handled correctly. A failing command stops
        # processing here rather than surfacing later as missing output files.
        rblast_dir = os.path.join(tmp_dir, 'rblast')
        subprocess.check_call(['comparem', 'rblast',
                               '-e', '1e-10',
                               '-p', '%d' % per_identity,
                               '-c', '%d' % cpus,
                               gene_dir, rblast_dir])
        aai_dir = os.path.join(tmp_dir, 'aai')
        subprocess.check_call(['comparem', 'aai',
                               '-p', '%d' % per_identity,
                               '-a', '%d' % per_aln_len,
                               '-c', '%d' % cpus,
                               rblast_dir, aai_dir])

        # identify homologs to be filtered
        print('')
        print('  Identifying homologs to be filtered.')
        shared_genes_dir = os.path.join(aai_dir, 'shared_genes')

        homologs = defaultdict(set)
        for f in os.listdir(shared_genes_dir):
            with open(os.path.join(shared_genes_dir, f)) as fin:
                fin.readline()  # skip header line

                for line in fin:
                    line_split = line.rstrip('\r\n').split('\t')
                    if len(line_split) < 2:
                        continue  # blank or truncated line

                    gene_idA = line_split[0]
                    gene_idB = line_split[1]

                    homologs[gene_idA].add(gene_idB)
                    homologs[gene_idB].add(gene_idA)

        # Greedy selection: visit genes from most to fewest homologs. A gene is
        # kept only if none of its homologs has been kept already; all of its
        # homologs which are not kept are marked for removal.
        genes_to_remove = set()
        genes_to_keep = set()
        sorted_keys = sorted(homologs, key=lambda k: len(homologs[k]), reverse=True)
        for gene_id in sorted_keys:
            gene_set = homologs[gene_id]

            if gene_set & genes_to_keep:
                genes_to_remove.add(gene_id)
            else:
                genes_to_keep.add(gene_id)

            genes_to_remove.update(gene_set - genes_to_keep)

        # The CompareM call to rblast creates fasta files where gene ids are modified to
        # also contain genome ids. This is just a hack to point to the directory with
        # these amended fasta files.
        os.symlink(os.path.join(rblast_dir, 'genes'), ammended_gene_dir)

        return genes_to_remove

    def run(self,
                taxonomy_file, type_strains_file,
                genome_dir, max_taxa, rank,
                per_identity, per_aln_len,
                genomes_to_process, keep_all_genes,
                create_diamond_db, create_blast_db,
                cpus, output_dir):
        """Create dereplicated set of genes.

        Taxonomy file should have the following format:
            <genome_id><TAB><taxonomy_str>

            where taxonomy_str is in GreenGenes format:
                d__Bacteria;p__Proteobacteria;...;s__Escherichia coli

        Type strain file should have the following format:
            <genome_id><TAB><genome name>

        The following files are written to the output directory:
            genomes_without_called_genes.tsv
            genome_db.<date>.genes.faa
            taxonomy.<date>.tsv

        Parameters
        ----------
        taxonomy_file : str
            File indicating taxonomy string for all genomes of interest
        type_strains_file : str
            File indicating type strains.
        genome_dir : str
            Directory with genomes in individual directories.
        max_taxa : int
            Maximum taxa to retain in a named group.
        rank : int
            Taxonomic rank to perform dereplication (0 = domain, ..., 6 = species).
        per_identity : float
            Percent identity for subsampling similar genes.
        per_aln_len : float
            Percent alignment length for subsampling similar genes.
        genomes_to_process : str
            File with list of genomes to retain instead of performing taxon subsampling.
        keep_all_genes : boolean
            Flag indicating that no gene subsampling should be performed.
        create_diamond_db : boolean
            Flag indicating if DIAMOND database should be created.
        create_blast_db : boolean
            Flag indicating if BLAST database should be created.
        cpus : int
            Number of cpus to use.
        output_dir : str
            Desired output directory for storing results.
        """

        # local import: the module-level imports are outside this method
        import subprocess

        make_sure_path_exists(output_dir)

        print('Dereplicating at the rank of %s.' % self.rank_labels[rank])

        print('')
        print('Reading taxonomy file.')
        taxonomy = self.read_taxonomy(taxonomy_file)
        print('  There are %d genomes with taxonomy strings.' % len(taxonomy))

        print('')
        print('Reading type strain file.')
        type_strains = self.read_type_strain(type_strains_file)
        print('  There are %d type strains.' % len(type_strains))

        # get specific list of genomes to process
        genomes_to_retain = set()
        if genomes_to_process:
            print('')
            print('Reading genomes to retain.')
            with open(genomes_to_process) as fin:
                for line in fin:
                    line_split = line.split()
                    if line_split:  # ignore blank lines
                        genomes_to_retain.add(line_split[0])
            print('  Retaining %d genomes.' % len(genomes_to_retain))

        # identify unique genes in each named group
        rank_genomes = defaultdict(list)
        genomes_with_missing_data = set()
        underclassified_genomes = 0
        with open(os.path.join(output_dir, 'genomes_without_called_genes.tsv'), 'w') as fout:
            for genome_id, t in taxonomy.items():
                if genomes_to_process and genome_id not in genomes_to_retain:
                    continue

                genome_file = os.path.join(genome_dir, genome_id, genome_id + '.genes.faa')
                if not os.path.exists(genome_file):
                    genomes_with_missing_data.add(genome_id)
                    fout.write(genome_id + '\t' + ';'.join(t) + '\n')
                    continue

                taxa = t[rank]
                if taxa[3:] == '':
                    # only the rank prefix is present: no name at this rank
                    underclassified_genomes += 1
                    rank_genomes[self.underclassified].append(genome_id)
                else:
                    rank_genomes[taxa].append(genome_id)

        total_genomes_to_process = sum(len(genome_list) for genome_list in rank_genomes.values())

        print('')
        print('Under-classified genomes automatically placed into the database: %d' % underclassified_genomes)
        print('Genomes with missing sequence data: %d' % len(genomes_with_missing_data))
        print('')
        print('Total named groups: %d' % len(rank_genomes))
        print('Total genomes to process: %d' % total_genomes_to_process)

        # process each named group
        print('')
        # the date is taken once so both output files always carry the same stamp
        date_stamp = str(datetime.date.today())
        gene_file = os.path.join(output_dir, 'genome_db.%s.genes.faa' % date_stamp)
        taxonomy_out_file = os.path.join(output_dir, 'taxonomy.%s.tsv' % date_stamp)

        total_genes_removed = 0
        total_genes_kept = 0
        total_genomes_kept = 0
        processed_genomes = 0

        tmp_dir = tempfile.mkdtemp()
        try:
            with open(gene_file, 'w') as gene_out, open(taxonomy_out_file, 'w') as taxonomy_out:
                for taxa, genome_list in rank_genomes.items():
                    processed_genomes += len(genome_list)

                    print('')
                    print('-------------------------------------------------------------------------------')
                    print(' Processing %s | Finished %d of %d (%.2f%%) genomes.' % (taxa, processed_genomes, total_genomes_to_process, processed_genomes * 100.0 / total_genomes_to_process))
                    print(self.time_keeper.get_time_stamp())
                    print('-------------------------------------------------------------------------------')

                    # create directory with selected genomes
                    taxon_dir = os.path.join(tmp_dir, 'taxon')
                    os.mkdir(taxon_dir)

                    reduced_genome_list = genome_list
                    if not genomes_to_process and taxa != self.underclassified:  # perform taxon subsampling
                        reduced_genome_list = self.select_taxa(genome_list, taxonomy, type_strains, max_taxa)
                    total_genomes_kept += len(reduced_genome_list)

                    gene_dir = os.path.join(taxon_dir, 'genes')
                    os.mkdir(gene_dir)
                    for genome_id in reduced_genome_list:
                        taxonomy_out.write(genome_id + '\t' + ';'.join(taxonomy[genome_id]) + '\n')
                        cur_genome_dir = os.path.join(genome_dir, genome_id)
                        self.img_gene_id_to_scaffold_id(cur_genome_dir, genome_id, gene_dir)

                    # filter genes based on amino acid identity
                    genes_to_remove = set()
                    amended_gene_dir = os.path.join(taxon_dir, 'ammended_genes')
                    if keep_all_genes or taxa == self.underclassified:
                        # modify gene identifiers to include genome ids
                        self.amend_gene_identifies(gene_dir, amended_gene_dir)
                    else:
                        # filter genes on AAI
                        genes_to_remove = self.filter_aai(taxon_dir, gene_dir, amended_gene_dir, per_identity, per_aln_len, cpus)

                    print('')
                    print('  Writing unique genes from genomes in %s.' % taxa)
                    genes_kept = self.write_gene_file(gene_out, amended_gene_dir, reduced_genome_list, taxonomy, genes_to_remove)

                    print('    Retain %d of %d taxa.' % (len(reduced_genome_list), len(genome_list)))
                    print('    Genes to keep: %d' % genes_kept)
                    print('    Genes removed: %d' % len(genes_to_remove))

                    total_genes_kept += genes_kept
                    total_genes_removed += len(genes_to_remove)

                    # the same directory name is reused for the next named group
                    shutil.rmtree(taxon_dir)
        finally:
            # remove scratch space even if processing of a group fails
            shutil.rmtree(tmp_dir)

        # guard the percentages against empty input (no genomes or no genes)
        total_genes = total_genes_kept + total_genes_removed
        genomes_kept_perc = total_genomes_kept * 100.0 / total_genomes_to_process if total_genomes_to_process else 0.0
        genes_removed_perc = total_genes_removed * 100.0 / total_genes if total_genes else 0.0

        print('')
        print('Retain %d of %d (%.1f%%) genomes' % (total_genomes_kept, total_genomes_to_process, genomes_kept_perc))
        print('  Total genes kept: %d' % total_genes_kept)
        print('  Total genes removed: %d (%.1f%%)' % (total_genes_removed, genes_removed_perc))

        # Database commands are passed as argument lists so that output paths
        # containing spaces or shell metacharacters are handled correctly.
        if create_diamond_db:
            print('')
            print('Creating DIAMOND database.')
            # honour the requested number of cpus rather than a fixed thread count
            diamond_cmd = ['diamond', 'makedb',
                           '-b', '10',
                           '-p', '%d' % cpus,
                           '-d', gene_file,
                           '--in', gene_file]
            if subprocess.call(diamond_cmd) != 0:
                print('[WARNING] Creation of DIAMOND database failed.')
            print('')

        if create_blast_db:
            print('')
            print('Creating BLAST database.')
            blast_cmd = ['makeblastdb', '-dbtype', 'prot', '-in', gene_file]
            if subprocess.call(blast_cmd) != 0:
                print('[WARNING] Creation of BLAST database failed.')
            print('')