def run(self, taxonomy_file, genome_list_file):
    """Add GTDB taxonomy to database.

    Imports the full taxonomy string of each genome into the
    'gtdb_taxonomy' field and then each individual rank into its own
    'gtdb_<rank>' field of the 'metadata_taxonomy' table. Each import is
    performed by writing a temporary tab-separated file and invoking the
    'gtdb' command line interface on it.

    Parameters
    ----------
    taxonomy_file : str
        File passed to Taxonomy().read() to obtain the taxa of each genome.
    genome_list_file : str
        Optional file restricting the genomes to import. The genome ID is
        taken from the first column of each line; lines are split on tabs
        if they contain one and on commas otherwise. All genomes in the
        taxonomy file are imported if no file is given.
    """

    genome_list = set()
    if genome_list_file:
        # context manager ensures the file handle is released
        with open(genome_list_file) as f:
            for line in f:
                sep = '\t' if '\t' in line else ','
                genome_list.add(line.rstrip().split(sep)[0])

    # read taxonomy file
    taxonomy = Taxonomy().read(taxonomy_file)

    def selected_genomes():
        """Yield (genome ID, taxa) for each genome that should be imported."""
        for genome_id, taxa in taxonomy.iteritems():
            if genome_list_file and genome_id not in genome_list:
                continue
            yield genome_id, taxa

    def rank_rows(i, rank):
        """Yield (genome ID, taxon) for the rank with index i."""
        for genome_id, taxa in selected_genomes():
            rank_str = taxa[i]
            if rank == 'species':
                # ensure species name includes genus
                genus = taxa[i - 1][3:]
                if genus not in taxa[i]:
                    rank_str = 's__' + genus + ' ' + taxa[i][3:]
            yield genome_id, rank_str

    def import_field(field, rows):
        """Write rows to a temporary file and import it as a TEXT field."""
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            try:
                for genome_id, value in rows:
                    temp_file.write('%s\t%s\n' % (genome_id, value))
            finally:
                temp_file.close()

            cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % (
                'metadata_taxonomy', field, 'TEXT', temp_file.name)
            print(cmd)
            os.system(cmd)
        finally:
            # remove the temporary file even if writing it raised
            os.remove(temp_file.name)

    # add full taxonomy string to database
    import_field('gtdb_taxonomy',
                 ((genome_id, ';'.join(taxa))
                  for genome_id, taxa in selected_genomes()))

    # add each taxonomic rank to database
    for i, rank in enumerate(Taxonomy.rank_labels):
        import_field('gtdb_' + rank, rank_rows(i, rank))
Example #2
0
    def run(self, taxonomy_file, genome_list):
        """Add unfiltered NCBI taxonomy to database.

        Writes the full taxonomy string of each genome to a temporary
        tab-separated file and imports it into the
        'ncbi_taxonomy_unfiltered' field of the 'metadata_taxonomy' table
        using the 'gtdb' command line interface.

        Parameters
        ----------
        taxonomy_file : str
            File passed to Taxonomy().read() to obtain the taxa of each
            genome.
        genome_list : str
            Optional file restricting the genomes to import. The genome ID
            is taken from the first tab-separated column of each line and
            lines starting with '#' are skipped. All genomes in the
            taxonomy file are imported if no file is given.
        """

        genomes_to_process = set()
        if genome_list:
            # context manager ensures the file handle is released
            with open(genome_list) as f:
                for line in f:
                    if line.startswith('#'):
                        continue

                    genomes_to_process.add(line.rstrip().split('\t')[0])

        # read taxonomy file
        taxonomy = Taxonomy().read(taxonomy_file)

        # add full taxonomy string to database
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            try:
                for genome_id, taxa in taxonomy.iteritems():
                    # prefix the accession with the database identifier
                    # before comparing against the genome list
                    if genome_id.startswith('GCA_'):
                        genome_id = 'GB_' + genome_id
                    elif genome_id.startswith('GCF_'):
                        genome_id = 'RS_' + genome_id

                    if not genome_list or genome_id in genomes_to_process:
                        taxa_str = ';'.join(taxa)
                        temp_file.write('%s\t%s\n' % (genome_id, taxa_str))
            finally:
                temp_file.close()

            cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % (
                'metadata_taxonomy', 'ncbi_taxonomy_unfiltered', 'TEXT',
                temp_file.name)
            print(cmd)
            os.system(cmd)
        finally:
            # remove the temporary file even if writing it raised
            os.remove(temp_file.name)
Example #3
0
    def run(self, taxonomy_file, genome_list_file):
        """Add each rank of the GTDB taxonomy to database.

        For every rank in Taxonomy.rank_labels, writes the taxon of each
        genome to a temporary tab-separated file and imports it into the
        'gtdb_<rank>' field of the 'metadata_taxonomy' table using the
        'gtdb' command line interface.

        Parameters
        ----------
        taxonomy_file : str
            File passed to Taxonomy().read() to obtain the taxa of each
            genome.
        genome_list_file : str
            Optional file restricting the genomes to import. The genome ID
            is taken from the first column of each line; lines are split on
            tabs if they contain one and on commas otherwise. All genomes
            in the taxonomy file are imported if no file is given.
        """

        genome_list = set()
        if genome_list_file:
            # context manager ensures the file handle is released
            with open(genome_list_file) as f:
                for line in f:
                    sep = '\t' if '\t' in line else ','
                    genome_list.add(line.rstrip().split(sep)[0])

        # read taxonomy file
        taxonomy = Taxonomy().read(taxonomy_file)

        # add each taxonomic rank to database
        for i, rank in enumerate(Taxonomy.rank_labels):
            temp_file = tempfile.NamedTemporaryFile(delete=False)
            try:
                try:
                    for genome_id, taxa in taxonomy.iteritems():
                        if genome_list_file and genome_id not in genome_list:
                            continue

                        temp_file.write('%s\t%s\n' % (genome_id, taxa[i]))
                finally:
                    temp_file.close()

                cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % (
                    'metadata_taxonomy', 'gtdb_' + rank, 'TEXT', temp_file.name)
                print(cmd)
                os.system(cmd)
            finally:
                # remove the temporary file even if writing it raised
                os.remove(temp_file.name)
Example #4
0
    def pull(self, options):
        """Write the taxonomy strings found in a tree to file.

        Reads the taxonomy from options.input_tree, fills in missing ranks
        unless options.no_rank_fill is set, and writes the result to
        options.output_file.
        """
        tree_file = options.input_tree
        check_file_exists(tree_file)

        taxonomy = Taxonomy().read_from_tree(tree_file)

        fill_ranks = not options.no_rank_fill
        if fill_ranks:
            # only values of existing keys are replaced, so the dictionary
            # can safely be updated while iterating over it
            for taxon_id, lineage in taxonomy.iteritems():
                taxonomy[taxon_id] = Taxonomy().fill_missing_ranks(lineage)

        Taxonomy().write(taxonomy, options.output_file)

        msg = 'Taxonomy strings written to: %s' % options.output_file
        self.logger.info(msg)
Example #5
0
    def root(self, options):
        """Reroot a tree on the genomes assigned to the outgroup taxon.

        The outgroup consists of every genome in the taxonomy read from
        Config.TAXONOMY_FILE whose taxa contain options.outgroup_taxon. The
        tree in options.input_tree is rerooted on these genomes and written
        to options.output_tree.
        """
        self.logger.warning("Tree rooting is still under development!")

        check_file_exists(options.input_tree)

        gtdb_taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        self.logger.info('Identifying genomes from the specified outgroup.')
        outgroup = set(gid
                       for gid, lineage in gtdb_taxonomy.iteritems()
                       if options.outgroup_taxon in lineage)

        RerootTree().root_with_outgroup(options.input_tree,
                                        options.output_tree,
                                        outgroup)

        self.logger.info('Done.')
Example #6
0
    def run(self, gene_dirs, min_per_gene, min_per_bps, tree_program,
            prot_model, split_chars, output_dir):
        """Infer concatenated gene tree.

        Concatenates the trimmed alignments of all genes, filters taxa
        that have multi-copy genes, too few genes, or too few aligned
        positions, infers a tree from the concatenated alignment, roots
        it at the midpoint, decorates it with tax2tree, and writes an ARB
        metadata file.

        Parameters
        ----------
        gene_dirs : list
            GeneTreeTk output directories with information for individual genes.
        min_per_gene : float
            Minimum percentage of genes required to retain taxa.
        min_per_bps : float
            Minimum percentage of base pairs required to retain taxa.
        tree_program : str
            Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
            Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        split_chars
            Passed to self._split_ids() to split sequence identifiers into
            a taxon identifier and a gene identifier.
        output_dir : str
            Directory to store results.
        """

        # read MSA files
        concat = defaultdict(lambda: defaultdict(list))
        msa_length = 0
        gene_lengths = {}
        for gene_dir in gene_dirs:
            homologs = os.path.join(gene_dir, 'homologs.trimmed.aligned.faa')

            # the alignment length is tracked per gene so an empty
            # alignment file cannot pick up the length of another gene
            gene_length = 0
            for seq_id, seq in seq_io.read_seq(homologs):
                taxon_id, gene_id = self._split_ids(seq_id, split_chars)
                if not taxon_id:
                    self.logger.error('Failed to split identifier: %s' %
                                      seq_id)
                    sys.exit(-1)

                concat[taxon_id][gene_dir].append(seq)
                gene_length = len(seq)

            if not gene_length:
                self.logger.warning('Alignment is empty: %s' % homologs)

            msa_length += gene_length
            gene_lengths[gene_dir] = gene_length

        # filter taxon
        mc_filter = set()
        min_per_gene_filter = set()
        min_per_bps_filter = set()
        for taxon_id in concat:
            # check if multiple copy
            missing = 0
            taxon_msa_len = 0
            for gene_dir in gene_dirs:
                if gene_dir not in concat[taxon_id]:
                    missing += 1
                    continue

                if len(concat[taxon_id][gene_dir]) > 1:
                    mc_filter.add(taxon_id)
                    break

                taxon_msa_len += len(concat[taxon_id][gene_dir][0])

            if taxon_id not in mc_filter:
                if missing > len(gene_dirs) * (1.0 -
                                               float(min_per_gene) / 100.0):
                    min_per_gene_filter.add(taxon_id)
                elif taxon_msa_len < msa_length * float(min_per_bps) / 100.0:
                    min_per_bps_filter.add(taxon_id)

        min_req_genes = math.ceil(len(gene_dirs) * float(min_per_gene) / 100.0)

        filtered_taxa = mc_filter.union(min_per_gene_filter).union(
            min_per_bps_filter)
        remaining_taxa = set(concat) - filtered_taxa
        self.logger.info('No. genes: %d' % len(gene_dirs))
        self.logger.info('No. taxa across all genes: %d' % len(concat))
        self.logger.info('Total filtered taxa: %d' % len(filtered_taxa))
        self.logger.info('  Due to multi-copy genes: %d' % len(mc_filter))
        self.logger.info('  Due to having <%d of the genes: %d' %
                         (min_req_genes, len(min_per_gene_filter)))
        self.logger.info('  Due to an insufficient number of base pairs: %d' %
                         len(min_per_bps_filter))
        self.logger.info('Remaining taxa: %d' % len(remaining_taxa))
        self.logger.info('Length of concatenated MSA: %d' % msa_length)

        # create the multiple sequences alignment
        msa_file = os.path.join(output_dir, 'concatenated.faa')
        with open(msa_file, 'w') as fout:
            for taxon_id in remaining_taxa:
                taxon_genes = concat[taxon_id]
                segments = []
                for gene_dir in gene_dirs:
                    if gene_dir not in taxon_genes:
                        # pad missing genes with gaps
                        segments.append('-' * gene_lengths[gene_dir])
                    else:
                        segments.append(taxon_genes[gene_dir][0])

                fout.write('>%s\n' % taxon_id)
                fout.write('%s\n' % ''.join(segments))

        # read all taxonomy files
        # (assumes taxonomy is the same for taxa across all genes)
        taxonomy = {}
        for gene_dir in gene_dirs:
            taxonomy_file = os.path.join(gene_dir, 'taxonomy.tsv')
            t = Taxonomy().read(taxonomy_file)
            for label, taxa_str in t.iteritems():
                taxon_id, _gene_id = self._split_ids(label, split_chars)
                taxonomy[taxon_id] = taxa_str

        # create taxonomy file for retained taxa
        self.logger.info('Creating taxonomy file for retained taxa.')
        output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        with open(output_taxonomy_file, 'w') as fout:
            for taxon_id in remaining_taxa:
                if taxon_id in taxonomy:  # query genomes will generally be missing
                    fout.write('%s\t%s\n' %
                               (taxon_id, ';'.join(taxonomy[taxon_id])))

        # infer tree
        if tree_program == 'fasttree':
            self.logger.info(
                'Inferring gene tree with FastTree using %s+GAMMA.' %
                prot_model)
            fasttree = FastTree(multithreaded=(self.cpus > 1))

            tree_unrooted_output = os.path.join(output_dir,
                                                'concatenated.unrooted.tree')
            tree_log = os.path.join(output_dir, 'concatenated.tree.log')
            tree_output_log = os.path.join(output_dir, 'fasttree.log')
            fasttree.run(msa_file, 'prot', prot_model, tree_unrooted_output,
                         tree_log, tree_output_log)
        elif tree_program == 'raxml':
            self.logger.info(
                'Inferring gene tree with RAxML using PROTGAMMA%s.' %
                prot_model)

            # create phylip MSA file
            phylip_msa_file = msa_file.replace('.faa', '.phyx')
            cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
            os.system(cmd)

            # run RAxML
            raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))

            raxml = RAxML(self.cpus)
            tree_unrooted_output = raxml.run(phylip_msa_file, prot_model,
                                             raxml_dir)
        else:
            # without a tree there is nothing to root or decorate; fail
            # with a clear message instead of an unbound variable below
            self.logger.error('Unrecognized tree program: %s' % tree_program)
            sys.exit(-1)

        # root tree at midpoint
        self.logger.info('Rooting tree at midpoint.')
        tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)
        if len(remaining_taxa) > 2:
            tree.reroot_at_midpoint(update_bipartitions=False)
        tree_output = os.path.join(output_dir, 'concatenated.rooted.tree')
        tree.write_to_path(tree_output,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        # create tax2tree consensus map and decorate tree
        t2t_tree = os.path.join(output_dir, 'concatenated.tax2tree.tree')
        cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                                  tree_output, t2t_tree)
        os.system(cmd)

        # setup metadata for ARB file
        metadata = {}
        src_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(src_dir, 'VERSION')) as version_file:
            metadata['genetreetk_version'] = version_file.read().strip()

        metadata['genetreetk_tree_program'] = tree_program
        metadata['genetreetk_tree_prot_model'] = prot_model

        # create ARB metadata file
        self.logger.info('Creating ARB metadata file.')
        arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
        self.create_arb_metadata(msa_file, taxonomy, metadata,
                                 arb_metadata_file)
Example #7
0
    def _filter_taxa_for_dist_inference(self, tree, taxonomy, trusted_taxa,
                                        min_children, min_support):
        """Determine taxa to use for inferring distribution of relative divergences.

        Parameters
        ----------
        tree : Dendropy Tree
            Phylogenetic tree.
        taxonomy : d[taxon ID] -> [d__x; p__y; ...]
            Taxonomy for each taxon.
        trusted_taxa : iterable
            Trusted taxa to consider when inferring distribution.
        min_children : int
            Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
            Only consider taxa with at least this level of support when inferring distribution.

        Returns
        -------
        set
            Names of the taxa to use for inferring the distribution.
        """

        # determine children taxa for each named group
        taxon_children = Taxonomy().taxon_children(taxonomy)

        # get all named groups
        taxa_for_dist_inference = set()
        for _taxon_id, taxa in taxonomy.iteritems():
            taxa_for_dist_inference.update(taxa)

        # sanity check species names as these are a common problem
        species = set()
        species_index = Taxonomy.rank_index['s__']
        for taxon_id, taxa in taxonomy.iteritems():
            if len(taxa) <= species_index:
                continue

            species_name = taxa[species_index]
            valid, error_msg = True, None
            if species_name != 's__':
                valid, error_msg = Taxonomy().validate_species_name(
                    species_name, require_full=True, require_prefix=True)
            if not valid:
                print('[Warning] Species name %s for %s is invalid: %s' % (
                    species_name, taxon_id, error_msg))
                continue

            species.add(species_name)

        # restrict taxa to those with a sufficient number of named children
        # Note: a taxonomic group with no children will not end up in the
        # taxon_children data structure so care must be taken when applying
        # this filtering criteria.
        if min_children > 0:
            valid_taxa = set()
            for taxon, children_taxa in taxon_children.iteritems():
                if len(children_taxa) >= min_children:
                    valid_taxa.add(taxon)

            taxa_for_dist_inference.intersection_update(valid_taxa)

            # explicitly add in the species since they have no
            # children and thus will be absent from the taxon_children
            # dictionary
            taxa_for_dist_inference.update(species)

        # restrict taxa used for inferring distribution to those with sufficient support
        if min_support > 0:
            reported_missing_support = False
            for node in tree.preorder_node_iter():
                if not node.label or node.is_leaf():
                    continue

                # check for support value
                support, taxon_name, _auxiliary_info = parse_label(node.label)

                if not taxon_name:
                    continue

                # NOTE(review): a support value of 0 is falsy and is handled
                # as a missing support value here; confirm against what
                # parse_label returns
                if support:
                    if float(support) < min_support:
                        taxa_for_dist_inference.discard(taxon_name)
                elif not reported_missing_support:
                    # no support value, so inform user if they were trying
                    # to filter on this property; the message concerns the
                    # whole tree so it is only reported once
                    print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                    reported_missing_support = True

        # restrict taxa used for inferring distribution to the trusted set
        if trusted_taxa:
            # convert to a set so any iterable is accepted, as documented
            taxa_for_dist_inference = set(trusted_taxa).intersection(
                taxa_for_dist_inference)

        return taxa_for_dist_inference
Example #8
0
   def tax_diff(self, tax1_file, tax2_file, include_user_taxa, output_dir):
       """Tabulate differences between two taxonomies.

       Writes '<tax1 name>.tax_diff.tsv' with one row per changed rank of
       each genome common to both taxonomies, and
       '<tax1 name>.tax_diff_summary.tsv' with the number and percentage of
       unchanged, active, passive, and unresolved changes at each rank. The
       per-rank counts are also printed to standard output.

       Parameters
       ----------
       tax1_file : str
           First taxonomy file.
       tax2_file : str
           Second taxonomy file.
       include_user_taxa : boolean
           Flag indicating if User genomes should be considered.
       output_dir : str
           Output directory.
       """

       tax1 = Taxonomy().read(tax1_file)
       tax2 = Taxonomy().read(tax2_file)

       if not include_user_taxa:
           # User genomes are identified by the 'U_' prefix
           tax1 = {genome_id: taxonomy
                   for genome_id, taxonomy in tax1.iteritems()
                   if not genome_id.startswith('U_')}
           tax2 = {genome_id: taxonomy
                   for genome_id, taxonomy in tax2.iteritems()
                   if not genome_id.startswith('U_')}

       common_taxa = set(tax1.keys()).intersection(tax2.keys())

       self.logger.info('First taxonomy contains %d taxa.' % len(tax1))
       self.logger.info('Second taxonomy contains %d taxa.' % len(tax2))
       self.logger.info('Taxonomies have %d taxa in common.' % len(common_taxa))

       # identify differences between taxonomies
       tax_file_name1 = os.path.splitext(os.path.basename(tax1_file))[0]
       tax_file_name2 = os.path.splitext(os.path.basename(tax2_file))[0]
       output_table = os.path.join(output_dir, '%s.tax_diff.tsv' % tax_file_name1)

       unchanged = defaultdict(int)           # T2 = g__Bob -> T1 = g__Bob, or T2 = g__ -> T1 = g__
       active_change = defaultdict(int)       # T2 = g__Bob -> T1 = g__Jane, or T2 = g__Bob -> T1 = g__Bob_A
       passive_change = defaultdict(int)      # T2 = g__??? -> T1 = g__Jane
       unresolved_change = defaultdict(int)   # T2 = g__Box -> T1 = g__???
       with open(output_table, 'w') as fout:
           fout.write('Genome ID\tChange\tRank\t%s\t%s\n' % (tax_file_name1, tax_file_name2))

           for taxa in common_taxa:
               t1 = tax1[taxa]
               t2 = tax2[taxa]

               for rank, (taxon1, taxon2) in enumerate(zip(t1, t2)):
                   if taxon1 == taxon2:
                       unchanged[rank] += 1
                       continue

                   # a taxon equal to the bare rank prefix is unassigned
                   prefix = Taxonomy.rank_prefixes[rank]
                   if taxon1 != prefix and taxon2 != prefix:
                       change, counts = 'active', active_change
                   elif taxon2 == prefix:
                       change, counts = 'passive', passive_change
                   else:
                       # taxa differ and taxon2 is assigned, so taxon1
                       # must be the unassigned one
                       change, counts = 'unresolved', unresolved_change

                   counts[rank] += 1
                   fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa,
                                                        change,
                                                        Taxonomy.rank_labels[rank],
                                                        ';'.join(t1),
                                                        ';'.join(t2)))

       # report results
       output_table = os.path.join(output_dir, '%s.tax_diff_summary.tsv' % tax_file_name1)

       with open(output_table, 'w') as fout:
           # no space after the tab so the column is named 'Active (%)'
           fout.write('Rank\tUnchanged\tUnchanged (%)\tActive\tActive (%)\tPassive\tPassive (%)\tUnresolved\tUnresolved (%)\n')
           print('Rank\tUnchanged\tActive\tPassive\tUnresolved\tTotal')
           for rank in xrange(0, len(Taxonomy.rank_prefixes)):
               total = unchanged[rank] + active_change[rank] + passive_change[rank] + unresolved_change[rank]
               if total == 0:
                   # rank absent from all common taxa; also avoids
                   # dividing by zero below
                   continue

               fout.write('%s\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n' %
                                   (Taxonomy.rank_labels[rank],
                                   unchanged[rank], unchanged[rank] * 100.0 / total,
                                   active_change[rank], active_change[rank] * 100.0 / total,
                                   passive_change[rank], passive_change[rank] * 100.0 / total,
                                   unresolved_change[rank], unresolved_change[rank] * 100.0 / total))
               print('%s\t%d\t%d\t%d\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                                   unchanged[rank],
                                                   active_change[rank],
                                                   passive_change[rank],
                                                   unresolved_change[rank],
                                                   total))
Example #9
0
def filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.

    Returns
    -------
    set
        Names of the taxa to use for inferring the distribution.
    """

    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for _taxon_id, taxa in taxonomy.iteritems():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species = set()
    species_index = Taxonomy.rank_index['s__']
    for taxon_id, taxa in taxonomy.iteritems():
        if len(taxa) <= species_index:
            continue

        species_name = taxa[species_index]
        valid, error_msg = True, None
        if species_name != 's__':
            valid, error_msg = Taxonomy().validate_species_name(species_name, require_full=True, require_prefix=True)
        if not valid:
            print('[Warning] Species name %s for %s is invalid: %s' % (species_name, taxon_id, error_msg))
            continue

        species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = set()
        for taxon, children_taxa in taxon_children.iteritems():
            if len(children_taxa) >= min_children:
                valid_taxa.add(taxon)

        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no
        # children and thus will be absent from the taxon_children
        # dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        reported_missing_support = False
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)

            if not taxon_name:
                continue

            # NOTE(review): a support value of 0 is falsy and is handled
            # as a missing support value here; confirm against what
            # parse_label returns
            if support:
                if float(support) < min_support:
                    taxa_for_dist_inference.discard(taxon_name)
            elif not reported_missing_support:
                # no support value, so inform user if they were trying to
                # filter on this property; the message concerns the whole
                # tree so it is only reported once
                print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                reported_missing_support = True

    # restrict taxa used for inferring distribution to the trusted set
    if trusted_taxa:
        # convert to a set so any iterable is accepted, as documented
        taxa_for_dist_inference = set(trusted_taxa).intersection(taxa_for_dist_inference)

    return taxa_for_dist_inference