def domainAssignmentReport(self, outfile):
        """Report results of automated domain assignment.

        Writes a tab-separated report with one row per genome found in
        the metadata_taxonomy table. Each row holds the external genome
        identifier, the predicted domain, the archaeal and bacterial
        marker percentages returned by _domainAssignment, and the stored
        NCBI and GTDB taxonomy strings.

        Parameters
        ----------
        outfile : str
            Path of the report file to write.
        """

        # every genome with a taxonomy record is reported
        self.cur.execute("SELECT id FROM metadata_taxonomy")
        genome_ids = [row[0] for row in self.cur.fetchall()]

        # sizes of marker sets 1 and 2, used below as the bacterial and
        # archaeal marker counts respectively
        self.cur.execute("SELECT count(*) from marker_set_contents where set_id = 1;")
        len_bac_marker = self.cur.fetchone()[0]

        self.cur.execute("SELECT count(*) from marker_set_contents where set_id = 2;")
        len_arc_marker = self.cur.fetchone()[0]

        genome_mngr = GenomeManager(self.cur, self.currentUser)

        # resolve all external identifiers in one call instead of issuing
        # one lookup per genome inside the loop
        external_ids = {}
        if genome_ids:
            external_ids = genome_mngr.genomeIdsToExternalGenomeIds(genome_ids)

        # the query text is loop-invariant, so build it once
        query_taxonomy_req = ("SELECT gtdb_domain, ncbi_taxonomy, gtdb_taxonomy " +
                              "FROM metadata_taxonomy WHERE id = %s;")

        # the context manager guarantees the report is closed even when a
        # query or the domain assignment raises part-way through
        with open(outfile, 'w') as fout:
            fout.write('Genome Id\tPredicted domain\tArchaeal Marker Percentage\tBacterial Marker Percentage\tNCBI taxonomy\tGTDB taxonomy\n')
            for genome_id in genome_ids:
                self.cur.execute(query_taxonomy_req, (genome_id,))
                # the stored GTDB domain is fetched but not reported
                _gtdb_domain, ncbi_taxonomy, gtdb_taxonomy = self.cur.fetchone()

                domain, arc_aa_per, bac_aa_per = self._domainAssignment(genome_id, len_arc_marker, len_bac_marker)

                fout.write('%s\t%s\t%.2f\t%.2f\t%s\t%s\n' % (external_ids[genome_id], domain, arc_aa_per, bac_aa_per, ncbi_taxonomy, gtdb_taxonomy))
Exemple #2
0
    def ncbiDereplicatedGenomes(self, include_user_reps):
        """Get identifiers from the dereplicated set of NCBI genomes.

        Identifiers are returned for NCBI genomes flagged as GTDB
        representatives and for NCBI genomes without an assigned
        representative.

        Parameters
        ----------
        include_user_reps : bool
            Flag indicating if NCBI genomes assigned to a
            User representative should be returned.

        Returns
        -------
        list
            List of database genome identifiers.

        Raises
        ------
        GenomeDatabaseError
            Any such error raised by the underlying calls propagates
            unchanged, with its original traceback.
        """

        # NOTE: the previous `except GenomeDatabaseError as e: raise e`
        # wrapper only re-raised the same exception (discarding the
        # original traceback on Python 2), so it has been dropped.
        genome_mngr = GenomeManager(self.cur, self.currentUser)
        ncbi_genomes_ids = genome_mngr.ncbiGenomeIds()

        # representatives, plus genomes that have no representative
        self.cur.execute(
            "SELECT id "
            "FROM metadata_taxonomy "
            "WHERE (gtdb_representative = 'TRUE' "
            "OR gtdb_genome_representative IS NULL) "
            "AND id = ANY(%s)",
            (ncbi_genomes_ids,))
        derep_genome_ids = [row[0] for row in self.cur]

        if include_user_reps:
            # non-representative NCBI genomes whose assigned representative
            # identifier starts with 'U'
            self.cur.execute(
                "SELECT id "
                "FROM metadata_taxonomy "
                "WHERE (gtdb_representative = 'FALSE' "
                "AND gtdb_genome_representative LIKE %s "
                "AND id = ANY(%s))",
                ('U%', ncbi_genomes_ids))
            derep_genome_ids += [row[0] for row in self.cur]

        return derep_genome_ids
    def ncbiDereplicatedGenomes(self, include_user_reps):
        """Get identifiers from the dereplicated set of NCBI genomes.

        Identifiers are returned for NCBI genomes flagged as GTDB
        representatives and for NCBI genomes without an assigned
        representative.

        Parameters
        ----------
        include_user_reps : bool
            Flag indicating if NCBI genomes assigned to a
            User representative should be returned.

        Returns
        -------
        list
            List of database genome identifiers.

        Raises
        ------
        GenomeDatabaseError
            Any such error raised by the underlying calls propagates
            unchanged, with its original traceback.
        """

        # NOTE: a handler that merely did `raise e` used to wrap this
        # body; it added nothing (and reset the traceback on Python 2),
        # so exceptions are now simply allowed to propagate.
        genome_mngr = GenomeManager(self.cur, self.currentUser)
        ncbi_genomes_ids = genome_mngr.ncbiGenomeIds()

        # representatives, plus genomes that have no representative
        self.cur.execute("SELECT id " +
                         "FROM metadata_taxonomy " +
                         "WHERE (gtdb_representative = 'TRUE' " +
                         "OR gtdb_genome_representative IS NULL) " +
                         "AND id = ANY(%s)", (ncbi_genomes_ids,))
        derep_genome_ids = [row[0] for row in self.cur]

        if include_user_reps:
            # non-representative NCBI genomes whose assigned representative
            # identifier starts with 'U'
            self.cur.execute("SELECT id " +
                             "FROM metadata_taxonomy " +
                             "WHERE (gtdb_representative = 'FALSE' " +
                             "AND gtdb_genome_representative LIKE %s " +
                             "AND id = ANY(%s))", ('U%', ncbi_genomes_ids,))
            derep_genome_ids += [row[0] for row in self.cur]

        return derep_genome_ids
Exemple #4
0
    def domainAssignmentReport(self, outfile):
        """Report results of automated domain assignment.

        Writes a tab-separated report with one row per genome found in
        the metadata_taxonomy table. Each row holds the external genome
        identifier, the predicted domain, the archaeal and bacterial
        marker percentages returned by _domainAssignment, and the stored
        NCBI and GTDB taxonomy strings.

        Parameters
        ----------
        outfile : str
            Path of the report file to write.
        """

        # every genome with a taxonomy record is reported
        self.cur.execute("SELECT id FROM metadata_taxonomy")
        genome_ids = [row[0] for row in self.cur.fetchall()]

        # sizes of marker sets 1 and 2, used below as the bacterial and
        # archaeal marker counts respectively
        self.cur.execute(
            "SELECT count(*) from marker_set_contents where set_id = 1;")
        len_bac_marker = self.cur.fetchone()[0]

        self.cur.execute(
            "SELECT count(*) from marker_set_contents where set_id = 2;")
        len_arc_marker = self.cur.fetchone()[0]

        genome_mngr = GenomeManager(self.cur, self.currentUser)

        # resolve all external identifiers in one call instead of issuing
        # one lookup per genome inside the loop
        external_ids = {}
        if genome_ids:
            external_ids = genome_mngr.genomeIdsToExternalGenomeIds(
                genome_ids)

        # the query text is loop-invariant, so build it once
        query_taxonomy_req = (
            "SELECT gtdb_domain, ncbi_taxonomy, gtdb_taxonomy " +
            "FROM metadata_taxonomy WHERE id = %s;")

        # the context manager guarantees the report is closed even when a
        # query or the domain assignment raises part-way through
        with open(outfile, 'w') as fout:
            fout.write(
                'Genome Id\tPredicted domain\tArchaeal Marker Percentage\tBacterial Marker Percentage\tNCBI taxonomy\tGTDB taxonomy\n'
            )
            for genome_id in genome_ids:
                self.cur.execute(query_taxonomy_req, (genome_id, ))
                # the stored GTDB domain is fetched but not reported
                _gtdb_domain, ncbi_taxonomy, gtdb_taxonomy = \
                    self.cur.fetchone()

                domain, arc_aa_per, bac_aa_per = self._domainAssignment(
                    genome_id, len_arc_marker, len_bac_marker)

                fout.write('%s\t%s\t%.2f\t%.2f\t%s\t%s\n' %
                           (external_ids[genome_id], domain, arc_aa_per,
                            bac_aa_per, ncbi_taxonomy, gtdb_taxonomy))
Exemple #5
0
    def userRepresentativeGenomes(self):
        """Get genome identifiers for all user representative genomes.

        Returns
        -------
        list
            List of database identifiers for user representative genomes.

        Raises
        ------
        GenomeDatabaseError
            Any such error raised by the underlying calls propagates
            unchanged, with its original traceback.
        """

        # NOTE: the former `except GenomeDatabaseError as e: raise e`
        # wrapper was a no-op that reset the traceback on Python 2; it
        # has been removed so errors propagate untouched.
        genome_mngr = GenomeManager(self.cur, self.currentUser)
        user_genomes_ids = genome_mngr.userGenomeIds()

        # restrict the user genomes to those flagged as representatives
        self.cur.execute("SELECT id " +
                         "FROM metadata_taxonomy " +
                         "WHERE gtdb_representative = 'TRUE' " +
                         "AND id = ANY(%s)", (user_genomes_ids,))

        return [row[0] for row in self.cur]
    def userRepresentativeGenomes(self):
        """Get genome identifiers for all user representative genomes.

        Returns
        -------
        list
            List of database identifiers for user representative genomes.

        Raises
        ------
        GenomeDatabaseError
            Any such error raised by the underlying calls propagates
            unchanged, with its original traceback.
        """

        # NOTE: a handler that only did `raise e` used to surround this
        # body; it changed nothing except the traceback on Python 2, so
        # exceptions are now left to propagate on their own.
        genome_mngr = GenomeManager(self.cur, self.currentUser)
        user_genomes_ids = genome_mngr.userGenomeIds()

        # restrict the user genomes to those flagged as representatives
        self.cur.execute("SELECT id " +
                         "FROM metadata_taxonomy " +
                         "WHERE gtdb_representative = 'TRUE' " +
                         "AND id = ANY(%s)", (user_genomes_ids,))
        user_rep_genome_ids = [row[0] for row in self.cur]

        return user_rep_genome_ids
Exemple #7
0
    def filterGenomes(self, marker_ids, genome_ids, quality_threshold,
                      quality_weight, comp_threshold, cont_threshold,
                      min_perc_aa, min_rep_perc_aa, taxa_filter,
                      guaranteed_taxa_filter, genomes_to_exclude,
                      guaranteed_ids, rep_ids, directory, prefix):
        """Filter genomes based on provided criteria.

        Filters are applied in this order: taxonomy, guaranteed taxonomy,
        genome quality, explicit exclusion, and amino acid coverage of the
        alignment. Every filtered genome is recorded in
        <prefix>_filtered_genomes.tsv and the retained genomes are listed
        in <prefix>_good_genomes.tsv, both inside `directory`.

        Parameters
        ----------
        marker_ids : iterable
            Database identifiers of the markers forming the alignment.
        genome_ids : iterable
            Database identifiers of the genomes to filter. The collection
            is copied and never modified.
        quality_threshold : float
            Quality threshold forwarded to _filterOnGenomeQuality.
        quality_weight : float
            Quality weight forwarded to _filterOnGenomeQuality.
        comp_threshold : float
            Completeness threshold forwarded to _filterOnGenomeQuality.
        cont_threshold : float
            Contamination threshold forwarded to _filterOnGenomeQuality.
        min_perc_aa : float
            Minimum percentage of the alignment length that must be
            covered by amino acids for a genome to be retained.
        min_rep_perc_aa : float
            Percentage below which even representative genomes are
            filtered for lack of aligned amino acids.
        taxa_filter
            Passed to _taxa_filter with retain_guaranteed=True; skipped
            when falsy.
        guaranteed_taxa_filter
            Passed to _taxa_filter with retain_guaranteed=False; skipped
            when falsy.
        genomes_to_exclude : iterable
            Genomes to remove explicitly; skipped when falsy.
        guaranteed_ids : iterable
            Genomes exempt from the quality filter and, unless they have
            zero aligned amino acids, from the amino acid filter.
        rep_ids : container
            Database identifiers of representative genomes.
        directory : str
            Output directory; created when it does not exist.
        prefix : str
            Prefix for the names of the two output files.

        Returns
        -------
        tuple
            (retained genome identifiers as a set, marker identifiers in
            query order, dict mapping marker identifier to its metadata).

        Raises
        ------
        GenomeDatabaseError
            If a genome is marked for both retention and exclusion.
        """

        if not os.path.exists(directory):
            os.makedirs(directory)

        # get mapping from db genome IDs to external IDs
        genome_mngr = GenomeManager(self.cur, self.currentUser)
        external_ids = genome_mngr.genomeIdsToExternalGenomeIds(genome_ids)
        filter_genome_file = os.path.join(directory,
                                          prefix + '_filtered_genomes.tsv')

        # materialise once; reused by both marker queries below
        marker_id_tuple = tuple(marker_ids)

        # the context manager closes the log of filtered genomes even when
        # a query fails or the retention/exclusion conflict is raised
        with open(filter_genome_file, 'w') as fout_filtered:
            self.logger.info('Filtering initial set of %d genomes.' %
                             len(genome_ids))

            extra_guaranteed_ids = [
                x for x in guaranteed_ids if x not in genome_ids
            ]
            if len(extra_guaranteed_ids) > 0:
                self.logger.warning(
                    'Identified {0} guaranteed genomes absent from specified input genomes (Those genomes will not appear in the final tree).'
                    .format(len(extra_guaranteed_ids)))
                # keep this a set: the conflict check further down relies
                # on set operations
                guaranteed_ids = set(
                    x for x in guaranteed_ids if x in genome_ids)
            self.logger.info(
                'Identified %d genomes to be excluded from filtering.' %
                len(guaranteed_ids))

            # for all markers, get the expected marker size
            self.cur.execute(
                "SELECT markers.id, markers.name, description, id_in_database, size, external_id_prefix "
                + "FROM markers, marker_databases " + "WHERE markers.id in %s "
                "AND markers.marker_database_id = marker_databases.id "
                "ORDER by external_id_prefix ASC, id_in_database ASC",
                (marker_id_tuple, ))

            chosen_markers = dict()
            chosen_markers_order = []

            total_alignment_len = 0
            for marker_id, marker_name, marker_description, id_in_database, size, external_id_prefix in self.cur:
                chosen_markers[marker_id] = {
                    'external_id_prefix': external_id_prefix,
                    'name': marker_name,
                    'description': marker_description,
                    'id_in_database': id_in_database,
                    'size': size
                }
                chosen_markers_order.append(marker_id)
                total_alignment_len += size

            # work on a copy so the caller's collection is never mutated
            # by the in-place set updates below
            genomes_to_retain = set(genome_ids)

            # filter genomes based on taxonomy
            if taxa_filter:
                new_genomes_to_retain = self._taxa_filter(
                    taxa_filter,
                    genomes_to_retain,
                    guaranteed_ids,
                    retain_guaranteed=True)
                for genome_id in genomes_to_retain - new_genomes_to_retain:
                    rep_str = 'Representative' if genome_id in rep_ids else ''
                    fout_filtered.write(
                        '%s\t%s\t%s\n' %
                        (external_ids[genome_id],
                         'Filtered on taxonomic affiliation.', rep_str))

                genomes_to_retain = new_genomes_to_retain

            if guaranteed_taxa_filter:
                new_genomes_to_retain = self._taxa_filter(
                    guaranteed_taxa_filter,
                    genomes_to_retain,
                    guaranteed_ids,
                    retain_guaranteed=False)
                for genome_id in genomes_to_retain - new_genomes_to_retain:
                    rep_str = 'Representative' if genome_id in rep_ids else ''
                    fout_filtered.write(
                        '%s\t%s\t%s\n' %
                        (external_ids[genome_id],
                         'Filtered on guaranteed taxonomic affiliation.',
                         rep_str))

                genomes_to_retain = new_genomes_to_retain

            # find genomes based on completeness, contamination, or genome
            # quality
            self.logger.info(
                'Filtering genomes with completeness <%.1f%%, contamination >%.1f%%, or quality <%.1f%% (weight = %.1f).'
                % (comp_threshold, cont_threshold, quality_threshold,
                   quality_weight))
            filtered_genomes = self._filterOnGenomeQuality(
                genomes_to_retain, quality_threshold, quality_weight,
                comp_threshold, cont_threshold)

            # guaranteed genomes and representatives survive the quality
            # filter; representatives are only reported
            final_filtered_genomes = set()
            for genome_id, quality in filtered_genomes.items():
                if genome_id in guaranteed_ids:
                    continue

                if genome_id in rep_ids:
                    self.logger.warning(
                        'Retaining representative genome %s despite poor estimated quality (comp=%.1f%%, cont=%.1f%%).'
                        % (external_ids[genome_id], quality[0], quality[1]))
                else:
                    final_filtered_genomes.add(genome_id)
                    fout_filtered.write(
                        '%s\t%s\t%.2f\t%.2f\n' %
                        (external_ids[genome_id],
                         'Filtered on quality (completeness, contamination).',
                         quality[0], quality[1]))

            self.logger.info(
                'Filtered %d genomes based on completeness, contamination, and quality.'
                % len(final_filtered_genomes))

            genomes_to_retain -= final_filtered_genomes

            # filter genomes explicitly specified for exclusion
            if genomes_to_exclude:
                for genome_id in genomes_to_exclude:
                    if genome_id in external_ids:
                        fout_filtered.write(
                            '%s\t%s\n' % (external_ids[genome_id],
                                          'Explicitly marked for exclusion.'))

                # set() makes this safe whatever collection type the
                # caller supplied for the guaranteed genomes
                conflicting_genomes = set(guaranteed_ids).intersection(
                    genomes_to_exclude)
                if conflicting_genomes:
                    raise GenomeDatabaseError(
                        'Genomes marked for both retention and exclusion, e.g.: %s'
                        % conflicting_genomes.pop())

                new_genomes_to_retain = genomes_to_retain.difference(
                    genomes_to_exclude)
                self.logger.info(
                    'Filtered %d genomes explicitly indicated for exclusion.'
                    % (len(genomes_to_retain) - len(new_genomes_to_retain)))
                genomes_to_retain = new_genomes_to_retain

            # filter genomes with insufficient number of amino acids in MSA
            self.logger.info(
                'Filtering genomes with insufficient amino acids in the MSA.')

            # the query text is loop-invariant, so build it once
            aligned_marker_query = (
                "SELECT sequence, multiple_hits,hit_number,unique_genes " +
                "FROM aligned_markers " + "WHERE genome_id = %s " +
                "AND sequence is NOT NULL " + "AND marker_id IN %s")

            filter_on_aa = set()
            for genome_id in genomes_to_retain:
                self.cur.execute(aligned_marker_query,
                                 (genome_id, marker_id_tuple))

                # count residues only for markers with a single hit, or
                # with multiple hits that resolve to one unique gene
                total_aa = 0
                for sequence, multiple_hits, hit_number, unique_genes in self.cur:
                    if not multiple_hits or unique_genes == 1:
                        total_aa += len(sequence) - sequence.count('-')

                # should retain guaranteed genomes unless they have zero
                # amino acids in MSA
                if genome_id in guaranteed_ids:
                    if total_aa != 0:
                        continue

                    self.logger.warning(
                        'Filtered guaranteed genome %s with zero amino acids in MSA.'
                        % external_ids[genome_id])

                perc_alignment = total_aa * 100.0 / total_alignment_len
                if perc_alignment >= min_perc_aa:
                    continue

                rep_str = ''
                if genome_id in rep_ids:
                    if perc_alignment >= min_rep_perc_aa:
                        # representatives get the more lenient threshold
                        self.logger.warning(
                            'Retaining representative genome %s despite small numbers of aligned amino acids (%.1f%%).'
                            % (external_ids[genome_id], perc_alignment))
                        continue

                    rep_str = 'Representative'
                    self.logger.warning(
                        'Filtered representative genome %s due to lack of aligned amino acids (%.1f%%).'
                        % (external_ids[genome_id], perc_alignment))

                filter_on_aa.add(genome_id)
                fout_filtered.write('%s\t%s\t%d\t%.1f\t%s\n' % (
                    external_ids[genome_id],
                    'Insufficient number of amino acids in MSA (total AA, % alignment length)',
                    total_aa, perc_alignment, rep_str))

        self.logger.info(
            'Filtered %d genomes with insufficient amino acids in the MSA.' %
            len(filter_on_aa))

        genomes_to_retain.difference_update(filter_on_aa)
        self.logger.info('Producing tree data for %d genomes.' %
                         len(genomes_to_retain))

        good_genomes_file = os.path.join(directory,
                                         prefix + '_good_genomes.tsv')
        with open(good_genomes_file, 'w') as good_genomes:
            for item in genomes_to_retain:
                good_genomes.write("{0}\n".format(item))

        return (genomes_to_retain, chosen_markers_order, chosen_markers)
Exemple #8
0
    def assignToRepresentative(self):
        """Assign genomes to representatives.

        This method assumes any genomes to process
        have already been committed to the database
        as identification and alignment of canonical
        marker genes is done in independent database
        transactions.

        For each unprocessed genome, representatives of the same predicted
        domain are screened by amino acid mismatches over the concatenated
        canonical alignment. The (at most) ten closest candidates are then
        handed to _calculate_fastani_distance, which selects the
        representative. A genome with no taxonomy of its own inherits the
        GTDB ranks of its representative; a genome with no candidate
        representative and no taxonomy only has its predicted domain
        recorded. All processed genomes are flagged as non-representative.

        Raises
        ------
        GenomeDatabaseError
            If _calculate_fastani_distance raises any exception.
        """

        # identify genomes that have not been compared to representatives
        unprocessed_genome_ids = self._unprocessedGenomes()
        if not unprocessed_genome_ids:
            return

        # get canonical bacterial and archaeal markers
        marker_set_mngr = MarkerSetManager(self.cur, self.currentUser)
        bac_marker_ids = marker_set_mngr.canonicalBacterialMarkers()
        ar_marker_ids = marker_set_mngr.canonicalArchaealMarkers()

        # identify and align genes from canonical bacterial and archaeal marker
        # sets
        all_markers = set(bac_marker_ids).union(ar_marker_ids)
        aligned_mngr = AlignedMarkerManager(
            self.cur, self.threads, self.db_release)
        aligned_mngr.calculateAlignedMarkerSets(
            unprocessed_genome_ids, all_markers)

        # get list of representative genomes
        rep_genome_ids = self.representativeGenomes()
        self.logger.info("Comparing %d unprocessed genomes to %d representatives." %
                         (len(unprocessed_genome_ids),
                          len(rep_genome_ids)))

        # get external genome IDs for representative genomes
        genome_mngr = GenomeManager(self.cur, self.currentUser)
        external_ids = genome_mngr.genomeIdsToExternalGenomeIds(rep_genome_ids)

        # get domains for all representatives
        rep_genome_dictionary = self._getRepresentativeDomain()

        # define desired order of marker genes
        # (order doesn't matter, but must be consistent between genomes)
        bac_marker_index = {marker_id: i
                            for i, marker_id in enumerate(bac_marker_ids)}
        ar_marker_index = {marker_id: i
                           for i, marker_id in enumerate(ar_marker_ids)}

        # get concatenated alignments for all representatives
        rep_bac_aligns = {}
        rep_ar_aligns = {}
        for rep_id in rep_genome_ids:
            rep_bac_aligns[rep_id] = marker_set_mngr.concatenatedAlignedMarkers(
                rep_id, bac_marker_index)
            rep_ar_aligns[rep_id] = marker_set_mngr.concatenatedAlignedMarkers(
                rep_id, ar_marker_index)

        self.cur.execute(
            "SELECT count(*) from marker_set_contents where set_id = 1;")
        len_bac_marker = self.cur.fetchone()[0]

        self.cur.execute(
            "SELECT count(*) from marker_set_contents where set_id = 2;")
        len_arc_marker = self.cur.fetchone()[0]

        # SQL statements are loop-invariant, so build them once
        query_set_rep = ("UPDATE metadata_taxonomy " +
                         "SET gtdb_genome_representative = %s " +
                         "WHERE id = %s")
        query_taxonomy_req = ("SELECT gtdb_class, gtdb_species," +
                              "gtdb_phylum, gtdb_family, gtdb_domain, gtdb_order, gtdb_genus " +
                              "FROM metadata_taxonomy WHERE id = %s;")
        query_taxonomy_update = ("UPDATE metadata_taxonomy as mt_newg SET " +
                                 "gtdb_class = mt_repr.gtdb_class," +
                                 "gtdb_species = mt_repr.gtdb_species," +
                                 "gtdb_phylum = mt_repr.gtdb_phylum," +
                                 "gtdb_family = mt_repr.gtdb_family," +
                                 "gtdb_domain =  mt_repr.gtdb_domain," +
                                 "gtdb_order = mt_repr.gtdb_order," +
                                 "gtdb_genus =  mt_repr.gtdb_genus " +
                                 "FROM metadata_taxonomy mt_repr " +
                                 "WHERE mt_repr.id = %s " +
                                 "AND mt_newg.id = %s")

        # process each genome
        assigned_to_rep_count = 0
        for genome_id in unprocessed_genome_ids:
            # get canonical alignment
            genome_bac_align = marker_set_mngr.concatenatedAlignedMarkers(
                genome_id, bac_marker_index)
            genome_ar_align = marker_set_mngr.concatenatedAlignedMarkers(
                genome_id, ar_marker_index)

            domain, _arc_aa_per, _bac_aa_per = self._domainAssignment(
                genome_id, len_arc_marker, len_bac_marker)

            # mismatch budget: a fraction (1 - AAI threshold) of the
            # non-gap positions in the genome's own alignment
            bac_max_mismatches = (1.0 - self.aai_threshold) * \
                (len(genome_bac_align) - genome_bac_align.count('-'))
            ar_max_mismatches = (1.0 - self.aai_threshold) * \
                (len(genome_ar_align) - genome_ar_align.count('-'))

            # candidate representative -> number of mismatches
            assigned_representative_dic = {}
            for rep_id in rep_genome_ids:
                rep_bac_align = rep_bac_aligns[rep_id]
                rep_ar_align = rep_ar_aligns[rep_id]

                # only representatives from the predicted domain qualify
                rep_domain = rep_genome_dictionary[rep_id]
                if rep_domain != domain:
                    continue

                if rep_domain == 'd__Bacteria':
                    m = self._aai_mismatches(
                        genome_bac_align, rep_bac_align, bac_max_mismatches)
                    if m is not None:  # necessary to distinguish None and 0
                        # the bacterial budget is deliberately left
                        # untouched so every candidate within the
                        # threshold is collected
                        assigned_representative_dic[rep_id] = m

                elif rep_domain == 'd__Archaea':
                    m = self._aai_mismatches(
                        genome_ar_align, rep_ar_align, ar_max_mismatches)
                    if m is not None:  # necessary to distinguish None and 0
                        assigned_representative_dic[rep_id] = m
                        # NOTE(review): unlike the bacterial branch, the
                        # archaeal budget is tightened after each hit, so
                        # fewer archaeal candidates may reach FastANI -
                        # confirm this asymmetry is intended
                        ar_max_mismatches = m

            if assigned_representative_dic:
                # shortlist the ten candidates with the fewest mismatches
                sorted_reps = sorted(assigned_representative_dic.items(
                ), key=operator.itemgetter(1))[0:10]
                try:
                    assigned_representative = self._calculate_fastani_distance(
                        genome_id, sorted_reps)
                except Exception as error:
                    # str() is used because `error.message` is empty for
                    # multi-argument exceptions and absent on Python 3
                    raise GenomeDatabaseError(str(error))

                if assigned_representative:
                    # assign genome to selected representative
                    assigned_to_rep_count += 1
                    self.cur.execute(
                        query_set_rep,
                        (external_ids[assigned_representative], genome_id))

                    # inherit the representative's ranks only when the
                    # genome has no GTDB taxonomy of its own
                    self.cur.execute(query_taxonomy_req, (genome_id,))
                    if all(v is None or v == '' for v in self.cur.fetchone()):
                        self.cur.execute(query_taxonomy_update,
                                         (assigned_representative, genome_id))
            else:
                # no candidate representative: record just the predicted
                # domain when the genome has no GTDB taxonomy at all
                self.cur.execute(query_taxonomy_req, (genome_id,))
                if all(v is None or v == '' for v in self.cur.fetchone()):
                    # NOTE(review): repeats the call made at the top of
                    # this iteration with identical arguments; kept
                    # because _domainAssignment is opaque from here
                    domain, _arc_aa_per, _bac_aa_per = self._domainAssignment(
                        genome_id, len_arc_marker, len_bac_marker)

                    if domain:
                        self.cur.execute("UPDATE metadata_taxonomy " +
                                         "SET gtdb_domain = %s " +
                                         "WHERE id = %s", (domain, genome_id))

        self.logger.info("Assigned %d genomes to a representative." %
                         assigned_to_rep_count)

        # currently, new genomes are never made a representative
        query = "UPDATE metadata_taxonomy SET gtdb_representative = %s WHERE id = %s"
        self.cur.executemany(query, [('False', genome_id)
                                     for genome_id in unprocessed_genome_ids])
    def assignToRepresentative(self):
        """Assign genomes to representatives.

        This method assumes any genomes to process
        have already been committed to the database
        as identification and alignment of canonical
        marker genes is done in independent database
        transactions.

        Each unprocessed genome is compared against every representative
        using amino acid mismatches over the concatenated canonical
        bacterial alignment and, when that yields no hit, the archaeal
        alignment. The mismatch budget is tightened after each hit, so the
        last representative accepted is the one assigned. A genome with no
        taxonomy of its own inherits the taxonomy of its representative; a
        genome with neither representative nor taxonomy only has its
        predicted domain recorded. All processed genomes are flagged as
        non-representative.
        """

        # identify genomes that have not been compared to representatives
        unprocessed_genome_ids = self._unprocessedGenomes()
        if not unprocessed_genome_ids:
            return

        # get canonical bacterial and archaeal markers
        marker_set_mngr = MarkerSetManager(self.cur, self.currentUser)
        bac_marker_ids = marker_set_mngr.canonicalBacterialMarkers()
        ar_marker_ids = marker_set_mngr.canonicalArchaealMarkers()

        # identify and align genes from canonical bacterial and archaeal marker sets
        all_markers = set(bac_marker_ids).union(ar_marker_ids)
        aligned_mngr = AlignedMarkerManager(self.cur, self.threads)
        aligned_mngr.calculateAlignedMarkerSets(unprocessed_genome_ids, all_markers)

        # get list of representative genomes
        rep_genome_ids = self.representativeGenomes()
        self.logger.info("Comparing %d unprocessed genomes to %d representatives." %
                         (len(unprocessed_genome_ids),
                          len(rep_genome_ids)))

        # get external genome IDs for representative genomes
        genome_mngr = GenomeManager(self.cur, self.currentUser)
        external_ids = genome_mngr.genomeIdsToExternalGenomeIds(rep_genome_ids)

        # define desired order of marker genes
        # (order doesn't matter, but must be consistent between genomes)
        bac_marker_index = {marker_id: i for i, marker_id in enumerate(bac_marker_ids)}
        ar_marker_index = {marker_id: i for i, marker_id in enumerate(ar_marker_ids)}

        # get concatenated alignments for all representatives
        rep_bac_aligns = {}
        rep_ar_aligns = {}
        for rep_id in rep_genome_ids:
            rep_bac_aligns[rep_id] = marker_set_mngr.concatenatedAlignedMarkers(rep_id, bac_marker_index)
            rep_ar_aligns[rep_id] = marker_set_mngr.concatenatedAlignedMarkers(rep_id, ar_marker_index)

        self.cur.execute("SELECT count(*) from marker_set_contents where set_id = 1;")
        len_bac_marker = self.cur.fetchone()[0]

        self.cur.execute("SELECT count(*) from marker_set_contents where set_id = 2;")
        len_arc_marker = self.cur.fetchone()[0]

        # SQL statements are loop-invariant, so build them once
        query_set_rep = ("UPDATE metadata_taxonomy " +
                         "SET gtdb_genome_representative = %s " +
                         "WHERE id = %s")
        query_taxonomy_req = ("SELECT gtdb_class, gtdb_species, gtdb_taxonomy," +
                              "gtdb_phylum, gtdb_family, gtdb_domain, gtdb_order, gtdb_genus " +
                              "FROM metadata_taxonomy WHERE id = %s;")
        query_taxonomy_update = ("UPDATE metadata_taxonomy as mt_newg SET " +
                                 "gtdb_class = mt_repr.gtdb_class," +
                                 "gtdb_species = mt_repr.gtdb_species," +
                                 "gtdb_taxonomy = mt_repr.gtdb_taxonomy," +
                                 "gtdb_phylum = mt_repr.gtdb_phylum," +
                                 "gtdb_family = mt_repr.gtdb_family," +
                                 "gtdb_domain =  mt_repr.gtdb_domain," +
                                 "gtdb_order = mt_repr.gtdb_order," +
                                 "gtdb_genus =  mt_repr.gtdb_genus " +
                                 "FROM metadata_taxonomy mt_repr " +
                                 "WHERE mt_repr.id = %s " +
                                 "AND mt_newg.id = %s")

        # process each genome
        assigned_to_rep_count = 0
        for genome_id in unprocessed_genome_ids:
            # get canonical alignment
            genome_bac_align = marker_set_mngr.concatenatedAlignedMarkers(genome_id, bac_marker_index)
            genome_ar_align = marker_set_mngr.concatenatedAlignedMarkers(genome_id, ar_marker_index)

            # mismatch budget: a fraction (1 - AAI threshold) of the
            # non-gap positions in the genome's own alignment
            bac_max_mismatches = (1.0 - self.aai_threshold) * (len(genome_bac_align) - genome_bac_align.count('-'))
            ar_max_mismatches = (1.0 - self.aai_threshold) * (len(genome_ar_align) - genome_ar_align.count('-'))

            assigned_representative = None
            for rep_id in rep_genome_ids:
                rep_bac_align = rep_bac_aligns[rep_id]
                rep_ar_align = rep_ar_aligns[rep_id]

                # try the bacterial alignment first; each hit tightens the
                # budget so later representatives must do at least as well
                m = self._aai_mismatches(genome_bac_align, rep_bac_align, bac_max_mismatches)
                if m is not None:  # necessary to distinguish None and 0
                    assigned_representative = rep_id
                    bac_max_mismatches = m
                    continue

                # otherwise fall back to the archaeal alignment
                m = self._aai_mismatches(genome_ar_align, rep_ar_align, ar_max_mismatches)
                if m is not None:  # necessary to distinguish None and 0
                    assigned_representative = rep_id
                    ar_max_mismatches = m

            # compare against None so that a representative whose database
            # identifier is 0 is not mistaken for "no representative"
            if assigned_representative is not None:
                # assign genome to current representative
                assigned_to_rep_count += 1
                self.cur.execute(query_set_rep, (external_ids[assigned_representative], genome_id))

                # inherit the representative's taxonomy only when the
                # genome has no GTDB taxonomy of its own
                self.cur.execute(query_taxonomy_req, (genome_id,))
                if all(v is None or v == '' for v in self.cur.fetchone()):
                    self.cur.execute(query_taxonomy_update, (assigned_representative, genome_id))
            else:
                # no representative: record just the predicted domain when
                # the genome has no GTDB taxonomy at all
                self.cur.execute(query_taxonomy_req, (genome_id,))
                if all(v is None or v == '' for v in self.cur.fetchone()):
                    domain, _arc_aa_per, _bac_aa_per = self._domainAssignment(genome_id, len_arc_marker, len_bac_marker)

                    if domain:
                        self.cur.execute("UPDATE metadata_taxonomy " +
                                         "SET gtdb_domain = %s " +
                                         "WHERE id = %s", (domain, genome_id))

        self.logger.info("Assigned %d genomes to a representative." % assigned_to_rep_count)

        # currently, new genomes are never made a representative
        query = "UPDATE metadata_taxonomy SET gtdb_representative = %s WHERE id = %s"
        self.cur.executemany(query, [('False', genome_id) for genome_id in unprocessed_genome_ids])
Exemple #10
0
    def filterGenomes(
        self,
        marker_ids,
        genome_ids,
        quality_threshold,
        quality_weight,
        comp_threshold,
        cont_threshold,
        min_perc_aa,
        min_rep_perc_aa,
        taxa_filter,
        genomes_to_exclude,
        guaranteed_ids,
        rep_ids,
        directory,
        prefix,
    ):
        """Filter genomes based on provided criteria.

        Genomes are filtered, in order, on taxonomic affiliation, estimated
        genome quality, explicit exclusion, and the percentage of the
        alignment covered by amino acids. Most filtered genomes are recorded
        in '<prefix>_filtered_genomes.tsv' within `directory`.

        Parameters
        ----------
        marker_ids : iterable
            Database identifiers of the markers making up the alignment.
        genome_ids : set
            Database identifiers of the genomes to consider. This set is
            not modified.
        quality_threshold : float
            Quality threshold passed to _filterOnGenomeQuality.
        quality_weight : float
            Quality weight passed to _filterOnGenomeQuality.
        comp_threshold : float
            Completeness threshold passed to _filterOnGenomeQuality.
        cont_threshold : float
            Contamination threshold passed to _filterOnGenomeQuality.
        min_perc_aa : float
            Minimum percentage of the alignment length that must be covered
            by amino acids for a genome to be retained.
        min_rep_perc_aa : float
            Same as `min_perc_aa`, but applied to representative genomes
            that fall below `min_perc_aa`.
        taxa_filter : str
            Comma-separated taxa to retain; a false value disables
            taxonomic filtering.
        genomes_to_exclude : iterable
            Database identifiers of genomes to explicitly exclude; a false
            value disables this filter.
        guaranteed_ids : set
            Database identifiers of genomes exempt from the taxonomic,
            quality, and amino acid percentage filters. Such genomes are
            only considered for removal when they contribute zero amino
            acids to the alignment.
        rep_ids : collection
            Database identifiers of representative genomes.
        directory : str
            Output directory; created if it does not exist.
        prefix : str
            Prefix of the report file name.

        Returns
        -------
        set
            Database identifiers of retained genomes.
        list
            Marker identifiers in the order returned by the marker query.
        dict
            Marker identifier -> dictionary with the keys
            'external_id_prefix', 'name', 'description', 'id_in_database',
            and 'size'.

        Raises
        ------
        GenomeDatabaseError
            If a genome is marked for both retention and exclusion.
        """

        if not os.path.exists(directory):
            os.makedirs(directory)

        # work on a copy so the caller's set is never modified in place
        genomes_to_retain = set(genome_ids)

        # materialized once since it is reused by every query below
        marker_id_tuple = tuple(marker_ids)

        # get mapping from db genome IDs to external IDs
        genome_mngr = GenomeManager(self.cur, self.currentUser)
        external_ids = genome_mngr.genomeIdsToExternalGenomeIds(genome_ids)
        filter_genome_file = os.path.join(directory, prefix + "_filtered_genomes.tsv")

        self.logger.info("Filtering initial set of %d genomes." % len(genome_ids))
        self.logger.info("Identified %d genomes to be excluded from filtering." % len(guaranteed_ids))

        # for all markers, get the expected marker size
        self.cur.execute(
            "SELECT markers.id, markers.name, description, id_in_database, size, external_id_prefix "
            + "FROM markers, marker_databases "
            + "WHERE markers.id in %s "
            "AND markers.marker_database_id = marker_databases.id "
            "ORDER by external_id_prefix ASC, id_in_database ASC",
            (marker_id_tuple,),
        )

        chosen_markers = dict()
        chosen_markers_order = []

        total_alignment_len = 0
        for marker_id, marker_name, marker_description, id_in_database, size, external_id_prefix in self.cur:
            chosen_markers[marker_id] = {
                "external_id_prefix": external_id_prefix,
                "name": marker_name,
                "description": marker_description,
                "id_in_database": id_in_database,
                "size": size,
            }
            chosen_markers_order.append(marker_id)
            total_alignment_len += size

        # query is identical for every genome, so build it a single time
        aligned_marker_query = (
            "SELECT sequence, multiple_hits "
            + "FROM aligned_markers "
            + "WHERE genome_id = %s "
            + "AND sequence is NOT NULL "
            + "AND marker_id IN %s"
        )

        # context manager guarantees the report is closed even if a query
        # fails or conflicting genomes are detected
        with open(filter_genome_file, "w") as fout_filtered:
            # filter genomes based on taxonomy
            if taxa_filter:
                self.logger.info("Filtering genomes outside taxonomic groups of interest (%s)." % taxa_filter)
                taxa_to_retain = [x.strip() for x in taxa_filter.split(",")]
                genome_ids_from_taxa = self._genomesFromTaxa(genome_ids, taxa_to_retain)

                new_genomes_to_retain = genomes_to_retain.intersection(genome_ids_from_taxa).union(guaranteed_ids)
                self.logger.info(
                    "Filtered %d genomes based on taxonomic affiliations."
                    % (len(genomes_to_retain) - len(new_genomes_to_retain))
                )

                for genome_id in genomes_to_retain - new_genomes_to_retain:
                    rep_str = "Representative" if genome_id in rep_ids else ""
                    fout_filtered.write(
                        "%s\t%s\t%s\n" % (external_ids[genome_id], "Filtered on taxonomic affiliation.", rep_str)
                    )

                genomes_to_retain = new_genomes_to_retain

            # find genomes based on completeness, contamination, or genome quality
            self.logger.info(
                "Filtering genomes with completeness <%.1f%%, contamination >%.1f%%, or quality <%.1f%% (weight = %.1f)."
                % (comp_threshold, cont_threshold, quality_threshold, quality_weight)
            )
            filtered_genomes = self._filterOnGenomeQuality(
                genomes_to_retain, quality_threshold, quality_weight, comp_threshold, cont_threshold
            )

            # sanity check representatives are not of poor quality
            final_filtered_genomes = set()
            for genome_id, quality in filtered_genomes.items():
                if genome_id in guaranteed_ids:
                    continue

                if genome_id in rep_ids:
                    self.logger.warning(
                        "Retaining representative genome %s despite poor estimated quality (comp=%.1f%%, cont=%.1f%%)."
                        % (external_ids[genome_id], quality[0], quality[1])
                    )
                    continue

                final_filtered_genomes.add(genome_id)
                fout_filtered.write(
                    "%s\t%s\t%.2f\t%.2f\n"
                    % (
                        external_ids[genome_id],
                        "Filtered on quality (completeness, contamination).",
                        quality[0],
                        quality[1],
                    )
                )

            self.logger.info(
                "Filtered %d genomes based on completeness, contamination, and quality." % len(final_filtered_genomes)
            )

            genomes_to_retain -= final_filtered_genomes

            # filter genomes explicitly specified for exclusion
            if genomes_to_exclude:
                # validate before reporting so a conflicting request does not
                # leave misleading exclusion records in the report
                conflicting_genomes = guaranteed_ids.intersection(genomes_to_exclude)
                if conflicting_genomes:
                    raise GenomeDatabaseError(
                        "Genomes marked for both retention and exclusion, e.g.: %s" % conflicting_genomes.pop()
                    )

                for genome_id in genomes_to_exclude:
                    fout_filtered.write("%s\t%s\n" % (external_ids[genome_id], "Explicitly marked for exclusion."))

                new_genomes_to_retain = genomes_to_retain.difference(genomes_to_exclude)
                self.logger.info(
                    "Filtered %d genomes explicitly indicated for exclusion."
                    % (len(genomes_to_retain) - len(new_genomes_to_retain))
                )
                genomes_to_retain = new_genomes_to_retain

            # filter genomes with insufficient number of amino acids in MSA
            self.logger.info("Filtering genomes with insufficient amino acids in the MSA.")
            filter_on_aa = set()
            for genome_id in genomes_to_retain:
                self.cur.execute(aligned_marker_query, (genome_id, marker_id_tuple))

                # markers with multiple hits do not contribute amino acids
                total_aa = 0
                for sequence, multiple_hits in self.cur:
                    if not multiple_hits:
                        total_aa += len(sequence) - sequence.count("-")

                # should retain guaranteed genomes unless they have zero amino acids in MSA
                if genome_id in guaranteed_ids:
                    if total_aa != 0:
                        continue

                    self.logger.warning(
                        "Filtered guaranteed genome %s with zero amino acids in MSA." % external_ids[genome_id]
                    )

                # an empty marker set gives a zero-length alignment; treat
                # this as 0% coverage instead of dividing by zero
                if total_alignment_len:
                    perc_alignment = total_aa * 100.0 / total_alignment_len
                else:
                    perc_alignment = 0.0

                if perc_alignment >= min_perc_aa:
                    continue

                rep_str = ""
                if genome_id in rep_ids:
                    if perc_alignment >= min_rep_perc_aa:
                        self.logger.warning(
                            "Retaining representative genome %s despite small numbers of aligned amino acids (%.1f%%)."
                            % (external_ids[genome_id], perc_alignment)
                        )
                        continue

                    rep_str = "Representative"
                    self.logger.warning(
                        "Filtered representative genome %s due to lack of aligned amino acids (%.1f%%)."
                        % (external_ids[genome_id], perc_alignment)
                    )

                filter_on_aa.add(genome_id)
                fout_filtered.write(
                    "%s\t%s\t%d\t%.1f\t%s\n"
                    % (
                        external_ids[genome_id],
                        "Insufficient number of amino acids in MSA (total AA, % alignment length)",
                        total_aa,
                        perc_alignment,
                        rep_str,
                    )
                )

        self.logger.info("Filtered %d genomes with insufficient amino acids in the MSA." % len(filter_on_aa))

        genomes_to_retain.difference_update(filter_on_aa)
        self.logger.info("Producing tree data for %d genomes." % len(genomes_to_retain))

        return (genomes_to_retain, chosen_markers_order, chosen_markers)