Beispiel #1
0
    def _msa_filter_by_taxa(self, concatenated_file, gtdb_taxonomy, taxa_filter, outgroup_taxon):
        """Read the GTDB MSA and restrict it to genomes from the specified taxa.

        Parameters
        ----------
        concatenated_file : str
            Concatenated alignment handed to read_fasta.
        gtdb_taxonomy : dict
            Maps a genome id to an iterable of the taxa assigned to it.
        taxa_filter : str or None
            Comma-separated taxa to retain. When None, no filtering is done.
        outgroup_taxon : str or None
            Taxon that is always retained in addition to those in taxa_filter.

        Returns
        -------
        dict
            The alignment returned by read_fasta, with every genome removed
            whose taxonomy shares no taxon with the taxa to retain.
        """

        msa = read_fasta(concatenated_file)
        self.logger.info(
            'Read concatenated alignment for %d GTDB genomes.' % len(msa))

        if taxa_filter is None:
            return msa

        taxa_to_keep = set(taxa_filter.split(','))
        if outgroup_taxon is not None:
            taxa_to_keep.add(outgroup_taxon)

        filtered_genomes = 0
        # items() rather than iteritems() so this runs on Python 2 and 3
        for genome_id, taxa in gtdb_taxonomy.items():
            # genomes without an entry in gtdb_taxonomy are never removed
            if taxa_to_keep.isdisjoint(taxa) and genome_id in msa:
                del msa[genome_id]
                filtered_genomes += 1

        self.logger.info(
            'Filtered %d taxa based on assigned taxonomy.' % filtered_genomes)

        return msa
Beispiel #2
0
    def trim_msa(self, untrimmed_msa, mask_type, maskid, output_file):
        """Trim an MSA to the columns selected by a mask and write it as FASTA.

        Parameters
        ----------
        untrimmed_msa : str
            Alignment handed to read_fasta (presumably a FASTA file path).
        mask_type : str
            'reference' to use one of the masks under Config.MASK_DIR, or
            'file' to use a user-supplied mask file.
        maskid : str
            'bac' or 'arc' when mask_type is 'reference'; the path to the
            mask file when mask_type is 'file'.
        output_file : str
            Path of the trimmed FASTA file to write.

        Returns
        -------
        bool
            Always True.

        Raises
        ------
        ValueError
            If the mask_type / maskid combination is not recognised.
        """
        if maskid == 'bac' and mask_type == 'reference':
            mask = os.path.join(Config.MASK_DIR, Config.MASK_BAC120)
        elif maskid == 'arc' and mask_type == 'reference':
            mask = os.path.join(Config.MASK_DIR, Config.MASK_AR122)
        elif mask_type == 'file':
            mask = maskid
        else:
            raise ValueError(
                "Unrecognised mask: mask_type=%r, maskid=%r" % (mask_type, maskid))

        # only the first line of the mask file is consulted
        with open(mask, 'r') as f:
            maskstr = f.readline()

        # columns flagged with '1' are retained; resolve them once rather
        # than rescanning the mask for every sequence
        kept_columns = [i for i, flag in enumerate(maskstr) if flag == '1']

        dict_genomes = read_fasta(untrimmed_msa, False)

        # the context manager closes the output even if a write fails
        with open(output_file, 'w') as outfwriter:
            for seq_id, seq in dict_genomes.items():
                aligned_seq = ''.join([seq[i] for i in kept_columns])
                outfwriter.write(">%s\n%s\n" % (seq_id, aligned_seq))

        return True
Beispiel #3
0
    def run(self, msa_file, marker_list):
        """Trim an MSA and write the retained sequences to filtered_msa.faa.

        The alignment is read with read_fasta and handed, together with
        marker_list, to self.trim, which performs the actual column
        selection and taxa pruning (expected to randomly select a subset
        of columns from the MSA of each marker -- see that method).

        Parameters
        ----------
        msa_file : str
            Alignment handed to read_fasta.
        marker_list
            Passed through unchanged to self.trim.

        The output is written to 'filtered_msa.faa' inside self.output_dir.
        """

        # read multiple sequence alignment
        self.logger.info('Reading multiple sequence alignment.')
        msa = read_fasta(msa_file, False)
        self.logger.info('Read MSA for %d genomes.' % len(msa))

        filtered_seqs, pruned_seqs = self.trim(msa, marker_list)

        self.logger.info(
            'Removed %d taxa have amino acids in <%.1f%% of columns in filtered MSA.'
            % (len(pruned_seqs), self.min_perc_aa))

        # write out trimmed sequences; the context manager guarantees the
        # file is closed even if a write fails
        filtered_path = os.path.join(self.output_dir, "filtered_msa.faa")
        with open(filtered_path, 'w') as filter_file:
            for gid, seq in filtered_seqs.items():
                filter_file.write(">%s\n%s\n" % (gid, seq))

        self.logger.info('Done.')
Beispiel #4
0
    def _report_identified_marker_genes(self, gene_dict, outdir, marker_gene_dir, prefix):
        """Report statistics for identified marker genes.

        For every genome the top-hit file of each marker database (Pfam and
        TIGRFAM) is parsed and each bac120 / ar122 marker is classified as
        unique (seen once), multiple (seen more than once) or missing (never
        seen). One row per genome is written to the bacterial and to the
        archaeal marker summary, and the genome's best translation table is
        written to the translation table summary. Finally a symlink to each
        summary file is created directly under outdir.

        Parameters
        ----------
        gene_dict : dict
            Maps a genome id to a dict providing the keys "aa_gene_path"
            (path of the called protein file, from which the top-hit file
            paths are derived) and "best_translation_table".
        outdir : str
            Directory the PATH_* summary templates are resolved against.
        marker_gene_dir : str
            Not used by this method; kept for interface compatibility.
        prefix : str
            Value substituted into the PATH_* summary templates.
        """

        tln_summary = PATH_TLN_TABLE_SUMMARY.format(prefix=prefix)
        bac_summary = PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix)
        arc_summary = PATH_AR122_MARKER_SUMMARY.format(prefix=prefix)

        header = "Name\tnumber_unique_genes\tnumber_multiple_genes\tnumber_missing_genes\tlist_unique_genes\tlist_multiple_genes\tlist_missing_genes\n"
        row_template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n"

        # suffixes of the top-hit files to inspect (Pfam and TIGRFAM)
        top_hit_suffixes = (PFAM_TOP_HIT_SUFFIX, TIGRFAM_TOP_HIT_SUFFIX)

        def marker_ids(markers_by_db):
            """Flatten a {database: [HMM file names]} mapping into marker ids."""
            return [marker.replace(".HMM", "").replace(".hmm", "")
                    for markers in markers_by_db.values()
                    for marker in markers]

        def summary_row(genome_id, markers, multihit):
            """Build the summary line for one genome and one marker set."""
            unique, multiple, missing = [], [], []
            for mid in markers:
                if mid not in multihit:
                    missing.append(mid)
                elif multihit[mid]:
                    multiple.append(mid)
                else:
                    unique.append(mid)
            return row_template.format(genome_id,
                                       len(unique),
                                       len(multiple),
                                       len(missing),
                                       ','.join(unique),
                                       ','.join(multiple),
                                       ','.join(missing))

        # the lists fix the order markers are reported in; the sets give
        # constant-time membership tests while parsing the top-hit files
        marker_bac_list_original = marker_ids(Config.BAC120_MARKERS)
        marker_arc_list_original = marker_ids(Config.AR122_MARKERS)
        bac_marker_set = set(marker_bac_list_original)
        arc_marker_set = set(marker_arc_list_original)

        # the context managers close all three summaries even on failure
        with open(os.path.join(outdir, tln_summary), "w") as translation_table_file, \
                open(os.path.join(outdir, bac_summary), "w") as bac_outfile, \
                open(os.path.join(outdir, arc_summary), "w") as arc_outfile:

            bac_outfile.write(header)
            arc_outfile.write(header)

            for db_genome_id, info in gene_dict.items():
                # marker id -> True once the marker has been seen more than once
                bac_multihit, arc_multihit = {}, {}

                protein_file = str(info.get("aa_gene_path"))
                for marker_suffix in top_hit_suffixes:
                    tophit_path = protein_file.replace(PROTEIN_FILE_SUFFIX, marker_suffix)

                    with open(tophit_path) as tp:
                        # first line is header line
                        tp.readline()

                        for line_tp in tp:
                            # second column lists the hits separated by ';',
                            # each hit starting with the marker id followed
                            # by ','-separated details
                            hits = line_tp.split("\t")[1]
                            for hit in hits.split(";"):
                                markerid = hit.split(",")[0]

                                # the first sighting records False, any
                                # further sighting flips the entry to True
                                if markerid in bac_marker_set:
                                    bac_multihit[markerid] = markerid in bac_multihit
                                if markerid in arc_marker_set:
                                    arc_multihit[markerid] = markerid in arc_multihit

                bac_outfile.write(summary_row(
                    db_genome_id, marker_bac_list_original, bac_multihit))
                arc_outfile.write(summary_row(
                    db_genome_id, marker_arc_list_original, arc_multihit))

                translation_table_file.write('{}\t{}\n'.format(
                    db_genome_id, info.get("best_translation_table")))

        # Create a symlink to store the summary files in the root.
        # The link target is the relative summary path, so it is resolved
        # relative to outdir, where the link itself is created.
        for summary in (bac_summary, arc_summary, tln_summary):
            os.symlink(summary,
                       os.path.join(outdir, os.path.basename(summary)))