def _msa_filter_by_taxa(self, concatenated_file, gtdb_taxonomy, taxa_filter, outgroup_taxon):
    """Restrict the GTDB MSA to genomes assigned to the requested taxa.

    Parameters
    ----------
    concatenated_file : str
        FASTA file with the concatenated alignment; loaded with read_fasta.
    gtdb_taxonomy : dict
        Maps each genome id to an iterable of the taxa assigned to it.
    taxa_filter : str or None
        Comma-separated taxa to retain. When None, no filtering is done.
    outgroup_taxon : str or None
        Taxon that is always retained in addition to `taxa_filter`.

    Returns
    -------
    dict
        The alignment returned by read_fasta, with every genome removed
        whose taxonomy shares no taxon with the retained set.
    """
    msa = read_fasta(concatenated_file)
    self.logger.info(
        'Read concatenated alignment for %d GTDB genomes.' % len(msa))

    if taxa_filter is None:
        return msa

    taxa_to_keep = set(taxa_filter.split(','))
    if outgroup_taxon is not None:
        # set.add is a no-op for an existing member, so no membership
        # test is needed first.
        taxa_to_keep.add(outgroup_taxon)

    filtered_genomes = 0
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for genome_id, taxa in gtdb_taxonomy.items():
        if not taxa_to_keep.intersection(taxa) and genome_id in msa:
            del msa[genome_id]
            filtered_genomes += 1

    self.logger.info(
        'Filtered %d taxa based on assigned taxonomy.' % filtered_genomes)

    return msa
def trim_msa(self, untrimmed_msa, mask_type, maskid, output_file):
    """Trim an MSA to the columns flagged with '1' in a mask.

    Parameters
    ----------
    untrimmed_msa : str
        FASTA file with the untrimmed alignment; loaded with read_fasta.
    mask_type : str
        'reference' to use a mask stored under Config.MASK_DIR, or
        'file' to read the mask from the path given in `maskid`.
    maskid : str
        'bac' or 'arc' when `mask_type` is 'reference'; otherwise the
        path to the mask file.
    output_file : str
        Path of the FASTA file the trimmed sequences are written to.

    Returns
    -------
    bool
        Always True.

    Raises
    ------
    ValueError
        If the `mask_type` / `maskid` combination is not recognised.
    IndexError
        If a sequence is shorter than the index of a retained column.
    """
    if mask_type == 'reference' and maskid == 'bac':
        mask = os.path.join(Config.MASK_DIR, Config.MASK_BAC120)
    elif mask_type == 'reference' and maskid == 'arc':
        mask = os.path.join(Config.MASK_DIR, Config.MASK_AR122)
    elif mask_type == 'file':
        mask = maskid
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `mask`.
        raise ValueError(
            "Unrecognised mask specification: mask_type=%r, maskid=%r"
            % (mask_type, maskid))

    # Only the first line of the mask file is used.
    with open(mask, 'r') as f:
        maskstr = f.readline()

    # The retained column indices are the same for every sequence, so
    # compute them once instead of rescanning the mask per genome.
    keep_columns = [i for i, flag in enumerate(maskstr) if flag == '1']

    # Read the input before opening the output so a failed read does not
    # leave a truncated output file behind.
    dict_genomes = read_fasta(untrimmed_msa, False)

    with open(output_file, 'w') as outfwriter:
        for k, v in dict_genomes.items():
            aligned_seq = ''.join([v[i] for i in keep_columns])
            outfwriter.write(">%s\n%s\n" % (k, aligned_seq))

    return True
def run(self, msa_file, marker_list):
    """Trim an MSA with self.trim and write the retained sequences.

    The alignment is loaded with read_fasta, passed to self.trim together
    with `marker_list`, and the sequences it returns as retained are
    written in FASTA format to 'filtered_msa.faa' inside self.output_dir.
    The column selection itself happens in self.trim.

    Parameters
    ----------
    msa_file : str
        FASTA file with the multiple sequence alignment.
    marker_list
        Passed through unchanged to self.trim.
    """
    # read multiple sequence alignment
    self.logger.info('Reading multiple sequence alignment.')
    msa = read_fasta(msa_file, False)
    self.logger.info('Read MSA for %d genomes.' % len(msa))

    filtered_seqs, pruned_seqs = self.trim(msa, marker_list)

    self.logger.info(
        'Removed %d taxa have amino acids in <%.1f%% of columns in filtered MSA.' % (
            len(pruned_seqs), self.min_perc_aa))

    # write out trimmed sequences; the context manager guarantees the
    # handle is closed even if a write fails part-way through
    filter_path = os.path.join(self.output_dir, "filtered_msa.faa")
    with open(filter_path, 'w') as filter_file:
        for gid, seq in filtered_seqs.items():
            filter_file.write(">%s\n%s\n" % (gid, seq))

    self.logger.info('Done.')
def _report_identified_marker_genes(self, gene_dict, outdir, marker_gene_dir, prefix):
    """Report statistics for identified marker genes.

    For every genome, the PFAM and TIGRFAM top-hit files are scanned and
    each bacterial / archaeal marker is classified as unique (hit by one
    gene), multiple (hit more than once) or missing. One summary row per
    genome is written to the bacterial and archaeal summary files, and
    the genome's translation table to a third file. Finally, a symlink
    named after each summary file is created directly under `outdir`.

    Parameters
    ----------
    gene_dict : dict
        Maps genome id to an info dict; the keys 'aa_gene_path' and
        'best_translation_table' are read from it.
    outdir : str
        Directory the PATH_*_SUMMARY templates are joined onto.
    marker_gene_dir
        Not used by this method; kept for interface compatibility.
    prefix : str
        Value substituted for `{prefix}` in the PATH_*_SUMMARY templates.
    """

    def _marker_ids(marker_config):
        # Flatten the per-database HMM names into marker ids without the
        # .HMM / .hmm extension.
        ids = []
        for hmm_names in marker_config.values():
            ids.extend(name.replace(".HMM", "").replace(".hmm", "")
                       for name in hmm_names)
        return ids

    def _record_hit(hits, markerid, genename):
        # The first hit registers the marker; any later hit flags it as
        # multi-copy.
        if markerid in hits:
            hits[markerid]["multihit"] = True
        else:
            hits[markerid] = {"gene": genename, "multihit": False}

    def _classify(marker_ids, hits):
        # Split the markers, in their original order, into
        # (unique, multiple, missing).
        unique, multi, missing = [], [], []
        for mid in marker_ids:
            if mid not in hits:
                missing.append(mid)
            elif hits[mid]["multihit"]:
                multi.append(mid)
            else:
                unique.append(mid)
        return unique, multi, missing

    def _summary_row(genome_id, unique, multi, missing):
        return "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
            genome_id, len(unique), len(multi), len(missing),
            ','.join(unique), ','.join(multi), ','.join(missing))

    tln_rel = PATH_TLN_TABLE_SUMMARY.format(prefix=prefix)
    bac_rel = PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix)
    arc_rel = PATH_AR122_MARKER_SUMMARY.format(prefix=prefix)

    header = "Name\tnumber_unique_genes\tnumber_multiple_genes\tnumber_missing_genes\tlist_unique_genes\tlist_multiple_genes\tlist_missing_genes\n"

    # gather information for all marker genes
    marker_suffixes = (PFAM_TOP_HIT_SUFFIX, TIGRFAM_TOP_HIT_SUFFIX)
    marker_bac_list_original = _marker_ids(Config.BAC120_MARKERS)
    marker_arc_list_original = _marker_ids(Config.AR122_MARKERS)
    # Sets give O(1) membership tests in the innermost loop; the lists
    # are kept because they define the order of the reported markers.
    bac_marker_set = set(marker_bac_list_original)
    arc_marker_set = set(marker_arc_list_original)

    # A single with-statement closes all three files even when a read or
    # write fails part-way through.
    with open(os.path.join(outdir, tln_rel), "w") as translation_table_file, \
            open(os.path.join(outdir, bac_rel), "w") as bac_outfile, \
            open(os.path.join(outdir, arc_rel), "w") as arc_outfile:
        bac_outfile.write(header)
        arc_outfile.write(header)

        for db_genome_id, info in gene_dict.items():
            gene_bac_dict, gene_arc_dict = {}, {}
            protein_file = str(info.get("aa_gene_path"))

            # we load the list of all the genes detected in the genome;
            # this does not depend on the marker database, so it is done
            # once per genome
            all_genes_dict = read_fasta(protein_file, False)

            # Prodigal adds an asterisks at the end of each called genes.
            # These asterisks sometimes appear in the MSA, which can be
            # an issue for some downstream software
            # NOTE(review): all_genes_dict is not read again in this
            # method; the load is kept so a problem with the protein file
            # still surfaces here - confirm whether it can be dropped.
            for seq_id, seq in all_genes_dict.items():
                # endswith is safe for empty sequences, unlike seq[-1]
                if seq.endswith('*'):
                    all_genes_dict[seq_id] = seq[:-1]

            for marker_suffix in marker_suffixes:
                tophit_path = protein_file.replace(PROTEIN_FILE_SUFFIX, marker_suffix)

                with open(tophit_path) as tp:
                    # first line is header line
                    tp.readline()

                    for line_tp in tp:
                        # drop the line terminator so it cannot end up in
                        # a marker id, and tolerate blank lines
                        line_tp = line_tp.rstrip("\r\n")
                        if not line_tp:
                            continue

                        linelist = line_tp.split("\t")
                        genename = linelist[0]

                        # hits are ';'-separated; within a hit the marker
                        # id is the first ','-separated field
                        for each_mark in linelist[1].split(";"):
                            markerid = each_mark.split(",")[0]
                            if markerid in bac_marker_set:
                                _record_hit(gene_bac_dict, markerid, genename)
                            if markerid in arc_marker_set:
                                _record_hit(gene_arc_dict, markerid, genename)

            bac_outfile.write(_summary_row(
                db_genome_id, *_classify(marker_bac_list_original, gene_bac_dict)))
            arc_outfile.write(_summary_row(
                db_genome_id, *_classify(marker_arc_list_original, gene_arc_dict)))
            translation_table_file.write('{}\t{}\n'.format(
                db_genome_id, info.get("best_translation_table")))

    # Create a symlink to store the summary files in the root.
    for rel_path in (bac_rel, arc_rel, tln_rel):
        link_path = os.path.join(outdir, os.path.basename(rel_path))
        # os.symlink fails if the link name already exists (e.g. when the
        # pipeline is rerun into the same directory). Only a stale symlink
        # is replaced, so a regular file is never deleted.
        if os.path.islink(link_path):
            os.remove(link_path)
        os.symlink(rel_path, link_path)