def run_ptp(self, jp):
    """Run species delimitation on the EPA result and report the clusters.

    Loads ``self.epa_alignment`` into a ``SeqGroup``, passes it together
    with ``jp`` and ``self.refjson`` to ``epa_2_ptp`` and logs every
    resulting species cluster at debug level as a comma-separated line of
    taxon names with the query prefix stripped.  When ``fout`` is truthy
    the same lines are also written to ``fout + ".species"``.

    NOTE(review): ``fout`` is not defined inside this method; presumably
    it is a module-level name -- confirm it is always bound before this
    method is called.

    :param jp: EPA placement result, forwarded to ``epa_2_ptp`` as ``epa_jp``.
    """
    full_aln = SeqGroup(self.epa_alignment)
    species_list = epa_2_ptp(epa_jp=jp,
                             ref_jp=self.refjson,
                             full_alignment=full_aln,
                             min_lw=0.5,
                             debug=self.cfg.debug)

    self.cfg.log.debug("Species clusters:")

    species_file = open(fout + ".species", "w") if fout else None
    try:
        for sp_cluster in species_list:
            line = ",".join(EpacConfig.strip_query_prefix(taxon)
                            for taxon in sp_cluster)
            if species_file is not None:
                species_file.write(line + "\n")
            self.cfg.log.debug(line)
    finally:
        # Close the output file even if translating or writing a cluster
        # fails, so the handle is never leaked.
        if species_file is not None:
            species_file.close()
def checkinput(self, query_fname, minp=0.9):
    """Load the query sequences and combine them with the reference alignment.

    Steps:

    1. Probe ``query_fname`` against the known formats in order and keep
       the first ``SeqGroup`` that parses in ``self.seqs``.  If none
       parses (``self.seqs`` is still ``None``), report it through
       ``self.cfg.exit_user_error``.
    2. If ``self.ignore_refalign`` is set, call
       ``self.write_combined_alignment()`` and return.
    3. Otherwise store the sequence count in ``self.query_count``, prefix
       the names with ``EpacConfig.QUERY_SEQ_PREFIX`` and write the
       sequences as FASTA to ``self.tmpquery``.
    4. If there is more than one query sequence and all have the same
       length, treat them as aligned: merge directly when that length
       equals the reference alignment length, otherwise merge with MUSCLE
       and store the result in ``self.epa_alignment``.  In every other
       case align the queries to the reference with HMMER.

    :param query_fname: path of the query sequence file.
    :param minp: forwarded unchanged to ``self.align_to_refenence``.
    """
    formats = ("fasta", "phylip", "iphylip", "phylip_relaxed",
               "iphylip_relaxed")
    for fmt in formats:
        try:
            self.seqs = SeqGroup(sequences=query_fname, format=fmt)
            break
        except Exception:
            # A parse failure only means "wrong format, try the next one".
            # Catching Exception (not a bare except) lets Ctrl-C and
            # SystemExit propagate instead of being mistaken for one.
            self.cfg.log.debug("Guessing input format: not " + fmt)

    if self.seqs is None:
        self.cfg.exit_user_error(
            "Invalid input file format: %s\nThe supported input formats are fasta and phylip"
            % query_fname)

    if self.ignore_refalign:
        self.cfg.log.info(
            "Assuming query file contains reference sequences, skipping the alignment step...\n"
        )
        self.write_combined_alignment()
        return

    self.query_count = len(self.seqs)

    # add query seq name prefix to avoid confusion between reference and
    # query sequences
    self.seqs.add_name_prefix(EpacConfig.QUERY_SEQ_PREFIX)
    self.seqs.write(format="fasta", outfile=self.tmpquery)

    self.cfg.log.info("Checking if query sequences are aligned ...")
    entries = self.seqs.get_entries()
    seql = len(entries[0][1])
    # Equal sequence length across all entries is used as the "aligned" test.
    aligned = all(len(entry[1]) == seql for entry in entries[1:])

    if aligned and len(self.seqs) > 1:
        self.cfg.log.info("Query sequences are aligned")
        refalnl = self.refjson.get_alignment_length()
        if refalnl == seql:
            self.cfg.log.info(
                "Merging query alignment with reference alignment")
            self.merge_alignment(self.seqs)
        else:
            self.cfg.log.info(
                "Merging query alignment with reference alignment using MUSCLE"
            )
            self.require_muscle()
            refaln = self.refjson.get_alignment(fout=self.tmp_refaln)
            m = muscle(self.cfg)
            self.epa_alignment = m.merge(refaln, self.tmpquery)
    else:
        self.cfg.log.info("Query sequences are not aligned")
        self.cfg.log.info(
            "Align query sequences to the reference alignment using HMMER")
        self.require_hmmer()
        self.align_to_refenence(self.noalign, minp=minp)
def load_reduced_refalign(self):
    """Load the reduced reference alignment into ``self.reduced_refalign_seqs``.

    The file ``self.reduced_refalign_fname`` is parsed as ``fasta`` and
    then as ``phylip_relaxed``; the first format that parses wins.  If
    ``self.reduced_refalign_seqs`` is still ``None`` afterwards, the
    failure is reported through ``self.cfg.exit_fatal_error``.
    """
    formats = ("fasta", "phylip_relaxed")
    for fmt in formats:
        try:
            self.reduced_refalign_seqs = SeqGroup(
                sequences=self.reduced_refalign_fname, format=fmt)
            break
        except Exception:
            # Deliberate: a parse failure just means "try the next format".
            # Exception (rather than a bare except) keeps KeyboardInterrupt
            # and SystemExit from being silently discarded here.
            continue

    if self.reduced_refalign_seqs is None:
        errmsg = "FATAL ERROR: Invalid input file format in %s! (load_reduced_refalign)" % self.reduced_refalign_fname
        self.cfg.exit_fatal_error(errmsg)
def load_alignment(self):
    """Load the alignment named by ``self.cfg.align_fname`` into ``self.input_seqs``.

    ``self.input_seqs`` is reset to ``None`` and the file is then probed
    against each supported format in order; the first one that parses
    wins.  Each rejected format is logged at debug level.  If no format
    parses, the failure is reported through ``self.cfg.exit_user_error``.
    """
    in_file = self.cfg.align_fname
    self.input_seqs = None
    formats = ("fasta", "phylip_relaxed", "iphylip_relaxed", "phylip",
               "iphylip")
    for fmt in formats:
        try:
            self.input_seqs = SeqGroup(sequences=in_file, format=fmt)
            break
        except Exception:
            # A parse failure only means "wrong format, try the next one".
            # Exception (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            self.cfg.log.debug("Guessing input format: not " + fmt)

    if self.input_seqs is None:
        self.cfg.exit_user_error(
            "Invalid input file format: %s\nThe supported input formats are fasta and phylip"
            % in_file)
def setUp(self):
    """Build the ``InputValidator`` fixture from the bundled test files.

    Reads ``test.tax`` and ``test.phy`` from the ``testfiles`` directory
    next to this module, creates the validator with a debug-enabled
    trainer config, and records the IDs the tests expect to be reported
    as missing, duplicated and merged.
    """
    config = EpacTrainerConfig()
    config.debug = True

    module_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(module_dir, "testfiles")

    taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX,
                        os.path.join(data_dir, "test.tax"))
    alignment = SeqGroup(sequences=os.path.join(data_dir, "test.phy"),
                         format="phylip")
    self.inval = InputValidator(config, taxonomy, alignment, False)

    self.expected_mis_ids = ["Missing1", "Missing2"]
    self.expected_dups = ["DupSeq(01)", "DupSeq02"]

    # Expected merge targets are the rank IDs of the duplicated sequences.
    merge_ids = []
    for seq_id in self.expected_dups:
        merge_ids.append(self.inval.taxonomy.seq_rank_id(seq_id))
    self.expected_merges = merge_ids