Exemple #1
0
    def run_ptp(self, jp):
        """Run ``epa_2_ptp`` on a placement result and report the clusters.

        The full alignment is loaded from ``self.epa_alignment`` and passed,
        together with ``jp`` and ``self.refjson``, to ``epa_2_ptp``. Each
        returned cluster is logged at debug level as a comma-separated list
        of taxon names with the query prefix stripped; when ``fout`` is set,
        the same lines are also written to ``fout + ".species"``.

        Args:
            jp: placement object forwarded to ``epa_2_ptp`` as ``epa_jp``.
        """
        full_aln = SeqGroup(self.epa_alignment)
        species_list = epa_2_ptp(epa_jp=jp,
                                 ref_jp=self.refjson,
                                 full_alignment=full_aln,
                                 min_lw=0.5,
                                 debug=self.cfg.debug)

        self.cfg.log.debug("Species clusters:")

        # NOTE(review): `fout` is not defined inside this method; presumably
        # it is a module-level name -- confirm against the rest of the file.
        fo2 = open(fout + ".species", "w") if fout else None

        # try/finally guarantees the output file is closed even when writing
        # or logging one of the clusters raises.
        try:
            for sp_cluster in species_list:
                s = ",".join(
                    EpacConfig.strip_query_prefix(taxon)
                    for taxon in sp_cluster)
                if fo2:
                    fo2.write(s + "\n")
                self.cfg.log.debug(s)
        finally:
            if fo2:
                fo2.close()
Exemple #2
0
    def checkinput(self, query_fname, minp=0.9):
        """Load the query sequences and combine them with the reference alignment.

        The input format is guessed by trying each supported format in turn.
        If none of them parses, ``self.cfg.exit_user_error`` is called.

        When ``self.ignore_refalign`` is set, the combined alignment is
        written straight away and the method returns. Otherwise the query
        sequences get ``EpacConfig.QUERY_SEQ_PREFIX`` prepended, are written
        to ``self.tmpquery`` as FASTA, and are then either merged directly
        (same length as the reference alignment), merged with MUSCLE
        (aligned, but different length), or aligned to the reference with
        HMMER (not aligned).

        Args:
            query_fname: path of the query sequence file.
            minp: forwarded to ``self.align_to_refenence`` as ``minp``.
        """
        formats = [
            "fasta", "phylip", "iphylip", "phylip_relaxed", "iphylip_relaxed"
        ]
        # Reset first so that a previously loaded value cannot mask a failure
        # to parse the current file.
        self.seqs = None
        for fmt in formats:
            try:
                self.seqs = SeqGroup(sequences=query_fname, format=fmt)
                break
            except Exception:
                # Parsing failures are expected while guessing the format;
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                self.cfg.log.debug("Guessing input format: not " + fmt)
        if self.seqs is None:
            self.cfg.exit_user_error(
                "Invalid input file format: %s\nThe supported input formats are fasta and phylip"
                % query_fname)

        if self.ignore_refalign:
            self.cfg.log.info(
                "Assuming query file contains reference sequences, skipping the alignment step...\n"
            )
            self.write_combined_alignment()
            return

        self.query_count = len(self.seqs)

        # add query seq name prefix to avoid confusion between reference and query sequences
        self.seqs.add_name_prefix(EpacConfig.QUERY_SEQ_PREFIX)

        self.seqs.write(format="fasta", outfile=self.tmpquery)
        self.cfg.log.info("Checking if query sequences are aligned ...")
        entries = self.seqs.get_entries()
        # The sequences count as aligned when every one has the same length
        # as the first entry.
        seql = len(entries[0][1])
        aligned = all(len(entry[1]) == seql for entry in entries[1:])

        if aligned and len(self.seqs) > 1:
            self.cfg.log.info("Query sequences are aligned")
            refalnl = self.refjson.get_alignment_length()
            if refalnl == seql:
                self.cfg.log.info(
                    "Merging query alignment with reference alignment")
                self.merge_alignment(self.seqs)
            else:
                self.cfg.log.info(
                    "Merging query alignment with reference alignment using MUSCLE"
                )
                self.require_muscle()
                refaln = self.refjson.get_alignment(fout=self.tmp_refaln)
                m = muscle(self.cfg)
                self.epa_alignment = m.merge(refaln, self.tmpquery)
        else:
            self.cfg.log.info("Query sequences are not aligned")
            self.cfg.log.info(
                "Align query sequences to the reference alignment using HMMER")
            self.require_hmmer()
            self.align_to_refenence(self.noalign, minp=minp)
Exemple #3
0
 def load_reduced_refalign(self):
     """Load the reduced reference alignment into ``self.reduced_refalign_seqs``.

     The file ``self.reduced_refalign_fname`` is tried as FASTA first and
     then as relaxed PHYLIP. If neither format parses,
     ``self.cfg.exit_fatal_error`` is called.
     """
     formats = ["fasta", "phylip_relaxed"]
     # Reset first so that a previously loaded value cannot mask a failure
     # to parse the current file.
     self.reduced_refalign_seqs = None
     for fmt in formats:
         try:
             self.reduced_refalign_seqs = SeqGroup(
                 sequences=self.reduced_refalign_fname, format=fmt)
             break
         except Exception:
             # Best-effort format guessing: a parse failure just means "try
             # the next format". KeyboardInterrupt/SystemExit still propagate.
             continue
     if self.reduced_refalign_seqs is None:
         errmsg = "FATAL ERROR: Invalid input file format in %s! (load_reduced_refalign)" % self.reduced_refalign_fname
         self.cfg.exit_fatal_error(errmsg)
Exemple #4
0
 def load_alignment(self):
     """Load the input alignment into ``self.input_seqs``.

     The file ``self.cfg.align_fname`` is tried with each supported format
     in turn; the first one that parses wins. If none parses,
     ``self.cfg.exit_user_error`` is called.
     """
     in_file = self.cfg.align_fname
     self.input_seqs = None
     formats = [
         "fasta", "phylip_relaxed", "iphylip_relaxed", "phylip", "iphylip"
     ]
     for fmt in formats:
         try:
             self.input_seqs = SeqGroup(sequences=in_file, format=fmt)
             break
         except Exception:
             # Parsing failures are expected while guessing the format;
             # KeyboardInterrupt/SystemExit are deliberately not caught.
             self.cfg.log.debug("Guessing input format: not " + fmt)
     if self.input_seqs is None:
         self.cfg.exit_user_error(
             "Invalid input file format: %s\nThe supported input formats are fasta and phylip"
             % in_file)
    def setUp(self):
        """Create the InputValidator fixture and the expected result lists.

        The taxonomy (``test.tax``) and alignment (``test.phy``) are read
        from the ``testfiles`` directory located next to this file.
        """
        config = EpacTrainerConfig()
        config.debug = True

        here = os.path.dirname(os.path.abspath(__file__))
        testfile_dir = os.path.join(here, "testfiles")
        taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX,
                            os.path.join(testfile_dir, "test.tax"))
        alignment = SeqGroup(sequences=os.path.join(testfile_dir, "test.phy"),
                             format="phylip")
        self.inval = InputValidator(config, taxonomy, alignment, False)

        self.expected_mis_ids = ["Missing1", "Missing2"]
        self.expected_dups = ["DupSeq(01)", "DupSeq02"]

        # Expected merges are the rank ids of the duplicated sequences.
        self.expected_merges = []
        for dup_id in self.expected_dups:
            self.expected_merges.append(
                self.inval.taxonomy.seq_rank_id(dup_id))