def search_one(self, query_seqid, pctid, threads=None):
    """Yield assembly pairs for hits whose identity equals ``pctid``.

    The sequence stored under ``query_seqid`` in ``self.seqs`` is written
    to ``temp_query.fasta`` and searched against ``self.fasta_fp`` with a
    ``PctidAligner``, using ``pctid`` as the minimum identity.  A hit is
    reported only when its ``"pident"`` field equals ``pctid`` formatted
    to one decimal place.

    Files left behind by a previous search are renamed to
    ``temp_prev_query.fasta`` / ``temp_prev_query_hits.txt`` rather than
    being overwritten.

    Yields:
        ``AssemblyPair`` built from the query assembly, the subject
        assembly, the hit's ``"pident"`` value and both sequence ids.
    """
    wanted_pident = "{:.1f}".format(pctid)
    print("Searching", query_seqid, "at", wanted_pident, "pct identity")
    sequence = self.seqs[query_seqid]

    fasta_path = "temp_query.fasta"
    hits_path = "temp_query_hits.txt"
    # Move the files of the previous run out of the way before writing.
    for current, backup in (
        (fasta_path, "temp_prev_query.fasta"),
        (hits_path, "temp_prev_query_hits.txt"),
    ):
        if os.path.exists(current):
            os.rename(current, backup)

    with open(fasta_path, "w") as fasta_out:
        write_fasta(fasta_out, [(query_seqid, sequence)])

    aligner = PctidAligner(self.fasta_fp)
    aligner.search(
        fasta_path,
        hits_path,
        min_pctid=pctid,
        threads=threads,
        max_hits=10000,
    )

    with open(hits_path) as hits_in:
        # Hits are consumed while the file is still open.
        for hit in aligner.parse(hits_in):
            if hit["pident"] != wanted_pident:
                continue
            query_assembly = self.assemblies[hit["qseqid"]]
            subject_assembly = self.assemblies[hit["sseqid"]]
            yield AssemblyPair(
                query_assembly,
                subject_assembly,
                hit["pident"],
                hit["qseqid"],
                hit["sseqid"],
            )
def search_reference_seqs(self, query_seqs):
    """Run ``vsearch --usearch_global`` for ``query_seqs`` against the reference.

    The queries are written to a temporary FASTA file, which is removed
    as soon as vsearch has finished.  Hits are written by vsearch to a
    second temporary file (``--userout``) that is handed back to the
    caller.

    Args:
        query_seqs: Records passed straight to ``write_fasta``.

    Returns:
        The open ``tempfile.NamedTemporaryFile`` (text mode, ``.txt``
        suffix) whose ``.name`` is the path vsearch wrote its hits to.
        The file is deleted when the caller closes it.

    Raises:
        subprocess.CalledProcessError: If vsearch exits with a non-zero
            status.
    """
    with tempfile.NamedTemporaryFile(suffix=".fasta", mode="wt") as query_file:
        write_fasta(query_file, query_seqs)
        # Make sure the data is on disk before vsearch opens the path.
        query_file.flush()

        reference_hits_file = tempfile.NamedTemporaryFile(
            suffix=".txt", mode="wt")

        # 97.0 --> 0.97
        vsearch_min_id = "{:.2f}".format(self.min_pct_id / 100)
        vsearch_args = [
            "vsearch",
            "--usearch_global", query_file.name,
            "--db", self.reference_udb_fp,
            "--userout", reference_hits_file.name,
            "--iddef", "2",
            "--id", vsearch_min_id,
            # argv entries must be strings; max_hits may be an int.
            "--maxaccepts", str(self.max_hits),
            "--userfields",
            "query+target+id2+alnlen+mism+gaps+qilo+qihi+tilo+tihi+qs+ts+qrow+trow",
        ]
        if self.num_threads:
            vsearch_args.extend(["--threads", str(self.num_threads)])

        try:
            subprocess.check_call(vsearch_args)
        except Exception:
            # Do not leave the output temp file behind when the search fails.
            reference_hits_file.close()
            raise

    reference_hits_file.seek(0)
    return reference_hits_file
def find_in_seqs(self, seqs):
    """Yield ``(query_id, PrimerMatch)`` for unmatched records found by alignment.

    Nothing is yielded when ``seqs.all_matched()`` is true.  Otherwise the
    already-matched records are written out as the search subject, the
    unmatched records are searched against them with ``VsearchAligner``,
    and every hit is extended with ``HitExtender`` and translated into
    query coordinates through ``AlignedRegion``.
    """
    if seqs.all_matched():
        return

    # File paths for the subject FASTA, the query FASTA and the hit table.
    subject_fp, query_fp, result_fp = (
        self._make_fp(template.format(self.suffix))
        for template in ("subject_{0}.fa", "query_{0}.fa", "query_{0}.txt")
    )

    # Search the unmatched records against the matched ones.
    with open(subject_fp, "w") as subject_out:
        write_fasta(subject_out, seqs.get_matched_offset0())
    aligner = VsearchAligner(subject_fp)

    search_args = dict(
        min_id=round(self.min_pct_id / 100, 2),
        top_hits_only=None,
    )
    if self.cores > 0:
        search_args["threads"] = self.cores

    hits = aligner.search(
        seqs.get_unmatched_recs(),
        input_fp=query_fp,
        output_fp=result_fp,
        **search_args
    )

    # Refine each hit and express it relative to the query sequence.
    extender = HitExtender(
        seqs.get_unmatched_recs(), seqs.get_matched_offset0())
    for hit in hits:
        alignment = extender.extend_hit(hit)
        known_match = seqs.matches[alignment.subject_id]
        region = AlignedRegion.from_subject(
            alignment, known_match.start, known_match.end)
        start_idx, end_idx = region.in_query()
        offset = region.query_offset()
        yield alignment.query_id, PrimerMatch(
            start_idx, end_idx, offset, "Alignment")
def search(self, seqs, input_fp=None, output_fp=None, **kwargs):
    """Write ``seqs`` to FASTA, run the aligner, and yield the parsed hits.

    Args:
        seqs: Records passed to ``write_fasta``.
        input_fp: Path for the query FASTA; a temporary file is used
            when omitted.
        output_fp: Path for the aligner output; a temporary file is used
            when omitted.
        **kwargs: Forwarded unchanged to ``self._call``.

    Yields:
        Each hit produced by ``self._parse`` from the output file.
    """
    if input_fp is not None:
        with open(input_fp, "w") as query_out:
            write_fasta(query_out, seqs)
    else:
        # The temp-file object must stay referenced until the aligner
        # has run, otherwise the file is removed from disk.
        query_tmp = tempfile.NamedTemporaryFile(mode="w+t", encoding="utf-8")
        write_fasta(query_tmp, seqs)
        query_tmp.seek(0)
        input_fp = query_tmp.name

    if output_fp is None:
        # Held in a local for the same reason as the query temp file.
        result_tmp = tempfile.NamedTemporaryFile()
        output_fp = result_tmp.name

    self._call(input_fp, self.ref_seqs_fp, output_fp, **kwargs)

    with open(output_fp) as result_in:
        yield from self._parse(result_in)
def search_seq(self, query_seqid, query_seq, min_pctid=90.0, threads=None):
    """Yield assembly pairs for hits between different accessions.

    ``query_seq`` is written to ``temp_query.fasta`` under the id
    ``query_seqid`` and searched against ``self.fasta_fp`` with a
    ``PctidAligner`` at ``min_pctid`` minimum identity.  Hits whose query
    and subject assemblies share the same ``accession`` are skipped.

    Files left behind by a previous search are renamed to
    ``temp_prev_query.fasta`` / ``temp_prev_query_hits.txt`` first.

    Yields:
        ``AssemblyPair`` built from the query assembly, the subject
        assembly, the hit's ``"pident"`` value and both sequence ids.
    """
    fasta_path = "temp_query.fasta"
    if os.path.exists(fasta_path):
        os.rename(fasta_path, "temp_prev_query.fasta")

    hits_path = "temp_query_hits.txt"
    if os.path.exists(hits_path):
        os.rename(hits_path, "temp_prev_query_hits.txt")

    with open(fasta_path, "w") as fasta_out:
        write_fasta(fasta_out, [(query_seqid, query_seq)])

    aligner = PctidAligner(self.fasta_fp)
    aligner.search(
        fasta_path,
        hits_path,
        min_pctid=min_pctid,
        threads=threads,
        max_hits=10000,
    )

    with open(hits_path) as hits_in:
        # Hits are consumed while the file is still open.
        for hit in aligner.parse(hits_in):
            query_assembly = self.assemblies[hit["qseqid"]]
            subject_assembly = self.assemblies[hit["sseqid"]]
            pident = hit["pident"]
            if query_assembly.accession == subject_assembly.accession:
                continue
            yield AssemblyPair(
                query_assembly,
                subject_assembly,
                pident,
                hit["qseqid"],
                hit["sseqid"],
            )
def save(self):
    """Write the sequences and the seqid-to-accession table to disk.

    ``self.seqs`` is written as FASTA to ``self.fasta_fp``; one
    tab-separated ``seqid<TAB>accession`` line per entry of
    ``self.assemblies`` is written to ``self.accession_fp``.
    """
    with open(self.fasta_fp, "w") as fasta_out:
        write_fasta(fasta_out, self.seqs.items())

    with open(self.accession_fp, "w") as accession_out:
        accession_out.writelines(
            "{0}\t{1}\n".format(seqid, assembly.accession)
            for seqid, assembly in self.assemblies.items()
        )
def test_write_fasta(self):
    """write_fasta emits a ``>id`` line followed by a sequence line per record."""
    seqs = [("a", "CCGGT"), ("b", "TTTTTTTTT")]
    # The context manager closes (and deletes) the temp file even when
    # the assertion fails, avoiding a leaked handle / ResourceWarning.
    with tempfile.NamedTemporaryFile(mode="w+t", encoding="utf-8") as f:
        write_fasta(f, seqs)
        # Rewind so the content just written can be read back.
        f.seek(0)
        self.assertEqual(f.read(), ">a\nCCGGT\n>b\nTTTTTTTTT\n")