def dict_to_bioalignment(d, alphabet='generic_alphabet', sorted=True):
    """
    Build a BioPython MultipleSeqAlignment from a mapping of
    record id -> aligned sequence string.

    :param d: dict of {id: sequence}
    :param alphabet: attribute name looked up on ``Bio.Alphabet``
    :param sorted: when True, sort the alignment by record id
        (NOTE: parameter shadows the builtin of the same name; kept
        for backward compatibility with existing callers)
    :return: a ``MultipleSeqAlignment`` with one record per entry
    """
    bio_alphabet = getattr(Bio.Alphabet, alphabet)
    alignment = MultipleSeqAlignment([])
    for name, sequence in d.items():
        alignment.append(
            SeqRecord(Seq(sequence, alphabet=bio_alphabet), id=name)
        )
    if sorted:
        alignment.sort()
    return alignment
def retrieve_alignment(tre, alnpath, taxonset=range(0, 101), delimiter='_'):
    """
    Extract the MSA corresponding to the leaves of a tree, padding
    missing taxa with all-gap rows.

    Parameters
    ----------------
    tre : single-copy treeswift tree generated from James's code
    alnpath : path to the phylip formatted alignment of the genes.
        The row labels should be a superset of the leafset of 'tre'
    taxonset : iterable, the taxon set of the entire dataset
    delimiter : row labels are truncated at the first occurrence of
        this character to recover the taxon id

    Returns the MSA that corresponds to the input tree.  Taxa in
    `taxonset` with no row in the alignment are emitted as all-gap
    records; raises KeyError if a recovered taxon id is not a member
    of `taxonset`.
    """
    # Fix: use a context manager so the file handle is closed promptly
    # (the original passed a bare open() and leaked the handle).
    with open(alnpath) as handle:
        aln = AlignIO.read(handle, "phylip")
    seqlen = len(aln[0].seq)
    blank = "-" * seqlen
    whitelist = set(tre.labels(True, False))
    rest = set(taxonset)
    res = MultipleSeqAlignment([])
    for r in aln[:, :seqlen]:
        if r.id in whitelist:
            rid = r.id.split(delimiter)[0]
            res.append(SeqRecord(r.seq, id=rid))
            # KeyError here means the row label is outside taxonset.
            rest.remove(rid)
    # Pad every taxon that had no row with an all-gap record.
    for rst in rest:
        res.append(SeqRecord(Seq(blank), id=str(rst)))
    res.sort()
    return res
def test_trimal2(alb_resources, hf):
    """Exercise Alb.trimal across 'all', 'clean' and 'gappyout' modes."""
    # 'all' mode on a fresh protein nexus alignment.
    buddy = Alb.trimal(alb_resources.get_one("o p n"), 'all')
    assert hf.buddy2hash(buddy) == "8faaf09741ddb3137653cb77ee66974a"

    # 'clean' mode after truncating the alignment to its first 5 records.
    buddy = alb_resources.get_one("o p n")
    buddy.alignments[0]._records = buddy.alignments[0]._records[:5]
    Alb.trimal(buddy, 'clean')
    assert hf.buddy2hash(buddy) == "93a2aa21e6baf5ca70eb2de52ae8dbea"

    # 'gappyout' mode; the input is also written to a temp dir first.
    buddy = alb_resources.get_one("o p n")
    work_dir = TEMPDIR.subdir()
    buddy.write("%s%strimal" % (work_dir, os.path.sep))
    trimmed = Alb.trimal(buddy, 'gappyout')
    assert hf.buddy2hash(trimmed) == "2877ecfb201fc35211a4625f34c7afdd"
    """ Probably not a good idea to be calling binaries like this...
    real_trimal = Popen("trimal -in %s%strimal -gappyout" % (work_dir, os.path.sep),
                        stdout=PIPE, shell=True).communicate()
    real_trimal = real_trimal[0].decode()
    with open("%s%strimal" % (work_dir, os.path.sep), "w") as ofile:
        ofile.write(real_trimal)
    buddy = Alb.AlignBuddy("%s%strimal" % (work_dir, os.path.sep))
    assert hf.buddy2hash(buddy) == "2877ecfb201fc35211a4625f34c7afdd"
    """

    # gappyout on an alignment where every column is gappy removes
    # every column, leaving empty sequences.
    gappy_records = [
        SeqRecord(Seq("A--G-")),
        SeqRecord(Seq("--T--")),
        SeqRecord(Seq("--TG-")),
        SeqRecord(Seq("A---C")),
    ]
    buddy = Alb.AlignBuddy([MultipleSeqAlignment(gappy_records)])
    Alb.trimal(buddy, "gappyout")
    assert "".join([str(rec.seq) for rec in buddy.records()]) == ""
def test_get_subalignment_sequence_order_maintained(self):
    """ Sequences given rearranged are still output in input order """
    # Ids are requested as ["s3", "s1"], but the sub-alignment must
    # follow the row order of self.alignment (s1 first, then s3).
    sub = AlignedSeq.get_sub_alignment_by_list_id(["s3", "s1"], self.alignment)
    wanted = MultipleSeqAlignment([self.alignment[0], self.alignment[2]])
    self.assertTrue(msas_equal(wanted, sub))
def identify_polymorphisms(self):
    """Record, for every column with more than one residue, the set of
    residues observed at that column (keyed by column index)."""
    self.alignment = MultipleSeqAlignment(self.sequences)
    n_cols = self.alignment.get_alignment_length()
    for position in range(n_cols):
        residues = set(self.alignment[:, position])
        if len(residues) > 1:
            self.polymorphisms[position] = residues
def test_get_subalignment_with_interval(self):
    """Selected records are column-sliced to the inclusive interval [0, 2]."""
    sub = AlignedSeq.get_sub_alignment_by_list_id(
        ["s2", "s3"], self.alignment, [0, 2]
    )
    wanted = MultipleSeqAlignment(
        [
            SeqRecord(Seq("C--"), id="s2"),
            SeqRecord(Seq("AAT"), id="s3"),
        ]
    )
    self.assertTrue(msas_equal(wanted, sub))
def get_sub_alignment_by_list_id(self, id_list: List[str], alignment: MultipleSeqAlignment, interval=None):
    """Return the records of `alignment` whose id is in `id_list`,
    preserving alignment row order; when a (truthy) `interval` is given,
    slice the result to the inclusive column range [interval[0], interval[1]]."""
    selected = MultipleSeqAlignment(
        [rec for rec in alignment if rec.id in id_list]
    )
    if interval:
        start, stop = interval[0], interval[1]
        selected = selected[:, start:stop + 1]
    return selected
def setUpClass(cls):
    """Shared 4-record test alignment (s2 is gapped, s4 has an ambiguous N)."""
    records = [
        SeqRecord(Seq("AAAT"), id="s1"),
        SeqRecord(Seq("C--C"), id="s2"),
        SeqRecord(Seq("AATT"), id="s3"),
        SeqRecord(Seq("GNGG"), id="s4"),
    ]
    cls.alignment = MultipleSeqAlignment(records)
def test_two_identical_sequences_clustered_together(self):
    """Identical s1/s2 land in one cluster; distinct s3 gets its own."""
    msa = MultipleSeqAlignment(
        [
            SeqRecord(Seq("AAAT"), id="s1"),
            SeqRecord(Seq("AAAT"), id="s2"),
            SeqRecord(Seq("C-CC"), id="s3"),
        ]
    )
    clusters = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 3], msa, 1)
    self.assertEqual([["s1", "s2"], ["s3"]], clusters)
def test_sequences_in_short_interval_separate_clusters(self):
    """With min_match_len=5, each 4-base sequence forms its own cluster."""
    msa = MultipleSeqAlignment(
        [
            SeqRecord(Seq("AAAT"), id="s1"),
            SeqRecord(Seq("AATT"), id="s2"),
            SeqRecord(Seq("AAGT"), id="s3"),
        ]
    )
    clusters = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 3], msa, 5)
    # Each sequence is below min_match_len (5), so goes into own cluster
    self.assertEqual([["s1"], ["s2"], ["s3"]], clusters)
def test_all_sequences_below_min_match_len(self):
    """Gapped sequences all shorter than min_match_len=6 each form a cluster."""
    msa = MultipleSeqAlignment(
        [
            SeqRecord(Seq("AA---AT"), id="s1"),
            SeqRecord(Seq("AA---TT"), id="s2"),
            SeqRecord(Seq("CA--CAT"), id="s3"),
        ]
    )
    clusters = AlignedSeq.kmeans_cluster_seqs_in_interval(
        [0, len(msa[0])], msa, 6
    )
    self.assertEqual([["s1"], ["s2"], ["s3"]], clusters)
def modeller_automodel(self, query: SeqRecord, results: Path, num_align: int,
                       atom_files_dir: Path):
    """
    Build homology models with MODELLER's automodel for the top
    `num_align` search results.

    :param query: query sequence record; its id must equal the id of the
        first record in each parsed alignment (asserted below)
    :param results: .npy pickle of search results; each r[-2][0] holds a
        clustal-formatted alignment string, r[-3] the template id
    :param num_align: number of leading results to model
    :param atom_files_dir: root of template .ent files, laid out by the
        two-character infix of the template id (aln[-1].id[2:4])

    Side effects: writes '<query id>_<k>.pdb' next to `results`
    (existing outputs are skipped); temporarily chdirs into a scratch
    directory while MODELLER runs, restoring cwd in `finally`.
    """
    from modeller import environ
    from modeller.automodel import automodel
    for model_index, r in enumerate(
            np.load(results, allow_pickle=True)[:num_align]):
        try:
            aln = AlignIO.read(StringIO(r[-2][0]), 'clustal')
        except Exception:
            # Fix: was a bare `except:`, which also swallowed
            # SystemExit / KeyboardInterrupt.
            logging.error(
                f'Failed to parse alignment: {r[0]} -> {r[2]} -> {r[4]} -> {r[6]}'
            )
            continue
        assert query.id == aln[0].id and aln[-1].id == r[-3]
        q_rec, t_rec = self._remove_gaps(aln[0], aln[-1])
        try:
            t_rec = self._remove_missing_res(
                t_rec,
                (atom_files_dir / aln[-1].id[2:4] /
                 f'{aln[-1].id}.ent').resolve().as_posix())
        except FileNotFoundError as e:
            # Template structure file absent: log and skip this result.
            logging.exception(e)
            continue
        # Build the two-record PIR alignment MODELLER expects; the
        # description lines encode the PIR "sequence"/"structureX" headers.
        q_rec.name, t_rec.name = '', ''
        q_rec.description = f'sequence:{q_rec.id}::::::::'
        t_rec.description = f'structureX:{t_rec.id}::{t_rec.id[-2].upper()}::{t_rec.id[-2].upper()}::::'
        aln = MultipleSeqAlignment([q_rec, t_rec])
        out_d = results.resolve().parent
        if (out_d / f'{aln[0].id}_{model_index+1}.pdb').exists():
            continue  # already modelled on a previous run
        cwd = os.getcwd()
        with tempfile.TemporaryDirectory() as tmpdir:
            try:
                # MODELLER writes its outputs into the cwd, so run it
                # inside a scratch directory.
                os.chdir(tmpdir)
                AlignIO.write(aln, 'aln.pir', 'pir')
                env = environ()
                env.io.atom_files_directory = [
                    (atom_files_dir / aln[1].id[2:4]).resolve().as_posix()
                ]
                mod = automodel(env, 'aln.pir',
                                knowns=[aln[1].id],
                                sequence=aln[0].id)
                mod.make()
                # Copy the generated model (*.B*.pdb) to the output dir.
                shutil.copy(
                    list(Path().glob('*.B*.pdb'))[0],
                    out_d / f'{aln[0].id}_{model_index+1}.pdb')
            except Exception as e:
                logging.error(
                    f'knowns=[{aln[1].id}], sequence={aln[0].id}')
                logging.exception(e)
            finally:
                os.chdir(cwd)
def _parse(self):
    """Populate the alignment and score attributes from TM-score stdout.

    Each attribute is set only when its line is present; absent metrics
    leave the attribute untouched.
    """
    # The two aligned sequences follow the '(":" denotes ...' banner,
    # at offsets +1 and +3.
    for idx, line in enumerate(self.stdout):
        if re.match(r'^\(":" denotes', line):
            rec_a = SeqRecord(
                Seq(self.stdout[idx + 1], alphabet=generic_protein),
                id='Protein_A')
            rec_b = SeqRecord(
                Seq(self.stdout[idx + 3], alphabet=generic_protein),
                id='Protein_B')
            self.alignment = MultipleSeqAlignment([rec_a, rec_b])
            break

    def scan(prefix, trailer, cast):
        # First line matching `prefix`: take text after '=', optionally
        # truncate at `trailer`, strip spaces, cast. None when absent.
        for line in self.stdout:
            if re.match(prefix, line):
                raw = line.split('=')[1]
                if trailer is not None:
                    raw = raw.split(trailer)[0]
                return cast(raw.replace(' ', ''))
        return None

    # (attribute, line prefix, trailing delimiter, cast)
    for attr, prefix, trailer, cast in (
        ('tmscore', '^TM-score', '(', float),
        ('maxsub', '^MaxSub-score', '(', float),
        ('gdtts', '^GDT-TS-score', '%', float),
        ('gdtha', '^GDT-HA-score', '%', float),
        ('rmsd', '^RMSD of', None, float),
        ('num_res_in_common', '^Number of residues in common', None, int),
        ('len_a', '^Structure1: ', '(', int),
        ('len_b', '^Structure2: ', '(', int),
    ):
        value = scan(prefix, trailer, cast)
        if value is not None:
            setattr(self, attr, value)
def test_one_long_one_short_sequence_separate_and_ordered_clusters(self):
    """Cluster order follows the row order of the input alignment."""
    msa = MultipleSeqAlignment(
        [
            SeqRecord(Seq("AATTAATTATATAATAAC"), id="s1"),
            SeqRecord(Seq("A--------------AAT"), id="s2"),
        ]
    )
    interval = [0, len(msa[0])]
    forward = AlignedSeq.kmeans_cluster_seqs_in_interval(interval, msa, 5)
    self.assertEqual(forward, [["s1"], ["s2"]])
    # Reversing the rows reverses the cluster order too.
    backward = AlignedSeq.kmeans_cluster_seqs_in_interval(
        interval, msa[::-1], 5
    )
    self.assertEqual(backward, [["s2"], ["s1"]])
def _parse(self):
    """Extract the TM-score pair and the aligned sequences from TMalign stdout."""
    # TM-score appears on two consecutive lines, one per normalization.
    for idx, line in enumerate(self.stdout):
        if re.match(r'^TM-score=', line):
            score_a = float(line.split(' ')[1])
            score_b = float(self.stdout[idx + 1].split(' ')[1])
            self.tmscore = (score_a, score_b)
            break
    # Aligned sequences follow the '(":" denotes ...' banner at +1 and +3.
    for idx, line in enumerate(self.stdout):
        if re.match(r'^\(":" denotes', line):
            name_a = Path(self.protein_A).stem
            name_b = Path(self.protein_B).stem
            rec_a = SeqRecord(
                Seq(self.stdout[idx + 1], alphabet=generic_protein),
                id=f'{name_a}&{name_b}',
                description=f'TM-score={self.tmscore[0]}')
            rec_b = SeqRecord(
                Seq(self.stdout[idx + 3], alphabet=generic_protein),
                id=f'{name_b}&{name_a}',
                description=f'TM-score={self.tmscore[1]}')
            self.alignment = MultipleSeqAlignment([rec_a, rec_b])
            break
def test_first_sequence_placed_in_first_cluster(self):
    """
    Runs kmeans clustering on randomly generated multiple sequence alignments
    """
    seq_len = 20
    num_seqs = 20
    bases = list(standard_bases)
    # Function has different behaviour at below and above seq_len
    # (`length` replaces the original shadowing reuse of seq_len).
    for length in [seq_len - 1, seq_len + 1]:
        with self.subTest(min_match_len=length):
            # Run on a number of random alignments
            for _ in range(20):
                records = []
                for i in range(num_seqs):
                    rand_seq = "".join(
                        [random.choice(bases) for _ in range(length)]
                    )
                    records.append(SeqRecord(Seq(rand_seq), id=f"s{i}"))
                msa = MultipleSeqAlignment(records)
                clusters = AlignedSeq.kmeans_cluster_seqs_in_interval(
                    [0, length - 1], msa, 1
                )
                self.assertTrue(clusters[0][0] == "s0")
def _local_align(self, record_a: SeqRecord, record_b: SeqRecord, open_gap_score: int):
    """Locally align two records (ungapped, upper-cased) with BLOSUM62 and
    return the best alignment as a two-record MSA.

    The alignment's score/path/aligned are carried in the MSA annotations.
    """
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'
    aligner.substitution_matrix = substitution_matrices.load('BLOSUM62')
    aligner.open_gap_score = open_gap_score
    aligner.extend_gap_score = -1
    best = aligner.align(
        record_a.seq.ungap('-').upper(),
        record_b.seq.ungap('-').upper())[0]
    # str(best) renders three lines: seq_a / match line / seq_b; the
    # flanking-unaligned padding spaces are turned into gap characters.
    rendered = str(best).splitlines()
    top = Seq(rendered[0].replace(' ', '-'), generic_protein)
    bottom = Seq(rendered[2].replace(' ', '-'), generic_protein)
    meta = {
        'score': best.score,
        'path': best.path,
        'aligned': best.aligned,
    }
    return MultipleSeqAlignment(
        [SeqRecord(top, id=record_a.id), SeqRecord(bottom, id=record_b.id)],
        annotations=meta)
# NOTE(review): Python 2 script fragment (print statements, raw_input).
# The first statement below is the tail of a string-building expression
# whose start lies outside this chunk, and the enclosing loop header for
# `n`/`counter` is not visible. Indentation reconstructed — verify
# against the original script.
counter - 1] + " but gene " + keyList[n] + " (has " + j + ")\n"
n = n + 1
counter = counter + 1
print "I can correct these spelling mistakes...If you want me to..;) \n Do you?? \n 1. Yes \n 2. No \n"
choice = input('\n')
# Interactive correction loop: repeats while the user keeps answering 1.
while choice == 1:
    print "Enter the correct taxon name \n"
    usrInp = raw_input('\n')
    for filename in fileList:
        handle = open(filename, 'rU')
        record = list(SeqIO.parse(handle, 'nexus'))
        msa = MultipleSeqAlignment(record)
        for i, val in enumerate(msa):
            # Fuzzy match: fraction of position-wise equal characters
            # strictly between 0.8 and 1.0 is treated as a misspelling
            # of the user-supplied taxon name.
            if 1.0 > float([x == y for (x, y) in zip(usrInp, msa[i].id)
                            ].count(True)) / len(msa[i].id) > 0.8:
                msa[i].id = usrInp
            else:
                print "No Spelling mistakes found in file %s \n" % filename
        # Rewrite the file in the format chosen earlier (typeList index).
        fp = open(filename, 'w')
        SeqIO.write(msa, fp, typeList[file_format - 1])
        handle.close()
        fp.close()
    print "Want me to do some more editing?? \n Well! I can do this whole day \n What about you?? \n 1. Yes \n 2. No \n"
    choice = input('\n')
#print(dir(multiple_alignment)) #sys.exit(0) seqs = [] for seqrec in multiple_alignment: #print(dir(seqrec)) try: name_id = seqrec.id.partition(" ")[0].partition(".")[0] #print(name_id) seqrec.id = organisms[name_id] if seqrec.id in organisms.values(): seqs.append(seqrec) except: continue new_alignment = MultipleSeqAlignment(records=seqs) #print(len(new_alignment)) AlignIO.write(new_alignment, "mm9_" + rg.name + ".fa", "fasta") process = subprocess.Popen([ "/home/joseph/Apps/PhyloCSF/PhyloCSF", "29mammals", "mm9_" + rg.name + ".fa", "--removeRefGaps", "--strategy=omega", "--orf=StopStop3", "--minCodons=25", "--frames=3" ], stdout=subprocess.PIPE) out, err = process.communicate() print(out) #print(out.split("\t")[2]) #print(out.split("\t")[3]) #print(out.split("\t")[4])
# NOTE(review): fragment — everything down to `alignment.append(seq)` is
# the body of a per-sequence loop whose `for` header lies outside this
# chunk (the `continue` below proves it); `seq`, `refAA`,
# `seqCDS_ungapped`, `seq5pUTR`, `seq3pUTR` come from that loop.
# The final AlignIO.write runs after the loop. Indentation
# reconstructed — verify against the original script.
seqAA = safe_translate(seqCDS_ungapped)
scoreAA, refalnAA, seqalnAA = align_pairwise(refAA, seqAA)
# Reject poor translations: negative alignment score, more than 5
# stop/ambiguous residues, or more than 5 gaps opened in the reference.
if scoreAA < 0 or sum(
        seqAA.count(x) for x in ['*', 'X']) > 5 or refalnAA.count('-') > 5:
    print(seq.id, "didn't translate properly")
    continue
# Rebuild the nucleotide alignment codon-by-codon from the protein
# alignment, prefixed with the (already aligned) 5' UTR.
seqCDS_aln = seq5pUTR
pos = 0
for aa_ref, aa_seq in zip(refalnAA, seqalnAA):
    if aa_seq == '-':
        seqCDS_aln += '---'
        # if the nucleotide sequence is gapped
        # (i.e. because of missing data at the 5p and 3p end, advance pos)
        if seqCDS_ungapped[pos:pos + 3] == '---':
            pos += 3
    else:
        if len(seqCDS_ungapped) >= pos + 3:
            seqCDS_aln += seqCDS_ungapped[pos:pos + 3]
        else:
            # Ran off the end of the CDS: pad with a gap codon.
            seqCDS_aln += '---'
        pos += 3
seq.seq = Seq.Seq(''.join(seqCDS_aln) + seq3pUTR)
alignment.append(seq)
# output
AlignIO.write(MultipleSeqAlignment(alignment), args.output, 'fasta')
def test_ambiguous_sequences_in_short_interval_separate_clusters(self):
    """Ambiguity codes (R, W) don't merge short sequences into one cluster."""
    msa = MultipleSeqAlignment(
        [
            SeqRecord(Seq("ARAT"), id="s1"),
            SeqRecord(Seq("WAAT"), id="s2"),
        ]
    )
    clusters = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 3], msa, 5)
    self.assertEqual([["s1"], ["s2"]], clusters)
def test_get_subalignment_sequence_order_maintained2(self):
    """Ids requested in alignment order come back in that same order."""
    sub = AlignedSeq.get_sub_alignment_by_list_id(["s1", "s3"], self.alignment)
    wanted = MultipleSeqAlignment([self.alignment[0], self.alignment[2]])
    self.assertTrue(msas_equal(wanted, sub))
def test_one_seq_returns_single_id(self):
    """A single-record alignment yields one singleton cluster."""
    msa = MultipleSeqAlignment([SeqRecord(Seq("AAAT"), id="s1")])
    clusters = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 3], msa, 1)
    self.assertEqual(clusters, [["s1"]])
def impute_ancestors_dnapars(seqs, gl_seq, scratch_dir, gl_name='germline', verbose=True):
    """
    Compute ancestral states via maximum parsimony (PHYLIP dnapars).

    NOTE(review): Python 2 code (uses a `print` statement) that shells
    out to the external `dnapars` binary via `subprocess.call`.

    @param seqs: list of sequences
    @param gl_seq: germline sequence
    @param scratch_dir: where to write intermediate dnapars files
    @param gl_name: name of germline (must be less than 10 characters long)
    @param verbose: when True, echo the shell command before running it

    @return genes_line: information needed to output imputed germline data
    @return seqs_line: information needed to output imputed sequence data
    """
    from gctree.bin.phylip_parse import parse_outfile

    # PHYLIP's strict format truncates names at 10 characters.
    assert (len(gl_name) < 10)

    infile, config, outfile = [
        os.path.join(scratch_dir, fname) for fname in [
            'infile',
            'dnapars.cfg',
            'outfile',
        ]
    ]

    aln = MultipleSeqAlignment([SeqRecord(Seq(gl_seq), id=gl_name)])

    # sequence ID must be less than ten characters, but also dnapars sets internal node
    # names to 1, 2, 3, ..., so name them numbers descending from 100 million, hoping
    # we won't ever have a clone that big...
    for idx, seq in enumerate(seqs):
        aln.append(SeqRecord(Seq(seq), id=str(99999999 - idx)))

    # dnapars uses the name "infile" as default input phylip file
    with open(infile, 'w') as phylip_file:
        phylip_file.write(aln.format('phylip'))

    # and we need to tell it the line where the root sequence occurs
    # NOTE(review): if no line starts with gl_name, naive_idx is never
    # bound and the config write below raises NameError.
    with open(infile, 'r') as phylip_file:
        for lineno, line in enumerate(phylip_file):
            if line.startswith(gl_name):
                naive_idx = str(lineno)

    # arcane user options for dnapars
    # 'O', naive_idx: the location of the outgroup root
    # 'S', 'Y': less thorough search; runs much faster but output is less exhaustive
    # 'J', 13, 10: randomize input ("jumble") using seed 13 and jumbling 10 times
    # 4: print out steps in each site (to get all nucleotide info)
    # 5: print sequences in at all nodes (to get ancestors)
    # '.': use dot-differencing for display
    # 'Y': accept these options
    with open(config, 'w') as cfg_file:
        cfg_file.write('\n'.join(
            ['O', naive_idx, 'S', 'Y', 'J', '13', '10', '4', '5', '.', 'Y']))

    # defer to command line to construct parsimony trees and ancestral states
    # dnapars has weird behavior if outfile and outtree already exist o_O
    cmd = [
        'cd', scratch_dir, '&& rm -f outfile outtree && dnapars <',
        os.path.basename(config), '> dnapars.log'
    ]
    if verbose:
        print "Calling:", " ".join(cmd)
    res = subprocess.call([" ".join(cmd)], shell=True)

    # phew, finally got some trees
    trees = parse_outfile(outfile, countfile=None, naive=gl_name)

    # take first parsimony tree
    genes_line = []
    seq_line = []
    for idx, descendant in enumerate(trees[0].traverse('preorder')):
        if descendant.is_root():
            descendant.name = gl_name
        else:
            # use dummy name for internal node sequences
            descendant.name = '-'.join([descendant.up.name, descendant.name])
            if [descendant.up.name, descendant.up.sequence.lower()] not in genes_line:
                genes_line.append(
                    [descendant.up.name, descendant.up.sequence.lower()])
            seq_line.append([
                descendant.up.name, descendant.name,
                descendant.sequence.lower()
            ])
    return genes_line, seq_line
def test_two_seqs_one_below_min_match_len_separate_clusters(self):
    """A gapped sequence shorter than min_match_len gets its own cluster."""
    msa = MultipleSeqAlignment(
        [
            SeqRecord(Seq("AATTTAT"), id="s1"),
            SeqRecord(Seq("AA---AT"), id="s2"),
        ]
    )
    clusters = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 5], msa, 5)
    self.assertEqual(clusters, [["s1"], ["s2"]])
def get_sub_alignment_by_list_id(self, list_of_id, interval=None):
    """Return records of self.alignment whose id is in `list_of_id`
    (alignment row order preserved); when a truthy `interval` is given,
    slice to the inclusive column range [interval[0], interval[1]]."""
    chosen = [rec for rec in self.alignment if rec.id in list_of_id]
    result = MultipleSeqAlignment(chosen)
    if interval:
        result = result[:, interval[0]: interval[1] + 1]
    return result
class PolymorphismSampler(object):
    """Latin-hypercube-style subsampler: picks a minimal-ish subset of
    sequences that together cover every observed per-column polymorphism."""

    def __init__(self):
        super(PolymorphismSampler, self).__init__()
        # self.sequences: filtered SeqRecords (set by read_sequences)
        # self.alignment: MSA built from self.sequences
        # self.polymorphisms: {column index: set of residues} with >1 residue
        # self.subsampled: chosen SeqRecords
        # self.num_positions / self.num_polymorphisms: per-iteration trace
        self.sequences = None
        self.alignment = None
        self.polymorphisms = dict()
        self.subsampled = set()
        self.num_positions = []
        self.num_polymorphisms = []

    def read_sequences(self, handle):
        """Load FASTA records, keeping only those whose length equals the
        most common length (so they can form an alignment)."""
        sequences = [s for s in SeqIO.parse(handle, 'fasta')]
        counts = Counter([len(s) for s in sequences])
        mc_length = counts.most_common(1)[0][0]
        filtered_sequences = list()
        for s in sequences:
            if len(s.seq) == mc_length:
                filtered_sequences.append(s)
        self.sequences = filtered_sequences

    def identify_polymorphisms(self):
        """Record, per column, the set of residues when more than one occurs."""
        self.alignment = MultipleSeqAlignment(self.sequences)
        for col in range(self.alignment.get_alignment_length()):
            polmorphs = set(self.alignment[:,col])
            if len(polmorphs) > 1:
                self.polymorphisms[col] = polmorphs

    def number_of_polymorphisms(self):
        """Total count of residues still uncovered across all columns."""
        total_polymorphisms = 0
        for pos, polymorphs in self.polymorphisms.items():
            total_polymorphisms += len(polymorphs)
        return total_polymorphisms

    def subsample(self):
        """Greedily pick random sequences until every polymorphism is covered.

        Each iteration picks a random (column, residue) pair, then a random
        sequence carrying that residue; all polymorphisms that sequence
        covers are removed. Terminates when none remain (or on IndexError
        from an empty random choice).
        """
        while len(self.polymorphisms.keys()) > 0:
            try:
                # Choose a seqrecord at random, based on LH sampling criteria.
                pos = choice(list(self.polymorphisms.keys()))
                letter = choice(list(self.polymorphisms[pos]))
                filtered = MultipleSeqAlignment([s for s in self.alignment
                                                 if s[pos] == letter])
                seqrecord = choice(filtered)
                self.subsampled.add(seqrecord)
                # Remove polymorphisms
                # (every residue of the chosen sequence covers the
                # corresponding column's entry, if still present)
                for pos in self.polymorphisms.keys():
                    if seqrecord.seq[pos] in self.polymorphisms[pos]:
                        self.polymorphisms[pos].remove(seqrecord.seq[pos])
                # Update data
                self.polymorphisms = {k:v for k,v in
                                      self.polymorphisms.items()
                                      if len(v) > 0}
                self.num_polymorphisms.append(self.number_of_polymorphisms())
                self.num_positions.append(len(self.polymorphisms.keys()))
            except IndexError:
                break
def _aligned_number(self, aln: MultipleSeqAlignment):
    """Count alignment columns that contain no gap character in any row."""
    gapless = 0
    for col in range(aln.get_alignment_length()):
        if '-' not in aln[:, col]:
            gapless += 1
    return gapless
def search_layer2_evalue_sum_i_blast_merged(self, query: SeqRecord, graphml: Path, num_align: int):
    """Find query -> UniRef50 -> UniRef50 -> SCOP95 paths in a similarity
    graph, then stitch the three pairwise DELTA-BLAST alignments along each
    of the best `num_align` paths into a single merged six-row alignment.

    Each result r is [query_id, xml1, nei1, xml2, nei2, xml3, nei3,
    (clustal_text | None, None), summed_evalue]; r[7] is (None, None) when
    the final BLAST finds no hit or the merged rows end up with unequal
    lengths. Requires the external `cdd_delta` RPS database.
    """
    graph = networkx.read_graphml(graphml)
    path = []
    # Enumerate 3-hop paths whose intermediate nodes are UniRef50 entries
    # and whose endpoint is a SCOP95 entry; score = sum of edge e-values.
    for nei1 in graph.neighbors(query.id):
        if 'UniRef50' not in graph.nodes[nei1]['labels'].split(':'):
            continue
        for nei2 in graph.neighbors(nei1):
            if 'UniRef50' not in graph.nodes[nei2]['labels'].split(':'):
                continue
            for nei3 in graph.neighbors(nei2):
                if 'SCOP95' not in graph.nodes[nei3]['labels'].split(':'):
                    continue
                score = graph.get_edge_data(query.id, nei1)['evalue'] \
                    + graph.get_edge_data(nei1, nei2)['evalue'] \
                    + graph.get_edge_data(nei2, nei3)['evalue']
                path.append([
                    query.id, None, nei1, None, nei2, None, nei3, None,
                    score
                ])
    path = sorted(path, key=lambda _: _[-1])
    # dedup: keep only the best-scoring path per SCOP95 endpoint (p[-3]).
    results, seen = [], []
    for p in path:
        if p[-3] in seen:
            continue
        seen.append(p[-3])
        results.append(p)
    for r in results[:num_align]:
        # Fetch sequences for the path nodes (query may appear mid-path).
        n1_seq = query if r[2] == query.id else self._get_seq(
            'uniref50', r[2])
        n2_seq = query if r[4] == query.id else self._get_seq(
            'uniref50', r[4])
        n3_seq = query if r[6] == query.id else self._get_seq(
            'scop95', r[6])
        with tempfile.TemporaryDirectory() as t:
            tmpdir = Path(t)
            # Hop 1: query vs nei1.
            SeqIO.write(query, tmpdir / 'query.fasta', 'fasta')
            SeqIO.write(n1_seq, tmpdir / 'n1.fasta', 'fasta')
            r[1], _ = NcbideltablastCommandline(
                query=(tmpdir / 'query.fasta').as_posix(),
                subject=(tmpdir / 'n1.fasta').as_posix(),
                use_sw_tback=True,
                outfmt=5,
                rpsdb='cdd_delta')()
            hsp1 = SearchIO.read(StringIO(r[1]), 'blast-xml')[0][0]
            rec1, rec2 = hsp1.aln[0], hsp1.aln[1]
            seq1, seq2 = rec1.seq.tomutable(), rec2.seq.tomutable()
            aln1 = hsp1.aln
            # Hop 2: aligned nei1 vs nei2.
            SeqIO.write(aln1[1], tmpdir / 'n1.fasta', 'fasta')
            SeqIO.write(n2_seq, tmpdir / 'n2.fasta', 'fasta')
            r[3], _ = NcbideltablastCommandline(
                query=(tmpdir / 'n1.fasta').as_posix(),
                subject=(tmpdir / 'n2.fasta').as_posix(),
                use_sw_tback=True,
                outfmt=5,
                rpsdb='cdd_delta')()
            hsp2 = SearchIO.read(StringIO(r[3]), 'blast-xml')[0][0]
            rec3, rec4 = hsp2.aln[0], hsp2.aln[1]
            seq3, seq4 = rec3.seq.tomutable(), rec4.seq.tomutable()
            aln2 = hsp2.aln
            # Hop 3: aligned nei2 vs nei3 (the SCOP95 template).
            SeqIO.write(aln2[1], tmpdir / 'n2.fasta', 'fasta')
            SeqIO.write(n3_seq, tmpdir / 'n3.fasta', 'fasta')
            r[5], _ = NcbideltablastCommandline(
                query=(tmpdir / 'n2.fasta').as_posix(),
                subject=(tmpdir / 'n3.fasta').as_posix(),
                use_sw_tback=True,
                outfmt=5,
                rpsdb='cdd_delta')()
            try:
                hsp3 = SearchIO.read(StringIO(r[5]), 'blast-xml')[0][0]
            except IndexError:
                # No HSP for the last hop: record an empty merged result.
                print(hsp1)
                r[7] = (None, None)
                continue
            rec5, rec6 = hsp3.aln[0], hsp3.aln[1]
            seq5, seq6 = rec5.seq.tomutable(), rec6.seq.tomutable()
            # Left-pad the later alignments so their coordinates start at
            # the column where the previous hop's local alignment began.
            seq3 = '-' * hsp2.query_start + seq3
            seq4 = '-' * hsp2.query_start + seq4
            seq5 = '-' * (hsp2.query_start + hsp3.query_start) + seq5
            seq6 = '-' * (hsp2.query_start + hsp3.query_start) + seq6
            # Merge hop-1 and hop-2 rows column-by-column, propagating
            # gaps in either direction so seq2 and seq3 stay in register.
            for i in range(hsp2.query_start, len(seq2)):
                if i >= len(seq3):
                    seq3.append('-')
                    seq4.append('-')
                elif seq2[i] == '-' and seq3[i] != '-':
                    seq3.insert(i, '-')
                    seq4.insert(i, '-')
                elif seq3[i] == '-' and seq2[i] != '-':
                    seq2.insert(i, '-')
                    seq1.insert(i, '-')
            append = len(seq2) - len(seq3)
            seq3 = seq3 + '-' * append
            seq4 = seq4 + '-' * append
            # Same merge between hop-2 and hop-3 rows; gap insertions here
            # must ripple back through all earlier rows.
            for i in range(hsp2.query_start + hsp3.query_start, len(seq4)):
                if i >= len(seq5):
                    seq5.append('-')
                    seq6.append('-')
                elif seq4[i] == '-' and seq5[i] != '-':
                    seq5.insert(i, '-')
                    seq6.insert(i, '-')
                elif seq5[i] == '-' and seq4[i] != '-':
                    seq4.insert(i, '-')
                    seq3.insert(i, '-')
                    seq2.insert(i, '-')
                    seq1.insert(i, '-')
            append = len(seq4) - len(seq5)
            seq5 = seq5 + '-' * append
            seq6 = seq6 + '-' * append
            rec1.seq = seq1
            rec2.seq = seq2
            rec3.seq = seq3
            rec4.seq = seq4
            rec5.seq = seq5
            rec6.seq = seq6
            try:
                r[7] = (MultipleSeqAlignment(
                    [rec1, rec2, rec3, rec4, rec5,
                     rec6]).format('clustal'), None)
            except ValueError:
                # Rows ended up with unequal lengths; give up on this path.
                print(hsp1)
                r[7] = (None, None)
                continue
    return results
def test_two_identical_seqs_returns_two_ids_clustered(self):
    """Two identical sequences collapse into one two-member cluster."""
    msa = MultipleSeqAlignment(
        [
            SeqRecord(Seq("AAAT"), id="s1"),
            SeqRecord(Seq("AAAT"), id="s2"),
        ]
    )
    clusters = AlignedSeq.kmeans_cluster_seqs_in_interval([0, 3], msa, 1)
    self.assertEqual(clusters, [["s1", "s2"]])