def setUp(self):
    """Build a local-mode PairwiseAligner and store it as ``self.aligner``.

    Scoring configured here: match +1, mismatch -1, internal gap opening -1,
    internal gap extension -0.0.
    """
    local_aligner = PairwiseAligner()
    # (attribute, value) pairs, applied in the order listed.
    settings = (
        ("internal_open_gap_score", -1),
        ("internal_extend_gap_score", -0.0),
        ("match_score", +1),
        ("mismatch_score", -1),
        ("mode", "local"),
    )
    for attribute, value in settings:
        setattr(local_aligner, attribute, value)
    self.aligner = local_aligner
def bioPython_default_local_aligner(a, b):
    """Locally align the sequences stored in two FASTA files.

    The files are ``./resource/fasta<a>.fasta`` and
    ``./resource/fasta<b>.fasta``; each is loaded with ``SeqIO.read``, which
    expects exactly one record per file.

    The aligner runs in local mode with match +2, mismatch -3,
    gap opening -7 and gap extension -2.

    Parameters
    ----------
    a, b
        Values converted with ``str()`` and inserted into the file names.

    Returns
    -------
    The alignments object produced by ``PairwiseAligner.align`` for the two
    sequences. (Previously the result was computed and then discarded, so
    the function always returned ``None``.)
    """
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = 2
    aligner.mismatch_score = -3
    aligner.open_gap_score = -7
    aligner.extend_gap_score = -2

    sequence1 = SeqIO.read('./resource/fasta' + str(a) + '.fasta', 'fasta')
    sequence2 = SeqIO.read('./resource/fasta' + str(b) + '.fasta', 'fasta')

    # Hand the result back to the caller instead of dropping it.
    return aligner.align(sequence1.seq, sequence2.seq)
def get_clusters_from_seqlist(seqlist, dist_threshold=0.05):
    """Cluster a list of sequences by a distance identity threshold

    Parameters
    ----------
    seqlist : list
        list of sequences as str
    dist_threshold : float
        Max distance value to retain, branches above this length in the
        hierarchical clustering tree will be cut.

    Returns
    -------
    list
        list of lists - input sequences now grouped by cluster
    list
        cluster memberships of the originally input list, one id per input
        sequence (both lists are empty when ``seqlist`` is empty)
    """
    nseqs = len(seqlist)
    if nseqs == 0:
        # Nothing to cluster; avoid handing an empty matrix to treecluster
        return ([], [])
    if nseqs == 1:
        # Skip alignment if there is only one sequence
        return ([seqlist], [0])

    aligner = PairwiseAligner()
    aligner.mode = "local"

    # Convert sequence list to a full, symmetric distance matrix. The score
    # of a pair does not depend on argument order with the aligner's
    # symmetric default scoring, so each pair is scored once and mirrored.
    distmatrix = [[0.0] * nseqs for _ in range(nseqs)]
    for i, seq1 in enumerate(seqlist):
        for j in range(i, nseqs):
            seq2 = seqlist[j]
            maxlen = max(len(seq1), len(seq2))
            # Identity = local alignment score over the longer sequence of
            # the pair, using PairwiseAligner's default scoring (the original
            # author describes it as +1 per matched base and 0 otherwise --
            # NOTE(review): confirm for the installed Biopython version).
            # aligner.score() yields the score without building alignments.
            idval = aligner.score(seq1, seq2) / maxlen
            distval = 1 - idval  # convert to distance fraction
            distmatrix[i][j] = distval
            distmatrix[j][i] = distval

    # Hierarchical clustering from the distance matrix
    htree = treecluster(data=None, distancematrix=array(distmatrix))

    # Number of clusters = number of branches longer than the threshold + 1
    long_branches = sum(
        1 for k in range(len(htree)) if htree[k].distance > dist_threshold
    )
    clust_ids = list(htree.cut(long_branches + 1))

    # Group the input sequences by the cluster id assigned to each of them
    clust_seqs_dict = defaultdict(list)
    for clust_id, seq in zip(clust_ids, seqlist):
        clust_seqs_dict[clust_id].append(seq)

    # Convert dict of lists to list of lists (first-seen cluster order)
    clust_seqs = list(clust_seqs_dict.values())
    return (clust_seqs, clust_ids)
def pairwise(self, potential_parent):
    """Score a local pairwise alignment of this leaf against a candidate parent.

    Gap characters ('-') are removed from both sequences before scoring.
    The resulting score is stored in ``self.parent_scores`` under the
    original, unmodified ``potential_parent`` string.

    Args:
        potential_parent (str): sequence of the candidate parent
    """
    local_aligner = PairwiseAligner()
    # Use local alignment
    local_aligner.mode = 'local'
    # Strip the gap symbols from both sequences
    ungapped_leaf = self.seq.replace('-', '')
    ungapped_parent = potential_parent.replace('-', '')
    # Align and remember the score for this candidate
    self.parent_scores[potential_parent] = local_aligner.score(
        ungapped_leaf, ungapped_parent
    )
def perform_randomized_tests(n=1000):
    """Run the randomized mapping tests that are compared against pslMap.

    Each of the n rounds draws two random block counts (1 to 10) and then
    performs 8 mappings: ``test_random`` and ``test_random_sequences`` are
    each called once for every combination of "+" and "-". The alignments
    of the randomly generated sequences are obtained in PSL format and the
    result is compared to that of pslMap.
    """
    aligner = PairwiseAligner()
    aligner.internal_open_gap_score = -1
    aligner.internal_extend_gap_score = -0.0
    aligner.match_score = +1
    aligner.mismatch_score = -1
    aligner.mode = "local"

    # The four sign combinations, in the order they are exercised.
    sign_pairs = (("+", "+"), ("+", "-"), ("-", "+"), ("-", "-"))

    for _ in range(n):
        nBlocks1 = random.randint(1, 10)
        nBlocks2 = random.randint(1, 10)
        for first, second in sign_pairs:
            test_random(aligner, nBlocks1, nBlocks2, first, second)
        for first, second in sign_pairs:
            test_random_sequences(first, second)
type=str, required=True) parser.add_argument('-r', '--reference', help='Reference to be aligned to', type=str, required=True) parser.add_argument('-n', '--seq_name', help='Name of the aligned sequence', type=str, required=True) args = parser.parse_args() aligner = PairwiseAligner() aligner.mode = 'global' aligner.match_score = 1 aligner.mismatch_score = 0 aligner.open_gap_score = -2 aligner.extend_gap_score = -1 ref = SeqIO.read(args.reference, "fasta") ref.seq = str(ref.seq.upper()).replace('-', 'N') cons = SeqIO.read(args.infile, "fasta") aln = aligner.align(ref.seq, cons.seq) with open(args.outfile, 'w') as out: print(">", args.seq_name, file=out) print(str(aln[0]).strip().split('\n')[2], file=out)
args.open_gap_score = -5 if not args.extend_gap_score: args.extend_gap_score = -2 else: sub_matrix = getattr(import_module('Bio.SubsMat.MatrixInfo'), args.sub_matrix) aligners['global'].substitution_matrix = sub_matrix if not args.open_gap_score: args.open_gap_score = -11 if not args.extend_gap_score: args.extend_gap_score = -1 aligners['global'].open_gap_score = args.open_gap_score aligners['global'].extend_gap_score = args.extend_gap_score if args.sim_algo == 'smith-waterman': aligners['local'] = PairwiseAligner() aligners['local'].mode = 'local' if args.seq_type in ('dna', 'rna'): aligners['local'].match = args.match_score aligners['local'].mismatch = args.mismatch_score else: aligners['local'].substitution_matrix = sub_matrix aligners['local'].open_gap_score = args.open_gap_score aligners['local'].extend_gap_score = args.extend_gap_score # Karlin-Altschul parameter values if args.seq_type in ('dna', 'rna'): if ((args.match_score, args.mismatch_score) in KA_PARAMS['na'] and (abs(args.open_gap_score), abs(args.extend_gap_score)) in KA_PARAMS['na'][(args.match_score, args.mismatch_score)]): args.ka_gapped_l = KA_PARAMS['na'][(args.match_score, args.mismatch_score)][(
# # DEMO: Izračun lastne matrike in njena uporaba # # V spodnjem primeru je prikazan izračun lastne matrike (enako, kot pri prejšnji vaji - [VAJA: Izračun matrike zamenjav (Python)](matrika_zamenjav.ipynb)) ter uporaba tako izračunane matrike za poravnavo dveh zaporedij. # # ## Izračun matrike # In[1]: from Bio import SeqIO sequence1 = SeqIO.read('vhod/matrika_zamenjav-myoglobin_horse.fasta', 'fasta') sequence2 = SeqIO.read('vhod/matrika_zamenjav-myoglobin_rat.fasta', 'fasta') # v mapi vhod sta tudi zaporedji mišjega in človeškega nebulina, ki sta bistveno daljši from Bio.Align import PairwiseAligner aligner = PairwiseAligner() aligner.mode = 'local' aligner.match_score = 2 aligner.mismatch_score = -3 aligner.open_gap_score = -7 aligner.extend_gap_score = -2 alignments = aligner.align(sequence1.seq, sequence2.seq) alignment = alignments[0] from Bio.Align.substitution_matrices import Array frequency = Array('ACGT', dims=2) for (start1, end1), (start2, end2) in zip(*alignment.aligned): seq1 = sequence1[start1:end1] seq2 = sequence2[start2:end2] for c1, c2 in zip(seq1, seq2): frequency[c1, c2] += 1 import numpy probabilities = frequency / numpy.sum(frequency)