def bioPython_default_local_aligner(a, b):
    """Locally align two FASTA sequences selected by their file suffixes.

    The input files are ``./resource/fasta<a>.fasta`` and
    ``./resource/fasta<b>.fasta`` (no path separator is inserted between
    ``fasta`` and the suffix, so ``a=1`` selects ``./resource/fasta1.fasta``).

    Parameters
    ----------
    a, b
        Values converted with ``str()`` and spliced into the two file names.

    Returns
    -------
    The object returned by ``PairwiseAligner.align`` for the two sequences.
    """
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = 2
    aligner.mismatch_score = -3
    aligner.open_gap_score = -7
    aligner.extend_gap_score = -2

    sequence1 = SeqIO.read('./resource/fasta' + str(a) + '.fasta', 'fasta')
    sequence2 = SeqIO.read('./resource/fasta' + str(b) + '.fasta', 'fasta')

    # Hand the result back to the caller instead of dropping it on the floor.
    return aligner.align(sequence1.seq, sequence2.seq)
def nw_bio(seq1, seq2, cost_table):
    """Align two sequences with flat match/mismatch/gap scores.

    The aligner mode is not set here, so the ``PairwiseAligner`` default
    mode applies.

    Parameters
    ----------
    seq1, seq2
        Sequences to align; they are concatenated with ``+`` to derive the
        aligner alphabet, so both must support that operation.
    cost_table
        Indexable scores: ``[0]`` match, ``[1]`` mismatch, ``[2]`` gap.

    Returns
    -------
    list
        One ``[line0, line2, score]`` entry per alignment, where ``line0``
        and ``line2`` are the first and third lines of ``str(alignment)``
        and ``score`` is the alignment score converted with ``int()``.
    """
    aligner = PairwiseAligner(alphabet=list(set(seq1 + seq2)))
    aligner.match_score = cost_table[0]
    aligner.mismatch_score = cost_table[1]
    aligner.gap_score = cost_table[2]

    formated_alignments = []
    # Iterate the alignments directly: this avoids len(), which can overflow
    # when the number of optimal alignments is huge, and fetches each
    # alignment only once.
    for alignment in aligner.align(seq1, seq2):
        # NOTE(review): lines 0 and 2 are presumably the gapped target and
        # query rows; the printed layout depends on the Biopython version.
        als = str(alignment).split("\n")
        formated_alignments.append([als[0], als[2], int(alignment.score)])
    return formated_alignments
def get_clusters_from_seqlist(seqlist, dist_threshold=0.05):
    """Cluster a list of sequences by a distance identity threshold

    Pairwise distances are ``1 - score / max(len(seq1), len(seq2))`` where
    ``score`` is the local alignment score from a default-scored
    ``PairwiseAligner``; the distance matrix is then clustered
    hierarchically and the tree is cut at branches longer than
    ``dist_threshold``.

    Parameters
    ----------
    seqlist : list
        list of sequences as str
    dist_threshold : float
        Max distance value to retain, branches above this length in the
        hierarchical clustering tree will be cut.

    Returns
    -------
    list
        list of lists - input sequences now grouped by cluster
    list
        list of int - cluster memberships of the originally input list

    An empty ``seqlist`` yields ``([], [])``.
    """
    n_seqs = len(seqlist)
    if n_seqs == 0:
        # Nothing to cluster
        return ([], [])
    if n_seqs == 1:
        # Skip alignment if there is only one sequence
        return ([seqlist], [0])

    aligner = PairwiseAligner()
    aligner.mode = "local"

    # Convert sequence list to distance matrix. The aligner uses the same
    # scores for both sequences, so score(a, b) == score(b, a): only the
    # upper triangle is computed and each value is mirrored.
    distmatrix = [[0.0] * n_seqs for _ in range(n_seqs)]
    for i, seq1 in enumerate(seqlist):
        for j in range(i, n_seqs):
            seq2 = seqlist[j]
            maxlen = max(len(seq1), len(seq2))
            # Take percentage identity of pairwise alignment score over the
            # longer sequence in pair (relies on the aligner's default
            # scoring). aligner.score() gives the same value as
            # aligner.align(...).score without building alignment objects.
            idval = aligner.score(seq1, seq2) / maxlen
            distval = 1 - idval  # convert to distance fraction
            distmatrix[i][j] = distval
            distmatrix[j][i] = distval

    # Hierarchical clustering from the distance matrix
    htree = treecluster(data=None, distancematrix=array(distmatrix))

    # Find number of branches with length longer than threshold, and add 1
    # to get number of cuts
    cuts = 1 + sum(
        1 for i in range(len(htree)) if htree[i].distance > dist_threshold
    )
    clust_ids = list(htree.cut(cuts))

    # Group sequences by cluster id, keeping first-seen cluster order
    clust_seqs_dict = defaultdict(list)
    for seq, clust_id in zip(seqlist, clust_ids):
        clust_seqs_dict[clust_id].append(seq)

    # Convert dict of lists to list of lists
    clust_seqs = list(clust_seqs_dict.values())
    return (clust_seqs, clust_ids)
def nw_bio_mat(seq1, seq2, cost_mat, key):
    """Align two sequences using a flattened substitution matrix.

    The aligner mode is not set here, so the ``PairwiseAligner`` default
    mode applies.

    Parameters
    ----------
    seq1, seq2
        Sequences to align.
    cost_mat
        Flat, row-major list of ``len(key) ** 2`` substitution scores
        (entry ``i * len(key) + j`` scores ``key[i]`` against ``key[j]``),
        followed by the gap score at index ``len(key) ** 2``.
    key
        Indexable collection of symbols; used as the aligner alphabet and
        as the row/column labels of ``cost_mat``.

    Returns
    -------
    list
        One ``[line0, line2, score]`` entry per alignment, where ``line0``
        and ``line2`` are the first and third lines of ``str(alignment)``
        and ``score`` is the alignment score converted with ``int()``.
    """
    aligner = PairwiseAligner(alphabet=key)

    size = len(key)
    matrix = {
        (row_symbol, col_symbol): cost_mat[i * size + j]
        for i, row_symbol in enumerate(key)
        for j, col_symbol in enumerate(key)
    }
    aligner.substitution_matrix = substitution_matrices.Array(data=matrix)
    # The gap score is stored right after the size*size substitution scores.
    aligner.gap_score = cost_mat[size ** 2]

    formated_alignments = []
    # Iterate the alignments directly: this avoids len(), which can overflow
    # when the number of optimal alignments is huge, and fetches each
    # alignment only once.
    for alignment in aligner.align(seq1, seq2):
        # NOTE(review): lines 0 and 2 are presumably the gapped target and
        # query rows; the printed layout depends on the Biopython version.
        als = str(alignment).split("\n")
        formated_alignments.append([als[0], als[2], int(alignment.score)])
    return formated_alignments
type=str, required=True) parser.add_argument('-r', '--reference', help='Reference to be aligned to', type=str, required=True) parser.add_argument('-n', '--seq_name', help='Name of the aligned sequence', type=str, required=True) args = parser.parse_args() aligner = PairwiseAligner() aligner.mode = 'global' aligner.match_score = 1 aligner.mismatch_score = 0 aligner.open_gap_score = -2 aligner.extend_gap_score = -1 ref = SeqIO.read(args.reference, "fasta") ref.seq = str(ref.seq.upper()).replace('-', 'N') cons = SeqIO.read(args.infile, "fasta") aln = aligner.align(ref.seq, cons.seq) with open(args.outfile, 'w') as out: print(">", args.seq_name, file=out) print(str(aln[0]).strip().split('\n')[2], file=out)
# Parse fasta file
# NOTE(review): the flattened source makes it ambiguous whether the next
# line was commented out; it is kept live because `seqs` is used below.
seqs = list(SeqIO.parse(args.f, 'fasta'))

# Get substitution matrix (looked up by name on MatrixInfo)
substitution_matrix = getattr(MatrixInfo, args.s)

# Pairwise alignment
aligner = PairwiseAligner()
aligner.open_gap_score, aligner.extend_gap_score = args.go, args.ge
aligner.substitution_matrix = substitution_matrix


# Align sequences and build matrix
def similarity_matrix(seqs, n=None):
    """Return an n x n array of all-vs-all alignment scores.

    Parameters
    ----------
    seqs
        Sequence records; each must expose a ``.seq`` attribute.
    n : int, optional
        Side length of the result. Defaults to ``len(seqs)``, resolved at
        call time so it always matches the list actually passed in. If
        larger than ``len(seqs)`` the extra rows/columns stay zero.

    Returns
    -------
    numpy.ndarray
        ``result[i][j]`` is the score of aligning ``seqs[i]`` to ``seqs[j]``
        with the module-level ``aligner``.
    """
    if n is None:
        n = len(seqs)
    scores = np.zeros([n, n])
    for i, record_i in enumerate(seqs):
        for j, record_j in enumerate(seqs):
            # Only the score is needed, so skip building alignment objects.
            scores[i][j] = aligner.score(record_i.seq, record_j.seq)
    return scores


m = similarity_matrix(seqs)


def print_matrix(m):
    """Print each row of ``m`` on its own line, every value followed by a tab."""
    for row in m:
        print("".join(str(value) + '\t' for value in row))


print_matrix(m)
# ## Matrix calculation

# In[1]:

import numpy
from Bio import SeqIO
from Bio.Align import PairwiseAligner
from Bio.Align.substitution_matrices import Array

# Input sequences. The "vhod" folder also contains the mouse and human
# nebulin sequences, which are considerably longer.
sequence1 = SeqIO.read('vhod/matrika_zamenjav-myoglobin_horse.fasta', 'fasta')
sequence2 = SeqIO.read('vhod/matrika_zamenjav-myoglobin_rat.fasta', 'fasta')

# Local aligner with explicit match/mismatch and affine gap scores.
aligner = PairwiseAligner()
aligner.mode = 'local'
aligner.match_score = 2
aligner.mismatch_score = -3
aligner.open_gap_score = -7
aligner.extend_gap_score = -2

# Work with the first alignment returned for the two sequences.
alignments = aligner.align(sequence1.seq, sequence2.seq)
alignment = alignments[0]

# Count how often each pair of letters is aligned, walking the aligned
# (start, end) segments of both sequences in lockstep.
frequency = Array('ACGT', dims=2)
for (start1, end1), (start2, end2) in zip(*alignment.aligned):
    seq1 = sequence1[start1:end1]
    seq2 = sequence2[start2:end2]
    for c1, c2 in zip(seq1, seq2):
        frequency[c1, c2] += 1

# Normalise counts to probabilities and symmetrise them.
probabilities = frequency / numpy.sum(frequency)
probabilities = (probabilities + probabilities.transpose()) / 2.0

# Background letter probabilities and the pair probabilities expected if
# the two letters were independent.
background = numpy.sum(probabilities, axis=0)
expected = numpy.dot(background[:, numpy.newaxis], background[numpy.newaxis, :])

# Log-odds (base 2) scoring matrix.
oddsratios = probabilities / expected
scoring_matrix = numpy.log2(oddsratios)