def setUp(self):
    """Build the aligner used by the tests and store it on ``self.aligner``.

    The aligner runs in local mode with match +1, mismatch -1, an internal
    gap-open score of -1 and an internal gap-extension score of -0.0.
    """
    # (attribute, value) pairs, applied in this exact order.
    settings = (
        ("internal_open_gap_score", -1),
        ("internal_extend_gap_score", -0.0),
        ("match_score", +1),
        ("mismatch_score", -1),
        ("mode", "local"),
    )
    configured = PairwiseAligner()
    for attribute, value in settings:
        setattr(configured, attribute, value)
    self.aligner = configured
def make_aligner() -> PairwiseAligner:
    """Return a ``PairwiseAligner`` configured from the module-level scores.

    Match/mismatch scores and the end/internal gap open and extension
    scores are all taken from the constants defined elsewhere in this
    module and handed to the constructor as keyword arguments.
    """
    scoring_options = {
        "match_score": MATCH_SCORE,
        "mismatch_score": MISMATCH_SCORE,
        "end_open_gap_score": END_GAP_PENALTY,
        "end_extend_gap_score": END_GAP_EXTEND_PENALTY,
        "internal_open_gap_score": GAP_PENALTY,
        "internal_extend_gap_score": GAP_EXTEND_PENALTY,
    }
    return PairwiseAligner(**scoring_options)
def bioPython_default_local_aligner(a, b):
    """Locally align two FASTA records with Biopython's ``PairwiseAligner``.

    The records are read from ``./resource/fasta<a>.fasta`` and
    ``./resource/fasta<b>.fasta`` (each file must hold exactly one record,
    as required by ``SeqIO.read``).

    Scoring: match +2, mismatch -3, gap open -7, gap extend -2.

    Parameters
    ----------
    a, b
        Identifiers interpolated (via ``str``) into the FASTA file names.

    Returns
    -------
    The alignments object produced by ``aligner.align``.  Previously the
    result was computed and then dropped, so the function returned ``None``.
    """
    aligner = PairwiseAligner()
    aligner.mode = 'local'
    aligner.match_score = 2
    aligner.mismatch_score = -3
    aligner.open_gap_score = -7
    aligner.extend_gap_score = -2

    sequence1 = SeqIO.read(f'./resource/fasta{a!s}.fasta', 'fasta')
    sequence2 = SeqIO.read(f'./resource/fasta{b!s}.fasta', 'fasta')

    # Hand the result back to the caller instead of discarding it.
    return aligner.align(sequence1.seq, sequence2.seq)
def nw_bio(seq1, seq2, cost_table):
    """Align ``seq1`` with ``seq2`` and return the alignments in list form.

    The aligner's alphabet is the set of symbols occurring in either
    sequence; its mode is left at the ``PairwiseAligner`` default.

    Parameters
    ----------
    seq1, seq2
        Sequences to align (they are concatenated with ``+`` to collect
        the alphabet).
    cost_table
        Indexable scores: ``[0]`` match, ``[1]`` mismatch, ``[2]`` gap.

    Returns
    -------
    list
        One ``[line0, line2, score]`` entry per alignment, where ``line0``
        and ``line2`` are the first and third lines of ``str(alignment)``
        and ``score`` is the alignment score truncated with ``int``.
    """
    symbols = list(set(seq1 + seq2))
    aligner = PairwiseAligner(alphabet=symbols)
    aligner.match_score = cost_table[0]
    aligner.mismatch_score = cost_table[1]
    aligner.gap_score = cost_table[2]

    results = []
    for alignment in aligner.align(seq1, seq2):
        rendered = str(alignment).split("\n")
        results.append([rendered[0], rendered[2], int(alignment.score)])
    return results
def get_clusters_from_seqlist(seqlist, dist_threshold=0.05):
    """Cluster a list of sequences by a distance identity threshold

    Parameters
    ----------
    seqlist : list
        list of sequences as str
    dist_threshold : float
        Max distance value to retain, branches above this length in the
        hierarchical clustering tree will be cut.

    Returns
    -------
    list
        list of lists - input sequences now grouped by cluster
    list
        list of int - cluster memberships of the originally input list

    Notes
    -----
    An empty ``seqlist`` yields ``([], [])``; a single sequence is returned
    as one cluster without running any alignment.
    """
    if not seqlist:
        # Nothing to align or cluster.
        return ([], [])
    if len(seqlist) == 1:
        # Skip alignment if there is only one sequence
        return ([seqlist], [0])

    aligner = PairwiseAligner()
    aligner.mode = "local"

    # Convert sequence list to distance matrix
    distmatrix = []
    for seq1 in seqlist:
        row = []
        for seq2 in seqlist:
            maxlen = max(len(seq1), len(seq2))
            # Take percentage identity of pairwise alignment score (match
            # base +1, all other operations +0 -- i.e. this relies on the
            # aligner's scoring being left at those values) over the longer
            # sequence in the pair.  Only the score is needed, so ask for
            # it directly instead of building an alignments object.
            idval = aligner.score(seq1, seq2) / maxlen
            row.append(1 - idval)  # convert to distance fraction
        distmatrix.append(row)

    # Hierarchical clustering from the distance matrix
    htree = treecluster(data=None, distancematrix=array(distmatrix))

    # Find number of branches with length longer than threshold, and add 1
    # to get number of cuts
    long_branches = sum(
        1 for i in range(len(htree)) if htree[i].distance > dist_threshold
    )
    cuts = 1 + long_branches
    clust_ids = list(htree.cut(cuts))

    # Group the input sequences by their cluster id, keeping input order.
    clust_seqs_dict = defaultdict(list)
    for clust_id, seq in zip(clust_ids, seqlist):
        clust_seqs_dict[clust_id].append(seq)

    # Convert dict of lists to list of lists
    clust_seqs = list(clust_seqs_dict.values())
    return (clust_seqs, clust_ids)
def nw_bio_mat(seq1, seq2, cost_mat, key):
    """Align ``seq1`` with ``seq2`` using a flat, row-major score table.

    Parameters
    ----------
    seq1, seq2
        Sequences to align.
    cost_mat
        Flat indexable of scores.  The first ``len(key) ** 2`` entries are
        the substitution scores in row-major order (entry
        ``i * len(key) + j`` scores the pair ``(key[i], key[j])``); the
        entry at index ``len(key) ** 2`` is the gap score.
    key
        Ordered symbols; also used as the aligner's alphabet.

    Returns
    -------
    list
        One ``[line0, line2, score]`` entry per alignment, where ``line0``
        and ``line2`` are the first and third lines of ``str(alignment)``
        and ``score`` is the alignment score truncated with ``int``.
    """
    aligner = PairwiseAligner(alphabet=key)

    size = len(key)
    pair_scores = {
        (row_symbol, col_symbol): cost_mat[row * size + col]
        for row, row_symbol in enumerate(key)
        for col, col_symbol in enumerate(key)
    }
    aligner.substitution_matrix = substitution_matrices.Array(data=pair_scores)
    aligner.gap_score = cost_mat[size ** 2]

    results = []
    for alignment in aligner.align(seq1, seq2):
        rendered = str(alignment).split("\n")
        results.append([rendered[0], rendered[2], int(alignment.score)])
    return results
def create_aligner() -> PairwiseAligner:
    """
    Creates an aligner that can be used to search for proteins.

    The aligner runs in local mode, scores matches +1 and mismatches -1,
    gives every left/right gap a score of 0 and every internal gap -1.
    """
    aligner = PairwiseAligner(mode="local")

    # Reward matches, penalize mismatches.
    aligner.mismatch_score = -1
    aligner.match_score = 1

    # Gaps at either end carry no penalty because the search is local.
    free_end_gaps = (
        "query_left_gap_score",
        "query_right_gap_score",
        "target_right_gap_score",
        "target_left_gap_score",
    )
    for attribute in free_end_gaps:
        setattr(aligner, attribute, 0)

    # Gaps in the middle are penalized to narrow down the search space.
    penalized_internal_gaps = (
        "query_internal_gap_score",
        "target_internal_gap_score",
    )
    for attribute in penalized_internal_gaps:
        setattr(aligner, attribute, -1)

    return aligner
def pairwise(self, potential_parent):
    """
    Pairwise-align this leaf's sequence against a potential parent and
    store the resulting score.

    The score is saved in ``self.parent_scores`` under the unmodified
    ``potential_parent`` string.

    Args:
        potential_parent (str): sequence of the potential parent
    """
    local_aligner = PairwiseAligner()
    # use local alignment
    local_aligner.mode = 'local'

    # drop the '-' gap characters from both sequences before aligning
    leaf_ungapped = self.seq.replace('-', '')
    parent_ungapped = potential_parent.replace('-', '')

    # align and remember the score for this parent
    self.parent_scores[potential_parent] = local_aligner.score(
        leaf_ungapped, parent_ungapped
    )
def _remove_missing_res(self, record: SeqRecord, pdb: Path):
    """Replace residues of ``record`` that the PDB structure lacks with '-'.

    The ungapped record sequence is aligned against the sequence built
    from the structure's peptides (``CaPPBuilder`` with ``aa_only=False``).
    Wherever the first alignment's path advances in the record but not in
    the structure sequence, the corresponding record residues are
    overwritten with '-'.

    Args:
        record: sequence record to mask; its ``seq`` is replaced.
        pdb: path of the PDB file parsed with ``PDBParser``.

    Returns:
        The same ``record`` object, with the masked sequence assigned.
    """
    structure = PDBParser().get_structure(record.id, pdb)
    peptides = CaPPBuilder().build_peptides(structure, aa_only=False)
    sequence = ''.join([str(peptide.get_sequence()) for peptide in peptides])

    first_alignment = PairwiseAligner().align(record.seq.ungap('-'), sequence)[0]
    path = first_alignment.path

    # Consecutive path points whose second coordinate is unchanged mark a
    # stretch of the record with no partner in the structure sequence.
    gaps = [
        (here[0], there[0])
        for here, there in zip(path, path[1:])
        if here[1] == there[1]
    ]
    # Handle the last stretch first: masked residues are skipped when
    # counting below, so working backwards keeps earlier indices valid.
    gaps.reverse()

    mut = record.seq.tomutable()
    for start, stop in gaps:
        residue_index = 0  # index among the non-gap residues seen so far
        for position, res in enumerate(mut):
            if res == '-':
                continue
            if start <= residue_index < stop:
                mut[position] = '-'
            residue_index += 1

    record.seq = mut.toseq()
    return record
def perform_randomized_tests(n=1000):
    """Perform randomized tests and compare to pslMap.

    Run this function to perform 8 x n mappings for alignments of randomly
    generated sequences, get the alignment in PSL format, and compare the
    result to that of pslMap.

    Each of the ``n`` rounds draws two block counts in ``[1, 10]`` and
    calls ``test_random`` and then ``test_random_sequences`` once for every
    "+"/"-" combination.
    """
    aligner = PairwiseAligner()
    aligner.internal_open_gap_score = -1
    aligner.internal_extend_gap_score = -0.0
    aligner.match_score = +1
    aligner.mismatch_score = -1
    aligner.mode = "local"

    # All four sign combinations, in the order they are exercised.
    sign_pairs = (("+", "+"), ("+", "-"), ("-", "+"), ("-", "-"))

    for _ in range(n):
        nBlocks1 = random.randint(1, 10)
        nBlocks2 = random.randint(1, 10)
        for first, second in sign_pairs:
            test_random(aligner, nBlocks1, nBlocks2, first, second)
        for first, second in sign_pairs:
            test_random_sequences(first, second)
type=str, required=True) parser.add_argument('-r', '--reference', help='Reference to be aligned to', type=str, required=True) parser.add_argument('-n', '--seq_name', help='Name of the aligned sequence', type=str, required=True) args = parser.parse_args() aligner = PairwiseAligner() aligner.mode = 'global' aligner.match_score = 1 aligner.mismatch_score = 0 aligner.open_gap_score = -2 aligner.extend_gap_score = -1 ref = SeqIO.read(args.reference, "fasta") ref.seq = str(ref.seq.upper()).replace('-', 'N') cons = SeqIO.read(args.infile, "fasta") aln = aligner.align(ref.seq, cons.seq) with open(args.outfile, 'w') as out: print(">", args.seq_name, file=out) print(str(aln[0]).strip().split('\n')[2], file=out)
try: GAP_PENALTY = scores_dict['gap penalty'] GAP_EXTEND_PENALTY = scores_dict['gap extend penalty'] END_GAP_PENALTY = scores_dict['end gap penalty'] END_GAP_EXTEND_PENALTY = scores_dict['end gap extend penalty'] MATCH_SCORE = scores_dict['match score'] MISMATCH_SCORE = scores_dict['mismatch score'] except KeyError as ex: raise ValueError(f"'{ex.args[0]}' is missing in data/scores.tab") from ex score_matrix = np.empty((16, 16)) for i, j in itertools.product(range(0, 16), range(0, 16)): score_matrix[i, j] = MATCH_SCORE if i & j else MISMATCH_SCORE aligner = PairwiseAligner(substitution_matrix=score_matrix, end_open_gap_score=END_GAP_PENALTY, end_extend_gap_score=END_GAP_EXTEND_PENALTY, internal_open_gap_score=GAP_PENALTY, internal_extend_gap_score=GAP_EXTEND_PENALTY) seq_read_dict = { '-':0, 'A':1, 'C':2, 'G':4, 'T':8, 'R':5, 'Y':10, 'S':6, 'W':9, 'K':12, 'M':3, 'B':14, 'D':13,
args = parser.parse_args() for file in (args.query_seq, args.target_seq): if not path.isfile(file): parser.error("File %s doesn't exist" % file) # alphabet if args.seq_type == 'dna': args.seq_abc = IUPACAmbiguousDNA() elif args.seq_type == 'rna': args.seq_abc = IUPACAmbiguousRNA() else: args.seq_abc = ExtendedIUPACProtein() # Aligners setup aligners = {'global': PairwiseAligner(), 'local': None} aligners['global'].mode = 'global' if args.seq_type in ('dna', 'rna'): aligners['global'].match = args.match_score aligners['global'].mismatch = args.mismatch_score if not args.open_gap_score: args.open_gap_score = -5 if not args.extend_gap_score: args.extend_gap_score = -2 else: sub_matrix = getattr(import_module('Bio.SubsMat.MatrixInfo'), args.sub_matrix) aligners['global'].substitution_matrix = sub_matrix if not args.open_gap_score: args.open_gap_score = -11 if not args.extend_gap_score: