def init_aligner(allow_target_gaps=False, allow_target_mismatches=False):
    """Create an aligner tuned for aligning against a ProteinNet target.

    The weights are meant to penalize excessive gaps, make gaps in the
    ProteinNet (target) sequence impossible unless allowed, and prefer gaps
    at the tail ends of sequences.

    Args:
        allow_target_gaps: If False, gaps in the target score ``-inf`` and the
            match score is 10. If True, the target gap score is left at the
            aligner's default and the match score is 200.
        allow_target_mismatches: If True, the mismatch score is 200 instead of
            ``-inf``.

    Returns:
        The configured ``Align.PairwiseAligner``.
    """
    a = Align.PairwiseAligner()

    # The scoring attributes are ``match_score`` / ``mismatch_score``; the
    # bare ``match`` / ``mismatch`` names are not scoring attributes of
    # current PairwiseAligner objects.
    # NOTE(review): if the deployed Biopython silently ignored the old names,
    # this makes the documented weights effective for the first time - confirm
    # alignments are still as expected.
    a.mismatch_score = 200 if allow_target_mismatches else -np.inf

    if allow_target_gaps:
        a.match_score = 200
    else:
        # Don't allow for gaps with the target sequence, and do not let
        # matching items overwhelm determining where gaps should go.
        a.target_gap_score = -np.inf
        a.match_score = 10

    # Generally, prefer to extend gaps than to create them.
    a.query_extend_gap_score = 99
    a.query_open_gap_score = 49

    # Set slight preference for open gaps on the edges, however, if present,
    # strongly prefer single edge gaps.
    a.query_end_open_gap_score = 50
    a.query_end_extend_gap_score = 100

    return a
def matrix(seq, file_name, idslist):
    """Write a pairwise similarity matrix of the sequences to an .xlsx file.

    Each cell (s, i) holds the alignment score of ``seq[s]`` against
    ``seq[i]`` divided by ``len(seq[s])``. The value computed for a pair of
    ids is reused for the mirrored cell, so the sheet is symmetric and each
    unordered pair is aligned only once.

    Args:
        seq: Sequences to compare.
        file_name: Output path without the ``.xlsx`` extension.
        idslist: Gene ids, parallel to ``seq``; used as row/column headers and
            as cache keys.
    """
    work_book = xlsxwriter.Workbook(file_name + '.xlsx')
    work_sheet = work_book.add_worksheet()
    align = Align.PairwiseAligner()

    # Maps (column id, row id) -> score already written for the mirrored cell.
    cache = {}
    count = len(seq)

    work_sheet.write(0, 0, 'Gene Id')
    for s in range(count):
        # Row and column headers for this gene.
        work_sheet.write(0, s + 1, idslist[s])
        work_sheet.write(s + 1, 0, idslist[s])

        for i in range(count):
            key = (idslist[s], idslist[i])
            if key in cache:
                score = cache[key]
            else:
                # Only align on a cache miss; the mirrored cell reuses it.
                score = align.score(seq[s], seq[i]) / len(seq[s])
                cache[(idslist[i], idslist[s])] = score
            work_sheet.write(s + 1, i + 1, score)

    work_book.close()
def pairwise_aligner(self):
    """Align ``self.sequence1`` with ``self.sequence2`` and record the result.

    Sets ``self.score`` to the alignment score and ``self.differences`` to
    the smallest number of "X" plus "-" symbols found in the middle
    (symbolic summary) row of any of the reported alignments.
    """
    scorer = Align.PairwiseAligner()
    scorer.open_gap_score = -10
    scorer.extend_gap_score = -0.5
    # End gaps are free on both sequences.
    scorer.target_end_gap_score = 0.0
    scorer.query_end_gap_score = 0.0

    def count_differences(candidate):
        # The printed alignment splits into first sequence, symbolic summary
        # and second sequence; only the summary row is inspected so that
        # letters inside the sequences themselves are never counted.
        rows = str(candidate).split()
        assert len(rows) % 3 == 0, "should be divisible by three"
        summary = rows[1]
        return summary.count("X") + summary.count("-")

    candidates = sorted(scorer.align(self.sequence1, self.sequence2))
    per_alignment = [count_differences(candidate) for candidate in candidates]

    self.score = scorer.score(self.sequence1, self.sequence2)
    self.differences = min(per_alignment)
def __init__(self, align_cfg: Dict, min_score: float, min_read_length_without_primers: int,
             window_size: int):
    """Store the thresholds and build the Biopython pairwise aligner.

    Args:
        align_cfg: Aligner settings. Must contain the keys ``match_score``,
            ``mismatch_score``, ``open_gap_score``, ``extend_gap_score`` and
            ``substitution_matrix`` (an empty string means no matrix).
        min_score: Stored as ``self._min_score``.
        min_read_length_without_primers: Stored as
            ``self._min_primer_dimer_thresh``.
        window_size: Stored as ``self._window_size``.

    Raises:
        AlignerSubstitutionDoesntExist: If a non-empty substitution matrix
            name is not found in ``MatrixInfo``.
    """
    self._min_score = min_score
    self._window_size = window_size

    # Set logger
    self._logger = LoggerWrapper.get_logger()
    self._min_primer_dimer_thresh = min_read_length_without_primers

    # Init biopython aligner
    self._aligner = Align.PairwiseAligner()
    self._aligner.match_score = align_cfg["match_score"]
    self._aligner.mismatch_score = align_cfg["mismatch_score"]
    self._aligner.open_gap_score = align_cfg["open_gap_score"]
    self._aligner.extend_gap_score = align_cfg["extend_gap_score"]

    matrix_name = align_cfg["substitution_matrix"]
    if matrix_name != "":
        if matrix_name not in MatrixInfo.__dict__:
            raise AlignerSubstitutionDoesntExist(matrix_name)
        self._aligner.substitution_matrix = MatrixInfo.__dict__[matrix_name]
        # The two literals are joined with an explicit space; previously they
        # ran together as "scoresubstitution".
        self._logger.warning("Shifting indels to cut-site isn't available for alignment with difference score "
                             "substitution matrix. Contact package owner if this feature is required")
def _align_clusters(config, one, two, cutoff=0.3):
    """Constructs a cluster alignment using the given configuration.

    Args:
        config: Mapping of aligner settings. ``substitution_matrix`` names the
            matrix to load (BLOSUM62 when missing or invalid); every other
            key is set as an attribute on the aligner. The mapping is not
            modified.
        one: Query cluster; its ``name`` and ``loci`` are used.
        two: Target cluster; its ``name`` and ``loci`` are used.
        cutoff: Gene pairs whose identity is below this value are not linked.

    Returns:
        An ``Alignment`` with one link per gene pair that passed the cutoff.
    """
    LOG.info("%s vs %s", one.name, two.name)

    # Work on a copy: popping from the caller's dict would make every later
    # call with the same config silently fall back to BLOSUM62.
    settings = dict(config)
    matrix = settings.pop("substitution_matrix", "BLOSUM62")
    if matrix not in substitution_matrices.load():
        LOG.warning(
            "Invalid substitution matrix (%s), defaulting to BLOSUM62", matrix)
        matrix = "BLOSUM62"

    aligner = Align.PairwiseAligner()
    aligner.substitution_matrix = substitution_matrices.load(matrix)
    for key, value in settings.items():
        setattr(aligner, key, value)

    alignment = Alignment(query=one, target=two)
    for locusA, locusB in product(one.loci, two.loci):
        for geneA, geneB in product(locusA.genes, locusB.genes):
            # Genes without a translation cannot be aligned.
            if not geneA.translation or not geneB.translation:
                continue
            aln = aligner.align(geneA.translation, geneB.translation)
            identity, similarity = compute_identity(aln[0])
            if identity < cutoff:
                continue
            alignment.add_link(geneA, geneB, identity, similarity)
    return alignment
def test_results(self):
    """Check the SW score against Biopython's local alignment score."""

    def reference_score(case):
        # Biopython local aligner configured with the same scoring scheme.
        ref = Align.PairwiseAligner()
        ref.mode = 'local'
        ref.match_score = case['match']
        ref.mismatch_score = case['mismatch']
        ref.gap_score = case['gap']
        return ref.align(case['seq_a'], case['seq_b']).score

    for case in self.data:
        algo = SW(case['seq_a'], case['seq_b'], case['match'],
                  case['mismatch'], case['gap'])
        # Run the full pipeline in the same order before reading the score.
        for step in ('initialize', 'calculate_score', 'traceback',
                     'calculate_identity'):
            getattr(algo, step)()
        observed = algo.get_score()

        self.assertEqual(observed, reference_score(case), case)
def find_orf_best_match(self, protein_seq_dict=None, min_score_thr=-1):
    """Return the label of the known protein that best matches the ORF.

    The ORF translation (without a trailing stop codon ``*``) is aligned
    against every sequence in ``protein_seq_dict``; the highest-scoring one
    wins, with ties going to the first encountered.

    Args:
        protein_seq_dict (dict): Dictionary of id:sequence of known proteins;
            each value must expose the sequence as ``.seq``.
        min_score_thr (int): Minimum alignment score to consider a hit. The
            best score must be strictly greater than this value (and than
            the built-in floor of -1).

    Returns:
        ``"<id>(<x>%/<y>%)"`` for the best hit, where the two numbers are the
        first two values returned by ``utils.percent_identity``, or
        ``"Unknown"`` when nothing scores above the threshold or the
        translation is empty.

    Raises:
        TypeError: If ``protein_seq_dict`` is not a dictionary.
    """
    if not isinstance(protein_seq_dict, dict):
        raise TypeError(
            "protein_seq_dict must be a dictionary of id:sequence")

    aligner = Align.PairwiseAligner()
    aligner.match_score = 2
    aligner.mismatch_score = -1
    aligner.open_gap_score = -10
    aligner.extend_gap_score = -10
    aligner.target_end_gap_score = -0.1
    aligner.query_end_gap_score = -0.1

    seq_to_match = self.translation
    # Remove stop codon if present
    if seq_to_match.endswith("*"):
        seq_to_match = seq_to_match[:-1]
    if not seq_to_match:
        # Nothing to align.
        return "Unknown"

    best_label = "NA"
    best_score = -1
    for protein_id, record in protein_seq_dict.items():
        alignments = aligner.align(seq_to_match, record.seq)
        if alignments.score <= best_score:
            # Not an improvement: skip enumerating the alignments and
            # computing the identity.
            continue
        # Same representative alignment as before: the smallest in sort order.
        aln = sorted(alignments)[0]
        identity = utils.percent_identity(aln)
        best_label = "{}({:.2f}%/{:.2f}%)".format(
            protein_id, identity[0], identity[1])
        best_score = aln.score

    return best_label if best_score > min_score_thr else "Unknown"
def getAlignScore(line):
    """Score a ``barcode_umi`` tag against a list of unmapped sequences.

    ``line[0]`` is the ``barcode_umi`` string and ``line[1]`` is the list of
    candidate sequences. Returns four strings: the best barcode+UMI score,
    the barcode-only score against the matched region, their difference (UMI
    score) and the strand flag (index of the best candidate modulo 2).
    """
    tag = line[0]
    candidates = line[1]

    scorer = Align.PairwiseAligner()
    scorer.match_score = 1
    scorer.mismatch_score = -1
    # Order matters: the general gap score is set first, then overridden for
    # query gaps and for target end gaps.
    scorer.gap_score = -2
    scorer.query_gap_score = -1
    scorer.target_end_gap_score = 0

    parts = tag.split("_")
    barcode, umi = parts[0], parts[1]
    joined = "".join(parts)

    results = [scorer.align(joined, candidate) for candidate in candidates]
    scores = [result.score for result in results]
    top_score = max(scores)
    top_index = scores.index(top_score)
    # Even positions map to strand 0, odd positions to strand 1.
    strand = top_index % 2

    best = results[top_index][0]
    query_blocks = best.aligned[1]
    # Part of the candidate covered by the alignment, first block start to
    # last block end.
    matched = best.query[query_blocks[0][0]:query_blocks[-1][-1]]

    barcode_score = scorer.align(barcode, matched).score
    umi_score = top_score - barcode_score
    return [str(value) for value in (top_score, barcode_score, umi_score, strand)]
def matchAndScore(neo, db):
    """
    Select unmutated peptides that resemble a neoepitope.

    Peptides must match the neoepitope at P4, P5 and P8 (regex) and then score
    at least 0.25 at each of P6 and P7 and at least 1.0 for P6+P7 together
    under the scoring matrix.

    :param neo: neoepitope string input
    :param db: unmutated db as pandas Series of peptide strings
    :return: pandas DF with columns 'Peptide', 'Neoepitope' and 'P4,5,8'
    """
    anchors = neo[3] + neo[4] + neo[7]
    # Three free residues, P4 and P5 fixed, two free residues, P8 fixed.
    pattern = '^([A-Z]){3}' + neo[3] + neo[4] + '([A-Z]){2}' + neo[7]
    candidates = db[db.str.contains(pattern, regex=True)].tolist()

    scorer = Align.PairwiseAligner()
    scorer.substitution_matrix = subMatrix()

    kept = []
    for peptide in candidates:
        p6 = scorer.score(neo[5], peptide[5])
        p7 = scorer.score(neo[6], peptide[6])
        p67 = scorer.score(neo[5:7], peptide[5:7])
        if p67 < 1.0 or p6 < 0.25 or p7 < 0.25:
            continue
        kept.append(peptide)

    scoredDF = pd.DataFrame()
    scoredDF['Peptide'] = kept
    scoredDF['Neoepitope'] = neo
    scoredDF['P4,5,8'] = anchors
    return scoredDF
def map_seqs(obj, ref, segid_obj=None, segid_ref=None, matrix='BLOSUM62'):
    """Map positions of ``obj`` onto ``ref`` through a pairwise alignment.

    Args:
        obj: Sequence to map from (converted with ``str``).
        ref: Sequence to map to (converted with ``str``).
        segid_obj: Optional segment id attached to the ``obj`` positions.
        segid_ref: Optional segment id attached to the ``ref`` positions.
        matrix: Name of the substitution matrix to load.

    Returns:
        ``{obj_pos: ref_pos}`` with 0-based positions of aligned residues when
        ``segid_obj`` is None, otherwise
        ``{(segid_obj, obj_pos): (segid_ref, ref_pos)}``. Empty when the
        aligner yields no alignment.
    """
    aligner = Align.PairwiseAligner()
    aligner.substitution_matrix = substitution_matrices.load(matrix)

    # Every alignment returned by the aligner has the same (optimal) score,
    # so the first one is as good as any. The previous "score > 0" filter
    # crashed whenever the optimal score was zero or negative.
    best_aln = next(iter(aligner.align(str(obj), str(ref))), None)

    t2q = {}
    if best_aln is not None:
        # ``aligned`` holds parallel (start, end) blocks for both sequences.
        for obj_block, ref_block in zip(best_aln.aligned[0], best_aln.aligned[1]):
            for x, y in zip(range(*obj_block), range(*ref_block)):
                t2q[x] = y

    if segid_obj is None:
        return t2q
    return {(segid_obj, k): (segid_ref, v) for k, v in t2q.items()}
def _align_clusters(config, one, two, cutoff=0.3):
    """Constructs a cluster alignment using the given configuration.

    Args:
        config: Mapping of aligner settings. ``substitution_matrix`` names the
            matrix to load (BLOSUM62 when missing or invalid); every other
            key is set as an attribute on the aligner. The mapping is not
            modified.
        one: Query cluster; its ``name`` and ``loci`` are used.
        two: Target cluster; its ``name`` and ``loci`` are used.
        cutoff: Gene pairs whose identity is below this value are not linked.

    Returns:
        An ``Alignment`` with one link per gene pair that passed the cutoff.
    """
    LOG.info("%s vs %s", one.name, two.name)

    # Select the substitution matrix.
    # Defaults to BLOSUM62 when none or invalid matrix specified.
    # A copy is used: popping from the caller's dict would make every later
    # call with the same config silently fall back to BLOSUM62.
    settings = dict(config)
    matrix = settings.pop("substitution_matrix", "BLOSUM62")
    if matrix not in substitution_matrices.load():
        LOG.warning(
            "Invalid substitution matrix '(%s)', defaulting to BLOSUM62",
            matrix)
        matrix = "BLOSUM62"

    aligner = Align.PairwiseAligner()
    aligner.substitution_matrix = substitution_matrices.load(matrix)

    # ValueError is thrown during sequence alignment when a letter
    # in the sequence is not found in the substitution matrix.
    # Extended IUPAC codes (BXZJUO) are added to mitigate this.
    extend_matrix_alphabet(aligner.substitution_matrix, codes='BXZJUO')

    for key, value in settings.items():
        setattr(aligner, key, value)

    alignment = Alignment(query=one, target=two)
    for locusA, locusB in product(one.loci, two.loci):
        for geneA, geneB in product(locusA.genes, locusB.genes):
            # Genes without a translation cannot be aligned.
            if not geneA.translation or not geneB.translation:
                continue
            aln = aligner.align(geneA.translation, geneB.translation)
            identity, similarity = compute_identity(aln[0])
            if identity < cutoff:
                continue
            alignment.add_link(geneA, geneB, identity, similarity)
    return alignment
def replace_missing_residues(template_alignment, template_id, chain, pdb):
    """Gap out residues of an aligned template that are absent from its PDB.

    The ungapped ``template_alignment`` is globally aligned against the
    sequence of chain ``template_id:chain`` read from ``pdb`` (with ``X``
    removed). Template residues that face a gap in the PDB sequence are
    replaced by ``-``; the original gap columns are kept in place.

    Args:
        template_alignment: Template row of an alignment, ``-`` for gaps.
        template_id: Structure id used to select the record in ``pdb``.
        chain: Chain id used to select the record in ``pdb``.
        pdb: PDB file (path or handle) parsed with the ``pdb-atom`` format.

    Returns:
        A string of the same length as ``template_alignment``.

    Raises:
        ValueError: If ``pdb`` has no record with id ``template_id:chain``.
    """
    wanted_id = f'{template_id}:{chain}'
    chains = [
        str(record.seq) for record in SeqIO.parse(pdb, 'pdb-atom')
        if record.id == wanted_id
    ]
    if not chains:
        # Previously the ids were printed and execution continued with an
        # empty list, failing later with an unrelated error.
        raise ValueError(f'no chain {wanted_id} found in {pdb}')
    template_pdb = chains[0].replace('X', '')

    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    alignment = next(
        aligner.align(template_alignment.replace('-', ''), template_pdb))
    # NOTE(review): relies on str(alignment) being exactly three lines
    # (target row, match row, query row) - confirm for the Biopython in use.
    aligned_template, _, aligned_pdb = str(alignment).splitlines()
    assert len(aligned_template) == len(aligned_pdb)

    # Mark template residues facing a gap in the PDB sequence with '@', then
    # drop the columns where the template itself has a gap.
    residues = [
        '@' if pdb_char == '-' else tmpl_char
        for tmpl_char, pdb_char in zip(aligned_template, aligned_pdb)
    ]
    residues = [char for char in residues if char != '-']

    # Put the gaps of the original template row back at their positions.
    for i, res in enumerate(template_alignment):
        if res == '-':
            residues.insert(i, '-')

    result = ''.join(residues).replace('@', '-')
    assert len(result) == len(template_alignment)
    return result
def biopython_align(qseq, tseq, param, table=False, strict=False):
    """Align the target to the query with Biopython and print each alignment.

    ``param`` supplies ``mode``, ``matrix``, ``gap_open`` and ``gap_extend``;
    ``param.is_dna`` is set here and ``param.matrix`` is filled in when empty.
    Output is tabular when ``table`` is true, pairwise otherwise. With
    ``strict`` the end gaps are charged like internal gaps, unless the mode is
    semiglobal, which always makes end gaps free.
    """
    # Query and target sequences.
    query_text = str(qseq.seq)
    target_text = str(tseq.seq)

    engine = Align.PairwiseAligner()

    # Select local mode. Global, semiglobal are about scoring.
    if param.mode == const.LOCAL_ALIGN:
        engine.mode = 'local'

    # Attempts to detect DNA vs peptide sequences from the first 100 letters.
    param.is_dna = all(letter in "ATGC" for letter in query_text[:100])

    # Default substitution matrix.
    if not param.matrix:
        param.matrix = 'NUC.4.4' if param.is_dna else 'BLOSUM62'
    engine.substitution_matrix = substitution_matrices.load(param.matrix)

    # Gap scoring.
    engine.open_gap_score = -param.gap_open
    engine.extend_gap_score = -param.gap_extend

    # End gap scoring. Semiglobal overrides strict mode.
    charge_end_gaps = strict and param.mode != const.SEMIGLOBAL_ALIGN
    if charge_end_gaps:
        engine.target_end_open_gap_score = -param.gap_open
        engine.target_end_extend_gap_score = -param.gap_extend
        engine.query_end_open_gap_score = -param.gap_open
        engine.query_end_extend_gap_score = -param.gap_extend
    else:
        engine.target_end_gap_score = 0.0
        engine.query_end_gap_score = 0.0

    # Biopython alignment target to query, wrapped lazily in the more
    # detailed Alignment class.
    detailed = map(
        lambda raw: Alignment(qseq=qseq, tseq=tseq, aln=raw, param=param),
        engine.align(target_text, query_text))

    # Pick the output format.
    print_func = print_tabular if table else print_pairwise
    for index, aln in enumerate(detailed):
        print_func(aln, param=param, index=index)
def __init__(self):
    """Start with empty sequences and a global BLOSUM62 aligner."""
    engine = Align.PairwiseAligner()
    engine.mode = 'global'
    engine.substitution_matrix = matlist.blosum62
    # Opening a gap scores -10, extending it -0.5.
    engine.open_gap_score = -10
    engine.extend_gap_score = -0.5
    self.aligner = engine

    self.seqa = ''
    self.seqb = ''
    self.alignment_count = 0
def align(seq1, seq2):
    """Return the local alignment score of ``seq1`` against ``seq2``.

    Scoring: match +2, mismatch -3, gap open -4, gap extend -2.
    """
    aligner = Align.PairwiseAligner()
    aligner.mode = "local"
    aligner.open_gap_score = -4
    aligner.extend_gap_score = -2
    # The scoring attributes are ``match_score`` / ``mismatch_score``; plain
    # ``match`` / ``mismatch`` are not attributes of current PairwiseAligner
    # objects, so the intended scores never took effect there.
    aligner.match_score = 2
    aligner.mismatch_score = -3
    return aligner.score(seq1, seq2)
def test_poly_a():
    """Grid-search window size and score threshold for poly-A tail detection.

    For every combination of minimum score (5..10) and window (5..30, step 5)
    the RepeatMasker library is scanned: a record is called positive when the
    local alignment of its tail against poly-a (or, failing that, of its head
    against poly-t) reaches the minimum score. Records whose comment mentions
    SINE or LINE are the expected positives. Sensitivity, precision and
    F-measure are printed per combination.
    """
    library = "/Users/zakarota/Tools/repeatMasker/Libraries/RepeatMaskerLib.embl"
    mark1 = 'SINE'
    mark2 = 'LINE'

    # The aligner settings never change, so build it once.
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'
    aligner.open_gap_score = -1.0
    aligner.extend_gap_score = -1.0
    # ``mismatch_score`` is the scoring attribute; ``mismatch`` is not an
    # attribute of current PairwiseAligner objects.
    aligner.mismatch_score = -1

    for poly_a_min_score in range(5, 11):
        for poly_a_win in range(5, 31, 5):
            pstv_counter = 0
            tp_counter = 0
            fp_counter = 0
            for record in SeqIO.parse(library, "embl"):
                comment = record.annotations['comment']
                is_positive = mark1 in comment or mark2 in comment
                if is_positive:
                    pstv_counter += 1

                # NOTE(review): the [-win:-1] slice leaves out the last base
                # of the record - confirm this is intended.
                score = aligner.score(record.seq[-poly_a_win:-1],
                                      poly_a_win * "a")
                if score < poly_a_min_score:
                    # Search for poly-A tail in the negative strand
                    score = aligner.score(str(record.seq[0:poly_a_win]),
                                          poly_a_win * "t")

                if score >= poly_a_min_score:
                    if is_positive:
                        tp_counter += 1
                    else:
                        fp_counter += 1

            # Guard against a library without any SINE/LINE record.
            if pstv_counter > 0:
                sensitivity = 100 * tp_counter / pstv_counter
            else:
                sensitivity = 0
            if tp_counter > 0 or fp_counter > 0:
                precision = 100 * tp_counter / (tp_counter + fp_counter)
            else:
                precision = 0
            if sensitivity > 0 or precision > 0:
                f_measure = 2 * (sensitivity * precision) / (sensitivity +
                                                             precision)
            else:
                f_measure = 0
            print('Window:', poly_a_win, 'Min score:', poly_a_min_score,
                  'Sensitivity:', sensitivity, 'Precision:', precision,
                  'F-measure:', f_measure)
def __init__(self, reference: str, sequence: str):
    """Keep both sequences and prepare a global aligner with free end gaps."""
    self.reference = reference
    self.sequence = sequence

    engine = Align.PairwiseAligner()
    engine.mode = 'global'
    # Internal gaps: -0.5 to open, -0.1 to extend.
    engine.open_gap_score = -0.5
    engine.extend_gap_score = -0.1
    # Gaps at either end of either sequence are not penalized.
    engine.target_end_gap_score = 0.0
    engine.query_end_gap_score = 0.0
    self.aligner = engine
def BLOSUM45_score_dist(s1, s2):
    """Return a BLOSUM45 score-based distance between ``s1`` and ``s2``.

    Computed as ``1 - score(s1, s2) / max(score(s1, s1), score(s2, s2))``
    with global alignment scores and a gap-open score of -10.
    """
    scorer = Align.PairwiseAligner()
    scorer.mode = "global"
    scorer.substitution_matrix = substitution_matrices.load("BLOSUM45")
    scorer.open_gap_score = -10

    cross_score = scorer.score(s1, s2)
    # Normalize by the larger of the two self-alignment scores.
    self_scores = (scorer.score(s1, s1), scorer.score(s2, s2))
    return 1 - cross_score / max(self_scores)
def align_before_after(output_dir, sv, query_seq, ref_seq_1, ref_seq_2):
    """Score the query against the reference before and after an SV.

    When ``sv.is_third_fil`` is false, both scores come from a global
    Biopython alignment (match 1, mismatch -1, gap open -1, gap extend -0.5).
    Otherwise the query is written to ``output_dir + "tmp_query.fasta"``,
    indexed with mappy, and each reference is mapped against it; the score is
    ``len(query_seq) - (len(ref) - mlen)`` of the first hit. The temporary
    file is always removed.

    Args:
        output_dir: Directory prefix for the temporary FASTA; concatenated
            as-is, so it is expected to end with a path separator.
        sv: Object providing ``is_third_fil`` and ``idx``.
        query_seq: Query sequence.
        ref_seq_1: Reference sequence before the SV.
        ref_seq_2: Reference sequence after the SV.

    Returns:
        ``(score_before, score_after)``, or ``(None, None)`` when mappy
        reports no hit for either reference.
    """
    # within length limit
    if not sv.is_third_fil:
        aligner = Align.PairwiseAligner()
        aligner.mode = 'global'
        aligner.match_score = 1
        aligner.mismatch_score = -1
        aligner.open_gap_score = -1
        aligner.extend_gap_score = -0.5
        return (aligner.score(query_seq, ref_seq_1),
                aligner.score(query_seq, ref_seq_2))

    tmp_path = output_dir + "tmp_query.fasta"
    try:
        with open(tmp_path, "w") as handle:
            handle.write('>' + str(sv.idx) + "\n")
            handle.write(query_seq + "\n")

        aligner = mappy.Aligner(fn_idx_in=tmp_path)
        hits_before = aligner.map(ref_seq_1, seq2=None, cs=False, MD=False)
        hits_after = aligner.map(ref_seq_2, seq2=None, cs=False, MD=False)

        # Only the first hit of each mapping is used; no hit means no score.
        # Unlike the previous bare ``except``, unexpected errors propagate.
        agt_before = next(hits_before, None)
        if agt_before is None:
            return None, None
        agt_after = next(hits_after, None)
        if agt_after is None:
            return None, None

        alignment_beforeSV = len(query_seq) - (len(ref_seq_1) -
                                               agt_before.mlen)
        alignment_afterSV = len(query_seq) - (len(ref_seq_2) - agt_after.mlen)
        return alignment_beforeSV, alignment_afterSV
    finally:
        # Single cleanup point, also reached when something above raises.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def get_align_score(seq1, seq2):
    """Return the global alignment score of ``seq1`` against ``seq2``.

    Scoring: match +1, mismatch -1, gap open -1, gap extend -0.5.
    """
    scorer = Align.PairwiseAligner()
    scorer.mode = 'global'
    for attribute, value in (
        ('match_score', 1),
        ('mismatch_score', -1),
        ('open_gap_score', -1),
        ('extend_gap_score', -0.5),
    ):
        setattr(scorer, attribute, value)
    return scorer.score(seq1, seq2)
def init_basic_aligner(allow_mismatches=False):
    """Returns an aligner with minimal assumptions about gaps.

    Args:
        allow_mismatches: If True, mismatches score -1, gaps score -3 and
            gaps in the target score ``-inf``. If False, only the mismatch
            score is changed, to ``-inf``.

    Returns:
        The configured ``Align.PairwiseAligner``.
    """
    a = Align.PairwiseAligner()
    if allow_mismatches:
        a.mismatch_score = -1
        a.gap_score = -3
        a.target_gap_score = -np.inf
    else:
        # Only ``mismatch_score`` is set: the extra ``a.mismatch`` assignment
        # duplicated this value under a name that is not a scoring attribute
        # of current PairwiseAligner objects.
        a.mismatch_score = -np.inf
    return a
def setupAligner(match, mismatch, open, extend):
    """Build a pairwise aligner from the four given scores.

    ``match`` and ``mismatch`` become the substitution scores; ``open`` and
    ``extend`` are applied to internal gaps and to gaps on the left end of
    the target. All other gap scores keep the aligner's defaults. The intent
    stated by the author is to approximate a semi-global alignment.
    """
    a = Align.PairwiseAligner()

    # Substitution scores.
    a.match_score = match
    a.mismatch_score = mismatch

    # Same open/extend penalties for internal gaps and for the left end of
    # the target.
    for region in ('internal', 'target_left'):
        setattr(a, region + '_open_gap_score', open)
        setattr(a, region + '_extend_gap_score', extend)

    return a
def __init__(self, reference: str, query: str, config: dict):
    """Keep both sequences and configure the aligner from ``config``.

    Settings missing from ``config`` fall back to: mode ``'global'``, gap
    open -0.5, gap extend -0.1, and free end gaps (0.0) on target and query.
    """
    self.aligner = Align.PairwiseAligner()
    self.reference = reference
    self.query = query

    # (aligner attribute / config key, default value)
    fallbacks = (
        ('mode', 'global'),
        ('open_gap_score', -0.5),
        ('extend_gap_score', -0.1),
        ('target_end_gap_score', 0.0),
        ('query_end_gap_score', 0.0),
    )
    for option, default in fallbacks:
        setattr(self.aligner, option, config.get(option, default))
def alignSeqs(keySequences, editDistance, count):
    """Find the first later sequence that nearly matches ``keySequences[count]``.

    The sequence at index ``count`` is aligned against every following one
    (gap open and extend both -0.5). The first partner whose score lies in
    ``[15 - editDistance, 15)`` is returned together with the anchor sequence
    as ``(anchor, partner)``; without such a partner the result is ``None``.
    """
    # NOTE(review): the original indentation was lost; the return is assumed
    # to sit inside the score check - confirm against the original file.
    scorer = Align.PairwiseAligner()
    scorer.open_gap_score = -0.5
    scorer.extend_gap_score = -0.5

    lowest_accepted = 15 - editDistance
    start = int(count)
    anchor = keySequences[start]

    for partner in keySequences[int(start + 1):]:
        score = scorer.align(anchor, partner).score
        if lowest_accepted <= score < 15:
            return anchor, partner
def __init__(self, aligner_config=None):
    """Create empty containers and configure the pairwise aligner.

    ``aligner_config`` is passed as keyword arguments to
    ``configure_aligner``; when it is missing or empty,
    ``self.aligner_default`` is used instead.
    """
    self.alignments = []
    self.aligner = Align.PairwiseAligner()
    self.clusters = OrderedDict()
    self._alignment_indices = defaultdict(dict)
    self._cluster_names = defaultdict(dict)

    settings = aligner_config if aligner_config else self.aligner_default
    self.configure_aligner(**settings)
def sequence_aligner(sequence_id, reference, sequence, chr_name, snpeff_database_name, annotation_file):
    """Align ``sequence`` to ``reference`` and call nucleotide and AA variants.

    Of all alignments reported by the aligner, the one whose third printed row
    is shortest after stripping leading/trailing ``-`` is used. Returns
    ``(annotated_variants, annotations)`` as produced by
    ``call_nucleotide_variants`` and by
    ``filter_ann_and_variants(call_annotation_variant(...))``.
    """
    engine = Align.PairwiseAligner()
    engine.match_score = 3.0  # scores are set as attributes; passing them to the constructor reportedly did not work
    engine.mismatch_score = -2.0
    engine.open_gap_score = -2.5
    engine.extend_gap_score = -1

    def trimmed_length(candidate):
        # Length of the third printed row without its flanking gaps.
        return len(str(candidate).strip().split('\n')[2].strip("-"))

    ranked = sorted(engine.align(reference, sequence), key=trimmed_length)
    rows = str(ranked[0]).strip().split('\n')
    ref_aligned, seq_aligned = rows[0], rows[2]

    print(f'#\n#\n#Pipeline: {"Alignment done"} \n#\n#')

    def running_positions(aligned_row):
        # For every column, the number of non-gap characters seen so far
        # (1-based position in the ungapped sequence). Both arrays are sized
        # by the aligned sequence row.
        positions = np.zeros(len(seq_aligned), dtype=int)
        seen = 0
        for column, char in enumerate(aligned_row):
            if char != '-':
                seen += 1
            positions[column] = seen
        return positions

    ref_positions = running_positions(ref_aligned)
    seq_positions = running_positions(seq_aligned)

    annotated_variants = call_nucleotide_variants(
        sequence_id, reference, sequence, ref_aligned, seq_aligned,
        ref_positions, seq_positions, chr_name, snpeff_database_name)
    print(f'#\n#\n#Pipeline: {"Nuc variant called"} \n#\n#')

    raw_annotations = call_annotation_variant(
        annotation_file, ref_aligned, seq_aligned, ref_positions,
        seq_positions)
    annotations = filter_ann_and_variants(raw_annotations)
    print(f'#\n#\n#Pipeline: {"AA variant called"} \n#\n#')

    return annotated_variants, annotations
def populate_from_pair(g_1, g_2, trans_m, emiss_m, N = 10):
    '''
    Fill the transition and emission matrices from at most N optimal
    alignments of genomes g_1 and g_2 (all of them when N is None).
    The tables are filled with occurrence counts; no conversion to a
    probability matrix is done here.
    '''
    aligner = Align.PairwiseAligner()
    processed = 0
    for alignment in aligner.align(g_1, g_2):
        # Stop once the requested number of alignments has been consumed.
        if N is not None and processed >= N:
            break
        populate_from_aligment(alignment, trans_m, emiss_m)
        processed += 1
def get_nuc_aligner() -> Align.PairwiseAligner:
    """Build a nucleotide aligner that understands ambiguity codes.

    Gap open is -2.5 and gap extend -1, with all left/right end gap scores set
    to 0. The substitution matrix gives the mismatch score (-2.1) to every
    pair by default; a code paired with any letter of its compatibility set
    scores 3.0, 2, 1.5 or 1 depending on whether that set has 1, 3, 10 or 16
    letters.
    """
    from Bio.Align.substitution_matrices import Array

    aligner = Align.PairwiseAligner()
    aligner.match_score = 3.0  # scores are set as attributes; passing them to the constructor reportedly did not work
    aligner.mismatch_score = -2.1
    aligner.open_gap_score = -2.5
    aligner.extend_gap_score = -1
    # No penalty for gaps at either end.
    aligner.right_extend_gap_score = 0
    aligner.left_extend_gap_score = 0
    aligner.right_open_gap_score = 0
    aligner.left_open_gap_score = 0

    # Score by size of the compatibility set.
    score_by_set_size = {1: aligner.match_score, 3: 2, 10: 1.5, 16: 1}

    two_base_codes = "yrwskm"
    any_code = "agctyrwskmdvhbnx"
    compatible = {
        # plain bases
        "a": "a",
        "g": "g",
        "c": "c",
        "t": "t",
        # two-base codes: both bases plus the code itself (len 3)
        "y": "cty",
        "r": "agr",
        "w": "atw",
        "s": "gcs",
        "k": "tgk",
        "m": "cam",
        # three-base codes (len 10)
        "d": "agtd" + two_base_codes,
        "v": "acgv" + two_base_codes,
        "h": "acth" + two_base_codes,
        "b": "cgtb" + two_base_codes,
        # anything (len 16)
        "n": any_code,
        "x": any_code,
    }

    extra_characters = ""
    alphabet = "".join(compatible) + extra_characters
    size = len(alphabet)
    # Every pair starts at the mismatch score.
    matrix = Array(alphabet=alphabet,
                   dims=2,
                   data=np.full((size, size), aligner.mismatch_score))
    for code, partners in compatible.items():
        pair_score = score_by_set_size[len(partners)]
        for partner in partners:
            # Keep the matrix symmetric.
            matrix[code, partner] = pair_score
            matrix[partner, code] = pair_score

    aligner.substitution_matrix = matrix
    return aligner
def test_mite():
    """Grid-search window size and score threshold for inverted-repeat detection.

    For every combination of minimum score (5..10) and window (10..45, step 5)
    the RepeatMasker library is scanned: the head of each record is locally
    aligned against the reverse complement of its tail, and the record is
    called positive when the score reaches the minimum. Records whose comment
    contains 'Type: DNA' are the expected positives. Sensitivity, precision
    and F-measure are printed per combination.
    """
    library = "/Users/zakarota/Tools/repeatMasker/Libraries/RepeatMaskerLib.embl"
    mark = 'Type: DNA'

    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'
    aligner.open_gap_score = -1.0
    aligner.extend_gap_score = -1.0
    # ``mismatch_score`` is the scoring attribute; ``mismatch`` is not an
    # attribute of current PairwiseAligner objects.
    aligner.mismatch_score = -1.0

    for ir_min_score in range(5, 11):
        for lt_win in range(10, 50, 5):
            dna_counter = 0
            tp_counter = 0
            fp_counter = 0
            rt_win = lt_win
            for record in SeqIO.parse(library, "embl"):
                is_dna = mark in record.annotations['comment']
                if is_dna:
                    dna_counter += 1

                # NOTE(review): the [-win:-1] slice leaves out the last base
                # of the record - confirm this is intended.
                rc = record.seq[-lt_win:-1].reverse_complement()
                score = aligner.score(str(record.seq[0:rt_win]), str(rc))

                if score >= ir_min_score:
                    if is_dna:
                        tp_counter += 1
                    else:
                        fp_counter += 1

            # Guard against a library without any 'Type: DNA' record.
            if dna_counter > 0:
                sensitivity = 100 * tp_counter / dna_counter
            else:
                sensitivity = 0
            if tp_counter > 0 or fp_counter > 0:
                precision = 100 * tp_counter / (tp_counter + fp_counter)
            else:
                precision = 0
            if sensitivity > 0 or precision > 0:
                f_measure = 2 * (sensitivity * precision) / (sensitivity +
                                                             precision)
            else:
                f_measure = 0
            print('Left window:', lt_win, 'Right window:', rt_win,
                  'Min score:', ir_min_score, 'Sensitivity:', sensitivity,
                  'Precision:', precision, 'F-measure:', f_measure)
def Needleman_Wunsch_alignment(seq1, seq2):
    '''
    Globally align seq1 and seq2 with Biopython's pairwise aligner
    (BLOSUM62, gap open -10, gap extend -0.5).

    Any "-" already present in seq1 is removed before aligning. Placeholder
    gap columns are then inserted for those positions and, finally, every
    column that is a gap in both rows is dropped.

    Returns the two aligned rows as a tuple of strings (final1, final2).
    '''
    missing = None
    if "-" in seq1:
        # Need to handle "-" beforehand, otherwise the alignment may fail
        missing = [s == "-" for s in seq1]
        seq1 = seq1.replace("-", "")

    aligner = Align.PairwiseAligner()
    aligner.open_gap_score = -10
    aligner.extend_gap_score = -0.5
    aligner.substitution_matrix = blosum62
    alignment = aligner.align(seq1, seq2)[0]
    # NOTE(review): relies on str(alignment) printing the two aligned rows as
    # its first and third lines - confirm for the Biopython version in use.
    alignment_info = str(alignment).split("\n")
    aligned1, aligned2 = alignment_info[0], alignment_info[2]

    if missing is None:
        return aligned1, aligned2

    # Assign alignment with "-"
    total = len(aligned1)
    row1 = []
    row2 = []
    j = 0
    for is_missing in missing:
        if is_missing:
            row1.append("-")
            row2.append("-")
            continue
        # Copy gap columns that precede the next residue of seq1. The bound
        # is tested first so that j can never index past the end.
        while j < total and aligned1[j] == "-":
            row1.append(aligned1[j])
            row2.append(aligned2[j])
            j += 1
        if j < total:
            row1.append(aligned1[j])
            row2.append(aligned2[j])
            j += 1
    if j < total:
        # Trailing columns after the last residue of seq1.
        row1.extend(aligned1[j:])
        row2.extend(aligned2[j:])

    # Cleaning up: drop columns that are gaps in both rows.
    kept = [(x, y) for x, y in zip(row1, row2)
            if not (x == "-" and y == "-")]
    final1 = "".join(x for x, _ in kept)
    final2 = "".join(y for _, y in kept)
    return final1, final2