def _save_alignment(self):
    """Write the sequence of every clade in ``self.tree`` to a FASTA file.

    One record is produced per clade returned by ``self.tree.find_clades()``;
    the clade name is used as both record id and name and the description is
    left empty.  The file is ``out_aln_fasta`` (a module-level name) joined
    onto ``self._root_dir`` and is opened through ``myopen``.
    """
    records = []
    for clade in self.tree.find_clades():
        records.append(
            SeqRecord.SeqRecord(
                Seq.Seq("".join(clade.sequence)),
                id=clade.name,
                name=clade.name,
                description="",
            )
        )
    msa = Align.MultipleSeqAlignment(records)

    target_path = os.path.join(self._root_dir, out_aln_fasta)
    with myopen(target_path, "w") as handle:
        AlignIO.write(msa, handle, "fasta")
def setupAligner( match, mismatch, open, extend ):
    """Create a PairwiseAligner configured with the given scores.

    The original author's intent was to approximate a semi-global alignment.

    Args:
        match: score assigned to matching letters.
        mismatch: score assigned to mismatching letters.
        open: gap-open score, applied to internal gaps and to gaps on the
            left end of the target.
        extend: gap-extend score, applied to the same two gap classes.

    Returns:
        The configured ``Align.PairwiseAligner``.
    """
    aligner = Align.PairwiseAligner()
    aligner.match_score = match
    aligner.mismatch_score = mismatch
    # Only internal gaps and target-left gaps are overridden; every other gap
    # score keeps the aligner's default.
    for gap_class in ("internal", "target_left"):
        setattr(aligner, gap_class + "_open_gap_score", open)
        setattr(aligner, gap_class + "_extend_gap_score", extend)
    return aligner
def align_before_after(output_dir, sv, query_seq, ref_seq_1, ref_seq_2):
    """Score ``query_seq`` against the reference before and after an SV.

    When ``sv.is_third_fil`` is false, both scores come from a Biopython
    global alignment (match 1, mismatch -1, gap open -1, gap extend -0.5).

    Otherwise the query is written to ``output_dir + "tmp_query.fasta"``,
    indexed with mappy, and each reference sequence is mapped against it; the
    score for a reference is
    ``len(query_seq) - (len(reference) - first_hit.mlen)``.

    Args:
        output_dir: directory prefix for the temporary FASTA (concatenated
            as-is, so it must already end with a path separator).
        sv: object providing ``is_third_fil`` and ``idx``.
        query_seq: query sequence string.
        ref_seq_1: reference sequence without the SV applied.
        ref_seq_2: reference sequence with the SV applied.

    Returns:
        ``(score_before, score_after)``, or ``(None, None)`` when mappy
        yields no hit for either reference.
    """
    if not sv.is_third_fil:
        aligner = Align.PairwiseAligner()
        aligner.mode = 'global'
        aligner.match_score = 1
        aligner.mismatch_score = -1
        aligner.open_gap_score = -1
        aligner.extend_gap_score = -0.5
        alignment_beforeSV = aligner.score(query_seq, ref_seq_1)
        alignment_afterSV = aligner.score(query_seq, ref_seq_2)
        return alignment_beforeSV, alignment_afterSV

    tmp_fasta = output_dir + "tmp_query.fasta"
    try:
        with open(tmp_fasta, "w") as handle:
            handle.write('>' + str(sv.idx) + "\n")
            handle.write(query_seq + "\n")

        aligner = mappy.Aligner(fn_idx_in=tmp_fasta)
        hits_before = aligner.map(ref_seq_1, seq2=None, cs=False, MD=False)
        hits_after = aligner.map(ref_seq_2, seq2=None, cs=False, MD=False)

        # Only the first hit of each mapping is used.  An exhausted iterator
        # means "no hit"; any other mappy error now propagates instead of
        # being swallowed by a bare ``except``.
        agt_before = next(hits_before, None)
        if agt_before is None:
            return None, None
        agt_after = next(hits_after, None)
        if agt_after is None:
            return None, None

        alignment_beforeSV = len(query_seq) - (len(ref_seq_1) - agt_before.mlen)
        alignment_afterSV = len(query_seq) - (len(ref_seq_2) - agt_after.mlen)
        return alignment_beforeSV, alignment_afterSV
    finally:
        # Remove the temporary FASTA on every exit path, including errors.
        if os.path.exists(tmp_fasta):
            os.remove(tmp_fasta)
def load_aln(infile):
    """Load a FASTA file as an alignment plus an id-to-sequence lookup.

    Args:
        infile: path of the FASTA file to read.

    Returns:
        ``(msa, seq_by_id)`` where ``msa`` is a ``MultipleSeqAlignment``
        holding every record in file order and ``seq_by_id`` maps each record
        id to its sequence as a plain string.
    """
    msa = Align.MultipleSeqAlignment([])
    seq_by_id = {}
    with open(infile, 'r') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            msa.append(record)
            seq_by_id[record.id] = str(record.seq)
    return msa, seq_by_id
def get_align_score(seq1, seq2):
    """Return the global pairwise alignment score of ``seq1`` against ``seq2``.

    Scoring: match 1, mismatch -1, gap open -1, gap extend -0.5.
    """
    settings = (
        ("mode", 'global'),
        ("match_score", 1),
        ("mismatch_score", -1),
        ("open_gap_score", -1),
        ("extend_gap_score", -0.5),
    )
    aligner = Align.PairwiseAligner()
    for attribute, value in settings:
        setattr(aligner, attribute, value)
    return aligner.score(seq1, seq2)
def alignment(seqs_in, profile, run_id):
    '''Align multiple sequences to a covariance model profile with cmalign.

    input:
      seqs_in: a list of biopython SeqRecord objects (plain strings are
               rejected because record ids are required)
      profile: the filename of a covariance model profile
      run_id:  a run id used to name temporary files and avoid collisions

    output:
      ali:     an rfam multiple sequence alignment
      ref:     the profile reference sequence aligned to ali
      struct:  the profile reference structure aligned to ali

    raises:
      Exception: if the first element of seqs_in is a string.
    '''
    if isinstance(seqs_in[0], str):
        raise Exception(
            'Sorry but string lists are not supported. We need ids!')

    # Records are renamed S000, S001, ... for cmalign and mapped back to the
    # caller's ids afterwards.
    labels = ['S{0:03}'.format(idx) for idx in range(len(seqs_in))]
    seqs = [
        Bio.SeqRecord.SeqRecord(
            Bio.Seq.Seq(
                ''.join([let for let in str(rec.seq) if let in 'AUTGC']),
                Bio.Seq.Alphabet.RNAAlphabet),
            label)
        for label, rec in zip(labels, seqs_in)
    ]
    name_maps = dict(zip(labels, [rec.id for rec in seqs_in]))

    # The original relied on ``idx`` leaking out of a list comprehension
    # (Python 2 only), which left it at the last index.  Compute that value
    # explicitly so the file names are unchanged and Python 3 does not raise
    # NameError.
    last_idx = len(seqs_in) - 1
    infile = cfg.dataPath('infernal/temp/{0}_{1:03}_unaligned.fa'.format(
        run_id, last_idx))
    outfile = cfg.dataPath('infernal/temp/{0}_{1:03}_aligned.stk'.format(
        run_id, last_idx))
    Bio.SeqIO.write(seqs, infile, 'fasta')

    # NOTE(review): the command is a shell string; profile and the temp
    # paths must not contain shell metacharacters.
    cstr = 'cmalign -o {0} {1} {2}'.format(outfile, profile, infile)
    ispc = spc.Popen(cstr, shell=True, stdout=spc.PIPE)
    ispc.communicate()  # wait for cmalign to finish; its stdout is discarded

    with open(outfile) as fopen:
        seqs, ref, struct = rutils.stk_parse(fopen)

    ali = ba.MultipleSeqAlignment(seqs)
    for a in ali:
        a.seq = a.seq.upper()
        a.id = name_maps[a.id]
    return ali, ref, struct
def __init__(self, reference: str, query: str, config: dict):
    """Store the two sequences and configure a PairwiseAligner from ``config``.

    Args:
        reference: reference sequence.
        query: query sequence.
        config: aligner options; missing keys fall back to
            ``mode='global'``, ``open_gap_score=-0.5``,
            ``extend_gap_score=-0.1``, ``target_end_gap_score=0.0`` and
            ``query_end_gap_score=0.0``.
    """
    self.aligner = Align.PairwiseAligner()
    self.reference = reference
    self.query = query

    fallbacks = (
        ('mode', 'global'),
        ('open_gap_score', -0.5),
        ('extend_gap_score', -0.1),
        ('target_end_gap_score', 0.0),
        ('query_end_gap_score', 0.0),
    )
    for option, fallback in fallbacks:
        setattr(self.aligner, option, config.get(option, fallback))
def init_basic_aligner(allow_mismatches=False):
    """Returns an aligner with minimal assumptions about gaps.

    Args:
        allow_mismatches: when true, mismatches score -1; otherwise they
            score negative infinity.

    Returns:
        An ``Align.PairwiseAligner`` with gap score -3 and target gap score
        negative infinity.
    """
    aligner = Align.PairwiseAligner()
    if allow_mismatches:
        aligner.mismatch_score = -1
    aligner.gap_score = -3
    aligner.target_gap_score = -np.inf
    if allow_mismatches:
        return aligner
    # NOTE(review): ``mismatch`` looks like a legacy spelling of
    # ``mismatch_score``; confirm the installed Biopython still accepts it.
    aligner.mismatch = -np.inf
    aligner.mismatch_score = -np.inf
    return aligner
def __init__(self, aligner_config=None):
    """Set up empty containers and configure the pairwise aligner.

    Args:
        aligner_config: keyword arguments for ``self.configure_aligner``;
            when falsy, ``self.aligner_default`` is used instead.
    """
    self._alignment_indices = defaultdict(dict)
    self._cluster_names = defaultdict(dict)
    self.clusters = OrderedDict()
    self.alignments = []
    self.aligner = Align.PairwiseAligner()

    settings = aligner_config if aligner_config else self.aligner_default
    self.configure_aligner(**settings)
def alignSeqs(keySequences, editDistance, count):
    """Find the first later sequence that nearly matches ``keySequences[count]``.

    The sequence at index ``count`` is compared against every sequence after
    it.  The first one whose alignment score lies in
    ``[15 - editDistance, 15)`` is returned with the anchor.

    Args:
        keySequences: indexable collection of sequences.
        editDistance: how far below 15 a score may fall and still count.
        count: index of the anchor sequence (coerced with ``int``).

    Returns:
        ``(anchor, match)`` for the first qualifying pair, or ``None`` when
        no later sequence qualifies.
    """
    aligner = Align.PairwiseAligner()
    aligner.open_gap_score = -0.5
    aligner.extend_gap_score = -0.5

    # NOTE(review): 15 is presumably the score of a perfect match for these
    # keys (i.e. their length) - confirm against the caller.
    perfect_score = 15
    min_score = perfect_score - editDistance

    start = int(count)
    anchor = keySequences[start]
    for candidate in keySequences[start + 1:]:
        # Only the score is needed, so skip building alignment objects
        # (the original materialised one into an unused local).
        score = aligner.score(anchor, candidate)
        if min_score <= score < perfect_score:
            return anchor, candidate
    return None
def sequence_aligner(sequence_id, reference, sequence, chr_name, snpeff_database_name, annotation_file):
    """Align ``sequence`` to ``reference`` and call nucleotide/annotation variants.

    Among all optimal alignments, the one whose rendered query line is
    shortest after stripping leading/trailing ``-`` is used.  Per-column
    running position counters are computed for both aligned strings and
    passed to the variant callers.

    NOTE(review): lines 0 and 2 of ``str(alignment)`` are assumed to be the
    aligned reference and query - confirm for the Biopython version in use.

    Returns:
        ``(annotated_variants, annotations)`` as produced by
        ``call_nucleotide_variants`` and by ``filter_ann_and_variants``
        applied to ``call_annotation_variant``.
    """
    def _query_span(candidate):
        # Length of the rendered query line without flanking gap characters.
        return len(str(candidate).strip().split('\n')[2].strip("-"))

    def _running_positions(aligned, size):
        # positions[i] = number of non-gap symbols in aligned[0..i].
        positions = np.zeros(size, dtype=int)
        seen = 0
        for column, symbol in enumerate(aligned):
            if symbol != '-':
                seen += 1
            positions[column] = seen
        return positions

    aligner = Align.PairwiseAligner()
    # the documentation states we can pass the scores in the constructor of
    # PairwiseAligner but it doesn't work
    aligner.match_score = 3.0
    aligner.mismatch_score = -2.0
    aligner.open_gap_score = -2.5
    aligner.extend_gap_score = -1

    ranked = sorted(aligner.align(reference, sequence), key=_query_span)
    rendered = str(ranked[0]).strip().split('\n')
    ref_aligned = rendered[0]
    seq_aligned = rendered[2]
    print(f'#\n#\n#Pipeline: {"Alignment done"} \n#\n#')

    # Both arrays are sized from the aligned query, as in the original.
    n_columns = len(seq_aligned)
    ref_positions = _running_positions(ref_aligned, n_columns)
    seq_positions = _running_positions(seq_aligned, n_columns)

    annotated_variants = call_nucleotide_variants(
        sequence_id, reference, sequence, ref_aligned, seq_aligned,
        ref_positions, seq_positions, chr_name, snpeff_database_name)
    print(f'#\n#\n#Pipeline: {"Nuc variant called"} \n#\n#')

    raw_annotations = call_annotation_variant(
        annotation_file, ref_aligned, seq_aligned, ref_positions,
        seq_positions)
    annotations = filter_ann_and_variants(raw_annotations)
    print(f'#\n#\n#Pipeline: {"AA variant called"} \n#\n#')

    return annotated_variants, annotations
def remove_positions_with_gaps_in_first_sequence(input_fasta, output_fasta):
    """Drop every alignment column where the first sequence has a gap.

    Args:
        input_fasta: path of the aligned FASTA to read.
        output_fasta: path of the FASTA to write.

    Returns:
        ``output_fasta``, unchanged.

    Raises:
        IndexError: if the first sequence consists only of gaps.
    """
    source = AlignIO.read(str(input_fasta), 'fasta')
    reference_row = str(source[0].seq)
    kept_columns = [
        column for column, symbol in enumerate(reference_row) if symbol != '-'
    ]

    # Seed the result with the first kept column, then append the remaining
    # ones one single-column slice at a time.
    head = kept_columns[0]
    trimmed = Align.MultipleSeqAlignment(source[:, head:head + 1])
    for column in kept_columns[1:]:
        trimmed = trimmed + source[:, column:column + 1]

    AlignIO.write(trimmed, str(output_fasta), 'fasta')
    return output_fasta
def load_aln_to_repair(infile, omit):
    """Load a FASTA alignment, leaving the records named in ``omit`` out of it.

    Args:
        infile: path of the FASTA file to read.
        omit: container of record names to exclude from the alignment.

    Returns:
        ``(kept, all_seqs)``: ``kept`` is a ``MultipleSeqAlignment`` without
        the omitted records; ``all_seqs`` maps every record id (omitted ones
        included) to its sequence string.
    """
    kept = Align.MultipleSeqAlignment([])
    all_seqs = {}
    with open(infile, 'r') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            all_seqs[record.id] = str(record.seq)
            if record.name in omit:
                continue
            kept.append(record)
    return kept, all_seqs
def save_alignment(tt: TreeTime, config: TreetimeConfig):
    """Write the sequence of every clade in ``tt.tree`` to a FASTA file.

    Args:
        tt: object whose ``tree.find_clades()`` yields nodes with ``name``
            and ``sequence`` attributes.
        config: configuration providing the path ``output_filenames.FASTA``.
    """
    def _as_record(node):
        # Clade name doubles as record id and name; description stays empty.
        return SeqRecord.SeqRecord(
            Seq.Seq("".join(node.sequence)),
            id=node.name,
            name=node.name,
            description="",
        )

    msa = Align.MultipleSeqAlignment(
        [_as_record(node) for node in tt.tree.find_clades()])
    with open(config.output_filenames.FASTA, "w") as handle:
        AlignIO.write(msa, handle, "fasta")
def populate_from_pair(g_1, g_2, trans_m, emiss_m, N = 10):
    '''
    Fill the transition and emission matrices from at most N optimal
    alignments of genomes g_1 and g_2.

    The tables are filled with occurrence counts; no conversion to a
    probability matrix is performed here.  Pass N=None to use every
    alignment.
    '''
    aligner = Align.PairwiseAligner()
    used = 0
    for alignment in aligner.align(g_1, g_2):
        # Stop once N alignments were consumed (N=None means no limit).
        if N is not None and used >= N:
            break
        populate_from_aligment(alignment, trans_m, emiss_m)
        used += 1
def get_nuc_aligner() -> Align.PairwiseAligner:
    """Build a nucleotide aligner with an IUPAC-aware substitution matrix.

    Gap scores are -2.5 (open) and -1 (extend), with all left/right end-gap
    scores set to 0.  Each IUPAC code matches the codes listed for it with a
    score that decreases as the code gets more ambiguous (3.0, 2, 1.5, 1);
    every other pair scores -2.1.
    """
    from Bio.Align.substitution_matrices import Array

    aligner = Align.PairwiseAligner()
    # the documentation states we can pass the scores in the constructor of
    # PairwiseAligner but it doesn't work
    aligner.match_score = 3.0
    aligner.mismatch_score = -2.1
    aligner.open_gap_score = -2.5
    aligner.extend_gap_score = -1
    # End gaps are free on both sides.
    for kind in ("extend", "open"):
        for side in ("right", "left"):
            setattr(aligner, side + "_" + kind + "_gap_score", 0)

    # Score keyed by the number of codes a symbol is compatible with.
    score_by_degeneracy = {1: aligner.match_score, 3: 2, 10: 1.5, 16: 1}
    two_base_codes = "yrwskm"
    any_code = "agctyrwskmdvhbnx"
    compatible = {
        "a": "a",
        "g": "g",
        "c": "c",
        "t": "t",
        # len 3
        "y": "cty",
        "r": "agr",
        "w": "atw",
        "s": "gcs",
        "k": "tgk",
        "m": "cam",
        # len 10
        "d": "agtd" + two_base_codes,
        "v": "acgv" + two_base_codes,
        "h": "acth" + two_base_codes,
        "b": "cgtb" + two_base_codes,
        # len 16
        "n": any_code,
        "x": any_code,
    }

    extra_characters = ""
    alphabet = "".join(compatible) + extra_characters
    size = len(alphabet)
    matrix = Array(
        alphabet=alphabet,
        dims=2,
        data=np.full((size, size), aligner.mismatch_score, dtype=float),
    )
    for code, partners in compatible.items():
        score = score_by_degeneracy[len(partners)]
        for partner in partners:
            matrix[code, partner] = score
            matrix[partner, code] = score

    aligner.substitution_matrix = matrix
    return aligner
def test_mite():
    """Grid-search inverted-repeat detection parameters on a RepeatMasker library.

    For each minimum score (5..10) and window size (10..45 step 5), every
    EMBL record is scored by locally aligning its first ``rt_win`` letters
    against the reverse complement of ``record.seq[-lt_win:-1]``.  Records
    whose 'comment' annotation contains 'Type: DNA' count as positives.
    Sensitivity, precision and F-measure are printed per parameter pair.
    """
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'
    aligner.open_gap_score = -1.0
    aligner.extend_gap_score = -1.0
    # NOTE(review): ``mismatch`` looks like a legacy spelling of
    # ``mismatch_score``; confirm the installed Biopython accepts it.
    aligner.mismatch = -1.0
    for ir_min_score in range(5, 11):
        for lt_win in range(10, 50, 5):
            dna_counter = 0
            tp_counter = 0
            fp_counter = 0
            rt_win = lt_win
            # The library file is re-parsed for every parameter combination.
            for record in SeqIO.parse(
                    "/Users/zakarota/Tools/repeatMasker/Libraries/RepeatMaskerLib.embl",
                    "embl"):
                mark = 'Type: DNA'
                if mark in record.annotations['comment']:
                    dna_counter += 1
                # The slice stops at -1, so the record's last letter is not
                # part of the window.
                rc = record.seq[-lt_win:-1].reverse_complement()
                score = aligner.score(str(record.seq[0:rt_win]), str(rc))
                # print(score)
                if score >= ir_min_score and mark in record.annotations[
                        'comment']:
                    tp_counter += 1
                if score >= ir_min_score and not mark in record.annotations[
                        'comment']:
                    fp_counter += 1
            # Raises ZeroDivisionError if no record carries the DNA mark.
            sensitivity = 100 * tp_counter / dna_counter
            if tp_counter > 0 or fp_counter > 0:
                precision = 100 * tp_counter / (tp_counter + fp_counter)
            else:
                precision = 0
            if sensitivity > 0 or precision > 0:
                f_measure = 2 * (sensitivity * precision) / (sensitivity +
                                                             precision)
            else:
                f_measure = 0
            print('Left window:', lt_win, 'Right window:', rt_win,
                  'Min score:', ir_min_score, 'Sensitivity:', sensitivity,
                  'Precision:', precision, 'F-measure:', f_measure)
def separate_alignments(msa_data, sus_ids, out_dir, filename, patient_zero='NC_045512.2'):
    """Split an alignment into trusted and to-inspect FASTA files.

    Records whose id is in ``sus_ids`` go only to the inspect file.  All
    other records go to the trusted file; the ``patient_zero`` record, when
    not flagged, is written to both.

    Args:
        msa_data: iterable of aligned records.
        sus_ids: container of record ids flagged for inspection.
        out_dir: output directory; must support the ``/`` operator.
        filename: stem for the two output file names.
        patient_zero: id of the record copied into both outputs.

    Returns:
        0.
    """
    trusted, flagged = [], []
    for rec in msa_data:
        if rec.id in sus_ids:
            flagged.append(rec)
            continue
        trusted.append(rec)
        if rec.id == patient_zero:
            flagged.append(rec)

    outputs = (
        (trusted, '_aligned_white.fa'),
        (flagged, '_aligned_inspect.fa'),
    )
    for rows, suffix in outputs:
        msa = Align.MultipleSeqAlignment(rows)
        AlignIO.write(msa, out_dir / (filename + suffix), 'fasta')
    return 0
def Needleman_Wunsch_alignment(seq1, seq2):
    '''
    Globally align seq1 and seq2 with Biopython's PairwiseAligner
    (BLOSUM62, gap open -10, gap extend -0.5).

    Any "-" already present in seq1 is removed before aligning and put back
    afterwards as a column that is "-" in both outputs; columns that end up
    "-" in both sequences are then dropped.

    NOTE(review): lines 0 and 2 of ``str(alignment)`` are assumed to be the
    two aligned sequences - confirm for the Biopython version in use.

    Returns:
        (final1, final2): the aligned versions of seq1 and seq2.
    '''
    missing = None
    if "-" in seq1:
        # Need to handle "-" beforehand, otherwise the alignment may fail
        missing = [s == "-" for s in seq1]
        seq1 = seq1.replace("-", "")

    aligner = Align.PairwiseAligner()
    aligner.open_gap_score = -10
    aligner.extend_gap_score = -0.5
    aligner.substitution_matrix = blosum62
    alignment = aligner.align(seq1, seq2)[0]
    alignment_info = str(alignment).split("\n")
    aligned1, aligned2 = alignment_info[0], alignment_info[2]

    if missing is None:
        return aligned1, aligned2

    # Walk the original seq1 layout, re-inserting its gap columns.
    n_cols = len(aligned1)
    parts1 = []
    parts2 = []
    j = 0
    for was_gap in missing:
        if was_gap:
            parts1.append("-")
            parts2.append("-")
            continue
        # Bounds check comes first: the original indexed aligned1[j] before
        # testing j, which could raise IndexError at the end of the string.
        while j < n_cols and aligned1[j] == "-":
            parts1.append(aligned1[j])
            parts2.append(aligned2[j])
            j += 1
        if j < n_cols:
            parts1.append(aligned1[j])
            parts2.append(aligned2[j])
            j += 1
    if j < n_cols:
        parts1.append(aligned1[j:])
        parts2.append(aligned2[j:])

    # Cleaning up: drop columns that are gaps in both sequences.
    kept = [
        (a, b)
        for a, b in zip("".join(parts1), "".join(parts2))
        if not (a == "-" and b == "-")
    ]
    final1 = "".join(a for a, _ in kept)
    final2 = "".join(b for _, b in kept)
    return final1, final2
def __init__(self, aligner_config=None):
    """Set up empty lookup tables and configure the pairwise aligner.

    Args:
        aligner_config: keyword arguments for ``self.configure_aligner``;
            when falsy, ``self.aligner_default`` is used instead.
    """
    # Lookup dictionaries
    self._genes, self._loci, self._links = {}, {}, {}
    self._alignment_indices = defaultdict(dict)
    self._cluster_names = defaultdict(dict)

    self.alignments = {}
    self.aligner = Align.PairwiseAligner()
    self.clusters = OrderedDict()

    settings = aligner_config if aligner_config else self.aligner_default
    self.configure_aligner(**settings)
def test_align():
    """Print every optimal alignment of a short motif against a longer string.

    Determining how a string matches is a key challenge; this prints each
    alignment, then the alignment score and the aligner's settings.
    """
    aligner = Align.PairwiseAligner()
    searchstr = 'ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG'
    searchfor = 'GAATGG'
    alignments = aligner.align(searchstr, searchfor)
    for alignment in alignments:
        print(alignment)
    # The original printed ``aligner.score`` (the bound method object)
    # rather than a score; report the score of the computed alignments.
    print(alignments.score)
    print(aligner)
def get_aligned_subsequence(ref='jejuni'):
    """Align every entry of the module-level ``sequences`` to the reference.

    Whitespace is stripped from both sequences before aligning with a
    default ``PairwiseAligner``.  Every key is printed, including ``ref``.

    Args:
        ref: key in ``sequences`` of the reference (alignment target).

    Returns:
        dict mapping each non-reference key to the ``aligned`` attribute of
        its first alignment.
    """
    found = {}
    for key in sequences:
        print(key)
        if key != ref:
            target = ''.join(sequences[ref].split())
            query = ''.join(sequences[key].split())
            found[key] = Align.PairwiseAligner().align(target, query)  # target, query

    return {key: hits[0].aligned for key, hits in found.items()}
def pairwise_aligner(self):
    """Score ``self.sequence1`` against ``self.sequence2`` into ``self.score``.

    Uses gap open -10, gap extend -0.5 and zero-cost end gaps on both target
    and query.  Only the score is computed (no alignment objects are built),
    which keeps memory use and run time down.
    """
    gap_settings = (
        ("open_gap_score", -10),
        ("extend_gap_score", -0.5),
        ("target_end_gap_score", 0.0),
        ("query_end_gap_score", 0.0),
    )
    aligner = Align.PairwiseAligner()
    for attribute, value in gap_settings:
        setattr(aligner, attribute, value)
    self.score = aligner.score(self.sequence1, self.sequence2)
def main():
    """Enqueue a global alignment job and poll until its result is available.

    The job runs ``global_align(aligner, x, y, matrix)`` with the BLOSUM62
    substitution matrix.  The result is polled every 2 seconds, printing the
    job id and current result each time, for at most 100001 polls.

    Raises:
        TimeoutError: if the job has produced no result when polling stops.
    """
    q = Queue(connection=conn)
    mat_name = "BLOSUM62"
    matrix = substitution_matrices.load(mat_name)
    aligner = Align.PairwiseAligner()
    job = q.enqueue(global_align, args=(aligner, x, y, matrix))

    max_polls = 100000
    count = 0
    # Identity test: ``!= None`` would invoke the result object's own
    # comparison, which need not return a plain bool.
    while job.result is None and count <= max_polls:
        time.sleep(2)
        count += 1
        print(f'job.get_id(): {job.get_id()}, '
              f'job.result:{job.result}')

    alignments = job.result
    if alignments is None:
        # Previously this path crashed with a TypeError on ``None[0]``.
        raise TimeoutError(
            f'job {job.get_id()} produced no result after {count} polls')
    print(f'alignments[0]:{alignments[0]}\n score: {alignments[0].score}')
def compute_similarity_kernel_matrices(dataset):
    """
    Computes the drug-drug and protein-protein kernel matrices for
    kernel-based methods (e.g. Kron-RLS).

    Drug similarity is the Tanimoto coefficient of the compounds'
    ``fingerprint`` attributes.  Protein similarity is the local
    (Smith-Waterman) alignment score of ``sequence[1]``, normalised by the
    square roots of the two self-alignment scores.

    :param dataset: object whose ``X`` iterates over (compound, protein) pairs
    :return: tuple ``(comps_mat, prots_mat)`` of dicts keyed by ``Pair``
    """
    start = time.time()
    print("Computing kernel matrices (KD_dict, KT_dict)")
    all_comps = set()
    all_prots = set()
    for mol, prot in dataset.X:
        all_comps.add(mol)
        all_prots.add(prot)

    # compounds / drugs
    comps_mat = {}
    for c1 in all_comps:
        fp1 = c1.fingerprint
        for c2 in all_comps:
            # Tanimoto coefficient
            comps_mat[Pair(c1, c2)] = DataStructs.TanimotoSimilarity(
                fp1, c2.fingerprint)

    # proteins / targets
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'  # SW algorithm

    # Each self-alignment score is computed once per protein rather than
    # once per pair, removing n^2 redundant alignments.
    self_norm = {}
    for prot in all_prots:
        seq = prot.sequence[1]
        self_norm[prot] = sqrt(aligner.score(seq, seq))

    prots_mat = {}
    for p1 in all_prots:
        seq1 = p1.sequence[1]
        for p2 in all_prots:
            score = aligner.score(seq1, p2.sequence[1])
            # Normalized SW score
            prots_mat[Pair(p1, p2)] = score / (self_norm[p1] * self_norm[p2])

    print("Kernel entities: Drugs={}, Prots={}".format(len(all_comps),
                                                       len(all_prots)))
    duration = time.time() - start
    print("Kernel matrices computation finished in: {:.0f}m {:.0f}s".format(
        duration // 60, duration % 60))
    return comps_mat, prots_mat
def fetch_seqs(seqs_filepath, out_fp, sample_idxs: list, is_aligned=False, is_gzip=False):
    """Copy the records whose id is in ``sample_idxs`` to a new FASTA file.

    Args:
        seqs_filepath: input FASTA path (gzip-compressed when ``is_gzip``).
        out_fp: output FASTA path or handle.
        sample_idxs: record ids to keep.
        is_aligned: read the input as one alignment (``AlignIO``) and write
            an alignment; otherwise handle plain records via ``SeqIO``.
        is_gzip: open the input with ``gzip`` in text mode.

    Returns:
        Whatever ``AlignIO.write`` / ``SeqIO.write`` returns.
    """
    wanted = set(sample_idxs)  # O(1) membership tests

    def _select(source):
        # Materialise matches immediately: SeqIO.parse is lazy, so records
        # must be consumed while ``source`` is still open.
        if is_aligned:
            records = AlignIO.read(source, 'fasta')
        else:
            records = SeqIO.parse(source, 'fasta')
        return [rec for rec in records if rec.id in wanted]

    if is_gzip:
        # The original filtered the unaligned records after this ``with``
        # block had closed the handle, which fails on a closed file.
        with gzip.open(seqs_filepath, "rt") as handle:
            selected = _select(handle)
    else:
        selected = _select(seqs_filepath)

    if is_aligned:
        return AlignIO.write(
            Align.MultipleSeqAlignment(selected), out_fp, 'fasta')
    return SeqIO.write(selected, out_fp, 'fasta')
def consensus_from_alignment (align):
    """Return an IUPAC-ambiguity consensus sequence for an alignment.

    Gaps are first replaced by "N".  For every column, all bases compatible
    with any symbol present (count > 0) are collected and the matching code
    is looked up in ``iupac_dna``.  When the module-level ``ambiguous_dna``
    is truthy (biopython < 1.78) it is passed to ``Seq`` as the alphabet.
    """
    def _make_seq(text):
        # Older biopython needs the explicit alphabet argument.
        if ambiguous_dna:
            return Seq.Seq(text, ambiguous_dna)
        return Seq.Seq(text)

    gapless = [
        SeqRecord(_make_seq(str(rec.seq).replace("-", "N")),
                  id=rec.id, description=rec.description)
        for rec in align
    ]
    # SummaryInfo must be given an MSA, not a plain list.
    summary = AlignInfo.SummaryInfo(Align.MultipleSeqAlignment(gapless))
    pssm = summary.pos_specific_score_matrix(chars_to_ignore=["-"])

    # pssm example per column: {'-':3, 'A':0, 'T':4.0, 'G':0, 'C':2.0, 'N':1}
    # Only presence matters, not frequency.  A base may itself be ambiguous,
    # e.g. "R" expands to A and G.
    codes = []
    for column in pssm:
        present = {
            nucleotide
            for base, count in column.items()
            for nucleotide in IUPACData.ambiguous_dna_values[base]
            if count > 0
        }
        codes.append(iupac_dna[''.join(sorted(present))])

    return _make_seq(''.join(codes))
def pop_row(aln, seqid):
    '''
    Pop a row from an alignment by sequence id.

    aln: a Bio.Align.MultipleSeqAlignment object
    seqid: id of the Bio.SeqRecord.SeqRecord to pop from aln

    Returns a tuple containing the popped SeqRecord and a new alignment
    built from the remaining records only.

    Raises KeyError if no record in aln has id seqid.
    '''
    aln_d = SeqIO.to_dict(aln)
    seq = aln_d.pop(seqid)
    # ``dict.itervalues()`` exists only on Python 2; ``values()`` works on
    # both Python 2 and 3.
    aln = Align.MultipleSeqAlignment(list(aln_d.values()))
    return seq, aln
def main():
    """Run the ABBA-BABA computation for every record of the input FASTAs.

    Writes the list of input files next to the output (``.abbababa``
    replaced by ``.flist``), indexes every input FASTA and the ancestral
    FASTA, then for each record name builds a per-chromosome alignment
    truncated to the shortest sequence and passes it to ``do_abbababa``.
    Finally the collected positions are written as BED.

    The module-level ``f_ab``, ``f_extra`` and ``bt_positions`` are
    (re)bound here.

    Returns:
        1.
    """
    global f_ab, f_extra, bt_positions
    seqs = {}
    records = []
    fname_list = [basename(fpath) for fpath in options.input_files]
    with open(options.output.replace(".abbababa", ".flist"), "w") as fout:
        fout.write("\n".join(options.input_files))

    for fpath in options.input_files:
        fname = basename(fpath)
        seqs[fname] = SeqIO.index(fpath, "fasta")
        records.extend(seqs[fname].keys())
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3.
        print(fname)
    anc = SeqIO.index(options.anc, "fasta")
    print("\n")
    records = set(str(r) for r in records)

    bt_positions = BedToolPositions()
    f_ab = open(options.output, "w")
    try:
        f_extra = open(options.extra, "w")
        try:
            for record in sorted(records):
                sequences = [
                    seqs.get(seq_key).get(record) for seq_key in fname_list
                ]
                # Truncate everything to the shortest sequence so the rows
                # form a rectangular alignment.
                min_alignment_length = min(
                    [len(sequence) for sequence in sequences] +
                    [len(anc.get(record))])
                per_chr_alignment = Align.MultipleSeqAlignment(
                    [sequence[:min_alignment_length]
                     for sequence in sequences])
                do_abbababa(per_chr_alignment,
                            anc.get(record)[:min_alignment_length])
            bt_positions.write_to_BED(options.bed_out)
        finally:
            # Close the outputs even if a record fails.
            f_extra.close()
    finally:
        f_ab.close()
    return 1
def clean_alignment(alignment):
    """
    Remove ambiguities from alignment.

    Iterate over sites in the alignment and build a new alignment containing
    either only pure ATGC sites (-c) or sites with up to a specified
    proportion of N's (-c -n FLOAT).  The mode is chosen by the truthiness of
    the module-level ``args.n_ratio``.

    Returns:
        A new alignment holding only the retained columns.
    """
    n_rows = len(alignment[:, 0])
    n_cols = len(alignment[0])
    # Start from zero-length copies of every record and append kept columns.
    cleaned_alignment = Align.MultipleSeqAlignment(
        [seq[:0] for seq in alignment])

    if args.n_ratio:
        logging.info(f"Removing sites with > {int(args.n_ratio * 100)}% of " +
                     f"N's from '{alignment[0].name}'")
        for pos in range(n_cols):
            column = alignment[:, pos]
            n_fraction = column.upper().count('N') / n_rows
            if n_fraction <= args.n_ratio:
                cleaned_alignment += alignment[:, pos:pos + 1]
    else:
        logging.info("Removing sites with ambiguities from " +
                     f"'{alignment[0].name}'")
        ambiguous = frozenset('NYRKMWSBDHV-')
        for pos in range(n_cols):
            column = alignment[:, pos].upper()
            # The original ``break`` on the first ambiguous column (when
            # there were more rows than ambiguity codes) ended the whole
            # scan and silently dropped every later column.  Skip just the
            # offending column instead.
            if any(symbol in ambiguous for symbol in column):
                continue
            cleaned_alignment += alignment[:, pos:pos + 1]
    return cleaned_alignment