def maskResiduesNOMAP(refMSA_file, numseq, alnlen, scores, x, formatout, final_file, seqType):
    """Mask poorly aligned residues and write the masked alignment to a file.

    Every non-gap residue whose score is strictly below ``x`` is replaced by
    ``?``. Gap characters (``-``) are never masked.

    Args:
        refMSA_file: FASTA alignment to read (passed to ``AlignIO.read``).
        numseq: Number of alignment rows to process.
        alnlen: Number of columns to process in each row.
        scores: Per-residue scores indexed as ``scores[row][position]``; each
            entry that is looked up must be convertible with ``float()``.
        x: Masking threshold, convertible with ``float()``.
        formatout: Format name handed to ``MultipleSeqAlignment.format``.
        final_file: Path the masked alignment is written to.
        seqType: ``'protein'`` or ``'dna'``; selects the output alphabet.

    Raises:
        ValueError: If ``seqType`` is neither ``'protein'`` nor ``'dna'``.
            (Previously this surfaced as an unbound-variable error.)
    """
    mask_char = '?'

    # Resolve the alphabet once and fail early with a clear message instead of
    # hitting an unbound ``aln_record`` after all the masking work is done.
    seq_type = str(seqType)
    if seq_type == 'protein':
        alphabet = generic_protein
    elif seq_type == 'dna':
        alphabet = generic_dna
    else:
        raise ValueError(
            "seqType must be 'protein' or 'dna', got %r" % (seqType,))

    parsed = AlignIO.read(refMSA_file, 'fasta')
    threshold = float(x)

    maskedMSA = MultipleSeqAlignment([])
    for row in range(numseq):
        residues = str(parsed[row].seq)
        # Scores are only consulted for real residues, never for gaps.
        masked = ''.join(
            mask_char
            if residues[position] != '-'
            and float(scores[row][position]) < threshold
            else residues[position]
            for position in range(alnlen)
        )
        # Output records are renamed to their 1-based row number.
        maskedMSA.append(
            SeqRecord(Seq(masked, alphabet), id=str(row + 1), description=''))

    # ``with`` guarantees the handle is closed even if formatting fails.
    with open(final_file, 'w') as outhandle:
        outhandle.write(maskedMSA.format(str(formatout)))
def show_alignments(ali_path, out_format):
    """Print each stored query/template pair as a two-row alignment.

    Args:
        ali_path: Path of the array file loaded with ``np.load``; each item is
            indexed as ``item[0]`` (query) and ``item[1]`` (template).
        out_format: Format name passed to ``MultipleSeqAlignment.format``.
            For ``'pir'`` the records get PIR-style description fields;
            for any other format the descriptions are empty.
    """
    if out_format == 'pir':
        query_description = 'sequence:::::::::'
        template_description = 'structureX:::::::::'
    else:
        query_description = ''
        template_description = ''

    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # files from a trusted source.
    for pair in np.load(ali_path, allow_pickle=True):
        query = SeqRecord(
            Seq(pair[0], generic_protein),
            id='Query', name='', description=query_description)
        template = SeqRecord(
            Seq(pair[1], generic_protein),
            id='Template', name='', description=template_description)
        print(MultipleSeqAlignment([query, template]).format(out_format))
def to_string(self, begin=None, end=None, **kwargs):
    # type: (Union[int, None], Union[int, None], Dict[str, Any]) -> str
    """Render the alignment, with its markers as extra rows, as text.

    When the ``format`` keyword is ``"pretty"`` the work is delegated to
    ``_to_string_pretty``; otherwise the result is Clustal-formatted text.

    Args:
        begin: First column to include, or None.
        end: Column to stop before, or None.
        **kwargs: Only ``format`` is read here; everything is forwarded to
            ``_to_string_pretty`` for the pretty layout.

    Returns:
        The formatted alignment string.
    """
    output_format = get_value(kwargs, "format", None)
    if output_format == "pretty":
        return self._to_string_pretty(begin, end, **kwargs)

    # Marker rows come first; their ids are prefixed with '#'.
    records = []
    for marker in self.list_msa_markers:
        records.append(
            SeqRecord(Seq(marker.to_string(begin, end)),
                      id="#{}".format(marker.name)))

    # A window is requested as soon as either bound is given; the missing
    # bound then defaults to the corresponding end of the alignment.
    windowed = not (begin is None and end is None)
    if windowed:
        if begin is None:
            begin = 0
        if end is None:
            end = self.alignment_length()

    for sequence in self.list_alignment_sequences:
        records.append(sequence[begin:end] if windowed else sequence)

    return MultipleSeqAlignment(records).format("clustal")
def _write_alignment_window(alignment, start, stop, path, label, outformat):
    """Write columns ``start:stop`` of ``alignment`` to ``path``.

    For NEXUS output the window is routed through ``Nexus.combine`` under the
    name ``label``; every other format is written with ``AlignIO.write``.
    """
    with open(path, 'w') as output_handle:
        window = MultipleSeqAlignment(alignment[:, start:stop],
                                      alphabet=generic_dna)
        if outformat.lower() == 'nexus':
            nexus_text = window.format('nexus')
            combined = Nexus.combine([(label, Nexus.Nexus(nexus_text))])
            combined.write_nexus_data(output_handle)
        else:
            AlignIO.write(window, output_handle, outformat)


def alignment_slicer(input, informat, outformat, SNPs, slide):
    """Cut an alignment into sliding column windows, one output file each.

    Files are named ``<input>_site<start>to<end>.<outformat>``. Full windows
    of ``SNPs`` columns are written every ``slide`` columns; any columns left
    after the last full window are written as one shorter trailing window.

    The window size and step now come from the ``SNPs`` and ``slide``
    arguments. They used to be read from a global ``args`` object, which
    silently ignored the values passed in.

    Args:
        input: Path of the alignment to read; also the output file prefix.
        informat: Input format name for ``AlignIO.read``.
        outformat: Output format name; ``'nexus'`` (any case) is special-cased.
        SNPs: Number of columns per window.
        slide: Number of columns to advance between windows.

    Raises:
        ValueError: If ``slide`` is not positive (the loop would never end).
    """
    if slide <= 0:
        raise ValueError("slide must be a positive number of columns")

    alignment = AlignIO.read(input, informat, alphabet=generic_dna)
    length_alignment = len(alignment[0].seq)

    start = 0
    end = start + SNPs
    while end <= length_alignment:
        label = 'site' + str(start) + 'to' + str(end)
        _write_alignment_window(
            alignment, start, end,
            input + '_' + label + '.' + outformat, label, outformat)
        start += slide
        end += slide

    # Trailing partial window. Skipped when no columns remain: a zero-column
    # window carries no data, and some writers may reject empty alignments.
    if start < length_alignment:
        path = (input + '_site' + str(start) + 'to' + str(length_alignment)
                + '.' + outformat)
        # NOTE(review): the NEXUS label keeps the un-clamped ``end`` exactly as
        # before, so it can differ from the file name; confirm which is wanted.
        label = 'site' + str(start) + 'to' + str(end)
        _write_alignment_window(
            alignment, start, length_alignment, path, label, outformat)

    print("\ndone\n")
def test_alnRemoveGapOnlyCols(self):
    """Gap-only columns are dropped; columns with any residue are kept."""

    def make_alignment(rows):
        # Each row is (label, residues); the label is used as id and name.
        return MultipleSeqAlignment([
            SeqRecord(Seq(residues), id=label, name=label)
            for label, residues in rows
        ])

    aln = make_alignment([
        ('s1', 'A-TT---TTAA---'),
        ('s2', 'AATT---TTAA---'),
    ])
    alnnogap = make_alignment([
        ('s1', 'A-TTTTAA'),
        ('s2', 'AATTTTAA'),
    ])

    # Compare the FASTA renderings rather than the alignment objects; the
    # original author notes the objects themselves do not compare equal.
    observed = Milraa.alnRemoveGapOnlyCols(aln).format('fasta')
    expected = alnnogap.format('fasta')
    self.assertEqual(observed, expected)
def convert_a2m(ali):
    """Strip lowercase letters and dots from a FASTA-parsable alignment.

    Args:
        ali: Alignment text that ``AlignIO.read`` can parse as FASTA.

    Returns:
        FASTA text of the same records with every character matching
        ``[a-z.]`` removed from each sequence.
    """
    handle = cStringIO.StringIO(ali)
    records = AlignIO.read(handle, 'fasta')
    handle.close()

    stripped = []
    for record in records:
        kept_columns = re.sub(r'[a-z.]', '', str(record.seq))
        # The parsed record is updated in place and reused for the output.
        record.seq = Seq(kept_columns, SingleLetterAlphabet())
        stripped.append(record)

    return MultipleSeqAlignment(stripped).format('fasta')
del letters

print("testing reading and writing clustal format...")
test_dir = os.path.join(os.getcwd(), 'Clustalw')
test_names = ['opuntia.aln', 'cw02.aln']

# Build the list of paths and round-trip each file in a single pass:
# parse it as Clustal, then print it back out in the same format.
test_files = []
for name in test_names:
    test_file = os.path.join(test_dir, name)
    test_files.append(test_file)
    alignment = AlignIO.read(test_file, "clustal")
    print(alignment.format("clustal"))

# Re-read the first file, this time with an explicit gapped DNA alphabet.
alignment = AlignIO.read(test_files[0], "clustal",
                         alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))

# Exercise the basic alignment accessors.
print('all_seqs...')
for seq_record in alignment:
    print('description: %s' % seq_record.description)
    print('seq: %r' % seq_record.seq)
print('length: %i' % alignment.get_alignment_length())

print('Calculating summary information...')
align_info = AlignInfo.SummaryInfo(alignment)
dumb_consensus = align_info.dumb_consensus(ambiguous="N", threshold=0.6)
simple_consensus = align_info.simple_consensus(ambiguous="N", threshold=0.6)
del letters

print("testing reading and writing clustal format...")
test_dir = os.path.join(os.getcwd(), 'Clustalw')
test_names = ['opuntia.aln', 'cw02.aln']

# Build the list of paths and round-trip each file in a single pass:
# parse it as Clustal, then print it back out in the same format.
test_files = []
for name in test_names:
    test_file = os.path.join(test_dir, name)
    test_files.append(test_file)
    alignment = AlignIO.read(test_file, "clustal")
    print(alignment.format("clustal"))

# Re-read the first file, this time with an explicit gapped DNA alphabet.
alignment = AlignIO.read(test_files[0], "clustal",
                         alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))

# Exercise the basic alignment accessors.
print('all_seqs...')
for seq_record in alignment:
    print('description: %s' % seq_record.description)
    print('seq: %r' % seq_record.seq)
print('length: %i' % alignment.get_alignment_length())

print('Calculating summary information...')
align_info = AlignInfo.SummaryInfo(alignment)
consensus = align_info.dumb_consensus()
# Taxa whose fraction of gap characters reaches this value are dropped.
taxon_sequence_threshold = 0.40

# Parse inside ``with`` so the input handle is closed right after reading.
with open(infile) as in_handle:
    in_alignment = AlignIO.read(in_handle, "nexus",
                                alphabet=Gapped(IUPAC.protein))
alignment_length = in_alignment.get_alignment_length()

out_alignment = MultipleSeqAlignment([], alphabet=Gapped(IUPAC.protein))
for seq_record in in_alignment:
    # Fraction of this taxon's columns that are gaps ("-").
    missing_data = float(seq_record.seq.count("-")) / float(alignment_length)
    print(seq_record.id + "\t" + str(missing_data))
    if missing_data < taxon_sequence_threshold:
        out_alignment.append(seq_record)

# Derive the output prefix by stripping the ".nexus" suffix. The previous
# regex kept only the text from the first "/" onwards, which dropped leading
# directories of relative paths and crashed on names without any "/".
nexus_suffix = ".nexus"
if not infile.endswith(nexus_suffix):
    raise ValueError("Input file name must end in .nexus: " + infile)
outname = infile[:-len(nexus_suffix)] + "." + str(taxon_sequence_threshold)

# ``with`` closes both outputs even when formatting fails part-way through.
with open(outname + ".nexus", "w") as nexfile, \
        open(outname + ".phylip", "w") as phyfile:
    try:
        nexfile.write(out_alignment.format("nexus"))
        phyfile.write(out_alignment.format("phylip"))
    except Exception:
        # Best effort: report the failure and carry on. ``Exception`` rather
        # than a bare ``except`` so Ctrl-C and SystemExit still propagate.
        print("Could not write alignment " + infile + "  "
              + str(sys.exc_info()[0]))
# Optionally add the reference sequence as one more individual.
if args.extract_ref:
    inddict['reference'] = 'reference'
    seqdict['reference'] = [refseq]

# Generate the alignment: one record per individual.
for ind in inddict.keys():
    # Only index 0 of each individual's sequence list is used.
    for i in range(1):
        seqrec = SeqRecord(Seq(seqdict[ind][i], generic_dna),
                           id=ind + "_" + str(i),
                           description=ind + "_" + str(i))
        seqrecords.append(seqrec)
align = MultipleSeqAlignment(seqrecords)

# Write the alignment in each requested format; the file extension is the
# first three characters of the format name.
for f in formats:
    print("writing to " + f + "\n")
    # ``with`` closes the file even if formatting or the traits block fails.
    with open(args.out_prefix + '.' + f[:3], 'w') as OUT:
        OUT.write(align.format(f))
        # Compare by value: ``is`` tests identity and only matched before by
        # accident of string interning.
        if f == 'nexus' and args.popmap:
            add_traits_block(p_list=pops_list, i_dict=inddict, nexhandle=OUT)
def debug_matching(gen, primer_pair, mf, mr, output_file, hanging_primers=False):
    """Compute and display a single genome/primer-pair alignment.

    Used for debugging purposes. The match is computed with
    ``compute_gen_matching``, printed in Clustal format, and also written to
    ``output_file + ".txt"``.

    Args:
        gen: Mapping holding exactly one genome sequence record.
        primer_pair: Mapping holding exactly one primer pair.
        mf: Passed through to ``compute_gen_matching`` unchanged.
        mr: Passed through to ``compute_gen_matching`` unchanged.
        output_file: Passed to ``compute_gen_matching`` and used as the prefix
            of the ``.txt`` file that receives the alignment.
        hanging_primers: Passed through to ``compute_gen_matching``.

    Returns:
        None. Problems (wrong input size, no match, unwritable file) are
        logged rather than raised.
    """
    # Explicit checks instead of ``assert``: assertions are stripped under
    # ``python -O``, which would silently disable this validation.
    try:
        if len(gen) != 1:
            raise ValueError("Multiple gen sequences detected")
        if len(primer_pair) != 1:
            raise ValueError("Multiple primer pairs detected")
    except (TypeError, ValueError) as e:
        logging.error(e)
        return

    # Hand the matcher a copy; if the input cannot be deep-copied, fall back
    # to passing the original object.
    try:
        g = copy.deepcopy(gen)
    except Exception:
        g = gen

    template, discarded, raw_stats, cooked_stats = compute_gen_matching(
        mf, mr, primer_pair, g, output_file, hanging_primers=hanging_primers)

    if template.empty:
        logging.warning("No result")
        return

    match_result = template.loc[0]
    pp = primer_pair[next(iter(primer_pair))]
    gen = gen[next(iter(gen))]
    amplicon_len = match_result.at['ampliconLen']

    # F_pos is shifted down by one; a negative start means the forward primer
    # begins before the genome, so the genome is left-padded with gaps.
    fpos = match_result.at['F_pos'] - 1
    if fpos < 0:
        gen = '-' * (-fpos) + gen
        fpos = 0

    # Gap padding needed after the reverse primer to reach the genome length.
    rem_len = len(gen) - (fpos + pp.flen + amplicon_len + pp.rlen)

    # Build the primer sequences locally so the caller's primer pair is not
    # modified by a debugging helper.
    forward = Seq(''.join(pp.f.seq))
    reverse = Seq(''.join(pp.r.seq))
    pp_aligned = SeqRecord(
        '-' * fpos + forward + '-' * amplicon_len + reverse + '-' * rem_len)
    pp_aligned.id = pp.id

    align = MultipleSeqAlignment([gen, pp_aligned])
    clustal_text = align.format("clustal")  # format once, use twice
    print(clustal_text)

    try:
        with open(output_file + ".txt", 'w') as outfile:
            outfile.write(clustal_text)
        print("Debug saved")
    except Exception as e:
        # Saving is best effort; the alignment was already printed above.
        logging.error(e)
    return
def _to_string_pretty(self, begin=None, end=None, **kwargs):
    # type: (Union[int, None], Union[int, None], Dict[str, Any]) -> str
    """Render the alignment in the "pretty" layout.

    The output is Clustal-formatted text with two changes: every underscore
    is replaced by a space, and the first line is replaced by a summary line
    of the form ``"<tag>: LORF=<T|F>\\tTargetLORF=<count>"``.

    Side effects visible in the code: the ``q3prime`` marker symbol is changed
    to ``*``, and the ``id`` of every alignment sequence is overwritten with
    its pretty-formatted header.

    Args:
        begin: First column to include, or None.
        end: Column to stop before, or None.
        **kwargs: Only ``tag`` is read; it prefixes the summary line.

    Returns:
        The formatted alignment string.
    """
    tag = get_value(kwargs, "tag", "", default_if_none=True)

    self.change_marker("q3prime", new_symbol="*")

    # Marker rows come first; their ids are prefixed with '#'.
    seq_records = [
        SeqRecord(Seq(m.to_string(begin, end)), id="#{}".format(m.name))
        for m in self.list_msa_markers
    ]

    # A window is requested as soon as either bound is given; the missing
    # bound then defaults to the corresponding end of the alignment.
    sliced = begin is not None or end is not None
    if sliced:
        begin = begin if begin is not None else 0
        end = end if end is not None else self.alignment_length()

    headers_old = [a.id for a in self.list_alignment_sequences]
    headers_new = MSAType._format_headers_pretty(headers_old)

    # Add the actual sequences under their pretty headers.
    for i, a in enumerate(self.list_alignment_sequences):
        a.id = headers_new[i]
        seq_records.append(a[begin:end] if sliced else a)

    alignment = MultipleSeqAlignment(seq_records)

    # Underscores become spaces throughout the text. The first line is
    # replaced by the summary line further down.
    output_string = alignment.format("clustal").replace("_", " ")
    output_string_array = output_string.split("\n")

    def get_summary_statistics_line_for_alignment():
        # type: () -> str
        ref_position = self.get_mark_position("ref")
        # "LORF" here means: everything before the position consists of at
        # most one distinct symbol.
        is_lorf = len(set(self[0][0:ref_position])) <= 1
        aln_len = self.alignment_length()

        def count_lorf_targets_near_position(position):
            # type: (int) -> int
            count = 0
            for idx in range(1, self.number_of_sequences()):
                row = self[idx]
                j = 0
                # Scan outwards from ``position`` (left first, then right) for
                # the nearest uppercase character. The loop stops once both
                # directions have left the alignment; the previous
                # ``while True`` never ended for a row with no uppercase
                # character.
                while position - j >= 0 or position + j < aln_len:
                    left = position - j
                    right = position + j
                    if left >= 0 and row[left].isupper():
                        if len(set(row[0:left])) <= 1:
                            count += 1
                        break
                    if right < aln_len and row[right].isupper():
                        if len(set(row[0:right])) <= 1:
                            count += 1
                        break
                    j += 1
            return count

        num_targets_that_are_lorf = count_lorf_targets_near_position(
            ref_position)

        return "{}: LORF={}\tTargetLORF={}".format(
            tag, str(is_lorf)[0], num_targets_that_are_lorf)

    output_string_array[0] = get_summary_statistics_line_for_alignment()
    return "\n".join(output_string_array)