__author__ = "pmoreno"
__date__ = "$May 29, 2011 5:16:28 PM$"

if __name__ == "__main__":
    # Command line arguments, in order:
    #   1. FASTA alignment of the clade without the general signal
    #   2. FASTA alignment of the clade (with signal)
    #   3. entry to test; used below as ``entryToTest - 1``, i.e. 1-based
    #   4. folder that receives the result files
    # NOTE(review): ``sys``, ``Seq`` and ``SeqRecord`` are not imported in the
    # visible part of this script -- confirm they are imported elsewhere.
    #dirOfHMMModels = sys.argv[1]
    fastaFileCladeNoGeneralSignal = sys.argv[1]
    fastaFileClade = sys.argv[2]
    entryToTest = int(sys.argv[3])
    resultFolder = sys.argv[4]

    from Bio import AlignIO, SeqIO
    from Bio.Alphabet import IUPAC, Gapped
    from Bio.Align import MultipleSeqAlignment

    alignmentNoGenSignalIterator = AlignIO.parse(
        fastaFileCladeNoGeneralSignal, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
    alignmentIterator = AlignIO.parse(
        fastaFileClade, "fasta",
        alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))

    # Use the builtin next(): the ``.next()`` method only exists on
    # Python 2 iterators, while next(it) works on 2.6+ and 3.
    noGenSignalAlignment = next(alignmentNoGenSignalIterator)

    queryFasta = resultFolder + "/" + "Query_%d.faa" % (entryToTest,)
    ownCladeProfile = resultFolder + "/" + "ForOwnCladeProfile_%d.faa" % (entryToTest,)

    alignmentWithSignal = next(alignmentIterator)

    # Pick the requested row once and strip its gap characters so the
    # sequence can be used as an unaligned query.
    selectedRecord = alignmentWithSignal[entryToTest - 1]
    desiredSeqString = str(selectedRecord.seq).replace("-", "")
    seqNoGaps = Seq(desiredSeqString, alphabet=IUPAC.ExtendedIUPACProtein())
    seqRecNoGaps = SeqRecord(seq=seqNoGaps, id=selectedRecord.id)
def quantitative_analyzes(self, region):
    """Score every tree path of every usable patient for one region.

    For each patient except 'p3' and 'p10' a tree is built from the
    reference plus the patient's data for ``region``, all paths through
    the resulting graph are enumerated and each path is passed to the
    classifier.

    The result is stored in ``self.patients_evolution`` as
    ``{patient: {'X': sorted unique days, 'Y': [one array per path]}}``.
    When ``region`` is listed in ``self.broken_regions`` only a message is
    printed and nothing is stored.
    """
    if region in self.broken_regions:
        print('There is no data for this region. Please choose other one or consider haplotype calling for this region')
        return

    self.patients_evolution = {}
    aa_k_mer_list = data_prep_k_mer.making_aa_k_mers(2)
    ref = patients_data.Reference('data/hivevo')

    # Patients 3 and 10 are excluded from the analysis; background in
    # https://elifesciences.org/articles/11282
    excluded_patients = ('p3', 'p10')

    for patient in self.patients_list:
        if patient in excluded_patients:
            continue

        # Patient data for the region, with the reference prepended and
        # everything ordered by day, is the input of the tree.
        patient_rows = patients_data.Patient(patient).regions[region]
        reference_rows = ref.get_patient(patient, region=region)
        pat_data = pd.concat(
            [reference_rows, patient_rows], ignore_index=True
        ).sort_values(by=['days'])

        tree = tree_building.Tree(pat_data)
        tree.build()

        # tree.mapping keys are unpacked as (day, sequence) pairs below;
        # the values are used as node ids. Translate every sequence once.
        prot_dict = {
            node_id: Seq(key[1], Gapped(IUPAC.unambiguous_dna)).ungap().translate()
            for key, node_id in tree.mapping.items()
        }

        g = graph.Graph()
        g.add_vertices(list(range(len(tree.mapping))))
        g.set_edge_weights(tree.graph)
        phylo_paths = g.all_paths()

        days = sorted({day for day, _ in tree.mapping})

        evolution = {'X': days, 'Y': []}
        self.patients_evolution[patient] = evolution

        # Column 1 of predict_proba is kept for every path.
        for path in phylo_paths:
            met = self.clf_metric_2_mer_path(path, prot_dict, aa_k_mer_list)
            evolution['Y'].append(self.classificator.predict_proba(met)[:, 1])
def build_hsp():
    """Build the two-row (query, match) alignment for the current hit.

    Works on names from the enclosing scope (query/match sequences, tags,
    ids and descriptions, header/alignment/global tags, alphabet, handle).
    Raises ValueError when no tag data is available or when the extracted
    query and match regions differ in length.
    """
    if not query_tags and not match_tags:
        raise ValueError("No data for query %r, match %r" % (query_id, match_id))
    assert query_tags, query_tags
    assert match_tags, match_tags
    evalue = align_tags.get("fa_expect")  # looked up but not used further here
    tool = global_tags.get("tool", "").upper()

    q = _extract_alignment_region(query_seq, query_tags)
    if tool in ["TFASTX"] and len(match_seq) == len(q):
        # Quick hack until I can work out how -, * and / characters
        # and the apparent mix of aa and bp coordinates works.
        m = match_seq
    else:
        m = _extract_alignment_region(match_seq, match_tags)

    if len(q) != len(m):
        raise ValueError(f"""\
Darn... amino acids vs nucleotide coordinates?
tool: {tool}
query_seq: {query_seq}
query_tags: {query_tags}
{q} length: {len(q)}
match_seq: {match_seq}
match_tags: {match_tags}
{m} length: {len(m)}
handle.name: {handle.name}
""")

    assert alphabet is not None
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # See also Bio/AlignIO/MafIO.py for same requirement.
    # For now the annotation lives in a private property; it records the
    # query header tags first, then the alignment tags.
    alignment._annotations = {}
    alignment._annotations.update(header_tags)
    alignment._annotations.update(align_tags)

    def _append_row(letters, tags, row_id, row_name, row_descr):
        # Create one record, add it to the alignment, then refine the
        # alphabet of its sequence from the tags and its gap content.
        record = SeqRecord(
            Seq(letters, alphabet),
            id=row_id,
            name=row_name,
            description=row_descr,
            annotations={"original_length": int(tags["sq_len"])},
        )
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(tags["al_start"])
        record._al_stop = int(tags["al_stop"])
        alignment.append(record)

        # This is still a very crude way of dealing with the alphabet.
        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in tags:
            seq_type = tags["sq_type"]
            if seq_type == "D":
                record.seq.alphabet = generic_dna
            elif seq_type == "p":
                record.seq.alphabet = generic_protein
        if "-" in letters and not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    # Query
    # =====
    _append_row(q, query_tags, query_id, "query", query_descr)

    # Match
    # =====
    _append_row(m, match_tags, match_id, "match", match_descr)

    return alignment
def align_progressive_nj(self, match_score = 1, mismatch_penalty = -1, gap_penalty = -1, extension_penalty = -1, filename = "output.txt"):
    """Progressively align ``self.sequences`` by merging the closest nodes.

    Every sequence starts as its own node. The pairwise distances between
    node consensus sequences are computed (global pairwise alignment scored
    with the given parameters, then a 'blosum62' DistanceCalculator), the
    pair with the smallest distance is merged, and the procedure repeats
    until a single node is left.

    Parameters:
        match_score, mismatch_penalty, gap_penalty, extension_penalty:
            passed unchanged to ``pairwise2.align.globalms``.
        filename: passed to ``save_msa_to_file`` for the final alignment.

    Returns:
        Whatever ``save_msa_to_file`` returns for the final alignment.

    Raises:
        IndexError: when ``self.sequences`` is empty.

    A single input sequence is handled without merging; previously that
    case raised IndexError when entry [0, 1] of a 1x1 matrix was read.
    """
    nodes_list = [Node([str(seq[1])]) for seq in self.sequences]
    calculator = DistanceCalculator('blosum62')
    gapped_protein = Gapped(IUPAC.extended_protein, "-")

    def _distance_matrix(nodes):
        # Symmetric matrix of pairwise distances between node consensuses.
        matrix = np.zeros((len(nodes), len(nodes)))
        for i, j in combinations(range(len(nodes)), 2):
            alignment = pairwise2.align.globalms(
                nodes[i].consensus, nodes[j].consensus,
                match_score, mismatch_penalty, gap_penalty,
                extension_penalty, one_alignment_only=True)[0]
            aln = MultipleSeqAlignment(
                [SeqIO.SeqRecord(Seq(alignment[0], gapped_protein), id="0"),
                 SeqIO.SeqRecord(Seq(alignment[1], gapped_protein), id="1")],
                gapped_protein)
            dm = calculator.get_distance(aln)
            matrix[i][j] = matrix[j][i] = dm[0][1]
        return matrix

    def _closest_pair(matrix):
        # First (i, j), i < j, holding the smallest distance.
        # Requires at least a 2x2 matrix.
        best = (0, 1)
        best_value = matrix[0, 1]
        for pair in combinations(range(len(matrix)), 2):
            if matrix[pair[0]][pair[1]] < best_value:
                best_value = matrix[pair[0]][pair[1]]
                best = pair
        return best

    distance_matrix = _distance_matrix(nodes_list)
    if len(nodes_list) > 1:
        # Debug output of the initial state.
        argmin = _closest_pair(distance_matrix)
        print("ARGMIN, MIN", argmin, distance_matrix[argmin[0]][argmin[1]])
        print(distance_matrix)

    while len(nodes_list) > 1:
        first, second = _closest_pair(distance_matrix)
        newnode = merge_nodes(nodes_list[first], nodes_list[second])
        # first < second always holds, so slicing around both is safe.
        nodes_list = (nodes_list[0:first]
                      + nodes_list[first + 1:second]
                      + nodes_list[second + 1:])
        nodes_list.append(newnode)
        distance_matrix = _distance_matrix(nodes_list)

    print("ALIGNMENT:")
    for x in nodes_list[0].msa:
        print(str(x))
    score = save_msa_to_file(nodes_list[0].msa, filename)
    return score
#!/usr/bin/env python """Example of generating a substitution matrix from an alignment.""" # standard library from __future__ import print_function # Biopython from Bio import SubsMat from Bio import AlignIO from Bio.Alphabet import IUPAC, Gapped from Bio.Align import AlignInfo # get an alignment object from a Clustalw alignment output c_align = AlignIO.read('protein.aln', 'clustal', alphabet=Gapped(IUPAC.protein)) summary_align = AlignInfo.SummaryInfo(c_align) # get a replacement dictionary and accepted replacement matrix # exclude all amino acids that aren't charged polar replace_info = summary_align.replacement_dictionary([ "G", "A", "V", "L", "I", "M", "P", "F", "W", "S", "T", "N", "Q", "Y", "C" ]) my_arm = SubsMat.SeqMat(replace_info) print(replace_info) my_lom = SubsMat.make_log_odds_matrix(my_arm) print('log_odds_mat: %s' % my_lom)
#http://biopython.org/wiki/AlignIO
#!/usr/bin/env python
from Bio import SeqIO
import os
import sys
from collections import defaultdict
from pprint import pprint
import argparse
import multiprocessing
from Bio.Alphabet import generic_dna, Gapped
from Bio import AlignIO

# Usage: <script> <input fasta alignment> <output nexus file>
# Read a single gapped DNA alignment in FASTA format and write it as NEXUS.
# Both files are opened with context managers so the handles are closed
# (and the output flushed) even if reading or writing fails.
with open(sys.argv[1]) as fasta_handle:
    alignment = AlignIO.read(fasta_handle, 'fasta', alphabet=Gapped(generic_dna))

with open(sys.argv[2], 'w') as output:
    AlignIO.write(alignment, output, "nexus")
__author__ = 'amirbar' import os from Bio.Seq import Seq from Bio import motifs from Bio.Alphabet import Gapped, IUPAC import matplotlib.pyplot as plt import optparse import sys import os GAP = "-" ALPHABET = Gapped(IUPAC.unambiguous_dna) def process_command_line(argv): """ Return a 2-tuple: (settings object, args list). `argv` is a list of arguments, or `None` for ``sys.argv[1:]``. """ if argv is None: argv = sys.argv[1:] # initialize the parser object: parser = optparse.OptionParser( formatter=optparse.TitledHelpFormatter(width=100), add_help_option=None) parser.add_option( "-r", "--reads_fusion",
for line in f: MSAfilename = line.replace("\n", '') MSAfilenames.append( MSAfilename) # get names of MSA files without format # transform fasta to nex format for MSAfilename in MSAfilenames: file_list.append("speciesID_" + MSAfilename + ".nex") # get names of MSA files in "nex" format with open(MSAfilename + ".fa", "rU") as input_handle, open( "speciesID_" + MSAfilename + ".nex", "w") as output_handle_nex, open("speciesID_" + MSAfilename + ".fa", "w") as output_handle_fasta: alignments = AlignIO.read( input_handle, "fasta", alphabet=Gapped(IUPAC.protein)) # read fasta file to "alignments" for seq in alignments: seq.id = seq.description.split( "[")[-1][:-1] # use species name as sequence ID seq.description = "" # use species name as sequence ID AlignIO.write( alignments, output_handle_nex, "nexus") # write "alignments" with new ID to nexus format AlignIO.write(alignments, output_handle_fasta, "fasta") # to fasta format # change a one-gene MSA nex file to a nex obeject, and put them together nexi = [(fname, Nexus.Nexus(fname)) for fname in file_list] # combine one-gene MSA nex file of different genes combined = Nexus.combine(nexi)
def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
    """Alignment iterator for the FASTA tool's pairwise alignment output.

    This is for reading the pairwise alignments output by Bill Pearson's
    FASTA program when called with the -m 10 command line option for machine
    readable output.  For more details about the FASTA tools, see the website
    http://fasta.bioch.virginia.edu/ and the paper:

         W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    This class is intended to be used via the Bio.AlignIO.parse() function
    by specifying the format as "fasta-m10" as shown in the following code:

        from Bio import AlignIO
        handle = ...
        for a in AlignIO.parse(handle, "fasta-m10"):
            assert len(a) == 2, "Should be pairwise!"
            print("Alignment length %i" % a.get_alignment_length())
            for record in a:
                print("%s %s %s" % (record.seq, record.name, record.id))

    Note that this is not a full blown parser for all the information
    in the FASTA output - for example, most of the header and all of the
    footer is ignored.  Also, the alignments are not batched according to
    the input queries.

    Also note that there can be up to about 30 letters of flanking region
    included in the raw FASTA output as contextual information.  This is NOT
    part of the alignment itself, and is not included in the resulting
    MultipleSeqAlignment objects returned.
    """
    if alphabet is None:
        alphabet = single_letter_alphabet

    # Parser states.
    state_PREAMBLE = -1
    state_NONE = 0
    state_QUERY_HEADER = 1
    state_ALIGN_HEADER = 2
    state_ALIGN_QUERY = 3
    state_ALIGN_MATCH = 4
    state_ALIGN_CONS = 5

    def build_hsp():
        """Build the two-row (query, match) alignment for the current hit.

        Uses the query/match sequences, tags, ids and descriptions from the
        enclosing scope. Raises AssertionError (after printing diagnostic
        details) when the extracted regions cannot be paired up.
        """
        assert query_tags, query_tags
        assert match_tags, match_tags
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                # Quick hack until I can work out how -, * and / characters
                # and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError:
            # Single-argument print calls behave the same as the former
            # print statements on Python 2 and are valid on Python 3.
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print("%s %i" % (q, len(q)))
            print(match_seq)
            print(match_tags)
            print("%s %i" % (m, len(m)))
            print(handle.name)
            raise

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        # Query
        # =====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        # Match
        # =====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
from Bio.Align import AlignInfo from Bio.Align import MultipleSeqAlignment from Bio.Alphabet import IUPAC, Gapped from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord os.chdir(sys.argv[1]) listing = os.listdir(".") consensus = {} genConsensus = '' pssmGen = '' consensusThres = 0.7 #generalAlignment = AlignIO.parse(sys.argv[2],"fasta",alphabet=IUPAC.ExtendedIUPACProtein()) generalAlignment = AlignIO.parse(sys.argv[2], "fasta", alphabet=Gapped( IUPAC.ExtendedIUPACProtein(), "-")) lengthGenAl = 0 for genAlignment in generalAlignment: sumGen = AlignInfo.SummaryInfo(genAlignment) genConsensus = sumGen.gap_consensus(consensusThres) #pssmGen = sumGen.pos_specific_score_matrix(genConsensus,chars_to_ignore = ['-']) pssmGen = sumGen.pos_specific_score_matrix(genConsensus) lengthGenAl = len(genAlignment) for item in listing: if item.endswith(".fas"): #alignments = AlignIO.parse(item,"fasta",alphabet=IUPAC.ExtendedIUPACProtein()) alignments = AlignIO.parse(item, "fasta", alphabet=Gapped( IUPAC.ExtendedIUPACProtein(), "-"))
from Bio import SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet import IUPAC, Gapped
import time


# Prettify labels
def get_label(leaf):
    """Return the display label for a tree node.

    Nodes whose name starts with "Inner" get an empty label; for all
    others the underscores in the name are replaced by spaces.
    """
    return "" if leaf.name.startswith("Inner") else leaf.name.replace("_", " ")


# Read the sequences and align
aln = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
for record in SeqIO.parse("data/coding.fa", "fasta"):
    # for record in SeqIO.parse("data/cons_noncode.fa", "fasta"):
    # Show id, sequence and length of every record as it is added.
    print(record.id)
    print(repr(record.seq))
    print(len(record))
    aln.extend([record])

# Print the alignment
print(aln)

# Calculate the distance matrix
# NOTE(review): DistanceCalculator is not imported in the visible part of
# this script -- confirm it is imported elsewhere.
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(aln)

# Print the distance Matrix
def rename_alignment_taxa(aln, name_map):
    """Return a new alignment whose records carry the mapped names.

    For every record of ``aln`` both ``id`` and ``name`` are set to
    ``name_map[record.id]`` (looked up with the record's current id) and
    the record is appended to a fresh gapped unambiguous-DNA alignment.
    The records themselves are modified in place, so ``aln`` sees the new
    names as well. A missing id raises KeyError from the lookup.
    """
    renamed = Alignment([], alphabet=Gapped(IUPAC.unambiguous_dna, "-"))
    for record in aln:
        new_name = name_map[record.id]
        record.id = new_name
        record.name = new_name
        renamed.append(record)
    return renamed
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage,
                   stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file

    For every contig with enough reads, the clipped and padded reads are
    aligned against the contig sequence, SNP positions are located and the
    best haplotypes selected. Contigs passing the group filters are written
    to ``out_file``, ``out_bamova`` and ``fasta_output/<contig>.fasta``.

    Parameters:
        in_ace: path of the input ACE file.
        out_file: path of the tab separated haplotype report.
        out_bamova: path of the bamova formatted output.
        win_len, step, stars: accepted for interface compatibility; not
            used in this function.
        coverage: contigs with fewer than ``coverage + 1`` reads are
            skipped; also passed to ``best_snps``.
        ngroups: minimum number of groups (first three characters of the
            haplotype names) a contig must show.
        nhaplo: minimum number of haplotypes required in each group.
    """
    marker_number = 0
    cutoff = 100000  # stop after this many contigs
    contig_counter = 0
    ntreated = 0
    # The ACE handle is opened in a context manager as well so that it is
    # closed when parsing ends or fails.
    with open(in_ace, 'r') as ace_file:
        with open(out_file, "w") as output_file:
            with open(out_bamova, "w") as bamova_file:
                output_file.write("Contig_nb\tWindow\tHaplotype\n")
                for contig in Ace.parse(ace_file):
                    pass_haplo = False
                    contig_counter += 1
                    align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                    align.add_sequence(contig.name, contig.sequence)
                    if len(contig.reads) - 1 < coverage:
                        continue
                    ntreated += 1
                    for readn, read in enumerate(contig.reads):
                        # Keep only the part of the read that lies inside
                        # BOTH the quality and the alignment clipping
                        # windows. The end used to be compared with itself
                        # (clipe2 < clipe2), so the alignment clipping end
                        # was never applied.
                        clipst = max(read.qa.qual_clipping_start,
                                     read.qa.align_clipping_start)
                        clipe = min(read.qa.qual_clipping_end,
                                    read.qa.align_clipping_end)
                        start = contig.af[readn].padded_start
                        seq = cut_ends(read.rd.sequence, clipst, clipe)
                        seq = pad_read(seq, start, len(contig.sequence))
                        if "pseudo" not in read.rd.name:
                            align.add_sequence(read.rd.name, seq)
                    sequences = read_fasta(align.format("fasta"))
                    sequences = [[s[0].replace(">", ""), s[1]]
                                 for s in sequences]
                    concensus = sequences[0][1]
                    # Remove the columns marked "*" in the consensus, from
                    # the right so earlier positions stay valid.
                    error_positions = multi_find("*", concensus)[::-1]
                    for p in error_positions:
                        sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                                     for s in sequences]
                    concensus = sequences[0][1]
                    sequences = [[s[0], correct_sequence(concensus, s[1])]
                                 for s in sequences[1:]]
                    sequences, snp_pos = snp_positions(sequences)
                    haplotypes = best_snps(sequences, snp_pos, coverage)
                    if haplotypes != "Empty":
                        variants = sorted(set(h[-1] for h in haplotypes[-1]))
                        groups = sorted(set(h[0][:3] for h in haplotypes[-1]))
                        if len(groups) >= ngroups:
                            pass_haplo = True
                            for g in groups:
                                in_group = sum(1 for h in haplotypes[-1]
                                               if h[0].startswith(g))
                                if in_group < nhaplo:
                                    pass_haplo = False
                        if pass_haplo:
                            print(contig.name)
                            bamova_file.write("Marker" + str(marker_number) + "\n")
                            group_number = 0
                            for g in groups:
                                bamova_file.write("Population\t" + str(group_number))
                                group_number += 1
                                for v in variants:
                                    count = sum(
                                        1 for h in haplotypes[-1]
                                        if h[-1] == v and h[0].startswith(g))
                                    bamova_file.write("\t" + str(count))
                                bamova_file.write("\n")
                            with open("fasta_output/" + contig.name + ".fasta", "w") as f:
                                output_file.write(contig.name + "\n")
                                for h in haplotypes[-1]:
                                    f.write(">" + h[0] + str(marker_number) + "\n" + h[2] + "\n")
                                    # Make positions relative to the first one (1-based).
                                    h[1] = [x - h[1][0] + 1 for x in h[1]]
                                    output_file.write(
                                        "Marker" + str(marker_number) + "\t" +
                                        "\t".join([str(x) for x in h]) + "\t" +
                                        ":".join(variants) + "\n")
                            marker_number += 1
                    output_file.flush()
                    bamova_file.flush()
                    if contig_counter > cutoff:
                        break
    print("\n%s contigs out of %s were treated" % (ntreated, contig_counter))
def main():
    """Build a codon alignment from unaligned DNA sequences.

    The DNA sequences (file named by the first command line argument, or
    standard input) are translated, the proteins are aligned with muscle,
    and the protein alignment is converted back to a DNA alignment that is
    written in phylip format to ``<input name>_aln.phy`` (``out_aln.phy``
    when reading standard input).
    """
    # Configuration
    # Select the desired NCBI translation table
    translationTable = 11

    # Open the DNA sequence file and read the fasta sequences into a dictionary
    if (len(argv) > 1):
        dnaFileName = argv[1]
    else:
        dnaFileName = None
    dnaSeqFile = fileinput.input(dnaFileName)
    dnaSeqDict = SeqIO.to_dict(SeqIO.parse(dnaSeqFile, "fasta"))

    # Translate the sequences, replacing stop codons with X (unknown aa)
    # so muscle doesn't drop them
    aaSeqRecords = []
    for key in dnaSeqDict:
        translated = dnaSeqDict[key].seq.translate(table=translationTable)
        aaSeq = SeqRecord(translated, id=key)
        aaSeq.seq = Seq(str(translated).replace('*', 'X'))
        aaSeqRecords.append(aaSeq)
    dnaSeqFile.close()

    # Align the aa sequences
    commandLine = str(MuscleCommandline(seqtype='protein'))
    childProcess = subprocess.Popen(commandLine,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    shell=(sys.platform!="win32")) #don't pipe stderr or muscle hangs
    SeqIO.write(aaSeqRecords, childProcess.stdin, "fasta")
    childProcess.stdin.close()
    aaAlignment = AlignIO.read(childProcess.stdout, "fasta")
    # The whole output has been consumed; release the pipe and reap the
    # child so it is not left behind as a zombie process.
    childProcess.stdout.close()
    childProcess.wait()

    # Convert the aa alignment into a dna alignment
    dnaAlignment = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
    for taxon in aaAlignment:
        sourceDna = dnaSeqDict[taxon.id].seq
        aaCount = 0
        codons = []  # joined once at the end instead of repeated concatenation
        for aaResidue in taxon.seq:
            if (aaResidue == '-'):
                codons.append('---')
            else:
                codons.append(str(sourceDna[aaCount*3:aaCount*3+3]))
                aaCount+=1
        # As we add the sequences to the alignment remove gene name from the sequence id so they taxon match the PAML constraint tree
        dnaAlignment.add_sequence(taxon.id.split('_')[0], ''.join(codons))

    # NOTE(review): split('.')[0] cuts at the FIRST dot of the whole path,
    # so a name such as "./genes.fasta" yields "_aln.phy" -- confirm whether
    # that is intended before changing it.
    if (dnaFileName):
        outFileName = dnaFileName.split('.')[0] + '_aln.phy'
    else:
        outFileName = 'out_aln.phy'

    # The context manager closes the file even if writing fails.
    with open(outFileName, 'w+') as outFile:
        AlignIO.write([dnaAlignment], outFile, "phylip")

        #I think this section should be removed. If I put the 'I' into the alignment file now, I can't open the alignment with BioPython-based scripts (for manual editing etc). I can use pamlize.py to add the I right before using paml.
        # Biopython doesn't tag Interleaved phylip files and PAML requires it so...
        # outFile.seek(0,0)
        # modifiedAlignmentText = outFile.readlines()
        # modifiedAlignmentText[0] = modifiedAlignmentText[0].rstrip() + ' I\n'
        # outFile.seek(0,0)
        # outFile.writelines(modifiedAlignmentText)
#join all snps into one dictionary
final_snp_alignment = {}
if snp_mode == 'one':
    for key, value in final_dict.items():
        final_snp_alignment[key] = "".join(value)
elif snp_mode == 'all':
    for key, values in final_dict.items():
        # values is a list of lists; flatten it without building the
        # intermediate lists that sum(values, []) creates
        final_snp_alignment[key] = "".join(
            snp for group in values for snp in group)

# Create the output file in output directory
output_file_fasta = os.path.join(out_dir,'snp.fasta')

#print the snp dictionary into a fasta-file
# NOTE(review): text is written to files opened in "wb" mode, which only
# works on Python 2 -- confirm the target interpreter before changing it.
with open(output_file_fasta, "wb") as f:
    for k, v in final_snp_alignment.items():
        f.write(">" + k+ "\n")
        f.write(v+ "\n")

# Create output file for SNAPP
output_file_nexus = os.path.join(out_dir,'snp.nexus')
# Read the alignment back through a context manager so the handle is closed.
with open(output_file_fasta) as fasta_handle:
    aln = AlignIO.read(fasta_handle, "fasta", alphabet=Gapped(IUPAC.ambiguous_dna))
with open(output_file_nexus, "wb") as n:
    n.write(aln.format("nexus"))

# Rewrite the nexus format line in place: binary data for unphased input,
# integer data with symbols 0/1/2 for phased input.
if not args.phased:
    nexus_format_line = "format datatype=binary symbols=01 missing=?;"
else:
    nexus_format_line = "format datatype=integerdata symbols=\"012\" missing=?;"
for line in fileinput.input(output_file_nexus, inplace = 1):
    # with inplace=1 everything printed goes back into the file
    print(line.replace("format datatype=dna missing=? gap=-;", nexus_format_line).rstrip())
def getKmers(k, interval, outdir, msaFile, tp_prot_file, modelName, start, end, gene_pos_file, gene_pos_file_aa):
    """Cut an alignment into sliding windows and build one HMM per window.

    Parameters:
        k: window width in alignment columns.
        interval: step between consecutive windows.
        outdir: directory receiving the per-window ``.fas`` files.
        msaFile: FASTA multiple sequence alignment to read.
        tp_prot_file: FASTA file of protein sequences; alignment rows whose
            id occurs here are left out of the window alignments and their
            positions are reported instead.
        modelName: name used in file names, in the position tables and for
            ``runHMMBuild``.
        start, end: optional column range; applied only when both are given.
        gene_pos_file, gene_pos_file_aa: output tables with the positions
            of the left-out rows (the former holds the amino acid
            coordinates multiplied by three).

    Returns:
        dict mapping "<startPos>_<endPos>" to the ``HMMFile`` of that window.
    """
    pprot_TP_dict = {}
    for record in SeqIO.parse(tp_prot_file, "fasta"):
        pprot_TP_dict[record.id] = str(record.seq)

    alignment = AlignIO.read(msaFile, "fasta")
    # Identity tests combined with ``and`` instead of ``!= None`` joined by
    # the bitwise ``&`` operator.
    if start is not None and end is not None:
        # NOTE(review): the slice excludes column ``end`` itself (1-based);
        # confirm that an exclusive end is intended.
        alignment = alignment[:, start - 1:end - 1]
    print("Number of domains: %i" % len(alignment))
    print("Alignment length: %i" % alignment.get_alignment_length())

    hmmDict = {}
    aln_length = alignment.get_alignment_length()
    counter = int(((aln_length - k) / interval) + 1)

    # j: index of the first column, from the left, without any gap.
    j = 0
    for i in range(aln_length):
        if '-' in alignment[:, i]:
            j = j + 1
        else:
            break

    # seqCtr: aln_length minus the number of trailing columns with a gap.
    seqCtr = aln_length
    for i in range(aln_length - 1, -1, -1):
        if '-' in alignment[:, i]:
            seqCtr = seqCtr - 1
        else:
            break

    header = "gene_name\tstart\tend\tinterval\tprot_type\n"
    # Context managers close both tables even when a window fails.
    with open(gene_pos_file_aa, 'w') as gene_pos_out_aa, \
            open(gene_pos_file, 'w') as gene_pos_out:
        gene_pos_out_aa.write(header)
        gene_pos_out.write(header)
        for i in range(counter):
            startPos = j
            endPos = j + k
            if endPos > seqCtr:
                break
            interval_label = str(i * interval) + "_" + str(i * interval + k)
            kmer = alignment[:, startPos: endPos]  # [ rows (different domains),columns (Amino Acids)]
            spHMMAlign = MultipleSeqAlignment([], Gapped(IUPAC.extended_protein, "-"))
            # Windows whose first row has more than 15 gaps are skipped.
            if str(kmer[0].seq).count("-") <= 15:
                # Remove the TP genes
                for align in kmer:
                    if align.id not in pprot_TP_dict:
                        spHMMAlign.append(align)
                    else:
                        # Locate the ungapped window inside the full protein.
                        prot_seq = str(align.seq).replace("-", "")
                        start_tp_coord = pprot_TP_dict[align.id].find(prot_seq)
                        end_tp_coord = start_tp_coord + len(prot_seq)
                        gene_pos_out_aa.write(
                            align.id + "\t" + str(start_tp_coord) + "\t"
                            + str(end_tp_coord) + "\t" + interval_label
                            + "\t" + modelName + "\n")
                        gene_pos_out.write(
                            align.id + "\t" + str(start_tp_coord * 3) + "\t"
                            + str(end_tp_coord * 3) + "\t" + interval_label
                            + "\t" + modelName + "\n")
                outputFile = (outdir + os.sep + modelName + "__" + str(k)
                              + "_" + str(interval) + "__" + interval_label
                              + ".fas")
                AlignIO.write(spHMMAlign, outputFile, "fasta")
                hmmFile = runHMMBuild(outputFile, modelName)
                hmmSegment = str(startPos) + "_" + str(endPos)
                hmmDict[hmmSegment] = HMMFile(i * interval, i * interval + k, hmmFile)
            j = j + interval
    return hmmDict
import os import Bio from Bio.Alphabet import generic_dna from Bio import motifs from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Align import MultipleSeqAlignment from Bio import AlignIO from Bio.Align import AlignInfo from Bio.Alphabet import IUPAC, Gapped from Bio.Align.Applications import ClustalwCommandline from Bio.Align.Applications import ClustalwCommandline from Bio.SubsMat import FreqTable alph = Gapped(IUPAC.ambiguous_dna) def printAlignmentInfo(alignment, alphabet): seqlist = [] for record in alignment: seqlist.append(record.seq) m = motifs.create(seqlist, alphabet) pwm = m.counts.normalize() consensus = pwm.consensus summary_align = AlignInfo.SummaryInfo(alignment) consensus2 = summary_align.dumb_consensus() my_pssm = summary_align.pos_specific_score_matrix(consensus,
def convert_file(in_file, out_file):
    """Convert a FASTA protein alignment file into NEXUS format.

    Parameters:
        in_file: path of a FASTA file holding exactly one alignment; it is
            read with a gapped IUPAC protein alphabet.
        out_file: path of the NEXUS file to write (overwritten if present).

    Both files are handled through context managers so the handles are
    closed, and the output flushed, even when reading or formatting fails.
    """
    with open(in_file) as in_handle:
        alignment = AlignIO.read(in_handle, "fasta", alphabet=Gapped(IUPAC.protein))
    with open(out_file, "w") as out_handle:
        out_handle.write(alignment.format("nexus"))
def next(self):
    """Reads from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library
    sequences as an MultipleSeqAlignment object containing two rows.

    The first row is the query (record name "query"), the second the match
    (record name "match").  The query header tags and the alignment tags are
    merged into the private ``_annotations`` dict of the alignment, and each
    record gets private ``_al_start`` / ``_al_stop`` attributes taken from
    its tag section.

    The line that terminates one alignment is stashed in ``self._header`` so
    that the following call can resume from it.

    Raises StopIteration at end of file or when the ">>><<<" marker is
    reached, and ValueError when an expected header line is malformed, when
    the two extracted sequences differ in length, or when their length
    disagrees with the reported ``sw_overlap`` tag.
    """
    handle = self.handle

    try:
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        #No saved header, so read a fresh line from the handle.
        line = handle.readline()
    if not line:
        raise StopIteration

    if line.startswith("#"):
        #Skip the file header before the alignments. e.g.
        line = self._skip_file_header(line)
    while ">>>" in line and not line.startswith(">>>"):
        #Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        #Read in the query header
        line = self._parse_query_header(line)
        #Now should be some alignments, but if not we move onto the next query
    if not line:
        #End of file
        raise StopIteration
    if ">>><<<" in line:
        #Reached the end of the alignments, no need to read the footer...
        raise StopIteration

    #Should start >>... and not >>>...
    assert line[0:2] == ">>" and not line[2] == ">", line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    #This should be followed by the target match ID line, then more tags.
    #e.g.
    """
    >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
    ; fa_frame: f
    ; fa_initn: 52
    ; fa_init1: 52
    ; fa_opt: 70
    ; fa_z-score: 105.5
    ; fa_bits: 27.5
    ; fa_expect: 0.082
    ; sw_score: 70
    ; sw_ident: 0.279
    ; sw_sim: 0.651
    ; sw_overlap: 43
    """
    if (not line[0:2] == ">>") or line[0:3] == ">>>":
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    #Handle the following "alignment hit" tagged data, e.g.
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line[0:2] == "; "

    #Then we have the alignment numbers and sequence for the query
    """
    >gi|10955265| ..
    ; sq_len: 346
    ; sq_offset: 1
    ; sq_type: p
    ; al_start: 197
    ; al_stop: 238
    ; al_display_start: 167
    DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    GEYFTENKPKYIIREIHQET
    """
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    #The identifier on this line must be a prefix of the query description.
    assert self._query_descr.startswith(line[1:].split(None, 1)[0])

    #Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line[0:2] == "; "

    #Now should have the aligned query sequence (with leading flanking region)
    while not line[0] == ">":
        query_seq_parts.append(line.strip())
        line = handle.readline()

    #Handle the following "match alignment" data
    """
    >gi|152973545|ref|YP_001338596.1| ..
    ; sq_len: 242
    ; sq_type: p
    ; al_start: 52
    ; al_stop: 94
    ; al_display_start: 22
    IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    QDFAFTRKMRREARQVEQSW
    """
    #Match identifier
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError(
            "Expected line starting '>' and ending '..', got '%s'" % repr(line))
    #The identifier on this line must be a prefix of the match description.
    assert match_descr.startswith(line[1:].split(None, 1)[0])

    #Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line[0:2] == "; "

    #Now should have the aligned query sequence with flanking region...
    #but before that, since FASTA 35.4.1 there can be an consensus here,
    """
    ; al_cons:
    .::. : :. ---. :: :. . : ..-:::-: :.: ..:...:
    etc
    """
    while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
        match_seq_parts.append(line.strip())
        line = handle.readline()
    if line[0:2] == "; ":
        assert line.strip() == "; al_cons:"
        align_consensus_parts = []
        line = handle.readline()
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            align_consensus_parts.append(line.strip())
            line = handle.readline()
        #If we do anything with this in future, must remove any flanking region.
        #(The joined consensus is currently not used further in this method.)
        align_consensus = "".join(align_consensus_parts)
        del align_consensus_parts
        assert not line[0:2] == "; "
    else:
        align_consensus = None
    assert (line[0] == ">" or ">>>" in line)
    #Remember the terminating line; the next call starts from it.
    self._header = line

    #We built a list of strings and then joined them because
    #its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    #Note, query_seq and match_seq will usually be of different lengths, apparently
    #because in the m10 format leading gaps are added but not trailing gaps!

    #Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(
        query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(
        match_seq, match_annotation)
    #How can we do this for the (optional) consensus?

    #The "sq_offset" values can be specified with the -X command line option.
    #They appear to just shift the origin used in the calculation of the coordinates.

    if len(query_align_seq) != len(match_align_seq):
        raise ValueError(
            "Problem parsing the alignment sequence coordinates, "
            "following should be the same length but are not:\n"
            "%s - len %i\n%s - len %i" %
            (query_align_seq, len(query_align_seq), match_align_seq,
             len(match_align_seq)))
    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                             % (alignment_annotation["sw_overlap"],
                                len(query_align_seq)))

    #TODO - Look at the "sq_type" to assign a sensible alphabet?
    alphabet = self.alphabet
    alignment = MultipleSeqAlignment([], alphabet)

    #TODO - Introduce an annotated alignment class?
    #For now, store the annotation a new private property:
    alignment._annotations = {}

    #Want to record both the query header tags, and the alignment tags.
    #(Alignment tags are copied second, so they win on duplicate keys.)
    for key, value in self._query_header_annotation.iteritems():
        alignment._annotations[key] = value
    for key, value in alignment_annotation.iteritems():
        alignment._annotations[key] = value

    #Query
    #=====
    record = SeqRecord(
        Seq(query_align_seq, alphabet),
        id=self._query_descr.split(None, 1)[0].strip(","),
        name="query",
        description=self._query_descr,
        annotations={"original_length": int(query_annotation["sq_len"])})
    #TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(query_annotation["al_start"])
    record._al_stop = int(query_annotation["al_stop"])
    alignment.append(record)

    #TODO - What if a specific alphabet has been requested?
    #TODO - Use an IUPAC alphabet?
    #TODO - Can FASTA output RNA?
    #Only refine the alphabet when the caller left it at the generic default.
    if alphabet == single_letter_alphabet and "sq_type" in query_annotation:
        if query_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif query_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in query_align_seq:
        #Wrap in Gapped unless the alphabet already declares a gap character.
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    #Match
    #=====
    record = SeqRecord(
        Seq(match_align_seq, alphabet),
        id=match_descr.split(None, 1)[0].strip(","),
        name="match",
        description=match_descr,
        annotations={"original_length": int(match_annotation["sq_len"])})
    #TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(match_annotation["al_start"])
    record._al_stop = int(match_annotation["al_stop"])
    alignment.append(record)

    #This is still a very crude way of dealing with the alphabet:
    if alphabet == single_letter_alphabet and "sq_type" in match_annotation:
        if match_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif match_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in match_align_seq:
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    return alignment
True >>> _match_ambiguous_dna('A', 'T') False >>> _match_ambiguous_dna('A', 'A') True """ x = x.upper() y = y.upper() xset = set(ambiguous_dna_values.get(x, x)) yset = set(ambiguous_dna_values.get(y, y)) if not xset.intersection(yset): return False return True DNA_ALPHABET = alphabet = Gapped(ambiguous_dna, '-') DNA_ALPHABET.match = lambda x, y: _match_ambiguous_dna(x, y) FLAGS = MavisNamespace(LQ='LOWQUAL') READ_PAIR_TYPE = MavisNamespace(RR='RR', LL='LL', RL='RL', LR='LR') CALL_METHOD = MavisNamespace(CONTIG='contig', SPLIT='split reads', FLANK='flanking reads', SPAN='spanning reads', INPUT='input') """:class:`MavisNamespace`: holds controlled vocabulary for allowed call methods - ``CONTIG``: a contig was assembled and aligned across the breakpoints - ``SPLIT``: the event was called by :term:`split read`
def compute_consensus(self):
    """Compute a gap-aware consensus of ``self.msa`` into ``self.consensus``.

    Every entry of ``self.msa`` is converted with ``str()`` and added to a
    temporary alignment under its positional index as identifier.  The
    consensus is obtained with ``gap_consensus(threshold=0, ambiguous="-")``.
    """
    # Pass an explicit empty record list: the first positional argument of
    # MultipleSeqAlignment is the records, the alphabet comes second.
    align = MultipleSeqAlignment([], Gapped(IUPAC.extended_protein, "-"))
    for i, seq in enumerate(self.msa):
        align.add_sequence(str(i), str(seq))
    summary_align = AlignInfo.SummaryInfo(align)
    self.consensus = summary_align.gap_consensus(threshold=0, ambiguous="-")
print '\t\t\t<alignment idref="alignment"/>' print '\t\t\t<counts>' print '\t\t\t\t<parameter value="', constants['A'], constants[ 'C'], constants['G'], constants['T'], '"/>' print '\t\t\t</counts>' print '\t\t</constantPatterns>' print '\t</mergePatterns>' print '\nOr use replace_BEAST_blocks.py and provide the file', options.outfile + ".patterns", "with the -p flag" output = open(options.outfile + ".patterns", "w") print >> output, ' '.join( map(str, [constants['A'], constants['C'], constants['G'], constants['T']])) output.close() alignment = Generic.Alignment(Gapped(IUPAC.unambiguous_dna, "-")) for name in snpsequence: # if len(''.join(snpsequence[name]).replace("-","").replace("N",""))>float(len(snpsequence[name]))*(float(options.exclude)/100): # alignment.add_sequence(name, ''.join(snpsequence[name])) # else: # print name, "excluded from snp alignment as it is < "+str(options.exclude)+"% mapped" if name in dates: alignment.add_sequence(name + "_" + str(dates[name]), ''.join(snpsequence[name])) else: alignment.add_sequence(name, ''.join(snpsequence[name])) AlignIO.write([alignment], open(options.outfile, 'w'), "fasta")
def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    The parsing itself is delegated to the Bio.Sequencing.Ace module.
    Because the file is consumed in a single pass, any WA, CT, RT or WR
    footer tags are ignored.

    ACE files carry a base quality for each position, which is treated as a
    PHRED style score.  As with FASTQ or QUAL files read through Bio.SeqIO,
    the scores are stored in the SeqRecord's letter_annotations dictionary
    under the "phred_quality" key.

    >>> from Bio import SeqIO
    >>> with open("Ace/consed_sample.ace", "rU") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 agccccgggc... 1475
    90

    ACE files do not include a base quality for gaps in the consensus
    sequence; such positions are given a quality of zero.  Zero is perhaps
    misleading, as there may be very strong evidence supporting the gap.
    Previous versions of Biopython therefore used None instead, but this
    complicated usage, and prevented output of the gapped sequence as FASTQ
    format.

    >>> from Bio import SeqIO
    >>> with open("Ace/contig1.ace", "rU") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
    ...         print(record.letter_annotations["phred_quality"][85:95])
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for contig in Ace.parse(handle):
        raw_consensus = contig.sequence

        # Pick the alphabet: DNA by default, RNA when only U is present,
        # and a generic nucleotide alphabet for the odd case of both U and T.
        if "U" not in raw_consensus:
            alpha = generic_dna
        elif "T" in raw_consensus:
            alpha = generic_nucleotide
        else:
            alpha = generic_rna

        if "*" in raw_consensus:
            # ACE marks gaps with "*"; convert them to "-" to agree with
            # most other file formats.  Both symbols must not be mixed.
            assert "-" not in raw_consensus
            consensus_seq = Seq(raw_consensus.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(raw_consensus, alpha)

        # TODO? - Base segments (BS lines), i.e. which read phrap picked as
        # the consensus at each position.  Perhaps as SeqFeature objects?
        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines).
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=contig.name,
                               name=contig.name)

        # Consensus base qualities (BQ lines).  Gap positions have no entry
        # in the ACE quality list, so they are filled with zero while the
        # remaining positions consume the list in order.
        quals = []
        consumed = 0
        for base in consensus_seq:
            if base == "-":
                quals.append(0)
                continue
            quals.append(contig.quality[consumed])
            consumed += 1
        # Every quality value must have been matched to a non-gap base.
        assert consumed == len(contig.quality)
        seq_record.letter_annotations["phred_quality"] = quals

        yield seq_record
def stage_two_trimming(self, s1_trimmed, window_size, max_divergence, min_len):
    """
    Alignment row-by-row trimming.  After stage one trimming, iterate
    over rows of alignment to find differences between the alignment
    consensus and the row (taxon) of data.  Trim those ends that differ
    from the consensus with > `max_divergence` across a `window_size`
    window.  Goes to third round of filtering to remove edges that end up
    with only '----' characters to start or end alignment block.

    Returns the trimmed MultipleSeqAlignment, or None as soon as one row
    ends up consisting solely of '-' or solely of '?' characters, or is
    shorter than `min_len`.
    """
    # create new alignment object to hold trimmed alignment
    s2_trimmed = MultipleSeqAlignment([], Gapped(IUPAC.ambiguous_dna, "-?"))
    # get consensus of alignment in array form
    consensus_array = numpy.array(list(self._alignment_consensus(s1_trimmed)))
    # iterate over each alignment sequence
    for sequence in s1_trimmed:
        # ensure sequence is uppercase - consensus will be, too
        sequence = sequence.upper()
        # get the true ends of the sequence by walking in until we hit
        # some bases
        start, end = self._get_ends(sequence)
        # convert sequence to array
        orig_seq_array = numpy.array(list(sequence))
        # trim down gaps at edges so they do not exert undue influence
        # on trimming the sequence row
        seq_array = orig_seq_array[start:end]
        # reset per-row defaults so nothing carries over from the
        # previous iteration
        bad_start = 0
        bad_end = len(sequence)
        # =============================================================
        # get first 5' => 3' positions that start a `window_size` block
        # of sequence having a divergence of less than `max_divergence`
        # from the consensus sequence of all alignments
        # =============================================================
        # compare the sequence to the consensus, returns an array of
        # boolean values (True where the row differs from the consensus)
        compare = (seq_array != consensus_array[start:end])
        # begin working from 5' => 3' across `compare` array
        for bad_start in xrange(compare.size):
            # get successive window-sized slices
            window = compare[bad_start:bad_start + window_size]
            divergence = float(sum(window)) / window.size
            # stop if we hit a point where divergence < max_divergence
            if divergence < max_divergence:
                break
        # reverse the `compare` array and begin working 3' => 5'
        reversed_compare = compare[::-1]
        for bad_end in xrange(reversed_compare.size):
            window = reversed_compare[bad_end:bad_end + window_size]
            divergence = float(sum(window)) / window.size
            if divergence < max_divergence:
                # convert the offset from the 3' end into a 5'-based index
                bad_end = reversed_compare.size - bad_end
                break
        # given original edge trimming and `bad_start`/`bad_end` values,
        # set the edge values of the sequence array to '-'
        orig_seq_array[:start + bad_start] = '-'
        orig_seq_array[start + bad_end:] = '-'
        trim = ''.join(orig_seq_array)
        # ensure alignment consists of something other than '-' or '?'
        # and that alignments are >= min_len.  Both comparisons must be
        # set-to-set: comparing a set with a list is always unequal, which
        # previously let all-'?' rows through.
        symbols = set(trim)
        if symbols != set(['-']) and symbols != set(['?']) and len(trim) >= min_len:
            s2_trimmed.append(self._record_formatter(trim, sequence.id))
        # if they're not, return None
        else:
            s2_trimmed = None
            break
    return s2_trimmed
#!/usr/bin/env python
from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped
import sys

# This script takes a FASTA alignment and converts it to a
# nexus alignment.
#
# Usage: FastaToNexus.py <inputfile> <outputfile>

# check for correct arguments
if len(sys.argv) != 3:
    print("Usage: FastaToNexus.py <inputfile> <outputfile>")
    sys.exit(0)

input_name = sys.argv[1]
output_name = sys.argv[2]

# Read the whole alignment first so that a missing or malformed input does
# not leave a truncated output file behind; the context managers close both
# handles even when parsing or writing fails.
with open(input_name, 'r') as input_file:
    alignment = AlignIO.read(input_file, 'fasta',
                             alphabet=Gapped(IUPAC.ambiguous_dna, '-'))

with open(output_name, 'w') as output_file:
    AlignIO.write(alignment, output_file, 'nexus')
def __init__(self):
    """Initialise the base Alignment with a '-'-gapped unambiguous DNA alphabet."""
    gapped_dna = Gapped(IUPAC.unambiguous_dna, '-')
    Alignment.__init__(self, gapped_dna)
#! /usr/bin/env python
'''
Convert the FASTA alignment(s) in the file named by the first command line
argument to NEXUS format, written to the file named by the second argument.
'''
import sys
from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped

# AlignIO.parse is lazy, so the input handle has to stay open while the
# alignments are written.  The context managers close both handles even if
# parsing or writing raises (output first, then input, as before).
with open(sys.argv[1], "rU") as input_handle:
    with open(sys.argv[2], "w") as output_handle:
        alignments = AlignIO.parse(input_handle, "fasta",
                                   alphabet=Gapped(IUPAC.unambiguous_dna))
        AlignIO.write(alignments, output_handle, "nexus")
def get_aa(seq):
    """Translate a DNA string and return the result as a plain string.

    The input is wrapped in a Seq with a gapped unambiguous DNA alphabet and
    translated with codon table 1; translation does not stop at stop codons.
    """
    dna = Seq(seq, Gapped(IUPAC.unambiguous_dna))
    return str(dna.translate(table=1, to_stop=False))
def _record_formatter(self, trim, name):
    """Wrap the sequence string `trim` in a Biopython SeqRecord.

    The record uses a gapped ('-' / '?') ambiguous DNA alphabet and carries
    `name` as its id, name and description.
    """
    alphabet = Gapped(IUPAC.ambiguous_dna, "-?")
    sequence = Seq(trim, alphabet)
    return SeqRecord(sequence, id=name, name=name, description=name)
def replace_outgroup_with_gap(seq_directory, outgroup_path, window_size = 20, Max_p_sites_o = 8):
    """Mask densely polymorphic alignment columns in the outgroup rows.

    For every alignment file found in the ``s1_rm_polymorphism_sites``
    folder (derived from ``seq_directory``), columns are scanned in windows
    produced by ``window(range(total_length), window_size)``.  Columns
    flagged by the rules below are collected per window; when one window
    holds more than ``Max_p_sites_o`` flagged columns, those columns are
    marked.  In the output, marked columns are replaced by "-" for records
    whose id is in the outgroup list; every other record is copied
    unchanged.  Results are written as FASTA into the
    ``s2_rm_polymorphism_in_outgroups`` folder under the same file name.

    seq_directory: path containing "s1_Gene/"; that part is replaced by
        "s7_well_trimal/" to locate the input.
    outgroup_path: passed to ``input_outgroup``; presumably yields the ids
        of the outgroup records -- verify against that helper.
    window_size: window length handed to ``window``.
    Max_p_sites_o: number of flagged columns a window may hold before its
        flagged columns are masked.

    NOTE(review): the column statistics are computed over ALL records of
    the alignment, not only the outgroup rows -- confirm this is intended.
    """
    ### define iupac
    iupac_bases = ['m', 'r', 'w', 's', 'y', 'k', 'M', 'R', 'W', 'S', 'Y', 'K', "v", "h", "d", "b", "V", "H", "D", "B"]
    ### input directory from s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")
    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)
    output_directory_1 = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    output_directory_2 = output_directory_1.replace("/s1_rm_polymorphism_sites/","/s2_rm_polymorphism_in_outgroups/")
    if os.path.isdir(output_directory_2) == False:
        os.makedirs(output_directory_2)
    ### iterate each gene
    for file in os.listdir(output_directory_1):
        if file != ".DS_Store":
            output_directory_file = output_directory_2 + file
            fasta_name = output_directory_1 + file
            sequences = glob(fasta_name)
            ### read each alignment sequences
            for sequence in sequences:
                print("sequence: " + sequence)
                alignment = AlignIO.read(sequence, 'fasta')
                ### calculate the polymorphism in outgroup
                ### change alignment to an array.
                # columns collected from every window that exceeded the limit
                total_wrong_poly_sites_outgroup = []
                align_array_outgroup = np.array([list(rec) for rec in alignment])
                ### , np.character
                # print(align_array)
                ### calculate the whole length of the alignment
                total_length = alignment.get_alignment_length()
                # alignment = AlignIO.read(sequence, 'fasta')
                for each in window(range(total_length), window_size):
                    # print(list(each))
                    # NOTE: the two tallies below are updated but never read
                    # afterwards; only column_position_outgroup drives masking.
                    poly_site_no_iupac = 0
                    poly_site_number = 0
                    column_position_outgroup = []
                    ### for each block calculate the polymorphism sites number.
                    for column in each:
                        ### calculate each site (each column).
                        counter = Counter(align_array_outgroup[:, column])
                        ### sorted by frequency
                        sorted_bases = counter.most_common()
                        # print(counter)
                        # print(sorted_bases)
                        # print(len(counter))
                        ### count the sites with different situations.
                        gap_yes = 0
                        if len(counter) ==1:
                            # a single symbol in the column: not polymorphic
                            poly_site_number = poly_site_number + 0
                            poly_site_no_iupac = poly_site_no_iupac + 0
                        elif len(counter) == 2:
                            # NOTE(review): gap_yes is overwritten on every
                            # pass, so after the loop it only reflects the
                            # LAST entry of sorted_bases -- confirm that
                            # "any symbol is a gap" was not intended.
                            for i in sorted_bases:
                                if i[0] == "-":
                                    gap_yes = 1
                                else:
                                    gap_yes = 0
                            # print("gap is 1 or 0:" + str(gap_yes))
                            if gap_yes == 1:
                                # print counter
                                poly_site_number = poly_site_number + 0
                                poly_site_no_iupac = poly_site_no_iupac + 0
                            else:
                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0
                                if len(iupac_in_alignment) == 0:
                                    # two symbols, neither an ambiguity code:
                                    # flag the column
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position_outgroup.append(column)
                        elif len(counter) == 3:
                            # same last-entry-wins gap detection as above
                            for i in sorted_bases:
                                if i[0] == "-":
                                    gap_yes = 1
                                else:
                                    gap_yes = 0
                            # print("gap is 1 or 0:" + str(gap_yes))
                            if gap_yes == 1:
                                # print counter
                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    # poly_site_no_iupac = poly_site_no_iupac + 1
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0
                                else:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position_outgroup.append(column)
                            else:
                                poly_site_number = poly_site_number + 1
                                poly_site_no_iupac = poly_site_no_iupac + 1
                                # print(column)
                                column_position_outgroup.append(column)
                        else:
                            # four or more distinct symbols: always flagged
                            poly_site_number = poly_site_number + 1
                            poly_site_no_iupac = poly_site_no_iupac + 1
                            # print(column)
                            column_position_outgroup.append(column)
                    # print("column_position: " + str(column_position))
                    # print(len(column_position))
                    ### if there are more than 8 polymorphic sites in 20 base pairs, select those sites positions.
                    if len(column_position_outgroup) > float(Max_p_sites_o):
                        print(column_position_outgroup)
                        total_wrong_poly_sites_outgroup = total_wrong_poly_sites_outgroup + column_position_outgroup
                # de-duplicate the collected column indices
                unique_wrong_sites_ougroup = list(np.unique(total_wrong_poly_sites_outgroup))
                print(unique_wrong_sites_ougroup)
                print("outgroup")
                align_2 = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
                for record in alignment:
                    new_seq = ""
                    if record.id in outgroups:
                        print(record.seq)
                        # rebuild the outgroup row, masking marked columns
                        for i in range(total_length):
                            if i in unique_wrong_sites_ougroup:
                                new_seq = new_seq + "-"
                            else:
                                new_seq = new_seq + str(record.seq[i])
                        align_2.add_sequence(str(record.id), str(new_seq))
                    else:
                        # records outside the outgroup list are copied as they are
                        align_2.add_sequence(str(record.id), str(record.seq))
                print(align_2)
                AlignIO.write(align_2, output_directory_file, "fasta")