def _ID(a): r"""The function saves ids in a list. Arguments: -a- alignment file Example: >>> with open("alignment.fasta", "w") as alignment_file: ... alignment_file.write(">ENSG0997"+"\n"+"TGA"+"\n"+">ENSG1233"+"\n"+"AAA") >>> import os >>> from Bio import AlignIO >>> ID = _ID("alignment.fasta") >>> ID ['ENSG0997','ENSG1233']""" fileName, fileExtension = os.path.splitext(a) if fileExtension == ".phylip": try: l = list(AlignIO.read(a,"phylip")) except (ValueError): l = list(AlignIO.read(a,"phylip-relaxed")) except: pass elif fileExtension == ".fasta": l = list(AlignIO.read(a,"fasta")) else: raise Exception("Wrong format. Choose accepted format.") ID = [str(l[j].id) for j in range(0,len(l))] return(ID)
def fas_to_nex(infile, outfile, protein=True):
    '''Convert fasta infile to nexus and write to outfile. Uses Biopython.'''
    if protein:
        aln = AlignIO.read(infile, 'fasta', alphabet=Gapped(IUPAC.extended_protein))
    else:
        aln = AlignIO.read(infile, 'fasta', alphabet=Gapped(IUPAC.unambiguous_dna))
    AlignIO.write(aln, outfile, 'nexus')
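# Hedged usage sketch for fas_to_nex. It assumes an older Biopython release
# (pre-1.78) that still ships Bio.Alphabet, which the Gapped/IUPAC arguments
# above require; "proteins.fasta" is a placeholder input file.
from Bio import AlignIO
from Bio.Alphabet import IUPAC, Gapped

fas_to_nex("proteins.fasta", "proteins.nex", protein=True)
# proteins.nex now holds the same alignment in NEXUS format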
def main_origin(args, stdout, stderr):
    if not os.path.isdir(args.geneTable):
        stderr.write("Loading gene table\n")
        geneTable = pygenes.GeneTable()
        geneTable.loadTable(args.geneTable)
        for inputFile in args.alnFiles:
            aln = AlignIO.read(inputFile, "fasta")
            origins = collections.defaultdict(list)
            for seq in aln:
                origins[geneTable.geneId(seq.description).recordId].append(seq.description)
            multipleOrigins = [(x, y) for (x, y) in origins.items() if len(y) > 1]
            for (x, y) in multipleOrigins:
                stdout.write(inputFile + "\t" + str(x) + "\t" + str(len(y)) + "\t" + ";".join(y) + "\n")
    else:
        n = str(len(args.alnFiles))
        for (i, inputFile) in enumerate(args.alnFiles):
            stderr.write("Processing file " + str(i + 1) + "/" + n + "\n")
            geneTableFile = os.path.join(args.geneTable, os.path.basename(inputFile) + ".geneTable")
            geneTable = pygenes.GeneTable()
            geneTable.loadTable(geneTableFile)
            aln = AlignIO.read(inputFile, "fasta")
            origins = collections.defaultdict(list)
            for seq in aln:
                origins[geneTable.geneId(seq.description).recordId].append(seq.description)
            multipleOrigins = [(x, y) for (x, y) in origins.items() if len(y) > 1]
            for (x, y) in multipleOrigins:
                stdout.write(inputFile + "\t" + str(x) + "\t" + str(len(y)) + "\t" + ";".join(y) + "\n")
def get_second_seq(self):
    start_buf, end_buf = self.second_seq_text_buffer.get_bounds()
    seq_direct = self.second_seq_text_buffer.get_text(start_buf, end_buf, False).upper()
    seq_file = self.second_seq_file_entry.get_text()
    seq_online = self.second_seq_online_entry.get_text()
    if seq_direct:
        self.second_seq = seq_direct
    if seq_file:
        align = AlignIO.read(seq_file, self.file_type)
        self.second_seq = str(align[0].seq)
    if seq_online:
        if self.file_type == 'fasta':
            f_type = 'fasta'
        elif self.file_type == 'genbank':
            f_type = 'gb'
        url = ('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
               '?db=nucleotide&id=' + seq_online + '&rettype=' + f_type)
        f = urllib.request.urlopen(url)
        result = f.read().decode('utf-8')
        if len(result) < 10:
            self.show_message("Error", "Failed to retrieve the second sequence (please check your ID Number)")
            return
        else:
            file_name = "cache/" + seq_online + "." + self.file_type
            with open(file_name, "w") as cache_file:
                cache_file.write(result)
            align = AlignIO.read(file_name, self.file_type)
            self.second_seq = str(align[0].seq)
def concat_sequences(file1, file2, file3, file4, file5, file6, file7, file8, file9, file10):
    files = [file1, file2, file3, file4, file5, file6, file7, file8, file9, file10]
    alignments = [AlignIO.read(f, 'fasta') for f in files]
    complete_sequences = []
    for sequence1 in alignments[0]:
        strain_name = util.get_strain_name(sequence1)
        # Look up the matching record for this strain in the other nine alignments
        matches = [util.get_matching_sequence(aln, strain_name=strain_name)
                   for aln in alignments[1:]]
        # Only keep strains present in all ten alignments
        if all(matches):
            concatenated = sequence1.seq
            for match in matches:
                concatenated += match.seq
            complete_sequences.append([strain_name, concatenated])
    return complete_sequences
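# Hypothetical call for concat_sequences: ten per-gene FASTA alignments of the
# same strains (placeholder file names), concatenated into one sequence per strain.
segment_files = ["gene%02d.fasta" % i for i in range(1, 11)]
for strain, seq in concat_sequences(*segment_files):
    print(strain, len(seq))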
def main(): """ The main function """ parser = cmdline_parser() args = parser.parse_args() #read in the alignment aln_viral = AlignIO.read(open(args.viral, 'r'), "fasta") aln_gta = AlignIO.read(open(args.gta, 'r'), "fasta") aln_viral_stereo = stereo_score(aln_viral) aln_gta_stereo = stereo_score(aln_gta) #convert to tuples viral_stereo_posi = [(i, j) for i, j in enumerate(aln_viral_stereo)] gta_stereo_posi = [(i, j) for i, j in enumerate(aln_viral_stereo)] #convert to pandas dataframe viral_stereo_posi_df = pd.DataFrame.from_records(viral_stereo_posi) gta_stereo_posi_df = pd.DataFrame.from_records(gta_stereo_posi) #add column headers viral_stereo_posi_df.columns = ['position', 'score'] gta_stereo_posi_df.columns = ['position', 'score'] #plot the figures fig = plt.figure(figsize=(30, 10)) plt.bar(viral_stereo_posi_df['position'], viral_stereo_posi_df['score']) plt.bar(ta_stereo_posi_df['position'], -gta_stereo_posi_df['score'], color='r') plt.axis([0, 700, -1.2, 1.2]) savefig('args.output')
def load_tree(seqfname):
    """Load an alignment, build & prep a tree, return the tree object."""
    if seqfname.endswith('.aln'):
        aln = AlignIO.read(seqfname, 'clustal')
    elif seqfname.endswith('.fasta'):
        # Run MAFFT quickly (universal_newlines gives str, not bytes, on Python 3)
        alndata = subprocess.check_output(['mafft', '--quiet', '--auto', seqfname],
                                          universal_newlines=True)
        aln = AlignIO.read(StringIO(alndata), 'fasta')
    else:
        raise ValueError("Input sequences must be a Clustal alignment (.aln) "
                         "or unaligned FASTA (.fasta)")
    # Use conserved (less-gappy) blocks to build the tree
    aln = alnutils.blocks(aln, 0.4)
    with tempfile.NamedTemporaryFile(mode='w') as tmp:
        AlignIO.write(aln, tmp, 'fasta')
        tmp.flush()
        treedata = subprocess.check_output(['fasttree', '-pseudo', '-gamma', '-wag',
                                            tmp.name], universal_newlines=True)
    tree = Phylo.read(StringIO(treedata), 'newick')
    # Collapse weakly supported splits
    confs = [c.confidence for c in tree.find_clades()
             if c.confidence is not None]
    # ENH: accept min_confidence as an option
    min_confidence = math.fsum(confs) / len(confs)
    tree.collapse_all(lambda c: c.confidence < min_confidence)
    tree.ladderize(reverse=True)
    tree.root.branch_length = 0.0
    return tree
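# Hedged usage sketch for load_tree: assumes the mafft and fasttree binaries are
# on PATH and "family.fasta" is a placeholder file of unaligned sequences.
tree = load_tree("family.fasta")
Phylo.draw_ascii(tree)  # quick look at the collapsed, ladderized tree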
def design_primers(source_dir, target_dir, settings, logfile):
    print("\nDesigning primers using PriFi...\n", file=logfile)
    # get rid of previous files
    utils.purge_dir(target_dir)
    aln_files = glob(os.path.join(source_dir, '*.fasta'))
    print("\tChecking for empty alignments...", file=logfile)
    for f in aln_files:
        try:
            align = AlignIO.read(f, 'fasta')
            filename = os.path.basename(f)
            shutil.copyfile(f, os.path.join(target_dir, filename))
        except Exception:
            print("[WARNING] Empty alignment file?! (%s)" % f, file=logfile)
            continue
    # call PriFi for actual primer design
    for f in glob(os.path.join(target_dir, '*.fasta')):
        aln = AlignIO.read(f, 'fasta')
        summary = AlignInfo.SummaryInfo(aln)
        length = aln.get_alignment_length()
        primerpairs = prifipy.findprimers(0, list(aln), summary, length, settings, logfile)
        if not primerpairs:
            print("%s: No valid primer pair found" % f, file=logfile)
        else:
            print('%s: Found %d primer pair suggestions. Writing primer files:'
                  % (f, len(primerpairs)), file=logfile)
            prifipy.writePrimersToFiles(f, primerpairs, 1, logfile)
def conversion(self, prank_number, prank_ext, format):
    """Get PRANK to do a conversion, and check it with SeqIO."""
    filename = "%s.%s" % (self.output, prank_ext)
    if os.path.isfile(filename):
        os.remove(filename)
    cmdline = PrankCommandline(prank_exe, d=self.input,
                               convert=True, f=prank_number,
                               o='"%s"' % self.output)
    self.assertEqual(
        str(cmdline),
        prank_exe
        + " -d=%s" % self.input
        + ' -o="%s"' % self.output
        + " -f=%i" % prank_number
        + " -convert",
    )
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    child = subprocess.Popen(
        str(cmdline),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=(sys.platform != "win32"),
    )
    return_code = child.wait()
    self.assertEqual(return_code, 0)
    message = child.stdout.read().strip()
    self.assert_(("PRANK: converting '%s' to '%s'" % (self.input, filename)) in message,
                 message)
    self.assertEqual(child.stderr.read(), "")
    self.assert_(os.path.isfile(filename))
    old = AlignIO.read(open(self.input), "fasta")
    # Hack...
    if format == "phylip":
        for record in old:
            record.id = record.id[:10]
    new = AlignIO.read(open(filename), format)
    assert len(old) == len(new)
    for old_r, new_r in zip(old, new):
        self.assertEqual(old_r.id, new_r.id)
        self.assertEqual(str(old_r.seq), str(new_r.seq))
    os.remove(filename)
    del child
def conversion(self, prank_number, prank_ext, format):
    """Get PRANK to do a conversion, and check it with SeqIO."""
    filename = "%s.%s" % (self.output, prank_ext)
    if os.path.isfile(filename):
        os.remove(filename)
    cmdline = PrankCommandline(prank_exe, d=self.input,
                               convert=True, f=prank_number,
                               o='"%s"' % self.output)
    self.assertEqual(str(cmdline),
                     prank_exe
                     + ' -d=%s' % self.input
                     + ' -o="%s"' % self.output
                     + ' -f=%i' % prank_number
                     + ' -convert')
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    result, stdout, stderr = Application.generic_run(cmdline)
    self.assertEqual(result.return_code, 0)
    message = stdout.read().strip()
    self.assert_(("PRANK: converting '%s' to '%s'" % (self.input, filename))
                 in message, message)
    self.assertEqual(stderr.read(), "")
    self.assertEqual(str(result._cl), str(cmdline))
    self.assert_(os.path.isfile(filename))
    old = AlignIO.read(open(self.input), "fasta")
    # Hack...
    if format == "phylip":
        for record in old:
            record.id = record.id[:10]
    new = AlignIO.read(open(filename), format)
    assert len(old) == len(new)
    for old_r, new_r in zip(old, new):
        self.assertEqual(old_r.id, new_r.id)
        self.assertEqual(str(old_r.seq), str(new_r.seq))
    os.remove(filename)
def multiple_alignment(fasta_dict, alignment_type=SeqTypeData().TYPE_DEFAULT):
    in_handle = StringIO()
    fasta_tools.write_fasta_handle(in_handle, fasta_dict)
    muscle_cmd = SeqTypeData().type2cmd[alignment_type]
    child = subprocess.Popen(str(muscle_cmd),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=(sys.platform != "win32"))
    if not child:
        print("Process was not created!")
        return
    if sys.version_info[0] == 3:
        # Python 3 pipes are binary: encode the input, decode the output
        child.stdin.write(bytes(in_handle.getvalue(), 'utf-8'))
        child.stdin.close()
        align = AlignIO.read(StringIO("".join(line.decode() for line in child.stdout)),
                             "clustal")
    else:
        child.stdin.write(in_handle.getvalue())
        child.stdin.close()
        align = AlignIO.read(child.stdout, "clustal")
    fd = copy.deepcopy(fasta_dict)
    for a in align:
        fd.set(a.id, str(a.seq))
    return fd
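# A minimal stand-alone sketch of the same pipe-through-MUSCLE idea, without the
# project-specific SeqTypeData/fasta_tools helpers: feed FASTA text to MUSCLE v3
# (assumed to be on PATH) on stdin and parse its Clustal output with AlignIO.
import subprocess
from io import StringIO
from Bio import AlignIO

fasta_text = ">a\nMKV\n>b\nMKI\n"
proc = subprocess.run(["muscle", "-clwstrict"], input=fasta_text,
                      capture_output=True, text=True, check=True)
aln = AlignIO.read(StringIO(proc.stdout), "clustal")
print(aln)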
def conversion(self, prank_number, prank_ext, format):
    """Get PRANK to do a conversion, and check it with SeqIO."""
    filename = "%s.%s" % (self.output, prank_ext)
    if os.path.isfile(filename):
        os.remove(filename)
    cmdline = PrankCommandline(prank_exe, d=self.input,
                               convert=True, f=prank_number,
                               o='"%s"' % self.output)
    self.assertEqual(str(cmdline),
                     _escape_filename(prank_exe)
                     + ' -d=%s' % self.input
                     + ' -o="%s"' % self.output
                     + ' -f=%i' % prank_number
                     + ' -convert')
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    message, error = cmdline()
    self.assertTrue("PRANK" in message, message)
    self.assertTrue(("converting '%s' to '%s'" % (self.input, filename)) in message,
                    message)
    self.assertEqual(error, "")
    self.assertTrue(os.path.isfile(filename))
    old = AlignIO.read(self.input, "fasta")
    # Hack...
    if format == "phylip":
        for record in old:
            record.id = record.id[:10]
    new = AlignIO.read(filename, format)
    self.assertEqual(len(old), len(new))
    for old_r, new_r in zip(old, new):
        self.assertEqual(old_r.id, new_r.id)
        self.assertEqual(str(old_r.seq), str(new_r.seq))
    os.remove(filename)
def setUp(self):
    self.aln_file = [TEST_ALIGN_FILE1,
                     TEST_ALIGN_FILE2,
                     TEST_ALIGN_FILE3,
                     TEST_ALIGN_FILE4,
                     TEST_ALIGN_FILE5,
                     TEST_ALIGN_FILE6]
    alns = []
    for i in self.aln_file:
        if i[1] == 'parse':
            nucl = SeqIO.parse(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(prot, nucl,
                                        alphabet=codonalign.default_codon_alphabet)
        elif i[1] == 'index':
            nucl = SeqIO.index(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(prot, nucl, max_score=20,
                                        alphabet=codonalign.default_codon_alphabet)
            nucl.close()  # Close the indexed FASTA file (only this branch opens one)
        elif i[1] == 'id':
            nucl = SeqIO.parse(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            with open(i[0][2]) as handle:
                id_corr = dict((line.split()[0], line.split()[1]) for line in handle)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(prot, nucl, corr_dict=id_corr,
                                        alphabet=codonalign.default_codon_alphabet)
        alns.append(caln)
    self.alns = alns
def muscle_align_protein(recs, work_dir, outfmt="fasta", inputorder=True):
    """
    Align given proteins with muscle.
    recs are iterable of Biopython SeqIO objects
    """
    fasta_file = op.join(work_dir, "prot-start.fasta")
    align_file = op.join(work_dir, "prot.aln")
    with open(fasta_file, "w") as fw:
        SeqIO.write(recs, fw, "fasta")

    muscle_cl = MuscleCommandline(cmd=MUSCLE_BIN("muscle"),
                                  input=fasta_file, out=align_file,
                                  seqtype="protein", clwstrict=True)
    stdout, stderr = muscle_cl()
    alignment = AlignIO.read(muscle_cl.out, "clustal")

    if inputorder:
        try:
            muscle_inputorder(muscle_cl.input, muscle_cl.out)
        except ValueError:
            return ""
        alignment = AlignIO.read(muscle_cl.out, "fasta")

    print("\tDoing muscle alignment: %s" % muscle_cl, file=sys.stderr)

    if outfmt == "fasta":
        return alignment.format("fasta")
    if outfmt == "clustal":
        return alignment.format("clustal")
def _input(a): """The function converts alignments to matrix for further use. Arguments: -a- alignment file Example: >>>import os >>>import numpy as np >>>from Bio import AlignIO >>>_input("example.fasta")""" fileName, fileExtension = os.path.splitext(a) if fileExtension == ".phylip": try: l = list(AlignIO.read(a,"phylip")) except (ValueError): l = list(AlignIO.read(a,"phylip-relaxed")) except: pass elif fileExtension == ".fasta": l = list(AlignIO.read(a,"fasta")) else: raise Exception("Wrong format. Choose accepted format.") p = [[i for i in str(l[j].seq)] for j in range(0,len(l))] y = np.array(p) return(y)
def main(argv):
    usage = 'ConvertAln -i <infile> -x <informat> -o <outfile> -f <outformat>'
    infile = ''
    informat = ''
    outfile = ''
    outformat = ''
    try:
        opts, args = getopt.getopt(argv, "hi:x:o:f:",
                                   ["infile=", "informat=", "outfile=", "outformat="])
    except getopt.GetoptError:
        sys.exit(usage)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--infile"):
            infile = arg
        elif opt in ("-x", "--informat"):
            informat = arg
        elif opt in ("-o", "--outfile"):
            outfile = arg
        elif opt in ("-f", "--outformat"):
            outformat = arg
    if not infile:
        sys.exit("must specify infile! %s" % usage)
    if not outformat:
        sys.exit("must specify format to convert to! %s" % usage)
    if not informat:
        informat = guess_format(infile)
    if not outfile:
        if '.' in infile:
            outfile = '.'.join(infile.split('.')[:-1] + [get_extension(outformat)])
        else:
            outfile = '.'.join((infile, get_extension(outformat)))
    if infile in ('pipe', 'stdin', 'STDIN', '|'):
        infile = sys.stdin
    if outformat == 'phylip':
        alignment = AlignIO.read(infile, informat, alphabet=IUPAC.ambiguous_dna)
        alignment = remove_blank(alignment)
        if len(alignment) == 0 or len(alignment[0]) == 0:
            sys.exit()
        if outfile in ('pipe', 'stdout', 'STDOUT', '|', '>'):
            write_phylip(alignment, sys.stdout)
        else:
            out_fh = open(outfile, 'w')
            write_phylip(alignment, out_fh)
            out_fh.close()
    else:
        if outfile in ('pipe', 'stdout', 'STDOUT', '|', '>'):
            outfile = sys.stdout
        if outformat == 'nexus':
            alignment = AlignIO.read(infile, informat, alphabet=IUPAC.ambiguous_dna)
            write_nexus(alignment, outfile)
        else:
            AlignIO.convert(infile, informat, outfile, outformat,
                            alphabet=IUPAC.ambiguous_dna)
def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
    '''
    takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
    note that this suppresses any compensated frameshift mutations

    Parameters:
    - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
    '''
    from Bio import AlignIO, SeqIO
    from Bio.SeqRecord import SeqRecord
    make_dir(self.run_dir)
    os.chdir(self.run_dir)

    # translate
    aa_seqs = {}
    bad_seq = 0
    for seq in self.seqs.values():
        tempseq = seq.seq.translate()
        # use only sequences that translate without trouble
        if '*' not in str(tempseq)[:-1] or prune == False:
            aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
            aa_seqs[seq.id].attributes = seq.attributes
        else:
            if verbose:
                print(seq.id, "has premature stops, discarding")
            bad_seq += '*' in str(tempseq)[:-1]

    print('Number of sequences with stops:', bad_seq, 'out of total', len(self.seqs))
    tmpfname = 'temp_in.fasta'
    SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

    if alignment_tool == 'muscle':
        from Bio.Align.Applications import MuscleCommandline
        cline = MuscleCommandline(input=tmpfname, out=tmpfname[:-5] + 'aligned.fasta')
        cline()
        aln_aa = AlignIO.read(tmpfname[:-5] + 'aligned.fasta', "fasta")
    elif alignment_tool == 'mafft':
        from Bio.Align.Applications import MafftCommandline
        from io import StringIO  # was `from StringIO import StringIO` (Python 2 only)
        mafft_cline = MafftCommandline(input=tmpfname)
        stdout, stderr = mafft_cline()
        aln_aa = AlignIO.read(StringIO(stdout), "fasta")
    else:
        print('Alignment tool not supported:', alignment_tool)
        return

    # generate nucleotide alignment
    self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
    self.sequence_lookup = {seq.id: seq for seq in self.aln}
    self.reference_aligned = self.sequence_lookup[self.reference.id]
    # add attributes to alignment
    for seq in self.seqs.values():
        if seq.id in self.sequence_lookup:
            self.sequence_lookup[seq.id].attributes = seq.attributes
    os.chdir('..')
    remove_dir(self.run_dir)
def read_alignment(self, *args, **kwargs):
    filename = args[0]
    args = args[1:]
    with fileIO.freader(filename) as fl:
        if ISPY3:
            # AlignIO needs a text-mode handle on Python 3
            handle = io.TextIOWrapper(fl)
            msa = AlignIO.read(handle, *args, **kwargs)
        else:
            msa = AlignIO.read(fl, *args, **kwargs)
    self.infile = filename
    # guess alphabet
    self._msa = self._guess_alphabet(msa)
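# Illustrative sketch of why the TextIOWrapper branch above exists: on Python 3,
# AlignIO expects text, so a binary handle (e.g. from gzip) must be wrapped.
# "aln.fasta.gz" is a hypothetical gzipped FASTA alignment.
import gzip
import io
from Bio import AlignIO

with gzip.open("aln.fasta.gz", "rb") as fl:
    msa = AlignIO.read(io.TextIOWrapper(fl), "fasta")
print(msa.get_alignment_length())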
def main():
    args = get_parser()
    nexus_files = glob.glob(os.path.join(args.indirectory, "*.nexus"))
    out_names = [args.outputfile1, args.outputfile2, args.outputfile3]
    # Convert the first three NEXUS alignments to relaxed PHYLIP
    for nexus_file, out_name in zip(nexus_files, out_names):
        with open(nexus_file, "r") as infile, open(out_name, "w") as outfile:
            aln = AlignIO.read(infile, 'nexus')
            AlignIO.write(aln, outfile, 'phylip-relaxed')
def main():
    options, args = interface()
    # iterate through all the files to determine the longest alignment
    files = get_files(options.input)
    align_lengths = [AlignIO.read(f, 'nexus').get_alignment_length()
                     for f in files]
    max_align_length = max(align_lengths)
    # find the middle
    middle = int(round(max_align_length / 2, 0))
    # create a dict to hold the results by position in longest array
    differences = dict((d, np.array([])) for d in range(-middle, middle + 1))
    # iterate through all the files again
    for f in files:
        align = AlignIO.read(f, 'nexus')
        align_length = align.get_alignment_length()
        # determine relative start of this alignment to longest
        align_diff = int(round((max_align_length - align_length) / 2., 0) - middle)
        for col in range(align_length):
            bases = align.get_column(col)
            b_counts = len(set(bases))
            differences[align_diff + col] = np.append(differences[align_diff + col],
                                                      b_counts)
    position = differences.keys()
    if options.output:
        outp = open(options.output, 'w')
    else:
        outp = sys.stdout
    outp.write('bp, mean, ci, onediff, greaterthanonediff, fourdiff, count\n')
    ignore = []
    for p in sorted(position):
        # how many only have 0-1 difference
        try:
            one_diff = sum(differences[p] <= 1) / float(len(differences[p]))
            four_diff = sum(differences[p] >= 4) / float(len(differences[p]))
            greater_than_one_diff = sum(differences[p] > 1) / float(len(differences[p]))
            total = len(differences[p])
        except ZeroDivisionError:
            ignore.append(p)
        if p not in ignore:
            outp.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                p, np.mean(differences[p]), 1.96 * np.std(differences[p]),
                one_diff, four_diff, greater_than_one_diff, total))
    if options.output:
        outp.close()
def standard_test_procedure(self, cline): """Standard testing procedure used by all tests.""" # Overwrite existing files. cline.force = True # Mark output files for later cleanup. self.add_file_to_clean(cline.outfile) if cline.guidetree_out: self.add_file_to_clean(cline.guidetree_out) input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta")) self.assertEqual(str(eval(repr(cline))), str(cline)) output, error = cline() self.assertTrue(not output or output.strip().startswith("CLUSTAL")) # Test if ClustalOmega executed successfully. self.assertTrue(error.strip() == "" or error.startswith("WARNING: Sequence type is DNA.") or error.startswith("WARNING: DNA alignment is still experimental.")) # Check the output... align = AlignIO.read(cline.outfile, "clustal") output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal")) self.assertEqual(len(set(input_records.keys())), len(set(output_records.keys()))) for record in align: self.assertEqual(str(record.seq), str(output_records[record.id].seq)) # TODO - Try and parse this with Bio.Nexus? if cline.guidetree_out: self.assertTrue(os.path.isfile(cline.guidetree_out))
def __init__(self, file_name=None, data=None, format='fasta'):
    if file_name:
        super(Alignment, self).__init__(AlignIO.read(file_name, format))
    elif data:
        # AlignIO.read (not .parse) so we get one alignment's records,
        # not an iterator of alignments
        super(Alignment, self).__init__(AlignIO.read(StringIO(data), format))
    else:
        super(Alignment, self).__init__([])
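# Hedged usage sketch for the Alignment wrapper above, assuming (as the super()
# calls suggest) that it subclasses Bio.Align.MultipleSeqAlignment; the
# two-record FASTA payload is made up.
aln = Alignment(data=">seq1\nACGT-A\n>seq2\nAC-TTA\n")
print(len(aln), aln.get_alignment_length())  # expected: 2 6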
def test_needle_file(self):
    """needle with the asis trick, output to a file."""
    # Setup,
    cline = NeedleCommandline(cmd=exes["needle"])
    cline.set_parameter("-asequence", "asis:ACCCGGGCGCGGT")
    cline.set_parameter("-bsequence", "asis:ACCCGAGCGCGGT")
    cline.set_parameter("-gapopen", "10")
    cline.set_parameter("-gapextend", "0.5")
    # EMBOSS would guess this, but let's be explicit:
    cline.set_parameter("-snucleotide", "True")
    cline.set_parameter("-outfile", "Emboss/temp with space.needle")
    self.assertEqual(str(eval(repr(cline))), str(cline))
    # Run the tool,
    stdout, stderr = cline()
    # Check it worked,
    self.assertTrue(stderr.strip().startswith("Needleman-Wunsch global alignment"),
                    stderr)
    self.assertEqual(stdout.strip(), "")
    filename = cline.outfile
    self.assertTrue(os.path.isfile(filename),
                    "Missing output file %r from:\n%s" % (filename, cline))
    # Check we can parse the output...
    align = AlignIO.read(filename, "emboss")
    self.assertEqual(len(align), 2)
    self.assertEqual(str(align[0].seq), "ACCCGGGCGCGGT")
    self.assertEqual(str(align[1].seq), "ACCCGAGCGCGGT")
    # Clean up,
    os.remove(filename)
def setUp(self):
    nucl = SeqIO.parse(TEST_ALIGN_FILE6[0][0], 'fasta',
                       alphabet=IUPAC.IUPACUnambiguousDNA())
    prot = AlignIO.read(TEST_ALIGN_FILE6[0][1], 'clustal', alphabet=IUPAC.protein)
    with open(TEST_ALIGN_FILE6[0][2]) as handle:
        id_corr = dict((i.split()[0], i.split()[1]) for i in handle)
    aln = codonalign.build(prot, nucl, corr_dict=id_corr,
                           alphabet=codonalign.default_codon_alphabet)
    self.aln = aln
def standard_test_procedure(self, cline): """Standard testing procedure used by all tests.""" self.assertTrue(str(eval(repr(cline))) == str(cline)) input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"), lambda rec : rec.id.replace(":", "_")) #Determine name of tree file if cline.newtree: tree_file = cline.newtree else: #Clustalw will name it based on the input file tree_file = os.path.splitext(cline.infile)[0] + ".dnd" # Mark generated files for later removal self.add_file_to_clean(cline.outfile) self.add_file_to_clean(tree_file) output, error = cline() self.assertTrue(output.strip().startswith("CLUSTAL")) self.assertTrue(error.strip() == "") #Check the output... align = AlignIO.read(cline.outfile, "clustal") #The length of the alignment will depend on the version of clustalw #(clustalw 2.1 and clustalw 1.83 are certainly different). output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile,"clustal")) self.assertTrue(set(input_records.keys()) == set(output_records.keys())) for record in align: self.assertTrue(str(record.seq) == str(output_records[record.id].seq)) self.assertTrue(str(record.seq).replace("-", "") == str(input_records[record.id].seq)) #Check the DND file was created. #TODO - Try and parse this with Bio.Nexus? self.assertTrue(os.path.isfile(tree_file))
def align(cls, seq_records, outfile=None):
    '''Align given sequences
    @param seq_records: a list of SeqRecords objects
    @param outfile: a filename for the output alignment or None
    @return: if the outfile is None, return an AlignmentExt object;
    otherwise return True on success. In both cases return None on error.'''
    if not outfile:
        outfile = mktmp_name('.aln.fasta')
        remove_out = True
    else:
        remove_out = False
    msafile = mktmp_fasta(seq_records)
    args = dict(thread=cpu_count, input=msafile)
    if len(seq_records) < 10000:
        args['auto'] = True
    else:
        args['parttree'] = True
        args['partsize'] = 1000
    ali = None
    if run_cline(MafftCommandline(**args), stdout=outfile):
        if remove_out:
            ali = AlignmentExt.from_msa(AlignIO.read(outfile, 'fasta'))
        else:
            ali = True
    if remove_out:
        safe_unlink(outfile)
    safe_unlink(msafile)
    return ali
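# Hypothetical call for the align() classmethod above; `SomeAligner` is a
# placeholder for the (unnamed) class it belongs to. With no outfile, an
# AlignmentExt comes back per the docstring, assuming MAFFT is installed.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

records = [SeqRecord(Seq("ACGTACGT"), id="s1"),
           SeqRecord(Seq("ACGTCGT"), id="s2")]
ali = SomeAligner.align(records)
if ali is not None:
    print(ali.get_alignment_length())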
def __init__(self, fileName=None, msaBean=None, alphabet="ACDEFGHIKLMNPQRSTVWY-",
             backtrack=None, jsBeanFile=None, id=None):
    self.alphabet = alphabet
    self.backtrack = backtrack
    self.id = id if id else "id_default_msa"
    if fileName:
        # Clustal files end in ".aln"; anything else is assumed to be FASTA
        fType = "clustal" if fileName.endswith(".aln") else "fasta"
        self.alignment = AlignIO.read(open(fileName), fType)
        self.asMatrix = [[aaCoherce(aa) for aa in list(record.seq)]
                         for record in self.alignment]
        self.headers = [record.id for record in self.alignment]
    elif msaBean:
        self.asMatrix = msaBean.matrix
        self.headers = msaBean.header
        self.backtrack = msaBean.backtrack
    elif jsBeanFile:
        with open(jsBeanFile) as json_file:
            json_data = json.load(json_file)
        self.asMatrix = json_data['matrix']
        self.headers = json_data['headers']
        self.backtrack = json_data['backtrack']
    else:
        raise initError("You must specify a bean or a mfasta file")
    self.nSeq = len(self.asMatrix)
    self.length = len(self.asMatrix[0])
    self._frequency = None
def test_long(self):
    """Simple muscle call using long file."""
    # Create a large input file by converting some of another example file
    temp_large_fasta_file = "temp_cw_prot.fasta"
    handle = open(temp_large_fasta_file, "w")
    records = list(SeqIO.parse(open("NBRF/Cw_prot.pir", "rU"), "pir"))[:40]
    SeqIO.write(records, handle, "fasta")
    handle.close()
    # Prepare the command...
    cmdline = MuscleCommandline(muscle_exe)
    cmdline.set_parameter("in", temp_large_fasta_file)
    # Preserve input record order
    cmdline.set_parameter("stable", True)  # Default None treated as False!
    # Use fast options
    cmdline.set_parameter("maxiters", 1)
    cmdline.set_parameter("diags", True)  # Default None treated as False!
    # Use clustal output
    cmdline.set_parameter("clwstrict", True)  # Default None treated as False!
    # Shouldn't need this, but just to make sure it is accepted
    cmdline.set_parameter("maxhours", 0.1)
    # No progress reports to stderr
    cmdline.set_parameter("quiet", True)  # Default None treated as False!
    self.assertEqual(str(cmdline).rstrip(),
                     muscle_exe +
                     " -in temp_cw_prot.fasta -diags -maxhours 0.1" +
                     " -maxiters 1 -clwstrict -stable -quiet")
    self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
    result, out_handle, err_handle = generic_run(cmdline)
    align = AlignIO.read(out_handle, "clustal")
    self.assertEqual(len(records), len(align))
    for old, new in zip(records, align):
        self.assertEqual(old.id, new.id)
        self.assertEqual(str(new.seq).replace("-", ""), str(old.seq))
    os.remove(temp_large_fasta_file)
    # See if quiet worked:
    self.assertEqual("", err_handle.read().strip())
def generate_mask(refpkg, stockholm_alignment):
    """
    Generate an AlignmentMask from a reference package and stockholm alignment
    """
    with refpkg.open_resource('mask') as fp:
        unmasked_positions = set(int(i.strip()) for i in fp.read().split(','))

    # Get length of alignment
    with open(stockholm_alignment) as fp:
        align_length = len(AlignIO.read(fp, 'stockholm')[0])

    # Load consensus columns
    with open(stockholm_alignment) as fp:
        consensus_columns = _parse_stockholm_consensus(fp)

    if not align_length == len(consensus_columns.mask):
        raise ValueError("Consensus Columns and Alignment have "
                         "differing lengths")

    counter = itertools.count()  # was itertools.count().next (Python 2 only)
    consensus_column_indexes = (next(counter) if i else None
                                for i in consensus_columns.mask)
    consensus_mask = AlignmentMask([i in unmasked_positions
                                    for i in consensus_column_indexes])
    return consensus_mask
def test_needle_piped(self):
    """needle with asis trick, output piped to stdout."""
    cline = NeedleCommandline(cmd=exes["needle"],
                              asequence="asis:ACCCGGGCGCGGT",
                              bsequence="asis:ACCCGAGCGCGGT",
                              gapopen=10,
                              gapextend=0.5,
                              auto=True, filter=True)
    self.assertEqual(str(cline),
                     exes["needle"] + " -auto -filter"
                     + " -asequence=asis:ACCCGGGCGCGGT"
                     + " -bsequence=asis:ACCCGAGCGCGGT"
                     + " -gapopen=10 -gapextend=0.5")
    # Run the tool,
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    child.stdin.close()
    # Check we could read its output
    align = AlignIO.read(child.stdout, "emboss")
    self.assertEqual(len(align), 2)
    self.assertEqual(str(align[0].seq), "ACCCGGGCGCGGT")
    self.assertEqual(str(align[1].seq), "ACCCGAGCGCGGT")
    # Check no error output:
    self.assertEqual(child.stderr.read(), "")
    self.assertEqual(0, child.wait())
    child.stdout.close()
    child.stderr.close()
def test_aln_to_leaves(self):
    anc_t = treeanc.TreeAnc.from_file(resources_dir + 'PR.B.100.nwk', 'newick')
    aln = AlignIO.read(resources_dir + 'PR.B.100.fasta', 'fasta')
    err = anc_t.load_aln(aln)
    assert err == 0  # all sequences were set up successfully
with open("%s.fa" % fa_head, "w") as handle: handle.write(">%s\n%s\n>%s\n%s\n" % (germline, germs[germline], entry.id, entry.seq)) clustal_cline = ClustalwCommandline(cmd=clustalw, infile="%s.fa" % fa_head) try: stdout, stderr = clustal_cline() except: print("Error in alignment of %s (will skip): %s" % (entry.id, stderr)) for f in glob.glob("%s.*" % fa_head): os.remove(f) continue alignment = AlignIO.read("%s.aln" % fa_head, "clustal") shift = False for record in alignment: codons = re.sub( "---", "", str(record.seq.strip("-")) ) #don't care about leading/trailing and full-codon indels are fine if "-" in codons: shift = True #likely frameshift --discard! if not shift: #made it; save the sequence good += 1 sequences.append(entry) for f in glob.glob("%s.*" % fa_head): os.remove(f)
    aln = drbaln
    alnindex = dict([(a.id, a) for a in aln])
    compare_tepitope_alleles(alnindex)
    #d1 = compare(ref1, ref2, alnindex)
    #x = d1.merge(d2, right_index=1, left_index=1)
    #print(len(x))
    #compare_ref(hla, bola, ref, alnindex)
    plt.show()
    return


pocket_residues = get_pocket_positions()
librarypssms = get_pssms()
# DRB MHC alignment using IPD sequences; includes BoLA-DRB3 sequences
drbaln = AlignIO.read(drb_aln_file, "fasta")


def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("-t", "--test", dest="test", action='store_true',
                      help="test")
    opts, remainder = parser.parse_args()
    if opts.test == True:
        test()
usage = "%prog [options] < fasta_file > phylip file" description = "Convert a fasta file to a phylip file" parser = optparse.OptionParser(description=description, usage=usage) parser.add_option( "-c", "--convfile", help="File to convert new IDs back to original IDs (D = don't save file)", action="store", type="str", dest="convfile", default=None) (options, args) = parser.parse_args() # Read the FASTA file from stdin and convert it into a phylip file # Use list so we actually edit in-place rather than # just editing a copy that gets destroyed later! aln = list(AlignIO.read(sys.stdin, "fasta")) if not options.convfile == None: fid = open(options.convfile, "w") # We will use this to convert back to the IDs in the fasta file for i in range(len(aln)): newid = "S%09d" % (i) if not options.convfile == None: fid.write("%s\t%s\n" % (newid, aln[i].id)) aln[i].id = newid SeqIO.write(aln, sys.stdout, "phylip")
from Bio import AlignIO
import sys

# Convert a Stockholm alignment to FASTA on stdout
al = AlignIO.read(open(sys.argv[1]), "stockholm")
print(al.format("fasta"))
def Calculate_BaseEdit_freq(self, lQuery_seq=[]):
    dRef = {}
    dResult = {}
    dRef[sBarcode] = sRef
    # total matched reads, insertion, deletion, complex
    dResult[sBarcode] = [0, 0, 0, 0, [], [], [], [], [], [], []]
    # lRef : [(ref_seq, ref_seq_after_barcode, barcode, barcode end pos,
    #          indel end pos, indel from barcode), (...)]
    # dResult = [# of total, # of ins, # of del, # of com,
    #            [total FASTQ], [ins FASTQ], [del FASTQ], [com FASTQ], info]
    iCount = 0

    for sQuery_seq_raw in lQuery_seq:
        iBarcode_matched = 0
        iNeedle_matched = 0
        iInsert_count = 0
        iDelete_count = 0
        iComplex_count = 0

        try:
            # Check the barcode pos and remove it.
            sQuery_seq_raw = sQuery_seq_raw.replace('\r', '')
            iBarcode_start_pos = sQuery_seq_raw.index(sBarcode)
            iBarcode_matched += 1
            sQuery_seq_with_barcode = sQuery_seq_raw[iBarcode_start_pos:]

            # Feed both sequences to needle via bash process substitution
            sRef_seq = r'<(echo -e ">{name}\n{seq}")'.format(
                name=sBarcode + '_ref', seq=sRef)
            sQuery_seq = r'<(echo -e ">{name}\n{seq}")'.format(
                name=sBarcode + '_query', seq=sQuery_seq_with_barcode)
            sNeedle_cmd = (r"/bin/bash -c 'needle -filter {0} {1} -outfile stdout "
                           r"-gapopen {2} -gapextend {3} -endweight Y "
                           r"-endopen {4} -endextend {5}'").format(
                sRef_seq, sQuery_seq, sOG, sOE, sEG, sEE)

            Needle_result = sp.Popen(sNeedle_cmd,
                                     stdout=sp.PIPE, stderr=sp.PIPE,
                                     universal_newlines=True, shell=True)
            lResult = [Instance.seq._data for Instance in
                       AlignIO.read(Needle_result.stdout, "emboss")]
            sRef_needle_ori = lResult[0]
            sQuery_needle_ori = lResult[1]
            Needle_result.stdout.close()

            # detach forward ---, backward ---
            # e.g. ref           ------AAAGGCTACGATCTGCG------
            #      query         AAAAAAAAATCGCTCTCGCTCTCCGATCT
            #      trimmed ref   AAAGGCTACGATCTGCG
            #      trimmed query AAATCGCTCTCGCTCTC
            iReal_ref_needle_start = 0
            iReal_ref_needle_end = len(sRef_needle_ori)
            iRef_needle_len = len(sRef_needle_ori)

            for i, sRef_nucle in enumerate(sRef_needle_ori):
                if sRef_nucle in ['A', 'C', 'G', 'T']:
                    iReal_ref_needle_start = i
                    break

            for i, sRef_nucle in enumerate(sRef_needle_ori[::-1]):
                if sRef_nucle in ['A', 'C', 'G', 'T']:
                    # forward 0 1 2, len : 3
                    # reverse 2 1 0, len - (2 + 1) = 0
                    iReal_ref_needle_end = iRef_needle_len - (i + 1)
                    break

            sRef_needle = sRef_needle_ori[iReal_ref_needle_start:iReal_ref_needle_end + 1]
            if iReal_ref_needle_start:
                sQuery_needle = sQuery_needle_ori[:iReal_ref_needle_end]
            sQuery_needle = sQuery_needle_ori[:len(sRef_needle)]
            # detaching completion

            # indel info making.
            iNeedle_match_pos_ref = 0
            iNeedle_match_pos_query = 0
            iNeedle_insertion = 0
            iNeedle_deletion = 0

            lInsertion_in_read = []  # insertion result [[100, 1], [119, 13]]
            lDeletion_in_read = []   # deletion result  [[97, 1], [102, 3]]

            for i, (sRef_nucle, sQuery_nucle) in enumerate(zip(sRef_needle, sQuery_needle)):
                if sRef_nucle == '-':
                    iNeedle_insertion += 1
                if sQuery_nucle == '-':
                    iNeedle_deletion += 1
                if sRef_nucle in ['A', 'C', 'G', 'T']:
                    if iNeedle_insertion:
                        lInsertion_in_read.append([iNeedle_match_pos_ref, iNeedle_insertion])
                        iNeedle_insertion = 0
                    iNeedle_match_pos_ref += 1
                if sQuery_nucle in ['A', 'C', 'G', 'T']:
                    if iNeedle_deletion:
                        lDeletion_in_read.append([iNeedle_match_pos_query, iNeedle_deletion])
                        iNeedle_match_pos_query += iNeedle_deletion
                        iNeedle_deletion = 0
                    iNeedle_match_pos_query += 1

            lTarget_indel_result = []  # ['20M2I', '23M3D' ...]
            """
            ins case
            ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNN*NNNNNAGCTT
            """
            iCleavage_window_start = int(lIndel_check_pos[0])
            iCleavage_window_end = int(lIndel_check_pos[1]) - 1

            for iMatch_pos, iInsertion_pos in lInsertion_in_read:
                # iMatch_pos is one base
                if iCleavage_window_start <= iMatch_pos <= iCleavage_window_end:
                    iInsert_count = 1
                    lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iInsertion_pos) + 'I')

            """
            del case 1
            ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNNNN**NNNAGCTT
            del case 2
            ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNNNN**NNNNNCTT
            """
            for iMatch_pos, iDeletion_pos in lDeletion_in_read:
                """
                Insertion: 30M3I
                           ^
                ACGT---ACGT
                ACGTTTTACGT -> check this seq
                Insertion just checks two positions

                Deletion: 30M3D
                          ^
                ACGTTTTACGT
                ACGT---ACGT -> check this seq
                But deletion has to include overlap deletion.
                """
                if iMatch_pos <= iCleavage_window_end and \
                        iCleavage_window_start <= (iMatch_pos + iDeletion_pos):
                    iDelete_count = 1
                    lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iDeletion_pos) + 'D')

            if iInsert_count == 1 and iDelete_count == 1:
                iComplex_count = 1
                iInsert_count = 0
                iDelete_count = 0

            """
            23M3I
            23M is included junk_seq after barcode,

            barcode  junk  targetseq    others
            *********ACCCT-------------ACACACACC
            so should select target region.
            If junk seq is removed by target region seq index pos.
            """
            ## 8: indel info
            dResult[sBarcode][8].append([sRef, sQuery_seq_raw, lTarget_indel_result, "",
                                         sRef_needle_ori, sQuery_needle_ori])
            ## "" -> target seq, but this is not used in this project.
        # end: try
        except ValueError:
            # barcode not found (or needle output unparsable); skip this read
            continue

        # total matched reads, insertion, deletion, complex
        dResult[sBarcode][0] += iBarcode_matched
        dResult[sBarcode][1] += iInsert_count
        dResult[sBarcode][2] += iDelete_count
        dResult[sBarcode][3] += iComplex_count

        ## base editing frequency
        """
        BaseEditPos :  0                                            1
        [OrderedDict([('A',0),('C',0),('G',0),('T',0)]),
         OrderedDict([('A',0),('C',0),('G',0),('T',0)]), ...]
        and sum the counts at each position
        """
        if iInsert_count == 0 and iDelete_count == 0 and iComplex_count == 0:
            lBaseEdit = []
            iTarget_len = int(lTarget_window[1]) - int(lTarget_window[0]) + 1
            for i in range(iTarget_len):
                lBaseEdit.append(OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]))

            iTarget_start = int(lTarget_window[0]) - 1
            iTarget_end = int(lTarget_window[1])

            """
            cleavage window start
            ^
            [barcode]ACGACGTACGACGT[cleavage]
            [barcode]ACGACGTACGACGT[cleavage]
            """
            iBase_edit_event = 0
            for i, tRef_Query_base in enumerate(
                    zip(sRef_needle[iTarget_start:iTarget_end],
                        sQuery_needle[iTarget_start:iTarget_end])):
                sRef_base = tRef_Query_base[0]
                sQuery_base = tRef_Query_base[1]
                if sRef_base == '-' or sQuery_base == '-':
                    continue
                if sRef_base != sQuery_base and sQuery_base != 'N':
                    iBase_edit_event = 1
                    lBaseEdit[i][sQuery_base] += 1

            dResult[sBarcode][9].append(lBaseEdit)
            if iBase_edit_event == 1:
                dResult[sBarcode][10].append([
                    sRef, sQuery_seq_raw, lTarget_indel_result,
                    [list(orderedDict.values()) for orderedDict in lBaseEdit],
                    sRef_needle_ori, sQuery_needle_ori])
            # dResult[sBarcode] = [0, 0, 0, 0, [], [], [], [], [], [BaseEdit_freq_data]]

        iBarcode_matched = 0
        iInsert_count = 0
        iDelete_count = 0
        iComplex_count = 0
    # end: for sBarcode, lCol_ref

    return dResult
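# Stand-alone sketch of the AlignIO side of the needle call above: EMBOSS
# needle's pairwise output parses as a two-record alignment. The asis: trick
# avoids temp files; requires the EMBOSS suite on PATH, and the sequences here
# are made up.
import subprocess as sp
from Bio import AlignIO

proc = sp.Popen(["needle", "-asequence", "asis:ACGTACGT",
                 "-bsequence", "asis:ACGGACGT",
                 "-gapopen", "10", "-gapextend", "0.5",
                 "-outfile", "stdout", "-auto"],
                stdout=sp.PIPE, universal_newlines=True)
ref, query = AlignIO.read(proc.stdout, "emboss")
print(ref.seq, query.seq)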
def __init__(self, root_dir, *args, **kwargs):
    # set the default configurations
    self._build_tree = False
    self._infer_gtr = True
    self._root = None
    self._mutation_rate = None
    self._relaxed_clock = None
    self._Tc = None

    # set the directory containing files for the session
    self._root_dir = root_dir
    self._nwk = os.path.join(self._root_dir, in_tree)
    self._aln = os.path.join(self._root_dir, in_aln)
    self._meta = os.path.join(self._root_dir, in_meta)
    self._cfg = os.path.join(self._root_dir, in_cfg)

    # some parameters for the logger function should be set explicitly,
    # because we might need them before super.__init__(...) is called
    self.verbose = 5
    self.t_start = time.time()
    self._log_file = os.path.join(self._root_dir, log_filename)

    # read the JSON configuration file
    with open(self._cfg) as ff:
        config_dic = json.load(ff)

    # Compose the list of steps to perform, and save it as a dictionary
    # to session_state.json (used to update the web browser state)
    self._init_session_state(config_dic)

    # build tree if necessary
    if self._build_tree:
        try:
            self.logger("Building phylogenetic tree...", 1)
            self._advance_session_progress()
            res = build_tree(self._root_dir)
            if res != 0:
                raise RuntimeError("Error in tree builder FastTree. "
                                   "The method returned: " + str(res))
        except Exception as e:
            s = str(e)
            self._session_error("Error occurred when building phylogenetic tree."
                                " Exception description: " + s)
            raise

    try:
        tree = Phylo.read(self._nwk, 'newick')
        aln = AlignIO.read(self._aln, 'fasta')
        # read the metadata
        dates, metadata = self._read_metadata_from_file(self._meta)
        super(TreeTime, self).__init__(dates=dates, tree=tree,
                                       aln=aln, gtr=self._gtr, *args, **kwargs)
    except Exception as e:
        s = str(e)
        self._session_error("Error in TreeTime object initialization."
                            " Exception description: " + s)
        raise

    self._metadata = metadata
def simulate_missing(path_to_align, spp_info, prop_n):
    # Set path to alignment
    alignment_path = str(path_to_align)
    full_path = os.path.abspath(alignment_path)
    base_name = os.path.basename(full_path)
    dir_name = os.path.dirname(full_path)

    # Get species list
    spp_list = str(spp_info).split(",")

    # Get N proportions
    n_percents = [float(x) for x in str(prop_n).split(",")]

    # Read in alignment and prune to desired species if requested
    try:
        formats = {'nex': 'nexus', 'nexus': 'nexus',
                   'phy': 'phylip', 'phylip-relaxed': 'phylip-relaxed',
                   'phylip': 'phylip',
                   'fa': 'fasta', 'fasta': 'fasta'}
        fformat = formats[alignment_path.split('.')[-1]]
        raw_alignment = AlignIO.read(alignment_path, fformat)
    # If alignment cannot be read in, raise exception
    except Exception:
        sys.exit("ERROR: Cannot process " + os.path.basename(alignment_path))

    # Get species from raw alignment
    raw_spp = []
    for seq_record in raw_alignment:
        raw_spp.append(str(seq_record.id))

    if all(elem in spp_list for elem in raw_spp):
        # Create dummy alignment
        global pruned_alignment
        pruned_alignment = raw_alignment[0:0]

        # Populate alignment by adding taxa sorted by taxon ID
        for i in range(0, len(spp_list)):
            spp_to_add = spp_list[i]
            spp_n_percent = n_percents[i]
            raw_index = raw_spp.index(spp_to_add)
            raw_id = raw_alignment[raw_index].id
            raw_seq = raw_alignment[raw_index].seq
            new_seq = add_n(raw_seq, spp_n_percent)
            pruned_alignment.add_sequence(str(raw_id), new_seq)

        # If resulting alignment is empty, raise exception
        if int(pruned_alignment.get_alignment_length()) == 0:
            sys.exit("ERROR: Alignment processed, but appears to have no bases...")
        else:
            extension = alignment_path.split('.')[-1]
            out_name = base_name.replace("." + extension, "_SimN." + extension)
            # os.path.join is portable; the original hard-coded a Windows '\\' separator
            with open(os.path.join(dir_name, out_name), "w") as handle:
                SeqIO.write(pruned_alignment, handle, "phylip")
    else:
        sys.exit("ERROR: Requested species not found in "
                 + os.path.basename(alignment_path) + "...")
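# Hypothetical invocation of simulate_missing: inject 10% and 25% Ns into two
# taxa of a PHYLIP alignment (file and taxon names are placeholders).
simulate_missing("example.phy", "taxonA,taxonB", "0.10,0.25")
# writes example_SimN.phy next to the input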
def ali_parser(alignment_file):
    # optional parameter: use score or hmmemit to choose the seed sequence
    if args.mode == 'con':
        subprocess.check_output([hmmbuild_exe, '--symfrac', '0',
                                 args.output_path + args.file_name + '.hmm', args.ali])
        subprocess.check_output([hmmemit_exe, '-c', '-o',
                                 args.output_path + 'temp_seed_seq.fasta',
                                 args.output_path + args.file_name + '.hmm'])
        seed_seq_record = AlignIO.read(args.output_path + 'temp_seed_seq.fasta', 'fasta')
        seed_seq_record = str(seed_seq_record[0].seq)
        seed_seq_record = SeqRecord(Seq(seed_seq_record), id=args.file_name + '-consensus')
        seed_to_ali_mapping = [x + 1 for x in range(len(seed_seq_record))]

    elif args.mode == 'rep':
        full_ali_obj = AlignIO.read(alignment_file, "fasta")
        for record in full_ali_obj:
            record.seq = str(record.seq).replace('X', '-')  # in case of 'X', replace it with '-'
        # Convert the alignment object to a numpy array
        raw_array = np.array([list(rec) for rec in full_ali_obj], order="F")
        col_array = np.transpose(raw_array)  # shape == (#col, #row), i.e. (#site, #seq)

        # compute the column scores (as the proxy for seq coverage percentage)
        col_scores = []
        for col in col_array:
            unique, counts = np.unique(col, return_counts=True)
            if '-' in unique:
                col_scores.append(1.0 - float(dict(zip(unique, counts))['-']) / float(len(col)))
            else:
                # gap-free column: full coverage (the original skipped these
                # columns, leaving col_scores shorter than the alignment)
                col_scores.append(1.0)

        # score each sequence: a gap contributes 0, a residue contributes its column score
        array_score = np.copy(raw_array).astype(object)
        for j in range(len(array_score)):        # seq index
            for i in range(len(col_scores)):     # col index
                if array_score[j][i] == '-':
                    array_score[j][i] = 0.0
                else:
                    array_score[j][i] = float(col_scores[i])

        seq_score = np.zeros(len(array_score))
        for j in range(len(array_score)):
            for i in range(len(col_scores)):
                seq_score[j] += array_score[j][i]

        # obtaining seed_seq related info: the seq with the highest coverage
        seed_seq_array = raw_array[np.argmax(seq_score)]
        seed_seq = "".join(seed_seq_array)
        for record in full_ali_obj:
            if str(record.seq) == seed_seq:
                seed_id = str(record.id)

        # construct a mapping list: '-' == 0, amino acids == 1, 2, 3, ...
        # (seed_seq positions in the original alignment)
        seed_to_ali_mapping = [0 if x == '-' else 1 for x in seed_seq_array]
        i = 0  # track the alignment position
        j = 1  # track the seed_seq position
        while i < len(seed_seq):
            if seed_to_ali_mapping[i] == 1:
                seed_to_ali_mapping[i] = j
                i += 1
                j += 1
            else:
                i += 1

        # remove the gaps so they won't affect hmm_aa_freq mapping in the Main Processes
        seed_seq_record = SeqRecord(Seq(seed_seq.replace("-", "")), id=seed_id)

    SeqIO.write(seed_seq_record, args.output_path + "temp_seed_seq.fasta", "fasta")

    # run hmmscan
    # (the original swapped these two assignments, sending each cutoff to the wrong flag)
    domain_threshold = args.domain_threshold
    profile_threshold = args.profile_threshold
    subprocess.check_output([hmmscan_exe, '-o', args.output_path + 'temp_hmmscan.out',
                             '-E', profile_threshold, '--domE', domain_threshold,
                             args.database, args.output_path + 'temp_seed_seq.fasta'])
    return (seed_seq_record, seed_to_ali_mapping)
import Bio
from Bio.Phylo import PhyloXML, NewickIO
import argparse
from Bio import Phylo
from Bio import AlignIO
from Bio.Phylo.Consensus import *
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

parser = argparse.ArgumentParser(description="")
parser.add_argument("multSeqAln", help="", type=str)
#parser.add_argument("out", help="")
args = parser.parse_args()

msa = AlignIO.read(args.multSeqAln, "fasta")
#tree = Phylo.read(args.multSeqAln, "newick")
#msas = bootstrap(msa, 100)
calculator = DistanceCalculator('blosum62')
constructor = DistanceTreeConstructor(calculator)
#trees = bootstrap_trees(msa, 100, constructor)

print("start bootstrap")
consensus_tree = bootstrap_consensus(msa, 2, constructor, majority_consensus)
print("bootstrap done")
print(consensus_tree)

consensus_tree.root_with_outgroup("Tamandua")
Phylo.draw(consensus_tree)
        lists[start] = 1
        start += 1
    return lists


###################################################################################
###############################   MAIN PROCESSES   ###############################
###################################################################################

# parse the alignment once and unpack both return values
# (the original called ali_parser(args.ali) twice, re-running hmmbuild/hmmscan)
seed_seq, seed_to_ali_mapping = ali_parser(args.ali)
domains = hmmscan_parser(args.output_path + 'temp_hmmscan.out', seed_seq.id)

hmm_database = {}
for key, value in domains.items():
    # Store each target hmm profile in an HMM class, which is structured as a dictionary
    hmm_database = HMM(args.database, key, hmm_database)

alimentObj = AlignIO.read(args.ali, "fasta")
aligmentObjLen = len(alimentObj[0])

local_seed_seq_master = [[]] * (aligmentObjLen + int(0.1 * aligmentObjLen))
max_seed_seq_master = [[]] * aligmentObjLen
# The master list storing aa_freqs for the entire seed_seq
seed_seq_master = [[]] * (aligmentObjLen + int(0.1 * aligmentObjLen))
domain_position_check = [0 for x in range(aligmentObjLen)]

##### LET HMM_database order as score
'''
domains_order = {}
dom_key = (domains.keys())
for i in dom_key[::-1]:
    domains_order[i] = domains[i]
    aln_gen, aln_nat, score, _, _ = alignments[0]
    match_count = 0
    for i in range(len(aln_gen)):
        if aln_gen[i] != '-':
            if aln_gen[i] == aln_nat[i]:
                match_count += 1
    print(aln_gen)
    print(aln_nat)
    percent_identity = match_count / actual_length
    print(match_count, actual_length, f"{percent_identity*100:0.2f} % identity")

    # Export alignments
    # aln = AlignIO.read('alns/sample_aln.fasta', 'fasta')
    aln = AlignIO.read(aln_filename, 'fasta')
    p = view_alignment(aln, plot_width=800)
    export_svgs(p, filename="figs/" + sample_sp + ".svg")
    # export_png(p, filename="figs/" + sample_sp + ".png")
    # pn.pane.Bokeh(p)

    closest_matches.append(aln_nat.replace('-', ''))
    closest_identities.append(percent_identity)

# write final file
df_func['closest_match'] = closest_matches
df_func['percent_identity'] = closest_identities
df_func.to_csv('alns/func_sps_matches.csv')
def remove_low_cov_and_consensus_columns(alignment_file_in, minimal_cov, min_consensus, alignment_file_out):

    def remove_single_columns_from_msa(alignment_in, column_to_remove):
        alignment_column_l = alignment_in[:, :column_to_remove - 1]
        alignment_column_r = alignment_in[:, column_to_remove:]
        alignment_new = alignment_column_l + alignment_column_r
        return alignment_new

    def remove_multi_columns_from_msa(alignment_in, column_list):
        alignment_new = alignment_in
        removed_col_num = 0
        for column in sorted(column_list):
            # shift each 1-based column index left by the number already removed
            alignment_new = remove_single_columns_from_msa(alignment_new,
                                                           column - removed_col_num)
            removed_col_num += 1
        return alignment_new

    def remove_low_cov_columns(alignment, min_cov):
        # get columns whose gap percentage exceeds the cutoff
        sequence_number = len(alignment)
        total_col_num = alignment.get_alignment_length()
        low_cov_columns = []
        n = 0
        while n < total_col_num:
            current_column = alignment[:, n]
            dash_number = current_column.count('-')
            gap_percent = (dash_number / sequence_number) * 100
            if gap_percent > min_cov:
                low_cov_columns.append(n + 1)
            n += 1
        # remove identified columns
        alignment_new = remove_multi_columns_from_msa(alignment, low_cov_columns)
        return alignment_new

    def remove_low_consensus_columns(alignment, min_consensus):
        # get columns with low consensus
        sequence_number = len(alignment)
        total_col_num = alignment.get_alignment_length()
        low_css_columns = []
        n = 0
        while n < total_col_num:
            current_column = alignment[:, n]
            # get all aa in the current column
            aa_list = set()
            for aa in current_column:
                aa_list.add(aa)
            # get maximum aa percent
            most_abundant_aa_percent = 0
            for each_aa in aa_list:
                each_aa_percent = (current_column.count(each_aa) / sequence_number) * 100
                if each_aa_percent > most_abundant_aa_percent:
                    most_abundant_aa_percent = each_aa_percent
            # if the maximum percent is lower than the provided cutoff,
            # add the current column to the low-consensus column list
            if most_abundant_aa_percent < min_consensus:
                low_css_columns.append(n + 1)
            n += 1
        # remove identified columns
        alignment_new = remove_multi_columns_from_msa(alignment, low_css_columns)
        return alignment_new

    # read in alignment
    alignment = AlignIO.read(alignment_file_in, "fasta")

    # remove low coverage columns
    alignment_cov = remove_low_cov_columns(alignment, minimal_cov)

    # remove low consensus columns
    alignment_cov_css = remove_low_consensus_columns(alignment_cov, min_consensus)

    # write filtered alignment
    with open(alignment_file_out, 'w') as alignment_file_out_handle:
        for each_seq in alignment_cov_css:
            alignment_file_out_handle.write('>%s\n' % str(each_seq.id))
            alignment_file_out_handle.write('%s\n' % str(each_seq.seq))
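# Hedged usage sketch for the column filter above: drop columns with more than
# 50% gaps, then columns whose most abundant residue falls below 30% consensus.
# "raw.fasta" is a placeholder alignment file.
remove_low_cov_and_consensus_columns("raw.fasta", 50, 30, "filtered.fasta")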