def validateInputs(msa, tree=None):
    """Validate the HYPNO input files and the network connection.

    Checks, in order, that ``msa`` can be opened and that its first
    alignment parses as FASTA, that ``tree`` (when given) can be opened and
    parses as Newick, and that ``internet_connected()`` reports a
    connection.  On the first failed check an error message is printed and
    the process exits with status 1.

    Args:
        msa: Path to the aligned FASTA / a2m file.
        tree: Optional path to a Newick tree file.

    Returns:
        0 when every check passes.
    """
    # Check for existence and proper FASTA formatting of input MSA
    try:
        msaHandle = open(msa, "r")
    except (IOError, OSError):
        print('** HYPNO input error: Given MSA file location does not exist or is not accessible: '+msa)
        sys.exit(1)
    try:
        # Pulling the first alignment is enough to prove the file parses;
        # an empty file raises StopIteration, which is rejected as well.
        next(AlignIO.parse(msaHandle, "fasta"))
    except Exception:
        print('** HYPNO input error: improper MSA file format, must be aligned FASTA or a2m format: '+msa)
        sys.exit(1)
    finally:
        msaHandle.close()

    if tree:
        try:
            treeHandle = open(tree, "r")
        except (IOError, OSError):
            print('** HYPNO input error: Given tree file location does not exist or is not accessible: '+tree)
            sys.exit(1)
        try:
            Phylo.read(treeHandle, "newick")
        except Exception:
            # Report the tree path here (the message used to print the MSA path).
            print('** HYPNO input error: improper tree file format, must be Newick format: '+tree)
            sys.exit(1)
        finally:
            treeHandle.close()

    if not internet_connected():
        print('** HYPNO connection error: Please connect to the internet to enable HYPNO remote database queries.')
        sys.exit(1)
    return 0
def main():
    """Rewrite every input nexus alignment using abbreviated taxon names.

    A first pass over all files collects every sequence name; ``abbreviator``
    turns that set into an old-name -> new-name mapping.  A second pass
    rebuilds each alignment under the new names and writes it, with the same
    base file name, into ``args.output``.  The index of each processed file
    is printed as a progress marker.
    """
    args = get_args()
    files = get_files(args.nexus)

    # Pass 1: gather every sequence name that occurs in any input file.
    old_names = set()
    for f in files:
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                old_names.add(seq.name)
    name_map = abbreviator(old_names)

    # Pass 2: re-emit each file with the abbreviated names.
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_align.add_sequence(name_map[seq.name], str(seq.seq))
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # Context manager so the output is flushed and closed per file.
            with open(outf, 'w') as out_handle:
                AlignIO.write(new_align, out_handle, 'nexus')
        except ValueError:
            # NOTE(review): debugging hook kept from the original; consider
            # reporting the file and re-raising instead.
            pdb.set_trace()
        print(count)
def GetExec():
    """Parse every alignment file found in the current working directory.

    Files ending in ``.txt``, ``.fas`` or ``.fasta`` are parsed as FASTA and
    ``.aln`` files as Clustal; anything else is skipped.  For each alignment
    read, ``[alignment, file_stem]`` is appended to ``newList`` and a summary
    tuple ``(file name, alignment number within the file, number of
    sequences, alignment length, format)`` is stored in ``listdata`` under a
    running integer key.  An ``IOError`` while reading a file is printed and
    the scan continues with the next file.

    NOTE(review): the visible code builds ``newList`` and ``listdata`` but
    returns nothing; confirm whether callers expect a return value.
    """
    Recs = os.listdir(os.getcwd())
    newList = []
    listdata = dict()
    j = 0
    format_by_ext = {
        ".txt": "fasta",
        ".fas": "fasta",
        ".fasta": "fasta",
        ".aln": "clustal",
    }
    for rec_name in Recs:
        (name, ext) = os.path.splitext(rec_name)
        typo = format_by_ext.get(ext)
        if typo is None:
            # Unsupported extension: skip it.  Previously this either raised
            # NameError (no parser created yet) or silently re-used the
            # exhausted parser of the preceding file.
            continue
        try:
            for aNum, align in enumerate(AlignIO.parse(rec_name, typo), 1):
                newList.append([align, name])
                NumSeqs = len(align)
                listdata[j] = str(rec_name), aNum, NumSeqs, align.get_alignment_length(), str(typo)
                j += 1
        except IOError as e:
            print(e)
def check_AlignIO_to_EMBOSS(self, in_filename, in_format, skip_formats=(), alphabet=None):
    """Can Bio.AlignIO write files seqret can read back?"""
    # Only pass the alphabet through when one was supplied.
    parse_args = (in_filename, in_format)
    if alphabet:
        parse_args += (alphabet,)
    old_aligns = list(AlignIO.parse(*parse_args))

    # FASTA and NEXUS are only attempted for a single alignment.
    candidates = ["clustal", "phylip"]
    if len(old_aligns) == 1:
        candidates += ["fasta", "nexus"]

    for temp_format in candidates:
        if temp_format in skip_formats:
            continue
        # PHYLIP is a simple format which explicitly supports
        # multiple alignments (unlike FASTA).
        try:
            round_tripped = emboss_piped_AlignIO_convert(old_aligns, temp_format, "phylip")
            new_aligns = list(round_tripped)
        except ValueError:
            # e.g. ValueError: Need a DNA, RNA or Protein alphabet
            # from writing Nexus files...
            continue
        try:
            self.assertTrue(compare_alignments(old_aligns, new_aligns))
        except ValueError as err:
            raise ValueError("Disagree on file %s %s in %s format: %s" % (in_format, in_filename, temp_format, err))
def check_simple_write_read(alignments, indent=" "):
    """Write ``alignments`` in each test format and check they read back equal.

    For every format in ``test_write_read_align_with_seq_count`` the
    alignments are written to an in-memory handle and parsed back, first
    with an explicit ``seq_count`` (when all alignments have the same number
    of records) and then without it (when the format is listed in
    ``test_write_read_alignment_formats``).  Each result is compared with
    ``simple_alignment_comparison``.  Formats that refuse to write the data
    (``ValueError``) are reported and skipped.

    Args:
        alignments: Non-empty list of alignment objects.
        indent: Prefix for the progress messages that are printed.

    Raises:
        ValueError: If output that was just written cannot be parsed back;
            the message contains the parser error, the raw text written and
            the alignments that were written.
    """
    for format in test_write_read_align_with_seq_count:
        # An int when every alignment has the same number of records,
        # otherwise None (seq_count cannot be used then).
        records_per_alignment = len(alignments[0])
        for a in alignments:
            if records_per_alignment != len(a):
                records_per_alignment = None
        # Can we expect this format to work?
        if not records_per_alignment and format not in test_write_read_alignment_formats:
            continue

        print(indent + "Checking can write/read as '%s' format" % format)

        # Going to write to a handle...
        handle = StringIO()
        try:
            c = AlignIO.write(alignments, handle=handle, format=format)
            assert c == len(alignments)
        except ValueError as e:
            # This is often expected to happen, for example when we try and
            # write sequences of different lengths to an alignment file.
            print(indent + "Failed: %s" % str(e))
            # Carry on to the next format:
            continue

        # First, try with the seq_count
        if records_per_alignment:
            handle.flush()
            handle.seek(0)
            try:
                alignments2 = list(AlignIO.parse(handle=handle, format=format, seq_count=records_per_alignment))
            except ValueError as e:
                # This is BAD. We can't read our own output.
                # I want to see the output when called from the test harness,
                # run_tests.py (which can be funny about new lines on Windows)
                handle.seek(0)
                # Show what was written: the parse result is unbound (or
                # stale) here, and referencing it used to mask this error.
                raise ValueError("%s\n\n%s\n\n%s" % (str(e), repr(handle.read()), repr(alignments)))
            simple_alignment_comparison(alignments, alignments2, format)

        if format in test_write_read_alignment_formats:
            # Don't need the seq_count
            handle.flush()
            handle.seek(0)
            try:
                alignments2 = list(AlignIO.parse(handle=handle, format=format))
            except ValueError as e:
                # This is BAD. We can't read our own output.
                # I want to see the output when called from the test harness,
                # run_tests.py (which can be funny about new lines on Windows)
                handle.seek(0)
                raise ValueError("%s\n\n%s\n\n%s" % (str(e), repr(handle.read()), repr(alignments)))
            simple_alignment_comparison(alignments, alignments2, format)

        if len(alignments) > 1:
            # Try writing just one Alignment (not a list)
            handle = StringIO()
            SeqIO.write(alignments[0], handle, format)
            assert handle.getvalue() == alignments[0].format(format)
def initialize_data(self):
    """Build ``self.motif1`` and ``self.motif2`` from their FASTA files.

    Each motif is created from its file path (``self.m1_file`` /
    ``self.m2_file``) and every alignment parsed from that file is passed to
    the motif's ``add_promoter``.  The files are opened with context managers
    so they are closed even when parsing fails.
    """
    self.motif1 = Motif(self.m1_file)
    with open(self.m1_file, 'r') as motif:
        for each in AlignIO.parse(motif, "fasta"):
            self.motif1.add_promoter(each)

    self.motif2 = Motif(self.m2_file)
    with open(self.m2_file, 'r') as motif:
        for each in AlignIO.parse(motif, "fasta"):
            self.motif2.add_promoter(each)
def getStuff(workAlignment, geneName):
    """Count pairwise per-column differences and identities in a MAF file.

    The file is read twice.  The first pass collects the species names (the
    record id up to the first dot) in order of first appearance.  The second
    pass walks every column of every alignment block; columns containing a
    gap (``-``) or an ``n``/``N`` are skipped, and for every ordered pair of
    species present the letters are compared, incrementing either the
    "differences" or the "sames" matrix.

    Three pickles are written: ``data/<geneName>Diffs``,
    ``data/<geneName>Sames`` and ``data/emptyMatrix`` (an all-zero matrix
    over the same species).

    Args:
        workAlignment: MAF file (path or handle accepted by ``AlignIO.parse``).
        geneName: Prefix used for the two result pickle file names.
    """
    # Both names are module-level globals in the original code.  The
    # declaration has to come before the first assignment (Python 3 rejects
    # the reverse order).
    global speciesList, columnDict
    speciesList = []

    def rmDot(string):
        # Keep only the part of the id before the first dot.
        return string.split(".")[0]

    def makeMatrix(species):
        # Square dict-of-dicts over the species, initialised to zero.
        return {i: {j: 0 for j in species} for i in species}

    def fileWriter(path, data):
        with open(path, "wb") as out_handle:
            pickle.dump(data, out_handle)

    # Pass 1: list of species, in order of first appearance.
    for i in AlignIO.parse(workAlignment, "maf"):
        for record in i:
            species_name = rmDot(record.id)
            if species_name not in speciesList:
                speciesList.append(species_name)

    dictOfCounts = makeMatrix(speciesList)
    dictOfSames = makeMatrix(speciesList)

    # Pass 2: compare letters column by column within each block.
    for i in AlignIO.parse(workAlignment, "maf"):
        columnDict = {}
        # Row -> species lookup, computed once per block.
        row_species = [rmDot(record.id) for record in i]
        for j in range(i.get_alignment_length()):
            column = i[:, j]
            # The original test ``'-' or "n" or "N" not in column`` was
            # always true; skip columns that contain any of these symbols.
            if any(symbol in column for symbol in ("-", "n", "N")):
                continue
            for species_name, letter in zip(row_species, column):
                columnDict[species_name] = letter
            for g in columnDict:
                for h in columnDict:
                    if columnDict[h] != columnDict[g]:
                        dictOfCounts[g][h] = dictOfCounts[g][h] + 1
                    else:
                        dictOfSames[g][h] = dictOfSames[g][h] + 1

    fileWriter("data/" + geneName + "Diffs", dictOfCounts)
    fileWriter("data/" + geneName + "Sames", dictOfSames)
    fileWriter("data/emptyMatrix", makeMatrix(speciesList))
def convert_alignments(in_file, out_file, new_type):
    """Convert a nexus alignment file to ``new_type`` and append to ``out_file``.

    Every alignment parsed from ``in_file`` is written in ``new_type``
    format.  ``out_file`` is opened in append mode for each alignment, so
    existing content is kept and the file is only created when at least one
    alignment is read.

    Args:
        in_file: Path of the nexus input file.
        out_file: Path of the file to append the converted alignments to.
        new_type: Output format name understood by ``AlignIO.write``.
    """
    # Context managers close both files even when parsing or writing fails.
    with open(in_file, 'r') as in_handle:
        for alignment in AlignIO.parse(in_handle, "nexus"):
            with open(out_file, "a") as out_handle:
                AlignIO.write(alignment, out_handle, new_type)
def test_seqtmatchall_piped(self):
    """seqmatchall with pair output piped to stdout."""
    tool = exes["seqmatchall"]
    cline = SeqmatchallCommandline(cmd=tool,
                                   sequence="Fasta/f002",
                                   aformat="pair",
                                   wordsize=9,
                                   auto=True,
                                   stdout=True)
    expected_cline = tool + " -auto -stdout" + " -sequence=Fasta/f002" + " -wordsize=9 -aformat=pair"
    self.assertEqual(str(cline), expected_cline)

    # Launch the tool; the shell is only used off Windows.
    use_shell = sys.platform != "win32"
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=use_shell)
    child.stdin.close()

    # Every alignment on stdout must be a 9-column pairwise alignment.
    for align in AlignIO.parse(child.stdout, "emboss"):
        self.assertEqual(len(align), 2)
        self.assertEqual(align.get_alignment_length(), 9)

    # Nothing on stderr, and a clean exit status.
    self.assertEqual(child.stderr.read(), "")
    self.assertEqual(0, child.wait())
    child.stdout.close()
    child.stderr.close()
def convert(infile, type, outtype, outfile):
    """Convert alignments from one format to another via Bio.AlignIO.

    Parses ``infile`` as format ``type`` and writes every alignment to
    ``outfile`` in format ``outtype``.
    """
    from Bio import AlignIO

    parsed_alignments = AlignIO.parse(infile, type)
    AlignIO.write(parsed_alignments, outfile, outtype)
def test_water_file3(self):
    """water with the asis trick and GenBank file, output to a file."""
    # Inputs and the scratch output file.
    query = "TGTTGTAATGTTTTAATGTTTCTTCTCCCTTTAGATGTACTACGTTTGGA"
    out_file = "Emboss/temp_test3.water"
    in_file = "GenBank/cor6_6.gb"
    self.assertTrue(os.path.isfile(in_file))
    if os.path.isfile(out_file):
        os.remove(out_file)

    # Build the command line one parameter at a time.
    cline = WaterCommandline(cmd=exes["water"])
    settings = (
        ("asequence", "asis:%s" % query),
        ("bsequence", in_file),
        # TODO - Tell water this is a GenBank file!
        ("gapopen", "1"),
        ("gapextend", "0.5"),
        ("outfile", out_file),
    )
    for option, value in settings:
        cline.set_parameter(option, value)
    self.assertEqual(str(eval(repr(cline))), str(cline))

    # Run water, then check its output parses and is sensible.
    self.run_water(cline)
    targets = SeqIO.parse(in_file, "genbank")
    alignments = AlignIO.parse(out_file, "emboss")
    self.pairwise_alignment_check(query, targets, alignments, local=True)

    # Remove the scratch file.
    os.remove(out_file)
def parsexmfa(xmfa, r1, r2, ref):
    """Build a consensus per alignment block of a Mauve XMFA file.

    Every block with more than one sequence that contains a record whose id
    includes ``r1`` is converted to a character array and passed to
    ``makesense`` together with the record ids.  The returned consensus is
    stored under the coordinate part of the ``r1`` record id (the text after
    the first ``/`` with ``-`` replaced by ``:``).  Gap totals returned by
    ``makesense`` are accumulated and printed at the end.

    Args:
        xmfa: Path of the XMFA file.
        r1, r2, ref: Identifiers forwarded to ``makesense``; ``r1`` also
            selects the record whose coordinates key the result.

    Returns:
        dict mapping the coordinate string to the value ``makesense``
        returned as consensus for that block.
    """
    print("parsing...")
    consensusdict = {}
    gapfill = 0
    totalgaps = 0
    # Context manager so the XMFA handle is closed when parsing ends or fails.
    with open(xmfa) as handle:
        for aln in AlignIO.parse(handle, "mauve"):  # each alignment block
            if len(aln) <= 1:
                continue
            header = [record.id for record in aln]
            pos = None
            for rec in header:
                if r1 in rec:
                    pos = rec.split("/")[1].replace("-", ":")
            if pos is None:
                # No r1 record in this block: there is no coordinate to file
                # the consensus under.  Previously this hit an unbound name
                # on the first block or re-used the previous block's key.
                # NOTE(review): confirm such blocks should be skipped.
                continue
            alignarr = np.array([list(r) for r in aln], np.character)
            sense, gap, fill = makesense(alignarr, header, r1, r2, ref)
            gapfill += fill
            totalgaps += gap
            consensusdict[pos] = sense
    print("total gaps: {}\n total gaps filled: {}".format(totalgaps, gapfill))
    return consensusdict
def align(hom):
    '''Takes in a homologue from getHomologues() and aligns all of the sequences that it contains

    The first entry of every species in ``hom['species']`` is written to a
    temporary FASTA file, ClustalW is run on it with PHYLIP output, and the
    aligned sequence of each record is stored back so that every species
    keeps exactly one (aligned) entry.

    Previously the ClustalW command line object was written into a StringIO
    buffer instead of being executed, so nothing was ever aligned.

    Returns:
        The same ``hom`` object, updated in place.

    Raises:
        Exception: "ALIGNMENT ERROR" if the stored sequences end up with
            different lengths.
    '''
    import os

    uid_map = {}
    with tempfile.NamedTemporaryFile(mode='w') as temp_file:
        for species in hom['species']:
            temp = hom['species'][species][0]
            temp_file.write('>' + str(temp['uid']) + '\n')
            temp_file.write(temp['seq'] + '\n')
            temp_file.write('\n')
            # Record ids come back from the parser as strings.
            uid_map[str(temp['uid'])] = species
        # ClustalW reads the file by name, so the buffer must be on disk.
        temp_file.flush()

        out_path = temp_file.name + '.phy'
        # NOTE(review): ClustalW also writes a guide tree next to the input;
        # the '.dnd' name below is the usual default - confirm.
        guide_tree_path = os.path.splitext(temp_file.name)[0] + '.dnd'
        cline = ClustalwCommandline("clustalw", infile=temp_file.name, outfile=out_path, align='true', output='PHYLIP')
        try:
            cline()
            alignment = AlignIO.read(out_path, 'phylip')
        finally:
            for leftover in (out_path, guide_tree_path):
                if os.path.exists(leftover):
                    os.remove(leftover)

    for record in alignment:
        #TODO: Don't throw out data here
        # NOTE(review): strict PHYLIP truncates names to 10 characters; this
        # lookup assumes uids are short enough to survive - confirm.
        species = uid_map[record.id]
        # keep only the first protein for the species, now aligned
        temp = hom['species'][species][0]
        temp['seq'] = str(record.seq)
        hom['species'][species] = [temp]

    # Every aligned sequence must have the same length.
    seq_len = False
    for species in hom['species']:
        if not seq_len:
            seq_len = len(hom['species'][species][0]['seq'])
        if len(hom['species'][species][0]['seq']) != seq_len:
            raise Exception("ALIGNMENT ERROR")
    return hom
def main():
    """Print a marker line for every alignment block of a fixed MAF file."""
    global args
    print("doop")
    maf_path = "/home/nabil/IGORwork/safe.ALL.MAF"
    blocks = AlignIO.parse(maf_path, "maf")
    for _block in blocks:
        print("new m")
def test_water_file2(self):
    """water with the asis trick and nucleotide FASTA file, output to a file."""
    # Inputs and the scratch output file.
    query = "ACACACTCACACACACTTGGTCAGAGATGCTGTGCTTCTTGGAAGCAAGGNCTCAAAGGCAAGGTGCACGCAGAGGGACGTTTGAGTCTGGGATGAAGCATGTNCGTATTATTTATATGATGGAATTTCACGTTTTTATG"
    out_file = "Emboss/temp_test2.water"
    in_file = "Fasta/f002"
    self.assertTrue(os.path.isfile(in_file))
    if os.path.isfile(out_file):
        os.remove(out_file)

    # Build the command line one parameter at a time.
    cline = WaterCommandline(cmd=exes["water"])
    settings = (
        ("-asequence", "asis:%s" % query),
        ("-bsequence", in_file),
        ("-gapopen", "10"),
        ("-gapextend", "0.5"),
        ("-outfile", out_file),
    )
    for option, value in settings:
        cline.set_parameter(option, value)
    self.assertEqual(str(eval(repr(cline))), str(cline))

    # Run water, then check its output parses and is sensible.
    self.run_water(cline)
    targets = SeqIO.parse(in_file, "fasta")
    alignments = AlignIO.parse(out_file, "emboss")
    self.pairwise_alignment_check(query, targets, alignments, local=True)

    # Remove the scratch file.
    os.remove(out_file)
def main():
    """Strip the file-name prefix from taxon names in nexus alignments.

    For every input file, each sequence whose name starts with the file's
    base name (without extension), optionally followed by underscores, has
    that prefix removed from both its ``id`` and ``name``.  The renamed
    records are written as a nexus alignment with the same file name into
    ``args.output``.  After each file the number of distinct taxon names
    seen so far must equal ``args.taxa``.
    """
    args = get_args()
    files = get_files(args.input)
    all_taxa = set([])
    for count, f in enumerate(files):
        new_align = MultipleSeqAlignment([], generic_dna)
        # The prefix depends only on the file, so build the pattern once.
        # re.escape keeps regex metacharacters in the file name literal.
        fname = os.path.splitext(os.path.basename(f))[0]
        prefix_pattern = "^{}_*".format(re.escape(fname))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_seq_name = re.sub(prefix_pattern, "", seq.name)
                all_taxa.add(new_seq_name)
                seq.id = new_seq_name
                seq.name = new_seq_name
                new_align.append(seq)
        assert len(all_taxa) == args.taxa, "Taxon names are not identical"
        outf = os.path.join(args.output, os.path.split(f)[1])
        try:
            # Context manager so the output is flushed and closed per file.
            with open(outf, 'w') as out_handle:
                AlignIO.write(new_align, out_handle, 'nexus')
        except ValueError:
            # NOTE(review): debugging hook kept from the original; consider
            # reporting the file and re-raising instead.
            pdb.set_trace()
        print(count)
    print("Taxon names in alignments: {0}".format(','.join(list(all_taxa))))
def remove_duplicate_sequences_and_sequences_missing_too_much_data(self, output_filename,remove_identical_sequences = 0):
    """Write the input alignment without the unwanted taxa.

    Taxa reported by ``self.taxa_missing_too_much_data()`` are always
    dropped; when ``remove_identical_sequences`` is at least 1 the taxa from
    ``self.taxa_of_duplicate_sequences()`` are dropped as well.  The
    remaining records of ``self.input_filename`` are written to
    ``output_filename`` as one FASTA alignment.

    The process exits (``sys.exit``) when one or zero sequences remain; the
    output file is only opened after that check, so it is not truncated on
    failure.

    Returns:
        The list of taxa that were removed.
    """
    if remove_identical_sequences < 1:
        taxa_to_remove = self.taxa_missing_too_much_data()
    else:
        taxa_to_remove = self.taxa_of_duplicate_sequences() + self.taxa_missing_too_much_data()

    # Set lookup instead of scanning the list once per record.
    excluded = set(taxa_to_remove)
    with open(self.input_filename) as input_handle:
        output_alignments = [
            record
            for alignment in AlignIO.parse(input_handle, "fasta")
            for record in alignment
            if record.id not in excluded
        ]

    if len(output_alignments) <= 1:
        sys.exit("Not enough sequences are left after removing duplicates.Please check you input data.")

    with open(output_filename, "w") as output_handle:
        AlignIO.write(MultipleSeqAlignment(output_alignments), output_handle, "fasta")
    return taxa_to_remove
def test_needle_piped2(self):
    """needle with asis trick, and nucleotide FASTA file, output piped to stdout."""
    # TODO - Support needle in Bio.Emboss.Applications
    # (ideally with the -auto and -filter arguments)
    query = "ACACACTCACACACACTTGGTCAGAGATGCTGTGCTTCTTGGAA"
    cline = "".join([
        exes["needle"],
        " -asequence asis:" + query,
        " -bsequence Fasta/f002",
        " -auto",  # no prompting
        " -filter",  # use stdout
    ])

    # Launch the tool; the shell is only used off Windows.
    use_shell = sys.platform != "win32"
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=use_shell)
    child.stdin.close()

    # The piped output must parse and be a sensible global alignment.
    targets = SeqIO.parse("Fasta/f002", "fasta")
    alignments = AlignIO.parse(child.stdout, "emboss")
    self.pairwise_alignment_check(query, targets, alignments, local=False)

    # Nothing on stderr, and a clean exit status.
    self.assertEqual(child.stderr.read(), "")
    self.assertEqual(0, child.wait())
    child.stdout.close()
    child.stderr.close()
def check_bootstrap(self, filename, format, align_type="d"):
    """Check we can use fseqboot to pseudosample an alignment.

    The align_type argument is passed to the commandline object to set the
    output format to use (from [D]na, [p]rotein and [r]na).

    fseqboot is run with two replicates writing to ``test_file``; the result
    must contain two alignments, each listing the same sequence names (with
    spaces replaced by underscores) as the original alignment.

    Raises:
        ValueError: If fseqboot exits with a non-zero return code.
    """
    # assertTrue replaces the assert_ alias, which newer unittest versions
    # no longer provide.
    self.assertTrue(os.path.isfile(filename), "Missing %s" % filename)
    cline = FSeqBootCommandline(exes["fseqboot"],
                                sequence=filename,
                                outfile="test_file",
                                seqtype=align_type,
                                reps=2,
                                auto=True,
                                filter=True)
    return_code = run_command(cline)
    if return_code != 0:
        raise ValueError("Return code %s from:\n%s"
                         % (return_code, str(cline)))
    # the resultant file should have 2 alignments...
    with open("test_file", "r") as handle:
        bs = list(AlignIO.parse(handle, format))
    self.assertEqual(len(bs), 2)
    # ..and each name in the original alignment...
    with open(filename, "r") as handle:
        a_names = [s.name.replace(" ", "_") for s in AlignIO.read(handle, format)]
    # ...should be in each alignment in the bootstrapped file
    for a in bs:
        self.assertEqual(a_names, [s.name.replace(" ", "_") for s in a])
def filter_out_alignments_with_too_much_missing_data(input_filename, output_filename, filter_percentage,verbose):
    """Write only the sequences whose share of missing data is acceptable.

    For every record in the FASTA input the characters ``n``, ``N`` and
    ``-`` are counted.  Records with an empty sequence, or whose whole-number
    percentage of such characters exceeds ``filter_percentage``, are
    reported on stdout and dropped; the rest are written to
    ``output_filename`` as one FASTA alignment.

    The process exits (``sys.exit``) when one or zero sequences remain; the
    output file is only opened after that check, so it is not truncated on
    failure.

    Args:
        input_filename: FASTA alignment to read.
        output_filename: FASTA file to write.
        filter_percentage: Maximum allowed percentage of missing data.
        verbose: Not used by this function.

    Returns:
        List of the ids of the sequences that were excluded.
    """
    output_alignments = []
    taxa_removed = []
    with open(input_filename, "r") as input_handle:
        for alignment in AlignIO.parse(input_handle, "fasta"):
            for record in alignment:
                number_of_gaps = sum(record.seq.count(symbol) for symbol in ('n', 'N', '-'))
                sequence_length = len(record.seq)
                if sequence_length == 0:
                    taxa_removed.append(record.id)
                    print("Excluded sequence " + record.id + " because there werent enough bases in it")
                    continue
                # Floor division keeps the whole-number percentage the
                # Python 2 code produced, on Python 3 as well.
                percentage_gaps = number_of_gaps * 100 // sequence_length
                if percentage_gaps <= filter_percentage:
                    output_alignments.append(record)
                else:
                    taxa_removed.append(record.id)
                    print("Excluded sequence " + record.id + " because it had " + str(percentage_gaps) +" percentage gaps while a maximum of "+ str(filter_percentage) +" is allowed")

    if len(output_alignments) <= 1:
        sys.exit("Too many sequences have been excluded so theres no data left to work with. Please increase the -f parameter")

    with open(output_filename, "w") as output_handle:
        AlignIO.write(MultipleSeqAlignment(output_alignments), output_handle, "fasta")
    return taxa_removed
def load(cls, filename, schema=None):
    """Load every alignment in ``filename`` as an ``AlignmentExt``.

    The format name is resolved through ``cls.schema(filename, schema)``.
    Any exception is reported on stdout and ``None`` is returned instead of
    a list.
    """
    try:
        resolved_schema = cls.schema(filename, schema)
        loaded = []
        for msa in AlignIO.parse(filename, resolved_schema):
            loaded.append(AlignmentExt.from_msa(msa))
        return loaded
    except Exception as e:
        print('Unable to load alignments from: %s\n%s' % (filename, str(e)))
        return None
def check_convert(in_filename, in_format, out_format, alphabet=None):
    """Check AlignIO.convert agrees with an explicit parse followed by write.

    The reference output comes from parse + write; ``AlignIO.convert`` is
    then run twice, once with the input given as a file name and once as an
    open handle, and both the count and the text produced must match the
    reference.  A ``ValueError`` from any writer counts as zero alignments.
    """
    # Reference: parse everything, then write it.
    reference = StringIO()
    aligns = list(AlignIO.parse(in_filename, in_format, None, alphabet))
    try:
        count = AlignIO.write(aligns, reference, out_format)
    except ValueError:
        count = 0

    # convert() with a file name as input and a handle as output.
    from_name = StringIO()
    try:
        count2 = AlignIO.convert(in_filename, in_format, from_name, out_format, alphabet)
    except ValueError:
        count2 = 0
    assert count == count2
    assert reference.getvalue() == from_name.getvalue()

    # convert() with handles for both input and output.
    from_handle = StringIO()
    try:
        with open(in_filename) as handle1:
            count2 = AlignIO.convert(handle1, in_format, from_handle, out_format, alphabet)
    except ValueError:
        count2 = 0
    assert count == count2
    assert reference.getvalue() == from_handle.getvalue()
def split_family_seqs():
    """Split the Rfam seed file into one FASTA and one metadata pickle per family.

    The seed file is read through a Stockholm parser.  Before each alignment
    is parsed, the ``#`` annotation lines ahead of it are scanned directly
    from the same handle (which is then rewound) and collected into a dict
    keyed by the two characters at positions 5-6 of the line, with the text
    from position 8 onwards appended as the value.  The alignment is written
    to ``<alis_dir>/<AC>.fa`` and the dict pickled to
    ``<meta_dir>/<AC>.pickle``, where ``<AC>`` is the stripped ``AC``
    annotation.
    """
    alis_dir = cfg.dataPath('rfam/family_alis/')
    meta_dir = cfg.dataPath('rfam/family_metas/')
    with open(cfg.dataPath('rfam/Rfam.seed')) as fopen:
        alis = aio.parse(fopen, 'stockholm')
        while True:
            infos = {}
            start = fopen.tell()
            # Peek at the annotation block of the next entry.
            while True:
                l = fopen.readline()
                if l == '':
                    break
                if l[0] == '#':
                    ukey = str(l[5:7])
                    infos[ukey] = infos.get(ukey, '') + l[8:]
                elif l.strip() != '':
                    # First non-blank, non-annotation line ends the block.
                    break
            # Rewind so the parser sees the entry from its beginning.
            fopen.seek(start)
            # A default makes the end of the file reach the break below;
            # .next() raised StopIteration there instead.
            ali = next(alis, None)
            if not ali:
                break
            rfname = infos['AC'].strip()
            with open(os.path.join(alis_dir, rfname + '.fa'), 'w') as alifile:
                aio.write(ali, alifile, 'fasta')
            # Pickles are binary data.
            with open(os.path.join(meta_dir, rfname + '.pickle'), 'wb') as metafile:
                pickle.dump(infos, metafile)
def _iterate_via_AlignIO(handle, format, alphabet):
    """Yield each record of each alignment parsed from ``handle`` (PRIVATE)."""
    from Bio import AlignIO

    alignments = AlignIO.parse(handle, format, alphabet=alphabet)
    for alignment in alignments:
        for rec in alignment:
            yield rec
def test_water_file3(self):
    """water with the asis trick and GenBank file, output to a file."""
    # Setup,
    query = "TGTTGTAATGTTTTAATGTTTCTTCTCCCTTTAGATGTACTACGTTTGGA"
    out_file = "Emboss/temp_test3.water"
    in_file = "GenBank/cor6_6.gb"
    # assertTrue replaces the assert_ alias, which newer unittest versions
    # no longer provide.
    self.assertTrue(os.path.isfile(in_file))
    if os.path.isfile(out_file):
        os.remove(out_file)
    cline = WaterCommandline(cmd=exes["water"])
    cline.set_parameter("asequence", "asis:%s" % query)
    cline.set_parameter("bsequence", in_file)
    # TODO - Tell water this is a GenBank file!
    cline.set_parameter("gapopen", "1")
    cline.set_parameter("gapextend", "0.5")
    cline.set_parameter("outfile", out_file)
    self.assertEqual(str(eval(repr(cline))), str(cline))
    # Run the tool,
    result, out, err = generic_run(cline)
    # Check it worked,
    errors = err.read().strip()
    self.assertTrue(errors.startswith("Smith-Waterman local alignment"), errors)
    self.assertEqual(out.read().strip(), "")
    if result.return_code != 0:
        sys.stderr.write("\n%s\n" % cline)
    self.assertEqual(result.return_code, 0)
    self.assertEqual(result.get_result("outfile"), out_file)
    self.assertTrue(os.path.isfile(out_file))
    # Check we can parse the output and it is sensible...
    # Both files are closed before the output file is removed below.
    with open(in_file) as seq_handle:
        with open(out_file) as align_handle:
            self.pairwise_alignment_check(query,
                                          SeqIO.parse(seq_handle, "genbank"),
                                          AlignIO.parse(align_handle, "emboss"),
                                          local=True)
    # Clean up,
    os.remove(out_file)
def test_water_file2(self):
    """water with the asis trick and nucleotide FASTA file, output to a file."""
    # Setup,
    query = "ACACACTCACACACACTTGGTCAGAGATGCTGTGCTTCTTGGAAGCAAGGNCTCAAAGGCAAGGTGCACGCAGAGGGACGTTTGAGTCTGGGATGAAGCATGTNCGTATTATTTATATGATGGAATTTCACGTTTTTATG"
    out_file = "Emboss/temp_test2.water"
    in_file = "Fasta/f002"
    # assertTrue replaces the assert_ alias, which newer unittest versions
    # no longer provide.
    self.assertTrue(os.path.isfile(in_file))
    if os.path.isfile(out_file):
        os.remove(out_file)
    cline = WaterCommandline(cmd=exes["water"])
    cline.set_parameter("-asequence", "asis:%s" % query)
    cline.set_parameter("-bsequence", in_file)
    cline.set_parameter("-gapopen", "10")
    cline.set_parameter("-gapextend", "0.5")
    cline.set_parameter("-outfile", out_file)
    self.assertEqual(str(eval(repr(cline))), str(cline))
    # Run the tool,
    result, out, err = generic_run(cline)
    # Check it worked,
    errors = err.read().strip()
    self.assertTrue(errors.startswith("Smith-Waterman local alignment"), errors)
    self.assertEqual(out.read().strip(), "")
    if result.return_code != 0:
        sys.stderr.write("\n%s\n" % cline)
    self.assertEqual(result.return_code, 0)
    self.assertEqual(result.get_result("outfile"), out_file)
    self.assertTrue(os.path.isfile(out_file))
    # Check we can parse the output and it is sensible...
    # Both files are closed before the output file is removed below.
    with open(in_file) as seq_handle:
        with open(out_file) as align_handle:
            self.pairwise_alignment_check(query,
                                          SeqIO.parse(seq_handle, "fasta"),
                                          AlignIO.parse(align_handle, "emboss"),
                                          local=True)
    # Clean up,
    os.remove(out_file)
def __init__(self, file_name=None, data = None, format='fasta'):
    """Create the alignment from a file, from in-memory text, or empty.

    Args:
        file_name: Path of an alignment file; takes precedence over ``data``.
        data: Text of a single alignment in ``format``.
        format: Format name understood by ``AlignIO.read``.

    With neither ``file_name`` nor ``data`` the alignment starts empty.
    """
    if file_name:
        super(Alignment, self).__init__(AlignIO.read(file_name, format))
    elif data:
        # AlignIO.read yields one alignment (an iterable of records), the
        # same kind of object the file branch passes on.  AlignIO.parse
        # returned an iterator of whole alignments instead.
        # NOTE(review): assumes the base class takes an iterable of records,
        # as the other two branches suggest - confirm.
        super(Alignment, self).__init__(AlignIO.read(StringIO(data), format))
    else:
        super(Alignment, self).__init__([])
def test_water_file4(self):
    """water with the asis trick and SwissProt file, output to a file."""
    # Inputs and the scratch output file.
    query = "DVCTGKALCDPVTQNIKTYPVKIENLRVMI"
    out_file = "Emboss/temp_test4.water"
    in_file = "SwissProt/sp004"
    self.assertTrue(os.path.isfile(in_file))
    if os.path.isfile(out_file):
        os.remove(out_file)

    # Build the command line one parameter at a time.
    cline = WaterCommandline(cmd=exes["water"])
    settings = (
        ("-asequence", "asis:%s" % query),
        ("-bsequence", in_file),
        # EMBOSS should work this out, but let's be explicit:
        ("-sprotein", True),
        # TODO - Tell water this is a SwissProt file!
        ("-gapopen", "20"),
        ("-gapextend", "5"),
        ("-outfile", out_file),
    )
    for option, value in settings:
        cline.set_parameter(option, value)
    self.assertEqual(str(eval(repr(cline))), str(cline))

    # Run water, then check its output parses and is sensible.
    self.run_water(cline)
    targets = SeqIO.parse(in_file, "swiss")
    alignments = AlignIO.parse(out_file, "emboss")
    self.pairwise_alignment_check(query, targets, alignments, local=True)

    # Remove the scratch file.
    os.remove(out_file)
def taxit_create(taxit_executable_loc, aln_fasta, hmm_file, tree_file, tree_stats, pfam_acc, output_location, aln_stockholm):
    '''Calls ``taxit create`` and then converts the FASTA alignment to Stockholm.

    The command is run through the shell; ``subprocess.check_call`` raises
    ``CalledProcessError`` when taxit exits with a non-zero status.
    Afterwards every alignment in ``aln_fasta`` is rewritten in Stockholm
    format to ``aln_stockholm``.
    '''
    #taxit create --clobber --aln-fasta ./PF14424.dedup.fasta --profile ./PF14424.wholefam.hmm --tree-file ./PF14424.dedup.nh --locus PF14424 --package-name PF14424.pplacer
    # NOTE(review): the command is a shell string assembled from the
    # arguments, so paths with spaces or shell metacharacters will break it.
    cmd = taxit_executable_loc \
        + " create --clobber" \
        + " --aln-fasta " + aln_fasta \
        + " --profile " + hmm_file \
        + " --tree-file " + tree_file \
        + " --tree-stats " + tree_stats \
        + " --locus " + pfam_acc \
        + " --package-name " + output_location
    subprocess.check_call(cmd, shell=True)

    # Context managers close both files even when the conversion fails.
    with open(aln_fasta, "r") as input_handle:
        with open(aln_stockholm, "w") as output_handle:
            alignments = AlignIO.parse(input_handle, "fasta")
            AlignIO.write(alignments, output_handle, "stockholm")
def emboss_piped_AlignIO_convert(alignments, old_format, new_format):
    """Run seqret, returns alignments (as a list).

    ``alignments`` are written in ``old_format`` to seqret's stdin and its
    stdout is parsed as ``new_format``.  All three pipes are closed and the
    child process is waited for on every path, including when writing or
    parsing raises (the exception is re-raised).
    """
    # Setup, this assumes for all the format names used
    # Biopython and EMBOSS names are consistent!
    cline = SeqretCommandline(exes["seqret"],
                              sformat=old_format,
                              osformat=new_format,
                              auto=True,  # no prompting
                              filter=True)
    # Run the tool,
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    try:
        try:
            AlignIO.write(alignments, child.stdin, old_format)
        finally:
            # stdin and stderr are closed before stdout is read, as before.
            child.stdin.close()
            child.stderr.close()
        # Materialise the result so the pipe can be closed before returning.
        return list(AlignIO.parse(child.stdout, new_format))
    finally:
        child.stdout.close()
        # Reap the child so it does not linger as a zombie process.
        child.wait()
def combine_fastas(self, leaf_node_filename, internl_node_filename, output_file):
    """Concatenate leaf and internal-node FASTA alignments into one file.

    Leaf-node alignments are copied unchanged.  Internal-node records get
    ``self.internal_node_prefix`` prepended to their id and their
    description cleared, and are then written as a single alignment.
    """
    with open(output_file, 'w') as output_handle:
        # Leaf nodes are written exactly as they were read.
        with open(leaf_node_filename, 'r') as leaf_handle:
            leaf_alignments = AlignIO.parse(leaf_handle, "fasta")
            AlignIO.write(leaf_alignments, output_handle, "fasta")

        # Internal nodes are relabelled and pooled into one alignment.
        with open(internl_node_filename, 'r') as internal_handle:
            relabelled = []
            for alignment in AlignIO.parse(internal_handle, "fasta"):
                for record in alignment:
                    record.id = self.internal_node_prefix + str(record.id)
                    record.description = ''
                    relabelled.append(record)
            AlignIO.write(MultipleSeqAlignment(relabelled), output_handle, "fasta")
def trim_sequences(filepath: str) -> List:
    """Collect the truthy ``percent_id_calc`` results for an MSF file.

    The first alignment in the file is skipped.  For each remaining
    alignment its first record is compared with its second record, labelled
    with the first seven characters of the first record's id plus a running
    index.
    """
    alignments = list(AlignIO.parse(filepath, "msf"))
    kept = []
    for num, rec in enumerate(alignments[1:]):
        canonical_ = rec[0]
        label = f"{canonical_.id[:7]}_{num}"
        outcome = percent_id_calc(canonical_, rec[1], label, len(canonical_.seq))
        if outcome:
            kept.append(outcome)
    return kept
def get_alignment_sequences_amount(input_path):
    """Count the items the parser yields for ``input_path``.

    FASTA input is read with ``SeqIO.parse`` (one item per sequence); any
    other format is read with ``AlignIO.parse`` (one item per alignment).
    Counting stops at the first error raised while reading, and the number
    of items read up to that point is returned.

    Args:
        input_path: Path of the file; its format comes from ``get_format``.

    Returns:
        int: Number of items read.
    """
    input_format = get_format(input_path)
    parser = SeqIO.parse if input_format == "fasta" else AlignIO.parse
    seq_num = 0
    # Context manager so the file is closed once counting is done.
    with open(input_path) as handle:
        records = parser(handle, input_format)
        try:
            for _ in records:
                seq_num += 1
        except Exception:
            # Best effort, as before: a malformed tail ends the count.
            # Unlike the former bare ``except``, KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            pass
    return seq_num
def seq_trimmer(self):
    """Fill ``self.result_record`` with the highly identical sequence hits.

    Reads ``water.fasta`` from ``self.out_dir`` as MSF, skips the first
    alignment, and for each remaining alignment runs
    ``IdenticalSequencesParser`` on its first two records with
    ``self.id_score``.  Truthy results are appended to ``self.result_record``.
    """
    source = self.out_dir / "water.fasta"
    needle_record = list(AlignIO.parse(source, "msf"))
    self.result_record = []
    for pair in needle_record[1:]:
        parser = IdenticalSequencesParser(pair[0], pair[1], self.id_score)
        outcome = parser.highly_identical_seqs()
        if outcome:
            self.result_record.append(outcome)
def are_sequence_names_unique(self):
    """Return True when no record name occurs twice in the input FASTA file."""
    with open(self.input_filename) as input_handle:
        # Read every name first so the whole file is parsed, as before.
        sequence_names = [
            record.name
            for alignment in AlignIO.parse(input_handle, "fasta")
            for record in alignment
        ]
    # A repeated name makes the set smaller than the list.
    return len(set(sequence_names)) == len(sequence_names)
def main():
    """Write copies of the nexus alignments restricted to the selected taxa.

    The taxa to keep come from ``get_samples_to_run`` applied to all taxon
    names found in the input files.  For every input file a new alignment
    holding only those taxa is written, with the same base file name, into
    ``args.output``.  The index of each processed file is printed as a
    progress marker.
    """
    args = get_args()
    nexus_files = get_files(args.input)
    taxa = get_all_taxon_names(nexus_files)
    taxa_to_keep = get_samples_to_run(args, taxa)
    for count, align_file in enumerate(nexus_files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(align_file, "nexus"):
            for taxon in align:
                if taxon.name in taxa_to_keep:
                    new_align.add_sequence(taxon.name, str(taxon.seq))
        outf = os.path.join(args.output, os.path.basename(align_file))
        # Context manager so the output is flushed and closed per file.
        with open(outf, 'w') as out_handle:
            AlignIO.write(new_align, out_handle, 'nexus')
        print(count)
def maf(in_maf, out_paf):
    """Write one tab-separated line per MAF block to ``out_paf``.

    For every alignment block the first sequence is treated as the target
    and the second as the query.  The line holds, in order: query name,
    query length, query start, query end, strand, target name, target
    length, target start, target end, number of identical columns, the
    target ``size`` annotation and the constant 255.  Coordinates come from
    the ``start``, ``size``, ``srcSize`` and ``strand`` annotations; for a
    sequence on strand -1 they are mirrored against ``srcSize``.

    Args:
        in_maf: MAF input (path or handle accepted by ``AlignIO.parse``).
        out_paf: Path of the output file.

    Returns:
        True on success, False if any error occurred (the traceback is
        printed).
    """
    maf_f = None
    try:
        with open(out_paf, "w") as paf:
            maf_f = AlignIO.parse(in_maf, "maf")
            for grp in maf_f:
                seqs = list(grp)
                # A block with fewer than two rows raises IndexError here,
                # which is reported as a failure like any other error.
                matches = sum(1 for a, b in zip(seqs[0].seq, seqs[1].seq) if a == b)
                tannots = seqs[0].annotations
                qannots = seqs[1].annotations

                tlen = tannots["srcSize"]
                tstart = tannots["start"]
                tend = tstart + tannots["size"]
                if tannots["strand"] == -1:
                    tstart = tlen - tstart
                    tend = tlen - tend

                qlen = qannots["srcSize"]
                qstart = qannots["start"]
                qend = qannots["start"] + qannots["size"]
                if qannots["strand"] == -1:
                    qstart = qlen - qstart
                    qend = qlen - qend

                strand = "+" if tannots["strand"] == qannots["strand"] else "-"
                paf.write(
                    "{qname}\t{qlen}\t{qstart}\t{qend}\t{strand}\t{tname}\t{tlen}\t{tstart}\t{tend}\t{matches}\t"
                    "{block_len}\t255\n".format(
                        tname=seqs[0].id, tlen=tlen, tstart=tstart, tend=tend,
                        qname=seqs[1].id, qlen=qlen,
                        qstart=qstart if strand == "+" else qend,
                        qend=qend if strand == "+" else qstart,
                        strand=strand, matches=matches,
                        block_len=tannots["size"]))
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit
        # still propagate.
        traceback.print_exc()
        return False
    else:
        return True
    finally:
        if maf_f is not None:
            maf_f.close()
def convert_nexus_to_format(dataset_as_nexus, dataset_format):
    """
    Converts nexus-formatted text to another alignment format using Biopython.

    The text is parsed from memory, written in ``dataset_format`` to a file
    named by ``make_random_filename()``, and the result of
    ``read_and_delete_tmp_file`` for that file is returned.

    :param dataset_as_nexus: alignment text in nexus format
    :param dataset_format: output format name passed to ``AlignIO.write``
    :return: whatever ``read_and_delete_tmp_file`` returns for the temp file
    """
    parsed = AlignIO.parse(StringIO(dataset_as_nexus), 'nexus')
    tmp_file = make_random_filename()
    AlignIO.write(parsed, tmp_file, dataset_format)
    return read_and_delete_tmp_file(tmp_file)
def parse_alignment(self, f):
    """Load the first Clustal alignment from ``f`` and cache its statistics.

    Sets ``length``, ``hydropathies``, ``amphipathicities``, ``similarities``
    and ``tmcenters`` on ``self`` from that alignment and returns it.  Any
    further alignments in the file are ignored; ``None`` is returned when
    the file holds no alignment.
    """
    # NOTE(review): read as "return inside the loop", i.e. only the first
    # alignment is used - confirm against the original layout.
    msa = next(AlignIO.parse(f, 'clustal'), None)
    if msa is None:
        return None
    self.length = msa.get_alignment_length()
    self.hydropathies = get_average_hydropathies(msa, window=self.window, kernel=self.kernel)
    self.amphipathicities = get_average_amphipathicities(msa, window=self.window)
    self.similarities = get_similarities(msa)
    self.tmcenters = get_tmcenters(msa)
    #what's the worst that can happen???
    return msa
def load_alignments(alignmentfiles, format):
    """Parse every alignment from every file into one flat list.

    A ``ValueError`` raised while parsing a file is logged with the file
    name and re-raised.
    """
    alignments = []
    for path in alignmentfiles:
        try:
            parsed = AlignIO.parse(path, format=format)
            for alignment in parsed:
                logger.debug("loaded alignment of length {} from {}".format(len(alignment), path))
                alignments.append(alignment)
        except ValueError as e:
            logger.error("Cannot parse input file {}: {}".format(path, e))
            raise
    logger.info("Successfully loaded {} alignments from {} input files".format(len(alignments), len(alignmentfiles)))
    return alignments
def usePhyMLForBranchLengths(alignmentFasta, newicktree):
    """Run interactive PhyML to put branch lengths on a given tree.

    The FASTA alignment is converted to PHYLIP, PhyML is driven through its
    prompts with pexpect, and the process is given 60 seconds to finish.

    Args:
        alignmentFasta: Alignment input handed to ``getInputTempFile``.
        newicktree: Tree input handed to ``getInputTempFile``.

    Returns:
        ``[finished, tree_text]``: ``finished`` is False when PhyML was still
        running after 60 seconds (``tree_text`` is then None); otherwise
        ``tree_text`` is the content of PhyML's ``_phyml_tree.txt`` output.
    """
    # converts the alignment file in FASTA to PHYLIP format
    alnFastaInFH = getInputTempFile(alignmentFasta)
    alnPhylipOutFH = getOutputTempFile()
    with open(alnFastaInFH.name, "r") as input_handle:
        with open(alnPhylipOutFH.name, "w") as output_handle:
            alignments = AlignIO.parse(input_handle, "fasta")
            AlignIO.write(alignments, output_handle, "phylip")
    treeInFH = getInputTempFile(newicktree)

    # removes the output files to be created if they already exist
    # (os.remove instead of a shell "rm": no quoting issues, no extra process)
    stats_path = alnPhylipOutFH.name + "_phyml_stats.txt"
    tree_path = alnPhylipOutFH.name + "_phyml_tree.txt"
    for stale_path in (stats_path, tree_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    # spawns a process and executes all the correct input keys when prompted
    child = pexpect.spawn("phyml")
    child.expect(". Enter the sequence file name >")
    child.sendline(alnPhylipOutFH.name)
    for key in ("D", "+", "+", "O", "U", "Y"):
        child.sendline(key)
    child.expect(". Enter the name of the input tree file >")
    child.sendline(treeInFH.name)

    # checks for how long the calculation has been going for, if it has
    # stalled, then the process will be halted
    startTime = time.time()
    B = True
    while child.isalive():
        if time.time() - startTime > 60.0:
            B = False
            # Actually halt the stalled run; it used to be left running.
            child.close(force=True)
            break
        # Poll at intervals instead of spinning a CPU core.
        time.sleep(0.1)

    # returns the branch lengthed tree if the process was successful
    Ret = None
    if B:
        with open(tree_path, "r") as tree_file:
            Ret = tree_file.read()
    return [B, Ret]
def convert_fasta_to_phylip(input_path, output_path, blank=False):
    """Convert a FASTA alignment file to relaxed PHYLIP.

    Args:
        input_path: Path of the FASTA alignment.
        output_path: Path the relaxed-PHYLIP alignment is written to.
        blank: When true, sequence names are first rewritten with
            ``convert_sequences_names`` into a ``.translated_fasta`` file
            (with a ``.names_map`` translator file next to it), and that
            intermediate file is converted instead of ``input_path``.

    Returns:
        Always 0.
    """
    source_path = input_path
    if blank:
        intermediate_path = input_path.replace(".fas", ".translated_fasta")
        names_translator_path = input_path.replace(".fas", ".names_map")
        convert_sequences_names(input_path, intermediate_path,
                                names_translator_path, src="FastML")
        source_path = intermediate_path

    # plain "r": the "U" flag is rejected by Python 3.11+; the context
    # managers close both files even if parsing or writing fails
    with open(source_path, "r") as input_handle:
        with open(output_path, "w") as output_handle:
            alignments = AlignIO.parse(input_handle, "fasta")
            AlignIO.write(alignments, output_handle, "phylip-relaxed")
    return 0
def runPhyML(aln, phymlOpt, geneDir):
    """
    Function converting fasta file to phylip and running PhyML.

    The work happens inside geneDir; the previous working directory is
    restored even when the conversion or PhyML fails.

    @param1 aln: Path to the FASTA alignment
    @param2 phymlOpt: PhyML option string; the text after "ALN " is appended
                      to the phyml command line. An empty string selects the
                      default options.
    @param3 geneDir: Gene directory
    @return outPhy: geneDir concatenated with the PHYLIP file name
    """
    logger = logging.getLogger("main.tree")
    baseName = aln.split("/")[-1].split(".")[0]
    outPhy = baseName + ".phylip"
    tmp = baseName + ".tmp"
    defaultCmd = "phyml -i {:s} -v e -b -2".format(outPhy)

    origin = os.getcwd()
    os.chdir(geneDir)
    try:
        # convert to Phylip format and replace eventual "!" symbols
        # (relic from using MACSE)
        with open(aln, "r") as aln2:
            laln = aln2.read().replace("!", "N")
        with open(tmp, "w") as temp:
            temp.write(laln)
        try:
            with open(tmp, "r") as input_handle:
                with open(outPhy, "w") as output_handle:
                    alignments = AlignIO.parse(input_handle, "fasta")
                    AlignIO.write(alignments, output_handle, "phylip-relaxed")
        finally:
            # the intermediate file is never needed after the conversion
            os.remove(tmp)

        # PhyML
        if phymlOpt != "":
            try:
                opt = phymlOpt.split("ALN ")[1]
                logger.debug("phyml -i {:s} {}".format(outPhy, opt))
                cmd("phyml -i {:s} {}".format(outPhy, opt), False)
            except Exception:
                # Exception (not a bare except) so Ctrl-C and sys.exit
                # are not silently turned into a second PhyML run
                logger.info("PhyML couldn't run with the provided info {}, running with default options.".format(phymlOpt))
                cmd(defaultCmd, False)
        else:
            logger.debug(defaultCmd)
            cmd(defaultCmd, False)
    finally:
        os.chdir(origin)

    return geneDir + outPhy
def hssp3_file_to_phylip(hssp3_file_name, phylip_file_name, chain_id,
                         master_sequence):
    """Reads a HSSP file in stockholm format and writes a new msa file in
    phylip-sequential format only containing the given chain.

    An alignment is selected when the name of its first record has '/' at
    index 4 and the upper-cased character at index 5 equals ``chain_id``.
    That first record is relabelled 'MASTER' before the alignment is written.
    ``master_sequence`` is accepted but not used here.
    """
    # All alignments are parsed up front, before anything is written.
    for msa in list(AlignIO.parse(hssp3_file_name, format='stockholm')):
        master = msa[0]
        if master.name[4] != '/':
            continue
        if master.name[5].upper() != chain_id:
            continue
        master.id = master.name = master.description = 'MASTER'
        AlignIO.write(msa, phylip_file_name, format='phylip-sequential')
def check_EMBOSS_to_AlignIO(self, filename, old_format, skip_formats=()):
    """Check AlignIO can read seqret's conversion of the file.

    The file is parsed in ``old_format``, converted with ``emboss_convert``
    into each target format not listed in ``skip_formats``, re-parsed, and
    compared against the original alignments. "fasta" and "nexus" are only
    tried when the file holds exactly one alignment.

    Raises ValueError when a converted file cannot be parsed or when the
    comparison reports a disagreement.
    """
    self.assertTrue(os.path.isfile(filename), filename)
    reference = list(AlignIO.parse(filename, old_format))

    targets = ["clustal", "phylip", "ig", "msf"]
    if len(reference) == 1:
        targets += ["fasta", "nexus"]

    for target in targets:
        if target in skip_formats:
            continue
        converted = emboss_convert(filename, old_format, target)
        try:
            round_tripped = list(AlignIO.parse(converted, target))
        except Exception:  # TODO - Which exceptions?
            raise ValueError("Can't parse %s file %s in %s format."
                             % (old_format, filename, target))
        finally:
            converted.close()
        try:
            self.assertTrue(compare_alignments(reference, round_tripped))
        except ValueError as err:
            raise ValueError("Disagree on %s file %s in %s format: %s"
                             % (old_format, filename, target, err))
def parse_output(file, output):
    """Parses seq-gen output (phylip) into fasta file(s)

    ``output`` is the phylip-sequential text; ``file`` is the FASTA file name.
    When the text holds several alignments, each one is written to
    ``<index>.<file>``; otherwise the single alignment goes to ``file``.
    """
    assert file
    assert output
    msas = list(AlignIO.parse(StringIO.StringIO(output), 'phylip-sequential'))
    if len(msas) <= 1:
        AlignIO.write(msas[0], file, 'fasta')
        return
    for index, msa in enumerate(msas):
        AlignIO.write(msa, '%i.%s' % (index, file), 'fasta')
def pairwisealign(seq1, seq2, **kwargs):
    """
    Globally align two sequences.

    :param seq1: Sequence 1
    :type seq1: str
    :param seq2: Sequence 2
    :type seq2: str
    :param AA: True if protein sequences, false otherwise.
    :param gapopen: The cost for opening a gap.
    :param gapextend: The cost for extending a gap.
    :returns: Sequence 1 aligned, sequence 2 aligned
    :rtype: tuple
    :raises SystemError: when the stretcher command exits with a non-zero
        status.
    """
    if kwargs['AA']:
        flag1 = '-sprotein1'
        flag2 = '-sprotein2'
        matrix = 'EBLOSUM62'
    else:
        flag1 = '-snucleotide1'
        flag2 = '-snucleotide2'
        matrix = 'EDNAFULL'

    outfile = tempfile.NamedTemporaryFile()
    try:
        # NOTE(review): the sequences are interpolated into a shell command
        # line; callers must not pass untrusted text here.
        callstr = ('stretcher -outfile=%s -asequence=asis:%s '
                   '-bsequence=asis:%s -gapopen=%s -gapextend=%s '
                   '-aformat fasta -datafile %s %s %s'
                   % (outfile.name, seq1, seq2, kwargs['gapopen'],
                      kwargs['gapextend'], matrix, flag1, flag2))
        status, output = getstatusoutput(callstr)
        if status != 0:
            print(output)
            print("There was an error in pairwisealign. "
                  "Is stretcher installed? See above output and check out "
                  "%s" % outfile.name)
            print(callstr)
            raise SystemError
        # builtin next() works on Python 2.6+ and Python 3, unlike .next()
        alignmentobj = next(AlignIO.parse(outfile, 'fasta'))
    finally:
        # closing a NamedTemporaryFile also deletes it, so no separate
        # "rm" call is needed
        outfile.close()
    return alignmentobj[0].seq._data, alignmentobj[1].seq._data
def remove_similars(ident):
    """Remove sequences with desired % identity

    Reads the multi-sequence FASTA file ``../input.fasta`` and writes the
    surviving records to ``../output.fasta`` (via ``input2.fasta`` in the
    current directory).

    A record is dropped when its sequence is 10 residues or shorter, or when
    its needle alignment against an earlier surviving record reaches at
    least ``ident`` percent identity (matching non-gap columns divided by
    the aligned length). Records are identified by the part of their id
    before the first space.

    :param ident: Identity threshold in percent.
    """
    # Input multi-seq FASTA format
    records = list(SeqIO.parse("../input.fasta", "fasta"))
    sequences = [record.seq for record in records]
    ids = [record.id.split(" ")[0] for record in records]

    removed = set()
    # Every index is visited, including the last one, so that a short final
    # sequence is dropped like any other short sequence.
    for i, (seq_i, id_i) in enumerate(zip(sequences, ids)):
        if len(seq_i) <= 10:
            removed.add(id_i)
            continue
        if id_i in removed:
            continue
        for j in range(i + 1, len(sequences)):
            # Short sequences are never aligned, and re-testing an id that
            # is already removed cannot change the result.
            if len(sequences[j]) <= 10 or ids[j] in removed:
                continue
            needle_cline = NeedleCommandline(
                asequence="asis:" + seq_i,
                bsequence="asis:" + sequences[j],
                gapopen=10,
                gapextend=0.5,
                outfile='stdout')
            stdout, _ = needle_cline()
            for needle_records in AlignIO.parse(StringIO(stdout), "emboss"):
                query = needle_records[0].seq
                subject = needle_records[1].seq
                # identical columns, ignoring gap-against-gap
                matches = sum(1 for h, k in zip(query, subject)
                              if h == k and h != '-')
                similarity = (float(matches) / len(query)) * 100
                if similarity >= ident:
                    removed.add(ids[j])

    kept = (r for r in records if r.id.split(" ")[0] not in removed)
    SeqIO.write(kept, "input2.fasta", "fasta")
    os.rename("input2.fasta", "../output.fasta")
def main():
    """Convert an alignment file from one format to another.

    Expects exactly four command-line arguments: input file, output file,
    input format and output format. With any other argument count the
    process exits with the usage string.
    """
    import sys
    from Bio import AlignIO

    argv = sys.argv
    if len(argv) != 5:
        sys.exit('python3 %s <in.mfa> <out.mfa> <inFormat> <outFormat>' % (argv[0]))

    in_path, out_path, in_fmt, out_fmt = argv[1:]
    AlignIO.write(AlignIO.parse(in_path, in_fmt), out_path, out_fmt)
def align(genes):
    """Align the given genes with MAFFT and return them as a dict.

    The genes are written to a temporary FASTA file with
    ``utils.write_genome``, MAFFT is run on that file, and the first
    alignment in its output is converted with ``utils.records_to_dict``.

    Returns an empty dict when MAFFT's output contains no alignment.
    Raises ``exceptions.BinaryNotFound`` when no MAFFT binary is configured.
    """
    mafft_path = config.get_binary_path('mafft')
    if not mafft_path:
        raise exceptions.BinaryNotFound('MAFFT binary was not found')

    with tempfile.NamedTemporaryFile(mode='w+t', suffix='.fasta') as fasta_file:
        utils.write_genome(genes, fasta_file)
        # make sure MAFFT sees the full content when it opens the file by name
        fasta_file.flush()
        run_mafft = MafftCommandline(mafft_path, input=fasta_file.name,
                                     quiet=True)
        mafft_stdout, _ = run_mafft()

    first_alignment = next(AlignIO.parse(StringIO(mafft_stdout), "fasta"),
                           None)
    if first_alignment is None:
        return {}
    return utils.records_to_dict(first_alignment)
def parse_many_alignments(infiles, fmt='fasta'):
    """
    Iterate either over multiple files, or over one single file split by '//'

    When ``infiles`` is empty, or is a single '-' or '/dev/stdin' entry,
    standard input is read and every chunk terminated by a line starting
    with '//' is parsed as one alignment. Text left after the last '//'
    (or the whole input when it contains no '//') is parsed as a final
    alignment unless it is blank.

    Otherwise every alignment of every file in ``infiles`` is yielded in
    order.
    """
    if not infiles or (len(infiles) == 1
                       and infiles[0] in ('-', '/dev/stdin')):
        chunk = []
        for line in stdin:
            if line.startswith('//'):
                yield AlignIO.read(StringIO(''.join(chunk)), fmt)
                chunk = []
            else:
                chunk.append(line)
        # Do not lose an alignment that is not followed by a '//' line.
        trailing = ''.join(chunk)
        if trailing.strip():
            yield AlignIO.read(StringIO(trailing), fmt)
    else:
        for infile in infiles:
            yield from AlignIO.parse(infile, fmt)
def main(args=None):
    """Convert a stockholm file to FASTA on standard output.

    ``args`` is the argument list to parse; when it is None the process
    arguments (without the program name) are used. The input file is closed
    afterwards unless it is standard input. Returns 0.
    """
    cli_args = argv[1:] if args is None else args

    parser = ArgumentParser(description='convert stockholm file to FASTA')
    parser.add_argument('STOCKHOLMFILE', type=FileType('r'))
    source = parser.parse_args(cli_args).STOCKHOLMFILE

    AlignIO.write(AlignIO.parse(source, 'stockholm'), stdout, 'fasta')

    if source != stdin:
        source.close()
    return 0
def convertFasta2Phylip(instring, outstring):
    """
    Convert a FASTA alignment file to basic PHYLIP.

    :param instring: path of the input FASTA file
    :param outstring: path of the PHYLIP file to write
    :return: None
    """
    # plain "r": the "U" flag is rejected by Python 3.11+; the context
    # managers close both files even if parsing or writing fails
    with open(instring, "r") as input_handle:
        with open(outstring, "w") as output_handle:
            alignments = AlignIO.parse(input_handle, "fasta")
            AlignIO.write(alignments, output_handle, "phylip")
def _createObjectBasedOnFile(self, filePath):
    """Load a file into Biopython objects when its format is recognised.

    Returns ``filePath`` unchanged when Biopython is unavailable or when the
    detected format is neither a sequence-record format nor an MSA format.
    Otherwise returns the list produced by ``SeqIO.parse`` or
    ``AlignIO.parse`` and also stores it in ``self.createdFiles[filePath]``.
    """
    if not _BioAvailable:
        return filePath

    # plain 'r': the 'U' flag is rejected by Python 3.11+; the context
    # manager closes the file even when parsing raises
    with open(filePath, 'r') as handle:
        fileFormatName = u2py_internals.detectFormat(filePath)
        if fileFormatName in Serializer._seqRecordsFileFormats:
            parse = SeqIO.parse
        elif fileFormatName in Serializer._msaFileFormats:
            parse = AlignIO.parse
        else:
            return filePath
        conversionResult = list(parse(handle, fileFormatName))

    self.createdFiles[filePath] = conversionResult
    return conversionResult
def hash_sequences(self):
    """Group the record ids of the input FASTA by the MD5 of their sequence.

    Reads ``self.input_filename`` and returns a ``defaultdict(list)`` that
    maps each MD5 digest (raw bytes of the UTF-8 encoded sequence string) to
    the ids of the records carrying that sequence. When ``self.verbose`` is
    set, one line is printed per record.
    """
    taxa_by_hash = defaultdict(list)
    with open(self.input_filename) as fasta_handle:
        for msa in AlignIO.parse(fasta_handle, "fasta"):
            for rec in msa:
                digest = hashlib.md5(str(rec.seq).encode('utf-8')).digest()
                taxa_by_hash[digest].append(rec.id)
                if self.verbose:
                    print("Sample " + str(rec.id) + " has a hash of " + str(digest))
    return taxa_by_hash
def main():
    """Compute alignment conservation for one bpg family.

    Takes a single command-line argument, a bpg accession such as
    bpg0123456. The first alignment of the family's ``.a2m`` file is scored,
    the per-column result is written to
    ``<accession>.alignmentconservation.csv`` in the family directory, and
    the family's canonical root tree node is updated with the same data.

    Invalid arguments, a missing family directory, a family marked 'bad' or
    an alignment file without any alignment end the program through
    ``OptionParser.error``.
    """
    # parse command line options
    usage = "%prog [options] bpg_accession"
    opt_parser = OptionParser(usage=usage)
    (_, args) = opt_parser.parse_args()
    if len(args) != 1:
        opt_parser.error('Incorrect number of arguments')
    if len(args[0]) < 10 or args[0][0:3] != 'bpg':
        opt_parser.error('Argument must be a bpg accession like bpg0123456')
    bpg_accession = args[0]
    try:
        family_id = int(bpg_accession[3:])
    except ValueError:
        opt_parser.error('Argument must be a bpg accession like bpg0123456')

    family_dir = '/clusterfs/ohana/bpg/pfacts/%s/%s/%s' % (
        bpg_accession[0:4], bpg_accession[0:7], bpg_accession)
    if not os.path.exists(family_dir):
        opt_parser.error('Family %s not found on the filesystem.'
                         % bpg_accession)
    family = Family.objects.get(id=family_id)
    if family.status == 'bad':
        opt_parser.error('Family %s is marked as bad in the database.'
                         % bpg_accession)

    # only the first alignment in the file is used
    alignment_file = os.path.join(family_dir, '%s.a2m' % bpg_accession)
    with open(alignment_file) as f:
        alignment = next(AlignIO.parse(f, "fasta"), None)
    if alignment is None:
        opt_parser.error('Alignment file %s contains no alignment.'
                         % alignment_file)

    alignment_seqs, aligned_column_indices \
        = get_alignment_seqs_and_aligned_column_indices(alignment)
    column_conserved_residue, column_score \
        = get_conservation_info(alignment_seqs, aligned_column_indices)

    outfname = os.path.join(family_dir,
                            bpg_accession + '.alignmentconservation.csv')
    with open(outfname, 'w') as outf:
        outf.write('ColumnIndex,ConservedResidue,Blosum62ConservationScore\n')
        for j in aligned_column_indices:
            outf.write('%d,%s,%f\n' % (j, column_conserved_residue[j],
                                       column_score[j]))

    root = family.canonical_root_node()
    update_tree_node_alignment_conservation(root, aligned_column_indices,
                                            column_conserved_residue,
                                            column_score)
def pseudofam_convert_stockholm_to_fasta(input_file_name, output_file_name=None):
    """Convert a Stockholm alignment file to FASTA.

    :param input_file_name: path of the Stockholm file to read
    :param output_file_name: path of the FASTA file to write; when omitted,
        the input path with a trailing ".sto" removed and ".fasta" appended
    """
    from Bio import AlignIO

    if output_file_name is None:
        # Remove the ".sto" suffix only. str.rstrip(".sto") would strip any
        # trailing run of the characters ".", "s", "t", "o" and mangle names
        # such as "proto.sto".
        base_name = input_file_name
        if base_name.endswith(".sto"):
            base_name = base_name[:-len(".sto")]
        output_file_name = base_name + ".fasta"

    # plain "r": the "U" flag is rejected by Python 3.11+
    with open(input_file_name, "r") as input_handle:
        with open(output_file_name, "w") as output_handle:
            alignments = AlignIO.parse(input_handle, "stockholm")
            AlignIO.write(alignments, output_handle, "fasta")
def get_fraction(position, msa_path, chars=True):
    """Return the fraction of sequences with a character at a column.

    :param position: 1-based column index into the alignment
    :param msa_path: path of the alignment file; its format and sequence
        count come from ``utils.alterAlignments``
    :param chars: when true (default) return the fraction of sequences whose
        column is not "-"; when false return one minus that fraction
    :return: float fraction, computed on the first alignment in the file
    """
    scripts_dir = '/groups/itay_mayrose/halabikeren/python_scripts/'
    # insert once only, so repeated calls do not keep growing sys.path
    if scripts_dir not in sys.path:
        sys.path.insert(0, scripts_dir)
    from utils.alterAlignments import get_format, get_alignment_sequences_amount

    msa_format = get_format(msa_path)
    seq_num = get_alignment_sequences_amount(msa_path)

    with open(msa_path) as msa_handle:
        alignment = next(AlignIO.parse(msa_handle, msa_format))

    chars_counter = sum(
        1 for record in alignment if str(record.seq)[position - 1] != "-")

    # convert before dividing so the result is not truncated to 0 under
    # Python 2 integer division
    chars_fraction = float(chars_counter) / seq_num
    if not chars:
        return float(1 - chars_fraction)
    return chars_fraction
def main():
    """Strip lowercase bases and gaps from every input alignment.

    For each file returned by ``get_files``, every sequence of every
    alignment has the characters a, c, g, t, n and "-" removed. The
    resulting records are written as FASTA to a file of the same name inside
    ``args.output``, and the file's index is printed.
    """
    args = get_args()
    files = get_files(args.input, args.input_format)
    for count, f in enumerate(files):
        new_records = []
        for align in AlignIO.parse(f, args.input_format):
            for oldseq in align:
                # one pass removes both the lowercase bases and the gaps
                new_seq = re.sub("[acgtn-]", "", str(oldseq.seq))
                new_records.append(
                    SeqRecord(Seq(new_seq, generic_dna),
                              id=oldseq.id,
                              name=oldseq.name,
                              description=oldseq.description))
        outf = os.path.join(args.output, os.path.split(f)[1])
        with open(outf, 'w') as out_handle:
            SeqIO.write(new_records, out_handle, 'fasta')
        # single-argument call form prints the same under Python 2 and 3
        print(count)
def main():
    """Convert nexus alignments to another format, optionally renaming.

    Every file returned by ``get_files(options.input)`` is parsed as nexus
    and written to ``options.output`` under the same base name with
    ``options.format`` as extension. With ``options.shorten_name`` set, the
    alignments are first passed through ``rename`` using the two positions
    from ``options.positions`` and ``options.splitchar``. The file's index
    is printed after each file.
    """
    options, _ = interface()
    files = get_files(options.input)
    # NOTE(review): eval() runs arbitrary expressions taken from the command
    # line; if the positions are always plain integers, int() would be the
    # safe replacement. Confirm with callers before changing.
    pos1, pos2 = [eval(i) for i in options.positions.strip().split(',')]
    for count, f in enumerate(files):
        align = AlignIO.parse(f, "nexus")
        new_name = os.path.splitext(
            os.path.split(f)[1])[0] + '.' + options.format
        outf = os.path.join(options.output, new_name)
        if options.shorten_name:
            # NOTE(review): the file is reopened in 'w' mode per item, so
            # with several alignments only the last one is kept.
            for item in rename(align, pos1, pos2, options.splitchar):
                with open(outf, 'w') as out_handle:
                    AlignIO.write(item, out_handle, options.format)
        else:
            with open(outf, 'w') as out_handle:
                AlignIO.write(align, out_handle, options.format)
        # single-argument call form prints the same under Python 2 and 3
        print(count)