def concat(mypath, same_taxa=True):
    '''Combine the nexus data matrices found in a directory into one
    partitioned matrix.

    Hidden entries (names starting with '.') and anything that is not a
    regular file are skipped. Each partition is named after the bare file
    name it was read from.

    Adapted from: http://biopython.org/wiki/Concatenate_nexus

    Args:
        mypath: Directory holding the nexus files.
        same_taxa: When true (the default), ``check_taxa`` is called on the
            parsed matrices before combining. Pass ``same_taxa=False`` if
            you are not concerned by differing taxa.

    Returns:
        The result of ``Nexus.combine``, or None when ``same_taxa`` is true
        and ``check_taxa`` returns a truthy value.
    '''
    nexi = []
    for item in os.listdir(mypath):
        full_path = os.path.join(mypath, item)
        if item.startswith('.') or not os.path.isfile(full_path):
            continue
        # Open through the joined path: the bare name only resolves when
        # the current working directory happens to be ``mypath``.
        # The matrix is parsed inside the constructor, so the handle can be
        # closed as soon as the Nexus object exists.
        with open(full_path, 'r') as nex_open:
            nexi.append((item, Nexus.Nexus(nex_open)))
    if same_taxa and check_taxa(nexi):
        return None
    return Nexus.combine(nexi)
def concatNexusAlignments(processes):
    """Convert pending FASTA alignments to nexus, then concatenate all nexus files.

    FASTA files under ``<base_dir>/fasta`` whose name (up to the first '.')
    has no counterpart under ``<base_dir>/nexus`` are passed to
    ``convertFastaToNexus`` on a thread pool. If ``<base_dir>/WGS.nex``
    already exists it is returned untouched; otherwise every file under
    ``<base_dir>/nexus`` is combined and written there.

    Args:
        processes: Number of worker threads for the conversion step.

    Returns:
        Path of the combined nexus file.
    """
    nexus_dir = '{}/nexus'.format(base_dir)
    fasta_dir = '{}/fasta'.format(base_dir)

    # A set keeps the "already converted?" test O(1) per FASTA file.
    already_done = {x.split('.')[0] for x in os.listdir(nexus_dir)}
    fastas = [
        '{}/{}'.format(fasta_dir, name)
        for name in os.listdir(fasta_dir)
        if name.split('.')[0] not in already_done
    ]

    pool = ThreadPool(processes)
    try:
        # list() drains the lazy imap so every conversion has finished
        # before the nexus directory is listed below.
        list(
            tqdm(pool.imap(convertFastaToNexus, fastas),
                 total=len(fastas),
                 desc='Fastas to Nexus...'))
    finally:
        # Release the worker threads even if a conversion raised.
        pool.close()
        pool.join()

    combined_nexus = '{}/WGS.nex'.format(base_dir)
    if os.path.isfile(combined_nexus):
        return combined_nexus

    nexus = [
        '{}/{}'.format(nexus_dir, name) for name in os.listdir(nexus_dir)
    ]
    nexus = [(filename, Nexus.Nexus(filename)) for filename in nexus]
    combined = Nexus.combine(nexus)
    # Context manager guarantees the output is flushed and closed.
    with open(combined_nexus, 'w') as handle:
        combined.write_nexus_data(filename=handle)
    return combined_nexus
def concat_alignment(files, output):
    """Concatenate nexus alignments and write the result to ``output``.

    Args:
        files: Paths of the nexus files to combine.
        output: Path of the nexus file to write.

    Each partition is labelled with its file path after removing
    ``alignments_dir`` and every '.' and '-' character.
    """
    nexi = []
    for fname in files:
        label = fname.replace(alignments_dir, '').replace(".", "").replace("-", "")
        nexi.append((label, Nexus.Nexus(fname)))
    combined = Nexus.combine(nexi)
    # Close the output deterministically instead of leaking the handle.
    with open(output, 'w') as handle:
        combined.write_nexus_data(filename=handle)
def export_nexus(aln, charset_name):
    """Split ``aln`` by charsets, recombine the pieces and write '<stem>_concat.nex'.

    Args:
        aln: Path of the source nexus alignment.
        charset_name: Forwarded to ``split_nexus_by_charsets``.

    The output stem is ``aln`` without a trailing '.nex' suffix.
    """
    nexus_tuples = [
        (name, Nexus.Nexus(name))
        for name in split_nexus_by_charsets(aln, charset_name)
    ]
    concat = Nexus.combine(nexus_tuples)
    # str.rstrip(".nex") strips any trailing run of the characters
    # '.', 'n', 'e', 'x' (e.g. "x.nex" -> ""), so cut the suffix explicitly.
    suffix = ".nex"
    stem = aln[:-len(suffix)] if aln.endswith(suffix) else aln
    concat.write_nexus_data('%s_concat.nex' % stem)
def main():
    """Concatenate the nexus files matching '*.nex*' under ``args.input``
    and export the combined matrix in PHYLIP format to ``args.output``."""
    args = get_args()
    # print(...) with a single string argument behaves the same on
    # Python 2 and Python 3; the print statement is a syntax error on 3.
    print("Reading files...")
    nexus_files = glob.glob(os.path.join(args.input, '*.nex*'))
    data = [(fname, Nexus.Nexus(fname)) for fname in nexus_files]
    print("Concatenating files...")
    concatenated = Nexus.combine(data)
    print("Writing to phylip...")
    concatenated.export_phylip(args.output)
def concat_nexus_alignment(path: str, output_name: str) -> None:
    """Concatenate every '*.nex' file in ``path`` into ``path/output_name``.

    Args:
        path: Directory that is searched for '*.nex' files and that also
            receives the output file.
        output_name: File name of the combined alignment.

    NOTE: because the output lands in the directory that is globbed, an
    ``output_name`` ending in '.nex' will be picked up as an input by a
    later run.
    """
    outname = path + "/" + output_name
    nex_list = [(nex, Nexus.Nexus(nex)) for nex in glob(path + "/*.nex")]
    concat = Nexus.combine(nex_list)
    # Context manager closes the output instead of leaking the handle.
    with open(outname, "w") as handle:
        concat.write_nexus_data(filename=handle)
    print(f"DONE! File is written as {outname}")
def main():
    """Concatenate the nexus files matching '*.nex*' under ``args.input``
    and export the combined matrix in PHYLIP format to ``args.output``."""
    args = get_args()
    # Single-argument print(...) runs identically on Python 2 and 3.
    print("Reading files...")
    nexus_files = glob.glob(os.path.join(args.input, '*.nex*'))
    data = [(fname, Nexus.Nexus(fname)) for fname in nexus_files]
    print("Concatenating files...")
    concatenated = Nexus.combine(data)
    print("Writing to phylip...")
    concatenated.export_phylip(args.output)
def concatNexAlns():
    """Combine every '.nex' file under ``<base_dir>/nexus/`` into one
    partitioned nexus file.

    Each partition is named after the path of the file it came from. No
    taxa comparison is done here; the matrices are handed straight to
    ``Nexus.combine``.

    Returns:
        Path of the written file,
        ``<base_dir>/concat_aln_species_tree.nex``.
    """
    nexdir = '{}/nexus/'.format(base_dir)
    nexi = []
    for fname in os.listdir(nexdir):
        if fname.endswith('.nex'):
            fpath = os.path.join(nexdir, fname)
            nexi.append((fpath, Nexus.Nexus(fpath)))
    coutname = '{}/concat_aln_species_tree.nex'.format(base_dir)
    combined = Nexus.combine(nexi)
    # Close the output deterministically instead of leaking the handle.
    with open(coutname, 'w') as handle:
        combined.write_nexus_data(filename=handle)
    return coutname
def concattophylip(directory, outdir):
    """Concatenate the nexus files in ``directory`` into a PHYLIP file plus
    a charset file.

    Creates ``<outdir>/phylip`` and writes ``concatdata.phylip`` and
    ``charsets.charsets`` into it.

    Args:
        directory: Directory searched for '*.nex*' files. The process
            working directory is changed to it and left there.
        outdir: Parent directory for the new 'phylip' folder.

    Raises:
        OSError: from ``os.makedirs`` when ``<outdir>/phylip`` already exists.
    """
    print("Making concat and charset files.")
    # Resolve outdir before chdir: a relative outdir would otherwise point
    # somewhere else once the working directory has changed, so the files
    # would not land in the folder created just below.
    outdir = os.path.abspath(outdir)
    os.makedirs(outdir + '/phylip')
    os.chdir(directory)
    file_list = glob.glob('*.nex*')
    nexi = [(fname, Nexus.Nexus(fname)) for fname in file_list]
    combined = Nexus.combine(nexi)
    sets = combined.append_sets()
    concat_file = outdir + '/phylip/concatdata.phylip'
    combined.export_phylip(concat_file)
    charset_file = outdir + '/phylip/charsets.charsets'
    # The with-block closes the file; no explicit close() is needed.
    with open(charset_file, 'w') as outf:
        outf.write(sets)
def fully_partition(metadata, aligns):
    """Concatenate all loci and record each locus' column range.

    Args:
        metadata: Mapping of model -> mapping keyed by locus name. It is
            updated in place: every locus value is replaced by its
            1-based inclusive ``(start, end)`` range in the combined matrix.
        aligns: Directory containing one ``<locus>.nex`` file per locus.

    Returns:
        Tuple ``(combined, metadata)`` where ``combined`` is the result of
        ``Nexus.combine`` and ``metadata`` is the same, mutated, mapping.
    """
    to_combine = []
    start = 1
    for model in metadata:
        # Only existing keys are reassigned below, so iterating the inner
        # mapping while updating it is safe.
        for locus in metadata[model]:
            # The matrix is parsed in the constructor; close the file
            # right away instead of leaking one handle per locus.
            with open(os.path.join(aligns, "{0}.nex".format(locus))) as handle:
                nex = Nexus.Nexus(handle)
            end = start + nex.nchar - 1
            metadata[model][locus] = (start, end)
            to_combine.append((locus, nex))
            start = end + 1
    combined = Nexus.combine(to_combine)
    return combined, metadata
def concatNexAlns(nexDir, outname, same_taxa=True):
    """Combine every '.nex' file in ``nexDir`` into one partitioned nexus file.

    Adapted from https://biopython.org/wiki/Concatenate_nexus

    Args:
        nexDir: Directory searched for files ending in '.nex'.
        outname: Tag inserted into the output file name.
        same_taxa: Accepted for compatibility with the wiki recipe, but
            NOT consulted here: no taxa comparison is performed and the
            matrices are handed straight to ``Nexus.combine``.

    Returns:
        Name of the written file, ``concat_stree_aln_<outname>.nex``
        (relative to the current working directory).
    """
    nexi = []
    for fname in os.listdir(nexDir):
        if fname.endswith('.nex'):
            fpath = os.path.join(nexDir, fname)
            nexi.append((fpath, Nexus.Nexus(fpath)))
    coutname = 'concat_stree_aln_{}.nex'.format(outname)
    combined = Nexus.combine(nexi)
    # Close the output deterministically instead of leaking the handle.
    with open(coutname, 'w') as handle:
        combined.write_nexus_data(filename=handle)
    return coutname
def NexusIterator(handle, seq_count=None):
    """Returns SeqRecord objects from a Nexus file.

    Thus uses the Bio.Nexus module to do the hard work.

    You are expected to call this function via Bio.SeqIO or Bio.AlignIO
    (and not use it directly).

    NOTE - We only expect ONE alignment matrix per Nexus file,
    meaning this iterator will only yield one Alignment.

    Raises:
        ValueError: if ``seq_count`` is given and differs from the number
            of sequences found.
    """
    n = Nexus.Nexus(handle)
    if not n.matrix:
        # No alignment found. End the generator with a plain return:
        # raising StopIteration inside a generator is turned into a
        # RuntimeError under PEP 479 (Python 3.7+).
        return
    alignment = Alignment(n.alphabet)

    # Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
    # The original names and the modified names are kept in these two lists:
    assert len(n.unaltered_taxlabels) == len(n.taxlabels)

    if seq_count and seq_count != len(n.unaltered_taxlabels):
        raise ValueError("Found %i sequences, but seq_count=%i"
                         % (len(n.unaltered_taxlabels), seq_count))

    for old_name, new_name in zip(n.unaltered_taxlabels, n.taxlabels):
        assert new_name.startswith(old_name)
        seq = n.matrix[new_name]  # already a Seq object with the alphabet set
        # ToDo - Can we extract any annotation too?
        # ToDo - Avoid abusing the private _records list
        alignment._records.append(
            SeqRecord(seq, id=new_name, name=old_name, description=""))
    # All done
    yield alignment
def main():
    """Write ``args.samples`` random concatenations of nexus loci.

    For each replicate, ``args.sample_size`` files are drawn from the
    '*.nex*' files under ``args.nexus``, combined, and written to
    ``args.output`` together with a META text file listing the drawn files.
    """
    args = get_args()
    # get filenames in directory and convert to array
    files = numpy.array(glob.glob(os.path.join(args.nexus, '*.nex*')))
    # make sure we have enough
    assert len(files) >= args.sample_size, "Sample size must be <= number(files)"
    print("Running")
    for i in range(args.samples):
        sys.stdout.write('.')
        sys.stdout.flush()
        # Random indices in [0, len(files)); randint's upper bound is
        # exclusive, so this matches the deprecated
        # random_integers(0, len(files) - 1, size). Indices are drawn
        # independently, so a file can be picked more than once.
        sample = numpy.random.randint(0, len(files), args.sample_size)
        # reindex filenames by random selections
        random_files = sorted(files[sample].tolist())
        # concatenate and output
        files_to_combine = [(f, Nexus.Nexus(f)) for f in random_files]
        combined = Nexus.combine(files_to_combine)
        align_name = "random-sample-{}-{}-loci.nex".format(i, args.sample_size)
        meta_name = 'META-random-sample-{}-{}-loci.txt'.format(i, args.sample_size)
        # metadata file: one drawn file name per line
        with open(os.path.join(args.output, meta_name), 'w') as meta:
            meta.write('{}'.format('\n'.join(random_files)))
        # Close each alignment file instead of leaking one handle per replicate.
        with open(os.path.join(args.output, align_name), 'w') as align_handle:
            combined.write_nexus_data(filename=align_handle)
    sys.stdout.write("Done")
def NexusIterator(handle, seq_count=None):
    """Returns SeqRecord objects from a Nexus file.

    Thus uses the Bio.Nexus module to do the hard work.

    You are expected to call this function via Bio.SeqIO or Bio.AlignIO
    (and not use it directly).

    NOTE - We only expect ONE alignment matrix per Nexus file,
    meaning this iterator will only yield one MultipleSeqAlignment.

    Raises:
        ValueError: if ``seq_count`` is given and differs from the number
            of sequences found.
    """
    n = Nexus.Nexus(handle)
    if not n.matrix:
        # No alignment found. A plain return ends the generator; an
        # explicit ``raise StopIteration`` becomes a RuntimeError under
        # PEP 479 (Python 3.7+).
        return

    # Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
    # The original names and the modified names are kept in these two lists:
    assert len(n.unaltered_taxlabels) == len(n.taxlabels)

    if seq_count and seq_count != len(n.unaltered_taxlabels):
        raise ValueError("Found %i sequences, but seq_count=%i"
                         % (len(n.unaltered_taxlabels), seq_count))

    # TODO - Can we extract any annotation too?
    records = (
        SeqRecord(n.matrix[new_name], id=new_name, name=old_name, description="")
        for old_name, new_name in zip(n.unaltered_taxlabels, n.taxlabels))
    # All done
    yield MultipleSeqAlignment(records, n.alphabet)
def parse(handle):
    """Yield the trees of a Nexus file as Newick.Tree objects.

    The trees are extracted with the old Nexus.Trees parser and each node
    hierarchy is rebuilt as Newick.Clade objects, so the Nexus module does
    not have to be modified yet. (Perhaps Nexus will eventually use the
    new NewickIO parser directly.)
    """
    nexus_data = Nexus.Nexus(handle)

    # NB: Once Nexus.Trees is modified to use Tree.Newick objects, this
    # could become ``return iter(nexus_data.trees)``. Until then, convert
    # the Nexus.Trees.Tree object hierarchy by hand.
    def _to_clade(tree, nx_node):
        children = []
        for child_id in nx_node.succ:
            children.append(_to_clade(tree, tree.node(child_id)))
        return Newick.Clade(branch_length=nx_node.data.branchlength,
                            name=nx_node.data.taxon,
                            clades=children,
                            confidence=nx_node.data.support,
                            comment=nx_node.data.comment)

    for tree in nexus_data.trees:
        root_clade = _to_clade(tree, tree.node(tree.root))
        yield Newick.Tree(root=root_clade,
                          rooted=tree.rooted,
                          name=tree.name,
                          weight=tree.weight)
def main():
    """Concatenate the NEXUS alignments under ``args.alignments``.

    Writes either a PHYLIP file (optionally with a separate charsets file)
    or a NEXUS file (with or without charsets) into ``args.output``; the
    output base name is the basename of ``args.alignments``.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)

    # read alignments
    log.info("Reading input alignments in NEXUS format")
    data = []
    for fname in glob.glob(os.path.join(args.alignments, '*.nex*')):
        data.append((os.path.basename(fname), Nexus.Nexus(fname)))

    log.info("Concatenating files")
    concatenated = Nexus.combine(data)

    out_stem = os.path.basename(args.alignments)
    if args.nexus:
        concat_file = os.path.join(args.output, out_stem + ".nexus")
        if args.charsets:
            log.info("Writing concatenated alignment to NEXUS format (with charsets)")
            concatenated.write_nexus_data(concat_file)
        else:
            log.info("Writing concatenated alignment to NEXUS format (without charsets)")
            concatenated.write_nexus_data(concat_file, append_sets=False)
    else:
        concat_file = os.path.join(args.output, out_stem + ".phylip")
        if args.charsets:
            sets = concatenated.append_sets()
            charset_file = os.path.join(args.output, out_stem + ".charsets")
            log.info("Writing charsets to {}".format(charset_file))
            with open(charset_file, 'w') as outf:
                outf.write(sets)
        log.info("Writing concatenated PHYLIP alignment to {}".format(concat_file))
        concatenated.export_phylip(concat_file)

    # end
    banner = " Completed {} ".format(my_name)
    log.info(banner.center(65, "="))
def model_partition(metadata, aligns):
    """Concatenate all loci and record one column range per model.

    Args:
        metadata: Mapping of model -> iterable of locus names.
        aligns: Directory containing one ``<locus>.nex`` file per locus.

    Returns:
        Tuple ``(combined, new_metadata)``: the result of ``Nexus.combine``
        over all loci in iteration order, and an OrderedDict mapping each
        model to the 1-based inclusive ``(start, end)`` range spanned by
        its loci in the combined matrix.
    """
    to_combine = []
    start = 1
    end = 0
    new_metadata = OrderedDict()
    for model in metadata:
        for locus in metadata[model]:
            # The matrix is parsed in the constructor; close the file
            # right away instead of leaking one handle per locus.
            with open(os.path.join(aligns, "{0}.nex".format(locus))) as handle:
                nex = Nexus.Nexus(handle)
            end += nex.nchar
            to_combine.append((locus, nex))
        new_metadata[model] = (start, end)
        start = end + 1
    combined = Nexus.combine(to_combine)
    return combined, new_metadata
def test_taxa_and_characters_with_many_codings_two_without_state(self):
    """Taxa and chr blocks, over 9 codings, 2 character without states."""
    # TODO: Implement continuous datatype:
    # Bio.Nexus.Nexus.NexusError: Unsupported datatype: continuous
    parser = Nexus.Nexus()
    with self.assertRaises(Nexus.NexusError):
        parser.read("Nexus/vSysLab_Oreiscelio_discrete+continuous.nex")
def main(nexusfile, reftree, burnin=10):
    """Tally tree topologies found in a nexus tree file after burn-in.

    Args:
        nexusfile: Nexus file containing the sampled trees.
        reftree: Reference tree handed to ``represent_clades``.
        burnin: Percentage of the leading trees to discard.

    NOTE(review): the function computes several values (taxlabels,
    MAP topology/probability, clades) but returns nothing in the code
    visible here - confirm whether the tail of the function is missing.
    """
    # Using the Nexus module
    data = Nexus.Nexus(nexusfile)
    taxlabels = data.structured[1].commandlines[1].options.split()
    nb2taxlabels = data.translate
    trees = data.trees

    # Using the Phylo module
    trees = list(Phylo.parse(nexusfile, 'nexus'))
    N0 = len(trees)
    # Floor division: '/' yields a float on Python 3, and a float is not a
    # valid slice index.
    trees = trees[N0 * burnin // 100 + 1:]
    N = N0 * (100 - burnin) // 100

    topologies = Counter()
    topo_groups = defaultdict(list)
    for tree in trees:
        # Ensure all equivalent topologies will be represented the same way
        biophylo_leaf_sort(tree, tree.root)
        topo = biophylo_topology(tree, tree.root)
        topologies[topo] += 1
        # Group trees by topology; a defaultdict has no append() of its own.
        topo_groups[topo].append(tree)

    MAP_topology, MAP_count = topologies.most_common(1)[0]
    MAP_proba = float(MAP_count) / sum(topologies.values())
    clades = represent_clades(reftree, BioPhylo.get_children, BioPhylo.get_label)
def write_alignment(self, alignment, interleave=None):
    """Write an alignment to file.

    An otherwise empty Nexus object is created, the sequences are added
    to it, and Nexus prepares the output.

    Default interleave behaviour: interleave if columns > 1000;
    override with interleave=[True/False].
    """
    if len(alignment) == 0:
        raise ValueError("Must have at least one sequence")
    n_columns = alignment.get_alignment_length()
    if n_columns == 0:
        raise ValueError("Non-empty sequences are required")

    datatype = self._classify_alphabet_for_nexus(alignment._alphabet)
    header = (
        "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=%s; end;"
        % datatype)
    nexus_obj = Nexus.Nexus(header)
    nexus_obj.alphabet = alignment._alphabet
    for rec in alignment:
        nexus_obj.add_sequence(rec.id, str(rec.seq))

    # Note: MrBayes may choke on large alignments if not interleaved
    use_interleave = (n_columns > 1000) if interleave is None else interleave
    nexus_obj.write_nexus_data(self.handle, interleave=use_interleave)
def combine(arg):
    """Concatenate the alignments listed in ``arg.input`` into ``arg.output``.

    When ``get_format(arg)`` reports 'fasta', ``arg`` is first replaced by
    ``convert(arg)``. Each partition is named ``clean_name(<input name>)``.
    """
    if get_format(arg) == 'fasta':
        arg = convert(arg)
    name_data = []
    for name in arg.input:
        name_data.append((clean_name(name), Nexus.Nexus(name)))
    merged = Nexus.combine(name_data)
    merged.write_nexus_data(filename=arg.output)
def write_alignment(self, alignment, interleave=None):
    """Write an alignment to file.

    An otherwise empty Nexus object is created, the sequences are added
    to it, and Nexus prepares the output.

    Default interleave behaviour: interleave if columns > 1000;
    override with interleave=[True/False].
    """
    if len(alignment) == 0:
        raise ValueError("Must have at least one sequence")
    n_columns = alignment.get_alignment_length()
    if n_columns == 0:
        raise ValueError("Non-empty sequences are required")

    datatype = self._classify_mol_type_for_nexus(alignment)
    header = (
        "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=%s; end;"
        % datatype)
    nexus_obj = Nexus.Nexus(header)

    for rec in alignment:
        # Sanity test sequences (should this be even stricter?)
        if datatype == "dna":
            if "U" in rec.seq:
                raise ValueError(f"{rec.id} contains U, but DNA alignment")
        elif datatype == "rna":
            if "T" in rec.seq:
                raise ValueError(f"{rec.id} contains T, but RNA alignment")
        nexus_obj.add_sequence(rec.id, str(rec.seq))

    # Note: MrBayes may choke on large alignments if not interleaved
    use_interleave = (n_columns > 1000) if interleave is None else interleave
    nexus_obj.write_nexus_data(self.handle, interleave=use_interleave)
def process_dataset_metrics(dataset_path, metrics, minimum_window_size,
                            outfilename):
    '''
    Input:
        dataset_path: path to a nexus alignment with UCE charsets
        metrics: a list of 'gc', 'entropy' or 'multi'
        minimum_window_size: forwarded to process_uce
        outfilename: name for the csv file
    Output:
        csv files written to disk, plus one
        '<dataset>_<metric>_partition_finder.cfg' file per metric in the
        current working directory
    '''
    print("Sitewise metrics analysis")

    # Drop a trailing '.nex' suffix. str.rstrip(".nex") would instead strip
    # any trailing run of the characters '.', 'n', 'e', 'x'
    # (e.g. "alignmen.nex" -> "alignm").
    dataset_name = os.path.basename(dataset_path)
    if dataset_name.endswith(".nex"):
        dataset_name = dataset_name[:-len(".nex")]

    with open(outfilename, 'w') as outfile:
        outfile.write(
            "name,uce_site,aln_site,window_start,window_stop,type,value,plot_mtx\n"
        )

    # write the start blocks of the partitionfinder files
    for m in metrics:
        with open('%s_%s_partition_finder.cfg' % (dataset_name, m),
                  'w') as pfinder_config_file:
            pfinder_config_file.write(p_finder_start_block(dataset_name))

    dat = Nexus.Nexus()
    dat.read(dataset_path)
    with open(dataset_path) as aln_handle:
        aln = AlignIO.read(aln_handle, "nexus")

    for name in tqdm(dat.charsets):
        sites = dat.charsets[name]
        start = min(sites)
        stop = max(sites) + 1
        # slice the alignment to get the UCE
        uce_aln = aln[:, start:stop]
        best_windows, metric_array = process_uce(uce_aln, metrics,
                                                 minimum_window_size)
        for i, best_window in enumerate(best_windows):
            # Append and close immediately; the handle used to be left
            # open for every charset.
            with open('%s_%s_partition_finder.cfg' % (dataset_name, metrics[i]),
                      'a') as pfinder_config_file:
                pfinder_config_file.write(
                    blocks_pfinder_config(best_window, name, start, stop,
                                          uce_aln))
            # NOTE(review): only the first window/metric gets its blocks
            # written because of this break - confirm this is intended.
            break
        write_csvs(best_windows, metric_array, sites, name, outfilename)

    # write the end blocks of the partitionfinder files
    for m in metrics:
        with open('%s_%s_partition_finder.cfg' % (dataset_name, m),
                  'a') as pfinder_config_file:
            pfinder_config_file.write(p_finder_end_block(dataset_name))
def test_WriteToFileName(self):
    """Test writing to a given filename."""
    target = "Nexus/test_temp.nex"
    # Start from a clean slate so the existence check below is meaningful.
    if os.path.isfile(target):
        os.remove(target)
    nexus_obj = Nexus.Nexus(self.handle)
    nexus_obj.write_nexus_data(target)
    written = os.path.isfile(target)
    self.assertTrue(written)
    os.remove(target)
def _write_site_window(alignment, input, outformat, start, stop):
    """Write columns [start, stop) of ``alignment`` to
    '<input>_site<start>to<stop>.<outformat>'."""
    label = 'site' + str(start) + 'to' + str(stop)
    with open(input + '_' + label + '.' + outformat, 'w') as output_handle:
        window = MultipleSeqAlignment(alignment[:, start:stop],
                                      alphabet=generic_dna)
        if outformat.lower() == 'nexus':
            # Route through Nexus.combine so the file carries a charset
            # named after the window.
            nexus_window = Nexus.Nexus(window.format('nexus'))
            combined = Nexus.combine([(label, nexus_window)])
            combined.write_nexus_data(output_handle)
        else:
            AlignIO.write(window, output_handle, outformat)


def alignment_slicer(input, informat, outformat, SNPs, slide):
    """Cut an alignment into sliding windows and write each to its own file.

    Args:
        input: Path of the alignment; also the prefix of every output file.
        informat: Format name understood by ``AlignIO.read``.
        outformat: Output format; 'nexus' output goes through ``Nexus.combine``.
        SNPs: Window width in columns.
        slide: Step between the starts of consecutive windows.

    Full windows are written while they fit; the remaining columns (if
    any) are written as one final, shorter window.

    Raises:
        ValueError: if ``SNPs`` or ``slide`` is not positive.
    """
    # A non-positive step would never advance and loop forever.
    if SNPs <= 0 or slide <= 0:
        raise ValueError("SNPs and slide must be positive")

    alignment = AlignIO.read(input, informat, alphabet=generic_dna)
    length_alignment = len(alignment[0].seq)

    # Use the function's own parameters rather than a module-level
    # ``args`` namespace, so the signature is honoured.
    start = 0
    end = start + SNPs
    while end <= length_alignment:
        _write_site_window(alignment, input, outformat, start, end)
        start += slide
        end += slide

    # Trailing partial window. It is labelled with the real last column,
    # and skipped when no columns are left (a zero-column alignment cannot
    # be formatted as nexus).
    if start < length_alignment:
        _write_site_window(alignment, input, outformat, start,
                           length_alignment)
    print("\ndone\n")
def write_nexus_non_interleaved(alignment, fh_out):
    """Write ``alignment`` to ``fh_out`` as a non-interleaved nexus data block.

    The data block is declared with ``datatype=dna missing=N``. The
    sequences are added one by one to a Nexus object created from a
    minimal record declaring ntax=0 and nchar=0.

    Args:
        alignment: Iterable of records exposing ``id`` and ``seq``; its
            ``_alphabet`` is copied onto the Nexus object.
        fh_out: Destination passed to ``write_nexus_data``.
    """
    # The former ``ntax`` / ``nchar`` locals were never used and are gone.
    minimal_record = "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; " \
        + "format datatype=dna missing=N; end;"
    n = Nexus.Nexus(minimal_record)
    n.alphabet = alignment._alphabet
    for record in alignment:
        n.add_sequence(record.id, str(record.seq))
    n.write_nexus_data(fh_out, interleave=False)
def model_partition(metadata, aligns):
    """Concatenate all loci and record one column range per model.

    Args:
        metadata: Mapping of model -> iterable of locus names.
        aligns: Directory containing one ``<locus>.nex`` file per locus.

    Returns:
        Tuple ``(combined, new_metadata)``: the result of ``Nexus.combine``
        over all loci in iteration order, and an OrderedDict mapping each
        model to the 1-based inclusive ``(start, end)`` range spanned by
        its loci in the combined matrix.
    """
    to_combine = []
    start = 1
    end = 0
    new_metadata = OrderedDict()
    for model in metadata:
        for locus in metadata[model]:
            # Parse and close immediately rather than leaking a handle
            # for every locus.
            with open(os.path.join(aligns, "{0}.nex".format(locus))) as handle:
                nex = Nexus.Nexus(handle)
            end += nex.nchar
            to_combine.append((locus, nex))
        new_metadata[model] = (start, end)
        start = end + 1
    combined = Nexus.combine(to_combine)
    return combined, new_metadata
def check_alignment(alignment_file):
    """Read a nexus alignment and validate its charpartitions and taxsets.

    Checks, in order: the file parses; the CHARPARTITIONS are exactly
    'loci' and 'genomes'; the only TAXSET is 'outgroups'; neither
    charpartition lists more sites than the alignment has; both
    charpartitions cover every site. Each failure is logged.

    Args:
        alignment_file: Nexus file to read.

    Returns:
        The parsed ``Nexus.Nexus`` object.

    Raises:
        ValueError: on the first check that fails.
    """
    aln = Nexus.Nexus()
    try:
        aln.read(alignment_file)
    except Exception as e:
        logging.error("Couldn't read nexus file, please check and try again.")
        logging.error("Here's the error from the BioPython Nexus.Nexus module")
        logging.error(e)
        raise ValueError

    # Check that there are just two charpartitions: 'loci' and 'genomes'.
    # Compare as sets: on Python 3 a keys() view never equals a list, and
    # on Python 2 the list comparison depended on dict ordering.
    logging.info(" checking correct charpartitions exist")
    if set(aln.charpartitions.keys()) != {'loci', 'genomes'}:
        logging.error("There should be exactly two CHARPARTITIONS: 'loci' and 'genomes'. Check and try again.")
        raise ValueError

    # Check for an 'outgroup' taxset
    logging.info(" checking outgroup taxset exists")
    if set(aln.taxsets.keys()) != {'outgroups'}:
        logging.error("There should be exactly one TAXSET: 'outgroups'. Check and try again.")
        raise ValueError

    # Check that no sites are duplicated in either charpartition
    logging.info(" checking for duplicates sites in charpartitions")
    all_sites = set(range(aln.nchar))

    loci_sites = list(
        itertools.chain.from_iterable(aln.charpartitions['loci'].values()))
    if len(loci_sites) > len(all_sites):
        logging.error("The loci charpartition has %d more site(s) than the number of sites in the alignment" % (len(loci_sites) - len(all_sites)))
        raise ValueError

    geno_sites = list(
        itertools.chain.from_iterable(aln.charpartitions['genomes'].values()))
    if len(geno_sites) > len(all_sites):
        logging.error("The genomes charpartition has %d more site(s) than the number of sites in the alignment" % (len(geno_sites) - len(all_sites)))
        raise ValueError

    # Check that all sites are covered by 'loci' charpartition
    logging.info(" checking that all sites are covered by charpartitions")
    if len(set(loci_sites)) < len(all_sites):
        logging.error("The loci charpartition does not cover the following sites, please fix: %s" % (all_sites.difference(set(loci_sites))))
        raise ValueError

    # Check that all sites are covered by 'genomes' charpartition
    if len(set(geno_sites)) < len(all_sites):
        logging.error("The genomes charpartition does not cover the following sites, please fix: %s" % (all_sites.difference(set(geno_sites))))
        raise ValueError

    return aln
def parse_nexus_file(self, path_to_nex):
    '''
    Parse a NEXUS file and return its charsets and matrix.

    Args:
        path_to_nex: NEXUS file to read.

    Returns:
        Tuple ``(charsets, matrix)`` taken from the parsed Nexus object.

    Raises:
        ME.MyException: if reading or parsing fails.
    '''
    from Bio.Nexus import Nexus
    try:
        aln = Nexus.Nexus()
        aln.read(path_to_nex)
        charsets = aln.charsets
        matrix = aln.matrix
    # Catch Exception rather than everything: a bare ``except:`` would also
    # swallow KeyboardInterrupt and SystemExit.
    except Exception:
        raise ME.MyException('Parsing of .nex-file unsuccessful.')
    return (charsets, matrix)
def test_TreeTest1(self):
    """Test Tree module."""
    n = Nexus.Nexus(self.handle)
    # NOTE(review): t3 and t2 are both bound to n.trees[2], i.e. the same
    # list element - confirm whether t2 was meant to be a different tree.
    t3 = n.trees[2]
    t2 = n.trees[2]
    # Re-root on the (t1, t5) outgroup and check the string form.
    t3.root_with_outgroup(['t1', 't5'])
    self.assertEqual(
        str(t3),
        "tree tree1 = (((((('one should be punished, for (that)!','isn''that [a] strange name?'),'t2 the name'),t8,t9),t6),t7),(t5,t1));"
    )
    self.assertEqual(t3.is_monophyletic(['t8', 't9', 't6', 't7']), -1)
    self.assertEqual(t3.is_monophyletic(['t1', 't5']), 13)
    t3.split(parent_id=t3.search_taxon('t9'))
    # Capture what display() prints by swapping sys.stdout for an
    # in-memory buffer; the real stdout is always restored in ``finally``.
    stdout = sys.stdout
    try:
        sys.stdout = cStringIO.StringIO()
        t3.display()
        if sys.version_info[0] == 3:
            output = sys.stdout.getvalue()
        else:
            sys.stdout.reset()
            output = sys.stdout.read()
    finally:
        sys.stdout = stdout
    # NOTE(review): this expected table is reproduced exactly as it appears
    # in the reviewed text, where its line breaks and column padding look
    # collapsed - verify it against the canonical file before relying on it.
    expected = """\ # taxon prev succ brlen blen (sum) support comment 1 'isn''that [a] strange name?' 2 [] 100.00 119.84 10.00 - 2 - 4 [3, 1] 0.40 19.84 0.30 - 3 'one should be punished, for (that)!' 2 [] 0.50 20.34 - - 4 - 6 [2, 5] 4.00 19.44 3.00 - 5 't2 the name' 4 [] 0.30 19.74 - - 6 - 9 [4, 7, 8] 2.00 15.44 1.00 - 7 t8 6 [] 1.20 16.64 - - 8 t9 6 [17, 18] 3.40 18.84 - - 9 - 11 [6, 10] 0.44 13.44 33.00 - 10 t6 9 [] 1.00 14.44 - - 11 - 16 [9, 12] 13.00 13.00 12.00 - 12 t7 11 [] 99.90 112.90 - - 13 - 16 [14, 15] 0.00 0.00 0.00 - 14 t5 13 [] 99.00 99.00 - - 15 t1 13 [] 0.98 0.98 - - 16 - None [11, 13] 0.00 0.00 - - 17 t90 8 [] 1.00 19.84 - - 18 t91 8 [] 1.00 19.84 - - Root: 16 """
    # Compare the line count first, then line by line, so a failure points
    # at the first differing row before the whole-string comparison.
    self.assertEqual(len(output.split("\n")), len(expected.split("\n")))
    for l1, l2 in zip(output.split("\n"), expected.split("\n")):
        self.assertEqual(l1, l2)
    self.assertEqual(output, expected)
    self.assertEqual(t3.is_compatible(t2, threshold=0.3), [])
def fully_partition(metadata, aligns):
    """Concatenate all loci and record each locus' column range.

    Args:
        metadata: Mapping of model -> mapping keyed by locus name. It is
            updated in place: every locus value is replaced by its
            1-based inclusive ``(start, end)`` range in the combined matrix.
        aligns: Directory containing one ``<locus>.nex`` file per locus.

    Returns:
        Tuple ``(combined, metadata)`` where ``combined`` is the result of
        ``Nexus.combine`` and ``metadata`` is the same, mutated, mapping.
    """
    to_combine = []
    start = 1
    for model in metadata:
        # Only existing keys are reassigned below, so iterating the inner
        # mapping while updating it is safe.
        for locus in metadata[model]:
            # Parse and close immediately rather than leaking a handle
            # for every locus.
            with open(os.path.join(aligns, "{0}.nex".format(locus))) as handle:
                nex = Nexus.Nexus(handle)
            end = start + nex.nchar - 1
            metadata[model][locus] = (start, end)
            to_combine.append((locus, nex))
            start = end + 1
    combined = Nexus.combine(to_combine)
    return combined, metadata
def write_alignment(alignment_trans, outformat):
    """
    Read in the translated alignment, write this out to file in any format.

    The output path is ``ARGS.filename`` with its extension replaced by
    '_nametrans.<outformat>'. For 'nexus' the alignment is converted to a
    Nexus object and written non-interleaved; any other format is
    delegated to ``AlignIO.write``.
    """
    out_path = os.path.splitext(ARGS.filename)[0] + "_nametrans." + outformat
    with open(out_path, "w") as output_handle:
        if outformat == "nexus":
            alignment_trans = Nexus.Nexus(alignment_trans.format("nexus"))
            alignment_trans.write_nexus_data(output_handle, interleave=False)
        else:
            AlignIO.write(alignment_trans, output_handle, outformat)
    # Single-argument print(...) runs identically on Python 2 and 3.
    print('\nAlignment with translated strain names written to "' +
          output_handle.name + '".')
def check_taxa(matrices):
    '''Check that nexus instances in a list [(name, instance)...] have
    the same taxa.

    Every matrix after the first is compared with the first one.

    From: http://biopython.org/wiki/Concatenate_nexus

    Args:
        matrices: List of ``(name, nexus_instance)`` tuples; each instance
            exposes ``taxlabels``.

    Returns:
        None when all taxa match (also for an empty or single-item list).

    Raises:
        Nexus.NexusError: naming the taxa missing from one of the matrices.
    '''
    if not matrices:
        # Nothing to compare.
        return None
    first_name, first_matrix = matrices[0]
    first_taxa = first_matrix.taxlabels
    first_lookup = set(first_taxa)
    for name, matrix in matrices[1:]:
        # Sets give O(1) membership tests; the list comprehensions keep the
        # original label order for the error message.
        other_lookup = set(matrix.taxlabels)
        first_only = [t for t in first_taxa if t not in other_lookup]
        new_only = [t for t in matrix.taxlabels if t not in first_lookup]
        if first_only:
            missing = ', '.join(first_only)
            msg = '%s taxa %s not in matrix %s' % (first_name, missing, name)
            raise Nexus.NexusError(msg)
        elif new_only:
            missing = ', '.join(new_only)
            msg = '%s taxa %s not in all matrices' % (name, missing)
            raise Nexus.NexusError(msg)
    return None  # will only get here if it hasn't thrown an exception
def verify_nexus_topology(self, treeseq):
    """Check the nexus export of ``treeseq`` tree by tree.

    The number of parsed trees must equal ``treeseq.num_trees``. Each
    parsed tree name must split on '_' into exactly two parts: the first
    (after its leading four characters) and the second are compared with
    the tree's interval bounds, and ``verify_tree`` is run on the pair.
    """
    nexus_text = treeseq.nexus(precision=16)
    parsed = Nexus.Nexus(nexus_text)
    self.assertEqual(treeseq.num_trees, len(parsed.trees))
    paired = itertools.zip_longest(treeseq.trees(), parsed.trees)
    for ts_tree, nx_tree in paired:
        parts = nx_tree.name.split("_")
        self.assertEqual(len(parts), 2)
        left_text = parts[0][4:]
        right_text = parts[1]
        start = float(left_text)
        end = float(right_text)
        self.assertAlmostEqual(ts_tree.interval[0], start)
        self.assertAlmostEqual(ts_tree.interval[1], end)
        self.verify_tree(nx_tree, ts_tree)
def model_partition(metadata, aligns):
    """Concatenate all loci and record one column range per model.

    Args:
        metadata: Mapping of model -> iterable of locus names.
        aligns: Directory containing one ``<locus>.nex`` file per locus.

    Returns:
        Tuple ``(combined, new_metadata)``: the result of ``Nexus.combine``
        over all loci in iteration order, and an OrderedDict mapping each
        model to the 1-based inclusive ``(start, end)`` range spanned by
        its loci in the combined matrix.
    """
    to_combine = []
    start = 1
    end = 0
    new_metadata = OrderedDict()
    for model in metadata:
        for locus in metadata[model]:
            # The matrix is parsed in the constructor; close the file
            # right away instead of leaking one handle per locus.
            with open(os.path.join(aligns, "{0}.nex".format(locus))) as handle:
                nex = Nexus.Nexus(handle)
            end += nex.nchar
            to_combine.append((locus, nex))
        new_metadata[model] = (start, end)
        start = end + 1
    combined = Nexus.combine(to_combine)
    return combined, new_metadata
def model_partition(metadata, aligns):
    """Concatenate all loci and record one column range per model.

    Args:
        metadata: Mapping of model -> iterable of locus names.
        aligns: Directory containing one ``<locus>.nex`` file per locus.

    Returns:
        Tuple ``(combined, new_metadata)``: the result of ``Nexus.combine``
        over all loci in iteration order, and an OrderedDict mapping each
        model to the 1-based inclusive ``(start, end)`` range spanned by
        its loci in the combined matrix.
    """
    to_combine = []
    start = 1
    end = 0
    new_metadata = OrderedDict()
    for model in metadata:
        for locus in metadata[model]:
            # Parse and close immediately rather than leaking a handle
            # for every locus.
            with open(os.path.join(aligns, "{0}.nex".format(locus))) as handle:
                nex = Nexus.Nexus(handle)
            end += nex.nchar
            to_combine.append((locus, nex))
        new_metadata[model] = (start, end)
        start = end + 1
    combined = Nexus.combine(to_combine)
    return combined, new_metadata
def write_variant_sites(alignment, var_sites, outfile):
    """Write the columns listed in ``var_sites`` as a partitioned nexus file.

    ``ranges(var_sites)`` groups the positions into blocks; each block
    ``(first, last)`` is sliced as columns ``first .. last`` inclusive,
    converted to nexus and labelled 'site <first> to <last + 1>'. The
    blocks are then combined and written to ``outfile``.

    Args:
        alignment: Alignment supporting ``alignment[:, a:b]`` slicing.
        var_sites: Column positions to keep.
        outfile: Path of the nexus file to write.
    """
    nex_aligns = []  # (label, Bio.Nexus.Nexus.Nexus) pairs
    blocks = list(ranges(var_sites))  # tuples of positions
    for i in blocks:
        alignment_iteration = MultipleSeqAlignment(
            alignment[:, i[0]:i[1] + 1], alphabet=generic_dna).format('nexus')
        nex_aligns.append(('site {} to {}'.format(str(i[0]), str(i[1] + 1)),
                           Nexus.Nexus(alignment_iteration)))
    combined = Nexus.combine(nex_aligns)
    with open(outfile, 'w') as out:
        combined.write_nexus_data(out)
    # Single-argument print(...) runs identically on Python 2 and 3.
    print('Converted {} informative sites without gaps into nexus alignment'.format(str(len(blocks))))
def _write_BEST(dataset, filestem):
    """Write a MrBayes block for BEST species tree estimation.

    Used by write_multispecies(): writes a concatenated nexus file
    ``<filestem>.nex`` with one partition per gene and prints a MrBayes
    block (one taxset per species) to screen.

    Args:
        dataset: Object providing ``get_genes()``, ``get_sequences(gene)``,
            ``get_species()`` and ``len()``.
        filestem: Output path without the '.nex' extension.
    """
    fname = filestem + ".nex"

    # write a nexus file with partitions for each gene
    nexi = [(g, _nexify(dataset.get_sequences(g))) for g in dataset.get_genes()]
    combined = Nexus.combine(nexi)
    combined.write_nexus_data(filename=fname)

    # then build a MrBayes block for BEST: species -> 1-based OTU numbers.
    # range() replaces the Python 2-only xrange(); zip still stops at the
    # shorter of the two sequences.
    d = defaultdict(list)
    for sp, number in zip(dataset.get_species(), range(1, len(dataset) + 1)):
        d[sp].append(str(number))

    # NOTE(review): the block opens with "begin MyBayes;" and is never
    # closed with "end;" - confirm whether "mrbayes" was intended.
    contents = ["begin MyBayes;"]
    for species, OTUs in d.items():
        contents.append("taxset %s = % s" % (species, " ".join(OTUs)))

    # Single-argument print(...) runs identically on Python 2 and 3.
    print("Add the following to the MrBayes block in %s" % fname)
    for line in contents:
        print(line)
def GettingInfoFromInput(NexusInput):
    """Collect per-partition model settings from the MrBayes block(s) of a
    nexus file.

    The function assumes the datatype is uniform (all DNA or all protein,
    no mix). Command lines of every block titled 'mrbayes' are merged into
    one list and scanned for charset, partition, mcmc and set commands;
    lset/prset commands then fill in the substitution-matrix description
    of each partition.

    Args:
        NexusInput: Nexus file to read.

    Returns:
        Tuple ``(Model, partitionPlan, nruns)``: a dict keyed by partition
        name, the list of partition names in use (``["dummy"]`` when no
        'set partition' command was found), and the nruns value as int.

    Raises:
        ValueError: if no mcmc command provides an 'nruns' option.
    """
    from Bio.Nexus import Nexus

    shape_dict = {"1": "JC", "2": "HKY", "6": "GTR"}
    size_dict = {"4by4": "4X4", "doublet": "16X16", "codon": "64X64"}

    N = Nexus.Nexus()
    N.read(NexusInput)

    # merge all mrbayes blocks present in the file into one long list of
    # commands (should only the first one be taken?)
    cmdblock = sum([sum(x.commandlines, [])
                    for x in N.unknown_blocks
                    if x.title.lower() == "mrbayes"], [])

    HyppartitionPlan = {}
    partitionPlan = ["dummy"]
    nruns = None
    for cmdline in cmdblock:
        CMDline = Nexus.Commandline(cmdline, "mrbayes")
        if CMDline.command == "charset":
            N._charset(CMDline.options)
        elif CMDline.command == "partition":
            nameplan = cmdline.split("=")[0].split()[1]
            HyppartitionPlan[nameplan] = [
                x.strip() for x in cmdline.split(":")[-1].split(",")]
        elif CMDline.command.find("mcmc") > -1:
            try:
                nruns = CMDline.options["nruns"]
            except KeyError:
                pass
        elif CMDline.command == "set":
            # ``in`` replaces dict.has_key(), which Python 3 removed.
            if "partition" in CMDline.options:
                partitionPlan = HyppartitionPlan[CMDline.options["partition"]]

    datatype = N.datatype.lower()
    Model = {}
    counter = 1  # 1-based index of the partition, matched against 'applyto'
    for partition in partitionPlan:
        Model[partition] = {"ntaxa": N.ntax,
                            "type": datatype.title(),
                            "matrix": {}}
        Model[partition]["partitionSize"] = len(N.charsets[partition])
        Model[partition]["partitionRange"] = Nexus._compact4nexus(
            N.charsets[partition])
        for cmdline in cmdblock:
            CMDline = Nexus.Commandline(cmdline, "mrbayes")
            if datatype == "dna":
                if CMDline.command == "lset":
                    # The lset applies when it has no 'applyto', targets
                    # 'all', or lists this partition's index.
                    applies = False
                    if "applyto" in CMDline.options:
                        APP = CMDline.options["applyto"][1:-1].lower()
                        if APP.strip() == "all":
                            applies = True
                        elif counter in map(int, APP.split(",")):
                            applies = True
                    else:
                        applies = True
                    if applies:
                        Model[partition]["matrix"]["shape"] = shape_dict[
                            CMDline.options["nst"]]
                        if "nucmodel" in CMDline.options:
                            # Index the mapping; calling it raised TypeError.
                            Model[partition]["matrix"]["size"] = size_dict[
                                CMDline.options["nucmodel"]]
                        else:
                            Model[partition]["matrix"]["size"] = "4X4"
            if datatype == "protein":
                Model[partition]["matrix"]["size"] = "20X20"
                if CMDline.command == "prset":
                    mod = CMDline.options["aamodelpr"]
                    if mod.find("fix") > -1:
                        # Text between the parentheses. The previous upper
                        # bound of find(")") - 1 dropped the last character.
                        Model[partition]["matrix"]["shape"] = mod[
                            mod.find("(") + 1:mod.find(")")]
                    elif mod.find("(") == -1:
                        Model[partition]["matrix"]["shape"] = mod
        counter += 1

    if nruns is None:
        # Previously this surfaced as an unbound-variable error at return.
        raise ValueError(
            "no 'nruns' option found in an mcmc command of the mrbayes block")
    return Model, partitionPlan, int(nruns)
def read_collapse(file, informat, gapchar):
    """Drop invariant columns from a DNA alignment and write the remainder
    as a partitioned nexus file '<file>_collapsed.nexus'.

    A column is kept when more than one of A, C, G, T occurs in it. Runs
    of consecutive kept columns become one partition each, named
    'site<first>to<last + 1>'.

    Args:
        file: Path of the input alignment; also the prefix of the output.
        informat: Format name understood by ``AlignIO.read``.
        gapchar: Gap symbol whose per-column count is looked up and
            printed, or None. It does not affect which columns are kept.
    """
    with open(file, 'r') as input_handle:
        alignment = AlignIO.read(input_handle, informat, alphabet=generic_dna)
    summary_align = AlignInfo.SummaryInfo(alignment)
    first_seq = alignment[0].seq
    length_first_seq = len(first_seq)
    my_pssm = summary_align.pos_specific_score_matrix(first_seq)

    invariant_position_index = []
    for position, column in enumerate(my_pssm.pssm):
        counts = column[1]
        base_counts = [counts['A'], counts['C'], counts['G'], counts['T']]
        if gapchar is not None:
            print(gapchar)
            x = [counts[gapchar]] + base_counts
        else:
            x = base_counts
        print(x)
        # Count distinct bases from the base counts only. The old
        # ``y[1:]`` slice skipped the gap slot when a gap character was
        # given, but skipped the A count when gapchar was None.
        bases_present = sum(1 for c in base_counts if c > 0)
        if bases_present <= 1:
            invariant_position_index.append(position)

    # Set lookup keeps this pass linear in the alignment length.
    invariant_lookup = set(invariant_position_index)
    alignment_indices_to_write = [
        i for i in range(length_first_seq) if i not in invariant_lookup]

    def ranges(i):
        # Consecutive integers share the same (value - index) difference,
        # so groupby yields one group per run; emit (first, last).
        for a, b in itertools.groupby(enumerate(i),
                                      lambda pair: pair[1] - pair[0]):
            b = list(b)
            yield b[0][1], b[-1][1]

    blocks = list(ranges(alignment_indices_to_write))
    # Messages are built as single strings so print(...) emits the same
    # text on Python 2 and Python 3.
    print('\nExcluding {} sites at positions: {} \n'.format(
        str(len(invariant_position_index)), invariant_position_index))
    print('Including sites at positions: {} \n'.format(blocks))

    n_alignments = []
    for i in blocks:
        alignment_iteration = MultipleSeqAlignment(
            alignment[:, i[0]:i[1] + 1], alphabet=generic_dna).format('nexus')
        n_alignments.append(('site' + str(i[0]) + 'to' + str(i[1] + 1),
                             Nexus.Nexus(alignment_iteration)))

    # combine the alignments in n_alignments
    combined = Nexus.combine(n_alignments)
    with open(file + '_collapsed.nexus', 'w') as output_handle:
        print('Writing collapsed alignment to: ' + file + '_collapsed.nexus\n')
        combined.write_nexus_data(output_handle)
# For every '<gene>_sims' folder in the starting directory, combine the three
# phyloSeq nexus files of each 'posterior_predictive_sim_*' sub-folder into
# '<gene>_<sim>/<gene>_<sim>.nex' under the starting directory.
mainDir = os.getcwd()
for g in glob.glob('*_sims'):
    # pull out gene name
    gene = g.split("_")[0]
    # create path to gene folder
    geneDirPath = os.path.join(mainDir, g)
    # move into gene folder
    os.chdir(geneDirPath)
    for p in glob.glob('posterior_predictive_sim_*'):
        simNum = p.split("_")[3]
        # make name for concat nexus file
        concatNex = gene + "_" + simNum + ".nex"
        # make folder for sim seq
        mbRunDirPath = os.path.join(mainDir, gene + "_" + simNum)
        nexOutPath = os.path.join(mbRunDirPath, concatNex)
        if not os.path.exists(mbRunDirPath):
            os.mkdir(mbRunDirPath)
        # debug (one joined string so the output is identical on Python 2 and 3)
        print(' '.join([simNum, concatNex, mbRunDirPath, nexOutPath]))
        # move into sim seq folder
        os.chdir(p)
        seqList = ["phyloSeq[1].nex", "phyloSeq[2].nex", "phyloSeq[3].nex"]
        nexConvert = [(f, Nexus.Nexus(f)) for f in seqList]
        combine = Nexus.combine(nexConvert)
        # Pass the path so the writer opens and closes the file itself,
        # instead of leaking one open handle per simulation.
        combine.write_nexus_data(filename=nexOutPath)
        os.chdir(geneDirPath)
    os.chdir(mainDir)
__author__ = 'anastasiiakorosteleva'
from Bio.Nexus import Nexus

# the combine function takes a list of tuples [(name, nexus instance)...],
# if we provide the file names in a list we can use a list comprehension to
# create these tuples
file_list = ['apoa1.nex', 'apoe.nex', 'cyt450.nex', 'ace.nex', 'ABO.nex',
             "apoa5.nex", 'apod.nex', 'cdk6.nex', 'CETP.nex', 'ETV6.nex',
             'Gckr.nex', 'gdf5.nex', 'LDLR.nex', 'lpl.nex', 'NAT2.nex',
             'park2.nex', 'SLC22A5.nex', 'UGT1A9.nex', 'HMGA2.nex',
             'apoc1.nex']
nexuses = [(fname, Nexus.Nexus(fname)) for fname in file_list]
combined = Nexus.combine(nexuses)
# Pass the path so the writer opens and closes the file itself; handing it
# an anonymous open() handle left that handle unclosed.
combined.write_nexus_data(filename='combo.nex')
def Concatenate(prefix):
    """Concatenate every '*.nex' file in the current directory.

    Writes the combined matrix to 'btCOMBINED.nex' and exports it in
    PHYLIP format to '<prefix>.phy'.

    Args:
        prefix: Path stem of the PHYLIP output.
    """
    combined_name = 'btCOMBINED.nex'
    # The output name itself matches '*.nex'; skip it so a re-run does not
    # concatenate the previous result into the new one.
    file_list = [fname for fname in glob('*.nex') if fname != combined_name]
    nexi = [(fname, Nexus.Nexus(fname)) for fname in file_list]
    combined = Nexus.combine(nexi)
    # Close the output deterministically instead of leaking the handle.
    with open(combined_name, 'w') as handle:
        combined.write_nexus_data(filename=handle)
    combined.export_phylip(prefix + '.phy')
# a little script to concatenate lots of nexus files in a folder
# and write a new one.
from Bio.Nexus import Nexus
import os

infile = "/Users/robertlanfear/Desktop/turtles-individual-nexus-files-for-loci"
outfile = os.path.join(infile, "alignment.nex")

# Names of the plain files directly inside ``infile``. The output file is
# written into the same folder, so leave it out to keep a re-run from
# concatenating the previous result.
file_list = [x for x in next(os.walk(infile))[2]
             if x != os.path.basename(outfile)]

# Partitions keep the bare file name, but the file is opened through its
# full path: the bare name only resolves when the script happens to be run
# from inside ``infile``.
nexi = [(fname, Nexus.Nexus(os.path.join(infile, fname)))
        for fname in file_list]
combined = Nexus.combine(nexi)
# Pass the path so the writer opens and closes the file itself.
combined.write_nexus_data(filename=outfile)
#!/usr/bin/env python
# Author: Gregory S Mendez
# This script will create a super matrix alignment file in nexus format from
# input alignments in nexus format.
# Named variables. Every run needs the following defined:
# 1) --in_dir - The directory containing the nexus alignments that need to be merged.
# 2) --out - The full filepath and name you want for the output file.
from Bio.Nexus import Nexus
import argparse
import glob

# Argument Parser
parser = argparse.ArgumentParser(description = 'This script will create a super matrix alignment file from input alignments')
parser.add_argument('--in_dir', required=True, help='The input directory containing alignment files.')
parser.add_argument('--out', required=True, help='The filepath and filename of the output file.')
args = parser.parse_args()
IN_DIR = args.in_dir
OUT = args.out

FILE_LIST = glob.glob('%s/*.nex' % IN_DIR)
if not FILE_LIST:
    # Nexus.combine returns None for an empty list; stop with a clear
    # message instead of failing later with an AttributeError.
    parser.error('no .nex files found in %s' % IN_DIR)
NEXI = [(FNAME, Nexus.Nexus(FNAME)) for FNAME in FILE_LIST]
COMBINED = Nexus.combine(NEXI)
# Pass the path so the writer opens and closes the file itself.
COMBINED.write_nexus_data(filename=OUT)