def build_subproblems(self):
    """Build the tree of subproblems and return its root.

    The backbone alignment and tree are read, the tree is resolved and its
    edges labelled, and a root problem is created.  The tree is then
    decomposed into placement subsets, each placement subset is decomposed
    into alignment subsets, and finally every leaf problem receives one
    child problem per fragment chunk file.

    Returns:
        ``self.root_problem`` after the full hierarchy has been attached.
    """
    alignment, tree = self.read_alignment_and_tree()

    if options().distance != 1:
        self.compute_distances(alignment)

    assert isinstance(tree, PhylogeneticTree)
    assert isinstance(alignment, MutableAlignment)

    tree.get_tree().resolve_polytomies()
    # Edges are numbered so that results can be assembled back at the end.
    tree.lable_edges()

    # Size options must be set (and sensible) before any decomposition.
    self.check_and_set_sizes(alignment.get_num_taxa())

    self._create_root_problem(tree, alignment)

    # ---- Level 1: placement subsets -------------------------------------
    backbone_copy = PhylogeneticTree(Tree(tree.den_tree))
    placement_size = self.options.placement_size
    min_placement_size = placement_size / int(
        self.options.exhaustive.placementminsubsetsizefacotr)
    placement_subsets = backbone_copy.decompose_tree(
        placement_size,
        strategy=self.strategy,
        minSize=min_placement_size,
        tree_map={},
        pdistance=1,
        decomp_strategy=self.decomp_strategy,
        distances=self.distances,
        maxDiam=None)
    assert len(placement_subsets) > 0, (
        "Tree could not be decomposed"
        " given the following settings; strategy:%s minsubsetsize:%s"
        " placement_size:%s" % (
            self.strategy, self.minsubsetsize, placement_size))
    _LOG.info("Breaking into %d placement subsets."
              % len(placement_subsets))

    # ---- Level 2: alignment subsets inside each placement subset --------
    for placement_key, placement_tree in placement_subsets.items():
        assert isinstance(placement_tree, PhylogeneticTree)
        # Second constructor argument is the problem this one hangs under.
        placement_problem = SeppProblem(
            placement_tree.leaf_node_names(), self.root_problem)
        placement_problem.subtree = placement_tree
        placement_problem.label = "P_%s" % str(placement_key)
        _LOG.debug(
            "Placement subset %s has %d nodes" % (
                placement_problem.label,
                len(placement_tree.leaf_node_names())))

        placement_copy = PhylogeneticTree(Tree(placement_tree.den_tree))
        alignment_subsets = placement_copy.decompose_tree(
            self.options.alignment_size,
            strategy=self.strategy,
            minSize=self.minsubsetsize,
            tree_map={},
            decomp_strategy=self.options.decomp_strategy,
            pdistance=options().distance,
            distances=self.distances,
            maxDiam=self.options.maxDiam)
        assert len(alignment_subsets) > 0, (
            "Tree could not be decomposed"
            " given the following settings; strategy:%s"
            " minsubsetsize:%s alignmet_size:%s" % (
                self.strategy, self.minsubsetsize,
                self.options.alignment_size))

        subset_keys = str(sorted(alignment_subsets.keys()))
        _LOG.debug("Placement subset %s has %d alignment subsets: %s" % (
            placement_problem.label, len(alignment_subsets), subset_keys))
        taxa_total = sum(
            len(subset_tree.leaf_node_names())
            for subset_tree in alignment_subsets.values())
        _LOG.debug("Placement subset %s has %d taxa:" % (
            placement_problem.label, taxa_total))

        for alignment_key, alignment_tree in alignment_subsets.items():
            assert isinstance(alignment_tree, PhylogeneticTree)
            self.modify_tree(alignment_tree)
            subset_problem = SeppProblem(
                alignment_tree.leaf_node_names(), placement_problem)
            subset_problem.subtree = alignment_tree
            subset_problem.label = "A_%s_%s" % (
                str(placement_key), str(alignment_key))

    _LOG.info("Breaking into %d alignment subsets."
              % (len(list(self.root_problem.iter_leaves()))))

    # ---- Level 3: one child per fragment chunk, for better parallelism --
    chunk_files = self.create_fragment_files()
    self.root_problem.fragment_chunks = len(chunk_files)
    for leaf_problem in self.root_problem.iter_leaves():
        for chunk_index in range(self.root_problem.fragment_chunks):
            chunk_problem = SeppProblem(leaf_problem.taxa, leaf_problem)
            chunk_problem.subtree = leaf_problem.subtree
            chunk_label = leaf_problem.label.replace("A_", "FC_")
            chunk_problem.label = chunk_label + "_" + str(chunk_index)
            chunk_problem.fragments = chunk_files[chunk_index]

    _LOG.info("Breaking each alignment subset into %d fragment chunks."
              % self.root_problem.fragment_chunks)
    _LOG.debug("Subproblem structure: %s" % str(self.root_problem))
    return self.root_problem
def testExtendedAlignment(self):
    """Check that two extended alignments merge into a consistent one.

    The backbone alignment is split into two child problems (``subset``
    and its complement), each child gets a slice of the fragments chosen
    by the last character of the fragment name, and the two resulting
    extended alignments are merged.  The test then verifies that:

    * neither written sub-alignment contains an all-gap column,
    * the merged alignment is aligned,
    * its length and its number of insertion columns agree with the
      per-child insertion counts minus the mixed insertions reported by
      ``merge_in``,
    * the base (non-insertion) part reproduces the original alignment.

    Reads fixtures from ``data/simulated`` and writes under ``data/tmp``.
    The ``.sto`` files are expected to exist already (hmmalign is not run
    here).
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("======= starting testExtendedAlignment =========")

    subset = [
        "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI",
        "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI",
        "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC",
        "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD",
        "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF",
        "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH",
        "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD",
        "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE",
        "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE",
        "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF",
        "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA",
        "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE",
        "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII",
    ]

    # Backbone alignment, stripped of all-gap columns; its length is the
    # reference length for the merged result.
    alg = MutableAlignment()
    alg.read_filepath("data/simulated/test.fasta")
    alg.delete_all_gap()
    tlen = alg.get_length()

    frg = MutableAlignment()
    frg.read_filepath("data/simulated/test.fas")

    # Parent problem with two children: `subset` and its complement.
    pp = SeppProblem(alg.keys())
    pp.fragments = frg
    pp.subalignment = alg

    cp1 = SeppProblem(subset, pp)
    cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp)

    # Fragments are assigned by the trailing digit of their name.
    cp1.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) >= 9], frg)
    cp2.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) <= 1], frg)

    cp1labels = cp1.write_subalignment_without_allgap_columns(
        "data/tmp/cp1.fasta")
    cp2labels = cp2.write_subalignment_without_allgap_columns(
        "data/tmp/cp2.fasta")

    # Neither written sub-alignment may contain an all-gap column.
    for written_path in ("data/tmp/cp1.fasta", "data/tmp/cp2.fasta"):
        tmp = MutableAlignment().read_filepath(written_path)
        assert all(
            not tmp.is_all_gap(pos) for pos in range(0, tmp.get_length()))

    cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
    cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")

    # hmmalign has been run beforehand; only its output files are read.
    ext1 = ExtendedAlignment(cp1.fragments)
    ext1.build_extended_alignment("data/tmp/cp1.fasta",
                                  "data/tmp/cp1.extended.sto")
    ext1.relabel_original_columns(cp1labels)

    ext2 = ExtendedAlignment(cp2.fragments)
    ext2.build_extended_alignment("data/tmp/cp2.fasta",
                                  "data/tmp/cp2.extended.sto")
    ext2.relabel_original_columns(cp2labels)

    extmerger = ExtendedAlignment([])
    extmerger.merge_in(ext1)
    # The value returned by the second merge is used below as the number
    # of insertion columns shared between the two inputs.
    mixed = extmerger.merge_in(ext2)

    extmerger.write_to_path("data/tmp/extended.merged.fasta")

    assert extmerger.is_aligned(), "Merged alignment is not aligned"

    # Negative column labels are counted as insertion columns.
    in1 = len([x for x in ext1._col_labels if x < 0])
    in2 = len([x for x in ext2._col_labels if x < 0])
    merged_len = extmerger.get_length()
    print("Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
        merged_len, in1, in2, tlen))

    assert (in1 + in2 + tlen - mixed) == merged_len, (
        "Lengths don't match up after merging. Merged:%d. Insertion1:%d "
        "Insertion2:%d BaseLen:%d Mixed-insertion: %d" % (
            merged_len, in1, in2, tlen, mixed))

    merged_insertions = len(list(extmerger.iter_insertion_columns()))
    # Fixed: the failure message used to print `in1` for "Insertion2".
    assert (in1 + in2 - mixed) == merged_insertions, (
        "Columns are not correctly labeled after merging. Merged insertion "
        "count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d" % (
            merged_insertions, in1, in2, mixed))

    # The base part of the merged alignment must equal the original.
    tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
    tmp.delete_all_gap()
    assert tmp.is_aligned(), "merged alignment should be aligned!"
    assert tmp.get_length() == tlen, "merged alignment has wrong length"
    assert all(alg[k] == s for (k, s) in tmp.items()), (
        "merged alignment should match original alignment")

    print("======= finished testExtendedAlignment =========")
def _create_root_problem(self, tree, alignment):
    """Create the root problem and store it on ``self.root_problem``.

    The root is built from the leaf names of ``tree``, labelled
    ``"root"``, and given ``alignment`` and ``tree`` as its subalignment
    and subtree.
    """
    root = SeppProblem(tree.leaf_node_names())
    # Publish the object first, then fill in its attributes through the
    # local alias.
    self.root_problem = root
    root.label = "root"
    root.subalignment = alignment
    root.subtree = tree
# For every subset directory: load its base alignment and fragment chunks,
# rebuild the extended alignment for that subset, and merge it into the
# global ``extendedAlignment``.  Once all directories are merged, the result
# is written twice: as-is ("unmasked") and with insertion columns removed
# ("masked").
for subset_dir in dirs:
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Working on %s\n" % subset_dir)

    # NOTE(review): the FC_* patterns append directly to the directory name
    # while the *.fasta pattern inserts a "/" -- this only lines up when
    # each entry of `dirs` ends with a path separator; confirm.
    aligned_files = glob.glob('%sFC_*/hmmalign.results.*' % subset_dir)
    sequence_files = glob.glob('%sFC_*/hmmalign.frag.*' % subset_dir)
    base_alignment_file = glob.glob('%s/*.fasta' % subset_dir)

    # The first matching FASTA file is taken as this subset's base alignment
    # (IndexError if the directory has none).
    base_alignment = MutableAlignment()
    base_alignment.read_filepath(base_alignment_file[0])
    subbackbone = original_backbone.get_soft_sub_alignment(
        base_alignment.get_sequence_names())

    # Collect every fragment of this subset, upper-cased, together with the
    # list of fragment names.
    frags = MutableAlignment()
    sequence_names = []
    for frag_file in sequence_files:
        chunk = MutableAlignment()
        chunk.read_filepath(frag_file)
        sequence_names.extend(chunk.get_sequence_names())
        for name, sequence in chunk.iteritems():
            frags[name] = sequence.upper()

    problem = SeppProblem(sequence_names)
    problem.set_subalignment(subbackbone)

    # Drop all-gap columns from a mutable copy and record what
    # delete_all_gap() returns as the reference column annotation.
    mut_subalg = problem.subalignment.get_mutable_alignment()
    remaining_cols = mut_subalg.delete_all_gap()
    problem.annotations["ref.alignment.columns"] = remaining_cols
    problem.fragments = frags

    # NOTE(review): `base_alignment_file` is the whole glob result (a list),
    # not the single path read above -- kept as in the original; confirm
    # that the callee expects a list here.
    ap_alg = problem.read_extendend_alignment_and_relabel_columns(
        base_alignment_file, aligned_files)
    extendedAlignment.merge_in(ap_alg, convert_to_string=False)

extendedAlignment.write_to_path("/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/upp.unmasked.fasta")
extendedAlignment.remove_insertion_columns()
extendedAlignment.write_to_path("/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/upp.masked.fasta")