def read_extendend_alignment_and_relabel_columns(
        self, orig_path, extension_path, convert_to_string=True):
    """
    Counterpart of write_subalignment_without_allgap_columns: read back an
    alignment that was previously written to disk and restore the original
    column labels recorded in this subproblem's annotations.

    extension_path is an .sto file path (or a list of such paths); those
    alignments are read as well and merged into the base alignment found
    at orig_path.
    """
    remaining_cols = self.annotations["ref.alignment.columns"]
    assert remaining_cols is not None and len(remaining_cols) != 0, (
        "Subproblem needs to have a proper list of alignment columns "
        "associated with it")
    _LOG.debug(
        "Reading %s %s and relabeling it based on %d orig column labels."
        % (orig_path, extension_path, len(remaining_cols)))
    extended = ExtendedAlignment(list(self.fragments.keys()))
    extended.build_extended_alignment(
        orig_path, extension_path, convert_to_string)
    extended.relabel_original_columns(remaining_cols)
    return extended
def merge_subalignments(self):
    """
    Merge the extended alignments of all alignment subsets into a single
    extended alignment for the current placement subset, and return it.
    """
    pp = self.placement_problem
    _LOG.info("Merging sub-alignments for placement problem : %s." %
              (pp.label))
    # The placement problem's fragments are the union of its children's.
    pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
    for ap in pp.get_children():
        pp.fragments.seq_names |= set(ap.fragments)
    # Fold every hmmalign result into one extended alignment.
    merged = ExtendedAlignment(pp.fragments.seq_names)
    for ap in pp.children:
        assert isinstance(ap, SeppProblem)
        # Collect the hmmalign output of each fragment chunk, skipping
        # chunks that produced no result.
        results = (fp.get_job_result_by_name('hmmalign')
                   for fp in ap.children)
        aligned_files = [r for r in results if r is not None]
        _LOG.info("Merging fragment chunks for subalignment : %s." %
                  (ap.label))
        sub_alignment = ap.read_extendend_alignment_and_relabel_columns(
            ap.jobs["hmmbuild"].infile, aligned_files)
        _LOG.info("Merging alignment subset into placement subset: %s." %
                  (ap.label))
        merged.merge_in(sub_alignment, convert_to_string=False)
        del sub_alignment
    merged.from_bytearray_to_string()
    return merged
def backtranslate(faa, fna):
    """
    Back-translate an aligned amino-acid alignment into a codon alignment.

    faa -- aligned amino-acid alignment (gaps become '---' codon gaps).
    fna -- unaligned nucleotide sequences keyed like faa; keys missing
           from faa are silently skipped.

    Returns an ExtendedAlignment whose column labels are the faa column
    labels, each repeated three times (one per codon position).

    Raises ValueError when a codon does not translate to the aligned
    residue (GTG/TTG are tolerated at position 0 as alternative start
    codons translated as M).
    """
    newfna = ExtendedAlignment(faa.fragments)
    for k, s in fna.items():
        if k not in faa:
            continue
        aa = faa[k].upper()
        cd = []
        i = 0
        for r in aa:
            cds = s[i:i + 3]
            if r == '-':
                # alignment gap: emit a codon-wide gap, do not consume nt
                cd.append('---')
            elif is_compatible(cds, r):
                cd.append(cds)
                i += 3
            elif i == 0 and cds in ('GTG', 'TTG'):
                # alternative start codons (translated as Met by the
                # standard genetic code's initiation rules)
                cd.append(cds)
                i += 3
            else:
                raise ValueError('%s at position %d of %s '
                                 'does not translate to %s'
                                 % (cds, i, k, r))
        newfna[k] = ''.join(cd)
    # Triplicate the column labels in one concatenation; the original
    # rebuilt the list per column, which was quadratic.
    newfna._col_labels = newfna._col_labels + [
        lab for lab in faa.col_labels for _ in range(3)]
    return newfna
def merge_results(self):
    """
    Merge the per-chunk pickled extended alignments into a single extended
    alignment stored in self.results, then feed the main tree plus each
    subset tree / placement-json location to the merge job and run it.
    """
    assert isinstance(self.root_problem, SeppProblem)
    '''Generate single extended alignment'''
    fullExtendedAlignment = ExtendedAlignment(
        self.root_problem.fragments.keys())
    for pp in self.root_problem.get_children():
        for i in range(0, self.root_problem.fragment_chunks):
            # Context manager closes the handle even if unpickling raises;
            # the original open()/close() pair leaked the file on error.
            # NOTE: pickle.load is safe here only because these files are
            # written by this same pipeline, never by untrusted parties.
            with open(pp.jobs[get_placement_job_name(i)]
                      .full_extended_alignment_file, 'rb') as align_input:
                extended_alignment = pickle.load(align_input)
            fullExtendedAlignment.merge_in(
                extended_alignment, convert_to_string=True)
    self.results = fullExtendedAlignment

    mergeinput = []
    '''Append main tree to merge input'''
    mergeinput.append(
        "%s;" % (self.root_problem.subtree.compose_newick(labels=True)))
    for pp in self.root_problem.get_children():
        assert isinstance(pp, SeppProblem)
        for i in range(0, self.root_problem.fragment_chunks):
            if (pp.get_job_result_by_name(get_placement_job_name(i))
                    is None):
                continue
            '''Append subset trees and json locations to merge input'''
            mergeinput.append(
                "%s;\n%s" % (pp.subtree.compose_newick(labels=True),
                             pp.get_job_result_by_name(
                                 get_placement_job_name(i))))
    mergeinput.append("")
    mergeinput.append("")
    merge_input_string = "\n".join(mergeinput)
    mergeJsonJob = self.get_merge_job(merge_input_string)
    mergeJsonJob.run()
def merge_subalignments(self):
    """
    Build one extended alignment *per fragment chunk* for the current
    placement subset by merging the hmmalign results of every alignment
    subset for that chunk. Returns a list of ExtendedAlignment objects,
    one per chunk.
    """
    pp = self.placement_problem
    _LOG.info("Merging sub-alignments for placement problem : %s." %
              (pp.label))
    # Fragments assigned to this placement problem: union over children.
    pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
    for ap in pp.get_children():
        pp.fragments.seq_names |= set(ap.fragments)
    # For each alignment subset, the hmmalign result file of every
    # fragment chunk (entries may be falsy when a chunk has no result).
    chunk_files = dict()
    for ap in pp.children:
        assert isinstance(ap, SeppProblem)
        chunk_files[ap] = [fp.get_job_result_by_name('hmmalign')
                           for fp in ap.children]
    # One extended alignment per chunk: merge all subsets' results for
    # chunk number i.
    per_chunk_alignments = []
    for i in range(0, self.root_problem.fragment_chunks):
        chunk_alignment = ExtendedAlignment(pp.fragments.seq_names)
        for ap in pp.children:
            frag_file = chunk_files[ap][i]
            extension = [frag_file] if frag_file else []
            sub_alignment = ap.read_extendend_alignment_and_relabel_columns(
                ap.jobs["hmmbuild"].infile, extension)
            _LOG.debug(
                ("Merging alignment subset into placement subset for "
                 "chunk %d: %s.") % (i, ap.label))
            chunk_alignment.merge_in(
                sub_alignment, convert_to_string=False)
        # The merged alignment carries all fragments; drop those that do
        # not belong to this chunk.
        chunk_alignment.remove_missing_fragments()
        chunk_alignment.from_bytearray_to_string()
        per_chunk_alignments.append(chunk_alignment)
    return per_chunk_alignments
def merge_results(self):
    """
    Merge the extended alignments of all alignment subsets into one
    extended alignment for the single UPP placement subset, storing the
    result in self.results.
    """
    assert \
        len(self.root_problem.get_children()) == 1, \
        "Currently UPP works with only one placement subset."
    pp = self.root_problem.get_children()[0]
    _LOG.info("Merging sub-alignments for placement problem : %s." %
              pp.label)
    # The placement problem's fragments are the union of its children's.
    pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
    for ap in pp.get_children():
        pp.fragments.seq_names |= set(ap.fragments)
    _LOG.debug("fragments are %d:\n %s" %
               (len(pp.fragments.seq_names), pp.fragments.seq_names))
    # Fold every hmmalign result into one extended alignment.
    merged = ExtendedAlignment(pp.fragments.seq_names)
    for ap in pp.children:
        assert isinstance(ap, SeppProblem)
        # Collect hmmalign outputs, skipping chunks without a result.
        results = (fp.get_job_result_by_name('hmmalign')
                   for fp in ap.children)
        aligned_files = [r for r in results if r is not None]
        _LOG.debug("Merging fragment chunks for subalignment : %s." %
                   ap.label)
        sub_alignment = ap.read_extendend_alignment_and_relabel_columns(
            ap.jobs["hmmbuild"].infile, aligned_files)
        _LOG.debug("Merging alignment subset into placement subset: %s." %
                   ap.label)
        merged.merge_in(sub_alignment, convert_to_string=False)
    merged.from_bytearray_to_string()
    self.results = merged
def testExtendedAlignment(self): print "======= starting testExtendedAlignment =========" subset = [ "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI", "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI", "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC", "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD", "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF", "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH", "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD", "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE", "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE", "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF", "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA", "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE", "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII" ] alg = MutableAlignment() alg.read_filepath("data/simulated/test.fasta") alg.delete_all_gap() tlen = alg.get_length() frg = MutableAlignment() frg.read_filepath("data/simulated/test.fas") #print frg.get_num_taxa() pp = SeppProblem(alg.keys()) pp.fragments = frg pp.subalignment = alg cp1 = SeppProblem(subset, pp) cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp) cp1.fragments = ReadonlySubalignment( [k for k in frg.keys() if int(k[-1]) >= 9], frg) cp2.fragments = ReadonlySubalignment( [k for k in frg.keys() if int(k[-1]) <= 1], frg) cp1labels = cp1.write_subalignment_without_allgap_columns( "data/tmp/cp1.fasta") cp2labels = cp2.write_subalignment_without_allgap_columns( "data/tmp/cp2.fasta") tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta") assert all( [not tmp.is_all_gap(pos) for pos in xrange(0, tmp.get_length())]) tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta") assert all( [not tmp.is_all_gap(pos) for pos in xrange(0, tmp.get_length())]) cp1.fragments.write_to_path("data/tmp/cp1.frags.fas") 
cp2.fragments.write_to_path("data/tmp/cp2.frags.fas") '''We have done the hmmalign before. don't worry about that right now''' ext1 = ExtendedAlignment(cp1.fragments) ext1.build_extended_alignment("data/tmp/cp1.fasta", "data/tmp/cp1.extended.sto") ext1.relabel_original_columns(cp1labels) ext2 = ExtendedAlignment(cp2.fragments) ext2.build_extended_alignment("data/tmp/cp2.fasta", "data/tmp/cp2.extended.sto") ext2.relabel_original_columns(cp2labels) extmerger = ExtendedAlignment([]) extmerger.merge_in(ext1) mixed = extmerger.merge_in(ext2) extmerger.write_to_path("data/tmp/extended.merged.fasta") assert extmerger.is_aligned(), "Merged alignment is not aligned" in1 = len([x for x in ext1._col_labels if x < 0]) in2 = len([x for x in ext2._col_labels if x < 0]) print "Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % ( extmerger.get_length(), in1, in2, tlen) assert (in1 + in2 + tlen - mixed) == extmerger.get_length( ), "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d" % ( extmerger.get_length(), in1, in2, tlen, mixed) assert (in1 + in2 - mixed) == len( list(extmerger.iter_insertion_columns()) ), "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d" % ( len(list(extmerger.iter_insertion_columns())), in1, in1, mixed) tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment() tmp.delete_all_gap() assert tmp.is_aligned(), "merged alignment should be aligned!" assert tmp.get_length() == tlen, "merged alignment has wrong length" assert all([alg[k] == s for (k, s) in tmp.items() ]), "merged alignment should match original alignment" print "======= finished testExtendedAlignment ========="
from sepp.scheduler import JobPool from multiprocessing import Pool, Manager from sepp.alignment import ExtendedAlignment import glob job_joiner = JoinAlignJobs original_backbone_file = '/projects/sate8/namphuon/ultra_large/1000000/sate.fasta' original_backbone = MutableAlignment() done = original_backbone.read_filepath(original_backbone_file) original_frag_file = '/projects/sate8/namphuon/ultra_large/1000000/initial.fas.100' original_frag = MutableAlignment() done = original_frag.read_filepath(original_frag_file) #First build extended alignment on entire fragment set extendedAlignment = ExtendedAlignment(original_frag.get_sequence_names()) dirs = glob.glob('/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/temp/upp.1_HNlM/root/P_0/A_0_*/') dirs.reverse() for dir in dirs: print "Working on %s\n" % dir aligned_files = glob.glob('%sFC_*/hmmalign.results.*' % dir) sequence_files = glob.glob('%sFC_*/hmmalign.frag.*' % dir) base_alignment_file = glob.glob('%s/*.fasta' % dir) base_alignment = MutableAlignment() done = base_alignment.read_filepath(base_alignment_file[0]) subbackbone = original_backbone.get_soft_sub_alignment(base_alignment.get_sequence_names()) frags = MutableAlignment() sequence_names = [] for file in sequence_files:
def generate_backbone(self):
    """
    Split the input sequences into full-length (backbone) and fragmentary
    sets, run PASTA to build the backbone alignment and tree, and wire the
    resulting files into options(). Exits the process early when no query
    sequences remain to align.
    """
    _LOG.info("Reading input sequences: %s" %
              (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    sequences.degap()
    fragments = MutableAlignment()
    if (options().median_full_length is not None
            or options().full_length_range is not None):
        if (options().median_full_length == -1):
            # compute the median sequence length
            seq_lengths = sorted(
                [len(seq) for seq in list(sequences.values())])
            lengths = len(seq_lengths)
            l2 = int(lengths / 2)
            # BUG FIX: the branches were inverted -- for an odd count the
            # median is the single middle element, for an even count it is
            # the mean of the two middle elements ([l2 - 1] and [l2]).
            if lengths % 2:
                options().median_full_length = seq_lengths[l2]
            else:
                options().median_full_length = (
                    seq_lengths[l2 - 1] + seq_lengths[l2]) / 2.0
        if options().full_length_range is not None:
            # explicit "min max" range overrides the threshold band
            L = sorted(int(x)
                       for x in options().full_length_range.split())
            min_length = L[0]
            max_length = L[1]
        else:
            (min_length, max_length) = (
                int(options().median_full_length *
                    (1 - options().backbone_threshold)),
                int(options().median_full_length *
                    (1 + options().backbone_threshold)))
        _LOG.info(
            "Full length sequences are set to be from %d to %d character long"
            % (min_length, max_length))
        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length
            or len(sequences[name]) < min_length
        ]
        if (len(frag_names) > 0):
            _LOG.info("Detected %d fragmentary sequences" %
                      len(frag_names))
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)
    if (options().backbone_size is None):
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    if (options().backbone_size > len(list(sequences.keys()))):
        options().backbone_size = len(list(sequences.keys()))
    # sorted sampling pool keeps the draw reproducible for a fixed seed
    sample = sorted(
        random.sample(sorted(list(sequences.keys())),
                      options().backbone_size))
    backbone_sequences = sequences.get_hard_sub_alignment(sample)
    _LOG.debug("Backbone: %s" %
               (sorted(list(backbone_sequences.keys()))))
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)
    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)
    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if (options().molecule == 'amino'):
        # PASTA expects 'protein' rather than 'amino'
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size, moleculeType,
                        options().cpu, **vars(options().pasta))
    pastaalignJob.run()
    (a_file, t_file) = pastaalignJob.read_results()
    shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
    shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))
    options().placement_size = self.options.backbone_size
    options().alignment_file = open(
        self.get_output_filename("pasta.fasta"))
    options().tree_file = open(self.get_output_filename("pasta.fasttree"))
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))
    # the remaining queries are the detected fragments
    sequences.set_alignment(fragments)
    if (len(sequences) == 0):
        # nothing to place: emit the backbone alignment and stop
        sequences = MutableAlignment()
        sequences.read_file_object(open(self.options.alignment_file.name))
        self.results = ExtendedAlignment(fragment_names=[])
        self.results.set_alignment(sequences)
        _LOG.info(
            "No query sequences to align. Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        self.output_results()
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
def merge_results(self):
    """
    Merge per-chunk extended alignment objects into self.results, then
    combine the per-chunk placement jsons: directly in Python when there
    is a single placement subset, otherwise via the external MergeJsonJob.
    """
    assert isinstance(self.root_problem, SeppProblem)
    # Fold every chunk's extended alignment into one.
    full_alignment = ExtendedAlignment(
        self.root_problem.fragments.keys())
    for pp in self.root_problem.get_children():
        for chunk in range(0, self.root_problem.fragment_chunks):
            chunk_alignment = pp.jobs[get_placement_job_name(
                chunk)].get_attribute("full_extended_alignment_object")
            full_alignment.merge_in(chunk_alignment,
                                    convert_to_string=True)
    self.results = full_alignment

    if len(self.root_problem.get_children()) == 1:
        # A single placement subset: concatenate the placements lists in
        # Python; no need to invoke the external (java) merger.
        import json
        mergeinput = []
        for pp in self.root_problem.get_children():
            assert isinstance(pp, SeppProblem)
            for chunk in range(0, self.root_problem.fragment_chunks):
                result_path = pp.get_job_result_by_name(
                    get_placement_job_name(chunk))
                if result_path is None:
                    continue
                with open(result_path) as f:
                    mergeinput.append(json.load(f))
        _LOG.info(
            "There are %d fragment chunks on a single placement subset"
            % len(mergeinput))
        result = mergeinput[0]
        for extra in mergeinput[1:]:
            result["placements"] = result["placements"] + \
                extra["placements"]
        with open(self.get_output_filename("placement.json"), 'w') as f:
            json.dump(result, f, sort_keys=True, indent=4)
    else:
        # Multiple placement subsets: hand the main tree plus each subset
        # tree / json location to MergeJsonJob.
        mergeinput = ["%s;" %
                      (self.root_problem.subtree.compose_newick(
                          labels=True))]
        for pp in self.root_problem.get_children():
            assert isinstance(pp, SeppProblem)
            for chunk in range(0, self.root_problem.fragment_chunks):
                result_path = pp.get_job_result_by_name(
                    get_placement_job_name(chunk))
                if result_path is None:
                    continue
                mergeinput.append(
                    "%s;\n%s" % (pp.subtree.compose_newick(labels=True),
                                 result_path))
        mergeinput.append("")
        mergeinput.append("")
        merged_input_string = "\n".join(mergeinput)
        _LOG.debug(mergeinput)
        mergeJsonJob = MergeJsonJob()
        mergeJsonJob.setup(merged_input_string,
                           self.get_output_filename("placement.json"))
        mergeJsonJob.run()