def backtranslate(faa, fna):
    """Back-translate an amino-acid extended alignment to nucleotide space.

    faa: aligned amino-acid ExtendedAlignment.
    fna: mapping of sequence name -> unaligned nucleotide sequence.

    Returns a new ExtendedAlignment in which every amino-acid column is
    expanded to the three nucleotide columns of the matching codon; gaps
    become '---' and consume no nucleotides.

    Raises ValueError when a codon is not compatible with the aligned
    residue (alternative start codons GTG/TTG are tolerated at the very
    first codon position only).
    """
    newfna = ExtendedAlignment(faa.fragments)
    for name, nuc_seq in fna.items():
        if name not in faa:
            # sequence has no amino-acid counterpart; skip it
            continue
        aa_seq = faa[name].upper()
        codons = []
        pos = 0  # index of the next unconsumed nucleotide in nuc_seq
        for residue in aa_seq:
            codon = nuc_seq[pos:pos + 3]
            if residue == '-':
                # gap column: emit a codon-wide gap, keep pos unchanged
                codons.append('---')
            elif is_compatible(codon, residue):
                codons.append(codon)
                pos += 3
            elif pos == 0 and codon in ('GTG', 'TTG'):
                # tolerate alternative start codons at the first position
                codons.append(codon)
                pos += 3
            else:
                raise ValueError('%s at position %d of %s '
                                 'does not translate to %s'
                                 % (codon, pos, name, residue))
        newfna[name] = ''.join(codons)
    # Each amino-acid column label is repeated three times (one per codon
    # position). Built in one pass; the previous per-label concatenation
    # rebuilt the whole list every iteration (quadratic).
    newfna._col_labels = newfna._col_labels + [
        lab for lab in faa.col_labels for _ in (0, 1, 2)]
    return newfna
def merge_results(self):
    """Merge per-chunk pickled placement alignments into self.results and
    build the newline-separated input string for the json merge job.

    Side effects: sets self.results, creates and runs the merge job.
    """
    assert isinstance(self.root_problem, SeppProblem)
    # Generate single extended alignment
    fullExtendedAlignment = ExtendedAlignment(
        self.root_problem.fragments.keys())
    for pp in self.root_problem.get_children():
        for i in range(0, self.root_problem.fragment_chunks):
            # context manager guarantees the pickle file is closed even
            # when pickle.load raises (the original leaked on error)
            with open(pp.jobs[get_placement_job_name(i)]
                      .full_extended_alignment_file, 'rb') as align_input:
                extended_alignment = pickle.load(align_input)
            fullExtendedAlignment.merge_in(
                extended_alignment, convert_to_string=True)
    self.results = fullExtendedAlignment
    mergeinput = []
    # Append main tree to merge input
    mergeinput.append("%s;" % (
        self.root_problem.subtree.compose_newick(labels=True)))
    for pp in self.root_problem.get_children():
        assert isinstance(pp, SeppProblem)
        for i in range(0, self.root_problem.fragment_chunks):
            # look the job result up once (was queried twice before)
            job_result = pp.get_job_result_by_name(
                get_placement_job_name(i))
            if job_result is None:
                continue
            # Append subset trees and json locations to merge input
            mergeinput.append("%s;\n%s" % (
                pp.subtree.compose_newick(labels=True), job_result))
    # two trailing empty entries terminate the merge input
    mergeinput.append("")
    mergeinput.append("")
    mergeinputstring = "\n".join(mergeinput)
    mergeJsonJob = self.get_merge_job(mergeinputstring)
    mergeJsonJob.run()
def merge_subalignments(self):
    '''
    Merge alignment subset extended alignments to get one extended
    alignment for current placement subset.

    Returns the merged ExtendedAlignment, converted from bytearray to
    string form.
    '''
    pp = self.placement_problem
    _LOG.info("Merging sub-alignments for placement problem : %s."
              % (pp.label))
    ''' First assign fragments to the placement problem'''
    # the placement problem gets the union of its children's fragments
    pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
    for ap in pp.get_children():
        pp.fragments.seq_names |= set(ap.fragments)
    ''' Then Build an extended alignment by merging all hmmalign results'''
    extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
    for ap in pp.children:
        assert isinstance(ap, SeppProblem)
        ''' Get all fragment chunk alignments for this alignment subset'''
        aligned_files = [fp.get_job_result_by_name('hmmalign')
                         for fp in ap.children
                         if fp.get_job_result_by_name('hmmalign')
                         is not None]
        _LOG.info("Merging fragment chunks for subalignment : %s."
                  % (ap.label))
        # read the subset alignment back and restore original column labels
        ap_alg = ap.read_extendend_alignment_and_relabel_columns(
            ap.jobs["hmmbuild"].infile, aligned_files)
        _LOG.info("Merging alignment subset into placement subset: %s."
                  % (ap.label))
        extendedAlignment.merge_in(ap_alg, convert_to_string=False)
        # free the subset alignment before merging the next one
        del ap_alg
    extendedAlignment.from_bytearray_to_string()
    return extendedAlignment
def read_extendend_alignment_and_relabel_columns(
        self, orig_path, extension_path, convert_to_string=True):
    '''
    Counterpart of the write_subalignment_without_allgap_columns method:
    reads back an alignment that was previously written to disk and
    restores the original column labels.

    extension_path is a path to an .sto file (or a list of paths); the
    alignments found there are read as well and merged with the
    original (base) alignment.
    '''
    orig_cols = self.annotations["ref.alignment.columns"]
    assert orig_cols is not None and len(orig_cols) != 0, (
        "Subproblem needs to have a proper list of alignment columns "
        "associated with it")
    _LOG.debug(
        "Reading %s %s and relabeling it based on %d orig column labels."
        % (orig_path, extension_path, len(orig_cols)))
    # build the extended alignment, then map its positional column
    # indices back onto the saved original labels
    extended = ExtendedAlignment(list(self.fragments.keys()))
    extended.build_extended_alignment(
        orig_path, extension_path, convert_to_string)
    extended.relabel_original_columns(orig_cols)
    return extended
def merge_subalignments(self):
    '''
    Merge alignment subset extended alignments to get one extended
    alignment for current placement subset.

    Returns the merged ExtendedAlignment, converted from bytearray to
    string form.
    '''
    pp = self.placement_problem
    _LOG.info("Merging sub-alignments for placement problem : %s." % (
        pp.label))
    ''' First assign fragments to the placement problem'''
    # union of fragment names across all child alignment subsets
    pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
    for ap in pp.get_children():
        pp.fragments.seq_names |= set(ap.fragments)
    ''' Then Build an extended alignment by merging all hmmalign results'''
    extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
    for ap in pp.children:
        assert isinstance(ap, SeppProblem)
        ''' Get all fragment chunk alignments for this alignment subset'''
        aligned_files = [
            fp.get_job_result_by_name('hmmalign')
            for fp in ap.children
            if fp.get_job_result_by_name('hmmalign') is not None
        ]
        _LOG.info("Merging fragment chunks for subalignment : %s." % (
            ap.label))
        # relabel columns so subset alignments can be merged consistently
        ap_alg = ap.read_extendend_alignment_and_relabel_columns(
            ap.jobs["hmmbuild"].infile, aligned_files)
        _LOG.info("Merging alignment subset into placement subset: %s." % (
            ap.label))
        extendedAlignment.merge_in(ap_alg, convert_to_string=False)
        # release memory before the next subset is merged
        del ap_alg
    extendedAlignment.from_bytearray_to_string()
    return extendedAlignment
def read_extendend_alignment_and_relabel_columns(
        self, orig_path, extension_path, convert_to_string=True):
    """
    This method goes with write_subalignment_without_allgap_columns
    method. It enables reading back an alignment that was previously
    written to disk, and relabeling its columns with the original
    labels.

    orig_path: path to the base (backbone) alignment written earlier.
    extension_path: a path to an .sto file (or a list of paths).
    Alignments from these .sto files are also read, and merged with the
    original (base) alignment.
    convert_to_string: forwarded to build_extended_alignment.

    Returns the relabeled ExtendedAlignment.
    """
    # column labels that survived the all-gap-column removal
    remaining_cols = self.annotations["ref.alignment.columns"]
    assert remaining_cols is not None and \
        len(remaining_cols) != 0, \
        ("Subproblem needs to have a proper list of alignment columns "
         "associated with it")
    _LOG.debug(
        "Reading %s %s and relabeling it based on %d orig column labels."
        % (orig_path, extension_path, len(remaining_cols)))
    ap_alg = ExtendedAlignment(list(self.fragments.keys()))
    ap_alg.build_extended_alignment(
        orig_path, extension_path, convert_to_string)
    # map positional column indices back to the original labels
    ap_alg.relabel_original_columns(remaining_cols)
    return ap_alg
def merge_results(self):
    """Merge per-chunk pickled placement alignments into self.results and
    assemble the input string for the json merge job.

    Side effects: sets self.results, creates and runs the merge job.
    """
    assert isinstance(self.root_problem, SeppProblem)
    # Generate single extended alignment
    fullExtendedAlignment = ExtendedAlignment(
        self.root_problem.fragments.keys())
    for pp in self.root_problem.get_children():
        for i in range(0, self.root_problem.fragment_chunks):
            # context manager closes the pickle file even when
            # pickle.load raises (previously leaked on error)
            with open(pp.jobs[get_placement_job_name(i)]
                      .full_extended_alignment_file, 'rb') as align_input:
                extended_alignment = pickle.load(align_input)
            fullExtendedAlignment.merge_in(
                extended_alignment, convert_to_string=True)
    self.results = fullExtendedAlignment
    mergeinput = []
    # Append main tree to merge input
    mergeinput.append("%s;" % (
        self.root_problem.subtree.compose_newick(labels=True)))
    for pp in self.root_problem.get_children():
        assert isinstance(pp, SeppProblem)
        for i in range(0, self.root_problem.fragment_chunks):
            # single lookup (was queried twice per chunk before)
            job_result = pp.get_job_result_by_name(
                get_placement_job_name(i))
            if job_result is None:
                continue
            # Append subset trees and json locations to merge input
            mergeinput.append(
                "%s;\n%s" % (
                    pp.subtree.compose_newick(labels=True), job_result))
    # two trailing empty entries terminate the merge input
    mergeinput.append("")
    mergeinput.append("")
    mergeinputstring = "\n".join(mergeinput)
    mergeJsonJob = self.get_merge_job(mergeinputstring)
    mergeJsonJob.run()
def merge_subalignments(self):
    '''
    Merge alignment subset extended alignments to get one extended
    alignment for current placement subset.

    Unlike the single-alignment variant, this builds one merged
    ExtendedAlignment per fragment chunk and returns them as a list.
    '''
    pp = self.placement_problem
    _LOG.info("Merging sub-alignments for placement problem : %s."
              % (pp.label))
    ''' First find fragments assigned to this placement problem'''
    # union of fragment names across all child alignment subsets
    pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
    for ap in pp.get_children():
        pp.fragments.seq_names |= set(ap.fragments)
    ''' Then, gather a list of all alignments relevant to this placement subset'''
    fragfilesperap = dict()
    for ap in pp.children:
        assert isinstance(ap, SeppProblem)
        ''' Get all fragment chunk alignments for this alignment subset'''
        # one entry per fragment chunk; an entry may be None when the
        # chunk produced no hmmalign result for this subset
        aligned_files = [
            fp.get_job_result_by_name('hmmalign') for fp in ap.children
        ]
        fragfilesperap[ap] = aligned_files
    ''' Now, build an extended alignment *per each fragment chunk*. Simply merge all hmmalign results for fragment chunk numbered i'''
    extendedAlignments = []
    for i in range(0, self.root_problem.fragment_chunks):
        extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
        for ap in pp.children:
            # _LOG.debug("Merging fragment chunks for subalignment : %s."
            #            %(ap.label))
            if fragfilesperap[ap][i]:
                ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                    ap.jobs["hmmbuild"].infile, [fragfilesperap[ap][i]])
            else:
                # no hmmalign result for this chunk: base alignment only
                ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                    ap.jobs["hmmbuild"].infile, [])
            _LOG.debug(
                ("Merging alignment subset into placement subset for "
                 "chunk %d: %s.") % (i, ap.label))
            extendedAlignment.merge_in(ap_alg, convert_to_string=False)
        '''Extended alignmnts have all fragments. remove the ones that don't belong to thsi chunk'''
        extendedAlignment.remove_missing_fragments()
        extendedAlignment.from_bytearray_to_string()
        extendedAlignments.append(extendedAlignment)
    return extendedAlignments
def merge_subalignments(self):
    '''
    Merge alignment subset extended alignments to get one extended
    alignment for current placement subset.

    Builds one merged ExtendedAlignment per fragment chunk and returns
    them as a list.
    '''
    pp = self.placement_problem
    _LOG.info("Merging sub-alignments for placement problem : %s."
              % (pp.label))
    ''' First find fragments assigned to this placement problem'''
    # collect the union of fragment names from all alignment subsets
    pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
    for ap in pp.get_children():
        pp.fragments.seq_names |= set(ap.fragments)
    ''' Then, gather a list of all alignments relevant to this placement subset'''
    fragfilesperap = dict()
    for ap in pp.children:
        assert isinstance(ap, SeppProblem)
        ''' Get all fragment chunk alignments for this alignment subset'''
        # indexed by chunk number; entries may be None for empty chunks
        aligned_files = [fp.get_job_result_by_name('hmmalign')
                         for fp in ap.children]
        fragfilesperap[ap] = aligned_files
    ''' Now, build an extended alignment *per each fragment chunk*. Simply merge all hmmalign results for fragment chunk numbered i'''
    extendedAlignments = []
    for i in range(0, self.root_problem.fragment_chunks):
        extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
        for ap in pp.children:
            # _LOG.debug("Merging fragment chunks for subalignment : %s."
            #            %(ap.label))
            if fragfilesperap[ap][i]:
                ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                    ap.jobs["hmmbuild"].infile, [fragfilesperap[ap][i]])
            else:
                # chunk has no hmmalign output: read base alignment alone
                ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                    ap.jobs["hmmbuild"].infile, [])
            _LOG.debug(
                ("Merging alignment subset into placement subset for "
                 "chunk %d: %s.") % (i, ap.label))
            extendedAlignment.merge_in(ap_alg, convert_to_string=False)
        '''Extended alignmnts have all fragments. remove the ones that don't belong to thsi chunk'''
        extendedAlignment.remove_missing_fragments()
        extendedAlignment.from_bytearray_to_string()
        extendedAlignments.append(extendedAlignment)
    return extendedAlignments
def merge_results(self):
    """Merge the hmmalign results of the single placement subset into
    one ExtendedAlignment stored in self.results (UPP does no placement,
    so exactly one placement subset is expected)."""
    assert \
        len(self.root_problem.get_children()) == 1, \
        "Currently UPP works with only one placement subset."
    ''' Merge alignment subset extended alignments to get one extended alignment for current placement subset. '''
    pp = self.root_problem.get_children()[0]
    _LOG.info("Merging sub-alignments for placement problem : %s."
              % pp.label)
    ''' First assign fragments to the placement problem'''
    # union of fragment names across all alignment subsets
    pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
    for ap in pp.get_children():
        pp.fragments.seq_names |= set(ap.fragments)
    ''' Then Build an extended alignment by merging all hmmalign results'''
    _LOG.debug("fragments are %d:\n %s"
               % (len(pp.fragments.seq_names), pp.fragments.seq_names))
    extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
    for ap in pp.children:
        assert isinstance(ap, SeppProblem)
        ''' Get all fragment chunk alignments for this alignment subset'''
        aligned_files = [
            fp.get_job_result_by_name('hmmalign')
            for fp in ap.children
            if fp.get_job_result_by_name('hmmalign') is not None
        ]
        _LOG.debug("Merging fragment chunks for subalignment : %s."
                   % ap.label)
        # read the subset alignment back with original column labels
        ap_alg = ap.read_extendend_alignment_and_relabel_columns(
            ap.jobs["hmmbuild"].infile, aligned_files)
        _LOG.debug("Merging alignment subset into placement subset: %s."
                   % ap.label)
        extendedAlignment.merge_in(ap_alg, convert_to_string=False)
    extendedAlignment.from_bytearray_to_string()
    self.results = extendedAlignment
class UPPExhaustiveAlgorithm(ExhaustiveAlgorithm):
    '''
    This implements the exhaustive algorithm where all alignments subsets
    are searched for every fragment. This is for UPP, meaning that no
    placement is performed, and that there is always only one placement
    subset (currently).
    '''

    def __init__(self):
        ExhaustiveAlgorithm.__init__(self)
        # when True only the pasta backbone step is performed
        self.pasta_only = False

    def generate_backbone(self):
        """Build a PASTA backbone alignment/tree from the input sequences.

        Splits the input into full-length (backbone candidates) and
        fragmentary sequences, samples a backbone, runs PASTA on it, and
        rewires options() so the rest of the pipeline sees the generated
        backbone alignment/tree and the fragment (query) file.

        Side effects: writes pasta.fasta / pasta.fasttree, mutates
        options(), and exits the process when no query sequences remain.
        """
        _LOG.info("Reading input sequences: %s"
                  % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None
                or options().full_length_range is not None):
            if (options().median_full_length == -1):
                # -1 means: estimate the median length from the data
                seq_lengths = sorted(
                    [len(seq) for seq in list(sequences.values())])
                lengths = len(seq_lengths)
                l2 = int(lengths / 2)
                # BUG FIX: the two median branches were swapped. The old
                # code averaged seq_lengths[l2] and seq_lengths[l2 + 1]
                # for an ODD count (the middle element alone is the
                # median, and l2 + 1 raised IndexError for a single
                # sequence) and took a single element for an EVEN count.
                if lengths % 2:
                    options().median_full_length = seq_lengths[l2]
                else:
                    options().median_full_length = (
                        seq_lengths[l2 - 1] + seq_lengths[l2]) / 2.0
            if options().full_length_range is not None:
                # an explicit user-provided range takes precedence
                L = sorted(int(x)
                           for x in options().full_length_range.split())
                min_length = L[0]
                max_length = L[1]
            else:
                # window around the median scaled by the threshold
                (min_length, max_length) = (
                    int(options().median_full_length * (
                        1 - options().backbone_threshold)),
                    int(options().median_full_length * (
                        1 + options().backbone_threshold)))
            _LOG.info(
                "Full length sequences are set to be from %d to %d character long"
                % (min_length, max_length))
            frag_names = [
                name for name in sequences
                if len(sequences[name]) > max_length or
                len(sequences[name]) < min_length]
            if (len(frag_names) > 0):
                _LOG.info("Detected %d fragmentary sequences"
                          % len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                # plain loop instead of a side-effect list comprehension
                for frag_name in list(fragments.keys()):
                    sequences.pop(frag_name)
        if (options().backbone_size is None):
            options().backbone_size = min(1000,
                                          int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d"
                      % (options().backbone_size))
        if (options().backbone_size > len(list(sequences.keys()))):
            options().backbone_size = len(list(sequences.keys()))
        # sort before sampling so the choice depends only on RNG state
        sample = sorted(random.sample(
            sorted(list(sequences.keys())), options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s"
                   % (sorted(list(backbone_sequences.keys()))))
        for backbone_name in list(backbone_sequences.keys()):
            sequences.pop(backbone_name)
        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)
        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            # pasta expects 'protein' rather than 'amino'
            moleculeType = 'protein'
        pastaalignJob.setup(backbone, options().backbone_size,
                            moleculeType, options().cpu,
                            **vars(options().pasta))
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()
        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))
        options().placement_size = self.options.backbone_size
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(
            self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s"
            % (options().alignment_file, options().tree_file))
        # queries = remaining full-length sequences plus the fragments
        sequences.set_alignment(fragments)
        if (len(sequences) == 0):
            # nothing to align: emit the backbone alignment and stop
            sequences = MutableAlignment()
            sequences.read_file_object(
                open(self.options.alignment_file.name))
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align. Final alignment saved as %s"
                % self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)

    def check_options(self):
        """Validate the input file combination and derive defaults.

        Either the backbone tree+alignment+sequence files are all given,
        or only the sequence file is given (backbone generated here);
        any other combination is an error.
        """
        self.check_outputprefix()
        options().info_file = "A_dummy_value"
        # Check to see if tree/alignment/fragment file provided, if not,
        # generate it from sequence file
        if ((options().tree_file is not None) and
                (options().alignment_file is not None) and
                (options().sequence_file is not None)):
            options().fragment_file = options().sequence_file
        elif ((options().tree_file is None) and
              (options().alignment_file is None) and
              (options().sequence_file is not None)):
            self.generate_backbone()
        else:
            _LOG.error(
                ("Either specify the backbone alignment and tree and query "
                 "sequences or only the query sequences. Any other "
                 "combination is invalid"))
            exit(-1)
        sequences = MutableAlignment()
        sequences.read_file_object(open(self.options.alignment_file.name))
        backbone_size = sequences.get_num_taxa()
        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, (
            ("Backbone parameter needs to match actual size of backbone; "
             "backbone parameter:%s backbone_size:%s")
            % (options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size
        if options().backtranslation_sequence_file and \
                options().molecule != "amino":
            _LOG.error(("Backtranslation can be performed only when "
                        "input sequences are amino acid. "))
            exit(-1)
        return ExhaustiveAlgorithm.check_options(self)

    def merge_results(self):
        """Merge all alignment-subset hmmalign results into self.results
        (one ExtendedAlignment); UPP has exactly one placement subset."""
        assert \
            len(self.root_problem.get_children()) == 1, \
            "Currently UPP works with only one placement subset."
        # Merge alignment subset extended alignments to get one extended
        # alignment for the current placement subset.
        pp = self.root_problem.get_children()[0]
        _LOG.info("Merging sub-alignments for placement problem : %s."
                  % (pp.label))
        # First assign fragments to the placement problem
        pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
        for ap in pp.get_children():
            pp.fragments.seq_names |= set(ap.fragments)
        # Then build an extended alignment by merging all hmmalign results
        _LOG.debug("fragments are %d:\n %s"
                   % (len(pp.fragments.seq_names), pp.fragments.seq_names))
        extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
        for ap in pp.children:
            assert isinstance(ap, SeppProblem)
            # all fragment chunk alignments for this alignment subset
            aligned_files = [
                fp.get_job_result_by_name('hmmalign')
                for fp in ap.children
                if fp.get_job_result_by_name('hmmalign') is not None]
            _LOG.debug("Merging fragment chunks for subalignment : %s."
                       % (ap.label))
            ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                ap.jobs["hmmbuild"].infile, aligned_files)
            _LOG.debug(
                "Merging alignment subset into placement subset: %s."
                % (ap.label))
            extendedAlignment.merge_in(ap_alg, convert_to_string=False)
        extendedAlignment.from_bytearray_to_string()
        self.results = extendedAlignment

    # NOTE: a large commented-out parallel (multi-process) merge
    # implementation used to live here; it was removed as dead code —
    # recover it from version control if multi-core merging is needed.

    def output_results(self):
        """Write the unmasked and masked alignments (and, when a
        backtranslation sequence file is given, their backtranslated
        counterparts) to the output directory."""
        extended_alignment = self.results
        _LOG.info("Generating output. ")
        outfilename = self.get_output_filename("alignment.fasta")
        extended_alignment.write_to_path(outfilename)
        _LOG.info("Unmasked alignment written to %s" % outfilename)
        outfilename = self.get_output_filename("insertion_columns.txt")
        extended_alignment.write_insertion_column_indexes(outfilename)
        _LOG.info("The index of insertion columns written to %s"
                  % outfilename)
        if self.options.backtranslation_sequence_file:
            outfilename = self.get_output_filename(
                "backtranslated_alignment.fasta")
            backtranslation_seqs = MutableAlignment()
            backtranslation_seqs.read_file_object(
                self.options.backtranslation_sequence_file)
            try:
                extended_backtranslated_alignment = backtranslate(
                    self.results, backtranslation_seqs)
            except Exception as e:
                # deliberate best-effort: a failed backtranslation must
                # not abort the remaining output
                _LOG.warning("Backtranslation failed due "
                             "to following error: " + str(e) + ".\n"
                             "No translated DNA sequence will be "
                             "written to a file.")
            else:
                extended_backtranslated_alignment.write_to_path(
                    outfilename)
                _LOG.info("Backtranslated alignment written to %s"
                          % outfilename)
                extended_backtranslated_alignment.remove_insertion_columns()
                outfilename = self.get_output_filename(
                    "backtranslated_alignment_masked.fasta")
                extended_backtranslated_alignment.write_to_path(
                    outfilename)
                _LOG.info("Backtranslated masked alignment written "
                          "to %s" % outfilename)
        extended_alignment.remove_insertion_columns()
        outfilename = self.get_output_filename("alignment_masked.fasta")
        extended_alignment.write_to_path(outfilename)
        _LOG.info("Masked alignment written to %s" % outfilename)

    def check_and_set_sizes(self, total):
        """UPP allows only one placement subset covering all taxa."""
        assert (self.options.placement_size is None) or (
            self.options.placement_size >= total), \
            ("currently UPP works with only one placement subset."
             " Please leave placement subset size option blank.")
        ExhaustiveAlgorithm.check_and_set_sizes(self, total)
        self.options.placement_size = total

    def _get_new_Join_Align_Job(self):
        """Factory hook: UPP uses its own join job (no placement)."""
        return UPPJoinAlignJobs()

    def modify_tree(self, a_tree):
        ''' Filter out taxa on long branches '''
        self.filtered_taxa = []
        if self.options.long_branch_filter is not None:
            tr = a_tree.get_tree()
            elen = {}
            for e in tr.leaf_edge_iter():
                elen[e] = e.length
            elensort = sorted(elen.values())
            # BUG FIX: floor division; `len(...) / 2` is a float under
            # Python 3 and raised TypeError when used as a list index.
            mid = elensort[len(elensort) // 2]
            torem = []
            for k, v in list(elen.items()):
                # prune leaves whose terminal branch exceeds the filter
                # multiple of the median-ish branch length
                if v > mid * self.options.long_branch_filter:
                    self.filtered_taxa.append(k.head_node.taxon.label)
                    torem.append(k.head_node.taxon)
            tr.prune_taxa(torem)

    def create_fragment_files(self):
        """Split query fragments into a chunk count that divides evenly
        across the alignment subsets and available cpus."""
        alg_subset_count = len(list(self.root_problem.iter_leaves()))
        frag_chunk_count = lcm(
            alg_subset_count, self.options.cpu) // alg_subset_count
        _LOG.info("%d taxa pruned from backbone and added to fragments: %s"
                  % (len(self.filtered_taxa),
                     " , ".join(self.filtered_taxa)))
        return self.read_and_divide_fragments(
            frag_chunk_count,
            extra_frags=self.root_problem.subalignment
            .get_soft_sub_alignment(self.filtered_taxa))
def generate_backbone(self):
    """Build a PASTA backbone alignment/tree from the input sequences.

    Separates fragmentary sequences from full-length ones (by a window
    around the median length), samples a backbone from the full-length
    pool, runs PASTA on it, and rewires options() so the rest of the
    pipeline uses the generated backbone and the fragment (query) file.
    Exits the process when no query sequences remain to align.
    """
    _LOG.info("Reading input sequences: %s"
              % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    sequences.degap()
    fragments = MutableAlignment()
    if (options().median_full_length is not None):
        if (options().median_full_length == -1):
            # -1 means: estimate the median length from the data
            seq_lengths = sorted(
                [len(seq) for seq in list(sequences.values())])
            lengths = len(seq_lengths)
            l2 = int(lengths / 2)
            # NOTE(review): the branches look swapped — for an odd count
            # the median is seq_lengths[l2] alone, and for a single
            # sequence seq_lengths[l2 + 1] raises IndexError. Confirm
            # and fix.
            if lengths % 2:
                options().median_full_length = (
                    seq_lengths[l2] + seq_lengths[l2 + 1]) / 2.0
            else:
                options().median_full_length = seq_lengths[l2]
        # full-length window around the median, scaled by the threshold
        (min_length, max_length) = (
            int(options().median_full_length * (
                1 - options().backbone_threshold)),
            int(options().median_full_length*(
                1 + options().backbone_threshold)))
        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length or
            len(sequences[name]) < min_length]
        if (len(frag_names) > 0):
            _LOG.info(
                "Detected %d fragmentary sequences" % len(frag_names))
            fragments = sequences.get_hard_sub_alignment(frag_names)
            # drop the fragments from the full-length pool
            [sequences.pop(i) for i in list(fragments.keys())]
    if (options().backbone_size is None):
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    if (options().backbone_size > len(list(sequences.keys()))):
        options().backbone_size = len(list(sequences.keys()))
    # sort before sampling so the choice depends only on RNG state
    sample = sorted(random.sample(
        sorted(list(sequences.keys())), options().backbone_size))
    backbone_sequences = sequences.get_hard_sub_alignment(sample)
    _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
    # remove the sampled backbone sequences from the query pool
    [sequences.pop(i) for i in list(backbone_sequences.keys())]
    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)
    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if (options().molecule == 'amino'):
        # pasta calls amino-acid data 'protein'
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size,
                        moleculeType, options().cpu)
    pastaalignJob.run()
    (a_file, t_file) = pastaalignJob.read_results()
    shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
    shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))
    options().placement_size = self.options.backbone_size
    options().alignment_file = open(
        self.get_output_filename("pasta.fasta"))
    options().tree_file = open(self.get_output_filename("pasta.fasttree"))
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))
    # queries = remaining full-length sequences plus the fragments
    sequences.set_alignment(fragments)
    if (len(sequences) == 0):
        # nothing to align: emit the backbone alignment and stop
        sequences = MutableAlignment()
        sequences.read_file_object(open(self.options.alignment_file.name))
        self.results = ExtendedAlignment(fragment_names=[])
        self.results.set_alignment(sequences)
        _LOG.info(
            "No query sequences to align. Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        self.output_results()
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
class UPPExhaustiveAlgorithm(ExhaustiveAlgorithm):
    '''
    This implements the exhaustive algorithm where all alignments subsets
    are searched for every fragment. This is for UPP, meaning that no
    placement is performed, and that there is always only one placement
    subset (currently).
    '''

    def __init__(self):
        ExhaustiveAlgorithm.__init__(self)
        # UPP never runs placement; kept as a flag for PASTA-only runs.
        self.pasta_only = False

    def generate_backbone(self):
        """Build a PASTA backbone alignment/tree from the input sequences.

        Splits off fragmentary sequences around the median full length,
        samples a backbone of up to ``backbone_size`` sequences, aligns it
        with PASTA, and rewires the global ``options()`` so the rest of
        the pipeline uses the generated backbone. May ``sys.exit(0)`` if
        no query sequences remain.
        """
        _LOG.info("Reading input sequences: %s" %
                  (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None):
            if (options().median_full_length == -1):
                seq_lengths = sorted(
                    [len(seq) for seq in list(sequences.values())])
                lengths = len(seq_lengths)
                l2 = lengths // 2
                # BUGFIX: parity test was inverted (odd counts averaged
                # seq_lengths[l2] and seq_lengths[l2 + 1], which is wrong
                # and IndexErrors for a single sequence). Odd count: the
                # middle element IS the median; even count: average the
                # two middle elements.
                if lengths % 2:
                    options().median_full_length = seq_lengths[l2]
                else:
                    options().median_full_length = (
                        seq_lengths[l2 - 1] + seq_lengths[l2]) / 2.0
            # Sequences outside [median*(1-t), median*(1+t)] are fragments.
            (min_length, max_length) = (
                int(options().median_full_length * (
                    1 - options().backbone_threshold)),
                int(options().median_full_length * (
                    1 + options().backbone_threshold)))
            frag_names = [
                name for name in sequences
                if len(sequences[name]) > max_length or
                len(sequences[name]) < min_length]
            if (len(frag_names) > 0):
                _LOG.info(
                    "Detected %d fragmentary sequences" % len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                [sequences.pop(i) for i in list(fragments.keys())]
        if (options().backbone_size is None):
            options().backbone_size = min(1000,
                                          int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" %
                      (options().backbone_size))
        if (options().backbone_size > len(list(sequences.keys()))):
            options().backbone_size = len(list(sequences.keys()))
        # Sort before sampling so selection is reproducible per seed.
        sample = sorted(random.sample(
            sorted(list(sequences.keys())), options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" %
                   (sorted(list(backbone_sequences.keys()))))
        [sequences.pop(i) for i in list(backbone_sequences.keys())]

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if (options().molecule == 'amino'):
            # PASTA expects 'protein' where SEPP options use 'amino'.
            moleculeType = 'protein'
        pastaalignJob.setup(backbone, options().backbone_size,
                            moleculeType, options().cpu)
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()
        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))
        # Point the global options at the freshly generated backbone.
        options().placement_size = self.options.backbone_size
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(
            self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s"
            % (options().alignment_file, options().tree_file))
        # Remaining (non-backbone) sequences plus fragments are queries.
        sequences.set_alignment(fragments)
        if (len(sequences) == 0):
            # Nothing left to align: emit the backbone alignment and stop.
            sequences = MutableAlignment()
            sequences.read_file_object(
                open(self.options.alignment_file.name))
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align. Final alignment saved as %s"
                % self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)

    def check_options(self):
        """Validate the tree/alignment/sequence option combination.

        Either all three of tree, alignment and sequence files are given
        (sequences become the fragments), or only the sequence file is
        given (a backbone is generated). Any other combination aborts.
        Also reconciles ``backbone_size``/``placement_size`` with the
        actual backbone alignment.
        """
        self.check_outputprefix()
        options().info_file = "A_dummy_value"

        # Check to see if tree/alignment/fragment file provided, if not,
        # generate it from sequence file
        if (
            (not options().tree_file is None) and
            (not options().alignment_file is None) and
            (not options().sequence_file is None)
        ):
            options().fragment_file = options().sequence_file
        elif (
            (options().tree_file is None) and
            (options().alignment_file is None) and
            (not options().sequence_file is None)
        ):
            self.generate_backbone()
        else:
            _LOG.error(
                ("Either specify the backbone alignment and tree and query "
                 "sequences or only the query sequences. Any other "
                 "combination is invalid"))
            exit(-1)
        sequences = MutableAlignment()
        sequences.read_file_object(open(self.options.alignment_file.name))
        backbone_size = sequences.get_num_taxa()
        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, (
            ("Backbone parameter needs to match actual size of backbone; "
             "backbone parameter:%s backbone_size:%s")
            % (options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size
        return ExhaustiveAlgorithm.check_options(self)

    def merge_results(self):
        """Merge all alignment-subset extended alignments into one.

        UPP has exactly one placement subset; all hmmalign results of its
        alignment subsets are merged into a single ExtendedAlignment
        stored in ``self.results``.
        """
        assert \
            len(self.root_problem.get_children()) == 1, \
            "Currently UPP works with only one placement subset."
        '''
        Merge alignment subset extended alignments to get one extended
        alignment for current placement subset.
        '''
        pp = self.root_problem.get_children()[0]
        _LOG.info(
            "Merging sub-alignments for placement problem : %s."
            % (pp.label))
        ''' First assign fragments to the placement problem'''
        pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
        for ap in pp.get_children():
            pp.fragments.seq_names |= set(ap.fragments)

        ''' Then Build an extended alignment by merging all hmmalign
        results'''
        _LOG.debug(
            "fragments are %d:\n %s" % (
                len(pp.fragments.seq_names), pp.fragments.seq_names))
        extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
        for ap in pp.children:
            assert isinstance(ap, SeppProblem)
            ''' Get all fragment chunk alignments for this alignment
            subset'''
            aligned_files = [
                fp.get_job_result_by_name('hmmalign')
                for fp in ap.children
                if fp.get_job_result_by_name('hmmalign') is not None]
            _LOG.debug(
                "Merging fragment chunks for subalignment : %s."
                % (ap.label))
            ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                ap.jobs["hmmbuild"].infile, aligned_files)
            _LOG.debug(
                "Merging alignment subset into placement subset: %s."
                % (ap.label))
            # Keep bytearrays during the merge loop; convert once at end.
            extendedAlignment.merge_in(ap_alg, convert_to_string=False)

        extendedAlignment.from_bytearray_to_string()
        self.results = extendedAlignment

    # Useful for multi-core merging if ever needed
    # def parallel_merge_results(self):
    #     assert len(self.root_problem.get_children()) == 1, "Currently UPP
    # works with only one placement subset."
    #     '''
    #     Merge alignment subset extended alignments to get one extended
    #     alignment
    #     for current placement subset.
    #     '''
    #     pp = self.root_problem.get_children()[0]
    #     _LOG.info("Merging sub-alignments for placement problem : %s."
    #               %(pp.label))
    #     ''' Then Build an extended alignment by merging all hmmalign
    #     results'''
    #     manager = Manager()
    #     extendedAlignments = manager.list()
    #     for ap in pp.children:
    #         assert isinstance(ap, SeppProblem)
    #         ''' Get all fragment chunk alignments for this alignment
    #         subset'''
    #         aligned_files = [fp.get_job_result_by_name('hmmalign') for
    #                          fp in ap.children if
    #                          fp.get_job_result_by_name('hmmalign')
    #                          is not None]
    #         _LOG.info("Merging fragment chunks for subalignment : %s."
    #                   %(ap.label))
    #         ap_alg = ap.read_extendend_alignment_and_relabel_columns\
    #             (ap.jobs["hmmbuild"].infile , aligned_files)
    #         _LOG.info("Merging alignment subset into placement subset: %s."
    #                   %(ap.label))
    #         extendedAlignments.append(ap_alg)
    #
    #     while len(extendedAlignments)>1:
    #         a=range(0,len(extendedAlignments))
    #         #print [len(x) for x in extendedAlignments]
    #         x = zip(a[0::2],a[1::2])
    #         mapin = zip (x,[extendedAlignments]*len(x))
    #         _LOG.debug("One round of merging started. Currently have %d
    # alignments left. " %len(extendedAlignments))
    #         Pool(max(12,len(extendedAlignments))).map(mergetwo,mapin)
    #         #print [len(x) if x is not None else "None" for x in
    # extendedAlignments]
    #         extendedAlignments = manager.list([x for x in
    # extendedAlignments if x is not None])
    #         extendedAlignments.reverse()
    #         _LOG.debug("One round of merging finished. Still have %d
    # alignments left. " %len(extendedAlignments))
    #     extendedAlignment = extendedAlignments[0]
    #     extendedAlignment.from_bytearray_to_string()
    #     self.results = extendedAlignment

    def output_results(self):
        """Write unmasked alignment, insertion-column index, and the
        masked (insertion-columns-removed) alignment to the output dir."""
        extended_alignment = self.results
        _LOG.info("Generating output. ")
        outfilename = self.get_output_filename("alignment.fasta")
        extended_alignment.write_to_path(outfilename)
        _LOG.info("Unmasked alignment written to %s" % outfilename)
        outfilename = self.get_output_filename("insertion_columns.txt")
        extended_alignment.write_insertion_column_indexes(outfilename)
        _LOG.info("The index of insertion columns written to %s"
                  % outfilename)
        # NOTE: mutates self.results in place (insertion columns removed).
        extended_alignment.remove_insertion_columns()
        outfilename = self.get_output_filename("alignment_masked.fasta")
        extended_alignment.write_to_path(outfilename)
        _LOG.info("Masked alignment written to %s" % outfilename)

    def check_and_set_sizes(self, total):
        """Force a single placement subset of size ``total``."""
        assert (self.options.placement_size is None) or (
            self.options.placement_size >= total), \
            ("currently UPP works with only one placement subset."
             " Please leave placement subset size option blank.")
        ExhaustiveAlgorithm.check_and_set_sizes(self, total)
        self.options.placement_size = total

    def _get_new_Join_Align_Job(self):
        """Return the UPP-specific join-align job."""
        return UPPJoinAlignJobs()

    def modify_tree(self, a_tree):
        ''' Filter out taxa on long branches '''
        self.filtered_taxa = []
        if self.options.long_branch_filter is not None:
            tr = a_tree.get_tree()
            elen = {}
            for e in tr.leaf_edge_iter():
                elen[e] = e.length
            elensort = sorted(elen.values())
            # BUGFIX: len(...) / 2 is a float on Python 3 and raises
            # TypeError when used as a list index; use floor division.
            mid = elensort[len(elensort) // 2]
            torem = []
            for k, v in list(elen.items()):
                # Prune leaves whose terminal branch exceeds
                # filter * (upper-)median leaf-edge length.
                if v > mid * self.options.long_branch_filter:
                    self.filtered_taxa.append(k.head_node.taxon.label)
                    torem.append(k.head_node.taxon)
            tr.prune_taxa(torem)

    def create_fragment_files(self):
        """Split fragments into chunks (one per alignment subset per CPU
        share), adding any long-branch-filtered backbone taxa back in as
        extra fragments."""
        alg_subset_count = len(list(self.root_problem.iter_leaves()))
        frag_chunk_count = lcm(
            alg_subset_count, self.options.cpu) // alg_subset_count
        _LOG.info(
            "%d taxa pruned from backbone and added to fragments: %s"
            % (len(self.filtered_taxa), " , ".join(self.filtered_taxa)))
        return self.read_and_divide_fragments(
            frag_chunk_count,
            extra_frags=self.root_problem.subalignment.get_soft_sub_alignment(
                self.filtered_taxa))
writer.write(line.upper()) else: writer.write(line) original_backbone = MutableAlignment() done = original_backbone.read_filepath(new_backbone_file) # all query sequences original_frag_file = ( '/Users/gillianchu/warnow/bin/gitrepos/smirarab-sepp-17a33aa/trial/all_query.txt' ) original_frag = MutableAlignment() done = original_frag.read_filepath(original_frag_file) # First build extended alignment on entire fragment set extendedAlignment = ExtendedAlignment(original_frag.get_sequence_names()) dir = '/Users/gillianchu/warnow/bin/gitrepos/smirarab-sepp-17a33aa/trial/' for a in [1, 2]: a = str(a) print("Working on HMM %s\n" % a) # query sequences aligned_files = glob.glob(str(dir) + 's' + str(a) + '_query.aln') if a == '1': sequence_files = glob.glob(str(dir) + 'query_x.txt') elif a == '2': sequence_files = glob.glob(str(dir) + 'query_y.txt') # sequences your hmm was trained on. Ensure you didn't just take the backbone alignment and # restrict the subset of sequences. This file must not have any gaps in it.
def testExtendedAlignment(self):
    """Round-trip test of ExtendedAlignment build/merge.

    Fixed: Python-2-only ``print`` statements -> ``print()``,
    ``xrange`` -> ``range``, and the last assert message passed ``in1``
    twice where the second argument should be ``in2``.
    """
    print("======= starting testExtendedAlignment =========")
    subset = [
        "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI",
        "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI",
        "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC",
        "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD",
        "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF",
        "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH",
        "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD",
        "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE",
        "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE",
        "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF",
        "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA",
        "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE",
        "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII"
    ]
    # Base (backbone) alignment and its gapless length.
    alg = MutableAlignment()
    alg.read_filepath("data/simulated/test.fasta")
    alg.delete_all_gap()
    tlen = alg.get_length()

    # Fragment set to be inserted into the alignment.
    frg = MutableAlignment()
    frg.read_filepath("data/simulated/test.fas")
    # print(frg.get_num_taxa())

    pp = SeppProblem(alg.keys())
    pp.fragments = frg
    pp.subalignment = alg

    # Two overlapping-fragment subproblems over disjoint taxon subsets.
    cp1 = SeppProblem(subset, pp)
    cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp)
    cp1.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) >= 9], frg)
    cp2.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) <= 1], frg)

    cp1labels = cp1.write_subalignment_without_allgap_columns(
        "data/tmp/cp1.fasta")
    cp2labels = cp2.write_subalignment_without_allgap_columns(
        "data/tmp/cp2.fasta")
    tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
    assert all(
        [not tmp.is_all_gap(pos) for pos in range(0, tmp.get_length())])
    tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
    assert all(
        [not tmp.is_all_gap(pos) for pos in range(0, tmp.get_length())])

    cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
    cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")

    '''We have done the hmmalign before. don't worry about that right
    now'''

    ext1 = ExtendedAlignment(cp1.fragments)
    ext1.build_extended_alignment("data/tmp/cp1.fasta",
                                  "data/tmp/cp1.extended.sto")
    ext1.relabel_original_columns(cp1labels)
    ext2 = ExtendedAlignment(cp2.fragments)
    ext2.build_extended_alignment("data/tmp/cp2.fasta",
                                  "data/tmp/cp2.extended.sto")
    ext2.relabel_original_columns(cp2labels)

    extmerger = ExtendedAlignment([])
    extmerger.merge_in(ext1)
    mixed = extmerger.merge_in(ext2)

    extmerger.write_to_path("data/tmp/extended.merged.fasta")

    assert extmerger.is_aligned(), "Merged alignment is not aligned"
    # Negative column labels mark insertion columns.
    in1 = len([x for x in ext1._col_labels if x < 0])
    in2 = len([x for x in ext2._col_labels if x < 0])
    print("Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
        extmerger.get_length(), in1, in2, tlen))
    assert (in1 + in2 + tlen - mixed) == extmerger.get_length(
    ), "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d" % (
        extmerger.get_length(), in1, in2, tlen, mixed)
    assert (in1 + in2 - mixed) == len(
        list(extmerger.iter_insertion_columns())
    ), "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d" % (
        len(list(extmerger.iter_insertion_columns())), in1, in2, mixed)

    tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
    tmp.delete_all_gap()
    assert tmp.is_aligned(), "merged alignment should be aligned!"
    assert tmp.get_length() == tlen, "merged alignment has wrong length"
    assert all([alg[k] == s for (k, s) in tmp.items()
                ]), "merged alignment should match original alignment"

    print("======= finished testExtendedAlignment =========")
from sepp.scheduler import JobPool from multiprocessing import Pool, Manager from sepp.alignment import ExtendedAlignment import glob job_joiner = JoinAlignJobs original_backbone_file = '/projects/sate8/namphuon/ultra_large/1000000/sate.fasta' original_backbone = MutableAlignment() done = original_backbone.read_filepath(original_backbone_file) original_frag_file = '/projects/sate8/namphuon/ultra_large/1000000/initial.fas.100' original_frag = MutableAlignment() done = original_frag.read_filepath(original_frag_file) #First build extended alignment on entire fragment set extendedAlignment = ExtendedAlignment(original_frag.get_sequence_names()) dirs = glob.glob('/projects/sate8/namphuon/ultra_large/1000000/upp_100_10_new/temp/upp.1_HNlM/root/P_0/A_0_*/') dirs.reverse() for dir in dirs: print "Working on %s\n" % dir aligned_files = glob.glob('%sFC_*/hmmalign.results.*' % dir) sequence_files = glob.glob('%sFC_*/hmmalign.frag.*' % dir) base_alignment_file = glob.glob('%s/*.fasta' % dir) base_alignment = MutableAlignment() done = base_alignment.read_filepath(base_alignment_file[0]) subbackbone = original_backbone.get_soft_sub_alignment(base_alignment.get_sequence_names()) frags = MutableAlignment() sequence_names = [] for file in sequence_files:
def testExtendedAlignment(self):
    """Round-trip test of ExtendedAlignment build/merge.

    Fixed: Python-2-only ``print`` statements -> ``print()``,
    ``xrange`` -> ``range``, and the last assert message passed ``in1``
    twice where the second argument should be ``in2``; reformatted to
    PEP 8.
    """
    print("======= starting testExtendedAlignment =========")
    subset = [
        "SFIF", "SFII", "SCFC", "SGHD", "SDCC", "SBGE", "SFBB", "SDI",
        "SCGB", "SJGF", "SGBI", "SCJA", "SGAD", "SHEB", "SFHB", "SDJI",
        "SHED", "SJJJ", "SBBE", "SCCH", "SDJB", "SDAC", "SHEH", "SFDC",
        "SFEI", "SHHB", "SC", "SIAB", "SDDI", "SBCB", "SJB", "SEBD",
        "SFGD", "SHA", "SIDA", "SGHI", "SGIB", "SBFJ", "SFIE", "SCJF",
        "SJHJ", "SJBG", "SEJI", "SFFF", "SJ", "SIII", "SJHH", "SEIH",
        "SBDC", "SHDJ", "SJDD", "SGDB", "SIHA", "SIBB", "SECC", "SCAD",
        "SGBB", "SGIF", "SJHC", "SFCD", "SEAA", "SEFF", "SDFG", "SDJE",
        "SCFG", "SFH", "SCJ", "SDDD", "SEGD", "SCIH", "SDAG", "SCJE",
        "SFAJ", "SIDJ", "SE", "SHBC", "SJFF", "SCHD", "SBHA", "SEDF",
        "SFAF", "SEDD", "SDHD", "SGJD", "SIBH", "SGDF", "SIFA", "SJGA",
        "SIJB", "SFI", "SGA", "SBFC", "SBJA", "SFFC", "SFDH", "SFEE",
        "SBDF", "SGBJ", "SDHE", "SJIB", "SHHI", "SIDE", "SJII"
    ]
    # Base (backbone) alignment and its gapless length.
    alg = MutableAlignment()
    alg.read_filepath("data/simulated/test.fasta")
    alg.delete_all_gap()
    tlen = alg.get_length()

    # Fragment set to be inserted into the alignment.
    frg = MutableAlignment()
    frg.read_filepath("data/simulated/test.fas")
    # print(frg.get_num_taxa())

    pp = SeppProblem(alg.keys())
    pp.fragments = frg
    pp.subalignment = alg

    # Two subproblems over disjoint taxon subsets with their fragments.
    cp1 = SeppProblem(subset, pp)
    cp2 = SeppProblem(list(set(alg.keys()) - set(subset)), pp)
    cp1.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) >= 9], frg)
    cp2.fragments = ReadonlySubalignment(
        [k for k in frg.keys() if int(k[-1]) <= 1], frg)

    cp1labels = cp1.write_subalignment_without_allgap_columns(
        "data/tmp/cp1.fasta")
    cp2labels = cp2.write_subalignment_without_allgap_columns(
        "data/tmp/cp2.fasta")
    tmp = MutableAlignment().read_filepath("data/tmp/cp1.fasta")
    assert all(
        [not tmp.is_all_gap(pos) for pos in range(0, tmp.get_length())])
    tmp = MutableAlignment().read_filepath("data/tmp/cp2.fasta")
    assert all(
        [not tmp.is_all_gap(pos) for pos in range(0, tmp.get_length())])

    cp1.fragments.write_to_path("data/tmp/cp1.frags.fas")
    cp2.fragments.write_to_path("data/tmp/cp2.frags.fas")

    '''We have done the hmmalign before. don't worry about that right
    now'''

    ext1 = ExtendedAlignment(cp1.fragments)
    ext1.build_extended_alignment("data/tmp/cp1.fasta",
                                  "data/tmp/cp1.extended.sto")
    ext1.relabel_original_columns(cp1labels)
    ext2 = ExtendedAlignment(cp2.fragments)
    ext2.build_extended_alignment("data/tmp/cp2.fasta",
                                  "data/tmp/cp2.extended.sto")
    ext2.relabel_original_columns(cp2labels)

    extmerger = ExtendedAlignment([])
    extmerger.merge_in(ext1)
    mixed = extmerger.merge_in(ext2)

    extmerger.write_to_path("data/tmp/extended.merged.fasta")

    assert extmerger.is_aligned(), "Merged alignment is not aligned"
    # Negative column labels mark insertion columns.
    in1 = len([x for x in ext1._col_labels if x < 0])
    in2 = len([x for x in ext2._col_labels if x < 0])
    print("Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d" % (
        extmerger.get_length(), in1, in2, tlen))
    assert (in1 + in2 + tlen - mixed) == extmerger.get_length(
    ), "Lengths don't match up after merging. Merged:%d. Insertion1:%d Insertion2:%d BaseLen:%d Mixed-insertion: %d" % (
        extmerger.get_length(), in1, in2, tlen, mixed)
    assert (in1 + in2 - mixed) == len(
        list(extmerger.iter_insertion_columns())
    ), "Columns are not correctly labeled after merging. Merged insertion count:%d. Insertion1:%d Insertion2:%d Mixed-insertion: %d" % (
        len(list(extmerger.iter_insertion_columns())), in1, in2, mixed)

    tmp = extmerger.get_base_readonly_alignment().get_mutable_alignment()
    tmp.delete_all_gap()
    assert tmp.is_aligned(), "merged alignment should be aligned!"
    assert tmp.get_length() == tlen, "merged alignment has wrong length"
    assert all([alg[k] == s for (k, s) in tmp.items()
                ]), "merged alignment should match original alignment"

    print("======= finished testExtendedAlignment =========")
def generate_backbone(self):
    """Build a PASTA backbone alignment/tree from the raw input sequences.

    Reads and degaps the inputs, splits off fragmentary sequences using
    either an explicit ``full_length_range`` or a band around the median
    full length, samples a backbone of up to ``backbone_size`` sequences,
    aligns it with PASTA, and rewires the global ``options()`` so the
    rest of the pipeline runs on the generated backbone. May call
    ``sys.exit(0)`` when no query sequences remain.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    sequences.degap()
    fragments = MutableAlignment()
    if (options().median_full_length is not None
            or options().full_length_range is not None):
        if (options().median_full_length == -1):
            # Estimate the median sequence length from the data.
            seq_lengths = sorted(
                [len(seq) for seq in list(sequences.values())])
            lengths = len(seq_lengths)
            l2 = lengths // 2
            # BUGFIX: the parity test was inverted (odd counts averaged
            # seq_lengths[l2] and seq_lengths[l2 + 1], which is wrong and
            # IndexErrors for a single sequence). Odd count: the middle
            # element IS the median; even count: average the two middle
            # elements.
            if lengths % 2:
                options().median_full_length = seq_lengths[l2]
            else:
                options().median_full_length = (
                    seq_lengths[l2 - 1] + seq_lengths[l2]) / 2.0
        if options().full_length_range is not None:
            # Explicit "min max" range overrides the median-based band.
            bounds = sorted(
                int(x) for x in options().full_length_range.split())
            min_length = bounds[0]
            max_length = bounds[1]
        else:
            (min_length, max_length) = (
                int(options().median_full_length * (
                    1 - options().backbone_threshold)),
                int(options().median_full_length * (
                    1 + options().backbone_threshold)))
        _LOG.info(
            "Full length sequences are set to be from %d to %d character long"
            % (min_length, max_length))
        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length or
            len(sequences[name]) < min_length
        ]
        if (len(frag_names) > 0):
            _LOG.info("Detected %d fragmentary sequences"
                      % len(frag_names))
            fragments = sequences.get_hard_sub_alignment(frag_names)
            [sequences.pop(i) for i in list(fragments.keys())]
    if (options().backbone_size is None):
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))
    if (options().backbone_size > len(list(sequences.keys()))):
        options().backbone_size = len(list(sequences.keys()))
    # Sort before sampling so the choice is reproducible for a fixed seed.
    sample = sorted(
        random.sample(sorted(list(sequences.keys())),
                      options().backbone_size))
    backbone_sequences = sequences.get_hard_sub_alignment(sample)
    _LOG.debug("Backbone: %s" % (sorted(list(backbone_sequences.keys()))))
    [sequences.pop(i) for i in list(backbone_sequences.keys())]

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if (options().molecule == 'amino'):
        # PASTA expects 'protein' where SEPP options use 'amino'.
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size, moleculeType,
                        options().cpu, **vars(options().pasta))
    pastaalignJob.run()
    (a_file, t_file) = pastaalignJob.read_results()
    shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
    shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))
    # Point the global options at the freshly generated backbone.
    options().placement_size = self.options.backbone_size
    options().alignment_file = open(
        self.get_output_filename("pasta.fasta"))
    options().tree_file = open(self.get_output_filename("pasta.fasttree"))
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))
    # Remaining (non-backbone) sequences plus fragments become queries.
    sequences.set_alignment(fragments)
    if (len(sequences) == 0):
        # Nothing left to align: emit the backbone alignment and stop.
        sequences = MutableAlignment()
        sequences.read_file_object(open(self.options.alignment_file.name))
        self.results = ExtendedAlignment(fragment_names=[])
        self.results.set_alignment(sequences)
        _LOG.info(
            "No query sequences to align. Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        self.output_results()
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
def merge_results(self):
    """Combine all per-chunk placement results into the final outputs.

    First builds one ExtendedAlignment (stored in ``self.results``) by
    merging every placement job's "full_extended_alignment_object"
    attribute across all placement subsets and fragment chunks. Then
    merges the per-chunk placement JSON results: when there is a single
    placement subset the JSONs are concatenated directly in Python and
    written to placement.json; otherwise the trees and JSON paths are
    handed to the external MergeJsonJob.
    """
    assert isinstance(self.root_problem, SeppProblem)
    '''Generate single extended alignment'''
    fullExtendedAlignment = ExtendedAlignment(
        self.root_problem.fragments.keys())
    # self.root_problem.get_children()[0].jobs[get_placement_job_name(0)]\
    #     .get_attribute("full_extended_alignment_object")
    for pp in self.root_problem.get_children():
        # One placement job per fragment chunk; merge each chunk's
        # extended alignment into the full alignment.
        for i in range(0, self.root_problem.fragment_chunks):
            extended_alignment = pp.jobs[get_placement_job_name(
                i)].get_attribute("full_extended_alignment_object")
            fullExtendedAlignment.merge_in(extended_alignment,
                                          convert_to_string=True)
    self.results = fullExtendedAlignment

    # IF only one placement subset, no need to go to java
    if len(self.root_problem.get_children()) == 1:
        import json
        mergeinput = []
        for pp in self.root_problem.get_children():
            assert isinstance(pp, SeppProblem)
            for i in range(0, self.root_problem.fragment_chunks):
                # Skip chunks whose placement job produced no result.
                if (pp.get_job_result_by_name(get_placement_job_name(i))
                        is None):
                    continue
                '''Append subset trees and json locations to merge input'''
                with open(
                        pp.get_job_result_by_name(
                            get_placement_job_name(i))) as f:
                    mergeinput.append(json.load(f))
        _LOG.info(
            "There are %d fragment chunks on a single placement subset"
            % len(mergeinput))
        # Concatenate all chunk placements into the first JSON document.
        result = mergeinput[0]
        for i in range(1, len(mergeinput)):
            result["placements"] = result["placements"] + mergeinput[i][
                "placements"]
        with open(self.get_output_filename("placement.json"), 'w') as f:
            json.dump(result, f, sort_keys=True, indent=4)
    else:
        # Multiple placement subsets: build the text input expected by
        # the external merge job (main tree, then per-chunk tree + JSON
        # path pairs, terminated by two blank lines).
        mergeinput = []
        '''Append main tree to merge input'''
        mergeinput.append(
            "%s;" % (self.root_problem.subtree.compose_newick(labels=True)))
        for pp in self.root_problem.get_children():
            assert isinstance(pp, SeppProblem)
            for i in range(0, self.root_problem.fragment_chunks):
                if (pp.get_job_result_by_name(get_placement_job_name(i))
                        is None):
                    continue
                '''Append subset trees and json locations to merge input'''
                mergeinput.append(
                    "%s;\n%s" % (pp.subtree.compose_newick(labels=True),
                                 pp.get_job_result_by_name(
                                     get_placement_job_name(i))))
        mergeinput.append("")
        mergeinput.append("")
        meregeinputstring = "\n".join(mergeinput)
        _LOG.debug(mergeinput)
        mergeJsonJob = MergeJsonJob()
        mergeJsonJob.setup(meregeinputstring,
                           self.get_output_filename("placement.json"))
        mergeJsonJob.run()