class UPPExhaustiveAlgorithm(ExhaustiveAlgorithm):
    """Exhaustive algorithm specialised for UPP.

    Every alignment subset is searched for every fragment. Because this is
    UPP, no placement is performed and there is (currently) always exactly
    one placement subset.
    """

    def __init__(self):
        ExhaustiveAlgorithm.__init__(self)
        self.pasta_only = False

    @staticmethod
    def _median_length(lengths):
        """Return the median of a non-empty iterable of sequence lengths.

        For an odd count the middle element is returned unchanged; for an
        even count the mean of the two middle elements is returned as a
        float. An empty input raises IndexError.
        """
        ordered = sorted(lengths)
        middle = len(ordered) // 2
        if len(ordered) % 2:
            return ordered[middle]
        return (ordered[middle - 1] + ordered[middle]) / 2.0

    def generate_backbone(self):
        """Build the backbone alignment and tree from the sequence file.

        Steps, in order:

        1. Read and degap ``options.sequence_file``.
        2. If ``median_full_length`` is set, move every sequence whose
           length lies outside ``median * (1 +/- backbone_threshold)`` into
           a separate fragment set (``-1`` means "compute the median from
           the input").
        3. Randomly sample ``backbone_size`` of the remaining sequences
           (default ``min(1000, number of remaining sequences)``), write
           them to a temp file and run ``PastaAlignJob`` on them.
        4. Copy the PASTA results to the ``pasta.fasta`` and
           ``pasta.fasttree`` output files and store open handles to them in
           ``options().alignment_file`` / ``options().tree_file``.
        5. Write everything that is not in the backbone to a temp query
           file stored in ``options().fragment_file``.

        If no query sequences remain, the backbone alignment is written as
        the final result and the process exits with status 0.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if options().median_full_length is not None:
            if options().median_full_length == -1:
                options().median_full_length = self._median_length(
                    len(seq) for seq in sequences.values())
            min_length = int(options().median_full_length * (
                1 - options().backbone_threshold))
            max_length = int(options().median_full_length * (
                1 + options().backbone_threshold))
            frag_names = [
                name for name in sequences
                if len(sequences[name]) > max_length or
                len(sequences[name]) < min_length]
            if frag_names:
                _LOG.info(
                    "Detected %d fragmentary sequences" % len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                # Snapshot the keys first: we mutate `sequences` while
                # walking the fragment names.
                for name in list(fragments.keys()):
                    sequences.pop(name)
        if options().backbone_size is None:
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        # The backbone can never be larger than what is left to sample from.
        if options().backbone_size > len(sequences.keys()):
            options().backbone_size = len(sequences.keys())
        # Sampling from a sorted name list keeps the draw reproducible for a
        # given random seed.
        sample = sorted(random.sample(
            sorted(sequences.keys()), options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(backbone_sequences.keys())))
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if options().molecule == 'amino':
            # The job is handed 'protein' where the options say 'amino'.
            moleculeType = 'protein'
        pastaalignJob.setup(backbone, options().backbone_size,
                            moleculeType, options().cpu)
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        # These handles are stored on the options object for later readers,
        # so they are intentionally left open here.
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s"
            % (options().alignment_file, options().tree_file))
        # Add the fragmentary sequences to the non-backbone leftovers; the
        # union is the query set.
        sequences.set_alignment(fragments)
        if len(sequences) == 0:
            sequences = MutableAlignment()
            # Close the handle as soon as the alignment has been loaded.
            with open(self.options.alignment_file.name) as backbone_file:
                sequences.read_file_object(backbone_file)
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align. "
                "Final alignment saved as %s"
                % self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)

    def check_options(self):
        """Validate the option combination and fill in derived options.

        Either the backbone tree, backbone alignment and sequence file are
        all given (the sequence file is then used as the fragment file), or
        only the sequence file is given (the backbone is then generated).
        Any other combination logs an error and exits with status -1.

        Afterwards ``backbone_size`` is checked against the number of taxa
        in the backbone alignment and ``placement_size`` defaults to
        ``backbone_size``. Returns whatever the parent ``check_options``
        returns.
        """
        self.check_outputprefix()
        options().info_file = "A_dummy_value"

        # Check to see if tree/alignment/fragment file provided, if not,
        # generate it from sequence file
        has_tree = options().tree_file is not None
        has_alignment = options().alignment_file is not None
        has_sequences = options().sequence_file is not None
        if has_tree and has_alignment and has_sequences:
            options().fragment_file = options().sequence_file
        elif not has_tree and not has_alignment and has_sequences:
            self.generate_backbone()
        else:
            _LOG.error(
                ("Either specify the backbone alignment and tree and query "
                 "sequences or only the query sequences.  Any other "
                 "combination is invalid"))
            # sys.exit rather than the interactive-only `exit` helper.
            sys.exit(-1)
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as backbone_file:
            sequences.read_file_object(backbone_file)
        backbone_size = sequences.get_num_taxa()
        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, (
            ("Backbone parameter needs to match actual size of backbone; "
             "backbone parameter:%s backbone_size:%s")
            % (options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size
        return ExhaustiveAlgorithm.check_options(self)

    def merge_results(self):
        """Merge all alignment-subset results into ``self.results``.

        Requires the root problem to have exactly one child (the single
        placement subset). The fragments of every alignment subset are
        collected on the placement problem, then each subset's extended
        alignment is merged into one ``ExtendedAlignment``.
        """
        assert \
            len(self.root_problem.get_children()) == 1, \
            "Currently UPP works with only one placement subset."
        # Merge alignment subset extended alignments to get one extended
        # alignment for current placement subset.
        pp = self.root_problem.get_children()[0]
        _LOG.info(
            "Merging sub-alignments for placement problem : %s."
            % (pp.label))
        # First assign fragments to the placement problem
        pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
        for ap in pp.get_children():
            pp.fragments.seq_names |= set(ap.fragments)

        # Then build an extended alignment by merging all hmmalign results
        _LOG.debug(
            "fragments are %d:\n %s" % (
                len(pp.fragments.seq_names), pp.fragments.seq_names))
        extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
        for ap in pp.children:
            assert isinstance(ap, SeppProblem)
            # Get all fragment chunk alignments for this alignment subset;
            # chunks without an hmmalign result are skipped.
            chunk_results = (
                fp.get_job_result_by_name('hmmalign') for fp in ap.children)
            aligned_files = [
                result for result in chunk_results if result is not None]
            _LOG.debug(
                "Merging fragment chunks for subalignment : %s."
                % (ap.label))
            ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                ap.jobs["hmmbuild"].infile, aligned_files)
            _LOG.debug(
                "Merging alignment subset into placement subset: %s."
                % (ap.label))
            extendedAlignment.merge_in(ap_alg, convert_to_string=False)

        # merge_in was asked not to convert to strings; do it once here.
        extendedAlignment.from_bytearray_to_string()
        self.results = extendedAlignment

    # Useful for multi-core merging if ever needed
    # def parallel_merge_results(self):
    #     assert len(self.root_problem.get_children()) == 1, "Currently UPP
    #     works with only one placement subset."
    #     '''
    #     Merge alignment subset extended alignments to get one extended
    #     alignment
    #     for current placement subset.
    #     '''
    #     pp = self.root_problem.get_children()[0]
    #     _LOG.info("Merging sub-alignments for placement problem : %s."
    #     %(pp.label))
    #     ''' Then Build an extended alignment by merging all hmmalign
    #     results'''
    #     manager = Manager()
    #     extendedAlignments = manager.list()
    #     for ap in pp.children:
    #         assert isinstance(ap, SeppProblem)
    #         ''' Get all fragment chunk alignments for this alignment subset'''
    #         aligned_files = [fp.get_job_result_by_name('hmmalign') for
    #                          fp in ap.children if
    #                          fp.get_job_result_by_name('hmmalign')
    #                          is not None]
    #         _LOG.info("Merging fragment chunks for subalignment : %s."
    #         %(ap.label))
    #         ap_alg = ap.read_extendend_alignment_and_relabel_columns\
    #             (ap.jobs["hmmbuild"].infile , aligned_files)
    #         _LOG.info("Merging alignment subset into placement subset: %s."
    #         %(ap.label))
    #         extendedAlignments.append(ap_alg)
    #
    #     while len(extendedAlignments)>1:
    #         a=range(0,len(extendedAlignments))
    #         #print [len(x) for x in extendedAlignments]
    #         x = zip(a[0::2],a[1::2])
    #         mapin = zip (x,[extendedAlignments]*len(x))
    #         _LOG.debug("One round of merging started. Currently have %d
    #         alignments left. " %len(extendedAlignments))
    #         Pool(max(12,len(extendedAlignments))).map(mergetwo,mapin)
    #         #print [len(x) if x is not None else "None" for x in
    #         extendedAlignments]
    #         extendedAlignments = manager.list([x for x in
    #         extendedAlignments if x is not None])
    #         extendedAlignments.reverse()
    #         _LOG.debug("One round of merging finished. Still have %d
    #         alignments left. " %len(extendedAlignments))
    #     extendedAlignment = extendedAlignments[0]
    #     extendedAlignment.from_bytearray_to_string()
    #     self.results = extendedAlignment

    def output_results(self):
        """Write ``self.results`` to the output files.

        Produces, in order: the unmasked alignment (``alignment.fasta``),
        the insertion column indexes (``insertion_columns.txt``) and the
        masked alignment (``alignment_masked.fasta``). Masking removes the
        insertion columns from ``self.results`` in place.
        """
        extended_alignment = self.results
        _LOG.info("Generating output. ")
        outfilename = self.get_output_filename("alignment.fasta")
        extended_alignment.write_to_path(outfilename)
        _LOG.info("Unmasked alignment written to %s" % outfilename)
        outfilename = self.get_output_filename("insertion_columns.txt")
        extended_alignment.write_insertion_column_indexes(outfilename)
        _LOG.info("The index of insertion columns written to %s" % outfilename)
        # Must come after the unmasked write: this mutates the alignment.
        extended_alignment.remove_insertion_columns()
        outfilename = self.get_output_filename("alignment_masked.fasta")
        extended_alignment.write_to_path(outfilename)
        _LOG.info("Masked alignment written to %s" % outfilename)

    def check_and_set_sizes(self, total):
        """Check the placement size against *total* and pin it to *total*.

        Asserts that ``placement_size`` is unset or at least *total*,
        delegates to the parent implementation, then sets
        ``placement_size`` to *total*.
        """
        assert (self.options.placement_size is None) or (
            self.options.placement_size >= total), \
            ("currently UPP works with only one placement subset."
             " Please leave placement subset size option blank.")
        ExhaustiveAlgorithm.check_and_set_sizes(self, total)
        self.options.placement_size = total

    def _get_new_Join_Align_Job(self):
        """Return a fresh ``UPPJoinAlignJobs`` instance."""
        return UPPJoinAlignJobs()

    def modify_tree(self, a_tree):
        """Filter out taxa on long branches.

        When ``long_branch_filter`` is set, every leaf whose edge is longer
        than ``long_branch_filter`` times the middle element of the sorted
        leaf-edge lengths is pruned from the tree in *a_tree*. The labels of
        the pruned taxa are recorded in ``self.filtered_taxa`` (an empty
        list when the filter is off).
        """
        self.filtered_taxa = []
        if self.options.long_branch_filter is None:
            return
        tr = a_tree.get_tree()
        edge_lengths = {edge: edge.length for edge in tr.leaf_edge_iter()}
        ordered_lengths = sorted(edge_lengths.values())
        # Floor division: a list index must be an int, and `/` yields a
        # float under Python 3.
        mid = ordered_lengths[len(ordered_lengths) // 2]
        cutoff = mid * self.options.long_branch_filter
        to_remove = []
        for edge, length in edge_lengths.items():
            if length > cutoff:
                self.filtered_taxa.append(edge.head_node.taxon.label)
                to_remove.append(edge.head_node.taxon)
        tr.prune_taxa(to_remove)

    def create_fragment_files(self):
        """Split the fragments into chunks and return the result.

        The number of chunks per alignment subset is
        ``lcm(subset count, cpu) // subset count``. Taxa recorded in
        ``self.filtered_taxa`` are passed along as extra fragments. Returns
        whatever ``read_and_divide_fragments`` returns.
        """
        alg_subset_count = len(list(self.root_problem.iter_leaves()))
        frag_chunk_count = lcm(
            alg_subset_count, self.options.cpu) // alg_subset_count
        _LOG.info(
            "%d taxa pruned from backbone and added to fragments: %s"
            % (len(self.filtered_taxa), " , ".join(self.filtered_taxa)))
        return self.read_and_divide_fragments(
            frag_chunk_count,
            extra_frags=self.root_problem.subalignment.get_soft_sub_alignment(
                self.filtered_taxa))
class UPPExhaustiveAlgorithm(ExhaustiveAlgorithm):
    """Exhaustive algorithm specialised for UPP.

    Every alignment subset is searched for every fragment. Because this is
    UPP, no placement is performed and there is (currently) always exactly
    one placement subset.
    """

    def __init__(self):
        ExhaustiveAlgorithm.__init__(self)
        self.pasta_only = False

    @staticmethod
    def _median_length(lengths):
        """Return the median of a non-empty iterable of sequence lengths.

        For an odd count the middle element is returned unchanged; for an
        even count the mean of the two middle elements is returned as a
        float. An empty input raises IndexError.
        """
        ordered = sorted(lengths)
        middle = len(ordered) // 2
        if len(ordered) % 2:
            return ordered[middle]
        return (ordered[middle - 1] + ordered[middle]) / 2.0

    def generate_backbone(self):
        """Build the backbone alignment and tree from the sequence file.

        Steps, in order:

        1. Read and degap ``options.sequence_file``.
        2. If ``median_full_length`` or ``full_length_range`` is set, move
           every sequence whose length lies outside the full-length window
           into a separate fragment set. The window is the two smallest
           integers of the whitespace-separated ``full_length_range`` when
           that is given, otherwise
           ``median * (1 +/- backbone_threshold)`` (a median of ``-1``
           means "compute it from the input").
        3. Randomly sample ``backbone_size`` of the remaining sequences
           (default ``min(1000, number of remaining sequences)``), write
           them to a temp file and run ``PastaAlignJob`` on them, forwarding
           the attributes of ``options().pasta`` as keyword arguments.
        4. Copy the PASTA results to the ``pasta.fasta`` and
           ``pasta.fasttree`` output files and store open handles to them in
           ``options().alignment_file`` / ``options().tree_file``.
        5. Write everything that is not in the backbone to a temp query
           file stored in ``options().fragment_file``.

        If no query sequences remain, the backbone alignment is written as
        the final result and the process exits with status 0.
        """
        _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
        sequences = MutableAlignment()
        sequences.read_file_object(self.options.sequence_file)
        sequences.degap()
        fragments = MutableAlignment()
        if (options().median_full_length is not None
                or options().full_length_range is not None):
            if options().median_full_length == -1:
                options().median_full_length = self._median_length(
                    len(seq) for seq in sequences.values())
            if options().full_length_range is not None:
                # An explicit range takes precedence over the median window.
                bounds = sorted(
                    int(x) for x in options().full_length_range.split())
                min_length = bounds[0]
                max_length = bounds[1]
            else:
                min_length = int(options().median_full_length *
                                 (1 - options().backbone_threshold))
                max_length = int(options().median_full_length *
                                 (1 + options().backbone_threshold))
            _LOG.info(
                "Full length sequences are set to be from %d to %d character long"
                % (min_length, max_length))
            frag_names = [
                name for name in sequences
                if len(sequences[name]) > max_length
                or len(sequences[name]) < min_length
            ]
            if frag_names:
                _LOG.info("Detected %d fragmentary sequences" %
                          len(frag_names))
                fragments = sequences.get_hard_sub_alignment(frag_names)
                # Snapshot the keys first: we mutate `sequences` while
                # walking the fragment names.
                for name in list(fragments.keys()):
                    sequences.pop(name)
        if options().backbone_size is None:
            options().backbone_size = min(1000, int(sequences.get_num_taxa()))
            _LOG.info("Backbone size set to: %d" % (options().backbone_size))
        # The backbone can never be larger than what is left to sample from.
        if options().backbone_size > len(sequences.keys()):
            options().backbone_size = len(sequences.keys())
        # Sampling from a sorted name list keeps the draw reproducible for a
        # given random seed.
        sample = sorted(
            random.sample(sorted(sequences.keys()),
                          options().backbone_size))
        backbone_sequences = sequences.get_hard_sub_alignment(sample)
        _LOG.debug("Backbone: %s" % (sorted(backbone_sequences.keys())))
        for name in list(backbone_sequences.keys()):
            sequences.pop(name)

        _LOG.info("Writing backbone set. ")
        backbone = get_temp_file("backbone", "backbone", ".fas")
        _write_fasta(backbone_sequences, backbone)

        _LOG.info("Generating pasta backbone alignment and tree. ")
        pastaalignJob = PastaAlignJob()
        moleculeType = options().molecule
        if options().molecule == 'amino':
            # The job is handed 'protein' where the options say 'amino'.
            moleculeType = 'protein'
        pastaalignJob.setup(backbone, options().backbone_size, moleculeType,
                            options().cpu, **vars(options().pasta))
        pastaalignJob.run()
        (a_file, t_file) = pastaalignJob.read_results()

        shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
        shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

        options().placement_size = self.options.backbone_size
        # These handles are stored on the options object for later readers,
        # so they are intentionally left open here.
        options().alignment_file = open(
            self.get_output_filename("pasta.fasta"))
        options().tree_file = open(self.get_output_filename("pasta.fasttree"))
        _LOG.info(
            "Backbone alignment written to %s.\nBackbone tree written to %s" %
            (options().alignment_file, options().tree_file))
        # Add the fragmentary sequences to the non-backbone leftovers; the
        # union is the query set.
        sequences.set_alignment(fragments)
        if len(sequences) == 0:
            sequences = MutableAlignment()
            # Close the handle as soon as the alignment has been loaded.
            with open(self.options.alignment_file.name) as backbone_file:
                sequences.read_file_object(backbone_file)
            self.results = ExtendedAlignment(fragment_names=[])
            self.results.set_alignment(sequences)
            _LOG.info(
                "No query sequences to align. "
                "Final alignment saved as %s"
                % self.get_output_filename("alignment.fasta"))
            self.output_results()
            sys.exit(0)
        else:
            query = get_temp_file("query", "backbone", ".fas")
            options().fragment_file = query
            _write_fasta(sequences, query)

    def check_options(self):
        """Validate the option combination and fill in derived options.

        Either the backbone tree, backbone alignment and sequence file are
        all given (the sequence file is then used as the fragment file), or
        only the sequence file is given (the backbone is then generated).
        Any other combination logs an error and exits with status -1.

        Afterwards ``backbone_size`` is checked against the number of taxa
        in the backbone alignment and ``placement_size`` defaults to
        ``backbone_size``. Requesting backtranslation for a molecule type
        other than ``"amino"`` logs an error and exits with status -1.
        Returns whatever the parent ``check_options`` returns.
        """
        self.check_outputprefix()
        options().info_file = "A_dummy_value"

        # Check to see if tree/alignment/fragment file provided, if not,
        # generate it from sequence file
        has_tree = options().tree_file is not None
        has_alignment = options().alignment_file is not None
        has_sequences = options().sequence_file is not None
        if has_tree and has_alignment and has_sequences:
            options().fragment_file = options().sequence_file
        elif not has_tree and not has_alignment and has_sequences:
            self.generate_backbone()
        else:
            _LOG.error(
                ("Either specify the backbone alignment and tree and query "
                 "sequences or only the query sequences.  Any other "
                 "combination is invalid"))
            # sys.exit rather than the interactive-only `exit` helper.
            sys.exit(-1)
        sequences = MutableAlignment()
        with open(self.options.alignment_file.name) as backbone_file:
            sequences.read_file_object(backbone_file)
        backbone_size = sequences.get_num_taxa()
        if options().backbone_size is None:
            options().backbone_size = backbone_size
        assert options().backbone_size == backbone_size, (
            ("Backbone parameter needs to match actual size of backbone; "
             "backbone parameter:%s backbone_size:%s") %
            (options().backbone_size, backbone_size))
        if options().placement_size is None:
            options().placement_size = options().backbone_size

        if (options().backtranslation_sequence_file
                and options().molecule != "amino"):
            _LOG.error(("Backtranslation can be performed only when "
                        "input sequences are amino acid. "))
            sys.exit(-1)

        return ExhaustiveAlgorithm.check_options(self)

    def merge_results(self):
        """Merge all alignment-subset results into ``self.results``.

        Requires the root problem to have exactly one child (the single
        placement subset). The fragments of every alignment subset are
        collected on the placement problem, then each subset's extended
        alignment is merged into one ``ExtendedAlignment``.
        """
        assert \
            len(self.root_problem.get_children()) == 1, \
            "Currently UPP works with only one placement subset."
        # Merge alignment subset extended alignments to get one extended
        # alignment for current placement subset.
        pp = self.root_problem.get_children()[0]
        _LOG.info("Merging sub-alignments for placement problem : %s." %
                  (pp.label))
        # First assign fragments to the placement problem
        pp.fragments = pp.parent.fragments.get_soft_sub_alignment([])
        for ap in pp.get_children():
            pp.fragments.seq_names |= set(ap.fragments)

        # Then build an extended alignment by merging all hmmalign results
        _LOG.debug("fragments are %d:\n %s" %
                   (len(pp.fragments.seq_names), pp.fragments.seq_names))
        extendedAlignment = ExtendedAlignment(pp.fragments.seq_names)
        for ap in pp.children:
            assert isinstance(ap, SeppProblem)
            # Get all fragment chunk alignments for this alignment subset;
            # chunks without an hmmalign result are skipped.
            chunk_results = (
                fp.get_job_result_by_name('hmmalign') for fp in ap.children)
            aligned_files = [
                result for result in chunk_results if result is not None]
            _LOG.debug("Merging fragment chunks for subalignment : %s." %
                       (ap.label))
            ap_alg = ap.read_extendend_alignment_and_relabel_columns(
                ap.jobs["hmmbuild"].infile, aligned_files)
            _LOG.debug("Merging alignment subset into placement subset: %s." %
                       (ap.label))
            extendedAlignment.merge_in(ap_alg, convert_to_string=False)

        # merge_in was asked not to convert to strings; do it once here.
        extendedAlignment.from_bytearray_to_string()
        self.results = extendedAlignment

    # Useful for multi-core merging if ever needed
    # def parallel_merge_results(self):
    #     assert len(self.root_problem.get_children()) == 1, "Currently UPP
    #     works with only one placement subset."
    #     '''
    #     Merge alignment subset extended alignments to get one extended
    #     alignment
    #     for current placement subset.
    #     '''
    #     pp = self.root_problem.get_children()[0]
    #     _LOG.info("Merging sub-alignments for placement problem : %s."
    #     %(pp.label))
    #     ''' Then Build an extended alignment by merging all hmmalign
    #     results'''
    #     manager = Manager()
    #     extendedAlignments = manager.list()
    #     for ap in pp.children:
    #         assert isinstance(ap, SeppProblem)
    #         ''' Get all fragment chunk alignments for this alignment subset'''
    #         aligned_files = [fp.get_job_result_by_name('hmmalign') for
    #                          fp in ap.children if
    #                          fp.get_job_result_by_name('hmmalign')
    #                          is not None]
    #         _LOG.info("Merging fragment chunks for subalignment : %s."
    #         %(ap.label))
    #         ap_alg = ap.read_extendend_alignment_and_relabel_columns\
    #             (ap.jobs["hmmbuild"].infile , aligned_files)
    #         _LOG.info("Merging alignment subset into placement subset: %s."
    #         %(ap.label))
    #         extendedAlignments.append(ap_alg)
    #
    #     while len(extendedAlignments)>1:
    #         a=range(0,len(extendedAlignments))
    #         #print [len(x) for x in extendedAlignments]
    #         x = zip(a[0::2],a[1::2])
    #         mapin = zip (x,[extendedAlignments]*len(x))
    #         _LOG.debug("One round of merging started. Currently have %d
    #         alignments left. " %len(extendedAlignments))
    #         Pool(max(12,len(extendedAlignments))).map(mergetwo,mapin)
    #         #print [len(x) if x is not None else "None" for x in
    #         extendedAlignments]
    #         extendedAlignments = manager.list([x for x in
    #         extendedAlignments if x is not None])
    #         extendedAlignments.reverse()
    #         _LOG.debug("One round of merging finished. Still have %d
    #         alignments left. " %len(extendedAlignments))
    #     extendedAlignment = extendedAlignments[0]
    #     extendedAlignment.from_bytearray_to_string()
    #     self.results = extendedAlignment

    def output_results(self):
        """Write ``self.results`` to the output files.

        Produces, in order: the unmasked alignment (``alignment.fasta``),
        the insertion column indexes (``insertion_columns.txt``), optionally
        the backtranslated alignment in unmasked and masked form, and
        finally the masked alignment (``alignment_masked.fasta``). Masking
        removes the insertion columns from ``self.results`` in place.

        Backtranslation is best-effort: if it raises, a warning is logged
        and the backtranslated files are simply not written.
        """
        extended_alignment = self.results
        _LOG.info("Generating output. ")
        outfilename = self.get_output_filename("alignment.fasta")
        extended_alignment.write_to_path(outfilename)
        _LOG.info("Unmasked alignment written to %s" % outfilename)
        outfilename = self.get_output_filename("insertion_columns.txt")
        extended_alignment.write_insertion_column_indexes(outfilename)
        _LOG.info("The index of insertion columns written to %s" % outfilename)
        if self.options.backtranslation_sequence_file:
            outfilename = self.get_output_filename(
                "backtranslated_alignment.fasta")
            backtranslation_seqs = MutableAlignment()
            backtranslation_seqs.read_file_object(
                self.options.backtranslation_sequence_file)
            try:
                extended_backtranslated_alignment = backtranslate(
                    self.results, backtranslation_seqs)
            except Exception as e:
                # Deliberately broad: a failed backtranslation must not
                # prevent the protein alignment outputs below.
                _LOG.warning("Backtranslation failed due "
                             "to following error: " + str(e) + ".\n"
                             "No translated DNA sequence will be "
                             "written to a file.")
            else:
                extended_backtranslated_alignment.write_to_path(outfilename)
                _LOG.info("Backtranslated alignment written to %s" %
                          outfilename)
                extended_backtranslated_alignment.remove_insertion_columns()
                outfilename = self.get_output_filename(
                    "backtranslated_alignment_masked.fasta")
                extended_backtranslated_alignment.write_to_path(outfilename)
                _LOG.info("Backtranslated masked alignment written "
                          "to %s" % outfilename)

        # Must come after every use of the unmasked alignment: this mutates
        # it in place.
        extended_alignment.remove_insertion_columns()
        outfilename = self.get_output_filename("alignment_masked.fasta")
        extended_alignment.write_to_path(outfilename)
        _LOG.info("Masked alignment written to %s" % outfilename)

    def check_and_set_sizes(self, total):
        """Check the placement size against *total* and pin it to *total*.

        Asserts that ``placement_size`` is unset or at least *total*,
        delegates to the parent implementation, then sets
        ``placement_size`` to *total*.
        """
        assert (self.options.placement_size is None) or (
            self.options.placement_size >= total), \
            ("currently UPP works with only one placement subset."
             " Please leave placement subset size option blank.")
        ExhaustiveAlgorithm.check_and_set_sizes(self, total)
        self.options.placement_size = total

    def _get_new_Join_Align_Job(self):
        """Return a fresh ``UPPJoinAlignJobs`` instance."""
        return UPPJoinAlignJobs()

    def modify_tree(self, a_tree):
        """Filter out taxa on long branches.

        When ``long_branch_filter`` is set, every leaf whose edge is longer
        than ``long_branch_filter`` times the middle element of the sorted
        leaf-edge lengths is pruned from the tree in *a_tree*. The labels of
        the pruned taxa are recorded in ``self.filtered_taxa`` (an empty
        list when the filter is off).
        """
        self.filtered_taxa = []
        if self.options.long_branch_filter is None:
            return
        tr = a_tree.get_tree()
        edge_lengths = {edge: edge.length for edge in tr.leaf_edge_iter()}
        ordered_lengths = sorted(edge_lengths.values())
        # Floor division: a list index must be an int, and `/` yields a
        # float under Python 3.
        mid = ordered_lengths[len(ordered_lengths) // 2]
        cutoff = mid * self.options.long_branch_filter
        to_remove = []
        for edge, length in edge_lengths.items():
            if length > cutoff:
                self.filtered_taxa.append(edge.head_node.taxon.label)
                to_remove.append(edge.head_node.taxon)
        tr.prune_taxa(to_remove)

    def create_fragment_files(self):
        """Split the fragments into chunks and return the result.

        The number of chunks per alignment subset is
        ``lcm(subset count, cpu) // subset count``. Taxa recorded in
        ``self.filtered_taxa`` are passed along as extra fragments. Returns
        whatever ``read_and_divide_fragments`` returns.
        """
        alg_subset_count = len(list(self.root_problem.iter_leaves()))
        frag_chunk_count = lcm(alg_subset_count,
                               self.options.cpu) // alg_subset_count
        _LOG.info("%d taxa pruned from backbone and added to fragments: %s" %
                  (len(self.filtered_taxa), " , ".join(self.filtered_taxa)))
        return self.read_and_divide_fragments(
            frag_chunk_count,
            extra_frags=self.root_problem.subalignment.get_soft_sub_alignment(
                self.filtered_taxa))