def generate_backbone(self):
    """Pick a random backbone, align it with SATe and register the results.

    Reads the alignment from ``self.options.sequence_file``, samples
    ``options().backbone_size`` sequences as the backbone (default:
    ``min(100, 20% of the input)``), writes the backbone and the remaining
    (query) sequences to temporary FASTA files, runs ``SateAlignJob`` on
    the backbone and finally stores the outcome on the global options:
    ``placement_size``, ``alignment_file``, ``tree_file`` and
    ``fragment_file``.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)

    if options().backbone_size is None:
        options().backbone_size = min(
            100, int(.20 * sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))

    # random.sample() needs a real sequence (key views are rejected on
    # recent Python versions) and raises ValueError when asked for more
    # items than are available, so materialise the names and clamp.
    names = list(sequences.keys())
    if options().backbone_size > len(names):
        options().backbone_size = len(names)
    backbone_sequences = sequences.get_hard_sub_alignment(
        random.sample(names, options().backbone_size))

    # Whatever is not part of the backbone becomes the query set.
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing query and backbone set. ")
    query = get_temp_file("query", "backbone", ".fas")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(sequences, query)
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating sate backbone alignment and tree. ")
    satealignJob = SateAlignJob()
    moleculeType = options().molecule
    if options().molecule == 'amino':
        # The align job is configured with 'protein' instead of 'amino'.
        moleculeType = 'protein'
    satealignJob.setup(backbone, options().backbone_size,
                       self.options.outdir, moleculeType, options().cpu)
    satealignJob.run()
    satealignJob.read_results()

    options().placement_size = self.options.backbone_size
    # The options hold *open file objects*, not paths; they are left open
    # on purpose here (presumably consumed later by the pipeline).
    options().alignment_file = open(self.options.outdir + "/sate.fasta")
    options().tree_file = open(self.options.outdir + "/sate.fasttree")
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))
    options().fragment_file = query
def generate_backbone(self):
    """Split off fragments, build a PASTA backbone and register results.

    Steps, as performed below:

    1. Read ``self.options.sequence_file``.
    2. If ``options().median_full_length`` is set, move every sequence
       whose length lies outside ``median * (1 -/+ backbone_threshold)``
       into a separate fragment alignment.  A value of ``-1`` means
       "compute the median from the input".
    3. Randomly sample ``options().backbone_size`` of the remaining
       sequences (default ``min(1000, all)``, clamped to what is
       available) and align them with ``PastaAlignJob``.
    4. Store ``placement_size``, ``alignment_file`` and ``tree_file`` on
       the global options.
    5. Write the fragments as query file, or - when there are none - copy
       the backbone alignment to the final output name and exit.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    fragments = MutableAlignment()

    if options().median_full_length is not None:
        if options().median_full_length == -1:
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            count = len(seq_lengths)
            mid = count // 2  # integer index on Python 2 and 3 alike
            if count % 2:
                # Odd count: the median is the single middle element.
                options().median_full_length = seq_lengths[mid]
            else:
                # Even count: average the two middle elements.
                options().median_full_length = (
                    seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0
        min_length = int(options().median_full_length *
                         (1 - options().backbone_threshold))
        max_length = int(options().median_full_length *
                         (1 + options().backbone_threshold))
        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length or
            len(sequences[name]) < min_length]
        if frag_names:
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)

    if options().backbone_size is None:
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))

    # random.sample() requires a sequence, so hand it a list of names and
    # never ask for more names than exist.
    names = list(sequences.keys())
    if options().backbone_size > len(names):
        options().backbone_size = len(names)
    backbone_sequences = sequences.get_hard_sub_alignment(
        random.sample(names, options().backbone_size))
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if options().molecule == 'amino':
        # The align job is configured with 'protein' instead of 'amino'.
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size,
                        self.options.outdir, moleculeType, options().cpu)
    pastaalignJob.run()
    pastaalignJob.read_results()

    options().placement_size = self.options.backbone_size
    # Stored as open file objects (not paths); intentionally not closed.
    options().alignment_file = open(self.options.outdir + "/pasta.fasta")
    options().tree_file = open(self.options.outdir + "/pasta.fasttree")
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s"
        % (options().alignment_file, options().tree_file))

    # From here on `sequences` carries the fragments, i.e. the query set
    # (assumes set_alignment() replaces the current content - TODO confirm).
    sequences.set_alignment(fragments)
    if len(sequences) == 0:
        _LOG.info(
            "No query sequences to align. Final alignment saved as %s"
            % self.get_output_filename("alignment.fasta"))
        shutil.copyfile(self.options.outdir + "/pasta.fasta",
                        self.get_output_filename("alignment.fasta"))
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)
def read_and_divide_fragments(self, chunks, extra_frags=None):
    """Load the query fragments and spread them over temporary FASTA files.

    :param chunks: number of chunks passed to ``divide_to_equal_chunks``.
    :param extra_frags: optional mapping ``name -> sequence`` of additional
        fragments; gap characters (``-``) are stripped before they are
        added.  Defaults to no extra fragments.
    :returns: list with one entry per chunk: the path of the written
        temp file, or ``None`` for a chunk that is empty.
    :raises ValueError: if a fragment name also occurs in the reference
        sub-alignment.
    """
    if extra_frags is None:
        # Avoid a shared mutable default argument.
        extra_frags = {}

    max_chunk_size = self.options.max_chunk_size
    _LOG.debug(
        ("start reading fragment files and breaking to at least %s chunks"
         " but at most %s sequences ") % (str(chunks), str(max_chunk_size)))
    self.root_problem.fragments = MutableAlignment()
    self.root_problem.fragments.read_file_object(self.options.fragment_file)

    # test if input fragment names might collide with reference names.
    # code contribution by Stefan Janssen (June 13th, 2018)
    ids_reference = set(self.root_problem.subalignment.keys())
    ids_inputfragments = set(self.root_problem.fragments.keys())
    ids_overlap = ids_reference & ids_inputfragments
    if ids_overlap:
        # Sorted so that the message is reproducible between runs.
        raise ValueError(
            ("Your input fragment file contains %i sequences, whose names "
             "overlap with names in your reference. Please rename your inp"
             "ut fragments and re-start. Duplicate names are:\n '%s'") %
            (len(ids_overlap), "'\n '".join(sorted(ids_overlap))))

    for (name, seq) in extra_frags.items():
        self.root_problem.fragments[name] = seq.replace("-", "")

    alg_chunks = self.root_problem.fragments.divide_to_equal_chunks(
        chunks, max_chunk_size)
    ret = []
    for i, chunk in enumerate(alg_chunks):
        temp_file = None
        if chunk:
            # Only non-empty chunks are written; empty ones yield None.
            temp_file = get_temp_file(
                "fragment_chunk_%d" % i, "fragment_chunks", ".fasta")
            chunk.write_to_path(temp_file)
        ret.append(temp_file)
    _LOG.debug("fragment files read and divided.")
    return ret
def read_and_divide_fragments(self, chunks, extra_frags=None):
    """Load, validate and chunk the query fragments.

    :param chunks: number of chunks passed to ``divide_to_equal_chunks``.
    :param extra_frags: optional mapping ``name -> sequence`` of additional
        fragments; gap characters (``-``) are stripped before they are
        added.  Defaults to no extra fragments.
    :returns: list with one entry per chunk: the path of the written
        temp file, or ``None`` for a chunk that is empty.
    :raises ValueError: if fragment names overlap with the reference and
        ``self.options.ignore_overlap`` is not set, or if any input
        fragment name contains a space or a tab.
    """
    if extra_frags is None:
        # Avoid a shared mutable default argument.
        extra_frags = {}

    max_chunk_size = self.options.max_chunk_size
    _LOG.debug(
        ("start reading fragment files and breaking to at least %s chunks"
         " but at most %s sequences ") % (str(chunks), str(max_chunk_size)))
    self.root_problem.fragments = MutableAlignment()
    self.root_problem.fragments.read_file_object(self.options.fragment_file)

    # test if input fragment names might collide with reference names.
    # code contribution by Stefan Janssen (June 13th, 2018)
    ids_reference = set(self.root_problem.subalignment.keys())
    ids_inputfragments = set(self.root_problem.fragments.keys())
    ids_overlap = ids_reference & ids_inputfragments
    if ids_overlap and not self.options.ignore_overlap:
        # Sorted so that the message is reproducible between runs.
        raise ValueError(
            ("Your input fragment file contains %i sequences, whose names "
             "overlap with names in your reference. Please rename your inp"
             "ut fragments and re-start. Duplicate names are:\n '%s'") %
            (len(ids_overlap), "'\n '".join(sorted(ids_overlap))))
    elif ids_overlap:
        # Overlap is tolerated: drop the colliding queries and go on with
        # the remaining ones.
        _LOG.debug("Ignoring following %i query sequences present "
                   "in the backbone: \n '%s'" %
                   (len(ids_overlap), "' , '".join(sorted(ids_overlap))))
        self.root_problem.fragments = \
            self.root_problem.fragments.get_soft_sub_alignment(
                ids_inputfragments - ids_reference)

    # test if input fragment names contain whitespaces / tabs which would
    # cause hmmsearch to fail.
    # code contribution by Stefan Janssen (June 22nd, 2018)
    ids_inputfragments_spaces = sorted(
        id_ for id_ in ids_inputfragments
        if (' ' in id_) or ('\t' in id_))
    if ids_inputfragments_spaces:
        raise ValueError(
            ("Your input fragment file contains %i sequences, whose names "
             "contain either whitespaces: ' ' or tabulator '\\t' symbols. "
             "Please rename your input fragments and re-start. Affected "
             "names are:\n '%s'") %
            (len(ids_inputfragments_spaces),
             "'\n '".join(ids_inputfragments_spaces)))

    for (name, seq) in extra_frags.items():
        self.root_problem.fragments[name] = seq.replace("-", "")

    alg_chunks = self.root_problem.fragments.divide_to_equal_chunks(
        chunks, max_chunk_size)
    ret = []
    for i, chunk in enumerate(alg_chunks):
        temp_file = None
        if chunk:
            # Only non-empty chunks are written; empty ones yield None.
            temp_file = get_temp_file(
                "fragment_chunk_%d" % i, "fragment_chunks", ".fasta")
            chunk.write_to_path(temp_file)
        ret.append(temp_file)
    _LOG.debug("fragment files read and divided.")
    return ret
def read_and_divide_fragments(self, chunks, extra_frags=None):
    """Load, validate and chunk the query fragments.

    :param chunks: number of chunks passed to ``divide_to_equal_chunks``.
    :param extra_frags: optional mapping ``name -> sequence`` of additional
        fragments; gap characters (``-``) are stripped before they are
        added.  Defaults to no extra fragments.
    :returns: list with one entry per chunk: the path of the written
        temp file, or ``None`` for a chunk that is empty.
    :raises ValueError: if a fragment name also occurs in the reference
        sub-alignment, or if a fragment name contains a space or a tab.
    """
    if extra_frags is None:
        # Avoid a shared mutable default argument.
        extra_frags = {}

    max_chunk_size = self.options.max_chunk_size
    _LOG.debug(
        ("start reading fragment files and breaking to at least %s chunks"
         " but at most %s sequences ") % (
            str(chunks), str(max_chunk_size)))
    self.root_problem.fragments = MutableAlignment()
    self.root_problem.fragments.read_file_object(self.options.fragment_file)

    # test if input fragment names might collide with reference names.
    # code contribution by Stefan Janssen (June 13th, 2018)
    ids_reference = set(self.root_problem.subalignment.keys())
    ids_inputfragments = set(self.root_problem.fragments.keys())
    ids_overlap = ids_reference & ids_inputfragments
    if ids_overlap:
        # Sorted so that the message is reproducible between runs.
        raise ValueError((
            "Your input fragment file contains %i sequences, whose names "
            "overlap with names in your reference. Please rename your inp"
            "ut fragments and re-start. Duplicate names are:\n '%s'") %
            (len(ids_overlap), "'\n '".join(sorted(ids_overlap))))

    # test if input fragment names contain whitespaces / tabs which would
    # cause hmmsearch to fail.
    # code contribution by Stefan Janssen (June 22nd, 2018)
    ids_inputfragments_spaces = sorted(
        id_ for id_ in ids_inputfragments
        if (' ' in id_) or ('\t' in id_))
    if ids_inputfragments_spaces:
        raise ValueError((
            "Your input fragment file contains %i sequences, whose names "
            "contain either whitespaces: ' ' or tabulator '\\t' symbols. "
            "Please rename your input fragments and re-start. Affected "
            "names are:\n '%s'") %
            (len(ids_inputfragments_spaces),
             "'\n '".join(ids_inputfragments_spaces)))

    for (name, seq) in extra_frags.items():
        self.root_problem.fragments[name] = seq.replace("-", "")

    alg_chunks = self.root_problem.fragments.divide_to_equal_chunks(
        chunks, max_chunk_size)
    ret = []
    for i, chunk in enumerate(alg_chunks):
        temp_file = None
        if chunk:
            # Only non-empty chunks are written; empty ones yield None.
            temp_file = get_temp_file(
                "fragment_chunk_%d" % i, "fragment_chunks", ".fasta")
            chunk.write_to_path(temp_file)
        ret.append(temp_file)
    _LOG.debug("fragment files read and divided.")
    return ret
def generate_backbone(self):
    """Sample a backbone, align it with SATe and publish the result files.

    The input alignment is read from ``self.options.sequence_file``.
    ``options().backbone_size`` sequences are drawn at random (when unset,
    the size becomes ``min(100, 20% of the input)``); they are written to a
    temporary backbone FASTA file while all other sequences go to a
    temporary query FASTA file.  After ``SateAlignJob`` has run, the global
    options receive ``placement_size``, ``alignment_file``, ``tree_file``
    and ``fragment_file``.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)

    if options().backbone_size is None:
        options().backbone_size = min(
            100, int(.20 * sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))

    # Sample from a list: random.sample() rejects dict key views on recent
    # Python versions.  Clamp the size so that a user-supplied value larger
    # than the input does not raise ValueError.
    candidates = list(sequences.keys())
    if options().backbone_size > len(candidates):
        options().backbone_size = len(candidates)
    backbone_sequences = sequences.get_hard_sub_alignment(
        random.sample(candidates, options().backbone_size))

    # Remove the chosen backbone members; the rest is the query set.
    for taxon in list(backbone_sequences.keys()):
        sequences.pop(taxon)

    _LOG.info("Writing query and backbone set. ")
    query = get_temp_file("query", "backbone", ".fas")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(sequences, query)
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating sate backbone alignment and tree. ")
    satealignJob = SateAlignJob()
    moleculeType = options().molecule
    if options().molecule == 'amino':
        # The align job is configured with 'protein' instead of 'amino'.
        moleculeType = 'protein'
    satealignJob.setup(backbone, options().backbone_size,
                       self.options.outdir, moleculeType, options().cpu)
    satealignJob.run()
    satealignJob.read_results()

    options().placement_size = self.options.backbone_size
    # Open file objects (not paths) are stored; they stay open on purpose.
    options().alignment_file = open(self.options.outdir + "/sate.fasta")
    options().tree_file = open(self.options.outdir + "/sate.fasttree")
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s" %
        (options().alignment_file, options().tree_file))
    options().fragment_file = query
def read_and_divide_fragments(self, chunks, extra_frags=None):
    """Read the fragment file and write it out as ``chunks`` FASTA files.

    :param chunks: number of chunks to create; one temp file is written
        for each index in ``range(chunks)``.
    :param extra_frags: optional mapping ``name -> sequence`` of additional
        fragments; gap characters (``-``) are stripped before they are
        added.  Defaults to no extra fragments.
    :returns: list of the temp file paths, in chunk order.
    """
    if extra_frags is None:
        # Avoid a shared mutable default argument.
        extra_frags = {}

    _LOG.debug(
        "start reading fragment files and breaking to chunks: %d" % chunks)
    self.root_problem.fragments = MutableAlignment()
    self.root_problem.fragments.read_file_object(self.options.fragment_file)

    # items()/range() instead of iteritems()/xrange(): identical results
    # on Python 2, and they also exist on Python 3.
    for (name, seq) in extra_frags.items():
        self.root_problem.fragments[name] = seq.replace("-", "")

    alg_chunks = self.root_problem.fragments.divide_to_equal_chunks(chunks)
    ret = []
    for i in range(chunks):
        temp_file = get_temp_file(
            "fragment_chunk_%d" % i, "fragment_chunks", ".fasta")
        alg_chunks[i].write_to_path(temp_file)
        ret.append(temp_file)
    _LOG.debug("fragment files read and divided.")
    return ret
def read_and_divide_fragments(self, chunks, extra_frags=None):
    """Split the query fragments into ``chunks`` temporary FASTA files.

    The fragments are read from ``self.options.fragment_file`` into
    ``self.root_problem.fragments``; entries of ``extra_frags`` are added
    with their gap characters (``-``) removed.

    :param chunks: number of chunks to create; one temp file is written
        for each index in ``range(chunks)``.
    :param extra_frags: optional mapping ``name -> sequence``.  Defaults
        to no extra fragments.
    :returns: list of the temp file paths, in chunk order.
    """
    if extra_frags is None:
        # Avoid a shared mutable default argument.
        extra_frags = {}

    _LOG.debug(
        "start reading fragment files and breaking to chunks: %d" % chunks)
    fragments = MutableAlignment()
    self.root_problem.fragments = fragments
    fragments.read_file_object(self.options.fragment_file)

    # dict.items() and range() behave the same as iteritems()/xrange() on
    # Python 2 and keep this method importable and runnable on Python 3.
    for (name, seq) in extra_frags.items():
        fragments[name] = seq.replace("-", "")

    alg_chunks = fragments.divide_to_equal_chunks(chunks)
    ret = []
    for i in range(chunks):
        temp_file = get_temp_file(
            "fragment_chunk_%d" % i, "fragment_chunks", ".fasta")
        alg_chunks[i].write_to_path(temp_file)
        ret.append(temp_file)
    _LOG.debug("fragment files read and divided.")
    return ret
def save_checkpoint(checkpoint_manager):
    '''
    This is the callback function that is called periodically to save the
    current state of the system.

    When ``checkpoint_manager.is_checkpointing`` is set, the checkpoint
    state is pickled (protocol 2) into a fresh gzip temp file, the new path
    is appended to the index file at ``checkpoint_manager.checkpoint_path``,
    the previously listed dump is deleted, and a new daemon timer is armed
    to call this function again after ``options().checkpoint_interval``.
    Otherwise nothing happens and no timer is re-armed.
    '''
    # Note: this module is not bullet proof in terms of race conditions,
    # e.g. around the moment the new temp path is appended to the index
    # file (f.write...) below.
    if not checkpoint_manager.is_checkpointing:
        return

    # checkpoint_manager.lock.acquire()
    checkpoint_manager.saving = True
    try:
        newTmpDest = get_temp_file("dump", "checkpoints")
        _LOG.info("Checkpoint is being updated: %s" % newTmpDest)

        # The last index line names the previous dump as "<path>, <time>".
        with open(checkpoint_manager.checkpoint_path) as index_file:
            index_lines = index_file.readlines()
        oldTmpFile = index_lines[-1].split(",")[0] if index_lines else None

        checkpoint_manager.update_time()

        # The recursion limit is raised only while pickling and is always
        # put back, even if dumping fails.
        currenlimit = sys.getrecursionlimit()
        sys.setrecursionlimit(100000)
        try:
            with gzip.GzipFile(newTmpDest, 'wb') as picklefile:
                pickle.dump(
                    checkpoint_manager.checkpoint_state, picklefile, 2)
        finally:
            sys.setrecursionlimit(currenlimit)

        with open(checkpoint_manager.checkpoint_path, "a") as index_file:
            index_file.write(
                "%s, %s\n" % (newTmpDest, datetime.datetime.now()))

        # Only drop the old dump once the new one is written and indexed.
        if oldTmpFile is not None:
            os.remove(oldTmpFile)
        _LOG.info("Checkpoint Saved to: %s and linked in %s." %
                  (newTmpDest, checkpoint_manager.checkpoint_path))
    finally:
        # Never leave the manager flagged as "saving" after a failure.
        checkpoint_manager.saving = False
    # checkpoint_manager.lock.release()

    checkpoint_manager.timer = threading.Timer(
        options().checkpoint_interval, save_checkpoint,
        args=[checkpoint_manager])
    # `daemon` attribute replaces the deprecated setDaemon().
    checkpoint_manager.timer.daemon = True
    checkpoint_manager.timer.start()
def save_checkpoint(checkpoint_manager):
    '''
    This is the callback function that is called periodically to save the
    current state of the system.

    If ``checkpoint_manager.is_checkpointing`` is false the call is a
    no-op.  Otherwise it dumps ``checkpoint_manager.checkpoint_state`` into
    a new gzip-compressed pickle, records that file in the index at
    ``checkpoint_manager.checkpoint_path``, removes the dump named on the
    last index line, and schedules itself again via a daemon
    ``threading.Timer`` after ``options().checkpoint_interval``.
    '''
    # Note: this module is not bullet proof in terms of race conditions,
    # e.g. around the moment the new temp path is appended to the index
    # file (f.write...) below.
    if not checkpoint_manager.is_checkpointing:
        return

    # checkpoint_manager.lock.acquire()
    checkpoint_manager.saving = True
    try:
        newTmpDest = get_temp_file("dump", "checkpoints")
        _LOG.info("Checkpoint is being updated: %s" % newTmpDest)

        # Index lines look like "<dump path>, <timestamp>"; the last one
        # refers to the dump that this call supersedes.
        with open(checkpoint_manager.checkpoint_path) as index_file:
            history = index_file.readlines()
        if history:
            oldTmpFile = history[-1].split(",")[0]
        else:
            oldTmpFile = None

        checkpoint_manager.update_time()

        # Temporarily raised recursion limit for pickling; restored in all
        # cases.
        currenlimit = sys.getrecursionlimit()
        sys.setrecursionlimit(100000)
        try:
            with gzip.GzipFile(newTmpDest, 'wb') as picklefile:
                pickle.dump(
                    checkpoint_manager.checkpoint_state, picklefile, 2)
        finally:
            sys.setrecursionlimit(currenlimit)

        with open(checkpoint_manager.checkpoint_path, "a") as index_file:
            index_file.write(
                "%s, %s\n" % (newTmpDest, datetime.datetime.now()))

        # The superseded dump is deleted only after the new one is indexed.
        if oldTmpFile is not None:
            os.remove(oldTmpFile)
        _LOG.info("Checkpoint Saved to: %s and linked in %s." % (
            newTmpDest, checkpoint_manager.checkpoint_path))
    finally:
        # Reset the flag even when saving failed.
        checkpoint_manager.saving = False
    # checkpoint_manager.lock.release()

    next_timer = threading.Timer(
        options().checkpoint_interval, save_checkpoint,
        args=[checkpoint_manager])
    # `daemon` attribute replaces the deprecated setDaemon().
    next_timer.daemon = True
    checkpoint_manager.timer = next_timer
    next_timer.start()
def generate_backbone(self):
    """Separate fragments, build a PASTA backbone and register the results.

    Steps, as performed below:

    1. Read ``self.options.sequence_file`` and remove gaps (``degap``).
    2. If ``options().median_full_length`` or
       ``options().full_length_range`` is set, every sequence whose length
       is outside the full-length window is moved to a fragment alignment.
       The window is either the two integers given in
       ``full_length_range`` or ``median * (1 -/+ backbone_threshold)``;
       a median of ``-1`` means "compute it from the input".
    3. Sample ``options().backbone_size`` of the remaining sequences
       (default ``min(1000, all)``, clamped to what is available) and run
       ``PastaAlignJob`` on them.
    4. Copy the resulting alignment/tree to the output names and store
       ``placement_size``, ``alignment_file`` and ``tree_file`` on the
       global options.
    5. Write the fragments as query file, or - when there are none -
       output the backbone alignment as final result and exit.
    """
    _LOG.info("Reading input sequences: %s" % (self.options.sequence_file))
    sequences = MutableAlignment()
    sequences.read_file_object(self.options.sequence_file)
    sequences.degap()
    fragments = MutableAlignment()

    if (options().median_full_length is not None or
            options().full_length_range is not None):
        if options().median_full_length == -1:
            seq_lengths = sorted(len(seq) for seq in sequences.values())
            count = len(seq_lengths)
            mid = count // 2
            if count % 2:
                # Odd count: the median is the single middle element.
                options().median_full_length = seq_lengths[mid]
            else:
                # Even count: average the two middle elements.
                options().median_full_length = (
                    seq_lengths[mid - 1] + seq_lengths[mid]) / 2.0

        if options().full_length_range is not None:
            # An explicit "<a> <b>" range wins over the median window.
            bounds = sorted(
                int(x) for x in options().full_length_range.split())
            min_length = bounds[0]
            max_length = bounds[1]
        else:
            min_length = int(options().median_full_length *
                             (1 - options().backbone_threshold))
            max_length = int(options().median_full_length *
                             (1 + options().backbone_threshold))
        _LOG.info(
            "Full length sequences are set to be from %d to %d character long"
            % (min_length, max_length))

        frag_names = [
            name for name in sequences
            if len(sequences[name]) > max_length or
            len(sequences[name]) < min_length]
        if frag_names:
            _LOG.info("Detected %d fragmentary sequences" % len(frag_names))
            fragments = sequences.get_hard_sub_alignment(frag_names)
            for name in list(fragments.keys()):
                sequences.pop(name)

    if options().backbone_size is None:
        options().backbone_size = min(1000, int(sequences.get_num_taxa()))
        _LOG.info("Backbone size set to: %d" % (options().backbone_size))

    # Sorting before and after sampling makes the backbone depend only on
    # the random state, not on the iteration order of the alignment.
    candidates = sorted(sequences.keys())
    if options().backbone_size > len(candidates):
        options().backbone_size = len(candidates)
    sample = sorted(random.sample(candidates, options().backbone_size))
    backbone_sequences = sequences.get_hard_sub_alignment(sample)
    _LOG.debug("Backbone: %s" % (sorted(backbone_sequences.keys())))
    for name in list(backbone_sequences.keys()):
        sequences.pop(name)

    _LOG.info("Writing backbone set. ")
    backbone = get_temp_file("backbone", "backbone", ".fas")
    _write_fasta(backbone_sequences, backbone)

    _LOG.info("Generating pasta backbone alignment and tree. ")
    pastaalignJob = PastaAlignJob()
    moleculeType = options().molecule
    if options().molecule == 'amino':
        # The align job is configured with 'protein' instead of 'amino'.
        moleculeType = 'protein'
    pastaalignJob.setup(backbone, options().backbone_size, moleculeType,
                        options().cpu, **vars(options().pasta))
    pastaalignJob.run()
    (a_file, t_file) = pastaalignJob.read_results()

    shutil.copyfile(t_file, self.get_output_filename("pasta.fasttree"))
    shutil.copyfile(a_file, self.get_output_filename("pasta.fasta"))

    options().placement_size = self.options.backbone_size
    # Stored as open file objects (not paths); intentionally not closed.
    options().alignment_file = open(
        self.get_output_filename("pasta.fasta"))
    options().tree_file = open(self.get_output_filename("pasta.fasttree"))
    _LOG.info(
        "Backbone alignment written to %s.\nBackbone tree written to %s" %
        (options().alignment_file, options().tree_file))

    # From here on `sequences` carries the fragments, i.e. the query set
    # (assumes set_alignment() replaces the current content - TODO confirm).
    sequences.set_alignment(fragments)
    if len(sequences) == 0:
        # Nothing to place: the backbone alignment is the final result.
        sequences = MutableAlignment()
        sequences.read_file_object(open(self.options.alignment_file.name))
        self.results = ExtendedAlignment(fragment_names=[])
        self.results.set_alignment(sequences)
        _LOG.info(
            "No query sequences to align. Final alignment saved as %s" %
            self.get_output_filename("alignment.fasta"))
        self.output_results()
        sys.exit(0)
    else:
        query = get_temp_file("query", "backbone", ".fas")
        options().fragment_file = query
        _write_fasta(sequences, query)