def make_msa(seqbuddy, aligner, trimal=()):
    """
    Create a multiple sequence alignment
    :param seqbuddy: SeqBuddy object
    :param aligner: path to alignment program
    :param trimal: List of TrimAl thresholds to try
    :return: AlignBuddy object
    """
    trimal = trimal if trimal else ["clean"]
    if len(seqbuddy) == 1:
        alignment = Alb.AlignBuddy(str(seqbuddy))
    else:
        alignment = Alb.generate_msa(Sb.make_copy(seqbuddy), aligner, quiet=True)
        ave_seq_length = Sb.ave_seq_length(seqbuddy)
        for threshold in trimal:
            align_copy = Alb.trimal(Alb.make_copy(alignment), threshold=threshold)
            cleaned_seqs = Sb.clean_seq(Sb.SeqBuddy(str(align_copy)))
            cleaned_seqs = Sb.delete_small(cleaned_seqs, 1)
            # Structured this way for unit test purposes
            if len(alignment.records()) != len(cleaned_seqs):
                continue
            elif Sb.ave_seq_length(cleaned_seqs) / ave_seq_length < 0.5:
                continue
            else:
                alignment = align_copy
                break
    return alignment
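# Usage sketch (not part of the original module): one way make_msa() might be called.
# The FASTA file name and the TrimAl threshold list below are hypothetical examples;
# Sb and Alb are the BuddySuite SeqBuddy/AlignBuddy modules used throughout this code.
def _example_make_msa():
    seqbuddy = Sb.SeqBuddy("cluster_seqs.fasta")  # hypothetical input file
    # Try progressively different TrimAl thresholds; make_msa() keeps the first trim that
    # retains every sequence and at least half of the average sequence length.
    alignment = make_msa(seqbuddy, "mafft", trimal=["gappyout", 0.9, "clean"])
    alignment.write("cluster_seqs.aln", out_format="fasta")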
# Build any reference files that are not already present in ref_dir: a copy of the raw
# GenBank input, a protein translation, an RNA transcription, the corresponding
# alignments, and a FastTree Newick tree.
ref_name = in_args.reference.split(os.sep)[-1]
ref_name = os.path.splitext(ref_name)[0]

if not os.path.isfile("%s%s.gb" % (ref_dir, ref_name)):
    shutil.copy(in_args.reference, "%s%s.gb" % (ref_dir, ref_name))

if not os.path.isfile("%s%s_pep.gb" % (ref_dir, ref_name)):
    sb_pep = Sb.translate_cds(Sb.make_copy(seqbuddy))
    sb_pep.write("%s%s_pep.gb" % (ref_dir, ref_name))

if not os.path.isfile("%s%s_rna.gb" % (ref_dir, ref_name)):
    sb_rna = Sb.dna2rna(Sb.make_copy(seqbuddy))
    sb_rna.write("%s%s_rna.gb" % (ref_dir, ref_name))

if not os.path.isfile("%s%s_aln.gb" % (ref_dir, ref_name)):
    alignbuddy = Alb.generate_msa(Sb.make_copy(seqbuddy), "mafft")
    alignbuddy.write("%s%s_aln.gb" % (ref_dir, ref_name))
else:
    alignbuddy = Alb.AlignBuddy("%s%s_aln.gb" % (ref_dir, ref_name))

if not os.path.isfile("%s%s_pep_aln.gb" % (ref_dir, ref_name)):
    alb_pep = Alb.translate_cds(Alb.make_copy(alignbuddy))
    alb_pep.write("%s%s_pep_aln.gb" % (ref_dir, ref_name))

if not os.path.isfile("%s%s_rna_aln.gb" % (ref_dir, ref_name)):
    alb_rna = Alb.dna2rna(Alb.make_copy(alignbuddy))
    alb_rna.write("%s%s_rna_aln.gb" % (ref_dir, ref_name))

if not os.path.isfile("%s%s_tree.nwk" % (ref_dir, ref_name)):
    phylobuddy = Pb.generate_tree(Alb.make_copy(alignbuddy), "fasttree")
    phylobuddy.write("%s%s_tree.nwk" % (ref_dir, ref_name))
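# Note: the "%s%s.gb" formatting above assumes ref_dir already ends with a path
# separator. A small helper like the one below (hypothetical, not in the original
# code) would make that assumption explicit and avoid repeating the pattern:
def _ref_path(suffix="", extension="gb"):
    """Build the path for one of the derived reference files."""
    return os.path.join(ref_dir, "%s%s.%s" % (ref_name, suffix, extension))

# e.g. _ref_path()               -> <ref_dir>/<ref_name>.gb
#      _ref_path("_pep_aln")     -> <ref_dir>/<ref_name>_pep_aln.gb
#      _ref_path("_tree", "nwk") -> <ref_dir>/<ref_name>_tree.nwk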
def start(self):
    self.split_time = time.time()
    self.start_time = time.time()
    self.heartbeat.start()
    self.worker_file = os.path.join(self.working_dir, "Worker_%s" % self.heartbeat.id)
    with open(self.worker_file, "w") as ofile:
        ofile.write("To terminate this Worker, simply delete this file.")
    self.data_file = os.path.join(self.working_dir, ".Worker_%s.dat" % self.heartbeat.id)
    open(self.data_file, "w").close()
    helpers.dummy_func()
    self.last_heartbeat_from_master = time.time()
    self.printer.write("Starting Worker_%s" % self.heartbeat.id)
    self.printer.new_line(1)
    idle_countdown = 1
    while os.path.isfile(self.worker_file):
        idle = round(100 * self.idle / (self.idle + self.running), 2)
        if not idle_countdown:
            self.printer.write("Idle %s%%" % idle)
            idle_countdown = 5

        # Make sure there are some masters still kicking around
        self.check_masters(idle)

        # Check for and clean up dead threads and orphaned jobs every twentieth(ish) time through
        rand_check = random()
        if rand_check > 0.95:
            self.clean_dead_threads()

        # Fetch a job from the queue
        data = self.fetch_queue_job()
        if data:
            full_name, psipred_dir, align_m, align_p, trimal, gap_open, gap_extend = data
            subjob_num, num_subjobs, id_hash = [1, 1, full_name] if len(full_name.split("_")) == 1 \
                else full_name.split("_")
            subjob_num = int(subjob_num)
            num_subjobs = int(num_subjobs)
            self.printer.write("Running %s" % full_name)
        else:
            time.sleep(random() * self.idle_workers())  # Pause for some time relative to num idle workers
            idle_countdown -= 1
            self.idle += time.time() - self.split_time
            self.split_time = time.time()
            continue

        try:
            idle_countdown = 1
            seqbuddy = Sb.SeqBuddy("%s/%s.seqs" % (self.output, id_hash), in_format="fasta")

            # Prepare alignment
            if len(seqbuddy) == 1:
                raise ValueError("Queued job of size 1 encountered: %s" % id_hash)
            else:
                if num_subjobs == 1:
                    self.printer.write("Creating MSA (%s seqs)" % len(seqbuddy))
                    alignment = Alb.generate_msa(Sb.make_copy(seqbuddy), align_m, params=align_p, quiet=True)
                else:
                    self.printer.write("Reading MSA (%s seqs)" % len(seqbuddy))
                    alignment = Alb.AlignBuddy(os.path.join(self.output, "%s.aln" % id_hash))

            # Prepare psipred dataframes
            psipred_dfs = self.prepare_psipred_dfs(seqbuddy, psipred_dir)

            if num_subjobs == 1:
                # This is starting a full job from scratch, not a sub-job
                # Need to specify what columns the PsiPred files map to now that there are gaps.
                psipred_dfs = rdmcl.update_psipred(alignment, psipred_dfs, "msa")

                # TrimAl
                self.printer.write("Trimal (%s seqs)" % len(seqbuddy))
                alignment = rdmcl.trimal(seqbuddy, trimal, alignment)

                with helpers.ExclusiveConnect(os.path.join(self.output, "write.lock"), max_lock=0):
                    # Place these write commands in ExclusiveConnect to ensure a writing lock
                    if not os.path.isfile(os.path.join(self.output, "%s.aln" % id_hash)):
                        alignment.write(os.path.join(self.output, "%s.aln" % id_hash), out_format="fasta")

                # Re-update PsiPred files now that some columns, possibly including non-gap characters, are removed
                self.printer.write("Updating %s psipred dataframes" % len(seqbuddy))
                psipred_dfs = rdmcl.update_psipred(alignment, psipred_dfs, "trimal")

            # Prepare all-by-all list
            self.printer.write("Preparing all-by-all data")
            data_len, data = rdmcl.prepare_all_by_all(seqbuddy, psipred_dfs, self.cpus)

            if num_subjobs == 1 and data_len > self.cpus * self.job_size_coff:
                data_len, data, subjob_num, num_subjobs = self.spawn_subjobs(id_hash, data, psipred_dfs,
                                                                             gap_open, gap_extend)
            elif subjob_num > 1:
                data_len, data = self.load_subjob(id_hash, subjob_num, num_subjobs, psipred_dfs)

            # Launch multicore
            self.printer.write("Running all-by-all data (%s comparisons)" % data_len)
            with open(self.data_file, "w") as ofile:
                ofile.write("seq1,seq2,subsmat,psi")

            br.run_multicore_function(data, rdmcl.mc_score_sequences, quiet=True, max_processes=self.cpus,
                                      func_args=[alignment, gap_open, gap_extend, self.data_file])

            self.printer.write("Processing final results")
            self.process_final_results(id_hash, subjob_num, num_subjobs)

            self.running += time.time() - self.split_time
            self.split_time = time.time()

        except (OSError, FileNotFoundError, br.GuessError, ValueError) as err:
            if num_subjobs == 1:
                self.terminate("something wrong with primary cluster %s\n%s" % (full_name, err))
            else:
                with helpers.ExclusiveConnect(self.wrkdb_path) as cursor:
                    cursor.execute("DELETE FROM processing WHERE hash=?", (full_name,))
                continue

    # Broken out of while loop, clean up and terminate worker
    if os.path.isfile(self.data_file):
        os.remove(self.data_file)
    self.terminate("deleted check file")
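# Sketch (not part of the original class): the hidden .Worker_<id>.dat file created in
# start() receives the header "seq1,seq2,subsmat,psi", and the scoring workers append
# rows to it, so intermediate all-by-all scores can be inspected while a job runs.
# The file name below is a hypothetical example, and this assumes the appended rows
# follow that header layout.
def _peek_worker_scores(data_file=".Worker_1.dat"):
    import pandas as pd  # local import; only needed for this inspection sketch
    scores = pd.read_csv(data_file)
    # Highest substitution-matrix scores first
    return scores.sort_values("subsmat", ascending=False)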