def get_data(self, data):
    if data == "cteno_panxs":
        return Sb.make_copy(self._cteno_panxs)
    elif data == "cteno_panxs_aln":
        return Alb.make_copy(self._cteno_panxs_aln)
    elif data == "cteno_ids":
        return deepcopy(self._cteno_ids)
    elif data == "cteno_sim_scores":
        return deepcopy(self._cteno_sim_scores)
    elif data == "ss2_dfs":
        psi_pred_ss2_dfs = Sb.OrderedDict()
        for rec in cteno_panxs.records:
            path = os.path.join(self.resource_path, "psi_pred", "%s.ss2" % rec.id)
            psi_pred_ss2_dfs[rec.id] = pd.read_csv(path, comment="#", header=None, delim_whitespace=True)
            psi_pred_ss2_dfs[rec.id].columns = ["indx", "aa", "ss", "coil_prob", "helix_prob", "sheet_prob"]
        return psi_pred_ss2_dfs
    elif data == "ss2_paths":
        psi_pred_ss2 = Sb.OrderedDict()
        for rec in cteno_panxs.records:
            psi_pred_ss2[rec.id] = os.path.join(self.resource_path, "psi_pred", "%s.ss2" % rec.id)
        return psi_pred_ss2
    else:
        raise AttributeError("Unknown data type: %s" % data)
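# A minimal sketch of how a test might consume the helper above. This is not part of the original file;
# the record ID is only an example taken from the Ctenophore pannexin data set used elsewhere in these tests.
def test_ss2_dataframe_columns(hf):
    ss2_dfs = hf.get_data("ss2_dfs")
    df = ss2_dfs["Bab-PanxαA"]  # one PSI-PRED dataframe per record ID
    assert list(df.columns) == ["indx", "aa", "ss", "coil_prob", "helix_prob", "sheet_prob"]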
def main():
    def fmt(prog):
        return br.CustomHelpFormatter(prog)

    parser = argparse.ArgumentParser(prog="largest_isoform", formatter_class=fmt, add_help=False,
                                     usage=argparse.SUPPRESS, description='''\
\033[1mLargest Isoform\033[m

  Select only the largest isoform from Augustus protein models.

  Pass in a file containing protein sequences with the .t# suffix
  at the end of each sequence ID.

\033[1mUsage\033[m:

  largest_isoform.py "/path/to/sequences"
''')

    # Positional
    parser.add_argument("sequences", help="Specify a sequence file")
    parser.add_argument("-i", "--in_place", action="store_true",
                        help="Overwrite original file. Be sure you want to do this!!")

    in_args = parser.parse_args()

    final_records = []
    seqs = Sb.SeqBuddy(in_args.sequences)
    seqs = Sb.order_ids(seqs)
    iso_id = ".".join(seqs.records[0].id.split(".")[:-1])
    max_seq = seqs.records[0]
    for rec in seqs.records:
        cur_id = ".".join(rec.id.split(".")[:-1])
        if cur_id != iso_id:
            iso_id = cur_id
            final_records.append(max_seq)
            max_seq = rec
        else:
            if len(max_seq.seq) < len(rec.seq):
                max_seq = rec
    final_records.append(max_seq)  # Don't forget the last isoform group

    seqs.records = final_records
    if in_args.in_place:
        seqs.write(in_args.sequences)
    else:
        print(seqs)
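# A quick illustration of the isoform-ID handling used in main() above (hypothetical Augustus-style IDs,
# not from the original file): everything before the final ".t#" suffix identifies the gene, so consecutive
# records sharing that prefix are alternative isoforms and only the longest one is kept.
def _demo_isoform_ids():
    for seq_id in ["g1.t1", "g1.t2", "g2.t1"]:
        print(".".join(seq_id.split(".")[:-1]))  # -> g1, g1, g2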
def test_start_worker_1seq_error(hf, capsys, monkeypatch):
    temp_dir = br.TempDir()
    temp_dir.copy_to("%swork_db.sqlite" % hf.resource_path)
    temp_dir.copy_to("%sheartbeat_db.sqlite" % hf.resource_path)
    worker = launch_worker.Worker(temp_dir.path, heartrate=1, max_wait=20)

    work_con = sqlite3.connect(os.path.join(temp_dir.path, "work_db.sqlite"))
    work_cursor = work_con.cursor()
    work_cursor.execute("INSERT INTO waiting (hash, master_id) VALUES ('foo', 2)")
    work_cursor.execute("INSERT INTO queue (hash, psi_pred_dir, align_m, align_p, trimal, gap_open, gap_extend) "
                        "VALUES ('foo', ?, 'clustalo', '', 'gappyout 50 90 clean', 0, 0)",
                        (os.path.join(hf.resource_path, "psi_pred"),))
    work_con.commit()

    # Only a single sequence present
    seqbuddy = hf.get_data("cteno_panxs")
    seqbuddy = Sb.pull_recs(seqbuddy, "Oma-PanxαC")
    seqbuddy.write(os.path.join(worker.output, "foo.seqs"))

    monkeypatch.setattr(launch_worker.Worker, "check_masters", lambda *_, **__: True)
    with pytest.raises(SystemExit):
        worker.start()

    out, err = capsys.readouterr()
    assert "Queued job of size 1 encountered: foo" in out
    work_con.close()
def test_main_strip_taxa(monkeypatch, hf, capsys):
    tmp_file = br.TempFile()
    seqbuddy = Sb.SeqBuddy(os.path.join(hf.resource_path, "Cteno_pannexins.fa"))
    seqbuddy = Sb.rename(seqbuddy, r"^.*?\-")
    tmp_file.write(str(seqbuddy))

    argv = ['rdmcl.py', os.path.join(hf.resource_path, "final_clusters.txt"), tmp_file.path, "-s"]
    monkeypatch.setattr(sys, "argv", argv)
    group_by_cluster.main()

    out, err = capsys.readouterr()
    assert hf.string2hash(out) == "3020ea067affd21c77b7446f35689a6a", print(out)
def make_msa(seqbuddy, aligner, trimal=()):
    """
    Create a multiple sequence alignment
    :param seqbuddy: SeqBuddy object
    :param aligner: path to alignment program
    :param trimal: List of TrimAl thresholds to try
    :return: AlignBuddy object
    """
    trimal = trimal if trimal else ["clean"]
    if len(seqbuddy) == 1:
        alignment = Alb.AlignBuddy(str(seqbuddy))
    else:
        alignment = Alb.generate_msa(Sb.make_copy(seqbuddy), aligner, quiet=True)
        ave_seq_length = Sb.ave_seq_length(seqbuddy)
        for threshold in trimal:
            align_copy = Alb.trimal(Alb.make_copy(alignment), threshold=threshold)
            cleaned_seqs = Sb.clean_seq(Sb.SeqBuddy(str(align_copy)))
            cleaned_seqs = Sb.delete_small(cleaned_seqs, 1)
            # Structured this way for unit test purposes
            if len(alignment.records()) != len(cleaned_seqs):
                continue
            elif Sb.ave_seq_length(cleaned_seqs) / ave_seq_length < 0.5:
                continue
            else:
                alignment = align_copy
                break
    return alignment
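# A minimal usage sketch for make_msa (not from the original file): the input path and the threshold list
# are assumptions, and an aligner such as 'clustalo' must be on the PATH for Alb.generate_msa to succeed.
def _demo_make_msa():
    seqs = Sb.SeqBuddy("my_cluster.fa")  # hypothetical FASTA of homologous protein sequences
    aln = make_msa(seqs, "clustalo", trimal=["gappyout", 0.5, "clean"])
    # If every trimming threshold either drops a sequence entirely or shrinks the average sequence
    # length below 50% of the original, the untrimmed alignment is returned instead.
    print(aln)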
def test_start_worker_fetch_queue(hf, capsys, monkeypatch):
    temp_dir = br.TempDir()
    temp_dir.copy_to("%swork_db.sqlite" % hf.resource_path)
    temp_dir.copy_to("%sheartbeat_db.sqlite" % hf.resource_path)

    def kill(*args):
        self = args[0]
        os.remove(self.worker_file)
        return

    monkeypatch.setattr(launch_worker.Worker, "process_final_results", kill)
    worker = launch_worker.Worker(temp_dir.path, heartrate=1, max_wait=1)

    work_con = sqlite3.connect(os.path.join(temp_dir.path, "work_db.sqlite"))
    work_cursor = work_con.cursor()
    work_cursor.execute("INSERT INTO waiting (hash, master_id) VALUES ('foo', 2)")
    work_cursor.execute("INSERT INTO queue (hash, psi_pred_dir, align_m, align_p, trimal, gap_open, gap_extend) "
                        "VALUES ('foo', ?, 'clustalo', '', 'gappyout 50 90 clean', 0, 0)",
                        (os.path.join(hf.resource_path, "psi_pred"),))
    work_con.commit()

    seqbuddy = hf.get_data("cteno_panxs")
    seqbuddy = Sb.pull_recs(seqbuddy, "Oma")  # Only 4 records, which means 6 comparisons
    seqbuddy.write(os.path.join(worker.output, "foo.seqs"))

    with pytest.raises(SystemExit):
        worker.start()

    out, err = capsys.readouterr()
    assert "Running foo" in out
    assert "Creating MSA (4 seqs)" in out
    assert "Trimal (4 seqs)" in out
    assert os.path.isfile(os.path.join(worker.output, "foo.aln"))
    assert "Updating 4 psipred dataframes" in out
    assert "Preparing all-by-all data" in out
    assert "Running all-by-all data (6 comparisons)" in out
    assert "Processing final results" in out
    work_con.close()
def mc_blast(records_list, args):
    # separate the args into their respective variables
    database, outfile = args
    # set temp_file as a buddy resource variable
    temp_file = br.TempFile()
    # set this variable to each record in records_list -- fasta format
    sub_input_seqs = sb.SeqBuddy(records_list, out_format='fasta')
    # write each sequence/record name to the temp_file of a certain path
    sub_input_seqs.write(temp_file.path)
    # generic blastp command for each file, blastdb used
    blast_cmd = "blastp -query %s -db %s -num_threads 3 -max_target_seqs 1 -outfmt 6" % (temp_file.path, database)
    # utilize Popen to write the full blastp command to execute
    output = Popen(blast_cmd, stdout=PIPE, shell=True).communicate()
    # output = [stdout, stderr] - get stdout and decode
    output = output[0].decode()
    # write to file while locked so no other processes can write at the same time
    with lock:
        with open(outfile, 'a') as ofile:
            ofile.write(output)
    return
num_duplications_range = num_duplications_range if len(num_duplications_range) == 1 \
    else list(range(num_duplications_range[0], num_duplications_range[1] + 1))

models = in_args.models
alphas = make_range_from_inargs(in_args.alpha)

category_range = sorted(in_args.categories)
category_range = category_range if len(category_range) == 1 \
    else list(range(category_range[0], category_range[1] + 1))

seed_file = in_args.seed_file
assert os.path.exists(seed_file)
with open(seed_file, 'r') as seed_io:
    seed_seq = seed_io.read()
seed_seq = str(Sb.clean_seq(Sb.SeqBuddy(seed_seq, out_format='raw')))
seed_seq = seed_seq.upper().strip()

# ugly-ass loop
arguments = []
for grp in group_range:
    for tax in taxa_range:
        for mdl in models:
            for gbr in gene_branch_len:
                for gstdv in gene_branch_stdev:
                    for sbr in species_branch_len:
                        for sstdv in species_branch_stdev:
                            for alp in alphas:
                                for cat in category_range:
                                    for drp in drop_chances:
                                        for ndr in num_drops_range:
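# The nested loops above can be flattened with itertools.product. This is a sketch, not the original code,
# and it assumes the (truncated) innermost body simply collects one argument tuple per parameter combination.
from itertools import product

def _demo_flatten_parameter_grid():
    arguments = []
    for grp, tax, mdl, gbr, gstdv, sbr, sstdv, alp, cat, drp, ndr in product(
            group_range, taxa_range, models, gene_branch_len, gene_branch_stdev,
            species_branch_len, species_branch_stdev, alphas, category_range,
            drop_chances, num_drops_range):
        arguments.append((grp, tax, mdl, gbr, gstdv, sbr, sstdv, alp, cat, drp, ndr))
    return arguments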
                        help="Print out the result of each tool")
    parser.add_argument("-p", "--pause", action="store_true",
                        help="Stop execution until 'return' key pressed (only works in combination with -v)")

    in_args = parser.parse_args()

    # Validate input reference file
    if not os.path.isfile(in_args.reference):
        sys.stderr.write("Error: Reference file does not exist\n")
        sys.exit()

    seqbuddy = Sb.SeqBuddy(in_args.reference)
    if seqbuddy.alpha != IUPAC.ambiguous_dna:
        sys.stderr.write("Error: Reference file must be DNA\n")
        sys.exit()

    if seqbuddy.in_format not in ["genbank", "gb"]:
        sys.stderr.write("Error: Reference file must be GenBank format\n")
        sys.exit()

    # Create or load all necessary reference files
    ref_dir = "{0}{1}reference{1}".format(os.path.dirname(os.path.realpath(__file__)), os.path.sep)
    ref_name = in_args.reference.split(os.sep)[-1]
    ref_name = os.path.splitext(ref_name)[0]

    if not os.path.isfile("%s%s.gb" % (ref_dir, ref_name)):
                                 '--speed up the process')
parser.add_argument('input_file', help='transdecoder file')
parser.add_argument('database', help='blastp database path')
parser.add_argument('num_cores', type=int, help='number of cores')
parser.add_argument('-gs', '--group_size', type=int, help='group size')
parser.add_argument('-o', '--out_file', default='blastp.outfmt6', help='output file')
parser.add_argument('-q', '--quiet', help='suppress run time output counter', action='store_true')

in_args = parser.parse_args()

# sb.SeqBuddy(input_file) creates a SeqBuddy object holding the input sequences
input_seqs = sb.SeqBuddy(in_args.input_file)

# number of 'groups' of cores -- we divide the total number requested by 3 (number of jobs to perform at once)
# we have to floor it b/c remainders/leftover cores are not allowed -- need at least 3 cores per job
num_cores = floor(in_args.num_cores / 3)

# if a group size was specified, use it -- otherwise default to ceil(len(input_seqs) / num_cores)
# You should usually try to specify group sizes
group_size = ceil(len(input_seqs) / num_cores) if not in_args.group_size else in_args.group_size

# specifies which records/seqs are in each group based on group size -- list comprehension
records_list = [input_seqs.records[i:i + group_size]
                for i in range(0, len(input_seqs.records), group_size)]

##########
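# A sketch (not the original code) of how the record groups built above might be handed off to mc_blast.
# It mirrors the func_args convention used with br.run_multicore_function elsewhere in this code base;
# the module-level Lock is an assumption needed by mc_blast's `with lock:` block.
from multiprocessing import Lock

lock = Lock()  # shared by forked worker processes

def _demo_dispatch_blast_jobs():
    br.run_multicore_function(records_list, mc_blast,
                              func_args=[in_args.database, in_args.out_file],
                              max_processes=num_cores, quiet=in_args.quiet)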
def start(self):
    self.split_time = time.time()
    self.start_time = time.time()
    self.heartbeat.start()

    self.worker_file = os.path.join(self.working_dir, "Worker_%s" % self.heartbeat.id)
    with open(self.worker_file, "w") as ofile:
        ofile.write("To terminate this Worker, simply delete this file.")

    self.data_file = os.path.join(self.working_dir, ".Worker_%s.dat" % self.heartbeat.id)
    open(self.data_file, "w").close()

    helpers.dummy_func()

    self.last_heartbeat_from_master = time.time()
    self.printer.write("Starting Worker_%s" % self.heartbeat.id)
    self.printer.new_line(1)

    idle_countdown = 1
    while os.path.isfile(self.worker_file):
        idle = round(100 * self.idle / (self.idle + self.running), 2)
        if not idle_countdown:
            self.printer.write("Idle %s%%" % idle)
            idle_countdown = 5

        # Make sure there are some masters still kicking around
        self.check_masters(idle)

        # Check for and clean up dead threads and orphaned jobs every twentieth(ish) time through
        rand_check = random()
        if rand_check > 0.95:
            self.clean_dead_threads()

        # Fetch a job from the queue
        data = self.fetch_queue_job()
        if data:
            full_name, psipred_dir, align_m, align_p, trimal, gap_open, gap_extend = data
            subjob_num, num_subjobs, id_hash = [1, 1, full_name] if len(full_name.split("_")) == 1 \
                else full_name.split("_")
            subjob_num = int(subjob_num)
            num_subjobs = int(num_subjobs)
            self.printer.write("Running %s" % full_name)
        else:
            time.sleep(random() * self.idle_workers())  # Pause for some time relative to num idle workers
            idle_countdown -= 1
            self.idle += time.time() - self.split_time
            self.split_time = time.time()
            continue

        try:
            idle_countdown = 1
            seqbuddy = Sb.SeqBuddy("%s/%s.seqs" % (self.output, id_hash), in_format="fasta")

            # Prepare alignment
            if len(seqbuddy) == 1:
                raise ValueError("Queued job of size 1 encountered: %s" % id_hash)
            else:
                if num_subjobs == 1:
                    self.printer.write("Creating MSA (%s seqs)" % len(seqbuddy))
                    alignment = Alb.generate_msa(Sb.make_copy(seqbuddy), align_m, params=align_p, quiet=True)
                else:
                    self.printer.write("Reading MSA (%s seqs)" % len(seqbuddy))
                    alignment = Alb.AlignBuddy(os.path.join(self.output, "%s.aln" % id_hash))

            # Prepare psipred dataframes
            psipred_dfs = self.prepare_psipred_dfs(seqbuddy, psipred_dir)

            if num_subjobs == 1:  # This is starting a full job from scratch, not a sub-job
                # Need to specify what columns the PsiPred files map to now that there are gaps
                psipred_dfs = rdmcl.update_psipred(alignment, psipred_dfs, "msa")

                # TrimAl
                self.printer.write("Trimal (%s seqs)" % len(seqbuddy))
                alignment = rdmcl.trimal(seqbuddy, trimal, alignment)

                with helpers.ExclusiveConnect(os.path.join(self.output, "write.lock"), max_lock=0):
                    # Place these write commands in ExclusiveConnect to ensure a writing lock
                    if not os.path.isfile(os.path.join(self.output, "%s.aln" % id_hash)):
                        alignment.write(os.path.join(self.output, "%s.aln" % id_hash), out_format="fasta")

                # Re-update PsiPred files now that some columns, possibly including non-gap characters, are removed
                self.printer.write("Updating %s psipred dataframes" % len(seqbuddy))
                psipred_dfs = rdmcl.update_psipred(alignment, psipred_dfs, "trimal")

            # Prepare all-by-all list
            self.printer.write("Preparing all-by-all data")
            data_len, data = rdmcl.prepare_all_by_all(seqbuddy, psipred_dfs, self.cpus)

            if num_subjobs == 1 and data_len > self.cpus * self.job_size_coff:
                data_len, data, subjob_num, num_subjobs = self.spawn_subjobs(id_hash, data, psipred_dfs,
                                                                             gap_open, gap_extend)
            elif subjob_num > 1:
                data_len, data = self.load_subjob(id_hash, subjob_num, num_subjobs, psipred_dfs)

            # Launch multicore
            self.printer.write("Running all-by-all data (%s comparisons)" % data_len)
            with open(self.data_file, "w") as ofile:
                ofile.write("seq1,seq2,subsmat,psi")

            br.run_multicore_function(data, rdmcl.mc_score_sequences, quiet=True, max_processes=self.cpus,
                                      func_args=[alignment, gap_open, gap_extend, self.data_file])

            self.printer.write("Processing final results")
            self.process_final_results(id_hash, subjob_num, num_subjobs)

            self.running += time.time() - self.split_time
            self.split_time = time.time()

        except (OSError, FileNotFoundError, br.GuessError, ValueError) as err:
            if num_subjobs == 1:
                self.terminate("something wrong with primary cluster %s\n%s" % (full_name, err))
            else:
                with helpers.ExclusiveConnect(self.wrkdb_path) as cursor:
                    cursor.execute("DELETE FROM processing WHERE hash=?", (full_name,))
                continue

    # Broken out of while loop, clean up and terminate worker
    if os.path.isfile(self.data_file):
        os.remove(self.data_file)

    self.terminate("deleted check file")
def main():
    def fmt(prog):
        return br.CustomHelpFormatter(prog)

    parser = argparse.ArgumentParser(prog="run_psipred", formatter_class=fmt, add_help=False,
                                     usage=argparse.SUPPRESS, description='''\
\033[1mRun PSI-PRED\033[m

  For Sofia, to do awesome stuff with

  Pass in a file of sequences, get secondary structure in return.

\033[1mUsage\033[m:

  run_psipred.py "/path/to/seqs" [-options]
''')

    # Positional
    positional = parser.add_argument_group(title="\033[1mPositional argument\033[m")
    positional.add_argument("seqs", help="Specify sequence file (most formats accepted)")
    positional.add_argument("save_ss2", action="store", help="Specify directory to save/read ss2 files.")

    # Optional commands
    parser_flags = parser.add_argument_group(title="\033[1mAvailable commands\033[m")
    parser_flags.add_argument("-cpu", "--max_cpus", type=int, action="store", default=CPUS, metavar="",
                              help="Specify the maximum number of cores RD-MCL can use (default=%s)" % CPUS)

    # Misc
    misc = parser.add_argument_group(title="\033[1mMisc options\033[m")
    misc.add_argument('-v', '--version', action='version', version="1.0")
    misc.add_argument('-h', '--help', action="help", help="Show this help message and exit")

    in_args = parser.parse_args()

    sequences = Sb.SeqBuddy(in_args.seqs)
    if not in_args.save_ss2:
        ss2_files = br.TempDir().path
    else:
        ss2_files = os.path.abspath(in_args.save_ss2)
        os.makedirs(ss2_files, exist_ok=True)

    br.run_multicore_function(sequences.records, mc_psi_pred, [ss2_files], max_processes=in_args.max_cpus)
def test_make_msa(hf, monkeypatch):
    seqbuddy = hf.get_data("cteno_panxs")
    seqbuddy.records = seqbuddy.records[:2]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
>Bab-PanxαB Be_abyssicola|m.19|ML47742|1063 2.
--MLDILSKFKGVTPFKGITIDDGWDQLNRSFMFVLLVVMGTTVTVRQYTGSVISCDGFK
KFGSTFAEDYCWTQGLYTVLEGYDQPSYNIPYPGLLPDELPACTPVKLKDGTRLKCPDAD
QLMSPTRISHLWYQWVPFYFWLAAAAFFMPYLLYKNFGMGDIKPLVRLLHNPVESDQ--E
LKKMTDKAATWLFYKFDLYMSEQSLVASLTRKHGLGLSMVFVKILYAAVSFCCFILTAEM
FSIGDFKTYGSKWIKKMRYEDTLATEEKDKLFPKMVACEVKRWGASGIEEEQGMCVLAPN
VINQYLFLILWFCLVFVMICNIVSIFVSLIKLLFTYGSYRRLLST-AFLRDDSAIKHMYF
NVGSSGRLILHVLANNTAPRVFEDILLTLAPKLIQRKLRGNGKAV------
"""

    seqbuddy.records = [seqbuddy.records[0]]
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo")
    assert type(alb_obj) == Alb.AlignBuddy
    assert str(alb_obj) == """\
>Bab-PanxαA Be_abyssicola|m.8 and m.21|ML036514|937+ 2.
MLLLGSLGTIKNLSIFKDLSLDDWLDQMNRTFMFLLLCFMGTIVAVSQYTGKNISCNGFE
KFSDDFSQDYCWTQGLYTIKEAYDLPESQIPYPGIIPENVPACREHSLKNGGKIICPPPE
EIKPLTRARHLWYQWIPFYFWVIAPVFYLPYMFVKRMGLDRMKPLLKIMSDYYHCTTETP
SEEIIVKCADWVYNSIVDRLSEGSSWTSWRNRHGLGLAVLFSKLMYLGGSILVMMVTTLM
FQVGDFKTYGIEWLKQFPSDENYTTSVKHKLFPKMVACEIKRWGPSGLEEENGMCVLAPN
VIYQYIFLIMWFALAITICTNFFNIFFWVFKLTATRYTYSKLVATGHFSHKHPGWKFMYY
RIGTSGRVLLNIVAQNTNPIIFGAIMEKLTPSVIKHLRIGHVPGEYLTDPA
"""

    # Don't modify if any sequence is reduced to nothing
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STP---YWAILP
""", in_format="fasta")
    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)
    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Don't modify if average sequence length is reduced by more than half
    align = Alb.AlignBuddy("""\
>A
MSTGTC-------
>B
M---TC-------
>C
M---TC---AILP
>D
-STPTC-YWAILP
""", in_format="fasta")
    seqbuddy = Sb.SeqBuddy(Alb.make_copy(align).records(), in_format="fasta")
    seqbuddy = Sb.clean_seq(seqbuddy)
    monkeypatch.setattr(Alb, "generate_msa", lambda *_, **__: align)
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3])
    assert str(alb_obj) == str(align)

    # Remove some gaps
    alb_obj = group_by_cluster.make_msa(seqbuddy, "clustalo", trimal=[0.3, 0.55])
    assert str(alb_obj) == """\
"Stop execution until 'return' key pressed (only works in combination with -v)" ) parser.add_argument("-to", "--timeout", action='store', default=31536000, type=int, help="Set max execution time") in_args = parser.parse_args() # Validate input reference file if not os.path.isfile(in_args.reference): sys.stderr("Error: Reference file does not exist\n") sys.exit() seqbuddy = Sb.SeqBuddy(in_args.reference) if seqbuddy.alpha != IUPAC.ambiguous_dna: sys.stderr("Error: Reference file must be DNA\n") sys.exit() if seqbuddy.in_format not in ["genbank", "gb"]: sys.stderr("Error: Reference file must be GenBank format\n") sys.exit() # Create or load all necessary reference files ref_dir = "{0}{1}reference{1}".format( os.path.dirname(os.path.realpath(__file__)), os.path.sep) ref_name = in_args.reference.split(os.sep)[-1] ref_name = os.path.splitext(ref_name)[0] res_dir = "{0}{1}results{1}{2}{1}".format(
def main():
    in_args = argparse_init()

    mode = in_args.mode.lower()
    mode = "seqs" if "sequences".startswith(mode) else mode
    mode = "aln" if "alignment".startswith(mode) else mode
    mode = "con" if "consensus".startswith(mode) else mode
    mode = "list" if "list".startswith(mode) else mode

    if mode not in ["seqs", "aln", "con", "list"]:
        Sb.br._stderr('Unrecognized mode, please select from ["seqs", "aln", "con", "list"].\n')
        sys.exit()

    if in_args.groups:
        in_args.groups = [x.lower() for x in in_args.groups[0]]
        in_args.groups = "^%s$" % "$|^".join(in_args.groups)

    cluster_file = prepare_clusters(in_args.clusters, hierarchy=True)
    seqbuddy = Sb.SeqBuddy(in_args.sequence_file)

    output = OrderedDict()
    for rank, node in cluster_file.items():
        rank = rank.split()[0]
        if in_args.groups:
            if not re.search(in_args.groups, rank):
                continue

        if in_args.min_size:
            if len(node) < in_args.min_size:
                continue

        if in_args.max_size:
            if len(node) > in_args.max_size:
                continue

        if in_args.strip_taxa:
            node = [re.sub(r"^.*?\-", "", x) for x in node]

        ids = "^%s$" % "$|^".join(node)
        subset = Sb.pull_recs(Sb.make_copy(seqbuddy), ids)
        subset = Sb.order_ids(subset)

        rank_output = ""
        if mode == "list":
            rank_output += rank
            for rec in subset.records:
                rec.description = re.sub("^%s" % rec.id, "", rec.description)
                rank_output += "\n%s %s" % (rec.id, rec.description)
            rank_output += "\n"

        elif mode == "seqs":
            for rec in subset.records:
                rec.description = "%s %s" % (rank, rec.description)
            rank_output += str(subset)

        elif mode in ["aln", "con"]:
            try:
                rank_output = make_msa(subset, in_args.aligner, in_args.trimal)
            except (SystemError, AttributeError) as err:
                print(err)
                sys.exit()
            rank_output.out_format = "phylip-relaxed"

            if mode == "con":
                rec = Alb.consensus_sequence(rank_output).records()[0]
                rec.id = rank
                rec.name = rank
                rec.description = ""
                rank_output.out_format = "fasta"

        output[rank] = str(rank_output)

    if not in_args.write:
        print("\n".join(data for rank, data in output.items()).strip())
    else:
        outdir = os.path.abspath(in_args.write)
        os.makedirs(outdir, exist_ok=True)
        extension = ".%s" % seqbuddy.out_format[:3] if mode == "seqs" \
            else ".txt" if mode == "list" \
            else ".phy" if mode == "aln" \
            else ".fa"

        for rank, data in output.items():
            with open(os.path.join(outdir, rank + extension), "w") as ofile:
                ofile.write(data)
Bfr 4
Cfu 6
Dgl 9
Edu 9
Hca 8
Hru 5
Hvu 14
Lcr 12
Lla 3
Mle 12
Oma 4
Pba 7
Tin 6
Vpa 7
'''

cteno_panxs = Sb.SeqBuddy("%s%sCteno_pannexins.fa" % (RESOURCE_PATH, SEP))
cteno_panxs_aln = Alb.AlignBuddy("%s%sCteno_pannexins_aln.fa" % (RESOURCE_PATH, SEP))
ids = sorted([rec.id for rec in cteno_panxs.records])
sim_scores = pd.read_csv("%sCteno_pannexins_sim.scores" % RESOURCE_PATH, index_col=False, header=None)
sim_scores.columns = ["seq1", "seq2", "subsmat", "psi", "raw_score", "score"]


# ################################# - Helper class - ################################## #
class HelperMethods(object):
    def __init__(self):
        self.sep = SEP
        self.resource_path = RESOURCE_PATH
        self._cteno_panxs = cteno_panxs
seq_files = []
for _file in files:
    extension = _file.split(".")[-1]
    name = re.sub(extension, "", _file.split("/")[-1])
    if extension in in_args.extensions and name not in prev_blast_dbs:
        seq_files.append("%s/%s" % (in_args.indir, _file))

print("***Hashing proteomes***")
chars = string.ascii_uppercase + string.digits
for i in range(len(seq_files)):
    _file = seq_files[i]
    name = _file.split("/")[-1]
    name = "_".join(name.split(".")[:-1])
    seqbuddy = Sb.SeqBuddy(_file)
    seqbuddy = Sb.clean_seq(seqbuddy)

    if "%s/blastdbs/%s" % (in_args.outdir, name) in prev_blast_dbs:
        for record in seqbuddy.records:
            if not in_args.original_names:
                record.id = reverse_hash_map["%s@%s" % (name, record.id)]
            else:
                record.id = reverse_hash_map[record.id]
        prev_records_list += seqbuddy.records
        continue

    print(name)
    for indx, rec in enumerate(seqbuddy.records):
        while True:
            new_hash = "".join([random.choice(chars) for _ in range(10)])