def test_compute_seqs_per_file(self):
    """compute_seqs_per_file functions as expected

    Writes a 25-record fasta file to a temp path, then checks the
    sequences-per-file value computed for several requested file counts.
    """
    temp_fasta_fp = get_tmp_filename(
        prefix='QiimeScriptUtilTests', suffix='.fasta')
    temp_fasta = ['>seq', 'AAACCCCAAATTGG'] * 25
    # Close the handle explicitly so the data is guaranteed to be flushed
    # to disk before compute_seqs_per_file reads the file. The original
    # open(...).write(...) relied on the garbage collector to close the
    # handle, which is not guaranteed to happen before the read.
    f = open(temp_fasta_fp, 'w')
    try:
        f.write('\n'.join(temp_fasta))
    finally:
        f.close()

    actual_25 = compute_seqs_per_file(temp_fasta_fp, 25)
    actual_2 = compute_seqs_per_file(temp_fasta_fp, 2)
    actual_10 = compute_seqs_per_file(temp_fasta_fp, 10)
    actual_5 = compute_seqs_per_file(temp_fasta_fp, 5)
    actual_40 = compute_seqs_per_file(temp_fasta_fp, 40)
    # clean up the temp file before asserting (matches original ordering)
    remove(temp_fasta_fp)

    # 25 input seqs split across n files
    self.assertEqual(actual_25, 1)
    self.assertEqual(actual_2, 13)
    self.assertEqual(actual_10, 3)
    self.assertEqual(actual_5, 5)
    self.assertEqual(actual_40, 1)
def test_compute_seqs_per_file(self):
    """compute_seqs_per_file functions as expected
    """
    fasta_fp = get_tmp_filename(
        prefix='QiimeScriptUtilTests', suffix='.fasta')
    records = ['>seq', 'AAACCCCAAATTGG'] * 25
    open(fasta_fp, 'w').write('\n'.join(records))
    # query several target file counts against the 25-seq input
    observed = [compute_seqs_per_file(fasta_fp, count)
                for count in (25, 2, 10, 5, 40)]
    remove(fasta_fp)
    self.assertEqual(observed, [1, 13, 3, 5, 1])
def _split_fasta(self, input_fp, jobs_to_start, job_prefix, output_dir):
    """Split input_fp into roughly jobs_to_start pieces in output_dir.

    Returns (list of split fasta filepaths, True).
    """
    # how many sequences each split file should contain so that the
    # input is divided across approximately jobs_to_start files
    seqs_per_file = compute_seqs_per_file(input_fp, jobs_to_start)
    # write the split files into output_dir and collect their paths
    split_fps = split_fasta(open(input_fp), seqs_per_file, job_prefix,
                            working_dir=output_dir)
    return split_fps, True
def main():
    """Parallel BLAST-based OTU picking driver.

    Splits the input fasta across jobs, writes a jobs file, submits it
    via the cluster-jobs script, and (unless suppressed) sets up a
    poller to merge per-job outputs and clean up temp files.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # one of a prebuilt blast db or a reference seqs file is required
    if opts.blast_db == None and opts.refseqs_fp == None:
        option_parser.error('Either blast_db or refseqs_fp must be provided.')

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    pick_otus_fp = opts.pick_otus_fp
    refseqs_fp = opts.refseqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    max_e_value = opts.max_e_value
    similarity = opts.similarity
    poll_directly = opts.poll_directly
    min_aligned_percent = opts.min_aligned_percent

    # accumulates every temp path created so it can be written to the
    # deletion list for the poller to clean up
    created_temp_paths = []

    if not opts.blast_db:
        # Build the blast database from the reference_seqs_fp -- all procs
        # will then access one db rather than create one per proc
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(refseqs_fp)
        created_temp_paths += db_files_to_remove
    else:
        blast_db = opts.blast_db

    # split the input filepath into directory and filename, base filename
    # and extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with POTU
    job_prefix = opts.job_prefix or get_random_job_prefix('POTU')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir,job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp,jobs_to_start)

    # split the fasta files and get the list of resulting files
    # NOTE(review): the split files are written to output_dir, not
    # working_dir, despite the comment above about a temporary output
    # directory -- confirm this is intended
    tmp_fasta_fps =\
        split_fasta(open(input_fasta_fp),num_seqs_per_file,\
        job_prefix,working_dir=output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes and the list
    # of output files generated by each job
    commands, job_result_filepaths = \
        get_job_commands(python_exe_fp,pick_otus_fp,tmp_fasta_fps,
        output_dir,blast_db,job_prefix,working_dir,max_e_value,similarity,
        min_aligned_percent)
    created_temp_paths += job_result_filepaths

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths,expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which described how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        process_run_results_f =\
            'qiime.parallel.pick_otus_blast.parallel_blast_process_run_results_f'
        write_merge_map_file_pick_otus(job_result_filepaths,output_dir,\
            merge_map_filepath,input_file_basename)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            # poller runs as one more submitted job
            poller_command, poller_result_filepaths =\
                get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
                merge_map_filepath,deletion_list_filepath,process_run_results_f,\
                seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            # poller will be invoked directly below, so no job-submission
            # prefix/suffix is added to the command
            poller_command, poller_result_filepaths =\
                get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
                merge_map_filepath,deletion_list_filepath,process_run_results_f,\
                seconds_to_sleep=seconds_to_sleep,command_prefix='',command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([],deletion_list_filepath)

    # write the commands to the 'jobs files'
    write_jobs_file(commands,job_prefix=job_prefix,jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp,jobs_fp,job_prefix)

    # NOTE(review): poller_command is only assigned inside the
    # "if not suppress_polling" block above -- if suppress_polling and
    # poll_directly are both set, the call below raises NameError; confirm
    # whether that combination should be rejected earlier
    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occuring when calling the poller directly. '+\
                'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
def main():
    """Parallel RDP taxonomy assignment driver.

    Splits the input fasta across jobs, writes a jobs file, submits it
    via the cluster-jobs script, and (unless suppressed) sets up a
    poller to merge per-job outputs and clean up temp files.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    assign_taxonomy_fp = opts.assign_taxonomy_fp
    confidence = opts.confidence
    rdp_classifier_fp = opts.rdp_classifier_fp
    id_to_taxonomy_fp = opts.id_to_taxonomy_fp
    reference_seqs_fp = opts.reference_seqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly

    # validate input filepaths before doing any work
    if not isfile(input_fasta_fp):
        raise ValueError('This file does not exists: %s' % input_fasta_fp)

    # if either custom training file is given, both must be given and exist
    if id_to_taxonomy_fp or reference_seqs_fp:
        if not id_to_taxonomy_fp or not isfile(id_to_taxonomy_fp):
            raise ValueError('This file does not exits: %s' % id_to_taxonomy_fp)
        if not reference_seqs_fp or not isfile(reference_seqs_fp):
            raise ValueError('This file does not exits: %s' % reference_seqs_fp)

    try:
        makedirs(output_dir)
    except OSError:
        # output dir already exists
        pass

    # accumulates every temp path created so it can be written to the
    # deletion list for the poller to clean up
    created_temp_paths = []

    # split the input filepath into directory and filename, base filename
    # and extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with RDP
    job_prefix = opts.job_prefix or get_random_job_prefix('RDP')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir,job_prefix)
    try:
        # NOTE(review): uses mkdir here where the sibling scripts use
        # makedirs -- fails if output_dir's parents are missing; confirm
        mkdir(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp,jobs_to_start)

    # split the fasta files and get the list of resulting files
    tmp_fasta_fps =\
        split_fasta(open(input_fasta_fp),num_seqs_per_file,job_prefix,output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes
    commands, job_result_filepaths = \
        get_commands(python_exe_fp,assign_taxonomy_fp,confidence,job_prefix,\
        tmp_fasta_fps,rdp_classifier_fp,output_dir,working_dir,\
        id_to_taxonomy_fp=id_to_taxonomy_fp,reference_seqs_fp=reference_seqs_fp)
    created_temp_paths += job_result_filepaths

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths,expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which described how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        write_merge_map_file_assign_taxonomy(job_result_filepaths,output_dir,\
            merge_map_filepath,input_file_basename)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            # poller runs as one more submitted job
            poller_command, poller_result_filepaths =\
                get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
                merge_map_filepath,deletion_list_filepath,\
                seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            # poller will be invoked directly below, so no job-submission
            # prefix/suffix is added to the command
            poller_command, poller_result_filepaths =\
                get_poller_command(python_exe_fp,poller_fp,\
                expected_files_filepath,merge_map_filepath,\
                deletion_list_filepath,seconds_to_sleep=seconds_to_sleep,\
                command_prefix='',command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([],deletion_list_filepath)

    # write the commands to the 'jobs files'
    write_jobs_file(commands,job_prefix=job_prefix,jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp,jobs_fp,job_prefix)

    # NOTE(review): poller_command is only assigned inside the
    # "if not suppress_polling" block above -- if suppress_polling and
    # poll_directly are both set, the call below raises NameError; confirm
    # whether that combination should be rejected earlier
    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occuring when calling the poller directly. '+\
                'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
def main():
    """Parallel uclust_ref OTU picking driver.

    Splits the input fasta across jobs, writes a jobs file, submits it
    via the cluster-jobs script, and (unless suppressed) sets up a
    poller to merge per-job outputs and clean up temp files.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    pick_otus_fp = opts.pick_otus_fp
    refseqs_fp = opts.refseqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    similarity = opts.similarity
    poll_directly = opts.poll_directly
    uclust_stable_sort = not opts.suppress_uclust_stable_sort
    save_uc_files = opts.save_uc_files
    enable_rev_strand_match = opts.enable_rev_strand_match
    optimal_uclust = opts.optimal_uclust
    exact_uclust = opts.exact_uclust
    max_accepts = opts.max_accepts
    max_rejects = opts.max_rejects
    stepwords = opts.stepwords
    word_length = opts.word_length

    # accumulates every temp path created so it can be written to the
    # deletion list for the poller to clean up
    created_temp_paths = []

    # split the input filepath into directory and filename, base filename
    # and extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with POTU
    job_prefix = opts.job_prefix or get_random_job_prefix('POTU')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir,job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp,jobs_to_start)

    # split the fasta files and get the list of resulting files
    tmp_fasta_fps =\
        split_fasta(open(input_fasta_fp),num_seqs_per_file,\
        job_prefix,working_dir=output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes and the list
    # of output files generated by each job
    commands, job_result_filepaths = \
        get_job_commands(python_exe_fp,pick_otus_fp,tmp_fasta_fps,
        output_dir,refseqs_fp,job_prefix,working_dir,similarity,
        enable_rev_strand_match,optimal_uclust,exact_uclust,max_accepts,max_rejects,
        stepwords, word_length, uclust_stable_sort, save_uc_files)

    if save_uc_files:
        # keep any .uc files that get created: register every result file
        # EXCEPT the .uc files as a temp path, so the .uc files survive
        # cleanup
        created_temp_paths +=\
            [fp for fp in job_result_filepaths if not fp.endswith('.uc')]
    else:
        # BUG FIX: previously this was `created_temp_paths +=
        # [job_result_filepaths]`, which appended the whole list as a
        # single element, so none of the job result files were registered
        # individually for deletion. Extend with the filepaths themselves,
        # matching the other parallel scripts.
        created_temp_paths += job_result_filepaths

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths,expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which described how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        process_run_results_f =\
            'qiime.parallel.pick_otus_uclust_ref.parallel_uclust_ref_process_run_results_f'
        write_merge_map_file_pick_otus(job_result_filepaths,output_dir,\
            merge_map_filepath,input_file_basename,failures=True)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            # poller runs as one more submitted job
            poller_command, poller_result_filepaths =\
                get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
                merge_map_filepath,deletion_list_filepath,process_run_results_f,\
                seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            # poller will be invoked directly below, so no job-submission
            # prefix/suffix is added to the command
            poller_command, poller_result_filepaths =\
                get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
                merge_map_filepath,deletion_list_filepath,process_run_results_f,\
                seconds_to_sleep=seconds_to_sleep,command_prefix='',command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([],deletion_list_filepath)

    # write the commands to the 'jobs files'
    write_jobs_file(commands,job_prefix=job_prefix,jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp,jobs_fp,job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occuring when calling the poller directly. '+\
                'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)