def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.blast_db is None and opts.refseqs_fp is None:
        option_parser.error('Either blast_db or refseqs_fp must be provided.')

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    pick_otus_fp = opts.pick_otus_fp
    refseqs_fp = opts.refseqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    max_e_value = opts.max_e_value
    similarity = opts.similarity
    poll_directly = opts.poll_directly
    min_aligned_percent = opts.min_aligned_percent

    created_temp_paths = []

    if not opts.blast_db:
        # Build the blast database from refseqs_fp -- all procs
        # will then access one db rather than create one per proc
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(refseqs_fp)
        created_temp_paths += db_files_to_remove
    else:
        blast_db = opts.blast_db

    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with POTU
    job_prefix = opts.job_prefix or get_random_job_prefix('POTU')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp, jobs_to_start)

    # split the fasta file and get the list of resulting files
    tmp_fasta_fps = \
        split_fasta(open(input_fasta_fp), num_seqs_per_file,
                    job_prefix, working_dir=output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes and the list of
    # output files generated by each job
    commands, job_result_filepaths = \
        get_job_commands(python_exe_fp, pick_otus_fp, tmp_fasta_fps,
                         output_dir, blast_db, job_prefix, working_dir,
                         max_e_value, similarity, min_aligned_percent)
    created_temp_paths += job_result_filepaths

    # Set up the poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        process_run_results_f = \
            'qiime.parallel.pick_otus_blast.parallel_blast_process_run_results_f'
        write_merge_map_file_pick_otus(job_result_filepaths, output_dir,
                                       merge_map_filepath, input_file_basename)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath, merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath, merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep,
                                   command_prefix='', command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths, deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs' file
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. ' +\
                  'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
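# compute_seqs_per_file and split_fasta are imported from QIIME's support
# libraries; the sketch below is NOT the QIIME implementation. It is a
# hedged illustration (hypothetical name _compute_seqs_per_file_sketch) of
# the splitting arithmetic the wrappers above rely on: count the fasta
# records and ceiling-divide by the requested job count so that roughly
# jobs_to_start split files result.
def _compute_seqs_per_file_sketch(input_fasta_fp, num_jobs_to_start):
    """Hypothetical sketch: sequences per split file for a given job count.

    Counts fasta records (lines beginning with '>') and ceiling-divides
    by num_jobs_to_start, never returning less than one.
    """
    seq_count = 0
    for line in open(input_fasta_fp, 'U'):
        if line.startswith('>'):
            seq_count += 1
    # ceiling division without floating point
    return max(1, (seq_count + num_jobs_to_start - 1) // num_jobs_to_start)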
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    input_path = opts.input_path
    output_dir = opts.output_path
    min_seqs = opts.min
    max_seqs = opts.max
    step = opts.step
    num_reps = opts.num_reps
    lineages_included = opts.lineages_included
    single_rarefaction_fp = opts.single_rarefaction_fp
    python_exe_fp = opts.python_exe_fp
    path_to_cluster_jobs = opts.cluster_jobs_fp
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly
    jobs_to_start = opts.jobs_to_start

    created_temp_paths = []

    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fn = split(input_path)
    input_file_basename, input_file_ext = splitext(input_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with RARIF
    job_prefix = opts.job_prefix or get_random_job_prefix('RARIF')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working_dir already exists
        pass

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes
    commands, job_result_filepaths = \
        get_job_commands(python_exe_fp, single_rarefaction_fp, job_prefix,
                         input_path, output_dir, working_dir, min_seqs,
                         max_seqs, step, num_reps, lineages_included,
                         command_prefix=' ', command_suffix=' ')

    # merge the commands into jobs_to_start number of jobs
    commands = merge_to_n_commands(commands, jobs_to_start)

    # Set up the poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file even though no merging is necessary
        # (get_poller_command requires this, but a future version won't)
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        open(merge_map_filepath, 'w').close()
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        if not poll_directly:
            # Generate the command to run the poller, and the list of temp
            # files created by the poller
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath, merge_map_filepath,
                                   deletion_list_filepath,
                                   seconds_to_sleep=seconds_to_sleep)
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath, merge_map_filepath,
                                   deletion_list_filepath,
                                   seconds_to_sleep=seconds_to_sleep,
                                   command_prefix='', command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths, deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs' file
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the user
    if not opts.suppress_submit_jobs:
        submit_jobs(path_to_cluster_jobs, jobs_fp, job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. ' +\
                  'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
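# merge_to_n_commands (used above) is a QIIME utility. The sketch below is
# an assumption about its behavior, not the real implementation (the
# hypothetical name _merge_to_n_commands_sketch and the '; ' delimiter are
# mine): pack a list of shell commands into at most n jobs by joining them
# with ';' so each cluster job runs several rarefactions serially.
def _merge_to_n_commands_sketch(commands, n, delimiter='; '):
    """Hypothetical sketch: distribute commands round-robin into <= n jobs."""
    if not commands:
        return []
    buckets = [[] for _ in range(min(n, len(commands)))]
    for i, cmd in enumerate(commands):
        buckets[i % len(buckets)].append(cmd.strip())
    return [delimiter.join(b) for b in buckets]

# e.g. _merge_to_n_commands_sketch(['a', 'b', 'c'], 2) -> ['a; c', 'b']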
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    input_path = opts.input_path
    output_dir = opts.output_path
    metrics = opts.metrics
    tree_fp = opts.tree_path
    beta_diversity_fp = opts.beta_diversity_fp
    python_exe_fp = opts.python_exe_fp
    path_to_cluster_jobs = opts.cluster_jobs_fp
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly
    jobs_to_start = opts.jobs_to_start

    if isfile(input_path):
        single_otu_table_mode = True
    else:
        single_otu_table_mode = False
        input_fps = glob('%s/*' % input_path)

    created_temp_paths = []

    # split the input filepath into directory and filename, base filename and
    # extension
    # input_path, input_fn = split(input_path)
    # input_file_basename, input_file_ext = splitext(input_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with BDIV
    job_prefix = opts.job_prefix or get_random_job_prefix('BDIV')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # Get the list of commands to be run and the expected result files
    if single_otu_table_mode:
        # Each job computes row dissimilarity matrices: write them under
        # working_dir/i, then move to output_dir/i so the poller knows
        # when each job is done
        for i in range(jobs_to_start):
            makedirs(working_dir + '/' + str(i))
            created_temp_paths.append(working_dir + '/' + str(i))
            makedirs(output_dir + '/' + str(i))
            created_temp_paths.append(output_dir + '/' + str(i))

        # To speed up this step: if not opts.full_tree, call setup here once
        # and then use full_tree=True (not implemented yet)
        commands, job_result_filepaths = \
            get_job_commands_single_otu_table(python_exe_fp,
                beta_diversity_fp, tree_fp, job_prefix, metrics, input_path,
                output_dir, working_dir, jobs_to_start,
                command_prefix=' ', command_suffix=' ',
                full_tree=opts.full_tree)
        created_temp_paths += job_result_filepaths
    else:
        commands, job_result_filepaths = \
            get_job_commands_multiple_otu_tables(python_exe_fp,
                beta_diversity_fp, tree_fp, job_prefix, metrics, input_fps,
                output_dir, working_dir,
                command_prefix=' ', command_suffix=' ',
                full_tree=opts.full_tree)

    # merge the commands into jobs_to_start number of jobs
    commands = merge_to_n_commands(commands, jobs_to_start)

    # Set up the poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        if single_otu_table_mode:
            create_merge_map_file_single_otu_table(
                input_path, output_dir, metrics, merge_map_filepath,
                expected_files_filepath)
            process_run_results_f = \
                'qiime.parallel.beta_diversity.parallel_beta_diversity_process_run_results_f'
        else:
            open(merge_map_filepath, 'w').close()
            process_run_results_f = None
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        if not poll_directly:
            # Generate the command to run the poller, and the list of temp
            # files created by the poller
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath, merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f=process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep)
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath, merge_map_filepath,
                                   deletion_list_filepath,
                                   seconds_to_sleep=seconds_to_sleep,
                                   process_run_results_f=process_run_results_f,
                                   command_prefix='', command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths, deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs' file
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the user
    if not opts.suppress_submit_jobs:
        submit_jobs(path_to_cluster_jobs, jobs_fp, job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. ' +\
                  'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
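# write_filepaths_to_file underpins the expected_out_files.txt convention
# shared by all of these wrappers: the poller treats a job as complete once
# every listed path exists. The real function lives in QIIME's parallel
# support code; the sketch below (hypothetical name
# _write_filepaths_to_file_sketch) is only an assumed illustration of the
# manifest format: one filepath per line, newline-terminated.
def _write_filepaths_to_file_sketch(filepaths, output_fp):
    """Hypothetical sketch of the expected-files manifest format."""
    f = open(output_fp, 'w')
    try:
        f.write('\n'.join(filepaths))
        f.write('\n')
    finally:
        f.close()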
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    assign_taxonomy_fp = opts.assign_taxonomy_fp
    confidence = opts.confidence
    rdp_classifier_fp = opts.rdp_classifier_fp
    id_to_taxonomy_fp = opts.id_to_taxonomy_fp
    reference_seqs_fp = opts.reference_seqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly

    if not isfile(input_fasta_fp):
        raise ValueError('This file does not exist: %s' % input_fasta_fp)

    if id_to_taxonomy_fp or reference_seqs_fp:
        if not id_to_taxonomy_fp or not isfile(id_to_taxonomy_fp):
            raise ValueError('This file does not exist: %s'
                             % id_to_taxonomy_fp)
        if not reference_seqs_fp or not isfile(reference_seqs_fp):
            raise ValueError('This file does not exist: %s'
                             % reference_seqs_fp)

    try:
        makedirs(output_dir)
    except OSError:
        # output dir already exists
        pass

    created_temp_paths = []

    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with RDP
    job_prefix = opts.job_prefix or get_random_job_prefix('RDP')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        mkdir(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp, jobs_to_start)

    # split the fasta file and get the list of resulting files
    tmp_fasta_fps = \
        split_fasta(open(input_fasta_fp), num_seqs_per_file, job_prefix,
                    output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes
    commands, job_result_filepaths = \
        get_commands(python_exe_fp, assign_taxonomy_fp, confidence,
                     job_prefix, tmp_fasta_fps, rdp_classifier_fp,
                     output_dir, working_dir,
                     id_to_taxonomy_fp=id_to_taxonomy_fp,
                     reference_seqs_fp=reference_seqs_fp)
    created_temp_paths += job_result_filepaths

    # Set up the poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        write_merge_map_file_assign_taxonomy(job_result_filepaths, output_dir,
                                             merge_map_filepath,
                                             input_file_basename)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath, merge_map_filepath,
                                   deletion_list_filepath,
                                   seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath, merge_map_filepath,
                                   deletion_list_filepath,
                                   seconds_to_sleep=seconds_to_sleep,
                                   command_prefix='', command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths, deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs' file
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. ' +\
                  'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
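# get_random_job_prefix is shared by all of these parallel wrappers; its
# exact output format is a QIIME implementation detail. The sketch below
# (hypothetical name _get_random_job_prefix_sketch, suffix length assumed)
# only illustrates the idea: a fixed tag plus a short random alphanumeric
# suffix, so concurrent runs in the same output_dir do not collide.
from random import choice

def _get_random_job_prefix_sketch(fixed_prefix, suffix_length=6):
    """Hypothetical sketch: 'RDP' -> something like 'RDPx3Fq9Z_'."""
    alphabet = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    suffix = ''.join([choice(alphabet) for _ in range(suffix_length)])
    return fixed_prefix + suffix + '_'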
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    pick_otus_fp = opts.pick_otus_fp
    refseqs_fp = opts.refseqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    similarity = opts.similarity
    poll_directly = opts.poll_directly
    uclust_stable_sort = not opts.suppress_uclust_stable_sort
    save_uc_files = opts.save_uc_files
    enable_rev_strand_match = opts.enable_rev_strand_match
    optimal_uclust = opts.optimal_uclust
    exact_uclust = opts.exact_uclust
    max_accepts = opts.max_accepts
    max_rejects = opts.max_rejects
    stepwords = opts.stepwords
    word_length = opts.word_length

    created_temp_paths = []

    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with POTU
    job_prefix = opts.job_prefix or get_random_job_prefix('POTU')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp, jobs_to_start)

    # split the fasta file and get the list of resulting files
    tmp_fasta_fps = \
        split_fasta(open(input_fasta_fp), num_seqs_per_file,
                    job_prefix, working_dir=output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes and the list of
    # output files generated by each job
    commands, job_result_filepaths = \
        get_job_commands(python_exe_fp, pick_otus_fp, tmp_fasta_fps,
                         output_dir, refseqs_fp, job_prefix, working_dir,
                         similarity, enable_rev_strand_match, optimal_uclust,
                         exact_uclust, max_accepts, max_rejects, stepwords,
                         word_length, uclust_stable_sort, save_uc_files)

    if save_uc_files:
        # mark everything except the .uc files as temporary, so any .uc
        # files that get created are kept
        created_temp_paths += \
            [fp for fp in job_result_filepaths if not fp.endswith('.uc')]
    else:
        created_temp_paths += job_result_filepaths

    # Set up the poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        process_run_results_f = \
            'qiime.parallel.pick_otus_uclust_ref.parallel_uclust_ref_process_run_results_f'
        write_merge_map_file_pick_otus(job_result_filepaths, output_dir,
                                       merge_map_filepath,
                                       input_file_basename, failures=True)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath, merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath, merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep,
                                   command_prefix='', command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths, deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs' file
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. ' +\
                  'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
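# write_jobs_file and submit_jobs tie all five wrappers together: the
# commands are written to <output_dir>/<job_prefix>jobs.txt and that file is
# handed to the user's cluster_jobs script. The sketches below are assumed
# illustrations of that contract, not the QIIME implementations; the
# one-command-per-line format and the '-ms' (make and submit) invocation
# follow QIIME's documented cluster-jobs convention, but the exact flags of
# any particular cluster_jobs script may differ.
from subprocess import check_call as _check_call

def _write_jobs_file_sketch(commands, jobs_fp):
    """Hypothetical sketch: one shell command per line in the jobs file."""
    f = open(jobs_fp, 'w')
    try:
        f.write('\n'.join(commands) + '\n')
    finally:
        f.close()

def _submit_jobs_sketch(cluster_jobs_fp, jobs_fp, job_prefix):
    """Hypothetical sketch: delegate submission to the user's script,
    e.g. 'cluster_jobs.py -ms <jobs_fp> <job_prefix>' (flags assumed)."""
    _check_call([cluster_jobs_fp, '-ms', jobs_fp, job_prefix])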