def test_split_fasta_diff_num_seqs_per_file_alt(self):
    """split_fasta always catches all seqs """
    # start with 59 seqs (b/c it's prime, so should make more
    # confusing splits)
    in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
    infile = in_seqs.toFasta().split('\n')

    # test seqs_per_file from 1 to 1000
    for i in range(1, 1000):
        filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
        actual = split_fasta(infile, i, filename_prefix)

        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        # remove the files now, so if the test fails they still get
        # cleaned up
        remove_files(actual)

        # building seq collections from infile and the split files results in
        # equivalent seq collections
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
def test_get_random_job_prefix(self):
    """ get_random_job_prefix functions as expected """
    s1 = get_random_job_prefix()
    s2 = get_random_job_prefix()
    self.assertNotEqual(s1, s2)
    self.assertEqual(len(s1), 10)
    self.assertEqual(len(s2), 10)

    # different max len
    s1 = get_random_job_prefix(max_job_prefix_len=22)
    self.assertEqual(len(s1), 22)

    # fixed_prefix added
    s1 = get_random_job_prefix(fixed_prefix='TEST')
    s2 = get_random_job_prefix(fixed_prefix='TEST')
    self.assertNotEqual(s1, s2)
    self.assertEqual(len(s1), 10)
    self.assertTrue(s1.startswith('TEST'))
    self.assertTrue(s2.startswith('TEST'))
    # leading/trailing underscores added
    self.assertTrue(s1.startswith('TEST_'))
    self.assertTrue(s1.endswith('_'))

    # no leading/trailing underscores
    s1 = get_random_job_prefix(leading_trailing_underscores=False)
    self.assertFalse(s1.startswith('_'))
    self.assertFalse(s1.endswith('_'))

    # combo of all parameters
    s1 = get_random_job_prefix(leading_trailing_underscores=False,
                               fixed_prefix='HELLO',
                               max_job_prefix_len=12)
    self.assertEqual(len(s1), 12)
    self.assertTrue(s1.startswith('HELLO'))
    self.assertFalse(s1.endswith('_'))
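# A minimal sketch of what get_random_job_prefix could look like, inferred
# from the tests above. The parameter defaults are assumptions derived from
# the assertions (total length defaults to 10, underscores wrap the random
# portion); this illustrates the contract, not QIIME's actual implementation.
from random import choice
from string import ascii_letters, digits

def get_random_job_prefix(fixed_prefix='', max_job_prefix_len=10,
                          leading_trailing_underscores=True):
    """Return a random job prefix of exactly max_job_prefix_len characters."""
    # budget for the random portion: total length minus the fixed prefix
    # and (optionally) one leading and one trailing underscore
    length = max_job_prefix_len - len(fixed_prefix)
    if leading_trailing_underscores:
        length -= 2
    alphabet = ascii_letters + digits
    random_part = ''.join([choice(alphabet) for i in range(length)])
    if leading_trailing_underscores:
        return '%s_%s_' % (fixed_prefix, random_part)
    return '%s%s' % (fixed_prefix, random_part)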
def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta functions as expected when equal num seqs go to each file """
    filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    actual = split_fasta(infile, 1, filename_prefix)
    actual_seqs = []
    for fp in actual:
        actual_seqs += list(open(fp))
    remove_files(actual)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

    # list of file paths is as expected
    self.assertEqual(actual, expected)
    # building seq collections from infile and the split files results in
    # equivalent seq collections
    self.assertEqual(
        LoadSeqs(data=infile, aligned=False),
        LoadSeqs(data=actual_seqs, aligned=False))
def test_split_fasta_diff_num_seqs_per_file(self):
    """split_fasta functions as expected when diff num seqs go to each file """
    filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    actual = split_fasta(infile, 2, filename_prefix)
    actual_seqs = []
    for fp in actual:
        actual_seqs += list(open(fp))
    remove_files(actual)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]

    # list of file paths is as expected
    self.assertEqual(actual, expected)
    # building seq collections from infile and the split files results in
    # equivalent seq collections
    self.assertEqual(
        LoadSeqs(data=infile, aligned=False),
        LoadSeqs(data=actual_seqs, aligned=False))
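# A minimal sketch of split_fasta consistent with the tests above: read fasta
# lines, write at most seqs_per_file sequences per output file named
# <filename_prefix>.<i>.fasta, and return the output paths in order. The
# working_dir keyword mirrors how the parallel scripts below call it; this is
# an illustration of the expected behavior, not the QIIME source.
from os.path import join

def split_fasta(infile, seqs_per_file, filename_prefix, working_dir=''):
    """Split fasta lines into files of at most seqs_per_file sequences."""
    out_fps = []
    out_f = None
    seq_count = 0
    for line in infile:
        line = line.strip()
        if not line:
            continue
        if line.startswith('>'):
            # start a new output file each time the current one is full
            if seq_count % seqs_per_file == 0:
                if out_f is not None:
                    out_f.close()
                out_fp = join(working_dir, '%s.%d.fasta'
                              % (filename_prefix, len(out_fps)))
                out_f = open(out_fp, 'w')
                out_fps.append(out_fp)
            seq_count += 1
        out_f.write(line + '\n')
    if out_f is not None:
        out_f.close()
    return out_fps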
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.blast_db is None and opts.refseqs_fp is None:
        option_parser.error('Either blast_db or refseqs_fp must be provided.')

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    pick_otus_fp = opts.pick_otus_fp
    refseqs_fp = opts.refseqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    max_e_value = opts.max_e_value
    similarity = opts.similarity
    poll_directly = opts.poll_directly
    min_aligned_percent = opts.min_aligned_percent

    created_temp_paths = []

    if not opts.blast_db:
        # Build the blast database from the reference_seqs_fp -- all procs
        # will then access one db rather than create one per proc
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(refseqs_fp)
        created_temp_paths += db_files_to_remove
    else:
        blast_db = opts.blast_db

    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with POTU
    job_prefix = opts.job_prefix or get_random_job_prefix('POTU')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp, jobs_to_start)

    # split the fasta files and get the list of resulting files
    tmp_fasta_fps = split_fasta(open(input_fasta_fp), num_seqs_per_file,
                                job_prefix, working_dir=output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes and the list of
    # output files generated by each job
    commands, job_result_filepaths = \
        get_job_commands(python_exe_fp, pick_otus_fp, tmp_fasta_fps,
                         output_dir, blast_db, job_prefix, working_dir,
                         max_e_value, similarity, min_aligned_percent)
    created_temp_paths += job_result_filepaths

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        process_run_results_f = \
            'qiime.parallel.pick_otus_blast.parallel_blast_process_run_results_f'
        write_merge_map_file_pick_otus(job_result_filepaths, output_dir,
                                       merge_map_filepath, input_file_basename)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep,
                                   command_prefix='', command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths, deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs script'
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. ' +\
                'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
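# compute_seqs_per_file is not defined in this file. A plausible sketch,
# under the assumption that it counts the sequences in the input fasta and
# divides them evenly across the requested number of jobs, rounding up so
# that no more than jobs_to_start files are produced:
from math import ceil

def compute_seqs_per_file(input_fasta_fp, num_jobs_to_start):
    """Return the number of seqs to write to each split fasta file."""
    # count the sequences in the input file
    num_input_seqs = 0
    for line in open(input_fasta_fp):
        if line.startswith('>'):
            num_input_seqs += 1
    # divide evenly across jobs, rounding up so all seqs are covered
    return int(ceil(num_input_seqs / float(num_jobs_to_start)))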
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    input_path = opts.input_path
    output_dir = opts.output_path
    min_seqs = opts.min
    max_seqs = opts.max
    step = opts.step
    num_reps = opts.num_reps
    lineages_included = opts.lineages_included
    single_rarefaction_fp = opts.single_rarefaction_fp
    python_exe_fp = opts.python_exe_fp
    path_to_cluster_jobs = opts.cluster_jobs_fp
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly
    jobs_to_start = opts.jobs_to_start

    created_temp_paths = []

    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fn = split(input_path)
    input_file_basename, input_file_ext = splitext(input_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with RARIF
    job_prefix = opts.job_prefix or get_random_job_prefix('RARIF')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working_dir already exists
        pass

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes
    commands, job_result_filepaths = \
        get_job_commands(python_exe_fp, single_rarefaction_fp, job_prefix,
                         input_path, output_dir, working_dir, min_seqs,
                         max_seqs, step, num_reps, lineages_included,
                         command_prefix=' ', command_suffix=' ')

    # Merge commands into jobs_to_start number of jobs
    commands = merge_to_n_commands(commands, jobs_to_start)

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file even though no merging is necessary
        # (get_poller_command requires this, but a future version won't)
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        open(merge_map_filepath, 'w').close()
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        if not poll_directly:
            # Generate the command to run the poller, and the list of temp
            # files created by the poller
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   seconds_to_sleep=seconds_to_sleep)
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   seconds_to_sleep=seconds_to_sleep,
                                   command_prefix='', command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths, deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs script'
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(path_to_cluster_jobs, jobs_fp, job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. ' +\
                'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
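# merge_to_n_commands is used above to collapse the per-run command strings
# into at most jobs_to_start cluster jobs. A hedged sketch of that behavior,
# assuming commands can simply be chained with ';' on a single line (the
# actual QIIME implementation may group and delimit differently):
def merge_to_n_commands(commands, n, delimiter='; '):
    """Merge a list of command strings into at most n combined commands."""
    if n < 1:
        raise ValueError("n must be a positive integer")
    grouped = [[] for i in range(n)]
    # deal commands out round-robin so the n jobs stay roughly balanced
    for i, command in enumerate(commands):
        grouped[i % n].append(command.strip())
    return [delimiter.join(group) for group in grouped if group]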
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    assign_taxonomy_fp = opts.assign_taxonomy_fp
    confidence = opts.confidence
    rdp_classifier_fp = opts.rdp_classifier_fp
    id_to_taxonomy_fp = opts.id_to_taxonomy_fp
    reference_seqs_fp = opts.reference_seqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly

    if not isfile(input_fasta_fp):
        raise ValueError('This file does not exist: %s' % input_fasta_fp)

    if id_to_taxonomy_fp or reference_seqs_fp:
        if not id_to_taxonomy_fp or not isfile(id_to_taxonomy_fp):
            raise ValueError('This file does not exist: %s' % id_to_taxonomy_fp)
        if not reference_seqs_fp or not isfile(reference_seqs_fp):
            raise ValueError('This file does not exist: %s' % reference_seqs_fp)

    try:
        makedirs(output_dir)
    except OSError:
        # output dir already exists
        pass

    created_temp_paths = []

    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with RDP
    job_prefix = opts.job_prefix or get_random_job_prefix('RDP')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        mkdir(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp, jobs_to_start)

    # split the fasta files and get the list of resulting files
    tmp_fasta_fps = \
        split_fasta(open(input_fasta_fp), num_seqs_per_file, job_prefix,
                    output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes
    commands, job_result_filepaths = \
        get_commands(python_exe_fp, assign_taxonomy_fp, confidence, job_prefix,
                     tmp_fasta_fps, rdp_classifier_fp, output_dir, working_dir,
                     id_to_taxonomy_fp=id_to_taxonomy_fp,
                     reference_seqs_fp=reference_seqs_fp)
    created_temp_paths += job_result_filepaths

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        write_merge_map_file_assign_taxonomy(job_result_filepaths, output_dir,
                                             merge_map_filepath,
                                             input_file_basename)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   seconds_to_sleep=seconds_to_sleep,
                                   command_prefix='', command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths, deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs script'
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. ' +\
                'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
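# write_filepaths_to_file records the files the poller must wait for (or
# delete) throughout these scripts. A one-path-per-line format is assumed
# here; this sketch shows the contract, not the actual implementation:
def write_filepaths_to_file(filepaths, output_fp):
    """Write one filepath per line to output_fp."""
    f = open(output_fp, 'w')
    f.write('\n'.join(filepaths))
    f.write('\n')
    f.close()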
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    input_path = opts.input_path
    output_dir = opts.output_path
    metrics = opts.metrics
    tree_fp = opts.tree_path
    beta_diversity_fp = opts.beta_diversity_fp
    python_exe_fp = opts.python_exe_fp
    path_to_cluster_jobs = opts.cluster_jobs_fp
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly
    jobs_to_start = opts.jobs_to_start

    if isfile(input_path):
        single_otu_table_mode = True
    else:
        single_otu_table_mode = False
        input_fps = glob('%s/*' % input_path)

    created_temp_paths = []

    # split the input filepath into directory and filename, base filename and
    # extension
    # input_path, input_fn = split(input_path)
    # input_file_basename, input_file_ext = splitext(input_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with BDIV
    job_prefix = opts.job_prefix or get_random_job_prefix('BDIV')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # Get the list of commands to be run and the expected result files
    if single_otu_table_mode:
        # these will be the row dissim matrices
        # temp for making, then move to output/i so the poller knows
        # we're done
        for i in range(jobs_to_start):
            makedirs(working_dir + '/' + str(i))
            created_temp_paths.append(working_dir + '/' + str(i))
            makedirs(output_dir + '/' + str(i))
            created_temp_paths.append(output_dir + '/' + str(i))

        # to speed up this process, if not opts.full_tree: call setup here
        # once and then use full_tree=True
        # not implemented yet
        commands, job_result_filepaths = \
            get_job_commands_single_otu_table(python_exe_fp,
                                              beta_diversity_fp, tree_fp,
                                              job_prefix, metrics, input_path,
                                              output_dir, working_dir,
                                              jobs_to_start,
                                              command_prefix=' ',
                                              command_suffix=' ',
                                              full_tree=opts.full_tree)
        created_temp_paths += job_result_filepaths
    else:
        commands, job_result_filepaths = \
            get_job_commands_multiple_otu_tables(python_exe_fp,
                                                 beta_diversity_fp, tree_fp,
                                                 job_prefix, metrics,
                                                 input_fps, output_dir,
                                                 working_dir,
                                                 command_prefix=' ',
                                                 command_suffix=' ',
                                                 full_tree=opts.full_tree)

    # Merge commands into jobs_to_start number of jobs
    commands = merge_to_n_commands(commands, jobs_to_start)

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        if single_otu_table_mode:
            create_merge_map_file_single_otu_table(
                input_path, output_dir, metrics, merge_map_filepath,
                expected_files_filepath)
            process_run_results_f = \
                'qiime.parallel.beta_diversity.parallel_beta_diversity_process_run_results_f'
        else:
            open(merge_map_filepath, 'w').close()
            process_run_results_f = None
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        if not poll_directly:
            # Generate the command to run the poller, and the list of temp
            # files created by the poller
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f=process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep)
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   seconds_to_sleep=seconds_to_sleep,
                                   process_run_results_f=process_run_results_f,
                                   command_prefix='', command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths, deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs script'
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(path_to_cluster_jobs, jobs_fp, job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. ' +\
                'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
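# submit_jobs hands the jobs file to the user's cluster jobs script. This
# sketch assumes the script accepts a '-ms' flag (make and submit) followed
# by the jobs filepath and the job prefix; the flags accepted by a given
# cluster jobs script may differ, so treat the command line as illustrative:
from subprocess import Popen, PIPE

def submit_jobs(path_to_cluster_jobs, jobs_fp, job_prefix):
    """Submit the jobs file via the cluster jobs script."""
    cmd = '%s -ms %s %s' % (path_to_cluster_jobs, jobs_fp, job_prefix)
    proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError('Cannot submit jobs: %s' % stderr)
    return stdout, stderr, proc.returncode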
def __call__(self, input_fp, output_dir, params, job_prefix=None,
             poll_directly=False, suppress_submit_jobs=False):
    """ """
    ## Generate a list of files and directories that will need to be
    ## cleaned up
    self.files_to_remove = []

    # split the input filepath into directory and filename, base filename and
    # extension for use in naming other files
    try:
        input_dir, input_fn = split(input_fp)
        input_file_basename, input_ext = splitext(input_fn)
    except AttributeError:
        ## THIS IS AWFUL - SOME OF THE SCRIPTS PASS A LIST, SO THE
        ## PREVIOUS BLOCK WON'T WORK... WHAT DO WE WANT TO DO?
        input_dir, input_fn = split(input_fp[0])
        input_file_basename, input_ext = splitext(input_fn)

    # Allow the user to override the default job_prefix (defined by the
    # base classes)
    if job_prefix is None:
        job_prefix = get_random_job_prefix(self._job_prefix)

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete,
    # allowing a poller to detect when runs complete by the presence
    # of their output files.
    working_dir = join(output_dir, job_prefix)
    try:
        makedirs(working_dir)
        self.files_to_remove.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # Split the input file into the individual job input files. Add the
    # individual job files to the files_to_remove list
    input_fps, remove_input_on_completion = self._input_splitter(
        input_fp, self._jobs_to_start, job_prefix, working_dir)
    if remove_input_on_completion:
        self.files_to_remove += input_fps

    # Perform any method-specific setup (e.g., formatting a BLAST database)
    self._precommand_initiation(input_fp, output_dir, working_dir, params)

    # Generate the list of commands to be pushed out to workers
    # and the list of output files generated by each job.
    commands, job_result_filepaths = self._get_job_commands(
        input_fps, output_dir, params, job_prefix, working_dir)
    self.files_to_remove += \
        self._identify_files_to_remove(job_result_filepaths, params)

    # Generate the output clean-up files
    merge_map_filepath, deletion_list_filepath, expected_files_filepath = \
        self._initialize_output_cleanup_files(job_result_filepaths,
                                              output_dir, working_dir,
                                              input_file_basename, params)

    # Set up poller apparatus if the user does not suppress polling
    if not self._suppress_polling:
        poller_command = self._initiate_polling(job_result_filepaths,
                                                working_dir, poll_directly,
                                                merge_map_filepath,
                                                deletion_list_filepath,
                                                expected_files_filepath)
        # If the poller should be run in the same way as the other commands
        # (rather than by the current process), add it to the list of
        # commands
        if not poll_directly:
            commands.append(poller_command)

    # Build the filepath for the 'jobs script'. Add that file to the
    # files_to_remove list.
    jobs_fp = join(working_dir, job_prefix + 'jobs.txt')
    self._write_jobs_file(commands, jobs_fp)
    self.files_to_remove.append(jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not suppress_submit_jobs:
        stdout, stderr, return_value = self._submit_jobs(
            jobs_fp=jobs_fp, job_prefix=job_prefix)

    # If the poller is going to be run by the current process,
    # start polling
    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. ' +\
                'Jobs may have been submitted, but are not being polled.'
            print str(e)
            print poller_command
            exit(-1)
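# Hypothetical sketch of how a concrete task plugs into the __call__ driver
# above. The base class name (ParallelWrapper) and the hook signature are
# inferred from the method body; the class, script name, and output-naming
# scheme below are invented purely to illustrate the template-method design.
class ParallelExampleTask(ParallelWrapper):
    _job_prefix = 'EX'

    def _get_job_commands(self, input_fps, output_dir, params,
                          job_prefix, working_dir):
        """Build one command and one expected output path per split input."""
        commands = ['example_script.py -i %s -o %s' % (fp, working_dir)
                    for fp in input_fps]
        job_result_filepaths = ['%s.out.txt' % fp for fp in input_fps]
        return commands, job_result_filepaths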
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    pick_otus_fp = opts.pick_otus_fp
    refseqs_fp = opts.refseqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    similarity = opts.similarity
    poll_directly = opts.poll_directly
    uclust_stable_sort = not opts.suppress_uclust_stable_sort
    save_uc_files = opts.save_uc_files
    enable_rev_strand_match = opts.enable_rev_strand_match
    optimal_uclust = opts.optimal_uclust
    exact_uclust = opts.exact_uclust
    max_accepts = opts.max_accepts
    max_rejects = opts.max_rejects
    stepwords = opts.stepwords
    word_length = opts.word_length

    created_temp_paths = []

    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with POTU
    job_prefix = opts.job_prefix or get_random_job_prefix('POTU')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp, jobs_to_start)

    # split the fasta files and get the list of resulting files
    tmp_fasta_fps = split_fasta(open(input_fasta_fp), num_seqs_per_file,
                                job_prefix, working_dir=output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes and the list of
    # output files generated by each job
    commands, job_result_filepaths = \
        get_job_commands(python_exe_fp, pick_otus_fp, tmp_fasta_fps,
                         output_dir, refseqs_fp, job_prefix, working_dir,
                         similarity, enable_rev_strand_match, optimal_uclust,
                         exact_uclust, max_accepts, max_rejects, stepwords,
                         word_length, uclust_stable_sort, save_uc_files)
    if save_uc_files:
        # keep any .uc files that get created: only mark the non-.uc job
        # results as temporary
        created_temp_paths += \
            [fp for fp in job_result_filepaths if not fp.endswith('.uc')]
    else:
        created_temp_paths += job_result_filepaths

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        process_run_results_f = \
            'qiime.parallel.pick_otus_uclust_ref.parallel_uclust_ref_process_run_results_f'
        write_merge_map_file_pick_otus(job_result_filepaths, output_dir,
                                       merge_map_filepath,
                                       input_file_basename, failures=True)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep,
                                   command_prefix='', command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths, deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs script'
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. ' +\
                'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)