Example #1
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta always catches all seqs
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
        infile = in_seqs.toFasta().split('\n')

        # test seqs_per_file from 1 to 1000
        for i in range(1, 1000):
            filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            for fp in actual:
                actual_seqs += list(open(fp))
            # remove the files now, so if the test fails they still get
            # cleaned up
            remove_files(actual)

            # building seq collections from infile and the split files
            # results in equivalent seq collections
            self.assertEqual(\
             LoadSeqs(data=infile,aligned=False),\
             LoadSeqs(data=actual_seqs,aligned=False))
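The test above exercises split_fasta's observable contract: it takes an iterable of fasta lines, a maximum number of sequences per file, and a filename prefix; it writes the splits to disk and returns the list of output filepaths. The '<prefix>.<i>.fasta' naming is confirmed by the expected-filepath assertions in the later split_fasta tests. Below is a minimal sketch consistent with that contract; the signature and the working_dir keyword are inferred from the call sites in this collection, not taken from the real implementation.

def split_fasta(infile, seqs_per_file, filename_prefix, working_dir=''):
    # Sketch only: write at most seqs_per_file sequences per output file,
    # naming the files '<filename_prefix>.<i>.fasta', and return the list
    # of output filepaths.
    if working_dir:
        filename_prefix = '%s/%s' % (working_dir.rstrip('/'), filename_prefix)
    out_fps = []
    out_f = None
    seqs_in_current_file = 0
    for line in infile:
        line = line.strip()
        if not line:
            continue
        if line.startswith('>'):
            # start a new output file if the current one is full (or if
            # no file has been opened yet)
            if out_f is None or seqs_in_current_file == seqs_per_file:
                if out_f is not None:
                    out_f.close()
                out_fp = '%s.%d.fasta' % (filename_prefix, len(out_fps))
                out_f = open(out_fp, 'w')
                out_fps.append(out_fp)
                seqs_in_current_file = 0
            seqs_in_current_file += 1
        out_f.write(line + '\n')
    if out_f is not None:
        out_f.close()
    return out_fps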
Example #2
    def test_get_random_job_prefix(self):
        """ get_random_job_prefix functions as expected """

        s1 = get_random_job_prefix()
        s2 = get_random_job_prefix()
        self.assertNotEqual(s1, s2)
        self.assertEqual(len(s1), 10)
        self.assertEqual(len(s2), 10)

        # different max len
        s1 = get_random_job_prefix(max_job_prefix_len=22)
        self.assertEqual(len(s1), 22)

        # fixed_prefix added
        s1 = get_random_job_prefix(fixed_prefix='TEST')
        s2 = get_random_job_prefix(fixed_prefix='TEST')
        self.assertNotEqual(s1, s2)
        self.assertEqual(len(s1), 10)
        self.assertTrue(s1.startswith('TEST'))
        self.assertTrue(s2.startswith('TEST'))
        # leading/trailing underscores added
        self.assertTrue(s1.startswith('TEST_'))
        self.assertTrue(s1.endswith('_'))

        # no leading/trailing underscores
        s1 = get_random_job_prefix(leading_trailing_underscores=False)
        self.assertFalse(s1.startswith('_'))
        self.assertFalse(s1.endswith('_'))

        # combo of all parameters
        s1 = get_random_job_prefix(leading_trailing_underscores=False,\
         fixed_prefix='HELLO',max_job_prefix_len=12)
        self.assertEqual(len(s1), 12)
        self.assertTrue(s1.startswith('HELLO'))
        self.assertFalse(s1.endswith('_'))
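The test above fully specifies get_random_job_prefix's observable behavior: the default result is 10 characters, max_job_prefix_len caps the total length (fixed_prefix and underscores included), and leading_trailing_underscores controls whether the random portion is wrapped in '_'. The sketch below satisfies every assertion; the parameter names come from the calls above, while the character alphabet is an assumption.

from random import choice

def get_random_job_prefix(fixed_prefix='', max_job_prefix_len=10,
                          leading_trailing_underscores=True):
    # Sketch only: the fixed prefix and any underscores count toward
    # the max_job_prefix_len budget; the remainder is filled randomly.
    length = max_job_prefix_len - len(fixed_prefix)
    if leading_trailing_underscores:
        length -= 2
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    random_part = ''.join([choice(alphabet) for i in range(length)])
    if leading_trailing_underscores:
        return fixed_prefix + '_' + random_part + '_'
    return fixed_prefix + random_part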
Example #3
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file
        """
        filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
        infile = ['>seq1','AACCTTAA','>seq2','TTAACC','AATTAA',\
         '>seq3','CCTT--AA']

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        self.assertEqual(actual, expected)
        self.assertEqual(\
         LoadSeqs(data=infile,aligned=False),\
         LoadSeqs(data=actual_seqs,aligned=False))
Example #4
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file
        """
        filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
        infile = ['>seq1','AACCTTAA','>seq2','TTAACC','AATTAA',\
         '>seq3','CCTT--AA']

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files
        # results in equivalent seq collections
        self.assertEqual(\
         LoadSeqs(data=infile,aligned=False),\
         LoadSeqs(data=actual_seqs,aligned=False))
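remove_files is called in every test above to clean the split files up even when an assertion subsequently fails. Assuming it is a thin wrapper around os.remove (a guess from usage; the real helper may accept extra options such as ignoring missing files):

from os import remove

def remove_files(filepaths):
    # best-effort deletion of a list of filepaths (sketch inferred
    # from how the tests call it)
    for fp in filepaths:
        remove(fp)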
Example #5
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.blast_db is None and opts.refseqs_fp is None:
        option_parser.error('Either blast_db or refseqs_fp must be provided.')

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    pick_otus_fp = opts.pick_otus_fp
    refseqs_fp = opts.refseqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp 
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    max_e_value = opts.max_e_value
    similarity = opts.similarity
    poll_directly = opts.poll_directly
    min_aligned_percent = opts.min_aligned_percent

    created_temp_paths = []

    if not opts.blast_db:        
        # Build the blast database from the reference_seqs_fp -- all procs
        # will then access one db rather than create one per proc
        blast_db, db_files_to_remove = \
             build_blast_db_from_fasta_path(refseqs_fp)
        created_temp_paths += db_files_to_remove
    else:
        blast_db = opts.blast_db
    
    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)
    
    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with POTU
    job_prefix = opts.job_prefix or get_random_job_prefix('POTU')
    
    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary 
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir,job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass
    
    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file   
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp,jobs_to_start)
     
    # split the fasta files and get the list of resulting files
    tmp_fasta_fps =\
      split_fasta(open(input_fasta_fp),num_seqs_per_file,\
      job_prefix,working_dir=output_dir)
    created_temp_paths += tmp_fasta_fps
    
    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)
    
    # generate the list of commands to be pushed out to nodes and the list of
    # output files generated by each job
    commands, job_result_filepaths = \
     get_job_commands(python_exe_fp,pick_otus_fp,tmp_fasta_fps,
     output_dir,blast_db,job_prefix,working_dir,max_e_value,similarity,
     min_aligned_percent)
    created_temp_paths += job_result_filepaths

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be 
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths,expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)
        
        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        process_run_results_f =\
         'qiime.parallel.pick_otus_blast.parallel_blast_process_run_results_f'
        write_merge_map_file_pick_otus(job_result_filepaths,output_dir,\
            merge_map_filepath,input_file_basename)
        created_temp_paths.append(merge_map_filepath)
        
        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)
        
        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths =\
             get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
             merge_map_filepath,deletion_list_filepath,process_run_results_f,\
             seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths =\
             get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
             merge_map_filepath,deletion_list_filepath,process_run_results_f,\
             seconds_to_sleep=seconds_to_sleep,command_prefix='',command_suffix='')
            created_temp_paths += poller_result_filepaths
        
        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of 
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([],deletion_list_filepath)
     
    # write the commands to the 'jobs files'
    write_jobs_file(commands,job_prefix=job_prefix,jobs_fp=jobs_fp)
    
    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp,jobs_fp,job_prefix)
        
    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. '+\
            'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
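The main() above sizes the split with compute_seqs_per_file so that roughly jobs_to_start files come out of split_fasta. A plausible sketch, assuming the helper counts '>' records and divides them evenly; this is an inference from the call site, not the verified implementation.

from math import ceil

def compute_seqs_per_file(input_fasta_fp, num_jobs_to_start):
    # count the sequences in the input fasta file
    num_seqs = 0
    for line in open(input_fasta_fp):
        if line.startswith('>'):
            num_seqs += 1
    # spread them as evenly as possible across the requested jobs
    return int(ceil(num_seqs / float(num_jobs_to_start)))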
Example #6
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    # create local copies of command-line options
    input_path = opts.input_path
    output_dir = opts.output_path
    min_seqs = opts.min
    max_seqs = opts.max
    step = opts.step
    num_reps = opts.num_reps
    lineages_included = opts.lineages_included
    
    single_rarefaction_fp = opts.single_rarefaction_fp
    python_exe_fp = opts.python_exe_fp
    path_to_cluster_jobs = opts.cluster_jobs_fp
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly
    jobs_to_start = opts.jobs_to_start

    created_temp_paths = []
    
    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fn = split(input_path)
    input_file_basename, input_file_ext = splitext(input_fn)
    
    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with RARIF
    job_prefix = opts.job_prefix or get_random_job_prefix('RARIF')
    
    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary 
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir,job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working_dir already exists
        pass
    
    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)
    
    # generate the list of commands to be pushed out to nodes
    commands, job_result_filepaths  = \
     get_job_commands(python_exe_fp,single_rarefaction_fp,job_prefix,\
     input_path,output_dir,working_dir,min_seqs,max_seqs,step,num_reps,
     lineages_included,command_prefix=' ',command_suffix=' ')
     
    # Merge commands into jobs_to_start number of jobs
    commands = merge_to_n_commands(commands,jobs_to_start)
    
    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be 
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths,expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)
        
        # Write the mapping file even though no merging is necessary 
        # (get_poller_command requires this, but a future version won't)
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        open(merge_map_filepath,'w').close()
        created_temp_paths.append(merge_map_filepath)
        
        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)
        
        if not poll_directly:
            # Generate the command to run the poller, and the list of temp files
            # created by the poller
            poller_command, poller_result_filepaths =\
             get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
             merge_map_filepath,deletion_list_filepath,\
             seconds_to_sleep=seconds_to_sleep)
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths =\
             get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
             merge_map_filepath,deletion_list_filepath,\
             seconds_to_sleep=seconds_to_sleep,\
             command_prefix='',command_suffix='')
        
        created_temp_paths += poller_result_filepaths
        
        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of 
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([],deletion_list_filepath)
    
    # write the commands to the 'jobs files'
    write_jobs_file(commands,job_prefix=job_prefix,jobs_fp=jobs_fp)
    
    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(path_to_cluster_jobs,jobs_fp,job_prefix)
        
    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. '+\
            'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)    
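Unlike the previous script, this main() builds one command per rarefaction and then compacts the list with merge_to_n_commands so that only jobs_to_start jobs are submitted. A sketch of that helper under the assumption that commands within a job can be chained with ';' (the real delimiter may differ):

def merge_to_n_commands(commands, n, delimiter='; '):
    # distribute the commands round-robin across at most n buckets,
    # then join each bucket into a single shell command line
    buckets = [[] for i in range(min(n, len(commands)))]
    for i, command in enumerate(commands):
        buckets[i % len(buckets)].append(command)
    return [delimiter.join(bucket) for bucket in buckets]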
Example #7
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    assign_taxonomy_fp = opts.assign_taxonomy_fp
    confidence = opts.confidence
    rdp_classifier_fp = opts.rdp_classifier_fp
    id_to_taxonomy_fp = opts.id_to_taxonomy_fp
    reference_seqs_fp = opts.reference_seqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp 
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly

    if not isfile(input_fasta_fp):
        raise ValueError('This file does not exist: %s' % input_fasta_fp)

    if id_to_taxonomy_fp or reference_seqs_fp:
        if not id_to_taxonomy_fp or not isfile(id_to_taxonomy_fp):
            raise ValueError('This file does not exist: %s' % id_to_taxonomy_fp)
        if not reference_seqs_fp or not isfile(reference_seqs_fp):
            raise ValueError('This file does not exist: %s' % reference_seqs_fp)
            
    try:
        makedirs(output_dir)
    except OSError:
        # output dir already exists
        pass

    created_temp_paths = []
    
    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)
    
    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with RDP
    job_prefix = opts.job_prefix or get_random_job_prefix('RDP')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary 
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir,job_prefix)
    try:
        mkdir(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass
    
    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file   
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp,jobs_to_start)
    
    # split the fasta files and get the list of resulting files
    tmp_fasta_fps =\
      split_fasta(open(input_fasta_fp),num_seqs_per_file,job_prefix,output_dir)
    created_temp_paths += tmp_fasta_fps
    
    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes
    commands, job_result_filepaths = \
     get_commands(python_exe_fp,assign_taxonomy_fp,confidence,job_prefix,\
     tmp_fasta_fps,rdp_classifier_fp,output_dir,working_dir,\
     id_to_taxonomy_fp=id_to_taxonomy_fp,reference_seqs_fp=reference_seqs_fp)
    created_temp_paths += job_result_filepaths
    
    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be 
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths,expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)
        
        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        write_merge_map_file_assign_taxonomy(job_result_filepaths,output_dir,\
            merge_map_filepath,input_file_basename)
        created_temp_paths.append(merge_map_filepath)
        
        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths =\
             get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
              merge_map_filepath,deletion_list_filepath,\
              seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths =\
             get_poller_command(python_exe_fp,poller_fp,\
              expected_files_filepath,merge_map_filepath,\
              deletion_list_filepath,seconds_to_sleep=seconds_to_sleep,\
              command_prefix='',command_suffix='')
            created_temp_paths += poller_result_filepaths
        
        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of 
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([],deletion_list_filepath)

    # write the commands to the 'jobs files'
    write_jobs_file(commands,job_prefix=job_prefix,jobs_fp=jobs_fp)
    
    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp,jobs_fp,job_prefix)
    
    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. '+\
            'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
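Every polling block begins by recording which output files must exist before the jobs are considered complete. write_filepaths_to_file presumably just serializes that list, one path per line, for the poller to re-read; a sketch under that assumption:

def write_filepaths_to_file(filepaths, output_fp):
    # one filepath per line; the poller watches for these files to
    # decide when all jobs have finished
    f = open(output_fp, 'w')
    f.write('\n'.join(filepaths))
    f.write('\n')
    f.close()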
Example #8
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    # create local copies of command-line options
    input_path = opts.input_path
    output_dir = opts.output_path
    metrics = opts.metrics
    tree_fp = opts.tree_path
    
    beta_diversity_fp = opts.beta_diversity_fp
    python_exe_fp = opts.python_exe_fp
    path_to_cluster_jobs = opts.cluster_jobs_fp
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly
    jobs_to_start = opts.jobs_to_start
    
    if isfile(input_path):
        single_otu_table_mode = True
    else:
        single_otu_table_mode = False
        input_fps = glob('%s/*' % input_path)

    created_temp_paths = []
    # split the input filepath into directory and filename, base filename and
    # extension
    # input_path, input_fn = split(input_path)
    # input_file_basename, input_file_ext = splitext(input_fn)
    
    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with BDIV
    job_prefix = opts.job_prefix or get_random_job_prefix('BDIV')
    
    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary 
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir,job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass
    
    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)
    
    # Get the list of commands to be run and the expected result files
    if single_otu_table_mode:
        # these will be the row dissim matrices
        # temp for making, then move to output/i so the poller knows we're done
        for i in range(jobs_to_start):
            makedirs(working_dir + '/' + str(i))
            created_temp_paths.append(working_dir + '/' + str(i))
            makedirs(output_dir + '/' + str(i))
            created_temp_paths.append(output_dir + '/' + str(i))

        # to speed up this process, if not opts.full_tree: call setup here once
        # and then use full_tree=True
        # not implemented yet
        commands, job_result_filepaths  = \
         get_job_commands_single_otu_table(python_exe_fp,beta_diversity_fp,
         tree_fp,job_prefix,metrics,input_path,output_dir,working_dir,
         jobs_to_start,command_prefix=' ',command_suffix=' ',
         full_tree=opts.full_tree)
        created_temp_paths += job_result_filepaths
    else:
        commands, job_result_filepaths  = \
         get_job_commands_multiple_otu_tables(python_exe_fp,beta_diversity_fp,
         tree_fp,job_prefix,metrics,input_fps,output_dir,working_dir,
         command_prefix=' ',command_suffix=' ', full_tree=opts.full_tree)
        # Merge commands into jobs_to_start number of jobs
        commands = merge_to_n_commands(commands,jobs_to_start)
        
    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be 
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths,expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)
        
        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        if single_otu_table_mode:
            create_merge_map_file_single_otu_table(
             input_path,output_dir,metrics,merge_map_filepath,
             expected_files_filepath)
            process_run_results_f =\
             'qiime.parallel.beta_diversity.parallel_beta_diversity_process_run_results_f'
        else:
            open(merge_map_filepath,'w').close()
            process_run_results_f = None
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        if not poll_directly:
            # Generate the command to run the poller, and the list of temp files
            # created by the poller
            poller_command, poller_result_filepaths =\
             get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,
             merge_map_filepath,deletion_list_filepath,
             process_run_results_f=process_run_results_f,
             seconds_to_sleep=seconds_to_sleep)
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths =\
             get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,
             merge_map_filepath,deletion_list_filepath,
             seconds_to_sleep=seconds_to_sleep,
             process_run_results_f=process_run_results_f,
             command_prefix='',command_suffix='')            
        created_temp_paths += poller_result_filepaths
        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of 
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([],deletion_list_filepath)
    
    # write the commands to the 'jobs files'
    write_jobs_file(commands,job_prefix=job_prefix,jobs_fp=jobs_fp)
    
    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(path_to_cluster_jobs,jobs_fp,job_prefix)
        
    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. '+\
            'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
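All four main() functions end the same way: the accumulated commands are written to a jobs file, which is then handed to the cluster-jobs script. Assuming write_jobs_file emits one command per line (consistent with the poller command being appended to the list as a plain string), a sketch:

def write_jobs_file(commands, job_prefix=None, jobs_fp='jobs.txt'):
    # one shell command per line; job_prefix is accepted to match the
    # call sites above, though this sketch does not need it
    f = open(jobs_fp, 'w')
    f.write('\n'.join(commands))
    f.write('\n')
    f.close()
    return jobs_fp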
Example #9
    def __call__(self,
                 input_fp,
                 output_dir,
                 params,
                 job_prefix=None,
                 poll_directly=False,
                 suppress_submit_jobs=False):
        """ """
        ## Generate a list of files and directories that will need to be cleaned up
        self.files_to_remove = []
    
        # split the input filepath into directory and filename, base filename and
        # extension for use in naming other files
        try:
            input_dir, input_fn = split(input_fp)
            input_file_basename, input_ext = splitext(input_fn)
        except AttributeError:
            ## THIS IS AWFUL - SOME OF THE SCRIPTS PASS A LIST, SO THE
            ## PREVIOUS BLOCK WON'T WORK... WHAT DO WE WANT TO DO?
            input_dir, input_fn = split(input_fp[0])
            input_file_basename, input_ext = splitext(input_fn)
        
        # Allow the user to override the default job_prefix (defined by the 
        # base classes)
        if job_prefix is None:
            job_prefix = get_random_job_prefix(self._job_prefix)
        # A temporary output directory is created in output_dir named
        # job_prefix. Output files are then moved from the temporary
        # directory to the output directory when they are complete,
        # allowing a poller to detect when runs complete by the presence
        # of their output files.
        working_dir = join(output_dir,job_prefix)
        try:
            makedirs(working_dir)
            self.files_to_remove.append(working_dir)
        except OSError:
            # working dir already exists
            pass
        
        # Split the input file into the individual job input files. Add the
        # individual job files to the files_to_remove list
        input_fps, remove_input_on_completion = self._input_splitter(
                                         input_fp,
                                         self._jobs_to_start,
                                         job_prefix,
                                         working_dir)
        if remove_input_on_completion:
            self.files_to_remove += input_fps
        
        # Perform any method-specific setup (e.g., formatting a BLAST database)
        self._precommand_initiation(input_fp,output_dir,working_dir,params)
        
        # Generate the list of commands to be pushed out to workers 
        # and the list of output files generated by each job.
        commands, job_result_filepaths = self._get_job_commands(input_fps,
                                                                output_dir,
                                                                params,
                                                                job_prefix,
                                                                working_dir)
        self.files_to_remove += \
         self._identify_files_to_remove(job_result_filepaths,params)

        # Generate the output clean-up files
        merge_map_filepath, deletion_list_filepath, expected_files_filepath =\
         self._initialize_output_cleanup_files(job_result_filepaths,
                                               output_dir,
                                               working_dir,
                                               input_file_basename,
                                               params)

        # Set up poller apparatus if the user does not suppress polling
        if not self._suppress_polling:
            poller_command = self._initiate_polling(job_result_filepaths,
                                                    working_dir,
                                                    poll_directly,
                                                    merge_map_filepath,
                                                    deletion_list_filepath,
                                                    expected_files_filepath)
        
        # If the poller should be run in the same way as the other commands
        # (rather than by the current process), add it to the list of commands
        if not poll_directly:
            commands.append(poller_command)
     
        # Build the filepath for the 'jobs script'. Add that file to the 
        # files_to_remove list.
        jobs_fp = join(working_dir,job_prefix + 'jobs.txt')
        self._write_jobs_file(commands,jobs_fp)
        self.files_to_remove.append(jobs_fp)
    
        # submit the jobs file using cluster_jobs, if not suppressed by the
        # user
        if not suppress_submit_jobs:
            stdout, stderr, return_value = self._submit_jobs(
             jobs_fp=jobs_fp, job_prefix=job_prefix)
        
        # If the poller is going to be run by the current process, 
        # start polling
        if poll_directly:
            try:
                check_call(poller_command.split())
            except CalledProcessError, e:
                print '**Error occurred when calling the poller directly. '+\
                'Jobs may have been submitted, but are not being polled.'
                print str(e)
                print poller_command
                exit(-1)
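The __call__ method above refactors the per-script main() boilerplate into a template method: a concrete parallel wrapper only supplies the hooks that __call__ invokes, and the remaining hooks (_input_splitter, _identify_files_to_remove, _initialize_output_cleanup_files, _initiate_polling, _write_jobs_file, _submit_jobs) are presumably inherited from the base class. A hypothetical subclass sketch follows; the hook names and __call__ arguments are taken from the code above, while the base-class import path, the command line, and the result-file naming are illustrative assumptions.

from qiime.parallel.util import ParallelWrapper  # assumed import path

class ParallelExample(ParallelWrapper):
    # consumed by get_random_job_prefix when no job_prefix is passed
    _job_prefix = 'EXMPL'

    def _precommand_initiation(self, input_fp, output_dir, working_dir,
                               params):
        # one-time setup before any jobs start (e.g., formatting a
        # shared BLAST database); nothing to do in this sketch
        pass

    def _get_job_commands(self, input_fps, output_dir, params, job_prefix,
                          working_dir):
        # one command per split input file, plus the result file each
        # command is expected to produce (placeholder command line)
        commands = []
        job_result_filepaths = []
        for i, fp in enumerate(input_fps):
            result_fp = '%s/%s.%d.txt' % (working_dir, job_prefix, i)
            commands.append('my_script.py -i %s -o %s' % (fp, result_fp))
            job_result_filepaths.append(result_fp)
        return commands, job_result_filepaths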
Example #10
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    pick_otus_fp = opts.pick_otus_fp
    refseqs_fp = opts.refseqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp 
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    similarity = opts.similarity
    poll_directly = opts.poll_directly
    uclust_stable_sort = not opts.suppress_uclust_stable_sort
    save_uc_files = opts.save_uc_files
    
    enable_rev_strand_match = opts.enable_rev_strand_match
    optimal_uclust = opts.optimal_uclust
    exact_uclust = opts.exact_uclust
    max_accepts = opts.max_accepts
    max_rejects = opts.max_rejects
    stepwords = opts.stepwords
    word_length = opts.word_length

    created_temp_paths = []
    
    # split the input filepath into directory and filename, base filename and
    # extension
    input_dir, input_fasta_fn = split(input_fasta_fp)
    input_file_basename, input_fasta_ext = splitext(input_fasta_fn)
    
    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with POTU
    job_prefix = opts.job_prefix or get_random_job_prefix('POTU')
    
    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary 
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir,job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass
    
    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file   
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp,jobs_to_start)
     
    # split the fasta files and get the list of resulting files
    tmp_fasta_fps =\
      split_fasta(open(input_fasta_fp),num_seqs_per_file,\
      job_prefix,working_dir=output_dir)
    created_temp_paths += tmp_fasta_fps
    
    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)
    
    # generate the list of commands to be pushed out to nodes and the list of
    # output files generated by each job
    commands, job_result_filepaths = \
     get_job_commands(python_exe_fp,pick_otus_fp,tmp_fasta_fps,
     output_dir,refseqs_fp,job_prefix,working_dir,similarity,
     enable_rev_strand_match,optimal_uclust,exact_uclust,max_accepts,max_rejects,
     stepwords, word_length, uclust_stable_sort, save_uc_files)
    if save_uc_files:
        # keep any .uc files that get created
        created_temp_paths +=\
         [fp for fp in job_result_filepaths if not fp.endswith('.uc')]
    else:
        created_temp_paths += job_result_filepaths

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be 
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths,expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)
        
        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        process_run_results_f =\
         'qiime.parallel.pick_otus_uclust_ref.parallel_uclust_ref_process_run_results_f'
        write_merge_map_file_pick_otus(job_result_filepaths,output_dir,\
            merge_map_filepath,input_file_basename,failures=True)
        created_temp_paths.append(merge_map_filepath)
        
        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)
        
        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths =\
             get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
             merge_map_filepath,deletion_list_filepath,process_run_results_f,\
             seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths =\
             get_poller_command(python_exe_fp,poller_fp,expected_files_filepath,\
             merge_map_filepath,deletion_list_filepath,process_run_results_f,\
             seconds_to_sleep=seconds_to_sleep,command_prefix='',command_suffix='')
            created_temp_paths += poller_result_filepaths
            
        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of 
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([],deletion_list_filepath)
     
    # write the commands to the 'jobs files'
    write_jobs_file(commands,job_prefix=job_prefix,jobs_fp=jobs_fp)
    
    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp,jobs_fp,job_prefix)
        
    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError, e:
            print '**Error occurred when calling the poller directly. '+\
            'Jobs may have been submitted, but are not being polled.'
            print str(e)
            exit(-1)
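Finally, submit_jobs hands the finished jobs file to the user-supplied cluster-jobs script. A deliberately generic sketch; the argument order mirrors the call sites above, and any flags the real script requires are not shown in this collection, so none are guessed here.

from subprocess import check_call

def submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix):
    # shell out to the cluster-jobs script with the jobs file and the
    # job prefix; raises CalledProcessError on a non-zero exit
    check_call([cluster_jobs_fp, jobs_fp, job_prefix])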