def setUp(self):
    """Create a scratch output directory and one input FASTA file.

    Every path created here is recorded in ``self.dirs_to_remove`` or
    ``self.files_to_remove``, and a 60 second timeout is armed via
    ``initiate_timeout``.
    """
    self.files_to_remove = []
    self.dirs_to_remove = []

    # Create example output directory
    tmp_dir = get_qiime_temp_dir()
    self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                     prefix='qiime_parallel_tests_',
                                     suffix='',
                                     result_constructor=str)
    self.dirs_to_remove.append(self.test_out)
    create_dir(self.test_out)

    # Create example input file. The context manager guarantees the handle
    # is closed even if the write fails.
    self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                       prefix='qiime_inseqs',
                                       suffix='.fasta')
    with open(self.inseqs1_fp, 'w') as inseqs1_f:
        inseqs1_f.write(inseqs1)
    self.files_to_remove.append(self.inseqs1_fp)

    # Define number of seconds a test can run for before timing out
    # and failing
    initiate_timeout(60)
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=""):
    """Split infile into files with seqs_per_file sequences in each.

    infile: list of fasta lines or open file object
    seqs_per_file: the number of sequences to include in each file
    outfile_prefix: string used to create output filepath - output filepaths
     are <outfile_prefix>.<i>.fasta where i runs from 0 to number of output
     files
    working_dir: directory to prepend to temp filepaths (defaults to
     empty string -- files written to cwd)

    List of output filepaths is returned.
    """
    seq_counter = 0
    out_files = []
    if working_dir and not working_dir.endswith("/"):
        working_dir += "/"
        create_dir(working_dir)

    current_out_file = None
    try:
        for seq_id, seq in MinimalFastaParser(infile):
            if seq_counter == 0:
                # Start a new chunk; its index is the number of chunks so far.
                current_out_fp = "%s%s.%d.fasta" % (working_dir,
                                                    outfile_prefix,
                                                    len(out_files))
                current_out_file = open(current_out_fp, "w")
                out_files.append(current_out_fp)
            current_out_file.write(">%s\n%s\n" % (seq_id, seq))
            seq_counter += 1

            if seq_counter == seqs_per_file:
                current_out_file.close()
                seq_counter = 0
    finally:
        # The final chunk is only closed inside the loop when it is exactly
        # full, so close a partially filled chunk (or one left open by an
        # error) here to make sure its contents are flushed.
        if current_out_file is not None and not current_out_file.closed:
            current_out_file.close()

    return out_files
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=''):
    """Split infile into files with seqs_per_file sequences in each.

    infile: list of fasta lines or open file object
    seqs_per_file: the number of sequences to include in each file
    outfile_prefix: string used to create output filepath - output filepaths
     are <outfile_prefix>.<i>.fasta where i runs from 0 to number of output
     files
    working_dir: directory to prepend to temp filepaths (defaults to
     empty string -- files written to cwd)

    List of output filepaths is returned.
    """
    seq_counter = 0
    out_files = []
    if working_dir and not working_dir.endswith('/'):
        working_dir += '/'
        create_dir(working_dir)

    current_out_file = None
    try:
        for seq_id, seq in MinimalFastaParser(infile):
            if seq_counter == 0:
                # Open the next chunk, numbered by how many exist already.
                current_out_fp = '%s%s.%d.fasta' \
                    % (working_dir, outfile_prefix, len(out_files))
                current_out_file = open(current_out_fp, 'w')
                out_files.append(current_out_fp)
            current_out_file.write('>%s\n%s\n' % (seq_id, seq))
            seq_counter += 1

            if seq_counter == seqs_per_file:
                current_out_file.close()
                seq_counter = 0
    finally:
        # A trailing chunk holding fewer than seqs_per_file sequences is
        # never closed by the loop; close it here so it is flushed to disk.
        if current_out_file is not None and not current_out_file.closed:
            current_out_file.close()

    return out_files
def write_checkpoint(current_key, ctr, cluster_mapping,
                     ids, bestscores, order, out_fp):
    """Write intermediate results to a checkpoint file and return its path.

    current_key: the identifier of the current denoiser round
    ctr: a unique counter to label the checkpoint
    cluster_mapping: an intermediate cluster mapping as dict
    ids: the dict of active ids
    bestscores: a dict (contents not visible from here -- presumably the
     best score seen per id; confirm against the caller)
    order: a list of ids, which defines the order in which flowgrams are
     clustered
    out_fp: directory under which a "checkpoints/" subdirectory is created

    The six state values are pickled together as one tuple in the order
    (current_key, ctr, cluster_mapping, ids, bestscores, order).
    """
    checkpoint_dir = out_fp + "/checkpoints/"
    if not exists(checkpoint_dir):
        create_dir(checkpoint_dir)

    # checkpoint_dir already ends in "/"; the doubled slash is kept so the
    # returned path string stays exactly what callers received before.
    out_fp = checkpoint_dir + "/checkpoint%d.pickle" % ctr

    # Close the handle deterministically so the pickle is fully flushed
    # before the path is handed back to the caller.
    with open(out_fp, "w") as out_fh:
        pickle.dump(
            (current_key, ctr, cluster_mapping, ids, bestscores, order),
            out_fh)

    return out_fp
def setUp(self):
    """Create the temporary directory and sequence files used by the tests.

    Writes the query sequences to a file inside a fresh scratch directory
    and the reference sequences to a NamedTemporaryFile kept open on
    ``self.reference_seqs_file``. Created paths are recorded in
    ``self.files_to_remove`` / ``self.dirs_to_remove``.
    """
    self.files_to_remove = []
    self.dirs_to_remove = []

    tmp_dir = get_qiime_temp_dir()
    self.test_out = get_tmp_filename(
        tmp_dir=tmp_dir,
        prefix='qiime_parallel_blaster_tests_',
        suffix='',
        result_constructor=str)
    self.dirs_to_remove.append(self.test_out)
    create_dir(self.test_out)

    self.tmp_seq_filepath = get_tmp_filename(
        tmp_dir=self.test_out,
        prefix='qiime_parallel_blaster_tests_input',
        suffix='.fasta')
    # Context manager closes the handle even if the write fails.
    with open(self.tmp_seq_filepath, 'w') as seq_file:
        seq_file.write(blast_test_seqs)
    self.files_to_remove.append(self.tmp_seq_filepath)

    self.reference_seqs_file = NamedTemporaryFile(
        prefix='qiime_parallel_blaster_tests_ref_seqs',
        suffix='.fasta',
        dir=tmp_dir)
    self.reference_seqs_file.write(blast_ref_seqs)
    # Rewind so the contents are flushed and readable from the start.
    self.reference_seqs_file.seek(0)

    initiate_timeout(60)
def main(commandline_args=None):
    """Validate the command line options and run the denoiser.

    commandline_args: accepted for interface compatibility; it is not read
     here (options come from parse_command_line_parameters).
    """
    parser, opts, args = parse_command_line_parameters(**script_info)

    if not opts.sff_fp:
        parser.error('Required option flowgram file path (-i) not specified')
    elif not files_exist(opts.sff_fp):
        parser.error('Flowgram file path does not exist:\n %s \n Pass a valid one via -i.'
                     % opts.sff_fp)

    if opts.checkpoint_fp and not exists(opts.checkpoint_fp):
        parser.error('Specified checkpoint file does not exist: %s'
                     % opts.checkpoint_fp)

    # Peek into the sff.txt files to make sure they are parseable.
    # cat_sff_files is lazy and only reads the header; its result is
    # discarded, so the handles can be closed straight away instead of
    # staying open for the rest of the run.
    sff_handles = [open(sff_fp) for sff_fp in opts.sff_fp.split(',')]
    try:
        cat_sff_files(sff_handles)
    finally:
        for sff_handle in sff_handles:
            sff_handle.close()

    if opts.split and opts.preprocess_fp:
        parser.error('Options --split and --preprocess_fp are exclusive')

    if opts.preprocess_fp:
        pp_fp = opts.preprocess_fp
        if not exists(opts.preprocess_fp):
            parser.error('Specified preprocess directory does not exist: %s'
                         % opts.preprocess_fp)
        if not files_exist('%s/prefix_mapping.txt,%s/prefix_dereplicated.fasta'
                           % (pp_fp, pp_fp)):
            parser.error('Specified preprocess directory does not contain expected files: ' +
                         'prefix_mapping.txt and prefix_dereplicated.fasta')

    if opts.titanium:
        # Titanium runs use a dedicated error profile and fixed cutoffs.
        opts.error_profile = DENOISER_DATA_DIR + 'Titanium_error_profile.dat'
        opts.low_cutoff = 4
        opts.high_cutoff = 5

    if not exists(opts.error_profile):
        parser.error('Specified error profile %s does not exist'
                     % opts.error_profile)

    if opts.output_dir:
        # make sure it always ends on /
        tmpoutdir = opts.output_dir + "/"
    else:
        # make random dir in current dir
        tmpoutdir = get_tmp_filename(tmp_dir="", prefix="denoiser_",
                                     suffix="/")

    # The second argument is derived from --force: without it,
    # create_dir is asked to fail on an existing directory.
    create_dir(tmpoutdir, not opts.force)

    log_fp = 'denoiser.log'

    if opts.split:
        denoise_per_sample(opts.sff_fp, opts.fasta_fp, tmpoutdir,
                           opts.cluster, opts.num_cpus, opts.squeeze,
                           opts.percent_id, opts.bail, opts.primer,
                           opts.low_cutoff, opts.high_cutoff, log_fp,
                           opts.low_memory, opts.verbose,
                           opts.error_profile, opts.max_num_iter,
                           opts.titanium)
    else:
        denoise_seqs(opts.sff_fp, opts.fasta_fp, tmpoutdir,
                     opts.preprocess_fp, opts.cluster, opts.num_cpus,
                     opts.squeeze, opts.percent_id, opts.bail, opts.primer,
                     opts.low_cutoff, opts.high_cutoff, log_fp,
                     opts.low_memory, opts.verbose, opts.error_profile,
                     opts.max_num_iter, opts.titanium, opts.checkpoint_fp)
def setUp(self):
    """Create the temporary directory and input files used by the tests.

    Writes the query sequences into a fresh scratch directory and keeps the
    id-to-taxonomy map and reference sequences in open NamedTemporaryFile
    objects stored on ``self``. Created paths are recorded in
    ``self.files_to_remove`` / ``self.dirs_to_remove``.
    """
    self.files_to_remove = []
    self.dirs_to_remove = []

    tmp_dir = get_qiime_temp_dir()
    self.test_out = get_tmp_filename(
        tmp_dir=tmp_dir,
        prefix='qiime_parallel_taxonomy_assigner_tests_',
        suffix='',
        result_constructor=str)
    self.dirs_to_remove.append(self.test_out)
    create_dir(self.test_out)

    self.tmp_seq_filepath = get_tmp_filename(
        tmp_dir=self.test_out,
        prefix='qiime_parallel_taxonomy_assigner_tests_input',
        suffix='.fasta')
    # Context manager closes the handle even if the write fails.
    with open(self.tmp_seq_filepath, 'w') as seq_file:
        seq_file.write(blast_test_seqs.toFasta())
    self.files_to_remove.append(self.tmp_seq_filepath)

    self.id_to_taxonomy_file = NamedTemporaryFile(
        prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
        suffix='.txt', dir=tmp_dir)
    self.id_to_taxonomy_file.write(blast_id_to_taxonomy)
    # Rewind so the contents are flushed and readable from the start.
    self.id_to_taxonomy_file.seek(0)

    self.reference_seqs_file = NamedTemporaryFile(
        prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
        suffix='.fasta', dir=tmp_dir)
    self.reference_seqs_file.write(blast_reference_seqs.toFasta())
    self.reference_seqs_file.seek(0)

    initiate_timeout(60)
def main():
    """Convert the qseq files of each requested lane into one fastq file.

    For every lane in --lanes, all matching ``s_<lane>_<read>_*qseq.txt``
    files in the input directory are converted and concatenated into
    ``s_<lane>_<read>_sequences.fastq`` in the output directory.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_dir = opts.input_dir
    output_dir = opts.output_dir
    create_dir(output_dir)
    lanes = opts.lanes.split(',')
    bases = opts.bases
    read = opts.read

    for lane in lanes:
        # sort so results will be consistent across different runs (important
        # so amplicon and barcodes read headers will match)
        read1_fps = sorted(glob('%s/s_%s_%d_*qseq.txt' %
                                (input_dir, lane.replace(',', ''), read)))
        if not read1_fps:
            # No input for this lane: as before, no output file is created.
            continue

        output_fp = '%s/s_%s_%s_sequences.fastq' % (output_dir, lane, read)
        # Open the lane's output once. Re-opening it with 'w' for every qseq
        # file truncated the records written for all previous files, leaving
        # only the last file's reads in the output.
        with open(output_fp, 'w') as output_f:
            for read1_fp in read1_fps:
                with open(read1_fp, 'U') as read1_f:
                    for record in iter_split_lines(read1_f):
                        fastq_s = illumina_data_to_fastq(
                            record, number_of_bases=bases)
                        output_f.write('%s\n' % fastq_s)
def make_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
              walltime="72:00:00", ncpus=1, nodes=1, keep_output="oe"):
    """Prepare qsub text files, one per command.

    commands: list of commands
    job_prefix: a short, descriptive name for the job.
    queue: name of the queue to submit to
    jobs_dir: path to directory where job submission scripts are written
    walltime: the maximal walltime
    ncpus: number of cpus
    nodes: number of nodes
    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither

    Returns the list of job script filepaths, in command order.
    """
    filenames = []
    create_dir(jobs_dir)
    for command in commands:
        job_name = get_tmp_filename(tmp_dir=jobs_dir, prefix=job_prefix + "_",
                                    suffix=".txt")
        # Context manager closes the script even if formatting/writing fails.
        with open(job_name, "w") as out_fh:
            out_fh.write(QSUB_TEXT % (walltime, ncpus, nodes, queue,
                                      job_prefix, keep_output, command))
        filenames.append(job_name)
    return filenames
def test_store_cluster(self):
    """store_clusters stores the centroid seqs for each cluster."""

    self.tmpdir = get_tmp_filename(tmp_dir="./", suffix="_store_clusters/")
    create_dir(self.tmpdir)

    singletons_fp = self.tmpdir + "singletons.fasta"
    centroids_fp = self.tmpdir + "centroids.fasta"
    self.files_to_remove.append(singletons_fp)
    self.files_to_remove.append(centroids_fp)

    def parsed_records(fasta_fp):
        # Read every (label, sequence) record currently stored in fasta_fp.
        return list(MinimalFastaParser(open(fasta_fp)))

    # An empty cluster map must produce two empty output files.
    store_clusters({}, self.tiny_test, self.tmpdir)
    self.assertEqual(parsed_records(centroids_fp), [])
    self.assertEqual(parsed_records(singletons_fp), [])

    # A non-empty map creates non-empty files, centroids sorted by size.
    # The member lists only matter for their length.
    mapping = {'FZTHQMS01B8T1H': [],
               'FZTHQMS01DE1KN': ['FZTHQMS01EHAJG'],
               'FZTHQMS01EHAJG': [1, 2, 3]}

    centroids = [
        ('FZTHQMS01EHAJG | cluster size: 4',
         'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAACAAGACCATGCGGTCTGATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCA'),
        ('FZTHQMS01DE1KN | cluster size: 2',
         'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAACAAGACCATGCGGTCTGATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCA'),
    ]

    singletons = [
        ('FZTHQMS01B8T1H',
         'CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTATCCATCGAAGGTTTGGTGAGCCGTTACCTCACCAACTGCCTAATGGAACGCATCCCCATCGATAACCGAAATTCTTTAATAATTAAACCATGCGGTTTTATTATACCATCGGGTATTAATCTTTCTTTCGAAAGGCTATCCCCGAGTTATCGGCAGGTTGGATACGTGTTACTCACCCGTGCGCCGGTCGCCATCACTTA'),
    ]

    store_clusters(mapping, self.tiny_test, self.tmpdir)
    self.assertEqual(parsed_records(centroids_fp), centroids)
    self.assertEqual(parsed_records(singletons_fp), singletons)
def setUp(self):
    """Create a scratch directory holding a template and an input FASTA.

    Created paths are recorded in ``self.files_to_remove`` and
    ``self.dirs_to_remove``; a 60 second timeout is armed at the end.
    """
    self.files_to_remove = []
    self.dirs_to_remove = []

    tmp_dir = get_qiime_temp_dir()
    self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                     prefix='qiime_parallel_tests_',
                                     suffix='',
                                     result_constructor=str)
    self.dirs_to_remove.append(self.test_out)
    create_dir(self.test_out)

    # Alignment template. Context managers close each handle even if the
    # write fails.
    self.template_fp = get_tmp_filename(tmp_dir=self.test_out,
                                        prefix='qiime_template',
                                        suffix='.fasta')
    with open(self.template_fp, 'w') as template_f:
        template_f.write(pynast_test1_template_fasta)
    self.files_to_remove.append(self.template_fp)

    # Input sequences.
    self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                       prefix='qiime_inseqs',
                                       suffix='.fasta')
    with open(self.inseqs1_fp, 'w') as inseqs1_f:
        inseqs1_f.write(inseqs1)
    self.files_to_remove.append(self.inseqs1_fp)

    initiate_timeout(60)
def setUp(self):
    """Set up a scratch directory with a template file and an input file.

    Both files are written inside ``self.test_out``; every created path is
    registered in ``self.files_to_remove`` / ``self.dirs_to_remove``.
    """
    self.files_to_remove = []
    self.dirs_to_remove = []

    tmp_dir = get_qiime_temp_dir()
    self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                     prefix='qiime_parallel_tests_',
                                     suffix='',
                                     result_constructor=str)
    self.dirs_to_remove.append(self.test_out)
    create_dir(self.test_out)

    self.template_fp = get_tmp_filename(tmp_dir=self.test_out,
                                        prefix='qiime_template',
                                        suffix='.fasta')
    # `with` closes the handle even when write() raises.
    with open(self.template_fp, 'w') as template_f:
        template_f.write(pynast_test1_template_fasta)
    self.files_to_remove.append(self.template_fp)

    self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                       prefix='qiime_inseqs',
                                       suffix='.fasta')
    with open(self.inseqs1_fp, 'w') as inseqs1_f:
        inseqs1_f.write(inseqs1)
    self.files_to_remove.append(self.inseqs1_fp)

    initiate_timeout(60)
def main():
    """Convert the qseq files of each requested lane into one fastq file.

    Records are written when --ignore_pass_filter is set or when the pass
    filter value returned by illumina_data_to_fastq is non-zero.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_dir = opts.input_dir
    output_dir = opts.output_dir
    create_dir(output_dir)
    lanes = opts.lanes.split(',')
    bases = opts.bases
    read = opts.read
    ignore_pass_filter = opts.ignore_pass_filter

    for lane in lanes:
        # sort so results will be consistent across different runs (important
        # so amplicon and barcodes read headers will match)
        read1_fps = sorted(
            glob('%s/s_%s_%d_*qseq.txt' %
                 (input_dir, lane.replace(',', ''), read)))

        output_fp = '%s/s_%s_%s_sequences.fastq' % (output_dir, lane, read)
        # Context managers make sure the output is flushed and every input
        # handle is released, even if a record fails to convert.
        with open(output_fp, 'w') as output_f:
            for read1_fp in read1_fps:
                with open(read1_fp, 'U') as read1_f:
                    for record in iter_split_lines(read1_f):
                        fastq_s, pass_filter = illumina_data_to_fastq(
                            record, number_of_bases=bases)
                        if ignore_pass_filter or pass_filter != 0:
                            output_f.write('%s\n' % fastq_s)
def setUp(self):
    """Prepare output directory, test data, config and stderr capture."""
    self.files_to_remove, self.dirs_to_remove = [], []

    # Scratch directory that receives the example output
    self.test_out = get_tmp_filename(tmp_dir=get_qiime_temp_dir(),
                                     prefix='core_qiime_analyses_test_',
                                     suffix='',
                                     result_constructor=str)
    self.dirs_to_remove.append(self.test_out)
    create_dir(self.test_out)

    # Input data
    self.test_data = get_test_data_fps()

    # Config overrides applied on top of the loaded qiime config
    self.qiime_config = load_qiime_config()
    for setting, value in (('jobs_to_start', 2), ('seconds_to_sleep', 1)):
        self.qiime_config[setting] = value

    # One of the system calls in the workflow prints a warning that cannot
    # be silenced with warnings.filterwarnings, because it originates in
    # code executed through the system call. Swap stderr for an in-memory
    # buffer for the duration of the test instead. Trick taken from:
    # http://stackoverflow.com/questions/9949633/suppressing-print-as-stdout-python
    self.saved_stderr = sys.stderr
    sys.stderr = StringIO()

    # Number of seconds a test may run before it times out and fails
    initiate_timeout(420)
def main():
    """Check option combinations, open the inputs and extract barcodes."""
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # Read reorientation needs the primers listed in a mapping file.
    if opts.attempt_read_reorientation and not opts.mapping_fp:
        option_parser.error("To use --attempt_read_reorientation, one must "
                            "supply a mapping file that contains both LinkerPrimerSequence "
                            "and ReversePrimer columns.")

    # Paired-end barcodes need a second fastq file.
    if opts.input_type == "barcode_paired_end" and not opts.fastq2:
        option_parser.error("To use input_type of barcode_paired_end, "
                            "a second fastq file must be specified with --fastq2")

    # With a single fastq file there are no headers to match against.
    disable_header_match = (
        True if not opts.fastq2 else opts.disable_header_match)

    fastq1 = qiime_open(opts.fastq1)
    fastq2 = qiime_open(opts.fastq2) if opts.fastq2 else None

    create_dir(opts.output_dir)

    map_fp = qiime_open(opts.mapping_fp) if opts.mapping_fp else None

    extract_barcodes(fastq1,
                     fastq2,
                     opts.output_dir,
                     opts.input_type,
                     opts.bc1_len,
                     opts.bc2_len,
                     opts.rev_comp_bc1,
                     opts.rev_comp_bc2,
                     opts.char_delineator,
                     opts.switch_bc_order,
                     map_fp,
                     opts.attempt_read_reorientation,
                     disable_header_match)
def make_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
              walltime="72:00:00", ncpus=1, nodes=1, keep_output="oe"):
    """Prepare qsub text files, one per command.

    commands: list of commands
    job_prefix: a short, descriptive name for the job.
    queue: name of the queue to submit to
    jobs_dir: path to directory where job submission scripts are written
    walltime: the maximal walltime
    ncpus: number of cpus
    nodes: number of nodes
    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither

    Returns the list of job script filepaths, in command order.
    """
    filenames = []
    create_dir(jobs_dir)
    for command in commands:
        # mkstemp creates the file and returns an open descriptor; only the
        # unique name is needed, so release the descriptor immediately.
        fd, job_name = mkstemp(dir=jobs_dir, prefix=job_prefix + "_",
                               suffix=".txt")
        close(fd)
        # Context manager closes the script even if formatting/writing fails.
        with open(job_name, "w") as out_fh:
            out_fh.write(QSUB_TEXT % (walltime, ncpus, nodes, queue,
                                      job_prefix, keep_output, command))
        filenames.append(job_name)
    return filenames
def main():
    """Compare two taxa summaries and write the results to the output dir.

    Writes the two sorted-and-filled taxa summaries, the overall comparison
    and, when requested, the detailed pairwise comparisons.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if len(opts.taxa_summary_fps) != 2:
        option_parser.error("Exactly two taxa summary files are required. You "
                            "provided %d." % len(opts.taxa_summary_fps))

    # Create the output dir if it doesn't already exist. Catch Exception
    # rather than using a bare except so Ctrl-C and SystemExit propagate.
    try:
        create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = parse_sample_id_map(open(opts.sample_id_map_fp, 'U'))

    results = compare_taxa_summaries(
        parse_taxa_summary_table(open(opts.taxa_summary_fps[0], 'U')),
        parse_taxa_summary_table(open(opts.taxa_summary_fps[1], 'U')),
        opts.comparison_mode, correlation_type=opts.correlation_type,
        tail_type=opts.tail_type, num_permutations=opts.num_permutations,
        confidence_level=opts.confidence_level,
        perform_detailed_comparisons=opts.perform_detailed_comparisons,
        sample_id_map=sample_id_map,
        expected_sample_id=opts.expected_sample_id)

    # Write out the sorted and filled taxa summaries, basing their
    # filenames on the original input filenames. If the filenames are the
    # same, append a number to each filename.
    same_filenames = (basename(opts.taxa_summary_fps[0]) ==
                      basename(opts.taxa_summary_fps[1]))

    for file_num, (orig_ts_fp, filled_ts_lines) in enumerate(
            zip(opts.taxa_summary_fps, results[:2])):
        filename_suffix = '_sorted_and_filled'
        if same_filenames:
            filename_suffix += '_%d' % file_num
        filled_ts_fp = add_filename_suffix(orig_ts_fp, filename_suffix)
        with open(join(opts.output_dir, filled_ts_fp), 'w') as filled_ts_f:
            filled_ts_f.write(filled_ts_lines)

    # Write the overall comparison result.
    with open(join(opts.output_dir, 'overall_comparison.txt'),
              'w') as overall_comp_f:
        overall_comp_f.write(results[2])

    # Write the correlation vector containing the pairwise sample
    # comparisons.
    if opts.perform_detailed_comparisons:
        with open(join(opts.output_dir, 'detailed_comparisons.txt'),
                  'w') as corr_vec_f:
            corr_vec_f.write(results[3])
def main():
    """Run the selected Mantel-style test and write its results file.

    opts.method selects one of 'mantel', 'partial_mantel' or 'mantel_corr';
    the correlogram variant additionally saves one figure per correlogram.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist. Catch Exception
    # rather than using a bare except so Ctrl-C and SystemExit propagate.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = dict([(k, v[0]) for k, v in
                              fields_to_dict(open(opts.sample_id_map_fp,
                                                  "U")).items()])

    input_dm_fps = opts.input_dms
    distmats = [parse_distmat(open(dm_fp, 'U')) for dm_fp in input_dm_fps]

    # Each branch owns its results file through a context manager, so the
    # file is closed even if the statistical test raises.
    if opts.method == 'mantel':
        with open(path.join(opts.output_dir, 'mantel_results.txt'),
                  'w') as output_f:
            output_f.write(
                run_mantel_test('mantel', input_dm_fps, distmats,
                                opts.num_permutations, opts.tail_type,
                                comment_mantel_pmantel,
                                sample_id_map=sample_id_map))
    elif opts.method == 'partial_mantel':
        with open(path.join(opts.output_dir, 'partial_mantel_results.txt'),
                  'w') as output_f:
            output_f.write(
                run_mantel_test('partial_mantel', input_dm_fps, distmats,
                                opts.num_permutations, opts.tail_type,
                                comment_mantel_pmantel,
                                control_dm_fp=opts.control_dm,
                                control_dm=parse_distmat(
                                    open(opts.control_dm, 'U')),
                                sample_id_map=sample_id_map))
    elif opts.method == 'mantel_corr':
        with open(path.join(opts.output_dir,
                            'mantel_correlogram_results.txt'),
                  'w') as output_f:
            result_str, correlogram_fps, correlograms = \
                run_mantel_correlogram(
                    input_dm_fps, distmats, opts.num_permutations,
                    comment_corr, opts.alpha, sample_id_map=sample_id_map,
                    variable_size_distance_classes=(
                        opts.variable_size_distance_classes))
            output_f.write(result_str)
            for corr_fp, corr in zip(correlogram_fps, correlograms):
                corr.savefig(path.join(opts.output_dir,
                                       corr_fp + opts.image_type),
                             format=opts.image_type)
    else:
        # Previously an unrecognised method fell through to a close() on an
        # unbound file variable; report the real problem instead.
        option_parser.error("Unrecognized method '%s'." % opts.method)
def main():
    """Create distance boxplots, plus optional stats and raw data files.

    For every result returned by make_distance_boxplots this writes the
    figure, then (unless suppressed) the significance tests, then (when
    requested) a tab-delimited dump of the plotted distances.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist. Catch Exception
    # rather than using a bare except so Ctrl-C and SystemExit propagate.
    out_dir = opts.output_dir
    try:
        create_dir(out_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    map_f = open(opts.mapping_fp, 'U')
    dm_f = open(opts.distance_matrix_fp, 'U')

    # Split the comma-separated field list and drop surrounding whitespace
    # and quotes from each field name.
    fields = map(strip, opts.fields.split(','))
    fields = [field.strip('"').strip("'") for field in fields]

    color_individual_within_by_field = opts.color_individual_within_by_field
    results = make_distance_boxplots(
        dm_f, map_f, fields, width=opts.width,
        height=opts.height, suppress_all_within=opts.suppress_all_within,
        suppress_all_between=opts.suppress_all_between,
        suppress_individual_within=opts.suppress_individual_within,
        suppress_individual_between=opts.suppress_individual_between,
        y_min=opts.y_min, y_max=opts.y_max,
        whisker_length=opts.whisker_length, box_width=opts.box_width,
        box_color=opts.box_color,
        color_individual_within_by_field=color_individual_within_by_field,
        sort=opts.sort)

    for field, plot_figure, plot_data, plot_labels, plot_colors in results:
        output_plot_fp = join(out_dir, "%s_Distances.%s" %
                              (field, opts.imagetype))
        plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                            transparent=opts.transparent)

        if not opts.suppress_significance_tests:
            # The stats file is created before the tests run, as before; the
            # context manager closes it even if the tests raise.
            with open(join(out_dir, "%s_Stats.txt" % field),
                      'w') as sig_tests_f:
                sig_tests_results = all_pairs_t_test(
                    plot_labels, plot_data,
                    tail_type=opts.tail_type,
                    num_permutations=opts.num_permutations)
                sig_tests_f.write(sig_tests_results)

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert(len(plot_labels) == len(plot_data))
            raw_data_fp = join(out_dir, "%s_Distances.txt" % field)
            with open(raw_data_fp, 'w') as raw_data_f:
                for label, data in zip(plot_labels, plot_data):
                    raw_data_f.write(label.replace(" ", "_") + "\t")
                    raw_data_f.write("\t".join(map(str, data)))
                    raw_data_f.write("\n")
def setUp(self):
    """Create the scratch directory, expected output paths and input files.

    Two reference and two input FASTA files are written into the scratch
    directory. All created or expected paths are recorded in
    ``self.files_to_remove`` / ``self.dirs_to_remove``.
    """
    tmp_dir = get_qiime_temp_dir()
    self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                     prefix='qiime_parallel_tests_',
                                     suffix='',
                                     result_constructor=str)
    create_dir(self.test_out)
    self.dirs_to_remove = [self.test_out]

    # Paths the code under test is expected to write to.
    self.output_fp = join(self.test_out, 'fmap.txt')
    self.failure_fp = join(self.test_out, 'fail.txt')
    self.usearch_fp = join(self.test_out, 'out.uc')
    self.bl6_fp = join(self.test_out, 'out.bl6')
    self.log_fp = join(self.test_out, 'fmap.log')
    self.files_to_remove = [
        self.output_fp, self.failure_fp, self.usearch_fp, self.log_fp,
        self.bl6_fp
    ]

    def write_tmp_fasta(prefix, contents):
        # Write contents to a fresh .fasta file in the scratch directory,
        # register it for removal and return its path. The context manager
        # closes the handle even if the write fails.
        fasta_fp = get_tmp_filename(tmp_dir=self.test_out,
                                    prefix=prefix,
                                    suffix='.fasta')
        with open(fasta_fp, 'w') as fasta_f:
            fasta_f.write(contents)
        self.files_to_remove.append(fasta_fp)
        return fasta_fp

    self.refseqs1_fp = write_tmp_fasta('qiime_refseqs', refseqs1)
    self.refseqs2_fp = write_tmp_fasta('qiime_refseqs', refseqs2)
    self.inseqs1_fp = write_tmp_fasta('qiime_inseqs', inseqs1)
    self.inseqs2_fp = write_tmp_fasta('qiime_inseqs', inseqs2)

    initiate_timeout(60)
def main():
    """Run the selected Mantel-style test and write its results file.

    opts.method selects one of "mantel", "partial_mantel" or "mantel_corr";
    the correlogram variant additionally saves one figure per correlogram.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist. Catch Exception
    # rather than using a bare except so Ctrl-C and SystemExit propagate.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        sample_id_map = dict([(k, v[0]) for k, v in
                              fields_to_dict(open(opts.sample_id_map_fp,
                                                  "U")).items()])

    input_dm_fps = opts.input_dms
    distmats = [parse_distmat(open(dm_fp, "U")) for dm_fp in input_dm_fps]

    # Each branch owns its results file through a context manager, so the
    # file is closed even if the statistical test raises.
    if opts.method == "mantel":
        results_fp = path.join(opts.output_dir, "mantel_results.txt")
        with open(results_fp, "w") as output_f:
            output_f.write(
                run_mantel_test(
                    "mantel",
                    input_dm_fps,
                    distmats,
                    opts.num_permutations,
                    opts.tail_type,
                    comment_mantel_pmantel,
                    sample_id_map=sample_id_map,
                )
            )
    elif opts.method == "partial_mantel":
        results_fp = path.join(opts.output_dir, "partial_mantel_results.txt")
        with open(results_fp, "w") as output_f:
            output_f.write(
                run_mantel_test(
                    "partial_mantel",
                    input_dm_fps,
                    distmats,
                    opts.num_permutations,
                    opts.tail_type,
                    comment_mantel_pmantel,
                    control_dm_fp=opts.control_dm,
                    control_dm=parse_distmat(open(opts.control_dm, "U")),
                    sample_id_map=sample_id_map,
                )
            )
    elif opts.method == "mantel_corr":
        results_fp = path.join(opts.output_dir,
                               "mantel_correlogram_results.txt")
        with open(results_fp, "w") as output_f:
            result_str, correlogram_fps, correlograms = \
                run_mantel_correlogram(
                    input_dm_fps, distmats, opts.num_permutations,
                    comment_corr, opts.alpha, sample_id_map=sample_id_map
                )
            output_f.write(result_str)
            for corr_fp, corr in zip(correlogram_fps, correlograms):
                corr.savefig(path.join(opts.output_dir,
                                       corr_fp + opts.image_type),
                             format=opts.image_type)
    else:
        # Previously an unrecognised method fell through to a close() on an
        # unbound file variable; report the real problem instead.
        option_parser.error("Unrecognized method '%s'." % opts.method)
def main():
    """Convert each iseq input file into sequence and barcode fastq files.

    For every input file (optionally gzip-compressed) two files are written
    to the output directory: ``<basename>.fastq`` and
    ``<basename>_barcodes.fastq``. A record is written to both only when the
    sequence's pass filter value is non-zero.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    output_dir = opts.output_dir
    create_dir(output_dir)
    barcode_length = opts.barcode_length
    barcode_in_header = opts.barcode_in_header
    barcode_qual_c = opts.barcode_qual_c

    for input_fp in input_fps:
        if input_fp.endswith('.gz'):
            open_f = gzip_open
            # Strip two extensions (e.g. ".txt.gz") from the file name.
            input_basename = split(splitext(splitext(input_fp)[0])[0])[1]
        else:
            input_basename = split(splitext(input_fp)[0])[1]
            open_f = open

        sequence_output_fp = '%s/%s.fastq' % (output_dir, input_basename)
        barcode_output_fp = '%s/%s_barcodes.fastq' % (output_dir,
                                                      input_basename)

        # Context managers / finally make sure both outputs are flushed and
        # the input handle is released even if a line fails to parse.
        with open(sequence_output_fp, 'w') as sequence_output_f:
            with open(barcode_output_fp, 'w') as barcode_output_f:
                input_f = open_f(input_fp)
                try:
                    for line in input_f:
                        common_fields, sequence, sequence_qual, barcode, \
                            barcode_qual = iseq_to_qseq_fields(
                                line, barcode_in_header, barcode_length,
                                barcode_qual_c)

                        # The first eight common fields prefix both records.
                        header_fields = tuple(common_fields[i]
                                              for i in range(8))

                        sequence_s, pass_filter_s = illumina_data_to_fastq(
                            header_fields + (sequence, sequence_qual))
                        barcode_s, pass_filter_b = illumina_data_to_fastq(
                            header_fields + (barcode, barcode_qual),
                            barcode_length)

                        if pass_filter_s != 0:
                            sequence_output_f.write('%s\n' % sequence_s)
                            barcode_output_f.write('%s\n' % barcode_s)
                finally:
                    input_f.close()
def make_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
              walltime="06:00:00", nodes=1, ncpus=16, mem=16,
              keep_output="oe"):
    """Prepare a single qsub text file that drives a list of commands.

    All commands except the last are written, one per line, to a job list
    file; the last command is passed to the qsub template separately.

    commands: list of commands; must not be empty
    job_prefix: a short, descriptive name for the job.
    queue: name of the queue to submit to
    jobs_dir: path to directory where job submission scripts are written
    walltime: the maximal walltime
    nodes: accepted for interface compatibility; currently overridden (the
           script always requests 1 node)
    ncpus: accepted for interface compatibility; currently overridden (the
           script requests len(commands) - 1 cpus)
    mem: memory value passed through to the qsub template (unit is defined
         by QSUB_TEXT, not visible here)
    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither

    Returns a one-element list holding the job script filepath.
    Raises IndexError if commands is an empty list.
    """
    # Split up front so an empty command list fails before any file is
    # created in jobs_dir.
    final_command = commands[-1]
    listed_commands = commands[:-1]

    filenames = []
    create_dir(jobs_dir)

    job_list_name = get_tmp_filename(tmp_dir=jobs_dir,
                                     prefix=job_prefix + "_joblist_",
                                     suffix=".txt")
    job_log_name = get_tmp_filename(tmp_dir=jobs_dir,
                                    prefix=job_prefix + "_prallel_job_log",
                                    suffix=".txt")

    with open(job_list_name, "w") as out_fh_list:
        for command in listed_commands:
            out_fh_list.write(command + "\n")

    job_name = get_tmp_filename(tmp_dir=jobs_dir, prefix=job_prefix + "_",
                                suffix=".txt")

    #num_nodes = int(math.ceil((len(commands)-1)/8.0))
    # If you use the lab queue, then change the num_nodes and ncpus as:
    num_nodes = 1
    ncpus = len(listed_commands)

    with open(job_name, "w") as out_fh:
        out_fh.write(QSUB_TEXT % (walltime, num_nodes, ncpus, mem, queue,
                                  job_prefix, keep_output, job_list_name,
                                  len(listed_commands), job_log_name,
                                  final_command))

    filenames.append(job_name)
    return filenames
def make_sge_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
                  num_jobs=100, max_hours_per_job=24):
    """Prepare SGE qsub text files, grouping several commands per job.

    commands: list of commands
    job_prefix: a short, descriptive name for the job.
    queue: name of the queue to submit to (accepted for interface
           compatibility; not used when filling the SGE template here)
    jobs_dir: path to directory where job submission scripts are written
    num_jobs: the number of jobs to spread the commands over
    max_hours_per_job: the maximum expected time for each command (this will
                       be multiplied by the number of commands per job to
                       get a 'walltime')

    Returns the list of job script filepaths. Each job's stderr and stdout
    paths are the script path with "_stderr" / "_stdout" appended.
    """
    filenames = []
    create_dir(jobs_dir)

    # calculate the number of commands to put in each job
    num_commands_per_job = int(ceil(len(commands) / float(num_jobs)))

    # calculate the walltime (time before job will be killed by scheduler if
    # still running)
    total_time = max_hours_per_job * num_commands_per_job
    walltime = "{0}:00:00".format(total_time)

    # grouper pads the final group with '' so every group has the same size.
    for command_group in grouper(commands, num_commands_per_job, ''):
        job_name = get_tmp_filename(tmp_dir=jobs_dir, prefix=job_prefix + "_",
                                    suffix=".txt")
        stderr_fp = job_name + "_stderr"
        stdout_fp = job_name + "_stdout"
        # Context manager closes the script even if formatting/writing fails.
        with open(job_name, "w") as out_fh:
            out_fh.write(SGE_QSUB_TEXT % (walltime, stderr_fp, stdout_fp,
                                          "\n".join(command_group)))
        filenames.append(job_name)
    return filenames
def setUp(self):
    """Set up the scratch directory, expected outputs and FASTA inputs.

    Writes two reference and two input FASTA files and records every
    created or expected path in ``self.files_to_remove`` /
    ``self.dirs_to_remove``.
    """
    tmp_dir = get_qiime_temp_dir()
    self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                     prefix='qiime_parallel_tests_',
                                     suffix='',
                                     result_constructor=str)
    create_dir(self.test_out)
    self.dirs_to_remove = [self.test_out]

    # Paths the code under test is expected to write to.
    self.output_fp = join(self.test_out, 'fmap.txt')
    self.failure_fp = join(self.test_out, 'fail.txt')
    self.usearch_fp = join(self.test_out, 'out.uc')
    self.bl6_fp = join(self.test_out, 'out.bl6')
    self.log_fp = join(self.test_out, 'fmap.log')
    self.files_to_remove = [self.output_fp, self.failure_fp,
                            self.usearch_fp, self.log_fp, self.bl6_fp]

    # Each input is written through a context manager so its handle is
    # closed even when write() raises.
    self.refseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                        prefix='qiime_refseqs',
                                        suffix='.fasta')
    with open(self.refseqs1_fp, 'w') as refseqs1_f:
        refseqs1_f.write(refseqs1)
    self.files_to_remove.append(self.refseqs1_fp)

    self.refseqs2_fp = get_tmp_filename(tmp_dir=self.test_out,
                                        prefix='qiime_refseqs',
                                        suffix='.fasta')
    with open(self.refseqs2_fp, 'w') as refseqs2_f:
        refseqs2_f.write(refseqs2)
    self.files_to_remove.append(self.refseqs2_fp)

    self.inseqs1_fp = get_tmp_filename(tmp_dir=self.test_out,
                                       prefix='qiime_inseqs',
                                       suffix='.fasta')
    with open(self.inseqs1_fp, 'w') as inseqs1_f:
        inseqs1_f.write(inseqs1)
    self.files_to_remove.append(self.inseqs1_fp)

    self.inseqs2_fp = get_tmp_filename(tmp_dir=self.test_out,
                                       prefix='qiime_inseqs',
                                       suffix='.fasta')
    with open(self.inseqs2_fp, 'w') as inseqs2_f:
        inseqs2_f.write(inseqs2)
    self.files_to_remove.append(self.inseqs2_fp)

    initiate_timeout(60)
def main(commandline_args=None):
    """Check that all four input files exist, then combine the mappings.

    commandline_args is accepted but not read; options are obtained from
    parse_command_line_parameters.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Every input must be both specified and present on disk.
    required_files = [opts.denoiser_map_file, opts.otu_picker_map_file,
                      opts.fasta_fp, opts.denoised_fasta_fp]
    if any(not fp or not exists(fp) for fp in required_files):
        option_parser.error('Missing input files.')

    create_dir(opts.output_dir, fail_on_exist=False)

    fasta_f = open(opts.fasta_fp)
    denoiser_map_f = open(opts.denoiser_map_file)
    denoised_fasta_f = open(opts.denoised_fasta_fp)
    otu_picker_map_f = open(opts.otu_picker_map_file)

    combine_mappings(fasta_f, denoiser_map_f, denoised_fasta_f,
                     otu_picker_map_f, opts.output_dir)
def main():
    """Parse the options, ensure the output dir exists, compare categories."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    out_dir = opts.output_dir
    categories = opts.categories.split(',')

    # Create the output dir if it doesn't already exist. Catch Exception
    # rather than using a bare except so Ctrl-C and SystemExit propagate
    # instead of being reported as a directory problem.
    try:
        if not exists(out_dir):
            create_dir(out_dir)
    except Exception:
        option_parser.error("Could not create or access output directory '%s' "
                            "specified with the -o option." % out_dir)

    compare_categories(opts.input_dm, opts.mapping_file, opts.method,
                       categories, opts.num_permutations, out_dir)
def setUp(self):
    """Create temp inputs for the parallel RDP taxonomy assigner tests.

    Creates a scratch directory (registered in ``self.dirs_to_remove``), a
    temporary query FASTA file (registered in ``self.files_to_remove``) and
    two NamedTemporaryFile objects holding the id-to-taxonomy map and the
    reference sequences.  Both NamedTemporaryFile objects are kept open as
    attributes and rewound to the start after writing.

    Raises ApplicationError when the RDP_JAR_PATH environment variable is
    unset or its basename does not contain '2.2'.
    """
    self.files_to_remove = []
    self.dirs_to_remove = []

    tmp_dir = get_qiime_temp_dir()
    self.test_out = get_tmp_filename(
        tmp_dir=tmp_dir,
        prefix='qiime_parallel_taxonomy_assigner_tests_',
        suffix='',
        result_constructor=str)
    self.dirs_to_remove.append(self.test_out)
    create_dir(self.test_out)

    # Temporary input file
    self.tmp_seq_filepath = get_tmp_filename(
        tmp_dir=self.test_out,
        prefix='qiime_parallel_taxonomy_assigner_tests_input',
        suffix='.fasta')
    with open(self.tmp_seq_filepath, 'w') as seq_file:
        seq_file.write(rdp_test_seqs)
    self.files_to_remove.append(self.tmp_seq_filepath)

    self.id_to_taxonomy_file = NamedTemporaryFile(
        prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
        suffix='.txt',
        dir=tmp_dir)
    self.id_to_taxonomy_file.write(rdp_id_to_taxonomy)
    self.id_to_taxonomy_file.seek(0)

    self.reference_seqs_file = NamedTemporaryFile(
        prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
        suffix='.fasta',
        dir=tmp_dir)
    self.reference_seqs_file.write(rdp_reference_seqs)
    self.reference_seqs_file.seek(0)

    # getenv returns None when the variable is unset; check that first so
    # the failure is the descriptive ApplicationError instead of an opaque
    # error raised from inside basename(None).
    jar_fp = getenv('RDP_JAR_PATH')
    if jar_fp is None or '2.2' not in basename(jar_fp):
        raise ApplicationError(
            "RDP_JAR_PATH does not point to version 2.2 of the "
            "RDP Classifier.")

    initiate_timeout(60)
def main():
    """Randomly subsample paired sequence/index FASTQ files in lock-step.

    Records are read pairwise from --sequence_reads_fp and --index_reads_fp;
    each pair is kept with probability ``proportion_subsample`` and written
    to ``<basename>_subsample<ext>`` files inside the output directory.
    When no output directory is given, ``<reads basename>_subsample/`` is
    used.

    Raises ValueError when proportion_subsample is not in (0, 1].
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_reads_fp = opts.sequence_reads_fp
    output_dir = opts.output_dir
    proportion_subsample = opts.proportion_subsample
    index_reads_fp = opts.index_reads_fp

    if proportion_subsample > 1 or proportion_subsample <= 0:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError('proportion_subsample must be in range of 0-1')

    reads_basename, reads_ext = splitext(split(sequence_reads_fp)[1])

    # if the output dir isn't specified, derive one from the reads file name
    if not output_dir:
        output_dir = '%s_subsample/' % (reads_basename)
    # Called for user-supplied directories as well, so a directory that
    # does not exist yet is created instead of failing at open() below.
    # NOTE(review): assumes create_dir is a no-op for an existing directory
    # -- confirm against its definition.
    create_dir(output_dir)

    # prepare output files
    index_basename, index_ext = splitext(split(index_reads_fp)[1])
    output_reads_fp = '%s/%s_subsample%s' % (output_dir,
                                             reads_basename, reads_ext)
    output_indexs_fp = '%s/%s_subsample%s' % (output_dir,
                                              index_basename, index_ext)

    # randomly subsample; the with-block closes all four files even if a
    # parser raises part-way through.
    with open(sequence_reads_fp, "U") as sequence_reads, \
            open(index_reads_fp, "U") as index_reads, \
            open(output_reads_fp, "w") as output_reads, \
            open(output_indexs_fp, "w") as output_indexs:
        paired_records = izip(
            MinimalFastqParser(sequence_reads, strict=False),
            MinimalFastqParser(index_reads, strict=False))
        for (label, seq, qual), (labelI, seqI, qualI) in paired_records:
            if random() < proportion_subsample:
                output_reads.write(
                    '@%s\n%s\n+\n%s\n' % (label, seq, qual))
                output_indexs.write(
                    '@%s\n%s\n+\n%s\n' % (labelI, seqI, qualI))
def make_sge_jobs(commands, job_prefix, queue, jobs_dir="jobs/",num_jobs=100,max_hours_per_job=24):
    """Write qsub job scripts for a list of commands.

    commands: list of command strings
    job_prefix: a short, descriptive name used as the job file prefix
    queue: name of the queue to submit to (accepted but not referenced in
     this function)
    jobs_dir: path to directory where job submission scripts are written
    num_jobs: number of job files the commands are spread over
    max_hours_per_job: the maximum expected time for each command (this
     is multiplied by the number of commands per job to get the walltime)

    Returns the list of job script filepaths that were written.
    """
    create_dir(jobs_dir)

    # number of commands to put in each job
    commands_per_job = int(ceil(len(commands) / float(num_jobs)))

    # walltime: time before a job is killed by the scheduler if still running
    walltime = "{0}:00:00".format(max_hours_per_job * commands_per_job)

    job_fps = []
    # grouper pads the final batch with '' entries.
    for batch in grouper(commands, commands_per_job, ''):
        job_fp = get_tmp_filename(tmp_dir=jobs_dir,
                                  prefix=job_prefix + "_",
                                  suffix=".txt")
        with open(job_fp, "w") as job_f:
            job_f.write(SGE_QSUB_TEXT % (walltime,
                                         job_fp + "_stderr",
                                         job_fp + "_stdout",
                                         "\n".join(batch)))
        job_fps.append(job_fp)
    return job_fps
def setUp(self):
    """Create temp inputs for the parallel RDP taxonomy assigner tests.

    Creates a scratch directory (registered in ``self.dirs_to_remove``), a
    temporary query FASTA file (registered in ``self.files_to_remove``) and
    two NamedTemporaryFile objects holding the id-to-taxonomy map and the
    reference sequences.  Both NamedTemporaryFile objects are kept open as
    attributes and rewound to the start after writing.

    Raises ApplicationError when the RDP_JAR_PATH environment variable is
    unset or its basename does not contain '2.2'.
    """
    self.files_to_remove = []
    self.dirs_to_remove = []

    tmp_dir = get_qiime_temp_dir()
    self.test_out = get_tmp_filename(tmp_dir=tmp_dir,
                                     prefix='qiime_parallel_taxonomy_assigner_tests_',
                                     suffix='',
                                     result_constructor=str)
    self.dirs_to_remove.append(self.test_out)
    create_dir(self.test_out)

    # Temporary input file
    self.tmp_seq_filepath = get_tmp_filename(tmp_dir=self.test_out,
                                             prefix='qiime_parallel_taxonomy_assigner_tests_input',
                                             suffix='.fasta')
    with open(self.tmp_seq_filepath, 'w') as seq_file:
        seq_file.write(rdp_test_seqs)
    self.files_to_remove.append(self.tmp_seq_filepath)

    self.id_to_taxonomy_file = NamedTemporaryFile(
        prefix='qiime_parallel_taxonomy_assigner_tests_id_to_taxonomy',
        suffix='.txt',dir=tmp_dir)
    self.id_to_taxonomy_file.write(rdp_id_to_taxonomy)
    self.id_to_taxonomy_file.seek(0)

    self.reference_seqs_file = NamedTemporaryFile(
        prefix='qiime_parallel_taxonomy_assigner_tests_ref_seqs',
        suffix='.fasta',dir=tmp_dir)
    self.reference_seqs_file.write(rdp_reference_seqs)
    self.reference_seqs_file.seek(0)

    # getenv returns None when the variable is unset; check that first so
    # the failure is the descriptive ApplicationError instead of an opaque
    # error raised from inside basename(None).
    jar_fp = getenv('RDP_JAR_PATH')
    if jar_fp is None or '2.2' not in basename(jar_fp):
        raise ApplicationError(
            "RDP_JAR_PATH does not point to version 2.2 of the "
            "RDP Classifier.")

    initiate_timeout(60)
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=''):
    """ Split infile into files with seqs_per_file sequences in each

        infile: list of fasta lines or open file object
        seqs_per_file: the number of sequences to include in each file
        outfile_prefix: string used to create output filepath - output
         filepaths are <working_dir><outfile_prefix>.<i>.fasta where i runs
         from 0 to number of output files
        working_dir: directory to prepend to temp filepaths (defaults to
         empty string -- files written to cwd)

        List of output filepaths is returned; the list is empty when infile
         contains no sequences.

        Raises ValueError if seqs_per_file is not positive.
    """
    if seqs_per_file <= 0:
        raise ValueError("seqs_per_file must be > 0!")

    seq_counter = 0
    out_files = []
    if working_dir and not working_dir.endswith('/'):
        working_dir += '/'
        create_dir(working_dir)

    # Start as None so an input with no sequences does not trip over an
    # unbound name in the clean-up below.
    current_out_file = None
    try:
        for seq_id, seq in parse_fasta(infile):
            if seq_counter == 0:
                # Start of a new chunk: open the next numbered output file.
                current_out_fp = '%s%s.%d.fasta' \
                    % (working_dir, outfile_prefix, len(out_files))
                current_out_file = open(current_out_fp, 'w')
                out_files.append(current_out_fp)
            current_out_file.write('>%s\n%s\n' % (seq_id, seq))
            seq_counter += 1

            if seq_counter == seqs_per_file:
                current_out_file.close()
                seq_counter = 0
    finally:
        # Close a partially filled last chunk, and do not leak the handle
        # if parsing or writing raised.
        if current_out_file is not None and not current_out_file.closed:
            current_out_file.close()

    return out_files
def main():
    """Estimate observation richness for a BIOM table and write a table.

    Parses the OTU table given by opts.otu_table_fp, builds an
    ObservationRichnessEstimator with the Chao1MultinomialPointEstimator,
    evaluates it with opts.min, opts.max, opts.num_steps and
    opts.confidence_level, and writes the result to
    ``estimates_table.txt`` inside the output directory.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    output_dir = opts.output_dir
    try:
        create_dir(output_dir)
    except Exception:
        # `except Exception` (rather than a bare except) so Ctrl-C and
        # SystemExit are not reported as a directory problem.
        option_parser.error("Could not create or access output directory "
                            "specified with the -o/--output_dir option.")

    otu_table_fp = opts.otu_table_fp
    with open(otu_table_fp, 'U') as table_f:
        table = parse_biom_table(table_f)

    estimator = ObservationRichnessEstimator(table,
                                             Chao1MultinomialPointEstimator)
    results = estimator(opts.min, opts.max, opts.num_steps,
                        opts.confidence_level)

    out_fp = join(output_dir, 'estimates_table.txt')
    with open(out_fp, 'w') as out_f:
        results.toTable(out_f)
def main():
    """Validate the options, open the inputs and run extract_barcodes.

    --attempt_read_reorientation requires a mapping file, and an input_type
    of barcode_paired_end requires --fastq2; either violation is reported
    through the option parser.  Header matching is always disabled when
    only one fastq file is supplied.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if opts.attempt_read_reorientation and not opts.mapping_fp:
        option_parser.error(
            "To use --attempt_read_reorientation, one must "
            "supply a mapping file that contains both LinkerPrimerSequence "
            "and ReversePrimer columns.")
    if opts.input_type == "barcode_paired_end" and not opts.fastq2:
        option_parser.error(
            "To use input_type of barcode_paired_end, "
            "a second fastq file must be specified with --fastq2")

    # With a single fastq file there is nothing to match headers against.
    skip_header_match = opts.disable_header_match if opts.fastq2 else True

    reads1_f = qiime_open(opts.fastq1)
    reads2_f = qiime_open(opts.fastq2) if opts.fastq2 else None
    create_dir(opts.output_dir)
    mapping_f = qiime_open(opts.mapping_fp) if opts.mapping_fp else None

    extract_barcodes(reads1_f, reads2_f, opts.output_dir, opts.input_type,
                     opts.bc1_len, opts.bc2_len, opts.rev_comp_bc1,
                     opts.rev_comp_bc2, opts.char_delineator,
                     opts.switch_bc_order, mapping_f,
                     opts.attempt_read_reorientation, skip_header_match)
def _get_job_commands(
    self,
    input_fp,
    output_dir,
    params,
    job_prefix,
    working_dir,
    command_prefix="/bin/bash; ",
    command_suffix="; exit",
):
    """Generate beta diversity to split single OTU table to multiple jobs

    full_tree=True is faster: beta_diversity.py -f will make things
    go faster, but be sure you already have the correct minimal tree.

    input_fp: path to the BIOM table whose sample ids are split into groups
    output_dir: directory under which one numbered result dir per job is
     created
    params: dict read for the keys "full_tree", "tree_path", "metrics" and
     "jobs_to_start"
    job_prefix: prefix of the per-job shell script file name
    working_dir: directory under which one numbered working dir per job is
     created
    command_prefix, command_suffix: accepted for interface compatibility;
     not used when building the commands here

    Returns (commands, result_filepaths): one "bash <script>" command per
    job, and the result filepaths reported by self._get_rename_command.
    """
    commands = []
    result_filepaths = []

    # Close the table file as soon as the sample ids have been read.
    with open(input_fp, "U") as biom_f:
        sids = parse_biom_table(biom_f).SampleIds

    full_tree_str = "-f" if params["full_tree"] else ""
    tree_str = ("-t %s" % params["tree_path"]) if params["tree_path"] else ""
    metrics = params["metrics"]

    # this is a little bit of an abuse of _merge_to_n_commands, so may
    # be worth generalizing that method - this determines the correct
    # number of samples to process in each command
    sample_id_groups = self._merge_to_n_commands(
        sids, params["jobs_to_start"], delimiter=",", command_prefix="", command_suffix=""
    )

    # The output file names depend only on the input name and the metrics,
    # so they are computed once rather than on every iteration.
    input_basename = splitext(split(input_fp)[1])[0]
    output_fns = ["%s_%s.txt" % (metric, input_basename) for metric in metrics.split(",")]

    for i, sample_id_group in enumerate(sample_id_groups):
        working_dir_i = join(working_dir, str(i))
        create_dir(working_dir_i)
        output_dir_i = join(output_dir, str(i))
        create_dir(output_dir_i)

        rename_command, current_result_filepaths = self._get_rename_command(
            output_fns, working_dir_i, output_dir_i
        )
        result_filepaths += current_result_filepaths

        bdiv_command = "%s -i %s -o %s %s -m %s %s -r %s" % (
            self._script_name,
            input_fp,
            working_dir_i,
            tree_str,
            metrics,
            full_tree_str,
            sample_id_group,
        )

        # One shell script per job: the beta diversity call followed by the
        # individual rename steps.
        shell_script_fp = "%s/%s%d.sh" % (working_dir_i, job_prefix, i)
        shell_script_commands = [bdiv_command] + rename_command.split(";")
        self._commands_to_shell_script(shell_script_commands, shell_script_fp)
        commands.append("bash %s" % shell_script_fp)

    return commands, result_filepaths
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.

        Each entry of input_fps is processed into <output_dir>/<i>/ (skipped
         when iteration_output_exists reports that its output is already
         there). The new_refseqs.fna written by one iteration is used as
         refseqs_fp for the next, while the prefilter reference stays fixed.
         Taxonomy assignment and alignment/tree building are disabled for the
         individual iterations and, when requested via run_assign_tax /
         run_align_and_tree, run once here on the combined rep set.

        After the iterations the per-iteration OTU tables are merged with
         merge_otu_tables.py (unless a non-empty merged table already
         exists) and the final rep set is built from the per-iteration
         rep sets.

        NOTE(review): logger.close() at the end is unconditional, so a
         logger passed in by the caller is closed as well;
         close_logger_on_success is assigned but never read here.
    """
    create_dir(output_dir)
    commands = []

    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    # Per-iteration outputs, collected for the merge steps below.
    otu_table_fps = []
    repset_fasta_fps = []
    for i,input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir,i)
        if iteration_output_exists(iteration_output_dir,min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger,[input_fp,refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i,input_fp))
        else:
            # The shared logger is passed down, and tax/align/tree are
            # switched off for the single iteration.
            pick_subsampled_open_reference_otus(input_fp=input_fp,
                                     refseqs_fp=refseqs_fp,
                                     output_dir=iteration_output_dir,
                                     percent_subsample=percent_subsample,
                                     new_ref_set_id='.'.join([new_ref_set_id,str(i)]),
                                     command_handler=command_handler,
                                     params=params,
                                     qiime_config=qiime_config,
                                     run_assign_tax=False,
                                     run_align_and_tree=False,
                                     prefilter_refseqs_fp=prefilter_refseqs_fp,
                                     prefilter_percent_id=prefilter_percent_id,
                                     min_otu_size=min_otu_size,
                                     step1_otu_map_fp=step1_otu_map_fp,
                                     step1_failures_fasta_fp=step1_failures_fasta_fp,
                                     parallel=parallel,
                                     suppress_step4=suppress_step4,
                                     logger=logger,
                                     suppress_md5=suppress_md5,
                                     denovo_otu_picking_method=denovo_otu_picking_method,
                                     reference_otu_picking_method=reference_otu_picking_method,
                                     status_update_callback=status_update_callback)
        ## perform post-iteration file shuffling whether the previous iteration's
        ## data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom' % (iteration_output_dir,min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
         (','.join(otu_table_fps),otu_table_fp)
        commands.append([("Merge OTU tables",merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps,final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." 
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp],error_on_missing=False)

            taxonomy_fp = assign_tax(
                           repset_fasta_fp=final_repset_fp,
                           output_dir=output_dir,
                           command_handler=command_handler,
                           params=params,
                           qiime_config=qiime_config,
                           parallel=parallel,
                           logger=logger,
                           status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                           repset_fasta_fp=final_repset_fp,
                           output_dir=output_dir,
                           command_handler=command_handler,
                           params=params,
                           qiime_config=qiime_config,
                           parallel=parallel,
                           logger=logger,
                           status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                  parse_biom_table(open(align_and_tree_input_otu_table,'U')),
                  get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
                  0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            # `commands` is empty at this point; the table above was
            # written directly rather than through a queued command.
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
def main():
    """Compute core OTUs over a range of sample fractions and plot the sizes.

    For each of opts.num_fraction_for_core_steps evenly spaced fractions
    between opts.min_fraction_for_core and opts.max_fraction_for_core, the
    input BIOM table is filtered to its core, and the core OTU ids (with
    the observation metadata named by opts.otu_md) and the core table are
    written to core_otus_<pct>.txt and core_table_<pct>.biom in the output
    directory. When filter_table_to_core raises TableException for a
    fraction, only a note is written to the text file and the count for
    that fraction is 0. A summary plot of core size against fraction is
    saved as core_otu_size.pdf.

    With both --valid_states and a mapping file, only the sample ids
    selected by that metadata description are considered; otherwise all
    samples are used.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_dir = opts.output_dir

    if opts.num_fraction_for_core_steps < 2:
        option_parser.error("Must perform at least two steps. Increase --num_fraction_for_core_steps.")
    fractions_for_core = linspace(opts.min_fraction_for_core,
                                  opts.max_fraction_for_core,
                                  opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp

    create_dir(output_dir)

    if valid_states and opts.mapping_fp:
        with open(mapping_fp, 'U') as mapping_f:
            sample_ids = sample_ids_from_metadata_description(mapping_f,
                                                              valid_states)
        if len(sample_ids) < 1:
            option_parser.error(
                "--valid_states pattern didn't match any entries in mapping file: \"%s\"" %
                valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the
        # samples to work with
        sample_ids = None

    with open(input_fp, 'U') as input_f:
        input_table = parse_biom_table(input_f)

    otu_counts = []
    summary_figure_fp = join(output_dir, 'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)

        # prep output files
        output_fp = join(output_dir, 'core_otus_%s.txt' % fraction_for_core_str)
        output_table_fp = join(output_dir, 'core_table_%s.biom' % fraction_for_core_str)

        # The with-block closes the text file on every path, including the
        # early `continue` and any unexpected exception.
        with open(output_fp, 'w') as output_f:
            try:
                core_table = filter_table_to_core(input_table,
                                                  sample_ids,
                                                  fraction_for_core)
            except TableException:
                output_f.write("# No OTUs present in %s %% of samples."
                               % fraction_for_core_str)
                otu_counts.append(0)
                continue

            # write some header information to file
            if sample_ids is None:
                output_f.write("# Core OTUs across %s %% of samples.\n" % fraction_for_core_str)
            else:
                output_f.write(
                    "# Core OTUs across %s %% of samples matching the sample metadata pattern \"%s\":\n# %s\n" %
                    (fraction_for_core_str, valid_states, ' '.join(sample_ids)))

            # write the otu id and corresponding metadata for all core otus
            otu_count = 0
            for value, id_, md in core_table.iterObservations():
                output_f.write('%s\t%s\n' % (id_, md[otu_md]))
                otu_count += 1

        # write the core biom table
        with open(output_table_fp, 'w') as output_table_f:
            output_table_f.write(format_biom_table(core_table))

        # append the otu count to the list of counts
        otu_counts.append(otu_count)

    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core), max(fractions_for_core))
    ylim(0, max(otu_counts) + 1)
    xlabel("Fraction of samples that OTU must be observed in to be considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
def pick_subsampled_open_referenence_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_tax_align_tree=True,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

        Step 1 is skipped when both step1_otu_map_fp and
         step1_failures_fasta_fp are supplied. When prefilter_percent_id is
         not None, the input is first searched against prefilter_refseqs_fp
         (refseqs_fp if not given) and the filtered sequences are used as
         input from then on. Unless suppress_step4 is set, the step 3
         failures are clustered de novo as well.

        Files written under output_dir include final_otu_map.txt,
         final_otu_map_mc<min_otu_size>.txt, rep_set.fna, new_refseqs.fna
         and otu_table_mc<min_otu_size>.biom; with run_tax_align_tree also
         the *_w_tax.biom and *_w_tax_no_pynast_failures.biom tables.

        A WorkflowLogger is created when logger is None, and only in that
         case is the final command_handler call asked to close the logger.

        NOTE(review): the function name is spelled 'referenence'; it is
         left unchanged here because callers reference it by this name.
    """
    # for now only allowing uclust for otu picking
    denovo_otu_picking_method = 'uclust'
    reference_otu_picking_method = 'uclust_ref'

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    # python_exe_fp and script_dir are not referenced again below.
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(
        logger, [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id != None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            # prefilter_otu_map_fp is not referenced again below.
            prefilter_otu_map_fp = \
             '%s/%s_otus.txt' % (prefilter_dir,input_basename)
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input', filter_fasta_cmd)])

            # From here on the prefiltered file is the input, so the
            # derived name parts are recomputed from it.
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    # Runs immediately (not queued), so the step 1 failures fasta must
    # already exist at this point.
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures', step2_rep_set_cmd)])

    # Step 3: reference-based picking of all step 1 failures against the
    # step 2 representative set.
    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp,
                                    step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp,
                                    parallel,
                                    params,
                                    logger)

    commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps (`>>` appends to the merged map file)
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
         (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures', step4_rep_set_cmd)])

    else:
        # Merge the otu maps (`>>` appends to the merged map file)
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
         (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)

    ## make the final representative seqs file and a new refseqs file that
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be.
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be
    ## reads from this run so we don't hit issues building a tree using
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, 'U')):
        # Only the first whitespace-delimited field of the header is
        # compared against the ids to keep.
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    if run_tax_align_tree:
        taxonomy_fp, pynast_failures_fp = tax_align_tree(
            repset_fasta_fp=final_repset_fp,
            output_dir=output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            parallel=parallel,
            logger=logger,
            status_update_callback=status_update_callback)

        # Add taxa to otu table
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
         (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
        commands.append([("Add taxa to OTU table", add_taxa_cmd)])

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

        # Build OTU table without PyNAST failures
        otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        filtered_otu_table = filter_otus_from_otu_table(
            parse_biom_table(open(otu_table_w_tax_fp, 'U')),
            get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
            0, inf, 0, inf, negate_ids_to_keep=True)
        otu_table_f = open(otu_table_fp, 'w')
        otu_table_f.write(format_biom_table(filtered_otu_table))
        otu_table_f.close()

        # `commands` is empty at this point; the table above was written
        # directly rather than through a queued command.
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    # Final call with an empty command list; it carries the
    # close_logger_on_success flag computed when the logger was set up.
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=0.60,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """ Run the subsampled open-reference OTU picking workflow

        The steps performed by this function are:
         - (optional prefilter) Pick reference OTUs against
           prefilter_refseqs_fp, passing prefilter_percent_id, and run
           filter_fasta.py with the resulting failures list to build the
           prefiltered input used by every later step.
         - Step 1: pick reference OTUs against refseqs_fp and write the
           failures to a fasta file; pick a representative set for the
           step 1 OTUs.
         - Step 2: subsample the step 1 failures (percent_subsample), pick
           OTUs de novo on the subsample and pick their representative
           sequences.
         - Step 3: pick reference OTUs on all step 1 failures using the
           step 2 representative set as the reference.
         - Step 4 (skipped if suppress_step4): pick OTUs de novo on the
           step 3 failures and pick their representative sequences.
         - Concatenate the OTU maps, drop OTUs via filter_otus_from_otu_map
           (min_otu_size), write rep_set.fna and new_refseqs.fna, build the
           OTU table, and optionally assign taxonomy and align/build a tree.

        input_fp: path to the input fasta file
        refseqs_fp: path to the reference sequence collection
        output_dir: directory where all results are written (created if
         necessary)
        percent_subsample: value handed to subsample_fasta when subsampling
         the step 1 failures
        new_ref_set_id: identifier handed to pick_denovo_otus for step 2;
         step 4 uses '<new_ref_set_id>.CleanUp'
         (presumably the prefix of new OTU ids -- TODO confirm)
        command_handler: callable invoked as
         command_handler(commands, status_update_callback, logger=...,
         close_logger_on_success=False) to run each batch of commands
        params, qiime_config: forwarded to WorkflowLogger and to the
         helper functions that build/run the individual steps
        prefilter_refseqs_fp: reference collection for the prefilter
         (defaults to refseqs_fp)
        run_assign_tax: if True, assign taxonomy and add it to the OTU table
        run_align_and_tree: if True, align/build tree and write an OTU
         table without the PyNAST failures
        prefilter_percent_id: value passed to the prefilter OTU picking
         step; None disables the prefilter
        min_otu_size: passed to filter_otus_from_otu_map and used in the
         output file names (..._mc<min_otu_size>...)
        step1_otu_map_fp, step1_failures_fasta_fp: if both are provided,
         the prefilter and step 1 OTU picking are skipped and these files
         are used instead
        parallel: forwarded to the reference OTU picking, taxonomy and
         alignment helpers
        suppress_step4: if True, skip step 4 and move the step 3 failures
         list to <output_dir>/final_failures.txt
        logger: WorkflowLogger-like object; if None one is created here
         (and closed on completion)
        suppress_md5: if True, do not log the md5s of the input files
        suppress_index_page: if True, do not write <output_dir>/index.html
        denovo_otu_picking_method: 'uclust' or 'usearch61'
        reference_otu_picking_method: 'uclust_ref' or 'usearch61_ref'
        status_update_callback: forwarded to command_handler and helpers

        Nothing is returned; all results are written below output_dir.

        Raises ValueError if the prefilter discards every input sequence,
        and AssertionError if an unknown OTU picking method is requested
        (note that assertions are not evaluated under ``python -O``).
    """
    # only these OTU picking methods are supported by this workflow
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))
    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        # we created the logger, so we are responsible for closing it
        close_logger_on_success = True
        index_links.append(
            ('Run summary data',
             log_fp,
             _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)'
                 % (float(prefilter_percent_id)*100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # every later step works on the prefiltered input
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    # The step 1 rep set is needed whether or not step 1 was run here
    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    # the closing double quote of the logged command belongs directly after
    # the call (before the trailing newlines), as in the filter log below
    logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', \'%f\')"\n\n'
                 % (abspath(step1_failures_fasta_fp),
                    abspath(step2_input_fasta_fp),
                    percent_subsample))

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append(
        [('Pick representative set for subsampled failures',
          step2_rep_set_cmd)])

    # Step 3: all step 1 failures are searched against the step 2 rep set
    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp,
        step3_dir,
        reference_otu_picking_method,
        step2_repset_fasta_fp,
        parallel,
        params,
        logger)

    commands.append([
        ('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir
    index_links.append(
        ('Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
         merged_otu_map_fp,
         _index_headers['otu_maps']))

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp,
             step3_failures_list_fp,
             step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script interface
        # would append the newly created maps to the map that was previously
        # created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script interface
        # would append the newly created maps to the map that was previously
        # created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt'
                          % (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(
        otu_fp,
        otu_no_singletons_fp,
        min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp,
         _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n'
                 % (abspath(otu_fp),
                    abspath(otu_no_singletons_fp),
                    min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))

    # Context managers guarantee that every handle is closed, including the
    # read handles and when an error interrupts the writing.
    with open(final_repset_fp, 'w') as final_repset_f:
        # write non-singleton otus representative sequences from step1 to the
        # final rep set file
        with open(step1_repset_fasta_fp, 'U') as step1_repset_f:
            for otu_id, seq in parse_fasta(step1_repset_f):
                if otu_id.split()[0] in otus_to_keep:
                    final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
        logger.write('# Write non-singleton otus representative sequences ' +
                     'from step1 to the final rep set file: %s\n\n'
                     % final_repset_fp)

        # copy the full input refseqs file to the new refseqs_fp
        copy(refseqs_fp, new_refseqs_fp)
        with open(new_refseqs_fp, 'a') as new_refseqs_f:
            # make sure the appended records start on a fresh line
            new_refseqs_f.write('\n')
            logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                         'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))

            # iterate over all representative sequences from step2 and step4
            # and write those corresponding to non-singleton otus to the
            # final representative set file and the new reference sequences
            # file.
            with open(step2_repset_fasta_fp, 'U') as step2_repset_f:
                for otu_id, seq in parse_fasta(step2_repset_f):
                    if otu_id.split()[0] in otus_to_keep:
                        new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                        final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
            if not suppress_step4:
                with open(step4_repset_fasta_fp, 'U') as step4_repset_f:
                    for otu_id, seq in parse_fasta(step4_repset_f):
                        if otu_id.split()[0] in otus_to_keep:
                            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences from ' +
                 'step 2 and step 4 to the final representative set and the new reference' +
                 ' set (%s and %s respectively)\n\n'
                 % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                  min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures. The ids found in the
            # failures fasta are excluded (negate_ids_to_keep=True); the
            # 0/inf bounds presumably leave the count/sample filters wide
            # open -- TODO confirm against filter_otus_from_otu_table.
            with open(align_and_tree_input_otu_table, 'U') as otu_table_in_f:
                with open(pynast_failures_fp, 'U') as pynast_failures_f:
                    filtered_otu_table = filter_otus_from_otu_table(
                        parse_biom_table(otu_table_in_f),
                        get_seq_ids_from_fasta_file(pynast_failures_f),
                        0, inf, 0, inf, negate_ids_to_keep=True)
            with open(pynast_failure_filtered_otu_table_fp, 'w') as otu_table_f:
                otu_table_f.write(format_biom_table(filtered_otu_table))

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_assign_tax=True,
        run_align_and_tree=True,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Run the subsampled open-reference OTU picking workflow

        The steps performed by this function are:
         - (optional prefilter) Pick reference OTUs against
           prefilter_refseqs_fp, passing prefilter_percent_id, and run
           filter_fasta.py with the resulting failures list to build the
           prefiltered input used by every later step.
         - Step 1: pick reference OTUs against refseqs_fp and write the
           failures to a fasta file; pick a representative set for the
           step 1 OTUs.
         - Step 2: subsample the step 1 failures (percent_subsample), pick
           OTUs de novo on the subsample and pick their representative
           sequences.
         - Step 3: pick reference OTUs on all step 1 failures using the
           step 2 representative set as the reference.
         - Step 4 (skipped if suppress_step4): pick OTUs de novo on the
           step 3 failures and pick their representative sequences.
         - Concatenate the OTU maps, drop OTUs via filter_otus_from_otu_map
           (min_otu_size), write rep_set.fna and new_refseqs.fna, build the
           OTU table, and optionally assign taxonomy and align/build a tree.

        input_fp: path to the input fasta file
        refseqs_fp: path to the reference sequence collection
        output_dir: directory where all results are written (created if
         necessary)
        percent_subsample: value handed to subsample_fasta when subsampling
         the step 1 failures
        new_ref_set_id: identifier handed to pick_denovo_otus for step 2;
         step 4 uses '<new_ref_set_id>.CleanUp'
         (presumably the prefix of new OTU ids -- TODO confirm)
        command_handler: callable invoked as
         command_handler(commands, status_update_callback, logger=...,
         close_logger_on_success=False) to run each batch of commands
        params, qiime_config: forwarded to WorkflowLogger and to the
         helper functions that build/run the individual steps
        prefilter_refseqs_fp: reference collection for the prefilter
         (defaults to refseqs_fp)
        run_assign_tax: if True, assign taxonomy and add it to the OTU table
        run_align_and_tree: if True, align/build tree and write an OTU
         table without the PyNAST failures
        prefilter_percent_id: value passed to the prefilter OTU picking
         step; None disables the prefilter
        min_otu_size: passed to filter_otus_from_otu_map and used in the
         output file names (..._mc<min_otu_size>...)
        step1_otu_map_fp, step1_failures_fasta_fp: if both are provided,
         the prefilter and step 1 OTU picking are skipped and these files
         are used instead
        parallel: forwarded to the reference OTU picking, taxonomy and
         alignment helpers
        suppress_step4: if True, skip step 4 and move the step 3 failures
         list to <output_dir>/final_failures.txt
        logger: WorkflowLogger-like object; if None one is created here
         (and closed on completion)
        suppress_md5: if True, do not log the md5s of the input files
        denovo_otu_picking_method: 'uclust' or 'usearch61'
        reference_otu_picking_method: 'uclust_ref' or 'usearch61_ref'
        status_update_callback: forwarded to command_handler and helpers

        Nothing is returned; all results are written below output_dir.

        Raises ValueError if the prefilter discards every input sequence,
        and AssertionError if an unknown OTU picking method is requested
        (note that assertions are not evaluated under ``python -O``).
    """
    # only these OTU picking methods are supported by this workflow
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))
    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        # we created the logger, so we are responsible for closing it
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(
            logger,
            [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # every later step works on the prefiltered input
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    # The step 1 rep set is needed whether or not step 1 was run here
    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    # the closing double quote of the logged command belongs directly after
    # the call (before the trailing newlines), as in the filter log below
    logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', \'%f\')"\n\n'
                 % (abspath(step1_failures_fasta_fp),
                    abspath(step2_input_fasta_fp),
                    percent_subsample))

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    # Step 3: all step 1 failures are searched against the step 2 rep set
    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp,
                                    step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp,
                                    parallel,
                                    params,
                                    logger)

    commands.append([('Pick reference OTUs using de novo rep set',
                      step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp,
             step3_failures_list_fp,
             step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script interface
        # would append the newly created maps to the map that was previously
        # created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script interface
        # would append the newly created maps to the map that was previously
        # created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt'
                          % (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)

    logger.write(
        '# Filter singletons from the otu map using API \n' +
        'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
        '(\'%s\', \'%s\', \'%d\')"\n\n'
        % (abspath(otu_fp),
           abspath(otu_no_singletons_fp),
           min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir

    # Context managers guarantee that every handle is closed, including the
    # read handles and when an error interrupts the writing.
    with open(final_repset_fp, 'w') as final_repset_f:
        # write non-singleton otus representative sequences from step1 to the
        # final rep set file
        with open(step1_repset_fasta_fp, 'U') as step1_repset_f:
            for otu_id, seq in MinimalFastaParser(step1_repset_f):
                if otu_id.split()[0] in otus_to_keep:
                    final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
        logger.write('# Write non-singleton otus representative sequences ' +
                     'from step1 to the final rep set file: %s\n\n'
                     % final_repset_fp)

        # copy the full input refseqs file to the new refseqs_fp
        copy(refseqs_fp, new_refseqs_fp)
        with open(new_refseqs_fp, 'a') as new_refseqs_f:
            # make sure the appended records start on a fresh line
            new_refseqs_f.write('\n')
            logger.write(
                '# Copy the full input refseqs file to the new refseq file\n' +
                'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))

            # iterate over all representative sequences from step2 and step4
            # and write those corresponding to non-singleton otus to the
            # final representative set file and the new reference sequences
            # file.
            with open(step2_repset_fasta_fp, 'U') as step2_repset_f:
                for otu_id, seq in MinimalFastaParser(step2_repset_f):
                    if otu_id.split()[0] in otus_to_keep:
                        new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                        final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
            if not suppress_step4:
                with open(step4_repset_fasta_fp, 'U') as step4_repset_f:
                    for otu_id, seq in MinimalFastaParser(step4_repset_f):
                        if otu_id.split()[0] in otus_to_keep:
                            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write(
        '# Write non-singleton otus representative sequences from ' +
        'step 2 and step 4 to the final representative set and the new reference' +
        ' set (%s and %s respectively)\n\n'
        % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                  min_otu_size)

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures. The ids found in the
            # failures fasta are excluded (negate_ids_to_keep=True); the
            # 0/inf bounds presumably leave the count/sample filters wide
            # open -- TODO confirm against filter_otus_from_otu_table.
            with open(align_and_tree_input_otu_table, 'U') as otu_table_in_f:
                with open(pynast_failures_fp, 'U') as pynast_failures_f:
                    filtered_otu_table = filter_otus_from_otu_table(
                        parse_biom_table(otu_table_in_f),
                        get_seq_ids_from_fasta_file(pynast_failures_f),
                        0, inf, 0, inf, negate_ids_to_keep=True)
            with open(pynast_failure_filtered_otu_table_fp, 'w') as otu_table_f:
                otu_table_f.write(format_biom_table(filtered_otu_table))

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()
def main():
    """Compute the core OTUs of a BIOM table over a range of sample fractions.

    The fractions are evenly spaced between ``--min_fraction_for_core`` and
    ``--max_fraction_for_core`` (``--num_fraction_for_core_steps`` values).
    For each fraction two files are written to the output directory:

    - ``core_otus_<pct>.txt``: a header line followed by one
      ``<otu id>\\t<metadata>`` line per core OTU (only a comment line if
      no OTU qualifies at that fraction).
    - ``core_table_<pct>.biom``: the table filtered to the core OTUs
      (not written when no OTU qualifies).

    A summary plot of the number of core OTUs per fraction is saved as
    ``core_otu_size.pdf``.

    If both ``--valid_states`` and a mapping file are supplied, the core is
    computed only across the samples matching that metadata description;
    otherwise all samples are used.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_dir = opts.output_dir

    if opts.num_fraction_for_core_steps < 2:
        option_parser.error("Must perform at least two steps. Increase --num_fraction_for_core_steps.")
    fractions_for_core = linspace(opts.min_fraction_for_core,
                                  opts.max_fraction_for_core,
                                  opts.num_fraction_for_core_steps)

    otu_md = opts.otu_md
    valid_states = opts.valid_states
    mapping_fp = opts.mapping_fp

    create_dir(output_dir)

    if valid_states and opts.mapping_fp:
        # close the mapping file as soon as the sample ids are extracted
        with open(mapping_fp, 'U') as mapping_f:
            sample_ids = sample_ids_from_metadata_description(mapping_f,
                                                              valid_states)
        if len(sample_ids) < 1:
            option_parser.error(
                "--valid_states pattern didn't match any entries in mapping file: \"%s\"" %
                valid_states)
    else:
        # get core across all samples if user doesn't specify a subset of the
        # samples to work with
        sample_ids = None

    with open(input_fp, 'U') as input_f:
        input_table = parse_biom_table(input_f)

    otu_counts = []
    summary_figure_fp = join(output_dir, 'core_otu_size.pdf')
    for fraction_for_core in fractions_for_core:
        # build a string representation of the fraction as that gets used
        # several times
        fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)

        # prep output files
        output_fp = join(output_dir,
                         'core_otus_%s.txt' % fraction_for_core_str)
        output_table_fp = join(output_dir,
                               'core_table_%s.biom' % fraction_for_core_str)

        # the context manager guarantees the text file is closed on every
        # path out of this iteration (normal, ``continue`` or exception)
        with open(output_fp, 'w') as output_f:
            try:
                core_table = filter_table_to_core(input_table,
                                                  sample_ids,
                                                  fraction_for_core)
            except TableException:
                # nothing is core at this fraction: record that and move on
                output_f.write("# No OTUs present in %s %% of samples."
                               % fraction_for_core_str)
                otu_counts.append(0)
                continue

            # write some header information to file
            if sample_ids is None:
                output_f.write("# Core OTUs across %s %% of samples.\n" %
                               fraction_for_core_str)
            else:
                output_f.write(
                    "# Core OTUs across %s %% of samples matching the sample metadata pattern \"%s\":\n# %s\n" %
                    (fraction_for_core_str, valid_states, ' '.join(sample_ids)))

            # write the otu id and corresponding metadata for all core otus
            otu_count = 0
            for value, id_, md in core_table.iterObservations():
                output_f.write('%s\t%s\n' %
                               (id_, get_first_metadata_entry(md[otu_md])))
                otu_count += 1

        # write the core biom table
        with open(output_table_fp, 'w') as output_table_f:
            output_table_f.write(format_biom_table(core_table))

        # append the otu count to the list of counts
        otu_counts.append(otu_count)

    plot(fractions_for_core, otu_counts)
    xlim(min(fractions_for_core), max(fractions_for_core))
    ylim(0, max(otu_counts) + 1)
    xlabel("Fraction of samples that OTU must be observed in to be considered 'core'")
    ylabel("Number of OTUs")
    savefig(summary_figure_fp)
def main():
    """Demultiplex and quality-filter one or more fastq libraries.

    Reads the sequence (and, unless ``--barcode_type not-barcoded``, the
    barcode and mapping) files named on the command line, runs each library
    through the single-end fastq processor and writes to the output
    directory:

    - ``seqs.fna``: the retained sequences (written as
      ``seqs.fna.incomplete`` and renamed on completion)
    - ``seqs.qual`` / ``seqs.fastq``: optional, when quality scores /
      demultiplexed fastq were requested
    - ``split_library_log.txt`` and ``histograms.txt``

    Sequence identifiers continue across libraries: after each library the
    next starting id is one past the last id reported by that library.
    Invalid option combinations are reported through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        # placeholders so the per-library loop can treat both modes alike
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            option_parser.error("Only valid phred offsets are: %s" %
                                ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file
        # figure it out...
        phred_to_ascii_f = None

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
                            '0 and 1 (inclusive). You passed %1.5f' %
                            min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    # a single mapping file may be shared by all sequence files
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps),
                len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:
        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h, s, q))
    else:
        def fastq_writer(h, s, q):
            pass

    def file_md5(fp):
        # hex md5 of a file, without leaving the handle open
        with open(fp) as f:
            return safe_md5(f).hexdigest()

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    # the three lists were verified above to have the same length, so
    # zipping them visits exactly the same libraries as indexing would
    libraries = zip(sequence_read_fps, barcode_read_fps, mapping_fps)
    for i, (sequence_read_fp, barcode_read_fp, mapping_fp) in \
            enumerate(libraries):
        if mapping_fp is not None:
            with open(mapping_fp, 'U') as mapping_f:
                _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                    mapping_f,
                    disable_primer_check=True,
                    has_barcodes=barcode_read_fp is not None)
        else:
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {
                DNA.rc(k): v for k, v in barcode_to_sample_id.iteritems()
            }

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, file_md5(mapping_fp)))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp, str(file_md5(sequence_read_fp))))

        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')

        # if the generator yields nothing, seq_id keeps this value and the
        # next library starts one past it (see below)
        seq_id = start_seq_id

        barcode_read_f = None
        if barcode_read_fp is not None:
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                        (barcode_read_fp, file_md5(barcode_read_fp)))

            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_to_ascii_f=phred_to_ascii_f)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        # this library's inputs are fully consumed: release the handles
        sequence_read_f.close()
        if barcode_read_f is not None:
            barcode_read_f.close()

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    # flush and release the log and histogram files now that every library
    # has been processed
    log_f.close()
    histogram_f.close()

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=0.60,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """ Run the subsampled open-reference OTU picking workflow

        The steps performed by this function are:
          - (optional) Prefilter the input against prefilter_refseqs_fp at
            prefilter_percent_id and discard the sequences that fail.
          - Step 1: pick reference OTUs against refseqs_fp (skipped when
            both step1_otu_map_fp and step1_failures_fasta_fp are given).
          - Step 2: subsample the step 1 failures (percent_subsample), pick
            OTUs de novo on the subsample and pick their representative
            sequences.
          - Step 3: pick reference OTUs on all step 1 failures using the
            step 2 representative set as the reference.
          - Step 4 (unless suppress_step4): pick OTUs de novo on the step 3
            failures.
          - Merge the OTU maps, drop OTUs with fewer than min_otu_size
            members, and write rep_set.fna, new_refseqs.fna and the OTU
            table.
          - Optionally assign taxonomy (run_assign_tax) and align / build a
            tree and filter PyNAST failures from the OTU table
            (run_align_and_tree).

        All output is written under output_dir. Nothing is returned.

        If logger is None a WorkflowLogger is created and closed by this
         function; a logger supplied by the caller is left open.

        An AssertionError is raised for unknown OTU picking methods.
    """
    # only the uclust and usearch61 families are supported for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_filename = split(input_fp)[1]
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            # every later step works on the prefiltered sequences
            input_fp = prefiltered_input_fp
            input_filename = split(input_fp)[1]
            input_basename, input_ext = splitext(input_filename)

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp,
        step3_dir,
        reference_otu_picking_method,
        step2_repset_fasta_fp,
        parallel,
        params,
        logger)

    commands.append([
        ('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp,
             step3_failures_list_fp,
             step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])

        # Merge the otu maps. '>' (truncate) is used deliberately: with
        # '>>' a re-run into an existing output directory would append to
        # the previous final map and duplicate every OTU entry.
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp,
             step3_otu_map_fp,
             step4_otu_map_fp,
             merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])

    else:
        # Merge the otu maps (truncating, see the note in the branch above)
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                         min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir

    with open(final_repset_fp, 'w') as final_repset_f:
        # write non-singleton otus representative sequences from step1 to
        # the final rep set file
        with open(step1_repset_fasta_fp, 'U') as step1_repset_f:
            for otu_id, seq in MinimalFastaParser(step1_repset_f):
                if otu_id.split()[0] in otus_to_keep:
                    final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

        # copy the full input refseqs file to the new refseqs_fp
        copy(refseqs_fp, new_refseqs_fp)
        with open(new_refseqs_fp, 'a') as new_refseqs_f:
            # separate the copied references from the appended ones
            new_refseqs_f.write('\n')

            # iterate over all representative sequences from step2 and step4
            # and write those corresponding to non-singleton otus to the
            # final representative set file and the new reference sequences
            # file.
            new_repset_fps = [step2_repset_fasta_fp]
            if not suppress_step4:
                new_repset_fps.append(step4_repset_fasta_fp)
            for new_repset_fp in new_repset_fps:
                with open(new_repset_fp, 'U') as new_repset_f:
                    for otu_id, seq in MinimalFastaParser(new_repset_f):
                        if otu_id.split()[0] in otus_to_keep:
                            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                          min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with open(align_and_tree_input_otu_table, 'U') as otu_table_in_f, \
                    open(pynast_failures_fp, 'U') as pynast_failures_f:
                filtered_otu_table = filter_otus_from_otu_table(
                    parse_biom_table(otu_table_in_f),
                    get_seq_ids_from_fasta_file(pynast_failures_f),
                    0, inf, 0, inf, negate_ids_to_keep=True)
            with open(pynast_failure_filtered_otu_table_fp, 'w') as otu_table_f:
                otu_table_f.write(format_biom_table(filtered_otu_table))

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    # only close a logger that this function created itself
    if close_logger_on_success:
        logger.close()
def main():
    """Create distance boxplots (and optional statistics) per mapping field.

    For every field named with ``-f`` the distances in the distance matrix
    are grouped by that field's values and, depending on the ``--suppress_*``
    options, the following distributions are plotted in one figure: all
    within-group distances, all between-group distances, and each individual
    within- and between-group comparison. Written to the output directory,
    per field:

    - ``<field>_Distances.<imagetype>``: the boxplot figure
    - ``<field>_Stats.xls``: all-pairs t-test results (unless
      ``--suppress_significance_tests``)
    - ``<field>_Distances.xls``: the raw plotted data (with
      ``--save_raw_data``)

    Invalid input is reported through ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except Exception:
        # deliberately broad (the failure mode of create_dir is not visible
        # here) but no longer swallows KeyboardInterrupt / SystemExit
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        with open(opts.distance_matrix_fp, 'U') as dist_matrix_f:
            dist_matrix_header, dist_matrix = parse_distmat(dist_matrix_f)
    except Exception:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")

    try:
        with open(opts.mapping_fp, 'U') as mapping_f:
            mapping, mapping_header, mapping_comments = parse_mapping_file(
                mapping_f)
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # The None check has to happen before the value is split; afterwards it
    # could never be true and a missing -f would surface as an
    # AttributeError instead of a usage error.
    if opts.fields is None:
        option_parser.error("You must provide at least one field using the -f "
                            "option.")
    # strip surrounding whitespace, then any surrounding quotes
    fields = [field.strip().strip('"').strip("'")
              for field in opts.fields.split(',')]

    # Make sure each field is in the mapping file.
    for field in fields:
        if field not in mapping_header:
            option_parser.error(
                "The field '%s' is not in the provided "
                "mapping file. Please supply correct fields (using the -f "
                "option) corresponding to fields in the mapping file." % field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Generate the various boxplots, depending on what the user wanted
    # suppressed. Add them all to one encompassing plot.
    for field in fields:
        plot_data = []
        plot_labels = []

        if not opts.suppress_all_within:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header, dist_matrix,
                                          mapping_header, mapping, field,
                                          within=True))
            plot_labels.append("All within %s" % field)
        if not opts.suppress_all_between:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header, dist_matrix,
                                          mapping_header, mapping, field,
                                          within=False))
            plot_labels.append("All between %s" % field)
        if not opts.suppress_individual_within:
            within_dists = get_grouped_distances(dist_matrix_header,
                                                 dist_matrix, mapping_header,
                                                 mapping, field, within=True)
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
        if not opts.suppress_individual_between:
            between_dists = get_grouped_distances(dist_matrix_header,
                                                  dist_matrix, mapping_header,
                                                  mapping, field,
                                                  within=False)
            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

        # We now have our data and labels ready, so plot them!
        assert (len(plot_data) == len(plot_labels)), "The number " +\
            "of boxplot labels does not match the number of " +\
            "boxplots."
        if plot_data:
            if opts.sort:
                # Sort our plot data in order of increasing median.
                sorted_data = []
                for label, distribution in zip(plot_labels, plot_data):
                    sorted_data.append(
                        (label, distribution, median(distribution)))
                sorted_data.sort(key=itemgetter(2))
                plot_labels = []
                plot_data = []
                for label, distribution, median_value in sorted_data:
                    plot_labels.append(label)
                    plot_data.append(distribution)

            width = opts.width
            height = opts.height
            if width is None:
                # default: scale the figure with the number of boxes
                width = len(plot_data) * opts.box_width + 2
            if width <= 0 or height <= 0:
                option_parser.error("The specified width and height of the "
                                    "image must be greater than zero.")

            plot_figure = generate_box_plots(
                plot_data,
                x_tick_labels=plot_labels,
                title="%s Distances" % field,
                x_label="Grouping",
                y_label="Distance",
                x_tick_labels_orientation='vertical',
                y_min=y_min,
                y_max=y_max,
                whisker_length=opts.whisker_length,
                box_width=opts.box_width,
                box_color=opts.box_color,
                figure_width=width,
                figure_height=height)

            output_plot_fp = join(opts.output_dir,
                                  "%s_Distances.%s" % (field, opts.imagetype))
            plot_figure.savefig(output_plot_fp,
                                format=opts.imagetype,
                                transparent=opts.transparent)
        else:
            option_parser.error("You have chosen to suppress all plots. At "
                                "least one type of plot must be unsuppressed.")

        if not opts.suppress_significance_tests:
            sig_tests_results = all_pairs_t_test(
                plot_labels,
                plot_data,
                tail_type=opts.tail_type,
                num_permutations=opts.num_permutations)
            sig_tests_fp = join(opts.output_dir, "%s_Stats.xls" % field)
            with open(sig_tests_fp, 'w') as sig_tests_f:
                sig_tests_f.write(sig_tests_results)

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert (len(plot_labels) == len(plot_data))
            raw_data_fp = join(opts.output_dir, "%s_Distances.xls" % field)
            with open(raw_data_fp, 'w') as raw_data_f:
                for label, data in zip(plot_labels, plot_data):
                    raw_data_f.write(label.replace(" ", "_") + "\t")
                    raw_data_f.write("\t".join(map(str, data)))
                    raw_data_f.write("\n")
def iterative_pick_subsampled_open_referenence_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        run_tax_align_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_referenence_otus workflow on multiple
         inputs and handle processing of the results.

        Each input file is processed into output_dir/<i>/, where i is the
         position of the file in input_fps. Iterations whose output already
         exists are skipped. The new_refseqs.fna written by one iteration is
         used as the reference collection of the next one.

        After all iterations the per-iteration OTU tables are merged into
         output_dir/otu_table_mc<min_otu_size>.biom (unless a non-empty one
         already exists) and a master rep_set.fna is built. If
         run_tax_align_tree is True, taxonomy is added to the merged table
         and OTUs whose representative sequence failed PyNAST alignment are
         filtered from it.

        If logger is None a WorkflowLogger is created and closed by this
         function; a logger supplied by the caller is left open.

        Nothing is returned.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write(
                'Iteration %d (input file: %s) output data already exists. '
                'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            # taxonomy/alignment/tree are run once on the merged results
            # below, never per iteration
            pick_subsampled_open_referenence_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_tax_align_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                status_update_callback=status_update_callback)
        # perform post-iteration file shuffling whether the previous
        # iteration's data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    if run_tax_align_tree:
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        final_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                min_otu_size)
        if exists(final_otu_table_fp) and getsize(final_otu_table_fp) > 0:
            # report the file that was actually checked
            logger.write("Final output file exists (%s). Will not rebuild." %
                         final_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp, final_otu_table_fp],
                         error_on_missing=False)

            taxonomy_fp, pynast_failures_fp = tax_align_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
                (otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_taxa_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # Build OTU table without PyNAST failures
            with open(otu_table_w_tax_fp, 'U') as otu_table_in_f, \
                    open(pynast_failures_fp, 'U') as pynast_failures_f:
                filtered_otu_table = filter_otus_from_otu_table(
                    parse_biom_table(otu_table_in_f),
                    get_seq_ids_from_fasta_file(pynast_failures_f),
                    0, inf, 0, inf, negate_ids_to_keep=True)
            with open(final_otu_table_fp, 'w') as otu_table_f:
                otu_table_f.write(format_biom_table(filtered_otu_table))

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    # only close a logger that this function created itself
    if close_logger_on_success:
        logger.close()
def main():
    """Command-line entry point: demultiplex and quality filter fastq reads.

    Reads its settings from the options described by ``script_info``,
    validates them, and then processes each (sequence reads, barcode reads,
    mapping file) triple in turn.  Output written to ``opts.output_dir``:

    * ``seqs.fna`` -- always written (via ``seqs.fna.incomplete``, which is
      renamed once every input has been processed).
    * ``seqs.qual`` -- only if ``--store_qual_scores`` was passed.
    * ``seqs.fastq`` -- only if ``--store_demultiplexed_fastq`` was passed.
    * ``split_library_log.txt`` and ``histograms.txt`` -- handed to the
      per-file processing functions as ``log_f`` / ``histogram_f``.

    Invalid option combinations are reported through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_id = opts.sample_id
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    filter_bad_illumina_qual_digit = False  # opts.filter_bad_illumina_qual_digit
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # If this is not a multiplexed run there are no barcode files; pad the
    # list with None so it can still be zipped with the sequence files.
    if barcode_type == 'not-barcoded':
        if sample_id is None:
            option_parser.error("If not providing barcode reads (because "
            "your data is not multiplexed), must provide a --sample_id.")
        barcode_read_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            option_parser.error(
                "Only valid phred offsets are: %s" %
                ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file
        # figure it out...
        phred_to_ascii_f = None

    if opts.last_bad_quality_char is not None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
         'Use -q instead (see option help text by passing -h)')

    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
         '0 and 1 (inclusive). You passed %1.5f' % min_per_read_length_fraction)

    # Barcode types without a registered decoder get no error correction.
    try:
        barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type]
    except KeyError:
        barcode_correction_fn = None

    # A single mapping file is reused for every sequence file.
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps),
                len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode and mapping files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    # Define the optional writers unconditionally (as no-ops when the
    # corresponding output is not requested) so the read loop below does not
    # have to re-check the options for every sequence.
    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:
        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')

        def fastq_writer(h, s, q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h, s, q))
    else:
        def fastq_writer(h, s, q):
            pass

    def file_md5(fp):
        # Hex md5 of the file at fp; the handle is closed once hashed.
        with open(fp) as md5_f:
            return safe_md5(md5_f).hexdigest()

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    for sequence_read_fp, barcode_read_fp, mapping_fp in zip(
            sequence_read_fps, barcode_read_fps, mapping_fps):
        mapping_f = open(mapping_fp, 'U')
        h, i, barcode_to_sample_id, warnings, errors, p, a = check_map(
            mapping_f,
            disable_primer_check=True,
            has_barcodes=barcode_read_fp is not None)

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = dict(
                [(DNA.rc(k), v) for k, v in barcode_to_sample_id.items()])

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = \
                get_invalid_golay_barcodes(barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay codes. " +
                    "Do they need to be reverse complimented? If these are not " +
                    "golay barcodes pass --barcode_type 12 to disable barcode " +
                    "error correction, or pass --barcode_type # if the barcodes " +
                    "are not 12 base pairs, where # is the size of the barcodes. " +
                    " Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        log_f.write('Mapping filepath: %s (md5: %s)\n' %
                    (mapping_fp, file_md5(mapping_fp)))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp, file_md5(sequence_read_fp)))

        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')

        # Fallback so start_seq_id still advances when no read is yielded.
        seq_id = start_seq_id
        barcode_read_f = None

        if barcode_read_fp is not None:
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                        (barcode_read_fp, file_md5(barcode_read_fp)))

            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f, barcode_read_f, barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f, sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_to_ascii_f=phred_to_ascii_f)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        # Continue numbering from the last id seen so ids stay unique across
        # input files.
        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

        # The generator is exhausted, so the inputs are no longer needed.
        sequence_read_f.close()
        if barcode_read_f is not None:
            barcode_read_f.close()
        mapping_f.close()

    log_f.close()
    histogram_f.close()

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
def main():
    """Command-line entry point: plot distance comparisons between groups.

    Reads a distance matrix and a metadata mapping file, collects the
    distances between every state of ``opts.field`` and each of the
    comparison groups given with ``-c``, and writes into ``opts.output_dir``:

    * ``<field>_Distance_Comparisons.<imagetype>`` -- the plot.
    * ``<field>_Stats.txt`` -- output of ``all_pairs_t_test`` (unless
      ``opts.suppress_significance_tests`` is set).
    * ``<field>_Distance_Comparisons.txt`` -- tab-delimited raw distances
      (only if ``opts.save_raw_data`` is set).

    Invalid options and unparsable input files are reported through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        with open(opts.distance_matrix_fp, 'U') as dist_matrix_f:
            dist_matrix_header, dist_matrix = parse_distmat(dist_matrix_f)
    except Exception:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")
    try:
        with open(opts.mapping_fp, 'U') as mapping_f:
            mapping, mapping_header, mapping_comments = \
                parse_mapping_file(mapping_f)
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Parse the field states that will be compared to every other field state.
    # The None check has to come before the split: splitting None would raise
    # AttributeError and the usage error would never be shown.
    if opts.comparison_groups is None:
        option_parser.error("You must provide at least one field state to "
                            "compare (using the -c option).")
    comparison_field_states = [
        field_state.strip().strip('"').strip("'")
        for field_state in opts.comparison_groups.split(',')]

    # Get distance comparisons between each field state and each of the
    # comparison field states.
    field = opts.field
    comparison_groupings = get_field_state_comparisons(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field,
        comparison_field_states)

    # Grab a list of all field states that had the comparison field states
    # compared against them. These will be plotted along the x-axis.
    field_states = comparison_groupings.keys()

    def custom_comparator(x, y):
        # cmp-style comparison: numeric when both values parse as numbers,
        # otherwise on the original values.  Returns only the sign; the
        # numeric difference must not be truncated with int(), or values
        # less than 1 apart (e.g. 0.2 and 0.5) would compare as equal.
        try:
            x, y = float(x), float(y)
        except (TypeError, ValueError):
            pass
        return (x > y) - (x < y)

    # Sort the field states as numbers if the elements are numbers, else sort
    # them lexically.
    field_states.sort(custom_comparator)

    # If the label type is numeric, get a list of all field states in sorted
    # numeric order. These will be used to determine the spacing of the
    # field state 'points' along the x-axis.
    x_spacing = None
    if opts.label_type == "numeric":
        try:
            x_spacing = sorted(map(float, field_states))
        except (TypeError, ValueError):
            option_parser.error("The 'numeric' label type is invalid because "
                                "not all field states could be converted into "
                                "numbers. Please specify a different label "
                                "type.")

    # Accumulate the data for each field state 'point' along the x-axis.
    plot_data = [
        [comparison_groupings[field_state][comp_field_state]
         for comp_field_state in comparison_field_states]
        for field_state in field_states]
    plot_x_axis_labels = list(field_states)

    # Plot the data and labels.
    plot_title = "Distance Comparisons"
    plot_x_label = field
    plot_y_label = "Distance"

    # If we are creating a bar chart or box plot, grab a list of good data
    # colors to use.
    plot_type = opts.plot_type
    plot_colors = None
    if plot_type == "bar" or plot_type == "box":
        plot_colors = [matplotlib_rgb_color(data_colors[color].toRGB())
                       for color in data_color_order]

    assert plot_data, "Error: there is no data to plot!"

    width = opts.width
    height = opts.height
    if width <= 0 or height <= 0:
        option_parser.error("The specified width and height of the image must "
                            "be greater than zero.")

    plot_figure = generate_comparative_plots(
        opts.plot_type, plot_data, x_values=x_spacing,
        data_point_labels=plot_x_axis_labels,
        distribution_labels=comparison_field_states,
        distribution_markers=plot_colors, x_label=plot_x_label,
        y_label=plot_y_label, title=plot_title,
        x_tick_labels_orientation=opts.x_tick_labels_orientation,
        y_min=y_min, y_max=y_max, whisker_length=opts.whisker_length,
        error_bar_type=opts.error_bar_type,
        distribution_width=opts.distribution_width,
        figure_width=width, figure_height=height)

    # Save the plot in the specified format.
    output_plot_fp = join(
        opts.output_dir,
        "%s_Distance_Comparisons.%s" % (field, opts.imagetype))
    plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                        transparent=opts.transparent)

    if not opts.suppress_significance_tests:
        with open(join(opts.output_dir, "%s_Stats.txt" % field),
                  'w') as sig_tests_f:
            # Rearrange the plot data into a format suitable for
            # all_pairs_t_test.
            sig_tests_labels = []
            sig_tests_data = []
            for data_point, data_point_label in zip(plot_data,
                                                    plot_x_axis_labels):
                for dist, comp_field in zip(data_point,
                                            comparison_field_states):
                    sig_tests_labels.append(
                        '%s vs %s' % (data_point_label, comp_field))
                    sig_tests_data.append(dist)

            sig_tests_results = all_pairs_t_test(
                sig_tests_labels, sig_tests_data, tail_type=opts.tail_type,
                num_permutations=opts.num_permutations)
            sig_tests_f.write(sig_tests_results)

    if opts.save_raw_data:
        # Write the raw plot data into a tab-delimited file, where each line
        # has the distances between a comparison group and another field state
        # 'point' along the x-axis.
        assert (len(plot_x_axis_labels) == len(plot_data)), (
            "The number of "
            "labels do not match the number of points along the x-axis.")

        raw_data_fp = join(opts.output_dir,
                           "%s_Distance_Comparisons.txt" % field)
        with open(raw_data_fp, 'w') as raw_data_f:
            raw_data_f.write("#ComparisonGroup\tFieldState\tDistances\n")
            for label, data in zip(plot_x_axis_labels, plot_data):
                assert (len(comparison_field_states) == len(data)), (
                    "The "
                    "number of specified comparison groups does not match "
                    "the number of groups found at the current point along "
                    "the x-axis.")
                for comp_field_state, comp_grp_data in zip(
                        comparison_field_states, data):
                    raw_data_f.write(
                        comp_field_state + "\t" + label + "\t" +
                        "\t".join(map(str, comp_grp_data)) + "\n")
def main():
    """Command-line entry point: plot distance comparisons between groups.

    Reads a distance matrix and a metadata mapping file, collects the
    distances between every state of ``opts.field`` and each of the
    comparison groups given with ``-c``, and writes into ``opts.output_dir``:

    * ``<field>_Distance_Comparisons.<imagetype>`` -- the plot produced by
      ``grouped_distributions``.
    * ``<field>_Stats.txt`` -- output of ``all_pairs_t_test`` (unless
      ``opts.suppress_significance_tests`` is set).
    * ``<field>_Distance_Comparisons.txt`` -- tab-delimited raw distances
      (only if ``opts.save_raw_data`` is set).

    Invalid options and unparsable input files are reported through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory " "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        with open(opts.distance_matrix_fp, "U") as dist_matrix_f:
            dist_matrix_header, dist_matrix = parse_distmat(dist_matrix_f)
    except Exception:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option."
        )
    try:
        with open(opts.mapping_fp, "U") as mapping_f:
            mapping, mapping_header, mapping_comments = parse_mapping_file(mapping_f)
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option."
        )

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == "auto":
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number " "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == "auto":
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number " "or 'auto'.")

    # Parse the field states that will be compared to every other field state.
    # Validate before splitting: calling .split on None would raise
    # AttributeError, so a check placed after the split could never fire.
    if opts.comparison_groups is None:
        option_parser.error("You must provide at least one field state to " "compare (using the -c option).")
    comparison_field_states = [
        field_state.strip().strip('"').strip("'") for field_state in opts.comparison_groups.split(",")
    ]

    # Get distance comparisons between each field state and each of the
    # comparison field states.
    field = opts.field
    comparison_groupings = get_field_state_comparisons(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field, comparison_field_states
    )

    # Grab a list of all field states that had the comparison field states
    # compared against them. These will be plotted along the x-axis.
    field_states = comparison_groupings.keys()

    def custom_comparator(x, y):
        # cmp-style comparison: numeric when both values parse as numbers,
        # otherwise on the original values.  Only the sign is returned;
        # truncating the numeric difference with int() would make values
        # less than 1 apart (e.g. 0.2 and 0.5) compare as equal.
        try:
            x, y = float(x), float(y)
        except (TypeError, ValueError):
            pass
        return (x > y) - (x < y)

    # Sort the field states as numbers if the elements are numbers, else sort
    # them lexically.
    field_states.sort(custom_comparator)

    # If the label type is numeric, get a list of all field states in sorted
    # numeric order. These will be used to determine the spacing of the
    # field state 'points' along the x-axis.
    x_spacing = None
    if opts.label_type == "numeric":
        try:
            x_spacing = sorted(map(float, field_states))
        except (TypeError, ValueError):
            option_parser.error(
                "The 'numeric' label type is invalid because "
                "not all field states could be converted into "
                "numbers. Please specify a different label "
                "type."
            )

    # Accumulate the data for each field state 'point' along the x-axis.
    plot_data = [
        [comparison_groupings[field_state][comp_field_state] for comp_field_state in comparison_field_states]
        for field_state in field_states
    ]
    plot_x_axis_labels = list(field_states)

    # Plot the data and labels.
    plot_title = "Distance Comparisons"
    plot_x_label = field
    plot_y_label = "Distance"

    # If we are creating a bar chart or box plot, grab a list of good data
    # colors to use.
    plot_type = opts.plot_type
    plot_colors = None
    if plot_type == "bar" or plot_type == "box":
        plot_colors = [matplotlib_rgb_color(data_colors[color].toRGB()) for color in data_color_order]

    assert plot_data, "Error: there is no data to plot!"

    width = opts.width
    height = opts.height
    if width <= 0 or height <= 0:
        option_parser.error("The specified width and height of the image must " "be greater than zero.")

    plot_figure = grouped_distributions(
        opts.plot_type,
        plot_data,
        x_values=x_spacing,
        data_point_labels=plot_x_axis_labels,
        distribution_labels=comparison_field_states,
        distribution_markers=plot_colors,
        x_label=plot_x_label,
        y_label=plot_y_label,
        title=plot_title,
        x_tick_labels_orientation=opts.x_tick_labels_orientation,
        y_min=y_min,
        y_max=y_max,
        whisker_length=opts.whisker_length,
        error_bar_type=opts.error_bar_type,
        distribution_width=opts.distribution_width,
        figure_width=width,
        figure_height=height,
    )

    # Save the plot in the specified format.
    output_plot_fp = join(opts.output_dir, "%s_Distance_Comparisons.%s" % (field, opts.imagetype))
    plot_figure.savefig(output_plot_fp, format=opts.imagetype, transparent=opts.transparent)

    if not opts.suppress_significance_tests:
        with open(join(opts.output_dir, "%s_Stats.txt" % field), "w") as sig_tests_f:
            # Rearrange the plot data into a format suitable for
            # all_pairs_t_test.
            sig_tests_labels = []
            sig_tests_data = []
            for data_point, data_point_label in zip(plot_data, plot_x_axis_labels):
                for dist, comp_field in zip(data_point, comparison_field_states):
                    sig_tests_labels.append("%s vs %s" % (data_point_label, comp_field))
                    sig_tests_data.append(dist)

            sig_tests_results = all_pairs_t_test(
                sig_tests_labels, sig_tests_data, tail_type=opts.tail_type, num_permutations=opts.num_permutations
            )
            sig_tests_f.write(sig_tests_results)

    if opts.save_raw_data:
        # Write the raw plot data into a tab-delimited file, where each line
        # has the distances between a comparison group and another field state
        # 'point' along the x-axis.
        assert len(plot_x_axis_labels) == len(plot_data), (
            "The number of " + "labels do not match the number of points along the x-axis."
        )

        raw_data_fp = join(opts.output_dir, "%s_Distance_Comparisons.txt" % field)
        with open(raw_data_fp, "w") as raw_data_f:
            raw_data_f.write("#ComparisonGroup\tFieldState\tDistances\n")
            for label, data in zip(plot_x_axis_labels, plot_data):
                assert len(comparison_field_states) == len(data), (
                    "The "
                    + "number of specified comparison groups does not match "
                    + "the number of groups found at the current point along "
                    + "the x-axis."
                )
                for comp_field_state, comp_grp_data in zip(comparison_field_states, data):
                    raw_data_f.write(
                        comp_field_state + "\t" + label + "\t" + "\t".join(map(str, comp_grp_data)) + "\n"
                    )
def _get_job_commands(self,
                      input_fp,
                      output_dir,
                      params,
                      job_prefix,
                      working_dir,
                      command_prefix='/bin/bash; ',
                      command_suffix='; exit'):
    """Generate beta diversity to split single OTU table to multiple jobs

    full_tree=True is faster: beta_diversity.py -f will make things
    go faster, but be sure you already have the correct minimal tree.

    The sample ids of the BIOM table at input_fp are divided into groups
    (one group per job, sized via params['jobs_to_start']).  For each group
    i a shell script is written to <working_dir>/<i>/<job_prefix><i>.sh
    which runs self._script_name on that group's samples and then applies
    the rename command built by self._get_rename_command.

    input_fp: path to the input BIOM table
    output_dir: directory under which per-job output dirs are created
    params: dict; the keys read here are 'full_tree', 'tree_path',
     'metrics' (comma-separated string) and 'jobs_to_start'
    job_prefix: prefix for the generated shell script file names
    working_dir: directory under which per-job working dirs are created
    command_prefix, command_suffix: passed to self._merge_to_n_commands
     when the final job commands are assembled

    Returns (commands, result_filepaths): the merged job commands, and a
    list holding each per-job output directory followed by the result
    paths reported by self._get_rename_command for that job.
    """
    commands = []
    result_filepaths = []

    # Close the table file as soon as the sample ids have been read.
    with open(input_fp, 'U') as input_f:
        sids = parse_biom_table(input_f).SampleIds

    if params['full_tree']:
        full_tree_str = '-f'
    else:
        full_tree_str = ''

    if params['tree_path']:
        tree_str = '-t %s' % params['tree_path']
    else:
        tree_str = ''

    metrics = params['metrics']
    metric_names = metrics.split(',')

    # The expected output file names depend only on the input file name and
    # the metrics, so work the base name out once rather than per job.
    input_basename = splitext(split(input_fp)[1])[0]

    # this is a little bit of an abuse of _merge_to_n_commands, so may
    # be worth generalizing that method - this determines the correct
    # number of samples to process in each command
    sample_id_groups = self._merge_to_n_commands(sids,
                                                 params['jobs_to_start'],
                                                 delimiter=',',
                                                 command_prefix='',
                                                 command_suffix='')

    for i, sample_id_group in enumerate(sample_id_groups):
        working_dir_i = join(working_dir, str(i))
        create_dir(working_dir_i)
        output_dir_i = join(output_dir, str(i))
        create_dir(output_dir_i)
        result_filepaths.append(output_dir_i)

        # Fresh list per job so the helper receives its own copy.
        output_fns = ['%s_%s.txt' % (metric, input_basename)
                      for metric in metric_names]
        rename_command, current_result_filepaths = self._get_rename_command(
            output_fns, working_dir_i, output_dir_i)
        result_filepaths += current_result_filepaths

        bdiv_command = '%s -i %s -o %s %s -m %s %s -r %s' % (
            self._script_name,
            input_fp,
            working_dir_i,
            tree_str,
            metrics,
            full_tree_str,
            sample_id_group)

        shell_script_fp = '%s/%s%d.sh' % (working_dir_i, job_prefix, i)
        shell_script_commands = [bdiv_command] + rename_command.split(';')
        self._commands_to_shell_script(shell_script_commands,
                                       shell_script_fp)
        commands.append('bash %s' % shell_script_fp)

    commands = self._merge_to_n_commands(commands,
                                         params['jobs_to_start'],
                                         command_prefix=command_prefix,
                                         command_suffix=command_suffix)

    return commands, result_filepaths