def main():
    """Create the ORF profiles for a single riboseq sample.

    Command-line entry point (it logs itself as ``create-orf-profiles``).
    The positional arguments are the raw data file, the YAML config file and
    the dataset name. The function builds and dispatches, in order:

    1. ``create-base-genome-profile`` (always invoked; ``--do-not-call`` is
       forwarded so that the child decides what to run),
    2. ``extract-metagene-profiles``,
    3. ``estimate-metagene-profile-bayes-factors``,
    4. ``select-periodic-offsets``,
    5. ``extract-orf-profiles``.

    Every step goes through ``shell_utils.call_if_not_exists`` with the
    expected input and output files and the ``--overwrite`` setting.

    Returns:
        None. The function returns early, after logging a critical message,
        when no periodic read lengths and offsets are found.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script runs all of the processing necessary to produce the "
        "signals used for later processing. In particular, it runs the standard "
        "rnaseq and riboseq preprocessing, estimates the abundance of transcripts with "
        "the rnaseq data, and selects the most-expressed isoforms and ORFs. Next, it removes "
        "multimapping and non-periodic-length riboseq reads. Finally, it extracts the riboseq "
        "signal for the most-expressed transcripts.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    # the config is parsed with yaml below, so advertise it as such
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus',
                        help="The number of processors to use",
                        type=int, default=default_num_cpus)
    parser.add_argument('--mem', help="The amount of RAM to request",
                        default=default_mem)
    parser.add_argument(
        '--flexbar-format-option', help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.", default=default_flexbar_format_option)
    parser.add_argument('--tmp', help="The location for temp files",
                        default=default_tmp)
    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')
    parser.add_argument(
        '-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects, and recent PyYAML releases require the Loader
    # argument. Consider yaml.safe_load once the supported PyYAML version is
    # confirmed.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'gtf', 'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    models_base = config.get('models_base', default_models_base)

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_argument = "" if call else "--do-not-call"
    overwrite_argument = "--overwrite" if args.overwrite else ""

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    # the ORF and exon annotations are shared by several steps below
    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    # check if we want to keep multimappers
    is_unique = 'keep_riboseq_multimappers' not in config

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = (
        "create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
        .format(riboseq_raw_data, args.config, args.name, args.num_cpus,
                do_not_call_argument, overwrite_argument, logging_str,
                star_str, tmp_str, flexbar_format_option_str,
                keep_intermediate_str, mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available,
    # we still want to call the base pipeline, hence no input files here.
    in_files = []
    out_files = [riboseq_bam_filename]

    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # create the metagene profiles
    metagene_profiles = filenames.get_metagene_profiles(
        config['riboseq_data'], args.name, is_unique=is_unique, note=note)

    seqids_to_keep_str = utils.get_config_argument(config, 'seqids_to_keep')
    start_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_upstream', 'start-upstream')
    start_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_downstream', 'start-downstream')
    end_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_upstream', 'end-upstream')
    end_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_downstream', 'end-downstream')

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False, is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {} {}"
           .format(riboseq_bam_filename, transcript_bed, metagene_profiles,
                   args.num_cpus, logging_str, seqids_to_keep_str,
                   start_upstream_str, start_downstream_str,
                   end_upstream_str, end_downstream_str))

    # NOTE(review): the command reads transcript_bed, but the input check
    # lists orfs_genomic -- confirm that this is intended.
    in_files = [riboseq_bam_filename, orfs_genomic]
    out_files = [metagene_profiles]
    file_checkers = {metagene_profiles: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'], args.name, is_unique=is_unique, note=note)

    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')

    periodic_models_str = "--periodic-models {}".format(
        ' '.join(periodic_models))
    non_periodic_models_str = "--nonperiodic-models {}".format(
        ' '.join(non_periodic_models))

    periodic_offset_start_str = utils.get_config_argument(
        config, 'periodic_offset_start')
    periodic_offset_end_str = utils.get_config_argument(
        config, 'periodic_offset_end')
    metagene_profile_length_str = utils.get_config_argument(
        config, 'metagene_profile_length')
    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains')
    iterations_str = utils.get_config_argument(
        config, 'metagene_profile_iterations', 'iterations')

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(
               metagene_profiles, metagene_profile_bayes_factors,
               args.num_cpus, periodic_models_str, non_periodic_models_str,
               periodic_offset_start_str, periodic_offset_end_str,
               metagene_profile_length_str, seed_str, chains_str,
               iterations_str, logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {metagene_profile_bayes_factors: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(
        config['riboseq_data'], args.name, is_unique=is_unique, note=note)

    cmd = "select-periodic-offsets {} {}".format(
        metagene_profile_bayes_factors, periodic_offsets)
    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {periodic_offsets: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the
    # config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
        config, args.name, args.do_not_call, is_unique=is_unique)

    if len(lengths) == 0:
        msg = (
            "No periodic read lengths and offsets were found. Try relaxing "
            "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
            "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')

    # extract the riboseq profiles for each orf; the alignments are the same
    # bam file that the base genome profile step produced
    unique_filename = riboseq_bam_filename
    profiles_filename = filenames.get_riboseq_profiles(
        config['riboseq_data'], args.name, length=lengths, offset=offsets,
        is_unique=is_unique, note=note)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = (
        "extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} "
        .format(unique_filename, orfs_genomic, exons_file, profiles_filename,
                lengths_str, offsets_str, logging_str, seqname_prefix_str,
                args.num_cpus))
    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    # todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
def main():
    """Create the base genome profile (aligned riboseq reads) for a sample.

    Command-line entry point (it logs itself as
    ``create-base-genome-profile``). The steps, each dispatched through
    ``shell_utils.call_if_not_exists``, are:

    0. ``flexbar`` to remove adapter sequences,
    1. ``bowtie2`` against the ribosomal index to separate rRNA reads,
    2. STAR to align the remaining reads to the genome,
    3. a symlink from the STAR output to the pipeline's bam name, followed
       by ``samtools index``,
    4. ``remove-multimapping-reads``, unless the config contains the
       ``keep_riboseq_multimappers`` key.

    Intermediate files are passed as ``to_delete`` unless
    ``--keep-intermediate-files`` or ``--do-not-call`` was given.

    Returns:
        None. Returns right after indexing when multimappers are kept.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script runs the standard riboseq preprocessing. It removes "
        "adapters with flexbar, filters rRNA reads with bowtie2, aligns the "
        "remaining reads to the genome with STAR and, unless multimappers "
        "are kept, removes multimapping reads.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus',
                        help="The number of processors to use",
                        type=int, default=default_num_cpus)
    parser.add_argument('--mem', help="The amount of RAM to request",
                        default=default_mem)
    parser.add_argument(
        '--flexbar-format-option', help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.", default=default_flexbar_format_option)
    parser.add_argument('-t', '--tmp',
                        help="The location for temporary files. If not "
                        "specified, program-specific temp locations are used.",
                        default=default_tmp)
    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')
    parser.add_argument(
        '-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-base-genome-profile]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects, and recent PyYAML releases require the Loader
    # argument. Consider yaml.safe_load once the supported PyYAML version is
    # confirmed.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    call = not args.do_not_call
    # nothing is deleted on a dry run or when the user asked to keep files
    keep_delete_files = args.keep_intermediate_files or args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'remove-multimapping-reads'
    ]
    shell_utils.check_programs_exist(programs)

    # 'star_index' is read when the STAR command is built below, so validate
    # it up front together with the other keys
    required_keys = [
        'riboseq_data', 'ribosomal_index', 'star_index', 'gtf',
        'genome_base_path', 'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # Step 0: Running flexbar to remove adapter sequences
    raw_data = args.raw_data
    flexbar_target = filenames.get_without_adapters_base(
        config['riboseq_data'], args.name, note=note)
    without_adapters = filenames.get_without_adapters_fastq(
        config['riboseq_data'], args.name, note=note)

    adapter_seq_str = utils.get_config_argument(config, 'adapter_sequence',
                                                'adapter-seq')
    adapter_file_str = utils.get_config_argument(config, 'adapter_file',
                                                 'adapters')
    # the flexbar option name for the quality format depends on its version
    quality_format_str = utils.get_config_argument(
        config, 'quality_format', args.flexbar_format_option,
        default=default_quality_format)
    max_uncalled_str = utils.get_config_argument(
        config, 'max_uncalled', default=default_max_uncalled)
    pre_trim_left_str = utils.get_config_argument(
        config, 'pre_trim_left', default=default_pre_trim_left)

    cmd = "flexbar {} {} {} {} -n {} {} -r {} -t {} {}".format(
        quality_format_str, max_uncalled_str, adapter_seq_str,
        adapter_file_str, args.num_cpus, flexbar_compression_str, raw_data,
        flexbar_target, pre_trim_left_str)
    in_files = [raw_data]
    out_files = [without_adapters]
    file_checkers = {without_adapters: fastx_utils.check_fastq_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # Step 1: Running bowtie2 to remove rRNA alignments
    out = utils.abspath("dev", "null")  # we do not care about the alignments
    without_rrna = filenames.get_without_rrna_fastq(config['riboseq_data'],
                                                    args.name, note=note)
    with_rrna = filenames.get_with_rrna_fastq(config['riboseq_data'],
                                              args.name, note=note)

    cmd = "bowtie2 -p {} --very-fast -x {} -U {} -S {} --un-gz {} --al-gz {}".format(
        args.num_cpus, config['ribosomal_index'], without_adapters, out,
        without_rrna, with_rrna)
    in_files = [without_adapters]
    in_files.extend(bio.get_bowtie2_index_files(config['ribosomal_index']))
    out_files = [without_rrna, with_rrna]
    to_delete = [without_adapters]
    file_checkers = {without_rrna: fastx_utils.check_fastq_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)

    # Step 2: Running STAR to align rRNA-depleted reads to genome
    star_output_prefix = filenames.get_riboseq_bam_base(
        config['riboseq_data'], args.name, note=note)
    genome_star_bam = "{}{}".format(star_output_prefix,
                                    "Aligned.sortedByCoord.out.bam")

    star_compression_str = "--readFilesCommand {}".format(
        shlex.quote(args.star_read_files_command))

    align_intron_min_str = utils.get_config_argument(
        config, 'align_intron_min', 'alignIntronMin',
        default=default_align_intron_min)
    align_intron_max_str = utils.get_config_argument(
        config, 'align_intron_max', 'alignIntronMax',
        default=default_align_intron_max)
    out_filter_mismatch_n_max_str = utils.get_config_argument(
        config, 'out_filter_mismatch_n_max', 'outFilterMismatchNmax',
        default=default_out_filter_mismatch_n_max)
    out_filter_mismatch_n_over_l_max_str = utils.get_config_argument(
        config, 'out_filter_mismatch_n_over_l_max',
        'outFilterMismatchNoverLmax',
        default=default_out_filter_mismatch_n_over_l_max)
    out_filter_type_str = utils.get_config_argument(
        config, 'out_filter_type', 'outFilterType',
        default=default_out_filter_type)
    out_filter_intron_motifs_str = utils.get_config_argument(
        config, 'out_filter_intron_motifs', 'outFilterIntronMotifs',
        default=default_out_filter_intron_motifs)
    out_sam_attributes_str = utils.get_config_argument(
        config, 'out_sam_attributes', 'outSAMattributes',
        default=default_out_sam_attributes)

    star_tmp_str = ""
    if args.tmp is not None:
        star_tmp_name = "STAR_rpbp"
        star_tmp_dir = star_utils.create_star_tmp(args.tmp, star_tmp_name)
        star_tmp_str = "--outTmpDir {}".format(star_tmp_dir)

    mem_bytes = utils.human2bytes(args.mem)
    star_mem_str = "--limitBAMsortRAM {}".format(mem_bytes)

    cmd = (
        "{} --runThreadN {} {} --genomeDir {} --sjdbGTFfile {} --readFilesIn {} "
        "{} {} {} {} {} {} {} {} --outFileNamePrefix {} {} {} {}".format(
            args.star_executable, args.num_cpus, star_compression_str,
            config['star_index'], config['gtf'], without_rrna,
            align_intron_min_str, align_intron_max_str,
            out_filter_mismatch_n_max_str, out_filter_type_str,
            out_filter_intron_motifs_str, quant_mode_str,
            out_filter_mismatch_n_over_l_max_str, out_sam_attributes_str,
            star_output_prefix, star_out_str, star_tmp_str, star_mem_str))
    in_files = [without_rrna]
    in_files.extend(star_utils.get_star_index_files(config['star_index']))
    to_delete = [without_rrna]
    out_files = [genome_star_bam]
    file_checkers = {genome_star_bam: bam_utils.check_bam_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)

    # now, we need to symlink the (genome) STAR output to that expected by
    # the rest of the pipeline
    genome_sorted_bam = filenames.get_riboseq_bam(config['riboseq_data'],
                                                  args.name, note=note)

    if os.path.exists(genome_star_bam):
        utils.create_symlink(genome_star_bam, genome_sorted_bam, call)
    else:
        msg = ("Could not find the STAR genome bam alignment file. Unless "
               "--do-not-call was given, this is a problem.")
        logger.warning(msg)

    # create the samtools index
    cmd = "samtools index -b {}".format(genome_sorted_bam)
    shell_utils.check_call(cmd, call=call)

    # check if we want to keep multimappers
    if 'keep_riboseq_multimappers' in config:
        return

    # remove multimapping reads from the genome file
    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    unique_genome_filename = filenames.get_riboseq_bam(
        config['riboseq_data'], args.name, is_unique=True, note=note)

    cmd = "remove-multimapping-reads {} {} {}".format(
        genome_sorted_bam, unique_genome_filename, tmp_str)
    in_files = [genome_sorted_bam]
    out_files = [unique_genome_filename]
    to_delete = [genome_star_bam, genome_sorted_bam]
    file_checkers = {unique_genome_filename: bam_utils.check_bam_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)
def main():
    """Create the reference files needed by the rest of the rpbp pipeline.

    Command-line entry point. After validating the config and the required
    input files (source tag ``prepare-rpbp-genome``), it either resubmits
    itself through ``slurm.check_sbatch`` (when ``args.use_slurm`` is set)
    or, in order:

    1. builds the rRNA index with ``bowtie2-build-s``,
    2. builds the STAR genome index,
    3. extracts the annotated ORFs via ``get_orfs``,
    4. if the config has a ``de_novo_gtf`` key, extracts the de novo ORFs
       and concatenates the annotated and de novo ORF and exon files;
       otherwise symlinks the annotated files to the final names.

    Returns:
        None.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates all of the files necessary for downstream "
        "analysis performed with the rpbp package.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects, and recent PyYAML releases require the Loader
    # argument. Consider yaml.safe_load once the supported PyYAML version is
    # confirmed.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'extract-orf-coordinates', 'label-orfs', 'bowtie2-build-s',
        'split-bed12-blocks', 'gtf-to-bed12', args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta',
        'ribosomal_fasta', 'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]
    if 'de_novo_gtf' in config:
        files.append(config['de_novo_gtf'])
    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rrna index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])
    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))
    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True,
             is_de_novo=False)

    base_path = config['genome_base_path']
    genome_name = config['genome_name']
    orf_note = config.get('orf_note')

    # eventually, we will use these names
    annotated_orfs = filenames.get_orfs(base_path, genome_name,
                                        note=orf_note, is_annotated=True,
                                        is_de_novo=False)
    annotated_exons_file = filenames.get_exons(base_path, genome_name,
                                               note=orf_note,
                                               is_annotated=True,
                                               is_de_novo=False)
    orfs_genomic = filenames.get_orfs(base_path, genome_name, note=orf_note)
    exons_file = filenames.get_exons(base_path, genome_name, note=orf_note)

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'], args, config, is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(base_path, genome_name,
                                          note=orf_note, is_annotated=False,
                                          is_de_novo=True)
        de_novo_exons_file = filenames.get_exons(base_path, genome_name,
                                                 note=orf_note,
                                                 is_annotated=False,
                                                 is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]
        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files,
                                                     sort_bed=True)
            # renumber the ORFs over the combined, sorted set
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + [
                'orf_num', 'orf_len', 'orf_type'
            ]
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --do-not-call"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]
        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files,
                                                     sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --do-not-call"
            logger.info(msg)

    else:
        # finally, make sure our files are named correctly
        if os.path.exists(annotated_orfs):
            utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            utils.create_symlink(annotated_exons_file, exons_file, call)
def main():
    """Run the profile-creation and prediction steps for a single sample.

    Command-line entry point. After validating the config, it either
    resubmits itself through ``slurm.check_sbatch`` (when ``args.use_slurm``
    is set) or runs, via ``shell_utils.check_call``:

    1. ``create-orf-profiles`` with the forwarded options, and
    2. ``predict-translated-orfs``, unless ``--profiles-only`` was given.

    ``--do-not-call`` is forwarded to the child scripts rather than used to
    skip them, so that they are still invoked but do not run anything.

    Returns:
        None.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script runs the Rp-Bp and Rp-chi pipelines on a given sample. "
        "It requires a YAML config file that includes a number of keys. Please see the "
        "documentation for a complete description.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp', help="The temp directory",
                        default=default_tmp)
    parser.add_argument(
        '--flexbar-format-option', help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.", default=default_flexbar_format_option)
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')
    parser.add_argument('--profiles-only',
                        help="If this flag is present, then only "
                        "the ORF profiles will be created",
                        action='store_true')
    parser.add_argument(
        '-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects, and recent PyYAML releases require the Loader
    # argument. Consider yaml.safe_load once the supported PyYAML version is
    # confirmed.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'star_index', 'genome_base_path',
        'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # now, check if we want to use slurm
    msg = "use_slurm: {}".format(args.use_slurm)
    logger.debug(msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_str = "" if call else "--do-not-call"
    overwrite_str = "--overwrite" if args.overwrite else ""

    # for a sample, we first create its filtered genome profile
    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
           .format(args.raw_data, args.config, args.name, args.num_cpus,
                   mem_str, do_not_call_str, overwrite_str, logging_str,
                   star_str, tmp_str, flexbar_format_option_str,
                   keep_intermediate_str))
    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(
        args.config, args.name, args.num_cpus, do_not_call_str,
        overwrite_str, logging_str))
    shell_utils.check_call(cmd)
def main():
    """Submit ``run-rpbp-pipeline`` for every sample listed in the config.

    Command-line entry point. For each key of ``config['riboseq_samples']``
    (in sorted order) it builds a ``run-rpbp-pipeline`` command and hands it
    to ``slurm.check_sbatch``, collecting the returned job ids. When
    ``--merge-replicates`` is given, it then submits one
    ``predict-translated-orfs --merge-replicates`` job per condition
    returned by ``ribo_utils.get_riboseq_replicates``, passing the collected
    job ids as dependencies.

    ``--do-not-call`` is forwarded to the child scripts and then cleared on
    ``args`` so that the pipeline script itself is still invoked.

    Returns:
        None.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This is a helper script which submits a set of samples to SLURM. It "
        "can also be used to run a set of samples sequentially. Due to limitations on "
        "the config file specification, all of the samples must use the same reference "
        "indices (i.e., genome sequence, set of ORFs, etc.).")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--tmp', help="The temp directory",
                        default=default_tmp)
    parser.add_argument(
        '--flexbar-format-option', help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.", default=default_flexbar_format_option)
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')
    parser.add_argument(
        '--merge-replicates', help="If this flag is present, then "
        "the ORF profiles from the replicates will be merged before making the final "
        "predictions", action='store_true')
    parser.add_argument(
        '--run-replicates', help="If this flag is given with the "
        "--merge-replicates flag, then both the replicates *and* the individual "
        "samples will be run. This flag has no effect if --merge-replicates is not "
        "given.", action='store_true')
    parser.add_argument(
        '-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects, and recent PyYAML releases require the Loader
    # argument. Consider yaml.safe_load once the supported PyYAML version is
    # confirmed.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs',
        'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'riboseq_samples', 'ribosomal_index', 'star_index',
        'genome_base_path', 'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # handle do_not_call so that we _do_ call the pipeline script, but that
    # it does not run anything
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    # the flag has been captured as a string for the children; clear it on
    # args, which is later passed to slurm.check_sbatch
    args.do_not_call = False

    overwrite_str = "--overwrite" if args.overwrite else ""

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles
    profiles_only_str = ""
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    if args.run_replicates and not args.merge_replicates:
        msg = (
            "The --run-replicates option was given without the --merge-replicates "
            "option. It will be ignored.")
        logger.warning(msg)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    # collect the job_ids in case we are using slurm and need to merge
    # replicates
    job_ids = []

    sample_names = sorted(config['riboseq_samples'].keys())
    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        # each sample gets its own temp directory below the requested one
        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp,
                               "{}_{}_rpbp".format(sample_name, note))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data, args.config, sample_name, args.num_cpus, tmp_str,
            do_not_call_str, overwrite_str, logging_str, star_str,
            profiles_only_str, flexbar_format_option_str,
            keep_intermediate_str, mem_str)

        job_id = slurm.check_sbatch(cmd, args=args)
        job_ids.append(job_id)

    # now, if we are running the "standard" pipeline, we are finished
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):
        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config, condition_name, args.num_cpus, do_not_call_str,
            overwrite_str, logging_str, merge_replicates_str)

        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)