def main():
    """Collect the read filtering counts of all riboseq samples into a csv file.

    Parses the command line, asks shell_utils to check for samtools, loads
    the yaml config and calls get_counts (through
    parallel.apply_parallel_iter) for every item of
    config['riboseq_samples']. Results which are None are dropped; the
    remaining ones are turned into a data frame and written to `out`.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script collects counts of riboseq reads filtered at each step in "
        "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
        "crimson python package).")

    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")
    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    # Read the config inside a context manager so the file handle is closed
    # right away instead of whenever the garbage collector gets to it.
    # NOTE(review): yaml.load without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6; consider yaml.safe_load if the
    # config files only contain plain YAML.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    res = parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                       args.num_cpus, get_counts, config, args)

    # get_counts may yield None for a sample; those entries are skipped
    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    pandas_utils.write_df(res_df, args.out, index=False)
def main():
    """Collect the read filtering counts of all riboseq samples into a csv file.

    Parses the command line, asks shell_utils to check for samtools, loads
    the yaml config and calls get_counts (through
    parallel.apply_parallel_iter) for every item of
    config['riboseq_samples']. Results which are None are dropped; the
    remaining ones are turned into a data frame and written to `out`.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script collects counts of riboseq reads filtered at each step in "
        "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
        "crimson python package).")

    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")
    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    # Read the config inside a context manager so the file handle is closed
    # right away instead of whenever the garbage collector gets to it.
    # NOTE(review): yaml.load without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6; consider yaml.safe_load if the
    # config files only contain plain YAML.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    res = parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                       args.num_cpus, get_counts, config, args)

    # get_counts may yield None for a sample; those entries are skipped
    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    utils.write_df(res_df, args.out, index=False)
def _post_install(self):
    """Pickle the Stan models and check for the external prerequisite programs.

    For every (model file, pickle file) pair built from stan_model_files and
    stan_pickle_files, `pickle-stan` is called through the shell unless the
    pickle already exists. A failing `pickle-stan` call is logged as a
    warning and does not abort the remaining work. Afterwards shell_utils
    is asked to check for flexbar, STAR, bowtie2 and samtools with
    raise_on_error=False.
    """
    # NOTE(review): presumably reloading `site` makes packages installed
    # during this setup run importable below -- confirm.
    import site
    importlib.reload(site)

    import shlex

    import riboutils.ribo_filenames as filenames
    import misc.shell_utils as shell_utils

    model_sources = [os.path.join("rpbp_models", s) for s in stan_model_files]

    models_base = filenames.get_default_models_base()
    model_pickles = [os.path.join(models_base, s) for s in stan_pickle_files]

    # compile and pickle the stan models
    for stan, pickle_path in zip(model_sources, model_pickles):
        if os.path.exists(pickle_path):
            msg = "A model already exists at: {}. Skipping.".format(pickle_path)
            logging.warning(msg)
            continue

        # make sure the path exists; exist_ok avoids the check-then-create race
        dirname = os.path.dirname(pickle_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)

        cmd = "pickle-stan {} {}".format(shlex.quote(stan),
                                         shlex.quote(pickle_path))
        logging.info(cmd)

        # stay best-effort, but do not hide a failed compilation
        return_code = subprocess.call(cmd, shell=True)
        if return_code != 0:
            msg = "pickle-stan exited with status {} for model: {}".format(
                return_code, stan)
            logging.warning(msg)

    # check for the prerequisite programs: (package name, executables)
    prerequisites = [
        ('flexbar', ['flexbar']),
        ('STAR', ['STAR']),
        ('bowtie2', ['bowtie2', 'bowtie2-build-s']),
        ('SAMtools', ['samtools']),
    ]

    for package_name, programs in prerequisites:
        shell_utils.check_programs_exist(programs, raise_on_error=False,
                                         package_name=package_name,
                                         logger=logger)
def _post_install(self):
    """Pickle the Stan models and check for the external prerequisite programs.

    For every (model file, pickle file) pair built from stan_model_files and
    stan_pickle_files, `pickle-stan` is called through the shell unless the
    pickle already exists. A failing `pickle-stan` call is logged as a
    warning and does not abort the remaining work. Afterwards shell_utils
    is asked to check for flexbar, STAR, bowtie2 and samtools with
    raise_on_error=False.
    """
    # NOTE(review): presumably reloading `site` makes packages installed
    # during this setup run importable below -- confirm.
    import site
    importlib.reload(site)

    import shlex

    import riboutils.ribo_filenames as filenames
    import misc.shell_utils as shell_utils

    model_sources = [os.path.join("rpbp_models", s) for s in stan_model_files]

    models_base = filenames.get_default_models_base()
    model_pickles = [os.path.join(models_base, s) for s in stan_pickle_files]

    # compile and pickle the stan models
    for stan, pickle_path in zip(model_sources, model_pickles):
        if os.path.exists(pickle_path):
            msg = "A model already exists at: {}. Skipping.".format(pickle_path)
            logging.warning(msg)
            continue

        # make sure the path exists; exist_ok avoids the check-then-create race
        dirname = os.path.dirname(pickle_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)

        cmd = "pickle-stan {} {}".format(shlex.quote(stan),
                                         shlex.quote(pickle_path))
        logging.info(cmd)

        # stay best-effort, but do not hide a failed compilation
        return_code = subprocess.call(cmd, shell=True)
        if return_code != 0:
            msg = "pickle-stan exited with status {} for model: {}".format(
                return_code, stan)
            logging.warning(msg)

    # check for the prerequisite programs: (package name, executables)
    prerequisites = [
        ('flexbar', ['flexbar']),
        ('STAR', ['STAR']),
        ('bowtie2', ['bowtie2', 'bowtie2-build-s']),
        ('SAMtools', ['samtools']),
    ]

    for package_name, programs in prerequisites:
        shell_utils.check_programs_exist(programs, raise_on_error=False,
                                         package_name=package_name,
                                         logger=logger)
def main():
    """Preprocess one raw riboseq fastq file into genome alignments.

    The steps, each run through shell_utils.call_if_not_exists, are:

    0. flexbar, to remove adapter sequences
    1. bowtie2 against config['ribosomal_index'], to split the reads into
       those which do and do not align to rRNA
    2. STAR, to align the rRNA-depleted reads to the genome

    The sorted STAR bam file is then symlinked to the name the rest of the
    pipeline expects and indexed with samtools. Unless the config contains
    the key 'keep_riboseq_multimappers' (only the presence of the key is
    checked, not its value), multimapping reads are finally removed with
    remove-multimapping-reads.

    With --do-not-call, `call=False` is passed to the helpers and no
    intermediate files are deleted.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)
    parser.add_argument('--mem', help="The amount of RAM to request",
                        default=default_mem)
    parser.add_argument(
        '--flexbar-format-option', help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.", default=default_flexbar_format_option)
    parser.add_argument('-t', '--tmp', help="The location for temporary files. If not "
                        "specified, program-specific temp locations are used.",
                        default=default_tmp)
    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')
    parser.add_argument(
        '-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-base-genome-profile]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # Read the config inside a context manager so the file handle is closed.
    # NOTE(review): yaml.load without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6; consider yaml.safe_load if the
    # config files only contain plain YAML.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    call = not args.do_not_call
    keep_delete_files = args.keep_intermediate_files or args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'remove-multimapping-reads'
    ]
    shell_utils.check_programs_exist(programs)

    # 'star_index' is read below when the STAR command is built; list it
    # here so that a missing key is reported before any step has been run.
    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'star_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # Step 0: Running flexbar to remove adapter sequences
    raw_data = args.raw_data
    flexbar_target = filenames.get_without_adapters_base(
        config['riboseq_data'], args.name, note=note)
    without_adapters = filenames.get_without_adapters_fastq(
        config['riboseq_data'], args.name, note=note)

    adapter_seq_str = utils.get_config_argument(
        config, 'adapter_sequence', 'adapter-seq')
    adapter_file_str = utils.get_config_argument(
        config, 'adapter_file', 'adapters')
    quality_format_str = utils.get_config_argument(
        config, 'quality_format', args.flexbar_format_option,
        default=default_quality_format)
    max_uncalled_str = utils.get_config_argument(
        config, 'max_uncalled', default=default_max_uncalled)
    pre_trim_left_str = utils.get_config_argument(
        config, 'pre_trim_left', default=default_pre_trim_left)

    cmd = "flexbar {} {} {} {} -n {} {} -r {} -t {} {}".format(
        quality_format_str, max_uncalled_str, adapter_seq_str,
        adapter_file_str, args.num_cpus, flexbar_compression_str, raw_data,
        flexbar_target, pre_trim_left_str)
    in_files = [raw_data]
    out_files = [without_adapters]
    file_checkers = {without_adapters: fastx_utils.check_fastq_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # Step 1: Running bowtie2 to remove rRNA alignments
    out = utils.abspath("dev", "null")  # we do not care about the alignments
    without_rrna = filenames.get_without_rrna_fastq(
        config['riboseq_data'], args.name, note=note)
    with_rrna = filenames.get_with_rrna_fastq(
        config['riboseq_data'], args.name, note=note)

    cmd = "bowtie2 -p {} --very-fast -x {} -U {} -S {} --un-gz {} --al-gz {}".format(
        args.num_cpus, config['ribosomal_index'], without_adapters, out,
        without_rrna, with_rrna)
    in_files = [without_adapters]
    in_files.extend(bio.get_bowtie2_index_files(config['ribosomal_index']))
    out_files = [without_rrna, with_rrna]
    to_delete = [without_adapters]
    file_checkers = {without_rrna: fastx_utils.check_fastq_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)

    # Step 2: Running STAR to align rRNA-depleted reads to genome
    star_output_prefix = filenames.get_riboseq_bam_base(
        config['riboseq_data'], args.name, note=note)
    genome_star_bam = "{}{}".format(star_output_prefix,
                                    "Aligned.sortedByCoord.out.bam")

    star_compression_str = "--readFilesCommand {}".format(
        shlex.quote(args.star_read_files_command))

    align_intron_min_str = utils.get_config_argument(
        config, 'align_intron_min', 'alignIntronMin',
        default=default_align_intron_min)
    align_intron_max_str = utils.get_config_argument(
        config, 'align_intron_max', 'alignIntronMax',
        default=default_align_intron_max)
    out_filter_mismatch_n_max_str = utils.get_config_argument(
        config, 'out_filter_mismatch_n_max', 'outFilterMismatchNmax',
        default=default_out_filter_mismatch_n_max)
    out_filter_mismatch_n_over_l_max_str = utils.get_config_argument(
        config, 'out_filter_mismatch_n_over_l_max',
        'outFilterMismatchNoverLmax',
        default=default_out_filter_mismatch_n_over_l_max)
    out_filter_type_str = utils.get_config_argument(
        config, 'out_filter_type', 'outFilterType',
        default=default_out_filter_type)
    out_filter_intron_motifs_str = utils.get_config_argument(
        config, 'out_filter_intron_motifs', 'outFilterIntronMotifs',
        default=default_out_filter_intron_motifs)
    out_sam_attributes_str = utils.get_config_argument(
        config, 'out_sam_attributes', 'outSAMattributes',
        default=default_out_sam_attributes)

    # only point STAR at a temp directory if --tmp was given
    star_tmp_str = ""
    if args.tmp is not None:
        star_tmp_name = "STAR_rpbp"
        star_tmp_dir = star_utils.create_star_tmp(args.tmp, star_tmp_name)
        star_tmp_str = "--outTmpDir {}".format(star_tmp_dir)

    mem_bytes = utils.human2bytes(args.mem)
    star_mem_str = "--limitBAMsortRAM {}".format(mem_bytes)

    cmd = (
        "{} --runThreadN {} {} --genomeDir {} --sjdbGTFfile {} --readFilesIn {} "
        "{} {} {} {} {} {} {} {} --outFileNamePrefix {} {} {} {}".format(
            args.star_executable, args.num_cpus, star_compression_str,
            config['star_index'], config['gtf'], without_rrna,
            align_intron_min_str, align_intron_max_str,
            out_filter_mismatch_n_max_str, out_filter_type_str,
            out_filter_intron_motifs_str, quant_mode_str,
            out_filter_mismatch_n_over_l_max_str, out_sam_attributes_str,
            star_output_prefix, star_out_str, star_tmp_str, star_mem_str))
    in_files = [without_rrna]
    in_files.extend(star_utils.get_star_index_files(config['star_index']))
    to_delete = [without_rrna]
    out_files = [genome_star_bam]
    file_checkers = {genome_star_bam: bam_utils.check_bam_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)

    # now, we need to symlink the (genome) STAR output to that expected by
    # the rest of the pipeline
    genome_sorted_bam = filenames.get_riboseq_bam(
        config['riboseq_data'], args.name, note=note)

    if os.path.exists(genome_star_bam):
        utils.create_symlink(genome_star_bam, genome_sorted_bam, call)
    else:
        msg = ("Could not find the STAR genome bam alignment file. Unless "
               "--do-not-call was given, this is a problem.")
        logger.warning(msg)

    # create the samtools index
    cmd = "samtools index -b {}".format(genome_sorted_bam)
    shell_utils.check_call(cmd, call=call)

    # check if we want to keep multimappers (presence of the key is enough)
    if 'keep_riboseq_multimappers' in config:
        return

    # remove multimapping reads from the genome file
    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    unique_genome_filename = filenames.get_riboseq_bam(
        config['riboseq_data'], args.name, is_unique=True, note=note)

    cmd = "remove-multimapping-reads {} {} {}".format(
        genome_sorted_bam, unique_genome_filename, tmp_str)
    in_files = [genome_sorted_bam]
    out_files = [unique_genome_filename]
    to_delete = [genome_star_bam, genome_sorted_bam]
    file_checkers = {unique_genome_filename: bam_utils.check_bam_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files,
                                   to_delete=to_delete)
def _get_sample_lengths_and_offsets(config, sample_name, is_unique):
    """Return the periodic (lengths, offsets) of a sample, or None if the file is missing."""
    try:
        return ribo_utils.get_periodic_lengths_and_offsets(
            config, sample_name, is_unique=is_unique)
    except FileNotFoundError:
        msg = ("Could not parse out lengths and offsets for sample: {}. "
               "Skipping".format(sample_name))
        logger.error(msg)
        return None


def _write_image_figures(out, image_files, caption, count, per_figure,
                         height, report_missing):
    """Write the existing images into captioned figures and return the updated count.

    `count` is the running number of images written so far in the current
    section; a figure is closed (and the page cleared) whenever it reaches a
    multiple of `per_figure`. A figure which is still open after the last
    image is closed with the caption, but the page is not cleared.

    A figure is only ever closed if it was opened here, so a dataset without
    any existing image writes nothing at all.

    If `report_missing` is true, each lookup is logged at debug level and a
    missing image is logged as a warning; otherwise a missing image is only
    logged at debug level.
    """
    figure_open = False

    for image_file in image_files:
        if report_missing:
            msg = "Looking for image file: {}".format(image_file)
            logger.debug(msg)

        if not os.path.exists(image_file):
            msg = "Could not find image: {}".format(image_file)
            if report_missing:
                logger.warning(msg)
            else:
                logger.debug(msg)
            continue

        if not figure_open:
            latex.begin_figure(out)
            figure_open = True

        count += 1
        latex.write_graphics(out, image_file, height=height)

        if count % per_figure == 0:
            latex.write_caption(out, caption)
            latex.end_figure(out)
            latex.clearpage(out)
            figure_open = False

    if figure_open:
        latex.write_caption(out, caption)
        latex.end_figure(out)

    return count


def _write_dataset_figures(out, datasets, get_images, caption_format,
                           per_figure, height, report_missing):
    """Write the figures of every (name, lengths, offsets) dataset, then clear a partly filled page."""
    count = 0
    for name, lengths, offsets in datasets:
        caption = caption_format.format(name)
        images = get_images(name, lengths, offsets)
        count = _write_image_figures(out, images, caption, count, per_figure,
                                     height, report_missing)

    if count % per_figure != 0:
        latex.clearpage(out)


def main():
    """Create the plots and the latex report for the Rp-Bp ORF predictions.

    After parsing the command line and loading the yaml config, the figures
    are created with create_all_figures. The report then collects, for each
    sample and each biological replicate, those image files which exist:

    * the ORF type bar charts (six images per figure)
    * the ORF type length distribution line graphs (four per figure)
    * with --show-orf-periodicity, the ORF type metagene profiles (four
      per figure)

    Finally, the report is compiled with latex.compile. With --use-slurm,
    the command is handed to slurm.check_sbatch instead and nothing else
    happens.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates the plots which detail the basic characteristics "
        "of the ORF predictions from the Rp-Bp pipeline. It also creates and compiles (if "
        "possible) a latex report for them.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('out', help="The base output directory for the latex report")

    parser.add_argument(
        '--show-unfiltered-orfs', help="If this flag is "
        "present, bar charts showing the distribution of the types of the "
        "unfiltered ORF set will be included", action='store_true')
    parser.add_argument(
        '--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')
    parser.add_argument('--uniprot', help="The uniprot ORF lengths, if available",
                        default=default_uniprot)
    parser.add_argument('--uniprot-label', help="The label to use for the uniprot ORFs in "
                        "the plot", default=default_uniprot_label)
    parser.add_argument('--image-type', help="The format of the image files. This must be "
                        "a format usable by matplotlib.", default=default_image_type)
    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
                        "be overwritten.", action='store_true')
    parser.add_argument(
        '--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)
    parser.add_argument(
        '--show-chisq', help="If this flag is given, then the "
        "results from Rp-chi will be included in the document; otherwise, they "
        "will not be created or shown.", action='store_true')
    parser.add_argument('-t', '--tmp', help="A location for temporary files",
                        default=default_tmp)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # Read the config inside a context manager so the file handle is closed.
    # NOTE(review): yaml.load without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6; consider yaml.safe_load if the
    # config files only contain plain YAML.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    # keep multimappers? (only the presence of the key is checked)
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-orf-length-distribution-line-graph',
        'create-orf-types-bar-chart',
        'visualize-orf-type-metagene-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = ['riboseq_data', 'riboseq_samples']
    utils.check_keys_exist(config, required_keys)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # by default, we will not include chisq
    chisq_values = [True, False] if args.show_chisq else [False]
    filtered_values = [True, False] if args.show_unfiltered_orfs else [True]
    grouped_values = [True, False]
    strands = ["+", "-"]

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create all of the figures
    create_all_figures(config, args)

    # a non-empty --note replaces the note of the config file
    out_note_str = config.get('note', None)
    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    project_name = config.get("project_name", default_project_name)
    title = "Rp-Bp prediction analysis for {}".format(project_name)
    abstract = "This document shows the results of the Rp-Bp pipeline analysis."

    tex_file = filenames.get_rpbp_prediction_report(args.out, out_note_str)

    def smoothing_args(is_chisq):
        # the chisq results do not use the smoothing parameters
        if is_chisq:
            return None, None
        return fraction, reweighting_iterations

    def bar_charts(name, lengths, offsets):
        it = itertools.product(grouped_values, chisq_values, filtered_values)
        for is_grouped, is_chisq, is_filtered in it:
            f, rw = smoothing_args(is_chisq)
            yield filenames.get_orf_types_bar_chart(
                config['riboseq_data'], name, length=lengths, offset=offsets,
                is_unique=is_unique, note=out_note_str,
                image_type=args.image_type, fraction=f,
                reweighting_iterations=rw, is_grouped=is_grouped,
                is_chisq=is_chisq, is_filtered=is_filtered)

    def line_graphs(name, lengths, offsets):
        it = itertools.product(grouped_values, chisq_values)
        for is_grouped, is_chisq in it:
            f, rw = smoothing_args(is_chisq)
            yield filenames.get_orf_length_distribution_line_graph(
                config['riboseq_data'], name, length=lengths, offset=offsets,
                is_unique=is_unique, note=out_note_str,
                image_type=args.image_type, fraction=f,
                reweighting_iterations=rw, is_grouped=is_grouped,
                is_chisq=is_chisq)

    def metagene_profiles(name, lengths, offsets):
        for is_chisq in chisq_values:
            f, rw = smoothing_args(is_chisq)
            orf_type_profile_base = filenames.get_orf_type_profile_base(
                config['riboseq_data'], name, length=lengths, offset=offsets,
                is_unique=is_unique, note=out_note_str, fraction=f,
                reweighting_iterations=rw, is_chisq=is_chisq)

            it = itertools.product(ribo_utils.orf_types, strands)
            for orf_type, strand in it:
                yield filenames.get_orf_type_profile_image(
                    orf_type_profile_base, orf_type, strand, args.image_type)

    with open(tex_file, 'w') as out:
        latex.begin_document(out, title, abstract)
        latex.write(out, "\n")
        latex.clearpage(out)

        ### ORF type distributions
        title = "Predicted ORF type distributions"
        latex.section(out, title)

        # first, handle all of the regular datasets
        sample_names = sorted(config['riboseq_samples'].keys())

        # and check if we also have replicates
        replicate_names = []
        if 'riboseq_biological_replicates' in config:
            replicate_names = sorted(
                ribo_utils.get_riboseq_replicates(config).keys())

        def sample_datasets():
            # samples without periodic lengths and offsets are skipped
            for sample_name in sample_names:
                lengths_and_offsets = _get_sample_lengths_and_offsets(
                    config, sample_name, is_unique)
                if lengths_and_offsets is None:
                    continue
                lengths, offsets = lengths_and_offsets
                yield sample_name, lengths, offsets

        def replicate_datasets():
            # the replicates do not use read lengths and offsets
            for replicate_name in replicate_names:
                yield replicate_name, None, None

        # Six bar charts per figure. The figures used to be opened on every
        # fourth image but closed on every sixth, which unbalanced them.
        _write_dataset_figures(out, sample_datasets(), bar_charts,
                               "ORF types: {}", 6, 0.15, True)

        # now, if the config file specifies replicates, create figures for those
        _write_dataset_figures(out, replicate_datasets(), bar_charts,
                               "ORF types: {}", 6, 0.15, True)

        ### ORF type length distributions
        title = "Predicted ORF type length distributions"
        latex.section(out, title)

        _write_dataset_figures(out, sample_datasets(), line_graphs,
                               "ORF type length distributions: {}",
                               4, 0.15, False)

        _write_dataset_figures(out, replicate_datasets(), line_graphs,
                               "ORF type length distributions: {}",
                               4, 0.15, False)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "Predicted ORF type metagene profiles"
            latex.section(out, title)

            _write_dataset_figures(out, sample_datasets(), metagene_profiles,
                                   "ORF type metagene profiles: {}",
                                   4, 0.23, True)

            _write_dataset_figures(out, replicate_datasets(),
                                   metagene_profiles,
                                   "ORF type metagene profiles: {}",
                                   4, 0.23, False)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)
def _write_image_or_missing(out, image_file, name, width):
    """Write the image if it exists; otherwise log a warning and write a "Missing" note."""
    msg = "Looking for image file: {}".format(image_file)
    logger.debug(msg)

    if os.path.exists(image_file):
        latex.write_graphics(out, image_file, width=width)
    else:
        msg = "Could not find image: {}".format(image_file)
        logger.warning(msg)

        text = "Missing: {}\n\n".format(name)
        latex.write(out, text)


def _write_cell_separator(out, num_cells):
    """Write a column separator after an odd number of cells, otherwise a row separator."""
    if num_cells % 2 == 1:
        latex.write_column_sep(out)
    else:
        latex.write_row_sep(out)


def main():
    """Create the figures and the latex report for the preprocessing results.

    After parsing the command line and loading the yaml config, the read
    filtering plots and the per-sample figures are created with
    create_read_filtering_plots and create_figures. The report contains the
    read filtering figures, a table of the read length distributions and,
    for each sample, a table of the metagene profiles of every read length
    with at least --min-visualization-count reads. With
    --show-orf-periodicity, the ORF type profiles are added. The report is
    compiled with latex.compile.

    With --create-fastqc-reports, create_fastqc_reports is finally run for
    every sample. With --use-slurm, the command is handed to
    slurm.check_sbatch instead and nothing else happens.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text.")

    parser.add_argument('config', help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument(
        '--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')
    parser.add_argument(
        '--show-read-length-bfs', help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.", action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
                        "be overwritten.", action='store_true')
    parser.add_argument('--min-visualization-count', help="Read lengths with fewer than this "
                        "number of reads will not be included in the report.",
                        type=int, default=default_min_visualization_count)
    parser.add_argument('--image-type', help="The type of image types to create. This "
                        "must be an extension which matplotlib can interpret.",
                        default=default_image_type)
    parser.add_argument(
        '-c', '--create-fastqc-reports', help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.", action='store_true')
    parser.add_argument('--tmp', help="If the fastqc reports are created, "
                        "they will use this location for temp files",
                        default=default_tmp)
    parser.add_argument(
        '--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # Read the config inside a context manager so the file handle is closed.
    # NOTE(review): yaml.load without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6; consider yaml.safe_load if the
    # config files only contain plain YAML.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    # identity check: argparse hands back the default object itself when
    # --note was not given on the command line
    if args.note is not default_note:
        config['note'] = args.note
    note = config.get('note', None)

    sample_names = sorted(config['riboseq_samples'].keys())

    # keep multimappers? (only the presence of the key is checked)
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts',
        'samtools',
        'visualize-read-filtering-counts',
        'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])

    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information...
    create_read_filtering_plots(args.config, config, args)

    # ... and all the other figures.
    for name in sample_names:
        periodic_offsets = filenames.get_periodic_offsets(
            config['riboseq_data'], name, is_unique=is_unique, note=note)
        offsets_df = pd.read_csv(periodic_offsets)
        create_figures(args.config, config, name, offsets_df, args)

    min_metagene_profile_count = config.get(
        "min_metagene_profile_count", default_min_metagene_profile_count)

    min_metagene_profile_bayes_factor_mean = config.get(
        "min_metagene_profile_bayes_factor_mean",
        default_min_metagene_profile_bayes_factor_mean)

    max_metagene_profile_bayes_factor_var = config.get(
        "max_metagene_profile_bayes_factor_var",
        default_max_metagene_profile_bayes_factor_var)

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")

        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)

        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, width=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, width=0.45)
        latex.write_caption(out, read_filtering_caption,
                            label=read_filtering_label)
        latex.end_figure(out)

        latex.clearpage(out)

        # the read length distributions
        latex.section(out, "Read length distributions",
                      label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")
        latex.write_header(out, ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=False, note=note,
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=True, note=note,
                image_type=args.image_type)

            # one row per sample: all reads on the left, unique on the right
            _write_image_or_missing(out, read_length_distribution_image,
                                    name, 0.45)
            latex.write_column_sep(out)

            _write_image_or_missing(out, unique_read_length_distribution_image,
                                    name, 0.45)
            latex.write_row_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            # the number of table cells written for this sample
            i = 0

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(
                config['riboseq_data'], name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if not mask_length.any():
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters; if several of them fail,
                # the status of the last failing one is shown
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if length_row['highest_peak_bf_mean'] < min_metagene_profile_bayes_factor_mean:
                    offset_status = "BF mean too small"

                if length_row['highest_peak_bf_var'] > max_metagene_profile_bayes_factor_var:
                    offset_status = "BF variance too high"

                if length_row['highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row['highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'], name, image_type=args.image_type,
                    is_unique=is_unique, length=length, note=note)

                title = ("Length: {}. P-site offset: {}. Status: {}\n".format(
                    length, offset, offset_status))

                if args.show_read_length_bfs:
                    # the title spans the profile and the Bayes factor cells
                    title = "\\scriptsize{" + title + "}"
                    title = "\\multicolumn{2}{c}{" + title + "}"
                    latex.write(out, title)
                    latex.write_row_sep(out)
                else:
                    latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                i += 1
                _write_cell_separator(out, i)

                if args.show_read_length_bfs:
                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'], name,
                        image_type=args.image_type, is_unique=is_unique,
                        length=length, note=note)

                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    _write_cell_separator(out, i)

            # finish a row which only has its first cell filled
            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                # the number of images written for this sample
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for sample: {}. "
                           "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'], sample_name, length=lengths,
                    offset=offsets, is_unique=is_unique, note=note,
                    subfolder='orf-profiles')

                it = itertools.product(ribo_utils.orf_types, strands)
                for orf_type, strand in it:
                    orf_type_profile = filenames.get_orf_type_profile_image(
                        orf_type_profile_base, orf_type, strand,
                        image_type=args.image_type)

                    msg = "Looking for image file: {}".format(orf_type_profile)
                    logger.debug(msg)

                    if not os.path.exists(orf_type_profile):
                        continue

                    # four images per figure
                    if i % 4 == 0:
                        latex.begin_figure(out)

                    i += 1
                    latex.write_graphics(out, orf_type_profile, height=0.23)

                    if i % 4 == 0:
                        latex.end_figure(out)
                        latex.clearpage(out)

                # close a figure with fewer than four images
                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                     args.num_cpus, create_fastqc_reports,
                                     config, args)
def main():
    """Create and compile the riboseq preprocessing report.

    Parses the command line, loads the YAML config, creates the read
    filtering plots and the per-sample figures, writes
    ``preprocessing-report.tex`` into the ``out`` directory and compiles it.

    If ``args.use_slurm`` is set, the original command line is handed to
    ``slurm.check_sbatch`` and nothing else is done. If
    ``--create-fastqc-reports`` is given, the fastqc reports are created
    after the document has been compiled.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text.")

    parser.add_argument('config', help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument('--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')

    parser.add_argument('--show-read-length-bfs', help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.", action='store_true')

    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
        "be overwritten.", action='store_true')

    parser.add_argument('--min-visualization-count', help="Read lengths with fewer than this "
        "number of reads will not be included in the report.", type=int,
        default=default_min_visualization_count)

    parser.add_argument('--image-type', help="The type of image types to create. This "
        "must be an extension which matplotlib can interpret.",
        default=default_image_type)

    parser.add_argument('-c', '--create-fastqc-reports', help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.", action='store_true')

    parser.add_argument('--tmp', help="If the fastqc reports are created, "
        "they will use this location for temp files", default=default_tmp)

    parser.add_argument('--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # Load the config exactly once and close the file handle afterwards.
    # NOTE(review): yaml.load without an explicit Loader is deprecated in
    # recent PyYAML releases and can construct arbitrary objects; consider
    # yaml.safe_load if the config files never rely on python-specific tags.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts',
        'samtools',
        'visualize-read-filtering-counts',
        'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])

    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # a note given on the command line replaces the one from the config file
    if args.note is not default_note:
        config['note'] = args.note
    note = config.get('note', None)

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information
    create_read_filtering_plots(args.config, config, args)

    # thresholds used below to label the status of each read length
    min_metagene_profile_count = config.get(
        "min_metagene_profile_count", default_min_metagene_profile_count)

    min_metagene_profile_bayes_factor_mean = config.get(
        "min_metagene_profile_bayes_factor_mean",
        default_min_metagene_profile_bayes_factor_mean)

    max_metagene_profile_bayes_factor_var = config.get(
        "max_metagene_profile_bayes_factor_var",
        default_max_metagene_profile_bayes_factor_var)

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    sample_names = sorted(config['riboseq_samples'].keys())

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")

        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)

        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, height=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, height=0.45)
        latex.write_caption(out, read_filtering_caption, label=read_filtering_label)
        latex.end_figure(out)

        latex.clearpage(out)

        # the read length distributions
        latex.section(out, "Read length distributions",
                      label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")
        latex.write_header(out, ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=False, note=note,
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=True, note=note,
                image_type=args.image_type)

            # one table row per sample: the "all reads" image is followed by
            # a column separator, the "unique reads" image by a row separator
            row_cells = (
                (read_length_distribution_image, latex.write_column_sep),
                (unique_read_length_distribution_image, latex.write_row_sep),
            )

            for image, write_sep in row_cells:
                msg = "Looking for image file: {}".format(image)
                logger.debug(msg)

                if os.path.exists(image):
                    latex.write_graphics(out, image, width=0.45)
                else:
                    msg = "Could not find image: {}".format(image)
                    logger.warning(msg)

                    text = "Missing: {}\n\n".format(name)
                    latex.write(out, text)

                write_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            # i counts the table cells written so far for this sample
            i = 0

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(
                config['riboseq_data'], name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            create_figures(args.config, config, name, offsets_df, args)

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if sum(mask_length) == 0:
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters; when several fail, the last
                # failing check determines the reported status
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if length_row['highest_peak_bf_mean'] < min_metagene_profile_bayes_factor_mean:
                    offset_status = "BF mean too small"

                if length_row['highest_peak_bf_var'] > max_metagene_profile_bayes_factor_var:
                    offset_status = "BF variance too high"

                if length_row['highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row['highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'], name, image_type=args.image_type,
                    is_unique=is_unique, length=length, note=note)

                title = ("length: {}. P-site offset: {}. \\newline status: {}"
                         "\n".format(length, offset, offset_status))
                latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                # two cells per table row: odd cell -> column separator,
                # even cell -> row separator
                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:
                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'], name, image_type=args.image_type,
                        is_unique=is_unique, length=length, note=note)

                    latex.centering(out)
                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            # close a dangling half-filled row
            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                # i counts the images written so far; four images per figure
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for sample: {}. "
                           "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'], sample_name, length=lengths,
                    offset=offsets, is_unique=is_unique, note=note,
                    subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)

                        if os.path.exists(orf_type_profile):
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out, orf_type_profile, height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                # close a figure that holds fewer than four images
                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                     args.num_cpus, create_fastqc_reports,
                                     config, args)
def main():
    """Convert one or more bam files to bigWig files.

    Parses the command line and checks that ``wigToBigWig`` is available.
    If ``args.use_slurm`` is set, the original command line is handed to
    ``slurm.check_sbatch`` and nothing else is done; otherwise
    ``bam_to_wiggle`` is applied to every given bam file via
    ``parallel.apply_parallel_iter``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script converts bam files to bigWig files. It is mostly "
        "a port of this script: https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py "
        "by Brad Chapman which avoids a few dependencies.\n\nThe wigToBigWig "
        "program (from UCSC tools) must be in the path.\n\nN.B. If given, the "
        "start and end coordinates must be base-0.")

    parser.add_argument('bam', help="The bam file", nargs='+')

    parser.add_argument('-o', '--overwrite', help="If this flag is given, then "
        "the bigWig file will be created whether it exists or not",
        action='store_true')

    parser.add_argument('-c', '--chrom', help="If specified, only alignments "
        "from this chromosome will be in the output", default=default_chrom)

    # the help text of the next two options previously read "If specied"
    parser.add_argument('-s', '--start', help="If specified, only alignments "
        "from this position will be in the output", default=default_start)

    parser.add_argument('-e', '--end', help="If specified, only alignments "
        "up to this position will be in the output", default=default_end)

    parser.add_argument('-n', '--normalize', help="If this flag is given, "
        "then values will be normalized to reads per million",
        action='store_true')

    parser.add_argument('-t', '--use-tempfile', help="If this flag is given, "
        "then a temp file will be used to avoid permission issues",
        action='store_true')

    parser.add_argument('-k', '--keep-wig', help="If this flag is given, then "
        "the wiggle file will not be deleted", action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['wigToBigWig']
    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    parallel.apply_parallel_iter(args.bam, args.num_cpus, bam_to_wiggle, args,
                                 progress_bar=True)
def main():
    """Run the Rp-Bp pipeline on a single sample.

    Parses the command line, loads the YAML config and validates the
    required programs and config keys. If ``args.use_slurm`` is set, the
    original command line is handed to ``slurm.check_sbatch`` and nothing
    else is done. Otherwise ``create-orf-profiles`` is called and, unless
    ``--profiles-only`` is given, ``predict-translated-orfs`` afterwards.

    ``--do-not-call`` is forwarded to the called scripts, so they are
    still invoked with that flag.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs the Rp-Bp and Rp-chi pipelines on a given sample. "
        "It requires a YAML config file that includes a number of keys. Please see the "
        "documentation for a complete description.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)

    # the adjacent literals were previously joined without separators,
    # rendering as "options topass" and 'value"If specified'
    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately as in \"--flexbarOption value\". "
        "If specified, flexbar options will override default settings.",
        nargs='*', type=str)

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    parser.add_argument('--profiles-only', help="If this flag is present, then only "
        "the ORF profiles will be created", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE(review): yaml.load without an explicit Loader is deprecated in
    # recent PyYAML releases and can construct arbitrary objects; consider
    # yaml.safe_load if the config files never rely on python-specific tags.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # now, check if we want to use slurm
    msg = "use_slurm: {}".format(args.use_slurm)
    logger.debug(msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_str = "" if call else "--do-not-call"
    overwrite_str = "--overwrite" if args.overwrite else ""

    # for a sample, we first create its filtered genome profile
    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    # each flexbar option is wrapped in double quotes so that
    # "--option value" pairs stay together on the command line
    flexbar_option_str = ""
    if args.flexbar_options is not None:
        flexbar_option_str = "--flexbar-options {}".format(
            ' '.join('"' + flx_op + '"' for flx_op in args.flexbar_options))

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
        args.raw_data, args.config, args.name, args.num_cpus, mem_str,
        do_not_call_str, overwrite_str, logging_str, star_str, tmp_str,
        flexbar_option_str, keep_intermediate_str))

    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(
        args.config, args.name, args.num_cpus, do_not_call_str,
        overwrite_str, logging_str))

    shell_utils.check_call(cmd)
def main():
    """Run bowtie2 on every given fastq file against one index.

    For each fastq file a ``call-program bowtie2`` command is built and
    passed to ``slurm.check_sbatch``. By default nothing is retained: the
    alignments go to the path returned by ``utils.abspath("dev", "null")``
    and neither aligned nor unaligned reads are written. The
    ``--alignments``, ``--al-gz`` and ``--un-gz`` flags redirect the
    respective outputs into the output directory.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs bowtie2 on all of the provided input files "
        "using the given index. By default, it does not save the alignments, "
        "aligned reads or unaligned reads. The respective flags must be given "
        "to retain the desired entities.")

    parser.add_argument('index', help="The bowtie2 index")
    parser.add_argument('out', help="The output directory")
    parser.add_argument('fastq', help="The fastq files", nargs='+')

    parser.add_argument('-a', '--alignments', help="If this flag is present, "
        "the alignments will be present in the output folder", action='store_true')
    parser.add_argument('--un-gz', help="If this flag is present, then the "
        "unaligned reads will be present in the output folder", action='store_true')
    parser.add_argument('--al-gz', help="If this flag is present, then the "
        "aligned reads will be present in the output folder", action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['bowtie2', 'call-program']
    shell_utils.check_programs_exist(programs)

    # only touch the file system when we are actually going to run something
    if not os.path.exists(args.out) and not args.do_not_call:
        msg = "Creating output directory: {}".format(args.out)
        logger.info(msg)
        # exist_ok guards against another process creating the directory
        # between the check above and this call
        os.makedirs(args.out, exist_ok=True)

    for fastq in args.fastq:
        basename = utils.get_basename(fastq)

        # we do not care about the alignments unless --alignments is given
        out = utils.abspath("dev", "null")
        if args.alignments:
            n = "{}.bam".format(basename)
            out = os.path.join(args.out, n)
        out_str = "-S {}".format(out)

        un_gz_str = ""
        if args.un_gz:
            n = "{}.un-al.fastq.gz".format(basename)
            n = os.path.join(args.out, n)
            un_gz_str = "--un-gz {}".format(n)

        al_gz_str = ""
        if args.al_gz:
            n = "{}.al.fastq.gz".format(basename)
            n = os.path.join(args.out, n)
            al_gz_str = "--al-gz {}".format(n)

        cmd = "call-program bowtie2 -p {} --very-fast -x {} -U {} {} {} {}".format(
            args.num_cpus, args.index, fastq, out_str, un_gz_str, al_gz_str)

        slurm.check_sbatch(cmd, args=args)
def main():
    """Submit ``get-orf-peptide-matches`` for all samples in a project.

    For every cell type listed under ``peptide_cell_type_analysis`` in the
    config and every peptide file associated with it, a
    ``get-orf-peptide-matches`` command is built and passed to
    ``slurm.check_sbatch``. Cell types or peptide files that are not
    specified in the config, or whose input files do not exist, are
    skipped with a warning.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script identifies the orf peptide matches for all samples in "
        "a project.")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--peptide-filter-field', help="The field to use for "
        "filtering the peptides from MaxQuant", default=default_peptide_filter_field)

    parser.add_argument('--peptide-filter-value', help="All peptides with a value "
        "greater than the filter value will be removed", type=float,
        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator', help="The separator in the "
        "peptide file", default=default_peptide_separator)

    parser.add_argument('--note', help="If this option is given, it will be used in "
        "the output filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    # NOTE(review): yaml.load without an explicit Loader is deprecated in
    # recent PyYAML releases and can construct arbitrary objects; consider
    # yaml.safe_load if the config files never rely on python-specific tags.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    programs = ['get-orf-peptide-matches']
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files',
        'peptide_cell_type_analysis',
        'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    # the config note selects the input files; a note given on the command
    # line only replaces the note used for the output filenames
    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    # The lookup keys must match the argparse destinations. The previous
    # keys ('peptides_filter_field' / 'peptides_filter_value') never occur
    # in vars(args).
    # NOTE(review): presumably utils.get_config_argument turns a present key
    # into a "--flag value" string, so the two filter options are now
    # forwarded to get-orf-peptide-matches -- confirm against the helper.
    args_dict = vars(args)

    peptide_filter_field_str = utils.get_config_argument(args_dict, 'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(args_dict, 'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(args_dict, 'peptide_separator')
    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
    for cell_type, peptide_files in config['peptide_cell_type_analysis'].items():
        if cell_type not in cell_types:
            msg = ("Could not find cell_type specification. Please check the config "
                   "file: {}".format(cell_type))
            logger.warning(msg)
            continue

        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".
                   format(cell_type_protein))
            logger.warning(msg)
            continue

        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = ("Could not find peptide_file specification. Please check "
                       "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue

            peptide_txt_file = config['peptide_files'][peptide_file]

            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".
                       format(peptide_txt_file))
                logger.warning(msg)
                continue

            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'], cell_type, peptide_file,
                is_filtered=True, note=out_note_str)

            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(
                cell_type_protein, peptide_txt_file, peptide_matches,
                num_cpus_str, peptide_filter_field_str,
                peptide_filter_value_str, peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
def main():
    """Create the reference files needed for downstream rpbp analysis.

    After validating programs, config keys and input files, this builds
    the ribosomal bowtie2 index and the STAR genome index (each via
    ``shell_utils.call_if_not_exists``), extracts the annotated ORFs and,
    if ``de_novo_gtf`` is present in the config, the de novo ORFs. With a
    de novo assembly the annotated and de novo ORF and exon files are
    concatenated; without one, the annotated files are symlinked to the
    generic ORF and exon file names.

    If ``args.use_slurm`` is set, the original command line is handed to
    ``slurm.check_sbatch`` and nothing else is done.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates all of the files necessary for downstream "
        "analysis performed with the rpbp package.")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # NOTE(review): yaml.load without an explicit Loader is deprecated in
    # recent PyYAML releases and can construct arbitrary objects; consider
    # yaml.safe_load if the config files never rely on python-specific tags.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'extract-orf-coordinates',
        'label-orfs',
        'bowtie2-build-s',
        'split-bed12-blocks',
        'gtf-to-bed12',
        args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path',
        'genome_name',
        'gtf',
        'fasta',
        'ribosomal_fasta',
        'ribosomal_index',
        'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]

    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]

    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rrna index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names
    annotated_orfs = filenames.get_orfs(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_annotated=True, is_de_novo=False)

    annotated_exons_file = filenames.get_exons(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_annotated=True, is_de_novo=False)

    orfs_genomic = filenames.get_orfs(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'))

    exons_file = filenames.get_exons(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'))

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'], args, config, is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(
            config['genome_base_path'], config['genome_name'],
            note=config.get('orf_note'), is_annotated=False, is_de_novo=True)

        de_novo_exons_file = filenames.get_exons(
            config['genome_base_path'], config['genome_name'],
            note=config.get('orf_note'), is_annotated=False, is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            # renumber the ORFs over the combined, sorted set
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + [
                'orf_num', 'orf_len', 'orf_type'
            ]
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

    else:
        # finally, make sure our files are named correctly
        if os.path.exists(annotated_orfs):
            utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            utils.create_symlink(annotated_exons_file, exons_file, call)
def main():
    """Submit (or sequentially run) the Rp-Bp pipeline for all samples.

    For every sample under ``riboseq_samples`` in the config, a
    ``run-rpbp-pipeline`` command is built and passed to
    ``slurm.check_sbatch``; the returned job ids are collected. If
    ``--merge-replicates`` is in effect, a ``predict-translated-orfs``
    command is then submitted for every condition returned by
    ``ribo_utils.get_riboseq_replicates``, with the sample jobs passed as
    dependencies.

    ``--profiles-only`` overrides ``--merge-replicates``.
    ``--run-replicates`` has no effect without ``--merge-replicates``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This is a helper script which submits a set of samples to SLURM. It "
        "can also be used to run a set of samples sequentially. Due to limitations on "
        "the config file specification, all of the samples must use the same reference "
        "indices (i.e., genome sequence, set of ORFs, etc.).")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)

    # the adjacent literals were previously joined without separators,
    # rendering as "options topass" and 'value"If specified'
    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
        "pass to flexbar. Each option must be quoted separately as in \"--flexbarOption value\". "
        "If specified, flexbar options will override default settings.",
        nargs='*', type=str)

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    # previously rendered as "no predictionswill be made."
    parser.add_argument('--profiles-only', help="If this flag is present, then only "
        "the pre-processing part of the pipeline will be called, i.e. profiles "
        "will be created for each sample specified in the config file, but no predictions "
        "will be made.", action='store_true')

    parser.add_argument('--merge-replicates', help="If this flag is present, then "
        "the ORF profiles from the replicates will be merged before making the final "
        "predictions", action='store_true')

    parser.add_argument('--run-replicates', help="If this flag is given with the "
        "--merge-replicates flag, then both the replicates *and* the individual "
        "samples will be run. This flag has no effect if --merge-replicates is not "
        "given.", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE(review): yaml.load without an explicit Loader is deprecated in
    # recent PyYAML releases and can construct arbitrary objects; consider
    # yaml.safe_load if the config files never rely on python-specific tags.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs',
        'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'riboseq_samples',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # handle do_not_call so that we _do_ call the pipeline script, but that
    # it does not run anything: the flag is forwarded on the command line
    # and cleared on args before args is passed to slurm.check_sbatch
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    args.do_not_call = False

    overwrite_str = "--overwrite" if args.overwrite else ""

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # check if we only want to create the profiles, in this case
    # we call run-rpbp-pipeline with the --profiles-only option
    profiles_only_str = ""
    if args.profiles_only:
        args.merge_replicates = False
        profiles_only_str = "--profiles-only"
        msg = ("The --profiles-only option was given, this will override --merge-replicates "
               "and/or --run-replicates, if these options were also given!")
        logger.info(msg)

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles, but we still make predictions
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    if args.run_replicates and not args.merge_replicates:
        msg = ("The --run-replicates option was given without the --merge-replicates "
               "option. It will be ignored.")
        logger.warning(msg)

    # each flexbar option is wrapped in double quotes so that
    # "--option value" pairs stay together on the command line
    flexbar_option_str = ""
    if args.flexbar_options is not None:
        flexbar_option_str = "--flexbar-options {}".format(
            ' '.join('"' + flx_op + '"' for flx_op in args.flexbar_options))

    # collect the job_ids in case we are using slurm and need to merge replicates
    job_ids = []

    # sample names are the (unique) dict keys, so sorting the items orders
    # by sample name only
    for sample_name, data in sorted(config['riboseq_samples'].items()):

        # every sample gets its own temp directory below the given one
        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp, "{}_{}_rpbp".format(sample_name, note))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data, args.config, sample_name, args.num_cpus, tmp_str,
            do_not_call_str, overwrite_str, logging_str, star_str,
            profiles_only_str, flexbar_option_str, keep_intermediate_str,
            mem_str)

        job_id = slurm.check_sbatch(cmd, args=args)
        job_ids.append(job_id)

    # now, if we are running the "standard" pipeline, we are finished
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):

        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config, condition_name, args.num_cpus, do_not_call_str,
            overwrite_str, logging_str, merge_replicates_str)

        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    """Command-line entry point: estimate ORF Bayes factors and select predictions.

    The function parses the command line, loads the yaml config, optionally
    merges the ORF profiles of all replicates of a condition
    (--merge-replicates), runs "estimate-orf-bayes-factors" on the resulting
    profiles and finally runs "select-final-prediction-set" once for every
    combination of filtered/unfiltered and chi-square/Bayes-factor
    predictions.

    All external commands are dispatched via shell_utils.call_if_not_exists,
    which receives the --overwrite and --do-not-call choices.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs all of the processing necessary to produce the "
        "signals used for later processing. In particular, it runs the standard "
        "rnaseq and riboseq preprocessing, estimates the abundance of transcripts with "
        "the rnaseq data, and selects the most-expressed isoforms and ORFs. Next, it removes "
        "multimapping and non-periodic-length riboseq reads. Finally, it extracts the riboseq "
        "signal for the most-expressed transcripts.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)

    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')

    parser.add_argument(
        '--merge-replicates', help="If this flag is present, then the ORF "
        "profiles will be merged for all replicates in the condition given by <name>. The "
        "filenames, etc., will reflect the condition name, but not the lengths and offsets "
        "of the individual replicates.\n\nN.B. If this flag is is present, the --overwrite "
        "flag will automatically be set!", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[predict_translated_orfs]: {}".format(' '.join(sys.argv))
    logger.debug(msg)

    logging_str = logging_utils.get_logging_options_string(args)

    # Close the config file deterministically instead of leaking the handle.
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load once it is confirmed the configs only use plain YAML.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = ['estimate-orf-bayes-factors', 'select-final-prediction-set']
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'fasta',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    models_base = config.get('models_base', default_models_base)
    note_str = config.get('note', None)

    # we always need the ORFs
    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    # and the smoothing parameters
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    # keep multimappers? (only the presence of the key matters, not its value)
    is_unique = 'keep_riboseq_multimappers' not in config

    # first, check if we are merging replicates
    # either way, the following variables need to have values for the rest of
    # the pipeline: lengths, offsets, profiles
    if args.merge_replicates:
        msg = ("The --merge-replicates option was given, so --overwrite is "
               "being set to True.")
        logger.warning(msg)
        args.overwrite = True

        # now, actually merge the replicates
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)

        # we will not use the lengths and offsets in the filenames
        lengths = None
        offsets = None

        # we will also merge all of unsmoothed profiles
        replicate_profiles = [
            get_profile(name, config, args)
            for name in riboseq_replicates[args.name]
        ]
        replicate_profiles_str = ' '.join(replicate_profiles)

        profiles = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                  args.name,
                                                  length=lengths,
                                                  offset=offsets,
                                                  is_unique=is_unique,
                                                  note=note_str)

        cmd = "merge-replicate-orf-profiles {} {} {}".format(
            replicate_profiles_str, profiles, logging_str)
        in_files = replicate_profiles
        out_files = [profiles]

        # todo: implement file checker for mtx files
        shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                       overwrite=args.overwrite, call=call)
    else:
        # otherwise, just treat things as normal
        # get the lengths and offsets which meet the required criteria from
        # the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name, args.do_not_call, is_unique=is_unique)

        profiles = get_profile(args.name, config, args)

    # estimate the bayes factors
    bayes_factors = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations)

    # the smoothing options
    min_length_str = utils.get_config_argument(config, 'min_orf_length', 'min-length')
    max_length_str = utils.get_config_argument(config, 'max_orf_length', 'max-length')
    min_profile_str = utils.get_config_argument(config, 'min_signal', 'min-profile')

    fraction_str = utils.get_config_argument(config, 'smoothing_fraction', 'fraction')
    reweighting_iterations_str = utils.get_config_argument(
        config, 'smoothing_reweighting_iterations', 'reweighting-iterations')

    # parse out all of the options from the config file, if they are present
    translated_models = filenames.get_models(models_base, 'translated')
    untranslated_models = filenames.get_models(models_base, 'untranslated')

    translated_models_str = ' '.join(translated_models)
    untranslated_models_str = ' '.join(untranslated_models)

    translated_models_str = "--translated-models {}".format(
        translated_models_str)
    untranslated_models_str = "--untranslated-models {}".format(
        untranslated_models_str)

    orf_types_str = utils.get_config_argument(config, 'orf_types')

    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains', 'chains')
    iterations_str = utils.get_config_argument(config, 'translation_iterations',
                                               'iterations')

    # as with multimappers, only the presence of the key is checked
    chi_square_only_str = ""
    chi_square_only = False
    if 'chi_square_only' in config:
        chi_square_only = True
        chi_square_only_str = "--chi-square-only"

    cmd = (
        "estimate-orf-bayes-factors {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} "
        "--num-cpus {}".format(profiles, orfs_genomic, bayes_factors,
                               translated_models_str, untranslated_models_str,
                               logging_str, orf_types_str, min_length_str,
                               max_length_str, min_profile_str, fraction_str,
                               reweighting_iterations_str, seed_str,
                               iterations_str, chains_str, chi_square_only_str,
                               args.num_cpus))

    in_files = [profiles, orfs_genomic]
    in_files.extend(translated_models)
    in_files.extend(untranslated_models)
    out_files = [bayes_factors]
    file_checkers = {bayes_factors: utils.check_gzip_file}

    msg = "estimate-bayes-factors in_files: {}".format(in_files)
    logger.debug(msg)

    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    is_chisq_values = [True, False]
    if chi_square_only:
        is_chisq_values = [True]

    # The selection thresholds do not depend on the loop variables, so they
    # are built once instead of on every iteration.
    # NOTE(review): the original loop also built option strings for
    # 'chisq_significance_level' and 'min_signal' ("minimum-profile-sum") but
    # never passed them to select-final-prediction-set; they are omitted here
    # so the generated command is unchanged. Confirm whether they were meant
    # to be forwarded.
    min_bf_mean_str = utils.get_config_argument(config, 'min_bf_mean')
    max_bf_var_str = utils.get_config_argument(config, 'max_bf_var')
    min_bf_likelihood_str = utils.get_config_argument(config, 'min_bf_likelihood')

    for is_filtered in [True, False]:
        for is_chisq in is_chisq_values:

            filtered_str = ""
            if is_filtered:
                filtered_str = "--select-longest-by-stop --select-best-overlapping"

            chisq_str = "--use-chi-square" if is_chisq else ""

            # the three output files share exactly the same naming options
            naming_options = dict(
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=note_str,
                fraction=fraction,
                reweighting_iterations=reweighting_iterations,
                is_filtered=is_filtered,
                is_chisq=is_chisq)

            # now, select the ORFs (longest for each stop codon) which pass
            # the prediction filters
            predicted_orfs = filenames.get_riboseq_predicted_orfs(
                config['riboseq_data'], args.name, **naming_options)

            predicted_orfs_dna = filenames.get_riboseq_predicted_orfs_dna(
                config['riboseq_data'], args.name, **naming_options)

            predicted_orfs_protein = filenames.get_riboseq_predicted_orfs_protein(
                config['riboseq_data'], args.name, **naming_options)

            cmd = "select-final-prediction-set {} {} {} {} {} {} {} {} {} {} {}".format(
                bayes_factors, config['fasta'], predicted_orfs,
                predicted_orfs_dna, predicted_orfs_protein, min_bf_mean_str,
                max_bf_var_str, min_bf_likelihood_str, logging_str, chisq_str,
                filtered_str)

            in_files = [bayes_factors, config['fasta']]
            out_files = [
                predicted_orfs,
                predicted_orfs_dna,
                predicted_orfs_protein
            ]

            file_checkers = {predicted_orfs: utils.check_gzip_file}

            # todo: implement file checker for fasta files
            shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                           file_checkers=file_checkers,
                                           overwrite=args.overwrite, call=call)
def main():
    """Command-line entry point: download SRR runs and convert them to fastq.gz.

    The function parses the command line, verifies that "fastq-dump" is
    available and then either

    * resubmits its own command line through slurm.check_sbatch when
      --use-slurm is given, and returns immediately; or
    * reads the SRR table with pandas and hands it to process_files via
      parallel.apply_parallel_split.
    """
    # local import so the block stays self-contained
    import shlex

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script downloads short read archive runs (i.e., SRR) files "
        "over ftp. It only requires the run number. It also converts the files from "
        "the .sra format to .fastq.gz files. It then deletes the .sra file.")

    parser.add_argument(
        'srr', help="A csv file containing the SRR accessions to "
        "download. Optionally, it can also include whether the samples are paired-"
        "end or not.")
    parser.add_argument('outdir', help="The location for the fastq.gz files")

    parser.add_argument('-a', '--accession-field', help="The name of the column "
                        "containing the SRR identifiers",
                        default=default_accession_field)
    parser.add_argument('-p', '--paired-field', help="The name of the column "
                        "indicating whether the sample is paired-end",
                        default=default_paired_field)
    parser.add_argument(
        '-v', '--paired-values', help="The exact string values in "
        "the paired-field which indicate the sample is paired-end",
        nargs="*", default=default_paired_values)
    parser.add_argument('-s', '--source', help="The server from which the files "
                        "will be downloaded", choices=source_choices,
                        default=default_source)

    parser.add_argument(
        '--overwrite', help="If this flag is given, then existing "
        "files will be re-downloaded. Otherwise, if either the .sra or .fastq.gz "
        "file already exists, then the sra file will not be downloaded.",
        action='store_true')

    parser.add_argument(
        '--num-downloads-per-connection', help="The number of "
        "files to download with each open connection. Each connections will be "
        "closed and re-opened after this many files are downloaded.",
        type=int, default=default_num_downloads_per_connection)

    parser.add_argument('--sep', help="The separator in the SRR file",
                        default=default_sep)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['fastq-dump']
    shell_utils.check_programs_exist(programs)

    # check if we want to use slurm
    if args.use_slurm:
        msg = ("The --use-slurm option was given, so sbatch will now be used "
               "to submit to slurm.")
        logger.warning(msg)

        # Quote every argument: values such as --sep or --paired-values may
        # contain whitespace or shell metacharacters, and a plain join would
        # change how the resubmitted command is split.
        cmd = ' '.join(shlex.quote(arg) for arg in sys.argv)

        slurm.check_sbatch(cmd, args=args)

        # and quit!
        return

    msg = "Reading SRR list"
    logger.info(msg)

    srr = pd.read_csv(args.srr, sep=args.sep)

    # NOTE(review): --num-cpus is not declared in this function; presumably
    # slurm.add_sbatch_options adds it - confirm.
    parallel.apply_parallel_split(srr, args.num_cpus, process_files, args)
class _ReportFigurePager:
    """Group report images into captioned latex figures.

    The pager remembers whether a figure environment is currently open, so a
    figure is only begun when none is open and only captioned/ended when one
    was actually begun. After every `images_per_page` images the open figure
    is closed and the page is cleared.
    """

    def __init__(self, out, images_per_page):
        self.out = out
        self.images_per_page = images_per_page
        self.num_images = 0
        self.is_open = False

    def add_image(self, image, caption, height):
        """Write one image, opening a figure first if necessary."""
        if not self.is_open:
            latex.begin_figure(self.out)
            self.is_open = True

        self.num_images += 1
        latex.write_graphics(self.out, image, height=height)

        if self.num_images % self.images_per_page == 0:
            self.close_figure(caption)
            latex.clearpage(self.out)

    def close_figure(self, caption):
        """Caption and end the current figure; do nothing if none is open."""
        if self.is_open:
            latex.write_caption(self.out, caption)
            latex.end_figure(self.out)
            self.is_open = False

    def end_section(self):
        """Clear the page unless the last image already triggered a clearpage."""
        if self.num_images % self.images_per_page != 0:
            latex.clearpage(self.out)


def main():
    """Command-line entry point: build the Rp-Bp prediction latex report.

    The function parses the command line, loads the yaml config, creates all
    figures (create_all_figures), and then writes a latex document containing
    the ORF type bar charts, the ORF length distribution line graphs and,
    with --show-orf-periodicity, the ORF type metagene profiles, for every
    sample and every replicate condition. Images which do not exist on disk
    are skipped with a log message. Finally, the document is compiled.

    With --use-slurm the command line is resubmitted via slurm.check_sbatch
    and the function returns without creating anything.
    """
    # local import so the block stays self-contained
    import shlex

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates the plots which detail the basic characteristics "
        "of the ORF predictions from the Rp-Bp pipeline. It also creates and compiles (if "
        "possible) a latex report for them.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('out', help="The base output directory for the latex report")

    parser.add_argument('--show-unfiltered-orfs', help="If this flag is "
                        "present, bar charts showing the distribution of the types of the "
                        "unfiltered ORF set will be included", action='store_true')

    parser.add_argument('--show-orf-periodicity', help="If this flag is "
                        "present, bar charts showing the periodicity of each ORF type will be "
                        "included in the report.", action='store_true')

    parser.add_argument('--uniprot', help="The uniprot ORF lengths, if available",
                        default=default_uniprot)
    parser.add_argument('--uniprot-label', help="The label to use for the uniprot ORFs in "
                        "the plot", default=default_uniprot_label)

    parser.add_argument('--image-type', help="The format of the image files. This must be "
                        "a format usable by matplotlib.", default=default_image_type)

    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
                        "be overwritten.", action='store_true')

    parser.add_argument('--note', help="If this option is given, it will be used in the "
                        "filenames.\n\nN.B. This REPLACES the note in the config file.",
                        default=default_note)

    parser.add_argument('--show-chisq', help="If this flag is given, then the "
                        "results from Rp-chi will be included in the document; otherwise, they "
                        "will not be created or shown.", action='store_true')

    parser.add_argument('-t', '--tmp', help="A location for temporary files",
                        default=default_tmp)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # Close the config file deterministically instead of leaking the handle.
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load once it is confirmed the configs only use plain YAML.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    # keep multimappers? (only the presence of the key matters, not its value)
    is_unique = 'keep_riboseq_multimappers' not in config

    programs = [
        'create-orf-length-distribution-line-graph',
        'create-orf-types-bar-chart',
        'visualize-orf-type-metagene-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    if args.use_slurm:
        # quote each argument so values with whitespace or shell
        # metacharacters survive the resubmission intact
        cmd = ' '.join(shlex.quote(arg) for arg in sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # by default, we will not include chisq
    chisq_values = [False]
    if args.show_chisq:
        chisq_values = [True, False]

    filtered_values = [True]
    if args.show_unfiltered_orfs:
        filtered_values = [True, False]

    grouped_values = [True, False]

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create all of the figures
    create_all_figures(config, args)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    def smoothing_options(is_chisq):
        """The chi-square filenames do not include the smoothing parameters."""
        if is_chisq:
            return None, None
        return fraction, reweighting_iterations

    def periodic_lengths_and_offsets(sample_name):
        """Return (lengths, offsets) for the sample, or None if unavailable."""
        try:
            return ribo_utils.get_periodic_lengths_and_offsets(
                config, sample_name, is_unique=is_unique)
        except FileNotFoundError:
            msg = ("Could not parse out lengths and offsets for sample: {}. "
                   "Skipping".format(sample_name))
            logger.error(msg)
            return None

    project_name = config.get("project_name", default_project_name)
    title = "Rp-Bp prediction analysis for {}".format(project_name)
    abstract = "This document shows the results of the Rp-Bp pipeline analysis."

    tex_file = filenames.get_rpbp_prediction_report(args.out, out_note_str)

    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract)
        latex.write(out, "\n")
        latex.clearpage(out)

        ### ORF type distributions
        title = "Predicted ORF type distributions"
        latex.section(out, title)

        # first, handle all of the regular datasets
        sample_names = sorted(config['riboseq_samples'].keys())

        # and check if we also have replicates
        replicate_names = []
        if 'riboseq_biological_replicates' in config:
            replicate_names = sorted(ribo_utils.get_riboseq_replicates(config).keys())

        strands = ["+", "-"]

        # The image counter is shared by all samples of a section, so the
        # page is only cleared after every sixth bar chart.
        pager = _ReportFigurePager(out, images_per_page=6)
        for sample_name in sample_names:

            lengths_and_offsets = periodic_lengths_and_offsets(sample_name)
            if lengths_and_offsets is None:
                continue
            lengths, offsets = lengths_and_offsets

            caption = "ORF types: {}".format(sample_name)

            # first, just dump all of the bar charts to the page
            it = itertools.product(grouped_values, chisq_values, filtered_values)

            for is_grouped, is_chisq, is_filtered in it:
                f, rw = smoothing_options(is_chisq)

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered
                )

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    pager.add_image(orf_types_bar_chart, caption, height=0.15)
                else:
                    msg = "Could not find image: {}".format(orf_types_bar_chart)
                    logger.warning(msg)

            # each sample gets its own caption, so never carry a figure over
            pager.close_figure(caption)

        pager.end_section()

        # now, if the config file specifies replicates, create figures for those
        pager = _ReportFigurePager(out, images_per_page=6)
        for replicate_name in replicate_names:
            # merged replicates do not use lengths and offsets in filenames
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            it = itertools.product(grouped_values, chisq_values, filtered_values)

            for is_grouped, is_chisq, is_filtered in it:
                f, rw = smoothing_options(is_chisq)

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered
                )

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    pager.add_image(orf_types_bar_chart, caption, height=0.15)
                else:
                    msg = "Could not find image: {}".format(orf_types_bar_chart)
                    logger.warning(msg)

            pager.close_figure(caption)

        pager.end_section()

        ### ORF type length distributions
        title = "Predicted ORF type length distributions"
        latex.section(out, title)

        pager = _ReportFigurePager(out, images_per_page=4)
        for sample_name in sample_names:

            lengths_and_offsets = periodic_lengths_and_offsets(sample_name)
            if lengths_and_offsets is None:
                continue
            lengths, offsets = lengths_and_offsets

            caption = "ORF type length distributions: {}".format(sample_name)

            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:
                f, rw = smoothing_options(is_chisq)

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq
                )

                if os.path.exists(orf_length_line_graph):
                    pager.add_image(orf_length_line_graph, caption, height=0.15)
                else:
                    msg = "Could not find image: {}".format(orf_length_line_graph)
                    logger.debug(msg)

            pager.close_figure(caption)

        pager.end_section()

        # now, if the config file specifies replicates, create figures for those
        pager = _ReportFigurePager(out, images_per_page=4)
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            # this caption used to read "ORF types: ...", copied from the
            # bar chart section
            caption = "ORF type length distributions: {}".format(replicate_name)

            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:
                f, rw = smoothing_options(is_chisq)

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq
                )

                if os.path.exists(orf_length_line_graph):
                    pager.add_image(orf_length_line_graph, caption, height=0.15)
                else:
                    msg = "Could not find image: {}".format(orf_length_line_graph)
                    logger.debug(msg)

            pager.close_figure(caption)

        pager.end_section()

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "Predicted ORF type metagene profiles"
            latex.section(out, title)

            pager = _ReportFigurePager(out, images_per_page=4)
            for sample_name in sample_names:

                lengths_and_offsets = periodic_lengths_and_offsets(sample_name)
                if lengths_and_offsets is None:
                    continue
                lengths, offsets = lengths_and_offsets

                caption = "ORF type metagene profiles: {}".format(sample_name)

                for is_chisq in chisq_values:
                    f, rw = smoothing_options(is_chisq)

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'],
                        sample_name,
                        length=lengths,
                        offset=offsets,
                        is_unique=is_unique,
                        note=out_note_str,
                        fraction=f,
                        reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base,
                            orf_type,
                            strand,
                            args.image_type
                        )

                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)

                        if os.path.exists(orf_type_profile):
                            pager.add_image(orf_type_profile, caption, height=0.23)
                        else:
                            msg = "Could not find image: {}".format(orf_type_profile)
                            logger.warning(msg)

                pager.close_figure(caption)

            pager.end_section()

            pager = _ReportFigurePager(out, images_per_page=4)
            for replicate_name in replicate_names:
                lengths = None
                offsets = None

                caption = "ORF type metagene profiles: {}".format(replicate_name)

                for is_chisq in chisq_values:
                    f, rw = smoothing_options(is_chisq)

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'],
                        replicate_name,
                        length=lengths,
                        offset=offsets,
                        is_unique=is_unique,
                        note=out_note_str,
                        fraction=f,
                        reweighting_iterations=rw,
                        is_chisq=is_chisq
                    )

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base,
                            orf_type,
                            strand,
                            args.image_type
                        )

                        if os.path.exists(orf_type_profile):
                            pager.add_image(orf_type_profile, caption, height=0.23)
                        else:
                            msg = "Could not find image: {}".format(orf_type_profile)
                            logger.debug(msg)

                pager.close_figure(caption)

            pager.end_section()

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)
def main():
    """Command-line entry point: match ORFs to peptides for every cell type.

    The function parses the command line, loads the yaml config and, for each
    (cell type, peptide file) pair listed under 'peptide_cell_type_analysis',
    submits one "get-orf-peptide-matches" command through slurm.check_sbatch.
    Pairs are skipped with a warning when the cell type or peptide file is
    not specified in the config, or when the protein fasta or peptide file
    does not exist on disk.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script identifies the orf peptide matches for all samples in "
        "a project.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--peptide-filter-field', help="The field to use for "
                        "filtering the peptides from MaxQuant",
                        default=default_peptide_filter_field)

    parser.add_argument('--peptide-filter-value', help="All peptides with a value "
                        "greater than the filter value will be removed",
                        type=float, default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator', help="The separator in the "
                        "peptide file", default=default_peptide_separator)

    parser.add_argument(
        '--note', help="If this option is given, it will be used in "
        "the output filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    # Close the config file deterministically instead of leaking the handle.
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load once it is confirmed the configs only use plain YAML.
    with open(args.config) as config_file:
        config = yaml.load(config_file)

    programs = ['get-orf-peptide-matches']
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files',
        'peptide_cell_type_analysis',
        'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    args_dict = vars(args)

    # argparse stores "--peptide-filter-field" as "peptide_filter_field"
    # (no plural "peptides"), so these are the keys present in args_dict.
    # NOTE(review): the generated option names follow whatever
    # utils.get_config_argument derives from the key, exactly as for
    # 'peptide_separator' and 'num_cpus' below - confirm they match the
    # options accepted by get-orf-peptide-matches.
    peptide_filter_field_str = utils.get_config_argument(
        args_dict, 'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(
        args_dict, 'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(args_dict,
                                                      'peptide_separator')
    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
    for cell_type, peptide_files in config['peptide_cell_type_analysis'].items():
        if cell_type not in cell_types:
            msg = (
                "Could not find cell_type specification. Please check the config "
                "file: {}".format(cell_type))
            logger.warning(msg)
            continue

        # the protein fasta is looked up with the config note, while the
        # output below uses the (possibly overridden) output note
        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".
                   format(cell_type_protein))
            logger.warning(msg)
            continue

        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = (
                    "Could not find peptide_file specification. Please check "
                    "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue

            peptide_txt_file = config['peptide_files'][peptide_file]

            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".format(
                    peptide_txt_file))
                logger.warning(msg)
                continue

            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'],
                cell_type,
                peptide_file,
                is_filtered=True,
                note=out_note_str)

            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(
                cell_type_protein, peptide_txt_file, peptide_matches,
                num_cpus_str, peptide_filter_field_str,
                peptide_filter_value_str, peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
def main():
    """Create the ORF profiles for one riboseq sample.

    The function builds and runs, in order, the commands
    ``create-base-genome-profile``, ``extract-metagene-profiles``,
    ``estimate-metagene-profile-bayes-factors``, ``select-periodic-offsets``
    and ``extract-orf-profiles``. Each command is dispatched through
    ``shell_utils.call_if_not_exists`` with its expected input and output
    files. If no periodic read lengths are found, a critical message is
    logged and the function returns before extracting the ORF profiles.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs all of the processing necessary to produce the "
        "signals used for later processing. In particular, it runs the standard "
        "rnaseq and riboseq preprocessing, estimates the abundance of transcripts with "
        "the rnaseq data, and selects the most-expressed isoforms and ORFs. Next, it removes "
        "multimapping and non-periodic-length riboseq reads. Finally, it extracts the riboseq "
        "signal for the most-expressed transcripts.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    # the config is parsed with the yaml module below
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)
    parser.add_argument('--mem', help="The amount of RAM to request",
                        default=default_mem)
    # the adjacent literals previously ran together ("topass", 'value"If')
    parser.add_argument('--flexbar-options', help="A space-delimited list of options to "
                        "pass to flexbar. Each option must be quoted separately as in \"--flexbarOption value\". "
                        "If specified, flexbar options will override default settings.",
                        nargs='*', type=str)
    parser.add_argument('--tmp', help="The location for temp files", default=default_tmp)
    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')
    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
                        "then all intermediate files will be kept; otherwise, they will be "
                        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
                        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE: safe_load replaces the Loader-less yaml.load call (rejected by
    # PyYAML >= 6); the context manager closes the config file handle.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # NOTE(review): star_index is not used further down in this function
    star_index = filenames.get_star_index(config['genome_base_path'],
                                          config['genome_name'], is_merged=False)

    models_base = config.get('models_base', default_models_base)

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_argument = "" if call else "--do-not-call"
    overwrite_argument = "--overwrite" if args.overwrite else ""

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    # each flexbar option is re-wrapped in double quotes before forwarding
    flexbar_option_str = ""
    if args.flexbar_options is not None:
        quoted_options = ['"' + flx_op + '"' for flx_op in args.flexbar_options]
        flexbar_option_str = "--flexbar-options {}".format(' '.join(quoted_options))

    # check if we want to keep multimappers
    is_unique = 'keep_riboseq_multimappers' not in config

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
           .format(riboseq_raw_data, args.config, args.name, args.num_cpus,
                   do_not_call_argument, overwrite_argument, logging_str,
                   star_str, tmp_str, flexbar_option_str,
                   keep_intermediate_str, mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available,
    # we still want to call the base pipeline; the raw data is therefore
    # deliberately not listed as an input file.
    in_files = []
    out_files = [riboseq_bam_filename]

    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # create the metagene profiles
    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    start_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_upstream', 'start-upstream')
    start_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_downstream', 'start-downstream')
    end_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_upstream', 'end-upstream')
    end_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_downstream', 'end-downstream')

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False, is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {}"
           .format(riboseq_bam_filename, transcript_bed, metagene_profiles,
                   args.num_cpus, logging_str, start_upstream_str,
                   start_downstream_str, end_upstream_str, end_downstream_str))

    # NOTE(review): the command reads transcript_bed, but orfs_genomic is
    # listed as the input file -- confirm this is intended
    in_files = [riboseq_bam_filename, orfs_genomic]
    out_files = [metagene_profiles]
    file_checkers = {
        metagene_profiles: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'], args.name, is_unique=is_unique, note=note)

    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')

    periodic_models_str = "--periodic-models {}".format(' '.join(periodic_models))
    non_periodic_models_str = "--nonperiodic-models {}".format(
        ' '.join(non_periodic_models))

    periodic_offset_start_str = utils.get_config_argument(config, 'periodic_offset_start')
    periodic_offset_end_str = utils.get_config_argument(config, 'periodic_offset_end')
    metagene_profile_length_str = utils.get_config_argument(config, 'metagene_profile_length')
    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains')
    iterations_str = utils.get_config_argument(
        config, 'metagene_profile_iterations', 'iterations')

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(
               metagene_profiles, metagene_profile_bayes_factors,
               args.num_cpus, periodic_models_str, non_periodic_models_str,
               periodic_offset_start_str, periodic_offset_end_str,
               metagene_profile_length_str, seed_str, chains_str,
               iterations_str, logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {
        metagene_profile_bayes_factors: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(metagene_profile_bayes_factors,
                                                 periodic_offsets)
    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {
        periodic_offsets: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
        config, args.name, args.do_not_call, is_unique=is_unique)

    if len(lengths) == 0:
        msg = ("No periodic read lengths and offsets were found. Try relaxing "
               "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
               "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')

    # extract the riboseq profiles for each orf
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                args.name,
                                                is_unique=is_unique,
                                                note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'),
                                     is_orf=True)

    cmd = ("extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} ".format(
        unique_filename, orfs_genomic, exons_file, profiles_filename,
        lengths_str, offsets_str, logging_str, seqname_prefix_str,
        args.num_cpus))

    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    # todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
def main():
    """Run the Rp-Bp / Rp-chi pipeline for one sample.

    After validating the required programs and config keys, the function
    either re-submits its own command line through ``slurm.check_sbatch``
    (when ``args.use_slurm`` is set) or runs ``create-orf-profiles`` followed
    by ``predict-translated-orfs``. With ``--profiles-only`` it returns after
    the first command.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs the Rp-Bp and Rp-chi pipelines on a given sample. "
        "It requires a YAML config file that includes a number of keys. Please see the "
        "documentation for a complete description.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)
    parser.add_argument('--flexbar-format-option', help="The name of the \"format\" "
                        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
                        "version 2.7.", default=default_flexbar_format_option)
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')
    parser.add_argument('--profiles-only', help="If this flag is present, then only "
                        "the ORF profiles will be created", action='store_true')
    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
                        "then all intermediate files will be kept; otherwise, they will be "
                        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
                        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE: safe_load replaces the Loader-less yaml.load call (rejected by
    # PyYAML >= 6); the context manager closes the config file handle.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # now, check if we want to use slurm
    msg = "use_slurm: {}".format(args.use_slurm)
    logger.debug(msg)

    if args.use_slurm:
        # re-submit this exact command line and stop here
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_str = "" if call else "--do-not-call"
    overwrite_str = "--overwrite" if args.overwrite else ""

    # for a sample, we first create its filtered genome profile
    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
           .format(args.raw_data, args.config, args.name, args.num_cpus,
                   mem_str, do_not_call_str, overwrite_str, logging_str,
                   star_str, tmp_str, flexbar_format_option_str,
                   keep_intermediate_str))

    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(
        args.config, args.name, args.num_cpus, do_not_call_str,
        overwrite_str, logging_str))
    shell_utils.check_call(cmd)
def main():
    """Create the ORF profiles for one riboseq sample.

    The function builds and runs, in order, the commands
    ``create-base-genome-profile``, ``extract-metagene-profiles``,
    ``estimate-metagene-profile-bayes-factors``, ``select-periodic-offsets``
    and ``extract-orf-profiles``. Each command is dispatched through
    ``shell_utils.call_if_not_exists`` with its expected input and output
    files. If no periodic read lengths are found, a critical message is
    logged and the function returns before extracting the ORF profiles.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs all of the processing necessary to produce the "
        "signals used for later processing. In particular, it runs the standard "
        "rnaseq and riboseq preprocessing, estimates the abundance of transcripts with "
        "the rnaseq data, and selects the most-expressed isoforms and ORFs. Next, it removes "
        "multimapping and non-periodic-length riboseq reads. Finally, it extracts the riboseq "
        "signal for the most-expressed transcripts.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    # the config is parsed with the yaml module below
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)
    parser.add_argument('--mem', help="The amount of RAM to request",
                        default=default_mem)
    parser.add_argument('--flexbar-format-option', help="The name of the \"format\" "
                        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
                        "version 2.7.", default=default_flexbar_format_option)
    parser.add_argument('--tmp', help="The location for temp files", default=default_tmp)
    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')
    parser.add_argument('-k', '--keep-intermediate-files', help="If this flag is given, "
                        "then all intermediate files will be kept; otherwise, they will be "
                        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
                        "is given, then nothing will be deleted.", action='store_true')

    star_utils.add_star_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE: safe_load replaces the Loader-less yaml.load call (rejected by
    # PyYAML >= 6); the context manager closes the config file handle.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # NOTE(review): star_index is not used further down in this function
    star_index = filenames.get_star_index(config['genome_base_path'],
                                          config['genome_name'], is_merged=False)

    models_base = config.get('models_base', default_models_base)

    # the first step is the standard riboseq preprocessing

    # handle do_not_call so that we _do_ call the preprocessing script,
    # but that it does not run anything
    do_not_call_argument = "" if call else "--do-not-call"
    overwrite_argument = "--overwrite" if args.overwrite else ""

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    # check if we want to keep multimappers
    is_unique = 'keep_riboseq_multimappers' not in config

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
           .format(riboseq_raw_data, args.config, args.name, args.num_cpus,
                   do_not_call_argument, overwrite_argument, logging_str,
                   star_str, tmp_str, flexbar_format_option_str,
                   keep_intermediate_str, mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available,
    # we still want to call the base pipeline; the raw data is therefore
    # deliberately not listed as an input file.
    in_files = []
    out_files = [riboseq_bam_filename]

    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # create the metagene profiles
    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    seqids_to_keep_str = utils.get_config_argument(config, 'seqids_to_keep')
    start_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_upstream', 'start-upstream')
    start_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_start_downstream', 'start-downstream')
    end_upstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_upstream', 'end-upstream')
    end_downstream_str = utils.get_config_argument(
        config, 'metagene_profile_end_downstream', 'end-downstream')

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False, is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {} {}"
           .format(riboseq_bam_filename, transcript_bed, metagene_profiles,
                   args.num_cpus, logging_str, seqids_to_keep_str,
                   start_upstream_str, start_downstream_str,
                   end_upstream_str, end_downstream_str))

    # NOTE(review): the command reads transcript_bed, but orfs_genomic is
    # listed as the input file -- confirm this is intended
    in_files = [riboseq_bam_filename, orfs_genomic]
    out_files = [metagene_profiles]
    file_checkers = {metagene_profiles: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'], args.name, is_unique=is_unique, note=note)

    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')

    periodic_models_str = "--periodic-models {}".format(' '.join(periodic_models))
    non_periodic_models_str = "--nonperiodic-models {}".format(
        ' '.join(non_periodic_models))

    periodic_offset_start_str = utils.get_config_argument(
        config, 'periodic_offset_start')
    periodic_offset_end_str = utils.get_config_argument(
        config, 'periodic_offset_end')
    metagene_profile_length_str = utils.get_config_argument(
        config, 'metagene_profile_length')
    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains')
    iterations_str = utils.get_config_argument(
        config, 'metagene_profile_iterations', 'iterations')

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(
               metagene_profiles, metagene_profile_bayes_factors,
               args.num_cpus, periodic_models_str, non_periodic_models_str,
               periodic_offset_start_str, periodic_offset_end_str,
               metagene_profile_length_str, seed_str, chains_str,
               iterations_str, logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {metagene_profile_bayes_factors: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(
        metagene_profile_bayes_factors, periodic_offsets)
    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {periodic_offsets: utils.check_gzip_file}
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
        config, args.name, args.do_not_call, is_unique=is_unique)

    if len(lengths) == 0:
        msg = ("No periodic read lengths and offsets were found. Try relaxing "
               "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
               "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')

    # extract the riboseq profiles for each orf
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                args.name,
                                                is_unique=is_unique,
                                                note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = ("extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} "
           .format(unique_filename, orfs_genomic, exons_file,
                   profiles_filename, lengths_str, offsets_str, logging_str,
                   seqname_prefix_str, args.num_cpus))

    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    # todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
def main():
    """Create the reference files used by the downstream rpbp analysis.

    After validating programs, config keys and input files, the function
    either re-submits its own command line through ``slurm.check_sbatch``
    (when ``args.use_slurm`` is set) or:

    1. builds the ribosomal index with ``bowtie2-build-s``;
    2. builds the STAR genome index;
    3. calls ``get_orfs`` for the reference annotation;
    4. if the config has a ``de_novo_gtf`` key, calls ``get_orfs`` for it as
       well, concatenates the annotated and de novo ORF and exon BED files,
       and concatenates the two annotation files (or symlinks the reference
       annotation when their formats differ); otherwise it symlinks the
       annotated files to the generic file names.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates all of the files necessary for downstream "
        "analysis performed with the rpbp package.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    # NOTE: safe_load replaces the Loader-less yaml.load call (rejected by
    # PyYAML >= 6); the context manager closes the config file handle.
    with open(args.config) as config_file:
        config = yaml.safe_load(config_file)
    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'extract-orf-coordinates',
        'label-orfs',
        'bowtie2-build-s',
        'split-bed12-blocks',
        'gtf-to-bed12',
        args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path',
        'genome_name',
        'gtf',
        'fasta',
        'ribosomal_fasta',
        'ribosomal_index',
        'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [
        config['gtf'],
        config['fasta'],
        config['ribosomal_fasta']
    ]
    has_de_novo = 'de_novo_gtf' in config
    if has_de_novo:
        files.append(config['de_novo_gtf'])
    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        # re-submit this exact command line and stop here
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rrna index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])
    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))
    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=True,
                                        is_de_novo=False)

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
                                               config['genome_name'],
                                               note=config.get('orf_note'),
                                               is_annotated=True,
                                               is_de_novo=False,
                                               is_orf=True)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'),
                                     is_orf=True)

    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
                                 config['genome_name'],
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)

    # now, check if we have a de novo assembly
    if has_de_novo:
        get_orfs(config['de_novo_gtf'], args, config,
                 is_annotated=False, is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'),
                                          is_annotated=False,
                                          is_de_novo=True)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
                                                 config['genome_name'],
                                                 note=config.get('orf_note'),
                                                 is_annotated=False,
                                                 is_de_novo=True,
                                                 is_orf=True)

        orfs_files = [annotated_orfs, de_novo_orfs]
        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            # renumber the ORFs over the combined set
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type']
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            # the flag is --do-not-call; the old message named "--call"
            msg = "Skipping concatenation due to --do-not-call value"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]
        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
            fields = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --do-not-call value"
            logger.info(msg)

        # we also need to concat the annotations to inform STAR
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them...
        if config['de_novo_gtf'].endswith('gff') == use_gff3_specs:
            # awk drops the '#' comment/header lines of both files
            cmd = ("awk '!/^#/' {} {} > {}".format(config['gtf'],
                                                  config['de_novo_gtf'],
                                                  gtf_file))
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                           overwrite=args.overwrite, call=call)
        else:
            msg = ("Skipping concatenation due to mismatch in format specifications (GTF2/GFF3) "
                   "for reference and de novo annotations. Symlink to reference annotations created.")
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)
    else:
        # finally, make sure our files are named correctly
        if os.path.exists(annotated_orfs):
            shell_utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            shell_utils.create_symlink(annotated_exons_file, exons_file, call)

        if os.path.exists(config['gtf']):
            shell_utils.create_symlink(config['gtf'], gtf_file, call)
def main():
    """Command-line entry point for the second part of the pipeline.

    Parses the command line, loads the YAML config file, and then:

    1. optionally (``--merge-replicates``) merges the ORF profiles of all
       replicates of the condition ``name`` into a single profile file;
    2. builds and runs one ``estimate-orf-bayes-factors`` command;
    3. builds and runs ``select-final-prediction-set`` twice, once with the
       filtering options and once without.

    All external commands go through ``shell_utils.call_if_not_exists``, so
    they honour ``--overwrite`` and ``--do-not-call``. The function takes no
    parameters (everything comes from ``sys.argv``) and returns ``None``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script runs the second part of the pipeline: it estimate ORF Bayes "
        "factors using the ORF profiles, then make the final prediction set.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)

    parser.add_argument('--do-not-call', action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')

    parser.add_argument('--merge-replicates', help="If this flag is present, then the ORF "
                        "profiles will be merged for all replicates in the condition given by <name>. The "
                        "filenames, etc., will reflect the condition name, but not the lengths and offsets "
                        "of the individual replicates.\n\nN.B. If this flag is present, the --overwrite "
                        "flag will automatically be set!", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[predict_translated_orfs]: {}".format(' '.join(sys.argv))
    logger.debug(msg)

    logging_str = logging_utils.get_logging_options_string(args)

    # NOTE(review): yaml.load may construct more than plain data depending on
    # the loader; the config is assumed to be a trusted, user-supplied file.
    # An explicit Loader is passed because PyYAML >= 6 requires one; FullLoader
    # (the PyYAML 5.x default) is used when available, otherwise the legacy
    # default Loader, so the parsing semantics stay the same as before.
    yaml_loader = getattr(yaml, 'FullLoader', yaml.Loader)
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml_loader)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'estimate-orf-bayes-factors',
        'select-final-prediction-set'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'fasta',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    models_base = config.get('models_base', default_models_base)
    note_str = config.get('note', None)

    # we always need the ORFs
    orfs_genomic = filenames.get_orfs(
        config['genome_base_path'],
        config['genome_name'],
        note=config.get('orf_note')
    )

    # smoothing parameters (filenames)
    # default values are not used in the file names
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    # check if we are running Rp-Bp (default) or Rp-chi
    # N.B. only the *presence* of the key matters, not its value
    chi_square_only_str = ""
    chi_square_only = False
    if 'chi_square_only' in config:
        chi_square_only_str = "--chi-square-only"
        chi_square_only = True
        # the smoothing parameters are not part of the file names in this mode
        fraction = None
        reweighting_iterations = None
        msg = (" The final prediction set will be made based on the chi square test only! "
               "The translation models will not be fit to the data, and the posterior "
               "distributions will not be estimated. ")
        logger.info(msg)

    # keep multimappers? (again, only the presence of the key is checked)
    is_unique = not ('keep_riboseq_multimappers' in config)

    # first, check if we are merging replicates

    # either way, the following variables need to have values for the rest of
    # the pipeline: lengths, offsets, smooth_profiles
    if args.merge_replicates:
        msg = ("The --merge-replicates option was given, so --overwrite is "
               "being set to True.")
        logger.warning(msg)
        args.overwrite = True

        # now, actually merge the replicates
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)

        # we will not use the lengths and offsets in the filenames
        lengths = None
        offsets = None

        # we will also merge all of unsmoothed profiles
        replicate_profiles = [
            get_profile(name, config, args)
            for name in riboseq_replicates[args.name]
        ]
        replicate_profiles_str = ' '.join(replicate_profiles)

        profiles = filenames.get_riboseq_profiles(config['riboseq_data'], args.name,
                                                  length=lengths, offset=offsets,
                                                  is_unique=is_unique, note=note_str)

        cmd = "merge-replicate-orf-profiles {} {} {}".format(replicate_profiles_str,
                                                             profiles, logging_str)
        in_files = replicate_profiles
        out_files = [profiles]

        # todo: implement file checker for mtx files
        shell_utils.call_if_not_exists(
            cmd,
            out_files,
            in_files=in_files,
            overwrite=args.overwrite,
            call=call
        )

    else:
        # otherwise, just treat things as normal
        # get the lengths and offsets which meet the required criteria from
        # the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config,
                                                                       args.name,
                                                                       args.do_not_call,
                                                                       is_unique=is_unique)

        profiles = get_profile(args.name, config, args)

    # estimate the bayes factors
    bayes_factors = filenames.get_riboseq_bayes_factors(
        config['riboseq_data'],
        args.name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note_str,
        fraction=fraction,
        reweighting_iterations=reweighting_iterations
    )

    # the smoothing options
    min_length_str = utils.get_config_argument(config, 'min_orf_length', 'min-length')
    max_length_str = utils.get_config_argument(config, 'max_orf_length', 'max-length')
    min_profile_str = utils.get_config_argument(config, 'min_signal', 'min-profile')

    fraction_str = utils.get_config_argument(config, 'smoothing_fraction', 'fraction')
    reweighting_iterations_str = utils.get_config_argument(
        config, 'smoothing_reweighting_iterations', 'reweighting-iterations')

    # parse out all of the options from the config file, if they are present
    translated_models = filenames.get_models(models_base, 'translated')
    untranslated_models = filenames.get_models(models_base, 'untranslated')

    translated_models_str = ' '.join(translated_models)
    untranslated_models_str = ' '.join(untranslated_models)

    translated_models_str = "--translated-models {}".format(
        translated_models_str)
    untranslated_models_str = "--untranslated-models {}".format(
        untranslated_models_str)

    orf_types_str = utils.get_config_argument(config, 'orf_types')

    seed_str = utils.get_config_argument(config, 'seed')
    chains_str = utils.get_config_argument(config, 'chains', 'chains')
    iterations_str = utils.get_config_argument(config, 'translation_iterations', 'iterations')

    cmd = ("estimate-orf-bayes-factors {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} "
           "--num-cpus {}".format(
               profiles,
               orfs_genomic,
               bayes_factors,
               translated_models_str,
               untranslated_models_str,
               logging_str,
               orf_types_str,
               min_length_str,
               max_length_str,
               min_profile_str,
               fraction_str,
               reweighting_iterations_str,
               seed_str,
               iterations_str,
               chains_str,
               chi_square_only_str,
               args.num_cpus)
           )

    in_files = [profiles, orfs_genomic]
    in_files.extend(translated_models)
    in_files.extend(untranslated_models)
    out_files = [bayes_factors]
    file_checkers = {
        bayes_factors: utils.check_gzip_file
    }
    msg = "estimate-bayes-factors in_files: {}".format(in_files)
    logger.debug(msg)
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # one prediction set with the filtering options and one without
    for is_filtered in [True, False]:

        filtered_str = ""
        if is_filtered:
            filtered_str = "--select-longest-by-stop --select-best-overlapping"

        # now, select the ORFs (longest for each stop codon) which pass the prediction filters
        predicted_orfs = filenames.get_riboseq_predicted_orfs(
            config['riboseq_data'],
            args.name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=note_str,
            fraction=fraction,
            reweighting_iterations=reweighting_iterations,
            is_filtered=is_filtered,
            is_chisq=chi_square_only
        )

        predicted_orfs_dna = filenames.get_riboseq_predicted_orfs_dna(
            config['riboseq_data'],
            args.name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=note_str,
            fraction=fraction,
            reweighting_iterations=reweighting_iterations,
            is_filtered=is_filtered,
            is_chisq=chi_square_only
        )

        predicted_orfs_protein = filenames.get_riboseq_predicted_orfs_protein(
            config['riboseq_data'],
            args.name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=note_str,
            fraction=fraction,
            reweighting_iterations=reweighting_iterations,
            is_filtered=is_filtered,
            is_chisq=chi_square_only
        )

        min_bf_mean_str = utils.get_config_argument(config, 'min_bf_mean')
        max_bf_var_str = utils.get_config_argument(config, 'max_bf_var')
        min_bf_likelihood_str = utils.get_config_argument(config, 'min_bf_likelihood')

        # NOTE(review): the next two option strings are built but are NOT
        # passed to select-final-prediction-set below. Confirm whether that
        # program accepts them before wiring them in (or delete them).
        chisq_significance_level_str = utils.get_config_argument(config, 'chisq_significance_level')
        min_profile_str = utils.get_config_argument(config, 'min_signal', 'minimum-profile-sum')

        cmd = "select-final-prediction-set {} {} {} {} {} {} {} {} {} {} {}".format(
            bayes_factors,
            config['fasta'],
            predicted_orfs,
            predicted_orfs_dna,
            predicted_orfs_protein,
            min_bf_mean_str,
            max_bf_var_str,
            min_bf_likelihood_str,
            logging_str,
            chi_square_only_str,
            filtered_str
        )
        in_files = [bayes_factors, config['fasta']]
        out_files = [
            predicted_orfs,
            predicted_orfs_dna,
            predicted_orfs_protein
        ]
        file_checkers = {
            predicted_orfs: utils.check_gzip_file
        }

        # todo: implement file checker for fasta files
        shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                       file_checkers=file_checkers,
                                       overwrite=args.overwrite, call=call)
def main():
    """Command-line entry point: run the pipeline for every riboseq sample.

    Parses the command line, loads the YAML config file, and submits one
    ``run-rpbp-pipeline`` command per entry of ``config['riboseq_samples']``
    through ``slurm.check_sbatch``. If ``--merge-replicates`` is given, one
    ``predict-translated-orfs --merge-replicates`` command per condition is
    then submitted, with the per-sample job ids passed as dependencies.

    The function takes no parameters (everything comes from ``sys.argv``) and
    returns ``None``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This is a helper script which submits a set of samples to SLURM. It "
        "can also be used to run a set of samples sequentially. Due to limitations on "
        "the config file specification, all of the samples must use the same reference "
        "indices (i.e., genome sequence, set of ORFs, etc.).")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--tmp', help="The temp directory", default=default_tmp)

    parser.add_argument(
        '--flexbar-format-option', help="The name of the \"format\" "
        "option for flexbar. This changed from \"format\" to \"qtrim-format\" in "
        "version 2.7.", default=default_flexbar_format_option)

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
                        "will be overwritten.", action='store_true')

    parser.add_argument(
        '--merge-replicates', help="If this flag is present, then "
        "the ORF profiles from the replicates will be merged before making the final "
        "predictions", action='store_true')

    parser.add_argument(
        '--run-replicates', help="If this flag is given with the "
        "--merge-replicates flag, then both the replicates *and* the individual "
        "samples will be run. This flag has no effect if --merge-replicates is not "
        "given.", action='store_true')

    parser.add_argument(
        '-k', '--keep-intermediate-files', help="If this flag is given, "
        "then all intermediate files will be kept; otherwise, they will be "
        "deleted. This feature is implemented piecemeal. If the --do-not-call flag "
        "is given, then nothing will be deleted.", action='store_true')

    # NOTE(review): args.do_not_call, args.num_cpus, args.mem and
    # args.star_executable are read below but not declared here; presumably
    # they are added by the option helpers on the next lines -- confirm.
    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = star_utils.get_star_options_string(args)

    # NOTE(review): yaml.load may construct more than plain data depending on
    # the loader; the config is assumed to be a trusted, user-supplied file.
    # An explicit Loader is passed because PyYAML >= 6 requires one; FullLoader
    # (the PyYAML 5.x default) is used when available, otherwise the legacy
    # default Loader, so the parsing semantics stay the same as before.
    yaml_loader = getattr(yaml, 'FullLoader', yaml.Loader)
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml_loader)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles',
        'estimate-orf-bayes-factors',
        'select-final-prediction-set',
        'create-orf-profiles',
        'predict-translated-orfs',
        'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'riboseq_samples',
        'ribosomal_index',
        'star_index',
        'genome_base_path',
        'genome_name',
        'fasta',
        'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)

    # handle do_not_call so that we _do_ call the pipeline script, but that it
    # does not run anything: the flag is forwarded on the command line and
    # cleared on args, which is what slurm.check_sbatch receives below
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    args.do_not_call = False

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles
    profiles_only_str = ""
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    if args.run_replicates and not args.merge_replicates:
        # the wording matches the condition: --run-replicates only has an
        # effect together with --merge-replicates
        msg = (
            "The --run-replicates option was given without the --merge-replicates "
            "option. It will be ignored.")
        logger.warning(msg)

    flexbar_format_option_str = ""
    if args.flexbar_format_option is not None:
        flexbar_format_option_str = "--flexbar-format-option {}".format(
            args.flexbar_format_option)

    # collect the job_ids in case we are using slurm and need to merge replicates
    job_ids = []

    sample_names = sorted(config['riboseq_samples'].keys())

    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        # each sample gets its own temp directory below the --tmp base
        # N.B. when the config has no 'note', the directory name contains the
        # literal text "None"
        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp, "{}_{}_rpbp".format(sample_name, note))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data,
            args.config,
            sample_name,
            args.num_cpus,
            tmp_str,
            do_not_call_str,
            overwrite_str,
            logging_str,
            star_str,
            profiles_only_str,
            flexbar_format_option_str,
            keep_intermediate_str,
            mem_str)

        job_id = slurm.check_sbatch(cmd, args=args)

        job_ids.append(job_id)

    # now, if we are running the "standard" pipeline, we are finished
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):

        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config,
            condition_name,
            args.num_cpus,
            do_not_call_str,
            overwrite_str,
            logging_str,
            merge_replicates_str)

        # every merge job waits for all of the per-sample jobs
        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)